[{"split": "train", "image_id": 299207, "question_id": "22MexNkBPpdZGX6sxbxVBH", "question": "What is the man by the bags awaiting?", "choices": ["skateboarder", "train", "delivery", "cab"], "correct_choice_idx": 3, "direct_answers": ["ride", "ride", "bus", "taxi", "travelling", "traffic", "taxi", "cab", "cab", "his ride"], "difficult_direct_answer": false, "rationales": ["A train would not be on the street, he would not have luggage waiting for a delivery, and the skateboarder is there and not paying attention to him so a cab is the only possible answer.", "He has bags as if he is going someone, and he is on a road waiting for vehicle that can only be moved on the road and is big enough to hold the bags.", "He looks to be waiting for a paid ride to pick him up."], "image": "val2014/COCO_val2014_000000299207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39446, "question_id": "22ZAvqke8EhGDj8e4eyios", "question": "Where does this man eat pizza?", "choices": ["office", "cafe", "motel", "outside"], "correct_choice_idx": 0, "direct_answers": ["work", "office", "work", "work", "at work", "desk", "at desk", "office", "work desk", "office"], "difficult_direct_answer": false, "rationales": ["The man is eating pizza at a work desk in an office setting.", "The man is near a computer keyboard.", "The man is sitting at an office desk at his job and eating his pizza."], "image": "train2014/COCO_train2014_000000039446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312452, "question_id": "22oou4R5ejh4Ay9UQG7yGT", "question": "What is the occupation of the person driving?", "choices": ["waiter", "farmer", "cashier", "musician"], "correct_choice_idx": 1, "direct_answers": ["farmer", "farmer", "bus driver", "farmer", "shepherd", "farmer", "bus driver", "farmer", "farmer", "farmer"], "difficult_direct_answer": false, "rationales": ["The place is full of sheep that shows the person is a farmer.", "Farmer is the obvious profession as the picture shows.", "With the tractor he is in and the livestock shown it is easy to surmise his profession."], "image": "train2014/COCO_train2014_000000312452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46408, "question_id": "22qCSTGL82TcgGtu9wtcrL", "question": "How were the drivers of the cars able to park here?", "choices": ["firemen", "airport workers", "police", "postal workers"], "correct_choice_idx": 1, "direct_answers": ["airport workers", "driving", "stilts", "parking lot", "designated spots", "airport vehicles", "by driving", "work", "special rights", "workers"], "difficult_direct_answer": true, "rationales": ["These drivers work at the airport.", "Cars are parked on a tarmac near an airplane. only worker vehicles are allowed on airport tarmacs.", "The people that work here use the vehicles on the tarmac."], "image": "train2014/COCO_train2014_000000046408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282150, "question_id": "22qvnfBREuA4LUGw3cfTgg", "question": "How many people can ride this motorcycle at a time?", "choices": ["four", "two", "three", "one"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two people can be on the bike.", "There is a passenger seat and a driver's seat.", "There are two seats and seat backs on the motorcycle that apply that two could ride simultaneously."], "image": "val2014/COCO_val2014_000000282150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420501, "question_id": "23NaDk4gncPVsGzZ7UuxNK", "question": "Where would one most likely see the show advertised in the poster?", "choices": ["theater", "tv", "internet", "cinema"], "correct_choice_idx": 0, "direct_answers": ["theatre", "broadway", "tv", "theater", "theater", "bus stop", "broadway", "theater", "theater", "in theater"], "difficult_direct_answer": false, "rationales": ["Most likely the sign is depicting a broadway show at a theater.", "The poster would be in a theater.", "This type of display with a character and the added writing is usually used to advertise either a movie or play both of which could be viewed in a theater."], "image": "train2014/COCO_train2014_000000420501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445368, "question_id": "23VQ24vGdCfx2vuwoNqxwk", "question": "What mood do the cows seem to be in?", "choices": ["sad", "happy", "curious", "scared"], "correct_choice_idx": 1, "direct_answers": ["curious", "good", "happy", "happy", "curious", "happy", "happy", "curious", "curious", "curious"], "difficult_direct_answer": false, "rationales": ["The cows are happy.", "These cows appear happy with their heads up.", "The cows standing in the grass look happy and satisfied as they drink water and eat grass."], "image": "train2014/COCO_train2014_000000445368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498730, "question_id": "23ibLk5tF4wVKd7MMms9Yp", "question": "What type of rain is this called?", "choices": ["average", "drizzle", "sprinkle", "downpour"], "correct_choice_idx": 3, "direct_answers": ["downpour", "flood", "heavy rainfall", "downpour", "downpour", "downpour", "downpour", "downpour", "downpour", "down pour"], "difficult_direct_answer": false, "rationales": ["The way the woman uses the umbrella.", "The rain is really heavy and uncomfortable to be in.", "You can see how hard it is raining and how much it is."], "image": "train2014/COCO_train2014_000000498730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558229, "question_id": "23x4iQXWfAeZxqCJVbTuv5", "question": "What is the man putting on the bus?", "choices": ["camera", "ribbon", "rag", "bow"], "correct_choice_idx": 3, "direct_answers": ["bow", "bow", "bow", "bow", "bow", "cloth", "bow", "towel", "bow", "bow"], "difficult_direct_answer": false, "rationales": ["He affixed a tidy, unripped, tied tow to the bus.", "They are putting a big red bow on it.", "You can see the red looped decoration in his hands."], "image": "val2014/COCO_val2014_000000558229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231863, "question_id": "23xWxgFhSv5sZaxwJrqJwm", "question": "If the cameraman were driving what do they have to do from this position?", "choices": ["turn left", "drive straight", "reverse course", "turn right"], "correct_choice_idx": 3, "direct_answers": ["turn right", "take photo", "turn left", "point", "turn right", "turn right", "turn right", "car", "turn right", "turn right"], "difficult_direct_answer": false, "rationales": ["The would have to turn right because the lane has right turn arrows painted on it.", "The arrow on the street indicates that this lane can only go in one direction at the intersection.", "The sign on the road says to turn right."], "image": "train2014/COCO_train2014_000000231863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298540, "question_id": "24Vbs8HwsWk7uuMZcSFBHd", "question": "How did these frisbee throwers get to this location?", "choices": ["police car", "bike", "jogged", "motorcycle"], "correct_choice_idx": 1, "direct_answers": ["biked", "bicycled", "bike", "bikes", "biked", "bike", "bicycle", "cycling", "bike", "bicycles"], "difficult_direct_answer": false, "rationales": ["The only modes of transportation which near these frisbee throwers are the bikes on the ground.", "There are non-motorized two-wheeled vehicles parked near the frisbee throwers.", "There are two wheeled manual powered vehicles next to them."], "image": "val2014/COCO_val2014_000000298540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358239, "question_id": "24pdoq3jwTbLqATxYufZJZ", "question": "What animal will most likely eat this meal?", "choices": ["elephant", "human", "bird", "cow"], "correct_choice_idx": 1, "direct_answers": ["human", "coyote", "human", "human", "human", "human", "humans", "human", "human", "human"], "difficult_direct_answer": false, "rationales": ["Only humans will make this and eat it.", "The animal is a human.", "The sandwich is sold for people to eat."], "image": "train2014/COCO_train2014_000000358239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220502, "question_id": "24vcZcaxamEJwTD7wHYb4v", "question": "In which country is this bus located?", "choices": ["uk", "cuba", "usa", "mexico"], "correct_choice_idx": 0, "direct_answers": ["england", "uk", "uk", "united kingdom", "england", "england", "england", "england", "london", "england"], "difficult_direct_answer": false, "rationales": ["Most double decker buses designed like this are traditionally found in the uk.", "This is a double decker bus used in london", "England is known for its double-decker buses."], "image": "train2014/COCO_train2014_000000220502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94681, "question_id": "25DU8SkTV8JifnFUGcgv4r", "question": "When did the namesake of this theater die?", "choices": ["1998", "2009", "2015", "2020"], "correct_choice_idx": 1, "direct_answers": ["1958", "20th century", "2009", "2009", "long ago", "2009", "1989", "2009", "2009", "2009"], "difficult_direct_answer": false, "rationales": ["Looks to be in 2009 or so.", "He died in 2009", "The person whose name is behind the theater died in 2009."], "image": "train2014/COCO_train2014_000000094681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101581, "question_id": "25EKstQLh3QXDC7R4Cggpr", "question": "What animals are these?", "choices": ["llama", "donkey", "sheep", "horse"], "correct_choice_idx": 3, "direct_answers": ["horses", "horse", "horses", "horses", "horse", "horse", "horses", "horses", "horses", "horses"], "difficult_direct_answer": false, "rationales": ["These animals are horses since they have manes.", "The animals on the pasture are large horses.", "The animals on the hill are wild horses that are running free."], "image": "train2014/COCO_train2014_000000101581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550862, "question_id": "25NqUCWjGRoMadvbsp5EhW", "question": "What kind of cake has been served?", "choices": ["cinnamon", "red velvet", "chocolate", "carrot"], "correct_choice_idx": 3, "direct_answers": ["carrot", "chocolate", "carrot", "carrot", "carrot", "carrot cake", "carrot", "carrot", "carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["The carrot and icing of the cake gives it away for what type of cake it is.", "The carrot is seen on the cake.", "The cake is carrot cake."], "image": "val2014/COCO_val2014_000000550862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549797, "question_id": "25V87YPnFdGNMCTyDCL4KW", "question": "What kind of computer is near the woman in blue?", "choices": ["acer", "macintosh", "hp", "dell"], "correct_choice_idx": 1, "direct_answers": ["apple", "apple computer", "apple computer", "macintosh", "apple", "apple", "apple", "apple", "apple", "apple computer"], "difficult_direct_answer": false, "rationales": ["The computer is a mac.", "You can see the little apple in the center.", "One can see the apple logo on the computer near her."], "image": "val2014/COCO_val2014_000000549797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5131, "question_id": "25Vd9e8dW8qu6Tsmagyds6", "question": "What job do the people shown here share?", "choices": ["manufacturing", "taxi driver", "movie stars", "flying"], "correct_choice_idx": 3, "direct_answers": ["pilot", "pilot", "military", "pilot", "officers", "airline pilots", "military service", "fly planes", "flying", "pilot"], "difficult_direct_answer": false, "rationales": ["The people are pilots.", "The people wear these outfits when working inside planes.", "They are wearing hats with wings on them."], "image": "train2014/COCO_train2014_000000005131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339006, "question_id": "25XbFPtCMaRohAb9mWMUqV", "question": "Who last gave force to the ball shown?", "choices": ["batter", "coach", "pitcher", "catcher"], "correct_choice_idx": 2, "direct_answers": ["pitcher", "batter", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["The ball is in the air and approaching the batter.", "The pitcher is throwing the ball to the batter in baseball which is standard.", "The ball flying towards the batter was thrown that direction by the pitcher."], "image": "train2014/COCO_train2014_000000339006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373075, "question_id": "25aCRRGXTWrsn6YNokiu37", "question": "What type of gathering is this?", "choices": ["meeting", "ceremony", "barbeque", "wedding"], "correct_choice_idx": 2, "direct_answers": ["barbeque", "fish fry", "cookout", "barbeque", "bbq", "cookout", "family bbq", "bbq", "bbq", "barbeque"], "difficult_direct_answer": false, "rationales": ["The gathering is a bbq.", "There is food on the grill surface.", "The grill is used outdoors to prevent fire hazards. it can cook several types of food."], "image": "val2014/COCO_val2014_000000373075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450275, "question_id": "25ixsJaCVY6yNtqvMBGiQ4", "question": "What song mentions the animal under the desk?", "choices": ["cat people", "camel song", "baby shark", "good dog"], "correct_choice_idx": 2, "direct_answers": ["baby shark", "baby shark", "baby shark", "baby shark", "baby shark", "baby shark", "baby shark", "baby shark", "baby shark", "baby shark"], "difficult_direct_answer": false, "rationales": ["The animal is a carnivorous fish, not a cat, dog, or camel.", "The animal under the desk is a shark and baby shark is a popular children's song.", "The song is baby shark."], "image": "train2014/COCO_train2014_000000450275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453352, "question_id": "25nambkR7uUGby8c3As6Ss", "question": "What letter appears twice in a row on the train?", "choices": ["g", "d", "w", "b"], "correct_choice_idx": 3, "direct_answers": ["letter b", "letter b", "letter b", "b", "letter b", "bb", "letter b", "letter b", "bb", "bb"], "difficult_direct_answer": false, "rationales": ["Obb appears on the train.", "There are only two duplicate letters.", "The letter \"b\" appears twice on the train."], "image": "train2014/COCO_train2014_000000453352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12993, "question_id": "25ptF6QJPGuf9aCqSXm2R8", "question": "Which object is in the greatest danger?", "choices": ["bus", "motorcyclist", "blue car", "cyclist"], "correct_choice_idx": 3, "direct_answers": ["person", "cyclist", "bicycle", "bicycle", "bike", "person", "bike", "bike", "bus", "cyclist"], "difficult_direct_answer": false, "rationales": ["The water is very deep for the bike.", "The person on the bicycle might get splashed by water.", "The cyclist is most likely to have problems because of the rain."], "image": "val2014/COCO_val2014_000000012993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458308, "question_id": "25sUWjaZcsndzmwJa4UJxR", "question": "What type of race is this?", "choices": ["sheep racing", "horse racing", "cat racing", "dog racing"], "correct_choice_idx": 1, "direct_answers": ["horse", "horse", "jumpers", "horse racing", "horse", "horse", "horse racing", "equestrian", "horse", "showjumping"], "difficult_direct_answer": false, "rationales": ["Of the options only answer a is visible. if the other answers were viable there would likely be one of those animals present or represented in imagery.", "The woman is sitting on a horse and none of the other animals are visible so it would have to be a horse race", "This is a competition for horses."], "image": "train2014/COCO_train2014_000000458308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452663, "question_id": "268B6iiuB9yTEd9sG8tuvY", "question": "What period of the day is depicted in the photo?", "choices": ["morning", "night", "evening", "afternoon"], "correct_choice_idx": 0, "direct_answers": ["midday", "morning", "midday", "midmorning", "midday", "midday", "midday", "morning", "morning", "morning"], "difficult_direct_answer": false, "rationales": ["The sun is out and so it's during the day.", "There is sunlight.", "According to the clock it is almost 11:30. it is not dark outside so it is day time, but it is not quite noon yet."], "image": "val2014/COCO_val2014_000000452663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31392, "question_id": "26Cxre88izVxEdJEuggHYo", "question": "What is the cat growling at?", "choices": ["kittens", "mirror", "other car", "passenger"], "correct_choice_idx": 1, "direct_answers": ["outside object", "scenery", "reflection", "passerby", "self", "outside", "mirror", "something outside", "mirror", "animal"], "difficult_direct_answer": true, "rationales": ["The cat is seated on the dashboard of the car an is looking at is reflection.", "The cat is growling at the mirror.", "The cat sees himself in the mirror."], "image": "val2014/COCO_val2014_000000031392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205251, "question_id": "26FeBKm6vmfcWcjaMQbLEp", "question": "What sort of building is seen behind this amusement?", "choices": ["school", "barn", "cafe", "financial"], "correct_choice_idx": 3, "direct_answers": ["hotel", "hotel", "us", "office building", "office", "office", "banking", "financial", "office", "bank"], "difficult_direct_answer": false, "rationales": ["A bank logo is on the side of a bank.", "Ubs is a bank.", "Ubs is a well known bank. one can see its logo on the building."], "image": "val2014/COCO_val2014_000000205251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551418, "question_id": "26VTMNEcfNN2Rs5ZGZUB9F", "question": "What do these pilots hope for?", "choices": ["higher wages", "free peanuts", "better views", "wings"], "correct_choice_idx": 0, "direct_answers": ["better conditions", "better management", "more pay", "change", "meet demands", "fairness", "better management", "better management", "better management", "higher wages"], "difficult_direct_answer": false, "rationales": ["The pilots feel they are underpaid and are picketing.", "Pilots are holding signs about asking about what a pilot is worth. they are talking about salary.", "The signs mention \"worth\" and the other options aren't nearly as important as the money they earn."], "image": "train2014/COCO_train2014_000000551418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510087, "question_id": "26q3Ap8si2cRuRdfcqCiYP", "question": "What is the name of this dog breed?", "choices": ["poodles", "retriever", "bulldog", "pomeranian"], "correct_choice_idx": 0, "direct_answers": ["terrier", "poodles", "toy", "maltese dog", "terrier", "terrier", "terrier", "terrier", "scottie", "terrier"], "difficult_direct_answer": false, "rationales": ["The breed is a poodle.", "The dog is very fluffy.", "It looks like a terrier but has the size and color of some poodles."], "image": "train2014/COCO_train2014_000000510087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123639, "question_id": "274szRA2ikb5Kke7QZgY23", "question": "Which direction is this meter pointing towards?", "choices": ["left", "down", "right", "up"], "correct_choice_idx": 2, "direct_answers": ["right side", "fail", "right", "right", "right", "right", "right", "towards road", "left", "right"], "difficult_direct_answer": false, "rationales": ["A meter has an arrow on it that points to the right.", "The arrow points right.", "A meter has an arrow on it that points in one direction."], "image": "val2014/COCO_val2014_000000123639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95106, "question_id": "27CjdRPwZ6bkSmThVynHt4", "question": "Which team won this sport's championship in 2019?", "choices": ["calgary cannons", "washington nationals", "seattle mariners", "toronto jays"], "correct_choice_idx": 1, "direct_answers": ["giants", "washington nationals", "washington nationals", "washington nationals", "nationals", "nationals", "washington nationals", "nationals", "washington nationals", "washington nationals"], "difficult_direct_answer": false, "rationales": ["Washington nationals won.", "That is the team that won in 2019.", "The washington nationals took this title."], "image": "val2014/COCO_val2014_000000095106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278656, "question_id": "27GtM7fKrPGq9BgUibRXog", "question": "According to its nickname this city never does what?", "choices": ["sleeps", "disappoints", "cheats", "loses"], "correct_choice_idx": 0, "direct_answers": ["sleep", "sleeps", "street", "sleeps", "sleep", "sleeps", "sleep", "sleeps", "sleeps", "sleeps"], "difficult_direct_answer": false, "rationales": ["This is new york city according to the banner on the light pole", "New york is always awake.", "The city is always open and always has things to do that are open around the clock."], "image": "train2014/COCO_train2014_000000278656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341768, "question_id": "285MXnFKQWZ8JG8D2z7mis", "question": "What number comes after the number on the top of the bus?", "choices": ["19", "52", "23", "ten"], "correct_choice_idx": 3, "direct_answers": ["ten", "ten", "ten", "ten", "nine", "ten", "ten", "ten", "license number", "ten"], "difficult_direct_answer": false, "rationales": ["The number is 9.", "The number on top of the bus is 9 so the next consecutive number was chosen.", "When you count 1-10, the bus number is right before 10"], "image": "train2014/COCO_train2014_000000341768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372817, "question_id": "28MCx7HftxZgdtuqQqcdtA", "question": "One of the umbrellas is inspired by which country's flag?", "choices": ["morocco", "germany", "usa", "denmark"], "correct_choice_idx": 2, "direct_answers": ["united states", "usa", "america", "usa", "united states", "england", "usa", "united states", "england", "united states"], "difficult_direct_answer": false, "rationales": ["The umbrella beside the yellow one has stars and red and white stripes.", "It has red,white,blue and stars.", "Usa's flag is red white and blue."], "image": "val2014/COCO_val2014_000000372817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22628, "question_id": "28MpCKuEtHJ7k5aS782udJ", "question": "What time of the day are people exploring the beach?", "choices": ["mixed tide", "spring tide", "high tide", "low tide"], "correct_choice_idx": 3, "direct_answers": ["noon", "afternoon", "day", "afternoon", "midday", "low tide", "afternoon", "afternoon", "low tide", "noon"], "difficult_direct_answer": false, "rationales": ["The tide is far out and you can see a lot of sand.", "The water has receded and you can see puddles of water that have been left as the tide went out.", "The ocean is not very high. it is easy to see the sand."], "image": "val2014/COCO_val2014_000000022628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164439, "question_id": "28zRXPzGV2aafu7c9mnvi7", "question": "How does this dog's fur feel at this time?", "choices": ["wet", "crispy", "clean", "fluffy"], "correct_choice_idx": 0, "direct_answers": ["wet cold", "wet", "wet", "wet", "wet cold", "wet", "wet", "wet", "wet", "wet"], "difficult_direct_answer": false, "rationales": ["The dog is walking outside in the rain and its fur is wet.", "The dog is running in the rain and has wet fur.", "It is raining out so his fur will be wet."], "image": "train2014/COCO_train2014_000000164439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365569, "question_id": "2972Ggqy2w95CYFUJUVegH", "question": "The food on the plate that is farthest away from the woman is usually attributed to what country?", "choices": ["italy", "germany", "russia", "india"], "correct_choice_idx": 0, "direct_answers": ["italy", "pizza", "italy", "italy", "italy", "italy", "pizza", "italy", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["The food is from italy.", "On the plate described there is a pizza. pizza is a food traditionally associated with answer a.", "Traditionally pizza is known to be an italian cuisine."], "image": "train2014/COCO_train2014_000000365569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271565, "question_id": "29C9zSREbUamALaQjz3oaJ", "question": "The shade is causing the motorcyclists to turn what on?", "choices": ["windshield wipers", "headlights", "radio", "turning signal"], "correct_choice_idx": 1, "direct_answers": ["headlights", "lights", "lights", "headlights", "headlights", "lights", "lights", "headlights", "lights", "lights"], "difficult_direct_answer": false, "rationales": ["The shade is causing the motorcyclists to turn on their headlights because they need to see clearly to ride.", "One can see that the motorcycle lights are on.", "There are lights visibly on in the front of each motorcycle. the sun is clearly out but there may be enough shade for them to turn the lights on out of precaution."], "image": "val2014/COCO_val2014_000000271565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421360, "question_id": "29GQz3jfvvNvhNmTvfdHqZ", "question": "What type of contest is being held?", "choices": ["spelling", "running", "trivia", "eating"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eating", "food contest", "food eating", "eating", "eating", "eating", "eating contest", "eating contest"], "difficult_direct_answer": false, "rationales": ["This is an eating contest between the man and the woman.", "Both people sitting at the table have food in front of them.", "The eating contest is held."], "image": "val2014/COCO_val2014_000000421360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259567, "question_id": "29HEEGvCYGJhNKYdxLgPV2", "question": "Where are these people located?", "choices": ["work", "hospital", "library", "home"], "correct_choice_idx": 3, "direct_answers": ["home", "living room", "living room", "living room", "home", "home", "family room", "den", "living room", "home"], "difficult_direct_answer": false, "rationales": ["They look to be in a house and in someones living room.", "They are in a messy living room and dressed casually, and with a dog.", "The people are at home."], "image": "val2014/COCO_val2014_000000259567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84449, "question_id": "29VgB2ivboXv7B6JxARjcg", "question": "What do people put in that black tank?", "choices": ["elephant food", "grain", "seeds", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["Large tanks can hold liquid drinking supplies for later.", "This collects rainwater and has some spilling over", "Two elephants are standing at the side of large tanks and the side of the tanks are wet."], "image": "train2014/COCO_train2014_000000084449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417607, "question_id": "29egW8WUrZQjVK4hcoyfHp", "question": "Why is the man wearing a yellow vest?", "choices": ["fashion", "visibility", "camouflage", "costume"], "correct_choice_idx": 1, "direct_answers": ["safety", "safety", "safety", "visibility", "safety", "construction worker", "safety", "safety worker", "safety", "construction worker"], "difficult_direct_answer": false, "rationales": ["The visible vest is a neon yellow. neon yellow vests of this type are often worn by professionals who want to be seen for their safety.", "Yellow is very visible and as a construction worker he would want to be visible while being in the street", "The man is wearing the yellow vest because it is reflective and makes it easy to see him."], "image": "train2014/COCO_train2014_000000417607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132038, "question_id": "29mDTuu4CzRcDnLGMjPPUt", "question": "What is the man looking down at?", "choices": ["onions", "cue balls", "tennis balls", "apples"], "correct_choice_idx": 2, "direct_answers": ["tennis ball", "tennis balls", "tennis balls", "balls", "tennis balls", "tennis balls", "dirt", "tennis balls", "balls", "balls"], "difficult_direct_answer": false, "rationales": ["The color and the fact they are on a tennis court, it's easy to understand what the balls are.", "A line of tennis balls are on the ground.", "The man is looking at the tennis balls in the dirt."], "image": "train2014/COCO_train2014_000000132038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261316, "question_id": "29ouBCs2QswSK5kwRGCcjH", "question": "Why is the bike here?", "choices": ["storage", "speed", "for sale", "fashion"], "correct_choice_idx": 0, "direct_answers": ["transport", "storage", "transportation", "transportation", "transportation", "saving time", "moving", "carrying elsewhere", "transported", "owner inside"], "difficult_direct_answer": false, "rationales": ["The bike is there for storage.", "People store their bikes at the front of busses while they ride the bus.", "A bus does not have room inside for a bicycle so that have a special rack on the front to hold the bicycle until the owner exits the bus."], "image": "train2014/COCO_train2014_000000261316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568854, "question_id": "29x8nKziCBrW6gQt4PKwFT", "question": "How was this room secured by the group using it?", "choices": ["picketed", "purchased building", "sit in", "rented"], "correct_choice_idx": 3, "direct_answers": ["reservation", "door locks", "rented", "reservation", "early booking", "doors", "reservation", "close door", "locked doors", "official means"], "difficult_direct_answer": false, "rationales": ["This is a venue used for people when they have weddings and other gatherings", "This room is normally at hotels which are used for events and receptions.", "It is full of decorated round tables where people are eating. there aren't any indications of this being a room that has a standard purpose."], "image": "train2014/COCO_train2014_000000568854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398246, "question_id": "29xC35SLennUQGvVRPfG9U", "question": "What is the likely temperature of the lady's beverage?", "choices": ["hot", "cold", "room temperature", "cool"], "correct_choice_idx": 0, "direct_answers": ["hot", "hot", "hot", "hot", "hot", "hot", "85f", "hot", "hot", "hot"], "difficult_direct_answer": false, "rationales": ["It looks to be cold out so they are hot beverages.", "The beverage is likely hot coffee.", "That type of cup is designed to keep beverages more than warm for longer periods of time than plastic."], "image": "val2014/COCO_val2014_000000398246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462315, "question_id": "2A3znX3UZJ8tbLoGd2eaUF", "question": "Why are all these people here?", "choices": ["invited", "for dinner", "fighting fire", "watching show"], "correct_choice_idx": 2, "direct_answers": ["fighting fire", "truck fire", "handling accident", "moving", "accident", "fire", "truck fire", "fighting fire", "uncontrolled fire", "extinguish fire"], "difficult_direct_answer": false, "rationales": ["The people are standing near a scene that looks like a fire took place and was extinguished. there is black damage on the building.", "There are firefighters seen in their gear actively using a hose and spraying water.", "There was an accident and they are there to put the fire out."], "image": "val2014/COCO_val2014_000000462315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286328, "question_id": "2A7Smsnf5EfKie2on67nuD", "question": "What word is directly under the word Sony on the phone?", "choices": ["video", "baby", "sky", "leave"], "correct_choice_idx": 2, "direct_answers": ["sky", "sky", "sky", "sky", "sky", "sky", "sky", "sky", "sky", "sky"], "difficult_direct_answer": false, "rationales": ["The word is shown in the top middle of the screen.", "The word on the top of the screen is sky.", "That's the first work on the screen."], "image": "train2014/COCO_train2014_000000286328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344231, "question_id": "2AGU6KLBbDTXxF9TdUX6je", "question": "Why are they chasing the ball?", "choices": ["to steal", "to grab", "are confused", "to kick"], "correct_choice_idx": 3, "direct_answers": ["playing soccer", "to kick", "soccer game", "playing soccer", "playing", "to play", "playing soccer", "competition", "playing soccer", "playing soccer"], "difficult_direct_answer": false, "rationales": ["The girls are wearing different colors, so they want to get it with their feet before the other does to make a goal. in soccer, you can only use your feet.", "The girls are chasing the soccer ball because they are each trying to kick it before the other does.", "They need to direct the ball to another team member so they can try to score points"], "image": "val2014/COCO_val2014_000000344231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228624, "question_id": "2ASwVZKw8rVbBfp9kMMtR2", "question": "What does he do for a living?", "choices": ["construction", "farming", "lawyer", "teacher"], "correct_choice_idx": 1, "direct_answers": ["vending bananas", "sell food", "farming", "picks bananas", "sell food", "sell fruit", "sell fruit", "sell food", "sells food", "sell banana's"], "difficult_direct_answer": false, "rationales": ["He has a lot of produce for sale", "Most farmers will sell their crops after harvest at the farmers market.", "The man grows various kinds of crops and sells them for people to eat."], "image": "train2014/COCO_train2014_000000228624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95039, "question_id": "2AtzV9mGidSx3L2Ywurf5a", "question": "What role is being fulfilled by the kneeling gray shirted person?", "choices": ["batter", "catcher", "referee", "coach"], "correct_choice_idx": 1, "direct_answers": ["catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "ball catcher", "ball catcher"], "difficult_direct_answer": false, "rationales": ["The person is wearing pads that only a catcher in baseball would wear and is positioned behind home plate where a catcher would be.", "The kneeling player has special protective gear on that is worn due to the position he plays. he is the person that the pitcher throws the ball to while pitching to the batter.", "You can tell by his face gear and position on the mound as to what his position is."], "image": "val2014/COCO_val2014_000000095039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223670, "question_id": "2B7CZNoAuAhQBZtgp3aLsJ", "question": "What will the child try to do first?", "choices": ["rest", "flip board", "yell", "eat"], "correct_choice_idx": 1, "direct_answers": ["flip board", "flip board", "flip skateboard", "flip skateboard", "correct skateboard", "flip board", "flip skateboard", "flip it", "turn over", "flip skateboard"], "difficult_direct_answer": false, "rationales": ["He will try to flip his board around.", "The child is participating in an extreme sport. the four-wheeled object he is using is upside down.", "The child wants to flip the board."], "image": "train2014/COCO_train2014_000000223670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246650, "question_id": "2BqX4LUV4EPA3ZnZw9AjSw", "question": "What direction will the beige car travel in after the light turns green above the intersection?", "choices": ["right", "reverse", "left", "straight"], "correct_choice_idx": 3, "direct_answers": ["straight", "straight", "straight", "straight", "straight", "straight", "forward", "straight", "forward", "straight ahead"], "difficult_direct_answer": false, "rationales": ["The direction is straight.", "They do not have a blinker on so probaly will go straight.", "The beige sedan is stopped at the light and is going straight ahead based on the lane they are in."], "image": "val2014/COCO_val2014_000000246650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64800, "question_id": "2BruGEviB9jrmMvJhe47Wr", "question": "Why are the people lined up outside the silver vehicle?", "choices": ["getting ride", "getting in", "buying food", "driving it"], "correct_choice_idx": 2, "direct_answers": ["awaiting food", "buying food", "want food", "buying food", "hungry", "food", "buy cupcakes", "for cupcakes", "ordering lunch", "for cupcakes"], "difficult_direct_answer": false, "rationales": ["A food truck is parked on a busy street and people go to food trucks for lunch and other occasions.", "This vehicle sells ice cream.", "There is a large cupcake on the top of the van which means they are probably selling treats."], "image": "train2014/COCO_train2014_000000064800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138921, "question_id": "2C9Uqrr3U85YBE9Azp5Eja", "question": "What kind of transportation is this?", "choices": ["land", "rail", "air", "water"], "correct_choice_idx": 2, "direct_answers": ["airplane", "air", "airplane", "airplane", "plane", "airplane", "airplane", "jet plane", "airplane", "plane"], "difficult_direct_answer": false, "rationales": ["This would be an airplane that flies in the air.", "The vehicle is above the ground. it has wings and jet engines.", "Airplanes travel in the sky to transport people or objects from one destination to another."], "image": "train2014/COCO_train2014_000000138921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91190, "question_id": "2CngqYkvnhyTnDDrSFYh3S", "question": "This airline company is headquartered in which country?", "choices": ["india", "china", "japan", "korea"], "correct_choice_idx": 2, "direct_answers": ["japan", "uk", "japan", "italy", "japan", "america", "tokyo", "japan", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["Sotaseed air is written on the side of the plane which is likely representing the company that owns the plane. that company is located in japan.", "This is the airline for that country", "Solaseed airlines is a regional airline located in japan"], "image": "train2014/COCO_train2014_000000091190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566765, "question_id": "2CoyAWY2B8gJ9DhVfWqZR4", "question": "What is the cat on the toilet lid staring at?", "choices": ["toilet bowl", "reflection", "upper cat", "sink"], "correct_choice_idx": 2, "direct_answers": ["another cat", "upper cat", "other cat", "other cat", "another cat", "cat", "other cat", "other cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The feline on the toilet lid is locking eyes with the other feline.", "The cats are making eye contact with each other.", "There is a cat standing above the cat on the toilet lid."], "image": "train2014/COCO_train2014_000000566765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410648, "question_id": "2DVwH39rUB9Re9pU6x7pQ8", "question": "What is not unique about this animals?", "choices": ["have husks", "large ears", "have trunks", "four legs"], "correct_choice_idx": 3, "direct_answers": ["skin", "common elephants", "color", "trunk", "skin", "four legs", "height", "it's ears", "legs", "family oriented"], "difficult_direct_answer": true, "rationales": ["Many mammals have four legs and the rest of the items on the list are not as common.", "They have trunks that most animals do not have.", "Elephants are in a group. lots of animals have four legs."], "image": "train2014/COCO_train2014_000000410648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324626, "question_id": "2DggDb93b89ddhD4ks8UHN", "question": "What are the two men looking at?", "choices": ["food", "phone", "book", "letter"], "correct_choice_idx": 1, "direct_answers": ["cellphone", "cell phone", "phone", "phone", "cellphone", "phone", "phone", "phone", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["They are looking at someones on the one mans phone.", "Both men are looking at the object being held in the right hand of the man on the right side. this object is a cell phone.", "The men are looking at the phone."], "image": "train2014/COCO_train2014_000000324626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22195, "question_id": "2Di76pYbZoB2D5TAuxqKXz", "question": "What is the name of the painting on the outside of the blue train?", "choices": ["fine art", "acrylic", "graffiti", "oil"], "correct_choice_idx": 2, "direct_answers": ["graffiti", "graffiti", "blue", "graffiti", "railway", "means", "graffiti", "jeans", "graffiti", "graffiti"], "difficult_direct_answer": false, "rationales": ["The name is graffiti.", "It was done casually, not by someone at the manufacturing plant or a \"classically\" trained artist.", "It is random artwork spray painted on the train."], "image": "train2014/COCO_train2014_000000022195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69182, "question_id": "2ECi8bHNcnjeCdghvFfTCq", "question": "What can people do here besides skateboarding?", "choices": ["play basketball", "play tennis", "swim", "hike"], "correct_choice_idx": 2, "direct_answers": ["visit friends", "talk", "scooter", "watch skaters", "roller skate", "swim", "swim", "spectate", "beach", "bicycle"], "difficult_direct_answer": true, "rationales": ["The area has a drain and is a recessed concrete pit that's water proof.", "If you fill the bowl with water, you can swim.", "The people are at a skatepark that could be filled with water for swimming."], "image": "train2014/COCO_train2014_000000069182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389549, "question_id": "2EUbaA8LmbwZF3WRGXVJRA", "question": "Why is the brown object placed near the fence?", "choices": ["to disguise", "to inform", "decoration", "to warn"], "correct_choice_idx": 1, "direct_answers": ["give information", "provide information", "information plaque", "information", "information", "to inform", "information", "information area", "information", "historical information"], "difficult_direct_answer": false, "rationales": ["The brown objects gives directions and information.", "The poster to the right is an informational poster about the area.", "It is a sign with information on it."], "image": "val2014/COCO_val2014_000000389549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149449, "question_id": "2EbZu7iEtdGJzJCeNgwQsG", "question": "What are people watching the elephant likely to use to shoot it?", "choices": ["camera", "gun", "bow/ arrows", "darts"], "correct_choice_idx": 0, "direct_answers": ["water", "camera", "camera", "water", "nothing", "camera", "gun", "camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The people have a camera.", "People are likely to shoot a photo of this elephant with a camera.", "Tourists are standing around an elephant enclosure. people take pictures at zoos."], "image": "train2014/COCO_train2014_000000149449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147568, "question_id": "2EgWEpcCyB2LEDTEc5LQkC", "question": "Where is the man sitting?", "choices": ["bike", "couch", "dugout", "bed"], "correct_choice_idx": 2, "direct_answers": ["dugout", "dugout", "bench", "bench", "bleachers", "dugout", "dugout", "dugout", "dugout", "dugout"], "difficult_direct_answer": false, "rationales": ["The man is in a dugout.", "The seating is located with baseball bats and it has step seating which indicates that the man is at a baseball field in the area where players wait to bat next.", "He is sitting in the dugout where players waiting to play sit."], "image": "train2014/COCO_train2014_000000147568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485129, "question_id": "2EjooY7GfUBSDqbKifPdb8", "question": "If the person here falls off the board what might help them retrieve their board?", "choices": ["leg rope", "dog", "shark", "satellite dish"], "correct_choice_idx": 0, "direct_answers": ["cable", "leg rope", "surfboard leash", "leash", "string", "rope", "hand", "wave", "leg rope", "cord"], "difficult_direct_answer": true, "rationales": ["There is a cable that attaches the board to the surfer's lower appendage. a shark, satellite dish, or dog would not help them retrieve the board.", "A person is surfing. surfers connect their boards to their legs with a tether.", "Surfers have a tether between their ankle and the board."], "image": "val2014/COCO_val2014_000000485129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567205, "question_id": "2Et6VPVRsaB7ZNoD5uEhfh", "question": "The technician on the sidewalk is in the process of repairing what item next to the SUV?", "choices": ["crosswalk signal", "ticket kiosk", "parking meter", "payphone"], "correct_choice_idx": 2, "direct_answers": ["parking meter", "meter", "parking meter", "meter", "parking meter", "meter", "parking meter", "parking meter", "parking meter", "parking meter"], "difficult_direct_answer": false, "rationales": ["The man is taking apart a parking meter which is obvious by what the picture is showing.", "The man is repairing the meters used for parking.", "It is on a main road next to a parking spot, where paid parking is offered."], "image": "val2014/COCO_val2014_000000567205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219848, "question_id": "2EyRwb6xcnEdGWYP38dBr5", "question": "What should they have worn before starting the activity?", "choices": ["life jacket", "headband", "wristband", "helmet"], "correct_choice_idx": 0, "direct_answers": ["life jackets", "safety suit", "flotation jacket", "life vest", "life jacket", "life vests", "life vests", "life jacket", "wetsuit", "hats"], "difficult_direct_answer": false, "rationales": ["People wear that in case they fall in the water.", "Life jackets keep people afloat.", "The people in the raft should wear life jackets because they'll be safe if they fall in the water."], "image": "val2014/COCO_val2014_000000219848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395964, "question_id": "2Ez3AmPxxsBshAnSCeLm7C", "question": "What is the boy on the right holding?", "choices": ["sword", "shield", "pumpkin", "mace"], "correct_choice_idx": 0, "direct_answers": ["sword", "sword", "sword", "sword", "sword", "sword", "gun", "gun", "sword", "sword"], "difficult_direct_answer": false, "rationales": ["The long and shiny weapon is in his hand.", "The item the boy is holding has a handle, cross guard, and long edged blade.", "The object looks metallic with a handle and symmetrical things under the blade."], "image": "train2014/COCO_train2014_000000395964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539665, "question_id": "2F4QUEBvL7D4YeCrjMWtJw", "question": "What kind of item is the man very likely to be carrying in the case?", "choices": ["equipment", "costume", "clothing", "stringed instrument"], "correct_choice_idx": 3, "direct_answers": ["cello", "bass", "bass viol", "cello", "instrument", "cello", "music equipment", "cello", "instrument", "stringed instrument"], "difficult_direct_answer": false, "rationales": ["Looks like a double bass case.", "The shape of the item looks like a large instrument such as a cello.", "A man is carrying a large musical instrument case. basses are large stringed instruments."], "image": "train2014/COCO_train2014_000000539665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21083, "question_id": "2FKBP4dRVvGJvukEQ9a76c", "question": "What is the silver vehicle by the building used for?", "choices": ["transport luggage", "sell tickets", "clean grounds", "shuttle"], "correct_choice_idx": 0, "direct_answers": ["rail transport", "transport luggage", "train", "pushing goods", "transportation", "transportation", "train", "train station", "transport", "transportation"], "difficult_direct_answer": false, "rationales": ["It is used to move heavy bags onto the train quickly.", "There are suitcases on top of it.", "The vehicle transports luggage."], "image": "train2014/COCO_train2014_000000021083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270163, "question_id": "2FQYhL4vCZP5riVcSEsRdB", "question": "What powers the boat farthest away?", "choices": ["sail", "oars", "motor", "nothing"], "correct_choice_idx": 2, "direct_answers": ["motor", "gasoline", "motor", "motor", "motor", "motor", "motor", "gasoline", "motor", "motor"], "difficult_direct_answer": false, "rationales": ["The boat in the distance has a motor on the back of it that helps it to move.", "The boat further out on the river has a motor on the back and is moved by that engine.", "The boat in the back has an engine."], "image": "train2014/COCO_train2014_000000270163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3939, "question_id": "2FSperyeaGTP7m2B8S53G4", "question": "What toy is held by more children?", "choices": ["magic kit", "tape", "teddy bear", "ez bake"], "correct_choice_idx": 2, "direct_answers": ["teddy bear", "bear", "teddy bear", "bear", "teddy bear", "teddy bear", "teddy bear", "teddy bear", "bear", "teddy bear"], "difficult_direct_answer": false, "rationales": ["There are two stuffed bears in the picture.", "There are several children holding teddy bears.", "Three children stand together, two holding plush animals, and one holding tape."], "image": "val2014/COCO_val2014_000000003939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208618, "question_id": "2FgXSubwAUnTtLdGoEWQyi", "question": "What color is the long haired man's vest?", "choices": ["green", "red", "violet", "orange"], "correct_choice_idx": 0, "direct_answers": ["green", "green", "green", "blonde", "green", "light green", "brown", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The man is wearing a tie and vest in pale green, perhaps chosen to match the bridesmaids' dresses in a wedding.", "A man is in a tuxedo with a light green and white vest under the jacket.", "The vest is the same color as the grass."], "image": "train2014/COCO_train2014_000000208618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509030, "question_id": "2Fm3gjfdbGC2WFgXQs54zF", "question": "What base is he on his way to?", "choices": ["home", "second", "first", "third"], "correct_choice_idx": 2, "direct_answers": ["running", "home", "first", "first", "first", "first", "first", "home", "first", "first"], "difficult_direct_answer": false, "rationales": ["You can see the bat on the ground so he just hit the ball and will be running to the first base.", "The young boy is lunging forward from first base which means his is running towards the next base.", "The bat is on the ground so he just hit the ball and is running to first base."], "image": "train2014/COCO_train2014_000000509030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509415, "question_id": "2G43btuthrpLgAq7JDtcDf", "question": "Why is he inside the wave?", "choices": ["fell there", "is lost", "swam there", "showing off"], "correct_choice_idx": 3, "direct_answers": ["surfing", "surfing", "surfing", "surfing", "showing off", "surfing", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["It's likely a. that said, it could also be just to prove his skill. conquering tube riding is a surfer dream.", "He is doing a trick to show off what he can do on the waves.", "He is off his board so he probably lost balance going over the top"], "image": "train2014/COCO_train2014_000000509415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462026, "question_id": "2G9AJvMhAqWmo85MHJHrVz", "question": "What type of stand is holding up the motorcycle?", "choices": ["music stand", "display stand", "kick stand", "grand stand"], "correct_choice_idx": 1, "direct_answers": ["kickstand", "kickstand", "bike stand", "wheel stand", "paddock stand", "rack", "kickstand", "kickstand", "metal", "display stand"], "difficult_direct_answer": false, "rationales": ["This is a stand to put the bike on display.", "A stand used to show the bike off to people that want to see it.", "The display stand keeps the motorcycle up."], "image": "val2014/COCO_val2014_000000462026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145741, "question_id": "2GKUn9BGqKQTjVfYDfhhjL", "question": "Which item that is missing would help complete the home office setup?", "choices": ["mouse pad", "laptop", "microphone", "web cam"], "correct_choice_idx": 2, "direct_answers": ["things", "printer", "phone", "speaker", "printer", "printer", "printer", "speakers", "speakers", "microphone"], "difficult_direct_answer": false, "rationales": ["There is already a silver laptop on the right, a circular mouse pad underneath the mouse, and a web cam on top of the monitor on the left so none of those items are missing.", "The item is the microphone.", "The desk is covered with equipment for a home office setup but it is missing a microphone."], "image": "train2014/COCO_train2014_000000145741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180470, "question_id": "2GMAffwJPvDSBZhXkmGvPa", "question": "What kind of outfit is the statue dressed in?", "choices": ["suit", "pajamas", "athletic wear", "swim wear"], "correct_choice_idx": 0, "direct_answers": ["suit", "tuxedo", "suit", "suit", "suit", "suit", "suit", "dress suit", "suit", "formal"], "difficult_direct_answer": false, "rationales": ["The statue is formally dressed and is wearing a coat, tie, and dress shoes.", "The statue on the sidewalk is dressed in a three-piece suit.", "The statue is wearing a tie, dress shoes, and other formal pieces of clothing."], "image": "train2014/COCO_train2014_000000180470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304735, "question_id": "2GXveGE4Lr5ZvhMq7GQNJQ", "question": "At which event do these people pose?", "choices": ["exhibition", "mall", "office meeting", "zoo"], "correct_choice_idx": 0, "direct_answers": ["movie premiere", "party", "gala", "gathering", "convention", "party", "party", "exhibition", "party", "awards"], "difficult_direct_answer": false, "rationales": ["Based on their clothes they are not at a zoo or a mall and there is no meeting taking place.", "People would be at an exhibit.", "It looks like there are several stands behind them."], "image": "train2014/COCO_train2014_000000304735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340998, "question_id": "2GbNQjqZxokHFcjY7EWW78", "question": "What occupation does the person with the loaded bike beside them?", "choices": ["patisserie", "fortune teller", "florist", "food seller"], "correct_choice_idx": 2, "direct_answers": ["florist", "florist", "florist", "sell flowers", "florist", "flower salesmen", "florist", "florist", "florist", "florist"], "difficult_direct_answer": false, "rationales": ["A person is riding a bike loaded down with flowers. florists sell flowers.", "The person's bike is loaded with flowers so they work at a flower shop selling and arranging bouquets.", "The occupation is a florist."], "image": "val2014/COCO_val2014_000000340998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536166, "question_id": "2Gsxvj4m4cy3pC9PqoJpNG", "question": "How did this man get to this location?", "choices": ["walk", "uber", "via horseback", "cow back"], "correct_choice_idx": 2, "direct_answers": ["rode horse", "by horse", "horse", "horseback", "rode", "horse", "via horseback", "horseback", "horse", "by horse"], "difficult_direct_answer": false, "rationales": ["Men are riding on horses.", "The man wearing the hat rode a horse to get to where he is standing.", "The man does not appear to be near any road or markings of civilization but does appear on a horse. to get to this location without roads it is most likely they appeared on the horse they are currently shown on."], "image": "val2014/COCO_val2014_000000536166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258538, "question_id": "2GtKwL7VMhTpGiGcpgTp2B", "question": "What is the purpose of the umbrellas?", "choices": ["decorative", "hide people", "sun protection", "rain protection"], "correct_choice_idx": 2, "direct_answers": ["for shade", "shade", "shade", "shield sun", "protection", "rain", "sun protection", "protection", "shade", "shade"], "difficult_direct_answer": false, "rationales": ["The outdoors are sunny.", "The umbrellas are used for sun protection.", "Umbrella can serve two purposes, both protection from light and water."], "image": "train2014/COCO_train2014_000000258538.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79869, "question_id": "2GycmJug2sT6jXAsjnpeYf", "question": "The person who uses this space likes to collect what?", "choices": ["key chains", "trolls", "cleaning supplies", "lap tops"], "correct_choice_idx": 0, "direct_answers": ["key chains", "keychain", "keychain", "toys", "toys", "keychain", "keychain", "keychain", "keychain", "keychain"], "difficult_direct_answer": false, "rationales": ["The person will collect keychains.", "They have eight keychains hanging on the wall in front of the desk.", "The person who uses this space clearly collects key chains because there are almost ten in the space"], "image": "train2014/COCO_train2014_000000079869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24636, "question_id": "2HEDoy2SFfwFxBScbVbQqM", "question": "What bathroom fixture is to the left of the towel rack?", "choices": ["bidet", "sink", "toilet", "shower"], "correct_choice_idx": 2, "direct_answers": ["to holder", "light fixture", "paper dispenser", "to holder", "roll holder", "toilet paper", "toilet roll", "toilet paper", "toiletpaper roll", "toilet"], "difficult_direct_answer": false, "rationales": ["There is a toilet paper roll showing.", "The toilet is to the left of the towel rack.", "Though out of frame all bathrooms have toilets."], "image": "val2014/COCO_val2014_000000024636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71310, "question_id": "2HHjJPad78rFXxxE2HVY4K", "question": "A car turning which way is a hazard to this man?", "choices": ["reversing", "straight", "left", "right"], "correct_choice_idx": 3, "direct_answers": ["right", "right", "right", "right", "right", "right", "right", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["A man is skateboarding in the right turn lane of a road.", "The man on the skateboard is in the lane designated for cars that are turning right.", "The car turning right would be in the right lane which is where he is currently skating. this would be dangerous for him because the car could hit him."], "image": "train2014/COCO_train2014_000000071310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513577, "question_id": "2HM53uQAH3gzaPkudXTvJj", "question": "What is he doing to the tie?", "choices": ["stealing it", "straightening it", "tying it", "stealing it"], "correct_choice_idx": 2, "direct_answers": ["straightening it", "straightening", "tying", "fixing it", "tying it", "tieing it", "tying it", "tying", "close", "tying"], "difficult_direct_answer": false, "rationales": ["The man is tying it.", "He is tying it for the other man.", "He is started to tie the tie."], "image": "train2014/COCO_train2014_000000513577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468319, "question_id": "2HUoaEcLEX8KSDsjvY4g8L", "question": "What action will he take with the ball?", "choices": ["roll", "dunk", "swing", "dribble"], "correct_choice_idx": 2, "direct_answers": ["hitting it", "hit it", "throw", "hit it", "hit", "hit", "hit", "swing", "playing tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["He needs to hit the ball to the other player", "He has thrown it up in the air and will hit it with the racket", "He is holding a racket under the ball. rackets are used to hit things like the ball."], "image": "train2014/COCO_train2014_000000468319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268413, "question_id": "2HamnfFKZ3NefA7VBsm9Wg", "question": "What animal produced the food on the tray?", "choices": ["goat", "chicken", "pig", "cow"], "correct_choice_idx": 1, "direct_answers": ["pig", "chicken", "pig", "chicken", "chicken", "chicken", "food", "chickens", "cow", "chicken"], "difficult_direct_answer": false, "rationales": ["Eggs come from chickens.", "There are eggs.", "The tray has eggs. cows, pigs, and goats are mammals that do not produce eggs."], "image": "val2014/COCO_val2014_000000268413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581815, "question_id": "2HdMkhS8q7QHExjTB59id3", "question": "What type feet do these birds have?", "choices": ["human like", "talons", "webbed", "none"], "correct_choice_idx": 2, "direct_answers": ["webbed", "webbed", "webbed", "webbed", "water", "webbed", "webbed", "webbed feet", "web", "webbed"], "difficult_direct_answer": false, "rationales": ["These birds are ducks and ducks are known to have webbed feet.", "Ducks are in water. ducks have webbed feet.", "The birds have webbed feet."], "image": "train2014/COCO_train2014_000000581815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249288, "question_id": "2JHdiXdz5wNYxHDo3tLsbQ", "question": "WHich is a chain burger restaurant sign?", "choices": ["gifts", "mcdonalds", "forever 21", "disney"], "correct_choice_idx": 1, "direct_answers": ["mcdonalds", "mcdonalds", "mcdonalds", "mcdonalds", "mcdonalds", "mcdonald's", "mcdonalds", "mcdonalds", "mcdonalds", "mcdonald's"], "difficult_direct_answer": false, "rationales": ["This is a recognizable food logo found in many areas.", "This is a chain restaurant that sells burgers. its logo is golden arches.", "You can tell by the golden arches as to what fast food place is shown here."], "image": "train2014/COCO_train2014_000000249288.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531244, "question_id": "2JNGou59iXCn3NxR2Ug5wt", "question": "Why is the woman touching the woman's belly?", "choices": ["blessing baby", "baby moving", "joking around", "showing love"], "correct_choice_idx": 1, "direct_answers": ["watch movements", "pregnant", "feel baby", "feel baby", "feel baby", "feeling", "pregnant", "feel baby", "pregnancy", "baby moving"], "difficult_direct_answer": false, "rationales": ["The woman on the right is pregnant. the other woman is feeling the actions of the fetus.", "The woman is very pregnant and the baby is probably active", "The woman is pregnant and the other one is feeling the baby moving."], "image": "val2014/COCO_val2014_000000531244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137420, "question_id": "2JbbFqFz6QvReaUgNpqAmr", "question": "Which bird can grind their own calcium supplements?", "choices": ["peacock", "dove", "parrot", "crow"], "correct_choice_idx": 2, "direct_answers": ["parrot", "parrots", "parrot", "parrot", "parrots", "parrots", "parrot", "parrot", "parrot", "parrots"], "difficult_direct_answer": false, "rationales": ["There is a bright colored bird in the picture.", "The bird is the parrot.", "Parrots are known to be able to grind their own calcium with their beaks."], "image": "train2014/COCO_train2014_000000137420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287900, "question_id": "2Jd2oFdoS5hpyfMi8wKHxx", "question": "Why is he carrying the surfboard?", "choices": ["no wheels", "stole it", "exercise", "found it"], "correct_choice_idx": 0, "direct_answers": ["transportation", "to surf", "for surfing", "heading home", "no wheels", "on land", "leaving", "going home", "he's done", "surfing"], "difficult_direct_answer": true, "rationales": ["Most likely he is a surfer for the thrill but could also do it for staying in shape.", "A surfboard is made to be utilized on water, but in order to get it to the water, it must be carried.", "The board has to be carried to and from your car."], "image": "train2014/COCO_train2014_000000287900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237632, "question_id": "2JizQxpcQbViUouNgEYMET", "question": "What state is the computer most likely in?", "choices": ["off", "starting up", "at desktop", "processing video"], "correct_choice_idx": 1, "direct_answers": ["starting up", "sleep mode", "new york", "usa", "sleep", "rest", "sleep", "fl", "california", "california"], "difficult_direct_answer": false, "rationales": ["The computer is likely starting up since it's giving the loading message.", "The boot screen is active.", "It has the loading screens visible"], "image": "train2014/COCO_train2014_000000237632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80040, "question_id": "2Jxo7EzfEguLnWDuERYVeH", "question": "Which of the bowls of food is a side dish of the main meal?", "choices": ["top left", "bottom left", "bottom right", "top right"], "correct_choice_idx": 1, "direct_answers": ["silver bowl", "greens", "bread one", "bottom left", "bread", "bread bowl", "mushrooms", "all 4", "all", "top right"], "difficult_direct_answer": true, "rationales": ["The item is not in a white bowl and is laid out as though it were an appetizer.", "The bottom left is the side dish.", "The item on the bottom left is not contained in a small bowl which means it is the main dish."], "image": "train2014/COCO_train2014_000000080040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108452, "question_id": "2K74DjwV2oYcPHcN5WfEp3", "question": "These animals are known to do what?", "choices": ["hibernate", "gallop", "fly", "swim"], "correct_choice_idx": 1, "direct_answers": ["run", "run", "run", "run", "gallop", "run", "gallop", "gallop", "gallop", "run"], "difficult_direct_answer": false, "rationales": ["The horses gallop.", "They can do other things but galloping is fast.", "These animals can move really quickly and gracefully."], "image": "val2014/COCO_val2014_000000108452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341712, "question_id": "2KESXsxu94YnYqdJNskkgw", "question": "What can you check out from that van?", "choices": ["electronics", "clothes", "games", "books"], "correct_choice_idx": 3, "direct_answers": ["books", "picture", "painting", "books", "books", "books", "books", "books", "guitar", "artwork"], "difficult_direct_answer": false, "rationales": ["It has a library website advertised on its door.", "According to the website on the side of the van, the van belongs to the brooklyn public library.", "The website on the door indicates that it is a library."], "image": "val2014/COCO_val2014_000000341712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416545, "question_id": "2KGXLvnAUL5FwMvGXLJ6pv", "question": "The red metal marker attached to the fire hydrant is most useful during which season?", "choices": ["fall", "winter", "spring", "summer"], "correct_choice_idx": 1, "direct_answers": ["winter", "spring", "summer", "summer", "winter", "winter", "summer", "snow", "winter", "summer"], "difficult_direct_answer": false, "rationales": ["In winter in case the hydrant is covered in snow.", "In winter the marker can make it clear where the hydrant is even if there's snow.", "Snow can hide the fire hydrant. it is easily visible in seasons that do not have snow."], "image": "train2014/COCO_train2014_000000416545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50350, "question_id": "2KKVSWfMUFqbN9HbB9KHzr", "question": "What type of rolls are in the wicker basket?", "choices": ["jelly", "toilet", "camera", "dinner"], "correct_choice_idx": 1, "direct_answers": ["toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet", "toilet paper", "toilet"], "difficult_direct_answer": false, "rationales": ["The rolls are paper that is used to wipe yourself after going to the bathroom.", "These are in the bathroom and used as toilet paper.", "There is a toilet by the basket."], "image": "val2014/COCO_val2014_000000050350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440244, "question_id": "2KT3TUaz9BE6zGXcsPqRBS", "question": "What job does the person with the larger item on their head hold?", "choices": ["doctor", "mascot", "janitor", "lawn mower"], "correct_choice_idx": 1, "direct_answers": ["mascot", "mascot", "mascot", "mascot", "hat", "entertain", "entertain", "mascot", "mascot", "entertain"], "difficult_direct_answer": false, "rationales": ["Most sports teams have mascot's that dress up as a character depicting the team.", "The person is wearing a costume of the team's logo character.", "The person is wearing a typical mascot costume."], "image": "train2014/COCO_train2014_000000440244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394058, "question_id": "2KvcoZ2Hhauw3tWzRbFBV2", "question": "Why are these people covering their faces?", "choices": ["keeping warm", "stopping covid", "for style", "as punishment"], "correct_choice_idx": 0, "direct_answers": ["cold protection", "it's cold", "wind protection", "cold", "cold", "warmth", "cold", "keeping warm", "protection", "cold"], "difficult_direct_answer": false, "rationales": ["They are in the snow which is cold.", "They are standing on a snowy mountain.", "The people are in the cold."], "image": "train2014/COCO_train2014_000000394058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79503, "question_id": "2KxBffd8N4VXfKisowNqNH", "question": "Where does the URL text actually exist?", "choices": ["on boy", "on grass", "on shoe", "image file"], "correct_choice_idx": 3, "direct_answers": ["on internet", "internet", "arkansas", "image file", "online", "website", "arkansas website", "online", "graphic", "baseball field"], "difficult_direct_answer": true, "rationales": ["Because it is a url you can easily surmise it is a computerized photo.", "The url indicates the image file.", "The photo belongs to the website printed on it."], "image": "train2014/COCO_train2014_000000079503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431660, "question_id": "2L75TZTqq3MjHZkKXPULBM", "question": "In what year did number 5 retire?", "choices": ["2015", "2006", "1996", "2011"], "correct_choice_idx": 0, "direct_answers": ["2015", "2015", "2015", "2015", "2015", "2015", "2015", "2015", "2015", "2015"], "difficult_direct_answer": false, "rationales": ["He retired in 2015", "The name shows that the person was famous and the year he retire.", "Retired in 2015"], "image": "val2014/COCO_val2014_000000431660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411617, "question_id": "2L9AhsqC3g8GKCSzvM8qyq", "question": "What type of game is being played?", "choices": ["card", "athletic", "board", "video"], "correct_choice_idx": 3, "direct_answers": ["racing", "auto racing", "racing", "auto racing", "racing", "racing", "racing", "racing", "video", "racing"], "difficult_direct_answer": false, "rationales": ["A screenshot of a car on a track is seen on a television. there are many racing video games.", "The game is being played on a television on a console device.", "Games on the television are video games."], "image": "train2014/COCO_train2014_000000411617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116963, "question_id": "2LDCipjoCqAACXeiFhgX4w", "question": "What companies logo can be seen on the half pipe?", "choices": ["dc", "etnies", "billabong", "quicksilver"], "correct_choice_idx": 3, "direct_answers": ["tony hawk", "tony hawk", "quicksilver", "tony hawk", "quiksilver", "normal", "ride", "tony hawk", "ride", "quicksilver"], "difficult_direct_answer": false, "rationales": ["Quicksilver can be seen as a sponsor as it is often a sponsor of skateboarding events.", "Quicksilver logo is on the top.", "The logo is quicksilver."], "image": "train2014/COCO_train2014_000000116963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122902, "question_id": "2LKyyXzRnjLrrAePAcfDm3", "question": "How is this bus different from traditional US buses?", "choices": ["electric", "double wide", "windowless", "double-decker"], "correct_choice_idx": 3, "direct_answers": ["double decker", "double decker", "double decker", "double decker", "double decker", "double decker", "double decker", "double-decker", "double decker", "double decker"], "difficult_direct_answer": false, "rationales": ["This bus has two levels visible through the front windshield. most buses operating in the us have one level so this would be distinct because of the double deck.", "Most us buses do not include more than one layer or floor.", "This bus has stairs that go to an upper deck on the bus."], "image": "train2014/COCO_train2014_000000122902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129510, "question_id": "2LUfcGhjQpb3hc2KiMnSoh", "question": "What surface are the boys playing on?", "choices": ["indoor hard", "grass", "clay", "outdoor hard"], "correct_choice_idx": 3, "direct_answers": ["tennis court", "turf", "outdoor hard", "clay", "court", "rubber", "court", "tennis court", "tennis court", "clay"], "difficult_direct_answer": false, "rationales": ["Two guys are playing tennis on a court that is outside.", "People are playing tennis on a court outside.", "The boys are on a tennis court."], "image": "train2014/COCO_train2014_000000129510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51702, "question_id": "2LbzYAXpBvtgXT4NoH9wVN", "question": "In what sort of setting do these people find themselves?", "choices": ["mall", "desert", "urban", "rural"], "correct_choice_idx": 2, "direct_answers": ["boardwalk", "city", "urban", "urban", "city", "metropolitan", "city", "city", "city", "bokkay"], "difficult_direct_answer": false, "rationales": ["There are mid-rise and high-rise buildings behind the people.", "This is a city since there are skyscrapers.", "The setting is urban."], "image": "val2014/COCO_val2014_000000051702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517517, "question_id": "2LmJjHgSk3dz9kt9tYCrhF", "question": "Which one of these is a sister island to this location?", "choices": ["jamaica", "bonaire", "barbados", "cuba"], "correct_choice_idx": 1, "direct_answers": ["curacao", "curacao", "bahamas", "aruba", "bonaire", "white bus", "curacao", "curacao", "palm island", "jamaica"], "difficult_direct_answer": false, "rationales": ["The markings on the side of the vehicle indicate that it is the aruba banana bus. jamaica, barbados, and cuba are not sister islands to aruba.", "The island is bonaire.", "The bus has aruba on it. that means that it has to be close to bonaire."], "image": "train2014/COCO_train2014_000000517517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159269, "question_id": "2MDzJQaYUiTxWxpJmAmgFQ", "question": "What is near the table?", "choices": ["cow", "baby", "cat", "chair"], "correct_choice_idx": 3, "direct_answers": ["chairs", "ocean", "chairs", "umbrella", "chair", "chairs", "plastic", "umbrellas", "chair", "umbrellas"], "difficult_direct_answer": false, "rationales": ["There are no humans or animals near the table.", "The table has a chair.", "There are no animals or babies. there are objects that can be used for sitting."], "image": "val2014/COCO_val2014_000000159269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385390, "question_id": "2MsJhFXZuWPHk27VQgBQfp", "question": "What is the flush on the toilet called?", "choices": ["toilet flapper", "tap", "influencer", "shower pipe"], "correct_choice_idx": 0, "direct_answers": ["handle", "handle", "flusher", "flusher", "handle", "flusher", "toilet flapper", "handle", "handle", "handle"], "difficult_direct_answer": false, "rationales": ["The flush is generally known as the toilet flapper.", "The toilet flapper flushes the toilet.", "It is called a toilet flapper to describe the movement that it makes when flushing the toilet."], "image": "train2014/COCO_train2014_000000385390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446521, "question_id": "2MuNNjofWHs6BuHn2MGszZ", "question": "Why is the man on the right wearing the vest?", "choices": ["style", "fashion", "visibility", "cosplay"], "correct_choice_idx": 2, "direct_answers": ["safety", "security", "be seen", "visibility", "municipal worker", "for safety", "safety vest", "more visibility", "sanitation worker", "safety"], "difficult_direct_answer": true, "rationales": ["A worker is wearing bright yellow. bright colors are used for visibility.", "A man is walking in a bright yellow work vest and green pants that roadworkers wear. the man is walking along a sidewalk.", "He is wearing a safety vest so he can be seen."], "image": "train2014/COCO_train2014_000000446521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509339, "question_id": "2N8UpuspqzA37Ch5ZkLhPY", "question": "What does the rectangular object on the wall on the left allow for?", "choices": ["storage", "vision", "water flow", "electrical power"], "correct_choice_idx": 3, "direct_answers": ["plug cords", "electric power", "electrical power", "electric outlet", "electrical power", "outlets", "plugs", "display", "electric plug", "electricity"], "difficult_direct_answer": true, "rationales": ["The object is for power.", "An outlet is on the wall. outlets are used for electricity.", "This is an electrical socket to plug in electronics."], "image": "train2014/COCO_train2014_000000509339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407905, "question_id": "2N9EmesVddbgTxGytF98GB", "question": "What are the people in the blue car doing?", "choices": ["shopping", "filming movie", "car safari", "drive-in movie"], "correct_choice_idx": 2, "direct_answers": ["car safari", "riding", "watching zebras", "site seeing", "camera", "watching zebras", "viewing zebras", "zebra watching", "watching animals", "watching zebras"], "difficult_direct_answer": false, "rationales": ["The people in the blue car are looking at zebras.", "The people appear to be observing the animals as they drive by.", "The people are on a safari."], "image": "train2014/COCO_train2014_000000407905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431885, "question_id": "2NKrCrQRfovHrvpo7DuUGF", "question": "What is the skier passing through?", "choices": ["security", "ride", "inspection", "gate"], "correct_choice_idx": 3, "direct_answers": ["snow", "gate", "snow", "gate", "gate", "gate", "gate", "gate", "gate", "gate"], "difficult_direct_answer": false, "rationales": ["There is a metal gate near the skiier.", "The person is passing through a gate to go ski.", "A gate is typically something you might have to go through to get to your destination. it is clear that the skier is walking through a gate; there are no rides visible and it's clear that there are no inspection or security issues here."], "image": "train2014/COCO_train2014_000000431885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430375, "question_id": "2NTPdd2vgHZomFccHKsG5p", "question": "What are the bars for?", "choices": ["holds poles", "holds skis", "stand waiting", "holds clothing"], "correct_choice_idx": 2, "direct_answers": ["stand waiting", "entry", "pushing off", "waiting turn", "holding onto", "guidance", "support", "form lines", "entry", "holding up"], "difficult_direct_answer": true, "rationales": ["Aside from the person, the bars do not hold anything. the person will take the lift, but it is not there yet.", "The bars are for the people who stand there waiting for the sky lift.", "A person is standing on skiis looking for something before going."], "image": "train2014/COCO_train2014_000000430375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133050, "question_id": "2NU2mVhjgHK6TRxPPBoCoh", "question": "What should someone use first to treat the carrot before using the knife to cut it?", "choices": ["fork", "cutter", "peeler", "spoon"], "correct_choice_idx": 2, "direct_answers": ["peeler", "peeler", "water", "water", "peeler", "peeler", "peeler", "water", "peeler", "peeler"], "difficult_direct_answer": false, "rationales": ["An unpeeled carrot is on table. carrots are peeled before eating.", "The skin of the carrot needs to be removed.", "The skin of the carrot would be awkward to remove with the knife alone, so an additional tool would be needed."], "image": "val2014/COCO_val2014_000000133050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401071, "question_id": "2NtoLZpK2LeBy78XwyUUXC", "question": "What activity might those under umbrellas take part in at some point during the day?", "choices": ["drag racing", "binge eating", "betting", "swimming"], "correct_choice_idx": 3, "direct_answers": ["swimming", "swimming", "swimming", "tanning", "drinks", "swimming", "swimming", "swimming", "sun bathing", "shade"], "difficult_direct_answer": false, "rationales": ["They are lounging next to the water.", "There is a large pool of water by the sand.", "The people under the yellow umbrellas are closer to the ocean and have better access to enter it."], "image": "train2014/COCO_train2014_000000401071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354167, "question_id": "2P53cFpZVumY6quP7QbqiA", "question": "What leave imprints in the snow with every step they take?", "choices": ["nothing", "their shadow", "their shoes", "their hands"], "correct_choice_idx": 2, "direct_answers": ["humans", "footprints", "boots", "boots", "shoes", "their shoes", "people", "humans", "skis", "feet"], "difficult_direct_answer": false, "rationales": ["When you walk on snow it will always leave some imprint.", "Imprints in snow can only come from shoes. shadows don't make physical marks, there would be no reason to put your hands in snow.", "In the snow, one can see imprints of the items worn on the people's feet."], "image": "train2014/COCO_train2014_000000354167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32236, "question_id": "2P5ZPboExGbqsRf9mj5Jx5", "question": "What fruit is seen in the cup on the truck?", "choices": ["mango", "blueberry", "raspberry", "banana"], "correct_choice_idx": 2, "direct_answers": ["strawberries", "raspberry", "raspberry", "strawberry", "strawberry", "strawberry", "raspberry", "strawberries", "strawberry", "strawberry"], "difficult_direct_answer": false, "rationales": ["A red berry is seen in a cup. raspberries are red.", "The fruit is red, not yellow or blue.", "Though there are a number of fruits shown the main one is the raspberry."], "image": "train2014/COCO_train2014_000000032236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472676, "question_id": "2P6y5WUsJGoWaLacTsiM6b", "question": "What are they doing with the surfboards?", "choices": ["selling them", "taking home", "tossing them", "riding them"], "correct_choice_idx": 1, "direct_answers": ["taking home", "carrying", "surfing", "walking", "walking", "walking", "walking", "carrying them", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["They are carrying them probably to their cars to leave", "They are out of the water and carrying their surfboards.", "They are walking along the shore and not surfing."], "image": "train2014/COCO_train2014_000000472676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423025, "question_id": "2PJ3yj9TxbFNbW4K7zsgfr", "question": "What type of enthusiasts are gathering here?", "choices": ["gamers", "democrats", "bikers", "liberals"], "correct_choice_idx": 2, "direct_answers": ["motorcycle", "bikers", "bike", "motorbike enthusiasts", "bike enthusiasts", "motorcycle", "motorcyclists", "racing enthusiasts", "bikers", "motorcycle"], "difficult_direct_answer": false, "rationales": ["You can tell by the multitude of bikes as to what hobbies they are into.", "One can see the motorcycles parked there.", "The people gathered all love motorcycles."], "image": "train2014/COCO_train2014_000000423025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125995, "question_id": "2PntAN5muV4rR9YetMfHVF", "question": "What is in the room?", "choices": ["television", "toothbrush", "elephant", "bed"], "correct_choice_idx": 1, "direct_answers": ["toothbrush", "sink", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "sink", "sink", "toothbrush", "toothbrush"], "difficult_direct_answer": false, "rationales": ["This is a bathroom. a dental hygiene item is located to the left of the faucet.", "There s a toothbrush by the faucet.", "A toothbrush is on the side of a sink."], "image": "train2014/COCO_train2014_000000125995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117366, "question_id": "2PxT93kHzLqNfsc2VsGXpt", "question": "Beta carotene rich vegetable in the image is?", "choices": ["cabbage", "broccoli", "beet", "carrot"], "correct_choice_idx": 3, "direct_answers": ["carrot", "carrot", "carrot", "carrot", "carrots", "carrot", "carrot", "carrots", "carrot", "carrots"], "difficult_direct_answer": false, "rationales": ["The carrots have beta carotene.", "The vegetable in the picture that is high in beta carotene is an carrot.", "The carrots at the produce stand are high in carotene which is a nutrient."], "image": "train2014/COCO_train2014_000000117366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263041, "question_id": "2Q2DpW5agkNZPfpnfmKoKG", "question": "If you put a giant board in front of them what current action of theirs would you prevent them from doing?", "choices": ["eating", "fishing", "talking", "playing videogames"], "correct_choice_idx": 3, "direct_answers": ["playing wii", "playing videogames", "playing wii", "playing game", "playing game", "seeing", "swinging", "playing game", "playing game", "playing"], "difficult_direct_answer": false, "rationales": ["It would block them from being able to see their game.", "A board would block them from seeing the screen.", "They would not be able to see the television in order to play."], "image": "train2014/COCO_train2014_000000263041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492736, "question_id": "2Q2jgfUMRfynLAubuBwUe7", "question": "Which hand is she using to hold the phone?", "choices": ["left", "both", "right", "neither"], "correct_choice_idx": 2, "direct_answers": ["right", "right", "right", "right hand", "right", "right", "right hand", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["The woman is using her right hand.", "It is on that side of her body", "The phone is in the woman's right hand."], "image": "train2014/COCO_train2014_000000492736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339356, "question_id": "2Q9b7pYG8Ys5oHBpf2ursg", "question": "Which food unprepared to eat?", "choices": ["vegetables", "meat", "rice", "banana"], "correct_choice_idx": 3, "direct_answers": ["rice", "banana", "rice", "banana", "banana", "banana", "banana", "rice", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["The banana doesn't require preparation, nor is it prepared for this tray.", "These types of fruits need no preparation or cooking to be eaten.", "The rice, meat, and vegetables are ready to be eaten. the yellow fruit needs to be peeled."], "image": "val2014/COCO_val2014_000000339356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377513, "question_id": "2QKuV8yUJRkDQFcTZYJWtn", "question": "What will rehydrate the people at the table if they are dehydrated?", "choices": ["beer", "beer", "pizza", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "beer", "water"], "difficult_direct_answer": false, "rationales": ["Water is the only drink on this table which will hydrate rather than dehydrate.", "A table with pizza on it also has bottles of water. water is hydrating.", "Water will rehydrate."], "image": "train2014/COCO_train2014_000000377513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76847, "question_id": "2QvtQLDQkaZtppL4dGRLUk", "question": "People standing on something solid furthest into and above water stand on what?", "choices": ["plane", "wharf", "pier", "ship"], "correct_choice_idx": 2, "direct_answers": ["pier", "dock", "deck", "dock", "dock", "dock", "platform", "dock", "dock", "boat"], "difficult_direct_answer": false, "rationales": ["A pier juts out into the water and allows people to get on a boat if they wish to.", "It's a small private area along the water for boats to dock.", "They are standing on the cement made area near the water on land."], "image": "train2014/COCO_train2014_000000076847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291527, "question_id": "2QvucSvxg9jyNCYPJPhtt9", "question": "What might you feel more like buying after viewing the wall here?", "choices": ["fine food", "cars", "fast food", "bicycles"], "correct_choice_idx": 1, "direct_answers": ["cars", "car", "bmw", "car", "car", "car", "car", "bmw", "bmw", "car"], "difficult_direct_answer": false, "rationales": ["There is an ad for bmw.", "The cars are being advertised.", "The labels advertises the car and one will be eagle to be buy car."], "image": "train2014/COCO_train2014_000000291527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535179, "question_id": "2RUVZT35KFMhnZ5tJUvPLG", "question": "Which ski does the skier set down first to land safely?", "choices": ["poles", "left", "both", "right"], "correct_choice_idx": 2, "direct_answers": ["right", "both skis", "right", "left", "right", "right", "both", "right", "both ski", "left"], "difficult_direct_answer": false, "rationales": ["Landing on just one will probably cause them to fall.", "When airborne it is advisable to set both skis down at the same time to create a smooth landing. if not, the skier could become off balance and fall which would not be safe.", "The skis both allow safe landing."], "image": "val2014/COCO_val2014_000000535179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192023, "question_id": "2Roi29PDttqQkqWiaH85d9", "question": "Why is less snow visible on the area lower than the land near the photographer here?", "choices": ["no reason", "snow misers", "warmer there", "colder there"], "correct_choice_idx": 2, "direct_answers": ["more trees", "lower elevation", "warmer there", "melted", "warmer area", "melted", "it's warmer", "skis", "shade", "different temperatures"], "difficult_direct_answer": true, "rationales": ["The weather is warmer in the bottom level.", "The lower elevation is not as cold because air cools as the pressure drops and the higher elevation has lower air pressure.", "There is warmer weather."], "image": "train2014/COCO_train2014_000000192023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389498, "question_id": "2SBJvBMzDnqCKdYhKZ3neR", "question": "Why is he extending his arm?", "choices": ["it hurts", "taking selfie", "holding kite", "waving"], "correct_choice_idx": 1, "direct_answers": ["taking selfie", "taking selfie", "to photograph", "taking picture", "take selfie", "selfie", "selfie", "taking selfie", "taking pictures", "taking picture"], "difficult_direct_answer": false, "rationales": ["The man is taking a selfie.", "The man in the yellow shirt has his arm extended so he can take a selfie with his camera.", "The people are smiling at a camera."], "image": "train2014/COCO_train2014_000000389498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550911, "question_id": "2SF65rAjWXnGqTejjzqrSe", "question": "Where has most of her weight been shifted?", "choices": ["biceps", "calves", "wrists", "quads"], "correct_choice_idx": 3, "direct_answers": ["quads", "right side", "legs", "her abdomen", "feet", "back", "legs", "heels", "right leg", "right"], "difficult_direct_answer": true, "rationales": ["Foot is flat on the ground and she is leaning forward.", "She has most of her weight shifted behind her knees on her quads.", "The woman is squatting and putting pressure on her legs."], "image": "train2014/COCO_train2014_000000550911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76590, "question_id": "2SiwCweoPGJjmZWJ2WkTas", "question": "Which fruit juice has anti cancer properties?", "choices": ["apple", "passion", "orange", "guava"], "correct_choice_idx": 1, "direct_answers": ["orange", "orange", "orange", "orange juice", "orange", "orange", "pineapple juice", "orange juice", "orange juice", "passion"], "difficult_direct_answer": false, "rationales": ["The passion fruit has anti-cancer properties.", "The juice is passion fruit.", "A table has glasses of yellow juice. passion juice is yellow."], "image": "train2014/COCO_train2014_000000076590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377319, "question_id": "2SraeiEVzojURtZSV3kxTi", "question": "What role does FedEx play in this game?", "choices": ["sponsor", "delivery", "food provider", "transportation"], "correct_choice_idx": 0, "direct_answers": ["sponsor", "sponsor", "hitter", "sponsor", "hitter", "sponsor", "sponsor", "sponsor", "hitter", "sponsor"], "difficult_direct_answer": false, "rationales": ["There is a fedex banner at the tennis game because fedex probably sponsored the event.", "Those who's names are around a stadium are usually these.", "When companies decide to give money to a sporting event, they get to put their logo on the wall to advertise."], "image": "train2014/COCO_train2014_000000377319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349068, "question_id": "2SzcYAHFFkcC7mEft39MXc", "question": "At which direction is the highland cattle above staring to?", "choices": ["up", "right", "left", "front"], "correct_choice_idx": 3, "direct_answers": ["towards camera", "front", "away", "towards camera", "toward camera", "towards camera", "front", "straight", "forward", "north"], "difficult_direct_answer": false, "rationales": ["You can see the face of the cattle and they are facing the front.", "A cow is staring and facing forward.", "He is looking straight ahead."], "image": "train2014/COCO_train2014_000000349068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118598, "question_id": "2T4D4R9KPffjpfhwjcjev7", "question": "What does the yellow and white kite resemble?", "choices": ["badger", "werewolf", "squid", "crab"], "correct_choice_idx": 2, "direct_answers": ["squid", "octopus", "squid", "squid", "octopus", "squid", "squid", "octopus", "octopus", "squid"], "difficult_direct_answer": false, "rationales": ["The kite flying in the sky is shaped like a giant squid with long tentacles.", "These kites look like squid.", "The yellow and white animal looks like a squid given its long legs."], "image": "val2014/COCO_val2014_000000118598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272048, "question_id": "2TFWpwJqcuEe3PzHD8Lnug", "question": "What does the company whose name is on the left chair sell?", "choices": ["tires", "stocks", "butter", "pogs"], "correct_choice_idx": 1, "direct_answers": ["tennis balls", "fishing gear", "tennis balls", "stocks", "tennis balls", "stocks", "stocks", "stocks", "stocks", "tennis stuff"], "difficult_direct_answer": false, "rationales": ["That company is known for stocks.", "The chairs have the name scottrade on them which is a company for buying and selling stocks.", "The company is a stock trading company."], "image": "val2014/COCO_val2014_000000272048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298784, "question_id": "2TGQN2UFcs6JKAzQuv7iSa", "question": "What is the large silver object in the middle of the group?", "choices": ["boat", "scooter", "plane", "pool"], "correct_choice_idx": 0, "direct_answers": ["boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["The object is a boat.", "It is a boat that has been built to act a a statue.", "It is on wheels in order to be transported on land."], "image": "val2014/COCO_val2014_000000298784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526004, "question_id": "2TaLEC5vqhgu8XzfBZPuK8", "question": "Where did the baby get the pizza?", "choices": ["baked it", "bought it", "from adult", "stole it"], "correct_choice_idx": 2, "direct_answers": ["parent", "parents", "dad", "from mom", "from parent", "table", "mom", "from parents", "parent", "from adult"], "difficult_direct_answer": true, "rationales": ["The baby is eating prepared pizza and sitting on an adult.", "The baby is too young so he likely didn't get the pizza himself.", "The baby got it from an adult."], "image": "val2014/COCO_val2014_000000526004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296459, "question_id": "2TwwWphpLumQMpWvNiVp2C", "question": "Approximately what time is it?", "choices": ["925", "155", "1205", "255"], "correct_choice_idx": 3, "direct_answers": ["nine", "noon", "afternoon", "1015 am", "daytime", "noon", "morning", "three pm", "eleven", "255"], "difficult_direct_answer": true, "rationales": ["The sky looks a bit dark.", "Turn the picture upside down and the little hand is nearly at the three spot on the dial. this might be harder to read with digital or a tinier analog dial.", "From the looks on his watch it's almost 3 o'clock."], "image": "val2014/COCO_val2014_000000296459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227851, "question_id": "2U4yQYLucLxMyLz63LtgbF", "question": "What is the weather like in the scene?", "choices": ["windy", "cold", "hot", "rainy"], "correct_choice_idx": 2, "direct_answers": ["pleasant", "sunny", "sunny", "comfortable", "overcast", "warm", "dry", "hot", "pleasant", "hot"], "difficult_direct_answer": false, "rationales": ["The weather must be hot since the woman is fanning herself.", "They are flapping fans to cool themselves off, indicating that it is hot.", "The woman is fanning herself to keep cool, indicating that it's clearly a hot day."], "image": "val2014/COCO_val2014_000000227851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334850, "question_id": "2UMschzckNvKn5VHTeFPYF", "question": "What does the person standing here wait to see?", "choices": ["walk light", "candle", "their friend", "warning"], "correct_choice_idx": 0, "direct_answers": ["ride", "traffic light", "walk", "message", "light change", "walk light", "walk symbol", "car", "bus", "green light"], "difficult_direct_answer": true, "rationales": ["She is at an intersection and the light is red", "They are waiting to cross the road and for the walking change.", "She is waiting until the light tells her it is safe to cross the street"], "image": "train2014/COCO_train2014_000000334850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16084, "question_id": "2UPny9XTfkz2EBXPmxDXQB", "question": "What is most shocking in this picture?", "choices": ["gas mask", "carpet", "legs", "shoes"], "correct_choice_idx": 0, "direct_answers": ["gas mask", "gas mask", "gas mask", "gas mask", "mask", "socks-n-crocs", "gas mask", "gas mask", "mask", "gas mask"], "difficult_direct_answer": false, "rationales": ["Not many people wear them while in the bathroom.", "The item is only worn in more serious situations that pose a health risk.", "The man is wearing a gas mask on the toilet which is not normal behavior."], "image": "train2014/COCO_train2014_000000016084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393075, "question_id": "2UWWRJLgL24afa9c67Y5sC", "question": "What kind of outdoor event are the two on the bench attending?", "choices": ["art fair", "concert", "car show", "live auction"], "correct_choice_idx": 0, "direct_answers": ["art fair", "photo show", "art exhibition", "art", "craft fair", "art show", "art gallery", "fair", "art fair", "art fair"], "difficult_direct_answer": false, "rationales": ["Paintings are shown outside.", "In front of the two are paintings and artwork on display.", "There are a lot of paintings on sale, which is what is sold in the option a event."], "image": "val2014/COCO_val2014_000000393075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427384, "question_id": "2UooNt5xSReD35ydnRyqMm", "question": "What word is on the wall?", "choices": ["fiction", "biography", "biology", "mathematics"], "correct_choice_idx": 1, "direct_answers": ["biography", "biography", "biography", "biography", "biography", "biography", "biography", "biography", "biography", "biography"], "difficult_direct_answer": false, "rationales": ["This is a genre of books found in the library.", "This is a library, and the word on the wall indicates that is the section where a biography book can be found.", "The whole word is shown across the wall stating what types of books are in that section."], "image": "train2014/COCO_train2014_000000427384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305425, "question_id": "2UsWr3RH9modhAASzauEXD", "question": "What is the occupation of the man in the red vest?", "choices": ["fashion model", "produce vendor", "circus acrobat", "restaurant chef"], "correct_choice_idx": 1, "direct_answers": ["vendor", "salesman", "vendor", "farmer", "vendor", "vendor", "vendor", "produce vendor", "vendor", "store clerk"], "difficult_direct_answer": false, "rationales": ["He is behind the lettuce or celery that is on the table. it looks like he is doing something work related behind the counter.", "This area is a market. the man in red vest is selling, but not cooking, food.", "The man in the red vest works at the market and sells produce."], "image": "train2014/COCO_train2014_000000305425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38919, "question_id": "2V57SaZsLCEJxfKW7bcr3u", "question": "The ruins were probably once what type of structure?", "choices": ["castle", "church", "casino", "school"], "correct_choice_idx": 0, "direct_answers": ["castle", "castle", "castle", "fort", "castle", "castle", "fort", "fort", "castle", "fort"], "difficult_direct_answer": false, "rationales": ["The building has the round structure found in castles and the types of windows that castles had.", "The ruins were a castle.", "The remains of the turret, a common feature for castles, seem to indicate that the structure was once a castle."], "image": "val2014/COCO_val2014_000000038919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68789, "question_id": "2V7icML5PcP6ymWQKNHhH5", "question": "Who is the parking meter for?", "choices": ["bicyclists", "pedestrians", "animals", "drivers"], "correct_choice_idx": 3, "direct_answers": ["people", "parkers", "motorists", "drivers", "cars", "drivers", "parked cars", "drivers", "pay parking", "road"], "difficult_direct_answer": false, "rationales": ["The parking meter is used for drivers to pay for parking their cars on the side of the street.", "Pedestrians and animals do not have to park. bicyclists can park for free.", "Parking meters are a tax for parking a car. people drive cars."], "image": "train2014/COCO_train2014_000000068789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139490, "question_id": "2Vh52Htu8XFWfh9c922MMS", "question": "What kind of cuisine is this?", "choices": ["chinese", "japanese", "indian", "korean"], "correct_choice_idx": 1, "direct_answers": ["japanese", "lunch", "hot pocket", "japanese", "lunch", "bento box", "japanese", "healthy", "japanese", "japanese"], "difficult_direct_answer": false, "rationales": ["The cuisine in the lunch box is japanese food consisting of rice and veggies and the snack has japanese text on it.", "Japanese you can see it written on top of the package with the frog on it.", "The meal has been packed in a bento box style common to japanese meals and there are foods, cartoon characters and writing that would all be common in japan."], "image": "val2014/COCO_val2014_000000139490.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93141, "question_id": "2VjFrmK7DbaoCRjgJd7HaW", "question": "This collage shows the surfer riding a wave but at different what?", "choices": ["outfits", "angles", "surfboards", "weather"], "correct_choice_idx": 1, "direct_answers": ["angle", "angles", "times", "heights", "intervals", "times", "angles", "angles", "surfboard", "angles"], "difficult_direct_answer": false, "rationales": ["The image is flipped and reversed so it looks like they're moving in different directions.", "The collage shows angles.", "The orientation of the surfer and the board is different in each picture which could be seen as different angles."], "image": "train2014/COCO_train2014_000000093141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188311, "question_id": "2Weup6NrxpDsXimuuNThgg", "question": "What does one need to keep the items in the pink container working?", "choices": ["sharpener", "electricity", "gas", "knife"], "correct_choice_idx": 0, "direct_answers": ["sharpener", "pencils", "sharpener", "sharpener", "pencil sharpener", "sharpener", "sharpener", "by sharpening", "storage", "pencil sharpener"], "difficult_direct_answer": false, "rationales": ["The items in the pink container are pencils which need to be kept sharp to work.", "They will need to be cut down some when they get dull.", "The pink container is full of pencils that require a sharpener to use them."], "image": "val2014/COCO_val2014_000000188311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223256, "question_id": "2WfXPsfn2vHDMcfKPTE46U", "question": "What video game character are the boys mimicking?", "choices": ["donkey kong", "samus", "mario", "link"], "correct_choice_idx": 0, "direct_answers": ["donkey kong", "donkey kong", "donkey kong", "donkey kong", "donkey kong", "donkey kong", "donkey kong", "donkey kong", "donkey kong", "donkey kong"], "difficult_direct_answer": false, "rationales": ["This character dresses like the people in the photo.", "By there costumes and body language it can help you determine what they are mimicking.", "They have the clothing on and are acting like apes"], "image": "val2014/COCO_val2014_000000223256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14056, "question_id": "2WoPXrE32zYmXEgeVEJeG4", "question": "How is the image made to look?", "choices": ["upside down", "old fashioned", "futuristic", "inverted"], "correct_choice_idx": 1, "direct_answers": ["wavy", "old fashioned", "old", "old", "sepia", "sepia", "antique", "sepia", "old", "ancient"], "difficult_direct_answer": false, "rationales": ["The picture is colored with sepia tone and that makes it look like an antique photo.", "The picture is sort of black and white so it looks old fashioned.", "This image has the old black and white and tan colors to it like classic photos."], "image": "val2014/COCO_val2014_000000014056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467130, "question_id": "2YCLfVzTgZMUZGmK6uE79Y", "question": "Where are they likely to work from?", "choices": ["office", "home", "warehouse", "constraction"], "correct_choice_idx": 0, "direct_answers": ["office", "office", "hospital", "office", "office", "government", "teachers", "office", "office", "office"], "difficult_direct_answer": false, "rationales": ["The women are wearing office appropriate attire.", "The women are wearing office professional clothing.", "They are dressed in clothing that is similar to suits"], "image": "val2014/COCO_val2014_000000467130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308277, "question_id": "2YJ8XxsCtysLu5b9CtwNkC", "question": "What are the people wearing red's job?", "choices": ["secretaries", "lifeguards", "police", "dancers"], "correct_choice_idx": 1, "direct_answers": ["lifeguards", "lifeguards", "surfers", "surfers", "lifeguards", "lifeguards", "lifeguard", "surfers", "surfers", "surf instructors"], "difficult_direct_answer": false, "rationales": ["The people work at the beach.", "They are at a beach in front of a surfboard.", "Lifeguards wear red for visibility."], "image": "train2014/COCO_train2014_000000308277.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402248, "question_id": "2YKj8xh576AnSDMoS9YJXN", "question": "What is the boy sitting in the grass doing?", "choices": ["playing pokemon", "resting", "texting", "flying kite"], "correct_choice_idx": 3, "direct_answers": ["kite flying", "flying kite", "kite flying", "flying kite", "kite flying", "flying kite", "flying kite", "flying kite", "flying kite", "flying kite"], "difficult_direct_answer": false, "rationales": ["The boy is flying the kite.", "The boy is awake and is not using a modern piece of technology. he is holding a control line.", "He is holding the string to his kite that is in the air."], "image": "val2014/COCO_val2014_000000402248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559462, "question_id": "2YLFuMTDUehPNDmgCzXGDP", "question": "Why are they standing like that?", "choices": ["are resting", "are falling", "are posing", "are fighting"], "correct_choice_idx": 2, "direct_answers": ["posing", "wedding photos", "wedding photographs", "picture taking", "romantic", "just married", "wedding photos", "are posing", "picture posing", "wedding pictures"], "difficult_direct_answer": true, "rationales": ["Or they're just fooling around. the other options don't really fit unless the bride is c.", "They're taking their wedding photos.", "They are posing for pictures."], "image": "val2014/COCO_val2014_000000559462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22882, "question_id": "2YUiPaiEhxV8P2qNMQEYdR", "question": "What drink goes in this type of glass?", "choices": ["coffee", "tea", "beer", "wine"], "correct_choice_idx": 3, "direct_answers": ["wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["This is a specialized glass for an alcoholic beverage.", "Wine is in the glass.", "This shape of glass is most typically used to drink wine from."], "image": "train2014/COCO_train2014_000000022882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492347, "question_id": "2Yk4XskvmwNEjwp2tqRKuA", "question": "In what direction will the train go next with respect to the person taking this person?", "choices": ["south", "north", "south", "east"], "correct_choice_idx": 1, "direct_answers": ["straight", "opposite", "go forward", "away", "south", "up", "further north", "away", "right", "north"], "difficult_direct_answer": true, "rationales": ["Camera is behind train so train is pointed north of the person taking the photo", "That's the direction the train appears to be going from the photographer's point of view.", "The picture is showing the caboose, so then you can safely figure out it's going north."], "image": "train2014/COCO_train2014_000000492347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19060, "question_id": "2YoeVJKZ7Agxen9pgXTfzF", "question": "Based on the leaves on the trees what season is it?", "choices": ["fall", "summer", "spring", "winter"], "correct_choice_idx": 3, "direct_answers": ["summer", "summer", "winter", "summer", "summer", "summer", "summer", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["Looks like there are a lot of green leaves.", "The leaves are freshly turned green.", "Leaves are filled with green leaves."], "image": "train2014/COCO_train2014_000000019060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298350, "question_id": "2ZMA2ZF7knbw4NzScvpNEZ", "question": "Where did the youth get the bruises on his legs?", "choices": ["skateboard fall", "racquetball", "riding broncos", "bull riding"], "correct_choice_idx": 0, "direct_answers": ["shin", "street", "shins", "skateboard park", "falling", "skate park", "concrete", "skateboard fall", "skateboarding", "skateboarding"], "difficult_direct_answer": true, "rationales": ["The youth is participating in an extreme sport. there are no large non-human animals present.", "A guy is riding a skateboard which is a dangerous sport.", "The young person is on a skateboard and therefore skates, by the marks on their legs, indicates that most likely they got them from skateboarding."], "image": "train2014/COCO_train2014_000000298350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541496, "question_id": "2ZPcMJZusRLGonvRwwNo9T", "question": "Universal shape behind the zips are what?", "choices": ["m", "t", "u", "v"], "correct_choice_idx": 2, "direct_answers": ["square", "rectangle", "u", "teeth", "rectangle", "pulls", "rectangular", "rectangle", "rectangle", "zippers"], "difficult_direct_answer": false, "rationales": ["I have to assume this answer because the image is unclear about the object behind the zippers.", "Traditional zippers are used on several pieces of luggage.", "The zips are in a u shape."], "image": "train2014/COCO_train2014_000000541496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405530, "question_id": "2ZavE2nqLuhLjkjBrN2QcJ", "question": "What is this man doing?", "choices": ["is resting", "is surfing", "watching movie", "driving bike"], "correct_choice_idx": 0, "direct_answers": ["watching sea", "sitting down", "sitting", "resting", "is resting", "sitting", "fake", "sitting", "statue", "sitting"], "difficult_direct_answer": false, "rationales": ["The man is resting.", "The man is sitting on the bench probably resting and relaxing.", "He is sitting down to relax."], "image": "val2014/COCO_val2014_000000405530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178379, "question_id": "2ZmkSYntJcm5pfucM78SU6", "question": "What are the black helmets the people are wearing made for?", "choices": ["halloween", "style", "riding", "sweat reduction"], "correct_choice_idx": 2, "direct_answers": ["protection", "riding", "protection", "plastic", "plastic", "riding", "protecting head", "safety", "protecting head", "head protection"], "difficult_direct_answer": false, "rationales": ["The men are on horses and have helmets on in case they fall.", "The people are riding on the horses.", "They are helmets specifically designed for riding horses, to protect the riders."], "image": "train2014/COCO_train2014_000000178379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479379, "question_id": "2ZnuoiULmmvxWx5y5mSnyq", "question": "What setting does the boarder pose in here?", "choices": ["urban", "suburban", "desert", "farm"], "correct_choice_idx": 1, "direct_answers": ["suburban", "mountain", "mountain", "street", "road", "neighborhood", "street", "street", "cool guy", "rural"], "difficult_direct_answer": false, "rationales": ["The setting is suburban.", "There are some buildings with green around.", "The boarder is posing on a street in the suburbs."], "image": "val2014/COCO_val2014_000000479379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86725, "question_id": "2ZoHeCYcmmYo5owS6Fa59V", "question": "The red item is what type of food?", "choices": ["dairy", "fruit", "grain", "legume"], "correct_choice_idx": 1, "direct_answers": ["tomato", "tomato", "fruit", "tomato", "tomato", "tomato", "vegetable", "vegetable", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["These are tomatoes on their food.", "Tomatoes are on a pizza. tomatoes are a fruit.", "The red item is tomatoes, which have seeds."], "image": "train2014/COCO_train2014_000000086725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438721, "question_id": "2a2bRG3peGyaX6CRff9UuP", "question": "What are these men looking at?", "choices": ["baseball", "stars", "moon", "sun"], "correct_choice_idx": 0, "direct_answers": ["ball", "sky", "baseball", "ball", "ball", "looking ball", "baseball", "baseball", "baseball", "fly ball"], "difficult_direct_answer": false, "rationales": ["They are in baseball uniforms and equipment, and the bat is in the aid, indicating that a baseball was just hit.", "The men are playing baseball.", "The batter swung the bat and followed the ball with his head."], "image": "val2014/COCO_val2014_000000438721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199522, "question_id": "2aDToHqZbXViJwi6PhxwJy", "question": "What has peaked the interest of the little girl?", "choices": ["sheep", "mother", "cage", "puddle"], "correct_choice_idx": 0, "direct_answers": ["sheep", "sheep", "sheep", "sheep", "sheep", "sheep", "sheep", "sheep", "sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["The little girl is pointing to it.", "The little girl is pointing at the animal in the cage and is looking in that direction.", "She is pointing her finger at the animal"], "image": "train2014/COCO_train2014_000000199522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262710, "question_id": "2aidh4PEccrguoUpbd4gsy", "question": "What is the model year of Toyota Tacoma?", "choices": ["1995-2004", "1998-2004", "1996-2004", "1998-2000"], "correct_choice_idx": 0, "direct_answers": ["2010", "2010", "1995", "2015", "1999", "1995-2004", "tacoma", "2004", "new model", "1998"], "difficult_direct_answer": true, "rationales": ["The toyota tacoma comes from the 1995-2004 era.", "It's somewhere between 95 and 2004.", "The toyota tacoma is from 1995 and 2004."], "image": "train2014/COCO_train2014_000000262710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440387, "question_id": "2av2cYqAMvRApmn29DAaaS", "question": "Why is everyone headed downhill?", "choices": ["going home", "they're skiing", "it's easier", "saves time"], "correct_choice_idx": 1, "direct_answers": ["to ski", "they're skiing", "skiing", "gravity", "fun", "skiing", "gravity", "skiing", "skiing", "they're skiing"], "difficult_direct_answer": false, "rationales": ["They are at a resort and have skis on", "There are people wearing skis and doing the wedge.", "The people are on skis."], "image": "val2014/COCO_val2014_000000440387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224053, "question_id": "2awVVpETbTrjFtzQpRCv8Q", "question": "Why is the yellow sign posted outdoors?", "choices": ["to scare", "to inform", "to protest", "to sell"], "correct_choice_idx": 1, "direct_answers": ["to inform", "directions", "directions", "gives directions", "convenience", "street sign", "diversion", "for parking", "information", "parking information"], "difficult_direct_answer": true, "rationales": ["The sign is for information.", "The department of transportation wants drivers to know that there has been an accident and they cannot travel that way.", "A sign is posted on the side of a road and gives road information."], "image": "val2014/COCO_val2014_000000224053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559966, "question_id": "2bC9L3qFFfK4xWBp2NWACz", "question": "What starchy food is visible here?", "choices": ["fries", "bacon", "meat", "tomato sauce"], "correct_choice_idx": 0, "direct_answers": ["fries", "bread", "french fries", "fries", "bun", "fries", "french fries", "bun", "french fries", "french fries"], "difficult_direct_answer": false, "rationales": ["Potatoes are full of starch.", "Fries are all starch since they are made from potatoes.", "Potatoes are known as a starchy vegetable, and fries are made out of potatoes."], "image": "train2014/COCO_train2014_000000559966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536233, "question_id": "2bJ57CrqDrdgr7qrxQuLPx", "question": "Who is steering the flying object?", "choices": ["man", "boy", "girl", "woman"], "correct_choice_idx": 0, "direct_answers": ["man blue", "man", "parent", "grown man", "man", "man", "wind", "man", "man", "green-hat guy"], "difficult_direct_answer": false, "rationales": ["You can tell by the fact that he is holding the kite string as to who is flying it.", "The man is holding the string that is controlling the kite.", "The man in the green hat is holding the end of the string that controls the kite."], "image": "train2014/COCO_train2014_000000536233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160102, "question_id": "2bQNkvRjDLWbSyBAW3PvgY", "question": "What type of game are the woman playing?", "choices": ["board", "card", "relay", "video"], "correct_choice_idx": 3, "direct_answers": ["boxing game", "wii", "video", "wii boxing", "wii", "wii", "video", "fighting", "wii", "wii boxing"], "difficult_direct_answer": false, "rationales": ["You can see the game on the tv screen so it's a video game.", "They are playing wii boxing on the tv.", "The woman are playing nintendo wii which is a video game system"], "image": "train2014/COCO_train2014_000000160102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418535, "question_id": "2bUxJCA9zzL2yXMeNPBgos", "question": "Which of the man's accessories need to be replaced?", "choices": ["hat", "shoes", "tie", "gloves"], "correct_choice_idx": 1, "direct_answers": ["shoes", "shoes", "shoes", "tie", "shoes", "shoes", "shoes", "tie", "shoes", "shoes"], "difficult_direct_answer": false, "rationales": ["The shoes are outdated.", "A man is in dress clothes but is wearing casual shoes.", "His footwear looks to be very worn."], "image": "val2014/COCO_val2014_000000418535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530394, "question_id": "2bXw7EYkxkbUukEPbqtcaJ", "question": "What type of internet connection is being used in the residence?", "choices": ["dsl", "cellular", "fiber", "cable"], "correct_choice_idx": 0, "direct_answers": ["ethernet", "wired", "cable", "dial up", "dsl", "dsl", "wifi", "wifi", "modem", "dial up"], "difficult_direct_answer": false, "rationales": ["Looks like they use dsl", "The internet connection is wired. the wire is connected to the phone jack.", "It's plugged into the phone jack in the wall."], "image": "train2014/COCO_train2014_000000530394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107188, "question_id": "2cVU4M9aKXRAw6mrRLVzG5", "question": "What is the cat doing on the bench?", "choices": ["sleeping", "grooming", "eating", "playing"], "correct_choice_idx": 0, "direct_answers": ["sleeping", "sleeping", "sleeping", "napping", "sleeping", "sleeping", "sleeping", "napping", "sleeping", "napping"], "difficult_direct_answer": false, "rationales": ["The cat is lying on the bench with its eyes closed.", "When an animal is lying down and it's eyes are closed, it is probably asleep.", "The cat is curled up and sleeping on the bench outside,"], "image": "train2014/COCO_train2014_000000107188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386062, "question_id": "2cYua4b8qvJmTbjSiAWoXG", "question": "What is the man in the brown shirt emulating with the white controller?", "choices": ["driving", "shooting", "boxing", "skiing"], "correct_choice_idx": 0, "direct_answers": ["driving", "driving", "driving", "driving", "driving", "driving", "driving", "driver", "steering wheel", "driving"], "difficult_direct_answer": false, "rationales": ["The man is holding a car steering wheel which is an accessory that is used in a popular video racing game.", "The man is driving.", "He has his hand on a white wheel."], "image": "train2014/COCO_train2014_000000386062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482841, "question_id": "2cZ9VshVCUFFgs23rqm5zu", "question": "Why is the cat in the sink?", "choices": ["to sleep", "to eat", "to bathe", "to comb"], "correct_choice_idx": 0, "direct_answers": ["being washed", "taking bath", "to sleep", "resting", "cozy", "bath time", "relaxing", "sleeping", "resting", "relaxing"], "difficult_direct_answer": false, "rationales": ["Cats love to sleep and rest in sinks.", "Cats like to lay on or in anything to sleep.", "It is laying down and curled up"], "image": "train2014/COCO_train2014_000000482841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535770, "question_id": "2cbrFP6dHAyaQRb993hX27", "question": "Who is the most successful quarterback of her favorite team?", "choices": ["drew bledsoe", "eli manning", "tom brady", "brett favre"], "correct_choice_idx": 2, "direct_answers": ["horror", "tom brady", "tom brady", "tom brady", "horror", "tom brady", "tom brady", "horror", "tom brady", "tom brady"], "difficult_direct_answer": false, "rationales": ["He plays for the team listed on her shirt.", "The woman is wearing a t shirt by tom.", "Tom brady is super successful."], "image": "val2014/COCO_val2014_000000535770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80941, "question_id": "2cd24GfN9GpcRmk3dWnhfL", "question": "Why is the person on the board crouching?", "choices": ["to dance", "to sit", "to jump", "to balance"], "correct_choice_idx": 3, "direct_answers": ["balance", "keep balance", "balance", "for balance", "surfing", "to balance", "safety", "surf wave", "surfing", "positioning"], "difficult_direct_answer": false, "rationales": ["The man is trying to have balance.", "They will bend their knees so they can keep their balance on the board.", "The surfer is leaning forward and has their knees bent indicating they are trying to balance themselves."], "image": "train2014/COCO_train2014_000000080941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241617, "question_id": "2cuqjPUdUM7rT7sK842Gib", "question": "That cake is for two people who are involved how?", "choices": ["rivals", "siblings", "colleagues", "romantically"], "correct_choice_idx": 3, "direct_answers": ["marriage", "parent child", "marriage", "marriage", "married", "getting married", "getting married", "romantically", "getting married", "getting married"], "difficult_direct_answer": false, "rationales": ["These are love birds on the top of the cake used in weddings", "The birds on the cake are kissing.", "The cake is for a wedding."], "image": "train2014/COCO_train2014_000000241617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185834, "question_id": "2d5nqEbMJNzd6Vg5Edb4zQ", "question": "If the green item is part of this person's religion what are they most likely unfamiliar with?", "choices": ["diwali", "baptism", "extreme unction", "christmas"], "correct_choice_idx": 0, "direct_answers": ["buddhism", "diwali", "unknown", "other religions", "buddhism", "atheists", "atheism", "christmas", "christmas", "eastern religions"], "difficult_direct_answer": false, "rationales": ["The item is unlike diwali.", "A person who celebrates christmas is most likely unfamiliar with the hindu festival of lights.", "That is a celebration for hindus and other religions who are not christian."], "image": "train2014/COCO_train2014_000000185834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425644, "question_id": "2d9FkhiuZmENPEgimsG27Z", "question": "What kind of trees can be seen?", "choices": ["birch", "palm tree", "pine tree", "oak tree"], "correct_choice_idx": 1, "direct_answers": ["palm", "palm", "palm trees", "palm", "palm", "palm trees", "palm trees", "palm tree", "palm trees", "palm trees"], "difficult_direct_answer": false, "rationales": ["Tall trees with long trunks and leaves at the top are around a building.", "There are palm trees.", "There are several tall tropical trees."], "image": "val2014/COCO_val2014_000000425644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366295, "question_id": "2dMHR5VQBeP483myqvCWr2", "question": "What animals are present?", "choices": ["giraffe", "dog", "elephant", "deer"], "correct_choice_idx": 2, "direct_answers": ["elephants", "elephants", "elephants", "elephants", "elephants", "elephant", "elephants", "elephants ducks", "elephants", "ducks"], "difficult_direct_answer": false, "rationales": ["Only elephants are shown.", "Anyone can tell by the animals seen here what they are by the trunks and grey color.", "The animals have tusks, are grey, and have trunks."], "image": "val2014/COCO_val2014_000000366295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275377, "question_id": "2dTU6qSHdxDn8fMcm4EgAh", "question": "What is the person on the bench doing?", "choices": ["reading", "working", "cooking", "sleeping"], "correct_choice_idx": 0, "direct_answers": ["reading", "sitting", "reading", "reading", "reading", "sitting", "reading", "reading", "sitting", "reading"], "difficult_direct_answer": false, "rationales": ["There is a corner of an object visible past the person that appears to have pages and be in their hand which is consistent with a book. the person appears to be looking down at the book which is what one would do to read it.", "She has an open book in her hands", "The person on the bench is reading a book."], "image": "train2014/COCO_train2014_000000275377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324643, "question_id": "2dXAXiwvmmpdT2y3URaoQp", "question": "What caused the lines in the snow?", "choices": ["car wheels", "skis", "animal", "shovel"], "correct_choice_idx": 0, "direct_answers": ["vehicle", "tires", "cars", "car wheels", "tires", "tires", "tires", "tires", "cars", "truck"], "difficult_direct_answer": false, "rationales": ["The car wheels cause the lines.", "The lines in the snow were caused by tire tracks because you can see clearly the outlines of the tires.", "The man is skiing by a road."], "image": "train2014/COCO_train2014_000000324643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172743, "question_id": "2dyQCZVjJXqiB8x5ZNzyCX", "question": "What kind of building is the one with the sign on the left?", "choices": ["hotel", "restaurant", "pub", "library"], "correct_choice_idx": 0, "direct_answers": ["hotel", "hotel", "hotel", "hotel", "hotel", "hotel", "hotel", "hotel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["The days inn & suites is a hotel business building.", "The sign is for a hotel.", "A hotel sign is lit on the side of a road."], "image": "train2014/COCO_train2014_000000172743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116911, "question_id": "2eJ5hZRn6k4Z8erZAxMyCL", "question": "What brand is the blue shirt on the right?", "choices": ["adidas", "new balance", "reebok", "nike"], "correct_choice_idx": 0, "direct_answers": ["adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas"], "difficult_direct_answer": false, "rationales": ["The shirt has the adidas logo on it.", "The boy's shirt has three stripes which is a signature of adidas.", "The brand is adidas."], "image": "train2014/COCO_train2014_000000116911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578350, "question_id": "2eK8GgdxPLv7VP9DENoJsE", "question": "Which things would be easiest for the giraffes to eat here?", "choices": ["ground bushes", "cookies", "trees", "hair"], "correct_choice_idx": 2, "direct_answers": ["trees", "leaves", "leaves", "leaves", "leaves", "tree leaves", "leaves", "leaves", "leaves", "tree leaves"], "difficult_direct_answer": false, "rationales": ["The tree is next to the giraffe so the giraffe can easily get to it for consumption.", "Due to their incredibly long necks, giraffes can easily eat the leaves from high places.", "Giraffes are herbivores. it is easiest for them to eat leaves that are not at ground level."], "image": "val2014/COCO_val2014_000000578350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537417, "question_id": "2eNMJcsKCSD3ZHJHmjzwN9", "question": "Why is the dog on the table?", "choices": ["to groom", "to sit", "to eat", "to play"], "correct_choice_idx": 1, "direct_answers": ["cat resting", "comfortable bowl", "no dog", "to sit", "cat", "cat relaxing", "resting", "no dog", "naptime", "cat there"], "difficult_direct_answer": true, "rationales": ["The cat is in the middle of the table lying down.", "A cat is sitting in a bowl on a table.", "It's a cat that is sitting in a bowl on the table."], "image": "train2014/COCO_train2014_000000537417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188815, "question_id": "2ePEgq38nSREAQdcap5rdU", "question": "What does the man have on his head besides lipstick?", "choices": ["fake blood", "jello", "ketchup", "custard"], "correct_choice_idx": 0, "direct_answers": ["ball", "fake blood", "ball", "blood", "blood", "tennis ball", "fake blood", "ball", "tennis ball", "blood"], "difficult_direct_answer": false, "rationales": ["The man is wearing a costume, and isn't really injured, meaning his wounds aren't real.", "The man is dressed like a zombie and has fake blood on his face.", "The man is wearing makeup for a fake blood effect."], "image": "train2014/COCO_train2014_000000188815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317015, "question_id": "2egjHpkd8Dj4z5GTX29hFj", "question": "Why is she jumping through the air?", "choices": ["catch frisbee", "impress others", "is falling", "was pushed"], "correct_choice_idx": 0, "direct_answers": ["catch frisbee", "happy", "catch frisbee", "catching frisbee", "catch frisbee", "catching frisbee", "catch frisbee", "catch", "playing", "catching frisbee"], "difficult_direct_answer": false, "rationales": ["The frisbee is coming towards her in the air and her hand is outstretched towards it, and it's clear this could be the only thing she's reaching for.", "It looks like she's trying to grab something. it's unclear, but the other options don't really match.", "She is reaching for the frisbee so it is obvious what they are doing."], "image": "val2014/COCO_val2014_000000317015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218846, "question_id": "2f93vsMWnETonopbuokJpR", "question": "What should you eat among these food if you lack in vitamin A?", "choices": ["tomato", "broccoli", "carrot", "pepper"], "correct_choice_idx": 2, "direct_answers": ["carrots", "spinach", "carrots", "carrot", "carrots", "carrot", "carrots", "leafy greens", "carrots", "carrot"], "difficult_direct_answer": false, "rationales": ["Orange and yellow vegetables are full of vitamin a.", "These are carrots which are high in vitamin a.", "The orangery long veggies contain a high amount of vitamin a."], "image": "train2014/COCO_train2014_000000218846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166987, "question_id": "2fAaN6uaKhVoPiTQEhuGMe", "question": "What is in the center of the plate served at this banquet?", "choices": ["bacon", "meatballs", "spaghetti", "lasagna"], "correct_choice_idx": 1, "direct_answers": ["chicken", "meatballs", "meatballs", "meatballs", "meatball", "meatball", "meatballs", "meatballs", "meatballs", "meatballs"], "difficult_direct_answer": false, "rationales": ["That's what the round things look like.", "These are a type fo meatball being served with broccoli", "The center has meatballs."], "image": "train2014/COCO_train2014_000000166987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423624, "question_id": "2fQQtruJvQuDugUUpLPBTv", "question": "Why are so many benches empty?", "choices": ["people afraid", "benches broken", "nobody around", "late night"], "correct_choice_idx": 2, "direct_answers": ["no people", "low attendance", "nobody around", "park empty", "no people", "no people", "no people", "park closed", "it's morning", "no people"], "difficult_direct_answer": false, "rationales": ["There are no visible people to take up spaces in any of the benches as they are designed to accommodate.", "There are almost no people in the park", "Nobody is around the park."], "image": "train2014/COCO_train2014_000000423624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206280, "question_id": "2fY3HPZgrnQki4N8e2JcA9", "question": "What meal is being served?", "choices": ["lunch", "brunch", "breakfast", "dinner"], "correct_choice_idx": 3, "direct_answers": ["dinner", "christmas dinner", "dinner", "dinner", "dinner", "christmas", "dinner", "dinner", "christmas", "dinner"], "difficult_direct_answer": false, "rationales": ["People are gathered around a table with candles lit and the table set with plates and silverware. of the three meals, dinner is usually the most formal.", "It is dark outside and the people are sitting together to eat dinner.", "People are sitting around a table that is formally set with candles. dinner is usually served formally."], "image": "train2014/COCO_train2014_000000206280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543411, "question_id": "2faSWUYbSTLYGyGrEMUsjE", "question": "Why do persons lay on their surfboard what is this part of?", "choices": ["strike", "video craze", "lesson", "work slowdown"], "correct_choice_idx": 2, "direct_answers": ["body positioning", "training", "learning", "lesson", "surf lessons", "class", "paddling", "training", "lesson", "swimming"], "difficult_direct_answer": false, "rationales": ["In order to teach people how to surf, they have to start with the basics of laying on the board and paddling out to the wave. there is also a teacher in this group that is giving instruction", "These people are learning how to surf.", "People are all laying on surfboards in a row and are all wearing the same outfit."], "image": "train2014/COCO_train2014_000000543411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449270, "question_id": "2fvHcW7pq5CUy8dcHdxLg5", "question": "Where might you find the item in the window?", "choices": ["crib", "nightclub", "butcher", "garden shed"], "correct_choice_idx": 0, "direct_answers": ["crib", "bed", "sofa", "antique shop", "bed", "bed", "home", "bed", "bed", "furniture store"], "difficult_direct_answer": false, "rationales": ["This is a pillow so it is normally on a bed", "There is a pillow in the window, which can be used in a crib.", "This is a fancy pillow that would be used in a crib for decoration."], "image": "train2014/COCO_train2014_000000449270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501995, "question_id": "2fw64fNawBgUiCS3Q8e4Gz", "question": "Where is the food made?", "choices": ["at home", "on beach", "in truck", "in restaurant"], "correct_choice_idx": 2, "direct_answers": ["food truck", "in truck", "truck", "food truck", "in truck", "truck", "food truck", "food truck", "in truck", "new york"], "difficult_direct_answer": false, "rationales": ["This is a food truck and they make the food inside to serve to walk up customers.", "The food is in the truck.", "They cook the food inside of the vehicle with the equipment they have in there."], "image": "train2014/COCO_train2014_000000501995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304295, "question_id": "2g5XwKBwm6EykruiNjLxbR", "question": "What shape are two of the grass wreaths fashioned into?", "choices": ["octagon", "rectangle", "tree", "cross"], "correct_choice_idx": 3, "direct_answers": ["crosses", "cross", "cross", "crosses", "cross", "crosses", "crosses", "cross", "cross", "cross"], "difficult_direct_answer": false, "rationales": ["They are shaped like a cross and hanging up.", "The wreaths are shaped into two planks on top of each other.", "There are four grass wreaths in total but only one shape is shared by two of the wreaths and that shape is a cross."], "image": "train2014/COCO_train2014_000000304295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43393, "question_id": "2g7JexNfEqCcdoTGQyUxa5", "question": "Which quadrant of the picture has the most cows in it?", "choices": ["bottom left", "bottom right", "top right", "top left"], "correct_choice_idx": 3, "direct_answers": ["upper left", "left", "none sheeponly", "top", "left", "upper left", "top", "southwest", "top left", "no cows"], "difficult_direct_answer": false, "rationales": ["The quadrant is the top left.", "There are more of them on this side then the other.", "The upper left has more in it."], "image": "train2014/COCO_train2014_000000043393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572036, "question_id": "2gM9uk6xXbgQH5CLu2kVhL", "question": "What type of field are the kids playing on?", "choices": ["softball", "soccer", "lacrosse", "football"], "correct_choice_idx": 0, "direct_answers": ["baseball", "baseball", "baseball diamond", "baseball", "baseball", "baseball", "baseball", "baseball diamond", "baseball", "softball"], "difficult_direct_answer": false, "rationales": ["The kids are playing a batting game and need a certain type of field.", "There is a boy that is ready to swing and he's at the home plate.", "They are playing with a bat, bases and mitts."], "image": "train2014/COCO_train2014_000000572036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234100, "question_id": "2gQhA4mvjvyK5e4snBhhS6", "question": "Where was the fruit being used as flavoring here grown?", "choices": ["lime tree", "orange tree", "no where", "pepper plant"], "correct_choice_idx": 0, "direct_answers": ["lime", "florida", "lime tree", "lime", "lime", "phoenix", "tropics", "malaysia", "tree", "mexico"], "difficult_direct_answer": false, "rationales": ["The cup contains a drink flavored with a slice of lime that grows on a tree.", "This is a citrus fruit", "There is a green slice of fruit in a drink. lemons and limes are commonly used in cocktails."], "image": "train2014/COCO_train2014_000000234100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576067, "question_id": "2h2Gk7SBLrkbJKmJdoGaGf", "question": "What are the silver wrappers from?", "choices": ["mm's", "hershey's kisses", "snickers", "reese's pieces"], "correct_choice_idx": 1, "direct_answers": ["candy", "candy", "chocolate wrapper", "hershey's kisses", "candy", "hershey's kisses", "hershey kisses", "hershey kisses", "candy", "candy kiss"], "difficult_direct_answer": false, "rationales": ["The silver wrappers are for the chocolate kisses.", "These are the candies decorating the cake.", "The wrappers are from hershey's."], "image": "val2014/COCO_val2014_000000576067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389295, "question_id": "2h2PKuqShg6DHoyk9Yk92B", "question": "What position does the player wearing yellow play?", "choices": ["shortstop", "third base", "pitcher", "catcher"], "correct_choice_idx": 1, "direct_answers": ["third base", "baseball", "short stop", "fourth baseman", "fourth baseman", "third base", "third", "third base", "third base", "third"], "difficult_direct_answer": false, "rationales": ["The position is third base.", "The player in yellow is standing by third base so he plays that position.", "He is standing at third base."], "image": "val2014/COCO_val2014_000000389295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553780, "question_id": "2h6a2jBxk4AenDEVNDorHm", "question": "In which manner were the desserts here prepared?", "choices": ["baking", "open fire", "grilling", "frying"], "correct_choice_idx": 3, "direct_answers": ["carefull", "baked", "boxed", "frying", "fried", "fried", "baked", "divided", "store made", "careful"], "difficult_direct_answer": false, "rationales": ["Donuts are usually fried in oil.", "The desserts in the box are doughnuts and are made by frying sweet dough.", "The desserts are donuts by their shape and appearance. donuts are cooked by frying which you can see slightly on the sides."], "image": "val2014/COCO_val2014_000000553780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262893, "question_id": "2hF5hbR9xXAGKB3FCL2kYi", "question": "What is the boy in blue and white trying to do?", "choices": ["backflip", "kick ball", "tackle boy", "grab ball"], "correct_choice_idx": 1, "direct_answers": ["kick ball", "kick ball", "kick", "kick ball", "kick ball", "kick ball", "block", "kick ball", "kick ball", "evade defender"], "difficult_direct_answer": false, "rationales": ["The boys are kicking the ball.", "The boy in the blue and white is trying to kick the ball in soccer.", "He is in position to kick the ball."], "image": "train2014/COCO_train2014_000000262893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457636, "question_id": "2hPiuCM5x2AsnwYRbbBgoP", "question": "What foot appeared are the hockey players wearing to play on the ice?", "choices": ["cleats", "skates", "rollerblades", "sticks"], "correct_choice_idx": 1, "direct_answers": ["ice skates", "ice skates", "ice skates", "ice skates", "skates", "hockey skate", "skates", "skates", "ice skates", "ice skates"], "difficult_direct_answer": false, "rationales": ["Hockey players play on ice and wear skates in order to do that.", "The foot is in a skate.", "These bladed boots are necessary for travel on the ice."], "image": "val2014/COCO_val2014_000000457636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274885, "question_id": "2hTF5beuHN5qtU4A4vjdpQ", "question": "Why do sheep have colored dye on their backs?", "choices": ["unknown", "mating details", "identify owner", "identify breed"], "correct_choice_idx": 1, "direct_answers": ["tracking", "identification", "unknown", "marking", "mark sex", "mating details", "mated", "marking", "identification", "identification"], "difficult_direct_answer": false, "rationales": ["The dye indicates if the sheep has been bred and are potentially pregnant.", "The sheep are marked so they are easy to identify.", "This coloring might be used to distinguish certain sheep from others because to humans they make look similar to each other especially when there are many."], "image": "train2014/COCO_train2014_000000274885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485822, "question_id": "2hi9B5XRjJiGRubKuWN86E", "question": "What event is going to take place?", "choices": ["car speeding", "car show", "motorcycle parade", "motorcycle sale"], "correct_choice_idx": 2, "direct_answers": ["race", "motorcycle parade", "biking", "motorcycle rally", "race", "bike race", "festival", "motorcycle show", "motorcycle rally", "motorcycle event"], "difficult_direct_answer": false, "rationales": ["There are many motorcycles parked together.", "Car dealerships sell motorcycles as well.", "A motorcycle parade will take place since so many are gathered up."], "image": "val2014/COCO_val2014_000000485822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508811, "question_id": "2hmBV6cU4oZPkKSsy4riPA", "question": "What is most likely in the air?", "choices": ["tennis ball", "kite", "airplane", "frisbee"], "correct_choice_idx": 0, "direct_answers": ["tennis ball", "ball", "ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball"], "difficult_direct_answer": false, "rationales": ["The man is holding a racquet and standing on a court.", "The ball is in the air.", "He is holding a racket meant to play said sport on a court for said game."], "image": "val2014/COCO_val2014_000000508811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393274, "question_id": "2i6X43BtxZiuiFByMcE7tx", "question": "What letter is on the front of the train?", "choices": ["e", "x", "c", "w"], "correct_choice_idx": 2, "direct_answers": ["c", "letter c", "letter c", "letter c", "ca", "cee", "letter c", "c", "letter c", "letter c"], "difficult_direct_answer": false, "rationales": ["The letter is c.", "C is the letter.", "A train is marked with ca on the side."], "image": "val2014/COCO_val2014_000000393274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403913, "question_id": "2i9oGcGomsHkxTGQcDNLqg", "question": "What is the orange stuff in the bowl?", "choices": ["pumpkin", "candy corn", "carrot", "squash"], "correct_choice_idx": 2, "direct_answers": ["carrots", "carrot", "carrot", "carrots", "carrots", "carrots", "carrots", "carrot", "carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["It is shredded and added to the red cabbage.", "The ingredients are shown beside the bowl and the orange ones are carrots.", "It is a long orange vegetable."], "image": "train2014/COCO_train2014_000000403913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201969, "question_id": "2iMfro3PsLGb6fBkfCauzf", "question": "What breed of dog is sitting near the fence?", "choices": ["pomeranian", "rottweiler", "dachshund", "husky"], "correct_choice_idx": 3, "direct_answers": ["husky", "siberian husky", "labrador", "golden retriever", "husky", "husky", "golden retriever", "husky", "husky", "husky"], "difficult_direct_answer": false, "rationales": ["That is the type of dog near the fence.", "The dog sitting near the fence has a white face and blue eyes like huskies have.", "The distinct look of the husky to wolves, makes it easy to identify the breed."], "image": "train2014/COCO_train2014_000000201969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251408, "question_id": "2iUS23H6MfnnYfHddA4o2s", "question": "What is on the top of the building?", "choices": ["cross", "human", "weathervane", "bird"], "correct_choice_idx": 0, "direct_answers": ["cross", "clock", "cross", "cross", "spire", "cross", "cross", "cross", "cross", "cross"], "difficult_direct_answer": false, "rationales": ["There is a cross on the top of the building that is lit up.", "It is lit up and clearly visible, even at night.", "Crosses are symbols often found at the top of tall buildings."], "image": "train2014/COCO_train2014_000000251408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7952, "question_id": "2icnVjovXa6yTr3cAJwaxD", "question": "What could happen if the white truck parks a few feet directly ahead?", "choices": ["parade", "collision", "movie", "sunset"], "correct_choice_idx": 1, "direct_answers": ["train crash", "train accident", "get hit", "collision", "train crash", "train collision", "collision", "collision", "accident", "acciednt"], "difficult_direct_answer": false, "rationales": ["The white truck is near train tracks. the train would hit the white truck if it were to park a few feet directly ahead.", "The two vehicles would drive into each other.", "The truck is close to the train tracks and moving a few feet ahead would put it in the trains path."], "image": "val2014/COCO_val2014_000000007952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470474, "question_id": "2idNVWgEPGkhxBQEcXXAhR", "question": "What material is used to make roofing for buildings on the right side of this street?", "choices": ["grass", "clay", "tin", "sod"], "correct_choice_idx": 1, "direct_answers": ["clay", "stone", "terra cotta", "tiles", "clay", "shingles", "brick", "thatch", "wood", "mud"], "difficult_direct_answer": true, "rationales": ["This kind of dirt can be molded into tiles for a roof.", "Looks like a clay roof.", "The roofing on the right side building appear to be made from soft material due to its shape and texture. the most likely material that fits this criteria is clay."], "image": "train2014/COCO_train2014_000000470474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458682, "question_id": "2iiEs5EqxQHc7Bgqf9b7j4", "question": "What fruit is in the picture?", "choices": ["mangos", "apples", "peaches", "oranges"], "correct_choice_idx": 3, "direct_answers": ["oranges", "oranges", "oranges", "orange", "oranges", "oranges", "orange", "oranges", "orange", "oranges"], "difficult_direct_answer": false, "rationales": ["This fruit is the same color.", "Those are the only things in the picture.", "These are round and the color that is the same as its name"], "image": "train2014/COCO_train2014_000000458682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421630, "question_id": "2iyXffYQGw9trYxBALkkgA", "question": "Which artist depicted Polynesians practicing this sport on the Sandwich Islands?", "choices": ["paul gauguin", "georges seurat", "michael donahue", "john webber"], "correct_choice_idx": 3, "direct_answers": ["dali", "john webber", "lady", "captain cook", "polynesian artist", "philip roberts", "buzz", "leonardo dicaprio", "john webber", "isabella bird"], "difficult_direct_answer": true, "rationales": ["The artist was webber.", "He was an artist that did this work", "According to google john webber was the first to depict polynesians paddle boarding."], "image": "train2014/COCO_train2014_000000421630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381684, "question_id": "2jLWwjQVsrjUxtatEvx6nM", "question": "What animals are looking back at the cows?", "choices": ["horse", "dog", "giraffe", "cat"], "correct_choice_idx": 1, "direct_answers": ["dog", "dogs", "dogs", "dogs", "dogs", "dogs", "dogs", "dogs", "dogs", "dogs"], "difficult_direct_answer": false, "rationales": ["There are two dogs looking at the cows.", "Two dogs stand on the opposite side of a fence as a bunch of cows.", "The animals are on leashes and have canine features."], "image": "train2014/COCO_train2014_000000381684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334715, "question_id": "2jVxmNziSRBMRcqc8ogmHH", "question": "What type of bottoms does the woman in white have on?", "choices": ["capris", "bikini", "skirt", "shorts"], "correct_choice_idx": 1, "direct_answers": ["swim trunk", "bikini", "bikini", "bathing suit", "bikini", "bikini", "swim suit", "bikini", "swim suit", "one piece"], "difficult_direct_answer": false, "rationales": ["The surfer has on a bikini swimsuit.", "A woman is surfing in small bottoms. bikinis are worn in the water.", "The woman has bikini bottoms."], "image": "train2014/COCO_train2014_000000334715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489845, "question_id": "2jXjxP2xTX6CeYAASqksa4", "question": "What type of room are the kids in?", "choices": ["bedroom", "recreation", "bathroom", "kitchen"], "correct_choice_idx": 1, "direct_answers": ["living room", "living room", "living room", "media", "living room", "living room", "tv room", "recreation", "living", "living room"], "difficult_direct_answer": false, "rationales": ["The kids are surrounded with games and movies.", "The room has a television and movies. it does not have a stove, a toilet, or a bed.", "The kids are in the rec room."], "image": "train2014/COCO_train2014_000000489845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216042, "question_id": "2jezXbAdgtzbLo8cmKk6Qy", "question": "What does the white truck do?", "choices": ["sells food", "transports passengers", "transports utensils", "transports rice"], "correct_choice_idx": 0, "direct_answers": ["sell food", "food truck", "food truck", "sells food", "sells food", "food truck", "sell food", "sells food", "food truck", "makes food"], "difficult_direct_answer": false, "rationales": ["You can tell because the side of the vehicle says burritos and rice plates.", "A large truck lists types of food on the side.", "The truck advertises tacos and burritos."], "image": "train2014/COCO_train2014_000000216042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132954, "question_id": "2jmKA8paLFHKnXxN86f6Ja", "question": "When was the Union Jack invented?", "choices": ["1606", "1612", "1672", "1619"], "correct_choice_idx": 0, "direct_answers": ["sixteenhundred six", "1606", "1946", "long ago", "1606", "olden times", "1606", "1606", "1606", "1950s"], "difficult_direct_answer": false, "rationales": ["The union jack was invented in 1606.", "I used the internet to search for the date when the union jack was created.", "The union jack was invented just after 1600."], "image": "train2014/COCO_train2014_000000132954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395230, "question_id": "2k9KTy5hmfGhrUiPwZgMba", "question": "What hobby is the person who is driving this car today doing now?", "choices": ["pet torture", "photography", "golf", "sewing"], "correct_choice_idx": 1, "direct_answers": ["dog sitting", "pet sitting", "dog park", "driving dog", "walking dog", "dog caring", "doggy sitting", "walking", "photography", "dogs"], "difficult_direct_answer": true, "rationales": ["There is no one in the car now so they are probably nearby somewhere. chance are that they are the one taking this photo since a pet dog is being left alone there posing.", "Photography seems like the only feasible answer to this picture.", "The person is taking a picture of the dog in the car."], "image": "val2014/COCO_val2014_000000395230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249128, "question_id": "2kPt5KZso5oeE845N8Tghx", "question": "What kind of athlete was the man in the black and white image most likely?", "choices": ["footballer", "swimmer", "runner", "cyclist"], "correct_choice_idx": 3, "direct_answers": ["runner", "tennis", "bicyclist", "golfer", "basketball player", "cyclist", "baseball player", "baseball player", "founder", "runner"], "difficult_direct_answer": false, "rationales": ["He looks to be a football player", "The athlete is a cyclist.", "The picture has a sign underneath that mentions the tour de france, which is a famous bike race."], "image": "train2014/COCO_train2014_000000249128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294954, "question_id": "2kS3WozT56ua84kA3kBFCq", "question": "Who are the men standing on the right of the image?", "choices": ["passengers", "adventurers", "drivers", "workers"], "correct_choice_idx": 3, "direct_answers": ["workers", "employees", "maintenance workers", "workers", "workers", "new crew", "workers", "workers", "mechanics", "passengers"], "difficult_direct_answer": false, "rationales": ["The men are wearing working gear.", "Men in uniforms are on the side of a train. people wear uniforms when they are working.", "You can tell by the work clothes the people have on as to who or what they are."], "image": "train2014/COCO_train2014_000000294954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193318, "question_id": "2kSq6yFLdFwAN2eqebrhae", "question": "What type of pants is this person wearing?", "choices": ["shorts", "bell bottoms", "sweatpants", "jeans"], "correct_choice_idx": 2, "direct_answers": ["suede", "jeans", "jeans", "sweatpants", "jeans", "sweat pants", "jeans", "jeans", "sweat pants", "jeans"], "difficult_direct_answer": false, "rationales": ["This person has a set of comfy pants on.", "The man is wearing slouchy warm pants.", "The pants are sweats."], "image": "train2014/COCO_train2014_000000193318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69826, "question_id": "2kXbJSFY92afqGVwXmj358", "question": "What is the toothbrush inside of?", "choices": ["cabinet", "flower pot", "skull dish", "milk crate"], "correct_choice_idx": 2, "direct_answers": ["skull dish", "skull mug", "skull", "skull", "skull", "skull", "skull", "skull cup", "skull", "plastic"], "difficult_direct_answer": false, "rationales": ["The toothbrush is inside of a toothbrush holder that is shaped like a skull.", "It is a ceramic depiction of this", "There is a skull on it."], "image": "train2014/COCO_train2014_000000069826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52689, "question_id": "2kZRSW6FbFMS52EWY5Jg2K", "question": "To what elevation might someone ride on the ski lift?", "choices": ["same", "higher", "none", "lower"], "correct_choice_idx": 1, "direct_answers": ["four meters", "medium", "higher", "20 meters", "fifty feet", "high up", "snow", "4000 feet", "high", "higher"], "difficult_direct_answer": true, "rationales": ["A ski lift takes a person from the bottom of the hill to the top of the hill so the elevation would increase as the lift went up.", "The guy will go higher.", "The purpose of a ski lift is to move people from the bottom of a hill or mountain to the top. if one is on a chair lift and not specifically at the top they are likely moving up."], "image": "val2014/COCO_val2014_000000052689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538204, "question_id": "2kr95K8WFZkN4kAtkDMJYh", "question": "What type of horticulture is occurring here?", "choices": ["container", "aeroponics", "raised bed", "hydroponics"], "correct_choice_idx": 3, "direct_answers": ["rooting", "onion growing", "growing bulbs", "olericulture", "medicinal", "growing", "growing onions", "growing crops", "hydroponics", "hydroponic"], "difficult_direct_answer": true, "rationales": ["This type of growing is done with water and no soil as is seen in these vases.", "The process is used to put onions in a jar.", "The plants are growing in water."], "image": "val2014/COCO_val2014_000000538204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125316, "question_id": "2ksCvFRCeHy7kJKuo58Bn5", "question": "How many living species of elephants are currently recognized?", "choices": ["three", "four", "six", "five"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "one", "two", "three", "five", "three", "three", "three-living species"], "difficult_direct_answer": false, "rationales": ["There are african savannah, african forest, and asian elephants.", "The species are the african bush, african forest, and asian.", "Living there are three of them."], "image": "train2014/COCO_train2014_000000125316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118477, "question_id": "2mepZf3dZHz7C4xvds6qYV", "question": "Based on the reflections where is this bowl of cereal placed?", "choices": ["kitchen", "cafe", "living room", "office"], "correct_choice_idx": 0, "direct_answers": ["granite countertop", "kitchen counter", "counter", "kitchen", "counter", "counter", "strawberry foods", "counter", "table", "kitchen"], "difficult_direct_answer": false, "rationales": ["There is a reflection of an oven off of the countertop.", "The kitchen is present.", "The cereal is on the kitchen counter."], "image": "train2014/COCO_train2014_000000118477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192513, "question_id": "2mrBfDVMPzD2TCDpEKZoCQ", "question": "To whom is this ball thrown?", "choices": ["infield", "ref", "coach", "batter"], "correct_choice_idx": 3, "direct_answers": ["batter", "catcher", "batter", "batter", "batter", "batter", "catcher", "catcher", "batter", "catcher"], "difficult_direct_answer": false, "rationales": ["The ball is going to the batter.", "The pitcher's position the way they are throwing the ball is normally towards the batter in baseball.", "The pitcher will throw it to other players during the game."], "image": "train2014/COCO_train2014_000000192513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561442, "question_id": "2mvRzfVCWUHgnQvSEUyJXi", "question": "What is on the sign?", "choices": ["stop", "yield", "go", "x"], "correct_choice_idx": 3, "direct_answers": ["red x", "no entry", "red x", "red x", "x", "x", "x", "x", "no parking", "x sign"], "difficult_direct_answer": false, "rationales": ["It is a crossing sign.", "The street sign by the streetlight is a crossing sign.", "The sign is a round circle with a red x going through the middle."], "image": "train2014/COCO_train2014_000000561442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228172, "question_id": "2npuLtjjUqnW9sp9RsGZGT", "question": "What is the large vehicle's purpose?", "choices": ["transport cars", "transport furniture", "transport trash", "transport family"], "correct_choice_idx": 2, "direct_answers": ["transport trash", "collecting garbage", "trash collection", "trash collecting", "fire extigusher", "collect garbage", "trash pickup", "garbage removal", "garbage", "waste management"], "difficult_direct_answer": true, "rationales": ["It says \"waste management\" on its side.", "The large vehicle is a truck. a dumpster is being emptied into the truck.", "If you see a large green truck the odds are that it is a garbage truck."], "image": "train2014/COCO_train2014_000000228172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567836, "question_id": "2nsdYDqygVPg8w84tHWHQ2", "question": "Why are the horses hitched together?", "choices": ["training", "pulling wagon", "mating", "to plow"], "correct_choice_idx": 3, "direct_answers": ["plowing", "share workload", "plowing", "plowing", "plowing", "to plow", "to plow", "control", "roof", "plowing"], "difficult_direct_answer": false, "rationales": ["The horses are working on a farm. they are not pulling a wagon.", "The horses are hitched together to keep them aligned to accomplish the task of pulling the farming equipment.", "Horses have more power than a man. it takes power to till a field."], "image": "train2014/COCO_train2014_000000567836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403489, "question_id": "2o2WR3QrwogfzuvQsUQnT9", "question": "What is his favorite sport?", "choices": ["swimming", "lacrosse", "running", "basketball"], "correct_choice_idx": 2, "direct_answers": ["running", "track", "baseball", "track", "track", "track", "track field", "track field", "track field", "track field"], "difficult_direct_answer": false, "rationales": ["The person is wearing a track and field jacket.", "This person loves track and field since that's what the jacket states.", "A man is wearing a coat with a track and field logo on the back."], "image": "train2014/COCO_train2014_000000403489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294297, "question_id": "2oCb2CXagaKxPLcsTU8sXu", "question": "If this is Chinese food how was it most likely cooked?", "choices": ["barbecue grill", "pan seared", "stir fried", "oven"], "correct_choice_idx": 2, "direct_answers": ["wok", "stir fry", "stir fried", "chinese food", "steamed", "steamed", "in wok", "fried", "wok", "steamed"], "difficult_direct_answer": false, "rationales": ["The chinese food was probably stir fried as this is a common cooking technique in that cuisine.", "Food like this is usually cooked in a wok with oil as it's stirred.", "All the veggies mixed is usually in a stir fry."], "image": "val2014/COCO_val2014_000000294297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288921, "question_id": "2oNbYWfxkwwH2xMomihL6d", "question": "What type of event is this?", "choices": ["birthday party", "funeral", "reception", "farmer's market"], "correct_choice_idx": 3, "direct_answers": ["marketing", "market", "market", "market", "fruit market", "farmer's market", "market", "fruit market", "market", "farmers market"], "difficult_direct_answer": false, "rationales": ["The food is at a farmer's market.", "Multiple kinds of fruits are for sale.", "The picture is obvious to understand what is going on due to the different types of produce."], "image": "train2014/COCO_train2014_000000288921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43561, "question_id": "2oWf9FxcRonMKfY6bcHw8B", "question": "What is the rectangular grey object in the middle of the dirt field?", "choices": ["outhouse", "garbage can", "mailbox", "chest"], "correct_choice_idx": 1, "direct_answers": ["bus", "trashcan", "trash can", "trash can", "trashcan", "trashcan", "trash can", "trash can", "trash bin", "garbage can"], "difficult_direct_answer": false, "rationales": ["This is a standard grey trash bin.", "The object is a garbage can.", "The object is clearly visible and has a lid and wheels and is the right size to be answer a."], "image": "val2014/COCO_val2014_000000043561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468884, "question_id": "2ogZpBiHFQD4Bdv2KcPp6x", "question": "What are they walking in?", "choices": ["woodchips", "water", "mud", "gravel"], "correct_choice_idx": 2, "direct_answers": ["dirt", "mud", "bush", "field", "mud", "mud", "safari", "dirt", "dirt", "water"], "difficult_direct_answer": false, "rationales": ["The ground looks very muddy.", "Zebras are known to be black and white, but they have brown smears onn them here, indicating that they are muddy.", "They are zebras. they walk in natural environments, which has natural sediments and deposits on the ground."], "image": "train2014/COCO_train2014_000000468884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349363, "question_id": "2p4WjSaLzfpqtNkf4SSgbJ", "question": "During which season are the cars traveling on the road?", "choices": ["winter", "summer", "fall", "spring"], "correct_choice_idx": 3, "direct_answers": ["fall", "fall", "winter", "spring", "summer", "winter", "spring", "winter", "spring", "winter"], "difficult_direct_answer": false, "rationales": ["Looks to be a pretty day out.", "The trees are a fresh green color.", "Cars are on the street and the trees on the side have green leaves. trees have green leaves during the spring."], "image": "train2014/COCO_train2014_000000349363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427518, "question_id": "2p5yyyu4QUyMfBhwNtMGWN", "question": "What are they doing?", "choices": ["chatting", "fighting", "arguing", "resting"], "correct_choice_idx": 2, "direct_answers": ["playing baseball", "playing baseball", "talking", "talking", "talking", "talking", "arguing", "talking", "playing baseball", "talking"], "difficult_direct_answer": false, "rationales": ["The men in the picture seem calm and at ease.", "They are seen standing hence they are chatting.", "Two men with different color uniforms on are on a baseball field talking with stern expressions."], "image": "val2014/COCO_val2014_000000427518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249384, "question_id": "2pCZGCKf2bcqatCUJ4mXgK", "question": "What's the name of the company that made the canned drink?", "choices": ["pepsi", "sprite", "coca-cola", "dr. pepper"], "correct_choice_idx": 0, "direct_answers": ["pepsi", "coca cola", "pepsico", "pepsi", "pepsi", "pepsi co", "pepsi", "pepsi", "pepsi", "pepsi"], "difficult_direct_answer": false, "rationales": ["You can see a part of the logo on the can that belongs to pepsi.", "That company has the red, white, and blue swirl in a circle.", "The can on the table has the pepsi logo on it."], "image": "train2014/COCO_train2014_000000249384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114598, "question_id": "2pMWV5w7dWAShEW2ZGRh2f", "question": "Where are they going?", "choices": ["rest stop", "home", "lunch", "uphill"], "correct_choice_idx": 3, "direct_answers": ["skiing", "uphill", "uphill", "uphill", "over hill", "uphill", "uphill", "cross country", "skiing", "downhill"], "difficult_direct_answer": false, "rationales": ["The people are climbing up higher or uphill.", "The incline of the surface is tilted so that the lower part is behind them and they are facing the higher part.", "People are skiing and are facing uphill on a mountain."], "image": "train2014/COCO_train2014_000000114598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581071, "question_id": "2pMqUz97nCZNNHfyLLm8CA", "question": "What is sold inside this store?", "choices": ["tires", "groceries food", "jewels", "paper hats"], "correct_choice_idx": 1, "direct_answers": ["sundries", "fruits", "apples", "produce", "fruit", "drug store", "produce", "food", "groceries food", "bikes"], "difficult_direct_answer": true, "rationales": ["The groceries are sold inside.", "The store has fruits on the front of it indicating it sells produce.", "You can see the fruits on the sign so it's a grocery store."], "image": "train2014/COCO_train2014_000000581071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184377, "question_id": "2pQgwofg9Pj8S4AAKPfLCo", "question": "What non living animals are portrayed most frequently here?", "choices": ["none", "horses", "dogs", "lions"], "correct_choice_idx": 3, "direct_answers": ["lion", "lions", "horses", "lions", "horses", "vehicles", "cars", "statue", "lions", "horses"], "difficult_direct_answer": false, "rationales": ["The statues are made of marble and are shaped in the likeness of a lion", "The animal is the lion.", "The statue features a lion."], "image": "train2014/COCO_train2014_000000184377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42055, "question_id": "2pbBe9M67smArNNMW2fWeP", "question": "What item here is most profuse and likely offered for sale?", "choices": ["boots", "rain coats", "hats", "parasols"], "correct_choice_idx": 3, "direct_answers": ["umbrella", "umbrellas", "umbrella", "parasols", "umbrella", "umbrella", "umbrella", "umbrellas", "umbrella", "umbrellas"], "difficult_direct_answer": false, "rationales": ["There are many sizes of the umbrellas.", "Parasols are on display.", "A shop will set up items for sale in a display and open things like these umbrellas to show the design."], "image": "val2014/COCO_val2014_000000042055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432486, "question_id": "2pmQHTP9usUi9j6izsRGqj", "question": "The equine figure seen here is what type?", "choices": ["rocking", "taxidermied", "stuffed", "roan"], "correct_choice_idx": 0, "direct_answers": ["toy horse", "rocking horse", "horse", "rocking horse", "horse", "rocking horse", "hobby horse", "rocking", "toy", "vase"], "difficult_direct_answer": false, "rationales": ["It is wooden, on a spring and small enough for a child to play on.", "Equine means horse. the horse is a toy.", "The horse is a rocking one."], "image": "train2014/COCO_train2014_000000432486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189241, "question_id": "2q2iRewZiKvao7vqL8WqH3", "question": "What orange vegetable is probably in the jar on the left?", "choices": ["carrots", "peppers", "tomatoes", "squash"], "correct_choice_idx": 1, "direct_answers": ["carrot", "carrots", "carrot", "carrots", "carrots", "peppers", "peppers", "carrots", "peppers", "carrots"], "difficult_direct_answer": false, "rationales": ["The shape looks like they are peppers.", "Peppers are usually a vibrant orange color.", "The orange vegetable is not a carrot, only really leaving one type left for it to be."], "image": "val2014/COCO_val2014_000000189241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11300, "question_id": "2q6YSHE97ZoNayC9armhFZ", "question": "What is next to the skateboard?", "choices": ["baby", "cat", "shoe", "apple"], "correct_choice_idx": 0, "direct_answers": ["boy", "rail", "baby", "kid", "rail", "baby", "kid", "boy", "child", "toddler"], "difficult_direct_answer": false, "rationales": ["There is a skateboard on the floor next to a young baby.", "A quick look tells us that there is a baby (toddler) right next to the skateboard.", "The little guy is touching the skateboard."], "image": "val2014/COCO_val2014_000000011300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310770, "question_id": "2qFBsdJzskopcje4kMhKvt", "question": "What is the container on top of the tin foil holding?", "choices": ["ice cream", "fries", "sauce", "milk"], "correct_choice_idx": 2, "direct_answers": ["cheese", "sauce", "liquid", "no idea", "sauce", "sauce", "sauce", "ranch dressing", "sauce", "cheese"], "difficult_direct_answer": false, "rationales": ["The container has sauce.", "The person would dip their sandwich in the sauce", "The preparation seems to be of sauce."], "image": "val2014/COCO_val2014_000000310770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17936, "question_id": "2qJFQrZkz6drJhQhunsG6S", "question": "What is in the white rectangular package to the left of the cat?", "choices": ["cheese", "mail", "chocolate", "legos"], "correct_choice_idx": 2, "direct_answers": ["seeds", "chocolate", "chocolate", "chocolate", "cat food", "chocolate", "candy bar", "chocolate bar", "chocolate", "chocolate"], "difficult_direct_answer": false, "rationales": ["The wrapping indicates the dark kind and shows the actual item.", "The package has chocolate.", "It is a dark variant of a food item. cheese is perishable, so it would not be stored outside of a fridge."], "image": "train2014/COCO_train2014_000000017936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268334, "question_id": "2qfDjbRQKMGRm8u3GMfULf", "question": "What is the person trying to record?", "choices": ["images", "notes", "sound", "movement"], "correct_choice_idx": 2, "direct_answers": ["sound", "sheep", "dogs", "sheep", "sheep sounds", "sheep sounds", "animals", "animals", "park", "sheep count"], "difficult_direct_answer": false, "rationales": ["A man is holding a microphone near two animals.", "The person is holding a boom microphone, not a camera.", "The man is trying to capture the sheep snoring or making noise."], "image": "train2014/COCO_train2014_000000268334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479560, "question_id": "2r8SWUogFQSTaCTSjcMPQL", "question": "Why does he have his arm up?", "choices": ["reach", "gesture", "wave", "measure"], "correct_choice_idx": 0, "direct_answers": ["reach", "hitting ball", "hit ball", "to swing", "serving", "serving", "hitting ball", "hit ball", "hitting ball", "serve ball"], "difficult_direct_answer": false, "rationales": ["Having his arm stretched out gives the man a better range of motion for his hand and racket.", "The tennis player has his arm up to hit the tennis ball with his racket.", "A tennis player has his arm elevated and is swinging."], "image": "train2014/COCO_train2014_000000479560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46322, "question_id": "2r8bdfHsstEya2aEPzGb4g", "question": "What contest are the men participating in?", "choices": ["wrestling", "boxing", "eating", "karate"], "correct_choice_idx": 2, "direct_answers": ["hotdog eating", "eating contest", "eating", "hotdog eating", "eating", "eating", "hot dog", "hotdog eating", "eating", "hotdog eating"], "difficult_direct_answer": false, "rationales": ["It appears that the two contestants have hotdogs in their hands so they much be competing to see who can consume the most hotdogs.", "The picture of the men eating hotdogs is the only reasonable answer here.", "The people are eating hotdogs."], "image": "train2014/COCO_train2014_000000046322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394418, "question_id": "2rGGBBZMGi9PRcYZpYhZtQ", "question": "What is his wrist accessory used for?", "choices": ["administer insulin", "measure speed", "tell time", "wipe sweat"], "correct_choice_idx": 2, "direct_answers": ["time", "telling time", "tell time", "serving", "time", "time", "time", "watch", "tell time", "telling time"], "difficult_direct_answer": false, "rationales": ["The accessory is for telling time.", "He is wearing a wristwatch.", "It is based on a 24 hour day. the mechanisms move each second to indicate progression through the day."], "image": "val2014/COCO_val2014_000000394418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391486, "question_id": "2rJaxPnac7DkSdJxFXbYZX", "question": "Based on the food and location this is most likely this person's what?", "choices": ["dinner", "breakfast", "midnight snack", "lunch"], "correct_choice_idx": 1, "direct_answers": ["orange", "breakfast", "sick person", "breakfast", "snack", "lunch", "breakfast", "breakfast", "snack", "breakfast"], "difficult_direct_answer": false, "rationales": ["Many people like to eat fruit and protein bars for their first meal to give them a good start on the day.", "The food is breakfast.", "The energy drink, oranges and meal bar are food items to start the day with."], "image": "train2014/COCO_train2014_000000391486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489382, "question_id": "2rKpTCw9vxsMje7jBHTCUw", "question": "What are the animals using the bridge to do?", "choices": ["play", "sleep", "sit", "cross water"], "correct_choice_idx": 3, "direct_answers": ["cross", "cross river", "cross stream", "cross", "cross river", "cross over", "cross creek", "cross water", "cross water", "cross stream"], "difficult_direct_answer": false, "rationales": ["The animals want to cross.", "The animals are awake and are walking. they are not playing.", "The sheep want to get to the other field. they are on the bridge."], "image": "val2014/COCO_val2014_000000489382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141163, "question_id": "2rMhLFFFqxFpXNAr5mnMfe", "question": "What is being cut?", "choices": ["chocolate sauce", "pudding", "gooey cake", "bread"], "correct_choice_idx": 2, "direct_answers": ["gooey cake", "pie", "cookie", "dessert", "dessert", "dessert", "cake", "cake", "desert", "cake"], "difficult_direct_answer": false, "rationales": ["The chocolate is really runny and sticky with a pastry.", "A pastry with chocolate running out of it is on a plate on a table and is being cut. cakes sometimes have filling.", "There is brown liquid flowing out of the cut."], "image": "val2014/COCO_val2014_000000141163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153814, "question_id": "2reFmN4HbDRdHVwVdstsYG", "question": "Why is the woman seated here?", "choices": ["to eat", "to work", "to wait", "to paint"], "correct_choice_idx": 0, "direct_answers": ["to eat", "eating", "eating", "eating", "eating pizza", "eating", "eating", "to eat", "to eat", "to eat"], "difficult_direct_answer": false, "rationales": ["This woman is sitting in front of pizza so she wants to eat it.", "A woman is seated in front of a pizza at a table.", "There is food and drinks on the table, indicating it is at a restaurant."], "image": "train2014/COCO_train2014_000000153814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322427, "question_id": "2rtUztjWdJVzKLv9uZxM2A", "question": "What is the highest object in the room?", "choices": ["couch", "hanging light", "dog bed", "table"], "correct_choice_idx": 1, "direct_answers": ["light", "ceiling lamp", "chandelier", "ceiling", "vent", "ceiling light", "lamp", "ceiling lamp", "vent", "hanging light"], "difficult_direct_answer": false, "rationales": ["The light that is hanging from the ceiling.", "The hanging light is suspended from the ceiling and it's the only object on the ceiling, so it must be the highest object in the room.", "A light hangs from the ceiling in a room."], "image": "train2014/COCO_train2014_000000322427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63671, "question_id": "2s77G67Z5HtiK6jLaLXeNR", "question": "Where can these foods be found?", "choices": ["fast food", "bar", "garden", "office"], "correct_choice_idx": 2, "direct_answers": ["ground", "vegetable market", "produce section", "garden", "ground", "produce section", "garden", "garden", "grocery store", "garden"], "difficult_direct_answer": false, "rationales": ["The items are various vegetables that are grown in a garden.", "All of these can be grown in a garden.", "Vegetables are typically grown in a garden."], "image": "val2014/COCO_val2014_000000063671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517723, "question_id": "2siybFe8K22eBFQEXYAEZR", "question": "What vehicles can use the lane nearest to the men?", "choices": ["bicycles", "cabs", "trucks", "motorcycles"], "correct_choice_idx": 0, "direct_answers": ["bicycles", "bike", "bikes", "bikes", "bikes", "bicycles", "bicycles", "bicycles", "bikes", "bicycles"], "difficult_direct_answer": false, "rationales": ["The vehicles are bikes.", "The non-arrow white symbols in that lane represent two-wheeled non-motorized vehicles.", "People are walking along a thin path next to a road. bike paths run along roads."], "image": "train2014/COCO_train2014_000000517723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364031, "question_id": "2st5q86oarN2Tm7Lm7KjJZ", "question": "What is this person's profession?", "choices": ["singer", "dancer", "lawyer", "fisherman"], "correct_choice_idx": 3, "direct_answers": ["food vendor", "cook", "cook", "fisherman", "chef", "fisherman", "chef", "cooking", "vendor", "chef"], "difficult_direct_answer": false, "rationales": ["He is a fisherman and you can see his boat there too.", "The person has seafood they are selling.", "The person has the available for sale that was probably caught that day."], "image": "val2014/COCO_val2014_000000364031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268523, "question_id": "2tMdYfr2pzaftvy3kjBvRf", "question": "These boats are most likely in what kind of place?", "choices": ["marina", "ocean", "lake", "river"], "correct_choice_idx": 0, "direct_answers": ["port", "harbor", "harbor", "marina", "marina", "harbor", "marina", "marina", "harbor", "war"], "difficult_direct_answer": false, "rationales": ["The boats are in a body of water. they are docked near its edge.", "A marina is normally the place boats are kept.", "They are all docked in a marina."], "image": "train2014/COCO_train2014_000000268523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224601, "question_id": "2tXyR9M3cjTKGUrhaftPWk", "question": "What time of day does the bus drive in here?", "choices": ["noon", "1 pm", "11 am", "sunset"], "correct_choice_idx": 3, "direct_answers": ["evening", "early evening", "evening", "evening", "sunset", "noon", "evening", "evening", "evening", "morning"], "difficult_direct_answer": false, "rationales": ["The sky is beginning to darken, and the sun is not visible in the sky. all the other listed options take place at a time of day when it would be light outside.", "The bus drives here during sunset.", "Although it is still light out, the sun is not brightly shining overhead which eliminated all the other options."], "image": "train2014/COCO_train2014_000000224601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354027, "question_id": "2tZLRVygQcYpearj4oR38a", "question": "Why is the young boy holding the older woman's hand?", "choices": ["for guidance", "for protection", "for fun", "for play"], "correct_choice_idx": 0, "direct_answers": ["safety", "safety", "for guidance", "crossing", "safety", "safety", "his mother", "keeping close", "protection", "safety"], "difficult_direct_answer": false, "rationales": ["The boy needs guidance.", "A small child is holding the hand of an adult as she crosses the road.", "A woman is holding a child's hand as they cross the street. adults help children cross safely."], "image": "val2014/COCO_val2014_000000354027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67496, "question_id": "2tbSsLPG5kmdhnxpFZZWZN", "question": "What texture will the main dish have when this has finished cooking?", "choices": ["crunchy", "mushy", "chewy", "al dente"], "correct_choice_idx": 0, "direct_answers": ["gloss", "tender", "crunchy", "soft", "crunchy", "crisp", "oily", "crunchy", "soft", "crispy"], "difficult_direct_answer": false, "rationales": ["The food will be crunchy when cooked.", "The snap peas will still hold their snap.", "It it's fried to a crisp."], "image": "train2014/COCO_train2014_000000067496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401229, "question_id": "2tnrsfgVw8uvKtsUuZ7tKK", "question": "Which majority group of people live in the area before the red sign on the bridge?", "choices": ["spanish", "american", "chinese", "french"], "correct_choice_idx": 2, "direct_answers": ["workers", "chinese", "chinese", "caucasian", "canadians", "chinese", "chinese", "chinese", "chinese", "chinese"], "difficult_direct_answer": false, "rationales": ["The sign has an arrow and references chinatown.", "These are the people who speak the language written on the sign.", "The chinese live here."], "image": "train2014/COCO_train2014_000000401229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3770, "question_id": "2tsqTAUtrDxKjrijpnbM9Q", "question": "What leg is the player using to push her body up?", "choices": ["both", "neither", "left", "right"], "correct_choice_idx": 2, "direct_answers": ["left", "left", "left", "left", "left", "left", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["The player is standing with her right leg in the air and balancing on the other leg.", "The person is standing on their left leg.", "The leg is the left."], "image": "train2014/COCO_train2014_000000003770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406294, "question_id": "2txsTcKVQndwtnZzzhjwrD", "question": "What is this woman listening to?", "choices": ["person talking", "music", "video", "radio"], "correct_choice_idx": 0, "direct_answers": ["phone", "person's voice", "phone", "cellphone", "voicemail", "cellphone", "phone", "person talking", "cell phone", "conversation"], "difficult_direct_answer": false, "rationales": ["She is holding a phone up to her ear", "She's holding a cell phone to her ear", "This womani s listening to a person talking on the phone."], "image": "val2014/COCO_val2014_000000406294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2525, "question_id": "2uiKkiEA2VMZLe2tH6D5Z5", "question": "Why is the woman standing behind the cupcake displays?", "choices": ["selling them", "for practice", "smelling them", "hoarding them"], "correct_choice_idx": 0, "direct_answers": ["selling them", "selling them", "works there", "selling", "selling", "baker", "selling cupcakes", "cashier", "selling them", "she's selling"], "difficult_direct_answer": false, "rationales": ["The woman is selling the cupcakes.", "The woman in the apron is standing behind the cupcake displays because she is selling the cupcakes.", "She looks to be selling them at her stand and going to take money in exchange for her items."], "image": "val2014/COCO_val2014_000000002525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380057, "question_id": "2ukBACGpXbaGpNTJ2S4DxN", "question": "Which is the only direction vehicles can travel?", "choices": ["downwards", "left", "upwards", "right"], "correct_choice_idx": 3, "direct_answers": ["right", "right", "right", "straight", "right", "one way", "east", "right", "one way", "right"], "difficult_direct_answer": false, "rationales": ["There is a one way sign and the arrow is pointing right", "The one way sign marks that cars can only go in one direction.", "There is a \"one way\" road sign pointing in this direction."], "image": "val2014/COCO_val2014_000000380057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183938, "question_id": "2upTqZVzWL6azGoLBmLbuD", "question": "Which one of these Scandinavian countries is represented here?", "choices": ["iceland", "sweden", "finland", "denmark"], "correct_choice_idx": 1, "direct_answers": ["sweden", "sweden", "sweden", "sweden", "boat", "sweden", "sweden", "sweden", "norway", "netherlands"], "difficult_direct_answer": false, "rationales": ["Sweden's flag is shown on the boat.", "You can tell by the flag that is being flown as to what country it is from.", "The country is sweden."], "image": "train2014/COCO_train2014_000000183938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316142, "question_id": "2utpuoa4LAJsUxjewhYmVZ", "question": "What type of horse is it?", "choices": ["rocking", "female", "stuffed", "male"], "correct_choice_idx": 0, "direct_answers": ["rocking horse", "rocking", "rocking", "rocking horse", "brown horse", "rocking horse", "rocking", "rocking", "rocking horse", "rocking"], "difficult_direct_answer": false, "rationales": ["The horse has curved supports that enable it to have a back and forth motion.", "This is a wooden horse you can sit on and move back and forth on.", "It is a rocking horse."], "image": "train2014/COCO_train2014_000000316142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85846, "question_id": "2vJ3vwxi8MMAZF3AQgcd8N", "question": "What is the purple thing on the table?", "choices": ["eggplant", "hammer", "flowers", "poster"], "correct_choice_idx": 2, "direct_answers": ["flowers", "flowers", "flowers", "flowers", "flowers", "flowers", "flowers", "flowers", "flowers", "flowers"], "difficult_direct_answer": false, "rationales": ["The flowers are in a vase which shows what they are.", "Flowers are on the table in a vase.", "These seem to be flowers in a pot on the table."], "image": "train2014/COCO_train2014_000000085846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256903, "question_id": "2w4WzkMbdqJF73FTwqW9LS", "question": "What technique was used to manipulate this photo?", "choices": ["blending", "cloning", "time lapse", "superimposition"], "correct_choice_idx": 2, "direct_answers": ["pile snows", "sequence photography", "copying", "slomo", "coping", "time lapse", "freeze frame", "photoshop", "copying", "replication"], "difficult_direct_answer": true, "rationales": ["The same person is shown in different parts of the photograph which shows the pictures were taken at different times.", "The photo shows the snowboard at various locations along the run.", "The technique is time lapse."], "image": "val2014/COCO_val2014_000000256903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384503, "question_id": "2wEhAisp84z629c35FX4iH", "question": "What holiday is most likely next?", "choices": ["christmas", "thanksgiving", "halloween", "easter"], "correct_choice_idx": 0, "direct_answers": ["christmas", "new years", "christmas", "christmas", "christmas", "new years", "christmas", "christmas", "christmas", "christmas"], "difficult_direct_answer": false, "rationales": ["There is a holiday wreath in the picture, which tells you the holiday coming.", "The next holiday is most likely christmas judging by the wreath in the foreground.", "There appears to be a tree decorated for yuletide with a bow in the corner."], "image": "val2014/COCO_val2014_000000384503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455772, "question_id": "2wZJ4LaB6hNPfBabBGfnms", "question": "What is stopping people from walking up the stairs?", "choices": ["cuffs", "live wire", "chain", "snake"], "correct_choice_idx": 2, "direct_answers": ["chains", "chain", "chains", "chain", "chain", "chain", "chain", "chain", "chain", "chain"], "difficult_direct_answer": false, "rationales": ["There is a yellow chain.", "The stairs are blocked by a non-living yellow metallic object.", "A yellow item with linked segments is blocking off the stairs."], "image": "val2014/COCO_val2014_000000455772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111188, "question_id": "2wjx6PRh6T6YvJq7axTsVK", "question": "Who placed the teddy bear here?", "choices": ["child", "child services", "neighbor", "child's parent"], "correct_choice_idx": 3, "direct_answers": ["on baby", "mother", "mom", "parent", "mother", "mother", "parent", "probably mama", "child's parent", "parent"], "difficult_direct_answer": false, "rationales": ["The teddy bear was put there by an adult, probably the parent.", "You would assume the parent as the infant does not have the ability to do so.", "This is a newborn so the parents put the bear there."], "image": "train2014/COCO_train2014_000000111188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146887, "question_id": "2wphWPKwLVymhu5ERpDHvC", "question": "How many birds are there?", "choices": ["three", "two", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "parrot", "three", "three", "three", "three", "parrot"], "difficult_direct_answer": false, "rationales": ["There are two birds near the top of the tree. an additional bird is below them.", "Three parrots perch on the limbs of a dead tree, with only blue sky in the background. a great majority of wild parrots live in the warm areas of the southern hemisphere.", "Three birds are on a branch."], "image": "val2014/COCO_val2014_000000146887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153038, "question_id": "2wqfH2u2rS9cvJcsskf9Z7", "question": "What fuel does the vehicle in the center of the image use?", "choices": ["coal", "jet fuel", "electricity", "human powered"], "correct_choice_idx": 1, "direct_answers": ["jet fuel", "jet fuel", "jet fuel", "jet fuel", "jet fuel", "jet fuel", "aviation kerosene", "aviation kerosene", "aviation kerosene", "jet fuel"], "difficult_direct_answer": false, "rationales": ["The fuel is jet fuel.", "The vehicle is a jet. the answer is an obvious one. one of the reasons jets burn so hot during accidents is because of the fuel.", "The vehicle taking off is a jet which is powered by jet fuel for flying."], "image": "val2014/COCO_val2014_000000153038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573892, "question_id": "2wtQbTM3U9hE5J7H9Uh3Lg", "question": "From what setting were these fruits picked?", "choices": ["temperate", "none", "arctic", "tropical"], "correct_choice_idx": 3, "direct_answers": ["trees", "banana plants", "not ripe", "plantation", "tropical", "trees", "tropical", "jungle", "trees", "tropical"], "difficult_direct_answer": false, "rationales": ["The setting is tropical.", "The fruits are bananas. they need year-round consistently warm temperatures in order to grow.", "Bananas grow on trees in hot tropical climates."], "image": "train2014/COCO_train2014_000000573892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213381, "question_id": "2wye5cqwjufVhoTxionwuA", "question": "What type of people can normally be found near this beach?", "choices": ["refugees", "farmers", "tourists", "royalty"], "correct_choice_idx": 2, "direct_answers": ["tourists", "tourists", "floridians", "regular", "tourists", "tourists", "tourists", "tourists", "tourists", "amish"], "difficult_direct_answer": false, "rationales": ["The people are tourists.", "They visit different areas and take vacations just to enjoy.", "Horse-drawn carriages and a cruise liner are present in this image. these activities are intended for tourists"], "image": "train2014/COCO_train2014_000000213381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316542, "question_id": "2x27xTjotd4HCZabf24qnF", "question": "The neckwear visible here that is longest is what?", "choices": ["cravat", "necktie", "bow tie", "bolo"], "correct_choice_idx": 1, "direct_answers": ["necktie", "tie", "tie", "necktie", "neck tie", "tie", "tie", "necktie", "man", "tie"], "difficult_direct_answer": false, "rationales": ["The necktie is longer than any of the other neckwear like the bowtie.", "A longer one around the neck is called this.", "It hangs lower than a bow tie"], "image": "train2014/COCO_train2014_000000316542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105482, "question_id": "2xKp9fyKAAKp3rvFjdC6uW", "question": "Why is his empty hand raised?", "choices": ["to balance", "catch ball", "has question", "is waving"], "correct_choice_idx": 0, "direct_answers": ["threw ball", "serving", "tossing ball", "tossed ball", "to balance", "balance", "threw ball", "throwing", "setting serve", "serving ball"], "difficult_direct_answer": true, "rationales": ["He is getting ready to hit the tennis ball which is out of frame, and it takes concentration.", "The person is trying to throw the ball up.", "The hand is for balance."], "image": "val2014/COCO_val2014_000000105482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9451, "question_id": "2xqH68d6RceuVDRAsgUSYk", "question": "What should the skateboarder do right now?", "choices": ["stop", "slow down", "back up", "speed up"], "correct_choice_idx": 3, "direct_answers": ["stop", "stop", "avoid bus", "stand up", "move", "move", "slow down", "speed up", "stay safe", "avoid bus"], "difficult_direct_answer": false, "rationales": ["There is a bus behind them which may start to move.", "The skateboarder is moving downhill so it can increase speed.", "There is a large bus right behind him"], "image": "train2014/COCO_train2014_000000009451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288023, "question_id": "2xyjr4Cteb43SRUv4wgByA", "question": "The treat shown here contains what dairy product?", "choices": ["cheddar", "mozzarella", "cream cheese", "cottage cheese"], "correct_choice_idx": 2, "direct_answers": ["cream cheese", "milk", "cheese", "cheese", "cheese", "cheese", "cheesecake", "cheesecake", "cream cheese", "milk"], "difficult_direct_answer": false, "rationales": ["This is a cheesecake", "The treat shown is a cheesecake and one of the main ingredients is cream cheese.", "It is a main ingredient in this cheesecake."], "image": "train2014/COCO_train2014_000000288023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47873, "question_id": "2xykkBDGfgFF6btWmW5LiE", "question": "What will the cheese listed be served on?", "choices": ["burger", "spaghetti", "bread", "pizza"], "correct_choice_idx": 2, "direct_answers": ["bread", "bread", "grilled sandwich", "bread", "counter", "plate", "plate", "bread", "plate", "plate"], "difficult_direct_answer": false, "rationales": ["The grilled cheese sandwich is served between two slice of this item.", "The restaurant has a sign that says it serves grilled cheese which contains bread.", "The cheese will be served on the bread."], "image": "val2014/COCO_val2014_000000047873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551686, "question_id": "2y3zK8shUWAHMTVg2GP9fW", "question": "What do the persons standing here hope most to see?", "choices": ["their luggage", "cinnabon", "sandwiches", "their cat"], "correct_choice_idx": 0, "direct_answers": ["luggage", "luggage", "their luggage", "their luggage", "service", "luggage", "luggage", "their bags", "luggage", "suitcases"], "difficult_direct_answer": false, "rationales": ["There are several suitcases on a conveyor belt that rotates until the suitcases are claimed.", "The people are standing here hoping to find their luggage.", "This is an arrivals area at an airport. food or pets would not be found here."], "image": "train2014/COCO_train2014_000000551686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531896, "question_id": "2y5FeAprVJNWRBaUzGQrZe", "question": "Where is the man taking the cart?", "choices": ["home", "store", "airport", "goodwill"], "correct_choice_idx": 2, "direct_answers": ["airplane", "airport", "luggage checkin", "home", "bag check", "travelling", "airport", "check in", "airport", "airplane"], "difficult_direct_answer": false, "rationales": ["The man is carrying his luggage on a cart through the airport terminal.", "The man is at an airport.", "The scene on the right side of the picture is typical of people waiting to check in at an airport."], "image": "val2014/COCO_val2014_000000531896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105465, "question_id": "2y6zpCz73AgHD8GUR33CZB", "question": "Which country does this airline likely belong to?", "choices": ["china", "japan", "singapore", "thailand"], "correct_choice_idx": 3, "direct_answers": ["singapore", "thailand", "airline", "thailand", "asia", "japan", "japan", "korea", "america", "japan"], "difficult_direct_answer": false, "rationales": ["Looks to be from thailand.", "The airline is thai airways.", "There is a cleo magazine near the woman. the seats are purple."], "image": "val2014/COCO_val2014_000000105465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119666, "question_id": "2yL2F7Ybq2PBPHxLUdL5qE", "question": "What company made this vehicle?", "choices": ["mercedes", "toyota", "audi", "hyundai"], "correct_choice_idx": 0, "direct_answers": ["mercedes", "bmw", "ford", "chrysler", "mercedes", "mercedes-benz", "mercedes benz", "usa", "mercedes-benz", "mercedes"], "difficult_direct_answer": false, "rationales": ["The round logo on front of the vehicle is for the \"mercedes\" company. it's fairly universally known.", "The symbol on the front of the truck, is widely known as being the branding for mercedes.", "A mercedes logo is on a truck."], "image": "train2014/COCO_train2014_000000119666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319905, "question_id": "2yT7o7NqUySs4ohAuPHwFr", "question": "What is featured by the TV?", "choices": ["dolls", "dancing show", "video game", "workout"], "correct_choice_idx": 2, "direct_answers": ["music", "dresser", "dancing", "hand motion", "dancing animals", "video game", "nintendo game", "game", "video game", "games"], "difficult_direct_answer": true, "rationales": ["The people are playing a wii game together and the wii is a video game console, so a video game is featured by the tv.", "Kids with remotes stand in front of a television.", "The controllers in the children's hands and the graphics on the screen make it look like they're playing a console."], "image": "train2014/COCO_train2014_000000319905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361993, "question_id": "2yYfBFduFsU7yhh66ayxLm", "question": "In many areas of the world what could this man be ticketed for doing?", "choices": ["walking slow", "impeding traffic", "inattentiveness", "jaywalking"], "correct_choice_idx": 3, "direct_answers": ["jaywalking", "crossing", "jaywalking", "jaywalking", "jaywalking", "jaywalking", "crossing", "jaywalking", "jaywalking", "crossing"], "difficult_direct_answer": false, "rationales": ["You have to cross in the cross walks to not get in trouble.", "The man is walking through an intersection not in a crosswalk which is called jaywalking. in many place, this is a misdemeanor offense one can be ticketed for.", "The person is jaywalking."], "image": "val2014/COCO_val2014_000000361993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281906, "question_id": "2zdHTSzNfn2p8KFWTXWTz7", "question": "What is in the silver bottle?", "choices": ["hair spray", "whipped cream", "bug spray", "cooking spray"], "correct_choice_idx": 1, "direct_answers": ["icing", "whipped cream", "whipped cream", "whipped cream", "icing", "whipped cream", "whipped cream", "whip cream", "whipped cream", "whipped cream"], "difficult_direct_answer": false, "rationales": ["The bottle has whipped cream.", "Whip cream usually comes in these type of spray bottles.", "This is the type of container whipped cream comes in and they look to be making a dessert."], "image": "train2014/COCO_train2014_000000281906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259465, "question_id": "2zoCgwo5yY8wb7pFD7qsVc", "question": "What type of diet might the girl have?", "choices": ["omnivore", "meat carnivore", "vegan", "fasting"], "correct_choice_idx": 2, "direct_answers": ["vegan", "vegetable", "vegetarian", "vegan", "vegetarian", "vegan", "vegetarian", "vegetarian", "vegetarian", "fruits vegetables"], "difficult_direct_answer": false, "rationales": ["She is eating, so she is not fasting. the food items are all fruits and vegetables.", "There are only vegetables on the plate.", "A child is in front of a table with only vegetables on it."], "image": "val2014/COCO_val2014_000000259465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464265, "question_id": "328vSVNCTKtBmM92DkuSfg", "question": "What is the oldfashioned name for this type of store?", "choices": ["greengrocer", "famer's market", "greenery", "retail"], "correct_choice_idx": 0, "direct_answers": ["groceries", "greengrocer", "market", "market", "market", "market", "food stand", "market", "market", "produce market"], "difficult_direct_answer": false, "rationales": ["Most times outdoor places selling produce is and was called farmer's market.", "The name is the green grocer.", "That was the name of a grocery store before more stuff was added."], "image": "val2014/COCO_val2014_000000464265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27777, "question_id": "32ej4uhg8gmnWWPB7xBBxn", "question": "What doe the lines behind the bottom most car represent?", "choices": ["turn left", "slow down", "turn right", "no parking"], "correct_choice_idx": 3, "direct_answers": ["no parking", "no parking", "no parking", "no parking", "parking", "no parking", "no parking", "no parking", "no parking", "parking"], "difficult_direct_answer": false, "rationales": ["The lines indicate no parking.", "That is not a driving lane. there is a fire hydrant near the bottom most car that must be accessible.", "The lines indicate where cars can't park."], "image": "train2014/COCO_train2014_000000027777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516619, "question_id": "32i474zFFB2UfdSbttdzQU", "question": "What is the unique skill of this elephant?", "choices": ["throwing", "balancing", "counting", "painting"], "correct_choice_idx": 3, "direct_answers": ["painter", "painting", "painting", "it paints", "painting", "weight eleven", "painting", "painting", "painting", "painting"], "difficult_direct_answer": false, "rationales": ["The skill is painting.", "The elephant is using its trunk to hold a paintbrush to paper.", "The elephant has a paintbrush in its trunk."], "image": "train2014/COCO_train2014_000000516619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235082, "question_id": "32jjS6cJqmK2eGwfMiRXVH", "question": "What guy in blue do?", "choices": ["swung bat", "dropped ball", "hit ball", "hit catcher"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "swinging", "bat", "hit ball", "swinging bat", "swing bat", "swing", "batting", "swung bat", "hit ball"], "difficult_direct_answer": false, "rationales": ["A baseball player is at the home plate and has the bat over his shoulder as the catcher reaches forward with his glove.", "The batter is at the end of his swinging of the bat.", "The player is standing in the batter's box with a bat in hand that is in the position of a follow through, based on their handedness and the location of the bat. if a baseball player was in this position they would likely have just swung the bat which is the action this player would be partaking in for this sport."], "image": "val2014/COCO_val2014_000000235082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113159, "question_id": "32jufRUa6GAAujNLw6QLqx", "question": "What is the girl missing?", "choices": ["socks", "underwear", "shoes", "hat"], "correct_choice_idx": 1, "direct_answers": ["underwear", "staties", "underwear", "underwear", "underwear", "underwear", "underwear", "underwear", "underwear", "underwear"], "difficult_direct_answer": false, "rationales": ["She is wearing shoes, socks, and a hat.", "All you have to do is look. enough said.", "A girl is on a tennis court and hear butt is visible."], "image": "val2014/COCO_val2014_000000113159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356257, "question_id": "32vRajjARssLMrXbUQb5fy", "question": "Which shoes require watering more than daily?", "choices": ["right ones", "none", "all", "left ones"], "correct_choice_idx": 1, "direct_answers": ["non-cactus", "none", "plant shoes", "plant pots", "left", "fourth", "middle", "left", "middle", "far left"], "difficult_direct_answer": false, "rationales": ["The shoes all have cacti planted in them and they don't require a lot of water.", "Cacti don't generally need to be watered daily.", "These are being used as flower pots."], "image": "train2014/COCO_train2014_000000356257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363939, "question_id": "32wUNU23sZjaZD8up9BVU9", "question": "What type of building is this?", "choices": ["hospital", "school", "library", "station"], "correct_choice_idx": 3, "direct_answers": ["train station", "no building", "no building", "train station", "station", "train station", "train station", "train station", "train station", "train station"], "difficult_direct_answer": false, "rationales": ["A train and tracks are inside the building.", "There is a train on the tracks so they normally leave from train stations.", "There is a train and tracks next to a platform, which indicates it is a train station."], "image": "train2014/COCO_train2014_000000363939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511407, "question_id": "332stYnsCpZaWqGXwAg5qb", "question": "What is the item being displayed by the man?", "choices": ["floaty", "surf board", "drawing", "wall paper"], "correct_choice_idx": 0, "direct_answers": ["surfboard", "surfboard", "floaty", "boogie board", "shotboard", "wakeboard", "board", "surfboard", "boogie board", "paddle board"], "difficult_direct_answer": false, "rationales": ["This is a flotation board displayed by the man.", "The man is holding a body board.", "The item is used to ride waves, thus corresponding to the item in option c."], "image": "val2014/COCO_val2014_000000511407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410173, "question_id": "33DPuC3HsYxY85pCTfcoxv", "question": "This travels is belongs to which country?", "choices": ["italy", "us", "germany", "france"], "correct_choice_idx": 2, "direct_answers": ["mexico", "united states", "mexico", "united states", "chile", "usa", "germany", "turkey", "germany", "chile"], "difficult_direct_answer": false, "rationales": ["The bus has the word \"tur\" on it which is german.", "The plane is from germany.", "The green buses give tours in the land of the pope."], "image": "train2014/COCO_train2014_000000410173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206653, "question_id": "33GnTLGZ37e3ET4XiWjjpX", "question": "What type of monster is the woman grooming herself to be?", "choices": ["vampire", "werewolf", "ghost", "zombie"], "correct_choice_idx": 3, "direct_answers": ["zombie", "vampire", "zombie", "zombie", "troll", "zombie", "zombie", "zombie", "zombie", "chuckie"], "difficult_direct_answer": false, "rationales": ["The man has blood all over her and looks dead.", "The outfit is covered in blood and blood is dripping from the mouth, with tattered clothing and a pale face, denoting a zombie.", "The monster is a zombie."], "image": "train2014/COCO_train2014_000000206653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403523, "question_id": "33KfQ6bntCsKAQ2dMSqLjV", "question": "Who do these people wave to?", "choices": ["local citizenry", "camera holder", "donkeys", "selves"], "correct_choice_idx": 1, "direct_answers": ["people", "photographer", "tourists", "photographer", "camera holder", "friends", "cameraman", "friends", "friends", "people belo"], "difficult_direct_answer": false, "rationales": ["The people on the elephant's back are waving to the person that is holding the camera.", "These people wave to the photographers.", "These people are staring at one point, towards the camera."], "image": "train2014/COCO_train2014_000000403523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416644, "question_id": "33Pnsx5T4H7ss6hzU4r687", "question": "Why is the piece of wood under the laptop?", "choices": ["cool down", "perfect angle", "cushion bottom", "charge battery"], "correct_choice_idx": 1, "direct_answers": ["prop up", "prop it", "prop", "viewing up", "perfect angle", "prop", "block", "balance it", "raise angle", "keep cool"], "difficult_direct_answer": true, "rationales": ["The wood provides a good angle for the laptop to be seen and used.", "The piece of wood is wedged under the laptop to give a better view.", "This makes it easier to type on the keyboard"], "image": "train2014/COCO_train2014_000000416644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325775, "question_id": "33RAXSXtDTXUBxzgkftfiR", "question": "What country is the white airplane most likely from?", "choices": ["usa", "turkey", "france", "germany"], "correct_choice_idx": 1, "direct_answers": ["turkey", "australia", "turkey", "turkey", "turkey", "turkey", "turkey", "turkey", "turkey", "turkey"], "difficult_direct_answer": false, "rationales": ["The name of the airline is turkish airlines.", "The name of the airline is turkish airlines", "The airplane is called turkish airlines."], "image": "train2014/COCO_train2014_000000325775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361055, "question_id": "33iTjiCzhsAsjQtkcDRjjc", "question": "What is the dog about to do?", "choices": ["attacking", "biting", "lying down", "catch food"], "correct_choice_idx": 3, "direct_answers": ["jump", "eat", "jump", "play", "jump", "catch food", "jump", "sit", "jump", "jump"], "difficult_direct_answer": false, "rationales": ["The man is raising his hand towards the dog.", "The man is holding a treat in his hand for the dog.", "The dog wants to catch food."], "image": "val2014/COCO_val2014_000000361055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319354, "question_id": "33v64Y8hC69xvrwwLMNeXg", "question": "What is this model of car called in South Korea?", "choices": ["hyundai verna", "hyundai tucson", "hyundai minho", "hyundai kia"], "correct_choice_idx": 0, "direct_answers": ["hyundai verna", "hyundai", "sedan", "honda", "hyundai", "avant", "hyundai", "hundai", "accent", "hyundai"], "difficult_direct_answer": false, "rationales": ["The car is an accent which is a verna in south korea.", "The model is the hyundai verna.", "The model is shown on the car."], "image": "train2014/COCO_train2014_000000319354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180650, "question_id": "33vb4bXSp6it6BVqY9APAw", "question": "What can the circular object do?", "choices": ["drive", "cut metal", "fly autonomously", "glide"], "correct_choice_idx": 3, "direct_answers": ["flies", "fly", "fly", "fly", "fly", "play", "fly", "fly", "be thrown", "glide"], "difficult_direct_answer": false, "rationales": ["It can fly through the air with no sound.", "The frisbee can fly through the air when thrown.", "It spins as it moves through the air which keeps it level"], "image": "val2014/COCO_val2014_000000180650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271058, "question_id": "33whF8sbHyBnXx2Qxfamxy", "question": "What type of kitchen is he cooking in?", "choices": ["residential", "hospital", "commercial", "food truck"], "correct_choice_idx": 0, "direct_answers": ["small", "small", "home", "home", "residential", "home", "small", "home", "residential kitchen", "apartment"], "difficult_direct_answer": false, "rationales": ["There are no signs that indicate that this kitchen is used to generate a profit.", "This appears to be inside a home.", "He is cooking in a home kitchen"], "image": "train2014/COCO_train2014_000000271058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417857, "question_id": "344TWsCnZdHWSH2a5ub3T7", "question": "What does the large number rhyme with?", "choices": ["flu", "bun", "tea", "poor"], "correct_choice_idx": 0, "direct_answers": ["flu", "who", "do", "shoe", "blue", "you", "flu", "you", "you", "boo"], "difficult_direct_answer": false, "rationales": ["The number 2 rhymes with the word flu.", "Both are one syllable words ending in the \"oo\" sound.", "The number is two and can only rhyme with one answer."], "image": "val2014/COCO_val2014_000000417857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367619, "question_id": "34AniAfmwoyzVqSbNLWbTS", "question": "What is put inside the silver bowl for processing?", "choices": ["cream", "meat", "flour", "nut"], "correct_choice_idx": 2, "direct_answers": ["batter", "cake batter", "cake ingredients", "ingredients", "mix", "batter", "flour", "flour", "icing", "flour"], "difficult_direct_answer": false, "rationales": ["The silver bowler is part of a mixed system. with the cake in full view, it seems rather obvious that flour had to be processed in the silver bowl.", "You put the ingredients in there to mix them up.", "A cake is next to a silver bowl. cake is made with flour."], "image": "val2014/COCO_val2014_000000367619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139637, "question_id": "34DnLbMHB2m8g8xX5cfAzk", "question": "What is prohibited in this area?", "choices": ["swimming", "running", "dog", "hiking"], "correct_choice_idx": 2, "direct_answers": ["cars", "cars", "dog", "cars", "passing", "dog walking", "automobiles", "parking", "cars", "motor vehicles"], "difficult_direct_answer": false, "rationales": ["Some public areas prohibit pets to be in the zone.", "A man is riding on a bike path near the beach. often pets are not allowed on beaches and paths.", "The red, white, and black sign on the far left indicates the animal type that is prohibited in this area."], "image": "val2014/COCO_val2014_000000139637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398878, "question_id": "34H9tPJUEEvkiHxjW3vPCU", "question": "What is the name of the nation with the flag in this picture?", "choices": ["united kingdom", "south korea", "canada", "united states"], "correct_choice_idx": 3, "direct_answers": ["america", "usa", "united states", "united states", "united states", "united states", "america", "united states", "america", "usa"], "difficult_direct_answer": false, "rationales": ["The red blue and white colors with stars and stripes identifies the flag's in this image as american ones.", "The american flag is in this picture.", "This is the flag of the usa"], "image": "train2014/COCO_train2014_000000398878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257117, "question_id": "34Nb98xWQ4Wftkr7Hyvh76", "question": "What is looking at the zebras?", "choices": ["grass", "buffalo", "dirt", "sky"], "correct_choice_idx": 1, "direct_answers": ["photographer", "yaks", "human", "cameraperson", "buffalo", "elephant", "camera", "elephants", "humans", "cameras"], "difficult_direct_answer": true, "rationales": ["Buffalo are standing behind zebra in a field.", "The other options don't have the ability to look at anything.", "The buffalo is looking."], "image": "train2014/COCO_train2014_000000257117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361469, "question_id": "34NsUPxzwYRSDStTvnN52G", "question": "What office reprieve does this man avail himself of?", "choices": ["coffee break", "airplane building", "nap", "filing"], "correct_choice_idx": 0, "direct_answers": ["doughnut", "snack", "snack", "donuts", "coffee", "doughnut", "eating", "donut", "coffee break", "work"], "difficult_direct_answer": false, "rationales": ["The man in the office is holding a snack and taking a coffee break.", "The person is taking a pause from work without sleeping, an activity matching option a.", "The man has a donut in his hand."], "image": "train2014/COCO_train2014_000000361469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107305, "question_id": "34cfJ3axX2cwzCE8cZgdcd", "question": "What is likely the main language spoken here?", "choices": ["swahili", "spanish", "chinese", "french"], "correct_choice_idx": 1, "direct_answers": ["spanish", "portuguese", "spanish", "spanish", "spanish", "spanish", "spanish", "portuguese", "spanish", "spanish"], "difficult_direct_answer": false, "rationales": ["The sign on the store says mercado which is a spanish word for a place to shop.", "The language is spanish.", "The name of a place of business is a spanish word and other spanish words can be seen on other objects."], "image": "train2014/COCO_train2014_000000107305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94055, "question_id": "34cuS7Sv6N5k8U6WGbUc9n", "question": "When first born what were these animals called?", "choices": ["calves", "foals", "puppies", "water dogs"], "correct_choice_idx": 1, "direct_answers": ["foal", "foals", "foals", "foals", "foal", "foal", "foals", "colt", "foals", "pony"], "difficult_direct_answer": false, "rationales": ["This is the name of a baby horse.", "These are horses and this is what they're known as when young.", "The first child of a mare is known as a foal."], "image": "val2014/COCO_val2014_000000094055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13916, "question_id": "34pFTyFGFhT9yJFB5VxfeF", "question": "In what sort of building is this bed sited?", "choices": ["flop house", "garage", "bar", "motel"], "correct_choice_idx": 3, "direct_answers": ["bedroom", "hotel", "hotel", "motel", "hotel", "bedroom", "hotel", "hotel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["The room has furnishings and decor that are found in a cheap motel.", "The type of design and metal door makes it obvious what type of room this is.", "This is a motel room with the sign on the door."], "image": "train2014/COCO_train2014_000000013916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65901, "question_id": "34pmv3z8Yj7dvmPC7prWJ8", "question": "What is attached to the blue strap on the surfers ankle?", "choices": ["seaweed", "surf leash", "netting", "wallet"], "correct_choice_idx": 1, "direct_answers": ["surf leash", "surfboard", "surfboard", "surfboard", "surfboard", "surfboard", "surf board", "surfboard", "surf board", "surfboard"], "difficult_direct_answer": false, "rationales": ["The man is surfing so it would be a surf leash attached to his ankle so he doesn't lose the board.", "While on a surfboard, it is common to be attached to it with a 'leash'. this leash can be seen on the man's wrist.", "The surfer has a leash attached to his ankle so he doesn't lose his board in the water."], "image": "train2014/COCO_train2014_000000065901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359220, "question_id": "34tZVWivakJZjrXEJSqBr7", "question": "What material are the bottles made of?", "choices": ["sheetrock", "styrofoam", "glass", "plastic"], "correct_choice_idx": 3, "direct_answers": ["glass", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "glass", "glass", "plastic"], "difficult_direct_answer": false, "rationales": ["By the color and design of the bottle it's easy to tell what it is made of.", "The bottles are use for soda pop.", "They are soda bottles. most soda bottles are made of a non breakable material."], "image": "val2014/COCO_val2014_000000359220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81811, "question_id": "34wn2zunbWsvsrFvQSTUBM", "question": "How might the real version of the kite animal on top defend itself?", "choices": ["hard shell", "tusks", "stinger", "camouflage"], "correct_choice_idx": 2, "direct_answers": ["its stinger", "stab", "stinger", "tentacles", "barb", "tentacles", "ink", "sting", "tentacles", "with tentacles"], "difficult_direct_answer": false, "rationales": ["There is a big animal.", "The top kite is a stingray which is an animal known for having a sharp spinal blade that can pierce and release venom to protect itself.", "Jellyfish generally sting predators."], "image": "train2014/COCO_train2014_000000081811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262933, "question_id": "35RzYSEbSVgNqQGQGaFsbG", "question": "What are the bears doing in the water?", "choices": ["fighting", "crying", "mating", "eating"], "correct_choice_idx": 0, "direct_answers": ["fighting", "fighting", "playing", "fighting", "playing", "wrestling", "fighting", "fighting", "playing", "playing"], "difficult_direct_answer": false, "rationales": ["They seem to be fighting over something in the water.", "They are fighting in the water.", "The bears are fighting."], "image": "val2014/COCO_val2014_000000262933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406244, "question_id": "35SWU5gThVQDSAJN4W4WPH", "question": "What wheeled object is the man riding on to perform the stunt?", "choices": ["motorcycle", "skateboard", "rollerblades", "bike"], "correct_choice_idx": 1, "direct_answers": ["skateboard", "skateboard", "skateboard", "skate board", "skate board", "skateboard", "skateboard", "skate board", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["A guy is on a board with wheels on the bottom in the front and back and he is riding a rail as skateboarders often do.", "The man is riding on a skateboard.", "He is using a board to do this stunt."], "image": "train2014/COCO_train2014_000000406244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115502, "question_id": "35iMYuKQy4DXymamhbGyps", "question": "In which neighborhood does this bus drive?", "choices": ["ghetto", "suburbs", "china town", "downtown"], "correct_choice_idx": 2, "direct_answers": ["city", "shopping area", "china town", "chinatown", "chinatown", "city", "chinese", "montague", "paris", "new york"], "difficult_direct_answer": false, "rationales": ["The signs are in chinese.", "Based on the characters on the building, this bus is in an asian part of a city.", "If you look at the words on all of the buildings, they are in chinese."], "image": "train2014/COCO_train2014_000000115502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191613, "question_id": "3683ebKSm8f6cjxyBVAMRN", "question": "What is the purpose of the wires above the vehicle?", "choices": ["for climbing", "for swinging", "power source", "for decoration"], "correct_choice_idx": 2, "direct_answers": ["accelerate faster", "power", "electricity", "provide electricity", "electricity", "locommotive", "power source", "electricity", "moves vehicle", "power"], "difficult_direct_answer": false, "rationales": ["The wires in this image are power lines. these wires are used as a power source.", "The ropes above the vehicle is a power distribution line.", "They provide electricity."], "image": "val2014/COCO_val2014_000000191613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68752, "question_id": "36GTiLVAb58jmrfUh8N74M", "question": "What color is in the middle of the kite?", "choices": ["red", "blue", "black", "green"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["There is a red stripe in the middle of the kite.", "A kite has a rainbow pattern with yellows, oranges, and red.", "The kite has numerous colors and red is in the middle of the others."], "image": "train2014/COCO_train2014_000000068752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215549, "question_id": "36KgoVo7KcEFHiW8gnKGDR", "question": "This elevated train is part of the public transportation system of which large US city?", "choices": ["chicago", "philadelphia", "new york", "boston"], "correct_choice_idx": 2, "direct_answers": ["chicago", "ny", "yes", "ny", "chicago", "chicago", "new york", "chicago", "chicago", "chicago"], "difficult_direct_answer": false, "rationales": ["These kind of trains can be found in new york.", "The train is in new york.", "The new york city skyline can be seen behind a train."], "image": "val2014/COCO_val2014_000000215549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46537, "question_id": "36W5y3H7ksVib2sh2ij2HW", "question": "The person on the motorcycle escorting the double decker bus is what type of public servant?", "choices": ["fireman", "soldier", "policeman", "inspector"], "correct_choice_idx": 2, "direct_answers": ["police", "police", "officer", "comfortable", "police", "policeman", "police", "police", "police officer", "police man"], "difficult_direct_answer": false, "rationales": ["The person is a policeman since he's wearing an officer uniform.", "The colors on the bike are black and white.", "A man with a badge drives around on a bike and helps keep law and order."], "image": "train2014/COCO_train2014_000000046537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104400, "question_id": "36XKFSRexU37GWH5qVpCmn", "question": "What sort of trees are visible here?", "choices": ["oak", "evergreen", "spring blooming", "deciduous"], "correct_choice_idx": 1, "direct_answers": ["pine", "cypress", "pine", "fir trees", "evergreen", "pine", "pine", "fir", "evergreen", "alpine"], "difficult_direct_answer": false, "rationales": ["Green trees are on a snowy mountain. evergreens are green in the cold.", "The trees on the ski slope are evergreen trees that live all winter long.", "The color of the tree provides the answer."], "image": "train2014/COCO_train2014_000000104400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8284, "question_id": "36pkpkmU7DrybBFpqMVdCD", "question": "What is the subject of the speech being given?", "choices": ["candy", "cell phones", "animal husbandry", "womens rights"], "correct_choice_idx": 1, "direct_answers": ["cell phones", "phones", "cell phones", "cellphones", "communication", "phones", "cellphones", "cell phones", "cellphones", "tech"], "difficult_direct_answer": false, "rationales": ["The subject is cell phones.", "There is a large picture of cell phones in the background so it is likely that is what the speaker is talking about.", "There are cell phones visible on the backdrop behind the speaker. speakers usually use visual aids that match what they are talking about."], "image": "train2014/COCO_train2014_000000008284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22271, "question_id": "36zFAXHnDCERMr2MHkeC9M", "question": "What can be bought from the silver machine on the right hand side?", "choices": ["soda", "gum", "bread", "newspapers"], "correct_choice_idx": 3, "direct_answers": ["newspapers", "newspapers", "news paper", "newspaper", "newspaper", "newspaper", "newspaper", "paper", "newspaper", "newspaper"], "difficult_direct_answer": false, "rationales": ["Newspapers could be bought from the silver machine on the right hand side.", "The silver machine holds newspapers which you can purchase.", "There's magazine like items with headlines and pictures."], "image": "val2014/COCO_val2014_000000022271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502914, "question_id": "37iYoCuHJeJsVNpsAxKEod", "question": "What activity is carried out by the person?", "choices": ["designing toys", "playing", "manufacturing toys", "asembling toys"], "correct_choice_idx": 3, "direct_answers": ["crafting", "reading", "building robots", "asembling toys", "model building", "building model", "building", "model building", "build model", "model assembly"], "difficult_direct_answer": true, "rationales": ["There are toy models in the ladies hands and table, so it's obvious what she is doing.", "She is gluing parts together", "The woman is putting together a little toy robot."], "image": "train2014/COCO_train2014_000000502914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411291, "question_id": "37pgyrRm3x8H9dfk9K6mYL", "question": "Why do some elephants have trunks in the water?", "choices": ["to drink", "to play", "to eat", "to sit"], "correct_choice_idx": 0, "direct_answers": ["drinking", "cooling", "drinking", "drinking", "to drink", "drinking", "drinking", "to drink", "drinking", "drink clean"], "difficult_direct_answer": false, "rationales": ["They are drinking the water", "The elephants want to quench their thirst.", "Some of the elephants have their trunks in the water so they can get a drink if they are thirsty."], "image": "train2014/COCO_train2014_000000411291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138022, "question_id": "37yEgPWZu6DFxq9rHMBStL", "question": "What activity does the van driver want to do?", "choices": ["ski", "surf", "gaming", "suntanning"], "correct_choice_idx": 1, "direct_answers": ["surf", "surf", "surf", "surf", "park", "surf", "surfing", "surf", "surf", "surf"], "difficult_direct_answer": false, "rationales": ["The look of the seen the driver want to surf.", "There is a surboard attached to the side of the van.", "There is an old red van that has a board on top of it."], "image": "val2014/COCO_val2014_000000138022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207094, "question_id": "385qrdi4qQNLyMbkSZ7UaX", "question": "What is the boy about to do?", "choices": ["spit", "throw up", "blow candle", "smell"], "correct_choice_idx": 2, "direct_answers": ["extinguish candles", "blow candles", "blow", "blow candles", "blow candles", "blow candle", "blow candles", "blow candle", "blow", "blow candles"], "difficult_direct_answer": false, "rationales": ["The boy will blow the candles.", "Given the candles in this image set in front of a boy with pursed lips we can assume he is blowing out the fire on his candles; probably in celebration of something.", "This little boy is having a low-budget birthday bash with doughnuts instead of cake, but there are nevertheless candles in the doughnuts and he is going to blow them all out!!."], "image": "train2014/COCO_train2014_000000207094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464719, "question_id": "386eHGf5XcCQUf49uPqWfd", "question": "What are the people probably laying on?", "choices": ["surf boards", "floaties", "raft", "skateboard"], "correct_choice_idx": 0, "direct_answers": ["boogie board", "boogie board", "surfboard", "surfboard", "surfboard", "boogie board", "surfboard", "surfboard", "surfboard", "surf boards"], "difficult_direct_answer": false, "rationales": ["They are in a body of water wearing surfing attire. they are laying on objects that are long enough that the objects can hold their bodies up from their feet to their chests.", "People are often on this device in the water. it is used in large bodies of water that have waves.", "They have a colorful wood piece visible in front of them and are on the water"], "image": "val2014/COCO_val2014_000000464719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55375, "question_id": "388HEnTRUYXpKYiDT6KQub", "question": "Why is the baby wet?", "choices": ["in rain", "being bathed", "is resting", "got sweaty"], "correct_choice_idx": 1, "direct_answers": ["bathing", "bath", "being bathed", "bath water", "bath time", "bathed", "bathing", "bathe", "bath time", "bath"], "difficult_direct_answer": false, "rationales": ["The baby is being washed in a special tub that safe for use by infants.", "The baby is in a baby bath with baby shampoo near it.", "The baby is being washed."], "image": "val2014/COCO_val2014_000000055375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115060, "question_id": "38VofNSbjvwiaTRN6sxZ4X", "question": "What is located behind the clock?", "choices": ["large room", "mirror", "open museum", "nothing"], "correct_choice_idx": 1, "direct_answers": ["people", "mirror", "mirror", "wall", "mirror", "mirror", "mirror", "mirror", "people", "people"], "difficult_direct_answer": false, "rationales": ["People's shadows are being reflected in the mirror.", "There is a reflective surface behind the clock.", "The mirror is behind."], "image": "val2014/COCO_val2014_000000115060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569801, "question_id": "38Yce7KdAq9eLRwFivC6Ha", "question": "What could you do with the metallic item that has 3 varied sized circles atop it?", "choices": ["microwave", "freeze", "hear music", "cook"], "correct_choice_idx": 3, "direct_answers": ["open it", "cook", "open", "cook", "cook", "cook", "cook", "cook", "open", "cook"], "difficult_direct_answer": false, "rationales": ["There is a can of food on the counter.", "The metallic item is a stove top which is used for cooking.", "It is for cooking."], "image": "val2014/COCO_val2014_000000569801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358176, "question_id": "38gZxsZeMJNFRRj9MAvTHL", "question": "What did the man bending over do with the ball?", "choices": ["throw it", "polish it", "catch it", "sell it"], "correct_choice_idx": 0, "direct_answers": ["threw it", "pitch", "throw", "throw", "pitch", "pitch", "pitch", "pitch it", "throw it", "pitching"], "difficult_direct_answer": false, "rationales": ["The man is throwing the ball.", "The man just let go of the ball as he is pitching it.", "The player has his hand in a throwing motion with the ball going away from him."], "image": "train2014/COCO_train2014_000000358176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522827, "question_id": "39GH5AU7AiVwJY4v3evXAH", "question": "What are the rectangular green structures on the left used as?", "choices": ["bathrooms", "changing rooms", "kitchens", "showers"], "correct_choice_idx": 0, "direct_answers": ["bathrooms", "port-a-potties", "toilets", "toilets", "bathrooms", "porta potty", "plates", "bathrooms", "bathrooms", "porta potties"], "difficult_direct_answer": false, "rationales": ["The structures are known as porta-potties or portable bathrooms.", "They are portable washrooms which are used to events without built in facilities.", "The structures are portable enclosures containing a toilet often found on construction sites or at popular outdoor events."], "image": "train2014/COCO_train2014_000000522827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36522, "question_id": "39Q5fstEkFrtSmPkhjkDHx", "question": "At least how many different ways are there to identify which bus this is?", "choices": ["two", "ten", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["number name", "four", "two ways", "two", "three", "two", "three", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The bus only goes two ways.", "It is written on the front and the side", "There is a name on the side and a number above the door."], "image": "val2014/COCO_val2014_000000036522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347292, "question_id": "39XdnBGQvRvStbRSokCyP3", "question": "What is his hat made from?", "choices": ["leather", "straw", "cotton", "felt"], "correct_choice_idx": 1, "direct_answers": ["straw", "straw", "straw", "straw", "straw", "straw", "straw", "straw", "straw", "straw"], "difficult_direct_answer": false, "rationales": ["It is lightly colored and woven", "The hat is made of straw since it's a golden yellow color.", "His hat is made from straw and called a straw hat."], "image": "train2014/COCO_train2014_000000347292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385378, "question_id": "39eDV8op57ocaB7u39jVSJ", "question": "What kind of activity is held nearby?", "choices": ["canoeing", "car racing", "fishing", "mountain climbing"], "correct_choice_idx": 0, "direct_answers": ["kayaking", "boating", "kayaking", "kayaking", "canoeing", "kayaking", "boating", "kayaking", "canoeing", "boating"], "difficult_direct_answer": false, "rationales": ["Multiple cars have boats on them.", "There are canoe boats secured on the cars.", "Several cars have boats on their roofs which means that the owners of the cars will participate in this activity."], "image": "val2014/COCO_val2014_000000385378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96785, "question_id": "39hgYLcgdLvGBopydhMACo", "question": "How is the woman protecting her hairdo?", "choices": ["scarf", "helmet", "hat", "hairnet"], "correct_choice_idx": 2, "direct_answers": ["umbrella scarf", "use umbrella", "scarf", "with scarf", "hat", "with umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["There is a woman with her hair wrapped up in babushka fashion looking through a shop window. the headwear worn by babushkas is more commonly known as 'scarf' in the west.", "The woman is using her hat.", "Putting this piece of cloth over ones hair keep it from getting windblown."], "image": "val2014/COCO_val2014_000000096785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252783, "question_id": "39viJBNH5xm4h4nbde4h7x", "question": "The green item was probably obtained from where?", "choices": ["mattress firm", "amusement park", "toy store", "law office"], "correct_choice_idx": 1, "direct_answers": ["amusement park", "bar", "front desk", "hospital", "bar", "funpark", "party", "venue", "hospital", "venue"], "difficult_direct_answer": false, "rationales": ["The green item is a band that shows that the person paid to enter a restricted area. mattress firms, toy stores, and law offices do not have admission fees.", "Wristbands are given at amusement parks for admission.", "It's a bracelet to show they paid admission to get in."], "image": "val2014/COCO_val2014_000000252783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120790, "question_id": "3A8PDnPFebQd9udNjx6w3i", "question": "Where is the nearest place for persons to await this bus?", "choices": ["behind it", "in front", "1 block", "unknown"], "correct_choice_idx": 0, "direct_answers": ["stop", "behind it", "bus stop", "top left", "stockton", "bus stop", "bus stop", "left", "bust stop", "far left"], "difficult_direct_answer": false, "rationales": ["There is a bus stop at the rear of the bus. it is a small, enclosed space with a few chairs to keep people out of the elements.", "There is a bus stop located near where the back of the bus is passing.", "There is a covered shelter with seats next to the road."], "image": "train2014/COCO_train2014_000000120790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26992, "question_id": "3AESHmHLNf2dMZ3UEnNfob", "question": "How do the people know each other?", "choices": ["coworkers", "teammates", "classmates", "family"], "correct_choice_idx": 3, "direct_answers": ["friends", "family", "family", "family", "family", "relatives", "family", "family", "family", "friends"], "difficult_direct_answer": false, "rationales": ["They are different ages (some are very young) and are sharing a meal.", "The people are family.", "These people look like family since a baby is present."], "image": "val2014/COCO_val2014_000000026992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517399, "question_id": "3AMfFhyBnvbrjfBHuXd9hV", "question": "Where does the baby most likely go to sleep?", "choices": ["crib", "table", "pull-out bed", "sofa"], "correct_choice_idx": 0, "direct_answers": ["crib", "crib", "crib", "crib", "crib", "crib", "crib", "crib", "crib", "crib"], "difficult_direct_answer": false, "rationales": ["The baby will sleep in the baby bed.", "A woman is holding a baby in a room that has a crib along one wall.", "A table would not be a suitable place for anyone to sleep. the baby is too young to use the sofa or pull-out bed."], "image": "val2014/COCO_val2014_000000517399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322143, "question_id": "3ATHf8RpHAqJM2a7QmLost", "question": "What kind of weather is this area in danger of?", "choices": ["snow", "thunderstorms", "extreme heat", "extreme cold"], "correct_choice_idx": 1, "direct_answers": ["storm", "rainy weather", "rain", "thunderstorms", "thunderstorm", "storm", "storm", "thunderstorm", "storm", "rain"], "difficult_direct_answer": false, "rationales": ["The sky is dark, and the clouds are low and heavy signifying that it will soon rain.", "The dark clouds were now above the city.", "The densely overcast grey appearance of the sky in this picture suggests impending precipitation and perhaps thunder and lightning too."], "image": "val2014/COCO_val2014_000000322143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114024, "question_id": "3ATpEWtb2zzYowgZqT5hAQ", "question": "Which vehicle has violated the laws?", "choices": ["red car", "black car", "white car", "grey bus"], "correct_choice_idx": 0, "direct_answers": ["red car", "bus", "bus", "bus", "bus", "car", "bus", "car", "bus", "car"], "difficult_direct_answer": false, "rationales": ["It's in the bus zone.", "There is nothing clear to assume any answer, but frequently the place where buses park is restricted to other vehicles so a car parked in the manner of answer a might be against the rules.", "The red car is parked on the side of the road where the bus should be."], "image": "train2014/COCO_train2014_000000114024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481736, "question_id": "3B572r8XjMWuAMafuET6WQ", "question": "What sport is enjoyed by the person in black shorts?", "choices": ["chess", "surfing", "skiing", "drone flying"], "correct_choice_idx": 1, "direct_answers": ["surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The sport is surfing.", "The man is wearing a wet suit and is carrying the board that is used by participants in this water sport.", "He is carrying a board that allows him to ride waves."], "image": "train2014/COCO_train2014_000000481736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253395, "question_id": "3BHz3PPXScW2xBDdYjRcGw", "question": "What were the potatoes seen here cooked in?", "choices": ["water", "milk", "vinegar", "oil"], "correct_choice_idx": 3, "direct_answers": ["oil", "fryer", "oil", "oven", "fryer", "oven", "fryer", "oil", "oil", "oil"], "difficult_direct_answer": false, "rationales": ["These potatoes were fried", "French fries are made crispy by frying them in oil.", "The potatoes are french fries and have been deep fried."], "image": "train2014/COCO_train2014_000000253395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117941, "question_id": "3BQdPBrrUKQmU4ffeyHDFn", "question": "What is the man wearing sunglasses?", "choices": ["dancing", "playing frisbee", "playing ball", "squatting"], "correct_choice_idx": 1, "direct_answers": ["playing frisbee", "protect eyes", "beachgoers", "sun protection", "eye protection", "human male", "man", "its sunny", "throwing frisbee", "protecting purpose"], "difficult_direct_answer": true, "rationales": ["The man is playing frisbee on the beach and wearing sunglasses to keep the sun out of his eyes while playing.", "So he can see to play and keep the sun out of his eyes.", "The man is wearing sunglasses to keep the sun out of his eyes while he plays frisbee."], "image": "train2014/COCO_train2014_000000117941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73302, "question_id": "3C83GsKzDmg3UzWTrpoy5q", "question": "In which liquid were most of the shown treats boiled?", "choices": ["dishwater", "oil", "petrol", "water"], "correct_choice_idx": 1, "direct_answers": ["oil", "oil", "oil", "oil", "oil", "oil", "oil", "oil", "oil", "grease"], "difficult_direct_answer": false, "rationales": ["A display case is filled with donuts.", "The treats appear to be pastries that were fried. when frying something, one would most commonly use oil.", "The treats were fried."], "image": "train2014/COCO_train2014_000000073302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375806, "question_id": "3CArjnWnDJeqb8oRc25DVL", "question": "What is the building for?", "choices": ["library", "church", "lifeguard", "hospital"], "correct_choice_idx": 2, "direct_answers": ["lifeguard", "lifeguard", "lifeguard", "lifeguards", "lifeguard", "beach", "lifeguard", "lifeguard", "lifeguard", "lifeguard"], "difficult_direct_answer": false, "rationales": ["The building gives a good vantage of the beach, while protecting the person from the elements.", "The light blue building is where the lifeguard would be to watch out for people that need their help", "To keep people safe while swimming, the trained professionals need to have a shelter on the beach, usually to store equipment and be raised higher up for a better view."], "image": "train2014/COCO_train2014_000000375806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270799, "question_id": "3CUuBTCWJ3kcGhYRM2oUxu", "question": "Why is the truck backed up to the building?", "choices": ["loading", "robbery", "blocking door", "sales"], "correct_choice_idx": 0, "direct_answers": ["loading", "loading", "loading", "to unload", "loading cargo", "loading", "drop off", "unloading", "delivery", "delivering"], "difficult_direct_answer": false, "rationales": ["The trunk is put close the the entrance and there is an object on the trunk bed of the truck. a person seems to have moved something onto it.", "The truck is there to be loaded in order to move stuff.", "The truck is being filled with items."], "image": "train2014/COCO_train2014_000000270799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261902, "question_id": "3Ca2foznGoaSiT9AR6dqsU", "question": "Why is the dog on his leg?", "choices": ["lives there", "fell there", "is stuck", "catching frisbee"], "correct_choice_idx": 3, "direct_answers": ["catching frisbee", "frisbee", "catch frisbee", "it jumped", "jumping", "playing catch", "jumping", "catching frisbee", "catching frisbee", "launching"], "difficult_direct_answer": false, "rationales": ["The dog is leaping towards a frisbee.", "The dog is using his leg to propel upward.", "The dog is catching the frisbee."], "image": "train2014/COCO_train2014_000000261902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492129, "question_id": "3CbudarGrGjg79P6JnroTr", "question": "Which person is he videotaping?", "choices": ["black top", "himself", "black pants", "blue jeans"], "correct_choice_idx": 2, "direct_answers": ["skater", "friend", "seating mean", "skateboarder", "his friend", "person behind", "skateboarder", "working person", "black pants", "skate boarder"], "difficult_direct_answer": true, "rationales": ["The man is videotaping the person wearing the black pants.", "The man in the gray sweatshirt is filming the man in black pants as he skateboards.", "The person with the camera has it pointed at himself while he skates."], "image": "train2014/COCO_train2014_000000492129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172924, "question_id": "3CkaWkrznc53PXrSxYWgSN", "question": "In which continent is the train?", "choices": ["north america", "europe", "africa", "asia"], "correct_choice_idx": 3, "direct_answers": ["asia", "asia", "asia", "asia", "asia", "china", "asia", "japan", "asia", "japan"], "difficult_direct_answer": false, "rationales": ["By the writing above the one building it looks to be an asian culture.", "Asian writing is visible in the image indicating that the continent is asia.", "The writing is in japanese."], "image": "val2014/COCO_val2014_000000172924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326161, "question_id": "3DQAaaPBj9iiTpgTXwxQWB", "question": "What are the lamps trying to help the people do?", "choices": ["sleep", "see", "smell", "hear"], "correct_choice_idx": 1, "direct_answers": ["see", "see", "see", "see", "see", "see", "see", "see", "ski", "ski"], "difficult_direct_answer": false, "rationales": ["Lamps are used to light the darkness and allow for people enjoying the snow to see at night.", "They are trying to help the people see because it is night time so they need them.", "The lights illuminate the dark slopes so they can safely ski."], "image": "train2014/COCO_train2014_000000326161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342051, "question_id": "3DZNUfCf3xxgJpM68v8FVH", "question": "What is the country first had double decker busses?", "choices": ["japan", "england", "america", "germany"], "correct_choice_idx": 1, "direct_answers": ["england", "france", "hong king", "england", "england", "england", "hong kong", "uk", "hong kong", "england"], "difficult_direct_answer": false, "rationales": ["The first double decker buses were made in england and are very popular there.", "The country is england.", "England had double decker buses before other countries."], "image": "val2014/COCO_val2014_000000342051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350425, "question_id": "3DhwRssmLMA4VmhA5iCfUv", "question": "What is taller than the giraffe here?", "choices": ["ladder", "tree", "statue", "skyscraper"], "correct_choice_idx": 1, "direct_answers": ["tree", "tree", "trees", "trees", "trees", "tree", "tree", "trees", "trees", "tree"], "difficult_direct_answer": false, "rationales": ["A tree is the only thing pictured that stands higher than the giraffe.", "There are trees around the giraffe that are taller.", "This plant is the only kind that usually grows to be taller than giraffes if it isn't cut down."], "image": "train2014/COCO_train2014_000000350425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247071, "question_id": "3DwvZUDny7mDhL7LoE24rs", "question": "What is the cow doing?", "choices": ["drinking water", "finding friends", "resting", "finding food"], "correct_choice_idx": 3, "direct_answers": ["sniffing", "eating", "smelling", "eating", "eating fruits", "finding food", "scavenging", "standing", "eating", "exploring"], "difficult_direct_answer": false, "rationales": ["It has its nose down sniffing things on the ground", "The cow is sniffing an orange on the ground.", "The cow is standing on the side of the busy road looking for food that has fallen on the ground."], "image": "train2014/COCO_train2014_000000247071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300326, "question_id": "3E5UZ3jGCbdUVwTYViTGuV", "question": "Why is the man on the left jumping in the air?", "choices": ["to catch", "to flip", "to grind", "to ollie"], "correct_choice_idx": 0, "direct_answers": ["catching frisbee", "catch frisbee", "catch frisbee", "catching frisbee", "to catch", "catch frisbee", "frisbee", "catch", "catch frisbee", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["The man wants to get a hold of the frisbee.", "A man is jumping and reaching up for a frisbee.", "The frisbee was too high in the air for him to easily get it."], "image": "train2014/COCO_train2014_000000300326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189855, "question_id": "3Efygs9jgKpde6u8tEf388", "question": "What injury could she sustain if she touched the top of the candles?", "choices": ["nothing", "electrocution", "cut", "burn"], "correct_choice_idx": 3, "direct_answers": ["burn", "burn", "burn", "burn", "burn", "burn", "burn", "burns", "burns", "burn"], "difficult_direct_answer": false, "rationales": ["Fire is bad and it's hot when touching people's skin.", "The candles are lit so the hot flame will damage her skin if she gets too close.", "The girl could sear her skin."], "image": "val2014/COCO_val2014_000000189855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317851, "question_id": "3EmgSq25kZMRhv3XAzeAQT", "question": "Why is the hood of the car open?", "choices": ["for repairs", "capture rain", "trap birds", "loading luggage"], "correct_choice_idx": 0, "direct_answers": ["automotive repair", "getting fixed", "repair", "for repairs", "for repair", "engine work", "repair", "fixing", "broken engine", "engine repair"], "difficult_direct_answer": true, "rationales": ["When a car needs to be fixed you have to go under the hood.", "The picture shows a car part hanging from it. most times you will only open the hood to make repairs or add fluids.", "The hood goes up to see the inner mechanics of the car."], "image": "train2014/COCO_train2014_000000317851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90888, "question_id": "3EzmFKFCAUrym8qEoaEnx2", "question": "What is on the right?", "choices": ["bike", "couch", "blocks", "tv"], "correct_choice_idx": 2, "direct_answers": ["no image", "remote", "remote controls", "toys", "tv remotes", "giant legos", "giant legos", "carpet", "remote", "blocks"], "difficult_direct_answer": false, "rationales": ["Big toys are on the side.", "The right has blocks.", "The couch is to the left, and the tv is near the couch. there is no bike."], "image": "train2014/COCO_train2014_000000090888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406734, "question_id": "3FEjzyzb9d4Dho7uQwzghn", "question": "What event will the people participate in?", "choices": ["motorcycle parade", "marathon", "protest", "touring"], "correct_choice_idx": 0, "direct_answers": ["biking", "motorcycle parade", "bike rally", "bike rally", "bike ride", "biking", "sturgis", "motor crossing", "rally", "bike rally"], "difficult_direct_answer": false, "rationales": ["There are tons of motorcycles in the street.", "There are several motorcycles in a row.", "An event for people that ride motorcycles like a parade."], "image": "val2014/COCO_val2014_000000406734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371654, "question_id": "3FPyNprs6xjnzjzfweNtFg", "question": "Which material has been used to build the pots hanged on the wall?", "choices": ["aluminum", "copper", "silver", "iron"], "correct_choice_idx": 1, "direct_answers": ["wood", "wood", "copper", "wood", "copper", "copper", "copper", "copper", "copper", "wood"], "difficult_direct_answer": false, "rationales": ["The material is copper.", "The color of them gives it away.", "The pots on the wall are made out of a red, not silver or grey, metal."], "image": "train2014/COCO_train2014_000000371654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358302, "question_id": "3FZ8hC9pz4WwQPcmbwrnw4", "question": "What is in the plastic bag?", "choices": ["groceries", "recycling", "cleaning supplies", "dirty clothes"], "correct_choice_idx": 2, "direct_answers": ["groceries", "cleaning supplies", "trash", "trash", "waste", "trash", "waste", "trash garbage", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["It is full of empty and rinsed food containers.", "She has the items that need to be recycled ready to go out to the trash.", "A kitchen sized trash bag is on the floor near a door and appears ready to be taken out."], "image": "val2014/COCO_val2014_000000358302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323934, "question_id": "3GDEXUhpuTWdVPADt5N8AQ", "question": "How many bananas is the store offering for nineteen cents?", "choices": ["four", "three", "one", "two"], "correct_choice_idx": 2, "direct_answers": ["five", "five", "five", "one", "many", "one", "one", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["Since the price of the bananas is advertised as \"19 cents each\", then that means 19 cents for one.", "The sign says they are 19 cents each so only one is 19 cents.", "One banana is 19 cents."], "image": "train2014/COCO_train2014_000000323934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402960, "question_id": "3GquYdHagPxwYjok34bJha", "question": "How far away is the item being photographed?", "choices": ["2 feet", "10 feet", "behind photographer", "very far"], "correct_choice_idx": 3, "direct_answers": ["quite far", "far", "very far", "meters", "distant", "one mile", "very far", "very far", "far away", "near"], "difficult_direct_answer": false, "rationales": ["The man is using a long lens to photograph something in the distance.", "The cameraman has a zoom lens which means the item is not near to him.", "He is shooting something farther away with that long lens"], "image": "train2014/COCO_train2014_000000402960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135796, "question_id": "3HCRGjXgXi67xJhUTiTMTd", "question": "What is the girl doing with the silver device?", "choices": ["cutting", "scooping", "mixing", "eating"], "correct_choice_idx": 2, "direct_answers": ["mixing", "child", "mixing", "two", "mixing", "stirring", "mixing", "stirring", "mixing", "mixing"], "difficult_direct_answer": false, "rationales": ["This has beaters on it that move food around", "The girl is holding a mixer.", "She is using a tool for cooking. the food is still in a raw form."], "image": "train2014/COCO_train2014_000000135796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284143, "question_id": "3HWViYgG9rzLfYggQrU3AP", "question": "At which stage of the game are these players?", "choices": ["end", "starting", "before starting", "first set"], "correct_choice_idx": 0, "direct_answers": ["end", "end match", "end", "end", "final stage", "end", "finished", "end", "end", "end"], "difficult_direct_answer": false, "rationales": ["Both people are sweating and congratulating eachother.", "The players are dripping in sweat.", "The stage is at the end."], "image": "val2014/COCO_val2014_000000284143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484614, "question_id": "3HbdCttfXek9HKVGGoNuA2", "question": "What is the man's hobby?", "choices": ["painting", "knitting", "surfing", "sculpting"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "surfing", "surf", "surfing", "surfing", "sea scaring", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The man is holding a surfboard and is in the water.", "A man is holding a surfboard.", "The hobby is surfing."], "image": "val2014/COCO_val2014_000000484614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154346, "question_id": "3HoYpS62VGywTXNeV32JVn", "question": "What act of nature could potentially physically impede progress on the road?", "choices": ["pandemic", "flood", "landslide", "lightning"], "correct_choice_idx": 2, "direct_answers": ["avalanche", "rocks", "avalanche", "rockfall", "rockslide", "landslide", "landslide", "avalanche", "avalanche", "rockslide"], "difficult_direct_answer": false, "rationales": ["Rock and landslides often occur down mountains.", "An avalanche would cause the hillside to collapse and tumble down blocking the road.", "Sometimes mountains have landslides."], "image": "train2014/COCO_train2014_000000154346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195079, "question_id": "3HsiUQTfxr4k6FZGud8r5p", "question": "What type window is the person who is photographing this luggage looking here?", "choices": ["side", "windshield", "rear", "front"], "correct_choice_idx": 2, "direct_answers": ["windshield", "car", "car", "car", "vehicle", "back window", "rear", "car window", "back", "windshield"], "difficult_direct_answer": false, "rationales": ["I suppose. the glass is too dark.", "They look to be in a truck and taking a pick out the back.", "This appears to be a trailer attached to the rear of a car so the person photographing would have to be looking through the rear window of the vehicle in order to have this perspective."], "image": "val2014/COCO_val2014_000000195079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300368, "question_id": "3JJQ8FoLzBm9SfSmYNg2fG", "question": "Who is the cake for?", "choices": ["birthday boy", "married couple", "victorious team", "retiring boss"], "correct_choice_idx": 1, "direct_answers": ["couple", "celebration", "couple", "bride", "bride", "wedding guests", "girl", "married couple", "married couple", "newly weds"], "difficult_direct_answer": false, "rationales": ["She is wearing a white dress and he is wearing a suit. they are cutting the cake together. cutting the cake is a wedding tradition.", "Two people are cutting the cake. one is a man in a black suit, and the other is a woman in a white dress.", "The cake appears to be a layer cake, the type served at weddings, and the two people standing in front of it are dressed for a wedding and are cutting the cake, which is part of the ceremony."], "image": "val2014/COCO_val2014_000000300368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511463, "question_id": "3JLGeN4XcK9DCne43xnbzA", "question": "What type of terrain is available here?", "choices": ["gravel", "path", "road", "sidewalk"], "correct_choice_idx": 1, "direct_answers": ["forest", "forrest", "mountain", "rocky", "dirt", "dirt", "dirt", "forested", "rocky", "path"], "difficult_direct_answer": false, "rationales": ["The terrain is not paved and is not gravel.", "There is a path available in the forest.", "A woman is walking in the forest on a dirt walkway."], "image": "val2014/COCO_val2014_000000511463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131784, "question_id": "3JLbj82jZwmseKrBG3f6zt", "question": "How many biplanes are there?", "choices": ["four", "one", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["three", "two", "three", "three", "one", "one", "two", "three", "zero", "two"], "difficult_direct_answer": false, "rationales": ["Three planes are depicted, one gray and two gold.", "A biplane has a propeller while the others are jet fueled.", "Three planes are flying near each other."], "image": "train2014/COCO_train2014_000000131784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128110, "question_id": "3JR9AFzd8RZi9HSUpqECGb", "question": "What could happen weather wise in this area?", "choices": ["rain", "floods", "hail", "snow"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "flooding", "flooding", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The sky looks overcast. people are wearing short sleeves.", "Rain is the most likely weather event because the sky is cloudy and rather dark.", "The sky is filled with clouds."], "image": "val2014/COCO_val2014_000000128110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412764, "question_id": "3JVPyFDqhfQcfcdqeuSWd4", "question": "What species is competing here?", "choices": ["feline", "canine", "bovine", "ovine"], "correct_choice_idx": 1, "direct_answers": ["cats", "dogs", "canine", "animals", "canine", "canine", "dogs", "dog", "dogs", "dog"], "difficult_direct_answer": false, "rationales": ["The canine as the dog is seen in the image.", "The dog is visible catching the frisbee on the field.", "This is a competition for dogs."], "image": "val2014/COCO_val2014_000000412764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396495, "question_id": "3JaSTocvyoMnrcAAj7puN9", "question": "What is the term for the way the player has her body positioned?", "choices": ["crouched", "stretching", "crossed legs", "kneeling"], "correct_choice_idx": 0, "direct_answers": ["crouched", "power position", "stance", "service reception", "ready stance", "stance", "forward", "crouched", "crouched", "crouch"], "difficult_direct_answer": false, "rationales": ["Her legs are partially bent and she is slightly hunched over.", "Both of the woman's knees are bent and her body is lowered.", "The woman is standing facing forward with both knees slightly bent."], "image": "train2014/COCO_train2014_000000396495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576386, "question_id": "3K2tqFghroLqCUcU4t9zyz", "question": "What style of riding is the woman doing?", "choices": ["straddle", "lady-like", "side-saddle", "cowboy"], "correct_choice_idx": 2, "direct_answers": ["side-saddle", "dressage", "sidesaddle", "english", "side saddle", "equestrian", "fancy", "sideback", "horseback", "show"], "difficult_direct_answer": true, "rationales": ["A woman is on a horse with both legs on the same side.", "A woman is sitting on a horse with both legs on one side.", "Her right leg is on the same side as her left leg, which isn't the usual way to ride a horse."], "image": "train2014/COCO_train2014_000000576386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57224, "question_id": "3KAWvu43LmRWsGGbGr9fR7", "question": "Where might you see these people compete in this sport?", "choices": ["summer olympics", "super bowl", "winter olympics", "world series"], "correct_choice_idx": 2, "direct_answers": ["switzerland", "winter olympics", "olympics", "snow areas", "mountain", "winter olympics", "norway", "olympics", "winter olympics", "snow mountain"], "difficult_direct_answer": false, "rationales": ["The winter olympics has skiing events like the one shown in the picture.", "They would compete in the winter olympics.", "The people in the snow are skiing which is a sport that takes place at the winter olympics."], "image": "val2014/COCO_val2014_000000057224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148415, "question_id": "3KBwRzTVjCWraMbbPHJZBM", "question": "The white stuffed toy is made of what material?", "choices": ["denim", "wool", "nylon", "synthetic fabric"], "correct_choice_idx": 3, "direct_answers": ["cotton", "feathers", "cotton", "fleece", "cotton", "red scarf", "plush", "faux fur", "cotton", "synthetic fabric"], "difficult_direct_answer": false, "rationales": ["The toy is made of fake fabric.", "The fabric seems to be somewhat fake.", "Non-organic fibers are used to stuff such toys due to the reduced costs of production."], "image": "train2014/COCO_train2014_000000148415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93269, "question_id": "3KFBNtMYRGPfipp7fQ3v7c", "question": "What are not allowed according to the sign?", "choices": ["dogs", "birds", "cats", "children"], "correct_choice_idx": 1, "direct_answers": ["birds", "birds", "birds", "birds", "no birds", "birds", "birds", "no birds", "birds", "no birds"], "difficult_direct_answer": false, "rationales": ["Birds are outside an establishment on a stroller.", "The sign says no birds allowed.", "The words of the sign are visible and the the words say that birds are not allowed."], "image": "train2014/COCO_train2014_000000093269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475978, "question_id": "3KFrrRX3Xaj6YzdiavxxB5", "question": "What does the boy need to do to decrease his risk of choking on the food in his mouth?", "choices": ["chew food", "spit out", "chug drink", "swallow whole"], "correct_choice_idx": 0, "direct_answers": ["chew", "chew", "chew food", "chew", "swallow food", "chew", "chew", "chew", "smaller bites", "chew properly"], "difficult_direct_answer": false, "rationales": ["The boy needs to use his teeth to grind the food into smaller particles before he swallows them to prevent choking.", "If you don't properly chew your food, you can choke.", "The boy needs to chew."], "image": "train2014/COCO_train2014_000000475978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259535, "question_id": "3KTt4pwmtdEaEf6tVKRUQC", "question": "What is he doing with the frisbee?", "choices": ["catching it", "tossing it", "spinning it", "cleaning it"], "correct_choice_idx": 0, "direct_answers": ["catching", "catching it", "playing", "catching frisbee", "catching", "catching it", "catching", "catching", "catching it", "catching"], "difficult_direct_answer": false, "rationales": ["The man is reaching up for a frisbee that is in the air.", "The frisbee is not in his hands. the frisbee is moving towards him.", "The man is catching it."], "image": "train2014/COCO_train2014_000000259535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181948, "question_id": "3KcuCyyxgdDmWcUfbb6Hc8", "question": "Which direction is the arrow pointing?", "choices": ["left", "down", "up", "right"], "correct_choice_idx": 2, "direct_answers": ["straight", "up", "upward", "straight", "forward", "forward", "straight", "straight", "up", "up"], "difficult_direct_answer": false, "rationales": ["The arrow is pointing up toward the sky on the sign.", "The arrow on the sign is pointing up to indicate going straight.", "The arrow is pointing up to the sky."], "image": "val2014/COCO_val2014_000000181948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303408, "question_id": "3KmVyDufeBE5UcY5bDMG7h", "question": "How many different times are photographed here?", "choices": ["three", "two", "none", "one"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "one", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is only one clock it is just reflected in the mirror.", "It is the same clock, reflected in a mirror.", "There is a mirror and it is showing the same clock and the hands are in the same position, they are just a mirror image of each other."], "image": "train2014/COCO_train2014_000000303408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456756, "question_id": "3KoHkmtxXCopcAUn6hy7WS", "question": "What feature is this animal known for?", "choices": ["spinning webs", "long neck", "gills", "armored shell"], "correct_choice_idx": 1, "direct_answers": ["long neck", "long neck", "long neck", "long neck", "long neck", "long neck", "long neck", "long neck", "long neck", "long neck"], "difficult_direct_answer": false, "rationales": ["A giraffe is in a zoo.", "Giraffe's have long necks so their able to eat. they eat the leaves from the top of the trees.", "Giraffes have notoriously long necks."], "image": "val2014/COCO_val2014_000000456756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549335, "question_id": "3KvSxMXMnkwcR7umqZj9n2", "question": "What type of surfboard is the man in green holding?", "choices": ["midboard", "bodyboard", "shortboard", "longboard"], "correct_choice_idx": 3, "direct_answers": ["longboard", "white", "longboard", "longboard", "long", "surf board", "long board", "long board", "blue", "oversize"], "difficult_direct_answer": false, "rationales": ["This one has extra length on it.", "There are several types of surfboards but this in the longer board.", "The man standing in the water is holding a surfboard known as a longboard because it's longer than a normal surfboard."], "image": "val2014/COCO_val2014_000000549335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49058, "question_id": "3KzN5jbsXo3Yn8N5f2gzgR", "question": "The area underneath the structure is illuminated by what?", "choices": ["fluorescent lights", "hps lights", "led lights", "incandescent lights"], "correct_choice_idx": 2, "direct_answers": ["lights", "artificial light", "led lights", "led lights", "lights", "colored lights", "neon lights", "light", "led lighting", "led lights"], "difficult_direct_answer": false, "rationales": ["The illumination is 90% greater using this method.", "You can tell by the colors and hue of the lights as to what type they are.", "The area is lit up by lights that are modern and blue."], "image": "train2014/COCO_train2014_000000049058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411093, "question_id": "3L4gWsaSHRPR8QRtWQw448", "question": "What vehicle was brought on the bought?", "choices": ["motorcycle", "car", "truck", "bus"], "correct_choice_idx": 1, "direct_answers": ["red", "car", "car", "car", "car", "boat", "car", "red", "car", "sports car"], "difficult_direct_answer": false, "rationales": ["There is a small vehicle with four wheels. it has been put on by driving it up.", "There is a automobile being transported by the boat.", "The vehicle on the boat has four wheels. it is too small to be a truck or bus."], "image": "val2014/COCO_val2014_000000411093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323705, "question_id": "3L5qL3P2rBpNST5Gehwaas", "question": "If the clock is showing times in the PM how many hours ago did the New York Stock Exchange open?", "choices": ["one", "three", "four", "six"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "12 hours", "3 hours", "fifteen hours", "five", "3 1/2", "2/12 hours", "seventeen", "12 hours"], "difficult_direct_answer": false, "rationales": ["The nyse opens at 9:30 am and the clock displays 12:30 pm.", "The clock is showing 12:30 pm. the nyse opens at 9:30 am.", "The nyse opened three hours ago."], "image": "train2014/COCO_train2014_000000323705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253686, "question_id": "3LKTBEfinwBnKQEbnhd2Xo", "question": "What is the little girl wearing on her legs?", "choices": ["leggings", "jeans", "tights", "knee socks"], "correct_choice_idx": 0, "direct_answers": ["leggings", "leggings", "slippers", "thongs", "tights", "leggings", "leggings", "leggings", "leggings", "stockings"], "difficult_direct_answer": false, "rationales": ["The girl is wearing a skin tight clothing item on her legs that end just before the ankles. this type of clothing is known as answer a.", "A child is wearing a dress with form fitting tights under it.", "The little girl is wearing striped leggings on her legs."], "image": "train2014/COCO_train2014_000000253686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163076, "question_id": "3LKwdeaBDi4ncHu68C29sd", "question": "What material is the round table made of?", "choices": ["granite", "wood", "ceramic", "metal"], "correct_choice_idx": 3, "direct_answers": ["metal", "iron", "metal", "metal", "metal", "metal", "iron", "metal", "metal", "iron"], "difficult_direct_answer": false, "rationales": ["The table is made of wrought iron or something similar.", "The table is made of metal.", "The table is made of metal."], "image": "val2014/COCO_val2014_000000163076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402228, "question_id": "3LQ8GeCGiV8TyKo6n9JpHj", "question": "What type of TV is that?", "choices": ["crt", "projector", "lcd", "toy"], "correct_choice_idx": 0, "direct_answers": ["tube", "crt", "flatscreen", "old", "tubed", "crt", "crt", "crt", "box", "old school"], "difficult_direct_answer": false, "rationales": ["That older style of tv is called crt.", "The tv works, so it is not a toy. it is deep but does not have a projector.", "An older television with a large back is on a stand in a family room."], "image": "train2014/COCO_train2014_000000402228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558618, "question_id": "3LjgkgZVatRmUMPeDrG3Qb", "question": "Where could coins potentially be hidden?", "choices": ["in xylophone", "under carpet", "under cushions", "in diaper"], "correct_choice_idx": 2, "direct_answers": ["mat", "couch cushions", "couch cushions", "under toys", "under toys", "couch cushions", "couch cushions", "under cushions", "under couch", "couch cushions"], "difficult_direct_answer": false, "rationales": ["Change always falls into couches", "The first two options could be possible, because coins are small.", "There is a couch next to the baby and there could be coins hidden in the cushions."], "image": "train2014/COCO_train2014_000000558618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318225, "question_id": "3M66JXW43ZAGGFCsULchYX", "question": "Which person works at this facility?", "choices": ["red top", "back kid", "front kid", "purple shirt"], "correct_choice_idx": 3, "direct_answers": ["worker", "purple shirt", "purple shirt", "purple shirt", "cow boy", "woman", "purple shirt", "woman", "horse trainer", "horse trainer"], "difficult_direct_answer": false, "rationales": ["The person teaching the kids to ride is walking on foot.", "A girl in a purple shirt is walking in the center of horses walking in a circle with kids with helmets riding them.", "You can tell that the purple shirt person works there because she is an adult."], "image": "train2014/COCO_train2014_000000318225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375996, "question_id": "3MHFFQGhntPD9caKWXTV6a", "question": "Which person probably has the most recent ancestry in Africa?", "choices": ["none", "middle", "left", "right"], "correct_choice_idx": 1, "direct_answers": ["middle person", "middle", "middle", "middle", "middle person", "middle person", "middle", "middle person", "middle", "middle person"], "difficult_direct_answer": false, "rationales": ["The person in the middle is black while the people sitting on either side are of different ethnic background.", "Their dark skin tone aligns with the skin tone of someone originating in africa.", "The person in the middle is black. many people from africa are black."], "image": "train2014/COCO_train2014_000000375996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292549, "question_id": "3Mhm37xeMCxVNQpe25nRWF", "question": "Balaclava used as what?", "choices": ["grip", "sports wear", "ski shoe", "ski mask"], "correct_choice_idx": 3, "direct_answers": ["ski mask", "clothing", "hat", "face protection", "ski hat", "head covering", "hat", "head warmth", "skiing", "hat"], "difficult_direct_answer": false, "rationales": ["The girl is wearing goggles.", "It's used as a ski mast.", "She is wearing a protection for her face."], "image": "train2014/COCO_train2014_000000292549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12044, "question_id": "3MmqSTgzjtjzaoM79VDDvP", "question": "The item the person is cutting is harmful to who?", "choices": ["aquaphobic", "agoraphobic", "diabetic", "hypochondriac"], "correct_choice_idx": 2, "direct_answers": ["diabetics", "diabetics", "people", "diabetics", "diabetic", "diabetics", "people", "diabetics", "cake", "diabetics"], "difficult_direct_answer": false, "rationales": ["The person is cutting a cake which is harmful to a diabetic because it is high in sugar.", "Too many sugary foods are bad for people with diabetes.", "The cake contains sugar so it would make the blood sugar of a person with this condition rise which could be dangerous."], "image": "train2014/COCO_train2014_000000012044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530081, "question_id": "3N37XLfBLTz5hcTrd6tWkP", "question": "Who might be in possession of this?", "choices": ["politicians", "hollywood stars", "billionaires", "schoolchildren"], "correct_choice_idx": 3, "direct_answers": ["child", "schoolchildren", "child", "school child", "receptionist", "person", "worker", "student", "office worker", "employee"], "difficult_direct_answer": true, "rationales": ["The portions are small and are in cute containers. there is also a toy so it is fit for a child.", "These are containers that parents can pack their kids school lunch in.", "Kids would have a packed lunch."], "image": "train2014/COCO_train2014_000000530081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318906, "question_id": "3NMotyNopPsv43dNEqqiTV", "question": "By which technology standard is the monitor connected to the laptop?", "choices": ["vga", "dvi", "displayport", "hdmi"], "correct_choice_idx": 0, "direct_answers": ["hdmi cable", "vga", "vga cable", "vga", "lcd", "monitor", "cables", "cord", "cable", "hdmi cable"], "difficult_direct_answer": false, "rationales": ["The vga is connected.", "It's probably connected by an hdmi cord.", "It is an older interface."], "image": "train2014/COCO_train2014_000000318906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101240, "question_id": "3P6LcNopLk4wheftYSScuE", "question": "What is the standing sheep most likely doing?", "choices": ["bleating", "walking", "eating", "sleeping"], "correct_choice_idx": 0, "direct_answers": ["looking around", "grazing", "observing", "dumping", "eating", "bleating", "walking", "observing", "walking", "grazing"], "difficult_direct_answer": false, "rationales": ["The animal standing upright with its mouth open.", "The sheep is standing while the others are sitting.", "The sheep is standing and looking off in the distance. the sheep's mouth is partially open."], "image": "train2014/COCO_train2014_000000101240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214810, "question_id": "3P7VViDGUdSy28AkDFisXq", "question": "What type of animals are present in the image?", "choices": ["sheep", "goat", "cow", "dog"], "correct_choice_idx": 2, "direct_answers": ["cows", "cows", "cows", "cows", "cattle", "cow", "cattle", "cows", "cow", "cows"], "difficult_direct_answer": false, "rationales": ["The animals are on a ranch. they are too big to be goats, dogs, or sheep.", "The animals are too large to be any of the other choices", "Heifers are roaming the field."], "image": "val2014/COCO_val2014_000000214810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434580, "question_id": "3PAHepRpacq7M53Bg6UfWP", "question": "If something goes wrong with this woman's work what can she blame?", "choices": ["incompetence", "dog", "cat", "boss"], "correct_choice_idx": 2, "direct_answers": ["cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The animal is distracting the woman from doing her job.", "You can tell by the size and fur as to what type of pet is on her laptop.", "A woman is on a couch with a laptop and a cat is climbing on her lap."], "image": "val2014/COCO_val2014_000000434580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499141, "question_id": "3PFSjtPis6ZqbsNLdLK6E2", "question": "What year was this video game console first released?", "choices": ["2006", "2021", "1999", "2012"], "correct_choice_idx": 0, "direct_answers": ["two-thoousand twelve", "last decade", "2012", "2006", "2006", "2001", "2012", "2006", "90's", "2006"], "difficult_direct_answer": false, "rationales": ["The year was 2006.", "The remote in the picture lets you know it's the nintendo wii.", "This is a wii which came out in 2006."], "image": "train2014/COCO_train2014_000000499141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384953, "question_id": "3PFoWD7KUSyVdeCXNycJLc", "question": "What is the best material for chopsticks?", "choices": ["steel", "silver", "bamboo", "metal"], "correct_choice_idx": 2, "direct_answers": ["bamboo", "bamboo", "wood", "wood", "wood", "bamboo", "bamboo", "bamboo", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["Chopsticks are traditionally made of bamboo.", "The material is bamboo.", "This asian wood does not burn or splinter easily. it is important to eat with utensils that don't give splinters."], "image": "train2014/COCO_train2014_000000384953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318175, "question_id": "3PZBdM54bnDtNiXavwk9bT", "question": "The band on the shirt of the man wearing a mask belongs to what genre of music?", "choices": ["punk", "country", "hip hop", "blues"], "correct_choice_idx": 0, "direct_answers": ["rock", "punk", "punk rock", "metal", "punk", "misfits", "punk", "emo", "punk rock", "rock"], "difficult_direct_answer": false, "rationales": ["The man in the mask is wearing a misfits tshirt which is a famous punk band.", "The genre of music is accompanied by horror film themes and imagery and is referred to as punk.", "The man in the white mask is wearing a misfits tshirt which is a punk band."], "image": "train2014/COCO_train2014_000000318175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74875, "question_id": "3Pd72NK9szXDAA7JpeiZLb", "question": "The man has what kind of facial hair?", "choices": ["peach fuzz", "goatee", "mutton chops", "clean shaven"], "correct_choice_idx": 1, "direct_answers": ["goatee", "beard", "bearded moustache", "goatee", "goatee", "black", "goatee", "goatee", "beard", "goatee"], "difficult_direct_answer": false, "rationales": ["A goatee is a beard connected with a moustache.", "The man has a goatee.", "There is hair that is growing on his chin."], "image": "train2014/COCO_train2014_000000074875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162557, "question_id": "3QPYUXq3erNaEBxTZEcqW2", "question": "What are the men standing on the bench doing?", "choices": ["eating", "debating", "playing sports", "singing"], "correct_choice_idx": 3, "direct_answers": ["singing", "singing", "singing", "singing", "singing", "singing", "singing", "singing choir", "singing", "singing"], "difficult_direct_answer": false, "rationales": ["The men on the risers are singing together in a choir that performs music.", "The men are singing together in a chorus on the benches.", "They are holding music and are watching a conductor while their mouths are open."], "image": "train2014/COCO_train2014_000000162557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248702, "question_id": "3QdBuG7DDWtCN3pQ99Nnj5", "question": "What style of building is located near the men?", "choices": ["hospital", "police station", "museum", "castle"], "correct_choice_idx": 3, "direct_answers": ["castle", "castle", "castle", "castle", "islamic", "castle", "black", "castle", "castle", "castle"], "difficult_direct_answer": false, "rationales": ["The building is fort-like and is made out of stone.", "By the type of material and the towers design make it easy to tell what it is.", "The stone building near the men is fortified. hospitals, police stations, and museums are not fortified."], "image": "train2014/COCO_train2014_000000248702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31773, "question_id": "3QduxAjbFtHtBnw37WX6qB", "question": "What is attached to the computer and sits on top of the placemat?", "choices": ["speakers", "mouse", "microphone", "headphones"], "correct_choice_idx": 1, "direct_answers": ["keyboard", "keyboard", "mouse", "keyboard", "mouse", "mouse", "mouse", "keyboard", "keyboard", "keyboard"], "difficult_direct_answer": false, "rationales": ["That is how a mouse looks.", "The headphones have a cord that wraps around and attaches to it", "There is a mouse attached to the computer and it is sitting on top of the placemat."], "image": "val2014/COCO_val2014_000000031773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250706, "question_id": "3QknhShKn8Zf7Z3vLjHEjD", "question": "Why is this place unsuitable for feeding these animals?", "choices": ["steep slope", "no water", "no grass", "rocky"], "correct_choice_idx": 2, "direct_answers": ["no grass", "rocky", "no vegetation", "wild", "dangerous", "too steep", "no grass", "no grass", "no grass", "wild"], "difficult_direct_answer": false, "rationales": ["These animals eat grass and there is none growing.", "It is only dirt and rocks, neither of which are edible.", "The place has no grass."], "image": "train2014/COCO_train2014_000000250706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199640, "question_id": "3R4MnARrDj5zN5KvawxKeJ", "question": "What is held in the canisters at the back?", "choices": ["gasoline", "oil", "propane", "pepper spray"], "correct_choice_idx": 2, "direct_answers": ["fuel", "gas", "goods", "propane", "gas", "gas", "gas", "clothes", "propane", "propane"], "difficult_direct_answer": false, "rationales": ["The propane is in canisters.", "Traditionally those type of metal cylinders hold propane.", "Scooters hold fuel in the back of the bike for the purpose of acceleration."], "image": "train2014/COCO_train2014_000000199640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526534, "question_id": "3R6C5NWaBMDDEaqdst78aL", "question": "The butt of what animal is visible at the bottom left corner of the giraffe?", "choices": ["kangaroo", "elephant", "deer", "zebra"], "correct_choice_idx": 3, "direct_answers": ["zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["A zebra's bottom is hanging out.", "It has the distinct black and white stripes that are synonymous with a zebra.", "The black and white stripes look to be that of a zebra."], "image": "train2014/COCO_train2014_000000526534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539372, "question_id": "3RHYqRttJpEJwBRpuuxXwa", "question": "Which base is he running to?", "choices": ["second", "home", "first", "third"], "correct_choice_idx": 2, "direct_answers": ["first", "first", "first", "first", "first", "first", "first", "first", "first", "first"], "difficult_direct_answer": false, "rationales": ["The player is still holding the bat and is moving forward.", "A baseball batter is beginning to run as he drops the bat. first base is the first base to run to after hitting.", "The batter just hit the ball and will run to the first base."], "image": "train2014/COCO_train2014_000000539372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280778, "question_id": "3RYWgtEQTHgu5Ss5xqGtb3", "question": "What is inside the bus that connects the two levels?", "choices": ["stairs", "pole", "door", "ladder"], "correct_choice_idx": 0, "direct_answers": ["stairs", "stairs", "stairs", "staircase", "stairs", "stairs", "floor", "floor", "stairs", "stairs"], "difficult_direct_answer": false, "rationales": ["The way that you get from one level on a double decker bus to the other level is up the stairs that are on the inside.", "People, in this case passengers, go comfortably up and down between two levels by way of this common engineering structure.", "This is a double decker bus and there is a staircase to connect the two levels."], "image": "train2014/COCO_train2014_000000280778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451473, "question_id": "3S2CbUQ5Jf6MaPoes38pky", "question": "Which ancient civilization utilized the support structure shown in the image?", "choices": ["native americans", "slovaks", "romans", "germans"], "correct_choice_idx": 2, "direct_answers": ["greek", "romans", "romans", "romans", "mayans", "romans", "rome", "norse", "mayans", "unknown"], "difficult_direct_answer": false, "rationales": ["The romans used the structure.", "The structure to the left is a type of ancient construction last used in the mountains by the romans.", "Curved archways are on a building at a ski resort."], "image": "val2014/COCO_val2014_000000451473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317054, "question_id": "3SAtvUYWggKDCVoduw4xXd", "question": "What material is the white part of this specimen made of?", "choices": ["real fur", "cotton", "synthetic fabric", "wool"], "correct_choice_idx": 0, "direct_answers": ["fur", "fur", "fur", "bear fur", "fur", "fur", "bear fur", "real fur", "fur", "fur"], "difficult_direct_answer": false, "rationales": ["The material is real fur.", "This is a real animal that they kept the fur to use as a rug or blanket.", "This specimen is made of white fur, and is a real hide of a bear."], "image": "train2014/COCO_train2014_000000317054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328120, "question_id": "3SPBY49XDczxEaFj72SPs7", "question": "Where are the owners of these bikes while this photo was taken?", "choices": ["at work", "overseas", "skydiving", "home"], "correct_choice_idx": 0, "direct_answers": ["shop", "city", "away", "in building", "absent", "at work", "inside building", "in building", "in buildings", "work"], "difficult_direct_answer": true, "rationales": ["They leave their bikes there while they work.", "A profession that allows you to earn a living.", "The bikes are parked outside of a busy city street and the people that rode them are probably working in the buildings."], "image": "train2014/COCO_train2014_000000328120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561582, "question_id": "3SPxqEx6r4GdMnLmcmTMYZ", "question": "Why is the cats pupil green?", "choices": ["contacts", "birth defect", "genetics", "camera flash"], "correct_choice_idx": 3, "direct_answers": ["flash", "flash", "natural pigment", "injury", "colored glow", "camera flash", "genetic", "camera flash", "reflection", "light reflection"], "difficult_direct_answer": false, "rationales": ["Some cats have green eyes.", "They turn that color when the flash from the camera goes off.", "Animal eyes reflect when a camera flashes."], "image": "train2014/COCO_train2014_000000561582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547164, "question_id": "3ST5yeWxMUAPD5pJ4dkH75", "question": "What could someone do inside the yellow building?", "choices": ["mail letters", "buy clothing", "watch movie", "exercise"], "correct_choice_idx": 2, "direct_answers": ["watch", "watch movies", "watch movies", "watch movie", "perform", "watch movie", "watch movie", "watch movie", "movie", "see movies"], "difficult_direct_answer": false, "rationales": ["The building is a theater.", "The building is used to watch movies.", "The building has the word theater which tells you what you can do there."], "image": "train2014/COCO_train2014_000000547164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567135, "question_id": "3SiaCywbAjxbk3VVfp7g9y", "question": "What are most phones here being used for?", "choices": ["callling", "texting", "filming", "gaming"], "correct_choice_idx": 2, "direct_answers": ["taking photos", "filming", "recording", "filming", "recording", "recording", "recording", "pictures", "recording", "recording"], "difficult_direct_answer": false, "rationales": ["There's no indication of still images being take, and the cameras of the phones are pointing at something moving.", "There are pictures of the event shown on the phone.", "Everyone is aiming their phone at a certain point as if they are taking a video."], "image": "train2014/COCO_train2014_000000567135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516189, "question_id": "3SyetoLYLSVjVCr32ABLQA", "question": "What does the snorkel on the truck protect it from?", "choices": ["fire", "animals", "dust", "water"], "correct_choice_idx": 3, "direct_answers": ["flooding", "exhaust", "bumper damage", "water", "collisions", "collisions", "water", "water", "branches", "water"], "difficult_direct_answer": false, "rationales": ["It protects it from water.", "Snorkels are used to breath in water.", "The vehicle is an offroad vehicle and snorkels are used so the exhaust doesn't flood in short periods of being submerged."], "image": "train2014/COCO_train2014_000000516189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540554, "question_id": "3T2f5TUQrTqWBjAPmjx2fr", "question": "The two trains are traveling in which European country?", "choices": ["spain", "germany", "france", "united kingdom"], "correct_choice_idx": 2, "direct_answers": ["france", "france", "france", "france", "france", "france", "france", "france", "france", "france"], "difficult_direct_answer": false, "rationales": ["There is a reference to the city of versailles.", "The sign near the red train indicates that this is the versailles-chantiers station.", "The trains go to france."], "image": "train2014/COCO_train2014_000000540554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8568, "question_id": "3TCgYhMinHos6JwPhQcQsa", "question": "Where does the black and silver item in the middle compartment belong?", "choices": ["ankle", "neck", "wrist", "waist"], "correct_choice_idx": 2, "direct_answers": ["on wrist", "wrist", "on wrist", "on pad", "pencil case", "wrist", "wrist", "sink", "desk", "mouse pad"], "difficult_direct_answer": false, "rationales": ["The fitness watch or fitbit fits on one's wrist. a fitness watch tracks activity and helps to monitor areas of health when worn.", "The black and silver item is a watch which is strapped to this body part in order for the person to keep track of the time.", "An item with a band and time piece is on a desk."], "image": "train2014/COCO_train2014_000000008568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493724, "question_id": "3TDCyZAmR5WThTX5RQbTD3", "question": "What activity is being shared by the people?", "choices": ["cooking", "video gaming", "karaoke", "live audition"], "correct_choice_idx": 1, "direct_answers": ["music", "guitar hero", "video gaming", "rock band", "video game", "music", "rock band", "music", "video game", "karaoke"], "difficult_direct_answer": false, "rationales": ["They are holding musical instruments and a microphone and are looking at a screen. karaoke types the words to a song for sing along purposes.", "They are playing guitar hero", "They are all holding wii remotes to play the game."], "image": "val2014/COCO_val2014_000000493724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294238, "question_id": "3TDXN5eKDHkbrmsSWNA64E", "question": "The glass drink on the table has what as its primary flavor?", "choices": ["citrus", "tea", "cola", "pineapple"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "orange", "citrus", "lemon", "orange", "lemon", "orange", "lemon", "orange"], "difficult_direct_answer": false, "rationales": ["The shape of the container suggests that it is a fruit and the color supports that it may be a citrus flavor.", "The drink is for citrus.", "The drink is orange, not brown or yellow."], "image": "train2014/COCO_train2014_000000294238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254266, "question_id": "3TDkXuDCAqrt85CS4mA4CZ", "question": "What time of the day are the surfers showering here?", "choices": ["dusk", "midnight", "noon", "sunrise"], "correct_choice_idx": 0, "direct_answers": ["dusk", "night", "dawn", "morning", "golden hour", "morning", "dusk", "night", "afternoon", "dusk"], "difficult_direct_answer": false, "rationales": ["Could be two possibilities dawn or the latter.", "The time is dusk.", "The sun must be about to set."], "image": "train2014/COCO_train2014_000000254266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24958, "question_id": "3TEfrKV9fDMDz5vEdp79Ea", "question": "Why does the man cover his head with a cloth?", "choices": ["religion", "prevents sunburn", "fashion", "for warmth"], "correct_choice_idx": 0, "direct_answers": ["religious purposes", "religion", "religion", "religious beliefs", "religion", "religious", "sun protection", "religious practice", "religious tradition", "religious reasons"], "difficult_direct_answer": false, "rationales": ["Most men who are in that region and religious beliefs wear that type of head attire.", "This is part of a hebrew attire when visiting religious sites for the jewish.", "The man is wearing a special cloth to cover his head in observance with his religion."], "image": "val2014/COCO_val2014_000000024958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422785, "question_id": "3TU96KDN7m6PtU2YJcX3Vf", "question": "What is the large circular blue object under the sink?", "choices": ["hose", "bucket", "ball", "mat"], "correct_choice_idx": 1, "direct_answers": ["bucket", "bucket", "bucket", "bucket", "bucket", "bucket", "bucket", "bucket", "bucket", "plastic tub"], "difficult_direct_answer": false, "rationales": ["The design and the plastic its made of shows you what it is.", "The large blue object is a bucket used for cleaning or moping.", "The item is made of plastic and has a handle for carrying as these items typically do,"], "image": "val2014/COCO_val2014_000000422785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480641, "question_id": "3TfP2huzS5Xv887kczAdK3", "question": "Where does the man on the board want to go?", "choices": ["up", "forward", "backwards", "down"], "correct_choice_idx": 2, "direct_answers": ["below", "down", "backwards", "back", "aerial", "on ramp", "downhill", "back down", "off structure", "down"], "difficult_direct_answer": true, "rationales": ["He needs to go this direction to stay on the course. if he goes off the edge that is a long drop onto uneven grass and dirt", "The man is flipping his board so he can go back.", "The skater has stopped and is about to go in reverse."], "image": "train2014/COCO_train2014_000000480641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440735, "question_id": "3UPxEiTpsJKXTCvVUDjvjR", "question": "What likely relation do the two spoon looking things have?", "choices": ["connected", "both expensive", "opposite colors", "no relation"], "correct_choice_idx": 0, "direct_answers": ["serve", "tongs", "connected", "tong", "feeding", "connected", "connected", "tongs", "tongs", "tongs"], "difficult_direct_answer": false, "rationales": ["They are up on their sides and facing each other which wouldn't be possible if they were separate", "The spoons are facing each other like tongs.", "The spoons-like objects are part of a pair of tongs. they are joined by a hinge-like mechanism."], "image": "train2014/COCO_train2014_000000440735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505343, "question_id": "3UTPKtMpDC7mRHjKrhH4Nf", "question": "When is it safe to proceed going forward in a vehicle?", "choices": ["3 minutes", "now", "never", "10 minutes"], "correct_choice_idx": 1, "direct_answers": ["now", "now", "now", "now", "green light", "green", "yes", "now", "green light", "now"], "difficult_direct_answer": false, "rationales": ["If the light is green it's normally safe to cross.", "The traffic light on the post is green which tells drivers they can go forward.", "There is currently a green light on the traffic light. green lights are associated with \"go\" for vehicles and transitively, it is safe to go at this time."], "image": "train2014/COCO_train2014_000000505343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498449, "question_id": "3Uie9jdrStFb76MpSdTpSG", "question": "Where is most of the kids weight?", "choices": ["on heads", "on feet", "on arms", "on skis"], "correct_choice_idx": 3, "direct_answers": ["on skis", "poles", "sixty pounds", "front", "below", "poles", "ground", "feet", "poles", "stomach area"], "difficult_direct_answer": false, "rationales": ["They are both leaning forward onto their poles and putting their weight on them.", "The kids are on skis.", "They are standing on them with their feet."], "image": "train2014/COCO_train2014_000000498449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59463, "question_id": "3UtoH8LfQzWXk7DrU7VJrH", "question": "What type of building is this black structure?", "choices": ["deli", "bathroom", "phonebooth", "post office"], "correct_choice_idx": 1, "direct_answers": ["bathroom", "toilet", "toilet", "bathroom", "bathroom", "toilet", "toilet", "bathroom", "bathroom", "toilet"], "difficult_direct_answer": false, "rationales": ["The logo shows a man and woman.", "The building is a bathroom.", "The black structure has a sign on it showing that it contains a toilet."], "image": "val2014/COCO_val2014_000000059463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451389, "question_id": "3VWTsgMUAMcUNHgkntCjqL", "question": "What is the minimum number of players who can partake in a match of this sport?", "choices": ["eight", "four", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["It is called singles. where there is one person on each side of the net.", "A singles match would involve one competitor on each side.", "The sport being played is tennis and while they are currently playing a doubles variation, at minimum in this sport as a whole, one would need at least two players to play a match."], "image": "train2014/COCO_train2014_000000451389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473720, "question_id": "3VZqnUtemBsherHyzwCMAG", "question": "What is the person doing with the food in his lap?", "choices": ["eating", "selling", "decorating", "cooking"], "correct_choice_idx": 0, "direct_answers": ["eating", "eating it", "eating", "eating", "balancing", "eating", "eating", "eating", "tempting dog", "eating"], "difficult_direct_answer": false, "rationales": ["The person has pizza on his lap, so he must be about to eat.", "The person has a pizza in his lap that he is going to eat.", "A cooked pizza is sitting in his lap, which is ready to be consumed by eating."], "image": "val2014/COCO_val2014_000000473720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298545, "question_id": "3VjzdDAufAvioYm6yMFNGQ", "question": "What is the man watching?", "choices": ["television", "child", "bird", "ball"], "correct_choice_idx": 3, "direct_answers": ["ball", "ball", "ball", "ball", "tennis ball", "ball", "tennis ball", "ball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["He is holding his arm and the racket backward in anticipation of returning the volley.", "A man is playing tennis and has his racket pulled back ready to swing. playing tennis requires hitting a bll.", "The man is playing tennis so that would be the most important thing to look at."], "image": "train2014/COCO_train2014_000000298545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78892, "question_id": "3VzaP7G2XJ6LuqvZPs2Eog", "question": "Where are these people going?", "choices": ["club", "zoo", "ocean", "to work"], "correct_choice_idx": 3, "direct_answers": ["boarding train", "home", "city", "their destinations", "downtown", "to work", "city", "north", "town", "to work"], "difficult_direct_answer": false, "rationales": ["The people are on the train because they are commuting in the city to work.", "People are riding on an elevated train. elevated trains are used in cities to get to work and home.", "These people are wearing business clothing."], "image": "val2014/COCO_val2014_000000078892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226502, "question_id": "3W9cNyJc54ukgXKGFiJAne", "question": "What is next to the phone?", "choices": ["ear", "dog", "leg", "cat"], "correct_choice_idx": 0, "direct_answers": ["ear", "head", "woman's hand", "woman", "purse", "hair", "woman's face", "hair", "ear", "woman's face"], "difficult_direct_answer": false, "rationales": ["She is holding it up to the side of her head", "This is where you hold the phone to speak and listen into it.", "The woman is holding the phone up to her ear."], "image": "train2014/COCO_train2014_000000226502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368994, "question_id": "3WPc7eSiChSp5ij7Bi34uL", "question": "Which type flower is most oft repeated here?", "choices": ["gladiola", "daisy", "rose", "iris"], "correct_choice_idx": 2, "direct_answers": ["rose", "rose", "roses", "rose", "rose", "carnation", "rose", "sunflower", "rose", "rose"], "difficult_direct_answer": false, "rationales": ["Roses are scattered throughout.", "You can tell by the floral design and color as to what the dominant flower is here.", "The most common flower is the rose."], "image": "val2014/COCO_val2014_000000368994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22652, "question_id": "3WnVcjoaBVtmrYpXQsWvUz", "question": "What area these people going to take?", "choices": ["olympic race", "zoo visit", "ski lessons", "recordings"], "correct_choice_idx": 2, "direct_answers": ["hill", "snowy area", "uphill", "bunny slopes", "picture", "lessons", "ski lessons", "hill", "lessons", "hill"], "difficult_direct_answer": false, "rationales": ["These people are on snow and are carrying poles and other objects that are needed for an extreme sport. they are children, so they do not have much experience in this sport.", "They are on a snowy mountain and carring skis and poles", "These people are all little kids so they're learning to ski."], "image": "train2014/COCO_train2014_000000022652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123642, "question_id": "3X3agUCxRqU9evQFVhpa48", "question": "What company is known for making the thing on the plate on the right?", "choices": ["subway", "moen", "snicker's", "ginsu"], "correct_choice_idx": 0, "direct_answers": ["kikkoman", "panera", "panini", "panera", "unknown", "coach", "panama", "subway", "panera", "food company"], "difficult_direct_answer": false, "rationales": ["Subway can make good subs", "Subway makes sandwiches.", "Of the options, this is the only one that makes sandwiches."], "image": "val2014/COCO_val2014_000000123642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501176, "question_id": "3XJghtDksxybN8J54zYYG4", "question": "Who's bike is this?", "choices": ["child", "man", "woman", "tourist"], "correct_choice_idx": 0, "direct_answers": ["boy's", "child", "child's", "boys", "child", "little boy's", "kid's", "boy's", "children's", "kid"], "difficult_direct_answer": false, "rationales": ["It's a little bike for a child.", "The bike is too small for the adult that is riding it. it belongs to the boy running behind it.", "The bike is small. it is for a smaller person."], "image": "train2014/COCO_train2014_000000501176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213577, "question_id": "3XMSXTusgDujAoDU2dscBd", "question": "Why can we still see them?", "choices": ["daytime", "bonfire", "white skin", "artificial light"], "correct_choice_idx": 3, "direct_answers": ["flash light", "artificial light", "lighting", "light", "lights", "street lights", "court lights", "lighting", "light", "light"], "difficult_direct_answer": false, "rationales": ["The men are standing on a tennis court at night that has artificial lights installed so they can see.", "The artificial light in the area shows their faces.", "Two men are on a lit tennis court at night. courts are lit with artificial lights at night."], "image": "train2014/COCO_train2014_000000213577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391415, "question_id": "3Xe5zFSgEQ6rwdZ9WzrRAb", "question": "Why is the red car on the bed of the blue vehicle?", "choices": ["sell car", "buy car", "tow car", "race car"], "correct_choice_idx": 2, "direct_answers": ["tow car", "to transport", "mechanical failure", "station wagon", "being towed", "being towed", "for transport", "being towed", "it's broken", "broken"], "difficult_direct_answer": false, "rationales": ["The red car was broken.", "Cars are sometimes put on a bed because they are parked illegally or can't move.", "This truck is taking the car somewhere by towing it."], "image": "val2014/COCO_val2014_000000391415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226904, "question_id": "3Xrgnum3SmLXCu8fW8Ymsc", "question": "Why is he wearing a glove?", "choices": ["warmth", "health", "fashion", "grip"], "correct_choice_idx": 3, "direct_answers": ["catch ball", "throwing ball", "catch ball", "catch ball", "grip", "playing baseball", "pitching", "to catch", "baseball", "catching"], "difficult_direct_answer": false, "rationales": ["A baseball player is wearing a uniform and glove. gloves are worn for grip and protection in baseball.", "That is a mitt used to catch a ball when the other players throw it towards him so he can have a better grasp of it.", "In baseball, a glove is used to hold and catch a ball as hands are not big or study enough to catch the ball."], "image": "train2014/COCO_train2014_000000226904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27330, "question_id": "3XvysRoW4Hs8szYVYxLwvY", "question": "Why is the man in the black shorts jumping in the air?", "choices": ["to exercise", "tackling player", "catch frisbee", "dodging ball"], "correct_choice_idx": 2, "direct_answers": ["playing", "catcher box", "throwing frisbee", "catching frisbee", "catch frisbee", "catch frisbee", "catching frisbee", "to catch", "throwing hard", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["The frisbee is visible and he is angled towards it with a hand reaching.", "The persons hand is extending out to the frisbee so it's obvious what they are doing.", "The man in the black shorts is jumping so he can catch the white frisbee before it lands."], "image": "train2014/COCO_train2014_000000027330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168571, "question_id": "3YituQJeT8iJyp7KXtiPTg", "question": "What hair style is the woman wearing?", "choices": ["pig tails", "bird tails", "cow tails", "duck tails"], "correct_choice_idx": 0, "direct_answers": ["pigtails", "pigtails", "braids", "pig tails", "ponytail", "wave", "pigtails", "ponytail", "pig tails", "braid"], "difficult_direct_answer": false, "rationales": ["Two pony tail hairs are extending from her hair.", "Half her hair is pulled to one side of the back of her head. the other half of her hair is pulled to the other side of the back of her head, and each of the two sides is held in place by an elastic band.", "The woman on the phone has her hair styled in two sections called pig tails."], "image": "train2014/COCO_train2014_000000168571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393896, "question_id": "3Yk7vYtTpVJuyPRA8JgWEb", "question": "What country does the blue car originate from?", "choices": ["poland", "ukraine", "japan", "china"], "correct_choice_idx": 2, "direct_answers": ["japan", "japan", "japan", "queensland", "japan", "japan", "united states", "australia", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["That car is made in that country.", "The car is made there.", "The car's manufacturer has their headquarters in the country."], "image": "train2014/COCO_train2014_000000393896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90659, "question_id": "3YoFX7S3YXM3tseVFjjkBk", "question": "What color furniture will the cat here likely go next to step?", "choices": ["blue", "cream", "maroon", "red"], "correct_choice_idx": 1, "direct_answers": ["white", "grey", "white", "white", "grey", "cream", "white", "beige", "white", "white"], "difficult_direct_answer": false, "rationales": ["He is heading towards the couch.", "The cat is likely to jump onto the cream couch.", "The cat is looking at the cream colored couch like it wants to go to sleep on it."], "image": "val2014/COCO_val2014_000000090659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104444, "question_id": "3Z2hiB55oNd7a6Wtkw9dwT", "question": "What type of sneakers is the child wearing?", "choices": ["velcro", "laced", "zip up", "tie"], "correct_choice_idx": 0, "direct_answers": ["velcro", "velcro", "white", "mary janes", "white small", "velcro", "labour", "white running", "kids", "tennis shoes"], "difficult_direct_answer": false, "rationales": ["The sneakers do not have laces or zippers. they have straps.", "Her shoes do not contain zippers, laces or ties.", "There are 2 straps instead of laces"], "image": "train2014/COCO_train2014_000000104444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487239, "question_id": "3Z5pAiiWS5cduQWaYEpmvb", "question": "What are they doing in the mountains?", "choices": ["sightseeing", "migrating", "working", "hunting"], "correct_choice_idx": 0, "direct_answers": ["horseback riding", "riding horses", "horseback riding", "riding", "riding horses", "riding horse", "exploring", "sightseeing", "trail riding", "riding ponies"], "difficult_direct_answer": false, "rationales": ["They are sightseeing on horseback in the mountains.", "Guided tours are taken in remote areas for the safety of the people who have never been there. there is a beautiful scenery to soak in.", "The animals are sightseeing."], "image": "val2014/COCO_val2014_000000487239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210149, "question_id": "3ZLS2xsT3azjoNBDqMq8Px", "question": "What activity is possible for those seated here?", "choices": ["film development", "racing", "fishing", "running"], "correct_choice_idx": 2, "direct_answers": ["paddleboating", "fishing", "paddle boating", "boating", "floating", "fishing", "rowing", "rowing", "fishing", "paddle boating"], "difficult_direct_answer": false, "rationales": ["The water is full of animals you can catch.", "The paddle boat is in the lake so it would be possible.", "The people are on a boat in the middle of calm water."], "image": "val2014/COCO_val2014_000000210149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375838, "question_id": "3ZQcuDRoKWozgsCbgKLjEZ", "question": "The shape of the baseball field is?", "choices": ["sphere", "diamond", "cube", "ring"], "correct_choice_idx": 1, "direct_answers": ["diamond", "diamond", "diamond", "diamond", "diamond", "diamond", "diamond", "good", "diamond", "diamond"], "difficult_direct_answer": false, "rationales": ["Baseball is played on a diamond.", "The shape is a diamond.", "The shape is a diamond"], "image": "train2014/COCO_train2014_000000375838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105866, "question_id": "3ZTwg6or9PjpcbQkouBu4D", "question": "What is the most likely level of this room?", "choices": ["basement", "ground", "penthouse", "third"], "correct_choice_idx": 1, "direct_answers": ["garden", "main floor", "double standard", "ground", "second", "ground", "basement", "ground", "ground floor", "floor"], "difficult_direct_answer": false, "rationales": ["One can see the pavement outside which is the same level as the bedroom", "The room is on the first floor since there is pavement visible out of the window.", "One can see through the glass that they are not high up based on the pavement which is on the same level."], "image": "val2014/COCO_val2014_000000105866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445745, "question_id": "3ZZKmUMZgytFaWWLCotBLX", "question": "What is this type of apple called?", "choices": ["ladybug", "granny smith", "red delicious", "baking"], "correct_choice_idx": 1, "direct_answers": ["granny smith", "granny smith", "granny smith", "granny smith", "golden delicious", "green apple", "green", "granny smith", "green", "green"], "difficult_direct_answer": false, "rationales": ["The color green is color for certain fruits. it is often used in pies or eaten raw.", "The apple is a light green apple, indicating its type.", "The apple is small and green."], "image": "train2014/COCO_train2014_000000445745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463753, "question_id": "3ZdjpTrtvWsTFEuY8BtaVr", "question": "What is cut in half on the right?", "choices": ["mango", "pear", "apple", "avocado"], "correct_choice_idx": 3, "direct_answers": ["avocado", "square", "avocado", "desert", "avocado", "pizza", "avocado", "pizza", "avocado", "avocado"], "difficult_direct_answer": false, "rationales": ["The green food with the seed removed is easily recognizable.", "The avocado is cut.", "They are cut in half so they can eat them."], "image": "train2014/COCO_train2014_000000463753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286425, "question_id": "3aWFbWPskgxVT7CEAE7zYb", "question": "What type of potentially harmful light does the laptop screen produce?", "choices": ["rainbow waves", "flashing lights", "uv rays", "neon lights"], "correct_choice_idx": 2, "direct_answers": ["glare", "blue light", "uv rays", "ultraviolet", "blue light", "ultraviolet", "blue light", "blue light", "white", "blue light"], "difficult_direct_answer": false, "rationales": ["The uv rays are harmful.", "They can flash lights if you look at it too much.", "The light produces uv rays which can hurt people's eyes and skin."], "image": "train2014/COCO_train2014_000000286425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437594, "question_id": "3aa87hunAFL46XbRedwTwn", "question": "In what year did the website on her screen become a company?", "choices": ["1999", "1994", "1998", "2003"], "correct_choice_idx": 1, "direct_answers": ["1994", "1994", "1994", "1994", "2000", "1994", "1994", "1994", "2000", "1994"], "difficult_direct_answer": false, "rationales": ["This was the period of evolution in most countries.", "They started when internet got easier to obtain", "I used the internet to search for the year that yahoo was founded."], "image": "val2014/COCO_val2014_000000437594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431553, "question_id": "3aeBwMANNgD3Svjfhcw8bZ", "question": "What allows the blade to remain in place with the handle?", "choices": ["tags", "screw", "rivet", "nail"], "correct_choice_idx": 2, "direct_answers": ["rivet", "screw", "rivet", "knife holder", "bolster", "screws", "screws", "metal", "glue", "gravity"], "difficult_direct_answer": false, "rationales": ["The rivet is used two objects together to stick.", "The rivet allows the blade to stay in place.", "The rivet on the handle of the knife allows the blade to stay stuck to the blade and not get separated."], "image": "train2014/COCO_train2014_000000431553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492266, "question_id": "3anJv6R7J7xtLq8ueumwb7", "question": "How many different individuals are actually depicted here?", "choices": ["four", "one", "none", "eight"], "correct_choice_idx": 1, "direct_answers": ["one", "1 person", "one", "one", "four", "one", "three", "one", "four", "three"], "difficult_direct_answer": false, "rationales": ["The person in each shot has the exact same clothes and board in every picture.", "The person in the image is depicted in the same clothing in each instance and scene at different stages of trajectory over a jump. because of this it is the same person so only one is depicted.", "A lone snowboarder is caught in freeze frames making a jump in the snow. the snowboard was invented in the usa in 1965."], "image": "train2014/COCO_train2014_000000492266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209935, "question_id": "3anxpfB2VzsXUrCgZiUJLc", "question": "What should traffic do by the light?", "choices": ["move backwards", "go", "stop", "yield"], "correct_choice_idx": 1, "direct_answers": ["go", "go", "go", "go slow", "go", "go", "go slow", "go", "go", "go"], "difficult_direct_answer": false, "rationales": ["There is a green light which means it is safe to proceed.", "The traffic light is green. a green light is a signal that traffic may proceed.", "Traffic should continue through the green light visible at the intersection, as green denotes the right to continue through."], "image": "val2014/COCO_val2014_000000209935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52729, "question_id": "3bM8tXi5zcP6dsshMYQy8T", "question": "What tool shares the name as the sponsor on the vest?", "choices": ["wrench", "air compressor", "hammer", "screwdriver"], "correct_choice_idx": 3, "direct_answers": ["screwdriver", "screwdriver", "screwdriver", "screwdriver", "bulbs", "screwdriver", "screwdriver", "screwdriver", "philips", "bulbs"], "difficult_direct_answer": false, "rationales": ["The sponsor's name is philips. there is a star-headed tool that shares this name.", "The people are wearing vests that say philips on it. that is the same name of a philips-head screwdriver.", "A brand logo for tools can be seen on a skiers outfit."], "image": "train2014/COCO_train2014_000000052729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161762, "question_id": "3cTunpivQQPAD7RwgdvXze", "question": "Why is the umbrella like structure in the middle of the grass?", "choices": ["for shade", "to play", "to climb", "to swing"], "correct_choice_idx": 0, "direct_answers": ["shade", "shade", "shade", "for shade", "shade", "feed animals", "shade", "provide shade", "provide shade", "shade"], "difficult_direct_answer": false, "rationales": ["The umbrella blocks the harsh rays of the african sun.", "The umbrella is standing in the middle of the grass to provide shade for the animals while they eat.", "The thing is casting a shadow where the animals are eating."], "image": "train2014/COCO_train2014_000000161762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555144, "question_id": "3cVW34ubmkRKGHygfu7X45", "question": "The screen in the middle is taking the place of the what?", "choices": ["audience", "ground", "ball", "net"], "correct_choice_idx": 3, "direct_answers": ["net", "net", "virtual reality", "net", "video game", "television", "tv monitor", "net", "net", "race track"], "difficult_direct_answer": false, "rationales": ["That is used to play tennis.", "The screen is a net.", "The net should be between them."], "image": "val2014/COCO_val2014_000000555144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143952, "question_id": "3coHXXi3mpebE3dvQKF587", "question": "What is the main holding as he's walking by looking at the No 5 train?", "choices": ["lunch and", "beer can", "puppy", "bicycle"], "correct_choice_idx": 3, "direct_answers": ["bicycle", "bike", "rag", "bicycle", "bike", "bike", "bike", "bike", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["A man has one hand on a bike, pushing it, as he walks by a train.", "He is pushing a vehicle that has two wheels.", "A man is walking while balancing a bike next to him. he is turned towards a train nearby."], "image": "train2014/COCO_train2014_000000143952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573300, "question_id": "3cqFiyBWH7YKRJYssYHHR5", "question": "Which of the objects on the plate is inedible?", "choices": ["soup", "vegetables", "bread", "utensil"], "correct_choice_idx": 3, "direct_answers": ["fork", "utensil", "spoon", "fork", "spoon", "fork", "fork", "utensil", "spoon", "utensil"], "difficult_direct_answer": false, "rationales": ["The soup, bread, and vegetables are edible. the metallic object is not.", "There is a picture of some veggies, a sandwich, soup and a metal spoon.", "A fork is on the plate and is not made out of food."], "image": "val2014/COCO_val2014_000000573300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186130, "question_id": "3creLvwcoEzNqztGiUmWio", "question": "What cool treat can be found inside this cake?", "choices": ["ice cream", "ice", "nothing", "lava"], "correct_choice_idx": 0, "direct_answers": ["ice cream", "ice cream", "ice cream", "candy", "ice cream", "ice cream", "ice cream", "icecream", "ice cream", "ice cream"], "difficult_direct_answer": false, "rationales": ["The cake was bought at carvel, which is an ice cream shop that sells ice cream cakes.", "This is an ice cream cake and ice cream can be found in it.", "The box gives away the contents of the cake. there is a small cone depicting on it."], "image": "val2014/COCO_val2014_000000186130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243153, "question_id": "3cxn4W5PKKcBQpencs7m4F", "question": "How are the two women related?", "choices": ["doubles pair", "students", "cousins", "sisters"], "correct_choice_idx": 0, "direct_answers": ["doubles pair", "sisters", "teammates", "mother daughter", "doubles partners", "tennis partners", "teammates", "sisters", "friends", "doubles partners"], "difficult_direct_answer": false, "rationales": ["They seems to be pair of partners.", "The woman are playing tennis. they are on the same side of the net.", "They are playing on the same side of the court in tennis"], "image": "train2014/COCO_train2014_000000243153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273450, "question_id": "3dMPTWSLe9SgTjgfomcY39", "question": "What are they doing?", "choices": ["attacking meter", "paying meter", "reading meter", "expressing displeasure"], "correct_choice_idx": 3, "direct_answers": ["parking", "parking", "flipping meter", "expressing displeasure", "flipping off", "flipping off", "paying meter", "parking", "parking", "giving finger"], "difficult_direct_answer": false, "rationales": ["The middle finger is a universal hand gesture to express displeasure.", "A hand is seen with seen that shows the driver.", "The people are angry."], "image": "val2014/COCO_val2014_000000273450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556101, "question_id": "3dcjzH3siLZugFmEsVZbgF", "question": "What is the man attempting to do?", "choices": ["play games", "take picture", "paint picture", "play sports"], "correct_choice_idx": 1, "direct_answers": ["take picture", "take photo", "take photo", "photograph", "photograph", "take photo", "take photograph", "photography", "take photo", "photo shoot"], "difficult_direct_answer": false, "rationales": ["The man is holding a camera and looking through it.", "The way the camera is being directed and they way he is looking through it you can tel what he is doing.", "The man wants a picture."], "image": "val2014/COCO_val2014_000000556101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74466, "question_id": "3ddrxvJV4niRQoUTMZqF46", "question": "What breakage caused the lights repositioning?", "choices": ["sidwalk", "car", "limb", "light pole"], "correct_choice_idx": 3, "direct_answers": ["falling", "accident", "tornado", "broken post", "weather", "down line", "pole", "pole", "thunder", "light pole"], "difficult_direct_answer": true, "rationales": ["The breakage is in the light pole.", "The post has snapped in half and part of it is on the ground.", "It looks like the pole just broke in two. there doesn't seem to be damage to the bottom like a car ran into it."], "image": "train2014/COCO_train2014_000000074466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95674, "question_id": "3dh7KhkCAERyU3js4bjAjH", "question": "What is the most common type of pizza crust?", "choices": ["whole wheat", "cauliflower", "thin crust", "thick crust"], "correct_choice_idx": 2, "direct_answers": ["bread", "thin crust", "thin crust", "cheese", "hand tossed", "flour", "thin", "thin", "thin crust", "dough"], "difficult_direct_answer": false, "rationales": ["Traditionally pizzas are made with a somewhat thick crust.", "The bread type is traditional and one of the earliest types of pizza available.", "Besides chicago deep dish, most cities and countries are best known for pizza that has thin crust, such as new york and italy."], "image": "train2014/COCO_train2014_000000095674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288737, "question_id": "3dwxfqApkmp79gQQAmf34u", "question": "Why is he standing like that?", "choices": ["is scared", "bouncing", "stay balanced", "falling"], "correct_choice_idx": 2, "direct_answers": ["stay balanced", "balancing", "balance", "balance", "showboating", "maintain balance", "balancing", "surfing", "surfing", "balance"], "difficult_direct_answer": false, "rationales": ["The man is standing with slightly bent knees while on a surfboard. surfboard is a sport one needs to use balance to be successful at and the athletic stance is commonly used in balance sports.", "The man is surfing and wants to stay on the board.", "By standing with his knees bent and leaning slightly forward he is able to maintain balance on the surfboard so as not to fall into the water."], "image": "train2014/COCO_train2014_000000288737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207703, "question_id": "3e4SVkNLBFztQAjqqotmR4", "question": "The humanoid kite is dressed for which environment?", "choices": ["mountains", "sea", "north pole", "tundra"], "correct_choice_idx": 1, "direct_answers": ["underwater", "swimming", "ocean", "water", "sea", "summer", "underwater", "snorkeling", "water", "underwater"], "difficult_direct_answer": false, "rationales": ["The humanoid kite is dressed in flippers and a snorkel for diving.", "It is wearing flippers and goggles which are used to create vision and movement through water.", "It is wearing a swimsuit, snorkel mask and flippers."], "image": "train2014/COCO_train2014_000000207703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578649, "question_id": "3e8At8XWyB2fDY9GbFUuxp", "question": "What is the little lamb doing?", "choices": ["drinking milk", "hiding", "sleeping", "attacking"], "correct_choice_idx": 0, "direct_answers": ["drinking milk", "eating", "feeding", "nursing", "feeding", "sheep", "eating", "drinking milk", "drinking", "nursing"], "difficult_direct_answer": false, "rationales": ["The little lamb is under another animal who appears to be his mother. he thus will get nourishment by drinking milk from her.", "The little lamb is standing beneath the mother and drinking its milk.", "The lamb is young and is nursing from the teats located underneath its mother."], "image": "train2014/COCO_train2014_000000578649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64196, "question_id": "3eEQfaB9zEpFR3wKaVmCNG", "question": "What have these ladies been doing?", "choices": ["drinking", "baking", "sleeping", "watching tv"], "correct_choice_idx": 1, "direct_answers": ["baking", "making desserts", "baking icing", "baking", "baking", "baking", "baking icing", "baking", "baking", "baking"], "difficult_direct_answer": false, "rationales": ["The ladies have out a mixer, bowl, wire racks for cooling, and unfrosted cupcakes. they are currently in the process of frosting their homemade treats.", "There are mixing bowls, a mixer, drying racks and finished muffins on the counter.", "The women are baking."], "image": "val2014/COCO_val2014_000000064196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396611, "question_id": "3eER9WCEARfjKK2wZNtCm2", "question": "Where are the two old men located in?", "choices": ["train station", "ferry station", "shopping mall", "airport"], "correct_choice_idx": 3, "direct_answers": ["airport", "airport", "airport", "escalator", "airport", "airport", "airport", "escalator", "escalator", "escalator"], "difficult_direct_answer": false, "rationales": ["The men have luggage, the word concourse is seen on a sign and one of the men is dressed like a pilot.", "Both are carrying luggage and the one in front is wearing a pilot's hat and wings on his jacket.", "The two gentlemen are choosing to ride down the escalator instead of walking down a large and long set of stairs as they leave the airport after a long flight."], "image": "train2014/COCO_train2014_000000396611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338894, "question_id": "3eVFrnzkSsGDVFF4yVfb83", "question": "What sport is the video game on the monitor simulating?", "choices": ["wrestling", "hockey", "baseball", "boxing"], "correct_choice_idx": 3, "direct_answers": ["tennis", "boxing", "boxing", "boxing", "boxing", "boxing", "videogame", "boxing", "boxing", "martial arts"], "difficult_direct_answer": false, "rationales": ["A boxing ring is shown on the screen.", "A television shows a boxing ring on it.", "The characters from the game on the screen are in an arena and have large punching gloves."], "image": "train2014/COCO_train2014_000000338894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330726, "question_id": "3ebbms7VWtnyUXRJKSnuyS", "question": "What purpose does the metal object in front of the building serve?", "choices": ["toll booth", "recycling", "art display", "food stand"], "correct_choice_idx": 2, "direct_answers": ["decoration", "gift", "transport", "art", "art", "art", "artistic", "decoration", "art display", "decoration"], "difficult_direct_answer": false, "rationales": ["The sculpture is there as a decoration.", "The object is a statue and is in view for the public.", "There is a bird statue in front of the building to decorate it."], "image": "train2014/COCO_train2014_000000330726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357542, "question_id": "3ezYejGMsFrfkcUWYjkGqy", "question": "This animal is the same species as what character on Game of Thrones?", "choices": ["hot pie", "ghost", "hound", "mountain"], "correct_choice_idx": 1, "direct_answers": ["siberian husky", "canine", "ghost", "direwolf", "wolf", "wolf", "husky", "wolf", "dog", "direwolf"], "difficult_direct_answer": false, "rationales": ["A dog is with a man. ghost on game of thrones was a wolf.", "The animal is a ghost.", "The dog is beautiful. however, it is also obedient and appears in various shows."], "image": "val2014/COCO_val2014_000000357542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81749, "question_id": "3fB7oCt3bmwpYL4KVmpBCP", "question": "What is the largest city in this country by population?", "choices": ["london", "dublin", "paris", "glasgow"], "correct_choice_idx": 3, "direct_answers": ["beijing", "glasgow", "paris", "edinburgh", "london", "cant see", "midlothian", "glasgow", "half million", "glasgow"], "difficult_direct_answer": false, "rationales": ["The country is scotland, because another big city of edinburgh is seen on the sign.", "That's the most populated city in scotland.", "The colors on the train represent that location."], "image": "train2014/COCO_train2014_000000081749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169089, "question_id": "3fW4UbnpuJWhaLP6Uk2mpw", "question": "Why are they lying down?", "choices": ["to hide", "to relax", "to hunt", "to sleep"], "correct_choice_idx": 1, "direct_answers": ["resting", "boogie boarding", "having fun", "catch wave", "paddle boarding", "paddleboarding", "bodysurfing", "surfing", "resting", "to relax"], "difficult_direct_answer": true, "rationales": ["They are enjoying themselves.", "They just rode a wave into shore and are taking a minute to rest", "They are just chilling on their boards."], "image": "val2014/COCO_val2014_000000169089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550023, "question_id": "3fXxWfyV87y8sAPeTmG58j", "question": "Why does the man have a hand on the ground?", "choices": ["catch fall", "do handstand", "do cartwheel", "dig"], "correct_choice_idx": 0, "direct_answers": ["slow speed", "balance", "balance", "catch fall", "balance", "balance", "balance", "balance", "balance", "slow speed"], "difficult_direct_answer": false, "rationales": ["He is using his arm to help his balance on a turn.", "The man is trying to prevent falling.", "The hand is used to balance and reduce the likelihood of falling."], "image": "train2014/COCO_train2014_000000550023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231028, "question_id": "3fyg5228CLdLNGeiRKdd3t", "question": "Why are they off the path?", "choices": ["fighting", "confused", "buying tickets", "posing"], "correct_choice_idx": 3, "direct_answers": ["snow path", "exploring", "taking picture", "snow", "ski competition", "fresh snow", "paying", "taking photo", "posing", "for pictures"], "difficult_direct_answer": true, "rationales": ["They are having their photo taken.", "The people are posing.", "The people are striking a pose."], "image": "val2014/COCO_val2014_000000231028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270554, "question_id": "3g5uL22Y9WyQKSTAtuCakx", "question": "What is the more realistic setting for these baskets of food items?", "choices": ["home", "lemonade stand", "farmer's market", "grocery"], "correct_choice_idx": 2, "direct_answers": ["farmer's market", "farmer's market", "garden", "ocean", "stores", "farmers market", "table", "farmer's market", "famers market", "picnic"], "difficult_direct_answer": false, "rationales": ["The items are vegetables and would be at place at a farmer's market where they would be sold locally and in fresh condition.", "The foods are fruit and vegetables being displayed for purchase. they are being presented in individual baskets which are not traditionally used in grocery stores.", "Usually these are the containers people sell their vegetables in at these events, rather than purpose-built permanent shelving, etc."], "image": "val2014/COCO_val2014_000000270554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315196, "question_id": "3gG3zUoiJUd3FsJgHqUN3T", "question": "What dish is most likely to be enjoyed by the bikers parked here?", "choices": ["pizza", "slaw", "ice cream", "none"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "hamburgers", "pizza", "hamburgers", "pizza", "burgers", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["The red storefront near the multitude of parked motorcycles reads 'pizzeria'.", "People often deliver that food on their bikes.", "There is a sign that says \"pizzeria\"."], "image": "val2014/COCO_val2014_000000315196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136185, "question_id": "3gX52AFXnAHR62emB9jSaD", "question": "Where is this bookshelf located?", "choices": ["home", "courtroom", "store", "library"], "correct_choice_idx": 0, "direct_answers": ["house", "by wall", "home", "against wall", "room", "living room", "bedroom", "living room", "home", "house"], "difficult_direct_answer": false, "rationales": ["A bookshelf is in a room with a desk and personal items. homes are filled with personal items.", "The bookshelf is located at home.", "The shelf is at home."], "image": "train2014/COCO_train2014_000000136185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199119, "question_id": "3gfVAeBKfZ6Bmy6K8LmRob", "question": "How many people besides the driver ride in this bus at this time?", "choices": ["none", "one", "15", "20"], "correct_choice_idx": 0, "direct_answers": ["22", "none", "many", "unknown", "zero", "40", "zero", "few dozen", "many", "42"], "difficult_direct_answer": false, "rationales": ["There are no people.", "The bus has a sign on the front that says it is not in service. that means there are no passengers on the bus.", "The bus is not in service so there should not be any passengers."], "image": "train2014/COCO_train2014_000000199119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567287, "question_id": "3h4seTVNUJvBqfM5oZPgap", "question": "Why are the umbrellas different colors?", "choices": ["privacy", "random colors", "being colorful", "identification"], "correct_choice_idx": 2, "direct_answers": ["neon lights", "decoration", "event", "being colorful", "varied manufacturers", "glowing lights", "decoration", "phosphor", "light", "decoration"], "difficult_direct_answer": false, "rationales": ["Many umbrellas in varied colors are shown.", "There are many umbrellas lit up at night. they are giving off visually striking features of vibrant hues.", "Each person uses an umbrella that they choose and all the umbrellas are not the same color."], "image": "train2014/COCO_train2014_000000567287.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378055, "question_id": "3hDW6dofypddwx8gpEaqEN", "question": "What are the people at this event trying to help gain on behalf of medical research?", "choices": ["equality", "funding", "awareness", "rights"], "correct_choice_idx": 1, "direct_answers": ["awareness", "awareness", "breast cancer", "funding", "support", "awareness", "money", "awareness", "funding", "money"], "difficult_direct_answer": false, "rationales": ["The people are looking for funding to back their research.", "The shirt one of them is wearing state's the purpose of the event.", "One person is wearing a t-shirt regarding a rally for medical research which generally means trying to secure funding."], "image": "train2014/COCO_train2014_000000378055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285099, "question_id": "3hkwNJh6KBA4rfYL2z5osQ", "question": "Which furnishing would be easiest to move?", "choices": ["desk", "pegboard", "chair", "stool"], "correct_choice_idx": 2, "direct_answers": ["chair", "chair", "chair", "chair", "chair", "chair", "chair", "chair", "wheel chair", "chair"], "difficult_direct_answer": false, "rationales": ["The seat has wheels attached to the bottom, making it convenient and easy to move.", "A room has a desk and chair. a chair is lighter than a desk.", "Furniture on wheels can be moved with a simple push."], "image": "train2014/COCO_train2014_000000285099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44504, "question_id": "3hubjNjE9htHiFobbjncBP", "question": "What is this machine?", "choices": ["production wheel", "artwork", "retail display", "dentist wheel"], "correct_choice_idx": 1, "direct_answers": ["brushed", "ferris wheel", "artwork", "brush maker", "toothbrush machine", "ferris wheel", "toothbrush maker", "desersin", "toothbrush maker", "ferris wheel"], "difficult_direct_answer": false, "rationales": ["The machine is made to resemble a ferris wheel so it would not have any purpose other than being artwork.", "Looks like a type of ferris wheel.", "There is the same item packaged and appearing multiple times on a rack like structure. this is how things for sale are often displayed."], "image": "val2014/COCO_val2014_000000044504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29709, "question_id": "3hzju3a2tgiVMwF4QAxDd2", "question": "What type of area is shown?", "choices": ["exercise", "boarding", "waiting", "dining"], "correct_choice_idx": 3, "direct_answers": ["patio", "patio", "patio", "patio", "outside dinning", "dining area", "dining area", "dining", "outdoor cafe", "patio"], "difficult_direct_answer": false, "rationales": ["The area is for dining.", "There are tables with chairs set on a patio under umbrellas.", "The area is a patio that contains tables, chairs, and umbrellas. people could eat here."], "image": "val2014/COCO_val2014_000000029709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252610, "question_id": "3iDWVED2twwpJ6XPuZPHGk", "question": "Why is the toilet paper on top of the toilet?", "choices": ["easy access", "reduce noise", "safety", "aesthetics"], "correct_choice_idx": 0, "direct_answers": ["extra to", "extra", "to use", "easy access", "extra", "hygiene", "restock", "for accessibility", "extra", "easy use"], "difficult_direct_answer": false, "rationales": ["The paper is for easy access.", "Since it is located behing the toilet, it is in easy reach for the person needing access to it.", "The toilet paper is on the top of the toilet for ease of access."], "image": "val2014/COCO_val2014_000000252610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106978, "question_id": "3iPjW27onGxRj8QrfHKqpc", "question": "What kind of business is this street vendor engaged in?", "choices": ["selling", "entertainment", "shoe shine", "art"], "correct_choice_idx": 2, "direct_answers": ["shoe shining", "shoe cleaner", "selling shoes", "shoes", "shoe shining", "shoe shine", "sales", "shoe", "selling shoes", "selling"], "difficult_direct_answer": false, "rationales": ["The people are looking to shine shoes based on the sign.", "The street vendor has a seat for people that want their shoes shined with his many polishes.", "The man has a bunch of shoes near him as if he cleans them."], "image": "train2014/COCO_train2014_000000106978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430192, "question_id": "3ipVNsqRyRER6wMdRCdzrx", "question": "Where are the skaters located?", "choices": ["street", "store", "park", "mall"], "correct_choice_idx": 2, "direct_answers": ["skate park", "park", "park", "skate park", "skater's park", "skate park", "in ramp", "skate park", "skate park", "skatepark"], "difficult_direct_answer": false, "rationales": ["The skaters are at an outdoor skate park.", "It's outdoors and there are no cars or other traffic nearby.", "The skaters are at a park."], "image": "train2014/COCO_train2014_000000430192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105139, "question_id": "3ivs6fgUAP2dJU6CzuHufL", "question": "Where is someone probably enjoying this food?", "choices": ["party", "porch", "restaurant", "kitchen"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "diner", "diner", "restaurant", "restaurant", "restaurant", "resturant", "diner", "diner", "restaurant"], "difficult_direct_answer": false, "rationales": ["The table and dishes look commercial, plus the food was set to look very presentable.", "The person is probably eating in a restaurant because they put creamers on a small plate for their client's coffee.", "The food is neatly arranged on a plate and there are individual sized creamers."], "image": "train2014/COCO_train2014_000000105139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396905, "question_id": "3jQtWJQaTfiSSLf8JZgNCd", "question": "The individual pieces of the flooring are referred to as what?", "choices": ["bricks", "planks", "tiles", "shingles"], "correct_choice_idx": 1, "direct_answers": ["boards", "dog", "wood", "tiles", "skate", "boards", "planks", "planks", "slats", "hardwood"], "difficult_direct_answer": false, "rationales": ["This is hardwood flooring which uses planks of wood.", "They are long pieces of wood that are evenly cut and smooth for flooring", "The floor is wood."], "image": "train2014/COCO_train2014_000000396905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19484, "question_id": "3jyfFDnWUo3P3jHddRhEyV", "question": "Which one of these vegetables is used in the manufacture of the item in the cans?", "choices": ["eggplant", "tomato", "corn", "pumpkin"], "correct_choice_idx": 2, "direct_answers": ["corn", "cane sugar", "maple", "corn", "unknown", "kniives", "corn", "tomatoes", "corn", "maple"], "difficult_direct_answer": false, "rationales": ["The cans under the counter contain corn syrup that is made from corn.", "The cans on the shelf contain corn syrup which is made from corn.", "Corn helps provide filling in the tin cans."], "image": "val2014/COCO_val2014_000000019484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472041, "question_id": "3k3PsLNHTc5NYppEHigxre", "question": "To park here what must someone possess?", "choices": ["dollar bills", "coins", "nothing", "script"], "correct_choice_idx": 1, "direct_answers": ["coins", "coins", "money", "change", "money", "coins", "quarters", "coins", "money", "money"], "difficult_direct_answer": false, "rationales": ["There are meters beside the parking spots. they do not accept paper currency.", "The person parking here needs coins because he needs to feed the parking meter.", "Someone needs coins."], "image": "train2014/COCO_train2014_000000472041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378440, "question_id": "3kPTFMjmrog4g5qWoxqfe9", "question": "What food on the table has the highest level of fat?", "choices": ["bacon", "syrup", "pancake", "egg"], "correct_choice_idx": 0, "direct_answers": ["pancake", "butter", "sausage", "cheese", "pancakes", "bacon", "bacon", "bacon", "pancakes", "bacon"], "difficult_direct_answer": false, "rationales": ["The food is bacon.", "It is a more fatty food then the other items on the table.", "The men are eating bacon which is a fatty part of a pig."], "image": "train2014/COCO_train2014_000000378440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400622, "question_id": "3kgxBQUcuaBs74v8DMTDp8", "question": "What are they doing?", "choices": ["waiting", "eating", "arguing", "resting"], "correct_choice_idx": 3, "direct_answers": ["resting", "talking", "sitting", "sitting", "sitting", "resting", "sitting", "sitting", "sitting", "resting"], "difficult_direct_answer": false, "rationales": ["Their seated position shows that they are resting before the next ski run.", "The people are resting.", "The people are sitting down and smiling, they most likely are resting."], "image": "train2014/COCO_train2014_000000400622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254420, "question_id": "3kym4o9DPZ9hY4tbaXjEb9", "question": "What are the devices with screens called?", "choices": ["microwaves", "televisions", "computers", "recorders"], "correct_choice_idx": 2, "direct_answers": ["computers", "monitors", "computers", "computers", "monitors", "computers", "computers", "monitors", "desktop", "monitors"], "difficult_direct_answer": false, "rationales": ["There are keyboards near the screens. recorders, microwaves, and televisions do not use keyboards.", "Flat screens are on top of a desk behind keyboards.", "The screens are attached to keyboards and have displays that would be consistent with answer a. this also appears to be an office based on the decor and setup which would likely have computers in modern times."], "image": "train2014/COCO_train2014_000000254420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533976, "question_id": "3mAPbNACoVn3NFKZ3Q7YSy", "question": "The white part of the icing here is likely flavored with what?", "choices": ["vanilla", "potato", "peppermint", "onion"], "correct_choice_idx": 0, "direct_answers": ["sugar", "sugar", "ice cream", "sugar", "vanilla", "vanilla", "vanilla", "strawberry", "vanilla", "vanilla"], "difficult_direct_answer": false, "rationales": ["It's white so probably a more plain like vanilla.", "White icing is normally the vanilla flavor", "This is the only sweet option that would likely be used for cake, peppermint is possible but very uncommon."], "image": "val2014/COCO_val2014_000000533976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321119, "question_id": "3msqrPY2SJde3gFAC2AvdP", "question": "What is the plane pictured above doing?", "choices": ["take off", "stopped", "landing", "fuelling"], "correct_choice_idx": 0, "direct_answers": ["taking off", "take off", "ascending", "taking off", "landing", "landing", "flying", "taking off", "taking off", "taking off"], "difficult_direct_answer": false, "rationales": ["A plane is pointed up and is above a runway.", "The plane above the runway has just taken off and gaining height.", "When planes are landing, their noses are tipped downward. the nose of this plane is tipped up, indicating it is headed into the sky."], "image": "train2014/COCO_train2014_000000321119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45071, "question_id": "3mxNwn2Rp6r5NgGvEH7ktm", "question": "What is the yellow ball near the cat used for?", "choices": ["tennis", "bowling", "exercise", "basketball"], "correct_choice_idx": 2, "direct_answers": ["bowling", "toy", "exercise", "sitting", "playing", "exercising", "exercise", "cat", "exercise", "excersize"], "difficult_direct_answer": false, "rationales": ["It is used to exercise with and work out.", "The yellow ball is for exercise.", "Basketball uses orange balls, and tennis uses green balls. the ball is not the right size for bowling."], "image": "train2014/COCO_train2014_000000045071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542506, "question_id": "3n49cbmZA9Nt3Vc43CAYNj", "question": "What is the recreation depicted in the photo?", "choices": ["running", "flying kite", "working out", "dancing"], "correct_choice_idx": 1, "direct_answers": ["kite flying", "kite", "flying kite", "flying kite", "flying kite", "kite flying", "kite", "shock", "kites", "kite flying"], "difficult_direct_answer": false, "rationales": ["A person is running with a kite.", "The people fly a kite.", "The person is running in the opposite direction trying to get their kite to fly in the air."], "image": "train2014/COCO_train2014_000000542506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210857, "question_id": "3naRZY7VjAzYLqEPbk5gNe", "question": "Who is winning so far?", "choices": ["smith", "broady", "olsen", "voracova"], "correct_choice_idx": 3, "direct_answers": ["voracova", "voracova", "voracova", "opponent", "voracove/voskoboana", "voracova", "voracova", "voracova", "voracova", "voracove/voskoboana"], "difficult_direct_answer": false, "rationales": ["Only one person has a point on the board.", "The score sign says this person has 1", "The voracovas have a 1 point lead on the other team so they are currently winning"], "image": "val2014/COCO_val2014_000000210857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170439, "question_id": "3nno9dNkMr8SCUoXUtw23y", "question": "What is the man wearing?", "choices": ["wetsuit", "leggings", "swimsuit", "scuba gear"], "correct_choice_idx": 0, "direct_answers": ["wet suit", "wetsuit", "wetsuit", "surf gear", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit"], "difficult_direct_answer": false, "rationales": ["He is wearing a wet suit to protect him from the salt water and keep him warm", "The man is wearing a suit to surf in.", "The man with the surfboard is wearing a wetsuit to keep him warm while surfing in the water."], "image": "train2014/COCO_train2014_000000170439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160186, "question_id": "3nnqVKxe5nWXfRwYtb9oBD", "question": "Why is he holding the door?", "choices": ["showing off", "is hiding", "keep open", "is joking"], "correct_choice_idx": 2, "direct_answers": ["keep open", "maintain control", "opening refrigerator", "getting drink", "getting food", "joking", "hiding", "no movement", "getting something", "get food"], "difficult_direct_answer": true, "rationales": ["The man is peeking around the door and has a wry smile.", "He looks to be playing around and joking with someone.", "Any of these answers could be correct but most likely it is to keep it open."], "image": "val2014/COCO_val2014_000000160186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31796, "question_id": "3noLKpbCTTZPUji5RkBPBA", "question": "What type of creature made the white object in the salad bowl?", "choices": ["horse", "pig", "cow", "chicken"], "correct_choice_idx": 3, "direct_answers": ["cows", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "cows"], "difficult_direct_answer": false, "rationales": ["The creature is the chicken.", "They lay eggs.", "This animal lays eggs. the white object in the salad bowl is a hard boiled egg."], "image": "val2014/COCO_val2014_000000031796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281701, "question_id": "3nx5dR7c9QRkvuTbfcht3u", "question": "Why are the objects stacked?", "choices": ["save space", "make taller", "hide hole", "prevent explosion"], "correct_choice_idx": 0, "direct_answers": ["counter top", "cups", "save space", "efficiency", "clean", "ready/usable", "good fit", "save space", "cups", "bowls"], "difficult_direct_answer": false, "rationales": ["If they were laid out they would take up a lot of area.", "If they weren't stacked, they couldn't have that many dishes there.", "The bowls are stacked to save room and be easier to use them."], "image": "train2014/COCO_train2014_000000281701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166474, "question_id": "3o4rE7NHwaBkozhdyiaRHp", "question": "Who is the maker of the white shoes?", "choices": ["under armour", "nike", "adidas", "new balance"], "correct_choice_idx": 2, "direct_answers": ["reebok", "adidas", "adidas", "adidas", "converse", "adidas", "adidas", "adidas", "adidas", "adidas"], "difficult_direct_answer": false, "rationales": ["A person is wearing shoes with three stripes. the logo for adidas has three stripes.", "One can make out the three lined logo of this company.", "The shoes have three stripes on the side like many adidas shoes."], "image": "train2014/COCO_train2014_000000166474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267611, "question_id": "3oPe3pYP5skNbwAoJsnyQ8", "question": "What will the dog want to do with the frisbee seen here?", "choices": ["what frisbee", "avoid it", "catch it", "nothing"], "correct_choice_idx": 2, "direct_answers": ["catch it", "catch", "catch", "catch", "catch it", "catch it", "catch it", "catch it", "catch", "catch"], "difficult_direct_answer": false, "rationales": ["The dog is in a competition and trained well to get it.", "The dog wants to catch.", "It is a competition to see how well the dog can do this skill."], "image": "train2014/COCO_train2014_000000267611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252121, "question_id": "3oTcunTVqKLenGbB5k7zxS", "question": "Skating is which seasonal game?", "choices": ["autumn", "winter", "summer", "spring"], "correct_choice_idx": 2, "direct_answers": ["summer", "summer", "summer", "summer", "summer", "summer", "summer", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["To be honest you can skate during most every season.", "Skating is best in the summer when there is not leaves or snow on the ground.", "Skating is something that is done outdoors and is popular in the summer."], "image": "train2014/COCO_train2014_000000252121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514346, "question_id": "3ovFQSGZtwt5w2tmqoYYVz", "question": "Which one likely weighs the least?", "choices": ["blue shirt", "brown shirt", "floral shirt", "red head"], "correct_choice_idx": 0, "direct_answers": ["left", "blue shirt", "farthest left", "blue shirt", "blue shirt", "left", "left", "left girl", "right", "left"], "difficult_direct_answer": false, "rationales": ["She has the thinnest face and arms", "She is the smallest of the three and has the thinnest arms.", "Answer a appears to be the skinniest woman pictured. the relative skinniness of a person usually corresponds to their weight with skinnier people weighing less."], "image": "train2014/COCO_train2014_000000514346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488823, "question_id": "3oxP8vwmMujnXFEGxvnCuW", "question": "What method of cooking is being used in this area?", "choices": ["baking", "deep frying", "broiling", "open flame"], "correct_choice_idx": 0, "direct_answers": ["baking", "baking", "mixing", "baking", "baking", "institutional", "mixing", "mixing", "mixing", "baking"], "difficult_direct_answer": false, "rationales": ["There is an abundance of white powder through out the establishment, flour is a staple in the baking world. there is also a large mixer the man is using which is used to kneed dough.", "You can see the flour everywhere from baking.", "Most of the stuff is used for baking cakes and bread."], "image": "val2014/COCO_val2014_000000488823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153652, "question_id": "3p3LjwzTdsfJVtfmSZapBF", "question": "Which season of the HBO show The Wire focused on an area that looked like this?", "choices": ["nine", "six", "two", "eight"], "correct_choice_idx": 2, "direct_answers": ["two", "second", "no clue", "two", "first", "rain", "season 2", "summer", "bad question", "second"], "difficult_direct_answer": false, "rationales": ["The wire's season 2 was set around the port of baltimore.", "This is a coastal area. the wire only had five seasons.", "Season 2 of the wire focused on a grim beach area."], "image": "train2014/COCO_train2014_000000153652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215616, "question_id": "3pCgwUBf6ciE5igpry7xNv", "question": "What animal is walking alongside the man?", "choices": ["camel", "alpaca", "horse", "donkey"], "correct_choice_idx": 3, "direct_answers": ["donkey", "donkey", "mule", "donkey", "donkey", "donkey", "horse", "mule", "donkey", "camel"], "difficult_direct_answer": false, "rationales": ["The animal looks like a horse but smaller.", "This animal has a similar appearance to a horse or mule.", "The animal is a donkey."], "image": "train2014/COCO_train2014_000000215616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336675, "question_id": "3pFr7zEZZuCq5MpQp8CQFW", "question": "What is the body of water categorized as?", "choices": ["ocean", "pond", "river", "lake"], "correct_choice_idx": 3, "direct_answers": ["lake", "lake", "lake", "lake", "lake", "lake", "lake", "lake", "lake", "lake"], "difficult_direct_answer": false, "rationales": ["It is an enclosed body of water, not an ocean, and it is mostly still meaning it is not running like a river. also there is a sign that states exactly what it is.", "A large body of water has visible shores.", "All we have to do is read the sign to know the answer."], "image": "train2014/COCO_train2014_000000336675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55879, "question_id": "3pVXkeGRrx3sqV5jxYGBej", "question": "What are the horses doing?", "choices": ["pulling plow", "eating grass", "performing tricks", "resting"], "correct_choice_idx": 0, "direct_answers": ["plowing", "plowing field", "pulling plow", "pulling plow", "plowing field", "pulling plow", "plowing", "plowing", "plowing", "cultivating"], "difficult_direct_answer": false, "rationales": ["The horses are attached to the front of a plow by a harness. for this tool to be used effectively, the horses would be expected to move forward and in doing so would pull the plow.", "The horses are pulling the plow.", "The horses here are pulling plow because they are on a path of dirt with a man behing them minding them."], "image": "train2014/COCO_train2014_000000055879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476347, "question_id": "3pbJoSfg9Xdwazca9C9oWX", "question": "Where are they most probably swimming?", "choices": ["river", "ocean", "fountain", "pond"], "correct_choice_idx": 3, "direct_answers": ["pond", "lake", "lake", "lake", "duck", "lake", "south", "pond", "lake", "lake"], "difficult_direct_answer": false, "rationales": ["These ducks appear to be in a placid body of water, most likely being a pond.", "Geese usually swim in freshwater and this appears to be deep, but doesn't have waves or a current.", "Ducks like to sit on small and water bodies of water which are most likely found in ponds."], "image": "train2014/COCO_train2014_000000476347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380828, "question_id": "3pidPPqntDN5WZqZ7iUkr5", "question": "From which room could items used to make this chair originate?", "choices": ["library", "kitchen", "dining room", "sewing room"], "correct_choice_idx": 0, "direct_answers": ["wood/trees", "library", "library", "library", "wood/trees", "wood/trees", "library", "library", "library", "library"], "difficult_direct_answer": false, "rationales": ["The room is the library.", "The bench is made out of books.", "The items comprising the chair are books. books are frequently found in the libraries of homes."], "image": "val2014/COCO_val2014_000000380828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349459, "question_id": "3pnTgVGmvLVeZLjgHEbZiN", "question": "What country is served by this airline?", "choices": ["china", "egypt", "sweden", "netherlands"], "correct_choice_idx": 2, "direct_answers": ["sweden", "scandinavia", "sweden", "scandanavia", "sweden", "sweden", "scandinavia", "scandenavia", "scandinavia", "sweden"], "difficult_direct_answer": false, "rationales": ["There are scandinavian flags on the side of the airplane's fuselage.", "It is sweden because sas written on the back of the plane is scandinavian airlines", "This is an airline from sweden"], "image": "val2014/COCO_val2014_000000349459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323853, "question_id": "3pycLbAcwiiZf9ZcqN6Pmb", "question": "Why are there black spots on the apples?", "choices": ["fresh", "painted", "stained", "rotting"], "correct_choice_idx": 3, "direct_answers": ["eaten", "bugs", "pestilence", "wormholes", "sooty blotch", "worm holes", "rotting", "worms", "rotten", "bugs"], "difficult_direct_answer": true, "rationales": ["They have been on the tree too long and are getting moldy.", "The spots are rotting.", "When black spots appear on apples when they have not been picked it means that they are going bad."], "image": "val2014/COCO_val2014_000000323853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303603, "question_id": "3q4HNjmyFCXtHpY3nwT6J8", "question": "Which thing here is the highest?", "choices": ["train", "boat", "airplane", "car"], "correct_choice_idx": 2, "direct_answers": ["plane", "airplane", "airplane", "plane", "plane", "airplane", "plane", "plane", "plane", "airplane"], "difficult_direct_answer": false, "rationales": ["The airplane is flying through the air and there's nothing else in the air at the moment, so the airplane is definitely the highest object.", "Boats and people can't fly.", "The aircraft is flying above everything else."], "image": "val2014/COCO_val2014_000000303603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209274, "question_id": "3qFTJm6YJSPtxWQosisjWN", "question": "What food is the same color as the largest portion of this vehicle?", "choices": ["corn", "lemon", "cherry", "spinach"], "correct_choice_idx": 3, "direct_answers": ["spinach", "apple", "grapes", "lime", "brocolli", "broccoli", "apple", "kales", "broccoli", "salad"], "difficult_direct_answer": false, "rationales": ["Spinach is green in color.", "It is a green vegetable. the train cars are green.", "The train is a bright green which is the same color as spinach."], "image": "val2014/COCO_val2014_000000209274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271900, "question_id": "3qUaxWTd5JvSUTgGVpxsdW", "question": "Black head goats have similar sex organs to?", "choices": ["sheep dogs", "mountain cats", "roosters", "human females"], "correct_choice_idx": 3, "direct_answers": ["i do", "other goats", "human females", "pigs", "sheep", "sheep", "white goats", "males", "does", "sheep"], "difficult_direct_answer": false, "rationales": ["The black heads indicate that they're girls.", "Black head goats have ovaries and breasts. their breasts have similar tissue to humans, and are sought after for genetic breeding purpose.", "The goats have the same as humans."], "image": "val2014/COCO_val2014_000000271900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229180, "question_id": "3qydN9Q2iEHL69Wct8AYVG", "question": "Why is she doing this?", "choices": ["is hungry", "showing off", "for photo", "is traveling"], "correct_choice_idx": 2, "direct_answers": ["eating", "eating", "eating", "hungry", "hungry", "eating", "humour", "eating banana", "hungry", "for photo"], "difficult_direct_answer": false, "rationales": ["She is wearing the sunglasses to look cool for the picture.", "The girl has sunglasses on and is staring forward.", "The girl is posing."], "image": "train2014/COCO_train2014_000000229180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517382, "question_id": "3r8qDmgaaSPg4TMycizVXx", "question": "What session of the day is it?", "choices": ["midnight", "evening", "morning", "afternoon"], "correct_choice_idx": 0, "direct_answers": ["birthday", "night", "game time", "night", "game play", "night", "night", "night", "midnight", "evening"], "difficult_direct_answer": false, "rationales": ["The session takes place at midnight because the clock on the wall says it is close to 12:00.", "It's midnight out.", "There is a visible clock in the background and based on the darkness outside and the lights on inside, option a is the closest fit."], "image": "train2014/COCO_train2014_000000517382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107009, "question_id": "3rxgLpGZj2q4GKdAfTkhis", "question": "What ingredient gives you the most fat?", "choices": ["cream", "sugar", "chocolate", "flour"], "correct_choice_idx": 0, "direct_answers": ["cream", "doughnut", "carbs", "eclair", "cream", "sugar", "sugar", "sweet", "cream", "butter"], "difficult_direct_answer": false, "rationales": ["The white thick consistency has milk, eggs and sugar that is highly fattening.", "Donuts are on a tray and one has cream in the center. cream is high in fat.", "Because it is unhealthy when its consumed in large portions."], "image": "train2014/COCO_train2014_000000107009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280972, "question_id": "3rzQDDnZwGZXiSS7Pj5WZ4", "question": "Why is he flushing with his foot?", "choices": ["touchless", "showing off", "handless", "exercise"], "correct_choice_idx": 0, "direct_answers": ["germs", "hygiene", "cleanliness", "germs", "touchless", "cleaning toilet", "not touch", "germ phobia", "sanitary", "pee"], "difficult_direct_answer": true, "rationales": ["He doesn't want to put his hand on the button because of germs", "There is much bacteria in bathroom. it makes sense that a person would not want to touch the button.", "He is doing that so he does not get his hands dirty"], "image": "val2014/COCO_val2014_000000280972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78314, "question_id": "3s9WgjkfaSJM5xrqnDYaaC", "question": "What type of tires are on this truck?", "choices": ["small", "monster size", "medium", "standard"], "correct_choice_idx": 1, "direct_answers": ["all terrain", "military", "big", "off road", "lorry type", "big tires", "motorcycle tires", "all terrain", "big", "monster size"], "difficult_direct_answer": false, "rationales": ["The tires are huge.", "Extra large tires are on a truck.", "The truck is raised off the ground by a bit, so it is obvious they are bigger than normal."], "image": "train2014/COCO_train2014_000000078314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244768, "question_id": "3sZxfkj32d5n9mdCGwL2nv", "question": "What's the name for the trippy picture behind the woman?", "choices": ["optical illusion", "sight gag", "mind poster", "visual puzzle"], "correct_choice_idx": 0, "direct_answers": ["optical illusion", "optical illusion", "illusion", "psychedelia", "3d art", "optical illusion", "psychedelic", "pop art", "optical illusion", "psychedelic image"], "difficult_direct_answer": false, "rationales": ["The wallpaper behind this woman creates an illusion of depth due to square arrangement and placement. this would be classified as an optical illusion.", "The picture is abstract and geometric with vivid colors and appears somewhat 3d.", "The picture on the wall is designed to give the illusion of looking like something it is not."], "image": "train2014/COCO_train2014_000000244768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576780, "question_id": "3ssz3N8HCCzaBW8KQvuSuM", "question": "Where is this fruit stand?", "choices": ["australia", "india", "ireland", "asia"], "correct_choice_idx": 3, "direct_answers": ["asia", "in front", "downtown", "street", "on street", "america", "street", "korea", "road", "outdoors"], "difficult_direct_answer": true, "rationales": ["The signs are in one of the languages from this continent", "Some of the writing on the signs is in asian.", "There are indian words on the boxes of fruit."], "image": "train2014/COCO_train2014_000000576780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218578, "question_id": "3sx2t2rgcKs5SUkssAp9hn", "question": "What type of road is behind the person who took this picture?", "choices": ["two way", "one way", "on ramp", "none"], "correct_choice_idx": 0, "direct_answers": ["one way", "one way", "one way", "one way", "one way", "one way", "two way", "one way", "one way", "one way"], "difficult_direct_answer": false, "rationales": ["The one way starts at the sign, so it must be two lanes elsewhere.", "There is a white sign that says you can't go the other way.", "The sign means that the directions have changed. they are about to become \"one way\" so the road behind the person who took the picture has to be different."], "image": "train2014/COCO_train2014_000000218578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175978, "question_id": "3t52GqmKKLPMNDnBpvzbAJ", "question": "What is definitely not allowed here?", "choices": ["texting", "eating", "smoking", "crying"], "correct_choice_idx": 2, "direct_answers": ["smoking", "dog poop", "smoking", "smoking", "pets", "food fight", "smoking", "smoking", "smoking", "smoking"], "difficult_direct_answer": false, "rationales": ["A no smoking sign is on a wall in a restaurant.", "A drawing of a lit cigarette on the upper left has a diagonal line through it, which of course means, \"no lighting up in here, please\". smoking is no longer permitted in a lot of restaurants around the world.", "A kid is sitting at a restaurant and doesn't want someone to blow stuff at them. it isn't respectful."], "image": "train2014/COCO_train2014_000000175978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524679, "question_id": "3tJrHsiwKRmFwZdUbq9VcS", "question": "Why is the man on the tall bike?", "choices": ["confused", "entertainment", "being chased", "exercise"], "correct_choice_idx": 1, "direct_answers": ["entertainment", "showing off", "fun", "fun", "for fun", "comedian", "entertainment", "for attention", "riding", "entertainment"], "difficult_direct_answer": false, "rationales": ["He is dressed in costume which means he wants to give people a laugh.", "It is unusual to see a nun on an extra tall bicycle.", "The bike and man's costume are very impractical. furthermore, the bus is a tour bus."], "image": "train2014/COCO_train2014_000000524679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418893, "question_id": "3tPSXe2iheA9xKjNvYyohx", "question": "In what location would you have the most fun with the toy shown?", "choices": ["aloft outside", "parked car", "kitchen", "bedroom"], "correct_choice_idx": 0, "direct_answers": ["park", "park", "vacation place", "aloft outside", "park", "beach", "ice cream", "beach", "outside", "park"], "difficult_direct_answer": false, "rationales": ["The toy is a kite. in order to play with it there needs to be wind which would not be found in an indoor area.", "The location is aloft.", "They would have fun with the kite outside."], "image": "val2014/COCO_val2014_000000418893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494637, "question_id": "3tPzYdomhP2Dro8LAmU7Hc", "question": "What is the black headgear on the horses made from?", "choices": ["leather", "cotton", "wood", "plastic"], "correct_choice_idx": 0, "direct_answers": ["leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather"], "difficult_direct_answer": false, "rationales": ["The black headgear on the horses is made from leather because it is strong and flexible.", "The black headgear are horse bridles. horse bridles are made of this strong, lasting material in addition to their metal components.", "Because of the durability of leather it's mostly used with horses."], "image": "train2014/COCO_train2014_000000494637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105120, "question_id": "3tUC7FfTbSuJzaQChzGh3p", "question": "What does the orange fabric carrot next to the cat contain?", "choices": ["plastic beads", "catnip", "cotton stuffing", "wool stuffing"], "correct_choice_idx": 1, "direct_answers": ["stuffing", "catnip", "catnip", "catnip", "catnip", "catnip", "antioxidants", "stuffing", "vitamin", "catnip"], "difficult_direct_answer": false, "rationales": ["The orange toy next to the cat contains catnip so the cat gets more playful.", "A cat is playing with a toy. cat toys often have catnip in their toys.", "Generally cats don't eat plants, so it would be easy to surmise that the carrot is fake and is packed with catnip."], "image": "val2014/COCO_val2014_000000105120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67015, "question_id": "3tX4fX57VPm5kqgXnTgWvg", "question": "How many bears are there?", "choices": ["two", "twenty", "three", "none"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is one bear in one chair and a smaller bear in the chair to the right.", "There is a old and young bear.", "A big bear is sitting beside a smaller bear."], "image": "val2014/COCO_val2014_000000067015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1592, "question_id": "3tYnUkQjshUuaXX3vczYPa", "question": "How is this girl feeling?", "choices": ["sassy", "bored", "sad", "tired"], "correct_choice_idx": 0, "direct_answers": ["happy", "happy", "feisty", "happy", "sassy", "happy", "happy", "happy", "happy", "feisty"], "difficult_direct_answer": false, "rationales": ["Her leg up in the air is a clear sign that she can't possibly be sad, bored or tired.", "The little girl is acting like a movie star...all sassy and cute.", "The girl is smiling and has a smug like body language."], "image": "val2014/COCO_val2014_000000001592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332003, "question_id": "3tmLovfyFzGGPLftnxEjas", "question": "What is a good word to describe all of these people?", "choices": ["toddlers", "women", "bespectacled", "senior citizens"], "correct_choice_idx": 2, "direct_answers": ["caucasian", "bespectacled", "family", "happy", "friends", "friends", "happy", "siblings", "friends", "happy"], "difficult_direct_answer": false, "rationales": ["We could call them bespectacled because they are all wearing glasses.", "Each person in the picture together has a pair of glasses on their face. these are called bispectacles.", "They all are wearing glasses on their face."], "image": "train2014/COCO_train2014_000000332003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7050, "question_id": "3tw8tVegEKauSi6dZ8EuqD", "question": "What do the blue vests being done help the boaters with in case of tipping over?", "choices": ["balance", "visibility", "buoyancy", "signaling"], "correct_choice_idx": 2, "direct_answers": ["drowning", "paddling", "life jackets", "keeps afloat", "floating", "buoyancy", "flotation", "floating", "floating", "float"], "difficult_direct_answer": false, "rationales": ["The blue life vests will help keep boaters floating in the water should their boat tip over and throw them out.", "The vests are lifejackets that are worn by the boaters so that they will float if they fall out of the boat. it protects them from drowning.", "These blue vests provide buoyancy in emergency purposes."], "image": "val2014/COCO_val2014_000000007050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159402, "question_id": "3twFunfTRRJjfBwWsX2YMZ", "question": "What is this type of train car called?", "choices": ["caboose", "passenger", "cargo", "sleeper"], "correct_choice_idx": 2, "direct_answers": ["suitcase", "passenger", "boxcar", "cargo train", "luggage", "freight", "cargo", "work", "cargo", "passenger"], "difficult_direct_answer": false, "rationales": ["The people are putting their luggage in this train car. the passengers ride and sleep in other cars.", "The inside of this train does not have any seats or mechanical engine parts. it has quite a few bags so it is probably use for all types of cargo.", "The train car is for cargo."], "image": "val2014/COCO_val2014_000000159402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63865, "question_id": "3u7thAGfR8LTnFXCd9QVA7", "question": "Which bird's product can be found here?", "choices": ["ostrich", "canary", "chicken", "quail"], "correct_choice_idx": 2, "direct_answers": ["chicken", "egg", "food", "egg", "eggs", "eggs", "edge", "chicken", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["There are slices of chicken eggs on the pizza.", "There are chicken eggs on the pizza.", "The eggs are produced by this type of bird."], "image": "train2014/COCO_train2014_000000063865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224961, "question_id": "3uT4xVqkqya84nKMDWJBVw", "question": "New Orleans is inventor of what?", "choices": ["beverages", "soft drinks", "coffee", "cocktail"], "correct_choice_idx": 3, "direct_answers": ["jazz", "beignets", "jazz music", "jazz", "good", "sazerac", "jazz", "cocktail", "jazz", "jazz"], "difficult_direct_answer": false, "rationales": ["There is an unverified claim it was this product", "New orleans is credited with inventing the sazerac, which uses peychaud's bitters.", "This type of drink was invented in the french quarter's carousel bar."], "image": "train2014/COCO_train2014_000000224961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30049, "question_id": "3ui5ypx4nMLkTag4e6t54a", "question": "What does the photographer stand on to take this photo?", "choices": ["glider", "island", "motor boat", "bank"], "correct_choice_idx": 2, "direct_answers": ["boat", "boat", "boat", "boat", "boat", "boat", "boat", "motor boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["He would be on the boat that is pulling the person in the water.", "The photo is of a water skiier being pulled by a rope. the photographer is standing at the opposite end of the rope from the water skiier.", "A person is standing at the edge of the boat and taking a picture. he is water skiing back and forth."], "image": "val2014/COCO_val2014_000000030049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570801, "question_id": "3uj5F7atZMZZ5pPPkDDz47", "question": "What genetic order does the pet seen here belong to?", "choices": ["rodentia", "snake", "canine", "ruminant"], "correct_choice_idx": 0, "direct_answers": ["rodentia", "rat", "hairless", "chordata", "naked molerat", "rat", "carnivora", "rat", "rat", "ms"], "difficult_direct_answer": false, "rationales": ["The pet appears to be a rat or mouse which are both types of rodents and belong to the order of answer a.", "The pet is not a dog, cow, or snake.", "The order is rodentia."], "image": "train2014/COCO_train2014_000000570801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180787, "question_id": "3v267FwnQoPjwTm2sPrXxN", "question": "What is being held on the curved metal structure?", "choices": ["bread", "vegetable", "dough", "meat"], "correct_choice_idx": 3, "direct_answers": ["leg", "leg", "meat", "meat", "meat", "meat", "meat", "beef", "meat", "fish"], "difficult_direct_answer": false, "rationales": ["A fish is on the structure. option a describes the flesh of an animal.", "There is a large piece of cooked meat being held on the metal structure.", "A large slab of meat is on a structure on a table next to a chef. chefs often cook meat."], "image": "val2014/COCO_val2014_000000180787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358185, "question_id": "3v6kLeA3PVd2wbcfaYcTcK", "question": "Why do they all have laptops?", "choices": ["working", "distracted", "selling them", "trying out"], "correct_choice_idx": 0, "direct_answers": ["working", "study", "working", "commuting workers", "school", "work", "study", "working", "working", "reading"], "difficult_direct_answer": false, "rationales": ["The person is concentrating as they are looking at the computer.", "Most people bring there laptops everywhere do catch up on there work.", "The people are working."], "image": "train2014/COCO_train2014_000000358185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14966, "question_id": "3vesHnfJ8KYh9XRXHvBrcZ", "question": "Tropical climate is suits for which tree?", "choices": ["coco bean", "coconut", "maple", "palm"], "correct_choice_idx": 3, "direct_answers": ["palm", "palm", "palm", "palm tree", "palm tree", "palm", "palm", "palm", "palm tree", "palm"], "difficult_direct_answer": false, "rationales": ["Palm trees like the warmer weather.", "Palm trees are one of the most grown trees in a tropical climate.", "Palm trees are growing."], "image": "train2014/COCO_train2014_000000014966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227769, "question_id": "3wFqwBsCzxLFwMoUgJhsTq", "question": "What item on the kitchen counter is used for cutting foods such as fruits and vegetables?", "choices": ["toaster oven", "cutting board", "counter", "pan"], "correct_choice_idx": 1, "direct_answers": ["knife", "chopper", "knife", "cutting board", "knife", "cutting board", "knife", "knife", "kitchen", "cutting board"], "difficult_direct_answer": false, "rationales": ["It is a piece of wood that a knife can safely be used on", "A cutting board is there to use to chop things on.", "There is a rectangular wooden item near the sink that protects the counter when cutting fruits and vegetables."], "image": "train2014/COCO_train2014_000000227769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479727, "question_id": "3wKpA4knLZx99Pj3h2NfR4", "question": "Where is the plant that is depicted on the sign usually found?", "choices": ["desert", "tropics", "arctic", "rainforest"], "correct_choice_idx": 0, "direct_answers": ["desert", "tree", "tree", "desert", "desert", "desert", "desert", "desert", "desert", "desert"], "difficult_direct_answer": false, "rationales": ["A cactus is on a sign. cactuses grow in deserts.", "The plant is used to give out water in the dessert.", "A cactus is always in the desert."], "image": "val2014/COCO_val2014_000000479727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446807, "question_id": "3wnxLamP2dAZexahAufqz8", "question": "What is on the floor?", "choices": ["chair", "urinal", "drain", "rug"], "correct_choice_idx": 2, "direct_answers": ["tiles", "tile", "tile", "tile", "floor tile", "tile", "drain hole", "tiles", "drain", "tile"], "difficult_direct_answer": false, "rationales": ["There is a small metal grate in the floor that allows water to drain out to prevent flooding.", "A drain is on the floor for water to go down it", "One can see the grill where water will flow downwards and out of the building."], "image": "train2014/COCO_train2014_000000446807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557251, "question_id": "3wzPH2jW4HeRanWYNFvhih", "question": "What type of sport is this?", "choices": ["aquatic", "team", "winter", "tropical"], "correct_choice_idx": 2, "direct_answers": ["skiing", "winter", "backpacking", "ski", "skiing", "cross-country skiing", "cross-country skiing", "skiing", "winter", "skiing"], "difficult_direct_answer": false, "rationales": ["Because they're outdoors and there's snow on the ground.", "The sport is a winter one.", "This is a winter ski sport."], "image": "train2014/COCO_train2014_000000557251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68778, "question_id": "3x4jCQuLkhRJ8pg2fh38su", "question": "What time of day is the woman snowboarding?", "choices": ["morning", "day", "night", "afternoon"], "correct_choice_idx": 2, "direct_answers": ["night", "night", "night", "night", "night", "night", "night", "night", "night", "night"], "difficult_direct_answer": false, "rationales": ["It is dark outdoors with the exception of some lights.", "It is dark out.", "A woman is snowboarding and it is dark out. it is dark at night."], "image": "train2014/COCO_train2014_000000068778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464444, "question_id": "3x6JoPQeLrnUfhMtBghMjs", "question": "Which person has a legendary story involving this animal?", "choices": ["lady godiva", "paul wight", "stephen hawking", "tim cook"], "correct_choice_idx": 0, "direct_answers": ["paul revere", "black stallion", "cowboys", "john wayne", "john wayne", "zeus", "hidalgo", "lady godiva", "pocahontas", "horseback rider"], "difficult_direct_answer": true, "rationales": ["Horses are walking on a beach. lady godiva is associated with horses.", "These are horses and that lady rode a horse with no clothes on - the lady, not the horse", "There is a legend about horses involving lady godiva."], "image": "train2014/COCO_train2014_000000464444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581821, "question_id": "3xegKqMWH9smzsW6zZ4fj3", "question": "How are the objects on the shelf near the window arranged?", "choices": ["by author", "by smell", "by color", "by size"], "correct_choice_idx": 2, "direct_answers": ["by color", "symmetrically", "by color", "stacked", "five", "vertically", "color", "by color", "straight", "shelves"], "difficult_direct_answer": false, "rationales": ["Each shelf has one specific color book on it.", "Each row is the same color.", "The objects are arranged by color."], "image": "train2014/COCO_train2014_000000581821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233961, "question_id": "3xhvCjYag6hk2qDRaU6TEF", "question": "What type of activity are the people participating in?", "choices": ["water gliding", "river running", "surfing", "river surfing"], "correct_choice_idx": 3, "direct_answers": ["surfing", "surfing", "river surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["By the water that is shown and what they are wearing you can tell what they are doing.", "These people are riding the waves on a river.", "The activity is river surfing."], "image": "val2014/COCO_val2014_000000233961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143559, "question_id": "3xjPsHmiK9Tgs3pp4Kq2LS", "question": "Why is the girl in black extending her arm?", "choices": ["to throw", "to dodge", "to catch", "to roll"], "correct_choice_idx": 0, "direct_answers": ["throwing", "catch", "throwing", "throwing frisbee", "throwing", "catch frisbee", "throwing", "to throw", "catch frisbee", "throwing frisbee"], "difficult_direct_answer": false, "rationales": ["There is a flying disc near the girl in black. it is moving away from her.", "She is throwing the frisbee.", "The girl is playing ultimate frisbee. the frisbee is moving away from her."], "image": "train2014/COCO_train2014_000000143559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527899, "question_id": "3y5omHmQCZvnGrNgxjsPBw", "question": "Where are the three walking?", "choices": ["zoo", "neighborhood", "park", "mall"], "correct_choice_idx": 1, "direct_answers": ["road", "neighborhood", "street", "street", "home", "neighborhood", "street", "street", "neighborhood", "street"], "difficult_direct_answer": false, "rationales": ["The people are in a neighborhood.", "The three people are walking in a neighborhood.", "A bunch of residential buildings can be seen in the area."], "image": "train2014/COCO_train2014_000000527899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534735, "question_id": "3y8aNKczUgffUP3ZAkgAv2", "question": "What is one of the favorite food of this person?", "choices": ["korean food", "chinese food", "mediterranean food", "japanese food"], "correct_choice_idx": 1, "direct_answers": ["chinese", "chinese food", "chinese food", "chinese", "chinese food", "chinese food", "chinese", "chinese food", "mexican", "chinese food"], "difficult_direct_answer": false, "rationales": ["You can tell by the business card as to what is he favorite type of cuisine.", "A menu with script from china can be seen.", "The only food related item is the chinese food business card."], "image": "train2014/COCO_train2014_000000534735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276069, "question_id": "3y9Uf4bcAFWD4FpPDcpeTp", "question": "What is the cookie in the shape of?", "choices": ["windmill", "apple", "cat", "baby"], "correct_choice_idx": 0, "direct_answers": ["windmill", "windmill", "windmill", "windmill", "barn", "windmill", "windmill", "windmills", "windmill", "windmill"], "difficult_direct_answer": false, "rationales": ["The cookie has four blades that are attached to a building.", "The cookie is like a windmill.", "The cookie is shaped like a tower seen on a farm with a large fan on it."], "image": "train2014/COCO_train2014_000000276069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62128, "question_id": "3yCipuNbKVPMTnQS78T2Lp", "question": "Looking at the man in the black shirt what are his pants made of?", "choices": ["denim", "pleather", "plastic", "leather"], "correct_choice_idx": 0, "direct_answers": ["denim", "denim", "denim", "denim", "denim", "denim", "denim", "denim", "denim", "denim"], "difficult_direct_answer": false, "rationales": ["He is wearing jeans.", "These are jeans", "The jeans could be made of anything but if you look closely it's denim jeans."], "image": "train2014/COCO_train2014_000000062128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213181, "question_id": "3yUKJ24dFYiPefkHpvvccY", "question": "How is the man propelled forward?", "choices": ["ski poles", "cable", "gravity", "he isn't"], "correct_choice_idx": 1, "direct_answers": ["skiis", "skis", "ski lift", "ski line", "ski's", "skis poles", "roop", "skis", "cable", "skis"], "difficult_direct_answer": false, "rationales": ["There is no vehicle shown, and the person is holding the cable.", "A man is using a long black wire to pull his body up the hill.", "There is a visible horizontal fabric, which the man is holding onto while on flat ground. this is used in skiing environments for kinetic motion."], "image": "train2014/COCO_train2014_000000213181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308278, "question_id": "3yUoP5efh6mi3NeVn3Bevq", "question": "Which direction do the riders of this lift go?", "choices": ["up", "down", "east", "back"], "correct_choice_idx": 0, "direct_answers": ["up", "right", "east", "upwards", "upward", "up", "up", "up", "up", "up"], "difficult_direct_answer": false, "rationales": ["The riders go up.", "A ski lift takes you to the top of the hill and then you ski down to get to the bottom.", "In order to ski or snowboard down the hill multiple times, they must be taken to the top of the hill each time."], "image": "val2014/COCO_val2014_000000308278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243829, "question_id": "3yXQa9Th2jNULBowbm7dLr", "question": "What does the girl want to pet?", "choices": ["foxes", "snakes", "sheep", "chickens"], "correct_choice_idx": 2, "direct_answers": ["sheep", "sheep", "sheep", "sheep", "sheep", "goat", "goat", "sheep", "goat", "sheep"], "difficult_direct_answer": false, "rationales": ["The sheep are only pet in the picture.", "A girl is looking at sheep in a pasture.", "She is gazing at white small grazing animals across the street."], "image": "train2014/COCO_train2014_000000243829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579725, "question_id": "3yfWbtYeqrGJgdEEbKour2", "question": "What activity might be done in this outdoor area?", "choices": ["funeral", "baking", "singing", "smoking"], "correct_choice_idx": 3, "direct_answers": ["smoking", "smoke cigarettes", "smoking", "eating", "eating", "smoking", "smoking", "smoking cigarettes", "smoking", "smoking"], "difficult_direct_answer": false, "rationales": ["There is an ashtray on the table which would be used for ashes and cigarettes when smoking.", "There is an ashtray visible.", "The round clear item is an ashtray. it has grooves in the top for holding a cigarette."], "image": "train2014/COCO_train2014_000000579725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470002, "question_id": "3yvzPbwXkAcCmcnL5htSPV", "question": "Why is the giraffe's head near the basket?", "choices": ["to exercise", "to drink", "to eat", "to play"], "correct_choice_idx": 2, "direct_answers": ["to eat", "eating", "eat", "eating", "eating", "to eat", "eating", "eating", "to eat", "it's eating"], "difficult_direct_answer": false, "rationales": ["The giraffe is aiming its mouth at some food.", "The basket contains food for the giraffe.", "The head is near to eat."], "image": "train2014/COCO_train2014_000000470002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412455, "question_id": "3yyhz6wVb77aqpAH9knaP2", "question": "The red color in the fire hydrant indicates what factor?", "choices": ["force", "limit", "speed", "quality"], "correct_choice_idx": 0, "direct_answers": ["alarm fire", "fire", "force", "for fires", "fire", "don't park", "water", "usa", "flow rate", "in use"], "difficult_direct_answer": true, "rationales": ["Fire hydrants are colored according to force of the hydrant.", "The fire hydrant indicates the force in which water is coming through.", "The force of it."], "image": "val2014/COCO_val2014_000000412455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152657, "question_id": "3z2F2ueobUCgq5FLcFn27n", "question": "What is the object with a hose connected to it on the table in front of the tv?", "choices": ["remote", "geyser", "hookah", "controller"], "correct_choice_idx": 2, "direct_answers": ["hookah", "hookah", "she'sa", "hookah", "hookah", "she'sa", "hookah", "hookah", "hookah", "hookah"], "difficult_direct_answer": false, "rationales": ["The object on the table is a hookah and the hoses are used for smoking tobacco.", "There is a tall blue indian glass smoking pipe on the table.", "A hookah is on a table in a living room. hookahs have hoses."], "image": "train2014/COCO_train2014_000000152657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238827, "question_id": "3zGpo2qyVj6ohS6hYvEFNe", "question": "What is number seven attempting to do?", "choices": ["catch ball", "hit ball", "throw ball", "run bases"], "correct_choice_idx": 3, "direct_answers": ["receive", "steal base", "receive", "steal base", "steal base", "steal base", "steal base", "run bases", "receive", "steal base"], "difficult_direct_answer": false, "rationales": ["The number 7 is positioning to run to the next base.", "Number 7 is running from one base to another.", "The person wants to run bases."], "image": "train2014/COCO_train2014_000000238827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535750, "question_id": "3zQxoGpsBGi2o6CpZZrgYU", "question": "What country is associated with the tan hat the man is wearing?", "choices": ["china", "russia", "ethiopia", "france"], "correct_choice_idx": 0, "direct_answers": ["china", "vietnam", "china", "china", "china", "china", "vietnam", "vietnam", "china", "china"], "difficult_direct_answer": false, "rationales": ["The man is wearing a tan hat that is often seen worn by chinese farmers to keep the sun off them.", "Straw pointy hats are popular in china.", "You see a lot of people in china wearing straw hats."], "image": "val2014/COCO_val2014_000000535750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51928, "question_id": "3zXwvzEkqE6LUGTFghYURe", "question": "What material are these fluffy animals made of?", "choices": ["wool", "pic", "denim", "cotton"], "correct_choice_idx": 0, "direct_answers": ["cotton", "velour", "wool", "cotton", "velour", "cotton", "cotton", "velour", "cotton", "cotton"], "difficult_direct_answer": false, "rationales": ["The first ones were made from this.", "They are soft and made from wool.", "Children are holding teddy bears."], "image": "val2014/COCO_val2014_000000051928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229949, "question_id": "3zaHGBxc6XfEHDUzru3LiT", "question": "What sort of power moves this vehicle?", "choices": ["electric", "diesel", "gas", "coal"], "correct_choice_idx": 0, "direct_answers": ["electricity", "electric", "electricity", "electric", "electric", "electric", "wires", "diesel power", "electric", "electricity"], "difficult_direct_answer": false, "rationales": ["You can see the power lines above it", "You can see all the wires criss-crossed over the train.", "There are wires connected to the car."], "image": "train2014/COCO_train2014_000000229949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168355, "question_id": "3zoj8QnetKhHwrvgzUChg4", "question": "What is being sold in this shop?", "choices": ["sand", "spices", "perfume", "dye"], "correct_choice_idx": 1, "direct_answers": ["grocery things", "spices", "spices", "spices", "spices", "spices", "spices", "spices", "grocery things", "spices"], "difficult_direct_answer": false, "rationales": ["A man is sitting and grinning as he watches over his mixes. they can be used to add taste to food.", "A man is sitting in front of large containers of spices. personal use of spices does not require large bowls of each spice.", "You can see all the spices in the bowls in front of him."], "image": "val2014/COCO_val2014_000000168355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133620, "question_id": "3zpCERvdw3ELaaqNrNiZsy", "question": "Why is the woman wearing a hat?", "choices": ["fashion", "warmth", "costume", "uniform"], "correct_choice_idx": 3, "direct_answers": ["uniform", "job", "uniform", "uniform", "constable", "attire", "uniform", "uniform", "uniform", "protection"], "difficult_direct_answer": false, "rationales": ["The woman is wearing the white hat as part of her official uniform.", "A girl is in a white hat with a blue, formal coat with emblems on it.", "The woman is wearing an official suit and hat with emblems."], "image": "val2014/COCO_val2014_000000133620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251042, "question_id": "3ztW6SbvEM7shNgQN6P3tP", "question": "For what reason is the man carrying the long object tucked between his arms?", "choices": ["self-defense", "reach", "animal control", "visibility"], "correct_choice_idx": 2, "direct_answers": ["guide elephant", "prodding", "control elephant", "balance", "animal control", "hit animal", "leading elephant", "guide", "herd elephants", "control"], "difficult_direct_answer": true, "rationales": ["They use it to control the animal and make it do what they want.", "The man is part of animal control.", "The man is carrying the stick to steer the elephant."], "image": "val2014/COCO_val2014_000000251042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465996, "question_id": "424kcSWiZXbRBUhtvYTcWv", "question": "What is the lined white area near the lights for?", "choices": ["parking", "crossing", "biking", "skating"], "correct_choice_idx": 1, "direct_answers": ["crosswalk", "crosswalk", "crosswalk", "pedestrian cross", "crosswalk", "crossing", "crossing", "crossing", "crosswalk", "crosswalk"], "difficult_direct_answer": false, "rationales": ["Even in the dark of night, the reflective white stripes of the crosswalk are visible.", "A crosswalk is shown by painting lines such as this at an intersection from one sidewalk to the other.", "Straight white lines are painted on a road near an intersection. busy intersections have crosswalks."], "image": "val2014/COCO_val2014_000000465996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360189, "question_id": "424nvcudcsGWRLzS4BV7cN", "question": "When can breakfast be eaten here?", "choices": ["early morning", "morning", "noon", "all day"], "correct_choice_idx": 3, "direct_answers": ["all day", "all day", "eggs", "all day", "all day", "all day", "all day", "all day", "all day", "all day"], "difficult_direct_answer": false, "rationales": ["The sign says so.", "The sign literally says breakfast all day, which indicates when breakfast is available.", "It says this on the sign above the door"], "image": "train2014/COCO_train2014_000000360189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442220, "question_id": "427Fyv8wP2kmh8ebmY4Edt", "question": "What is the sign pointing to?", "choices": ["toy boat", "wrestler", "baby", "bus lane"], "correct_choice_idx": 3, "direct_answers": ["bus lane", "bus lane", "bus lane", "bus lane", "left", "bus lane", "left side", "bus lane", "bus lane", "bus"], "difficult_direct_answer": false, "rationales": ["The sign is pointing left to the bus lane.", "It says on the sign that it's where you go to get in the bus lane.", "A white sign is on the side of the road. there is a vehicle icon on the sign."], "image": "train2014/COCO_train2014_000000442220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504502, "question_id": "42VKQNt3srAhrCt2uyD2DL", "question": "What is the purpose of the cup?", "choices": ["carry drinks", "is novelty", "carry toothbrushes", "child's drink"], "correct_choice_idx": 1, "direct_answers": ["hold toothbrushes", "is novelty", "toothbrush holder", "toothbrush holder", "toothbrush holder", "holding toothbrushes", "toothbrush holder", "toothbrush holder", "hold toothbrushes", "holding"], "difficult_direct_answer": false, "rationales": ["The cup has holes for the brushes.", "It has holes to hold the tools used to brush teeth.", "An enclosed glass with holes in the top is on a counter with toothbrushes next to it."], "image": "train2014/COCO_train2014_000000504502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173221, "question_id": "42Ymbpv5d8QK3VKSDrT9F5", "question": "What type of items are on the bed?", "choices": ["food", "drinks", "clothing", "suitcases"], "correct_choice_idx": 2, "direct_answers": ["clothes", "clothing", "clothing", "clothes", "shorts shirts", "pillows", "clothes", "clothes", "clothes", "pajamas"], "difficult_direct_answer": false, "rationales": ["There are several pieces of clothing that are folded on the bed.", "The items on the bed are made of fabric and have the size and shape of folded clothing.", "There are pajamas on the beds."], "image": "train2014/COCO_train2014_000000173221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287415, "question_id": "43MvFPks9uEHtvzo58UMxU", "question": "Where are the men standing while looking at their phones?", "choices": ["walkway", "street", "curbside", "park"], "correct_choice_idx": 0, "direct_answers": ["city", "sidewalk", "street", "sidewalk", "sidewalk", "sidewalk", "walkway", "walkway", "sidewalk", "sidewalk"], "difficult_direct_answer": false, "rationales": ["They look to be standing on a walkway with other people around them.", "The men are at the sidewalk.", "There are a lot of people walking about. the area has a special kind of paving look to it."], "image": "train2014/COCO_train2014_000000287415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545220, "question_id": "43eiuLxPg7JZfYXaSHaMoa", "question": "What is causing the glare in the image?", "choices": ["flashlights", "street lights", "sun", "torches"], "correct_choice_idx": 2, "direct_answers": ["sun", "sun", "sun", "sun", "sun", "sun", "sunlight", "sun", "sunlight", "sun"], "difficult_direct_answer": false, "rationales": ["The glare in the picture is most visible on the right side. if you look at the far right top corner, there is part of the sun visible. this is mostly source of glare.", "The image was taken on a sunny day and the sun caused a glare because it is so bright.", "The image was taken with the camera facing the sun which caused a glare."], "image": "val2014/COCO_val2014_000000545220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268644, "question_id": "44FQuQnuAom7P2PZJ3Lyje", "question": "Why does the horse have yellow around the ankles?", "choices": ["fashion", "visibility", "camouflage", "protection"], "correct_choice_idx": 1, "direct_answers": ["marking", "visibility", "visibility", "for visibility", "hoof covers", "for visibility", "visibility", "safety", "reflectors", "ankle guard"], "difficult_direct_answer": false, "rationales": ["The yellow bands makes it easier to see the horses especially in !owed light conditions due to the reflective aspect.", "Horses and people are wearing bright yellow. bright colors are used for visibility.", "These horses are dark and at night they need to be able to be seen just like the riders."], "image": "train2014/COCO_train2014_000000268644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438709, "question_id": "44Gz26UMaNunEsGmTefZtg", "question": "These people are most likely on what type of event?", "choices": ["vacation", "demolition", "mob hit", "diplomatic meeting"], "correct_choice_idx": 0, "direct_answers": ["beach party", "beach", "vacation", "vacation", "vacation", "surf", "vacation", "beach party", "vacation", "vacation"], "difficult_direct_answer": false, "rationales": ["Many people travel to the beach for relaxation.", "They are most def on a vacation and enjoying the water.", "The event is in a sunny destination next ro a beach, with a hut, which would be synonymous with a beach vacation."], "image": "train2014/COCO_train2014_000000438709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207447, "question_id": "44LucxB83FxWc8ti9Cigs6", "question": "What part of her body can break if she falls to the ground?", "choices": ["foot", "wrist", "finger", "hip"], "correct_choice_idx": 3, "direct_answers": ["leg", "all", "hip", "hip", "leg", "hip", "arm", "neck", "leg", "back"], "difficult_direct_answer": false, "rationales": ["In older individuals, if a hip is broken it is very serious, so that is going to be the body part they are the most concerned about if she falls.", "She could break several things if she would fall but hip would be really bad.", "An elderly person is on a bench. elderly people commonly break hips when they fall."], "image": "train2014/COCO_train2014_000000207447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394334, "question_id": "44MVLAmdnhyTSMM7W3Jatv", "question": "Where can you buy these donuts?", "choices": ["japan", "south korea", "china", "singapore"], "correct_choice_idx": 0, "direct_answers": ["bakery", "donut store", "store", "japan", "bakery", "bakery", "bakery", "bakery", "donut shop", "donut shop"], "difficult_direct_answer": false, "rationales": ["The donuts are in japan.", "The language on the signs shows the language it is.", "The prices are in yen."], "image": "val2014/COCO_val2014_000000394334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59526, "question_id": "44WfBnZAe6pnSWhQGFdrf7", "question": "What type of round sliced topping is on the pizza?", "choices": ["mushroom", "pepperoni", "olive", "onion"], "correct_choice_idx": 3, "direct_answers": ["onion", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "pepperoni", "mushrooms", "pepperoni", "pepperoni", "pepperoni"], "difficult_direct_answer": false, "rationales": ["The onion is on top.", "The round topping is white with a purple edge, like a red onion.", "Slices of onions are on top pizza. onions are round."], "image": "train2014/COCO_train2014_000000059526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523985, "question_id": "44q5DcxiUo7yGs2vqH9Eac", "question": "What type of object is laying on the chair?", "choices": ["hair dryer", "vacuum", "phone", "stuffed animal"], "correct_choice_idx": 3, "direct_answers": ["toy", "teddy bear", "teddy bear", "teddy bear", "teddy bear", "stuffed bear", "teddy bear", "stuffed animal", "bear", "teddy bear"], "difficult_direct_answer": false, "rationales": ["You can see clearly a stuffed animal lying on the chair", "The object is a stuffed bear.", "The object looks like a bear and is furry and soft-looking. the definition of a stuffed animal."], "image": "train2014/COCO_train2014_000000523985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127527, "question_id": "455gL4iKwS2DYZBpy3ULj4", "question": "What animal is closest to the water?", "choices": ["dog", "cat", "seal", "bird"], "correct_choice_idx": 3, "direct_answers": ["bird", "bird", "crane", "swan", "bird", "bird", "crane", "bird", "bird", "bird"], "difficult_direct_answer": false, "rationales": ["The animal that flies is standing on the edge while the giraffes are further away.", "The bird is closest to the puddle.", "The bird is standing next to the pool."], "image": "train2014/COCO_train2014_000000127527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46106, "question_id": "45UVZnLcADpqaijZWS9D4D", "question": "Why is the woman looking down into her hand?", "choices": ["she's frantic", "she's embarrassed", "she's crying", "answering text"], "correct_choice_idx": 3, "direct_answers": ["answering text", "phone", "using phone", "cellphone", "phone messages", "smart phone", "texting", "checking phone", "phone", "checking phone"], "difficult_direct_answer": false, "rationales": ["She is checking her phone to respond to someone.", "She has a phone.", "The woman has a phone in her hand and is touching the screen."], "image": "train2014/COCO_train2014_000000046106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245182, "question_id": "45gp5VgLJNWEneWiXSDoH8", "question": "What is required for this activity?", "choices": ["water", "snow", "sun", "wind"], "correct_choice_idx": 1, "direct_answers": ["stamina", "skis", "skis", "skis", "skis", "skis", "snow", "snow", "snow", "skis"], "difficult_direct_answer": false, "rationales": ["Snow is necessary for skiing in order to move down the hill.", "You need snow to use the boards.", "In order to move the athletes need snow."], "image": "val2014/COCO_val2014_000000245182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274105, "question_id": "45jghpsHrhCHXXaTcKT5UN", "question": "What is the dog wearing?", "choices": ["glasses", "leash", "scarf", "boots"], "correct_choice_idx": 0, "direct_answers": ["hat", "hat", "hat", "sunglasses", "hat sunglasses", "hat", "hat", "cap", "cap", "glasses"], "difficult_direct_answer": false, "rationales": ["The dog has glasses on its eyes.", "There is a wire rimmed item on his nose", "The dog has sunglasses and a hat on."], "image": "train2014/COCO_train2014_000000274105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480872, "question_id": "45unnxqPLMdKKDwpwemBes", "question": "Which vehicle could be considered illegally parked?", "choices": ["white car", "black bus", "white van", "grey car"], "correct_choice_idx": 3, "direct_answers": ["silver van", "in front", "on end", "second van", "gray suv", "car", "grey car", "first one", "bus", "corner"], "difficult_direct_answer": true, "rationales": ["The first car is in the curb.", "The black bus and white car are moving. the white van is parked properly.", "The grey car is too close to the crosswalk."], "image": "train2014/COCO_train2014_000000480872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28154, "question_id": "463pcmQPZjxRv9sfYUMh9v", "question": "What type of activity are they playing?", "choices": ["sports", "video game", "board game", "watching movies"], "correct_choice_idx": 1, "direct_answers": ["playing games", "nintendo wii", "video gamming", "video games", "video game", "playing wii", "video game", "video game", "video games", "video game"], "difficult_direct_answer": false, "rationales": ["The women are hold remotes and pointing them while playing a game.", "They are playing while holding and pointing wii controllers.", "They are holding up wii controllers"], "image": "train2014/COCO_train2014_000000028154.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348400, "question_id": "46GuKbrX9Mg6pqQeRw92gq", "question": "What is the name of the hockey team that resides in this city?", "choices": ["flames", "rangers", "lakers", "red wings"], "correct_choice_idx": 1, "direct_answers": ["rangers", "capitals", "ny rangers", "rangers", "nicks", "manchester", "rangers", "rangers", "blue jackets", "rangers"], "difficult_direct_answer": false, "rationales": ["New york's hockey team is called the new york rangers.", "You see a new york city cab so you know that we are refering to the rangers", "The sign says \"east 42nd street.\" that's a famous street in new york; home of the new york rangers."], "image": "val2014/COCO_val2014_000000348400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398214, "question_id": "46zQcd8bWwyvarDbZfWMPF", "question": "Which street is a oneway street?", "choices": ["henry", "webster", "morris", "williams"], "correct_choice_idx": 1, "direct_answers": ["webster", "webster", "webster", "webster", "don't know", "webster", "webster", "webster", "webster", "webster"], "difficult_direct_answer": false, "rationales": ["The arrow sign is the same direction as the named street on the pole", "The street sign for webster appears on the same pole as a one way street sign. if they are orientated in the same direction on the same pole they are likely referencing the same street.", "As long as you can read, then you can tell what the sign is indicating."], "image": "train2014/COCO_train2014_000000398214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101053, "question_id": "47BUkgNqptJZsMAhDXjNx6", "question": "When was the first bus stop installed?", "choices": ["1820s", "1840s", "1860s", "1850s"], "correct_choice_idx": 0, "direct_answers": ["don't know", "1824", "thirties", "1824", "17th century", "1924", "1824", "1829", "1820s", "20th century"], "difficult_direct_answer": false, "rationales": ["None of these dates seem likely because automobiles were not invented yet but this date is closest to the date for sure.", "That was when the first stop was invented.", "The very first bus route opened on 4 july 1829."], "image": "val2014/COCO_val2014_000000101053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435764, "question_id": "47HJQwKaToJKTeBWWC67VA", "question": "What do the trees indicate about the region?", "choices": ["forest", "cold", "southern", "northern"], "correct_choice_idx": 2, "direct_answers": ["tropical", "southern", "warm climate", "fertility", "warm", "climate", "near beach", "climate", "arid", "tropical"], "difficult_direct_answer": false, "rationales": ["These grow closer to the equator", "Most palm trees are found in southerns states such as florida.", "The trees are indicative of southern heat."], "image": "train2014/COCO_train2014_000000435764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28116, "question_id": "47NQWs7WTQdK4JhcZU9CPB", "question": "What animal might make this area its home?", "choices": ["horse", "elephant", "owl", "cat"], "correct_choice_idx": 2, "direct_answers": ["deer", "owl", "bird", "bear", "squirrel", "wolf", "bear", "horse", "bear", "bear"], "difficult_direct_answer": false, "rationales": ["The area is a heavily wooded forest.", "There are trees all around and they live in trees.", "The area is full of trees and an owl would likely make a nest on a branch."], "image": "train2014/COCO_train2014_000000028116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353808, "question_id": "47TQhyv7Z3LVmmaLPPAGpT", "question": "What is this type of sandwich called?", "choices": ["subway", "panini", "monte cristo", "hoagie"], "correct_choice_idx": 1, "direct_answers": ["pita", "grilled avocado", "grilled", "grilled", "panini", "panini", "panini", "panini", "panini", "panini"], "difficult_direct_answer": false, "rationales": ["A panini is a sandwich made with italian bread that is toasted or grilled, which is true for this sandwich, so it's obvious that this sandwich is a panini.", "This sandwich is pressed on a hot griddle after its assembled", "By the type of bread shown you can see what the type of sandwich it is."], "image": "val2014/COCO_val2014_000000353808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53514, "question_id": "489dRsAkerJnzdC5XRMcAD", "question": "The ornate metal girders are covering what object?", "choices": ["train tracks", "sidewalk", "roadway", "electrical wires"], "correct_choice_idx": 0, "direct_answers": ["clock", "train tunnel", "clock", "train tracks", "train station", "roof", "skylights", "train station", "ceiling", "train station"], "difficult_direct_answer": false, "rationales": ["It is like a canopy.", "They are there to cover the tracks.", "The place is a cement surface for people to walk on."], "image": "train2014/COCO_train2014_000000053514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237355, "question_id": "48NrMhWejuoNTKPL6TftwM", "question": "What is the brown and green bag made from?", "choices": ["rubber", "plastic", "paper", "vinyl"], "correct_choice_idx": 2, "direct_answers": ["paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper"], "difficult_direct_answer": false, "rationales": ["The brown and green bag is a crinkled paper looking material.", "The brown and green back on the girl's lap is made from recycled paper.", "The bag is wrinkled as paper would be."], "image": "train2014/COCO_train2014_000000237355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15110, "question_id": "48YingfbwEqZrzMdBfBwM7", "question": "What type of area is shown?", "choices": ["forest", "urban", "coastal", "rural"], "correct_choice_idx": 1, "direct_answers": ["city streets", "sidewalk", "rainy", "street", "park", "urban", "urban", "street", "city", "sidewalk"], "difficult_direct_answer": false, "rationales": ["The tall buildings and street lights gives it away to where they are.", "This is a city location and cities are classified as urban so this is, by definition, an urban area.", "People walk down a sidewalk in an area with a lot of buildings."], "image": "train2014/COCO_train2014_000000015110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474453, "question_id": "48atNfzvMGrYyVNLSH6CbU", "question": "What body part might these animals likely lose soon?", "choices": ["noses", "ears", "tails", "hooves"], "correct_choice_idx": 2, "direct_answers": ["wool", "tails", "coat", "fur", "wool", "coat", "coat", "fleece", "wool", "legs"], "difficult_direct_answer": false, "rationales": ["The tails will be gone from the body soon.", "Tails are usually cut off lambs but these two sheep need tails removed to prevent fly strike and they are marked in blue.", "These animals could lose their tails due to shearing."], "image": "train2014/COCO_train2014_000000474453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177253, "question_id": "48hx6ud9uyzRSUQd6R2ZpB", "question": "What aisle in the grocery store is the man in the gray shirt shopping in?", "choices": ["produce", "wine", "milk", "meat"], "correct_choice_idx": 0, "direct_answers": ["sales", "produce", "produce", "fruit aisle", "produce", "produce", "produce", "produce", "produce", "produce"], "difficult_direct_answer": false, "rationales": ["The produce area is shown.", "The man in the gray shirt is surrounded by fruits and vegetables.", "Fruit can bee seen all around."], "image": "train2014/COCO_train2014_000000177253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168779, "question_id": "48ofzhYNPF2M7mBMNg78sh", "question": "Which vehicle rejects public service?", "choices": ["blue car", "red bus", "yellow car", "double decker"], "correct_choice_idx": 2, "direct_answers": ["school bus", "yellow car", "car", "red bus", "double decker", "school bus", "tour bus", "bus", "double-decker bus", "private bus"], "difficult_direct_answer": true, "rationales": ["The design of the red bus is normally for school aged children and not the general public.", "It is a taxi.", "Buses take lots of people places, but regular cars do not pick up strangers and transport them for money."], "image": "train2014/COCO_train2014_000000168779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38232, "question_id": "48v4MMD2USYnQ5t4WDzoju", "question": "What is this woman trying to hit?", "choices": ["person", "ball", "target", "puck"], "correct_choice_idx": 1, "direct_answers": ["net", "ground", "cage", "frisbee net", "ball", "goalpost", "ball", "ball", "paper plate", "ball"], "difficult_direct_answer": false, "rationales": ["She has a frisbee in her hand and will throw it to someone else", "The woman wants to get a hit on a ball.", "She is aiming the frisbee before throwing it."], "image": "train2014/COCO_train2014_000000038232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423010, "question_id": "49PpSRaTJjKHmyHpryPoYV", "question": "Why are the vases on the white pedestals?", "choices": ["to move", "to display", "to paint", "to shoot"], "correct_choice_idx": 1, "direct_answers": ["designing purpose", "display", "decoration", "display", "for display", "its design", "display", "museum", "for sale", "to display"], "difficult_direct_answer": false, "rationales": ["These vases are being shown off.", "The objects are obviously for this purpose and not the other options.", "They are on the pedestal to keep them closer to eye level for museum visitors."], "image": "train2014/COCO_train2014_000000423010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41085, "question_id": "49deMrmBy8aQH7M7AvHC5L", "question": "How might you most easily bait this animal into moving?", "choices": ["with steak", "with mouse", "with worms", "with carrots"], "correct_choice_idx": 3, "direct_answers": ["whip", "carrot", "food", "with carrots", "food", "whip", "food", "running", "with carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["Horses enjoy carrots and will make an effort to reach one, including walking forward if one is used to compel them.", "Horses love them and the other options are inedible for them.", "Horses like crunchy vegetables and fruits such as these."], "image": "train2014/COCO_train2014_000000041085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154892, "question_id": "49kj23UqLbaX84seNngwkw", "question": "What is the raw ingredient of chocolate cake?", "choices": ["wheat flour", "coco powder", "all purpose", "sugar powder"], "correct_choice_idx": 1, "direct_answers": ["cocoa", "egg", "flour sugar", "chocolate", "cocoa", "cocoa", "cocoa", "eggs", "coco powder", "eating cake"], "difficult_direct_answer": false, "rationales": ["The cake contains chocolate which is made of cocoa powder.", "This is the ingredient that gives it the flavor", "Chocolate cakes are made with cocoa powder."], "image": "val2014/COCO_val2014_000000154892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183913, "question_id": "49yg5HtbzLDBSQQjyVqma5", "question": "What governing body uses the flag closest to the camera?", "choices": ["caricom", "belgium", "european union", "united states"], "correct_choice_idx": 2, "direct_answers": ["european union", "french", "european union", "government", "european union", "european union", "european union", "state", "european union", "nation"], "difficult_direct_answer": false, "rationales": ["The body is the eu.", "The flag closest is blue with yellow markings.", "The european union uses the flag of that design."], "image": "train2014/COCO_train2014_000000183913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91155, "question_id": "4ATMY67G7R8fUqqeJU95zk", "question": "Why is the food eaten by the boy unhealthy?", "choices": ["high sodium", "high carbohydrate", "high fat", "high sugar"], "correct_choice_idx": 0, "direct_answers": ["contains sugars", "salty", "processed meat", "nitrates", "sugar", "high sodium", "high calories", "carbs", "carbs", "hot dog"], "difficult_direct_answer": true, "rationales": ["The food is a sausage. salt is used, often in large quantities, to make sausage. salt contains a lot of a specific element.", "This type of food has a lot of salt in it.", "The food has high sodium."], "image": "train2014/COCO_train2014_000000091155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496617, "question_id": "4ATk3m7DFSdE3324P3GfgX", "question": "How is the occupancy of this room?", "choices": ["partial", "one person", "full", "empty"], "correct_choice_idx": 0, "direct_answers": ["small", "five people", "occupied", "mostly full", "full", "partial", "five people", "five", "lunch room", "somewhat full"], "difficult_direct_answer": true, "rationales": ["Not all of the seats are taken.", "There are five people in the room. some of the chairs are still empty.", "The room is partially full."], "image": "train2014/COCO_train2014_000000496617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566529, "question_id": "4AZCeeMpym9e6S4JTSMXqM", "question": "What does the stuffed animal the man is holding resemble?", "choices": ["elmo", "cabbage patch", "beanie baby", "troll"], "correct_choice_idx": 2, "direct_answers": ["bear", "teddy bear", "beanie baby", "bear", "bear", "bear", "bear", "teddy bear", "bear", "bear"], "difficult_direct_answer": false, "rationales": ["The animal is a beanie baby bear.", "This is the only one which could be a teddy bear like the one depicted.", "The doll looks like a plush."], "image": "val2014/COCO_val2014_000000566529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519950, "question_id": "4AoDxc4jiu4mtRDNSPgt2U", "question": "Why is his arm raised so high?", "choices": ["is tired", "hit ball", "wants attention", "is falling"], "correct_choice_idx": 1, "direct_answers": ["hit ball", "swinging", "hitting ball", "overhand", "hitting ball", "serving", "hit ball", "energy", "to hit", "effort"], "difficult_direct_answer": false, "rationales": ["When playing tennis you have to raise your arm in order to return the ball.", "There is a ball above his head and a racket in his hand also raised above his head. it appears that both are in motion moving towards each out so he is likely doing answer a.", "When tennis players serve they take a large overhead swing."], "image": "train2014/COCO_train2014_000000519950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189838, "question_id": "4B4QJDbypuSXFuQHHWnmWJ", "question": "What is/are contained inside the vases?", "choices": ["soil", "water", "marbles", "wine"], "correct_choice_idx": 1, "direct_answers": ["flowers", "flowers", "flowers", "flowers", "water", "flowers", "flowers", "flowers", "flowers", "daffodils"], "difficult_direct_answer": false, "rationales": ["There are flowers in the vases and to keep them alive water is given to them.", "The vases contain flowers. water would keep these flowers fresher for longer so it is most likely water inside the vases.", "There are flower vases that need this natural liquid to survive."], "image": "train2014/COCO_train2014_000000189838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561069, "question_id": "4BGFvDHMrLaKnHVQB5EGNU", "question": "Which leg would be hurt if he fell?", "choices": ["his left", "both", "his right", "neither"], "correct_choice_idx": 0, "direct_answers": ["left", "left leg", "left leg", "right", "left leg", "his left", "left", "left leg", "left", "left"], "difficult_direct_answer": false, "rationales": ["A guy is on a skateboard and is in the air with the left side of his body closer to the ground than his right side is.", "His left leg is facing downward so that would take the brunt of the fall.", "The skater's left leg is closer to the ground above which he is temporarily suspended. we can assume this is the leg to be hurt should he fall."], "image": "train2014/COCO_train2014_000000561069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429321, "question_id": "4Bv6qPT99k6DGPC3CRpgmv", "question": "Why do they have their arms stretched out to the side?", "choices": ["threatening others", "posing", "to balance", "is falling"], "correct_choice_idx": 2, "direct_answers": ["to balance", "balance", "balance", "balance", "maintain balance", "balance", "skating", "balance", "balancing", "balance"], "difficult_direct_answer": false, "rationales": ["Holding your arms out as you make a turn helps keep you upright", "When you hold your arms out it is easier not to fall.", "This keeps them from falling off when they lean their body over"], "image": "train2014/COCO_train2014_000000429321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322738, "question_id": "4CKNrcR8rPw9bkrnhcJXfy", "question": "Why is the horse on the the ground?", "choices": ["resting", "getting even", "fell", "backing up"], "correct_choice_idx": 2, "direct_answers": ["pawing", "stopping", "stopping", "fell", "slowing down", "collision", "slowing down", "stop suddenly", "dragging", "he fell"], "difficult_direct_answer": false, "rationales": ["It looks like the horse lost traction by the dirt being kicked up and it falling makes the most sense.", "You can see that the horse fell because its hind legs are on the ground", "In this scenario it is safe to say he lost his footing."], "image": "train2014/COCO_train2014_000000322738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293598, "question_id": "4Chf4o7vY7rTR2eHWQjmjs", "question": "What are the sheep doing in this picture?", "choices": ["sleeping", "grazing", "fighting", "being trimmed"], "correct_choice_idx": 1, "direct_answers": ["grazing", "grazing", "grazing", "earing grass", "grazing", "grazing", "eating", "grazing", "grazing", "grazing"], "difficult_direct_answer": false, "rationales": ["The sheep are leisurely eating grass which is called grazing.", "The sheep are grazing.", "These animals are in a field they are standing upright and you can see that their heads or towards the ground and they are consuming the grass around them this activity is called grazing."], "image": "train2014/COCO_train2014_000000293598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416733, "question_id": "4Chsxe6WshNhVGWzD9fkcZ", "question": "How many cycles are there in the room?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "three", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is one bicycle on the ceiling and two on the floor.", "There is one bicycle in the air and one on the ground.", "There is one on the ceiling and two on the floor."], "image": "val2014/COCO_val2014_000000416733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331544, "question_id": "4CnmYACiKoFPWUk2YAfTEP", "question": "What is item the woman is wearing on her head called?", "choices": ["beanie", "muffler", "winter headband", "scarf"], "correct_choice_idx": 2, "direct_answers": ["head band", "headband", "headband", "headband", "head band", "headband", "headband", "headband", "winter headband", "head band"], "difficult_direct_answer": false, "rationales": ["It's a headband for when it's cold.", "The headband keeps the woman's head warm.", "It is made of wool and used to keep her ears warm."], "image": "train2014/COCO_train2014_000000331544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242499, "question_id": "4CuzhdRgbjkJBXKy2dQpn5", "question": "What tusked entity allows tourists to cross this body of water while keeping their feed dry?", "choices": ["elephants", "walrus", "boar", "elephant seals"], "correct_choice_idx": 0, "direct_answers": ["elephants", "elephant", "elephants", "elephants", "elephant", "elephants", "elephant", "elephants", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["There is a depiction of many huge animals with floppy ears going across the river. they are carrying people on their backs as they go across.", "Elephants can be seen in the image and people don't usually ride walruses, seals, and boars.", "The animals with long trunks have tusks."], "image": "val2014/COCO_val2014_000000242499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511410, "question_id": "4DLdxtWupmWfP7nBqTPhxf", "question": "What type of vehicle are the people looking at?", "choices": ["car", "boat", "plane", "motorcycle"], "correct_choice_idx": 3, "direct_answers": ["motorcycles", "motorcycle", "motorcycle", "motorcycle", "expensive car", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycles"], "difficult_direct_answer": false, "rationales": ["Motorcycles are on display. people are gathered all around.", "There are motorcycles.", "The people are standing around a motorcycle that is on display at an event."], "image": "train2014/COCO_train2014_000000511410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448574, "question_id": "4DLfs6cN5LpSnbgn2NcE5V", "question": "What is the tip of the bed structures called?", "choices": ["pillows", "finials", "mattress", "headrest"], "correct_choice_idx": 1, "direct_answers": ["post", "finials", "post", "finials", "bedpost", "bedpost", "tester", "bedpost", "post", "finials"], "difficult_direct_answer": false, "rationales": ["The tips have a technical name known as the finials.", "The only logical answer is the first based on the choices.", "The tips of the poles at the bottom of the bed are called finials."], "image": "train2014/COCO_train2014_000000448574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292689, "question_id": "4DZSuoufpjmN6rocZtsUYF", "question": "What mode of transport did this person recently take advantage of?", "choices": ["lyft", "biike", "uber", "air travel"], "correct_choice_idx": 3, "direct_answers": ["airplane", "airplane", "trip", "flying", "airplane", "air travel", "plane", "bus", "airplane", "plane"], "difficult_direct_answer": false, "rationales": ["They have many suitcases which would not fit into the other modes of transportation, and luggage tags which don't exist for the other modes of transportation.", "The transportation is for air travel.", "A person is standing on the curb with suitcases. people take suitcases on planes."], "image": "train2014/COCO_train2014_000000292689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224907, "question_id": "4EHWsgq9hiU9Poziq64UiN", "question": "How many balls is the dog carrying?", "choices": ["two", "five", "one", "three"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two balls", "no dog", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is a yellow one and a ball one.", "There is one yellow and one green ball in the paws of the animal.", "The dog has a yellow ball and a green ball."], "image": "val2014/COCO_val2014_000000224907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80808, "question_id": "4EffQRthc6yUZZTSUfDBDd", "question": "Who wears a similar item to what the boy is wearing on his head?", "choices": ["clown", "chef", "baker", "biker"], "correct_choice_idx": 3, "direct_answers": ["baseball players", "biker", "construction workers", "baseball player", "football players", "player", "babe ruth", "football player", "bike rider", "football player"], "difficult_direct_answer": true, "rationales": ["The boy is wearing a helmet. a helmet can also help protect your head when you are riding a bike.", "Most bikers wear helmets to avoid injury.", "The boy is wearing a helmet, which bikers also need to wear to protect from injury incase they fall off their bikes."], "image": "train2014/COCO_train2014_000000080808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505092, "question_id": "4Ehv4p3GX2mnc6cEvp3EAY", "question": "What are the kites here meant to resemble?", "choices": ["dogs", "cats", "martians", "sea creatures"], "correct_choice_idx": 3, "direct_answers": ["octopus", "squids", "color change", "aliens", "fish", "octopus", "sea creatures", "octopus", "octopus", "octopus"], "difficult_direct_answer": false, "rationales": ["The kites are sea creatures.", "The kites look like octopi.", "The kites are shaped like squid."], "image": "train2014/COCO_train2014_000000505092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48759, "question_id": "4EmJvAqEyHhpfecjkdkUXD", "question": "What is the reason for the woman in yellow standing in the street here?", "choices": ["drunk test", "road construction", "police stop", "trapping robbers"], "correct_choice_idx": 1, "direct_answers": ["directing traffic", "crossing guard", "traffic controller", "stopping traffic", "road construction", "guiding traffic", "crossing guard", "directing traffic", "directing traffic", "construction"], "difficult_direct_answer": false, "rationales": ["The reason is road construction.", "This woman is holding an orange sign that states \"stop\" these are used to halt traffic which work is being done on a portion of the road. it allows traffic to flow smoothly and prevents accidents from occuring.", "A woman is in the street holding a street sign and wearing bright colors. road workers wear bright colors."], "image": "train2014/COCO_train2014_000000048759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203615, "question_id": "4Erbs4oKhTx2djMq9ivCYd", "question": "What is protruding from the water?", "choices": ["branch", "octopus", "shark", "flying saucer"], "correct_choice_idx": 0, "direct_answers": ["branch", "stick", "branch", "ducks", "branch", "tree", "ducks", "duck", "trees", "branch"], "difficult_direct_answer": false, "rationales": ["You can tell by the color and design as to what is coming from the water.", "The branch are seen at the edge of the water.", "A part of the tree that has lost its leaves is protruding from the water."], "image": "train2014/COCO_train2014_000000203615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101241, "question_id": "4FDpmBDFGVTcXpoYyckafE", "question": "What sort of space is this?", "choices": ["public business", "warehouse", "storage", "private home"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "kitchen", "kitchen", "bar space", "kitchen", "kitchen", "private home", "kitchen", "bar", "kitchen"], "difficult_direct_answer": false, "rationales": ["The objects in the background indicate that this is a residential kitchen.", "The place is at a private home.", "The picture depicts an obvious kitchen in a private home."], "image": "train2014/COCO_train2014_000000101241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258815, "question_id": "4FEgi9vAbkGfxjeoNfxsvz", "question": "The child is learning what?", "choices": ["snacking", "singing", "bubble blowing", "dental hygiene"], "correct_choice_idx": 3, "direct_answers": ["brushing teeth", "tooth brushing", "toothbrushing", "brushing teeth", "tooth brushing", "tub", "brushing teeth", "brush teeth", "brushing", "dental hygiene"], "difficult_direct_answer": false, "rationales": ["The adult is brushing the child's teeth so the child can learn good dental hygiene.", "The child is learning how to take proper care of his teeth. an adult is using a toothbrush on him.", "The adult is brushing the child's teeth."], "image": "val2014/COCO_val2014_000000258815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353610, "question_id": "4FErxPVPPckv3e6eabG3SM", "question": "What color is the jacket at the end of the camera lens?", "choices": ["blue", "white", "black", "red"], "correct_choice_idx": 0, "direct_answers": ["black", "red", "dark gray", "dark", "black", "grey", "black", "blue", "black", "red"], "difficult_direct_answer": false, "rationales": ["It is the same color as the woman's clothing", "There is a man wearing a red jacket.", "A person is holding a camera with a long lens. the person with the camera is wearing a blue jacket."], "image": "train2014/COCO_train2014_000000353610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2732, "question_id": "4FSWagpQig99W7ar2c5EeD", "question": "What number is on the yellow train?", "choices": ["9637", "4782", "4528", "8161"], "correct_choice_idx": 3, "direct_answers": ["8161", "8161", "eight one", "8161", "8161", "eight one", "8161", "8161", "8161", "8161"], "difficult_direct_answer": false, "rationales": ["It is a large gray number on the front of the yellow train.", "The number on the front of the train says 8161 in white paint.", "The number is below and above the windows on the front of the train."], "image": "train2014/COCO_train2014_000000002732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470117, "question_id": "4FTiddcVQLdBYfGWHa8zhW", "question": "What does the person in green try to block?", "choices": ["ball", "frisbee", "marauders", "horses"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "frisbee holder", "frisbee", "throw", "frisbee", "person", "boy", "frisbe", "frisbee thrower", "frisbee"], "difficult_direct_answer": false, "rationales": ["You can tell by the setting in the picture as to what possibly is happening.", "A person is standing in front of a person with a frisbee.", "A frisbee can be seen in the hand of the man that is being blocked and none of the other options are present."], "image": "val2014/COCO_val2014_000000470117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386429, "question_id": "4FYLy6PjRVoayqTUGUp4eb", "question": "In which country do these players play?", "choices": ["united states", "uganda", "canada", "japan"], "correct_choice_idx": 0, "direct_answers": ["usa", "united states", "usa", "usa", "united states", "united states", "usa", "usa", "united states", "usa"], "difficult_direct_answer": false, "rationales": ["There is a logo on the wall that says american and they have flags on their outfits.", "Baseball is an american sport. english words are on the advertisements.", "The baseball players are playing in the united states and have the american flag on their uniforms."], "image": "train2014/COCO_train2014_000000386429.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128286, "question_id": "4FZvhFnhqS9qTjXajwMAMZ", "question": "What do the kites resemble?", "choices": ["squid", "tiger", "dog", "monkey"], "correct_choice_idx": 0, "direct_answers": ["rainbow ghosts", "squid", "squid", "squid", "squids", "squids", "squid", "sea creatures", "sea creatures", "squid"], "difficult_direct_answer": false, "rationales": ["They have oblong bodies with many narrow, long tails on each one.", "The kites have large bulbous sections with tentacle like ribbons and streamers.", "Each kite has a head and tentacles."], "image": "train2014/COCO_train2014_000000128286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292893, "question_id": "4Fjn337cfGQYnUkmEnzkig", "question": "Why is the man wearing a yellow vest?", "choices": ["visibility", "costume", "costume", "warmth"], "correct_choice_idx": 0, "direct_answers": ["police", "safety", "safety", "safety", "safety", "safety", "visibility", "security", "visibility", "safety"], "difficult_direct_answer": false, "rationales": ["The man is wearing a yellow safety vest.", "This day glow yellow color is used in safety equipment to make sure the person is visible at night.", "He is on a horse with a helmet. his vest is brightly colored so he can be seen."], "image": "train2014/COCO_train2014_000000292893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530777, "question_id": "4FnkZHDeWXhmPvUR8r7ZSn", "question": "How do you know there is work being done on the white building?", "choices": ["construction workers", "signs", "materials", "scaffolding"], "correct_choice_idx": 3, "direct_answers": ["scaffolding", "workers", "scaffolding", "scaffolding", "scaffolding", "scaffolding", "scaffolding", "construction", "scaffolding", "scaffolding"], "difficult_direct_answer": false, "rationales": ["When construction is being completed on a building sometimes a temporary structure is put up to support people and supplies in otherwise hard to reach areas.", "The scaffolding indicates work.", "There are structures on the side of the building to hold people as they are working."], "image": "val2014/COCO_val2014_000000530777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462953, "question_id": "4FnmY68xHruxd5qQ7MPde7", "question": "What business is this display promoting?", "choices": ["nature trips", "restaurant", "travel agency", "sporting goods"], "correct_choice_idx": 2, "direct_answers": ["traveling", "travel", "moving", "store", "suitcases", "travel", "luggage", "rail", "vintage collectors", "travel agency"], "difficult_direct_answer": true, "rationales": ["A variety of suitcases are on display. suitcases are associated with travel.", "The business is a travel agency.", "There is luggage displayed together."], "image": "val2014/COCO_val2014_000000462953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275817, "question_id": "4GCUmcdAYmmexYnZmC8xBF", "question": "What are the smaller animals doing here?", "choices": ["grazing", "nursing", "killing sheep", "eating meat"], "correct_choice_idx": 1, "direct_answers": ["sheep", "eating", "feeding", "nursing", "feeding", "nursing", "touching", "drinking", "nursing", "feeding"], "difficult_direct_answer": false, "rationales": ["Based on the sizes it looks like there is and adult and two adolescent sheep. when young animals reach under an adult in this fashion is it usually connected to nursing.", "The animals are nursing.", "They are suckling at the udders of the mother"], "image": "train2014/COCO_train2014_000000275817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290072, "question_id": "4GJEdhLbnxuCa57Je3SKMz", "question": "What are the small surfboards called?", "choices": ["short hands", "foam boards", "little", "short boards"], "correct_choice_idx": 1, "direct_answers": ["plates", "shortboard", "boogie boards", "fish surfboards", "shortboards", "body boards", "surfboards", "foam boards", "fish surfboards", "unknown"], "difficult_direct_answer": true, "rationales": ["These boards are shorter than the regular surfboards.", "The boards are made of foam.", "The boards are made of foam."], "image": "train2014/COCO_train2014_000000290072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257774, "question_id": "4GapWqUQ9JHiGeAuHzvG7G", "question": "What is the white item on the leftmost shelf?", "choices": ["cat", "dishwasher", "washing machine", "microwave"], "correct_choice_idx": 3, "direct_answers": ["microwave", "microwave", "microwave", "bowl", "microwave", "microwave", "microwave oven", "microwave", "microwave", "microwave"], "difficult_direct_answer": false, "rationales": ["A microwave is on the left shelf.", "A dishwasher and a washing machine would be too large to fit on the shelf. it is obviously an appliance and not an animal.", "The white item is the microwave."], "image": "train2014/COCO_train2014_000000257774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56187, "question_id": "4Gbdz2YSwUaSzgcj8aBfes", "question": "Why is the person on the right raising her hands?", "choices": ["taking photo", "exercising", "waving", "getting help"], "correct_choice_idx": 0, "direct_answers": ["photographer", "take picture", "taking photo", "take picture", "snapping photo", "taking picture", "take picture", "picture taking", "taking picture", "photograph"], "difficult_direct_answer": false, "rationales": ["The woman is holding a camera in her hands so that she can steady the camera.", "The person is taking a photo.", "The person on the right is using the camera on their phone."], "image": "val2014/COCO_val2014_000000056187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54344, "question_id": "4GsC49HzUfCWxc3gfmrnpT", "question": "What biome is in the background?", "choices": ["desert", "tundra", "rainforest", "savanna"], "correct_choice_idx": 3, "direct_answers": ["savannah", "field", "grassland", "grassland", "grassland", "desert", "desert", "grassland", "desert", "savanna"], "difficult_direct_answer": false, "rationales": ["This looks like the savanna in the background.", "Just like in the plains of africa the lack of trees and tall grass lets you know what type if region they are in.", "The savanna can be seen in the background."], "image": "val2014/COCO_val2014_000000054344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175020, "question_id": "4H7wWa9f4WPdpjtMnFmW28", "question": "What did the man on the sidewalk most likely just do?", "choices": ["shop", "shower", "steal", "exercise"], "correct_choice_idx": 0, "direct_answers": ["grocery shop", "go shopping", "shop", "grocery shop", "groceries", "bought food", "groceries", "shop", "shop", "shop"], "difficult_direct_answer": false, "rationales": ["He is holding two grocery bags so he probably just bought food.", "He is carrying a bag in each hand.", "He has two shopping bags in his hands."], "image": "train2014/COCO_train2014_000000175020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154307, "question_id": "4HVWphh6pmw5ws2DVD5LCx", "question": "What will happen to the yellow and white slices?", "choices": ["will evaporate", "will burn", "get crispy", "will melt"], "correct_choice_idx": 3, "direct_answers": ["melt", "cooking purpose", "cooking purpose", "melt", "cooking purpose", "will melt", "melt", "melt", "melt", "melt"], "difficult_direct_answer": false, "rationales": ["The slices will melt.", "The yellow and white slices of cheese are on the frying pan and will melt.", "When the heat hits the cheese it will melt due to physics of this world."], "image": "train2014/COCO_train2014_000000154307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300239, "question_id": "4HmRpBY9kzXWQre29SaoNr", "question": "How are the people feeling while holding the food?", "choices": ["sad", "angry", "scared", "proud"], "correct_choice_idx": 3, "direct_answers": ["proud", "happy", "proud", "happy", "happy", "happy", "happy", "happy", "happy hungry", "happy"], "difficult_direct_answer": false, "rationales": ["They are showing off what they made.", "They assembled this pizza together and are proud of the result. their pride is evident as they display the pizza to the camera.", "They appear to have made this themselves so they are excited about how it turned out."], "image": "train2014/COCO_train2014_000000300239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117071, "question_id": "4JCzb5z4owAWTNg6Nky2oA", "question": "The green arrow is giving the instruction to walk which direction?", "choices": ["turn left", "turn right", "straight", "turn around"], "correct_choice_idx": 2, "direct_answers": ["straight", "straight", "north", "straight", "north", "straight", "north", "straight", "straight", "north"], "difficult_direct_answer": false, "rationales": ["The direction shown by the green arrow is forward.", "The sharp point leads one way. it is found on heavily populated roads to aid in traffic flow.", "Technically, it's b and then a and then in an up direction."], "image": "train2014/COCO_train2014_000000117071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157618, "question_id": "4JHP9ofRaqGNo8sARM8xZt", "question": "Why do some boats have a big pole sticking up from it?", "choices": ["for sails", "for navigation", "send sos", "for lookout"], "correct_choice_idx": 0, "direct_answers": ["sail mast", "sails", "for sails", "sail", "to sail", "sail boat", "support sails", "wind sails", "for sails", "for sails"], "difficult_direct_answer": false, "rationales": ["Wind powered boat have to have something to catch the wind. sheets are strung up on tall poles called \"masts\".", "This pole is used to hold their sails up when they put them up.", "These are sailboats and these poles hold up the cloth"], "image": "train2014/COCO_train2014_000000157618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530905, "question_id": "4JhbJjoV6pbmCqmnHgKKmk", "question": "What is in the little white and green tub?", "choices": ["tobacco", "dental floss", "pills", "lip balm"], "correct_choice_idx": 1, "direct_answers": ["dental floss", "dental floss", "dental floss", "toothpaste", "floss", "gum", "dental floss", "toothpaste", "floss", "dental floss"], "difficult_direct_answer": false, "rationales": ["There is gum dental floss in the container.", "The tub is floss.", "The tub holds a special thread that is used to clean between the teeth."], "image": "val2014/COCO_val2014_000000530905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318811, "question_id": "4JhuPC9T8DvZnstDcNdrds", "question": "Why is there a description for the person's bear?", "choices": ["puzzle game", "to sell", "share history", "to buy"], "correct_choice_idx": 2, "direct_answers": ["information", "museum item", "antique", "museum", "antique", "for purchase", "share history", "antique", "alan turing's", "bears history"], "difficult_direct_answer": false, "rationales": ["A bear is on display with a card below it. collectibles often are displayed with information.", "The description shares the history.", "This bear has a plaque below it."], "image": "train2014/COCO_train2014_000000318811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550354, "question_id": "4K8EiUgyCh9EpETP57eUwy", "question": "What type of activity is this?", "choices": ["winter", "aquatic", "summer", "tropical"], "correct_choice_idx": 0, "direct_answers": ["winter", "skiing", "skiing", "skiing", "skiing", "skiing", "ski", "skiing", "alpine skiing", "alpine skiing"], "difficult_direct_answer": false, "rationales": ["There is snow on the ground and this only occurs in this particular season.", "The ground is covered in snow. the people have warm outfits and equipment for skiing.", "Snow only falls when it is cold enough for the water to freeze which occurs during this season."], "image": "train2014/COCO_train2014_000000550354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391187, "question_id": "4K8VM2gaiG9V5vekf3xN9A", "question": "What city is this bus in?", "choices": ["london", "camrose", "kyiv", "brighton"], "correct_choice_idx": 0, "direct_answers": ["cobham", "london", "esher", "church cobham", "florence", "london", "england", "london", "london", "church cobham"], "difficult_direct_answer": false, "rationales": ["Church cobham on the bus is only located in the london area of the uk.", "A red double decker bus has a location lit on a digital board on it.", "The city's name is before transport on the back of the bus."], "image": "val2014/COCO_val2014_000000391187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8686, "question_id": "4KBD6dFtLeUceijLXpEkXn", "question": "Why is there more than one clock?", "choices": ["purchase surplus", "reflection", "individually owned", "easier viewing"], "correct_choice_idx": 3, "direct_answers": ["viewing angle", "multiple angles", "multi directions", "all streets", "different visual-angles", "multidirectional", "different viewpoints", "directional views", "different angles", "easier viewing"], "difficult_direct_answer": true, "rationales": ["The clock has more than one face. it can be seen in many directions.", "There are clocks on each face of the tower to that the time can be seen from whatever direction a person is approaching from.", "There are clock faces facing every direction."], "image": "train2014/COCO_train2014_000000008686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303626, "question_id": "4KNBfmkBrHx3wV7feDHEp6", "question": "Excellence in the American and International film industry award is what?", "choices": ["cambridge", "oscar", "national", "oxford"], "correct_choice_idx": 1, "direct_answers": ["oscar", "oscar", "oscar", "oscar", "oscar", "oscar", "bafta", "oscars", "academy award", "oscar"], "difficult_direct_answer": false, "rationales": ["You get an oscar if you win an award in the film industry.", "Movies that are the best of the year win the academy award.", "The awards are called the oscars."], "image": "val2014/COCO_val2014_000000303626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296014, "question_id": "4KTiUbjPXyK6A8jsKKUDFD", "question": "What type of boat are they navigating the water on?", "choices": ["fishing", "raft", "canoe", "kayak"], "correct_choice_idx": 1, "direct_answers": ["raft", "inflatable raft", "raft", "raft", "raft", "raft", "inflatable", "relatable", "raft", "raft"], "difficult_direct_answer": false, "rationales": ["This is a big raft", "Given the slope and power of the water, a raft is typically used for this activity.", "They are in an inflatable boat used for navigating white waters. option a matches the name for such a boat."], "image": "val2014/COCO_val2014_000000296014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402823, "question_id": "4KoRNAhPGxzx4mAH6v66Cu", "question": "What is this horse practicing?", "choices": ["steeplechase", "escape", "posing", "bucking"], "correct_choice_idx": 0, "direct_answers": ["jumping", "jumping", "jumping hurdles", "jumping", "jumping", "jumping", "jumping", "steeplechase", "jumping hurdles", "jumping"], "difficult_direct_answer": false, "rationales": ["The horse is jumping over the barrier.", "They are at a staplechase.", "A steeplechase is when a horse has to jump over obstacles as is the case here; the horse is in the middle of jumping over that blue horizontal bar."], "image": "val2014/COCO_val2014_000000402823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144727, "question_id": "4KqGyQZWiWqzTciPHmCAkx", "question": "Is this game available in android?", "choices": ["no", "none", "yes", "maybe"], "correct_choice_idx": 2, "direct_answers": ["yes", "yes", "no", "yes", "no", "yes", "no", "no", "yes", "yes"], "difficult_direct_answer": false, "rationales": ["Android is still used by eighty percent of cell phone users.", "This game is on android's app store.", "The game is available."], "image": "train2014/COCO_train2014_000000144727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241261, "question_id": "4KvUWujpXTHUqcNHh9hMYm", "question": "What are colorful objects hanging on the pole behind the men?", "choices": ["ties", "socks", "ribbons", "shorts"], "correct_choice_idx": 0, "direct_answers": ["ties", "bowties", "neckties", "yard vines", "ties", "ties", "ties", "ties", "ties", "owner"], "difficult_direct_answer": false, "rationales": ["The objects on the pole are fabric that is made to be worn around the neck when a man is dressing professionally.", "There are all sorts of colorful ties behind the men.", "Men are standing in a commercial area with multiple apparel items hanging overhead that are thin, long pieces of material in different patterns and colors."], "image": "train2014/COCO_train2014_000000241261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123824, "question_id": "4L7aMXbBQawe4AZN39xAni", "question": "What type of transportation is shown?", "choices": ["road", "air", "water", "rail"], "correct_choice_idx": 2, "direct_answers": ["boats", "boats", "kayaking", "boat", "boats", "kayak", "raft", "water", "rafts/row-boats", "boats"], "difficult_direct_answer": false, "rationales": ["The transportation is in water.", "There are boats on the water which move on water.", "There is a large river between the mountains and people are traveling on the water with boats."], "image": "train2014/COCO_train2014_000000123824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36739, "question_id": "4LGspmZpvLtNJMtLK2tPEu", "question": "What type of nutrient is missing in the above meal?", "choices": ["carbohydrate", "vitamins", "none", "proteins"], "correct_choice_idx": 0, "direct_answers": ["carbohydrate", "dressing", "carbohydrate", "protein", "carb", "zinc", "grains", "carbohydrates", "dairy", "bread"], "difficult_direct_answer": true, "rationales": ["Based on the distinct color, shapes and textures of the foods visible, there is nothing that falls in the category of answer a which is a category normally included in a meal.", "There are no carbs shown in this picture.", "A well balanced meal includes some type of sugar."], "image": "train2014/COCO_train2014_000000036739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534173, "question_id": "4LKmHyfhqwXdbm4qmvNG6P", "question": "What has the woman sat down to do?", "choices": ["eat", "sleep", "make call", "exercise"], "correct_choice_idx": 2, "direct_answers": ["talk", "use phone", "make call", "talk", "talking", "phone call", "talk", "it", "phone call", "use phone"], "difficult_direct_answer": false, "rationales": ["A woman at a gathering of some sort takes a break by sitting down and talking to someone on her cellphone.", "She is holding a phone up to her head.", "The woman has her cell phone up to her ear and appears to be having a conversation with the person on the other end."], "image": "val2014/COCO_val2014_000000534173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195525, "question_id": "4LZwVgJ8ZZyhk93ibBCMAz", "question": "Where is the boy located at?", "choices": ["school", "department store", "home", "salon"], "correct_choice_idx": 3, "direct_answers": ["salon", "barber shop", "barber", "hairdresser", "salon", "salon", "salon", "hair salon", "hair salon", "hair salon"], "difficult_direct_answer": false, "rationales": ["He is with a hairdresser getting his hair dried.", "This is the only place where someone would be using a hairdryer and scissors while he sits in a chair.", "His hair looks like it has just been cut and as can be seen, is getting blow dried by a woman. he is wearing a tarp, which is what people wear at salons so hair doesn't stick to their clothes."], "image": "train2014/COCO_train2014_000000195525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292689, "question_id": "4LdmMSVfSKGYg7UtmSbhH7", "question": "What is this man enjoying here?", "choices": ["sales job", "beach", "sleeping", "vacation"], "correct_choice_idx": 3, "direct_answers": ["scenic city", "sitting", "view", "weather", "sightseeing", "resting", "sights", "architecture", "weather", "vacation"], "difficult_direct_answer": true, "rationales": ["He is on a vacation with all his luggage.", "Because he is carrying a lot of luggage.", "The man has lots of luggage with him meaning he is likely enjoying a holiday traveling."], "image": "train2014/COCO_train2014_000000292689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416523, "question_id": "4Lvtz2MdGChbtRoU68KNaL", "question": "What continent is this place in?", "choices": ["australia", "north america", "europe", "asia"], "correct_choice_idx": 2, "direct_answers": ["europe", "italy", "europe", "europe", "europe", "sport area", "europe", "europe", "european", "europe"], "difficult_direct_answer": false, "rationales": ["There is a european language on the signs.", "This is the location of most of the brands advertised on the window.", "It's unclear in the image, but the building architecture and the parking sign imply a or c."], "image": "val2014/COCO_val2014_000000416523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389932, "question_id": "4M8CdFYXc2nx22AtqdcZhM", "question": "What is the yellow truck doing?", "choices": ["selling food", "extinguishing fire", "repairing ground", "delivering mail"], "correct_choice_idx": 0, "direct_answers": ["selling food", "selling food", "selling food", "selling food", "serving food", "parking", "serving food", "selling food", "selling food", "selling food"], "difficult_direct_answer": false, "rationales": ["The yellow vehicle is a food truck and sells things to eat.", "The truck is selling food.", "The truck has images of food all over it and a window where they can serve the food."], "image": "train2014/COCO_train2014_000000389932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209610, "question_id": "4MWXozj7Cwn7shs54GLBDd", "question": "What element is outside the physical reality of the photo?", "choices": ["caption", "emoji", "date", "name"], "correct_choice_idx": 3, "direct_answers": ["sky", "atmosphere", "time", "name", "ground", "rain", "copyright", "letters", "camera", "metal stands"], "difficult_direct_answer": true, "rationales": ["The identity of the photographer is beside a copyright symbol on the far left of the photo.", "The name wasn't really in the scene.", "The photo of the train station has a name and copyright printed on the photo which was not at the train station."], "image": "val2014/COCO_val2014_000000209610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535519, "question_id": "4MtjXu8Yne7HGPRJThaqGb", "question": "What color gamma is the picture in?", "choices": ["cold filter", "monochromatic", "full color", "sepia"], "correct_choice_idx": 1, "direct_answers": ["black white", "black", "black", "black white", "monochromatic", "high", "black white", "black", "black white", "black white"], "difficult_direct_answer": false, "rationales": ["The color is grayscale and only shows shades of black and white.", "The photo is black and white.", "There is only black and white in the picture."], "image": "val2014/COCO_val2014_000000535519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46014, "question_id": "4NA9YZPNSuKxEJjTcQXb9u", "question": "What would usually be where the pasta is?", "choices": ["eggs", "hamburger", "meatloaf", "hot dog"], "correct_choice_idx": 3, "direct_answers": ["hot dog", "hot dog", "hot dog", "hotdog", "hot dog", "hotdog", "hotdog", "hot dog", "hot dog", "hot dog"], "difficult_direct_answer": false, "rationales": ["This is a hot dog bun and normally there would be some kind of long meat in it", "There is a long bun.", "The shape and cut of the bread is the traditional look of a bratwurst or hot dog bun."], "image": "train2014/COCO_train2014_000000046014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573968, "question_id": "4NEJhmpv6MJmQgUmJCMuf2", "question": "What is the man going to do next?", "choices": ["jumping off", "flipping over", "lying down", "sitting down"], "correct_choice_idx": 0, "direct_answers": ["jump", "jump", "ski downhill", "jump", "jump", "jump off", "jump", "jump", "ski", "jumping off"], "difficult_direct_answer": false, "rationales": ["The skier is beginning to crouch to jump off the rail.", "The man will jump.", "The man wants to get off from the snow."], "image": "val2014/COCO_val2014_000000573968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128351, "question_id": "4NQ7LBSfpET8qhrayWFPGp", "question": "What bus company information is posted immediately above the license plate?", "choices": ["name", "website", "phone number", "specials"], "correct_choice_idx": 1, "direct_answers": ["brand name", "website", "megabus", "website", "megabus.com", "megabus", "megabus", "megabus", "megabus", "name"], "difficult_direct_answer": false, "rationales": ["It's on the bus and this is often the case.", "Their website is on the bus", "The website can be seen above it."], "image": "train2014/COCO_train2014_000000128351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334752, "question_id": "4NQBRZAmYcq5Qzk7wmvZgd", "question": "What zone is this street likely to be?", "choices": ["shopping", "tourist", "business", "residential"], "correct_choice_idx": 0, "direct_answers": ["city street", "inner city", "na", "fast", "city", "downtown", "city", "city", "downtown", "shopping"], "difficult_direct_answer": false, "rationales": ["The zone is for shopping.", "This area has a lot of stores in it.", "Storefronts are lit on a busy city street. there is shopping in cities."], "image": "train2014/COCO_train2014_000000334752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85472, "question_id": "4NbGB2ZdG2XiwNaygyPdFb", "question": "What color is the water?", "choices": ["gray", "white", "pink", "blue"], "correct_choice_idx": 0, "direct_answers": ["gray", "gray", "gray blue", "clear", "brown", "brown", "gray", "gray", "grey", "gray"], "difficult_direct_answer": false, "rationales": ["Since this is a black and white photo, the water certainly looks grey.", "The color of the water reflects the sky and it is a cloudy day.", "The color is gray."], "image": "train2014/COCO_train2014_000000085472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532565, "question_id": "4NiRZqwSm9RWogibyxVqZG", "question": "What is the most likely seriousness of this event?", "choices": ["funeral", "formal", "business casual", "informal"], "correct_choice_idx": 3, "direct_answers": ["unknown", "pizza party", "low", "unknown", "party", "jokingly", "low", "unknown party", "party", "informal"], "difficult_direct_answer": false, "rationales": ["The man is wearing a t-shirt. t-shirts are very casual.", "It's nothing serious since he is wearing a tshirt and they are being silly.", "The outfit is very causal."], "image": "train2014/COCO_train2014_000000532565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67347, "question_id": "4Nt2D6SZcCYSTbJDRaReRr", "question": "What number can be found on the train?", "choices": ["822", "753", "405", "982"], "correct_choice_idx": 1, "direct_answers": ["753", "753", "753", "seven-hundred fiftythree", "753", "753", "seven-hundred fiftythree", "753", "753", "753"], "difficult_direct_answer": false, "rationales": ["A plate with identifying numbers is on the back of a train.", "The numbers 753 appear on the back.", "You can see the number on the front of the train on a black plate."], "image": "train2014/COCO_train2014_000000067347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51012, "question_id": "4NyQ4qwPgQPWd9HtVs4sxN", "question": "Which country might you find the living replica of the item on the bed?", "choices": ["new zealand", "canada", "england", "germany"], "correct_choice_idx": 1, "direct_answers": ["canada", "canada", "america", "canada", "russia", "canada", "united states", "america", "canada", "canada"], "difficult_direct_answer": false, "rationales": ["The country has a lot of wilderness and the bear is native to its continent.", "I might find a living bear in canada.", "The stuffed animals are bears. real bears are found in north america."], "image": "train2014/COCO_train2014_000000051012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383704, "question_id": "4P8sSoFHupHyituj6vuc9k", "question": "What are the donuts getting placed in?", "choices": ["oil", "coke", "sprite", "water"], "correct_choice_idx": 0, "direct_answers": ["oil", "fryer", "fryer", "oil", "oil", "fryer", "hot grease", "fryer", "fryer", "fryer"], "difficult_direct_answer": false, "rationales": ["They are placed in oil to cook them.", "That's how donuts were created to be cooked.", "Donuts are made by frying the circular shaped dough. the donuts are being dropped into the liquid that is used for frying."], "image": "train2014/COCO_train2014_000000383704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577213, "question_id": "4PDpm2TCKSUEw6qJzctq29", "question": "What kind of weather is one likely to experience in this area?", "choices": ["tropical", "arid", "rainy", "cold"], "correct_choice_idx": 0, "direct_answers": ["winter", "rainy", "rain", "tropical", "rainy", "sunshine", "tropical", "tropical", "rainy", "rain"], "difficult_direct_answer": false, "rationales": ["Palm trees are seen in the area which only grow where it is warm year-round.", "The palm trees in the picture signify it is in a tropical biome.", "The weather is tropical."], "image": "train2014/COCO_train2014_000000577213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516727, "question_id": "4PJyEzoB3Xs3DzD8Z2KSQM", "question": "What is marine safety in the Coast Guard?", "choices": ["enforcement", "coast", "protection", "rescue"], "correct_choice_idx": 3, "direct_answers": ["water safety", "job", "us", "vessels", "inspecting vessels", "boat patrol", "coast guard", "mission", "rescue", "water safety"], "difficult_direct_answer": true, "rationales": ["There is a coast guard boat where the people will look for those that need help.", "A coast guard boat is in the water. coast guard does rescue.", "The boat is used to help people. it has a lifesaver on the side."], "image": "val2014/COCO_val2014_000000516727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314616, "question_id": "4PbEpETA8pwdpThbHerD9V", "question": "What is happening in the photo?", "choices": ["thunderstorm", "flooding", "cow showering", "raining"], "correct_choice_idx": 2, "direct_answers": ["bathing animals", "rain", "raining", "cleaning", "cleaning", "cow showering", "washing", "bath", "cows washed", "feeding"], "difficult_direct_answer": true, "rationales": ["The cow is showering.", "Cows are being sprayed by water.", "There is a hose on the ground and everything is wet after being sprayed."], "image": "val2014/COCO_val2014_000000314616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409855, "question_id": "4PfqGydtnzdWg7c2PtR4Pp", "question": "Where is this kitchen located?", "choices": ["restaurant", "hospital", "school", "home"], "correct_choice_idx": 3, "direct_answers": ["neat kitchen", "apartment", "in apartment", "apartment", "home", "home", "apartment", "apartment kitchen", "apartment", "apartment"], "difficult_direct_answer": false, "rationales": ["This is a residential kitchen.", "The kitchen is at home.", "This kitchen is too small to be a restaurant, school, or hospital kitchen. personal belongings are on the counter."], "image": "val2014/COCO_val2014_000000409855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480583, "question_id": "4Pyq7vbR3HdymNeFrhjYuT", "question": "What restaurant do these donuts come from?", "choices": ["starbucks", "krispy kreme", "dunkin donuts", "tim hortons"], "correct_choice_idx": 1, "direct_answers": ["krispy kreme", "bakery", "bakery", "krispy kreme", "krispy kreme", "dunking donuts", "bakery", "bakery", "bakery", "bakery"], "difficult_direct_answer": false, "rationales": ["These donuts are made by krispy kreme.", "The signs are green.", "Rows of donuts are in a display case on white trays."], "image": "val2014/COCO_val2014_000000480583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74555, "question_id": "4QJPfcae9uPCxCASytHVHm", "question": "What makes these objects worthy to put behind glass?", "choices": ["color", "age", "size", "shape"], "correct_choice_idx": 1, "direct_answers": ["vase", "handmade", "rarity beauty", "culture", "artifact", "artifacts", "artifacts", "age", "antiques", "high value"], "difficult_direct_answer": true, "rationales": ["These objects are part of a museum display and are held behind glass.", "The pottery is ancient.", "The items look like ancient pottery based on the decorative motifs."], "image": "train2014/COCO_train2014_000000074555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481239, "question_id": "4QJZj37Xw3mShpB2pnSFCP", "question": "Near what type of area do the people here wait?", "choices": ["snow field", "ocean", "woods", "salt flats"], "correct_choice_idx": 1, "direct_answers": ["beach", "beach", "beach", "sand", "beach", "beach", "beach", "ocean", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["They are on the coast on a beach and there is a surfboard", "There is sand near the water.", "The people are standing on the sand of a beach that is near the ocean."], "image": "val2014/COCO_val2014_000000481239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387599, "question_id": "4QM5sjp59QgVorYF9QAVkR", "question": "What is in the center?", "choices": ["baby", "cat", "flower", "poster"], "correct_choice_idx": 2, "direct_answers": ["flower", "flowers", "vase", "flowers", "flowers", "flowers", "flowers", "flowers", "flowers", "flower jug"], "difficult_direct_answer": false, "rationales": ["Flowers are in the center.", "The center has flowers.", "The vase is located exactly between the other objects."], "image": "train2014/COCO_train2014_000000387599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494218, "question_id": "4QrgnFGRT7fuBxrzsv3NAG", "question": "Where is the setting in this photo?", "choices": ["salad bar", "restaurant", "juice shop", "apartment"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "apartment", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["The decor is not of a professional kitchen based on the amenities and the tv stand in the background and the space he is working in. the other place one cooks that would look like this would be answer a.", "Looks to be in someones house.", "You can see a stereo and a coffee table in the background so more than likely someone's house."], "image": "val2014/COCO_val2014_000000494218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304445, "question_id": "4Qsg4LBqjRUYj8idi5C57P", "question": "What demographic most likely lives in this area?", "choices": ["black", "white", "hispanic", "asian"], "correct_choice_idx": 3, "direct_answers": ["korean", "americans", "middle class", "koreans", "asian", "asian", "adults", "young adults", "workers", "asians"], "difficult_direct_answer": true, "rationales": ["The demographic is asian.", "The people visible in the image are all or mostly asian as are the people in the marketing on the bus. marketing usually reflects the target demographic of the area.", "The writing is in korean."], "image": "train2014/COCO_train2014_000000304445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302713, "question_id": "4R7mzH53gd9i4fNm8pBG32", "question": "Why are the license plates invisible?", "choices": ["blurred", "on front", "new cars", "stolen"], "correct_choice_idx": 0, "direct_answers": ["privacy", "blurred", "blurred", "blurred", "blurred out", "blur", "blurred out", "privacy", "blurred out", "identification"], "difficult_direct_answer": false, "rationales": ["You can't see the numbers clearly.", "This is a common method in order to hide personal information, and not of the text is visible through artificial means.", "There is a license plate, but it has been blurred out or distorted."], "image": "train2014/COCO_train2014_000000302713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517251, "question_id": "4RQ7hTXzGeAsqKVM7NW9q3", "question": "Why is he holding several tennis balls?", "choices": ["bombard opponent", "standard gameplay", "practicing serve", "prevent theft"], "correct_choice_idx": 2, "direct_answers": ["showing off", "practicing serve", "practicing", "trick shot", "for practice", "to serve", "speed serving", "practicing", "practicing", "keeping extras"], "difficult_direct_answer": false, "rationales": ["The man is holding several tennis balls so he can hit them consecutively while practicing his serve.", "If this was a real game he would only be serving one ball.", "The racquet is used to hit the balls for the game of tennis. he is holding more than one to make several shots."], "image": "train2014/COCO_train2014_000000517251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524255, "question_id": "4RbrDDFVtd3fD3ND85VsCd", "question": "Judging from evidence in the picture what has the horse most likely received?", "choices": ["apples", "money", "oats", "training"], "correct_choice_idx": 3, "direct_answers": ["applause", "training", "training", "stability", "training", "awards", "training", "training", "training", "treat"], "difficult_direct_answer": false, "rationales": ["Horses have to be trained to do what is being depicted in the photo, because they are normally wild and would buck them off.", "The horse has had to have been taught to do a trick like this.", "It would have had to receive a to stay calm while they stand on it."], "image": "train2014/COCO_train2014_000000524255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512967, "question_id": "4RoLLERueawCgXiVYsyMVa", "question": "What pair of devices are mounted on the wall and in the window sill?", "choices": ["intercom", "walkie-talkie", "speaker", "radio"], "correct_choice_idx": 2, "direct_answers": ["speakers", "speakers", "speaker", "speakers", "speaker", "speakers", "speakers", "speakers", "dehumidifiers", "speaker"], "difficult_direct_answer": false, "rationales": ["Speakers take the form of the object on the wall.", "There are boxes behind the wall above the couch. these are used to listen to sound coming out of it.", "There are two speakers mounted on the wall of the living room and the window sill."], "image": "train2014/COCO_train2014_000000512967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399618, "question_id": "4RscNXx4Jy4J5ZRarTZYZk", "question": "What type of plate material is this dish being served upon?", "choices": ["plastic", "metal", "ceramic", "wood"], "correct_choice_idx": 2, "direct_answers": ["ceramic", "pottery", "ceramic", "glass", "ceramic", "ceramic", "china", "ceramic", "ceramic", "glass"], "difficult_direct_answer": false, "rationales": ["The dish is ceramic since it's been glazed and fired.", "A shiny, glass container is being used to serve food.", "A white shiny plate is under food. dishes are often ceramic."], "image": "train2014/COCO_train2014_000000399618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331904, "question_id": "4RsyxURrXLjF5Dymxo2wie", "question": "Why are the people standing behind the yellow line?", "choices": ["to dance", "safety", "to race", "it's wet"], "correct_choice_idx": 1, "direct_answers": ["safety", "waiting", "safety", "safety", "waiting", "safety", "safety", "waiting", "waiting", "waiting"], "difficult_direct_answer": false, "rationales": ["That where they are supposed to stand so they don't get hurt.", "The people are behind the line because in front of it they could get hit by a train.", "The lines are used to warn people from falling."], "image": "train2014/COCO_train2014_000000331904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46508, "question_id": "4SC3kVu4QzHdp7AYmcy9Pj", "question": "What would someone have to do to get to ride this elephant?", "choices": ["ask", "pay", "feed it", "hail it"], "correct_choice_idx": 1, "direct_answers": ["pay money", "pay", "pay", "pay owner", "pay money", "pay", "earn trust", "climb up", "ladder", "climb up"], "difficult_direct_answer": false, "rationales": ["The person on the elephant works for a company as he is wearing a uniform.", "This is a taxi or entertainment type of ride", "This man makes a living giving rides on his elephant."], "image": "val2014/COCO_val2014_000000046508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167551, "question_id": "4SCkLgeouKF3AihAfnb7h7", "question": "How many people wearing tan pants and black shirts are seen here?", "choices": ["two", "five", "one", "six"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "five", "one", "four", "five", "one", "one", "one", "five one"], "difficult_direct_answer": false, "rationales": ["It is the same person, but it is a photo of them throughout the movements.", "Five people are wearing tan pants and black shirts.", "The person wearing the pants is just being duplicated into multiple images."], "image": "train2014/COCO_train2014_000000167551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124873, "question_id": "4SCqyNJXvvDxziQLmYYYJW", "question": "What action did the girl take?", "choices": ["throw", "fall", "roll", "jump"], "correct_choice_idx": 0, "direct_answers": ["threw ball", "threw frisbee", "extended arm", "throw frisbee", "toss frisbee", "throw", "throw", "throw", "throw frisbee", "throw"], "difficult_direct_answer": false, "rationales": ["The girl is throwing.", "The girl tossed the frisbee.", "The girl has extended her arm and opened her palm as if she released a ball."], "image": "val2014/COCO_val2014_000000124873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111492, "question_id": "4SFZpJuWcSdn6ZEewUacNc", "question": "What looks like it could happen any minute?", "choices": ["tornado", "sunshine", "rain", "fireworks"], "correct_choice_idx": 2, "direct_answers": ["boat escape", "rain", "rain", "rain", "rain", "rain", "truck moves", "raining", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The skies are darkening, and the clouds are heavy and low as if a downpour will soon begin.", "The sky will rain.", "The sky is grey and clouds are gathering overhead. rain tends to fall when such conditions are visible. the climate looks suited for such phenomenon."], "image": "train2014/COCO_train2014_000000111492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434047, "question_id": "4SHZUfAF7pNGvnAxghSN7m", "question": "What do the boys need to put on their toothbrushes before brushing?", "choices": ["fruit", "grease", "food", "toothpaste"], "correct_choice_idx": 3, "direct_answers": ["toothpaste", "toothpaste", "toothpaste", "toothpaste", "toothpaste", "toothpaste", "toothpaste", "toothpaste", "toothpaste", "toothpaste"], "difficult_direct_answer": false, "rationales": ["They are brushing their teeth.", "In order to clean your teeth you need to use toothpaste", "Toothpaste has to be used to brush teeth properly."], "image": "train2014/COCO_train2014_000000434047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55727, "question_id": "4SKwKq2xfsBQnNbaUSJwVX", "question": "How will an occupant be able to submerge themselves completely in water?", "choices": ["bathtub", "sink", "shower", "toilet"], "correct_choice_idx": 0, "direct_answers": ["bathtub", "take bath", "tub", "bathtub", "bathtub", "bathe", "bathtub", "bathtub", "bathtub", "bathtub"], "difficult_direct_answer": false, "rationales": ["They would be able to fill this and climb into it.", "The bathtub will have a lot of water.", "It can hold a lot of water."], "image": "val2014/COCO_val2014_000000055727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193547, "question_id": "4SLZZdMz8LdVbcARxfATyT", "question": "Who does the person most look like?", "choices": ["tim duncan", "maria sharapova", "rick moranis", "serena williams"], "correct_choice_idx": 2, "direct_answers": ["rick moranis", "human", "woody allen", "man", "monk", "unknown", "jared", "man", "boss", "penn"], "difficult_direct_answer": true, "rationales": ["The person looks like moranis.", "The person is a white man. he is wearing glasses.", "Rick moranis always wears this facial expression."], "image": "val2014/COCO_val2014_000000193547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327663, "question_id": "4SQBkvURaTwP8GcUC92s2g", "question": "What is the group of people being watched by police likely doing?", "choices": ["protesting", "enlisting", "dancing", "shopping"], "correct_choice_idx": 0, "direct_answers": ["protesting", "protesting", "stand", "protesting", "protesting", "protesting", "protesting", "protest", "protesting", "protest"], "difficult_direct_answer": false, "rationales": ["The people are protesting since they're in a mob.", "The people are protesting.", "The police are holding back a crowd of people."], "image": "val2014/COCO_val2014_000000327663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534428, "question_id": "4STkx7vD5NpP2DmG9bAXU4", "question": "In which country are these clear plastic umbrellas commonly used?", "choices": ["thailand", "south korea", "japan", "china"], "correct_choice_idx": 2, "direct_answers": ["england", "japan", "japan", "japan", "china", "japan", "china", "germany", "china", "france"], "difficult_direct_answer": false, "rationales": ["People stand in a sidewalk all holding umbrellas .", "That is the color that country represents.", "The japanese love these types of umbrellas."], "image": "val2014/COCO_val2014_000000534428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561766, "question_id": "4SZ4TodzG4BtQnsxHiqdAq", "question": "What is the trail created by the boat in the water called?", "choices": ["flood", "eruption", "draft", "wake"], "correct_choice_idx": 3, "direct_answers": ["wake", "waves", "wake", "waves", "wake", "wake", "wake", "wake", "wake", "ship's wake"], "difficult_direct_answer": false, "rationales": ["The boat has waves following it in the water from its movement.", "There are small waves that follow the boat caused by the twirling of the motor blade.", "The trail behind the boat is the wake of its travel."], "image": "val2014/COCO_val2014_000000561766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344805, "question_id": "4Sdmor4cSjDb7SuoCUXDE2", "question": "What are the people using?", "choices": ["basketball", "laptop", "refrigerator", "sink"], "correct_choice_idx": 1, "direct_answers": ["laptop", "laptop", "computers", "laptops", "laptops", "laptops", "laptop", "computers", "laptops", "laptops"], "difficult_direct_answer": false, "rationales": ["The people are using computers that sit on their laps.", "They have portable computers on their laps", "Both people are using portable apple computers."], "image": "train2014/COCO_train2014_000000344805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326504, "question_id": "4SihVBJjCzbtapsCgPpckf", "question": "Why is she facing away from the others?", "choices": ["inadequate space", "hiding", "privacy", "cleaning fingers"], "correct_choice_idx": 2, "direct_answers": ["privacy", "eating", "not friends", "strangers", "shy", "privacy", "by herself", "social distance", "wants privacy", "strangers"], "difficult_direct_answer": false, "rationales": ["She probably doesn't know the others and needs a rest", "She is looking down at something and doesn't appear to be wanting to interact with others.", "There is only room for three people on the other side of the bench."], "image": "train2014/COCO_train2014_000000326504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461543, "question_id": "4SksX6LMKfVcSTLatX22mN", "question": "Which seasonal Olympic game is skateboarding?", "choices": ["winter", "summer", "spring", "autumn"], "correct_choice_idx": 1, "direct_answers": ["summer", "summer", "summer", "summer", "summer", "summer", "summer", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["Skateboarding is a summer sport.", "Skateboarding is not cold weather, but a hot weather sport, corresponding to the season in option a.", "There are only two olympic seasons and skateboarding could not take place in the snow."], "image": "train2014/COCO_train2014_000000461543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164569, "question_id": "4SsW8NtbMcTvWATLnQhQ5h", "question": "What entity most likely owns the tallest building pictured?", "choices": ["willis", "chrysler", "sears", "hsbc"], "correct_choice_idx": 3, "direct_answers": ["church", "city", "hsbc", "government", "bank", "hsbc", "government", "government", "government", "hsbc"], "difficult_direct_answer": false, "rationales": ["The entity is hsbc.", "The sign on the tallest building indicates who likely owns it.", "Their name is on the top of the building."], "image": "train2014/COCO_train2014_000000164569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404652, "question_id": "4StW8jcSrXso9BZAPk3X9E", "question": "What does the stove use to heat food?", "choices": ["natural gas", "electricity", "electromagnetic technology", "fire"], "correct_choice_idx": 0, "direct_answers": ["coil", "gas", "coil", "coil", "gas", "gas", "gas", "gas", "natural gas", "gas"], "difficult_direct_answer": false, "rationales": ["The stove has gas burners for fire to come out.", "The burners on the stove's surface are the type that dispense flames. natural gas is most commonly used to create the element of fire when it comes to stoves.", "The stove uses natural gas."], "image": "val2014/COCO_val2014_000000404652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236747, "question_id": "4TVja49We2kG3amAFZoJRe", "question": "What work is done in this space?", "choices": ["coding", "machine shop", "cooking", "typing"], "correct_choice_idx": 1, "direct_answers": ["steel working", "mechanic", "machine shop", "metal work", "welding", "mechanic", "building", "metal work", "grooming", "machining work"], "difficult_direct_answer": false, "rationales": ["There are tools on the work surface.", "The large metal power tools identifies this location as a metal working shop.", "This space is where machines can be built."], "image": "val2014/COCO_val2014_000000236747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461868, "question_id": "4Tc7ehWKjb8MSDKzm8wzc5", "question": "What is needed for this activity?", "choices": ["ice", "water", "sun", "wind"], "correct_choice_idx": 3, "direct_answers": ["wind", "wind", "strings", "wind", "wind", "wind", "strings", "sky", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["Kites fly in the air and need wind to keep them up.", "To fly a kite there needs to be a windy environment.", "Kites fly through the air."], "image": "train2014/COCO_train2014_000000461868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94625, "question_id": "4U6Tj9tteoxPR3EhBZt3TK", "question": "These walls have a similar color scheme to what place?", "choices": ["nathan's", "arthur treacher's", "popeye's", "kfc"], "correct_choice_idx": 3, "direct_answers": ["burger joint", "pc richards", "fast food", "walmart", "kfc", "building", "train colour", "target", "gas station", "target"], "difficult_direct_answer": true, "rationales": ["Kfc has a red and white color scheme.", "Kfc is a chicken restaurant with the same color scheme.", "The colors are like kfc because you red and grey on the building"], "image": "train2014/COCO_train2014_000000094625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64116, "question_id": "4U6urZut3pg5dQj2iJ7jEj", "question": "What is the person called who would take care of the sheep?", "choices": ["zookeeper", "breeder", "manager", "shepard"], "correct_choice_idx": 3, "direct_answers": ["shepherd", "shepherd", "shepard", "herder", "sheepherder", "shepherd", "farmer", "shepard", "shepherd", "herder"], "difficult_direct_answer": false, "rationales": ["Shepherds are individuals who are tasked with taking care of sheep. this is a know fact.", "The person is a shepard.", "Shepards take care of sheep. a large flock of sheep is in grassy area."], "image": "val2014/COCO_val2014_000000064116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276750, "question_id": "4U9RVBc5x7zKxCbUBgZ2QY", "question": "What is starting here?", "choices": ["snow", "fight", "rain", "sleet"], "correct_choice_idx": 2, "direct_answers": ["rainstorm", "rain", "rain", "rain", "rain", "joke", "rain", "rain", "rain", "love"], "difficult_direct_answer": false, "rationales": ["The rain is starting.", "Droplets of water can be seen on the top of the umbrella.", "The couple is under an umbrella which means they are protecting themselves for rainfall."], "image": "train2014/COCO_train2014_000000276750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437900, "question_id": "4UCC5wwtkz9xiYTrtgJN9X", "question": "What is the name given to the type of meat gotten from the animals above?", "choices": ["mutton", "beef", "chicken", "pork"], "correct_choice_idx": 0, "direct_answers": ["goat", "mutton", "mutton", "mutton", "mutton", "mutton", "mutton", "lamb", "mutton", "sheep"], "difficult_direct_answer": false, "rationales": ["They are sheep.", "Mutton comes from sheep.", "The animals are sheep, not pigs, cows, or chickens."], "image": "train2014/COCO_train2014_000000437900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448694, "question_id": "4UNrfaYVdxxyKWtTBAC2y2", "question": "The object they are reaching for resembles what?", "choices": ["cone", "car", "bucket", "flying saucer"], "correct_choice_idx": 3, "direct_answers": ["frisbee", "plate", "circle", "frisbee", "flying saucer", "disc", "frisbee", "frisbee", "disc", "plate"], "difficult_direct_answer": false, "rationales": ["The object is a flat disk shape like a ufo.", "The players are reaching for a frisbee that looks like a flying saucer because it is flat.", "They are reaching for a frisbee. it is a disc that glides."], "image": "val2014/COCO_val2014_000000448694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73843, "question_id": "4Un6CApK6rrecj7kdZeewm", "question": "The train gliding on what in order to move?", "choices": ["rails", "wheels", "bridge", "station"], "correct_choice_idx": 0, "direct_answers": ["tracks", "track", "tracks", "tracks", "tracks", "rails", "tracks", "tracks", "track", "track"], "difficult_direct_answer": false, "rationales": ["There are rails there for the train to run on.", "The train is on rails.", "An elevated train is seen moving along a metal bridge. trains run along tracks or rails."], "image": "train2014/COCO_train2014_000000073843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500474, "question_id": "4UrmgTTXrNC2KvLPieE8Y9", "question": "How was this pie most likely cut?", "choices": ["fork", "pizza cutter", "spoon", "spatula"], "correct_choice_idx": 1, "direct_answers": ["pizza cutter", "pizza cutter", "slices", "slicer", "roller", "pizza cutter", "pizza cutter", "pizza cutter", "in half", "eighths"], "difficult_direct_answer": false, "rationales": ["It was mostly likely cut with a pizza cutter because the pieces are sliced perfectly.", "This is a pizza and is cut by a pizza cutter.", "There is a special tool to cut the pizza."], "image": "train2014/COCO_train2014_000000500474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529668, "question_id": "4UzQKCRrxABPMKzLjBqACG", "question": "What part of the outfit did the woman expect to stand out?", "choices": ["her dress", "her coat", "handbag", "her stockings"], "correct_choice_idx": 2, "direct_answers": ["handbag", "skirt", "purse", "purse", "center", "stockings", "purse", "purse", "open curve", "purse"], "difficult_direct_answer": false, "rationales": ["The woman is wearing black except for the handbag which is bright colored. to stand out and attract attention, bright colors would be employed and the only thing that is bright is the bag.", "She has a handbag in her hand.", "The bag is sticking out from the dark."], "image": "val2014/COCO_val2014_000000529668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17559, "question_id": "4UzhvaCnVC7o3AUAi5rJxE", "question": "What type of medium is the woman reading?", "choices": ["book", "diary", "magazine", "kindle"], "correct_choice_idx": 2, "direct_answers": ["magazine", "magazine", "magazine", "magazine", "magazine", "magazine", "hot pants", "magazine", "magazine", "hot pants"], "difficult_direct_answer": false, "rationales": ["The medium is a magazine.", "The woman is reading a magazine on the bench.", "It is a printed, not digital, medium. it is too thin to be a book or diary."], "image": "train2014/COCO_train2014_000000017559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488033, "question_id": "4V8umnbeKY45eeV7oqjTih", "question": "What does the man have in his hand?", "choices": ["cane", "dollar bill", "can", "whistle"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "cane umbrella", "hat", "umbrella", "umbrella", "cane", "umberrla", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["The man has a cane.", "The shape is all we need to know to answer this.", "This man has various things in either hand however a cane is the only of these options which he is holding."], "image": "train2014/COCO_train2014_000000488033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282768, "question_id": "4VN3Wy8WXicsAT2bDm3MNL", "question": "What kind of dog is it?", "choices": ["service dog", "pet", "farm dog", "strayed dog"], "correct_choice_idx": 1, "direct_answers": ["black lab", "pet", "black", "labrador", "lab", "black lab", "labrador", "black shepherd", "black dog", "black lab"], "difficult_direct_answer": false, "rationales": ["The dog has a collar and is staying with the man.", "The dog is wearing a collar, is not wearing a vest, and is in an urban area.", "The dog is a pet."], "image": "val2014/COCO_val2014_000000282768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400298, "question_id": "4VU7LtSEHzavtsCwmuEHjA", "question": "How many more people can fit on the bench?", "choices": ["four", "six", "none", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "two", "two", "two", "one", "one", "two", "lots", "two", "two"], "difficult_direct_answer": false, "rationales": ["The bench appears to comfortably hold three people and there are already two people seated.", "Two more bodies could conceivably fit on the bench, but it would be very uncomfortable for all involved, so, just one more would be fine.", "Possibly two or up to three small children."], "image": "train2014/COCO_train2014_000000400298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122955, "question_id": "4VaLrezi557KVnwppGheJc", "question": "What kind of pattern is on this short haired man's tie?", "choices": ["dots", "handkerchief", "rainbow", "rag"], "correct_choice_idx": 2, "direct_answers": ["striped", "squiggles", "striped", "rainbow", "striped", "stripes", "striped", "stripes", "striped", "rainbow"], "difficult_direct_answer": false, "rationales": ["The man has stripes on his tie. the stripes are of all different colors.", "A man wears a tie with stripes of different colors.", "The tie has lines, not dots, in a wide variety of colours."], "image": "train2014/COCO_train2014_000000122955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234614, "question_id": "4VetazewJLcsjc6mxdpGqd", "question": "What vegetable is bundled together?", "choices": ["asparagus", "broccoli", "celery", "onion"], "correct_choice_idx": 3, "direct_answers": ["oranges", "scallions", "orange", "oranges", "onions", "green onions", "oranges", "oranges", "onions", "onion"], "difficult_direct_answer": false, "rationales": ["There is a rubber band around the green vegetables.", "The onion is tied up.", "Fruit and vegetables are in a produce net typical of onions being sold at a market."], "image": "val2014/COCO_val2014_000000234614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386908, "question_id": "4WcH5CitUAnohbZLVRwX8p", "question": "What type of bike is this?", "choices": ["chopper", "tandem", "mountain", "electric"], "correct_choice_idx": 0, "direct_answers": ["motorcycle", "bike", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "chopper", "motorbike", "roadster", "motorcycle"], "difficult_direct_answer": false, "rationales": ["A nickname for motorcycles is known as a chopper.", "Traditionally motorcycles are also called \"choppers\".", "These bikes are gas powered bikes, they are driven in mountains but are not classified as mountain bikes. they can also be riden tandem but do not possess a second set of pedals so they are not tandem bikes. this is a chopper which is a type of motorcycle."], "image": "val2014/COCO_val2014_000000386908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415904, "question_id": "4WtsWeS9dyhAudRCips6WR", "question": "The bikes here belong to whom?", "choices": ["racers", "no one", "santa", "children"], "correct_choice_idx": 0, "direct_answers": ["slipstream sports", "bikers", "slipstream sports", "driver", "bikers", "splitstream", "team members", "slipstream", "racers", "cyclists"], "difficult_direct_answer": false, "rationales": ["The bikes are loaded carefully on a rack on the top of a car that has a sports team's insignia on it. the car is near an event.", "The bikes are on top of a racing team's car.", "The care is heavily branded with a sport team's information. there are also a lot of spectators, making this an event of some kind."], "image": "val2014/COCO_val2014_000000415904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8356, "question_id": "4XWjRkpRFhnzCn73KBcxPY", "question": "What does the big white object do for the food inside?", "choices": ["keep cool", "grind up", "warm up", "melt"], "correct_choice_idx": 0, "direct_answers": ["keep cold", "keep cool", "grilling", "keep cold", "freeze", "freeze", "freeze it", "keep cold", "freezer", "chill"], "difficult_direct_answer": false, "rationales": ["The big white object is a freezer used to store foods that don't fit in the kitchen.", "It is a freezer and that is the purpose of a freezer.", "This is a deep freezer that you can put frozen food in to keep cold."], "image": "train2014/COCO_train2014_000000008356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482158, "question_id": "4XbYhLSUd6fdLQnGX3RPbG", "question": "What happened in this town square?", "choices": ["tornado", "play", "parade", "storm"], "correct_choice_idx": 2, "direct_answers": ["party", "festivities", "celebration", "parade", "litter", "storm passed", "celebration", "festival", "festival parade", "party"], "difficult_direct_answer": false, "rationales": ["One can see the remains of the confetti and ribbons from the celebration.", "Confetti and streamers are all over the street showing that an outdoor celebration previously took place here.", "There is colorful debris on the ground."], "image": "val2014/COCO_val2014_000000482158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50885, "question_id": "4XnzfmabBw5QLMyYV4XeE3", "question": "What's the name for the type of shirt the man in blue is wearing?", "choices": ["jersey", "tank top", "polo", "button up"], "correct_choice_idx": 0, "direct_answers": ["jersey", "jersey", "jersey", "basketball shirt", "sport shirt", "jersey", "jersey", "jersey", "jersey", "jersey"], "difficult_direct_answer": false, "rationales": ["These tops are worn by athletes and the fans who want to rep their team.", "The name is a jersey.", "A guy is playing video game with a blue shirt with a number on it."], "image": "train2014/COCO_train2014_000000050885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410724, "question_id": "4Xpcv8U6UEzNWwNice2eMy", "question": "What can be readily done with the metal object near the cat?", "choices": ["start car", "shoot bullets", "knitting", "cut hair"], "correct_choice_idx": 3, "direct_answers": ["cut paper", "cut", "cutting", "cutting", "cut", "cut", "cut", "cut", "cut hair", "cutting"], "difficult_direct_answer": false, "rationales": ["These can be used to make the growth on a head shorter.", "There are metal scissors beside the cat sculpture and scissors are meant to cut things.", "The cat can get a haircut."], "image": "val2014/COCO_val2014_000000410724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246057, "question_id": "4Y7Rq5fmcr8W3bGP9szdRW", "question": "Why is the man wearing a yellow sweatshirt?", "choices": ["visibility", "as prank", "fashion", "amusement"], "correct_choice_idx": 0, "direct_answers": ["keep warm", "visibility", "visibility", "cold", "stay warm", "worker", "visibility", "safety sweatshirt", "personal taste", "visibility"], "difficult_direct_answer": false, "rationales": ["It is fluorescent colored on purpose for workers for people to be a ke to see them.", "He is wearing a bright yellow shirt to be seen for safety.", "The man is wearing a yellow sweatshirt to make sure people see him."], "image": "val2014/COCO_val2014_000000246057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382873, "question_id": "4Y8rH9avkuvKGuZ2X3MdN7", "question": "What is the chair on top of the elephant called?", "choices": ["saddle", "gondola", "howdah", "chaise"], "correct_choice_idx": 2, "direct_answers": ["platform", "howdah", "box", "howdah", "seat", "elephant seat", "platform", "howdah", "sofa", "sitting purpose"], "difficult_direct_answer": false, "rationales": ["This is the arabic word for a bed on a camel.", "Elaborate chairs are on top of saddles on elephants.", "Chairs that go on top of elephants are called howdahs."], "image": "train2014/COCO_train2014_000000382873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434657, "question_id": "4YKCLpwtKpgxoTYkQ5vXXV", "question": "Why are her hands in the air?", "choices": ["pointing", "clapping", "dancing", "maintain balance"], "correct_choice_idx": 3, "direct_answers": ["for balance", "balance", "balance", "balance", "maintain balance", "for balance", "balance", "for balance", "posing", "maintain balance"], "difficult_direct_answer": false, "rationales": ["The woman is trying to stand up on her board.", "The woman doesn't want to fall over into the water.", "It's very important to maintain balance in her position. it's clear that she is not pointing or clapping and there would be no reason for her to dance in the water."], "image": "val2014/COCO_val2014_000000434657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264128, "question_id": "4YcGKqWbpdNypYxe5QtmrW", "question": "What sport is being played?", "choices": ["soccer", "rugby", "cricket", "ultimate frisbee"], "correct_choice_idx": 3, "direct_answers": ["frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "ultimate frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The man has one in his hand and other people are standing around waiting", "There is a frisbee being held by someone looking to throw it and a person standing nearby in a defensive position. for two people and a frisbee to be in these positions they would be playing answer a.", "The player is holding a disc in his hand, which would be a piece of equipment for ultimate frisbee."], "image": "train2014/COCO_train2014_000000264128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277857, "question_id": "4Yj3EAmw3WrDrwfDoRbKcT", "question": "Estrogen and Progesterone are responsible for which feeling?", "choices": ["aches", "happy", "craving", "anger"], "correct_choice_idx": 2, "direct_answers": ["mood", "strong", "craving", "mood", "emotions", "love", "positive feelings", "fullness", "happiness", "happiness"], "difficult_direct_answer": false, "rationales": ["One is a feel good hormone and one is a mood hormone.", "The effects of high or low levels of estrogen and progesterone can lead to emotional distress.", "They play a part in a female's sex drive."], "image": "train2014/COCO_train2014_000000277857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403521, "question_id": "4Z5ziDjjyxaop9C9MhiCvG", "question": "Why does the clock show different times?", "choices": ["dead battery", "for fun", "different countries", "as prank"], "correct_choice_idx": 2, "direct_answers": ["time zones", "different timezones", "different countries", "time zones", "different countries", "different countries", "time zones", "time zones", "timezones", "different timezones"], "difficult_direct_answer": false, "rationales": ["The clock shows different time zones.", "There are country's names on each of the clock faces and the saying \"home away from home\". these elements indicate the times are showing for the different countries.", "The clocks are labelled ghana, germany, canada, and u.s.a."], "image": "train2014/COCO_train2014_000000403521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81390, "question_id": "4ZHL5xoCXMgs8qqqKPx9qw", "question": "What distinguishes the animals above from the rest?", "choices": ["shortest", "fastest", "tallest", "browsers"], "correct_choice_idx": 2, "direct_answers": ["long neck", "height", "height", "neck", "long neck", "tall neck", "long neck", "long necks", "tallest", "long necks"], "difficult_direct_answer": false, "rationales": ["Giraffes are shown and are taller than most other animals.", "The giraffe is tall.", "The animals are believe to be tallest in the whole world."], "image": "val2014/COCO_val2014_000000081390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563645, "question_id": "4ZLR3HRjWHdX6HYZGfDMw9", "question": "What is the red kite near the two woman shaped like?", "choices": ["parrot", "crab", "ladybug", "ball"], "correct_choice_idx": 2, "direct_answers": ["ladybug", "lady bug", "ladybug", "ladybug", "lady bug", "ladybug", "bug", "ladybug", "air flow", "ladybug"], "difficult_direct_answer": false, "rationales": ["The kite has a ladybug on it.", "You can tell by the color and spots as to what the kite looks like.", "The kite is shaped like a red bug with black spots on it."], "image": "train2014/COCO_train2014_000000563645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79303, "question_id": "4ZaTnuveRUHsZtNNsqANJb", "question": "Why does the coloring in the signage appear uneven and different at top than bottom?", "choices": ["uniquely painted", "light glare", "art display", "sun faded"], "correct_choice_idx": 3, "direct_answers": ["vandalism", "lighting", "vandals", "scratched off", "light", "light", "camera flash", "sun faded", "light reflection", "camera flash"], "difficult_direct_answer": false, "rationales": ["The flash of a camera can change the color and dimness of items, especially on a metallic finish like these signs.", "A sun is lighter on the top. sun fades paint.", "Constant exposure to light can cause a change in color."], "image": "train2014/COCO_train2014_000000079303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536806, "question_id": "4ZcRAVyBH2wp4xTsi55yje", "question": "Which sentient beings are able to swim?", "choices": ["bugs", "humans", "goat", "birds"], "correct_choice_idx": 1, "direct_answers": ["humans", "humans", "human", "humans", "humans", "humans", "cows", "humans", "people", "humans"], "difficult_direct_answer": false, "rationales": ["The humans on the beach are the sentient beings that are able to swim.", "The humans in the image are the only ones that know how to swim.", "Some of them are in or near the water."], "image": "train2014/COCO_train2014_000000536806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302823, "question_id": "4Zvjc5fijSJawXLizy336J", "question": "Who made the donuts?", "choices": ["kfc", "dunkin donuts", "children", "cafeteria"], "correct_choice_idx": 1, "direct_answers": ["baker", "dunkin donuts", "dunkin donuts", "dunkin", "dunkin", "dunkin donuts", "dunkin", "dunkin donuts", "dunkin donuts", "dunkin donuts"], "difficult_direct_answer": false, "rationales": ["Next to the donuts is a white cup where part of the logo can be seen of the company that presumably made the donuts.", "The cup has a dunkin' donuts color and design on it is it makes sense the donut came from there too.", "The coffee cup has the brand on it."], "image": "val2014/COCO_val2014_000000302823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262394, "question_id": "4a4EfzjNwrY3hqjnYtWd6z", "question": "What is this type of motorcycle known as?", "choices": ["minibike", "dirt bike", "scooter", "cruiser"], "correct_choice_idx": 3, "direct_answers": ["biker", "harley", "harley", "harley davidson", "harley", "cruiser", "harley", "harley", "harley", "cruiser"], "difficult_direct_answer": false, "rationales": ["Motorcycles that are cruisers are extra sleek.", "The motorcycle can cruise from one place to another.", "It has a low profile."], "image": "val2014/COCO_val2014_000000262394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334217, "question_id": "4a5zjQuivrbK97MmrwiiHr", "question": "What is clouding up the image?", "choices": ["snow", "smoke", "fog", "rain"], "correct_choice_idx": 1, "direct_answers": ["smoke", "smoke", "smoke", "smoke", "smoke", "smoke", "smoke", "smoke", "smoke", "smoke"], "difficult_direct_answer": false, "rationales": ["There is smoke from a fire.", "There is a small fire in the field causing smoke.", "There is a fire."], "image": "train2014/COCO_train2014_000000334217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193349, "question_id": "4aL4Rxvm23dZczTcKArKqR", "question": "Where does the pitcher here stand?", "choices": ["pitcher's mound", "home base", "grandstands", "infield"], "correct_choice_idx": 0, "direct_answers": ["mound", "front base", "mound", "mound", "pitcher's mound", "mound", "pitcher's mound", "pitcher's mound", "mound", "mound"], "difficult_direct_answer": false, "rationales": ["The pitcher stands on a bed of raised dirt in the middle of the infield that makes him a little higher up that the rest of the field. this makes it easier for him to pitch to the batter.", "The player throws the ball from a slightly elevated dirt area towards a batter.", "The man is standing in an area of dirt surrounding by grass."], "image": "val2014/COCO_val2014_000000193349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452963, "question_id": "4agsYgwzdmwFYtnTGqExkY", "question": "The couple on the bench are fans of which professional baseball team?", "choices": ["atlanta braves", "red sox", "cincinnati reds", "yankees"], "correct_choice_idx": 2, "direct_answers": ["cincinnati reds", "reds", "cincinnati reds", "cincinnati", "cardinals", "cubs", "chicago bears", "cincinnati reds", "cincinnati reds", "cardinals"], "difficult_direct_answer": false, "rationales": ["They are wearing branded shirts. each shirt has a white c.", "The baseball team wears red t shirts.", "The jerseys are for the reds."], "image": "train2014/COCO_train2014_000000452963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235446, "question_id": "4bJmPADdrGGCuLgP6QUfKy", "question": "Where is this dog's owner?", "choices": ["down street", "inside building", "prison", "at school"], "correct_choice_idx": 1, "direct_answers": ["in restaurant", "in restaurant", "getting food", "inside restaurant", "in restaurant", "inside", "home", "inside building", "taqueria", "eating tacos"], "difficult_direct_answer": false, "rationales": ["The person left their dog there while they go into the store.", "The dog is waiting while its owner is shopping inside.", "The dog is inside the building."], "image": "train2014/COCO_train2014_000000235446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266240, "question_id": "4bK4ByEforC4sDsTobr8r9", "question": "Who wears a similar item to what the man has over his shirt?", "choices": ["gordon ramsay", "snoop dogg", "ken patera", "aubrey plaza"], "correct_choice_idx": 0, "direct_answers": ["baker", "butcher", "sailor", "baristas", "coworkers", "gordon ramsay", "left person", "sailor", "woman", "chef"], "difficult_direct_answer": true, "rationales": ["The man in the apron is a cook.", "He is wearing an apron, that many chefs wear to keep themselves clean when cooking, and gordon ramsay is a chef who is known to do this.", "He is wearing an apron. this is an item that a professional chef would wear."], "image": "train2014/COCO_train2014_000000266240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406446, "question_id": "4bKXMedfHdN52Zdv9vNErj", "question": "What is the woman in green doing?", "choices": ["programming", "singing", "texting", "listening"], "correct_choice_idx": 3, "direct_answers": ["phone call", "using phone", "taking call", "on phone", "talking", "talking", "listening", "calling", "phone call", "talking"], "difficult_direct_answer": false, "rationales": ["The woman is holding a phone up to her ear.", "The woman is holding a phone up to her ear, but she is not currently the person doing the talking because her lips are not moving.", "The woman has a mobile held up to her ear and does not appear to be currently talking. when using a mobile in this way, one is doing one of two things and if not talking she would be doing answer a."], "image": "train2014/COCO_train2014_000000406446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92001, "question_id": "4bWeKQVNmf5p9u4nm3P2PQ", "question": "How is the woman on the right in the black shirt feeling?", "choices": ["excited", "depressed", "sad", "scared"], "correct_choice_idx": 0, "direct_answers": ["happy", "excited", "happy", "excited", "competitive", "excitement", "happy", "excited", "happy", "excited"], "difficult_direct_answer": false, "rationales": ["The woman has her mouth open in a way that would be consistent with answer a.", "She is feeling excited because the expression on her face shows positive emotion.", "The agape mouth and bared teeth while holding video game controllers suggests this woman is excited."], "image": "train2014/COCO_train2014_000000092001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43511, "question_id": "4bdNXTcKbqBVqw9GUZWAcv", "question": "Which thing here is most out of place?", "choices": ["chicken", "train", "overhead wires", "tracks"], "correct_choice_idx": 0, "direct_answers": ["chicken", "chicken", "chicken", "chicken", "rooster", "chicken", "chicken", "chicken", "rooster", "chicken"], "difficult_direct_answer": false, "rationales": ["This is a train station, so the train, overhead wires, and tracks are supposed to be here.", "You don't often see chickens at train stations.", "The chicken is out of place."], "image": "train2014/COCO_train2014_000000043511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59455, "question_id": "4c7ZAvUvsQi84fn4W5nV4C", "question": "Why is the man at the back wearing a yellow jacket?", "choices": ["visibility", "camouflage", "dress code", "fashion"], "correct_choice_idx": 0, "direct_answers": ["for safety", "safety", "safety", "safety visibility", "visibility", "visibility", "safety", "visibility", "safety", "safety"], "difficult_direct_answer": false, "rationales": ["The man in the yellow vest wants to be seen.", "It helps so that drivers can see him in any type of weather.", "The type of colors in the jacket allow the person to be seen from far away and in certain situations in the dark."], "image": "val2014/COCO_val2014_000000059455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427223, "question_id": "4cBciWu5B8G26otWdDZDPC", "question": "What shot is this player making?", "choices": ["serve", "backhand", "lob", "forehand"], "correct_choice_idx": 3, "direct_answers": ["forehand", "forehand", "return", "forehand", "forehand", "hit ball", "forehand", "forehand", "return shot", "tennis"], "difficult_direct_answer": false, "rationales": ["The player is presenting the front of their arm, giving the shot its name.", "She is stepping foreword and moving the racquet forward toward the ball.", "The player is making a forehand because she is hitting the ball with her right hand which is her dominant arm"], "image": "val2014/COCO_val2014_000000427223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18885, "question_id": "4ccwWnj36fvU2bu2mHhnBn", "question": "What does the color of traffic light in the above picture imply to road users?", "choices": ["give way", "go", "wait", "stop"], "correct_choice_idx": 1, "direct_answers": ["go now", "go", "go", "go", "go", "go", "go", "safe crossing", "go", "stop"], "difficult_direct_answer": false, "rationales": ["The traffic light is green so it is safe to proceed.", "The light in the traffic light is green.", "The color means go."], "image": "train2014/COCO_train2014_000000018885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437332, "question_id": "4coUXg6p7irhRMJPE3GJdY", "question": "The shadow of what is visible?", "choices": ["skateboarder", "bird", "tank", "cow"], "correct_choice_idx": 0, "direct_answers": ["fence", "fence", "kid", "boy", "fence", "fence", "skateboarder", "fence", "fence", "skateboarder fence"], "difficult_direct_answer": false, "rationales": ["The shadow of the person on the skateboarding can be seen on the ground.", "The dark area matches the person's silhouette.", "A skateboarder is jumping on a sunny day on a sidewalk. shadows are formed on a sunny day."], "image": "val2014/COCO_val2014_000000437332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157491, "question_id": "4cuLXppt37ytmJeRxkoSmA", "question": "What object can keep beverages cold?", "choices": ["cooler", "bench", "racquet", "shoes"], "correct_choice_idx": 0, "direct_answers": ["cooler", "cooler", "cooler", "cooler", "fridge", "coolers", "ice chest", "coolers", "cooler", "cooler"], "difficult_direct_answer": false, "rationales": ["It's in the top left corner.", "The blue plastic container is used to keep drinks cold that's where it got its name.", "There is an insulated box on the sidelines."], "image": "train2014/COCO_train2014_000000157491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517685, "question_id": "4d2FKkeZeZwmHvMTZS2RRw", "question": "What role are these men likely getting ready for?", "choices": ["groomsmen", "college play", "oscars", "bridesmaids"], "correct_choice_idx": 0, "direct_answers": ["groomsmen", "groomsman", "best men", "wedding", "wedding", "best men", "groomsmen", "groomsmen", "wedding", "wedding"], "difficult_direct_answer": false, "rationales": ["The men are groomsmen.", "Given the general age of the men and the style of dress they're wearing, it's very likely that they are preparing to be groomsmen.", "They look to be helping the groom get ready."], "image": "train2014/COCO_train2014_000000517685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171479, "question_id": "4d4fK9xxP2diKcW2EvyUb3", "question": "What kind of event are the people participating in?", "choices": ["training", "commuting", "competition", "leisure activity"], "correct_choice_idx": 2, "direct_answers": ["snow skating", "skiing", "nordic skiing", "competition", "ski race", "ski competition", "cross-country skiing", "skiing race", "racing", "skiing competition"], "difficult_direct_answer": true, "rationales": ["The people both have bibs with numbers on them.", "The men have very intense expressions and are skiing very closely to one another, indicating they're taking part in a race, which is a competition.", "People are skiing with numbers on their chests and advertising is on signage."], "image": "train2014/COCO_train2014_000000171479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523494, "question_id": "4dLDtQjhG4LMcWQuf8QGtq", "question": "What will people standing here have to pay?", "choices": ["fine", "nothing", "bus fare", "airline fees"], "correct_choice_idx": 2, "direct_answers": ["bus fare", "bus ride", "bus fare", "bus fare", "bus fare", "bus ticket", "dollar", "bus fare", "bus fare", "bus fare"], "difficult_direct_answer": false, "rationales": ["The people need to pay the bus driver.", "Majority of city buses require that riders pay a bus fare as they enter.", "Though there are some free buses, most bus companies require money to ride."], "image": "val2014/COCO_val2014_000000523494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346972, "question_id": "4dPncSSgcaS46UEFHtk49J", "question": "What is this room most likely called?", "choices": ["dining room", "family room", "utility room", "bedroom"], "correct_choice_idx": 2, "direct_answers": ["kitchen", "kitchen", "kitchen", "kitchen", "utility room", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["This is likely a room for utilities such as running water sinks.", "This room is for people to do quick tasks with since it's so small.", "It is the kitchen so it probably has a table in it for people to use"], "image": "val2014/COCO_val2014_000000346972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277998, "question_id": "4dTBfzz4bZyLrQ2Ge6McQM", "question": "What type of bread is on the plate?", "choices": ["cinnamon", "white", "rye", "whole grain"], "correct_choice_idx": 3, "direct_answers": ["whole wheat", "grain bread", "whole wheat", "whole grain", "whole grain", "whole grain", "whole grain", "wheat", "bennet bread", "wheat"], "difficult_direct_answer": false, "rationales": ["You can see the seeds in it", "The bread has flecks of rye in it.", "A slice of bread is brown and speckled on a plate. whole grain bread is brown."], "image": "train2014/COCO_train2014_000000277998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239975, "question_id": "4dTtf6v5BQvEFhAGJxnXmQ", "question": "What is the tall shelving in the corner of the room being used for?", "choices": ["climbing", "storage", "exercising", "sleeping"], "correct_choice_idx": 1, "direct_answers": ["resting", "plant holder", "flower stand", "plants", "storing files", "cabinet", "hold flowerpots", "plant stand", "storage", "plant stand"], "difficult_direct_answer": true, "rationales": ["It has drawers to hold and hide things", "The shelves is located behind the couch.", "The tall shelving in the corner is a file cabinet that is used for storage of documents."], "image": "val2014/COCO_val2014_000000239975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224548, "question_id": "4ddgPSid7sUFasZeDuTHhn", "question": "During what time of day are the cars traveling on the road?", "choices": ["noon", "night", "morning", "evening"], "correct_choice_idx": 3, "direct_answers": ["dusk", "night", "night", "dusk", "dusk", "night", "dusk", "night", "evening", "night"], "difficult_direct_answer": false, "rationales": ["The cars are moving on the road while the sun sets.", "The sun is setting .", "The sky is dark but not complete pitch black."], "image": "train2014/COCO_train2014_000000224548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296882, "question_id": "4dwQFgEZKdYNR2WUjWaJV9", "question": "What does this person have on her teeth?", "choices": ["braces", "candy", "food", "gum"], "correct_choice_idx": 0, "direct_answers": ["braces", "braces", "braces", "braces", "clip", "braces", "braces", "clip", "braces", "clip"], "difficult_direct_answer": false, "rationales": ["One can see the brackets from her orthodontic installation.", "The girl has wires on her teeth.", "The wires and metal covering the girl's teeth are braces."], "image": "train2014/COCO_train2014_000000296882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220615, "question_id": "4dzJNpmqWD5SXX9f5wXe6P", "question": "What type of vehicle is this?", "choices": ["rental", "commercial", "passenger", "transport"], "correct_choice_idx": 1, "direct_answers": ["street sweeper", "truck", "street cleaner", "truck", "street cleaner", "truck", "commercial", "truck", "truck", "truck"], "difficult_direct_answer": false, "rationales": ["A large truck has logos all over it. commercial vehicles have logos.", "Based on the company name and equipment on the truck, this is used for running a business.", "This is a street-sweeper type of vehicle, which is commercial in nature."], "image": "train2014/COCO_train2014_000000220615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575649, "question_id": "4eEbJuWTXG47LzfAqxKKAC", "question": "What syndrome does the baby on the left have?", "choices": ["broken leg", "torticollis", "cerebral palsy", "down's syndrome"], "correct_choice_idx": 3, "direct_answers": ["down syndrome", "nothing", "down syndrome", "baldness", "aids", "down's", "down's syndrome", "nothing", "down syndrome", "downs"], "difficult_direct_answer": false, "rationales": ["The facial characteristics are evident in the child's face.", "You can tell by the way the baby looks.", "The facial features show the medical issue"], "image": "train2014/COCO_train2014_000000575649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186298, "question_id": "4eVULTeyRUVQWe7r3Qiy65", "question": "What is this semi truck delivering?", "choices": ["groceries", "appliances", "boats", "cars"], "correct_choice_idx": 2, "direct_answers": ["speedboats", "boats", "boats", "boats", "boats", "boats", "boats", "speedboats", "boats", "boats"], "difficult_direct_answer": false, "rationales": ["You can see the shape of the hulls under the covers", "There are boats on the bed of the truck.", "They are extremely large but don't have any wheels visible."], "image": "train2014/COCO_train2014_000000186298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117683, "question_id": "4ee655YMEKxxDKW5drmfW9", "question": "What mode of transportation is The Southerner?", "choices": ["truck", "train", "bus", "van"], "correct_choice_idx": 1, "direct_answers": ["train", "train", "travelling mode", "train", "train", "train", "train", "travelling mode", "train", "train"], "difficult_direct_answer": false, "rationales": ["The transportation is a train.", "The southerner is named as a railway.", "\"the southerner\" is a train, because there is a train on the flyer."], "image": "val2014/COCO_val2014_000000117683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115970, "question_id": "4fAy2cpkKqoK28xmcqT8W8", "question": "What is the silver device on the red paper used for?", "choices": ["paper weight", "making calls", "blending food", "cracking nuts"], "correct_choice_idx": 1, "direct_answers": ["phone", "making calls", "communication", "phone", "making calls", "phone calls", "calling", "cell phone", "calls", "making calls"], "difficult_direct_answer": false, "rationales": ["This is a cell phone.", "The silver device is a cell phone, not a blender, nut cracker, or paper weight.", "The device is for calls."], "image": "train2014/COCO_train2014_000000115970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261487, "question_id": "4fBuNWFNh74QQmMp48Eo5M", "question": "Who was the most recent player of this sport to be on the cover of Sports Illustrated?", "choices": ["naomi osaka", "andre agassi", "monica seles", "serena williams"], "correct_choice_idx": 0, "direct_answers": ["serena williams", "williams", "naomi osaka", "serena williams", "naomi osaka", "tiger woods", "naomi osaka", "naomi osaka", "rafael nadal", "naomi osaka"], "difficult_direct_answer": false, "rationales": ["She is the only one who is not retired from the sport.", "Various people are playing tennis on different courts. noami osaka is a famous tennis player.", "The recent player is osaka."], "image": "val2014/COCO_val2014_000000261487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31235, "question_id": "4fRLAAR2gBbH5bVRVhjYvU", "question": "Which food is normally made with the thing from the store name?", "choices": ["poached eggs", "steak", "bread", "tofu"], "correct_choice_idx": 3, "direct_answers": ["bean", "tofu", "chili", "some burritos", "doughnuts", "nachos", "chili", "coffee", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["Tofu comes from soybeans.", "Tofu is made from beans, which is the same as the name on the sign.", "It is also known as bean curd."], "image": "val2014/COCO_val2014_000000031235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451101, "question_id": "4fSMxFBth6aWKsec6a7KUb", "question": "What do the ducks here await?", "choices": ["eggs", "swimming", "food", "rain"], "correct_choice_idx": 2, "direct_answers": ["food", "food", "food", "for food", "food", "treats", "food", "feed", "food", "food"], "difficult_direct_answer": false, "rationales": ["Whenever wild birds are hovering around humans, it is most likely awaiting food.", "It is common for people to throw pieces of bread to birds in parks.", "The little girls is throwing them bread crumbs."], "image": "train2014/COCO_train2014_000000451101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45885, "question_id": "4fjJunMJsfqu8xeffBPjux", "question": "What is the man doing with the object in his hand?", "choices": ["selling", "dancing", "eating", "smoking"], "correct_choice_idx": 3, "direct_answers": ["cigarate", "smoking", "smoking", "smoking", "smoking", "obscene gesture", "holding cigarette", "smoking", "smoking", "middle finger"], "difficult_direct_answer": false, "rationales": ["The man is holding a cigarette. cigarettes are not edible.", "He has a cigarette in his hand.", "The man is holding a lit cigarette."], "image": "train2014/COCO_train2014_000000045885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231878, "question_id": "4frsiV6uYajq3BZchRUzaM", "question": "How many women are holding umbrellas in front of the building?", "choices": ["three", "five", "six", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "one", "1 woman"], "difficult_direct_answer": false, "rationales": ["There are two umbrellas being held by women.", "There are four women in front of the building. half of them are holding umbrellas.", "One woman has one umbrella above them both and is holding the umbrella for the other woman while she looks at something."], "image": "train2014/COCO_train2014_000000231878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235237, "question_id": "4fwzeto6cEAHA4ApZbfYCd", "question": "Who would most likely fly that colorful flag?", "choices": ["heterosexual", "plumber", "politician", "homosexual"], "correct_choice_idx": 3, "direct_answers": ["pride", "homosexuals", "lgbtq community", "lgbt people", "gay people", "car lot", "man", "homosexual", "gay pride", "gays"], "difficult_direct_answer": true, "rationales": ["This is the lgbtqia flag", "A rainbow flag is on a pole. rainbow flags are associated with the lgbtq.", "The pride flag is rainbow in colour and would most often be dawned by the lgbtq+ community."], "image": "train2014/COCO_train2014_000000235237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369299, "question_id": "4fya5afoh8hs6rD4X78BCU", "question": "Which color car will go past the light first?", "choices": ["purple", "white", "red", "yellow"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "white", "white", "white", "gray", "silver", "silver", "silver"], "difficult_direct_answer": false, "rationales": ["It is in front of the red car.", "The white sedan is nearest to the traffic light.", "The white car will go first."], "image": "train2014/COCO_train2014_000000369299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33505, "question_id": "4g3ZjMB2BcYxpXc2gVCv59", "question": "What is this type of game called?", "choices": ["video", "card", "relay", "board"], "correct_choice_idx": 0, "direct_answers": ["boxing", "boxing", "video", "wrestling", "videogame", "video game", "boxing", "boxing", "wii sports", "boxing"], "difficult_direct_answer": false, "rationales": ["This is a video game because the young boy is playing it with the use of a monitor.", "The contents of the game are projected on tv like a video would be.", "This game is played on a tv and is a video game."], "image": "train2014/COCO_train2014_000000033505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352938, "question_id": "4gMQt3w4RachRRrPVMQf8h", "question": "In which area do these buses run?", "choices": ["tundra", "desert", "urban", "rural"], "correct_choice_idx": 2, "direct_answers": ["metro", "route 95", "urban", "carlisle rail", "railways", "city", "england", "carlisle", "street", "carlisle"], "difficult_direct_answer": true, "rationales": ["It's common knowledge that buses run in almost every urban center.", "There are buildings everywhere and many vehicles.", "Buses run in the areas that have the biggest populations and that is almost always the urban areas."], "image": "train2014/COCO_train2014_000000352938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237464, "question_id": "4gosrGMa4yxUdzAnboNdPR", "question": "What do most of the people at the beach hope for today weather wise?", "choices": ["wind", "sleet", "rain", "snow"], "correct_choice_idx": 0, "direct_answers": ["wind", "sun", "wind", "sun", "sunny", "sun", "wind", "wind", "sunny", "sun"], "difficult_direct_answer": false, "rationales": ["Most people are flying kites. rain, snow, or sleet would not help with this activity.", "The are hoping for windy conditions so the kites can perform as designed.", "Many of the people at the beach are flying kites. in order for a kite to fly successfully through the sky there needs to be a force in the air that creates lift, drag and thrust."], "image": "val2014/COCO_val2014_000000237464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15600, "question_id": "4h2KwXzxh7cuEqMsnj7fhZ", "question": "The girl is going to get hurt if the carrot goes in her throat because she will start doing what?", "choices": ["choking", "passing out", "chewing", "laughing"], "correct_choice_idx": 0, "direct_answers": ["choking", "choking", "chewing", "choking", "choking", "choking", "gag", "choking", "choking", "choke"], "difficult_direct_answer": false, "rationales": ["She needs to cut it up or chew it really good so she doesnt choke.", "People have to chew large food items because swallowing them whole means they will not fit in the relatively smaller human throat, resulting in choking.", "The girl could possibly choke on the carrot if she gets a piece stuck in her throat."], "image": "val2014/COCO_val2014_000000015600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498687, "question_id": "4h97cXPdAvAEKuPaB3TuDs", "question": "What type of transportation is shown?", "choices": ["water", "rail", "road", "air"], "correct_choice_idx": 2, "direct_answers": ["truck", "truck", "car", "vehicle transportation", "road", "truck", "automobile", "truck", "antique truck", "automotive"], "difficult_direct_answer": false, "rationales": ["A car is pictured, and it is a type of transportation that uses a road.", "The car travels with wheels on the street.", "This is a pickup truck meant to be driven on the highway."], "image": "val2014/COCO_val2014_000000498687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179926, "question_id": "4hkDwVtEGDUkPA8ZSRNJ7G", "question": "What is the sign telling drivers?", "choices": ["left only", "go straight", "no u-turns", "right only"], "correct_choice_idx": 0, "direct_answers": ["turn left", "turn left", "left", "turn left", "turn left", "left turn", "make left", "turn left", "turn left", "left only"], "difficult_direct_answer": false, "rationales": ["The sign has a bent left arrow on it.", "The sign says left only.", "The sign is showing the arrow going left."], "image": "val2014/COCO_val2014_000000179926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29715, "question_id": "4hnWyUHr4CKKV9TLMB3mrb", "question": "What are the boys doing?", "choices": ["interrogating her", "being friendly", "asking favor", "being curious"], "correct_choice_idx": 1, "direct_answers": ["talking", "talking", "talking", "flirting", "talking", "talking", "being friendly", "flirting", "talking", "flirting"], "difficult_direct_answer": false, "rationales": ["They are young people talking to an attractive woman, and it is obvious by the body language that they are interested in her.", "The boys are talking to the girl.", "The boys are being talkative and social."], "image": "train2014/COCO_train2014_000000029715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531633, "question_id": "4iNCThpU7kbg8ShBMEyWR4", "question": "Which board will this man likely use?", "choices": ["bigger", "none", "both", "smaller"], "correct_choice_idx": 0, "direct_answers": ["large one", "longest", "either", "longer", "big board", "both", "left", "longboard", "long board", "bigger"], "difficult_direct_answer": true, "rationales": ["The bigger board would be the more convenient to surf with.", "The surfer could use either board to surf at the beach.", "There is a large man walking on the sand. he is carrying two boards on either side of him."], "image": "val2014/COCO_val2014_000000531633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192307, "question_id": "4iR9gBGYFx8ripcLtcMB7a", "question": "What style of jeans are these?", "choices": ["flare", "cargo", "straight leg", "crop"], "correct_choice_idx": 0, "direct_answers": ["flare", "blue denim", "casual", "low cut", "levi's", "blue", "bootcut", "loose fit", "tapered", "boot cut"], "difficult_direct_answer": true, "rationales": ["The bottoms of the jean are not quite straight and has a little flaring.", "The jeans worn by the person in the photo can be seen becoming wider around the ankles. this style is called flared.", "They are loose at the ankles and hang to the bottom of the shoes"], "image": "train2014/COCO_train2014_000000192307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554625, "question_id": "4igWR6B3LZUoKfvicuwr3z", "question": "What is this place likely to be?", "choices": ["school library", "home", "game center", "public library"], "correct_choice_idx": 3, "direct_answers": ["public library", "school", "school", "computer lab", "computer lab", "school", "school", "school", "school", "computer lab"], "difficult_direct_answer": false, "rationales": ["The fact that there are adults and kids here indicate it is not a school library. clearly this is not in a home or in a game center.", "People are sitting in a long line of computers. libraries have computers for public use.", "This place is probably in an area for the public since so many diverse ages are using the computers."], "image": "val2014/COCO_val2014_000000554625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276972, "question_id": "4iqTfb4xpp2LofEvyAfPqe", "question": "What brand of shoes is the woman wearing?", "choices": ["adidas", "converse", "nike", "sketchers"], "correct_choice_idx": 1, "direct_answers": ["converse", "converse", "converse", "vans", "converse", "converse", "converse", "vans", "converse", "converse"], "difficult_direct_answer": false, "rationales": ["Her shoes do not have swooshes or white stripes. she is wearing chuck taylor shoes.", "These look to be a type of tennis shoe that converse makes.", "Those sneakers are converse."], "image": "train2014/COCO_train2014_000000276972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422115, "question_id": "4j32evkYNwJPCRMSGQ3Prd", "question": "What material is the brown briefcase made of?", "choices": ["latex", "nylon", "artificial leather", "denim"], "correct_choice_idx": 2, "direct_answers": ["leather", "leather", "leather", "leather", "leather", "leather", "leather", "artificial leather", "leather", "leather"], "difficult_direct_answer": false, "rationales": ["Latex, denim, or nylon would not be suitable for a suitcase.", "Suitcases are usually made of leather or artificial leather.", "The brown briefcase is made of a plastic leather."], "image": "train2014/COCO_train2014_000000422115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188405, "question_id": "4j43R8DxkV4NqMMUQm8ZJ8", "question": "What is his hairstyle?", "choices": ["short", "long", "curly", "shaved"], "correct_choice_idx": 1, "direct_answers": ["short", "short", "short", "short", "short cut", "long", "fringed", "short", "short", "fringed"], "difficult_direct_answer": false, "rationales": ["Haircut lengths are subjective however in today many consider the shaven head as being short. the man in the photo has reasonable amount of hair and would thus be considered long.", "The hairstyle is long.", "His hair is straight. his scalp is not visible."], "image": "train2014/COCO_train2014_000000188405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109561, "question_id": "4j6pJEETrkHwZPbSVn5eoj", "question": "What might a person do inside the blue lit area?", "choices": ["shower", "text", "cook", "rest"], "correct_choice_idx": 0, "direct_answers": ["take shower", "shower", "take shower", "shower", "take shower", "shower", "shower", "take shower", "shower", "take shower"], "difficult_direct_answer": false, "rationales": ["This is a place to bathe standing up.", "The person can shower.", "This is a shower that they can get clean in."], "image": "train2014/COCO_train2014_000000109561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134271, "question_id": "4jNfPyejPmHwAvXK8V2H2p", "question": "What is the most likely year this picture was taken?", "choices": ["1700", "2000", "3000", "1900"], "correct_choice_idx": 1, "direct_answers": ["2000", "recent year", "2005", "twothousand three", "2020", "2007", "2005", "1995", "this year", "2000"], "difficult_direct_answer": false, "rationales": ["The year is 2000.", "One person is holding a cell phone and the photo is in color, so the most recent year is the only option.", "The photo seems modern so is in 2000."], "image": "train2014/COCO_train2014_000000134271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528707, "question_id": "4jUCWNwbwNuNYdaJ8vTziC", "question": "What does the woman here hope to capture?", "choices": ["man", "lion", "elephant", "picture"], "correct_choice_idx": 3, "direct_answers": ["photo", "picture", "elephant", "elephant", "pictures", "photo", "picture", "photos", "elephant", "picture elephant"], "difficult_direct_answer": false, "rationales": ["The woman is holding a camera that she points at the elephant and hopes to capture a picture.", "The woman is taking the picture as he is in that position.", "She is holding a camera aimed towards the other elephant."], "image": "train2014/COCO_train2014_000000528707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350028, "question_id": "4jUHZc95Jw8mwVwnz8K3rC", "question": "What is the shape of this parachute?", "choices": ["dome", "circular", "square", "rectangle"], "correct_choice_idx": 0, "direct_answers": ["oblong", "circular", "round", "dome", "round", "circle", "dome", "circle", "round", "round"], "difficult_direct_answer": false, "rationales": ["The parachute is in a half circle shape.", "The object is clearly visible and has an outline consistent with answer a.", "It is a half-circle which catches air underneath it in order to stay aloft."], "image": "train2014/COCO_train2014_000000350028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513087, "question_id": "4kAciESrizmnoNEBrkTviQ", "question": "What will be removed when the door is closed?", "choices": ["girl", "wine", "drawer", "condiment"], "correct_choice_idx": 0, "direct_answers": ["baby", "child", "baby", "child", "girl", "child", "baby", "baby", "child", "child"], "difficult_direct_answer": false, "rationales": ["There is a girl in the door of the refrigerator.", "The girl will be removed.", "The refrigerator won't be able to close with her standing there."], "image": "train2014/COCO_train2014_000000513087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422255, "question_id": "4kuE3tDFrfWpVEbPBWkhVT", "question": "Without the computers where would they look up definitions?", "choices": ["black book", "green book", "white book", "red book"], "correct_choice_idx": 3, "direct_answers": ["library dictionary", "dictionary", "dictionary", "dictionary", "red book", "library dictionary", "dictionary", "dictionary", "dictionary", "dictionary"], "difficult_direct_answer": false, "rationales": ["The people would use the red book.", "There is a book entitled dictionary near them.", "In the old days when something had to be looked up, we used books and not computers. these books were called a dictionary and in this photo, it is the book in red."], "image": "train2014/COCO_train2014_000000422255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202990, "question_id": "4m5WNojgmqYzvRYEhdnxrf", "question": "When is it safe to cross the street here?", "choices": ["1 minute", "tomorrow", "now", "2 minutes"], "correct_choice_idx": 2, "direct_answers": ["no", "now", "now", "now", "green light", "now", "green light", "no cars", "green light", "now"], "difficult_direct_answer": false, "rationales": ["You can tell by the orange hand on the street sign as to if it is safe to cross.", "There is a white man icon in the signal across the street.", "It is currently safe to cross the street because the pedestrian traffic signal shows a person walking."], "image": "train2014/COCO_train2014_000000202990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175804, "question_id": "4mPgKjBz8Gx6NEuGo795so", "question": "Why is she aiming the device at the child?", "choices": ["is evil", "was bad", "is wet", "cleaning her"], "correct_choice_idx": 2, "direct_answers": ["drying", "drying body", "drying off", "hair drying", "drying hair", "drying her", "is wet", "drying hair", "dry", "drying"], "difficult_direct_answer": false, "rationales": ["The child's hair is wet and they are wearing a towel so the child is wet and they are trying to dry them.", "The hair is wet.", "She is trying to dry the child off with the hair dryer."], "image": "val2014/COCO_val2014_000000175804.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495100, "question_id": "4mQ7K5FtJmyqxg76pwYNKY", "question": "What profession is the man facing the crowd?", "choices": ["cook", "police officer", "janitor", "librarian"], "correct_choice_idx": 1, "direct_answers": ["police", "security", "home plate", "security guard", "police officer", "police officer", "police", "baseball player", "guard", "security"], "difficult_direct_answer": false, "rationales": ["The man is wearing a blue uniform and is carrying a weapon.", "The man is a baseball player.", "A man in a dark blue uniform is facing a crown on a baseball field. police wear dark blue uniforms."], "image": "train2014/COCO_train2014_000000495100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428665, "question_id": "4mUK2S4UkRzziVFopgu74c", "question": "Why is he hunched over?", "choices": ["is scared", "stay balanced", "sliding off", "falling"], "correct_choice_idx": 1, "direct_answers": ["surfing", "stay balanced", "balance", "balance", "surfing", "balance", "maintain balance", "balance", "maintain balance", "for balance"], "difficult_direct_answer": false, "rationales": ["The man doesn't want to fall off the board.", "He is trying to keep his balance on his board.", "They squat down so they can stay balanced."], "image": "train2014/COCO_train2014_000000428665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24125, "question_id": "4mo3UTHMWXeNxpxnfcmgs5", "question": "What is the brown food with the hole in it on the plate called?", "choices": ["bagel", "cupcake", "muffin", "donut"], "correct_choice_idx": 0, "direct_answers": ["bagel", "bagel", "bagel", "bagel", "doughnut", "doughnut", "bagel", "bagel", "bagel", "bagel"], "difficult_direct_answer": false, "rationales": ["The food is a bagel.", "This circular bread is thicker in consistency than that of a doughnut and it is more savory than sweet.", "The bread is plain and has a hole in it."], "image": "train2014/COCO_train2014_000000024125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296770, "question_id": "4nEqJHJ6PH2cPSjhYqUgxs", "question": "The man near the orange Umbrella sells what?", "choices": ["bricks", "food", "liquor", "ovens"], "correct_choice_idx": 1, "direct_answers": ["food", "food", "pizza", "pizza", "pizza", "pizza", "food", "food", "pizza", "food"], "difficult_direct_answer": false, "rationales": ["The man has food.", "Looks like he is making food.", "The man near the orange umbrella is cooking food in the oven and selling it."], "image": "train2014/COCO_train2014_000000296770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314306, "question_id": "4nrNPAHW3yh5qpGREVHszK", "question": "What country is this?", "choices": ["japan", "usa", "hungary", "uk"], "correct_choice_idx": 2, "direct_answers": ["hungary", "german", "possibly hungary", "hungary", "hungary", "hungary", "hungary", "hungary", "hungary", "hungary"], "difficult_direct_answer": false, "rationales": ["The website says .hu, which is short for hungary.", "You can tell by the long word on the cabinet as to what country its from.", "The picture has a url on it with a hungarian website."], "image": "train2014/COCO_train2014_000000314306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169322, "question_id": "4nyjbMRopNDsCCZa8Pz9fy", "question": "Why would someone be seated here?", "choices": ["to eat", "to work", "to paint", "to wait"], "correct_choice_idx": 0, "direct_answers": ["to eat", "relaxation", "dining", "date", "eat meal", "dining", "eating", "eat", "view", "to eat"], "difficult_direct_answer": false, "rationales": ["The table is set with a full meal, a drink, and a fork.", "It is a restaurant table with food and cutlery ready to be eaten.", "Someone would want to seat here to eat food."], "image": "train2014/COCO_train2014_000000169322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538615, "question_id": "4oVY6JwoyV9wXqLH2UMrDc", "question": "What international tournament takes place every 4 years where this sport is played?", "choices": ["world championship", "winner's cup", "stanley cup", "world cup"], "correct_choice_idx": 3, "direct_answers": ["world cup", "world cup", "world cup", "olympics", "olympics", "olympics", "world cup", "world cup", "world cup", "world cup"], "difficult_direct_answer": false, "rationales": ["Every four years the nations of the world gather to compete in this soccer tournament.", "It's a bigger tournament.", "They are playing soccer."], "image": "train2014/COCO_train2014_000000538615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470028, "question_id": "4p5DA7vQpDpTW7LzRJqAgs", "question": "What is the woman boarding?", "choices": ["plane", "horse", "taxi", "bus"], "correct_choice_idx": 3, "direct_answers": ["bus", "bus", "train", "train", "train", "bus", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["The mode of transport is long and is on the road.", "To get on a bus you have to walk up a few steps. in addition, the doors shown here are made to open and close for passengers on buses.", "The vehicle is not an animal and it is too large to be a car and too small to be an airplane."], "image": "train2014/COCO_train2014_000000470028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420236, "question_id": "4p6u7p2Q7xHHZxttGv9Eqh", "question": "What type of truck is this?", "choices": ["mail", "ice cream", "suv", "ambulance"], "correct_choice_idx": 1, "direct_answers": ["wirral whip", "ice cream", "beverage", "ice cream", "drink", "icecream truck", "ice cream", "wirral whip", "beverage", "ice cream"], "difficult_direct_answer": false, "rationales": ["The truck sells ice cream.", "It serves cool treats and there are cones on the side and front.", "There are pictures of ice cream cons on the truck."], "image": "train2014/COCO_train2014_000000420236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249198, "question_id": "4p8hL6CRXxTRmTxDD6pHYa", "question": "What two factors are allowing the people to move?", "choices": ["horse", "driver", "wheels", "all correct"], "correct_choice_idx": 3, "direct_answers": ["wheels horse", "all correct", "horse power", "horse buggy", "horse buggy", "horses wheels", "horse wheels", "parade", "wheels horses", "horsecart"], "difficult_direct_answer": true, "rationales": ["The horse, wheels and driver all propel the people forward.", "The horse, wheels and driver are allowing the people to move.", "The wheels are allowing the people to move because without wheels the horses could not transport the large carriage filled with people."], "image": "train2014/COCO_train2014_000000249198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226654, "question_id": "4pBnvQCGrHsPWPdMDXHYe3", "question": "Why is the person in the water wearing?", "choices": ["t-shirt", "swim trunks", "wetsuit", "goggles"], "correct_choice_idx": 2, "direct_answers": ["wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wet suit", "wetsuit", "wetsuit", "wetsuit"], "difficult_direct_answer": false, "rationales": ["They are wearing a wet suit to protect themself from the salt water and stay warm", "A person is in water with a dark colored, form fitting outfit.", "The person appears to be wearing a long sleeved black piece of clothing. this is consistent with a wetsuit which is intended to be worn in water."], "image": "train2014/COCO_train2014_000000226654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186056, "question_id": "4pEfRTPL8LnsdErcVSSCKJ", "question": "When finished with his meal where should the plate being used be placed?", "choices": ["dishwasher", "chef's table", "trash can", "dish drainer"], "correct_choice_idx": 2, "direct_answers": ["trash can", "garbage", "garbage", "trash", "garbage", "garbage", "heater", "garbage", "trash can", "trash"], "difficult_direct_answer": false, "rationales": ["That is a paper plate which is meant to be discarded after use.", "It is a fast food restaurant and judging from the image this is the option most accessible to diners.", "This is the disposal device for soiled paper products."], "image": "train2014/COCO_train2014_000000186056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306343, "question_id": "4pb4VLHKP349RvFDt3pKKw", "question": "What are the people paying attention to?", "choices": ["random person", "event organizer", "vehicle", "dog"], "correct_choice_idx": 1, "direct_answers": ["something", "people", "skateboarding", "direction", "other direction", "leader", "skateboarder", "performer", "event organizer", "other skaters"], "difficult_direct_answer": true, "rationales": ["A group of people are standing together all looking in the same direction. events are usually organized by someone.", "They are watching a person do a trick on a skateboard.", "The boys in helmets are watching for directions from the person in charge. they are all looking in the same direction."], "image": "val2014/COCO_val2014_000000306343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57523, "question_id": "4pry5atEH9gBWkaqJfDR7g", "question": "What is this large contraption called?", "choices": ["skateboarding ramp", "shed", "roof", "slope"], "correct_choice_idx": 0, "direct_answers": ["ramp", "ramp", "ramp", "ramp", "skateboarding ramp", "ramp", "skating deck", "ramp", "ramp", "rube goldberg"], "difficult_direct_answer": false, "rationales": ["The ramp can be used to ride skateboards up and down.", "A ramp they can skate down and do tricks off of.", "The ramp is sloped so people can go down it with a skateboard."], "image": "train2014/COCO_train2014_000000057523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175024, "question_id": "4py9hwDu5QkoFxAuqiRSfs", "question": "What is he carrying that's unusual?", "choices": ["bag", "gps", "helmet", "stuffed animal"], "correct_choice_idx": 3, "direct_answers": ["stuffed toy", "stuff animal", "stuffed animal", "stuffed unicorn", "stuffed animal", "stuffed animal", "stuff animal", "stuffed animal", "stuffed unicorn", "stuffed animal"], "difficult_direct_answer": false, "rationales": ["There is a stuffed animal.", "It's odd for a biker to have a stuffed animal.", "There is a stuffed animal in the back of the motorcycle driver which is a highly unusual object to have on a motorcycle."], "image": "val2014/COCO_val2014_000000175024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253733, "question_id": "4q87GXkiNTaGqRb8bTfJdV", "question": "What is on the cabinet?", "choices": ["whistle", "baby", "cups", "cat"], "correct_choice_idx": 2, "direct_answers": ["pottery", "cup", "artisan pieces", "art", "pottery", "cups", "pottery photographs", "frame", "art", "photo frame"], "difficult_direct_answer": false, "rationales": ["There are decorative pieces on the shelf.", "The cabinet is full of cups since they're drinking vessels.", "There are a number of cups all over the cabinet."], "image": "train2014/COCO_train2014_000000253733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270703, "question_id": "4qKdknL9jS3cQndnvyvN4e", "question": "Why are the fruits in the basket?", "choices": ["to clean", "to sell", "to decorate", "to eat"], "correct_choice_idx": 1, "direct_answers": ["for sale", "selling", "selling", "to sell", "to sell", "for sale", "selling", "to display", "separation", "keep fresh"], "difficult_direct_answer": false, "rationales": ["The set up with this variety of fruit, the price tags visible throughout, the plastic produce bags and the grocery carts are all elements of a grocery store. a display like this at a grocery store would be to sell the fruit.", "They are in a store to sell", "There a prices listed on the baskets and shopping carts next to them."], "image": "train2014/COCO_train2014_000000270703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342448, "question_id": "4qzqQbAURsaExuQ8arFC8e", "question": "What could cause harm to the surfers?", "choices": ["children", "bucket", "sand", "rocks"], "correct_choice_idx": 3, "direct_answers": ["rocks", "waves", "sharks", "sharks", "water", "rocks", "sharks", "sharks", "rocks", "sharks"], "difficult_direct_answer": false, "rationales": ["There are large rocks in the water.", "They could hurt themselves if anything was hard in the water.", "There are rocks in the water and if one of the surfers his them they could get hurt."], "image": "val2014/COCO_val2014_000000342448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456695, "question_id": "4rP9AiScoYEY5UHz7VEYNw", "question": "What will riding the bike do to the blender?", "choices": ["power it", "spill contents", "help mix", "destroy it"], "correct_choice_idx": 0, "direct_answers": ["power it", "power it", "nothing", "power it", "power it", "mix drink", "turn on", "stir", "upset it", "power it"], "difficult_direct_answer": false, "rationales": ["When the bike is pedalled the blender will work.", "A bike is attached to a blender showing it can be used on the bike.", "The riding would power it."], "image": "val2014/COCO_val2014_000000456695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53101, "question_id": "4rc27uUnp7EgMnJn3qopSx", "question": "What animal is on her tank top?", "choices": ["lion", "horse", "elephant", "fox"], "correct_choice_idx": 1, "direct_answers": ["horse", "horse", "horse", "horse", "horse", "nothing", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The animal is galloping and has a tail and mane, so it's a horse.", "The woman is wearing a tank top that has a red horse on it.", "The picture depicts a horse."], "image": "train2014/COCO_train2014_000000053101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224991, "question_id": "4rjuCkJAgsMyLPjuRgqiqa", "question": "What is this person doing with a kite?", "choices": ["surfing", "sailing", "kitesurfing", "flying"], "correct_choice_idx": 2, "direct_answers": ["kiteboarding", "surfing", "kiteboarding", "kitesurfing", "surfing", "kite surfing", "wind surfing", "riding", "surfing", "kitesurfing"], "difficult_direct_answer": false, "rationales": ["The person is surfing while flying the kite.", "The man is holding up a kite in the water.", "He is surfing in the water."], "image": "val2014/COCO_val2014_000000224991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469301, "question_id": "4rvg2KWRmfkHFNJBc77Pbs", "question": "What is the type of grass that is used to create the top sides of the rowboat?", "choices": ["pampas", "bamboo", "lemongrass", "ryegrass"], "correct_choice_idx": 1, "direct_answers": ["bamboo", "bamboo", "straw", "bamboo", "bamboo", "zosia grass", "bamboo", "straw", "bamboo", "bamboo"], "difficult_direct_answer": false, "rationales": ["The boat is made out of bamboo.", "The top sides of the row boat are a brownish color and they are very stiff.", "You can see the rings that protrude from the wood that is a trait of bamboo"], "image": "train2014/COCO_train2014_000000469301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297182, "question_id": "4rwziQ8mBq2yv2s5LoY9hX", "question": "Which costume resembles the companion of Tinker Bell?", "choices": ["none", "gypsy", "peter pan", "tennis player"], "correct_choice_idx": 2, "direct_answers": ["far right", "right", "green costume", "green one", "right", "pirate", "green", "peter pan", "green one", "right"], "difficult_direct_answer": false, "rationales": ["Tinker bells companion in literature is known to be peter pan primarily. peter pan is known to wear a green costume with a fedora hat which is a costume visible on a person in the picture.", "The costume is peter pan.", "Tinkerbell is paired with peter pan."], "image": "train2014/COCO_train2014_000000297182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418397, "question_id": "4siQbdvzJ8r6kK37yV99BV", "question": "What is the standing player ready to do?", "choices": ["dribble", "serve", "dunk", "swing"], "correct_choice_idx": 3, "direct_answers": ["swing", "take swing", "hit ball", "hit", "bat", "hit ball", "swing bat", "hit", "swing bat", "bat"], "difficult_direct_answer": false, "rationales": ["By the look the player is ready to swing.", "He has his bat up and poised waiting for the ball to come across the plate", "He is trying to hit the ball with the bat."], "image": "train2014/COCO_train2014_000000418397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153321, "question_id": "4tPyvjFVV59qanhtNeG2zw", "question": "The inflatable wing used to fly in which game?", "choices": ["skating", "paragliding", "kiting", "parachuting"], "correct_choice_idx": 1, "direct_answers": ["flying kite", "kiteboating", "paraglide", "paragliding", "parasail", "minecraft", "pac man", "paragliding", "parasailing", "parasailing"], "difficult_direct_answer": false, "rationales": ["People would parachute out of a plane and wings are not used in skating. kites do not pick up people.", "These are used for paragliding and flying in the sky.", "The wing is for paragliding."], "image": "train2014/COCO_train2014_000000153321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341756, "question_id": "4tYzjn5ABaqphWmXs5Psci", "question": "Where is this truck going?", "choices": ["lunch", "shopping", "fire", "trapped kitten"], "correct_choice_idx": 2, "direct_answers": ["fire", "fire", "fire", "fire", "fire", "to fire", "fire", "somewhere burning", "fire", "along road"], "difficult_direct_answer": false, "rationales": ["The truck is big, very long, bright red, and has a very long collapsible ladder folded on the top. these vehicles are known to drive fast to emergencies where people's lives are at stake.", "The truck is a fire engine.", "A fire truck is driving with lights and sirens. firefighters travel to emergencies with their lights on."], "image": "train2014/COCO_train2014_000000341756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125684, "question_id": "4tpAJRE6Px3LxwbmhQz5hu", "question": "What street sign is directly next to the street light?", "choices": ["stop", "no u-turn", "one way", "yield"], "correct_choice_idx": 0, "direct_answers": ["stop sign", "stop sign", "stop sign", "stop sign", "stop", "stop sign", "stop sign", "stop sign", "stop sign", "stop sign"], "difficult_direct_answer": false, "rationales": ["The street sign next to the stop light is red and white and has four letters.", "While this is uncommon in a three-light stop light, it appears this one is always on red and always has cars halt their driving.", "There is a red octagon sign next to the street light to indicate cars must stop."], "image": "train2014/COCO_train2014_000000125684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147105, "question_id": "4tpuELiw2GrN5wjHu4ULVm", "question": "What skill are they displaying?", "choices": ["strength", "archery", "math", "balance"], "correct_choice_idx": 3, "direct_answers": ["balance", "surfing skills", "surfing", "balance", "surfing", "surfing", "balance", "surfing", "surfing", "balance"], "difficult_direct_answer": false, "rationales": ["Two people are sharing a board. they have their arms out to the side to keep them from falling over.", "The have their hands out as though in a balancing situation.", "They are showing off how to balance."], "image": "train2014/COCO_train2014_000000147105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488244, "question_id": "4uz8bBwBDrjjWnAREie2YT", "question": "Why are the people riding elephants through the streets?", "choices": ["to colonize", "to destroy", "to celebrate", "to subdue"], "correct_choice_idx": 2, "direct_answers": ["parade", "parade", "parade", "to celebrate", "transporting them", "parade", "parade", "parade", "transporting them", "parade"], "difficult_direct_answer": false, "rationales": ["People are riding elephants in a parade.", "They are having a celebration with them.", "They are riding them like in a parade."], "image": "train2014/COCO_train2014_000000488244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325495, "question_id": "4uzw42RL3orSADYWoqfz6s", "question": "What sort of specific skill is being focused on here?", "choices": ["precision kicking", "dribbling", "power kicking", "head butting"], "correct_choice_idx": 1, "direct_answers": ["dribbling", "soccer handling", "balance", "soccer skills", "dribbling", "soccer", "dribbling", "ball handling", "balance", "footwork"], "difficult_direct_answer": false, "rationales": ["When you use your feet to kick the ball back and forth.", "A child is moving a soccer ball through cones.", "The boy is kicking the ball around the red spots."], "image": "train2014/COCO_train2014_000000325495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197683, "question_id": "4v6cch6QA6AHTfhxLWxXeF", "question": "What does the brown hair belong to?", "choices": ["driver's hood", "someone's hair", "driver's dog", "driver's beard"], "correct_choice_idx": 1, "direct_answers": ["woman", "man", "riding man", "passenger", "beard", "motorcyclist", "man", "motorcyclist", "person", "someone's hair"], "difficult_direct_answer": false, "rationales": ["The motorcycler's beard has a bit of brown on the ends.", "The man has a passenger behind him who has brown hair.", "There is a rider."], "image": "val2014/COCO_val2014_000000197683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168096, "question_id": "4vDYDJpjugkrivqT8zyUb4", "question": "What has caused traffic to stop?", "choices": ["accident", "traffic light", "construction", "animal crossing"], "correct_choice_idx": 1, "direct_answers": ["red light", "stoplight", "signal", "lights", "red light", "red light", "traffic light", "stop light", "light", "traffic light"], "difficult_direct_answer": false, "rationales": ["The light is red in front of them", "There is a red light in front of the cars and all of them are lined up behind it.", "There are red signals above the cars."], "image": "val2014/COCO_val2014_000000168096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54589, "question_id": "4vWErSpSuz4aWqtVFbP6zn", "question": "What is stacked up near the wall on the right?", "choices": ["books", "crates", "bowling pins", "cones"], "correct_choice_idx": 1, "direct_answers": ["milk crates", "crates", "crates", "crates", "crates", "crates", "crates", "crates", "plastic crates", "plastic bins"], "difficult_direct_answer": false, "rationales": ["There are a bunch of open boxes stacked.", "The crates are stacked.", "Crates are stacked up"], "image": "train2014/COCO_train2014_000000054589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132102, "question_id": "4vYSeDu5iTb5uJvMDuQdhU", "question": "How is the hat the person in the closest boat wearing called?", "choices": ["baseball cap", "asian conical", "beret", "fedora"], "correct_choice_idx": 1, "direct_answers": ["rice hat", "basket hat", "rice hat", "conical", "panama", "straw hat", "rice hat", "rice hat", "rice hat", "asian conical"], "difficult_direct_answer": false, "rationales": ["This is a hat that is worn more so in asians cultures.", "The hat is an asian conical one.", "A person in a boat is wearing a round, flat hat that has a point."], "image": "train2014/COCO_train2014_000000132102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356256, "question_id": "4vm7KDH8VeT3C5dk4zUwxM", "question": "Which type shop is seen here?", "choices": ["peets", "mcdonald's", "burger king", "starbucks"], "correct_choice_idx": 3, "direct_answers": ["coffee", "coffee", "coffee shop", "coffee shop", "coffee", "starbucks", "coffee shop", "coffee", "coffee", "coffee shop"], "difficult_direct_answer": false, "rationales": ["The logo lettering is green and includes the word \"coffee\".", "The sign above the window says coffee and is written in green.", "The word \"coffee\" is in the shade of green that starbucks uses."], "image": "train2014/COCO_train2014_000000356256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388382, "question_id": "4vnN4kpt42S9Xk66uaqwcj", "question": "Why do people plow with cows?", "choices": ["strength", "luck", "smell", "cost"], "correct_choice_idx": 0, "direct_answers": ["primitive", "strong", "helpful", "strong", "its cheap", "farming", "strong animals", "strength", "strength", "strength"], "difficult_direct_answer": false, "rationales": ["Cows and bulls are sometimes used to pull things.", "Cows are strong and can pull the plough.", "The people use their strength."], "image": "train2014/COCO_train2014_000000388382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310772, "question_id": "4vrxjfdJLS3afDxuzdo9Xc", "question": "Why is the band on the coach?", "choices": ["find conductor", "leave town", "play instruments", "go concert"], "correct_choice_idx": 2, "direct_answers": ["on tour", "parade", "identification", "parade", "parade", "they're passangers", "entertainment", "parade", "play instruments", "flute"], "difficult_direct_answer": false, "rationales": ["They need to sit whilst they play thus music in the parade.", "People are holding musical instruments while sitting in a wagon being pulled by horses.", "They are holding a trumpet and saxophone."], "image": "train2014/COCO_train2014_000000310772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459346, "question_id": "4wCuCfcES2p4eUvkNXhJGE", "question": "What colour is the tie on the far right?", "choices": ["pink", "red", "yellow", "orange"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange", "tan", "salmon", "red", "mauve", "brown", "tan", "orange", "blue"], "difficult_direct_answer": false, "rationales": ["The tie is orange.", "The far right tie is the same colour as the one next to red in a rainbow.", "The tie on the far right is a reddish color."], "image": "train2014/COCO_train2014_000000459346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162641, "question_id": "4wJywqfmX437yvQFT4MGur", "question": "Why is the man raising his arm while skiing?", "choices": ["getting help", "climbing", "claiming victory", "doing tricks"], "correct_choice_idx": 2, "direct_answers": ["waving", "move poles", "waving", "waving", "claiming victory", "celebration", "waving", "waving", "waving", "waving"], "difficult_direct_answer": false, "rationales": ["The man appears to be ahead of the others and appears to be holding his hand up in victory. when people win races, they sometimes hold their hands up to celebrate.", "The man is raising his arm as if celebrating.", "The man is claiming a win."], "image": "train2014/COCO_train2014_000000162641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545405, "question_id": "4wKhV6psjY2sTHCNuhbkC4", "question": "Which ingredient contains the highest amount of sodium?", "choices": ["cucumber", "ketchup", "sausage", "mustard"], "correct_choice_idx": 2, "direct_answers": ["hot dog", "hotdog", "hot dog", "hot dog", "hot dog", "hot dog", "ketchup", "hot dog", "sausage", "sausage"], "difficult_direct_answer": false, "rationales": ["The ingredient is sausage.", "The bun, mustard and cucumber are relatively low in sodium, so taking into account all of the ingredients that are present, the sausage itself would have to categorized as the ingredient which contains the highest amount of sodium.", "Processing meats uses lots of salt."], "image": "train2014/COCO_train2014_000000545405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239515, "question_id": "4wSRVfRTAfFSjgUFtH5gUF", "question": "What sport is she ready to play?", "choices": ["soccer", "tennis", "football", "baseball"], "correct_choice_idx": 3, "direct_answers": ["baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["She is holding a bat, not a racquet. soccer and football also do not use bats.", "This is the only sport of the 4 that requires the bat she's holding.", "The sport involves hitting something with a club like she is holding."], "image": "train2014/COCO_train2014_000000239515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358869, "question_id": "4wxV7Sd9z9K4MwSJyNPGiE", "question": "What do these animals have on their feet?", "choices": ["webbing", "tails", "hooves", "talons"], "correct_choice_idx": 2, "direct_answers": ["hooves", "hooves", "hooves", "hooves", "hooves", "hooves", "horse", "horseshoes", "horseshoes", "hooves"], "difficult_direct_answer": false, "rationales": ["These types of animals have hooves that are used to walk.", "They are horses.", "The horses have a hard shell on the bottom of their legs."], "image": "val2014/COCO_val2014_000000358869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481609, "question_id": "4xNBmKig9e3qVTSFPbC7bu", "question": "What is the red item to the right?", "choices": ["bucket", "plate", "bookcase", "cherry"], "correct_choice_idx": 0, "direct_answers": ["bucket", "container", "bucket", "bucket", "bucket", "container", "bucket", "bucket", "container", "bucket"], "difficult_direct_answer": false, "rationales": ["The red item is a plastic bucket.", "A cup rests on two buckets below it.", "It is the only option that would have a handle on it in order to be carried."], "image": "train2014/COCO_train2014_000000481609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483569, "question_id": "4xTKxjqnrhoSAtThJJgkhX", "question": "The man wearing what color of shirt is in the greatest danger?", "choices": ["red", "white", "yellow", "grey"], "correct_choice_idx": 1, "direct_answers": ["white", "red", "white", "blue", "white", "white", "blue", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["He is standing in the road and not riding on or in a motor vehicle.", "The man in white is in danger.", "The man in white could get hit since he's not on a vehicle that could escape quickly."], "image": "train2014/COCO_train2014_000000483569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487693, "question_id": "4xfzbhbsfHzuFp88o8k9MQ", "question": "What kind of top are all the boys wearing?", "choices": ["polo", "tank", "blazer", "hoody"], "correct_choice_idx": 3, "direct_answers": ["hoodies", "hoodies", "hoodies", "hoodies", "jacket", "sweaters", "hoods", "hoody", "hoody", "hoody"], "difficult_direct_answer": false, "rationales": ["The boys are all wearing hooded sweatshirts.", "The boys are all wearing a sweatshirt that has an attached head covering.", "All the boys are wearing a shirt or jacket that has a head covering attached at the back of the neck that can be left hanging at the back or pulled up over the head, covering most or all of their hair but leaving their face visible."], "image": "train2014/COCO_train2014_000000487693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442343, "question_id": "4xoKJ78vbVFsuNHsbDKnc9", "question": "What constant force is being combated based on the direction the skier is walking?", "choices": ["magnetism", "inertia", "gravity", "velocity"], "correct_choice_idx": 2, "direct_answers": ["gravity", "gravity", "wind", "gravity", "gravity", "gravity", "using ski-poles", "winds", "gravity", "gravity"], "difficult_direct_answer": false, "rationales": ["They are walking up towards the top of a steep slope.", "Gravity makes it harder to walk up hill.", "A skier is walking up a hill on skis."], "image": "train2014/COCO_train2014_000000442343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389255, "question_id": "4y8MsdJtfYRtWxrFFrn5TN", "question": "What is the serving size of this restaurant's main offering?", "choices": ["pint", "kilograms", "slice", "bushel"], "correct_choice_idx": 2, "direct_answers": ["slice", "slice", "twenty people", "slice", "large", "slice", "unknown", "slice", "slice", "small"], "difficult_direct_answer": false, "rationales": ["They sell it by slice.", "A pizza is made of slices and this is a pizza place.", "The sign indicates that this is a pizza restaurant."], "image": "val2014/COCO_val2014_000000389255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363202, "question_id": "4yQDqLhC48WpLFvRvt25nN", "question": "What is the man in front of the window wearing?", "choices": ["shorts", "dress slacks", "sweat pants", "jeans"], "correct_choice_idx": 1, "direct_answers": ["glasses", "black shirt", "black jacket", "jeans", "black jacket", "jacket", "sweater", "black shirt", "business casual", "dress slacks"], "difficult_direct_answer": false, "rationales": ["The man is dressed casually in denim.", "By the color and the way it's fitted on the man you can tell they are blue jeans.", "The man has dress slacks."], "image": "train2014/COCO_train2014_000000363202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390814, "question_id": "4yVq4vGkCaLGf97AJhj4tT", "question": "What type of natural disaster could occur if the severity of the situation in the picture is increased?", "choices": ["earthquake", "tornado", "tsunami", "drought"], "correct_choice_idx": 2, "direct_answers": ["tsunami", "hurricane", "tsunami", "tsunami", "tsunami", "tsunami", "tsunami", "tsunami", "flood", "tsunami"], "difficult_direct_answer": false, "rationales": ["A tsunami would make the wave enormous.", "A very large wave called a tsunami could be disastrous and dangerous.", "The waves from the ocean are quite large, and there's a large amount of water."], "image": "train2014/COCO_train2014_000000390814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487378, "question_id": "4yYWiJN7usgQhWmVwMuatC", "question": "What is the white disc being carried meant to do?", "choices": ["nothing", "sail", "scoot", "scrape"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "playing", "playing", "throwing catching", "frisbee", "thrown", "thrown", "fly", "fly", "sail"], "difficult_direct_answer": false, "rationales": ["A girl is carrying a frisbee. frisbees are thrown in the air.", "The person is carrying a frisbee.", "The disc is a sail."], "image": "train2014/COCO_train2014_000000487378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110573, "question_id": "4ybQFTvfENTtwyWSwGRShJ", "question": "What is hidden behind him?", "choices": ["mirror", "urinal", "table", "shelf"], "correct_choice_idx": 1, "direct_answers": ["urinal", "urinal", "urinal", "toilet", "bathroom", "toilet", "toilet", "urinal", "urinal", "stall"], "difficult_direct_answer": false, "rationales": ["The man is posing for a photo in the washroom.", "A man is taking a selfie in the bathroom.", "A stand up toilet for men."], "image": "train2014/COCO_train2014_000000110573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424502, "question_id": "4ybkddhgRB75MZuV9bKa63", "question": "What is the person in blue and white with long black socks doing?", "choices": ["pitching", "counting cards", "eating", "walking"], "correct_choice_idx": 0, "direct_answers": ["pitcher", "pitching", "pitching", "pitching", "pitcher", "pitching", "pitching", "pitching", "pitching", "baseball"], "difficult_direct_answer": false, "rationales": ["The person on the mound is throwing the baseball.", "The man is pitching because he is on the pitching mound and he is pitching a ball at the hitter.", "He is getting ready to throw the ball to the batter."], "image": "val2014/COCO_val2014_000000424502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163727, "question_id": "4yjSmnPMe7WNhqmWSM3VTy", "question": "What reptile is depicted in the extra large kite?", "choices": ["chameleon", "iguana", "tortoise", "lizard"], "correct_choice_idx": 0, "direct_answers": ["lizard", "alligator", "gecko", "chameleon", "chameleon", "salamander", "lizard", "lizard", "lizard", "chameleon"], "difficult_direct_answer": false, "rationales": ["The reptile has lizard like features and bulbous eyes.", "This is a chameleon.", "The reptile is large and has eyes on top of its head."], "image": "train2014/COCO_train2014_000000163727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489140, "question_id": "4yv6byosgCWBQFnfvt5iyW", "question": "What is being dissected here?", "choices": ["machine", "sandwich", "frog", "table"], "correct_choice_idx": 1, "direct_answers": ["crab", "sandwich", "sandwich", "burger", "sandwich", "spices", "sandwich", "crab", "tomato", "crab"], "difficult_direct_answer": false, "rationales": ["The sandwich has been split apart.", "The sandwich has been opened up.", "There is meat, vegetables and bread and it's served with fries"], "image": "train2014/COCO_train2014_000000489140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7556, "question_id": "4zNADq7xN6SWjUMKFd47sA", "question": "Which direction does the wind blow?", "choices": ["toward boat", "rightward", "from hills", "up down"], "correct_choice_idx": 0, "direct_answers": ["toward boat", "from right", "east", "west", "straight", "opposite", "left", "west", "left", "west"], "difficult_direct_answer": false, "rationales": ["The sail will blow towards the boats.", "The direction is going toward the boat.", "The kites are all flowing in one direction."], "image": "val2014/COCO_val2014_000000007556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433078, "question_id": "4zQRtvvDLDZawdAXykA397", "question": "What does the woman want to do on the ramp?", "choices": ["sit", "paint it", "ride it", "lay down"], "correct_choice_idx": 2, "direct_answers": ["go down", "skateboard", "descend", "glide", "go down", "skate down", "skate", "start fast", "ride it", "skateboard down"], "difficult_direct_answer": true, "rationales": ["She wants to ride down it.", "She will ride the skateboard on the ramp.", "She is on a skate board at the top of the ramp. she is a thrill seeker."], "image": "train2014/COCO_train2014_000000433078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365999, "question_id": "4zUhKmPoJSAtkAdMihXndJ", "question": "How many of the green items will be used at a time?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["You only use one tennis ball at a time.", "Tennis balls are near a racket on a tennis court.", "The green items are tennis balls. the tennis player will not use multiple balls at the same time."], "image": "train2014/COCO_train2014_000000365999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483159, "question_id": "4ztjXqFyLcNSzkvyqRAaXL", "question": "These items are usually eaten for what?", "choices": ["lunch", "snack", "fancy wedding", "dinner"], "correct_choice_idx": 1, "direct_answers": ["breakfast", "dessert", "sweet", "sweet treat", "breakfast", "pleasure", "snack", "breakfast", "breakfast", "calories"], "difficult_direct_answer": false, "rationales": ["These items are donuts. they are not eaten at meals or fancy weddings.", "They are seems to be snacks.", "The low nutritional value of this food means it's best only used as an occasional treat then a full meal."], "image": "val2014/COCO_val2014_000000483159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408336, "question_id": "52LXxzXaexLt8hHQ4cjFFP", "question": "How is the zebra decorated?", "choices": ["white stripes", "black stripes", "all black", "all white"], "correct_choice_idx": 0, "direct_answers": ["stripes", "with stripes", "horizontal stripes", "stripes", "stripes", "stripes", "striped", "white stripes", "striped", "stripes"], "difficult_direct_answer": false, "rationales": ["Zebras are known to have a pattern on their black skin of white stripes.", "The zebra's base color is white. the strip colors are black.", "Zebras naturally grow black hair so the other color occurs when there is no pigment."], "image": "val2014/COCO_val2014_000000408336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577310, "question_id": "52MyLVbAQsQaZqQ39hyThg", "question": "What might the most colorful of kites be meant to represent?", "choices": ["mexico", "gay pride", "america", "pinata"], "correct_choice_idx": 1, "direct_answers": ["rainbow", "rainbow", "countries", "fish", "flags", "country", "rainbows", "pride", "country", "gay pride"], "difficult_direct_answer": false, "rationales": ["Because the kite has a rainbow of colors which is similar to the gay flag.", "The rainbow represents the gay pride.", "The kites are rainbow colored, associated with lgbt movements."], "image": "val2014/COCO_val2014_000000577310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217522, "question_id": "52VMBMTvvfMYgPkYq6Xvvu", "question": "How much is a small hotdog?", "choices": ["PS2.50", "PS3.00", "PS2.00", "PS1.50"], "correct_choice_idx": 3, "direct_answers": ["1.50 euros", "dollar fifty", "PS1.50", "PS1.50", "PS1.50", "1.5", "dollar fifty", "1.50", "1.50", "1.50"], "difficult_direct_answer": false, "rationales": ["The hot dog is 1.50.", "The sign on the side of the cart indicates the price of a small hotdog.", "The price of the hot dog is on the sign."], "image": "val2014/COCO_val2014_000000217522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113724, "question_id": "538ifDPwUQiuBNBVYdwwE3", "question": "What type of transportation is this?", "choices": ["air", "automobile", "water", "rail"], "correct_choice_idx": 3, "direct_answers": ["train", "highspeed rail", "train", "highspeed rail", "highspeed rail", "train", "train", "rail", "train", "train"], "difficult_direct_answer": false, "rationales": ["The shape, size and design of the vehicle as well as the setting and visible tracks underneath are all consistent with answer a.", "The bars under the vessel help it to slide along.", "This is a land, not air or water, vehicle. there are tracks beneath it."], "image": "train2014/COCO_train2014_000000113724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510527, "question_id": "53AHMfsmsgLGHXqnBiZPJ4", "question": "What is he doing?", "choices": ["tying tie", "driving", "checking himself", "eating"], "correct_choice_idx": 2, "direct_answers": ["fastening tie", "fixing tie", "adjusting tie", "adjusting tie", "loosening tie", "adjusting tie", "checking himself", "tying tie", "adjusting tie", "straightening tie"], "difficult_direct_answer": false, "rationales": ["The man is adjusting his tie. the man is looking in the direction of the camera screen which people use as a mirror sometimes.", "He is looking in a mirror and positioning his tie.", "The man is adjusting himself."], "image": "val2014/COCO_val2014_000000510527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577403, "question_id": "53UaisN7KjfwWVqedeWtGy", "question": "Why is the boy on the skateboard crouching down?", "choices": ["performing dance", "to grind", "to sit", "performing trick"], "correct_choice_idx": 3, "direct_answers": ["performing trick", "balance", "performing trick", "stability", "balance", "sidewalk", "balance", "balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["The boy is trying to do tricks with the board.", "The boy is trying to jump for a trick.", "He is doing a specific move."], "image": "val2014/COCO_val2014_000000577403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567530, "question_id": "53xA9Egdacvo6QVs6Km4ND", "question": "What is the man ready to do next?", "choices": ["flush", "throw", "rinse", "burn"], "correct_choice_idx": 0, "direct_answers": ["flush toilet", "flush", "flush", "flush", "flush", "use bathroom", "urinate", "use toilet", "urinate", "flush"], "difficult_direct_answer": false, "rationales": ["The man is standing next to a toilet so we can assume he's done what he needed to do and will now purge the system.", "The man is leaning over a toilet with his hand outstretched.", "People flush toilets after use."], "image": "train2014/COCO_train2014_000000567530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34824, "question_id": "544yEDq6pxKWTDEhX9Kn7H", "question": "Where is the dog staying under?", "choices": ["bed", "table", "island", "cabinet"], "correct_choice_idx": 1, "direct_answers": ["table", "table", "table", "table", "table", "table", "table", "table", "table", "table"], "difficult_direct_answer": false, "rationales": ["There are chairs and legs next to the dog as well as a bracket in the corner of the wood above him", "The dog is under a table.", "The inside bottom of the furniture over the dog has the characteristics of that item."], "image": "train2014/COCO_train2014_000000034824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144049, "question_id": "54ByewpJeqUubsvFkG8jAR", "question": "Where do tomatoes usually come from?", "choices": ["china", "italy", "america", "canada"], "correct_choice_idx": 2, "direct_answers": ["farm", "vines", "farms", "farm", "vines", "vine", "trees", "garden", "tomato vines", "america"], "difficult_direct_answer": false, "rationales": ["Most of the countries listed produce tomatoes, china produces the most in the world.", "Specifically they are from south america", "The usa is the second largest producer."], "image": "train2014/COCO_train2014_000000144049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313278, "question_id": "54NaeHRkeZCqy78MuC2F9Z", "question": "What is number 23 attempting to do?", "choices": ["tackle", "pass", "block", "score"], "correct_choice_idx": 2, "direct_answers": ["stop dribbler", "steal ball", "block", "block", "block", "get ball", "block", "block ball", "deflect ball", "stop"], "difficult_direct_answer": false, "rationales": ["The player wearing a 23 is in a black uniform and the girl with the ball in her position wears a white uniform reading 7. they are on opposite teams and player 23 is attempting to slow or stop player 7's progress", "Number 23 is trying to prevent the other player from scoring.", "She is trying to block the ball from her opponent."], "image": "train2014/COCO_train2014_000000313278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214197, "question_id": "54UkfUxYF7sA8QTQRLqxtx", "question": "Why are there mounds on the surface?", "choices": ["for tricks", "for grazing", "for barriers", "for gardening"], "correct_choice_idx": 0, "direct_answers": ["skating", "jumps stunts", "for tricks", "skate tricks", "for tricks", "skate tricks", "skateboard park", "more challenging", "jumps", "ramps"], "difficult_direct_answer": false, "rationales": ["The mounds are for tricks.", "This is a skate park where skaters go to preform and practice on the rinks.", "These are there for skateboards to be able to do tricks on."], "image": "val2014/COCO_val2014_000000214197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249508, "question_id": "54enpNcD7b5jwu7UKxmq6u", "question": "What is the boy in the white shirt using as a seat?", "choices": ["skateboard", "laptop", "backpack", "suitcase"], "correct_choice_idx": 0, "direct_answers": ["skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["A boy is sitting on a board that has wheels on it.", "The item has four wheels and a deck. the boy is watching another person use one of these items to do stunts.", "A kid is sitting on a skateboard on the sidewalk."], "image": "train2014/COCO_train2014_000000249508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1732, "question_id": "54tHPfesiQwqtVQBvgwEMT", "question": "What are guitar cases called?", "choices": ["gig case", "guitar box", "travel gig", "gig bag"], "correct_choice_idx": 3, "direct_answers": ["gig cases", "cases", "hard shell", "gig bags", "gig bag", "hard case", "gig bag", "shell", "gig bag", "gig bag"], "difficult_direct_answer": false, "rationales": ["Cases for guitars are known as gig bags denoting something used to carry a guitar for a gig or performance.", "The case is a gig bag.", "Most guitarist's call it a gig bag or case due to they perform gigs at night clubs or concerts."], "image": "train2014/COCO_train2014_000000001732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429759, "question_id": "54wFbJpruixuD72uLLS2so", "question": "Why are some items covered in tarps here?", "choices": ["surprise", "tariff rules", "rain protection", "black market"], "correct_choice_idx": 2, "direct_answers": ["shade", "stay dry", "stalls", "rain protection", "rain protection", "fruits", "rain protection", "protection", "rain", "keeping dry"], "difficult_direct_answer": false, "rationales": ["The people have umbrellas.", "Several people in the picture are carrying umbrellas so the items have been covered so they do not get wet.", "They are protecting them from the elements."], "image": "val2014/COCO_val2014_000000429759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487456, "question_id": "54zFWA6BbXaTLxMokeqLNE", "question": "Why is the machinery white?", "choices": ["toilet paper", "snow covered", "styrofoam", "painted white"], "correct_choice_idx": 1, "direct_answers": ["ice", "frozen", "ice/snow covered", "visibility", "snow covered", "snow", "snow", "space", "it's frozen", "full snow"], "difficult_direct_answer": true, "rationales": ["It is the same as the material on the ground, where people are on skis and wearing winter gear.", "There is snow all over the ground and including on the machinery.", "It is cold."], "image": "train2014/COCO_train2014_000000487456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5253, "question_id": "54zWLtVVMsRRpfN8GJwLUq", "question": "What color is the sign with the white arrows?", "choices": ["green", "yellow", "pink", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "green", "blue", "blue", "blue", "blue", "blue", "blue", "green"], "difficult_direct_answer": false, "rationales": ["The sign with white arrows is not yellow, green, or pink.", "It is blue and white.", "This is one of the primary colors"], "image": "train2014/COCO_train2014_000000005253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443712, "question_id": "55iFGfcfQ2Mtw2VUfCVz87", "question": "Which dark fruit is visible here?", "choices": ["cherry", "olive", "strawberry", "corn"], "correct_choice_idx": 1, "direct_answers": ["cucumber", "raisin", "olive", "watermelon", "olive", "cucumber", "banana", "black olives", "olives", "avocado"], "difficult_direct_answer": false, "rationales": ["There is dark fruit on the pizza.", "You can tell by the color and the pizza that it is on as to what type of veggie it is.", "It looks like a piece of the black variety on the food."], "image": "val2014/COCO_val2014_000000443712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450592, "question_id": "55qGVpRegbX4d7tgtMepw8", "question": "What are the white horses used for?", "choices": ["pulling carriage", "breeding", "racing", "tilling land"], "correct_choice_idx": 0, "direct_answers": ["pulling carriage", "pulling wagons", "pulling wagon", "pulling", "pulling wagons", "pulling", "horse", "pulling carriage", "pulling", "pull carriage"], "difficult_direct_answer": false, "rationales": ["The horses have harnesses that are meant to be attached to a structure with wheels in order for the horses to pull and move.", "The white horses are harnessed to a vehicle behind them which they will pull.", "The horses are shackled to the apparatus behind them."], "image": "train2014/COCO_train2014_000000450592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387210, "question_id": "55tHEWGfbUuJSa2CRrS9Bp", "question": "What is the most dangerous obstacle the surfer's will have to deal with?", "choices": ["rocks", "seaweed", "sand", "waves"], "correct_choice_idx": 0, "direct_answers": ["rocks", "rocks", "rocks", "high surf", "rocks", "rocks", "rocks", "wave", "huge rocks", "undertow"], "difficult_direct_answer": false, "rationales": ["The rocks are dangerous.", "Most people avoid rocks in the water.", "If they hit anything hard it could really hurt them."], "image": "val2014/COCO_val2014_000000387210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441471, "question_id": "55x4SvQeJV4EvD7UyPLdzv", "question": "Why is he touching the screen?", "choices": ["cleaning", "navigating", "massaging", "taking fingerprint"], "correct_choice_idx": 1, "direct_answers": ["control", "browsing", "using browser", "to scroll", "navigating", "using tablet", "working", "using tablet", "get information", "change image"], "difficult_direct_answer": true, "rationales": ["There are multiple tabs open on the tablet. he is switching between them.", "The person is looking through pages.", "He needs to move through the screens"], "image": "val2014/COCO_val2014_000000441471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70104, "question_id": "569vSJQqUzUMEUw72CseGQ", "question": "What service does the red bus connect passengers to?", "choices": ["subway service", "tram service", "train service", "plane service"], "correct_choice_idx": 3, "direct_answers": ["airport", "airport", "airport", "airport", "airport", "airport", "plane service", "airport", "airport", "airport"], "difficult_direct_answer": false, "rationales": ["The red bus has a sign on the front that says it goes to the airport.", "A red bus with the destination of airport is driving on the street.", "The red bus connects to a plane service because the sign on the bus reads london city airport"], "image": "train2014/COCO_train2014_000000070104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273561, "question_id": "56FQ5huh37ZZaZDsLEPdCi", "question": "Besides the valence what is being used to cover the windows?", "choices": ["curtains", "horizontal blinds", "vertical blinds", "shade"], "correct_choice_idx": 0, "direct_answers": ["curtains", "blinds", "blinds", "blinds", "ruffles", "blinds", "ruffles", "blinds", "blinds", "blinds"], "difficult_direct_answer": false, "rationales": ["They have shades up to cover the windows that can be opened or closed.", "There are nice wooden slats that have strings to maneuver them", "The valence is at the top and the rest of the window is covered by slats that run side to side across the windows and can be rotated to be in an opened or closed position."], "image": "train2014/COCO_train2014_000000273561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570917, "question_id": "56XPuM6afPXgEEuoVZsGtu", "question": "If you were frying eggs what would you be facing most directly?", "choices": ["melon baller", "refrigerator", "microwave", "sink"], "correct_choice_idx": 2, "direct_answers": ["microwave", "stove", "stove", "stove", "stove", "stove", "microwave", "stove", "stove", "microwave"], "difficult_direct_answer": false, "rationales": ["It's right above the oven.", "The stove is underneath the microwave. if you were frying eggs, the microwave would be right in front of you.", "The microwave is on top of the stove."], "image": "train2014/COCO_train2014_000000570917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423341, "question_id": "56ajK7Gvq7qFkdoofbuuLW", "question": "What type of vehicle is the person in the middle lane using?", "choices": ["bus", "motorcycle", "car", "bicycle"], "correct_choice_idx": 3, "direct_answers": ["bike", "bike", "bicycle", "bicycle", "bike", "bike", "bicycle", "bicycle", "bike", "bike"], "difficult_direct_answer": false, "rationales": ["This is the vehicle situated between the other larger ones.", "The vehicle is the bike.", "With busses to the left and cars to the right, the vehicle in the middle is a bicycle."], "image": "train2014/COCO_train2014_000000423341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92957, "question_id": "56mid3cVweABAaSmqQ7Axj", "question": "What are the numbered pieces of paper for?", "choices": ["games", "score", "food tickets", "prizes"], "correct_choice_idx": 2, "direct_answers": ["order numbers", "meal", "tickets", "order", "assigned seating", "tracking", "auction number", "food tickets", "ticket numbers", "order number"], "difficult_direct_answer": true, "rationales": ["This is a restaurant and the tickets are to pick up your food.", "Small papers with numbers are on a table with wine glasses.", "Their order number is on there."], "image": "train2014/COCO_train2014_000000092957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183626, "question_id": "56nKSt7qB4i82DtPatX26u", "question": "What kind of walls are in this house?", "choices": ["stone", "plaster", "log", "brick"], "correct_choice_idx": 2, "direct_answers": ["wood", "logs", "wooden", "wooden", "log", "windows", "log", "logs", "log", "wood"], "difficult_direct_answer": false, "rationales": ["The walls look like a cabin wall.", "Round, wood beams can be seen making up the walls of the house.", "The walls are logs."], "image": "train2014/COCO_train2014_000000183626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282234, "question_id": "56sKqV7LP7QJdBQi2jNUrf", "question": "What style of hat is the boy wearing?", "choices": ["fedora", "beanie", "baseball cap", "derby"], "correct_choice_idx": 2, "direct_answers": ["baseball cap", "baseball cap", "baseball", "baseball cap", "beanie", "cap", "baseball cap", "baseball cap", "baseball cap", "tshart"], "difficult_direct_answer": false, "rationales": ["His hat is round with a bill, the kind they use when playing baseball.", "The brim visible on this cap; partially visible extending from the back here, identifies it as a baseball style hat.", "The boy is wearing a style of baseball cap."], "image": "train2014/COCO_train2014_000000282234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310511, "question_id": "56uFdbyaSMXBFBkmQRf5Xr", "question": "The man is wearing a shirt of what type of company?", "choices": ["farm", "search engine", "catering", "warehouse"], "correct_choice_idx": 1, "direct_answers": ["search engine", "google", "internet", "tech company", "google", "google", "google", "tech", "google", "google"], "difficult_direct_answer": false, "rationales": ["The shirt says google and they are known for being a search engine.", "This company is known for the internet", "The shirt has a google logo on it."], "image": "train2014/COCO_train2014_000000310511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478380, "question_id": "58DAfC9egMWPHJD4mCZ8FP", "question": "Which animal is classified as a similar toed ungulate as these?", "choices": ["squid", "horse", "deer", "jellyfish"], "correct_choice_idx": 2, "direct_answers": ["goats", "goat", "goat", "sheep herd", "goats", "deer", "goat", "lambs", "goats", "goat"], "difficult_direct_answer": false, "rationales": ["This common animal has hooves and meanders through wild natural areas, pastures and roadways like these animals. however, this animal is a wild animal.", "A sheep is an even-toed ungulate and so is a deer.", "These animals are sheep. jellyfish and squid are not ungulates, and horses have an odd, not even, number of toes."], "image": "val2014/COCO_val2014_000000478380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359379, "question_id": "58G76qkdhHNEUwteUMCFzX", "question": "What type of sign is shown in the image?", "choices": ["stop", "yield", "pedestrians crossing", "train crossing"], "correct_choice_idx": 2, "direct_answers": ["school crossing", "street traffic", "pedestrians crossing", "street", "street", "street", "traffic", "street sign", "street sign", "street sign"], "difficult_direct_answer": false, "rationales": ["A yellow sign showing people walking is shown at a corner of a street. there are pedestrian crossings at intersections.", "The yellow sign on the post is to indicate that pedestrians will cross the street.", "School crossing sign signifies there will be little pedestrians."], "image": "train2014/COCO_train2014_000000359379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21060, "question_id": "58nnPgA3tnSdzrBuArenRB", "question": "What is on the desk?", "choices": ["laptop", "fishbowl", "cat", "rat"], "correct_choice_idx": 0, "direct_answers": ["monitor", "laptop", "post-it notes", "monitor", "post-it notes", "monitor", "monitor", "monitor", "monitor", "monitor"], "difficult_direct_answer": false, "rationales": ["There are quite a few things on the desk but none are animals or mammals.", "A laptop is propped up on the desk.", "A computer is on the desk."], "image": "val2014/COCO_val2014_000000021060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450026, "question_id": "59T56KYGmjAarRmYdGGXGi", "question": "Why plants are planted on roadside?", "choices": ["decorative purpose", "wind breaks", "climatic excesses", "crop prevention"], "correct_choice_idx": 1, "direct_answers": ["decoration", "trees", "aesthetics", "decoration", "beautification", "greens", "avoid accident", "cactus", "decoration", "wind breaks"], "difficult_direct_answer": false, "rationales": ["The plants are use to stop the wind.", "Plants make the area look nicer.", "These are to make a concrete and buildings area look better"], "image": "train2014/COCO_train2014_000000450026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409667, "question_id": "59Tc2thgYw7TDbD3JKvBet", "question": "Why does he have one leg in the air?", "choices": ["just pitched", "to balance", "is falling", "is angry"], "correct_choice_idx": 0, "direct_answers": ["power", "tossing baseball", "forceful throwing", "extra power", "pitching", "pitching", "throwing", "pitching", "pitching forth", "just pitched"], "difficult_direct_answer": false, "rationales": ["Pitching a baseball uses the whole body and flinging a leg up and behind gives more speed to the ball.", "He just threw the ball very fast to the batter and the force of the throw makes his leg go up behind him", "The man is just pitching a ball."], "image": "val2014/COCO_val2014_000000409667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82456, "question_id": "59XB9MDfMXraE6uBAeDSYf", "question": "What is the ancestral animal this current elephants originated from?", "choices": ["snow elephant", "woolly mammoth", "russian mammoth", "mega elephant"], "correct_choice_idx": 1, "direct_answers": ["elephant", "mammoth", "wooly mammoth", "woolly mammoth", "african elephant", "mastadons", "mastodon", "elephants", "elephant", "india"], "difficult_direct_answer": true, "rationales": ["They originated from the woolly mammoth.", "The wooly mammoth lived many centuries ago.", "Traditionally modern day elephants are descendants of the woolly mammoth."], "image": "val2014/COCO_val2014_000000082456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441213, "question_id": "59gNdguBEM6th3Nscvsjh6", "question": "What allowed the man to get air?", "choices": ["trampoline", "pipe", "ramp", "barrel"], "correct_choice_idx": 2, "direct_answers": ["jumping", "thrust", "skateboard", "ramp", "ramp", "gravity", "ramp", "speed", "skateboard", "jumped ramp"], "difficult_direct_answer": false, "rationales": ["He went up an incline at a high rate of speed", "He is jumping off the ramp with the skateboard.", "A man is on a skateboard above a ramp. ramps are used to get air."], "image": "train2014/COCO_train2014_000000441213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196653, "question_id": "59okrvPfQxJDozEm3xYymF", "question": "What is the white device the man is holding in his left hand?", "choices": ["clock", "calculator", "cell phone", "game controller"], "correct_choice_idx": 3, "direct_answers": ["game controller", "game controller", "game controller", "controller", "game controller", "controller", "controller", "controller", "controller", "game controller"], "difficult_direct_answer": false, "rationales": ["The man is holding a white video game controller is his left hand to play a game.", "The man is using a controller to play a game.", "The man is holding a game controller in his left hand."], "image": "train2014/COCO_train2014_000000196653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303247, "question_id": "59zoSmMcyPg3kRiwXbDh5z", "question": "What is a group of the fruit called?", "choices": ["hand", "peck", "bushel", "pint"], "correct_choice_idx": 0, "direct_answers": ["bunch", "fruit", "bunch", "banana", "bunch", "grove", "bunch", "bunch", "hand", "banana"], "difficult_direct_answer": false, "rationales": ["The bananas are grouped together and called a bushel.", "The fruits are bananas. each banana is a finger.", "A bunch of bananas is called a hand."], "image": "train2014/COCO_train2014_000000303247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441432, "question_id": "5A58ZgK6jySbiuAc7ccGPN", "question": "What weather event happened recently here?", "choices": ["hail", "none", "windy rain", "tsunami"], "correct_choice_idx": 2, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "windy rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["There is a broken umbrella in the garbage can.", "The umbrella is there from the rain.", "An umbrella as well as some other items are stuffed in a trash can."], "image": "val2014/COCO_val2014_000000441432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397445, "question_id": "5AAS4ENqRrzQQkUZrc3i7Y", "question": "What unusual material is being used to prop up the traffic signal in this intersection?", "choices": ["wood", "stone", "titanium", "brick"], "correct_choice_idx": 3, "direct_answers": ["signpost", "brick", "brick wall", "another sign", "brick", "brick", "sticks", "tree", "traffic signal", "traffic"], "difficult_direct_answer": false, "rationales": ["The items have the shape, size and color as a brick would. brick is one of the most common building materials and would be held together with grout as it appears in this platform.", "Street lights are on top of a brick structure. street lights are usually not on bricks.", "The sign on the street is being held up by a brick wall."], "image": "train2014/COCO_train2014_000000397445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127702, "question_id": "5AGQQfTLUq2FYVqFFr697S", "question": "What kind of skiing is done here?", "choices": ["downhill", "trick", "alpine", "cross country"], "correct_choice_idx": 3, "direct_answers": ["cross country", "trail skiing", "alpine", "snow skiing", "crosscountry", "cross country", "cross country", "snow", "nordic", "crosscountry"], "difficult_direct_answer": false, "rationales": ["The skiing appears to be taking place on flat ground. downhill, alpine and trick would all require a hill.", "Cross country skiing is being done.", "The terrain is flat with a ski trail going through the woods."], "image": "val2014/COCO_val2014_000000127702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117310, "question_id": "5ASYyCTPxPDaF5hYikXZx3", "question": "What are the people on the couch looking at?", "choices": ["mirror", "child", "picture window", "gaming screen"], "correct_choice_idx": 3, "direct_answers": ["television", "video screen", "tv", "television", "game", "wii tv", "gaming screen", "television", "television", "tv"], "difficult_direct_answer": false, "rationales": ["They are playing a game.", "The people are gaming.", "The people are both looking in the same direction and holding game device controllers. gaming devices would belong to a system that would be connected to a screen."], "image": "val2014/COCO_val2014_000000117310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251181, "question_id": "5AV2gAYZ2M9qtJEf6NAqVo", "question": "What food does the company whose sign is right next to the yellow car likely sell?", "choices": ["caviar", "lizard", "beef", "pudding"], "correct_choice_idx": 2, "direct_answers": ["meat", "meat", "bar blue", "meat", "beef", "barbecue", "bar-b-que", "sausage", "bbq meat", "meat"], "difficult_direct_answer": false, "rationales": ["The sign says they sell sausage, barbecue and meat.", "The food company has a sign posted that says they sell beet products.", "Barbecue is normally food made from cows. the sign states barbecue, sausage, and fresh meat."], "image": "val2014/COCO_val2014_000000251181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196851, "question_id": "5Akamji6trTLBvqzMfiicA", "question": "Why are the people wearing white aprons?", "choices": ["to dance", "to paint", "to cook", "for cosplay"], "correct_choice_idx": 2, "direct_answers": ["protect clothes", "keep clean", "working food", "protect clothing", "protect clothing", "deflect food", "cleanliness", "working kitchen", "cooks", "to cook"], "difficult_direct_answer": true, "rationales": ["The people are cooking.", "The people are preparing food.", "The people are cooking up a meal on the counter."], "image": "train2014/COCO_train2014_000000196851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189278, "question_id": "5Ar5LgxGdJVvQ9hVhk2hQm", "question": "Why is the person using a towel on the handle?", "choices": ["to clean", "it's cold", "it's hot", "to paint"], "correct_choice_idx": 2, "direct_answers": ["hot", "it's hot", "hot", "hot", "it's hot", "hot stove", "hot", "handle hot", "hot", "hot"], "difficult_direct_answer": false, "rationales": ["An oven cooks with heat.", "Oven get hot. people sometimes use towels to block heat when pot holders are not handy.", "There's steam or smoke coming out of the item, and there's a thermometer on the door. the device is also made out of metal, which transfers heat."], "image": "train2014/COCO_train2014_000000189278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66085, "question_id": "5Atf4zxB3VcSvUtrWyVtXA", "question": "What is the woman reaching into the backpack wearing on her wrist?", "choices": ["diamonds", "wristwatch", "rope", "bracelet"], "correct_choice_idx": 1, "direct_answers": ["watch", "watch", "watch", "watch", "watch", "watch", "watch", "watch", "wristwatch", "watch"], "difficult_direct_answer": false, "rationales": ["The woman has a clock on her wrist.", "The woman has a watch on her wrist.", "She has a watch on her wrist."], "image": "train2014/COCO_train2014_000000066085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250258, "question_id": "5AxybfF4PwN8sWteNJpvpF", "question": "How many of these professional American venues have artificial turf?", "choices": ["30", "eight", "five", "12"], "correct_choice_idx": 2, "direct_answers": ["many", "five", "all", "all", "all", "most", "five", "five", "100", "44"], "difficult_direct_answer": false, "rationales": ["Many fields use turf.", "Five professional ballparks use turf.", "There are five venues."], "image": "val2014/COCO_val2014_000000250258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479078, "question_id": "5AzUDF8hUPt9t299qh9cmS", "question": "Which bar is the horse meant to pass over?", "choices": ["top bar", "right vertical", "left vertical", "bottom"], "correct_choice_idx": 0, "direct_answers": ["top bar", "top", "top", "top", "highest one", "bar", "top", "top", "top most", "top bar"], "difficult_direct_answer": false, "rationales": ["A horse is jumping over a bar in an obstacle course with a jockey riding on it. the bars are set up, one directly over the other.", "The goal of the sport is to jump over the bar without knocking any off.", "The bar is the top one."], "image": "val2014/COCO_val2014_000000479078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50556, "question_id": "5B2gcbpsdhCHKbeqHrixcR", "question": "Which food will add an acidic flavor to the food?", "choices": ["meat", "lemon", "soy sauce", "miso"], "correct_choice_idx": 1, "direct_answers": ["tasty food", "lemon", "bitter", "wasabi sauce", "sauce", "lemon", "soy sauce", "soy sauce", "lemon", "vinegar"], "difficult_direct_answer": false, "rationales": ["Lemons add acid to the food making it sour.", "The food is a lemon.", "Lemon is the most acidic of the visible food options. the other options are more in the sweet or salt range."], "image": "train2014/COCO_train2014_000000050556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112922, "question_id": "5BA4kLSLXLSYEe8cA8ekNS", "question": "What is another name for the sport written on the board?", "choices": ["skiing", "football", "rugby", "tennis"], "correct_choice_idx": 1, "direct_answers": ["football", "football", "futbol", "football", "football", "futbol", "football", "football", "futbol", "football"], "difficult_direct_answer": false, "rationales": ["The name is football.", "In the usa it's called soccer but in some other places in the world they all it football.", "The sign says soccer."], "image": "train2014/COCO_train2014_000000112922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161657, "question_id": "5BHFY9Qzy3KFhKYsCHgWxL", "question": "What kind of bathroom is this?", "choices": ["home", "hotel", "hospital", "school"], "correct_choice_idx": 1, "direct_answers": ["hotel", "large", "apartment bathroom", "hotel", "hotel", "master", "5-piece", "marble", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["The bathroom looks very large and fancy.", "The bathroom is fancy.", "The way it's laid out looks like a paid room bathroom."], "image": "train2014/COCO_train2014_000000161657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157160, "question_id": "5BPU6DHF6tNAoXKdXaDc3W", "question": "What number is the batter?", "choices": ["five", "42", "12", "nine"], "correct_choice_idx": 0, "direct_answers": ["five", "five", "five", "five", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["It is on the front of his shirt", "His number is beneath the name of his team on the front of his jersey.", "The number is the fifth in the line."], "image": "train2014/COCO_train2014_000000157160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392083, "question_id": "5BT7GV564kGttzkrGmc3Vc", "question": "What is the person in the elevated stand watching?", "choices": ["sunset", "swimmers", "boats", "sea slugs"], "correct_choice_idx": 1, "direct_answers": ["swimmers", "swimmers", "swimmers", "safeguarding", "swimmers", "safeguarding", "ocean", "swimmers", "swimmers", "swimmers"], "difficult_direct_answer": false, "rationales": ["They are a lifeguard.", "The person in the stand is a lifeguard and watch to make sure no one drowns.", "The person in the stand is a lifeguard and he's watching for anyone in the water who might need help."], "image": "train2014/COCO_train2014_000000392083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482951, "question_id": "5BURHbqVnBoptHRoRKWS6S", "question": "Where does the girl want the toy she holds to go?", "choices": ["skyward", "nowhere", "down", "sideways"], "correct_choice_idx": 0, "direct_answers": ["airborne", "air", "air", "upwards", "in air", "air", "skyward", "in air", "sky", "in air"], "difficult_direct_answer": false, "rationales": ["Looks to be some kind of different shape kite.", "The girl wants to fly the kite.", "The girl is holding a kite."], "image": "val2014/COCO_val2014_000000482951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452618, "question_id": "5C9AmTFrPzQXKhncBig7mr", "question": "Where is this woman feasting?", "choices": ["airplane", "train", "subway", "bus"], "correct_choice_idx": 0, "direct_answers": ["airplane", "plane", "airplane", "cake", "plane", "airplane", "airplane", "first class", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["The food is on a tray that is attached to the seat in front of the woman. the windows are small.", "A woman is holding several plates of food. she is sitting in a seat with a window.", "The woman is sitting by a small airplane window."], "image": "val2014/COCO_val2014_000000452618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413070, "question_id": "5CCcNkzDWE9E7XaCRpz9iR", "question": "What is one of the colors on the curtain?", "choices": ["red", "purple", "yellow", "blue"], "correct_choice_idx": 2, "direct_answers": ["yellow", "orange", "yellow", "white", "yellow", "green", "white", "bathroom", "white", "gray"], "difficult_direct_answer": false, "rationales": ["The curtain is gray and orange.", "Its the most brilliant color on the curtain.", "One of the circle patterns on the curtain is a stark contrast with the white and can be identified as one if the primary colours of yellow."], "image": "train2014/COCO_train2014_000000413070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535355, "question_id": "5CKkKsauYVTpx24aUmauuD", "question": "What type fruit is seen on this pie?", "choices": ["strawberry", "carrots", "cherry", "olives"], "correct_choice_idx": 3, "direct_answers": ["olives", "olives", "olive", "olives", "spinach", "blueberry", "black olives", "olive", "blueberry", "blueberry"], "difficult_direct_answer": false, "rationales": ["The thing on the top is round and black.", "The toppings are black.", "The pizza slice is topped with whole black olives which are a fruit."], "image": "train2014/COCO_train2014_000000535355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66831, "question_id": "5CTMpgmaDr3LGZieG2Jrkd", "question": "Where are these fruits being sold?", "choices": ["mall", "outdoor stall", "supermarket", "farmer's market"], "correct_choice_idx": 2, "direct_answers": ["grocery store", "market", "grocery store", "supermarket", "grocery store", "bananas", "bananas", "supermarket", "supermarket", "grocery store"], "difficult_direct_answer": false, "rationales": ["There are several rows of food indicating a large store.", "These bananas are inside of a store that sells food. there are people in the background shopping in this store. there is a sign posted at the bananas that states the price of the bananas.", "These are being sold at a maket."], "image": "train2014/COCO_train2014_000000066831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384023, "question_id": "5CV9uSgCFpkYgi9BjHNJuB", "question": "If the train continues forward what will make it go out of sight first?", "choices": ["bridge", "building", "tunnel", "extreme distance"], "correct_choice_idx": 2, "direct_answers": ["tunnel", "tunnel", "tunnel", "caboose", "tunnel", "first car", "tunnel", "tunnel", "tunnel", "tunnel"], "difficult_direct_answer": false, "rationales": ["Even though the train is still close, when it goes inside this it can't be seen", "The train is entering a tunnel and is not visible inside the tunnel.", "The train is headed into a tunnel."], "image": "train2014/COCO_train2014_000000384023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74411, "question_id": "5CVLk8mn7BBth5RrEU3SWD", "question": "Why is the person wearing an orange vest?", "choices": ["costume", "disguise", "warmth", "visibility"], "correct_choice_idx": 3, "direct_answers": ["for safety", "fire chief", "standout", "visibility safety", "safety", "traffic control", "bus", "visibility", "visibility", "his opinion"], "difficult_direct_answer": true, "rationales": ["A person is wearing a bright colored vest in an area of an emergency. orange is often used to increase visibility of things like runners, construction workers, etc.", "Orange or neon clothing is usually worn for safety reasons because it is very easily seen.", "The person wants to be visible."], "image": "train2014/COCO_train2014_000000074411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335975, "question_id": "5Ck5RLWfEUH5XSbzEBW95i", "question": "Which electronics manufacturer is advertised?", "choices": ["lg", "sony", "toshiba", "hitachi"], "correct_choice_idx": 3, "direct_answers": ["hitachi", "hitachi", "hitachi", "hitachi", "hitachi", "hitachi", "hitachi", "hitachi", "hitachi", "hitachi"], "difficult_direct_answer": false, "rationales": ["There is a sign saying hitachi on the building.", "A building has the logo on the front, above the doors.", "They have a sign above and behind the no entry sign."], "image": "train2014/COCO_train2014_000000335975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253323, "question_id": "5Ck9J4voggfYyDMtBzaMzq", "question": "What purpose is served by the open glass building with green posts?", "choices": ["green grocer", "phone booth", "bus stop", "lemonaid stand"], "correct_choice_idx": 2, "direct_answers": ["shelter", "waiting", "bus boarding", "bus stop", "bus stop", "bus stop", "bus stop", "bus stop", "bus stop", "shelter"], "difficult_direct_answer": false, "rationales": ["The metal and glass structure is for waiting for the bus in which we see is now parked there.", "There are multiple buses nearby and this is a place for passengers to sit while they wait for one to arrive.", "Bus stops are on the side of streets with coverings for the people waiting to stay out of the elements."], "image": "train2014/COCO_train2014_000000253323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47055, "question_id": "5CkQtE7zHYDVQEaLmUA9QK", "question": "The layout and style of this room can be labeled as what?", "choices": ["hip", "modern", "retro", "classic"], "correct_choice_idx": 1, "direct_answers": ["modern", "minimalist", "modern", "modern", "bedroom", "modern", "modern", "modern", "modern", "modern"], "difficult_direct_answer": false, "rationales": ["The layout is modern.", "The style is sleek and bold.", "The design looks very modern."], "image": "val2014/COCO_val2014_000000047055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153692, "question_id": "5CzCQM6XAs9mPqmQWPBghr", "question": "What is an object that shares a color with the frame of the bike?", "choices": ["blueberries", "oranges", "bananas", "watermelon"], "correct_choice_idx": 1, "direct_answers": ["orange", "dog", "traffic light", "orange", "oranges", "road markings", "pumpkin", "traffic light", "orange", "pumpkin"], "difficult_direct_answer": false, "rationales": ["The bike is orange.", "The bike is orange and so are oranges.", "This fruit has a bright color that is a mixture of red and yellow"], "image": "val2014/COCO_val2014_000000153692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188037, "question_id": "5D4wHf8SzYXi9jsP6PhAm5", "question": "The RTO code in the motor vehicle represent which state?", "choices": ["maharashtra", "assam", "kerala", "delhi"], "correct_choice_idx": 0, "direct_answers": ["my", "marshal islands", "a4520", "don't know", "marshall island", "maharashtra", "arizona", "akola", "anda", "rhode island"], "difficult_direct_answer": true, "rationales": ["The rto code is used in maharashtra.", "The plate says my and is from india.", "The rto code on the motor vehicle is mh-30."], "image": "train2014/COCO_train2014_000000188037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141056, "question_id": "5D9tMRQS8prE5yW3MAxb2G", "question": "How many calories are in melted cheese?", "choices": ["321kcal", "541kcal", "654kcal", "983kcal"], "correct_choice_idx": 3, "direct_answers": ["lot", "250", "many", "hundreds", "983kcal", "too many", "one hundred", "983", "lot", "272"], "difficult_direct_answer": true, "rationales": ["Cheese has a lot of calories.", "Cheese is a high calorie food. cheese is on a pizza.", "Cheese is a high calorie food."], "image": "train2014/COCO_train2014_000000141056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312638, "question_id": "5DJD5QJqzWR9GhKPeGSqgV", "question": "Where does he hope his toy will go?", "choices": ["sky", "trees", "water", "sand"], "correct_choice_idx": 0, "direct_answers": ["sky", "sky", "sky", "dryland", "sky", "sky", "air", "sky", "air", "sky"], "difficult_direct_answer": false, "rationales": ["The boy is playing with a kite on the beach.", "The toy is a kite, designed for what is mentioned in option a.", "A boy is preparing a kite on a beach. kites are flown in the air."], "image": "train2014/COCO_train2014_000000312638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26611, "question_id": "5DL4FbHQjxhmqRP24kDyUF", "question": "What was the black item used for?", "choices": ["chopping parsley", "peeling broccoli", "mixing dressing", "peeling carrots"], "correct_choice_idx": 3, "direct_answers": ["peeling skin", "grating", "peeling", "peel carrots", "peeling", "shredding carrots", "peeling", "peeling carrots", "peeling", "pealing"], "difficult_direct_answer": false, "rationales": ["They are sharp-edged and have some orange matter remaining on the blade.", "A kitchen tool with a blade and a handle is next to a bowl of carrots.", "This item is a vegetable peeler."], "image": "val2014/COCO_val2014_000000026611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465507, "question_id": "5DPwuTU6SdSvnG3xvau3rc", "question": "What does the transportation a little behind the red umbrella generally hold?", "choices": ["horse", "numerous people", "baby", "cargo"], "correct_choice_idx": 2, "direct_answers": ["red", "people", "baby", "baby", "baby", "babies", "baby", "baby", "luggage", "baby"], "difficult_direct_answer": false, "rationales": ["Behind the red umbrella there is a stroller visible based on its design and size. strollers are intended to contain babies.", "The cart is for babies", "There is a stroller behind the lady."], "image": "train2014/COCO_train2014_000000465507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265243, "question_id": "5DZseNbbfistQApCtAqUst", "question": "What photographic technique was used to capture the movement of traffic on the road?", "choices": ["panorama", "hdr", "time-lapse", "bokeh"], "correct_choice_idx": 2, "direct_answers": ["long exposure", "rolling camera", "light trails", "light trail", "blurry light", "time-lapse", "modern technique", "time alteration", "long exposure", "panning"], "difficult_direct_answer": true, "rationales": ["It is slow framed photography that catches cars at different points", "Timelapse effect is used to create the lines.", "If a stationary camera's shutter is clicked open and shut slowly and repeatedly, it produces this type of blurred photo."], "image": "val2014/COCO_val2014_000000265243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113716, "question_id": "5EGrARwha6nRfVvum9G8aH", "question": "What is used to cover train tracks?", "choices": ["glass", "coal", "ballast", "cement"], "correct_choice_idx": 2, "direct_answers": ["ballast", "metal", "iron", "train", "bridges", "trains", "rocks", "steel", "gravel", "ballast"], "difficult_direct_answer": true, "rationales": ["The tracks are covered in ballast.", "The track appears to have gravel and rocks on and around the tracks. regarding train tracks, this gravel would be referred to as answer a.", "This is the only thing that would not eventually be destroyed or used up by the train."], "image": "train2014/COCO_train2014_000000113716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380979, "question_id": "5ERhua776rWZk89bfRX3vr", "question": "Who are these three people?", "choices": ["visitors", "customers", "firefighters", "passengers"], "correct_choice_idx": 0, "direct_answers": ["family", "people", "riders", "family", "family", "family", "tourists", "visitors", "volunteers", "fire dept"], "difficult_direct_answer": false, "rationales": ["They are tourists.", "They look to be passengers in the car.", "The group of people look like a family that's visiting local attractions. they appear to be a father, mother, and daughter, and are dressed in tourist clothes."], "image": "val2014/COCO_val2014_000000380979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445268, "question_id": "5EW7vUxUs5VdjcMpFyrWsP", "question": "Where has the white car on the yellow straps been?", "choices": ["harbor water", "towtruck bed", "boat", "dock"], "correct_choice_idx": 0, "direct_answers": ["in water", "harbor water", "water", "lake", "under water", "underwater", "water", "water", "submerged", "underwater"], "difficult_direct_answer": false, "rationales": ["The yellow harness is lifting the formerly submerged car.", "It was in the water and being pulled out.", "A car is being lifted and is above a harbor and water."], "image": "train2014/COCO_train2014_000000445268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161231, "question_id": "5EqCuLQtoVJvDHbUiVkX3b", "question": "When leaving in which directions do these trains travel?", "choices": ["none", "opposite", "east west", "same"], "correct_choice_idx": 3, "direct_answers": ["right side", "right", "north", "parallel", "forward", "same direction", "same", "same direction", "forward", "up"], "difficult_direct_answer": false, "rationales": ["The trains are going in the same direction.", "The trains are both faced in the identical direction.", "They are facing different directions."], "image": "val2014/COCO_val2014_000000161231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267734, "question_id": "5FFEuva9Y54SinvA43Q6oA", "question": "What is this type of mirror on a bike called?", "choices": ["rear view", "helping", "utility", "tracking"], "correct_choice_idx": 0, "direct_answers": ["rearview mirror", "side mirror", "side", "rearview", "rear view", "review mirror", "side view", "side mirror", "side mirror", "rear view"], "difficult_direct_answer": false, "rationales": ["It shows you a reflection from behind so you don't have to turn around to see.", "This is a mirror to see behind him", "It allows the rider to see behind them without having to turn their head."], "image": "train2014/COCO_train2014_000000267734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401850, "question_id": "5FHhmxJj8s3bL8cYXKVqsj", "question": "What is sold in most of the stores seen here?", "choices": ["cars", "sleds", "clothes", "stocks bonds"], "correct_choice_idx": 2, "direct_answers": ["clothes", "clothing", "clothing", "clothes", "clothing", "clothing", "clothes", "clothing", "clothing", "clothing"], "difficult_direct_answer": false, "rationales": ["These department stores primarily sell clothing but none of the other items listed.", "Storefronts in a city are shown. many stores in cities sell clothing.", "Most of the stores here are seen selling clothes."], "image": "val2014/COCO_val2014_000000401850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32301, "question_id": "5FLZAzro3vDFNG96x3rhGX", "question": "What kind of baskets are for sale in this shop?", "choices": ["cracker", "vegetable", "chocolate", "fruit"], "correct_choice_idx": 3, "direct_answers": ["flower", "flower", "fruit", "fruit", "fruit", "fruit", "fruit", "fruit", "fruit", "fruit baskets"], "difficult_direct_answer": false, "rationales": ["The baskets are for fruit.", "You can see all the fruit in the store.", "There is produce in the window of the shop."], "image": "train2014/COCO_train2014_000000032301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224020, "question_id": "5FUAcYnDqpv6QxsQYks9zB", "question": "When did the restaurant make this donut?", "choices": ["same day", "month before", "week before", "day before"], "correct_choice_idx": 0, "direct_answers": ["recently", "same day", "this morning", "recently", "recently", "today", "recently", "morning", "today", "morning"], "difficult_direct_answer": false, "rationales": ["On the packaging next to the donut you can see majority of the wording fresh. this would indicate that products sold with this package would be baked the same day.", "The bag says fresh so it is presumed the item is made the same day.", "Usually donuts are made the same day."], "image": "val2014/COCO_val2014_000000224020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282134, "question_id": "5FW8nMy3AfLT2idzPAsfUY", "question": "Why is he smiling?", "choices": ["ate dinner", "stopped fight", "likes dogs", "new shirt"], "correct_choice_idx": 2, "direct_answers": ["dogs", "happy", "loves dogs", "happy", "posing", "hes happy", "likes dogs", "love", "happy", "loves dogs"], "difficult_direct_answer": false, "rationales": ["He is happy with his animals.", "He has two dogs in front of him.", "He is kneeling down and interacting with the dogs, so they are likely the thing making him happy."], "image": "val2014/COCO_val2014_000000282134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394190, "question_id": "5FZzjvJa9ytgFxwVz6xSNB", "question": "What is decorating the top of this girl's hat?", "choices": ["pom-pom", "dye", "glitter", "tassel"], "correct_choice_idx": 0, "direct_answers": ["hearts", "ball", "hearts", "hearts", "pom pom", "pom pom", "tussle", "pom-pom", "fuzzy ball", "ball"], "difficult_direct_answer": false, "rationales": ["The hat has a round fluffy ball on the top of it.", "In knitting balls of yarn that looks like this on top of the girls cat are known as the type of decoration there are also smaller version and used in arts and crafts projects.", "The round part is the same name as a cheerleader part."], "image": "train2014/COCO_train2014_000000394190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348636, "question_id": "5FbzwMsT8cjKtG9yMUnAxG", "question": "Why does the boy have his arms out?", "choices": ["swing", "break fall", "wave", "reach"], "correct_choice_idx": 0, "direct_answers": ["baseball", "swinging bat", "hitting ball", "hit ball", "hit ball", "batting", "hitting ball", "swing bat", "swing", "hitting ball"], "difficult_direct_answer": false, "rationales": ["There is a boy with a bat trying to hit a ball in the air.", "He is trying to hit the ball with the bat.", "He is holding a bat and ready to hit the ball."], "image": "train2014/COCO_train2014_000000348636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528117, "question_id": "5FgECdSYcESN2ySn7QVTyQ", "question": "Where did 19 just step away from?", "choices": ["home base", "bunker", "bunk house", "pitchers mound"], "correct_choice_idx": 0, "direct_answers": ["home plate", "home base", "home base", "base", "home plate", "home plate", "dugout", "plate", "home", "base"], "difficult_direct_answer": false, "rationales": ["19 is walking away from home plate.", "He was up to bat then hit the ball and ran the bases. he just came in and is done with his turn.", "They are high-fiving each other which means number nineteen likely just scored a point by rounding all the bases."], "image": "train2014/COCO_train2014_000000528117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399540, "question_id": "5FjkxeQskjdsJVQ3uM2guy", "question": "What is the school bus doing?", "choices": ["going", "stopping", "unloading students", "being parked"], "correct_choice_idx": 2, "direct_answers": ["stopping", "stopping", "driving", "driving", "transporting students", "stopping", "driving", "transporting kids", "unloading students", "stopping"], "difficult_direct_answer": false, "rationales": ["The school bus is parked on the side of the road so it can let students out.", "The bus is in the street, more towards the center line than the curb. the bus is not near a school.", "The bus has its red stop arm extended showing that cars must stop because passengers will be exiting."], "image": "train2014/COCO_train2014_000000399540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399012, "question_id": "5Fm4wdqBoJJmjHhA6nLmbv", "question": "What type of refreshment are the donuts being served with on the bench?", "choices": ["tea", "coffee", "juice", "dairy"], "correct_choice_idx": 0, "direct_answers": ["tea", "tea", "drink", "tea", "tea", "cake", "tea", "juice", "tea", "tea"], "difficult_direct_answer": false, "rationales": ["The doughnuts are being served with tea.", "These are traditional tea dishes to drink from", "There are cups of tea sitting next to the donuts on the bench."], "image": "val2014/COCO_val2014_000000399012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312790, "question_id": "5GTT8KTFnVq78JDhkYoSf8", "question": "What might the buoys near the boats be used as?", "choices": ["lights", "anchors", "beach balls", "diving boards"], "correct_choice_idx": 1, "direct_answers": ["anchor", "marking", "fishing net", "anchors", "guide", "anchors", "anchors", "markers", "protectors", "anchors"], "difficult_direct_answer": false, "rationales": ["The buoys keep the boats anchored down.", "They might be used to stop the vehicle in the area.", "The buoys are held by anchors."], "image": "train2014/COCO_train2014_000000312790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306936, "question_id": "5GsVM4xWxwH8iYurWyAPif", "question": "Why are there yellow strips on the men's vests?", "choices": ["dress code", "camouflage", "fashion", "visibility"], "correct_choice_idx": 3, "direct_answers": ["safety reflection", "safety stripes", "reflectors", "safety", "safety visibility", "airplane workers", "visibility", "workers", "high visibility", "reflective tape"], "difficult_direct_answer": true, "rationales": ["The men want to be seen.", "The man is wearing a safety vest.", "The men have yellow stripes on their vests that make it easy to see them in the dark"], "image": "val2014/COCO_val2014_000000306936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305224, "question_id": "5HXQK4aq6jUPZEVSra8gYg", "question": "What type of seat is she using?", "choices": ["recliner", "bean bag", "sofa", "desk chair"], "correct_choice_idx": 1, "direct_answers": ["bean bag", "bean bag", "inflatable chair", "bean bag", "pillow", "beanbag", "beanbag", "beanbag chair", "bean bag", "beanbag"], "difficult_direct_answer": false, "rationales": ["The seat is a bean bag.", "The girl is on a squishy seat.", "A girls is sitting in a large round chair that is soft."], "image": "train2014/COCO_train2014_000000305224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321886, "question_id": "5HkofQvXtKJsZiYNP3tpB7", "question": "What are the cyclists doing?", "choices": ["practicing", "racing", "sightseeing", "commuting"], "correct_choice_idx": 3, "direct_answers": ["commuting", "commuting", "commuting", "riding", "riding", "commuting", "commuting", "riding", "traveling", "riding"], "difficult_direct_answer": false, "rationales": ["The people are riding bicycles on a road in order to get to their destination.", "They appear to be traveling during rush hour traffic.", "The cyclists are riding on a city road to get to and from work."], "image": "val2014/COCO_val2014_000000321886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64146, "question_id": "5HrvQ272JVKJLDZhWcaFoK", "question": "The man wears a hat made of what?", "choices": ["lemons", "margarine", "tea leaves", "bananas"], "correct_choice_idx": 3, "direct_answers": ["bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas"], "difficult_direct_answer": false, "rationales": ["The man's hat is made of bananas.", "There are bananas on the man's head.", "The hat is made of yellow fruits that are very long."], "image": "val2014/COCO_val2014_000000064146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437049, "question_id": "5HxTGmtFyk3BEJbRfQZjhh", "question": "Umbrellas provide what here?", "choices": ["shade", "color", "advertising", "rain cover"], "correct_choice_idx": 0, "direct_answers": ["shade", "shade", "shade", "shade", "shade", "shade", "shade", "shade", "shade", "shade"], "difficult_direct_answer": false, "rationales": ["It is a sunny day with no rain", "Umbrellas offer shade from excessive sunlight and rainfall.", "The umbrellas provide protection from the sun."], "image": "val2014/COCO_val2014_000000437049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92813, "question_id": "5J37XookodNnUri9fsdTdL", "question": "If all humans left this scene exactly as is what would likely approach it first?", "choices": ["fish", "elephants", "bugs", "cars"], "correct_choice_idx": 2, "direct_answers": ["bird", "ants", "ants", "ants", "ants", "bugs", "ants", "ants", "ants", "ants"], "difficult_direct_answer": false, "rationales": ["On a summer day in the park, nothing attracts hungry insects like sugary items. this large target is the perfect feast for ants, flies and so forth!", "If the food is on the ground too long it will attract ants.", "The scene is not necessarily in asia or africa, is not near a road, and is not near water."], "image": "train2014/COCO_train2014_000000092813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267861, "question_id": "5JFinSUU5VbsNwLAdPQ82v", "question": "What does the vehicle look like?", "choices": ["motorcycle", "tank", "boat", "car"], "correct_choice_idx": 0, "direct_answers": ["tricycle", "tricycle", "tricycle", "auto", "tricycle", "motorcycle", "tricycle", "motorcycle", "bike", "tricycle"], "difficult_direct_answer": false, "rationales": ["A vehicle is open and has two tires in the back and one in the front. motorcycles are not enclosed.", "The vehicle is a three-wheel motorcycle.", "The vehicle is a motorbike."], "image": "val2014/COCO_val2014_000000267861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318444, "question_id": "5JTjFw2hXXoq7uWsm9U9tV", "question": "What type of region is the man visiting?", "choices": ["desert", "mountain", "tropical", "aquatic"], "correct_choice_idx": 1, "direct_answers": ["mountain", "snowy", "mountains", "ski", "mountains", "mountain", "mountain", "mountain", "mountain", "snowy alps"], "difficult_direct_answer": false, "rationales": ["The man is skiing downhill from the top of a snowy hill.", "The region has mountains.", "The skier is on top of a high snowy mountainous area and is headed down."], "image": "train2014/COCO_train2014_000000318444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79975, "question_id": "5JU4hXqK9PqNK95KwxAUea", "question": "In the event of the boarder losing their balance what will protect their cranium?", "choices": ["shirt", "helmet", "knee pads", "wrist guards"], "correct_choice_idx": 1, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["The item is a padded and protective hat to protect their head.", "A helmet on their head will protect their head from falls.", "A helmet is used to protect your head from falls."], "image": "val2014/COCO_val2014_000000079975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27371, "question_id": "5JZ5QBxwx566qzE3XAM9xZ", "question": "Where might the lady on the sidewalk be going?", "choices": ["vacation", "protest", "work", "sales job"], "correct_choice_idx": 0, "direct_answers": ["hotel", "airport", "abroad", "train", "vacation", "vacation", "going home", "hotel", "airport", "traveling"], "difficult_direct_answer": false, "rationales": ["She is likely going on vacation judging by the casual style of her dress and luggage.", "She is pulling a suitcase so she probably is on vacation.", "She has a suitcase with her which is not usually brought to work, jobs, or protests."], "image": "val2014/COCO_val2014_000000027371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252091, "question_id": "5JiW4T8bhhccNvB6EhX3oZ", "question": "Where did this lady get her lunch?", "choices": ["home", "mall", "restaurant", "food truck"], "correct_choice_idx": 3, "direct_answers": ["food truck", "holy grillca", "food truck", "outside hotel", "food truck", "truck", "behind her", "food truck", "truck", "hotel"], "difficult_direct_answer": false, "rationales": ["The woman went to a food truck.", "There is a food truck behind her.", "The lady got her lunch from the food truck located behind her and she is holding food in a paper plate"], "image": "val2014/COCO_val2014_000000252091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73634, "question_id": "5JvaMwCL8ufWN6xmWZNY7B", "question": "What will the silver paddles sticking out of the red button do once in the air?", "choices": ["spin", "change color", "detach", "join together"], "correct_choice_idx": 0, "direct_answers": ["rotate", "spin", "spin", "spin", "turn", "twirl", "power engine", "spin", "propel", "turn"], "difficult_direct_answer": false, "rationales": ["The silver paddles will spin in order for the plane to fly", "The other options don't remotely fit this scene. it's a propellor.", "The propeller will turn quickly and keep the plan in the air."], "image": "val2014/COCO_val2014_000000073634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298726, "question_id": "5K63VrUDrXiRVTMzRh477b", "question": "What do they have to hit balls over?", "choices": ["water", "plate", "net", "sand"], "correct_choice_idx": 2, "direct_answers": ["net", "rackets", "net", "net", "net", "net", "net", "net", "net", "rackets"], "difficult_direct_answer": false, "rationales": ["There is a net between the two teams.", "They hit the balls over the net.", "They are playing tennis. they have to hit the balls over the object located at the middle of the court."], "image": "val2014/COCO_val2014_000000298726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21711, "question_id": "5KD6zUZKjC7KXQn5cJgabo", "question": "Why are the motorbike riders wearing helmets?", "choices": ["style", "halloween", "protection", "visibility"], "correct_choice_idx": 2, "direct_answers": ["safety", "safety", "safety", "protection", "safety", "rally runthrough", "save travelling", "for protection", "safety", "protection"], "difficult_direct_answer": false, "rationales": ["A motorcycle accident is very dangerous, and helmets are worn for safety reasons to absorb the shock in case of impact.", "The motorbike riders are wearing helmets to protect their heads.", "The helmets provide protection for the riders' heads in case of a fall or accident."], "image": "val2014/COCO_val2014_000000021711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539891, "question_id": "5KEFKHVk9yT83NAo2dxxjq", "question": "What color shirt does the person who threw the frisbee wear here?", "choices": ["black", "none", "blue", "green"], "correct_choice_idx": 0, "direct_answers": ["blue", "black", "blue", "blue", "blue", "blue", "blue", "blue", "turquoise", "black"], "difficult_direct_answer": false, "rationales": ["The boy has his arm stretched out to the right which would be in line with throwing a frisbee.", "They have on a dark blue shirt.", "He is wearing a blue shirt."], "image": "train2014/COCO_train2014_000000539891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231835, "question_id": "5KPmnyhT7y5bNWmLS8Q67C", "question": "What type hat does the owner of this boat prefer appropriately?", "choices": ["none", "baseball", "floater", "boater"], "correct_choice_idx": 3, "direct_answers": ["straw", "boat", "boater", "straw", "unknown", "fedora", "brimmed", "straw", "fedora", "boater"], "difficult_direct_answer": false, "rationales": ["It stands to reason that the owner of a boat would like boater hats even though i have never heard the term.", "This boat has a boater hat sitting on the floor.", "Usually made of straw, this hat has an even flat brim."], "image": "train2014/COCO_train2014_000000231835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342757, "question_id": "5KeCjm8Xkeed5c4fafDZaA", "question": "What is the boat using to be seen better?", "choices": ["bright paint", "light", "large mast", "horn"], "correct_choice_idx": 1, "direct_answers": ["lights", "light", "light", "lights", "light", "sail", "light", "lights", "lights", "lights"], "difficult_direct_answer": false, "rationales": ["Several lights are on it to show other boaters it's there when it gets dark.", "The boat has light.", "There is a light on the boat."], "image": "train2014/COCO_train2014_000000342757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469729, "question_id": "5KkV3bfgmQzASBBddySskm", "question": "Why is he smiling?", "choices": ["is surprised", "for camera", "won money", "is friendly"], "correct_choice_idx": 1, "direct_answers": ["good mood", "picture taken", "happy", "happy", "picture", "for camera", "happy", "getting married", "looks nice", "likes cameraman"], "difficult_direct_answer": false, "rationales": ["The man is posing.", "He is posing for a picture.", "The man is smiling directly at the camera."], "image": "train2014/COCO_train2014_000000469729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360595, "question_id": "5KoCcuwMSNcSQEYA9SfwTv", "question": "Who is allowed to share and use this space?", "choices": ["members only", "anyone", "police only", "wealthy"], "correct_choice_idx": 1, "direct_answers": ["general public", "park goers", "public", "skateboarders", "everyone", "anyone", "residents", "people", "anyone", "public"], "difficult_direct_answer": false, "rationales": ["This space is meant for the public.", "This is a public park area with a lot of benches and wide walk ways for many people", "They are at a park."], "image": "val2014/COCO_val2014_000000360595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407820, "question_id": "5Kod4qDcgmJV7tLN5FDdYW", "question": "What is under the red and white sign?", "choices": ["boy", "polar bear", "seal", "cat"], "correct_choice_idx": 0, "direct_answers": ["people queing", "people", "human", "boy", "warning", "pedestrian signal", "pedestrians", "street sign", "people", "people"], "difficult_direct_answer": false, "rationales": ["A child is standing under a stop sign on a sidewalk.", "A little boy is standing up beneath the sign.", "There are no non-human animals under the do not enter sign."], "image": "train2014/COCO_train2014_000000407820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110381, "question_id": "5Krv5vRwDHT47Kaxt8pRS3", "question": "What is the small white tank most likely filled with?", "choices": ["tea", "electricity", "propane", "water"], "correct_choice_idx": 2, "direct_answers": ["propane", "propane", "propane", "gas", "propane", "gas", "propane", "gas", "gas", "propane"], "difficult_direct_answer": false, "rationales": ["The small white tank holds gas.", "The tank is propane.", "A white receptacle typical of propane used for ovens and grills sits by a stove."], "image": "train2014/COCO_train2014_000000110381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29875, "question_id": "5KuKkjdXnccQJDgZ44W5jG", "question": "What is the metal item used as a handbag?", "choices": ["peppermill", "sifter", "blender", "cheese grater"], "correct_choice_idx": 3, "direct_answers": ["grater", "cheese grater", "cheese grater", "grater", "cheese grater", "grater", "grater", "cheese grater", "cheese grater", "grater"], "difficult_direct_answer": false, "rationales": ["A grater to chop up cheese.", "There is a metal box that has holes in it. people put food in it to shred it into smaller pieces.", "The metal helps grate soft foods."], "image": "train2014/COCO_train2014_000000029875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431922, "question_id": "5KvSojUUvzLypv8tK69iX4", "question": "When during the day is this laptop being used?", "choices": ["morning", "night", "evening", "noon"], "correct_choice_idx": 0, "direct_answers": ["morning", "breakfast", "morning", "morning", "unknown", "morning", "morning", "night", "breakfast", "morning"], "difficult_direct_answer": false, "rationales": ["The person is eating a breakfast cereal as they work which means this is the first meal of the day.", "There is cereal which people usually eat during the morning.", "Looks like the peroson is eating cereal which most people eat for breakfast."], "image": "train2014/COCO_train2014_000000431922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484332, "question_id": "5Lo6SWa2faBwqdri9BHiYV", "question": "What is the name of this appliance?", "choices": ["refrigerator", "blender", "freezer", "oven"], "correct_choice_idx": 3, "direct_answers": ["oven", "stove", "stove oven", "stove", "oven", "stove", "stove", "oven", "stove", "stove"], "difficult_direct_answer": false, "rationales": ["It is the only one of the 4 options which generates heat.", "A stove with doors below the burners is in a kitchen.", "An oven is an appliance often found in a kitchen with a stove and burners on top."], "image": "train2014/COCO_train2014_000000484332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400380, "question_id": "5LqL4odyYkA2HSmivVvExj", "question": "What type of location is this?", "choices": ["country", "city", "desert", "suburbs"], "correct_choice_idx": 1, "direct_answers": ["urban", "city", "city", "city", "city street", "city", "urban", "city", "city street", "city"], "difficult_direct_answer": false, "rationales": ["The place is a city.", "The location must be urban since taxis are around.", "There are buildings, taxis, and public transit buses."], "image": "train2014/COCO_train2014_000000400380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462878, "question_id": "5LwoE3gAKocnZ8s5m4GATQ", "question": "Who is most likely the youngest?", "choices": ["yellow outfit", "pink outfit", "blue outfit", "black outfit"], "correct_choice_idx": 1, "direct_answers": ["girl", "pink", "pink outfit", "pink girl", "purple outfit", "blue coat", "pink girl", "pink", "pink", "front"], "difficult_direct_answer": false, "rationales": ["The person in the pink outfit is short and probably younger than everyone else.", "This person is the smallest and holding hands with someone else.", "They are the smallest person, and someone is holding their hand indicating that they need to most help and are newest to the sport."], "image": "train2014/COCO_train2014_000000462878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551144, "question_id": "5MCnszFWwic58DZVTmwuQQ", "question": "Which season game it is?", "choices": ["autumn", "spring", "winter", "summer"], "correct_choice_idx": 3, "direct_answers": ["fall", "summer", "summer", "summer", "summer", "summer", "sixth", "skateboarding", "skateboarding", "winter"], "difficult_direct_answer": false, "rationales": ["This is good for the summer when there is no snow or leaves on the ground.", "The game is in the summer.", "Skateboarding is typically done in warmer months."], "image": "train2014/COCO_train2014_000000551144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50553, "question_id": "5MDgAtzJVVp4hzeMZdCsvn", "question": "What is the man wearing?", "choices": ["boa", "crown", "armor", "tie"], "correct_choice_idx": 3, "direct_answers": ["suit", "tie", "suit tie", "tie", "tie", "tie", "tie", "suit", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["The man has a red tie around his neck.", "It is red and around his neck and hanging down the front of hi shirt", "The man on the chair is wearing a red necktie."], "image": "train2014/COCO_train2014_000000050553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542180, "question_id": "5MH5w75QWptWNEqvbcYLwq", "question": "The woman on the phone has what on her foot?", "choices": ["seaweed", "dirt", "eel", "flour"], "correct_choice_idx": 1, "direct_answers": ["dirt", "dirt", "dirt", "dirt", "dirt", "dirt", "dirt", "dirt", "dirt", "dirt"], "difficult_direct_answer": false, "rationales": ["There is dirt on her foot.", "You can see the bottom of her foot because she is barefoot. it is black.", "Her foot has a black, not white, substance on it."], "image": "train2014/COCO_train2014_000000542180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81768, "question_id": "5MJNtPizGcnKYrbYFjFJqw", "question": "What kind of sportswear can you buy on the right side of the street?", "choices": ["mouthpieces", "shoes", "belts", "kneepads"], "correct_choice_idx": 1, "direct_answers": ["shoes", "shoes", "shoes", "bag", "athletic wear", "shoes", "shoes", "sports equipment", "shoes", "shoes"], "difficult_direct_answer": false, "rationales": ["Foot locker is known to sell foot wear for your feet.", "The foot locker store specializes in shoes and footwear for sports.", "The store is foot locker, which sells footwear."], "image": "train2014/COCO_train2014_000000081768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527374, "question_id": "5MjwqssawhLHQKsyFGVzPa", "question": "What gave the cheese that consistency?", "choices": ["starch", "cold", "salt", "heat"], "correct_choice_idx": 3, "direct_answers": ["melted", "heat", "heat", "heat", "melted", "melted", "melted", "cream", "cream", "heat"], "difficult_direct_answer": false, "rationales": ["The cheese topping of this dish shows signs of being melted over it.", "When cheese is in this state it is said to be melted. for cheese to move from its normal state to a melted state heat will have been added.", "Cheese is melted on top of fries. heat melts cheese."], "image": "val2014/COCO_val2014_000000527374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519673, "question_id": "5N775wi4352ypjN9AefYsY", "question": "What are the first two numbers on the train?", "choices": ["32", "92", "88", "65"], "correct_choice_idx": 1, "direct_answers": ["92", "nine two", "nine two", "nine two", "92", "ninety two", "nine two", "92", "ninety-two", "92"], "difficult_direct_answer": false, "rationales": ["The train has a call number.", "The first two numbers of the sequence are 92.", "The first two numbers are clearly visible in white letters on the black train."], "image": "val2014/COCO_val2014_000000519673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416795, "question_id": "5N7wg9t3re7pztDFmRXJbS", "question": "What is the pizza on?", "choices": ["floor", "paper plate", "tray", "fine china"], "correct_choice_idx": 1, "direct_answers": ["plate", "paper plate", "paper plate", "plate", "cheese", "cheese", "cheese", "paper plate", "plate", "cheese"], "difficult_direct_answer": false, "rationales": ["The pizza is on a thin disposable plate.", "A slice of pizza is on a white, thing plate.", "The pizza is placed on a paper plate."], "image": "val2014/COCO_val2014_000000416795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176702, "question_id": "5N8gdgGzvhzGuaG5vobKa3", "question": "What are the black circular pieces on the front of the train?", "choices": ["frisbees", "bumpers", "guns", "holders"], "correct_choice_idx": 1, "direct_answers": ["bumpers", "clear", "stops", "metal", "car attachment", "brakes", "smoke", "attackers", "brake", "pilot"], "difficult_direct_answer": true, "rationales": ["Round objects are on the front of a vehicle. bumpers are used on the front of vehicles.", "Bumpers so they don't hit anything.", "The bumpers help prevent collisions and mitigate disasters."], "image": "train2014/COCO_train2014_000000176702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231163, "question_id": "5NVR8LERi8GrF3DWeCN5s5", "question": "Who is sheltered here?", "choices": ["no one", "train riders", "bus riders", "mall walkers"], "correct_choice_idx": 1, "direct_answers": ["passenger", "train passengers", "person", "commuters", "many people", "passengers", "train riders", "train passengers", "train riders", "passengers"], "difficult_direct_answer": false, "rationales": ["Most train stations have a roof so the riders won't get wet while waiting.", "The cover is located near railroad tracks and people are standing under it.", "This is a waiting area and there are train tracks next to it."], "image": "val2014/COCO_val2014_000000231163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216676, "question_id": "5NgdUhemLptNMHSvCfZPbj", "question": "What is the man wearing on his head?", "choices": ["helmet", "hat", "glasses", "hood"], "correct_choice_idx": 3, "direct_answers": ["hood", "gas mask", "hood", "breathing mask", "ventilation mask", "mask", "wood", "hoodie", "hood", "hood"], "difficult_direct_answer": false, "rationales": ["This is worn to prevent inhalation of small particles that are present during sanding and object.", "The man is wearing a hooded sweater.", "His head is covered."], "image": "train2014/COCO_train2014_000000216676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210989, "question_id": "5NqR4ToNYJEVYNNue4Zp35", "question": "What purpose does the large round item held by the man in yellow serve?", "choices": ["sound making", "counting mechanism", "visual trickery", "fishing pole"], "correct_choice_idx": 0, "direct_answers": ["horn", "loud speaker", "music", "noise", "megaphone", "make noise", "sound", "horn", "sound making", "makes sounds"], "difficult_direct_answer": true, "rationales": ["The man appears to be holding some sort of horn.", "The man is holding a horn.", "It is a long round tube that can amplify noise. he is holding it to his mouth."], "image": "train2014/COCO_train2014_000000210989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313088, "question_id": "5Nv83VhuBdzcPnZUwM5QLu", "question": "What position is this player playing?", "choices": ["pitcher", "outfielder", "catcher", "1st base"], "correct_choice_idx": 1, "direct_answers": ["catch position", "outfielder", "catching", "outfield", "outfield", "outfield", "catch", "outfield", "outfield", "outfielder"], "difficult_direct_answer": false, "rationales": ["The outfielder is always playing on the side of the field.", "The player is about to catch a baseball at the end of the field.", "A baseball player is in the grassy part of a baseball diamond. outfielders stand in the grassy part of the field."], "image": "train2014/COCO_train2014_000000313088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578334, "question_id": "5NweDwFFi5bxfWtRy8gsZR", "question": "Where is this location?", "choices": ["starting line", "park", "riverside", "finishing line"], "correct_choice_idx": 0, "direct_answers": ["starting line", "racetrack", "kentucky derby", "racetracks", "racetrack", "racetrack", "horse track", "racetrack", "horse track", "horserace"], "difficult_direct_answer": false, "rationales": ["The horses and riders have just gone through the gate and started the race.", "The picture was taken at the horse track. the numbered gates right behind the horses is where the horses stay until they are released signaling the beginning of the race.", "It appears they just left the starting position as the doors are opened."], "image": "train2014/COCO_train2014_000000578334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37864, "question_id": "5PJ2nE3diMYh5JtrHcX7Jn", "question": "What is he doing with the horse?", "choices": ["riding it", "stealing it", "feeding it", "mounting it"], "correct_choice_idx": 0, "direct_answers": ["riding horse", "riding", "riding", "riding", "riding", "riding it", "riding", "riding", "riding", "riding"], "difficult_direct_answer": false, "rationales": ["The man is mounted on the horse.", "He is in the road riding it to somewhere.", "A man on his horse is clopping over the cobblestone road as they go through their little town."], "image": "train2014/COCO_train2014_000000037864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128500, "question_id": "5PNTrws7WZXiQKMCwVcNzy", "question": "This sign for the airport is most likely in which country?", "choices": ["france", "switzerland", "germany", "norway"], "correct_choice_idx": 2, "direct_answers": ["germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["The schonefeld airport is located just outside of berlin.", "The sign is for germany.", "These two cities are in this country."], "image": "train2014/COCO_train2014_000000128500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45926, "question_id": "5PXSQxxMwpiav6EsMF9REf", "question": "What mode of transportation would probably be more comfortable for the travelers?", "choices": ["skateboard", "cattle truck", "bus", "tank"], "correct_choice_idx": 2, "direct_answers": ["jeep", "bus", "bus", "bus", "bus", "car", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["You have your own individual seat on a bus.", "This vehicle is not really intended to carry people, and a more enclosed vehicle would more comfortable seating would be better.", "This image has passengers in a converted truck as though they were in a bus. we can assume an actual bus would be more accomodating."], "image": "train2014/COCO_train2014_000000045926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469085, "question_id": "5PYRqofJrocDgwD95Hh6F6", "question": "What activity is taking place in this venue?", "choices": ["hiking", "fishing", "mountain climbing", "camping"], "correct_choice_idx": 3, "direct_answers": ["brushing", "camping", "camping", "brushing teeth", "camping", "camping", "walking", "teeth brushing", "hiking", "brushing teeth"], "difficult_direct_answer": false, "rationales": ["The clothes seem to indicate a. that said, it could also be b or d.", "They appear to be camping in the wild", "They have toothbrushes which means that they slept there overnight which one would do when camping."], "image": "val2014/COCO_val2014_000000469085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298591, "question_id": "5PnCqXHUxBd8WN8qXiJkfP", "question": "What keeps the elephants out of the buildings?", "choices": ["water", "herders", "noise", "rocks"], "correct_choice_idx": 3, "direct_answers": ["fence", "rocks", "high wall", "cliffside", "rocks", "rock wall", "rock wall", "rocks", "high rocks", "rocks"], "difficult_direct_answer": false, "rationales": ["There is a visible rock barrier around the front of the buildings. a rock face this steep would not be possible for an elephant to climb.", "It would be difficult for elephants to climb these because they are so large and heavy.", "The elevated stone structure separating where the elephants are and the buildings would be difficult for them to traverse up."], "image": "train2014/COCO_train2014_000000298591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469728, "question_id": "5QYBNzYJeWkp3qWnNsHdQE", "question": "Why is the bike attached to the rail?", "choices": ["stay upright", "easily spotted", "prevent theft", "to sell"], "correct_choice_idx": 2, "direct_answers": ["thieves", "locked", "security", "security", "security", "prevent theft", "theft prevention", "security", "prevent theft", "locked"], "difficult_direct_answer": false, "rationales": ["The owner of the bike doesn't want it to get stolen and attaching it to the rail makes it harder to steal.", "The bike is preventing theft.", "The owner doesn't want it stolen."], "image": "train2014/COCO_train2014_000000469728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201903, "question_id": "5Qm2TZbQZSjd4LyTS6qgCQ", "question": "What venue is this scene?", "choices": ["expressway", "highway", "front yard", "parking lot"], "correct_choice_idx": 3, "direct_answers": ["countryside", "beach", "parking lot", "beach", "roadside", "lot", "empty", "parking lot", "dirt lot", "near road"], "difficult_direct_answer": false, "rationales": ["The trucks and cars are parked.", "The image has parked cars sitting on an empty lot.", "There are vehicles parked on a dirt rectangle between a road and a free-standing bathroom."], "image": "train2014/COCO_train2014_000000201903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284355, "question_id": "5RBTHH7RPUCJh7h5zA3P9F", "question": "What are the men on horses dressed as?", "choices": ["cowboys", "indians", "police", "ghosts"], "correct_choice_idx": 0, "direct_answers": ["cowboys", "cowboys", "outlaws", "cowboys", "cowboys", "outlaws", "outlaws", "cowboys", "cowboys", "cowboys"], "difficult_direct_answer": false, "rationales": ["They are dressed up like cowboys.", "They are doing a western act for the folks on the train.", "The men are wearing cowboy hats."], "image": "train2014/COCO_train2014_000000284355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28589, "question_id": "5RDygZ5LcthTMYGuUFXS3K", "question": "What usually goes inside of the item with the lid?", "choices": ["human waste", "towels", "cows", "mcdonald's"], "correct_choice_idx": 0, "direct_answers": ["waste", "human waste", "human waste", "human waste", "sewage", "feces urine", "water", "waste", "human waste", "water"], "difficult_direct_answer": false, "rationales": ["This is a toilet that people use to go to the bathroom in.", "Everyone is taught the purpose of a toilet at a very young age. generally nothing goes in a toilet bowl besides human waste.", "This is an object always found in a bathroom and used by humans."], "image": "train2014/COCO_train2014_000000028589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43279, "question_id": "5REX9RsHEkGKKcuAn5rEZp", "question": "Why is the horse there?", "choices": ["is lost", "pull cart", "for sale", "giving rides"], "correct_choice_idx": 1, "direct_answers": ["pull cart", "parked", "pulling buggy", "transportation", "pulling cart", "standing", "waiting", "pulling cart", "pulling cart", "pulling cart"], "difficult_direct_answer": false, "rationales": ["The horse is wearing reigns which are attached to a cart, and this is a well known and old fashioned mode of transport which can still be used today.", "The picture depicts the horse pulling a cart.", "The horse is here to pull a cart."], "image": "train2014/COCO_train2014_000000043279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495454, "question_id": "5RMwxMtae8b2h8a5WMXk9z", "question": "What allows the man to drag his luggage on the ground without getting damaged?", "choices": ["grass", "dirt", "ice", "snow"], "correct_choice_idx": 3, "direct_answers": ["snow", "plastic cover", "smooth surface", "suitcase", "ice", "snow", "snow", "snow", "snow", "snow surface"], "difficult_direct_answer": false, "rationales": ["The snow allows the man to drag it.", "The snow keeps the luggage gliding along.", "It is slippery and doesn't cause a lot of friction"], "image": "train2014/COCO_train2014_000000495454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14321, "question_id": "5RN55Ke9wAgMS5hNzfYcGp", "question": "What is to the left of the motorcycles?", "choices": ["dog", "boat", "cat", "person"], "correct_choice_idx": 3, "direct_answers": ["person", "people", "man", "people", "human", "person", "men", "people", "officer", "person"], "difficult_direct_answer": false, "rationales": ["A man is standing near the motorcycles.", "A man is standing near the motorcycles.", "The person is to the left."], "image": "val2014/COCO_val2014_000000014321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157037, "question_id": "5RS7WxUEQumvoHJH46RwoG", "question": "What is this type of building called?", "choices": ["library", "mall", "supermarket", "deli"], "correct_choice_idx": 1, "direct_answers": ["shopping mall", "mall", "shopping mall", "shopping mall", "shopping mall", "mall", "shopping mall", "mall", "mall", "mall"], "difficult_direct_answer": false, "rationales": ["There are many stores.", "A mall has many stores in one building, as shown.", "That is recognizable as a large indoor shopping center with individual stores on multiple levels."], "image": "train2014/COCO_train2014_000000157037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408253, "question_id": "5RzFeXXCQHke3nuoP8qvSa", "question": "What is the old woman doing?", "choices": ["itching", "using phone", "laughing", "massaging"], "correct_choice_idx": 1, "direct_answers": ["walking", "phone", "walking", "walking", "walking", "using phone", "phone", "walking", "phone", "walking"], "difficult_direct_answer": false, "rationales": ["She is holding it to her ear.", "She is holding a phone next to her face so she must be using it to communicate with someone.", "She has her hand up to her face and ear which is how people hold their phones to talk"], "image": "val2014/COCO_val2014_000000408253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20517, "question_id": "5S4LjyQAvRXJ3ijSVcmVK6", "question": "What time is it in this location local time?", "choices": ["10 pm", "710 am", "710 pm", "1 am"], "correct_choice_idx": 2, "direct_answers": ["night", "711 pm", "710 pm", "seven", "eight fifteen", "seven eleven", "710", "two forty", "night time", "seven"], "difficult_direct_answer": true, "rationales": ["The clock is pointed to 7 10 and it is night time.", "It is definitely pm given the darkness and the hour hand is pointing towards the 7.", "The time is at night."], "image": "train2014/COCO_train2014_000000020517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16964, "question_id": "5SSRuuwHCftxFV6Rx48AyF", "question": "What type of area is shown?", "choices": ["country", "city", "forest", "mountains"], "correct_choice_idx": 1, "direct_answers": ["station", "city", "industrial", "train station", "train station", "urban", "city", "train tracks", "train tracks", "city"], "difficult_direct_answer": false, "rationales": ["There are several buildings and trains representing a metropolitan area.", "An urban area with a train and a lot of buildings can be seen.", "An urban area with many buildings can be seen."], "image": "train2014/COCO_train2014_000000016964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222842, "question_id": "5SXnjvKjJjgzPbr6o8RWv4", "question": "How do you say the name of the item on the plate next to the cake in Italian?", "choices": ["cucchiaio", "grazie", "spoon", "forchetta"], "correct_choice_idx": 3, "direct_answers": ["forchetta", "tiramisu", "fork", "invalid question", "forchetta", "forchetta", "gelato", "cock tail", "tiramisu", "forchetta"], "difficult_direct_answer": false, "rationales": ["The italian word for fork is forchetta.", "Forchetta is the italian word for fork.", "The name is forchetta."], "image": "val2014/COCO_val2014_000000222842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48432, "question_id": "5SeYLjqbAW2QCtekbb237d", "question": "What sport is the stick used for?", "choices": ["hurling", "golf", "lacrosse", "hockey"], "correct_choice_idx": 2, "direct_answers": ["lacrosse", "lacrosse", "lacrosse", "rugby", "atanding", "bag", "lacrosse", "hockey", "lacrosse", "lacrosse"], "difficult_direct_answer": false, "rationales": ["Only one of the options is a sport that has a stick with netting on the end that is used to catch the ball.", "The net on it and the shape are the indicators.", "My grandson plays lacrosse and that is what the sticks look like."], "image": "train2014/COCO_train2014_000000048432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510515, "question_id": "5SgpgEVUbkLezoK9hfZyoh", "question": "What type of statue is to the right front of the bench?", "choices": ["wooden", "jade", "bust", "kinetic"], "correct_choice_idx": 2, "direct_answers": ["bust", "portrait bust", "bust", "bust", "bust", "stone head", "bust", "head", "bust", "bust"], "difficult_direct_answer": false, "rationales": ["Only the head of the statue is seen.", "It is a cement rendering of a person's head.", "The statue on the right of the bench is called a bust and consists of just a face and neck"], "image": "val2014/COCO_val2014_000000510515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151648, "question_id": "5SnyCDayVEEqz886jPRszH", "question": "What are the animals eating?", "choices": ["grass", "dirt", "hay", "food"], "correct_choice_idx": 0, "direct_answers": ["grass", "hay", "hay", "hay", "grass", "grass", "hay", "hay", "hay", "hay"], "difficult_direct_answer": false, "rationales": ["There is green grass hanging out of their mouths.", "Though cows eat grass this picture shows them eating hay mixed with grass.", "The cows are eating grass."], "image": "train2014/COCO_train2014_000000151648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455536, "question_id": "5SpjgFMUWib77iDFHmQMS7", "question": "What is the area the man is skating in usually called?", "choices": ["arena", "bowl", "deck", "zoo"], "correct_choice_idx": 1, "direct_answers": ["railing", "skatepark", "skatepark", "bowl", "skate park", "skateboard park", "bowl", "pool", "bowl", "bowl"], "difficult_direct_answer": false, "rationales": ["Empty pools and purpose-built skateparks are called bowls for their shape.", "A skatepark has a bowl to do tricks in.", "The structure this skater performs on is known as a half pipe. a half pipe can be said to have a bowl shaped structure."], "image": "train2014/COCO_train2014_000000455536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30815, "question_id": "5SvmVMqiTsYja3NiCAS98H", "question": "What type animal appears to be reading?", "choices": ["live bear", "stuffed bear", "monkey", "gazelle"], "correct_choice_idx": 1, "direct_answers": ["bear", "teddy bear", "bear", "monkey", "stuffed bear", "stuffed", "panda bear", "bear", "teddy bear", "teddy bear"], "difficult_direct_answer": false, "rationales": ["The animal is not alive. it is a teddy.", "The little ears and paws give it away as a cuddly stuffed bear.", "The animal is the stuffed bear."], "image": "train2014/COCO_train2014_000000030815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56452, "question_id": "5TM4AFHhmzRo7TWg3tZ6r3", "question": "What is the name of this sport referred to as in Europe?", "choices": ["soccer", "ballball", "football", "slimball"], "correct_choice_idx": 2, "direct_answers": ["football", "football", "football", "football", "football", "football", "futbol", "football", "football", "football"], "difficult_direct_answer": false, "rationales": ["People are playing soccer which is called football in europe.", "The game is known football as they use feet to play.", "The name is football."], "image": "train2014/COCO_train2014_000000056452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125365, "question_id": "5TUa9N9UXb9TrWJnGXnZZy", "question": "What surrounds the land?", "choices": ["snow", "water", "sand", "fire"], "correct_choice_idx": 1, "direct_answers": ["ocean", "water", "surround water", "ocean", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["There is a blue ocean.", "There is blue water all around the land.", "We can see this land mass has boats sailing on both it's top and bottom sides. boats sail in water."], "image": "train2014/COCO_train2014_000000125365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558153, "question_id": "5Tk42VzmNBDbwijR6d8wVd", "question": "In what environment does the apple tree appear to be located?", "choices": ["farm", "forest", "backyard", "greenhouse"], "correct_choice_idx": 2, "direct_answers": ["backyard", "backyard", "backyard", "outside", "back yard", "yard", "yard", "backyard", "outdoors", "tundra"], "difficult_direct_answer": false, "rationales": ["There are houses. the apple tree is near a fenced-off grassy area.", "There is a house and deck visible in the background, separated by a fence.", "There are houses and fences near the apple tree."], "image": "train2014/COCO_train2014_000000558153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87356, "question_id": "5TxdfQseXYqN3dyrDQThtQ", "question": "How many wheels are visible on the large vehicle?", "choices": ["two", "three", "six", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "four", "four", "two"], "difficult_direct_answer": false, "rationales": ["The bus has two wheels seen on the left side.", "A large bus is in the street, visible from the side.", "There are 2 in the front and 2 in the back"], "image": "val2014/COCO_val2014_000000087356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105751, "question_id": "5TzU4qRWShDDPu48QqFTyw", "question": "What does the man want to do with the ball?", "choices": ["hit it", "kick it", "catch it", "dodge it"], "correct_choice_idx": 0, "direct_answers": ["hit hard", "hit it", "hit it", "return it", "serve", "hit it", "hit ball", "hit", "hit it", "serve ball"], "difficult_direct_answer": false, "rationales": ["The man is swinging his racket to hit the tennis ball over the net.", "He is reaching up either to serve the ball or return his opponent's serve.", "The man is reaching for the ball with a racquet."], "image": "val2014/COCO_val2014_000000105751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537069, "question_id": "5UHTC5c8A8diaVVjfpSes8", "question": "What is the boy ready to do here?", "choices": ["dribble", "dunk", "swing", "catch"], "correct_choice_idx": 2, "direct_answers": ["swing", "hit ball", "bat", "hit ball", "hit balls", "hit ball", "swing", "hit ball", "hit ball", "swing bat"], "difficult_direct_answer": false, "rationales": ["The boy is trying to swing the bat to hit the ball.", "The boy is holding a bat and is about to use it to hit a ball.", "The boy is holding a bat and is prepared to use it to hit the ball."], "image": "val2014/COCO_val2014_000000537069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126833, "question_id": "5Us2dtByCrqmSqWX8pqo3J", "question": "Why might the man be sitting by himself?", "choices": ["he's contagious", "he's sick", "he's antisocial", "he's popular"], "correct_choice_idx": 2, "direct_answers": ["stroll", "taking break", "no friends", "alone", "relaxing", "no friends", "he's antisocial", "waiting", "single", "very shy"], "difficult_direct_answer": true, "rationales": ["The man is sitting alone on the bench because he is not looking to have company with anyone at the moment.", "The man is not sitting with friends.", "He wants to be alone."], "image": "val2014/COCO_val2014_000000126833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201891, "question_id": "5V27zUG2ofyZM6rXFccWeK", "question": "Which one is doing silent work?", "choices": ["none", "middle", "right", "left"], "correct_choice_idx": 1, "direct_answers": ["blue shirt", "back", "middle", "nearest", "centered", "middle", "cameraman", "baldy", "cameraman", "middle man"], "difficult_direct_answer": false, "rationales": ["They are all working together and probably not being silent.", "The man on the right is wearing earbuds. the man on the left is wearing headphones.", "The middle man is doing some silent work on his laptop."], "image": "train2014/COCO_train2014_000000201891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25858, "question_id": "5VFVpzKaXSUgbwUjZbpvDj", "question": "What sound do these animals make?", "choices": ["woof", "whistle", "moo", "meow"], "correct_choice_idx": 2, "direct_answers": ["moo", "moo", "moo", "moo", "moo", "moo", "moo", "moo", "moo", "moo"], "difficult_direct_answer": false, "rationales": ["These are cows and cows do not meow, woof or whistle--everyone knows that cows go \"moo.\".", "Cows make mooing sounds.", "The animals moo."], "image": "train2014/COCO_train2014_000000025858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547744, "question_id": "5VJEiR9knKGfBdZy528JKt", "question": "What region of this country does this bus travel in?", "choices": ["east", "west", "north", "south"], "correct_choice_idx": 1, "direct_answers": ["city", "city", "tourist", "city", "unknown", "west", "bombay", "southern", "boonlay", "bus"], "difficult_direct_answer": false, "rationales": ["The route of a transit bus is listed on the digital sign.", "Due to the palm trees you could safely say it was taken in the south of the country.", "The region is western."], "image": "val2014/COCO_val2014_000000547744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219580, "question_id": "5VS9CTMjFzJKagNE6QWL2N", "question": "Which country is this in?", "choices": ["canada", "france", "united states", "netherlands"], "correct_choice_idx": 3, "direct_answers": ["unknown", "not clear", "germany", "germany", "netherlands", "germany", "unknown", "germany", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["The writing on the sign is in dutch, which is the language spoken in the netherlands.", "The street signs are in the dutch language which is spoken in this country.", "The sign says vergunning houders which is in dutch. dutch is spoken in the netherlands."], "image": "train2014/COCO_train2014_000000219580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428263, "question_id": "5VxjaciskVgQEnR3QgPWNq", "question": "What is the person in the white shirt ready to do?", "choices": ["hit", "duck", "throw", "catch"], "correct_choice_idx": 3, "direct_answers": ["catch", "play frisbee", "catch frisbee", "catch", "catch", "catch", "catch frisbee", "catch frisbee", "catch", "catch"], "difficult_direct_answer": false, "rationales": ["The boy is looking up at the frisbee and has his hands in front of him.", "They are running forward to grab the white frisbee in the air", "The person is ready to catch it."], "image": "val2014/COCO_val2014_000000428263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565869, "question_id": "5WDyXLgNJoo3xvEMwhiAra", "question": "What is in the water?", "choices": ["submarine", "surfboarder", "boat", "seal"], "correct_choice_idx": 1, "direct_answers": ["waves", "surfer", "surfer", "fish", "man", "surfer", "surfer", "surfboarder", "wave", "waves"], "difficult_direct_answer": false, "rationales": ["There is a person on a surfboard in the water.", "A surfboarder is riding the waves.", "There is a man in the water that is standing on a surfboard and riding waves."], "image": "train2014/COCO_train2014_000000565869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157093, "question_id": "5WHYT84h85BNnYaAqBsTVW", "question": "What flavor candy does this player chew here?", "choices": ["bubble gum", "chocolate", "banana", "mint"], "correct_choice_idx": 0, "direct_answers": ["chery", "bubblegum", "gum", "bubblegum", "bubblemint", "bubble gum", "bubble gum", "bubble gum", "bubblegum", "mint"], "difficult_direct_answer": false, "rationales": ["This is the only one that can be chewed for a long time and blown into bubbles.", "Since he is blowing a pink bubble, it's safe to assume that the baseball player is chewing bubble gum-type candy.", "You can tell this is bubble gum because the player is blowing a bubble."], "image": "val2014/COCO_val2014_000000157093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318702, "question_id": "5WHhLic9nLfBMQ9gpr2c4N", "question": "These people are most likely to show up as part of the cast for a sequel to what film?", "choices": ["american beauty", "downton abbey", "blue velvet", "us"], "correct_choice_idx": 3, "direct_answers": ["oh god", "hotel rwanda", "survivor", "congo", "documentary", "us", "black panther", "n/a", "black panther", "king kong"], "difficult_direct_answer": true, "rationales": ["The sequel is us.", "They are of african decent. the movie \"us\" had an african american cast.", "These people look like characters from \"us\"."], "image": "val2014/COCO_val2014_000000318702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456696, "question_id": "5WcAHrZUrS2wV27XR9G3aw", "question": "What type of shoes are visible?", "choices": ["trainers", "plimsolls", "heels", "mules"], "correct_choice_idx": 0, "direct_answers": ["trainers", "sneakers", "tennis shoes", "tennis shoes", "tennis", "tennis", "tennis", "trainers", "athletic shoes", "tennis"], "difficult_direct_answer": false, "rationales": ["These are tennis shoes used for athletic sports.", "Those are what the tennis player is wearing.", "The people are playing tennis. heels, mules, or plimsolls would not be appropriate for this sport."], "image": "train2014/COCO_train2014_000000456696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326936, "question_id": "5WdRLcqGwBtPjMm52ovTT3", "question": "What Item is a human most likely to trip over?", "choices": ["stool", "firewood", "fan", "bleach"], "correct_choice_idx": 1, "direct_answers": ["log", "firewood", "stool", "log", "wood", "chair", "wood", "firewood", "wood", "floor wood"], "difficult_direct_answer": false, "rationales": ["It is scattered on the floor and sticking out of the stove.", "The branches are sticking drastically out at shin level.", "There is firewood scattered all over the ground."], "image": "train2014/COCO_train2014_000000326936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134888, "question_id": "5WvQ5a92i5HCXkKugbvQMD", "question": "What baseball player has the same first name as the name on the player all the way to the left's jersey?", "choices": ["duke snider", "jim kaat", "alabama al", "stanford napoli"], "correct_choice_idx": 0, "direct_answers": ["hank", "chris", "duke snider", "duke", "batter waiting", "duke", "duke snider", "duke snider", "duke snider", "duke snider"], "difficult_direct_answer": false, "rationales": ["Duke snider has the name \"duke.\".", "The men in blue and grey uniforms have 'duke' printed on the front of their shirts. duke snider is the only name of those listed here which matches.", "The person who is on the left is named duke."], "image": "val2014/COCO_val2014_000000134888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264161, "question_id": "5WwroWRDf2vmwPMGAWjZAG", "question": "What is the pink object next to the red book used to do?", "choices": ["comb hair", "brush teeth", "write notes", "clip nails"], "correct_choice_idx": 0, "direct_answers": ["brush", "brush hair", "brush hair", "brush hair", "brush hair", "hair brush", "brush", "comb", "writing", "comb hair"], "difficult_direct_answer": false, "rationales": ["It has plastic bristles on it for detangling and smoothing hair.", "That is used to comb hair.", "The pink object is a brush."], "image": "val2014/COCO_val2014_000000264161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261587, "question_id": "5XAWpTsPJmcm9rtt9hWC2n", "question": "What is the name of the safety garment the wakeboarder is wearing?", "choices": ["shin guards", "helmet", "glasses", "life vest"], "correct_choice_idx": 3, "direct_answers": ["life jacket", "lifejacket", "life jacket", "life vest", "lifevest", "vest", "life preserver", "life jacket", "lifevest", "lifeboat"], "difficult_direct_answer": false, "rationales": ["A life vest keeps the person safe from falling in the water.", "The safety harness around his chest is usually given this name.", "The garment has no sleeves and is buoyant."], "image": "val2014/COCO_val2014_000000261587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176440, "question_id": "5XXVkGNJxkCWAuKQEiTGsX", "question": "In what year was this company involved in a large scale scandal?", "choices": ["2007", "2020", "2018", "2015"], "correct_choice_idx": 2, "direct_answers": ["oh eight", "2016", "2016", "2018", "oh eight", "2016", "2016", "2016", "2016", "2016"], "difficult_direct_answer": false, "rationales": ["The year was 2018.", "The building which is in the center of this picture is wells cargo. in 2018 it was involved in a very big financial scandal.", "A wells fargo logo is on the side of a building. wells fargo was in a scandal a few years back."], "image": "train2014/COCO_train2014_000000176440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215691, "question_id": "5XaJvpeHDB432LetBjXveR", "question": "What is the name of the service that can fix sinks?", "choices": ["carpenter", "electrician", "plumber", "roofer"], "correct_choice_idx": 2, "direct_answers": ["plumber", "plumber", "plumbing", "plumber", "plumbing", "plumbing", "plumbing", "plumber", "plumber", "plumber"], "difficult_direct_answer": false, "rationales": ["The plumber will fix it.", "The plumber helps repair the water pipes.", "A plumber is someone who fixes sinks and other appliances with pipes."], "image": "val2014/COCO_val2014_000000215691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4289, "question_id": "5XhFBgzLrDLsiSi69abZ3Q", "question": "Why is the ground reflecting light?", "choices": ["its plastic", "its glass", "its sand", "its wet"], "correct_choice_idx": 3, "direct_answers": ["wet", "rain", "rain", "wetness", "wet", "its wet", "it's wet", "sun", "wetness", "wet"], "difficult_direct_answer": false, "rationales": ["It appears there is water on the ground, probably from a recent rain.", "The ground is wet.", "It has rained and left water on the sidewalk."], "image": "train2014/COCO_train2014_000000004289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292257, "question_id": "5XiudGLEHLgaganUe4wM2k", "question": "What is the man closest to the ground using?", "choices": ["pogo stick", "drill", "machete", "baseball glove"], "correct_choice_idx": 3, "direct_answers": ["catchers mit", "gloves", "mitt", "catcher's mitt", "catcher's mitt", "glove", "baseball glove", "hand", "catcher's mitt", "baseball mitt"], "difficult_direct_answer": false, "rationales": ["The man has a baseball mitt on.", "The man that is bending down is holding a catcher's mitt.", "The relative closeness to the ground of the men is clear and the most distinct feature of the lowest man is answer a and consistent with this setting and activity."], "image": "train2014/COCO_train2014_000000292257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109738, "question_id": "5YGJobUtohE56uLUcwunnw", "question": "The small truck was customized to fit at least how many people?", "choices": ["nine", "seven", "four", "21"], "correct_choice_idx": 2, "direct_answers": ["two", "one", "four", "four", "two", "three", "two people", "two", "two", "five"], "difficult_direct_answer": false, "rationales": ["There are two seats in the front and two seats in the bed of the truck.", "There are two seats in the cab and two in the bed.", "A two door truck usually only seats two people but this truck has two extra seats attached to the trunk space."], "image": "train2014/COCO_train2014_000000109738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565875, "question_id": "5YQDr5KcqtSLRsZDNgWAs2", "question": "What does this animal eat?", "choices": ["bears only", "candy only", "everything", "veggies only"], "correct_choice_idx": 2, "direct_answers": ["fish", "grass", "meat", "everything", "meat", "meat", "grass", "grass", "grass", "everything"], "difficult_direct_answer": false, "rationales": ["There is a bear which is an omnivore.", "The animal is a bear and bears are notorious for being omnivores in nature but also convenience eaters that will eat anything they can get.", "A lone bear is walking down a dirt road."], "image": "train2014/COCO_train2014_000000565875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214481, "question_id": "5Ynm8Q4VAfpEDThfmsapSf", "question": "What is the window treatment called?", "choices": ["shade", "valance", "cornice", "cafe curtain"], "correct_choice_idx": 1, "direct_answers": ["wind chimes", "molding", "tinted", "valance", "glass etching", "tempered glass", "shades", "tint", "curtain", "valance"], "difficult_direct_answer": true, "rationales": ["Traditionally the windows treatment are called valances.", "The other options don't apply because that is what it's normally called.", "A shorter curtain above a window frame."], "image": "train2014/COCO_train2014_000000214481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403981, "question_id": "5ZCt8Js6JDiVVcpoXhktTw", "question": "What is the purpose of the crate in this image?", "choices": ["protection", "heater", "chair", "storage"], "correct_choice_idx": 2, "direct_answers": ["chair", "sitting", "seat", "sitting", "container", "sitting", "sitting", "sitting", "chair", "chair"], "difficult_direct_answer": false, "rationales": ["The crate is for people to sit on.", "The purpose is a chair.", "The man is sitting on the crate, also there are no chairs."], "image": "train2014/COCO_train2014_000000403981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30448, "question_id": "5ZMsd4eCwrwKSxYh72vRda", "question": "What type of area is this?", "choices": ["tropical", "urban", "mountains", "farm"], "correct_choice_idx": 0, "direct_answers": ["residential", "marina", "tropical", "suburban", "beach", "side street", "tropical coast", "tropical", "beach", "tropical"], "difficult_direct_answer": false, "rationales": ["The area has palm trees which only grow in areas where it is warm year-round.", "This is a tropical area because there are many palm trees in the background.", "The palm trees are always in that type of area."], "image": "val2014/COCO_val2014_000000030448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409566, "question_id": "5ZYswyYpefAZyGqVM9R2MH", "question": "What items are facing each other?", "choices": ["posters", "faucet", "buckets", "spoons"], "correct_choice_idx": 1, "direct_answers": ["sink bathtub", "sink tap", "sink tub", "tub sink", "sink tub", "faucets", "sink bathtub", "tub mirror", "tub sink", "faucet"], "difficult_direct_answer": false, "rationales": ["The spouts are on opposite sides of the sink.", "The water dispensers on the sink are turned towards each other.", "The faucets are facing one another."], "image": "val2014/COCO_val2014_000000409566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91442, "question_id": "5ZdSRXNsiwxdgMNjhSVSkC", "question": "Why can't people go down this road at this time?", "choices": ["tow accident", "bus accident", "escaped prisoner", "fire"], "correct_choice_idx": 1, "direct_answers": ["barrier", "it's blocked", "accident", "bus accident", "closed", "accident", "wreck", "bus accident", "protection", "blocked"], "difficult_direct_answer": false, "rationales": ["There is a bus behind the caution tape.", "The vehicle is sideways and off the road in someone's hard", "There's a bus on the side of the road facing the grass"], "image": "train2014/COCO_train2014_000000091442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371945, "question_id": "5ZoqGAFyFgDdMDYXxtDg9P", "question": "What nation is this train from?", "choices": ["norway", "holland", "germany", "sweden"], "correct_choice_idx": 0, "direct_answers": ["england", "norway", "rotterdam", "rotterdam", "germany", "netherlands", "germany", "netherlands", "netherlands", "belgium"], "difficult_direct_answer": false, "rationales": ["There is a dutch flag painted on the side of the train, as well as the word 'rotterdam.' rotterdam is the capital of the netherlands (holland).", "The train has a red, white, and blue flag. rotterdam is on the side of the train.", "The nation is norway."], "image": "val2014/COCO_val2014_000000371945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432503, "question_id": "5Zspx79qwZ6eQyePm8y7Ac", "question": "In what setting is the skater skating?", "choices": ["desert", "oceanic", "farm", "urban"], "correct_choice_idx": 3, "direct_answers": ["beach", "urban", "skate park", "skate park", "skate park", "skate park", "skatepark", "skate park", "tropics", "competition"], "difficult_direct_answer": false, "rationales": ["The setting is urban.", "A skateboarder is doing a trick and there is pavement, houses, and other people around.", "The skater is in a skate park."], "image": "val2014/COCO_val2014_000000432503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168583, "question_id": "5a4QFyTRi6u3RSjH2AVgJK", "question": "Where is this crowd of people hanging out?", "choices": ["shopping mall", "grocery store", "movie theater", "amusement park"], "correct_choice_idx": 0, "direct_answers": ["mall", "mall", "mall", "shopping mall", "mall", "shops", "street", "mall", "mall", "mall"], "difficult_direct_answer": false, "rationales": ["The people are standing inside a shopping mall surrounded by many stores.", "The people are surrounded by several different retail stores.", "The shopping mall is not only where people come to shop, but, for younger patrons, a great place to hang with friends and catch up on the latest news. a mall food court provides an opportunity to spend quality time together as well."], "image": "val2014/COCO_val2014_000000168583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32391, "question_id": "5aAcjizpEbjkKMqGCDHA6Y", "question": "Which type of writing is wrote on the top of the white banner of the shop closest to the camera?", "choices": ["katakana", "spanish", "english or", "kanji"], "correct_choice_idx": 3, "direct_answers": ["shop name", "graffiti", "japanese", "kanji", "handwriting", "asian", "handwriting", "foreign", "japanese", "japanese"], "difficult_direct_answer": false, "rationales": ["That writing is in japanese.", "There is japanese writing on the banner.", "This is in japan and it appears to be a."], "image": "val2014/COCO_val2014_000000032391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312326, "question_id": "5aFcExocYJTSMDvpbCuJcG", "question": "What sort of place is this room inside of?", "choices": ["jail", "museum", "store", "butcher"], "correct_choice_idx": 1, "direct_answers": ["museum", "museum", "hotel", "museum", "museum", "museum", "museum", "museum", "museum", "bedroom"], "difficult_direct_answer": false, "rationales": ["A small bed is in the room as well as many framed pictures on the wall. to the right of the bed is a display with some artifacts.", "The old paintings on the wall and the artifacts under glass indicate that this room is meant to be viewed by visitors.", "This must be a museum since objects are in cases for display."], "image": "train2014/COCO_train2014_000000312326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469996, "question_id": "5aJsxmKUZ7hAwruxme7Wpn", "question": "What can be gotten at that booth?", "choices": ["information", "tickets", "police", "food"], "correct_choice_idx": 0, "direct_answers": ["information", "telephone", "information", "information", "ice cream", "ticket", "tickets", "tickets", "information", "bus ticket"], "difficult_direct_answer": false, "rationales": ["An in on a booth usually stands for information.", "There is a sign on top of the booth. it has an encircled letter i.", "The i symbol is for those who have questions of the transit authority."], "image": "val2014/COCO_val2014_000000469996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264567, "question_id": "5aRKHQGZqBmeqtjGeKCD5R", "question": "What type of career have these horsemen/women pursued?", "choices": ["cowboys", "mounted police", "town criers", "performers"], "correct_choice_idx": 1, "direct_answers": ["police", "horsemen", "leading", "police", "police", "police", "mounted police", "race", "poice offices", "police"], "difficult_direct_answer": false, "rationales": ["The logo on the saddle indicates the career of the horsemen/women.", "These people mounted on the horses are wearing law enforcement uniforms and have law enforcement blanket/saddles on the horses.", "In some places cops ride around on horses instead of cars."], "image": "train2014/COCO_train2014_000000264567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70347, "question_id": "5aTn9dxGJtnuEHfrzSFrLV", "question": "Where is the horse's head most likely?", "choices": ["museum", "zoo", "airport", "racetrack"], "correct_choice_idx": 2, "direct_answers": ["left", "on horse", "horse", "photo-shopped", "airport", "mans body", "on his", "photoshopped", "in airport", "halloween party"], "difficult_direct_answer": true, "rationales": ["The man has a badge.", "The head is in the airport.", "The chairs, the style of ceiling, and the moving concourse are all staples of airport architecture."], "image": "train2014/COCO_train2014_000000070347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549394, "question_id": "5ab6J6J3mVBxDgC6CvSbpu", "question": "What important piece of safety gear is the kid missing?", "choices": ["elbow pads", "knee pads", "helmet", "wrist wraps"], "correct_choice_idx": 2, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["The kid is wearing pads on his knees, wrists, and elbows. his head is uncovered.", "The kid is wearing knee pads, wrist wraps, and elbow pads. his head is uncovered.", "Skateboarding is dangerous if you fall. children are new to skateboarding and should wear helmets to protect their brains."], "image": "train2014/COCO_train2014_000000549394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565395, "question_id": "5amXXXonCebbJMjmJg4mX5", "question": "What are the couple travelling on?", "choices": ["ferry", "jet", "helicopter", "bus"], "correct_choice_idx": 0, "direct_answers": ["bus", "ferris wheel", "ferry", "cheese shop", "cheese shop", "cheese shop", "large wheel", "lift", "cable car", "wisconsin"], "difficult_direct_answer": false, "rationales": ["The couple goes on the ferry.", "Behind we can see a huge water mass.", "There is a body of water visible behind them."], "image": "val2014/COCO_val2014_000000565395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266418, "question_id": "5apETWiL6L5GHpE3jsMHYE", "question": "What is the man standing in the boat trying to do?", "choices": ["fish", "swim", "stretch", "get help"], "correct_choice_idx": 0, "direct_answers": ["fish", "fish", "fish", "fish", "fish", "fish", "catch fish", "catch fish", "fish", "fish"], "difficult_direct_answer": false, "rationales": ["The man in the boat is standing so he cast out his fishing line.", "He is waving a fly fishing rod while standing with the fishing line flying through the air.", "A man stand in a boat with a long pole. people fish from boats."], "image": "train2014/COCO_train2014_000000266418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438620, "question_id": "5azMHUyHzQq54qYrmMdT6f", "question": "Which pedestrian is walking safely?", "choices": ["neither", "both", "pink shirt", "blue shirt"], "correct_choice_idx": 2, "direct_answers": ["pink shirt", "man", "man", "man", "man", "man", "right", "on sidewalk", "pink shirt", "pink shirt"], "difficult_direct_answer": false, "rationales": ["A man in a pink shirt is safely walking on the sidewalk, and is the only prominent figure of a pedestrian for this image.", "The man in the pinkish shirt is walking facing the traffic and not too close to the street.", "Both are walking on the sidewalk."], "image": "train2014/COCO_train2014_000000438620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153462, "question_id": "5b4g3Ke55xDFet8NyjYmKd", "question": "Why is the man wearing an orange jacket?", "choices": ["visibility", "dress code", "camouflage", "fashion"], "correct_choice_idx": 0, "direct_answers": ["visibility", "safety", "visibility", "safety", "visibility", "protection", "safety visibility", "safety", "guide trains", "increased visibility"], "difficult_direct_answer": false, "rationales": ["He needs to maintain visibility for safety reasons.", "It's a bright color so he can easily be seen by the train engineers", "He is wearing to be safe and visible to the trains."], "image": "train2014/COCO_train2014_000000153462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507996, "question_id": "5b8fQA8uBuxU7q8EBB7pyr", "question": "What is the boy look at across the water?", "choices": ["sand", "nature", "land", "trees"], "correct_choice_idx": 2, "direct_answers": ["boat", "plant", "see boats", "waves", "shore", "land", "horizon", "land", "shore", "horizon"], "difficult_direct_answer": false, "rationales": ["The land on the other side of the lake.", "The only thing visible across the water is more land.", "There is sand and trees on the other side of the water and it's difficult to see exactly what he sees"], "image": "train2014/COCO_train2014_000000507996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221916, "question_id": "5bHhaDMAuzaH9d68Pw8Hv8", "question": "What number can be found on the plate in the ground all the way to the right?", "choices": ["552", "961", "886", "169"], "correct_choice_idx": 1, "direct_answers": ["961", "961", "961", "961", "961", "961", "961", "961", "961", "961"], "difficult_direct_answer": false, "rationales": ["A pillar can be seen in the right corner with this number displayed on it.", "The number 961 is on the structure.", "The numbers nine six and one are in black on the metal plate."], "image": "train2014/COCO_train2014_000000221916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470057, "question_id": "5baGyFi5rkJJwfBWzJ8qv9", "question": "What is the large rectangular container against the white wall used to collect?", "choices": ["toys", "water", "animals", "trash"], "correct_choice_idx": 3, "direct_answers": ["trash", "trash", "trash", "trash", "trash", "trash", "trash", "trash", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["The large rectangular container is a dumpster.", "The sign on the container says something about waste.", "There is a dumpster."], "image": "train2014/COCO_train2014_000000470057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579859, "question_id": "5bkJZLcSoMDm2sa593XRst", "question": "The people riding on the red vehicle on the road can be classified as what type of passengers?", "choices": ["commuters", "tourists", "workers", "students"], "correct_choice_idx": 1, "direct_answers": ["tourists", "tourists", "working class", "tourists", "tourists", "tourists", "working class", "tourists", "working class", "tourists"], "difficult_direct_answer": false, "rationales": ["The bus is a tour bus.", "The bus is full of people that are on a tour.", "They are in a double-decker bus in the open top area. visitors like to see the neighborhood they are traveling in."], "image": "val2014/COCO_val2014_000000579859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566798, "question_id": "5bn8PcFE3ossA2TwLFBvkj", "question": "Who does the man communicate with here?", "choices": ["waiter", "child", "seated woman", "phone caller"], "correct_choice_idx": 3, "direct_answers": ["colleague", "phone", "partner", "phone", "phone", "other person", "phone caller", "woman", "lady", "phone"], "difficult_direct_answer": false, "rationales": ["He is speaking with someone not present in this location.", "The man has a phone.", "The man is speaking on the phone; it assumed that he is talking to someone on the phone."], "image": "train2014/COCO_train2014_000000566798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386581, "question_id": "5cVN97XsWdDewcBtGuy63D", "question": "What will she put on the car?", "choices": ["ticket", "business card", "flyer", "registration"], "correct_choice_idx": 0, "direct_answers": ["ticket", "ticket", "parking ticket", "ticket", "ticket", "ticket", "ticket", "ticket", "ticket", "ticket"], "difficult_direct_answer": false, "rationales": ["The police officer is ticketing the car.", "The police officer hands out tickets.", "The ticket will go on the car."], "image": "val2014/COCO_val2014_000000386581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445019, "question_id": "5cVXnyprTzgdpDGfansZJB", "question": "What is near the ball?", "choices": ["toddler", "baby", "batter", "dog"], "correct_choice_idx": 2, "direct_answers": ["bat", "bat", "baseball bat", "bat", "baseball", "batter", "bat", "batter", "batter", "bat"], "difficult_direct_answer": false, "rationales": ["The batter is the closest person to the ball.", "There are no kids or non-human animals near the ball.", "The ball is flying toward the batter that is going to try to hit it."], "image": "val2014/COCO_val2014_000000445019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116694, "question_id": "5cfxAHGLsZvFekBFwZ2PVv", "question": "What might the yellow vehicle carry?", "choices": ["cars", "airplane", "furniture", "mobile home"], "correct_choice_idx": 2, "direct_answers": ["moving luggage", "people", "money", "antiques", "merchandise", "packages", "cargo", "food", "furniture", "food"], "difficult_direct_answer": true, "rationales": ["Household items will fit in a delivery truck.", "The box cab visible on the partially visible yellow truck on the right side of the image identifies it as a moving truck often rented to move houses.", "This is a large box moving truck that can hold many items"], "image": "train2014/COCO_train2014_000000116694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391235, "question_id": "5cgdWEiBdoSbFiU8V23crs", "question": "What does this girl pretend to play here?", "choices": ["tennis", "horseback roping", "golf", "cooking"], "correct_choice_idx": 2, "direct_answers": ["video games", "golf", "wii golf", "golf", "golf", "golf", "golf", "golf", "golf", "golf"], "difficult_direct_answer": false, "rationales": ["The character on the screen is playing an animal-free sport that uses a club, not a racquet, on a grassy surface.", "The character in the game is using a club to hit a ball.", "The girl is playing golf."], "image": "train2014/COCO_train2014_000000391235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416516, "question_id": "5cynHe82CBgkamrH5KSpW8", "question": "What is the woman keeping in the stroller?", "choices": ["fruit", "groceries", "baby", "packages"], "correct_choice_idx": 2, "direct_answers": ["baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby"], "difficult_direct_answer": false, "rationales": ["The woman has a baby stroller.", "The stroller is for babies.", "A stroller such as the one here is commonly used for babies. it's possible to keep almost any small object in a stroller, but strollers are specifically designed to keep babies safe."], "image": "train2014/COCO_train2014_000000416516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530030, "question_id": "5d3eLY44gmkJmRHZdP5zR9", "question": "What is the white swirly thing?", "choices": ["marshmallow", "piglet", "ice cream", "cloud"], "correct_choice_idx": 2, "direct_answers": ["ice cream", "ice cream", "ice cream", "icecream", "ice cream", "ice cream", "ice cream", "ice cream", "ice cream", "icecream cone"], "difficult_direct_answer": false, "rationales": ["The white swirl is from an ice cream cone.", "It is vanilla ice cream in a cone.", "The swirl is ice cream."], "image": "train2014/COCO_train2014_000000530030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511842, "question_id": "5dTskK9Zw9JMQuxzEKdTJd", "question": "What can be done with these sauces?", "choices": ["marinating", "dipping", "sauteing", "grilling"], "correct_choice_idx": 1, "direct_answers": ["dip", "dipped", "dipping", "dipped", "dip", "dipping", "dipping", "store sauce", "donut dipping", "dipping"], "difficult_direct_answer": false, "rationales": ["The sauces can be used to dip the donuts in.", "In this image, that's what can be done. they can also be used in the other options.", "Some people like to dip their donuts."], "image": "train2014/COCO_train2014_000000511842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541999, "question_id": "5dfh7QjMqTu3uWzXzUes3S", "question": "What type of pants is the man in orange wearing?", "choices": ["khakis", "suit pants", "jeans", "shorts"], "correct_choice_idx": 2, "direct_answers": ["jeans", "jeans", "jeans", "jeans", "blue jeans", "jeans", "jeans", "jeans", "jeans", "jeans"], "difficult_direct_answer": false, "rationales": ["The man is clearly identifiable and his pants are of a color, style and texture consistent with answer a.", "The man is wearing pants made of blue denim.", "The man is wearing jeans because the pants are made from denim."], "image": "train2014/COCO_train2014_000000541999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576363, "question_id": "5dohAB95DKShjHRraQnTg3", "question": "What item would help the woman in the light tan shirt?", "choices": ["bookend", "ottoman", "seesaw", "luggage cart"], "correct_choice_idx": 3, "direct_answers": ["another hand", "line movement", "cart", "stroller", "another cart", "backpack", "larger cart", "luggage cart", "cart", "large cart"], "difficult_direct_answer": true, "rationales": ["She needs something to put her luggage on to pull it.", "She is carrying a lot of bags with her.", "The woman is carrying a lot of luggage."], "image": "val2014/COCO_val2014_000000576363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431954, "question_id": "5e7oyZNwJPfRiesMeeqAUe", "question": "In which sort of event does this woman pose?", "choices": ["sale", "auction", "art musem", "expo"], "correct_choice_idx": 3, "direct_answers": ["halloween", "marathon", "cosplay", "superhero convention", "convention", "comicon", "cosplay", "expo", "cosplay", "hero"], "difficult_direct_answer": false, "rationales": ["The woman is dressed in a costume at an expo or convention center.", "There appears to be many booths in the background with displays of different characters. the woman is also in costume as one would be at option a.", "She is dressed up at an expo."], "image": "val2014/COCO_val2014_000000431954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435820, "question_id": "5eAoS3ZfZPBQ9aeyXBn4kX", "question": "What do two of the three men have on?", "choices": ["glasses", "tie", "shirt", "pants"], "correct_choice_idx": 0, "direct_answers": ["wine glasses", "glasses", "glasses", "glasses", "dress shirts", "glasses", "black pants", "eye glasses", "glasses", "glasses"], "difficult_direct_answer": false, "rationales": ["The men have glasses.", "All are wearing shirts and pants. they do not have ties.", "None of the men are wearing a tie, all have shirts, and all have pants. two of the three men are wearing spectacles."], "image": "val2014/COCO_val2014_000000435820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411417, "question_id": "5eHNeauJqNi82qot7GHTaj", "question": "What kind of thing is hanging on the motorcycle?", "choices": ["air freshener", "groceries", "parking permit", "necklace"], "correct_choice_idx": 2, "direct_answers": ["handicap placard", "strap", "parking pass", "dog", "disabled card", "handicap sign", "parking permit", "cord", "handicapped marker", "dog"], "difficult_direct_answer": true, "rationales": ["This shows they are allowed to park in handicap spots", "There is a visible blue placard with a handicap sign hanging off the motorcycle. this item is placed on and in vehicles to designate their permission to park in certain handicap accessible spots.", "The parking permit is hanging."], "image": "train2014/COCO_train2014_000000411417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232963, "question_id": "5eTm852dC2QXbGoEuXKL9Y", "question": "What type of piano is the man playing?", "choices": ["grand piano", "console piano", "spinet piano", "upright piano"], "correct_choice_idx": 0, "direct_answers": ["grand", "grand", "baby grand", "grand piano", "grand piano", "gran piano", "baby grand", "grand pain", "grand", "grand piano"], "difficult_direct_answer": false, "rationales": ["The man is sitting at a piano that's large enough to be a grand one.", "The man is playing a piano that has the top open.", "This is a grand piano."], "image": "train2014/COCO_train2014_000000232963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539233, "question_id": "5ea3BXYHHKZJAmJEmCj7nE", "question": "How many desk lamps are there?", "choices": ["two", "one", "four", "three"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "two", "three", "three", "four", "three", "three", "four", "two"], "difficult_direct_answer": false, "rationales": ["The desk has many lamps making it easy to see.", "That's how many lamps are there.", "There are two desk lamps on the left. two additional desk lamps are on the right."], "image": "train2014/COCO_train2014_000000539233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41890, "question_id": "5eeLSasArCim7kdxP5PKSb", "question": "What country are the Bikers travelling in?", "choices": ["holland", "united states", "mexico", "canada"], "correct_choice_idx": 1, "direct_answers": ["america", "usa", "united states", "united states", "usa", "usa", "united states", "usa", "united states", "usa"], "difficult_direct_answer": false, "rationales": ["The bikers are traveling on a road with the american flag hanging so they are in the united states.", "You can see the usa flag in the background.", "An american flag can be seen waving behind the group of people."], "image": "train2014/COCO_train2014_000000041890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466882, "question_id": "5ekXPJf5pLtezAEMPaFrKE", "question": "Which sport is played on a similar field?", "choices": ["soccer", "racquetball", "ice hockey", "water polo"], "correct_choice_idx": 1, "direct_answers": ["tennis", "volleyball", "pickleball", "handball", "volleyball", "racquetball", "doubles tennis", "pickleball", "table tennis", "volleyball"], "difficult_direct_answer": false, "rationales": ["This pretty much is the only setting for my choice due to the other options have vastly different settings.", "The people are playing tennis. the field does not have grass, water, or ice.", "There is a game of squash being playing on the tennis court."], "image": "val2014/COCO_val2014_000000466882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433715, "question_id": "5etGsJuDnkJBsWgboh3Kxq", "question": "Where did OG skateboarders develop this style of boarding?", "choices": ["underpasses", "skateparks", "canals", "swimming pools"], "correct_choice_idx": 3, "direct_answers": ["california", "swimming pools", "empty pools", "swimming pools", "skate parks", "california", "california", "neighborhood", "united states", "california"], "difficult_direct_answer": false, "rationales": ["In california many people have these in their backyard and empty ones are deep enough to practice skating tricks.", "And then later b.", "The skateboarder is skating inside a pit that looks like a swimming pool."], "image": "train2014/COCO_train2014_000000433715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268059, "question_id": "5f7Epq7fhnfwGNMStpMVev", "question": "Why is she cooking with wood?", "choices": ["environmentally friendly", "it's cheaper", "better taste", "no electricity"], "correct_choice_idx": 3, "direct_answers": ["old fashion", "fun", "fire", "make soup", "on burner", "hot fire", "no electricity", "campground", "prairie experience", "to eat"], "difficult_direct_answer": true, "rationales": ["She doesn't have a way to heat the food except over the wood.", "This simple wooden hut provides a bit of shelter, but has no electricity, so to heat up food items or water, a wood fire will provide plenty of heat.", "She doesn't have electricity."], "image": "val2014/COCO_val2014_000000268059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373683, "question_id": "5fLEeqjJAVh2Q7qXQS6cdn", "question": "Based on their gear they are most likely competing in what event?", "choices": ["biathlon", "heptathlon", "triathlon", "pentathlon"], "correct_choice_idx": 0, "direct_answers": ["shooting", "shooting", "shooting", "biathlon", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["There is a rifle on their back and skis on their feet. the prefix bi means two.", "They have a gun and skis and those are the two elements of a biathlon.", "They are skiing together."], "image": "train2014/COCO_train2014_000000373683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556512, "question_id": "5fLL6vkqm7UPsFRG6Zdjv4", "question": "What is the orange object on the woman's foot?", "choices": ["crocs", "water bottle", "swimfins", "socks"], "correct_choice_idx": 2, "direct_answers": ["flipper", "flipper", "flipper", "swimfins", "flipper", "flipper", "flipper", "flipper", "flipper", "flipper"], "difficult_direct_answer": false, "rationales": ["I see the swimfins on them.", "This helps her move through the water easier as she swims", "These people are swimming in the ocean where answer a would be used. the object is the right size and shape to be a swim fin and would be consistent with the activity viewed and the intended purpose of the object."], "image": "train2014/COCO_train2014_000000556512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242301, "question_id": "5fRP46NnG2wCj6CTGtDxo8", "question": "It is time to hit the?", "choices": ["bed", "waves", "hay", "weights"], "correct_choice_idx": 2, "direct_answers": ["hay", "sack", "hay", "hay", "sack", "sack", "sack", "hay", "hay", "sack"], "difficult_direct_answer": false, "rationales": ["These people are jumping into or already landed on their bed. hitting the hay is a phrase that means going to bed.", "This is a figure of speech for \"going to bed\", which is what they're all on or next to.", "These people look like they are jumping in to bed and based on the light on in the room it is likely to be night. colloquially, getting into bed at night or an appropriate sleeping time is known as hitting the hay."], "image": "val2014/COCO_val2014_000000242301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482300, "question_id": "5fdovEepZy9rEd6GymVuP2", "question": "What is the most appropriate surface for this truck to drive on?", "choices": ["grass", "asphalt", "sidewalk", "concrete"], "correct_choice_idx": 1, "direct_answers": ["flat road", "highway", "road", "road", "street", "asphalt", "road", "asphalt", "road", "road"], "difficult_direct_answer": false, "rationales": ["The best place would be on a black top driveway", "This truck drives best over asphalt roads.", "Asphalt is a smooth surface. tires ride best on a smooth surface."], "image": "train2014/COCO_train2014_000000482300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62889, "question_id": "5fp3Vm7Esr6ZZxderDsTeV", "question": "What would people call the animals that are being caged?", "choices": ["livestock", "wild animals", "endangered species", "zoo animals"], "correct_choice_idx": 0, "direct_answers": ["sheep", "captive", "sheep", "sheep", "cows", "sheep", "sheep", "sheep", "livestock", "sheep"], "difficult_direct_answer": false, "rationales": ["People would call these caged animals to be livestock such as sheep.", "Large animals are grazing in a pasture. livestock is kept in pastures and graze.", "The animals are sheep. they are being raised for commercial purposes."], "image": "val2014/COCO_val2014_000000062889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483992, "question_id": "5gG25WupEEmsugwh9MscmY", "question": "What s the main property of the red material in the jar on the back table?", "choices": ["sour", "spicy", "oily", "salty"], "correct_choice_idx": 1, "direct_answers": ["spice", "red chili", "peppery heat", "spice", "pepper flakes", "peppers", "spicy", "pepper", "pepper", "heat"], "difficult_direct_answer": false, "rationales": ["The jar has red pepper flakes in it.", "The red material is peppers which are spicy.", "The main property is spicy."], "image": "train2014/COCO_train2014_000000483992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86913, "question_id": "5gcbQ6JqzYo2znUtYoCauo", "question": "What is the man swinging?", "choices": ["baseball bat", "shoelaces", "basket", "tennis racquet"], "correct_choice_idx": 3, "direct_answers": ["tennis racquet", "racket", "tennis racket", "tennis racket", "tennis raquel", "tennis racket", "hit ball", "racket", "racket", "tennis racket"], "difficult_direct_answer": false, "rationales": ["The man is using a racquet.", "The man has a tennis racquet in his hand.", "The man is swinging a tennis racquet because he is playing on a tennis court and swinging at a ball."], "image": "val2014/COCO_val2014_000000086913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283210, "question_id": "5go7LMBwjA7tQq57PwTHb5", "question": "How can they be identified?", "choices": ["paws", "eyes", "tags", "scars"], "correct_choice_idx": 2, "direct_answers": ["teeth", "skin color", "aggressive dogs", "tags", "collars", "tags", "tags", "dogs", "collars", "collar"], "difficult_direct_answer": false, "rationales": ["They have nametags on their collars.", "The have collars with tags.", "The dogs have tags on their collars likely with names and addresses."], "image": "val2014/COCO_val2014_000000283210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99995, "question_id": "5gzrVP7BaTYtQFa9BjHog6", "question": "The man plays a similar sport to what person?", "choices": ["randy couture", "mike trout", "alex morgan", "daniel bryan"], "correct_choice_idx": 1, "direct_answers": ["aaron judge", "mike trout", "baseball player", "babe ruth", "cricketer", "baseball", "derek jeter", "hank aaron", "palmer", "professional player"], "difficult_direct_answer": true, "rationales": ["Mike trout is a baseball player.", "The man is playing baseball, not wrestling, mixed martial arts, or soccer.", "The man plays tennis."], "image": "train2014/COCO_train2014_000000099995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240101, "question_id": "5h4vSC48ZBpRh5gg7ACYBs", "question": "The remote is meant to communicate with what?", "choices": ["telephone", "computer", "television", "video game"], "correct_choice_idx": 2, "direct_answers": ["television", "tea shirt", "wii console", "tv", "television", "video monitor", "wii console", "wii", "television", "tv"], "difficult_direct_answer": false, "rationales": ["The remote has buttons to change channels.", "Many functions can be performed by touching one of the multiple buttons.", "The remote is meant to direct the television's channels."], "image": "train2014/COCO_train2014_000000240101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468798, "question_id": "5hCJMCG3VvsXLmykpZhpuV", "question": "What gas causes these lights to glow?", "choices": ["argon", "oxygen", "neon", "hydrogen"], "correct_choice_idx": 2, "direct_answers": ["neon", "neon", "natural", "neon", "neon gas", "neon", "neon", "neon", "i do", "neon"], "difficult_direct_answer": false, "rationales": ["Lights that glow are caused by neon gas used with electricity.", "The tube is sealed with the gas and glows when electricity is put through it.", "Signs on a building are lit up. many signs are lit with neon."], "image": "train2014/COCO_train2014_000000468798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318396, "question_id": "5hfV9yvbtdZ23ApLh8roNY", "question": "What is this snowboarder in the process of doing?", "choices": ["grabbing", "jibbing", "airing", "stalling"], "correct_choice_idx": 1, "direct_answers": ["sliding", "jibbing", "grinding", "railing", "grinding rail", "railing", "trick", "preparing jump", "grinding", "railing"], "difficult_direct_answer": false, "rationales": ["The snowboarder is trying to slide down.", "They are performing a stunt.", "The snowboarder is jibbing down the mountain."], "image": "train2014/COCO_train2014_000000318396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430166, "question_id": "5hriKhATFin6FXfKAdVvEK", "question": "What items are being shown off most frequently here?", "choices": ["door knobs", "green plants", "boxes", "pottery"], "correct_choice_idx": 3, "direct_answers": ["pottery", "vases", "planters", "planters", "flowers", "planters", "pottery", "ceramics", "vases", "ornaments"], "difficult_direct_answer": false, "rationales": ["Pottery is being shown.", "There are plants, but there are more things that are made out of clay on display.", "Potted plants are on display with other dishes as well."], "image": "train2014/COCO_train2014_000000430166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6357, "question_id": "5ht6g4fNyooffRhmXHqYnt", "question": "Who is in the bed?", "choices": ["mom", "rabbit", "little girl", "man"], "correct_choice_idx": 2, "direct_answers": ["little girl", "child", "girl", "child", "child", "girl", "young girl", "little girl", "little girl", "little girl"], "difficult_direct_answer": false, "rationales": ["She is young and holding a stuffed bunny.", "There is no adult in the bed. there is a rabbit above the bed, but it is a stuffed animal.", "A person, not an animal, is in the bed. the person is not an adult."], "image": "train2014/COCO_train2014_000000006357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27191, "question_id": "5i6gfh9x7MCp2qLqYZQLhq", "question": "Which of the following is famous for drowning while practicing this sport?", "choices": ["mark foo", "julian wilson", "gabriel medina", "lakey peterson"], "correct_choice_idx": 0, "direct_answers": ["ace cool", "surf", "unknown", "mark foo", "mark foo", "kirk passmore", "mark foo", "oscar serra", "mark foo", "oscar serra"], "difficult_direct_answer": false, "rationales": ["They are surfing.", "Mark foo died while surfing.", "The figures in this image hold surfboards near the waves of the ocean. mark foo is a famous surfboarder who drowned."], "image": "train2014/COCO_train2014_000000027191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418219, "question_id": "5iAYesxp5L4cVxAv9MpJb9", "question": "What type trees are shown in the area nearest the red shirted walker?", "choices": ["fern", "palm", "evergreen", "deciduous"], "correct_choice_idx": 3, "direct_answers": ["willow trees", "dormant trees", "old ancient", "fall trees", "deciduous", "oak", "maple", "deciduous", "weeping willow", "maple"], "difficult_direct_answer": false, "rationales": ["The trees are bare and shed their leaves each year when the weather becomes cold.", "The trees lose their leaves every year.", "They are classified as this because they lose their leaves every fall"], "image": "val2014/COCO_val2014_000000418219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322302, "question_id": "5iHazvgEZ3c2oq2P5hBTb7", "question": "Where is this desktop computer most likely located?", "choices": ["work", "library", "home", "classroom"], "correct_choice_idx": 2, "direct_answers": ["home", "home office", "home office", "home office", "office", "office", "office", "office", "below desk", "home office"], "difficult_direct_answer": false, "rationales": ["There is a cat looking out the window. that's a clear sign that this computer is in someone's home.", "There is a cat which makes it seem like it is at home.", "Due to the configuration of the computer workstation, window and cat, this scene is mostly likely in someone's home."], "image": "train2014/COCO_train2014_000000322302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235281, "question_id": "5iRcH7z2HoSaGWT9jDLGrh", "question": "What is the bottom signal on the light?", "choices": ["heart", "cow", "egg", "star"], "correct_choice_idx": 3, "direct_answers": ["star", "star", "star", "star", "star", "star", "star", "star", "star", "star"], "difficult_direct_answer": false, "rationales": ["The bottom shape is pointed like a star.", "A lit sign has three shapes in a column with a bike on top, a heart in the middle, and a star on the bottom.", "It is a star."], "image": "train2014/COCO_train2014_000000235281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174901, "question_id": "5ig59SKZyvFaKYWEtJXMkS", "question": "What type of lighting surrounds the mirror?", "choices": ["led", "florescent", "incandescent", "laser"], "correct_choice_idx": 1, "direct_answers": ["halogen", "florescent", "tube", "flourescent", "vanity", "fluorescent", "halogen", "soft", "halogen", "wall sconce"], "difficult_direct_answer": false, "rationales": ["The mirror is surrounded by florescent lighting.", "Traditionally light bulbs are of a florescent type so you can illuminate the room.", "This is a tube lightbulb"], "image": "train2014/COCO_train2014_000000174901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146294, "question_id": "5ijUFQaye3QjyAvx4anGZ3", "question": "What does Fly Emirates provide to the game?", "choices": ["drink", "food", "sponsor", "transportation"], "correct_choice_idx": 2, "direct_answers": ["sponsor", "sponsor", "sponsorship", "sponsorship", "sponsorship", "sponsorship", "plane", "sponsorship", "plane", "sponsorship"], "difficult_direct_answer": false, "rationales": ["Sponsors are generally displayed on the boards of tennis games.", "They give money to hold the match and get advertisement in return", "They provide funding for the event in exchange for advertisement"], "image": "train2014/COCO_train2014_000000146294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103413, "question_id": "5ipdNK2HncWR6gisyabo5U", "question": "What will happen to the boy next?", "choices": ["falling off", "lying down", "landing", "flipping"], "correct_choice_idx": 2, "direct_answers": ["land", "landing", "landing", "fall", "fall", "landing", "falling down", "fall", "fall", "landing"], "difficult_direct_answer": false, "rationales": ["He will land from his stunt.", "After a skateboard trick, they land on the ground.", "The boy is going back to the ground to land."], "image": "val2014/COCO_val2014_000000103413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9542, "question_id": "5j4SYnjB9xGomi9bEwGZtS", "question": "What letter is obscured by the sign after the PHO?", "choices": ["n", "t", "m", "g"], "correct_choice_idx": 1, "direct_answers": ["t", "letter t", "o", "letter t", "letter t", "t", "letter t", "rain", "letter t", "letter o"], "difficult_direct_answer": false, "rationales": ["The word is 'photo'.", "The sign says photo", "The letter \"t\" is being covered up."], "image": "train2014/COCO_train2014_000000009542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403937, "question_id": "5jDFspvnRS626XNEzLCy36", "question": "Who founded this school?", "choices": ["singh", "nehru", "chaudhary", "patel"], "correct_choice_idx": 1, "direct_answers": ["jawaharlal nehru", "someone", "india", "mayor", "keeper", "nehru", "indian", "indians", "indians", "na"], "difficult_direct_answer": true, "rationales": ["A school is shown with a sign on it.", "Nehru is the visionary behind the school.", "Nehru founded the school."], "image": "val2014/COCO_val2014_000000403937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264296, "question_id": "5jHYgMzuSQi8tDCbd2975s", "question": "What game would one expect to be played in this room?", "choices": ["soccer", "basketball", "tennis", "football"], "correct_choice_idx": 1, "direct_answers": ["basket ball", "basketball", "basketball", "baseball", "basketball", "ultimate frisbee", "basketball", "basketball", "basketball frisbee", "basketball"], "difficult_direct_answer": false, "rationales": ["Students are playing on a basketball court.", "The floor of the gym has lines painted on it that are used for basketball games.", "One can see the nets on the wall where the ball would be thrown."], "image": "train2014/COCO_train2014_000000264296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174028, "question_id": "5jJ6V4vDTHzLbK4hd9HQT7", "question": "What is the brown wall behind the group made out of?", "choices": ["wood", "glass", "bronze", "plaster"], "correct_choice_idx": 0, "direct_answers": ["wood", "wood", "wood", "wood", "wood", "wood", "painting", "wood", "painting", "wood"], "difficult_direct_answer": false, "rationales": ["Wood is brown and has grains in it, just like the wall.", "It is the only material that is brown and can be easily painted on as pictured here.", "The wall is wood."], "image": "train2014/COCO_train2014_000000174028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81622, "question_id": "5jJt97hWAAENs7ELPxPUQd", "question": "From which material is the roofing most visible here sourced?", "choices": ["clay", "tar", "concrete", "wood"], "correct_choice_idx": 0, "direct_answers": ["clay", "stucco", "tile", "brick", "clay", "tile", "tile", "clay", "wood", "ceramic"], "difficult_direct_answer": false, "rationales": ["The type of roofing on the left building is may with clay.", "Red tiles can be seen on the roof.", "The materials are likely clay since the rooftops are red."], "image": "train2014/COCO_train2014_000000081622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572012, "question_id": "5jNoDjmmjaULPNVgGaiGxA", "question": "The feet of the man are placed on what two wheeled object?", "choices": ["scooter", "hoverboard", "skateboard", "caster board"], "correct_choice_idx": 3, "direct_answers": ["skateboard", "skateboard", "skateboard", "skateboard", "balance board", "skateboard", "hover board", "skateboard", "caster board", "skateboard"], "difficult_direct_answer": false, "rationales": ["A guy is riding on a wheeled object that is not a skateboard.", "It has only two wheels and does not have handlebars on it.", "A skateboard has four wheels and this one has two."], "image": "train2014/COCO_train2014_000000572012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411629, "question_id": "5jYBiXwnBaZNMTrgvHaYDw", "question": "What is traveling towards the man?", "choices": ["tennis ball", "dog", "cow", "bee"], "correct_choice_idx": 0, "direct_answers": ["tennis ball", "tennis ball", "tennis ball", "table tennis", "tennis ball", "ball", "tennis ball", "ball", "tennis ball", "ball"], "difficult_direct_answer": false, "rationales": ["The man is going to hit the ball.", "The ball is moving toward him.", "There is a ball traveling toward the tennis player."], "image": "train2014/COCO_train2014_000000411629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103516, "question_id": "5jhDWtqCoXU47ZnkK3xrcH", "question": "Why are the top of the rails in front of the train rusty?", "choices": ["age", "material quality", "aesthetics", "recent precipitation"], "correct_choice_idx": 3, "direct_answers": ["rain", "oxidation", "old", "damp climate", "corrosion", "rain", "recent precipitation", "oxidation", "heat", "rain"], "difficult_direct_answer": false, "rationales": ["The top shows precipitation.", "Water can cause corrosion to form on metal over time.", "Metal rusts when it gets wet."], "image": "train2014/COCO_train2014_000000103516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580238, "question_id": "5jmsPk2kumre6DZUAhDFH7", "question": "Why does he have so much stuff with him?", "choices": ["shopping", "moving", "homeless", "traveling"], "correct_choice_idx": 2, "direct_answers": ["homeless", "homeless", "homeless", "homeless", "homeless", "homeless", "no home", "homeless", "no home", "homeless"], "difficult_direct_answer": false, "rationales": ["The man is homeless.", "The man has all his possessions with him because he has no permanent shelter to leave it at. he must carry it with him wherever he goes if he wants to keep it.", "The man doesn't have a home to keep them in."], "image": "train2014/COCO_train2014_000000580238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134770, "question_id": "5jsivST3NnQgNkvf3R7QQw", "question": "When was that flavor of beverage relaunched in the United States?", "choices": ["2015", "2010", "2008", "2007"], "correct_choice_idx": 3, "direct_answers": ["1980", "may twothousandseven", "coca-cola vanilla", "2020", "seventy one", "vanilla", "coca cola", "vanilla coke", "2002", "2007"], "difficult_direct_answer": true, "rationales": ["That was when coca cola vanilla was relaunched in the us.", "Coke was launched with vanilla in 2007.", "The ad on the bus is for coca-cola vanilla. it relaunched in the us before 2008."], "image": "train2014/COCO_train2014_000000134770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347131, "question_id": "5jwFc2k2YvBr6N8SEJGPUd", "question": "What fruit might this person eat first?", "choices": ["kiwi", "olives", "acai berries", "ugli"], "correct_choice_idx": 1, "direct_answers": ["pizza", "pineapple", "tomato", "olives", "olive", "tomato", "pepperoni", "apple", "olive", "apple"], "difficult_direct_answer": false, "rationales": ["The olives are on the pizza.", "The little girl has olives on her pizza.", "The person wants the olives."], "image": "train2014/COCO_train2014_000000347131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413874, "question_id": "5kDpARkjvFSKxCFWUDM5XE", "question": "What is the profession of the men in white?", "choices": ["athletes", "nurses", "doctors", "teachers"], "correct_choice_idx": 0, "direct_answers": ["tennis players", "tennis playing", "tennis players", "tennis players", "tennis players", "tennis playing", "tennis players", "tennis playing", "tennis players", "athletes"], "difficult_direct_answer": false, "rationales": ["The profession is an athlete.", "The men in white are playing tennis.", "The men are tennis players."], "image": "train2014/COCO_train2014_000000413874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326594, "question_id": "5kVj8PJbsnVSQEmSwxJtaA", "question": "This crumble is most likely what flavor?", "choices": ["plum", "blueberry", "rhubarb", "apple"], "correct_choice_idx": 3, "direct_answers": ["apple", "apple", "sweetness", "food", "apple", "bread", "apple", "bread", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["The dish with the crumble in it is in front of some apples which were probably one of the ingredients.", "An apple crumble is a popular dish.", "Most crumble dishes are apple filled."], "image": "train2014/COCO_train2014_000000326594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62531, "question_id": "5ki7uuJ29wp58PLdcfveZG", "question": "Where is the man standing?", "choices": ["kitchen", "ladies restroom", "family restroom", "mens restroom"], "correct_choice_idx": 3, "direct_answers": ["mens restroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "in bathroom", "restroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["The man is in a men's bathroom.", "There are urinals behind the man.", "Urinals are only located in this type of restroom."], "image": "train2014/COCO_train2014_000000062531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214124, "question_id": "5kvYxggkX2Z4xD5XQ8QDuD", "question": "What shape is the food?", "choices": ["circle", "hexagon", "square", "triangle"], "correct_choice_idx": 0, "direct_answers": ["round", "circle", "circular", "round", "circle", "round", "circle", "round", "round", "circular"], "difficult_direct_answer": false, "rationales": ["It's also called round.", "The food is a pizza which is circular.", "The small, round pizza-looking snack is ready to be sliced and served."], "image": "val2014/COCO_val2014_000000214124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81361, "question_id": "5mMLvBRaBWxtmYNzFB7k44", "question": "The type of animal the doll is is the same as what famous character?", "choices": ["daffy", "garfield", "pluto", "yogi"], "correct_choice_idx": 3, "direct_answers": ["bear", "pooh", "yogi", "yogi", "bear", "smokey", "pooh bear", "cartoon", "bear", "teddy bear"], "difficult_direct_answer": false, "rationales": ["Yogi is also a bear.", "Yogi is an animated bear on cartoons.", "The doll is the yogi."], "image": "val2014/COCO_val2014_000000081361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238385, "question_id": "5mfwK6NeDLtypVHgkJDytL", "question": "What zone is depicted in the photo?", "choices": ["residential", "traffic", "business", "shopping"], "correct_choice_idx": 3, "direct_answers": ["street mall", "street", "business zone", "city", "shopping", "tourist", "pedestrian", "city street", "shopping area", "pedestrian"], "difficult_direct_answer": true, "rationales": ["There are many shops.", "Building fronts are lit up in bright appealing colors on this street. this suggests a shopping zone.", "The area seems to have a lot of stores in it."], "image": "val2014/COCO_val2014_000000238385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548545, "question_id": "5moZdMe9jAsDPwvyuSXwHv", "question": "Who folded the toilet paper roll into a point to the left of the sink?", "choices": ["nanny", "clerk", "cleaning staff", "butler"], "correct_choice_idx": 2, "direct_answers": ["maid", "cleaning crew", "cleaning staff", "bottle", "cleaning staff", "attendant janitor", "maid", "toilet paper", "maid", "maid"], "difficult_direct_answer": false, "rationales": ["The cleaning staff folded the toilet paper roll to make it look presentable.", "The cleaning staff of the hotel needs to freshen up the bathroom.", "In most hotels, small touches seem welcoming to guests, including the folding of toilet paper into a point by housekeeping."], "image": "train2014/COCO_train2014_000000548545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128607, "question_id": "5mpKTnqrzhpFqpk6jP5gCz", "question": "What is the piece of equipment to the left of the monitor?", "choices": ["printer", "fax machine", "router", "computer tower"], "correct_choice_idx": 0, "direct_answers": ["printer", "printer", "printer", "hutch", "printer", "laser printer", "printer", "printer", "copier", "printer"], "difficult_direct_answer": false, "rationales": ["It is used to produce physical copies of typed documents.", "A printer is often used to print documents from a computer.", "A printer is usually the bulkiest equipment in the room which, in this case, is also true."], "image": "train2014/COCO_train2014_000000128607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445503, "question_id": "5mu6GjnQ82UZjii8KNrV3B", "question": "Where are the people located?", "choices": ["restaurant", "garage", "stadium", "office building"], "correct_choice_idx": 2, "direct_answers": ["bleachers", "spectator stands", "stands", "stands", "stands", "stands", "bleachers", "football stadium", "stadium", "stands"], "difficult_direct_answer": false, "rationales": ["The people are seated in the bleachers watching a sporting event that has cheerleaders.", "There is a large grandstand facing a field.", "The elevated rafter seating with spectators around a field with cheerleaders and athletes identifies this locale as a stadium."], "image": "train2014/COCO_train2014_000000445503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121018, "question_id": "5nLxriUZWYqJeRAoeC6RMZ", "question": "These people are making what kind of food?", "choices": ["mexican", "halal", "kosher", "chinese"], "correct_choice_idx": 0, "direct_answers": ["sandwich", "sandwiches", "sandwich", "sandwich", "sandwich", "sandwiches", "pulled pork", "mexican", "sandwich", "subs"], "difficult_direct_answer": false, "rationales": ["The food is being wrapped up in a burrito wrap.", "The people are making mexican food.", "They are spreading shredded meat and topping with a mole sauce."], "image": "train2014/COCO_train2014_000000121018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488897, "question_id": "5nqqqhXNBVzYdY6crKXtGV", "question": "What is holding the candles?", "choices": ["spinner", "candelabra", "handle", "silver stand"], "correct_choice_idx": 1, "direct_answers": ["chandelier", "holder", "candelabra", "candle stick", "candlestick", "candelabra", "candelabra", "candlestick", "candelabra", "candlestick holder"], "difficult_direct_answer": false, "rationales": ["The candle holder in the middle of the table.", "The candelabra has the candles.", "The candles are being held by a special named candle holder that has multiple arms for the candles."], "image": "train2014/COCO_train2014_000000488897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24763, "question_id": "5nvnDMCggupSzH9sLQT5cZ", "question": "What unhealthy ingredient does this food contain the most?", "choices": ["flour", "sugar", "nuts", "chocolate"], "correct_choice_idx": 1, "direct_answers": ["sugar", "sugar", "sugar", "sugar", "sugar", "sugar", "sugar", "sugar", "sugar", "sugar"], "difficult_direct_answer": false, "rationales": ["The donuts have sugar.", "Donuts are covered in sweet toppings.", "Donuts and pastries are filled with sweetener."], "image": "train2014/COCO_train2014_000000024763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243820, "question_id": "5nwNt3ymkeQbDK4s7jiVYq", "question": "What is to the right of the sink?", "choices": ["cat", "cardboard box", "scissors", "roach"], "correct_choice_idx": 2, "direct_answers": ["drying container", "water filter", "towel", "towel", "dish rack", "scissors", "towel", "mat", "cloth", "strainer"], "difficult_direct_answer": false, "rationales": ["The item is a pair of blades that pivot off a central point.", "There is a pair of scissors on the towel.", "There are no animals or boxes to the right of the sink."], "image": "train2014/COCO_train2014_000000243820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345667, "question_id": "5nzuPseZnP8atYgLxHz5Ji", "question": "What business park does this company operate out of?", "choices": ["pathfields", "safeways", "journeyman", "legacy"], "correct_choice_idx": 0, "direct_answers": ["south molten", "south molten", "pathfields park", "south molten", "pathfields", "south bolton", "mway", "pathfields", "pathfields", "southern molten"], "difficult_direct_answer": false, "rationales": ["The display of the bus shows the park but it does.", "The park works on pathfields.", "Looks like a pathfields truck."], "image": "train2014/COCO_train2014_000000345667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236718, "question_id": "5oGYejx653S5kRVLodbY8n", "question": "The children seen here are helping their family do what?", "choices": ["sell", "move", "couch surf", "play ball"], "correct_choice_idx": 1, "direct_answers": ["move", "move", "move", "moving", "move furniture", "move furniture", "transport", "move", "move", "move"], "difficult_direct_answer": false, "rationales": ["There is furniture in the back of the truck and the kids are also in the back. they are with the family while they are moving the furniture.", "The couch is in the back for them to move it.", "They are moving and have the back of the truck packed with furniture."], "image": "train2014/COCO_train2014_000000236718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48332, "question_id": "5oQTJm3UdqCeTpA4EJ2LRq", "question": "What number anniversary is being commemorated at this event?", "choices": ["316th", "216th", "276th", "376th"], "correct_choice_idx": 3, "direct_answers": ["can't see", "376th", "twenty fifth", "375th", "no idea", "not legible", "376th", "375th", "376th", "376"], "difficult_direct_answer": false, "rationales": ["It says 376 on the projection.", "The number is on the cake.", "The number says 376 on the projector."], "image": "val2014/COCO_val2014_000000048332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142080, "question_id": "5of6jkW2QcpbLBAsg4ZpS5", "question": "What does the man in white need to use?", "choices": ["grill", "emergency exit", "restroom", "sword"], "correct_choice_idx": 2, "direct_answers": ["bathroom", "bathroom", "bathroom", "toilet", "bathroom", "bathroom", "bathroom", "bathroom", "door", "restroom"], "difficult_direct_answer": false, "rationales": ["He is going to the restroom.", "They are at a place where they label the bathrooms in mid evil terms and this is a bathroom.", "He has to go to the bathroom."], "image": "train2014/COCO_train2014_000000142080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288943, "question_id": "5oimF8pX8GaxZkRjuPBqTu", "question": "Why is the man leaning to his left?", "choices": ["to dodge", "to duck", "to flip", "to align"], "correct_choice_idx": 3, "direct_answers": ["hit ball", "to align", "hit ball", "hit ball", "hit ball", "hit ball", "reach ball", "tracking ball", "hitting balls", "reaching ball"], "difficult_direct_answer": false, "rationales": ["The man wants to align.", "The man is trying to put himself into the best position so that he is able to make contact with the ball.", "He is standing as he needs to to hit the ball."], "image": "train2014/COCO_train2014_000000288943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511299, "question_id": "5p7FVp78tdNGh6QHXsSKvH", "question": "Why are clothes hung here?", "choices": ["blew there", "easy storage", "signal", "drying"], "correct_choice_idx": 3, "direct_answers": ["drying", "drying", "drying", "drying", "to dry", "to dry", "boat house", "to dry", "drying", "drying"], "difficult_direct_answer": false, "rationales": ["The clothes are drying.", "They are hanging on a clothesline.", "The wind will dry them more quickly outdoors."], "image": "val2014/COCO_val2014_000000511299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490647, "question_id": "5p9dRppxtTn9AbkorKHZUN", "question": "What holiday is celebrated during this time of the year?", "choices": ["easter", "halloween", "christmas", "thanksgiving"], "correct_choice_idx": 2, "direct_answers": ["christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas"], "difficult_direct_answer": false, "rationales": ["The people are skiing on snow, so it must be winter. halloween and thanksgiving are in the fall, and easter is in the spring.", "During the winter months, snow and skiing is normally at its maximum. a popular holiday during this time is the celebration of christmas.", "Around december 25th the weather is usually cold and when it is cold it can snow."], "image": "train2014/COCO_train2014_000000490647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363067, "question_id": "5pZXBDXkGyWsPBr6S8iSs4", "question": "What country are this breed of cows originally from?", "choices": ["russia", "france", "south korea", "england"], "correct_choice_idx": 3, "direct_answers": ["usa", "texas", "africa", "england", "england", "australia", "scotland", "england", "scotland", "turkey"], "difficult_direct_answer": false, "rationales": ["These yorkshire cows are from england.", "Hereford cows are english.", "The cows seen in the grass are brown cows that are seen in england."], "image": "train2014/COCO_train2014_000000363067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21889, "question_id": "5ptnHwc4q2BP9E4LTU3Ua4", "question": "Why is the man wearing an orange jumpsuit?", "choices": ["punishment", "cosplay", "for fun", "visibility"], "correct_choice_idx": 3, "direct_answers": ["worker", "safety", "visibility", "safety", "safety", "visibility", "safety", "safety", "visibility", "visibility"], "difficult_direct_answer": false, "rationales": ["The man is wearing a bright jumpsuit so that he can be easily seen.", "The man is working near a boat and wearing a jumper.", "The man is working near large boats on a dock so he would want to be seen."], "image": "val2014/COCO_val2014_000000021889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134496, "question_id": "5q45HQoNFadQbKST839HGy", "question": "What religion is associated with her facial decoration?", "choices": ["buddhism", "hinduism", "islam", "judaism"], "correct_choice_idx": 1, "direct_answers": ["hinduism", "hindu", "hindi", "hinduism", "hinduism", "hinduism", "hinduism", "india", "hindu", "hinduism"], "difficult_direct_answer": false, "rationales": ["This is from hinduism.", "The woman has a dot on her head which hindu people often have.", "The dot on the face shows that she married in hindu."], "image": "train2014/COCO_train2014_000000134496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181996, "question_id": "5qeQ6mKm988M9JbSnY7UmB", "question": "What kind of service does the white sign promise?", "choices": ["food prep", "teeth cleaning", "tanning", "gardening"], "correct_choice_idx": 0, "direct_answers": ["chef catering", "catering", "catering", "chef catering", "chef catering", "catering", "food prep", "catering", "catering", "catering"], "difficult_direct_answer": false, "rationales": ["It is a sign for a catering company that would cater events and such.", "A sign advertises a chef.", "The white sign says my chef catering and offers foods for events."], "image": "train2014/COCO_train2014_000000181996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564063, "question_id": "5qir7gVnRmY4KwduF7oqSh", "question": "What safety gear will allow the people standing from worrying about getting fatally shot?", "choices": ["bullet vest", "bulletproof vest", "gun vest", "hard vest"], "correct_choice_idx": 1, "direct_answers": ["bulletproof vest", "blanks", "barricade", "bulletproof vests", "guns", "bulletproof", "steel", "barbed wire", "blanks", "safety cones"], "difficult_direct_answer": true, "rationales": ["This helps protect you from gunfire", "A bulletproof vest is one of the only things that can prevent injury if you're hit by a bullet.", "The person wears a suit that protects them from guns."], "image": "train2014/COCO_train2014_000000564063.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507502, "question_id": "5qj6hJr6Y9rfQkKbyzHnKp", "question": "What is the person in blue coat doing?", "choices": ["sending email", "taking photo", "watching video", "online shopping"], "correct_choice_idx": 1, "direct_answers": ["taking pictures", "taking photo", "taking picture", "taking pictures", "taking photos", "taking picture", "taking photo", "taking photo", "taking pictures", "taking pictures"], "difficult_direct_answer": false, "rationales": ["The person in the blue coat has his camera on and pointed at the red bus. the bus can be seen in view on the screen.", "He is holding a camera up to his face", "The man in the blue coat is taking a picture."], "image": "train2014/COCO_train2014_000000507502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36077, "question_id": "5qrs2p9se9uxcncrTJzHXf", "question": "What is the small mirror in this bus called?", "choices": ["safety", "rear view", "back", "traffic"], "correct_choice_idx": 1, "direct_answers": ["convex mirror", "student mirror", "rear view", "rear view", "rear view", "student mirror", "mirror", "convex mirror", "mirror", "convex"], "difficult_direct_answer": false, "rationales": ["A rear view mirror is very important in any type of vehicle and a bus is no exception. these mirrors are known as \"rear view mirrors\" not \"back mirrors.\"", "This type of mirror is named that because it allows the driver to see behind the bus.", "A mirror in the corner of the bus allows the driver to see passengers better."], "image": "val2014/COCO_val2014_000000036077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158225, "question_id": "5r4rBnWrF8g3aVTTFd5AxX", "question": "What might the temperature be like where they are standing?", "choices": ["dry", "cold", "moist", "humid"], "correct_choice_idx": 3, "direct_answers": ["warm", "75 degrees", "70 degrees", "humid", "warm", "warm", "hot", "humid", "hot", "hot"], "difficult_direct_answer": false, "rationales": ["Greenhouses are kept hot and humid to promote the growth of the plants.", "Plants need moisture and warmth to grow", "Plants need a humid environment to grow."], "image": "train2014/COCO_train2014_000000158225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309404, "question_id": "5rMXfpsLsQhCPEdrAGAWKD", "question": "Why do people eat this thing?", "choices": ["animal abuse", "thanksgiving", "birthday celebration", "meat lover"], "correct_choice_idx": 2, "direct_answers": ["birthday celebration", "celebrations", "gluttony", "party", "birthday", "for birthdays", "birthday", "dessert", "dessert celebration", "birthday"], "difficult_direct_answer": false, "rationales": ["The words beside the cake indicate its purpose.", "People eat cake in commemoration of birthdays.", "This is likely a cake for a person who loves dogs to celebrate getting a year older."], "image": "train2014/COCO_train2014_000000309404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179875, "question_id": "5rh5mQjwqzMqubQsDLuZN6", "question": "What human food would these animals be most willing to eat?", "choices": ["bread", "chocolate", "hot peppers", "steak"], "correct_choice_idx": 0, "direct_answers": ["mice", "bread", "bread", "french fries", "bread", "seeds", "bread", "hot dogs", "bread", "bread"], "difficult_direct_answer": false, "rationales": ["The birds on the beach would probably be interested in eating bread crumbs the most.", "Birds are on the beach near people and birds are often fed bread by pedestrians in public places.", "People are on a beach feeding birds. people often feed birds bread."], "image": "train2014/COCO_train2014_000000179875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581282, "question_id": "5rjybtpozDM8zdXqygEP7B", "question": "What activity are people holding umbrellas taking part in?", "choices": ["standing", "fleeing", "singing", "parade"], "correct_choice_idx": 3, "direct_answers": ["parade", "parade", "parade", "parade", "parade", "parade", "parade", "parade", "parade", "parade"], "difficult_direct_answer": false, "rationales": ["The people are marching down the street and others are watching them.", "They are marching in a parade.", "The people re in a parade."], "image": "train2014/COCO_train2014_000000581282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470174, "question_id": "5sC2JtrYVRASWTBfzmpFGw", "question": "What is in the center of the picture?", "choices": ["statue", "tv", "window", "wood stove"], "correct_choice_idx": 3, "direct_answers": ["coffee table", "wood stove", "table", "table", "coffee table", "table", "table", "coffee table", "table", "table"], "difficult_direct_answer": false, "rationales": ["This is a living room, so the pit in the center is likely a fireplace.", "It is directly opposite the couch, rather than off to one side.", "A little fireplace or stove is there."], "image": "train2014/COCO_train2014_000000470174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280596, "question_id": "5sFKy5R42k5SzaFNSjDxu5", "question": "What profession do the gentlemen in orange wraps belong to?", "choices": ["buddhist monk", "builders", "gardeners", "salesmen"], "correct_choice_idx": 0, "direct_answers": ["monks", "monks", "buddhist monk", "monk", "hinduism", "monks", "religion", "monks", "monk", "monks"], "difficult_direct_answer": false, "rationales": ["These people are monks since they're wearing robes.", "The people are monks.", "Those kinds of people shave their heads and have that kind of wardrobe signifying simplicity."], "image": "train2014/COCO_train2014_000000280596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581199, "question_id": "5sLUtsUfcy8JDVaUccf3QL", "question": "Which topping contains the most calcium?", "choices": ["cheese", "pepper", "meat", "olive"], "correct_choice_idx": 0, "direct_answers": ["cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["Cheese and dairy products contain a lot of calcium and are good for bones.", "It is a dairy product made from milk", "The cheese on top of the pizza is made from milk which is high in calcium."], "image": "train2014/COCO_train2014_000000581199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297210, "question_id": "5sZRu2SkkjtgRhpCXMsbVB", "question": "What made the grooves seen here?", "choices": ["skis", "gophers", "snow groomer", "children"], "correct_choice_idx": 2, "direct_answers": ["rake", "snow groomer", "rake", "ski", "groomers", "skiers", "skis", "snow groomer", "snow machine", "truck"], "difficult_direct_answer": false, "rationales": ["The lines are too perfect to be man made, so it must have been the groomer.", "The person who drags a machine over the snow created the grooves.", "The grooves are from the snow groomer."], "image": "train2014/COCO_train2014_000000297210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209253, "question_id": "5sjG3zHKUxfPZt4twXtJ7f", "question": "Who will serve each person their food?", "choices": ["themselves", "no one", "mother", "dad"], "correct_choice_idx": 0, "direct_answers": ["self serve", "self", "mother", "themselves", "woman", "self service", "man", "woman", "guests", "themselves"], "difficult_direct_answer": false, "rationales": ["Food is spread out along a counter top with serving spoons in each dish.", "The food is set out on the kitchen bar with serving utensils ready for people to get what they want", "The food is laid out in a manner consistent with settings where people assemble their own plates of food based on everything set out."], "image": "train2014/COCO_train2014_000000209253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558286, "question_id": "5sy4a4aH2KeyciCxTfkinS", "question": "How does what he's looking at differ from reading news on a phone?", "choices": ["lacks words", "lacks news", "lacks ads", "on paper"], "correct_choice_idx": 3, "direct_answers": ["paper news", "less recent", "physical newspaper", "written form", "tabloids", "no phone", "on paper", "newspaper", "print", "on paper"], "difficult_direct_answer": true, "rationales": ["He is reading a newspaper.", "He is reading a newspaper.", "The man is reading the paper."], "image": "train2014/COCO_train2014_000000558286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580791, "question_id": "5t6dsqoSzkZuVai7RLNeiq", "question": "What type of water is being served?", "choices": ["spring", "mineral", "distilled", "lemon"], "correct_choice_idx": 1, "direct_answers": ["sparkling", "seltzer", "sparkling", "mineral", "mineral", "clean", "aqua nepi", "bottled", "italian", "acqua nepi"], "difficult_direct_answer": false, "rationales": ["The brand is known for mineral.", "The water is mineral water.", "There is an acqua nepi bottle."], "image": "val2014/COCO_val2014_000000580791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444955, "question_id": "5tTiYRWErXZuxdJoo3TgYx", "question": "Where is the man storing his things?", "choices": ["suitcase", "duffel bag", "purse", "backpack"], "correct_choice_idx": 3, "direct_answers": ["backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack"], "difficult_direct_answer": false, "rationales": ["The man is wearing a bag in the back.", "The man is storing all of his things into a backpack.", "The bag is hanging on both of the man's shoulders and is meant to store things."], "image": "val2014/COCO_val2014_000000444955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181436, "question_id": "5thbuHmvfP9FbyGKSNGQXW", "question": "People in the red car hope to see what today?", "choices": ["car wash", "cyclists", "eclipse", "animals"], "correct_choice_idx": 3, "direct_answers": ["animals", "animals", "wildlife", "giraffe", "animals", "wild animals", "wild animals", "animals", "animals", "animals"], "difficult_direct_answer": false, "rationales": ["The road passes through the animal park.", "The people in the red car are driving through an area where they can see wild animals like giraffes.", "There is a zebra and two giraffes in a field. they are driving through a safari experience."], "image": "train2014/COCO_train2014_000000181436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303508, "question_id": "5tiiFwtSaVQqCfACXVTHoH", "question": "What has been done to the watermelon?", "choices": ["cut", "smashed", "diced", "cooked"], "correct_choice_idx": 0, "direct_answers": ["sliced", "halved", "cut", "halved", "cut", "sliced", "halved", "halved", "halved", "chopped"], "difficult_direct_answer": false, "rationales": ["The watermelon has been sliced in half.", "The watermelon is sliced open.", "You can slice the watermelon."], "image": "train2014/COCO_train2014_000000303508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201928, "question_id": "5tm6xjKsBhHNYELwfkMjHA", "question": "Where is this meal served?", "choices": ["restaurant", "airplane", "picnic", "home"], "correct_choice_idx": 1, "direct_answers": ["airport", "airplane", "airplane", "plane", "airplane", "restaurant", "airplane", "airplane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["The dishes are disposable.", "The meal is served in an airplane because it is on an airplane tray.", "The meal is being served on a plane tray."], "image": "val2014/COCO_val2014_000000201928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534484, "question_id": "5uAhyYNNRXVwhvLeN7Rqmk", "question": "Which country is famous for elephants?", "choices": ["norway", "sydney", "thailand", "dutch"], "correct_choice_idx": 2, "direct_answers": ["thailand", "india", "india", "india", "africa", "botswana", "india", "thailand", "botswana", "thailand"], "difficult_direct_answer": false, "rationales": ["Thailand has a lot of elephants.", "Thailand is a country famous for elephants.", "The men riding on top of the elephants are of asian decent."], "image": "train2014/COCO_train2014_000000534484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401054, "question_id": "5uCxgPHA2A5EWg5RQkKR3V", "question": "What level of skier is this part of the hill designed for?", "choices": ["advanced", "expert", "intermediate", "beginner"], "correct_choice_idx": 3, "direct_answers": ["beginner", "beginner", "beginner", "beginner", "beginner", "intermediate", "rookies", "beginner", "beginner", "beginner"], "difficult_direct_answer": false, "rationales": ["This is a small hill with a minimal slope, it is easy to maneuver and is a good environment to learn and develop ability without high risk. you can also see several children present who are generally just starting to learn.", "The skier is a beginner.", "The hill is very slight so a beginner could handle it."], "image": "train2014/COCO_train2014_000000401054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481422, "question_id": "5uMk5c2PawJFWwMiHhg5X4", "question": "How many spokes in the umbrella?", "choices": ["four", "ten", "eight", "three"], "correct_choice_idx": 2, "direct_answers": ["multiple", "eight", "eight", "eight", "eight", "eight", "eight", "nine", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["One can easily count the spokes that are showing in the underside of the umbrella.", "A lifeguard is sitting under a large umbrella with silver spokes.", "There are eight of them."], "image": "val2014/COCO_val2014_000000481422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115243, "question_id": "5ufsL77mUWPCmZtW8Q4GMu", "question": "What is he doing?", "choices": ["showing off", "waiting turn", "resting", "cleaning poles"], "correct_choice_idx": 1, "direct_answers": ["skiing", "resenting photographer", "ice scatting", "ski lessons", "waiting turn", "skiing", "skating", "crosscountry skiing", "waiting", "skiing"], "difficult_direct_answer": false, "rationales": ["He is standing in one place and looking annoyed.", "The man is waiting for his turn to ski.", "There is nothing to indicate he is doing anything specifically although he does not appear to be in motion and looks like he is in a casual stance."], "image": "val2014/COCO_val2014_000000115243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229481, "question_id": "5utnZeAsA87AdbpmEFPDdK", "question": "What the is the woman to do?", "choices": ["exercise", "sleep", "eat", "travel"], "correct_choice_idx": 2, "direct_answers": ["cooking", "eat", "cooking", "cooking", "cooking", "cooking", "cook", "cooking", "cooking onions", "cook"], "difficult_direct_answer": false, "rationales": ["She is cooking food because she is probably hungry or feeding someone else.", "Technically she's cooking with the plan to do a.", "The picture is showing her cooking food so she can eat it."], "image": "val2014/COCO_val2014_000000229481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88185, "question_id": "5v7NC4yBW3Qjk6dA8dZyd6", "question": "What could be seen on the other side of this post?", "choices": ["mirror", "tv screen", "walk sign", "ads"], "correct_choice_idx": 2, "direct_answers": ["crosswalk", "walk sign", "walk signal", "street name", "crossing", "road signs", "construction", "buildings people", "sign", "pedestrian"], "difficult_direct_answer": true, "rationales": ["The post tells pedestrians when they can walk.", "The signal is pointed towards a cross walk.", "There is a walking sign on the other end of this post."], "image": "train2014/COCO_train2014_000000088185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540529, "question_id": "5vRW5495GoS3GjBZfhLJKa", "question": "What event caused the tree on the left to look so discolored and bare?", "choices": ["paint", "fire", "wind", "water"], "correct_choice_idx": 1, "direct_answers": ["park", "drought", "winter", "drought", "winter", "drought", "trimmed", "drought", "giraffes eating", "fire"], "difficult_direct_answer": false, "rationales": ["A tree is gray and damaged. forest fires are common.", "The fire caused the trees to be brown.", "The tree is black and mostly dead. it looks like it was burned."], "image": "val2014/COCO_val2014_000000540529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561411, "question_id": "5vUNasBuPM4nWBEbU3CYnD", "question": "What does this dog focus on?", "choices": ["time clock", "toys", "screen", "person"], "correct_choice_idx": 3, "direct_answers": ["laptop screen", "camera lens", "laptop", "laptop", "laptop", "computer screen", "computer", "computer", "screen", "person"], "difficult_direct_answer": false, "rationales": ["The dog is looking at the laptop, and it's reflected in its eyes.", "The dog is looking at the laptop screen.", "The dog is staring at the computer."], "image": "val2014/COCO_val2014_000000561411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35726, "question_id": "5vuhodJrpRsC9ErEXEvd4d", "question": "Who wrote a book whose title matches the word at the front top of the boat?", "choices": ["joe hill", "franz kafka", "jack ryan", "jim sturgess"], "correct_choice_idx": 1, "direct_answers": ["franz kafka", "plato", "franz kafka", "kafka", "franz kafka", "kafka", "kafka", "franz kafka", "franz kafka", "franz kafka"], "difficult_direct_answer": false, "rationales": ["He had a book with that word in the title and is known for that work that he did.", "It was about a man changing into a cockroach.", "This is the only one of the four who is an an author."], "image": "val2014/COCO_val2014_000000035726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232143, "question_id": "5wEsn4M4Gbhg3c5Tj4AQqp", "question": "What do you hang with the thing sitting on the folded blanket?", "choices": ["clothes", "hats", "flowers", "picture"], "correct_choice_idx": 0, "direct_answers": ["clothing", "clothes", "clothing", "clothes", "clothes", "clothing", "clothes", "clothes", "clothes", "clothes"], "difficult_direct_answer": false, "rationales": ["It's a hanger and that is its purpose. you might also use it decoratively with c.", "The item on the folded blanket is a hanger. these are most notably used to hang clothes.", "You can hang it with the clothes."], "image": "train2014/COCO_train2014_000000232143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260901, "question_id": "5wQaVq2R7ME2TRauYLU5sD", "question": "What word is written on the blue truck?", "choices": ["reuse", "reduce", "quickly", "orange"], "correct_choice_idx": 1, "direct_answers": ["reduce", "reduce", "reduce", "reduce", "letter", "reduce", "reduce", "reduce", "diesel", "letter"], "difficult_direct_answer": false, "rationales": ["Reduce is on the side of the blue truck.", "The word is reduce.", "It has the word spelled out. it's probably promoting to recycle and not get as much trash."], "image": "train2014/COCO_train2014_000000260901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333665, "question_id": "5wRK3bCkWmT4jFyBT5RYYJ", "question": "Why are people looking at the ground?", "choices": ["dog", "bright", "tripping hazard", "slippery"], "correct_choice_idx": 1, "direct_answers": ["being careful", "sun", "trip avoidance", "avoid conversation", "just walking", "bright", "uneven pavement", "avoid sunlight", "sunny", "it's sunny"], "difficult_direct_answer": true, "rationales": ["The guy is turning away from the sun.", "The people are looking at the ground because the sun too bright for their eyes.", "The sun is shining in the people's eyes."], "image": "val2014/COCO_val2014_000000333665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159475, "question_id": "5wfXUrxGf3ypZPzkroomEX", "question": "Why is he leaning forward?", "choices": ["is falling", "maintaining balance", "was surprised", "bad back"], "correct_choice_idx": 1, "direct_answers": ["maintaining balance", "balance", "steering", "balance", "balance", "maintain balance", "balance", "maintain balance", "balance", "maintain balance"], "difficult_direct_answer": false, "rationales": ["The man is trying not to fall.", "He is trying to have balance so he won't fall.", "A surfer is on a surfboard. people lean to gain balance."], "image": "train2014/COCO_train2014_000000159475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280733, "question_id": "5wmFQY4zgfbkLezjSVENXD", "question": "If this bus is in its hometown what is the closest destination?", "choices": ["turkey", "japan", "sweden", "usa"], "correct_choice_idx": 2, "direct_answers": ["buurtbus", "buurt", "buurtbus", "sweden", "buurt", "copenhagen", "amsterdam", "buurt", "station", "next corner"], "difficult_direct_answer": false, "rationales": ["This bus company is based in sweden.", "The bus is in sweden.", "Buurtbus is written on top of the bus which is likely the bus company. the company is from the netherlands which is closest to answer a of the options provided."], "image": "val2014/COCO_val2014_000000280733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23630, "question_id": "5xDoE4kFWV3hnSwbgGZeaH", "question": "What is there biggest predator?", "choices": ["elephants", "lions", "crocodiles", "rhinos"], "correct_choice_idx": 1, "direct_answers": ["lion", "lion", "lions", "lions", "lions crocodiles", "lion", "lions", "lions", "lion", "lion"], "difficult_direct_answer": false, "rationales": ["Traditionally lions are the biggest predators where zebras live.", "Lions are the only animals which are both predatory to zebras and inhabit the same habitats as zebras.", "The lion is the most feared predator by whole animals."], "image": "train2014/COCO_train2014_000000023630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348948, "question_id": "5xeiBbsvU22sNzXBqGpZeQ", "question": "How many people are probably sitting down to the meal?", "choices": ["four", "one", "two", "three"], "correct_choice_idx": 1, "direct_answers": ["one", "single", "one", "one", "two", "one", "single", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is just a plate for one.", "The salad can feed a single person.", "There is only one plate here."], "image": "train2014/COCO_train2014_000000348948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240163, "question_id": "5xhVRW4BhziLvmqaNWykMB", "question": "What is the man taking?", "choices": ["selfie", "tray", "his temperature", "ticket"], "correct_choice_idx": 0, "direct_answers": ["selfie", "selfie", "selfie", "self photo", "selfie", "self photo", "selfie", "selfie", "selfie", "self photo"], "difficult_direct_answer": false, "rationales": ["This is the most likely option and the only one that would apply in this setting.", "The man is taking a picture of himself.", "He's taking a picture of himself."], "image": "train2014/COCO_train2014_000000240163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144618, "question_id": "5xjhLGKTLaWQcdiazHmz93", "question": "What type of hat is the man in the air wearing?", "choices": ["beanie", "fedora", "baseball cap", "derby"], "correct_choice_idx": 2, "direct_answers": ["cap", "baseball cap", "baseball cap", "baseball cap", "baseball", "baseball cap", "ballcap", "cap", "baseball cap", "baseball"], "difficult_direct_answer": false, "rationales": ["The hat is a baseball cap.", "The man is wearing a hat with a brim and an adjustable back.", "By the design of the hat it is easy to tell what it is and the type of hat."], "image": "train2014/COCO_train2014_000000144618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109869, "question_id": "5xpP9QMw2vgxbV9LVqnzW5", "question": "Cats use what body part to hold on tightly to an object while jumping?", "choices": ["hoofs", "nose", "ears", "claws"], "correct_choice_idx": 3, "direct_answers": ["paws", "paws", "claws", "paws", "claws", "claws", "claws", "claws", "claws", "paws"], "difficult_direct_answer": false, "rationales": ["The cat can cling to objects with its paws.", "Cats have claws that can be used to grip on to things when climbing and jumping.", "Cats use claws."], "image": "val2014/COCO_val2014_000000109869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5643, "question_id": "5xzddKXdtsrb3NYZa3jRPx", "question": "Where is the cat located at?", "choices": ["on table", "under table", "floor", "under chair"], "correct_choice_idx": 1, "direct_answers": ["under chair", "chair", "office chair", "on chair", "under table", "chair", "chair", "under table", "under desk", "chair"], "difficult_direct_answer": false, "rationales": ["The picture depicts the cat on the chair under the computer desk.", "The cat is lying on a chair which is under a table.", "The animal is sitting in a chair right underneath the table."], "image": "train2014/COCO_train2014_000000005643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566704, "question_id": "5yPaKMqiwYLwRh7yeQji9G", "question": "What garment is being worn by this man?", "choices": ["shirt", "diaper", "tightie whities", "none"], "correct_choice_idx": 1, "direct_answers": ["diaper", "underwear", "diaper", "diaper", "diaper", "diaper", "underwear", "diaper", "diaper", "diaper"], "difficult_direct_answer": false, "rationales": ["The man is wearing a diaper.", "He is in an adult diaper", "It is plastic and has adhesive tabs on the sides"], "image": "train2014/COCO_train2014_000000566704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278753, "question_id": "5ycvdcniKq6sKtsQtedCp6", "question": "What is the item under the counter?", "choices": ["plunger", "waste basket", "mop", "sink"], "correct_choice_idx": 1, "direct_answers": ["trash bin", "trash can", "trash bin", "trash can", "garbage can", "waste basket", "trashcan", "trash can", "trash can", "trash can"], "difficult_direct_answer": false, "rationales": ["The sink is above the counter. there is no plunger or mop.", "The item is a waste basket.", "There is a metal waste basket under the counter."], "image": "train2014/COCO_train2014_000000278753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278744, "question_id": "5yhPoXdLkzJz6ydcqso6qN", "question": "What is the northern neighboring country of this country?", "choices": ["canada", "usa", "mexico", "guatemala"], "correct_choice_idx": 0, "direct_answers": ["canada", "tennis", "canada", "canada", "canada", "canada", "canada", "tennis", "canada", "germany"], "difficult_direct_answer": false, "rationales": ["The sign in the background indicates that this is the us open series. mexico and guatemala are to the south of the united states.", "The man is playing tennis at the us open so i chose the country directly north of the united states of america.", "The country is canada."], "image": "train2014/COCO_train2014_000000278744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515214, "question_id": "5yhs9tWcuQJrHJKfMHVieb", "question": "What substance flies up around this persons right shoe?", "choices": ["clay", "tar", "coal", "dried paint"], "correct_choice_idx": 0, "direct_answers": ["dirt", "sand", "clay", "chalk", "dirt", "clay", "soil", "red sand", "clay", "clay"], "difficult_direct_answer": false, "rationales": ["It is hard packed red soil", "The tennis court has a clay surface and some of it has kicked up around the player's shoe.", "This is the only option which is also a surface tennis is played on."], "image": "val2014/COCO_val2014_000000515214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61215, "question_id": "5ypQq5VMmXNVLqjaJunspc", "question": "This style of food is covered in what substance that transfers to your hands easily?", "choices": ["grease", "salt", "meat", "pepper"], "correct_choice_idx": 0, "direct_answers": ["grease", "grease", "hotdog bun", "foil", "paper", "grease", "greasy", "grease", "fast", "grease"], "difficult_direct_answer": false, "rationales": ["Grease is a substance that is on fast foods and transfers easily to hands. the foods seen here are fast food and have visible grease.", "The fries in this picture have a lightly reflective sheen indicative of being fried in fat and grease.", "This causes stains on clothes and is sometimes hard to wipe off."], "image": "train2014/COCO_train2014_000000061215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102090, "question_id": "5ysNKP5pDZqq9oHFzYV2HV", "question": "What are these people doing?", "choices": ["cleaning up", "drinking wine", "complaining", "arguing"], "correct_choice_idx": 1, "direct_answers": ["drinking", "drinking wine", "drinking wine", "wine party", "drinking", "drinking wine", "talking", "drinking", "standing", "drinking wine"], "difficult_direct_answer": false, "rationales": ["The people are drinking wine.", "These people are standing and drinking.", "These people have cups with stems with red liquid."], "image": "train2014/COCO_train2014_000000102090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262006, "question_id": "5yznUQmR7MUVn8RT4kBisk", "question": "Who are inside cars driving here?", "choices": ["tourists", "prisoners", "zoo keepers", "hunters"], "correct_choice_idx": 0, "direct_answers": ["people", "tourists", "humans", "tourists", "tourists", "humans", "tourists", "tourists", "people", "drivethru safari"], "difficult_direct_answer": false, "rationales": ["There are giraffes roaming freely in a park area and the people inside of the cars are observing the giraffes at the zoo.", "The people in the cars are visiting.", "They are visiting for fun in order to see the animals, not working or trophy-hunting."], "image": "val2014/COCO_val2014_000000262006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143824, "question_id": "5z8v3XrqwGqaq9pJLDGaKt", "question": "What animal is on the television screen?", "choices": ["dog", "elephant", "lion", "sheep"], "correct_choice_idx": 0, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["A dog is shown on the tv screen.", "The screen of the tv is clearly visible and has four-legged animal with features common to a dog.", "There is a saint bernard pooch visible on the screen."], "image": "val2014/COCO_val2014_000000143824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116678, "question_id": "5zaUKyXfwDt3Dn6VzTBaVC", "question": "What level is this room on?", "choices": ["second", "ground", "first", "basement"], "correct_choice_idx": 1, "direct_answers": ["first", "ground", "ground", "first", "ground", "first floor", "ground", "second", "ground", "first floor"], "difficult_direct_answer": false, "rationales": ["It is on the same level as the street outside.", "The car in the window makes this obvious. that said, it could be an a level c room.", "This level is on the ground floor because you can see the street out from the window"], "image": "val2014/COCO_val2014_000000116678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488857, "question_id": "5zrmKNEKMK3CtzvmwmvtaE", "question": "Why does the green box have wires?", "choices": ["vandalism", "phone line", "electric service", "hold steady"], "correct_choice_idx": 2, "direct_answers": ["power supply", "delivers electricity", "electricity", "electrical box", "electricity", "conduct electricity", "electric service", "telephone", "electrical box", "telephones"], "difficult_direct_answer": false, "rationales": ["Utility boxes are on poles with buildings behind.", "Its for the electric to the house.", "The box is attached to a electric power line."], "image": "train2014/COCO_train2014_000000488857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327726, "question_id": "62hUCubHvGnW5gA9RXtmZB", "question": "What is the black couch against the wall made out of?", "choices": ["wool", "denim", "nylon", "leather"], "correct_choice_idx": 3, "direct_answers": ["leather", "pleather", "leather", "leather", "leather", "pleather", "leather", "leather", "leather", "leather"], "difficult_direct_answer": false, "rationales": ["This is a leather couch judging by the shiny appearance of the material.", "This is the most common and durable material that furniture might be made with.", "Black is the color of cow hide."], "image": "train2014/COCO_train2014_000000327726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227186, "question_id": "62ivimEkxEN4fbQdj2UCK6", "question": "What is the man doing on the bike?", "choices": ["talking", "drinking", "eating", "waiving"], "correct_choice_idx": 0, "direct_answers": ["talking", "using phone", "communicating", "talking", "phone call", "phone call", "phoning", "on cellphone", "talking", "taking call"], "difficult_direct_answer": false, "rationales": ["A man is riding a bike and holding a phone up to his ear.", "The man has his phone up to his ear so it is likely he's talking on it.", "The man is on his cell phone held up to his ear."], "image": "train2014/COCO_train2014_000000227186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259186, "question_id": "62sa83u4NQLDtMznqPCADt", "question": "What kind of support sleeves or braces is one player wearing?", "choices": ["ankle", "knee", "elbow", "wrist"], "correct_choice_idx": 1, "direct_answers": ["knee braces", "play", "knee braces", "knee brace", "knee pads", "knee pads", "kneepads", "knee", "knee pads", "knee pads"], "difficult_direct_answer": false, "rationales": ["They're also known as support braces.", "She does not have braces on her arms. they are near the middle of her legs.", "The person has pads on their knees."], "image": "train2014/COCO_train2014_000000259186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61582, "question_id": "62skyG4CB5u7UJ6hZitvxQ", "question": "How many individual sandwich pieces are in the image?", "choices": ["four", "seven", "nine", "two"], "correct_choice_idx": 3, "direct_answers": ["five", "two", "five", "two", "two", "five", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The sandwich is very colorful.", "One sandwich piece is next to another piece.", "A sandwich is cut into halves on a plate."], "image": "train2014/COCO_train2014_000000061582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304008, "question_id": "62tSCJPLWrrRLHT73TQVNx", "question": "What action is the man performing with the frisbee?", "choices": ["throwing", "blocking", "slapping", "catching"], "correct_choice_idx": 0, "direct_answers": ["throw", "throwing", "throwing", "throw", "throwing", "throwing", "throwing", "throwing it", "throwing it", "tossing"], "difficult_direct_answer": false, "rationales": ["The man is tossing the frisbee forward.", "The man is tossing the frisbee.", "The man is throwing the frisbee."], "image": "val2014/COCO_val2014_000000304008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18792, "question_id": "63EmTBpWhyL37UmiBxyEwf", "question": "Where are the elephants located at?", "choices": ["themed park", "zoo", "circus", "wilderness"], "correct_choice_idx": 3, "direct_answers": ["jungle", "wild", "savannah", "africa", "savannah", "outdoors", "africa", "wilderness", "game pack", "outdoors"], "difficult_direct_answer": false, "rationales": ["Elephants roam in an open area without fences.", "The wilderness as shown by the elephants.", "There is no fence."], "image": "train2014/COCO_train2014_000000018792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50992, "question_id": "63KLCcmvor5xBjGLaML4zY", "question": "What did the man do to get the plastic bag?", "choices": ["beg", "shop", "steal", "spin"], "correct_choice_idx": 1, "direct_answers": ["bought something", "shopped", "went store", "purchased something", "buy from iceland", "groceries", "bought something", "bought it", "shop", "shop"], "difficult_direct_answer": false, "rationales": ["The bag has the name of a store on it and these plastic bags are given with purchases", "The man went shopping.", "The bag is a plastic bag that one gets when shopping at a grocery store. there is the name of a store visible on the bag so they likely were at the store shopping in order to receive this."], "image": "train2014/COCO_train2014_000000050992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319682, "question_id": "63Mvm3Fb6C9B4i7Tdr9adu", "question": "When these animals move what might one hear?", "choices": ["screaming", "thunder", "trumpets", "bells"], "correct_choice_idx": 3, "direct_answers": ["bells", "chains", "neigh", "hoofbeats", "snow fall", "bells", "bells", "bells", "hoof sounds", "bells"], "difficult_direct_answer": false, "rationales": ["The horses are hooked up to pull a sleigh. it is common to use sleigh bells for rides in the winter.", "The metal on their reins often makes a jingling sound.", "There are hollow cups that make sounds on the horse harness."], "image": "train2014/COCO_train2014_000000319682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190566, "question_id": "63zDwhxTLqCFX8WEq7hHYd", "question": "What color jacket is the person wearing closer to another person?", "choices": ["green black", "yellow", "yellow black", "yellow green"], "correct_choice_idx": 2, "direct_answers": ["blue", "yellie", "yellow", "yellow", "yellow", "yellow black", "yellow", "yellow", "yellie", "yellow"], "difficult_direct_answer": false, "rationales": ["The child has on a black and yellow coat.", "Their jackets are yellow and black.", "The jacket is yellow."], "image": "train2014/COCO_train2014_000000190566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217133, "question_id": "64QJvGSC8ZKdaADDviuydx", "question": "What hobby does the car owner enjoy?", "choices": ["skating", "painting", "surfing", "biking"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "racing", "racing", "surfing"], "difficult_direct_answer": false, "rationales": ["There is a board for a water sport on the roof of the car.", "A surfboard is tied on top of the roof. people tie surfboards to the roof when they want to go surfing.", "There is a surfboard."], "image": "val2014/COCO_val2014_000000217133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362309, "question_id": "64uQQoD77PbkA2EsAtfNZ7", "question": "In what nation is this bus found?", "choices": ["dominica", "philippines", "india", "thailand"], "correct_choice_idx": 1, "direct_answers": ["philippines", "dominican republic", "dominican republic", "philippines", "phillipines", "phillipines", "usa", "philippines", "usa", "dominica"], "difficult_direct_answer": false, "rationales": ["The dominion bus lines brand is found in the philippines.", "The nation is the philippines.", "This bus is from the philippines based on the colors of the designs."], "image": "train2014/COCO_train2014_000000362309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77628, "question_id": "64xmFnxwXZjLCCvpC4G4rp", "question": "The lights of the motorcycles are reflecting off the pavement because of what reason?", "choices": ["snow", "slush", "rain", "sleet"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "rain", "rain", "wet pavement", "wet ground", "wet", "it's wet", "shade", "rain"], "difficult_direct_answer": false, "rationales": ["The lights reflect the rain.", "The lights are reflecting the wet puddles on the street.", "The street is not wet under the bridge."], "image": "val2014/COCO_val2014_000000077628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128428, "question_id": "65HUddHfX9yEh9urSiG4ep", "question": "The company whose name appears on the can was founded in what year?", "choices": ["1886", "1812", "1940", "1922"], "correct_choice_idx": 0, "direct_answers": ["1886", "1886", "1900", "1886", "early 1900's", "1886", "1886", "del monte", "1886", "1886"], "difficult_direct_answer": false, "rationales": ["Del monte was started in 1886.", "Del monte was founded in san francisco in 1886.", "This is when the company started"], "image": "train2014/COCO_train2014_000000128428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137407, "question_id": "65MEFzxDqyYbes9Drz8drg", "question": "What American city is the bus most likely pictured in?", "choices": ["detroit", "chicago", "d.c", "philadelphia"], "correct_choice_idx": 2, "direct_answers": ["metrobus", "metrobus", "glover park", "washington dc", "d.c", "detroit", "washington", "michigan", "washington dc", "washington dc"], "difficult_direct_answer": false, "rationales": ["The city is dc.", "The license plate on the bus is from washington dc and dupont circle station is part of their metro subway system.", "Glover park is in washington dc."], "image": "train2014/COCO_train2014_000000137407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9039, "question_id": "65TbtxK3qKPHQNJwiN9GUi", "question": "What is the blue back piece of the plane called?", "choices": ["fuselage", "rotor", "tail pipe", "fin"], "correct_choice_idx": 3, "direct_answers": ["tail", "tail", "tail", "tail", "tail", "tail", "tail", "tail", "tail", "fin"], "difficult_direct_answer": false, "rationales": ["It is aerodynamic and helps to steer the plane.", "The rotor is a working part; the tail pipe is on a car; and a fuselage is the main part of the plane.", "The triangular upwards facing extension in the rear of these planes are called fins."], "image": "train2014/COCO_train2014_000000009039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268798, "question_id": "65qn4YcfGdMwZF3kopfeTY", "question": "Why is the man under the plane?", "choices": ["is lost", "stealing plane", "is passenger", "maintenance"], "correct_choice_idx": 3, "direct_answers": ["getting ready", "checking engine", "checking engine", "checking mechanics", "repair", "exiting", "servicing", "repair", "closing door", "maintenance"], "difficult_direct_answer": false, "rationales": ["This maintenance worker is doing some work on the front section of the plane.", "The man is wearing a safety vest and is near the landing gear of a commercial plane.", "He is wearing a bright vest so that others can easily see him."], "image": "train2014/COCO_train2014_000000268798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303556, "question_id": "664HUsSSVeZUyDojPWbKjJ", "question": "Who is throwing the frisbee?", "choices": ["boy", "girl", "man", "woman"], "correct_choice_idx": 2, "direct_answers": ["man", "man", "man", "person", "man", "man", "person", "man", "man", "man"], "difficult_direct_answer": false, "rationales": ["The frisbee is coming to the woman. the man is watching it land.", "It is a male near the water who threw it", "The man is tossing the frisbee to the woman."], "image": "val2014/COCO_val2014_000000303556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434354, "question_id": "66GvCxzgWsgrHAJtu54F9H", "question": "What can often be found under the tree seen here?", "choices": ["mice", "elves", "dogs", "gifts"], "correct_choice_idx": 3, "direct_answers": ["presents", "presents", "presents", "presents", "presents", "gifts", "presents", "presents", "presents", "presents"], "difficult_direct_answer": false, "rationales": ["Gifts are traditionally found under christmas trees such as this.", "People put christmas presents under the tree until it's time to open them.", "The tree is a christmas tree and tradition has people and maybe even santa putting presents under the tree to be opened christmas morning."], "image": "train2014/COCO_train2014_000000434354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306160, "question_id": "66WxJw8hV4KqkaS7uNb6q7", "question": "How does the woman feel?", "choices": ["cool", "wet", "cold", "hot"], "correct_choice_idx": 0, "direct_answers": ["happy", "happy", "pensive", "refreshed", "happy", "happy", "peaceful", "happy", "lonely", "cool"], "difficult_direct_answer": false, "rationales": ["She is under the tree in the shade", "She is in the shade and by the water so she is likely feeling cool.", "She is by the water, which means there is probably a gentle breeze."], "image": "train2014/COCO_train2014_000000306160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314572, "question_id": "66v4k2dXxjoS2ZfobydVsi", "question": "Why is the animal that is hooked up to the cart doing?", "choices": ["racing", "eating", "waiting", "drinking"], "correct_choice_idx": 2, "direct_answers": ["waiting", "horse", "pulling", "waiting", "dragging", "waiting", "pulling", "pull cart", "pulling", "waiting"], "difficult_direct_answer": false, "rationales": ["The animal--a horse--hooked up to the cart is just standing there patiently, not doing anything, and his idle nature suggests that he is just waiting for something and/or someone.", "There is a horse hooked up to the cart. it is standing still, not racing, eating, or drinking.", "The owner of the the animal will soon return and he will tow the owner somewhere."], "image": "train2014/COCO_train2014_000000314572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123286, "question_id": "66wzokT5jNqhT2EKwRLb5p", "question": "What is the relationship between the two players?", "choices": ["competitors", "coworkers", "strangers", "teammates"], "correct_choice_idx": 3, "direct_answers": ["partners", "partners", "tennis partners", "teammates", "teammate", "teammates", "teammates", "teammates", "teammates", "team mates"], "difficult_direct_answer": false, "rationales": ["The players are on the same side of the court.", "The two players are standing on the same side of the net which means they are a doubles team.", "The players are teammates."], "image": "train2014/COCO_train2014_000000123286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476517, "question_id": "67BgqQoFtees4ouCx3TPFj", "question": "What bottled water company shares the same name as the plane?", "choices": ["dasani", "delta", "poland spring", "fiji"], "correct_choice_idx": 3, "direct_answers": ["fiji", "fiji", "fiji", "fiji", "fiji", "fiji", "fiji", "fiji", "fiji", "fiji"], "difficult_direct_answer": false, "rationales": ["The plane says fiji which is also a bottled water company.", "There is a bottled water with the same name as on the plane.", "Fiji is also a bottled water."], "image": "train2014/COCO_train2014_000000476517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94770, "question_id": "67KVXPEL2WnkCsJzJWA9qF", "question": "The inflatable display is meant to simulate which winter sport?", "choices": ["skiing", "snowboarding", "curling", "ice skating"], "correct_choice_idx": 1, "direct_answers": ["snowboard", "snowboarding", "snowboarding", "snowboard", "snowboarding", "snowboarding", "skiing", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["This is the only one of these sports done with a board on a slope.", "The inflatable is snowboarding.", "He is in a simulation game."], "image": "train2014/COCO_train2014_000000094770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308536, "question_id": "67LREdes5Ae4nDsLgG6wGw", "question": "Which one of these items would pair well with the items in the photo?", "choices": ["monitor", "clock", "fireplace", "pillow"], "correct_choice_idx": 0, "direct_answers": ["monitor", "computer", "computer monitor", "monitor", "keyboard", "mouse", "computer", "mousepad", "monitor", "computer"], "difficult_direct_answer": false, "rationales": ["A pile of keyboards and computer mice are visible. monitors go with keyboards and mice.", "A keyboard would need a monitor to work.", "The photographed items are keyboards and computer mouses. clocks, fireplaces, and pillows are unrelated to computers or technology."], "image": "train2014/COCO_train2014_000000308536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488785, "question_id": "67TT7iWo9RxUFsZFBTEuXt", "question": "What religion is practised in the visible building?", "choices": ["judaism", "hinduism", "islam", "christianity"], "correct_choice_idx": 3, "direct_answers": ["christianity", "christianity", "catholic", "catholic", "puthaa", "christianity", "christianity", "christianity", "christianity", "catholic"], "difficult_direct_answer": false, "rationales": ["There is a steeple.", "Christianity is the only religion to use steeples.", "A church steeple is visible outside of a window."], "image": "val2014/COCO_val2014_000000488785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202782, "question_id": "67W4Qh8gcbDgEohGQyXM6h", "question": "The woman is doing what?", "choices": ["crossing arms", "running", "stretching", "jumping jacks"], "correct_choice_idx": 0, "direct_answers": ["crossing arms", "crossing arms", "walking", "being cold", "walking", "embracing herself", "standing", "standing", "walking", "crossing arms"], "difficult_direct_answer": false, "rationales": ["A woman is standing near a table with her arms folded across the front of her body.", "Her arms are still and she's holding them in front of her.", "She is standing there with her arms over one another."], "image": "train2014/COCO_train2014_000000202782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386272, "question_id": "67iM2xiFmNkxLz2ZWRjMqZ", "question": "What are the people in line concerned about?", "choices": ["buying tickets", "reclaiming baggage", "buying food", "making reservations"], "correct_choice_idx": 1, "direct_answers": ["baggage", "their luggage", "boarding", "wait time", "missing flight", "reclaiming baggage", "boarding", "baggage", "lost baggage", "boarding"], "difficult_direct_answer": false, "rationales": ["They are getting luggage.", "There is a sign over the counter that says so.", "Looks like they are all checking in."], "image": "train2014/COCO_train2014_000000386272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132223, "question_id": "683Vja3KkJntGjHUq5EhbL", "question": "What athlete plays the same sport the man is playing?", "choices": ["kemba walker", "dennis rodman", "michael jackson", "aaron judge"], "correct_choice_idx": 3, "direct_answers": ["babe ruth", "jalen battles", "freeman", "mike trout", "lou gerri", "baseball player", "baseball player", "aaron judge", "aaron judge", "hank aaron"], "difficult_direct_answer": false, "rationales": ["Aaron judge is great at baseball.", "The man is playing baseball based on the screen display and how he is holding the remote. answer a is a baseball player.", "Aaron judge plays baseball."], "image": "val2014/COCO_val2014_000000132223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159714, "question_id": "686KmzimvcdkHHw78Ja9Mx", "question": "Why are the animals in the enclosed area?", "choices": ["to sell", "to judge", "to trim", "to ride"], "correct_choice_idx": 1, "direct_answers": ["being judged", "livestock competition", "show", "for showcase", "livestock show", "judging", "safety", "to judge", "being judged", "cattle competition"], "difficult_direct_answer": true, "rationales": ["The animals are lined up and there is a man inspecting them.", "The sign indicates what is being done to the livestock.", "The animals are in an enclosed area because they are being judged in a contest."], "image": "train2014/COCO_train2014_000000159714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221004, "question_id": "68VX3Bboq3YnzuArh7W56G", "question": "The yellow and green objects are the same what?", "choices": ["age", "caliber", "species", "model"], "correct_choice_idx": 2, "direct_answers": ["bananas", "banana", "fruit", "species", "fruit", "fruit", "bananas", "bananas", "fruit", "fruit banana"], "difficult_direct_answer": false, "rationales": ["Yellow and green bananas are all around.", "It is only the color that had change but there of the same species.", "Bananas are green until they are ripe when they turn yellow."], "image": "train2014/COCO_train2014_000000221004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535418, "question_id": "68ZBe8RVwCQVcqpvxXr5um", "question": "Where is the boy visiting?", "choices": ["zoo", "jungle", "yard", "school"], "correct_choice_idx": 0, "direct_answers": ["zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo"], "difficult_direct_answer": false, "rationales": ["He is at the zoo looking at the animals.", "The boy is looking at an animal that is in an enclosure.", "There is a bear visible in an enclosure as would be the case in a zoo. zoos have such enclosures for animal habitats."], "image": "train2014/COCO_train2014_000000535418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304434, "question_id": "68ZNYV46UWXjevbRH2L7Pf", "question": "What company designed this operating system?", "choices": ["apple", "samsung", "google", "microsoft"], "correct_choice_idx": 3, "direct_answers": ["microsoft", "microsoft", "microsoft", "microsoft", "nicrosoft", "microsoft", "microsoft", "microsoft", "microsoft", "microsoft"], "difficult_direct_answer": false, "rationales": ["This is windows", "The microsoft logo is quite distinctive and recognizable on the laptop screen shown.", "As seen on the laptop the microsoft logo is being shown."], "image": "train2014/COCO_train2014_000000304434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410068, "question_id": "69Q4kReG6MdG4bcmnBVYM4", "question": "In how many directions or orientations are cars parked on either side of the street here?", "choices": ["two", "four", "one", "three"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two 2", "two", "one", "two", "one", "left"], "difficult_direct_answer": false, "rationales": ["Cars are parked facing forward and backward.", "As seen by looking at the directions of the cars on both sides of the street.", "The street only has two sides and on either side there are cars facing toward the top or toward the bottom of the image. there are no cars facing another direction."], "image": "val2014/COCO_val2014_000000410068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329931, "question_id": "69ZxcZekY2wcwGmb696hhp", "question": "What lights up this girls lower face?", "choices": ["spotlight", "refrigerator light", "overhead light", "flashlight"], "correct_choice_idx": 1, "direct_answers": ["fridge light", "refrigerator light", "refrigerator", "refrigerator", "refrigerator", "fridge", "refrigerator", "refrigerator", "refrigerator light", "refrigerator light"], "difficult_direct_answer": false, "rationales": ["The room is dark. the light is coming from inside the appliance.", "There is a light being emitted from the refrigerator.", "The fridge light is lit."], "image": "val2014/COCO_val2014_000000329931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505967, "question_id": "69ha53bzcMcJ26B3dnMuut", "question": "In which city may you find this street?", "choices": ["new york", "las vegas", "las angeles", "reno"], "correct_choice_idx": 1, "direct_answers": ["new york", "new york", "las vegas", "nyc", "las vegas", "new york", "new york", "las vegas", "new york", "vegas"], "difficult_direct_answer": false, "rationales": ["The are has a wide roads to allow many vehicles at the same time.", "There is a sign that tells the city", "This is in the city that never sleeps."], "image": "val2014/COCO_val2014_000000505967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533619, "question_id": "69iRfTv7nYSzBzt2KqFJPJ", "question": "What are the women intending to bake?", "choices": ["bread", "pie", "pizza", "meat"], "correct_choice_idx": 1, "direct_answers": ["cafe", "pies", "quiche", "pizza", "pies", "pie", "pie", "pie", "pie", "pies"], "difficult_direct_answer": false, "rationales": ["The two women have made a circular item stuffed with filling.", "They have beautiful desserts.", "They have pies in their hands."], "image": "train2014/COCO_train2014_000000533619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145369, "question_id": "69ykbzFbnfmLX9wDAceZSu", "question": "What are the white objects near the elephants mouth made of?", "choices": ["plastic", "carbon", "ivory", "steel"], "correct_choice_idx": 2, "direct_answers": ["ivory", "ivory", "ivory", "ivory", "ivory", "ivory", "ivory", "ivory", "ivory", "ivory"], "difficult_direct_answer": false, "rationales": ["Elephant tusks are made of this precious mineral.", "The elephants have tusks made out of ivory near their mouth.", "The objects are ivory."], "image": "val2014/COCO_val2014_000000145369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174019, "question_id": "6AD5baPWWUieiFnsXe8rwc", "question": "Is it safe to cross here across division street at this exact time?", "choices": ["unknown", "no", "perhaps", "yes"], "correct_choice_idx": 1, "direct_answers": ["no", "no", "no", "two hours", "no", "yes", "no", "no", "no", "no"], "difficult_direct_answer": false, "rationales": ["The hand on the sign means \"don't cross.\".", "There is a visible sign indicating if it is safe for pedestrians to proceed and it is showing the symbol synonymous for communicating answer a.", "The crosswalk indicates it is not safe to cross the street."], "image": "val2014/COCO_val2014_000000174019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187072, "question_id": "6AGhA6wrMnJVpvYjwnKwQY", "question": "What race is this family?", "choices": ["asia", "multi-race", "white", "black"], "correct_choice_idx": 1, "direct_answers": ["asian", "asian", "asian", "asian", "asian", "multi-race", "asian", "asian", "asian", "asian"], "difficult_direct_answer": false, "rationales": ["They look like they are asian.", "A white and an asian kid are playing together.", "They appear to be of the asian race."], "image": "val2014/COCO_val2014_000000187072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506327, "question_id": "6AQzCx89PbVAFDZhSsvPW4", "question": "What poisonous gas can be produced here?", "choices": ["arsine", "carbon monoxide", "hydrogen sulfide", "hydrogen fluoride"], "correct_choice_idx": 1, "direct_answers": ["carbon monoxide", "carbon monoxide", "smoke", "carbon monoxide", "co", "carbon monoxide", "carbon monoxide", "carbon dioxide", "co", "carbon dioxide"], "difficult_direct_answer": false, "rationales": ["Fireplaces can produce carbon monoxide.", "There is a fireplace visible. if used, a byproduct of fire is answer a.", "There is a fireplace in the bedroom which can produce carbon monoxide if there isn't ventilation."], "image": "train2014/COCO_train2014_000000506327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23108, "question_id": "6AVerNHAnco5VnstWEVkAQ", "question": "What are they showing with this display?", "choices": ["possibilities", "randomness", "contrast", "colors"], "correct_choice_idx": 2, "direct_answers": ["size differences", "flower vase", "flower", "pitcher", "jug", "contrast", "water pitcher", "pitcher", "glass", "succulent"], "difficult_direct_answer": true, "rationales": ["They are using two polar opposites.", "The colour and texture of the plant compared to the jug, shows a great differentiator or contrast.", "These objects have different textures and materials"], "image": "train2014/COCO_train2014_000000023108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369961, "question_id": "6AgTF76i8RAVM9L6P8zVDz", "question": "What does the boy come here for?", "choices": ["cross country", "working", "finding frisbee", "hiding"], "correct_choice_idx": 2, "direct_answers": ["play frisbee", "play frisbee", "frisbee", "finding frisbee", "frisbee", "frisbee", "playing", "frisbee throwing", "frisbeeing", "frisbee"], "difficult_direct_answer": false, "rationales": ["The other options obviously don't fit with this scene.", "The boy is holding a frisbee.", "The boy is running through the field with a disk in his hand."], "image": "train2014/COCO_train2014_000000369961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293505, "question_id": "6AsDjUJudYPBKvLAw2fKbk", "question": "Who paved this street?", "choices": ["municipality", "neighbors", "cows", "no one"], "correct_choice_idx": 3, "direct_answers": ["not paved", "nobody", "no one", "machines", "no one", "no one", "workers", "no one", "residents", "nobody"], "difficult_direct_answer": false, "rationales": ["This is an unpaved dirt street.", "It is a dirt road.", "This is a dirt path and it's not paved."], "image": "val2014/COCO_val2014_000000293505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228920, "question_id": "6AzaGx7FnUfUZHVXwBPgGz", "question": "Where is the child doing arts and crafts?", "choices": ["school", "home", "daycare", "expo"], "correct_choice_idx": 0, "direct_answers": ["school", "school", "school", "classroom", "classroom", "school", "school", "kitchen table", "school", "cutting"], "difficult_direct_answer": false, "rationales": ["There is a colorful mat behind him. the boy is a young age.", "There are cabinets with labels and other educational materials nearby like an easel and mat.", "A boy is sitting at a desk and is working on a project."], "image": "train2014/COCO_train2014_000000228920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110821, "question_id": "6B8PKazokGxoLky3ufx5kD", "question": "What caused the tomatoes to shrivel up?", "choices": ["heat", "spice", "knife", "acid"], "correct_choice_idx": 0, "direct_answers": ["heat", "baked", "ingredients", "broiling", "heat", "bake", "cooking", "cooking", "dehydration", "heat"], "difficult_direct_answer": false, "rationales": ["When tomatoes are cooked they will shrivel up some.", "The tomatoes have been toasted in the oven.", "The tomatoes were baked."], "image": "train2014/COCO_train2014_000000110821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489333, "question_id": "6BFgoP37XVfzCDZ667sv29", "question": "Why do jockey's wear helmets?", "choices": ["match clothes", "look nice", "protect head", "draw attention"], "correct_choice_idx": 2, "direct_answers": ["protection", "for protection", "for protection", "protection", "protect head", "prevent injury", "protection", "protection", "safety", "protect head"], "difficult_direct_answer": false, "rationales": ["Riding can be dangerous if they fall off the horse, so they need protection for the head.", "Jockeys wear helmets to protect their heads.", "Not only jockeys, but people who take part in any potentially dangerous sport wear helmets to protect their head."], "image": "train2014/COCO_train2014_000000489333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404464, "question_id": "6BcKdNkY7VEicBKzscCqdw", "question": "What is the man riding?", "choices": ["bicycle", "motorcycle", "snowboard", "skateboard"], "correct_choice_idx": 2, "direct_answers": ["snowboard", "snowboard", "snowboard", "snowboard", "snowboard", "snowboard", "snowboard", "snowboard", "snowboard", "snowboard"], "difficult_direct_answer": false, "rationales": ["He is using a single flat board without wheels, sliding down a snowy hill.", "Snowboards only can move in snow, simple as that.", "Based on the setting and the size and shape of the object as well as the manner it is being ridden, answer a is consistent."], "image": "val2014/COCO_val2014_000000404464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547752, "question_id": "6BxuqZk4Rr3RM7VcY6wgpk", "question": "What is in the refrigerator?", "choices": ["bees", "ant", "coconut", "milk"], "correct_choice_idx": 3, "direct_answers": ["foods", "perishables", "lettuce", "perishables", "milk", "milk", "differed bottle", "groceries", "bottles", "milk"], "difficult_direct_answer": false, "rationales": ["The refrigerator has milk cartons.", "The fridge has milk.", "A fridge door is open with cartons of milk on the shelf."], "image": "train2014/COCO_train2014_000000547752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388374, "question_id": "6CKcBZRS2HLmaQDLJVb8Pm", "question": "What type of trees are growing in this location?", "choices": ["pine trees", "palm trees", "willow trees", "birch trees"], "correct_choice_idx": 1, "direct_answers": ["palm", "palm trees", "palm trees", "palm trees", "palm tree", "palm trees", "palm", "palm trees", "palm", "coconut"], "difficult_direct_answer": false, "rationales": ["The fronds at the top of the trees are easy to identify as palms", "This is a tropical area with palm trees.", "The trees are recognizable because of their trunk shape and texture as well as the leaves at the top that are found on palms."], "image": "val2014/COCO_val2014_000000388374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48034, "question_id": "6CssVDWyfKwBpycbNviFrU", "question": "What is the boy doing?", "choices": ["stretching", "running", "descending", "ascending"], "correct_choice_idx": 2, "direct_answers": ["skiing", "sking", "skiing", "skiing", "skiing", "snow", "snowboarding", "descending", "snow skates", "snow skating"], "difficult_direct_answer": false, "rationales": ["The skier started at the top of the hill with their knees bent for speed and are heading downwards on the hill.", "The boy is moving down the hill.", "The boy is descending."], "image": "val2014/COCO_val2014_000000048034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333565, "question_id": "6CvPHvB2sXDFGmB9ipVRa5", "question": "Which animal particularly likes to eat the food she is holding?", "choices": ["shark", "rabbit", "tiger", "monkey"], "correct_choice_idx": 3, "direct_answers": ["monkey", "monkeys", "monkeys", "primates", "monkey", "monkey", "monkey", "monkey", "monkey", "monkey"], "difficult_direct_answer": false, "rationales": ["Monkeys stereotypically like bananas.", "She is holding bananas, not meat or carrots.", "Traditionally those of the ape family love bananas."], "image": "val2014/COCO_val2014_000000333565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112444, "question_id": "6CvW4ERxz3yPPWe8RYvuBG", "question": "Who would be found here?", "choices": ["vampire", "taoist", "altar boy", "buddhist"], "correct_choice_idx": 2, "direct_answers": ["priest", "priest", "priest", "priest", "altar boy", "priest", "priest", "priests", "clergy", "congregators"], "difficult_direct_answer": false, "rationales": ["The scene in the picture is that of a catholic church. what is missing are the altar boys.", "The altar boy can be found.", "This is a catholic church."], "image": "train2014/COCO_train2014_000000112444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283267, "question_id": "6CzqKmWJwwuUvhLhiWBTSV", "question": "What color pants is the woman wearing?", "choices": ["black", "yellow", "red", "green"], "correct_choice_idx": 0, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The woman is wearing black bikini bottoms.", "All you have to do to answer this is to decide which person is the female and the answer becomes obvious.", "Her pants are not green, red, or yellow."], "image": "train2014/COCO_train2014_000000283267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89765, "question_id": "6D9GxNghxLaW2siXU8piWR", "question": "What is in the bottle on the left?", "choices": ["sesame oil", "olive oil", "canola oil", "vegetable oil"], "correct_choice_idx": 1, "direct_answers": ["olive oil", "olive oil", "plate", "olive oil", "olive oil", "olive oil", "cooking oil", "olive oil", "vegetable oil", "olive oil"], "difficult_direct_answer": false, "rationales": ["A bottle with a light colored liquid is on a counter by a stove. oil is used for cooking.", "This is a bottle of oil that people use to cook with.", "The bottle contains a liquid that is often used when cooking carrots, onions, and celery on a stove top. the bottle has a picture of olives on it."], "image": "val2014/COCO_val2014_000000089765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239483, "question_id": "6DFFwAtToq3kVNABPRdD6v", "question": "What is the green cord or line wrapping under the bike and on the grass?", "choices": ["tether", "hose", "vine", "string"], "correct_choice_idx": 1, "direct_answers": ["water hose", "hose", "water hose", "hose", "hose", "hose", "hose", "water hose", "hose", "water hose"], "difficult_direct_answer": false, "rationales": ["The green cord is used by people to water their gardens or grass.", "They are outside. hoses usually come in this color.", "The cord is a hose."], "image": "train2014/COCO_train2014_000000239483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356286, "question_id": "6DHompnQVwLDrX72qyBZqd", "question": "What type room is this?", "choices": ["parking garage", "bathroom", "living", "showroom"], "correct_choice_idx": 3, "direct_answers": ["showroom", "show", "showroom", "showroom", "gallery", "showroom", "showroom", "gallery", "showroom", "showroom"], "difficult_direct_answer": false, "rationales": ["The motorcycles are displayed for sale.", "There are many bikes shown and each has a poster beside it explaining something about the bike.", "The room is displaying bikes."], "image": "train2014/COCO_train2014_000000356286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233994, "question_id": "6DLc2g6kXy3jmYpZd6JNFw", "question": "What is usually found in this setting?", "choices": ["fish", "tigers", "wolves", "lions"], "correct_choice_idx": 0, "direct_answers": ["dolphins", "fish", "boats", "boats", "boats", "beach", "sailing boats", "ships", "fish", "boats"], "difficult_direct_answer": false, "rationales": ["This is deep water and these animals live there", "Only one of the animal options live in the water.", "This is a marine environment so there would be sea life present."], "image": "val2014/COCO_val2014_000000233994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319055, "question_id": "6DPaEjkqmjHyHgFBtUH4MX", "question": "The antenna on top of the building is used to receive what type of broadcast signal?", "choices": ["cellular", "radio", "television", "weather alerts"], "correct_choice_idx": 2, "direct_answers": ["radio", "television", "radio", "radio", "radio", "tv", "radio", "television", "tv", "television"], "difficult_direct_answer": false, "rationales": ["By the design of the antenna it is safe to say what type id waves that are being received.", "These are to catch the waves from stations nearby", "Some places need those antennas get tv stations."], "image": "val2014/COCO_val2014_000000319055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83178, "question_id": "6DWzMTDuAxNNj8ppu6dbYn", "question": "What are people doing here?", "choices": ["dancing", "jogging", "singing", "buying food"], "correct_choice_idx": 3, "direct_answers": ["shopping", "buying food", "tourists", "having fun", "getting food", "waiting bus", "shopping", "shopping", "eating socializing", "eating"], "difficult_direct_answer": false, "rationales": ["There are visible food stands with people milling around them. people in the same vicinity of food stands are likely purchasing food.", "People are getting food.", "The people here are buying food and groceries."], "image": "train2014/COCO_train2014_000000083178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64481, "question_id": "6DeZRphecf6nkpkqBSocUC", "question": "What is the dominate color of the object the woman is carrying with her left arm?", "choices": ["pink", "red", "orange", "purple"], "correct_choice_idx": 2, "direct_answers": ["blue", "orange", "orange", "red", "orange", "orange", "orange", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The board she is carrying is not pink, red, or purple.", "The biggest part of the object is the board, which is orange.", "A-type creamsicle color actually."], "image": "train2014/COCO_train2014_000000064481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198936, "question_id": "6Dh3983nJZLA2M8wdTMHrr", "question": "What is the girl doing?", "choices": ["feeding dog", "watching tv", "selling dog", "stealing dog"], "correct_choice_idx": 1, "direct_answers": ["watching tv", "watching tv", "sitting", "sitting", "watching tv", "watching television", "watching tv", "sitting", "watching tv", "watching tv"], "difficult_direct_answer": false, "rationales": ["She is looking off in the other direction and holding the remote control, rather than looking at the camera.", "The girl is watching tv.", "The girl is relaxing on the couch with a tv remote."], "image": "train2014/COCO_train2014_000000198936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544457, "question_id": "6DnWXF2rPjxjm8rZp3JnR6", "question": "The symbols are from what culture?", "choices": ["greek", "egyptian", "assyrian", "asian"], "correct_choice_idx": 3, "direct_answers": ["asian culture", "japan", "chinese", "asian", "chinese", "japan", "chinese", "japan", "asian", "chinese"], "difficult_direct_answer": false, "rationales": ["The writing is in japanese. the wares are in the japanese style and design.", "The pictures on the wall show asian language words. asian culture likes to decorate with asian language words.", "These symbols are from the asian culture."], "image": "train2014/COCO_train2014_000000544457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170630, "question_id": "6DqFiw8Ca832UPYjn2oTpN", "question": "What is the woman using the bowl for here?", "choices": ["hidden camera", "feeding baby", "feed pet", "trapping rats"], "correct_choice_idx": 2, "direct_answers": ["feed pet", "feeding pet", "feed pet", "mixing", "storage", "feed pet", "feed pet", "storage", "unpacking", "food"], "difficult_direct_answer": false, "rationales": ["Looks like she may be feeding a pet with it.", "She is going to feed her pet.", "Though we cannot see her pet, the woman is looking down and smiling at something. the bowl in her hand seems to signify that she's about to feed her pet."], "image": "val2014/COCO_val2014_000000170630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313341, "question_id": "6DttjTigkUcmeKQiQzGoKd", "question": "Which process has been performed on the closer lady's hair?", "choices": ["dying", "braiding", "perming", "shaving completely"], "correct_choice_idx": 0, "direct_answers": ["hair coloring", "dyed", "dye", "dyeing", "dyeing", "dye job", "died", "dye", "dying", "dye"], "difficult_direct_answer": false, "rationales": ["The hair is a little bright to be a natural color.", "Her hair is straight and somewhat long. it is an unnatural red color.", "Her hair is unnatural in color."], "image": "val2014/COCO_val2014_000000313341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38840, "question_id": "6EDWaNVpZpqcouPDdQnM5r", "question": "How did the snow get on the ground here?", "choices": ["blizzard", "carried up", "snow machine", "blew there"], "correct_choice_idx": 2, "direct_answers": ["from sky", "snow machine", "winter weather", "from sky", "snow machine", "snowfall", "freezing temperatures", "fell", "weather system", "storm"], "difficult_direct_answer": false, "rationales": ["The snow has the consistency and appearance of man made snow which is a common feature at ski resorts, which this visibly is. when man is making snow, they use a snow machine.", "The snow is artificial since it's so flat.", "The answer is unknowable based on the image, but it looks to be of a consistency that would be in line with answer a and none of the other answers."], "image": "train2014/COCO_train2014_000000038840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12269, "question_id": "6EQrkGpHzNYiGmSWAbtCAL", "question": "Why is the girl holding the umbrella?", "choices": ["blocking sun", "blocking rain", "to buy", "to sell"], "correct_choice_idx": 2, "direct_answers": ["stay cool", "buying", "she's playing", "rain", "wants it", "testing out", "having fun", "fun", "to buy", "posing"], "difficult_direct_answer": true, "rationales": ["The girl wants to buy the umbrella.", "The girl is inside a store, so she is not blocking sun or rain. she is too young to sell umbrellas.", "She is in a store and wants to buy it."], "image": "val2014/COCO_val2014_000000012269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573063, "question_id": "6EkAYAH9y5aq7Ynag2YmtL", "question": "The birthday celebration is occurring during which part of the day?", "choices": ["night", "morning", "noon", "afternoon"], "correct_choice_idx": 0, "direct_answers": ["evening", "birthday", "night", "evening", "evening", "night", "night", "night", "cake cutting", "evening"], "difficult_direct_answer": false, "rationales": ["The lights are on.", "Cake is generally eaten at night.", "The lighting is all artifical and there is so sign of sunlight. also, the time on the clock indicates it is 9:48."], "image": "train2014/COCO_train2014_000000573063.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311522, "question_id": "6FnM5Vy4TQJWxxNfvrzsPb", "question": "What is the man dressed in all black and in the middle of the scene holding?", "choices": ["carrot", "rabbit", "ski poles", "baby"], "correct_choice_idx": 2, "direct_answers": ["ski poles", "ski poles", "poles", "ski poles", "ski poles", "poles", "ski poles", "ski poles", "ski poles", "ski poles"], "difficult_direct_answer": false, "rationales": ["The man is participating in a winter sport. he is not holding a living thing.", "The man is holding ski poles in his hands.", "A man on skis, in dark clothing, is holding two long, thin poles customarily used when skiing."], "image": "train2014/COCO_train2014_000000311522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398615, "question_id": "6FnXqau8PoL7KPHn4p7XUC", "question": "What vehicles are on the street?", "choices": ["train", "bus", "car", "motorcycle"], "correct_choice_idx": 1, "direct_answers": ["double-decker buses", "buses", "busses", "buses", "buses", "bus", "buses", "busses", "buses", "busses"], "difficult_direct_answer": false, "rationales": ["The street has a row of red busses lined up in one lane.", "The vehicles are buses.", "There are red buses all over the streets."], "image": "train2014/COCO_train2014_000000398615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434700, "question_id": "6FuJKvRbLNgdSink98hZwD", "question": "Which company has sponsored this event?", "choices": ["american express", "bmw", "sony", "visa"], "correct_choice_idx": 0, "direct_answers": ["american express", "american express", "american express", "american express", "american express", "american express", "american express", "american express", "american express", "american express"], "difficult_direct_answer": false, "rationales": ["The sponsor is in front of the people on the far left.", "American express's logo is on the wall.", "It's clearly written on the wall."], "image": "val2014/COCO_val2014_000000434700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66976, "question_id": "6GEtkkasqFsW6AQ66odPCF", "question": "What object is the item under the dog linked to?", "choices": ["television", "computer", "boombox", "radio"], "correct_choice_idx": 0, "direct_answers": ["television", "television", "television", "television", "tv", "dog", "tv", "collar", "television", "dog"], "difficult_direct_answer": false, "rationales": ["There is a remote.", "It's a remote control to change channels and volume on a television", "The shape of the object, as well as varieties and placement of the buttons, are all consistent with controls for this type of device."], "image": "train2014/COCO_train2014_000000066976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247576, "question_id": "6GsnzHNSYkuw3mZUy5Uebv", "question": "What is indicated by the marking on the road?", "choices": ["bike lane", "crosswalk", "railroad crossing", "two lanes"], "correct_choice_idx": 1, "direct_answers": ["crosswalk", "crosswalk", "crosswalk", "crosswalk", "pedestrian crossing", "crosswalk", "crosswalk", "pedestrian crossing", "crosswalk", "crosswalk"], "difficult_direct_answer": false, "rationales": ["The marking is a crosswalk.", "It's where people can cross the street safely.", "The white lines in the road indicate a pedestrian crossing."], "image": "val2014/COCO_val2014_000000247576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320875, "question_id": "6HEmUy6UhDNJqUErPfRPNd", "question": "Why is she holding her hands out?", "choices": ["threatening woman", "is gesturing", "eating tacos", "drying out"], "correct_choice_idx": 1, "direct_answers": ["conversation gestures", "talking", "asking question", "being expressive", "talking", "conversation gestures", "explaining", "gesturing", "showing concern", "is gesturing"], "difficult_direct_answer": false, "rationales": ["The woman is emphasizing something with her hands.", "Two woman are standing in a kitchen looking at each other from across a counter.", "She is talking and using her hands and gesturing as she talks."], "image": "train2014/COCO_train2014_000000320875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160938, "question_id": "6HTyLjPTZxXfz3w64JHDPV", "question": "In which type space is cake being cut?", "choices": ["arena", "private home", "stadium", "rest home"], "correct_choice_idx": 1, "direct_answers": ["square", "dining room", "table", "private home", "dining room", "dining room", "kitchen", "kitchen", "dining room", "dining room"], "difficult_direct_answer": false, "rationales": ["The man is in a residence with a small table and kitchen in the background.", "Looks like they are in someones living room or house.", "This is in someones house."], "image": "train2014/COCO_train2014_000000160938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513676, "question_id": "6HiuMUTyZDza3XihQxyqSj", "question": "How many flavors available in AquaBall water?", "choices": ["ten", "14", "12", "20"], "correct_choice_idx": 2, "direct_answers": ["four", "one", "five", "numerous", "several", "two", "four", "12", "four", "three"], "difficult_direct_answer": false, "rationales": ["There are twelve flavors.", "Because people have different choices on the flavors they like.", "They have many flavors available."], "image": "train2014/COCO_train2014_000000513676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502232, "question_id": "6HkGQ9US8gVtLTcrBVvdGg", "question": "What is the mouse's wife's name?", "choices": ["justina", "minnie", "dasha", "delores"], "correct_choice_idx": 1, "direct_answers": ["minnie", "minnie mouse", "minnie", "minnie", "minnie mouse", "minnie", "minnie", "minnie", "minnie", "minnie mouse"], "difficult_direct_answer": false, "rationales": ["Minnie is mickey's wife.", "This is mickey mouse from disney", "This is one of the oldest and best known disney characters, aside from mickey."], "image": "train2014/COCO_train2014_000000502232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573854, "question_id": "6HuMzjrphkzD5NSQQiXPGf", "question": "Which vehicle can carry the most volume of supplies?", "choices": ["green plane", "blue plane", "van", "luggage cart"], "correct_choice_idx": 1, "direct_answers": ["plane", "jumbo jet", "blue plane", "larger airliner", "larger plane", "blue plane", "blue ana", "plane", "blue plane", "blue plane"], "difficult_direct_answer": false, "rationales": ["It is the largest vehicle with the most space inside", "The vehicle is the blue plane.", "There are two sizes and colors of plane and the green one is much smaller and could not hold as many items."], "image": "train2014/COCO_train2014_000000573854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61498, "question_id": "6JnJ6hWFb7EXBYxCDFq9am", "question": "What are the tennis balls in the cart for?", "choices": ["selling", "training", "playing", "gifting"], "correct_choice_idx": 1, "direct_answers": ["using", "to serve", "storage", "training", "playing", "practice", "holding balls", "practice", "extras", "practice"], "difficult_direct_answer": false, "rationales": ["Their lack of uniform indicates that they are just practicing, and many balls are needed in order to save time from running after the balk each time.", "There are several tennis balls in the cart.", "The tennis balls are in the cart for players."], "image": "train2014/COCO_train2014_000000061498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497504, "question_id": "6Js5u3NF6qam8pqng4PNF4", "question": "Which one of these businesses can use the space advertised?", "choices": ["restaurant", "skating rink", "lawyer", "theater"], "correct_choice_idx": 2, "direct_answers": ["lawyer", "store", "any", "offices", "office", "office building", "left business", "zero", "any", "office spaces"], "difficult_direct_answer": true, "rationales": ["This is a business type of space available", "People who work in the court system work out of offices.", "A lawyer can use office space to do their work."], "image": "val2014/COCO_val2014_000000497504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202923, "question_id": "6JzK2m9LEusai3PnPDBE6E", "question": "What is the purpose of the fence?", "choices": ["stop balls", "restrain fans", "detain players", "clean field"], "correct_choice_idx": 0, "direct_answers": ["protect fans", "stop balls", "stop ball", "protection spectators", "safety", "protection", "ball safety", "stop baseballs", "block balls", "safety"], "difficult_direct_answer": true, "rationales": ["The fence is used to stop balls from flying all over.", "The fence is there to protect the spectators.", "Baseball is a sport that uses balls and the balls can hit people. a fence is used for protection."], "image": "val2014/COCO_val2014_000000202923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287387, "question_id": "6K8WDFUpsr3ESRGcFb8hzW", "question": "What plant adds bitterness to this beverage?", "choices": ["hops", "corn", "mugwort", "sugar"], "correct_choice_idx": 0, "direct_answers": ["hops", "hops", "hops", "hops", "hops", "hop", "hops", "hop", "hops", "hops"], "difficult_direct_answer": false, "rationales": ["Hops provide bitterness to beers, and guinness is a beer.", "Traditionally all beers have hops in them.", "A quick google search informed me that every beer on the market contains hops. guinness is a famous brand of beer."], "image": "val2014/COCO_val2014_000000287387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524459, "question_id": "6Kbb9ufVPYRxmVhKyNEqGG", "question": "What are the toucan and the man going to enjoy?", "choices": ["soda", "beer", "some wine", "some water"], "correct_choice_idx": 1, "direct_answers": ["beer", "guinness", "guinness", "guinness", "guineas", "beer", "drink", "guinness beer", "guineas", "guinness beer"], "difficult_direct_answer": false, "rationales": ["The toucan on the poster is holding a glass of guiness beer.", "The poster on the wall shows the man and the toucan drinking an alcoholic beverage made by guinness.", "A picture is shown with a man and a toucan facing each other and each are holding a glass of beer."], "image": "val2014/COCO_val2014_000000524459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296876, "question_id": "6KhA4UN5U4Lx7KaasqKUeV", "question": "What are the animals gathering in the middle of?", "choices": ["parking lot", "lake", "field", "forest"], "correct_choice_idx": 2, "direct_answers": ["pasture", "field", "pasture", "pasture", "field", "field", "horses", "pasture", "pasture", "grass"], "difficult_direct_answer": false, "rationales": ["Horses are in a large grassy, open area. fields are large and open.", "This place is an open area for the animals to graze.", "The land is vast and there's vegetation all around."], "image": "train2014/COCO_train2014_000000296876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431675, "question_id": "6Kin6ygwUysknLc7dw7Kxt", "question": "Why are the cars lined up along the curb?", "choices": ["to vacuum", "to inspect", "to wash", "to park"], "correct_choice_idx": 3, "direct_answers": ["parked", "to park", "parking", "parked", "parking", "parked", "parked", "parked", "parking", "parking"], "difficult_direct_answer": false, "rationales": ["Cars are lined up on a street. people park on the side of the street.", "Vehicles are lined up at the curb of a street.", "The cars are parked by the side of the street."], "image": "train2014/COCO_train2014_000000431675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503689, "question_id": "6KmehhjefpYEGfXwUUtnR5", "question": "Where are the boy's hands while batting a baseball?", "choices": ["above", "side", "behind", "front"], "correct_choice_idx": 3, "direct_answers": ["on bat", "on bat", "on bat", "on bat", "on bat", "handle", "front", "bat", "on bat", "on bat"], "difficult_direct_answer": false, "rationales": ["The boy's hands are not by the side or back of his body, but are in the position mentioned in option a.", "The boy's hands are at the front of the bat.", "The boy's hands are in front of him."], "image": "train2014/COCO_train2014_000000503689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278001, "question_id": "6Ko8bLJspSExA2cjZXBCGF", "question": "What is the tall thin thing above the red thing used for?", "choices": ["climbing", "holding balloons", "holding kites", "displaying colors"], "correct_choice_idx": 0, "direct_answers": ["tower", "ski-lift support", "climbing", "safety", "climbing", "climbing ladder", "lights", "climbing", "observation", "ladder"], "difficult_direct_answer": false, "rationales": ["There is a ladder on the thing going up.", "The red thing is used to get up the hill.", "It has rungs on it for people to step on"], "image": "train2014/COCO_train2014_000000278001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271011, "question_id": "6LAhPM2oyWybY95RdfepBF", "question": "Why are the men all wearing helmets?", "choices": ["accident safety", "trendy", "warmth", "sunblock"], "correct_choice_idx": 0, "direct_answers": ["head protection", "for safety", "protection", "for protection", "accident safety", "for protection", "protection", "safety", "snowboarding", "protection"], "difficult_direct_answer": false, "rationales": ["Helmets are worn on the head to protect the head in case of an accident.", "The snowboarders are wearing helmets to protect their heads from harm if they take a spill while boarding down a hill.", "The man are skiing, and they can fall on their heads, so they want to protect their heads."], "image": "train2014/COCO_train2014_000000271011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201452, "question_id": "6LhdHZHtytk5vo4fCr6b9B", "question": "Why are the surfer's hesitant to surf here?", "choices": ["huge waves", "rocks", "small waves", "cold water"], "correct_choice_idx": 1, "direct_answers": ["rocks", "rocky terrain", "high tide", "high tide", "sunset", "large wave", "rocks", "strong waves", "strong waves", "waves crashing"], "difficult_direct_answer": false, "rationales": ["The surfers don't want to crash into the rocks.", "There are big rocks there they could get hurt on.", "The surfers don't want to hit the rocks."], "image": "val2014/COCO_val2014_000000201452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560010, "question_id": "6MCmRykxeLLe2N8dTDecjQ", "question": "The kite flying looks like what?", "choices": ["canoe", "ship", "car", "duck"], "correct_choice_idx": 1, "direct_answers": ["ship", "boat", "boat", "fancy boat", "ship", "ship", "pirate ship", "boat", "ship", "house"], "difficult_direct_answer": false, "rationales": ["It's obvious by the shape and what look like sails on it.", "The kite is in the shape of a ship with sails.", "It looks like a ship flying in the sky."], "image": "val2014/COCO_val2014_000000560010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416105, "question_id": "6MDmbzFVg4kfu9JmZWgfvT", "question": "What did the person shown here just do?", "choices": ["fly", "quit", "return ball", "serve"], "correct_choice_idx": 2, "direct_answers": ["hit ball", "serve", "return", "serve", "serve", "serve", "hit ball", "hit ball", "hit ball", "return ball"], "difficult_direct_answer": false, "rationales": ["A tennis player is swinging a racket and jumping.", "The man is playing tennis and he has his arm in front of him with his racket with the ball going away from him, indicating that he hit the ball.", "The person returned the ball."], "image": "val2014/COCO_val2014_000000416105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263603, "question_id": "6Mmox7wqJLuwt4Xog5AoA2", "question": "What is roughly the fastest you could go using anything in this picture?", "choices": ["28 mph", "220 mph", "14 mph", "120 mph"], "correct_choice_idx": 3, "direct_answers": ["pretty fast", "100 mph", "very fast", "vehicle", "120 mph", "95 mph", "60 mph", "car", "200km/h", "120 mph"], "difficult_direct_answer": true, "rationales": ["There is a car in the image and the cars can typically go around 120 top speed.", "The fastest you can go is 120 miles per hour.", "Most vehicles go up to a."], "image": "train2014/COCO_train2014_000000263603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154083, "question_id": "6MoH7WhJgiFKsGHbdrEs6M", "question": "What is holding down the kites?", "choices": ["dumbbells", "rocks", "feet", "bricks"], "correct_choice_idx": 2, "direct_answers": ["string", "man's foot", "string", "string", "feet", "string", "string", "string", "strings", "weight"], "difficult_direct_answer": false, "rationales": ["The person is using their feet to keep the kite down.", "All kites needs something on the ground to keep it secured. the only object which is close to the kite is the man's feet.", "The man looks to be stepping on the string."], "image": "train2014/COCO_train2014_000000154083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293576, "question_id": "6Msx2cd2BLVeHdCWFDWDCH", "question": "What would you get help with if you went into the green Fidelity building?", "choices": ["investing", "cooking", "mailing", "marriage counseling"], "correct_choice_idx": 0, "direct_answers": ["finances", "finance", "finances", "check cashing", "investment", "investing", "financial matters", "financial advice", "investments", "banking"], "difficult_direct_answer": true, "rationales": ["Fidelity is a bank.", "A fidelity building is on a street corner. fidelity is a bank.", "Investing is a service provided."], "image": "train2014/COCO_train2014_000000293576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327713, "question_id": "6Mv5oiKbGxvMAwkiyjtkfn", "question": "What type of station is in this area?", "choices": ["bus", "gas", "train", "fire"], "correct_choice_idx": 1, "direct_answers": ["gas station", "gas", "gas", "bus", "metro", "gas", "arena", "train", "train", "gas"], "difficult_direct_answer": false, "rationales": ["The station is for gas.", "A lit sign is in the shape of a shell. shell is the name of a gas station.", "There is a sign for a shell station on the right side."], "image": "train2014/COCO_train2014_000000327713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50837, "question_id": "6NPsds2pDsrXEUBFuNaM8X", "question": "Which country's flag is in the middle of the three?", "choices": ["united states", "canada", "mexico", "france"], "correct_choice_idx": 0, "direct_answers": ["usa", "united states", "united states", "united states", "usa", "united states", "usa", "united states", "usa", "usa"], "difficult_direct_answer": false, "rationales": ["The flag in the middle is the usa flag.", "The flag is the usa one.", "The flag in the middle has white and red alternating horizontal stripes. in the upper right corner, there is a blue background with fifty white stars on it."], "image": "train2014/COCO_train2014_000000050837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496313, "question_id": "6NS4k4WDZD9joapdYgpF9U", "question": "What is the bus doing?", "choices": ["going straight", "backing up", "yielding", "turning right"], "correct_choice_idx": 2, "direct_answers": ["bus", "merging lanes", "parked", "parking", "yielding", "merging", "waiting", "moving", "stopping", "joining lane"], "difficult_direct_answer": true, "rationales": ["A bus is stopped at a toll area and watching a truck drive into the lane.", "It is looking to see if anyone is coming.", "The bus is waiting for traffic to go by."], "image": "val2014/COCO_val2014_000000496313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509584, "question_id": "6NkJbw2CYebyQgtghnjhCY", "question": "What are the vehicles getting onto?", "choices": ["parking lot", "dock", "street", "boat"], "correct_choice_idx": 3, "direct_answers": ["ferry", "ferry", "carrier ship", "boat", "ferry", "ferry", "carrier ship", "carrier ship", "ferry", "ferry"], "difficult_direct_answer": false, "rationales": ["The vehicles are lining up to get on a ship.", "The entrance to a ferry looks like this.", "The vehicles are boarding the boat."], "image": "train2014/COCO_train2014_000000509584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224592, "question_id": "6NvRsruUEKrXujhMUwmdxR", "question": "What allows this machine to be airborne?", "choices": ["reverse gravity", "magic", "lift", "wind"], "correct_choice_idx": 2, "direct_answers": ["gas", "engine", "mechaine", "bernoulli principle", "motor", "wings", "propeller", "wings", "lift", "airplane"], "difficult_direct_answer": true, "rationales": ["Lift is what keeps airplanes in the air. the object in the air is an airplane.", "The machine is being lifted.", "The engines provide speed which allows the wings to lift the plane from the ground"], "image": "train2014/COCO_train2014_000000224592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159233, "question_id": "6NwD5SdiDa8ZgynRCM5hqr", "question": "What is this type of bike called?", "choices": ["ripper", "stroller", "cruiser", "chopper"], "correct_choice_idx": 3, "direct_answers": ["chopper", "custom", "chopper", "chopper", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "chopper", "motorcycle"], "difficult_direct_answer": false, "rationales": ["I am pretty sure that the bikes with the big gap in the middle are called choppers.", "It's american slang to call a bike with a large fork a chopper", "The bike is a chopper."], "image": "val2014/COCO_val2014_000000159233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180539, "question_id": "6Ny4VKqx44LJDjU8Y8XLvj", "question": "WHat kind of competition is taking place?", "choices": ["snowball", "skiing", "ice", "snowboard"], "correct_choice_idx": 1, "direct_answers": ["snowboarding", "skiing", "skiing", "snowboarding", "snowboarding", "skiing", "snowboarding", "snowboarding", "skiing", "snowboarding"], "difficult_direct_answer": false, "rationales": ["The people are all carrying snowboards.", "They are in a skiing competition.", "Most people are carrying skiis with them."], "image": "val2014/COCO_val2014_000000180539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67078, "question_id": "6P5TK2uERwUcbWNCrmAZ52", "question": "Why is the man holding a microphone?", "choices": ["he's yelling", "he's singing", "he's crying", "he's speaking"], "correct_choice_idx": 3, "direct_answers": ["announcing winner", "announcement", "announcing", "amplify voice", "amplify voice", "reading", "mca", "he's speaking", "talking", "announcer"], "difficult_direct_answer": true, "rationales": ["The man wants to confirm that people can hear him.", "The man is holding a microphone because he is speaking.", "The man is reading from a piece of paper. he is not singing, yelling, or crying."], "image": "val2014/COCO_val2014_000000067078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442874, "question_id": "6PE5RootdN7TeVYfgxUcHh", "question": "What does the boat at the water's edge run on?", "choices": ["engine", "rowing", "sails", "no propulsion"], "correct_choice_idx": 1, "direct_answers": ["gas", "engine", "gas", "water", "gasoline", "gas", "electricity", "gas", "rowing", "motor"], "difficult_direct_answer": false, "rationales": ["The boat is of a size and weight that it would need to be powered by an engine to make it move. there is also a visible engine on the boat and if there is an engine on a boat it is likely the power source.", "The boat closest to the water's edge is a motorboat with the motor seen in the rear.", "There is one on the back tilted up since the boat is in shallow water"], "image": "train2014/COCO_train2014_000000442874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521874, "question_id": "6PKro9EBjQkyFSsX2Q6ZML", "question": "Why is the train so small?", "choices": ["for paupers", "no money", "for children", "easily sold"], "correct_choice_idx": 2, "direct_answers": ["model", "model", "amusement train", "single car", "kiddie train", "smaller cargo", "for kids", "for children", "model train", "transport"], "difficult_direct_answer": true, "rationales": ["The train is designed for little kids to ride on it for fun.", "The train is meant to entertain kids.", "The train is small because it is designed as a fun ride for children."], "image": "val2014/COCO_val2014_000000521874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368905, "question_id": "6PLwt62KTWgpQ2Ax4iCjkF", "question": "What collar is the leash in the dog at the top?", "choices": ["black", "pink", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "no collar", "blue", "blue", "no collar", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The leash is blue in the top picture.", "A dog has a blue leash.", "The leash is blue in the top image."], "image": "val2014/COCO_val2014_000000368905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351749, "question_id": "6Pvs9RddvmUEdaDJKL9fob", "question": "Where is the person who threw the ball?", "choices": ["outfield", "pitcher's mound", "batters cage", "bull pen"], "correct_choice_idx": 1, "direct_answers": ["pitcher's mound", "background", "infront", "pitcher's mound", "pitcher mound", "pitchers mound", "pitchers mound", "pitcher", "pitcher's mound", "in field"], "difficult_direct_answer": false, "rationales": ["This person stands a distance away and throws the ball towards the person holding the bat.", "The pitcher is at the mound in order to pitch the ball.", "A baseball player leans forward from a small hill in the middle of a baseball field."], "image": "train2014/COCO_train2014_000000351749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334775, "question_id": "6Q9DCLoA6kQgwg8e7pPHyG", "question": "How many people can sleep in this room?", "choices": ["six", "four", "two", "eight"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["These are twin size beds that are used for one person each", "There are two single beds in this room, and the beds are only big enoigg to comfortably sleep one person each.", "There are two beds meant for two people to sleep on."], "image": "train2014/COCO_train2014_000000334775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278662, "question_id": "6QMQ6dkkVVs5Rq3RaPgVGQ", "question": "How is the small vessel being moved?", "choices": ["motor", "sail", "pushed", "towed"], "correct_choice_idx": 3, "direct_answers": ["motor", "boat", "being towed", "towed", "tube", "pulled", "larger boat", "towed", "gas motor", "by boat"], "difficult_direct_answer": true, "rationales": ["The small vessel does not have a motor or a sail. there is a cable that connects the larger vessel to the smaller one.", "A person is riding on a small raft behind a boat. people use boats to pull skiers and people on tubes.", "The small vessel is getting dragged along."], "image": "train2014/COCO_train2014_000000278662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373073, "question_id": "6QP54fuqDBPLkCAsEXuDrm", "question": "If someone bought this house how might they clean their dinner plates most easily?", "choices": ["windex", "microwave", "dishwasher", "scrub brush"], "correct_choice_idx": 2, "direct_answers": ["use sink", "dishwasher", "dishwasher", "dishwasher", "sink", "dishwasher", "sink", "dishwasher", "dishwasher", "dishwasher"], "difficult_direct_answer": false, "rationales": ["This appliance is visible in the photo and is the only one used to clean dishes.", "A dishwasher is under the counter in a kitchen. dishwashers are used to wash dishes with minimal effort.", "The dishwasher could be used."], "image": "train2014/COCO_train2014_000000373073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112995, "question_id": "6QTB2c7kgMjoX5W7Esyp7e", "question": "How will the watercraft user manage to direct themselves toward a goal?", "choices": ["drone", "oars", "pulling", "will"], "correct_choice_idx": 1, "direct_answers": ["oar", "paddle", "oar", "paddle", "oars", "paddle", "paddle", "paddle", "paddle", "paddle"], "difficult_direct_answer": false, "rationales": ["The watercraft is rowed with a pair of oars.", "A special paddle with a flat blade on one end that is used for rowing is laying across the watercraft user's lap.", "These push through the water to move you forward"], "image": "train2014/COCO_train2014_000000112995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422170, "question_id": "6QWVA6WEtXGV4wHzujvMoJ", "question": "What are the people sitting on?", "choices": ["bleachers", "beach", "beds", "chairs"], "correct_choice_idx": 0, "direct_answers": ["bleachers", "bench", "bleachers", "bleachers", "bleachers", "bench", "bleachers", "bleachers", "bleachers", "bleachers"], "difficult_direct_answer": false, "rationales": ["Bleachers are flat seats which ascend a sloping structure and that describes this seating, which means these are bleachers.", "The seats are long.", "They are in a stadium."], "image": "train2014/COCO_train2014_000000422170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339111, "question_id": "6QdjRDBAZ2QVg8xCckbXKz", "question": "The person next to the train looks like who?", "choices": ["tracy ifeachor", "james horner", "ned beatty", "margaret qualley"], "correct_choice_idx": 3, "direct_answers": ["margaret qualley", "female", "actress", "dr who", "avril lavigne", "woman", "woman", "lena dunham", "billie eilish", "woman"], "difficult_direct_answer": false, "rationales": ["The person wears a similar outfit the character.", "They look like margaret qualley.", "The person is a young white person, and the person in option a matches that description, while those in the other options do not."], "image": "train2014/COCO_train2014_000000339111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342765, "question_id": "6QrVKF5ZWyuMXSoXWEf8r7", "question": "What is the object on the right side of the top freezer compartment?", "choices": ["pencil sharpener", "button maker", "blade sharpener", "bottle opener"], "correct_choice_idx": 3, "direct_answers": ["bottle opener", "beer", "cap remover", "beer", "decoration", "beer", "bottle opener", "beer", "hair", "magnet"], "difficult_direct_answer": false, "rationales": ["The object is a bottle opener.", "The object on the right side can help someone open a bottle.", "Bottles can be opened by putting the cap into the silver thing and pulling."], "image": "val2014/COCO_val2014_000000342765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21194, "question_id": "6QtHH32kND4fXWsscYWUfP", "question": "What can he do with this ball?", "choices": ["juggle", "serve", "dunk", "dribble"], "correct_choice_idx": 1, "direct_answers": ["return ball", "serve", "hit it", "hit", "spike it", "hit it", "hit it", "hit", "hit it", "hit ball"], "difficult_direct_answer": false, "rationales": ["It is a tennis ball, not a basketball or juggling ball.", "A kid is playing tennis and is standing at the back line of the court.", "As a tennis player, if he has the ball, he needs to send it over to his opponent."], "image": "train2014/COCO_train2014_000000021194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65835, "question_id": "6Qz3rtGE6SY9oTKHX8GcTL", "question": "What is the white linen item under the dinnerware called?", "choices": ["placemat", "tablecloth", "doily", "table runner"], "correct_choice_idx": 0, "direct_answers": ["placemat", "napkin", "napkin", "placemat", "placemat", "napkin", "placemat", "placemat", "napkin", "placemat"], "difficult_direct_answer": false, "rationales": ["A rectangular piece of fabric is under a plate on a table.", "It is a mat that holds a place for the table setting.", "The item is rectangular and used as a dinner setting under a plate."], "image": "train2014/COCO_train2014_000000065835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44856, "question_id": "6RD2iFAYMXuKQEsqMtERok", "question": "What helps propel the person forward at this location?", "choices": ["poles", "nothing", "gravity", "sheer will"], "correct_choice_idx": 0, "direct_answers": ["ski poles", "poles", "poles", "poles", "poles", "ski", "ski poles", "poles", "ski sticks", "poles"], "difficult_direct_answer": false, "rationales": ["The person is on skis with poles in hand. the person is on flat ground.", "The person is using the ski poles to help propel them forward.", "The poles will help them propel themself and keep them balanced."], "image": "val2014/COCO_val2014_000000044856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390742, "question_id": "6RP8ERztvqgAgMbrg2ab7G", "question": "What will happen if someone leaves their vehicle in front of this sign?", "choices": ["arrested", "stolen", "towed", "ticketed"], "correct_choice_idx": 2, "direct_answers": ["tow", "towing", "damage", "accident", "towed", "towed", "towed", "tow-away", "towed", "get towed"], "difficult_direct_answer": false, "rationales": ["The sign shows an image of a car being towed.", "Parking is not allowed here and the sign shows a truck pulling a car away.", "The truck will take it away because no parking is allowed"], "image": "train2014/COCO_train2014_000000390742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34624, "question_id": "6RkfTUqWdhK2XUxxwVpUBP", "question": "What was used to make their desk?", "choices": ["granite", "metal", "marble", "wood"], "correct_choice_idx": 3, "direct_answers": ["wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["The desk is made of wood.", "The desk is made out of wood.", "The desk is made of a rich wood."], "image": "train2014/COCO_train2014_000000034624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208003, "question_id": "6RqDyB85y2oYXtRzdfqjGU", "question": "What prevents the Giraffes from escaping the fence?", "choices": ["material", "color", "it's width", "texture"], "correct_choice_idx": 2, "direct_answers": ["thick wall", "barrier", "zookeeper", "rock barrier", "cant jump", "it's width", "fence", "short legs", "common sense", "height"], "difficult_direct_answer": true, "rationales": ["The fence is long but prevents the giraffe.", "The stone wall in front of the fence would be too wide for the giraffe to be able to clear it with their legs.", "The giraffes are too tall to escape."], "image": "val2014/COCO_val2014_000000208003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49119, "question_id": "6S3jmvjApD9TwBhCzfnCJE", "question": "What are ships carrying goods called?", "choices": ["freighter", "sail boat", "cargo ship", "goods ship"], "correct_choice_idx": 0, "direct_answers": ["cargo", "ferry", "cargo ships", "cargo ships", "freighters", "cargo sheep", "cargo", "freighter", "cargo", "freight"], "difficult_direct_answer": false, "rationales": ["These ships are for cargo.", "Though you could pick a few other answers to the question, they are mostly called cargo ships because they carry cargo.", "The ships that carry cargo are known as cargo ships."], "image": "val2014/COCO_val2014_000000049119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218917, "question_id": "6S5h34TZtDuewri6ro4z2i", "question": "In which country is this shop located?", "choices": ["kenya", "spain", "china", "united states"], "correct_choice_idx": 3, "direct_answers": ["united states", "united states", "united states", "united states", "usa", "usa", "united states", "united states", "united states", "usa"], "difficult_direct_answer": false, "rationales": ["Based on the options, and with the amount of english on the menu, this shop is most likely located in the united states.", "The menu and labels are all in english.", "There are a lot of shops that use price boards like this one in a."], "image": "train2014/COCO_train2014_000000218917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543082, "question_id": "6SMMW3i5KqDX44goTjKBaW", "question": "The man who received the first patent for this sport was from which country?", "choices": ["australia", "togo", "lithuania", "netherlands"], "correct_choice_idx": 3, "direct_answers": ["united states", "netherlands", "netherlands", "france", "netherlands", "usa", "tom blake", "usa", "unknown", "united states"], "difficult_direct_answer": false, "rationales": ["The man who received the first patent was from a european country. the netherlands would be the only answer to meet this criteria.", "The man doing parasailing was from the netherlands first.", "He was from the netherlands."], "image": "val2014/COCO_val2014_000000543082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123389, "question_id": "6SQGt8mxLPHsXC9RGkobAt", "question": "What is the purpose of the little white container?", "choices": ["dip", "shot", "toy", "paint"], "correct_choice_idx": 0, "direct_answers": ["pizza", "ranch dressing", "condiment holder", "ranch", "holds sauce", "dipping sauce", "hold sauce", "dipping sauce", "dip", "salad dressing"], "difficult_direct_answer": true, "rationales": ["The small container has ranch in it to put on your pizza.", "There are pizza slices on the table, and most pizza joints serve sauces in those little cups. the lid is open and there seems to be ranch or bleu cheese in it.", "The little white container has ranch dip in it."], "image": "train2014/COCO_train2014_000000123389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469834, "question_id": "6SWvFSp8qzwygm4U3397Xy", "question": "What are the long poles under the umbrella made out of?", "choices": ["plastic", "stone", "bamboo", "steel"], "correct_choice_idx": 2, "direct_answers": ["bamboo", "bamboo", "bamboo", "bamboo", "bamboo", "bamboo", "bamboo", "bamboo", "bamboo", "bamboo"], "difficult_direct_answer": false, "rationales": ["The poles are made of bamboo shoots.", "You can tell by the bumps they are bamboo.", "Bamboo has a distinctive, segmented construction so based on that comparison, it's clear that these poles are made out of bamboo."], "image": "train2014/COCO_train2014_000000469834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539418, "question_id": "6SekzawkUeA7s4AB9Zjah6", "question": "Why is he carrying his surfboard?", "choices": ["stealing it", "exercise", "done surfing", "hiding it"], "correct_choice_idx": 2, "direct_answers": ["going surfing", "boat", "scatting", "going home", "going home", "leaving", "safty", "surf", "going surfing", "done surfing"], "difficult_direct_answer": false, "rationales": ["By the time of day and the direction is facing suggests they are done for the day.", "He is walking out of the water with it.", "The man is done surfing."], "image": "train2014/COCO_train2014_000000539418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190360, "question_id": "6SmiiGaEXadV2nshv9MCcM", "question": "Why would the cat lay here?", "choices": ["warmth", "playfulness", "food", "cooling"], "correct_choice_idx": 0, "direct_answers": ["warm", "resting", "enjoy warmth", "cozy", "warm", "nap", "warmth", "tired", "warmth", "to rest"], "difficult_direct_answer": false, "rationales": ["The car is giving off heat so the cat likes the warmth.", "The cat is sitting on the hood of a car. the cat likes the heat that is given off by the engine.", "The cat would lay here to absorb some of the heat given off by the engine."], "image": "train2014/COCO_train2014_000000190360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250295, "question_id": "6TDYoeMYq3ZCxo6GKPag93", "question": "Why does the man use a rope?", "choices": ["control", "tie", "climb", "attach"], "correct_choice_idx": 0, "direct_answers": ["control elephant", "control", "control", "train elephants", "elephant play", "pull elephant", "handle trunk", "lifting trunk", "guide elephant", "string"], "difficult_direct_answer": true, "rationales": ["The rope is used by the man to control the elephant so it doesn't stray away", "He is using it to keep a hold on him.", "The man is an elephant trainer using objects to guide the animal."], "image": "train2014/COCO_train2014_000000250295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446974, "question_id": "6TWbQjKVUTigdeNCLgfkQ5", "question": "What's seen on the window?", "choices": ["insects", "fingerprints", "raindrops", "stickers"], "correct_choice_idx": 2, "direct_answers": ["raindrops", "raindrops", "raindrops", "rain", "raindrops", "rain drops", "rain", "rain", "red light", "raindrops"], "difficult_direct_answer": false, "rationales": ["It is raining and you can see the glare on the road.", "It's has been raining outside so the window is speckled with rain.", "Beads of water appear on the window."], "image": "train2014/COCO_train2014_000000446974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522234, "question_id": "6U59BEzZQfAExGoEPoTiZh", "question": "Which company makes the green object here?", "choices": ["wilson", "coleco", "kenner", "timberland"], "correct_choice_idx": 0, "direct_answers": ["spalding", "wilson", "wilson", "wilson", "wilson", "wilson", "wilson", "wilson", "wilson", "wilson"], "difficult_direct_answer": false, "rationales": ["Wilson is known for making tennis balls.", "Wilson makes the ball.", "Wilson is popular sports manufacturer."], "image": "train2014/COCO_train2014_000000522234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166716, "question_id": "6UQyzMvnSH4pssBx7uGnUX", "question": "What will probably happen next?", "choices": ["crash", "trick", "jump", "rest"], "correct_choice_idx": 0, "direct_answers": ["fall", "land", "wreck", "fall", "fall off", "movements", "man falls", "fall down", "crash", "injury"], "difficult_direct_answer": true, "rationales": ["The snowboarder already jumped and tried to do a trick. he is about to fall.", "They will crash since they are skating besides a metal bar.", "Two snowboarders are both on a ramp in the snow at the same time."], "image": "train2014/COCO_train2014_000000166716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2687, "question_id": "6UWyiJuWVVCSSk5nWthGvd", "question": "Why are the men's coats yellow in color?", "choices": ["camouflage", "dress code", "visibility", "fashion"], "correct_choice_idx": 2, "direct_answers": ["safety visibility", "safety", "they're firefighters", "uniform", "visibility", "firemen", "safety", "firefighter uniform", "safety", "workers"], "difficult_direct_answer": false, "rationales": ["The men are wearing reflective gear.", "They need to be seen.", "These coats are yellow for safety reasons."], "image": "train2014/COCO_train2014_000000002687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165094, "question_id": "6V2cWFJJi5GZy7rfa6kGYd", "question": "What does the object in the image do?", "choices": ["takes money", "car", "phone", "tracks you"], "correct_choice_idx": 0, "direct_answers": ["earns points", "keeps time", "parking meter", "no image", "takes money", "pay parking", "meter", "parking meters", "parking meter", "parking meter"], "difficult_direct_answer": false, "rationales": ["A large vending type machine with a slot for a credit card is on the sidewalk.", "The object allows you to pay by credit card or a mobile phone for a parking spot.", "There is a place to put coins or scan your credit card"], "image": "train2014/COCO_train2014_000000165094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133456, "question_id": "6V2gC67rKSZrYAQdh7bmte", "question": "What are the animals doing on the hill?", "choices": ["mating", "eating", "fighting", "sitting"], "correct_choice_idx": 3, "direct_answers": ["standing", "laying down", "relaxing", "lounging", "resting", "sitting", "grazing", "resting", "resting", "grazing"], "difficult_direct_answer": false, "rationales": ["The animals are lying down. they are not doing anything else.", "The animals are sitting on the hill.", "The animals have their legs out in front of them as they relax in the sun"], "image": "train2014/COCO_train2014_000000133456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466615, "question_id": "6VFUvexijc4ninADbeAXZS", "question": "What did the woman in purple just do?", "choices": ["reading", "working", "shopping", "parked car"], "correct_choice_idx": 3, "direct_answers": ["pay meter", "parked car", "put money", "deposit coins", "pay meter", "paid parking", "pay meter", "pay parkingmeter", "pay meter", "pay money"], "difficult_direct_answer": false, "rationales": ["The man in purple just parked her car.", "She recently parked her car and is putting money in the meter to avoid getting a parking ticket.", "The woman is putting coins in a meter. her vehicle is near the meter."], "image": "train2014/COCO_train2014_000000466615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391887, "question_id": "6Vx2TZSwY8ci7KxiiRZbqf", "question": "What profession can be seen?", "choices": ["garbage man", "rancher", "firefighter", "cowboy"], "correct_choice_idx": 2, "direct_answers": ["firefighter", "fire fighting", "firefighters", "firefighters", "fire fighters", "firefighters", "firefighting", "firefighters", "firefighters", "firefighters"], "difficult_direct_answer": false, "rationales": ["They are carrying hoses, wearing protective gear and there is smoke visible in the sky.", "They are wearing the uniforms that firemen typically use.", "Firefighters can be seen working with hoses and equipment at the scene of a fire."], "image": "train2014/COCO_train2014_000000391887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32720, "question_id": "6W2H5Nbu8kvTm2hbGitXF2", "question": "What letter might she see on the racquet from her perspective?", "choices": ["d", "p", "g", "q"], "correct_choice_idx": 3, "direct_answers": ["letter p", "letter p", "lowercase d", "letter p", "q", "backwards p", "letter p", "p", "pees", "letter p"], "difficult_direct_answer": false, "rationales": ["The letter is q when reversed.", "The brand begins with p, but backwards (from her view) it would look like a q.", "The letter q might be seen."], "image": "train2014/COCO_train2014_000000032720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243947, "question_id": "6W37tGEnmkdEN4TinW3DkG", "question": "What indicates that this is a tourist area?", "choices": ["aquarium", "lodges", "lawns", "people"], "correct_choice_idx": 0, "direct_answers": ["stores", "gift shops", "aquarium", "aquarium", "signs", "signs", "crowded", "signs", "aquarium", "people"], "difficult_direct_answer": false, "rationales": ["There are many people there, more than usual if it was just a normal business section.", "The aquarium will generally draw tourists out.", "The aquarium attracts tourists."], "image": "train2014/COCO_train2014_000000243947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355415, "question_id": "6W6qZahx9rcsNzvQdEEBxb", "question": "How many days ago was this cake made?", "choices": ["one day", "today", "three days", "two days"], "correct_choice_idx": 1, "direct_answers": ["two days", "one", "zero", "today", "one", "today", "cake", "one", "today", "zero"], "difficult_direct_answer": false, "rationales": ["Food doesn't stay fresh and looking good for days.", "The cake is made fresh daily.", "The box the cake came in says when the product is made."], "image": "train2014/COCO_train2014_000000355415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261118, "question_id": "6WFx5LMuSEsjCTaU9diTU4", "question": "What is the red rectangular box with windows on the right used for?", "choices": ["sitting", "collecting trash", "buying food", "phones calls"], "correct_choice_idx": 3, "direct_answers": ["telephone", "telephone booth", "telephone booth", "phone calls", "telephone", "telephone calls", "phone calls", "phone calls", "phone booth", "phones calls"], "difficult_direct_answer": false, "rationales": ["It is used to make phone calls", "That is a telephone booth that the public can use to make a phone call.", "The box is for phone calls."], "image": "train2014/COCO_train2014_000000261118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265513, "question_id": "6WgjKYvR72MyLqa9ES6bar", "question": "What is the boy in the blue t shirt climbing?", "choices": ["tree", "fence", "swing set", "hill"], "correct_choice_idx": 3, "direct_answers": ["hill", "hill", "hill", "watching kite", "hill", "hill", "mountain", "hill", "mountain", "hill"], "difficult_direct_answer": false, "rationales": ["The boy is climbing up a grassy hill.", "The boy is climbing where there are trees and grass above the other children.", "The boy in the blue t-shirt is climbing a hill to catch up to the kite."], "image": "train2014/COCO_train2014_000000265513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108123, "question_id": "6WvBcHpvijPYtSzihNPqEq", "question": "What city is located in the state that the players in the dugout are from?", "choices": ["detroit", "tulsa", "tucson", "atlanta"], "correct_choice_idx": 3, "direct_answers": ["georgia", "atlanta", "georgia", "atlanta", "atlanta", "atlanta", "atlanta", "atlanta", "atlanta", "georgia"], "difficult_direct_answer": false, "rationales": ["The players in the dugout are from georgia, not oklahoma, michigan, or arizona.", "Choice \"a\" is a large and well known city in georgia.", "Atlanta is a city in the state of georgia. the players in the dugout are wearing jerseys that say \"georgia\" across the front."], "image": "train2014/COCO_train2014_000000108123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306030, "question_id": "6XUd6UwoG8fbR6tiwYxtas", "question": "Which country headquarters the brand company of the laptop?", "choices": ["italy", "india", "japan", "china"], "correct_choice_idx": 2, "direct_answers": ["japan", "japan", "japan", "japan", "japan", "japan", "japan", "japan", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["Is the most likely option given that many of them come out of a.", "The laptop is a toshiba and their headquarters is in tokyo.", "The brand is toshiba, a japanese company."], "image": "train2014/COCO_train2014_000000306030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337525, "question_id": "6YAc8mYuqcgmwjKJmnQDef", "question": "What are the white particles around the dog?", "choices": ["sparkling spray", "water", "hail", "snow"], "correct_choice_idx": 1, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "snow", "snow", "water"], "difficult_direct_answer": false, "rationales": ["\"water\" is the only answer that makes sense. snow flakes do not look like that; there is no sign of hail and sparkling spray is not a factor here.", "The dog is splashing water particles.", "The way that the drops of water are being reflected off of the dog are making them look white."], "image": "val2014/COCO_val2014_000000337525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551194, "question_id": "6YP62xWGaKyFk6J9q9VmPk", "question": "The animals that the children are riding belongs to what family of animals?", "choices": ["bovidae", "felidae", "equidae", "canidae"], "correct_choice_idx": 2, "direct_answers": ["horse", "horses", "equidae", "horses", "equidae", "equine", "equines", "donkeys", "horses", "equidae"], "difficult_direct_answer": false, "rationales": ["The animals that the children are riding on are horses.", "The children are riding horses, not cows, cats, or dogs.", "These are mules which are part of this family of animals"], "image": "train2014/COCO_train2014_000000551194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429437, "question_id": "6YR52gQZpoD936TzxmBYa8", "question": "What type of rice is that?", "choices": ["raw", "boiled", "steamed", "fried"], "correct_choice_idx": 2, "direct_answers": ["white rice", "white", "white", "white", "white", "white", "steamed", "white", "white rice", "white"], "difficult_direct_answer": false, "rationales": ["That is plain white steamed rice.", "The rice is fluffy and white.", "The rice is cooked."], "image": "train2014/COCO_train2014_000000429437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160239, "question_id": "6YW2YkGSDagLYdMorwcNgC", "question": "What is the float in the shape of?", "choices": ["deathstar", "carrot", "whale", "fox"], "correct_choice_idx": 2, "direct_answers": ["whale", "whale", "blue whale", "whale", "whale", "whale", "whale", "whale", "blue whale", "blue whale"], "difficult_direct_answer": false, "rationales": ["The float is the shape of a large whale.", "This is a whale balloon.", "If there is any doubt,just eliminate the other options. this can't possibly be a death star, a fox, and is definitely not a carrot."], "image": "val2014/COCO_val2014_000000160239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467564, "question_id": "6YqiWCUTsiUzHNgg4zxN3w", "question": "What objects are stored on top of the red vehicle?", "choices": ["tools", "hoses", "luggage", "tires"], "correct_choice_idx": 3, "direct_answers": ["tires", "luggage", "luggage", "tires", "tires", "luggage", "tires", "tires", "tires", "tires"], "difficult_direct_answer": false, "rationales": ["The items on top of the red vehicle are wheel-shaped and are made out of rubber.", "Tires are on the red vehicle.", "The two round black mounted to the red truck are known as tires."], "image": "train2014/COCO_train2014_000000467564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123229, "question_id": "6Yvd9vwJ3M4AQZYyBcSWo5", "question": "What likely made the tracks on the ground?", "choices": ["plane", "skis", "cows", "truck"], "correct_choice_idx": 3, "direct_answers": ["truck", "tractor", "truck", "truck", "truck", "tractor", "vehicle", "vehicle", "cows", "truck"], "difficult_direct_answer": false, "rationales": ["There are two rows of tires and cows need extra feed sometimes.", "There are tire tracks on the field.", "The tracks are parallel in the manner a vehicle would make and not a natural phenomena."], "image": "train2014/COCO_train2014_000000123229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85948, "question_id": "6ZqVq937XzyziTAqMFD33Q", "question": "What kind of ball is the elephant holding?", "choices": ["baseball", "golf ball", "basketball", "volleyball"], "correct_choice_idx": 2, "direct_answers": ["basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball"], "difficult_direct_answer": false, "rationales": ["This ball holds an orange ball wrapped in it's trunk towards a basketball hoop.", "One can see the hoops that the elephant is about to dunk the ball in.", "It is near a basketball hoop."], "image": "train2014/COCO_train2014_000000085948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333613, "question_id": "6ZztZvLApYkmUVB6ZqPUH7", "question": "What should have the guy down with the toilet seat prior to urinating?", "choices": ["kept down", "raise it", "broken it", "disassembled"], "correct_choice_idx": 1, "direct_answers": ["raise it", "lifted", "pull up", "he shouldn't", "drinking", "lift seat", "liift", "lift", "put up", "put up"], "difficult_direct_answer": true, "rationales": ["Guys should lift the toilet seat.", "It should be in the up position so that he doesn't have to wipe it off after he's finished.", "If the seat is lifted up, it makes it easier for men to aim directly into the bowl."], "image": "train2014/COCO_train2014_000000333613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81784, "question_id": "6a3x3SDaS5T9zQd5SKFfdZ", "question": "What is the man helping the young woman with in the sand?", "choices": ["collect turtles", "collect water", "lay towel", "build sandcastle"], "correct_choice_idx": 3, "direct_answers": ["kite", "to play", "kiteboard", "kite", "kite", "her kite", "fly kite", "child", "build sandcastle", "sand castle"], "difficult_direct_answer": false, "rationales": ["He is playing in the sand building things with her.", "They are digging and have a sand structure", "He has a large pile of sand next to him shaped in a structure and is digging"], "image": "train2014/COCO_train2014_000000081784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212801, "question_id": "6a5dY2K2uLetwEt2GEb8rT", "question": "What is unusual about the animal on the sail?", "choices": ["wearing clothes", "talking", "wrong color", "wrong habitat"], "correct_choice_idx": 0, "direct_answers": ["wearing clothing", "wearing clothes", "wearing suit", "human rabbit", "anthropomorphic rabbit", "evil bunny", "wearing clothes", "has muscles", "wearing clothes", "human rabbit"], "difficult_direct_answer": false, "rationales": ["The animal has a full suit of armour.", "The sail which is in the photo includes a picture on a rabbit. what really stands out for this animal is that it is wearing clothes.", "The animal on the sail has clothes on."], "image": "val2014/COCO_val2014_000000212801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333130, "question_id": "6aGcmhLYtZcfTicAcirdpg", "question": "What kind of power does this train use?", "choices": ["steam", "coal", "diesel", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "electric", "electric", "electricity", "fuel", "electric", "electric", "electricity", "electric", "electric"], "difficult_direct_answer": false, "rationales": ["As evidenced by the lines above it and the lack of an engine that deals with b or a. and no smoke.", "The wires above it are connected and power it.", "The power lines can be seen above the train."], "image": "val2014/COCO_val2014_000000333130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23368, "question_id": "6aNveh3pX8ZGPyDDzzETuT", "question": "What type of enclosure is shown?", "choices": ["fence", "barn", "cage", "gate"], "correct_choice_idx": 0, "direct_answers": ["giraffic", "chainlink", "road", "giraffe enclosure", "fencing", "pen", "zoo", "fence", "zoo", "zoo enclosure"], "difficult_direct_answer": true, "rationales": ["One can see the posts and chain links of this type of enclosure.", "The top part of the area is not enclosed. the chain-link enclosure does not have an opening that would allow the giraffes to enter or exit.", "The chain link from the enclosure can clearly be seen."], "image": "train2014/COCO_train2014_000000023368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312412, "question_id": "6aSz2a4MSfF77dRN5KsbQA", "question": "Which object would be grabbed if someone wanted to get a drink of water?", "choices": ["wine bottle", "bottom-left glass", "food plate", "wine glass"], "correct_choice_idx": 1, "direct_answers": ["glass", "glass", "water glass", "short glass", "glass", "short glass", "bottom-left glass", "glass", "water glass", "glass cup"], "difficult_direct_answer": false, "rationales": ["This is a glass that isn't used for wine", "This glass has clear liquid in it.", "This is the only container or surface which holds water; the rest contain either food or wine."], "image": "val2014/COCO_val2014_000000312412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372612, "question_id": "6afQxSxk32XGJ4LsBthCu3", "question": "What is the man on the phone doing?", "choices": ["exercising", "crying", "laughing", "eating"], "correct_choice_idx": 2, "direct_answers": ["laughing", "speaking", "laughing", "laughing", "laughing", "speaking", "laughing", "laughing", "laughing", "laughing"], "difficult_direct_answer": false, "rationales": ["The man with the cellphone at his ear is smiling widely and hunched over indicating that he is joyous.", "The man has his mouth open and arm over his stomach with an excited expression.", "The man is laughing."], "image": "train2014/COCO_train2014_000000372612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46055, "question_id": "6akyPi3bPjpu5UrMEWTbwC", "question": "The utensils above are mainly made from?", "choices": ["clay", "minerals", "loam", "glass"], "correct_choice_idx": 0, "direct_answers": ["clay", "ceramic", "ceramics", "glass", "ceramic", "clay", "cotton", "glass", "ceramic", "ceramic"], "difficult_direct_answer": false, "rationales": ["The utensils are made of clay.", "Ceramic objects are made with non-metallic materials and clay is main material used.", "The utensils are made of clay since they're forms of pottery."], "image": "train2014/COCO_train2014_000000046055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139789, "question_id": "6aqicqnrQWPfYpHnc4yTiW", "question": "What is this man doing with this jewelry?", "choices": ["wearing it", "selling it", "stealing it", "destroying it"], "correct_choice_idx": 1, "direct_answers": ["carrying it", "selling it", "selling it", "selling", "selling", "selling", "selling it", "selling", "selling", "carrying it"], "difficult_direct_answer": false, "rationales": ["He has it displayed in a case for people to see", "The man is holding jewelry in a box to show off his wares.", "He is displaying it openly so he didn't steal it. it is too valuable to throw away and he's not wearing it."], "image": "train2014/COCO_train2014_000000139789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471473, "question_id": "6b7ar6FGkGbrXnPoci7v6r", "question": "What is the name of the way the man in the street is wearing his pants?", "choices": ["sagging", "cut-offs", "casual", "high-waisted"], "correct_choice_idx": 0, "direct_answers": ["low", "sagging", "baggy", "gang member", "sagging", "lowrider", "low rider", "low rider", "sagging", "baggy"], "difficult_direct_answer": false, "rationales": ["The man has his belt below his underwear.", "Pants are meant to be worn around the waist and this person is wearing them clearly much lower. this style is known as answer a.", "The man is wearing his shorts low on his waist which is known as saggy pants."], "image": "train2014/COCO_train2014_000000471473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34993, "question_id": "6bkvkxjucs7W8ZtM8gQCM7", "question": "What is taboo to wear with his footwear?", "choices": ["gloves", "pants", "hat", "socks"], "correct_choice_idx": 3, "direct_answers": ["socks", "socks sandals", "socks", "socks", "socks", "socks", "socks", "socks", "socks", "socks"], "difficult_direct_answer": false, "rationales": ["There is a white material on the guys feet.", "The man is wearing sandals. these usually are worn barefoot.", "The man is wearing socks and sandals, giving him a strange look."], "image": "val2014/COCO_val2014_000000034993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287400, "question_id": "6bqHnvhEimBxFqz5nrcEij", "question": "How many uses is the cup container designed for?", "choices": ["three", "one", "two", "infinite"], "correct_choice_idx": 1, "direct_answers": ["many", "one", "one", "1 use", "two", "one", "one", "many", "one", "one"], "difficult_direct_answer": false, "rationales": ["The one purpose of the cup is to hold the sauce for the sandwich.", "This is a paper cup designed to hold hot foods so it doesn't melt through or fall apart before the food can be consumed", "The cup is best suited for a side of sauce meant for this sandwich; it's too small for a beverage or side order for an entree."], "image": "train2014/COCO_train2014_000000287400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354955, "question_id": "6bu6sQadpiTgcuJH8rrFfy", "question": "What is the bench covered in?", "choices": ["mud", "moss", "vines", "animals"], "correct_choice_idx": 1, "direct_answers": ["moss", "moss", "moss", "moss", "mold moss", "foliage", "moss", "moss", "mold moss", "moss"], "difficult_direct_answer": false, "rationales": ["The bench has moss.", "It's what grows when there is a lot of moisture in an area.", "By the color and the bench being outside all the time it's easy to tell that moss is growing on the bench."], "image": "train2014/COCO_train2014_000000354955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495266, "question_id": "6c4ZNBmGkT4tQdXwCjgyfa", "question": "What sex is this man's opponent here most likely?", "choices": ["woman", "trans", "intersex", "man"], "correct_choice_idx": 3, "direct_answers": ["man", "male", "male", "male", "man", "male", "male", "male", "man", "male"], "difficult_direct_answer": false, "rationales": ["Single players usually play against other members of their own sex and a man is pictured here.", "In a professional tennis match, singles opponents are normally of the same sex and each sex has their own separate competition.", "The man is playing a tennis match which is usually same gender."], "image": "train2014/COCO_train2014_000000495266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460115, "question_id": "6c6RDTdTjerDy3FuxuSbvW", "question": "What country manufactures the goods made by the sponsor under the clock?", "choices": ["germany", "uganda", "france", "switzerland"], "correct_choice_idx": 3, "direct_answers": ["vietnam", "united kingdom", "china", "macao", "switzerland", "rolex", "rolex", "france", "england", "switzerland"], "difficult_direct_answer": false, "rationales": ["The country is switzerland.", "Switzerland is where rolex is from.", "Swiss watches are very popular."], "image": "train2014/COCO_train2014_000000460115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279524, "question_id": "6cXy5BF7ULfawtksHJjv99", "question": "Where is the boy most likely to be headed?", "choices": ["shower", "prom", "swimming", "gym"], "correct_choice_idx": 1, "direct_answers": ["out", "sitting room", "to kitchen", "to work", "other room", "living room", "prom", "living room", "work", "work"], "difficult_direct_answer": false, "rationales": ["The boy is dressed up for a fancy event.", "He is dressed up in a nice shirt and nice pants.", "The boy is going to prom."], "image": "val2014/COCO_val2014_000000279524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390749, "question_id": "6cYLpYCNDXh9quGqD28s9H", "question": "Where should the reflective items on the peoples high foreheads really be?", "choices": ["ski bottoms", "on knees", "over eyes", "on chin"], "correct_choice_idx": 2, "direct_answers": ["covering eyes", "over eyes", "over eyes", "over eyes", "goggles", "googles", "on eyes", "over eyes", "over eyes", "eyes"], "difficult_direct_answer": false, "rationales": ["Goggles are made for protection and to enhance the ability to see on sunny snow days.", "The googles are meant for the eyes.", "When skiing you put these on your face to keep you from getting snow blind"], "image": "train2014/COCO_train2014_000000390749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211600, "question_id": "6cjfokSrLbCWzw7da3GxXu", "question": "Where is this person located?", "choices": ["office", "restaurant", "home", "store"], "correct_choice_idx": 2, "direct_answers": ["couch", "couch", "sofa", "home", "couch", "sofa", "couch", "couch", "couch", "couch"], "difficult_direct_answer": false, "rationales": ["They are sitting on a couch next to their cat.", "There is a sofa with a cat on.", "The person is at home."], "image": "train2014/COCO_train2014_000000211600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227813, "question_id": "6cu5Jhxsq7PD74iGDqp6EV", "question": "What might he have applied before going out there?", "choices": ["hairspray", "lipstick", "sunscreen", "foundation"], "correct_choice_idx": 2, "direct_answers": ["sunscreen", "sunscreen", "sunscreen", "sun screen", "sunscreen", "sunscreen", "sunscreen", "sunscreen", "sun screen", "good"], "difficult_direct_answer": false, "rationales": ["The man is surfing wearing swim trunks and short sleeves. he may have applied lotion or oil to his skin to help protect him from the harmful rays of the sun.", "The reflective light off the water can make sunburns worse.", "The man is under the sun so perhaps needed something to protect his skin."], "image": "train2014/COCO_train2014_000000227813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278512, "question_id": "6dJGjW7iCy6FiL8LfrY2rR", "question": "What had recently happened when this plane landed prior to this place?", "choices": ["snow", "rain", "sunny day", "tornado"], "correct_choice_idx": 1, "direct_answers": ["rain", "mud splatter", "muddied", "rain", "drop off", "rain", "landing", "rainfall", "rain", "skid mud"], "difficult_direct_answer": false, "rationales": ["There is mud all over the bottom of the plane.", "The wet weather has caused the runway to become muddy.", "There is lots of mud, so there had been also rain."], "image": "val2014/COCO_val2014_000000278512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222588, "question_id": "6dWqMaNpWmkRyyUv889SpF", "question": "These people are most likely at what kind of an event?", "choices": ["motorcycle rally", "inauguration", "asian parade", "circus"], "correct_choice_idx": 0, "direct_answers": ["parade", "pride parade", "motorcycle rally", "lgbtq", "biker rally", "motorcycle rally", "motorbike", "bike rally", "rally", "gay pride"], "difficult_direct_answer": true, "rationales": ["They are seen in motorcycle that seems to be having fun.", "One can see that they are sitting on a bike and wearing helmets.", "The people are riding a motorcycle and wearing black leather at a rally."], "image": "train2014/COCO_train2014_000000222588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403064, "question_id": "6dXqcU9xCtLXCsCsNJBtq8", "question": "What is causing the faint red and green images in the right straight ahead?", "choices": ["milking cows", "window glare", "cows", "road sign"], "correct_choice_idx": 1, "direct_answers": ["reflection", "reflection", "books", "dashboard reflection", "window glare", "reflections", "reflection", "reflection", "reflection", "books"], "difficult_direct_answer": false, "rationales": ["The images are a reflection in the window.", "That is from the flag.", "Faint images are often caused by glare. this image was taken from inside a car window."], "image": "train2014/COCO_train2014_000000403064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24380, "question_id": "6djm8rkfhp4R2bHBbL2Hxc", "question": "What soft drink is advertised to the left of William Hill?", "choices": ["mountain dew", "7-up", "coke zero", "diet pepsi"], "correct_choice_idx": 2, "direct_answers": ["coca-cola zero", "wine", "coke zero", "coca cola", "coke zero", "coca cola", "coca cola", "coco cola", "coke", "coca-cola zero"], "difficult_direct_answer": false, "rationales": ["The logo for the cola drink can be seen on the left.", "Coca-cola is a soda company known for many products in this photo you can see the word zero bellow the company name this indicates that it is a specialized product and what the name of this product is.", "There is a black sign next to william hill that says coke zero on it."], "image": "train2014/COCO_train2014_000000024380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213532, "question_id": "6e3iztfSsP3Qh9ugx9rkiY", "question": "These children have ancestors most likely from where?", "choices": ["finland", "mexico", "vietnam", "kazakhstan"], "correct_choice_idx": 2, "direct_answers": ["china", "asia", "asia", "asia", "china", "asia", "asia", "asia", "vietnam", "asia"], "difficult_direct_answer": false, "rationales": ["The eyes shows that they come from vietmans.", "They look to be maybe asian.", "These children appear to be of southeast asian, not european, latino, or central asian, descent."], "image": "train2014/COCO_train2014_000000213532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537604, "question_id": "6eLyU844VJ46HdD7ACgphH", "question": "Why does the dog have a party hat tied to him?", "choices": ["fashion", "protection", "his birthday", "identification"], "correct_choice_idx": 2, "direct_answers": ["his birthday", "his birthday", "his birthday", "birthday", "cute decoration", "birthday", "identification", "birthday", "birthday boy", "for fun"], "difficult_direct_answer": false, "rationales": ["The dog is in front of a cake and there are dog pictures on the plate.", "The cake has dog treats on it so it must be his birthday.", "The dog's birthday is being observed."], "image": "val2014/COCO_val2014_000000537604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55733, "question_id": "6eN6KGikB9q3gCEbRfxbAv", "question": "What have the girls been doing?", "choices": ["skiing", "building snowmen", "snowboarding", "cooking"], "correct_choice_idx": 2, "direct_answers": ["snowboarding", "snowboarding", "skiing", "snowboarding", "skiing", "skiing", "snowboarding", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["The girls are wearing googles and have snow on them. they are holding onto the equipment that were using in the snow which are not thin enough to be skis.", "The girls have been snowboarding.", "The winter wear, style of goggles and snowboards present in this image tells us they were likely recently snowboarding."], "image": "train2014/COCO_train2014_000000055733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151844, "question_id": "6eP3FyeaPq4iCwdTwi6Vjb", "question": "If you wanted to wash clothes near here what would you need?", "choices": ["credit cards", "nothing", "dollar bills", "coins"], "correct_choice_idx": 3, "direct_answers": ["coins", "washer", "coins", "launderette", "need water", "water", "coins", "quarters", "towel", "detergent"], "difficult_direct_answer": false, "rationales": ["It takes money.", "There is a launderette on the street and the sign on the front of the building shows what method of payment is needed for operation.", "There is a laundromat here. the sign outside of the laundromat says the machines are operated by inserting these into the machines."], "image": "val2014/COCO_val2014_000000151844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569775, "question_id": "6eU6uv8mChLPkfqmcGffHi", "question": "What is on the placemat next to the woman?", "choices": ["cat", "scissors", "glass", "teapot"], "correct_choice_idx": 3, "direct_answers": ["tea pot", "teapot", "kettle", "teapot", "dog", "teapot", "kettle", "tea pot", "teapot", "tea"], "difficult_direct_answer": false, "rationales": ["The object on the place mat is a vessel in which to make tea.", "It has a lid and a spout on it", "There is a receptacle with a spout next to the woman."], "image": "train2014/COCO_train2014_000000569775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17487, "question_id": "6eVqEDc9SJ2uTPTis38j4u", "question": "What is the person doing to the cat?", "choices": ["feeding it", "bathing it", "petting it", "hitting it"], "correct_choice_idx": 2, "direct_answers": ["petting", "petting", "petting", "petting", "petting", "petting", "patting", "petting it", "patting", "petting"], "difficult_direct_answer": false, "rationales": ["The cat is on the lap.", "Both of them are relaxed and the person's hand is resting gently on top of the cat.", "The cat is on their lap and they seem to be petting it with their hand."], "image": "train2014/COCO_train2014_000000017487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391703, "question_id": "6eWCHoDTgRr6aUrwF8VKxL", "question": "How big is this hot dog compared to the average hot dog?", "choices": ["much smaller", "same size", "much bigger", "little smaller"], "correct_choice_idx": 2, "direct_answers": ["double", "huge", "larger", "double", "much larger", "fifteen inches", "double", "much bigger", "quarter pound", "very long"], "difficult_direct_answer": false, "rationales": ["The hotdog visibly overhangs the bun with implies it is larger than the standard sized bun.", "The hot dog is much longer than the bun and it is sticking out of both ends.", "The hot dog is much bigger than the bun."], "image": "train2014/COCO_train2014_000000391703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152328, "question_id": "6f7GgCZb6FzSiLsr9vyXzb", "question": "Which wrong furnishing has the woman put on?", "choices": ["pants", "shoes", "shirt", "purse"], "correct_choice_idx": 1, "direct_answers": ["no helmet", "fanny pack", "n/a", "bad question", "shoes", "unknown", "sunglasses", "shoes", "short", "nothing"], "difficult_direct_answer": true, "rationales": ["The woman is wearing block heels to cycle.", "She lacks the right shoes.", "While riding a bike, most people dress casually or athletically. evening shoes are not part of this attire."], "image": "train2014/COCO_train2014_000000152328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232689, "question_id": "6fJ3EBZBoF2wLgS9itfSFZ", "question": "What is the right side of the room mainly used for?", "choices": ["sleeping", "gaming", "bathing", "cooking"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "cooking", "cooking", "kitchen", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["There is a stove near the right side of the kitchen.", "In this setting the oven on the right helps to prepare food.", "The area is used for cooking."], "image": "train2014/COCO_train2014_000000232689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359719, "question_id": "6fJdCzfeceWUxgX4RukBbV", "question": "'what time of day is this ultimate frisbee game being played?", "choices": ["morning", "two pm", "noon", "night"], "correct_choice_idx": 3, "direct_answers": ["midday", "night", "night", "night", "night time", "afternoon", "night", "night", "night", "night"], "difficult_direct_answer": false, "rationales": ["The game is being played at night.", "It is very dark outside", "There is still light, so it might be afternoon."], "image": "train2014/COCO_train2014_000000359719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200033, "question_id": "6fVzBF8SFSiCopzQUMJNMf", "question": "What is the man in the yellow shirt playing?", "choices": ["drums", "guitar", "buckets", "harp"], "correct_choice_idx": 1, "direct_answers": ["guitar", "guitar", "guitar", "guitar", "guitar", "guitar", "guitar", "guitar", "guitar", "guitar"], "difficult_direct_answer": false, "rationales": ["That stick he's holding is part of a string instrument that you can tune.", "The man in the yellow shirt can be seen holding an instrument that has a long neck with tuning keys.", "It is a stringed instrument which he holds on his lap."], "image": "train2014/COCO_train2014_000000200033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291320, "question_id": "6fWwCapQRv9na7iyMPHB5B", "question": "Why is the man wearing a yellow vest?", "choices": ["costume", "disguise", "visibility", "fashion"], "correct_choice_idx": 2, "direct_answers": ["visibility", "visibility", "maintenance", "visibility safety", "safety", "visibility", "bus driver", "safety", "safety", "safety"], "difficult_direct_answer": false, "rationales": ["To be easily be seen in the area as it make him different.", "The person is wearing that when it's dark.", "He is wearing a safety vest"], "image": "train2014/COCO_train2014_000000291320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274541, "question_id": "6fX9oZyiKviMgY7gqzHUpN", "question": "What type of electronic devices are they using?", "choices": ["cell phone", "tablet", "desktop computer", "laptop computer"], "correct_choice_idx": 2, "direct_answers": ["desktop computer", "laptops", "laptop", "laptops", "laptops", "laptops", "laptops", "laptops", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["There are laptops.", "The people are using laptops plugged into outlets in the conference room.", "The computers are visible that are in use and they are portable folding computers that are known as laptops."], "image": "val2014/COCO_val2014_000000274541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59207, "question_id": "6fnzNeFBvXB4pckNVmMxA9", "question": "Which one is likely to be the daughter?", "choices": ["front", "they're men", "back", "they're sisters"], "correct_choice_idx": 0, "direct_answers": ["driver", "driver", "front one", "driver", "front", "driver", "front one", "driver", "front", "driver"], "difficult_direct_answer": false, "rationales": ["The daughter should be younger than the mom.", "The woman at the front looks more young.", "The woman on the front is younger looking out of the two and most likely the daughter."], "image": "val2014/COCO_val2014_000000059207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491732, "question_id": "6fomMkRN775Vo4hU2rVMYR", "question": "In which position is the ball being served?", "choices": ["under handed", "none", "left handed", "over head"], "correct_choice_idx": 3, "direct_answers": ["up", "forward", "one back", "overhanded", "serve", "serve", "overhead", "over head", "outside", "tennis court"], "difficult_direct_answer": true, "rationales": ["The ball is served overhead by the man in a gray shirt.", "He is hitting it over his head to his opponent.", "The player has his arm stretched up over his head."], "image": "val2014/COCO_val2014_000000491732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545002, "question_id": "6g6kkbtFP8BJGWh8NMzqMY", "question": "What protective gear should the man wear?", "choices": ["headband", "scarf", "helmet", "knee pads"], "correct_choice_idx": 2, "direct_answers": ["helmet", "helmet", "helmet", "coat", "skateboard", "helmet", "helmet", "suit helmet", "jacket", "helmet"], "difficult_direct_answer": false, "rationales": ["He should have something on his head in case he falls.", "The man should wear a helmet for head protection.", "The gear is the helmet."], "image": "val2014/COCO_val2014_000000545002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279543, "question_id": "6gFFxRvQYkVdj6MsMDfNDk", "question": "What are the white fin shapes parts on the back of the plane called?", "choices": ["air tips", "spoilers", "horizontal stabilizers", "slats"], "correct_choice_idx": 2, "direct_answers": ["swirl", "vertical stabilizers", "wings", "tail", "tail", "fins", "brakes", "wings", "horizontal stabilizers", "tail wings"], "difficult_direct_answer": false, "rationales": ["These are attached to the plane on both sides to provide stability in the air and keep it flying straight.", "The shapes are stabilizers.", "They are positioned in a horizontal way and their purpose is to keep the plane from being unstabilized."], "image": "train2014/COCO_train2014_000000279543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24097, "question_id": "6gHBAtiQEC3XCcPCM4xqAC", "question": "Where are these elephants located?", "choices": ["circus", "wild", "parade", "zoo"], "correct_choice_idx": 2, "direct_answers": ["street", "street", "road", "street", "road", "street", "parade", "road", "street", "street"], "difficult_direct_answer": false, "rationales": ["The elephants are marching alongside people that are located outdoors on a paved street. there are also people lining the sides of the street who are watching.", "These animals are walking in a procession that is moving down the middle of a road with spectators on the side and elephants would not naturally walk in this way or space. these are features of a parade.", "These elephants are marching in the streets."], "image": "val2014/COCO_val2014_000000024097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80430, "question_id": "6gPKgj38GE9YkMA8gHMgvr", "question": "What is the man about to do?", "choices": ["bat", "swing", "dunk", "dribble"], "correct_choice_idx": 1, "direct_answers": ["swing", "hit ball", "hit ball", "hit ball", "serve ball", "serve ball", "hit ball", "hit ball", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The man is about to swing his racquet.", "The other options relate to baseball and basketball, respectively.", "He will move the racket so he can hit the ball back to the other player"], "image": "train2014/COCO_train2014_000000080430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539072, "question_id": "6gdvqQ85o4TRCzNM67ztVP", "question": "What type of shop is the one with the woman's picture in a circle on the window?", "choices": ["bakery", "bookstore", "music shop", "women's clothing"], "correct_choice_idx": 0, "direct_answers": ["bakery", "bakery", "bakery", "cafe", "cafe", "bakery", "cafe", "bakery", "bakery", "bakery"], "difficult_direct_answer": false, "rationales": ["This is swedish for \"bakery\".", "The woman in the picture is wearing an apron.", "The shop is a bakery."], "image": "train2014/COCO_train2014_000000539072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213393, "question_id": "6goUhjchQR42YLymuHx6k2", "question": "What is the best material for a skateboard?", "choices": ["maple", "pine", "palm", "coconut"], "correct_choice_idx": 0, "direct_answers": ["wood", "maple wood", "maple", "wood", "wood", "wood", "wood", "unknown", "maple wood", "maple"], "difficult_direct_answer": false, "rationales": ["Its not soft and its light.", "The board is made of that wood.", "The best material is a strong wood material."], "image": "train2014/COCO_train2014_000000213393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289621, "question_id": "6gtAQDp7mM8VvqUAiVfKXV", "question": "What kind of brush is being used?", "choices": ["paint", "hair", "tooth", "pet"], "correct_choice_idx": 0, "direct_answers": ["paint", "paint", "paint", "paint", "paint brush", "paint brush", "paint brush", "paint brush", "paint", "paint brush"], "difficult_direct_answer": false, "rationales": ["A paintbrush is ebing used near the paint cans.", "The woman is holding the brush on a paper. there is paint near her.", "As evident by the cans of a."], "image": "train2014/COCO_train2014_000000289621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142014, "question_id": "6gxDMgcbituf7vzpV7ewwX", "question": "What kind of sign is shown?", "choices": ["traffic", "name", "brand", "price"], "correct_choice_idx": 0, "direct_answers": ["bike lane", "stop", "stop sign", "bike lane", "stop", "stop sign", "traffic", "stop", "traffic", "stop"], "difficult_direct_answer": false, "rationales": ["You can tell by the shape and the wording as to what type of sign is being shown.", "The stop sign is there to direct the cars on how to operate the vehicles on the road.", "The stop sign is a traffic sign."], "image": "train2014/COCO_train2014_000000142014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385405, "question_id": "6hNH7q6mfqXopBumcNHP95", "question": "What type of store is this?", "choices": ["shoe", "grocery", "pet", "beauty"], "correct_choice_idx": 2, "direct_answers": ["pet", "bird store", "pet shop", "pet", "pet", "pet", "pet store", "pet store", "pet", "pet"], "difficult_direct_answer": false, "rationales": ["There are cages hanging or stacked along the store containing different birds for sale.", "This is a pet store that appears to specialize in selling birds, as many cages with birds in them are quite visible here, but no dogs, cats or fish are seen.", "It has many cages with animals inside them."], "image": "train2014/COCO_train2014_000000385405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406116, "question_id": "6hTjo2jj4YMAREKiVxtHma", "question": "The woman using the cell phone traveled to which Caribbean country?", "choices": ["bahamas", "jamaica", "puerto rico", "cuba"], "correct_choice_idx": 0, "direct_answers": ["bahamas", "bahamas", "saint lucia", "jamaica", "jamaica", "bahamas", "cuba", "norway", "jamaica", "bahamas"], "difficult_direct_answer": false, "rationales": ["The woman has been to the bahamas.", "It looks like there is a flag of the bahamas on this woman's keychain.", "A woman is holding a phone and some keys. the key has a flag with blue and yellow on it."], "image": "train2014/COCO_train2014_000000406116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134079, "question_id": "6hvkUAuUNTduNAeXGyQrUx", "question": "What company is famous for making the item the man has that is covering his legs?", "choices": ["mizuno", "green giant", "nathan's", "levi strauss"], "correct_choice_idx": 3, "direct_answers": ["levi", "levis", "levi strauss", "book", "levis", "levis", "sony", "levi strauss", "levi's", "levis"], "difficult_direct_answer": false, "rationales": ["They have made denim clothing for over a hundred years", "The man on the bench is wearing denim jeans which are made by companies such as levi strauss,.", "The man is most likely wearing a pair of levi's, the most famous jeans in the world. levi strauss opened up for business in 1873 and never slowed down since then."], "image": "train2014/COCO_train2014_000000134079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380552, "question_id": "6iJfSJBR3x269pmCdtpmb3", "question": "If you want to use this transport what can you feed it?", "choices": ["electricity", "carrots", "coal", "gas"], "correct_choice_idx": 1, "direct_answers": ["hay", "hay", "hay", "hay", "hay", "hay", "carrots", "grass", "hay", "grass"], "difficult_direct_answer": false, "rationales": ["Horses eat orange vegetables as a treat.", "The animal in this image is a horse. a carrot is the only item here listed which would be fed to a horse.", "Carrots are the only food item on the list."], "image": "val2014/COCO_val2014_000000380552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241058, "question_id": "6iP9eCfDgSSEAqfDPoUKqW", "question": "The kitchen is currently in the process of what residence-related activity?", "choices": ["selling", "new construction", "remodeling", "demolition"], "correct_choice_idx": 2, "direct_answers": ["remodeling", "renovation", "renovation", "cooking", "yes", "moving out", "renovation", "remodeling", "cleaning", "being cleaned"], "difficult_direct_answer": false, "rationales": ["The cabinets have stickers on them still", "The kitchen here is undergoing renovation.", "There are stickers on the cabinets which indicates they were just installed"], "image": "train2014/COCO_train2014_000000241058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435010, "question_id": "6iQcc26MLArXLDk7T5TaGX", "question": "Where is this man located?", "choices": ["desert", "ski resort", "beach", "mountains"], "correct_choice_idx": 2, "direct_answers": ["near beach", "beach", "skate ramp", "skatepark beach", "bendcrete skatepark", "park", "edge bowl", "beach", "skate park", "skate park"], "difficult_direct_answer": false, "rationales": ["There is sand near the man. a body of water is past the sand.", "There is sand and ocean near the man.", "A man is skateboarding and a sandy area with water can be seen behind him."], "image": "train2014/COCO_train2014_000000435010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234516, "question_id": "6iYgejWGZPbefXqFAAbyt4", "question": "What kind of store are the trucks parked in front of?", "choices": ["electronics", "toy", "grocery", "hardware"], "correct_choice_idx": 3, "direct_answers": ["paint", "paint", "interior design", "home supply", "pain", "home improvement", "paint store", "hardware", "paint", "paint store"], "difficult_direct_answer": false, "rationales": ["Based on the signs in the window, this is an old-fashioned hardware store.", "The store advertises paint and wallpaper among other things so it is for home improvement and repair", "The store says that they sell things such as paint and glass."], "image": "train2014/COCO_train2014_000000234516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334400, "question_id": "6iZkdppGCw92heuUnBr7De", "question": "Why is the man's shirt yellow?", "choices": ["camouflage", "visibility", "dress code", "fashion"], "correct_choice_idx": 1, "direct_answers": ["parking advertising", "increased visibility", "parking attendant", "parking attendant", "visibility", "visibility", "visibility", "design", "waiter", "indicator"], "difficult_direct_answer": false, "rationales": ["He is a parking attendant so it is necessary for travelers to be able to spot him easily.", "The highlighter yellow color helps people see him.", "The man wears it to be visible in the dark."], "image": "train2014/COCO_train2014_000000334400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48531, "question_id": "6if9krYfpHS62mPQXFE3BT", "question": "What will the player do next?", "choices": ["block", "dribble", "swing", "run"], "correct_choice_idx": 2, "direct_answers": ["hit ball", "serve", "hit", "hit ball", "hit ball", "hit ball", "hit ball", "hit ball", "swing", "serve"], "difficult_direct_answer": false, "rationales": ["He is reaching up to swing at the ball as it comes to him.", "The man is playing tennis and is about to hit the ball with his racquet.", "He has thrown the ball in the air to serve it"], "image": "train2014/COCO_train2014_000000048531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434187, "question_id": "6isctxbLAe9N5VDyDQBnxJ", "question": "Why is the man holding a phone out in front of him?", "choices": ["sending email", "scanning barcode", "reading text", "taking photo"], "correct_choice_idx": 3, "direct_answers": ["take photo", "photo", "taking photo", "picture", "photography", "taking picture", "taking picture", "taking photo", "taking picture", "taking picture"], "difficult_direct_answer": false, "rationales": ["He is holding the phone up toward the people behind bars which means he is taking a photo. he would hold the phone closer to himself if he was reading it.", "The man wants a photo.", "People take photos with their phones."], "image": "val2014/COCO_val2014_000000434187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84162, "question_id": "6jEQhthcyuKFyKRSenT3kx", "question": "What vehicle used for this water transportation?", "choices": ["yacht", "canoe", "cargo ship", "raft boat"], "correct_choice_idx": 2, "direct_answers": ["ferry", "small", "ship", "cruise ship", "ferry", "ship", "ship", "cargo ship", "cruise", "ships"], "difficult_direct_answer": false, "rationales": ["Cargo ships are massive enough to account for the size of this water transportation.", "The question isn't worded clearly, all answers provided would use water for transportation.", "The ship is transporting cargo."], "image": "train2014/COCO_train2014_000000084162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42785, "question_id": "6jP4GGbFughwS37jYJAA3D", "question": "This dish is usually eaten using what?", "choices": ["hands", "chopsticks", "spoon", "fork"], "correct_choice_idx": 0, "direct_answers": ["hands", "fork", "hands", "hands", "hands", "hands", "hands fork", "hands", "hands", "hands"], "difficult_direct_answer": false, "rationales": ["Most people pick up a piece of pizza and eat it without eating utensils.", "Pizza is usually cut into manageable slices so one doesn't have to use utensils.", "Most people just pick it up to eat it."], "image": "train2014/COCO_train2014_000000042785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506075, "question_id": "6jbofhHMM9vEQ8ndWu6NhP", "question": "What activity is the individual engaging in?", "choices": ["boxing", "climbing", "running", "taekwondo"], "correct_choice_idx": 0, "direct_answers": ["video game", "gaming", "boxing", "playing wii", "playing wii", "playing wii", "video gaming", "holding remote", "wii", "video game"], "difficult_direct_answer": false, "rationales": ["He is pushing his fist forward like he is punching.", "Most likely because the persons hand positions and the remotes in there hands it looks like boxing.", "They are playing with a wii."], "image": "train2014/COCO_train2014_000000506075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365313, "question_id": "6kDfwrWCVE7qCM4gZ8mvGb", "question": "What is the tall white object behind the couch?", "choices": ["barrier", "statue", "vase", "lamp"], "correct_choice_idx": 3, "direct_answers": ["lamp", "lamp", "lamp", "lamp", "lamp", "lamp", "lamp", "lamp", "lamp", "lamp"], "difficult_direct_answer": false, "rationales": ["By the design, shade and cord connected to it, it's easy to tell that is a lamp.", "There is a tall white light behind the couch.", "You can tell by the shade and shape as to what type of appliance it is."], "image": "train2014/COCO_train2014_000000365313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231655, "question_id": "6kP39icDVL4o5ft54urRkT", "question": "What material is this grey boat made of?", "choices": ["carbon fiber", "rubber", "metal", "wood"], "correct_choice_idx": 1, "direct_answers": ["plastic", "fiberglass", "rubber", "rubber", "rubber", "tube", "plastic", "rubber", "rubber", "rubber"], "difficult_direct_answer": false, "rationales": ["Those types of rafts are always made from it because it's waterproof and durable.", "The material is rubber.", "The grey boat is made of rubber and is floating."], "image": "val2014/COCO_val2014_000000231655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514601, "question_id": "6kQxZamWwTVuCauLVEPj5N", "question": "Why are there signs hanging from the ceiling?", "choices": ["advertisements", "identify benches", "guide travelers", "cameras"], "correct_choice_idx": 2, "direct_answers": ["directions", "visibility", "show times", "notification signage", "train times", "for directions", "track information", "directions", "guide travelers", "info"], "difficult_direct_answer": true, "rationales": ["The signs provide information related to each track like destinations and times so that riders know which track to go to and when they need to be there.", "There are signs that show when trains will arrive, which give information to travelers.", "The signs guide travelers."], "image": "train2014/COCO_train2014_000000514601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522146, "question_id": "6kSSc2VVUDQ2oo2hwYb9ti", "question": "Why are there ropes attached to the boat?", "choices": ["to swing", "to fish", "to climb", "to anchor"], "correct_choice_idx": 3, "direct_answers": ["anchor boat", "movement prevention", "anchor", "to anchor", "anchor", "anchors", "pulling raft", "moorings", "hold it", "tethered"], "difficult_direct_answer": true, "rationales": ["When a ship needs to stop for a period of time the captain will throw an anchor in the water to keep the boat from moving.", "The boat does not appear to be occupied. to keep a boat from drifting off while unoccupied it needs to be anchored and anchors utilize ropes to connect them to the boat.", "The ropes help keep the boat from floating away."], "image": "train2014/COCO_train2014_000000522146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19542, "question_id": "6kaFiv8iHmFyEtLMy5WR6P", "question": "The car is operating during which season?", "choices": ["summer", "spring", "winter", "fall"], "correct_choice_idx": 2, "direct_answers": ["spring", "winter", "spring", "winter", "summer", "winter", "summer", "summer", "current season", "autumn"], "difficult_direct_answer": false, "rationales": ["The trees in the back look dead.", "The trees in the distance are bare and have lost their leaves due to the cold weather.", "The car is at the winter."], "image": "train2014/COCO_train2014_000000019542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177994, "question_id": "6kttzKq4JWLYM3YjXa9Uge", "question": "What train car is this a virtual depiction of?", "choices": ["coach", "dining", "sleeper", "business"], "correct_choice_idx": 1, "direct_answers": ["dining", "dining car", "dining", "orient express", "restaurant", "dinner car", "orient express", "dining", "dining car", "dining car"], "difficult_direct_answer": false, "rationales": ["There are dining plates and a tablecloth on the table.", "An area has tables set with dishes and tablecloths.", "It has chairs up to tables with dishes on them ready to have food served"], "image": "val2014/COCO_val2014_000000177994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391254, "question_id": "6mGbteBCDZuUotPzMHLCq6", "question": "What kind of protest is taking place?", "choices": ["racial justice", "union workers", "religious", "political"], "correct_choice_idx": 2, "direct_answers": ["religious bigotry", "religious", "religious", "homo sexuality", "religious", "religious", "anti-gay", "peaceful", "religious", "anti-gay"], "difficult_direct_answer": false, "rationales": ["There are bible verses.", "The words on the signs are about going to hell.", "The protest sign refers to the bible."], "image": "train2014/COCO_train2014_000000391254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254774, "question_id": "6mWzsrnzj67X58JfmJxZGf", "question": "What type of trick has the person in blue done?", "choices": ["superman", "mctwist", "flip", "grind"], "correct_choice_idx": 2, "direct_answers": ["flip", "flip", "spin", "flip", "loop", "flip", "skateboard", "tumble", "flip", "flip"], "difficult_direct_answer": false, "rationales": ["He is flipping in the air.", "The person in blue is doing a flip on the ski slope.", "They are doing gymnastics in the air."], "image": "train2014/COCO_train2014_000000254774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46149, "question_id": "6mnhNWAHfRhAEpc3acVrJV", "question": "Where are the people located?", "choices": ["canada", "africa", "antarctica", "us"], "correct_choice_idx": 1, "direct_answers": ["asia", "asia", "parade", "india", "india", "africa", "on elephants", "india", "india", "india"], "difficult_direct_answer": false, "rationales": ["The buildings are in africa", "The people on the elephants are in africa where those animals are native.", "In africa these types of elephants can be found."], "image": "train2014/COCO_train2014_000000046149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11034, "question_id": "6msRJX9cf29nAmEYQKnbAU", "question": "They have appropriate accommodations for which one of these animals?", "choices": ["snake", "ant", "gerbil", "guppy"], "correct_choice_idx": 3, "direct_answers": ["fish", "fish", "fish", "fish", "fish", "fish", "fish", "guppy", "fish", "fish"], "difficult_direct_answer": false, "rationales": ["A guppy can swim in the fish tank.", "This is a fish and there is an aquarium in the room", "They have a fish tank in the back ground for their fish."], "image": "val2014/COCO_val2014_000000011034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389031, "question_id": "6nZHPP62khjxnemUWokArD", "question": "Why are they under umbrellas?", "choices": ["rain", "privacy", "sun", "snow"], "correct_choice_idx": 1, "direct_answers": ["rain", "for effects", "rain", "from rain", "raining", "rain", "privacy", "raining", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["This could be to keep the elements off of them but also to have some privacy.", "The people seated under the colorful umbrellas are looking to not be bothered by others.", "There is rain."], "image": "train2014/COCO_train2014_000000389031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560391, "question_id": "6nipfVH9s2WzaCw5HW8Nzp", "question": "What are the stacks of newspaper for?", "choices": ["reading material", "selling them", "cleaning area", "hold fruit"], "correct_choice_idx": 3, "direct_answers": ["hold fruit", "for drainage", "wrap", "packaging", "presenting food", "lining", "wrapping", "preserving fruit", "displays", "wrapping food"], "difficult_direct_answer": true, "rationales": ["The newspapers are holding a variety of bananas.", "Fruit is displayed on top of newspaper. newspapers are used to wrap and line things.", "The newspapers hold fruit."], "image": "train2014/COCO_train2014_000000560391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91636, "question_id": "6nmgeQcNSPgmCTan6xMHZz", "question": "Who manufactured this motorcycle?", "choices": ["honda", "harley davidson", "suzuki", "kawasaki"], "correct_choice_idx": 0, "direct_answers": ["honda", "honda", "honda", "honda", "harley", "harley", "honda", "harley davidson", "honda", "honda"], "difficult_direct_answer": false, "rationales": ["A honda logo is on a motorcycle.", "The motorcycle was made by honda and says honda on the front below the headlight.", "The motorcycle has the name honda printed on the front of the body."], "image": "val2014/COCO_val2014_000000091636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330204, "question_id": "6oCmYjQxXaffjq5ZtDapD9", "question": "In what state was this vehicle operational based on its information screen?", "choices": ["new york", "colorado", "california", "ohio"], "correct_choice_idx": 3, "direct_answers": ["ohio", "ohio", "california", "ohio", "fairly good", "ohio", "ohio", "ohio", "cleveland", "ohio"], "difficult_direct_answer": false, "rationales": ["Cleveland ohio is on the bus.", "The destination sign on the bus says \"cleveland\". cleveland is a city located in this state.", "The state is ohio."], "image": "val2014/COCO_val2014_000000330204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54421, "question_id": "6oKKWBN3GYT2kRZaTqdoMt", "question": "What type bear does the photographer favor?", "choices": ["bud", "none", "coors", "heineken"], "correct_choice_idx": 3, "direct_answers": ["heineken", "light", "heineken", "teddy", "uw0tm8", "heineken", "meat", "polar", "unknown", "brown"], "difficult_direct_answer": false, "rationales": ["The beer's name is on the glass.", "The heineken is favored.", "There is a glass that says heineken on the table."], "image": "val2014/COCO_val2014_000000054421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191511, "question_id": "6oTDJxzdC6F5uL62VEMKrB", "question": "What is the blue round object sitting on the desk a model of?", "choices": ["planet", "gyroscope", "moon", "globe"], "correct_choice_idx": 3, "direct_answers": ["world", "globe", "earth", "earth", "earth", "globe", "earth", "globe", "globe", "globe"], "difficult_direct_answer": false, "rationales": ["The blue round object on the desk is a globe that represents the earth.", "Traditionally globes show countries of the planet and show the blue depicting the ocean.", "A globe is sitting on the desk. this is useful for geography."], "image": "train2014/COCO_train2014_000000191511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141755, "question_id": "6p2iQYY2xdvVeevH4kX62w", "question": "What type of building is in the distance?", "choices": ["hospital", "store", "residence", "church"], "correct_choice_idx": 2, "direct_answers": ["house", "big house", "house", "residence", "houses", "house", "house", "cottage", "house", "house"], "difficult_direct_answer": false, "rationales": ["The building in the distance doesn't appear to be commercial or municipal, so based on its appearance, it can be assumed to be a residence.", "A residential home is in the distance.", "The building is a residence."], "image": "val2014/COCO_val2014_000000141755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570128, "question_id": "6pUseyL2PBgav8ofYKePfM", "question": "How did the man get his hair to stand up?", "choices": ["steam", "glue", "gel", "water"], "correct_choice_idx": 2, "direct_answers": ["gel", "combing it", "hair gel", "mousse", "gel", "gel", "gel", "gel", "gel", "gel"], "difficult_direct_answer": false, "rationales": ["People use gel to style their hair.", "He used a product in his hair to get it to do that", "The man uses that to make his hair sticking up."], "image": "train2014/COCO_train2014_000000570128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360307, "question_id": "6pdtwmrNZ2FUGKuzVjjei3", "question": "Who is known for playing the same position as the man with the black wristband?", "choices": ["gary carter", "otis nixon", "mike trout", "trevor story"], "correct_choice_idx": 0, "direct_answers": ["bill klem", "mike piazza", "johnny bench", "mike socia", "piazza", "javiar molina", "gary carter", "baseball player", "yogi berry", "unknown"], "difficult_direct_answer": true, "rationales": ["The other options are baseball players but don't play the position of catcher.", "The man with the wristband plays the same position as gary carter.", "A man is playing in the catcher position as another player bats."], "image": "train2014/COCO_train2014_000000360307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18918, "question_id": "6pj5zHRhMoKiX74wcK3Tdj", "question": "How is dough kneaded here?", "choices": ["in sink", "by machine", "with knives", "hand only"], "correct_choice_idx": 1, "direct_answers": ["by machine", "mixer", "machine", "machine", "mixing bowl", "mixer", "by mixer", "mixer", "machine", "machine"], "difficult_direct_answer": false, "rationales": ["There is a large whisking machine to the left. people put stuff in it and turn it on to knead the dough.", "The place has an appliance just made for kneading bread.", "There are large machines to make dough."], "image": "val2014/COCO_val2014_000000018918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103711, "question_id": "6psMHriktJX5ktik5aAAYL", "question": "What is the man in the centre holding?", "choices": ["bat", "sword", "rifle", "ax"], "correct_choice_idx": 2, "direct_answers": ["gun", "rifle", "gun", "rifle", "gun", "gun", "ak47", "gun", "gun", "gun"], "difficult_direct_answer": false, "rationales": ["The man is in military uniform and holding a large automatic weapon that based on style, material and size is answer a.", "The man in the centre is a soldier. he is carrying an ak-47.", "The man in the center is dressed in military uniform and holding a rifle."], "image": "train2014/COCO_train2014_000000103711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182860, "question_id": "6qA4Caku3XGSustV6yxjWF", "question": "Which national flag is in the segment screen of this broadcast?", "choices": ["france", "uk", "netherlands", "german"], "correct_choice_idx": 1, "direct_answers": ["united kingdom", "uk", "usa", "great britain", "union jack", "unknown", "england", "england", "united kingdom", "united kingdom"], "difficult_direct_answer": false, "rationales": ["The other options have different flags.", "There is a flag for the united kingdom on the screen.", "The flag that is depicted is from the uk."], "image": "train2014/COCO_train2014_000000182860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143554, "question_id": "6qLY7fkVD3Q4FFizQX3vx4", "question": "What is he doing with the frisbee?", "choices": ["holding it", "throwing it", "balancing it", "twirling it"], "correct_choice_idx": 2, "direct_answers": ["fingers", "playing", "spinning frisbee", "spinning it", "tricks", "tricks", "showing off", "spinning", "twirling it", "balancing it"], "difficult_direct_answer": true, "rationales": ["The man is trying to keep it upright.", "He is balancing it on his finger.", "The man is trying to balance the frisbee on just one of his fingers."], "image": "val2014/COCO_val2014_000000143554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360143, "question_id": "6qeLtDpnfCNLkXWj5TTZx9", "question": "Why is he leaning sideways?", "choices": ["bouncing", "balance himself", "falling", "jumping"], "correct_choice_idx": 1, "direct_answers": ["surfing", "balance", "balance", "wave movement", "surfing", "balance", "balancing", "balance himself", "balance", "maintain balance"], "difficult_direct_answer": false, "rationales": ["A person is surfing on a wave in the ocean.", "Trying to keep himself on the board as he moves.", "The man is trying not to fall."], "image": "val2014/COCO_val2014_000000360143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252244, "question_id": "6qrAU3AhjvABQrTXbjRe5q", "question": "What part of his body will be most harmed by the item in his mouth?", "choices": ["back", "lungs", "feet", "eyes"], "correct_choice_idx": 1, "direct_answers": ["lungs", "lungs", "secrete", "lungs", "secrete", "lungs", "lungs", "lungs", "lungs", "secrete"], "difficult_direct_answer": false, "rationales": ["Inhaling heated smoke causes tissue damage", "And his immune system.", "Because cigarettes' are harmful to the body especially the lungs, as they result to chronic diseases."], "image": "train2014/COCO_train2014_000000252244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521288, "question_id": "6quVcfDsgC3HW8oB86krov", "question": "Whos is the little elephant likely following?", "choices": ["friend", "teacher", "mother", "great grandfather"], "correct_choice_idx": 2, "direct_answers": ["mom", "mother", "mother", "mother", "mother", "its mother", "few elephants", "mom", "big elephant", "it's mom"], "difficult_direct_answer": false, "rationales": ["It could be the other options too, but this is the most likely reason.", "The elephant is following the larger one.", "Baby elephants usually only follow their mothers and are comfortable enough to latch onto them."], "image": "val2014/COCO_val2014_000000521288.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476785, "question_id": "6qzKif7AVgUepKiuKotXfV", "question": "What is the woman staring at?", "choices": ["cat", "dog", "television", "man"], "correct_choice_idx": 3, "direct_answers": ["man", "man", "man", "boy", "man", "boy", "man", "man", "man", "man"], "difficult_direct_answer": false, "rationales": ["The woman is making eye contact with the guy that is in front of her.", "The woman's eyes are not on her computer but are instead directly on the person in the striped shirt sitting across from her.", "The woman is seated in front of the man across from her, looking at him."], "image": "train2014/COCO_train2014_000000476785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315303, "question_id": "6rRUCRJKP27mhXvXFzvpNm", "question": "What necessary condition hasn't been met for this vehicle to travel?", "choices": ["coal burning", "water underneath", "nice weather", "wind blowing"], "correct_choice_idx": 1, "direct_answers": ["water", "high tide", "windy", "water", "water", "water underneath", "water", "high tide", "turned on", "high tide"], "difficult_direct_answer": false, "rationales": ["Water is necessary for the boat to float and be propelled along.", "The boat is on sand, and needs to be in water.", "The other options wouldn't apply to this type of boat. and it doesn't need b because it has an engine and fan for that."], "image": "val2014/COCO_val2014_000000315303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152731, "question_id": "6rjJCg5fgStcai5tZbp9Qq", "question": "What terrain is featured here?", "choices": ["desert", "plain", "savanna", "beach"], "correct_choice_idx": 1, "direct_answers": ["plateau", "field", "field", "plains", "grasslands", "plains", "grassy", "tundra", "plain", "plain"], "difficult_direct_answer": false, "rationales": ["It is mostly flat land with low vegetation", "There is very little mountains or trees which is characteristic of this type of terrain.", "This is a grassy area with sheep."], "image": "val2014/COCO_val2014_000000152731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414495, "question_id": "6sAJ3UZcTXT29wBRD2C5gp", "question": "What is in the can on the back of the toilet?", "choices": ["hairspray", "cleanser", "air freshener", "shampoo"], "correct_choice_idx": 2, "direct_answers": ["room freshener", "air freshener", "air freshener", "air fresher", "air freshener", "air freshener", "air freshener", "cat", "air freshener", "febreze"], "difficult_direct_answer": false, "rationales": ["It has a spray top and this is the bathroom which will have odors", "There is toilet spray on the tank.", "Those cans are normally kept in the bathroom to eliminate the odors from a bathroom."], "image": "train2014/COCO_train2014_000000414495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526922, "question_id": "6sFCG9zZknV7Skugs3nGLC", "question": "What type of problem is happening?", "choices": ["snowstorm", "traffic jam", "thunderstorm", "house fire"], "correct_choice_idx": 1, "direct_answers": ["traffic", "traffic jam", "traffic jam", "traffic jam", "traffic", "traffic jam", "traffic", "traffic jam", "traffic jam", "traffic jam"], "difficult_direct_answer": false, "rationales": ["There are several vehicles crammed together on a road", "There are several vehicles stopped in the same area", "A number of vehicles are on the road and very close to each other, which is indicative of a traffic jam on a street."], "image": "train2014/COCO_train2014_000000526922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461805, "question_id": "6siSmsjca5WivB2eqGNKNQ", "question": "What is the event shown in the picture?", "choices": ["food festival", "carnival", "car parking", "car show"], "correct_choice_idx": 3, "direct_answers": ["motorcycle race", "automotive convention", "car show", "motorcycle show", "car show", "bike event", "rally", "motorcycle show", "motorcycle club", "bike show"], "difficult_direct_answer": false, "rationales": ["They are at a car show", "A show is about to start featuring bikes and cars.", "This event is a car show."], "image": "val2014/COCO_val2014_000000461805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300578, "question_id": "6smidLMTAzohcxvu2ocNNq", "question": "What is the woman on the right holding in her hand?", "choices": ["squash", "potatoes", "cabbage", "watermelon"], "correct_choice_idx": 2, "direct_answers": ["umbrella", "lettuce", "cabbage", "lettuce", "cabbage", "bag", "large vegetable", "cabbage", "lettuce", "money"], "difficult_direct_answer": false, "rationales": ["She has a cabbage in her hand that she is probably going to sell.", "The woman has a cabbage in her arms.", "A woman in a coat vest is holding up a greenish and white vegetable."], "image": "train2014/COCO_train2014_000000300578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265225, "question_id": "6tAVqSFXQCGBJuyFKvM5aX", "question": "What in the room has to be changed before new guests arrive?", "choices": ["curtains", "linens", "office chair", "sofa"], "correct_choice_idx": 1, "direct_answers": ["sheets", "bed sheets", "sheets", "sheets", "bed sheets", "linens", "sheets", "bed sheets", "sheets", "sheets"], "difficult_direct_answer": false, "rationales": ["The sheets and pillowcases are changed for every new guest.", "Sheets are always fresh for new visitors.", "The linens should always be swapped out since they can get dirty."], "image": "train2014/COCO_train2014_000000265225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339382, "question_id": "6tFEv9YLBixLjWYeGGgn8W", "question": "Why is he holding the umbrella?", "choices": ["confused", "food dry", "likes umbrella", "self dry"], "correct_choice_idx": 1, "direct_answers": ["raining", "cooking", "raining", "repel rain", "raining", "raining", "sun protection", "hot sun", "rain", "food dry"], "difficult_direct_answer": false, "rationales": ["He's holding it over himself and not the grill", "There is no point in having a bar-b-que if rain is going to go on the food. this chef is prepared.", "It is raining out and he is keeping himself and the food from getting wet."], "image": "train2014/COCO_train2014_000000339382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566757, "question_id": "6tGHtVYUMSbmM4EFoX3p3D", "question": "What will persons on train most likely do next?", "choices": ["eat dinner", "board train", "sing", "get off"], "correct_choice_idx": 3, "direct_answers": ["get off", "exit", "get off", "alight", "stop", "get off", "get off", "travel", "ride train", "sit"], "difficult_direct_answer": false, "rationales": ["They are on the train and will exit at their stop.", "The people already boarded the train. it would not be appropriate to eat or sing on a train.", "People ride trains to get to a destination. once at the destination they disembark from the train."], "image": "val2014/COCO_val2014_000000566757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258628, "question_id": "6tH5AVh4fNyciKw5d6hA3i", "question": "Who will get the point being played?", "choices": ["shown player", "let", "no one", "opposite player"], "correct_choice_idx": 3, "direct_answers": ["opposite player", "opponent", "other player", "tommy", "server", "tommy paul", "other player", "white shirt", "opponent", "opponent"], "difficult_direct_answer": false, "rationales": ["The opponent on the other side of the net.", "The ball is about to hit the court on this person's side.", "The opposite player will get the point since the ball is on the ground."], "image": "val2014/COCO_val2014_000000258628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238749, "question_id": "6tNnpBhz7GthUHB7ASU4gD", "question": "Why is there a lid on the yellow cup?", "choices": ["prevent spills", "to hit", "decorative purposes", "to shake"], "correct_choice_idx": 0, "direct_answers": ["prevent spilling", "prevent spilling", "prevent spills", "child", "prevent spilling", "prevent spilling", "soup", "prevent spills", "prevent spills", "no spill"], "difficult_direct_answer": false, "rationales": ["The tops on plastic glasses are for kid because they're likely to knock the glass over or drop it.", "Accidents can happen and the lid prevents them.", "The person is a child, and has a higher chance of knocking over the cup."], "image": "train2014/COCO_train2014_000000238749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479612, "question_id": "6tZV5Y4ZhWqXGdRff2NXYH", "question": "What company makes the appliance?", "choices": ["sunbeam", "sharp", "panasonic", "dyson"], "correct_choice_idx": 1, "direct_answers": ["sharp", "sharp", "sharp", "sharp", "sharp", "sharp", "sharp", "sharp", "sharp", "sharp"], "difficult_direct_answer": false, "rationales": ["Sharp is the company whose logo appears.", "A brand logo is on the front of an appliance.", "The microwave's brand is on its door."], "image": "val2014/COCO_val2014_000000479612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324603, "question_id": "6tqBoWHB6qZvLqZzzTr4Ey", "question": "What hair style does the girl in blue have?", "choices": ["pig tails", "mullet", "pony tail", "crew cut"], "correct_choice_idx": 2, "direct_answers": ["pony tail", "neat", "ponytail", "pony tail", "ponytail", "ponytail", "ponytail", "ponytail", "pony tail", "ponytail"], "difficult_direct_answer": false, "rationales": ["Her hair has one scrunchie near at the back of her head while the rest below the scrunchie are freeflowing.", "Her hair is pulled back in a pony tail", "The girl in the blue has all of her hair pulled away from her face, gathered in the back, and secured with one hair tie."], "image": "train2014/COCO_train2014_000000324603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293274, "question_id": "6uWnttMZXixo48pN5YP3BN", "question": "What is the flag called that is on the poster in the front window of the bus?", "choices": ["union jack", "french flag", "old glory", "american flag"], "correct_choice_idx": 0, "direct_answers": ["united kingdom", "union jack", "union jack", "union jack", "united kingdom", "union jack", "union jack", "union jack", "union jack", "union jack"], "difficult_direct_answer": false, "rationales": ["The flag in the bus window is the united states flag and is known as the union jack.", "The flag that's located on the bus is called a union jack.", "This is american flag as it resembles it."], "image": "train2014/COCO_train2014_000000293274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93763, "question_id": "6uYy7J4auSgJNrSadJVPhP", "question": "Where is the group focusing their attention?", "choices": ["poster", "screen", "speaker", "performer"], "correct_choice_idx": 1, "direct_answers": ["video screen", "video game", "television", "beer pong", "video games", "screen", "camera", "tv screen", "toward tv", "screen"], "difficult_direct_answer": true, "rationales": ["The group is watching something on the screen.", "The man has a wii remote, which means they are watching him play a video game.", "They are concentrating on the videogame which is being played on the screen."], "image": "val2014/COCO_val2014_000000093763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61460, "question_id": "6udTREKZLjHCaeSZZ8jY9P", "question": "What is the camera looking at?", "choices": ["beach", "chair", "campground", "floor"], "correct_choice_idx": 3, "direct_answers": ["luggage", "floor", "floor", "cast", "sidewalk", "two people", "ground", "ground", "floor", "luggage"], "difficult_direct_answer": false, "rationales": ["The camera is looking down on top of and past a hand. for this angle to be captured on the picture the camera must be pointed down to the ground or floor.", "The person is looking at the floor.", "The camera is pointing straight down and showing people's shoes"], "image": "train2014/COCO_train2014_000000061460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260618, "question_id": "6uhpLQtmwjwHdoQuWbqskP", "question": "What is next to the food?", "choices": ["egg timer", "map", "measuring cup", "pizza cutter"], "correct_choice_idx": 3, "direct_answers": ["cutlery", "fork", "pizza cutter", "knife", "vegetable", "pizza cutter", "fork", "pizza cutter", "fork/knife", "eating utensils"], "difficult_direct_answer": false, "rationales": ["It is a sharp, rounded tool used to slice food.", "The pizza cutter is near the pizza.", "There is a pizza cutter sitting on the tray next to the pizza."], "image": "val2014/COCO_val2014_000000260618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581451, "question_id": "6uuWRdbmwfWpHxnWgmgewq", "question": "What item on this hot dog is most unusual?", "choices": ["ketchup", "peas", "onions", "mustard"], "correct_choice_idx": 1, "direct_answers": ["peas", "green peas", "peas", "peas", "peas", "peas", "peas", "peas", "peas", "green peas"], "difficult_direct_answer": false, "rationales": ["Peas aren't generally on hot dogs.", "Hot dogs don't usually come with peas on them.", "Peas are not typically found on a hot dog; they are a vegetable eaten separately."], "image": "val2014/COCO_val2014_000000581451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51168, "question_id": "6uw8fHv46QBNrNvY7mUbra", "question": "During which season is the train operating?", "choices": ["fall", "summer", "winter", "spring"], "correct_choice_idx": 3, "direct_answers": ["summer", "summer", "summer", "spring", "spring", "spring", "spring", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["The grass is green and the trees are full of leaves so it's past spring", "There grass is bright green.", "A train is on tracks in a heavily wooded area. the trees and brush is all green and there are leaves on everything."], "image": "train2014/COCO_train2014_000000051168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484575, "question_id": "6vQ4osogtfxh9zZWKhn4Pi", "question": "Why is the ball above her racquet?", "choices": ["hitting ball", "is confused", "showing off", "is targeted"], "correct_choice_idx": 0, "direct_answers": ["midair", "swinging", "hitting ball", "protection", "airborne", "hitting", "returning volley", "aiming", "hand-eye coordination", "play"], "difficult_direct_answer": true, "rationales": ["The ball above her racket is there to be hit.", "The woman is trying to serve the ball.", "The woman is playing tennis. she is holding her tennis racket and is preparing to strike the yellow round object used to play the sport."], "image": "val2014/COCO_val2014_000000484575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73109, "question_id": "6va8bULdMHQWFFvTfzBQTy", "question": "Where can we find the sign above?", "choices": ["kitchen", "ocean", "road", "home"], "correct_choice_idx": 2, "direct_answers": ["road", "road", "stop", "intersection", "intersections", "intersectin", "street", "road", "street", "stop"], "difficult_direct_answer": false, "rationales": ["This is a traffic sign used to let drivers know what they can or should do", "It is a road sign.", "It is on the road ."], "image": "train2014/COCO_train2014_000000073109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51920, "question_id": "6wcZ7hxFLCV8LHmrbeK56G", "question": "Where does pizza come from?", "choices": ["america", "germany", "italy", "britain"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "italy", "italy", "restaurant", "italy", "italy", "italy", "italy", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["The pizza is from italy.", "Answer a is commonly known as the answer to the question and confirmed by an internet search.", "Pizza is an italian food."], "image": "val2014/COCO_val2014_000000051920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78982, "question_id": "6wk3DHPqptMHMLa9oVxRbm", "question": "What is being promoted on the right truck?", "choices": ["beer", "local radio", "milk", "cookies"], "correct_choice_idx": 0, "direct_answers": ["travelling", "disney", "beer", "pabst beer", "beer", "beer", "beer", "pabst beer", "bus", "beer"], "difficult_direct_answer": false, "rationales": ["A beer can is shown on the truck's side.", "The glass of liquid on the image has suds at the top. the bottom of a beer can will have a lot of suds.", "Pabst blue ribbon is on the truck. it is a well known brand."], "image": "val2014/COCO_val2014_000000078982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122861, "question_id": "6x4KQRiZPLjz5Reix3ZskQ", "question": "Which direction are the surfers going?", "choices": ["towards shore", "randomly", "along shore", "leaving shore"], "correct_choice_idx": 0, "direct_answers": ["inland", "south", "forward", "front", "towards shore", "forward", "toward camera", "beach", "towards shore", "forward"], "difficult_direct_answer": false, "rationales": ["Surfers are always surfing towards the beach.", "The waves are pushing them towards the sand.", "The surfers go toward the shore."], "image": "val2014/COCO_val2014_000000122861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191197, "question_id": "6x7zVeQgMM7Dh7ukH5pyjJ", "question": "What is the girl pointing to?", "choices": ["elbow", "head", "knee", "foot"], "correct_choice_idx": 1, "direct_answers": ["head", "glasses", "head", "head", "head", "camera", "head", "head", "her forehead", "her head"], "difficult_direct_answer": false, "rationales": ["Her index finger is rested against her noggin.", "The girl is holding a finger to her temple.", "The girl has her finger on her head."], "image": "train2014/COCO_train2014_000000191197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123412, "question_id": "6xVUyUEZ3EP3hf6hMSieCJ", "question": "What is the name for the symbols used on the clock?", "choices": ["wingdings", "cursive", "roman numerals", "widgets"], "correct_choice_idx": 2, "direct_answers": ["roman numerals", "roman numerals", "roman numerals", "roman numerals", "roman numerals", "roman numerals", "roman numerals", "roman numerals", "clock", "roman numerals"], "difficult_direct_answer": false, "rationales": ["Roman numerals are often used on clocks.", "Roman numerals refer to the numbers on the clock.", "The other options don't apply. these were historically used on analog clocks."], "image": "train2014/COCO_train2014_000000123412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278301, "question_id": "6xnF65fEUBnVr9uT6EsqDt", "question": "What is the gold lining in the doorway called?", "choices": ["gold leaf", "tinsel", "plastic", "ermine"], "correct_choice_idx": 3, "direct_answers": ["tinsel", "garland", "tree garland", "tinsel", "garland", "trim", "garland", "tinsel", "ermine", "tinsel"], "difficult_direct_answer": false, "rationales": ["The gold lining on the door is a shiny material called tinsel often seen on christmas trees.", "Ermine is a sparkly lining.", "The gold lining is often known as ermine."], "image": "train2014/COCO_train2014_000000278301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173340, "question_id": "6xpTuhR7WfJtsbEdT2X3aA", "question": "What is the problem with this picture?", "choices": ["too bright", "photoshopping", "too dark", "cropping"], "correct_choice_idx": 3, "direct_answers": ["cropping", "picture cropped", "cut off", "eye contact", "fingertips missing", "bit blurry", "crop distortion", "plate", "cut off", "fingers misaligned"], "difficult_direct_answer": true, "rationales": ["The picture is a little dark.", "The photo cuts off and the plate is distorted.", "You can tell by the broken lines in the picture as to what is wrong with it."], "image": "val2014/COCO_val2014_000000173340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34525, "question_id": "6xx6HfLnZ5oAgXNeerw7bK", "question": "The river here seem to have done what?", "choices": ["dammed up", "flooded", "meandered", "dried up"], "correct_choice_idx": 1, "direct_answers": ["flooded", "flooded", "flooded", "flooded", "flood", "flood", "flooded", "flooded", "flooded", "flooded"], "difficult_direct_answer": false, "rationales": ["The water is up over the grass and cattle are in it where they'd normally be along the sides", "The river has vegetation growing which is stretched across it. the only way that this could have happened is if there had been recent flooding.", "The other options don't match the image."], "image": "train2014/COCO_train2014_000000034525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579127, "question_id": "6yKgas4yasv3h7R9xNmpuL", "question": "What's the name of the skateboarding trick the man is doing?", "choices": ["tail slide", "aerial grab", "kick flip", "180"], "correct_choice_idx": 1, "direct_answers": ["ollie", "railing", "jump", "rail jump", "jump", "ollie", "leaping", "aerial grab", "kick flip", "half pipe"], "difficult_direct_answer": false, "rationales": ["The man is doing an aerial since he's in the air.", "The name is an aerial grab.", "He is reaching down to grab his board as he jumps in the air."], "image": "train2014/COCO_train2014_000000579127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495124, "question_id": "6yM8EoKGny4o4xUuzvhKyn", "question": "What can be done with the large flat white object?", "choices": ["iron clothes", "eat dinner", "store food", "open/close"], "correct_choice_idx": 3, "direct_answers": ["open", "open", "open door", "play nintendo", "detect smoke", "open/close", "opened", "opening", "exit", "play videogames"], "difficult_direct_answer": true, "rationales": ["It is a door.", "The large white door is what can both be opened and closed.", "The large flat white object is a door, not an ironing board, table, or fridge."], "image": "train2014/COCO_train2014_000000495124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42529, "question_id": "6yeJaxxdP36Ryn2MwV29xb", "question": "What period of the day is reflected in the image?", "choices": ["morning", "afternoon", "evening", "night"], "correct_choice_idx": 1, "direct_answers": ["daytime", "late afternoon", "midafternoon", "afternoon", "afternoon", "morning", "afternoon", "afternoon", "afternoon", "afternoon"], "difficult_direct_answer": false, "rationales": ["The day is very bright.", "It is afternoon vegas the clock on the tower shows the time to be shortly after 5 p.m.", "It is daylight in the image, and there are people interacting leisurely in a grassy area while traffic drives by. these activities are common during this part of the day."], "image": "train2014/COCO_train2014_000000042529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262869, "question_id": "6yp3Uz8trQaDdea4wkcwnc", "question": "What is the person wearing?", "choices": ["helmet", "suspenders", "goggles", "tie"], "correct_choice_idx": 2, "direct_answers": ["ski goggles", "yellow jacket", "winter coat", "ski pants", "snowsuit", "snowboard outfit", "parka", "snow suit", "winter gear", "goggles"], "difficult_direct_answer": true, "rationales": ["The person is wearing eyewear.", "The person is snowboarding. their face is covered.", "The object on their eyes are like protective glasses."], "image": "train2014/COCO_train2014_000000262869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70809, "question_id": "6ytzQ7kZtZJtW2tqXagfNL", "question": "What footwear company made the grey sneakers?", "choices": ["vans", "sketchers", "reebok", "nike"], "correct_choice_idx": 3, "direct_answers": ["nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["Their logo is on the side of the shoe.", "The logo is on the shoes.", "Nike is known for the swoosh logo."], "image": "train2014/COCO_train2014_000000070809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137004, "question_id": "6yzUa9Q8wpwpMbeNXRTym9", "question": "What are the tan baskets made out of?", "choices": ["plastic", "cotton", "straw", "aluminum"], "correct_choice_idx": 2, "direct_answers": ["reeds", "straw", "oranges", "wicker", "wood", "wicker", "straw", "wicker", "straw", "orange"], "difficult_direct_answer": false, "rationales": ["This is a sturdy natural material that can be woven into containers like the ones in the photo.", "They are made from straw", "These types of baskets are very inexpensive and they can be used to store many different things."], "image": "val2014/COCO_val2014_000000137004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226498, "question_id": "6yzcpjAn8dNAeBERdsccQB", "question": "Why are they all running in the same direction?", "choices": ["being chased", "going home", "return school", "chasing ball"], "correct_choice_idx": 3, "direct_answers": ["playing soccer", "soccer game", "playing soccer", "attack direction", "playing soccer", "playing", "chasing ball", "make goal", "chasing ball", "going forward"], "difficult_direct_answer": false, "rationales": ["The players are all either trying to kick the ball into their goal or steal it to kick it towards the other goal.", "Everyone is focused on a ball that is on the field.", "The people want the ball."], "image": "val2014/COCO_val2014_000000226498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44471, "question_id": "6zVsQqYL4DuhMMP7XGNqN6", "question": "What should the pedestrians do in this situation?", "choices": ["wait", "go", "say hi", "slow down"], "correct_choice_idx": 0, "direct_answers": ["stop", "stop", "stop walking", "not walk", "cross street", "stop", "stop", "stop", "wait", "wait"], "difficult_direct_answer": false, "rationales": ["A raised hand on a pedestrian traffic signal signifies that the person should not begin to walk across the street.", "The sign on the post is a red hand that indicates pedestrians should not walk yet.", "The logo tells people to wait."], "image": "train2014/COCO_train2014_000000044471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465677, "question_id": "6zd4vBEvaDSA23QWJpgBfN", "question": "Where is this man located?", "choices": ["florida", "canada", "maine", "alaska"], "correct_choice_idx": 0, "direct_answers": ["skate park", "park", "skate park", "beach", "skate park", "skate park", "skate park", "skate park", "florida", "skatepark"], "difficult_direct_answer": false, "rationales": ["A person is skateboarding in a sunny place with palm trees. florida has palm trees and is sunny a lot.", "He is in a warmer state.", "Florida has palm trees."], "image": "train2014/COCO_train2014_000000465677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506500, "question_id": "6zp7TTSPGEFyQowAZCMnQQ", "question": "How does the man power the small boat?", "choices": ["sail", "engine", "sun", "paddle"], "correct_choice_idx": 3, "direct_answers": ["oar", "oars", "paddle", "oars", "paddles", "paddle", "oars", "paddles", "hands", "paddle"], "difficult_direct_answer": false, "rationales": ["The man is using the paddle to get from place to place.", "One can see one of the oars behind him on the boat.", "An oar can be seen inside a small boat behind where a man is sitting in it. boats can be moved by paddling with oars."], "image": "train2014/COCO_train2014_000000506500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378090, "question_id": "6zrfz4q5CBdrxXr3wQ4DDP", "question": "What session of the day is shown here?", "choices": ["morning", "dawn", "evening", "afternoon"], "correct_choice_idx": 0, "direct_answers": ["morning", "morning", "morning", "morning", "morning", "noon", "morning", "noon", "noon", "morning"], "difficult_direct_answer": false, "rationales": ["There is light that you can see through the window.", "It is in the morning according the outside and clock.", "The sky is very bright."], "image": "train2014/COCO_train2014_000000378090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207424, "question_id": "72HXRzo6pAegM3g7zpfXyb", "question": "This sandwich is likely high in what?", "choices": ["roe", "vegetables", "cholesterol", "vitamins"], "correct_choice_idx": 2, "direct_answers": ["calories", "calories", "fat", "fat", "calories", "calories", "salt", "sodium", "cholesterol", "cholesterol"], "difficult_direct_answer": false, "rationales": ["The sandwich has a lot of creamy mayonnaise in it.", "The other options don't really work except for c. all of the grease, cheese and bread point to a being the best answer.", "It has a lot of meat, cheese and butter is used to cook it."], "image": "train2014/COCO_train2014_000000207424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451104, "question_id": "72KmKAjaMDEqZnYeTEC5Kk", "question": "What is he doing?", "choices": ["clearing snow", "stealing machine", "selling machine", "exercising"], "correct_choice_idx": 0, "direct_answers": ["plowing snow", "plowing snow", "shoveling snow", "clearing snow", "clearing snow", "snow plowing", "pull sled", "pull sledge", "plowing snow", "clearing snow"], "difficult_direct_answer": false, "rationales": ["The man is trying to clear snow from the pathway.", "A man is pushing a machine along a path and snow is flying off to the side and the path is clear behind him.", "The man is using a snowblower to clear snow off of the path."], "image": "val2014/COCO_val2014_000000451104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376342, "question_id": "72SF5G6KZRSH5nMVhAqYof", "question": "What reason is the person wearing black marks under his eye?", "choices": ["mascara", "being pretty", "glare reduction", "native tattoo"], "correct_choice_idx": 2, "direct_answers": ["reduce glare", "sun", "sun glare", "sun protection", "glare prevention", "sun block", "sun glare", "glare reduction", "sun", "war paint"], "difficult_direct_answer": false, "rationales": ["Athletes and baseball players specifically wear eye black in this manner believing it helps in the way of answer a.", "The man doesn't want the sun to get in his eyes.", "A baseball player is in uniform with black under his eyes. sports players wear black under their eyes to reduce glare."], "image": "val2014/COCO_val2014_000000376342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511670, "question_id": "72ahEa9ur6DP9yYHb8K93w", "question": "What goes in the black pouch clipped to the man's belt?", "choices": ["whistle", "bear spray", "cellphone", "glasses"], "correct_choice_idx": 2, "direct_answers": ["cellphone", "cell phone", "phone", "phone", "frisbee", "cellphone", "cell phone", "gun", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["Sometimes there's not enough space for the phone to fit in someone's pocket. a phone case eliminates that problem.", "This is a clip that attaches to a belt so keep a phone so you don't lose it", "People wear their cell phones on their belts."], "image": "train2014/COCO_train2014_000000511670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336862, "question_id": "72qrVrRqf6obF8SWD7RuzM", "question": "What company designed the red outfit?", "choices": ["nike", "zara", "champion", "adidas"], "correct_choice_idx": 3, "direct_answers": ["adidas", "adidas", "adidas", "unknown", "adidas", "nike", "adidas", "adidas", "adidas", "adidas"], "difficult_direct_answer": false, "rationales": ["The company is adidas.", "The boy's track suit has three white lines down the side.", "A boy is in a red sweatsuit with white stripes going down the side. adidas is known for their stripes down the side of their products."], "image": "val2014/COCO_val2014_000000336862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382589, "question_id": "72yov8iyY2jq4UDXh4ytDv", "question": "What was someone doing in the bed?", "choices": ["gaming", "reading", "eating", "painting"], "correct_choice_idx": 1, "direct_answers": ["sleeping", "sleeping", "reading", "reading", "reading", "reading", "reading", "sleeping", "reading", "sleeping"], "difficult_direct_answer": false, "rationales": ["There is a book on one of the pillows", "Someone was reading in bed.", "Someone had a book in bed."], "image": "train2014/COCO_train2014_000000382589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8746, "question_id": "73cZjseoR3A3VAztweYPf2", "question": "Why does he have the ball?", "choices": ["showing off", "losing control", "stole it", "curious"], "correct_choice_idx": 0, "direct_answers": ["showing off", "doing trick", "playing", "to throw", "playing", "dribbling", "entertainment", "doing tricks", "performing", "throw tricks"], "difficult_direct_answer": true, "rationales": ["The man is doing tricks on stage.", "The man is showing off.", "He is on a stage and likely just showing off his skills."], "image": "train2014/COCO_train2014_000000008746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47054, "question_id": "73zpzh4FwM3LZZQ55Dbch4", "question": "A digital single lens reflex is normally known as?", "choices": ["dssl", "dlrs", "dssl", "dslr"], "correct_choice_idx": 3, "direct_answers": ["digital slr", "dslr", "dslr", "red eye", "reflection", "camera", "flash", "camera", "dslr", "dslr"], "difficult_direct_answer": false, "rationales": ["A digital single lens reflex is briefly known as a dslr", "The camera has a digital single lens reflex.", "The man is holding a dslr camera."], "image": "train2014/COCO_train2014_000000047054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205221, "question_id": "745uWYDeYyuvwtbAnKxuRY", "question": "How is this guy most likely moving around?", "choices": ["taxi", "car", "bike", "foot"], "correct_choice_idx": 2, "direct_answers": ["bicycle", "bike", "bike", "bike", "bicycle", "bike", "bike", "bicycle", "bicycle", "bike"], "difficult_direct_answer": false, "rationales": ["There is a bicycle parked next to the bench.", "There is a picture of some handles by a guy that is sitting on a bench.", "The man has a bike near him."], "image": "train2014/COCO_train2014_000000205221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134772, "question_id": "746UjHxCpZSV4QJizTNUUN", "question": "What is the woman doing near the kitchen counter?", "choices": ["texting", "pouring", "eating", "exercising"], "correct_choice_idx": 1, "direct_answers": ["pouring drink", "pouring", "pouring", "pouring", "pouring drink", "measuring", "pouring", "spilling", "pouring", "water"], "difficult_direct_answer": false, "rationales": ["This woman is leaning down and looking very carefully as she tilts a bottle as she allows liquid to go from the bottle to a cup.", "She has one container tilted over another container so liquid can come out", "She is measuring liquid."], "image": "train2014/COCO_train2014_000000134772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405663, "question_id": "7494Bte7it77bMYKppFY9T", "question": "What is likely to next touch this cow?", "choices": ["taser", "gun", "doggie", "rope"], "correct_choice_idx": 3, "direct_answers": ["cow wrangler", "blue horseman", "rope", "horse", "horse", "man", "rope", "horse", "rope", "rope"], "difficult_direct_answer": false, "rationales": ["It looks like they're attempting to wrangle it.", "The setting and people following the cow use ropes to catch it.", "The rope is going to be used to hit the cow."], "image": "train2014/COCO_train2014_000000405663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444719, "question_id": "74HgMq8bTHnFL7gBWXXECW", "question": "Why is the woman holding her hand to her head?", "choices": ["to direct", "to wave", "to pose", "to dance"], "correct_choice_idx": 2, "direct_answers": ["posing", "touching hair", "scratching", "just only", "touching hair", "to pose", "showing off", "posing", "posing", "posing"], "difficult_direct_answer": false, "rationales": ["The woman is posing for the picture.", "The woman is an influencer and wants the best angle.", "The woman is holding her hand by her head because she is posing for the camera."], "image": "val2014/COCO_val2014_000000444719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120472, "question_id": "74bnCmGH8weReQJjjNy7wB", "question": "What part of the flower is covering up the reproductive parts from view?", "choices": ["stem", "sepal", "petals", "leaf"], "correct_choice_idx": 2, "direct_answers": ["petals", "petals", "petals", "petals", "petals", "leaves", "petals", "petals", "vase", "petals"], "difficult_direct_answer": false, "rationales": ["The petals hide the inner part of the flower.", "They cover up the pistol and stamen", "The flowers have petals."], "image": "val2014/COCO_val2014_000000120472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340594, "question_id": "74odr7aDfcJJZmGwfVZyvu", "question": "What do the two people at the ends of each side of the table have in common?", "choices": ["glasses", "coats", "hats", "backpacks"], "correct_choice_idx": 0, "direct_answers": ["glasses", "wearing glasses", "glasses", "glasses", "glasses", "glasses", "glasses", "asian", "glasses", "glasses"], "difficult_direct_answer": false, "rationales": ["The person on the far left end of the table and the person on the far right end of the table have several things in common, but the most immediately noticeable is the fact that both are wearing eyeglasses.", "The people at each end are wearing glasses.", "The man on the left and the woman on the right are both wearing clear glasses"], "image": "val2014/COCO_val2014_000000340594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279784, "question_id": "74pWkqKWZeWe7kXyNJRBnt", "question": "What is next to the sandwich?", "choices": ["baby", "apple", "dipping sauce", "woman"], "correct_choice_idx": 2, "direct_answers": ["fries", "french fries", "fries", "salad", "ketchup", "ketchup", "french fries", "fries", "fries ketchup", "dipping sauce"], "difficult_direct_answer": false, "rationales": ["The bread has sauce.", "The small white container has ketchup.", "There is dipping sauce with the sandwich to add flavor."], "image": "val2014/COCO_val2014_000000279784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437518, "question_id": "75HjLjX9bqTmUGZku7bfgc", "question": "What is he doing?", "choices": ["seeking car", "reading sign", "seeking food", "waiting crossing"], "correct_choice_idx": 1, "direct_answers": ["reading sign", "read", "stopping", "reading", "reading sign", "stop", "reading sign", "watching", "reading sign", "looking sign"], "difficult_direct_answer": false, "rationales": ["He is looking at signs.", "The man is standing to read the sign in front of him.", "He is looking at the writing on it."], "image": "train2014/COCO_train2014_000000437518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204238, "question_id": "75TQ4cKtpPPQHsKToyGDft", "question": "What cartoon is featured on the backpack on the right?", "choices": ["clone wars", "family guy", "simpsons", "rugrats"], "correct_choice_idx": 0, "direct_answers": ["clone wars", "clone wars", "star wars", "yoda", "clone wars", "yoda", "star wars", "clone wars", "clone wars", "star wars"], "difficult_direct_answer": false, "rationales": ["Star wars characters are on a bag a child wears on his back.", "A star wars backpack is being held by someone.", "The name of the cartoon is near the top of the bag."], "image": "train2014/COCO_train2014_000000204238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318857, "question_id": "75aLr8VApVpZQqXjZ7cbc6", "question": "Why can we see his foot through the table?", "choices": ["it's glass", "hole", "it's broken", "it's plastic"], "correct_choice_idx": 0, "direct_answers": ["glass", "it's glass", "glass", "glass table", "glass table", "glass table", "is glass", "glass top", "they're hidden", "glass"], "difficult_direct_answer": false, "rationales": ["The table is see-through because it's made of glass.", "The table is made of glass.", "The table is glass."], "image": "train2014/COCO_train2014_000000318857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147309, "question_id": "75gTfpDKAwrthNesVPgSpa", "question": "What is on the plate in the foreground?", "choices": ["bran muffin", "cake", "apple", "cookie"], "correct_choice_idx": 1, "direct_answers": ["cake", "cheesecake", "cheesecake", "cheesecake", "cheese cake", "sugar", "cake", "sauce", "dessert", "cake"], "difficult_direct_answer": false, "rationales": ["The piece of food has some chocolate sauce and is in a triangle slice.", "It is a slice of a dessert cheesecake", "Cheese a specifically."], "image": "val2014/COCO_val2014_000000147309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347217, "question_id": "75rN3NPd9Ddhft4Y35uNxJ", "question": "What's the man taking a break from?", "choices": ["skateboarding", "wrestling", "basketball", "volleyball"], "correct_choice_idx": 0, "direct_answers": ["skateboarding", "skateboarding", "skating", "skateboarding", "skating", "skateboarding", "skateboarding", "skateboarding", "skating", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The man is sitting with his skateboard on a bench and taking a break.,", "The man has a skateboard under his feet.", "The man is resting with a skateboard."], "image": "train2014/COCO_train2014_000000347217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41645, "question_id": "75z5oeMcn2t72b5stU6FB2", "question": "Why are the cookies on the rack?", "choices": ["showcasing", "painting", "squishing", "cooling"], "correct_choice_idx": 3, "direct_answers": ["too hot", "to cool", "cooling", "for baking", "cooling", "cooling", "cooling", "cooling", "cooling", "to bake"], "difficult_direct_answer": false, "rationales": ["They are on there to cool off after baking.", "The rack lets air go under the cookies, helping them get colder faster.", "Cookies are hot when they first come out of the oven."], "image": "train2014/COCO_train2014_000000041645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37001, "question_id": "76uZ5QTH679VWqwsJThKQh", "question": "How many different kinds of animals are represented by the fluffy toys?", "choices": ["four", "two", "three", "one"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "three", "two", "two", "three", "two", "two", "two", "three"], "difficult_direct_answer": false, "rationales": ["There are bears and a moose shown.", "There are bears and moose represented by the stuffed animals.", "There are two animals."], "image": "train2014/COCO_train2014_000000037001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86738, "question_id": "76vcya627j2jygaitTeuGU", "question": "Whose home field stadium is this?", "choices": ["mariners", "yankees", "rockies", "mets"], "correct_choice_idx": 3, "direct_answers": ["mets", "mets", "mets", "mets", "city field", "mets", "mets", "mets", "city field", "mets"], "difficult_direct_answer": false, "rationales": ["You can see on their shirt they are with the mets", "You can see the name on the stands.", "The field is home to the new york mets."], "image": "val2014/COCO_val2014_000000086738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473190, "question_id": "773SJBdAXp4jL5HwLzTWQm", "question": "These items that are moving can be referred to as being part of what?", "choices": ["school", "fleet", "clowder", "database"], "correct_choice_idx": 1, "direct_answers": ["fleet", "boat", "fleet", "fleet", "fleet", "fleet", "fleet", "boats", "fleet", "harbor"], "difficult_direct_answer": false, "rationales": ["A group of ships is called a fleet.", "The word is a plural name for several boats that are associated with each other.", "A group of boats together is called a fleet."], "image": "train2014/COCO_train2014_000000473190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518321, "question_id": "77U437qRdxzEozTnrnyfH4", "question": "This man looks most like what celebrity?", "choices": ["cynthia nixon", "ed sheeran", "omar epps", "frank zappa"], "correct_choice_idx": 3, "direct_answers": ["sylvester stallion", "elvis", "bobby lee", "singer", "keanu reeves", "frank zappa", "don novella", "johnny depp", "no idea", "jim belushi"], "difficult_direct_answer": true, "rationales": ["The man has a hat and a mustache. answer a is a celebrity known for this type of look.", "The man looks like frank zappa with his leather jacket and black hair.", "The man looks like zappa."], "image": "train2014/COCO_train2014_000000518321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271038, "question_id": "77wG5FshbpiUKYoisunUkS", "question": "Why would you look at this building?", "choices": ["schedule", "menu", "time", "temperature"], "correct_choice_idx": 2, "direct_answers": ["time", "beauty", "clock tower", "see time", "time", "time", "check time", "see time", "old", "time"], "difficult_direct_answer": false, "rationales": ["There is a clock on the front of it.", "The building tells time.", "The building has a clock on it."], "image": "train2014/COCO_train2014_000000271038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468178, "question_id": "787ceKZ2qZUna6W7p6u5fL", "question": "What terrain is this?", "choices": ["savanna", "beach", "desert", "plain"], "correct_choice_idx": 3, "direct_answers": ["flat", "savanna", "plain", "prairie", "plains", "grassland", "dessert", "grassy", "safari", "desert"], "difficult_direct_answer": true, "rationales": ["The terrain is a plain.", "These zebras are on flat land in a warm temp area.", "Zebras are found in the open plains."], "image": "val2014/COCO_val2014_000000468178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474069, "question_id": "78FUisnhC8Zx2zWCdBqRQm", "question": "Which structure was put up most recently?", "choices": ["high rise", "tent", "clock tower", "statue"], "correct_choice_idx": 1, "direct_answers": ["skyscraper", "hotel", "left", "tall building", "tent", "curvy building", "tent", "tent", "highrise building", "market roof"], "difficult_direct_answer": false, "rationales": ["A tent was likely put up most recently.", "There are tents at the streets.", "The tent is a temporary structure."], "image": "train2014/COCO_train2014_000000474069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252535, "question_id": "78Qx9QkU4PtEhfigErM8Ui", "question": "What drags skiers up the mountain?", "choices": ["taxis", "goats", "cables", "sherpas"], "correct_choice_idx": 2, "direct_answers": ["cables", "ski lift", "cable car", "lift", "lift", "lift", "ski lift", "rope", "ropes", "rope"], "difficult_direct_answer": false, "rationales": ["Cables are shown in a ski hill with people holding on. skiers use lifts to get up ski runs.", "They are pulled by a machine, not by goats, sherpas, or taxis.", "The cables bring the skiers up."], "image": "train2014/COCO_train2014_000000252535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254392, "question_id": "78SidA6SdjtgwzJuYTvjji", "question": "To open and close the umbrella the cat is missing what ability?", "choices": ["pushing", "grabbing", "all correct", "grasping"], "correct_choice_idx": 2, "direct_answers": ["thumbs", "thumbs", "all correct", "thumbs", "thumbs", "thumbs", "hands", "thumbs", "opposable thumbs", "grasp"], "difficult_direct_answer": false, "rationales": ["A cat can't grasp, grab or push the umbrella open or closed.", "The cat is able to do many things with their paw including moving and gripping items.", "The cat does not have prehensile fingers."], "image": "val2014/COCO_val2014_000000254392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545237, "question_id": "78TXibrywt49x77BJAoW6t", "question": "What is the circular light on the tall building?", "choices": ["clock", "floodlight", "window", "sign"], "correct_choice_idx": 0, "direct_answers": ["clock face", "street light", "streetlight", "clock", "light", "clock", "clock", "clock", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["The light is a clock.", "The circular light has hands and numbers that are used to tell time.", "It has hands and numerals to tell time."], "image": "train2014/COCO_train2014_000000545237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306711, "question_id": "78YR2HL3PJKXdEGtiTdbmC", "question": "What are the smaller animals to the right of the black sheep called?", "choices": ["fledglings", "lamb", "puppies", "kittens"], "correct_choice_idx": 1, "direct_answers": ["lambs", "lamb", "lambs", "lamb", "lambs", "lambs", "lambs", "lambs", "lambs", "two"], "difficult_direct_answer": false, "rationales": ["The animals are the lamb.", "Smaller baby sheep are called lambs.", "The smaller animals are baby sheep, not baby dogs, cats, or birds."], "image": "train2014/COCO_train2014_000000306711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215170, "question_id": "78iSsugNMHCDzfAt2VZtoH", "question": "What is stacked on the table?", "choices": ["notebooks", "candy", "vinyl records", "cellphones"], "correct_choice_idx": 3, "direct_answers": ["cellphones", "cellphones", "phones", "phones", "phones", "cellphones", "cellphones", "phones", "cellphones", "phones"], "difficult_direct_answer": false, "rationales": ["There are a group of cellphones.", "A bunch of phones are scattered.", "The look of the picture shows on the table phones."], "image": "val2014/COCO_val2014_000000215170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351747, "question_id": "78j2tTZSGaKhoFZqdP4WBb", "question": "Toward what does number 15 run?", "choices": ["outfield", "home base", "first base", "coach"], "correct_choice_idx": 2, "direct_answers": ["first base", "first base", "first base", "right", "first base", "first", "first base", "next base", "first base", "first base"], "difficult_direct_answer": false, "rationales": ["Number 15 is the batter. he currently is at home plate with the catcher and umpire.", "Number 15 wants to reach first base.", "When a baseball player bats they run towards first base every time they have to run."], "image": "val2014/COCO_val2014_000000351747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305103, "question_id": "78s2bZCyRHJPZkHaUm8bSa", "question": "What is the woman doing with her legs in preparation to serve the ball?", "choices": ["crossing", "moving", "positioning", "exercising"], "correct_choice_idx": 2, "direct_answers": ["playing", "stance", "bracing self", "setting up", "balancing", "positioning them", "bending", "stabilizing", "backing up", "positioning"], "difficult_direct_answer": true, "rationales": ["The woman is positioning herself.", "The other three options are not observed or relevant. the player is getting in position to perform a serve in tennis.", "She needs a wide center of balance and leverage in order to hit the ball as far and hard as possible."], "image": "train2014/COCO_train2014_000000305103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308701, "question_id": "79DLgAoBD2RkGWuAnw5yfQ", "question": "What does the red X sign signify?", "choices": ["construction", "traffic light", "crossing", "school zone"], "correct_choice_idx": 2, "direct_answers": ["no driving", "crossing", "cycle", "no entry", "no crossing", "danger", "stop", "railroad crossing", "no stopping", "don't enter"], "difficult_direct_answer": true, "rationales": ["It is for a crossing.", "The \"x\" is for crossing", "It means don't go across there."], "image": "train2014/COCO_train2014_000000308701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371608, "question_id": "79DUsUHTWhQAKQ7G9qCp8n", "question": "What is the job of this horse?", "choices": ["carry", "jump", "race", "pull"], "correct_choice_idx": 3, "direct_answers": ["pull", "pull", "pulling carriage", "to pull", "carriage pulling", "pull carriage", "pulling cart", "pull cart", "pull", "pulling"], "difficult_direct_answer": false, "rationales": ["The horse is connected to a wheeled vehicle in the front. in order for this to move which would be the objective of this vehicle, the horse would pull.", "They are carriage horses.", "The horse will pull the cart with people in it."], "image": "train2014/COCO_train2014_000000371608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88835, "question_id": "79Us4soVwb7BePJYjgovFR", "question": "What is the right lane used for?", "choices": ["turns", "parking", "racing", "paying"], "correct_choice_idx": 0, "direct_answers": ["4th street", "turning", "turns", "safe lane", "finding route", "exiting", "turning", "exit", "turn lane", "travel"], "difficult_direct_answer": true, "rationales": ["The right lane has arrows painted on it showing that all those in that lane need to exit towards the right.", "The lane is for turns.", "This is a freeway, so it would not be acceptable to park or race here."], "image": "train2014/COCO_train2014_000000088835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504980, "question_id": "79gBSY9SMtYhxue85QUNsc", "question": "What meal is being served?", "choices": ["dinner", "breakfast", "afternoon tea", "lunch"], "correct_choice_idx": 2, "direct_answers": ["dessert", "afternoon tea", "pizza", "dessert", "breakfast", "dessert", "dessert", "pizza", "dessert", "dessert"], "difficult_direct_answer": false, "rationales": ["Thsi is a tea cup and tea is being served.", "There is a cup of tea on the table as well as some pastries, but no sandwiches or entrees, so the meal is likely to be afternoon tea.", "The little cakes and teacups are popular foods and dishes served with afternoon tea."], "image": "train2014/COCO_train2014_000000504980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536429, "question_id": "79ncyTY9W2K83p4ni7iTRW", "question": "Why are so many vases together?", "choices": ["to sell", "storage", "to break", "collection"], "correct_choice_idx": 0, "direct_answers": ["for sale", "for sale", "for sale", "for sale", "sales display", "selling", "at store", "for sale", "selling them", "to sell"], "difficult_direct_answer": false, "rationales": ["The vases are set on shelves for customers to browse and make their selections.", "The vases are being sold in a store.", "The vases are arranged on shelves. this makes it easier to sell."], "image": "val2014/COCO_val2014_000000536429.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135989, "question_id": "7A4PyHFeaUo6WtAZVBjL4M", "question": "What would explain the bad smell here?", "choices": ["construction site", "toilet", "dirty floor", "sink"], "correct_choice_idx": 1, "direct_answers": ["toilet", "man", "poop", "poop", "throw up", "toilet", "unclean toilet", "toilet", "feces", "dirty toilet"], "difficult_direct_answer": false, "rationales": ["There is a toilet with is associated to bad odors.", "This is where people relieve themselves", "The person is in a bathroom."], "image": "train2014/COCO_train2014_000000135989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264353, "question_id": "7A8BHWuourEEr8fzpX6dxH", "question": "What is the relationship of the smaller giraffe to the bigger one?", "choices": ["peer", "offspring", "mate", "prey"], "correct_choice_idx": 1, "direct_answers": ["child", "her offspring", "offspring", "baby", "child", "baby", "parent child", "baby", "baby", "child"], "difficult_direct_answer": false, "rationales": ["The mother giraffe is looking after its child.", "The smaller giraffe is the young of the taller, parent giraffe.", "Giraffes are not predators. peers or mates would be similar in size."], "image": "train2014/COCO_train2014_000000264353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150114, "question_id": "7AUUsZjEJ24Ay8jW5DE8dA", "question": "What will be at the train station waiting for it?", "choices": ["passengers", "employees", "people", "all correct"], "correct_choice_idx": 3, "direct_answers": ["passengers", "passengers", "clock", "passengers", "passengers", "passangers", "all correct", "travelers", "people", "passengers"], "difficult_direct_answer": false, "rationales": ["They will be waiting for passengers.", "There usually are all the above at a train station.", "This is a train that has people cars"], "image": "train2014/COCO_train2014_000000150114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513854, "question_id": "7AXhWQBumcfGrBpXRf4ivC", "question": "How is electricity being transported?", "choices": ["power lines", "trucks", "frisbee", "clouds"], "correct_choice_idx": 0, "direct_answers": ["line", "wires", "powerlines", "electric wires", "wires", "wires", "wires", "power lines", "power lines", "wires"], "difficult_direct_answer": false, "rationales": ["There are long steel poles with electric lines attached to them.", "The cable on the poles.", "These wires held up with wood posts are purpose-built to transport energy across large distances."], "image": "train2014/COCO_train2014_000000513854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457230, "question_id": "7AZqbzf8K5CPvB6jPYAnst", "question": "What body part is protected by the attachment on the helmets they are wearing?", "choices": ["arms", "neck", "throat", "face"], "correct_choice_idx": 3, "direct_answers": ["head", "head", "head", "head", "head", "face", "head", "head", "head", "head"], "difficult_direct_answer": false, "rationales": ["The head of the police are protected by shields.", "The horses' faces are covered.", "The body part is the face."], "image": "val2014/COCO_val2014_000000457230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547533, "question_id": "7B7S4dqrevvsHrbqvZRCF5", "question": "What are the rectangular objects above the handlebars?", "choices": ["risers", "mirrors", "shocks", "lights"], "correct_choice_idx": 1, "direct_answers": ["mirror", "mirrors", "mirrors", "mirrors", "brakes", "mirrors", "rear view", "mirrors", "mirrors", "mirrors"], "difficult_direct_answer": false, "rationales": ["These are mirrors so the biker can see behind them on the road.", "A woman is sitting on a motorcycle. above the handles are used to see people and cars behind.", "The rectangular objects allow the motorcyclist to see."], "image": "train2014/COCO_train2014_000000547533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225574, "question_id": "7BCPAhduvNmhpjuS8V2q8i", "question": "What sort of creature is the large kite made to resemble?", "choices": ["bird", "amphibian", "man", "mammal"], "correct_choice_idx": 1, "direct_answers": ["gecko", "camellon", "lizard", "chameleon", "lizard", "geko", "amphibian", "dragon", "lizard", "gekko"], "difficult_direct_answer": false, "rationales": ["A large kite has a long tail and big round eyes depicted on it.", "The kite is shaped like a gecko. a gecko is an amphibian.", "The creature looks like a lizard."], "image": "val2014/COCO_val2014_000000225574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65957, "question_id": "7BKPNS5NVzQqKre4kZhjrV", "question": "The female player is making what shot?", "choices": ["backhand", "forehand", "lob", "serve"], "correct_choice_idx": 3, "direct_answers": ["serve", "serving", "serve", "over head", "serve", "serve", "serve", "over head", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["She tossed the ball up in the air to hit it to her opponent.", "The woman is throwing the ball in the air to hit it.", "The player throws the ball up in the air and is getting ready to hit it to her opponent."], "image": "train2014/COCO_train2014_000000065957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80360, "question_id": "7BRC6gqV4b8gxdgPCp69dx", "question": "What is in the little white plastic containers?", "choices": ["coffee creamer", "ketchup", "jelly", "butter"], "correct_choice_idx": 0, "direct_answers": ["creamer", "coffee creamer", "creamer", "coffee creamer", "cream", "creamer", "coffee creamer", "creamer", "creamer", "coffee creamer"], "difficult_direct_answer": false, "rationales": ["You can tell by the shape and size, as to what it holds.", "There are small containers filled with liquid.", "Coffee creamer usually comes in that small size container for a single cup of coffee."], "image": "train2014/COCO_train2014_000000080360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176397, "question_id": "7BWVysUmkjXzYxuiJDJU3v", "question": "The item the man is sitting on is likely made of what?", "choices": ["wheat", "wood", "straw", "mud"], "correct_choice_idx": 1, "direct_answers": ["metal", "wood", "wood", "wood", "wood", "wood", "metal", "metal", "wood", "metal"], "difficult_direct_answer": false, "rationales": ["The item is made of wood.", "The man is sitting on a bench.", "The primary makeup of the bench are long rectangular shaped planks. these planks are mostly made of wood."], "image": "train2014/COCO_train2014_000000176397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378214, "question_id": "7BYy6oWvcsruAPPxZ8UjaX", "question": "What event are the animals taking part in?", "choices": ["revolt", "livestock show", "slaughter", "escaping"], "correct_choice_idx": 1, "direct_answers": ["animal party", "showcase competition", "farm festival", "livestock competition", "contest", "livestock show", "auction", "rodeo", "4h show", "animal show"], "difficult_direct_answer": true, "rationales": ["The animals have been brought out to the field to be presented to an audience.", "The animals are being led and shown at a livestock show, commonly part of county and state fairs. the people are using their techniques and tools such as sticks and rope leads to place and and walk the assortment of livestock animals.", "A cow and other animals are standing on display while people look over them."], "image": "train2014/COCO_train2014_000000378214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203612, "question_id": "7Bbau7qrcHweScAK59BHA6", "question": "Why is the man wearing a glove?", "choices": ["fashion", "health", "warmth", "to catch"], "correct_choice_idx": 3, "direct_answers": ["to catch", "catching", "he's batting", "catch ball", "catcher", "catching", "catching ball", "catch ball", "catch ball", "to protect"], "difficult_direct_answer": false, "rationales": ["The man needs to be able to catch the ball.", "The man between the umpire and the batter is the catcher; he uses the glove to catch the baseball.", "He is the catcher so he wears a catcher's mitt."], "image": "train2014/COCO_train2014_000000203612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23631, "question_id": "7BbizR7bR3J7ZpfFWa9zYR", "question": "What does the number 56 signify here?", "choices": ["prime number", "racing entry", "end ranking", "model number"], "correct_choice_idx": 1, "direct_answers": ["entry number", "racing entry", "player number", "entry number", "entry race", "competitor", "racing number", "competitor number", "participant number", "racing number"], "difficult_direct_answer": false, "rationales": ["The person is competing in an event and numbers are used to list individual entrants as unique competitors.", "Each person in a race is given a competition number for the race.", "The number lets the crowd and officials know who he is."], "image": "train2014/COCO_train2014_000000023631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64551, "question_id": "7BqCy5WSnBvmAF4qNCkwTN", "question": "What type of biking event is being held here?", "choices": ["hetrosexual", "sit in", "bake sale", "gay"], "correct_choice_idx": 3, "direct_answers": ["ralley", "gay", "gay", "motorcycle", "motorcycle rally", "motorcycle", "parade", "motorcycling", "show", "meetup"], "difficult_direct_answer": false, "rationales": ["The event is for gay pride.", "The rainbow flag is a symbol used by the lgbtq community.", "There are many rainbow flags on display which is congruous with gay pride."], "image": "train2014/COCO_train2014_000000064551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257804, "question_id": "7C4LbgWz8ZzLM3HWAzGR59", "question": "What are these people doing with their cellphones?", "choices": ["taking selfie", "taking photo", "making call", "watching video"], "correct_choice_idx": 1, "direct_answers": ["taking pictures", "texting", "taking picture", "filming", "taking photo", "filming", "taking pictures", "take photo", "watch", "video recording"], "difficult_direct_answer": false, "rationales": ["This is the most likely option.", "Both phones shown have a similar image in the frame as they take essentially the same photo with their cellphones.", "They are holding their phones up and looking at the screen."], "image": "train2014/COCO_train2014_000000257804.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167346, "question_id": "7CDs2XYRifkWNTFMZTEgUT", "question": "What group of people are they likely to be?", "choices": ["americans", "russians", "europeans", "australians"], "correct_choice_idx": 2, "direct_answers": ["british", "tourist", "tourists", "europeans", "family boaters", "tourists", "tourists", "tourists", "tourist", "passengers"], "difficult_direct_answer": false, "rationales": ["There appears to be a lot of blondes in the boat at a higher percentage than would be found in the general public but at a rate consistent with some eurpoean countries.", "I'm pretty sure that is the flag that represents great britain. therefore, these folks are most likely europeans.", "The boat's flag has a union jack on it and the background looks alpine (rather than warm and tropical)"], "image": "train2014/COCO_train2014_000000167346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486620, "question_id": "7CUySUJzSqoJXQdrdPmDQU", "question": "What is on the skateboarder in the middle's head?", "choices": ["cowboy hat", "baseball cap", "hood", "crown"], "correct_choice_idx": 1, "direct_answers": ["bolt", "cap", "hat", "hat", "cap", "baseball cap", "cap", "baseball cap", "cap", "hat"], "difficult_direct_answer": false, "rationales": ["The skateboarder is wearing a ball cap.", "The skateboarder in the middle is wearing a baseball cap on his head.", "The skater is wearing a baseball hat on his head."], "image": "train2014/COCO_train2014_000000486620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154462, "question_id": "7CaSNQRtPnKaCBR6zTaaW4", "question": "What type of phone is being used?", "choices": ["pay", "cellular", "rotary", "landline"], "correct_choice_idx": 1, "direct_answers": ["cellular", "cell phone", "cell phone", "mobile", "cell", "iphone", "cell phone", "iphone", "cellphone", "smart phone"], "difficult_direct_answer": false, "rationales": ["There are cellular phones in use by the other passengers.", "The person sitting down is using a cellular phone in their hand.", "It is small, portable and not attached to a cord in a home or public booth."], "image": "train2014/COCO_train2014_000000154462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507889, "question_id": "7D5c3nHH768bVmBRMDUCBn", "question": "What type of area is this?", "choices": ["stadium", "port", "beach", "backyard"], "correct_choice_idx": 1, "direct_answers": ["river", "harbor", "marina", "dock", "port", "port", "bay", "seaport", "beach", "river"], "difficult_direct_answer": false, "rationales": ["There are boats parked here.", "The ships and water features visible here next to the docks and urban setting are consistent with answer a.", "Traditionally ships gather at a marina or drop off goods at a shipping port."], "image": "train2014/COCO_train2014_000000507889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299675, "question_id": "7DTQL6HjPrFvXRkq6o675g", "question": "What type of floor has been laid in the kitchen?", "choices": ["tyle", "carpet", "linoleum", "hardwood"], "correct_choice_idx": 3, "direct_answers": ["tile", "tile", "hardwood", "tile", "tile", "tile", "tile", "tile", "tile", "tile"], "difficult_direct_answer": false, "rationales": ["There are small squares of flooring", "The floor has tiles all over it.", "The floor has squares on it in a grid pattern."], "image": "train2014/COCO_train2014_000000299675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322194, "question_id": "7DXRZQSvJrZUvigwazdapM", "question": "What would one see if the red item is removed?", "choices": ["head", "pumpkin", "waist", "foot"], "correct_choice_idx": 0, "direct_answers": ["head", "head", "woman's face", "person's head", "head", "person's head", "head", "person's face", "her head", "head"], "difficult_direct_answer": false, "rationales": ["The other options aren't under it.", "The red item is an umbrella. it is above the person and is not blocking the person's feet or waist.", "There is a person holding the umbrella."], "image": "val2014/COCO_val2014_000000322194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519906, "question_id": "7Dd4NZKth7GLHa29amnize", "question": "Which drink is a sponsor of the event?", "choices": ["coke", "dasani", "monster", "budweiser"], "correct_choice_idx": 2, "direct_answers": ["monster", "monster", "monster", "monster", "monster", "monster", "monster", "monster", "monster", "monster"], "difficult_direct_answer": false, "rationales": ["Monster's logo is green with claws.", "You can tell by looking on the side panel and the logo as to who is sponsoring the event.", "There are clearly monster energy logos throughout the image. at this type of event when there are visible logos around the course they are most commonly sponsors of the event."], "image": "train2014/COCO_train2014_000000519906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51808, "question_id": "7DwTwrEzmkoB5T33wdXLGP", "question": "In case of fire which direction would one turn the pentagonal nipples on the hydrant shown here?", "choices": ["in", "down", "right", "left"], "correct_choice_idx": 3, "direct_answers": ["left", "left", "counter clockwise", "left", "counter clockwise", "right", "left", "left", "counter clockwise", "left"], "difficult_direct_answer": false, "rationales": ["They have to turn them counterclockwise to open.", "The left side should be turned for water.", "In case of a fire the wrench would need to turn left to open the hydrant."], "image": "train2014/COCO_train2014_000000051808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362971, "question_id": "7DyQ2FwqCdPrmhTjiaY7DN", "question": "What aisle of the grocery store might this product be found?", "choices": ["toilet paper", "produce", "canned goods", "meats"], "correct_choice_idx": 1, "direct_answers": ["produce", "produce", "produce", "produce", "produce", "produce", "produce", "produce", "produce", "produce"], "difficult_direct_answer": false, "rationales": ["The picture shows bunches of bananas which are sold in the produce section.", "Bananas (pictured) are typically found in the aisle mentioned in option a.", "Bananas are a form of produce."], "image": "val2014/COCO_val2014_000000362971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336101, "question_id": "7E2DSWHNw6wtiuMyFAGEE4", "question": "Who is the man holding hotdogs?", "choices": ["audience", "sport player", "customer", "referee"], "correct_choice_idx": 0, "direct_answers": ["giants fan", "giants fan", "giants fan", "sports fan", "seller", "unknown", "fan", "fan", "audience", "giants fan"], "difficult_direct_answer": false, "rationales": ["A man is dressed in team gear and is holding a hot dog. hot dogs are sold at sporting events.", "The man is having his picture taken to show others.", "He is in the bleachers of a stadium"], "image": "val2014/COCO_val2014_000000336101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214055, "question_id": "7E6aaukGuMwqVyGogmYRUu", "question": "Who created the first successful vehicle of this type?", "choices": ["elon musk", "nikola tesla", "orville wright", "karl benz"], "correct_choice_idx": 2, "direct_answers": ["wright brothers", "wright brothers", "wright brothers", "boeing", "van", "wright brothers", "orville wright", "wright brothers", "boeing", "boeing"], "difficult_direct_answer": false, "rationales": ["Orville wright created the first plane.", "Orville and wilbur wright invented the first successful airplane in 1903.", "Wright created the plane."], "image": "train2014/COCO_train2014_000000214055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362348, "question_id": "7E6kwhyiogpBmWwPexmFke", "question": "Where are bananas from?", "choices": ["asia", "italy", "spain", "africa"], "correct_choice_idx": 0, "direct_answers": ["warm climates", "banana tree", "florida", "asia", "trees", "store", "philippians", "market", "asia", "africa"], "difficult_direct_answer": true, "rationales": ["The bananas are from asia.", "The people look to be asian.", "Bananas are predominantly produced there, latin america, and africa."], "image": "val2014/COCO_val2014_000000362348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445462, "question_id": "7E8YfNedKPdmqiNNCj3QtK", "question": "What is the color of chair?", "choices": ["green", "pink", "white", "red"], "correct_choice_idx": 2, "direct_answers": ["grey", "white", "grey", "gray", "white", "white", "white", "white", "gray", "white"], "difficult_direct_answer": false, "rationales": ["The chair is a whitish color.", "A quick look tells us that the chair is white. it can't possibly be any of the other three options.", "The seating at the airport is white in color."], "image": "train2014/COCO_train2014_000000445462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283118, "question_id": "7FHygUKeqB2dzed9Rz7HPs", "question": "How is the food item shown here prepared?", "choices": ["baked", "boiled", "broiled", "fried"], "correct_choice_idx": 0, "direct_answers": ["baked", "baked", "baked", "baked", "baked", "baked", "baked", "baked", "baked", "baked"], "difficult_direct_answer": false, "rationales": ["Pizza is on a plate. pizzas are baked.", "You don't normally make pizza any other way than baked, overall.", "The food item was baked."], "image": "val2014/COCO_val2014_000000283118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27005, "question_id": "7FLKV7zHqbJA5ivWuxDx9a", "question": "What type of labeling is on the barrel?", "choices": ["brand", "regulatory", "warning", "directional"], "correct_choice_idx": 0, "direct_answers": ["cracker label", "montpellier crackers", "brand", "montpellier crackers", "crackers", "montpellier crackers", "round", "brand", "advertising", "montpilier crackers"], "difficult_direct_answer": false, "rationales": ["It depicts a logo for a food company.", "The company name is on the label.", "It's the name of the company that created the crackers."], "image": "train2014/COCO_train2014_000000027005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568560, "question_id": "7FQYweaJmhcPzhejFJG4Mo", "question": "What is he ready to do?", "choices": ["strike", "swing", "dunk", "rebound"], "correct_choice_idx": 1, "direct_answers": ["play tennis", "hit ball", "win", "win", "swing", "return hit", "hit ball", "swing", "return volley", "hit ball"], "difficult_direct_answer": false, "rationales": ["He is running for a ball that is headed for him", "His stance is in a swinging motion.", "By his body position and that he is playing tennis more than likely he is ready to swing his racket."], "image": "val2014/COCO_val2014_000000568560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143989, "question_id": "7FUgwmfHAPpz33qnBFB6TW", "question": "What kind of transportation is shown?", "choices": ["air", "rail", "road", "water"], "correct_choice_idx": 2, "direct_answers": ["car", "car", "cars", "cars", "car", "cars", "cars", "car", "car", "road"], "difficult_direct_answer": false, "rationales": ["He is standing near some cars in a parking lot.", "The vehicles are cars, not trains, airplanes, or boats.", "The picture shows two cars."], "image": "train2014/COCO_train2014_000000143989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191061, "question_id": "7FVAWsKPTAiyXWTkT6rhFP", "question": "What is the color of second vehicle?", "choices": ["brown", "pink", "red", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "write", "white", "white", "black", "white", "white", "white", "write"], "difficult_direct_answer": false, "rationales": ["The first food truck is black, whereas the second one is white. both (as well as the third truck) have customers waiting to order.", "The second vehicle is not pink, red, or brown.", "Two trucks are parked and one is white."], "image": "train2014/COCO_train2014_000000191061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251252, "question_id": "7GFPUGsU9TcTMLJ6ZnSjtA", "question": "What kind of bike is this?", "choices": ["scooter", "motorbike", "bicycle", "vespa"], "correct_choice_idx": 1, "direct_answers": ["motorbike", "motorcycle", "dirt bike", "motorbike", "sport motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle"], "difficult_direct_answer": false, "rationales": ["The other options wouldn't apply in this image.", "A two wheeled vehicle with a motor is displayed.", "The bike shows that there is consumption of fuel."], "image": "val2014/COCO_val2014_000000251252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307993, "question_id": "7GvfHmi9KKyZG2wicjrbND", "question": "Which finger of the man's right hand is obscured?", "choices": ["middle", "thumb", "pinky", "ring"], "correct_choice_idx": 1, "direct_answers": ["thumb", "thumb", "thumb", "thumb", "thumb", "thumb", "thumb", "thumb", "thumb", "thumb"], "difficult_direct_answer": false, "rationales": ["The man isn't showing his thumb.", "The man's thumb is hidden from view.", "His 4 fingers are showing outside the car"], "image": "val2014/COCO_val2014_000000307993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456584, "question_id": "7H7JqahhanUTmHstVNTYe3", "question": "What does the top-right board display?", "choices": ["tv show", "train departures", "advertisements", "plane departures"], "correct_choice_idx": 1, "direct_answers": ["arrival time", "arrivals", "departure", "schedule", "flight times", "train departures", "times", "departures arrivals", "train times", "train times"], "difficult_direct_answer": true, "rationales": ["The board has train departures.", "The display is used to inform passengers and others of the arrival and leaving times of the trains.", "Train stations show departure and arrival times. the board appears to show departures with a clock showing the expected time."], "image": "val2014/COCO_val2014_000000456584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365711, "question_id": "7HXYr6nAmQ8BepBAPpREVT", "question": "What will the white material on this surfer's face prevent?", "choices": ["sunburn", "nothing", "drowning", "recognition"], "correct_choice_idx": 0, "direct_answers": ["sunburn", "sunburn", "sunburn", "sunburn", "sunburn", "sunburn", "sunburn", "sunburn", "sunburn", "sunburn"], "difficult_direct_answer": false, "rationales": ["It will help protect him from the sun", "The white material is probably zinc oxide and is used to form a barrier against your skin to protect it from uv light exposure.", "The man is wearing sunscreen to keep his skin from burning."], "image": "val2014/COCO_val2014_000000365711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167642, "question_id": "7Ht7hgW3mzQCwcq4ioEC6b", "question": "What would someone use this room to do?", "choices": ["sleep", "shower", "play", "cook"], "correct_choice_idx": 3, "direct_answers": ["cook", "cook", "cook", "cook", "cook", "cooking", "cook", "cook", "cook", "cooking"], "difficult_direct_answer": false, "rationales": ["This room has a stove, oven, and a microwave. it is a kitchen, not a bedroom, recreation room, or bathroom.", "This is a kitchen. you can see the stove on the side.", "This room has an oven and a microwave. it is a kitchen, not a bedroom, play room, or bathroom."], "image": "train2014/COCO_train2014_000000167642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419632, "question_id": "7JAwx5QPu2Bi5Gs4Mviihf", "question": "What profession is shown here?", "choices": ["construction", "farmer", "cowboy", "firefighter"], "correct_choice_idx": 0, "direct_answers": ["road construction", "construction workers", "police", "road construction", "construction worker", "road workers", "construction worker", "construction worker", "construction crew", "construction"], "difficult_direct_answer": false, "rationales": ["The people are wearing construction gear.", "The man in yellow uniforms are construction workers that build and fix large structures.", "There are men is reflective uniforms working on the street."], "image": "train2014/COCO_train2014_000000419632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262605, "question_id": "7JGCUvMtqoHSBtVU2Bd3vf", "question": "What hat was the woman just wearing?", "choices": ["baseball cap", "knit beanie", "top hat", "fireman's hat"], "correct_choice_idx": 2, "direct_answers": ["black", "top hat", "top hat", "flying hat", "tophat", "top hat", "tank top", "top hat", "top", "top"], "difficult_direct_answer": false, "rationales": ["The hat is on the wall.", "The woman is tossing a black top hat.", "It is reasonable to assume she was wearing the hat prior to her tossing it in the air. the elongated top portion and the rim are common of this type of head dressing."], "image": "val2014/COCO_val2014_000000262605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297509, "question_id": "7JeWF223Y62j2dAuAwTjey", "question": "What is the woman doing?", "choices": ["eating", "talking", "sleeping", "working"], "correct_choice_idx": 3, "direct_answers": ["cooking/cleaning", "cooking", "dishes", "cooking", "cooking", "working", "cooking", "dishes", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["This setting appears to be a kitchen of some sorts. her position indicates that she is preparing some sort of food. most restaurants only allow employees to prepare meals so she is most likely employed.", "She is doing an activity in a kitchen.", "The woman is working in the kitchen."], "image": "train2014/COCO_train2014_000000297509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530653, "question_id": "7KBG9ReKkB2Uy3NJd4ZFMX", "question": "What action will put the man playing skateboard at risk?", "choices": ["go straight", "back up", "turn left", "turn right"], "correct_choice_idx": 3, "direct_answers": ["hitting bus", "falling", "bus", "accident", "crossing", "turn right", "movement", "traffic", "crossing bus", "crash"], "difficult_direct_answer": true, "rationales": ["The action is turning right.", "The bus would hit the boy if it turned right.", "The man will get hit by a car if he turns right."], "image": "train2014/COCO_train2014_000000530653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337552, "question_id": "7KFWxvPg63Nc4gTBnHAnAG", "question": "In which state do these boarders walk?", "choices": ["hawaii", "arkansas", "washington", "oregon"], "correct_choice_idx": 0, "direct_answers": ["hawaii", "hawaii", "california", "hawaii", "hawaii", "florida", "hawaii", "florida", "hawaii", "florida"], "difficult_direct_answer": false, "rationales": ["The surfers are in hawaii.", "The palm trees indicate the location. the other options don't fit with the climate or plants.", "They are holding surfboards. hawaii is the only one of these states to have a beach."], "image": "train2014/COCO_train2014_000000337552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219820, "question_id": "7KKzJcwZgj2dnmh7AksrqS", "question": "What type of view do the passengers have?", "choices": ["forest", "desert", "waves", "mountains"], "correct_choice_idx": 0, "direct_answers": ["beautiful", "water view", "trees", "colorful", "forest", "water", "riverside", "scenic", "fall", "beautiful view"], "difficult_direct_answer": true, "rationales": ["There are lots of trees in the area.", "There are many different types of trees clustered close together.", "There are many trees up the side of the mountain"], "image": "val2014/COCO_val2014_000000219820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415750, "question_id": "7KLmtg3cuV3DDDdscxiBot", "question": "Behance network has done most projects on which mobile?", "choices": ["samsung", "nokia", "lenovo", "lg"], "correct_choice_idx": 0, "direct_answers": ["cricket", "cricket", "samsung", "cricket", "cricket", "samsung", "cricket", "cricket", "cricket", "cricut"], "difficult_direct_answer": false, "rationales": ["The network is samsung.", "Behance as partnered the most often with samsung.", "A samsung logo can be seen behind a stage people are standing on."], "image": "val2014/COCO_val2014_000000415750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116851, "question_id": "7KjRLAR4gBPN4Kp3snKmfm", "question": "In which setting is this clock?", "choices": ["tundra", "suburban", "rural", "urban"], "correct_choice_idx": 3, "direct_answers": ["roman numerals", "street", "town centre", "urban", "town square", "city", "round", "summer", "city", "city"], "difficult_direct_answer": false, "rationales": ["The clock is in a setting with tall buildings all around.", "The clock is in front of stores and shops.", "It is in a city."], "image": "train2014/COCO_train2014_000000116851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116517, "question_id": "7L4nNQHEWWZReT58sDEriW", "question": "What field are these people in?", "choices": ["broadcasting", "scientific", "medical", "commercial"], "correct_choice_idx": 2, "direct_answers": ["medical", "surgery", "surgery", "medical", "medical", "medical field", "medical", "medical", "medical", "medical"], "difficult_direct_answer": false, "rationales": ["The people in this room are all wearing scrubs and that indicates that they are all working in a hospital.", "They are wearing scrubs which is protective clothing worn by professionals in the medical field.", "The people are all in scrubs and other medical type gear so they would be at a hospital or similar."], "image": "val2014/COCO_val2014_000000116517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392457, "question_id": "7LR3kWMSpfckiLddyRWRX6", "question": "What kind of parking is available?", "choices": ["lot", "parallel", "diagonal", "valet"], "correct_choice_idx": 1, "direct_answers": ["street", "street", "street", "parallel", "street", "street", "street", "street", "street", "street"], "difficult_direct_answer": false, "rationales": ["There is parking in the street where the cars are all parked in a single file line along the side of the road.", "Cars are parked in a single file line along one side of a street and not the other.", "The cars are parked in the street. they are all in a straight line."], "image": "train2014/COCO_train2014_000000392457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560173, "question_id": "7LWzDbTdfZHZjZmGwD22Hc", "question": "What are the red planters on the left made from?", "choices": ["metal", "aluminum", "bricks", "plastic"], "correct_choice_idx": 2, "direct_answers": ["brick", "bricks", "bricks", "brick", "metal", "bricks", "metal", "clay", "bricks", "bricks"], "difficult_direct_answer": false, "rationales": ["The planters are brick.", "The building materials around the plants are manufactured into small rectangular pieces.", "The red planters on the left are made of brick material."], "image": "train2014/COCO_train2014_000000560173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212241, "question_id": "7LZBmELhpY9ncg9XEX5Bqz", "question": "What are the wires above the train for?", "choices": ["climbing", "decoration", "protection", "power"], "correct_choice_idx": 3, "direct_answers": ["electricity", "power supply", "power train", "power lines", "electrical current", "transmit electricity", "power", "route it", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["This is an electric train. it is fueled by the wires.", "A train is being held on by some cables that help give it energy to move.", "The lines supply electricity. they hang above as a source of electricity for the transportation."], "image": "val2014/COCO_val2014_000000212241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246307, "question_id": "7LexTK2BgZJvfEcFmE7zQq", "question": "In what type of housing complex is this kitchen part of?", "choices": ["university dorm", "condominium", "apartment", "detached home"], "correct_choice_idx": 2, "direct_answers": ["condominium", "tenement", "apartments", "apartment building", "apartment", "apartment", "kitchen bolgani", "apartment", "apartment", "apartment"], "difficult_direct_answer": false, "rationales": ["The complex is an apartment.", "The picture shows multiple rooms in an apartment complex.", "There are apartments across the way."], "image": "train2014/COCO_train2014_000000246307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45059, "question_id": "7LqXiMAXBXhuuaxNeoBD7H", "question": "What effect appears on the jacket of the cyclist behind the bus?", "choices": ["camouflage", "neon", "lighting", "sparkling"], "correct_choice_idx": 1, "direct_answers": ["reflective", "security event", "neon", "glowing", "reflection", "reflection", "reflective light", "reflection", "bus", "reflection"], "difficult_direct_answer": false, "rationales": ["It glows so it can be seen in the dark when light reflects off of it.", "The cyclist behind the bus is wearing a bright neon jacket so drivers can see him at night,", "The green from the jacket is glowing."], "image": "train2014/COCO_train2014_000000045059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491191, "question_id": "7MEh6Nk5wcXowZKV34BERR", "question": "What style apartment is this?", "choices": ["garden", "high rise", "loft", "penthouse"], "correct_choice_idx": 0, "direct_answers": ["mission", "garden", "dorm", "studio", "bachelor", "college style", "efficiency", "dorm", "flat", "studio"], "difficult_direct_answer": false, "rationales": ["The apartment has a lot of plants in it.", "It is on the first floor and you can see the plants right outside", "The windows show vegetation."], "image": "train2014/COCO_train2014_000000491191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382659, "question_id": "7MQinexUCszQincstygpR2", "question": "The clock is on a building that has the name of what on it's side?", "choices": ["car", "tulip", "vodka", "beer"], "correct_choice_idx": 3, "direct_answers": ["heineken", "heldbec", "beer", "hotel", "beer", "heineken", "zakharov", "beer", "bedford", "beer"], "difficult_direct_answer": false, "rationales": ["The word is heineken, which is the brand of a world renowned beer.", "The building has heineken on its side.", "The writing on the building is heineken."], "image": "train2014/COCO_train2014_000000382659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133225, "question_id": "7MQrmuqLU68zBeEVemdFtH", "question": "What type of food is on top of the bread?", "choices": ["fruit", "vegetables", "noodles", "meat"], "correct_choice_idx": 2, "direct_answers": ["noodles", "macaroni", "macaroni cheese", "pasta", "mac and cheese", "macaroni", "macaroni", "macaroni", "pasta", "rigatoni"], "difficult_direct_answer": false, "rationales": ["The food on top is macaroni and cheese judging by the appearance and answer a is a component of macaroni and cheese.", "One can make out the long shape and hole of macaroni.", "Mac and cheese is on the bread."], "image": "train2014/COCO_train2014_000000133225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356648, "question_id": "7MbF4kR2BiUJoN5THmcga4", "question": "One of the athletes drinks in the refrigerator contains what substance that increases the body's ability to generate energy?", "choices": ["lemonade", "electrolyte", "water", "juice"], "correct_choice_idx": 1, "direct_answers": ["gatorade", "electrolyte", "electrolytes", "gatorade", "electrolytes", "electrolytes", "gatorade", "electrolyte", "gatorade", "electrolytes"], "difficult_direct_answer": false, "rationales": ["Without electrolytes, one would be too tired and worn out to play and have fun.", "Electrolytes are in gatorade.", "The electrolytes are needed."], "image": "val2014/COCO_val2014_000000356648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187280, "question_id": "7Md4g8aQ3zWFaGdbYsnyFA", "question": "What is shown on the front of the bus?", "choices": ["destination", "warning", "owner", "speed"], "correct_choice_idx": 0, "direct_answers": ["destination sign", "bank", "destination", "bank 25", "bank", "destination", "bus numbers", "destination", "sign", "bank 25"], "difficult_direct_answer": false, "rationales": ["The name of the place is spelled out digitally.", "The front of the bus shows where it's headed.", "The route name and number is on the front"], "image": "train2014/COCO_train2014_000000187280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64244, "question_id": "7MqaNzsuWbLNYiqhKAV56p", "question": "What body of water is this likely to be?", "choices": ["pool", "pond", "river", "sea"], "correct_choice_idx": 2, "direct_answers": ["lake", "lake", "lake", "lake", "river", "lake", "lake", "lake", "lake", "lake"], "difficult_direct_answer": false, "rationales": ["The body of water is large but shores on both sides can be seen.", "By the color of the water and background you can tell where they are.", "It is too big to be a pond or a pool, and these kind of water activities are typically not done in a sea."], "image": "train2014/COCO_train2014_000000064244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258346, "question_id": "7N39HZJ4RcJx36zwPYRfaP", "question": "What animals can be seen in this picture?", "choices": ["dogs", "cats", "gulls", "chickens"], "correct_choice_idx": 2, "direct_answers": ["birds", "birds", "gulls", "bird", "bird", "whale", "seagulls", "bird", "bird", "seagulls"], "difficult_direct_answer": false, "rationales": ["There are birds flying above the people.", "There are gulls over the ocean water.", "There are some flying creatures in the air."], "image": "val2014/COCO_val2014_000000258346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392136, "question_id": "7N4n7JpkXtnAPAb2vFcLJt", "question": "Why is the man in short sleeves walking near the bus?", "choices": ["for fun", "to arrest", "to enter", "to race"], "correct_choice_idx": 2, "direct_answers": ["unknown", "to enter", "going somewhere", "boarding", "to board", "to ride", "boarding", "to board", "boarding", "crowded sidewalk"], "difficult_direct_answer": false, "rationales": ["He looks to board the bus.", "The man wants to enter.", "He looks like he may be getting onthe bus"], "image": "train2014/COCO_train2014_000000392136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 925, "question_id": "7PJtRCfMeVL8ZeeFhAkKxL", "question": "His outfit is well suited for what setting?", "choices": ["club", "beach", "office", "space"], "correct_choice_idx": 2, "direct_answers": ["work", "wedding", "business meeting", "wedding", "office", "office", "business meeting", "business", "business meeting", "office"], "difficult_direct_answer": false, "rationales": ["The man is wearing a suit. people often wear formal attire like this at work.", "In an office setting people usually dress formally.", "The man is wearing a suit."], "image": "train2014/COCO_train2014_000000000925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392445, "question_id": "7PPmSND5PrQ5sxB8wtVzvC", "question": "What are his fingers touching?", "choices": ["legs", "charger", "keyboard", "screen"], "correct_choice_idx": 2, "direct_answers": ["laptop", "keys", "laptop", "laptop computer", "laptop", "keyboard", "keyboard", "laptop", "keys", "laptop"], "difficult_direct_answer": false, "rationales": ["He has a lap top in front of him that he is working on.", "The man's fingers are touching a keyboard on a laptop.", "The fingers are by the keyboard."], "image": "train2014/COCO_train2014_000000392445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486350, "question_id": "7Pe3pXJMuYcMtrQ6sJqsRR", "question": "What state is this in?", "choices": ["nevada", "utah", "california", "washington"], "correct_choice_idx": 1, "direct_answers": ["utah", "not known", "not known", "utah", "utah", "utah", "utah", "utah", "utah", "utah"], "difficult_direct_answer": false, "rationales": ["The firetruck has salt lake city printed on it which is the capital city of this state.", "Salt lake city is in utah.", "The state is utah."], "image": "train2014/COCO_train2014_000000486350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53913, "question_id": "7PtYEhLQoCQQ2CjvH3LDx3", "question": "What move has the player just used?", "choices": ["lob", "backhand", "forehand", "serve"], "correct_choice_idx": 3, "direct_answers": ["serve", "lob", "tennis player", "swatting", "serve", "swing", "swing", "swing", "serve", "underhand swing"], "difficult_direct_answer": false, "rationales": ["When serving in tennis, the player swings higher up into the air to shoot the ball really fast.", "A person is jumping in the area and swinging a tennis racket. to serve in tennis the ball is thrown in the air and the player jumps to hit it.", "The move is a serve."], "image": "train2014/COCO_train2014_000000053913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281562, "question_id": "7PvnB66BA3CGRvFyLwRWAn", "question": "What is the average size of the neck of this animal?", "choices": ["six feet", "two feet", "twelve feet", "thirty feet"], "correct_choice_idx": 0, "direct_answers": ["many feet", "six feet", "long", "6 feet", "six feet", "six feet", "large", "six feet", "unknown", "five feet"], "difficult_direct_answer": false, "rationales": ["On average giraffes have very long necks up to six feet long.", "A giraffe's neck is generally six feet.", "Since we can't judge the size from the picture, this answer was found on the internet."], "image": "train2014/COCO_train2014_000000281562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291528, "question_id": "7PyyYJ6NuSDxiwzApVkLZu", "question": "What spread is on the toast?", "choices": ["jam", "nutella", "margarine", "peanut butter"], "correct_choice_idx": 3, "direct_answers": ["peanut butter", "peanut butter", "banana", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "banana"], "difficult_direct_answer": false, "rationales": ["The toast appears to have a brown sticky substance spread evenly across it. a well-know spread for toasts is called peanut butter.", "The spread is thick and brown. it is also paired with a banana.", "A brown substance is on bread. peanut butter is brown."], "image": "val2014/COCO_val2014_000000291528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157516, "question_id": "7QCsJtBjrCcSFmCXdpxWaa", "question": "Where is this man and child sitting?", "choices": ["starbucks", "peets", "bus stop", "orange julius"], "correct_choice_idx": 0, "direct_answers": ["mall", "coffee house", "restaurant", "starbucks", "starbucks", "at table", "restaurant", "chair", "restaurant", "coffee shop"], "difficult_direct_answer": false, "rationales": ["The man and kid are at starbucks.", "The company's logo is on the far right.", "The logo is visible in the background."], "image": "val2014/COCO_val2014_000000157516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470117, "question_id": "7QcxwsnezH5H8mcweWoYSe", "question": "What is the wooden structure for?", "choices": ["driving", "walking up/down", "storing groceries", "grinding grain"], "correct_choice_idx": 1, "direct_answers": ["stairs", "steps", "climbing", "walking up/down", "stairs", "stairs", "bleachers", "its stairs", "playing", "climbing"], "difficult_direct_answer": false, "rationales": ["The stairs are used to enter and exit a building up to or from a higher level.", "The structure is for walking.", "The wooden structure is a staircase. stairs are used for getting up and down."], "image": "val2014/COCO_val2014_000000470117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407607, "question_id": "7QpHHB7PCLqPiWCeNLzB6u", "question": "What is on the wall above the TV?", "choices": ["poster", "smoke alarm", "monkey", "clock"], "correct_choice_idx": 1, "direct_answers": ["fire alarm", "smoke detector", "smoke detector", "smoke detector", "smoke detector", "smoke alarm", "fire detector", "clock", "smoke alarm", "smoke detector"], "difficult_direct_answer": false, "rationales": ["The wall above the tv has a round safety device that will make a loud noise if it detects there may be a fire in the area.", "There is a smoke alarm on the wall above the tv.", "There is a smoke alarm hanging on the wall above the tv."], "image": "train2014/COCO_train2014_000000407607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531949, "question_id": "7QrFykkUXF2BmWGdNbtaE5", "question": "What emotion is the woman feeling?", "choices": ["fear", "anger", "sadness", "joy"], "correct_choice_idx": 3, "direct_answers": ["happy", "joy", "happiness", "joy", "happy", "joy", "joy", "happy", "happiness", "happy"], "difficult_direct_answer": false, "rationales": ["People smile when they have this feeling.", "She looks to be happy and smiling.", "The woman is smiling."], "image": "train2014/COCO_train2014_000000531949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70932, "question_id": "7QxtUatLY7io3j7eSLTw8W", "question": "How many functional keys in the keyboard?", "choices": ["15", "13", "12", "11"], "correct_choice_idx": 3, "direct_answers": ["101", "64", "12", "hundred four", "twelve", "twelve", "101", "twelve", "11", "102"], "difficult_direct_answer": false, "rationales": ["Eleven keys are functional.", "There are eleven keys.", "The keyboard has that amount of function keys."], "image": "val2014/COCO_val2014_000000070932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65488, "question_id": "7QzcEzYQd8J5xR7RYu4Vj8", "question": "Why is the woman wearing the covering around her neck?", "choices": ["keeping warm", "covering scar", "vanity", "style"], "correct_choice_idx": 0, "direct_answers": ["keeping warm", "scarf", "cold", "cold", "keeping warm", "cold out", "scarf", "keep warm", "cold", "cool place"], "difficult_direct_answer": false, "rationales": ["The woman is covering her neck with a scarf to keep warm during winter.", "The woman is wearing a scarf around her neck to keep her warm in the snowy weather.", "The woman doesn't want to freeze in the snow."], "image": "train2014/COCO_train2014_000000065488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240689, "question_id": "7R36KFwuN2yXBopLrjWum9", "question": "Why is the boy standing on one leg?", "choices": ["hot sand", "yoga", "lost balance", "earthquake"], "correct_choice_idx": 2, "direct_answers": ["kite", "kite flying", "balancing", "lost balance", "windy day", "balance", "balancing", "flying kite", "fun", "balance"], "difficult_direct_answer": false, "rationales": ["The boy is standing on one leg because he lost balance.", "He is playing a game and reaching for something", "The boy is being tilted over by the kite."], "image": "train2014/COCO_train2014_000000240689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535218, "question_id": "7R54ALSG2gzY9GiEr6iPAi", "question": "Why is the man reaching under his leg?", "choices": ["to dance", "to pick", "to catch", "to itch"], "correct_choice_idx": 2, "direct_answers": ["frisbee", "frisbee", "catch frisbee", "to catch", "throwing frisbee", "trick catch", "catch frisbee", "to catch", "frisbee", "catching frisbee"], "difficult_direct_answer": false, "rationales": ["The man wants to catch.", "The man is reaching to catch the frisbee. it is a common frisbee trick to catch it under one's leg.", "There is a frisbee under his leg."], "image": "train2014/COCO_train2014_000000535218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451643, "question_id": "7RcGF2hKBc9aB45C9AVUFq", "question": "What is the blue box used for?", "choices": ["storage", "parking", "toilet", "resting"], "correct_choice_idx": 2, "direct_answers": ["toilet", "public toilet", "toilet", "toilet", "toilet", "toilet", "lavatory", "toilet", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["It is in a busy public place, has ventilation and a door and holds one person.", "The blue box is used for toiletries.", "The blue box behind the man is a portable toilet for people that need a bathroom."], "image": "train2014/COCO_train2014_000000451643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205432, "question_id": "7RdHFuYGh9HBcg8KWoKemu", "question": "What is the object hanging underneath the roof eave?", "choices": ["signal light", "flood light", "camera", "speaker"], "correct_choice_idx": 2, "direct_answers": ["elevator", "sign", "elevator sign", "camera", "sign", "camera", "sign", "camera", "passers", "elevator sign"], "difficult_direct_answer": false, "rationales": ["The purpose of the object hanging is to take footage.", "This is for security to keep crimes from happening", "Cameras are hanging underneath the eaves."], "image": "train2014/COCO_train2014_000000205432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491372, "question_id": "7Ryj6o46tcFuCq4Hi7HquX", "question": "What color skin does the tartest fruit seen here have?", "choices": ["green", "yellow", "orange", "red"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "orange", "green", "green", "orange", "green", "orange", "green", "orange"], "difficult_direct_answer": false, "rationales": ["Green apples are more tart.", "The granny smith apple is green.", "The green apple is quite tart and certainly the tartest of the others, including the orange and red apple."], "image": "train2014/COCO_train2014_000000491372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456563, "question_id": "7S6DunCbFSgSLCT8CNMmcU", "question": "What are the breaded items?", "choices": ["beef", "shrimp", "sardines", "chicken"], "correct_choice_idx": 1, "direct_answers": ["crouton", "chicken", "chicken", "eating things", "chicken", "shrimp", "crotons", "fried chicken", "meat", "shrimp"], "difficult_direct_answer": false, "rationales": ["The size and the shape eliminate all the other options.", "The breaded items are probably chicken tenders.", "A salad includes brown, fried food items that are a bit curved."], "image": "train2014/COCO_train2014_000000456563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579057, "question_id": "7SCQabYQnfYVQwJCH9dD9f", "question": "What type of food is the yellow item at the back of the bowl?", "choices": ["fruit", "pepper", "egg", "vegetable"], "correct_choice_idx": 0, "direct_answers": ["banana", "banana", "bananas", "bananas", "banana", "banana", "bananas", "banana", "fruit", "banana"], "difficult_direct_answer": false, "rationales": ["Traditionally banana's are categorized as fruits.", "The food is a fruit.", "The food at the back is banana which is a fruit."], "image": "train2014/COCO_train2014_000000579057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224523, "question_id": "7SYwwPTGJ5B8gQAyiqx62z", "question": "The man in black is dressed like what star?", "choices": ["john travolta", "tom cruise", "elvis", "danny devito"], "correct_choice_idx": 2, "direct_answers": ["elvis", "elvis", "elvis", "elvis", "elvis", "elvis", "elvis", "elvis", "elvis", "elvis"], "difficult_direct_answer": false, "rationales": ["He has dark hair, sideburns, sunglasses and a jumpsuit, which is what the singer used to wear during performances.", "The king of rock and roll was famous for his sideburns and his jumpsuits.", "The man looks like the king of rock."], "image": "val2014/COCO_val2014_000000224523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240586, "question_id": "7SdBzDNAUT3CZ6nSUqmTdj", "question": "Asphalts are used to construct what?", "choices": ["house", "roads", "building", "harbor"], "correct_choice_idx": 1, "direct_answers": ["roads", "roads", "roads", "streets", "roads", "maintain roads", "roads", "roads", "commercial vehicle", "road"], "difficult_direct_answer": false, "rationales": ["That's basically why it's made.", "Roads are made with asphalt.", "Asphalt is for roads."], "image": "train2014/COCO_train2014_000000240586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50578, "question_id": "7T3s88MLvRPSVGwENAd4yE", "question": "Which acid is present in orange?", "choices": ["tannic acid", "citric acid", "tartaric acid", "amino acid"], "correct_choice_idx": 1, "direct_answers": ["citric", "citric acid", "citric", "citric", "citric", "citric acid", "citric acid", "citric", "folic", "citric acid"], "difficult_direct_answer": false, "rationales": ["The orange has citric.", "This is a citrus fruit with citric acid in it.", "Oranges are part of the citrus family."], "image": "val2014/COCO_val2014_000000050578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150686, "question_id": "7T6XMCJuok5T7RQxWXaZa5", "question": "The destination on the top of the bus is a city in what country?", "choices": ["guam", "nepal", "thailand", "netherlands"], "correct_choice_idx": 3, "direct_answers": ["netherlands", "england", "germany", "utrechi", "netherlands", "utrechi", "belgium", "usa", "germany", "netherlands"], "difficult_direct_answer": false, "rationales": ["A bus is shown with the next stop listed in the digital sign. the location is in the netherlands.", "The destination is the netherlands.", "The destination is utrecht. it is in europe, not oceania or asia."], "image": "val2014/COCO_val2014_000000150686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32549, "question_id": "7TH7jRBcM8NiMqfBQSFDD6", "question": "What activity is the woman participating in?", "choices": ["shopping", "cleaning", "sleep", "travel"], "correct_choice_idx": 3, "direct_answers": ["travel", "walking", "travelling", "street crossing", "bus riding", "travel", "traveling", "traveling", "travel", "travelling"], "difficult_direct_answer": false, "rationales": ["The woman is traveling with her luggage.", "The woman is participating in world travel.", "This woman is travelling as evidenced by her suitcase and presence near a bus depot."], "image": "val2014/COCO_val2014_000000032549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63140, "question_id": "7TMXBukznx6zkNkkh9BKBA", "question": "What cloth item hangs next to the man?", "choices": ["tablecloth", "banner", "curtain", "poster"], "correct_choice_idx": 0, "direct_answers": ["table cloth", "tablecloth", "table cloth", "tablecloth", "tablecloth", "tablecloth", "table cloth", "table cloth", "picnic tablecloth", "tablecloth"], "difficult_direct_answer": false, "rationales": ["The tablecloth lines the table behind the man.", "A covering is over the table to protect it.", "This is a wooden picnic table"], "image": "train2014/COCO_train2014_000000063140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499786, "question_id": "7TR6WAH2LQHYKUMW4XVEBF", "question": "What type of parking is shown?", "choices": ["valet", "diagonal", "lot", "street"], "correct_choice_idx": 3, "direct_answers": ["street", "parallel", "street", "street side", "street parking", "street", "curbside", "side street", "street", "prohibited"], "difficult_direct_answer": false, "rationales": ["There are two parked cars across the road.", "The cars are parked on the street.", "The cars are parked on the side of the street as they would be in areas where street parking is allowed. they should not block any fire hydrants or driveways when they park to avoid a ticket."], "image": "train2014/COCO_train2014_000000499786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73450, "question_id": "7TXpqhMEFy869hrydbP4zB", "question": "What is the worker doing?", "choices": ["loading cargo", "unloading cargo", "cleaning cargo", "selling cargo"], "correct_choice_idx": 0, "direct_answers": ["working", "loading luggage", "loading", "transferring luggage", "loading place", "loading cargo", "loading", "loading plane", "loading suitcases", "transferring luggage"], "difficult_direct_answer": false, "rationales": ["One can see the cart with the suitcases ready to be put in the hold.", "The worker is loading cargo.", "He is putting luggage onto a conveyor belt to go into the plane."], "image": "val2014/COCO_val2014_000000073450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301266, "question_id": "7TaSMbXRkaV9Db6HVzqkco", "question": "What type items are the focus of the work here?", "choices": ["baskets", "chop sticks", "cutlery", "skewers"], "correct_choice_idx": 2, "direct_answers": ["utensils", "silverware", "silverware", "utensils", "cutlery", "silverware", "silverware", "utensils", "silverware", "utensils"], "difficult_direct_answer": false, "rationales": ["The people are focusing on baskets of spoons, forks, and knives.", "Knives, spoons and forks can be seen in the baskets.", "They are rolling forks, spoons and knives into the napkins."], "image": "val2014/COCO_val2014_000000301266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152600, "question_id": "7ThWGpwke8VSYJxS2JVdKv", "question": "What is the glass container likely to be?", "choices": ["bookshelf", "tv stand", "fish tank", "china cabinet"], "correct_choice_idx": 2, "direct_answers": ["aquarium", "tempered glass", "habitat", "tank", "fish tank", "display case", "dresser", "table", "unsure", "cabinet"], "difficult_direct_answer": true, "rationales": ["It has wood and a structure inside it for animals", "The glass container likely holds water and fish.", "The container is in a living room-like environment. it is somewhat large and contains wood and concrete arranged in a way that would be comfortable for aquatic animals."], "image": "train2014/COCO_train2014_000000152600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493841, "question_id": "7Tr7yzqQyfhKXBjXJJuf3t", "question": "What period of the day is it in the image?", "choices": ["afternoon", "night", "evening", "morning"], "correct_choice_idx": 0, "direct_answers": ["afternoon", "noon", "late morning", "afternoon", "afternoon", "early noon", "midday", "daytime", "noon", "afternoon"], "difficult_direct_answer": false, "rationales": ["A clock reads just after twelve and a shadow falls below it to make a right angle.", "From the shade, it looks to be afternoon.", "It is just after 12:00 and it is light outside, so it is not midnight."], "image": "train2014/COCO_train2014_000000493841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411223, "question_id": "7UFJZjquddEmnPeBLyypMn", "question": "What is closest to the person?", "choices": ["banana", "barrel", "baby", "tiger"], "correct_choice_idx": 0, "direct_answers": ["lighter", "salad", "salad", "platter", "salad", "salad", "salad", "banana", "salad", "food"], "difficult_direct_answer": false, "rationales": ["The banana is closest to the person's hand.", "There are yellow oblong fruits next to the person.", "The person is standing near bananas."], "image": "val2014/COCO_val2014_000000411223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453496, "question_id": "7UtgmPi6VEJvVc5EHXvCnF", "question": "What does the circular sign below the left traffic light mean?", "choices": ["no exit", "no loitering", "no turns", "no parking"], "correct_choice_idx": 2, "direct_answers": ["no turns", "no left", "don't turn", "no turn", "no turning", "no turn", "no turn", "no turn", "no turn", "no left"], "difficult_direct_answer": false, "rationales": ["The sign means you can't turn.", "The sign has an arrow curved to the left with a line slashed through it. that signifies that cars may not travel in that direction down the street.", "The sign has an arrow with a red line through it."], "image": "train2014/COCO_train2014_000000453496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482525, "question_id": "7V7rmEiRDAKhWVLeyt9LaF", "question": "How many people are probably getting ready to dig into the desserts?", "choices": ["two", "one", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "many", "one", "one", "one", "one", "one person", "one", "one"], "difficult_direct_answer": false, "rationales": ["It is on a single plate with a single place setting", "One plate is set.", "There is a single napkin and a single set of utensils."], "image": "train2014/COCO_train2014_000000482525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51920, "question_id": "7VQhzmMX72PDgz8MYp8Tr4", "question": "How many different flavors of pizza did they order?", "choices": ["three", "five", "one", "two"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["You can see two different styles on pizza on the pan in the image.", "There is one flavor near the man and another one across from him.", "There looks to be two different kinds."], "image": "val2014/COCO_val2014_000000051920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342232, "question_id": "7VcgBjcZBJPuT4RCuvakmB", "question": "Why is he smiling?", "choices": ["is wealthy", "for camera", "showing off", "is proud"], "correct_choice_idx": 3, "direct_answers": ["happy", "he's happy", "having fun", "posing", "is proud", "happy", "having fun", "happy", "skateboarding", "fun"], "difficult_direct_answer": false, "rationales": ["Though it could be any of the answers listed above, more than likely he is smiling for the camera.", "It is hard to skateboard. the boy is happy that he learned.", "He is a happy kid."], "image": "val2014/COCO_val2014_000000342232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579664, "question_id": "7VhpKnm4RDauwkhVrnVnq5", "question": "What color are the spots on the fruits?", "choices": ["red", "black", "white", "blue"], "correct_choice_idx": 1, "direct_answers": ["brown", "black", "black", "black", "brown", "brown", "brown", "dark brown", "green yellow", "orange"], "difficult_direct_answer": false, "rationales": ["One can make out the dark colored freckles on the fruit.", "The spots on the bananas are black.", "Dark spots form on bananas when they start turning bad."], "image": "val2014/COCO_val2014_000000579664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331272, "question_id": "7Vi76ojMC5ocfJE9ZXKkS9", "question": "What building or structure is the electric train underneath of?", "choices": ["funnel", "archway", "tunnel", "bridge"], "correct_choice_idx": 3, "direct_answers": ["bridge", "overpass bridge", "bridge", "overpass bridge", "bridge", "bridge", "bridge", "bridge", "bridge", "bridge"], "difficult_direct_answer": false, "rationales": ["By its design and length it's easy to discern what the structure is.", "There is an indoor bridge over top of the train.", "The building is the bridge."], "image": "train2014/COCO_train2014_000000331272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211164, "question_id": "7ViKF9K6tkdE4kFzkbVzJ6", "question": "What is the bowl made from?", "choices": ["wood", "steel", "plastic", "glass"], "correct_choice_idx": 3, "direct_answers": ["plastic", "metal", "from ceramic", "glass", "ceramic", "porcelain", "soup", "glass", "glass", "metal"], "difficult_direct_answer": false, "rationales": ["It is breakable.", "The bowl has a white appearance like plastic and may be used in a restaurant setting to reduce costs over ceramic bowls.", "The bowl the food is in is made from glass."], "image": "train2014/COCO_train2014_000000211164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487835, "question_id": "7VknNuMr5dkZzhdb9CPAp7", "question": "Where is the person walking?", "choices": ["river", "forest", "roadway", "subway"], "correct_choice_idx": 2, "direct_answers": ["crosswalk", "crosswalk", "across street", "roadway", "across street", "crosswalk", "crosswalk", "across", "crosswalk", "street"], "difficult_direct_answer": false, "rationales": ["He is crossing the road in the crosswalk", "A man is moving across the street in the crosswalk.", "The person is using a crosswalk to cross a street."], "image": "train2014/COCO_train2014_000000487835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557886, "question_id": "7Vvu7zSxbobDaTVzv3ryJY", "question": "What is the tallest item here?", "choices": ["horse", "building", "bush", "telephone pole"], "correct_choice_idx": 3, "direct_answers": ["clouds", "telephone poles", "telephone pole", "hill", "telephone poles", "electric pole", "telephone poles", "electric poles", "mountain", "power poles"], "difficult_direct_answer": false, "rationales": ["Is the tallest thing connected to the planet. the highest would be the clouds.", "The bush and horse are relatively short. there does not seem to be a building.", "There are tall phone lines."], "image": "train2014/COCO_train2014_000000557886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367951, "question_id": "7W5K9Ce9u9xC8dKoB2Dehq", "question": "Why might the dog be near the tub?", "choices": ["to bathe", "to guard", "to play", "to eat"], "correct_choice_idx": 0, "direct_answers": ["be washed", "bath time", "bath time", "bath", "recently washed", "bath time", "waiting", "washing", "to bathe", "bath"], "difficult_direct_answer": false, "rationales": ["This is the only reason a dog might be in the bathroom, since they don't use the toilet or the sink.", "The dog wants to be cleaned.", "Some dogs love to take a bath."], "image": "train2014/COCO_train2014_000000367951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510096, "question_id": "7W74gomDvUa7PDjXxpHZF8", "question": "What action are they taking?", "choices": ["stop", "descend", "strike", "ascend"], "correct_choice_idx": 1, "direct_answers": ["descend", "turning", "skiing", "snowboarding", "skiing", "skiing", "skiing", "skiing", "sliding down", "skiing"], "difficult_direct_answer": false, "rationales": ["They are skiing on a hill, which only works going down.", "They are skiing down the hill.", "These people are skiing down a mountain."], "image": "train2014/COCO_train2014_000000510096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542111, "question_id": "7W8PWsFEbkA3kGyMQomQoD", "question": "Why is he standing like that?", "choices": ["falling", "slipped", "showing off", "hitting ball"], "correct_choice_idx": 3, "direct_answers": ["reaching ball", "balance", "playing", "reaching ball", "hitting ball", "tennis player", "hit ball", "playing tennis", "tennis", "return hit"], "difficult_direct_answer": true, "rationales": ["This is an action photo where he is stretching to return his opponent's serve as quickly as possible to score a point.", "He is stretched out to reach the ball", "This tennis player has just run as fast as he could to strike the incoming tennis ball. he had to stop quickly to position himself to make the hit, which meant throwing his right leg up to maintain his balance."], "image": "train2014/COCO_train2014_000000542111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491830, "question_id": "7WDZmcvg7T4SEummw4txMi", "question": "What type vehicle does this sign refer to?", "choices": ["large truck", "bike", "roller blade", "mini bike"], "correct_choice_idx": 0, "direct_answers": ["car", "car", "all vehicles", "runaways", "cars", "automobile", "large truck", "trucks", "car", "car"], "difficult_direct_answer": false, "rationales": ["This is because a big vehicle will roll easy on a hill", "Trucks are generally the type of vehicle that is involved in runaways because of their size and weight they are carrying.", "A sign instructs how to park on a hill. trucks can be dangerous if they are left in gear on a hill."], "image": "train2014/COCO_train2014_000000491830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113251, "question_id": "7WSVo6AiWFgCHF3c6EVWDJ", "question": "What time of day is shown here?", "choices": ["midnight", "noon", "dawn", "late morning"], "correct_choice_idx": 2, "direct_answers": ["early evening", "night", "night", "dawn", "night", "night", "night", "night", "evening", "evening"], "difficult_direct_answer": false, "rationales": ["There is not a lot of light out and there aren't a lot of people on the road.", "The street is seen in early morning when the sun is just coming up and not many people are awake.", "It must be early out since the sky is gray."], "image": "val2014/COCO_val2014_000000113251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175459, "question_id": "7WTXXos7C82iEWwqdoJ7Gy", "question": "What does the open store on the left sell?", "choices": ["bikes", "drugs", "shirts", "gas"], "correct_choice_idx": 1, "direct_answers": ["chemicals", "medicine", "medicine", "medicine", "drugs", "drugs", "medicine", "drugs", "pharmacy items", "drugs"], "difficult_direct_answer": false, "rationales": ["The store on the left is a pharmacist.", "A sign advertises a chemist. drugs are made and sold by chemists.", "There is a sign to the left that says chemist. these are people that mix things for people to take to make them feel certain things for help them feel better."], "image": "train2014/COCO_train2014_000000175459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507783, "question_id": "7WTviPetKCLrpt6Lw68WBp", "question": "What are these types of bags primarily used for?", "choices": ["selling", "buying", "traveling", "trading"], "correct_choice_idx": 2, "direct_answers": ["travelling", "travel", "vacations", "traveling", "travel", "travel", "travel", "packing", "travel", "travel"], "difficult_direct_answer": false, "rationales": ["These bags are generally used for people to travel with.", "The bags are suitcases that are used for traveling.", "The luggage is for traveling."], "image": "val2014/COCO_val2014_000000507783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259099, "question_id": "7WUVhKmoYGRcB5GqEgYWDS", "question": "What vehicle is boarded in this building?", "choices": ["cars", "bus", "train", "gondola"], "correct_choice_idx": 3, "direct_answers": ["nothing", "bus", "zamboni", "snowbike", "gondola", "buses", "gondola", "gondola", "cable car", "plane"], "difficult_direct_answer": false, "rationales": ["The building is labeled with the word gondola, which in this case is a euphemism for ski lift, so \"gondolas\" have to be the vehicle that people board in this building.", "A gondola would be in this building.", "The sign on the side of the building indicates which vehicle is boarded inside."], "image": "val2014/COCO_val2014_000000259099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240543, "question_id": "7WmrVjYHuzBUEF7NVWWwQW", "question": "What might this trick's technique be called?", "choices": ["kick flip", "christ air", "side spill", "rim rocker"], "correct_choice_idx": 0, "direct_answers": ["ollie", "flip", "ollie", "ollie", "flip", "flip", "kick flip", "ollie", "flip", "flip"], "difficult_direct_answer": false, "rationales": ["This looks to be flipping up and a kick flip", "The trick is the kick flip.", "The board is flipping over into the air."], "image": "train2014/COCO_train2014_000000240543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566576, "question_id": "7WopFf53ovrzuRR4wLYcFN", "question": "What country is this station located at?", "choices": ["singapore", "norway", "america", "england"], "correct_choice_idx": 3, "direct_answers": ["united kingdom", "england", "norway", "england", "england", "uk", "england", "england", "england", "england"], "difficult_direct_answer": false, "rationales": ["Norwich is located in england.", "The name of the bus station is on the building, i've been there and it's in england.", "This city is located not far from london so the station is in england."], "image": "val2014/COCO_val2014_000000566576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85284, "question_id": "7Ws8VR32GERn6DEw4scNeR", "question": "Where is this parking structure located?", "choices": ["culver city", "portland", "chicago", "los angeles"], "correct_choice_idx": 3, "direct_answers": ["hollywood", "hollywood", "hollywood", "house", "los angeles", "house", "hollywood", "hollywood", "house", "hollywood"], "difficult_direct_answer": false, "rationales": ["Hollywood is a part of los angeles.", "It is near the 101 as well as the kodak and chinese theatres.", "Hollywood blvd is in la."], "image": "train2014/COCO_train2014_000000085284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292931, "question_id": "7X7P83TVbYa4a6hxR5Dtay", "question": "What has this piece of pottery been repurposed as?", "choices": ["dog bed", "planter", "cat bed", "paper weight"], "correct_choice_idx": 2, "direct_answers": ["cat bed", "cat bed", "cat bed", "bed", "bed", "cat", "cat bed", "bed", "cat bed", "cat bed"], "difficult_direct_answer": false, "rationales": ["There are two felines that are curled up and sleeping in the pot.", "The pottery has been taken over by cats that use it as a bed.", "Cats love to sleep in circular things."], "image": "val2014/COCO_val2014_000000292931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518774, "question_id": "7XSQKxSW2RsGLbc3o3EibP", "question": "Why is the woman in purple with the purple purse holding up her right hand?", "choices": ["flying kite", "signaling help", "volunteering", "waving"], "correct_choice_idx": 0, "direct_answers": ["flying kite", "kite", "fly kite", "kite", "kite flying", "flying kite", "kite", "flying kite", "flying kite", "flying kite"], "difficult_direct_answer": false, "rationales": ["The woman in purple is holding a kite in her hand.", "The woman is flying the kite.", "Her kite is visibly attached to her, and it is flying."], "image": "train2014/COCO_train2014_000000518774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201043, "question_id": "7XWGQZ5gLrNpN8oPspREfi", "question": "Why are the umbrellas hung upside down?", "choices": ["protest", "luck", "sales display", "rain protection"], "correct_choice_idx": 2, "direct_answers": ["advertising", "for display", "air", "sales display", "decoration", "display", "decoration", "exhibiting", "sales display", "for sale"], "difficult_direct_answer": false, "rationales": ["A market has umbrellas in containers as well as hanging above.", "The umbrellas are being shown to customers to sell.", "The umbrellas are in a shop."], "image": "val2014/COCO_val2014_000000201043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299343, "question_id": "7XZhqeqmeH4hAMkxg3mraN", "question": "What does this bus have a connection to?", "choices": ["television", "restaurants", "subways", "internet"], "correct_choice_idx": 3, "direct_answers": ["city bus", "wifi", "five star", "wifi", "wifi", "internet", "road", "leonidas smith", "town", "next stop"], "difficult_direct_answer": false, "rationales": ["Lettering on top of the front of the bus lets passengers know that they'll have free wi-fi if they choose to travel on this bus.", "There is internet.", "A bus advertises an internet connection."], "image": "train2014/COCO_train2014_000000299343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180315, "question_id": "7XarKnkY7JPfWEb8YftXoR", "question": "What is the elephant doing?", "choices": ["bathing", "eating dinner", "escaping", "drinking water"], "correct_choice_idx": 3, "direct_answers": ["drinking", "drinking water", "drinking water", "drinking", "drinking", "drinking water", "drinking", "drinking", "drinking", "drinking water"], "difficult_direct_answer": false, "rationales": ["The elephant wants a drink.", "The elephant is drinking.", "We can see the ripples in the water where the elephant had his trunk as he filled it to get a drink. the end of his trunk is now in his mouth, and he can quench his thirst with a cool drink."], "image": "train2014/COCO_train2014_000000180315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485413, "question_id": "7XiWR5DXGzfQyh9nbkFjbs", "question": "What kind of horseback riding style is this?", "choices": ["western", "arabian", "english", "group"], "correct_choice_idx": 2, "direct_answers": ["show jumping", "equestrian", "english", "equestrian", "standing up", "western", "equestrian", "jumping", "jumping", "freestyle"], "difficult_direct_answer": false, "rationales": ["English horseback riding involves jumping.", "The horseback rider is performing an english sport.", "The style is english."], "image": "val2014/COCO_val2014_000000485413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27012, "question_id": "7YhQnF8vRxB9Sa7PQ5Xwds", "question": "What types of signs are shown?", "choices": ["informational", "warning", "traffic", "directional"], "correct_choice_idx": 0, "direct_answers": ["information", "informational", "informative history", "foreign", "warning signs", "zoo signs", "foreign", "park signs", "informational", "informational"], "difficult_direct_answer": false, "rationales": ["The signs posted are informational that visitors can read and learn about the surroundings.", "Signs are posted all around an area with fences in nature.", "Signs are on the side of enclosures. signs on enclosures usually tell about what is inside."], "image": "train2014/COCO_train2014_000000027012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200416, "question_id": "7Yj7cjA85sgo6T8Dep9wbh", "question": "What kind of skateboard ramp is this?", "choices": ["bowl", "half pipe", "quarter pipe", "launch"], "correct_choice_idx": 2, "direct_answers": ["half pipe", "slope ramp", "half pipe", "half pipe", "half pipe", "curved", "halfpipe", "quarter pipe", "unknown", "quarter pipe"], "difficult_direct_answer": false, "rationales": ["The skateboard ramp is smaller so it's only a quarter pipe.", "The skateboard ramp is a quarter pipe.", "That is what it's called."], "image": "train2014/COCO_train2014_000000200416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184532, "question_id": "7ZKw9exx4EzpzHBMsmxHie", "question": "Where would more well known government workers work here?", "choices": ["rightmost building", "middle building", "leftmost building", "park"], "correct_choice_idx": 0, "direct_answers": ["white house", "capitol", "white house", "capitol", "rightmost building", "white house", "washington dc", "whitehouse", "capitol", "capitol"], "difficult_direct_answer": false, "rationales": ["The capitol building is on the far right. congress people work in this building.", "It is a government building.", "The building has a recognizable color and dome shape on top of it."], "image": "train2014/COCO_train2014_000000184532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536416, "question_id": "7ZYCDVhLoJH4CDuY3bLeux", "question": "What color is the wetsuit of the woman who is standing on the surf board?", "choices": ["green", "black", "blue", "red"], "correct_choice_idx": 2, "direct_answers": ["blue", "black", "black", "black blue", "black blue", "black", "black", "black", "black", "black blue"], "difficult_direct_answer": false, "rationales": ["The woman's wet suit is a bright royal blue hue.", "The woman's wetsuit is both black and blue.", "The color is blue."], "image": "val2014/COCO_val2014_000000536416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380252, "question_id": "7ZhtaDvi6YmLfH56Tte5Gu", "question": "How many species rest here?", "choices": ["three", "none", "one", "ten"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two different kinds of animals and a person here.", "There's a trio of species here...two cats, a dog, and a human.", "There are cats, a dog, and a person in the bed that add up to 3 species."], "image": "train2014/COCO_train2014_000000380252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381630, "question_id": "7ZnJey5JM5dqiCowB3bMSJ", "question": "What kind of place is this?", "choices": ["garage", "shed", "barn", "city"], "correct_choice_idx": 3, "direct_answers": ["industrial area", "city", "city", "city", "train station", "city", "train station", "city", "industrial", "train station"], "difficult_direct_answer": false, "rationales": ["A large urban area is shown.", "The train is traveling on tracks that go through a busy city.", "This place is a busy urban area."], "image": "val2014/COCO_val2014_000000381630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559840, "question_id": "7ZoKajkHqVdQadYrMvBRS3", "question": "What side dish would compliment his food quite well?", "choices": ["soup", "apples", "fries", "milk"], "correct_choice_idx": 2, "direct_answers": ["mashed potatoes", "potatoes", "cole slaw", "fries", "rice", "fries", "fries", "french fries", "cole slaw", "green beans"], "difficult_direct_answer": false, "rationales": ["The side dish is fries.", "Chicken strips are on a plate. fries are often served with chicken strips.", "Fries go well with chicken strips."], "image": "train2014/COCO_train2014_000000559840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176432, "question_id": "7aFxTXbmMDY7kMkQXdRWTR", "question": "What boy would feel at home among these characters?", "choices": ["christopher robin", "hansel", "peter pan", "jack"], "correct_choice_idx": 0, "direct_answers": ["christopher robin", "christopher robin", "young boy", "christopher robin", "feminine", "tommy", "christopher robin", "christopher robin", "christopher robin", "pooh fan"], "difficult_direct_answer": false, "rationales": ["The characters are from winnie-the-pooh.", "He is the only human in the hundred acre wood.", "The characters include winnie-the-pooh, piglet, and eeyore. they are from the books written by a. a. milne."], "image": "train2014/COCO_train2014_000000176432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429042, "question_id": "7aiBPrKRPFKEbjwjZyzuQq", "question": "What size helmet does a 6 year old need?", "choices": ["60cm", "35cm", "53cm", "78cm"], "correct_choice_idx": 2, "direct_answers": ["small", "53cm", "small", "small", "small", "small", "childsize", "53 cm", "53 centimeters", "child size"], "difficult_direct_answer": false, "rationales": ["A child has a small head. 53 cm is a standard size for a child.", "A child has a small head that would need a 53 cm helmet.", "Though people have different sized heads, 53 cm would be the average size head of a 6 year old child."], "image": "val2014/COCO_val2014_000000429042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432897, "question_id": "7ajWvnhrsWRVeZSf87cZRL", "question": "What is primarily done on the furniture in the background?", "choices": ["swing", "sleep", "sit", "lay"], "correct_choice_idx": 2, "direct_answers": ["sitting", "sitting", "sitting", "audience", "sitting", "audience", "audience", "sit", "sitting", "sitting"], "difficult_direct_answer": false, "rationales": ["Two women are on a baseball field. there are bleachers in the background.", "There are rows of seats behind the field for the audience to sit.", "The bleachers are used to sit on."], "image": "train2014/COCO_train2014_000000432897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407115, "question_id": "7aptyUKNvdcy5Y93mdj4BE", "question": "What does this store sell?", "choices": ["tires", "coffee", "cars", "horses"], "correct_choice_idx": 1, "direct_answers": ["coffee", "food stuff", "coffee", "super market", "coffee", "kitchen appliances", "beverages", "coffee", "food", "coffee"], "difficult_direct_answer": false, "rationales": ["The store has a bunch of pots of coffee.", "A sore has coffee makers and flavors to add to the coffee on the counter.", "The store has several coffee urns and flavored syrups for sale."], "image": "val2014/COCO_val2014_000000407115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170077, "question_id": "7aq4BZ25ikjF9yGGnpS93K", "question": "What weather phenomena wouldn't threaten this boy?", "choices": ["heat", "tornado", "rain", "hurricane"], "correct_choice_idx": 2, "direct_answers": ["rain", "storm", "rain", "rain", "storm", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The boy is wearing an umbrella hat. the boy is also inside.", "He is wearing an umbrella.", "He has an umbrella as a hat that would protect him from falling water"], "image": "val2014/COCO_val2014_000000170077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184416, "question_id": "7b34eDxThrUwTqMbmoXaRL", "question": "How many hours until midnight?", "choices": ["two", "three", "four", "eight"], "correct_choice_idx": 3, "direct_answers": ["5.5", "9 hours", "3 hours", "nine", "eight", "eight", "six", "eight half", "six", "eight hours"], "difficult_direct_answer": false, "rationales": ["The clock at the train station says it is 3:30 and about 8 hours until midnight.", "The clock shows the current time as 3:31 so i chose the closest possible option.", "It's currently four o'clock."], "image": "train2014/COCO_train2014_000000184416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566752, "question_id": "7b3qfxYoaqDyrjeMdSQgJf", "question": "What publication did this magazine start out as?", "choices": ["pamphlet", "novel", "tabloid", "newspaper"], "correct_choice_idx": 3, "direct_answers": ["vogue", "weekly newspaper", "unknown", "vogue novita", "newspaper", "vogue", "vogue", "weekly newspaper", "vogue", "vogue"], "difficult_direct_answer": false, "rationales": ["Vogue was previously printed with news pieces.", "The magazine is vogue.", "It would of been a newspaper before a magazine."], "image": "train2014/COCO_train2014_000000566752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166840, "question_id": "7bMPbdQDt8B3EHLZDeFXvP", "question": "What type of headwear is the boy in white wearing?", "choices": ["bandana", "skullet", "beanie", "cap"], "correct_choice_idx": 2, "direct_answers": ["helmet", "beanie", "beanie", "normal", "beanie", "cap", "helmet", "beanie", "toboggan", "hat"], "difficult_direct_answer": false, "rationales": ["The boy is wearing headgear that matches the item mentioned in option a.", "A beanie is a cap with no brim that hugs the head and people usually cuff it to make it fit.", "It is knitted, fits snugly on his head and does not have a brim on the front."], "image": "val2014/COCO_val2014_000000166840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524533, "question_id": "7bS6JHynpAFHCC5rV82mzL", "question": "What country is this street found in?", "choices": ["france", "australia", "united states", "britain"], "correct_choice_idx": 2, "direct_answers": ["uk", "london", "usa", "new york", "united states", "uk", "united kingdom", "united states", "england", "united states"], "difficult_direct_answer": false, "rationales": ["There is an american flag at the end of the bus.", "The country is the usa.", "The street is in the united states. everything appears to be in english."], "image": "val2014/COCO_val2014_000000524533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270836, "question_id": "7bT7WdTwLqfsB9qxaHJvDy", "question": "Where is this elephant located?", "choices": ["wild", "pet store", "farm", "zoo"], "correct_choice_idx": 0, "direct_answers": ["zoo", "zoo", "zoo", "wild", "zoo", "zoo", "zoo", "in zoo", "zoo", "wild"], "difficult_direct_answer": false, "rationales": ["He is in a contained area.", "The elephant is in an unnatural environment made to look natural.", "One can see the gate and the keeper on the left."], "image": "train2014/COCO_train2014_000000270836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230468, "question_id": "7bfWJ4xyqNdpzXLEdyV9Ns", "question": "Where are the people located?", "choices": ["school", "hospital", "restaurant", "home"], "correct_choice_idx": 3, "direct_answers": ["balcony", "balcony", "outside", "balcony", "porch", "playing ground", "patio", "home", "home", "back deck"], "difficult_direct_answer": false, "rationales": ["The people are at home.", "There are rows of houses visible in the background to indicate this may be a residential area. the balcony size they are on as well as the personal grill and the overall decor are most commonly found in residences.", "The people are at home and are on the porch."], "image": "val2014/COCO_val2014_000000230468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157358, "question_id": "7cdfuEgKiWzBZuKoveUa3f", "question": "What enables these people to go faster on the street?", "choices": ["roller blades", "ice skates", "skate boards", "roller skates"], "correct_choice_idx": 0, "direct_answers": ["roller blades", "roller blades", "roller blades", "wheels", "skateboard", "wheels", "wheels", "rollerblades", "incline skates", "roller blades"], "difficult_direct_answer": false, "rationales": ["They are on roller blades on the street.", "The people are wearing skates with inline wheels in the street.", "They are having wheels on their shoes."], "image": "val2014/COCO_val2014_000000157358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414162, "question_id": "7cqS4TMtN9XYN2EDoTbrgw", "question": "Who are these people?", "choices": ["athletes", "criminals", "interns", "ambassadors"], "correct_choice_idx": 2, "direct_answers": ["interns", "christians", "students", "college students", "students", "united nations", "college club", "frat boys", "kids un", "students"], "difficult_direct_answer": false, "rationales": ["They look young and are probably interns for a company.", "The people are young and look like high schoolers.", "These people look to be young adults and the flag is for the united nations. people at this age would be too young to work full time there but would likely be able to participate in internships."], "image": "train2014/COCO_train2014_000000414162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216681, "question_id": "7cx8uAWJR2ugSQVwe48XUH", "question": "What is the person with the kite doing?", "choices": ["flying", "kite flying", "sailing", "kitesurfing"], "correct_choice_idx": 3, "direct_answers": ["surfing", "flying", "controlling kite", "waterflying", "waveboarding", "skiing", "kitesurfing", "flying", "kite surfing", "kiting"], "difficult_direct_answer": true, "rationales": ["The person is using the kite for surfing.", "A person is on the water on a surfboard attached to a kite.", "The person is using the kite to surf in the water."], "image": "train2014/COCO_train2014_000000216681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521671, "question_id": "7d7cYRGLMCQyDz4NSQodXQ", "question": "What creature does this animal like to feed on?", "choices": ["fish", "beef", "eggs", "oranges"], "correct_choice_idx": 0, "direct_answers": ["bugs", "fish", "fish", "duck", "fish", "bugs", "ants", "bugs", "insects", "bread"], "difficult_direct_answer": false, "rationales": ["Ducks eat fish.", "It makes b and doesn't eat c. i'm not sure if they like d at all.", "Ducks swim in water and catch the fish"], "image": "train2014/COCO_train2014_000000521671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479948, "question_id": "7dZnX2MbEdS3jveAJrijEb", "question": "What is the net in front of the spectators there for?", "choices": ["stop ball", "chicken pen", "player captivity", "punishment"], "correct_choice_idx": 0, "direct_answers": ["protection", "protection", "protect them", "protection", "baseball protection", "safety", "catch ball", "stop ball", "protection", "seperation"], "difficult_direct_answer": false, "rationales": ["Without some kind of barrier, such as this net, there's a chance the baseball will be shot into the spectators some kind of way, which can lead to painful injuries.", "Nets protect the people from balls.", "The net prevents bystanders from getting hurt if a ball comes flying over the wall."], "image": "val2014/COCO_val2014_000000479948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59494, "question_id": "7dtpycHHa4qttQMNCVPfp4", "question": "In what area is this chair located?", "choices": ["park", "playground", "side walk", "backyard"], "correct_choice_idx": 2, "direct_answers": ["sidewalk", "city", "sidewalk", "sidewalk", "sidewalk", "city", "sidewalk", "inner city", "side walk", "sidewalk"], "difficult_direct_answer": false, "rationales": ["The nearby road and the pavement below the bench give away its location.", "There is concrete under the object and vents on the concrete. it is very close to a building on the area for foot traffic. right beside the bench would be a road for traffic.", "People can't sit on the street because it's dangerous. i also see three people walking in the background."], "image": "train2014/COCO_train2014_000000059494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447147, "question_id": "7dxST6BYDW7s2XwWzgqj6T", "question": "What is safest to touch without being burned?", "choices": ["foil", "inside stove", "food", "pan"], "correct_choice_idx": 0, "direct_answers": ["foil", "outer door", "nothing", "oven mitt", "oven mitt", "aluminum foil", "white parts", "door", "foil", "outer surface"], "difficult_direct_answer": false, "rationales": ["The product made out of aluminum does not retain the heat like pans and other items.", "The foil is safe.", "The food conducts the least heat out of everything."], "image": "val2014/COCO_val2014_000000447147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494594, "question_id": "7e2DtVJi7x72XDoLbjBfM7", "question": "This person is wearing what type of orthodox headwear?", "choices": ["estonian", "amish", "russian", "jewish"], "correct_choice_idx": 3, "direct_answers": ["hat", "hat", "kippah", "kippah", "hat", "jewish", "hat", "hat", "hat", "hat"], "difficult_direct_answer": false, "rationales": ["Jewish people wear that hat.", "That is who would wear this head gear.", "He is jewish."], "image": "train2014/COCO_train2014_000000494594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235296, "question_id": "7e2tqy5zcx7zNZKZmDW5Lr", "question": "How did the woman come here?", "choices": ["by car", "by bike", "by train", "on foot"], "correct_choice_idx": 1, "direct_answers": ["bike", "by bike", "bicycle", "bicycle", "bike", "bike", "bike", "bike", "view sky", "bicycle"], "difficult_direct_answer": false, "rationales": ["She rode her bike that is sitting next to her.", "The woman rode her bike to the bench near the water. when she arrived she parked the bike nearby and sat on the bench.", "You can tell by the bicycle leaning near her as to how she got there."], "image": "val2014/COCO_val2014_000000235296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214385, "question_id": "7e7bCzwYQWc6WzFfreiBDb", "question": "What age group is most represented at this location?", "choices": ["seniors", "teens", "children", "adults"], "correct_choice_idx": 2, "direct_answers": ["children", "children", "children", "youth", "children", "youth", "youth", "children", "children", "unknown"], "difficult_direct_answer": false, "rationales": ["Kids are generally on the bunny slope.", "The kid is small.", "Aside from a few adults, all of the people are young minors."], "image": "train2014/COCO_train2014_000000214385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346138, "question_id": "7eAm6RbDMGDtA27q6yj9kC", "question": "What vessel is needed to serve these foods?", "choices": ["plate", "pan", "cup", "bowl"], "correct_choice_idx": 3, "direct_answers": ["bowl", "bowl", "bowl", "ladle tongs", "bowl", "bowls", "plate", "bowl", "bowl", "bowls"], "difficult_direct_answer": false, "rationales": ["These foods all have a lot of liquid in them and need to be served in a bowl.", "The vessel is the bowl.", "All these noodles and soups are best contained in a bowl."], "image": "val2014/COCO_val2014_000000346138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106794, "question_id": "7eS9RFmf58pU2wYFcgT3TL", "question": "What is the person in the carriage most likely looking for?", "choices": ["food", "hay", "predators", "passengers"], "correct_choice_idx": 3, "direct_answers": ["passengers", "passengers", "customers", "passengers", "passengers", "passengers", "tourists", "fares", "tourists", "riders"], "difficult_direct_answer": false, "rationales": ["The horse needs food and water and that costs money.", "This person earns money only if he takes people on a ride.", "Horse and buggies aren't very common and are used in cities for sight seeing"], "image": "val2014/COCO_val2014_000000106794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84540, "question_id": "7egrctZhBHyjNHPgotZPJD", "question": "When did two companies merge into this one bank?", "choices": ["2018", "1981", "2008", "1995"], "correct_choice_idx": 3, "direct_answers": ["1995", "1995", "recently", "2013", "1995", "1995", "1995", "2015", "racing", "nineteen ninetyfive"], "difficult_direct_answer": false, "rationales": ["Lloyds s&b was a merger from the year 1995.", "I used the internet to search the lloyds and tub merger year.", "The companies merged in 1995."], "image": "val2014/COCO_val2014_000000084540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405980, "question_id": "7eh8ci9GT7ztr6sn8QKgHw", "question": "What color shirt does the person opposing the wii woman in black?", "choices": ["striped gray", "green stripe", "black", "none"], "correct_choice_idx": 0, "direct_answers": ["gray striped", "striped", "green", "grey", "grey stripes", "grey black", "gray", "gray", "gray", "striped gray"], "difficult_direct_answer": false, "rationales": ["The shirt is striped gray.", "There is a man wearing a sweater next to the woman. his sweater has stripes and the color is in the gray tones.", "The pattern on the shirt is striped."], "image": "train2014/COCO_train2014_000000405980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479727, "question_id": "7ew54Hc6UkEDUzdZuomZAf", "question": "What color building material is popular for construction here?", "choices": ["red", "green", "white", "clear"], "correct_choice_idx": 0, "direct_answers": ["red brick", "brown", "brown", "brown", "red brick", "red", "red brick", "brick", "red brick", "red"], "difficult_direct_answer": false, "rationales": ["Many of the buildings are made out of bricks. bricks are not white, green, or clear.", "There are a lot of reddish bricks here.", "There are several buildings with bricks."], "image": "val2014/COCO_val2014_000000479727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250001, "question_id": "7eyf4ZcSZn33rXBvxhv98z", "question": "Who is the person sitting high above the tennis net?", "choices": ["referee", "trainer", "coach", "announcer"], "correct_choice_idx": 0, "direct_answers": ["referee", "referee", "umpire", "referee", "referee", "umpire", "referee", "umpire", "umpire", "referee"], "difficult_direct_answer": false, "rationales": ["The chair umpire is responsible for making sure all the play is fair.", "The person sitting above the tennis net is a referee that makes calls during the game.", "A referee judges plays from the high seat."], "image": "val2014/COCO_val2014_000000250001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423711, "question_id": "7fBVfQbqBRy48NF5ARLYkU", "question": "Why is one person wearing a uniform?", "choices": ["police officer", "event employee", "firefighter", "nurse"], "correct_choice_idx": 1, "direct_answers": ["worker", "waiter", "chef", "worker", "serving food", "waitress", "they're working", "chef", "chef", "event employee"], "difficult_direct_answer": false, "rationales": ["The person is the only one wearing a white uniform. since she is standing she is most likely a waitress for the event.", "The person is the event employee.", "They are working the event as a waiter or cook."], "image": "train2014/COCO_train2014_000000423711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284902, "question_id": "7fKHyXqggoLoHxZXPB6qJj", "question": "What color is most of the fruit?", "choices": ["red", "yellow", "green", "orange"], "correct_choice_idx": 2, "direct_answers": ["green", "greenish", "green", "green", "green", "green", "greenish", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["There are bananas, but they aren't ripe yet.", "Most of the fruit are unripened bananas. they are not yellow, red, or orange.", "This is the color of unripe bananas."], "image": "train2014/COCO_train2014_000000284902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110353, "question_id": "7fSZUeJYiugneaZxBCwLx5", "question": "Who is this area designed for?", "choices": ["employees", "public", "customers", "politicians"], "correct_choice_idx": 1, "direct_answers": ["lovers", "pedestrians", "park visitors", "sightseers", "tourists", "public", "men", "tourist", "tourists", "sitting"], "difficult_direct_answer": true, "rationales": ["This area consists of a body of water, a sidewalk, and a paved area with benches, and there aren't any signs designating the area for specific people.", "It is a large and shared open space with many areas for people to sit and walk.", "There is a walkway and lots of benches"], "image": "train2014/COCO_train2014_000000110353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454172, "question_id": "7fUSM7F3J8nKg4mYriRJ8r", "question": "What is the man trying to do to the bird?", "choices": ["photograph it", "trim it", "capture it", "feed it"], "correct_choice_idx": 0, "direct_answers": ["take picture", "take picture", "take picture", "photograph", "take picture", "photograph it", "photograph", "take picture", "photograph", "take picture"], "difficult_direct_answer": false, "rationales": ["The man is holding a camera up to his face. when people do that, they are taking a picture of what's in front of them.", "A man in a car is visible in the rearview mirror as he aims his camera at a large brown, unusual bird on the ground below him.", "The man wants to take a picture of the bird."], "image": "train2014/COCO_train2014_000000454172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373338, "question_id": "7fmgv9naBoNo4zSBZdF6Ey", "question": "What type of vehicle is in front of the building?", "choices": ["rental", "bus", "passenger", "commercial"], "correct_choice_idx": 3, "direct_answers": ["crane", "commercial", "crane", "truck", "crane", "turk", "construction vehicle", "crane lift", "amherst", "crane carrier"], "difficult_direct_answer": false, "rationales": ["The vehicle is enormous so it must be commercial.", "The vehicle is commercial.", "The vehicle is a crane for construction."], "image": "train2014/COCO_train2014_000000373338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144298, "question_id": "7fowKHWtcwn4zhyoFwxXvR", "question": "What letter is on the bumper?", "choices": ["p", "", "g", "o"], "correct_choice_idx": 0, "direct_answers": ["p p", "letter p", "p is", "letter p", "p", "letter p", "letter p", "p", "letter p", "p"], "difficult_direct_answer": false, "rationales": ["The letter \"p\" is shown on the sticker.", "The letter \"p\" appears on the circular sticker.", "A red circle with a distinct letter p can be seen on the bottom right of the bus."], "image": "train2014/COCO_train2014_000000144298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357755, "question_id": "7g85Fenf7P9rLx9bP9aSMy", "question": "What part of the country is he riding on?", "choices": ["valley", "coastline", "plateau", "mountain"], "correct_choice_idx": 1, "direct_answers": ["beach", "beach", "beach", "coast", "coastal area", "beach", "coastline", "shore", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The person is riding along the shoreline.", "The horse is on the coastline.", "The person is riding near the waves."], "image": "train2014/COCO_train2014_000000357755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72155, "question_id": "7gDFTsdekrVXRxFFMPCBC7", "question": "What part of the bench has been removed?", "choices": ["seat", "legs", "footrest", "back"], "correct_choice_idx": 3, "direct_answers": ["back rest", "backrest", "back", "back", "back", "back section", "back", "back", "bench back", "back part"], "difficult_direct_answer": false, "rationales": ["The back was removed.", "A boy is sitting on a bench with no back.", "You are able to see that the boy's back is not resting on the bench. the yellow stands have brackets, but nothing is attached to them."], "image": "val2014/COCO_val2014_000000072155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464079, "question_id": "7gVLXwT3ydV8TsCbZvAGTF", "question": "What are they looking at?", "choices": ["small child", "soap opera", "video screen", "trained puppy"], "correct_choice_idx": 2, "direct_answers": ["wii", "screen", "television", "tv", "television", "video screen", "tv", "tv", "television", "television screen"], "difficult_direct_answer": false, "rationales": ["They are holding video game controllers and are all looking in the same direction.", "They are looking at the tv while playing nintendo wii.", "The men in front are holding nintendo wii remotes. they are playing a game."], "image": "train2014/COCO_train2014_000000464079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436585, "question_id": "7gfXPkPy68qj8cr8TEH6FN", "question": "What is available on this meal among the following ingredients options?", "choices": ["rice", "beans", "broccoli", "kale"], "correct_choice_idx": 0, "direct_answers": ["rice", "rice", "rice", "rice", "carrots", "carrots", "veggies", "carrots", "rice", "rice"], "difficult_direct_answer": false, "rationales": ["The other options don't appear here on the plate.", "There is white rice under the vegetables.", "A plate has food on it that is arranged on top of rice."], "image": "train2014/COCO_train2014_000000436585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477852, "question_id": "7ggiLf8bsUyccPhvYaBzG5", "question": "Who is most likely named Laurent?", "choices": ["lead bike", "rear bike", "sponsor", "photographer"], "correct_choice_idx": 3, "direct_answers": ["closest driver", "photographer", "three", "french person", "photographer", "number three", "owner", "bike", "photographer", "photographer"], "difficult_direct_answer": false, "rationales": ["The photographer is named laurent since it's copyrighted by that person.", "Laurent is the person who took the picture.", "The picture was copyrighted by laurent which is something that would be done by the owner of the photo which would be the person who took the picture."], "image": "train2014/COCO_train2014_000000477852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267694, "question_id": "7go8NXvGNefmgLL4yK7884", "question": "What is the pattern of the tablecloth?", "choices": ["farm", "checkered", "striped", "spotted"], "correct_choice_idx": 1, "direct_answers": ["checkerboard", "checkered", "plaid", "checked", "checkered", "plaid", "checkered", "check", "pizza", "plaid"], "difficult_direct_answer": false, "rationales": ["The pattern has repeating squares of different colors. this pattern is described as answer a.", "The table cloth has yellow and white boxes.", "The pattern is checkered."], "image": "val2014/COCO_val2014_000000267694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189780, "question_id": "7h8WJctYT6ZVTpR3CpvX4N", "question": "What are the people waiting to do?", "choices": ["eat", "work", "ride train", "dance"], "correct_choice_idx": 2, "direct_answers": ["board traubs", "take train", "tickets", "board train", "board trains", "ride train", "board", "catch train", "board train", "board train"], "difficult_direct_answer": false, "rationales": ["They appear to be waiting for a, as evidenced by their luggage.", "This famous location is known to be a train station you can see in the picture the ticket booth windows and the lines of passengers all with baggage waiting to purchase such tickets. this is the only type of transportation provided in this type of setting.", "Most of them have a few large suitcases and are standing in a line."], "image": "train2014/COCO_train2014_000000189780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220821, "question_id": "7h9S9Zyrrvk6nsiVxtscv6", "question": "What animal is in the photo?", "choices": ["bull", "dog", "giraffe", "hen"], "correct_choice_idx": 2, "direct_answers": ["giraffe", "giraffe", "giraffe", "giraffe", "giraffe", "giraffe", "giraffe", "giraffe", "giraffe", "giraffe"], "difficult_direct_answer": false, "rationales": ["The brown and white pattern is known to only be associated with a giraffe, along with the long neck.", "This animal is really tall with a long neck. it is not a dog, a bull, or a hen. it has brown spots and outlined in white.", "The animal is a giraffe. it has a long neck and legs."], "image": "train2014/COCO_train2014_000000220821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184889, "question_id": "7hMn8pFyFgB5jVNsA8mvC4", "question": "What are the kids here going to do?", "choices": ["board train", "hide", "exit train", "sell lemonade"], "correct_choice_idx": 0, "direct_answers": ["ride train", "travel", "board train", "ride train", "board train", "ride train", "ride train", "travel", "board train", "travel"], "difficult_direct_answer": false, "rationales": ["There is a train behind them.", "The kids want to board.", "People have bags and are sitting on the platform next to a train. people wait on the platform before boarding trains."], "image": "train2014/COCO_train2014_000000184889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434221, "question_id": "7hS8yAC87ZzsDTd5fc2WLr", "question": "What country is this vehicle associated with?", "choices": ["mexico", "uk", "kenya", "us"], "correct_choice_idx": 1, "direct_answers": ["england", "england", "uk", "europe", "england", "europe", "europe", "england", "england", "england"], "difficult_direct_answer": false, "rationales": ["This vehicle is a double-decker bus. it is driving on the left side of the road.", "This vehicle is a double decker bus that is bright red. that type of vehicle is famous for being seen in a specific country.", "The vehicle is a double decker bus that is driving on the left side of the road."], "image": "train2014/COCO_train2014_000000434221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52073, "question_id": "7hTmvK2ohfreVDqBNTfqpD", "question": "Why is this food unhealthy?", "choices": ["high sugar", "high sodium", "high fat", "high carbohydrate"], "correct_choice_idx": 1, "direct_answers": ["salty", "fattening", "bacon", "fried", "high sodium", "oil", "fat content", "greasy", "fried", "fatty"], "difficult_direct_answer": true, "rationales": ["The food has high sodium.", "It has a lot of salt in it.", "They usually have a lot of salt to help preserve the meat"], "image": "train2014/COCO_train2014_000000052073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394071, "question_id": "7hUsfVEdeNw7QprPkW9THS", "question": "How are her shoes tightened?", "choices": ["velcro", "buckles", "zippers", "laces"], "correct_choice_idx": 0, "direct_answers": ["velcro", "velcro straps", "velcro", "velcro", "velcro", "velcro", "velcro", "velcro", "velcro straps", "velcro"], "difficult_direct_answer": false, "rationales": ["The straps on her shoes are velcro.", "They don't have the other options.", "There are straps that are on the shoes."], "image": "train2014/COCO_train2014_000000394071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235949, "question_id": "7hWJkNz7ZT4cEK3kGW9nSa", "question": "If the boy overeats he will get what kind of body ache?", "choices": ["ear", "eye", "stomach", "back"], "correct_choice_idx": 2, "direct_answers": ["stomach", "stomach", "stomach", "stomach", "stomach", "stomach", "stomachache", "stomachache", "stomach", "stomach"], "difficult_direct_answer": false, "rationales": ["The boy is eating a lot of cake that will give him a stomachache if he eats too much", "He will likely have a sore stomach from eating too much cake.", "Food goes into the stomach and stays there a couple hours so it hurts if it gets too full"], "image": "train2014/COCO_train2014_000000235949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369259, "question_id": "7haF5wgiW5mDHdnoqxiF67", "question": "What time of day is it?", "choices": ["night", "evening", "morning", "afternoon"], "correct_choice_idx": 2, "direct_answers": ["morning", "day", "morning", "morning", "morning", "morning", "morning", "noon", "noon", "morning"], "difficult_direct_answer": false, "rationales": ["They have cereal and other breakfast items out", "The people are eating breakfast foods, such as cereal and fruit.", "It is sunny outside. the counter has breakfast foods, like cereal and fruit."], "image": "val2014/COCO_val2014_000000369259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54479, "question_id": "7hexnnbfosYaMJ6oJJFw9r", "question": "What is different about the batter from most other batters?", "choices": ["weight", "height", "bats left-handed", "glasses"], "correct_choice_idx": 2, "direct_answers": ["lefty", "left handed", "left handed", "left handed", "left handed", "batting left", "weight", "bats left-handed", "left handed", "left-handed batter"], "difficult_direct_answer": false, "rationales": ["This person is batting with their left hand.", "He is batting with his left hand.", "By far most hitters are right handed."], "image": "train2014/COCO_train2014_000000054479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507887, "question_id": "7i2K9jLU8m7FQME8s2eQDs", "question": "Where would you most likely see this type of pizza served with wine?", "choices": ["uk", "canada", "italy", "usa"], "correct_choice_idx": 2, "direct_answers": ["italian restaurant", "italy", "italian restaurant", "italy", "restaurant", "italy", "italy", "restaurant", "restaurant", "fancy restaurant"], "difficult_direct_answer": false, "rationales": ["Pizza and wine are an italian meal.", "This is a traditional type of pizza", "Pizza and wine are typically paired in italy."], "image": "val2014/COCO_val2014_000000507887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35367, "question_id": "7i9J58PUaEjN5zLVNoAw2m", "question": "How many cakes needed to cool down before adding a creamy glaze to it?", "choices": ["two", "none", "three", "one"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "two", "three", "three", "two", "three", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["I would assume all of the cakes needed to do this so one would need to identify and count the cakes and answer a is closest.", "There are six cakes total and half of them have cream on top.", "There are three cakes that are bare."], "image": "train2014/COCO_train2014_000000035367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6701, "question_id": "7iAK4zu7vVLZTpN2U6AA3S", "question": "What are the construction barrels filled with?", "choices": ["sand", "tar", "equipment", "paint"], "correct_choice_idx": 0, "direct_answers": ["sand", "sand", "nothing", "air", "sand water", "ait", "traffic light", "sand", "rocks", "sand"], "difficult_direct_answer": false, "rationales": ["The construction barrels have crushed rock in them.", "The barrels have sand.", "Construction barrels are on a road blocking an area. construction barrels are often filled with sand so they don't move."], "image": "val2014/COCO_val2014_000000006701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379502, "question_id": "7iFbuDZJHFhvPnNxXLpFK4", "question": "What category of pizza would this fall into?", "choices": ["vegetarian", "meat lovers", "pesto", "pepperoni"], "correct_choice_idx": 0, "direct_answers": ["vegetarian", "thin crust", "margarita", "veggie", "marinara", "margherita", "vegetarian", "speciality", "vegan", "cooked"], "difficult_direct_answer": true, "rationales": ["The pizza has no meat on it.", "The pizza has no meat.", "There is no meat on the pizza."], "image": "val2014/COCO_val2014_000000379502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347542, "question_id": "7iXh7LNZkYVvaR7VMUXqAU", "question": "What powers the lights here?", "choices": ["hot water", "oil", "gas", "solar panels"], "correct_choice_idx": 3, "direct_answers": ["solar", "solar panel", "solar", "sun", "sun", "sun", "solar panels", "electricity", "solar panels", "solar"], "difficult_direct_answer": false, "rationales": ["There are visible solar panels in the top left corner. this is a source of power and happens to be the only one visible.", "The panels collect power from the sun.", "The solar panels power the lights."], "image": "train2014/COCO_train2014_000000347542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239130, "question_id": "7iaft3zttd6DeT2PRc6UMd", "question": "Why is the man's vest orange?", "choices": ["visibility", "camouflage", "fashion", "dress code"], "correct_choice_idx": 0, "direct_answers": ["visibility", "visibility", "visibility", "safety", "easily seen", "safety", "alert drivers", "alert drivers", "visibility", "visibility"], "difficult_direct_answer": false, "rationales": ["The man wears that in case it gets dark.", "The vest needs to be visible.", "The man is wearing an orange vest so it reflects for safety."], "image": "val2014/COCO_val2014_000000239130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131093, "question_id": "7insRs3CUxRumwVBudGotu", "question": "What are these animals called?", "choices": ["dogs", "cows", "sheep", "deer"], "correct_choice_idx": 2, "direct_answers": ["ram", "goats", "rams", "sheep", "sheep", "sheeps", "goats", "sheep", "sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["The animals have horns and wool. they are not cows, deer, or dogs.", "The animals are relatively short, have fur and horns and small faces. these would indicate that they are part of sheep family.", "The animals on the grass are sheep and have lots of wool."], "image": "train2014/COCO_train2014_000000131093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216502, "question_id": "7is4RYmPTYxK899TB7cbbE", "question": "What color range is shown in the image?", "choices": ["sepia", "warm colors", "full-color", "monochrome"], "correct_choice_idx": 3, "direct_answers": ["gray", "black white", "monochrome", "black white", "grays", "black white", "bw", "black white", "white", "black"], "difficult_direct_answer": false, "rationales": ["The photo is black and white.", "The image is in black and white, which can also be described as monochrome.", "A black and white photo is shown."], "image": "train2014/COCO_train2014_000000216502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345844, "question_id": "7j7RbV7oRofRfUowDLKJEM", "question": "What field do the men work in?", "choices": ["art", "medicine", "technology", "construction"], "correct_choice_idx": 2, "direct_answers": ["it", "technology", "technology", "computers", "computer science", "computers", "technology", "it", "it", "it"], "difficult_direct_answer": false, "rationales": ["The men are in tech.", "The men are working with computers in an indoor environment which shows they likely work in technology.", "They are all working on computers"], "image": "train2014/COCO_train2014_000000345844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364665, "question_id": "7jVC2BsYPBrhDDuEHAjd4d", "question": "What is the likely flavor of these muffins?", "choices": ["pumpkin", "blueberry", "red velvet", "banana"], "correct_choice_idx": 2, "direct_answers": ["chocolate", "chocolate", "chocolate", "chocolate", "red velvet", "chocolate", "chocolate", "chocolate", "chocolate", "chocolate"], "difficult_direct_answer": false, "rationales": ["The muffins are a reddish color.", "The muffins have a reddish color.", "The color appears to be red."], "image": "val2014/COCO_val2014_000000364665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427895, "question_id": "7jaXkBBXmkRMd8fVgXk6Fw", "question": "What are the men doing?", "choices": ["fighting", "fixing plane", "playing tennis", "falling"], "correct_choice_idx": 2, "direct_answers": ["playing badminton", "playing tennis", "playing tennis", "tennis", "wing walking", "tennis", "playing tennis", "playing tennis", "playing tennis", "playing tennis"], "difficult_direct_answer": false, "rationales": ["They are hitting a ball back and forth with a net between them.", "The men are using racquets to hit a ball over a net.", "They have tennis racks and a net between them."], "image": "train2014/COCO_train2014_000000427895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4739, "question_id": "7k3Yiyy8cvwyNVDVzdtFrr", "question": "What is the cow doing?", "choices": ["sleeping", "smelling meat", "eating", "grilling"], "correct_choice_idx": 2, "direct_answers": ["drinking", "eating", "eating", "walking", "resting", "standing", "standing", "eating", "standing", "chilling"], "difficult_direct_answer": false, "rationales": ["The cow is standing over a bowl with a white substance in it.", "There is a bowl of food.", "The cow is eating food from its bowl."], "image": "train2014/COCO_train2014_000000004739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323536, "question_id": "7k5vUXJWy8uhH6ajnkbKrh", "question": "What event is happening here?", "choices": ["motorcycle parade", "car race", "car show", "car sale"], "correct_choice_idx": 2, "direct_answers": ["bike show", "meet up", "bike rally", "car show", "road show", "motorcycle show", "bike show", "rally", "motorcycle rally", "rally"], "difficult_direct_answer": false, "rationales": ["There are many bikes here. they are being displayed for people to see.", "People want to show off their cars and motorcycles.", "Seeing as there are motorcycles in evidence, but not being ridden in the streets, there are no cars to be seen."], "image": "train2014/COCO_train2014_000000323536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333088, "question_id": "7kDJ9ZLvVWFeHy7Vbxxa2S", "question": "What kind of room is this one?", "choices": ["dining room", "music room", "karaoke room", "recreation room"], "correct_choice_idx": 3, "direct_answers": ["entertainment", "game", "recreation room", "recreation room", "game room", "dining room", "recreation", "rec room", "game room", "recreation room"], "difficult_direct_answer": false, "rationales": ["It has a pool table and piano for some fun", "This room is for recreation purposes.", "There is a pool table and piano."], "image": "train2014/COCO_train2014_000000333088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48692, "question_id": "7kv5xfe3FYntxtCFtUeRU8", "question": "What kind of scale is used here?", "choices": ["potted", "balance", "virtual", "accuracy"], "correct_choice_idx": 1, "direct_answers": ["balance", "pulley", "weight", "balance", "weight", "weight", "balance scale", "fruit scale", "food", "balance"], "difficult_direct_answer": false, "rationales": ["There is a weight balance on the table.", "The fruit on the table is being weighed using a balance scale.", "The scale has two bowls and a fulcrum."], "image": "val2014/COCO_val2014_000000048692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146601, "question_id": "7mJvC4zogSvBs99KbrNNwg", "question": "What is this type of cooking called?", "choices": ["vegan", "handmade", "fat-free", "production line"], "correct_choice_idx": 3, "direct_answers": ["deep fry", "frying", "grill", "deep fry", "frying", "production line", "frying", "frying", "deep fry", "deep fry"], "difficult_direct_answer": false, "rationales": ["It has machinery that moves the food through different processes with little human intervention", "The cooking is on a line.", "A commercial way of making donuts where they go from one cooking element to the next."], "image": "val2014/COCO_val2014_000000146601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391011, "question_id": "7mNp3QyJh3Ev8Rjt5LWiXf", "question": "The man listed was Mayor of what city?", "choices": ["denver", "oklahoma city", "jackson", "austin"], "correct_choice_idx": 3, "direct_answers": ["austin", "motorcycle city", "not applicable", "new orleans", "austin", "nashville", "wrong photo", "no clue", "no list", "roman"], "difficult_direct_answer": true, "rationales": ["The man listed is known for his leadership over austin.", "The man is listed as the mayor of austin texas.", "The building on the left has the name joseph nalle. nalle was the mayor of a city in texas."], "image": "val2014/COCO_val2014_000000391011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428576, "question_id": "7mnXarFHddshbnEgahKvFQ", "question": "What drink is normally put in the white cups on the table?", "choices": ["wine", "soda", "coffee", "water"], "correct_choice_idx": 3, "direct_answers": ["coffee", "coffee", "coffee", "tea", "coffee", "water", "coffee", "tea", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["The drink is water.", "Those types of cups are usually used to drink warm beverages.", "The drink normally put in the white cup of the table is water from the sink."], "image": "train2014/COCO_train2014_000000428576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315322, "question_id": "7nBzNvmKDS9fESsoD2pJEB", "question": "Which vehicle is most likely to have more than 10 passengers?", "choices": ["double-decker bus", "blue car", "white truck", "silver truck"], "correct_choice_idx": 0, "direct_answers": ["bus", "bus", "bus", "bus", "double-decker bus", "bus", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["Traditionally this type of vehicles hold a large number of people.", "The bus has two levels and can hold more than 10 passengers.", "The bus can carry a lot of passengers."], "image": "train2014/COCO_train2014_000000315322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6842, "question_id": "7nFg3eRgipHwsfFzqLSBWF", "question": "Where are the cats playing?", "choices": ["sink", "sand", "river", "parking lot"], "correct_choice_idx": 0, "direct_answers": ["sinks", "sinks", "sink", "sinks", "sink", "bath time", "sinks", "bathroom sinks", "sinks", "bathroom sinks"], "difficult_direct_answer": false, "rationales": ["The cats are each in sinks.", "They are in the countertop basins in the bathroom.", "The cats are playing in the sinks."], "image": "train2014/COCO_train2014_000000006842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212817, "question_id": "7ndfMBHwPfQRNM4Y4rjbmr", "question": "Why does he have goggles on?", "choices": ["be found", "protect eyes", "stop rain", "showing off"], "correct_choice_idx": 1, "direct_answers": ["protection", "sun protection", "sun protection", "protect eyes", "protect eyes", "protect eyes", "protection", "safety", "eye protection", "shield eyes"], "difficult_direct_answer": false, "rationales": ["This prevents damage to your eyes caused by extended exposure to things like snow and wind.", "The little boy wants to keep the sun and snow out of his eyes.", "Snow blindness is a real thing."], "image": "val2014/COCO_val2014_000000212817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263202, "question_id": "7nhnbBzMHV2RpEwnvC63L8", "question": "Where is LG main headquarters?", "choices": ["busan", "ulsan", "seoul", "changwon"], "correct_choice_idx": 2, "direct_answers": ["south korea", "seoul", "seoul", "seoul", "south korea", "china", "south korea", "seoul", "south korea", "china"], "difficult_direct_answer": false, "rationales": ["Lg is based in the capital of south korea.", "The lg brand is in seoul.", "Lg's main headquarters building is not in busan, ulsan, or changwon."], "image": "val2014/COCO_val2014_000000263202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30838, "question_id": "7nkdcUANStF3RS392jf5Rq", "question": "What mechanism is the seat attached to?", "choices": ["ski lift", "roller coaster", "slide", "swing"], "correct_choice_idx": 0, "direct_answers": ["lift", "ski lift", "ski lift", "ski lift", "ski lift", "lift", "lift", "ski lift", "lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["People are in their winter attire. that kind of bench with no front protection means that it's going super slow, bringing you up to a higher place to go fast downwards.", "People with ski equipment and warm clothes are preparing to sit on a wood slatted seat.", "The family is using it to ride to the top of the mountain and ski back down."], "image": "train2014/COCO_train2014_000000030838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80078, "question_id": "7nwjBy2myqR4kyAsSG7sjy", "question": "What section of the grocery store is this?", "choices": ["frozen", "vegetables", "dairy", "fruits"], "correct_choice_idx": 3, "direct_answers": ["produce", "fruit section", "fruit", "fruits", "produce", "produce", "groceries", "produce", "produce", "banana section"], "difficult_direct_answer": false, "rationales": ["This section of the grocery store is filled with bananas and melons.", "There is a multitude of fruits and vegetables so this is the fruit and or produce section.", "The aisle of the grocery store that is shown contains the food group where you would find bananas, melons, and pineapples."], "image": "train2014/COCO_train2014_000000080078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400950, "question_id": "7nyrb22myuqPKPLmz9ywDa", "question": "Why is the girl wearing a helmet?", "choices": ["costume", "protection", "style", "for fun"], "correct_choice_idx": 1, "direct_answers": ["yes", "safety", "safety", "horse reyding", "safety", "protect head", "protection", "for safety", "protection", "protection"], "difficult_direct_answer": false, "rationales": ["The little girl is wearing a safety helmet so her head won't get badly hurt just in case she takes a fall.", "She is wearing it for safety and protection.", "The girl is riding a horse and looks to be a beginner. it is advisable to wear a helmet riding horse to protect oneself from any injury regardless of experience, but especially if one is new to horses."], "image": "val2014/COCO_val2014_000000400950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536041, "question_id": "7o5kiUPCAE6v7iyJVmBgyE", "question": "What is she doing?", "choices": ["eating sheep", "stealing sheep", "watching sheep", "counting sheep"], "correct_choice_idx": 2, "direct_answers": ["watching sheep", "watching sheep", "leaning", "watching sheep", "watching sheep", "watching sheep", "leaning", "watching", "standing", "watching sheeps"], "difficult_direct_answer": false, "rationales": ["There are many in the field and she is looking at them", "The woman is leaning against a fence as she stares at the sheep from afar..", "The woman is keeping an eye on the sheep."], "image": "train2014/COCO_train2014_000000536041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398798, "question_id": "7oCsrm25Mzh7XwsaDLYCAU", "question": "Adidas multinational brand is belongs to which country?", "choices": ["uk", "germany", "us", "canada"], "correct_choice_idx": 1, "direct_answers": ["germany", "germany", "germany", "germany", "germany", "germany", "germany", "united states", "united states", "united states"], "difficult_direct_answer": false, "rationales": ["Adidas headquarters are in germany.", "That is where the company was from.", "Adidas is a german company."], "image": "val2014/COCO_val2014_000000398798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565875, "question_id": "7oWMrFeBSW579mw4DPEaTi", "question": "What is the driver doing?", "choices": ["resting", "taking photo", "yielding", "driving"], "correct_choice_idx": 3, "direct_answers": ["avoiding sheep", "slowing down", "taking picture", "driving", "driving bus", "driving", "driving", "yielding", "taking photo", "driving slowly"], "difficult_direct_answer": false, "rationales": ["A person is behind the wheel of a vehicle that is traveling down the road.", "The bus is seen moving to the area near the sheep.", "The bus is being driven."], "image": "train2014/COCO_train2014_000000565875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451412, "question_id": "7oYYfA5qxZV9QaaKApiJyQ", "question": "What is the toddler about to do?", "choices": ["flush toilet", "poo", "throw up", "pee"], "correct_choice_idx": 0, "direct_answers": ["flush toilet", "flush", "flush toilet", "flush toilet", "flash toilet", "close", "flush toilet", "flush toilet", "flush toilet", "flush toilet"], "difficult_direct_answer": false, "rationales": ["They are going to flush the toilet after going to the bathroom.", "There is waste already in the toilet and the toddler has her hand on the toilet lever.", "The toddler wants to flush."], "image": "val2014/COCO_val2014_000000451412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146397, "question_id": "7oZni68FuwBBrzPNiu6enF", "question": "How many scatters partially skate on one wheel?", "choices": ["two", "one", "seven", "four"], "correct_choice_idx": 3, "direct_answers": ["one", "four", "five", "three", "five", "five", "five", "four", "five", "one"], "difficult_direct_answer": false, "rationales": ["These are timelapse photos of the same guy", "There are four of them only on one wheel.", "Of the images, there are four skaters on one wheel."], "image": "val2014/COCO_val2014_000000146397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39100, "question_id": "7oghNXdNHn5FWFmeN3nWC4", "question": "What month of the year is represented by the number on his bike?", "choices": ["april", "august", "november", "july"], "correct_choice_idx": 3, "direct_answers": ["july", "august", "july", "summer", "july", "july", "july", "july", "july", "july"], "difficult_direct_answer": false, "rationales": ["The bike has the number seven on it.", "July is for most bikers.", "We see the number \"7.\" july is the seventh month of the year."], "image": "train2014/COCO_train2014_000000039100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139011, "question_id": "7pAdD2nEcBcxXPEAA43BQR", "question": "What area is shown here?", "choices": ["bus stop", "cake walk", "taxi stand", "bike pull"], "correct_choice_idx": 0, "direct_answers": ["downtown", "bus stop", "bus stop", "city", "station", "urban", "metropolitan", "transit mall", "bus stop", "city"], "difficult_direct_answer": false, "rationales": ["People stand at a covered area near the curb of a busy street as a bus approaches.", "There are many people waiting around, and there is a sign called \"rapid ride\" indicating someone can catch a ride in that spot. there is also a bus stopped there.", "There is a covered area. a bus is on the road parked near people."], "image": "val2014/COCO_val2014_000000139011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246084, "question_id": "7q79n3vccmcj2uPNXjppmn", "question": "How does the seated man think the standing man is acting?", "choices": ["guilty", "funny", "depressed", "whiny"], "correct_choice_idx": 1, "direct_answers": ["funny", "funny", "crazy", "funny", "funny", "silly", "silly", "funny", "funny", "silly"], "difficult_direct_answer": false, "rationales": ["The man seated is visibly smiling while looking at the other person. when people are smiling while looking at something that thing is often funny.", "The seated man is laughing at the standing man.", "The man that is seated is smiling and laughing as if the other man is funny."], "image": "train2014/COCO_train2014_000000246084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141501, "question_id": "7qErjZkXMD9qFyQQ2xJXRU", "question": "What is the tall fence for?", "choices": ["security", "blocking animals", "blocking vehicles", "sturdy structure"], "correct_choice_idx": 0, "direct_answers": ["security", "forbidding entrance", "keeping out", "fence", "prevent theft", "block trespassers", "security", "security", "intruders away", "prisoners"], "difficult_direct_answer": false, "rationales": ["The tall fence keeps unwanted people out.", "The tall fence is posted for security with signs.", "There is barb wire at the top."], "image": "val2014/COCO_val2014_000000141501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37826, "question_id": "7qLEueNgwzSr7WRxA93dg8", "question": "What does the man do with the object around his neck?", "choices": ["call", "text", "paint", "take photos"], "correct_choice_idx": 3, "direct_answers": ["take photos", "photograph", "take photos", "photograph", "take pictures", "photograph", "take photos", "hanging", "photograph", "take photos"], "difficult_direct_answer": false, "rationales": ["The man is taking photos.", "The object around his neck is a canon camera.", "The man has a camera strap around his neck."], "image": "train2014/COCO_train2014_000000037826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230057, "question_id": "7qTaHAEsnCTHztDD69xVbR", "question": "Why is the zebra moving its head to the ground?", "choices": ["to attack", "to eat", "to hide", "to drink"], "correct_choice_idx": 1, "direct_answers": ["to eat", "to eat", "eating", "flies", "to eat", "eat", "eating grass", "to graze", "eating", "feeding"], "difficult_direct_answer": false, "rationales": ["The zebra is eating.", "A zebra is bent down to grass. zebras graze on and eat grass.", "Zebra eat grass and plants and it is likely looking for food."], "image": "train2014/COCO_train2014_000000230057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329952, "question_id": "7qUxX7hgNAQuii7QnEpaYn", "question": "What is the snowboarder doing in the air?", "choices": ["tailwhip", "grinding", "grab", "falling"], "correct_choice_idx": 2, "direct_answers": ["jumping", "tricks", "trick", "trick", "jumping", "flying", "flying", "ollie", "grab", "trick"], "difficult_direct_answer": false, "rationales": ["The snowboarder is holding onto his board.", "The snowboarder is grabbing.", "The person is jumping and grabbing the board."], "image": "train2014/COCO_train2014_000000329952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478874, "question_id": "7r5m7LQ8DdBhzACYAHYmBu", "question": "What is the young girl using the pink object in her hand to do?", "choices": ["wash hair", "comb hair", "dry hair", "brush teeth"], "correct_choice_idx": 2, "direct_answers": ["dry hair", "dry hair", "dry hair", "dry hair", "dry hair", "dry hair", "dry hair", "dry hair", "dry hair", "dry hair"], "difficult_direct_answer": false, "rationales": ["The person's hair is wet.", "The girl is using the hair dryer to dry off her wet hair.", "The girl wants to make sure her hair isn't wet."], "image": "val2014/COCO_val2014_000000478874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145637, "question_id": "7rEiL8WU23v4Doxbsn4PJC", "question": "What country is represented on the surf board?", "choices": ["united kingdom", "russia", "germany", "united states"], "correct_choice_idx": 0, "direct_answers": ["england", "scaring", "england", "united kingdom", "great britain", "england", "great britain", "uk", "united kingdom", "united kingdom"], "difficult_direct_answer": false, "rationales": ["The cross sign with red, white, and blue coloring is a flag for the uk.", "The union jack colors are seen on the surfboard, which is known as the flag of the uk.", "That is the british flag"], "image": "val2014/COCO_val2014_000000145637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182801, "question_id": "7rNL2aqmWVUcjgzhaBbhyW", "question": "Who has priority on the crossing when the light is red?", "choices": ["women", "ceos", "pedestrians", "drivers"], "correct_choice_idx": 2, "direct_answers": ["pedestrians", "pedestrians", "pedestrians", "pedestrians", "pedestrian", "pedestrians", "pedestrian", "pedestrian", "pedestrians", "pedestrians"], "difficult_direct_answer": false, "rationales": ["When the light turns red, the vehicles are supposed to come to a halt so that walkers can cross the street.", "People at this time are suppose to stand and drivers to go through.", "Pedestrians always have right of way over cars."], "image": "train2014/COCO_train2014_000000182801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184523, "question_id": "7rYHDMEp7YGHhRXMft8Tc8", "question": "In what kind of room is this bed?", "choices": ["den", "barn", "motel", "luxury mansion"], "correct_choice_idx": 2, "direct_answers": ["hotel room", "hotel", "bed room", "hotel", "hotel", "bedroom", "hotel room", "motel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["He lamps are attached to the wall and there is a couch very close to the bed which is normal for these types of rooms", "This is a motel room, and you can tell from the two wall lamps on either side of the bed: that's classic motel lighting at its finest!.", "It has furniture and decor that is generic and mass-produced."], "image": "train2014/COCO_train2014_000000184523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352073, "question_id": "7rc4XXN8qoWMKMPmUcb53o", "question": "Which person has exposed to the sun the most?", "choices": ["right girl", "left girl", "back man", "back woman"], "correct_choice_idx": 0, "direct_answers": ["on right", "lady", "middle woman", "pale woman", "blond", "right one", "equally", "right girl", "tannest person", "tanned girl"], "difficult_direct_answer": true, "rationales": ["Exposure to the sun causes skin to darken.", "The girl on the right is more tan.", "The person on the right is exposed."], "image": "train2014/COCO_train2014_000000352073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88825, "question_id": "7rmgmpsNLhsEprKkobPts5", "question": "To which direction of the woman is the sun located?", "choices": ["back", "left", "right", "front"], "correct_choice_idx": 1, "direct_answers": ["left", "left", "east", "right", "front", "her right", "west", "left", "east", "northeast"], "difficult_direct_answer": false, "rationales": ["Because her reflection is on the right side.", "The woman's shadow indicates the sun is to the left.", "The sun is at the left."], "image": "train2014/COCO_train2014_000000088825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440339, "question_id": "7sB2umzt7EXCSnasSFpnGw", "question": "Donuts sprinkles are made up of what?", "choices": ["plants", "sugar", "honey", "flour"], "correct_choice_idx": 1, "direct_answers": ["food coloring", "wax", "flour", "colored sugar", "sugar", "sugar", "sugar", "sugar", "sugar", "candy"], "difficult_direct_answer": false, "rationales": ["The sprinkles are made of colored sugar.", "They're also often made with other ingredients, but primarily a.", "Sprinkles are used as decoration on baked goods. the main ingredient of sprinkles is sugar."], "image": "train2014/COCO_train2014_000000440339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544502, "question_id": "7sDRqZUnKZLimC63F6rRr7", "question": "What will the person here do with the ball?", "choices": ["throw netwards", "throw upwards", "pocket it", "throw away"], "correct_choice_idx": 1, "direct_answers": ["serve", "serve", "hit", "serve", "serve", "throw upwards", "serve", "hit", "hit", "serve"], "difficult_direct_answer": false, "rationales": ["He will toss it up in the air so he can hit it to start the game.", "They launch it into the air above them as they go in for a swing to hit to the other player", "When serving a tennis ball, it is thrown upwards into the air before hitting it with the racquet."], "image": "train2014/COCO_train2014_000000544502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399165, "question_id": "7sVA52JtN7K6MQ6qhNmyUV", "question": "What is another sport that takes place in this sort of environment?", "choices": ["tennis", "snowboarding", "cycling", "baseball"], "correct_choice_idx": 1, "direct_answers": ["snowboarding", "snowboarding", "snowboarding", "slalom", "bobsledding", "bobsled", "snowboarding", "bobsledding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["This is the only sport of the four options that can also be done downhill in a snowy environment.", "Skiing and snowboarding use the same venues and locations, and can be done in tandem with each other.", "The people are skiing in the white fluffy powder. only one of the options takes place in a winter environment."], "image": "train2014/COCO_train2014_000000399165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57306, "question_id": "7sb4bjpirxS9tq9eC4msLh", "question": "What is the man attempting to do with the device in his hand?", "choices": ["throw it", "eat it", "make call", "break it"], "correct_choice_idx": 2, "direct_answers": ["make call", "talk", "telephone call", "make call", "smoke", "talking", "communicate", "talk", "phone call", "talk"], "difficult_direct_answer": false, "rationales": ["A cell phone is used to make calls; certainly not to eat it, throw it, or break it.", "The guy is talking on a call.", "The man wants to use his phone to make a call."], "image": "val2014/COCO_val2014_000000057306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194704, "question_id": "7tVWnTXg8hJLZieco3r2Fq", "question": "How does the stuff collected on the ski change when warm?", "choices": ["into water", "gets sticky", "gets smelly", "gets hard"], "correct_choice_idx": 0, "direct_answers": ["melt off", "melts", "melts", "it melts", "melts", "melts", "into water", "melts", "melt", "snow melts"], "difficult_direct_answer": false, "rationales": ["When warm snow turns into liquid.", "Snow always melts when warmed.", "When snow melts it becomes a liquid"], "image": "val2014/COCO_val2014_000000194704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222394, "question_id": "7teRuEztizYtuViSnyfxt7", "question": "What do they have attached to their feet?", "choices": ["snow shoes", "flippers", "skis", "skates"], "correct_choice_idx": 0, "direct_answers": ["skis", "snowshoes", "snow shoes", "skis", "skis", "snow shoes", "skis", "snow shoes", "skis", "snowshoes"], "difficult_direct_answer": false, "rationales": ["The people are walking in the snow.", "These attach to regular shoes to give more surface area on top of the snow to make it easier to walk", "The snowshoes are attached."], "image": "train2014/COCO_train2014_000000222394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235543, "question_id": "7tmHMMyEDY8A3QVGvcHNLy", "question": "Why is the child sitting next to the cake?", "choices": ["safety", "it's his", "cleaner", "no chairs"], "correct_choice_idx": 1, "direct_answers": ["birthday", "eat it", "birthday", "his birthday", "posing", "eat", "it's his", "birthday", "blowing cake", "birthday"], "difficult_direct_answer": false, "rationales": ["The cake has a name on it, implying that the cake was made for the boy. it's likely that it's the boy's birthday based on the decorative joy and atmosphere.", "Its his birthday and therefor his cake.", "The child's cake is his."], "image": "train2014/COCO_train2014_000000235543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380893, "question_id": "7txmmeEzrc8KhDisnaKmhB", "question": "Why is the dog on the other dog?", "choices": ["mate", "fight", "hide", "play"], "correct_choice_idx": 3, "direct_answers": ["playing", "playing", "playing", "playing", "playing", "playing", "playing", "playing", "playing", "play"], "difficult_direct_answer": false, "rationales": ["The black one has his face against the other. the brown one doesn't seem to be in distress and has its paw on the other in a funny way.", "When dogs play with each other, they wrestle, which can look like fighting but it is actually harmless.", "One dog is playfully pouncing on the other."], "image": "train2014/COCO_train2014_000000380893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338544, "question_id": "7uB6qk4uvocFa6tms39TPE", "question": "Which person seen here goes faster over time?", "choices": ["shark rider", "boat", "surfer", "canoe"], "correct_choice_idx": 1, "direct_answers": ["on jetski", "top left", "surfer", "boater", "jetskier", "boat", "back", "jet ski", "jet skier", "jet ski"], "difficult_direct_answer": true, "rationales": ["Boats are powered by a gasoline motor. the person on the board is just using their body.", "The boat can go way faster since it's powered by a motor.", "The object in the back is powered by gas it contains an engine which produces high speeds a person on a surfboard has a limited amount of speed even when propelled by waves."], "image": "train2014/COCO_train2014_000000338544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291144, "question_id": "7uRZMtYXhCRU2hoXu5tAnu", "question": "What happens if you leave your car parked here an hour and a half?", "choices": ["bulk rate", "nothing", "commendation", "ticket"], "correct_choice_idx": 3, "direct_answers": ["parking ticket", "fine", "fine", "ticket", "ticket", "ticket", "ticket", "ticket", "get ticketed", "ticket"], "difficult_direct_answer": false, "rationales": ["There is a parking meter. the text on it indicates that the maximum parking time is one hour.", "The parking meter needs to be fed otherwise a car will be ticketed.", "Staying in a parking spot too long will result in a ticket."], "image": "val2014/COCO_val2014_000000291144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447354, "question_id": "7uUk2SdP7PrvdXzhrHxouA", "question": "What tubed type cured sausage is seen here?", "choices": ["hot dogs", "italian", "relish", "pepperoni"], "correct_choice_idx": 3, "direct_answers": ["pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni"], "difficult_direct_answer": false, "rationales": ["One of the most popular ingredients to put on a pizza is the slices of tubed meat.", "Pepperoni is often seen on pizza and is the only option that is cured sausage.", "Pepperoni is a common topping on pizzas."], "image": "val2014/COCO_val2014_000000447354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551571, "question_id": "7uq3gKYPdm47eXi9AoppBR", "question": "Which lane may this car continue forward on?", "choices": ["left", "any", "none", "right"], "correct_choice_idx": 3, "direct_answers": ["right", "current lane", "right", "right", "current lane", "right", "right", "right", "right", "left lane"], "difficult_direct_answer": false, "rationales": ["Though it is possible to go any direction, by the signs on the road itself suggests they would or could go right.", "There is a sign that indicates the left lane is a turning lane so the driver needs to stay in the right side lane to continue going forward.", "The car is in the right lane and will continue through the green light in the same lane."], "image": "train2014/COCO_train2014_000000551571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17707, "question_id": "7uvEMWGw6TQEVGHETbPGG4", "question": "What will this tent offer protection from?", "choices": ["tsunami", "insects", "gangs", "sun"], "correct_choice_idx": 3, "direct_answers": ["sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["The tent has fabric that is used for protection from the sun.", "It gives shade.", "This is on the beach on a clear day so it will block the sun"], "image": "val2014/COCO_val2014_000000017707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46674, "question_id": "7uw9JqkrWZ8j8FpTcZizaF", "question": "Where is the water pouring on the umbrella coming from?", "choices": ["rain", "roof", "beach", "garden-hose"], "correct_choice_idx": 3, "direct_answers": ["hose", "water hose", "hose", "garden hose", "hose pipe", "hose", "hose", "garden-hose", "hose", "drain"], "difficult_direct_answer": false, "rationales": ["The weather is clear and sunny. the adult to the far right is spraying the child.", "The water is from the hose.", "It is sunny. the person to the right is causing the water to pour on the umbrella."], "image": "train2014/COCO_train2014_000000046674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118019, "question_id": "7uzzNWL85ibkDXc5h3vYwN", "question": "What is the two people's relationship?", "choices": ["coworkers", "siblings", "strangers", "lovers"], "correct_choice_idx": 3, "direct_answers": ["lovers", "lovers", "dating", "lovers", "couple", "lovers", "romantically involved", "lovers", "romantic", "couple"], "difficult_direct_answer": false, "rationales": ["They are kissing on the bench.", "Two people are embracing and kissing while sitting on a bench.", "The are kissing on a bench so they are definitely a couple."], "image": "train2014/COCO_train2014_000000118019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95808, "question_id": "7v8GRmS5TpJtkkkJvZbH9u", "question": "What protective gear does the man in yellow have?", "choices": ["helmet", "pads", "mask", "goggles"], "correct_choice_idx": 1, "direct_answers": ["elbow pad", "knee pads", "knee pads", "pads", "elbow pads", "pads", "pads", "pads", "knee pads", "pads"], "difficult_direct_answer": false, "rationales": ["The man on the skateboard is wearing protective pads on his knees.", "The man is wearing pads around his wrist and elbows that can protect them.", "Traditionally skating is dangerous and you need to use protection."], "image": "val2014/COCO_val2014_000000095808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442840, "question_id": "7vKsN2TcrZrcfZ7UrUvtKd", "question": "What item is drawn on the cake?", "choices": ["stalactite", "sun", "earth", "moon"], "correct_choice_idx": 1, "direct_answers": ["sun", "flowers", "sun", "flowers sun", "flowers", "flower", "flowers", "flowers", "flowers", "flowers"], "difficult_direct_answer": false, "rationales": ["It is yellow with rays extending towards the flowers next to it.", "The cake has a sun.", "The item is yellow; that's all that is needed to answer the question."], "image": "train2014/COCO_train2014_000000442840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237919, "question_id": "7vWNGUhCgUyf9aoT9YqFbn", "question": "What is the woman in the chair's role?", "choices": ["ball boy", "line judge", "referee", "chair umpire"], "correct_choice_idx": 3, "direct_answers": ["referred", "ref", "line judge", "chair umpire", "protection", "referee", "judge", "referee", "referee", "umpire"], "difficult_direct_answer": false, "rationales": ["In the sport of tennis the person in the chair officiates.", "The woman is supposed to be overseeing the game.", "She oversees the game for rule infractions."], "image": "train2014/COCO_train2014_000000237919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166328, "question_id": "7w5uTY3t5fHYUqTXNqVZpv", "question": "What type of dog is this?", "choices": ["poodle", "husky", "setter", "cat"], "correct_choice_idx": 2, "direct_answers": ["setter", "shepherd", "setter", "retriever", "setter", "retriever", "retriever", "labrador", "golden retriever", "retriever"], "difficult_direct_answer": false, "rationales": ["This is a setter dog that is playing.", "The colour, coat, and shape of the dog is that comparable to one most known as a setter.", "That's what the dog is."], "image": "train2014/COCO_train2014_000000166328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551598, "question_id": "7w6aCfVvwc48mZnWdPy75L", "question": "What are the stuffed animals shaped like?", "choices": ["bears", "boars", "bulls", "baboons"], "correct_choice_idx": 0, "direct_answers": ["bears", "bears", "bears", "bear", "bears", "teddy bears", "mice", "bears", "bears", "bears"], "difficult_direct_answer": false, "rationales": ["The stuffed animals have a face and body like a teddy bear.", "They are shaped like teddy bears.", "They are bears."], "image": "train2014/COCO_train2014_000000551598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424482, "question_id": "7wFBvS2j9LPsawQ9KSensV", "question": "What type of shot is the man hitting?", "choices": ["slice", "forehand", "backhand", "serve"], "correct_choice_idx": 3, "direct_answers": ["overhand", "serve", "serve", "overhand", "overhand", "overhand", "serve", "serve", "overhand", "serve"], "difficult_direct_answer": false, "rationales": ["A man is jumping up to hit a tennis ball at the back line of a court. tennis is served at the backline of the court by throwing the ball up and hitting it.", "A man is hitting a tennis ball from the back line of a tennis court and is jumping in the air.", "The ball is over his head."], "image": "train2014/COCO_train2014_000000424482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68594, "question_id": "7wJPdUWZZSLydqJbz5YWSL", "question": "Why is the man holding a plastic bag?", "choices": ["being mischievous", "as punishment", "for fun", "making purchase"], "correct_choice_idx": 3, "direct_answers": ["picking", "storing vegetables", "buying produce", "carry vegetables", "bagging vegetables", "making purchase", "hold vegetables", "shopping", "shopping", "filling it"], "difficult_direct_answer": true, "rationales": ["He is gathering food.", "The man wants to pick out produce that he'll buy.", "The man looks to be searching for the best produce to buy."], "image": "train2014/COCO_train2014_000000068594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199502, "question_id": "7wjhMjiTq7dQHQEZAbe7Gv", "question": "Which hormone is responsible for sleep?", "choices": ["oxytocin", "estrogen", "progesterone", "melatonin"], "correct_choice_idx": 3, "direct_answers": ["melatonin", "melatonin", "melatonin", "melatonin", "seratonin", "melatonin", "unknown", "melatonin", "melatonin", "melatonin"], "difficult_direct_answer": false, "rationales": ["Melatonin relaxes you for sleep.", "The hormone melatonin is known for inducing sleep.", "The hormone is know to help with sleep."], "image": "train2014/COCO_train2014_000000199502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233164, "question_id": "7wv7XbKGWTewGXDZBhYsUm", "question": "What type of sign is this?", "choices": ["brand", "warning", "directional", "regulatory"], "correct_choice_idx": 2, "direct_answers": ["traffic", "traffic", "directional", "directional sign", "directions", "traffic", "stop", "traffic", "traffic directions", "lane"], "difficult_direct_answer": false, "rationales": ["It has arrows showing which ways vehicles are expected to proceed.", "The sign contains two arrows pointing in one direction and one in the other. these signs are known as directional.", "The sign lets drivers know which way they can travel and when."], "image": "val2014/COCO_val2014_000000233164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191188, "question_id": "7wzfTFSo4VPhh9TtT2Bzq7", "question": "Which animals here are being penned?", "choices": ["horses", "all", "dogs", "pigs"], "correct_choice_idx": 1, "direct_answers": ["cows", "zebras", "pigs", "goats", "pig", "warthogs", "zebra", "all", "cows", "cows"], "difficult_direct_answer": false, "rationales": ["All types of animals such as zebras and pigs are in a pen.", "The animals are all fenced in.", "The animals are all penned."], "image": "train2014/COCO_train2014_000000191188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580735, "question_id": "7x3jumLxmDqDZELti2aj3z", "question": "What are the objects placed on?", "choices": ["floor", "sofa", "towel", "paper"], "correct_choice_idx": 2, "direct_answers": ["table", "table", "blanket", "tablecloth", "table cloth", "kitchen", "table", "towel", "placemat", "tabletop cloth"], "difficult_direct_answer": false, "rationales": ["The objects are placed on a brightly colored towel.", "The objects are on a wide cloth over tile of the type that is part of a floor.", "The food is placed on a striped piece of cloth."], "image": "train2014/COCO_train2014_000000580735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147073, "question_id": "7x4cTAD8Q3VJHWWMxgNa4d", "question": "The work of art on the large wall is meant to look like something that cooks what?", "choices": ["eggs", "eggs", "hot dogs", "bread"], "correct_choice_idx": 3, "direct_answers": ["toaster", "toast", "toast", "bread", "bread", "toast", "bread", "bread", "bread", "bread"], "difficult_direct_answer": false, "rationales": ["Bread is shown on the wall.", "The work of art is depicting a toaster.", "The painting on the wall is of a toaster that is used to cook bread."], "image": "train2014/COCO_train2014_000000147073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113334, "question_id": "7x98NPA5iLJzrqKZRE4tFn", "question": "What is near the trees?", "choices": ["wolves", "hyenas", "beavers", "snow"], "correct_choice_idx": 3, "direct_answers": ["road", "snow", "snow", "mountain", "road", "snow", "road", "cars", "road", "snow"], "difficult_direct_answer": false, "rationales": ["There is white all around them", "Snow is layered on the leaves.", "Snow is covering the trees."], "image": "train2014/COCO_train2014_000000113334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550666, "question_id": "7xHC6T9coN6gKACa7ZQgUk", "question": "Hawes Garage is the repairing center of?", "choices": ["software", "mobiles", "appliances", "automobiles"], "correct_choice_idx": 3, "direct_answers": ["motorcycles", "cars", "automobiles", "bikes", "motorcycles", "motorcycles", "motorcycles", "motorcycles", "motorcycles", "motorcycles"], "difficult_direct_answer": false, "rationales": ["A business is shown with signs and cars parked all around.", "There are automobiles seen outside the garage.", "The center is for cars."], "image": "val2014/COCO_val2014_000000550666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511892, "question_id": "7xYMhMYrzwUvCFdsU8xttS", "question": "What kind of setting is this venue?", "choices": ["laboratory", "domestic kitchen", "factory", "commercial kitchen"], "correct_choice_idx": 3, "direct_answers": ["industrial kitchen", "kitchen", "kitchen", "industrial kitchen", "industrial kitchen", "kitchen", "kitchen", "commercial kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["This has large pots and a big cooking surface plus it's mostly stainless steel", "The setting is a commercial kitchen.", "Big kitchen is seen on the room."], "image": "val2014/COCO_val2014_000000511892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58171, "question_id": "7xe6JJuxhTCRSCPReiZjMK", "question": "What type tooth is this youngster lacking?", "choices": ["molar", "none", "wisdom", "baby"], "correct_choice_idx": 3, "direct_answers": ["front", "baby", "front", "front", "front", "baby tooth", "front", "front", "baby tooth", "front tooth"], "difficult_direct_answer": false, "rationales": ["The other options don't fit this image. children lose their first set.", "The girl is at the age where she would lose her teeth.", "Baby teeth fall out when you are young."], "image": "train2014/COCO_train2014_000000058171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129803, "question_id": "7xsT2WVwGWMH5dRCWLCMkP", "question": "Which people first brought this animal to the Americas?", "choices": ["british", "portuguese", "italian", "spanish"], "correct_choice_idx": 3, "direct_answers": ["indigenous", "spain", "spanish", "spanish people", "europeans", "spaniards", "spanish", "cortez", "europeans", "spaniards"], "difficult_direct_answer": false, "rationales": ["The spanish did.", "The spanish brought horses over to the americas.", "These actually originate in the americas"], "image": "val2014/COCO_val2014_000000129803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249275, "question_id": "7xtBRsLnupmpaHAQpdYz9v", "question": "What is she doing?", "choices": ["playing music", "learning phone", "using phone", "cleaning phone"], "correct_choice_idx": 2, "direct_answers": ["texting", "texting", "texting", "texting", "texting", "texting", "using phone", "texting", "texting", "texting"], "difficult_direct_answer": false, "rationales": ["The girls is texting on the phone.", "The woman is holding a cell phone and typing.", "The woman seems to be using phone to chat with a friend."], "image": "train2014/COCO_train2014_000000249275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170602, "question_id": "7xtf2WTYLpojmYGNh7tBi6", "question": "What is up against the wall at the left?", "choices": ["chair", "human back", "cat", "dog"], "correct_choice_idx": 0, "direct_answers": ["chair", "chair", "chair", "chair", "chair", "chair", "chair", "chair", "pictures", "chair"], "difficult_direct_answer": false, "rationales": ["A chair is pushed up to the wall.", "There are no living beings in this room; it must be a chair.", "It is an article of furniture used for sitting."], "image": "train2014/COCO_train2014_000000170602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239791, "question_id": "7yRkrFXFBKmPwdhuSWj7xa", "question": "What might the man's profession be?", "choices": ["salesman", "artist", "carpenter", "priest"], "correct_choice_idx": 1, "direct_answers": ["artist", "artist", "artist", "sales", "artist", "artist", "sales", "artist", "artist", "artist"], "difficult_direct_answer": false, "rationales": ["The man is an artist since his briefcase says to buy art.", "The suit case the man is sitting on is promoting that people buy from him.", "The man is sitting on a briefcase that has a quote that relates to his profession."], "image": "train2014/COCO_train2014_000000239791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133680, "question_id": "7yWR4JR3tEQbYLiipArHhe", "question": "Why is he sitting on the bench?", "choices": ["scheming", "resting", "playing", "waiting"], "correct_choice_idx": 1, "direct_answers": ["resting", "resting", "resting", "cycle", "resting", "bus stop", "resting", "waiting", "occupied", "resting"], "difficult_direct_answer": false, "rationales": ["The man is taking a break from riding his bike.", "The person is resting.", "The parked bicycle, indicates that he rode to that location and has stopped to rest."], "image": "val2014/COCO_val2014_000000133680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558924, "question_id": "7z8gssywHbDec4Dy8NaG5R", "question": "Why does the animal want to go elsewhere to feed itself?", "choices": ["no water", "short grass", "boats nearby", "muddy"], "correct_choice_idx": 1, "direct_answers": ["low tide", "barren land", "be free", "adventure", "no grass", "no food", "no grass", "no grass", "drought", "short grass"], "difficult_direct_answer": false, "rationales": ["The animal wants the short grass.", "The animal wants to find some short grass to eat.", "The animal can't find sufficient grass."], "image": "train2014/COCO_train2014_000000558924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507385, "question_id": "7zxfmXrPpEzrTVd7Toty2U", "question": "What cartoon character is the man dressed as?", "choices": ["donald duck", "charlie brown", "super mario", "mickey mouse"], "correct_choice_idx": 1, "direct_answers": ["charlie brown", "minion", "charlie brown", "charlie brown", "charlie brown", "minion", "charlie brown", "charlie brown", "charlie brown", "charlie brown"], "difficult_direct_answer": false, "rationales": ["He is wearing a yellow shirt.", "Charlie brown is known for wearing an orange shirt and black pants.", "You can tell by the color and design of his shirt as to what who he is dressed as."], "image": "train2014/COCO_train2014_000000507385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248793, "question_id": "82RqqLgy4ukpz2H4WhYqKh", "question": "What birds are seen in flight here?", "choices": ["robins", "pigeon", "swans", "ducks"], "correct_choice_idx": 1, "direct_answers": ["crows", "pigeons", "eagle", "crows", "seagulls", "pigeons", "pigeon", "doves", "doves", "pigeons"], "difficult_direct_answer": false, "rationales": ["The birds flying over the building are pigeons that live in cities.", "Pigeons can be seen here.", "Dark colored birds are flying overhead. pigeons are dark colored birds that are common."], "image": "val2014/COCO_val2014_000000248793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225271, "question_id": "82Vb45CKYAsmrRijhhVuFg", "question": "What's the area where the people are walking called?", "choices": ["crosswalk", "sidewalk", "runway", "boulevard"], "correct_choice_idx": 0, "direct_answers": ["zebra crossing", "crosswalk", "street", "crosswalk", "crosswalk", "crosswalk", "crosswalk", "crosswalk", "crosswalk", "median"], "difficult_direct_answer": false, "rationales": ["People can cross a street where there are a series of white lines.", "The people are walking across a crosswalk in a street.", "The area is a crosswalk."], "image": "train2014/COCO_train2014_000000225271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265356, "question_id": "82uDFWsngm29xtCc9NDs2A", "question": "What forest animal might one associate with the fruit here?", "choices": ["spider", "gorilla", "bat", "wolf"], "correct_choice_idx": 1, "direct_answers": ["monkey", "monkey", "monkey", "monkey", "monkey", "gorilla", "monkey", "monkey", "monkey", "monkey"], "difficult_direct_answer": false, "rationales": ["Gorillas are known for eating bananas.", "The boy is eating a banana.", "Those big silverbacks love bananas."], "image": "train2014/COCO_train2014_000000265356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215482, "question_id": "838aN7sSZN9s5tesTgofLY", "question": "Why is he holding the bat behind him?", "choices": ["intimidate others", "lost bet", "novice", "hit ball"], "correct_choice_idx": 3, "direct_answers": ["swinging", "preparing swing", "hit preparation", "hit ball", "hit ball", "awaiting pitch", "swing preparation", "preparing swing", "watching pitch", "swing"], "difficult_direct_answer": false, "rationales": ["The batter is getting ready to attempt to hit the ball, as is customary in baseball games.", "The man wants to hit the ball with more force.", "He is ready to swing when the ball comes"], "image": "val2014/COCO_val2014_000000215482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117098, "question_id": "839ptoWEovA7DBSn2jxgMK", "question": "Where are these bikes located?", "choices": ["bike show", "garage", "parking lot", "road"], "correct_choice_idx": 0, "direct_answers": ["motorcycle show", "bike show", "showroom", "exhibition show", "dealer show", "guard", "auto show", "showfloor", "car show", "showroom floor"], "difficult_direct_answer": true, "rationales": ["The bikes are on a shiny floor and the tires are held in place.", "They are on display indoors with people inspecting them and taking photos.", "The bikes are at a show."], "image": "val2014/COCO_val2014_000000117098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374520, "question_id": "83E3Tdt9AVtfaCedrPHq7U", "question": "Which continent headquarters this airline company?", "choices": ["asia", "europe", "north america", "africa"], "correct_choice_idx": 1, "direct_answers": ["european", "europe", "europe", "australia", "europe", "austria", "europe", "europe", "austria", "australia"], "difficult_direct_answer": false, "rationales": ["The airline is based in austria.", "The country is part of that continent.", "The company is headquartered in austria which is on this continent."], "image": "train2014/COCO_train2014_000000374520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528929, "question_id": "83NNSHVYZpy2FHprFLV4xK", "question": "What is an alternate spelling for the name on the license plate?", "choices": ["howard", "harrison", "henry", "harold"], "correct_choice_idx": 2, "direct_answers": ["henry", "henry", "henry", "henry", "henry", "henry", "henry", "henry", "henry", "henry"], "difficult_direct_answer": false, "rationales": ["The license place says henre which sounds close to henry.", "Most people with this name spell it with a \"y\" on the end.", "The name is most often seen with a y on the end."], "image": "train2014/COCO_train2014_000000528929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267684, "question_id": "83kK5ed47BWwk24xaHhYSc", "question": "Where is the sun with respect to the person wearing red coat?", "choices": ["front", "right", "left", "back"], "correct_choice_idx": 3, "direct_answers": ["behind", "left", "behind her", "opposite", "above", "east", "due west", "west", "back", "left"], "difficult_direct_answer": true, "rationales": ["The sun is behind the man with the coat given his shadow.", "The sun is behind the man.", "Reflection come while reflected from the back."], "image": "val2014/COCO_val2014_000000267684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323421, "question_id": "83mJEAX27CLcmWbVbWKbND", "question": "What items are found on the wall?", "choices": ["posters", "hams", "pots", "sausages"], "correct_choice_idx": 2, "direct_answers": ["pots pans", "pots", "cookware", "pans", "pans", "vessels", "pots", "kitchen tools", "pots pans", "pots"], "difficult_direct_answer": false, "rationales": ["Pots of various sizes are on the wall.", "The pots are usually kept by the stove which are used to cook.", "The items are pots."], "image": "train2014/COCO_train2014_000000323421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118480, "question_id": "83pH5caHHxmthbMCUvK4Nu", "question": "Why should this cat be scared?", "choices": ["water", "fire", "noise", "physical harm"], "correct_choice_idx": 0, "direct_answers": ["faucet water", "water", "get wet", "water", "pointing camera", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["Cat's are known to not enjoy water too much. the cat is under a faucet which function is to provide water and because of cat's dislike of water, it should be scared that the water would start and get on it.", "It is in a sink with a faucet", "The cat is in the sink. if the faucet were turned on, then the cat would be scared."], "image": "train2014/COCO_train2014_000000118480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283679, "question_id": "83pghoFg748vSwyyNjGKcv", "question": "What country is it?", "choices": ["spain", "britain", "france", "italy"], "correct_choice_idx": 1, "direct_answers": ["united kingdom", "united kingdom", "england", "london", "england", "england", "england", "england", "britain", "united kingdom"], "difficult_direct_answer": false, "rationales": ["London bridge quarter is shown on two different signs.", "There is a double decker bus.", "England is know for its double-decker buses."], "image": "train2014/COCO_train2014_000000283679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25230, "question_id": "83zzMg7KxL66eshpL2LS8o", "question": "Which cupcake is alcohol-free?", "choices": ["red velvet", "strawberry daiquiri", "cherry amaretto", "bailey's chocolate"], "correct_choice_idx": 0, "direct_answers": ["red velvet", "red velvet", "red velvet", "red velvet", "red velvet", "red velvet", "red velvet", "red velvet", "red velvet", "red velvet"], "difficult_direct_answer": false, "rationales": ["No booze in red velvet.", "The red velvet cupcakes have no alcohol.", "The red velvet is chocolate."], "image": "train2014/COCO_train2014_000000025230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303036, "question_id": "843tmacwD9DbiJFvaf6Ppm", "question": "WHat flower is on the man's blazer?", "choices": ["carnation", "daisy", "poppy", "rose"], "correct_choice_idx": 2, "direct_answers": ["poppy", "poppy", "poppy", "rose", "poppy", "red", "charity organization", "poppy", "red", "poppy"], "difficult_direct_answer": false, "rationales": ["The flower is the same red flower that made the characters sleepy in the wizard of oz.", "It is worn to commemorate armistice day in the uk every november.", "The flower is the poppy."], "image": "train2014/COCO_train2014_000000303036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66459, "question_id": "84asA4Sm8T9ASAXcrihJCQ", "question": "What does the man here look at?", "choices": ["friend", "oncoming traffic", "police", "lights"], "correct_choice_idx": 1, "direct_answers": ["oncoming traffic", "oncoming traffic", "traffic", "traffic", "sidewalk", "traffic", "street", "roadway", "traffic", "street traffic"], "difficult_direct_answer": false, "rationales": ["The man is looking at oncoming traffic.", "A man is trying to cross the street safely.", "He is waiting to cross the street and wants to be sure there aren't any vehicles nearby."], "image": "train2014/COCO_train2014_000000066459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12251, "question_id": "84kZev2PGE8vDT9b3MNJZ4", "question": "What is the man's job?", "choices": ["cashier", "doctor", "dog sitter", "waiter"], "correct_choice_idx": 2, "direct_answers": ["herd dogs", "dog sitter", "invalid question", "dog sitter", "dog man", "dog sitting", "balancing surfboard", "surfer", "dog trainer", "dog surfing"], "difficult_direct_answer": true, "rationales": ["He has a lot of dogs and he probably takes care of them.", "The man is with man dogs because he is a dog sitting that watches them during the day.", "The man is trying to keep the pets occupied."], "image": "train2014/COCO_train2014_000000012251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540642, "question_id": "84p7GsYCoixQdMjTBptstf", "question": "How many cooks prepared meals in this kitchen today?", "choices": ["ten", "four", "five", "none"], "correct_choice_idx": 3, "direct_answers": ["one cook", "one", "none", "one", "zero", "zero", "zero", "zero", "no", "one"], "difficult_direct_answer": false, "rationales": ["The kitchen is clean. it is devoid of garbage and dirty pots and pans.", "There are no people.", "Although there is no way to know for sure, the kitchen is very clean and does not appear to have been used."], "image": "val2014/COCO_val2014_000000540642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556695, "question_id": "84tLGJpQbG8AdJBkrYyQMW", "question": "What is making the man hold the sign?", "choices": ["community service", "safety", "humor", "job"], "correct_choice_idx": 2, "direct_answers": ["humor", "friends", "his hand", "work", "brown stick", "temporary", "construction", "photo pose", "camera person", "job"], "difficult_direct_answer": true, "rationales": ["The man wants to be funny.", "He's posing for a picture and holding coffee so he's trying to be funny", "The man is posing for the camera for a photo."], "image": "train2014/COCO_train2014_000000556695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137228, "question_id": "84wfo9CKhAi5EuMCegCvWB", "question": "What is the man in the blue shirt holding?", "choices": ["drink", "coffee", "game controller", "cell phone"], "correct_choice_idx": 2, "direct_answers": ["wii remote", "cellphone", "controller", "control", "tickets", "remote", "remote", "wii controller", "game controller", "wii remote"], "difficult_direct_answer": false, "rationales": ["The man has a game controller.", "The man is holding a white wii controller and wii is a game and the object is what controls the motion for playing the game.", "He has a strap around his wrist and is facing the television along with someone else."], "image": "train2014/COCO_train2014_000000137228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555738, "question_id": "85FbgH5siJp6b9Mkc8PCxg", "question": "What is the long pipe in the ground most likely used for?", "choices": ["hiking", "irrigation", "decoration", "sports"], "correct_choice_idx": 1, "direct_answers": ["water", "irrigation", "water", "irrigation", "irrigation", "water", "irrigation", "water", "irrigation", "irrigation"], "difficult_direct_answer": false, "rationales": ["The pipe is used for water movement.", "The pipe is for irrigation.", "The long pipe helps make sure the grass gets water."], "image": "train2014/COCO_train2014_000000555738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14543, "question_id": "85hB5utKqDdupW539PLYAR", "question": "What is this type of scene called?", "choices": ["circus", "group photo", "painting", "war"], "correct_choice_idx": 1, "direct_answers": ["mountain", "mountain", "group photo", "snowboarding", "group photo", "mountain", "mountains", "snowboarding", "winter", "snow"], "difficult_direct_answer": false, "rationales": ["People are posing together for a camera while on an expedition, which is common for such a photo.", "This is a group picture being taken.", "The people on the mountain are posed close together to take a group photo."], "image": "train2014/COCO_train2014_000000014543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74299, "question_id": "85hE85vN2knSQoKVoNV3m3", "question": "Why is the bicyclist in the right side of the car?", "choices": ["parked car", "speeding car", "accident", "slow car"], "correct_choice_idx": 0, "direct_answers": ["bike lane", "lanes", "bike lane", "bike lane", "traffic laws", "bike lane", "on sidewalk", "bike lane", "they're driving", "parked car"], "difficult_direct_answer": false, "rationales": ["A car is in the street not moving. cars on the side of the street are parked.", "The car is parked.", "Cars park on the left in europe."], "image": "train2014/COCO_train2014_000000074299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72629, "question_id": "85qaediJAxSACfjrTP59Rc", "question": "Why is it glowing red?", "choices": ["hot", "paint", "juice", "neon"], "correct_choice_idx": 0, "direct_answers": ["hot", "hot", "hot", "heat", "power on", "heating element", "cooking", "it's hot", "hot", "heat"], "difficult_direct_answer": false, "rationales": ["The metal changes color as the temperature changes", "It's an oven element heated up.", "When an electric tube in an oven has been on for awhile, it glows red. an electric tube in an oven produces heat."], "image": "train2014/COCO_train2014_000000072629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35668, "question_id": "85qprTNUGdJ8wBkwb8JRAs", "question": "How are the knives able to hang on the wall?", "choices": ["glue", "rope", "tape", "magnetism"], "correct_choice_idx": 3, "direct_answers": ["magnet", "knife holder", "magnet", "hooks", "magnetism", "magnetic strip", "magnets", "shelf", "on hooks", "hooks"], "difficult_direct_answer": false, "rationales": ["They are hanging by a magnet.", "The shield behind them is one large magnet.", "Knives are on a board on a wall with no hooks."], "image": "train2014/COCO_train2014_000000035668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392170, "question_id": "85zE6bpDX737DMEqHWRG9G", "question": "Storing the items in the water prevents them from what?", "choices": ["changing color", "gathering bacteria", "harboring flies", "losing flavor"], "correct_choice_idx": 0, "direct_answers": ["oxidizing", "changing color", "cooking", "cold", "over cooking", "drying", "brown", "turning brown", "browning", "burning"], "difficult_direct_answer": true, "rationales": ["Cooking the potato's kills any bacteria growth on the vegetable.", "The potatoes won't brown.", "Storing these items in the water prevents them from browning."], "image": "train2014/COCO_train2014_000000392170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141603, "question_id": "865fjdtjBVMmiCNzvY4HqX", "question": "Where are these vehicles located?", "choices": ["parking lot", "garage", "highway", "driveway"], "correct_choice_idx": 0, "direct_answers": ["parking lot", "street", "street", "road", "road", "road", "parking lot", "street", "street", "parking lot"], "difficult_direct_answer": false, "rationales": ["They are in between lines in a parking lot.", "These vehicles are not moving. they are outside and are near other vehicles.", "The area is outside and on a paved surface. there are other vehicles around."], "image": "train2014/COCO_train2014_000000141603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41311, "question_id": "86Hb9Pf4RuiVkU6LmK7f6z", "question": "Around what body part is this person likely to wear the items he holds?", "choices": ["head", "neck", "leg", "arms"], "correct_choice_idx": 1, "direct_answers": ["neck", "neck", "neck", "neck", "neck", "neck", "neck", "neck", "neck", "neck"], "difficult_direct_answer": false, "rationales": ["A necktie is usually around someone's neck.", "You wear a tie around your neck.", "The man is holding up two neck ties that are usually tied around a person's neck."], "image": "train2014/COCO_train2014_000000041311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429935, "question_id": "86U7f69ZiZNEi8G5Ci86uh", "question": "How do these people know each other?", "choices": ["coworkers", "spouses", "rivals", "pen pals"], "correct_choice_idx": 1, "direct_answers": ["lovers", "married", "lovers", "married", "friends", "spouses", "mates", "couple", "couple", "married"], "difficult_direct_answer": false, "rationales": ["These people might be spouses since they're vacationing together.", "They are holding hands like they have a close relationship.", "There is a couple that is holding hands as they ride a wave on their boards."], "image": "train2014/COCO_train2014_000000429935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223580, "question_id": "86VxbGK5MGv89db9bP7sSB", "question": "What type food is more likely served here?", "choices": ["salads", "filet mignon", "hot dog", "chile"], "correct_choice_idx": 2, "direct_answers": ["hot dog", "hot dogs", "hot dog", "bbq", "concession food", "fair food", "hotdogs/snacks", "fast food", "barbecue", "carnival food"], "difficult_direct_answer": true, "rationales": ["Hot dogs would be a good food to serve at the event.", "This is a small fair type event and that would be essential for serving.", "The food is the hot dog."], "image": "train2014/COCO_train2014_000000223580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451944, "question_id": "86XmQXYgFUWaG7n4Hohnsr", "question": "To shop at this mall one must book a ticket to which state?", "choices": ["new jersey", "minnesota", "california", "florida"], "correct_choice_idx": 3, "direct_answers": ["unknown", "florida", "florida", "florida", "california", "california", "florida", "unknown", "florida", "florida"], "difficult_direct_answer": false, "rationales": ["This mall is located in florida.", "Aventura mall, the bus's destination, is located in this state.", "Adventura mall is in miami."], "image": "val2014/COCO_val2014_000000451944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305600, "question_id": "86gTyJiRvnnuzKtpdLxjxG", "question": "What will happen to the refrigerator next?", "choices": ["nothing", "chilling", "freezing", "warm up"], "correct_choice_idx": 3, "direct_answers": ["replaced", "thrown out", "defrosting", "warm up", "become warm", "disassembled", "get repaired", "trash", "clean", "assembled"], "difficult_direct_answer": true, "rationales": ["The refrigerator will warm up without a door on it.", "The door is off the refrigerator so it will not work as well cooling off the products inside without a door to hold in the cold.", "The refrigerator has its door removed which will cause the inside to get warm."], "image": "train2014/COCO_train2014_000000305600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469026, "question_id": "86rwL6yYxeSbw4jpL8iRQU", "question": "Why are the vehicles so old?", "choices": ["people poor", "vintage collectors", "cuban embargo", "old picture"], "correct_choice_idx": 2, "direct_answers": ["cuba", "antique", "cuba", "vintage", "collectors", "no new/imports", "old models", "cuban embargo", "third world", "poor country"], "difficult_direct_answer": true, "rationales": ["Old cars are in the street with worn buildings behind that are lightly colored with metal roofs.", "The cuban embargo made it difficult for cuba to progress technologically and the result were older vehicles and buildings, among others thing.", "The vehicles are in cuba."], "image": "train2014/COCO_train2014_000000469026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157036, "question_id": "86vRxVTrVQAVs2gzdGpSDR", "question": "How old are the animals in this photograph?", "choices": ["middle aged", "young", "various ages", "old"], "correct_choice_idx": 2, "direct_answers": ["various ages", "varied ages", "ten", "young", "diverse ages", "five", "one year", "young", "steppe", "one"], "difficult_direct_answer": true, "rationales": ["There are babies and adult sheep too.", "Some sheep are young, while others are older.", "The animals are various sizes."], "image": "train2014/COCO_train2014_000000157036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222086, "question_id": "877poETZnGE4FuBaT3dUKS", "question": "What is inside the items being upheld here?", "choices": ["dog guts", "cash", "air", "stuffing"], "correct_choice_idx": 3, "direct_answers": ["stuffing", "stuffing", "foam", "stuffing", "teddy bear", "stuffing", "stuffing", "camera", "playing", "cotton"], "difficult_direct_answer": false, "rationales": ["These toys tend to be filled with soft like materials to give them their shapes.", "Because normally dolls are packed with a soft material then it's likely the answer.", "Stuffing is inside of the stuffed animals."], "image": "train2014/COCO_train2014_000000222086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268469, "question_id": "878fPq8vXcuZphY6iU2MKh", "question": "For what purpose is this bike being exhibited?", "choices": ["sale", "no reason", "parked temporarily", "display only"], "correct_choice_idx": 3, "direct_answers": ["collectable", "show", "historical purposes", "display only", "historical interest", "entertainment", "display", "historical", "antique show", "antique"], "difficult_direct_answer": true, "rationales": ["It has the price tag to show that it is to be solved.", "The purpose is for display.", "The sign says it is a festival and there is an exhibit number on it"], "image": "val2014/COCO_val2014_000000268469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159763, "question_id": "87EBYV4nRNotooR2XhqBgx", "question": "What is the man playing about to do?", "choices": ["catch", "swing", "dunk", "block"], "correct_choice_idx": 1, "direct_answers": ["hit ball", "backhand swing", "hit ball", "serve", "return ball", "swing", "hit ball", "hit", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["He has pulled his arm backward so he can hit the ball which is on its way.", "The man is playing tennis.", "His arm is pulled back as one does before swinging the racket."], "image": "train2014/COCO_train2014_000000159763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449990, "question_id": "87NJda3iyB9tgizaKP6aVq", "question": "What is the woman ready to do?", "choices": ["swing", "tackle", "dunk", "dribble"], "correct_choice_idx": 0, "direct_answers": ["swing", "hit ball", "hit baseball", "hit ball", "hit ball", "hit ball", "catch", "hit ball", "swing", "hit ball"], "difficult_direct_answer": false, "rationales": ["The woman is holding a bat.", "The woman wants to swing.", "The woman is holding a bat and looking intently."], "image": "val2014/COCO_val2014_000000449990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26699, "question_id": "87QJGXagmeULwsUYbgbtjE", "question": "What is the person dress in black's job?", "choices": ["umpire", "referee", "1st base", "catcher"], "correct_choice_idx": 1, "direct_answers": ["judgement", "referee", "umpire", "umpire", "referee", "coach", "referee", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["They are not in the player uniform, and their job is to watch and make sure the players are correctly following the rules.", "He keeps track of the rules and makes it fair.", "The person is the referee and would make calls about the game."], "image": "train2014/COCO_train2014_000000026699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264645, "question_id": "87g69e3ps7MJYrWGi5AKJ6", "question": "The persons here are inside what?", "choices": ["space shuttle", "parking garage", "school", "retail space"], "correct_choice_idx": 3, "direct_answers": ["retail space", "bar", "store", "happy excited", "store", "store", "conversation", "store", "building", "store"], "difficult_direct_answer": false, "rationales": ["The people are in a retail store.", "Many people are in a large indoor area with fluorescent lighting. aisles of products can be seen behind.", "The person is inside of a store."], "image": "train2014/COCO_train2014_000000264645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418944, "question_id": "87mrXjPRSbgMRp67poKYNZ", "question": "What temperature is the item held by the girl when in fullest use?", "choices": ["room temperature", "20 degrees", "cold", "hot"], "correct_choice_idx": 3, "direct_answers": ["hot", "hot", "hot", "low", "197 degrees", "hot", "100 degrees", "hot", "hot", "140 f"], "difficult_direct_answer": false, "rationales": ["The hair dryer is really hot since it can dry off hair.", "This is a hair dryer used to dry wet hair", "The temperature is hot."], "image": "val2014/COCO_val2014_000000418944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547435, "question_id": "883qwgSgPGGtcYxogUmm3n", "question": "What athlete has a last name that is similar to the name of the street?", "choices": ["jennie finch", "shawn johnson", "bo jackson", "sasha banks"], "correct_choice_idx": 3, "direct_answers": ["bank street", "jordan", "william banks", "scott banks", "sasha banks", "banksy", "bank", "banksy", "brian banks", "willie banks"], "difficult_direct_answer": true, "rationales": ["Sasha banks is an athlete and the street sign says bank.", "The athlete has the last name \"banks\"", "The name of the street is bank street according to the street sign, which is one letter off from banks."], "image": "val2014/COCO_val2014_000000547435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121557, "question_id": "8848XNfFdA4eEcGdrQJoXN", "question": "What is missing from the white girls mouth?", "choices": ["tongue", "lips", "food", "tooth"], "correct_choice_idx": 3, "direct_answers": ["tooth", "teeth", "tooth", "tooth", "tooth", "tooth", "tooth", "tooth", "tooth", "tooth"], "difficult_direct_answer": false, "rationales": ["She has lost one in the front", "There is a gap between her teeth.", "The little girl has lost her baby teeth."], "image": "train2014/COCO_train2014_000000121557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361572, "question_id": "886mLP9wzooxASjVfrvjTT", "question": "Where could the person quickly get a great tropical fruit to add to their home made ice cream?", "choices": ["spain", "next door", "door dash", "safeway"], "correct_choice_idx": 1, "direct_answers": ["market", "market", "market", "next door", "market", "behind them", "market", "market", "this market", "market"], "difficult_direct_answer": false, "rationales": ["There are tropical fruits at this stand.", "Someone living on this street could walk next door to buy tropical fruit for their home made ice cream.", "The fruit stand appears to be in a neighborhood and can be visited by anyone living next door or in the neighborhood."], "image": "val2014/COCO_val2014_000000361572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156098, "question_id": "889Nh7KXTzoLMDidbL9nGL", "question": "What will cars do when they reach the light?", "choices": ["stop", "slow down", "turn around", "go"], "correct_choice_idx": 3, "direct_answers": ["drive", "go", "speed up", "go", "go", "continue forward", "stop", "stop", "get lighter", "turn"], "difficult_direct_answer": false, "rationales": ["A streetlight is lit green on a large street at night.", "The cars will go when the lights are on.", "The cars will go."], "image": "val2014/COCO_val2014_000000156098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342353, "question_id": "88CyCsGm8v3ak8siJg9Sex", "question": "The hats signify their status as what?", "choices": ["thieves", "dancers", "lawyers", "chefs"], "correct_choice_idx": 3, "direct_answers": ["chefs", "chefs", "chef", "chefs", "chefs", "chief", "chefs", "chefs", "chef", "cooks"], "difficult_direct_answer": false, "rationales": ["The hats are of a size and shape that is commonly known to be used in one setting and imply one specific rank in that setting consistent with answer a.", "Men cooking are wearing white hats. chef wear white hats.", "The people are cutting foods, which means they are likely chefs."], "image": "train2014/COCO_train2014_000000342353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214725, "question_id": "88hbujuPREFx5CnDKc74cA", "question": "Where are the people?", "choices": ["restaurant", "hotel", "lake", "hostel"], "correct_choice_idx": 0, "direct_answers": ["house", "restaurant", "restaurant", "restaurant", "hotel", "sitting down", "indoors", "restaurant", "restaurant", "hotel"], "difficult_direct_answer": false, "rationales": ["The people are gathered at tables for a meal.", "The people are eating at a restaurant.", "People are sitting at tables eating and drinking."], "image": "train2014/COCO_train2014_000000214725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361738, "question_id": "88mYVXQDu74ofGusuBQygg", "question": "What word can be spelled using three of the letters on the red sign?", "choices": ["pot", "led", "lop", ""], "correct_choice_idx": 0, "direct_answers": ["pot", "top", "pot top", "top", "top", "top", "pot", "pot", "pots", "top"], "difficult_direct_answer": false, "rationales": ["The red octagonal sign is a stop sign. it contains the letters s t o p, three of which are also in one of the words given as options.", "The word is stop which contains s-t-o-p.", "The word \"pot\" could be with the last three letters of \"stop\"."], "image": "train2014/COCO_train2014_000000361738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485532, "question_id": "88pWiiw2eUn2yUdUEyopYX", "question": "What type of truck is this?", "choices": ["tow", "ladder", "moving", "fire"], "correct_choice_idx": 0, "direct_answers": ["antique", "big rig", "service truck", "fire", "pick up", "antique", "towing", "tow", "old jeep", "fire truck"], "difficult_direct_answer": true, "rationales": ["The truck is for towing.", "The track has a pulling ladder behind it.", "This looks to be an old tow truck."], "image": "val2014/COCO_val2014_000000485532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91912, "question_id": "88z3opNxSuzKdVnbNzz5fD", "question": "When choosing which water to swim in which color water here seems safer?", "choices": ["light blue", "yellow", "dark blue", "green"], "correct_choice_idx": 0, "direct_answers": ["bright", "blue pool", "blue", "light blue", "bright blue", "light blue", "pool", "pool", "swim", "lighter"], "difficult_direct_answer": false, "rationales": ["The pool is the safer option for swimming.", "This isn't very deep", "The light blue water is generally the safest."], "image": "val2014/COCO_val2014_000000091912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204830, "question_id": "8956gAtmbpkFrEDDMBt85f", "question": "What are the red vegetables at the bottom right?", "choices": ["radishes", "beets", "rhubarb", "cabbage"], "correct_choice_idx": 2, "direct_answers": ["rhubarb", "rhubarb", "rhubarb", "onions", "onions", "rhubarb", "radish", "kale", "swiss chard", "radishes"], "difficult_direct_answer": false, "rationales": ["The pink leafy vegetable is called rhubarb,.", "The produce has red stems.", "The veggies are rhubarb."], "image": "val2014/COCO_val2014_000000204830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410350, "question_id": "89Pg8H3VVjNHncgZceSjMf", "question": "What is the man wearing the black apron doing?", "choices": ["cleaning", "cooking", "streaming", "painting"], "correct_choice_idx": 1, "direct_answers": ["cooking", "watching crowd", "cooking", "cooking", "cooking", "cooking", "cooking", "drinking beverage", "drinking", "cooking"], "difficult_direct_answer": false, "rationales": ["The man in the black apron is a chef and he is cooking a meal.", "Aprons are used when cooking.", "A man is in an apron and uniform with a chefs hat in a kitchen."], "image": "val2014/COCO_val2014_000000410350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573704, "question_id": "89WqKoQe9fdHszPZqMVPbW", "question": "What are they doing?", "choices": ["paying cleaning", "buying uniforms", "selling bats", "betting"], "correct_choice_idx": 3, "direct_answers": ["grabbing money", "conferring", "talking", "gambling", "talking", "waiting", "playing baseball", "talking", "playing baseball", "betting"], "difficult_direct_answer": false, "rationales": ["Although these guys could get in a lot of trouble for betting on baseball (remember pete rose?) none of the other answers make sense. money is changing hands.", "The ball players are trading money.", "The men are at a baseball game and taking out cash which implies they are gambling."], "image": "train2014/COCO_train2014_000000573704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523349, "question_id": "89dR6TxLpohnkZrRC5uhom", "question": "The instrument in the picture is used to play for?", "choices": ["snowboarding", "skiing", "surfing", "skating"], "correct_choice_idx": 1, "direct_answers": ["skiing", "snowboarding", "twin tip", "olympics", "archery", "skiboarding", "skiing", "keytar", "skiing", "snowboard"], "difficult_direct_answer": false, "rationales": ["The instrument in the picture is used to ski.", "There is snow.", "Skiing is done on the snow and these people are in a cold area with snow."], "image": "train2014/COCO_train2014_000000523349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578813, "question_id": "8A3oqs7WUgXYHTTNKqga5a", "question": "What sort of weather happens here frequently?", "choices": ["flood", "rain", "sleet", "wind"], "correct_choice_idx": 3, "direct_answers": ["snow", "snowing", "snow", "snow", "wind", "snow", "snow", "snow", "snowing", "snow"], "difficult_direct_answer": false, "rationales": ["It is at the top of a mountain which gets strong weather", "It must be windy for ice and snow to form.", "This is on top of a mountain. it's very windy at higher altitudes."], "image": "val2014/COCO_val2014_000000578813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261887, "question_id": "8AUZ7dLHuZuTUwdMHFgqqe", "question": "What type of boats are these?", "choices": ["rowboat", "sailboat", "dinghy", "catamaran"], "correct_choice_idx": 1, "direct_answers": ["sailboats", "sail", "sailboat", "sailboats", "sailboats", "sailboats", "sail boats", "sailboats", "sailboats", "sail boats"], "difficult_direct_answer": false, "rationales": ["The boats are sailboats.", "They have tall masts and rigging for sails", "There are visible poles upright on the boats. the purpose of such an apparatus on a boat is almost always for a sail which would make these boats answer a."], "image": "train2014/COCO_train2014_000000261887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116570, "question_id": "8AaPs4oYybusWajU84myi6", "question": "The person operating this computer works in what type of position?", "choices": ["web designer", "doctor", "singer", "architect"], "correct_choice_idx": 0, "direct_answers": ["broadcasting", "sitting", "seated", "web designer", "broadcast/audio", "css podcast", "podcast host", "radio", "seated", "remote"], "difficult_direct_answer": true, "rationales": ["A computer screen shows the logo of a web design program.", "There is a design company on the computer screens.", "The text on the monitor refers to css. this technology is not used by doctors, architects, or singers."], "image": "train2014/COCO_train2014_000000116570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363037, "question_id": "8B3Xyf57xrnkcBymKzhyLs", "question": "What country is the athlete from?", "choices": ["germany", "russia", "poland", "vietnam"], "correct_choice_idx": 1, "direct_answers": ["russia", "usa", "russia", "russia", "russia", "russia", "russia", "russia", "russia", "russia"], "difficult_direct_answer": false, "rationales": ["The tennis player is from russia.", "The back of the woman's shirt contains the name of the country of russia so that's bound to be her home country or else she wouldn't have it on her shirt in this context.", "The country name is on her shirt"], "image": "train2014/COCO_train2014_000000363037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553353, "question_id": "8B5dRa4XtJbHFJvBz9Y73U", "question": "What single item is most out of place?", "choices": ["man", "beverage", "suit", "shower"], "correct_choice_idx": 3, "direct_answers": ["shower", "clothes", "shower", "shower", "suit", "shower", "bottle", "suit", "bottle", "shower"], "difficult_direct_answer": false, "rationales": ["The shower does not belong.", "This man is full clothed and having a drink. a shower is normally used to clean oneself up.", "The man is fully dressed and standing in a shower where his clothes are getting wet."], "image": "val2014/COCO_val2014_000000553353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549119, "question_id": "8B6RaPQVT2vxVBszTRp6Zm", "question": "Where does the bus go next?", "choices": ["bus stop", "bus terminal", "downtown", "uptown"], "correct_choice_idx": 1, "direct_answers": ["next stop", "stop", "new york", "station", "bus terminal", "garage", "city", "servicing area", "bus please", "next stop"], "difficult_direct_answer": true, "rationales": ["The bus is headed to the bus terminal since the sign says to take the next bus.", "The bus goes to the terminal.", "The bus is written on the front face."], "image": "val2014/COCO_val2014_000000549119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265449, "question_id": "8BW5Jue2eEKSJaGYkzaigK", "question": "What is the relationship between the two players?", "choices": ["competitors", "teammates", "coworkers", "siblings"], "correct_choice_idx": 0, "direct_answers": ["lovers", "opponents", "competitors", "friends", "opponents", "competitors", "opponents", "friends", "teammates", "friends"], "difficult_direct_answer": false, "rationales": ["The men are standing on opposite sides of the net and are probably playing against each other.", "Since the men are on opposite sides of the net, they probably played against each other.", "They are standing on opposite sides of the net and likely opponents."], "image": "train2014/COCO_train2014_000000265449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289575, "question_id": "8BXjZ4gy7UXiGo9SUeddVt", "question": "What style of umbrella is seen here?", "choices": ["cruising", "nylon", "thatched", "modern"], "correct_choice_idx": 2, "direct_answers": ["ancient", "straw", "dome", "resort style", "tiki", "coconut fiber", "thatched", "straw", "thatched", "outdoor"], "difficult_direct_answer": false, "rationales": ["The umbrellas are covered in a straw type covering.", "The umbrella is thatched.", "This style is a thatched one."], "image": "train2014/COCO_train2014_000000289575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69553, "question_id": "8BZNwQbjbdV2QVxmkd7ery", "question": "Why is the train stopped?", "choices": ["no fuel", "broken", "red light", "abandoned"], "correct_choice_idx": 2, "direct_answers": ["repair", "red light", "approaching intersection", "red light", "red light", "intersection stop", "train station", "red light", "passengers", "red light"], "difficult_direct_answer": false, "rationales": ["In the front of the train there is a traffic signal on the left of it. since the signal has a red light on, this would explain the train being stopped.", "The train is waiting for the green light.", "There is a signal at the front of the train with red on it"], "image": "val2014/COCO_val2014_000000069553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411109, "question_id": "8BnAfQHHce4rzmztwitYmK", "question": "Why is the man laying on the ground?", "choices": ["unhappy", "fell down", "resting", "touching base"], "correct_choice_idx": 3, "direct_answers": ["slid home", "slidingto base", "sliding", "touching base", "touch base", "he slid", "sliding", "sliding", "diving", "sliding"], "difficult_direct_answer": false, "rationales": ["He slid in the base.", "He just ran really fast from another base and slid in before another player with the ball touches him", "A baseball player is on his stomach in front of a base with his hand outstretched and the umpire standing close by."], "image": "train2014/COCO_train2014_000000411109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566785, "question_id": "8BpVszdQCymke9rySVy9to", "question": "What type of potato is on the plate?", "choices": ["mashed", "scalloped", "sweet", "french fried"], "correct_choice_idx": 0, "direct_answers": ["mashed", "mashed", "mashed", "mashed", "puree", "puree", "mashed", "mashed", "mashed", "mashed"], "difficult_direct_answer": false, "rationales": ["The potatoes are mashed.", "White creamy potatos can be seen on a plate.", "Mashed potatoes are being eaten with the meal."], "image": "train2014/COCO_train2014_000000566785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561131, "question_id": "8BuW2mf9L9EoFQdtJKFF5y", "question": "What must the weather be like in this area?", "choices": ["tropical", "warm", "mild", "cold"], "correct_choice_idx": 3, "direct_answers": ["cold", "cold", "freezing", "cold", "winter", "cold", "cold", "snowy", "cold", "cold"], "difficult_direct_answer": false, "rationales": ["There is snow everywhere, and the people have heavy jackets and gloves.", "People are skiing in a snowy area. it must be cold for snow to remain frozen.", "The area is snowy so it must be cold."], "image": "train2014/COCO_train2014_000000561131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525211, "question_id": "8CkxE5xoYNZkSj3GZ33AqM", "question": "What does the container hold?", "choices": ["jelly", "cream", "sugar", "butter"], "correct_choice_idx": 0, "direct_answers": ["pastry creme", "jelly", "filling", "jelly", "filling", "jelly", "chocolate icing", "jelly", "chocolate", "caramel"], "difficult_direct_answer": false, "rationales": ["The person is squeezing the fruit filling for these kind of donuts into the holes", "These types of pastries are normally filled with some sort of sugary cream.", "A person is squirting a bottle filled with a red substance into donut holes. donuts are often filled with jelly."], "image": "val2014/COCO_val2014_000000525211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6941, "question_id": "8D3aaVvkSua55NqT6fmqVK", "question": "What is the name of the file that is playing?", "choices": ["music", "jamsign", "signaljam", "signali"], "correct_choice_idx": 2, "direct_answers": ["signaljam", "signal jam", "singlejam", "wav", "signaljam", "signal jam", "sign jam", "video", "signaljam", "mp4"], "difficult_direct_answer": false, "rationales": ["The title at the top of the video signals the type of file", "The video that is being played has the file name at the top of the window that says signal_jam", "The file name is listed on the top of the window."], "image": "train2014/COCO_train2014_000000006941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406340, "question_id": "8DGEo4otD9FCAesVsgUnpB", "question": "Where is someone working?", "choices": ["home", "library", "office", "courthouse"], "correct_choice_idx": 0, "direct_answers": ["at home", "kitchen", "kitchen", "kitchen", "house", "counter", "kitchen shelf", "home", "good", "kitchen"], "difficult_direct_answer": false, "rationales": ["The laptop and bag are on a counter in a house.", "There is a laptop.", "The sink and dishwasher indicate that this is a residential kitchen."], "image": "val2014/COCO_val2014_000000406340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393864, "question_id": "8DKJE3FZcJ9W8UgJzwwLy8", "question": "What does it look like someone spilled here?", "choices": ["white wine", "milk", "red wine", "salt"], "correct_choice_idx": 2, "direct_answers": ["food", "nothing", "food", "coffee", "liquid", "crumbs", "cheese", "red wine", "crumbs", "water"], "difficult_direct_answer": false, "rationales": ["The pan cake seems to have red color on top of it.", "A dark spot is on a table that is not well lit.", "Due to the color of the liquid you can safely tell what was spilled."], "image": "val2014/COCO_val2014_000000393864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434006, "question_id": "8DV2f3wGUcKPeBEexvdxaM", "question": "What is the sky producing?", "choices": ["hail", "rain", "snow", "sunshine"], "correct_choice_idx": 3, "direct_answers": ["nothing", "sunshine", "sunshine", "sun", "wind", "sunshine", "sunshine", "shadows", "sunshine", "ight"], "difficult_direct_answer": false, "rationales": ["The sun is producing sunshine on the beach.", "Kids are playing on a sunny day.", "The sky is clear. it is not hailing, snowing, or raining."], "image": "val2014/COCO_val2014_000000434006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556516, "question_id": "8DnRgVBTEHyv8K6GE3Se2X", "question": "What seasonings are visible?", "choices": ["mayo", "salt pepper", "paprika", "hot pepper"], "correct_choice_idx": 1, "direct_answers": ["salt pepper", "salt pepper", "salt pepper", "pepper", "yellow pepper", "salt pepper", "salt", "salt pepper", "salt pepper", "salt pepper"], "difficult_direct_answer": false, "rationales": ["The shakers are black and white.", "The are salt and pepper shakers on the dining room table.", "Salt and pepper shakers are available."], "image": "val2014/COCO_val2014_000000556516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133025, "question_id": "8E5y8ETCHnzmEACjQrerZA", "question": "What is the name of the sports equipment the three players are holding?", "choices": ["ball", "bat", "stick", "hook"], "correct_choice_idx": 1, "direct_answers": ["bat", "bat", "bats", "bat", "cricket", "bat", "bat", "baseball bats", "baseball bats", "bats"], "difficult_direct_answer": false, "rationales": ["The three men are all holding baseball bats. they are wearing baseball uniforms.", "Each man is holding something to whack a baseball.", "They're holding clubs made of metal that hit baseballs."], "image": "val2014/COCO_val2014_000000133025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187582, "question_id": "8EERzDvqSjgiaBPEt86AqM", "question": "Who likely packed this persons bags?", "choices": ["she did", "port authority", "dog", "parents"], "correct_choice_idx": 3, "direct_answers": ["adult", "parents", "child", "parents", "parent", "parent", "parents", "parents", "her parents", "clothes"], "difficult_direct_answer": false, "rationales": ["The child is much too young to pack his own bags.", "The parents packed.", "She is too young to pack a bag. dogs cannot pack bags."], "image": "train2014/COCO_train2014_000000187582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71908, "question_id": "8EJTLbZa3Q3sc9gW2iuRUy", "question": "Where is this ramp located?", "choices": ["skate park", "boardwalk", "grocery parking", "parking lot"], "correct_choice_idx": 0, "direct_answers": ["parking lot", "skate park", "parking lot", "skate park", "parking lot", "skate park", "parking lot", "skate park", "skate park", "skate park"], "difficult_direct_answer": false, "rationales": ["These kids are skating; we can assume this is a skate park.", "This is a built ramp that is commonly associated with skateboarding and would only belong in a place meant for skateboarding.", "The ramp is for a sports camp since the camp is advertising for the summer."], "image": "train2014/COCO_train2014_000000071908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401552, "question_id": "8ESofyraMtEEMxP4cF5TU4", "question": "What is the child protecting themselves from with the umbrella?", "choices": ["snow", "sand", "sun", "rain"], "correct_choice_idx": 3, "direct_answers": ["raining", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "raining", "rain"], "difficult_direct_answer": false, "rationales": ["The child is using the umbrella to protect themselves from the rain.", "Given that the child is wearing summer clothing and the ground is wet, rain would be what is falling.", "The lighting is somewhat dark, as if it is cloudy outside where the child is standing. there is a little water on the ground around the child in places where water wouldn't normally be."], "image": "val2014/COCO_val2014_000000401552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327032, "question_id": "8EgWizCoaBFHcV7nT9CYYv", "question": "What are the metal objects hanging from the ceiling?", "choices": ["chimes", "pots", "pipes", "forks"], "correct_choice_idx": 1, "direct_answers": ["cooking pots", "pans", "cooking pots", "pans", "pans", "pans", "pans", "pans", "pans", "pots"], "difficult_direct_answer": false, "rationales": ["The objects are pots.", "The items are in the kitchen and they are shiny containers.", "It is a hanger to keep the kitchen organized so you can use cabinets for other things"], "image": "train2014/COCO_train2014_000000327032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173284, "question_id": "8FyKrqB5ZAsdBFBEXCtfh6", "question": "What type of building are they in?", "choices": ["hospital", "commercial", "school", "residential"], "correct_choice_idx": 3, "direct_answers": ["home", "kitchen", "house", "house", "residential", "house", "house", "house", "house", "home"], "difficult_direct_answer": false, "rationales": ["The house looks cozy.", "They appear to be standing in the kitchen of a private home.", "This room shown is a residential kitchen."], "image": "train2014/COCO_train2014_000000173284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253770, "question_id": "8G4La75DCKmAtpdk5QQ6ZU", "question": "What major gaming franchise is being advertised on the window?", "choices": ["pokemon", "twin peaks", "ghostbusters", "mask"], "correct_choice_idx": 0, "direct_answers": ["pokemon", "pokemon", "pokemon", "pokemon", "pokemon", "pokemon", "pokemon", "pokemon", "pokemon", "pokemon"], "difficult_direct_answer": false, "rationales": ["Pokemon's logo is on the window.", "The blue and yellow lettering matches their branding, as well as the letters m, o, and n can be seen.", "There is a sign in the window that has the yellow, puffy font of this popular game."], "image": "val2014/COCO_val2014_000000253770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178635, "question_id": "8G5QRE2rXbtgCtLvSvS66q", "question": "What is the average size of skateboard?", "choices": ["9inches", "12inches", "15inches", "8inches"], "correct_choice_idx": 3, "direct_answers": ["8inches", "33x8.5", "twenty seven", "31 inches", "twelve inches", "31-33", "foot", "32 inches", "one meter", "31 inches"], "difficult_direct_answer": true, "rationales": ["The average skateboard is over a foot.", "The average skateboard is eight inches.", "The answer is internet searchable, and none of the answers are the correct length but a is close to the width."], "image": "train2014/COCO_train2014_000000178635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316807, "question_id": "8GM5X8Rd5DwUXWPuRAHxh9", "question": "What type of shot is the girl hitting?", "choices": ["drop shot", "serve", "forehand", "slice"], "correct_choice_idx": 1, "direct_answers": ["tennis shot", "backhand", "upwards", "overhand", "over hand", "overhand", "lob", "underhand", "serve", "forward"], "difficult_direct_answer": true, "rationales": ["The girl swung her tennis racquet to serve the ball over the net.", "The girl is serving the ball.", "She is hitting the ball to her opponent."], "image": "train2014/COCO_train2014_000000316807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14873, "question_id": "8GMt5LXWZoC4hP4LsvnNky", "question": "What type of area is shown?", "choices": ["public", "rural", "residential", "private"], "correct_choice_idx": 0, "direct_answers": ["intersection", "street", "street", "road", "street", "pedestrian zone", "public", "city street", "urban", "busy"], "difficult_direct_answer": false, "rationales": ["The place is open with a lot of streets.", "This picture is an outside city setting with many people walking around.", "The area is public."], "image": "train2014/COCO_train2014_000000014873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61308, "question_id": "8GQo6C4xMJ7x7vyzRjss8j", "question": "What type of monster is the man trying to be?", "choices": ["werewolf", "vampire", "ghost", "zombie"], "correct_choice_idx": 3, "direct_answers": ["zombie", "zombie", "zombie", "zombie", "zombie", "vampire", "vampire", "zombie", "zombie", "zombie"], "difficult_direct_answer": false, "rationales": ["The man is walking around like a human but with blood all over his face so he is probably one of the walking dead.", "The man has fake blood painted on his mouth and looks undead.", "A zombie is being deplicated."], "image": "train2014/COCO_train2014_000000061308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259265, "question_id": "8GX3YMya6hzqYaaCQKcwBj", "question": "What is the next color for the traffic light?", "choices": ["white", "blue", "yellow", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "green", "red", "green", "yellow", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The light is currently red. the light that comes next after waiting long enough is always green.", "Traffic lights have three distinct colors, each with a different meaning to drivers negotiating the traffic.", "The light is red first."], "image": "train2014/COCO_train2014_000000259265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77936, "question_id": "8Gi92wsQWtRLK3wufuBkob", "question": "What color shown here is most unique?", "choices": ["cream", "black flower", "shiny tan", "gray"], "correct_choice_idx": 1, "direct_answers": ["black flower", "blue", "blue", "purple", "purple", "purple", "blue", "blue", "blue", "purple"], "difficult_direct_answer": false, "rationales": ["The black flower is the odd one out.", "The color is a flower.", "It is rare to see black flowers."], "image": "train2014/COCO_train2014_000000077936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158922, "question_id": "8Gj7tDauwUPa2nL9RsYNFo", "question": "What type of location is this?", "choices": ["marsh", "slope", "desert", "field"], "correct_choice_idx": 3, "direct_answers": ["field", "field", "field", "park", "park", "field", "field", "field", "field", "park"], "difficult_direct_answer": false, "rationales": ["There is a large area of grass.", "They are in a large, flat, grassy area.", "The area is grassy."], "image": "val2014/COCO_val2014_000000158922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548792, "question_id": "8HAPWkFe4PLZQctB2xgwYp", "question": "What is the size of the buildings?", "choices": ["short", "miniature", "tall", "flat"], "correct_choice_idx": 2, "direct_answers": ["tall", "skyscraper tall", "large", "midrise", "skyscraper", "skyscraper tall", "tall", "skyscrapers", "fairly high", "skyscraper"], "difficult_direct_answer": false, "rationales": ["The buildings are skyscrapers.", "The picture is showing the buildings from the group up and tower over the lights.", "Traffic lights can be considered as tall, and the building clearly is much taller than the traffic light. the sky as a backdrop emphasizes the building's vast height."], "image": "train2014/COCO_train2014_000000548792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136565, "question_id": "8HCnNRph23wfQ4DPa2igpZ", "question": "Which way is the skateboard most likely to fall?", "choices": ["forward", "backward", "left", "right"], "correct_choice_idx": 1, "direct_answers": ["behind him", "left", "backward", "back", "backward", "back", "forward", "forward", "forward", "scatting"], "difficult_direct_answer": false, "rationales": ["It seems to be in balance and can easily fell back.", "The answer is not knowable based on the image, but if he walked forward his forward momentum might tip it back.", "The board has more weight in the back."], "image": "train2014/COCO_train2014_000000136565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518626, "question_id": "8HLhe7xWFWZiFWZCMz6do5", "question": "What item is painted in an unconventional color?", "choices": ["balloon", "nearest plane", "tarmac", "farthest plane"], "correct_choice_idx": 1, "direct_answers": ["airplane", "entire plane", "airplane", "nearest plane", "plane", "airplane", "airplane", "military jet", "plane", "plane"], "difficult_direct_answer": false, "rationales": ["Most military aircraft are painted gray but this one is yellow.", "It is not normal to see a yellow plane", "The lime green color is not often seen on aircraft."], "image": "train2014/COCO_train2014_000000518626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174802, "question_id": "8HRWzHQxgSz9EMurXFv4CB", "question": "What does the sign say?", "choices": ["biohazard", "stop", "garbage", "proceed"], "correct_choice_idx": 0, "direct_answers": ["biohazard", "biohazard", "biohazard", "biohazard", "biohazard", "biohazard", "biohazard", "biohazard", "biohazard", "biohazard"], "difficult_direct_answer": false, "rationales": ["A biohazard sign is in a bathroom.", "The sign on the wall says biohazard on it.", "The sign says that it's biohazard."], "image": "train2014/COCO_train2014_000000174802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96660, "question_id": "8Ha6jnMYKzYRoC53X5M9QY", "question": "Why is he wearing this suit?", "choices": ["costume", "warmth", "fashion", "visibility"], "correct_choice_idx": 1, "direct_answers": ["playing", "for warmth", "surfing", "surfer", "surfing", "warmth", "keep warm", "cold water", "surfing", "water sport"], "difficult_direct_answer": false, "rationales": ["The man needs to stay warm while in the ocean water.", "These are worn when the water is really cold", "Ocean water can be cold."], "image": "train2014/COCO_train2014_000000096660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572575, "question_id": "8HdJs6CUHBUkxbLkrFv8HU", "question": "What is the monitor sitting on top of above the desk?", "choices": ["papers", "books", "blocks", "stand"], "correct_choice_idx": 1, "direct_answers": ["books", "books", "books", "books", "books", "cpu", "computer monitor", "samsung", "computer monitor", "books"], "difficult_direct_answer": false, "rationales": ["It looks like to be sitting on some books", "The monitor is on books.", "Those are the spines of the objects containing the titles with the pictures of what the stories' characters would be about."], "image": "train2014/COCO_train2014_000000572575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347359, "question_id": "8HpSbcgS5Nvu2FhZyF69v7", "question": "In what state does the person skateboard here?", "choices": ["england", "new mexico", "new york", "bermuda"], "correct_choice_idx": 2, "direct_answers": ["new york", "new york", "new york", "brooklyn", "new york", "new york", "brooklyn", "new york", "brooklyn", "new york"], "difficult_direct_answer": false, "rationales": ["The sign lists the name of one of this city's five boroughs.", "The person is from new york.", "A bigger and well known city is listed on the sign."], "image": "train2014/COCO_train2014_000000347359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69000, "question_id": "8J36tKtoHdLoDuStHbmfsS", "question": "What is the woman wearing sleeveless shirt doing?", "choices": ["recording", "taking photo", "using phone", "itching"], "correct_choice_idx": 2, "direct_answers": ["smiling", "writing", "writing", "on phone", "talking", "reading", "smiling", "talking phone", "smiling", "using phone"], "difficult_direct_answer": false, "rationales": ["The woman has a mobile device in her hand. she is having a conversation with someone else.", "Her hand is by her ear with a device in it.", "The woman appears to have a device in her hand of the right size of answer a usage and she is holding it to her ear as one would do."], "image": "val2014/COCO_val2014_000000069000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408138, "question_id": "8J4t2srKxhspNu5A8rEx6U", "question": "The bus in the background is going to the home of which endangered animal?", "choices": ["bald eagle", "red squirrel", "peregrine falcon", "koala"], "correct_choice_idx": 1, "direct_answers": ["red squirrels", "red squirrel", "whinfell", "elephant", "zoo animal", "whales", "red squirrel", "tiger", "newt", "whinefell"], "difficult_direct_answer": true, "rationales": ["The bus is home to the red squirrel.", "The animal is endangered.", "The bus in the background is from the red squirrel's home."], "image": "val2014/COCO_val2014_000000408138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24621, "question_id": "8JEAkxUQsAjRJLjhs2wEDh", "question": "What is the yellow object with blue legs?", "choices": ["desk", "ramp", "box", "door"], "correct_choice_idx": 0, "direct_answers": ["stand", "desk", "desk", "desk", "table", "table", "table", "desk", "desk", "desk"], "difficult_direct_answer": false, "rationales": ["This has drawers and a top and a place for a chair to sit in", "There is an old office desk out side a building that seems to be broken.", "It has a large flat rectangular surface, space to sit on one side and three drawers on the other."], "image": "train2014/COCO_train2014_000000024621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169152, "question_id": "8JEVh5g7TS5rGtgjHxitXM", "question": "What activity are the people on the beach doing?", "choices": ["surfing", "running races", "building sandcastles", "flying kites"], "correct_choice_idx": 3, "direct_answers": ["skating", "flying kites", "flying kites", "flying kites", "parasailing", "flying kites", "kite surfing", "kite flying", "flying kites", "surfing"], "difficult_direct_answer": false, "rationales": ["The items seen in the sky, are toys known as kites which can be flown in the air as a recreational activity.", "They are standing on the beach with large airborne objects floating directly above them.", "They are flying kites while on the beach."], "image": "val2014/COCO_val2014_000000169152.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207385, "question_id": "8JHjnd4C4xPgZZfJrE4sVZ", "question": "What are the plates made from?", "choices": ["plastic", "glass", "steel", "wood"], "correct_choice_idx": 1, "direct_answers": ["ceramic", "glass", "porcelain", "ceramic material", "ceramic", "ceramic", "ceramic", "ceramic", "porcelain", "ceramic"], "difficult_direct_answer": false, "rationales": ["The plates are white, not brown or silver. they are not made out of plastic.", "The plates are made of ceramic but that is not an option.", "The dinnerplates look shiny and smooth which looks like glass."], "image": "train2014/COCO_train2014_000000207385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487076, "question_id": "8JUWvr3b79zi7TqKZoVLBh", "question": "At this intersection how many directions of traffic are required to first stop before proceeding?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 2, "direct_answers": ["all", "two", "all ways", "four", "four", "one", "one", "two", "four", "all directions"], "difficult_direct_answer": false, "rationales": ["A car needs to look both ways to the side and at the front.", "Two sides have \"stop\" written in the lanes.", "The other road doesn't have any stop signs."], "image": "val2014/COCO_val2014_000000487076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469919, "question_id": "8JVgR9FMhfySh7eFkuuemy", "question": "Why are the umbrellas different colors?", "choices": ["discolored", "hiding", "less expensive", "for sale"], "correct_choice_idx": 3, "direct_answers": ["different brands", "visibility", "for sale", "fun", "to differentiate", "decoration", "style", "decorative purposes", "just are", "variety"], "difficult_direct_answer": true, "rationales": ["The umbrellas are different colors for the purpose of sale.", "The person wants to display a rainbow for sales.", "They are for sale."], "image": "val2014/COCO_val2014_000000469919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409796, "question_id": "8JhLagkQuPxhvamHr5zmth", "question": "What kind of activity is on the image above?", "choices": ["broadcasting", "ploughing", "cultivating", "digging"], "correct_choice_idx": 1, "direct_answers": ["plowing", "ploughing", "plowing", "farming", "plowing", "plowing", "plowing", "plowing", "farming", "plowing"], "difficult_direct_answer": false, "rationales": ["The man is ploughing up and clearing the field.", "They are working in the field.", "Because of their strength and stamina bulls are used by farmers to call equipment that stirs up the ground so that they can plant crops. i can see in this picture the animals are attached to a device and the people are directing the animals."], "image": "train2014/COCO_train2014_000000409796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367519, "question_id": "8JrCpFHnyt4o2LxuHp3rJR", "question": "What is associated with this sport?", "choices": ["nba", "nhl", "mls", "mlb"], "correct_choice_idx": 2, "direct_answers": ["football", "fifa", "legs", "soccer", "soccer", "mls", "running", "running", "kicks", "kicking"], "difficult_direct_answer": false, "rationales": ["The kids are playing soccer.", "The mls is associated.", "They are playing association football, not baseball, basketball, or hockey."], "image": "train2014/COCO_train2014_000000367519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419503, "question_id": "8JtC9onn6J5q4pMcvF7P8K", "question": "What type of meat is most visible near the front of the glass?", "choices": ["red meat", "chicken", "fake", "fish"], "correct_choice_idx": 0, "direct_answers": ["roast beef", "ha", "red meat", "steak", "steak", "red meat", "ham", "steak", "pork", "ham"], "difficult_direct_answer": false, "rationales": ["The meat is red meat.", "Red meat is shown near the glass.", "Red meat in the form of roast beef is visible near the front of the glass."], "image": "val2014/COCO_val2014_000000419503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140088, "question_id": "8JwyrEDh2PTbFZtUBQeFD8", "question": "The woman must look at what to ensure the frisbee goes the right direction?", "choices": ["people sitting", "other person", "sky", "ground"], "correct_choice_idx": 1, "direct_answers": ["other person", "receiver", "her target", "target", "left", "other participant", "hand", "other player", "hand", "hand"], "difficult_direct_answer": false, "rationales": ["A woman is playing frisbee. it takes two people to play frisbee.", "Answer a is commonly known to be the most effective way to direct a frisbee.", "The person's goal is for the other person to catch the frisbee so she must look at the other person in order to properly aim the frisbee."], "image": "train2014/COCO_train2014_000000140088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326320, "question_id": "8JyJ5UDdXFhp5q4K7Qg9H5", "question": "What is the man drinking under the Phone sign?", "choices": ["coffee", "slurpee", "milkshake", "coke"], "correct_choice_idx": 0, "direct_answers": ["soda", "better hearing", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "better hearing", "coffee"], "difficult_direct_answer": false, "rationales": ["He has a disposable hot cup in his hand.", "A man is holding a paper cup with a lid. coffee is served in a cup with a lid.", "He is holding a coffee in his hand."], "image": "train2014/COCO_train2014_000000326320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193540, "question_id": "8KBZ52a6SV4cKXqUzFpzcJ", "question": "What color hair does the woman at the back of this quartet have?", "choices": ["pink", "blonde", "red", "blue"], "correct_choice_idx": 1, "direct_answers": ["black", "blonde", "blonde", "black", "black", "blond", "blonde", "brown", "blonde", "blonde"], "difficult_direct_answer": false, "rationales": ["The woman's hair is a light color.", "Blonde hair is light colored hair. the other options are not light colors.", "The woman has yellow hair."], "image": "train2014/COCO_train2014_000000193540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81443, "question_id": "8KaRCeozkgqJ9i6zuuoXjc", "question": "How was the skater able to elevate the skateboard?", "choices": ["ramp", "glue", "thrown up", "kick flip"], "correct_choice_idx": 3, "direct_answers": ["kick flip", "force", "jump", "jumping", "jumping", "kick flip", "kick flip", "jumping", "kickflip", "propulsion"], "difficult_direct_answer": false, "rationales": ["The skater kicked the board up.", "The skater was able to jump by doing a kick flip with his board.", "The surface is flat, and glue would not be useful in this situation. the skater elevated the skateboard by performing a trick."], "image": "train2014/COCO_train2014_000000081443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27074, "question_id": "8Kw9PhTNHAgXjdEHxtd2UV", "question": "What did the man most likely use the giant scissors for?", "choices": ["digging hole", "giant paper", "giant pizza", "cutting ribbon"], "correct_choice_idx": 3, "direct_answers": ["cut ribbon", "ribbon cutting", "ribbon cutting", "cutting", "cut ribbon", "big ribbon", "big", "cutting ribbon", "ceremony", "cut ribbon"], "difficult_direct_answer": false, "rationales": ["When a new place is open they will cut ribbon for the grand opening.", "The huge scissors are used in the celebration of the opening of a new building.", "A man in business attire is holding large scissors up outside while surrounded by others."], "image": "train2014/COCO_train2014_000000027074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449136, "question_id": "8LJpjChJQW7tMX7d8mcVmg", "question": "What does the surfer need to ride that only the water can produce?", "choices": ["drops", "foam", "waves", "salt"], "correct_choice_idx": 2, "direct_answers": ["waves", "wave", "waves", "waves", "wave", "wave", "waves", "waves", "waves", "wave"], "difficult_direct_answer": false, "rationales": ["The only way for the surfboard to move is to have a wave propel it, and the ocean or big body of water is the only thing capable of naturally producing waves.", "Surfing is a sport that is conducted on waves and waves are something that is produced in water.", "The water pushes fast and rises as suddenly hits more shallow water and lifts the surfer on it"], "image": "train2014/COCO_train2014_000000449136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442743, "question_id": "8LRUTCtfLRLNcKqNFaCAnu", "question": "Which lass of nutrient is missing in the meal in the above picture?", "choices": ["proteins", "vitamins", "fats", "carbohydrate"], "correct_choice_idx": 3, "direct_answers": ["protein", "starch", "fruit", "carbohydrate", "dairy", "carbohydrates", "carbs", "carbs", "starch", "fruit"], "difficult_direct_answer": false, "rationales": ["There are meat and veggies on the plate.", "They are missing out on carbs.", "The meal includes meat (protein) and vegetables, but does not have bread or potatoes."], "image": "train2014/COCO_train2014_000000442743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69015, "question_id": "8LYUhQNPHzeYYZfP5rR8Qs", "question": "Why would someone sit at this area?", "choices": ["wash", "work", "clean", "eat"], "correct_choice_idx": 1, "direct_answers": ["work", "computer work", "internet access", "work", "work", "to work", "to work", "work", "work", "to work"], "difficult_direct_answer": false, "rationales": ["Most people in this scenario will use the laptop for work.", "There is a laptop sitting on the table and most people would use it to work.", "The person would be getting business done at their laptop."], "image": "val2014/COCO_val2014_000000069015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362284, "question_id": "8LZVwVHiUnBmjytgfcHmYb", "question": "Where can you find this animal?", "choices": ["new jersey", "india", "siberia", "russia"], "correct_choice_idx": 1, "direct_answers": ["africa", "differnt places", "africa", "africa asia", "zoo", "africa", "india", "africa", "forest", "africa"], "difficult_direct_answer": false, "rationales": ["Elephants can be found in india.", "Elephants are in india.", "The elephant is in india."], "image": "train2014/COCO_train2014_000000362284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431306, "question_id": "8LiHEj6TqKPvWpTzhWvW28", "question": "What bathroom is it on the right?", "choices": ["women", "handicapped", "transgender woman", "men"], "correct_choice_idx": 3, "direct_answers": ["men", "men's bathroom", "male", "mens", "women", "public", "public bathroom", "gents", "mens", "men"], "difficult_direct_answer": false, "rationales": ["There is a man in this bathroom. it is a bathroom that has multiple stalls, and is not residential.", "The men's room is on the right and it says so.", "The bathroom is for men."], "image": "val2014/COCO_val2014_000000431306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293385, "question_id": "8LogLzD5fJ28iFdzxj7rwM", "question": "What type of bread is on the sandwich?", "choices": ["wheat", "white", "sourdough", "rye"], "correct_choice_idx": 0, "direct_answers": ["wheat", "wheat", "seeded", "brown bread", "whole wheat", "whole wheat", "chocolate", "wheat", "rye", "bread sandwich"], "difficult_direct_answer": false, "rationales": ["The sandwich is on wheat bread.", "Wheat bread is used.", "The bread is wheat bread since it's darker in color."], "image": "train2014/COCO_train2014_000000293385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7125, "question_id": "8M5aEtT34H4gZjz3uNYexB", "question": "For whom is the paved path used?", "choices": ["pedestrians", "military", "engineers", "pilots"], "correct_choice_idx": 0, "direct_answers": ["cyclists pedestrians", "bike", "walkers", "pedestrians", "bicyclists", "walkers bikers", "every one", "bicycles", "pedestrians", "pedestrians"], "difficult_direct_answer": false, "rationales": ["Civilian people are walking on the paved path. there are no airplanes or trains near the path.", "People like to walk on even surfaces.", "The path is for people to walk along."], "image": "val2014/COCO_val2014_000000007125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294714, "question_id": "8M6SZLGBz5Wk6TfYJamdod", "question": "Approximately how many people live in this city?", "choices": ["2 million", "1 billion", "9 million", "6 million"], "correct_choice_idx": 2, "direct_answers": ["nine million", "9425622 people", "8 million", "thousands", "9 million", "two million", "eight million", "twelve million", "9 million", "9 million"], "difficult_direct_answer": false, "rationales": ["The city's population is documented in several locations.", "The population of london is approximately 8.9 million people.", "The amphibious bus is based in london."], "image": "train2014/COCO_train2014_000000294714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452022, "question_id": "8MAVRocnasDUJjZzynZ2vn", "question": "What is the name of the sporting item the man hold in his hand?", "choices": ["bat", "steel", "racket", "stick"], "correct_choice_idx": 2, "direct_answers": ["tennis racquet", "tennis racquet", "racket", "tennis racket", "racket", "tennis racquet", "racket", "wimbledon", "tennis racket", "racquet"], "difficult_direct_answer": false, "rationales": ["By its design and setting you can easily tell what the man is holding.", "Tennis is played with a racquet.", "He is playing tennis."], "image": "train2014/COCO_train2014_000000452022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390483, "question_id": "8MGh6tNZCsHEDN68mhNXYy", "question": "What are these birds called?", "choices": ["stork", "pelican", "wren", "flamingo"], "correct_choice_idx": 3, "direct_answers": ["flamingo", "pelican", "flamingoes", "flamingos", "flamingo", "flamingos", "flamingoes", "flamingo", "flamingos", "flamingo"], "difficult_direct_answer": false, "rationales": ["The birds in the water are flamingos that are pink and have long legs.", "The pink birds in the water are flamingos that live in the tropics.", "The birds are clearly visible in the image and have the coloring and body shape that is known to be of answer a."], "image": "train2014/COCO_train2014_000000390483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450308, "question_id": "8MJKbRcEwFaXJXBQYnC87K", "question": "Where is the man in?", "choices": ["pool", "street", "yard", "playground"], "correct_choice_idx": 1, "direct_answers": ["street", "flood water", "flooded street", "floodwater", "flood", "water", "flood", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The man is biking through a street flood.", "The man is in water but it appears the street is flooded as it is in front of a business.", "It is a flooded road in a town"], "image": "val2014/COCO_val2014_000000450308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569272, "question_id": "8MU7MGmYpQ5xLiQ3DakgxE", "question": "What type tobacco product will the person who sits here smoke?", "choices": ["snuff", "cigar", "hookah pipe", "bong"], "correct_choice_idx": 1, "direct_answers": ["cigar", "plastic", "cigars", "pot", "cigar", "cigar", "cigar", "cigars", "cigar", "cigar"], "difficult_direct_answer": false, "rationales": ["There is a cigar.", "As you can see near the keyboard a cigar is the only tobacco product shown.", "There is a visible cigar on the desk which is a tobacco product that would be smoked. it is likely that the person who uses this desk also uses the things on the desk and thus would smoke the cigar."], "image": "val2014/COCO_val2014_000000569272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489469, "question_id": "8Mj5Rx5J4agiTDvMP3iqBN", "question": "In what country are these people?", "choices": ["australia", "britain", "china", "us"], "correct_choice_idx": 1, "direct_answers": ["great britain", "england", "england", "england", "england", "britain", "britain", "united kingdom", "usa", "england"], "difficult_direct_answer": false, "rationales": ["Their flag has red, straight and diagonal strips against a blue background.", "The flags depicted are obviously of british descent.", "The flags are from that country."], "image": "train2014/COCO_train2014_000000489469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575826, "question_id": "8MkBSSkZujbUA6v9RpyMTb", "question": "What does the logo of the automobile company represent?", "choices": ["windmill", "steering wheel", "daimler engines", "peace sign"], "correct_choice_idx": 2, "direct_answers": ["mercedes-benz", "mercedes benz", "sponsorship", "mercedes benz", "thirds", "mercedes benz", "mercedes benz", "mercedes", "mercedes benz", "daimler engines"], "difficult_direct_answer": false, "rationales": ["A man is playing tennis and the mercedes logo is on the wall behind him.", "There is a blue sign behind the player. it is a mercedes which is a car company that uses an engine for power.", "A mercedes logo is on a sign. mercedes is made by daimler."], "image": "train2014/COCO_train2014_000000575826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142666, "question_id": "8NBUo6iJpixQtLfBRaepnN", "question": "What does the plastic do here?", "choices": ["protects", "heats", "tricks", "nothing"], "correct_choice_idx": 0, "direct_answers": ["protect", "shield rain", "protects", "protect", "protection", "keep dry", "protects", "keep dry", "shield rain", "keep dry"], "difficult_direct_answer": false, "rationales": ["The plastic coat is made to protect rain.", "The plastic keeps the rain and water off the man.", "The folks are trying to stay dry though it's raining."], "image": "train2014/COCO_train2014_000000142666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414698, "question_id": "8NF4hupZodZ3am5JsQ3BMN", "question": "What style will this player return the ball in?", "choices": ["two handed", "backhand", "he won't", "forehand"], "correct_choice_idx": 1, "direct_answers": ["backhand", "backhand", "hard serve", "backhand", "backhand", "backhand", "hard serve", "backhand", "hard serve", "backhand"], "difficult_direct_answer": false, "rationales": ["His hand is in the back of his body.", "The man will make sure he hits the ball with a backhand serve.", "The top of the players hand is facing the ball."], "image": "val2014/COCO_val2014_000000414698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291630, "question_id": "8NXmBMXGYWkZJiS3esLERE", "question": "How many legs is this insect known to have?", "choices": ["six", "eight", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["eight", "eight", "eight", "six", "eight", "four", "eight", "six", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["This is a tarantula which is known to have eight legs.", "Tarantulas have eight legs.", "The spider has eight legs."], "image": "train2014/COCO_train2014_000000291630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241170, "question_id": "8NeSsKuhKQvq4JeeTBnvdB", "question": "What kind of bird flies over the boys head?", "choices": ["bald eagle", "gull", "chicken", "pigeon"], "correct_choice_idx": 1, "direct_answers": ["gull", "seagull", "sea gull", "seagull", "seagull", "swan", "seagull", "hawk", "seagull", "seagull"], "difficult_direct_answer": false, "rationales": ["A seagull is flying in the sky.", "A large white bird is flying above a kids head at the beach. gulls are common at the beach.", "A gull is white in color."], "image": "train2014/COCO_train2014_000000241170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179153, "question_id": "8NnSfTF6xHEYqQJw8PC82J", "question": "Why is the woman in the white jacket wearing a helmet?", "choices": ["dress code", "for fun", "style", "protection"], "correct_choice_idx": 3, "direct_answers": ["safety", "safety", "safety", "protect head", "safety", "protection", "protection", "safety", "safety", "for protection"], "difficult_direct_answer": false, "rationales": ["Helmets are used for protection from head damage.", "The woman is skiing and skiers often wear safety helmets in case of an accident.", "The purpose of wearing a helmet is for protection; you never know when you'll need protection."], "image": "train2014/COCO_train2014_000000179153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392949, "question_id": "8P44PmAkZLi47UEyjbQZPZ", "question": "In which country could you find this street?", "choices": ["holland", "belgium", "usa", "canada"], "correct_choice_idx": 2, "direct_answers": ["usa", "united states", "usa", "usa", "america", "america", "america", "usa", "usa", "usa"], "difficult_direct_answer": false, "rationales": ["The cars are being driven on the right side and the license plates look like they're american.", "American cars can be seen parked up and down a residential street.", "The country is the usa."], "image": "train2014/COCO_train2014_000000392949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309517, "question_id": "8PAnwxqjCBYdsyRMSENwVm", "question": "What time of day is it?", "choices": ["early", "mid day", "noon", "late"], "correct_choice_idx": 3, "direct_answers": ["evening", "sun down", "evening", "dusk", "dusk", "dusk", "sunset", "dusk", "late", "sunset"], "difficult_direct_answer": false, "rationales": ["It looks like it is sunset.", "The sun appears orientated with the horizon in this way twice a day so either answer a or b could be possible, but it is dark enough to likely be answer a.", "The sun is setting."], "image": "train2014/COCO_train2014_000000309517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483475, "question_id": "8PCnuJKoVFjaRN4LVwaKGz", "question": "What is the man to the right directing?", "choices": ["sheep", "planes", "dogs", "cars"], "correct_choice_idx": 1, "direct_answers": ["plane", "planes", "plane", "traffic", "airline traffic", "airplanes", "airplane", "planes", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["Aside from humans, there are no animals present. the vehicles fly and do not drive on roads.", "The man is on a tarmac with airplanes so it's safe to assume that's what he's directing.", "There is a large plane."], "image": "val2014/COCO_val2014_000000483475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251111, "question_id": "8PDMYViTtVE3HMYQMoQRNk", "question": "The round object on the right table would be useful in which class?", "choices": ["geometry", "biology", "geography", "geology"], "correct_choice_idx": 2, "direct_answers": ["geography", "geography", "geography", "geography class", "geography", "geography class", "geography", "geography", "geography class", "geography"], "difficult_direct_answer": false, "rationales": ["The object is a globe which shows all of the countries of the world.", "It might also help a bit with the other ones, but it would be most useful with a.", "There is a globe on the table. it shows the world."], "image": "train2014/COCO_train2014_000000251111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356286, "question_id": "8PHRGQ9Bd4ZruS2KkRhLkg", "question": "What sort of shop is this?", "choices": ["motorcycle sales", "car sales", "used car", "motorcycle repair"], "correct_choice_idx": 0, "direct_answers": ["motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle sales", "motorcycle", "motorcycle", "motorbike shop", "motorbike shop", "motorcycle"], "difficult_direct_answer": false, "rationales": ["There are bikes on a pedestal for viewing inside a showroom.", "They have many different ones on display in a room", "There are many different bikes on display with information next to each"], "image": "train2014/COCO_train2014_000000356286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111799, "question_id": "8PkX7s7TN2SkjavB3XDbRg", "question": "How many colors of ring does players have?", "choices": ["four", "six", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["one", "three", "three", "three", "two", "three", "two", "three", "three", "one"], "difficult_direct_answer": false, "rationales": ["The players have a yellow, a blue, and a green ring.", "There are blue, green and yellow.", "The rings are yellow, blue, and green."], "image": "train2014/COCO_train2014_000000111799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116096, "question_id": "8PzFQG7AgxTPS3FBNbzVgK", "question": "Where do the boards in this picture go while being used?", "choices": ["kitchen", "ocean", "your car", "air"], "correct_choice_idx": 1, "direct_answers": ["beach", "ocean", "ocean", "ocean", "usa", "water", "ocean", "ocean", "ocean", "water"], "difficult_direct_answer": false, "rationales": ["These are all used in the ocean in the waves.", "The boards are going to be used for riding in the water.", "The surfboards on the ground are going to be used in the ocean."], "image": "val2014/COCO_val2014_000000116096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228234, "question_id": "8PzjFe23rABs29DnSgT38S", "question": "What is teeth of the elephant?", "choices": ["trunks", "tusks", "skin", "tongue"], "correct_choice_idx": 1, "direct_answers": ["ivory", "ivory", "ivory", "hidden", "big", "tusks", "trunk", "in mouth", "tusks", "tusks"], "difficult_direct_answer": false, "rationales": ["Elephants have long tapering objects coming out of their mouth.", "The teeth of elephants are called tusks.", "An elephant's teeth are called tusks."], "image": "train2014/COCO_train2014_000000228234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474253, "question_id": "8QX5bYWZafg2YdFmY8Xs3G", "question": "What event is this horse rider participating in?", "choices": ["riding lesson", "travelling", "patrolling", "horse racing"], "correct_choice_idx": 3, "direct_answers": ["racing", "steeplechase", "polo", "equestrian", "race", "horse racing", "race", "horse racing", "race", "race"], "difficult_direct_answer": false, "rationales": ["The rider of the horse is wearing a bib with a number which implies they are racing other riders.", "A rider is on a horse with a number on their chest.", "The rider is wearing a jersey with a number on it and there are spectators"], "image": "val2014/COCO_val2014_000000474253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92339, "question_id": "8QYunj2J5ivTJFECFjmbU4", "question": "Those horsemen work for which entity?", "choices": ["german government", "spanish government", "british government", "belgian government"], "correct_choice_idx": 2, "direct_answers": ["rome", "military", "royal", "roman legion", "british government", "royal majesty", "military", "royal family", "royal guard", "military"], "difficult_direct_answer": false, "rationales": ["The horsemen work for the british.", "You can tell by what they are wearing who they work for.", "The feathers and the armor on the men are very representative of the britain government. there are also flags that are representative of the uk."], "image": "val2014/COCO_val2014_000000092339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154895, "question_id": "8QgCDiXajfGx74Jpnz2znB", "question": "What has made the sandwiches to look shiny?", "choices": ["saran wrap", "glass", "icing", "butter"], "correct_choice_idx": 0, "direct_answers": ["plastic bags", "plastic wrap", "saran wrap", "plastic", "plastic wrap", "saran wrap", "saran wrap", "cellophane wrap", "plastic wrap", "plastic bag"], "difficult_direct_answer": false, "rationales": ["It's the plastic used to preserve the food.", "Saran wrap is wrapped around the sandwich.", "The saran wrap is shiny."], "image": "train2014/COCO_train2014_000000154895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105762, "question_id": "8QxSxEV9uoA5aygsUJMXuF", "question": "What is the white substance in the middle of the plate used for?", "choices": ["rubbing", "dipping", "pasting", "drinking"], "correct_choice_idx": 1, "direct_answers": ["dipping", "dressing", "dipping", "dipping", "dip", "dipping", "dipping", "dip", "dipping", "dip"], "difficult_direct_answer": false, "rationales": ["The ranch dip in the middle is for the veggie dippers.", "The plate consisted of plane vegetables and some dip to provide some variety in flavor for the vegetables. this is common for vegetable platters.", "Generally the substance in the middle is used for dipping you veggies into it."], "image": "train2014/COCO_train2014_000000105762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113415, "question_id": "8QxXGJdfLrUS7eDGsWFArj", "question": "What weather event happened recently?", "choices": ["rain", "hail", "flash flood", "snow"], "correct_choice_idx": 3, "direct_answers": ["show", "snow", "snow", "snow", "sunrise", "snow", "snow", "snow", "snow", "snowstorm"], "difficult_direct_answer": false, "rationales": ["White flakes can be seen on the ground.", "There is white on the flowers.", "You can see the sprinkling of light snowflakes on the ground."], "image": "train2014/COCO_train2014_000000113415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477919, "question_id": "8QyZRwGRJaPwUrxJjLjhhg", "question": "What type environment are these fruits grown in?", "choices": ["tropical", "desert", "tundra", "temperate"], "correct_choice_idx": 0, "direct_answers": ["warm", "tropical", "tropical", "tropical", "warm", "tropical", "tropical", "tropical", "warm", "forest"], "difficult_direct_answer": false, "rationales": ["Bananas are the fruit and bananas are grown areas with hot, humid weather.", "Bananas grow in the tropics.", "These have to be grown in a tropical environment."], "image": "val2014/COCO_val2014_000000477919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557944, "question_id": "8R9KkhS6oh73tewUiu8UH7", "question": "The man with the bat is from what country?", "choices": ["italy", "japan", "taiwan", "mexico"], "correct_choice_idx": 1, "direct_answers": ["usa", "japan", "mexico", "united states", "japan", "usa", "japan", "japan", "spain", "japan"], "difficult_direct_answer": false, "rationales": ["The man with the bat is playing for a japanese team.", "Their last name is of japanese descent.", "Ichiro is probably the most famous baseball player to ever come from japan."], "image": "train2014/COCO_train2014_000000557944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209763, "question_id": "8RLGURBPUUxsnzUfr2HUUy", "question": "What kind of person does this statue depict?", "choices": ["military general", "spiritual leader", "greek deity", "ceo"], "correct_choice_idx": 1, "direct_answers": ["visually impaired", "grandi", "spiritual leader", "monk", "gahndi", "monk", "mahatma gandhi", "grandi", "peasant", "indian"], "difficult_direct_answer": false, "rationales": ["The person is a well known figure based on his defining features. that figure is known to be associated with answer a.", "This is a statue of ghandi, who was a spiritual leader.", "The man is mahatma gandhi."], "image": "val2014/COCO_val2014_000000209763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501898, "question_id": "8RSSWti7zSgDc3RHJ7Geom", "question": "Whys is she holding the racquet like that?", "choices": ["threatening", "brocken racquet", "hitting ball", "angry"], "correct_choice_idx": 2, "direct_answers": ["hitting ball", "being serious", "hitting ball", "talking", "preparing", "warming up", "checking it", "hitting ball", "hit ball", "tennis"], "difficult_direct_answer": false, "rationales": ["She is threatening her opponent.", "The person is holding the racket in front of their face which would be consistent in tennis on a following through to a swing. players in tennis swing in order to hit the ball.", "She just returned the ball to the other side of the court"], "image": "val2014/COCO_val2014_000000501898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381123, "question_id": "8S9vHffJZYWrYJJ4qabEN4", "question": "What method do these vessels shown here normally gain movement?", "choices": ["motors", "wind", "none", "oars"], "correct_choice_idx": 3, "direct_answers": ["water current", "rowing them", "oar", "paddle", "oars", "paddles", "paddling", "rowing", "paddle", "paddles"], "difficult_direct_answer": false, "rationales": ["Traditionally oars are used to move small non motor boats through water.", "Kayaks are paddled by either one or two passengers.", "Kayaks are lined up in the sand on a beach. kayaks are propelled by a person paddling."], "image": "val2014/COCO_val2014_000000381123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444018, "question_id": "8Sfixtzy7BqDUMMxuqsxtj", "question": "What is the vehicle following when in motion?", "choices": ["sun", "tracks", "roads", "police"], "correct_choice_idx": 1, "direct_answers": ["tracks", "tracks", "tracks", "track", "tracks", "track", "train tracks", "train track", "tracks", "tracks"], "difficult_direct_answer": false, "rationales": ["The tracks are visible", "The vehicle is a train and tracks can be seen in front of and behind the train.", "Trains are unable to move unless they have a metal structure to run on."], "image": "train2014/COCO_train2014_000000444018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10393, "question_id": "8Sr9A8MMQPVwMVoZA6MXT2", "question": "What was the original name of the restaurant?", "choices": ["insta-burger king", "burger queen", "burger express", "burger pronto"], "correct_choice_idx": 0, "direct_answers": ["insta-burger king", "lucy", "insta-burger king", "insta-burger king", "insta-burger king", "insta-burger king", "insta-burger king", "burger king", "insta-burger king", "burger king"], "difficult_direct_answer": false, "rationales": ["At one point burger king was known as instaburger king.", "The name was insta burger king.", "It use to be called insta-burger king then they dropped that insta part."], "image": "val2014/COCO_val2014_000000010393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111591, "question_id": "8SzL5adkNN9nBZHENAGfSL", "question": "Where are the birds going?", "choices": ["beneath mountains", "around mountains", "ocean", "over mountains"], "correct_choice_idx": 3, "direct_answers": ["south", "towards trees", "up", "mountain", "flying", "south", "up", "south", "south", "over mountains"], "difficult_direct_answer": false, "rationales": ["This is a mountain scene and the birds are flying over.", "They are flying in the sky over the mountains.", "You can tell by the position and height, the birds seem to be flying over the mountains."], "image": "train2014/COCO_train2014_000000111591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513274, "question_id": "8ToZKCJKCdBRuwLdSk5P9S", "question": "The device connected to the silver laptop is doing what activity to it?", "choices": ["charging", "formatting it", "cooling", "backing up"], "correct_choice_idx": 0, "direct_answers": ["wifi", "charging", "charging", "charging", "moving mouse", "charging", "game", "charging", "working", "charging it"], "difficult_direct_answer": false, "rationales": ["The device connected to the laptop is helping it get a battery charge.", "The cable looks like a usb cable designed to charge the computer.", "The device is charging it."], "image": "train2014/COCO_train2014_000000513274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159030, "question_id": "8TseEbXJzJvMqJw8wq2eKS", "question": "What holds the rocks together?", "choices": ["tar", "nails", "steel", "mortar"], "correct_choice_idx": 3, "direct_answers": ["cement", "concrete", "mortar", "mortar", "cement", "grout", "mortar", "roof", "concrete", "glue"], "difficult_direct_answer": false, "rationales": ["The rocks are held by mortar.", "A rock fireplace is generally held together by mortar.", "A stone fireplace is shown. stone is held together with mortar."], "image": "val2014/COCO_val2014_000000159030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35156, "question_id": "8U2fzvgVYRcDEKyWghA9PZ", "question": "This animal is usually found where?", "choices": ["farm", "horse stall", "house", "pig sty"], "correct_choice_idx": 0, "direct_answers": ["pastures", "pasture", "pasture", "farm", "farm", "farm", "pasture", "farm", "farm", "farm"], "difficult_direct_answer": false, "rationales": ["Lambs are generally on farms.", "Sheeps often live on farms.", "Sheep are usually found on a farm."], "image": "train2014/COCO_train2014_000000035156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324400, "question_id": "8U94bcLwYmkm8JFefxijDP", "question": "What action is the man wearing blue hat doing?", "choices": ["crouching", "sitting", "standing", "kneeling"], "correct_choice_idx": 0, "direct_answers": ["catching", "crouching", "catching", "catching ball", "catching", "catching", "catching ball", "catching ball", "catching", "catching"], "difficult_direct_answer": false, "rationales": ["He is bent over so he can catch the ball if it goes to him.", "The man is crouching.", "The catcher is crouching behind the batter so he is positioned to snag the baseball if it is not struck by the batter,."], "image": "train2014/COCO_train2014_000000324400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203905, "question_id": "8ULMFifYR9gFCAxUi3RjsX", "question": "What allows these people to move passively?", "choices": ["holding cable", "ski lift", "skiing downhill", "lift ticket"], "correct_choice_idx": 0, "direct_answers": ["inertia", "wire", "rope lift", "skis", "pull rope", "cable", "rope", "holding cable", "rope", "gravity"], "difficult_direct_answer": true, "rationales": ["The people are all gripping the cable rope.", "The people are working together.", "The people have their hands on the rope which is the cable."], "image": "train2014/COCO_train2014_000000203905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454810, "question_id": "8UP3Cmu3PxmHNZ7GEbsb6G", "question": "What is being done to the objects in the sink?", "choices": ["waxing", "breaking", "painting", "cleaning"], "correct_choice_idx": 3, "direct_answers": ["washing", "being washed", "being washed", "drying", "cleaning", "nothing", "wash", "rinsing", "washing", "cleaned"], "difficult_direct_answer": false, "rationales": ["There are some dishes sitting at the bottom of the sink. they need to be washed and put up.", "The dishes are in the sink so they can be washed.", "Traditionally you put dirty dishes in the sink to be cleaned."], "image": "val2014/COCO_val2014_000000454810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156204, "question_id": "8UYMmVjbmLzYFqiwBgLjxx", "question": "What type of plane is being boarded?", "choices": ["747", "jet", "helicopter", "pontoon"], "correct_choice_idx": 3, "direct_answers": ["seaplane", "water plane", "sea plane", "sea", "seaplane", "seaplane", "passenger plane", "pontoon", "water plane", "seaplane"], "difficult_direct_answer": false, "rationales": ["The type of cessna plane being boarded is called a pontoon with skis.", "The plane can go on the water.", "Pontoon planes are very small."], "image": "train2014/COCO_train2014_000000156204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41913, "question_id": "8UkhoyaKHWcCKsd7Jxucj9", "question": "Why are the bananas laying out on the blanket?", "choices": ["to clean", "to dry", "to sell", "to eat"], "correct_choice_idx": 2, "direct_answers": ["to sell", "display", "sale", "being sold", "to sell", "for sale", "to sell", "display", "being sold", "attract sales"], "difficult_direct_answer": false, "rationales": ["The bananas are for sale and there is a sign on the back that says how much they cost.", "The bananas are being sold at a market.", "The bunches of bananas are laid out on the blanket because they are for sale."], "image": "train2014/COCO_train2014_000000041913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485689, "question_id": "8UovN5ppPWGqa2SUK5kjnn", "question": "What is the man who is squatting prepared to do?", "choices": ["dive", "swim", "sing", "catch"], "correct_choice_idx": 3, "direct_answers": ["catch", "catch", "catch ball", "catch ball", "catch ball", "catch", "catch ball", "catch", "catch ball", "catch pitch"], "difficult_direct_answer": false, "rationales": ["He has his mitt extended in case the ball comes to him.", "The job of the man between the umpire and the hitter is to catch the ball if the hitter doesn't make contact with the ball.", "The man is playing a defensive position in baseball, not swimming, diving, or singing."], "image": "train2014/COCO_train2014_000000485689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67002, "question_id": "8UxS7zH6hKD2ZCwzSUxHfB", "question": "Where is this scene likely to take place?", "choices": ["garage", "farmer's market", "airport", "mall"], "correct_choice_idx": 3, "direct_answers": ["bazar", "food market", "market", "trade show", "super market", "mall", "trade fair", "food conference", "market", "shop"], "difficult_direct_answer": true, "rationales": ["These booths are likely located in an indoor retail mall.", "There are food stands.", "It is a stall inside a building"], "image": "val2014/COCO_val2014_000000067002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384078, "question_id": "8VSSNP7hm6qwfcxhyNRVLV", "question": "What Pantone is Coca Cola red?", "choices": ["pms484", "pms492", "pms112", "pms452"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "umbrella", "red", "umbrella", "umbrella", "pms484", "umberalla", "pas 484", "umbrella", "red hex"], "difficult_direct_answer": false, "rationales": ["According to google the pantone color for coke is pms-484.", "The pms484 pantone is used.", "Pas 484 is coca cola red."], "image": "train2014/COCO_train2014_000000384078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140543, "question_id": "8VTZi8QjXzYtQ3E5pdeBPw", "question": "These animals usually live where?", "choices": ["cave", "underground hole", "pasture", "tundra"], "correct_choice_idx": 2, "direct_answers": ["farm", "farms", "cows", "pasture", "farm", "pasture", "pasture", "farm", "farm", "farm"], "difficult_direct_answer": false, "rationales": ["The animals are on a grassy pasture.", "The land is a flat area covered with grass or low plants suitable for grazing animals.", "They like to graze on the grass in one."], "image": "train2014/COCO_train2014_000000140543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395665, "question_id": "8VTsJwuL9t5bvUX2fDRgu3", "question": "What does the woman stand on here?", "choices": ["concrete", "grass", "clay", "macadam"], "correct_choice_idx": 2, "direct_answers": ["playing tennis", "clay", "ground sand", "tennis court", "clay", "sand", "tennis court", "dirt", "clay", "clay"], "difficult_direct_answer": false, "rationales": ["This is a tennis court. it is red in color.", "The other options are obviously not in this image. a is often used for courts.", "The woman is on a red tennis court."], "image": "val2014/COCO_val2014_000000395665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109011, "question_id": "8ViMRXp6WVUQhtvhgvffU4", "question": "What is the white stuff on the food?", "choices": ["sour cream", "cream", "whipping cream", "ice-cream"], "correct_choice_idx": 1, "direct_answers": ["whipped cream", "cream", "whipped cream", "frosting", "whipped cream", "frosting", "whipped cream", "whipped cream", "whipped cream", "whipped cream"], "difficult_direct_answer": false, "rationales": ["It looks like the form of ice cream.", "This is a cream puff and the item in the middle is cream.", "The dessert has a whipped topping from a can sprayed onto it."], "image": "train2014/COCO_train2014_000000109011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493072, "question_id": "8VxbGt2PLQA57a4NqjP2TF", "question": "What is the main purpose of the train shown?", "choices": ["work commuting", "rush hour", "pleasure", "freight"], "correct_choice_idx": 2, "direct_answers": ["fun", "fun", "fun", "entertainment", "ride", "pleasure", "family fun", "carry children", "fun", "entertain kids"], "difficult_direct_answer": false, "rationales": ["The train is a small version of a real train and is meant to entertain kids.", "This train is for fun.", "Small children ride on a small train."], "image": "train2014/COCO_train2014_000000493072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123797, "question_id": "8VzgWQSzoVudcuvpZ2GQmi", "question": "The person is riding what?", "choices": ["bike", "horse", "camel", "wave"], "correct_choice_idx": 3, "direct_answers": ["surfboard", "surfboard", "bodyboard", "water ski", "boogy board", "wave", "surfboard", "surfboard", "surfboard", "body board"], "difficult_direct_answer": false, "rationales": ["The person is above water. there are no animals or vehicles.", "Even people who have never been in or near water know what a wave is. besides, animals are not generally seen in water and it's impossible to ride a bike in water.", "The person is high on a surfing wave."], "image": "train2014/COCO_train2014_000000123797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313166, "question_id": "8WCNFXFKXASHvQ9M84yDeW", "question": "What type trees are visible on this mountain?", "choices": ["orange", "evergreen", "plastic", "deciduous"], "correct_choice_idx": 1, "direct_answers": ["pine", "fir", "evergreen", "evergreen", "evergreen", "evergreen", "evergreen", "pines", "pine tree", "evergreens"], "difficult_direct_answer": false, "rationales": ["The trees are evergreen pines.", "The shape and color gives away these types of trees. also these types of trees remain green all year long.", "A are the ones that can survive and stay green in this type of environment."], "image": "val2014/COCO_val2014_000000313166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147896, "question_id": "8WjjcirkAGFApLAwEYsoPh", "question": "Which television character is the same color as this truck?", "choices": ["uncle fester", "barney", "lamb chop", "popeye"], "correct_choice_idx": 1, "direct_answers": ["barney", "barney", "teletubbies", "barney", "barney", "barney", "barney", "barney", "grimace", "grimace"], "difficult_direct_answer": false, "rationales": ["The purple truck and barney are the same color.", "He's a big purple dinosaur.", "He's a big purple dinosaur"], "image": "train2014/COCO_train2014_000000147896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196035, "question_id": "8Wjzh5V4bdXy7FjaCEvYMw", "question": "What protection does an umbrella offer here?", "choices": ["locusts", "nuclear fallout", "rain", "sun shade"], "correct_choice_idx": 3, "direct_answers": ["shade", "sun", "sun", "sun", "shade", "shade", "sun shade", "sun", "sunny heat", "shade"], "difficult_direct_answer": false, "rationales": ["It is not raining, and there are no locusts. umbrellas would not do much to stop nuclear fallout.", "It is not visibly raining so umbrella usage would commonly be for the purposes of answer a when there is not rain.", "The umbrella is giving shade to the people from the bright sun."], "image": "train2014/COCO_train2014_000000196035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140092, "question_id": "8WpVyyo4f5xCqiVASSLNMU", "question": "Why is she holding the newspaper?", "choices": ["showing off", "taking away", "helping read", "selling it"], "correct_choice_idx": 2, "direct_answers": ["read", "news", "patient frailty", "giving", "helping read", "reading", "reading assistance", "for patient", "he's reading", "patient care"], "difficult_direct_answer": true, "rationales": ["The man is in a hospital bed so he can't hold the newspaper himself.", "A nurse is holding a paper up to a patient who is leaned forward and looking at it.", "The man is reading."], "image": "train2014/COCO_train2014_000000140092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219200, "question_id": "8WyhegHR8gBmArFvkLe46Q", "question": "What has the man on his stomach done?", "choices": ["good deeds", "nothing", "donations", "crime"], "correct_choice_idx": 3, "direct_answers": ["crime", "crime", "being arrested", "commit crime", "crime", "resisting arrest", "something illegal", "criminal offence", "arrested", "resisted arrest"], "difficult_direct_answer": false, "rationales": ["The police are trying to handcuff him.", "The man has committed crime.", "The man is being handcuffed which means he has committed a crime."], "image": "train2014/COCO_train2014_000000219200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304493, "question_id": "8X2dRJYsXeM5DMcuWdZpKE", "question": "Why does he have his arm out?", "choices": ["for balance", "to signal", "to catch", "to wave"], "correct_choice_idx": 2, "direct_answers": ["throwing frisbee", "throwing frisbee", "throwing frisbee", "catch frisbee", "throwing frisbee", "throwing frisbee", "catch frisbee", "to catch", "catch frisbee", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["There is an item approaching the man, and his hand is opened towards it.", "The frisbee is coming towards him and he's reaching to grab it.", "There's an object flying towards him so he's clearly holding his arm out in order to catch the object instead of letting it hit him."], "image": "train2014/COCO_train2014_000000304493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81102, "question_id": "8X3tZbdATbvwXjwZ9upMXH", "question": "What might be happening on this street?", "choices": ["parade", "fire", "sale", "rush hour"], "correct_choice_idx": 0, "direct_answers": ["parade", "police activity", "emergency", "road work", "emergency", "marathon", "emergency", "parade", "event", "emergency situation"], "difficult_direct_answer": false, "rationales": ["People shouldn't be parking on a street when there is an event on that street.", "An area of the street is marked off to clock traffic.", "There could be a parade since the parking meters are all covered up."], "image": "train2014/COCO_train2014_000000081102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400039, "question_id": "8XczkvDKVHtH5KnT6cnuKE", "question": "What superhero's logo is printed on a wallet in front of the Visa sign?", "choices": ["wonder woman", "green lantern", "superman", "batman"], "correct_choice_idx": 2, "direct_answers": ["superman", "hellboy", "superman", "hellboy", "superman", "superman", "superman", "superman", "superman", "superman"], "difficult_direct_answer": false, "rationales": ["The blue, red, and white s is well known to be the symbol for superman, as that appears on his cape.", "One can see the logo for the man of steel on the wallet package.", "Superman's logo is on the wallet's packaging."], "image": "train2014/COCO_train2014_000000400039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263177, "question_id": "8XftVwbK6U7UP6GuSJdamd", "question": "What type of location is this?", "choices": ["suburb", "city", "country", "park"], "correct_choice_idx": 1, "direct_answers": ["city", "times square", "urban", "downtown", "times square", "city", "large city", "city", "city", "time's square"], "difficult_direct_answer": false, "rationales": ["The location is times square which is in new york city.", "Skyscrapers, broadway shows, and a sign for w 39th street are all things you would find near times square in new york.", "There are tall buildings, taxis, and large groups of people."], "image": "val2014/COCO_val2014_000000263177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79460, "question_id": "8XgywgqHuWrzxF36pFB4nn", "question": "What are the objects that are in a line doing?", "choices": ["smoking", "floating", "cooking", "flying"], "correct_choice_idx": 1, "direct_answers": ["docking", "docked", "parked", "sitting", "parking", "parked", "parking", "floating", "floating", "floating"], "difficult_direct_answer": false, "rationales": ["The lined objects are clearly visible and they are in the water but not submerged.", "The objects are floating in the water.", "Boats float."], "image": "train2014/COCO_train2014_000000079460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419143, "question_id": "8XvPgSUxHgQMH8AnArj824", "question": "What venue is shown here?", "choices": ["home", "airplane", "library", "office"], "correct_choice_idx": 0, "direct_answers": ["home", "home", "home", "living room", "home", "cat", "home", "home", "living room", "home"], "difficult_direct_answer": false, "rationales": ["Looks to be in someones home.", "The person is sitting on a couch that would be found in a house.", "The venue is a home."], "image": "val2014/COCO_val2014_000000419143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267411, "question_id": "8Y6ErpmmnxMZeMDwEubhKf", "question": "How many men are wearing a tie?", "choices": ["two", "four", "one", "three"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["Only one is wearing a tie.", "There is only a single guy wearing a tie.", "The other two have their collars somewhat open."], "image": "val2014/COCO_val2014_000000267411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238584, "question_id": "8YCQzE3DexWthpatPEFZBb", "question": "What type of dog is riding with the man?", "choices": ["poodle", "bulldog", "dachshund", "sheep dog"], "correct_choice_idx": 3, "direct_answers": ["collie", "sheep dog", "shepherd", "sheep dog", "border collie", "sheep dog", "collie", "german shephard", "sheep dog", "collie"], "difficult_direct_answer": false, "rationales": ["The dog with the man is surrounded by sheep. the dog with the man has long hair and is big, resembling a sheep dog.", "That dog guides the sheep.", "There is a sheep dog."], "image": "train2014/COCO_train2014_000000238584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135336, "question_id": "8YNQfYgYKWDuKjPdVekkCS", "question": "What are the cement blocks in the sea for?", "choices": ["floaty", "protecting shore", "decoration", "pier"], "correct_choice_idx": 1, "direct_answers": ["lower sea level", "tide protection", "protection", "protecting shore", "barrier", "breakwater", "protection", "tide break", "tide breakers", "erosion control"], "difficult_direct_answer": true, "rationales": ["These help block destructive waves to help the sand stay in the area", "They stop people from boating to shore.", "It's dangerous for people to go beyond the blocks."], "image": "train2014/COCO_train2014_000000135336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70829, "question_id": "8YVr7jb3i3g8CQ4wwdbRCM", "question": "What time of year is it here?", "choices": ["solstice", "fall", "winter", "spring"], "correct_choice_idx": 1, "direct_answers": ["autumn", "autumn", "fall", "autumn", "fall", "autumn", "fall", "fall", "fall", "autumn"], "difficult_direct_answer": false, "rationales": ["As apparent because of the color of the leaves.", "The leaves are changing colors.", "It is fall season because there are red leaves on the trees."], "image": "train2014/COCO_train2014_000000070829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385441, "question_id": "8Z4e6iKksBiohmk56HoBuY", "question": "How might the train's orientation be altered here?", "choices": ["reverse", "rotation", "tornado", "upside down"], "correct_choice_idx": 1, "direct_answers": ["turned", "turn around", "turntable", "turned around", "switch tracks", "spin", "on display", "split tracks", "move bridge", "rotation"], "difficult_direct_answer": true, "rationales": ["The center track moves to align with all the other tracks around it", "The train is on a track that rotates in a circle.", "The center moves to point at different tracks in a circle"], "image": "train2014/COCO_train2014_000000385441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241534, "question_id": "8Z8z3ZeJChhKWABDNgtVAP", "question": "During what time of year is this bus driving around?", "choices": ["fall", "summer", "spring", "winter"], "correct_choice_idx": 2, "direct_answers": ["summer", "summer", "summer", "summer", "summer", "summer", "fall", "spring", "fall", "summer"], "difficult_direct_answer": false, "rationales": ["There are buds on the trees and the grass is beginning to turn green so it's the season before summer.", "The trees are green and the grass is too.", "The time is spring."], "image": "train2014/COCO_train2014_000000241534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314147, "question_id": "8ZJnC8K59XqAgToXk7Zrb7", "question": "What unusual design does his tie have?", "choices": ["optical illusion", "polka dots", "like pixels", "cartoons"], "correct_choice_idx": 2, "direct_answers": ["pixelated", "zigzag", "staggered diagonals", "pixelated", "striped", "8 bit", "pixel", "like pixels", "minecraft", "pixelated"], "difficult_direct_answer": false, "rationales": ["Ties are usually formal. this tie looks like an old video game", "The pattern is pixelated on the tie.", "It looks to have pixels on it."], "image": "val2014/COCO_val2014_000000314147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6595, "question_id": "8ZNqV6eceJVyqcL2FRF3SC", "question": "What are the orange vegetables?", "choices": ["carrot", "yam", "pumpkin", "squash"], "correct_choice_idx": 0, "direct_answers": ["carrots", "carrots", "carrots", "carrots", "carrots", "carrots", "carrots", "carrots", "carrot", "carrots"], "difficult_direct_answer": false, "rationales": ["Those are carrots.", "Mixed vegetables are on a plate and some are orange. carrots are orange.", "There are orange carrots on the plate."], "image": "train2014/COCO_train2014_000000006595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531299, "question_id": "8ZQBpLQRs2pDrvatnxZdUp", "question": "The yellow embedded items along the roadside are meant to help people using it to what?", "choices": ["exit road", "avoid merge", "speed up", "remain straight"], "correct_choice_idx": 3, "direct_answers": ["be safe", "remain straight", "navigate", "go straight", "guide lines", "guide", "crashing", "stay alert", "safety", "identify edge"], "difficult_direct_answer": true, "rationales": ["The yellow items are used to guide people that are traveling straight.", "They are in a continuous line and form a direct path.", "The yellow items act as a guide to help them stay in the middle of the road."], "image": "val2014/COCO_val2014_000000531299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68625, "question_id": "8ZSHkaYZVK6utEy3v2FSdr", "question": "What are these animals known for?", "choices": ["quills", "long necks", "tusks", "horns"], "correct_choice_idx": 1, "direct_answers": ["height", "long necks", "long necks", "long necks", "their height", "being tall", "long necks", "deafness", "necks", "long necks"], "difficult_direct_answer": false, "rationales": ["One can observe the unique anatomy of these animals in the picture.", "The animals are giraffes which have elongated necks.", "The animals in the zoo pen are giraffes known for having long necks."], "image": "train2014/COCO_train2014_000000068625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311309, "question_id": "8ZZbbasZTZKpcVnLiHLrHi", "question": "What level of education have these two achieved?", "choices": ["grade school", "college", "masters", "high school"], "correct_choice_idx": 1, "direct_answers": ["college", "college", "college", "university level", "college", "collegiate", "college", "college", "college", "college"], "difficult_direct_answer": false, "rationales": ["The went to college.", "Both of their shirts have the word \"college\"on them and specifically reference swathmore.", "Swarthmore is a college and it's named on the two individuals' sweaters."], "image": "val2014/COCO_val2014_000000311309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368969, "question_id": "8ZkTzDCeyMgzMbmq69EVzd", "question": "What part of the game of baseball is this person preparing to do?", "choices": ["short stop", "batter", "pitcher", "coach"], "correct_choice_idx": 1, "direct_answers": ["batting", "bat", "bat", "bat", "bat", "bat", "hitting", "batter", "bat", "hitting"], "difficult_direct_answer": false, "rationales": ["The man is getting ready to bat.", "He has a bat in his hand and he's at home plate", "The person is holding a baseball bat and standing at the plate. he is next at bat."], "image": "train2014/COCO_train2014_000000368969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537139, "question_id": "8a2VLpPSY7vGKnHvpD9gAd", "question": "What might cause the property values of the houses to stay low in this location?", "choices": ["wind", "weather", "jobs", "train noise"], "correct_choice_idx": 3, "direct_answers": ["noise", "train noise", "train noise", "noise pollution", "train tracks", "noise", "environment", "train", "noise", "railway"], "difficult_direct_answer": false, "rationales": ["They will stay low being so close to the train track.", "The houses are near the railroads.", "The noise of the trains would be unappealing."], "image": "train2014/COCO_train2014_000000537139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534065, "question_id": "8a8LdvkX8TYe2RX3xJBzxB", "question": "Why is he leaning like that?", "choices": ["having trouble", "maintain balance", "new skateboarder", "is falling"], "correct_choice_idx": 1, "direct_answers": ["turning", "balance", "balance", "balance", "to swear", "turning", "turning", "movement", "turning", "maintain balance"], "difficult_direct_answer": false, "rationales": ["A man is on a skateboard and is moving between cones.", "The man is leaning on his skateboard in order to maintain balance and control.", "He's keeping his balance as he tips the skateboard to make it turn"], "image": "train2014/COCO_train2014_000000534065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143098, "question_id": "8aHCgvW93ZN9wwVsLPkGdf", "question": "What's the area where the man is taking a photo from called?", "choices": ["home base", "bench", "dugout", "stable"], "correct_choice_idx": 2, "direct_answers": ["deck", "batter cage", "batters box", "dugout", "dugout", "dugout", "dugout", "home plate", "dugout", "dugout"], "difficult_direct_answer": false, "rationales": ["The man that is holding a camera is standing in the dugout which is partly underground.", "He is in the dugout where they stay to wait their turn.", "That what they call the area."], "image": "val2014/COCO_val2014_000000143098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9395, "question_id": "8aJxPjT9thovAqfPSvryNo", "question": "What is the quotient of each individual digit shown?", "choices": ["ten", "25", "one", "55"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The numbers are five and five.", "The player's jersey is 55. 5 plus 5 equals 10.", "Five divided by five is one."], "image": "val2014/COCO_val2014_000000009395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238860, "question_id": "8agCKa7Ba7ZCVGgvP4kq9u", "question": "In what year did this soda company resume business in Myanmar?", "choices": ["2020", "2008", "2017", "2012"], "correct_choice_idx": 3, "direct_answers": ["2012", "2012", "2012", "2013", "2007", "2012", "2012", "1990", "2019", "2009"], "difficult_direct_answer": false, "rationales": ["This was after the usa lifted sanctions.", "A coca cola can is on a table people are sitting at.", "According to the internet, after 60 years absent the supplier started again in the year 2012."], "image": "train2014/COCO_train2014_000000238860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339674, "question_id": "8agaiBLXnvF5ZvXDcY9nXm", "question": "Why does the boy have his leg on the tree?", "choices": ["to wipe", "to kick", "to itch", "to climb"], "correct_choice_idx": 3, "direct_answers": ["to climb", "to climb", "climb tree", "stretching", "climbing", "climbing", "climbing", "to climb", "climbing", "to climb"], "difficult_direct_answer": false, "rationales": ["More than likely boys like to climb on trees.", "He wants to get up on the stump.", "The boy wants to get up the tree."], "image": "train2014/COCO_train2014_000000339674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517362, "question_id": "8bEka7pU68eXEVu4wpGWSw", "question": "What do these kids hope for?", "choices": ["school start", "parents warning", "wind", "homework"], "correct_choice_idx": 2, "direct_answers": ["wind", "wind", "flying kites", "wind", "good wind", "kite flying", "wind", "fly kite", "wind", "fun"], "difficult_direct_answer": false, "rationales": ["The kids need wind for their kite.", "It's needed for the kites to stay up.", "The kids want wind."], "image": "train2014/COCO_train2014_000000517362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408016, "question_id": "8bZ8L3JpemAguXWnCSuYN4", "question": "What vehicle is winning the race so far?", "choices": ["tank", "plane", "motorcycle", "boat"], "correct_choice_idx": 2, "direct_answers": ["motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle"], "difficult_direct_answer": false, "rationales": ["A motorcycle is on a street ahead of where a plane flies overhead.", "The airplane is currently behind the bike rider who is in first place.", "There are two vehicles in the photos; one is a motorcycle and the other an airplane. at the is time, the motorcycle is ahead of the plane."], "image": "train2014/COCO_train2014_000000408016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197697, "question_id": "8bqh7zzxokvS3N89CKuf5c", "question": "Where is the woman's foot resting?", "choices": ["pedal", "floor", "sauna", "chair"], "correct_choice_idx": 0, "direct_answers": ["motorcycle pedal", "footrest", "peg", "break", "pedal", "pedals", "pedal", "foot peg", "sport bike", "on pedals"], "difficult_direct_answer": true, "rationales": ["She is on a bike", "She is putting her leg down and foot on one of the bars.", "A woman is on a motorcycle with her foot on small black objects, there are pedals on motorcycles."], "image": "train2014/COCO_train2014_000000197697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7819, "question_id": "8cRUpSKi6a3DdkKLHXVpDk", "question": "Why is the pavement so shiny?", "choices": ["water", "glass", "ice", "plastic"], "correct_choice_idx": 0, "direct_answers": ["wet area", "rainy", "just rained", "raining", "it's wet", "water", "wet", "rain", "wet", "rain"], "difficult_direct_answer": false, "rationales": ["It is raining", "The people are holding umbrellas and it appears to have just rained here.", "It must have just rained."], "image": "train2014/COCO_train2014_000000007819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113905, "question_id": "8cheGvEBxyiPiGfjtL7Cjm", "question": "What is the man likely to add to the hotdogs in this scene?", "choices": ["relish", "condiments", "spicy sauce", "onion"], "correct_choice_idx": 1, "direct_answers": ["ketchup", "ketchup", "ketchup", "ketchup", "condiments", "mustard", "ketchup", "mustard", "ketchup", "condiments"], "difficult_direct_answer": false, "rationales": ["There is ketchup and mustard on the table near the food", "There are bottles of ketchup and mustard next to his plate.", "Answer a frequently is added to hot dogs and they are visible in the scene while currently there is nothing on the hot dogs at all."], "image": "val2014/COCO_val2014_000000113905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194413, "question_id": "8cmKQBpxifpV3tMiV8J3nN", "question": "What are the people laying down on the left side doing?", "choices": ["digging", "playing", "sunbathing", "sleeping"], "correct_choice_idx": 2, "direct_answers": ["sunbathing", "resting", "sunbathing", "sunbathing", "tanning", "sunbathing", "flying kite", "sunbathing", "sunbathing", "sunbathing"], "difficult_direct_answer": false, "rationales": ["This is a beach area where people would go to get some sun and relax. the person lying at the beach hear is hardly wearing any clothing indicating that they want to get some color on their skin.", "The people are laying at the beach and exposing their full bodies to the sun as one would do for answer a.", "They are on a beach on a towel under a sunny day, which is a prime environment to tan under."], "image": "train2014/COCO_train2014_000000194413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61983, "question_id": "8dHRiESpiFKYoshJNiBezL", "question": "Why is the girl hunched over in bed?", "choices": ["feels amused", "feels energized", "feels embarrassed", "feels outgoing"], "correct_choice_idx": 2, "direct_answers": ["watching computer", "watching computer", "scared", "comfortable position", "feels embarrassed", "sleeping", "sad news", "using laptop", "reading computer", "reading"], "difficult_direct_answer": true, "rationales": ["The girl is balled up since she doesn't want to call attention to herself.", "The girl wants to hole up.", "The woman looks contemplative but still amused."], "image": "val2014/COCO_val2014_000000061983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78509, "question_id": "8dJyrFuzDmYjsbQHSdqgeN", "question": "Why has everyone been seated?", "choices": ["paint", "pray", "eat", "work"], "correct_choice_idx": 2, "direct_answers": ["dinner", "eating meal", "dinner time", "to eat", "eating dinner", "dinner time", "dinner", "to eat", "dinnertime", "eat"], "difficult_direct_answer": false, "rationales": ["There is food in front of the people and they're holding cutlery.", "There is food on the table that people are consuming.", "People are gathered for a meal."], "image": "train2014/COCO_train2014_000000078509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16483, "question_id": "8dJyyvVmmyjZJAFjxi9SzG", "question": "In which country does this festival occur?", "choices": ["columbia", "united states", "great britain", "chile"], "correct_choice_idx": 1, "direct_answers": ["united states", "united states", "united states", "usa", "usa", "united states", "united states", "united states", "usa", "america"], "difficult_direct_answer": false, "rationales": ["American flags are everywhere.", "There are multiple american flags visible on a tall structure in the background.", "There are usa flags in the background."], "image": "train2014/COCO_train2014_000000016483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140487, "question_id": "8dQeHnCfE8Tc4jPqL4G5Ab", "question": "How are the men moving around here?", "choices": ["on wheels", "running", "they aren't", "floating"], "correct_choice_idx": 0, "direct_answers": ["quickly", "on wheels", "wheels", "bike walk", "quickly", "swiftly", "on wheels", "on wheels", "wheels", "wheels"], "difficult_direct_answer": false, "rationales": ["The man is using a bike indoors.", "Each man is atop a moving contraption that moves by way of these mechanical movement devices on the bottom.", "The men are on wheels."], "image": "val2014/COCO_val2014_000000140487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460665, "question_id": "8dSzEeLRhUYK86YbC7SvFr", "question": "Why has the man covered his head?", "choices": ["costume", "religion", "fashion", "protection"], "correct_choice_idx": 3, "direct_answers": ["protection", "protection", "protection", "protection", "for warmth", "for protection", "cold", "safety", "stay warm", "safety"], "difficult_direct_answer": false, "rationales": ["He has a helmet on to protect his brain from injury", "It's so his head isn't hurt if he has a bad fall", "The helmet is for safety."], "image": "train2014/COCO_train2014_000000460665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418692, "question_id": "8dWfr8wyjbQJ8eFMNe8Zch", "question": "What time of day is it in this image?", "choices": ["900 pm", "1000 am", "330 pm", "1200 pm"], "correct_choice_idx": 0, "direct_answers": ["sofa", "night", "night", "night", "night", "night", "night time", "900 pm", "at night", "night"], "difficult_direct_answer": false, "rationales": ["It's nighttime out since it's black behind the windows.", "The person is not wearing night clothes or pajamas and that make me assume that it's not daytime, also it's dark outside.", "The time of day is nighttime."], "image": "val2014/COCO_val2014_000000418692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461181, "question_id": "8daoaz7X9czSPNoBh8jS3v", "question": "What are the people on the bench doing?", "choices": ["working", "sleeping", "gaming", "waiting"], "correct_choice_idx": 3, "direct_answers": ["waiting", "waiting", "waiting", "sitting", "waiting", "sitting waiting", "waiting", "reading sitting", "watching", "waiting"], "difficult_direct_answer": false, "rationales": ["The people on the bench want to get on the next train.", "Wherever you see a train station there is a very good chance you'll see people waiting; such is the case here. there is no indication any of the people here are asleep.", "The people are at a train station and are seated until their particular train arrives."], "image": "train2014/COCO_train2014_000000461181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499577, "question_id": "8dgtPsDEknMDFwAMYSkD3i", "question": "What will the racquet be used for?", "choices": ["beat child", "hit ball", "cut grass", "biking"], "correct_choice_idx": 1, "direct_answers": ["tennis", "tennis", "play tennis", "tennis", "tennis", "hit ball", "tennis", "hitting", "tennis", "play tennis"], "difficult_direct_answer": false, "rationales": ["This racquet has a ball in front of it and is looking like it might hit the ball.", "This is a tennis raquet and will be used to hit the ball and play the game.", "The racket is used to strike the round yellow object also located inside of the racket carrier in order to play the sport of tennis."], "image": "val2014/COCO_val2014_000000499577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83935, "question_id": "8dk7juqEnMaYBsJXvPDnat", "question": "What zone are the people in?", "choices": ["business", "shopping", "residential", "tourist"], "correct_choice_idx": 1, "direct_answers": ["downtown", "pedestrian", "city street", "pedestrian", "street", "crossing", "pedestrian", "shopping", "traffic", "sidewalk"], "difficult_direct_answer": false, "rationales": ["People are walking in a busy area with stores all around.", "Multiple stores names can be seen on a blackboard, indicating a district in which shops are situated.", "There are shops around."], "image": "val2014/COCO_val2014_000000083935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531069, "question_id": "8dn24DpxxrSsbercs3qUYE", "question": "How is the transportation method operated?", "choices": ["pedals", "air", "gas", "engine"], "correct_choice_idx": 0, "direct_answers": ["person pedalling", "pedaling", "pedaling", "with legs", "legs", "pedals", "riding", "cycle", "pedaling", "riding"], "difficult_direct_answer": false, "rationales": ["People use their legs to move the bicycle rather than some other source of external energy.", "The bike is operated by the man pushing on the pedals.", "The bike uses pedals."], "image": "val2014/COCO_val2014_000000531069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407522, "question_id": "8dvREwMUNYfm8c3LRfBvXN", "question": "Why are all the people in front?", "choices": ["waiting turns", "are lost", "spectators", "competitors"], "correct_choice_idx": 2, "direct_answers": ["witnessing skiing", "spectators", "watching tricks", "to spectate", "spectators", "spectating", "watching show", "watching competition", "performing", "watching"], "difficult_direct_answer": true, "rationales": ["They are watching skiers competing.", "They are behind a bar, watching the professionals in competition.", "They are the audience that is watching."], "image": "train2014/COCO_train2014_000000407522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577879, "question_id": "8dzuKiQqRt9q3bm5sJJ8hS", "question": "Which AEW wrestler is most likely to be from the continent where the symbols on the kite come from?", "choices": ["orange cassidy", "malakai black", "miro", "riho"], "correct_choice_idx": 3, "direct_answers": ["tax", "unknown", "satoshi kojima", "japan", "riho", "asia", "unknown", "riho", "steve austin", "american continent"], "difficult_direct_answer": false, "rationales": ["Riho is more likely to be from asia.", "The symbols are japanese. riho is from japan.", "The name is riho."], "image": "train2014/COCO_train2014_000000577879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580108, "question_id": "8e2tZAWVCiztKkpfcofE3s", "question": "What kind of shop is shown in the background?", "choices": ["car dealer", "department store", "bar", "grocery store"], "correct_choice_idx": 2, "direct_answers": ["bar", "liquor store", "bar", "eatery", "bar", "bar", "bar", "motorcycle", "bar", "restaurant"], "difficult_direct_answer": false, "rationales": ["The place sells coors light which is a beer.", "The area seems to be full of people who have come to have drinks with their bikes.", "There is a sign in the window for the business advertising beer."], "image": "val2014/COCO_val2014_000000580108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128647, "question_id": "8eLcMqE2gtgYSy8iusK7ue", "question": "How do these people know each other?", "choices": ["classmates", "acquaintances", "family", "coworkers"], "correct_choice_idx": 2, "direct_answers": ["family", "family", "friends", "family", "friends", "family", "family", "family", "friends", "family"], "difficult_direct_answer": false, "rationales": ["They look like they're related.", "The people appear to be in a house together and have similar looking faces. people familiar with each other, resembling each other and appearing in the same house are most commonly family members.", "The people are in a very familiar, comfortable position indicating they are probably related to each other."], "image": "train2014/COCO_train2014_000000128647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457437, "question_id": "8eMP6h85ZB28yUYtMUAc4L", "question": "What happens in the ball goes over the yellow barrier?", "choices": ["walk", "strike", "home run", "run"], "correct_choice_idx": 2, "direct_answers": ["home run", "home run", "home run", "home run", "homerun", "homerun", "home run", "foul", "home run", "home run"], "difficult_direct_answer": false, "rationales": ["That will be a goal.", "It can't be easily retrieved so the batter gets to touch all 4 bases to score", "The person will make a homerun."], "image": "train2014/COCO_train2014_000000457437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337915, "question_id": "8eNMJUPVjuPXiUhdkyLGtX", "question": "What kind of light are they using?", "choices": ["flashlight", "sunlight", "floodlight", "solar light"], "correct_choice_idx": 2, "direct_answers": ["stadium light", "light bulbs", "flood light", "spot light", "stage light", "spot", "floodlight", "street light", "arena lights", "street light"], "difficult_direct_answer": true, "rationales": ["The light outline is visible and based on its configuration, it is not natural light and instead one similar to answer a.", "It's a type of spotlight or flood light to light up a smaller area.", "It's the only lighting around."], "image": "train2014/COCO_train2014_000000337915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314993, "question_id": "8edeVxFr2WjD3i8Zsgc3qJ", "question": "What do they hope you will do after you rest?", "choices": ["leave", "go jogging", "buy soda", "help them"], "correct_choice_idx": 2, "direct_answers": ["shop", "relax", "buy soda", "sit", "relax", "buy items", "shop", "relax", "buy something", "finish shopping"], "difficult_direct_answer": false, "rationales": ["The bench is placed next to a soda pop display so they would really like it if you bought some.", "Buy soda from the sodas behind.", "It is surrounded by pop so you buy it."], "image": "train2014/COCO_train2014_000000314993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111131, "question_id": "8epdVbrswgFCGG5vEr4mr8", "question": "What is the most likely location for all of the dogs to be at?", "choices": ["refuge", "local park", "dog pound", "dog park"], "correct_choice_idx": 3, "direct_answers": ["dog park", "field", "park", "park", "park", "dog park", "yard", "dog park", "dog home", "home"], "difficult_direct_answer": false, "rationales": ["Dogs play at the park.", "There are a number of dogs here drinking water in a grassy and open area. this place is most likely a dog park.", "There is a bowl of dog food so they are not at a local park and the scenery resembles a park."], "image": "train2014/COCO_train2014_000000111131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237964, "question_id": "8eqMQK9AufKr9aJ3ZKduyG", "question": "What type of sign is this?", "choices": ["direction", "warning", "regulatory", "sale"], "correct_choice_idx": 0, "direct_answers": ["direction", "direction sign", "directional sign", "direction", "direction", "direction", "location", "directional", "direction", "direction"], "difficult_direct_answer": false, "rationales": ["The sign points to a certain direction and says to do a certain way.", "The sign is shaped like an arrow and tells drivers which direction to go.", "The sign is shaped like an arrow for directions."], "image": "val2014/COCO_val2014_000000237964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239656, "question_id": "8er2cQ8U4G8yu27o82REPD", "question": "What is the slowest vehicle here?", "choices": ["car", "excavator", "bike", "scooter"], "correct_choice_idx": 2, "direct_answers": ["sitting", "bicycle", "bicycles", "walking", "bike", "bicycle", "bike", "truck", "bicycle", "bike"], "difficult_direct_answer": false, "rationales": ["A bike would go slower then the cars.", "There are cars and trucks in the photo but the bikes are the slowest of them all.", "The bikes are slow."], "image": "val2014/COCO_val2014_000000239656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98871, "question_id": "8fHMEgAdJGAT3fZzjXKeYz", "question": "What animal would this creature try to prey on?", "choices": ["goat", "cow", "python", "mouse"], "correct_choice_idx": 3, "direct_answers": ["mouse", "mouse", "mouse", "mouse", "mouse", "rat", "mouse", "rat", "mouse", "mouse"], "difficult_direct_answer": false, "rationales": ["This animal is a cat. it would be scared of pythons and would be too small to prey on a goat or cow.", "Cats are known to catch mice.", "Cats love to chase small animals"], "image": "val2014/COCO_val2014_000000098871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491017, "question_id": "8fKnkSYNYTxApkVJGCA4p8", "question": "What type of persons are featured here?", "choices": ["clowns", "gay", "conservatives", "conventional"], "correct_choice_idx": 1, "direct_answers": ["cross dressers", "gay", "men", "gay", "male dancers", "crossdressers", "leather enthusiasts", "men", "leather loving", "leathermen"], "difficult_direct_answer": false, "rationales": ["Gay people like to wear clothing that crosses the boundaries of the social norm.", "The style of clothing is most known for gay men to wear and express themselves.", "These men look to be dressed in gear that is for a club that is known to gay men."], "image": "val2014/COCO_val2014_000000491017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243910, "question_id": "8fe3uhXA8PgFED6VMWM9oq", "question": "What fruit is next to the eggs?", "choices": ["apples", "bananas", "watermelon", "oranges"], "correct_choice_idx": 1, "direct_answers": ["bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas"], "difficult_direct_answer": false, "rationales": ["There are bananas next to them.", "They are discolored bananas.", "A brown and yellow fruit is sitting next to eggs on a counter."], "image": "train2014/COCO_train2014_000000243910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151528, "question_id": "8g6gDLegaudvQtgUN8juPn", "question": "What is the flag made of?", "choices": ["cloth", "leather", "plastic", "rayon"], "correct_choice_idx": 2, "direct_answers": ["plastic", "paper", "fabric", "plastic", "cloth", "nylon", "fabric", "cloth", "plastic", "polyester"], "difficult_direct_answer": false, "rationales": ["The man is holding a flag made of light red plastic.", "The flag is a kite and generally made of it.", "It looks to be made of plastic."], "image": "val2014/COCO_val2014_000000151528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188498, "question_id": "8gHrBy92xsPqMs5jyEzH67", "question": "What look does the man have on his face?", "choices": ["sadness", "disgust", "love", "joy"], "correct_choice_idx": 1, "direct_answers": ["googles", "goggles", "sunglasses", "disgust", "wonder", "annoyance", "confused", "puzzled", "sun-glass", "smile"], "difficult_direct_answer": true, "rationales": ["A man is looking on with a curled lip and furrowed brow.", "Their top lip is curled and mouth is open. they aren't smiling or crying.", "His mouth is open as he looks at something"], "image": "train2014/COCO_train2014_000000188498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245440, "question_id": "8gJVRZrS72DBdyfzc2LcWr", "question": "Where do the kites owners control their toys from?", "choices": ["river banks", "mid stream", "river bed", "drones"], "correct_choice_idx": 0, "direct_answers": ["ground", "ground", "bleachers", "ground", "earth surface", "river banks", "ground", "ground", "ground", "strings"], "difficult_direct_answer": false, "rationales": ["The kite owners are standing on the riversides.", "They are standing right beside the body of water not in it. they are not using drones but string.", "The kite owners are standing on the river banks and using strings to control their toys."], "image": "val2014/COCO_val2014_000000245440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76310, "question_id": "8gQckSdLcjL2yD5GrBdQc9", "question": "What are the birds doing near the lamb?", "choices": ["eating", "playing", "dancing", "attacking"], "correct_choice_idx": 0, "direct_answers": ["feeding", "fighting", "flying", "flying", "flying", "eating", "flying", "eating", "eating", "fighting"], "difficult_direct_answer": false, "rationales": ["The birds near the lamb are flying near the ground and looking for food to eat.", "The birds are scrounging for food.", "Birds of this type would likely be walking around on this type of ground and approaching it in the fashion they are if there was food they were trying to eat."], "image": "val2014/COCO_val2014_000000076310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400599, "question_id": "8gqXj8mZRA6yqwpGwb78pU", "question": "What are cross country ski poles made of?", "choices": ["aluminum", "wood", "magnet", "copper"], "correct_choice_idx": 0, "direct_answers": ["metal", "aluminum", "aluminum", "aluminum", "fiberglass titanium", "metal", "aluminum", "aluminum", "aluminium", "titanium"], "difficult_direct_answer": false, "rationales": ["Aluminum is used as it is light.", "A person is skiing. ski poles are often made of aluminum.", "They need to be durable and lightweight"], "image": "train2014/COCO_train2014_000000400599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213922, "question_id": "8gsEbjoW8itqw24Fuj7ZYs", "question": "Why are they wearing those jackets?", "choices": ["rain repellant", "snow repellant", "hot", "cold"], "correct_choice_idx": 0, "direct_answers": ["raining", "rain", "rain repellant", "protection", "rain", "protection", "rain", "rain protection", "rain", "raining"], "difficult_direct_answer": false, "rationales": ["They have these to cover themselves up and keep from getting wet.", "The street is wet from precipitation. people are on mopeds waiting for the light to turn green.", "The jackets keep rain out."], "image": "train2014/COCO_train2014_000000213922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373283, "question_id": "8hbgmhyVviZ58Da9ymfJW6", "question": "When the rain stops how will this umbrella be stored?", "choices": ["folded", "outside", "upside down", "vacation rental"], "correct_choice_idx": 0, "direct_answers": ["folded up", "folded up", "folded", "stored away", "folded", "fold", "folded", "collapsed", "drawn together", "folded"], "difficult_direct_answer": false, "rationales": ["There are mechanisms that fold in on themselves on the umbrella.", "Traditionally umbrellas can be folded down so it is easy to store them.", "Umbrellas expand to provide protection when it's raining and fold up when the rain is finished and the umbrella is no longer needed."], "image": "train2014/COCO_train2014_000000373283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425218, "question_id": "8iB34CQzfs4zTNaNfonEMa", "question": "Why liquid ammonia is used in refrigerator?", "choices": ["evaporation", "heating", "refrigeration", "vaporization"], "correct_choice_idx": 3, "direct_answers": ["storage", "cleaning", "clean it", "cooling", "vaporization", "cooling", "store", "cooling", "thermodynamics", "refrigerant"], "difficult_direct_answer": false, "rationales": ["The liquid is vaporized.", "The liquid ammonia helps prevent vaporization.", "Help to keep the food in condition through vaporization."], "image": "train2014/COCO_train2014_000000425218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518361, "question_id": "8j6mDehjgDdTeDU4B8woRz", "question": "Who is this picture can you clearly see is wearing a face mask?", "choices": ["man", "boy", "woman", "elephant"], "correct_choice_idx": 0, "direct_answers": ["man", "man", "adult male", "man", "man", "man", "man", "adult male", "adult male", "man"], "difficult_direct_answer": false, "rationales": ["The man has a mask.", "The only person whose face you can see is the male adult on the right.", "The man has turned his face enough so a strap is visible over his ear which is connected to a fabric covering his face. these features are consistent with a face mask."], "image": "train2014/COCO_train2014_000000518361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567390, "question_id": "8jCNJSPno8hrtdvATRy7ka", "question": "Which legume is being served in this dish?", "choices": ["chickpeas", "split pea", "lentils", "lima beans"], "correct_choice_idx": 0, "direct_answers": ["garbanzo bean", "chickpeas", "chick pea", "garbanzo bean", "chickpeas", "chickpea", "chickpeas", "chickpeas", "lima beans", "bean"], "difficult_direct_answer": false, "rationales": ["One can see the garbanzo beans on the plate.", "Small white objects are mixed with vegetables on a bed of rice. chickpeas are served on rice with vegetables.", "The rounded legumes are known as chickpeas."], "image": "val2014/COCO_val2014_000000567390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165707, "question_id": "8jChpBGf4A35T6siqwTckz", "question": "What have the people on the crossing violated?", "choices": ["traffic laws", "littering", "violent protest", "arson"], "correct_choice_idx": 0, "direct_answers": ["traffic regulation", "traffic law", "green light", "traffic lights", "jaywalking laws", "traffic laws", "signal", "crossing", "crosswalk", "traffic light"], "difficult_direct_answer": true, "rationales": ["The light is green which means the cars heading that direction can go. therefore, the pedestrians would have a do not walk sign which they would be disobeying by crossing the street.", "The people jaywalked which was obvious.", "The people that are crossing the street are violating traffic laws."], "image": "train2014/COCO_train2014_000000165707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340508, "question_id": "8jfJH87y7bKCGjGQAENjDe", "question": "What game is usually played on this court?", "choices": ["badminton", "tennis", "basketball", "volleyball"], "correct_choice_idx": 2, "direct_answers": ["basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball"], "difficult_direct_answer": false, "rationales": ["A group of boys are playing frisbee in an indoor court. there is a hoop in the background.", "The markings are that of a basketball game.", "There is a hoop with a backboard and lines on the floor showing where to shoot from."], "image": "val2014/COCO_val2014_000000340508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340247, "question_id": "8jfyajRqCDkRNDg3WgUBp9", "question": "What place is famous for having islands where this type of sport takes place?", "choices": ["siberia", "hawaii", "egypt", "kazakhstan"], "correct_choice_idx": 1, "direct_answers": ["hawaii", "river", "hawaii", "hawaii", "hawaii", "hawaii", "hawaii", "hawaii", "hawaii", "hawaii"], "difficult_direct_answer": false, "rationales": ["Waimea bay has great river surfing.", "This is the only place of the 4 that is in a warm climate surrounded by lots of water.", "People are surfing on waves."], "image": "val2014/COCO_val2014_000000340247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49240, "question_id": "8jvQeehLtxixzSjhpLvqFX", "question": "Why is he running?", "choices": ["is hungry", "going home", "stole ball", "hitting ball"], "correct_choice_idx": 3, "direct_answers": ["return ball", "ball", "chasing ball", "hitting ball", "playing tennis", "tennis", "playing tennis", "chase ball", "catch ball", "playing tennis"], "difficult_direct_answer": false, "rationales": ["The ball is in midair and he's running towards it with his racket.", "The man wants to hit the ball.", "The person is trying to hit the ball."], "image": "train2014/COCO_train2014_000000049240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438375, "question_id": "8kk2gxZkTCJDpbaikATynd", "question": "What does number twenty want to do?", "choices": ["kick ball", "catch ball", "dodge ball", "hit ball"], "correct_choice_idx": 3, "direct_answers": ["hit ball", "score run", "hit ball", "hit ball", "hit ball", "home run", "bat", "homerun", "hit", "hit baseball"], "difficult_direct_answer": false, "rationales": ["Number twenty is at bat.", "He is playing baseball.", "This number twenty baseball player is in the position of swinging the bat with the intent to make contact with the ball and put the ball into play."], "image": "train2014/COCO_train2014_000000438375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112362, "question_id": "8mD7L8n3RhPJnrboTRdLHj", "question": "What is the boy doing with the black ball?", "choices": ["dribbling", "passing", "throwing", "polishing"], "correct_choice_idx": 0, "direct_answers": ["bouncing ball", "dribbling", "dribbling", "bouncing ball", "bouncing ball", "dribbling", "dribbling", "dribbling", "dribbling", "dribbling"], "difficult_direct_answer": false, "rationales": ["A boy is bouncing a ball with one hand.", "The way his hand is placed over top of it, shows he is bouncing the ball, or \"dribbling\" as it is called in basketball.", "The boy wants to dribble the basketball."], "image": "val2014/COCO_val2014_000000112362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535768, "question_id": "8miK7Fo4gYyU7mnqjSdBDL", "question": "Who is joining the boy on his bed?", "choices": ["parents", "siblings", "dogs", "stuffed animals"], "correct_choice_idx": 3, "direct_answers": ["stuffed animals", "stuffed animals", "stuffed animals", "stuffed animals", "stuffed animals", "stuffed animals", "teddy bears", "toys", "stuffed animals", "cartoon"], "difficult_direct_answer": false, "rationales": ["There are stuffed animals on his bed. some children like to sleep with their toys.", "His toys are on the bed with him.", "There are a bunch of stuffed animals on the bed."], "image": "train2014/COCO_train2014_000000535768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149305, "question_id": "8mq5nJBpnLLSzxpbkaSCAW", "question": "What activity is the person wearing checks engaged in now?", "choices": ["chess", "cooking recipe", "reading", "phone call"], "correct_choice_idx": 2, "direct_answers": ["reading", "sleeping", "reading", "reading", "reading", "reading", "reading", "reading book", "multitasking", "reading"], "difficult_direct_answer": false, "rationales": ["The person has a book in their hands above their body and is facing the book.", "A person is holding up a book close to their face.", "The person has both a book and a phone in their hand, but they are holding the phone on their lap and not using currently using it to make a call. the book is opened and being held up so they can see it."], "image": "train2014/COCO_train2014_000000149305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579938, "question_id": "8n5xaYQMw993pjQkJEpwDe", "question": "What type of art is this?", "choices": ["sand", "painting", "drawing", "sculpture"], "correct_choice_idx": 3, "direct_answers": ["modern", "abstract", "sculpting", "modern", "modern", "abstract", "sculpture", "modern", "modern", "sculptural"], "difficult_direct_answer": false, "rationales": ["Sculptures are normally three dimensional and often displayed in public.", "A large metal object is outside a building. sculptures are made from metal sometimes.", "The other options obviously don't apply."], "image": "train2014/COCO_train2014_000000579938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358744, "question_id": "8nEvnoujhvfN9WRpsAZXmB", "question": "What is unique about these animals?", "choices": ["vertebrates", "are wild", "mammals", "long neck"], "correct_choice_idx": 3, "direct_answers": ["height", "long neck", "long neck", "neck length", "long neck", "long neck", "tallness", "long necks", "tall", "tall"], "difficult_direct_answer": false, "rationales": ["These animals are giraffes. many other animals live in the wild and are mammals and/or vertebrates.", "The animals seen here are giraffes. giraffes are notable for their long necks which are also seen here.", "These animals are giraffes. they developed this unusual feature to help them reach leaves on the tops of tall trees."], "image": "train2014/COCO_train2014_000000358744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279850, "question_id": "8nHbwiokLjLtsXGre4ZFMw", "question": "What name can be formed from the last three letters at the top of the bus?", "choices": ["tom", "jim", "ron", "dan"], "correct_choice_idx": 3, "direct_answers": ["dan", "and", "dan", "dan", "dan", "dan", "dan", "varikkadan", "dan", "dan"], "difficult_direct_answer": false, "rationales": ["The last three letters are \"dan\".", "The name is a common american name.", "These are the last three letters."], "image": "val2014/COCO_val2014_000000279850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10321, "question_id": "8nNPkPyMX55JFdnDSdphAQ", "question": "What venue is in the background?", "choices": ["museum", "government building", "auditorium", "theater"], "correct_choice_idx": 1, "direct_answers": ["buckingham palace", "castle", "palace", "buckingham palace", "palace", "palace", "buckingham palace", "buckingham palace", "buckingham palace", "government building"], "difficult_direct_answer": false, "rationales": ["The venue is a government building.", "The building belongs to the monarchy of england.", "These are the gates to buckingham palace. it is the home of the british monarchy."], "image": "train2014/COCO_train2014_000000010321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538230, "question_id": "8na6k5JVUfxTxr77hAnWST", "question": "What sort of vehicles are being raced here?", "choices": ["skate boards", "tricycles", "tractors", "dirt bikes"], "correct_choice_idx": 3, "direct_answers": ["motorbike", "dirt bikes", "dirt bikes", "motorcycles", "motorcycles", "dirt bikes", "bikers", "motor bikes", "dirt bike", "bikes"], "difficult_direct_answer": false, "rationales": ["Dirt bikes are being raced here.", "People are on a track driving motorcycles with big tires.", "These are all dirt bikes that they are riding."], "image": "val2014/COCO_val2014_000000538230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555516, "question_id": "8nmA7SDa9MMXhEdiUMPLcn", "question": "What classification is this scene devoid of?", "choices": ["feline", "canine", "female", "male"], "correct_choice_idx": 1, "direct_answers": ["dog", "color", "color", "canine", "color", "playing", "color", "color", "color", "color"], "difficult_direct_answer": false, "rationales": ["There are no dogs in here.", "Of the answers available b-d are observed based on the features visible. by process of elimination a would be valid.", "The only presence of an animal is a cat perched by window."], "image": "train2014/COCO_train2014_000000555516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253332, "question_id": "8nw72KSZ2ndJbBT8HBMZwC", "question": "What is hauled by this type of truck?", "choices": ["animals", "fuel", "sand", "trash"], "correct_choice_idx": 3, "direct_answers": ["furniture", "garbage", "storage", "trash", "garbage", "trash", "cargo", "garbage", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["You can see all the garbage sticking out the top of it.", "The bags of garbage can be seen sticking out of the back.", "The truck has the look that is consistent with a trash truck or a junk hauler and there is visibly stacks of things over the top of the front."], "image": "train2014/COCO_train2014_000000253332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236406, "question_id": "8o3UGE5C8jyH9Cc5mm8nNb", "question": "What type of animals are these?", "choices": ["domestic", "reptiles", "wild", "stuffed"], "correct_choice_idx": 2, "direct_answers": ["wild", "zebras", "zebras", "zebras", "zebras", "zebras", "zebras", "zebras", "zebras", "zebras"], "difficult_direct_answer": false, "rationales": ["Zebras are considered wild animals as opposed to being domesticated like dogs or cats.", "These are zebras which are unpredictable in behaviour", "They are zebras that are in a zoo."], "image": "train2014/COCO_train2014_000000236406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401433, "question_id": "8oMMpMMAdVg25TURruM7ou", "question": "Who is guiding the elephant?", "choices": ["man", "cat", "woman", "nobody"], "correct_choice_idx": 0, "direct_answers": ["boy", "man", "man", "man", "man", "woman", "male", "trainer", "guy behind", "man"], "difficult_direct_answer": false, "rationales": ["The guy behind the woman is giving the lady a ride on the elephant.", "The man is guiding.", "The guy in the wagon is guiding the elephant for the tourist."], "image": "train2014/COCO_train2014_000000401433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114634, "question_id": "8oQjdAk726vw2gJ47sVLLC", "question": "Which country is famous for kite festival?", "choices": ["india", "belgium", "us", "china"], "correct_choice_idx": 3, "direct_answers": ["china", "china", "japan", "china", "india", "china", "china", "india", "united states", "china"], "difficult_direct_answer": false, "rationales": ["The earliest use of kites was among the chinese, approximately 2,800 years ago.", "The country is china.", "China is known for having colorful kites."], "image": "val2014/COCO_val2014_000000114634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444304, "question_id": "8otA2zA4fKuE7TVsdFhJjE", "question": "What is the person using their monitor for?", "choices": ["plate holder", "watching program", "playing wii", "nothing"], "correct_choice_idx": 1, "direct_answers": ["watching movie", "tv", "watching", "television", "watching", "tv", "television", "watching program", "watching tv", "chatting"], "difficult_direct_answer": false, "rationales": ["That's a tv and not a monitor.", "A person is crouched in front of a television. there is a show in the television.", "A person is watchin a monitor that is showing a television show."], "image": "val2014/COCO_val2014_000000444304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555405, "question_id": "8ozGBnmH2JQPv2oSZmfCNt", "question": "Why do kites have tails?", "choices": ["popularity", "functionality", "tradition", "pretty"], "correct_choice_idx": 1, "direct_answers": ["balance", "balance", "decoration", "flying", "stability", "propels wind", "flying", "fly high", "stability", "functionality"], "difficult_direct_answer": false, "rationales": ["The long extension in the back on these kites allow them to billow in the wind as they are here seen doing.", "They have tails to help them stay in the air and fly.", "The other options are \"extras\" and interesting, but they don't have anything to do with the reason a kite can go up in the air."], "image": "val2014/COCO_val2014_000000555405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354604, "question_id": "8p42FqmFQvCk4rkWewuqei", "question": "What type of transportation is shown?", "choices": ["air", "road", "rail", "water"], "correct_choice_idx": 0, "direct_answers": ["airplane", "airplane", "airplane", "airplane", "air", "jet", "airplane", "plane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["This is an airplane at an airport.", "A large airplane is on a runway.", "The other options don't fly."], "image": "train2014/COCO_train2014_000000354604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216581, "question_id": "8p9KMyBDDGmZhPb5sE4wYC", "question": "What are they all doing?", "choices": ["resting", "relaxing", "sleeping", "eating"], "correct_choice_idx": 1, "direct_answers": ["sitting", "resting", "sitting", "sitting", "sunning", "sunning", "sitting", "sitting", "sitting", "relaxing"], "difficult_direct_answer": false, "rationales": ["They are resting.", "They are chatting and sitting.", "Everyone seems to be chilled out and hanging out on the benches."], "image": "val2014/COCO_val2014_000000216581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305667, "question_id": "8pEcPwTAyN7S9tKPR5D8Ey", "question": "Why are the two holding the umbrellas standing in the tunnel?", "choices": ["to hide", "keeping dry", "to kiss", "boarding train"], "correct_choice_idx": 1, "direct_answers": ["water protection", "rain", "hiding out", "avoid rain", "raining", "keeping dry", "keep dry", "avoid rain", "raining", "staying dry"], "difficult_direct_answer": false, "rationales": ["It is raining outside", "Traditionally you would use an umbrella when it's raining to keep dry.", "There is rain falling outside of the tunnel."], "image": "train2014/COCO_train2014_000000305667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345702, "question_id": "8poSXUz9TFfrURpdxnW9Fq", "question": "What street is this event happening on?", "choices": ["3rd", "200th", "4th", "north"], "correct_choice_idx": 0, "direct_answers": ["3rd street", "3rd", "pennsylvania avenue", "3rd street", "third street", "3rd street", "3rd", "3rd", "pennsylvania", "busy"], "difficult_direct_answer": false, "rationales": ["The sign near the elephant indicates the location of this event.", "We can locate the street number of this event by reading the green sign on the traffic pole which reads '3'", "The people are walking to the left"], "image": "train2014/COCO_train2014_000000345702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66514, "question_id": "8q24zDKYRQamwQDFEdtK6u", "question": "How do the people know each other?", "choices": ["neighbors", "spouses", "coworkers", "siblings"], "correct_choice_idx": 3, "direct_answers": ["siblings", "brother sister", "siblings", "siblings", "siblings", "siblings", "family siblings", "siblings", "family siblings", "siblings"], "difficult_direct_answer": false, "rationales": ["They are young children who are close in age and using the bathroom together.", "The people know eachother as siblings.", "The kids are siblings to each other."], "image": "train2014/COCO_train2014_000000066514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34800, "question_id": "8q5SNpJbZynzkrL3dwmGLE", "question": "What problem are the apartment dwellers likely to face?", "choices": ["noise pollution", "trespass", "gun violence", "burglary"], "correct_choice_idx": 0, "direct_answers": ["poverty", "noisy", "graffiti", "poverty", "noise pollution", "poverty", "poverty", "graffiti", "poverty crime", "vandalism"], "difficult_direct_answer": false, "rationales": ["The apartments are very close together and are located very close to a city street which presumably make it a loud neighborhood to reside in.", "Apartment buildings are shown in a city. cities have noise pollution.", "The building will deal with noise."], "image": "train2014/COCO_train2014_000000034800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383185, "question_id": "8q9uU7UrGDapYGspVyxHrA", "question": "What country is famous for exporting the fruit that is on the counter?", "choices": ["ecuador", "japan", "china", "kazakhstan"], "correct_choice_idx": 0, "direct_answers": ["honduras", "south america", "africa", "thailand", "brazil", "peru", "philippines", "honduras", "india", "ecuador"], "difficult_direct_answer": true, "rationales": ["The other options don't offer them.", "They are grown there.", "Bananas are on a counter. ecuador grows bananas."], "image": "val2014/COCO_val2014_000000383185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369212, "question_id": "8qCrmjv7gvnpoXyPhbwZ2J", "question": "What country brand is this product?", "choices": ["british", "french", "american", "swiss"], "correct_choice_idx": 2, "direct_answers": ["american", "chinese", "united states", "usa", "switzerland", "usa", "america", "germany", "swiss", "switzerland"], "difficult_direct_answer": false, "rationales": ["This is from a company in the usa>", "The tool has leatherman printed on it. as internet search revealed that their factory is in portland, oregon.", "The brand of the product is in the us."], "image": "train2014/COCO_train2014_000000369212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210846, "question_id": "8qr9dFf7pw7uX99KjPPxgH", "question": "The stuff being dipped into resembles what canned food brand sauce?", "choices": ["bush's", "chef boyardee", "bumble bee", "uncle ben's"], "correct_choice_idx": 1, "direct_answers": ["chef boyardee", "tomato", "prego", "tomato soup", "heinz", "heinz", "campbell's", "prego", "cheez whiz", "ragu"], "difficult_direct_answer": false, "rationales": ["The soup is tomato sauce.", "Looks like the sauce that is in a can of spaghetti o's.", "It looks like what would be found in a can of chef boyardee."], "image": "val2014/COCO_val2014_000000210846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312961, "question_id": "8qsNc8UMhp6mFf8ATCvM6t", "question": "What is the person on the ramp doing?", "choices": ["long boarding", "water skiing", "body boarding", "surfing"], "correct_choice_idx": 1, "direct_answers": ["water skiing", "trick", "water surfing", "jumping", "jumping", "jumping it", "water skiing", "ascending", "jump", "ski jumping"], "difficult_direct_answer": false, "rationales": ["The person is skiing over the water.", "They are on skiis and jumping off the ramp.", "The person is water skiing."], "image": "train2014/COCO_train2014_000000312961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215167, "question_id": "8quSufjg2PVQc4YJTsYHh8", "question": "The monitor shows the lock screen from which OS?", "choices": ["windows 7", "windows vista", "windows xp", "windows 10"], "correct_choice_idx": 1, "direct_answers": ["windows", "windows", "windows", "dell", "windows vista", "windows", "windows", "windows", "windows", "lock screen"], "difficult_direct_answer": false, "rationales": ["If you look closely at the bottom of the screen you can see the windows vista wording and icon.", "It says what it is on the screen.", "You can see the logo on the bottom of the screen."], "image": "train2014/COCO_train2014_000000215167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300636, "question_id": "8rXvXjV6fm58KSmNcRHCme", "question": "What type of ring is the woman on the end wearing?", "choices": ["championship", "birthstone", "class", "wedding"], "correct_choice_idx": 3, "direct_answers": ["wedding band", "decorative", "wedding", "wedding ring", "wedding", "engagement ring", "wedding", "gold", "engagment", "engagement"], "difficult_direct_answer": false, "rationales": ["The weeding ring are placed on the finger that is shown.", "The diamond on the ring on this woman's ring finger tells us she is married or engaged.", "You can tell by the finger she is wearing it on."], "image": "train2014/COCO_train2014_000000300636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153956, "question_id": "8rZpKPjPFncQNaRMKVdoDn", "question": "What is stacked on the middle plate?", "choices": ["pancakes", "eggs", "toast", "sea monkeys"], "correct_choice_idx": 2, "direct_answers": ["toast", "toast", "toast", "ham slices", "toast", "toast", "toast", "toast", "toast", "french toast"], "difficult_direct_answer": false, "rationales": ["The small middle plate has bread that has been toasted on it.", "A plate with a stack of bread is on a table surrounded by dishes with eggs on them. eggs and toast are common dishes served for breakfast.", "There is toast by the dish."], "image": "val2014/COCO_val2014_000000153956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368160, "question_id": "8raA2VVeqvi7CcAmPC3owh", "question": "What company is on the t-shirt on the right?", "choices": ["microsoft", "google", "facebook", "amazon"], "correct_choice_idx": 1, "direct_answers": ["google", "google", "google", "google", "google", "google", "google", "google", "google", "google"], "difficult_direct_answer": false, "rationales": ["The company's logo is on the chest of the white shirt.", "The google logo is on the blindfolded man on right.", "It's clear on the shirt. the colors are also typically used in the a logo."], "image": "train2014/COCO_train2014_000000368160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401977, "question_id": "8rcTpLruBh7DzLLuLQ223p", "question": "What kind of flying toys are being sold at this stall?", "choices": ["balloons", "rockets", "kites", "frisbees"], "correct_choice_idx": 2, "direct_answers": ["kites", "kites", "kite", "kites", "kites", "kites", "kites", "kite", "kites", "kite"], "difficult_direct_answer": false, "rationales": ["The other options aren't displayed in this image.", "Kites always fly on strings.", "These are made of a lightweight material and have a wide area to catch wind as they fly"], "image": "train2014/COCO_train2014_000000401977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17975, "question_id": "8rdsd8zsGwzbsnbmmf5ncB", "question": "What are they doing with the luggage?", "choices": ["unloading", "selling", "stealing", "loading"], "correct_choice_idx": 0, "direct_answers": ["load it", "loading", "carrying", "boarding it", "baggage claim", "loading", "unloading", "transferring", "loading", "loading"], "difficult_direct_answer": false, "rationales": ["They are unloading the luggage and stuff.", "The men are putting the luggage into the cart.", "They are putting the luggage from the vehicle onto the conveyor so people can pick them up."], "image": "train2014/COCO_train2014_000000017975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315432, "question_id": "8rnAqyzJFBW5KXSZM7dRFp", "question": "Why are the boats stationary?", "choices": ["unseaworthy", "no gas", "docked", "bad weather"], "correct_choice_idx": 2, "direct_answers": ["docked", "docked", "parked", "docked", "anchored", "calm day", "docked", "docked", "docked", "calm surf"], "difficult_direct_answer": false, "rationales": ["These boats have their sails down and are unmoving arranged around the dock.", "The boats are docked to stop them from moving to other areas.", "Boats that aren't being sailed is called being docked."], "image": "val2014/COCO_val2014_000000315432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469896, "question_id": "8rvxLb5MtKvhzHdeCdomzo", "question": "What might the person here be reading?", "choices": ["school book", "cookbook", "comic", "diary"], "correct_choice_idx": 0, "direct_answers": ["school textbook", "book", "school book", "novel", "favorite book", "book", "romance novel", "novel", "romance novel", "book"], "difficult_direct_answer": false, "rationales": ["The girl is reading a book for school.", "The person is reading a school book.", "It is a thick book with many pages and they are in the bedroom reading"], "image": "val2014/COCO_val2014_000000469896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493243, "question_id": "8s6ivg9rnTAdjrTvvNDUwq", "question": "Why might this longest food be unappealing to some?", "choices": ["lacks condiments", "uncooked", "too hot", "to spicy"], "correct_choice_idx": 0, "direct_answers": ["meat product", "highly processed", "no dressing", "not natural", "processed", "lacks condiments", "meat", "vegetarians", "unhealthy", "dieting"], "difficult_direct_answer": true, "rationales": ["There is no sauces on the hot dog.", "It is just meat and bread which will be dry", "The hotdog is plain so it may be tasteless."], "image": "val2014/COCO_val2014_000000493243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46473, "question_id": "8sVCE7jhZJ5b5E723un2jR", "question": "How is the engine on this motorcycle cooled?", "choices": ["pressure", "oil", "air", "water"], "correct_choice_idx": 2, "direct_answers": ["antifreeze pump", "antifreeze", "water pump", "unknown", "air", "exhaust system", "water", "air", "fan", "stopped"], "difficult_direct_answer": true, "rationales": ["Air helps to cool down a motorcycle's engine.", "The motorcycle is cooled by air.", "There is no casing around the engine."], "image": "train2014/COCO_train2014_000000046473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482801, "question_id": "8sWEMgvkvi5baJDxKWcWfY", "question": "Why does the man holding the umbrella have very red lips?", "choices": ["he's sick", "chapstick", "genetics", "lipstick"], "correct_choice_idx": 3, "direct_answers": ["costume", "wearing lipstick", "lipstick", "lipstick", "lipstick", "trans", "lipstick", "lipstick", "lipstick", "match hair"], "difficult_direct_answer": false, "rationales": ["This kind of look comes from applying lipstick on to make them red.", "The man is dressed like a woman. woman sometimes wear lipstick.", "Lipstick is used as a cosmetic to give lip pigment."], "image": "train2014/COCO_train2014_000000482801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85826, "question_id": "8sgDLSSnic8PSHnh5C3XTG", "question": "What OS is the baby interacting with?", "choices": ["windows 95", "windows vista", "ubuntu", "windows xp"], "correct_choice_idx": 3, "direct_answers": ["windows xp", "windows", "microsoft", "windows", "windows", "mac", "windows", "windows", "windows", "windows"], "difficult_direct_answer": false, "rationales": ["There is a windows xp sticker on the laptop.", "There is a sticker on the lap top that shows what it is.", "A baby is sitting near an hp computer. hp computers run on windows os."], "image": "val2014/COCO_val2014_000000085826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506605, "question_id": "8smbiwX8pDvb4nipPbRHeB", "question": "What does the antenna on the blue object to the right of the monitor transmit?", "choices": ["television", "radio", "cell service", "wi-fi"], "correct_choice_idx": 3, "direct_answers": ["wifi signal", "data", "modem signals", "internet", "wi-fi", "connect internet", "internet signal", "electrical signals", "electromagnetic waves", "wifi"], "difficult_direct_answer": true, "rationales": ["This is the wifi antenna.", "The antenna carries the wi-fi signal.", "The electronic machine is for wifi."], "image": "train2014/COCO_train2014_000000506605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575753, "question_id": "8t4ywLxRWrnseBLb9EeUbj", "question": "Why is there water everywhere?", "choices": ["canal", "sewer", "flooding", "river"], "correct_choice_idx": 2, "direct_answers": ["flood", "flood", "flood", "flooding", "flooded street", "flooding", "flood", "flooded", "flooding", "flood"], "difficult_direct_answer": false, "rationales": ["The streets are flooded.", "When there is a lot of rain, it can take a while to drain.", "The water is flooding."], "image": "train2014/COCO_train2014_000000575753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544, "question_id": "8t9ZysJzqRvoMecYcMMrhC", "question": "The player swinging has the same dominant hand as what person?", "choices": ["kris bryant", "mike trout", "fred mcgriff", "manny ramirez"], "correct_choice_idx": 2, "direct_answers": ["opposite person", "fred mcgriff", "catcher", "lefty", "catcher", "pitcher", "catcher", "catcher", "domain", "cody ross"], "difficult_direct_answer": false, "rationales": ["They are using the same hand.", "The player swinging has a right handed swing.", "Fred mcgriff and this player would have the same dominant hand you can tell by how he's playing."], "image": "val2014/COCO_val2014_000000000544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197352, "question_id": "8t9ctanAGFn2EGUB9ba2fV", "question": "What is the purpose of the long pole?", "choices": ["finding fish", "killing walrus", "moving boat", "defending boy"], "correct_choice_idx": 2, "direct_answers": ["pushing", "steer", "moving boat", "push himself", "propelling", "moving", "steer", "direction", "to guide", "reach longer"], "difficult_direct_answer": true, "rationales": ["The long pole is an oar for moving the boat.", "The other options don't match this setting. it helps with momentum.", "He will use the pole to steer the boat in the direction he wants to go."], "image": "val2014/COCO_val2014_000000197352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521306, "question_id": "8tP2n6L49RZ4tAH4cQKSzX", "question": "Wii remote console is designed for what?", "choices": ["controllers", "call", "chats", "video games"], "correct_choice_idx": 3, "direct_answers": ["playing games", "gaming", "gaming", "playing", "video games", "video games", "charging mobile", "video games", "playing", "remote"], "difficult_direct_answer": false, "rationales": ["The console is for the wii.", "The console is for the wii.", "The console is for video games."], "image": "val2014/COCO_val2014_000000521306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444236, "question_id": "8tQ7NbvZ7oEHQWmitDW2v4", "question": "What movie is advertised on the bus?", "choices": ["transformers", "fight club", "fast furious", "red 2"], "correct_choice_idx": 3, "direct_answers": ["red 2", "red 2", "red 2", "red 2", "red", "red 2", "red two", "red 2", "red 2", "red two"], "difficult_direct_answer": false, "rationales": ["The movie is red 2.", "It's plain to see the words on the side of the bus and what movie is being promoted.", "You can see the words \"red 2\" on the side of the bus."], "image": "val2014/COCO_val2014_000000444236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227878, "question_id": "8tbat9Qfer94smBz3Qh6aM", "question": "What type of birds are in the top images?", "choices": ["crows", "doves", "starlings", "hummingbirds"], "correct_choice_idx": 3, "direct_answers": ["stained glass", "hummingbirds", "hummingbirds", "hummingbird", "hummingbird", "stained glass", "glass hummingbirds", "glass", "humming birds", "humming birds"], "difficult_direct_answer": false, "rationales": ["These are small birds with long beaks used to get nectar from flowers", "That is the kind of bird made with the stained glass.", "Those birds have long wings."], "image": "val2014/COCO_val2014_000000227878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581458, "question_id": "8thJF8WH2GzoUUuJHZMCQ3", "question": "What type tracks do the trains here run upon?", "choices": ["underground", "ground", "none", "elevated"], "correct_choice_idx": 3, "direct_answers": ["railroad", "electric", "subway", "elevated", "electricity", "elevated", "train", "train", "elevated", "elevated"], "difficult_direct_answer": false, "rationales": ["These go above ground.", "A train is on tracks elevated above a city street.", "They are above the road."], "image": "val2014/COCO_val2014_000000581458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340082, "question_id": "8tixzQLBUM6QC8fmdiS5Dd", "question": "Who manufactured the silver vehicle?", "choices": ["mercedes", "toyota", "bmw", "stan"], "correct_choice_idx": 3, "direct_answers": ["stan", "stan", "stan", "stan", "stan", "iveco", "stan", "stan", "stan", "iveco"], "difficult_direct_answer": false, "rationales": ["This is the name on the front of the bus and the logo is not immediately recognizable like the others, which are all popular motor vehicle brands.", "The vehicle is a stan.", "The company name is displayed on the front of the train."], "image": "train2014/COCO_train2014_000000340082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116882, "question_id": "8tvafGwuwSST9QPPsYbiw2", "question": "What does it look like the stuffed animal is doing?", "choices": ["eating", "drinking", "singing", "attacking"], "correct_choice_idx": 1, "direct_answers": ["climbing", "drinking", "drinking", "climbing", "drinking", "climbing", "climbing", "drinking", "drinking", "climbing"], "difficult_direct_answer": false, "rationales": ["The animal is on a glass with liquid in it.", "It looks like it's trying to get the wine in the glass.", "A stuffed animal is hanging on the side of a wine glass."], "image": "train2014/COCO_train2014_000000116882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294549, "question_id": "8u2UhWaGqknZyM7zsKKPFv", "question": "What type of computer is the desktop in this image?", "choices": ["apple", "dell", "toshiba", "microsoft"], "correct_choice_idx": 0, "direct_answers": ["mac", "apple", "apple", "cr monitor", "mac", "mac", "apple", "apple", "apple", "laptop"], "difficult_direct_answer": false, "rationales": ["The type is apple.", "A logo consisting of an apple's profile is visible on this monitor. due to this we can conclude it is an apple machine.", "The desktop computer has a picture of a piece of fruit at the bottom of the monitor which is the logo of this famous corporation."], "image": "train2014/COCO_train2014_000000294549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387601, "question_id": "8uEewS33qtVnPR7jnrC757", "question": "What does the small sign on the pole imply?", "choices": ["free transportation", "popped tire", "free service", "towing"], "correct_choice_idx": 3, "direct_answers": ["no parking", "tow area", "no parking", "towing zone", "right side", "cars towed", "towing", "towing", "tow zone", "towing"], "difficult_direct_answer": false, "rationales": ["There is a no parking symbol above a car getting taken away icon on the sign.", "The sign refers to either a need for towing or that it's a no park section.", "If you park there they may remove your car."], "image": "train2014/COCO_train2014_000000387601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542088, "question_id": "8uKja6yde8NQ4HhmeWQ9qS", "question": "What is the location needed for this hobby?", "choices": ["ocean", "lake", "pool", "swamp"], "correct_choice_idx": 0, "direct_answers": ["ocean", "ocean", "coastal", "ocean", "ocean", "beach", "coast", "ocean", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The people are surfers and require a body of water that produces big waves.", "They will need waves to surf.", "People are carrying surfboards near the water. surfing requires a wave to ride. the ocean is the most reliable place to find waves on a regular basis."], "image": "train2014/COCO_train2014_000000542088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567616, "question_id": "8uqw9MD6nJejFT9qVAvydV", "question": "How can the juice be extracted from these fruits?", "choices": ["smoker", "juicer", "jugger", "dehydrator"], "correct_choice_idx": 1, "direct_answers": ["squeeze", "squeezed", "squeezing", "squeezing", "juicer", "squeeze", "squeezing", "squeeze", "squeeze", "squeeze"], "difficult_direct_answer": false, "rationales": ["An orange is sliced in a bowl.", "It's squeezed out manually by hand or with this type of machine.", "One sticks them in a juicer to force the juice out."], "image": "train2014/COCO_train2014_000000567616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579231, "question_id": "8vKWKdUXJ2kriaA5To6Yxq", "question": "What does the man in Red focus on here?", "choices": ["pitcher", "audience", "catcher", "batter"], "correct_choice_idx": 0, "direct_answers": ["ball", "pitch", "ball", "strikezone", "pitch", "ball", "pitches", "pitcher", "strike zone", "ball"], "difficult_direct_answer": false, "rationales": ["The man is looking at the pitcher.", "The umpire is watching the ball get released.", "He watches the man trying to swing at the ball to see if he pulls a swing."], "image": "val2014/COCO_val2014_000000579231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34491, "question_id": "8vNDxJA2P33H7ewcwUp49h", "question": "Where did the man in blue get food from?", "choices": ["mcdonalds", "subway", "red robin", "olive garden"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "subway", "subway", "store", "restaurant", "subway", "fast food", "restaurant", "subway", "sandwich shop"], "difficult_direct_answer": false, "rationales": ["A man in blue has a sub sandwich in his left hand.", "The man is holding a sub sandwich and has a subway cup.", "He is holding a sub sandwich in his hand."], "image": "train2014/COCO_train2014_000000034491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187868, "question_id": "8vbg3ATDqLJyrmYYwJxQTa", "question": "What is the name for stuffing animal heads?", "choices": ["doctoring", "designing", "stuffing", "taxidermy"], "correct_choice_idx": 3, "direct_answers": ["taxidermy", "taxidermy", "taxidermy", "taxidermy", "taxidermy", "taxidermy", "taxidermy", "elk", "elk", "taxidermy"], "difficult_direct_answer": false, "rationales": ["Taxidermy is when you take animals and preserve the bodies.", "That is what the method is called.", "People stuff animals that they hunted and killed"], "image": "train2014/COCO_train2014_000000187868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178835, "question_id": "8vr2XgjuvzC7iNHsgn6uLY", "question": "Why is there a blue tarp on the roof of the building?", "choices": ["as landmark", "to sell", "protection", "decoration"], "correct_choice_idx": 2, "direct_answers": ["damage", "leaking", "protection", "shade", "roof broken", "damage", "stop leaks", "water leak", "protection", "leaks"], "difficult_direct_answer": false, "rationales": ["The tarp is used to protect the food.", "It looks like it is protecting the building from rain.", "It appears the roof has been damaged and needs something to prevent leaking."], "image": "val2014/COCO_val2014_000000178835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24507, "question_id": "8vwHySAYuHgF6MCJonAKCf", "question": "What is the fridge decorated with?", "choices": ["postcards", "posters", "magnets", "letters"], "correct_choice_idx": 2, "direct_answers": ["magnets", "stickers", "magnets", "magnets", "sticker", "magnets", "stickers", "magnets", "magnets", "magnets"], "difficult_direct_answer": false, "rationales": ["There are magnets keeping the refrigerator warm.", "There are different magents stuck to it.", "There are many magnets on the fridge door."], "image": "train2014/COCO_train2014_000000024507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551100, "question_id": "8w2ccT2SXZN2CKLYpvSgEY", "question": "What kind of fuel does the elephant use?", "choices": ["gasoline", "food", "jet fuel", "diesel"], "correct_choice_idx": 1, "direct_answers": ["food water", "food", "food", "food", "food", "food", "hay", "grass", "energy", "food"], "difficult_direct_answer": false, "rationales": ["Elephants need to eat to function.", "The elephant eats things to power itself, like all living creatures.", "An elephant does not have an internal combustion engine."], "image": "train2014/COCO_train2014_000000551100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410350, "question_id": "8w9Y8NyGC6s49Hg2HXs446", "question": "Why are the two wearing aprons?", "choices": ["visibility", "aesthetics", "as joke", "for hygiene"], "correct_choice_idx": 3, "direct_answers": ["cooking food", "chefs", "preparing food", "cooking", "for hygiene", "protection", "cooking", "to cook", "they're chefs", "cooks"], "difficult_direct_answer": true, "rationales": ["The two are wearing aprons for hygiene while they are preparing food.", "They are chefs. chefs wear swaths of material covering their front side.", "They are protecting themselves from food all over clothes."], "image": "val2014/COCO_val2014_000000410350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273253, "question_id": "8wDioYSB4PbxCPvMEHDpyM", "question": "Where is this jersey along with the other items probably displayed?", "choices": ["museum", "house", "stadium", "library"], "correct_choice_idx": 0, "direct_answers": ["museum", "museum", "st louis", "baseball museum", "museum", "shop", "hall fame", "baseball museum", "museum", "museum"], "difficult_direct_answer": false, "rationales": ["The jersey is in a museum.", "The jersey is a museum relic.", "These are old sports items likely found in a museum of sports."], "image": "train2014/COCO_train2014_000000273253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391255, "question_id": "8wJicVuk7wacHy8ZVMg3YB", "question": "The item the woman is holding is similar to what?", "choices": ["scramasax", "helmet", "drill", "chainsaw"], "correct_choice_idx": 0, "direct_answers": ["scramasax", "machete", "knife", "knife", "sword", "sword", "sword", "machete", "sword", "sword"], "difficult_direct_answer": false, "rationales": ["The woman is holding a knife. it does not look like a power tool or a helmet.", "Both have a similar shape.", "It's similar to an old english knife that was a type of sword."], "image": "val2014/COCO_val2014_000000391255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246199, "question_id": "8wYk7qtqCUgR8CsMu79jLX", "question": "What is this an image of?", "choices": ["drinks", "fruits", "candy", "vegetables"], "correct_choice_idx": 1, "direct_answers": ["fruit market", "bananas", "bananas", "bananas", "fruits", "fruits", "fruit", "fruits", "fruits", "fruit stand"], "difficult_direct_answer": false, "rationales": ["The items are fruit.", "There are bananas and oranges, not vegetables, candies, or drinks.", "Bananas are prominently featured in the foreground."], "image": "val2014/COCO_val2014_000000246199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46207, "question_id": "8wbK4hkqfsuykjC87cZpfh", "question": "What place is shown in the photo?", "choices": ["hotel room", "camp site", "bedroom", "living room"], "correct_choice_idx": 0, "direct_answers": ["hotel", "hotel", "bed", "bed", "bed", "hotel", "bed", "bed", "hotel room", "hotel"], "difficult_direct_answer": false, "rationales": ["There is a word written on the remote that places the setting.", "A person is laying on a bed with plane white sheets and a remote that is marked as being a hotel remote next to him.", "The remote says \"hotel tv\" on it."], "image": "train2014/COCO_train2014_000000046207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443169, "question_id": "8xGYRZerQkhxpAfcDwoCmy", "question": "How far into the season is this game?", "choices": ["opening week", "late season", "playoffs", "world series"], "correct_choice_idx": 0, "direct_answers": ["far", "beginning", "mid season", "opening week", "middle", "cricket", "opening week", "unknown", "opening day", "opening week"], "difficult_direct_answer": false, "rationales": ["The special sign painted on the field behind the batting area shows that the game being played is one of the first games of the season. playoffs and world series games take place at the end of the season.", "The season is opening week.", "The words \"opening week\" appear in the grass."], "image": "val2014/COCO_val2014_000000443169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268008, "question_id": "8xNgs3JvMTXbRX5ZmvYEYU", "question": "What is the most common usage of the black container?", "choices": ["bats", "garbage", "dirt", "drinks"], "correct_choice_idx": 1, "direct_answers": ["holds bats", "hold bats", "trash", "bat storage", "garbage can", "trash", "garbage", "garbage", "trash", "store bats"], "difficult_direct_answer": false, "rationales": ["The black container is a trash can.", "Normally the plastic containers are used for garbage.", "The container is a convenient place to neatly store no longer wanted items until they are properly disposed of."], "image": "train2014/COCO_train2014_000000268008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357881, "question_id": "8xUXMHNNHWUmfobBfTt5JU", "question": "What is being done on the table the cat is on?", "choices": ["exercising", "reading", "working", "eating"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "meal time", "setting table", "set", "dinner set", "setting table", "setting table", "table set", "eating"], "difficult_direct_answer": false, "rationales": ["The table is set for dinner.", "People usually eat at the table on plates using utensils.", "The cat is sitting on a dining room table that has been set for people to eat at."], "image": "train2014/COCO_train2014_000000357881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114744, "question_id": "8xh4mWZiub3TQp7joPCFSx", "question": "Where does this water come from?", "choices": ["hose", "sky", "water nozzle", "fire hydrant"], "correct_choice_idx": 1, "direct_answers": ["sky", "say", "sky", "sky", "sky", "clouds", "sky", "sky", "sky", "rain"], "difficult_direct_answer": false, "rationales": ["It is raining.", "There is water soaking everyone and puddled in the street which indicates that this water is rain. people in the picture are also using umbrellas and raincoats.", "People are huddled under an outdoor patio roof. it is raining."], "image": "val2014/COCO_val2014_000000114744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396673, "question_id": "8xqyNdG9JoNStf4nGyWadu", "question": "Why is the plane there?", "choices": ["being painted", "refueling", "just landed", "preparing flight"], "correct_choice_idx": 2, "direct_answers": ["cargo loading", "transport", "just landed", "preparing", "unloading", "loading", "passenger pickup", "being loaded", "loading luggage", "it landed"], "difficult_direct_answer": true, "rationales": ["The plane appears to be pulled up to the terminal not moving which would be consistent with a plane unloading.", "The plan is preparing for flight because there are fuel trucks adjacent to it.", "The plane just landed."], "image": "val2014/COCO_val2014_000000396673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284310, "question_id": "8xuTJBdi79uDG77dwGs5D2", "question": "What establishment at one time sold items for as low as a nickel?", "choices": ["five dime", "rivercenter", "sunset station", "la villita"], "correct_choice_idx": 0, "direct_answers": ["five dive", "five-and-dime", "five dime", "diner", "fivedime", "five dime", "fivedime", "fivedime", "corner store", "unknown"], "difficult_direct_answer": false, "rationales": ["Five & dime takes its name from selling items for five and ten cents.", "There are a limited number of establishments here with visible names. of the visible names of establishments, answer a is associated with the history of selling items for as low as a nickel which gave the name.", "They were also known as nickel stores."], "image": "train2014/COCO_train2014_000000284310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486770, "question_id": "8y9Tt5VAGQeGitVeBLQ26B", "question": "What type of gathering does this appear to be?", "choices": ["casual", "juvenile", "formal", "dinner"], "correct_choice_idx": 0, "direct_answers": ["casual", "friends", "pizza party", "pizza party", "family", "social gathering", "pizza party", "pizza party", "pizza party", "party"], "difficult_direct_answer": false, "rationales": ["The gathering has pizza.", "This appears to be a casual gathering of pizza eaters.", "Pizza is typically served at non-formal events, and the people in the photo are dressed comfortably."], "image": "val2014/COCO_val2014_000000486770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55578, "question_id": "8y9va3uyErvA72DWZxhodk", "question": "How might the food taste if you dumped the entire contents of the clear container near her onto the food?", "choices": ["spicy", "salty", "bitter", "sour"], "correct_choice_idx": 1, "direct_answers": ["sweet", "salty", "salty", "salty", "sweet", "salty", "sweet", "sweet", "salty", "salty"], "difficult_direct_answer": false, "rationales": ["All of the food will make you want to drink something sweet.", "This is a salt shaker.", "Sugar is in the white container. too much sugar will make things taste sour."], "image": "train2014/COCO_train2014_000000055578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449102, "question_id": "8yUjtraPMyi6XvDVEXYKKc", "question": "Why is he bent over?", "choices": ["follow through", "cleaning pants", "watching ball", "finding ball"], "correct_choice_idx": 0, "direct_answers": ["pitching stance", "follow through", "pitching", "pitching", "pitching", "throwing ball", "follow through", "he's pitching", "pitching", "pitched"], "difficult_direct_answer": false, "rationales": ["He just threw the ball and is still in position for that.", "The pitcher wants to get a full pitch in.", "The man is a baseball pitcher and he is bent over because is completing his throw and follow through."], "image": "val2014/COCO_val2014_000000449102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488952, "question_id": "8yd2Det3RbiEKHxjjVfzP3", "question": "What are the trees with one thin trunk called?", "choices": ["willow trees", "birch trees", "palm trees", "pine trees"], "correct_choice_idx": 2, "direct_answers": ["palm tree", "palm", "palm trees", "palm", "palm trees", "palm trees", "palm", "palm trees", "palm tree", "palm trees"], "difficult_direct_answer": false, "rationales": ["The trees are seen by the water.", "These look like a typical palm trees with their slender design and style of leaves.", "The trees are clearly visible and their visible features are known to be associated with answer a and none of the others."], "image": "train2014/COCO_train2014_000000488952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235862, "question_id": "8yudCKp8zaGn34DP95LDJ6", "question": "What is the yellow sign called on the chest of the rider?", "choices": ["scrimmage vest", "advertisement", "pinny", "bib"], "correct_choice_idx": 3, "direct_answers": ["number", "jersey number", "tag", "bib", "bib", "bib", "signs", "bib number", "one", "tag"], "difficult_direct_answer": false, "rationales": ["Its used to show the name.", "The yellow part is by the mouth.", "The person riding the horse is wearing a yellow bib with a number so he can be identified during the race."], "image": "train2014/COCO_train2014_000000235862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161451, "question_id": "8z39tZLJLPSEKftBSZtSxy", "question": "When would the brown food in front of the man be served?", "choices": ["dawn", "dessert", "mornings", "breakfast"], "correct_choice_idx": 1, "direct_answers": ["dessert", "at dinner", "dessert", "bakery", "at dinner", "dessert", "dessert", "dessert", "dessert", "dessert"], "difficult_direct_answer": false, "rationales": ["It looks like brownies and those are sweets people eat as a dessert.", "The food on the dishes is cake so it would be the last course of the meal.", "The chocolate toppings with sprinkles present, strawberries and accents of chocolate sauce on the food in this image tells us they are after dinner delicacies."], "image": "train2014/COCO_train2014_000000161451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46004, "question_id": "8zYhwLzXMU8z4wr6yzaDFh", "question": "The clouds in the sky depict that a is coming?", "choices": ["earthquake", "hurricane", "snowstorm", "storm"], "correct_choice_idx": 3, "direct_answers": ["storm", "storm", "thunder storm", "thunder storm", "storm", "storm", "rain", "storm", "storm", "storm"], "difficult_direct_answer": false, "rationales": ["Many zebras standing in a grassy field. there are many grey clouds rolling in above them.", "Large clouds can be seen in a sky. the sky is almost completely covered in clouds.", "Clouds have nothing to do with earthquakes. these animals are zebras that live in an area that is not affected by snow or hurricanes."], "image": "train2014/COCO_train2014_000000046004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535312, "question_id": "8zYvqw2CMhRCXabuYt6Amk", "question": "What do the images show?", "choices": ["apartment", "car", "animal", "forest"], "correct_choice_idx": 0, "direct_answers": ["living rooms", "four rooms", "apartment", "inside apartment", "rooms", "living spaces", "household items", "rooms", "organized", "inside house"], "difficult_direct_answer": true, "rationales": ["It is a small living space.", "Based on the items displayed in the rooms it looks like a residential setting. based on the size of the rooms and the layout it is likely a smaller residence in line with answer a.", "The images depict the interior rooms of a person's apartment unit."], "image": "val2014/COCO_val2014_000000535312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322771, "question_id": "8zaErjrQju22kVZCs89LeG", "question": "What is the grey object on top of the Star Wars book used for?", "choices": ["wrestling", "painting", "exercising", "gaming"], "correct_choice_idx": 3, "direct_answers": ["gameboy", "gaming", "play games", "playing games", "tape recorder", "video game", "games", "video game", "games", "video games"], "difficult_direct_answer": false, "rationales": ["The grey object is a handheld nintendo console.", "It's to play hand held games.", "The item looks like a gameboy and has the expected button layout and color scheme expected of the device."], "image": "train2014/COCO_train2014_000000322771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213780, "question_id": "8zgMEXEx3eCjGtHZqmFxa3", "question": "Can the car go back the way it came from this spot?", "choices": ["make u-turn", "reverse", "make k-turn", "no u-turn"], "correct_choice_idx": 3, "direct_answers": ["no", "no", "yes", "yes", "no", "no", "no", "no u-turn", "no", "no"], "difficult_direct_answer": false, "rationales": ["The sign hanging on the light says the car can not turn around right there and go the opposite direction.", "The sign says no u turns are allowed.", "It can not turn around per the sign with the arrow crossed out."], "image": "train2014/COCO_train2014_000000213780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115378, "question_id": "92QhpbPqgEPJDk4GECaMSb", "question": "What is the difference of these two cats?", "choices": ["species", "eyes", "animal", "breed"], "correct_choice_idx": 3, "direct_answers": ["breed", "their colors", "color", "fur", "coloring", "fur patterns", "color", "color", "colors", "color"], "difficult_direct_answer": false, "rationales": ["One cat is gray and bigger while the other cat is black with brown and smaller indicated that they are not the same species.", "All pet cats are the same species and animal. the two cats have the same eye color.", "The cats are in different shapes and colors."], "image": "train2014/COCO_train2014_000000115378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395311, "question_id": "92pWEJnGHckUWhhqCu326E", "question": "Why are the animals wearing colored shirts?", "choices": ["to constrict", "for warmth", "for style", "to compete"], "correct_choice_idx": 3, "direct_answers": ["racing", "tell apart", "teams", "to compete", "race uniforms", "identify racers", "race", "racing", "identification", "racing"], "difficult_direct_answer": false, "rationales": ["The animals are competing.", "The dogs are on an obstacle course and an audience can be seen in the background.", "The animals are racing, and the shirts make each one easy to identify."], "image": "train2014/COCO_train2014_000000395311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10463, "question_id": "92q366boyDLdEeaaVrumEH", "question": "What form of heat does this space have?", "choices": ["none", "radiator", "wood", "gas furnace"], "correct_choice_idx": 1, "direct_answers": ["water", "radiator", "radiator", "radiator", "radiator", "steam", "heating vent", "natural gas", "radiated", "radiator"], "difficult_direct_answer": false, "rationales": ["The space is warm because of the radiator.", "A large metal rectangular piece with slants up and down supply the heat", "The space has a radiator in the corner."], "image": "val2014/COCO_val2014_000000010463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506335, "question_id": "92s2VkQ3GevGnaYAQRxeFQ", "question": "How are these vehicles propelled forwards?", "choices": ["peddling", "wind", "motor", "solar power"], "correct_choice_idx": 2, "direct_answers": ["pedal", "bike", "motor", "motor", "with motors", "speed drive", "motor", "motors", "with gas", "motors"], "difficult_direct_answer": false, "rationales": ["The bike uses a motor.", "These two-wheeled vehicles do not have pedals, sails, or photovoltaic panels.", "The vehicles do not have pedals, sails, or solar panels."], "image": "val2014/COCO_val2014_000000506335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108130, "question_id": "932azx3jmtkjpbPpKh4VQ3", "question": "What is the man in very dark green and blue shoes doing with the frisbee?", "choices": ["catching it", "throwing forward", "juggling", "hiding it"], "correct_choice_idx": 2, "direct_answers": ["catching", "juggling", "throwing", "juggling", "throwing", "catching", "throwing it", "catching", "tricks", "throwing it"], "difficult_direct_answer": false, "rationales": ["Due to the raised knee with hands reaching under it we can conclude this man is juggling his frisbee.", "The man is trying to juggle behind his back.", "A man is bent over with a frisbee throwing it back and forth behind himself, trying to catch it."], "image": "val2014/COCO_val2014_000000108130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486986, "question_id": "9344HVM5GzxezdPux35dPX", "question": "What two general types of trees are shown?", "choices": ["tulip daffodil", "monkey puzzle", "magnolia", "deciduous evergreen"], "correct_choice_idx": 3, "direct_answers": ["pine", "deciduous evergreen", "pine spruce", "fir deciduous", "pine", "pine birch", "pine", "evergreen birch", "pine", "pine evergreen"], "difficult_direct_answer": false, "rationales": ["The trees shown are green and there is snow. evergreens are green in the winter. deciduous trees grow where evergreens are.", "The trees are evergreens.", "The one tree is a pinetree."], "image": "val2014/COCO_val2014_000000486986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294978, "question_id": "934i8GHrPuSqkJFpdxEony", "question": "What does the item with silver doors regulate?", "choices": ["humidity", "bacterial content", "temperature", "blood pressure"], "correct_choice_idx": 2, "direct_answers": ["temperature", "temperature", "temperature", "temperature", "temperature", "cold temperatures", "temperature", "temperature", "temperature", "coolness"], "difficult_direct_answer": false, "rationales": ["It's a refrigerator to keep food cold.", "The item is for temperature.", "It keeps foods cold."], "image": "train2014/COCO_train2014_000000294978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183237, "question_id": "935dHpiqDX2jPnNYxZqUWd", "question": "What beverage are they most likely consuming?", "choices": ["beer", "champagne", "juice", "iced-tea"], "correct_choice_idx": 1, "direct_answers": ["alcohol", "wine", "champagne", "wine", "wine", "wine", "wine", "alcohol", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["The person on the left is holding a bottle of alcohol. the bottle is bigger than a beer bottle.", "The beige gold alcohol bottle the woman holds and the amber fizzy nature of the beverage the man holds tell us they're likely celebrating with champagne on this boat.", "The woman's holding a champagne bottle so it's likely that their glasses are filled with champagne."], "image": "train2014/COCO_train2014_000000183237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239757, "question_id": "939uMUMGngKx7PGHFu8evy", "question": "What is he about to do?", "choices": ["clean bed", "shave", "brush teeth", "sing song"], "correct_choice_idx": 2, "direct_answers": ["brush teeth", "move", "move toy", "fix something", "fix", "laugh", "brush teeth", "getting trouble", "work", "talk"], "difficult_direct_answer": true, "rationales": ["He is holding a toothbrush in his hand, indicating that his most likely next move would be to use it.", "The little guy is holding a toothbrush in his hand.", "The kid sleeps in a crib, not a bed, and is too young to shave. he is holding an oral hygiene device."], "image": "train2014/COCO_train2014_000000239757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61938, "question_id": "93CnQP8TUqoUcgG8go9Ej5", "question": "In which country does this bus travel?", "choices": ["united states", "belize", "chile", "great britain"], "correct_choice_idx": 3, "direct_answers": ["china", "england", "england", "england", "great britain", "england", "uk", "england", "uk", "united kingdom"], "difficult_direct_answer": false, "rationales": ["The display on the front of the bus says chawton. chawton in a village in hampshire, england.", "The bus's text indicates that it goes through britain.", "The country is in britain."], "image": "val2014/COCO_val2014_000000061938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292945, "question_id": "93GaVJFzsWpesRzLfZavvY", "question": "What do persons here do?", "choices": ["sunbathe", "race", "fish", "sell water"], "correct_choice_idx": 0, "direct_answers": ["sunbathe", "sunbathe", "relax", "laying", "sit", "rest", "sunbathe", "sit", "stretch", "sunbathe"], "difficult_direct_answer": false, "rationales": ["They are resting and getting some sunshine.", "The people are resting on benches. the water is for their personal use.", "The person is sunbathing."], "image": "val2014/COCO_val2014_000000292945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196510, "question_id": "93J3Twk3XT6RPRBCFYHn3S", "question": "If you were riding one of these how could you stop it?", "choices": ["squeeze handle", "call help", "jump off", "pedal harder"], "correct_choice_idx": 0, "direct_answers": ["use brakes", "brakes", "brakes", "brakes", "squeeze handle", "handle brakes", "handle brakes", "press brakes", "brakes", "brakes"], "difficult_direct_answer": false, "rationales": ["The bikes have break levers on the handlebars. when a person applies pressure to the lever, it causes the bike to slow or break.", "The handles have brakes.", "A person is standing near bikes. bike brakes are used by squeezing the handle."], "image": "train2014/COCO_train2014_000000196510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477069, "question_id": "93Xt3a77w6kYXCTTMmWbHT", "question": "What is the child telling the man?", "choices": ["is lost", "nice umbrella", "is hungry", "apologizing"], "correct_choice_idx": 2, "direct_answers": ["is hungry", "impossible", "thank you", "hello", "stay", "i'm hungry", "unsure", "unknown", "religion", "needs help"], "difficult_direct_answer": true, "rationales": ["The man looks puzzled by the child's request and unprepared for the interaction.", "The boy looks like he has nothing.", "He may be hungry and asking about that."], "image": "val2014/COCO_val2014_000000477069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232910, "question_id": "93ejZXPJtJLvHxVr2sJrFD", "question": "Where is the person walking?", "choices": ["river", "forest", "subway", "roadway"], "correct_choice_idx": 3, "direct_answers": ["left", "hotel", "across street", "roadway", "across", "street", "across street", "across street", "sri lanka", "on street"], "difficult_direct_answer": false, "rationales": ["A man is on the phone and walking across street to a vehicle.", "The person is walking in a paved street where there are road vehicles clearly visible.", "You can tell by the background as to where he is walking."], "image": "train2014/COCO_train2014_000000232910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343031, "question_id": "93fJoWvhqZcYZAyjunjMAV", "question": "The purpose of the train car behind the train engine is to hold what?", "choices": ["water", "coal", "cargo", "passengers"], "correct_choice_idx": 1, "direct_answers": ["cargo", "coal", "coal", "cargo", "cargo", "fuel", "coal", "coal", "cargo", "coal"], "difficult_direct_answer": false, "rationales": ["That train car holds the fuel for the engine. cargo and passengers are held in other cars.", "The train car carries coal to burn for fuel.", "The purpose of the train behind the engine is a storage place for coal that fuels the engine."], "image": "val2014/COCO_val2014_000000343031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464605, "question_id": "93mEB2FN6TZdEMPV4piSZg", "question": "Why is the man holding onto the ski?", "choices": ["performing trick", "cleaning it", "waxing it", "unlatching"], "correct_choice_idx": 0, "direct_answers": ["keep below", "doing trick", "freestyle competition", "stability", "performing trick", "keep balance", "control", "balance", "completing trick", "safe"], "difficult_direct_answer": true, "rationales": ["A man on skis is in the air.", "The man is doing a trick.", "The man appears in the air and appears to have gone over a jump. skiers who are airborne over a jump are likely intending to perform a skill."], "image": "train2014/COCO_train2014_000000464605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2135, "question_id": "945wczsZXfpDaJ2fXhbkrx", "question": "What type of weather is the woman holding the umbrella protecting them against?", "choices": ["rain", "sun", "wind", "snow"], "correct_choice_idx": 1, "direct_answers": ["sun", "sun", "sun", "sunshine", "sun", "sunny", "sun", "sunny", "sunny", "sun"], "difficult_direct_answer": false, "rationales": ["It looks to be a nice day and the woman's umbrella is providing her shade.", "Any of these answers could be true but due to the sunny day we get the answer we do.", "The weather is sunny."], "image": "train2014/COCO_train2014_000000002135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89790, "question_id": "947aZ9GPP2e5vtwQ6umrvp", "question": "What type of activity was the vehicle here designed for originally?", "choices": ["luxury dining", "war", "racing", "fire fighting"], "correct_choice_idx": 1, "direct_answers": ["driving", "army transport", "war", "military", "war", "war", "warfare", "transporting military", "war", "military activity"], "difficult_direct_answer": false, "rationales": ["Jeeps were originally founded as military vehicles for use in wwii.", "The vehicle's paint is an olive drab color. this color allows military vehicles to be less visible.", "The drab green color scheme and heavy duty attributes of this truck tell us it was once used in the military."], "image": "val2014/COCO_val2014_000000089790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184585, "question_id": "949aScE2TyCSHvennbxevs", "question": "What action is he taking with the board?", "choices": ["bounce", "throw", "kick", "flip"], "correct_choice_idx": 3, "direct_answers": ["jumping", "trick", "jump", "kickflip", "skating", "kick flip", "jumping", "kickflip", "jumping", "flip"], "difficult_direct_answer": false, "rationales": ["He is flipping his board over.", "You can see his board is upside down and he is flipping it over.", "The skateboard is flipped upside down."], "image": "train2014/COCO_train2014_000000184585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581481, "question_id": "94LTvZnGVDMpCjcMWorKa5", "question": "What are the green veggies on the sandwich called?", "choices": ["cucumbers", "brussels sprouts", "spinach", "lettuce"], "correct_choice_idx": 0, "direct_answers": ["cucumbers", "cucumbers", "cucumbers", "pickles", "cucumbers", "cucumbers", "cucumbers", "cucumber", "pickles", "cucumbers"], "difficult_direct_answer": false, "rationales": ["The vegetables are crunchy, light green and do not have leaves.", "There are sliced green watery fruits on the bread.", "The green veggies are slices of a cylindrical vegetable with seeds in the middle."], "image": "train2014/COCO_train2014_000000581481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174048, "question_id": "94MDcPnvRb3Hu2wxLT695U", "question": "What is the silver object on the green napkin used for?", "choices": ["folding", "stirring", "flipping", "cutting"], "correct_choice_idx": 3, "direct_answers": ["cutting", "cutting pizza", "cutting pizza", "cutting", "cutting pizza", "pizza cutter", "pizza cutter", "cut pizza", "cutting", "cutting pizza"], "difficult_direct_answer": false, "rationales": ["There is a pizza cutter on the green napkin.", "The object is for cutting.", "The rounded blade is likely a pizza cutter based on the presence of the pizza."], "image": "train2014/COCO_train2014_000000174048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422307, "question_id": "94TxHDpXzRxyD6SUnhcZTs", "question": "What are the horses doing?", "choices": ["feeding", "resting", "pulling surfers", "pulling sleds"], "correct_choice_idx": 3, "direct_answers": ["walking", "walking", "walking", "walking", "walking", "walking", "walking", "walking", "pulling sleds", "walking"], "difficult_direct_answer": false, "rationales": ["The horses are on the beach. the people near them appear to be in wetsuits.", "Horses are on the beach pulling surfers.", "The horses are pulling sleds."], "image": "train2014/COCO_train2014_000000422307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234953, "question_id": "954v2HhkRjoYekdSk4V44y", "question": "Who employs the man in the yellow vest?", "choices": ["traffic control", "no one", "airport", "city"], "correct_choice_idx": 2, "direct_answers": ["airport", "airline", "airport", "airport", "airport", "airport", "airport", "airport", "airport", "airport"], "difficult_direct_answer": false, "rationales": ["The area shows a belt with luggage.", "The man is at a place that has planes.", "The man is employed by the airport he is handling luggage."], "image": "val2014/COCO_val2014_000000234953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197636, "question_id": "958MP59Lhenreo7TqzqiGH", "question": "How many people are visible?", "choices": ["five", "22", "ten", "15"], "correct_choice_idx": 2, "direct_answers": ["ten", "ten", "seven", "seven", "ten", "ten", "ten", "ten", "ten", "ten"], "difficult_direct_answer": false, "rationales": ["There are a total of ten people in the water.", "You can count all of them that are wading in the shallow part of the water.", "There are 4 people holding hands, 2 people standing next to each other, 1 person standing alone, and 3 people off to the far right."], "image": "train2014/COCO_train2014_000000197636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581183, "question_id": "95CKaGomWfkLA62t2vBXgK", "question": "What are the elephants doing?", "choices": ["foraging", "performing", "mating", "stampeding"], "correct_choice_idx": 1, "direct_answers": ["performing", "standing", "performing", "circus tricks", "performing", "performing", "standing", "standing", "performing", "circus act"], "difficult_direct_answer": false, "rationales": ["The elephants are at a circus doing their routine.", "They and the people on stage are wearing costumes and standing on circus apparatus.", "The elephants are in a circus so that's what they do."], "image": "val2014/COCO_val2014_000000581183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417164, "question_id": "95KSWGbyJod87mTgNdwkUr", "question": "What is the usual method to pay for parking here?", "choices": ["food stamps", "pennies", "credit card", "nickels"], "correct_choice_idx": 2, "direct_answers": ["credit card", "card", "card", "credit card", "credit card", "credit card", "credit card", "credit card", "card", "credit card"], "difficult_direct_answer": false, "rationales": ["People use their cards a lot.", "This is the only option mentioned on the meter; there is nowhere to insert coins.", "The method is credit card."], "image": "val2014/COCO_val2014_000000417164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563045, "question_id": "95L3UZj2F9vm7d4phfLi9L", "question": "What is the person to the far left sitting on?", "choices": ["bench", "chair", "cardboard box", "boat"], "correct_choice_idx": 3, "direct_answers": ["kayak", "canoe", "kayak", "raft", "kayak", "kayak", "boat", "cannot", "kayak", "kayak"], "difficult_direct_answer": false, "rationales": ["The people on the left are inside of a raft floating.", "This is a kayak for one person", "This is the only object one would reasonably use in or on the water."], "image": "train2014/COCO_train2014_000000563045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447089, "question_id": "95ZdgUrFeu3ch7vwMg6u4m", "question": "Human beings can exercise their freedom of speech by forming together to partake in what?", "choices": ["dance", "protest", "parade", "playing"], "correct_choice_idx": 1, "direct_answers": ["protest", "protest", "protest", "protest", "protest", "protest", "protest", "protest", "protest", "protest"], "difficult_direct_answer": false, "rationales": ["People are holding protest signs.", "All the people gathered with signs and such are out to protest.", "They're holding up signs."], "image": "val2014/COCO_val2014_000000447089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362604, "question_id": "95fcYYtym9Bq4xKeVGsgMr", "question": "What is the train parked outside of?", "choices": ["street", "mall", "garage", "station"], "correct_choice_idx": 3, "direct_answers": ["station", "train station", "platform", "train station", "station", "train", "train station", "train station", "station", "depot"], "difficult_direct_answer": false, "rationales": ["There are people waiting around a building and platform next to the train.", "There is a building and passengers next to the track", "There are people waiting to board the train."], "image": "train2014/COCO_train2014_000000362604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267042, "question_id": "95rHxmkneDP37VLp4kSmgk", "question": "What kind of label is on the desk?", "choices": ["instructional", "directional", "brand", "regulatory"], "correct_choice_idx": 2, "direct_answers": ["toshiba", "mousepad", "toshiba", "brand", "mousepad", "toshiba", "toshiba", "mousepad", "toshiba", "toshiba"], "difficult_direct_answer": false, "rationales": ["A toshiba pad is near a computer. toshiba is a brand.", "That is the name of a company that makes electronics", "The brand is on the label."], "image": "train2014/COCO_train2014_000000267042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358817, "question_id": "95uBdo4hQ6fCmsKr8UBHeG", "question": "What should the far away vehicles do seeing this traffic light?", "choices": ["cross intersection", "speed up", "stop slowly", "stop immediately"], "correct_choice_idx": 2, "direct_answers": ["stop", "slow down", "stop", "slow down", "slow down", "slow down", "stop slowly", "stop", "slow", "slow down"], "difficult_direct_answer": false, "rationales": ["Yellow means to slow down.", "The vehicles should stop slowly.", "A yellow light means to slow down to stop."], "image": "val2014/COCO_val2014_000000358817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190497, "question_id": "96e5AEoYaopoAqb3BsAEhW", "question": "How many cows are mostly white in the image?", "choices": ["five", "ten", "twleve", "one"], "correct_choice_idx": 3, "direct_answers": ["three", "one", "three", "two", "one", "two", "three", "two", "two", "one"], "difficult_direct_answer": false, "rationales": ["There is only a single cow that is mostly white.", "There is only one cow with little to no spots.", "There is one white one in the middle."], "image": "val2014/COCO_val2014_000000190497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413225, "question_id": "96saeCEMLxnDhiKZLBoRib", "question": "What vehicle could all of these people fit in?", "choices": ["palatov d1", "unicycle", "volkswagen tiguan", "bac mono"], "correct_choice_idx": 2, "direct_answers": ["boat", "volkswagen tiguan", "boat", "boat", "boat", "car", "boat", "car", "tandem bicycle", "bus"], "difficult_direct_answer": false, "rationales": ["Based on the list of answer options and the vehicle capacity they have and comparing to the number of people visible, answer a is the most viable option.", "This is a vehicle that holds at least 2 people", "There are two people. unicycles, bac monos, and palatov des are single-seated vehicles."], "image": "val2014/COCO_val2014_000000413225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125118, "question_id": "96t3CRsXU6y27YAWtwDkhP", "question": "What are these buildings mostly made of?", "choices": ["plastic", "gingerbread", "fiberglass", "gingersnap"], "correct_choice_idx": 1, "direct_answers": ["gingerbread", "gingerbread", "gingerbread", "gingerbread", "gingerbread", "gingerbread", "gingerbread", "gingerbread cookies", "gingerbread cookies", "gingerbread"], "difficult_direct_answer": false, "rationales": ["The buildings are made of gingerbread.", "The buildings on displays have sugar and decorations on them. this would lead one to believe they are than made of gingerbread.", "The buildings are made of gingerbread."], "image": "train2014/COCO_train2014_000000125118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291841, "question_id": "96zrTrNk5xSHdQo467RveE", "question": "Who do the bikes likely belong to?", "choices": ["farmers", "children", "chefs", "passengers"], "correct_choice_idx": 3, "direct_answers": ["passengers", "passengers", "community", "passengers", "bus riders", "passenger", "passengers", "passengers", "passengers", "passengers"], "difficult_direct_answer": false, "rationales": ["The bikes are the passengers'.", "Passengers put the bikes on the bus for transportation while they ride.", "The bikes are on the front of a bus. they are too big for children."], "image": "train2014/COCO_train2014_000000291841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250922, "question_id": "96zxkB4c4zxdFzELwhFRNJ", "question": "What is the country of origin of pigs in a blanket?", "choices": ["italy", "britain", "france", "germany"], "correct_choice_idx": 2, "direct_answers": ["czechoslovakia", "no idea", "france", "germany", "england", "germany", "germany", "germany", "germany", "czechoslovakia"], "difficult_direct_answer": false, "rationales": ["France is known for pigs in a blanket.", "A google search provided answer b.", "These hot dogs are wrapped in crescent rolls. this food is called pigs in a blanket and it likely originated in germany."], "image": "train2014/COCO_train2014_000000250922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536752, "question_id": "975WMiKA3Vht4nUdsXNtK2", "question": "What facial expression is the man in the red jacket exhibiting?", "choices": ["sleeping", "crying", "frowning", "smiling"], "correct_choice_idx": 3, "direct_answers": ["smile", "smiling", "smile", "smile", "smiling", "smile", "happy", "smiling", "smiling", "smile"], "difficult_direct_answer": false, "rationales": ["His lips are parted, stretching across his face.", "The man is awake and does not have tears on his face. he is showing his teeth.", "The man in the red jacket looks happy and has a smile on his face."], "image": "train2014/COCO_train2014_000000536752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474953, "question_id": "97JzfWAtvDUoJYWQtshP9c", "question": "What is the cameraman most at risk of getting hit by?", "choices": ["baseball bat", "fist", "baseball", "car"], "correct_choice_idx": 2, "direct_answers": ["baseball", "baseball", "baseball", "ball", "softball", "bat", "ball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["A guy is filming behind the plate as a ball is being tossed in his direction.", "The batter probably will not throw the bat. there are no cars near these people.", "Because he is directly standing in the direction the ball will go."], "image": "val2014/COCO_val2014_000000474953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155470, "question_id": "97LUtN72jJBDJBR4277BPH", "question": "What does the athlete have around both of his arms?", "choices": ["bracelets", "handcuffs", "towels", "wristbands"], "correct_choice_idx": 3, "direct_answers": ["sweat bands", "wrist band", "wristband", "wristbands", "wraps", "arm bands", "sweatbands", "sweat bands", "wristbands", "wristbands"], "difficult_direct_answer": false, "rationales": ["The athlete's arms have white bands on them.", "He has wristband on his wrists", "The person wears that for his muscles."], "image": "train2014/COCO_train2014_000000155470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335395, "question_id": "97NW3ZfbHLXAVRWsHYWCSy", "question": "The pattern on the shoes looks like what?", "choices": ["chess board", "ham", "pizza box", "boat"], "correct_choice_idx": 0, "direct_answers": ["checkerboards", "checkerboard", "checkerboard", "checker board", "checkerboard", "chess board", "chess board", "checker board", "checkerboard", "checkers"], "difficult_direct_answer": false, "rationales": ["This is a checkered pattern on this persons shoes.", "They are checkered shoes.", "That is sometimes what the checkerboard pattern is called."], "image": "train2014/COCO_train2014_000000335395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66345, "question_id": "97NZUL9Lymfyj34DTTEdgw", "question": "What is the player ready to do?", "choices": ["sprint", "bat", "swing", "dribble"], "correct_choice_idx": 2, "direct_answers": ["hit ball", "hit ball", "hit ball", "swing", "miss point", "hit ball", "hit ball", "serve", "hit ball", "accept defeat"], "difficult_direct_answer": false, "rationales": ["The person is holding a racket and a ball appears to be approaching which means they will likely have to perform answer a in accordance with the rules and purpose of the game.", "The ball is getting a close to the man so he is about to swing.", "The player wants to swing."], "image": "train2014/COCO_train2014_000000066345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302383, "question_id": "97TcQVUGMvjvYLR9YV5PMd", "question": "What does the information on the blue legged placard describe?", "choices": ["bus", "motorcycle", "menu", "protest"], "correct_choice_idx": 0, "direct_answers": ["bus history", "bus", "bus", "trivia", "vehicle's importance", "bus", "destination", "fares", "vehicle's importance", "bus"], "difficult_direct_answer": false, "rationales": ["The information is on rawtenstall bus.", "The placard is near the bus.", "I can't read the placard. so, i can only guess through process of elimination."], "image": "train2014/COCO_train2014_000000302383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480311, "question_id": "97ZhBc2Jii2xCczaeMVGJv", "question": "What relation are the two adult men shown in context to the boys?", "choices": ["students", "prisoners", "teachers", "strangers"], "correct_choice_idx": 2, "direct_answers": ["teachers", "teachers", "teachers", "teachers", "teachers", "teachers", "teachers", "teachers", "teachers", "teachers"], "difficult_direct_answer": false, "rationales": ["They are in a school and there are only a few of them compared to many children.", "The two men are older than the children and are wearing suits. prisoners and strangers would not be included in a class photo.", "The boys are all dressed up and are taking a class photo. the adult men are also in the photo because they are the people responsible for educating the boys."], "image": "val2014/COCO_val2014_000000480311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20611, "question_id": "984WyUc7UyuN4Ja5AJwZ2b", "question": "What event are these snowboarders competing in?", "choices": ["slalom", "half pipe", "big air", "super-g"], "correct_choice_idx": 0, "direct_answers": ["slalom", "usasa", "slalom", "race", "snowboarding", "slalom", "slalom", "half pipe", "downhill race", "scatting"], "difficult_direct_answer": false, "rationales": ["They are racing around markers.", "The blue and red triangle flags these athletes slip inbetween tell us they are competing in a slalom.", "Skiers are skiing on one ski."], "image": "train2014/COCO_train2014_000000020611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298051, "question_id": "98G2LdD7bTV96v2BZQviAb", "question": "What is the name of a band with this number of members?", "choices": ["quartet", "cinqtet", "duet", "sextet"], "correct_choice_idx": 0, "direct_answers": ["queen", "beatles", "quartet", "quartet", "quartet", "beatles", "londsotolley", "beatles", "drums", "beatles"], "difficult_direct_answer": false, "rationales": ["There are four of them.", "There are 4 people on the stage performing", "You can see four members on the stage performing."], "image": "val2014/COCO_val2014_000000298051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148531, "question_id": "98YKg8f6LtSFSPTS4DH4Kg", "question": "Where is this vehicle parked?", "choices": ["backyard", "parking lot", "airfield", "theme park"], "correct_choice_idx": 2, "direct_answers": ["on runway", "runway", "airway", "air way", "runway", "field", "airfield", "ground", "pavement", "airport"], "difficult_direct_answer": true, "rationales": ["Based on the vehicle itself and the wide open flat space in the background and the material and painted lines underneath the vehicle answer a is associated with all these features.", "An airplane is parked on a paved runway with yellow lines printed on it.", "The yellow lines painted on the pavement are meant for aircraft."], "image": "val2014/COCO_val2014_000000148531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231572, "question_id": "98sMVXkQfCGPFoBrtgs2QH", "question": "What country would this plane hail from?", "choices": ["england", "germany", "italy", "france"], "correct_choice_idx": 0, "direct_answers": ["britain", "uk", "england", "united kingdom", "united kingdom", "england", "uk", "great britain", "england", "united kingdom"], "difficult_direct_answer": false, "rationales": ["The plane says british airways on it.", "British airways is in england.", "The plane says british airways on the side which is based out of england."], "image": "train2014/COCO_train2014_000000231572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387223, "question_id": "98u5TuvaqzsTVof73azUN4", "question": "These animals are in a formation that is reminiscent of what sport?", "choices": ["ping pong", "tennis", "archery", "football"], "correct_choice_idx": 3, "direct_answers": ["skating", "rugby", "rugby", "football", "football", "rugby", "football", "football huddle", "rugby", "football"], "difficult_direct_answer": false, "rationales": ["The sheep are huddling each other.", "It's the formation you see when the team is figuring out what play to use next.", "Huddles occur in football."], "image": "val2014/COCO_val2014_000000387223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235522, "question_id": "98yLdmrpvB9tQizrdsFT83", "question": "How many giraffes can clearly be seen grazing in the area?", "choices": ["five", "six", "four", "seven"], "correct_choice_idx": 0, "direct_answers": ["five", "five", "five", "one", "five", "one", "one", "four", "one", "one"], "difficult_direct_answer": false, "rationales": ["Two giraffes are immediately apparent in this image in the foreground and to the left. two more overlap each other in the middle left, and one more is in the center background.", "That's how many giraffes are in the picture.", "There are four giraffes in the front and one in the back."], "image": "val2014/COCO_val2014_000000235522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173693, "question_id": "994PtGpGoyrhxxRdbyFwXq", "question": "What type of event is being held?", "choices": ["play", "race", "concert", "parade"], "correct_choice_idx": 1, "direct_answers": ["horse race", "horse racing", "horse racing", "horse race", "horse racing", "horse race", "race", "horse race", "horse race", "horse racing"], "difficult_direct_answer": false, "rationales": ["There are horses in the area that are usually for racing.", "The horses are competing at the track to see who can run the fastest.", "The people are at a horse race event and there is a horse on the track in front of an audience."], "image": "val2014/COCO_val2014_000000173693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46941, "question_id": "99FwDB8XZEE4KBwRwoWXgQ", "question": "What holiday is likely being celebrated here?", "choices": ["christmas", "april fools", "columbus day", "indigenous people's"], "correct_choice_idx": 0, "direct_answers": ["christmas", "christmas", "birthday", "christmas", "christmas", "thanksgiving", "christmas", "christmas", "christmas", "christmas"], "difficult_direct_answer": false, "rationales": ["The red and white to the left side of the image implies this, but it's difficult to tell.", "People are sitting together all wearing long sleeved shirts and coats. one is wearing red and white.", "There is a red and white decoration on the couch. red and white are christmas colors."], "image": "val2014/COCO_val2014_000000046941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159627, "question_id": "99eCeCiQXYT4kMyCYWS4Mv", "question": "What is the poster on the front of the bus advertising?", "choices": ["beach", "movie", "amusement park", "mall"], "correct_choice_idx": 2, "direct_answers": ["rollercoaster", "knot's", "amusement park", "amusement park", "roller coaster", "coasters", "roller coaster", "amusement park", "knot's", "roller coasters"], "difficult_direct_answer": false, "rationales": ["The advertisement is for knott's berry farm. it is showing their roller coasters.", "Coaster is another work for roller coaster.", "Amusement parks are often advertised around cities."], "image": "val2014/COCO_val2014_000000159627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55294, "question_id": "99kio49MHCJxstQYnkAt4a", "question": "What are her boots made from?", "choices": ["cloth", "leather", "rubber", "vinyl"], "correct_choice_idx": 2, "direct_answers": ["rubber", "boots", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "boots"], "difficult_direct_answer": false, "rationales": ["The boots are shiny and plastic looking.", "She is wearing rain boots that are made of material that is waterproof.", "A girl is wearing galoshes while she waits."], "image": "train2014/COCO_train2014_000000055294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108303, "question_id": "99nQx8GCLJUg4Y2vh9SKve", "question": "What is the fork next to?", "choices": ["chili", "cherry pie", "cake", "steak"], "correct_choice_idx": 2, "direct_answers": ["cake", "cake", "cake slice", "cake", "cake", "cake", "cake", "cake", "cake", "cake"], "difficult_direct_answer": false, "rationales": ["The fork is next to a piece of fluffy cake.", "A fork is on a plate next to a square piece of dessert with frosting.", "It is a slice of sponge dessert topped with icing"], "image": "val2014/COCO_val2014_000000108303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86745, "question_id": "99xkNEsHmvKGFPac8Kq7zd", "question": "Where are these people walking?", "choices": ["mall", "rental office", "airport", "grocery store"], "correct_choice_idx": 2, "direct_answers": ["airport", "sky bridge", "airport", "airport", "airport", "going home", "airport", "terminal", "airport", "overpass"], "difficult_direct_answer": false, "rationales": ["The people are in the airport.", "You can see the terminal in the back to the left.", "You can tell where they are by the planes in the background."], "image": "train2014/COCO_train2014_000000086745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207785, "question_id": "9AZAkQFpBsy2maHbDToUhL", "question": "What is the man who stands doing?", "choices": ["posing", "waiting", "making speech", "singing"], "correct_choice_idx": 2, "direct_answers": ["making speech", "speech", "speech", "speaking", "speaking", "speaking", "speaking", "speed", "speech", "spokesperson"], "difficult_direct_answer": false, "rationales": ["He is holding a microphone and talking to the crowd.", "He's standing at a podium with a microphone in his hands.", "He can be identified as barack obama, and the signs show it was the year 08 when he was running for office. the microphone suggests he is speaking to a large crowd."], "image": "val2014/COCO_val2014_000000207785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577083, "question_id": "9AdX6tHm9kFMn3feMnSiab", "question": "What is the elephants trunk doing?", "choices": ["curling", "spraying", "drawing", "grabbing"], "correct_choice_idx": 0, "direct_answers": ["curling", "curling", "touching elephant", "eating", "curling", "curling in", "feeding", "curled", "rolled", "curling"], "difficult_direct_answer": false, "rationales": ["It is pulled up towards his body.", "The elephant's trunk is folded under and twisted around, which is a curling motion.", "There are a bunch of kids standing behind an elephant. the trunk is pulled into its body by rolling trunk."], "image": "train2014/COCO_train2014_000000577083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10779, "question_id": "9Ae2nGzU5WnW6rEuqCZhk9", "question": "Why does the small child have a moustache?", "choices": ["is stolen", "looks nice", "found it", "is fake"], "correct_choice_idx": 3, "direct_answers": ["dress up", "dress up", "fake", "fake", "drawn on", "is fake", "costume", "halloween costume", "dress up", "disguise"], "difficult_direct_answer": false, "rationales": ["A child is wearing a tie and has an awkwardly placed mustache.", "The child's mustache is fake.", "The child's mustache is clearly not real."], "image": "val2014/COCO_val2014_000000010779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197950, "question_id": "9AkMfii8UTSUGp3xqpwq4M", "question": "What athlete had the same number that the man on the right has on his arm?", "choices": ["john smoltz", "wayne gretzky", "babe ruth", "michael jordan"], "correct_choice_idx": 2, "direct_answers": ["babe ruth", "babe ruth", "babe ruth", "irving", "babe ruth", "mickey mantel", "anthony", "three", "babe ruth", "number 3"], "difficult_direct_answer": false, "rationales": ["A man is in a blue shirt with the number three on the sleeve. babe ruth wore the number three on his uniform.", "He also had the number 3", "He had the three on his jersey because he batted third."], "image": "train2014/COCO_train2014_000000197950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534533, "question_id": "9AoCZYXEhZwdT4d3aXJJpS", "question": "The controllers are fashioned like a weapon that can do what?", "choices": ["slash", "shoot bullets", "burn", "shoot arrows"], "correct_choice_idx": 1, "direct_answers": ["shoot", "shoot", "play videogames", "shoot", "play games", "shoot", "shoot bullets", "shoot", "navigate", "shoot videogames"], "difficult_direct_answer": false, "rationales": ["They resemble a gun.", "The game controllers are fastened together to resemble a gun that can shoot bullets in the video game.", "The controllers look like a revolver with a trigger."], "image": "val2014/COCO_val2014_000000534533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436127, "question_id": "9AyoaFXeiLkLDRiyGRj9Pz", "question": "In which state is this street located?", "choices": ["idaho", "illinois", "ohio", "michigan"], "correct_choice_idx": 1, "direct_answers": ["illinois", "illinois", "illinois", "illinois", "illinois", "illonis", "illinois", "illonis", "illinois", "illinois"], "difficult_direct_answer": false, "rationales": ["You can see the wording on the building behind the officer that says something chicago.com.", "Chicago can be seen on a sign in the background and chicago is in illinois.", "The bus behind the horseriding policeman says 'chicago'. this is a city in the state of illinois."], "image": "val2014/COCO_val2014_000000436127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466818, "question_id": "9Ayr3LK3Tro8FCAxiKLhG2", "question": "How are these people related to each other?", "choices": ["coworkers", "classmates", "strangers", "friends"], "correct_choice_idx": 1, "direct_answers": ["conference attendees", "coworkers", "students", "fellow hackers", "lap top", "classmates", "classmates", "students", "students", "classmates"], "difficult_direct_answer": false, "rationales": ["They are all sitting at desks with computers in a large room", "A bunch of people sit at desks in a large room. students sit at desks.", "They are sitting in a classroom together."], "image": "val2014/COCO_val2014_000000466818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543408, "question_id": "9B4fPXpvHuxkAr7u4QXEt3", "question": "In which location are these children?", "choices": ["desert", "inside home", "beach", "mown lawn"], "correct_choice_idx": 3, "direct_answers": ["park", "park", "park", "park", "park", "park", "park", "park", "mown lawn", "park"], "difficult_direct_answer": false, "rationales": ["The kids are in a grassy area.", "A child is walking in a grassy area. lawns have green grass that is mowed.", "A child is walking near green gas that is not very long."], "image": "val2014/COCO_val2014_000000543408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21563, "question_id": "9BFQ7CJL9zBNfqesqq7W7u", "question": "What type of pizza is on the plate?", "choices": ["pepperoni", "sausage", "white", "marinara"], "correct_choice_idx": 2, "direct_answers": ["cheese", "cheese", "white", "white pizza", "cheese pizza", "cheese", "cheese", "cheese", "cheese", "white pizza"], "difficult_direct_answer": false, "rationales": ["There is no other color on the pizza. the cheese and the sauce are white.", "The pizza on the plate has white cheese and white sauce on it instead of red.", "It does not have tomato sauce on it."], "image": "val2014/COCO_val2014_000000021563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159332, "question_id": "9BW6NmKRKQQSsVQ2Ff79wF", "question": "What plant family is this vegetable in?", "choices": ["pumpkin", "nightshade", "cabbage", "pepper"], "correct_choice_idx": 2, "direct_answers": ["broccoli", "cabbage", "brassicaceae", "cruciferous", "brocolli", "brassicaceae", "broccoli", "brassicaceae", "mustards", "cabbage"], "difficult_direct_answer": false, "rationales": ["Broccoli is from the cabbage family of vegetables.", "Both are from brassica oleracea.", "Broccoli is in the cabbage family."], "image": "train2014/COCO_train2014_000000159332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462998, "question_id": "9BacKeCWJqLQCexetYqQNG", "question": "Which side of the image is the warmest?", "choices": ["top", "left", "bottom", "right"], "correct_choice_idx": 1, "direct_answers": ["left", "left", "left", "left", "left", "left", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["The people are wearing warm clothing in the bottom.", "The left side can be easier to tell that the snow has melted.", "The side is the left."], "image": "train2014/COCO_train2014_000000462998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300499, "question_id": "9Bdg2orQRncg69RVHEkaA8", "question": "How is the woman carrying her bag in the rain?", "choices": ["back", "basket", "shoulder", "seat"], "correct_choice_idx": 1, "direct_answers": ["basket", "basket", "basket", "basket", "bike basket", "bike basket", "in basket", "in basket", "basket", "in bicycle"], "difficult_direct_answer": false, "rationales": ["It is on the front of the bike in a wire container", "The basket is attached to the front of the bike.", "That's what they are on the bike for."], "image": "train2014/COCO_train2014_000000300499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567877, "question_id": "9C5EbSbsfDMaqP7wwUgvNC", "question": "What is the man using the device in his hand to do?", "choices": ["light candle", "eat cake", "cut cake", "serve cake"], "correct_choice_idx": 0, "direct_answers": ["lighting candle", "light candles", "cut cake", "knife", "light candles", "cut", "light", "slice cake", "light candle", "light candle"], "difficult_direct_answer": false, "rationales": ["The man is lighting the candle.", "The candle lighter has an extended handle to ensure safety when using fire.", "The man is lighting the candles."], "image": "val2014/COCO_val2014_000000567877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462075, "question_id": "9CFoxg9BScmWhGAXKrY8Qe", "question": "Which team does the catcher play for?", "choices": ["blue jays", "rangers", "mets", "cubs"], "correct_choice_idx": 3, "direct_answers": ["cubs", "cubs", "red sox", "cubs", "blue", "chicago cubs", "blue team", "major league", "cubs", "chicago cubs"], "difficult_direct_answer": false, "rationales": ["There is a chicago cubs logo on his sleeve and he is wearing a typical cubs uniform.", "The catcher wearing pinstripes with a team logo on his sleeve plays for the chicago baseball team that is part of the national league.", "The catcher is wearing a white uniform with the cubs logo on his sleeve."], "image": "train2014/COCO_train2014_000000462075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219894, "question_id": "9CbWRG4WQmijXsoE2tfh8w", "question": "Why does this animal like this location?", "choices": ["shade", "food", "cold", "warmth"], "correct_choice_idx": 3, "direct_answers": ["warmth", "cat", "warmth", "warm", "warm", "warmth", "very warm", "warmth", "warmth", "warm"], "difficult_direct_answer": false, "rationales": ["The laptop gives off heat", "The battery generates heat.", "The cat is sitting on a keyboard. the computer is probably letting off heat."], "image": "val2014/COCO_val2014_000000219894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528142, "question_id": "9CdQCt8qg8RMiTNB6ZsQSC", "question": "What type of transportation is being used?", "choices": ["air", "rail", "road", "water"], "correct_choice_idx": 1, "direct_answers": ["train", "train", "train", "rail", "train", "railway transportation", "train", "railway transportation", "train", "train"], "difficult_direct_answer": false, "rationales": ["The train can requires a specific kind of road, and only the train is visible.", "Trains run on tracks on a railroad. the people are boarding a train.", "The transportation on the right is a train and trains are part of the railway."], "image": "val2014/COCO_val2014_000000528142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370709, "question_id": "9CnNHvDfHxUgUHm5Us9SDJ", "question": "What bird genus is shown here next to the horse?", "choices": ["grus", "laridae", "numenius", "egretta"], "correct_choice_idx": 3, "direct_answers": ["seagull", "heron", "white birds", "stork", "seagull", "crane", "dove", "heron", "egretta", "swan"], "difficult_direct_answer": false, "rationales": ["An egret has a symbiotic relationship of commensalism with most farm animals.", "These birds are egretta since they're in the water.", "The birds next to the horse are baby egrets and are part of the genus egretta."], "image": "val2014/COCO_val2014_000000370709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141240, "question_id": "9CwqJ3mg2RzGdd3wQS3fv6", "question": "What doors are seen in the background?", "choices": ["bathroom", "closet", "bedroom", "kitchen"], "correct_choice_idx": 1, "direct_answers": ["closet", "pull", "closet", "closet", "closet", "pull", "closet", "closet", "closet", "closet"], "difficult_direct_answer": false, "rationales": ["These types of doors are mostly used on closets and this is a bedroom where you find most closets.", "The doors are for a walk-in closet.", "Two cats are sitting on a bed. there are accordion style doors behind."], "image": "train2014/COCO_train2014_000000141240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263970, "question_id": "9DEzhaLtxtmpmxHL9atPh3", "question": "How will the person here get back to where they started?", "choices": ["helicopter", "taxi", "ski", "lift"], "correct_choice_idx": 2, "direct_answers": ["ski", "walk up", "ski lift", "skiing", "ski down", "ski downhill", "ski", "ski down", "skiing down", "ski"], "difficult_direct_answer": false, "rationales": ["Once they're at the top, it's the most likely of only three ways to get back down. the other two options are b and d.", "The person is moving uphill, not downhill. he will use the equipment on his back to return to the starting point.", "The person is going to ski down the mountain since they have skis on their back."], "image": "train2014/COCO_train2014_000000263970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435579, "question_id": "9DMbBMFLwfw24TRqsAKuXW", "question": "Who watches these people while they board on snow?", "choices": ["judges", "coal miners", "no one", "enemies"], "correct_choice_idx": 0, "direct_answers": ["spectators", "other snowboarders", "parents", "spectators", "other snowboarders", "spectators", "spectators", "judges", "spectators", "parents"], "difficult_direct_answer": false, "rationales": ["The judges watch.", "The snow boarders walking up the slope have numbers on their backs. numbers are used for competitions.", "Judges watch the people boarding on snow at this competition."], "image": "train2014/COCO_train2014_000000435579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389760, "question_id": "9DsJV5vQQ6vnAauKo8DSHy", "question": "Where is the baggage being loaded onto?", "choices": ["plane", "truck", "dolly", "conveyor belt"], "correct_choice_idx": 3, "direct_answers": ["luggage trailer", "plane", "plane", "airplane", "conveyor belt", "trolley", "airplane", "conveyor belt", "belt", "conveyor belt"], "difficult_direct_answer": false, "rationales": ["The belt goes up to the plane so they can load or unload easily", "That's the equipment that transports the luggage.", "Here a row of luggage is stacked on a black track. this track convey's the luggage to the two handlers at it's end."], "image": "train2014/COCO_train2014_000000389760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251938, "question_id": "9E6g9ifVWLDqmiuu4Va2rF", "question": "What type of tennis is being played here?", "choices": ["women's singles", "men's singles", "ladies doubles", "men's doubles"], "correct_choice_idx": 2, "direct_answers": ["ladies doubles", "team tennis", "team tennis", "doubles", "doubles", "doubles", "doubles", "doubles", "doubles", "recreational"], "difficult_direct_answer": false, "rationales": ["The genders of the players are visible and discernible based on their defining features and when two players appear on the same side of the court in tennis they are playing doubles.", "Given that two ladies with tennis rackets are here pictured on the same side of the court facing the same way it is most likely they are playing ladies doubles.", "There are two women on the same side of the court and that is referred to as \"ladies doubles\"."], "image": "train2014/COCO_train2014_000000251938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98915, "question_id": "9EFKjy6JMVkfqdzRS5Qhdd", "question": "Where is the man wearing a red jacket standing at?", "choices": ["front yard", "zoo", "backyard", "park"], "correct_choice_idx": 2, "direct_answers": ["outside", "back yard", "backyard", "outside", "outside", "back yard", "outside", "outside", "outside", "outside"], "difficult_direct_answer": false, "rationales": ["He is outside in someones back yard.", "The man is outside of the house near patio tables.", "The man in the red jacket is standing on the outside patio at the rear of a house."], "image": "train2014/COCO_train2014_000000098915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334777, "question_id": "9ELcDiH8S4hwUYJf8bfYzJ", "question": "What is in the bowls?", "choices": ["paper dolls", "food", "tickets", "cardboard cutouts"], "correct_choice_idx": 1, "direct_answers": ["vegetables", "ingredients", "vegetables", "vegetables", "vegetables", "vegetables", "vegetables", "ingredients", "food", "food"], "difficult_direct_answer": false, "rationales": ["The bowls are filled with veggies to make something else.", "The bowls contain broccoli and similar items.", "There is different foods in the bowls."], "image": "train2014/COCO_train2014_000000334777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325891, "question_id": "9EgSeus4bd7s7D4fXqADfd", "question": "From whom did the person with the mouth partly open most recently buy something?", "choices": ["tv salesman", "car salesman", "hotdog vendor", "florist"], "correct_choice_idx": 2, "direct_answers": ["vendor", "shop", "vendor", "vendor", "himself", "concession worker", "vendor", "hot dog", "vendor", "hotdog vendor"], "difficult_direct_answer": false, "rationales": ["The man is eating a hot dog so it is safe to assume he purchased it from a hot dog seller.", "A person selling food at the game.", "He is eating a hot dog on a bun with his hand."], "image": "train2014/COCO_train2014_000000325891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425380, "question_id": "9EkaaubXsjyk2tXV6gnamJ", "question": "What type of weather could most likely happen soon?", "choices": ["sunshine", "snow", "tornado", "rain"], "correct_choice_idx": 3, "direct_answers": ["rainy", "rain", "rain", "rain", "autumn", "rain", "rain", "storm", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The place seems to have rain that is associated with a lot of cattle for milk.", "Cows are grazing and an overcast sky is above.", "The sky is looking dark and like it may rain."], "image": "train2014/COCO_train2014_000000425380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502805, "question_id": "9EmDpj7CrWfYqGKRLUxvkB", "question": "Why is the boy throwing the Frisbee toward the metal cage?", "choices": ["exercise", "competition", "discard it", "distract other"], "correct_choice_idx": 1, "direct_answers": ["game", "playing game", "game", "frisbee golf", "competition", "disc golf", "playing game", "golf", "game", "to score"], "difficult_direct_answer": false, "rationales": ["The boy is throwing the frisbee toward the metal cage to score a point in the game.", "The boy wants the frisbee to land in the cage so that he can score points to win the game that he is playing.", "This is frisbee golf and you score by making a goal"], "image": "train2014/COCO_train2014_000000502805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122857, "question_id": "9EoGLuPukuwJe3SdcheZXg", "question": "Which country is the word above apparel referring to?", "choices": ["russia", "spain", "united states", "china"], "correct_choice_idx": 2, "direct_answers": ["united states", "america", "america", "country", "usa", "american", "america", "united states", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["It says american on it", "If it's american, it's the usa.", "The sign that is bright tells you it's american."], "image": "train2014/COCO_train2014_000000122857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150746, "question_id": "9EqnSCHwRUUSqJjRfupapY", "question": "Why is the cat sitting here?", "choices": ["to hunt", "to eat", "warmth", "to hide"], "correct_choice_idx": 2, "direct_answers": ["warmth", "warm", "warm", "warm", "worm", "warmth height", "warmth", "warmth", "sleeping", "warmth"], "difficult_direct_answer": false, "rationales": ["The cat is sitting on top of the television for warmth.", "The cat wants warmth.", "The cat is sitting on a television. it gives off heat."], "image": "train2014/COCO_train2014_000000150746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34882, "question_id": "9FKQPjSwiNxy6Xam8j4AJo", "question": "What numbered player last touched the ball?", "choices": ["one", "30", "13", "two"], "correct_choice_idx": 1, "direct_answers": ["30", "30", "30", "30", "thirty", "pitcher", "30", "thirty", "thirty", "30"], "difficult_direct_answer": false, "rationales": ["It is clear he just threw the ball.", "The player is 30.", "The pitcher will throw the ball to the batter for them to hit it."], "image": "train2014/COCO_train2014_000000034882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364522, "question_id": "9FU5xMnTXm7uASWEdjL94L", "question": "What is the silver object made of that the man is carrying?", "choices": ["nylon", "wood", "metal", "plastic"], "correct_choice_idx": 2, "direct_answers": ["bucket", "steel", "pot", "metal", "pot", "metal", "metal", "metal", "aluminum", "metal"], "difficult_direct_answer": false, "rationales": ["It is a silver and shiny object. it is hard and durable.", "The container is shiny and silver.", "The silver in the object indicates that it is made of a form of steel."], "image": "train2014/COCO_train2014_000000364522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109340, "question_id": "9FUbmen2UjUj5dMTEXkH4J", "question": "Students at this college can go where riding this bus?", "choices": ["beach", "zoo", "stadium", "downtown"], "correct_choice_idx": 3, "direct_answers": ["anywhere", "downtown", "downtown", "anywhere", "downtown", "downtown", "anywhere", "downtown", "downtown", "downtown"], "difficult_direct_answer": false, "rationales": ["The bus is taking people to the middle of town.", "The top part of the bus says where it is going next.", "A public bus will have its route listed on the sign at the top front of the bus so that passenger know that they are getting on the correct bus. the bus has its route, which is one of the options, displayed on the front."], "image": "val2014/COCO_val2014_000000109340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440087, "question_id": "9Ffcu7Qs2ExyaHbKDDsyJR", "question": "What type of street is this?", "choices": ["dirt", "private", "residential", "public"], "correct_choice_idx": 3, "direct_answers": ["busy", "one way", "parade", "main", "urban", "public street", "asphalt", "public", "business street", "asphalt"], "difficult_direct_answer": true, "rationales": ["There is nothing in the image to indicate there is any kind of restriction on this street meaning it is likely public.", "The street is public.", "This is a public street with cars on it."], "image": "train2014/COCO_train2014_000000440087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160996, "question_id": "9FggE2Wnd3Dqc2ZEbtSyYN", "question": "Which chair is closer to the camera?", "choices": ["yellow chair", "white chair", "green chair", "brown chair"], "correct_choice_idx": 1, "direct_answers": ["white chair", "light color", "right", "white", "white one", "white one", "white chair", "white", "white chair", "center"], "difficult_direct_answer": false, "rationales": ["The white chair is closest to the camera than any other chairs in the picture.", "A white chair is centered and forward with a brown chair farther back and against the wall in a kitchen.", "The chair is the white one."], "image": "train2014/COCO_train2014_000000160996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572737, "question_id": "9Fh6VgtiUtHqebcw2ihgVB", "question": "Where will the vehicle on the right take you?", "choices": ["airport", "mall", "hospital", "stadium"], "correct_choice_idx": 0, "direct_answers": ["airport", "anywhere", "airport", "airport", "airport", "airport", "airport", "local destinations", "airport", "anywhere"], "difficult_direct_answer": false, "rationales": ["It says airport on the taxi.", "It has a sign on the top for what kind of taxi it is", "The sign on the top says \"airport\"."], "image": "train2014/COCO_train2014_000000572737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1864, "question_id": "9FrgD4eGfQzf2HBVDDDJXN", "question": "What item might mislead someone into thinking the man is royalty?", "choices": ["laptop", "shoes", "chair", "crown"], "correct_choice_idx": 3, "direct_answers": ["crown", "crown", "crown", "crown", "crown", "crown", "crown", "crown", "crown", "crown"], "difficult_direct_answer": false, "rationales": ["He has a headpiece that has pointy tips and is shiny-looking.", "The man is wearing a royal crown on his head.", "That is what a crown is like."], "image": "train2014/COCO_train2014_000000001864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565940, "question_id": "9GKGyvmv45ibPKbLv6XqZW", "question": "What could the style of this room be considered?", "choices": ["modern", "victorian", "art deco", "industrial"], "correct_choice_idx": 1, "direct_answers": ["contemporary", "traditional", "fancy", "modern", "traditional", "victorian", "hall", "formal", "antique", "mid-century"], "difficult_direct_answer": true, "rationales": ["That is what the style was called.", "This style has quaint furnishings.", "A formal room with white couches has a decorative rug."], "image": "train2014/COCO_train2014_000000565940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404948, "question_id": "9GY2XGpREgVNMKwWhfVJ2g", "question": "What are the seats behind the man called?", "choices": ["benches", "folding", "squads", "stools"], "correct_choice_idx": 3, "direct_answers": ["stools", "bar stools", "stools", "barstools", "bar stools", "bar stools", "stools", "stools", "bar stools", "bar stools"], "difficult_direct_answer": false, "rationales": ["These are bar stools that are behind him.", "Each seat can only hold one person. each seat cannot be changed or moved easily.", "The seats are stools."], "image": "train2014/COCO_train2014_000000404948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18072, "question_id": "9GYt6h7ra8bNJRB5JDywmG", "question": "What is the cat resting its head on?", "choices": ["mousepad", "blanket", "cardboard", "paper"], "correct_choice_idx": 0, "direct_answers": ["mousepad", "mouse pad", "mousepad", "mousepad", "mousepad", "mousepad", "pad", "mousepad", "mousepad", "mouse pad"], "difficult_direct_answer": false, "rationales": ["The cat rests on the mousepad.", "The little cat is sleeping on the mousepad.", "There is a computer mouse on top of the pad."], "image": "train2014/COCO_train2014_000000018072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552052, "question_id": "9GdS4np8hSPMv5fmwDQHoB", "question": "How many other people are playing besides these two?", "choices": ["two", "five", "six", "three"], "correct_choice_idx": 0, "direct_answers": ["two others", "three", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["They are playing a doubles match so there should be another pair of competitors playing against them.", "These two are standing on the same side of the court.", "Since the females are playing on the same side, they are playing doubles. therefore, a total of four players are needed."], "image": "train2014/COCO_train2014_000000552052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258905, "question_id": "9H7hsCNVqWqcVqgwCsfYhw", "question": "Which item provides the most protein to the consumer?", "choices": ["toast", "tomato", "egg", "pepper"], "correct_choice_idx": 2, "direct_answers": ["egg", "eggs", "egg", "tomato", "egg", "egg", "tomato", "inside", "eggs", "cheese"], "difficult_direct_answer": false, "rationales": ["The item is the egg.", "Eggs have protein in them.", "Tomatoes, peppers, and toast don't have much protein"], "image": "val2014/COCO_val2014_000000258905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153224, "question_id": "9HDDGiY8sWvJyKYz2YepbD", "question": "What is this type of swing called?", "choices": ["bunt", "home run", "strike", "ball"], "correct_choice_idx": 2, "direct_answers": ["home run", "strike", "strike", "strike", "strike", "bunt", "back swing", "fast swing", "strike", "strike"], "difficult_direct_answer": false, "rationales": ["The bat has been swung based on its position relative to the batter and the ball has gone passed it meaning the action described in answer a has happened.", "A baseball player is swinging at a ball but does not make contact. when a pitch garners a swing without a hit it is called a strike in baseball.", "The swing is a strike."], "image": "train2014/COCO_train2014_000000153224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122899, "question_id": "9HLhadrtkSUqRUkNBcsv7W", "question": "What is the hanging basket for?", "choices": ["storing food", "decoration", "game", "lantern"], "correct_choice_idx": 0, "direct_answers": ["food", "feed giraffes", "storing food", "food", "meal", "food", "feeding", "food", "food", "feeding"], "difficult_direct_answer": false, "rationales": ["There is food hanging up high so the giraffes can eat.", "Giraffes have long necks and eat food from places high in the air.", "The basket is for food."], "image": "train2014/COCO_train2014_000000122899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67771, "question_id": "9HP3tTuurdCkTJkMMfUWLU", "question": "What will be missing after the truck leaves?", "choices": ["snow", "mail", "trees", "garbage"], "correct_choice_idx": 3, "direct_answers": ["garbage", "garbage", "trash", "garbage", "trash", "love", "trash", "garbage", "trash", "garbage"], "difficult_direct_answer": false, "rationales": ["There will be no waste products on the side of the road after the truck empties the cans.", "The truck is a trash truck based on its shape and size and proximity to trashcans. these trucks serve one function primarily to remove answer a.", "A large truck used to pickup refuse is driving on a street."], "image": "train2014/COCO_train2014_000000067771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76176, "question_id": "9HPkjTeqfuYeusXs2XMymq", "question": "What is the best place to cut this meat?", "choices": ["cutting board", "sink", "floor", "oven"], "correct_choice_idx": 0, "direct_answers": ["cutting board", "cutting board", "dinner table", "cutting board", "cutting board", "cutting board", "cutting board", "dinner table", "cutting board", "cutting board"], "difficult_direct_answer": false, "rationales": ["The cutting board is a good flat surface to cut meat on.", "Though you can cut this food anywhere a flat strong board would be the best.", "The place is the board."], "image": "val2014/COCO_val2014_000000076176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419015, "question_id": "9J3tcd7PHG7yLkq2ax5rgA", "question": "What is the species of the item wearing yellow farthest to the left and on top of the blue board?", "choices": ["cardboard", "plant", "vegetable", "homo sapien"], "correct_choice_idx": 3, "direct_answers": ["homo sapien", "homo sapien", "human", "human", "human", "human", "homo sapien", "human", "homo sapien", "human"], "difficult_direct_answer": false, "rationales": ["There is a set of people on surfboards trying to catch some waves.", "There is an animal, not a plant-based item, on the blue board. the animal is a human being.", "It is a person."], "image": "train2014/COCO_train2014_000000419015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237453, "question_id": "9J9FNTFVQDD2Qwq9tD6dcp", "question": "What animal is the meat portion of this dish from?", "choices": ["chicken", "cow", "lamb", "pig"], "correct_choice_idx": 2, "direct_answers": ["cow", "lamb", "lamb", "unknown", "cow", "cow", "cow", "cow", "lamb", "cow"], "difficult_direct_answer": false, "rationales": ["Based on the size, shape, location of the bone and serving style, the meat being served is a lamp chop which would be consistent with answer a.", "The bone jutting out of it is normal for this meat.", "The shape of the bone looks curved like a lamb chop."], "image": "train2014/COCO_train2014_000000237453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275449, "question_id": "9JAkEKMXRzgk2oxVNCthGk", "question": "Which display technology is utilized by the television on the stand?", "choices": ["led", "plasma", "crt", "oled"], "correct_choice_idx": 0, "direct_answers": ["flat", "high definition", "led", "lcd", "flatscreen", "lcd", "led", "image", "led", "crt"], "difficult_direct_answer": false, "rationales": ["Usually tvs come equipped with led.", "Led technology is used for tvs.", "This is the technology used in this electronic device"], "image": "val2014/COCO_val2014_000000275449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290689, "question_id": "9JSXgJzdgtz8aVbQuHgY59", "question": "Why type of laptop is the person using?", "choices": ["chromebook", "nintendo", "sony", "mac"], "correct_choice_idx": 3, "direct_answers": ["apple", "mac", "mac", "hp", "mac", "old school", "hp", "apple", "macbook", "notebook"], "difficult_direct_answer": false, "rationales": ["The person is watching a movie on a mac laptop.", "The laptop has the color scheme and overall design of mac laptops.", "A mac book is being used by the person."], "image": "val2014/COCO_val2014_000000290689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44083, "question_id": "9JTvs5SmuYzuTajmi8piE6", "question": "What is this bakery item called?", "choices": ["danish", "eclair", "cupcake", "cream puff"], "correct_choice_idx": 2, "direct_answers": ["cupcake", "cupcake", "cupcake", "cupcake", "cupcake", "cupcake", "cupcake", "cupcake", "cupcake", "cupcake"], "difficult_direct_answer": false, "rationales": ["The confection is a cupcake.", "It is a small cake in a paper cup and topped with icing and sprinkles", "This is a smaller cake with icing on it."], "image": "train2014/COCO_train2014_000000044083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21213, "question_id": "9JbDr8cir6DuT3wDyVpAbg", "question": "What purpose does the weight in the back serve?", "choices": ["paper weight", "weigh money", "balance table", "measure price"], "correct_choice_idx": 3, "direct_answers": ["pricing", "pound price", "measure price", "weigh food", "food", "measure fruits", "for fruit", "holds paper", "weigh produce", "measure"], "difficult_direct_answer": true, "rationales": ["It isn't really price it is measure, but the fruit so they know what to sell them as.", "The fruit is being sold by the pound, so the scale will tell her how much to charge.", "Fruits and vegetables are placed on the weight and then you pay by the weight."], "image": "train2014/COCO_train2014_000000021213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174145, "question_id": "9JcPxYPa2oPntxmzCUGkuJ", "question": "What language are they likely speaking?", "choices": ["jewish", "hebrew", "chinese", "russian"], "correct_choice_idx": 1, "direct_answers": ["arabic", "arabic", "not english", "russian", "hebrew", "russia", "hindu", "arabic", "tibetan", "moldova"], "difficult_direct_answer": false, "rationales": ["The writing on the poster on the wall behind the med sitting is in hebrew. indicating that the men most likely speak hebrew since they are in a place where hebrew is written.", "There is writing on the poster above the skiers.", "There is an israeli language printed on the sign above them."], "image": "val2014/COCO_val2014_000000174145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124970, "question_id": "9KVGnAE64tdbdDxB93dcd8", "question": "Trichloroethylene or tetrachloro ethylene is reason for what?", "choices": ["colored smoke", "none", "mist", "flame"], "correct_choice_idx": 0, "direct_answers": ["unknown", "smoke trail", "solvent", "help", "fuel", "smoke", "colored smoke", "tetrachloride", "chem trail", "kidney cancer"], "difficult_direct_answer": true, "rationales": ["Trichloroethylene is used to color smoke at air shows. planes are flying in the air with colored smoke behind them.", "The chemical named helps create the colored smoke for the planes.", "There is currently an air showing take place. planes are leaving behind smoke trails in sky"], "image": "train2014/COCO_train2014_000000124970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197280, "question_id": "9KdxMLhYa8S3vNC66whtiN", "question": "Why are they looking at the child on the board?", "choices": ["amazing trick", "is suspicious", "is falling", "bouncing"], "correct_choice_idx": 0, "direct_answers": ["he's jumping", "watching", "high jump", "watching him", "watching jump", "see performance", "watching performance", "jumping", "amazing trick", "watching trick"], "difficult_direct_answer": true, "rationales": ["The kids are watching the person on the skateboard. the boarder is doing a trick in the air.", "A small child is in the air while doing a skateboarding trick.", "A child is doing a skateboarding trick and is in the air while others look on. people keep an eye on kids."], "image": "val2014/COCO_val2014_000000197280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52986, "question_id": "9KekuemAPWzaHmP6wgRc23", "question": "What is the girl doing with the orange object?", "choices": ["dancing", "spinning ribbons", "sewing", "controlling kite"], "correct_choice_idx": 3, "direct_answers": ["flying kite", "flying kite", "controlling kite", "steering kite", "flying kite", "steering kite", "flying kite", "flying kite", "flying kite", "steering kite"], "difficult_direct_answer": false, "rationales": ["It's a large fabric object flying high in the air above her.", "The girl is using the handle to control the kite strings.", "The girl is holding the orange item because it contains the string that steers the kite."], "image": "train2014/COCO_train2014_000000052986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134294, "question_id": "9KesX48Gpx27sHdZ8jg9ie", "question": "What piece of equipment is in the window?", "choices": ["air conditioner", "heater", "air filter", "fan"], "correct_choice_idx": 0, "direct_answers": ["air conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner"], "difficult_direct_answer": false, "rationales": ["The equipment is set into the window to capture air from outside and cool it as it enter the house.", "There is a vent on the wall.", "A white appliance is mounted in a window. room sized air conditioners are mounted in windows."], "image": "train2014/COCO_train2014_000000134294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135589, "question_id": "9KguxYxo2GwCbtJoA46LXR", "question": "What emotion is the man in the red tie feeling?", "choices": ["fear", "happiness", "amusement", "joy"], "correct_choice_idx": 0, "direct_answers": ["fear", "scared", "fear", "worry fear", "fear", "anxiety", "surprised", "shock", "worried", "surprise"], "difficult_direct_answer": false, "rationales": ["A man is staring with a bewildered expression and a frown.", "He looks scared of something.", "The man is fearful."], "image": "train2014/COCO_train2014_000000135589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493227, "question_id": "9KmNxeqvMso9s59KmrggcW", "question": "Who manufactured the car on the right?", "choices": ["lexus", "audi", "bmw", "mercedes"], "correct_choice_idx": 2, "direct_answers": ["bmw", "bmw", "benz", "bmw", "bmw", "bmw", "bmw", "mercedes", "bmw", "bmw"], "difficult_direct_answer": false, "rationales": ["A car with a bmw logo is parked on the side of the street.", "The car has a bmw logo on it.", "Bmw manufactured the car."], "image": "train2014/COCO_train2014_000000493227.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160669, "question_id": "9KyQLaMwijjVchMFEHrkr6", "question": "Who is the man crouching behind the catcher?", "choices": ["umpire", "referee", "coach", "announcer"], "correct_choice_idx": 0, "direct_answers": ["umpire", "catcher", "umpire", "player", "umpire", "umpire", "umpire", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["The man stooping behind the catcher in this image is not a participant in the athletic competition but and outside judge of the game.", "The umpire stands there so he can keep an eye on the ball and the batter to make a call if needed.", "The person who makes the important decisions at home plate."], "image": "val2014/COCO_val2014_000000160669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517160, "question_id": "9L5w3WFMHRbL8iMpBZpip5", "question": "Which direction did the skater just come from?", "choices": ["train track", "train top", "down", "high up"], "correct_choice_idx": 2, "direct_answers": ["right", "left", "left", "down", "left", "right", "down", "left", "left", "down"], "difficult_direct_answer": false, "rationales": ["There is a curved ramp-type structure behind the skater, which he used to rise up in the air for his trick.", "A skateboarder is on the edge of a cement barrier that is a couple feet off the ground.", "They had to roll up to roll along the top."], "image": "train2014/COCO_train2014_000000517160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540404, "question_id": "9LF3m4ysivhU7V38FFbKSC", "question": "What is the nature of the nearest advertisement?", "choices": ["on lcd", "crudely attached", "on billboard", "has photograph"], "correct_choice_idx": 1, "direct_answers": ["dancing", "dancing academy", "crudely attached", "dancing", "dance", "dance school", "dance studio", "dance", "radio station", "promotional"], "difficult_direct_answer": false, "rationales": ["A large banner is attached to the front of a truck in the street.", "The advertisement's unusual placement on the grille of a truck suggests it was haphazardly put on.", "The sign is loosely taped to the back of the vehicle."], "image": "train2014/COCO_train2014_000000540404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528049, "question_id": "9LHY9YWsZSEhNi9exSrptg", "question": "Which one of these is another flavor of this type of beverage?", "choices": ["sardine", "bread", "butter", "chamomile"], "correct_choice_idx": 3, "direct_answers": ["green tea", "tea", "green tea", "green tea", "green tea", "tea", "green tea", "chamomile", "green tea", "tea"], "difficult_direct_answer": false, "rationales": ["Chamomile is another flavor for the tea beverage shown in the laptop screens.", "The beverage is jasmine tea. this beverage is not similar to sardines, butter, or bread.", "On each of these computer screens a container of jasmine tea is shown. chamomile is another variety of tea."], "image": "val2014/COCO_val2014_000000528049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38357, "question_id": "9LTgtQkUmwGNurHzhoAkFr", "question": "What type of language would someone in this area speak?", "choices": ["german", "italian", "asian", "spanish"], "correct_choice_idx": 2, "direct_answers": ["chinese", "chinese", "mandarin", "english", "chinese", "japanese", "japanese", "asian", "english", "japanese"], "difficult_direct_answer": false, "rationales": ["Answer a is not a language, but based on the characters displayed in the bus that would indicate the language being spoken, it would be an asian language.", "The lettering on the busses is in asian characters, suggesting this is in asia, where the residents would speak an asian language.", "Based on the characters on the side of one bus and on the front of the other bus, it appears to be chinese language."], "image": "val2014/COCO_val2014_000000038357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316363, "question_id": "9M96FzuGADw3ZSBMh6edTq", "question": "What is the largest kite flying made to resemble?", "choices": ["moth", "cell phone", "hummingbird", "human"], "correct_choice_idx": 3, "direct_answers": ["person", "yellow kite", "person", "person", "person", "person", "person", "skydiver", "person", "human"], "difficult_direct_answer": false, "rationales": ["It has arms and legs with the same proportions as a human, which a mother, hummingbird, and cell phone do not have.", "It looks like a scuba diver.", "The yellow kite is the largest. it resembles a human."], "image": "train2014/COCO_train2014_000000316363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26670, "question_id": "9MMHgy6sBTwvQMrf58ttXj", "question": "What is the possible threat faced by the people?", "choices": ["tsunami", "tornado", "volcano eruption", "rain"], "correct_choice_idx": 3, "direct_answers": ["storm", "lighting strike", "thunder storm", "rain", "storm", "rain", "lightning", "lightening", "storm", "storm"], "difficult_direct_answer": false, "rationales": ["Here we see a dark and overset sky. this is indicative of rain coming and would endanger their kite.", "The sky is really cloudy and it's dark. the people are also wearing jackets.", "The clouds are very dark. the weather looks bad."], "image": "val2014/COCO_val2014_000000026670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35599, "question_id": "9MMQzpqijGNCKw36772pVT", "question": "What is the item on the table?", "choices": ["fire extinguisher", "cow", "napkin holder", "ceramic pot"], "correct_choice_idx": 3, "direct_answers": ["candle", "ceramic pot", "candle", "candle", "candle", "pot", "candle", "pot", "candle", "pot"], "difficult_direct_answer": false, "rationales": ["A small clay pot can be found on each table.", "The material of the item is guessable based on the color and shine. answer a is the only reasonable answer based on size, shape and setting.", "The ceramic pot sits in the middle of the tablecloth."], "image": "val2014/COCO_val2014_000000035599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376405, "question_id": "9Mhw52EtLzEogrYcoX75jd", "question": "Where might you find the square blue item in a house?", "choices": ["utulity room", "sofa", "bath", "baby's crib"], "correct_choice_idx": 1, "direct_answers": ["on couches", "floor", "bed", "on bed", "couch bed", "bed couch", "couch", "sofa", "items", "stuffing"], "difficult_direct_answer": true, "rationales": ["Pillows are usually placed on couches in a house.", "Pillows are generally found on couches.", "Normally those pillows would be placed on a couch in the living room."], "image": "train2014/COCO_train2014_000000376405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204181, "question_id": "9MtZChdzcS8vFB2wQi3bCw", "question": "What kind of kite it is?", "choices": ["polygonal", "symmetrical", "rectangle", "rhombus"], "correct_choice_idx": 3, "direct_answers": ["rhombus", "prism", "sting ray", "glider", "triangle", "airplane kite", "wind kite", "jet plane", "delta kite", "delta"], "difficult_direct_answer": true, "rationales": ["The kite has parallel sides which is typical of a rhombus.", "You can tell by the outline of the kite.", "The kite appears to have the same pattern mirrored on both sides of the vertical access which is the definition of answer c."], "image": "train2014/COCO_train2014_000000204181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309104, "question_id": "9N4HvK7UHGs5nj5JAG7N7S", "question": "Why is he out of focus?", "choices": ["is moving", "os hungry", "is confused", "is invisible"], "correct_choice_idx": 0, "direct_answers": ["moving", "he's moving", "bad photographer", "motion", "moving", "moving quickly", "motion", "is moving", "moving", "blurry"], "difficult_direct_answer": false, "rationales": ["The blurry arms present on this picture suggests movement during the time the picture was taken.", "Some cameras cannot capture a moving object with complete clarity, resulting in a blurred effect, which is what's happening here.", "The man that is standing is moving really fast and he turned out blurry in the photo."], "image": "val2014/COCO_val2014_000000309104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203171, "question_id": "9NFeNNSWfRi7NU49kiVucG", "question": "What type of weather is present here?", "choices": ["stormy", "windy", "snow", "tornado"], "correct_choice_idx": 1, "direct_answers": ["overcast", "windy", "cloudy", "windy", "windy", "cloudy", "windy", "windy", "cloudy", "overcast"], "difficult_direct_answer": false, "rationales": ["Due to the number of kites aloft present in this image we can conclude the wind is blowing in this image.", "Kites can be seen flying in the air. kites tend to fly well in a windy environment.", "People are flying kites. there needs to be strong winds for the kites to float in the air."], "image": "val2014/COCO_val2014_000000203171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551305, "question_id": "9NUePwWnD9jH3YfANGfqgC", "question": "Why do cats chase objects?", "choices": ["habit", "irritation", "instinct", "boredom"], "correct_choice_idx": 2, "direct_answers": ["play", "to curious", "prey response", "instinct", "for play", "predators", "instinct", "stalking", "play", "curious"], "difficult_direct_answer": false, "rationales": ["Traditionally cats are predators ans curious creatures.", "They are predators and go after things naturally.", "A cat is looking up at a toy with interest."], "image": "train2014/COCO_train2014_000000551305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448012, "question_id": "9NYy5B5mW3FiJwZC5hsrY9", "question": "What is the nearest business shown here?", "choices": ["seafood", "cafe", "green grocer", "butcher"], "correct_choice_idx": 2, "direct_answers": ["grocery store", "produce shop", "produce", "produce", "grocery store", "groceries", "green grocer", "grocery store", "grocery store", "fruit vendor"], "difficult_direct_answer": false, "rationales": ["There is produce in the window", "A grocery store with fruits is shown in the window.", "The business shown has shelves full of produce for sale."], "image": "val2014/COCO_val2014_000000448012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406147, "question_id": "9NuWQ56sPoEMHS9zGxhhD6", "question": "What event are the people attending?", "choices": ["graduation ceremony", "protest", "photography contest", "picnic"], "correct_choice_idx": 0, "direct_answers": ["graduation", "concert", "graduation ceremony", "graduation", "picnic", "picnic", "park concert", "picnic", "concert", "skating"], "difficult_direct_answer": false, "rationales": ["They look like they may be sitting down in protest of something.", "Three young girls are seen together, with one wearing traditional graduation garb.", "A group of people are sitting outside on blankets and are holding umbrellas."], "image": "val2014/COCO_val2014_000000406147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342051, "question_id": "9Nv75ET7SjkEdKAzk7w6oo", "question": "For which reason might traffic be stopped or controlled here?", "choices": ["road construction", "highjacking", "crime collar", "tolls"], "correct_choice_idx": 0, "direct_answers": ["crossing", "road work", "pedestrians", "school", "construction", "road construction", "pedestrians", "road construction", "construction", "avoiding people"], "difficult_direct_answer": false, "rationales": ["There is construction.", "Traffic is controlled for road construction, as shown by the cones and stop signs.", "The barrels are out for them to work on the road."], "image": "val2014/COCO_val2014_000000342051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182799, "question_id": "9P8bN7mRPYdZfHEjtaPRZT", "question": "What natural event is seen here?", "choices": ["tornado", "waterfall", "sunrise", "none"], "correct_choice_idx": 1, "direct_answers": ["falls", "waterfall", "waterfall", "waterfall", "falls", "waterfall", "waterfall", "waterfall", "waterfall", "waterfall"], "difficult_direct_answer": false, "rationales": ["A woman is taking a photo and there is water running off to bottom of lake.", "A woman stands in front of a large body of water running over an edge hard enough to cause white water.", "Water is cascading over a cliff."], "image": "train2014/COCO_train2014_000000182799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338214, "question_id": "9PVzrLUVRRZZ8VuaADQTz9", "question": "What kind of activities are related to the tools held by the people?", "choices": ["dancing", "water activities", "hiking", "cooking"], "correct_choice_idx": 1, "direct_answers": ["water activities", "surfing", "surfing", "virtual reality", "surfing", "swimming", "swimming", "surfing swimming", "swimming", "surfing"], "difficult_direct_answer": false, "rationales": ["The people are wearing inflatable tubes that are usually seen in pools for water activities.", "The people are holding objects like surfboards, floatation devices which are used in pools and oceans.", "People are holding surfboards. surfboards are used in the water."], "image": "train2014/COCO_train2014_000000338214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274341, "question_id": "9PmFovSvmu7xiAWLL7B5T8", "question": "This truck is probably delivering what?", "choices": ["cattle", "food", "people", "water"], "correct_choice_idx": 2, "direct_answers": ["people", "people", "people", "children", "people", "children", "people", "children", "things", "human"], "difficult_direct_answer": false, "rationales": ["It is a tour bus.", "A bus that is large and yellow is on the road. buses generally carry people.", "The truck is a bus carrying people."], "image": "train2014/COCO_train2014_000000274341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576732, "question_id": "9Pq94mLLsrHC7YpnMTeXwp", "question": "How many calories does that sweetener have?", "choices": ["ten", "45", "30", "zero"], "correct_choice_idx": 3, "direct_answers": ["five hundred", "four", "50", "zero", "thirty", "zero", "zero", "cannot tell", "zero", "two"], "difficult_direct_answer": false, "rationales": ["The pink packs are sweet n low and have no calories in them.", "This type of sweetener is made without calories.", "The sweetener being used is called sweet and low. it is used as a replacement for sugar because it does not have any calories."], "image": "train2014/COCO_train2014_000000576732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520831, "question_id": "9Q3LX2gBog7neTwB4KKLgA", "question": "What kind of tag hangs from the child in red's jacket?", "choices": ["ski pass", "birth certificate", "hall pass", "bank statement"], "correct_choice_idx": 1, "direct_answers": ["identification", "identification", "skii pass", "ski tag", "birth certificate", "ski tag", "ski ticket", "identification", "ski pass", "entrance pass"], "difficult_direct_answer": false, "rationales": ["The people in the picture are skiing.", "A square ticket is hanging from a person's coat who has skis on and is on a ski run.", "You receive the passes so you can ski at the resort."], "image": "train2014/COCO_train2014_000000520831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287213, "question_id": "9Q4Chp52ore6sT7HFtz2xV", "question": "What is the main function of the bed?", "choices": ["decorative", "to repair", "to sleep", "to exercise"], "correct_choice_idx": 2, "direct_answers": ["sleeping", "sleep", "to sleep", "sleeping", "sleeping on", "sleep on", "sleeping", "sleep", "comfortable sleep", "sleeping"], "difficult_direct_answer": false, "rationales": ["This is where people rest for the night for several hours", "The answer is commonly known.", "The bed is a place to rest or for sleeping."], "image": "train2014/COCO_train2014_000000287213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306431, "question_id": "9QEBwHjNG7WAEVJGcCXDzA", "question": "What kind of signs are shown?", "choices": ["brand", "directional", "regulatory", "warning"], "correct_choice_idx": 1, "direct_answers": ["city names", "traffic", "directional signs", "directional", "direction", "freeway", "road signs", "road signs", "roads", "turn right"], "difficult_direct_answer": true, "rationales": ["They have names with arrows on them to let people know which way to go", "The signs are indicating which direction the city is in.", "The signs have arrows on them."], "image": "train2014/COCO_train2014_000000306431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83973, "question_id": "9QdgdknjNsprtpdf5yrY9x", "question": "What type of truck is shown?", "choices": ["moving", "garbage", "commercial", "antique"], "correct_choice_idx": 3, "direct_answers": ["old truck", "historical antique", "flatbed", "work truck", "flatbed", "antique", "flatbed", "pickup", "commercial truck", "old truck"], "difficult_direct_answer": false, "rationales": ["The truck shown is at least 100 years old and is being shown because it is so old.", "It's a very old truck", "The truck is an antique."], "image": "train2014/COCO_train2014_000000083973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141426, "question_id": "9QuBsR5JKSGoZSt7oT3e8g", "question": "What species of animal are the largest mammals here?", "choices": ["bovine", "ovine", "porcine", "equine"], "correct_choice_idx": 0, "direct_answers": ["bovine", "cow", "cow", "sheep", "sheep", "cattle", "bull", "bulls", "cows", "bull"], "difficult_direct_answer": false, "rationales": ["Aside from humans, the only mammals here are cows.", "The large creatures are cows.", "Also known as cows, they are larger than the humans also pictured."], "image": "val2014/COCO_val2014_000000141426.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569716, "question_id": "9R8A5QSiEWCJyforKahQrv", "question": "The item next to the water bottle is usually used in what setting?", "choices": ["pool", "gas station", "police station", "school"], "correct_choice_idx": 0, "direct_answers": ["swimming", "swimming", "computer", "skiing", "swimming", "songs living", "swimming pool", "tanning", "swimming", "pool"], "difficult_direct_answer": false, "rationales": ["The item next to the water bottle are goggles used for swimming.", "Swimming goggles are next to the h2o.", "The item is a pool."], "image": "train2014/COCO_train2014_000000569716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537235, "question_id": "9RUSFME758SahTgqgJLFmx", "question": "What course of the meal is this man eating?", "choices": ["dessert", "soup", "salad", "appetizer"], "correct_choice_idx": 0, "direct_answers": ["catering", "icecream", "lunch", "dessert", "desert", "desert", "dessert", "dessert", "dessert", "lunch"], "difficult_direct_answer": false, "rationales": ["The man's dessert sits in a large sundae glass, and ice cream topped with caramel sauce is clearly visible. his hand holds a spoon up, ready to dive in!.", "Based on the visible consistency of the food and the glass it is served in, the food would be ice cream which is traditionally served in an after meal course.", "Ice cream sundaes are usually considered a dessert after the meal."], "image": "train2014/COCO_train2014_000000537235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173791, "question_id": "9RbUZPifJM8BAn52ynfF6h", "question": "Who is the away team?", "choices": ["pirates", "angels", "mets", "yankees"], "correct_choice_idx": 3, "direct_answers": ["grey team", "mets", "guests", "baseball team", "cubs", "yankees", "mets", "blue team", "blue jays", "ny mets"], "difficult_direct_answer": true, "rationales": ["The yankees are the away team as indicated by the uniforms.", "You can tell by their uniforms who the teams are that are playing.", "The team is the yankees."], "image": "train2014/COCO_train2014_000000173791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68291, "question_id": "9RmcnpwLAsvtFdtkqcc64T", "question": "What structure surrounds the animals?", "choices": ["barn", "pen", "dome", "cage"], "correct_choice_idx": 1, "direct_answers": ["fenced pen", "fence", "fence", "pen", "fence", "fence", "fence", "fencing", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["A large fence is around some farm animals. many people are around the outer perimeter looking at them down in the city.", "There is a fence surrounding the animals to keep them from running off.", "The animals surround the pen so they can feed."], "image": "val2014/COCO_val2014_000000068291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157480, "question_id": "9RpjdAGkNCbJXR3UeBAxra", "question": "How many of the pillows on the bed are unintended for sleeping?", "choices": ["two", "one", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["four", "one", "one", "1 pillow", "four", "one", "four", "four", "one", "three"], "difficult_direct_answer": false, "rationales": ["The one of the pillows is decorative, but the others are functional.", "Based on the comparative size and shapes of the pillows, one can discern which are meant for decoration and which are intended for use.", "The log shaped pillow is decorative."], "image": "train2014/COCO_train2014_000000157480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155811, "question_id": "9RsZ8mPG76AhtiTR829Kgt", "question": "What marks this safe crossing area?", "choices": ["rain", "city center", "white stripes", "mayor office"], "correct_choice_idx": 2, "direct_answers": ["crosswalk", "crosswalk", "white stripes", "crosswalk", "crosswalk", "white lines", "crosswalk", "white lines", "crosswalk", "white stripes"], "difficult_direct_answer": false, "rationales": ["Traditionally these lines painted on the road indicate it's safe to cross.", "There are stripes that tell you what is safe to walk.", "The white lines are for pedestrian crossing."], "image": "val2014/COCO_val2014_000000155811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382557, "question_id": "9RzjAryjFwZgaMau27F33u", "question": "What does the woman here sign?", "choices": ["treaty", "autograph", "check", "sales receipt"], "correct_choice_idx": 0, "direct_answers": ["agreement", "declaration", "government legislation", "contract", "treaty", "document", "treaty", "legislation", "agreement", "certificate"], "difficult_direct_answer": false, "rationales": ["The woman is putting a signature on some documentation surrounded by different people in suits.", "There are two groups of people on different sides of the lady.", "This looks like a political meeting so a treaty is most likely the case here."], "image": "val2014/COCO_val2014_000000382557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214391, "question_id": "9SGXTfh9h6snSzhpXwY5Tm", "question": "Which item can be directly touched and eaten?", "choices": ["right bottle", "left sandwich", "left bottle", "right sandwich"], "correct_choice_idx": 3, "direct_answers": ["sandwich", "juice", "sandwich", "cake", "sandwich", "right sandwich", "sandwich right", "sandwich", "sandwich", "sanchwich"], "difficult_direct_answer": false, "rationales": ["Two sandwiches are on a table and one is wrapped while the other is not.", "The right sandwich is not covered in plastic wrap and is ready to be eaten.", "It doesn't have a wrapper around it."], "image": "val2014/COCO_val2014_000000214391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327695, "question_id": "9SMdwSRMuH2jM6XMLnoiee", "question": "What are the zebras doing?", "choices": ["feeding", "bathing", "attacking giraffes", "grooming"], "correct_choice_idx": 0, "direct_answers": ["feeding", "standing", "relaxing", "standing", "walking", "grazing", "resting", "grazing", "grazing", "grazing"], "difficult_direct_answer": false, "rationales": ["The zebras are visible with their heads down. when zebras move across a grassy surface with their heads down it is most likely for the purpose of eating.", "Zebras eat grass. they have their heads down over the grass.", "They are eating grass."], "image": "train2014/COCO_train2014_000000327695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102506, "question_id": "9SPe5X894PmEssqyg6BXDG", "question": "What is the purpose of the colored lights?", "choices": ["decoration", "decoration", "traffic control", "illumination"], "correct_choice_idx": 2, "direct_answers": ["traffic control", "traffic directopm", "traffic control", "directional", "traffic lights", "traffic regulator", "traffic flow", "direct traffic", "bicycle signal", "traffic control"], "difficult_direct_answer": false, "rationales": ["The rows of colored lights attached to this pole tell each method of transportation when it's safe to proceed.", "The location of the lights, and their design, shape and visible colors are commonly associated with a device that would be used in association with answer a.", "Traffic lights have red, yellow, and green lights in them to let traffic know when to stop and go."], "image": "train2014/COCO_train2014_000000102506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506026, "question_id": "9SQAisFrawdSnHfiHgpYwx", "question": "Why is the man looking down at his hand?", "choices": ["holding cash", "see cut", "answering phone", "to eat"], "correct_choice_idx": 2, "direct_answers": ["mobile phone", "reading text", "answering phone", "phone", "reading phone", "phone", "reading phone", "checking phone", "using device", "phone"], "difficult_direct_answer": false, "rationales": ["The man is checking his cell.", "The man is looking at his hand because he is holding a cell phone and using it.", "He is looking at something on his phone."], "image": "train2014/COCO_train2014_000000506026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293215, "question_id": "9SRxhP99S7RoYNbSnxLoU7", "question": "What company made the shoes the boy is wearing?", "choices": ["adidas", "nike", "reebok", "vans"], "correct_choice_idx": 1, "direct_answers": ["nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["The company's famous swoosh logo can be seen on the shoes.", "They have the logo on it from that company.", "Only one shoe company is immediately recognizable for their swoosh."], "image": "train2014/COCO_train2014_000000293215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34138, "question_id": "9ShmhTTWHr73svQahQbURV", "question": "What is the woman doing to the pastry?", "choices": ["poking it", "heating it", "stuffing it", "cutting it"], "correct_choice_idx": 3, "direct_answers": ["cutting", "cutting", "cutting", "cutting", "cutting", "cutting it", "cutting", "slicing", "cutting it", "cutting"], "difficult_direct_answer": false, "rationales": ["The woman is using a knife to slice the pastry into two pieces.", "The woman is cutting the food with a knife.", "The woman is slicing into it."], "image": "train2014/COCO_train2014_000000034138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335249, "question_id": "9SiPwXswx7BkrdFA3vhVnu", "question": "How old is this boy?", "choices": ["six", "four", "five", "seven"], "correct_choice_idx": 1, "direct_answers": ["five", "five", "five", "four", "four", "four", "nine", "four", "six", "five"], "difficult_direct_answer": false, "rationales": ["There are four candles on the cake.", "There are four candles on the cake in front of him.", "The boy has a countable number of candles on a cake which usually corresponds with a birthday and how old one is turning."], "image": "train2014/COCO_train2014_000000335249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210683, "question_id": "9SymxnC3dC5Ude5VPQyg4U", "question": "What is the bowl holding the fruit made from?", "choices": ["wood", "plastic", "steel", "glass"], "correct_choice_idx": 3, "direct_answers": ["glass", "glass", "glass", "glass", "glass", "glass", "glass", "glass", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["A clear bowl with fruit in it is on a table. glass is clear.", "A bowl is on a table and fruit inside the bowl can be seen through the bowl.", "It is see through and breakable"], "image": "train2014/COCO_train2014_000000210683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340218, "question_id": "9T6i7TSYAQVJm3eJJiPQuZ", "question": "The man in white t-shirt is participating in what type of competition?", "choices": ["drinking", "hotdog eating", "trivia", "baseball"], "correct_choice_idx": 1, "direct_answers": ["eating", "eating", "eating", "eating", "hotdog eating", "eating", "food", "eating", "food eating", "eating"], "difficult_direct_answer": false, "rationales": ["There are a lot of the food item in from of the man, and he is quickly eating them.", "The man is eating and that is the only link to any of the answers given.", "The man is eating a sandwich on a bun."], "image": "val2014/COCO_val2014_000000340218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107628, "question_id": "9TJR8TqmeS3JCu8his8tZH", "question": "What style hat is this photographer wearing?", "choices": ["fedora", "baseball cap", "ski hat", "beanie"], "correct_choice_idx": 3, "direct_answers": ["helmet", "beanie", "helmet", "helmet", "beanie", "beanie", "helmet", "beanie", "beanie", "helmet"], "difficult_direct_answer": false, "rationales": ["The hat covers his head and it is knit.", "It is knitted fabric, does not have a brim and fits snugly on their head.", "The style hat is a beanie."], "image": "train2014/COCO_train2014_000000107628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371890, "question_id": "9TU5x3yM9JowBGiEMVxLcB", "question": "What faces the most danger of getting hurt if people go to ride the bikes?", "choices": ["cat", "people", "chair", "bikes"], "correct_choice_idx": 0, "direct_answers": ["cat", "cat", "cat", "kill", "cat", "cat", "people", "cat", "falling off", "cats"], "difficult_direct_answer": false, "rationales": ["The cat is sitting very close to the bike tires and could get run over if not careful.", "The cats face is right near the wheel of the bike and therefor could get hurt easily.", "The cat is small and harder to see and could get run over."], "image": "val2014/COCO_val2014_000000371890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527963, "question_id": "9TVtRkxmj5Pzvn6dZryWPv", "question": "What United States city is the book about with the blue spine on the middle shelf?", "choices": ["new york", "chicago", "las vegas", "los angeles"], "correct_choice_idx": 2, "direct_answers": ["las vegas", "las vegas", "las vegas", "las vegas", "las vegas", "las vegas", "las vegas", "las vegas", "las vegas", "las vegas"], "difficult_direct_answer": false, "rationales": ["A book called \"nine\" is about las vegas. a book called \"nine\" is on a shelf.", "The city is vegas.", "It says the name of the city on the spine and it is also called sin city."], "image": "train2014/COCO_train2014_000000527963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211041, "question_id": "9Ta6XRNsvBRzNuLrDRGr4Y", "question": "How will the scissors move next?", "choices": ["straight up", "remain still", "arc downwards", "straight down"], "correct_choice_idx": 2, "direct_answers": ["down", "right", "towards left", "down", "hit wall", "falling", "fall", "arc downwards", "down", "thrown"], "difficult_direct_answer": false, "rationales": ["Based on the location of the scissors they likely originated from the hand as they could not appear in that location naturally. if moved from the hand and no longer attached to anything, gravity will begin to pull them down.", "These scissors are pictured suspended in mid air having been presumably thrown there by the hand on the left. gravity dictates these scissors will fall to the ground.", "The scissors were thrown upwards and at an angle, so they will fall in the reverse."], "image": "train2014/COCO_train2014_000000211041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162829, "question_id": "9TeEGQnjYp9H5hYqzzMHGB", "question": "What type of event is this?", "choices": ["show", "reception", "presentation", "meeting"], "correct_choice_idx": 1, "direct_answers": ["casual", "reception", "dinner", "party", "company luncheon", "banquet", "wedding", "school reunion", "wedding reception", "party"], "difficult_direct_answer": true, "rationales": ["People are sitting at decorated tables outdoors.", "This looks to be a reception that people are at.", "The people are sitting at banquet tables which are decorated with pretty green table runners, flowers in vases, and dinnerware. some people are eating and there are pitchers of lemonade on the tables."], "image": "val2014/COCO_val2014_000000162829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288340, "question_id": "9Ti2qJUWcNpeSHykLqQL9M", "question": "The chain prevents what from happening?", "choices": ["speeding", "sinking", "theft", "floating away"], "correct_choice_idx": 3, "direct_answers": ["boat leaving", "breaking away", "falling", "boat drifting", "floating away", "boat", "boat moving", "drifting", "floating away", "boat drifting"], "difficult_direct_answer": false, "rationales": ["The chain secures the boat to the dock preventing it from moving from that location.", "A chain is used to secure the boat to a fixed object so that the boat will not move elsewhere due to the water currents.", "The attachment of this boat to the dock keeps it in place."], "image": "val2014/COCO_val2014_000000288340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57702, "question_id": "9TjZedq8ZtERsKLj3pPhS9", "question": "What weather emergency happens if the water continues to spray?", "choices": ["tornado", "lightening storm", "hurricane", "flood"], "correct_choice_idx": 3, "direct_answers": ["flooding", "flood", "flood", "flooding", "flood", "flood", "flood", "flood", "flooding", "rain"], "difficult_direct_answer": false, "rationales": ["The weather will flood.", "An excess of water causes a flood.", "If the fire hydrant isn't contained there will be a mass amount of water concentrated in this one area. this can also be caused naturally when a floor occurs."], "image": "train2014/COCO_train2014_000000057702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36748, "question_id": "9TzutHgWt9JhVaaTrxFn89", "question": "Where will the person who kicked the ball land?", "choices": ["arm", "out-of-bounds", "goal", "rear end"], "correct_choice_idx": 3, "direct_answers": ["his back", "ground", "ground", "ground", "ground", "ground", "on ground", "rear end", "ground", "bottom"], "difficult_direct_answer": false, "rationales": ["The man is situated in a way he will fall on his bum.", "The person who kicked the ball will fall on his butt.", "The man that just kicked the ball is in midair and will land on his backside."], "image": "train2014/COCO_train2014_000000036748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57940, "question_id": "9U8W85mVdueHBXrUrevtvw", "question": "What is Norway's national sport?", "choices": ["swimming", "surfing", "skiing", "kiting"], "correct_choice_idx": 2, "direct_answers": ["skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["Norway is known for skiing.", "It is a cold country with a lot of snow", "Norwegians love their winter sport."], "image": "train2014/COCO_train2014_000000057940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550009, "question_id": "9UDyxtWdy8uxYMrcSL83Kv", "question": "What are surfboards made out of?", "choices": ["foam", "rubber", "cloth", "wood"], "correct_choice_idx": 1, "direct_answers": ["wood", "wood", "fiberglass", "fiberglass", "wood", "rubber", "fiberglass", "fiberglass", "fiberglass", "carbon fiber"], "difficult_direct_answer": false, "rationales": ["The material helps the surf board float", "Cloth, foam, and wood wouldn't be good to stand on in the water.", "This keeps them lightweight and buoyant"], "image": "train2014/COCO_train2014_000000550009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294176, "question_id": "9ULmBb78v87tv7mjPF9ZuV", "question": "What time is shown?", "choices": ["late night", "rush hour", "sunset", "sunrise"], "correct_choice_idx": 1, "direct_answers": ["six twentyfour", "620", "619", "six nineteen", "620", "619", "six twenty", "619", "six nineteen", "rush hour"], "difficult_direct_answer": false, "rationales": ["There are a lot of people to and fro around town. the time is past 6.", "Based on the amount of people in this space, the business attire they are wearing and the times visible on the clocks, this looks to be taking place after working hours where people are commuting home. this time of day is known as rush hour.", "It is packed in the middle of the day."], "image": "val2014/COCO_val2014_000000294176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268049, "question_id": "9UXxmy8tuGuw3N2dR26oFG", "question": "The area outside the building would be described as what by a person?", "choices": ["mild", "warm", "hot", "cold"], "correct_choice_idx": 3, "direct_answers": ["exterior", "exterior", "city", "cold", "entrance", "neighborhood", "white", "sidewalk", "street", "busy"], "difficult_direct_answer": true, "rationales": ["The building casts a shadow. the sun can't shine where there is a shadow.", "There is presence of both sunlight and a shadow.", "The trees are bare and the sky is somewhat cloudy. it looks like winter."], "image": "val2014/COCO_val2014_000000268049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213687, "question_id": "9UaXsqAKfFuRJy9xMWEbnB", "question": "What sort of bird meats are sold in this large building among other things?", "choices": ["chicken", "dove", "duck", "pigeon"], "correct_choice_idx": 0, "direct_answers": ["chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["One of the store names present in this street scene is kfc. kfc is known for it's fried chicken.", "A building has a kfc logo. kfc sells chicken.", "Kfc sells fried chicken."], "image": "val2014/COCO_val2014_000000213687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271873, "question_id": "9UeQqvqzQMNowJstYWyRen", "question": "Why have these people come to the beach?", "choices": ["to grill", "to run", "to eat", "to surf"], "correct_choice_idx": 3, "direct_answers": ["to surf", "surfing", "to surf", "surf", "surf", "surf", "surfing", "surfing", "surf", "to surf"], "difficult_direct_answer": false, "rationales": ["They are holding surf boards in their arms.", "You can determine this due to what they are wearing and what they are holding.", "People use those boards when they are riding on the water"], "image": "train2014/COCO_train2014_000000271873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207761, "question_id": "9UiaxvM5Frki8xCVE9m3kE", "question": "Where is he most likely to land?", "choices": ["board", "on bricks", "on man", "upper platform"], "correct_choice_idx": 0, "direct_answers": ["in grass", "ground", "plywood", "sheet", "on ground", "board", "table", "ground", "platform", "on board"], "difficult_direct_answer": true, "rationales": ["He is flying off the ledge, and is aiming for the board as it was purposely placed there to assist with this trick.", "They have the wooden plyboards laid out so he will land on them.", "He is heading towards the wooden board on the ground."], "image": "train2014/COCO_train2014_000000207761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226664, "question_id": "9UkT7BEWApNVAVGVnj5QKc", "question": "To which direction is the dog staring at?", "choices": ["up", "left", "down", "right"], "correct_choice_idx": 1, "direct_answers": ["left", "back", "left", "left", "left", "west", "rear", "behind", "outward", "west"], "difficult_direct_answer": false, "rationales": ["The dog is staring left.", "A dog is in a car with his head hanging out and turned towards the back of the car.", "The dog is looking away from the truck."], "image": "train2014/COCO_train2014_000000226664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556207, "question_id": "9VEKkWks3u2qvuygz2cPRf", "question": "What could be making it more difficult for the heron to catch fish?", "choices": ["wave", "sun", "boat", "boy"], "correct_choice_idx": 2, "direct_answers": ["fisherman", "boat", "boat", "boat", "boat", "boat", "fisherman", "boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["A heron is wading in water near a boat. boats scare fish.", "The other options wouldn't impact the fishing unless the sun was way too bright.", "The boat is scaring the fish, which makes it more difficult for the animal to attempt to catch fish."], "image": "train2014/COCO_train2014_000000556207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239204, "question_id": "9Vn9RWQsWCDHbwttQ6oFek", "question": "In what was the item shown here prepared?", "choices": ["frying pan", "open fire", "oven", "stove top"], "correct_choice_idx": 2, "direct_answers": ["cake", "pan", "home", "baked", "oven", "pan", "oven", "oven", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["The cake or brownies are made in the oven.", "The item shown is a cake. it was baked in the oven.", "This would of been baked in the oven"], "image": "train2014/COCO_train2014_000000239204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546538, "question_id": "9Vs289qvcMb8SPWBpeqnJA", "question": "What hair accessory is the player wearing to keep his hair out of his face?", "choices": ["clip", "bandana", "sweatband", "scrunchy"], "correct_choice_idx": 2, "direct_answers": ["sweatband", "bandana", "headband", "hair band", "headband", "headband", "headband", "bandana", "headband", "sweatband"], "difficult_direct_answer": false, "rationales": ["The person is wearing a sweatband on his arm and head.", "A band worn around the forehead is designed to keep water from dripping into the eyes.", "A tennis player has a band of material tied around his head while he plays."], "image": "train2014/COCO_train2014_000000546538.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519916, "question_id": "9WUo2pg3oAXAyNcCdqSs2z", "question": "What type of net is being played over?", "choices": ["tennis", "volleyball", "fish", "fencing"], "correct_choice_idx": 1, "direct_answers": ["volleyball", "volleyball", "badminton", "badminton", "volleyball", "volleyball", "tennis net", "volleyball", "volleyball", "volleyball"], "difficult_direct_answer": false, "rationales": ["We see the beach, the sand & the high nets. only one sport is played in this scenario.", "The people are playing beach volleyball.", "The net is a volleyball one."], "image": "val2014/COCO_val2014_000000519916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263762, "question_id": "9WVqLejeJvckW2ieroiC8S", "question": "Williams-Bala is located in which country?", "choices": ["germany", "us", "uk", "france"], "correct_choice_idx": 2, "direct_answers": ["uk", "uk", "cyprus", "wales", "europe", "england", "wales", "france", "wales", "cynwyd"], "difficult_direct_answer": false, "rationales": ["The country is the uk.", "Williams-bala is located in wales according to an internet search.", "The name itself shows that it comes from uk."], "image": "train2014/COCO_train2014_000000263762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100586, "question_id": "9WgEHiK2KVC2GhoazwCmY2", "question": "The monitors on the desk are displaying which OS?", "choices": ["windows 7", "windows xp", "macos", "ubuntu"], "correct_choice_idx": 2, "direct_answers": ["apple mac", "windows", "mac", "mac os", "windows", "windows", "mac", "unknown", "macos", "macos"], "difficult_direct_answer": false, "rationales": ["The monitors are displaying an apple device.", "The monitors are mac computers.", "The screens look like a form of windows and like the style of window's 7."], "image": "train2014/COCO_train2014_000000100586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499802, "question_id": "9Wh3aBbepBn2tHeP2G92K3", "question": "Why is this motorcycle in front?", "choices": ["is police", "is lost", "arrived first", "random"], "correct_choice_idx": 0, "direct_answers": ["leading", "parade", "security", "is police", "policeman", "leading procession", "bike race", "leader", "motorcade", "lead"], "difficult_direct_answer": true, "rationales": ["The motorcycle says \"police\" on it.", "The motorcycle is the police.", "To be the lead motorcycle"], "image": "val2014/COCO_val2014_000000499802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166338, "question_id": "9WhiEsvQW7eiV6XpTeSAMR", "question": "What is in the water?", "choices": ["eel", "flamingo", "human", "elephant"], "correct_choice_idx": 3, "direct_answers": ["elephant", "animals", "elephant", "elephants", "elephants", "elephants", "animals", "elephants", "elephants", "elephants"], "difficult_direct_answer": false, "rationales": ["The water has the elephant.", "A giant, grey mammal with big ears and tusks are what make up said species.", "The animals have long trunks."], "image": "train2014/COCO_train2014_000000166338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390213, "question_id": "9WxkMrmNJqM9jvX8Q6ZYfh", "question": "What does the green sign mean?", "choices": ["plug in", "give all", "move mentally", "say prayer"], "correct_choice_idx": 1, "direct_answers": ["cheering", "win", "use power", "use strength", "power", "utilize energy", "use power", "give all", "encouragement", "protest"], "difficult_direct_answer": true, "rationales": ["The sign says use your power, which means give it your all.", "A person is playing tennis in front of an audience with signs. people make signs to encourage athletes.", "The green sign is a form of encouragement."], "image": "train2014/COCO_train2014_000000390213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58869, "question_id": "9XmpxJCFGQ3qMtzPEALX8T", "question": "What seafood is the man enjoying outdoors?", "choices": ["catfish", "squid", "oysters", "halibut"], "correct_choice_idx": 2, "direct_answers": ["oysters", "oysters", "oysters", "oysters", "oysters", "shrimp", "oysters", "shrimp", "oysters", "oysters"], "difficult_direct_answer": false, "rationales": ["The guy is eating what they call \"ostras\" in spanish.", "There are shells over some ice.", "The lake setting, white wine, lemon, sauce, and fisherman attire all imply the small size food is aquatic."], "image": "val2014/COCO_val2014_000000058869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344830, "question_id": "9XuAvho4KjkQikycV8AiyU", "question": "Why is there a saddle on the elephant?", "choices": ["as decoration", "to ride", "to buy", "to sell"], "correct_choice_idx": 1, "direct_answers": ["passenger comfort", "carry passengers", "riding", "for riders", "sitting", "people riding", "to ride", "riders", "riding", "easy riding"], "difficult_direct_answer": true, "rationales": ["People are sitting on top of an elephant.", "In india elephants are used to transport people.", "The saddle is there so people can ride on him."], "image": "train2014/COCO_train2014_000000344830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456192, "question_id": "9XwdGdNNEzfPA3ZEqRx42f", "question": "Which body part seen here had historically been used to construct a Piano Part?", "choices": ["ears", "tail", "tusks", "hooves"], "correct_choice_idx": 2, "direct_answers": ["tusk", "tusks", "tusks", "foot", "tusk", "ivory", "tusk", "tusk", "tusk", "tusk"], "difficult_direct_answer": false, "rationales": ["The elephants under the barn have ivory tusks that used to be used for piano keys.", "The top layer of piano keys was once made with ivory. the elongated teeth found on an elephant are a source of ivory that cause many elephants to be hunted and killed.", "Tusks are made of ivory."], "image": "val2014/COCO_val2014_000000456192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4554, "question_id": "9XzbYxCdpvaUpEFK2Ysp9D", "question": "What skill level do most snowboarders here have?", "choices": ["beginners", "professional", "olympic", "competitive"], "correct_choice_idx": 0, "direct_answers": ["beginner", "beginners", "beginner", "beginner", "beginner", "beginner", "beginner", "beginner", "beginner", "beginner"], "difficult_direct_answer": false, "rationales": ["The people look like they don't know much about the sport.", "The people are learning how to snowboard", "Most of the snowboarders are basic beginners."], "image": "val2014/COCO_val2014_000000004554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98471, "question_id": "9Y9L5hrWKkM7Vrs3wJuNVy", "question": "What is the cover on the back of the truck called?", "choices": ["blanket", "tarp", "hood", "tonneau cover"], "correct_choice_idx": 3, "direct_answers": ["tonneau", "bed cover", "bed cover", "tonneau cover", "tano", "cover", "tonneau cover", "tano cover", "truck top", "tonneau cover"], "difficult_direct_answer": false, "rationales": ["This is the correct term per google search and wikipedia.", "A piece that protects anything in the back of the truck.", "It is a solid cover as opposed to the others which are either fabric or on the front of the truck."], "image": "train2014/COCO_train2014_000000098471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505933, "question_id": "9YJeiCAYRx3XnBfoCEoad9", "question": "What type of environment is this?", "choices": ["city", "forest", "ocean", "desert"], "correct_choice_idx": 0, "direct_answers": ["city", "urban", "urban", "bridge", "city", "city", "urban", "urban", "urban", "city"], "difficult_direct_answer": false, "rationales": ["Lots of buildings in the area that are close together.", "The tall buildings, heavy traffic, and commuter trains are all things found in a busy metropolitan area.", "This is in a city"], "image": "val2014/COCO_val2014_000000505933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53719, "question_id": "9Z5LSXZVgGC2VpaMun6eLS", "question": "Why was the white car abandoned in the street?", "choices": ["snow", "tornados", "flooding", "construction"], "correct_choice_idx": 2, "direct_answers": ["flooding", "flooding", "flooding", "stopped running", "flood", "flooding", "flooding", "flooded", "flooding", "flooding"], "difficult_direct_answer": false, "rationales": ["You can see all the water that is flooding the street and the car got stuck.", "The car has water up to the top half of their tires. it is difficult for cars to operate in this condition and frequently leaves to them being left behind.", "The white car seen here has most of it's tires submerged in water and is not usable."], "image": "train2014/COCO_train2014_000000053719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365140, "question_id": "9ZFE2CiHQP59JcoaPdii6W", "question": "What is the source of smoke?", "choices": ["natural gas", "coal", "gasoline", "indians"], "correct_choice_idx": 1, "direct_answers": ["train", "steam engine", "train", "train", "train", "coal", "train", "coal", "train", "train"], "difficult_direct_answer": false, "rationales": ["This train runs on coal and the smoke coming out is from that.", "The train is burning coal as fuel to make the engine run.", "The photograph depicts a steam engine, which typically uses this as a fuel source."], "image": "train2014/COCO_train2014_000000365140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506062, "question_id": "9ZNDKZNaCrTRadbuNBnPqT", "question": "When stopped what part of the smaller bus might most frequently pop out away from it's side?", "choices": ["tires", "tail pipe", "hood", "stop sign"], "correct_choice_idx": 3, "direct_answers": ["stop sign", "stop sign", "door", "sign", "stop sign", "sign", "stop sign", "sign", "stop sign", "door"], "difficult_direct_answer": false, "rationales": ["The stop sign stopped the bus.", "When the bus stops to pick up or let off riders, a traffic arm extends indicating that it is currently illegal to pass the bus.", "Traditionally school buses have these signs that will come out when it is stopped for the safety of children."], "image": "train2014/COCO_train2014_000000506062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353511, "question_id": "9ZbvjFs85wQM5DzHL3mN8D", "question": "What material is the floor made of?", "choices": ["ceramic tile", "wood", "vinyl", "carpet"], "correct_choice_idx": 0, "direct_answers": ["tile", "tile", "wood", "ceramic tile", "tile", "tile", "linoleum", "ceramic", "stone", "tile stones"], "difficult_direct_answer": false, "rationales": ["There are grout lines between the individual pieces.", "Tiles are on the floor.", "The material is ceramic tile."], "image": "train2014/COCO_train2014_000000353511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313007, "question_id": "9ZdkmasUktodU72SyjydPR", "question": "What does the blue sign mean?", "choices": ["handicap accessible", "caution", "stop", "go"], "correct_choice_idx": 0, "direct_answers": ["handicap", "lift", "handicap", "handicapped", "wheelchair", "handicap", "handicapped accesible", "handicap accessible", "access limited-mobility", "handicapped"], "difficult_direct_answer": false, "rationales": ["A blue sign is on a door and shows the white outline of a person sitting in a wheelchair. signage marks handicap facilities in public places.", "This is to show that people in wheelchairs can use it.", "The symbol represents a person in a wheelchair."], "image": "train2014/COCO_train2014_000000313007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81571, "question_id": "9ZeKzesYekM9Laq6rYyoXa", "question": "What items are obviously artificial here?", "choices": ["leaves", "people", "flowers", "giraffes"], "correct_choice_idx": 3, "direct_answers": ["giraffe", "giraffes", "giraffes", "giraffes", "giraffes", "giraffes", "giraffes", "giraffe", "giraffe", "giraffes"], "difficult_direct_answer": false, "rationales": ["The giraffes would not be loose in an area with so many people.", "Answer a is visible and does not appear how it would naturally in the real world based on the inorganic nature.", "Because they have been man made to create attraction."], "image": "train2014/COCO_train2014_000000081571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377626, "question_id": "9ZeWyLmFRb8rdcgRobXAg4", "question": "Who was behind the saving of the market in 1971?", "choices": ["victor steinbrueck", "perry stephens", "walt schumacher", "marcus finley"], "correct_choice_idx": 0, "direct_answers": ["victor steinbrueck", "citizens", "no clue", "farmers", "government", "nixon", "seattle", "farmers markets", "invalid question", "pike market"], "difficult_direct_answer": true, "rationales": ["Steinbrueck was behind the market's saving.", "Victor steinbrueck led an effort to keep the market in place.", "The farmers market was saved by victor steinbrueck."], "image": "val2014/COCO_val2014_000000377626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201596, "question_id": "9aBzHPLSumrBr4bzWZAcw5", "question": "What is causing the dog to rest his head on the side of the boat?", "choices": ["exhaustion", "laziness", "command", "boredom"], "correct_choice_idx": 3, "direct_answers": ["tiredness", "tired", "sleepiness", "view", "smells", "relaxed", "tired", "smells", "boredom", "tired"], "difficult_direct_answer": false, "rationales": ["The dog is bored and is looking forward.", "A dog is looking out over the side of a boat with his head laying on the rail. animals and people rest their heads when they are bored.", "He's relaxing and looking at where he'd like to be instead"], "image": "train2014/COCO_train2014_000000201596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362812, "question_id": "9aWm7QFw74FtKEM99Y6bfM", "question": "How do these people travel here?", "choices": ["train", "plane", "taxi", "uber"], "correct_choice_idx": 0, "direct_answers": ["standing", "subway", "subway", "train", "subway", "bus", "subway", "subway", "walk", "train"], "difficult_direct_answer": false, "rationales": ["There are hand straps and bars for people to hold onto when they have to stand as they travel", "The interior of the area looks much like a typical train.", "The digital display and poles for standing passengers tells us that these people are riding a train."], "image": "val2014/COCO_val2014_000000362812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86573, "question_id": "9aaL5acpTnzmz4eNjbxMRp", "question": "Where in the house are they likely planning to dine?", "choices": ["living room", "porch", "kitchen", "dining room"], "correct_choice_idx": 0, "direct_answers": ["living room", "living room", "living room", "living room", "living room", "living room", "living room", "living room", "living room", "living room"], "difficult_direct_answer": false, "rationales": ["Their food is on the coffee table in front of the tv.", "The food is on a coffee table. there is a television near the coffee table.", "Because the food is placed on the sitting table where there's a television."], "image": "train2014/COCO_train2014_000000086573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334291, "question_id": "9ae6kac2tmi9wzEcJGbwNV", "question": "What type of energy is being used by the stove?", "choices": ["convection", "microwave", "gas", "electric"], "correct_choice_idx": 2, "direct_answers": ["gas", "gas", "gas", "gas", "gas", "gas", "gas", "gas", "gas", "gas"], "difficult_direct_answer": false, "rationales": ["Gas is being used.", "There is live fire under the pots.", "You can tell by the flame that it is a gas stove."], "image": "val2014/COCO_val2014_000000334291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476772, "question_id": "9ahGQQVsJdZ3P7QG6D6244", "question": "Turning this faucet to the right produces what temperature water?", "choices": ["hot", "cold", "scalding", "boiling"], "correct_choice_idx": 1, "direct_answers": ["cold", "cold", "cold", "cold", "cold", "cool", "cold", "cold", "cold", "hot"], "difficult_direct_answer": false, "rationales": ["Cold water is typically on the right and hot water is on the left.", "The right makes the water very cold.", "The right side has a cold stream."], "image": "train2014/COCO_train2014_000000476772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334868, "question_id": "9am48Urb8QNnxxKF9ctRuB", "question": "What does the truck with the box topped scissor lift carry?", "choices": ["pets", "live bait", "food", "captains"], "correct_choice_idx": 2, "direct_answers": ["food", "food", "baggage", "airplane food", "food meals", "food", "food", "boxcar", "food", "gourmet items"], "difficult_direct_answer": false, "rationales": ["It is labeled as \"gourmet\".", "The truck has food.", "The box has the word gourmet printed on the side of it, gourmet is a type of cuisine so the contents are most likely produced and shipped by this company."], "image": "train2014/COCO_train2014_000000334868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394248, "question_id": "9apPQVtqvyaAQ8WFayyZ5E", "question": "What is the pig here entered in?", "choices": ["show", "race", "roping contest", "beauty contest"], "correct_choice_idx": 0, "direct_answers": ["contest", "contest", "4h show", "show", "show", "contest", "contest", "best pig", "sheepdog race", "contest"], "difficult_direct_answer": false, "rationales": ["The man dressed in business casual attire with the pig by his side and an audience viewing them indicates that they are in some sort of presentation.", "The man is showing how well he can control the pig. there are spectators watching.", "He is in a competition."], "image": "train2014/COCO_train2014_000000394248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259542, "question_id": "9b4NdRhqtZyQ7EXVjWWwae", "question": "What type of activity are the elephants doing?", "choices": ["playing", "eating", "sleeping", "washing"], "correct_choice_idx": 0, "direct_answers": ["giving rides", "performing", "playing soccer", "tricks", "soccer", "soccer", "playing", "soccer", "playing soccer", "playing"], "difficult_direct_answer": false, "rationales": ["There is a ball and net present which would be consistent with answer a and none of the features of any of the other answers are visibly present.", "The elephant is going towards a soccer ball.", "They are on a makeshift soccer field with a soccer ball and nets"], "image": "train2014/COCO_train2014_000000259542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465693, "question_id": "9b4iwLkw5PfM32JKZXicL7", "question": "What are the logs in front of the people for?", "choices": ["distancing", "sitting", "decoration", "climbing"], "correct_choice_idx": 0, "direct_answers": ["safety", "protection", "barrier", "keep out", "distance keeping", "distancing", "promote safety", "protection", "safety", "safety barrier"], "difficult_direct_answer": false, "rationales": ["The logs provide an extra barrier between the people and the animals so that they cannot get too close to each other.", "The logs are to stop the people from getting too close to the giraffes.", "People stand in front of a wood structure of a giraffe enclosure. visitors are separated from animals at a zoo."], "image": "train2014/COCO_train2014_000000465693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535089, "question_id": "9bEgxHrCdp5C86viFmH8JX", "question": "The animals are identified by a system using what color here?", "choices": ["yellow", "red", "green", "black"], "correct_choice_idx": 0, "direct_answers": ["yellow", "yellow", "brown", "brown", "yellow", "brown", "yellow", "yellow", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The animals have tags in their ears. the tags are not red, green, or black.", "The cows have yellow tags on their ears.", "The tags that are on the cows ear helps to identify them."], "image": "train2014/COCO_train2014_000000535089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20355, "question_id": "9bjFRHy9Y6uazLRuMFfGzH", "question": "Why is there a blanket on top of the elephant?", "choices": ["decoration", "dress code", "to ride", "to warm"], "correct_choice_idx": 2, "direct_answers": ["to ride", "protection", "protection", "seat", "seat", "comfort", "skin protection", "riders", "for protection", "for riders"], "difficult_direct_answer": false, "rationales": ["Elephants can be used to ride on. people generally put a blanket or saddle on an animal when riding them.", "This blanket is on top of what's effectively a saddle for this horse; allowing the elephant to be easier mounted and ridden.", "The blanket is to provide protection while riding the elephant."], "image": "train2014/COCO_train2014_000000020355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3124, "question_id": "9czQxjVQVewt2zqiGewfdC", "question": "What is the man jumping up to do?", "choices": ["hit", "speak", "run", "catch"], "correct_choice_idx": 3, "direct_answers": ["catch frisbee", "catch frisbee", "catch frisbee", "throw frisbee", "throwing", "catch frisbee", "catch", "catch frisbee", "catch frisbee", "catch"], "difficult_direct_answer": false, "rationales": ["He has his hand reached out. the frisbee is coming toward him.", "The man is jumping to catch the ball.", "He is jumping up to get the frisbee that is being thrown at him."], "image": "train2014/COCO_train2014_000000003124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310572, "question_id": "9dCJeJFGcn7KYJK8xuRXUJ", "question": "Where was this food made?", "choices": ["home", "store", "restaurant", "outside"], "correct_choice_idx": 0, "direct_answers": ["kitchen", "kitchen", "kitchen", "kitchen", "home kitchen", "home", "home", "kitchen", "kitchen", "restaurant"], "difficult_direct_answer": false, "rationales": ["Because the food id served in a ordinary plate.", "A plate of food is on a casual style plate and a residential countertop.", "The food was made at home."], "image": "val2014/COCO_val2014_000000310572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450910, "question_id": "9dKFhKbjjLNf8Lh4Vq96MG", "question": "What does the brownish green stuff bring to the beach?", "choices": ["unwanted trash", "minerals", "fish", "salt"], "correct_choice_idx": 0, "direct_answers": ["bacteria", "crabs", "seaweed", "animals", "marine organisms", "seaweed", "algae", "unwanted trash", "sand", "water"], "difficult_direct_answer": true, "rationales": ["Some people throw their trash in the water at the beach.", "There is trash around the sand.", "Seaweed contains ingredients that can nourish plants."], "image": "train2014/COCO_train2014_000000450910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165376, "question_id": "9dVHRvCQD5wwcWFa4YQ9J3", "question": "How many items in the living room may have to share an outlet with the laptop?", "choices": ["five", "three", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "one", "one", "one", "four", "two", "one", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There appears to be two electronics in the room that could potentially share an outlet with the laptop.", "There are two lamps, one in front and one in back of the room.", "The only other electrical items in the living room are the two lamps."], "image": "train2014/COCO_train2014_000000165376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42741, "question_id": "9dcqz9unwedJGR2wSaqjAo", "question": "Why is he wearing a tie?", "choices": ["stole it", "is joke", "impress cat", "is selling"], "correct_choice_idx": 1, "direct_answers": ["for fun", "is joke", "comedy", "for fun", "being funny", "costume", "loves snoopy", "joking", "dress up", "disguise"], "difficult_direct_answer": true, "rationales": ["The kid is wearing the tie for fun,and is not wearing the proper attire that accompanied with a tie.", "He is playing around.", "He is not dressed up in dress up clothes so he is just playing around."], "image": "train2014/COCO_train2014_000000042741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442223, "question_id": "9dcx2E4GrdBSTmTtfWAx9j", "question": "What type of seating would one expect to find in this building?", "choices": ["futons", "booths", "pews", "stools"], "correct_choice_idx": 2, "direct_answers": ["church", "pews", "pews", "pews", "pews", "pews", "pews", "church", "pews", "pews"], "difficult_direct_answer": false, "rationales": ["This is a small church building and it would have pews in it.", "This building is a church given the cross.", "The crosses on the building designate it is a church so traditional church seating would likely be found inside."], "image": "val2014/COCO_val2014_000000442223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83601, "question_id": "9dmtYedZuojB6HVXeV54w3", "question": "What type of telephone does this person have?", "choices": ["cellular", "rotary", "payphone", "landline"], "correct_choice_idx": 0, "direct_answers": ["cellular", "flip phone", "flip phone", "cell phone", "small phone", "cell", "smartphone", "flip phone", "flip phone", "verizon"], "difficult_direct_answer": false, "rationales": ["It is a small flip phone.", "The phone is of the same size and shape consistent with answer a and has no visible cords.", "The phone is a cell phone."], "image": "train2014/COCO_train2014_000000083601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361907, "question_id": "9e4evqvh626BZGSU6ZcPXE", "question": "What is the slang term for this male athlete's hairstyle?", "choices": ["man bun", "male bun", "pony bun", "knob bun"], "correct_choice_idx": 0, "direct_answers": ["man bun", "man bun", "manbun", "ponytail", "man bun", "pony tail", "tennis", "manbun", "man bun", "man bun"], "difficult_direct_answer": false, "rationales": ["A man has his hair slicked back into a bun.", "That is what they call that style.", "The bunched up end of this man's pulled back long hair could be said to resemble a bun."], "image": "train2014/COCO_train2014_000000361907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294814, "question_id": "9eM7iczsR59BJdx934pRSi", "question": "What type of animals are shown?", "choices": ["tiger", "lion", "horse", "zebra"], "correct_choice_idx": 2, "direct_answers": ["horses", "horses", "horses", "horses", "horses", "horses", "horses", "horses", "horses", "horse"], "difficult_direct_answer": false, "rationales": ["The animals are saddled up so that they can be ridden.", "They have leather strapping on them and saddles so people can ride", "The animals are not big cats. they look similar to zebras but do not have black and white stripes."], "image": "val2014/COCO_val2014_000000294814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539977, "question_id": "9eRt6Y2qJLKrbRnR7aGBty", "question": "What is the most likely reason the street is filled with bicyclists?", "choices": ["training", "race", "parade", "protest"], "correct_choice_idx": 1, "direct_answers": ["racing", "race", "race", "race", "race", "marathon", "race", "race", "race", "bicycle race"], "difficult_direct_answer": false, "rationales": ["There is a race that is happening.", "They are all riding fast and close together, wearing sunglasses and athletic clothing.", "They are in a competition to see who wins."], "image": "train2014/COCO_train2014_000000539977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9851, "question_id": "9ei2F4QvaS7ovuErYRyR9a", "question": "This animal has a long what?", "choices": ["stinger", "face", "wing", "quill"], "correct_choice_idx": 1, "direct_answers": ["snout", "mane", "snout", "nose", "nose", "face", "nose", "nose", "snout", "snout"], "difficult_direct_answer": false, "rationales": ["The horse staring at the dog looks like he has a very long face.", "The animal has a long face.", "Horses have a very elongated head structure."], "image": "train2014/COCO_train2014_000000009851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102353, "question_id": "9fPehZwSSFGPVFXMhfwtk2", "question": "In what state was the oven manufacturer founded?", "choices": ["nevada", "vermont", "new mexico", "oklahoma"], "correct_choice_idx": 1, "direct_answers": ["vermont", "sweden", "vermont", "no idea", "indiana", "mississippi", "oven state", "vermont", "china", "vermont"], "difficult_direct_answer": false, "rationales": ["The manufacturer, blodgett, is located in essex junction in this state.", "The state is vermont.", "Made in that state."], "image": "val2014/COCO_val2014_000000102353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385743, "question_id": "9fS7LT35BNRCQYkByDGtY8", "question": "In what year did this company go public?", "choices": ["1975", "2004", "1980", "1995"], "correct_choice_idx": 2, "direct_answers": ["1980", "1980", "two thousand", "1980", "after 1960", "in 1980", "before decade", "1980", "apple-unknown", "nineteen eighty"], "difficult_direct_answer": false, "rationales": ["Apple became a public company late in the twentieth century.", "It went public in 1980", "They went public in 1980"], "image": "train2014/COCO_train2014_000000385743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491857, "question_id": "9fgcjCEaAaQ6X4PZqjUVsa", "question": "What are the boaters going to do at their destination?", "choices": ["sight see", "shop", "party", "swim"], "correct_choice_idx": 1, "direct_answers": ["shop", "shop", "sell wares", "sell merchandise", "market", "sell wares", "shop", "shop", "buy/sell goods", "shopping"], "difficult_direct_answer": false, "rationales": ["A variety of services and stores are advertised along these waterways.", "Some boaters have items on their boats that look like they came from stores.", "Boaters are boating in a canal with stores on both sides."], "image": "train2014/COCO_train2014_000000491857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318541, "question_id": "9fjkvdmNrQTJro9JcNseV8", "question": "What type of drink is the lady holding?", "choices": ["lukewarm water", "cool drink", "hot drink", "cocoa"], "correct_choice_idx": 1, "direct_answers": ["soda", "soda", "soda", "unknown", "fruit drink", "soda", "smoothie", "soda", "cool drink", "softdrink"], "difficult_direct_answer": false, "rationales": ["A woman is holding a drink in a plastic cup with a straw. cold drinks are kept in plastic cups.", "The drink is cold as it doesn't have an insulator around the cup.", "The woman's drink is likely soda."], "image": "train2014/COCO_train2014_000000318541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575631, "question_id": "9g4EEjyS2uGyoeAeeAR5cc", "question": "What does the child imagine his toy bear does now?", "choices": ["eat honey", "phone call", "make house", "sing song"], "correct_choice_idx": 1, "direct_answers": ["it talks", "talking", "communicating", "smile", "phone call", "talks", "make call", "phone call", "speak listen", "talk phone"], "difficult_direct_answer": true, "rationales": ["The bear is making a call.", "The child has his toy bear make a phone call.", "The boy holds a phone to his toy's ear. he is pretending it's taking a call."], "image": "train2014/COCO_train2014_000000575631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78701, "question_id": "9gSUJ5jGdTLqV4GyYTBje3", "question": "Where is this cake and woman located?", "choices": ["tech office", "child's home", "zoo", "family home"], "correct_choice_idx": 0, "direct_answers": ["mozilla headquarters", "workers breakroom", "party", "at work", "north america", "work", "office", "office", "england", "tech office"], "difficult_direct_answer": true, "rationales": ["There is an internet explorer logo on the cake.", "She looks to be in an office", "A lot of equipment is around her."], "image": "val2014/COCO_val2014_000000078701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143171, "question_id": "9gekATnS9kBo2t7PBLTA8g", "question": "What might the man do with the white object?", "choices": ["stomp on", "wipe hands", "trade", "sell"], "correct_choice_idx": 1, "direct_answers": ["clean noise", "dry hands", "throw", "wipe", "wipe hands", "throw away", "clean hands", "throw away", "dry hands", "clean"], "difficult_direct_answer": false, "rationales": ["It is a cloth or napkin used to clean yourself off", "Though it has many uses, generally these are used for clean up.", "The man is holding a towel or napkin. he might use it to clean his fingers."], "image": "train2014/COCO_train2014_000000143171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347139, "question_id": "9hG4LvRXJkTp3p7aUU6fHm", "question": "The meat in the bun is most likely harvested from what?", "choices": ["goat", "cow", "duck", "pig"], "correct_choice_idx": 3, "direct_answers": ["cows", "pigs", "pig", "pig", "chicken", "pig", "beef", "cow", "pig", "pig"], "difficult_direct_answer": false, "rationales": ["A man stands on the side of a very large pulled pork sandwich.", "A pulled pork sandwich is on display. pork is from pigs.", "The meat appears to be pulled pork. there is a figure of the source animal on top of the bun."], "image": "train2014/COCO_train2014_000000347139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32056, "question_id": "9hJg5yy94pAJCGrKBbVuZ8", "question": "What school does this boy attend?", "choices": ["none", "elementary", "harvard", "holy cross"], "correct_choice_idx": 3, "direct_answers": ["holy cross", "holy cross", "holy cross", "private", "holy cross", "holy cross", "private", "holy cross", "private", "holy cross"], "difficult_direct_answer": false, "rationales": ["It says on his vest.", "The school name is on the boy's uniform vest.", "The boy is wearing a school uniform and the name of the school is embroidered on his sweater."], "image": "val2014/COCO_val2014_000000032056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539819, "question_id": "9hZGPq8tcbV6bAr4tNcNn9", "question": "What is the man moving to avoid?", "choices": ["branches", "chains", "cones", "leaves"], "correct_choice_idx": 2, "direct_answers": ["cone", "cone", "cone", "cones", "cones", "cone", "cone", "pylon", "cone", "cones"], "difficult_direct_answer": false, "rationales": ["He is on a course and these dictate where you go", "The man is trying to skateboard around the cones.", "A man is skateboarding through cones."], "image": "train2014/COCO_train2014_000000539819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254549, "question_id": "9hjBbrRDDuZNQFEohtjxvU", "question": "What was done to the rice before mixed with the broccoli?", "choices": ["steamed", "baked", "broiled", "grilled"], "correct_choice_idx": 0, "direct_answers": ["cooked", "cooked", "steamed", "boiled", "cooked", "cooked", "steamed", "cooked", "steamed", "cooked"], "difficult_direct_answer": false, "rationales": ["The rice has been steamed before it had been mixed in with the broccoli.", "You usually steam rice in a rice cooker then you can add the broccoli in.", "Rice is often steamed to cook it."], "image": "train2014/COCO_train2014_000000254549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211892, "question_id": "9iH6XhNfMCnDpMJUKGzDM6", "question": "What treat does pictured animal like?", "choices": ["bone", "catnip", "chili peppers", "vinegar"], "correct_choice_idx": 0, "direct_answers": ["bone", "dog bones", "peanut butter", "dog treat", "bones", "bones", "bacon", "bone", "bone", "bone"], "difficult_direct_answer": false, "rationales": ["The treat is a bone.", "The animal is a dog, not a cat. not many pet animals would like chili peppers or vinegar.", "The dog likes the bone."], "image": "train2014/COCO_train2014_000000211892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128020, "question_id": "9iNsUgn8cXFsKQYkE99CuH", "question": "What are these girls allowed to feed any ducks here they see?", "choices": ["seeds", "nothing", "bread", "donuts"], "correct_choice_idx": 1, "direct_answers": ["nothing", "not allowed", "nothing", "nothing", "nothing", "nothing", "nothing", "nothing", "nothing", "not allowed"], "difficult_direct_answer": false, "rationales": ["The girls can't feed them.", "There is a warning sign behind the bench the girls are sitting at indicating not to feed the ducks.", "The sign clearly states no feeding of the ducks."], "image": "train2014/COCO_train2014_000000128020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309311, "question_id": "9iZzDRSZhhgF3TeeeBPKss", "question": "What action is the tennis player doing?", "choices": ["dancing", "serving ball", "receiving ball", "jumping"], "correct_choice_idx": 2, "direct_answers": ["tennis forehand", "serving", "serving", "hitting ball", "jumping", "serving", "receiving ball", "striking", "serving", "hitting"], "difficult_direct_answer": false, "rationales": ["By the position and ball in air you can tell what is happening.", "The tennis player is about to hit the ball.", "The ball is coming to him."], "image": "train2014/COCO_train2014_000000309311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213781, "question_id": "9jUjD5AX4bECCguBjDjMtV", "question": "What citrus fruit is atop the fried food?", "choices": ["lime", "orange", "grapefruit", "lemon"], "correct_choice_idx": 0, "direct_answers": ["lime", "lime", "lime", "lime", "lime", "lime", "lemon", "lemon", "lime", "lemon"], "difficult_direct_answer": false, "rationales": ["The fruit looks similar to a lemon, but is green.", "The green citrus is a tad less sour than the yellow one.", "The citrus fruit is green, not yellow, orange, or red."], "image": "train2014/COCO_train2014_000000213781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191667, "question_id": "9jYaR7jRzUFd3Q7E8HSqwV", "question": "The cat underneath the chairs is present in what type of store?", "choices": ["convenience store", "bodega", "bookstore", "cafe"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "movable", "coffee shop", "restaurant", "restaurant", "cafe", "cafe", "cafe", "cafe", "coffee shop"], "difficult_direct_answer": false, "rationales": ["A cat is laying on the floor in an area with lots of tables and chairs. cafes have seating.", "The chairs are sort of fancy so the cat could be outside a cute cafe.", "This might be b too, but a is more likely based on the tables above."], "image": "train2014/COCO_train2014_000000191667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306451, "question_id": "9k5HGygXyD37KTiEsZNxVU", "question": "What are the vehicles attempting to do?", "choices": ["collide", "merge", "park", "race"], "correct_choice_idx": 1, "direct_answers": ["proceed through", "drive", "move", "merge", "merge", "merge", "drive", "merge", "merge", "merging"], "difficult_direct_answer": false, "rationales": ["The vehicles want to merge lanes.", "The vehicles are heading in the same direction and are moving from multiple lanes into one. this travel pattern is known as answer a.", "All the cars have to get into one lane so they have to come together and form a single line."], "image": "train2014/COCO_train2014_000000306451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476947, "question_id": "9kjAbKomBeHKzTQmvfhTZi", "question": "Why are the skiers crouched over?", "choices": ["to sit", "to roll", "for safety", "for speed"], "correct_choice_idx": 3, "direct_answers": ["speed", "balance", "for speed", "for speed", "go faster", "speed", "aerodynamics", "avoid wind", "gain speed", "skis"], "difficult_direct_answer": false, "rationales": ["This makes their movement more aerodynamic", "They are skiing.", "Traditionally in skiing bending in that position will help you glide faster on your descent."], "image": "train2014/COCO_train2014_000000476947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438993, "question_id": "9kkwCZ9krDNRZh7eNErCkD", "question": "Which of these four sinks from left to right should the man definitely avoid?", "choices": ["third", "fourth", "first", "second"], "correct_choice_idx": 2, "direct_answers": ["left", "left", "left sink", "left", "left", "left two", "left", "far left", "first", "leftmost"], "difficult_direct_answer": false, "rationales": ["The man should avoid the first sink which is covered in blood.", "The first sink looks like it has blood next to it and that should be avoided.", "It's dirty and perhaps a health hazard."], "image": "train2014/COCO_train2014_000000438993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188865, "question_id": "9m2GDfud9PGyR6sgL8F76W", "question": "Why are both of them wearing cloth on their foreheads?", "choices": ["style", "punishment", "prevent sweat", "for work"], "correct_choice_idx": 2, "direct_answers": ["both sweat", "sweatbands", "headbands", "sweat", "control sweat", "prevent sweat", "sweat band", "sweat", "stop sweat", "sports"], "difficult_direct_answer": true, "rationales": ["Sweatbands prevent sweat from dripping in the eyes of athletes and both of the tennis players are wearing sweatbands on their foreheads.", "When doing exercise the cloth can keep sweat out of the eyes.", "The forehead cloth bands worn by these two are intended to keep sweat out of the eyes."], "image": "val2014/COCO_val2014_000000188865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237976, "question_id": "9m4jp7tXVYf97H5pnXp8Hd", "question": "Which way is the person taking this photo facing in relation to the mirror?", "choices": ["sideways", "different room", "backwards", "facing it"], "correct_choice_idx": 2, "direct_answers": ["backwards", "backwards", "backwards", "away", "backwards", "away", "backwards", "backwards", "backwards", "away"], "difficult_direct_answer": false, "rationales": ["The reflection is clearly visible in the mirror and reflects what is faced towards the mirror which would be answer a.", "The person's back can be seen in the mirror.", "The reflection is backwards."], "image": "train2014/COCO_train2014_000000237976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517921, "question_id": "9mBBN5ZBBJRYCJuyzitZNo", "question": "How is this four wheeler operated?", "choices": ["remote control", "solar power", "robot power", "self driving"], "correct_choice_idx": 0, "direct_answers": ["remote control", "remote control", "remotely", "tractor", "remote control", "remote control", "remote controll", "gas", "remote control", "remote"], "difficult_direct_answer": false, "rationales": ["The person has a remote.", "The four wheeler is operated by a controller held by the person standing next to it.", "The person has a remote control in their hand to move the atv."], "image": "train2014/COCO_train2014_000000517921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442942, "question_id": "9mcxbySCgG4tX8VLLHQuDE", "question": "What type people ride on this train?", "choices": ["elderly only", "salesmen", "mall workers", "tourists"], "correct_choice_idx": 3, "direct_answers": ["children caregivers", "adults", "kids", "tourists", "children", "kids", "families", "guests", "kids", "tourists"], "difficult_direct_answer": false, "rationales": ["The train is designed for tourists to sit in and it rides them around on a tour.", "Tourists are on the train during their vacation.", "People come from other places to ride the train."], "image": "val2014/COCO_val2014_000000442942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113317, "question_id": "9mgvR5wDjc7UoXuPeZaLTw", "question": "Who is the man in the black pants watching so intently?", "choices": ["doubles partner", "judge", "coach", "fan"], "correct_choice_idx": 1, "direct_answers": ["baseline", "ball", "tennis ball", "opponent", "coach", "referee", "tennis players", "judge", "player", "tennis match"], "difficult_direct_answer": true, "rationales": ["He is looking at the person with the tennis racket.", "The man is watching a sport, and his role is to make sure rules are enforced.", "The man is watching to call whether the play was fair."], "image": "train2014/COCO_train2014_000000113317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432233, "question_id": "9mvihZLSog6y8QRXYpptGs", "question": "How many teams compete here?", "choices": ["one", "none", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Looks to be no teams are competing in this game of frisbee.", "There are two teams.", "Usually there is an offense and a defense and that would leave two teams."], "image": "train2014/COCO_train2014_000000432233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46737, "question_id": "9n9vVYuUygQP6kD7iyY7d9", "question": "What is the make of the blue pickup truck?", "choices": ["chevrolet", "ford", "toyota", "gmc"], "correct_choice_idx": 1, "direct_answers": ["ford", "ford", "ford", "ford", "dodge", "ford", "ford", "ford", "ford", "ford"], "difficult_direct_answer": false, "rationales": ["Ford makes the blue pickup truck.", "This is a 4x4 and it has the name of it on the side of it.", "The side of the truck has the company emblem."], "image": "train2014/COCO_train2014_000000046737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127120, "question_id": "9nEocfvUmf2xFGZ4JWDGDQ", "question": "What is the red thing held by the man?", "choices": ["remote", "water bottle", "microphone", "whistle"], "correct_choice_idx": 2, "direct_answers": ["microphone", "microphone", "microphone", "microphone", "microphone", "microphone", "microphone", "microphone", "microphone", "microphone"], "difficult_direct_answer": false, "rationales": ["As proven by the fact that he's singing into it.", "The man is holding a microphone on stage.", "The man is holding a microphone and singing into it."], "image": "val2014/COCO_val2014_000000127120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391834, "question_id": "9nXgCXZNTd94bWAiFSnrhq", "question": "What is partially blocking this image?", "choices": ["fan", "spokes", "racket", "tubing"], "correct_choice_idx": 2, "direct_answers": ["net", "racket", "racket", "racket", "tennis", "racket", "racquet", "net", "racket", "sun"], "difficult_direct_answer": false, "rationales": ["There is wood with strings on it", "The tennis racket is in the way.", "There is a close up of a tennis racket to the right of the image which takes up part of the picture."], "image": "val2014/COCO_val2014_000000391834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199029, "question_id": "9ndfQWYtCKsY3MovHBnuPn", "question": "What country might be close off to the left?", "choices": ["italy", "america", "germany", "france"], "correct_choice_idx": 0, "direct_answers": ["italy", "italy", "italy", "china", "china", "italy", "italy", "italy", "china", "italy"], "difficult_direct_answer": false, "rationales": ["Italy is on the sign with an arrow pointing that way.", "Italy is the shape of a boot and it's on the sign.", "By its geographical design you can discern the country of origin."], "image": "train2014/COCO_train2014_000000199029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231422, "question_id": "9nunq88xhHyJsjs8MFH98k", "question": "What skill level are these two women in?", "choices": ["professional", "advanced", "athletic", "beginner"], "correct_choice_idx": 3, "direct_answers": ["beginners", "beginners", "beginners", "beginners", "beginner", "excellent", "beginner", "beginners", "beginner", "beginner"], "difficult_direct_answer": false, "rationales": ["The safety gear and cautious stances of these two women tell us they are likely novices at skating.", "They are wearing a lot of protection gear typically worn when learning how to skate for the first time.", "Because she is wearing a helmet."], "image": "train2014/COCO_train2014_000000231422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52936, "question_id": "9o4Uq7C5LY7troXLz4Xq9g", "question": "What does the child hold in her hands?", "choices": ["phone", "jewel box", "tv remote", "wii remote"], "correct_choice_idx": 3, "direct_answers": ["wii remote", "wii remote", "controller", "wii control", "remote", "remote", "remote", "game controller", "remote controller", "remote"], "difficult_direct_answer": false, "rationales": ["It is a small white box and they are looking at a screen.", "She is holding a remote for a video game.", "A child is looking at a screen and holding white controller in her hand. wii game controllers are white."], "image": "val2014/COCO_val2014_000000052936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546412, "question_id": "9on9TYgeqZXQSDizECmRgb", "question": "How many stars does this flag have in total?", "choices": ["25", "50", "60", "55"], "correct_choice_idx": 1, "direct_answers": ["fifty", "50", "50", "fifty", "50", "50", "fifty one", "50", "50", "fifty"], "difficult_direct_answer": false, "rationales": ["There are fifty stars.", "The stars represent the number of states. neither puerto rico or the united states are represented on this flag since one isn't counted as a state and the other isn't allowed voting or state status.", "The united states flag has a star representing each of its states."], "image": "train2014/COCO_train2014_000000546412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171850, "question_id": "9otpprBJiYQa3VbnYzcnjC", "question": "What relationship is held between the two in white?", "choices": ["opposite sides", "strangers", "opponents", "team mates"], "correct_choice_idx": 3, "direct_answers": ["teammates", "teammates", "team mates", "teammates", "teammates", "friends", "teammates", "friends", "team mates", "teammates"], "difficult_direct_answer": false, "rationales": ["The two are wearing same colors of uniform.", "There are four names listed on the court sign meaning they are playing doubles. since they are on the same side of the court they are playing together.", "They are playing doubles."], "image": "val2014/COCO_val2014_000000171850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371955, "question_id": "9oufSq927qhSqniDyCHRGB", "question": "The player with the bat shares the same last name as what person?", "choices": ["aaron sanchez", "ellie trout", "john goodman", "michael phelps"], "correct_choice_idx": 0, "direct_answers": ["gary sanchez", "mark", "mark", "aaron sanchez", "sanchez", "mario", "sanchez", "dirty", "mark sanchez", "alexis sanchez"], "difficult_direct_answer": false, "rationales": ["The man swinging the bat in this image's shirt reads 'sanchez'. aaron sanchez is the only name of those listed which fits.", "The man have the word sanchez which shows they share the name.", "The last name is shown on the back of the number 15 players uniform."], "image": "train2014/COCO_train2014_000000371955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295103, "question_id": "9oyhmuxa8EAtcv7tGWiquk", "question": "What breed of dog is depicted on the toy and actual dog?", "choices": ["pug", "bulldog", "beagle", "mix"], "correct_choice_idx": 1, "direct_answers": ["bulldog", "pug", "bulldog", "bulldog", "bulldog", "bulldog", "bulldog", "bulldog", "pug", "pug"], "difficult_direct_answer": false, "rationales": ["These are both bulldogs.", "The bulldog is in the bed.", "A small dog with a flat face is near a person."], "image": "train2014/COCO_train2014_000000295103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151954, "question_id": "9p8kdvLntHw8nduHpCkGLC", "question": "Why is the item in plastic?", "choices": ["mailing out", "just arrived", "keep clean", "fragile"], "correct_choice_idx": 1, "direct_answers": ["protection", "cleaning", "keep safe", "just arrived", "protection", "protection", "protection", "cheaper", "storage", "brand new"], "difficult_direct_answer": false, "rationales": ["The plastic is usually to protect items when they're being shipped or transported. the plastic still being on means that the item hasn't been unpacked yet.", "The item just got delivered.", "The item was wrapped up for delivery so they just received it."], "image": "train2014/COCO_train2014_000000151954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472828, "question_id": "9p9wkknxtBfe9248maz3nV", "question": "What types of fruits does the vendor here specialize in?", "choices": ["citrus", "tomatoes", "melons", "pome"], "correct_choice_idx": 0, "direct_answers": ["citrus", "oranges", "grapefruits", "oranges", "oranges", "oranges", "orange", "grapefruit", "oranges", "citrus"], "difficult_direct_answer": false, "rationales": ["This vendor sells grapefruits and oranges, which are this type of fruit.", "The oranges are citrus.", "The vendor is selling grapefruits, not apples, pears, melons, or tomatoes."], "image": "val2014/COCO_val2014_000000472828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415150, "question_id": "9pDeCxvGPQ8o3AsVWeao8v", "question": "According to the layout how far are they into eating?", "choices": ["haven't ordered", "almost done", "haven't started", "done"], "correct_choice_idx": 2, "direct_answers": ["main course", "haven't started", "desert", "starting", "just started", "just beginning", "dessert", "beginning", "not", "dessert"], "difficult_direct_answer": true, "rationales": ["Their plate is still clean so they haven't eaten yet.", "A table has glasses with full drinks, clean plates, and food that has not been eaten yet.", "The people haven't started eating."], "image": "train2014/COCO_train2014_000000415150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391179, "question_id": "9paVtrt8dg4HEDc8nVhqLH", "question": "Where was tennis invented?", "choices": ["france", "italy", "england", "venice"], "correct_choice_idx": 0, "direct_answers": ["france", "united kingdom", "france", "france", "france", "france", "france", "nineteen hundreds", "france", "france"], "difficult_direct_answer": false, "rationales": ["A brief search online makes this clear.", "The answer is a based on a google search.", "France is known for being the inventor of tennis."], "image": "val2014/COCO_val2014_000000391179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474190, "question_id": "9qBrqk29yBXZghHqpDFTfj", "question": "What relation is the child in pink to the woman in white?", "choices": ["aunt", "student", "co-star", "daughter"], "correct_choice_idx": 3, "direct_answers": ["child", "child", "mother", "daughter", "daughter", "family", "daughter", "daughter", "girl", "child"], "difficult_direct_answer": false, "rationales": ["The woman is the girl's mom.", "The relation is a daughter.", "She is her daughter and her mom is helping her out."], "image": "train2014/COCO_train2014_000000474190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424110, "question_id": "9qCHNVjbutEkX9xUGEd2vz", "question": "What types of trees are these?", "choices": ["eucalyptus", "birches", "evergreens", "oaks"], "correct_choice_idx": 2, "direct_answers": ["pine", "pine trees", "evergreens", "pine trees", "pine trees", "pine trees", "pine tree", "pine", "evergreen", "pine trees"], "difficult_direct_answer": false, "rationales": ["People are skiing on a snowy mountain with green trees. evergreens are green in the winter.", "They are the only trees that stay green in the winter.", "The trees are evergreens."], "image": "train2014/COCO_train2014_000000424110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505709, "question_id": "9qSapxZU6QDhYTXTJxmyup", "question": "What are wineglasses typically made of?", "choices": ["metal", "glass", "silvered glass", "plastic"], "correct_choice_idx": 1, "direct_answers": ["glass", "glass", "glass", "glass", "glass", "glass", "glass", "glass", "crystal", "crystal"], "difficult_direct_answer": false, "rationales": ["They are made of glass.", "Glasses with stems are on a table and they are opaque.", "Answer a is the most common material for wine glasses to be made of."], "image": "val2014/COCO_val2014_000000505709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431570, "question_id": "9qSgvj7rLTTiFk7ihs3BmC", "question": "Why is the racer wearing blue outfit?", "choices": ["camouflage", "match motorcycle", "fashion", "dress code"], "correct_choice_idx": 1, "direct_answers": ["matches motorcycle", "uniform", "advertisements", "protection", "uniform", "racing outfit", "safety", "uniform", "brand color", "match motorcycle"], "difficult_direct_answer": false, "rationales": ["The motor bike he's riding is blue", "The racer is wearing a blue outfit to race on the street.", "This one is obviously made to match. it could be argued that it's also b to meet whatever standards set by a patron for the racer, if applicable."], "image": "val2014/COCO_val2014_000000431570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460286, "question_id": "9qnuksbu2QzYXbueW3oqrn", "question": "What may blend in on the bed and be tough to find?", "choices": ["bed frame", "stars", "pillow", "remote controller"], "correct_choice_idx": 3, "direct_answers": ["remote", "cat", "remote controller", "cat", "remote", "remote", "remote", "black cat", "remote", "pillow"], "difficult_direct_answer": false, "rationales": ["The remote might blend with the bed depending on the color.", "It is small and a similar color to the blanket and pillows.", "Electronic devices are usually dark in color. the bed is dark."], "image": "val2014/COCO_val2014_000000460286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500396, "question_id": "9r8V3u99MKfu25pp9WRQgY", "question": "The train is currently carrying cargo during which season?", "choices": ["fall", "winter", "summer", "spring"], "correct_choice_idx": 3, "direct_answers": ["spring", "spring", "summer", "summer", "summer", "fall", "summer", "spring", "spring", "summer"], "difficult_direct_answer": false, "rationales": ["The trees are a bright green color.", "The train is carrying cargo in the spring.", "A train is on tracks in front of trees with green leaves. leaves are green in the spring."], "image": "train2014/COCO_train2014_000000500396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346266, "question_id": "9r99qVzeYS8o8QSK8AMvSx", "question": "What happened to the tide that allowed the yellow boat to list so?", "choices": ["tsunami", "rose", "stayed same", "went out"], "correct_choice_idx": 3, "direct_answers": ["came in", "low tide", "lowered", "wind", "calmed down", "low tide", "went out", "receded", "went out", "low tide"], "difficult_direct_answer": false, "rationales": ["The water has gone down and you can see how low it is next to the dock", "The tide left the area, as it does.", "Tide going out causes water at the shore to retreat and get more shallow. if a boat is parked in shallow water it may tip to the side."], "image": "train2014/COCO_train2014_000000346266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273159, "question_id": "9rDsRG2zY5omMxj7XWTibW", "question": "Where can you most likely catch a ride nearby?", "choices": ["ski lift", "boat", "elevator", "ferris wheel"], "correct_choice_idx": 0, "direct_answers": ["ski lift", "ski lift", "ski lift", "base hill", "ski lift", "ski lift", "base hill", "ski lift", "ski lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["This is a ski resort and a ski lift is a typical type of transportation at a ski resort.", "The people are skiing on a snowy mountain and use the ski lift to get aroujnd.", "The transportation method is visible in the image and would be consistent with the activity on display here."], "image": "train2014/COCO_train2014_000000273159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352427, "question_id": "9rHpcZR85VCQqwBWCiWZCq", "question": "Which motion is the man in green carrying out?", "choices": ["throwing", "catching", "dancing", "sitting"], "correct_choice_idx": 1, "direct_answers": ["catching", "catching", "catching", "catching frisbee", "catching frisbee", "catching", "catching frisbee", "catch", "catching", "catching"], "difficult_direct_answer": false, "rationales": ["This man's hands and posture suggest he's trying to catch the frisbee flying towards him.", "A person is crouched down with hands outstretched and a frisbee approaching.", "The man is playing frisbee and is reaching out to catch one that is flying towards him."], "image": "train2014/COCO_train2014_000000352427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10414, "question_id": "9rMYQY8FN5uwUw7ExKXsRE", "question": "How many bowls are containing food on top of the table?", "choices": ["six", "one", "five", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["All of the bowls on the table contain food.", "There are four bowls with food in them present in this image.", "There are four bowls filled with food."], "image": "train2014/COCO_train2014_000000010414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319669, "question_id": "9ra8GP5z5w3fn9svN9RQ3Y", "question": "In what setting is the man probably unveiling the giant donut?", "choices": ["home", "restaurant", "office", "bakery"], "correct_choice_idx": 2, "direct_answers": ["office", "birthday", "office", "office", "office", "birthday", "office", "office", "office", "birthday"], "difficult_direct_answer": false, "rationales": ["The cubicles, desks and fluorescent lighting in this scene tell us it's a professional working environment.", "There are many desks, so this appears to be an office.", "There are cubicles with computers and desks behind the man. the sign indicates that this area is occupied by employees of livejournal."], "image": "train2014/COCO_train2014_000000319669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543895, "question_id": "9rjuKWP95pEPmLnn9YNVyy", "question": "What do they sell in the building with the Swan sign?", "choices": ["shoes", "crystals", "cigarettes", "chocolates"], "correct_choice_idx": 1, "direct_answers": ["crystals", "jewelry", "crystal", "jewelry", "jewelry", "jewelry", "crystal", "jewelry", "jewelry", "clothes"], "difficult_direct_answer": false, "rationales": ["This is a well known brand that sells these items", "This company traditionally sells jewelry and other crystal items.", "Swarovski is known for its crystals."], "image": "val2014/COCO_val2014_000000543895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411975, "question_id": "9sWC5LbvzxHFHHvKXdez7J", "question": "How will they feel when they stand upright?", "choices": ["thirsty", "hungry", "scared", "dizzy"], "correct_choice_idx": 3, "direct_answers": ["sore", "dizzy", "dizzy", "sore", "dizzy", "dizzy", "dizzy", "dizzy", "dizzy", "sore"], "difficult_direct_answer": false, "rationales": ["Once they stand from playing dizzy bat they'll see stars.", "The people are using the bats to spin very fast to make themselves dizzy on purpose.", "The people are likely spinning based on the orientation of their bodies and the way they are holding the bats. this would result in a feeling of answer a."], "image": "train2014/COCO_train2014_000000411975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143306, "question_id": "9saW6YxQyDWdhDJ4jhLSvj", "question": "What are skis made of?", "choices": ["wood", "aluminum", "steel", "iron"], "correct_choice_idx": 1, "direct_answers": ["wood", "fiberglass", "fiberglass", "wood", "metal", "metal", "fiber core", "wood", "aluminum", "wood"], "difficult_direct_answer": false, "rationales": ["It is long lasting and flexible and skis have been used for a long time", "The answer is found on google.", "Skis are made out of a lightweight metal."], "image": "train2014/COCO_train2014_000000143306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50178, "question_id": "9tHwXpXYQLr36AZtAiKZxu", "question": "What is the upright blue bin intended for?", "choices": ["hazardous waste", "compost", "garbage", "recycling"], "correct_choice_idx": 3, "direct_answers": ["recycling", "trash", "recycling", "trash", "garbage", "recycling", "trash", "garbage", "recycling", "recycling"], "difficult_direct_answer": false, "rationales": ["The symbol synonymous with recycling is visible on the bin.", "It has the familiar symbol with three arrows which differentiates it from a trash receptacle.", "It is there to put recycling in."], "image": "train2014/COCO_train2014_000000050178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150703, "question_id": "9tQY4kJqktTSQnLFX4VE8Q", "question": "What celebrity has a first name that is the same name as the red item in this tomato free sandwich?", "choices": ["pepper keenan", "watermelon o'gallagher", "cherry smith", "apple martin"], "correct_choice_idx": 0, "direct_answers": ["belle", "pepper adams", "jane levy", "pepper potts", "pepper keenan", "tom cruise", "pepper adams", "sargeant pepper", "juice", "sargeant pepper"], "difficult_direct_answer": false, "rationales": ["The red item is a pepper and answer a is a person whose first name is also pepper.", "Pepper's are among the food seen here and pepper keenan is the only name which matches.", "A red pepper is on a sandwich."], "image": "val2014/COCO_val2014_000000150703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229976, "question_id": "9tXjWRGuMY8JoSWKRFFxBN", "question": "What are the little round vegetables called?", "choices": ["brussels sprouts", "tomatoes", "turnip", "onions"], "correct_choice_idx": 3, "direct_answers": ["onions", "onions", "pearl onions", "chestnuts", "mushrooms", "onion", "onions", "onions", "onions", "onion"], "difficult_direct_answer": false, "rationales": ["It is easy to tell by the picture these are caramelized onions.", "The vegetables are the cooked version of a while root vegetable.", "The onions have a rounded shape."], "image": "train2014/COCO_train2014_000000229976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291066, "question_id": "9tZ6mRMH44WgNMjFr8oMXM", "question": "What is the man doing to the pole?", "choices": ["riding it", "repairing it", "demolishing it", "painting it"], "correct_choice_idx": 1, "direct_answers": ["repairing light", "examining repairing", "fixing it", "repairing it", "installing", "climbing", "fixing", "repairing it", "fixing light", "repairing"], "difficult_direct_answer": true, "rationales": ["The traffic light that is hanging down.", "He is fixing the light.", "The man is wearing a safety vest. he has disassembled the traffic light."], "image": "train2014/COCO_train2014_000000291066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466484, "question_id": "9tbES8oejSmLpKSrqJAJGv", "question": "Which object on one of the boats would help someone prepare for rain?", "choices": ["bucket", "tarp", "sail", "motor"], "correct_choice_idx": 1, "direct_answers": ["tarp", "tent", "boat", "tent", "tarp", "tarp", "tarp", "tarp", "tarp", "tarp"], "difficult_direct_answer": false, "rationales": ["Some of the boats have leaks and are covered.", "There is tarp over the boats.", "A blue object that can be layed across can keep things dry."], "image": "train2014/COCO_train2014_000000466484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544850, "question_id": "9tjhzemZdXQrfWnjJ3ao4P", "question": "What is on the wall?", "choices": ["bat", "hanger", "candle", "painting"], "correct_choice_idx": 3, "direct_answers": ["painting", "picture", "picture", "art", "picture", "art", "portrait", "painting", "photograph", "photos"], "difficult_direct_answer": false, "rationales": ["There is a picture on the wall hanging as art.", "A painting is placed on the back of the wall.", "A picture is on the wall."], "image": "train2014/COCO_train2014_000000544850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143569, "question_id": "9trN39P6MaBZxLugDnmbuc", "question": "Which national museum is in the vicinity?", "choices": ["bar", "temple", "print", "grafton"], "correct_choice_idx": 2, "direct_answers": ["temple bar", "print", "medal", "national point", "british", "enfo", "print", "print", "natural science", "mission print"], "difficult_direct_answer": false, "rationales": ["A sign for a bar is on a street corner in a busy area that looks historical.", "The lowest brown sign shows where this museum is located.", "As per the right lower sign."], "image": "train2014/COCO_train2014_000000143569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303652, "question_id": "9ttKzJHGPGBcfMmqCQoCvB", "question": "What will the person eating this enjoy as dessert?", "choices": ["pie", "cake", "ice cream", "donut"], "correct_choice_idx": 1, "direct_answers": ["cake", "cake", "cake", "cake", "cake", "cake", "cake", "cake", "cake", "cake"], "difficult_direct_answer": false, "rationales": ["Cake is usually served as a dessert and there is some on a plate.", "A dinner is on a plate. cake is a common dessert.", "A slice of the dessert topped with frosting is sitting on a plate behind the meal."], "image": "val2014/COCO_val2014_000000303652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143383, "question_id": "9u69LdQ4xhqVsnDHrMhPR3", "question": "What stuff in the photo is edible?", "choices": ["white pearl", "muffin", "flower", "cherry"], "correct_choice_idx": 3, "direct_answers": ["candy", "cherry", "desserts", "cherries", "food", "cake", "cherries", "icecream", "cherry", "cupcakes"], "difficult_direct_answer": false, "rationales": ["The cherries look to be real.", "The muffins are part of the costume. the flower and white pearls are decorations.", "The item is decorated with cherries which are the only edible item on it."], "image": "train2014/COCO_train2014_000000143383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281818, "question_id": "9uKWVwLGL26VL5mVvAWnBw", "question": "What causes the red vehicle to move?", "choices": ["coal", "gas", "steam", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "electric wires", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electric wires", "electricity"], "difficult_direct_answer": false, "rationales": ["There are power lines running to the train.", "The wires above the train is how the train gets the power to move.", "The red vehicle is moved by electricity."], "image": "train2014/COCO_train2014_000000281818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118485, "question_id": "9uMBhi3kiq2Atr8eSzVUUE", "question": "What fruit is in the bright blue bag?", "choices": ["kiwis", "bananas", "jackfruit", "plantains"], "correct_choice_idx": 1, "direct_answers": ["bananas", "bananas", "pineapple", "banana", "bananas", "coconut", "orange", "banana", "bananas", "bananas"], "difficult_direct_answer": false, "rationales": ["There are bananas.", "The bananas ripen faster in the bag.", "Bunches of yellow fruit are inside a blue bag near banana trees."], "image": "train2014/COCO_train2014_000000118485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2658, "question_id": "9uMKRug7ziB8W2WEswW9xw", "question": "What do these players try to hit with a ball?", "choices": ["bat", "person", "net", "goal"], "correct_choice_idx": 1, "direct_answers": ["hand", "each other", "bodies", "person", "each other", "pins", "people", "other team", "hands", "person"], "difficult_direct_answer": false, "rationales": ["They are playing dodge ball and when you get hit you get out.", "It looks like a game called dodge ball.", "There are many students out on the court with their teacher. they are trying to avoid getting hit by a ball."], "image": "train2014/COCO_train2014_000000002658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381607, "question_id": "9uNtMHqUCLrkjFrzFHsL9v", "question": "What's the name of the wooden structure the woman is sitting on?", "choices": ["sofa", "bar", "chair", "bench"], "correct_choice_idx": 3, "direct_answers": ["bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["The structure is clearly visible and based on the structure, material, the way it is being utilized and the location, answer a is accurate.", "The name is a bench.", "It's big enough for more than one person."], "image": "val2014/COCO_val2014_000000381607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578070, "question_id": "9uPHStsu5ozkZggXDKePkT", "question": "What type of sport is being played on the truck?", "choices": ["lacrosse", "basketball", "tennis", "hockey"], "correct_choice_idx": 1, "direct_answers": ["basketball", "basketball", "basket ball", "basketball", "basketball", "basketball", "basketball", "baseball", "basket ball", "basketball"], "difficult_direct_answer": false, "rationales": ["There is a net. they are not using sticks or racquets.", "The truck has basketball hoops on the side for people to play basketball.", "There is a basketball hoop on the side of the truck for people to play basketball with."], "image": "train2014/COCO_train2014_000000578070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538330, "question_id": "9uUcJiAYMwu5eN5NdHUkhP", "question": "What type of drink is in the cup?", "choices": ["water", "none", "blended juice", "beer"], "correct_choice_idx": 2, "direct_answers": ["juice", "jucie", "soda", "juice", "blended juice", "red beer", "soda", "wine", "soda", "watermelon juice"], "difficult_direct_answer": false, "rationales": ["Of the answers provided, only answer a matches the color visible in the cup.", "The cup contains a beverage that is frothy, contains ice cubes, and is a red, somewhat pinkish color.", "It is some kind of juice that has a cherry in it."], "image": "val2014/COCO_val2014_000000538330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143569, "question_id": "9uVrV3rnWMNvNtTmP3E9p8", "question": "Which direction is Henry Street?", "choices": ["right", "down", "left", "up"], "correct_choice_idx": 0, "direct_answers": ["right", "east", "right", "right", "right", "right", "temple bar", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["The sign for henry street has an arrow that indicates its direction.", "The direction is the right.", "The direction sign is pointing on the right side of the image."], "image": "train2014/COCO_train2014_000000143569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247909, "question_id": "9uX9WUQYtUGK8ZRDrpfLkS", "question": "Why are the stuffed animals on display?", "choices": ["as trophies", "to appreciate", "to sell", "as art"], "correct_choice_idx": 2, "direct_answers": ["for sale", "for sale", "for sale", "bears", "for sale", "bears", "for sale", "to sell", "bears", "for sale"], "difficult_direct_answer": false, "rationales": ["They are there so customers can buy them.", "A colorful menagerie of stuffed animals is a great way to attract attention and boost your sales!.", "The animals have price tags on them showing they are merchandise in a store."], "image": "train2014/COCO_train2014_000000247909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339639, "question_id": "9uiGxJbJR3xfqJVDsyshGf", "question": "What are the people watching here?", "choices": ["musical", "movie", "dance performance", "sport game"], "correct_choice_idx": 3, "direct_answers": ["unknown", "sports event", "baseball", "sport game", "baseball", "baseball game", "event", "match", "game", "games"], "difficult_direct_answer": true, "rationales": ["They are watching a sporting event.", "The audience are enjoying a game.", "A large crowd is in an arena and someone is eating a hot dog. sporting events are held in arenas and hot dogs are often served."], "image": "train2014/COCO_train2014_000000339639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348047, "question_id": "9uosvyCiz6U4xgRKzqd5iz", "question": "What could the condition of the terrain be described as?", "choices": ["shiny", "ridged", "groomed", "straight"], "correct_choice_idx": 2, "direct_answers": ["groomed", "sleepy", "wavy", "snow", "icy", "snowy", "groomed", "smooth", "snowy", "snowy"], "difficult_direct_answer": false, "rationales": ["The snow on the ground is perfectly tended to for uniformity.", "The ground looks like it's comb.", "The neat even arrangement of lines in the snow tell us it's being actively maintained."], "image": "train2014/COCO_train2014_000000348047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493376, "question_id": "9up9KQ5cEF9qvXJmc4UrWU", "question": "Why is the area in the room roped off?", "choices": ["historical significance", "wet paint", "construction", "crime scene"], "correct_choice_idx": 0, "direct_answers": ["historical significance", "display", "display", "off limits", "museum", "museum exhibit", "restricted", "prevent damage", "exclude visitors", "museum"], "difficult_direct_answer": false, "rationales": ["This area of a museum or historical home has very old artifacts in it, and any tampering by humans could potentially damage them. since many tourists ignore \"do not touch\" signs, ropes are employed to protect the valuable pieces.", "This is in a museum of an old building", "The objects are old and may be damaged if they are touched frequently by visitors."], "image": "val2014/COCO_val2014_000000493376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491400, "question_id": "9uueYdUhHDXf9kn7QuGPWr", "question": "What is the weather like outside here?", "choices": ["sleeting", "hot", "cold", "rainy"], "correct_choice_idx": 1, "direct_answers": ["hot", "sunny", "sunny", "sunny", "sunny", "hot sunny", "sunny", "sunny", "sunny", "hot sunny"], "difficult_direct_answer": false, "rationales": ["The weather is hot.", "Due to the sunny weather on display in this image we conclude the man's umbrella is protecting him from a hot sun rather than rain.", "This man is wearing warm weather attire while shading himself from the sun with an umbrella outside while resting on a bench."], "image": "train2014/COCO_train2014_000000491400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493941, "question_id": "9uvKUHBedLHKM84YWzthEk", "question": "What type of seafood is being served?", "choices": ["scallops", "shrimp", "crab", "fish"], "correct_choice_idx": 0, "direct_answers": ["scallops", "scallops", "scallops", "scallops", "unknown", "scallops", "scallops", "scallops", "unknown", "scallops"], "difficult_direct_answer": false, "rationales": ["The seafood is small round pieces that don't have a tail.", "Scallops are obtained from two shell muscles.", "The plate has cooked scallops on it with vegetables and noodles."], "image": "val2014/COCO_val2014_000000493941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456979, "question_id": "9v2pyeXKigHcjZd3UpMibG", "question": "What is a male of the larger animals called?", "choices": ["drake", "bull", "dog", "ram"], "correct_choice_idx": 1, "direct_answers": ["bull", "bull", "bull", "ox", "cow", "bull", "bull", "bull", "bull", "bull"], "difficult_direct_answer": false, "rationales": ["The male is a bull.", "The animals here are cows.", "The male has horns that are the shown on the animals. the females do not have horns."], "image": "train2014/COCO_train2014_000000456979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462466, "question_id": "9vXLxXuzosK6EDHqq9a6U9", "question": "What is the woman lighting?", "choices": ["candelabra", "letter", "computer", "birthday candle"], "correct_choice_idx": 3, "direct_answers": ["candles", "birthday candle", "candles", "cake", "candles", "candles", "candles", "birthday candles", "birthday cake", "cake candles"], "difficult_direct_answer": false, "rationales": ["It's a cake for someones birthday that has candles on it.", "The woman is lighting a birthday candle.", "The woman is trying to get flames on the candles."], "image": "val2014/COCO_val2014_000000462466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287723, "question_id": "9vqfzD7XhvNnvaaJb4NP4Q", "question": "Why is the woman blowing on the mug?", "choices": ["to cool", "to inflate", "to move", "to spin"], "correct_choice_idx": 0, "direct_answers": ["hot", "to cool", "hit", "cool", "too hot", "hot", "cool down", "it's hot", "cooling contents", "hot"], "difficult_direct_answer": false, "rationales": ["A woman is holding a mug and blowing. people blow on hot things to cool them.", "The woman is cooling the mug.", "The woman trying to get her coffee to cool."], "image": "train2014/COCO_train2014_000000287723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531388, "question_id": "9wJWweujnhs79KyWNppz9i", "question": "What is the purpose of the electrical device that is turned off?", "choices": ["watch", "call", "work", "cool"], "correct_choice_idx": 0, "direct_answers": ["entertainment", "light", "entertainment", "entertainment", "entertainment", "watch", "tv", "television", "broadcasts shows", "television"], "difficult_direct_answer": false, "rationales": ["It is a tv. a person engages with a tv by doing this.", "The purpose is to watch.", "The device a television that is used to play movies and tv shows."], "image": "train2014/COCO_train2014_000000531388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72275, "question_id": "9wmfAp6cAZkmGctsWQ9TRd", "question": "How many kinds of fruit are in the bowl?", "choices": ["four", "five", "two", "three"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "4 types", "4 types", "four"], "difficult_direct_answer": false, "rationales": ["There are bananas, peaches, tangerines, and an apple in the bowl.", "There are 3 different types of round fruits and one long fruit", "There are apples, peaches, bananas, and plums."], "image": "val2014/COCO_val2014_000000072275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51605, "question_id": "9wmrn2kY9RyWzDDdHcxVeC", "question": "Why is the man aiming a glove at the ground?", "choices": ["catching ball", "swatting fly", "scooping dirt", "shaking hands"], "correct_choice_idx": 0, "direct_answers": ["catch", "catch ball", "catch ball", "catch ball", "catch ball", "catching ball", "catch ball", "hit ball", "catching", "catch ball"], "difficult_direct_answer": false, "rationales": ["The man is aiming his glove off the ground to catch the ball.", "He is ready to catch the ball if it comes to him.", "The man with the glove has his hand reached out because he is trying to catch the ball."], "image": "val2014/COCO_val2014_000000051605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286018, "question_id": "9wqxfYz9xNqigFAYYe9Fq3", "question": "What is in this swimming pool?", "choices": ["nothing", "salt water", "fresh water", "soda"], "correct_choice_idx": 0, "direct_answers": ["nothing", "water", "nothing", "its empty", "water", "nothing", "nothing", "nothing", "nothing", "no water"], "difficult_direct_answer": false, "rationales": ["The girl with the skateboard is looking into an empty pool that is used for skating.", "There is a child on a skate board on the lip of the pool ready to drop in.", "A skateboarder stands on the side of a pool with her skateboard propped on the edge. several people watch from all around her."], "image": "val2014/COCO_val2014_000000286018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180480, "question_id": "9x6CSCEEH6fxGxGztxFrj7", "question": "Which direction will the skateboarder next go?", "choices": ["downward", "upwards", "floor", "backwards"], "correct_choice_idx": 0, "direct_answers": ["right", "forward", "down", "down", "forward", "right", "over blocks", "right", "downward", "forward"], "difficult_direct_answer": false, "rationales": ["The skateboarder will fall downward.", "The skateboarder is midair.", "Gravity will pull him down to the ground"], "image": "train2014/COCO_train2014_000000180480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291767, "question_id": "9xZ6LCmWWkRpq6ryEhwQ6n", "question": "What is the sports equipment shown called?", "choices": ["catamarans", "snowboards", "surfboards", "skimmers"], "correct_choice_idx": 2, "direct_answers": ["surfboards", "surfboard", "surfboard", "surfboards", "surfboard", "surfboard", "surfboards", "surfboards", "surfing", "surfboard"], "difficult_direct_answer": false, "rationales": ["With the setting and what the boys are holding and doing you can safely assume they are holding surfboards.", "Surfboards are used on water.", "The people are at the beach and holding boards, so they are surfboards."], "image": "train2014/COCO_train2014_000000291767.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417248, "question_id": "9xdNNgYnfSkyca5QHC29d6", "question": "What is on the back of the right motorcycle?", "choices": ["dog", "luggage", "person", "tire"], "correct_choice_idx": 3, "direct_answers": ["spare tire", "spare tire", "tire", "tire", "tire", "tire", "tire", "tire", "tire", "tire"], "difficult_direct_answer": false, "rationales": ["It is plain to see the rubber tire screwed in the side car to the right.", "There is a spare tire on the sidecar.", "The back has a tire."], "image": "train2014/COCO_train2014_000000417248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29138, "question_id": "9xdRC3VrDmdEXoqPK2g9Gr", "question": "What is a country that is famously a host to this sport?", "choices": ["kenya", "australia", "switzerland", "peru"], "correct_choice_idx": 2, "direct_answers": ["switzerland", "us", "switsland", "united states", "switzerland", "switzerland", "switzerland", "austria", "italy", "greenland"], "difficult_direct_answer": false, "rationales": ["The country is switzerland.", "Many great ski competitions take place in the country that is famous for its alps.", "The sport displayed is skiing/snowboarding based on the equipment and the prevalent snow. of the answers, only answer a is known for these activities and has a climate to match."], "image": "val2014/COCO_val2014_000000029138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395749, "question_id": "9xnw4wRLqSYnd3bgbpNFnz", "question": "Why are they all on the bench?", "choices": ["friends", "closest shore", "own it", "only bench"], "correct_choice_idx": 0, "direct_answers": ["sitting", "talking", "theyre sitting", "talking", "friends", "watching ocean", "setting", "they're resting", "watching lake", "observing"], "difficult_direct_answer": true, "rationales": ["The ocean is a nice place to sit and contemplate life.", "Just a bunch of friends hanging out and enjoying the view.", "They know each other and are enjoying the waterfront"], "image": "val2014/COCO_val2014_000000395749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318533, "question_id": "9xrvRwJzUQcF8XwHDdNLGt", "question": "What is to the left side?", "choices": ["traffic light", "apple", "woman", "baby"], "correct_choice_idx": 0, "direct_answers": ["traffic light", "traffic lights", "traffic light", "trees", "two stoplights", "trees", "trees", "traffic light", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["There is a light on the street with three different lamps.", "There is a stop light.", "The left side has a traffic light."], "image": "train2014/COCO_train2014_000000318533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403473, "question_id": "9y9zZKMHgF8pDjXSQYemvs", "question": "What is the original name for the type of print that is on his bandana?", "choices": ["madras", "stripes", "denim", "kashmir"], "correct_choice_idx": 3, "direct_answers": ["paisley", "kashmir", "paisley", "paisley", "paisley", "paisley", "intermittent", "kashmir", "do rag", "bandana"], "difficult_direct_answer": false, "rationales": ["Paisley was originally called kashmir.", "The pattern is visible on the only visible bandana. a google search provided answer a.", "The man with the red ball wears a bandana with a hashmir pattern on it."], "image": "train2014/COCO_train2014_000000403473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325688, "question_id": "9yQWinxxSLghHmchRJdQZm", "question": "This animal is part of what class?", "choices": ["jellyfish", "insect", "cephalopod", "aves"], "correct_choice_idx": 3, "direct_answers": ["bird", "birds", "bird", "birds", "bird", "bird", "birds", "aves", "mammal", "aves"], "difficult_direct_answer": false, "rationales": ["That's the name of the class for birds. they're found all over the world.", "I performed an internet search on the class that birds belong to.", "They are birds. birds belong to class aves."], "image": "val2014/COCO_val2014_000000325688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140581, "question_id": "9yapjujWCSQ3jWrHrbhtX8", "question": "What is likely the most expensive vehicle to ride in?", "choices": ["sedan", "bike", "bus", "taxi"], "correct_choice_idx": 3, "direct_answers": ["bus", "bus", "taxi", "taxi", "taxi", "taxi", "cab", "taxi", "taxi", "bus"], "difficult_direct_answer": false, "rationales": ["Privacy is expensive.", "A taxi charges by the minute.", "This charges per mile increments whereas the bus charges a single fee no matter how far you go"], "image": "train2014/COCO_train2014_000000140581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32707, "question_id": "9yh2wFUqp9ocjDtT7fTVnb", "question": "What part of the world are the skiers most likely in?", "choices": ["south america", "antarctica", "colorado", "india"], "correct_choice_idx": 2, "direct_answers": ["europe", "europe", "swiss alps", "mountains", "arctic", "north", "alps", "north", "colorado", "mountainous"], "difficult_direct_answer": false, "rationales": ["The state is known for skiing and other winter sports due to how much it snows.", "Colorado has great slopes for skiing.", "The weather is cold but still hospitable to humans in colorado."], "image": "train2014/COCO_train2014_000000032707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377090, "question_id": "9yne3zsiaxZQvfhL35EMeJ", "question": "What does the tallest structure provide?", "choices": ["light", "music", "disinfection", "water"], "correct_choice_idx": 0, "direct_answers": ["lights", "time", "time", "education", "view", "time", "time", "time", "light", "courthouse"], "difficult_direct_answer": false, "rationales": ["It's a lamppost that will light up at night.", "The tallest structure here is a light post.", "A tall like pole extends above the buildings."], "image": "train2014/COCO_train2014_000000377090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324156, "question_id": "9yvUUzdgn34CdPSaEnk8fh", "question": "Why is the dog on the table making the cake unsafe for the girl?", "choices": ["adding frosting", "dog saliva", "no problem", "no silverware"], "correct_choice_idx": 1, "direct_answers": ["dog saliva", "licking cake", "germs", "spreading germs", "eating it", "eating", "licking", "licking it", "spoiled", "eating"], "difficult_direct_answer": true, "rationales": ["The animal is eating and licking the cake. it is leaving behind liquids from its mouth.", "The dog saliva will be unsafe.", "Dogs have spit in their mouths that can get someone sick."], "image": "train2014/COCO_train2014_000000324156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160740, "question_id": "9yxfecZPpGWN68iB5VgLeU", "question": "What country is the photo from?", "choices": ["china", "japan", "north korea", "south korea"], "correct_choice_idx": 1, "direct_answers": ["japan", "china", "japan", "usa", "japan", "japan", "japan", "japan", "japan", "uk"], "difficult_direct_answer": false, "rationales": ["This is a busy street in this country.", "A city can be seen with japanese writing on the signs.", "The country is japan."], "image": "val2014/COCO_val2014_000000160740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142667, "question_id": "9yyAWTNSWeso8mVj9XPDBH", "question": "What are they doing?", "choices": ["eating breakfast", "standing line", "cleaning up", "waiting buss"], "correct_choice_idx": 1, "direct_answers": ["waiting", "queueing", "buying food", "standing line", "food trucks", "waiting", "buying lunch", "ordering food", "waiting", "waiting"], "difficult_direct_answer": false, "rationales": ["A group of men are all standing on a sidewalk near food trucks. people wait in line to be served.", "The people are in line.", "They are in front of food trucks so they are probably waiting their turn."], "image": "val2014/COCO_val2014_000000142667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22215, "question_id": "9z28PfzUP9DhCSCC8YzaU8", "question": "What area is the train entering?", "choices": ["repair section", "intersection", "train station", "fuel station"], "correct_choice_idx": 2, "direct_answers": ["station", "train station", "train station", "train station", "area 25", "station", "country", "railroad yard", "train station", "station"], "difficult_direct_answer": false, "rationales": ["All the stuff around the train is part of the train station and the building behind it.", "The train is going to the station.", "Trains go to train stations to pick up people so it is likely going to a train station."], "image": "val2014/COCO_val2014_000000022215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313480, "question_id": "9z37ZgQ8zYWDPhdKXEUNDH", "question": "What is the status of the boat?", "choices": ["going backward", "going forward", "turning", "stopped"], "correct_choice_idx": 3, "direct_answers": ["stopped", "floating", "full", "full", "rowing slowly", "stopped", "idle", "full", "floating", "floating"], "difficult_direct_answer": false, "rationales": ["The water around the paddles in this image is mostly still and all passengers appear to be listening to a guide speaking in front. we can assume the boat is not moving.", "A large group of people are in a boat and the person paddling is not holding the oars.", "The status is stopped."], "image": "train2014/COCO_train2014_000000313480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449504, "question_id": "9z3hYbG9gWPNBKFRHucMUM", "question": "Why is there water in the glass containers?", "choices": ["grow flower", "to emergencies", "to drink", "for fish"], "correct_choice_idx": 0, "direct_answers": ["flowers", "feed flowers", "flowers", "feeding flowers", "flowers", "preserve flowers", "grow flower", "for fish", "flowers", "life"], "difficult_direct_answer": false, "rationales": ["The water is to help keep the flowers alive longer.", "The containers have water in them to keep the flowers alive.", "The flowers need the water or they'll wilt."], "image": "train2014/COCO_train2014_000000449504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131825, "question_id": "9z66v2QrHt5LHNLU8cdY6z", "question": "What is he looking at?", "choices": ["his hands", "laptop", "head x-ray", "tent flaps"], "correct_choice_idx": 2, "direct_answers": ["xray", "laptop screen", "screen", "head x-ray", "laptop", "computer screen", "computer", "screen", "laptop screen", "computer"], "difficult_direct_answer": false, "rationales": ["The xray includes a skull xray of a head.", "The solider is looking at a picture of a head in an x-ray photo machine.", "The image on the screen is a round shape with the bottom which is possible the neck."], "image": "train2014/COCO_train2014_000000131825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102609, "question_id": "9z9h3YDtZ37Fqcf6fuLYYF", "question": "What is the piece of equipment on the left side of the desk used for?", "choices": ["faxing", "hard drive", "printing", "copier"], "correct_choice_idx": 2, "direct_answers": ["printer", "printer", "printing", "printing", "printing", "printing", "makes copies", "printing documents", "printing", "printing"], "difficult_direct_answer": false, "rationales": ["It's used to print documents from the computer", "It is used to print documents.", "The computer has a printer to the left."], "image": "val2014/COCO_val2014_000000102609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99753, "question_id": "9zVqgHigV8VZ4hqkVaqYdM", "question": "What type shop is this?", "choices": ["bakery", "ice cream", "pizzeria", "soda"], "correct_choice_idx": 2, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza place", "pizzeria", "pizza shop", "pizza parlor", "pizzaria", "pizza shop"], "difficult_direct_answer": false, "rationales": ["Pizzas are being cooked in the restaurant.", "The shop has pizza.", "The plates have fresh baked pizzas on them."], "image": "train2014/COCO_train2014_000000099753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349530, "question_id": "9zioqQJwPBDGC7GzEYw9gm", "question": "In what capacity is the person pulling the horse likely acting?", "choices": ["visitor", "owner", "worker", "rider"], "correct_choice_idx": 2, "direct_answers": ["training", "little", "racing", "trainer", "training", "forward", "worker", "trainer", "trainer", "calm"], "difficult_direct_answer": false, "rationales": ["The person is wearing a uniform and looks professional, while displaying the horse.", "The person with the leash on the horse is a staff member.", "She is in uniform so she likely works there."], "image": "train2014/COCO_train2014_000000349530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476160, "question_id": "9zuLCTSbtemC3xTkYbitas", "question": "What do the white markings on the road allow for here?", "choices": ["crossing street", "turning left", "speeding up", "turning right"], "correct_choice_idx": 0, "direct_answers": ["pedestrians walking", "crossing street", "people crossing", "crossing", "walk way", "crossing street", "crossing", "pedestrian crossing", "pedestrians", "crossing"], "difficult_direct_answer": false, "rationales": ["They denote a place for pedestrians to get across safely.", "People are supposed to cross here.", "These types of markings are usually placed to let a pedestrian know where to walk in a forward direction."], "image": "train2014/COCO_train2014_000000476160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183676, "question_id": "9zuyP6WoUMMTdWyVd5bvtE", "question": "On what continent was the surfboard made?", "choices": ["africa", "asia", "europe", "north america"], "correct_choice_idx": 3, "direct_answers": ["north america", "north america", "north america", "north america", "europe", "france", "north america", "us", "north america", "north america"], "difficult_direct_answer": false, "rationales": ["The label on the board says \"all american\".", "This brand is made in the states.", "These were invented a very long time ago in hawaii"], "image": "val2014/COCO_val2014_000000183676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318820, "question_id": "9zvBP5xHAn5kmt7esqc6Ly", "question": "In which section of the supermarket is this man standing?", "choices": ["bakery", "produce", "checkout", "meat"], "correct_choice_idx": 1, "direct_answers": ["produce", "produce", "produce", "produce", "produce", "produce", "produce", "produce", "produce", "produce"], "difficult_direct_answer": false, "rationales": ["He is holding fresh fruit which is sold in this department.", "There are fruits and vegetables in the aisle.", "There are rows of vegetables behind him."], "image": "train2014/COCO_train2014_000000318820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331648, "question_id": "A28y7NLq9m5rV7qgeTURBq", "question": "Why is she holding an umbrella in dry weather?", "choices": ["sun protection", "is hiding", "confused", "likes umbrellas"], "correct_choice_idx": 0, "direct_answers": ["block sun", "sun protection", "raining", "sunlight", "shade", "for shade", "shade", "sun protection", "sun", "block sun"], "difficult_direct_answer": false, "rationales": ["Since it is dry, she is using the umbrella to provide shade.", "She wants to keep the sun out.", "If an umbrella is not being used for rain, and no precipitation is visible in this image, it is likely to protect from the sun."], "image": "val2014/COCO_val2014_000000331648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485865, "question_id": "A2QSBDwkUKcbm75xyt46YV", "question": "Where are the animals?", "choices": ["cages", "indoors", "trailer", "outdoors"], "correct_choice_idx": 3, "direct_answers": ["zebras", "zoo", "safari", "zoo", "farm", "finding food", "outdoors", "grassland", "zoo", "farm"], "difficult_direct_answer": false, "rationales": ["The animals are not enclosed. they are surrounded by trees, grass, and the sky.", "The animals are near trees and grass. they are not in a building, enclosure, or vehicle.", "Zebras and horse are standing on some hay. the sun is shining onto ground."], "image": "val2014/COCO_val2014_000000485865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238911, "question_id": "A2aX9xyVMCEYNn4s3idNQc", "question": "What is the style of sandwich on the plate?", "choices": ["toasted", "wrap", "burger", "hot dog"], "correct_choice_idx": 1, "direct_answers": ["fresh", "wrap", "wrap", "wrap", "wrap", "wrap", "wrap", "wrap", "fresh", "wrap"], "difficult_direct_answer": false, "rationales": ["It is a wrapped sandwich.", "The sandwich is rolled up in a tortilla.", "It is rolled in a thinner outer layer rather than between two slices of bread."], "image": "val2014/COCO_val2014_000000238911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11661, "question_id": "A2peBmVyMhcosNEAABozEJ", "question": "The man eating the pizza has what type of skin condition on his face?", "choices": ["rosacea", "psoriasis", "eczema", "acne"], "correct_choice_idx": 3, "direct_answers": ["acne", "sensitive", "pimples", "normal", "acne", "acne", "acne", "acne", "acne", "acne"], "difficult_direct_answer": false, "rationales": ["Acne is red on the young mans face.", "He has red pimples visible on his face.", "There are red spots on his face."], "image": "train2014/COCO_train2014_000000011661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538242, "question_id": "A32UnUvKxPDfoVDWGQHomw", "question": "Where is this meeting likely taking place?", "choices": ["military base", "grocery store", "gym", "mall"], "correct_choice_idx": 0, "direct_answers": ["movie scene", "airport", "base", "airport", "military base", "airport", "military base", "military base", "airport", "military base"], "difficult_direct_answer": false, "rationales": ["Both of these men are wearing army gear.", "Several people dressed in uniforms.", "The two men shown are both wearing different versions of military uniforms."], "image": "val2014/COCO_val2014_000000538242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357870, "question_id": "A346cWmRjSBVdSz97bUvWz", "question": "What is the red button used for?", "choices": ["play music", "motivation/being upbeat", "call people", "ring doorbell"], "correct_choice_idx": 1, "direct_answers": ["panic", "easy", "laughs", "taxes", "motivation/being upbeat", "office supplies", "task completion", "humour", "novelty gag", "stress relief"], "difficult_direct_answer": true, "rationales": ["It is a dummy button used for minimal distraction", "The red button is not connected to anything. it makes noise but is not capable of playing music.", "The button says \"easy\" on it."], "image": "val2014/COCO_val2014_000000357870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519850, "question_id": "A3HkDVfPzQKUDrDNuQzBe2", "question": "What physical danger could she face if she was stuck in the cold with no winter apparel?", "choices": ["frostbite", "sunburn", "nothing", "chicken pox"], "correct_choice_idx": 0, "direct_answers": ["hypothermia", "frostbite", "frostbite", "hypothermia", "freezing", "hospitalization", "freezing", "hypothermia", "freezing", "freeze"], "difficult_direct_answer": false, "rationales": ["If the woman had no winter gear and was stuck in the cold, she could get not only frostbite, but also hypothermia. both conditions are very serious and are to be assiduously avoided.", "Being out in the cold could lead to frostbite.", "The danger is frostbite."], "image": "train2014/COCO_train2014_000000519850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367967, "question_id": "A3ZpT4MPZvZ4ZJpbyxoYch", "question": "What does the fur resemble?", "choices": ["glasses", "toque", "hat", "mug"], "correct_choice_idx": 2, "direct_answers": ["wool", "hair", "carpet", "wool", "afro", "carpet", "felt", "hat", "hair wig", "hair"], "difficult_direct_answer": false, "rationales": ["The hair is covering the sheep's head and shading its eyes.", "It appears that whoever sheared this sheep has a fine sense of humor, and the remaining fuzzy cap on this sheep's head is adorable.", "The fur on the top of the sheep's head looks like a fuzzy hat."], "image": "train2014/COCO_train2014_000000367967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519784, "question_id": "A3arzbrQBFks2fhPW3uehV", "question": "What is on the floor next to the van?", "choices": ["arrow", "footprints", "coyote", "snow"], "correct_choice_idx": 0, "direct_answers": ["lanes", "motorcycle", "tripes", "machine", "bike", "arrow", "motorcycle", "nothing", "tiles", "tiles"], "difficult_direct_answer": false, "rationales": ["It is painted onto the ground; none of the other three options are present in image.", "Just to the right of the van you can see an arrow on the ground telling the drivers what direction to drive in.", "The arrow is on the ground."], "image": "val2014/COCO_val2014_000000519784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279723, "question_id": "A3q3RgovVSJ4RkaTaG3BKe", "question": "For whom does this woman work?", "choices": ["nasa", "uber", "target", "walmart"], "correct_choice_idx": 0, "direct_answers": ["nasa", "nasa", "nasa", "nasa", "nasa", "nasa", "nasa", "nasa", "nasa", "nasa"], "difficult_direct_answer": false, "rationales": ["As a woman sits in a large conference room with lots of other people checking her phone while wearing a \"nasa\" lanyard and a huge \"nasa\" button, she totally works for nasa!.", "She is wearing the spaceship logo of this agency on her clothes.", "The logos on her shirt and lanyard indicate her employer."], "image": "val2014/COCO_val2014_000000279723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225399, "question_id": "A3vgN9xvbmB8zKmw3c4XVj", "question": "In which city of the United states consist of this monument?", "choices": ["washington", "chicago", "new york", "miami"], "correct_choice_idx": 2, "direct_answers": ["washington dc", "new york", "new york", "washington dc", "new york", "new york", "new york", "new york", "new york", "washington dc"], "difficult_direct_answer": false, "rationales": ["The william tecumseh sherman monument can be found in manhattan.", "The monument exists in the city of new york.", "The city is new york."], "image": "train2014/COCO_train2014_000000225399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210702, "question_id": "A4AKVoqatodweyT7xTJFGo", "question": "What animal is the picture on the truck of?", "choices": ["elephant", "bull", "condor", "giraffe"], "correct_choice_idx": 1, "direct_answers": ["bull", "bull", "bull", "bull", "bull", "bull", "bull", "dinosaur", "dinosaur", "dinosaur"], "difficult_direct_answer": false, "rationales": ["It is a large bovine with horns on the head", "The red drawing shows a draw of a bull.", "The animal has horns and is very large."], "image": "train2014/COCO_train2014_000000210702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108193, "question_id": "A4SzLYpNiqyxHjjfBPJ9XZ", "question": "What brand of Oats have they purchased?", "choices": ["kelloggs", "king arthur", "quaker", "post"], "correct_choice_idx": 2, "direct_answers": ["quaker", "quakers", "quaker", "quaker", "quaker", "quakers", "quaker", "quaker", "quaker", "quaker"], "difficult_direct_answer": false, "rationales": ["It has the picture of a man with a hat from the old days", "The container of oats has the logo for quaker on it.", "You can see the logo for the oats on the container."], "image": "val2014/COCO_val2014_000000108193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524525, "question_id": "A4edzXX46c2aWkHfbntGfQ", "question": "What type of train is this?", "choices": ["steam", "elevated", "model", "bullet"], "correct_choice_idx": 2, "direct_answers": ["model train", "electrician train", "miniature train", "modern", "model", "model train", "model", "toy", "model", "model train"], "difficult_direct_answer": false, "rationales": ["The items in the scene, including the train, are scaled down versions of real life objects.", "This is a small toy train", "This is a toy-sized model train."], "image": "train2014/COCO_train2014_000000524525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417980, "question_id": "A4jv8RVwPJVSNgPNHX7Szm", "question": "What type of hat is the man in the tank top wearing?", "choices": ["top hat", "baseball cap", "beanie", "fedora"], "correct_choice_idx": 0, "direct_answers": ["derby", "top hat", "top hat", "top hat", "fedora", "top hat", "bowler", "top hat", "top hat", "top hat"], "difficult_direct_answer": false, "rationales": ["A man in a tank top is wearing a hat with a tall center. top hats are tall.", "The man is wearing a tank top and top hat.", "The black brimmed hat with tall cylindrical top the man in the middle of this image wears is called a top hat."], "image": "val2014/COCO_val2014_000000417980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467475, "question_id": "A4zDGtjtevDeXWSeFEsoqF", "question": "What is winning the race so far?", "choices": ["kite", "plane", "birds", "balloon"], "correct_choice_idx": 1, "direct_answers": ["plane", "plane", "airplane", "plane", "airplane", "airplane", "plane", "airplane", "airplane", "plane"], "difficult_direct_answer": false, "rationales": ["The airplane look far ahead of the flock.", "The plane is winning.", "The plane is in front of the birds."], "image": "val2014/COCO_val2014_000000467475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167574, "question_id": "A57PB869VhMNLLs2AuWF7K", "question": "Who is the batter?", "choices": ["jerry lynn", "charlie culberson", "tom brady", "amos otis"], "correct_choice_idx": 1, "direct_answers": ["culberson", "charlie culberson", "professional player", "culberson", "professional player", "culberson", "culberson", "culberson", "culberson", "culberson"], "difficult_direct_answer": false, "rationales": ["The batter's uniform bears the name culberson. charlie culberson is the only name which fits.", "The name above the number 23 on the back of the player's uniform is culberson so the player must indeed be charlie culberson.", "His name is above the number 23 on the back of his shirt."], "image": "train2014/COCO_train2014_000000167574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560787, "question_id": "A5AQoEFrarCc3JUXP43Z4L", "question": "What is the name of this game?", "choices": ["baseball", "cricket", "foot ball", "hockey"], "correct_choice_idx": 2, "direct_answers": ["soccer", "foot ball", "soccer", "soccer/football", "football", "soccer", "soccer", "soccer", "soccer", "football"], "difficult_direct_answer": false, "rationales": ["Kids are standing on a soccer field in front of a net. soccer is referred to as football.", "The name is football.", "They are playing a sport that is also known as soccer."], "image": "train2014/COCO_train2014_000000560787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120185, "question_id": "A5BQTanCd9Rh7hkVzp3dgZ", "question": "Why is the man holding onto the pole?", "choices": ["to jump", "to twist", "to turn", "stability"], "correct_choice_idx": 3, "direct_answers": ["water skiing", "support steering", "balance", "pole", "stability", "achieve stability", "water skiing", "skiing", "balance", "pulling himself"], "difficult_direct_answer": false, "rationales": ["A person is on skis on the water and is holding a bar connected to the boat.", "A man is on skis in the water holding a large pole connected to the boat.", "The man is water boarding, which requires a lot of balance and the poles aid with balance."], "image": "train2014/COCO_train2014_000000120185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170683, "question_id": "A5StUtbJAZhrbsBFEeGMLG", "question": "How might cars cross the water seen here?", "choices": ["jump over", "ferry boat", "bridge", "driving"], "correct_choice_idx": 1, "direct_answers": ["ferry", "ferry", "ferry", "ferry", "ferry boat", "ferry boat", "ferry", "on ferry", "ferry", "ferry"], "difficult_direct_answer": false, "rationales": ["Boats are parked at a marina.", "A ferry is large enough to transport cars and can float on water. there is no bridge in sight.", "The yellow sign indicates the vehicle that carries the cars across the water."], "image": "train2014/COCO_train2014_000000170683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73019, "question_id": "A5TAJicptU63X6EUVbWZdw", "question": "Which of these foods fall out of the cruciferous food group category?", "choices": ["kale", "cabbage", "broccoli", "cucumber"], "correct_choice_idx": 3, "direct_answers": ["broccoli", "broccoli", "zucchini", "broccoli", "broccoli", "cucumber", "cucumber", "broccoli", "broccoli", "cucumber"], "difficult_direct_answer": false, "rationales": ["Cucumbers and squash are not considered part of the cruciferous food group like broccoli, kale and cabbage.", "A cucumber would not be in this category", "Broccoli, kale and cabbage all belong to the cruciferous food category."], "image": "train2014/COCO_train2014_000000073019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236282, "question_id": "A5k6sJ6qiPALA6DZfkTMZ4", "question": "What are the women in the lower right doing?", "choices": ["texting", "petting", "eating", "photographing"], "correct_choice_idx": 3, "direct_answers": ["picture taking", "standing", "looking", "pictures", "taking photo", "taking photographs", "photographing", "talking", "watching", "taking picture"], "difficult_direct_answer": true, "rationales": ["The women have cameras and they are looking at the animals.", "One woman is snapping a picture and another is holding a camera.", "The person uses a camera to take pictures."], "image": "train2014/COCO_train2014_000000236282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470350, "question_id": "A6c2bgo26Xs5zb4ByxqfwZ", "question": "What is this player getting ready to do?", "choices": ["dribble", "serve", "quit", "return service"], "correct_choice_idx": 1, "direct_answers": ["serve", "serve ball", "serve", "serve ball", "serve", "serve", "tennis serve", "play", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The player is holding the ball and is about to serve it and start the match.", "The player is getting ready to serve the ball.", "The player wants to serve."], "image": "val2014/COCO_val2014_000000470350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128127, "question_id": "A6d8xx8UuZvz9jneWUd8v9", "question": "What liquid do you put in the things that are hanging upside down?", "choices": ["oil", "water", "honey", "ink"], "correct_choice_idx": 1, "direct_answers": ["wine", "wine", "wine", "wine", "wine", "water", "wine", "wine", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["These are classic basic wine glasses", "You put water in the glasses.", "There are wine glasses hanging upside down."], "image": "train2014/COCO_train2014_000000128127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560347, "question_id": "A6foxzSRvc8gpRMDcV4Yx4", "question": "What type of transportation are they using?", "choices": ["air", "car", "water", "rail"], "correct_choice_idx": 3, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "subway", "rail", "subway"], "difficult_direct_answer": false, "rationales": ["The people are riding on a train.", "The type is rail transport.", "The people are preparing to board a train."], "image": "train2014/COCO_train2014_000000560347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260603, "question_id": "A6gVW4Z2fNv4mdSe29G63R", "question": "What style of sunglasses are on the man's face?", "choices": ["cats eye", "shield", "aviator", "wraparound"], "correct_choice_idx": 2, "direct_answers": ["aviator", "aviators", "aviator", "aviator", "aviator", "aviators", "aviator", "aviator", "aviators", "aviator"], "difficult_direct_answer": false, "rationales": ["These are glasses that were preferred by pilots.", "The shape and the wire rim indicates this type", "A man is wearing sunglasses with full wire rims. aviator sunglasses have wire rims."], "image": "train2014/COCO_train2014_000000260603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160064, "question_id": "A75g5VrtUdnQjoJ5JnwryF", "question": "What type of area is this?", "choices": ["residential", "commercial", "rural", "tropical"], "correct_choice_idx": 1, "direct_answers": ["commercial", "urban", "public", "city street", "commercial", "street", "busy street", "downtown park", "downtown", "park bench"], "difficult_direct_answer": true, "rationales": ["There are businesses across the street.", "There is a busy street shown with a bench on the sidewalk. this is typical for a downtown city area which is usually deemed commercial.", "All the businesses in the background would make this a commercial area."], "image": "train2014/COCO_train2014_000000160064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41461, "question_id": "A7FbxNCAD5A2vdXwFN2Y4f", "question": "What order does this animal belong to?", "choices": ["primates", "proboscidea", "rodentia", "chiroptera"], "correct_choice_idx": 1, "direct_answers": ["reservation land", "proboscidea", "proboscidea", "proboscidea", "pachyderm", "elephant", "mammal", "animal kingdom", "proboscidea", "elephant"], "difficult_direct_answer": false, "rationales": ["They are from an order of large animals that also encompasses their extinct relatives.", "The elephant is in this order.", "A person is looking at elephants grazing."], "image": "train2014/COCO_train2014_000000041461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360595, "question_id": "A7FhzdBjUdUpneazM2Khho", "question": "What type of space is this?", "choices": ["business", "public", "private", "residential"], "correct_choice_idx": 1, "direct_answers": ["park", "public", "park", "park", "skatepark", "public park", "park", "park", "public park", "public"], "difficult_direct_answer": false, "rationales": ["There are many people using the area for different activities.", "The space is open to everyone. there is no profit motive associated with it.", "The park benches and trees with open sitting area tell us this is a public space."], "image": "val2014/COCO_val2014_000000360595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580396, "question_id": "A7VkstBDcrrd4VnD7Krirr", "question": "Who played a similar sport to this woman?", "choices": ["anna kournikova", "alex morgan", "bo jackson", "lisa leslie"], "correct_choice_idx": 0, "direct_answers": ["anna kournikova", "martina", "andre agassi", "serena williams", "williams sisters", "venus williams", "serena williams", "maria sharapova", "andre agassi", "serena williams"], "difficult_direct_answer": false, "rationales": ["Kournikova played tennis.", "Anna kournikova plays tennis and a woman is shown playing tennis on a tennis court.", "Julio iglesias girlfriend played tennis."], "image": "train2014/COCO_train2014_000000580396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395013, "question_id": "A7VzkYCvWa6rsa9WqXeTiF", "question": "This flag is belongs to which country?", "choices": ["russia", "uk", "nepal", "us"], "correct_choice_idx": 3, "direct_answers": ["america", "united states", "usa", "united states", "united states", "us", "usa", "usa", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["This is obvious based on the colors, stars and stripes.", "The flag is red, white, and blue and has stars and stripes.", "The flag has stars and red and white stripes."], "image": "train2014/COCO_train2014_000000395013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542081, "question_id": "A7b5AYjRwyzvtRebKEv4DJ", "question": "The flying objects are made of what material?", "choices": ["polyester", "copper", "aluminum", "paper"], "correct_choice_idx": 0, "direct_answers": ["nylon", "nylon", "rubber", "cloth", "silk", "fabric", "nylon", "nylon", "polyester", "nylon"], "difficult_direct_answer": false, "rationales": ["They are made out of polyester.", "Kites are made of polyester.", "In the modern age, kites this bright are typically made of non-natural materials."], "image": "train2014/COCO_train2014_000000542081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518755, "question_id": "A7kkTwhjrC58KVoV5VqnHE", "question": "What are the people standing beside?", "choices": ["food truck", "bus", "train", "taxi"], "correct_choice_idx": 0, "direct_answers": ["food truck", "food truck", "food truck", "truck", "food truck", "food truck", "food truck", "truck", "food truck", "food truck"], "difficult_direct_answer": false, "rationales": ["The truck is shaped like a food truck and is the right size.", "The people are standing beside a road vehicle that does not transport members of the general public. a person on the inside of the vehicle is cooking.", "As evident because of the menu and food."], "image": "train2014/COCO_train2014_000000518755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152327, "question_id": "A7kuMGkvEJVVNChipFeZTn", "question": "What are the seating areas of the benches made from?", "choices": ["plastic", "bamboo", "wood", "steel"], "correct_choice_idx": 2, "direct_answers": ["wood", "wood", "wood", "wood", "woods", "wood", "wood", "wood", "wood", "wood metal"], "difficult_direct_answer": false, "rationales": ["The seating area shows some planks that are bowed up due to water damage. water would not damage bamboo, plastic, or steel.", "These benches, as most benches are, are made of planks of wood on a metal frame.", "The benches are made from strips of wood."], "image": "train2014/COCO_train2014_000000152327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394449, "question_id": "A7mGBHyrRhVQoWRmZWaLDo", "question": "What climate do persons await the train in?", "choices": ["tropical", "tundra", "desert", "cold"], "correct_choice_idx": 0, "direct_answers": ["sunny tropical", "tropical", "tropical", "cool weather", "covered", "summer", "warm", "sunny", "warm", "warm"], "difficult_direct_answer": false, "rationales": ["The climate is tropical.", "There are palm trees, they're wearing summer clothing and the language on the sign is from south asia, which has year-round warm weather.", "We can tell by the way the people on the bench are dressed that it's warm out. however, the many palm trees on the left are a good indication it's not just warm out, but that the humidity is off the charts!."], "image": "val2014/COCO_val2014_000000394449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466083, "question_id": "A7pHB7rhh6xqGHGt6wVctT", "question": "What team is practicing?", "choices": ["washington", "chicago", "new york", "toronto"], "correct_choice_idx": 0, "direct_answers": ["washington nationals", "washington", "washington nationals", "washington", "baseball", "batting", "washington", "team w", "washington", "baseball team"], "difficult_direct_answer": false, "rationales": ["They are wearing red hats and shirts. the logo is a stylized w.", "There is a w on his jersey", "The nationals are an mlb team"], "image": "train2014/COCO_train2014_000000466083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193193, "question_id": "A7pRQDdUfJzuD4x7HDiTch", "question": "What is the body of water in the background called?", "choices": ["river", "oxbow", "ocean", "locke"], "correct_choice_idx": 0, "direct_answers": ["lake", "sea", "lake/ocean", "lake", "river", "ocean", "river", "lake", "lake", "river"], "difficult_direct_answer": false, "rationales": ["The water is a river.", "The body of water is a large river.", "The water is a large river outside of the office. there is no beach."], "image": "train2014/COCO_train2014_000000193193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333114, "question_id": "A7yuLgneeVdCT2vma7jcq2", "question": "What are the pink flowers on the trees called?", "choices": ["lavender", "cherry blossoms", "daisies", "lilacs"], "correct_choice_idx": 1, "direct_answers": ["cherry blossoms", "cherry blossoms", "cherry blossoms", "cherry blossoms", "cherry blossom", "cherry blossoms", "cherry blossoms", "cherry blossom", "cherry blossoms", "cherry blossoms"], "difficult_direct_answer": false, "rationales": ["These are cherry trees", "The pink flowers are cherry blossoms.", "The pink flowers grow on the cherry blossom tree in the spring."], "image": "val2014/COCO_val2014_000000333114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153142, "question_id": "A83d74kuK2u6MmXTXGzWLA", "question": "What theme seems to have inspired the painting of the surfboards?", "choices": ["countries", "sports", "cars", "superheroes"], "correct_choice_idx": 0, "direct_answers": ["flags", "country flags", "country flags", "country flags", "patriotic", "country flags", "national flags", "countries", "united states", "flags"], "difficult_direct_answer": false, "rationales": ["Surfboards are all painted in red, white, and blue. the american flag is red, white, and blue.", "The theme is a country's flag.", "In looking at these surfboard, they seem to each have a different set of colors, stripes and symbols. the leftmost one looks like an american flags while another of the swiss icon. the conclusion is that theme represents countries."], "image": "train2014/COCO_train2014_000000153142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543194, "question_id": "A8J9FwciaNeXTB4RzxWPtm", "question": "The last word on the poster on the right is most likely pronounced similarly to what?", "choices": ["coffee", "juice", "seltzer", "soda"], "correct_choice_idx": 0, "direct_answers": ["coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["A poster has a work that phonetically matches the word coffee.", "The poster looks like coffee.", "The words is similar to the coffee as it original word implies."], "image": "train2014/COCO_train2014_000000543194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41729, "question_id": "A8MvrMKLfjg4CvLF6Yc7ce", "question": "What is this device used for?", "choices": ["phone calls", "music", "arithmetic", "video games"], "correct_choice_idx": 2, "direct_answers": ["mathematical calculations", "calculate", "calculating", "arithmetic", "remote", "calculating", "calculating", "calculating", "calculating", "math"], "difficult_direct_answer": false, "rationales": ["The remote has numbers and math symbols.", "The handheld device is a calculator, not a console, phone, or music player.", "A person is holding an electronic device that has just numbers and math symbols on it."], "image": "train2014/COCO_train2014_000000041729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422212, "question_id": "A8ZcagV4HF87kvjeGckJ8z", "question": "What type of people are show?", "choices": ["actors", "models", "judges", "athletes"], "correct_choice_idx": 3, "direct_answers": ["tennis players", "tennis players", "tennis players", "tennis players", "tennis players", "players", "players", "tennis players", "athletes", "players"], "difficult_direct_answer": false, "rationales": ["They are sportsmen who played in tennis.", "The people are athletes.", "Tennis players use rackets like the ones shown in the picture and on the wall."], "image": "val2014/COCO_val2014_000000422212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578945, "question_id": "A8j5Jazn2SqniGMQv3GeMr", "question": "Where Tapestry is located?", "choices": ["california", "london", "none", "new york"], "correct_choice_idx": 3, "direct_answers": ["store", "town square", "new york", "mississippi", "parade", "left truck", "outside", "behind", "usa", "top middle"], "difficult_direct_answer": true, "rationales": ["The picture doesn't give any real ways of showing where it has been taken.", "Tapestry is located in california according to the license plate.", "Its located in london because most vintage cars are in london."], "image": "train2014/COCO_train2014_000000578945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260221, "question_id": "A8v6wHSjMLx4kcJsoXtjaV", "question": "What is the old man doing with the white device in his hand?", "choices": ["cleaning", "painting", "directing", "gaming"], "correct_choice_idx": 3, "direct_answers": ["wii", "playing games", "playing wii", "nintendo controller", "playing game", "playing wii", "playing game", "playing wii", "gameplay", "gaming"], "difficult_direct_answer": false, "rationales": ["The man is playing wii.", "The man is holding a white video game controller in his hand while he plays a game.", "The man is holding a nintendo wii remote."], "image": "val2014/COCO_val2014_000000260221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225177, "question_id": "A8vkR3FeUbPuqZ6UqkZBgk", "question": "What is the purpose of this bus?", "choices": ["school bus", "transport goods", "limo", "military transport"], "correct_choice_idx": 0, "direct_answers": ["school", "school transportation", "carrying students", "school", "school travel", "transport students", "bus", "school bus", "school bus", "passengers"], "difficult_direct_answer": false, "rationales": ["The digital sign on the front of the bus indicates the purpose.", "There is writing that says so on the front top of the bus.", "It is yellow and has the characteristic markings of this type of vehicle used for this purpose."], "image": "val2014/COCO_val2014_000000225177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155543, "question_id": "A9DT3JhfEpiPbuJxBsKFW8", "question": "What are the red tables under the red roofed structures?", "choices": ["card table", "picnic tables", "dressing table", "bar table"], "correct_choice_idx": 1, "direct_answers": ["picnic tables", "sitting bench", "picnic tables", "picnic tables", "picnic tables", "bench", "picnic", "picnic tables", "picnic tables", "picnic tables"], "difficult_direct_answer": false, "rationales": ["The tables are tables you can sit and eat outside at.", "People can gather and have a picnic on them.", "It's picnic tables for people who are relaxing in the park."], "image": "train2014/COCO_train2014_000000155543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325836, "question_id": "A9Z9RBTK66NEtPjKSP8jWb", "question": "Where is this bike parked?", "choices": ["parking lot", "hotel room", "buffet", "expo"], "correct_choice_idx": 3, "direct_answers": ["indoor facility", "inside", "red carpet", "showroom", "expo", "red carpet", "showroom", "inside", "red carpet", "red carpet"], "difficult_direct_answer": false, "rationales": ["This is a bike expo. there are other bikes for viewing.", "It is a large indoor space with many people present.", "The motorcycle is on a display stage with an audience surrounding it."], "image": "val2014/COCO_val2014_000000325836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463354, "question_id": "A9ZNwiwWjkK6ayZJSCimBL", "question": "This child has a picture of what animal on their vest?", "choices": ["goat", "frog", "dog", "cat"], "correct_choice_idx": 0, "direct_answers": ["reindeer", "goat", "antelope", "ram", "goat", "goat", "gazelle", "goat", "goat", "goat"], "difficult_direct_answer": false, "rationales": ["The animal has horns on the head.", "The animal has horns. cats, dogs, and frogs do not have horns.", "The animal has a set of horns on it."], "image": "train2014/COCO_train2014_000000463354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516167, "question_id": "A9sC6Y5ScT27ZyjWWa5jwE", "question": "Which American automobile manufacturer made the red truck?", "choices": ["ford", "gmc", "chevrolet", "pontiac"], "correct_choice_idx": 0, "direct_answers": ["ford", "ford", "ford", "ford", "ford", "ford", "ford", "ford", "ford", "ford"], "difficult_direct_answer": false, "rationales": ["Above the grille on this truck and below the hood the word 'ford' is written in metal.", "The maker of the vehicle is written across the front of the truck in silver letters.", "It says the make across the front above the grill."], "image": "train2014/COCO_train2014_000000516167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309034, "question_id": "A9tvMPUg5LmGSDFkeCEHUs", "question": "Which level do the people go to?", "choices": ["lower level", "same level", "upper level", "none"], "correct_choice_idx": 1, "direct_answers": ["ground", "boarding", "aeroplane", "top level", "second", "ground level", "second", "same level", "up", "boarding"], "difficult_direct_answer": false, "rationales": ["They are standing on a conveyor belt. these typically remain at one elevation.", "The people go to the same level.", "The people are all going to the same level."], "image": "train2014/COCO_train2014_000000309034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277914, "question_id": "A9tz9Z2YpToaZviLjYQJTP", "question": "What would you call this type of fruit seller?", "choices": ["retailer", "merchant", "grocer", "street vendor"], "correct_choice_idx": 3, "direct_answers": ["exotic fruit", "gone", "produce seller", "market stall", "tropical", "vendor", "tropical", "vendor", "street vendor", "produce"], "difficult_direct_answer": false, "rationales": ["They don't have a building.", "Street vendors are on the street and do not have a brick and mortar store.", "There is a market."], "image": "train2014/COCO_train2014_000000277914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105685, "question_id": "AAitqMd3D5bQaHSFSVn8rC", "question": "What food is the colorful ingredient put onto?", "choices": ["donut", "ice-cream", "yogurt", "pancake"], "correct_choice_idx": 0, "direct_answers": ["doughnut", "doughnuts", "sprinkles", "confetti", "donut", "sprinkles", "donut", "doughnuts", "donuts", "donuts"], "difficult_direct_answer": false, "rationales": ["The child has a hat on that says krispy kreme and so does the box.", "Kids are standing around a table with krispy kreme hats on.", "The container is holding colorful sprinkles."], "image": "train2014/COCO_train2014_000000105685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253262, "question_id": "AAs4SMabktbrtxm29e5xVw", "question": "What liquid goes through the hose on the ground?", "choices": ["none", "airplane fuel", "waste", "milk"], "correct_choice_idx": 1, "direct_answers": ["jet fuel", "jet fuel", "gas", "jet fuel", "gas", "gas", "airplane fuel", "gasoline", "fuel", "patrol"], "difficult_direct_answer": false, "rationales": ["They use it to put gas in the plane.", "The vehicle has wings and is capable of flying. the hose is connected to a truck that has a royal dutch shell logo and a flammable material warning symbol on its side.", "The hose is connected to a shell gas tank."], "image": "val2014/COCO_val2014_000000253262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308302, "question_id": "ABAp4WxehfEXEkwegMCxwR", "question": "What demographic of people use this lounge area the most?", "choices": ["middle class", "upper class", "working class", "lower class"], "correct_choice_idx": 1, "direct_answers": ["upper class", "tourists", "older people", "tourists", "adults", "old", "retirees", "hikers", "sightseers", "white"], "difficult_direct_answer": true, "rationales": ["There is nothing defining about the space itself to determine what type of person may use the space, but if this was a private residence it would likely be expensive based on the view.", "The lounge area has high quality wooden seating in a secluded area that had a large amount of money invested into maintain and building it.", "A balcony indicative of a fancy resort is shown with a view of the city below."], "image": "train2014/COCO_train2014_000000308302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284233, "question_id": "ABHK8Bm2MDYZ2gHfCe4J5e", "question": "What style of pizza do they serve?", "choices": ["sicilian", "california", "chicago", "new york"], "correct_choice_idx": 3, "direct_answers": ["deep dish", "meat lovers", "sicilian", "al taglio", "sicilian", "thin crust", "new york", "deep dish", "sausage", "meat lovers"], "difficult_direct_answer": false, "rationales": ["The other answers on the list have distinct and defining features that are not visible in the pizza of this image.", "The style of pizza is square like in new york.", "This is a thick square pizza which is more popular in chicago."], "image": "train2014/COCO_train2014_000000284233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525438, "question_id": "ABR7SpMWiuTr3eqBnJ9N5r", "question": "Which letters are missing from the sign?", "choices": ["de", "ne", "mi", "to"], "correct_choice_idx": 1, "direct_answers": ["n e", "ne", "ve", "ne", "n e", "n e", "ca", "n e", "lev", "ne"], "difficult_direct_answer": false, "rationales": ["The shape of the missing letters are still visible, making them readable.", "It is spelled c l e a n e r s.", "The missing words for now leasing."], "image": "train2014/COCO_train2014_000000525438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271301, "question_id": "ABg5wNRBCDZ5tuDQKh6L9q", "question": "What propels this craft forward?", "choices": ["oars", "sails", "motor", "wind"], "correct_choice_idx": 2, "direct_answers": ["motor", "gasoline motor", "motor", "motor", "motor", "motor", "motor", "gasoline motor", "motor", "motor"], "difficult_direct_answer": false, "rationales": ["There is a wake behind the boat and its front is out of the water, suggesting something is causing it to move rapidly.", "The boat has a motor on the back.", "There's a motor on the boat."], "image": "train2014/COCO_train2014_000000271301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110327, "question_id": "ABnoArzEabh7qYcNFaEgPs", "question": "What is the best way to cook a cucumber?", "choices": ["frying", "toasting", "baking", "grilling"], "correct_choice_idx": 2, "direct_answers": ["air fryer", "grill", "pickle", "stir fry", "pickled", "grill it", "grilled", "roast", "baking", "tasty"], "difficult_direct_answer": true, "rationales": ["This is really a matter of preference but if you cook in the oven it can get crispy without charring.", "You can fry cucumbers to make fried pickles.", "Cooking over an open flame gives it a nice char. charring can bring out more flavor."], "image": "train2014/COCO_train2014_000000110327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563514, "question_id": "ABzC8QfTjX8AC9bmeqyWdq", "question": "This bus is part of what?", "choices": ["reduced rides", "sale", "commute", "exhibition"], "correct_choice_idx": 3, "direct_answers": ["tourist tour", "centennial celebration", "welcome tour", "centennial celebration", "centennial bus", "shopping mall", "celebration", "transit museum", "exhibition", "centennial celebration"], "difficult_direct_answer": false, "rationales": ["This bus is part of a bus exhibition as is read from the sign.", "It is a very old bus with informational signs on it", "A bus is on display with an informational board next to it."], "image": "train2014/COCO_train2014_000000563514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360504, "question_id": "AC5MKvmTFHZnt7huqKPeus", "question": "What is the far end of the pool called?", "choices": ["deep side", "adults only", "deep zone", "deep end"], "correct_choice_idx": 3, "direct_answers": ["deep end", "deep end", "widths", "deep end", "edge", "widths", "deep end", "deep end", "deep end", "picnic area"], "difficult_direct_answer": false, "rationales": ["The water goes down further and it is kept to one end for safety", "It is on the opposite end of the shallow pool.", "That part of the pool has deeper water"], "image": "val2014/COCO_val2014_000000360504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113192, "question_id": "ACBQKzdfXwozTDyxHFRvb2", "question": "The exposure makes the woman look like what?", "choices": ["leprechaun", "ghost", "witch", "vampire"], "correct_choice_idx": 1, "direct_answers": ["ghost", "ghost", "ghost", "ghost", "ghost", "ghost", "ghost", "ghost", "ghost", "ghost"], "difficult_direct_answer": false, "rationales": ["The woman looks like a ghost due to the camera.", "The exposure is ghostly.", "She looks see through like a spirit."], "image": "train2014/COCO_train2014_000000113192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341218, "question_id": "ACEHcPS9sKKmTxwLjsjjpn", "question": "What type of refrigerator would this be called?", "choices": ["french door", "side-by-side", "under counter", "built-in"], "correct_choice_idx": 1, "direct_answers": ["double door", "side", "walk in", "double sided", "french door", "side-by-side", "french doors", "side-by-side", "french door", "side-by-side"], "difficult_direct_answer": false, "rationales": ["The doors are on each side.", "The freezer and the refrigerator areas are both the same height and the doors are not one on top of the other", "The freezer and refrigerator are next to each other instead of on top of each other."], "image": "train2014/COCO_train2014_000000341218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219418, "question_id": "ACFLar6ck2MMAvGzgZ9eZj", "question": "What is the logo on the bear?", "choices": ["maple", "cotton", "ball", "shamrock"], "correct_choice_idx": 3, "direct_answers": ["shamrock", "clover", "clover", "clover", "clover", "shamrock", "shamrock", "four-leaf clover", "clover", "clover"], "difficult_direct_answer": false, "rationales": ["A four leaf clover is sometimes also referred to as a shamrock.", "The logo on the bear is clearly visible and has the outline commonly associated with answer a.", "The logo is a shamrock."], "image": "train2014/COCO_train2014_000000219418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137330, "question_id": "ACQ8rsTNXPgaJBT2t8orux", "question": "What is the person standing here keeping?", "choices": ["lunch", "pace", "shark watch", "nothing"], "correct_choice_idx": 1, "direct_answers": ["time", "rhythm", "rythm", "rope", "pace", "pace", "rhythm", "time", "command", "cadence"], "difficult_direct_answer": false, "rationales": ["The rowers need to paddle at the same time for maximum performance. the standing person shouts a rhythm for them to follow.", "This person keeps everyone in time and on the same rhythm.", "She is shouting instructions to keep the rowers in tune with each other."], "image": "val2014/COCO_val2014_000000137330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196558, "question_id": "ACZntAB4VqTmZAG9aCTYxc", "question": "What is the thin brown pole on the stand made from?", "choices": ["steel", "bamboo", "birch", "plastic"], "correct_choice_idx": 1, "direct_answers": ["bamboo", "bamboo", "bamboo", "wood", "wood", "wood", "bamboo", "bamboo", "holder", "bamboo"], "difficult_direct_answer": false, "rationales": ["Based on the smoothness of the wood, the circumference and the knots, answer a is consistent with what is visible.", "The thin pole has the straightness and regular knots that are commonly observed with answer a.", "The pole is bamboo."], "image": "train2014/COCO_train2014_000000196558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562124, "question_id": "ACpJaqmaGW5YXMf9ihJKv4", "question": "How are these images related?", "choices": ["same film", "sequence", "same subjects", "same camera"], "correct_choice_idx": 1, "direct_answers": ["same person", "time lapse", "tennis", "temporal", "sequence", "duplicates", "identical", "same movement", "fast-action photography", "copies"], "difficult_direct_answer": true, "rationales": ["A woman reaching to return a tennis ball is slightly moved in the second image with everything else being the same.", "Looks as if one is taken right after the other one.", "These pictures are of the same person and were taken very close together or sequentially."], "image": "train2014/COCO_train2014_000000562124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469781, "question_id": "ACqbMkJwsrQwiUtKFqcMUR", "question": "Who is the man in the white platform?", "choices": ["firefighter", "policeman", "utility worker", "stuntman"], "correct_choice_idx": 2, "direct_answers": ["electrician", "utility worker", "service technician", "electrician", "arborist", "worker", "worker", "electrician", "electrician", "tree trimmer"], "difficult_direct_answer": false, "rationales": ["A man is in a lift connected to a large truck and is working near electrical lines. utility workers work near electric lines.", "The man is a lineman. he is in a bucket truck working on the power lines.", "He is a utility worker."], "image": "train2014/COCO_train2014_000000469781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52161, "question_id": "ACwyEi2m6fZrPTLpb35RAQ", "question": "What is the safest motorcycle jacket?", "choices": ["king trans", "alpinestars", "klim induction", "pilot trans"], "correct_choice_idx": 3, "direct_answers": ["leather", "pilot trans", "leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather"], "difficult_direct_answer": false, "rationales": ["This one is listed as the second best jacket", "A pilot trans is a really strong brand for a motorcycle jacket.", "The pilot trans is the most safe motorcycle jacket."], "image": "val2014/COCO_val2014_000000052161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242365, "question_id": "AD2Ca3bnTWJM6i4hexTgQo", "question": "What chore does the woman perform?", "choices": ["dusting", "basting", "dish washing", "frying"], "correct_choice_idx": 2, "direct_answers": ["dish washing", "wash dishes", "washing dishes", "washing dishes", "dish washing", "washing dishes", "washing dishes", "dish washing", "dish washing", "washing dishes"], "difficult_direct_answer": false, "rationales": ["She is using the sink to clean a pot.", "She has water running and she is cleaning the dishes off.", "The woman is cleaning a pot with soap and water."], "image": "val2014/COCO_val2014_000000242365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42705, "question_id": "AD6kr3JWfbQWDaumAJtUHm", "question": "Where could the man in the scooter cross the street?", "choices": ["crosswalk", "nowhere", "2 blocks", "next city"], "correct_choice_idx": 0, "direct_answers": ["crosswalk", "lines", "crosswalk", "crosswalk", "crosswalk", "zebra crossing", "crosswalk", "crosswalk", "crosswalk", "crosswalk"], "difficult_direct_answer": false, "rationales": ["A mobility scooter is often legally allowed in the same pathways as pedestrians. pedestrians legally cross streets on painted striped paths at intersections.", "The man in the scooter is crossing the street near a crosswalk.", "The crosswalk is a safe place to cross."], "image": "train2014/COCO_train2014_000000042705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346795, "question_id": "ADHNH8AzhmBRn6RBaWqnzb", "question": "Thy are they lined up?", "choices": ["hungry", "lost", "posing", "fighting"], "correct_choice_idx": 2, "direct_answers": ["race", "for photo", "to race", "avoid crashing", "skating", "to ski", "photo opp", "posing", "down slope", "ski"], "difficult_direct_answer": true, "rationales": ["The people are posing.", "They are standing looking at the camera with smiles on their faces.", "The skiers are posing for a photo."], "image": "train2014/COCO_train2014_000000346795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334509, "question_id": "ADYHpNMqoCSSnGCywZakhp", "question": "What sauce will be added to the meat?", "choices": ["barbecue", "mustard", "hot", "ketchup"], "correct_choice_idx": 0, "direct_answers": ["bbq sauce", "bbq", "barbeque sauce", "barbecue", "bbq", "bbq", "barbeque", "barbecue", "barbecue", "bbq"], "difficult_direct_answer": false, "rationales": ["This is barbeque pork and the sauce is the most common on it", "When grilling meat outdoors, the most common sauce would be bbq sauce.", "The meat is being barbecued."], "image": "val2014/COCO_val2014_000000334509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220872, "question_id": "ADuaNieCrXWpiZWyFhKvV3", "question": "What city is this train station located in?", "choices": ["las vegas", "paris", "new york", "london"], "correct_choice_idx": 2, "direct_answers": ["new york", "unknown", "new york", "london", "london", "new york", "new york", "london", "new york", "new york"], "difficult_direct_answer": false, "rationales": ["The train indicates \"grand central\" and that is a famous train station in new york.", "You can tell by the name on the side of the train as to where this is.", "Grand central station is written on the side of the train and that is located in nyc."], "image": "val2014/COCO_val2014_000000220872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319346, "question_id": "AE3KcRSZ7nm8y6Z9ZfFU3w", "question": "What vegetable is toxic to horses?", "choices": ["eggplant", "carrot", "tomatoes", "beans"], "correct_choice_idx": 2, "direct_answers": ["garlic", "onion", "no clue", "onions", "carrot", "not carrots", "grass", "tomatoes", "onions", "onions"], "difficult_direct_answer": false, "rationales": ["The plant contains an alkaloid that slows gut function.", "An internet search revealed that vegetable in the nightshade family are toxic to horses.", "The vegetable is tomatoes."], "image": "train2014/COCO_train2014_000000319346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181053, "question_id": "AE772byHnn3BpfRBWopor8", "question": "What is he doing with his fist?", "choices": ["gesturing", "annoying other", "threatening other", "defending self"], "correct_choice_idx": 0, "direct_answers": ["fist pump", "shouting", "gesturing", "hitting", "cheering", "gesturing", "shaking it", "raising", "clenched", "raised"], "difficult_direct_answer": true, "rationales": ["He's gesturing.", "He is emphasizing with his hand.", "He is goofing around with his friend and you can tell by the look on his face."], "image": "train2014/COCO_train2014_000000181053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528201, "question_id": "AE8sZoUD73Lvrc3riVvaFm", "question": "What is the relationship between the two tennis players in this situation?", "choices": ["teammates", "classmates", "competitors", "coworkers"], "correct_choice_idx": 2, "direct_answers": ["friends", "teammates", "opponents", "friends", "friends", "competitors", "friends", "partners", "friends", "teammates"], "difficult_direct_answer": false, "rationales": ["They are competitors.", "There are only two visible players on the court. one has to hit the ball back and forth with the opponent.", "They are competing against one another."], "image": "val2014/COCO_val2014_000000528201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245006, "question_id": "AE9nc8bZ6Kkc4rLG4RJtTA", "question": "Where is this cat located?", "choices": ["home", "vet", "museum", "backyard"], "correct_choice_idx": 0, "direct_answers": ["on bed", "chair", "home", "sofa", "couch", "couch", "sofa", "bed", "couch", "on couch"], "difficult_direct_answer": false, "rationales": ["The cat is laying on a throw pillow on a couch with a curtain in the background. this type of background would be found in the house where the cat resides.", "The cat is sitting on a couch that would be found in a traditional house.", "The cat is at home."], "image": "val2014/COCO_val2014_000000245006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228783, "question_id": "AEH8KpgfN2ZnJmwMFSngJS", "question": "What type of sign is shown?", "choices": ["regulatory", "orientation", "location", "directional"], "correct_choice_idx": 2, "direct_answers": ["yellow sign", "wood", "park", "yellow", "funpark", "funpark", "location", "board", "funpark", "funpark"], "difficult_direct_answer": false, "rationales": ["Based on the words visible, they sign is likely labelling the place that it is situated in.", "It tells you where this scene is happening.", "It is the name of the ski resort"], "image": "train2014/COCO_train2014_000000228783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354305, "question_id": "AEJn2jxYj9BEM8LJb6E2Yf", "question": "Who wrote the book whose name appears on the boat?", "choices": ["hg wells", "linnea quigley", "william shakespeare", "stephen king"], "correct_choice_idx": 2, "direct_answers": ["shakespeare", "william shakespeare", "shakespeare", "shakespeare", "shakespeare", "william shakespeare", "william shakespeare", "william shakespeare", "shakespeare", "no clue"], "difficult_direct_answer": false, "rationales": ["It is a name of a famous play by shakespeare.", "The boat is named tempest. this is a famous play about a ship.", "A boat is named tempest which is also the name of a book by shakespeare."], "image": "train2014/COCO_train2014_000000354305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296754, "question_id": "AEWvs84gC7Sd8qzWfHbYgR", "question": "What might have stopped her from walking further forward?", "choices": ["deep pit", "puddle", "cars", "walls"], "correct_choice_idx": 1, "direct_answers": ["puddle", "rain", "puddles", "traffic", "cars", "dangerous sidewalk", "cars", "traffic", "puddle", "puddle"], "difficult_direct_answer": false, "rationales": ["It's raining and the water is puddling in the road.", "Answer a is visible in front of the person and is something that typically would cause a person to change their path.", "There is a pool of water in front of her. she doesn't want to get wet."], "image": "train2014/COCO_train2014_000000296754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81842, "question_id": "AEriVrLSkC6GmwPYv6Mqyd", "question": "What do the separate traffic lights signal?", "choices": ["stop/go", "caution/stop", "caution/go", "caution/caution"], "correct_choice_idx": 3, "direct_answers": ["stop wait", "motion laws", "safety", "different lanes", "slow down", "directions", "stop", "get ready", "stop slow", "caution/caution"], "difficult_direct_answer": true, "rationales": ["They need to be careful.", "The yellow light means to take care, it's about to turn red.", "It means to slow down and be careful for other cars."], "image": "train2014/COCO_train2014_000000081842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377589, "question_id": "AExKpZfYtgWodQfBxP87LR", "question": "What kind of ground are these people sitting on?", "choices": ["ash", "sand", "marble", "concrete"], "correct_choice_idx": 1, "direct_answers": ["sand", "sand", "sandy ground", "sand", "dirt", "sand", "sand", "sand", "sand", "bench"], "difficult_direct_answer": false, "rationales": ["The ground is sand.", "They are sitting on a beach by the water.", "The people are sitting on a beach."], "image": "train2014/COCO_train2014_000000377589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153824, "question_id": "AF3tM2oN7CZCn67mr9bXXc", "question": "What utensil is missing?", "choices": ["spoon", "knife", "fork", "ladle"], "correct_choice_idx": 2, "direct_answers": ["spoon", "salad fork", "plate", "fork", "fork", "salad fork", "spoon", "fork", "fork", "fork"], "difficult_direct_answer": false, "rationales": ["A fork, knife, and spoon are usually present at the table. but in this case, a fork is missing.", "The fork is off to the side on the napkin.", "A person has a knife and a spoon on their lap."], "image": "train2014/COCO_train2014_000000153824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463495, "question_id": "AF4EeWTuaG8b7Fyizqr8sQ", "question": "What kind of service was this?", "choices": ["catering", "delivery", "home made", "restaurant"], "correct_choice_idx": 0, "direct_answers": ["catering", "buffet", "buffet", "buffet", "buffet", "buffet", "buffet", "buffet", "buffet", "buffet"], "difficult_direct_answer": false, "rationales": ["A table is filled with food in large serving dishes. caterers set food up on tables to serve people.", "One can see the professional food warming dish that is used by professionals at these type of events.", "The food is set up for people to take what they want."], "image": "train2014/COCO_train2014_000000463495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292805, "question_id": "AFLHrhNaz92ASswvvEXhCG", "question": "What is she doing wrong?", "choices": ["watching", "texting", "sitting", "breathing"], "correct_choice_idx": 2, "direct_answers": ["calling", "dialing", "sitting", "sitting down", "sitting", "sitting down", "sitting down", "sitting down", "sitting there", "sitting down"], "difficult_direct_answer": false, "rationales": ["There is a sign that says what she is not allowed to do, and she is still doing it.", "There is a sign prohibiting this action", "The sign on the pillar requires standing in that area which the woman has disobeyed."], "image": "train2014/COCO_train2014_000000292805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546473, "question_id": "AFzHNaGccFRejAr7aKDuBP", "question": "The longboarders are skating on the road during which season of the year?", "choices": ["winter", "summer", "fall", "spring"], "correct_choice_idx": 2, "direct_answers": ["fall", "summer", "fall", "summer", "fall", "fall", "summer", "autumn", "fall", "summer"], "difficult_direct_answer": false, "rationales": ["The trees are changing colors.", "The leaves on the trees appear to be changing color as they do in the time of year given in answer a.", "The leaves are turning colors before they fall off"], "image": "train2014/COCO_train2014_000000546473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270362, "question_id": "AG4VHs55Ghw7b8PuarHqSW", "question": "What object in the photo helped Newton realize gravity?", "choices": ["apple", "forks", "leaf", "silver tray"], "correct_choice_idx": 0, "direct_answers": ["apple", "apple", "apple", "apple", "apple", "apples", "apples", "apple", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["The object is an apple.", "The falling apple helped newton realize gravity.", "The apple is what fell on newton's head and is pictured here."], "image": "train2014/COCO_train2014_000000270362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87017, "question_id": "AG7hdFCQKyVSQPSQ5w5G3r", "question": "Why is she wearing gloves?", "choices": ["warmth", "grip", "health", "fashion"], "correct_choice_idx": 1, "direct_answers": ["horseback rider", "grip", "protection", "riding", "protection", "safety", "protection", "riding horses", "rider", "protect hands"], "difficult_direct_answer": false, "rationales": ["Wearing gloves when horseback riding helps to maintain your hold on the reins and prevents chafing.", "The woman wants a grip.", "Holding the ropes can be hard on the hands and gloves provide protection to the skin."], "image": "train2014/COCO_train2014_000000087017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39846, "question_id": "AGc6wKxjUghihd2iy8LVKQ", "question": "What is the vent on top of the stove for?", "choices": ["smoke", "soda", "coffee", "water"], "correct_choice_idx": 0, "direct_answers": ["for smoke", "ventilate smoke", "removing smoke", "ventilation", "smoke", "ventilation", "smoke", "heat", "fan", "control smoke"], "difficult_direct_answer": false, "rationales": ["A stove vent directs the fumes that occur during cooking to the outside.", "The vent is for smoke.", "The stove will heat up and there will need to have a way for the steam to escape."], "image": "train2014/COCO_train2014_000000039846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334320, "question_id": "AGhG64MDitxWxoCD52qBkG", "question": "The animals are all doing what with their bodies?", "choices": ["laying down", "crawling", "standing up", "rolling around"], "correct_choice_idx": 0, "direct_answers": ["resting", "laying down", "resting", "laying down", "laying down", "standing", "sitting", "no", "laying down", "laying down"], "difficult_direct_answer": false, "rationales": ["The animals are being allowed to rest.", "All of the animals are lying in the street.", "The animals are laying down."], "image": "train2014/COCO_train2014_000000334320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229494, "question_id": "AGi99r4PMxmsxmkVABQjvt", "question": "How many different species are feeding directly from this plate?", "choices": ["one", "two", "twenty", "none"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two species.", "Just a human and a cat are eating from the plate.", "A woman and a cat are eating from the plate."], "image": "train2014/COCO_train2014_000000229494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202444, "question_id": "AGkmzLfBNKdda6GWNaY7ib", "question": "Why is the skateboard hanging off the pipe?", "choices": ["made mistake", "confused", "showing off", "is falling"], "correct_choice_idx": 2, "direct_answers": ["kid's stupid", "half pipe", "trick", "stunt", "skateboarding trick", "went up", "leverage", "trick", "showing off", "changing directions"], "difficult_direct_answer": true, "rationales": ["A skateboarder is on the side of a ramp preparing to do a trick.", "He's about to drop in on the ramp", "The person is showing how they can balance themself."], "image": "val2014/COCO_val2014_000000202444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142563, "question_id": "AGsGJUgiKVnG2o8wnYGa7C", "question": "What do you call the man with the white hat and jeans?", "choices": ["captain", "jockey", "navigator", "pilot"], "correct_choice_idx": 1, "direct_answers": ["cowboy", "cowboy", "cowboy", "cowboy", "cowboy", "cowboy", "rider", "cowboy", "jockey", "rodeo rider"], "difficult_direct_answer": false, "rationales": ["Jockeys ride horses for sport.", "The man is a jockey.", "Traditionally people who ride horses are called jockeys."], "image": "train2014/COCO_train2014_000000142563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301926, "question_id": "AGvYDebgMAXwajKkfsMjXD", "question": "What companies logo can be seen on the white building?", "choices": ["mcdonalds", "arbys", "taco bell", "burger king"], "correct_choice_idx": 0, "direct_answers": ["mcdonald's", "mcdonald's", "mcdonalds", "mcdonalds", "mcdonald's", "mcdonald's", "mcdonalds", "mcdonalds", "mcdonalds", "toronto sun"], "difficult_direct_answer": false, "rationales": ["It is the golden arches against a red background.", "Mcdonald's is known for its golden arches.", "There are golden arches on a red background."], "image": "val2014/COCO_val2014_000000301926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526456, "question_id": "AH4ozCMwWNKtuX8ieYtGpm", "question": "Which animal look different than the cows in the picture?", "choices": ["snake", "chicken", "goat", "pig"], "correct_choice_idx": 2, "direct_answers": ["goat", "lamb", "calf", "horses", "horse", "calf", "calf", "horse", "calf", "small one"], "difficult_direct_answer": false, "rationales": ["There are several cows with one goat in the bunch.", "The goats look different from the cows.", "The goat blends in the pic but he's there."], "image": "val2014/COCO_val2014_000000526456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114178, "question_id": "AH5uejjsb6qEtij8YnXGtb", "question": "What type of fork is included with the meal?", "choices": ["dessert", "fruit", "baby", "salad"], "correct_choice_idx": 0, "direct_answers": ["metal", "metal", "dinnerware", "dessert fork", "plastic", "meatal fork", "silver", "dessert", "pronged fork", "dessert"], "difficult_direct_answer": false, "rationales": ["The fork is for dessert.", "A pastry is on a plate with a slightly small silver utensil for eating.", "A dessert is on a plate with a fork next to it. dessert forks are used to eat dessert."], "image": "train2014/COCO_train2014_000000114178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377577, "question_id": "AH6CjHvKqYoK7cvzUu6Xqm", "question": "What is the woman doing with the electronic device in her hand?", "choices": ["photographing", "listening", "typing", "math"], "correct_choice_idx": 1, "direct_answers": ["listening", "communicating", "texting", "changing songs", "texting", "listening music", "listening music", "calling", "hearing music", "texting"], "difficult_direct_answer": false, "rationales": ["There are headphones that transfer audio from the device to her ears.", "The woman is listening with earbuds to her phone.", "The lady has earphones in her ears. the earphone are connected to her smartphone."], "image": "val2014/COCO_val2014_000000377577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364745, "question_id": "AHRUAxApcLodHFwuS625z4", "question": "Why is the girl so close to the baby?", "choices": ["stole her", "afraid", "holding her", "no room"], "correct_choice_idx": 2, "direct_answers": ["sister", "holding", "siblings", "loves her", "family", "sibling", "holding her", "siblings", "siblings", "relative"], "difficult_direct_answer": false, "rationales": ["The girl is holding the little baby.", "The small child is sitting on her lap supported by a circular pillow which helps assist in proper and easier holding she also has her hand placed on either side of the small child as a extra protection.", "She is holding the baby on her lap"], "image": "val2014/COCO_val2014_000000364745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488432, "question_id": "AHWRsbSi9iNoRoi75GMYTU", "question": "What style of motor vehicle can be seen on the right?", "choices": ["model f", "model h", "model b", "model t"], "correct_choice_idx": 3, "direct_answers": ["car", "car", "car", "model t", "car", "motorized carriage", "model t", "mini car", "model t", "ford"], "difficult_direct_answer": false, "rationales": ["The style is model t.", "A model f, h, or b is not a real thing.", "An old fashioned car with large spoked tires is in a street."], "image": "train2014/COCO_train2014_000000488432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464859, "question_id": "AHoiVKkd8iZSZSw3RMmnYB", "question": "What is missing from this picture?", "choices": ["tissues", "trash", "brush", "soap"], "correct_choice_idx": 2, "direct_answers": ["brush", "shower", "shower", "toilet", "shower/bath", "shower", "person", "bathtub", "shower room", "shower"], "difficult_direct_answer": false, "rationales": ["The other items are all in the picture", "The tissues, soap, and trash are already there.", "None of these items could be used to style hair."], "image": "train2014/COCO_train2014_000000464859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232670, "question_id": "AHtdvrG5uMej627Di3EmbF", "question": "Which one of these foods is most likely to be served by the waitress?", "choices": ["sushi", "pierogi", "taco", "roti"], "correct_choice_idx": 0, "direct_answers": ["asian", "all", "pizza", "all", "sushi", "sushi", "muffins", "sushi", "breakfast lunch", "japanese"], "difficult_direct_answer": false, "rationales": ["It's a japanese dish and there is japanese writing on the chalk board", "There's asian writing on the board. sushi is an asian dish.", "This restaurant has asian writing on the blackboard. asian writing is not seen where you would buy roti, tacos, or pierogis."], "image": "train2014/COCO_train2014_000000232670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373198, "question_id": "AJ5VoB2wj8UEcS2SUgDGXj", "question": "What type of matchup is this?", "choices": ["men's singles", "women's singles", "mixed doubles", "men's doubles"], "correct_choice_idx": 0, "direct_answers": ["doubles", "tennis", "doubles tennis", "tennis", "tennis", "doubles tennis", "men's singles", "tennis", "tennis", "doubles"], "difficult_direct_answer": false, "rationales": ["There are 2 men and 2 women on the court", "There are men and women and it's 4 players so 2 will be on each side of the court when they play", "There are both women and men in the game on each side."], "image": "train2014/COCO_train2014_000000373198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8944, "question_id": "AJ9sXhdjYeBgPErEo6vH6g", "question": "What extra parts did the rider add to the front of the motorcycle that will ensure better visibility?", "choices": ["handlebars", "mirrors", "horns", "seatbelts"], "correct_choice_idx": 1, "direct_answers": ["mirrors", "mirrors", "mirrors", "mirrors", "mirrors", "wheels", "eleven", "side mirrors", "golf clubs", "mirrors"], "difficult_direct_answer": false, "rationales": ["They added a bunch of mirrors that you can see sticking out from the bike", "They added a bunch of mirrors.", "Many mirrors are applied to the motorcycle."], "image": "train2014/COCO_train2014_000000008944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193222, "question_id": "AJMTDFLWVxz9NTsnnsk6Ny", "question": "What feeling do these cats seem to be portraying?", "choices": ["pleased", "curiosity", "scared", "tired"], "correct_choice_idx": 1, "direct_answers": ["curiosity", "curiosity", "curiosity", "pensive", "curiosity", "protective", "curiosity", "curious", "curiosity", "happy"], "difficult_direct_answer": false, "rationales": ["Cats are known for being curious and into everything.", "They are looking outside to see what's going on.", "They are staring intently out of the window."], "image": "train2014/COCO_train2014_000000193222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88477, "question_id": "AJVD5bVPDfbAV8jrdBwaFq", "question": "What poses the most danger for the cow?", "choices": ["stop sign", "car", "bicycle", "no danger"], "correct_choice_idx": 1, "direct_answers": ["car", "cars", "traffic", "cars", "cars", "traffic", "vehicles", "traffic", "car", "traffic"], "difficult_direct_answer": false, "rationales": ["The cow could get hit by the car.", "The cow is standing in the middle of the road and could get hit by a vehicle. a bicycle would not hurt the cow very much.", "The cow may be hit by a vehicle as it is standing in the middle of the road."], "image": "train2014/COCO_train2014_000000088477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342204, "question_id": "AJWzbGLvkhsnBdNXMmV2Hb", "question": "The bands worn by the people indicate that they paid for what event?", "choices": ["baseball game", "movie", "play", "concert"], "correct_choice_idx": 3, "direct_answers": ["fair", "sports", "festival", "amusement park", "entrance fee", "concert", "concert", "festival", "this event", "concert"], "difficult_direct_answer": false, "rationales": ["They are at a concert and these are used to make sure everyone had paid", "The people are wearing wristbands to indicate that they paid to get into a concert.", "Items around the wrist are worn as indicators of participating in events such as concerts."], "image": "val2014/COCO_val2014_000000342204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445267, "question_id": "AJfzshahTvHLqiKc6BRk63", "question": "Which NFL team is represented by the bear in green and yellow?", "choices": ["minnesota", "washington", "miami", "green bay"], "correct_choice_idx": 3, "direct_answers": ["packers", "packers", "packers", "packers", "packers", "packers", "greenway packers", "greenway packers", "green bay", "greenway packers"], "difficult_direct_answer": false, "rationales": ["The stuffed animal wearing a football helmet on the left wears the uniform of the green bay packers and on it's helmet part of their logo 'g' is visible.", "The colors are green, yellow, and white on the uniform so duh if you watch football.", "The packers have green and yellow uniforms and a g on their helmets. they are based in wisconsin."], "image": "val2014/COCO_val2014_000000445267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14493, "question_id": "AJg3nZWSp36wrxnPU5Duxq", "question": "What material is the stuffy animal made of?", "choices": ["synthetic fiber", "denim", "wool", "leather"], "correct_choice_idx": 0, "direct_answers": ["cotton", "cotton", "fake fur", "mohair", "cotton", "fabric", "cotton", "cotton", "fur", "synthetic fiber"], "difficult_direct_answer": false, "rationales": ["The stuffed animal looks modern based on the style. most modern stuffed animals are comprised of answer a.", "A woman is holding a plush stuffed animal.", "A machine made this toy. this type of fur is common."], "image": "train2014/COCO_train2014_000000014493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172158, "question_id": "AJknzrepoRHXkjvy9bNBph", "question": "How would one feel in the foreground as opposed to in the background?", "choices": ["more slim", "more relaxed", "more stressed", "more intelligent"], "correct_choice_idx": 1, "direct_answers": ["shaded", "shaded", "shaded", "less excited", "relaxed unhurried", "more relaxed", "better", "shaded", "relaxed", "chilly"], "difficult_direct_answer": false, "rationales": ["There is a peaceful, relaxing quality to the foreground.", "The question is subjective, but the foreground depicts a recreational zone that would likely be used for the purposes of answer a in contrast to the background which is an urban scene known for hustle and bustle.", "The setting is in a unclutter, quiet space versus the loud noises and traffic in the urban area."], "image": "train2014/COCO_train2014_000000172158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324412, "question_id": "AJu8oogsiJR94TUp268CBH", "question": "What does the bridge cross?", "choices": ["river", "electrical wires", "just dirt", "road"], "correct_choice_idx": 0, "direct_answers": ["river", "river", "river", "river", "river", "river", "river", "river", "water", "water"], "difficult_direct_answer": false, "rationales": ["The bridge is used so people walking can cross over the water to get to the other side.", "There is a flowing body of water that passes under the bridge.", "The body of water is narrow and long, but wide and deep enough for boats."], "image": "train2014/COCO_train2014_000000324412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58141, "question_id": "AJuZbeczoPDbYC38x9Msbi", "question": "Why are umbrellas being used today?", "choices": ["snow", "rain", "sun", "privacy"], "correct_choice_idx": 2, "direct_answers": ["block sun", "sunny", "block sunlight", "protection", "sun protection", "sun", "blocking sunlight", "hot", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["Though traditionally umbrellas are used for rain, but it's a sunny day.", "The hot sun is easily blocked by the use of an umbrella. also, afternoon rain storms aren't that unusual on a hot sunny day in warm climates, so it's good to have an umbrella no matter what!.", "The women are carrying umbrellas because it is a sunny day and they want to stay in the shade."], "image": "val2014/COCO_val2014_000000058141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349438, "question_id": "AKF4VewQYUYKHSYy8iHjTr", "question": "What type of trees are most visible here?", "choices": ["evergreen", "myrtle", "oak", "palm"], "correct_choice_idx": 0, "direct_answers": ["pine", "fir", "pine", "deciduous", "evergreen", "pine", "pine", "pine", "pine trees", "pine trees"], "difficult_direct_answer": false, "rationales": ["Tall green trees with pine needles are behind a skatepark.", "Most of these trees are evergreen pines.", "Evergreens are visible."], "image": "val2014/COCO_val2014_000000349438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345838, "question_id": "AKVACSJe4cRdgxbgv8LsMW", "question": "What is he wearing on his feet?", "choices": ["slippers", "shoes", "sandals", "sneakers"], "correct_choice_idx": 3, "direct_answers": ["sneakers", "tennis shoes", "tennis shoes", "shows", "shoes", "sneakers", "sneakers", "sneakers", "sneakers", "shoes"], "difficult_direct_answer": false, "rationales": ["The man is wearing a pair of athletic sneakers.", "He is wearing athletic shoes that are also known by this as well as other names like tennis shoes.", "The man is wearing sneakers or tennis shoes on his feet."], "image": "val2014/COCO_val2014_000000345838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61354, "question_id": "AKq3APAfEsrVSt9M3pw7QG", "question": "What other food is most likely to be sold here?", "choices": ["onion", "sausage", "radish", "strawberry"], "correct_choice_idx": 3, "direct_answers": ["vegetables", "rice", "fruits", "pears", "oranges", "bananas", "oranges", "vegetables", "strawberry", "fruits"], "difficult_direct_answer": false, "rationales": ["There are fruits visibly being sold which means other fruits may be sold in addition to the ones visible.", "Bananas and apples can be seen for sale at a market. most markets that sell sell strawberries.", "It is a fruit market."], "image": "train2014/COCO_train2014_000000061354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66800, "question_id": "AKwbhbbbDD2FBZn6L2VTZN", "question": "What is the elephant missing on its right side?", "choices": ["tusk", "tail", "beard", "toe"], "correct_choice_idx": 0, "direct_answers": ["tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk"], "difficult_direct_answer": false, "rationales": ["Elephants normally have two tusks and not one.", "The elephant only has one tusk.", "One is much shorter than the other"], "image": "val2014/COCO_val2014_000000066800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251751, "question_id": "AL5TYubAxMzCJqcMp27bno", "question": "What is the man who is using the laptop probably actually doing?", "choices": ["relaxing", "waiting", "gaming", "traveling"], "correct_choice_idx": 1, "direct_answers": ["watching something", "homework", "scrolling", "typing", "working", "typing", "typing", "checking email", "posing", "waiting"], "difficult_direct_answer": false, "rationales": ["The man is sitting in a waiting room.", "A man is sitting in a chair among a row of chairs in a brightly lit room with a laptop in his hands. people use their computers when the wait.", "Hard to tell based on the guys hand position. seems to have it over keys."], "image": "train2014/COCO_train2014_000000251751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256564, "question_id": "ALRj2PxJHmzFqkEkVucaNn", "question": "What is happening to this person?", "choices": ["repairing bike", "bike accident", "resting", "sunbathing"], "correct_choice_idx": 1, "direct_answers": ["lost control", "crashing", "falling", "bike accident", "falling", "falling over", "fell", "fell down", "becoming damp", "they fell"], "difficult_direct_answer": true, "rationales": ["The vehicle has tipped over and the rider is on the ground.", "The road is wet and muddy to which the man on his bike seemed to have fallen over and the man is on the floor.", "The person had an accident."], "image": "train2014/COCO_train2014_000000256564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272765, "question_id": "ALrvQpnVz9Gm65uRieTKkS", "question": "They are prepared for what phenomenon?", "choices": ["lightning", "earthquake", "rain", "tornado"], "correct_choice_idx": 2, "direct_answers": ["rain", "rain", "rain", "romantic", "storm", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["They have the umbrella open so they won't get wet.", "The people are sitting under an umbrella in case it rains.", "That's what the umbrellas were generally intended to be used for."], "image": "train2014/COCO_train2014_000000272765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537925, "question_id": "AMMvUApbFxp6XcjgavE5F6", "question": "If you needed to watch John Oliver on TV where would you patronize here?", "choices": ["skyliner motel", "pawn shop", "texaco", "gas station"], "correct_choice_idx": 0, "direct_answers": ["hotel", "motel", "hotel", "motel", "hotel", "skyliner motel", "skyliner motel", "motel", "motel", "motel"], "difficult_direct_answer": false, "rationales": ["Motels include television and cable as part of their amenities.", "A hotel advertises free hbo. john oliver is on hbo.", "You would patronize the skyliner motel on the side of the road."], "image": "train2014/COCO_train2014_000000537925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566395, "question_id": "AMPLHD7Rv4DWTGMgHL8KEX", "question": "Which car is stopped before white line?", "choices": ["both", "toyota", "neither", "cab"], "correct_choice_idx": 3, "direct_answers": ["taxi", "both", "both", "taxi", "cab", "taxi", "taxi", "both", "taxi", "taxi"], "difficult_direct_answer": false, "rationales": ["The taxi is right before the white line in front of the stoplight.", "The cab is stopped.", "A yellow taxicab is stopped before the white line."], "image": "train2014/COCO_train2014_000000566395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458481, "question_id": "AMUQhMY74QYZgLPe42x4AV", "question": "What is the weather like in the image above?", "choices": ["sunny", "stormy", "snowy", "rainy"], "correct_choice_idx": 0, "direct_answers": ["cloudy", "overcast", "overcast", "cloudy", "overcast", "sunny", "cloudy", "cloudy", "cloudy", "cloudy"], "difficult_direct_answer": false, "rationales": ["The sky is very clear.", "The is shade so the sun is shining.", "The weather is not sunny but not cloudy either."], "image": "val2014/COCO_val2014_000000458481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289821, "question_id": "ANJHEJsjx7RbAKBCgtSAZc", "question": "What genus is this animal?", "choices": ["equus", "algae", "plant", "bovidae"], "correct_choice_idx": 0, "direct_answers": ["zebra", "equus", "equus", "zebra", "equus", "huh", "zebra", "equus", "equus", "zebra"], "difficult_direct_answer": false, "rationales": ["Zebras are included in the genus equus along with horses.", "I knew that zebras belonged to the same genus as a horse.", "They are zebras."], "image": "train2014/COCO_train2014_000000289821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567121, "question_id": "ANN5FwHS2KqKyq4FTHCchB", "question": "What is a term used in this sport?", "choices": ["homerun", "fault", "goal", "touchdown"], "correct_choice_idx": 1, "direct_answers": ["tennis", "serve", "love", "strike", "volley", "serve", "serving", "fault", "tennis", "love"], "difficult_direct_answer": false, "rationales": ["She is playing tennis.", "A woman is playing tennis with her feet barely off the line of the court.", "A fault in tennis refers to a missed serve."], "image": "train2014/COCO_train2014_000000567121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428881, "question_id": "ANWZVS4rc9zdSSaLjNJKb7", "question": "Why are they wearing gloves?", "choices": ["warmth", "style", "costume", "protection"], "correct_choice_idx": 3, "direct_answers": ["catch ball", "playing baseball", "baseball", "catching", "catch", "baseball", "hand protection", "baseball", "catching", "protection"], "difficult_direct_answer": false, "rationales": ["They are catching balls. balls can be hard.", "Although these are used for baseball, gloves in general are used primarily for protection.", "It's to protect their hand from the ball, and makes it easier to catch them as well"], "image": "val2014/COCO_val2014_000000428881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417583, "question_id": "ANZ4WyXL3T8UwQCQH6figa", "question": "What type of conversation is he having?", "choices": ["cellular", "private", "in person", "group"], "correct_choice_idx": 0, "direct_answers": ["phone conversation", "chatting", "confusing", "mobile phone", "serious", "mobile", "cellphone conversation", "cell", "casual", "cellular"], "difficult_direct_answer": true, "rationales": ["The person is holding a phone to their ear. this is an action that is performed when one is having a cellular phone conversation.", "The person is holding an object to the ear and mouth consistent with how one might have a conversation using a device associated with answer a.", "The man has a cell phone up to his ear. people are talking on the phone when they have it up to their ear."], "image": "train2014/COCO_train2014_000000417583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417822, "question_id": "ANcz3NZFqhsU3vs5LEu3zC", "question": "Why are the men in yellow coming from the bottom of the plane?", "choices": ["unloading", "stealing", "riding", "repairing"], "correct_choice_idx": 0, "direct_answers": ["unloading luggage", "unloading", "plane", "fill tank", "they're fueling", "load luggage", "loading luggage", "loading luggage", "loading luggage", "compartment area"], "difficult_direct_answer": false, "rationales": ["There are passengers walking away from the plane which means it likely just landed. the men in question are wearing safety gear and have opened up the plane so they are likely airport workers who would be unloading if the plane just landed.", "That's the cargo hold and people are exiting the plane.", "That's the unloading area."], "image": "train2014/COCO_train2014_000000417822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270269, "question_id": "ANgTK9N92KrDFnQzYEs9ww", "question": "What does the bottom red light prohibit?", "choices": ["loitering", "crossing", "trading", "racing"], "correct_choice_idx": 1, "direct_answers": ["crossing", "walking", "crossing", "crossing", "crossing", "crossing", "walking", "walking", "walking", "walking bikes"], "difficult_direct_answer": false, "rationales": ["The red light keeps people from crossing the street.", "It tells you not to cross until it's green.", "Just as red means for cars to stop at an intersection, so does this sign for bikes and humans to not cross when red."], "image": "train2014/COCO_train2014_000000270269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78703, "question_id": "ANn5pXHWxFJgzLmWnZiSaT", "question": "What is the main means of getting around here?", "choices": ["train", "horses", "uber", "taxi"], "correct_choice_idx": 1, "direct_answers": ["horses", "horse", "horse", "horse", "horseback", "horse", "horse", "horses", "horses", "horseback"], "difficult_direct_answer": false, "rationales": ["The people in the background are riding stallions", "There are horses at the background.", "In the background of this photo there are a few horses. this could be implied as horses being the most popular means of transport here."], "image": "val2014/COCO_val2014_000000078703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408363, "question_id": "AP4kVprHTtyzDMXpYMxSof", "question": "What does the company that made the hats on the meter make?", "choices": ["subs", "pretzels", "donuts", "coffee"], "correct_choice_idx": 2, "direct_answers": ["doughnuts", "donuts", "doughnuts", "doughnuts", "doughnuts", "donuts", "signs", "doughnuts", "donuts", "advertisment"], "difficult_direct_answer": false, "rationales": ["Each hat has a krispy kreme, not wetzel's pretzels, subway, or starbucks, logo.", "The company has donuts.", "The hats say krispy kreme and they are known for making donuts."], "image": "val2014/COCO_val2014_000000408363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549101, "question_id": "AP6dYbdtcXyjeBC4iRTECE", "question": "Why is the pottery placed on the shelving?", "choices": ["to repair", "to sell", "to display", "to store"], "correct_choice_idx": 2, "direct_answers": ["display", "museum display", "display", "display", "display", "display", "to display", "old thinks", "museum", "storage"], "difficult_direct_answer": false, "rationales": ["The lighting and positioning", "The pottery has signs in front of it and is placed on shelves for display.", "The pottery is spaced out on the shelves for easy viewing."], "image": "train2014/COCO_train2014_000000549101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549255, "question_id": "APCDZeLAAUZSoBwSqUMGTT", "question": "What causes this woman's smokey eyes?", "choices": ["oversleeping", "mascara", "fighting", "forest fires"], "correct_choice_idx": 1, "direct_answers": ["crying", "makeup", "running makeup", "makeup", "eyeliner", "tired", "makeup", "mascara", "makeup", "drunk"], "difficult_direct_answer": false, "rationales": ["Makeup drips down from the eyes when someone has been crying", "The woman has runny mascara.", "Smokey eyes is an effect achieved through the use of a specific type of makeup so this woman's smokey eyes look must have been achieved by using mascara."], "image": "val2014/COCO_val2014_000000549255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305366, "question_id": "APXiyFadqwn6adW4CSrVHA", "question": "What color is the fruit located under the bananas?", "choices": ["pink", "red", "purple", "green"], "correct_choice_idx": 3, "direct_answers": ["yellow", "green", "green", "yellow", "green", "green", "green", "green", "green", "yellow"], "difficult_direct_answer": false, "rationales": ["The bananas are over the mangoes. the mangoes in this image are unripe.", "The limes are green which is the color they are supposed to be.", "It is green."], "image": "train2014/COCO_train2014_000000305366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339536, "question_id": "APzEH2hN4vMza9QAvwew5B", "question": "What type of buildings are in the background?", "choices": ["high rises", "cabanas", "bungalows", "low rises"], "correct_choice_idx": 0, "direct_answers": ["skyscrapers", "apartment buildings", "skyscrapers", "skyscrapers", "high rises", "skyscrapers", "tall buildings", "apartment", "apartments", "apartment buildings"], "difficult_direct_answer": false, "rationales": ["By the design and height you can tell what they are.", "Tall city buildings surround a busy street.", "The buildings in the background are many stories high and low rises, cabanas and bungalows typically have only one floor."], "image": "train2014/COCO_train2014_000000339536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330663, "question_id": "APzyjZyXRZiXkR2XnZrzTz", "question": "What is between the is the fence made of?", "choices": ["steel", "wood", "glass", "plastic"], "correct_choice_idx": 0, "direct_answers": ["iron", "giraffe", "aluminum", "steel", "metal", "metal air", "metal", "giraffe", "wood metal", "metal"], "difficult_direct_answer": false, "rationales": ["The giraffe is behind a fence made of metal at a zoo.", "The fence has to be sturdy to keep the animals in and is made of metal.", "The fence is metallic."], "image": "train2014/COCO_train2014_000000330663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337275, "question_id": "AQ65FMoE3SRRoQk2MtVe3o", "question": "Where do apples originate from?", "choices": ["australia", "europe", "england", "asia"], "correct_choice_idx": 3, "direct_answers": ["kazakhstan", "america", "trees", "asia", "asia", "trees", "asia", "trees", "asia", "kazakhstan"], "difficult_direct_answer": false, "rationales": ["Apples first came from kazakhstan which is in central asia.", "Apple trees began in asia.", "They are originally from kazakhstan"], "image": "val2014/COCO_val2014_000000337275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145135, "question_id": "AQDoHyzrq7v6ofXcUKiPRJ", "question": "What is the white object in the ground in front of the animals?", "choices": ["stone", "snow", "moss", "home plate"], "correct_choice_idx": 0, "direct_answers": ["gravel", "rocks", "rocks", "rock", "boulder", "stone", "concrete", "rock", "stone", "stone"], "difficult_direct_answer": false, "rationales": ["A flat gray object is in the dirt near some animals.", "Sheep are walking in a grassy area and some smooth, shiny, gray objects are in the dirt in front of them.", "There is stones in front of them."], "image": "train2014/COCO_train2014_000000145135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575280, "question_id": "AQFkekaGzzkfaocmWmJWBY", "question": "The small person here learns how to do what?", "choices": ["ski", "boogie board", "sail", "paddle board"], "correct_choice_idx": 3, "direct_answers": ["paddle board", "surf", "surf", "paddle board", "paddleboard", "surf", "surf", "surf", "paddleboard", "paddle"], "difficult_direct_answer": false, "rationales": ["When there is a ore being used while on a surfing board, it is paddle boarding.", "A boy is standing on a paddle board with an older person standing alongside.", "The person is on a paddle board."], "image": "train2014/COCO_train2014_000000575280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460118, "question_id": "AQWabeL7pgVRAbaU7ddYot", "question": "What are dwellings made of here?", "choices": ["grass", "sand", "wood", "stone"], "correct_choice_idx": 3, "direct_answers": ["dirt", "cave", "stone", "wood", "stone", "rock", "clay", "adobe", "rock", "clay"], "difficult_direct_answer": false, "rationales": ["The dwellings are inside mountains that were carved out.", "You can see houses carved into the rock.", "There are stones around the sheep."], "image": "train2014/COCO_train2014_000000460118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365132, "question_id": "AQXWugEir6XtCdTx8FLKS7", "question": "What do people walking on the sand leave behind with their every step?", "choices": ["footprints", "dirt", "salt", "sunscreen"], "correct_choice_idx": 0, "direct_answers": ["footprint", "footsteps", "footprints", "footsteps", "footprints", "footprints", "sand sinks", "footstep", "footprint", "footprints"], "difficult_direct_answer": false, "rationales": ["The people leave footprints.", "Sand is impressionable and the people have weight.", "The ground is not solid and people's feet sink into it."], "image": "val2014/COCO_val2014_000000365132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409100, "question_id": "AQcNonHjbRd73XMuccjWSR", "question": "The man looks like he is headed to what kind of job?", "choices": ["sanitation", "rodeo", "circus", "office"], "correct_choice_idx": 3, "direct_answers": ["restaurant manager", "accounting", "office", "salesman", "office", "office", "insurance salesman", "office", "office", "office"], "difficult_direct_answer": false, "rationales": ["He looks like he is going to work.", "People often wear a white suit with a tie in a office.", "People with professional jobs dress professionally"], "image": "val2014/COCO_val2014_000000409100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316441, "question_id": "AQsXA22XqXf7nccLDXzQtF", "question": "Why is the man kneeling on the ground?", "choices": ["he fell", "dancing", "praying", "repairing something"], "correct_choice_idx": 3, "direct_answers": ["sleeping", "working", "working", "praying", "looking", "drunk", "fixing", "working", "repairing something", "praying"], "difficult_direct_answer": false, "rationales": ["The man has tools with him and appears to be working on the street drain.", "He's repairing something in a panel in the sidewalk.", "The man looks to be in some kind of workers uniform and has a bag next to him and a hole open in front of him. this is not an activity common people would do so he is likely there for a purpose and reaching where he is reaching would probably not be done unless there was something that needed repair."], "image": "train2014/COCO_train2014_000000316441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464430, "question_id": "AQvhvaPYRbPgApvKsACYyZ", "question": "What is the name of this game?", "choices": ["badminton", "golf", "soccer", "cricket"], "correct_choice_idx": 0, "direct_answers": ["tennis", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis", "badminton", "tennis"], "difficult_direct_answer": false, "rationales": ["Badminton is played with racquets on a court.", "The name is badminton.", "Two people are on a court with a net in the middle. badminton is played on a court with a net."], "image": "train2014/COCO_train2014_000000464430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265553, "question_id": "AQxfLcsby3tBwPEKQ69rek", "question": "What is this kid playing in?", "choices": ["football", "little guys", "little league", "gone"], "correct_choice_idx": 2, "direct_answers": ["little league", "sand", "sand", "sweatpants", "tee ball", "ball", "t-ball", "tee-ball", "baseball game", "baseball"], "difficult_direct_answer": true, "rationales": ["The kid is playing on a little league team.", "The kid is in little league.", "The kid is playing in little league with a ball stand."], "image": "train2014/COCO_train2014_000000265553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528458, "question_id": "AR5ToQBrricRWSB2gzSv9J", "question": "Why do the horses run?", "choices": ["water ahead", "escape danger", "food ahead", "racing"], "correct_choice_idx": 3, "direct_answers": ["racing", "race", "race", "to race", "race", "racing", "race", "race", "racing", "racing"], "difficult_direct_answer": false, "rationales": ["The horses and their riders have uniforms on consistent with the activity of answer a as well as the surface they are on.", "The horses run because they are racing.", "The riders all have uniforms on and the horses are on a track."], "image": "val2014/COCO_val2014_000000528458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330220, "question_id": "AR5vumfbzmVriMoEAb9wen", "question": "What type of egg dish is shown?", "choices": ["fu yung", "benedict", "scrambled", "omelette"], "correct_choice_idx": 1, "direct_answers": ["eggs benedict", "eggs benedict", "eggs benedict", "eggs benedict", "over easy", "egg burger", "eggs benedict", "benedict", "eggs benedict", "over easy"], "difficult_direct_answer": false, "rationales": ["Only one of the choices are eggs that have hollandaise sauce.", "The eggs are used in a sandwich.", "White eggs with a sauce over them is on a bread on a plate."], "image": "train2014/COCO_train2014_000000330220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76525, "question_id": "ARAKu9XZJ7TMsZHRbTbjDf", "question": "What is the yellow can being placed in?", "choices": ["pot", "shoe", "refrigerator", "desk drawer"], "correct_choice_idx": 0, "direct_answers": ["pot", "pot", "pot", "coffe", "pot", "pot", "cooking pot", "water", "water", "pot"], "difficult_direct_answer": false, "rationales": ["It is a tall metal cylinder that is used for cooking on a stove top.", "That type of vessel is used for canning.", "It's being placed in a pot of hot water to seal the lid on the jar for canning."], "image": "val2014/COCO_val2014_000000076525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460353, "question_id": "ARQ7QV75FtBdgbiwV8BrPu", "question": "Which fruit has the special offer?", "choices": ["grapes", "apples", "pears", "bananas"], "correct_choice_idx": 3, "direct_answers": ["banana", "banana", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "bananas", "banana"], "difficult_direct_answer": false, "rationales": ["The sign states what was on sale and the type.", "The bananas has a sign that they are 2 for 1.", "The sign states what fruit is on special offer."], "image": "train2014/COCO_train2014_000000460353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578974, "question_id": "ARVWSAStHRDWaJsKFjxRtE", "question": "What activity is the person doing?", "choices": ["eating", "baking", "painting", "driving"], "correct_choice_idx": 1, "direct_answers": ["cooking", "baking", "cooking", "baking", "baking", "baking", "baking", "baking", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["The person is standing near the oven which is baking bread.", "The woman is sitting in front of the oven. you can see something cooking in the oven.", "There is a pan of food in the oven."], "image": "val2014/COCO_val2014_000000578974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263278, "question_id": "ARhAFYXcbce7XZ8PxHWJBL", "question": "To what is the string connected that is held by the Man?", "choices": ["fish", "girl", "nothing", "surf board"], "correct_choice_idx": 3, "direct_answers": ["surf board", "surfboard", "tether", "child", "surfboard", "surfer", "surfboard", "surfboard", "surfboard", "board"], "difficult_direct_answer": false, "rationales": ["The cable is used to keep from losing the surfboard while surfing.", "It's connected to the board so he will not lose it in the waves", "It is a long board that can be ridden in the ocean."], "image": "train2014/COCO_train2014_000000263278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568559, "question_id": "ARixomFL25wuhUYX9GbVtv", "question": "What emotion does the cat appear to be expressing?", "choices": ["disgust", "love", "excitement", "surprise"], "correct_choice_idx": 3, "direct_answers": ["surprise concern", "fear", "surprise", "shock", "surprise", "surprise", "fright", "scared", "surprise", "mad"], "difficult_direct_answer": false, "rationales": ["The cat's eyes are open extra wide.", "The eyes are very wide open", "The cat's eyes appear wide and its ears are alert."], "image": "val2014/COCO_val2014_000000568559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391891, "question_id": "ARjpxJXxzQXF4RKR2aaYjU", "question": "What meal is shown here?", "choices": ["lunch", "dinner", "midnight snack", "brunch"], "correct_choice_idx": 3, "direct_answers": ["breakfast", "breakfast", "breakfast", "brunch", "scrambled eggs", "scrambled eggs", "breakfast", "scrambled eggs", "breakfast", "breakfast"], "difficult_direct_answer": false, "rationales": ["The food is typical of that served at breakfast, but it is at a restaurant, where brunch is a popular meal served.", "Scrambled eggs and bacon are foods that are most often served as the first meal of the day. since breakfast was not an option, i chose the next earliest meal which combines breakfast and lunch dishes.", "People usually eat things like eggs, bacon, and toast earlier in the day."], "image": "val2014/COCO_val2014_000000391891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410366, "question_id": "AS8b5uKeHwCWj9WYBq9MH3", "question": "What setting is this photo located in?", "choices": ["city park", "campground", "basement", "beach"], "correct_choice_idx": 1, "direct_answers": ["beach", "play field", "park", "outside", "night", "outdoors", "park", "park", "woods", "campground"], "difficult_direct_answer": false, "rationales": ["The setting is a campground.", "The kids are playing outdoors with frisbees.", "The children are seen in the picture which show poor in the camp."], "image": "train2014/COCO_train2014_000000410366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26260, "question_id": "ASAUnvnSvX69fBgfi9yeYL", "question": "What species animal died so that this meal could be prepared?", "choices": ["capra", "feline", "canine", "bovine"], "correct_choice_idx": 3, "direct_answers": ["bovine", "cow", "cow", "cow", "cow", "cow", "cow", "cow", "cow", "cow"], "difficult_direct_answer": false, "rationales": ["The people are eating beef. it comes from cows, not dogs, cats, or goats.", "It is steak, which comes from a cow which is a member of the bovine species.", "This meal is being prepared out of steak."], "image": "train2014/COCO_train2014_000000026260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124013, "question_id": "ASKRQhpEYQvHS33qfQtYhN", "question": "What color is the person who is responsible for the safety of the two girls on horseback wearing?", "choices": ["teal", "black", "pink", "blue"], "correct_choice_idx": 0, "direct_answers": ["green", "blue", "teal", "white", "green", "yellow", "teal", "blue", "teal", "blue"], "difficult_direct_answer": false, "rationales": ["An older adult is walking beside a horse as two young girls ride the horse.", "A person is leading a horse with two girls riding on it. the person is wearing a greenish, blue shirt.", "The uniform is teal in colour."], "image": "val2014/COCO_val2014_000000124013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85398, "question_id": "ASP2TeAiVrwvePq2pAGVQu", "question": "Why is the man on the counter?", "choices": ["to paint", "to surprise", "to hide", "to rest"], "correct_choice_idx": 0, "direct_answers": ["painting", "to paint", "painting wall", "painting", "working", "painting walls", "renovation", "painting", "painting ceiling", "kitchen"], "difficult_direct_answer": false, "rationales": ["He has a paint bucket in his hand.", "The man is painting.", "The man is standing on the counter to fix and upgrade the ceiling."], "image": "train2014/COCO_train2014_000000085398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254046, "question_id": "ASc9Fc3JwaHXLpaFywmRM9", "question": "Which numbered biker seems to be leading the pack?", "choices": ["89", "96", "66", "99"], "correct_choice_idx": 3, "direct_answers": ["ninety nine", "ninety nine", "99", "99", "99", "ninety nine", "99", "99", "99", "ninety nine"], "difficult_direct_answer": false, "rationales": ["The bike in the front indicates 99 on it.", "The greatest two digit number is the number the motorcycle in the lead.", "99 is leading the pack."], "image": "train2014/COCO_train2014_000000254046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557636, "question_id": "ASgBLyFrghegtSqSXKRpDD", "question": "What are the poles helping the man on the right do?", "choices": ["stand", "flip", "spin", "clean"], "correct_choice_idx": 0, "direct_answers": ["ski", "stand up", "stand", "ski", "balance", "balance", "stand", "pulling", "balance", "lean"], "difficult_direct_answer": false, "rationales": ["The man on the right is stationary. he is not performing tricks.", "The man on the right is stationary and is not attempting to do a trick. he is not cleaning anything.", "The man on the right is using the poles to help him stand up."], "image": "train2014/COCO_train2014_000000557636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562628, "question_id": "ASp4QR9MiG6W7vtaSK5nKc", "question": "What probably wrote on the largest white surface?", "choices": ["ballpoint pen", "mouse", "marker", "pencil"], "correct_choice_idx": 2, "direct_answers": ["marker", "employee", "dry erase", "worker", "dryerase marker", "worker's marker", "marker", "human", "marker", "notes"], "difficult_direct_answer": false, "rationales": ["A special pen that is made for white boards.", "The writing on the board was done with a marker.", "There is some black writing on the whiteboard."], "image": "val2014/COCO_val2014_000000562628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275496, "question_id": "ASpcgHLCgEJVdU2P4SBWcP", "question": "The brown ground is made of what material?", "choices": ["carpet", "wood", "ceramic", "cement"], "correct_choice_idx": 1, "direct_answers": ["cement", "wood", "cement", "wood", "wood", "wood", "wood", "wood", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["The brown ground comes from wooden planks.", "Skateboarders perform on the hardest surface possible.", "There is slats of wood."], "image": "val2014/COCO_val2014_000000275496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110108, "question_id": "ATQxAqUC6XD6jzrfXEjKcD", "question": "What is the giant fish in the air?", "choices": ["blimp", "balloon", "kite", "sculpture"], "correct_choice_idx": 2, "direct_answers": ["flying kite", "kite", "koi", "beta fish", "kite", "kite", "kite", "kite", "kite", "kite"], "difficult_direct_answer": false, "rationales": ["The giant fish is flying like a kite.", "The fish is being flown as a kite.", "The giant fish is made of a light weight material connected to string."], "image": "train2014/COCO_train2014_000000110108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529772, "question_id": "ATRN2gMpcWsvzsE3U7rXgb", "question": "The player with the number 51 jersey plays for what team?", "choices": ["phillies", "galaxy", "patriots", "yankees"], "correct_choice_idx": 0, "direct_answers": ["red socks", "opposing", "cardinals", "cardinals", "phillies", "red", "red team", "nationals", "red sox", "red sox"], "difficult_direct_answer": false, "rationales": ["The philadelphia phillies team colors are grey and red.", "The yankees wear pinstripes and the patriots and galaxy don't even play baseball.", "The person in the red number 51 jersey is a baseball, not football or soccer, player. the yankees do not wear red uniforms."], "image": "train2014/COCO_train2014_000000529772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18683, "question_id": "ATTYqrFY7JT4AjyDDemnnj", "question": "What did the child do which is displayed by her?", "choices": ["read text", "made call", "took call", "took photo"], "correct_choice_idx": 3, "direct_answers": ["take photo", "picture", "photographed sibling", "take photo", "took picture", "took photo", "picture taking", "took photo", "cell phone", "photograph"], "difficult_direct_answer": false, "rationales": ["The child took a photo with the camera.", "The child is showing off a photo on the phone.", "It is a smart phone with a camera"], "image": "train2014/COCO_train2014_000000018683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213009, "question_id": "ATnC86eUhMzcjoGnrVKCLS", "question": "Why does the kneeling man crouch low?", "choices": ["catch ball", "he's tired", "wave hello", "clean base"], "correct_choice_idx": 0, "direct_answers": ["catcher", "catch ball", "catch ball", "catch ball", "catch ball", "catch ball", "catcher", "catch ball", "catcher", "catch ball"], "difficult_direct_answer": false, "rationales": ["The man is the catcher and the ball is thrown low.", "The man is catching the ball.", "This is because the ball drops as it crosses the plate"], "image": "train2014/COCO_train2014_000000213009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185091, "question_id": "AU9oh7vwqN6W6PEc3czRBR", "question": "Which vegetable is reddest here?", "choices": ["squash", "onion", "eggplant", "bell pepper"], "correct_choice_idx": 3, "direct_answers": ["pepper", "pepper", "pepper", "red pepper", "pepper", "tomato", "bell pepper", "pepper", "bell pepper", "pepper"], "difficult_direct_answer": false, "rationales": ["The only red veggie is a red bell pepper", "Bell peppers are more red than green squash and white cheese and onion.", "That's definitely a bell pepper."], "image": "train2014/COCO_train2014_000000185091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400571, "question_id": "AUMBUmEAddvUmQaHJ2wMZy", "question": "Which item contains a lot of potassium?", "choices": ["rice", "banana", "meat", "wine"], "correct_choice_idx": 1, "direct_answers": ["banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["The fruit is a good source of potassium.", "The items in the bowl are clearly visible and identifiable based on their size, shape and color. answer a is commonly known to have high potassium.", "The other options don't contain as much as a single a."], "image": "train2014/COCO_train2014_000000400571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234136, "question_id": "AUMfS6YTrPe5CYR2Jo7jTu", "question": "What is touching the dog?", "choices": ["cat's paw", "man's hand", "bear's claw", "woman's hand"], "correct_choice_idx": 1, "direct_answers": ["hat man", "man's hand", "human", "hand", "man", "human", "hand", "hand", "man's hand", "hand"], "difficult_direct_answer": false, "rationales": ["A man's hand is draped around the dog, hugging it.", "The mans hand is on the neck of the dog.", "The man with the hat has his hand on the dog and is touching it."], "image": "val2014/COCO_val2014_000000234136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471339, "question_id": "AUaoA5wKJnYyP92ZGQ2znT", "question": "Which train is safest to board for those on our right?", "choices": ["near arriving", "gray furthest", "none", "any"], "correct_choice_idx": 0, "direct_answers": ["right", "distant train", "right", "right", "right", "yellow", "near arriving", "right", "left", "farther back"], "difficult_direct_answer": false, "rationales": ["They will be safe if they board the one that is incoming and closest to the loading dock that they are standing on.", "Train stations are built for passengers to board the train nearest the platform. those standing on the right would board the nearest train to them.", "The train is close to arriving."], "image": "train2014/COCO_train2014_000000471339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539235, "question_id": "AUkNGmZ5VyJnkwTw3mVjJp", "question": "Why are the bananas hung up on poles?", "choices": ["to dry", "to sell", "to trade", "to decorate"], "correct_choice_idx": 1, "direct_answers": ["selling bananas", "sale", "fresh longer", "to ripen", "to sell", "to sell", "sales display", "keep fresh", "ripen", "for sale"], "difficult_direct_answer": true, "rationales": ["There are there so customers can buy them.", "The bananas are hanging in a market to be sold.", "The bananas are being sold at a store."], "image": "train2014/COCO_train2014_000000539235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80541, "question_id": "AUwgMNZbe8Lt8MyaaLexjB", "question": "Where does he like to play?", "choices": ["beach", "mountain", "tundra", "desert"], "correct_choice_idx": 0, "direct_answers": ["ocean", "ocean", "beach", "surf", "beach", "ocean", "ocean", "beach", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["There is a surfboard, not skis, a dune buggy, or a snowmobile, near the man.", "The man is sitting near a surfboard and he probably likes to take it to the beach.", "The man is surrounded by surfboards and surfboards are used the beach, so the beach must be where he likes to \"play.\"."], "image": "train2014/COCO_train2014_000000080541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558794, "question_id": "AV9bhLdURFwQjBshpBLScF", "question": "Kunst-Wet is a Brussels metro station located in which country?", "choices": ["germany", "uk", "france", "belgium"], "correct_choice_idx": 3, "direct_answers": ["belgium", "belgium", "belgium", "belgium", "belgium", "belgium", "belgium", "belgium", "belgium", "usa"], "difficult_direct_answer": false, "rationales": ["It's the capital of this country.", "Brussels is in belgium.", "Brussels is in the country of belgium."], "image": "val2014/COCO_val2014_000000558794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510314, "question_id": "AVBy6qghhy6T5QMovpPdmt", "question": "Which one of these would one expect to find in this building?", "choices": ["paintings", "airplanes", "beds", "fossils"], "correct_choice_idx": 0, "direct_answers": ["paintings", "paintings", "artwork", "artifacts", "paintings", "art", "paintings", "artwork", "museum", "painting"], "difficult_direct_answer": false, "rationales": ["The museum is labeled on the outside as an art museum which might likely contain answer a.", "The building is dedicated to art.", "It's an art museum."], "image": "train2014/COCO_train2014_000000510314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279451, "question_id": "AWGpSinPSAPg85KhQZHVyq", "question": "What is the official language of this airline's country?", "choices": ["japanese", "chinese", "korean", "russian"], "correct_choice_idx": 0, "direct_answers": ["english", "japanese", "japanese", "japanese", "japanese", "japanese", "japanese", "english", "japanese", "english"], "difficult_direct_answer": false, "rationales": ["The plane says the country of japan and they speak japanese.", "People speak japanese in japan.", "The airplane is owned by japan airline and their official language is japanese."], "image": "train2014/COCO_train2014_000000279451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552654, "question_id": "AWQR6jPKwYakY5hGdUkyNi", "question": "This man is standing on what?", "choices": ["ski", "ladder", "chair", "stilt"], "correct_choice_idx": 1, "direct_answers": ["snow", "skis", "snow place", "snowboard", "skate board", "snowboard", "snowboard", "ladder", "skii", "snowboard"], "difficult_direct_answer": false, "rationales": ["The man is on a ladder.", "The man is standing on a ski on one leg. the item is thin and connects with one foot like a ski.", "The man is standing on a snowboard."], "image": "val2014/COCO_val2014_000000552654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491947, "question_id": "AWm5FRJ87CcrgXU2Xhzg4X", "question": "What is the lap top controlling here?", "choices": ["music", "weather", "nothing", "red bull"], "correct_choice_idx": 0, "direct_answers": ["music", "speakers", "camera", "music", "music", "music", "music", "video", "music", "music"], "difficult_direct_answer": false, "rationales": ["They look like they are at a party and playing music", "It's part of a do set up.", "The younger man has the laptop connected to a speaker behind him which he is most likely a dj."], "image": "val2014/COCO_val2014_000000491947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93601, "question_id": "AXCrGbTS2LbxKefpRuuCwr", "question": "Why are the cinnamon buns white?", "choices": ["food coloring", "ranch dressing", "mayonnaise", "icing"], "correct_choice_idx": 3, "direct_answers": ["frosting", "icing", "uncooked", "icing", "sugar", "uncooked", "glaze", "frozen", "icing", "iced"], "difficult_direct_answer": false, "rationales": ["The white color on top of these uncooked buns is icing.", "The coating on top is sweet.", "The cinnamon buns are covered in frosting."], "image": "val2014/COCO_val2014_000000093601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190547, "question_id": "AXDuczyZGdisdqT3dRtSKV", "question": "What keeps water from splashing out of the tub?", "choices": ["accordion door", "single door", "shower curtain", "sliding door"], "correct_choice_idx": 2, "direct_answers": ["shower curtain", "shower curtain", "shower curtain", "shower curtain", "shower curtain", "shower curtain", "bathtub", "shower curtain", "shower curtain", "bathtub"], "difficult_direct_answer": false, "rationales": ["To keep the floor from getting wet when showering, the shower curtain is closed.", "The curtain keeps from splashing.", "A shower curtain keeps water from splashing out of the tub."], "image": "train2014/COCO_train2014_000000190547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333480, "question_id": "AXPGQ8G8rW6Wdmov4vZNC6", "question": "What is directly under the green item here?", "choices": ["bacon", "beans", "tomato", "pepper"], "correct_choice_idx": 2, "direct_answers": ["tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["The standard makeup of a hamburger is lettuce, tomato, pickles, and onion.", "A red tomato is under it.", "It is a slice of a red vegetable."], "image": "val2014/COCO_val2014_000000333480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124765, "question_id": "AXUtTBQyMva8WjKkcUnkeG", "question": "What is the name of the company the truck belongs to?", "choices": ["lantern", "howard", "lincoln", "apple"], "correct_choice_idx": 0, "direct_answers": ["lantern", "lantern", "lantern", "lantern", "lantern", "lantern", "lantern", "lantern", "lantern", "lantern"], "difficult_direct_answer": false, "rationales": ["It's written on the front.", "The name is lantern.", "That is the name of the company."], "image": "train2014/COCO_train2014_000000124765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25433, "question_id": "AXmmKs7faEvpUbx35EUBXQ", "question": "What are these banners promoting?", "choices": ["dance halls", "museums", "open houses", "church"], "correct_choice_idx": 1, "direct_answers": ["museums", "animals", "kids toys", "church", "museum exhibits", "unclear", "exhibition", "events", "church", "unknown"], "difficult_direct_answer": true, "rationales": ["The banners promote museums.", "The banners are promoting museums in the city.", "These banners are promoting museum exhibitions."], "image": "val2014/COCO_val2014_000000025433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195916, "question_id": "AXwJNxk5RyUkFTpQQ5e4Wy", "question": "Which way is illegal to turn on the upcoming cross street?", "choices": ["right", "straight", "none", "left"], "correct_choice_idx": 0, "direct_answers": ["right", "right", "right", "red light", "right", "red light", "right", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["One way street to left", "It is illegal to turn right because there is a sign that says it is a one way street.", "There is a sign showing the upcoming cross street is a one-way street requiring vehicles to only turn to the left."], "image": "train2014/COCO_train2014_000000195916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345883, "question_id": "AXx2TmRLGBry8cT3VsDaXL", "question": "What type of service does this vehicle provide?", "choices": ["dentistry", "gas", "energy", "food"], "correct_choice_idx": 3, "direct_answers": ["food", "food", "food", "transportation", "food truck", "food", "good", "food", "food service", "food"], "difficult_direct_answer": false, "rationales": ["Trucks with this silver material on the side are often food trucks that show up at factories.", "This looks like a typical food truck.", "The vehicle is known as a food truck."], "image": "train2014/COCO_train2014_000000345883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568023, "question_id": "AXy8HNPkjKonLE88FyyuXS", "question": "What brand is the motorcycle?", "choices": ["yamaha", "harley", "honda", "suzuki"], "correct_choice_idx": 0, "direct_answers": ["yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha"], "difficult_direct_answer": false, "rationales": ["A motorcycle has a brand logo on it.", "The red sign near the top right indicates the brand.", "The brand name is on the wall"], "image": "train2014/COCO_train2014_000000568023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350235, "question_id": "AYfERjdLivwm52kGEvvTW7", "question": "The men here are aiming for what type goal to score in?", "choices": ["hole", "soccer goal", "base", "basketball net"], "correct_choice_idx": 3, "direct_answers": ["basketball", "basketball hoop", "basketball net", "basketball net", "basket ball", "basket", "basketball", "basketball", "basket", "basket"], "difficult_direct_answer": false, "rationales": ["The people are playing basketball. you can see the bottom of the net on the left side at the top.", "One of the men has the ball for the sport.", "They are playing a game of basketball"], "image": "train2014/COCO_train2014_000000350235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309539, "question_id": "AYgXJAHvw6ao8wJ6jgHJS2", "question": "What sort of utensil will the diner use to eat this slice?", "choices": ["fork", "spoon", "none", "knife"], "correct_choice_idx": 2, "direct_answers": ["fork", "none", "fork", "hands", "fork", "fork", "hands", "fork", "fork", "hands"], "difficult_direct_answer": false, "rationales": ["Most people pick up a piece of pizza to eat it.", "The utensil is the hands.", "Usually, a pizza-eater will eschew cutlery in favor of the hands-on approach. since americans consume about 3 billion pizzas a year, this will result in more hand-washing, but much less dish-washing!"], "image": "val2014/COCO_val2014_000000309539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493741, "question_id": "AYgwA5B7LR3wo4WFYBzNmZ", "question": "What is the man wearing?", "choices": ["goggles", "bandana", "potato sack", "garbage bag"], "correct_choice_idx": 0, "direct_answers": ["ski goggles", "goggles", "protective things", "ski jacket", "winter coat", "googles", "ski parka", "ski suit", "winter jacket", "goggles"], "difficult_direct_answer": true, "rationales": ["He has the eye covering his eyes so he can see in the sun and does not get anything in his eyes.", "The man has goggles.", "The man has goggles for eye protection on."], "image": "train2014/COCO_train2014_000000493741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552446, "question_id": "AYj5naZu8eE3Fg7YhEWyeG", "question": "The man wearing what color of shirt enforces the rules of the game?", "choices": ["grey", "white", "red", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black", "white", "black", "black", "grey", "black", "umpire"], "difficult_direct_answer": false, "rationales": ["An umpire is behind the plate on a baseball diamond. umpires wear dark colored uniforms.", "This is the uniform for umpires in baseball", "The umpire is the rule enforcer on a baseball field and they usually are behind the catcher wearing a different color than the two opposing teams."], "image": "train2014/COCO_train2014_000000552446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64710, "question_id": "AYrsEjR4HQevzQ9KX9W33T", "question": "What is the man using his dogs for on this hike?", "choices": ["retrieving", "hunting", "hauling", "herding"], "correct_choice_idx": 2, "direct_answers": ["to carry", "carrying supplies", "carry items", "pack dogs", "carrying equipment", "carrying stuff", "carrying", "hauling", "luggage", "carrying provisions"], "difficult_direct_answer": true, "rationales": ["Though all answers are plausible, the dogs are packed with supplies wearing a backpack.", "It appears there is a pack attached to the dogs. attaching a pack to something would be a way to haul something.", "Two dogs are with their owner. they are carrying luggage on their back to help the man."], "image": "val2014/COCO_val2014_000000064710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108500, "question_id": "AZAU2FscJWkhMjp7gUdY4d", "question": "Why does the large truck stop here?", "choices": ["fixing vehicle", "traffic", "truck broken", "to advertise"], "correct_choice_idx": 1, "direct_answers": ["make delivery", "delivery", "delivery", "delivery", "unload items", "traffic", "traffic", "deliver goods", "storefront", "deliveries"], "difficult_direct_answer": false, "rationales": ["The scene depicts heavy traffic so it's easy to discern the answer.", "The truck is in traffic.", "A busy street is shown with multiple cars and a large truck in the middle."], "image": "train2014/COCO_train2014_000000108500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534543, "question_id": "AZGMpPczji2gHotFtDApRn", "question": "What is in front of the children?", "choices": ["eggs", "plates", "apples", "watermelons"], "correct_choice_idx": 1, "direct_answers": ["dessert", "cake", "cake", "cake", "cake slice", "food", "cake", "cake", "cake", "plates"], "difficult_direct_answer": false, "rationales": ["One can see the discardable serving trays with cake on them.", "There are paper plates in front of the kids.", "Children sit at a table with round paper plates in front of them with cake on them."], "image": "train2014/COCO_train2014_000000534543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82565, "question_id": "AZPoQdW8CVrqBCfom2Shfc", "question": "When travelling this road for safety what should you allow to cross first?", "choices": ["crickets", "cars", "sheep", "flies"], "correct_choice_idx": 2, "direct_answers": ["cows", "sheep", "animals", "goat", "sheep", "sheep", "sheep", "sheep", "cows", "animals"], "difficult_direct_answer": false, "rationales": ["The sheep have to cross before anyone else can.", "There are no other cars, and insects would not be a safety issue. there are white animals crossing this road.", "There is a herd in the road and more approaching"], "image": "train2014/COCO_train2014_000000082565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5131, "question_id": "AZhAsWuwkoXLsK6eyaKT8k", "question": "What sort of vessel occupationally binds the people marching here?", "choices": ["bikes", "plane", "water craft", "golf cart"], "correct_choice_idx": 1, "direct_answers": ["airplane", "airplane", "airplane", "airplane", "plane", "plane", "government", "airplane", "military", "airplanes"], "difficult_direct_answer": false, "rationales": ["Traditionally people who wear this uniform are pilots of commercial airplanes.", "The people are dressed in pilot uniforms and are wearing other pilot accessories.", "The people marching in the parade are airplane pilots by their jackets and stripes."], "image": "train2014/COCO_train2014_000000005131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493471, "question_id": "Aa28m6sXGXwgmise77TSb8", "question": "What type of food do this animal drink?", "choices": ["juice", "milk", "tea", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["This animal drinks water and is a pig.", "Traditionally nearly all creatures of this world drink water.", "The animal needs water."], "image": "train2014/COCO_train2014_000000493471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363581, "question_id": "AaGVBxNhfx4fZKcVXJQ6UD", "question": "What material is the pizza plate made of?", "choices": ["plastic", "silver", "silicon", "ceramic"], "correct_choice_idx": 1, "direct_answers": ["tomato", "paper", "red beries", "dough", "metal", "silver", "cheese", "flour", "ceramic", "ceramic"], "difficult_direct_answer": true, "rationales": ["The plate below the pizza is shiny and silver in color.", "It is metallic and conducts heat in order to keep the food warm.", "Large thick pizza has a spatula to the side and is sitting on a large shiny metal pan."], "image": "val2014/COCO_val2014_000000363581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90359, "question_id": "AaRdtzDfsGUPvaBoTYzUYJ", "question": "How do the two old people feel?", "choices": ["angry", "scared", "entertained", "frustrated"], "correct_choice_idx": 2, "direct_answers": ["entertained", "curious", "interested", "terrific", "happy", "jealous", "relaxed", "good", "curious", "cold"], "difficult_direct_answer": true, "rationales": ["Two elderly people are walking along a boardwalk. the elderly people are watchin a guy do a bike trick.", "The old people are happy with the tricks.", "The people are entertained."], "image": "train2014/COCO_train2014_000000090359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222751, "question_id": "AamFqk8UP2qDvuZV6DX5TA", "question": "What is the man in the red helmet about to do?", "choices": ["kick", "jump", "sit", "run"], "correct_choice_idx": 3, "direct_answers": ["run home", "playing", "run", "stealing base", "run", "run", "run", "steal base", "run", "run"], "difficult_direct_answer": false, "rationales": ["The man in the red helmet is about to run off base.", "His stance says he's about to run towards the other guy", "The man is running."], "image": "train2014/COCO_train2014_000000222751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2295, "question_id": "AbJ2XkzM3NMReAiGqSQrzm", "question": "What is the most popular type of yarn?", "choices": ["goat hair", "rayon", "wool", "silk"], "correct_choice_idx": 2, "direct_answers": ["wool", "red heart", "yarn", "cotton", "cotton", "wool", "fluffy", "marino", "wool", "wool"], "difficult_direct_answer": false, "rationales": ["The most popular and used the most would be wool", "A cat is playing with a ball of yarn. yarn is often made from wool.", "The popular type is wool."], "image": "val2014/COCO_val2014_000000002295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563270, "question_id": "AbRpmgDf68vfcLMEjrhffr", "question": "What is the woman in green doing?", "choices": ["eating", "walking", "sitting", "leaning"], "correct_choice_idx": 3, "direct_answers": ["watching", "leaning", "leaning", "leaning", "leaning", "waiting", "talking", "make inquiry", "leaning", "leaning"], "difficult_direct_answer": false, "rationales": ["Both hands are resting on the desk, and when weight is dispersed in this way with the body protruding forward, it is called leaning.", "There is only one person visible who is female and wearing green and based on the hand placement of the object in front of her, the stance is known as answer a.", "Both the woman's arms are on the table and she is leaned forward."], "image": "train2014/COCO_train2014_000000563270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142826, "question_id": "AbZVeq93fCK42tvzyqVc6o", "question": "What do the umbrellas offer those who sit here?", "choices": ["shade", "warmth", "heat", "radiation protection"], "correct_choice_idx": 0, "direct_answers": ["shade", "protection", "shade", "shade", "shade", "shade", "shade", "shade", "shade", "shade"], "difficult_direct_answer": false, "rationales": ["Traditionally umbrella's offer shade and means to be dry when raining.", "Umbrellas are over outdoor tables. umbrellas are used to block sun and rain.", "Umbrellas are on tables in an outdoor area."], "image": "val2014/COCO_val2014_000000142826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39811, "question_id": "AbvaiemnJEVYFswUAmcQcp", "question": "Where is this bus headed next?", "choices": ["jail", "ashton vale", "first street", "24th street"], "correct_choice_idx": 1, "direct_answers": ["ashton vale", "next stop", "ashton vale", "ashton vale", "ashton vale", "ashton vale", "ashton vale", "ashton vale", "stop", "ashton vale"], "difficult_direct_answer": false, "rationales": ["Ashton vale is displayed on the digital screen.", "This is the destination displayed on the top of the bus.", "A bus has a digital sign on it that lists the next stop."], "image": "val2014/COCO_val2014_000000039811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112085, "question_id": "AbvsJfEtjYMXERJH4Poqhc", "question": "What does the horse have near its eyes?", "choices": ["bells", "blinders", "whip", "mask"], "correct_choice_idx": 1, "direct_answers": ["bridle", "blinder", "blinders", "harness", "blinders", "harness", "blinds", "blinders", "blinders", "blinders"], "difficult_direct_answer": false, "rationales": ["The horse has blinders.", "Blinders are used on work horses like them to keep them from getting distracted.", "The horses use that to cover their eyes."], "image": "val2014/COCO_val2014_000000112085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26501, "question_id": "Abxf6DKSkxxXWauQZ4t56W", "question": "Where is the building pictured above located?", "choices": ["france", "belgium", "portugal", "england"], "correct_choice_idx": 3, "direct_answers": ["westminster", "waterloo", "england", "england", "england", "england", "united kingdom", "uk", "big ben", "england"], "difficult_direct_answer": false, "rationales": ["The building pictured is in england since big ben is shown.", "Big ben is a tower next to one of the palaces owned by the british royal family.", "The tower gives it away if not the red bus would tell you where this is."], "image": "val2014/COCO_val2014_000000026501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107571, "question_id": "AcLoxASYJEDfedV2VrT7n3", "question": "Where do pandas come from?", "choices": ["japan", "china", "mongolia", "taiwan"], "correct_choice_idx": 1, "direct_answers": ["babys", "china", "china", "china", "china", "china", "babys", "china", "china", "china"], "difficult_direct_answer": false, "rationales": ["A stuffed panda is on a chair in a home. pandas are from china.", "Pandas originally live in china.", "Pandas come from china and are known there."], "image": "train2014/COCO_train2014_000000107571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324082, "question_id": "AcVFQy8uycvmB8oroEqzge", "question": "How do they communicate with each other when they are far?", "choices": ["cellphone", "megaphone", "walkie talkie", "hand signals"], "correct_choice_idx": 2, "direct_answers": ["walkie talkie", "walkie talkies", "radio", "radio", "radio", "police intercoms", "walkie talkie", "walkie talkies", "walkie talkie", "walkie talkie"], "difficult_direct_answer": false, "rationales": ["They have a device on their belt that they can use to talk to one another when they are not together.", "The people use walkie talkies.", "They have the device on their uniform to communicate with one another when not close."], "image": "val2014/COCO_val2014_000000324082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492840, "question_id": "AcikS3Zf23byqDQuPQsiD6", "question": "What is the man in black doing?", "choices": ["singing", "umpiring", "reprimanding", "carving"], "correct_choice_idx": 1, "direct_answers": ["refereeing", "batting", "umpiring", "umpire", "referred", "umping", "umpire", "umpiring", "watching ball", "umpire"], "difficult_direct_answer": false, "rationales": ["The job of the man in black is to stand behind the batter and catcher and to make calls regarding balls and strikes.", "The man in black is a neutral third party there to enforce the game rules.", "He is the only one not ready to hit or catch the ball because he is officiating."], "image": "val2014/COCO_val2014_000000492840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417284, "question_id": "AczXXRzcDHNUKz3ctyHsuC", "question": "This pool is mainly for what swimmers?", "choices": ["kids", "experienced swimmers", "pregnant women", "elderly people"], "correct_choice_idx": 0, "direct_answers": ["kids", "toddlers", "children", "children", "beginners", "shallow swimmers", "old swimmer", "hotel", "inexperienced swimmers", "hotel customers"], "difficult_direct_answer": true, "rationales": ["This small pool is mostly for little children.", "It is shallow so children can play in it.", "Due to the waters in the foreground of this image being so shallow, perhaps less than one foot, we can presume this part of the pool is for the very young."], "image": "val2014/COCO_val2014_000000417284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327514, "question_id": "Ad3ke9LXpB4nUaEk3ZYzLZ", "question": "What is the man doing here?", "choices": ["selling", "baking", "multitasking", "drying"], "correct_choice_idx": 2, "direct_answers": ["ironing", "cooking ironing", "ironing", "ironing", "ironing", "multitasking", "ironing", "multitasking", "cooking ironing", "cooking ironing"], "difficult_direct_answer": false, "rationales": ["The man is cooking and ironing at the same time.", "The man is multitasking.", "A man is ironing with one hand and cooking food on the stove with the other."], "image": "val2014/COCO_val2014_000000327514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395046, "question_id": "Ad4eeJExB22ioUiyP56J4H", "question": "Why are the people travelling on the train?", "choices": ["eating food", "learning driving", "commuting", "touring"], "correct_choice_idx": 3, "direct_answers": ["tour", "go places", "touring", "affordability", "sightseeing", "body transport", "for fun", "reach destination", "transportation", "reach destination"], "difficult_direct_answer": true, "rationales": ["People are standing on a train platform.", "The train takes people around to see things they've never seen before.", "The compact size of this train probably means that it was not made for mass commuting. it looks like it is still carrying passengers though who may be touring countryside."], "image": "val2014/COCO_val2014_000000395046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12529, "question_id": "AdRrEDnjWtvHK6mpYKjK8c", "question": "What is happening in this venue?", "choices": ["lecture", "presentation", "memorial service", "conference"], "correct_choice_idx": 3, "direct_answers": ["speaker", "lecture", "speech", "speech", "speech", "speech", "motivational speaking", "speech", "speech", "conference"], "difficult_direct_answer": false, "rationales": ["The venue is a conference venue.", "The man is standing behind a lectern and is professionally dressed while speaking to an audience facing him below.", "The man is dressed in a business suit behind a podium with a microphone."], "image": "train2014/COCO_train2014_000000012529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204467, "question_id": "Adem3iVSfzRYrYMgLK4Dji", "question": "What is the emotion shown on the kid's face?", "choices": ["worried", "excited", "embarrassed", "scared"], "correct_choice_idx": 1, "direct_answers": ["masked disappointment", "happy", "happiness", "happy", "happiness", "happiness", "happiness", "happiness", "joy", "excited"], "difficult_direct_answer": false, "rationales": ["The kid has a big smile on their face which eliminates worried, scared and embarrassed.", "The child is wearing a smile which is generally an expression used by someone experiencing a happy, positive emotion. the other options are negative emotions.", "The child is wearing a big smile because they are happy."], "image": "train2014/COCO_train2014_000000204467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387515, "question_id": "AdnM6HofwRxd75zfz6qQWY", "question": "This bus will transport you to what region?", "choices": ["southern france", "central portugal", "western germany", "northern spain"], "correct_choice_idx": 0, "direct_answers": ["city", "marseille", "marseille", "marseille", "lumbar", "marseille", "southern france", "marseille", "bakpar", "australia"], "difficult_direct_answer": false, "rationales": ["The writing on this bus is french.", "It has a french sign on the front", "The bus is in france since the sign says marseille."], "image": "val2014/COCO_val2014_000000387515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429791, "question_id": "AdppdadkGbMZCiifiuH58b", "question": "What is the long part attached to the elephant called?", "choices": ["hose", "nose", "funnel", "trunk"], "correct_choice_idx": 1, "direct_answers": ["trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "nose", "trunk"], "difficult_direct_answer": false, "rationales": ["The elephant has a long trunk.", "That is what the part of the body is called.", "It is the nose as seen."], "image": "train2014/COCO_train2014_000000429791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5946, "question_id": "AdqJ6yuK2gR8mmbUqg3JcE", "question": "What dog breed does the woman have?", "choices": ["pug", "bassett hound", "dachshund", "shiba inu"], "correct_choice_idx": 1, "direct_answers": ["basset", "beagle", "dachshund", "cocker spaniel", "beagle", "bloodhound", "bassett hound", "basset hound", "bassett hound", "bassett hound"], "difficult_direct_answer": false, "rationales": ["A short dog with long ears stands near a woman in a bench. basset hounds are short and have long ears.", "It's ears are long and droopy like a hounds are.", "The breed is a hound."], "image": "train2014/COCO_train2014_000000005946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324155, "question_id": "Adw9BpeUw4ebMSuta8fHsK", "question": "What type of goods are sold in the store next to the tree?", "choices": ["baked goods", "coffee", "nutritional supplements", "pizza"], "correct_choice_idx": 2, "direct_answers": ["vitamins", "nutritional supplements", "vitamins", "coffee", "vitamins", "vitamins", "vitamins", "vitamins", "vitamins", "coffee"], "difficult_direct_answer": false, "rationales": ["The name of the store is visible and the goods sold within the store can be discerned.", "The store indicates it sells vitamins.", "The goods are supplements."], "image": "val2014/COCO_val2014_000000324155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393292, "question_id": "Ae4UZwpgJCdhqL3vR8i3JP", "question": "The animal in the sink is a descendant of what?", "choices": ["airplanes", "apes", "humans", "dinosaurs"], "correct_choice_idx": 3, "direct_answers": ["dinosaurs", "pigeons", "dove", "dinosaur", "birds", "birds", "dinosaur", "dinosaurs", "dinosaurs", "dove"], "difficult_direct_answer": false, "rationales": ["Traditionally bird species of today are descendants of dinosaurs.", "It's a pigeon.", "It is a bird."], "image": "train2014/COCO_train2014_000000393292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308652, "question_id": "Ae5xifP3Nhub7Beo5mTTxD", "question": "Where should patrons walk?", "choices": ["grass", "bench", "roof", "walkway"], "correct_choice_idx": 3, "direct_answers": ["walkway", "pathway", "along", "paved areas", "walkways", "pathway", "on stones", "sidewalk", "sidewalk", "stone path"], "difficult_direct_answer": false, "rationales": ["People walk along paved paths.", "The bricks walkway is there so people don't have to walk on the grass and damage it.", "There is a designed walkway here through the grass. when there is cultivated landscaping in a park like setting such as this with a manmade walkway, it is intended to be walked on."], "image": "train2014/COCO_train2014_000000308652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527445, "question_id": "AeDHX66w8CxuUq9by6RkUY", "question": "What activity is the person taking this pic taking part in here?", "choices": ["glutton fest", "tasting", "drunken toot", "binge"], "correct_choice_idx": 1, "direct_answers": ["tasting", "wine tasting", "wine tasting", "wine tasting", "wine tasting", "wine tasting", "wine tasting", "wine tasting", "wine tasting", "wine tasting"], "difficult_direct_answer": false, "rationales": ["A wine tasting is being conducted.", "The primary items in the picture are glasses filled with wine. the fact that each has a different color and a number marked under them leads one to believe that this is part of a contest or wine testing event.", "The activity is a tasting."], "image": "val2014/COCO_val2014_000000527445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246278, "question_id": "AeF8rdohy65KVQhWaVXJFy", "question": "Why are the trees fenced?", "choices": ["decoration", "camouflage", "protection", "water reservation"], "correct_choice_idx": 2, "direct_answers": ["protection", "protection", "protection", "zoo enclosure", "protection", "food", "protection", "protection", "in habitat", "zoo"], "difficult_direct_answer": false, "rationales": ["Trees are wrapped in fence poles inside a zebra enclosure. fencing is used to protect trees from the animals in zoos.", "The trees are being protected.", "The trees are there to keep the zebras in."], "image": "train2014/COCO_train2014_000000246278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207331, "question_id": "AeVw7Bp7VXfjzKFe8hj547", "question": "Which item seen here was grown below ground?", "choices": ["orange", "lemon", "noodles", "carrot"], "correct_choice_idx": 3, "direct_answers": ["carrot", "potato", "carrot", "carrots", "carrot", "carrot", "spoons", "carrot", "carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["A bowl has food in it including an orange carrot.", "Carrots are root veggies.", "Lemons and oranges grow on trees. noodles are man-made."], "image": "train2014/COCO_train2014_000000207331.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74456, "question_id": "AeadiiH2yDpEMpzKss9v53", "question": "Which group invented the skateboard?", "choices": ["surfers", "bikers", "druids", "policemen"], "correct_choice_idx": 0, "direct_answers": ["skateboarders", "surf shop", "skateboard fanatics", "california surfshop", "hink", "larry stevenson", "surfers", "skateboarders", "dogtown boyz", "california"], "difficult_direct_answer": true, "rationales": ["People who already used flat boards in water wanted to try putting them on wheels to use on land.", "Surfers invented the skateboard for land surfing.", "This option is closest to the origin story."], "image": "val2014/COCO_val2014_000000074456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83036, "question_id": "Aeci9SdYUdMYzUYAemyrM7", "question": "What is being done here?", "choices": ["power point", "sleep", "movie filming", "math"], "correct_choice_idx": 0, "direct_answers": ["meeting", "meeting", "learning", "presentation", "teaching students", "meeting", "presentation", "training", "power point", "learning"], "difficult_direct_answer": false, "rationales": ["People are looking at a computer presentation", "The large screen for the powerpoint presentation can be seen as well as the speaker.", "A presentation is shown on the screen."], "image": "val2014/COCO_val2014_000000083036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318164, "question_id": "AeykPmDMyKrzDZF3y9T7aG", "question": "What type of people is the bench for?", "choices": ["students", "patients", "passengers", "diners"], "correct_choice_idx": 2, "direct_answers": ["passengers", "train travelers", "train waiters", "passengers", "train station", "train travelers", "tired", "passengers", "passengers", "passengers"], "difficult_direct_answer": false, "rationales": ["There is a train and a train station in the background so the bench is for the people waiting to board the train or the people who have just stepped off of the train.", "The platform and train indicates it is a train station, and the bench can be used for passengers to sit on while waiting for their train to arrive.", "There is a train right behind the bench."], "image": "train2014/COCO_train2014_000000318164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113860, "question_id": "Af2b7bvzPSp7WSV3U5cMGZ", "question": "What does the word on the boat relate?", "choices": ["song", "movie", "bus", "book"], "correct_choice_idx": 1, "direct_answers": ["movie", "nemo", "swimming", "finding nemo", "nemo", "nemo movie", "famous book", "name", "book", "nobody"], "difficult_direct_answer": true, "rationales": ["Nemo was a film about a fish", "There is a movie for kids that is called finding nemo.", "The word on the boat is nemo. there is a well-known disney full length animated feature called finding nemo. however, there is also a character called captain nemo in a book."], "image": "val2014/COCO_val2014_000000113860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421562, "question_id": "Af6hRXyHeEm8rDD7UBnGXA", "question": "The last four letters seen in the background are all found in what word?", "choices": ["pizzeria", "loquacious", "quash", "sublime"], "correct_choice_idx": 0, "direct_answers": ["era", "galleria", "galleria", "store name", "pizzeria", "galleria", "era", "bacteria", "galleria", "era"], "difficult_direct_answer": false, "rationales": ["The letters seen in the background are found in pizzeria.", "The last four letters are clearly visible and readable and are contained in answer a.", "This is the only one ending in \"eria\" as the letters on the building do."], "image": "train2014/COCO_train2014_000000421562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434338, "question_id": "AfKyg6S5nPoFDPvScynPVF", "question": "What is the weather like near the mountain?", "choices": ["foggy", "clear", "stormy", "cloudy"], "correct_choice_idx": 3, "direct_answers": ["cloudy", "cold", "cold", "cloudy", "cloudy", "cloudy", "cloudy", "cold", "cloudy", "cold"], "difficult_direct_answer": false, "rationales": ["The weather near the mountain is a cloudy sky.", "The sky is full of white, fluffy masses that are blocking most of the sunlight.", "The weather is cloudy."], "image": "train2014/COCO_train2014_000000434338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161865, "question_id": "AfTd3Hy4JdurUgbk3HSZKY", "question": "What should be installed on the device with a screen?", "choices": ["operating system", "oil line", "vice", "heater"], "correct_choice_idx": 0, "direct_answers": ["operating system", "program", "windows", "game", "operating system", "games", "video games", "wii", "windows", "game console"], "difficult_direct_answer": false, "rationales": ["An os should be installed.", "A device needs an operating system to operate.", "The device with a screen is a computer. it does not need an oil line, a vice, or a heater."], "image": "train2014/COCO_train2014_000000161865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57095, "question_id": "AfaGVNwSqHcLnkcNBsqEnV", "question": "What does the red light on the pole direct?", "choices": ["bicycles", "traffic", "racers", "airplanes"], "correct_choice_idx": 1, "direct_answers": ["stop", "traffic", "stop", "traffic", "traffic", "stop", "vehicles", "traffic", "traffic", "traffic"], "difficult_direct_answer": false, "rationales": ["The red light tells the traffic when to stop and when a green light goes it it tells them to go.", "These street lights tell cars when it's safe to proceed.", "The red light directs the cars on the road."], "image": "val2014/COCO_val2014_000000057095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485019, "question_id": "AgK6kNCCvFHXmr5CJqikUA", "question": "The person shown here has difficulty doing what?", "choices": ["speaking", "walking", "sitting", "singing"], "correct_choice_idx": 1, "direct_answers": ["walking", "siting", "seeing", "umbrella", "walking", "staying dry", "walking", "walking", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["It looks like they have a cane to help them walk around.", "The person has a cane next to them.", "The person is sitting. there is a cane beside the person."], "image": "train2014/COCO_train2014_000000485019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466822, "question_id": "AgVGZrRDEuJLPgcXVe5JkG", "question": "Which one of these cities is closest to their location?", "choices": ["sacramento", "detroit", "philadelphia", "hartford"], "correct_choice_idx": 1, "direct_answers": ["paris", "paris", "napa", "paso robles", "traverse city", "traverse city", "paris", "paris", "detroit", "paris"], "difficult_direct_answer": false, "rationales": ["Because of the background portrait.", "Sacramento is in california which is known for vineyards.", "Chateau chantal winery is in michigan, so detroit must the closest city to them because detroit is in michigan."], "image": "train2014/COCO_train2014_000000466822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78896, "question_id": "Ah5TAB5TykmhdMfDA5n2Bk", "question": "Who sleeps in this location?", "choices": ["honored guest", "nobody", "washington", "local resident"], "correct_choice_idx": 1, "direct_answers": ["unknown artist", "homeless person", "outsider", "homeless", "bum", "homeless", "no one", "nobody", "homeless person", "hobo"], "difficult_direct_answer": false, "rationales": ["A bed is in the road near the curb. this bed is not in a home where people would normally sleep.", "There is no one sleeping in the bed.", "No one sleeps on the streets."], "image": "train2014/COCO_train2014_000000078896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296913, "question_id": "AhKNiE76zAYVYQFhRYBCaY", "question": "Who does the long brown hair belong to?", "choices": ["cat", "rabbit", "dog", "human"], "correct_choice_idx": 3, "direct_answers": ["girl", "woman", "woman", "girl", "woman", "woman", "woman", "human", "woman", "woman"], "difficult_direct_answer": false, "rationales": ["Animals to do have hair this long and this texture, it is contributed to a human only", "People can grow long hair on their head.", "The long hair in question is clearly visible and is consistent with answer a."], "image": "train2014/COCO_train2014_000000296913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387932, "question_id": "AhMH9bKBjXvJxrYDAkQJdx", "question": "What is the location of the sun in the image?", "choices": ["right", "front", "back", "left"], "correct_choice_idx": 3, "direct_answers": ["left side", "left", "right", "behind camera", "upper left", "top left", "left", "west", "left", "above skiers"], "difficult_direct_answer": false, "rationales": ["Shadows of people cross country skiing fall to the left of them.", "Based on the orientation of the shadows and the people creating them, the location of the sun is discernible.", "Their shadows are going to the right so it's in the opposite direction."], "image": "train2014/COCO_train2014_000000387932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69827, "question_id": "AhaLBMjZhNCvsoURREcuXz", "question": "What is the man in red shirt doing?", "choices": ["laughing", "crying", "singing", "yelling"], "correct_choice_idx": 3, "direct_answers": ["talking", "yelling", "yelling", "talking", "yelling", "talking", "speaking", "shouting", "talking", "yelling"], "difficult_direct_answer": false, "rationales": ["He is yelling for the driver to hear him because of noise.", "His mouth is wide open and the street is probably noisy.", "He has his mouth wide open"], "image": "val2014/COCO_val2014_000000069827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136230, "question_id": "AhgytgcRAnKKVUvrSLV7YD", "question": "What is the item called that has the red lights emanating from it?", "choices": ["keyboard", "notepad", "flashlight", "mouse"], "correct_choice_idx": 3, "direct_answers": ["mouse", "mouse", "mouse", "mouse", "computer monitor", "mouse", "computer monitor", "mouse", "mouse", "computer monitor"], "difficult_direct_answer": false, "rationales": ["You can see the mouse is lit up. it's used to control the computer.", "On the lower right, there is a black mouse with three red lights visible on it. this is helpful in dim light or darkness when working on computers.", "This is a mouse"], "image": "train2014/COCO_train2014_000000136230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216406, "question_id": "AhpduB4nZqn86rs3KDmRHD", "question": "Where is someone who might easily overheat safest here?", "choices": ["on chair", "under umbrella", "in sand", "water's edge"], "correct_choice_idx": 1, "direct_answers": ["umbberlow", "under umbrella", "under umbrella", "under umbrella", "under umbrella", "on towel", "under umbrillia", "shade", "under umbrella", "under umbrella"], "difficult_direct_answer": false, "rationales": ["They could go to the umbrella.", "This is a beach area. it is sunny too.", "The umbrella can provide shade."], "image": "train2014/COCO_train2014_000000216406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342510, "question_id": "AhyCSssyxVm5kimk8JCgHw", "question": "Tidal bores surfing can be played on which water?", "choices": ["ocean", "pond", "river", "sea"], "correct_choice_idx": 2, "direct_answers": ["any kind", "river", "river", "river", "river", "river", "white water", "rapids", "river", "river"], "difficult_direct_answer": false, "rationales": ["This is being done on a fast-moving body of water that is longer than it is wide and runs through a forest.", "The man is surfing in a river since the body of water is narrow.", "This is obviously a a in this image. the other options aren't represented."], "image": "train2014/COCO_train2014_000000342510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222453, "question_id": "AiKpNB9yCL2NYiwetrwL6v", "question": "The man celebrating his birthday cannot have lit candles because he is in which location?", "choices": ["dorm room", "condominium", "office", "apartment"], "correct_choice_idx": 0, "direct_answers": ["school", "dorm", "dorm room", "fire hazard", "kitchen", "basement", "dorm", "dormitory", "basement", "sitting room"], "difficult_direct_answer": false, "rationales": ["If he lights the candles it could be a fire hazard.", "The man has a pretty barebones room which indicates he's in college.", "That is the only room to use a cake."], "image": "train2014/COCO_train2014_000000222453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112995, "question_id": "AiU4zxwsboufebP6CLHHzT", "question": "Which of the kayakers body parts will help most to propel them forward?", "choices": ["nose", "legs", "arms", "head"], "correct_choice_idx": 2, "direct_answers": ["arms", "arms", "hands", "arms", "arms", "arms", "arms", "hands", "arms", "arms"], "difficult_direct_answer": false, "rationales": ["The body part is the arms.", "A person can paddle with by holding the paddle and using their arms.", "Legs won't help you very much to propel them and their head and nose surely will not."], "image": "train2014/COCO_train2014_000000112995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353881, "question_id": "Aiv5wz5wdGrypKJhaKCqFb", "question": "What creates the colorful patterns on the ground?", "choices": ["traffic", "painting", "thunder", "street lamps"], "correct_choice_idx": 0, "direct_answers": ["lights", "cars", "automobiles", "lines", "traffic", "lights", "lights", "cars", "traffic", "movement cars"], "difficult_direct_answer": false, "rationales": ["There is a lot of traffic from moving cars.", "There are cars on the street. when it's dark the lights on the cars will light up.", "The colorful patters on the ground are lights from moving cars."], "image": "train2014/COCO_train2014_000000353881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29074, "question_id": "AiygnD4YDfUWpbz7KEErMx", "question": "What is the rectangular image in the corner of the room?", "choices": ["poster", "tablet", "television", "painting"], "correct_choice_idx": 2, "direct_answers": ["television", "television", "tv", "television", "television", "television", "television", "fireplace", "fireplace", "tv"], "difficult_direct_answer": false, "rationales": ["The image is the tv.", "The tv is the only object in that part of the room that's rectangular.", "You can see there is something playing on the tv."], "image": "val2014/COCO_val2014_000000029074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398188, "question_id": "Aj75GFbb4yZ6vjSsQBYa3h", "question": "What is the narcotic made popular by the poster on the wall?", "choices": ["marijuana", "caffeine", "cocaine", "meth"], "correct_choice_idx": 1, "direct_answers": ["caffeine", "monster", "caffeine", "taurine", "caffeine", "cocaine", "caffeine", "caffine", "caffeine", "molly"], "difficult_direct_answer": false, "rationales": ["The poster is advertising an energy drink.", "Monster has drinks with a crazy amount of caffeine in them.", "The poster is for monster. this is an energy drink that does not contain any illegal drugs."], "image": "val2014/COCO_val2014_000000398188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307206, "question_id": "AjKVdW2AsZQkWQhJQMLEN5", "question": "What is the large structure in the background?", "choices": ["power lines", "business", "skyscraper", "stadium"], "correct_choice_idx": 0, "direct_answers": ["barn", "barn", "electricity tower", "shed", "power lines", "barn", "transmission tower", "transmission lines", "power lines", "power lines"], "difficult_direct_answer": false, "rationales": ["A large structure made out of metal holding lines along the sky to other similar structure.", "All those long wires explain the purpose of that structure.", "The wires attach to the houses so the people who live there can have electricity."], "image": "val2014/COCO_val2014_000000307206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54957, "question_id": "Ajdzvu2yuFuEWLdajUBcGs", "question": "Why is he playing with the dog?", "choices": ["is bored", "is waiting", "is lonely", "is distracted"], "correct_choice_idx": 0, "direct_answers": ["man", "entertain", "why not", "entertainment", "bored", "friends", "dog", "bored", "for fun", "is bored"], "difficult_direct_answer": true, "rationales": ["He is bored because the man beside him is on the phone.", "The boy is bored.", "The look on his face says he is bored."], "image": "train2014/COCO_train2014_000000054957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202681, "question_id": "Ak6hdEAmqvb2TCSexjyEqZ", "question": "What clothing brand made the man's blue shirt?", "choices": ["puma", "adidas", "reebok", "gucci"], "correct_choice_idx": 1, "direct_answers": ["adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas"], "difficult_direct_answer": false, "rationales": ["Adidas is the brand of the man's shirt.", "The brand is adidas.", "The brand's logo is on his chest. the white stripes on the arm are associated with this brand."], "image": "train2014/COCO_train2014_000000202681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463678, "question_id": "AkUsKy2kLKxgnjQKvADC5s", "question": "What is the full version of the name being displayed?", "choices": ["western southern", "western soy", "western smith", "western science"], "correct_choice_idx": 0, "direct_answers": ["cincinnati", "open series", "western southern", "southern", "us open", "western southern", "southern", "westernsouthern open", "us open", "western southern"], "difficult_direct_answer": false, "rationales": ["By looking at the repeating pattern it would say western southern.", "The name on the banner behind the tennis player is western & southern.", "It can be seen that the words keep repeating. the rest of the second word can be seen on the left."], "image": "val2014/COCO_val2014_000000463678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328270, "question_id": "AkjMLro8nfxmyrGbc7WYsE", "question": "What type of pizza has already been made?", "choices": ["pineapple", "veggie", "sausage", "pepperoni"], "correct_choice_idx": 1, "direct_answers": ["veggie pizza", "veggie pizza", "vegetable pizza", "aata", "veggie pizza", "pepperoni", "peppers onions", "veggie pizza", "veggie", "veggie"], "difficult_direct_answer": false, "rationales": ["The pizza is clearly visible and the toppings are identifiable. the toppings are all vegetable based on their sizes and shape and would be consistent with a pizza of the type of answer a.", "The pizza that has been prepared and is to the side has vegetables on it.", "A veggie pizza is made."], "image": "train2014/COCO_train2014_000000328270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131335, "question_id": "AknmjVaETgJTwbnMx5NtRb", "question": "What is the circular object decorated with roman numerals used for?", "choices": ["cooking", "translating", "telling time", "imprinting"], "correct_choice_idx": 2, "direct_answers": ["time", "telling time", "clock", "telling time", "time", "time", "tell time", "telling time", "clock", "time"], "difficult_direct_answer": false, "rationales": ["This is a clock", "That is a clock on top of the building.", "The clock is used to indicate time."], "image": "val2014/COCO_val2014_000000131335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12897, "question_id": "AkqDRLbGBUK596JTDcKDSi", "question": "Where will many of the people here be sitting soon?", "choices": ["airplane", "living room", "ship", "protest line"], "correct_choice_idx": 0, "direct_answers": ["plane", "airplane", "waiting room", "arena", "airplane", "airplane", "chairs", "on airplane", "plane", "floor"], "difficult_direct_answer": false, "rationales": ["You can tell by the luggage and setting as to what they are waiting for.", "This is in the public place that is an airport.", "There is luggage. people use luggage when they travel by air."], "image": "val2014/COCO_val2014_000000012897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491229, "question_id": "Akv3dC6SyBBLVLNGB7v6vz", "question": "Who manufactured the fully visible gold car behind him?", "choices": ["toyota", "chevrolet", "ford", "chrysler"], "correct_choice_idx": 3, "direct_answers": ["chrysler", "ford", "oldsmobile", "ford", "ford", "chevy", "chevy", "ford", "ford", "toyota"], "difficult_direct_answer": false, "rationales": ["Chrysler is the manufacturer.", "A car with a chrysler logo is in a parking area.", "That is the type of car it is."], "image": "train2014/COCO_train2014_000000491229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128482, "question_id": "AkvkyM9cj7A2ddqyri7Bfb", "question": "What is a use of the product being advertised?", "choices": ["browse internet", "douse flames", "grind beans", "freeze food"], "correct_choice_idx": 0, "direct_answers": ["smartphone", "smartphone", "browse internet", "calling people", "phone", "smartphone", "smartphone", "communication", "phone", "communication"], "difficult_direct_answer": false, "rationales": ["Smartphones are connected to a network. if it's connected to a network you can use the internet.", "The ad is about a phone.", "You can use a smartphone to look things up on the internet and do many other things."], "image": "train2014/COCO_train2014_000000128482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28109, "question_id": "AmR8BsKpxc8UPdx52Wtd2g", "question": "What type of shot is the man about to hit?", "choices": ["slice", "backhand", "serve", "forehand"], "correct_choice_idx": 2, "direct_answers": ["serve", "serve", "serve", "serve", "serve", "overhand", "serve", "tennis serve", "serve", "highshot"], "difficult_direct_answer": false, "rationales": ["He is serving the ball to start the game.", "A serve involves tossing the ball up as shown.", "The man is standing behind the baseline, winding up for an overhead shot and has thrown the ball above his head based on the off hand. these would be consistent with answer a."], "image": "train2014/COCO_train2014_000000028109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395473, "question_id": "AmUz6z2WzViDwRcRdu6suf", "question": "Which direction will the aloft skateboarder next go?", "choices": ["down", "up", "backwards", "no where"], "correct_choice_idx": 0, "direct_answers": ["right", "down", "right", "down", "down", "down", "down", "down", "downward", "right"], "difficult_direct_answer": false, "rationales": ["The skateboarder is currently up in the air.", "The railing is slanted towards the ground.", "A skateboarder is riding a rail with steps below him."], "image": "train2014/COCO_train2014_000000395473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507749, "question_id": "AmhKR2qdPKemcg4Eay82kL", "question": "What danger does the child face?", "choices": ["getting hit", "getting pinched", "getting frostbite", "getting burned"], "correct_choice_idx": 3, "direct_answers": ["burning", "falling", "burn", "falling off", "getting burned", "burning", "falling", "being burned", "falling", "burn"], "difficult_direct_answer": false, "rationales": ["The kid is at a oven.", "Stoves give off heat which can burn you.", "A child is standing in front of a stove. stoves are hot."], "image": "train2014/COCO_train2014_000000507749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161919, "question_id": "AmmUUPUCP5RELnjcgPRaEb", "question": "Why is the man using a knife with the meat?", "choices": ["tenderizing", "threatening it", "being cruel", "cutting slices"], "correct_choice_idx": 3, "direct_answers": ["cutting", "to slice", "to cut", "to cut", "slicing", "slice it", "to cut", "cut it", "cutting slices", "slicing it"], "difficult_direct_answer": false, "rationales": ["The man is cutting.", "A man is holding meat down with a large fork and running a knife along it.", "The man is slicing a big round of meat with a knife."], "image": "train2014/COCO_train2014_000000161919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232699, "question_id": "An9shAxRijwTj6DE8mcRpm", "question": "The large teams inside of the large canoes are playing what sport?", "choices": ["hunting", "polo", "rowing", "baseball"], "correct_choice_idx": 1, "direct_answers": ["rowing", "rowing", "sculling", "rowing", "dragon boat", "rowing", "rowing", "polo", "canoe race", "rowing"], "difficult_direct_answer": false, "rationales": ["The people are using paddles to move the boats.", "Large teams like this in this type of boat are rowing teams.", "These are rowing teams."], "image": "train2014/COCO_train2014_000000232699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115145, "question_id": "AnQAWh2UemJfC9JnRqKt4U", "question": "What kind of cable is used in the lower right socket?", "choices": ["diamond", "severed", "coax", "split"], "correct_choice_idx": 2, "direct_answers": ["power", "short cable", "wire", "electric plug", "electric", "internet", "coax", "3 prong", "coaxial", "coax"], "difficult_direct_answer": true, "rationales": ["A coax cable is seen.", "An outlet with a single hole in the center is on the wall in a kitchen.", "The cable is the coax."], "image": "train2014/COCO_train2014_000000115145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183217, "question_id": "AnWdFvJXBSUyL4tb9G6Wrv", "question": "What holiday is being celebrated in donut feasting here?", "choices": ["arbor day", "valentines", "thanksgiving", "christmas"], "correct_choice_idx": 1, "direct_answers": ["valentines", "valentine's", "valentine's day", "valentines day", "easter", "valentine's day", "valentines day", "valentine's", "valentines", "valentines"], "difficult_direct_answer": false, "rationales": ["The box of donuts has a few donuts that are shaped like hearts for valentine's day.", "Some of the donuts are heart-shaped. others are pink and are covered in heart sprinkles.", "When one sees doughnuts in the shape of hearts and maybe even slathered in pink icing, it is totally valentines day."], "image": "val2014/COCO_val2014_000000183217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425296, "question_id": "AnfV4Jwe7QytNUkx4b7wbk", "question": "What type location is this?", "choices": ["public park", "mall", "water front", "grocery store"], "correct_choice_idx": 0, "direct_answers": ["mountains", "mountains", "desert", "desert", "desert", "mountains", "public park", "desert", "desert", "mountains"], "difficult_direct_answer": false, "rationales": ["Based on the visible outdoor and natural features, the included benches and the apparent parking lot, answer a is reasonable.", "There is a picnic table and cars in the parking lot.", "This location is outside in the public park."], "image": "train2014/COCO_train2014_000000425296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45587, "question_id": "AnpNZKsP4eP2mdQQUvzzHe", "question": "What is the dog doing?", "choices": ["biting", "swimming", "surfing", "dog paddling"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["A dog is on surfboard in the water.", "The dog is sitting on a surfboard and riding across the wave, which is rhe act of surfing.", "He is balancing atop a surfboard."], "image": "val2014/COCO_val2014_000000045587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231601, "question_id": "AnqCGFWstmV6iQydh6kAzJ", "question": "These animals represent what zodiac sign?", "choices": ["capricorn", "leo", "scorpio", "taurus"], "correct_choice_idx": 0, "direct_answers": ["taurus", "capricorn", "earth", "capricorn", "aries", "capricorn", "capricorn", "aries", "capricorn", "capricorn"], "difficult_direct_answer": false, "rationales": ["The ram is the animal for capricorn.", "They have two horns each which area also seen on a capricorn.", "There are there goats which represent capricorn on the zodiac."], "image": "train2014/COCO_train2014_000000231601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325459, "question_id": "AnwJdLbSf6UAs6fFVcqKyq", "question": "What is the white thing the man is holding?", "choices": ["napkin", "lighter", "game remote", "phone"], "correct_choice_idx": 2, "direct_answers": ["controller", "controller", "controller", "wii remote", "remote", "wii remote", "controller", "game remote", "wii remote", "wii control"], "difficult_direct_answer": false, "rationales": ["As you can see by the picture they are playing the wii and that white controller allows you to play.", "They are playing a game", "The two men are sitting down and playing a game."], "image": "val2014/COCO_val2014_000000325459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148002, "question_id": "AnzuunMiryj5dCiVh28Fvm", "question": "What are groups of these animals called?", "choices": ["lounge", "gang", "tower", "pride"], "correct_choice_idx": 2, "direct_answers": ["zebras", "giraffe", "tower", "tower", "tower", "tower", "giraffes", "herd", "giraffes", "tower"], "difficult_direct_answer": false, "rationales": ["It describes the animal.", "Groups of animals have different names like a 'pride of lions'. for giraffes, it's 'tower'.", "The animals are giraffes, not lions, lizards, or elk."], "image": "train2014/COCO_train2014_000000148002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485858, "question_id": "Ao6dnSDrMUvBVyFmhf27pH", "question": "What real estate structure is this type of sale often named after?", "choices": ["patio", "cabin", "bedroom", "garage"], "correct_choice_idx": 3, "direct_answers": ["garage", "estate", "home sale", "garage sale", "flea market", "open house", "garage", "garage", "garage", "yard sale"], "difficult_direct_answer": false, "rationales": ["There is a garage sale being held.", "The yard sale could also be a garage sale.", "Traditionally this is named tat when selling items in your yard."], "image": "train2014/COCO_train2014_000000485858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223299, "question_id": "AoQAQZLMzB98t72KaYH9vk", "question": "What NHL hockey player had the same jersey number as the person wearing black?", "choices": ["iginla", "gretzky", "satan", "lemieux"], "correct_choice_idx": 1, "direct_answers": ["gretzky", "william", "wayne gretzky", "gretzky", "gretzky", "wayne gretzky", "wayne gretzky", "goat", "wayne gretzky", "william"], "difficult_direct_answer": false, "rationales": ["Wayne gretzky shares the same number.", "Gretzky is an nhl player who, like the player in black, wore number 99.", "The athlete in the black shirt bears the number 99. this number was also worn in hockey by wayne gretzky."], "image": "train2014/COCO_train2014_000000223299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121839, "question_id": "AoQPtzXVzWUbNEGL3rp3Tf", "question": "The board used for skiing is called?", "choices": ["snow cut", "snowblade", "slide blade", "skateboard"], "correct_choice_idx": 1, "direct_answers": ["ski's", "snowblade", "snowboard", "snow blades", "snow ski", "snowboard", "skiis", "snowboard", "snowboard", "skis"], "difficult_direct_answer": false, "rationales": ["The board is a snowblade.", "You can tell by the setting and what they are doing, it easily give you the answer.", "They are shorter versions of skis."], "image": "val2014/COCO_val2014_000000121839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50410, "question_id": "AoSXMMAXpgE58sAWuKAzkB", "question": "What color is the sky?", "choices": ["orange", "grey", "black", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The sky is blue beyond the trees.", "A person stands on the beach with clear skies above. clear skies appear blue.", "The blue is visible above the trees."], "image": "train2014/COCO_train2014_000000050410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537232, "question_id": "AoivVgWzWMg8ppf2kWTK5e", "question": "What is the man in blue riding?", "choices": ["bicycle", "scooter", "motorcycle", "skateboard"], "correct_choice_idx": 0, "direct_answers": ["bike", "bicycle", "bicycle", "bike", "bicycle", "bike", "bike", "bicycle", "bicycle", "bike"], "difficult_direct_answer": false, "rationales": ["You can see the two wheels on the bike", "The helmet wearing blue clad man visible in this image pumps his legs on two pedals which push two wheels and secures himself on the handlebar.", "The man is riding something he can power with his own two legs that has two wheels."], "image": "train2014/COCO_train2014_000000537232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98642, "question_id": "Aok9LKeCMrEtVxXaQNtTwo", "question": "Where is the person having this meal?", "choices": ["home", "park", "restaurant", "backyard"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "restaurant", "restaurant", "delicatessen", "restaurant", "restaurant", "diner", "diner", "coffee", "restaurant"], "difficult_direct_answer": false, "rationales": ["This is a restaurant meal.", "The place seems to be having waiters to serve the waiter.", "This looks like a typical set up of a diner or other style of restaurant."], "image": "train2014/COCO_train2014_000000098642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8053, "question_id": "AokzJrCAoHumoSy7eMvt4F", "question": "What has the woman most likely just done?", "choices": ["shopped", "worked out", "exercised", "showered"], "correct_choice_idx": 0, "direct_answers": ["shopped", "grocery shopping", "shopped", "shopped", "shopping", "shoppped", "shopped", "went shopping", "shop", "shopping"], "difficult_direct_answer": false, "rationales": ["The woman has shopping bags.", "She is carrying shopping with shopping bags.", "The woman has most likely just shopped."], "image": "train2014/COCO_train2014_000000008053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401967, "question_id": "ApWbpkRPEHQkaey7e53dKj", "question": "What is o top of the large item in the middle of the room?", "choices": ["rabbits", "musical notes", "handcuffs", "chainsaws"], "correct_choice_idx": 1, "direct_answers": ["musical notes", "sheet music", "lamp", "music sheets", "vase", "sheet music", "vase", "sheet music", "sheet music", "plants"], "difficult_direct_answer": false, "rationales": ["Sheet music is on the piano.", "Technically sheet music. the other options aren't in the image.", "The book is open with a bunch of musical notes on it."], "image": "train2014/COCO_train2014_000000401967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541725, "question_id": "Apw5S6wPtQEC4NcQiDEaoy", "question": "Which one of these places is absent from the mementos?", "choices": ["jamaica", "denmark", "colorado", "california"], "correct_choice_idx": 0, "direct_answers": ["jamaica", "smallville", "smallville", "great britain", "united states", "spain", "us", "arkansas", "person", "south africa"], "difficult_direct_answer": true, "rationales": ["It's difficult to tell from the image.", "A suitcase has stickers from several locations on it. there is no sticker mentioning jamaica.", "There is no caribbean country represented."], "image": "train2014/COCO_train2014_000000541725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539941, "question_id": "ApyZ9vCQANz5deBdPjbTxt", "question": "What animal does the item on top of the food come from?", "choices": ["fish", "lobster", "crab", "pig"], "correct_choice_idx": 3, "direct_answers": ["pig", "pig", "cow/pig", "pig", "pig", "pig", "cow", "food", "cow", "cow"], "difficult_direct_answer": false, "rationales": ["The item on top is pepperoni. it comes from a land, not marine, animal.", "The item on the top of the food is pepperoni and pepperoni is made from pigs.", "This is pepperoni which is a type of pork sausage"], "image": "train2014/COCO_train2014_000000539941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406949, "question_id": "AqNBgE4Snh8KeXesHNqiE2", "question": "What should the cars do in this situation?", "choices": ["wait", "go", "park", "stop"], "correct_choice_idx": 1, "direct_answers": ["go", "go", "stop", "proceed slowly", "stop", "clear crosswalk", "proceed", "go", "continue forth", "stop"], "difficult_direct_answer": false, "rationales": ["The traffic signal followed by the cars is green, so they are able to continue driving.", "The cars should go since it's a green light.", "The cars should go."], "image": "val2014/COCO_val2014_000000406949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244782, "question_id": "AqPJxKext4HLBr3ioHt3Tr", "question": "What is the flavor of ice cream it is?", "choices": ["butterscotch", "strawberry", "chocolate", "vanilla"], "correct_choice_idx": 0, "direct_answers": ["strawberry", "chocolate", "strawberry", "chocolate", "strawberry", "chocolate", "chocolate", "chocolate", "butterscotch", "strawberry"], "difficult_direct_answer": false, "rationales": ["The flavor is butterscotch.", "The ice cream is a brownish color.", "The ice cream is tan colored and flavored with butterscotch."], "image": "train2014/COCO_train2014_000000244782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435855, "question_id": "Aqh5thGj7z55sjeRtbLUoF", "question": "What does the backpacking man hope for?", "choices": ["ride", "nothing", "dinner", "motor bike"], "correct_choice_idx": 0, "direct_answers": ["ride", "ride", "ride", "walking destination", "walking destination", "ride", "ride", "walking destination", "ride", "ride"], "difficult_direct_answer": false, "rationales": ["The man is walking on the side of the road with a backpack and a suitcase. the man is hitchhiking.", "Because his walking near the road with his suitcase.", "He is walking along a busy road with passing cars."], "image": "train2014/COCO_train2014_000000435855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238535, "question_id": "AqsDwfyjPiNVx6GQhgp5gH", "question": "Which hall is near this street corner with the pole?", "choices": ["oxfordshire", "church", "warwick", "parish"], "correct_choice_idx": 2, "direct_answers": ["warwick hall", "warwick", "warwick hall", "warwick hall", "warwick", "warwick", "warwick", "warwick", "warwick", "warwick"], "difficult_direct_answer": false, "rationales": ["Warwick is close to the street corner.", "The only item listed on these direction indicating signs which calls itself a hall is the warwick hall.", "The pole on the street corner includes the following location text, warwick hall."], "image": "train2014/COCO_train2014_000000238535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156184, "question_id": "ArD7R2ocxdKYQJJVX6iubX", "question": "How did the boat by the sign get there?", "choices": ["crane lifted", "raced there", "tide lowered", "dragged"], "correct_choice_idx": 3, "direct_answers": ["boat", "pushed", "construction", "lifted up", "dragged", "dragged there", "towed", "boat", "pushed there", "shipwrecked"], "difficult_direct_answer": true, "rationales": ["A boat is on shore, away from the water.", "When the tide was higher they would pull the boat up there.", "Given how far ashore the boat is sitting, it would have had to be dragged to its position."], "image": "val2014/COCO_val2014_000000156184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558968, "question_id": "ArEpcRkCkSPN2Qa4gkXicq", "question": "Where are persons currently sitting here?", "choices": ["deck", "living room", "bathroom", "bedroom"], "correct_choice_idx": 0, "direct_answers": ["outside", "deck", "outside", "outside", "on porch", "outdoor patio", "outside", "outside patio", "outside", "outside"], "difficult_direct_answer": false, "rationales": ["The people are sitting outside, not inside, the house.", "The other rooms aren't shown here, and most don't sit in the bathroom for a long period of time.", "The people are on a deck."], "image": "train2014/COCO_train2014_000000558968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162700, "question_id": "ArJGgTu8SU5rMPz4jxQ7dY", "question": "What is the woman sitting on?", "choices": ["couch", "bed", "suitcases", "chair"], "correct_choice_idx": 2, "direct_answers": ["suitcases", "luggage", "suitcases", "suitcases", "suitcases", "luggage", "luggage", "luggage", "luggage", "suitcases"], "difficult_direct_answer": false, "rationales": ["The objects are clearly visible and have the size, shape and defining features such as handles and clasps consistent with answer a.", "There are luggage underneath the woman.", "The items are luggage, not furniture."], "image": "train2014/COCO_train2014_000000162700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9813, "question_id": "ArKtKzSbZ9DgKufTkS852M", "question": "Which person is holding the horse in what color shirt?", "choices": ["red", "white", "black", "green"], "correct_choice_idx": 1, "direct_answers": ["white shirt", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["A woman in a light colored shirt is holding onto a horse.", "The person is in white.", "That the color of the shirt of the only person touching the horse."], "image": "val2014/COCO_val2014_000000009813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183338, "question_id": "ArXCy8g5pQrHEmmspu946z", "question": "Wha's the man in the left corner attempting to do?", "choices": ["pass", "block", "squat", "serve"], "correct_choice_idx": 3, "direct_answers": ["serve", "serve", "serve", "serve", "serve ball", "serve", "serve", "serve", "serve ball", "serve"], "difficult_direct_answer": false, "rationales": ["The ball is coming down and he is hitting it. the other players are ready.", "He threw the ball up in the air and hitting it.", "The man is serving."], "image": "val2014/COCO_val2014_000000183338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306988, "question_id": "Arbh34LLUEwnggcH4Fj5DK", "question": "What might be something someone might bring to this area to be left behind?", "choices": ["mouse food", "donuts", "umbrellas", "flowers"], "correct_choice_idx": 3, "direct_answers": ["chair", "bird food", "flowers", "food wrapper", "bird seed", "flowers", "flowers", "flowers", "flowers", "flowers"], "difficult_direct_answer": false, "rationales": ["The are seems to be dry so the rain no so no need of umbrella.", "A cemetery is a good place to leave small items for people who are buried there.", "People bring them to their deceased loved ones"], "image": "train2014/COCO_train2014_000000306988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53464, "question_id": "ArcHeHnTnbLTpUqMMDPvNV", "question": "What occasion does this cake celebrate?", "choices": ["birthday", "divorce", "rodeo", "wedding"], "correct_choice_idx": 3, "direct_answers": ["new job", "wedding", "achievement", "wedding", "wedding", "marriage", "wedding", "engagement", "wedding", "wedding"], "difficult_direct_answer": false, "rationales": ["This wedding cake celebrates a wedding.", "The cake has wedding bands on it.", "A cake with a picture of two rings is being cut."], "image": "train2014/COCO_train2014_000000053464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67438, "question_id": "Arej6scN7w2sQ9yT277mb7", "question": "What is the lady trying to do?", "choices": ["bullying giraffe", "touching giraffe", "attacking giraffe", "feeding giraffe"], "correct_choice_idx": 3, "direct_answers": ["feed giraffe", "feed giraffe", "feeding giraffe", "feed giraffe", "feed giraffe", "feed giraffe", "feed giraffe", "feed giraffe", "feed giraffe", "feed giraffe"], "difficult_direct_answer": false, "rationales": ["A woman is reaching out to a giraffe that has its tongue out.", "She is holding something in her hand and reaching out to the giraffe.", "The woman is feeding the animals."], "image": "train2014/COCO_train2014_000000067438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463355, "question_id": "As5VmEjs6cEMCEfdm6fBZa", "question": "What will happen to the skateboard next?", "choices": ["roll forward", "pop upward", "lost", "break"], "correct_choice_idx": 0, "direct_answers": ["land parallel", "hit pavement", "fall", "flip", "drop", "roll forward", "fall", "land trick", "finish trick", "roll"], "difficult_direct_answer": true, "rationales": ["Based on the angle of the board, the expected motion of the board and person engaging in skateboarding and gravity the board will fall and, with the person likely on it, will roll forward.", "The skateboard is going to roll forward on its wheels.", "Based on the inferred momentum of the person in this image by their body positioning and the use of this board, answer a is consistent."], "image": "train2014/COCO_train2014_000000463355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92686, "question_id": "AsC6bMkKgrmqXyCGpmqV3d", "question": "What is creating the light coming through the door?", "choices": ["car", "lamp", "sun", "star"], "correct_choice_idx": 2, "direct_answers": ["sunlight", "sunlight", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["Sunlight is streaming through the glass in the door.", "The sun is creating light.", "A woman is in a brightly lit room with a window with the curtains open."], "image": "train2014/COCO_train2014_000000092686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227607, "question_id": "AsCJBRBNhtm6VSFoDJLG95", "question": "What does hitting the gong here signal?", "choices": ["opening", "divorce", "parity", "grievance"], "correct_choice_idx": 0, "direct_answers": ["instrument", "cooperation", "peace", "trade show", "beginning", "trade deal", "trade", "event start", "opening ceremony", "opening"], "difficult_direct_answer": true, "rationales": ["Gongs are commonly used as ceremonial instruments.", "The gong is hit while opening or starting an event.", "It is a common tradition in the stock market to hit the gong at the opening of a trade."], "image": "val2014/COCO_val2014_000000227607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565116, "question_id": "AsRiFHQmZeHZpVkyXsz92N", "question": "What food is on the plate in the middle of the table?", "choices": ["toast", "sandwich", "tomato", "ice cream"], "correct_choice_idx": 2, "direct_answers": ["breakfast", "ham", "breakfast", "breakfast", "ham", "tomato", "breakfast", "tomato", "sausage", "breakfast"], "difficult_direct_answer": false, "rationales": ["There is a red vegetable cut in half next to the ham.", "Ice cream is found in a bowl, not a plate. there is no bread to be seen. that leaves us with option \"a\".", "The food is a tomato."], "image": "train2014/COCO_train2014_000000565116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296975, "question_id": "AsXNTgFCC6ERWcvYvbRNsm", "question": "What forms of payment are probably accepted at this restaurant if one has no cash?", "choices": ["credit cards", "checks", "wire", "cryptocurrency"], "correct_choice_idx": 0, "direct_answers": ["credit card", "american express", "cash", "mastercard", "credit card", "credit card", "credit cards", "credit card", "credit card", "credit"], "difficult_direct_answer": false, "rationales": ["The bottom corner of the closed door has stickers on the glass advertising visa and mastercard.", "The signs on the right door indicate that this restaurant accepts visa and similar payment methods.", "Stickers in the window of a business list credit card logos."], "image": "train2014/COCO_train2014_000000296975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141065, "question_id": "Ascpnny9RsUYK4nmv6jCH5", "question": "What is usually found in this room?", "choices": ["bookcase", "desktop computer", "shower curtain", "bed"], "correct_choice_idx": 2, "direct_answers": ["toilet", "shower", "shower", "bath tub", "soap", "shower curtain", "toilet", "toilet", "shower", "shower"], "difficult_direct_answer": false, "rationales": ["A bathroom has a toilet and sink. bathrooms have showers with curtains.", "It should be around the tub to keep water from splashing out when people are bathing", "This is a bathroom."], "image": "train2014/COCO_train2014_000000141065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127882, "question_id": "AsfXHwftYFXKatKWkBMfaf", "question": "What is the food that is most in abundance here?", "choices": ["apple", "pizza", "french fries", "steak"], "correct_choice_idx": 2, "direct_answers": ["fries", "fries", "fries", "french fries", "fries", "fries", "fries", "french fries", "fries", "french fries"], "difficult_direct_answer": false, "rationales": ["Aside from the drinks, there are two main food items. hot dogs are outnumbered by the other item.", "The french fries accompany the hot dogs and are in great abundance.", "The volume of food is visible and comparable. the most common food has the size and shape consistent with answer a."], "image": "train2014/COCO_train2014_000000127882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188766, "question_id": "AskxVdzTnpyDqiTk7jQYo4", "question": "What type of ride is shown?", "choices": ["motorcycle", "pony", "bus", "amusement"], "correct_choice_idx": 3, "direct_answers": ["train", "amusement", "train", "amusement park", "boat", "amusement park", "train", "train", "train", "rollercoaster"], "difficult_direct_answer": false, "rationales": ["Of the answers, none of the required equipment is visible for any answer other than answer a.", "There is a small tram moving past large pieces of stage work.", "This is the inside of a theme park ride."], "image": "val2014/COCO_val2014_000000188766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80480, "question_id": "AtPvYR29VFhQRUeB9hbkF2", "question": "Which one will soon hit the ball?", "choices": ["white pants", "cannot tell", "either one", "black pants"], "correct_choice_idx": 0, "direct_answers": ["man", "grey sweater", "woman", "white pants", "man", "woman", "woman", "guy back", "front player", "rear"], "difficult_direct_answer": false, "rationales": ["The woman in the white pants looks like she is just about to hit the ball.", "The person in the background appears closer to the body position one would be in if they were approaching to hit the ball and their pants color is visible.", "In tennis your opponent stands across from you and hits the ball over netting expecting it to be returned. this woman is standing closer to the net and therefore closer to her opponent when the ball is hit it will get to her first."], "image": "train2014/COCO_train2014_000000080480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159215, "question_id": "AtQEDREYbjTbgi3vxZVVcx", "question": "Which is magnifying or moving towards yourself the photography is called?", "choices": ["none", "scroll", "zoom", "move"], "correct_choice_idx": 2, "direct_answers": ["zooming", "zoom", "zoom", "zoom", "zoom", "zooming in", "zoom", "zoom", "zoom", "zooming in"], "difficult_direct_answer": false, "rationales": ["The term zoom is used when looking closer at an object.", "The picture in the photo is close.", "The zoom is magnifying."], "image": "val2014/COCO_val2014_000000159215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280238, "question_id": "AtVQnteLR9a94WGqFQWuSZ", "question": "What fast food restaurant is seen in the background?", "choices": ["taco bell", "mcdonald's", "wendys", "burger king"], "correct_choice_idx": 1, "direct_answers": ["mcdonald's", "zingers", "mcdonald's", "mcdonald's", "mcdonalds", "zaxbys", "logans", "popeyes", "mcdonalds", "packed"], "difficult_direct_answer": false, "rationales": ["The restaurant is mcdonald's.", "You can see the red and yellow sign behind the tree.", "There is a sign with the golden arches on a red background."], "image": "val2014/COCO_val2014_000000280238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313019, "question_id": "AtckJT6RXo8BHM9ngJFLs2", "question": "What category of pizza would this fall into?", "choices": ["cheese only", "hawaiian", "vegetarian", "gluten-free"], "correct_choice_idx": 2, "direct_answers": ["california-style", "flat pizza", "supreme", "everything", "vegetarian", "meat", "thin crust", "large", "pepperoni", "meat lover"], "difficult_direct_answer": true, "rationales": ["The pizza does not have meat on it.", "It looks like it is covered in assorted veggies.", "It seems to have a lot of vegetables."], "image": "train2014/COCO_train2014_000000313019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529877, "question_id": "AuDrZitrua5hFZBkEe9YQM", "question": "What does the boy on the right have on backwards?", "choices": ["shirt", "tie", "baseball cap", "helmet"], "correct_choice_idx": 2, "direct_answers": ["hat", "baseball cap", "hat", "hat", "hat", "skating", "baseball cap", "baseball cap", "hat", "hat"], "difficult_direct_answer": false, "rationales": ["The boy is wearing his cap the wrong way.", "The boy on the right is not wearing a tie or helmet. his shirt is not on backwards.", "The boy on the right has his baseball cap on backwards."], "image": "train2014/COCO_train2014_000000529877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325935, "question_id": "AuHXcF8MLeAcpd9Y9o5JnV", "question": "What is he doing?", "choices": ["running away", "eating lunch", "riding bike", "delivering male"], "correct_choice_idx": 3, "direct_answers": ["walking", "walking", "walking", "walking away", "walking away", "walking", "delivering male", "walking", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["A man in a postal uniform is walking on the sidewalk. postal workers deliver mail.", "The mailman has mail.", "The man is pushing a mail cart and is dressed as a mail carrier."], "image": "train2014/COCO_train2014_000000325935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363331, "question_id": "AukMnYYY6ufaWLe22UbE3D", "question": "What are the people standing at the back?", "choices": ["pedestrians", "audience", "visitors", "residents"], "correct_choice_idx": 2, "direct_answers": ["police officers", "spectators", "watching elephants", "observing", "together", "instructors", "visitors", "safety", "spectators", "guards"], "difficult_direct_answer": true, "rationales": ["They could also be described as b.", "Visitors are looking on at the elephants.", "The people are watching the elephants perform."], "image": "train2014/COCO_train2014_000000363331.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475808, "question_id": "Av5z4MXDsZeXMu46xKgZ48", "question": "What are these bakery goods called?", "choices": ["cream puffs", "long johns", "eclairs", "cupcakes"], "correct_choice_idx": 3, "direct_answers": ["cupcakes", "cupcakes", "cupcake", "cupcakes", "cupcake", "cupcakes", "cupcakes", "cupcakes", "cupcakes", "cupcake"], "difficult_direct_answer": false, "rationales": ["The small baked cakes in a paper are referred to as 'cupcakes'. each one is an individual serving.", "These are all cupcakes that are stacked up.", "These goods are cupcakes since they're in cup size."], "image": "train2014/COCO_train2014_000000475808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87151, "question_id": "Av7fZgpoBVRnVvmKnJw6mf", "question": "What animal has a coat similar to the cushion the little girl is next to?", "choices": ["cheetah", "fish", "lion", "dog"], "correct_choice_idx": 0, "direct_answers": ["leopard", "leopard", "leopard", "cheetah", "sheep", "leopard", "leopard", "leopard", "cheetah", "leopard"], "difficult_direct_answer": false, "rationales": ["The cheetah is similar.", "A cheetah print cushion is next to a girl.", "Cheetahs are animals with recognizable and specific patterns on their coat. the pillow near the girl has a pattern similar."], "image": "train2014/COCO_train2014_000000087151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364044, "question_id": "AvAh97oYsWW8NYEQPiJJKV", "question": "What is near the jugs?", "choices": ["lemon", "flower", "monkey", "cat"], "correct_choice_idx": 1, "direct_answers": ["plants", "flower", "vase", "jug", "vase", "wall", "plant", "vase", "plant", "vase"], "difficult_direct_answer": false, "rationales": ["Flowers are near the jugs in the vase.", "The jugs have flowers.", "The jugs are near a green vase."], "image": "train2014/COCO_train2014_000000364044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79603, "question_id": "AvFbRfrCEesEy8iEnbDG7A", "question": "What is the regular activity in this area?", "choices": ["watching tv", "studying", "cooking", "skateboarding"], "correct_choice_idx": 2, "direct_answers": ["skating", "cooking", "cooking", "cooking", "skating", "cooking", "cooking", "cooking", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["You can tell by the setting in the picture as to what it is normally used for.", "The person cooks in the kitchen.", "A girl is standing on a skateboard in a room with a stove and cabinets."], "image": "train2014/COCO_train2014_000000079603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383841, "question_id": "AvL9yFca8bjL6uccEaCPvb", "question": "How can you tell that this guy likes his food a little spicy?", "choices": ["salsa", "jalapenos", "tabasco", "hot sauce"], "correct_choice_idx": 1, "direct_answers": ["jalapeno peppers", "jalapenos", "jalapenos", "jalapeno", "hot peppers", "jalapenos", "peppers", "jalapenos", "peppers", "jalapenos"], "difficult_direct_answer": false, "rationales": ["The man has jalapeno peppers on his tortillas.", "The man has prepared food that includes a green pepper based on the size and shape. this food is known to be associated with spiciness.", "The guy wants jalapenos."], "image": "train2014/COCO_train2014_000000383841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553442, "question_id": "AvdUuSbMZb5qjZHuc2hmFW", "question": "Why is the young girl holding luggage?", "choices": ["to sell", "to travel", "to pack", "to purchase"], "correct_choice_idx": 1, "direct_answers": ["to travel", "travel", "travel", "traveling", "it's hers", "travel", "she's traveling", "going trip", "leaving", "traveling"], "difficult_direct_answer": false, "rationales": ["Luggage is used for traveling. the luggage looks new and is likely used for traveling.", "The girl is traveling.", "She is traveling with her mom and posing for a picture"], "image": "val2014/COCO_val2014_000000553442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342963, "question_id": "AvhwNC2oDgTNtBA6nwzXie", "question": "Why do the women have straps around their wrists?", "choices": ["fashion", "safety", "identification", "admission"], "correct_choice_idx": 1, "direct_answers": ["retain device", "game", "playing game", "playing wii", "playing wii", "wii game", "wii controls", "playing games", "safety", "playing wii"], "difficult_direct_answer": false, "rationales": ["Women are holding video game controllers in their hands.", "The controller can sometimes fly out of their hands and hit someone when playing the video game. the straps prevent them from doing that, and keeps it on their arm.", "The controller could fly out of the hand and hurt someone."], "image": "train2014/COCO_train2014_000000342963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217989, "question_id": "AvzTZkdAhjK6qfvak4rEMh", "question": "If you needed to freeze your vodka which color is the door you would want to open first?", "choices": ["brown", "white", "glass", "chrome"], "correct_choice_idx": 3, "direct_answers": ["silver", "silver", "silver", "chrome", "brown", "black", "silver", "silver", "chrome", "grey"], "difficult_direct_answer": false, "rationales": ["It's stainless steel actually.", "The freezer on the right has a metallic, not brown, glass, or white, surface.", "The refrigerator is on the right. it is made out of a shiny metal."], "image": "val2014/COCO_val2014_000000217989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314614, "question_id": "Aw3xih2Nr6ZPebkJT2NipA", "question": "What does the man look at while eating?", "choices": ["bathroom", "salad", "mirror", "screen"], "correct_choice_idx": 3, "direct_answers": ["computer", "screen", "computer", "computer screen", "computer", "screen", "computer", "computer", "screen", "computer screen"], "difficult_direct_answer": false, "rationales": ["The man is looking at a computer screen.", "The man is looking at his computer monitor while holding his pizza.", "A keyboard and mouse are present, so he is sitting in front of a computer monitor."], "image": "train2014/COCO_train2014_000000314614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177957, "question_id": "AwQRTEacNiYWMMoowxzLFF", "question": "Why is the ball passing him?", "choices": ["inattentive", "missed it", "dropped it", "throwing set"], "correct_choice_idx": 1, "direct_answers": ["missed ball", "throwing", "it's hit", "served it", "missed it", "missed swing", "missed", "tennis shot", "not", "missed it"], "difficult_direct_answer": true, "rationales": ["You can tell by his range of motion that he missed the ball.", "The man missed the ball when swinging.", "When he swung he did not hit the ball."], "image": "train2014/COCO_train2014_000000177957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206377, "question_id": "AwcHQNnq2Abg2UA8EK8MCJ", "question": "What kind of diet do they adhere to?", "choices": ["carnivore", "omnivore", "monophagous", "herbivore"], "correct_choice_idx": 1, "direct_answers": ["omnivore", "vegetarian", "vegetarian", "omnivorous", "omnivore", "meat", "berries", "fish berries", "meat", "omnivore"], "difficult_direct_answer": false, "rationales": ["Bears are together in a grassy, wooded area.", "These bears adhere to an omnivorous diet.", "Bears eat other animals, but also plants and berries."], "image": "train2014/COCO_train2014_000000206377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150769, "question_id": "AwhHJTBXunG8YcCTTB9EPH", "question": "What material are the pipes which are holding up the old bike?", "choices": ["concrete", "wood", "sand", "plastic"], "correct_choice_idx": 1, "direct_answers": ["metal", "wood", "steel", "steel", "steel", "steel", "aluminum", "metal", "metallic", "steel"], "difficult_direct_answer": false, "rationales": ["They are flat slats.", "There are wooden planks beneath the metal pipes.", "The majority of the pipes are made out of wood."], "image": "train2014/COCO_train2014_000000150769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268737, "question_id": "Awnrrxjr9opRrBizo58VFW", "question": "Why is the road shiny?", "choices": ["it's wet", "it's new", "it's night", "it's polished"], "correct_choice_idx": 0, "direct_answers": ["wet", "rain", "raining", "rain", "rain", "wet", "it's wet", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The road has rain on it making it reflect from the lights.", "It appears as if it has been raining, and when the ground is wet it reflects the lights from the cars and other items.", "When the road is wet, it looks shiny."], "image": "train2014/COCO_train2014_000000268737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341503, "question_id": "AwtkRzmnB96WXeGGfsGLwb", "question": "What vehicles are being showcased here?", "choices": ["motorcycles", "vans", "monster trucks", "cars"], "correct_choice_idx": 2, "direct_answers": ["demolition", "monster truck", "custom cars", "monster trucks", "monster trucks", "atv", "monster car", "road boss", "big trucks", "monster trucks"], "difficult_direct_answer": false, "rationales": ["The vehicle has huge tires and is lifted very high off of the ground.", "As indicated by the massive wheels and the monster style font for the road boss.", "The size of the tires of this car indicates that it is a monster car."], "image": "train2014/COCO_train2014_000000341503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188752, "question_id": "Ax3kpNYT3h7r3WHtw3ct6V", "question": "What can hanging the bananas avoid?", "choices": ["being damaged", "black spots", "being touched", "being stolen"], "correct_choice_idx": 1, "direct_answers": ["black spots", "falling", "rotting fast", "banana", "rats", "theft", "bruises", "rot", "rotting", "bugs"], "difficult_direct_answer": true, "rationales": ["Hanging up bananas can keep them fresh longer and avoid them going bad.", "Bananas should be hung to slow down the ripening process and to prevent bruising.", "The bananas want to avoid black spots."], "image": "val2014/COCO_val2014_000000188752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160407, "question_id": "AxLg36Mgeouw4dUMqriQH7", "question": "What disease are they concerned about?", "choices": ["diabetes", "cancer", "fibroids", "pneumonia"], "correct_choice_idx": 0, "direct_answers": ["diabetes", "diabetes", "diabetes", "diabetes", "diabetes", "diabetes", "diabetes", "diabetes", "diabetes", "diabetes"], "difficult_direct_answer": false, "rationales": ["Answer a is written in a different language visibly behind the people which means they might be associated with it.", "The disease is diabetes.", "They are trying to have a photo with the bus that states the condition they are trying to bring light to."], "image": "train2014/COCO_train2014_000000160407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126869, "question_id": "AxZDdQ3RcTtvrtjJRehmc6", "question": "How many Ossicones do giraffe's have?", "choices": ["three", "two", "five", "one"], "correct_choice_idx": 1, "direct_answers": ["two", "unknown", "two", "two", "unknown", "two", "two", "unknown", "two", "two"], "difficult_direct_answer": false, "rationales": ["The giraffe has two ossicones.", "This is another name for horns, which the giraffe has on its head.", "There are two horn like items on top of the giraffe's head."], "image": "train2014/COCO_train2014_000000126869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579591, "question_id": "Axn2ksotohRGJMgpqaTaVC", "question": "What is the dog doing near the man's feet?", "choices": ["resting", "playing", "bathing", "eating"], "correct_choice_idx": 0, "direct_answers": ["sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "napping", "resting", "napping", "sleeping"], "difficult_direct_answer": false, "rationales": ["The dog is laying on the floor with his head laid out on the carpet.", "The dog is taking a nap.", "The dog is visible and lying down with eyes not fully open which would be consistent with answer a."], "image": "train2014/COCO_train2014_000000579591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104556, "question_id": "Ay4VuektSZo78iRCHJNYFo", "question": "The cars are parked on the street during which season?", "choices": ["winter", "spring", "fall", "summer"], "correct_choice_idx": 0, "direct_answers": ["winter", "winter", "winter", "winter", "winter", "winter", "spring", "winter", "working season", "winter"], "difficult_direct_answer": false, "rationales": ["The sky is overcast and looks like a dreary cold day", "There are trees with no leaves on them", "The tree branches are bare."], "image": "train2014/COCO_train2014_000000104556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246066, "question_id": "AyCaqid8mAVorytxCqH5Hh", "question": "What forms the shadow on the woman?", "choices": ["tree", "vehicle", "building", "animal"], "correct_choice_idx": 0, "direct_answers": ["sunlight", "tree", "trees", "tree", "sun", "trees", "tree", "tree", "sun reflection", "tree shade"], "difficult_direct_answer": false, "rationales": ["The tree is towering over the woman.", "The building and vehicle are behind the woman and are not blocking the sun. there are no animals near the woman.", "There are shadows that look like leaves"], "image": "val2014/COCO_val2014_000000246066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416668, "question_id": "AyHVUyvCxr5ez3nCRNoGTU", "question": "What is the cat going to do?", "choices": ["watch tv", "wash face", "bath", "wash hands"], "correct_choice_idx": 2, "direct_answers": ["take bath", "attack", "bath", "looking up", "pounce", "attack", "nothing", "pounce", "in sink", "get wet"], "difficult_direct_answer": false, "rationales": ["He is in the sink and will get washed.", "A cat is in a bathroom sink.", "The cat will bathe."], "image": "val2014/COCO_val2014_000000416668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351422, "question_id": "AyNogVVZNiXcffB494CQQ4", "question": "How many awnings are there?", "choices": ["eight", "three", "none", "four"], "correct_choice_idx": 1, "direct_answers": ["three", "many", "three", "three", "three", "three", "three", "three", "many", "many"], "difficult_direct_answer": false, "rationales": ["You can see the coverings that there are three of them.", "There are three seen close to the field.", "There are three awnings near the racetrack."], "image": "train2014/COCO_train2014_000000351422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433023, "question_id": "AyZPBXGfQe4CjBfLB2wWyS", "question": "The room here might be found where?", "choices": ["luxury hotel", "prison", "car", "train"], "correct_choice_idx": 3, "direct_answers": ["train", "train", "train", "hotel", "boat", "boat", "restaurant", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["This long room could be found on a train.", "The room is displaying the width that would be consistent with a train.", "A large red wood table with some chairs on both sides are in a narrow corridor. there are many windows on one side of corridor."], "image": "train2014/COCO_train2014_000000433023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515062, "question_id": "Ayf5xAmCLVpBPmmLobRutC", "question": "What person would avoid this food?", "choices": ["diabetic", "pescatarian", "glutton", "vegetarian"], "correct_choice_idx": 0, "direct_answers": ["diabetic", "diabetic", "diabetic", "diabetic", "diabetes", "diabetic", "diabetic", "diabetes", "diabetic", "diabetic"], "difficult_direct_answer": false, "rationales": ["The items are cupcakes that are covered in chocolates or candies. these items would raise a person's blood sugar.", "It is a sweet cupcake.", "These are desserts, the main ingredient of which is sugar. diabetics should avoid sugar to not elevate their glucose levels."], "image": "train2014/COCO_train2014_000000515062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502629, "question_id": "AyztzvXn8QPqKtho5UQqqu", "question": "What do men do to sheep?", "choices": ["kill them", "nothing", "beat them", "grab them"], "correct_choice_idx": 3, "direct_answers": ["sheer them", "wrangle", "shear", "grab them", "shear them", "hug", "shear", "shear them", "holding", "sheer"], "difficult_direct_answer": false, "rationales": ["The men and the sheep are clearly visible. the men have their arms wrapped around the sheep consistent with answer a.", "You can tell by what the men are doing as to how to answer this.", "The men in the image are all grabbing the sheep in the image."], "image": "train2014/COCO_train2014_000000502629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130875, "question_id": "AzefHmpTeYJPhUKfQXpZ3b", "question": "How many people is the food on the tray meant to serve?", "choices": ["seven", "two", "thirteen", "eight"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["Two glasses of orange juice and two bananas, among other items are on a tray.", "The tray appears to include two of many items that would usually be served one per meal.", "There are two sets of every item on the food tray."], "image": "val2014/COCO_val2014_000000130875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178156, "question_id": "AzsV3JfxTKCzazUkAEqawb", "question": "What does the color on the stop light mean?", "choices": ["stop", "yield", "cross", "go"], "correct_choice_idx": 1, "direct_answers": ["caution", "yield", "caution", "slow down", "slow down", "caution", "caution", "slow down", "yield", "yield"], "difficult_direct_answer": false, "rationales": ["The color on the stop light is orange. an orange light intended for traffic often means proceeding forward is acceptable under some circumstances.", "A yellow light means caution it's about to turn red.", "You should slow down if you see yellow."], "image": "val2014/COCO_val2014_000000178156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468613, "question_id": "B22MZyawAu9vZDqhLW5UJv", "question": "What fungus is being added to this pie?", "choices": ["lichens", "squash", "mushrooms", "algae"], "correct_choice_idx": 2, "direct_answers": ["mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms"], "difficult_direct_answer": false, "rationales": ["The chef is putting slices of mushrooms on the pizza.", "The man is making a pizza and topping it with fungi called mushrooms.", "People add mushrooms to their pizza along with other toppings"], "image": "val2014/COCO_val2014_000000468613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290498, "question_id": "B22xbHjuunBMTdeH6W6Y4s", "question": "Why is the woman wearing kneepads?", "choices": ["protection", "for halloween", "to wrestle", "cosplaying"], "correct_choice_idx": 0, "direct_answers": ["safety", "protection", "safety", "protection", "protection", "fall protection", "protection", "safety", "protection", "protection"], "difficult_direct_answer": false, "rationales": ["To avoid broken bones or another injury in case she falls off her skateboard.", "The activity that the woman is participating in carries a huge risk of falling, which is why she needs kneepads for knee protection.", "They protect the knees when one falls."], "image": "train2014/COCO_train2014_000000290498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177280, "question_id": "B29mnuCDiZ6DALYvfou8MZ", "question": "How would the tide be described?", "choices": ["low", "very high", "very low", "high"], "correct_choice_idx": 2, "direct_answers": ["low", "low", "low tide", "low", "low", "very low", "low", "low", "low", "low"], "difficult_direct_answer": false, "rationales": ["A lot of land is visible.", "People are standing on the sand. there is minimal water standing on the sand.", "You can see where the tide is super low and there is just sand there with a little water here and there."], "image": "train2014/COCO_train2014_000000177280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441156, "question_id": "B2NLFNK32DFYCjsnkfFCG5", "question": "What things might the person with the camera take photos of today?", "choices": ["wooly mammoths", "snowshoe crabs", "elephants", "whale"], "correct_choice_idx": 2, "direct_answers": ["safari", "elephants", "elephant", "elephants", "elephants", "elephants", "elephants", "animals", "elephant", "wildlife"], "difficult_direct_answer": false, "rationales": ["The person with the camera is likely to take photos of elephants on this day since there an one nearby.", "There is an elephant standing behind him.", "The elephants are the subject."], "image": "val2014/COCO_val2014_000000441156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485613, "question_id": "B38KEsjLWzNPDDLnN98iXa", "question": "What feature distinguishes this animal from a dog?", "choices": ["ears", "paws", "whiskers", "fur"], "correct_choice_idx": 2, "direct_answers": ["stripes", "face", "sound", "whiskers", "whiskers", "mane", "sound", "meow", "nose", "usually smaller"], "difficult_direct_answer": false, "rationales": ["Dog paws are bigger.", "Dogs do not have prominent whiskers to such a degree; cat whiskers are much longer and this animal has long whiskers.", "This animal is a cat. cats and dogs both have paws, fur, and ears."], "image": "val2014/COCO_val2014_000000485613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574808, "question_id": "B3FSiNSoh5E9syuKdmGMpz", "question": "What genders are allowed at this school?", "choices": ["none", "co-ed", "boy's", "girl's"], "correct_choice_idx": 1, "direct_answers": ["both", "coded", "all", "co-ed", "both", "both genders", "both genders", "all", "both", "male"], "difficult_direct_answer": false, "rationales": ["There are both males and females visible in caps and gowns in this pictures. caps and gowns commemorate those who have graduated from a school so for someone to be wearing the attire they were likely allowed in the school and have completed it.", "Both males and females are wearing graduation caps.", "The school is co-ed."], "image": "val2014/COCO_val2014_000000574808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414813, "question_id": "B3HbmHaHYGVfaYf4X4nQJX", "question": "The planes were likely used for what transportation purpose?", "choices": ["cargo", "waterways", "military", "passenger"], "correct_choice_idx": 2, "direct_answers": ["military", "bombing", "military personnel", "bombers", "air", "military", "mission", "persons", "bomb", "flying"], "difficult_direct_answer": true, "rationales": ["They are small planes that can only seat one or 2 people, which resemble those of fighter jets used in the wars.", "The plane is displayed with other planes so it should be in the same category.", "Multiple planes have air force markings. they are fighters, not cargo planes."], "image": "val2014/COCO_val2014_000000414813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330098, "question_id": "B3nT7op4fWpBpPLeo7gsio", "question": "What is the white item folded in the corner?", "choices": ["table", "bed sheet", "poster", "box"], "correct_choice_idx": 0, "direct_answers": ["table", "table", "bag", "table", "bag", "table", "table", "table", "table", "table"], "difficult_direct_answer": false, "rationales": ["Tables can be folded for storage.", "You can tell by the way it's folded and the fact that it is plastic you can tell what it is.", "This is obvious. the other options don't match."], "image": "val2014/COCO_val2014_000000330098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559995, "question_id": "B48CPZtKxCXvNt2UuZCPkz", "question": "What is the black car near the green umbrella doing?", "choices": ["weaving", "turning", "parking", "speeding"], "correct_choice_idx": 2, "direct_answers": ["parked", "parked", "it's parked", "parking", "parking", "parking", "standing", "walking", "turning", "parked"], "difficult_direct_answer": false, "rationales": ["The car is parking.", "The black car near the green umbrella is by the curb and is not moving, which indicates that the car is in a parking state.", "There is a few cars in a row stationary along the curb of street."], "image": "train2014/COCO_train2014_000000559995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421810, "question_id": "B4CDCcZKPheSN9rRJuN78g", "question": "What allows the red kites to fly?", "choices": ["tails", "doldrums", "poles", "string"], "correct_choice_idx": 2, "direct_answers": ["long stick", "poles", "tails", "poles", "wind", "wind", "wind gravity", "wind", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["The kites are sticking to poles.", "One can see that they are attached to the rods that are sticking out of the ground.", "They are sitting on top of the poles."], "image": "train2014/COCO_train2014_000000421810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391150, "question_id": "B4Caf9i3ULFCgdVZHDfN3Y", "question": "What animal are the horses looking at?", "choices": ["cat", "gorilla", "horse", "dog"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["You can tell by it's shape, fur and posture what animal it is.", "The horses are looking in the direction of the canine.", "The horses look at the dog."], "image": "train2014/COCO_train2014_000000391150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94061, "question_id": "B4NedGigEDwQ6dKE5Vwxnf", "question": "What are the white flecks on the hanging food?", "choices": ["salt", "cheese", "mold", "garlic"], "correct_choice_idx": 0, "direct_answers": ["salt", "salt", "salt", "salt", "salt", "salt", "salt", "salt", "salt", "salt"], "difficult_direct_answer": false, "rationales": ["The flecks are salt.", "These are pretzels and this item usually has salt on it.", "Pretzels come with or without large grain salt on them."], "image": "train2014/COCO_train2014_000000094061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257830, "question_id": "B4NjaQNYSUpqcTdKYMPtJX", "question": "What type of trees are visible here?", "choices": ["palms", "deciduous", "olives", "conifers"], "correct_choice_idx": 3, "direct_answers": ["fir", "pine", "pine trees", "fir trees", "pine", "fir tree", "pine", "fir trees", "evergreens", "conifers"], "difficult_direct_answer": false, "rationales": ["The trees are still green even though it is winter with snow on the ground. they are evergreen trees and belong to this group of cone bearing trees.", "There are pine trees.", "The big tree is a conifer."], "image": "train2014/COCO_train2014_000000257830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50878, "question_id": "B4iWYWZPvxTQsN8LztRm2h", "question": "What are the children wearing?", "choices": ["space suits", "bathing suits", "snowsuits", "safari suits"], "correct_choice_idx": 2, "direct_answers": ["snow suits", "snow suit", "snowsuits", "snow suits", "snowsuits", "helmets", "ski jackets", "snowsuits", "snow suits", "snowsuits"], "difficult_direct_answer": false, "rationales": ["The children are in the snow.", "The children are near a ski hill so they are dressed in snowsuits.", "The children are standing near skis. it is too cold to swim or go on safari."], "image": "train2014/COCO_train2014_000000050878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130518, "question_id": "B4qBhrL3FjbFnRYRCECJ7j", "question": "The antenna on the electric device to the right of the cap broadcasts what type of signal?", "choices": ["wi-fi", "radio", "cellular phone", "television"], "correct_choice_idx": 0, "direct_answers": ["wifi", "wifi", "digital", "wifi", "wifi", "radio", "wi-fi", "wifi", "wifi", "analog"], "difficult_direct_answer": false, "rationales": ["There are modern computers visible and most modern computers use internet. the object in question is the right size and shape to be a wifi router which would be consistent with the equipment's need for internet.", "The antenna is for wifi.", "The antenna ensures that people can connect to wifi."], "image": "train2014/COCO_train2014_000000130518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359791, "question_id": "B4tawAJY5jXuqCYAPVAyCb", "question": "What do the umbrellas tell you about the weather?", "choices": ["its windy", "sunny outside", "cold", "its rainy"], "correct_choice_idx": 3, "direct_answers": ["rainy", "rainy", "raining", "rainy", "raining", "rain", "rainy", "its rainy", "raining", "it's raining"], "difficult_direct_answer": false, "rationales": ["It's raining on them.", "By the darkened sky it will or is likely to start raining.", "The umbrellas are used mostly to protect from water."], "image": "val2014/COCO_val2014_000000359791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447574, "question_id": "B56RSJJFnrhtmJ6RMZ6Yvf", "question": "Which of the food will most likely be eaten with silverware?", "choices": ["none", "bread", "soup", "sandwich"], "correct_choice_idx": 2, "direct_answers": ["soup", "sauce", "soup", "soup", "soup", "soup", "soup", "soup", "sauce", "soup"], "difficult_direct_answer": false, "rationales": ["Of the food visible, everything except answer a could reasonably be eaten by hands, but the answer a visible is a liquid and would require utensils.", "Here we see a bowl of soup and a sandwich on a plate. a sandwich can and usually is eaten from the hands, but soup requires a spoon to eat.", "You need to eat the soup with a spoon."], "image": "train2014/COCO_train2014_000000447574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238948, "question_id": "B5Bwo2umr2eDWPmHNpvTXJ", "question": "Which actor has the training to do what these people are doing?", "choices": ["russell crowe", "kevin james", "gabourey sidibe", "jacob tremblay"], "correct_choice_idx": 0, "direct_answers": ["antonio banderas", "john wayne", "stunt man", "viggo mortenson", "tom sawyer", "horseback riding", "horseback riding", "russell crowe", "horse people", "horse teacher"], "difficult_direct_answer": true, "rationales": ["Russell crowe rode a horse in the movie gladiator.", "The people are clearly riding horses and the actor in answer a claims to know this skill based on an internet search.", "This is the only actor of the four who has ridden a horse in a film."], "image": "val2014/COCO_val2014_000000238948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82338, "question_id": "B5JH4pw4dAe3vSSrfXr7WX", "question": "How many cattle are there in image?", "choices": ["four", "one", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two cows", "one", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two cattle.", "Two cows are standing side by side in the street.", "There are six legs visible which makes sense for cows standing close together like these."], "image": "val2014/COCO_val2014_000000082338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406055, "question_id": "B5iAKThx9XKyEftjwPwMa8", "question": "Where is this game being played?", "choices": ["gym", "stadium", "beach", "recess"], "correct_choice_idx": 1, "direct_answers": ["stadium", "baseball pitch", "baseball stadium", "stadium", "baseball field", "baseball stadium", "baseball field", "baseball stadium", "baseball stadium", "rak stadium"], "difficult_direct_answer": false, "rationales": ["The baseball players are at a stadium to play their game in front of their fans.", "They are playing professional baseball which takes place in an arena with a large seating capacity.", "The ball park with the audience stands are generally called that."], "image": "train2014/COCO_train2014_000000406055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442322, "question_id": "B5kijKjWKUPTTQTp6WfFiC", "question": "What play is the best case scenario for the batter?", "choices": ["home run", "foul", "walk", "strike out"], "correct_choice_idx": 0, "direct_answers": ["home run", "strike out", "home run", "strike out", "home run", "home run", "home run", "home run", "strike out", "home run"], "difficult_direct_answer": false, "rationales": ["The best hit a batter can get is hitting it out of the park, which is called a home run.", "The play is a home run.", "The batter wants to successfully hit the ball and the most runs are scored by hitting the ball out of the park."], "image": "train2014/COCO_train2014_000000442322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24195, "question_id": "B5r7Gqs6952TzwuThVyqtB", "question": "What city is this?", "choices": ["chicago", "fremont", "oakland", "broadway"], "correct_choice_idx": 2, "direct_answers": ["oakland", "brooklyn", "oakland", "oakland", "oak city", "oakland", "unknown", "oak", "oakland", "oakland"], "difficult_direct_answer": false, "rationales": ["A city bus has logos on the back of it mentioning \"oak\".", "The name is partially visible in the website on the rear.", "The bus has the word oakland."], "image": "val2014/COCO_val2014_000000024195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216038, "question_id": "B5utfW6XKYZcN6MciWbdFA", "question": "What is the camera on the left setup on?", "choices": ["selfie stick", "table", "tripod", "mixer"], "correct_choice_idx": 2, "direct_answers": ["tripod", "tripod", "tripod", "tripod", "tripod", "tripod", "tripod", "tripod", "tripod", "tripod"], "difficult_direct_answer": false, "rationales": ["A room in a home has a camera in the corner on a stand with three legs.", "There is a tripod that connects the camera.", "The camera is a tripod with three legs."], "image": "train2014/COCO_train2014_000000216038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58149, "question_id": "B5wgbfqe5a4eGi5uxxwSeU", "question": "How many layers in skateboard?", "choices": ["six", "five", "three", "four"], "correct_choice_idx": 2, "direct_answers": ["seven", "three", "two", "seven", "9 layers", "seven", "one", "one", "multi", "n/a"], "difficult_direct_answer": false, "rationales": ["It's made out of wheels, metal hardware and a wooden deck.", "It is made up of several layers to make it sturdy.", "The skateboard has the middle part, and separate pieces stuck to each side for design and for grip."], "image": "train2014/COCO_train2014_000000058149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234929, "question_id": "B63MqKtfKCqvKVH982RpVC", "question": "What was used to build this plane?", "choices": ["metal", "paper", "plastic", "legos"], "correct_choice_idx": 3, "direct_answers": ["legos", "legos", "legos", "legos", "legos", "legos", "plastic", "patience", "plastic", "legos"], "difficult_direct_answer": false, "rationales": ["The materials used for the construction of the plane contain little indentations which can be snapped into place and are consistent with this type of toy.", "The plane was built with lego pieces.", "The legos were used."], "image": "train2014/COCO_train2014_000000234929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310879, "question_id": "B645cFDHywjUpG2Fc46pGT", "question": "Where are these birds?", "choices": ["ocean", "river", "lake", "pond"], "correct_choice_idx": 0, "direct_answers": ["seagulls", "beach", "seagulls", "seagulls", "seagulls", "sea", "ocean", "pelicans", "ducks", "water"], "difficult_direct_answer": false, "rationales": ["The birds are all just hanging out in the waves.", "Birds are standing at the shore of a large body of water.", "They are on a beach where the tide went out. tides happen in a sea."], "image": "val2014/COCO_val2014_000000310879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558483, "question_id": "B6WHc7uD2zoaw82SssG5r2", "question": "What has dried up and stopped the boats from moving?", "choices": ["water", "grass", "ice", "clouds"], "correct_choice_idx": 0, "direct_answers": ["water", "river", "river", "water", "river", "water", "water", "river", "water", "river"], "difficult_direct_answer": false, "rationales": ["Water has dried.", "Boats sit in the sand in a depression that appears to be a dried out waterway.", "Boats need water to be able to float on."], "image": "val2014/COCO_val2014_000000558483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418535, "question_id": "B6Z6uWMW5kYYdeMrUEYU3C", "question": "This man is most likely a descendant of which historical figure?", "choices": ["temujin", "diocletian", "zoroaster", "rollo"], "correct_choice_idx": 0, "direct_answers": ["mao", "temujin", "al capone", "buddha", "hiro hito", "unknown", "genghis khan", "kim jongil", "no clue", "emperor hirohito"], "difficult_direct_answer": true, "rationales": ["An asian man in a suit is walking in the street.", "The man looks to be asian so that makes more sense than the others listed.", "The man is a descendant of temujin."], "image": "val2014/COCO_val2014_000000418535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559949, "question_id": "B6jF5Nb889ghRAgZJ9xrCE", "question": "Why is the man standing in front of the woman wearing the jacket?", "choices": ["to stalk", "to hug", "to photograph", "to tackle"], "correct_choice_idx": 2, "direct_answers": ["dressed up", "suit", "taking photo", "take picture", "pictures", "taking photo", "taking picture", "proposing", "to photograph", "photographing"], "difficult_direct_answer": true, "rationales": ["The person is the photographer.", "The man seems to have a camera in his hands and is about to take a picture.", "The man has a camera with him."], "image": "train2014/COCO_train2014_000000559949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316704, "question_id": "B6n4hKduvXtfjhiGJ6P2fG", "question": "What does the woman do with her phone?", "choices": ["take photo", "call", "text", "auto dial"], "correct_choice_idx": 0, "direct_answers": ["take video", "taking pictures", "take picture", "take photos", "takes picture", "take picture", "photograph", "take photo", "picture", "take picture"], "difficult_direct_answer": false, "rationales": ["The woman is taking a picture of the man with shaving cream on his head.", "The woman takes a photo with her phone of the man with whipped cream on his head.", "The woman on the right is holding her phone to take a picture with it."], "image": "val2014/COCO_val2014_000000316704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419037, "question_id": "B6pTmK3BZgb9PWfiGsGNZn", "question": "What are the men kicking?", "choices": ["ball", "criminal", "can", "toilet paper"], "correct_choice_idx": 0, "direct_answers": ["ball", "ball", "soccer ball", "soccer ball", "soccer ball", "ball", "ball", "soccer ball", "soccer ball", "soccer ball"], "difficult_direct_answer": false, "rationales": ["The men are wearing soccer uniforms and are kicking a circular item.", "The man has a ball.", "The man is playing soccer."], "image": "train2014/COCO_train2014_000000419037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79645, "question_id": "B6xLppqdCwbrvDbKdo6MKD", "question": "What is the yellow part at the front of the train for?", "choices": ["bumper", "passenger exit", "decoration", "emergency exit"], "correct_choice_idx": 3, "direct_answers": ["lighting", "exit", "bullet train", "emergency exit", "boarding passengers", "emergency exit", "handicapped", "visibility", "bumper", "safety line"], "difficult_direct_answer": true, "rationales": ["The yellow door is for emergency use only.", "There is a door in the front part of the train. if there is a fire or something dire, people can escape from it.", "The front part of the train normally doesn't open for passengers but the yellow door opens in emergencies."], "image": "train2014/COCO_train2014_000000079645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38525, "question_id": "B6z2w3XQ9yDJqESRXY43pc", "question": "What color are the leaves on the sheet over the top of the dog?", "choices": ["red", "purple", "yellow", "green"], "correct_choice_idx": 0, "direct_answers": ["yellow", "green", "tan", "green", "green", "red", "green", "orange", "green", "green"], "difficult_direct_answer": false, "rationales": ["The leaves are green", "The flowers are green and yellow. the leaves are a different color and are not purple.", "The leaves are a maroon color."], "image": "train2014/COCO_train2014_000000038525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60814, "question_id": "B7DBD8PVAUemCZZavzJmhf", "question": "What is behind the truck?", "choices": ["ape", "club", "reindeer", "boat"], "correct_choice_idx": 3, "direct_answers": ["boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["This is obvious and many people put them on trailers/hitches as seen here.", "The truck is hauling a boat behind it.", "It is shaped to float in the water and carry people in it."], "image": "train2014/COCO_train2014_000000060814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557323, "question_id": "B7KqmMyKK2bH3SKfUEJ7fh", "question": "What are all the people looking at?", "choices": ["goal", "other payers", "player", "jumbotron"], "correct_choice_idx": 0, "direct_answers": ["soccer ball", "soccer game", "players", "player", "soccer game", "game", "goal", "player", "football game", "soccer match"], "difficult_direct_answer": false, "rationales": ["A soccer player is kicking the ball while the audience watches.", "He is about to make a big kick with the ball in front of him", "The fans are watching the goal to see if the player makes the ball in the goal."], "image": "val2014/COCO_val2014_000000557323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346026, "question_id": "B7aHjegfh3bAYA5FkgSFZC", "question": "What are the pigeons doing?", "choices": ["singing", "sleeping", "finding food", "resting"], "correct_choice_idx": 2, "direct_answers": ["wandering", "walking", "walking", "foraging", "walking", "finding food", "walking", "eating food", "walking", "eating"], "difficult_direct_answer": false, "rationales": ["The pigeons want food.", "The pigeons are scrapping around for a crumb or tidbit after all the people have left for the day, and with any luck, they'll track down a bite or two.", "The pigeons are awake. they are not singing."], "image": "train2014/COCO_train2014_000000346026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535808, "question_id": "B7cox9mcXEk7yWGRMvrNVu", "question": "What are the pictures on the black stand?", "choices": ["dvds", "postcards", "movie posters", "videos"], "correct_choice_idx": 1, "direct_answers": ["postcards", "postcards", "postcards", "postcards", "postcard", "postcards", "postcards", "postcard", "postcards", "magazines"], "difficult_direct_answer": false, "rationales": ["Cards for mailing short messages are on the rack.", "The pictures are the familiar rectangular shape and they have pictures of tourist attractions on them.", "There are postcards in the black stand."], "image": "val2014/COCO_val2014_000000535808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7504, "question_id": "B7qA5WQs6c46znLLCQUZ8M", "question": "How are the clocks powered?", "choices": ["solar", "gas", "nuclear", "battery"], "correct_choice_idx": 3, "direct_answers": ["battery", "battery", "batteries", "battery", "battery", "battery", "batteries", "battery", "batteries", "battery"], "difficult_direct_answer": false, "rationales": ["The clocks are not connected to anything. each clock has an internal power source.", "The clocks have batteries that keep them charged.", "The clocks have a battery."], "image": "train2014/COCO_train2014_000000007504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33647, "question_id": "B8AYo2r6YqWYqEacRXeUiQ", "question": "How many months till the leaves turn green?", "choices": ["3-4", "1-2", "5-7", "10-12"], "correct_choice_idx": 2, "direct_answers": ["5-7", "six", "3 months", "six", "five", "soon", "six", "four", "five", "many"], "difficult_direct_answer": false, "rationales": ["The leaves turn yellow/brown in the fall and turn green in the spring. from fall to spring is almost 1/2 of a year.", "It looks to be fall still", "The leaves will turn green in two seasons."], "image": "val2014/COCO_val2014_000000033647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335433, "question_id": "B8HqW3T6o9MRQAGCoeZX4H", "question": "What is on top of the sandwich?", "choices": ["apple", "toothpick", "syrup", "dressing"], "correct_choice_idx": 1, "direct_answers": ["toothpick", "toothpick", "toothpick", "toothpick", "toothpick", "stick", "bread", "toothpick", "toothpick", "toothpick"], "difficult_direct_answer": false, "rationales": ["A toothpick holds the sandwich together.", "Traditionally large sandwiches will have a toothpick to keep it together.", "It helps to hold the sandwich together and you can use it to pick food out of your teeth when done eating."], "image": "train2014/COCO_train2014_000000335433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539754, "question_id": "B8NWSCm7efPJ2Ej2MLRnG5", "question": "What might the child be doing to the photographer?", "choices": ["photographing them", "complaining", "quitting", "calling them"], "correct_choice_idx": 0, "direct_answers": ["taking picture", "taking picture", "photographing", "photographing", "photographing", "taking pictures", "taking picture", "taking photo", "photographing them", "taking picture"], "difficult_direct_answer": false, "rationales": ["The device the boy is holding has a lens and camera flash pointed forwards.", "They are holding up a camera phone, with the lens facing the person taking the photo.", "The phone in the child's hands is positioned in such a way that his most likely activity would be using the phone to take a picture of the person who's also photographing him."], "image": "train2014/COCO_train2014_000000539754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345802, "question_id": "B8T3cPHVurcsJCyaeeJCWM", "question": "What is making the thin lines in the snow?", "choices": ["snakes", "birds", "tires", "skis"], "correct_choice_idx": 3, "direct_answers": ["skis", "skis", "stick", "stick", "pole", "ticke", "skis", "skis", "skis", "skiis"], "difficult_direct_answer": false, "rationales": ["The skis make the lines.", "The skis are thin and so are the lines they are riding through.", "There are no non-human animals or vehicles. the people doing a winter activity are making the thin lines."], "image": "train2014/COCO_train2014_000000345802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102577, "question_id": "B8VFJwZzKaCRVbtbZDMcbi", "question": "What is above this person?", "choices": ["drone", "cloud", "parasail", "crane"], "correct_choice_idx": 2, "direct_answers": ["parasail", "parachute", "parasailing", "water boarding", "kite", "kite", "sail", "kite", "sky", "kite"], "difficult_direct_answer": false, "rationales": ["Based on the equipment the person is using and the harness they are strapped into, the object above them at the other end of the straps would be answer a.", "The man is holding onto a parasailing tow-rope and he is hanging in the air, so it must be a parasail above that is elevating him this way.", "The person is flying in the air above the water, which means something must be pulling him from above, such as a kite. in thus manner, it is called a parasail"], "image": "val2014/COCO_val2014_000000102577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225183, "question_id": "B8cqzv7mzvD3TDfWVz4FWo", "question": "What is the best reason for these police to ride these animals?", "choices": ["speed", "save energy", "height advantage", "mobility"], "correct_choice_idx": 2, "direct_answers": ["height advantage", "security", "higher vantage", "mobility", "horse", "crowd control", "law enforcement", "higher view", "crowd control", "during parades"], "difficult_direct_answer": true, "rationales": ["The best reason is because the police can tower over other people for a better view.", "Being on horseback allows the police to be able to see above the crowd so they are aware of what is happening in their vicinity.", "It gives them a better line of sight. it can also give them b, c and d, but it's primarily line of sight."], "image": "val2014/COCO_val2014_000000225183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184493, "question_id": "B8dZtPigeUsjd8QjLj459M", "question": "What shot is he using to hit the ball?", "choices": ["overhead", "volley", "backhand", "forehand"], "correct_choice_idx": 3, "direct_answers": ["forehand", "forehand", "forehand", "forehand", "forehand", "forehand", "forearm shot", "back", "volly", "side shot"], "difficult_direct_answer": false, "rationales": ["He's holding the racket in a forehand position and he's just about to hit the ball, so it's obvious that a forehand is what he'll be hitting the ball with.", "The front hand is being used.", "The man is holding his tennis racquet and swinging his arm forward in a forehand shot."], "image": "train2014/COCO_train2014_000000184493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543678, "question_id": "B8ikTppQccwbFQoVqCov26", "question": "What kind of shoes is the skater wearing?", "choices": ["vans", "puma", "airwalk", "nike"], "correct_choice_idx": 3, "direct_answers": ["nikes", "sneakers", "nike", "vans", "tennis shoes", "nike", "nike", "tennis shoes", "nike", "nikes"], "difficult_direct_answer": false, "rationales": ["There is a elongated check mark on the side of the shoe.", "The nike logo is on the man's shoes.", "The skater is wearing the brand of tennis shoes whose logo is a well-known swoosh."], "image": "train2014/COCO_train2014_000000543678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342946, "question_id": "B8oX5PUoHpiLBuUnepHJx3", "question": "Who speaks at this moment?", "choices": ["girl", "no one", "clowns", "caller"], "correct_choice_idx": 3, "direct_answers": ["phone", "caller", "conversational partner", "caller", "other person", "cellphone", "caller", "caller", "phone caller", "receiver"], "difficult_direct_answer": false, "rationales": ["The caller is talking.", "The caller is talking since the woman shown is listening.", "The woman seems to be listening to the caller."], "image": "train2014/COCO_train2014_000000342946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121651, "question_id": "B8v8oyksmSzwGXRC7ZZ9uV", "question": "What does the side of the nearest boat want you to visit?", "choices": ["website", "brazil", "japan", "restaurant"], "correct_choice_idx": 0, "direct_answers": ["can't see", "tourist site", "website", "website", "website", "website", "docks", "tour experience", "island", "website"], "difficult_direct_answer": false, "rationales": ["There is a url ending in .com on the side of the boat.", "The writing on the side is readable and ends in a \".com\" which is a common designation for a website. website being displayed in this manner is likely to try to get people to go to the site.", "It's difficult to see but i think i may see a website address."], "image": "train2014/COCO_train2014_000000121651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384617, "question_id": "B8xzR6WbuQyAyQQhMus4B4", "question": "Why are there letters on the green banners?", "choices": ["rules regulations", "sign-ups", "directions", "company sponsorships"], "correct_choice_idx": 3, "direct_answers": ["company", "on", "sponsors", "company sponsorships", "sponsors", "promotional poster", "advertisement", "advertising", "advertisement", "advertising"], "difficult_direct_answer": false, "rationales": ["There are ads.", "There is a registered trademark after the lettering that is generally a company marking, there is no additional information suggesting the letters indicate rules and regulations, directions, or signups.", "These are names of companies that will sponsor the event."], "image": "train2014/COCO_train2014_000000384617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48931, "question_id": "B8z3nrQFKNkSfXMm7F5P6R", "question": "What does the woman standing in front of the cart have in that cart?", "choices": ["groceries", "mop", "baby", "nothing"], "correct_choice_idx": 2, "direct_answers": ["baby", "baby", "clothes", "clothes", "baby", "baby", "baby", "bags", "baby", "baby"], "difficult_direct_answer": false, "rationales": ["The cart is a stroller and there's likely an infant in the stroller.", "The woman has a baby.", "The woman has a stroller that is used to carry a baby while walking."], "image": "train2014/COCO_train2014_000000048931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282279, "question_id": "B99r6RGBYSTzW8be2zPZsd", "question": "What is the man wearing black underwear laying on?", "choices": ["table", "bed", "chair", "floor"], "correct_choice_idx": 1, "direct_answers": ["bed", "underwear", "bed", "bed", "bed", "bed", "bed", "bed", "bed", "bed"], "difficult_direct_answer": false, "rationales": ["It has a headboard, and pillows and sheets, which are part of the assembled components of a bed.", "He is on the bed.", "A man is laying on a large flat surface with a blanket across it. people put blankets on beds,"], "image": "train2014/COCO_train2014_000000282279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252188, "question_id": "B9yaCErS3qXVJUUsZUrqWd", "question": "What venue is this likely to be?", "choices": ["art gallery", "restaurant", "hotel", "department store"], "correct_choice_idx": 0, "direct_answers": ["museum", "store", "glassblowing", "art gallery", "art", "restaurant", "museum", "banquet", "museum", "museum"], "difficult_direct_answer": false, "rationales": ["Decorative items are on display in a large indoor area.", "These look like works of art and would be found in a gallery.", "Each object is individually displayed, with most of them coming with their own plaques. the craftsmanship, artistic properties, and high-level of detail are immediately noticeable in each of the objects."], "image": "train2014/COCO_train2014_000000252188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135237, "question_id": "BA7gafv3bhGHGojAfUrbsD", "question": "The graffiti features a word that is a combination of two what?", "choices": ["first names", "verbs", "last names", "adjectives"], "correct_choice_idx": 0, "direct_answers": ["first names", "need", "first names", "names", "first names", "names", "names", "names", "names", "names"], "difficult_direct_answer": false, "rationales": ["John and bob are both names used as first names.", "Graffiti with the words john and bob is on a wall along a sidewalk. john and bob are common names.", "Behind the two people walking, there are many pictures of markings from spray paint. there is a john and bob name."], "image": "train2014/COCO_train2014_000000135237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580345, "question_id": "BAbFGQejQPkbg7YXhtdJMJ", "question": "What is the bus's destination street?", "choices": ["main", "town", "center", "south"], "correct_choice_idx": 2, "direct_answers": ["center street", "center", "center st", "center street", "center street", "center", "center", "center", "center street", "center street"], "difficult_direct_answer": false, "rationales": ["The bus display says center street.", "The digital sign on the front of the bus indicates its destination.", "The street is the center."], "image": "train2014/COCO_train2014_000000580345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230515, "question_id": "BApL6N9sBoKiC5pDPcyU79", "question": "Two elephants are standing but what are the other two doing?", "choices": ["drinking", "standing", "sleeping", "bathing"], "correct_choice_idx": 3, "direct_answers": ["bathing", "laying down", "bathing", "bathing", "bathing", "laying down", "rolling", "bathing", "bathing", "swimming"], "difficult_direct_answer": false, "rationales": ["The elephants are playing in the water.", "They are lying down in the water.", "The elephants are partially submerged in the water."], "image": "train2014/COCO_train2014_000000230515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58772, "question_id": "BBFseh7CWpvBFw47febBH9", "question": "What is needed to operate the blue items?", "choices": ["horses", "people", "keys", "cars"], "correct_choice_idx": 1, "direct_answers": ["rain", "people", "rain", "hands", "fingers", "people", "hands", "human", "human", "rain"], "difficult_direct_answer": false, "rationales": ["The objects in question are umbrellas based on their size, shape and design. these objects have one common operator.", "Someone needs to open the umbrellas.", "The umbrellas pictured here have a handle suitable for being held by a person's hand."], "image": "val2014/COCO_val2014_000000058772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409877, "question_id": "BBP6n2X78rhCVKBs47gkpV", "question": "What is the man wearing?", "choices": ["sombrero", "hat", "suspenders", "tie"], "correct_choice_idx": 1, "direct_answers": ["coat", "coat", "coat", "parka", "hat", "jacket", "jacket", "winter jacket", "hat", "parka"], "difficult_direct_answer": false, "rationales": ["The man is wearing a hat", "The man's head is covered. he is not wearing a sombrero.", "The man is wearing a beanie on his head to keep his head warm."], "image": "train2014/COCO_train2014_000000409877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120682, "question_id": "BBkFfBZRLrvumQa4gUtwzT", "question": "Why is the man in danger?", "choices": ["poor eyesight", "using phone", "looking down", "being boxed"], "correct_choice_idx": 1, "direct_answers": ["cars", "distracted driving", "distraction", "cell phone", "distracted phone", "using phone", "no helmet", "distracted riding", "car", "multitasking"], "difficult_direct_answer": true, "rationales": ["Using the phone means he is distracted and will not notice any dangers around him.", "The man is distracted on a call.", "Bicycle riders should be focused on the traffic around them for safety."], "image": "val2014/COCO_val2014_000000120682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429251, "question_id": "BBpBURjaboWdKEVFyqtkCg", "question": "If you want to park at a space nearby what do you likely need?", "choices": ["credit card", "pennies", "check", "permit"], "correct_choice_idx": 0, "direct_answers": ["change", "money", "money", "permit", "credit card", "blanket", "money", "coins", "coins", "money"], "difficult_direct_answer": false, "rationales": ["A credit card is needed to feed the meter.", "The sign says that you need to pay to park.", "With a permit one can park anywhere."], "image": "train2014/COCO_train2014_000000429251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457263, "question_id": "BC2vqGhPV7m5ScbNWLnDPs", "question": "What sauce would be a perfect compliment to this meal?", "choices": ["salad dressing", "whipped cream", "peanut butter", "apple sauce"], "correct_choice_idx": 0, "direct_answers": ["salad dressing", "dressing", "dressing", "caesar", "ranch dressing", "salad dressing", "salad dressing", "spicy sauce", "ranch dressing", "ranch"], "difficult_direct_answer": false, "rationales": ["People enjoy putting condiments on their salads.", "A salad is best with some dressing on it for moisture and flavor.", "Since there's a green salad, there should be some kind of dressing on it."], "image": "train2014/COCO_train2014_000000457263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466945, "question_id": "BC3PbLxrGrpbZ4siyUDswu", "question": "What kind of weather is the day like?", "choices": ["rainy", "sunny", "windy", "stormy"], "correct_choice_idx": 1, "direct_answers": ["sunny", "sunny", "cold", "cloudy", "clear", "sunny", "cloudy", "cloudy", "cloudy", "cloudy"], "difficult_direct_answer": false, "rationales": ["There are not any obvious clouds in the sky and it is clear and blue with lots of light throughout. when the sun is out and there are not clouds or inclement weather it is said to be answer a.", "The weather is clearly visible in the photo and there is no inclement weather visible and there is plenty of light.", "B"], "image": "val2014/COCO_val2014_000000466945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112915, "question_id": "BCHVSpzXCywJsKVt32YDcs", "question": "Who most likely lives here?", "choices": ["eccentric", "pilot", "caveman", "criminal"], "correct_choice_idx": 0, "direct_answers": ["eccentric", "bum", "fred flintstone", "artist", "someone eccentric", "psycho", "family members", "rich man", "artist", "old person"], "difficult_direct_answer": true, "rationales": ["There are some really strange things here. people don't usually keep things like a toilet on display.", "The decorations are very vibrant and the person is likely eccentric themselves.", "The place looks artsy and strange."], "image": "val2014/COCO_val2014_000000112915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71726, "question_id": "BCNpupYhm6q3qX9iYzauMD", "question": "How are dishes cleaned in this Kitchen?", "choices": ["by hand", "air bath", "solar", "dishwasher only"], "correct_choice_idx": 0, "direct_answers": ["manually", "by hand", "sink", "sink", "sink", "sink", "soap", "by hand", "kitchen sink", "sink"], "difficult_direct_answer": false, "rationales": ["The dishes are cleaned by hand.", "There is no dishwasher in the kitchen but only a sink that has a faucet and basin.", "The dishes are washed in the sink by hand."], "image": "train2014/COCO_train2014_000000071726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86877, "question_id": "BCV5KB9t3P89AshHthW2Za", "question": "What is in the oven?", "choices": ["cupcake", "roast chicken", "bacon", "pizza"], "correct_choice_idx": 0, "direct_answers": ["muffin", "cupcakes", "quiches", "cupcake", "cupcakes", "muffins", "cup cakes", "quiches", "spinach quiches", "souffles"], "difficult_direct_answer": false, "rationales": ["The items are cooking in individual small round containers.", "The baked goods are in muffin cups.", "The objects are being cooked in a wrapper and are of the right size, shape and consistency for answer a."], "image": "val2014/COCO_val2014_000000086877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544655, "question_id": "BCiaaFVhfFo89Re2prrgXy", "question": "What would happen if you cut the string around her neck?", "choices": ["ice falls", "rings fall", "beads fall", "hair falls"], "correct_choice_idx": 2, "direct_answers": ["break necklace", "beads fall", "tie falls", "beads fall", "fall", "fall down", "beads fall", "beads fall", "fall off", "would fall"], "difficult_direct_answer": false, "rationales": ["The girl's necklace is made of beads attached to a string. when people cut a piece of a necklace, the material comprising it will come off of the string.", "The beads would all drop off the string if it were cut.", "The string's beads would fall."], "image": "val2014/COCO_val2014_000000544655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551669, "question_id": "BCjy25DdfHhTAUKR7fXG8E", "question": "What is the main mode of transportation for the majority of vehicles pictured?", "choices": ["dancing", "driving", "sailing", "walking"], "correct_choice_idx": 2, "direct_answers": ["sea", "water", "boat", "sailing", "boats", "boat", "water", "boats", "waterways", "water"], "difficult_direct_answer": false, "rationales": ["Most of the vehicles are boats with masts.", "The majority of the vehicles are boats.", "The mode is sailing."], "image": "val2014/COCO_val2014_000000551669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498439, "question_id": "BCkr3BGr4yPyQpVn3u89np", "question": "What's the area where players are seated on a bench near gatorade coolers?", "choices": ["dugout", "locker room", "home court", "home base"], "correct_choice_idx": 0, "direct_answers": ["dugout", "dugout", "dugout", "dugout", "dugout", "dugout", "dugout", "dugout", "dugout", "dugout"], "difficult_direct_answer": false, "rationales": ["That is where the players rest.", "The people are playing baseball. the players do not sit in the locker room during the game.", "It is a place that is set lower than the field."], "image": "val2014/COCO_val2014_000000498439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313948, "question_id": "BCmUUQ4K3Mu9ZGKwi2cUzk", "question": "What type event is being held here?", "choices": ["car race", "expo", "beauty contest", "sale"], "correct_choice_idx": 1, "direct_answers": ["exposition", "convention", "car sales", "bike show", "automotive expo", "vehicle show", "exhibition", "motor show", "exhibition", "expo"], "difficult_direct_answer": true, "rationales": ["People are checking out a motorcycle.", "The blue gate around the bike is so people can look at it but not touch it.", "A lot of people are looking at the bike and that is typical at an expo where they show off bikes."], "image": "val2014/COCO_val2014_000000313948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134133, "question_id": "BDDSMCQnaYhD8fQqpC7x4X", "question": "California is the largest producer of which fruit?", "choices": ["apple", "berries", "grapes", "avocados"], "correct_choice_idx": 3, "direct_answers": ["oranges", "grapes", "grapes", "orange", "avocado", "avocado", "apricot", "avocados", "oranges", "oranges"], "difficult_direct_answer": false, "rationales": ["Avocados are a well-known california product.", "California produces avocados in bulk.", "California has avocadoes."], "image": "val2014/COCO_val2014_000000134133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74080, "question_id": "BDH3Kivq8eTKPttuxCRViH", "question": "What are the people doing in this venue?", "choices": ["studying", "watching video", "working", "playing game"], "correct_choice_idx": 2, "direct_answers": ["having meeting", "working", "meeting", "meeting", "business meeting", "work meeting", "working", "working", "working", "using computer"], "difficult_direct_answer": false, "rationales": ["The setting is of a professional board room nature. the laptops are the same brand/design and each have a sticker tag.", "They look to be at a meeting.", "The people are sitting at a conference table and are using business laptops."], "image": "train2014/COCO_train2014_000000074080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468760, "question_id": "BDM3tm3hkL33SfR9WJcLa7", "question": "What is the green item near the horses?", "choices": ["grape", "watermelon", "sign", "cucumber"], "correct_choice_idx": 2, "direct_answers": ["tree", "signs", "signpost", "grass", "signs", "sign post", "grass", "tree", "sign", "street sign"], "difficult_direct_answer": false, "rationales": ["It's a sign so you know what way you need to go.", "There are no fruits or vegetables near the horses.", "A pole with several different green arrows facing different directions is on a corner."], "image": "train2014/COCO_train2014_000000468760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297981, "question_id": "BDb6kkJRAM4L3BR9YSM9GE", "question": "What sort of persons frequent the building seen here?", "choices": ["readers", "invalids", "salesmen", "gamers"], "correct_choice_idx": 0, "direct_answers": ["readers", "librarians", "men", "readers", "elderly", "readers", "senior citizens", "readers", "elderly", "senior men"], "difficult_direct_answer": false, "rationales": ["The sign indicates that this building is a library, not an arcade, a store, or a facility for people with disabilities.", "The sign on the building indicates it is a library, which is where books are kept.", "The building plaque says library which lends out books, so readers often go there."], "image": "train2014/COCO_train2014_000000297981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308963, "question_id": "BDmt6yfpNrm2C4TDtnmrPQ", "question": "The person holding the bat has a similar last name to what celebrity?", "choices": ["idris elba", "leann rimes", "jennifer connelly", "isabelle adjani"], "correct_choice_idx": 1, "direct_answers": ["leann rimes", "leann rhymes", "leann rimes", "leanne rimes", "rhymes", "bust rhymes", "bust rimes", "leann", "bustas rhymes", "leann"], "difficult_direct_answer": false, "rationales": ["The man holding the bat is wearing a jersey that says rhymes which is similar to leann rimes's name.", "Rhymes is similar with rimes.", "Leann rhimes' name rhymes."], "image": "train2014/COCO_train2014_000000308963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516875, "question_id": "BDnVkjsAvVYPz8gxfS4esE", "question": "What are the dark pieces on top of the bottom hot dog?", "choices": ["pickles", "seaweed", "green chiles", "jalapenos"], "correct_choice_idx": 1, "direct_answers": ["seaweed", "onion", "seaweed", "seaweed", "seaweed", "seaweed", "onions", "vegetables", "seaweed", "seaweed"], "difficult_direct_answer": false, "rationales": ["The pieces are seaweed.", "Green strips of leafy vegetable are on a hotdog. seaweed is green and leafy.", "The pieces are long and thinly but evenly cut. the pieces are dry rather than dripping in juices or brine."], "image": "val2014/COCO_val2014_000000516875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322971, "question_id": "BECZLwhbWLfX6wmUszWXPc", "question": "Which one can go the longest without putting his feet on the ground?", "choices": ["skateboarder", "cyclist", "equal", "cannot tell"], "correct_choice_idx": 1, "direct_answers": ["skateboarder", "bicyclist", "cyclist", "boy", "man", "right", "young man", "young man", "bicyclist", "skateboarder"], "difficult_direct_answer": false, "rationales": ["A skateboarder has to put his feet on the ground to maintain motion and speed. the other mode of transport can continue forward indefinitely without the rider putting his feet on the ground.", "A skateboarder and a bike rider are in a parking lot. a biker does not have to put his feet down except to stop.", "The cyclist can go longest."], "image": "train2014/COCO_train2014_000000322971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199841, "question_id": "BEEdpaeWmbhzw6YFgZL2Uz", "question": "The striped shirt is of what style?", "choices": ["hoodie", "polo shirt", "sweater", "t-shirt"], "correct_choice_idx": 3, "direct_answers": ["collared shirt", "polo", "button down", "polo", "polo shirt", "stripped", "eating", "t-shirt", "polo style", "new style"], "difficult_direct_answer": true, "rationales": ["The striped shirt has a collar and buttons.", "A man is earing a short sleeved shirt with a collar and a couple of buttons at the neck.", "The collar makes this clear."], "image": "train2014/COCO_train2014_000000199841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207978, "question_id": "BEFY5WqeEkMWiq9FQBSQh8", "question": "How many stars does this hotel have?", "choices": ["four", "two", "five", "three"], "correct_choice_idx": 2, "direct_answers": ["four", "multiple", "five", "five", "four", "four", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["A fancy hotel with a horse drawn carriage can be seen from a street view.", "Only a 5-star hotel would have a carriage setup like this.", "It looks like a fancy hotel. fancy hotels usually have the highest rating."], "image": "train2014/COCO_train2014_000000207978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539924, "question_id": "BEYBCGZHwq7BhtsqV3nSDb", "question": "Why does he wear sunglasses?", "choices": ["showing off", "sun blindness", "sees better", "found them"], "correct_choice_idx": 1, "direct_answers": ["sun protection", "sunny", "sun blindness", "sun protection", "protection", "sun", "bright", "glare protection", "too sunny", "shade"], "difficult_direct_answer": true, "rationales": ["The brightness of a sunny day that has snow on the ground to reflect the light can be almost painful. sunglasses considerably dim the brightness of a day like this.", "The person is trying to protect their eye from the sun.", "Sunglasses make the light come into your eyes darker so it prevents you from going blind."], "image": "train2014/COCO_train2014_000000539924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398934, "question_id": "BEe4vhutAcXP8ETuxfHX2k", "question": "What is the man at the table doing?", "choices": ["drinking", "jumping", "sleeping", "standing"], "correct_choice_idx": 0, "direct_answers": ["eating", "sitting", "drinking", "eating", "eating", "playing", "drinking", "drinking", "drinking", "eating"], "difficult_direct_answer": false, "rationales": ["The man at the table is awake and is sitting.", "The man has a drink.", "The man is holding a coffee cup which implies he is drinking a beverage."], "image": "train2014/COCO_train2014_000000398934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125782, "question_id": "BErSJTq6T9A3zf7PKhgEHs", "question": "What type of international cuisine does this bakery specialize in?", "choices": ["japanese", "italian", "chinese", "french"], "correct_choice_idx": 2, "direct_answers": ["donut", "chinese", "donut", "donut", "chinese", "donut", "donut", "chinese", "chinese", "donut"], "difficult_direct_answer": false, "rationales": ["The writing on the sign looks similar to chinese writing.", "The text on the sign is chinese.", "The writing is in an asian, not european, language. it is not japanese."], "image": "val2014/COCO_val2014_000000125782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303543, "question_id": "BEz4JthUjKsUgCwDpNRcyj", "question": "What are they about to do?", "choices": ["go surfing", "go home", "train dog", "clean boards"], "correct_choice_idx": 0, "direct_answers": ["surf", "playing", "go surfing", "surf", "surf", "surf", "surfing", "surf", "paddleboard", "surf"], "difficult_direct_answer": false, "rationales": ["They are entering the water with their boards to surf.", "The people will go surfing.", "The men are holding surfing boards in water."], "image": "val2014/COCO_val2014_000000303543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282357, "question_id": "BFB382bwF8ogAwm7ry4Cw9", "question": "What color were most carrots originally?", "choices": ["blue", "purple", "green", "neon"], "correct_choice_idx": 1, "direct_answers": ["orange", "orange", "purple", "white", "purple", "yellow", "white", "orange", "green", "orange"], "difficult_direct_answer": false, "rationales": ["When growing carrots are purple in colour.", "Originally they were purple.", "Some are purple."], "image": "train2014/COCO_train2014_000000282357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115862, "question_id": "BFD5oUET3hNHF33VX3Sf7o", "question": "What gesture are the group doing?", "choices": ["wave", "fist", "hang ten", "salute"], "correct_choice_idx": 3, "direct_answers": ["saluting", "salute", "salute", "salute", "saluting", "salute", "saluting", "salute", "saluting", "saluting"], "difficult_direct_answer": false, "rationales": ["These people are saluting.", "The group is giving off a salute with their hands.", "People in military uniforms have their hands raised towards their foreheads, military personnel salute the flag."], "image": "train2014/COCO_train2014_000000115862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175593, "question_id": "BFE3KUXxuM7Xpsuj6Eeru3", "question": "Which food from the sea on the dinner table has to be eaten before it sits out beyond two hours?", "choices": ["prawns", "bread rolls", "eggs", "peppers"], "correct_choice_idx": 0, "direct_answers": ["shrimp", "shrimp", "banana", "shrimp", "banana", "shrimp", "prawns", "shrimp", "shrimp", "shrimp"], "difficult_direct_answer": false, "rationales": ["Shrimp has to be eaten in a timely fashion so that it doesn't cause sickness.", "To avoid them from getting bad and losing its flavor.", "The shrimp will go bad if it's left out."], "image": "val2014/COCO_val2014_000000175593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299270, "question_id": "BFEVCZyxSw4VsUCxSqmUgk", "question": "Why is the horse attached to the cart with wheels?", "choices": ["to punish", "to pull", "to eat", "to heal"], "correct_choice_idx": 1, "direct_answers": ["transport", "to pull", "for transportation", "pulling", "pulling", "pulling things", "pulling it", "to haul", "work", "to pull"], "difficult_direct_answer": false, "rationales": ["The cart is being pulled by the equine.", "He is being used to pull stuff in the cart.", "A horse is pulling a cart that is filled with brush."], "image": "train2014/COCO_train2014_000000299270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544216, "question_id": "BFQhRXZh7sPGAQoqX5Mgkr", "question": "Where were potato chips invented?", "choices": ["new york", "california", "north dakota", "savoy"], "correct_choice_idx": 0, "direct_answers": ["saratoga springs", "new york", "eighteen hundreds", "new york", "saratoga springs", "eighteen fifty-three", "usa", "saratoga springs", "new york", "saratoga springs"], "difficult_direct_answer": false, "rationales": ["An american citizen in the eastcoast in given credit for creating potato chips", "Potato chips were invented in new york.", "They first came about in a restaurant in saratoga springs."], "image": "val2014/COCO_val2014_000000544216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273321, "question_id": "BFo9ohQAQqVGxLG3smzNwC", "question": "Why is he off the ground?", "choices": ["exercising", "intercept frisbee", "bounced", "is falling"], "correct_choice_idx": 1, "direct_answers": ["catch frisbee", "jumped", "intercept frisbee", "jumped", "playing", "catching", "catching", "he jumped", "jumping catch", "catching"], "difficult_direct_answer": false, "rationales": ["The man has jumped in order to catch a frisbee and is midair.", "A man is jumping up and reaching out for a frisbee.", "The man is off the ground because he jumped high so he can catch the frisbee."], "image": "val2014/COCO_val2014_000000273321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186614, "question_id": "BFviXmobxnD8Uu4k7SnQK6", "question": "What does the object do to molecules to warm up food?", "choices": ["split aaprt", "vibrate", "charge electrically", "freeze"], "correct_choice_idx": 1, "direct_answers": ["microwaves", "microwave", "heating", "vibrate", "microwave", "agitate", "microwave", "radiate", "increases speed", "heat"], "difficult_direct_answer": false, "rationales": ["A microwave is on a cart.", "A microwave is on a stand in a kitchen. microwaves use vibration.", "It is a microwave oven that is used in a residential kitchen."], "image": "train2014/COCO_train2014_000000186614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76937, "question_id": "BFwFJwJu7CDpGbaUhNQYPy", "question": "What kind of soda is the skate boarder drinking?", "choices": ["orange", "cola", "root beer", "ginger ale"], "correct_choice_idx": 2, "direct_answers": ["rootbeer", "rootbeer", "root beer", "rootbeer", "root beer", "aw", "root beer", "rootbeer", "root beer", "root beer"], "difficult_direct_answer": false, "rationales": ["The can has a&w, not coca-cola, canada dry, or fanta, branding.", "She is holding a rootbeer can.", "The soda is in a can that is different shades of brown with an orange, brown, and white logo. these colors are only used this way for a specific product made by a&w."], "image": "train2014/COCO_train2014_000000076937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305404, "question_id": "BG5AtEFfzniyZdwW5xKVKj", "question": "What counties flag is on the clothesline above the horse?", "choices": ["united states", "united nations", "united kingdom", "united emirates"], "correct_choice_idx": 2, "direct_answers": ["united kingdom", "france", "united kingdom", "bridle", "england", "england", "england", "britain", "england", "uk"], "difficult_direct_answer": false, "rationales": ["The flag is the uk one.", "The flag consists of the red cross of saint george (patron saint of england), edged in white, superimposed on the satire of st patrick (patron saint of ireland), also edged in white, which are superimposed on the satire of saint andrew (patron saint of scotland).", "The uk flag is hung across the roof of the alleyway."], "image": "val2014/COCO_val2014_000000305404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97182, "question_id": "BG9uKiWkhX97QoHSysT6oZ", "question": "What is the smallest animal here doing?", "choices": ["eating", "sleeping", "floating", "drinking"], "correct_choice_idx": 2, "direct_answers": ["floating", "swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "swimming"], "difficult_direct_answer": false, "rationales": ["The birds on the water are much smaller than the giraffe.", "The littlest animal by the giraffes are the ducks in the lake.", "The ducks are calmly sitting in the water"], "image": "train2014/COCO_train2014_000000097182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510035, "question_id": "BGMCwqBaRA5fyqEaxqyxHJ", "question": "What is the part of the airplane with the star on it called?", "choices": ["flap", "horizontal stabilizer", "vertical stabilizer", "spoiler"], "correct_choice_idx": 2, "direct_answers": ["tail", "vertical stabilizer", "tail", "tail", "tail", "tail", "tail", "tail", "tail", "tail"], "difficult_direct_answer": false, "rationales": ["That is the correct name for the tail of an airplane.", "The vertical stabilizer is used behind the aeroplane.", "It is pointed upward instead of across."], "image": "train2014/COCO_train2014_000000510035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25833, "question_id": "BGPTh8qRFzNWd6Greubx39", "question": "The large novelty check on the windshield of the double decker bus was likely the result of what action?", "choices": ["corporate sponsorship", "lottery winnings", "charitable donation", "personal loan"], "correct_choice_idx": 2, "direct_answers": ["contest", "contest", "printing", "painting", "charitable donation", "donation", "donation", "charity", "stop bus", "charity"], "difficult_direct_answer": false, "rationales": ["Large cheques are used at charity events.", "Charitable donations are usually novelty checks.", "The lack of representatives"], "image": "train2014/COCO_train2014_000000025833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503886, "question_id": "BGSkNeHYeg2h5G6rnWNRUZ", "question": "What is the woman's profession?", "choices": ["officer", "athlete", "pilot", "doctor"], "correct_choice_idx": 1, "direct_answers": ["tennis player", "tennis player", "tennis player", "tennis player", "tennis player", "tennis player", "tennis player", "tennis player", "athlete", "tennis player"], "difficult_direct_answer": false, "rationales": ["This woman is a tennis player. tennis players are considered athletes.", "She is holding a tennis racket while wearing tennis attire and her hair is worn in a style where it is held back out of her face. she is standing on a tennis court looking pensive.", "This woman is playing tennis based on the equipment and her uniform. the background makes it appear that there is an audience watching which would be consistent with someone who plays this sport professionally which would be known as answer a."], "image": "train2014/COCO_train2014_000000503886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204188, "question_id": "BGg4hTm8kKSqZ7Hc43da8t", "question": "Where is this toilet located?", "choices": ["car", "bathroom", "kitchen", "expo"], "correct_choice_idx": 3, "direct_answers": ["store", "store", "home furnishings", "convention display", "convention floor", "ikea", "display", "expo", "showroom", "museum"], "difficult_direct_answer": true, "rationales": ["This is the most likely place given that two of the other options wouldn't work and b would be enclosed.", "The toilet is in a display stand that reveals people walking around behind it.", "It appears to be an a place for people to look at it on display."], "image": "val2014/COCO_val2014_000000204188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287422, "question_id": "BGqxN7f3ERRpXbgMARXroB", "question": "What is the wheel called that's behind the bench?", "choices": ["train wheels", "chariot wheels", "artillery wheels", "wagon wheel"], "correct_choice_idx": 3, "direct_answers": ["wagon wheel", "wagon wheel", "wagon", "wagon", "spinning", "wagon wheel", "wagon wheel", "wagonwheel", "wagon wheel", "wagon wheel"], "difficult_direct_answer": false, "rationales": ["There is a wagon wheel behind the dog since it's so large.", "The wheel is built from a material and is of a size and composition consistent with answer a.", "The wheel is huge so it's for a large wagon."], "image": "train2014/COCO_train2014_000000287422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391823, "question_id": "BGr5gc99eNUJwe3b3JFe4T", "question": "What is holding the animals up?", "choices": ["hot air", "tall person", "strings", "helium"], "correct_choice_idx": 3, "direct_answers": ["wind", "strings", "wind", "rope", "kite lines", "helium", "strings", "air/helium", "helium", "helium"], "difficult_direct_answer": false, "rationales": ["Balloons that are used outdoors are filled with the stuff that has the symbol he.", "That gas is used for balloons.", "The animals are balloons that are filled with helium to make them float."], "image": "train2014/COCO_train2014_000000391823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162712, "question_id": "BHDLFQRrvBqfEQ42QuRjm9", "question": "What is the profession of the man on a motorcycle?", "choices": ["athlete", "fireman", "officer", "lifeguard"], "correct_choice_idx": 2, "direct_answers": ["police", "police officer", "policeman", "police officer", "motor officers", "police officer", "officer", "cop", "police", "law enforcement"], "difficult_direct_answer": false, "rationales": ["A man in a police uniform is on a motorcycle. police use motorcycles sometimes.", "He is wearing a police uniform.", "The man is wearing a police uniform as he is on a police motorcycle which indicates his workplace."], "image": "train2014/COCO_train2014_000000162712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8333, "question_id": "BHYjgSvbTU8nZa6CA5hCf7", "question": "The owners of this company first started it in which business?", "choices": ["spaceship", "record", "phones", "airline"], "correct_choice_idx": 1, "direct_answers": ["train", "records", "record", "music", "record stores", "music", "music", "mobile phones", "electronics", "not cleared"], "difficult_direct_answer": false, "rationales": ["The company made records.", "Virgin was first a record company.", "Virgin began as a popular music label before expanding into transportation."], "image": "val2014/COCO_val2014_000000008333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121143, "question_id": "BHcWcXvW4xhZ9gCoSa84md", "question": "What is near the tower?", "choices": ["car", "airplane", "princess", "baby"], "correct_choice_idx": 0, "direct_answers": ["building", "buildings", "black car", "buildings", "car", "streetlight", "cars", "cars", "building", "cars building"], "difficult_direct_answer": false, "rationales": ["There are cars driving by.", "There is a street that passes by the tower.", "There is a car at the base of the tower."], "image": "val2014/COCO_val2014_000000121143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556722, "question_id": "BHjGKDvvWgb735THu78JZC", "question": "What are the pink circles on the front of this train used for?", "choices": ["visibility", "light emitting", "design", "brand"], "correct_choice_idx": 0, "direct_answers": ["safety", "eyes", "visibility", "to highlight", "signals", "lights", "lights", "lights", "signal", "lights"], "difficult_direct_answer": false, "rationales": ["Neon pink circles are on a train. neon colors are used to increase visibility.", "The circles are for visibility.", "The pink circles are designed to be visible in bad weather."], "image": "train2014/COCO_train2014_000000556722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445397, "question_id": "BJ4afkzh7yybciQ3bKBcVG", "question": "What is the wall behind the standing man made out of?", "choices": ["plaster", "plywood", "bricks", "wood"], "correct_choice_idx": 2, "direct_answers": ["brick", "bricks", "bricks", "brick", "bricks", "brick", "brick", "bricks", "brick", "brick"], "difficult_direct_answer": false, "rationales": ["The wall is made up of the individual red kiln baked blocks.", "The wall is red, not brown or white. there is mortar in between the red items.", "Two people are standing in front of a brick wall while playing video games."], "image": "train2014/COCO_train2014_000000445397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199093, "question_id": "BJD9juYEUpoX97jJVY5wRr", "question": "What flavor is the donut?", "choices": ["chocolate", "carrot", "caramel", "lemon"], "correct_choice_idx": 2, "direct_answers": ["glazed", "caramel", "glaze", "glaze", "glazed", "plain", "coffee", "glazed", "glazed", "glazed"], "difficult_direct_answer": false, "rationales": ["The donut has a caramel colored glaze on it.", "The man is eating a glazed donut that may have caramel flavoring in the frosting.", "The donut is caramel because it has a brownish glaze."], "image": "train2014/COCO_train2014_000000199093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418625, "question_id": "BJH9gfdmHezmD2zbdhxZ3L", "question": "In what setting are the zebras resting?", "choices": ["zoo", "roadway", "parade", "market"], "correct_choice_idx": 0, "direct_answers": ["zoo", "zoo", "pen", "zoo", "ground", "zoo", "zoo", "zoo", "zoo", "field"], "difficult_direct_answer": false, "rationales": ["People can be seen behind a fence watching the animals in their enclosure. the animals are in captivity.", "Looks like they are in a fenced in area where people can come and see them.", "They are in captivity."], "image": "val2014/COCO_val2014_000000418625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543026, "question_id": "BJgfZqroPqqrLvbCKVcv3h", "question": "What type of animals are present?", "choices": ["deer", "giraffe", "dog", "horse"], "correct_choice_idx": 3, "direct_answers": ["horses", "horses", "horse", "horses", "horses", "horses", "horses", "horses", "horses", "horses"], "difficult_direct_answer": false, "rationales": ["Two large animals with manes and long snouts grazing in the grasses.", "There are horses at the field.", "Both animals pictured are horses. the animals are larger than a typical dog. horses are similar to deer, but their manes are distinctive to horses."], "image": "train2014/COCO_train2014_000000543026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162717, "question_id": "BJmjELedq5c2vpku3WvgmS", "question": "How many umbrellas are on the top of the signboard?", "choices": ["four", "six", "five", "eight"], "correct_choice_idx": 2, "direct_answers": ["six", "six", "six", "six", "five", "six", "six", "six", "five", "five"], "difficult_direct_answer": false, "rationales": ["Five umbrellas are sitting on top of the board.", "There are five umbrellas.", "There are at least six umbrellas on top of the signboard."], "image": "train2014/COCO_train2014_000000162717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559166, "question_id": "BKYF4eubXhJKeokprAnZ3J", "question": "At least how many firefighters could use the hydrant at one time?", "choices": ["14", "12", "11", "nine"], "correct_choice_idx": 3, "direct_answers": ["nine", "nine", "nine", "nine", "nine", "nine", "nine", "nine", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["A fire hydrant has twelve spots to hookup a hose.", "There are nine hub caps.", "Nine holes are available."], "image": "val2014/COCO_val2014_000000559166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189129, "question_id": "BKey5TP8cqLESgbigummTi", "question": "Where is the man located?", "choices": ["woods", "rural area", "big city", "suburb"], "correct_choice_idx": 2, "direct_answers": ["beltway", "sidewalk", "sidewalk", "dog", "sidewalk", "dog", "sidewalk", "dog", "sidewalk", "big city"], "difficult_direct_answer": false, "rationales": ["The man appears to be in an urban area because of the buildings behind him.", "There are tall buildings.", "A man in business attire is walking with his dog. in the background are many big buildings."], "image": "train2014/COCO_train2014_000000189129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158729, "question_id": "BKr7WgvBNFkcSSH9qBxpxU", "question": "What is illegal at this intersection that normally is allowed at intersections?", "choices": ["large trucks", "left turn", "pedestrian crossing", "right turn"], "correct_choice_idx": 2, "direct_answers": ["pedestrian crossing", "pedestrian crossing", "walking", "pedestrian crossing", "crossing", "pedestrian crossing", "pedestrians", "irvine center", "rules breaking", "pedestrian crossing"], "difficult_direct_answer": false, "rationales": ["There is a sign that shows a pedestrian with a red diagonal line across it.", "Pedestrians can't cross.", "There is a sign on the pole for people not to cross there."], "image": "train2014/COCO_train2014_000000158729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412992, "question_id": "BKxfcFBcg3563JGgA5zRTH", "question": "Where does watermelon come from?", "choices": ["china", "italy", "africa", "sicily"], "correct_choice_idx": 2, "direct_answers": ["garden", "southern africa", "plant", "seeds", "africa", "ground", "watermelons", "ground", "ground", "garden"], "difficult_direct_answer": false, "rationales": ["The watermelon is from africa.", "It originally came from a desert location.", "Watermelons are generally grown in africa."], "image": "train2014/COCO_train2014_000000412992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155795, "question_id": "BL9Dhh7yNxPv29EN68QsjF", "question": "What type of drinks are available here?", "choices": ["margaritas", "cocoa", "milkshakes", "egg nog"], "correct_choice_idx": 0, "direct_answers": ["margaritas", "margaritas", "wine", "margaritas", "alcohol", "french drinks", "margaritas", "margarita", "margaritas", "margaritas"], "difficult_direct_answer": false, "rationales": ["These are margaritas and come in this type of glass.", "This type of drink served in this type of glass is a typical margarita.", "The people are dining outside at a place where there are blue and yellow margaritas avaialble."], "image": "train2014/COCO_train2014_000000155795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107741, "question_id": "BLN3xA3oGm3bijjRjdBvqH", "question": "What two forms of entertainment can be found on this street?", "choices": ["movie/gambling", "concert/dancing", "football/rodeo", "nascar/gymnastics"], "correct_choice_idx": 0, "direct_answers": ["casino food", "gambling movies", "casino", "movie/gambling", "casino cinema", "gambling movies", "casino movie", "casinos", "gambling drinking", "casino movies"], "difficult_direct_answer": true, "rationales": ["The entertainment is the movies.", "One sign in the background is for a cinema. another sign is for a casino.", "A person is on a city street and signs for a theater and gambling are lit behind her."], "image": "val2014/COCO_val2014_000000107741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467151, "question_id": "BLauDJQseykb4x8FaJ8QHb", "question": "Which country invented free public benches?", "choices": ["belgium", "france", "america", "italy"], "correct_choice_idx": 1, "direct_answers": ["france", "unknown", "france", "italy", "france", "france", "france", "france", "france", "paris"], "difficult_direct_answer": false, "rationales": ["France has public benches available in parks.", "Benches are in a public place.", "The country is france."], "image": "val2014/COCO_val2014_000000467151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266550, "question_id": "BLjazGHkZq34cH6XaffmwH", "question": "Why is the person wearing a glove?", "choices": ["fashion", "warmth", "health", "costume"], "correct_choice_idx": 2, "direct_answers": ["handling produce", "germ protection", "health", "handle produce", "handling food", "good hygiene", "avoid contact", "touching food", "sanitation", "protection"], "difficult_direct_answer": true, "rationales": ["They work at the establishment and for health and safety reasons when handling food, it is important to wear gloves to protect others.", "The person works with the produce and the gloves keep away germs.", "They are handling food as part of their job and the gloves prevent contamination."], "image": "train2014/COCO_train2014_000000266550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348010, "question_id": "BLpB6SUd5ZNX2FwQhAn7uR", "question": "How is this steamed cooked?", "choices": ["well", "medium-well", "medium", "rare"], "correct_choice_idx": 3, "direct_answers": ["medium rare", "rare", "rare", "medium", "rare", "rare", "rare", "medium-rare", "cheese", "medium well"], "difficult_direct_answer": false, "rationales": ["The steak is rare.", "This steak is very red in the center.", "There's still blood coming out of the meat"], "image": "train2014/COCO_train2014_000000348010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49428, "question_id": "BLqCXej4s9ThNk9ginjJx9", "question": "What mode of transportation is featured?", "choices": ["bike", "car", "bus", "train"], "correct_choice_idx": 0, "direct_answers": ["bicycle", "bicycle/motorscooter", "bicycle", "bicycle", "bicycle", "bicycle/motorscooter", "bicycle", "bike", "bicycle", "bicycle/motorscooter"], "difficult_direct_answer": false, "rationales": ["It has two wheels with spokes and a small seat and handlebars, consistent with that of a bike", "The transportation is a bike.", "Cars, trains and buses are way too big to be found on a shelf."], "image": "val2014/COCO_val2014_000000049428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539493, "question_id": "BLvsL4a2LDMytVw7j7t2wn", "question": "Why is he standing there?", "choices": ["waiting ride", "is posing", "is afraid", "is lost"], "correct_choice_idx": 1, "direct_answers": ["looking", "posing", "view", "resting", "posing", "posing", "posing", "is posing", "posing", "getting pictured"], "difficult_direct_answer": false, "rationales": ["He looks calm and is clearly looking at something which apparently is a camera.", "That's a picture pose.", "This is obvious given his posture and the direction of his gaze. that said, it might also be b."], "image": "train2014/COCO_train2014_000000539493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132094, "question_id": "BLxGFiEUAjDqH3pXBzqae4", "question": "What is usually found in this room?", "choices": ["bookcase", "toilet plunger", "bed", "refrigerator"], "correct_choice_idx": 1, "direct_answers": ["toilet", "water", "toilet", "toiletries", "toilet plunger", "toilet", "towel", "bathtub", "toilet", "bathtub"], "difficult_direct_answer": false, "rationales": ["This is a bathroom and one would most likely find bathroom items inside of it.", "There is a toilet in the room. toilets can get clogged. there should be a tool to unclog it in the room as well.", "The bathroom has a toilet in it which usually has a plunger next to it in case it clogs."], "image": "train2014/COCO_train2014_000000132094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288607, "question_id": "BM2LodFw3ZPFPdiaeuaimv", "question": "What is the enclosed black area near pavement called?", "choices": ["waiting hub", "bus stop", "truck stop", "bus terminal"], "correct_choice_idx": 1, "direct_answers": ["bus terminal", "bus stop", "bus stop", "bus stop", "bus shelter", "bus shelter", "bus stop", "bus stop", "bus stop", "waiting booth"], "difficult_direct_answer": false, "rationales": ["The enclosed black area is a bus stop that people can wait in and stay out of the sun or rain.", "The black area that's near the pavement is where people wait for the bus.", "This looks like a typical bus stop and there is a bus stopped there so it makes the most sense"], "image": "train2014/COCO_train2014_000000288607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49881, "question_id": "BM94WSRdhJKqmj83LNq3ki", "question": "Why are there two on the bike?", "choices": ["save money", "stay warm", "needs two", "better balance"], "correct_choice_idx": 0, "direct_answers": ["friends", "travelling", "transportation efficiency", "passenger", "save money", "riding street", "ride together", "riding together", "friends", "carpooling"], "difficult_direct_answer": true, "rationales": ["The men are carpooling so they could possibly be saving money by traveling together.", "There are two people to help with money costs.", "The people are saving money."], "image": "val2014/COCO_val2014_000000049881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91808, "question_id": "BMoQeT9MXE7vL4bJcbSiWi", "question": "How are the two people related?", "choices": ["coworkers", "classmates", "lovers", "siblings"], "correct_choice_idx": 2, "direct_answers": ["couple", "married", "siblings", "friends", "romantically", "siblings", "lovers", "husband wife", "married", "married couple"], "difficult_direct_answer": false, "rationales": ["The people are a couple.", "Two people are standing together and are embracing and standing close together.", "The couple are embracing each other affectionately."], "image": "train2014/COCO_train2014_000000091808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151095, "question_id": "BMvKBTts2CwvgxvKCx9wAu", "question": "What is a slang name for the yellow bus?", "choices": ["school doodler", "crack wagon", "school wagon", "cheese wagon"], "correct_choice_idx": 3, "direct_answers": ["school bus", "school bus", "school bus", "cheese wagon", "short bus", "short bus", "cheese wagon", "school bus", "school bus", "school bus"], "difficult_direct_answer": false, "rationales": ["The name is a cheese wagon.", "School buses are known as the cheese wagon because of the color.", "The yellow bus is for school children."], "image": "val2014/COCO_val2014_000000151095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464366, "question_id": "BN3mWFJ8MVA96ozDxPvTX3", "question": "What is the first name of the insurance company's CEO?", "choices": ["michael", "larry", "sean", "george"], "correct_choice_idx": 0, "direct_answers": ["michael", "michael tipsord", "michael", "michael", "jim", "michael", "michael", "michael", "michael tipsord", "michael tipsord"], "difficult_direct_answer": false, "rationales": ["The insurance company is state farm. the ceo's last name is tipsord.", "The leader of this company is named michael.", "The name is michael."], "image": "val2014/COCO_val2014_000000464366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93789, "question_id": "BN6goKwvh6WVUnfAW2FeXB", "question": "What is the player ready to do?", "choices": ["roll", "dunk", "bat", "serve"], "correct_choice_idx": 3, "direct_answers": ["serve", "serve", "serve", "play", "serve ball", "hit ball", "serve ball", "serve ball", "serve ball", "serve ball"], "difficult_direct_answer": false, "rationales": ["He is getting ready to hit the ball to his opponent on the other side by throwing it up in the air.", "The man has the ball in his hand so he can properly serve it.", "He is throwing the ball in the air to hit to his opponent."], "image": "train2014/COCO_train2014_000000093789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149546, "question_id": "BNAj2LoWaXiSpkBEu4GaDq", "question": "What is the name donut without a hole?", "choices": ["apple fritter", "long john", "filled donut", "cream puff"], "correct_choice_idx": 2, "direct_answers": ["filled donut", "jelly donut", "crone", "jelly", "jelly donut", "jelly donut", "jelly doughnut", "jelly donut", "paczki", "jam buster"], "difficult_direct_answer": false, "rationales": ["Donuts that have cream inside are usually round without a hole in the middle.", "Donuts without holes have filling in them.", "Filled donuts don't have holes in them."], "image": "train2014/COCO_train2014_000000149546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312010, "question_id": "BNBd7bQgBJHC73gGsQzgjo", "question": "Why is the ball in the air?", "choices": ["anti-gravity", "it's stuck", "tornado", "she's serving"], "correct_choice_idx": 3, "direct_answers": ["being hit", "playing tennis", "tennis", "serving", "struck", "playing tennis", "playing racket", "serving", "she's serving", "being hit"], "difficult_direct_answer": false, "rationales": ["Only logical answer is that she is hitting the ball to her opponet.", "The girl throws the ball into the air and launches the serve over the net.", "She is hitting the ball to her opponent across the court."], "image": "train2014/COCO_train2014_000000312010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233744, "question_id": "BNif3kvqFGLoamdGtRFc9d", "question": "What kind of water is shown here?", "choices": ["pool", "ocean", "swamp", "pond"], "correct_choice_idx": 3, "direct_answers": ["pond", "pond", "pond", "pond", "lake", "lake", "lake", "lake", "pond", "pond"], "difficult_direct_answer": false, "rationales": ["The area is smaller than an ocean. it is clean, but it is not man-made.", "The water is too small to be any of the other choices.", "The water is small and contained like a pond."], "image": "train2014/COCO_train2014_000000233744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521465, "question_id": "BNz8KeotyT3xi7PJVidb2N", "question": "What is called a fleece in sheep?", "choices": ["hair", "tail", "skin", "horn"], "correct_choice_idx": 0, "direct_answers": ["fur", "fur", "hair", "wool", "coat", "wool", "wool", "fur", "wool", "wool"], "difficult_direct_answer": false, "rationales": ["The fleece is the hair.", "It's their hair on the sheep", "It grows on their bodies."], "image": "val2014/COCO_val2014_000000521465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548095, "question_id": "BP7EL9ipbppAMSMznh9Zbd", "question": "What are the white animals doing in the water?", "choices": ["eating", "hunting", "swimming", "walking"], "correct_choice_idx": 3, "direct_answers": ["standing", "drinking", "cows", "wading", "drinking", "drinking", "standing", "drinking", "standing", "walking"], "difficult_direct_answer": false, "rationales": ["The animals are standing in the water.", "There are cows going from one side to another.", "The cows are standing in the middle of the river about to move forward."], "image": "val2014/COCO_val2014_000000548095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200680, "question_id": "BP9VZ3L7DsP9DmoJnG8peZ", "question": "Which liquid is most likely to be spilled on a laptop here?", "choices": ["milk", "water", "milkshake", "red wine"], "correct_choice_idx": 3, "direct_answers": ["red wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["Because the wine glass is so close to the laptop it is most likely the one in question.", "The glass of wine is closest to the laptop so it is most likely to spill on it.", "The wine glass is right next to the computer."], "image": "train2014/COCO_train2014_000000200680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442630, "question_id": "BPB4DqJfz7uLPiyQ2b2MAx", "question": "In which area does the man wait?", "choices": ["rural", "forest", "suburban", "urban"], "correct_choice_idx": 2, "direct_answers": ["bench", "residential", "bench", "bench", "bench", "on bench", "bus stop", "bus stop", "on bench", "suburban"], "difficult_direct_answer": false, "rationales": ["Looks to not be too busy of an area and more suburban.", "This is a street in front of living quarters.", "It is a quiet street lined with 2 storey houses that are close together, which is the typical look of a suburban environment."], "image": "val2014/COCO_val2014_000000442630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61647, "question_id": "BPLVJZMpxpdq3fQHU4bg9G", "question": "What is the cake in the shape of?", "choices": ["elephant", "bear", "cat", "dog"], "correct_choice_idx": 1, "direct_answers": ["teddy bear", "teddy bear", "bunt", "teddy bear", "bear", "teddy", "teddy bear", "teddy bear", "teddy bear", "bear"], "difficult_direct_answer": false, "rationales": ["The cake looks like a teddy bear.", "The cake looks like a little teddy bear.", "The cake has four paws like a bear."], "image": "val2014/COCO_val2014_000000061647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384449, "question_id": "BPMiJtA7SgfhqbQd7Z3YyM", "question": "What is the player ready to do?", "choices": ["catch", "dunk", "dribble", "throw"], "correct_choice_idx": 3, "direct_answers": ["pitch ball", "throw ball", "throw", "pitch", "throw", "pitch ball", "run", "throw pitch", "pitch", "pitch baseball"], "difficult_direct_answer": false, "rationales": ["The player is about to pitch the ball.", "This is a baseball, not basketball, player. he is the pitcher, not catcher.", "This pitcher's arm is blurry from how fast he is trying to throw this pitch."], "image": "train2014/COCO_train2014_000000384449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163710, "question_id": "BPjVupeDmT4Ek7Zx2CvyaS", "question": "Where are the two dining?", "choices": ["on plane", "at home", "at arena", "in restaurant"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "restaurant", "restaurant", "bar", "restaurant", "restaurant", "restaurant", "in restaurant", "restaurant", "sports bar"], "difficult_direct_answer": false, "rationales": ["The people are in a restaurant setting.", "The plates have branding on it, and the dishes look to be professionally assembled. there can also be patrons sitting at another table behind.", "It looks like a fancy restaurant they are at."], "image": "val2014/COCO_val2014_000000163710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489167, "question_id": "BPnkhpBuJ3rzvvwjZshWLK", "question": "These elephants probably belong to what organization?", "choices": ["military", "preserve", "zoo", "circus"], "correct_choice_idx": 3, "direct_answers": ["circus", "circus", "circus", "parade", "circus", "circus", "parade", "circus", "circus", "circus"], "difficult_direct_answer": false, "rationales": ["Only a circus would force elephants to parade around like this, whereas zoos try hard to replicate their natural environment, and would never ask them to do tricks for them.", "By the setting of where they are and how they are positioned you can tell who owns them.", "The elephants are wearing festive decor on their head and probably perform in a circus."], "image": "train2014/COCO_train2014_000000489167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268390, "question_id": "BPoWJt5utGVyxVrhDsyRBQ", "question": "What does the blade belong to?", "choices": ["lawnmower", "food processor", "knife set", "scissors"], "correct_choice_idx": 1, "direct_answers": ["food processor", "food processor", "blender", "blender", "food processor", "mixer", "food processor", "food processor", "food processor", "food processor"], "difficult_direct_answer": false, "rationales": ["The blade is for the processor.", "The blade is part of a food processor appliance.", "A blender-like machine is immediately behind the blade."], "image": "val2014/COCO_val2014_000000268390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225919, "question_id": "BPoqiN9zA5KAXiXdTnHT7g", "question": "Who placed this animal in the box?", "choices": ["hobo", "dorothy", "bike rider", "wicked witch"], "correct_choice_idx": 2, "direct_answers": ["owner", "owner", "man", "dog", "man", "elderly man", "elderly man", "bike rider", "owner", "dog"], "difficult_direct_answer": false, "rationales": ["The man that is riding the bike owns the dog and placed it in the box,", "The dog belongs to the bike rider.", "A dog is in a box on the back of a bike. a person is riding the bike."], "image": "val2014/COCO_val2014_000000225919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392144, "question_id": "BQ8yK3ENgEHZKjq7HAJbSd", "question": "Why are they all wearing the same clothing?", "choices": ["confuse others", "are confused", "uniform", "coincidence"], "correct_choice_idx": 2, "direct_answers": ["uniform", "military", "military", "same team", "team members", "pilots", "uniform", "uniform", "uniform", "teammates"], "difficult_direct_answer": false, "rationales": ["The people are in uniform.", "The men appear to be at a military airfield where they would most likely be personnel and wear option a. the colors and style are consistent with a military uniform.", "Men in the same green, one piece suit are lined up together at an airport. employees wear uniforms."], "image": "val2014/COCO_val2014_000000392144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557965, "question_id": "BQB9xLcbyt8vu9fJqz6ZPT", "question": "Where do the riders here ride their horses?", "choices": ["farm", "vocano", "inner city", "sea shore"], "correct_choice_idx": 3, "direct_answers": ["beach", "beach", "beach", "sea shore", "ocean", "beach", "beach", "beach", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The picture contains waves and sandy terrain.", "The riders are at the shore.", "The horses are riding near the beach."], "image": "val2014/COCO_val2014_000000557965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89515, "question_id": "BQa32V6Gur52eqWLSViped", "question": "In train each bogie consist of how many wheels?", "choices": ["two", "six", "four", "eight"], "correct_choice_idx": 1, "direct_answers": ["seven", "two", "four", "ten", "many", "three wheelsets", "two", "twelve", "two", "six"], "difficult_direct_answer": false, "rationales": ["Bogies will have four or six wheels.", "There are two wheels at each bogie.", "The train has six wheels."], "image": "train2014/COCO_train2014_000000089515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10142, "question_id": "BQgPgxqrH5JnNrPCyrfLEc", "question": "What season is up next?", "choices": ["autumn", "spring", "summer", "winter"], "correct_choice_idx": 1, "direct_answers": ["spring", "spring", "spring", "spring", "spring", "spring", "spring", "spring", "spring", "spring"], "difficult_direct_answer": false, "rationales": ["It is winter time currently and the next season to follow is springtime.", "A man is skiing in the snow. spring comes after winter.", "The current season is winter."], "image": "val2014/COCO_val2014_000000010142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238772, "question_id": "BR5kbFj8yYszUS4UhrQoce", "question": "What form of heating is used here?", "choices": ["steam", "coal", "gas", "wood"], "correct_choice_idx": 3, "direct_answers": ["fireplace", "fireplace", "central heat", "fireplace", "fire place", "wood burning", "wood", "fire", "wood", "central"], "difficult_direct_answer": false, "rationales": ["Below there is a chimney on the wall.", "There is a wooden fireplace.", "This has a fireplace that usually uses wood."], "image": "train2014/COCO_train2014_000000238772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63049, "question_id": "BRBpsgntaKzaT526f52FgG", "question": "What brand of bus manufacturer is displayed on the bus?", "choices": ["yellowbird", "grayhound", "good sam", "city"], "correct_choice_idx": 3, "direct_answers": ["citybus", "city", "aty", "city", "city", "showroom", "city", "larry", "n/a", "city"], "difficult_direct_answer": false, "rationales": ["As noted in white beneath the wipers.", "The logo for the brand is on the front under the windshield.", "City is on front of the bus in the red area."], "image": "train2014/COCO_train2014_000000063049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144554, "question_id": "BRp3b6gb9UejuXqgJgpTYM", "question": "What type of fruits are present?", "choices": ["corn", "orange", "apple", "banana"], "correct_choice_idx": 3, "direct_answers": ["banana", "bananas", "bananas", "banana", "bananas", "banana", "banana", "bananas", "bananas", "bananas"], "difficult_direct_answer": false, "rationales": ["The bananas hang on the tree where they grow in abundance.", "There are bananas in the tree.", "There is a fruit that is shaped like bananas and growing in the way that bananas grow."], "image": "train2014/COCO_train2014_000000144554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9218, "question_id": "BRurKeDABMETR2EW6damyS", "question": "What power will be used to move logs here?", "choices": ["elephant", "tractor", "man power", "boats"], "correct_choice_idx": 0, "direct_answers": ["elephant", "elephant", "elephant", "elephant", "elephant", "animal power", "animal", "elephants", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The elephant is pushing the logs with his trunk.", "The logs are moved here by elephants.", "The elephant in the picture is using its strength to move the logs."], "image": "train2014/COCO_train2014_000000009218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381795, "question_id": "BSEG6HBRXnEsZNYwRUkrAH", "question": "What is the pink kite supposed to be?", "choices": ["dinosaur", "action figure", "teddy bear", "dog"], "correct_choice_idx": 2, "direct_answers": ["bear", "bear", "bear", "bear", "animal", "teddy bear", "bear", "teddy bear", "air", "bear"], "difficult_direct_answer": false, "rationales": ["The object in question is clearly visible based on the question and has the features consistent with answer a.", "A large kite is in the shape of an animal.", "We can see the pink teddy bear floating high up. he has a yellow snout, a black nose, and he's lying on his tummy as he glides through the air!."], "image": "train2014/COCO_train2014_000000381795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359451, "question_id": "BSFPytFkKF7txGEdCTuPG9", "question": "What does the person wearing a blue apron sell at the rightmost kiosk?", "choices": ["food", "shoes", "clothes", "souvenirs"], "correct_choice_idx": 0, "direct_answers": ["food", "coconuts", "food", "food", "shirts", "hotdogs", "food", "apples", "food", "food"], "difficult_direct_answer": false, "rationales": ["The person sells food.", "There appears to be clothing on display and that seems to be what they are selling.", "A person in a white shirt and apron is in front of a stand. people wear aprons when preparing food."], "image": "val2014/COCO_val2014_000000359451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458387, "question_id": "BSH32fDtHtJYH5RP95Cq2Q", "question": "What is the flora next to?", "choices": ["cow", "barn", "water", "baby"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "sea", "water", "sea boat", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The flora is by water.", "The flora is next to a lake.", "The plants are near the lake."], "image": "val2014/COCO_val2014_000000458387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211683, "question_id": "BSJSsHPF9BADh5Ja45gBCW", "question": "What is the red item one of them is carrying?", "choices": ["case", "bag", "box", "bottle"], "correct_choice_idx": 3, "direct_answers": ["water bottle", "playing", "water bottle", "thermos", "water jug", "water bottle", "thermos", "water bottle", "water jug", "bottle"], "difficult_direct_answer": false, "rationales": ["This container is used for cold or hot beverages.", "It is cylindrical in shape and looks like it would easily hold liquids.", "One is carrying a red bottle. these bottles are for carrying water or other drinks."], "image": "train2014/COCO_train2014_000000211683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37354, "question_id": "BSJTbzRQteyrLUprVMDsNq", "question": "Why does she have so little clothing on?", "choices": ["showing off", "is broke", "is angry", "warm weather"], "correct_choice_idx": 3, "direct_answers": ["hot", "it's hot", "playing tennis", "comfort", "it's hot", "tennis bat", "its hot", "warm weather", "warmth", "hot outside"], "difficult_direct_answer": true, "rationales": ["On a sunny summer day, a tennis player is wearing a halter top and low-slung mini-shorts in an effort to stay cool. the sweat from the body-parts that aren't covered cool off more quickly than covered skin does.", "She's playing a physically demanding game which would make a person want to dress skimpily if it was a hot out, so it must be a warm weather day.", "The sun is shining and she's playing tennis. the lady is feeling hot."], "image": "val2014/COCO_val2014_000000037354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54178, "question_id": "BTCAePA5uvXuUWsSDx3fD4", "question": "Which comparative elevation do the seated people wish for?", "choices": ["lower", "none", "same", "higher"], "correct_choice_idx": 3, "direct_answers": ["higher", "higher", "high", "snow", "mountaintop", "cable car", "higher", "higher", "higher", "higher elevation"], "difficult_direct_answer": false, "rationales": ["They are wearing skis. skiers start from the top of a hill.", "The skiers are riding the ski lift because they want to get to the top of the mountain so that they can ski back down.", "They are using the lift in order to ski, which needs to be done from an elevation as it uses gravity to propel them downwards."], "image": "train2014/COCO_train2014_000000054178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220037, "question_id": "BTjqHaV6QcxBQpj97ReQTQ", "question": "Why are there candles?", "choices": ["cooking", "warmth", "light", "ambiance"], "correct_choice_idx": 3, "direct_answers": ["table decoration", "ambiance", "mood lighting", "decoration", "for lighting", "for ambiance", "ambiance", "on table", "provide atmosphere", "atmosphere"], "difficult_direct_answer": true, "rationales": ["The candles are on the table near the women.", "The candles help give the restaurant a homey, cozy ambiance.", "People are sitting at a restaurant in tables and booth seats."], "image": "train2014/COCO_train2014_000000220037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401850, "question_id": "BU4THbubYPZ7hzieijPVwU", "question": "What kind of buildings are the ones with flags outside them?", "choices": ["stores", "apartments", "hospitals", "municipal"], "correct_choice_idx": 0, "direct_answers": ["stores", "stores", "clothing stores", "stores", "stores", "stores", "store", "store", "stores", "commercial"], "difficult_direct_answer": false, "rationales": ["There are stones used on the buildings.", "The buildings are stores in a downtown area.", "You can see store signs outside near the street."], "image": "val2014/COCO_val2014_000000401850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493704, "question_id": "BUAQZbEmxMrZUJ7xCZHRJ2", "question": "What part of this picture is artificial?", "choices": ["sand", "horse", "border", "sun"], "correct_choice_idx": 2, "direct_answers": ["desert", "border", "background", "border", "color", "border", "lighting", "rider", "tint", "background"], "difficult_direct_answer": false, "rationales": ["The horse, sand, and sun are real.", "The image appears to be natural and uninterrupted except for a line around the edge.", "The framework around the photo is not real. it was edited."], "image": "val2014/COCO_val2014_000000493704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25525, "question_id": "BUDbQiZqyduKeB9ToUBT7q", "question": "What beverage does this person drink?", "choices": ["milkshake", "coffee", "wine", "beer"], "correct_choice_idx": 1, "direct_answers": ["liquid", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "soda", "coffee", "energy drink"], "difficult_direct_answer": false, "rationales": ["The person appears to be drinking from a mug based on the size, shape and handle. a mug would be used to drink a hot beverage, commonly answer a.", "A man is holding a coffee cup up to his mouth.", "The beverage is coffee."], "image": "train2014/COCO_train2014_000000025525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175867, "question_id": "BUG9PC3G5TcScLXmFU9XTy", "question": "During which season are the cars here parked on the street?", "choices": ["spring", "summer", "winter", "fall"], "correct_choice_idx": 1, "direct_answers": ["fall", "spring", "spring", "winter", "winter", "fall", "spring", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["The trees have full leaves and are green", "The cars are parked during the summer when everything is in full bloom.", "The trees are full of leaves and it looks like it is summer."], "image": "val2014/COCO_val2014_000000175867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100087, "question_id": "BUQntZZwaD7y8239hQ5R9T", "question": "What type of landmark is located near the base of the silver pole?", "choices": ["kiosk", "crosswalk", "newspaper stand", "parking meter"], "correct_choice_idx": 1, "direct_answers": ["fifa", "sign", "street sign", "trees", "lutz", "crosswalk", "pavement", "trees", "crossing sign", "store"], "difficult_direct_answer": true, "rationales": ["The area is a crosswalk.", "Crosswalks are located near traffic poles.", "The electronic sign attachment shows when it is safe to move across the street."], "image": "val2014/COCO_val2014_000000100087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422877, "question_id": "BUWfVLgPLLJoVwj9kmgsqB", "question": "Why are the cars blurred?", "choices": ["old photograph", "bad camera", "high speed", "are melting"], "correct_choice_idx": 2, "direct_answers": ["fast motion", "from racing", "fast", "speed", "going fast", "moving", "high speed", "fast movement", "long exposure", "picture quality"], "difficult_direct_answer": true, "rationales": ["Anything moving at high speed will be blurry.", "The cars are going so fast it's hard for the camera to catch them.", "They are going really fast."], "image": "val2014/COCO_val2014_000000422877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153104, "question_id": "BVS9Dyh3Xohg7BWb8VvnRJ", "question": "What's the abbreviation of this sporting league?", "choices": ["mlb", "nba", "nfl", "nhl"], "correct_choice_idx": 0, "direct_answers": ["mlb", "mlb", "nba", "mlb", "mlb", "nba", "mlb", "nhl", "mba", "nhl"], "difficult_direct_answer": false, "rationales": ["The league is major league baseball.", "You can tell by the setting and the jerseys as to what venue the photo was taken.", "The nationals play baseball and this is the governing organization the team is part of."], "image": "val2014/COCO_val2014_000000153104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389188, "question_id": "BVqaynZexdQR9ubrTAnBoR", "question": "What type game will these men play?", "choices": ["men's singles", "mixed singles", "doubles", "canadian doubles"], "correct_choice_idx": 0, "direct_answers": ["tennis", "tennis", "tennis", "tennis", "men's singles", "tennis", "tennis", "tennis", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["Two men are standing on a tennis court, one on either side of the net. singles is played with two people.", "There are two men posing on either side of a tennis net. they will play one on one against one another.", "There are only two men on the court so they are not playing doubles. there are no women so they are not playing mixed."], "image": "train2014/COCO_train2014_000000389188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32513, "question_id": "BVuPvk92kMqKiF5Lh8RcM5", "question": "Who is he probably smiling with?", "choices": ["his reflection", "adult", "dog", "another child"], "correct_choice_idx": 1, "direct_answers": ["open fridge", "parent", "getting snack", "adult", "parent", "parents", "parents", "mother", "parent", "sibling"], "difficult_direct_answer": false, "rationales": ["The boy is being photographed and it is usually an adult who would take a photograph of a child.", "The child is looking at someone off camera.", "The kid is smiling at an adult."], "image": "train2014/COCO_train2014_000000032513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575284, "question_id": "BW2M9VKZFFCikhLCYiQqS8", "question": "How are the people feeling?", "choices": ["sad", "bored", "excited", "angry"], "correct_choice_idx": 2, "direct_answers": ["happy", "happy", "good", "friendship", "excited", "good", "happy", "excited", "happy", "happy"], "difficult_direct_answer": false, "rationales": ["The people are smiling and gesturing towards each other.", "The people are excited.", "These people look happy and excited to be in the water."], "image": "train2014/COCO_train2014_000000575284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227882, "question_id": "BWDpaFQGFi43yMrwutGRTZ", "question": "In what year was his organization founded in New York City?", "choices": ["1929", "1902", "1945", "1966"], "correct_choice_idx": 3, "direct_answers": ["1910", "unknown", "recently", "2003", "1975", "2001", "1927", "1966", "1966", "1963"], "difficult_direct_answer": true, "rationales": ["The organization is from 1966.", "The man looks like he may be a member of the hare krishna which was founded in 1966.", "The organization was spearheaded in the 1960s."], "image": "val2014/COCO_val2014_000000227882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134760, "question_id": "BWLLK6XBkCgyJyCeLTTAJQ", "question": "Why are they in midair?", "choices": ["is chairlift", "are lost", "is helicopter", "is magic"], "correct_choice_idx": 0, "direct_answers": ["to ski", "ski lift", "ski lift", "going uphill", "is chairlift", "chairlift", "ski lift", "lift uphill", "ski lift", "transportation"], "difficult_direct_answer": false, "rationales": ["They are on a ski lift that will take them to the top of the mountain.", "The people who are not solidly on group are sitting on an apparatus that is connected to wires and based on the location would be consistent with answer a.", "The people are in a chairlift."], "image": "val2014/COCO_val2014_000000134760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573795, "question_id": "BWXyzG6dhdsaKxWu9M4Uy6", "question": "What people group uses the script on the banner?", "choices": ["vietnamese", "koreans", "cambodians", "chinese"], "correct_choice_idx": 1, "direct_answers": ["skiers", "koreans", "japanese", "skiers", "protection", "unsure", "asian", "koreans", "koreans", "korean"], "difficult_direct_answer": false, "rationales": ["The font makes it obvious.", "The lettering looks like korean letters so they would be most likely to use it.", "The banner is in hangul. this alphabet is not used by vietnamese, cambodian, or chinese people."], "image": "train2014/COCO_train2014_000000573795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141302, "question_id": "BWYW5ZirTUb5cm2GG6eEZT", "question": "What would be the main reason this truck makes frequent stops?", "choices": ["trash collection", "salvage", "drop off", "passengers"], "correct_choice_idx": 1, "direct_answers": ["salvage", "trash pickup", "spilling", "heavy load", "closed", "intersections", "picking trash", "pick ups", "collecting trash", "sign telling"], "difficult_direct_answer": true, "rationales": ["The plethora of things tied down in the bed of this truck tell us it's often picking up whatever it can.", "You can tell by the truck with the garbage in it, what it does.", "The truck has a lot of objects in the bed."], "image": "train2014/COCO_train2014_000000141302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253413, "question_id": "BWc8AVnbmXcw4jLhVwhUBZ", "question": "What type of setting is this past the water?", "choices": ["suburbs", "city", "beach", "country"], "correct_choice_idx": 1, "direct_answers": ["river", "city", "india", "city", "city", "cityline", "urban", "palace", "bridge", "city"], "difficult_direct_answer": false, "rationales": ["Big ben is near the water. this is london.", "The density of the buildings in the area suggests that this is an urban environment.", "There are tall buildings and skyscrapers beyond the water."], "image": "val2014/COCO_val2014_000000253413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55158, "question_id": "BX45vMc2dgXkAjUNjmDM6i", "question": "What do the blue triangular objects do?", "choices": ["mark lanes", "check speed", "freeze ice", "speed bumps"], "correct_choice_idx": 0, "direct_answers": ["mark lanes", "separate", "mark lanes", "separate paths", "create track", "mark track", "mark path", "movement", "dividers", "create barriers"], "difficult_direct_answer": true, "rationales": ["The blue objects mark the lanes of the ski route.", "With a number placed on this athlete's uniform and the fenced off area, it would appear that this is part of a competition. the blue objects would than be probably used to mark the lanes of these various competitors.", "This signifies where skiiers need to stay"], "image": "train2014/COCO_train2014_000000055158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166141, "question_id": "BX7HHQjkgBVsyhCbdb6sUz", "question": "Why is the food bad for the kid?", "choices": ["high sugar", "high carbohydrate", "high sodium", "high fat"], "correct_choice_idx": 2, "direct_answers": ["can choke", "processed meats", "fattening", "fattening", "taste", "processed food", "preservatives", "big", "high sodium", "sodium"], "difficult_direct_answer": true, "rationales": ["The child is eating a hotdog. hot dogs are heavily processed and high in sodium.", "The food is high in sodium.", "Hot dogs are really high in sodium."], "image": "train2014/COCO_train2014_000000166141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41673, "question_id": "BXCwDSEj6sZpfgc7AgijxM", "question": "How will the motorbike be able to refill on petrol?", "choices": ["charge station", "cannister", "aerosol", "gas station"], "correct_choice_idx": 3, "direct_answers": ["gas station", "gas station", "gas station", "gas station", "gas station", "at station", "at station", "gas station", "gas station", "gas station"], "difficult_direct_answer": false, "rationales": ["The motorbike can go to a gas station for gas.", "The motorbike will be allowed to refill on the gas station ahead.", "The motorbike runs on petrol and can get filled up at a gas station."], "image": "train2014/COCO_train2014_000000041673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238227, "question_id": "BXszyYto9e2ojQRhzseC6W", "question": "How does the man wearing brown feel about the man wearing red?", "choices": ["sad", "depressed", "mad", "happy"], "correct_choice_idx": 3, "direct_answers": ["friendship", "warm", "great", "love", "loving", "happy", "happy", "happy", "happy", "affectionate"], "difficult_direct_answer": false, "rationales": ["This hug expresses joyful feelings.", "Both of the men are smiling and embracing each other.", "He is smiling."], "image": "train2014/COCO_train2014_000000238227.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236909, "question_id": "BXtPU3WQri4JgtJZaaHKUt", "question": "What normally unpleasant weather is necessary for these people to enjoy their toys?", "choices": ["none", "rain", "wind", "freezing temperatures"], "correct_choice_idx": 2, "direct_answers": ["wind", "wind", "wind", "wind", "windy", "windy", "wind", "yes", "windy", "wind"], "difficult_direct_answer": false, "rationales": ["People are flying kites. wind is necessary to fly kites.", "Wind has to be available for the kites to fly.", "The kites seen flying in the air in this scene could not stay aloft without the wind."], "image": "val2014/COCO_val2014_000000236909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153052, "question_id": "BXv6ycXPqmkbPdeTViQK7n", "question": "What position is the man in red kneepads on the field playing?", "choices": ["catcher", "umpire", "outfielder", "first base"], "correct_choice_idx": 0, "direct_answers": ["catcher", "catcher", "umpire", "catcher", "catcher", "catcher", "catcher", "pitcher", "catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["In the game of baseball, this is the only position player that wears a chest protector and shin guards along with a mask and a special mitt as he works crouched down behind the batter catching and calling pitches.", "Catchers have to wear extra protection.", "A group of baseball players are in uniform on the field and one is wearing protective gear."], "image": "train2014/COCO_train2014_000000153052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409668, "question_id": "BXynG6UMgccU2hLusDMm7o", "question": "What is near the cars?", "choices": ["hose", "street lights", "cow", "garage clerk"], "correct_choice_idx": 1, "direct_answers": ["street lights", "traffic light", "traffic light", "buildings", "building", "lights", "traffic lights", "traffic lights", "lbuildings", "street lights"], "difficult_direct_answer": false, "rationales": ["You can see the lights at the top of the poles.", "There are lights on the street next to the road.", "There are lights."], "image": "train2014/COCO_train2014_000000409668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234938, "question_id": "BY625utcRpS2AXUmPw6BPp", "question": "What is the first word on the sign?", "choices": ["happy", "summer", "go", "left"], "correct_choice_idx": 1, "direct_answers": ["summer", "letter s", "summer", "summer", "summer", "summer", "summer", "letter s", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["An umbrella has a logo sewed on the side of it.", "Summer shady is on the sign on the umbrella.", "That is the word on the umbrella."], "image": "val2014/COCO_val2014_000000234938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38558, "question_id": "BYD8VeyZTMLyxces9CmGwk", "question": "What has probably happened here?", "choices": ["shooting", "robbery", "fighting", "car accident"], "correct_choice_idx": 3, "direct_answers": ["accident", "traffic accident", "traffic", "jam", "traffic accident", "car accident", "car accident", "traffic jam", "traffic jam", "traffic jam"], "difficult_direct_answer": false, "rationales": ["There is a lot of traffic on the street that might have been caused by a car accident up ahead.", "There is a police officer and cars piled up.", "Traffic is backed up and there is a policeman standing in the middle of it."], "image": "train2014/COCO_train2014_000000038558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193640, "question_id": "BYDnu28NYg6etTo8Wv59ze", "question": "The girl in the green is wearing a hairstyle that is often compared to what animal?", "choices": ["pig", "horse", "cow", "wolf"], "correct_choice_idx": 0, "direct_answers": ["pig", "pig", "pony", "pony", "pig", "pig", "pony", "pony", "pig", "pig tails"], "difficult_direct_answer": false, "rationales": ["The little girl in the green has her hair gathered and fastened on each side of her head which is commonly known as pigtails.", "The girls has her hair in pig tails.", "The pigtails are a pig's."], "image": "train2014/COCO_train2014_000000193640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402425, "question_id": "BYMHRoKERgZNpf4Grxwypd", "question": "What is the purpose of the object?", "choices": ["call police", "call ambulance", "help you", "provide parking"], "correct_choice_idx": 3, "direct_answers": ["parking time", "paid parking", "pay parking", "collect money", "provide parking", "city parking", "payment", "take money", "pay parking", "parking meter"], "difficult_direct_answer": true, "rationales": ["The parking meter needs to be fed for cars to park.", "It is a parking meter, where you insert money for the privilege of parking there.", "The object provides parking in exchange for money."], "image": "train2014/COCO_train2014_000000402425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525104, "question_id": "BYkrpyigYAVk6J4SGKmxwh", "question": "What is the name for this type of vehicle?", "choices": ["school", "articulated", "double decker", "off road"], "correct_choice_idx": 2, "direct_answers": ["tourist bus", "double decker", "double decker", "tourbus", "double decker", "double decker", "double decker", "bus", "bus", "tourist"], "difficult_direct_answer": false, "rationales": ["It is called a double decker because it has two levels.", "The bus has two decks.", "There are two decks to the bus."], "image": "train2014/COCO_train2014_000000525104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204074, "question_id": "BZ5uNgztBtBtTyJE5h6tzA", "question": "What are the bikers doing on the street?", "choices": ["tricks", "racing", "gaming", "protesting"], "correct_choice_idx": 1, "direct_answers": ["racing", "racing", "riding", "racing", "racing", "racing", "racing", "racing", "riding", "racing"], "difficult_direct_answer": false, "rationales": ["The bikers are racing.", "They are racing against one another.", "There is a large group of riders on bikes wearing racing uniforms on an empty street."], "image": "train2014/COCO_train2014_000000204074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128515, "question_id": "BZWSL89LkZNVQDqPg3iRyu", "question": "Why is he in midair?", "choices": ["bounced up", "just jumped", "is trick", "is magic"], "correct_choice_idx": 1, "direct_answers": ["jumping", "jolly mood", "lunged", "jambing", "jumping", "jumping", "jumping", "jumping", "jumping", "just jumped"], "difficult_direct_answer": false, "rationales": ["He is coming in too high to bounce up and he likely just jumped from ground and took the picture mid air to appear to be floating.", "He just jumped.", "He launched himself so he lands on the bed"], "image": "val2014/COCO_val2014_000000128515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141002, "question_id": "BZnvuJ7BBGuoADAQHuQbxX", "question": "What does the player wait for the player opposite him to do?", "choices": ["sing", "quit", "serve", "love"], "correct_choice_idx": 2, "direct_answers": ["serve", "serve", "serve", "return serve", "serve", "hit ball", "serve", "hit ball", "hit", "volley"], "difficult_direct_answer": false, "rationales": ["The player is in the ready position. he is holding a tennis racket.", "He is waiting for the tennis ball to be served.", "The man is waiting for the ball to be served."], "image": "val2014/COCO_val2014_000000141002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532501, "question_id": "BZqhSe8HNGA9urAmh2N6im", "question": "A glass with reflecting cover is called?", "choices": ["plywood", "light", "lens", "mirror"], "correct_choice_idx": 3, "direct_answers": ["mirror", "mirror", "anti glare", "mirror", "mirror", "mirror", "mirror", "anti glare", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["There is a mirror on top of the sink where people can see their reflection.", "A mirror is just a glass with a special cover to reflect things.", "The object in question is clearly visible and based on the location would be answer a. the answer is also commonly known."], "image": "train2014/COCO_train2014_000000532501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216110, "question_id": "BaBj9NNVxUwie5se2u9TqB", "question": "How is this lady's hair dried?", "choices": ["blow dryer", "hand dryer", "sun", "rain"], "correct_choice_idx": 0, "direct_answers": ["hair dryer", "blonde", "dryer", "blown", "hair dryer", "hair drying", "hair dryer", "curling iron", "blow dryer", "blowdryer"], "difficult_direct_answer": false, "rationales": ["A blow dryer is the most commonly used type of dryer in this type of salon setting.", "The woman is located in a salon based on the decor, the seating and the apron over her. at a salon, answer a would commonly be used to dry hair.", "The hairdresser dries hair with a hairdryer. they are inside so the sun would not get it dry."], "image": "train2014/COCO_train2014_000000216110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101687, "question_id": "BaaHafNi4xo7MpUWbmAtkb", "question": "Why does the background seem so flat and odd what type train scene does this signify that this is?", "choices": ["current", "new train", "old", "model train"], "correct_choice_idx": 3, "direct_answers": ["toy", "model train", "miniature", "painting", "painting", "model town", "model", "model", "miniature", "tabletop"], "difficult_direct_answer": false, "rationales": ["The scene is painted onto a wall and the whole set up is a miniature version of the \"real thing\".", "The background is not real, it is printed on a board to set the scene for the visual appeal of the model train.", "The train scene signifies this is a model train."], "image": "train2014/COCO_train2014_000000101687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498425, "question_id": "BajUPcvsjkrV6bQsXkG5nk", "question": "What activity are the group in the street engaged in?", "choices": ["protesting", "dancing", "voting", "gaming"], "correct_choice_idx": 0, "direct_answers": ["protest corruption", "protest", "protesting", "protest", "protest", "strike", "protest", "protest", "strike", "protest"], "difficult_direct_answer": false, "rationales": ["The people are holding a sign that says occupy dc and representation thru occupation which shows they are protesting.", "A group of people are in the street holding signs and standing together.", "People are holding a banner and standing in the middle of a street."], "image": "train2014/COCO_train2014_000000498425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60069, "question_id": "Bb23vQnE8tBZ8QTbdfPZNk", "question": "Why is he holding onto the string?", "choices": ["showing off", "pulling forward", "being fashionable", "keep kite"], "correct_choice_idx": 1, "direct_answers": ["air gliding", "water ride", "surf", "skiing safeguard", "for balance", "parachute", "wind sailing", "pulling forward", "sail boarding", "forward momentum"], "difficult_direct_answer": true, "rationales": ["The kite is pulling him through the water", "He might also be c, but it's primarily so that he can achieve motion.", "The person is parasailing with the parachute in front which indicates that they are being pulled forward."], "image": "val2014/COCO_val2014_000000060069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570215, "question_id": "BbCAgNjiUUuYcLkP9LM7Sh", "question": "What is being done with the yarn?", "choices": ["crocheting", "knitting", "chunking", "braiding"], "correct_choice_idx": 0, "direct_answers": ["knitting", "knitted", "knitted", "crocheting", "knitting", "knitted", "knitting", "knitted", "knitting", "knitting"], "difficult_direct_answer": false, "rationales": ["The yarn is attacked to a crocheting needle.", "There are two needle sets visible with the word \"crochet\" written on them. if the needles are labeled as crochet needles then the yarn is likely to be used in conjunction for that purpose.", "Several packages containing long metal stick like objects. there is yarn that is used to make different items with it."], "image": "train2014/COCO_train2014_000000570215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567572, "question_id": "BbeVeEa6f6R5MfCfn44rRo", "question": "What is the brush between the doll's legs usually used for?", "choices": ["teeth", "skin", "nails", "hair"], "correct_choice_idx": 0, "direct_answers": ["toothbrush", "teeth", "toothbrushing", "brushing teeth", "toothbrush", "toothbrush", "brushing teeth", "teeth cleaning", "teeth", "cleaning"], "difficult_direct_answer": false, "rationales": ["It's used to brush and clean teeth.", "The brush is a dental hygiene device.", "A toothbrush is between a dolls legs. a toothbrush is used to brush teeth."], "image": "train2014/COCO_train2014_000000567572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124931, "question_id": "BbegZJR5Es4nRUFKdS2eQi", "question": "What is the man with beard doing?", "choices": ["jumping rope", "shaving", "singing", "playing game"], "correct_choice_idx": 3, "direct_answers": ["playing", "playing games", "playing", "playing wii", "playing wii", "playing wii", "playing wii", "playing game", "playing game", "playing game"], "difficult_direct_answer": false, "rationales": ["A man is holding a video game controller.", "He is holding a wii remote to play a game on the wii", "The man with the beard has wii controllers in both hand so he is gaming."], "image": "train2014/COCO_train2014_000000124931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389389, "question_id": "BbhPRdGf4uEK8aWepjr2T3", "question": "What is this type of food called?", "choices": ["wraps", "tacos", "gyros", "hoagies"], "correct_choice_idx": 0, "direct_answers": ["gyro", "gyro", "wrap", "wrap", "gyro", "burrito", "wraps", "shawarma", "casserole", "gyro"], "difficult_direct_answer": false, "rationales": ["The ingredients are held together with a tortilla.", "The food appears to be a circular food contained by an outer shell. this shape of food with an outer thin flexible shell is known as answer a.", "The food is called a wrap because it has the ingredients wrapped in a tortilla."], "image": "val2014/COCO_val2014_000000389389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263258, "question_id": "BbsQRch4pQtqVJ5pfThNT9", "question": "What is the young man holding?", "choices": ["book", "baseball bat", "fish", "fishing rod"], "correct_choice_idx": 1, "direct_answers": ["baseball bat", "bat", "bat", "bat", "bat", "bat", "baseball bat", "baseball bat", "bat", "bat"], "difficult_direct_answer": false, "rationales": ["A kid is on a baseball field in a baseball uniform holding a bat.", "He is in a baseball uniform.", "The man has a bat in his hands and he's a baseball player."], "image": "val2014/COCO_val2014_000000263258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308278, "question_id": "Bc38VxqBU8h8eLFQUbWYbM", "question": "Which direction are the people on the ski lift riding?", "choices": ["down", "upward", "nowhere", "same elevation"], "correct_choice_idx": 1, "direct_answers": ["upwards", "up", "right", "up", "up", "up", "up", "up", "upward", "east"], "difficult_direct_answer": false, "rationales": ["They can't ski up the hill; to get to the top it's necessary to ride there.", "This helps people get up steep inclines faster", "After people ski or snowboard down a hill, they need a way to get back up. ski hills provide a lift for them."], "image": "val2014/COCO_val2014_000000308278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428440, "question_id": "Bc6MMcNeX4hroaLBBnURkR", "question": "What mimics a figurehead here?", "choices": ["snake", "dragon", "goat", "drum"], "correct_choice_idx": 1, "direct_answers": ["dragon", "dragon", "dragons", "dragon", "boat front", "chinese dragon", "dragon", "head boat", "dragon", "dragon"], "difficult_direct_answer": false, "rationales": ["The front of the boat looks like the head of a dragon.", "The figurehead is a dragon.", "The carvings have long scaly necks, spikes along the back and sharp teeth."], "image": "train2014/COCO_train2014_000000428440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484064, "question_id": "BcTszip4CUEMVt7oYwkoGw", "question": "What type or activity does this group enjoy?", "choices": ["holiday", "religious", "winter", "summer"], "correct_choice_idx": 2, "direct_answers": ["skiing", "snowboarding", "snowboarding", "snowboarding", "winter", "skiing", "skiing", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["The land is covered in snow.", "The people are skiing on snow. this activity would not be possible during the summer.", "The people are seen snowboarding, which indicates that they enjoy their time off during a holiday from work."], "image": "train2014/COCO_train2014_000000484064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410337, "question_id": "BcguQJXdeYhSBPboQQtzyR", "question": "How do people here prefer to communicate?", "choices": ["talking", "pen", "video chat", "texting"], "correct_choice_idx": 3, "direct_answers": ["text messaging", "phone", "text", "texting", "text", "texting", "cell", "text", "texting", "text message"], "difficult_direct_answer": false, "rationales": ["The people have cellphones in their hands as they are standing next to each other. the cellphones have keypads which are good for messaging.", "They use their fingers to type text messages.", "She is holding the phone and using her fingers to text someone."], "image": "val2014/COCO_val2014_000000410337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140843, "question_id": "BchptKiWD54qnQVgQ5mmH9", "question": "How was this ball propelled forward?", "choices": ["blown on", "kicked", "dribbled", "batted"], "correct_choice_idx": 1, "direct_answers": ["kicked", "kicked", "kick", "kick", "kicked", "kicking", "through pass", "kicked", "blue team", "kicked"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to soccer.", "They are playing soccer so they use their feet for kicking the ball.", "They are playing soccer. soccer doesn't allow players to throw the ball."], "image": "val2014/COCO_val2014_000000140843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473427, "question_id": "BcmL2weH74LhoWDKxJApBU", "question": "What is the number written on top of the middle green bananas?", "choices": ["500", "400", "300", "600"], "correct_choice_idx": 2, "direct_answers": ["number 4122", "300", "50", "three hundred", "three hundred", "fifty", "fifty", "fifty", "three hundred", "price"], "difficult_direct_answer": false, "rationales": ["All the bananas are placed on table in a market type setting with a cardboard sign indicating the price.", "It is on a piece of cardboard", "You can see on the green ones is a small sign that says 300"], "image": "val2014/COCO_val2014_000000473427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523729, "question_id": "BcoKRr6RXk3X8zXD8UABwq", "question": "What is this type of tower often called?", "choices": ["bell tower", "lookout tower", "clock tower", "lookout"], "correct_choice_idx": 2, "direct_answers": ["clock tower", "pagoda", "pagoda", "clock tower", "clock tower", "clock tower", "pagoda", "clock", "pagoda", "clocktower"], "difficult_direct_answer": false, "rationales": ["It is a tower that has the most prominent feature of a clock at the top of it, giving it the name.", "There is a clock on the tower which is visible and identifiable by the round white face with the hands and digits. a structure like this with a clock on it would be known as answer a.", "There is a clock on top of it."], "image": "train2014/COCO_train2014_000000523729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400869, "question_id": "BcpCAGvGNxjEGWaRUMJbZe", "question": "What is the most unhealthy part of this cheeseburger?", "choices": ["donut bun", "lettuce", "beef", "cheese"], "correct_choice_idx": 0, "direct_answers": ["donut", "donut", "donuts", "donuts", "donuts", "donut", "donuts", "donut bun", "meat", "doughnuts"], "difficult_direct_answer": false, "rationales": ["The bun is made of donuts which are full of calories.", "The cheeseburger has a donut on the outsides instead of a bun which is the most unhealthy part.", "The donut bun has a lot of sugar."], "image": "train2014/COCO_train2014_000000400869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117690, "question_id": "BcsabNikDJYGPoT8HwWuso", "question": "Why is the dog flying toward the frisbee?", "choices": ["is diving", "to catch", "was thrown", "is confused"], "correct_choice_idx": 1, "direct_answers": ["to catch", "catching", "catching it", "to catch", "catching", "catching frisbee", "to catch", "show jumping", "catch", "to catch"], "difficult_direct_answer": false, "rationales": ["The dog is jumping trying to catch frisbee.", "The dog is catching.", "The dog is making an effort to retrieve the flying frisbee as they're both about to go into the swimming pool. this kind of competition, known as \"disc dog\", is an event held throughout the us in numerous locations annually."], "image": "val2014/COCO_val2014_000000117690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217958, "question_id": "Bd7cfZi7wZdh2m5mFBRfWF", "question": "What area of the body does the bidet clean?", "choices": ["arms", "face", "genitals", "mouth"], "correct_choice_idx": 2, "direct_answers": ["butt", "toilet", "butt", "butt", "buttocks", "genitals", "genitals", "butt", "tiles", "butt"], "difficult_direct_answer": false, "rationales": ["The bidet cleans one's bottom.", "The bidet is next to the toilet, and is also low to the ground to be used after dedication to clean the private parts.", "They clean the genitals effectively by use of water."], "image": "train2014/COCO_train2014_000000217958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289569, "question_id": "BdKnioakydpcDD5cdXndFv", "question": "What is the woman on the purple sign doing?", "choices": ["dancing", "praying", "eating", "singing"], "correct_choice_idx": 1, "direct_answers": ["praying", "praying", "going bike", "praying", "going bike", "prating", "praying", "praying", "praying", "praying"], "difficult_direct_answer": false, "rationales": ["A woman is pictured on a sign holding her hands in a praying position.", "She has her hands pressed together up to her face", "The woman on the purple sign is praying with her hands together."], "image": "train2014/COCO_train2014_000000289569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174677, "question_id": "BdYN4NStxhnELdovLEQPVc", "question": "What can be enjoyed here?", "choices": ["food", "music", "view", "show"], "correct_choice_idx": 2, "direct_answers": ["ocean", "water", "horizon", "sunset", "view", "sunset", "view", "view", "ocean view", "view"], "difficult_direct_answer": false, "rationales": ["People can look out at the water.", "There is a bench facing the water", "These people are looking out into the water and are here for the sights."], "image": "train2014/COCO_train2014_000000174677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448547, "question_id": "Bdwbfw6UwxznTGb5Yzeh9F", "question": "Why are the riders all wearing gold?", "choices": ["very comfortable", "is parade", "employer provided", "free clothing"], "correct_choice_idx": 1, "direct_answers": ["parade", "parade", "matching uniforms", "parade", "parade", "is parade", "tradition", "parade", "uniform parade", "parade"], "difficult_direct_answer": false, "rationales": ["Because they are representing one agenda and are uniformly dressed as they parade.", "It is a special occasion in which they would wear ceremonial costumes.", "The people are in a parade"], "image": "train2014/COCO_train2014_000000448547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401384, "question_id": "Bdwm9dHgh5WAviPR5AgDx2", "question": "What kind of locations are mentioned in the signs?", "choices": ["continents", "countries", "cities", "cardinal points"], "correct_choice_idx": 2, "direct_answers": ["cities", "cities", "cities", "cities", "international cities", "cities", "cities", "travel places", "different countries", "cities"], "difficult_direct_answer": false, "rationales": ["The locations are places like calgary, tokyo, and sydney. these are not countries, cardinal points, or continents.", "The signs mention large population centers, such as tokyo, sydney, and moscow.", "The colored signs all have the names of different cities and pointing in their direction."], "image": "val2014/COCO_val2014_000000401384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365339, "question_id": "Be5u6dGfiQFQ7bn9sPCSoV", "question": "What is needed for this activity?", "choices": ["ice", "wind", "snow", "sun"], "correct_choice_idx": 1, "direct_answers": ["wind", "kites", "kites", "wind", "kite", "wind", "wind", "air", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["Wind is needed to make the kites fly.", "The kite needs wind.", "The people are flying kites. snow, ice, and sun are not necessary."], "image": "val2014/COCO_val2014_000000365339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244159, "question_id": "Be94rEbXKFhT5EZEAHbNQ3", "question": "What type of plant is on the coffee table?", "choices": ["begonia", "orchid", "violet", "fern"], "correct_choice_idx": 1, "direct_answers": ["orchid", "green plant", "succulent", "green", "succulent", "jade", "green", "succulent", "greens", "faux"], "difficult_direct_answer": false, "rationales": ["Orchids are big, green, and flat.", "The plant is an orchid.", "The leafs are that of an orchid."], "image": "val2014/COCO_val2014_000000244159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97939, "question_id": "Behe9S8hQxPQHRvkjoScYQ", "question": "The woman on the phone is using what item to move around?", "choices": ["skateboard", "hoverboard", "bicycle", "scooter"], "correct_choice_idx": 3, "direct_answers": ["moped", "cellphone", "stroller", "scooter", "scooter", "scooter", "scooter", "phone", "cane", "cane"], "difficult_direct_answer": false, "rationales": ["There is a pole under one of the women.", "The vehicle in use is a scooter.", "The woman is using a scooter, one of her hands is on the scooter that is not holding the cell phone."], "image": "train2014/COCO_train2014_000000097939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524649, "question_id": "BekEZVFbmnYwL3rs3uDNoY", "question": "What is being promised will stay fresh longer?", "choices": ["milk", "bagels", "baking powder", "eggs"], "correct_choice_idx": 1, "direct_answers": ["bagels", "fridge", "smart balance", "ketchup", "food", "food", "lender's", "bagels", "lenders", "lenders bagels"], "difficult_direct_answer": false, "rationales": ["The package says so.", "The product in the bag made by lender's is labeled that it stays fresh longer.", "The bagels will be fresh."], "image": "train2014/COCO_train2014_000000524649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346924, "question_id": "BenbpWsemH9WmJaEy7JvfR", "question": "What are the illuminated white circles primarily used for?", "choices": ["sports", "showing time", "mounting", "decoration"], "correct_choice_idx": 1, "direct_answers": ["showing time", "clock", "telling time", "telling time", "telling time", "telling time", "telling time", "telling time", "telling time", "clock"], "difficult_direct_answer": false, "rationales": ["The round circles are clocks. people look at clocks to check what time it is.", "The clocks are showing time.", "There is an hour, second and minute hand with notches."], "image": "train2014/COCO_train2014_000000346924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273200, "question_id": "BexhwPn44A7AtDmEQvzZ4t", "question": "Where is the ball?", "choices": ["pitcher's hand", "coach", "batters glove", "catcher's glove"], "correct_choice_idx": 3, "direct_answers": ["catchers glove", "catcher's glove", "catcher's glove", "glove", "catchers mitt", "umpires glove", "air", "catcher", "catcher's glove", "catcher's mitt"], "difficult_direct_answer": false, "rationales": ["Catcher has his hand up with the ball in his glove.", "It's why the batter is looking at him", "The catcher has caught the ball."], "image": "train2014/COCO_train2014_000000273200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98943, "question_id": "BezZ7EshTPZEhAGSQE5vKP", "question": "The flag on the bus belongs to which Country?", "choices": ["united states", "united kingdom", "switzerland", "brazil"], "correct_choice_idx": 1, "direct_answers": ["united kingdom", "united kingdom", "united kingdom", "england", "britain", "great britain", "britain", "england", "usa", "britain"], "difficult_direct_answer": false, "rationales": ["The flag has red and white crosses on a blue background. it is the union jack.", "The red, white, and blue flag is the union jack.", "This is the union jack for that country"], "image": "train2014/COCO_train2014_000000098943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572724, "question_id": "BfCQeALZzhhASkMjHdPBas", "question": "Why are the bus's seats so high?", "choices": ["see far", "stop jump", "stays warmer", "finding difficult"], "correct_choice_idx": 0, "direct_answers": ["luggage underneath", "tall bus", "luggage underneath", "elevated", "tour bus", "more comfort", "see far", "second level", "tourist bus", "stepup platform"], "difficult_direct_answer": true, "rationales": ["The bus is for tourists.", "The seats allow people to see.", "The bus seats allow people to be tourists."], "image": "train2014/COCO_train2014_000000572724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494905, "question_id": "BfMtd9v6xajz8Y9T8DHpR4", "question": "What is the man doing?", "choices": ["crosscountry skiing", "sand drifting", "mountaineering", "sledding"], "correct_choice_idx": 0, "direct_answers": ["climbing", "skiing", "ski poles", "skiing", "hiking", "hiking", "skiing", "crosscountry skiing", "cross-country skiing", "crosscountry skiing"], "difficult_direct_answer": false, "rationales": ["The man is cross country skiing.", "The man is skiing.", "The man is wearing skis and poles so he is cross country skiing."], "image": "train2014/COCO_train2014_000000494905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376590, "question_id": "BfTQUfj6CkQMA5MixUWgts", "question": "What maneuver is the man wearing red performing?", "choices": ["grind", "front spin", "hand plant", "big air"], "correct_choice_idx": 2, "direct_answers": ["skateboard trick", "jump", "flip", "skateboarding", "handplant", "handstand", "hand plant", "ollie", "skiting", "skate boarding"], "difficult_direct_answer": true, "rationales": ["The man's hand is placed on top of the ramp in a manner that would be done when intentionally performing a trick known as answer a.", "The man is putting his hand on the top of the ramp.", "The man's hand is on the ramp and the rest of him is in the air."], "image": "train2014/COCO_train2014_000000376590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115930, "question_id": "Bfdnyiwfhw5WUS72HEymAu", "question": "What profession are the men wearing aprons?", "choices": ["janitors", "artists", "cooks", "repairmen"], "correct_choice_idx": 2, "direct_answers": ["cooks", "chef", "chefs", "chefs", "chef", "chefs", "chefs", "cooks", "cooks", "chefs"], "difficult_direct_answer": false, "rationales": ["The people are cooks in a restaurant.", "They are cooks at a nice restaurant", "The men are in a commercial kitchen wearing chef robes."], "image": "val2014/COCO_val2014_000000115930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437325, "question_id": "Bg2Uz3ExXYXr7VToqqcyoF", "question": "What setting are these types of seating often seen in?", "choices": ["apartment", "church", "mansion", "office"], "correct_choice_idx": 0, "direct_answers": ["playing game", "college dormitories", "living room", "living room", "living room", "apartment", "dormitories", "relaxed settings", "home", "house"], "difficult_direct_answer": false, "rationales": ["These are often used by people in houses and would not be used in rich houses like b.", "The people are sitting on casual furniture that is found in a cheap apartment.", "These people are sitting inside of an apartment living area."], "image": "val2014/COCO_val2014_000000437325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107767, "question_id": "Bg9TPqj8zdssxBxi3zNUPK", "question": "A baggage carousel is a device generally at where?", "choices": ["school", "hospital", "malls", "airport"], "correct_choice_idx": 3, "direct_answers": ["airport", "airport", "aircraft", "airport", "airport", "airport", "aircraft", "airport", "airport", "airport"], "difficult_direct_answer": false, "rationales": ["People bring luggage while traveling from one place to another.", "All airlines use this conveyor belt.", "People use baggage to contain their things when traveling. commonly they travel by air."], "image": "train2014/COCO_train2014_000000107767.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576714, "question_id": "BgRUk8QWdYmCPjvjrSt64u", "question": "What would happen if the tallest object here fell on you?", "choices": ["get sticky", "get wet", "get crushed", "get burned"], "correct_choice_idx": 2, "direct_answers": ["get crushed", "die", "die", "hurt", "dead", "crushed", "death", "baa", "mutilation death", "hurt myself"], "difficult_direct_answer": true, "rationales": ["The tallest visible object is a large stone. a stone that large must weigh a lot and would do substantial damage if it fell on someone.", "The rock is larger than a person and much heaving that a person's body can take.", "It would hurt you if the rock fell on you."], "image": "val2014/COCO_val2014_000000576714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353408, "question_id": "Bghs7JRMdtn5ksCfyGBxSH", "question": "Which deity looks like these animals?", "choices": ["anubis", "artemis", "ganesh", "set"], "correct_choice_idx": 2, "direct_answers": ["ganesha", "hari krishna", "ganesh", "ganesha", "ganesh", "raja", "ganesha", "ganesha", "ganesha", "elephant"], "difficult_direct_answer": false, "rationales": ["They are an elephant headed god.", "The animals are elephants. the elephant is a representation of a popular hindu deity.", "These animals are elephants, not jackals or dogs."], "image": "val2014/COCO_val2014_000000353408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317320, "question_id": "BgmnQzFsHWawDsxp4fXk8P", "question": "Which vehicle takes less pedaling to move?", "choices": ["bus", "left most", "right most", "equal"], "correct_choice_idx": 2, "direct_answers": ["motorbike", "right most", "smaller one", "scooter", "motorbike", "motorized", "motor scooter", "motorcycle", "motorcycle", "motorcycle"], "difficult_direct_answer": false, "rationales": ["Right most is a motorcycle that uses fuel.", "The little moped goes by itself so it's easier.", "It is a motorcycle."], "image": "val2014/COCO_val2014_000000317320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426845, "question_id": "BhMtnvRaEXstNMWSZvWwwU", "question": "What type of stores are Aerosoles and Skechers?", "choices": ["fur", "footwear", "groceries", "sports memorabilia"], "correct_choice_idx": 1, "direct_answers": ["shoe stores", "footwear", "shoes", "shoes", "shoe", "shoes", "clothes", "shoes", "shoe stores", "sneaker stores"], "difficult_direct_answer": false, "rationales": ["The store is for footwear.", "Aerosols and skechers are both well-known shoe stores.", "These are known for selling shoes."], "image": "train2014/COCO_train2014_000000426845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185258, "question_id": "BhTTSe5ywMeuun6CTKrD4F", "question": "These people would be described as what?", "choices": ["enemies", "strangers", "zombies", "couple"], "correct_choice_idx": 3, "direct_answers": ["tourists", "friends", "tourist", "couple", "tourists", "lovers", "couple", "couple", "couple", "couple"], "difficult_direct_answer": false, "rationales": ["Two people are posing on a pier. they are very close and seem happy to be with each other.", "The two people standing together are a man and a woman and they look like life partners.", "The people are embracing each other and are smiling. they are not zombies."], "image": "train2014/COCO_train2014_000000185258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235466, "question_id": "BhnAe7P9Xdagnej5QKJMWP", "question": "Why is he sitting on the fire hydrant?", "choices": ["no chair", "firefighter", "comfortable", "owns it"], "correct_choice_idx": 0, "direct_answers": ["resting", "resting", "reading", "resting", "resting", "no chair", "no chair", "reading", "rest", "resting"], "difficult_direct_answer": false, "rationales": ["The person has no chair.", "There are no other places to rest.", "No one personally owns a fire hydrant, the shape of it does not contour to a body, and he is not dressed in firefighter attire. there also appears to be no chairs in sight."], "image": "train2014/COCO_train2014_000000235466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362816, "question_id": "Bhr4gNXTmyDe4pk735LwVP", "question": "What kind of oranges are these?", "choices": ["australian", "navel", "juicy", "fresh"], "correct_choice_idx": 1, "direct_answers": ["navel", "australian navel", "navel", "navel", "australian navel", "navel", "navel", "naval", "navel", "navel"], "difficult_direct_answer": false, "rationales": ["The oranges are navels.", "These oranges are large and called navel.", "Naval oranges are displayed."], "image": "train2014/COCO_train2014_000000362816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394533, "question_id": "Bhs6r4FVB3wv3e2DRF4Ynj", "question": "A form of halfpipe used in extreme sports such as Skateboarding is what?", "choices": ["down skate", "vert ramp", "vert skate", "drop ramp"], "correct_choice_idx": 1, "direct_answers": ["jump", "bowl", "quarter pipe", "slide", "dangerous", "vert ramp", "vert ramp", "vert ramp", "railing", "vert ramp"], "difficult_direct_answer": false, "rationales": ["A skateboarder is on the edge of a ramp doing a trick.", "A vertical ramp", "The ramp is going vertical."], "image": "val2014/COCO_val2014_000000394533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368875, "question_id": "Bi27U4ixmVBSQ4QXwgLUCE", "question": "What is next to the computer leaning against the books?", "choices": ["saw", "hammer", "coffee mug", "printer"], "correct_choice_idx": 2, "direct_answers": ["cellphone charger", "radio", "laptop", "coffee", "coffee mug", "laptop screen", "pouch", "water bottle", "speaker", "mug"], "difficult_direct_answer": true, "rationales": ["A silver, tall cup with a lid is on a desk.", "Based on the description in the question there is a mug located in the spot described. it is a mug because of the shape and size and the handle placement.", "The object in question is identifiable based on the description in the question and based on the size and shape is answer a."], "image": "train2014/COCO_train2014_000000368875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126246, "question_id": "Bi7MybriFxvSn2en6H5oRP", "question": "From which direction did the boarder most recently originate?", "choices": ["their left", "their right", "mars", "under themselves"], "correct_choice_idx": 1, "direct_answers": ["up", "top north", "left", "right", "left", "top ramp", "beneath slope", "up", "their right", "north east"], "difficult_direct_answer": false, "rationales": ["The boarder came from the right since he's skating from the right side.", "The surfer is facing the left so they moved from their right.", "The direction is the right."], "image": "train2014/COCO_train2014_000000126246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171695, "question_id": "BiESrSHVadEWEKkzAMjQwc", "question": "Which company manufactures this beverage?", "choices": ["nestle", "heineken", "coca cola", "pepsico"], "correct_choice_idx": 3, "direct_answers": ["gatorade", "pepsico", "gatorade", "gatorade", "gatorade", "gatorade", "gatorade", "gatorade", "pepsico", "pepsico"], "difficult_direct_answer": false, "rationales": ["The umbrella has the name and logo for gatorade. this drink is similar to powerade and is made by coca-cola's main competitor.", "This is the company that produces it", "Gatorade is the beverage being advertised. an internet search revealed the company that manufactures the product."], "image": "val2014/COCO_val2014_000000171695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14088, "question_id": "Bj22VLV46HU8ACWT77Rg9W", "question": "What is the manufacture of the laptop that the person is using?", "choices": ["samsung", "lenovo", "dell", "hp"], "correct_choice_idx": 3, "direct_answers": ["hewlett packard", "hp", "hp", "hewlett packard", "hp", "hewlett packard", "hp", "hp", "hp", "hp"], "difficult_direct_answer": false, "rationales": ["These are the letters in the bottom corner of the device, which stand for the brand hewlett-packard.", "The logo is visible on the laptop and is consistent with answer a.", "A hewlit packard logo is on the back of a computer."], "image": "val2014/COCO_val2014_000000014088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364561, "question_id": "BjNBpPWJZSk6cSpdVHwZ2y", "question": "What are they likely getting from the truck?", "choices": ["ice-cream", "tow", "gas", "tacos"], "correct_choice_idx": 0, "direct_answers": ["ice cream", "ice-cream", "ice cream", "person", "ice cream", "person", "ice cream", "ice cream", "ice cream", "ice cream"], "difficult_direct_answer": false, "rationales": ["A white truck is pulled over near people on the side of the road. the white truck has colorful print on it.", "This is an ice cream truck.", "The truck is the ice cream truck."], "image": "train2014/COCO_train2014_000000364561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555183, "question_id": "BjNfBWh3swPVDKs4k5SjRw", "question": "How many functions key are there in a keyboard?", "choices": ["14 keys", "12 keys", "20 keys", "15 keys"], "correct_choice_idx": 1, "direct_answers": ["ten", "twelve", "twelve", "colour", "twelve", "twelve", "twelve", "12", "twelve", "12 keys"], "difficult_direct_answer": false, "rationales": ["Twelve functions are standard on a keyboard.", "A keyboard has twelve keys.", "A standard keyboard is on a table."], "image": "train2014/COCO_train2014_000000555183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534959, "question_id": "BjiP7LqMamkFrtfQaqeXq8", "question": "How many different living creatures are visible here?", "choices": ["three", "zero", "one", "eight"], "correct_choice_idx": 0, "direct_answers": ["three", "two", "three", "two", "two", "three", "three", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are a baby and cat on the bed, and a person taking the photo next to it.", "A man in the mirror is seen looking at his baby and cat, and those are the living creatures that are visible.", "There is a man visible in the mirror in addition to the cat and baby seen on the couch."], "image": "train2014/COCO_train2014_000000534959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16784, "question_id": "BjjdHVGf4mRFE4vHTuLCsq", "question": "In which country does Click airline originate?", "choices": ["guam", "spain", "mexico", "antarctica"], "correct_choice_idx": 2, "direct_answers": ["mexico", "use", "mexico", "mexico", "mexico", "mexico", "mexico", "use", "mexico", "mexico"], "difficult_direct_answer": false, "rationales": ["The plane says \"mexicana\" on it.", "An airplane with a logo is on a runway and the country of origin is also on the side of the plane.", "The plane says \"mexicana\" on it."], "image": "train2014/COCO_train2014_000000016784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383937, "question_id": "BkEMUgXYA59norWCeF7LW7", "question": "Which vehicle is closest to the edge?", "choices": ["sedan", "police", "rover", "mini cooper"], "correct_choice_idx": 2, "direct_answers": ["land rover", "rover", "red", "red one", "jeep", "jeep", "red vehicle", "red", "red jeep", "red truck"], "difficult_direct_answer": false, "rationales": ["The closest vehicle to the edge of this hill is a land rover.", "The rover is parked next to the snow bank, and next to the snow bank there appears to be a steep drop downwards.", "The vehicle is a range rover."], "image": "train2014/COCO_train2014_000000383937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69936, "question_id": "BkbRHXRyG2hmFAJtbherve", "question": "Which one of these tools were likely used in the design of the walls?", "choices": ["calculator", "protractor", "compass", "ruler"], "correct_choice_idx": 3, "direct_answers": ["rollers", "rulers", "ruler", "ruler", "paint", "ruler", "paintbrush", "cranes", "straightedge", "tiles"], "difficult_direct_answer": false, "rationales": ["A building has a bunch of geometrical designs with straight edges. rulers are used to make a straight edge.", "The lines are so straight they had to use something to get them that way.", "The tool is a ruler."], "image": "train2014/COCO_train2014_000000069936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260679, "question_id": "BkqThgJHh9hkmHxpk2Ygq2", "question": "The animal whose name appears on the side of the bus is found in what country?", "choices": ["mexico", "united states", "brazil", "australia"], "correct_choice_idx": 3, "direct_answers": ["mexico", "africa", "australia", "usa", "australia", "australia", "australia", "australia", "australia", "emu"], "difficult_direct_answer": false, "rationales": ["The animal is from australia.", "The animal whose picture and name is on the bus is called an emu. emu's are native to australia.", "A bus has a logo that mentions the animal the emu. emus are in australia."], "image": "train2014/COCO_train2014_000000260679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443579, "question_id": "Bm4LSAjtfEr2cpeqj9zJVJ", "question": "What is she sitting next to on the left?", "choices": ["microwave", "stove", "dishwasher", "fridge"], "correct_choice_idx": 3, "direct_answers": ["person", "refrigerator", "refrigerator", "bears", "bears", "refrigerator", "bears", "fridge", "fridge", "wall"], "difficult_direct_answer": false, "rationales": ["There is a device with a door.", "The appliance is clearly a two door refrigerator.", "The handle and door of the object are visible and consistent with answer a."], "image": "train2014/COCO_train2014_000000443579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131856, "question_id": "Bmf5e9582RhnaxALXvbTMc", "question": "How is the person in the water being moved?", "choices": ["wind sails", "running", "motor", "via boat"], "correct_choice_idx": 0, "direct_answers": ["wind", "wind", "para sail", "wind", "wind sails", "kite", "walking", "kite", "wind", "kite"], "difficult_direct_answer": false, "rationales": ["While in the air wind is needed for flight.", "The person is using sails in order to pull them across the water.", "The person in the water is holding onto a wind sail that helps him move using the wind."], "image": "val2014/COCO_val2014_000000131856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205762, "question_id": "Bmtd876UH3MgS7zFtsBu5y", "question": "What is usually behind barriers like these?", "choices": ["fish", "prisoners", "people", "dogs"], "correct_choice_idx": 2, "direct_answers": ["luggage", "luggages", "displays", "secure areas", "suitcases", "security areas", "lines", "people", "people", "people"], "difficult_direct_answer": false, "rationales": ["The barrier is designed to control the flow of crowds and restrict access.", "They usually keep people standing in a row.", "The barriers usually keep people out."], "image": "train2014/COCO_train2014_000000205762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113571, "question_id": "BnPCmRJjZ5rrxAj42xtmdN", "question": "What might be a major component of this treat?", "choices": ["carrots", "muffins", "ice cream", "raisins"], "correct_choice_idx": 2, "direct_answers": ["dog treat", "ice cream", "sugar", "frosting", "sugar", "sugar", "cake", "ice cream", "ice cream", "cake"], "difficult_direct_answer": false, "rationales": ["The cake has an ice cream frosting.", "The component is ice cream.", "There appears to be a scoop of it decorating the top of the cake."], "image": "val2014/COCO_val2014_000000113571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397890, "question_id": "BnWUFbx7G6EzyyCLk9hc2V", "question": "What game is played in the room these men are in?", "choices": ["bowling", "pool", "foosball", "hockey"], "correct_choice_idx": 1, "direct_answers": ["billiards", "pool", "pool", "billiards", "pool", "pool", "billiards", "pool", "pool", "pool"], "difficult_direct_answer": false, "rationales": ["The men are near pool sticks.", "The men are standing in a room that has a pool table and cue sticks.", "The men seems to be rich who plays pole around."], "image": "val2014/COCO_val2014_000000397890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281976, "question_id": "Bndk3meQi5xjS7e635WVSV", "question": "What appliance are the man and woman staring into?", "choices": ["freezer", "oven", "microwave", "fridge"], "correct_choice_idx": 3, "direct_answers": ["microwave", "refrigerator", "fridge", "fridge", "refrigerator", "fridge", "refrigerator", "fridge", "fridge", "fridge"], "difficult_direct_answer": false, "rationales": ["They are looking into the refrigerator.", "They are looking in the fridge at the pickle.", "A fridge door can be seen behind two people staring at food on a shelf."], "image": "val2014/COCO_val2014_000000281976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575702, "question_id": "BngrionHZaEtDfw8bFqLN9", "question": "Why do they have so many umbrellas?", "choices": ["cleaning them", "found them", "selling them", "stole them"], "correct_choice_idx": 2, "direct_answers": ["selling them", "for sale", "vendor", "selling them", "rain protection", "to sell", "shade", "selling them", "selling", "selling them"], "difficult_direct_answer": false, "rationales": ["The umbrellas are for sale.", "They don't look like they are being sold just maybe found them.", "The man is trying to sell his umbrellas."], "image": "train2014/COCO_train2014_000000575702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386332, "question_id": "Bnp3Cap4WwdVDkHxyhBssf", "question": "What type event is being held here?", "choices": ["wedding", "coffee break", "weight watchers", "pizza party"], "correct_choice_idx": 3, "direct_answers": ["pizza party", "pizza party", "pizza party", "pizza party", "pizza party", "eating contest", "eating contest", "pizza party", "eating contest", "pizza party"], "difficult_direct_answer": false, "rationales": ["There are many pizzas on the counter.", "There are at least 6 of these food items on the table.", "There are several cheesy pies in boxes on the table"], "image": "val2014/COCO_val2014_000000386332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482671, "question_id": "Bo6s7FYdhy7HXxsxTquoW8", "question": "What kind of cuisine is being served?", "choices": ["chinese", "korean", "indian", "japanese"], "correct_choice_idx": 0, "direct_answers": ["sichuan", "chinese food", "sichuan", "chinese", "sichuan", "sichuan", "chinese", "sichuan", "chinese", "chinese"], "difficult_direct_answer": false, "rationales": ["Vegetables are on a plate. with chunks of meat. stir fry is a chinese dish.", "A mixture of meats and vegtables that are most often served with rice. the food is mainly cooked in a wok.", "Chinese cuisine is served."], "image": "train2014/COCO_train2014_000000482671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295766, "question_id": "Bo9oKALuLuyjiJJ4Rizf2S", "question": "What life event are they in the middle of celebrating?", "choices": ["anniversary", "graduation", "marriage", "pregnancy"], "correct_choice_idx": 2, "direct_answers": ["wedding", "getting married", "getting married", "getting married", "wedding", "wedding", "wedding", "wedding", "marriage", "wedding"], "difficult_direct_answer": false, "rationales": ["A woman in a white dress and a man in a suit are standing outside together and she is holding flowers.", "The woman is wearing a white dress and a veil making her the bride at a wedding.", "She is wearing a white gown and veil. a white gown and veil are used for weddings."], "image": "train2014/COCO_train2014_000000295766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485710, "question_id": "Bo9sbAZJDAQoJ6YdYkLmeK", "question": "Which type weather does this person hope for today?", "choices": ["freezing", "rain", "tropical", "heat wave"], "correct_choice_idx": 0, "direct_answers": ["clear skies", "clear cold", "snow", "sunny", "snowy", "snow", "snow", "snowing", "snow", "freezing"], "difficult_direct_answer": false, "rationales": ["The activity this man partakes in, skiing, necessitates snow and thus cold weather.", "This person wants it to be cold so he can ski.", "In order to enjoy winter sports the weather must be below freezing."], "image": "val2014/COCO_val2014_000000485710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236904, "question_id": "BoMG4UKboCHoiVTRErrEDz", "question": "What kind of dog is this one?", "choices": ["service dog", "farm dog", "strayed dog", "domestic pet"], "correct_choice_idx": 3, "direct_answers": ["chinese hound", "domestic pet", "small", "chinese crested", "small", "shiz tsu", "terrier", "small", "yorkshire terror", "cylinder"], "difficult_direct_answer": false, "rationales": ["The dog is wearing a collar. domesticated dogs wear collars and leashes.", "A small dog with a collar and leash is on a sidewalk. domestic pets have collars.", "It is well groomed."], "image": "val2014/COCO_val2014_000000236904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377177, "question_id": "BoQYRCnoiPfeTfofnSgPWj", "question": "Which of the five tastes would the food in the plastic bowl provide?", "choices": ["bitter", "sour", "salty", "sweet"], "correct_choice_idx": 3, "direct_answers": ["blueberry", "sweet", "sweet", "sweet", "blueberries", "sweet", "sweet", "cookies", "sweet", "sweet"], "difficult_direct_answer": false, "rationales": ["The foods on the tables are all desserts and jams and are very high in sugar.", "The things in the bowl are cookies and they are made with sugar.", "All of the food has sugar inside."], "image": "train2014/COCO_train2014_000000377177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378928, "question_id": "BoSErJHdb3o68bvRVTn386", "question": "This player is using her aim to position herself to be prepared when the other player does what?", "choices": ["serves", "quits", "runs lap", "talk"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "hits ball", "hits", "serves", "returns", "serves ball", "volleys", "hits ball", "returns", "hits ball"], "difficult_direct_answer": false, "rationales": ["In tennis the other player hits the tennis balls to the other player.", "This woman is playing tennis a game where a small ball is hit with a racket over a net in a back a forth manner. this woman is awaiting the ball and ready to hit it when her opponent sends it her way.", "The player serves."], "image": "val2014/COCO_val2014_000000378928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224693, "question_id": "BofMCNrybMi5yxasEAXNSk", "question": "Who is a legend in the sport the boys are playing?", "choices": ["chris kanyon", "pele", "roberto alomar", "michael olowokandi"], "correct_choice_idx": 1, "direct_answers": ["pele", "pele", "maradona", "pele", "pele", "lionel messi", "pele", "ronaldo", "lionel messi", "pele"], "difficult_direct_answer": false, "rationales": ["Pele is a legend.", "Pele is a well-known soccer player and the boys are playing soccer.", "The boys are playing soccer, not baseball, basketball, or wrestling."], "image": "val2014/COCO_val2014_000000224693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41455, "question_id": "BofsvSzxn8HfbuAtPvbKRr", "question": "What is used to surround the tub?", "choices": ["fiberglass", "stone", "glass block", "tile"], "correct_choice_idx": 3, "direct_answers": ["tile", "tile", "tile", "tiles", "tile", "tile", "tile", "tiles", "tile", "tiles"], "difficult_direct_answer": false, "rationales": ["The small ceramic rectangles are affixed with mortar and grout", "The tub has border all around it. it is made of decorative pieces.", "The small rectangular items with grout inbetween present around the tub is known as tile."], "image": "train2014/COCO_train2014_000000041455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469010, "question_id": "Bok8eWtmAQ4DFSNQobhdh6", "question": "What do the three white lines represent?", "choices": ["parking", "no stopping", "yield", "crosswalk"], "correct_choice_idx": 3, "direct_answers": ["crosswalk", "crosswalk", "crosswalk", "cross walk", "crosswalk", "crossing", "pedestrian crossing", "crosswalk", "crosswalk", "crosswalk"], "difficult_direct_answer": false, "rationales": ["The white lines are for people crossing the street.", "The lines are the crosswalk.", "The three white lines are for pedestrians, not drivers. they help people get from one sidewalk to the next."], "image": "train2014/COCO_train2014_000000469010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263381, "question_id": "BotnwdZaCdWonagXGGXtC6", "question": "The persons seen here are supporting whom?", "choices": ["firemen", "teachers", "police", "bankers"], "correct_choice_idx": 1, "direct_answers": ["teachers", "organise some", "no cuts", "teachers", "benefits", "employees", "teachers", "protesting cause", "teachers", "goverment"], "difficult_direct_answer": false, "rationales": ["There are protest signs in support of teachers and their benefits.", "Their signs are supporting of teachers.", "The people are teachers."], "image": "train2014/COCO_train2014_000000263381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306231, "question_id": "Bp68zqBwznbecE66TXVbnN", "question": "Why is the shuttle on top of the plane?", "choices": ["hiding it", "following it", "moving it", "selling it"], "correct_choice_idx": 2, "direct_answers": ["carrying", "aeroplane", "transporting", "moving it", "missle", "launch", "transportation", "military plane", "lifting off", "provide fuel"], "difficult_direct_answer": true, "rationales": ["The rocket on top of the plane is getting a ride.", "A fuel plane is following along with a larger plane while it refuels it.", "Moving it to another location."], "image": "train2014/COCO_train2014_000000306231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212759, "question_id": "BpeqQMzbRN3B2FmLkz3GZj", "question": "What might one see if one stays in this spot?", "choices": ["circus", "train", "tsunami", "parade"], "correct_choice_idx": 1, "direct_answers": ["cars", "people crossing", "time pass", "people", "pedestrian", "clock time", "train", "traffic", "time", "foreigners"], "difficult_direct_answer": true, "rationales": ["There is a railway crossing in the background.", "The signage shows that this is a railroad crossing.", "There is a railroad ahead where a train would cross at one point in the day."], "image": "val2014/COCO_val2014_000000212759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88773, "question_id": "Bpt6kef9qScewz6ov6Uqdc", "question": "Where was this sandwich likely cooked?", "choices": ["grill", "oven", "microwave", "fire"], "correct_choice_idx": 0, "direct_answers": ["grill", "restaurant", "restaurant", "kitchen", "restaurant", "kitchen", "grill", "grill", "frying pan", "restaurant"], "difficult_direct_answer": false, "rationales": ["This sandwich was most likely grilled, like a grilled cheese.", "The bread is toasted. sometimes people toast the bread on a skillet.", "The sandwich has consistent brown burn marks on the outside of the bread. even cooking like that that would cause this type of coloration would likely be from an oven."], "image": "train2014/COCO_train2014_000000088773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185479, "question_id": "Bpxp7epx6FkRZ94usZd6Mq", "question": "Why is this man sitting down?", "choices": ["to drink", "to eat", "to rest", "to work"], "correct_choice_idx": 3, "direct_answers": ["tired", "to work", "on laptop", "working", "no chair", "working", "rest legs", "working", "using laptop", "on laptop"], "difficult_direct_answer": false, "rationales": ["The man is working.", "The man is shown here to be sitting down while he is at a work conference area.", "He has his laptop out and looks to be busy."], "image": "val2014/COCO_val2014_000000185479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41259, "question_id": "Bqmv2WjibYkYLm5AN4oc7U", "question": "How many players can play baseball?", "choices": ["11", "nine", "five", "12"], "correct_choice_idx": 1, "direct_answers": ["twenty four", "nine", "four", "ten", "nine", "18", "eighteen", "nine", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["Traditionally there are a predetermined amount of people on the field at a time in baseball.", "There are 6 infield and 3 outfield", "Nine players can be on the field."], "image": "train2014/COCO_train2014_000000041259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300684, "question_id": "BqqMYeVYQMLke5Nrf5eqHz", "question": "Where does the man want the ball to go?", "choices": ["behind him", "in front", "in pocket", "in hand"], "correct_choice_idx": 1, "direct_answers": ["over", "over net", "serve", "over net", "over net", "sky", "over net", "over net", "in front", "other side"], "difficult_direct_answer": false, "rationales": ["The man in the picture is winding up his arm and getting ready to swing a tennis racquet. with the net and opponent being in front of him, he also wants the call to go in front of him.", "It needs to go across the court to the other side of the net", "The man wants to hit the ball to the other player."], "image": "train2014/COCO_train2014_000000300684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200671, "question_id": "Br74WQUrcVLnUZyhAvw6C4", "question": "What kind of skateboarding competition is this?", "choices": ["big air", "downhill", "street", "vert"], "correct_choice_idx": 2, "direct_answers": ["professional", "vans", "halfpipe", "ca", "tournament", "park", "street", "skate", "off wall", "vans"], "difficult_direct_answer": true, "rationales": ["A skateboarder is performing a trick on a ramp.", "It's the type of skateboarding that local kids and young adults can do.", "The skateboarding competition is taking place in the street."], "image": "train2014/COCO_train2014_000000200671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207275, "question_id": "BrBNA92isxQqG6RnMcTMbk", "question": "What sort of heat does this room have?", "choices": ["blowtorch", "gas furnace", "fireplace", "small furnace"], "correct_choice_idx": 2, "direct_answers": ["fireplace", "fireplace", "fireplace", "wood", "light", "central heating", "light", "fireplace", "fireplace", "fireplace"], "difficult_direct_answer": false, "rationales": ["The mantel and hearth are partly visible behind the man.", "Th house has a build in furnace that would warm the room if lit.", "If you go by the mantle at the bottom right corner. that said, the fireplace could be decorative, which would then mean its b or d."], "image": "val2014/COCO_val2014_000000207275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568882, "question_id": "BrCJJXR3QrnbMukeoaRGDF", "question": "What food is missing?", "choices": ["strawberry", "tomato", "olive", "broccoli"], "correct_choice_idx": 3, "direct_answers": ["meat", "nothing", "meat", "meat", "meat", "broccoli", "meat", "meat", "steak", "chicken"], "difficult_direct_answer": false, "rationales": ["There is no broccoli.", "There are no olives on the plate.", "There are no obvious olives in the picture; broccoli, tomato, and strawberry are."], "image": "val2014/COCO_val2014_000000568882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374873, "question_id": "BrQ3PkfjpbyysqyMKpjpwS", "question": "How will everyone get off the top of the mountain?", "choices": ["belay", "jet ski", "ropes", "ski"], "correct_choice_idx": 3, "direct_answers": ["ski", "skiis", "hiking", "ski down", "climbing", "ski down", "ski down", "ski", "ski lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["People will ski with the poles.", "Everyone has boots, skis, and poles with them", "The folks are atop a mountain with ski equipment. it logical to assume they will ski down."], "image": "train2014/COCO_train2014_000000374873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459303, "question_id": "BrXGwMVAwJ6E93nGHa4mtv", "question": "What province does this line go to?", "choices": ["namur", "hainaut", "anvers", "luxembourg"], "correct_choice_idx": 1, "direct_answers": ["bergen", "bergen", "north holland", "hainaut", "bergen", "bergen", "bergen", "bergen", "serben", "bergen"], "difficult_direct_answer": false, "rationales": ["A digital sign is at a train station next to a train.", "The sign has the places listed it will be going to.", "The train is going to hainaut as shown by the cities."], "image": "val2014/COCO_val2014_000000459303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308847, "question_id": "BrswYyHVnFtuW7J32jRVpZ", "question": "How many workers are there?", "choices": ["none", "one", "two", "ten"], "correct_choice_idx": 2, "direct_answers": ["two", "one", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are a couple of people wearing high visibility vests with work attire.", "You can tell by the vests worn and the activity they are doing.", "There are two humans that are wearing safety vests."], "image": "train2014/COCO_train2014_000000308847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110777, "question_id": "BsG39vfcYeP5rXDkWWhFDH", "question": "Considering the size of his ears what continent is this elephant from?", "choices": ["north america", "asia", "africa", "europe"], "correct_choice_idx": 1, "direct_answers": ["asia", "asia", "africa", "asia", "asia", "thailand", "asia", "asia", "asia", "india"], "difficult_direct_answer": false, "rationales": ["The ears are smaller", "The elephants from africa have much bigger ears.", "The elephant is probably from asia."], "image": "train2014/COCO_train2014_000000110777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573611, "question_id": "BsT3X7yMwVqZPJxPghN6rF", "question": "What are the group of boys doing with the white remotes?", "choices": ["exercising", "gaming", "changing channels", "wrestling"], "correct_choice_idx": 1, "direct_answers": ["playing wii", "gaming", "playing games", "playing wii", "playing game", "playing", "video games", "playing games", "playing game", "gaming"], "difficult_direct_answer": false, "rationales": ["The kids are gaming.", "They are pushing buttons on the remotes and they are staring at a screen.", "The white remotes are those of a wii gaming system."], "image": "train2014/COCO_train2014_000000573611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143236, "question_id": "Bsh7Aoq6bWbrT2VD5r6vJS", "question": "What type of food are all of these?", "choices": ["vegetables", "protein", "fruit", "starch"], "correct_choice_idx": 0, "direct_answers": ["vegetables", "vegetable", "veggies", "veggies", "vegetables", "vegetables", "vegetables", "root vegetables", "vegetables", "vegetables"], "difficult_direct_answer": false, "rationales": ["There are three plates of food. one has onions, lettuce, and carrots on it.", "There are lettuce, carrots and onions which are classified as vegetables.", "Three plates have carrots, onions, and lettuce on them."], "image": "val2014/COCO_val2014_000000143236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412631, "question_id": "BtMDP4fFL2cjAcy8Xsruku", "question": "What is the dog ready to do?", "choices": ["roll over", "ride", "catch", "walk"], "correct_choice_idx": 2, "direct_answers": ["catch frisbee", "catch frisbee", "catch", "fetch", "catch", "catch frisbee", "catch", "catch", "catch frisbee", "catch"], "difficult_direct_answer": false, "rationales": ["The dog is ready to catch the frisbee.", "This dog is pictured leaping through the air his jaws open towards a frisbee.", "The dog wants to grab the frisbee."], "image": "train2014/COCO_train2014_000000412631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36652, "question_id": "BtfgzHvf65MZ3vfsXzugWd", "question": "What green vegetable is on the plate?", "choices": ["lettuce", "broccoli", "spinach", "celery"], "correct_choice_idx": 3, "direct_answers": ["celery", "celery", "celery", "celery", "celery", "celery", "celery", "onion", "celery", "onion"], "difficult_direct_answer": false, "rationales": ["The shape, and green colour can only be compared to that if celery. it is also usually served with carrots, which are sitting next to it.", "The green vegetable is visible, there is only one, and it has the same color and consistency of answer a.", "Celery is with the food on the plate."], "image": "train2014/COCO_train2014_000000036652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207561, "question_id": "BtjVagSBamLX2UBySrJK9o", "question": "Why are the men near the white water?", "choices": ["to look", "to surf", "to swim", "to fish"], "correct_choice_idx": 1, "direct_answers": ["surfing", "for waves", "surfing", "surfing", "for waves", "surfing", "for waves", "surfing", "to surf", "surfing"], "difficult_direct_answer": false, "rationales": ["They are on surf boards surfing.", "They are on boards trying to ride the waves to shore.", "The men are riding surfboards."], "image": "val2014/COCO_val2014_000000207561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517855, "question_id": "BtpUFesNdDJgzgFwPr4RyD", "question": "What is the player about to do?", "choices": ["love", "return", "serve", "fake out"], "correct_choice_idx": 2, "direct_answers": ["serve", "serve", "serve", "tennis serve", "return ball", "swing", "serve", "serve", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The player threw the tennis ball in the air and is about to hit it to his opponent.", "The mans hand is up as if a ball has just been tossed above his head which is how this move is performed.", "A tennis player is throwing the ball in the area near the backline. serving is done at the back line."], "image": "val2014/COCO_val2014_000000517855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186344, "question_id": "BuDUpjcVpzAPkujJdP25rb", "question": "What kind of clothes do the kids on the grass have?", "choices": ["baseball uniform", "school uniform", "soccer uniform", "halloween costumes"], "correct_choice_idx": 0, "direct_answers": ["baseball uniform", "baseball", "uniforms", "baseball uniform", "jerseys", "baseball uniform", "baseball uniforms", "baseball uniforms", "baseball uniforms", "baseball uniforms"], "difficult_direct_answer": false, "rationales": ["They are dressed and ready to play the game.", "These are typical baseball uniforms that are worn.", "The white pants baseball caps and shirts with team name written on it as well as the baseball gloves they wear tell us these kids are dressed for baseball"], "image": "train2014/COCO_train2014_000000186344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569437, "question_id": "BuGR4Fd7vVx39FTnCi7PCF", "question": "What will the skater do next?", "choices": ["sit", "run", "stand", "land"], "correct_choice_idx": 3, "direct_answers": ["land skateboard", "land", "get up", "land", "wipeout hard", "fall", "land", "land", "land", "land"], "difficult_direct_answer": false, "rationales": ["The skater will come down from the air.", "He will land because he is in the process of coming down from mid air after doing a trick.", "The skater currently is mid-air. he will soon return to the ground."], "image": "val2014/COCO_val2014_000000569437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289213, "question_id": "BuJMVQPMvYpnx9h2Y33bXX", "question": "What type of sport is he practicing?", "choices": ["team", "winter", "aquatic", "combat"], "correct_choice_idx": 1, "direct_answers": ["skiing", "skiing", "winter", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["The man is skiing on a snowy mountaintop.", "The sport is a winter one.", "The man is practicing a winter sport since there is snow."], "image": "train2014/COCO_train2014_000000289213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325885, "question_id": "BubTZRsUmYU5qiUJwd5SPD", "question": "What move is this man adopting?", "choices": ["serve", "lob", "forehand", "backhand"], "correct_choice_idx": 2, "direct_answers": ["back hand", "forehand", "swing", "swing", "forehand return", "serve", "forehand", "forehand", "tennis stroke", "tennis swing"], "difficult_direct_answer": false, "rationales": ["The man is using his forehand to swing.", "The hand holding the racket has its palm facing outwards.", "The palm of his hand his facing forward."], "image": "val2014/COCO_val2014_000000325885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305576, "question_id": "BuwujKPbXhLkcMKPpPnyud", "question": "What meal is this likely to be?", "choices": ["lunch", "dinner", "breakfast", "afternoon tea"], "correct_choice_idx": 1, "direct_answers": ["dinner", "dinner", "dinner", "dinner", "dinner", "dinner", "potatoes mushrooms", "pasta", "dinner", "dinner"], "difficult_direct_answer": false, "rationales": ["The meal is stacked on a full plate.", "It looks like he's relaxing while eating", "The answer is unknowable by the image, but the meal appears to be of a quality, size and composition consistent with answer a."], "image": "val2014/COCO_val2014_000000305576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101816, "question_id": "Bv43y6oGU3j4s4sr5GZ6BW", "question": "In what century was this type of tub invented?", "choices": ["19th", "20th", "18th", "17th"], "correct_choice_idx": 2, "direct_answers": ["18th", "1700s", "19th", "19th century", "mid-18th", "19th century", "1800s", "18th", "1800s", "1800's"], "difficult_direct_answer": false, "rationales": ["The person is sitting in a claw foot tub.", "The bathtub design the woman pictured here enjoys was invented in the 18th century.", "The person is in a claw foot tub."], "image": "train2014/COCO_train2014_000000101816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438738, "question_id": "Bv7wbUs4uvERQhzf5nCV42", "question": "Where are the elephants behind held?", "choices": ["in circus", "in park", "in zoo", "in prison"], "correct_choice_idx": 2, "direct_answers": ["fencing pen", "behind fence", "enclosure", "in zoo", "pen", "cages", "lot", "zoo", "behind wood", "enclosure"], "difficult_direct_answer": true, "rationales": ["These animals come from the wild and have to be showcased in a safe setting.", "Is th emost obvious option. that said, the photo doesn't make it clear. this could have been a traveling d in a c even.", "The elephants are fed food through an enclosure of a zoo."], "image": "val2014/COCO_val2014_000000438738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401583, "question_id": "BvMz9LGpEwXFeV9fU7UEeT", "question": "This animal appeared in what movie?", "choices": ["frozen", "aladdin", "dumbo", "robin hood"], "correct_choice_idx": 2, "direct_answers": ["dumbo", "dumbo", "dumbo", "dumbo", "jungle book", "jungle book", "dumbo", "dumbo", "dumbo", "dumbo"], "difficult_direct_answer": false, "rationales": ["Dumbo was about an elephant.", "The animal pictured here is an elephant. dumbo is a famous fictional elephant.", "The animal has appeared in the disney movie dumbo."], "image": "train2014/COCO_train2014_000000401583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156416, "question_id": "BvVBb4GvcUy3k7pFsM9fJr", "question": "What is the man in deep water about to catch?", "choices": ["frisbee", "cold", "whale", "dolphin ride"], "correct_choice_idx": 0, "direct_answers": ["frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "wave", "frisbee", "fish", "frisbee"], "difficult_direct_answer": false, "rationales": ["There is a frisbee being thrown from one person to the other. the catcher will catch the thrower's frisbee.", "The man is throwing a frisbee. the man on the other end waiting is going to catch the frisbee.", "The other man is getting ready to throw him a frisbee."], "image": "val2014/COCO_val2014_000000156416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393251, "question_id": "BvXfNg5hvZdpRazVu9Luj3", "question": "Which person is likely celebrating a birthday?", "choices": ["unknown", "boy", "man", "woman"], "correct_choice_idx": 1, "direct_answers": ["child", "kid", "child", "right", "child", "child", "child", "boy", "child", "child"], "difficult_direct_answer": false, "rationales": ["The cake has a cartoon on it.", "The boy is trying to cut into the cake.", "Most likely by the setting of the picture and the style of cake as to who's birthday it is."], "image": "train2014/COCO_train2014_000000393251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359507, "question_id": "BvdHvyzFm56GyniSPpgEJM", "question": "What is under the sheets hanging of the end of the bed?", "choices": ["clothing", "dogs", "human", "bedding"], "correct_choice_idx": 2, "direct_answers": ["duvet", "person", "mattress", "comforter", "person", "person", "human", "carpet", "mattress", "child"], "difficult_direct_answer": false, "rationales": ["The comforter looks like a person may be under it.", "This looks like the shape and size of a human.", "There is a large lump under the fabric"], "image": "train2014/COCO_train2014_000000359507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306693, "question_id": "Bvkf6zrexcY9EQXTU3YwM6", "question": "How many cars are there in the image?", "choices": ["two", "five", "six", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "five", "one", "five", "five", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one car.", "The one car is surrounded by trucks.", "A parking lot with many trucks has one car parked in the middle."], "image": "val2014/COCO_val2014_000000306693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567881, "question_id": "BvyhSkikfEVF4Jj9rfTWno", "question": "Which equipment would be fastest for them to use?", "choices": ["mitts", "shoes", "baseball bats", "balls"], "correct_choice_idx": 1, "direct_answers": ["walking", "baseball", "mitts", "mitt", "gloves", "bat", "bat", "balls", "shoes", "ball"], "difficult_direct_answer": true, "rationales": ["People's shoes are already on.", "A bunch of pro baseball players are standing on the field. they are wearing things on the feet that make them run fast.", "It takes time to put on the shoes and mitts. the bat is more accessible and takes less time to get to."], "image": "val2014/COCO_val2014_000000567881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113757, "question_id": "Bw5EKar2ouyLhg9XEgiaa6", "question": "What type of classroom could this be called?", "choices": ["podium", "amphitheater", "stadium", "enclave"], "correct_choice_idx": 2, "direct_answers": ["lecture hall", "college", "stadium", "lecture hall", "university classroom", "lecture hall", "lecture hall", "lecture hall", "auditorium", "auditorium"], "difficult_direct_answer": false, "rationales": ["The seats are arranged in theater format.", "The seats are on ascending levels in a circular pattern around the stage.", "A learning area that resembles a movie theater. large number of students can sit in view of the professor."], "image": "val2014/COCO_val2014_000000113757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508025, "question_id": "Bw6C3T67bQECEt5f9eUH33", "question": "What is the green on the blue vehicle?", "choices": ["scratches", "tape", "reflection", "paint"], "correct_choice_idx": 2, "direct_answers": ["reflection", "scooter", "bike", "paint", "streaked paint", "reflection", "reflection", "reflection storefront", "reflection", "reflection"], "difficult_direct_answer": false, "rationales": ["The green is the reflection.", "An inconsistent green shape can be seen on one area of a very shiny motorcycle that is blue.", "There is neon lighting located somewhere opposite the vehicle."], "image": "train2014/COCO_train2014_000000508025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516840, "question_id": "Bw9gCNp82gdM3peU29P5P6", "question": "What tournament is this?", "choices": ["fa cup", "olympics", "wimbledon", "grand national"], "correct_choice_idx": 2, "direct_answers": ["wimbledon", "tennis", "tennis", "tennis", "tennis", "tennis", "us open", "us open", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["They are playing tennis.", "A person is playing tennis on a court with a judge in a uniform.", "This is a tennis tournament."], "image": "train2014/COCO_train2014_000000516840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511852, "question_id": "BwJRzBVAXusRBQGG9H4T2m", "question": "What is the decoration on the man's red hat called?", "choices": ["flap", "pom-pom", "peak", "tassel"], "correct_choice_idx": 1, "direct_answers": ["logo", "beanie", "visor", "pom pom", "tassel", "yarn ball", "dye", "pom pom", "fuzzy ball", "pom-pom"], "difficult_direct_answer": true, "rationales": ["The pom on the back of the hat is a decoration.", "The man with the red hat has a pompom attached to the top which is a fuzzy yarn ball.", "This is a name for a fuzzy ball of wool or yarn found on winter headwear."], "image": "val2014/COCO_val2014_000000511852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211892, "question_id": "BwRdCEnRDk8mVEk4qVUhSC", "question": "What keeps most of the animals from drowning?", "choices": ["life jackets", "english directions", "wet suits", "necklaces"], "correct_choice_idx": 0, "direct_answers": ["life vest", "life vests", "life jacket", "life jacket", "jackets", "life jackets", "life vests", "life jackets", "life vests", "life vests"], "difficult_direct_answer": false, "rationales": ["The dogs have life jackets on, which are used to protect people who go in the water, from drowning.", "The bright colored vests on the animals helps them float on the water.", "All of them are wearing brightly colored vests similar to those people wear for safety in water."], "image": "train2014/COCO_train2014_000000211892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385604, "question_id": "BwiVRhS5Fa93WZQCgGvzDL", "question": "What country corresponds with that top level domain?", "choices": ["china", "cambodia", "chile", "colombia"], "correct_choice_idx": 2, "direct_answers": ["chile", "chile", "chile", "chile", "chile", "chile", "chile", "chile", "chile", "chile"], "difficult_direct_answer": false, "rationales": ["The country is chile.", "The country corresponding with the top level domain is the national government zone of chile.", "Chile has the domain cl."], "image": "train2014/COCO_train2014_000000385604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6155, "question_id": "BwvJoJjdmZdzFgf6fz7QfF", "question": "What are the people looking at?", "choices": ["kindle", "computer", "cell phone", "tablet"], "correct_choice_idx": 2, "direct_answers": ["cell phone", "phones", "cellphones", "phones", "phone", "phones", "messages", "smart phones", "phones", "phones"], "difficult_direct_answer": false, "rationales": ["The people are looking at their phones.", "People look down at their cellphone in their hands when they text.", "You can tell by the design and size and there position, you can tell what they are looking aat."], "image": "train2014/COCO_train2014_000000006155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110157, "question_id": "Bx4VEQWchd74GfAJQHxWTF", "question": "Where are people here eating pizza today?", "choices": ["icecream shop", "pizzeria", "office setting", "malt shop"], "correct_choice_idx": 2, "direct_answers": ["work", "office", "work", "office", "office setting", "office", "office", "office", "office", "office"], "difficult_direct_answer": false, "rationales": ["Seems that there is a meeting going on in the area.", "There are computers and desks near the people. they are not in a restaurant or shop.", "The people are in the office."], "image": "train2014/COCO_train2014_000000110157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372580, "question_id": "BxGKK23CrPyHEUV6YPJP3o", "question": "What sport does the person in red enjoy?", "choices": ["chess", "snow skiing", "wakeboarding", "biking"], "correct_choice_idx": 2, "direct_answers": ["wakeboard parasailing", "surfing", "para sailing", "waterboarding", "wakeboarding", "windsailing", "kite surfing", "wakeboarding", "wakeboarding", "fishing"], "difficult_direct_answer": false, "rationales": ["By the board and setting they are in you can tell what they enjoy.", "They are on the water", "They are doing this activity so it makes sense that they like water boarding."], "image": "val2014/COCO_val2014_000000372580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201859, "question_id": "Bxgj2gi4PQ7GTwxLFWU2XS", "question": "What does the person not sitting on a horse or car here await?", "choices": ["drag race", "millet delivery", "bus", "lunch"], "correct_choice_idx": 2, "direct_answers": ["unknown", "bus", "traffic light", "bus", "traffic light", "bus", "light", "bus", "street crossing", "traffic light"], "difficult_direct_answer": false, "rationales": ["The person is waiting at a bus stop.", "The person wants a bus.", "The bus station is there."], "image": "val2014/COCO_val2014_000000201859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257940, "question_id": "BxqdddT96ZbcpGp6RBC5wy", "question": "This urban area is within which nation in Asia?", "choices": ["china", "south korea", "hong kong", "japan"], "correct_choice_idx": 3, "direct_answers": ["singapore", "india", "china", "japan", "china", "china", "japan", "indonesia", "china", "city"], "difficult_direct_answer": false, "rationales": ["There is japanese writing on the blue sign in the distance.", "The area is japan.", "The urban area is in japan since the high speed rail is in japan."], "image": "val2014/COCO_val2014_000000257940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577079, "question_id": "ByEmrBi5wvC5EU6sNJ7sss", "question": "Why do kites have tails?", "choices": ["style", "beauty", "habit", "function"], "correct_choice_idx": 3, "direct_answers": ["visibility", "look realistic", "decorations", "function", "decoration", "stability", "flying smoother", "support", "visibility", "balance"], "difficult_direct_answer": true, "rationales": ["The answer is commonly known. kite tails serve a purpose to keep the flight smooth and steady.", "They use tails to fly.", "Kites fly through the wind better with tails."], "image": "train2014/COCO_train2014_000000577079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347306, "question_id": "ByMpkcFbHszYzradjmFytb", "question": "Why does the runner have gloves on?", "choices": ["health", "warmth", "costume", "grip"], "correct_choice_idx": 3, "direct_answers": ["batting", "batted earlier", "protection", "from batting", "baseball player", "grip", "protection", "hold bat", "hands", "protection"], "difficult_direct_answer": false, "rationales": ["Traditionally batters use these items to help with the grip for hitting the baseball.", "The runner just hit a ball with a bat. wearing gloves helps to grip the bat better.", "The runner wants a grip."], "image": "train2014/COCO_train2014_000000347306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435492, "question_id": "ByjCB6vLanEEFPVj9rCmvL", "question": "Which comedy movie is the man with his feet on the desk a big fan of?", "choices": ["hangover", "office space", "borat", "superbad"], "correct_choice_idx": 1, "direct_answers": ["anitech", "office space", "anitech", "office space", "anitech", "office space", "office space", "office space", "office space", "office space"], "difficult_direct_answer": false, "rationales": ["The coffee cup has an initech name and logo on it and initech is the fictional company in the 1999 film comedy \"office space,\" so the man probably wouldn't have an initech mug if he wasn't an \"office space\" fan.", "The man is watching office space.", "The comedy is office space."], "image": "train2014/COCO_train2014_000000435492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289545, "question_id": "BziLJj7cnJfhbmgxkRfuuU", "question": "What video game is the picture with the guy in a space suit and helmet referring to?", "choices": ["metroid", "section z", "bezerk", "moon patrol"], "correct_choice_idx": 0, "direct_answers": ["metroid", "astronaut", "fall", "playing game", "metroid", "metroid", "space suits", "metroid prime", "metroid prime", "minecraft"], "difficult_direct_answer": false, "rationales": ["The character in a space suit is samus.", "Posters depicting video game images are on a wall in a room.", "The painting is of samus."], "image": "train2014/COCO_train2014_000000289545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40886, "question_id": "BztUhfGD85KUoSLyefPe94", "question": "What is being held by the person the pitcher looks at?", "choices": ["slingshot", "corked bottle", "bat", "gun"], "correct_choice_idx": 2, "direct_answers": ["bat", "bat", "mitt", "bat", "bat", "glove", "bat", "bat", "bat", "baseball bat"], "difficult_direct_answer": false, "rationales": ["The pitcher is throwing the ball towards the hitter. a corked bottle, gun, or slingshot would not be able to hit a ball.", "The person depicted is a pitcher based on their location on the baseball diamond and the action they are performing. a pitcher in baseball would be looking at a batter based on the way the game is played.", "The pitcher is throwing the ball to another player who is trying to hit it."], "image": "val2014/COCO_val2014_000000040886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309386, "question_id": "BzvMMKF4s6i29RG9vyuPKD", "question": "How many pieces of pizza do you see?", "choices": ["full", "two", "four", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "three", "one", "three", "three", "one", "one"], "difficult_direct_answer": false, "rationales": ["The plate in the foreground is the only place we can fully see a slice of pizza in this image.", "There is only one piece on the plate.", "Only one slice of pizza is seen on the table."], "image": "train2014/COCO_train2014_000000309386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368602, "question_id": "C2FwgYLNwQUSs2ZUnQgjkk", "question": "What powers the vessel pulling the skier?", "choices": ["wind", "coal", "boat motor", "sharks"], "correct_choice_idx": 2, "direct_answers": ["motor", "boat motor", "boat", "motor", "motor", "boat", "gas", "motorboat", "engine", "motor"], "difficult_direct_answer": false, "rationales": ["The boat motor is powering.", "This is located on board and uses gasoline to make the boat move rapidly.", "The vehicle pulling the skier utilizes high speed generated by a manufactured power source."], "image": "val2014/COCO_val2014_000000368602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251660, "question_id": "C2YCsqAYQg5Wr8cAjUchw7", "question": "What vehicles are allowed on the rightmost lane?", "choices": ["cars", "trucks", "vans", "bicycles"], "correct_choice_idx": 3, "direct_answers": ["bicycles", "bikes", "bicycling", "bicycle", "bicycle", "bikes", "bikes", "bicycles", "bikes", "bicycles"], "difficult_direct_answer": false, "rationales": ["A person is riding a bike in the farthest lane to the right. bike lanes are often on the right side of streets.", "Bikes are only for the lane.", "The lane is narrower than a normal road lane and does not have the standard road lines. there is additionally a bicyclist in the lane."], "image": "train2014/COCO_train2014_000000251660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354480, "question_id": "C2hngLtp45B6hQFYbGM9iq", "question": "How many types of boats are there?", "choices": ["five", "four", "six", "nine"], "correct_choice_idx": 0, "direct_answers": ["five", "one", "one", "one", "zero", "one", "one", "zero", "zero", "zero"], "difficult_direct_answer": false, "rationales": ["There are five boats.", "There are five spikes pointed to by the woman.", "There are five different types of boats."], "image": "train2014/COCO_train2014_000000354480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243851, "question_id": "C2kuHZcVv57RFbKQcSLHaE", "question": "What's the area called the player is standing on?", "choices": ["home base", "first base", "pitcher's mound", "outfield"], "correct_choice_idx": 2, "direct_answers": ["mound", "pitcher's mound", "pitcher's mound", "pitcher's mound", "mound", "pitcher's mound", "mound", "mound", "pitcher's mound", "mound"], "difficult_direct_answer": false, "rationales": ["The person is on the mound.", "A baseball player is throwing the ball from an elevated area of sand on the field.", "He is on the pitchers mound"], "image": "train2014/COCO_train2014_000000243851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393760, "question_id": "C2nLXFo4khdDU7NRV8GVZ5", "question": "What is to the left of the bed?", "choices": ["hashtag", "gargoyle", "egg", "alarm clock"], "correct_choice_idx": 3, "direct_answers": ["chair", "alarm clock", "alarm clock", "alarm clock", "clock radio", "nightstand", "alarm clock", "docking station", "doorway", "clock"], "difficult_direct_answer": false, "rationales": ["There is a table on the left side of the bed with an alarm clock on it.", "The alarm is seen beside the bed.", "It is an electronic object with a digital display."], "image": "train2014/COCO_train2014_000000393760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170629, "question_id": "C3SnJZBCwSPohzRsGFzqNe", "question": "Why is the vehicle stopped near the curb?", "choices": ["getting gas", "delivering packages", "accepting passengers", "parking"], "correct_choice_idx": 2, "direct_answers": ["access passengers", "accepting passengers", "unloading", "dropping off", "loading", "board passengers", "bus stop", "passenger pickup", "passenger pickup", "bus stop"], "difficult_direct_answer": false, "rationales": ["The vehicle is letting people onto the bus.", "They are at a bus stop so people can get on and off", "When busses stop, it's to pick people up."], "image": "val2014/COCO_val2014_000000170629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296706, "question_id": "C3T5Qy4J4WqLcxoE7JwekG", "question": "Where are the zebras?", "choices": ["forest", "desert", "jungle", "river"], "correct_choice_idx": 1, "direct_answers": ["home", "lake bed", "savanna", "africa", "africa", "desert", "desert", "dirty area", "outside", "desert"], "difficult_direct_answer": false, "rationales": ["The surroundings are barren, without plants or water.", "This barren landscape is sandy and dusty, which offers proof that this location is a desert.", "The zebras are in a dry sandy area in a desert in the wilderness."], "image": "train2014/COCO_train2014_000000296706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332031, "question_id": "C3c9VVttV5YeXxJRYFjTSi", "question": "What type of meat is on the plate?", "choices": ["bacon", "hamburger", "pork chop", "steak"], "correct_choice_idx": 0, "direct_answers": ["bacon", "breakfast", "bacon", "bacon", "bacon", "bacon", "breakfast", "breakfast", "bacon", "breakfast"], "difficult_direct_answer": false, "rationales": ["The meat comes from a pig, not a cow. they are strips, not chops.", "The plate has two strips of bacon on it.", "The meat is served in strips with striations on it."], "image": "train2014/COCO_train2014_000000332031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469061, "question_id": "C3nD46cKyAQcTVYAxkqVWM", "question": "Why are his skis so small?", "choices": ["are broken", "is new", "is child", "someone else's"], "correct_choice_idx": 2, "direct_answers": ["he's child", "for child", "child sized", "is child", "child skis", "his small", "child size", "child", "child", "young child"], "difficult_direct_answer": true, "rationales": ["An adult and a child are skiing. children wear smaller skis than adults.", "The kid is small so is skis are too.", "The skis are for a kid."], "image": "val2014/COCO_val2014_000000469061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131416, "question_id": "C4Cfo2i7vRsMVH328F5btm", "question": "What is the primary gas is released from the soda can on the right when opened?", "choices": ["nitrogen", "oxygen", "helium", "carbon dioxide"], "correct_choice_idx": 3, "direct_answers": ["carbon dioxide", "carbon dioxide", "using opener", "carbon", "carbon dioxide", "cop", "carbonation", "carbon dioxide", "carbon dioxide", "carbon dioxide"], "difficult_direct_answer": false, "rationales": ["The soda can will release co2.", "The can contains soda which is carbonated and releases bubbles of co2.", "It is a soda."], "image": "val2014/COCO_val2014_000000131416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482242, "question_id": "C4MeD6n9yQZ3qJXHqGJTRE", "question": "What breed dog it is?", "choices": ["doberman", "poodle", "labrador", "retriever"], "correct_choice_idx": 2, "direct_answers": ["lab", "black lab", "labrador retriever", "lab", "labrador", "lab", "labrador", "whippet", "lab", "rottweiler"], "difficult_direct_answer": false, "rationales": ["The man is sitting with his black labrador on his lap.", "The dog has a coat consistency and color that would match answer a as well as the tail, ears shape and size and nose that would make answer a most likely.", "The black dog is a labrador dog."], "image": "val2014/COCO_val2014_000000482242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76885, "question_id": "C4XVLJ4xRyhj7L73Ggtj7u", "question": "Why is the man wearing a ring on the fourth finger of his left hand?", "choices": ["he's stylish", "he's married", "dress code", "fashion"], "correct_choice_idx": 1, "direct_answers": ["married", "married", "he's married", "married", "he's married", "married", "he's married", "married", "married", "married"], "difficult_direct_answer": false, "rationales": ["Wedding rings are traditionally worn on the fourth finger of your left hand.", "It's a wedding band.", "A gold ring is on the mustached man's left hand on his fourth finger. this is symbolic of being married - also known as a \"wedding band\" - and that is the finger it goes on."], "image": "train2014/COCO_train2014_000000076885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92205, "question_id": "C4wyZGWrEgpkHsubkbCFyZ", "question": "Why does the man have a yellow shirt on?", "choices": ["for work", "for clubbing", "for style", "for halloween"], "correct_choice_idx": 0, "direct_answers": ["construction worker", "get noticed", "safety", "for safety", "for work", "jacket", "visibility", "identification", "increased visibility", "control traffic"], "difficult_direct_answer": true, "rationales": ["A man is standing on the side of the road holding a road sign in a brightly colored vest. road workers wear yellow for visibility.", "The man is employed by a construction company and he wears the yellow shirt for safety so that people can see him while he is performing his duties.", "In that line of work you need the vests for safety."], "image": "val2014/COCO_val2014_000000092205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447479, "question_id": "C5a6C4jPwd8RrCd6jXuH3A", "question": "What baseball player would make sense to own this store?", "choices": ["randy arozarena", "del wilkes", "omar infante", "david wright"], "correct_choice_idx": 0, "direct_answers": ["randy johnson", "randy johnson", "randy", "randy johnson", "randy johnson", "randy johnson", "randy arozarena", "randy", "randy", "randy johnson"], "difficult_direct_answer": false, "rationales": ["The first name on the top of the donut is not del, omar, or david.", "Is is mr. arozarena's first name.", "The sign above the store says the name randy in the possessive form, indicating it is owned by somebody with that name."], "image": "train2014/COCO_train2014_000000447479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209357, "question_id": "C5bw6XQcbsaR4gjEiHXunF", "question": "What activity are the men going to participate?", "choices": ["surfing", "skateboarding", "skiing", "skiboarding"], "correct_choice_idx": 3, "direct_answers": ["snowboarding", "snow boarding", "snowboarding", "snowboarding", "skiboarding", "snow boarding", "snowboarding", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["One of them is holding a snowboard.", "This is the only activity of the four that can be performed on a single board going down a snowy hill.", "It looks like a snowboard, which is like skiing."], "image": "val2014/COCO_val2014_000000209357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103726, "question_id": "C5eYJTryHnVfZRzD9CFJww", "question": "What is the person in red outfit doing?", "choices": ["sightseeing", "jogging", "watching race", "swimming"], "correct_choice_idx": 2, "direct_answers": ["walking beach", "walking", "watching", "standing", "racing jockeying", "watching race", "riding", "watching race", "waking", "watching"], "difficult_direct_answer": false, "rationales": ["But they could also be d. it's hard to say at this distance.", "The person in red is watching horses with riders in sports attire riding horses fast together.", "There are horses and jockeys in the foreground. the person in red is looking at them."], "image": "train2014/COCO_train2014_000000103726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52611, "question_id": "C5pVJZvn7ooe95rfuncbEj", "question": "What might you see on top of the white item to the right?", "choices": ["toilet roll", "soap", "sponge", "brush"], "correct_choice_idx": 0, "direct_answers": ["toilet paper", "paper bag", "pole", "human", "tissue", "toilet paper", "toilet paper", "toilet paper", "water", "toilet roll"], "difficult_direct_answer": false, "rationales": ["The toilet roll is on the right.", "The toilet generally will have toilet paper on it.", "Toilet paper is often stored on top of toilets."], "image": "train2014/COCO_train2014_000000052611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243354, "question_id": "C5ptnZKf73rwWpguDMkCwy", "question": "What style of skateboard is the man in the white shirt using?", "choices": ["long board", "radio board", "vert board", "hover board"], "correct_choice_idx": 0, "direct_answers": ["long board", "longbaord", "oldschool", "longboard", "regular", "long board", "beginner", "unknown", "longboard", "cruiser"], "difficult_direct_answer": false, "rationales": ["The board is not a traditional skateboard. it looks like an older \"longboard\" design from decades past.", "The man in the white shirt is riding a skateboard known as a longboard because it is longer than normal", "The man is riding on a long board and we know this because it is longer."], "image": "train2014/COCO_train2014_000000243354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335289, "question_id": "C68dQTf5WEigTf3opMidjD", "question": "What object is this structure modeled after?", "choices": ["temple", "museum", "canal", "egyptian obelisk"], "correct_choice_idx": 3, "direct_answers": ["obelisk", "egyptian obelisks", "obelisk", "tower", "obelisk", "fish", "egyptian obelisk", "lincoln memorial", "empire state", "bullet"], "difficult_direct_answer": false, "rationales": ["The top of this large vertical structure has a pyramid shape.", "The shape of the tower looks like the egyptian obelisk.", "It is almost identical."], "image": "train2014/COCO_train2014_000000335289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409088, "question_id": "C6KJWc8js2gJqcqFR9Foqf", "question": "What type of bird is this?", "choices": ["raven", "finch", "parakeet", "sparrow"], "correct_choice_idx": 3, "direct_answers": ["sparrow", "pigeon", "pigeon", "finch", "robin", "common", "sparrow", "sparrow", "sparrow", "sparrow"], "difficult_direct_answer": false, "rationales": ["This is a small brown bird which makes it likely to be a sparrow.", "It also looks a bit line a wren. the other options don't match.", "A small brown bird is on a table."], "image": "val2014/COCO_val2014_000000409088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315769, "question_id": "C6X4QCDoEBabaKQqjUhDYQ", "question": "Which flag is on the bus?", "choices": ["uk", "danish", "france", "germany"], "correct_choice_idx": 0, "direct_answers": ["uk", "union jack", "england", "uk", "britain", "uk", "uk", "britain", "british", "union jack"], "difficult_direct_answer": false, "rationales": ["The bus is painted to resemble the union jack.", "The flag is the uk.", "This is a bus used in london."], "image": "val2014/COCO_val2014_000000315769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544209, "question_id": "C6bPJdcdYadoDEZdr8pNms", "question": "Where are the cows?", "choices": ["barn", "city", "field", "corn field"], "correct_choice_idx": 2, "direct_answers": ["in field", "pasture", "enclosed field", "field", "behind fence", "behind fence", "inside fence", "farm", "fence", "inside fence"], "difficult_direct_answer": false, "rationales": ["The cows are outside and are in a rural area. they are near grass, not corn.", "The cows are in grass.", "The area where the cows are standing is grassy and green."], "image": "train2014/COCO_train2014_000000544209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311208, "question_id": "C6fAVt2e8ZqcAGCjUpAibH", "question": "Which mode of transport here is inanimate?", "choices": ["train", "horse", "bike", "car"], "correct_choice_idx": 2, "direct_answers": ["bicycle", "bike", "bicycle", "bike", "bicycle", "bicycle", "bicycle", "bike", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["This is made from metal and rubber", "The horses are animate. there are no cars or trains.", "The horse is living. there are no motorized vehicles."], "image": "train2014/COCO_train2014_000000311208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184669, "question_id": "C6iZhXhxwaKjpByLZURcc8", "question": "Where is the truck going?", "choices": ["store", "bank", "beach", "restaurant"], "correct_choice_idx": 0, "direct_answers": ["store", "delivery", "to left", "store", "off road", "taking left", "next delivery", "shops", "into parkinglot", "through intersection"], "difficult_direct_answer": true, "rationales": ["The truck holds coca cola bottles which are sold at a store.", "This company delivers its products to stores nationwide.", "A large commercial truck with a soda company logo is driving in the street."], "image": "train2014/COCO_train2014_000000184669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517941, "question_id": "C7EsBniTHa3aDpioSpHsUm", "question": "What kind of shorts does the man have on?", "choices": ["puma", "gucci", "vans", "nike"], "correct_choice_idx": 3, "direct_answers": ["nike", "athletic", "nike", "nike", "nike", "nike shorts", "nike", "tennis", "nike", "tennis"], "difficult_direct_answer": false, "rationales": ["You can see the little checkmark on their shorts.", "The logo on the shorts is referred to as the \"a\" swoosh.", "The shoes have the iconic swoosh."], "image": "train2014/COCO_train2014_000000517941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301766, "question_id": "C7V56goSwWLZiWNGFyTqZq", "question": "What temperatures does the soaring person enjoy?", "choices": ["room", "boiling", "tropical", "freezing"], "correct_choice_idx": 3, "direct_answers": ["cold", "freezing", "cold", "cold temperatures", "frozen", "cold temperatures", "minus degree", "cooler", "freezing", "35"], "difficult_direct_answer": false, "rationales": ["The man that is soaring is freezing because he is skiing on a snowy mountain.", "The person is skiing. this activity is done outside and cannot be done in warm areas.", "The soaring person is wearing ski equipment. skiing is something done in the winter with freezing temperature so if this is something they enjoy they would likely enjoy the weather that produces the necessary conditions."], "image": "val2014/COCO_val2014_000000301766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539068, "question_id": "C7qQ4qUs57hFSGmSRzaRXB", "question": "What street sign is the man standing next to?", "choices": ["stop", "yield", "bump", "caution"], "correct_choice_idx": 2, "direct_answers": ["bump", "bump", "bump", "bump", "bump", "bump", "bump", "bump", "bump", "bump"], "difficult_direct_answer": false, "rationales": ["The man is standing next to an orange street sign that says bump.", "The word on the sign indicates its name.", "The word is on the sign."], "image": "train2014/COCO_train2014_000000539068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357435, "question_id": "C7sEbCmUwfAHUz4mQU3ztt", "question": "Where does the person who holds the ball stand here?", "choices": ["bullpen", "home base", "pitchers mound", "third base"], "correct_choice_idx": 2, "direct_answers": ["mound", "batting area", "pitchers mound", "mound", "on mound", "mound", "pitchers mound", "pitchers mound", "pitching", "pitcher mound"], "difficult_direct_answer": false, "rationales": ["The man who threw the ball is the pitcher and the hill of dirt he stands on is called the pitcher's mound.", "There is a mound in the field where a designated player can throw the ball from.", "The person holding the ball stands near the small elevated piece of dirt called the pitchers mound."], "image": "train2014/COCO_train2014_000000357435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422375, "question_id": "C7zLVvzXi6QD4uuRvAUaic", "question": "What is the black layer of outer clothing he is wearing called?", "choices": ["vest", "chino", "jacket", "blazer"], "correct_choice_idx": 0, "direct_answers": ["vest", "vest", "vest", "vest", "vest", "vest", "vest", "vest", "vest", "vest"], "difficult_direct_answer": false, "rationales": ["The black clothing is a vest.", "The black piece of clothing is put in a layer and it has no sleeves.", "This is a clothing item that covers the torso only"], "image": "val2014/COCO_val2014_000000422375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554561, "question_id": "C8Gwkxz3cCeda7uXEawNKB", "question": "What sport are the boys playing?", "choices": ["cricket", "rugby", "ultimate frisbee", "disc golf"], "correct_choice_idx": 2, "direct_answers": ["frisbee", "frisbee", "ultimate frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The boys are playing with a frisbee.", "The boys are playing frisbee.", "The boy is clearly holding a frisbee and looking to throw it while being defended by a nearby player. for two people to be orientated in this way with a frisbee involved answer a is most likely."], "image": "train2014/COCO_train2014_000000554561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393097, "question_id": "C8Mhwrm3patEFYdySt9vkR", "question": "What condiment ends in the same four letters that the name on the bus ends in?", "choices": ["mayonnaise", "ketchup", "mustard", "relish"], "correct_choice_idx": 2, "direct_answers": ["custard", "mustard", "mustard", "mustard", "mustard", "mustard", "mustard", "mustard", "mustard", "mustard"], "difficult_direct_answer": false, "rationales": ["Mustard ends with the letters ard.", "A bus with a howard logo is parked near people who are boarding.", "The most common condiment that ends with \"tard\" would be mustard."], "image": "train2014/COCO_train2014_000000393097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327807, "question_id": "C8RrEj56U3hBTNAHytwYGh", "question": "What is the person on the right selling?", "choices": ["pizza", "water", "cars", "swords"], "correct_choice_idx": 1, "direct_answers": ["fruits", "food", "food", "groceries", "water", "food", "water", "food", "vegetables", "drinking water"], "difficult_direct_answer": false, "rationales": ["There are bottles of clear liquid in a basket", "The person has water bottles available for sale.", "The person on the right is selling bottled water."], "image": "val2014/COCO_val2014_000000327807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342649, "question_id": "C8UZYLVr9sN3DWMmfgLCB7", "question": "What is tinfoil made of?", "choices": ["tin", "plastic", "steel", "copper"], "correct_choice_idx": 0, "direct_answers": ["aluminum", "metal", "tin", "aluminum", "aluminum", "metal", "aluminum", "tin", "aluminum", "tin"], "difficult_direct_answer": false, "rationales": ["Foil is made out of tin.", "Foil is aluminum.", "Tinfoil is balled up in the street."], "image": "val2014/COCO_val2014_000000342649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392892, "question_id": "C8a6iCoyTc3WEx6nQ7xH7b", "question": "Lights that attach to a ceiling rack are known as what?", "choices": ["track", "dimmed", "lined", "mount"], "correct_choice_idx": 0, "direct_answers": ["track", "semi-flush lighting", "track lights", "ceiling mount", "fixtures", "pendants", "overhead", "lightbulbs", "ceiling lights", "rack lights"], "difficult_direct_answer": true, "rationales": ["Lights are tracks.", "Track lights are often put on a ceiling.", "The lights are attached to a track."], "image": "val2014/COCO_val2014_000000392892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193261, "question_id": "C98oFSdYjifCq5jdVjN6bU", "question": "The clouds here indicate what might happen?", "choices": ["tornado", "wind tunnel", "rain", "sunny skies"], "correct_choice_idx": 2, "direct_answers": ["storm", "rain", "rain", "rain", "rain", "storm", "storm", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The sky is gray and the sun is not visible.", "Grey and white clouds dot the sky. this indicates stormy weather with participation could occur.", "The skies are overcast and very cloudy."], "image": "val2014/COCO_val2014_000000193261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10484, "question_id": "C9JMded2r2cZUQr3VNtLNX", "question": "What is located above the ironwork on top of the train that is heading away?", "choices": ["coffee shop", "factory", "storefront", "train track"], "correct_choice_idx": 3, "direct_answers": ["bridge", "clamps", "structure", "grates", "building", "rust", "iron", "bridge", "train track", "bridge"], "difficult_direct_answer": false, "rationales": ["It's what the train runs on.", "A train travels on train tracks.", "That is how the train travels on."], "image": "train2014/COCO_train2014_000000010484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362090, "question_id": "C9K6Zn59z8XX3Lza2nN2eB", "question": "What will be the first the person grabs when they stand up?", "choices": ["dry cleaning", "backpack", "jacket", "cane"], "correct_choice_idx": 3, "direct_answers": ["cane", "cane", "cane", "cane", "cane", "cane", "cane", "cane", "cane", "cane"], "difficult_direct_answer": false, "rationales": ["This person needs a cane to walk so they will grab the cane to balance themselves.", "The woman has a cane to help her walk.", "That helps them keep balanced."], "image": "val2014/COCO_val2014_000000362090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66406, "question_id": "C9LSTdcB3Ep7oAuTW8EjK5", "question": "What is the long part attached to the elephant called?", "choices": ["trunk", "hose", "funnel", "nose"], "correct_choice_idx": 0, "direct_answers": ["trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk"], "difficult_direct_answer": false, "rationales": ["A funnel or a hose is not a body part. other animals have noses.", "It is what the elephant uses to wash himself and pick up things with.", "There is a long nose that is curled up in the air. it's moved out of the way so human can give it milk."], "image": "train2014/COCO_train2014_000000066406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527098, "question_id": "C9roppNwNTC596qzkCaDnA", "question": "Where can you see a similar scene to what is happening behind the boats?", "choices": ["parking garage", "king kullen", "six flags", "costco"], "correct_choice_idx": 2, "direct_answers": ["amusement parks", "six flags", "amusement park", "boardwalk", "state fair", "amusement park", "carnival", "amusement park", "fair", "circus"], "difficult_direct_answer": false, "rationales": ["An amusement park is back there.", "This location also has amusement park and carnival rides like the ones on the shore.", "Six flags has all the carnival rides that are depicted."], "image": "val2014/COCO_val2014_000000527098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495124, "question_id": "CAjPxuXwYzse6tmCfY47dV", "question": "What drug is on the kitchen table?", "choices": ["crack", "methamphetamines", "cocaine", "alcohol"], "correct_choice_idx": 3, "direct_answers": ["beer", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "beer", "sloping pills", "alcohol"], "difficult_direct_answer": false, "rationales": ["Bottles of beer are on the table.", "Alcohol bottles are on the table.", "Many people forget that it's technically a drug in liquid form."], "image": "train2014/COCO_train2014_000000495124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447558, "question_id": "CAzadzzwd9D4QVvMyT2CvF", "question": "In what year did number 7 win the World Series?", "choices": ["2013", "2020", "2011", "2000"], "correct_choice_idx": 2, "direct_answers": ["2011", "1917", "1997", "2010", "2011", "unknown", "recent years", "2010", "2011", "2010"], "difficult_direct_answer": false, "rationales": ["The player's team last won in 2011.", "He won in 2011", "The st. louis cardinals won the world series this year. matt holliday was a st. louis cardinal."], "image": "val2014/COCO_val2014_000000447558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91566, "question_id": "CBEtrCTFZ6Z9wfAeNnyM3P", "question": "What is this base of this food?", "choices": ["broccoli", "flour", "potatoes", "milk"], "correct_choice_idx": 1, "direct_answers": ["flour", "dough", "bread", "cheese", "cheese", "dough", "dough", "pizza dough", "crust", "flour"], "difficult_direct_answer": false, "rationales": ["The base is flour.", "The crust is made out of flour.", "Flour is what pizza crust is made of."], "image": "train2014/COCO_train2014_000000091566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99902, "question_id": "CBLoHYkQp8zLx8Hv9KnDMA", "question": "The person wearing what color of shirt is in the greatest danger?", "choices": ["red", "white", "black", "green"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "white", "white", "white", "white baby", "white", "white", "suffocating", "suffering"], "difficult_direct_answer": false, "rationales": ["A small baby is on the far side. one has to be careful for him or her to roll over on stomach.", "The person in white could roll off the bed.", "The person in the white shirt is the youngest and a baby, and quite fragile."], "image": "train2014/COCO_train2014_000000099902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85795, "question_id": "CBWhmGwLw8VkQkn5H98piy", "question": "What is the girl sitting in?", "choices": ["box", "chair", "scooter", "luggage bag"], "correct_choice_idx": 3, "direct_answers": ["bag", "suitcase", "suitcase", "luggage bag", "suitcase", "suitcase", "suitcase", "suitcase", "suitcase", "bag"], "difficult_direct_answer": false, "rationales": ["The other options aren't in this image. it's also known as a suitcase.", "The suitcase is open with a child sitting in it.", "The girl is sitting in a suitcase."], "image": "val2014/COCO_val2014_000000085795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524844, "question_id": "CBZxdnDAKKGNGiA6YXMakY", "question": "What's the name of the recreational area the man is in?", "choices": ["blacktop", "theme park", "playground", "skatepark"], "correct_choice_idx": 3, "direct_answers": ["park", "park", "skate park", "skate park", "park", "skatepark", "park", "skate park", "skate park", "skate park"], "difficult_direct_answer": false, "rationales": ["He is skating on a skatepark.", "There are people who seems to be skating around.", "The man is riding in a skatepark."], "image": "val2014/COCO_val2014_000000524844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30947, "question_id": "CBgAvEheBLz3HPKMKCAYhQ", "question": "What color are the sticky notes that are on the right side of the computer?", "choices": ["brown", "orange", "pink", "blue"], "correct_choice_idx": 2, "direct_answers": ["pink", "red", "pink", "pink", "pink", "red", "red", "red", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["The sticky notes on the right side of the computer monitor are bright pink.", "These sticky notes are not orange, blue, or brown.", "The sticky notes on the computer monitor's right side are pink."], "image": "train2014/COCO_train2014_000000030947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160688, "question_id": "CBivV8mLKv6BzxoDvSsarL", "question": "What are these women doing that is commendable?", "choices": ["eating fruit", "volunteering", "recycling", "running"], "correct_choice_idx": 1, "direct_answers": ["wearing gloves", "volunteering", "volunteering", "volunteering", "distributing food", "selling", "volunteering", "feeding people", "volunteering", "volunteering"], "difficult_direct_answer": false, "rationales": ["One of the women is wearing a badge with the word volunteer on it, which indicates that the activity they're taking part in is volunteering.", "They are volunteering their time.", "The person is helping out."], "image": "train2014/COCO_train2014_000000160688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176328, "question_id": "CBogZ5EVKgpBhdCAUbAuw5", "question": "At this moment what relationship do the two holding White Wiis engage in?", "choices": ["partnership", "mentoring", "competitive", "complaining"], "correct_choice_idx": 2, "direct_answers": ["opponents", "opponents", "competition", "opponents", "opponents", "game", "competitors", "competitors", "competitive", "opponents"], "difficult_direct_answer": false, "rationales": ["There is no way to tell for sure based on the image, but they are holding video game controllers. two people with video game controllers are often in competition with each other.", "They are competiting against one another.", "The people are holding video game controllers and standing next to each other facing the screen."], "image": "val2014/COCO_val2014_000000176328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255115, "question_id": "CC2q6SMavBvPXDhcBV3gy2", "question": "What is the man wearing that is made of rubber?", "choices": ["vest", "shirt", "pants", "boots"], "correct_choice_idx": 3, "direct_answers": ["xtratuf boots", "boots", "boots", "xtratuf boots", "boots", "boots", "boots", "boots", "boots", "boots"], "difficult_direct_answer": false, "rationales": ["Rubber boots can be worn to protect from wet areas.", "He has rubber boots.", "The items on the mans feet are made of rubber and for keeping water off of his feet."], "image": "train2014/COCO_train2014_000000255115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156258, "question_id": "CCnKAmKqim7qpFDG7HEYua", "question": "The white round item on top of this food is part of what other food item?", "choices": ["flounder", "tuna", "onion rings", "calamari"], "correct_choice_idx": 2, "direct_answers": ["onion", "onion rings", "onion", "onions", "onions", "onion", "onion", "onions", "onion", "onion"], "difficult_direct_answer": false, "rationales": ["These are onions, and onion rings are made out of the same thing, just covered in batter.", "It is the same color, pattern, and slice shape of the vegetable.", "It is seems to be from onion as seen with chopped onions."], "image": "train2014/COCO_train2014_000000156258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193943, "question_id": "CCyfHirUmk9JWDxw89aFw8", "question": "The cross buck sign indicates what?", "choices": ["railroad", "train crossing", "need sound", "none"], "correct_choice_idx": 1, "direct_answers": ["crossing", "train crossing", "train crossing", "train crossing", "traffic", "train", "deer", "no crossing", "railway crossing", "deer crossing"], "difficult_direct_answer": false, "rationales": ["The cross buck sign indicates where railroad tracks cross.", "There is a train passing by the road.", "A train is crossing the road."], "image": "train2014/COCO_train2014_000000193943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121994, "question_id": "CD7WxeWzbMmJs4ZvWUXmKu", "question": "What would be the most efficient way to coat the darker treat here?", "choices": ["flicking", "brush", "dipping", "spray bottle"], "correct_choice_idx": 2, "direct_answers": ["drizzle", "dunk", "dip it", "dip it", "dipping", "dunking", "wet frosting", "sprinkle", "dip", "sprinkles"], "difficult_direct_answer": true, "rationales": ["It has sprinkles on it.", "The donut could be dipped to ice it.", "A brush would not give a thick even coating, chocolate is too thick for a spray bottles, and flicking would not be efficient in applying an even coat."], "image": "train2014/COCO_train2014_000000121994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442040, "question_id": "CDP66qe6egk24rcBXZgAbt", "question": "Where are the patrons unable to ski or snowboard?", "choices": ["ski lift", "grass", "snow", "lodge"], "correct_choice_idx": 1, "direct_answers": ["beyond fence", "on dirt", "snowless areas", "fenced area", "marked areas", "behind fence", "dirt", "bottom", "behind fence", "grass"], "difficult_direct_answer": true, "rationales": ["The patrons can't go in grass.", "Grass is not the right texture for snowboarding or skiing.", "Snow is needed to ski and the skis would get stuck on the grass."], "image": "val2014/COCO_val2014_000000442040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520530, "question_id": "CDeTqHxQrBo7Q6Wxj3hGvk", "question": "What style return is being utilized here?", "choices": ["none", "forehand", "backhand", "two handed"], "correct_choice_idx": 2, "direct_answers": ["man effort", "tennis", "backhand", "swing", "backhand serve", "backhand", "backhand", "forehand", "low", "serve"], "difficult_direct_answer": false, "rationales": ["A left handed player reaching across their body like this with his wrist in would be completing the action known as answer a.", "He is using a backhand swing.", "This is obvious based on the positioning of the hand and racket."], "image": "val2014/COCO_val2014_000000520530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185936, "question_id": "CDgNjisU7Jv9inDt5RR7Qr", "question": "What is hanging from the wall?", "choices": ["poster", "swords", "chandelier", "plant"], "correct_choice_idx": 3, "direct_answers": ["plant", "leaves", "blinds", "vines", "window shades", "plant", "plant", "plant", "plant", "plants"], "difficult_direct_answer": false, "rationales": ["There are leaves behind the man and around the wall.", "The leaves falling down inside of a planter, indicate that this is a live plant.", "A plant is hanging down with its leaves."], "image": "val2014/COCO_val2014_000000185936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505768, "question_id": "CDj4fhGW5XPTkz6aJ4wyCU", "question": "What are the little bumps on the tree branches?", "choices": ["insects", "leaves", "seed cones", "hives"], "correct_choice_idx": 2, "direct_answers": ["fruits", "knots", "pines", "seeds", "hard", "seeds", "fruits", "flowers", "seed cones", "leaf"], "difficult_direct_answer": false, "rationales": ["They are the pods that ensure the tree species will continue to grow for years", "They hold unborn trees.", "The bumps are all of the places where seed cones could grow."], "image": "train2014/COCO_train2014_000000505768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550140, "question_id": "CDjJMmb2MDmsZdQXchA9F2", "question": "What type of glove does the man with Casey on his jersey have on?", "choices": ["batting", "first baseman", "shortstops", "catcher"], "correct_choice_idx": 1, "direct_answers": ["baseball", "first baseman", "mitt", "baseball mit", "firstbaseman mitt", "pitcher", "baseball", "baseball mitt", "baseball glove", "mitten"], "difficult_direct_answer": true, "rationales": ["The glove is what the first baseman would wear.", "The glove is usually worn by people who play first base.", "He has a glove for a first baseman."], "image": "train2014/COCO_train2014_000000550140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311773, "question_id": "CDujsBWj49bkRGNvZNG8dw", "question": "What are the two using to play?", "choices": ["screen", "poster", "dresser", "refrigerator"], "correct_choice_idx": 3, "direct_answers": ["magnets", "toys", "magnets", "magnets", "magnets", "toys", "magnets", "refrigerator", "magnets", "magnets"], "difficult_direct_answer": false, "rationales": ["They have magnets on the side of the fridge.", "Two kids are standing near an appliance and sticking magnets to it. people put magnets on refrigerators.", "The two are using the refrigerator to play with kitchen magnets."], "image": "train2014/COCO_train2014_000000311773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341271, "question_id": "CDyMakTNVNbREpunV847Zw", "question": "What is in the shaker on the left?", "choices": ["salt", "crushed peppers", "parmesan cheese", "sugar"], "correct_choice_idx": 2, "direct_answers": ["parmesan cheese", "parmesan cheese", "parmesan", "parmesan cheese", "parmesan cheese", "parmesan cheese", "parmesan cheese", "parmesan", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["The shaker has cheese.", "Many people love to sprinkle this on their pizza.", "Parmesan cheese is used as an extra topping on food."], "image": "train2014/COCO_train2014_000000341271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367578, "question_id": "CE6AQqYqHsigKDd2GBjPpb", "question": "The woman behind and to the left of the closest man to the camera is wearing what color of shirt?", "choices": ["turquoise", "red", "black", "yellow"], "correct_choice_idx": 0, "direct_answers": ["turquoise", "turquoise", "turquoise", "teal", "green", "turquoise", "turquoise", "turquoise", "teal", "turquoise"], "difficult_direct_answer": false, "rationales": ["The woman's shirt is a shade of blue-green.", "The woman is in turquoise.", "There is a woman to the left behind the man wearing a turquoise blouse."], "image": "val2014/COCO_val2014_000000367578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232842, "question_id": "CEDqYKpcroKYUbJAcPPu2v", "question": "What type weather do people here hope for today?", "choices": ["snow", "rain", "wind", "sleet"], "correct_choice_idx": 2, "direct_answers": ["sea side", "windy", "wind", "sea side", "windy", "windy", "windy", "windy", "windy", "windy"], "difficult_direct_answer": false, "rationales": ["The weather is windy.", "Wind is required for kites.", "The air is blowing for the kites to stay in the air."], "image": "val2014/COCO_val2014_000000232842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408824, "question_id": "CEPL4BBXh7YWUg9JaUCrFT", "question": "What is the building on the far right primarily for?", "choices": ["dining", "swimming", "education", "sleeping"], "correct_choice_idx": 3, "direct_answers": ["hotel", "lodging", "hotel", "nightly rentals", "lodging", "hotel", "sleeping", "hotel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["The sign says hotel.", "The building is for sleeping.", "The building on the far right is a hotel."], "image": "train2014/COCO_train2014_000000408824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479081, "question_id": "CESnfDhWoigjsR3iuHA9KQ", "question": "The store behind the bus is having a sale due to which major event?", "choices": ["boxing day", "halloween", "labor day", "christmas"], "correct_choice_idx": 3, "direct_answers": ["christmas", "christmas", "clothing", "christmas", "christmas", "christmas", "christmas", "christmas", "holiday", "christmas"], "difficult_direct_answer": false, "rationales": ["The sign on the front of the store indicates the holiday that caused the sale.", "The store behind the bus is having a sale for christmas. there are christmas lights in the window.", "The store has many brilliant colors."], "image": "train2014/COCO_train2014_000000479081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474934, "question_id": "CEdei5gSeEuu7AjWqBjvMz", "question": "Which direction are the horses likely to go to together?", "choices": ["inland", "seaward", "nowhere", "city"], "correct_choice_idx": 0, "direct_answers": ["land side", "infront", "neither", "forward", "inland", "east", "left", "left", "forward", "left"], "difficult_direct_answer": false, "rationales": ["Horses are running on the beach together.", "The horses are standing on the seashore. horses are not usually kept in this area.", "Horses cannot swim well, particularly if someone is on their back."], "image": "val2014/COCO_val2014_000000474934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50695, "question_id": "CF5WDH8g2hTT3Y7HiGWsft", "question": "What is the parachute called in paragliding?", "choices": ["canopy", "wing", "balloon", "parachute"], "correct_choice_idx": 0, "direct_answers": ["playing", "paraglider", "paramotor", "paramotoring", "canopy", "parachute", "wing", "sail", "parasail", "paramotor"], "difficult_direct_answer": true, "rationales": ["Canopy is the name.", "It kind of looks like a canopy.", "The parachute is known as the canopy."], "image": "train2014/COCO_train2014_000000050695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472157, "question_id": "CF7VuF3pyrX5xnsCjoP7dX", "question": "What type of elephant is in the image?", "choices": ["stuffed", "adult", "baby", "dead"], "correct_choice_idx": 1, "direct_answers": ["grey", "adult", "adult", "mammoth", "african elephant", "asian", "african", "african", "adult", "african"], "difficult_direct_answer": false, "rationales": ["A grown elephant is there.", "A large elephant is surrounded by others. elephants are larger when they are adults.", "The elephant has developed tusks, which is typical of grown elephants."], "image": "train2014/COCO_train2014_000000472157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570656, "question_id": "CFBdKFD3p8dEFDW8C8JW9B", "question": "What is the woman pulling on?", "choices": ["tie", "rope", "leash", "necklace"], "correct_choice_idx": 0, "direct_answers": ["tie", "rope", "tie", "tie", "tie", "tie", "tie", "tie", "rope", "tie"], "difficult_direct_answer": false, "rationales": ["The woman is pulling someone by their tie.", "A woman is smiling as she pulls up a neck piece that guys wear when they dress up.", "The woman is pulling on the man's necktie."], "image": "train2014/COCO_train2014_000000570656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306722, "question_id": "CFDhfvHx4SDnNnd6MvtPPm", "question": "What is the fruit underneath and to the right of the two bananas?", "choices": ["pineapples", "grapefruit", "apples", "oranges"], "correct_choice_idx": 3, "direct_answers": ["orange", "oranges", "orange", "orange", "orange", "orange", "fruit", "tangerines", "orange", "oranges"], "difficult_direct_answer": false, "rationales": ["It is round and orange.", "The fruit beneath the bananas are oranges with bright orange rinds.", "The items are of orange color and have a pored textured."], "image": "train2014/COCO_train2014_000000306722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520434, "question_id": "CFGZg7eT5KVvzrzMT9Hr8Y", "question": "What does the woman want to do with the ball?", "choices": ["catch it", "hit it", "throw it", "dodge it"], "correct_choice_idx": 1, "direct_answers": ["hit it", "playing", "hit it", "playing", "hit it", "hit", "hit it", "hit it", "return", "volley"], "difficult_direct_answer": false, "rationales": ["The tennis racket is used to return her opponent's serve in the hopes of scoring a point.", "The woman has her racket out towards the moving ball to hit it back.", "She wants to hit it to the other side of the court."], "image": "train2014/COCO_train2014_000000520434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63874, "question_id": "CFHAXgDKnsxgDxDChGkFvs", "question": "What are they waiting for?", "choices": ["explanation", "dinner", "train", "assistance"], "correct_choice_idx": 2, "direct_answers": ["train", "bus stopping", "train", "bus stopping", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["They are at a train station waiting for a the train.", "They are waiting for a train to show up by the track next to them.", "People are waiting on the platform in a train station."], "image": "train2014/COCO_train2014_000000063874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303923, "question_id": "CFeQh7rp6nD35vsFKVbb8j", "question": "What is the shovel leaning against the fence on the left used for?", "choices": ["digging ditches", "snow removal", "planting flowers", "defense"], "correct_choice_idx": 1, "direct_answers": ["snow removal", "shovelling snow", "clearing snow", "shovel walkway", "snow removal", "snow", "snow shoveling", "snow", "shoveling", "snow"], "difficult_direct_answer": false, "rationales": ["The shovel leaning against the fence is used to clear snow off of the path.", "The shovel is visible and is of a style and shape consistent with answer a. there is also snow clearly visible meaning removal is likely needed at times.", "The shovel is to scoop out snow with."], "image": "train2014/COCO_train2014_000000303923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393266, "question_id": "CFnXPAnz4jVEC7qErLa4mW", "question": "What is next to the sign?", "choices": ["monkey", "baby", "tunnel", "alligator"], "correct_choice_idx": 2, "direct_answers": ["tunnel", "tunnel", "tunnel", "window", "tunnel", "tunnel", "window", "tunnel", "window tunnel", "tunnel"], "difficult_direct_answer": false, "rationales": ["There are no non-human animals or babies. there is a hollow area that allows cars and pedestrians to pass through.", "There is a tunnel near the sign.", "The sign is by a tunnel."], "image": "val2014/COCO_val2014_000000393266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273321, "question_id": "CFoKjNPZS3LsNE7wPZkJBj", "question": "Why is he in the air?", "choices": ["grab frisbee", "falling", "angry", "bouncing"], "correct_choice_idx": 0, "direct_answers": ["jumping", "playing", "catching frisbee", "catching", "catching frisbee", "jumped", "grab frisbee", "jumping", "hit ball", "he jumped"], "difficult_direct_answer": false, "rationales": ["He is jumping to catch the flying object.", "He is going towards a frisbee.", "The man is midair because he jumped to grab a flying frisbee."], "image": "val2014/COCO_val2014_000000273321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10216, "question_id": "CFz5RNfWziCQCeNNwGEw2B", "question": "What color hats do the flight attendants from this airline wear?", "choices": ["red", "purple", "white", "green"], "correct_choice_idx": 0, "direct_answers": ["blue", "red", "red", "red", "red", "black", "blue", "red", "red", "yellow"], "difficult_direct_answer": false, "rationales": ["It is part of the working attire which is required in order to do the job.", "To determine the color of the uniform hats, i did an internet search on emirates airline flight attendant images.", "The color is red."], "image": "val2014/COCO_val2014_000000010216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362166, "question_id": "CG3MbAtErooMStGrzGeNzB", "question": "This girl has more hair than which haircut?", "choices": ["buzzcut", "beehive", "bouffant", "afro"], "correct_choice_idx": 0, "direct_answers": ["short", "what", "buzz cut", "mom", "buzzcut", "pixie", "pixie cut", "dryer", "short", "buzzcut"], "difficult_direct_answer": false, "rationales": ["This woman has more hair than a buzzcut.", "A buzzcut means that the hair is extraordinarily short.", "The girl has long hair which is longer than a buzz cut which is shorter than one inch."], "image": "train2014/COCO_train2014_000000362166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295015, "question_id": "CG47p4fPJrJb7myg8PJUts", "question": "What are the people touching?", "choices": ["surfboards", "clown noses", "license plates", "eggs"], "correct_choice_idx": 0, "direct_answers": ["surfboards", "surfboards", "surfboards", "surfboards", "surfboards", "surfboards", "surfboards", "surfboards", "surfboards", "surfboards"], "difficult_direct_answer": false, "rationales": ["The people are touching the boards.", "The men seems to be touching the surfboard.", "These are flat objects that can be ridden through water."], "image": "train2014/COCO_train2014_000000295015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280642, "question_id": "CG5UYrss4x93NwrTJuJBCw", "question": "Where is this picture located?", "choices": ["lake", "river", "beach", "resort"], "correct_choice_idx": 3, "direct_answers": ["pool", "resort", "resort", "resort", "poolside", "in sea", "resort", "hotel", "resort", "tropical area"], "difficult_direct_answer": false, "rationales": ["The other options aren't seen in this image. the layout of the place and umbrellas point to a as well.", "This is a man-made pool at a hotel.", "There is an area for swimming. the body of water is man-made."], "image": "train2014/COCO_train2014_000000280642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346817, "question_id": "CGF7X5xxK3fLNhM9eT6bR6", "question": "What type of vehicle is sold in the building to the rear of the bus?", "choices": ["cars", "motorcycles", "tractors", "trucks"], "correct_choice_idx": 1, "direct_answers": ["motorcycle", "motorcycles", "small cars", "motorcycles", "motorcycles", "small vehicles", "honda", "sedan", "motorcycles", "motorcycle"], "difficult_direct_answer": false, "rationales": ["There are bikes by the bus.", "The dealership is that of motorbikes being sold.", "They look like they sell motorcycles according to the sign."], "image": "train2014/COCO_train2014_000000346817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362740, "question_id": "CGqN8AmnyCwQRurLe4nbVa", "question": "What is the brown table at the left bottom corner for?", "choices": ["preparing food", "reading desk", "coffee table", "playing chess"], "correct_choice_idx": 3, "direct_answers": ["playing chess", "placing purpose", "playing chess", "classy", "chess", "chess", "writing", "chess game", "chair", "chess"], "difficult_direct_answer": false, "rationales": ["A chess board is in a room and a chair is on one side of it.", "The table is a chess table given the checkerboard pattern.", "This has the pattern of a chessboard and a chair next to it so it would be best for playing chess."], "image": "train2014/COCO_train2014_000000362740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494900, "question_id": "CGw4LYCU85WAmvRSkUEeRw", "question": "What city is the sponsor of the arena located?", "choices": ["dubai", "denver", "new york", "calcutta"], "correct_choice_idx": 0, "direct_answers": ["nirates", "mirates", "dubai", "emirates", "dubai", "saudi arabia", "dubai", "no idea", "use", "dubai"], "difficult_direct_answer": false, "rationales": ["The arena has a sponsorship ad on the wall for the emirates airlines. i used the internet to search where their headquarters are located.", "Though partially obscured, arab emirates airlines is advertised here. this airline is headquartered in dubai.", "Emirates air is from the uae, and dubai is there."], "image": "train2014/COCO_train2014_000000494900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550144, "question_id": "CH7gWmGBuB3pSyYpoBUyf2", "question": "How are the planes flying?", "choices": ["racing", "zooming", "formation", "tailgating"], "correct_choice_idx": 2, "direct_answers": ["propeller", "upside down", "diving", "formation", "downward", "formation", "diving", "downward", "aimed down", "vertical"], "difficult_direct_answer": false, "rationales": ["The planes are flying downward in a set of three.", "They are performing in an air show", "The planes are flying as a trio."], "image": "train2014/COCO_train2014_000000550144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190187, "question_id": "CHAezv88TRGbwFLUqhxiTy", "question": "What sort of floor plan is seen here?", "choices": ["separate", "open", "cubicle", "tiny"], "correct_choice_idx": 1, "direct_answers": ["wood", "open", "wood", "open", "open", "open", "open", "open concept", "open", "open"], "difficult_direct_answer": false, "rationales": ["The room has a design that does not incorporate many walls creating subsections of space. this design style is known as answer a.", "There are no interior walls. the room is not tiny.", "There is a kitchen, dining room and bathroom all in one space"], "image": "train2014/COCO_train2014_000000190187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428117, "question_id": "CHRdLfdXf6uX5iFZ6gJB4a", "question": "What Wii sport game is he likely playing?", "choices": ["football", "golf", "bowling", "boxing"], "correct_choice_idx": 1, "direct_answers": ["tennis", "golf", "golfing", "golf", "golf", "tennis", "golf", "golf", "golf", "golf"], "difficult_direct_answer": false, "rationales": ["The orientation of the controller and the stance he is in resembles how one would hold a golf club. when playing wii sports, one holds the remote in association with how the real version of the sport would traditionally be played.", "The person is holding the remote down.", "He has his hands positioned down like he's holding a club"], "image": "train2014/COCO_train2014_000000428117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529102, "question_id": "CHSE8RAHLMJzZNcngKTdoT", "question": "What item in the picture is currently being banned by many major cities?", "choices": ["parsley", "mug", "plastic cup", "straw"], "correct_choice_idx": 3, "direct_answers": ["straw", "plastic straws", "straw", "plastic straw", "water", "straw", "straw", "plastic straw", "plastic straws", "water glass"], "difficult_direct_answer": false, "rationales": ["Many people want to get rid of plastic.", "Recently many eateries have banned these types of items due to environmental reasons.", "An item that is used to pull liquid out of a cup. but it can cause animals to ingest them. it can hurt them."], "image": "val2014/COCO_val2014_000000529102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437677, "question_id": "CHTE3ypAei6dSb5uD9Shn8", "question": "What is the metal the people are standing behind?", "choices": ["barricade", "shelf", "poster", "railway"], "correct_choice_idx": 0, "direct_answers": ["fence", "steel", "barricade", "fence", "fence", "fence", "barrier", "fence", "barricade", "barrier"], "difficult_direct_answer": false, "rationales": ["For the safety of the skateboarder and the spectators, there is often erected a barrier or fence to keep them apart.", "The metal is the barricade.", "The people are behind a fence to watch the skateboarders."], "image": "train2014/COCO_train2014_000000437677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281696, "question_id": "CHbxs2hfjr8afrNRyxPUHe", "question": "Where do you go in this street if you want to buy candy?", "choices": ["restaurant", "convenience store", "shoes store", "bank"], "correct_choice_idx": 1, "direct_answers": ["store", "candy store", "supermarket", "hagaan daze", "7-11", "7-11", "seven eleven", "storefront", "convenience store", "store"], "difficult_direct_answer": false, "rationales": ["The convenience store has candy.", "There's a 7/11 on the street corner.", "People can buy stuff in the store."], "image": "val2014/COCO_val2014_000000281696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425944, "question_id": "CHjaf46aea7QBz6N4yKoyW", "question": "Which culture has this custom?", "choices": ["scotland", "iraq", "india", "iran"], "correct_choice_idx": 3, "direct_answers": ["muslim", "iran", "greece", "middle eastern", "asia", "iran", "barbaric", "pakistan", "arabic", "jewish"], "difficult_direct_answer": true, "rationales": ["They have a sheep.", "This seems like something found in the country of iran.", "The culture of iran features goat slaughter."], "image": "train2014/COCO_train2014_000000425944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474186, "question_id": "CJ9PaNsCASXiknbp9i8Hjm", "question": "Biting what here would yield the lowest ingestion of fat?", "choices": ["carrot", "bread", "cheese", "sandwich"], "correct_choice_idx": 0, "direct_answers": ["sandwich", "carrot", "carrots", "carrot", "carrot", "carrot", "bread", "carrot", "carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["Carrots are fat free foods.", "The carrot has no fat.", "Fruits and vegetables are low in fat"], "image": "val2014/COCO_val2014_000000474186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141040, "question_id": "CJMjAnFbTJL4zTbrZiCTKT", "question": "What sauce is preferred here?", "choices": ["barbeque", "fish", "soy", "mustard"], "correct_choice_idx": 2, "direct_answers": ["hot sauce", "soy", "hot", "soy", "soy", "soy", "soy", "butter", "soy", "soy"], "difficult_direct_answer": false, "rationales": ["The guys are eating asian food and the soy-sauce is the most popular sauce.", "The persons hold chop sticks indicating they are about to eat chinese food. the bottle of soy sauce sits on the table and will be sprinkled on the food.", "Soy sauce is used with asian food."], "image": "val2014/COCO_val2014_000000141040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416489, "question_id": "CJTjZtNKDEVryD2JRHs3mn", "question": "What is the man seated in the back court doing?", "choices": ["eating", "sleeping", "returning serves", "judging"], "correct_choice_idx": 3, "direct_answers": ["judging", "sitting", "refereeing", "umpiring", "referee", "sitting", "referee", "sitting", "officiating", "judging"], "difficult_direct_answer": false, "rationales": ["The man is a judge.", "This places him in a good position to make important decisions regarding the match.", "From his position and what he is wearing you can tell what he is doing there."], "image": "val2014/COCO_val2014_000000416489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368072, "question_id": "CJU8k2F224mEKoSvzYW2Bb", "question": "What do the people walking on the beach carry?", "choices": ["dogs", "string", "babies", "footlongs"], "correct_choice_idx": 1, "direct_answers": ["phones", "beach blanket", "unknown", "kite", "string", "kite", "money", "nothing", "kite", "water"], "difficult_direct_answer": false, "rationales": ["The people walking on the beach are carrying the end of the string that controls the kite above them", "These people carry a string attached to a kite.", "They are flying a kite."], "image": "val2014/COCO_val2014_000000368072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560662, "question_id": "CJW9wvz6v8neVqVdBSbc2Y", "question": "What is in the air?", "choices": ["basketball", "baby", "airplane", "cat"], "correct_choice_idx": 0, "direct_answers": ["basketball", "basketball", "basketball", "basketball", "ball", "ball", "basketball", "ball", "basketball", "basketball"], "difficult_direct_answer": false, "rationales": ["The basketball is suspended in mid air.", "The people are playing a sport on a court. there are no babies, airplanes, or cats.", "The ball is in the air."], "image": "val2014/COCO_val2014_000000560662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20146, "question_id": "CJa3jYyrzbS8AqKwrNmo5N", "question": "How is the kid on the right brushing his teeth differently from the kid on the left?", "choices": ["electric", "different toothpaste", "crying", "lefthanded"], "correct_choice_idx": 3, "direct_answers": ["finger tips", "left handed", "left hand", "right hand", "lefthanded", "wrong end", "different grip", "direction", "left handed", "backwards"], "difficult_direct_answer": true, "rationales": ["He is using the hand opposite the other boy.", "She has the brush in her left hand and he has it in his right.", "One child is brushing with their right hand and the other is not."], "image": "train2014/COCO_train2014_000000020146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343821, "question_id": "CJn8rgeNWnhqyAFUKegg4F", "question": "The animals here were developed in which way?", "choices": ["surrogates", "incubated", "cloned", "live birth"], "correct_choice_idx": 1, "direct_answers": ["hatched", "in egg", "birthed", "from eggs", "incubated", "from egg", "eggs", "land sea", "eggs", "eggs"], "difficult_direct_answer": false, "rationales": ["The animals were incubated.", "They hatched from eggs kept warm by the mother swan.", "Birds incubate within their eggs."], "image": "val2014/COCO_val2014_000000343821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162530, "question_id": "CJosisLLo3RtrRuhHEWfNm", "question": "The owner of the apartment put the least investment into what for his building?", "choices": ["infrastructure", "street access", "aesthetics", "security"], "correct_choice_idx": 2, "direct_answers": ["plants", "curb appeal", "cleaning", "maintenence", "landscaping", "plants", "decorations", "aesthetics", "landscaping", "building"], "difficult_direct_answer": false, "rationales": ["The pots in front are beaten up with dead plants, and there is graffiti on the front.", "Aesthetically it does not look very nice, so money is not being spent there.", "The front of the building is plain, with graffiti and broken planters."], "image": "val2014/COCO_val2014_000000162530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571039, "question_id": "CJqdEXSZWMSUPRDGSzL9Bb", "question": "The hand of the clock is closest to what number?", "choices": ["twelve", "nine", "five", "one"], "correct_choice_idx": 2, "direct_answers": ["five", "number 5", "five", "four", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["The hand is at the five.", "The clock hand is near the number five.", "Based on the normal orientation of numbers on the clock and the position of the hand that is pointing to numbers, it is closest to five."], "image": "train2014/COCO_train2014_000000571039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492769, "question_id": "CJsp2763SEZaP6mPExzDoA", "question": "The food on the table stems from what country?", "choices": ["japan", "germany", "china", "italy"], "correct_choice_idx": 3, "direct_answers": ["italy", "italy", "italy", "italy", "italy", "italy", "italy", "italy", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["Pizza is from there.", "There is pizza on the table. pizza comes from italy.", "The pizza is from italy."], "image": "train2014/COCO_train2014_000000492769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123473, "question_id": "CJu6hcaTB4dTwgS5mjBbDs", "question": "What is the woman most likely doing to the child standing between her legs?", "choices": ["playing", "wiping", "reading", "feeding"], "correct_choice_idx": 3, "direct_answers": ["feeding", "watching", "feeding", "babysitting", "babysitting", "feeding him", "feeding", "babysitting", "feeding", "feeding"], "difficult_direct_answer": false, "rationales": ["The woman is feeding.", "The woman is holding a bowl of food and is probably sharing it with the child.", "The child is holding a bottle of juice and the woman is holding a bowl of food."], "image": "train2014/COCO_train2014_000000123473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438422, "question_id": "CK8DDogUPcefTj39EkwwEU", "question": "What type of kitchen is this?", "choices": ["island", "galley", "residential", "commercial"], "correct_choice_idx": 3, "direct_answers": ["soup", "commercial", "restaurant", "commercial", "commercial", "professional", "industrial", "commercial", "commercial", "food truck"], "difficult_direct_answer": false, "rationales": ["The kitchen is commercial.", "There is a giant fan stretching the length of the kitchen.", "The kitchen appears to be inside a food truck."], "image": "train2014/COCO_train2014_000000438422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206876, "question_id": "CK8gQBhmGbzHTnnWXkhfcs", "question": "What is being done to the elephant here?", "choices": ["fanning", "punishment", "torture", "cleaning"], "correct_choice_idx": 3, "direct_answers": ["backscratching", "brushed", "brushing", "bath", "brushed clean", "cleaning", "brushed", "cleaning", "cleaning", "cleaning"], "difficult_direct_answer": false, "rationales": ["The kids are being allowed to use the brush to clean up the elephants.", "These long poles with scrubbers on them are used to freshen up the animal", "The elephant is being scrubbed."], "image": "val2014/COCO_val2014_000000206876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419232, "question_id": "CKR76yTDJz5zBC6aXZRCY2", "question": "What is a likely occasion for all the kids getting together?", "choices": ["funeral", "school", "church", "birthday party"], "correct_choice_idx": 3, "direct_answers": ["birthday party", "party", "birthday party", "pizza party", "party", "birthday party", "birthday", "birthday", "birthday party", "birthday"], "difficult_direct_answer": false, "rationales": ["Given the presents on the table combined with what appears to be a race car-themed birthday cake, this event would most decidedly have to be a child's birthday party.", "They are kids around the same age, eating pizza at an establishment, which is the most common meal served at a birthday party.", "There are party plates and wrapped presents at the end of the table."], "image": "train2014/COCO_train2014_000000419232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278843, "question_id": "CKT95J9vbCvthbQwsu63Sd", "question": "The man wearing the mask is role playing as what?", "choices": ["luchador", "superhero", "villain", "ninja"], "correct_choice_idx": 0, "direct_answers": ["wrestler", "superhero", "mucha libre", "luchador", "super hero", "wrestler", "lucha libre", "rey mysterion", "hero", "costume"], "difficult_direct_answer": true, "rationales": ["The man with the mask is dressed as luchador.", "Luchador is who he is role playing as.", "These are wrestling costumes"], "image": "val2014/COCO_val2014_000000278843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265236, "question_id": "CKbLmhFywYz6Bk4z8vXs5v", "question": "What is the person doing?", "choices": ["yelling", "posing", "running", "drinking"], "correct_choice_idx": 1, "direct_answers": ["posing", "hiking", "posing", "posing", "hitch hiking", "posing", "posing", "picture", "hitchhiking", "posing"], "difficult_direct_answer": false, "rationales": ["Pretending to hitchhike", "By the person's location and posture you can tell what he is doing and why.", "The man is standing in a still position while looking at the camera."], "image": "train2014/COCO_train2014_000000265236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54492, "question_id": "CKeX8zEbdbDHMAF547QF7q", "question": "What kind of computer/electronic device is being used here?", "choices": ["laptop", "ipad", "tablet", "desktop"], "correct_choice_idx": 3, "direct_answers": ["computer", "desktop", "pc", "macintosh", "refrigerator", "desktop", "desktop", "apple", "apple", "desktop"], "difficult_direct_answer": false, "rationales": ["A computer monitor is on a table. desktop computers have a separate monitor.", "The monitor for the desktop computer can be seen in the corner on the desk.", "The monitor of a full size desktop computer is visible partially behind the black refrigerator."], "image": "train2014/COCO_train2014_000000054492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265196, "question_id": "CKptgVEQ95c45wgd44Tywu", "question": "What is the dog doing at the table?", "choices": ["eating", "barking", "playing", "urinating"], "correct_choice_idx": 0, "direct_answers": ["eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The dog has a bowl at the table with a fork to eat.", "The dog is eating out of its bowl.", "His bowl is on the table."], "image": "train2014/COCO_train2014_000000265196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38756, "question_id": "CKxAL8ua5zjT2N7dzoSmsM", "question": "How does the woman know the girl?", "choices": ["employee", "grandparent", "parent", "student"], "correct_choice_idx": 2, "direct_answers": ["daughter", "her daughter", "same family", "friend", "mother", "her daughter", "parent", "daughter", "mother", "mother"], "difficult_direct_answer": false, "rationales": ["A woman with a child sitting next to her eat at a restaurant together. the woman and child look alike.", "The younger girl resembles the older woman.", "The woman and girl are similar in appearance. the woman is not old enough to be the girl's grandmother."], "image": "train2014/COCO_train2014_000000038756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567000, "question_id": "CL8Szx2aFqfUqwBNsZa6yo", "question": "What are the 4 men most likely facing?", "choices": ["laptop", "bed", "tv", "refrigerator"], "correct_choice_idx": 2, "direct_answers": ["video screen", "video game", "tv", "television", "class", "students", "screen", "tv", "tv", "tv"], "difficult_direct_answer": false, "rationales": ["The men are holding remote controllers based on their size, shape, color and design. if using these correctly, they would be regarding answer a as they played.", "They're holding wii game controllers.", "The people are playing a game. they are facing a screen."], "image": "train2014/COCO_train2014_000000567000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326236, "question_id": "CLBWBURyRw9oNFMxiMJCWD", "question": "What is the group on the horses doing?", "choices": ["conquering", "racing", "touring", "fighting"], "correct_choice_idx": 1, "direct_answers": ["racing", "racing", "racing", "racing", "racing", "racing", "racing", "racing", "racing", "racing"], "difficult_direct_answer": false, "rationales": ["The group is racing.", "These people have numbers and appear to be going fast like in a race.", "The horses have jockeys on them who are leaning forward as if they are in a race."], "image": "train2014/COCO_train2014_000000326236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80713, "question_id": "CLCauw9PRVHHdnisV9otna", "question": "What are the people using to take pictures of the giraffes?", "choices": ["cameras", "remotes", "cell phones", "tablets"], "correct_choice_idx": 0, "direct_answers": ["cameras", "cameras", "cameras cellphones", "cameras", "cameras", "cameras", "camera", "cameras", "cameras", "cameras"], "difficult_direct_answer": false, "rationales": ["The people all have cameras and are taking photos.", "They are using camera.", "That would be the apparatus for photography."], "image": "val2014/COCO_val2014_000000080713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77864, "question_id": "CLJMfebNDqFvJCuz2wnVUW", "question": "What is the metal object in between the woman's legs?", "choices": ["cane", "poker", "bat", "racket"], "correct_choice_idx": 0, "direct_answers": ["cane", "cane", "cane", "walking stick", "crutch", "cane", "cane", "walking stick", "cane", "cane"], "difficult_direct_answer": false, "rationales": ["She has a walking cane used to help her keep her balance when walking", "The woman has it there to keep it from falling to the ground.", "This is a metal stick used to help people keep balance as they walk"], "image": "train2014/COCO_train2014_000000077864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357633, "question_id": "CMAsvK56aBvSghBpndrNEF", "question": "What do people use this machine for?", "choices": ["storing cheese", "cooking food", "typing letters", "mopping floors"], "correct_choice_idx": 2, "direct_answers": ["social media", "computing", "working", "email", "work", "work", "surfing internet", "tasks", "typing letters", "working tasks"], "difficult_direct_answer": true, "rationales": ["People use the laptop to type on.", "Laptops have keys for this purpose", "You could use it to type emails or type out any kind of paper."], "image": "val2014/COCO_val2014_000000357633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351827, "question_id": "CMBnoK96eC867gbqaxjHML", "question": "Why are they returning to shore?", "choices": ["too dark", "low fuel", "boat broken", "storm coming"], "correct_choice_idx": 3, "direct_answers": ["day over", "rain", "it's storming", "nightfall", "rain", "tired", "darkness", "its evening", "storm coming", "its dark"], "difficult_direct_answer": true, "rationales": ["By the clouds in the sky most likely a storm is coming.", "A storm is brewing based on the gray clouds.", "The rainbow and dark clouds indicate moisture in the air which could mean foul weather for boaters; it would be wise for them to return to shore for safety."], "image": "train2014/COCO_train2014_000000351827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318778, "question_id": "CMCd3L58Cicq5ViQYvDyX4", "question": "Why are the black tables setup in this location?", "choices": ["for convention", "for eating", "for sitting", "for decoration"], "correct_choice_idx": 0, "direct_answers": ["display product", "display", "sell", "market", "flea market", "vendors", "bazaar", "for convention", "artifacts", "display"], "difficult_direct_answer": true, "rationales": ["There are items set up on the tables and people are looking at them.", "They are set up to show off items.", "The tables are set up for a large convention."], "image": "val2014/COCO_val2014_000000318778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15897, "question_id": "CMDdfz28hnJJhRYifHCthx", "question": "What's the name of the dress the woman is wearing?", "choices": ["summer dress", "schoolgirl", "maid outfit", "wedding dress"], "correct_choice_idx": 2, "direct_answers": ["babydoll", "blouse", "lace", "maid", "babydoll", "black dress", "maid outfit", "maid outfit", "costume", "costume"], "difficult_direct_answer": false, "rationales": ["If you have ever seen a maid in a hotel, then the answer is clear.", "It's sometimes also referred to as a doll outfit.", "Traditionally these types of outfits are identified with someone who cleans houses."], "image": "train2014/COCO_train2014_000000015897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578990, "question_id": "CMkapAWSFpxhbZBoLRXn4W", "question": "What are the seated man and child riding on?", "choices": ["toboggan", "snowboard", "surfboard", "tube"], "correct_choice_idx": 0, "direct_answers": ["toboggan", "sled", "sled", "sled", "sled", "sled", "sled", "sled", "sled", "sled"], "difficult_direct_answer": false, "rationales": ["A sled or toboggan is used in the snow and on ice. the glides are needed to move in cold weather conditions.", "A man and child are sitting on a sled with metal rungs on the bottom.", "The people are on ice are on a board attached to two rails underneath. answer a is used in this climate in these types of activities and has this composition."], "image": "train2014/COCO_train2014_000000578990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341230, "question_id": "CNV4GKTqskhFwvUMh7R6du", "question": "To eat the items here what must be done with the outer peel?", "choices": ["lick it", "bite it", "shred it", "remove it"], "correct_choice_idx": 3, "direct_answers": ["removed", "removed", "remove it", "removed", "removed", "take off", "taken off", "removed", "orange", "remove"], "difficult_direct_answer": false, "rationales": ["You have to peel the skin off to eat oranges for the most part.", "Orange peels aren't edible.", "Oranges have a thick peel and it has to come off before the edible part can be consumed."], "image": "val2014/COCO_val2014_000000341230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547212, "question_id": "CNVN6CzNQfRyyvM4dhEiRb", "question": "What beverage does the woman enjoy?", "choices": ["beer", "coke", "iced tea", "dr. pepper"], "correct_choice_idx": 2, "direct_answers": ["tea", "mixed drink", "tea", "beer", "iced tea", "tea", "cocktail", "iced tea", "iced tea", "alcoholic"], "difficult_direct_answer": false, "rationales": ["The woman has an tea with lemon in her hands.", "The drink is the color of iced tea and the lemon is common in this drink.", "There is a light brown liquid with a lemon in it"], "image": "val2014/COCO_val2014_000000547212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149381, "question_id": "CNejVbmKZytSbEXBgGPFmR", "question": "What weather is greatest threat to this crop?", "choices": ["rain", "breeze", "heat", "freezing"], "correct_choice_idx": 3, "direct_answers": ["frost", "cold", "cold", "frost", "freezing", "winter", "freezing", "wintery", "freezing weather", "frost"], "difficult_direct_answer": false, "rationales": ["Because it inhabits the growth of the crop which requires warmth to grow.", "Oranges are grown in warm climates; exposure to cold weather can kill them.", "Though the other weather could hurt the crops, but it's absolute cold that kills them."], "image": "train2014/COCO_train2014_000000149381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217091, "question_id": "CNhCG92vsTB3WA6HBixzmC", "question": "Why has the bike been placed near the bench?", "choices": ["to repair", "to dry", "to stand", "to paint"], "correct_choice_idx": 2, "direct_answers": ["to stand", "prop it", "for support", "for storage", "tipping prevention", "resting", "support it", "rest stop", "keep upright", "remain upright"], "difficult_direct_answer": true, "rationales": ["The bike is leaning on the bench and maintaining a vertical orientation.", "It is so it isn't getting broken laying on the ground", "The bike is putto stand in the area for support."], "image": "train2014/COCO_train2014_000000217091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380280, "question_id": "CNhwdmABArwMrwTwdhGRTW", "question": "What staff member is responsible for pushing the suitcase carts?", "choices": ["bellhop", "manager", "janitor", "maid"], "correct_choice_idx": 0, "direct_answers": ["baggage handler", "bell boy", "bellhop", "bellhop", "bellhop", "bellhop", "porter", "bellboy", "hotel staff", "bellman"], "difficult_direct_answer": false, "rationales": ["The suitcases were put on the carts by bellhops that work at the hotel and assist with luggage.", "A bunch of luggage is in a hotel lobby and a uniformed employee is pushing more on carts. hotels often have an employee to help get luggage to customer's rooms.", "Bellhops push suitcase carts."], "image": "train2014/COCO_train2014_000000380280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555631, "question_id": "CNofPru4rHhNk6VHR6jP7X", "question": "What is the blue square used for?", "choices": ["visibility", "buoyancy", "block sunlight", "capturing wind"], "correct_choice_idx": 3, "direct_answers": ["sailing", "sail", "sail", "capturing wind", "for movement", "sail", "wind", "sail", "sail", "sail"], "difficult_direct_answer": false, "rationales": ["These are sails and are used to catch wind and move the boat.", "The square is to capture wind.", "The sail is moving the boat using the wind. there is no motor on the boat."], "image": "train2014/COCO_train2014_000000555631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298762, "question_id": "CPAASrm5mGPxgCgHh2dTnb", "question": "Who is in the greatest danger?", "choices": ["right kid", "right woman", "middle person", "coachman"], "correct_choice_idx": 2, "direct_answers": ["pedestrian", "front person", "man walking", "man walking", "horses", "pedestrian", "boot wearer", "pedestrian", "man", "middle person"], "difficult_direct_answer": false, "rationales": ["The person walking in front of the horse on the street could get hurt.", "The person in the middle could be hit by the horses.", "A man opposite of the horse carriage is walking straight in the middle of the pathway of horses."], "image": "val2014/COCO_val2014_000000298762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373666, "question_id": "CPAmyk6WBhhMBYacxBiUh9", "question": "What yellow item is absent?", "choices": ["pineapple", "orange", "banana", "lime"], "correct_choice_idx": 0, "direct_answers": ["apple", "yellow apple", "pepper", "corn", "lemon", "pineapple", "sponge", "pineapple", "corn", "pineapple"], "difficult_direct_answer": false, "rationales": ["There is banana, lime and orange.", "There are no pineapples present in the container.", "There are bananas. limes and oranges are not yellow."], "image": "train2014/COCO_train2014_000000373666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361885, "question_id": "CPihAFaxSoir6tKhHgWE2a", "question": "What is on the wall?", "choices": ["bat", "monkey", "poster", "ceiling fan"], "correct_choice_idx": 3, "direct_answers": ["picture", "mirror", "picture", "picture", "picture", "artwork", "ceiling fan", "fireplace", "mirror", "artwork"], "difficult_direct_answer": false, "rationales": ["Out of all the answers given the first one is most viable.", "The most prominent object on the wall is the framed artwork, which could very well be some kind of minimalist poster.", "A fan is hanging from the ceiling to keep it cooler in the house."], "image": "val2014/COCO_val2014_000000361885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154794, "question_id": "CQ8eRMzYHyt72tCXHGFU3X", "question": "What is the blue object used for?", "choices": ["making pennies", "getting change", "riding", "sight seeing"], "correct_choice_idx": 3, "direct_answers": ["drinking", "sightseeing", "seeing", "sight seeing", "emergency", "telescope", "water", "see distance", "seeing far", "seeing"], "difficult_direct_answer": true, "rationales": ["The blue object is a telescope for sight seeing.", "The object is of a design and is placed in a setting that would be consistent with answer a.", "The blue object is a telescope used to help view objects far away."], "image": "train2014/COCO_train2014_000000154794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376131, "question_id": "CQ9MVkoiJ7Jr6yqpT2QS2C", "question": "What type of room does this most closely resemble due to the items on the counter?", "choices": ["court room", "doctor's office", "bedroom", "law firm"], "correct_choice_idx": 1, "direct_answers": ["doctor's office", "doctor's office", "doctor's office", "doctor office", "doctor's office", "emergency", "exam room", "doctors office", "doctor's room", "doctors office"], "difficult_direct_answer": false, "rationales": ["The files are medical files from a doctor's office.", "There are medical supplies on the table.", "There is files on the table."], "image": "train2014/COCO_train2014_000000376131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302595, "question_id": "CQNDDADTEq2WDJM9Us7ZJ8", "question": "What company is sponsoring the speed board?", "choices": ["citizen", "geico", "ibm", "olympus"], "correct_choice_idx": 2, "direct_answers": ["ibm", "olympus", "ibm", "ibm", "cozen", "ibm", "ibm", "olympus", "ibm", "ibm"], "difficult_direct_answer": false, "rationales": ["Behind the tennis player is a mph sign that measures speed of ball and has the logo of brand.", "This is the logo displayed above \"serve speed mph\" on the board.", "There is a sign in the background with an empty display and a \"mph\" label which is where a speed would be displayed. the company also displayed on the board is option a."], "image": "train2014/COCO_train2014_000000302595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305833, "question_id": "CQamjpuXfFX7uKw9M4LNhM", "question": "How many watts does a bedside lamp use?", "choices": ["2.5", "5.5", "3.5", "1.5"], "correct_choice_idx": 3, "direct_answers": ["sixty watts", "sixty", "1.5", "60", "20", "varies", "two", "50-60 watts", "variable", "60"], "difficult_direct_answer": true, "rationales": ["A lamp for the bedroom uses a lower amount of watts because it doesn't need to be as bright.", "There are 1.5 watts.", "The lamp isn't all that bright."], "image": "val2014/COCO_val2014_000000305833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481874, "question_id": "CQpQia5a9RKhWKkqc739sR", "question": "They are taking this photo in what?", "choices": ["bus", "car", "train", "airplane"], "correct_choice_idx": 0, "direct_answers": ["car", "car", "mirror", "bus", "car mirror", "mirror", "bus", "city bus", "mirror", "bus"], "difficult_direct_answer": false, "rationales": ["The people are taking a photo while sitting in bus seats.", "It looks like they are sitting in bus seats closest to the rear view mirror.", "From the picture ,the chairs are divided to accommodate more people and is similar to that regular services."], "image": "train2014/COCO_train2014_000000481874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315403, "question_id": "CQqLFyPgacosUhCNsS4P4J", "question": "The people using the flip cell phones are taking pictures of which professional sport?", "choices": ["baseball", "football", "tennis", "golf"], "correct_choice_idx": 0, "direct_answers": ["baseball", "soccer", "soccer", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "soccer"], "difficult_direct_answer": false, "rationales": ["The people using the phones are near a baseball diamond.", "There is a baseball diamond in the background.", "This is obvious based on the line the umpire is walking on the field and the cut of the grass."], "image": "val2014/COCO_val2014_000000315403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19250, "question_id": "CQtYyB98m9pyZT4omm5rfm", "question": "What is the English translation of the French name for these?", "choices": ["little cakes", "little oven", "small squares", "mini bites"], "correct_choice_idx": 1, "direct_answers": ["small oven", "pastries", "pete fours", "petite fours", "small oven", "cake", "little oven", "cakes", "small oven", "pastries"], "difficult_direct_answer": false, "rationales": ["They are mini.", "These are tiny little mini bites of cakes.", "The food looks like little squares."], "image": "train2014/COCO_train2014_000000019250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127006, "question_id": "CRK8Aks7PpJPvaPzA4noii", "question": "What happened to this river made evident here?", "choices": ["flooded", "nothing", "receded", "polluted"], "correct_choice_idx": 0, "direct_answers": ["flooding", "flood", "flooded", "flooding", "flood", "flooded", "flooded", "it flooded", "flood", "flood"], "difficult_direct_answer": false, "rationales": ["The river flooded.", "The metal seats are on water.", "The water is up to the benches."], "image": "train2014/COCO_train2014_000000127006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63791, "question_id": "CRX8ypSRbTuhJkhZmnF9EZ", "question": "In which type area do players play soccer here?", "choices": ["farm", "park", "tundra", "city mall"], "correct_choice_idx": 1, "direct_answers": ["field", "field", "grass", "field", "grassy", "park", "park", "park", "field", "park"], "difficult_direct_answer": false, "rationales": ["The people are not in a rural area. the area is not a mall.", "You can play soccer virtually anywhere, but in this case you can tell by the setting as to where they are.", "They need a lot of room to play soccer so that would be the perfect place to go."], "image": "val2014/COCO_val2014_000000063791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74603, "question_id": "CRYjdB4Nmwq23bPdKxQZkF", "question": "What central type item brings these people together?", "choices": ["sports", "tv", "cell phones", "food"], "correct_choice_idx": 3, "direct_answers": ["food", "kitchen island", "food", "food", "kitchen island", "food", "kitchen island", "kitchen island", "food", "food"], "difficult_direct_answer": false, "rationales": ["There is lots of food on the table.", "There are bags of food around.", "There is a large table with many different types of edible items."], "image": "val2014/COCO_val2014_000000074603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169116, "question_id": "CRfUMq87tyTgVJYzX6boyW", "question": "How is the woman on the right feeling in this moment?", "choices": ["amused", "disturbed", "sad", "angry"], "correct_choice_idx": 0, "direct_answers": ["happy", "happy", "fun", "happy", "happy", "amused", "pretty happy", "happy", "happy", "amused"], "difficult_direct_answer": false, "rationales": ["The woman is very amused, and her face is squinched up in a way that shows just how delighted she actually is!.", "The woman on the right is laughing.", "The woman look happy."], "image": "train2014/COCO_train2014_000000169116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197323, "question_id": "CRyQVNmmAXZWHtVrKRQFot", "question": "Why is he smiling?", "choices": ["selling cake", "likes sweets", "make cake", "stole cake"], "correct_choice_idx": 1, "direct_answers": ["eating", "dessert", "likes dessert", "desserts", "hes hungry", "good cake", "eating dessert", "likes sweets", "loves dessert", "desserts"], "difficult_direct_answer": true, "rationales": ["There are two cakes in front of him. he is holding a fork ready to eat them.", "He is about to eat dessert.", "The individual has delicious cakes in front of them and is having a positive reaction to them."], "image": "train2014/COCO_train2014_000000197323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83002, "question_id": "CS7bhQbNytp2QzSyFSYTDg", "question": "For what gender was the bathroom designed for?", "choices": ["nonbinary", "women", "men", "genderqueer"], "correct_choice_idx": 1, "direct_answers": ["female", "females", "female", "women", "women", "women", "women", "women", "woman", "females"], "difficult_direct_answer": false, "rationales": ["Only women wear lipstick.", "Women typically wear lipstick, and have bathrooms with only stalls and no urinals.", "A bathroom has a sign mentioning checking lipstick. women wear lipstick."], "image": "train2014/COCO_train2014_000000083002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366714, "question_id": "CSgUEjkxjBDSDStEm5sH7c", "question": "Based on the truck stickers what type of people are being celebrated in this parade?", "choices": ["athletes", "military", "teachers", "first responders"], "correct_choice_idx": 1, "direct_answers": ["veterans", "fire responders", "american", "first responders", "firefighters", "military", "construction workers", "first responders", "freedom", "veterans"], "difficult_direct_answer": false, "rationales": ["A truck with flags all around and the word \"freedom\" on it is in the street.", "There are american flags everywhere.", "The stickers are logos for the navy and similar government institutions."], "image": "val2014/COCO_val2014_000000366714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423860, "question_id": "CSgmGFX6QtRuLtxUN8ezSH", "question": "What happens near the yellow sign?", "choices": ["speed trap", "check stop", "pedestrian crossings", "speed up"], "correct_choice_idx": 2, "direct_answers": ["crossing", "road crossing", "pedestrian crossing", "pedestrians walking", "pedestrian crosswalk", "pedestrian closing", "people cross", "pedestrian crossing", "pedestrian crossings", "pedestrian crossing"], "difficult_direct_answer": false, "rationales": ["A yellow sign with a person walking is shown at a street. crosswalks are marked.", "The yellow sign shows a person walking. this indicates that there is an area ahead that allows people to get from one side of the street to the other.", "It has a person walking on the sign"], "image": "train2014/COCO_train2014_000000423860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396519, "question_id": "CSjgux7dNwPpkwNGJGs94e", "question": "What sport are the boys playing?", "choices": ["ultimate frisbee", "disc golf", "lacrosse", "soccer"], "correct_choice_idx": 0, "direct_answers": ["frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "ultimate frisbee", "frisbee", "ultimate frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The equipment is visible in the players hand and based on the uniforms and markings on the field, answer a is consistent.", "They are holding a disc and playing competitively on a field.", "The boys are playing with a frisbee."], "image": "val2014/COCO_val2014_000000396519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548363, "question_id": "CTBi635EfPj7Xfp9hyC8MY", "question": "What type terrain is nearby?", "choices": ["high hills", "mesas", "flat", "mountainous"], "correct_choice_idx": 2, "direct_answers": ["tarmac", "open", "flatlands", "water", "flat area", "levelled", "water", "flat", "mountains", "ocean"], "difficult_direct_answer": true, "rationales": ["An airplane is at an airport with flat land all around.", "The terrain is flat.", "While there is some elevation in the background the ground is mostly flat behind the planes."], "image": "val2014/COCO_val2014_000000548363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458057, "question_id": "CTVBcEKxzmV4Vnq6UsoezY", "question": "Where might you relieve yourself here?", "choices": ["behind tree", "under table", "behind stairs", "restroom"], "correct_choice_idx": 3, "direct_answers": ["bathroom", "restroom", "bathroom", "food", "restroom", "back restroom", "restroom", "men's restroom", "restroom", "restroom"], "difficult_direct_answer": false, "rationales": ["Most restaurants have restrooms.", "They are in an eating establishment", "The far right of the picture contains signage for this type of area, which is where people usually relieve themselves."], "image": "train2014/COCO_train2014_000000458057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427462, "question_id": "CTWsCP3Yeh9HJQ2nYdeGhW", "question": "What profession did the founder of this eatery have before he retired?", "choices": ["acupuncturist", "teacher", "dentist", "disc jockey"], "correct_choice_idx": 2, "direct_answers": ["dentist", "dentist", "deli worker", "dentist", "dentist", "dentist", "ceo", "dentist", "dentist", "chef"], "difficult_direct_answer": false, "rationales": ["He was a dentist", "The founder of mcalister's deli was a dentist.", "This eatery is the deli of a former dentist."], "image": "train2014/COCO_train2014_000000427462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398005, "question_id": "CTX3yA6Q55KCt9ihjJgkCY", "question": "How is the zebra decorated?", "choices": ["white stripes", "black stripes", "all black", "all white"], "correct_choice_idx": 0, "direct_answers": ["striped", "stripes", "white stripes", "striped", "stripes", "stripes", "striped", "striped", "stripes", "striped"], "difficult_direct_answer": false, "rationales": ["Zebras are generally known to have white coats and black stripes. decorations would be something added to a base which in this case would be the base coat.", "They have black and white stripes on them.", "Traditionally these animals have these stripes and colors."], "image": "val2014/COCO_val2014_000000398005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453567, "question_id": "CTah44fY62R9VSnMifoa5d", "question": "Why is the man bending down while on the surfboard?", "choices": ["balance", "grabbing", "composure", "style"], "correct_choice_idx": 0, "direct_answers": ["balance", "balance", "catch wave", "balancing", "gravity", "balance", "main balance", "balance", "balance", "surfing"], "difficult_direct_answer": false, "rationales": ["The man is balancing.", "The man's position helps him stay stable on the surfboard.", "Surfers have to stay up on a surfboard. a crouching position can help with not falling off."], "image": "train2014/COCO_train2014_000000453567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456602, "question_id": "CUEDb4rzwV2fYM2KwTKNmZ", "question": "Why are these people on the backs of elephants?", "choices": ["taming them", "confused", "stealing them", "transportation"], "correct_choice_idx": 3, "direct_answers": ["travelling", "traverse water", "cross river", "for fun", "avoid water", "riding", "transportation", "keeping dry", "riding", "caravan"], "difficult_direct_answer": true, "rationales": ["They are riding the elephants on a tour.", "The people need transportation.", "The people are riding on the elephants through the water."], "image": "train2014/COCO_train2014_000000456602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521533, "question_id": "CUeqhztPP4iBbVJAnF9yHU", "question": "What hides this ladies mouth?", "choices": ["false teeth", "hat", "wild animal", "teddy bear"], "correct_choice_idx": 3, "direct_answers": ["object", "hat", "object", "teddy bear", "doll head", "food", "child's head", "toy", "doll", "stuffed animal"], "difficult_direct_answer": true, "rationales": ["She is holding a stuffed animal against her own mouth. this is the most common kind of stuffed animal.", "The woman is holding onto a teddy bear and its covering the bottom half of her face.", "The lady is holding up something that is made out of fabric and is in the shape of a stuffed animal."], "image": "train2014/COCO_train2014_000000521533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92627, "question_id": "CUh58DgvgJRyJCGQhUTXjA", "question": "Judging by the batters expression how hard is she swinging the bat?", "choices": ["very soft", "very hard", "soft", "somewhat hard"], "correct_choice_idx": 1, "direct_answers": ["hard", "very hard", "very hard", "hard", "hard", "very hard", "very", "full attention", "very hard", "hard"], "difficult_direct_answer": false, "rationales": ["She has a look of concentration on her face and is trying to get the ball to go as far as possible.", "The batter is using all her might.", "The person is swinging hard."], "image": "train2014/COCO_train2014_000000092627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320445, "question_id": "CUjMcQGvoKdjLq6AyxaXjT", "question": "What is the sheep breed that produces the best type of wool?", "choices": ["jacob", "dorper", "merino", "suffolk"], "correct_choice_idx": 2, "direct_answers": ["merino", "white kind", "merona", "merino", "sheep", "merino", "merino", "merino", "merino", "wooly"], "difficult_direct_answer": false, "rationales": ["I did an internet search to determine the type of sheep considered the producers of the best wool.", "Merino produce the best type.", "You will have to google the answer for this."], "image": "train2014/COCO_train2014_000000320445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195862, "question_id": "CUpDTidSYv6EYbuumbKoco", "question": "In what setting is the girl atop the horse?", "choices": ["ranch", "skating rink", "parking lot", "mall"], "correct_choice_idx": 0, "direct_answers": ["ranch", "child", "ranch", "riding horse", "fenced-in", "lessons", "lessons", "learning", "rural", "farm"], "difficult_direct_answer": false, "rationales": ["Horses are usually kept at a ranch.", "The girl is on a ranch as there is a field in the background.", "The setting is a ranch."], "image": "val2014/COCO_val2014_000000195862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110811, "question_id": "CUyYHjCvBHYnB6c2KXwMbD", "question": "What has to be done at some point in order for the pictured food to be produced?", "choices": ["boil shrimp", "peel potatoes", "kill animal", "cut bananas"], "correct_choice_idx": 2, "direct_answers": ["heating", "bake", "baking", "kill animal", "bake", "baking", "cook", "cooked", "baked", "bake"], "difficult_direct_answer": false, "rationales": ["The ingredients in the pizza comes from animals.", "Pepperoni is made from pork", "There is meat on the pizza. meat comes from animals."], "image": "train2014/COCO_train2014_000000110811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97502, "question_id": "CV5nXkvY7K7eapeMvLV7Dv", "question": "How is the visible item being held by the person?", "choices": ["head", "umbrella", "hands", "by toes"], "correct_choice_idx": 3, "direct_answers": ["over them", "over head", "feet", "around body", "net", "hand", "holding", "with toes", "between toes", "by toes"], "difficult_direct_answer": true, "rationales": ["He is holding a card.", "In looking at this man's entire body, there is only one part which is holding an item. this would be his toes.", "The only thing the person is holding anywhere is on his foot"], "image": "val2014/COCO_val2014_000000097502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314179, "question_id": "CWN97tY7iyd2gUfKSZ6tZh", "question": "What is the shelf to the left of the doorway used to store?", "choices": ["books", "notepads", "spices", "shoes"], "correct_choice_idx": 3, "direct_answers": ["shoes", "shoes", "items", "shoes", "shoes", "items", "items", "shoes", "shoes", "shoes"], "difficult_direct_answer": false, "rationales": ["There are shoes on the shelf.", "Most of shoes are seen in the area.", "The objects on the shelf are clearly visible and one can infer the intended use based on their presence."], "image": "train2014/COCO_train2014_000000314179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92197, "question_id": "CWXosK32LoC2pF7pH6aydX", "question": "Why is the black surface of the ramp scratched?", "choices": ["skateboards", "running", "erosion", "dirt"], "correct_choice_idx": 0, "direct_answers": ["skateboard wheels", "from wheels", "skateboard wheels", "wheels", "skateboard abrasion", "wheels", "being used", "wheels", "skateboards", "cause skateboards"], "difficult_direct_answer": false, "rationales": ["The deck and the wheels scratch it as the riders perform tricks", "The wheels and decks hit the ramp", "The wheels dig into the surface when they land."], "image": "train2014/COCO_train2014_000000092197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382640, "question_id": "CWbUFULTjeGqNdp9MqQinJ", "question": "What is prohibited in the shade?", "choices": ["crossing", "driving", "parking/standing", "speeding"], "correct_choice_idx": 2, "direct_answers": ["parking", "parking", "parking", "no parking", "standing", "parking", "parking", "parking", "parking", "parking/standing"], "difficult_direct_answer": false, "rationales": ["There is a no parking sign.", "There is no parking or standing allowed.", "Signs designating prohibited activity on the street in regards to vehicles being left is on the sidewalk."], "image": "val2014/COCO_val2014_000000382640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382638, "question_id": "CWhADG7oDwSQ7t7XdhkDTa", "question": "What material is the hat worn by the man?", "choices": ["plastic", "metal", "paper", "nylon"], "correct_choice_idx": 2, "direct_answers": ["paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper"], "difficult_direct_answer": false, "rationales": ["The birthday hats or party hats are made out of cardstock.", "Most party hats traditionally are made of paper.", "The material is paper."], "image": "train2014/COCO_train2014_000000382638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203747, "question_id": "CWsBj7eoqPQWtnYwMKzxjR", "question": "Who founded the religion mentioned here?", "choices": ["hubbard", "smith", "eddy", "wesley"], "correct_choice_idx": 0, "direct_answers": ["hubbard", "hubbard", "scientist", "hubbard", "scientology", "ron hubbard", "no idea", "ron hubbard", "hubbard", "aliens"], "difficult_direct_answer": false, "rationales": ["L. ron hubbard started scientology, which is barely a religion.", "The religion is scientology, not christian science, mormonism, or methodism.", "The religion mentioned is scientology which is known to have been founded by answer a."], "image": "val2014/COCO_val2014_000000203747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468548, "question_id": "CX4N4aVLLkppJy84szBpwF", "question": "What is happening to the bananas in water?", "choices": ["storage", "injected", "fertilizing", "washing"], "correct_choice_idx": 3, "direct_answers": ["cleaning", "washing", "washed", "washing", "being misted", "washing", "being cleaned", "boiling", "soaking", "cleaning"], "difficult_direct_answer": false, "rationales": ["Soaking in water gets things clean.", "When anything is in water, it will get washed.", "The bananas are in water."], "image": "val2014/COCO_val2014_000000468548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487630, "question_id": "CXVwLuDLNmcJ4Gu9QrKFYX", "question": "What is the man using to hit the ball?", "choices": ["hand", "paddle", "bat", "racquet"], "correct_choice_idx": 3, "direct_answers": ["racket", "racket", "racquet", "tennis racket", "tennis racket", "tennis racquet", "tennis racket", "racket", "tennis racket", "tennis racket"], "difficult_direct_answer": false, "rationales": ["The man is using a tennis racquet to swing and hit the ball.", "Tennis balls are hit with this type of item.", "The man has a tennis racquet in his hand and is about to hit the ball with it."], "image": "val2014/COCO_val2014_000000487630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78447, "question_id": "CXebKtStk85KvaRUKgTmKJ", "question": "What is the elephant doing in the photo?", "choices": ["smiling", "complaining", "eating", "yawning"], "correct_choice_idx": 2, "direct_answers": ["eating", "walking", "opening mouth", "roaring", "eating", "eating", "yawning", "opening mouth", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The elephant is eating.", "The elephant has its mouth open for a treat.", "The elephant is eating with its trunk."], "image": "train2014/COCO_train2014_000000078447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225835, "question_id": "CXjbUqKfSyFkFn6C6Vc9Yi", "question": "What prevents the food from making contact with the silver platter?", "choices": ["fork", "wrapping", "chocolate", "air"], "correct_choice_idx": 1, "direct_answers": ["cup", "paper", "paper", "paper liners", "paper", "tongs", "paper", "wrapping", "paper", "cupcake liner"], "difficult_direct_answer": false, "rationales": ["A paper is below each piece of candy on a tray.", "Desserts are on white paper liners on a silver platter.", "There is a piece of fluted paper, similar to a cupcake liner, that the food was placed in. therefore, the paper and not the food touches the silver platter."], "image": "train2014/COCO_train2014_000000225835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454174, "question_id": "CY6UzSjNXdnUMfUkvTwdxk", "question": "Where are they likely headed to?", "choices": ["garage sale", "car show", "sales shop", "junkyard"], "correct_choice_idx": 3, "direct_answers": ["junk yard", "junk yard", "automotive shop", "body shop", "junkyard", "garage", "junk yard", "junk yard", "town", "junkyard"], "difficult_direct_answer": false, "rationales": ["This is obvious based on the use of a tow truck. the other options don't fit as well.", "It is a tow truck, towing a truck that looks like it has seen better days.", "It is a towtruck towing a broken vehicle"], "image": "train2014/COCO_train2014_000000454174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60599, "question_id": "CYFPcnK7pVd6siku5VnfiT", "question": "What in the foreground is most often used as a food source?", "choices": ["brown animal", "black animal", "fence material", "ground material"], "correct_choice_idx": 0, "direct_answers": ["cow", "hay", "cow", "cow", "grain", "cow", "hay", "brown animal", "cow", "cow"], "difficult_direct_answer": false, "rationales": ["The brown animal is used.", "We eat cows", "The cow is eaten for its nutritious meat."], "image": "train2014/COCO_train2014_000000060599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566188, "question_id": "CYKL8c27ExCYo3zrGaeBd7", "question": "What is the woman doing to her hair?", "choices": ["drying it", "cutting it", "shaving it", "dying it"], "correct_choice_idx": 0, "direct_answers": ["drying", "drying", "drying it", "drying", "drying", "blow drying", "drying it", "drying", "shaving it", "blow drying"], "difficult_direct_answer": false, "rationales": ["This is hard to tell. my first impression says a, but it could also be c.", "The woman is using a hair dryer to dry her hair.", "The woman appears to have an object in right hand resembling a blow dryer. since she is pointing it to her hair it is most likely that whe is drying it."], "image": "train2014/COCO_train2014_000000566188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552001, "question_id": "CYMV2MAcewJAeJGBY2hjDR", "question": "What kind of pitch does the pitcher hope to achieve?", "choices": ["home run", "ball", "base hit", "strike"], "correct_choice_idx": 3, "direct_answers": ["strikeout", "strike", "strike", "strike", "curve ball", "strike", "fast ball", "fastball", "strike", "strike"], "difficult_direct_answer": false, "rationales": ["Whenever a pitcher pitches a ball their best outcome is a strike so that is what this person is hoping to do.", "The term is often used for baseball.", "Because he seems to be gathering all his strength to throw the ball."], "image": "train2014/COCO_train2014_000000552001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498971, "question_id": "CYfiS28LvxA7eev4fLwqPq", "question": "What is a group of these items called during wartime?", "choices": ["army", "clowder", "colony", "fleet"], "correct_choice_idx": 3, "direct_answers": ["fleet", "fleet", "flotilla", "war boats", "boats", "vessels", "armada", "armada", "fleet", "ships"], "difficult_direct_answer": false, "rationales": ["The word traces back to the old english \"fleotan,\" which meant to float or to swim.", "Formations of ships that can provide protection for each other and do military maneuvers.", "This is a naval term used mostly by militaries to describe a number of similar boats."], "image": "val2014/COCO_val2014_000000498971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552849, "question_id": "CYkd9Lkcm3JuAwtLXzsUu2", "question": "What part of the market is located here?", "choices": ["fruit stand", "custom packaging", "home wares", "butcher"], "correct_choice_idx": 0, "direct_answers": ["fruit", "produce", "produce", "produce", "produce", "fruit stand", "produce", "produce", "fruit section", "indoor"], "difficult_direct_answer": false, "rationales": ["There are melons, bananas and other produce available for sale.", "The part shown has plenty of assorted fruit.", "Watermelons, bananas, and melons are all produce that would be found with similar items in their food group."], "image": "val2014/COCO_val2014_000000552849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327694, "question_id": "CYuVxPMz5MyqpNKy4677Da", "question": "What is the man helping the young child with?", "choices": ["swimming", "dressing", "brushing teeth", "combing hair"], "correct_choice_idx": 2, "direct_answers": ["brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "toothbrushing", "brushing teeth", "toothbrushing", "brushing teeth", "toothbrushing"], "difficult_direct_answer": false, "rationales": ["The man is holding a kids tooth brush and they are in front of a sink.", "He is holding a tooth brush in the child's mouth.", "The man is holding a toothbrush and helping the child brush his teeth."], "image": "train2014/COCO_train2014_000000327694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425540, "question_id": "CYx3MkR9twgTTJiAzGJisd", "question": "Which vehicle is experiencing problem?", "choices": ["white truck", "white car", "grey car", "red car"], "correct_choice_idx": 2, "direct_answers": ["car", "silver car", "grey car", "gray", "car", "left car", "left", "parked left", "left", "left car"], "difficult_direct_answer": false, "rationales": ["The white and red vehicles are driving normally. the other four-wheeled vehicle is stopped on the side of the road.", "The car is parked on the side of the road.", "The car sitting on the left is grey."], "image": "train2014/COCO_train2014_000000425540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173161, "question_id": "CZN7zaMyMbMVHrC64wehxR", "question": "Where were umbrellas most likely invented?", "choices": ["france", "italy", "japan", "china"], "correct_choice_idx": 3, "direct_answers": ["rainy area", "china", "uk", "chinese", "china", "england", "america", "warehouse", "england", "n/a"], "difficult_direct_answer": false, "rationales": ["The umbrellas are from china.", "Umbrellas likely came from ancient china.", "Umbrellas were a popular accessory in china to shield from the sun."], "image": "val2014/COCO_val2014_000000173161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179408, "question_id": "CZUpwJbsesEXZLtVgQ8F2K", "question": "People are commuting on this road during which time of the year?", "choices": ["fall", "winter", "summer", "spring"], "correct_choice_idx": 3, "direct_answers": ["autumn", "spring", "spring", "winter", "fall", "spring", "morning", "spring", "morning", "christmas"], "difficult_direct_answer": false, "rationales": ["People are commuting in the spring since the tree leaves are green.", "It's springtime.", "There are new leaves on the trees."], "image": "train2014/COCO_train2014_000000179408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421103, "question_id": "CZneqMZYhPjpYrCdrq7pjy", "question": "What is next to the horse?", "choices": ["napkin", "tree", "coffee", "toad"], "correct_choice_idx": 1, "direct_answers": ["tree", "fence", "tree", "tree", "tree", "tree", "fence", "tree", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["The horse is next to a tree.", "It has a thick trunk and branches with leaves.", "There is a tree by the horse."], "image": "train2014/COCO_train2014_000000421103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404933, "question_id": "Ca5oqf2oaVdjr4gdqLgkF5", "question": "What does this lady wish for weather wise?", "choices": ["hot sun", "clear skies", "rain", "snow"], "correct_choice_idx": 3, "direct_answers": ["snow", "snow", "clear weather", "sunny", "snow", "snow", "more snow", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The woman is holding a snowboard so she is likely wanting to use it.", "The woman wants snow.", "A girl is holding a snowboard. snow is needed to snowboard."], "image": "train2014/COCO_train2014_000000404933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501917, "question_id": "Caa8LzRtmbpXQEzLVVqgA2", "question": "What is required for this activity?", "choices": ["wind", "ice", "water", "snow"], "correct_choice_idx": 2, "direct_answers": ["surfboard", "balance", "surfboard", "balance", "water", "skate board", "paddle", "paddle board", "water", "water"], "difficult_direct_answer": false, "rationales": ["The people are paddle boarding, not snowboarding, skating, or flying kites.", "The people are using paddles to push themselves through the ocean.", "They are paddle boarding on the ocean."], "image": "train2014/COCO_train2014_000000501917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275976, "question_id": "CahfeTyWpDRqSqK8vPBTUa", "question": "What body of water is the boat using?", "choices": ["swamp", "river", "creek", "ocean"], "correct_choice_idx": 1, "direct_answers": ["lake", "river", "river", "river", "river", "river", "river", "river", "river", "lake"], "difficult_direct_answer": false, "rationales": ["The boat is using a river.", "This is a river they are on.", "The body of water is too narrow to be an ocean or swamp and too wide to be a creek."], "image": "train2014/COCO_train2014_000000275976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216320, "question_id": "CaiFAEro8qtyogKyRAkoL3", "question": "What kind of bathroom is this?", "choices": ["school", "home", "hotel", "restaurant"], "correct_choice_idx": 2, "direct_answers": ["luxury", "hotel", "luxurious", "hotel", "spa", "hotel", "marble", "hotel", "hotel bathroom", "hotel bathroom"], "difficult_direct_answer": false, "rationales": ["There are no personal items in the bathroom. it has a convenience robe and towels all in the same color.", "The fresh white linens hung up suggest that this is a professional hotel.", "The white towels, robe and small shampoo bottles indicate that."], "image": "val2014/COCO_val2014_000000216320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563605, "question_id": "CamDr2qzsfTLpTdC8DCTnr", "question": "From what materials is the wall made?", "choices": ["blocks", "wood", "tile", "bricks"], "correct_choice_idx": 0, "direct_answers": ["stone", "blocks", "stone", "stone", "stone", "stone", "concrete", "marble", "brick", "stone"], "difficult_direct_answer": false, "rationales": ["The wall has several sections that look like they are stacked up like bricks.", "The materials are blocks.", "You can see where the pieces are put in by the lines of each part."], "image": "val2014/COCO_val2014_000000563605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117674, "question_id": "CbEjXeLHLRhEJFCehyyBUZ", "question": "What does the person standing here wait for?", "choices": ["hydrant", "walk light", "stop light", "text signal"], "correct_choice_idx": 1, "direct_answers": ["crossing road", "to cross", "fire hydrant", "pedestrian light", "car", "cross street", "ride", "crossing light", "walk light", "cross street"], "difficult_direct_answer": true, "rationales": ["The person wants to walk but needs the light signal to do so.", "They are on foot and want to cross the street safely.", "The person is standing at the edge of a street based on the vehicle and pavement in front of her. if following the known pedestrian rules, they would be waiting for answer a if crossing safely."], "image": "val2014/COCO_val2014_000000117674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444627, "question_id": "CbX8f7NgZry9Vqv2UEnwAs", "question": "Which player played for the team the man that is eating is a fan of?", "choices": ["david wright", "dale murphy", "hank aaron", "lou gehrig"], "correct_choice_idx": 3, "direct_answers": ["frank thomas", "babe ruth", "babe ruth", "babe ruth", "mickey mantle", "lou gehrig", "not clear", "new york", "new york", "aaron judge"], "difficult_direct_answer": false, "rationales": ["The man is wearing a new york yankees hat which was the team of lou gehrig.", "The team is the yankees and lou gehrig was a famous player for that team.", "The player is gehrig."], "image": "train2014/COCO_train2014_000000444627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14795, "question_id": "CbYYaYxjedDGtYZUopgv2G", "question": "What is the guy on the right doing?", "choices": ["crying", "stretching", "proposing", "praying"], "correct_choice_idx": 1, "direct_answers": ["playing soccer", "cheering", "stretching", "cheering", "calling coach", "cheering", "giving thanks", "stretching", "celebrating", "raising hands"], "difficult_direct_answer": false, "rationales": ["The man is seen raising his hand showing as if he is stretching.", "He has one knee on the ground and extending the other leg behind him", "The guy is stretching."], "image": "train2014/COCO_train2014_000000014795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68435, "question_id": "Cbczno6cRGxtLGtSgkGTnh", "question": "What is the relationship between these people?", "choices": ["siblings", "business partners", "acquaintances", "couple"], "correct_choice_idx": 3, "direct_answers": ["couple", "romantic", "dating", "married", "husband wife", "dance partners", "dates", "married", "married couple", "boyfriend girlfriend"], "difficult_direct_answer": true, "rationales": ["The pair are both wearing red roses and matching silver and black bows. they are also holding on to each other as if they are in a relationship.", "The man and woman are presumably together in a relationship or are at least each other's date because they dressed to coordinate. their flowers match and the print on her hat matches his tie. also, they are posed in a more intimate position.", "The people are a couple."], "image": "val2014/COCO_val2014_000000068435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6579, "question_id": "Cbd3FZne9BBiZDFG9ZkkTq", "question": "What emotion is the boy showing?", "choices": ["happy", "joyful", "excited", "scared"], "correct_choice_idx": 3, "direct_answers": ["sadness", "distressed", "sadness", "sad", "sad", "sadness", "fear", "sadness", "grief", "scared"], "difficult_direct_answer": false, "rationales": ["He has a very worried face that he seems to be frightened.", "The boy looks like he is afraid of the woman and not any positive emotion.", "The boy looks like he's about to cry."], "image": "val2014/COCO_val2014_000000006579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89027, "question_id": "Cbx7gpr78Axyz2Y4ttMuPV", "question": "What do the children want to do with the ball?", "choices": ["dribble it", "hide it", "nothing", "kick it"], "correct_choice_idx": 3, "direct_answers": ["kick it", "kick", "kick", "kick", "playing", "kick", "kick it", "kick it", "kick it", "kick"], "difficult_direct_answer": false, "rationales": ["Soccer is played mainly by kicking the ball", "The yellow ball is a soccer ball it is used in the game soccer the object of this game is to use your feet to propel the ball into the opposing teams net.", "The children are playing soccer. you play soccer by kicking the ball with your foot"], "image": "val2014/COCO_val2014_000000089027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146841, "question_id": "CcrX3juoLEhRNidHEunDXG", "question": "What type of fruit is shown?", "choices": ["banana", "peach", "watermelon", "strawberry"], "correct_choice_idx": 0, "direct_answers": ["bananas", "bananas", "banana", "banana", "tropical", "banana", "banana", "banana", "banana", "bananas"], "difficult_direct_answer": false, "rationales": ["Yellow fruit can be seen among other fruits.", "The fruit shown is yellow and curved. they are in a bunch connected by a stem.", "They are yellow and in the shape of that fruit."], "image": "train2014/COCO_train2014_000000146841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14628, "question_id": "Ccyk3oB59TzKFTf5FNWUqM", "question": "What is the area decorated with stone used to contain?", "choices": ["food", "pets", "fire", "books"], "correct_choice_idx": 2, "direct_answers": ["fireplace", "fire place", "fireplace", "fire", "fireplace", "fire", "fireplace", "fire", "fire", "chimney"], "difficult_direct_answer": false, "rationales": ["The fireplace is made of stone.", "The fireplace is surrounded with stone.", "The brick is to contain heat and the stacked wood is for burning."], "image": "train2014/COCO_train2014_000000014628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219754, "question_id": "CcyuePVHXkeqEFyK8FaB5f", "question": "What climate is this plane parked in?", "choices": ["tropical", "steppe", "tundra", "freezing"], "correct_choice_idx": 0, "direct_answers": ["tropical", "cloudy", "tropical", "rain", "grassland", "warm", "cloudy", "tropical", "temperate", "warm"], "difficult_direct_answer": false, "rationales": ["It looks like a bright, sunny day. warm and comfortable.", "There are palm trees in the background, which only grow in warm locations.", "Palm trees grow in the tropics."], "image": "val2014/COCO_val2014_000000219754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387901, "question_id": "CdAv9Pj5TzirVZpQXeSzpv", "question": "What kind of terrain is best for this activity?", "choices": ["downhill", "rocky", "flat", "uphill"], "correct_choice_idx": 2, "direct_answers": ["snow", "mountainous", "snow", "slopes", "snow", "flat", "snowy", "flat", "mountain", "flat"], "difficult_direct_answer": false, "rationales": ["The terrain is flat.", "Flat terrain is best for walking.", "If they're trying to walk around, a flat surface is best."], "image": "train2014/COCO_train2014_000000387901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419860, "question_id": "CdSj5inG8Nv4iMiHnriZXQ", "question": "What type of table is the pizza on?", "choices": ["mosaic top", "kitchen table", "picnic table", "wooden top"], "correct_choice_idx": 0, "direct_answers": ["mosaic top", "mosaic table", "mosaic table", "mosaic", "mosaic tiled", "mosaic", "tiled", "mosaic tile", "mosaic", "tile top"], "difficult_direct_answer": false, "rationales": ["The pizza is on a mosaic table since little pieces of glass were put together to create it.", "The pizza is on a mosaic table.", "The table top consists of rectangular tiles. it is not made out of wood."], "image": "val2014/COCO_val2014_000000419860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84120, "question_id": "CdVadqBVKBYDvKavtZuNNP", "question": "In what style were the eggs cooked?", "choices": ["side broiled", "scrambled", "hard boiled", "swapped"], "correct_choice_idx": 2, "direct_answers": ["hard boiled", "hard boiled", "hardboiled", "boil", "hard boiled", "hard boiled", "hard boiled", "hard boil", "boiled", "boiled"], "difficult_direct_answer": false, "rationales": ["The eggs were boiled.", "They are fully cooked and have the shape of the shell", "Whole eggs are seen cut in half with a solid center."], "image": "train2014/COCO_train2014_000000084120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148176, "question_id": "CdZ7SEPveiNoweKvqVUAJX", "question": "What is the boy designing?", "choices": ["lady bug", "spider", "bee", "cricket"], "correct_choice_idx": 0, "direct_answers": ["cake", "cake", "cake", "cake", "cake", "cake", "bug", "lady bug", "cake", "ladybug"], "difficult_direct_answer": false, "rationales": ["The design has the red color and black dots that are seen on lady bugs.", "The cake is a lady bug. it has a red body and black dots.", "The boy is designing a ladybug."], "image": "train2014/COCO_train2014_000000148176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72715, "question_id": "CdafkvFbRqrQSuHaF5Purb", "question": "What is the wooden item above the two right drawers and below the countertop called?", "choices": ["backsplash", "spice rack", "cutting board", "potholder"], "correct_choice_idx": 2, "direct_answers": ["secret drawer", "vase", "cabinet", "cutting board", "vase", "cabinets", "cupboard", "cabinet", "cutting board", "vase"], "difficult_direct_answer": false, "rationales": ["It's built into the cabinet.", "A cutting board is up there.", "Many kitchens have inserted cutting boards that people can use and then slide back in after use."], "image": "train2014/COCO_train2014_000000072715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30413, "question_id": "Cdb2PDgLMyuHVepHCienfo", "question": "What are the people waiting for?", "choices": ["boarding train", "taxi cab", "airport bus", "parade"], "correct_choice_idx": 0, "direct_answers": ["train", "boarding train", "train", "train", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["They are waiting inside a station that has tracks.", "The people are standing on a platform of a train station waiting for a train.", "They are waiting to get on a train when it is time to load."], "image": "val2014/COCO_val2014_000000030413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122945, "question_id": "CdvsYEFpun7eEYnTFHk7Py", "question": "What is under her right arm?", "choices": ["body board", "surf board", "boogie board", "skate board"], "correct_choice_idx": 1, "direct_answers": ["surfboard", "surfboard", "surfboard", "surfboard", "surfboard", "surfboard", "surfboard", "surf board", "surfboard", "surfboard"], "difficult_direct_answer": false, "rationales": ["It is plain to see what she is holding ad the setting as well.", "You can tell by the setting she is in and the shape, what she is holding.", "The woman is holding a surf board underneath her right arm."], "image": "train2014/COCO_train2014_000000122945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123247, "question_id": "CeBLfnJ3hzkvrpuqnF2oKC", "question": "What position is number forty three playing?", "choices": ["outfield", "catcher", "pitcher", "first base"], "correct_choice_idx": 2, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["The man is pitching the ball.", "He is on the mound where the player who throws to the batter stands", "The baseball player is holding a ball and throwing a ball. he is on the mound."], "image": "train2014/COCO_train2014_000000123247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304958, "question_id": "CeCqnZs8g3Kj9dXpug3Spt", "question": "Where does the cup come from?", "choices": ["peet's", "coffee bean", "roastery", "starbucks"], "correct_choice_idx": 3, "direct_answers": ["starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks"], "difficult_direct_answer": false, "rationales": ["The logo of this coffee shop is seen on the cup.", "The cup comes from the starbucks logo.", "The logo on the side of the cup indicates where it came from."], "image": "train2014/COCO_train2014_000000304958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460962, "question_id": "CePoZ4354vd4CXpEKtTCyv", "question": "What do the sharp things on top of the rectangular tall structure prevent?", "choices": ["rainbow", "pigeons roosting", "space signals", "glare"], "correct_choice_idx": 1, "direct_answers": ["miscommunication errors", "pigeons roosting", "lightning", "birds perching", "lightning", "lightning strike", "lightning strikes", "entry", "lightning strike", "accidents"], "difficult_direct_answer": false, "rationales": ["They would be pigeons roosting on top of the structure.", "Sharp spines on roofs can be used to drive away birds who would otherwise perch atop the roof.", "Keeps the birds from sitting there."], "image": "val2014/COCO_val2014_000000460962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259555, "question_id": "CeRq4hvm5wBgdexoo7pvdJ", "question": "What is she doing?", "choices": ["talkin", "checking facebook", "ordering lunch", "texting friend"], "correct_choice_idx": 3, "direct_answers": ["texting", "texting friend", "mobile checking", "using phone", "reading messages", "using phone", "texting", "looking phone", "chatting", "texting"], "difficult_direct_answer": false, "rationales": ["Most people use phones for texting or talking.", "The person has a phone on her hand.", "She has a phone in her hand and this is before higher end phones. she isn't talking on it."], "image": "val2014/COCO_val2014_000000259555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318638, "question_id": "CeS5NHV7RXx58MbGan8Mhr", "question": "What is the topping shown?", "choices": ["mustard", "pepperoni", "sausage", "bell pepper"], "correct_choice_idx": 3, "direct_answers": ["bell pepper", "sausage", "sausage", "sausage", "cheese", "sausage", "meat", "sausage", "meat veggies", "veggies meat"], "difficult_direct_answer": false, "rationales": ["This is a common pizza topping. the pepper shown is of the red colored variety.", "Looks to be big slices of pepperoni on their pizza.", "There are lighter colored chunks of meat on the pizza and pork is a white meat"], "image": "train2014/COCO_train2014_000000318638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229149, "question_id": "CefVmdUBUn2Ak247FNPKWH", "question": "What will the man in the blue sweatshirt do next?", "choices": ["take photograph", "give object", "throw object", "eat object"], "correct_choice_idx": 0, "direct_answers": ["take picture", "take picture", "take picture", "snap pictures", "take photograph", "take photo", "take photo", "take photo", "take photo", "take photo"], "difficult_direct_answer": false, "rationales": ["The man is taking a photo.", "The man in the blue sweatshirt has a camera.", "The man is holding a camera and is about to use it."], "image": "train2014/COCO_train2014_000000229149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570205, "question_id": "Cezf4KiTDah4EmA6eK799G", "question": "What creature is associated with the picture on the wall?", "choices": ["wasp", "flea", "spider", "fly"], "correct_choice_idx": 2, "direct_answers": ["spider", "spiders", "spider", "spider", "spider", "spider", "spider", "spiders", "spider", "spiders"], "difficult_direct_answer": false, "rationales": ["Spiders are associated with webs.", "The picture on the wall resembles a web that is made by a spider.", "Spiders spin webs like the one in the picture."], "image": "train2014/COCO_train2014_000000570205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298252, "question_id": "Cf3yVfbSSiRTFMY7ae6WnA", "question": "What kind of country is tis most likely in?", "choices": ["african", "asian", "middle eastern", "american"], "correct_choice_idx": 1, "direct_answers": ["asian", "france", "asian", "japan", "asian", "china", "china", "asian", "japan", "asian country"], "difficult_direct_answer": false, "rationales": ["The language of the prices and description looks chinese.", "The country is asian.", "The printed labels appear to be in japanese."], "image": "val2014/COCO_val2014_000000298252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188136, "question_id": "CfB4uUuSP9LjY8SATeLBZu", "question": "Which food provides the most vitamin A?", "choices": ["yam", "potato", "onion", "carrot"], "correct_choice_idx": 3, "direct_answers": ["carrots", "carrot", "carrots", "carrot", "carrot", "carrots", "carrot", "potatoes", "carrots", "liver"], "difficult_direct_answer": false, "rationales": ["Carrots are high in vitamin a and the highest amount of the foods in the picture.", "Carrots are known to provide many essential vitamins and benefits to the eyes, the most important being vitamin a.", "Carrots are high in vitamins and most likely also in vitamin a."], "image": "val2014/COCO_val2014_000000188136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570344, "question_id": "CfThbPxoyJEo5U2weH57BS", "question": "What is ropeway called?", "choices": ["aerial tramway", "cable way", "cable car", "rope way"], "correct_choice_idx": 0, "direct_answers": ["trolley line", "train", "railway lines", "train", "aerial tramway", "poles", "aerial tramway", "jackstay", "trolley", "cable car"], "difficult_direct_answer": false, "rationales": ["This tram is above the ground with cables hanging over it in sky.", "The ropeway is a tramway.", "It ius called that as it is able to move in rails."], "image": "train2014/COCO_train2014_000000570344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400175, "question_id": "CfbHieDZExssxARtxN9fJf", "question": "How is the man's yellow jacket done up?", "choices": ["zipper", "buckles", "buttons", "laces"], "correct_choice_idx": 0, "direct_answers": ["zipped", "zipped", "zipper", "zipped", "zipper", "zipped", "zipped up", "unzip", "zipped", "zipper"], "difficult_direct_answer": false, "rationales": ["A man is walking with a jacket on that is partially closed. coats are often closed with buttons and zippers.", "The jacket has a zipper.", "The jacket has a zipper to close it."], "image": "train2014/COCO_train2014_000000400175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455614, "question_id": "CfjFNpaKbg3xP9sgKZGGSc", "question": "What profession is the man on the bike?", "choices": ["dentist", "lawyer", "cop", "stunt man"], "correct_choice_idx": 2, "direct_answers": ["police", "cop", "police", "cop", "police", "police officer", "policeman", "police", "cop", "cop"], "difficult_direct_answer": false, "rationales": ["He is in uniform and a police officer.", "You can see from his uniform he is a police officer.", "The motorcycle says police on it."], "image": "train2014/COCO_train2014_000000455614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465374, "question_id": "CgwGtZiojqbvBs5hDpS5K2", "question": "How are the stone walls held together?", "choices": ["mortar", "tape", "glue", "own weight"], "correct_choice_idx": 3, "direct_answers": ["own weight", "mortar", "mud", "gravity", "brick paste", "gravity", "mortar", "cement", "pressure", "stacked"], "difficult_direct_answer": false, "rationales": ["The walls are held by their own weight.", "Traditionally this type of architecture is made without and substance to keep the stones together.", "The stone walls don't have any adhesive between them."], "image": "train2014/COCO_train2014_000000465374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350302, "question_id": "CgyXuTomyNJAH63epDCjVn", "question": "What might the person wearing the hat be doing on the bench?", "choices": ["stealing", "acting", "sleeping", "acting crazy"], "correct_choice_idx": 2, "direct_answers": ["sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping"], "difficult_direct_answer": false, "rationales": ["The person on the bench has his eyes closed.", "They both appear to be doing a. the other options aren't depicted in the image. and people often do this on benches.", "The person is dozing since their eyes are closed."], "image": "train2014/COCO_train2014_000000350302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433554, "question_id": "Ch5ncgPFCDFYj5bfaetcKR", "question": "What is the man wearing in red?", "choices": ["scuba outfit", "lifejacket", "swimsuit", "jeans"], "correct_choice_idx": 1, "direct_answers": ["lifejacket", "life jacket", "wakeboard", "lifevest", "life vest", "lifejacket", "life jacket", "life vest", "life jacket", "life vest"], "difficult_direct_answer": false, "rationales": ["The red vest this man wears is a flotation device.", "The design and the setting lets you know what he is wearing.", "The vibrant orange indicates it is surely a life jacket."], "image": "val2014/COCO_val2014_000000433554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302076, "question_id": "ChFYZC2fT623bajEWpmw9s", "question": "What rule regarding shoes is likely in place here?", "choices": ["boots required", "shoes off", "none", "cleats required"], "correct_choice_idx": 1, "direct_answers": ["no shoes", "no shoes", "no shoes", "no shoes", "taken off", "taken off", "shoes off", "no shoes", "taken off", "no shoes"], "difficult_direct_answer": false, "rationales": ["The white carpet is pristine which means people are probably not allowed to wear their footwear in the house.", "No dirt on the white carpet.", "The carpet is too dainty and clean for people to wear shoes on it."], "image": "train2014/COCO_train2014_000000302076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290798, "question_id": "ChSYowL2y62FPHXSfXHr5Q", "question": "In what continent would this truck setup probably be legal?", "choices": ["europe", "north america", "asia", "south america"], "correct_choice_idx": 3, "direct_answers": ["south america", "asia", "australia", "south america", "asia", "most", "africa", "india", "south america", "asia"], "difficult_direct_answer": false, "rationales": ["South america does not have modern means of transport.", "This would be illegal in the states, and the ethnicity of the people are latino.", "Some south american countries let you over pack your truck like this."], "image": "val2014/COCO_val2014_000000290798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419309, "question_id": "ChX5mbWyMSonq8bG5skBxw", "question": "What session of the day is this likely to be?", "choices": ["evening", "afternoon", "morning", "night"], "correct_choice_idx": 0, "direct_answers": ["dusk", "dusk", "sunset", "dusk", "evening", "dusk", "sunset", "evening", "sunset", "sunset"], "difficult_direct_answer": false, "rationales": ["The sun is setting.", "The sun is setting so it is evening.", "The sun has set, but there is still some daylight."], "image": "val2014/COCO_val2014_000000419309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158514, "question_id": "ChoQivUZM4aSYz8oMaZk2J", "question": "What type of utensil is in the bowl?", "choices": ["scissor", "spoon", "knife", "fork"], "correct_choice_idx": 1, "direct_answers": ["spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "ceramic", "spoon"], "difficult_direct_answer": false, "rationales": ["Anyone can see what type of utensil this is by its design.", "A black spoon can be seen in a bowl.", "A black utensil that is curved is in a bowl. you can use it to pick up items easy because things can be scooped with it."], "image": "train2014/COCO_train2014_000000158514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475466, "question_id": "Ci9tXmxACeqbZsfYAkMq4U", "question": "What is showing the man's image?", "choices": ["water", "camera", "mirror", "picture"], "correct_choice_idx": 0, "direct_answers": ["reflection", "water", "puddle", "reflection", "reflection", "reflection", "water reflection", "reflection", "reflection", "reflection"], "difficult_direct_answer": false, "rationales": ["The overcast day is able to make puddles on the ground reflect light and the reflection of the man.", "The water is reflecting the man's image.", "There is water shown."], "image": "val2014/COCO_val2014_000000475466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34580, "question_id": "CiYJbKcn3gNPdhHfPvX9NS", "question": "What is the red white and green object in the grass connected to?", "choices": ["electricity", "underground railroad", "water supply", "food court"], "correct_choice_idx": 2, "direct_answers": ["water main", "water pipes", "fire hydrant", "water line", "water line", "water", "waterline", "water main", "water supply", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["Water comes out of this item.", "The red, white and green object is used to spray water.", "This is a hydrant that is for firefighters to connect hoses to when there is a fire"], "image": "val2014/COCO_val2014_000000034580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224850, "question_id": "Citsp9JwNLrF8FkHBXAv72", "question": "What type of event are these people at?", "choices": ["carnival", "toy sale", "fire safety", "school event"], "correct_choice_idx": 0, "direct_answers": ["festival", "carnival", "kids event", "fair", "carnival", "carnival", "carnival", "fair", "fair", "fair"], "difficult_direct_answer": false, "rationales": ["They are riding rides at a carnival.", "They have rides for children to wear", "Little cars that go round and round would be at this type of venue."], "image": "val2014/COCO_val2014_000000224850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176091, "question_id": "CiuiiTAnrphFVc2tQfkSRs", "question": "What is the woman walking towards?", "choices": ["cat", "airplane", "car", "boat"], "correct_choice_idx": 1, "direct_answers": ["airplane", "boarding stairs", "airplane", "boarding airplane", "airplane", "airplane", "airplane", "airplane", "plane", "plane"], "difficult_direct_answer": false, "rationales": ["The woman is walking towards a vehicle, not an animal. the vehicle has wings and a fuselage.", "The woman goes to the plane.", "The large object has wide wings and a downward open door that acts as steps for her to get in."], "image": "train2014/COCO_train2014_000000176091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156751, "question_id": "Cj38M3EgZzZQHFLduXpJee", "question": "How many birds are in focus?", "choices": ["four", "two", "one", "three"], "correct_choice_idx": 2, "direct_answers": ["six", "one", "one", "one", "six", "one", "six", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["While there are several birds in the picture, the features of only the one closest to the camera can be clearly identified.", "Only the one in the front is in focus.", "There is one bird in the foreground of the photo."], "image": "val2014/COCO_val2014_000000156751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65220, "question_id": "Cj4XY74UULHX4om68nFEVB", "question": "What gesture is the man doing with his hand?", "choices": ["peace sign", "thumbs down", "gang sign", "thumbs up"], "correct_choice_idx": 0, "direct_answers": ["peace", "peace sign", "peace sign", "peace sign", "peace sign", "peace sign", "peace sign", "peace", "peace", "peace"], "difficult_direct_answer": false, "rationales": ["The man is making a peace sign with his fingers.", "Allied troops started using this hand sign to indicate victory during world war ii. it indicates peace and the end of battle.", "A man in a black shirt and camo pants is standing gesturing. he has two fingers in a v shaped formation."], "image": "train2014/COCO_train2014_000000065220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368541, "question_id": "Cj6pQ2ULUPZftv7gPq5c3K", "question": "Why are they in line?", "choices": ["want money", "get lunch", "board airplane", "leave airplane"], "correct_choice_idx": 2, "direct_answers": ["airplane flight", "board airplane", "waiting", "boarding", "seating", "boarding plane", "loading", "board plane", "embark", "board plane"], "difficult_direct_answer": true, "rationales": ["The people are boarding.", "Luggage in tow, a fairly large group of fliers line up as they prepare to board their flight. at least it's a sunny day, so there are no worries of them getting soaked.", "There is a flying vehicle near the people. the people are climbing up stairs in order to enter the vehicle."], "image": "val2014/COCO_val2014_000000368541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10935, "question_id": "CjSy5uxLhwVuFCVFV22MtD", "question": "What are the small green pools on the ground near the elephants?", "choices": ["water", "sprite", "grass", "lemonade"], "correct_choice_idx": 0, "direct_answers": ["algae", "water", "grass", "blue-green algae", "water", "dirty water", "water", "water", "puddles", "moss"], "difficult_direct_answer": false, "rationales": ["These pools look wet and reflect light like water.", "Algae grows on water. elephants are sticking their trunks into small areas that are green in color.", "The pools are water."], "image": "val2014/COCO_val2014_000000010935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392068, "question_id": "CjTbDAgGkrB929ky6wXq5H", "question": "What fruit is visible here?", "choices": ["kiwi", "pineapple", "grapes", "olives"], "correct_choice_idx": 3, "direct_answers": ["olive", "olives", "olive", "kiwi", "olives", "olive", "tomato", "olive", "pineapple", "olives"], "difficult_direct_answer": false, "rationales": ["A fruit is a ripened ovary or ovaries of a seed-bearing plant, together with accessory parts, containing the seeds. these have seeds.", "You can see the olives sitting on the pizza.", "Black pieces can be seen on top of the pizza."], "image": "train2014/COCO_train2014_000000392068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293674, "question_id": "CjVgbgDzuNjWqmJKSfyz7i", "question": "What is the country code of the frisbee shop being advertised?", "choices": ["one", "11", "25", "49"], "correct_choice_idx": 3, "direct_answers": ["can't see", "us", "49", "united states", "49", "de", "512", "com", "united states", "one"], "difficult_direct_answer": false, "rationales": ["The country code of this country is 49.", "The code is 49.", "The country code for the shop would be 49."], "image": "train2014/COCO_train2014_000000293674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491319, "question_id": "CjaMUhERbMMMLVK6pnKbqc", "question": "What is the red circular liquid on the plate?", "choices": ["food coloring", "ketchup", "blood", "dye"], "correct_choice_idx": 1, "direct_answers": ["ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup"], "difficult_direct_answer": false, "rationales": ["The liquid is ketchup.", "Some people dip different foods into ketchup.", "Most people like to have ketchup with their fries but this person must like it with their pizza."], "image": "train2014/COCO_train2014_000000491319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577653, "question_id": "Cjc5LWDDt7VfvtBDbtFWY7", "question": "What is the woman doing with her umbrella?", "choices": ["trashing it", "buying it", "singing songs", "fixing it"], "correct_choice_idx": 3, "direct_answers": ["fixing", "fixing it", "fixing it", "fixing", "fixing it", "opening it", "fixing it", "folding it", "fixing it", "unfolding"], "difficult_direct_answer": false, "rationales": ["A woman is holding an umbrella that is opened to far and is bent back instead of down.", "Her umbrella is upside down.", "The umbrella has been blown inside out by the wind so will not protect from the rain in current configuration."], "image": "train2014/COCO_train2014_000000577653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39617, "question_id": "CjtHKiqmbYdNJ7umBEBzGx", "question": "What is the man using the rope from the horse to do?", "choices": ["to lead", "to punish", "to whip", "to lasso"], "correct_choice_idx": 0, "direct_answers": ["lead", "normal", "lead", "ride", "leading horses", "lead", "to lead", "lead", "control horse", "hold"], "difficult_direct_answer": false, "rationales": ["The rope is at the front of the horse, and used in the same way a leash would be used to guide or lead an animal.", "The rope is attached to the harness of the horse so it can be used to pull the horse forward like a leash.", "The man is using the rope to lead the horses."], "image": "train2014/COCO_train2014_000000039617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166069, "question_id": "CjuCrP9Cps2E4wHL4uqBbj", "question": "Why does he have his shirt off?", "choices": ["stolen", "confused", "warm weather", "can't find"], "correct_choice_idx": 2, "direct_answers": ["throw disc", "hot", "he's hot", "hot out", "warm weather", "its hot", "cool off", "hot outside", "no tan-lines", "tanning"], "difficult_direct_answer": true, "rationales": ["The man with the frisbee as well as several others pictured are warm so have their shirt off. some of the women are wearing sleeveless dresses or bathing suit tops.", "The weather is hot so no shirt is being worn.", "It appears to be warm outside and he doesn't need to have a shirt on."], "image": "train2014/COCO_train2014_000000166069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168317, "question_id": "CkjMzqQVPngGMaU8JRpftR", "question": "What kind of dog is this?", "choices": ["strayed dog", "service dog", "domestic pet", "farm dog"], "correct_choice_idx": 2, "direct_answers": ["german shepherd", "lazy", "australian shepherd", "domestic pet", "sheep", "border collie", "collie", "collie", "cocker spaniel", "collie"], "difficult_direct_answer": false, "rationales": ["Domestic pets are well kept and clean.", "The dog is domestic.", "A domestic dog that is a pet of someone's"], "image": "train2014/COCO_train2014_000000168317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 813, "question_id": "Ckn955NeqCyCdbwnBdw5F4", "question": "What kind of animals are these?", "choices": ["aquatic", "stuffed", "reptiles", "polar"], "correct_choice_idx": 0, "direct_answers": ["geese", "ducks", "geese", "duck", "swans", "ducks", "ducks", "ducks", "geese", "aquatic"], "difficult_direct_answer": false, "rationales": ["These animals are geese and they are known to like to be near water (like in the picture) for food and safety.", "The ducks are in the water.", "These are real canada geese. they do not live in polar regions and are not reptiles."], "image": "train2014/COCO_train2014_000000000813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388412, "question_id": "CmfDCLwnzFaFe8rynRd9VX", "question": "What is the purpose of the cord plugged into the right side of the laptop?", "choices": ["ethernet cable", "charger", "cyborg connection", "monitor cord"], "correct_choice_idx": 1, "direct_answers": ["charger", "charging", "power", "charger", "electricity", "charge", "charger", "storage", "charge", "for power"], "difficult_direct_answer": false, "rationales": ["The cord is the only visible cord currently plugged into the computer and appears to be the same shape and style as a charger cord.", "The black cord plugged into the side of the laptop is to charge the battery.", "A laptop is being charged."], "image": "train2014/COCO_train2014_000000388412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400913, "question_id": "CmfNsNXM9MY7XpDw4R6PEJ", "question": "Why does the man have his arms out?", "choices": ["wave", "break fall", "reach", "balance"], "correct_choice_idx": 3, "direct_answers": ["balance", "balancing", "balance", "balance", "balance", "balancing", "balance", "balance", "for balance", "balance"], "difficult_direct_answer": false, "rationales": ["With the skateboard he is on you need good balance to stay on.", "The man is on a skateboard which requires balance to stay on and ride correctly. having one's arms out while balancing is a way to maintain balance.", "The man is trying to balance."], "image": "train2014/COCO_train2014_000000400913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290403, "question_id": "Cmkit2JNPKXwhSH7A8T4se", "question": "What person is famous for doing this sport?", "choices": ["tony orlando", "john smoltz", "tony hawk", "john tenta"], "correct_choice_idx": 2, "direct_answers": ["bam margera", "tony hawk", "skateboarding", "tony hawk", "tony hawk", "tony hawk", "skateboarders", "rodney mullen", "tony hawk", "tony hawk"], "difficult_direct_answer": false, "rationales": ["The person shown is skateboarding. one of the men listed is a famous american skateboard with a series of video games that uses his name.", "Tony hawk is a famous skateboarder.", "Tony hawk is most likely the most famous skateboarder to those of us who don't follow the sport too closely. he took the skateboard world by storm in the late 90's and is considered one of the most influential boarders of all time."], "image": "train2014/COCO_train2014_000000290403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478410, "question_id": "CmwMtPkQ3haki8A7D8cfnx", "question": "What kind of dogs are seen here?", "choices": ["maltese", "havanese", "rough collie", "pug"], "correct_choice_idx": 2, "direct_answers": ["rough collie", "collie", "collie", "sheep", "border collie", "shelties", "collies", "collies", "german shepherd", "collies"], "difficult_direct_answer": false, "rationales": ["The dogs look like lassie so i choose the same breed.", "Due to their fur that covers their entire body.", "A collie dog has shaggy hair like the dog shown."], "image": "train2014/COCO_train2014_000000478410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310053, "question_id": "CmxSyjHX4WJ7qoXkUSLinT", "question": "What is blowing in the wind?", "choices": ["sand", "leaves", "rain", "snow"], "correct_choice_idx": 3, "direct_answers": ["snow", "rain", "snow", "snow", "snow", "snow", "snow", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["It is snowing and the snow is being blown in the wind.", "The vehicles are caught in a snow storm.", "The snow is blowing in the wind."], "image": "train2014/COCO_train2014_000000310053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491279, "question_id": "CnBNjXiKjY6CSUTAoApDr9", "question": "How do these people know each other?", "choices": ["teammates", "coworkers", "family", "classmates"], "correct_choice_idx": 2, "direct_answers": ["parent", "family", "family", "same family", "her children", "family", "family", "same family", "family", "family"], "difficult_direct_answer": false, "rationales": ["The woman has two daughters and they all resemble each other.", "The woman looks to be the children's mother.", "These girls look alike and look like the woman so it is likely mother daughters."], "image": "train2014/COCO_train2014_000000491279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114313, "question_id": "CnBgJC48brEduLmbULgnzU", "question": "What is the man adjusting?", "choices": ["laptop", "belt", "pants", "hydrant"], "correct_choice_idx": 3, "direct_answers": ["fire hydrant", "fire hydrant", "fire hydrant", "hydrant", "fire hydrant", "fire hydrant", "water pressure", "hydrant", "hydrant", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["You can tell by the color and setting as to what the man is adjustng.", "They know just how much water pressure they need.", "The man is adjusting the fire hydrant."], "image": "val2014/COCO_val2014_000000114313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211788, "question_id": "CnSGLGAqwNPDrfc9LNaos6", "question": "What should be put in the nearby trashcan?", "choices": ["paper only", "recyclables", "normal trash", "glass only"], "correct_choice_idx": 2, "direct_answers": ["garbage", "garbage", "trash", "normal trash", "garbage", "trash", "trash", "trash", "garbage", "trash"], "difficult_direct_answer": false, "rationales": ["B, c and d would be put in a blue or green bin.", "It seems to be just a regular place to put your garbage.", "A trashcan is on the sidewalk at a train station."], "image": "train2014/COCO_train2014_000000211788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285096, "question_id": "CnVDHZLJYz8MJS8ERMKVJM", "question": "Which single step could the yellow boat's owner take to preserve his investment in the boat?", "choices": ["junk it", "sink it", "paint", "add flag"], "correct_choice_idx": 2, "direct_answers": ["paint", "paint it", "fix rust", "sealant", "remove rust", "clean", "paint", "paint", "paint", "paint"], "difficult_direct_answer": false, "rationales": ["It will help prevent rust", "A boat is at a dock and is rusty. new paint can cover rust.", "Painting it will make it last longer and will make it look nice and fresh."], "image": "val2014/COCO_val2014_000000285096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525024, "question_id": "CnVKZYEzM6ESxwPZiPyRQo", "question": "Why are they flying kites?", "choices": ["are bored", "want money", "showing off", "great weather"], "correct_choice_idx": 3, "direct_answers": ["fun", "festival", "great weather", "fun", "fun", "fun", "fun", "festival", "fun", "festival"], "difficult_direct_answer": false, "rationales": ["The kites can only fly in good weather.", "The people want to show off the colorful kites.", "The kites are flying high."], "image": "val2014/COCO_val2014_000000525024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103538, "question_id": "CnjgT8HMtNzZ8xSKR942dQ", "question": "What are the island walls made of?", "choices": ["brick", "wood", "tile", "laminate"], "correct_choice_idx": 0, "direct_answers": ["fake rock", "brick", "stone", "stone", "plaster", "stone", "brick", "tile", "stone", "brick"], "difficult_direct_answer": false, "rationales": ["The walls show some brick work.", "The sides consist of stones held together by cement.", "It is made of brick."], "image": "val2014/COCO_val2014_000000103538.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138276, "question_id": "CnvnRGwsLYBTfBbQsYtJs2", "question": "What is she doing?", "choices": ["stealing book", "hiding book", "reading book", "writing book"], "correct_choice_idx": 2, "direct_answers": ["reading", "reading", "reading", "reading", "reading book", "reading", "reading", "reading", "reading", "reading"], "difficult_direct_answer": false, "rationales": ["The girl is reading in bed.", "She has an open book in her hands", "A woman is propped up in bed holding a book. her left hand supports it so she can clearly see the text of the book."], "image": "train2014/COCO_train2014_000000138276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509772, "question_id": "Co4dCsb8v28SjQQCdeLEHM", "question": "What recently occurred to the grass within this area?", "choices": ["aerated", "weeded", "mowed", "seeded"], "correct_choice_idx": 2, "direct_answers": ["mowed", "rain", "rain", "mowed", "mowed", "mowing", "mowed", "rained", "mowed", "mowed"], "difficult_direct_answer": false, "rationales": ["The grass looks very short and trimmed indicating it has been mowed recently.", "You can still see the marks from the lawnmower.", "It looks like they recently cut the grass."], "image": "val2014/COCO_val2014_000000509772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418087, "question_id": "Co5sYQtrpPNYNo4po5jqL8", "question": "Who has the ball?", "choices": ["umpire", "catcher", "hitter", "pitcher"], "correct_choice_idx": 3, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["The ball is at the mound. it is about to be thrown towards the hitter, catcher, and umpire.", "The pitcher has the ball in his hand.", "You can tell by the ball in the pitcher's hand."], "image": "train2014/COCO_train2014_000000418087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184397, "question_id": "CoFhCNr6XHafk5oNpuX7fu", "question": "What NFL team plays in the town?", "choices": ["chiefs", "texans", "patriots", "cowboys"], "correct_choice_idx": 3, "direct_answers": ["stars", "cowboys", "cowboys", "dallas cowboys", "cowboys", "dallas", "cowboys", "cowboys", "cowboys", "cowboys"], "difficult_direct_answer": false, "rationales": ["They are from dallas.", "That's the nfl team for dallas.", "The text on the police motorcycle indicates that it belongs to the dallas police. the texans play in houston, not dallas."], "image": "train2014/COCO_train2014_000000184397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519024, "question_id": "CoLUqAjWdsCBM2i5Fe8rQN", "question": "How far will this train travel usually?", "choices": ["1000 miles", "local only", "far away", "nowhere"], "correct_choice_idx": 1, "direct_answers": ["long distances", "not far", "short distance", "1.3 kilometers", "long distance", "short distance", "far", "local only", "mile", "not far"], "difficult_direct_answer": false, "rationales": ["It is a small train without full cover and it's for people to ride in", "The train is a junior size train and is suitable for kids on short fun trips; it is not a train for extended trips or actual \"traveling\" passengers.", "By the small size and occupancy capacity you can tell how far it travels."], "image": "train2014/COCO_train2014_000000519024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296901, "question_id": "CoS62BD2zUicRvmxJ7LmbX", "question": "Why is stopped on his bike?", "choices": ["is scared", "needs help", "is lost", "red light"], "correct_choice_idx": 3, "direct_answers": ["give way", "red light", "traffic", "traffic", "waiting", "traffic stop", "red light", "watching traffic", "waiting", "red light"], "difficult_direct_answer": false, "rationales": ["The traffic controlling device located on the other side of the street indicates that vehicles travelling in this direction should stop.", "A man has stopped at the intersection. it is waiting for the light to turn so he doesn't get hit.", "The man is waiting for the green light."], "image": "train2014/COCO_train2014_000000296901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406189, "question_id": "CoXZwvgknwBoituzH6DpKm", "question": "Which giraffe is the youngest?", "choices": ["far right", "far left", "middle left", "middle right"], "correct_choice_idx": 0, "direct_answers": ["smallest", "baby", "smallest", "smallest", "smallest", "far right", "small one", "right giraffe", "far right", "right"], "difficult_direct_answer": false, "rationales": ["The giraffe that's smallest is youngest.", "The one on that side is the smallest.", "The giraffe is on the right."], "image": "val2014/COCO_val2014_000000406189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121372, "question_id": "CpgyAy5GJt4CSfd2G7EBvt", "question": "What type of job are the men doing?", "choices": ["paving", "dancing", "construction", "baking"], "correct_choice_idx": 3, "direct_answers": ["baking", "making donuts", "cooking", "making donuts", "baking", "baking", "baking", "baking", "food manufacturing", "baking donuts"], "difficult_direct_answer": false, "rationales": ["The people are making donuts.", "There are donuts being made. donuts are made by bakers.", "They are making donuts."], "image": "train2014/COCO_train2014_000000121372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171315, "question_id": "Cqo5X8z63wv2yRt2e5F3bd", "question": "Why are the stuffed animals in the window?", "choices": ["to repair", "to decorate", "to block", "to sell"], "correct_choice_idx": 3, "direct_answers": ["to sell", "for sale", "sale", "for sale", "display", "display", "for sale", "retail presentation", "for sale", "to sell"], "difficult_direct_answer": false, "rationales": ["These items have price tags on them and are for sale.", "They are arranged on shelves and have tags with their information for buyers on them.", "The stuffed animals have tags on them as if they are ready to be sold."], "image": "train2014/COCO_train2014_000000171315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35004, "question_id": "CqqAMGaffVteoYdrrQAM5A", "question": "What type of setting are the people most likely located in?", "choices": ["aa meeting", "university", "condo", "coffee shop"], "correct_choice_idx": 1, "direct_answers": ["work office", "classroom", "office", "restaurant", "university", "office", "office", "office", "office", "donut shop"], "difficult_direct_answer": false, "rationales": ["The people are eating breakfast. there is juice on the table.", "The people are probably in college since there are so many youngters.", "You can tell by the books and the shelves in the background as to where they are."], "image": "train2014/COCO_train2014_000000035004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37157, "question_id": "Cqzi4xnceLWE3n6fNJNLDd", "question": "What animal might the black item have come from?", "choices": ["rabbit", "fish", "goat", "cow"], "correct_choice_idx": 3, "direct_answers": ["cow", "cow", "cow", "eagle", "eagle", "cow", "cow", "cow", "cow", "eagle"], "difficult_direct_answer": false, "rationales": ["Leather comes from a cow's hide.", "Leather is a valuable product from the cattle industry.", "It looks to be made of leather."], "image": "train2014/COCO_train2014_000000037157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351861, "question_id": "CrEEqe2uxpDT3qQurkagKR", "question": "What is the name of a famous player of this sport?", "choices": ["johnson", "sampras", "botham", "rooney"], "correct_choice_idx": 1, "direct_answers": ["andre agassi", "roger federer", "federer", "andre agassi", "federer", "federer", "andre agassi", "sampras", "federer", "andre agassi"], "difficult_direct_answer": false, "rationales": ["Pete sampras was one of the most famous tennis players.", "Pete sampras is a well know tennis player.", "Pete sampras is a very famous male tennis star."], "image": "train2014/COCO_train2014_000000351861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5385, "question_id": "CrM8rsZpBCHm2mbZ7ZXkxV", "question": "When was baseball invented?", "choices": ["1884", "1812", "1839", "1891"], "correct_choice_idx": 2, "direct_answers": ["1846", "nineteenth century", "eighteenth century", "1900s", "1846", "knickerbockers", "1846", "eighteen thirtyeight", "1839", "eighteen thirty-nine"], "difficult_direct_answer": false, "rationales": ["The game was first created in 1839", "Baseball was invented in 1839.", "It is claimed that abner doubleday created the game in 1839."], "image": "val2014/COCO_val2014_000000005385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296374, "question_id": "CrSadNmmwQqMcxxoZ5nKGV", "question": "This photo was taken in front of what kind of attraction?", "choices": ["view point", "restaurant", "ski lodge", "museum"], "correct_choice_idx": 2, "direct_answers": ["ski lift", "ski lodge", "building", "ski resort", "lift", "ski lodge", "ski resort", "ski jump", "ski lodge", "ski resort"], "difficult_direct_answer": false, "rationales": ["There is a lot of snow and ski equipment with them", "The location and the signage indicates that this is an area dedicated to skiing.", "The men have are holding snowboards which shows they are at a ski resort."], "image": "train2014/COCO_train2014_000000296374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327038, "question_id": "CriQXAvUtBFS4a4HhGv7KZ", "question": "What type of food is the person probably making?", "choices": ["burger", "sandwich", "burrito", "pizza"], "correct_choice_idx": 2, "direct_answers": ["italian", "burrito", "gyro", "gyro", "falafel", "pizza", "burrito", "taco", "mexican", "quesadilla"], "difficult_direct_answer": false, "rationales": ["The food item is using a tortilla to be folded up.", "There is a tortilla under the other food.", "Burritos are wrapped in tortillas."], "image": "val2014/COCO_val2014_000000327038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573314, "question_id": "CrsCH53N2McGvqDQrMLVZV", "question": "In which way Maritime transport is taken place?", "choices": ["none", "water", "air", "land"], "correct_choice_idx": 1, "direct_answers": ["boat", "via boat", "docking", "boat", "boat", "boat", "ferry", "water vessels", "water", "boat"], "difficult_direct_answer": false, "rationales": ["To get things moved with a boat water is needed.", "Boats can only travel by themselves by water.", "Boats are used for this"], "image": "train2014/COCO_train2014_000000573314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97283, "question_id": "CsAdunGXMqft8TPnJz33De", "question": "What roofing method was used on these houses?", "choices": ["wicked", "raftered", "gabled", "thatching"], "correct_choice_idx": 3, "direct_answers": ["straw", "thatch", "straw", "reeds", "thatch", "thatching", "thatch", "thatch", "thatch", "straw"], "difficult_direct_answer": false, "rationales": ["The houses' rooftops have thatching.", "The method is thatching.", "These houses are thatched straw roofed."], "image": "train2014/COCO_train2014_000000097283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451727, "question_id": "CsBAjFDudpzhnZmY4Pqc7s", "question": "What would someone have to do when baking a cake here to cook it?", "choices": ["bend down", "go sideways", "reach up", "nothing"], "correct_choice_idx": 0, "direct_answers": ["make dough", "bend down", "use oven", "open oven", "turn on", "bake", "use stove", "turn on", "set timer", "preheat oven"], "difficult_direct_answer": true, "rationales": ["The oven is below.", "The oven is below waist level.", "The oven is at a lower level than the counter."], "image": "train2014/COCO_train2014_000000451727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162171, "question_id": "CsCFPkMaZeQErS8EL875Pv", "question": "What animals are present?", "choices": ["goat", "sheep", "cow", "horse"], "correct_choice_idx": 3, "direct_answers": ["horses", "horses", "horses", "horses", "horses", "horses", "horse", "horses", "horses", "horses"], "difficult_direct_answer": false, "rationales": ["These are horses in the field.", "These animals have manes. they are not sheep, cows, or goats.", "Horses are grazing in the field."], "image": "train2014/COCO_train2014_000000162171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10434, "question_id": "CsJRFBUWfrHCRRDmQV5RYy", "question": "Why is the man jumping in the air?", "choices": ["exercising", "to fight", "to avoid", "doing trick"], "correct_choice_idx": 3, "direct_answers": ["showing off", "skateboarding", "doing trick", "skating", "skateboarding", "doing trick", "skateboarding", "skateboard trick", "fun", "doing tricks"], "difficult_direct_answer": false, "rationales": ["Trying to do a trick on the sidewalk.", "The man is using a skateboard. he is not fighting someone or avoiding something.", "He is on a skateboard doing a trick."], "image": "train2014/COCO_train2014_000000010434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144582, "question_id": "CsLeZ8N2NodnixwUfWmczS", "question": "What is the man also probably doing while reading on the bench?", "choices": ["talking", "recording", "playing music", "writing"], "correct_choice_idx": 2, "direct_answers": ["listening music", "listening music", "playing music", "listening music", "listening music", "listening to music", "playing music", "listening music", "music listening", "listening music"], "difficult_direct_answer": false, "rationales": ["The man is listening to music from his headphones.", "The man is wearing headphones and could be listening to a song.", "The man is playing music."], "image": "train2014/COCO_train2014_000000144582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325693, "question_id": "CtFZvBWJgnqVjTX7hRXN9e", "question": "What is the silver plate the man is using made of?", "choices": ["metal", "plastic", "paper", "wood"], "correct_choice_idx": 0, "direct_answers": ["aluminium", "tin", "tin", "metal", "metal", "sauce", "steel", "metal", "silver", "metal"], "difficult_direct_answer": false, "rationales": ["The platter is shiny and reflective. it is silver in color.", "The plate is silver and shiny", "The silver plate is made of metal."], "image": "train2014/COCO_train2014_000000325693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530600, "question_id": "CtnC3KkXufnRQafaSz9R6g", "question": "Though wearing clothes some of these bears lack what garment that most humans consider necessary in public?", "choices": ["pants", "shoe inserts", "vests", "ties"], "correct_choice_idx": 0, "direct_answers": ["pants", "clothes", "pants", "pants", "pants", "clothes", "clothes", "pants", "pants", "clothes"], "difficult_direct_answer": false, "rationales": ["The bears have sweaters on. it's against the law to leave your house without any pants on.", "They have nothing on their lower half of their bodies.", "The bears don't have any pants to cover their bottoms."], "image": "val2014/COCO_val2014_000000530600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527083, "question_id": "CtpShhCBfKWCcmtfPdUR5u", "question": "What material is the lighthouse made from?", "choices": ["brick", "metal", "wood", "stone"], "correct_choice_idx": 3, "direct_answers": ["stone", "bricks", "stone", "rocks", "stone", "brick", "stone", "stone", "stones", "stone"], "difficult_direct_answer": false, "rationales": ["A large lighthouse is lit and is made of material held together with mortar. stone is held together with mortar.", "The construction material appears to be of a color, size and shape consistent with answer a.", "The layers of stone are visible up the side of the lighthouse."], "image": "train2014/COCO_train2014_000000527083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469777, "question_id": "CtpbrknGwMHcET6sRX4Crq", "question": "The person holding the camera is wearing what color shirt?", "choices": ["orange", "yellow", "blue", "red"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The woman aiming a camera is in a blue shirt.", "A woman is standing by herself with her hands close together in air. she looks to be taking a shot of the elephant.", "The person is in blue."], "image": "val2014/COCO_val2014_000000469777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352011, "question_id": "CtqKi4dkJVtMx356NbwQpr", "question": "What comes out of the gray machine in the back?", "choices": ["water", "metal sheets", "pizza", "paper"], "correct_choice_idx": 3, "direct_answers": ["paper", "paper", "paper", "paper", "paper", "printer paper", "printed pages", "printer paper", "printed papers", "paper"], "difficult_direct_answer": false, "rationales": ["The grey machine is a printer. it cannot print on pizza, water, or metal.", "The gray machine has paper.", "It is a printer."], "image": "val2014/COCO_val2014_000000352011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403675, "question_id": "CuqcZnFPjRb5ZN9sGuGsdm", "question": "What is the name of a famous player of this game?", "choices": ["sweet tish", "babe ruth", "young genie", "tiny larry"], "correct_choice_idx": 1, "direct_answers": ["babe ruth", "babe ruth", "babe ruth", "joe dimaggio", "babe ruth", "hank aaron", "eddie matthews", "brakes", "sammy sosa", "babe ruth"], "difficult_direct_answer": false, "rationales": ["Babe ruth plays baseball.", "There are many well-known players of this game, but arguably one the most famous is a player named babe ruth.", "A man is wearing a baseball uniform."], "image": "val2014/COCO_val2014_000000403675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184942, "question_id": "Cv2n5XdLU5gEUk5hb3tpnt", "question": "What could have made the road appear shiny?", "choices": ["wind", "rain", "snow", "paint"], "correct_choice_idx": 1, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "ice", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The road is wet from rain.", "A road is shiny and the sky is overcast. roads look shiny when they are wet.", "The ground looks wet and it appears warmer out so rain is the likely culprit."], "image": "train2014/COCO_train2014_000000184942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336552, "question_id": "CvF6UKevTeuCAwv8pqMnUW", "question": "What is the man in the middle doing?", "choices": ["posing", "threatening other", "throwing ball", "falling"], "correct_choice_idx": 2, "direct_answers": ["throwing", "throwing", "throwing ball", "throwing", "throwing", "throwing", "throwing ball", "running", "throw ball", "throwing ball"], "difficult_direct_answer": false, "rationales": ["The man is getting ready to throw the ball to the pitcher.", "He has the ball in his hand behind his ear and is bending his arm in order to launch it.", "The catcher is standing in a throwing position."], "image": "train2014/COCO_train2014_000000336552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138975, "question_id": "CvFrNkNAz8EhuRteykpnF3", "question": "What color jacket was the person in the white shirt wearing earlier?", "choices": ["black", "gray", "red", "brown"], "correct_choice_idx": 3, "direct_answers": ["brown", "brown", "brown", "black", "red", "brown", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["You can see the jacket hanging from the person's backpack.", "The color is brown.", "The person was wearing a brown jacket that is now on the backpack."], "image": "val2014/COCO_val2014_000000138975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423313, "question_id": "CvJvLXZtQzV5A8vNXRuCTq", "question": "Who owns the company with the red sign?", "choices": ["yao ming", "ming khaphu", "khaphu li", "yao khaphu"], "correct_choice_idx": 3, "direct_answers": ["footstep", "footstep", "footstep", "foodstop", "yao khaphu", "footstep", "foodstop", "footstep", "footstep", "footstep"], "difficult_direct_answer": false, "rationales": ["Yao owns the company", "The company is zenco footstep.", "He is the person that who owns zenco footstep."], "image": "train2014/COCO_train2014_000000423313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338960, "question_id": "CvKb2heVXTwCzq7DJu8yge", "question": "Where does the Christmas tree come from?", "choices": ["germany", "england", "switzerland", "russia"], "correct_choice_idx": 0, "direct_answers": ["woods", "forest", "europe", "germany", "mountain", "forest", "woods", "store", "forest", "forest"], "difficult_direct_answer": false, "rationales": ["The christmas tree is a tradition from germany.", "Christmas trees come from germany originally.", "O tannenbaum denotes the german country where this comes from."], "image": "train2014/COCO_train2014_000000338960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395828, "question_id": "CvLBSbZREwd3ysgj5ioPYy", "question": "The Chesapeake and Ohio Railway was a Class I railroad formed when?", "choices": ["1888", "1958", "1869", "1858"], "correct_choice_idx": 2, "direct_answers": ["long ago", "chesapeake", "1869", "1869", "1869", "1869", "virginia railways", "not sure", "1869", "eighteen sixty-nine"], "difficult_direct_answer": false, "rationales": ["The railway was formed in 1869.", "The railway's creation is from the 1869 era.", "The railroad is from 1869."], "image": "train2014/COCO_train2014_000000395828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393268, "question_id": "Cvdgo4wb5sfPWdfNtPJPLB", "question": "To prevent over fermentation and reactions in beverages they are stored in which color bottle?", "choices": ["green", "transparent", "brown", "black"], "correct_choice_idx": 2, "direct_answers": ["brown", "dark brown", "brown", "brown", "brown", "amber", "green", "dark", "brown bottle", "amber"], "difficult_direct_answer": false, "rationales": ["Many items are stored in brown or dark bottles to preserve their freshness.", "The brown keeps it cool.", "The labels are green, but the bottles are a different color. the bottles are not black or transparent."], "image": "train2014/COCO_train2014_000000393268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18535, "question_id": "CvemzR9cwneD2tQHB4JtXo", "question": "Persons using these umbrellas also enjoy what water sport?", "choices": ["water skiing", "chess", "body boards", "water polo"], "correct_choice_idx": 2, "direct_answers": ["swimming", "surfing", "surfing", "swimming", "swimming", "body boards", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["You can see a near the umbrellas.", "The boards are on the beach next to the umbrellas.", "There are body boards visible in the same setting as the umbrellas. people would bring the boards with them to the beach if it was something they enjoyed."], "image": "train2014/COCO_train2014_000000018535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147132, "question_id": "CvkMSaUmzPGzuDx8VpqtJq", "question": "Why are they wearing suits?", "choices": ["costume", "dress code", "uniform", "warmth"], "correct_choice_idx": 3, "direct_answers": ["keep warm", "surfing", "warmth", "surfing", "go surfing", "cold weather", "wet", "cold water", "to surf", "go surfing"], "difficult_direct_answer": false, "rationales": ["They are wearing wet suits which provide thermal protection while surfing.", "They are surfing.", "The ocean is freezing."], "image": "val2014/COCO_val2014_000000147132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296393, "question_id": "CwpLaufGT3Q5y5gE5cGYcv", "question": "How was the granite object shaped?", "choices": ["painting", "erosion", "carving", "drawn"], "correct_choice_idx": 2, "direct_answers": ["power tools", "sculpture tools", "whale", "stone carving", "saw", "carved", "nature", "whale", "carving", "curved"], "difficult_direct_answer": true, "rationales": ["Someone had to carve the granite to get it to look like this.", "Although some natural b might be on display because the sculptor worked around it.", "The granite is put into the shape of an intricate carving."], "image": "val2014/COCO_val2014_000000296393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201141, "question_id": "Cwt8imCgBKR4enhMY5SsDd", "question": "What is the green stalk for?", "choices": ["sale", "decoration", "growing bananas", "hanging bananas"], "correct_choice_idx": 3, "direct_answers": ["holding bananas", "storage", "holding", "base", "hanging bananas", "hanging bananas", "display bananas", "hanging bananas", "growing bananas", "holds bananas"], "difficult_direct_answer": false, "rationales": ["There are bananas hanging from this so that must be its purpose.", "There are bananas attached.", "Bananas are hanging from a green stick."], "image": "val2014/COCO_val2014_000000201141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85035, "question_id": "Cx4ogRCRHmNvspUYWa4ms8", "question": "Why does the man have his arm out?", "choices": ["break fall", "balance", "reach", "point"], "correct_choice_idx": 3, "direct_answers": ["pointing", "giving directions", "point", "pointing", "giving directions", "pointing", "pointing", "pointing", "pointing", "pointing"], "difficult_direct_answer": false, "rationales": ["By his gesture you can tell as to what he is doing.", "He is using his first or pointer finger.", "The man's arm and finger are stretched out because he is trying to specify a location that he wants others to look at."], "image": "train2014/COCO_train2014_000000085035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19444, "question_id": "Cx6qBNDeZc6yFGwxcQ6dFU", "question": "What is the emotion on the person's face?", "choices": ["confident", "frustrated", "scared", "sad"], "correct_choice_idx": 0, "direct_answers": ["keen concentration", "confident", "determined", "concentrated", "anticipation", "neutral", "serious", "determination", "determination", "focused"], "difficult_direct_answer": true, "rationales": ["The athlete's emotional state appear to be of someone who knows what he is doing. this is also known as confidence.", "The person looks determined.", "Because he is about to catch the frisbee on the air."], "image": "val2014/COCO_val2014_000000019444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278582, "question_id": "CxDvDPNV2YWaA87nfXjmGQ", "question": "What action is taking place here?", "choices": ["cheering", "fighting", "rioting", "protesting"], "correct_choice_idx": 0, "direct_answers": ["taking pictures", "celebration", "cheering winners", "show performance", "lifting people", "dancing", "concert", "cheering", "photo", "funeral"], "difficult_direct_answer": true, "rationales": ["There is a group of people holding others up in a pyramid style, which is a popular formation in cheerleading.", "People in the crowd are cheering.", "People are cheering for a show in this place."], "image": "val2014/COCO_val2014_000000278582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111547, "question_id": "CxKHZSmmJBJot7PKtLxHA2", "question": "What happens when a vehicle is cleared to go?", "choices": ["arrest", "door prize", "free coffee", "arm raises"], "correct_choice_idx": 3, "direct_answers": ["pole raised", "raised", "bar raises", "gate lifts", "arm raises", "over gate", "bus moves", "gate up", "arm raised", "move along"], "difficult_direct_answer": true, "rationales": ["Vehicles are lines up on a street in front of a yellow long board that is blocking the road.", "Vehicles are on a bridge stopped behind a large yellow mechanical arm that blocks the road.", "The pole will go up in the air to allow them passage."], "image": "val2014/COCO_val2014_000000111547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407528, "question_id": "CxP64ovxBvh3KcFYgBKngC", "question": "Which states have the most palm trees?", "choices": ["arizona", "california", "texas", "hawaii"], "correct_choice_idx": 2, "direct_answers": ["california", "california florida", "southern states", "california", "hawaii", "texas", "texas", "southern", "tree", "florida"], "difficult_direct_answer": false, "rationales": ["Texas has three species of indigenous palm trees while other states only have one.", "Palm trees grow really well in tropical places. islands tend to be very tropical.", "The state is texas."], "image": "val2014/COCO_val2014_000000407528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 984, "question_id": "CxYCbfMoV9JieYw6YeoDgD", "question": "Where is the likely location?", "choices": ["outdoor market", "outdoor stadium", "outdoor patio", "outdoor rink"], "correct_choice_idx": 0, "direct_answers": ["market", "market", "market", "market", "market", "grocery", "outdoor market", "market", "market", "grocery store"], "difficult_direct_answer": false, "rationales": ["Fruits and vegetables are frequently found at outdoor markets. the produce is fresh as the person selects what they wish to purchase.", "The person is holding bananas. they are surrounded by large quantities of other fruits.", "Stands with fruit like this are often found in outdoor markets."], "image": "train2014/COCO_train2014_000000000984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301977, "question_id": "CxhgLZSEaQD4HS2GC4PCFs", "question": "The first five letters in white that are on the sign are used in spelling what style?", "choices": ["bohemian", "gangnam", "grunge", "tiger"], "correct_choice_idx": 1, "direct_answers": ["block", "gangnam", "print", "gangnam", "ganga", "capitalized", "gangnam", "english", "ganga", "block"], "difficult_direct_answer": false, "rationales": ["That word starts with those letters.", "The first five letters ganga are gangnam style.", "The first five letters are the exact same in this option."], "image": "val2014/COCO_val2014_000000301977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179520, "question_id": "CxpTtiGJdYTrxsfJaKLuFw", "question": "Why is the player blurry?", "choices": ["moving fast", "bad film", "broken camera", "shaky photographer"], "correct_choice_idx": 0, "direct_answers": ["unfocused", "he's running", "he's running", "he's running", "because camera", "moving fast", "hes running", "running", "speed", "time lapse"], "difficult_direct_answer": true, "rationales": ["He is running.", "The stationary objects in the background are not blurry, so there is nothing wrong with the photographer, camera, or film.", "The player is running during a baseball game and the camera was unable to take a sharp shot of the rapid movement."], "image": "train2014/COCO_train2014_000000179520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174601, "question_id": "CxrpfR8KZwM6nv8ntaaLJY", "question": "For what purpose is the green garment hung most likely?", "choices": ["drying it", "signal", "blew there", "reduce draft"], "correct_choice_idx": 0, "direct_answers": ["line", "drying it", "to dry", "drying", "drying", "to dry", "drying", "drying", "drying", "dry"], "difficult_direct_answer": false, "rationales": ["This is the most likely reason. the other options don't really make sense.", "The person is drying their clothes outside.", "The garment could be hung to make sure the wetness evaporates."], "image": "train2014/COCO_train2014_000000174601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549386, "question_id": "CyGgS2oju3NeApnLR3o4q5", "question": "How many people wear red shirts?", "choices": ["none", "two", "one", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "one", "two", "one", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The picture shows only one person wearing red colored shirt.", "Two figures with arms, legs, and a head are visible in the picture wearing such attire.", "One is a mannequin"], "image": "val2014/COCO_val2014_000000549386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176790, "question_id": "CyLQZNXhWk3ZaJUfpwN7tN", "question": "What color best describes the dress?", "choices": ["teal", "purple", "burgundy", "red"], "correct_choice_idx": 0, "direct_answers": ["turquoise", "blue", "teal", "blue", "blue", "turquoise", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The dress is a bright blue color.", "The girl's dress is a bright shade of blue called teal.", "The dress is a shade of teal."], "image": "train2014/COCO_train2014_000000176790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173783, "question_id": "CyciVHn7y4NcsZfLMmX2SD", "question": "What does the item in the middle of the phones look like?", "choices": ["umbrella", "marshmallow", "jelly beans", "baby"], "correct_choice_idx": 2, "direct_answers": ["kernels", "jelly beans", "popcorn", "jellybean", "popcorn", "jelly beans", "corn", "person", "corn", "kernels"], "difficult_direct_answer": false, "rationales": ["There are bean like items in the middle.", "They are small and yellow.", "They are small and in the shape of beans, and look like they may be covered in sugar."], "image": "val2014/COCO_val2014_000000173783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478845, "question_id": "Cymag6Rzd8XHYnBcsDLUJB", "question": "Which food is the most unhealthy?", "choices": ["banana", "candy", "coffee", "orange"], "correct_choice_idx": 1, "direct_answers": ["sweets", "candy", "candy", "candy", "sweets", "candy", "candy", "sweets", "candy", "candy"], "difficult_direct_answer": false, "rationales": ["The candies are the most unhealthy food here.", "The options next to the laptop are healthy. the rest usually contains a lot of sugar.", "The candy is bad."], "image": "train2014/COCO_train2014_000000478845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83268, "question_id": "Cys6yKBnM2nxdzWrBpww8B", "question": "What type of athlete is this?", "choices": ["gymnast", "cheerleader", "surfer", "snowboarder"], "correct_choice_idx": 3, "direct_answers": ["snowboarder", "snowboarder", "snowboarder", "acrobat", "skier", "skier", "acrobat", "skier", "snowboarder", "skier"], "difficult_direct_answer": false, "rationales": ["There is an athlete visible on the right that is strapped into a board that's appearance is consistent with answer a.", "These are snowboarders", "The athletes are using snowboards."], "image": "train2014/COCO_train2014_000000083268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570773, "question_id": "Cz33dnSs5D5SWuyo5to9Xa", "question": "From where did this person directly come?", "choices": ["up high", "sun valley", "below", "ski lodge"], "correct_choice_idx": 0, "direct_answers": ["up high", "up hill", "home", "mountains", "ski lift", "uphill", "hill", "mountain", "mountain top", "ground"], "difficult_direct_answer": true, "rationales": ["The man is going down a mountain so must have came from up higher.", "The person skiing on the mountain was higher up and skied to their location.", "The person is on a slope and is moving away from the higher side of the slope meaning they have come from a higher location in order to get to where they are now."], "image": "val2014/COCO_val2014_000000570773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88572, "question_id": "Cz4Jp92VwENihRL4QLYka2", "question": "What important protective gear should this kid wear?", "choices": ["sunglasses", "helmet", "knee pads", "elbow pads"], "correct_choice_idx": 1, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["If a youngster is going to try to maintain his balance on a skateboard, he should, at the least, be wearing a helmet for protection. elbow and knee pads aren't a bad idea either.", "His head should be protected because a fall could cause serious damage", "The gear is a helmet."], "image": "train2014/COCO_train2014_000000088572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172265, "question_id": "CzRdXuM6GACNiJYeAxKwSY", "question": "What character has a name that includes the longest part of this animal?", "choices": ["man-at-arms", "basilica hands", "mekaneck", "edward scissorhands"], "correct_choice_idx": 2, "direct_answers": ["giraffe", "mekaneck", "no-neck chump", "long neck", "neck", "giraffe", "neck man", "long neck", "long neck", "long neck"], "difficult_direct_answer": false, "rationales": ["The giraffe has a name for its neck.", "The giraffe's neck is the longest, and mekaneck is a character that has a long neck.", "The character is mekaneck."], "image": "val2014/COCO_val2014_000000172265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460342, "question_id": "CzZVn3h5PedboixZFyXmfp", "question": "What is this place?", "choices": ["ski slope", "bus stop", "ice rink", "playground"], "correct_choice_idx": 2, "direct_answers": ["skating rink", "ice rink", "ice rink", "ice rink", "skating rink", "construction area", "ice skating", "park", "rest stop", "camp"], "difficult_direct_answer": false, "rationales": ["This is obvious based on what the people are doing in the background. that said, it could be a winter conversion of d.", "The place is an ice rink.", "There are people skating there."], "image": "train2014/COCO_train2014_000000460342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149467, "question_id": "Czdnjqivu2MscSgvaouEsG", "question": "What team does the catcher play for?", "choices": ["cubs", "yankees", "astros", "mets"], "correct_choice_idx": 0, "direct_answers": ["unknown", "cubs", "chicago cubs", "chicago cubs", "stairs", "defense", "dodgers", "chicago cubs", "braves", "baseball"], "difficult_direct_answer": false, "rationales": ["The team is the cubs.", "The pitcher and catcher play for the new york cubs.", "The chicago cubs team colors are white, blue, and red."], "image": "train2014/COCO_train2014_000000149467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14337, "question_id": "Czm35tiVWQtULx2DonkFpP", "question": "How many eaters are they expecting?", "choices": ["ten", "six", "14", "12"], "correct_choice_idx": 2, "direct_answers": ["many", "three", "15", "twelve", "many", "lot", "lots", "14", "fourteen", "two"], "difficult_direct_answer": true, "rationales": ["There are many hot dogs on the tray.", "Each row has seven cups. there are two rows.", "There are 14 people."], "image": "train2014/COCO_train2014_000000014337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330570, "question_id": "CzwX9FXtjEaz2BtSWDEBTR", "question": "Why are they removing a dirty appliance?", "choices": ["condemned house", "dust storm", "animals playing", "weather-beaten"], "correct_choice_idx": 0, "direct_answers": ["too old", "replacement", "it's old", "condemned house", "demolition", "cleaning", "safety", "relocating", "broken", "its abandoned"], "difficult_direct_answer": true, "rationales": ["The house looks dilapidated.", "The debris on the ramp and the person's clothing suggest this option.", "The appliance is being removed from the old house."], "image": "train2014/COCO_train2014_000000330570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514243, "question_id": "D29dPvqMrYMjWFbDV4fR9t", "question": "What type of shoes is the woman wearing?", "choices": ["sneakers", "heels", "flat", "water shoes"], "correct_choice_idx": 1, "direct_answers": ["sandals", "sandals", "sandals", "sandals", "heels", "sandals", "sandals", "sandals", "sandals", "sandals"], "difficult_direct_answer": false, "rationales": ["A girl is wearing a strappy shoe with an elevated sole.", "The woman is wearing sandals that have no heel.", "The woman sitting on the bench is wearing white shoes with high heels."], "image": "train2014/COCO_train2014_000000514243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354863, "question_id": "D2JY6dLSnkiikg4amWGom3", "question": "What scientific class does the animal on the sign belong to?", "choices": ["loricifera", "asteroidea", "insecta", "aves"], "correct_choice_idx": 3, "direct_answers": ["bird", "aves", "kingdom", "bird", "ornithology", "bird", "aves", "aves", "aves", "birds"], "difficult_direct_answer": false, "rationales": ["The sign has a red bird on it which belongs to the scientific class aves.", "This scientific class is adapted to fly.", "The animal on the sign is a bird and that is its classification."], "image": "train2014/COCO_train2014_000000354863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512961, "question_id": "D2RLrpkPqod67QDrshqmuN", "question": "What type of scene is this?", "choices": ["power plant", "train station", "model", "farm"], "correct_choice_idx": 2, "direct_answers": ["bucolic", "toy model", "model", "miniature", "miniature", "quaint model", "farm", "rural", "model trains", "rural"], "difficult_direct_answer": false, "rationales": ["There is a small train and houses present with figuirines.", "The scene is made of models.", "These are miniature depictions of these items"], "image": "train2014/COCO_train2014_000000512961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14611, "question_id": "D2TyYFB6gp7yUDazJVfknC", "question": "What material are the roof of the boats made of?", "choices": ["plastic", "wood", "metal", "bamboo"], "correct_choice_idx": 3, "direct_answers": ["straw", "wood", "wood", "wood", "branches", "wood", "sticks", "tree branch", "bamboo", "bamboo"], "difficult_direct_answer": false, "rationales": ["The roofing method comprises of also thicket.", "The bamboo on the roof is to protect the people inside.", "The roofs of the boats are made of long shoots of bamboo."], "image": "train2014/COCO_train2014_000000014611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34920, "question_id": "D2VFnMvMszzUuzaNoJaiwG", "question": "Why is the zebra by itself?", "choices": ["is eating", "not giraffe", "is sleeping", "is hungry"], "correct_choice_idx": 1, "direct_answers": ["not giraffe", "sleeping", "resting", "resting", "laying down", "lonely", "not true", "resting", "resting", "different species"], "difficult_direct_answer": false, "rationales": ["There are only giraffes beside the zebra.", "Species tend to hang out together with their own species.", "He looks like he is resting in the background."], "image": "train2014/COCO_train2014_000000034920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215858, "question_id": "D2fJ2qHTpMeEF2dSPiJKYA", "question": "What is the total sum of the each digit on the jersey?", "choices": ["41", "14", "five", "140"], "correct_choice_idx": 2, "direct_answers": ["five", "five", "five", "five", "five", "five", "five", "number 5", "five", "five"], "difficult_direct_answer": false, "rationales": ["One plus four equals five.", "The jersey has the number 140 printed on it. the sum of those digits is 5.", "A boy is wearing the number 140 on his jersey while he skis. one plus four is five."], "image": "train2014/COCO_train2014_000000215858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97683, "question_id": "D2ifJiatnmxtGgjrzUHMRr", "question": "The picture on the screen is in what item category?", "choices": ["clothing", "books", "food", "weapons"], "correct_choice_idx": 3, "direct_answers": ["sword", "weapon", "lightning", "mobile", "knife", "sword", "emoji", "weapons", "vehicle", "weapon"], "difficult_direct_answer": false, "rationales": ["A sword is shown.", "It is a sword. swords can be used to inflict pain and/or death.", "A knife is shown and considered a weapon."], "image": "train2014/COCO_train2014_000000097683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553370, "question_id": "D2zJTNAhjgWwhmLUZDDmiu", "question": "Why is the large boat stopped by the small boat?", "choices": ["to fish", "to deliver", "to race", "to help"], "correct_choice_idx": 3, "direct_answers": ["to help", "rescue", "traffic", "need help", "sinking", "to rescue", "searchrescue", "offering assistance", "assistance", "rescue"], "difficult_direct_answer": true, "rationales": ["The man on the big boat is getting ready to throw a life vest to the little boat.", "The people on the large boat are helping the people on the smaller one.", "The bigger boat is owned by the coast guard, who help other boats."], "image": "train2014/COCO_train2014_000000553370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533888, "question_id": "D39UZGnug8J3NLhQPwVbn5", "question": "What would a person stay away from if they liked only cooked food?", "choices": ["yakitori", "rice", "shrimp tempura", "sushi"], "correct_choice_idx": 3, "direct_answers": ["sushi", "hotel", "sushi", "sushi", "peas", "sushi", "sushi", "raw food", "sushi", "sushi"], "difficult_direct_answer": false, "rationales": ["The person wouldn't eat sushi.", "Sushi is not cooked.", "Sushi is laid out on a table. sushi is raw."], "image": "train2014/COCO_train2014_000000533888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530408, "question_id": "D3RtYb5dxXTg84ZPeXuE5M", "question": "What vehicle is shown?", "choices": ["motorcycles", "cars", "trucks", "buses"], "correct_choice_idx": 0, "direct_answers": ["motorcycle", "mopeds", "motorcycles", "motorcycles", "motorcycle", "motorcycles", "motorcycle", "motorbikes", "motorcycle", "motorcycles"], "difficult_direct_answer": false, "rationales": ["The vehicles have two wheels and a motor.", "These are a type of motorcycle.", "The vehicles have two, not four or more, wheels."], "image": "train2014/COCO_train2014_000000530408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222361, "question_id": "D3S5wh9e7aPv6JKYdQihj2", "question": "Who is this man supposed to be playing?", "choices": ["john wayne", "lone ranger", "roy rogers", "audie murphy"], "correct_choice_idx": 0, "direct_answers": ["john wayne", "cowboy", "john wayne", "cowboy", "cowboy", "john wayne", "cowboy", "john wayne", "cowboy", "cowboy"], "difficult_direct_answer": false, "rationales": ["The man is dressed like a cowboy. john wayne was a famous cowboy.", "Jone wayne is a cowboy. this man is dressed like him.", "John wayne rides horses."], "image": "train2014/COCO_train2014_000000222361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213203, "question_id": "D3pcZ6EmqgKNoNRf34rtbM", "question": "How many items are on pedestals?", "choices": ["five", "four", "three", "seven"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "3 items", "three", "three", "3 items", "three", "three"], "difficult_direct_answer": false, "rationales": ["Three items show up on the stands.", "There are two vases and a teapot on the pedestals.", "There are three vases shown."], "image": "train2014/COCO_train2014_000000213203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492131, "question_id": "D3ujtxwtwgfk9V7bmGZHEJ", "question": "What allowed him to achieve that height?", "choices": ["strength", "speed", "ramp", "cars"], "correct_choice_idx": 2, "direct_answers": ["ramp", "ramp", "ramp", "ramp", "ramp", "ramp", "ramp", "ramp", "ramp", "ramp"], "difficult_direct_answer": false, "rationales": ["There is a long, steep diagonal item that the motorcycle used to jump the cards pictured.", "He jumped off the slanted piece of wood.", "You need the apparatus pictured here to achieve the height."], "image": "train2014/COCO_train2014_000000492131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232076, "question_id": "D3veUniuMXf2LX6tDPCRTm", "question": "Who would be married to the person that is listed on the street name?", "choices": ["dauphine", "princess", "queen", "duchess"], "correct_choice_idx": 2, "direct_answers": ["queen", "queen", "no", "queen", "opinion person", "queen", "queen", "queen", "queen", "queen"], "difficult_direct_answer": false, "rationales": ["This is the highest-ranked female royal title.", "The street name is clearly visible and readable and known to refer to the spouse of answer a.", "Kings wives are referred to as queens."], "image": "train2014/COCO_train2014_000000232076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227828, "question_id": "D3y6GiZK9SYmV7qkPz74c6", "question": "What other activity can be carried out here besides surfing?", "choices": ["paddling", "rafting", "canoeing", "swimming"], "correct_choice_idx": 3, "direct_answers": ["swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "fishing", "fishing", "fishing", "swimming"], "difficult_direct_answer": false, "rationales": ["People love to swim in the ocean.", "The activity is swimming.", "The man can swim in the water."], "image": "train2014/COCO_train2014_000000227828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497009, "question_id": "D42us2kXsnsxKMmVtDJAU6", "question": "This man looks most like what celebrity?", "choices": ["ryan gosling", "emma stone", "howard stern", "idris elba"], "correct_choice_idx": 2, "direct_answers": ["howard stern", "howard stern", "al franken", "taylor swift", "howard stern", "prince", "howard stern", "neil gaiman", "carrot top", "howard stern"], "difficult_direct_answer": false, "rationales": ["The person is a white male with dark, curly, longish hair just like the celebrity radio personality.", "The person is a white man, not a woman or black man. he is wearing glasses and does not have blonde hair.", "He has the curly hair and glasses that make him look like howard stern."], "image": "val2014/COCO_val2014_000000497009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434358, "question_id": "D4Rv5ohTeDGFVvXS4VbNTt", "question": "What type of Tennis game is being played here?", "choices": ["women's doubles", "mixed doubles", "singles", "mens doubles"], "correct_choice_idx": 0, "direct_answers": ["doubles", "professional", "match", "doubles", "doubles", "doubles", "female championship", "batmentan", "doubles", "women's doubles"], "difficult_direct_answer": false, "rationales": ["Two women play against another team consisting of two players.", "There are two women on the same side of the net", "There are two women tennis players playing together on one side of the court, making them a team."], "image": "val2014/COCO_val2014_000000434358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28842, "question_id": "D4WeqKifjPY4gwjiJWf2GT", "question": "Which person could be attacked by the dog first?", "choices": ["black shirt", "green shirt", "red shirt", "white shirt"], "correct_choice_idx": 1, "direct_answers": ["white shirt", "sitting", "beige shirt", "skateboarder", "man", "green shirt", "left", "scaring person", "green shirt", "left"], "difficult_direct_answer": false, "rationales": ["The person in the green shirt is closest to the dog.", "A man is in the middle doing a trick on a board. a dog is chasing him and right to the right of him barking.", "The dog is running toward the one in the green."], "image": "val2014/COCO_val2014_000000028842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38336, "question_id": "D4c3oQdunMV5VwqAVXU8Fp", "question": "Which country is this class most likely taught in?", "choices": ["mexico", "saudi arabia", "india", "china"], "correct_choice_idx": 2, "direct_answers": ["india", "india", "india", "india", "indian", "indian", "india", "india", "india", "india"], "difficult_direct_answer": false, "rationales": ["The students are all indian.", "People have saris.", "Based on the attire of the women students it indicates the country they are from since they mostly wearing saris."], "image": "val2014/COCO_val2014_000000038336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314109, "question_id": "D4nErtpUdeg34gABHEQncV", "question": "How many stops will there be before the bus arrives at its destination?", "choices": ["two", "three", "one", "zero"], "correct_choice_idx": 3, "direct_answers": ["one", "three", "one", "zero", "one", "several", "none", "two", "one", "zero"], "difficult_direct_answer": false, "rationales": ["The bus says that it only has one destination left which is its final stop.", "The bus's only destination is written on the front of the bus and no other destinations will be visited.", "It only goes to one destination"], "image": "train2014/COCO_train2014_000000314109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521176, "question_id": "D4oHXEYXnNSyX3pSdqZTXr", "question": "What is in the metal tin?", "choices": ["water", "erasers", "paint", "snacks"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "paint", "water", "water", "water", "water", "water", "paint", "paint"], "difficult_direct_answer": false, "rationales": ["There is water, most likely to clean up the paint off the brush.", "The man is water painting and the tin is used to hold water.", "The person is painting a scene with paint and a paintbrush. when painting, especially on a scene as this, painters may need to clean their brushes which they would use water for and need a container to hold the water."], "image": "train2014/COCO_train2014_000000521176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392162, "question_id": "D4thR8bDJP9Epa58otCtvU", "question": "What animal are these?", "choices": ["cow", "horse", "donkey", "goat"], "correct_choice_idx": 0, "direct_answers": ["cows", "cows", "cows", "cows", "cows", "cows", "cows", "cow", "cows", "cows"], "difficult_direct_answer": false, "rationales": ["The other options aren't in the image. they would also be referred to as cattle.", "The animals are cows.", "This is a herd of cows all together in a pasture."], "image": "train2014/COCO_train2014_000000392162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224702, "question_id": "D4u8uWXkmDJeh5CJhJzvDm", "question": "What is this type of hairstyle called?", "choices": ["crew cut", "mullet", "dreadlocks", "buzz cut"], "correct_choice_idx": 1, "direct_answers": ["mullet", "mullet", "curly", "mullet", "shag", "mullet", "mullet", "mullet", "mullet", "mullet"], "difficult_direct_answer": false, "rationales": ["His hair is long in the back.", "His hair is short in the front but longer in back.", "The man's hair is short in front and long in back."], "image": "val2014/COCO_val2014_000000224702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15356, "question_id": "D5ANDRDPstsYRsCRLtmcNm", "question": "What type of information is on the digital sign?", "choices": ["destination", "cost", "warning", "brand"], "correct_choice_idx": 0, "direct_answers": ["destination", "route destination", "next stop", "next stop", "destination", "next destination", "next destination", "handicap", "destination", "address"], "difficult_direct_answer": false, "rationales": ["The sign is showing where the bus is going.", "Buses display where they are going on digital signs.", "The other details are obviously not on it. a is usually displayed in this position on buses around the world that have digital signs."], "image": "train2014/COCO_train2014_000000015356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260604, "question_id": "D5FQauMa7QLnwTygQ9DM8q", "question": "Which popular toy has been used to build this scene?", "choices": ["magna-tiles", "lego", "lincoln logs", "k'nex"], "correct_choice_idx": 1, "direct_answers": ["lego", "legos", "lego", "lego", "commercial vehicle", "wood blocks", "lego", "legos", "lego", "bus"], "difficult_direct_answer": false, "rationales": ["You can see the small blocks of the lego toy. also those little action figures are clearly lego action figures.", "The scene was built with the popular children's toy made of multicolor plastic interlocking bricks.", "All of the pieces are made of tiny blocks popped together."], "image": "train2014/COCO_train2014_000000260604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383556, "question_id": "D5gwdEWZq4Ad7ENjGBbopa", "question": "What is missing to make a classic sandwich?", "choices": ["mustard", "pickles", "onions", "lettuce"], "correct_choice_idx": 3, "direct_answers": ["lettuce", "lettuce", "lettuce", "lettuce", "top bun", "lettuce", "lettuce", "cheese", "lettuce", "lettuce"], "difficult_direct_answer": false, "rationales": ["There is no lettuce on the sandwich which is needed for a blt.", "This classic blt has the bacon and the tomato but is missing the \"l\" ingredient.", "A blt contains bacon, greens and tomatoes. the greens are missing."], "image": "train2014/COCO_train2014_000000383556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348519, "question_id": "D5oAVx6i9WovUdu9X293wm", "question": "What is the style of this meat called?", "choices": ["stuffed", "shredded", "circular", "styled"], "correct_choice_idx": 1, "direct_answers": ["pulled pork", "pulled", "shredded", "shredded", "pulled pork", "bbq", "pulled", "pulled pork", "shredded", "pulled"], "difficult_direct_answer": false, "rationales": ["This type of meat is cooked then pulled apart to eat like that on a sandwich.", "Meat can be prepared in many different ways. if is is broken up from its original content, it would be in a shredded form.", "The meat has been pulled apart."], "image": "val2014/COCO_val2014_000000348519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562735, "question_id": "D5sP4pmb4LwY7Gqg6hVZ74", "question": "Where is Restek's headquarters?", "choices": ["california", "florida", "texas", "utah"], "correct_choice_idx": 0, "direct_answers": ["bellefonte pennsylvania", "bellefonte", "pennsylvania", "pennsylvania", "california", "bellefonte pa", "usa", "bellefonte pa", "bellefonte pennsylvania", "bellefonte pa"], "difficult_direct_answer": false, "rationales": ["A logo is on a mug with an address as well.", "It is headquartered in pennsylvania.", "They are located in ca."], "image": "val2014/COCO_val2014_000000562735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305980, "question_id": "D5whcBmb29DFpduqmawSoF", "question": "What team is the catcher on?", "choices": ["phillies", "mets", "yankees", "braves"], "correct_choice_idx": 1, "direct_answers": ["mets", "mets", "blue jays", "red", "defense", "blue team", "ny mets", "visitor", "newyork mets", "arizona"], "difficult_direct_answer": true, "rationales": ["The player has the colors of the new york mets.", "The catcher is wearing blue and orange gear.", "The catcher is wearing a blue and orange mets uniform."], "image": "train2014/COCO_train2014_000000305980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246029, "question_id": "D68iachZKGpU4KgQ5sfLTY", "question": "On the front man what is most protected?", "choices": ["knees", "shins", "chest", "nose"], "correct_choice_idx": 0, "direct_answers": ["knees", "head", "head", "knees", "head", "knees", "knees", "knees", "joints", "knees"], "difficult_direct_answer": false, "rationales": ["His knees are totally covered with protective pads.", "This person has knee pads on to protect their knees.", "The man is wearing large kneepads."], "image": "train2014/COCO_train2014_000000246029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27226, "question_id": "D6AbBVGmVTVXNg9UPBVuj9", "question": "How many years back the photograph was taken?", "choices": ["ten", "eight", "five", "seven"], "correct_choice_idx": 1, "direct_answers": ["eight", "nine", "nine years", "eight", "eight", "eight years", "eight", "nine", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["The date is seen in the bottom right hand corner so we can figure out how many years ago that was by doing some simple math.", "The picture was taken on june 15, 2013. it is currently march 9, 2022. i calculated the duration of time and then chose the closest option.", "The date is in the corner of the photo"], "image": "val2014/COCO_val2014_000000027226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369982, "question_id": "D6XrDyCDmeEoLi9rcjebJa", "question": "Where is this pool located at?", "choices": ["resort", "public park", "winery", "backyard"], "correct_choice_idx": 3, "direct_answers": ["park", "backyard", "resort", "backyard", "report", "hill", "park", "fenced yard", "back yard", "park"], "difficult_direct_answer": false, "rationales": ["The pool is located in the backyard of a house in a residential area.", "This looks like a private pool.", "The pool is in a grassy backyard area."], "image": "val2014/COCO_val2014_000000369982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400915, "question_id": "D6dPhCg3uveGTjkci5pZ3X", "question": "What is the syncline referred to as?", "choices": ["hole", "dip", "cave", "bowl"], "correct_choice_idx": 3, "direct_answers": ["tube", "skate park", "bowl", "halfpipe", "ramp", "unknown", "bowl", "bowl", "closer center", "pipeline"], "difficult_direct_answer": false, "rationales": ["The syncline is in the same shape as a bowl.", "The syncline dips inward in a bowl shape.", "It is a bowl shape."], "image": "val2014/COCO_val2014_000000400915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373284, "question_id": "D6zeBGyCNfULL4W5D2NgHw", "question": "With just a knowledge of English which service could you most easily find here?", "choices": ["atm", "scooter repair", "hotel", "restaurant"], "correct_choice_idx": 0, "direct_answers": ["atm", "atm", "atm", "atm", "atm", "atm", "atm", "atm", "atm", "atm"], "difficult_direct_answer": false, "rationales": ["Atm signs just feature three letters.", "These are the only english words visible.", "There is a large green sign that shows atm and exchange in english. everything else is written in different languages on signs."], "image": "val2014/COCO_val2014_000000373284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47022, "question_id": "D74vF68ze94kKsTdJdiVeb", "question": "Why is the bird indoors?", "choices": ["flew in", "pet bird", "veterinarian visit", "it's stuffed"], "correct_choice_idx": 0, "direct_answers": ["lost", "flew in", "wants food", "stuck", "pet", "by accident", "trapped", "lost", "pet", "pet"], "difficult_direct_answer": false, "rationales": ["The bird must have flown in from a window.", "Considering that the bird has wings and that flying is his most preferred method of movement, one would have to assume that the bird is present in this location because he used his wings to fly into this location.", "The bird flew in."], "image": "train2014/COCO_train2014_000000047022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176588, "question_id": "D7B3n3nvwfqWKJpundvJqK", "question": "What activity are the people carrying out?", "choices": ["playing volleyball", "playing frisbee", "dancing", "hopscotch"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "tag", "frisbee", "kickball", "frisbee throwing", "game", "soccer game", "playing frisbee", "tag", "frisbee"], "difficult_direct_answer": false, "rationales": ["They are throwing discs", "The boy in the front of the picture has a green disc in his hand that he is in the process of throwing. there are other people in the picture that are in the ready position to catch the disc.", "At least two people are holding colorful discs."], "image": "train2014/COCO_train2014_000000176588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164001, "question_id": "D7DYw3hFycTa2KK4gCM78m", "question": "What is in the tin box?", "choices": ["salt", "pepper", "sugar", "napkins"], "correct_choice_idx": 3, "direct_answers": ["napkins", "coffee", "hot chocolate", "paper napkins", "tea", "coffee", "napkins", "hot chocolate", "napkins", "coffee"], "difficult_direct_answer": false, "rationales": ["Its a place to store napkins", "The tin box has openings on the side that have white napkins at the surface ready for use.", "A silver square with white objects is on a table. napkins are often kept on a table."], "image": "train2014/COCO_train2014_000000164001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373294, "question_id": "D7Jx2x4ZvSUjfVaPgqr7Np", "question": "What is under the broccoli?", "choices": ["macaroni", "tomato", "potato", "beans"], "correct_choice_idx": 0, "direct_answers": ["macaroni", "noodles", "macaroni", "macaroni", "macaroni", "pasta", "potatoes", "pasta", "pasta", "pasta"], "difficult_direct_answer": false, "rationales": ["There is mac and cheese under the broccoli.", "The dish features mac and cheese.", "The broccoli has mac."], "image": "train2014/COCO_train2014_000000373294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93992, "question_id": "D7Pay2RS5ZEAefa2MFsvLE", "question": "Why are the dishes empty?", "choices": ["already ate", "didn't use", "just decoration", "contained salad"], "correct_choice_idx": 0, "direct_answers": ["food eaten", "not served", "people ate", "food eaten", "almost eaten", "not served", "already ate", "to serve", "people ate", "eaten"], "difficult_direct_answer": false, "rationales": ["When dishes are on a table and the contents is gone from them, it is usually because they have eaten", "The dishes have been eaten by others.", "There are remnants of food left over in the empty dishes."], "image": "train2014/COCO_train2014_000000093992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489199, "question_id": "D7STFPXAbc6uxjjnSUT8Vr", "question": "What will get into the food if the dog starts to shed?", "choices": ["his saliva", "his paws", "his bark", "his fur"], "correct_choice_idx": 3, "direct_answers": ["fur", "hair", "hair", "fur", "hair", "hair", "his fur", "fur", "fur", "hair"], "difficult_direct_answer": false, "rationales": ["A man is holding dog in a kitchen. dogs shed.", "Dog hair will get in food if the dog sheds.", "These animals have fur and shed at certain times of the year."], "image": "train2014/COCO_train2014_000000489199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271759, "question_id": "D7XZqg6Dci6hieF6vLXVty", "question": "What is the woman doing to her cake?", "choices": ["stirring", "freezing", "puncturing", "cross hatching"], "correct_choice_idx": 3, "direct_answers": ["slicing", "cutting", "cutting it", "cutting", "cutting", "cutting", "cutting", "cutting", "cutting it", "cross hatching"], "difficult_direct_answer": false, "rationales": ["That's what the process is called.", "The woman is crosshatching.", "She is cutting in a pattern that is done when cross hatching a cake."], "image": "val2014/COCO_val2014_000000271759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335153, "question_id": "D7mwDHA3PZyzxfAr8gQJxp", "question": "What are these people watching?", "choices": ["singing contest", "video game", "tv show", "news report"], "correct_choice_idx": 1, "direct_answers": ["tv", "game screen", "video game", "music", "video game", "video game", "video game", "video game", "wii screen", "video game"], "difficult_direct_answer": false, "rationales": ["These people are watching a video game together.", "One person has a game control in their hand", "The child is holding a wii remote in their hand, which is a controller for a video game."], "image": "val2014/COCO_val2014_000000335153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407150, "question_id": "D7pvCeCcbHygqguWAuw2UV", "question": "Why are the items discarded next to the garbage bin?", "choices": ["garbage full", "needs recycling", "it's raining", "someone's items"], "correct_choice_idx": 0, "direct_answers": ["have fallen", "not wanted", "trash", "garbage full", "no room", "trash", "full bin", "homeless", "full", "trash"], "difficult_direct_answer": false, "rationales": ["There are items sticking out of the top of the container.", "These are big items and the hole in the garbage can seems too full to fit them in so they've been discarded next to the can.", "The garbage is full."], "image": "val2014/COCO_val2014_000000407150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98592, "question_id": "D7vExvCjEnQaDnwoHCp6kn", "question": "What does the woman have on her feet?", "choices": ["seashells", "sneakers", "sandals", "boots"], "correct_choice_idx": 3, "direct_answers": ["boots", "motorcycle boots", "boots", "boots", "boots", "boots", "boots", "boots", "bike", "boots"], "difficult_direct_answer": false, "rationales": ["The woman is wearing white boots.", "It would not be appropriate to wear seashells as shoes. she is not wearing sandals or sneakers", "These are to protect feet while riding"], "image": "train2014/COCO_train2014_000000098592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569996, "question_id": "D8GqmGS47sLZrGFmpWg8Hc", "question": "What form of tennis is this?", "choices": ["women's doubles", "men's doubles", "mixed doubles", "men's singles"], "correct_choice_idx": 1, "direct_answers": ["doubles", "doubles", "double", "men's doubles", "doubles", "doubles", "duos", "doubles", "mixed doubles", "doubles"], "difficult_direct_answer": false, "rationales": ["A man and a woman are playing on the same team.", "The form of tennis shown is mixed doubles when there is a man and woman on each team", "There looks to be two people on the court on the same side and there is one male and one female so i would guess its mixed doubles."], "image": "train2014/COCO_train2014_000000569996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52638, "question_id": "D8QmhhG498q26oDgtLi4HC", "question": "Who is controlling the elephant?", "choices": ["first kid", "fat man", "third kid", "last woman"], "correct_choice_idx": 1, "direct_answers": ["handler", "trainer", "trainer", "kids", "trainer", "guide", "man", "trainer", "man", "fat man"], "difficult_direct_answer": false, "rationales": ["The kids and women are passengers. they are not controlling the elephant.", "There should be someone making sure the elephant stays calm because it's carrying people. the large man is the only person near the elephant so it's safe to assume he is controlling the elephant.", "A large man is walking next to the head of the elephant"], "image": "val2014/COCO_val2014_000000052638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567430, "question_id": "D8dhHDDivwJJT36L43BVoK", "question": "What is the name for the large tower in the airport?", "choices": ["liberty tower", "control tower", "eiffel tower", "birds nest"], "correct_choice_idx": 1, "direct_answers": ["watch tower", "traffic control", "traffic control", "watch tower", "watch tower", "air control", "control tower", "control tower", "tower control", "lookout"], "difficult_direct_answer": false, "rationales": ["The building needs to be high up in the air to help guide the planes.", "The tower controls the planes' flights.", "It is the tower in which people can be above and see the goings-on on the runway, in order to control the movement of the planes."], "image": "train2014/COCO_train2014_000000567430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384122, "question_id": "D8mqhQUkL7zz2QBPz8tKP6", "question": "The greeting at the top is synonymous with what country?", "choices": ["belgium", "curacao", "tanzania", "philippines"], "correct_choice_idx": 3, "direct_answers": ["phillipines", "phillipines", "mabuhay", "asia", "phillipines", "india", "philippines", "phillipines", "phillipines", "india"], "difficult_direct_answer": false, "rationales": ["It says mabuhay.", "It's a tagalog word that means long live.", "The greeting is generally a pilipino greeting."], "image": "train2014/COCO_train2014_000000384122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384146, "question_id": "D8nCKeEDJDSLVXbzDyazAm", "question": "In what do these horses walk?", "choices": ["bull run", "slaughter", "last roundup", "parade"], "correct_choice_idx": 3, "direct_answers": ["parade", "street", "concrete", "parade", "parade", "parade", "horseshoes", "pairs", "horseshoes", "parade"], "difficult_direct_answer": false, "rationales": ["The horses are walking in the middle of road in with people onlooking which means they are part of a parade.", "There are people on the street like this is a typical parade.", "They are walking down a street slowly, with spectators watching on the side, suggesting this is a parade."], "image": "train2014/COCO_train2014_000000384146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533408, "question_id": "D9AfwSYGsbtce4EamZodjW", "question": "What do people usually have to do for this vehicle?", "choices": ["ride shotgun", "wear seatbelts", "wear helmets", "wait"], "correct_choice_idx": 3, "direct_answers": ["wait", "travelling", "wait", "pay", "wait", "pay toll", "stop", "wait", "pay", "wait"], "difficult_direct_answer": false, "rationales": ["People have to wait at the bus stop.", "The vehicle is a public transit bus. it does not have a front seat or seatbelts.", "The other options usually don't apply to a bus."], "image": "val2014/COCO_val2014_000000533408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19499, "question_id": "D9BFZL9ttXXeaw7uBUnQo3", "question": "What type of accident is this?", "choices": ["upside down", "roll-over", "turn-up", "t-bone"], "correct_choice_idx": 1, "direct_answers": ["vehicle", "vehicle accident", "car", "automobile", "car wreck", "car accident", "roll-over", "automotive", "car crash", "auto"], "difficult_direct_answer": true, "rationales": ["The car rolled over. the wheels are turned.", "The accident has caused the car to roll over.", "This is a rollover wreck, which is obvious because the vehicle is lying upside down on its roof. hopefully, all involved are not seriously injured."], "image": "train2014/COCO_train2014_000000019499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399512, "question_id": "D9CykqY6ZVgnmG3K5gsxnW", "question": "How many minutes can a person legally park here?", "choices": ["eighty", "sixty", "thirty", "seventy"], "correct_choice_idx": 2, "direct_answers": ["30", "thirty", "thirty", "thirty", "thirty", "30", "thirty", "thirty", "thirty", "30"], "difficult_direct_answer": false, "rationales": ["The meter sign says the number of minutes.", "The sign below the stop sign in this image says parking 30 minutes.", "The sign indicates that only 30 minutes of parking are allowed."], "image": "train2014/COCO_train2014_000000399512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427859, "question_id": "D9Edqb3Kc2ZufMbpSdPMZJ", "question": "What is the green bowl on the counter used for?", "choices": ["eating", "sifting", "cooking", "catching water"], "correct_choice_idx": 3, "direct_answers": ["sink", "washing up", "basin", "washing hands", "catching water", "contain water", "sink", "washing hands", "bathroom sink", "washing hands"], "difficult_direct_answer": false, "rationales": ["This is a bathroom, not a kitchen or dining room. the bowl is below a faucet.", "The faucet above the green bowl pours water into the basin for cleaning or washing.", "There is not a sink inside of the cabinet, so there is instead a bowl of water."], "image": "train2014/COCO_train2014_000000427859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407413, "question_id": "D9SRZH23wTxmihty6NAeVH", "question": "If the camera man jumped over the railing closest to them where would they land?", "choices": ["table", "grass", "stairs", "kitchen"], "correct_choice_idx": 2, "direct_answers": ["on stairs", "downstairs", "stairs", "stairs", "downstairs", "floor", "downstairs", "gray mat", "floor", "stairs"], "difficult_direct_answer": false, "rationales": ["The person would land on the stairwell since it juts so far out.", "The stairs are right behind the railing and can be seen below.", "He would land on the stairs."], "image": "train2014/COCO_train2014_000000407413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385716, "question_id": "D9ZKqDRz8FMX8eNm9zFM7S", "question": "What is the building next to the woman?", "choices": ["residential building", "hospital", "office building", "department store"], "correct_choice_idx": 3, "direct_answers": ["store", "alley", "department store", "building", "hotel", "hotel", "department store", "stone building", "store", "concrete"], "difficult_direct_answer": false, "rationales": ["A departement store to shop in.", "This is a large shopping building.", "There are awnings on the large stores."], "image": "train2014/COCO_train2014_000000385716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19766, "question_id": "D9idAj3aHoYuCuMLbtiJBN", "question": "The coating is used on a skateboard?", "choices": ["polyurethane", "nylon", "polyester", "grip coat"], "correct_choice_idx": 0, "direct_answers": ["wax", "clearcoat", "polyurethane", "sealant", "epoxy", "clear coat", "paint", "epoxy resin", "polyurethane coating", "grip tape"], "difficult_direct_answer": true, "rationales": ["The coating on the board is polyurethane.", "They use it to keep it from getting scratched up.", "The answer is commonly known to be a substance used for the purposes of skateboarding."], "image": "train2014/COCO_train2014_000000019766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293895, "question_id": "D9v8PsZxtf9uTUYHfS2awi", "question": "What in the image provides shade?", "choices": ["towers", "towels", "umbrellas", "trees"], "correct_choice_idx": 2, "direct_answers": ["yes", "umbrellas", "porch", "umbrellas", "umbrellas", "umbrella", "umbrella", "umbrella", "umbrellas", "umbrellas"], "difficult_direct_answer": false, "rationales": ["The umbrellas provide shade to any customers.", "The large umbrellas will block the sun out at the tables.", "The objects most used for shade in this picture would be the umbrellas since they are all lined up on the sidewalk, where people are walking under them and are shaded from the sun."], "image": "val2014/COCO_val2014_000000293895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174097, "question_id": "D9yQeRkGCvcYWVVYy5fUiU", "question": "What style meat is served most frequently here?", "choices": ["hot dogs", "steak", "pepperoni", "chops"], "correct_choice_idx": 0, "direct_answers": ["hot dogs", "hot dog", "hot dog", "hot dog", "hot dog", "hot dog", "sausage", "hot dogs", "hot dog", "hot dog"], "difficult_direct_answer": false, "rationales": ["The meat is hot dogs.", "Most of the bread is long and narrow to hold some kind of sausage.", "Most food items are sausage-shaped."], "image": "val2014/COCO_val2014_000000174097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216637, "question_id": "DABaP2JpYuyCyUN4VqNJDP", "question": "If she wants to learn the sport she needs a smaller what?", "choices": ["ball", "racket", "shirt", "shoe"], "correct_choice_idx": 1, "direct_answers": ["tennis racket", "racket", "racket", "racket", "tennis racket", "tennis", "racket", "racket", "tennis racket", "racket"], "difficult_direct_answer": false, "rationales": ["Her clothes are appropriate for the sport. she is not holding a ball.", "The racket is too heavy for the child to swing it.", "She is a little girls and her hands can't reach around the handle."], "image": "train2014/COCO_train2014_000000216637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104486, "question_id": "DACuDyRWVYvwC8d73iZPmP", "question": "Who is the man wearing a red coat?", "choices": ["bell boy", "hotel guest", "hotel manager", "housekeeper"], "correct_choice_idx": 1, "direct_answers": ["male", "gentleman", "tourist", "bellhop", "hotel guest", "bellhop", "old man", "man", "tourist", "elderly man"], "difficult_direct_answer": false, "rationales": ["He seems to be checking out of the hotel with his luggage.", "A man is pushing a cart with luggage out of the entrance doors of an establishment.", "A person is pushing luggage on a cart through a doorway. people use carts to get their luggage to a room in a hotel."], "image": "val2014/COCO_val2014_000000104486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258998, "question_id": "DAJSdutwCqYD2DoEoM62vE", "question": "The animal is looking at what?", "choices": ["horse", "cow", "reflection", "food"], "correct_choice_idx": 2, "direct_answers": ["mirror", "reflection", "mirror", "mirror", "reflection", "reflection", "reflection", "mirror", "reflection", "mirror"], "difficult_direct_answer": false, "rationales": ["The animal is looking at itself in a mirror. it sees this.", "One can see the mirror image of the cat in the glass.", "The cat is looking in a mirror. there are no other animals."], "image": "val2014/COCO_val2014_000000258998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108879, "question_id": "DATZYN4VLPP6VJk7yCcEbw", "question": "What is the color of tennis ball used in earlier days?", "choices": ["red", "white", "green", "yellow"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "white", "green", "green", "yellow", "yellow", "black", "green", "yellow"], "difficult_direct_answer": false, "rationales": ["The tennis ball is yellowish green.", "They used to be more yellowish.", "The color is yellow."], "image": "train2014/COCO_train2014_000000108879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430690, "question_id": "DAgKQnuwgWhbpgCuC2fUtJ", "question": "What is the bright orange traffic sign notifying drivers of?", "choices": ["red light", "parade", "police checkpoint", "construction work"], "correct_choice_idx": 3, "direct_answers": ["construction ahead", "construction", "construction ahead", "construction area", "construction", "hazard", "construction", "construction", "construction ahead", "construction work"], "difficult_direct_answer": false, "rationales": ["The sign tell drivers to watch for work.", "The orange sign is to warn of construction workers, so that drivers are aware to slow down. it is orange because it is more noticeable.", "All construction signs are orange."], "image": "val2014/COCO_val2014_000000430690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348555, "question_id": "DAkt2nAfKXWfYsnCgybfhi", "question": "Which giraffe is farthest from this small herd?", "choices": ["far right", "middle left", "far left", "middle right"], "correct_choice_idx": 2, "direct_answers": ["left giraffe", "left giraffe", "far left", "left", "left giraffe", "left one", "left giraffe", "left one", "left", "smallest"], "difficult_direct_answer": false, "rationales": ["The giraffe is on the left.", "This is obvious based on their positions in the picture.", "The one on the left has the most space between it an another giraffe."], "image": "train2014/COCO_train2014_000000348555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569618, "question_id": "DAqz7MWLQHL5ReavGF9cCb", "question": "What fraction of pizza is shown?", "choices": ["1/3", "1/1", "1/4", "1/2"], "correct_choice_idx": 3, "direct_answers": ["half", "half", "1/2", "half", "half", "half", "half", "half", "half", "half"], "difficult_direct_answer": false, "rationales": ["Half of the pizza is used.", "Exactly one half of the pizza is gone, or 1/2.", "An even amount is missing"], "image": "val2014/COCO_val2014_000000569618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197686, "question_id": "DB3FubuumGuzTFNrV7rAqL", "question": "What continent is the plane in the foreground from?", "choices": ["antarctica", "asia", "north america", "south america"], "correct_choice_idx": 2, "direct_answers": ["north america", "north america", "canada", "north america", "canada", "north america", "canada", "canada", "north america", "canada"], "difficult_direct_answer": false, "rationales": ["The country of origin is on the side of the plane.", "The name of the airline is on the side of the plane.", "The airline is called air canada which is a north american based airline."], "image": "train2014/COCO_train2014_000000197686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489107, "question_id": "DB8FVmwCRK67CxqB6NZ9Mu", "question": "Why is the man on the road wearing a whistle?", "choices": ["crossing guard", "no sidewalk", "street performer", "jaywalking"], "correct_choice_idx": 0, "direct_answers": ["traffic director", "direct traffic", "crossing guard", "traffic cop", "guiding traffic", "traffic control", "signaling", "to communicate", "traffic control", "alerting"], "difficult_direct_answer": true, "rationales": ["The man blows the whistle for safety.", "The crossing guard uses the whistle to direct traffic.", "The man is the guard."], "image": "train2014/COCO_train2014_000000489107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152789, "question_id": "DBHQKALioRgK9idnZk4vA8", "question": "How many legs do the animals have altogether?", "choices": ["two", "six", "ten", "four"], "correct_choice_idx": 2, "direct_answers": ["ten", "four", "ten", "ten", "ten", "four", "four", "four", "six", "eight"], "difficult_direct_answer": false, "rationales": ["There are two four-legged giraffes and one two-legged bird.", "The zebra's backside can be seen as a colorful bird is on top of it. another zebra is in the far distance grazing.", "There are two zebras with four legs each and a bird that has two legs if you add four plus four plus two that adds up to ten."], "image": "train2014/COCO_train2014_000000152789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502581, "question_id": "DBPcBAAwVprLZbjbfJEpgx", "question": "What kind of plate is the boy using?", "choices": ["muppet", "soup", "divider", "bread"], "correct_choice_idx": 2, "direct_answers": ["children's plate", "dinner", "children's plate", "plastic", "divider", "baby", "childrens", "plastic", "toddler plate", "plastic"], "difficult_direct_answer": false, "rationales": ["A kid is sitting in front of a plate that is sectioned of into areas so the food doesn't touch.", "It separates the food into different sections.", "The plate has three separate compartments."], "image": "train2014/COCO_train2014_000000502581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505516, "question_id": "DBrN5zfphbdBVr6bkFckav", "question": "Who famously helped win a 1992 playoff game doing what the boy in the black helmet is doing?", "choices": ["jacob degrom", "manny acta", "pete schourek", "sid bream"], "correct_choice_idx": 3, "direct_answers": ["sidney bream", "baseball", "great athlete", "sid bream", "sliding", "unknown", "pete rose", "sid bream", "lou gerri", "slide"], "difficult_direct_answer": true, "rationales": ["Sid bream helped win the game.", "Sid bream slid into home plate.", "Sid bream helped win a 1992 playoff game by sliding."], "image": "val2014/COCO_val2014_000000505516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84128, "question_id": "DCCD3QXBBcJXeY2aLFkmmD", "question": "What type of video game is the woman probably playing?", "choices": ["driving", "tennis", "fighting", "swimming"], "correct_choice_idx": 0, "direct_answers": ["driving", "racing", "racing", "car racing", "racing", "driving", "wii", "racing", "driving game", "wii"], "difficult_direct_answer": false, "rationales": ["The woman is holding a controller shaped like a steering wheel for a car.", "The game is mimicking driving.", "The woman is holding a steering wheel."], "image": "train2014/COCO_train2014_000000084128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436174, "question_id": "DCaUjCUm4cnPbBhZAzRyhL", "question": "Which direction are all the front wheels facing?", "choices": ["sideways", "right", "straight", "left"], "correct_choice_idx": 3, "direct_answers": ["left", "left", "right", "right", "left", "left", "right", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["They are all facing to the left.", "We are facing the bikes and the wheel is to the right.", "The wheels are all turned to the left."], "image": "val2014/COCO_val2014_000000436174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532631, "question_id": "DCq5VtXZGjxmRZhMchbFJK", "question": "These animals live where?", "choices": ["city", "savanna", "desert", "house"], "correct_choice_idx": 1, "direct_answers": ["africa", "africa", "africa", "grasslands", "africa", "wild", "wilderness", "wild", "africa", "savanna"], "difficult_direct_answer": false, "rationales": ["The animals are roaming a wide open space with trees and grass known as a savanna.", "These animals live in the savanna", "The other options don't fit with this animal type for africa."], "image": "train2014/COCO_train2014_000000532631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115875, "question_id": "DCrGVtm2nk4MSzxv4iaHNn", "question": "What video gaming system is the young child playing?", "choices": ["microsoft xbox", "sony playstation", "atari jaguar", "nintendo wii"], "correct_choice_idx": 3, "direct_answers": ["wii", "nintendo wii", "wii", "wii", "wii", "wii", "wii", "wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["The controller is white and has a leash on it.", "The little girl has a white control in her hand which goes with a wii.", "The controller the child is holding is a color, size and shape consistent with answer a and is being used in the manner unique to this brand."], "image": "val2014/COCO_val2014_000000115875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332609, "question_id": "DCvYSrqTctKmk2GANgJghc", "question": "Upon the shelf sits something to celebrate a holiday what holiday is it?", "choices": ["july 4th", "easter", "st patricks", "christmas"], "correct_choice_idx": 1, "direct_answers": ["easter", "easter", "easter", "easter", "easter", "easter", "easter", "easter", "easter", "easter"], "difficult_direct_answer": false, "rationales": ["Easter eggs represent the rising of christ and there are two easter eggs on the top shelf.", "There are many cups above on a shelf. two the left are two eggs that are still wrapped in packaging.", "There are two plastic easter eggs on the shelf"], "image": "val2014/COCO_val2014_000000332609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191069, "question_id": "DCwZ556cshf9JquqTzhHn2", "question": "What are the little pictures on the cell phone named?", "choices": ["sketch", "pictograph", "dot", "icons"], "correct_choice_idx": 3, "direct_answers": ["icons", "icons", "apps", "icons", "apps", "apps", "apps", "icons", "apps", "apps"], "difficult_direct_answer": false, "rationales": ["The pictures are app icons for the iphone.", "The pictures are app icons.", "The pictures are icons."], "image": "val2014/COCO_val2014_000000191069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249384, "question_id": "DCxPhoFDD8jpVCrJVZTzBY", "question": "What does the Pepsi colors mean?", "choices": ["none", "independency", "patriotism", "peace"], "correct_choice_idx": 2, "direct_answers": ["patriotism", "usa flag", "us support", "flag", "unknown", "magnetic field", "cherry flavor", "peace", "flavor", "american flag"], "difficult_direct_answer": true, "rationales": ["The colors means love for country.", "The pepsi colors are red, white, and blue, which are patriotic colors of america.", "The color is patriotic."], "image": "train2014/COCO_train2014_000000249384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130458, "question_id": "DCyuCBc8RGSPrXxfNfaLt4", "question": "Which food provides the most vitamin A?", "choices": ["bitter melon", "eggplant", "tomato", "carrot"], "correct_choice_idx": 3, "direct_answers": ["apple", "broccoli", "tomato", "fruit", "carrot", "tomatoes", "tomato", "carrots", "apples", "tomatoes"], "difficult_direct_answer": false, "rationales": ["The man is standing with a lot of vegetables and the carrots have the most vitamin a.", "The tomatoes give the most vitamin a.", "Carrots are full of vitamin a."], "image": "train2014/COCO_train2014_000000130458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187348, "question_id": "DDBQNiRnKwaYgAELxiwXJd", "question": "Most persons drinking wine here share which type?", "choices": ["rose", "white", "bordeaux", "red"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "white wine", "white", "white wine", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The glasses are clear. red whine is red and these wines aren't red.", "The liquid in the cups are clear.", "The wine is clear, not red or pink."], "image": "val2014/COCO_val2014_000000187348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573248, "question_id": "DDKycBTdwwYkkZdme7aDHm", "question": "Badminton ball is made of what?", "choices": ["cotton", "carbon", "plastic", "wool"], "correct_choice_idx": 3, "direct_answers": ["wool", "rubber", "cork", "cork", "goose feathers", "plastic", "throwing", "rubber", "who cares", "goose feathers"], "difficult_direct_answer": false, "rationales": ["The ball is made of plastics hard and flexible.", "It used to be made of feathers.", "Badminton balls are similar to tennis balls."], "image": "train2014/COCO_train2014_000000573248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367275, "question_id": "DDSwKLNyEzcMecLvChvU9v", "question": "What is this location?", "choices": ["california", "washington dc", "florida", "texas"], "correct_choice_idx": 1, "direct_answers": ["washington", "washington doc", "washington", "washington dc", "washington dc", "city", "dc", "washington dc", "washington dc", "washington dc"], "difficult_direct_answer": false, "rationales": ["The capitol is in the background", "The building in the middle is in dc", "This is in front of of our capital building."], "image": "val2014/COCO_val2014_000000367275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98095, "question_id": "DDYFojFJbpbrvTvBT8eZBS", "question": "What beverage is the man wearing a yellow shirt holding?", "choices": ["soda", "beer", "juice", "coffee"], "correct_choice_idx": 1, "direct_answers": ["beer", "soda", "beer", "beer", "beer", "beer", "soda", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["A man is holding a can. beer comes in a can.", "The beverage is beer.", "The man in a yellow shirt is holding a can of pabst blue ribbon beer."], "image": "val2014/COCO_val2014_000000098095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578009, "question_id": "DDfuQrHTHXTCiPhUCBGxbC", "question": "Why are these bags being filled?", "choices": ["to clean", "to travel", "to decorate", "to sell"], "correct_choice_idx": 1, "direct_answers": ["packing", "vacation", "vacation", "for traveling", "vacation", "travel", "vacation", "to travel", "travel", "traveling"], "difficult_direct_answer": false, "rationales": ["Suitcases are used to transport clothing and toiletries from one place to another for short periods of time.", "The person is packing for a nice vacation.", "These cases are used to store items one wants to bring on a trip."], "image": "train2014/COCO_train2014_000000578009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472930, "question_id": "DDmWrngw4irjxwu5azHQ9j", "question": "What is closest to the left ledge of the table?", "choices": ["pumpkin", "orange", "tray", "glass"], "correct_choice_idx": 3, "direct_answers": ["pillow", "glass", "cushion", "glass", "water glass", "glass", "pillow", "pillow", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["Various items are on a table including drinkware.", "The glass cup is near the ledge.", "The glass is on the left side of the table."], "image": "val2014/COCO_val2014_000000472930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154168, "question_id": "DDmkpMEmS98n9Jwj8cqwrE", "question": "What day is Double Punch?", "choices": ["wednesday", "friday", "monday", "tuesday"], "correct_choice_idx": 3, "direct_answers": ["sunday", "sunday", "wednesday", "tuesday", "tuesday", "wednesday", "tuesday", "tuesday", "tuesday", "wednesday"], "difficult_direct_answer": false, "rationales": ["The day is tuesday.", "The sign says double punch tuesdays.", "The far right part of the sign on the side of the bus indicates the day."], "image": "val2014/COCO_val2014_000000154168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509037, "question_id": "DDtfzAgzCmiDgqkfGBb25h", "question": "What is happening under the umbrella?", "choices": ["cleaning up", "food sales", "card game", "discussion"], "correct_choice_idx": 1, "direct_answers": ["food sales", "meal", "eating", "praying", "social gathering", "cooking", "people eating", "gathering", "selling food", "food service"], "difficult_direct_answer": true, "rationales": ["There is a food cart under the umbrella and people are buying and selling food.", "It looks like these people are selling food by the looks of the items on the table.", "The scene looks like a food cart and one can make out food preparation happening. food being prepared at a food cart is being made for sale."], "image": "val2014/COCO_val2014_000000509037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334788, "question_id": "DEJFQUPJyvRAGCVGCoDyoQ", "question": "Who put the dog on the surf board?", "choices": ["man", "cat", "dog", "girl"], "correct_choice_idx": 0, "direct_answers": ["man", "man", "male", "humans", "woman", "person", "person", "owner", "man", "human"], "difficult_direct_answer": false, "rationales": ["The man is closest to the dog.", "The man is pushing the dog.", "The dog is floating away from the man and it cannot get onto a surfboard on its own while in water. it is floating toward the woman who is ready to catch it."], "image": "train2014/COCO_train2014_000000334788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256589, "question_id": "DEnWbDPBshPxkh3PuG7K3n", "question": "What is in the pot in the upper left corner?", "choices": ["gas", "coffee", "noodles", "water"], "correct_choice_idx": 1, "direct_answers": ["coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["That is a coffee maker", "The white machine has coffee in the glass pot.", "The pot has coffee."], "image": "train2014/COCO_train2014_000000256589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473712, "question_id": "DEzvUCpYKxZJajKaZgQu83", "question": "How many cars do these meters currently monitor?", "choices": ["one", "none", "two", "nine"], "correct_choice_idx": 1, "direct_answers": ["five", "seven", "seven", "two", "seven", "zero", "none", "seven", "seven", "two"], "difficult_direct_answer": false, "rationales": ["Because of there position it is easy to surmise that there are no cars being used for it.", "There are no cars near the monitors.", "It is unknown which meters are being used."], "image": "train2014/COCO_train2014_000000473712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223992, "question_id": "DF3WNWGeEE5yB6MRqjYtUQ", "question": "What emotion is the man in the grey hat feeling?", "choices": ["joy", "excitement", "surprise", "sadness"], "correct_choice_idx": 2, "direct_answers": ["shock", "surprise", "surprise", "surprise", "shocked", "surprise", "surprised", "good", "surprise", "suprise"], "difficult_direct_answer": false, "rationales": ["The man expresses surprise, and hopefully it's a good surprise and not a bad one. the man next to him seems awfully happy, so it seems to be a *good* surprise.", "The emotion is surprise.", "The man has a surprised look on his face and his mouth is open."], "image": "train2014/COCO_train2014_000000223992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188587, "question_id": "DF9PoPjt92dc2bcLGNzm6b", "question": "What aircraft type is this?", "choices": ["jet", "biplane", "helicopter", "seaplane"], "correct_choice_idx": 1, "direct_answers": ["biplane", "biplane", "biplane", "biplane", "biplane", "biplane", "biplane", "propeller-driven", "biplane", "propeller-driven"], "difficult_direct_answer": false, "rationales": ["This is a fixed wing aircraft.", "It has two sets of wings. bi means two.", "Biplanes are very small, like the plane depicted."], "image": "train2014/COCO_train2014_000000188587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285417, "question_id": "DFiw4qsb2GDanJhbt8Br96", "question": "This person is operating their laptop in what form of transportation?", "choices": ["bus", "train", "plane", "car"], "correct_choice_idx": 2, "direct_answers": ["airplane", "plane", "plane", "airplane", "airplane", "microsoft", "airplane", "airplane", "train", "airplane"], "difficult_direct_answer": false, "rationales": ["The laptop is in plane mode for the wifi.", "They are on an aiplane.", "There are plane seats all around."], "image": "train2014/COCO_train2014_000000285417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382462, "question_id": "DFpM45482vDQehFWzEYoTy", "question": "If all the people went away and you walked straight the direction the camera was pointing what would you probably run into first?", "choices": ["house", "fence", "car", "bed"], "correct_choice_idx": 1, "direct_answers": ["people", "fence", "players", "fence", "fence", "fence", "fence", "number 19", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["You would run right into the fence.", "This surrounds the field", "This object is directly behind the people."], "image": "val2014/COCO_val2014_000000382462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184669, "question_id": "DGFpLxLs7WuJQdUC6pVzuk", "question": "Why is the truck in the middle of the street?", "choices": ["parking", "turning left", "no gas", "broken down"], "correct_choice_idx": 1, "direct_answers": ["turning", "driving route", "turning left", "turning left", "turning left", "transport", "turning corner", "driving", "turning left", "parked"], "difficult_direct_answer": false, "rationales": ["The truck wants to make a turn.", "The truck is making a turn to go down a different street and to get there they must turn left.", "The truck is making a very wide turn because it is a larger vehicle."], "image": "train2014/COCO_train2014_000000184669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328661, "question_id": "DGJnC9X5cEYCQZzVedEQqe", "question": "Which object is/are in the greatest threat?", "choices": ["pilot", "birds", "plane wings", "plane wheels"], "correct_choice_idx": 1, "direct_answers": ["birds", "birds", "birds", "birds", "birds", "birds", "birds", "birds", "birds", "birds"], "difficult_direct_answer": false, "rationales": ["The birds could get in the way.", "Birds fly all around a man in a glider in the air. birds have been known to cause accidents for planes.", "The birds could mess with the flying of this small plane"], "image": "val2014/COCO_val2014_000000328661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280242, "question_id": "DGUzRcqShq5KdnYnSAs7wH", "question": "What kind of room is this?", "choices": ["university dorm", "hospital ward", "motel room", "bedroom"], "correct_choice_idx": 2, "direct_answers": ["motel room", "bedroom", "bedroom", "bedroom", "bedroom", "bedroom", "bedroom", "bedroom", "bedroom", "bedroom"], "difficult_direct_answer": false, "rationales": ["The area looks like a dorm room because of the heater and desk.", "The room is at a motel.", "The room has a bed and a desk and tv."], "image": "val2014/COCO_val2014_000000280242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391063, "question_id": "DGpsK2SrHbxb5yEJ5YRHT6", "question": "How many people is this food for most likely?", "choices": ["eight", "two", "one", "twenty"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "four", "four", "four", "two", "humans", "four", "two"], "difficult_direct_answer": false, "rationales": ["Looks like there are two sandwiches there.", "There are two different sandwiches so it makes sense that its for two people.", "In most instances a full sandwich is made for one person. in the picture you can see two full sandwiches with both being sliced into halves."], "image": "train2014/COCO_train2014_000000391063.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358247, "question_id": "DHpW9XfnLC8N2wPTGHMszM", "question": "What is the name of a common dessert that uses this fruit?", "choices": ["split", "trifle", "cake", "sponge"], "correct_choice_idx": 0, "direct_answers": ["banana bread", "banana split", "milkshake", "banana bread", "banana split", "pudding", "banana bread", "split", "sahara", "split"], "difficult_direct_answer": false, "rationales": ["This is a popular ice cream dessert and this fruit is the base for it", "The common desert name is banana split.", "They can be used to make a banana split."], "image": "val2014/COCO_val2014_000000358247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80725, "question_id": "DHpadniHikMbeAZcwW7jWU", "question": "When was the stop sign invented?", "choices": ["1915", "1906", "1912", "1904"], "correct_choice_idx": 0, "direct_answers": ["i do", "1916", "nineteen fifteen", "1915", "1900s", "1915", "requires research", "what invalid", "past", "1915"], "difficult_direct_answer": false, "rationales": ["The first stop sign was used in michigan during world war 1.", "It was created in 1915", "It was invented then."], "image": "val2014/COCO_val2014_000000080725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10655, "question_id": "DHvWZLqEzuLUgMWJtroAKm", "question": "What can be adjusted for more privacy?", "choices": ["fireplace", "mirror", "curtain", "window"], "correct_choice_idx": 2, "direct_answers": ["curtains", "drapes", "curtain", "drapes", "curtains", "canopy", "canopy", "curtains", "curtain", "bed curtains"], "difficult_direct_answer": false, "rationales": ["The curtain gives privacy.", "They can be closed to block others view.", "This is a canopy bed and there is more privacy."], "image": "train2014/COCO_train2014_000000010655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253986, "question_id": "DHyVHKFjTFCVUnDcenZ5FU", "question": "Why are the women sitting on the benches?", "choices": ["to sleep", "to rest", "to wait", "to talk"], "correct_choice_idx": 1, "direct_answers": ["resting", "to rest", "waiting", "relaxing", "to relax", "waiting", "relaxation", "resting", "relaxing", "relaxing"], "difficult_direct_answer": false, "rationales": ["When someone is sitting on a bench it is usually because they are resting.", "The women rest on the benches. they don't appear to wish to speak to one another or talk. they might be waiting for someone or something but it looks as though they are just resting.", "Woman are sitting on benches in a park. people sit on benches to rest."], "image": "val2014/COCO_val2014_000000253986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89503, "question_id": "DJ7PD7SvjTo4DEz2t2Q6cP", "question": "What kind of venue is this?", "choices": ["giraffe barn", "zoo", "wilderness", "farm"], "correct_choice_idx": 0, "direct_answers": ["zoo", "giraffe barn", "zoo", "national park", "park", "zoo", "zoo", "zoo", "zoo", "zoo"], "difficult_direct_answer": false, "rationales": ["The animals have four legs and long necks. they are inside a building.", "The animals are indoors as opposed to outside.", "The place is inside. there are giraffes living there."], "image": "val2014/COCO_val2014_000000089503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185487, "question_id": "DJEdg3eM2KhVgXeTpMmS9v", "question": "What element lifts this person skyward?", "choices": ["water", "mineral", "fire", "wind"], "correct_choice_idx": 3, "direct_answers": ["kite", "parachute", "wind", "wind", "kite", "kite", "parachute", "wind", "kinetic energy", "kite"], "difficult_direct_answer": false, "rationales": ["The person is only able to sail through the air because of the wind.", "None of the other options fit with this type of environment. the parachute/kite above them is also obviously linked to this element.", "They are using a kite which catches this force and propels them"], "image": "val2014/COCO_val2014_000000185487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17585, "question_id": "DJNxCN3M2yJm9KcxRboRCF", "question": "What does he need to do?", "choices": ["learn flying", "keep warm", "change shoes", "maintain balance"], "correct_choice_idx": 3, "direct_answers": ["balance", "land safely", "balance", "jump", "balance", "balance", "stay balanced", "stay upright", "maintain balance", "balance"], "difficult_direct_answer": false, "rationales": ["The man needs to keep his weight centered to not fall over.", "The man needs to not fall.", "He needs to stay upright so he doesn't fall."], "image": "train2014/COCO_train2014_000000017585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194669, "question_id": "DJVt3mwpoeFk98tNSZ3xgg", "question": "How did he get on back of the truck?", "choices": ["climbed on", "fell on", "lives there", "jumped on"], "correct_choice_idx": 0, "direct_answers": ["jumped", "step", "jumped", "step", "grab bar", "bumper", "climbed on", "jump", "jump", "climbed"], "difficult_direct_answer": false, "rationales": ["The man hopped onto the truck.", "The man climbed on.", "He would have to step up on the pipes used as a bumper"], "image": "train2014/COCO_train2014_000000194669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494782, "question_id": "DJZ7W2DMAjYiKEfv4iQwG3", "question": "What is the man sitting there doing?", "choices": ["preparing powerpoint", "watching video", "preparing document", "sending email"], "correct_choice_idx": 1, "direct_answers": ["watching video", "computing", "using laptop", "reading", "watching", "looking", "using computer", "checking computer", "watching tv", "laptop"], "difficult_direct_answer": true, "rationales": ["He has headphones in and is staring at the screen instead of typing on the keyboard.", "The man is wearing headphones and is looking at the laptop in front of him.", "The man has a tablet or laptop and is watching something on the screen and listening to the sound through his headphones."], "image": "val2014/COCO_val2014_000000494782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73199, "question_id": "DJdGcifTF5Y52LBvcgJG2v", "question": "What might this group be dressed for?", "choices": ["stripping", "bridal party", "wedding", "sales"], "correct_choice_idx": 2, "direct_answers": ["wedding", "wedding", "wedding", "wedding", "celebration", "concert musicians", "wedding", "wedding", "wedding", "celebration"], "difficult_direct_answer": false, "rationales": ["When men wear matching suits they are likely dressed for a wedding.", "The men are wearing matching suits that are suitable for a groom and his groomsmen at a wedding.", "The group of men are wearing fancy suits that you might see worn by ushers at a wedding."], "image": "val2014/COCO_val2014_000000073199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182019, "question_id": "DJdMimZtwiyT5Bi2kRPLXj", "question": "What holiday do the people seem to be celebrating?", "choices": ["labor day", "christmas", "easter", "thanksgiving"], "correct_choice_idx": 3, "direct_answers": ["thanksgiving", "thanksgiving", "thanksgiving", "thanksgiving", "thanksgiving", "thanksgiving", "thanksgiving", "thanksgiving", "thanksgiving", "thanksgiving"], "difficult_direct_answer": false, "rationales": ["The holiday is thanksgiving.", "Thanksgiving is a holiday that's all about food and turkey.", "Kids are dressed with pilgrim hats and feathers. people use pilgrim decorations at thanksgiving."], "image": "train2014/COCO_train2014_000000182019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118906, "question_id": "DJwqamnf6oNiJj63FFHFkR", "question": "What brand of sandals is the woman wearing?", "choices": ["adidas", "nike", "reef", "pacsun"], "correct_choice_idx": 2, "direct_answers": ["cap", "under amour", "reef", "puma", "reef", "payless", "uggs", "birkenstock", "sketchers", "clarks"], "difficult_direct_answer": true, "rationales": ["The logo is on the sandals.", "They look to be this brand.", "The woman has the reef logo on her sandals."], "image": "val2014/COCO_val2014_000000118906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437283, "question_id": "DJyZbhmiA4Nva5gcRhpBv3", "question": "What is traditionally eaten as an accompaniment to this dish?", "choices": ["bread", "fruit", "eggs", "cake"], "correct_choice_idx": 0, "direct_answers": ["fried noodles", "rice", "rice", "noodles", "crackers", "bread", "noodles", "rice", "noodles", "noodles"], "difficult_direct_answer": false, "rationales": ["Bread is used to eat with soup.", "Bread is usually eaten with soup.", "This is soup, which is often accompanied with bread or crackers, to be dipped into the soup."], "image": "val2014/COCO_val2014_000000437283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319126, "question_id": "DJzic78KYwJQwfejMpDY2z", "question": "What is in the air?", "choices": ["kites", "birds", "helicopter", "blimp"], "correct_choice_idx": 0, "direct_answers": ["kite", "kites", "kites", "kite", "kites", "kites", "kite", "kites", "kites", "kites"], "difficult_direct_answer": false, "rationales": ["Kites are flying in the sky over the field.", "The air has kites.", "There are kites being flown in the sky."], "image": "train2014/COCO_train2014_000000319126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262003, "question_id": "DKEELdk9saXEJZ8eZv44yk", "question": "What is the object behind the batter's leg?", "choices": ["pitching machine", "catcher's mask", "umpire's mask", "catcher's mitt"], "correct_choice_idx": 3, "direct_answers": ["glove", "mitt", "mitt", "catcher", "mitt", "catchers mitt", "catcher's mitt", "glove", "mitt", "base ball"], "difficult_direct_answer": false, "rationales": ["The object is a glove that belongs to the player on the fielding team who plays the home plate position.", "The object is a mitt.", "A player called the catcher stands behind the batter player hitting the ball to catch the ball if the batter misses the ball when it is thrown."], "image": "val2014/COCO_val2014_000000262003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31057, "question_id": "DKLWrtM6wTAPajnJ8mvFhR", "question": "What is near the blue truck?", "choices": ["traffic cone", "carrot", "horse", "stop sign"], "correct_choice_idx": 0, "direct_answers": ["traffic cone", "cone", "back hoe", "excavator", "cones", "back ho", "jap", "pylons", "cones", "bull dozer"], "difficult_direct_answer": true, "rationales": ["There are traffic cones near the truck.", "There are quite a few of them so they would be hard to miss.", "The blue truck is near a cone."], "image": "train2014/COCO_train2014_000000031057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450509, "question_id": "DKNJoQdRPsxovzvxgWLSCW", "question": "Who just threw the disc?", "choices": ["man", "no one", "small child", "lady"], "correct_choice_idx": 2, "direct_answers": ["small child", "child", "child", "boy", "woman", "woman", "boy", "child", "child", "kid"], "difficult_direct_answer": false, "rationales": ["The disc is heading away from the child.", "Based on the follow through body position of the answer a person, they have thrown the frisbee and their arm has continued into its current position.", "The disk is flying nearest to the kid."], "image": "train2014/COCO_train2014_000000450509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224916, "question_id": "DKV4PQf6HArW5sDNwfL6QE", "question": "What type of bird is the one on the far left?", "choices": ["toucan", "cockatiel", "parrot", "dove"], "correct_choice_idx": 1, "direct_answers": ["cockatiel", "parrot", "parrot", "canary", "parrot", "cockatiel", "parrot", "parrot", "parrot", "cockatiel"], "difficult_direct_answer": false, "rationales": ["Its head defines it.", "The bird has orange checks and a yellow crest.", "The animal coloring and feather patterning that would be consistent with answer a."], "image": "val2014/COCO_val2014_000000224916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28402, "question_id": "DL2j4mTvXp8w2ys6TjYtvV", "question": "What was this man hitting with his bat?", "choices": ["invader", "volleyball", "enemy", "baseball"], "correct_choice_idx": 3, "direct_answers": ["baseball", "baseball", "ball", "ball", "ball", "ball", "ball", "ball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["This man is hitting a baseball with his bat.", "The man is holding a baseball bat and swinging to hit a baseball.", "The man is playing a sport. it would not be appropriate to hit a volleyball with a bat."], "image": "train2014/COCO_train2014_000000028402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493822, "question_id": "DL5QSgEQiMfCHfMZm9wHVq", "question": "Why is the person near the camera wearing two layers?", "choices": ["hot outside", "cold outside", "snowy outside", "rainy outside"], "correct_choice_idx": 1, "direct_answers": ["chilly weather", "chilly weather", "keep warm", "warmth", "chilly", "cold weather", "cold weather", "cold outside", "cold out", "warmth"], "difficult_direct_answer": false, "rationales": ["People often dress in layers when it's cold out so it's likely to be a cold day considering how this man (and others) are dressed.", "There are no leaves on the trees so the weather is likely cold right at the moment here.", "Answer a is consistent with the reason a person might wear two layers and has nothing to do with the image."], "image": "train2014/COCO_train2014_000000493822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59906, "question_id": "DLDiBwH5oDYCSPkLnRzZAS", "question": "Why is the refrigerator covered in plastic?", "choices": ["weather", "broken", "dieting", "construction"], "correct_choice_idx": 3, "direct_answers": ["kitchen renovation", "painting walls", "construction", "it's new", "painting", "new", "brand new", "construction work", "brand new", "painting"], "difficult_direct_answer": false, "rationales": ["The refrigerator is covered for construction purposes.", "Plastic helps it from getting dusty or scratched", "The whole kitchen is being renovated."], "image": "train2014/COCO_train2014_000000059906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288041, "question_id": "DLHGXwnYMLHAL7D6NfPPRP", "question": "How many players are there?", "choices": ["two", "one", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["one", "three", "one", "one", "one", "one", "one", "one", "three", "one"], "difficult_direct_answer": false, "rationales": ["There is only one person with the console.", "Only one person is holding the wii-mote.", "There is only one person playing the others are watching."], "image": "val2014/COCO_val2014_000000288041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413923, "question_id": "DLHtGrENZjopZVdmy4UKJn", "question": "Where were bicycles invented?", "choices": ["russia", "france", "poland", "prussia"], "correct_choice_idx": 1, "direct_answers": ["germany", "1792", "1792", "germany", "germany", "germany", "germany", "germany", "france", "in greeks"], "difficult_direct_answer": false, "rationales": ["The first type of bicycle was invented in the 19th century in france.", "France is credited with coming up with the idea of the bicycle.", "These bicycles are invented in france."], "image": "train2014/COCO_train2014_000000413923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545334, "question_id": "DLgdyugrfQiYYhQ8dw2enY", "question": "Which team has a similar name which some might consider the opposite?", "choices": ["white sox", "yellow sox", "grey sox", "blue sox"], "correct_choice_idx": 0, "direct_answers": ["black sox", "white sox", "white sox", "white sox", "blue jays", "white sox", "white sox", "white sox", "white sox", "blue jays"], "difficult_direct_answer": false, "rationales": ["The team is the white sox.", "They have the same second name just a different color for the first name.", "There are only two \"sox\" teams and the white sox are the other one."], "image": "train2014/COCO_train2014_000000545334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361959, "question_id": "DLoQ7Z7UYv5puS2PaPBtDP", "question": "What material is this oven made out of?", "choices": ["wood", "plastic", "glass", "stainless steel"], "correct_choice_idx": 3, "direct_answers": ["stainless steal", "aluminum", "stainless steel", "stainless steel", "stainless steel", "stainless steel", "stainless steel", "steel", "metal", "stainless steel"], "difficult_direct_answer": false, "rationales": ["An oven is silver. silver appliances are stainless steel.", "From the outside appearance including color, shine and durability it is apparent that glass and wood are not used. plastic would melt.", "It's silver so it's stainless steel."], "image": "train2014/COCO_train2014_000000361959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557323, "question_id": "DM3mgqJpPMB5NZBDgrni2w", "question": "Why is his foot in the air behind him?", "choices": ["kick ball", "avoid ball", "steal ball", "hide ball"], "correct_choice_idx": 0, "direct_answers": ["kicking", "kick ball", "kick", "kicking", "to kick", "kicking", "to kick", "kicking", "running", "kicking"], "difficult_direct_answer": false, "rationales": ["The man's foot is extended back in the air so that he can get the most momentum before striking the round object used to score in the game of soccer.", "A soccer player is kicking the soccer ball.", "He has his right foot back to prepare to exert force against the ball to push it forward really hard. his face shows that he is putting in effort."], "image": "val2014/COCO_val2014_000000557323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17376, "question_id": "DMB6EG46HDwq2JfDZBiYBV", "question": "What does the orange man represent?", "choices": ["cross", "dance", "male bathroom", "wait"], "correct_choice_idx": 3, "direct_answers": ["don't walk", "stop", "pedestrians", "wait", "don't walk", "don't walk", "traffic signal", "stop", "stop", "don't walk"], "difficult_direct_answer": false, "rationales": ["The man in orange on the sign post represents a pedestrian waiting to cross the street.", "The traffic signal is telling the crowd to not cross.", "The signal and color show that it is not save to cross at this time."], "image": "train2014/COCO_train2014_000000017376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264537, "question_id": "DMgJKTxi9G6h8LrBQaqer6", "question": "What print is on his shoes?", "choices": ["checkers", "zig zag", "floral", "camouflage"], "correct_choice_idx": 3, "direct_answers": ["camouflage", "camouflage", "camo", "camo", "camouflage", "camouflage", "camo", "camouflage", "camouflage", "cammo"], "difficult_direct_answer": false, "rationales": ["The man's shoes have the same print often used by the military for disguise.", "The print of the shoes have green and brown spots.", "This is a common pattern in clothes and apparel for men."], "image": "train2014/COCO_train2014_000000264537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172010, "question_id": "DMkXRisUkfR49UkVpF6XmB", "question": "This style of flooring comes from a French word meaning what?", "choices": ["great shine", "square pattern", "small compartment", "horizontal lines"], "correct_choice_idx": 2, "direct_answers": ["wood block", "wood", "floor", "hardwood", "parquetry", "parquerte", "parquet", "wood", "small compartment", "fancy"], "difficult_direct_answer": true, "rationales": ["The flooring pattern is clearly visible and unique and the french word translation is searchable on the internet.", "The style is in a compartment.", "It is a parquet floor and it means short strips of wooden flooring."], "image": "train2014/COCO_train2014_000000172010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196156, "question_id": "DMonRWyi8WRWuuJVS8BDgg", "question": "What vegetable is on the pizza?", "choices": ["jalapeno", "broccoli", "spinach", "onions"], "correct_choice_idx": 0, "direct_answers": ["jalapeno", "jalapenos", "jalapeno", "jalapeno peppers", "pepper", "jalapeno", "jalapeno", "pepper", "hot peppers", "jalapenos"], "difficult_direct_answer": false, "rationales": ["The vegetable on the pizza is a type of hot chili pepper that has been sliced.", "There are jalapenos on the pizza.", "There is only one vegetable on the pizza and it looks like sliced jalapenos."], "image": "train2014/COCO_train2014_000000196156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358462, "question_id": "DMxYnzZPpKrSyN5JCJkPLS", "question": "Why is the horse attached to the wagon?", "choices": ["by accident", "eats grass", "pulls wagon", "stops horse"], "correct_choice_idx": 2, "direct_answers": ["pulls wagon", "pulling it", "pulling wagon", "towing", "pulling wagon", "transport items", "pulling it", "pulling it", "to pull", "to pull"], "difficult_direct_answer": false, "rationales": ["The horse can propel forward.", "A horse has a wagon attached to it. horses are used to pull things.", "The method for attachment is intentional and would be done on such a vehicle for the purposes of answer a."], "image": "train2014/COCO_train2014_000000358462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163112, "question_id": "DN532Zoq46Zi5S5QvokzyA", "question": "Why are the lights on the lamps on?", "choices": ["to repair", "for decoration", "to illuminate", "as joke"], "correct_choice_idx": 2, "direct_answers": ["light up", "dark out", "to illuminate", "its dark", "streetlight", "better visibility", "darkness", "street poles", "lamp", "nighttime"], "difficult_direct_answer": true, "rationales": ["These lamp posts are all around most cities to help people see at night.", "To keep the area lite up.", "It shows the man and the surrounding what is around."], "image": "val2014/COCO_val2014_000000163112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142051, "question_id": "DNCZfH9WUui4CHguLsYvPY", "question": "What category of clocks does the clock by the window belong to?", "choices": ["digital", "tactile", "longcase", "cuckoo"], "correct_choice_idx": 2, "direct_answers": ["grandfather", "grandfather clock", "grandfather clock", "grandfather", "grandfather clock", "grandfather", "grandfather", "longcase", "grandfather clock", "grandfather"], "difficult_direct_answer": false, "rationales": ["Longcase clocks are large clocks in tall slender wooden cabinets.", "The clock is in a long case.", "The clock, also known as a grandfather clock, is a tall freestanding clock with weights and a pendulum."], "image": "train2014/COCO_train2014_000000142051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89908, "question_id": "DNXjjnCGyVYHriya6DWNp9", "question": "What type of phone is available?", "choices": ["payphone", "corded", "cellular", "cordless"], "correct_choice_idx": 1, "direct_answers": ["corded", "corded", "corded", "landline", "cord phone", "corded wall", "landline", "landline", "corded", "landline"], "difficult_direct_answer": false, "rationales": ["The phone is connected to a spiral cord.", "It is attached to the wall with a wire", "The phone is positioned on the wall with a cord attached from the wall to the receiver, this type of phone is known as a corded phone. it is not cellular as you can not remove it, it is not cordless due to the cord, and it is not a payphone these are more common on the streets not in private residences."], "image": "train2014/COCO_train2014_000000089908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526414, "question_id": "DNmqNS3p8bg5VejPPpFZWi", "question": "The white machine is used to manipulate what?", "choices": ["paper", "fabric", "metal", "plastic"], "correct_choice_idx": 1, "direct_answers": ["thread", "sew", "sewing", "fabric", "fabric", "needle", "needle", "fabric", "sewing fabric", "clothes"], "difficult_direct_answer": false, "rationales": ["Based on the size, shape and design, the object is a sewing machine which would be used for assembling answer a.", "The machine is a sewing machine which is used to sew on materials like fabric.", "The white machine helps sew materials."], "image": "val2014/COCO_val2014_000000526414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97667, "question_id": "DNzWpFWWr2TmseHm49noVV", "question": "Which leavening was used most here?", "choices": ["none", "salt", "yeast", "vegemite"], "correct_choice_idx": 2, "direct_answers": ["dough", "flour", "yeast", "yeast", "bread", "yeast", "flour", "yeast", "yeast", "dessert"], "difficult_direct_answer": false, "rationales": ["Yeast is used to raise bread dough.", "The leavening is yeast.", "There is a standard looking white bread visible. standard white bread would use answer a for leavening."], "image": "val2014/COCO_val2014_000000097667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243847, "question_id": "DPYexS9seHoGpHQM6jTw7p", "question": "What interactions are the two humans having with the elephant?", "choices": ["riding it", "petting it", "playing", "feeding it"], "correct_choice_idx": 0, "direct_answers": ["riding it", "riding", "riding", "riding", "ride", "riding", "riding", "passengers", "ride", "riding"], "difficult_direct_answer": false, "rationales": ["The two people are on a chair on the elephant's back.", "The people are on top of the elephant in a saddle which is equipment and positioning consistent with answer a.", "There are humans sitting on the elephant."], "image": "train2014/COCO_train2014_000000243847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225124, "question_id": "DPchWjwp2FbzCDm29GU2qQ", "question": "What kind of palm tree is in the center of this photo?", "choices": ["queen palm", "fishtail palm", "fan palm", "sago palm"], "correct_choice_idx": 2, "direct_answers": ["pygmy", "fan palm", "fan leaved", "date palm", "large", "unknown", "california palm", "tropical", "pygmy date", "tropical"], "difficult_direct_answer": true, "rationales": ["As indicated by a google search. the fronds fan outward.", "This is a fan palm because it looks like it's a big fan.", "The leaves on the palm are the same shape and size as a fan."], "image": "train2014/COCO_train2014_000000225124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112978, "question_id": "DPnorHbRWtZTZWBzMdRdQg", "question": "What sort of presentation do those in the audience watch?", "choices": ["power point", "prison play", "skit", "studio movie"], "correct_choice_idx": 0, "direct_answers": ["power point", "cloud sharing", "business", "business", "computer", "cloud", "business", "cloud technologies", "visual", "business technology"], "difficult_direct_answer": false, "rationales": ["The people in the audience are looking at a powerpoint.", "He is using a power point for his presentation.", "Of all the options, a power point presentation is most likely in this situation since there is one man facing a crowd of attentive people."], "image": "train2014/COCO_train2014_000000112978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518524, "question_id": "DPqxYmGW2XuBx7Gnz2BCBW", "question": "What is this room commonly referred to?", "choices": ["laundry room", "livingroom", "dining room", "bedroom"], "correct_choice_idx": 0, "direct_answers": ["laundry room", "laundry room", "disco room", "disco room", "laundry room", "laundry room", "laundry room", "kitchen", "laundry room", "laundry room"], "difficult_direct_answer": false, "rationales": ["The machines are used to wash laundry.", "It's because there's a washer and dryer in it.", "A room with a washing machine and dryer is otherwise empty."], "image": "val2014/COCO_val2014_000000518524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563257, "question_id": "DPwvQxhBTnaoc3EsqCkLkL", "question": "What is the boy wearing on his head?", "choices": ["helmet", "fedora", "beanie", "baseball cap"], "correct_choice_idx": 2, "direct_answers": ["cap", "beanie", "beanie", "beanie", "toque", "cap", "beanie", "beanie", "beanie", "beanie"], "difficult_direct_answer": false, "rationales": ["A skateboarder is wearing a stocking cap. stocking caps are often called beanies.", "The boy wears a beanie.", "The boy is wearing a beanie cap on the skateboard."], "image": "train2014/COCO_train2014_000000563257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310310, "question_id": "DPztqrRntHYmBdVrojCYAs", "question": "How did the women feel about the man's remark?", "choices": ["bored", "amused", "offended", "embarrassed"], "correct_choice_idx": 1, "direct_answers": ["happy", "amused", "happy", "happy", "amused", "happy", "amused", "pleased", "amused", "happy"], "difficult_direct_answer": false, "rationales": ["The woman is laughing.", "She is smiling.", "The woman is smiling and looks like she is amused by what the man is saying."], "image": "val2014/COCO_val2014_000000310310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571584, "question_id": "DQ5K4oXC5FRvbhpsz78FAL", "question": "What kind of problem is likely to be experienced by the apartment residents?", "choices": ["noise", "pollution", "graffiti", "burglary"], "correct_choice_idx": 0, "direct_answers": ["rat", "traffic noise", "noise", "power shortage", "loud noise", "traffic noise", "traffic noise", "traffic noise", "noise fumes", "parking lack"], "difficult_direct_answer": false, "rationales": ["All the answers listed here could be viable, but most likely in an over-populated area is noisy.", "There is a road extremely close to the building, so traffic may be keeping the residents up at night.", "The problem is the noise."], "image": "val2014/COCO_val2014_000000571584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427853, "question_id": "DQLaPaZjbB3P8iLqXUMKo8", "question": "How is this young mans neckwear secured?", "choices": ["magic", "pin", "tie", "clothes pin"], "correct_choice_idx": 2, "direct_answers": ["windsor knot", "knotted", "tied", "knot", "tie", "tie", "tie knot", "necktie", "tied", "knot"], "difficult_direct_answer": false, "rationales": ["He has a tie around his neck.", "The man has a tie.", "The man is wearing a red tie around his neck."], "image": "train2014/COCO_train2014_000000427853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275755, "question_id": "DQRa7VeT7vdAEesarfRpxq", "question": "What gaming system is the shirtless man playing?", "choices": ["nintendo", "microsoft", "sony", "atari"], "correct_choice_idx": 0, "direct_answers": ["wii", "wii", "wii", "wii", "wii", "nintendo wii", "wii", "nintendo", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["The man is playing nintendo with his console.", "The shirtless man is holding a nintendo wii controller and playing a video game.", "The remote looks like a nintendo wii remote and other games don't have the same style."], "image": "val2014/COCO_val2014_000000275755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7685, "question_id": "DQSAeSW4rg2sZhjJ7vZn7p", "question": "What is floating above the rocks?", "choices": ["bird", "duck", "kite", "newspaper"], "correct_choice_idx": 2, "direct_answers": ["kite", "kite", "kite", "kite", "kite", "stones", "kite", "snow", "kite", "kite"], "difficult_direct_answer": false, "rationales": ["It looks like a bird is flying in the sky.", "A kite is floating.", "The item appears to be a kite flying in the air on a windy day high above the ground. people like to fly kites on windy days."], "image": "train2014/COCO_train2014_000000007685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333216, "question_id": "DQikarFAh2jd9ZJKNP3RmF", "question": "What is the large pole near the horse supplying to the homes?", "choices": ["electricity", "milk", "light", "fruit"], "correct_choice_idx": 0, "direct_answers": ["electricity", "electric", "electricity", "electricity", "electricity", "electric", "electricity", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["This is the only option that makes sense. it could have a b on it, but it would also need a.", "The poles has visible wires attached to it. this is a common design for bringing electricity to homes.", "Wire drawn over tall poles can supply houses with light and heat."], "image": "train2014/COCO_train2014_000000333216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354174, "question_id": "DQoMAMZpdnJLJCWeQzTbst", "question": "Who was a top ranked player in this sport?", "choices": ["tim cook", "roger federer", "moms mabley", "clete boyer"], "correct_choice_idx": 1, "direct_answers": ["djokovich", "roger federer", "novak djokovic", "federer", "tennis player", "roger federer", "federer", "roger federer", "tennis", "this man"], "difficult_direct_answer": false, "rationales": ["He's one of the best tennis players of all time.", "A man is standing waiting for the ball to come down to hit it. only answer that makes sense if first because others are not tennis.", "I had to look this up on the internet since i couldn't learn it from the picture."], "image": "val2014/COCO_val2014_000000354174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126502, "question_id": "DQpKMXSrncjuCU6uccUcZZ", "question": "Where are the birds standing on?", "choices": ["giraffe", "carpet", "tree", "blanket"], "correct_choice_idx": 0, "direct_answers": ["giraffe", "giraffe", "mountain", "giraffe", "giraffe", "mountain", "mountain", "giraffe", "giraffe", "giraffe"], "difficult_direct_answer": false, "rationales": ["The colour and pattern of the animal they are standing on can clearly be identified as a giraffe.", "The birds are perching on a very large spotted animal with a long neck. the animal is a giraffe.", "The animal they are on has the pattern of a giraffe."], "image": "val2014/COCO_val2014_000000126502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203433, "question_id": "DREEPmqw3mDpy3HdekjZWC", "question": "What can be done using the orange thing?", "choices": ["fly around", "lock house", "eat food", "take pictures"], "correct_choice_idx": 3, "direct_answers": ["take picture", "photo taking", "take pictures", "take picture", "take pictures", "take pictures", "take pictures", "take pictures", "photography", "photography"], "difficult_direct_answer": false, "rationales": ["It has a lens on it. you can use it to take a picture.", "The woman can take photos with the camera.", "The orange thing is for pictures."], "image": "train2014/COCO_train2014_000000203433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528243, "question_id": "DRM6pkKPy3z5YaMEVuKefu", "question": "What kind of services does this building provide?", "choices": ["insurance", "legal", "medical", "banking"], "correct_choice_idx": 3, "direct_answers": ["finance", "banking", "banking", "financial", "banking", "bank", "intesa", "banking", "bank", "banking services"], "difficult_direct_answer": false, "rationales": ["The building has the name of the company inside visible on the outside and that company is a bank.", "The name of the company is written and displayed over the doors and is known to be in the industry of answer a.", "The building provides banking services since the sign indicates that it's a bank."], "image": "train2014/COCO_train2014_000000528243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141224, "question_id": "DRbPBE8UhJpaX3jzy5ycUc", "question": "What is by the chairs?", "choices": ["pool", "car", "pizza", "computer"], "correct_choice_idx": 0, "direct_answers": ["pool", "pool", "umbrella", "umbrella pool", "umbrella", "pool", "umbrella", "pool", "pool", "pool"], "difficult_direct_answer": false, "rationales": ["A pool is near the chairs.", "These are lounge chairs for laying out in the sun - you can see the body of water next to them.", "The chairs are stationed around it."], "image": "train2014/COCO_train2014_000000141224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557394, "question_id": "DRjTPFBNhUbR5iJhvj5h5e", "question": "Why is the rider wearing earphones?", "choices": ["for instruction", "style", "hearing aid", "listening music"], "correct_choice_idx": 3, "direct_answers": ["entertainment", "block sounds", "listen music", "noise pollution", "playing music", "music playing", "listening music", "listen music", "it's noisy", "listen music"], "difficult_direct_answer": false, "rationales": ["The rider wants to jam to tunes on his commute.", "The rider is listening to music.", "The rider wants to hear some tunes."], "image": "train2014/COCO_train2014_000000557394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203878, "question_id": "DRmKUSNZz9qQUSCoQZrhRd", "question": "What is touching the racquet in the foreground?", "choices": ["dog paw", "two hands", "foot", "cat paw"], "correct_choice_idx": 1, "direct_answers": ["tennis player", "hand", "hands", "two hands", "player's hands", "hand", "tennis player", "hands", "human hands", "hands"], "difficult_direct_answer": false, "rationales": ["The player is holding the racquet as he hits the ball", "The man has both hands on his racquet.", "The guy is holding the racquet with both hands"], "image": "val2014/COCO_val2014_000000203878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371841, "question_id": "DSG9x76cKkZueEuqpDjqMM", "question": "The utensils provided with the meal are known as what?", "choices": ["knives", "prongs", "pokers", "chopsticks"], "correct_choice_idx": 3, "direct_answers": ["plate", "chopsticks", "chopsticks", "chopsticks", "chopsticks", "chopsticks", "chopsticks", "chopsticks", "chopsticks", "plate"], "difficult_direct_answer": false, "rationales": ["The utensils are chopsticks that are being used.", "Two sticks used as utensils is known as chopsticks.", "Thats the name of the oriental utensils"], "image": "val2014/COCO_val2014_000000371841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64183, "question_id": "DSNVyAeCQJ2gKnhhnedCTM", "question": "Why is the woman using an umbrella?", "choices": ["rain", "snow", "disguise", "sun"], "correct_choice_idx": 3, "direct_answers": ["sun protection", "shade", "block sun", "sun protection", "shade", "rain", "sunshine", "sun", "shade", "sun"], "difficult_direct_answer": false, "rationales": ["Umbrellas are often used for shade.", "She's using it to protect herself from the sun.", "The woman is blocking the sun's uv rays."], "image": "val2014/COCO_val2014_000000064183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496399, "question_id": "DSaHMwPmGkxtVdF7MbnfKv", "question": "If the gentleman here in the suit wants to call his sweetheart where will he do it?", "choices": ["inside building", "phone booth", "taxi", "standing there"], "correct_choice_idx": 1, "direct_answers": ["talk", "in cage", "phone booth", "phone booth", "phone booth", "phone booth", "use cellphone", "cellphone", "from cellphone", "phonebooth"], "difficult_direct_answer": false, "rationales": ["The man might not have a cellphone but he can for sure make a call from the phone booth.", "The man appears to be holding a cell phone based on the size of the object in his hand and the way he is regarding it. this would be used to make calls.", "You can use a phone booth to make a phone call."], "image": "train2014/COCO_train2014_000000496399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492408, "question_id": "DSijMRhM8Pn2TzfmVmuL5o", "question": "What weather is being encountered here?", "choices": ["rain", "snow", "sun", "sleet"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "rain storm", "rain", "rain", "winter", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["A person is holding up two umbrellas in an overcast area. the skies are hazy.", "By the background and lack of sun you can safely assume why they are holding up the umbrellas.", "The sky is overcast, and he is holding umbrellas."], "image": "train2014/COCO_train2014_000000492408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77355, "question_id": "DSnnFxNDQvJXuQkf7scHvV", "question": "What is one of the largest breeds of this animal?", "choices": ["maine coon", "greyhound", "munchkin", "doberman"], "correct_choice_idx": 0, "direct_answers": ["unknown", "cat", "lion", "cat", "english mastiff", "lion", "barival", "savannah cat", "savannah", "maine coon"], "difficult_direct_answer": false, "rationales": ["This is a cat, and the maine coon is known as one of the largest domestic breeds.", "This is the largest cat that looks similar", "The maine coon is evident by its nature to be big enough."], "image": "train2014/COCO_train2014_000000077355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472109, "question_id": "DT98m5UdKSh2HMqrPq8Lq5", "question": "What general subject do the books in the bookcase to the left of the phone cover?", "choices": ["mathematics", "engineering", "history", "information technology"], "correct_choice_idx": 3, "direct_answers": ["phone", "business", "computer", "language", "science", "coding", "office", "computers", "information technology", "information technology"], "difficult_direct_answer": true, "rationales": ["The subject is it.", "There are a lot of it books available.", "Though it's very hard to make out the books it's probably something about computers."], "image": "val2014/COCO_val2014_000000472109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504974, "question_id": "DTaqpxJR9aCzDcwJy6Pfaf", "question": "On what continent was this photo most likely taken?", "choices": ["north america", "south america", "europe", "africa"], "correct_choice_idx": 1, "direct_answers": ["north america", "north america", "north america", "europe", "south america", "north america", "europe", "africa", "south america", "not clear"], "difficult_direct_answer": false, "rationales": ["It was taken in south america.", "Based on the language and the bird, this photo is most likely taken in the continent south of north america.", "The pelican is from south america."], "image": "train2014/COCO_train2014_000000504974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287695, "question_id": "DTmJppBW9aW2SbPUYeMjzi", "question": "What are the elephants standing in?", "choices": ["sand", "mud", "woodchips", "grass"], "correct_choice_idx": 0, "direct_answers": ["sand", "sand", "sand", "sand", "pen", "sand", "sand", "zoo sand", "said", "sand"], "difficult_direct_answer": false, "rationales": ["The elephants are near an area standing in sand.", "The elephants are in sand.", "This footing is good for drainage of animal urine."], "image": "train2014/COCO_train2014_000000287695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405838, "question_id": "DU4J9vrRcx2TycZEjhxcWx", "question": "What type of electronic device is next to the fan on the right?", "choices": ["laptop", "cell phone", "tv", "printer"], "correct_choice_idx": 0, "direct_answers": ["computer", "phone", "laptop", "phone", "laptop", "phone", "telephone", "laptop", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["The computer is next to the fan on the desk.", "The closeable screen and keyboard suggest that this device is a laptop.", "It has a keyboard and a screen so it is a type of computer"], "image": "train2014/COCO_train2014_000000405838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468302, "question_id": "DU9EKJNsBDKhSrEeihHp7d", "question": "What position is the man with the red glove most likely?", "choices": ["center fielder", "shortstop", "pitcher", "catcher"], "correct_choice_idx": 2, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["The position is the pitcher.", "The baseball player has a glove and baseball in his hand with his knee up towards his chest indicating his playing position.", "The man is in the stance of a pitcher right before throwing a pitch."], "image": "val2014/COCO_val2014_000000468302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111045, "question_id": "DUGVVyYib2Ms7TDb4tQbe5", "question": "What position is played by the kneeling player?", "choices": ["catcher", "outfield", "pitcher", "short stop"], "correct_choice_idx": 1, "direct_answers": ["outfielder", "second basemen", "catcher", "outfielder", "shortstop", "catcher", "defender", "catcher", "catcher", "outfield"], "difficult_direct_answer": false, "rationales": ["The position is the outfield.", "He couldn't be any other positions due to where he is on the field.", "The player is in the grass so he's in the outfield."], "image": "train2014/COCO_train2014_000000111045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103018, "question_id": "DUjL6JkN4VSun9nBCi9KXa", "question": "What is the name of the fruits stacked on the table?", "choices": ["apples", "plums", "loquats", "pears"], "correct_choice_idx": 0, "direct_answers": ["lemons", "apples", "lemons", "lemons", "apples", "apples", "lemons", "apples", "lemons", "apples"], "difficult_direct_answer": false, "rationales": ["Their shape and texture can be immediately identified as apples.", "There are apples available on the table.", "They are yellow and round."], "image": "train2014/COCO_train2014_000000103018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99597, "question_id": "DUkaRPQZLvouCU3MznHECR", "question": "Why do the horses have their heads to the ground?", "choices": ["to eat", "to sit", "to hide", "to lay"], "correct_choice_idx": 0, "direct_answers": ["eating", "eating", "grazing", "eating", "to eat", "eating", "eating", "grazing", "eating", "grazing"], "difficult_direct_answer": false, "rationales": ["Horses don't have hands to eat with.", "They can be seen feeding", "They are putting the grass in their mouths."], "image": "train2014/COCO_train2014_000000099597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362828, "question_id": "DVGJR8wpaiPSftZz4EuLff", "question": "SEAL Robert O'Neill shots whom?", "choices": ["haram", "osama", "al-qaida", "abu"], "correct_choice_idx": 1, "direct_answers": ["osama", "bin laden", "osama", "bin laden", "osama bin-laden", "victim", "bin laden", "bin laden", "bin laden", "bin laden"], "difficult_direct_answer": false, "rationales": ["Robert o'neill shot osama bin laden.", "The victim was osama.", "This seal was the one who killed osama bin laden."], "image": "val2014/COCO_val2014_000000362828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569550, "question_id": "DVg8MZjNpCeup2b6KrDMJi", "question": "What minimalizing action can this chair be made to do?", "choices": ["disappear", "shrink", "blow up", "fold up"], "correct_choice_idx": 3, "direct_answers": ["collapse", "fold", "fold up", "fold up", "fold up", "fold", "fold", "fold", "fold", "folded"], "difficult_direct_answer": false, "rationales": ["It is a portable chair", "The chair can be folded for storage.", "This is a folding chair."], "image": "train2014/COCO_train2014_000000569550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471861, "question_id": "DVgZDCBi3PWBxwyFQYMjXq", "question": "Which nation is responsible for this beverage?", "choices": ["barbados", "puerto rico", "croatia", "trinidad tobago"], "correct_choice_idx": 3, "direct_answers": ["trinidad tobago", "crib", "germany", "trinidad and tobago", "caribbean islands", "caribbean", "caribbean", "caribbean", "germany", "caribbean"], "difficult_direct_answer": false, "rationales": ["This is a united states beverage based on the carribean", "The bottle says \"caribbean beer\", trinidad is in the caribbean.", "Trinidad and tobago produce this lager."], "image": "val2014/COCO_val2014_000000471861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570039, "question_id": "DVitVSiduTcM9nEupYLivB", "question": "What is the woman helping the child do?", "choices": ["cut nails", "brush teeth", "comb hair", "clean ears"], "correct_choice_idx": 1, "direct_answers": ["brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush", "brush teeth", "brush teeth", "brush", "brush teeth"], "difficult_direct_answer": false, "rationales": ["The kid is brushing their teeth.", "Good brushing habits starts as soon as children have some teeth in.", "The child is getting his teeth brushed."], "image": "val2014/COCO_val2014_000000570039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314390, "question_id": "DVqQM8x3ikXqkEaVx7Krhg", "question": "This type of event should create what kind of mood for the people attending?", "choices": ["excited", "joyous", "bored", "angry"], "correct_choice_idx": 1, "direct_answers": ["excited", "bonding", "fun", "happiness", "joyous", "festive", "festive", "happy", "biker festival", "jovial"], "difficult_direct_answer": true, "rationales": ["The people at the festival must be happy.", "It is a festival with lots of booths", "People are happy at festivals."], "image": "train2014/COCO_train2014_000000314390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465198, "question_id": "DVvQGrncHkS5cFiDXtMZKa", "question": "What surface is the girl playing on?", "choices": ["outdoor hard", "grass", "indoor hard", "clay"], "correct_choice_idx": 0, "direct_answers": ["clay", "asphalt", "tennis court", "tennis court", "clay", "clay", "court", "outdoor hard", "tennis court", "cement"], "difficult_direct_answer": false, "rationales": ["The woman is playing outside on a tennis court.", "It is a sunny day and there is no grass or dirt on the court.", "The girl is playing tennis on a court that is outside and has a hard material such as clay."], "image": "train2014/COCO_train2014_000000465198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355786, "question_id": "DVwAPhpZ8e3k2raN2ZbbCc", "question": "What is flying through the air?", "choices": ["stuffed animal", "livestock", "bear", "chicken"], "correct_choice_idx": 0, "direct_answers": ["stuffed animals", "trash", "toy", "trash", "caps hats", "bear", "stuffed animal", "stuffed animals", "toy", "teddy bear"], "difficult_direct_answer": false, "rationales": ["The people are throwing teddy bears, not chickens, other farm animals, or real bears.", "There are stuffed animals flying through the air and tossed onto the arena.", "There is a teddy bear."], "image": "train2014/COCO_train2014_000000355786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359925, "question_id": "DVzNtZVa7JfjGcjyQhnFSR", "question": "What is this horse being used for?", "choices": ["consumption", "transportation", "companionship", "riding"], "correct_choice_idx": 1, "direct_answers": ["transportation", "pulling carriage", "transportation", "transportation", "pull carriage", "pull carriage", "transportation", "pulling", "pull buggy", "rides"], "difficult_direct_answer": false, "rationales": ["The horse is pulling a cart carrying a couple.", "The horse is pulling the buggy, which is holding passengers in order to bring them to a new destination.", "The horse is pulling a cart."], "image": "val2014/COCO_val2014_000000359925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254177, "question_id": "DW74b7vYzpeRMAnGJCVSco", "question": "What is the dress code of the event he's going to?", "choices": ["casual", "formal", "business", "semi-formal"], "correct_choice_idx": 1, "direct_answers": ["formal", "formal", "formal", "formal", "black tie", "formal", "black tie", "formal", "formal", "formal"], "difficult_direct_answer": false, "rationales": ["He is likely to go to a formal dress event.", "He is attending a very fancy event as he is wearing a tuxedo.", "The code is formal."], "image": "train2014/COCO_train2014_000000254177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171062, "question_id": "DWAuuFCSwCbAHW6EeszAjY", "question": "What brand is the television?", "choices": ["sony", "toshiba", "jvc", "sharp"], "correct_choice_idx": 2, "direct_answers": ["jvc", "jvc", "jvc", "jvc", "jvc", "jvc", "jvc", "jvc", "jvc", "jvc"], "difficult_direct_answer": false, "rationales": ["There are letters on the center of the bottom of the tv that say \"jvc\".", "You can tell by the letters at the bottom of the television.", "The television has jvc's icon on it."], "image": "val2014/COCO_val2014_000000171062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120887, "question_id": "DWDm6vTSYvYTKry4fYVfxt", "question": "The design on the red sign looks like the symbol for what mathematical operation?", "choices": ["subtraction", "division", "multiplication", "addition"], "correct_choice_idx": 0, "direct_answers": ["minus", "minus", "minus", "subtraction", "minus", "minus", "minus", "subtraction", "subtraction", "subtraction"], "difficult_direct_answer": false, "rationales": ["The red sign has a white symbol that looks like a minus sign used in subtraction.", "The sign looks like the minus sign.", "A straight horizontal line is what a subtraction looks like in math."], "image": "train2014/COCO_train2014_000000120887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63912, "question_id": "DWJaEKjB9UbNqbmAJeGx3u", "question": "What weather is it on this rainy day?", "choices": ["thunder storm", "foggy", "windy", "humid"], "correct_choice_idx": 2, "direct_answers": ["rainy", "dreary", "rainy", "rain", "rain showers", "raining", "cloudy", "rain", "rainy", "windy"], "difficult_direct_answer": false, "rationales": ["The sky is dark and their is rain coming down.", "Most times during rain storms wind is accompanied by it.", "The person's umbrella is being blown around and turned inside-out."], "image": "train2014/COCO_train2014_000000063912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411727, "question_id": "DWjW9tC2ehRHn9ocgxgEfx", "question": "What profession does this person want to practice?", "choices": ["medicine", "programming", "psychology", "teaching"], "correct_choice_idx": 3, "direct_answers": ["leadership", "teacher", "pedagogy/teacher", "teacher", "teaching", "teaching", "teacher", "teaching", "system work", "educator"], "difficult_direct_answer": false, "rationales": ["The literature depicted mentions reading and the shaping of educational leadership, which is more consistent with a than the other listed choices.", "The book is about kids and learning", "The workbook references education and pedagogy which are related to this profession."], "image": "val2014/COCO_val2014_000000411727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498644, "question_id": "DXWtA8wiipc2nnrYAenDun", "question": "How many people at least are breakfasting together here?", "choices": ["six", "two", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two cups of coffee.", "There are 2 plates with 2 forks, and 2 drinks, suggesting each person has a drink and food item.", "There are two of everything so there should be two people there."], "image": "train2014/COCO_train2014_000000498644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360279, "question_id": "DXirua85DetcJa2sbxc2mP", "question": "Where does pizza comes from?", "choices": ["greece", "russia", "italy", "america"], "correct_choice_idx": 2, "direct_answers": ["italy", "italy", "italy", "italy", "oven", "oven", "oven", "italy", "dough", "oven"], "difficult_direct_answer": false, "rationales": ["That is where it is said to have originated", "Pizza is native to italy.", "Pizza is known as an italian food."], "image": "train2014/COCO_train2014_000000360279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350059, "question_id": "DXjcHQYnKe34TBCVgMCrPC", "question": "Where were the first bricks used?", "choices": ["middle east", "spain", "britain", "america"], "correct_choice_idx": 0, "direct_answers": ["building", "tell assad", "jericho", "tell assad", "tell assad", "building", "tell assad", "middle east", "buildings", "germany"], "difficult_direct_answer": false, "rationales": ["The bricks are in the middle east.", "The middle east used bricks first.", "The first bricks were used in the middle east for the construction of some of the world's first buildings."], "image": "train2014/COCO_train2014_000000350059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428740, "question_id": "DXwHM5xrHp4ML9ACuriJCj", "question": "What color is the metal tube at the top right corner of the image?", "choices": ["black", "red", "grey", "green"], "correct_choice_idx": 2, "direct_answers": ["silver", "silver", "silver", "grey", "silver", "silver", "gray", "silver", "gray", "gray"], "difficult_direct_answer": false, "rationales": ["This is obvious by just looking at the image.", "The tubing seen in this picture is grey in color.", "The tube is grey."], "image": "train2014/COCO_train2014_000000428740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493219, "question_id": "DY257RWZ9ZBLzz2ZMDk6JS", "question": "Which direction is the statue oriented?", "choices": ["sideways right", "away from", "towards", "sideways left"], "correct_choice_idx": 1, "direct_answers": ["infront", "away from", "backwards", "north", "away", "north west", "vertical", "right", "away", "forward"], "difficult_direct_answer": true, "rationales": ["You can not see the front of the statue from this angle only its back side.", "The face is pointing towards the bushes not the people", "Its face is not visible to the people on the bench or the photographer."], "image": "val2014/COCO_val2014_000000493219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333842, "question_id": "DYCfXxLiLxipqnK3aVECGf", "question": "What video game does the user of this office space like?", "choices": ["none", "super mario", "pac man", "what's app"], "correct_choice_idx": 1, "direct_answers": ["mario brothers", "mario", "pokemon", "nintendo", "mario", "donkey kong", "super mario", "super mario", "super mario", "pikachu"], "difficult_direct_answer": false, "rationales": ["There is a stuffed mario on the riser.", "The other characters aren't on display.", "The stuffed characters are all from this game."], "image": "train2014/COCO_train2014_000000333842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263929, "question_id": "DYdAzk3bSGXpyDXpYhqLhc", "question": "What is on the grass?", "choices": ["animals", "children", "pears", "eels"], "correct_choice_idx": 0, "direct_answers": ["cattle", "cows", "cattle", "snow", "animals", "cows", "rocks", "rocks", "rocks", "rocks"], "difficult_direct_answer": false, "rationales": ["There are cows on it.", "Cows are grazing in an open area. the ground is covered in green grass.", "There are many cows walking around and grazing on the grass."], "image": "val2014/COCO_val2014_000000263929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208169, "question_id": "DYpSdhNKefutGZ4yxVPptc", "question": "What is the main reason hundreds of bikers would be riding together down a main street?", "choices": ["conserve gas", "safest route", "short cut", "attention"], "correct_choice_idx": 3, "direct_answers": ["parade", "biker event", "parade", "motorcycle rally", "form movement", "rally", "showing force", "rally", "attention", "rally"], "difficult_direct_answer": false, "rationales": ["The reason is for attention.", "The attention when the bikes is ride together.", "The people are showing off their bikes."], "image": "val2014/COCO_val2014_000000208169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419599, "question_id": "DYtLZGRKdf3XDiHUkKyyJw", "question": "What type of top is the woman on the right wearing?", "choices": ["hoodie", "tank top", "blazer", "suit"], "correct_choice_idx": 0, "direct_answers": ["jacket", "jacket", "jacket", "jacket", "hoodie", "hoodie", "hoodie", "jacket", "hoodie", "hoodie"], "difficult_direct_answer": false, "rationales": ["The woman on the right is wearing a hoodie sweater.", "She has a jacket with a hood.", "The woman is wearing a hoodie."], "image": "train2014/COCO_train2014_000000419599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404065, "question_id": "DZ4bcpxUh9GfwpFkxCziUg", "question": "Who is this lady?", "choices": ["bus driver", "tourist", "homeless", "athlete"], "correct_choice_idx": 1, "direct_answers": ["homeless", "rider", "tourist", "tourist", "passenger", "commuter", "tourist", "traveller", "traveler", "potential passenger"], "difficult_direct_answer": false, "rationales": ["A woman is standing at a train station with luggage. people travel with luggage and by train.", "The woman is waiting for a ride. she has bags.", "She is standing at a bus stop with her backpack full of travel items."], "image": "train2014/COCO_train2014_000000404065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425847, "question_id": "DZDVVL6RB7KhV6WvVyA5Tq", "question": "How did the person pictured get to where they stand immediately prior?", "choices": ["skated", "flew", "walked", "skied"], "correct_choice_idx": 2, "direct_answers": ["stairs", "walked", "walking", "down steps", "walked", "down stairs", "skied", "down stairs", "descend stairs", "walked"], "difficult_direct_answer": false, "rationales": ["As shown with their prints in the snow.", "The person is carrying skis, not skates, but did not leave ski tracks on the stairs. a person cannot fly.", "You can see the footprints in the snow down the stairs."], "image": "train2014/COCO_train2014_000000425847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383905, "question_id": "DZDdX6ta3Got8fcAbRfKFw", "question": "The men are relying on what to move them?", "choices": ["elephants", "people", "car", "motor"], "correct_choice_idx": 0, "direct_answers": ["elephant", "elephants", "elephants", "elephants", "elephant", "elephants", "elephants", "elephants", "elephant", "elephants"], "difficult_direct_answer": false, "rationales": ["Elephants are large and used to transport things sometimes.", "The men won't get anywhere unless the elephants also move.", "In this country these animals are used often in ceremonies."], "image": "train2014/COCO_train2014_000000383905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543289, "question_id": "DZXPe2Ur7tbZK3MjWmtBVA", "question": "What's the term for the way the boats are parked near the land?", "choices": ["docked", "owned", "used", "finished"], "correct_choice_idx": 0, "direct_answers": ["single tie", "ashore", "docked", "docked", "docked", "parallel", "docked", "docked", "docked", "moored"], "difficult_direct_answer": false, "rationales": ["The boats are docked near the land.", "The term is commonly known based on the apparatus that boats are commonly connected to as extensions of the land.", "The boats are tied to the platform."], "image": "train2014/COCO_train2014_000000543289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223719, "question_id": "DZiMBmPFPQFYqbtxWsgq8t", "question": "What is she doing with the food?", "choices": ["trashing it", "eating", "stealing it", "selling it"], "correct_choice_idx": 3, "direct_answers": ["selling", "selling", "selling it", "selling", "selling it", "preparing", "presenting", "selling", "selling", "peeling"], "difficult_direct_answer": false, "rationales": ["She is set up and has too much food for just her so she must be selling it.", "The woman is selling food.", "A woman is sitting behind fruit that is on display on the beach."], "image": "train2014/COCO_train2014_000000223719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77591, "question_id": "DaJDcWqiBjX8nYBuEJhmq6", "question": "What is the relationship of the woman wearing black shirt to the man on her left in this setting?", "choices": ["competitor", "teammate", "coach", "coworker"], "correct_choice_idx": 1, "direct_answers": ["doubles partners", "teammate", "friend", "teammate", "teammate", "partner", "teammate", "partners", "doubles partner", "double partner"], "difficult_direct_answer": false, "rationales": ["They are working together.", "They are playing a sport and are on the same side of the net, so they are playing the game together.", "These people are on the same side of the court so they are on the same team."], "image": "val2014/COCO_val2014_000000077591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548559, "question_id": "DaMqYZqh3PFrhBAyjK4cJh", "question": "Why are so many trains parked side by side here what word describes this site?", "choices": ["racing", "train plant", "prison", "staging/parking"], "correct_choice_idx": 3, "direct_answers": ["station", "terminal", "depot", "trainyard", "train depot", "station", "fun", "staging/parking", "rail yard", "train station"], "difficult_direct_answer": true, "rationales": ["The site is for the trains to park.", "Trains do not race each other or go to prison. there is no factory near the trains.", "The trains are staged."], "image": "train2014/COCO_train2014_000000548559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61175, "question_id": "DaPVxCUK5mTYmVqRGaCj5b", "question": "Who is the lady wearing a green shirt?", "choices": ["audience", "referee", "tennis player", "staff"], "correct_choice_idx": 3, "direct_answers": ["staff", "wife", "fan", "manager", "referred", "winner", "vip", "reporter", "woman", "coach"], "difficult_direct_answer": true, "rationales": ["She is wearing a lanyard which is generally used to hold a company employee's name badge.", "The woman is on staff.", "The woman has a staff badge around her neck."], "image": "val2014/COCO_val2014_000000061175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95283, "question_id": "DaQZeBSAoAEgyifhiTeUCo", "question": "What has occurred in the scene?", "choices": ["car parking", "traffic jam", "accident", "car show"], "correct_choice_idx": 2, "direct_answers": ["fire", "unknown", "accident", "fire", "fire", "accident", "accident", "fire", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["A fire department truck has its lights on . there are people stopped in both their cars and two people standing looking at something in front.", "People are standing around and vehicles are around with emergency vehicles approaching. emergency vehicles come for accidents.", "Cars are all around with an emergency vehicle towards the front. people are standing near the cars and the lights are lit on the emergency vehicle."], "image": "val2014/COCO_val2014_000000095283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107239, "question_id": "DaTXyfV5sXmx2h8NCJNJKv", "question": "Which item here can be turned into something else without eating it?", "choices": ["pizza", "nothing", "pizza box", "woman"], "correct_choice_idx": 2, "direct_answers": ["box", "box", "pizza box", "crust", "box", "pizza", "box", "box", "cardboard", "pizza box"], "difficult_direct_answer": false, "rationales": ["The writing on the side of the box indicates that it can be recycled and thus turned into another product in time.", "The food is in a container. it could be modified into something else without eating it.", "The pizza box says it's recyclable."], "image": "val2014/COCO_val2014_000000107239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399147, "question_id": "DaVr6B6d5f8Q6KtdFpRPdx", "question": "What does lb mean as it is written on the signs?", "choices": ["pounds", "language barrier", "lemon bags", "liters"], "correct_choice_idx": 0, "direct_answers": ["pound", "pound", "pounds", "pound", "pounds", "pound", "pounds", "pound", "pound", "pound"], "difficult_direct_answer": false, "rationales": ["Lb. is short for pounds in english and is often used as an abbreviation.", "Lb means pounds of fruit.", "Many fruits and vegetables are sold by the pound. the vendors have a scale handy to weigh and price the produce purchased."], "image": "train2014/COCO_train2014_000000399147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292822, "question_id": "DafcBqwjiUupPuYqwnEmv8", "question": "What white item creates the tallest white here?", "choices": ["waves", "crests", "hats", "snow"], "correct_choice_idx": 3, "direct_answers": ["wave", "snow", "wave", "snow", "snow", "wave", "snow", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The frozen rain can build upon itself to make a mountain taller.", "Mountains have white areas on them. mountains often have snow on the tops of them.", "There are patches of snow on the mountain."], "image": "val2014/COCO_val2014_000000292822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424172, "question_id": "DafzMsHHMq92NJ3WGWibvC", "question": "What is the man who is standing doing?", "choices": ["jumping", "waving", "rowing", "eating"], "correct_choice_idx": 2, "direct_answers": ["surfing", "paddling", "paddling", "paddling", "surfing", "paddle boarding", "rowing", "rowing", "paddling", "paddle boarding"], "difficult_direct_answer": false, "rationales": ["The man is visibly holding a long pole. poles are a part of a paddle that would be seen above the water and a paddle is a traditional wave to move a small craft through water and rowing is another term for paddling.", "He is using poles.", "He is using his oar to maneuver through the water"], "image": "val2014/COCO_val2014_000000424172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101310, "question_id": "Db6SUCjzFPZjwwMcpcwYUw", "question": "The item on the left is traditionally used to carry what?", "choices": ["staples", "books", "bugs", "nails"], "correct_choice_idx": 1, "direct_answers": ["luggage", "books", "book", "luggage", "clothing", "books", "school books", "clothing", "luggage", "personal belongings"], "difficult_direct_answer": false, "rationales": ["A bookbag is on the floor. normally school kids carry all their school supplies in it.", "It is a backpack, which is most popularly used by school children to transport their books back and forth from school.", "Books are in backpacks."], "image": "train2014/COCO_train2014_000000101310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551024, "question_id": "DbasHNp8dPda9ZtKAxCmL2", "question": "Which color item on the plate has a plant origin?", "choices": ["pink", "yellow", "white", "brown"], "correct_choice_idx": 3, "direct_answers": ["brown", "brown", "brown", "brown", "brown", "yellow", "brown", "brown", "brown", "beans"], "difficult_direct_answer": false, "rationales": ["The soup comes from plants.", "The plate in the photo contains ham, eggs and beans. the only item which has a plant origin is the beans which are brown.", "Beans come from plants."], "image": "train2014/COCO_train2014_000000551024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242208, "question_id": "Dbhfuk2PYw9MaytkhWH9CN", "question": "What type of business does the person on the phone walk away from?", "choices": ["movie theater", "auto plant", "zoo", "disease control"], "correct_choice_idx": 0, "direct_answers": ["theatre", "retail", "movie theater", "clothing store", "tire shop", "mall", "movie theater", "biggest", "bus", "movie theater"], "difficult_direct_answer": false, "rationales": ["The person is walking away from the movie theater as seen by the title amc.", "The sign in the background is for amc. this company operates a chain of cinemas.", "The person on the phone is walking away from an amc movie theater."], "image": "val2014/COCO_val2014_000000242208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512785, "question_id": "Dbq6sV7dntqtuXHWkLFZsP", "question": "Who would most be likely to daydream about this spot?", "choices": ["deceased person", "busy worker", "newborn baby", "aquaphobe"], "correct_choice_idx": 1, "direct_answers": ["beachgoers", "beach lover", "beach lovers", "seniors", "working people", "tourist", "sight", "tourist", "busy worker", "human"], "difficult_direct_answer": true, "rationales": ["A person that works a lot would love a beach vacation.", "A person that works in a high stress job would daydream about this.", "Workers would daydream."], "image": "val2014/COCO_val2014_000000512785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37678, "question_id": "Dc6tnF6Sp35Fgj9CAuQUjv", "question": "Which anniversary is being celebrated?", "choices": ["150", "2013", "100", "1863"], "correct_choice_idx": 0, "direct_answers": ["150", "150 years", "steam engine", "150", "steam power", "steam", "steam", "150", "150", "150"], "difficult_direct_answer": false, "rationales": ["There are 150 years between 1863 and 2013 as commemorated on the sign.", "The celebration is a 150th anniversary as shown by the number.", "The years of the company and the celebration year itself is written on the sign above the locomotive."], "image": "val2014/COCO_val2014_000000037678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260345, "question_id": "DcPHKcZNJ7cXoPyP2RxWa6", "question": "What are the people walking through this area looking to do?", "choices": ["paint", "shop", "investigate", "race"], "correct_choice_idx": 1, "direct_answers": ["buy items", "shop", "shop", "shop", "shop", "shop", "shop", "buy items", "shop", "shop"], "difficult_direct_answer": false, "rationales": ["The people are going for a stroll through stores.", "This is an open air marketplace and there are wares on display for people to buy.", "This is a market. the people walking through this area might buy things from the vendors."], "image": "train2014/COCO_train2014_000000260345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263362, "question_id": "DdyCfuJs3CS5u8os6qEoiy", "question": "What kind of people visit this place throughout the year?", "choices": ["worshippers", "politicians", "tourists", "athletes"], "correct_choice_idx": 2, "direct_answers": ["tourists", "tourists", "tourists", "tourists", "tourists", "tourists", "tourists", "tourists", "tourist", "tourist"], "difficult_direct_answer": false, "rationales": ["Many people travel here to look at this sculpture.", "People are tourists.", "It is the capital of the united states and many people visit for sightseeing."], "image": "train2014/COCO_train2014_000000263362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207764, "question_id": "DeExQNPPLL2QxbmsAULz5B", "question": "Sharks often mistake these athletes for what?", "choices": ["coral", "seals", "octopi", "snakes"], "correct_choice_idx": 1, "direct_answers": ["seals", "seals", "food", "seals", "seals", "seals", "dolphins", "food", "seals", "seals"], "difficult_direct_answer": false, "rationales": ["Sharks mistake these athletes for food. sharks do not eat snakes or coral.", "The sharks eat seals.", "The athletes visible are surfers. because of the shape of a surfboard and a rider appear from below, it is suspected that this is a cause for the shark attacks reported on surfers."], "image": "train2014/COCO_train2014_000000207764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165855, "question_id": "DeK7FFAq2Bzh5CTm7g59QW", "question": "What vehicle is near the ladder?", "choices": ["tank", "car", "boat", "submarine"], "correct_choice_idx": 1, "direct_answers": ["car", "suv", "car", "car", "ford explorer", "ford explorer", "suv", "truck", "car", "van"], "difficult_direct_answer": false, "rationales": ["It has 4 wheels and is smaller than the airplane", "This is a civilian, not military, airport.", "There is a car on the tarmac under tunnel ladder."], "image": "train2014/COCO_train2014_000000165855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398505, "question_id": "DeNrBSASbzmcBdh7xt64FH", "question": "What do the persons on boards here wish for?", "choices": ["chocolate sundaes", "calm water", "big waves", "doldrums"], "correct_choice_idx": 2, "direct_answers": ["waves", "waves", "waves", "waves", "big waves", "waves", "waves", "waves", "waves", "waves"], "difficult_direct_answer": false, "rationales": ["Surfers are always looking for waves.", "The people want waves.", "People are on surfboards. surfers like big waves."], "image": "val2014/COCO_val2014_000000398505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544325, "question_id": "DeQxwv6Gk3rKem6gVcdXkE", "question": "What sort of setting is the gloved man standing in?", "choices": ["baseball field", "ice rink", "basketball court", "soccer field"], "correct_choice_idx": 0, "direct_answers": ["baseball field", "baseball field", "baseball field", "ball field", "baseball field", "centerfield", "baseball diamond", "baseball diamond", "baseball field", "infield"], "difficult_direct_answer": false, "rationales": ["The people are standing in grassy and dirt areas and wearing baseball apparel.", "The setting is a baseball field.", "The people have bats and mittens."], "image": "train2014/COCO_train2014_000000544325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343154, "question_id": "Df2yLfEJ7PNTdwZPy2V7CZ", "question": "What is the most likely number of people this person is preparing food for?", "choices": ["six", "one", "two", "million"], "correct_choice_idx": 0, "direct_answers": ["ten", "ten", "20", "six", "six", "four", "four", "six", "twenty", "24"], "difficult_direct_answer": false, "rationales": ["There are six people.", "There is too much food for one or two people, but it wouldn't feed a million.", "Food is prepared and laid out on a counter. there are six sandwiches made."], "image": "train2014/COCO_train2014_000000343154.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105732, "question_id": "DfRXFtaaEyryVLh2xYuJLX", "question": "What is being given here?", "choices": ["lesson", "music audition", "olympic trials", "initiation"], "correct_choice_idx": 0, "direct_answers": ["lesson", "skiing lessons", "lessons", "ski lesson", "lesson", "lessons", "lessons", "ski lessons", "lessons", "lesson"], "difficult_direct_answer": false, "rationales": ["People are learning to ski in ski school.", "Everyone is standing around trying to learn how to ski.", "A lesson is being given."], "image": "val2014/COCO_val2014_000000105732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285091, "question_id": "DfkMotbjPLTr5GYebLtMsK", "question": "What is the man drinking?", "choices": ["red wine", "white wine", "champagne", "beer"], "correct_choice_idx": 0, "direct_answers": ["wine", "wine", "juice", "wine", "wine", "red wine", "red wine", "juice", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["Based on the drinking vessel and the color of the liquid visible, answer a is the most likely.", "The man is enjoying red wine.", "The shade matches the hue that mostly resembles said color of wine. there is a pinkish tint at the top of the liquid."], "image": "train2014/COCO_train2014_000000285091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69121, "question_id": "DfknuoUqMBxzPT2kV5AiMR", "question": "What makes up the back splash?", "choices": ["river rocks", "wood", "tiles", "marble"], "correct_choice_idx": 0, "direct_answers": ["rocks", "water", "stone", "stones", "marble", "fake rocks", "stones", "river rocks", "stones", "stone"], "difficult_direct_answer": false, "rationales": ["The backsplash has many smooth rocks that would be found in a river.", "The rocks splash.", "The backsplash is made up of different colored rocks."], "image": "val2014/COCO_val2014_000000069121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373482, "question_id": "Dg3iXFeMFHXnW2fjobSnW5", "question": "What is the type of can the man has made of?", "choices": ["glass", "silver", "aluminum", "tin"], "correct_choice_idx": 2, "direct_answers": ["beer", "beer", "aluminum", "beer", "aluminum", "aluminum", "aluminum", "aluminum", "aluminum", "aluminum"], "difficult_direct_answer": false, "rationales": ["The man is drinking from a beer can, which is most commonly made from the material.", "The man's beverage is a beer and beer comes in aluminum cans.", "This is a lightweight metal that doesn't rust"], "image": "train2014/COCO_train2014_000000373482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501646, "question_id": "Dg6KndiAA79BJoUZxQLFnh", "question": "What's the wood area the boats are stopping at called?", "choices": ["horizon", "fences", "cargo", "docks"], "correct_choice_idx": 3, "direct_answers": ["dock", "dock", "dock", "dock", "pier", "docks", "docks", "docks", "dock", "dock"], "difficult_direct_answer": false, "rationales": ["The area is a dock.", "The wood area has wooden planks tightly placed against each other. they create a narrow walkway over the body of water between land and the boats.", "The wooden area the boats are stopping at are docks."], "image": "train2014/COCO_train2014_000000501646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384099, "question_id": "DgFdamRP46QoUUpbf6knUu", "question": "What fuels the stove?", "choices": ["gas", "charcoal", "microwave", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "gas", "electricity", "electricity", "gasoline", "gas", "electric", "electricity", "gasoline", "gas"], "difficult_direct_answer": false, "rationales": ["These are electric burners on the stove which means they are powered by electricity.", "A stove with electric burners is shown in a kitchen.", "The coils on the stove look like those of an electric stove top."], "image": "train2014/COCO_train2014_000000384099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378096, "question_id": "Dgqs2UfLZNSk2S5ZEmEe7H", "question": "Black grapes are used to produce which color wine?", "choices": ["red", "green", "purple", "black"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["Red wine comes from grapes that are deep in color.", "That is color of wine.", "The answer is common knowledge and not related to the image."], "image": "val2014/COCO_val2014_000000378096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463970, "question_id": "DgyD8sLmXziqgLCVc7PEsA", "question": "What place specializes in these items?", "choices": ["subway", "cheesecake factory", "dunkin donuts", "chipotle"], "correct_choice_idx": 2, "direct_answers": ["sweet", "pastry shop", "bakery", "dunkin donuts", "dunkin donuts", "cafes", "donut shop", "bakery", "bakery", "bakery"], "difficult_direct_answer": false, "rationales": ["Dunkin' donuts sells donuts.", "A doughnut shop would make better doughnuts.", "Dunkin' donuts is a donut shop that sells delicious donuts. some donuts are filled and some are glazed. a bakery would also sell donuts."], "image": "val2014/COCO_val2014_000000463970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396685, "question_id": "Dgz3hEMwbRrTwnbrnjvs6T", "question": "What type of fruit is in a container with nothing else?", "choices": ["pear", "apple", "nectarine", "peach"], "correct_choice_idx": 3, "direct_answers": ["pear", "peach", "peach", "pear", "peach", "peach", "peach", "peach", "peach", "peach"], "difficult_direct_answer": false, "rationales": ["It is light orange and has an indention where a seed was", "That is a peach by itself in a container.", "There is only one container that contains a single type of food. based on the color and shape of the food it would be answer a."], "image": "train2014/COCO_train2014_000000396685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449985, "question_id": "Dh9iBnQPxaQkyEpA5eEVtf", "question": "Who put the tag on the cows ear?", "choices": ["another cow", "dog", "human", "alien"], "correct_choice_idx": 2, "direct_answers": ["farmer", "farmer", "owner", "owner", "farmer", "human", "owner", "farmer", "farmer", "farmer"], "difficult_direct_answer": false, "rationales": ["Only a person can apply a clip onto a cow's ear.", "The tag must be put on by two hands.", "His tag is used for id."], "image": "train2014/COCO_train2014_000000449985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185930, "question_id": "DhCCHbDmoNsFhkDXJiz9wH", "question": "How many sinks are there?", "choices": ["six", "three", "four", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is a center sink flanked by two sinks", "All you have to do is count to get the answer.", "There are a trio of sinks visible."], "image": "val2014/COCO_val2014_000000185930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277799, "question_id": "DhCz4jrz6K3HwGi7NYSjFd", "question": "Where is the ball likely to go next?", "choices": ["ground", "outfield", "catcher's mitt", "pitcher"], "correct_choice_idx": 2, "direct_answers": ["far", "catcher's mitt", "left field", "bat", "catcher's mitt", "outfield", "to outfield", "catcher", "outfield", "outfield"], "difficult_direct_answer": false, "rationales": ["The ball is likely to go into the catcher's mitt.", "The batter has missed the ball therefore the catcher will get it next.", "The ball was just hit but does not look like it was hit with force."], "image": "train2014/COCO_train2014_000000277799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502894, "question_id": "DhH2d6ejxkNcJhm63PDASc", "question": "What is a basket on a bicycle called?", "choices": ["storage", "carrier", "compartment", "bicycle basket"], "correct_choice_idx": 3, "direct_answers": ["cargo holder", "bicycle basket", "pannier", "bicycle basket", "basket", "bicycle basket", "invalid question", "bicycle basket", "convenient", "bicycle basket"], "difficult_direct_answer": false, "rationales": ["A place to put things on a bike is a bicycle basket.", "The basket is on the bike.", "It is called that because it attaches to the bike and doesn't move while you ride"], "image": "train2014/COCO_train2014_000000502894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344292, "question_id": "DhJZmGLjt9t4RTtJLeu4Tw", "question": "What type of company most likely uses this location?", "choices": ["brewing", "dairy", "catering", "manufacturing"], "correct_choice_idx": 0, "direct_answers": ["food producing", "brewing", "restaurant", "culinary", "restaurant", "food", "restaurant", "restaurant", "food factory", "restaurant"], "difficult_direct_answer": false, "rationales": ["There are several large stainless vats", "A brewing company uses large hoppers.", "There are vats being used to heat up liquids."], "image": "train2014/COCO_train2014_000000344292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554636, "question_id": "DhYhSNVzHZTXXvYDEJvs3G", "question": "What type of business is advertised in white letters on the building?", "choices": ["electronics store", "sports retailer", "food chain", "repair shop"], "correct_choice_idx": 1, "direct_answers": ["clothing store", "sporting retail", "sports retailer", "lillywhites", "flower store", "retail", "bus", "protest", "sports retailer", "restuarant"], "difficult_direct_answer": true, "rationales": ["The business is for sports.", "Lillywhite is a type of sports clothing retailer advertised in white letters on the building far in the background.", "The business being advertised is for sports."], "image": "train2014/COCO_train2014_000000554636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402328, "question_id": "DheR27pLW9UttDLbYUnpdA", "question": "In which area are the bikers biking?", "choices": ["desert", "rural", "suburban", "tundra"], "correct_choice_idx": 2, "direct_answers": ["town", "neighborhood", "slums", "city", "suburban", "town", "parking lot", "sidewalk", "street", "parking lot"], "difficult_direct_answer": false, "rationales": ["The building shown are small and one storied.", "It is not a busy street, but there can be several residential homes seen in the background, suggesting this is a suburban environment.", "There are houses around."], "image": "val2014/COCO_val2014_000000402328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311271, "question_id": "DhfuEnLp9edY6T5BEvT27e", "question": "Why is the man reaching towards the cake?", "choices": ["to smash", "to decorate", "to wipe", "to cut"], "correct_choice_idx": 3, "direct_answers": ["cut", "cutting it", "to cut", "cutting cake", "to cut", "to cut", "to cut", "cut", "cutting it", "to cut"], "difficult_direct_answer": false, "rationales": ["This is a celebration and he has a knife in his hand", "He is going to slice it so everyone can have a piece to eat.", "The man is holding a knife. the cake is very large, and he is dividing it into individual slices making sure everyone gets a piece."], "image": "train2014/COCO_train2014_000000311271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75585, "question_id": "Dht2u8urvzVjRAabXnj4HL", "question": "What number is on the bus in the middle?", "choices": ["888", "202", "194", "456"], "correct_choice_idx": 2, "direct_answers": ["zero", "nine", "194", "194", "194", "194", "one ninety-four", "194", "194", "194"], "difficult_direct_answer": false, "rationales": ["The bus number is 194.", "If you can count then you know the numbers.", "You can see the number 194 when you look above the windows on the front of the bus."], "image": "train2014/COCO_train2014_000000075585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17766, "question_id": "Di5cHzWks7ZroePtK5kxwc", "question": "Why are his arms spread wide?", "choices": ["is falling", "maintain balance", "to fly", "is bouncing"], "correct_choice_idx": 1, "direct_answers": ["maintain balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["He wants to balance.", "The boy doesn't want to fall.", "The man is trying to stay upright."], "image": "train2014/COCO_train2014_000000017766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256577, "question_id": "DiGwn7hiYkT2NP5cMCrzkH", "question": "What does the dog have to do to achieve its goal?", "choices": ["open door", "heard sheep", "bite frisbee", "catch ball"], "correct_choice_idx": 2, "direct_answers": ["catch frisbee", "bite frisbee", "catch", "catch frisbee", "catch frisbee", "catch", "catch frisbee", "catch frisbee", "jump", "catch frisbees"], "difficult_direct_answer": false, "rationales": ["The dog needs to bite the frisbee.", "The dog needs to catch it in his mouth", "There are flying discs, but no balls, doors, or sheep, near the dog."], "image": "val2014/COCO_val2014_000000256577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564448, "question_id": "DiLpekWBscPg92pQnqzqxU", "question": "What type of hat is the kid wearing?", "choices": ["beanie", "fedora", "bucket hat", "baseball cap"], "correct_choice_idx": 2, "direct_answers": ["bucket hat", "beach hats", "sun hat", "sun hat", "sun hat", "sun hat", "sun bonnet", "sun", "sun hat", "sun hat"], "difficult_direct_answer": false, "rationales": ["The kid is wearing a bucket hat for the beach.", "This type of hat has a round top and a wide visor that circles all the way around. it resembles an upside down bucket.,.", "He has a yellow sun hat."], "image": "train2014/COCO_train2014_000000564448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320911, "question_id": "DigFVDgD4WbiW3KKoTMBmL", "question": "What is the thing the boy in the white hat is carrying made of?", "choices": ["metal", "leather", "cardboard", "stone"], "correct_choice_idx": 2, "direct_answers": ["cardboard", "box", "cardboard", "cardboard", "cardboard", "cardboard", "cardboard", "box", "cardboard", "cardboard"], "difficult_direct_answer": false, "rationales": ["This is obvious by the shape and color of the box.", "The boy with the hat is carrying something in a cardboard ox.", "By looking at it, you can tell the answer because of the color and the shape."], "image": "train2014/COCO_train2014_000000320911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270716, "question_id": "Diod9Ea3iiKzDEMSgz8NVW", "question": "Which car is in the greatest danger?", "choices": ["red pickup", "grey van", "black truck", "red sedan"], "correct_choice_idx": 3, "direct_answers": ["red one", "behind truck", "truck", "truck", "suv", "behind truck", "behind", "red sedan", "red car", "behind truck"], "difficult_direct_answer": false, "rationales": ["The red sedan could be hit by the truck.", "The bright colored car directly behind the logging truck is the one in greatest danger because a log could come off that truck and the car is not a safe distance behind the truck.", "The small car could be destroyed by the logs on the truck."], "image": "train2014/COCO_train2014_000000270716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117458, "question_id": "DiqxRbnAfwFjN6xgyTH4Uo", "question": "How do motorcyclists carry gear?", "choices": ["cart", "string", "luggage space", "donkey"], "correct_choice_idx": 2, "direct_answers": ["back side", "on back", "top box", "luggage space", "backseat", "basket", "box", "luggage rack", "boxes", "packs"], "difficult_direct_answer": true, "rationales": ["The gear is for luggage.", "Motorcyclists carry gear on the luggage carrier equipment on back.", "The motorcycle has luggage in the back."], "image": "val2014/COCO_val2014_000000117458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277188, "question_id": "Dj2KbvnDnm9iiJbvRxSi8Q", "question": "How many ambulances are there?", "choices": ["seven", "three", "eight", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["This is obvious simply by counting.", "The ambulances have the word written on the front.", "There are a trio of ambulances pictured."], "image": "train2014/COCO_train2014_000000277188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450458, "question_id": "DjA3DxknKPSDMo4Ffumc2T", "question": "Which number is closest to the number on the train?", "choices": ["325", "50", "240", "110"], "correct_choice_idx": 3, "direct_answers": ["one hundred", "264", "105", "one", "106", "bad question", "110", "onehundredsix", "one", "105"], "difficult_direct_answer": false, "rationales": ["The number 106 is written on this train. 110 is the closest number to that here listed.", "The number on the train is 106 and 110 is closest to that number from these choices.", "The number one hundred and six is shown."], "image": "val2014/COCO_val2014_000000450458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436300, "question_id": "DjAZqrCnoza3f2A6CEhTad", "question": "What is the color of the very top of the bus?", "choices": ["orange", "green", "yellow", "blue"], "correct_choice_idx": 2, "direct_answers": ["yellow", "white", "white", "yellow", "white", "cream", "creme", "cream", "yellow", "white"], "difficult_direct_answer": false, "rationales": ["The top of the bus has a yellowish line on top of a red one.", "It is a beige sort of color.", "A bus is two toned with red on the bottom and yellow on the top."], "image": "train2014/COCO_train2014_000000436300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286493, "question_id": "DjDWzdXgaAbZmraz3WbTgM", "question": "In what US city is this subway station located in?", "choices": ["los angeles", "chicago", "new york", "seattle"], "correct_choice_idx": 1, "direct_answers": ["chicago", "chicago", "new york", "illinois", "chicago", "chicago", "nyc", "boston", "chicago", "boston"], "difficult_direct_answer": false, "rationales": ["The subway station is in chicago based on the sign's locations.", "I believe 95th street is in chicago.", "The city is chicago."], "image": "val2014/COCO_val2014_000000286493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250801, "question_id": "DjDpqKcRQ97tRnxDep5dVf", "question": "The front window of the vehicle is open because it lacks what feature?", "choices": ["air conditioning", "windshield wipers", "heat", "locks"], "correct_choice_idx": 0, "direct_answers": ["human", "handle", "window", "glass", "working condition", "glass panes", "air conditioning", "clarity", "window", "window pane"], "difficult_direct_answer": true, "rationales": ["It is an old vehicle, which was made before air conditioning was a feature of cars. the window is open to allow proper ventilation for the animal inside.", "It would be really hot if the windows were closed.", "The answer is unknowable for sure, but based on the apparent age of the vehicle and a common reason for opening a window, answer a is likely."], "image": "train2014/COCO_train2014_000000250801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503202, "question_id": "DjFTNDtRPMeJvLc7aRoJJe", "question": "What is the man doing with the white remote?", "choices": ["gaming", "powering", "calling", "painting"], "correct_choice_idx": 0, "direct_answers": ["playing", "watching", "wii", "gaming", "playing wii", "playing wii", "playing tennis", "gaming", "playing games", "playing game"], "difficult_direct_answer": false, "rationales": ["He is playing a video game that requires the use of a motion controller.", "This is a wii remote", "The man is holding a wii controller and is pointing it at the screen in order to play tennis."], "image": "val2014/COCO_val2014_000000503202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197257, "question_id": "DjNxbexKrFPibq9GWctawY", "question": "What does the middle knob on the stove turn on?", "choices": ["oven", "right burners", "timer", "left burners"], "correct_choice_idx": 0, "direct_answers": ["oven", "oven", "oven", "oven", "stove", "oven", "stove", "oven", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["Typically the right two turn on the right burners and the left two turn on the left burners.", "The four other knows are for the four burners on the top of the stove.", "The middle knob is for the oven."], "image": "train2014/COCO_train2014_000000197257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421703, "question_id": "DjddYpK7s5wVAMedmufrKS", "question": "What food type is in the pan?", "choices": ["fruit", "meat", "vegetables", "candy"], "correct_choice_idx": 2, "direct_answers": ["broccoli", "broccoli", "vegetables", "vegetables", "broccoli", "vegetables", "vegetables", "broccoli", "stir fry", "broccoli"], "difficult_direct_answer": false, "rationales": ["There are vegetables in the pan being cooked.", "Broccoli is in a pan. broccoli is a vegetable.", "Broccoli and onions are being sauteed."], "image": "val2014/COCO_val2014_000000421703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212380, "question_id": "Djx9SWUHjrudGLBWGAYQUg", "question": "The man wearing the suit and tie is operating what object?", "choices": ["sedan", "coupe", "pickup truck", "suv"], "correct_choice_idx": 2, "direct_answers": ["truck", "audio player", "driving", "car", "truck", "car", "seatbelt", "pickup truck", "vehicle", "car"], "difficult_direct_answer": false, "rationales": ["The vehicle does not have a backseat.", "There is a truck bed behind the man.", "The man in the suit and tie is sitting in a pickup truck that doesn't have a backseat."], "image": "train2014/COCO_train2014_000000212380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523025, "question_id": "DkCvHb5JP9b9sEH2eXQYse", "question": "Why does the man have his head covered?", "choices": ["fashion", "warmth", "visibility", "safety"], "correct_choice_idx": 3, "direct_answers": ["protection", "protection", "protection", "hair", "protection", "protection", "protection", "hair", "safety", "hair"], "difficult_direct_answer": false, "rationales": ["It is for safety because he is skateboarding, and it is common to fall in this sport. hitting one's head from skateboarding would cause a great deal of trauma with no head protection.", "When doing dangerous activities protection is needed.", "So if he falls he does not hurt his head."], "image": "train2014/COCO_train2014_000000523025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144561, "question_id": "DknojKztB2wrYYMgGNmRTd", "question": "What is the blue plane used for?", "choices": ["commercial travel", "cargo shipping", "military exercises", "racing"], "correct_choice_idx": 0, "direct_answers": ["flying", "passengers", "transportation", "transporting people", "commercial travel", "transport", "transportation", "flight", "passenger transport", "transportation"], "difficult_direct_answer": false, "rationales": ["It has the name of the company on the tail", "Those are passenger windows.", "The blue plane is used for people to travel on since it's a southwest flight."], "image": "train2014/COCO_train2014_000000144561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282293, "question_id": "DkpB59RMBF45fejw8MYXM9", "question": "What street name or intersection can be clearly seen on the corner?", "choices": ["tow", "massachusetts", "perry", "roma"], "correct_choice_idx": 1, "direct_answers": ["massachusetts", "massachusetts", "massachusetts", "massachusetts", "massachusetts", "massachusetts", "massachusetts", "massachusetts", "massachusetts", "massachusetts"], "difficult_direct_answer": false, "rationales": ["The name is on the sign on the pole", "The street that is visible is massachusetts street.", "Massachusetts is the name on the street sign."], "image": "train2014/COCO_train2014_000000282293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70927, "question_id": "DkssJymAYPRdWP49Q9tcBF", "question": "How many different species of animals are grazing in the savannah?", "choices": ["eight", "two", "one", "seven"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "three", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are giraffes and zebras.", "There are giraffes and zebras.", "There are zebras and giraffes grazing in the savannah."], "image": "train2014/COCO_train2014_000000070927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105582, "question_id": "Dm5cXWjgQHNe8YiUNhTaCe", "question": "Why is there so much orange in this image?", "choices": ["sunset", "orange filter", "orange lights", "sunrise"], "correct_choice_idx": 2, "direct_answers": ["lights", "lighting", "lights", "lighting", "lights", "orange lights", "reflection", "light reflection", "lights", "lights"], "difficult_direct_answer": false, "rationales": ["The lights are orange.", "There are orange lights casting an orange hue.", "The lights are reflecting off the snow"], "image": "train2014/COCO_train2014_000000105582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147577, "question_id": "Dm8tMn6g3EZNkHa75uaEnR", "question": "Who created the mural?", "choices": ["queen", "dog", "baby", "artist"], "correct_choice_idx": 3, "direct_answers": ["artist", "artist", "artist", "artist", "artist", "artist", "artist", "artist", "artist", "artist"], "difficult_direct_answer": false, "rationales": ["An artist did this mural on the wall.", "The painting or mural is quite extravagant. it would take a professional like an artist to create this.", "An artist needed to have painted the mural."], "image": "val2014/COCO_val2014_000000147577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193112, "question_id": "DmLBJkShqnoZQnx4UFb3ps", "question": "What is the relationship between the men in the foreground in both images?", "choices": ["husbands", "competitors", "brothers", "same person"], "correct_choice_idx": 3, "direct_answers": ["identical twins", "same man", "same man", "same person", "friends", "same person", "same person", "same man", "same person", "same person"], "difficult_direct_answer": false, "rationales": ["It appears to be a. that said, it really could be b, c or d given that the person's head is pointing down and they may merely be two people dressed the same.", "The man's features are visible in both photos and his attire. because they match and the persons don't actually appear together within the same frame at the same time it is likely the same person.", "The person is one in the same."], "image": "val2014/COCO_val2014_000000193112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336360, "question_id": "DmMLmXyZcsbExV54GmtvsS", "question": "What region of the world is this scene at?", "choices": ["northern", "artic", "middle east", "southeastern"], "correct_choice_idx": 3, "direct_answers": ["asia", "asia", "china market", "southeastern", "asia", "southeast asia", "singapore", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The people and writing", "This looks like an asian area and that is in the south eastern side of the world.", "The oranges are at an asian market."], "image": "val2014/COCO_val2014_000000336360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346898, "question_id": "DmPAPXTZfFiSnJeCLSRkTm", "question": "What is standing in the center of the grass?", "choices": ["bobcat", "woman", "baby", "bear"], "correct_choice_idx": 1, "direct_answers": ["person", "woman", "good", "woman", "person", "person", "woman", "woman", "woman", "woman"], "difficult_direct_answer": false, "rationales": ["The other options aren't in this photo. this is obvious.", "It is an adult human.", "A woman is throwing a frisbee."], "image": "train2014/COCO_train2014_000000346898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67886, "question_id": "DmUJ9znLf6cuzDT38fXjGY", "question": "What natural phenomena will occur shortly?", "choices": ["moonrise", "sundown", "sunset", "moonset"], "correct_choice_idx": 2, "direct_answers": ["sunset", "sunset", "waves", "sunset", "sunset", "sunset", "sunset", "sunset", "sunset", "waves"], "difficult_direct_answer": false, "rationales": ["The sun is setting at the beach and it is low to the horizon and it is getting dark outside.", "People are walking on the beach and there is a warm glow and the sun is low in the sky.", "Earth's big star is obvious in the sky and it is a 50/50 chance that it is going down over the horizon."], "image": "val2014/COCO_val2014_000000067886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96712, "question_id": "DmY5DngV58SS23na4HaKMg", "question": "What are the kites in most danger of getting stuck on top of?", "choices": ["gazebo", "ground", "human", "sky"], "correct_choice_idx": 0, "direct_answers": ["roof", "gazebo", "drone", "gazebo", "roof", "roof", "pagoda", "building", "roof", "tall pavilion"], "difficult_direct_answer": false, "rationales": ["The gazebo is the tallest structure in the area.", "The roof of this structure is quiet tall and can snag the kites if they get too close.", "The gazebo can trap the kites."], "image": "train2014/COCO_train2014_000000096712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476731, "question_id": "DmmnVwEYSMuybKN4kbM2Pq", "question": "This person would be in the minority based on hair color in what country?", "choices": ["greece", "italy", "bulgaria", "finland"], "correct_choice_idx": 3, "direct_answers": ["sweden", "germany", "germany", "europe", "germany", "indian", "finland", "germany", "mexico", "sweden"], "difficult_direct_answer": false, "rationales": ["He has darker hair and a lot of people there have lighter hair and lighter skin features.", "Many people from finland have blonde hair and this person does not.", "In finland a lot of people have lighter hair color."], "image": "val2014/COCO_val2014_000000476731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422105, "question_id": "DnQ4Rpa7NQUosnzKcpanpo", "question": "Judging from the signage beneath the clock where is this structure located?", "choices": ["india", "south america", "europe", "asia"], "correct_choice_idx": 3, "direct_answers": ["thailand", "cho'-ben-thanh", "vietnam", "vietnam", "thailand", "asia", "traffic", "viet nam", "china", "asia"], "difficult_direct_answer": false, "rationales": ["The sign is in vietnamese.", "The writing beneath the clock is in vietnamese.", "The letters on the sign look like their are from the thai alphabet in asia."], "image": "train2014/COCO_train2014_000000422105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536972, "question_id": "DnRvNyEoJhCHZYgYqaWFk9", "question": "What is the cabinet to the left called?", "choices": ["dish rack", "safe", "display cabinet", "wine cabinet"], "correct_choice_idx": 2, "direct_answers": ["china cabinet", "display cabinet", "china cabinet", "china cabinet", "china cabinet", "pantry", "display cabinet", "cupboard", "modern cabinets", "curio"], "difficult_direct_answer": false, "rationales": ["Dishes are in a cabinet behind glass doors.", "The cabinet is displaying objects.", "That cabinet is for showing off dishes and kick knacks which is why it has a glass front."], "image": "val2014/COCO_val2014_000000536972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372804, "question_id": "DndVjyQkEYkUKNKR6t2ph9", "question": "The woman wearing the conical hat is a denizen of which nation?", "choices": ["vietnam", "thailand", "japan", "china"], "correct_choice_idx": 0, "direct_answers": ["vietnam", "china", "china", "china", "china", "japan", "asia", "china", "china", "cambodia"], "difficult_direct_answer": false, "rationales": ["The rice farmer hat is from vietnam.", "Vietnamese farmers wear the conical hat.", "The woman is in vietnam."], "image": "train2014/COCO_train2014_000000372804.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282855, "question_id": "DndoqwV847SAT6Y3snUxok", "question": "Why are there so many microwaves?", "choices": ["are stolen", "for sale", "hiding them", "many users"], "correct_choice_idx": 1, "direct_answers": ["communal kitchen", "restaurant kitchen", "for sale", "restaurant", "commercial kitchen", "for sale", "sale", "to sell", "display", "for sale"], "difficult_direct_answer": false, "rationales": ["The microwaves are lined up and plugged in for trial.", "The microwaves are for sale.", "If the items are stacked like this normally they would be sold."], "image": "train2014/COCO_train2014_000000282855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554953, "question_id": "DnfDCmBsdvXTR9X6uuDgx3", "question": "This man is likely how old?", "choices": ["forty", "fifty", "thirty", "seventy"], "correct_choice_idx": 3, "direct_answers": ["eighty", "elderly", "80", "80", "80", "eighty", "senior citizen", "seventy", "90s", "eighty"], "difficult_direct_answer": false, "rationales": ["The person is 70.", "There is no way to know for sure, but based on the appearance of the man and the gray hair, answer a is most fitting.", "The man has gray hair."], "image": "train2014/COCO_train2014_000000554953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137560, "question_id": "Do5cryRthr2BtWyrv4E8mu", "question": "What activity do the purple shirted children take part in?", "choices": ["tennis lesson", "racquetball", "running", "squash"], "correct_choice_idx": 0, "direct_answers": ["play tennis", "tennis", "tennis", "hooking", "tennis", "tennis", "tennis", "tennis lesson", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["These kids all have tennis racquets and are on a tennis court so they are likely learning tennis.", "They are holding rackets and are on a court", "The kids are all holding tennis rackets."], "image": "val2014/COCO_val2014_000000137560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397151, "question_id": "DoAmBEWgfKVqQ2oVDQzdNr", "question": "How will this item fly?", "choices": ["propellers", "wing power", "wind power", "engine"], "correct_choice_idx": 2, "direct_answers": ["wind current", "wind", "wind power", "wings", "wind", "wind", "by rope", "wind", "string", "wings"], "difficult_direct_answer": false, "rationales": ["The kite flies by wind.", "The item is a kite based on its shape, design and the kite string and spool visibly attached. kites use wind power to fly.", "Kites fly with the assistance of wind."], "image": "val2014/COCO_val2014_000000397151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469330, "question_id": "DoGF3zBXWSmiCNjtUAfJbj", "question": "What present location would be the best place to put put a small baby?", "choices": ["snow", "stroller", "under tent", "dog's back"], "correct_choice_idx": 1, "direct_answers": ["baby carriage", "stroller", "stroller", "stroller", "stroller", "carriage", "stroller", "stroller", "stroller", "baby carriage"], "difficult_direct_answer": false, "rationales": ["There is a visible stroller on the right side of the image. strollers are designed to contain babies and be safe for them.", "Strollers are a safe and handy place to put a baby so they can sleep and don't have to be carried when out and about.", "Babies are usually transported in strollers."], "image": "train2014/COCO_train2014_000000469330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150685, "question_id": "DoKi6hgvZywe6aJRyMBQ4U", "question": "What group could split all of these apples between each member evenly?", "choices": ["beatles", "nirvana", "cream", "green day"], "correct_choice_idx": 0, "direct_answers": ["foursome", "beatles", "quartet", "humans", "boy band", "team", "quad", "parent", "family", "quartet"], "difficult_direct_answer": true, "rationales": ["There are four apples in a bowl. there are four members of the beetles.", "The beatles has four members.", "There are four members in the fab 4."], "image": "val2014/COCO_val2014_000000150685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22870, "question_id": "DoMGt3EY9veYo9fAKFPxzo", "question": "The leashes need what to ensure the dogs are safe to avoid them from running away?", "choices": ["cat", "leash", "cart", "human"], "correct_choice_idx": 3, "direct_answers": ["human hand", "stakes", "person", "human holding", "held", "tree", "handlers", "person holding", "human", "safety"], "difficult_direct_answer": true, "rationales": ["The leashes are not attached to anything, so the dogs can get away unless a human holds the leash to use their strength to keep the dogs close to them.", "The leashes are loose now because no one is holding them so the dogs can run away if they want to.", "The leashes need to be held by the dogs' owner."], "image": "val2014/COCO_val2014_000000022870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508840, "question_id": "DoidHE2pWFAeWeSdFX3ZP4", "question": "What is the name of the brand in shirt?", "choices": ["nike", "adidas", "mizuno", "puma"], "correct_choice_idx": 0, "direct_answers": ["nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["This boy is wearing a shirt with the brand known for the swoosh logo.", "This company's most recognizable symbol is the swoosh visible on the front of the shirt.", "The name of the brand is nike."], "image": "train2014/COCO_train2014_000000508840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200446, "question_id": "DooCg2C5KibCo6DereW6cj", "question": "Which thing shown here is most offensive?", "choices": ["middle finger", "glasses", "open mouth", "sleeping person"], "correct_choice_idx": 0, "direct_answers": ["middle finger", "middle finger", "middle finger", "middle finger", "middle finger", "middle finger", "middle finger", "middle finger", "finger left", "middle finger"], "difficult_direct_answer": false, "rationales": ["The only offensive thing in the picture is the hand giving the middle finger.", "This gesture is offensive to most", "Glasses, sleeping people, and open mouths are not offensive."], "image": "train2014/COCO_train2014_000000200446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147972, "question_id": "DouX6wHySqWzcdTRr4oqNW", "question": "Of the more than 5 transportation options which requires more climbing for passengers to board?", "choices": ["motor bike", "bicycle", "elephant", "van"], "correct_choice_idx": 2, "direct_answers": ["elephant", "elephant", "car", "elephant", "elephant", "elephant", "elephant", "elephant", "car", "elephant"], "difficult_direct_answer": false, "rationales": ["The animal is so large that getting on top of it is not an easy task.", "The elephant is very large and would require assistance to get on.", "You can ride on a elephant that carry more."], "image": "train2014/COCO_train2014_000000147972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227205, "question_id": "DozqTknscbfEkVJzaBgAVx", "question": "In which direction is the train going that is behind the raised arm?", "choices": ["forwards", "backwards", "nowhere", "not train"], "correct_choice_idx": 2, "direct_answers": ["foward", "left", "nowhere", "left", "front", "east", "left", "right", "right", "opposite"], "difficult_direct_answer": false, "rationales": ["The train at the station is not going anywhere at the moment. its smoke is going straight up.", "The train looks to be stopped.", "The train is not moving."], "image": "train2014/COCO_train2014_000000227205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510150, "question_id": "Dp7PoJADvSvXuW7YqNZLx9", "question": "Why do cats have mustaches?", "choices": ["none", "identity", "sensation", "extension"], "correct_choice_idx": 2, "direct_answers": ["feeling", "whiskers", "sensation", "fur", "navigation", "whiskers", "sensory input", "they don't", "whiskers", "fur"], "difficult_direct_answer": false, "rationales": ["Cats need to be able to sense things.", "These are like sensors", "Cats have mustaches so they can sense things around them."], "image": "val2014/COCO_val2014_000000510150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241113, "question_id": "DpBtY9UQeP3tQXPBkwENaa", "question": "The markings on the fairings of the motorcycle indicate that it belongs to which type of public organization?", "choices": ["fire department", "public health", "public works", "police"], "correct_choice_idx": 1, "direct_answers": ["ambulance", "police", "public health", "police", "police", "traffic police", "policing", "police", "police", "police"], "difficult_direct_answer": false, "rationales": ["The motorcycle says ambulance.", "The markings on the motorcycle are for public health.", "Police vehicles have blue coloring on them."], "image": "val2014/COCO_val2014_000000241113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296994, "question_id": "DpFVB2grMqCbjgwkyvTxMC", "question": "In which location is this man drinking?", "choices": ["mall", "city sidewalk", "private yard", "public park"], "correct_choice_idx": 3, "direct_answers": ["north", "park", "park", "left", "park", "park", "can", "public park", "park", "hand"], "difficult_direct_answer": false, "rationales": ["He looks to be in a park.", "Large outdoor areas with public benches and ponds are usually city funded parks.", "The outdoor setting is consistent with answer a in addition to the notice board which is commonly found in answer a."], "image": "val2014/COCO_val2014_000000296994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97173, "question_id": "DpNBaJWRuNPh8ZZznWphuQ", "question": "Who created the character on the girls dress?", "choices": ["walt disney", "dreamworks", "pixar", "warner brothers"], "correct_choice_idx": 0, "direct_answers": ["disney", "disney", "walt disney", "walt disney", "disney", "walt disney", "disney", "mini mouse", "disney", "walt disney"], "difficult_direct_answer": false, "rationales": ["Walt disney created minnie mouse.", "The character on the girl's dress is mickey mouse who was created by walt disney.", "Disney created the character."], "image": "train2014/COCO_train2014_000000097173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341736, "question_id": "DpUhGyakPqqXPdQKyTTLdA", "question": "Which way is the one way arrow pointing?", "choices": ["up", "down", "left", "right"], "correct_choice_idx": 2, "direct_answers": ["left", "left", "left", "right", "left", "right side", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["The arrow is in a leftward direction.", "The arrow is pointing to the left.", "And technically also \"one\" way."], "image": "train2014/COCO_train2014_000000341736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389503, "question_id": "DpjLxYurD5DtjRbvAiaCVT", "question": "What is the person riding?", "choices": ["waves", "scooter", "car", "skateboard"], "correct_choice_idx": 0, "direct_answers": ["surf board", "surfboard", "surfboard", "surfboard", "board", "tidal bore", "river wave", "waves", "surf board", "surfboard"], "difficult_direct_answer": false, "rationales": ["The person is riding the wave with a surfboard.", "He is on a surfboard riding the water down a river", "The water is cresting. he is riding the crests."], "image": "val2014/COCO_val2014_000000389503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468545, "question_id": "DpoErp5DqrMrhXMbuW82o3", "question": "Which deadly creature is most likely to be lurking nearby?", "choices": ["tiger", "shark", "whale", "elephant"], "correct_choice_idx": 1, "direct_answers": ["scorpion", "shark", "shark", "shark", "shark", "shark", "scorpion", "shark", "shark", "shark"], "difficult_direct_answer": false, "rationales": ["The ocean is where they live", "Sharks are in the ocean.", "The skate park is on a beach near the ocean."], "image": "val2014/COCO_val2014_000000468545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118674, "question_id": "DpvUVzWwyQ8UY3uKK9PboF", "question": "What are they waiting for?", "choices": ["horses", "lunch", "traffic signals", "directions"], "correct_choice_idx": 2, "direct_answers": ["crossing light", "school bus", "traffic signals", "bus", "bus", "bus", "crossing", "bus", "walk sign", "bus"], "difficult_direct_answer": false, "rationales": ["They are standing next to a pole on the edge of the street.", "The signal will tell them when it's time to cross.", "They're waiting for the traffic signal to change so they cross the road."], "image": "train2014/COCO_train2014_000000118674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46339, "question_id": "Dpz2b4u8F2Ddc5MippJgpa", "question": "What is odd about the mannequin in the foreground?", "choices": ["painted red", "no pants", "human", "broken nose"], "correct_choice_idx": 1, "direct_answers": ["no pants", "pantless", "no arms", "no pants", "no pants", "no pants", "no pants", "no pants", "no pants", "no pants"], "difficult_direct_answer": false, "rationales": ["His is wearing a jacket but lacks pants, his nose is not broken, he has a human figure and he is white not red.", "The legs seem to not be covered at all with any clothing.", "The man isn't wearing pants with his jacket."], "image": "train2014/COCO_train2014_000000046339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415365, "question_id": "Dq63mxxoQmDdfNzd4Sgc9U", "question": "How many animal species are present?", "choices": ["seven", "two", "one", "seventeen"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "2 species", "eighteen", "two", "eighteen", "two", "eighteen", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are sheep and a cat.", "Cats and sheep are each an animal species. there are sheep and a cat in this photo.", "There are two species."], "image": "train2014/COCO_train2014_000000415365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122436, "question_id": "DqQZhn7xzDFnUQfB24Fwzu", "question": "What material are the two brown sticks made of?", "choices": ["bamboo", "plastic", "metal", "wood"], "correct_choice_idx": 0, "direct_answers": ["wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "bamboo"], "difficult_direct_answer": false, "rationales": ["The material is bamboo.", "Because the stick consist of wood.", "The toothpicks are made of wood."], "image": "train2014/COCO_train2014_000000122436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526955, "question_id": "Dqc9dXYvnBotHTaQWDZzdZ", "question": "What is she doing?", "choices": ["eating cupcake", "posing", "fixing hand", "chewing finger"], "correct_choice_idx": 0, "direct_answers": ["playing", "eating", "eating", "eating fruit", "eating", "eating fruit", "eating", "eating", "eating cupcake", "eating"], "difficult_direct_answer": false, "rationales": ["She is eating a cupcake.", "She is eating a cupcake.", "The child has something that they're eating."], "image": "val2014/COCO_val2014_000000526955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431712, "question_id": "DqdA6CfVT3keoc9fG4s9GD", "question": "These people are on what continent?", "choices": ["asia", "south america", "africa", "north america"], "correct_choice_idx": 2, "direct_answers": ["african", "asia", "africa", "madagascar", "africa", "africa", "africa", "africa", "africa", "africa"], "difficult_direct_answer": false, "rationales": ["The people are african.", "Madagascar is in africa.", "The sign in the back shows the name of madagascar and that is in africa."], "image": "train2014/COCO_train2014_000000431712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94453, "question_id": "Dqegk4T89x64zrck773WAp", "question": "What is the parking limit in hours at these meters?", "choices": ["ten", "three", "two", "one"], "correct_choice_idx": 0, "direct_answers": ["ten hours", "ten hours", "ten hours", "ten", "ten", "ten hours", "ten", "ten", "ten hours", "ten"], "difficult_direct_answer": false, "rationales": ["The sign states this.", "It is listed on the sign", "The meters allows for ten minutes."], "image": "train2014/COCO_train2014_000000094453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295106, "question_id": "DqkMF7msJP6b3wtWg6hfLm", "question": "How many people will dine at this table?", "choices": ["two", "five", "none", "one"], "correct_choice_idx": 3, "direct_answers": ["one person", "two", "one", "one", "one", "one", "one person", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is only food for one person.", "There is only one cup of water and main dish at this table.", "There is only one entree plated."], "image": "train2014/COCO_train2014_000000295106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210342, "question_id": "DqoPmfenEnhkQTkD2PmEu6", "question": "What type of wet area is nearby?", "choices": ["stream", "pond", "ocean", "swimming pool"], "correct_choice_idx": 2, "direct_answers": ["ocean", "beach", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["Surfing requires ocean waves and there are surfboards and sand all around.", "The surfboards are lined up on the sand at a beach near the ocean.", "The surfboards are used in the ocean on the waves so they must be near the ocean"], "image": "val2014/COCO_val2014_000000210342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373574, "question_id": "DqoXqxpBn3sHZdZrBJTngv", "question": "What device is next to the TV?", "choices": ["kindle", "tablet", "laptop", "cell phone"], "correct_choice_idx": 2, "direct_answers": ["laptop", "laptop", "laptop", "computer", "computer", "laptop", "bottles", "laptop", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["The item is a device that has a screen and keyboard. it fits on your lap, so it is a laptop.", "There is a portable computer.", "A laptop is open with messages shown."], "image": "val2014/COCO_val2014_000000373574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402335, "question_id": "DqzTW2E2DQQCg8P5y4AveH", "question": "What direction is the woman on the red motorcycle traveling?", "choices": ["left", "backwards", "right", "forward"], "correct_choice_idx": 3, "direct_answers": ["straight", "straight", "right", "forward", "north", "forward", "straight", "forward", "forward", "forward"], "difficult_direct_answer": false, "rationales": ["She is moving with traffic all going the same way.", "Her vehicle is faced forward away from the photo.", "The direction is forward."], "image": "train2014/COCO_train2014_000000402335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562575, "question_id": "Dr2565x6brmmpvoWLrEY3V", "question": "Where is the rider of the bike?", "choices": ["at home", "in store", "behind bus", "in bus"], "correct_choice_idx": 3, "direct_answers": ["bus", "inside bus", "on bus", "on bus", "inside bus", "in bus", "on bus", "on bus", "inside bus", "inside bus"], "difficult_direct_answer": false, "rationales": ["The bike is stored on the front of the bus because it belongs to one of the passengers. there is not room inside the bus for the bike and the rack provides convenient storage.", "Since he has attached the bike to the front of the bus, he must be riding inside it.", "The rider of the bike that is mounted on front of the bus is riding inside the bus."], "image": "train2014/COCO_train2014_000000562575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247057, "question_id": "Dr5qeoanimr7RGuQ8Fwp5H", "question": "What position does this player have the snowboards center point on the rail?", "choices": ["flat", "parallel", "right turn", "curved"], "correct_choice_idx": 0, "direct_answers": ["flat", "hitting", "perpendicular", "almost centered", "scatting", "mid", "center", "in balance", "perpendicular", "middle horizontal"], "difficult_direct_answer": true, "rationales": ["In order for the man to balance they must have the snowboard flat on the pipe.", "Flat is the position.", "The snowboard is vertical across the rail."], "image": "val2014/COCO_val2014_000000247057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343422, "question_id": "DrBiLX7UBeb8ZUNsJ95jR5", "question": "Where are they eating?", "choices": ["outside", "roof", "indoors", "basement"], "correct_choice_idx": 0, "direct_answers": ["lunch", "patio", "soup", "food court", "lunch", "outdoor cafe", "cafe", "outside", "outdoor cafe", "outside"], "difficult_direct_answer": false, "rationales": ["People are outside.", "The people are eating outside.", "They are all eating outside because there are pedestrians on the sidewalk."], "image": "train2014/COCO_train2014_000000343422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149233, "question_id": "DrEFDhd38DUnGdRQKy7ybF", "question": "What material is the bat he is holding made of?", "choices": ["wood", "sheetrock", "steel", "plastic"], "correct_choice_idx": 2, "direct_answers": ["aluminum", "aluminum", "aluminum", "metal", "wood", "aluminum", "metal", "aluminum", "steel", "aluminum"], "difficult_direct_answer": false, "rationales": ["The bat is a baseball bat which is made of steel.", "There is no wood present on the equipment the man is holding.", "By looking at the bat you can tell it is not made out of wood plastic or sheetrock"], "image": "train2014/COCO_train2014_000000149233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7320, "question_id": "DrEzANTrUeWdnXdNJAn4Zm", "question": "What is the venue shown in the image?", "choices": ["living room", "show room", "office", "electronics store"], "correct_choice_idx": 1, "direct_answers": ["video game", "nintendo wii", "show room", "theater", "movie set", "gaming", "gaming", "construction", "studio", "stage"], "difficult_direct_answer": true, "rationales": ["The game system is on display in a show room. people can look at the system and give it a try before they buy.", "The living room has the wii remote.", "There is a sign near the tv so it is a show room."], "image": "val2014/COCO_val2014_000000007320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101559, "question_id": "DrFKrL3piijzHpVYydKjBb", "question": "Which objects here can be used to cut other objects?", "choices": ["knives", "potatoes", "carrots", "bottles"], "correct_choice_idx": 0, "direct_answers": ["knives", "knives", "knife", "knife", "knives", "knives", "knives", "knife", "knives", "choose"], "difficult_direct_answer": false, "rationales": ["These are sharp blades with handles", "The knives that are hanging on the back can be used to slice or cut the vegetables.", "Sharp things can cut. there are sharp things hanging on a rack on the wall."], "image": "train2014/COCO_train2014_000000101559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365008, "question_id": "DrYFKkKfHwPfdCPrji4KTi", "question": "What kind behavior is displayed here?", "choices": ["playful", "loving", "aggressive", "friendly"], "correct_choice_idx": 2, "direct_answers": ["animals play", "aggressive", "aggressive", "fighting", "running", "aggressive", "fight", "heard mentality", "aggressive", "kicking"], "difficult_direct_answer": false, "rationales": ["The behavior is aggressive.", "A zebra is seen ramming into another zebra.", "The animal is trying to display dominance"], "image": "train2014/COCO_train2014_000000365008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327352, "question_id": "DraYtw8WfV2Fjo9iyAAbSA", "question": "The color that is most prevalent on the left vehicle is found on what flag?", "choices": ["latvia", "canada", "sierra leone", "germany"], "correct_choice_idx": 2, "direct_answers": ["kazakhstan", "england", "green", "sierra leone", "italy", "uk", "mexico", "afghanistan", "brazil", "spain"], "difficult_direct_answer": true, "rationales": ["The color is for sierra leone.", "Sierra leone has green on its flag.", "The left vehicle is mostly green, not black, red, yellow, or white."], "image": "val2014/COCO_val2014_000000327352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38748, "question_id": "DreTUA5tndWEDuhMbyzdwi", "question": "What is the man doing on his phone?", "choices": ["texting", "reading", "posting", "deleting information"], "correct_choice_idx": 1, "direct_answers": ["browsing", "reading it", "reading", "checking feed", "reading thinking", "checking phone", "investigating", "watching image", "reading phone", "reading"], "difficult_direct_answer": true, "rationales": ["Although the man is holding the phone, he is not touching the screen nor giving commands. therefore, he cannot be texting, deleting or posting.", "The man is reading.", "He is only looking at the screen, not using the keyboard."], "image": "train2014/COCO_train2014_000000038748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400241, "question_id": "Drjh9AvZWJkLKUZFkAZKr5", "question": "What geographical region is partially shown on the map?", "choices": ["australia", "michigan", "florida", "china"], "correct_choice_idx": 2, "direct_answers": ["sky", "florida", "florida", "florida", "florida", "florida", "florida", "florida", "florida", "sky"], "difficult_direct_answer": false, "rationales": ["The map shows part of the florida peninsula.", "The map shape matches the unique shape of the state that looks like a gun.", "You can see part of florida on the map"], "image": "train2014/COCO_train2014_000000400241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564496, "question_id": "DrqFfAdy3Rbwbp4PkYJu8t", "question": "Why is he licking his lips?", "choices": ["lips dry", "teasing", "concentration", "handicap"], "correct_choice_idx": 2, "direct_answers": ["concentrating", "concentration", "dry lips", "confident", "concentration", "concentration", "focusing", "concentrating", "concentrating", "concentrating"], "difficult_direct_answer": false, "rationales": ["The man is licking his lips because he is concentrating hard on his swing.", "The man is focusing on hitting the ball.", "He is trying to hit a tennis ball. it takes focus to play tennis."], "image": "train2014/COCO_train2014_000000564496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243504, "question_id": "DrrYJdaa7dv9Fmr3XmaP8o", "question": "What is the spice in the glass jar with the red top in the foreground?", "choices": ["cloves", "nutmeg", "pepper", "cinnamon"], "correct_choice_idx": 2, "direct_answers": ["pepper", "black pepper", "black pepper", "pepper", "pepper", "pepper", "pepper", "pepper", "pepper", "black pepper"], "difficult_direct_answer": false, "rationales": ["The color consistency", "There is pepper pieces inside of this glass jar.", "By the colors in the small bottle you can tell what it is."], "image": "train2014/COCO_train2014_000000243504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500896, "question_id": "Drxnjm3rsHskvQ6v2xRTQa", "question": "Who is the young girl to the older man?", "choices": ["daughter", "sister", "cousin", "student"], "correct_choice_idx": 0, "direct_answers": ["daughter", "dad", "daughter", "dad", "daughter", "daughter", "daughter", "daughter", "dad", "daughter"], "difficult_direct_answer": false, "rationales": ["The warm condition shows this is a daughter.", "He is holding her hand and guiding her on the skis.", "The girl is his daughter."], "image": "train2014/COCO_train2014_000000500896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349322, "question_id": "DryHr8cDkCwPvMMTf7YyNc", "question": "What is a favorite condiment of the owner?", "choices": ["soy sauce", "mustard", "relish", "ketchup"], "correct_choice_idx": 3, "direct_answers": ["ketchup", "ketchup", "ketchup", "ketchup", "vinegar", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup"], "difficult_direct_answer": false, "rationales": ["There is a bottle of ketchup on the counter.", "The answer is not knowable, but answer a is present in the image while the other answers are not which could indicate the owner's preference.", "There is a ketchup bottle on the counter which is the only condiment."], "image": "train2014/COCO_train2014_000000349322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130749, "question_id": "Ds97npVw2tkSGEfVVNupY2", "question": "What type of transportation hub is this train in?", "choices": ["airport", "train station", "bus station", "subway"], "correct_choice_idx": 0, "direct_answers": ["subway", "airport", "airport", "airport", "train", "airport", "airport", "airport", "subway", "train"], "difficult_direct_answer": false, "rationales": ["The sign says baggage claim.", "Concourses are usually associated with air travel; these locations are often large and require quick transportation from one area to another.", "A train entrance is shown with signs for terminals above."], "image": "train2014/COCO_train2014_000000130749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177207, "question_id": "DsPXP7aEfFXuzFKRykppqw", "question": "What is being done here?", "choices": ["ski lesson", "criticizing", "punishment", "shaming"], "correct_choice_idx": 0, "direct_answers": ["ski lesson", "skiing lesson", "skiing", "teaching", "skiing", "skiing", "skiing", "trees", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["There are people skiing with an instructor beside them.", "The people are getting a lesson in skiing.", "There is a ski instructor."], "image": "val2014/COCO_val2014_000000177207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358200, "question_id": "DtV3fGFdP4u5TyBJgbXYKz", "question": "What continent is this country located at?", "choices": ["north america", "asia", "south america", "australia"], "correct_choice_idx": 0, "direct_answers": ["north america", "north america", "north america", "north america", "north america", "north america", "north america", "north america", "north america", "north america"], "difficult_direct_answer": false, "rationales": ["The country is in north america since it's canada.", "The country here is a maple leaf flag in north america.", "The airline air canada is in north america."], "image": "train2014/COCO_train2014_000000358200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393241, "question_id": "DtstzAs7h3iPNtP9nJvnLt", "question": "Why does the woman in green have her arms out?", "choices": ["to hug", "to block", "to exercise", "to wave"], "correct_choice_idx": 1, "direct_answers": ["to block", "defense", "blocking frisbee", "blocking", "to prevent", "throw frisbee", "blocking opponent", "blocking", "to play", "to block"], "difficult_direct_answer": false, "rationales": ["The woman on the right is trying to throw the frisbee and the woman in green is trying to stop her.", "The woman is blocking the other guy.", "The people playing frisbee are trying to block each other or get the frisbee."], "image": "train2014/COCO_train2014_000000393241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510684, "question_id": "DuD9vnYW96Bb6HC463E2zz", "question": "In a house what room is this typically called?", "choices": ["entertainment room", "kitchen", "dining room", "living room"], "correct_choice_idx": 0, "direct_answers": ["foyer", "foyer", "dining room", "entertainment room", "entryway", "entryway", "entryway", "living room", "foyer", "den"], "difficult_direct_answer": false, "rationales": ["This area is meant for entertainment because of the red frame.", "This room is typically called an entertainment room.", "There isn't much furniture blocking the floor"], "image": "train2014/COCO_train2014_000000510684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483550, "question_id": "DuSfYojjQRrAEH7TVWB7sr", "question": "Why is the maintenance guy wearing protection on his hands?", "choices": ["sticky", "dirty", "heat", "sharp"], "correct_choice_idx": 2, "direct_answers": ["hot steam", "gloves", "for heat", "hot metal", "hot steam", "prevent injuries", "heat protection", "heat", "hot metal", "protection"], "difficult_direct_answer": false, "rationales": ["A man is on a train near pipes that are steaming. train parts get hot.", "The man is wearing gloves on this hands to protect them from the hot steam.", "These warm up from friction of things moving through the metal pipes"], "image": "val2014/COCO_val2014_000000483550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539719, "question_id": "DuaM3VnRrXC2LzBXPUvAgU", "question": "What is the laptop owner a fan of according to the note?", "choices": ["avengers", "eternals", "star trek", "star wars"], "correct_choice_idx": 3, "direct_answers": ["star wars", "star wars", "star wars", "dangerous", "star wars", "star wars", "dangerous", "star wars", "dangerous", "star wars"], "difficult_direct_answer": false, "rationales": ["The person likes stars wars according to the meme.", "These are characters from the movie", "References to han solo and wookies reference characters from a beloved movie and book franchise."], "image": "train2014/COCO_train2014_000000539719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540196, "question_id": "Dv4wwm9TKuAQUKSrH2HAwJ", "question": "What type of eyesight does the man standing here have?", "choices": ["2020", "near sighted", "perfect", "far sighted"], "correct_choice_idx": 3, "direct_answers": ["poor", "near sighted", "far sighted", "farsighted", "farsighted", "needs glasses", "far sighted", "poor", "bad", "poor"], "difficult_direct_answer": false, "rationales": ["The man can see far distances since he's not wearing them to look out into the distance.", "The man has glasses around his neck.", "The man's reading glasses are on a strap so that he can put them on when he needs to read something close up."], "image": "val2014/COCO_val2014_000000540196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547524, "question_id": "Dv5FDg7JuJnKn3Grhds6qt", "question": "What kind of material is the left wall?", "choices": ["bamboo", "wood", "stone", "cement"], "correct_choice_idx": 2, "direct_answers": ["stone", "rock", "moss", "stone", "stone", "carpet", "stucco", "stone", "granite", "glass"], "difficult_direct_answer": false, "rationales": ["The wall is a rough texture, looks very hard, and is porous and slanted and imperfectly cut in a way that looks not entirely manmade.", "The wall is made of rock.", "The wall looks like it is poured and molded cement."], "image": "train2014/COCO_train2014_000000547524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109506, "question_id": "DvFWU9xbyN5Ab5N5P34ESE", "question": "Who are in the background?", "choices": ["boys", "women", "girls", "men"], "correct_choice_idx": 1, "direct_answers": ["women", "two women", "two women", "people", "two women", "people", "women", "women", "women", "two women"], "difficult_direct_answer": false, "rationales": ["They have feminine faces.", "There are two women in the background.", "Their clothing and hairstyle makes them look like they are female adults"], "image": "train2014/COCO_train2014_000000109506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581458, "question_id": "DvGa6B68MBi8JbWEMggSGj", "question": "What type train is shown here?", "choices": ["oil", "coal", "solar", "elevated"], "correct_choice_idx": 3, "direct_answers": ["elevated", "metro train", "ell train", "l train", "subway", "subway", "elevated", "monorail", "elevated", "subway"], "difficult_direct_answer": false, "rationales": ["The train is on top of a bridge.", "The train has a special name because instead of traveling underground or at ground level, the tracks it uses are raised high up in the air. they are found in chicago and new york.", "The track is shown above the roadway below."], "image": "val2014/COCO_val2014_000000581458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445853, "question_id": "DvL8dtPKVr8ifgLt6yQgPv", "question": "What type of fabric is the blue item of clothing at the foot of the bed in the background?", "choices": ["satin", "denim", "velvet", "wool"], "correct_choice_idx": 1, "direct_answers": ["denim", "sheet", "jeans", "jeans", "denim", "denim", "denim", "denim", "cotton", "denim"], "difficult_direct_answer": false, "rationales": ["The item looks to be a pair of blue jeans.", "The blue is denim like for blue jeans.", "The blue item of clothing is a pair of jeans which are traditionally made from this sturdy textile."], "image": "train2014/COCO_train2014_000000445853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506641, "question_id": "DvQBFz3HNx9WY35Zzrwx8y", "question": "What helpful object will help keep his hands from being burnt?", "choices": ["spatula", "oven mitts", "glasses", "apron"], "correct_choice_idx": 1, "direct_answers": ["oven mitts", "oven mitts", "glove", "potholder", "gloves", "mit", "oven mitts", "oven mit", "gloves", "glove"], "difficult_direct_answer": false, "rationales": ["The mitts prevent serious burn injuries that may occur while cooking in the kitchen.", "Oven mitts keep people's hands safe.", "The steam from the pot could easily burn his hand in that position."], "image": "train2014/COCO_train2014_000000506641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273537, "question_id": "DvUBJ3ee3sm9QJpdCQPiyW", "question": "Why are some of them looking away from the screen?", "choices": ["bored", "additional screen", "talking", "scary movie"], "correct_choice_idx": 2, "direct_answers": ["talking", "disinterested", "talking", "talking", "distracted", "not playing", "unknown", "hearing someone", "talking", "television"], "difficult_direct_answer": false, "rationales": ["A group of people are in a family room and one of them is looking at the rest of the group rather than the television.", "Some of the people are conversing with each other.", "The person standing up appears to be communicating something, so those looking aware are more interested in what this person is saying, than what is on the screen."], "image": "train2014/COCO_train2014_000000273537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258085, "question_id": "DvcNTVC5zjRiFJqm9P2ZeD", "question": "He is addressing the children using what probable languages?", "choices": ["portuguese/english", "italian/danish", "spanish/german", "zulu/swahili"], "correct_choice_idx": 0, "direct_answers": ["english", "portuguese", "portuguese/english", "portugese", "english", "native language", "portuguese", "english portuguese", "spanish", "english"], "difficult_direct_answer": false, "rationales": ["He is using portuguese and english.", "Because the children have brazilian flag whose national language is portuguese.", "The kids are holding brazilian and canadian flags."], "image": "val2014/COCO_val2014_000000258085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555048, "question_id": "Dvuazm5Ftqn7qZ2TJRiQdS", "question": "What kind of transportation is available?", "choices": ["rail", "water", "air", "road"], "correct_choice_idx": 3, "direct_answers": ["car", "car", "car", "car", "sedan", "road", "car", "sedan", "car", "car"], "difficult_direct_answer": false, "rationales": ["There is a vehicle parked outside and it travels by street.", "A car can be seen parked outside the home, meaning that there is a road attached to use it on.", "A car is used to drive on a road."], "image": "val2014/COCO_val2014_000000555048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327702, "question_id": "Dw39xkTxwZKPZRep93RrSn", "question": "Based on their diet what kind of animal is this?", "choices": ["none", "omnivore", "herbivore", "carnivore"], "correct_choice_idx": 2, "direct_answers": ["giraffe", "herbivore", "vegan animal", "vegan", "giraffe", "herbivore", "herbivore", "herbivore", "herbivore", "herbivore"], "difficult_direct_answer": false, "rationales": ["The animal is a giraffe. it has a plant-based diet.", "Giraffes only eat plants.", "Giraffes usually eat leaves and shrubbery, and no animal products, which would make them herbivores."], "image": "train2014/COCO_train2014_000000327702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444448, "question_id": "DwQ3j7QtU7zTnihvCpYWWG", "question": "What is the woman doing to the fire hydrant?", "choices": ["cleaning it", "building it", "painting it", "dismantling it"], "correct_choice_idx": 2, "direct_answers": ["painting", "painting", "painting", "painting", "painting", "painting", "painting it", "painting", "painting", "painting"], "difficult_direct_answer": false, "rationales": ["The woman is painting.", "The woman is painting the fire hydrant on the sidewalk.", "The hydrant has different colors on it than would be traditional and the woman is holding a paintbrush with paint cans in front of her."], "image": "train2014/COCO_train2014_000000444448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506628, "question_id": "DwiTGSqUhnjR3svZRQAvPR", "question": "In which town is this clock located?", "choices": ["mt pleasant", "new york", "mayberry", "mansfield"], "correct_choice_idx": 0, "direct_answers": ["mt pleasent", "mt pleasant", "mt pleasant", "mount pleasant", "mta pleasant", "mta pleasant", "mta pleasant", "mt pleasant", "broadway", "mta pleasant"], "difficult_direct_answer": false, "rationales": ["There is a sign on the clock", "Mt. pleasant is the town named on the clock's sign.", "The name of the town is on the sign."], "image": "val2014/COCO_val2014_000000506628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257796, "question_id": "Dwz7yDZYgcWhTAjwuzoKio", "question": "What is usually found in the same room as the word on the sign spelled backwards?", "choices": ["cradle", "pans", "bed", "toilet"], "correct_choice_idx": 1, "direct_answers": ["pans", "pans", "pots", "pots", "pans", "stove", "pans", "stove", "pans", "kitchen"], "difficult_direct_answer": false, "rationales": ["The word on the sign spelled backwards is pots. the corresponding room is a kitchen, not a bathroom or bedroom.", "Pan is the most commonly known in the area.", "Pans are found with pots."], "image": "train2014/COCO_train2014_000000257796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189361, "question_id": "Dx64APVLTTGBs3xMePNGRb", "question": "What's the middle name of the person who took this shot?", "choices": ["landsiedel", "georg", "peter", "hans"], "correct_choice_idx": 2, "direct_answers": ["peter", "peter", "peter", "peter", "peter", "peter", "peter", "peter", "peter", "peter"], "difficult_direct_answer": false, "rationales": ["The watermark in the corner says peter is his middle name.", "The watermark says \"georg peter landsiedel\".", "The person who took the shot is named peter, according to the sign."], "image": "train2014/COCO_train2014_000000189361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427946, "question_id": "DxMvdHKAbAwenCxRKRTJFB", "question": "What chore is the woman here organizing?", "choices": ["dusting", "cooking", "laundry", "dental cleaning"], "correct_choice_idx": 2, "direct_answers": ["laundry", "laundry", "laundry", "laundry", "laundry", "laundry", "laundry", "laundry", "laundry", "laundry"], "difficult_direct_answer": false, "rationales": ["The woman is sorting her clothing.", "The other options don't match her actions. it looks like she's using the pillowcase as a a bag.", "She's on the bed handling a pile in the pillowcase. you can put clothes in a pillowcase which acts as a laundrybag."], "image": "train2014/COCO_train2014_000000427946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172297, "question_id": "DxPRsCuqXGLZ4jFoV2jEed", "question": "What type of business is this?", "choices": ["bank", "grocery store", "restaurant", "barber"], "correct_choice_idx": 2, "direct_answers": ["cafe", "cafe", "restaurant", "restaurant", "cafe", "cafe", "cafe", "cafe", "restaurant", "cafe restaurant"], "difficult_direct_answer": false, "rationales": ["The sign on the right indicates that the business is a cafe.", "You can see they offer breakfast and lunch.", "The business sells breakfast and lunch."], "image": "train2014/COCO_train2014_000000172297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512418, "question_id": "DxweDDdPRwPynppSuh6EQW", "question": "What sort of person is this?", "choices": ["actor", "virtual avatar", "real", "professional model"], "correct_choice_idx": 1, "direct_answers": ["overwhelmed", "virtual person", "animated", "tennis player", "tennis player", "virtual avatar", "cartoon", "robot", "digital", "energetic"], "difficult_direct_answer": true, "rationales": ["This is a virtual avatar of a tennis player.", "This is an animated model of a person. she is not real.", "The person on the tennis court is a virtual character called an avatar."], "image": "train2014/COCO_train2014_000000512418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491981, "question_id": "Dy5KZ7KMKVLezC2Qye4wgD", "question": "How would people get out if there was a fire?", "choices": ["emergency door", "open entryway", "trap door", "open window"], "correct_choice_idx": 0, "direct_answers": ["emergency exit", "emergency exit", "emergency exit", "emergency exit", "emergency exit", "emergency doors", "emergency exit", "emergency door", "emergency doors", "door"], "difficult_direct_answer": false, "rationales": ["The door states its for emergencies", "This exit is right behind them.", "In case of emergency the door is marked with red lettering that is used for this purpose."], "image": "train2014/COCO_train2014_000000491981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375500, "question_id": "DyBVKTKC2Ef9TNm4G9WxMj", "question": "What might happen due to the cat's location?", "choices": ["computer restarts", "duck noises", "curtains torn", "piano noises"], "correct_choice_idx": 3, "direct_answers": ["note played", "noise", "piano noises", "play notes", "piano noise", "noise", "keys pressed", "keyboard tips", "notes played", "music"], "difficult_direct_answer": true, "rationales": ["The weight of the cat on the piano cords may result to piano noises.", "The cat is on a keyboard so could press the keys.", "The cat is resting on the keys. pushing the keys makes sound."], "image": "train2014/COCO_train2014_000000375500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313786, "question_id": "DyJunN6zbMmPDz8tJSX23h", "question": "The person whose name appears at the top is a descendant of whom?", "choices": ["atahualpa", "henry viii", "saladin", "temujin"], "correct_choice_idx": 1, "direct_answers": ["queen elizabeth", "ship", "queen elizabeth", "elizabeth", "queen elizabeth", "henry viii", "british", "queen elizabeth", "royalty", "queen victoria"], "difficult_direct_answer": false, "rationales": ["The other options aren't english.", "Queen elizabeth is a known descendent of henry. this is historical knowledge.", "Henry viii descended from queen elizabeth."], "image": "train2014/COCO_train2014_000000313786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395337, "question_id": "DyWgt5Rk2iGaUY7H4y7yDt", "question": "Which region is the granule food from?", "choices": ["europe", "africa", "australia", "south america"], "correct_choice_idx": 1, "direct_answers": ["asia", "morocco", "mediterranean", "india", "mediterranean", "italy", "mediterranean", "africa", "north africa", "asia"], "difficult_direct_answer": false, "rationales": ["The food appears to be couscous based on the size and texture. this is a food from answer a.", "It's believed it came from that region and is still very popular there.", "The region is africa."], "image": "train2014/COCO_train2014_000000395337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435556, "question_id": "DysGqtJvHcARWsb4tvEDZW", "question": "Which food contains the most vitamin A?", "choices": ["tomato", "lettuce", "carrot", "broccoli"], "correct_choice_idx": 2, "direct_answers": ["carrots", "carrot", "carrots", "carrots", "carrots", "carrots", "carrot", "carrots", "carrots", "kales"], "difficult_direct_answer": false, "rationales": ["The carrots on the produce stand are very high in vitamin a.", "A woman stands surrounded by produce with carrots close to her.", "The tomatoes have the most vitamin a."], "image": "train2014/COCO_train2014_000000435556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328548, "question_id": "DyzVXKVXcwwUbCSU7VtAck", "question": "What is the most used name for the object that the kid is pushing?", "choices": ["luggage helper", "baggage cart", "suitcase pusher", "bag trolley"], "correct_choice_idx": 1, "direct_answers": ["cart", "cart", "cart", "cart", "cart", "luggage cart", "baggage cart", "trolley", "shopping cart", "cart"], "difficult_direct_answer": false, "rationales": ["It is a cart for luggage.", "It is used to carry suitcases through an airport.", "A child is pushing a cart full of luggage."], "image": "train2014/COCO_train2014_000000328548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376136, "question_id": "DzqJBf6dX7naAby2nvKg5u", "question": "What is the name for the man driving the train?", "choices": ["conductor", "attendant", "cabi", "pilot"], "correct_choice_idx": 0, "direct_answers": ["conductor", "conductor", "conductor", "engineer", "conductor", "conductor", "engineer", "engineer", "engineer", "conductor"], "difficult_direct_answer": false, "rationales": ["There is only one person that can drive a train.", "The person is in charge of a public or private conveyance.", "A conductor conducts the train."], "image": "train2014/COCO_train2014_000000376136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255216, "question_id": "Dzsxy34458P3NzTsv4m9jd", "question": "What is the opposite of this event?", "choices": ["vacation", "child birth", "divorce", "double marriage"], "correct_choice_idx": 2, "direct_answers": ["divorce", "divorce", "married", "divorce", "married", "divorce", "divorce", "divorce", "married", "divorce"], "difficult_direct_answer": false, "rationales": ["People that are married and unhappy can go through a process so that they are no longer legally attached to each other.", "A man and woman are celebrating a wedding.", "A wedding is happily coming together as the photo shows, where as a divorce is breaking apart."], "image": "train2014/COCO_train2014_000000255216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23676, "question_id": "DzubuYZ6fGf8MJg97cQkwe", "question": "What sport are these women most likely playing?", "choices": ["tennis", "softball", "lacrosse", "croquet"], "correct_choice_idx": 1, "direct_answers": ["baseball", "softball", "softball", "baseball", "baseball", "softball", "softball", "softball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["They have a bat and women usually play this instead of baseball", "The woman are playing softball with a bat.", "The women are holding bats and playing softball."], "image": "train2014/COCO_train2014_000000023676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346366, "question_id": "DzuqXhx666LZ2z9453Qkb7", "question": "What are a group of these animals called?", "choices": ["school", "herd", "pack", "flock"], "correct_choice_idx": 3, "direct_answers": ["herd", "flock", "flock", "herd", "herd", "flock", "flock", "herd", "herd", "flock"], "difficult_direct_answer": false, "rationales": ["That is what you call a group of sheep", "A large group of sheep are together near a crowd of people.", "A group is a flock."], "image": "train2014/COCO_train2014_000000346366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83642, "question_id": "E2E8UkSBCRySd8NWTy5eXv", "question": "What type of phone is she using?", "choices": ["rotary", "pay", "cellular", "landline"], "correct_choice_idx": 2, "direct_answers": ["cell", "smartphone", "cellphone", "smart phone", "smartphone", "cell phone", "cellular", "cell", "cell", "smart phone"], "difficult_direct_answer": false, "rationales": ["The woman on the sidewalk is talking on a cellular phone with no wire.", "She is outside and using the phone that is not attached to anything.", "The woman does not have a cord on her phone and is outside."], "image": "train2014/COCO_train2014_000000083642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278732, "question_id": "E2FSVXnhXPy5SjkmE6uN79", "question": "What is the name for the pattern used on the window curtains?", "choices": ["floral", "plaid", "birdseye", "polka dot"], "correct_choice_idx": 1, "direct_answers": ["rectangle", "plaid", "plaid", "checkered", "checkered", "flannel", "checkered", "plaid", "tartan", "plaid"], "difficult_direct_answer": false, "rationales": ["The name is plaid.", "The curtains are in a plaid pattern.", "There is a yellow plaid pattern on the top of the window."], "image": "train2014/COCO_train2014_000000278732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325530, "question_id": "E2NgrXPZShiRanCdLEaErm", "question": "Which conference is this game in?", "choices": ["southern", "west coast", "east coast", "northern"], "correct_choice_idx": 1, "direct_answers": ["west coast", "west coast", "west coast", "little league", "west coast", "little league", "west coast", "west coast", "west coast", "west coast"], "difficult_direct_answer": false, "rationales": ["The score board at the baseball field identified the conference.", "It says it on the scoreboard", "The sign on the background wall indicates what conference this is in."], "image": "train2014/COCO_train2014_000000325530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371395, "question_id": "E2sqJcZbG4573SAJhqMSDe", "question": "What is the relationship between the two kids?", "choices": ["siblings", "unrelated", "friends", "classmates"], "correct_choice_idx": 0, "direct_answers": ["siblings", "siblings", "siblings", "siblings", "siblings", "siblings", "siblings", "siblings", "siblings", "siblings"], "difficult_direct_answer": false, "rationales": ["Their mom is in between the two that are brother and sister.", "The kids are with their mom.", "The children, a boy and a girl, are sitting with an older woman. the woman is probably their mother making them brother and sister."], "image": "val2014/COCO_val2014_000000371395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230982, "question_id": "E2yCoHQrmPPAgXYkvvsWA2", "question": "What is this person looking at?", "choices": ["fire", "enemy", "video monitor", "plane"], "correct_choice_idx": 2, "direct_answers": ["inside building", "backyard", "tv screen", "television", "video monitor", "outside", "reflection", "television", "outside", "tv"], "difficult_direct_answer": false, "rationales": ["The person is checking the monitor in their hand.", "The person has a wii control in his hand.", "The person looks at the monitor."], "image": "val2014/COCO_val2014_000000230982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194499, "question_id": "E2zjrmQB7Ngr5opnrc9aBG", "question": "The picture and outfit looks like it is from what decade?", "choices": ["1980s", "1920s", "1970s", "1990s"], "correct_choice_idx": 1, "direct_answers": ["1920s", "twenties", "forties", "thirties", "1920s", "20s", "olden days", "forties", "1900", "twenties"], "difficult_direct_answer": false, "rationales": ["The double breasted coat for young boys was popular in the early twentieth century.", "The person is old-fashioned.", "This is a black and white photo and the clothing style is quite old"], "image": "val2014/COCO_val2014_000000194499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288948, "question_id": "E32J7vCcndterC5JZFPDGT", "question": "What comes out of the black container on the right wall?", "choices": ["water", "mail", "towels", "soap"], "correct_choice_idx": 2, "direct_answers": ["paper towels", "cold items", "paper towel", "towels", "paper towels", "tissue paper", "soda", "paper towels", "food", "paper towels"], "difficult_direct_answer": false, "rationales": ["The black container on the right wall is a paper towel dispenser.", "The items are solid, not liquid. it is not a mailbox.", "This item dispenses paper cleaning products, visible in the photo."], "image": "train2014/COCO_train2014_000000288948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541301, "question_id": "E3PfrSzShKzWApdK7z2eeu", "question": "What does the urinals use to wash away human waste?", "choices": ["waster", "glue", "air", "magnets"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "water", "water", "water", "urine", "waster", "water", "water", "urine"], "difficult_direct_answer": false, "rationales": ["Waste gets flushed away.", "They use water to wash away the waste.", "That is use to wash down human waste."], "image": "train2014/COCO_train2014_000000541301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170346, "question_id": "E3aoDVdjRBSigqpfWdRUkQ", "question": "The animals here possess which trait helping keep them alive?", "choices": ["anomie", "nimbleness", "amity", "meanness"], "correct_choice_idx": 1, "direct_answers": ["horns", "horns", "agility", "agility", "horns", "horns", "nimbleness", "agility", "horns", "horns"], "difficult_direct_answer": false, "rationales": ["These animals need to be able to move on rocky mountain terrain and nimbleness is a big help to let them do it.", "They can climb up and down vertical cliffs.", "These animals are extremely fast."], "image": "val2014/COCO_val2014_000000170346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74135, "question_id": "E3mL846nC4KjKJmwn73495", "question": "What job do these people hold?", "choices": ["servers", "butcher", "dairy maids", "fishers"], "correct_choice_idx": 1, "direct_answers": ["fisherman", "butcher", "butcher", "fish cleaning", "butcher", "fishmongers", "fish", "fish mongers", "butcher", "fishmonger"], "difficult_direct_answer": false, "rationales": ["There are large cuts of meat visible in this kitchen and the men are wearing the attire of those who deal with cuts of meat for a living.", "They are cutting up meat", "These people are carving up animals like a butcher would."], "image": "val2014/COCO_val2014_000000074135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514676, "question_id": "E3pbJx4V8mPYe7Fvtf3Vzg", "question": "The tourists are hoping to get pictures of the giraffe in its natural?", "choices": ["ceremony", "habitat", "hibernation", "humanity"], "correct_choice_idx": 1, "direct_answers": ["habitat", "habitat", "habitat", "habitat", "environment", "environment", "habitat", "habitat", "habitat", "habitat"], "difficult_direct_answer": false, "rationales": ["A single white ban is driving down a safari. a giraffe is roaming around in the field.", "The people look as though they are on a safari which is an activity done to see wild animals where they live.", "To make pictures look more real and organic you have to take it where they live."], "image": "train2014/COCO_train2014_000000514676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66348, "question_id": "E3tYXMXDP93CAdb47bwHkf", "question": "The first three letters on the sign are found in what name?", "choices": ["jess", "maddie", "kennedy", "alison"], "correct_choice_idx": 3, "direct_answers": ["ali", "alice", "aldi", "ali", "michael", "alison", "alison", "alice", "ali", "alison"], "difficult_direct_answer": false, "rationales": ["Large truck is carrying a food item with a white sign on the back side of it. it starts with ali.", "The first three letters of alison are ali.", "The first name was the obvious choice if you can read the sign."], "image": "val2014/COCO_val2014_000000066348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546388, "question_id": "E42fvLZMcxL9XyVm9HDL3v", "question": "The vehicle used for rescue purpose is?", "choices": ["police", "medicine", "ambulance", "pharmacy"], "correct_choice_idx": 2, "direct_answers": ["rescue", "ambulance", "ambulance", "orange", "rescue", "van", "orange", "orange", "orange van", "van"], "difficult_direct_answer": false, "rationales": ["It has rescue written on the door", "The car is meant for rescues.", "The van has space to bring people to help."], "image": "val2014/COCO_val2014_000000546388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102290, "question_id": "E4G2wnZ46fJdqzcXqgqMfk", "question": "If someone wanted to cross near here what should they do?", "choices": ["turn around", "run across", "wait", "walk across"], "correct_choice_idx": 2, "direct_answers": ["go around", "be careful", "wait", "wait", "walk around", "wait", "wait", "press button", "wait", "wait"], "difficult_direct_answer": false, "rationales": ["The signal is showing a red person. this indicates that people should not cross.", "A person is doing a handstand on one hand. if someone walked near him they may get kicked.", "The persons depicted are engaging in what looks like a dance contest based on their body configuration. with such flailing limbs, one might want to do answer a so they are not struck."], "image": "train2014/COCO_train2014_000000102290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25529, "question_id": "E4gCs9jwh4GJXoCfTQpLRd", "question": "What is on the fries?", "choices": ["ketchup", "cheese", "chili", "guacamole"], "correct_choice_idx": 0, "direct_answers": ["ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "burger", "ketchup", "burger", "ketchup"], "difficult_direct_answer": false, "rationales": ["The fries are covered in ketchup.", "The french fries are covered with a red substance which certainly must be ketchup because ketchup is an extremely popular--if not the most popular--topping for fries.", "People like to put ketchup on their fries."], "image": "val2014/COCO_val2014_000000025529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576820, "question_id": "E57FdeXTrdoNQAYuzzgDJX", "question": "Of what religion is the person in the long black coat and black hat?", "choices": ["mormon", "muslim", "hari krishna", "jewish"], "correct_choice_idx": 3, "direct_answers": ["amish", "jewish", "jewish", "amish", "jewish", "amish", "jewish", "jewish", "orthodox judaism", "amish"], "difficult_direct_answer": false, "rationales": ["The man is wearing a yamaka.", "The man is an orthodox jewish person.", "The outfit being worn is commonly worn by sects of answer a's religion and not known to be associated with other religions."], "image": "val2014/COCO_val2014_000000576820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12428, "question_id": "E57j4zR4KP3eBdHhfGT94i", "question": "What animal loves this kind of fruit?", "choices": ["horse", "dog", "codfish", "flea"], "correct_choice_idx": 0, "direct_answers": ["horse", "horse", "parrots", "any", "horse", "horse", "horses", "horse", "monkey", "bat"], "difficult_direct_answer": false, "rationales": ["The fruit is an apple. horses like apples.", "The fruit in question is an apple, based on its color and shape. answer a is an animal who enjoys this food.", "Horses love to eat apples."], "image": "train2014/COCO_train2014_000000012428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497610, "question_id": "E5fnWqL3LbseXGdZNeqqPD", "question": "What fun activity is shown?", "choices": ["snow boarding", "bumper cars", "skiing", "rock climbing"], "correct_choice_idx": 0, "direct_answers": ["snowboarding", "snowboarding", "skiing", "snow boarding", "snowboarding", "snowboarding", "skiing", "skiing", "skiing", "snowboarding"], "difficult_direct_answer": false, "rationales": ["Many people are out there in the winter time. they are messing with their equipment to secure it to their feet.", "The people are riding on snowboards.", "The people are connecting their feet to snowboards."], "image": "train2014/COCO_train2014_000000497610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202739, "question_id": "E62vmyGXuc6e5eLWjgxQnX", "question": "What is the woman doing with the silver object?", "choices": ["drying hair", "curling hair", "applying makeup", "singing"], "correct_choice_idx": 0, "direct_answers": ["blow drying", "drying hair", "drying hair", "drying hair", "drying", "drying hair", "drying hair", "drying hair", "hair drying", "drying hair"], "difficult_direct_answer": false, "rationales": ["She has a hair dryer.", "The woman is holding the item in a position where it is interacting with her hair and based on the size, shape and design of the object it is a hairdryer whose intended purpose is answer a.", "She is using it after she washed her hair to get it dry."], "image": "train2014/COCO_train2014_000000202739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384040, "question_id": "E65FDiuuDAgLiW6nUTwwUQ", "question": "What is being served in the white mug?", "choices": ["beer", "juice", "milk", "coffee"], "correct_choice_idx": 3, "direct_answers": ["coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["Coffee is served.", "This caffienated drink is just the boost that some people need in the morning.", "That is the beverage typically had with breakfast and served in a mug to keep it warm."], "image": "val2014/COCO_val2014_000000384040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486125, "question_id": "E67jJtf7x2iosCmmiiX8oU", "question": "What color theme is the man with the the tie trying to achieve with his outfit?", "choices": ["white", "blue", "black", "grey"], "correct_choice_idx": 3, "direct_answers": ["light colored", "bright", "white", "matching", "white", "grey white", "grey", "white", "white", "monochrome"], "difficult_direct_answer": false, "rationales": ["The tie is gray and the khakis are gray.", "The man with the tie has gray pants on and a gray tie.", "The tie on his color is also stripped with white small spots."], "image": "train2014/COCO_train2014_000000486125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97530, "question_id": "E6EfDnyprk5tBtiNvhNm7C", "question": "What is the blue base the camera is on called?", "choices": ["selfie stick", "dipstick", "tripod", "mini stick"], "correct_choice_idx": 2, "direct_answers": ["stand", "trolley", "foundation", "tripod", "tripod", "tripod", "stand", "camera stand", "tripod", "holder"], "difficult_direct_answer": false, "rationales": ["It has three legs.", "The blue case of the camera is known as a tripod.", "The base is a tripod."], "image": "train2014/COCO_train2014_000000097530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344073, "question_id": "E6biUAmVcg9p2Kb6WKxy8X", "question": "What is the exercise on the computer called?", "choices": ["grandstand", "cartwheel", "flip", "handstand"], "correct_choice_idx": 3, "direct_answers": ["handstand", "summersault", "handstand", "handstand", "hand stands", "handstand", "yoga", "calisthenics", "handstand", "yoga"], "difficult_direct_answer": false, "rationales": ["The person is upside down on their hands.", "A person is upside down and using his hands to stand on.", "A person is standing with hands on the ground and feet in the air."], "image": "train2014/COCO_train2014_000000344073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118457, "question_id": "E6iHpKCp2QwZNVAoq46UnU", "question": "What do the men want to do next?", "choices": ["kiss", "eat", "play frisbee", "play football"], "correct_choice_idx": 2, "direct_answers": ["photo shoot", "play frisbee", "play ball", "play frisbee", "play frisbee", "play", "play frisbee", "play frisbee", "play ball", "play frisbee"], "difficult_direct_answer": false, "rationales": ["The person in the middle is holding a flying disc, not a football. they are all men, so it is unlikely that they will want to kiss.", "Since they are holding the disk, it would mean they want to play a game with it.", "The men are holding a frisbee that they are going to play with by throwing."], "image": "train2014/COCO_train2014_000000118457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106228, "question_id": "E6qBNCu6chqPbX8B3iaV6v", "question": "What animal likes to eat what the man is eating?", "choices": ["slug", "amoeba", "stingray", "monkey"], "correct_choice_idx": 3, "direct_answers": ["monkey", "monkey", "primates", "monkey", "apes", "monkey", "monkey", "monkey", "primates", "monkey"], "difficult_direct_answer": false, "rationales": ["Bananas are a favorite of monkeys. you may see monkeys eating bananas if you visit a zoo.", "The man is eating a banana. monkeys also eat bananas.", "Bananas are much loved by monkeys."], "image": "val2014/COCO_val2014_000000106228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567342, "question_id": "E724Wf7bWAiNQ2U8ndkmpS", "question": "Inside silver round large cans what is seen here in profusion?", "choices": ["meats", "condiments", "main dish", "side dish"], "correct_choice_idx": 1, "direct_answers": ["spoons", "red sauce", "condiments", "condiments", "food", "marinara sauce", "tomato sauce", "condiments", "dips", "forks"], "difficult_direct_answer": false, "rationales": ["The cups are full of liquid which requires spoons to serve.", "The condiments are inside.", "A lot of ketchup and soy sauce and hoisin is in the jars."], "image": "train2014/COCO_train2014_000000567342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310865, "question_id": "E75KDBdGhnvx3fueV66fxv", "question": "What are the black sticks on the green bowl?", "choices": ["spoons", "pins", "knives", "chop sticks"], "correct_choice_idx": 3, "direct_answers": ["chop", "chop sticks", "chopsticks", "chopstick", "to eat", "chop sticks", "chop sticks", "eating", "chopsticks", "chopsticks"], "difficult_direct_answer": false, "rationales": ["These are commonly used with asian dishes as displayed here.", "Those skewers are used to eat asian style cuisine, which is visible on the table.", "Some parts of the world use these instead of utensils."], "image": "train2014/COCO_train2014_000000310865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97878, "question_id": "E7NTpjBJfMME2iqHpoG96s", "question": "What type animals are painted here?", "choices": ["bovine", "rodent", "feline", "apes"], "correct_choice_idx": 2, "direct_answers": ["lions", "lion", "feline", "lions", "lions", "lion", "lion", "lion", "people", "lion"], "difficult_direct_answer": false, "rationales": ["Lions are related to cats.", "Motorcycles are lined up and lions are painted on all of them.", "The animals are cats."], "image": "train2014/COCO_train2014_000000097878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181799, "question_id": "E7QTsSN3rNx2gVC5Q29X3h", "question": "What are the sliced red vegetables on the right side of dish called?", "choices": ["red cabbage", "eggplant", "beets", "radish"], "correct_choice_idx": 2, "direct_answers": ["beets", "beets", "beets", "lettuce", "beats", "beets", "beets", "beetroot", "beets", "cabbage"], "difficult_direct_answer": false, "rationales": ["The vegetables are known as beets since they're ruby red.", "They are burgundy vegetables.", "The veggies are beets."], "image": "val2014/COCO_val2014_000000181799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456245, "question_id": "E7d3v6Kmx9ZZxBhmX4J4Bu", "question": "What would happen if an additional large adult boarded this boat?", "choices": ["nothing", "sink it", "faster progress", "extra charge"], "correct_choice_idx": 1, "direct_answers": ["boat sinks", "sink", "sink", "boat sinks", "sink", "sink", "sink", "sink it", "sink", "tip over"], "difficult_direct_answer": false, "rationales": ["The boat is already sunk into the water so if any other adult boards it will sink some more.", "The boat would sink.", "The boat is already close to this given how low it is in the water."], "image": "train2014/COCO_train2014_000000456245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13965, "question_id": "E7yX9K8gLsPKeDfMj73uZ6", "question": "What is available according to the blue sign?", "choices": ["snacks", "bathrooms", "movies", "handicap seats"], "correct_choice_idx": 3, "direct_answers": ["handicap seats", "disabled entrance", "handicap seating", "wheelchair ramp", "handicapping boarding", "handicap seats", "wheelchair access", "handicap seat", "wheel access", "handicap entry"], "difficult_direct_answer": true, "rationales": ["The blue sign with the person in a wheelchair symbol means that there is special access and seating.", "Handicap seats are available.", "The blue sign signifies that the train is able to accommodate people in wheelchairs."], "image": "val2014/COCO_val2014_000000013965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271461, "question_id": "E82pzemtpGKbeEx7TwKMus", "question": "What has the rice been cooked in?", "choices": ["skillet", "dish", "pan", "plate"], "correct_choice_idx": 0, "direct_answers": ["skillet", "cooking pan", "skillet", "skillet", "pot", "cast iron", "skillet", "wok", "skillet", "skillet"], "difficult_direct_answer": false, "rationales": ["The rice is in a skillet.", "It is heavy cast-iron and can be placed either in oven or on stove.", "The rice dish has been cooked altogether in one pan."], "image": "val2014/COCO_val2014_000000271461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226599, "question_id": "E8QQL4gJrgdBrhWWDZgcbf", "question": "Who is the manufacturer of the hatchback car?", "choices": ["ford", "chrysler", "chevrolet", "honda"], "correct_choice_idx": 0, "direct_answers": ["ford", "ford", "ford", "ford", "ford", "ford", "ford", "ford", "ford", "unclear"], "difficult_direct_answer": false, "rationales": ["The ford logo is on front of the car.", "The car has the ford logo on the front of the body.", "The manufacturer is ford."], "image": "train2014/COCO_train2014_000000226599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321127, "question_id": "E8TuTxvLKcXZPFhp33T5mA", "question": "The brand company of this man's jacket is headquartered in what country?", "choices": ["united states", "britain", "france", "italy"], "correct_choice_idx": 0, "direct_answers": ["america", "america", "usa", "united states", "united states", "united states", "usa", "united states", "america", "nike"], "difficult_direct_answer": false, "rationales": ["The man has a nike swoosh on his shirt. nike's headquarters are in beaverton, oregon.", "It is an american company", "Nike is based in oregon."], "image": "train2014/COCO_train2014_000000321127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58225, "question_id": "E8qbb569roCHqxbseE9n3D", "question": "Who bit this hot dog?", "choices": ["rat", "photographer", "dog", "child"], "correct_choice_idx": 1, "direct_answers": ["photographer", "photographer", "holding hotdog", "human", "holder", "baseball spectator", "person holding", "camera holder", "man", "photographer"], "difficult_direct_answer": false, "rationales": ["The photographer's hand is holding the hot dog.", "The person that bit the hot dog is the person that is taking the picture.", "The hot dog was bit by the person that took the picture because it is facing the camera."], "image": "val2014/COCO_val2014_000000058225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164100, "question_id": "E8qiDTvTFyqVft9K8fYV7V", "question": "What does Marilyn wear on her head when seen here?", "choices": ["hat", "bobby pin", "sunglasses", "beanie"], "correct_choice_idx": 2, "direct_answers": ["glasses", "sunglasses", "glasses", "sunglasses", "sunglasses", "sunglasses", "glasses", "sunglasses", "sunglasses", "glasses"], "difficult_direct_answer": false, "rationales": ["They are in the same shape as regular glasses but they have a tint used for protection from the sun.", "The woman in question is receiving a cake with her name on it and can be clearly seen. the lenses on her head are consistent in size, shape, color and the manner they are being worn with answer a.", "The woman being handed a cake with the name marilyn on it has sunglasses on her head."], "image": "train2014/COCO_train2014_000000164100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246218, "question_id": "E8uDrEZKvpr5RGT9bTYNC2", "question": "What salad type is in the white bowl?", "choices": ["caesar", "house", "greek", "mixed green"], "correct_choice_idx": 3, "direct_answers": ["spring salad", "spring", "mixed green", "arugula", "plain", "garden", "leaf salad", "greek", "lettuce", "veggie"], "difficult_direct_answer": true, "rationales": ["There are a couple of kinds of lettuce in the bowl.", "The salad is made of greenery.", "There are a few types of lettuce"], "image": "train2014/COCO_train2014_000000246218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220137, "question_id": "E8wGpJocEzqWhQRT3pxTTH", "question": "What method caused the shininess seen here?", "choices": ["glaze", "spray", "chalk", "matte paint"], "correct_choice_idx": 0, "direct_answers": ["polishing", "glaze", "polish", "paint", "glaze", "glazing", "water", "water", "pottery", "glaze"], "difficult_direct_answer": false, "rationales": ["That method is used for making things shiny.", "The method is glaze.", "The shiny varnish seen on ceramic vases are usually the method of glazing."], "image": "val2014/COCO_val2014_000000220137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449902, "question_id": "E9RgqhyaXWNCu33oJdmXMC", "question": "Why is the bed of the dump truck tilted?", "choices": ["parking", "backing up", "showing off", "dump contents"], "correct_choice_idx": 3, "direct_answers": ["dumping", "testing mechanism", "emptying it", "empty contents", "not support", "dump contents", "dumping", "hydraulic rams", "dumped", "dump contents"], "difficult_direct_answer": false, "rationales": ["In order to easily remove the items from the dump truck it is tilted so that it has the assistance of gravity, instead of someone having to shovel the contents out.", "The dump truck wants to get rid of its contents.", "Things can slide out when the dump truck is lifted."], "image": "train2014/COCO_train2014_000000449902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175469, "question_id": "E9SzheHAbeEEdbhMcvjtGD", "question": "What type of bird are floating in the water?", "choices": ["duck", "turkey", "woodpecker", "owl"], "correct_choice_idx": 0, "direct_answers": ["goose", "goose", "ducks", "geese", "goose", "geese", "ducks", "grease", "duck", "goose"], "difficult_direct_answer": false, "rationales": ["Ducks are in the water near the boat.", "Ducks are birds that are known to float on water and are visible doing so in this image based on the size and shape of two of the birds. the other answers are not known for interacting with water.", "The bird is a duck."], "image": "val2014/COCO_val2014_000000175469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172974, "question_id": "E9b2vRedVo3ZnnzXwoZQTH", "question": "What color are the blinds in this sitting room?", "choices": ["taupe", "gray", "blue", "white"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "blue", "colorless", "blue", "blue", "white", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The blinds are blue that are on the windows.", "You can see them hanging in the far room with the lady in it.", "The color is blue."], "image": "train2014/COCO_train2014_000000172974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529698, "question_id": "E9exqN65pnc79PR8xeWPzr", "question": "Who owns this bike?", "choices": ["city resident", "marilyn manson", "bike dealer", "meatloaf"], "correct_choice_idx": 2, "direct_answers": ["motorcyclist", "individual", "dealership", "shop owner", "nobody", "bike dealer", "man", "no idea", "person", "motorcycle rider"], "difficult_direct_answer": true, "rationales": ["The bike dealer's tag is still on the bike.", "A bike is parked near the sidewalk and has a sign with information about it posted on it.", "The bike dealer is the owner."], "image": "val2014/COCO_val2014_000000529698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211945, "question_id": "EAB8KUx34X8J4FpCHZXAeW", "question": "What breed of dog is riding the bike?", "choices": ["pomeranian", "bulldog", "corgi", "dobermann"], "correct_choice_idx": 1, "direct_answers": ["pug", "french bulldog", "bulldog", "bulldog", "pug", "pit bull", "bulldog", "pug", "bull dog", "pit bull"], "difficult_direct_answer": false, "rationales": ["The dog is a bulldog.", "The dog is black and short.", "The dog which is riding the bike is black, has a short snout, high standing ears and an overall muscular built. this describes a bulldog."], "image": "train2014/COCO_train2014_000000211945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166865, "question_id": "EAGyoTPT4z7am7n6kxfuds", "question": "This street is located where?", "choices": ["field", "suburb", "city", "desert"], "correct_choice_idx": 2, "direct_answers": ["uk", "london", "london", "united kingdom", "england", "england", "city", "england", "city", "england"], "difficult_direct_answer": false, "rationales": ["The area is full of tall buildings and pavement. there is no grass or sand.", "It looks populated and developed", "The multistory buildings, signs, and clock tower indicate that this is an urban setting."], "image": "train2014/COCO_train2014_000000166865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53236, "question_id": "EAW3gWFf7CKDFJjHY9XrRe", "question": "Which people group invented the bread seen here?", "choices": ["french", "belgian", "british", "austrian"], "correct_choice_idx": 3, "direct_answers": ["austrian", "sumerians", "french", "austrians", "french", "chines", "french", "french", "french", "french"], "difficult_direct_answer": false, "rationales": ["The bread on the plate is called a croissant and is famous in france.", "French people like croissants.", "The croissants were invented by an artillery officer, august zang, who is from austria."], "image": "train2014/COCO_train2014_000000053236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537689, "question_id": "EAXLGimbYDPLCB9i8k3Maj", "question": "What type of phone is the woman using?", "choices": ["landline", "rotary", "pay", "cellular"], "correct_choice_idx": 3, "direct_answers": ["cellphone", "cell", "cellphone", "cellphone", "cellular", "mobile", "cell", "cellphone", "cell", "cell"], "difficult_direct_answer": false, "rationales": ["The woman is on the go. the phone is mobile.", "The woman is using a cell phone.", "The woman in question is holding a handheld device up to her ear which is consistent in size and usage to answer a."], "image": "train2014/COCO_train2014_000000537689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504194, "question_id": "EAhkKQganRuHZbBPJyaHUD", "question": "On which direction is the sun in relation to the dog?", "choices": ["left", "back", "front", "right"], "correct_choice_idx": 0, "direct_answers": ["behind", "right", "behind", "right", "behind", "right", "left", "behind", "behind", "behind"], "difficult_direct_answer": false, "rationales": ["His shadow is to the left so the sun is in the opposite direction.", "The shadow is off to the dog's right. based on the location of the dog's shadow, the sun would be to its left.", "The dog's shadow is opposite of the light coming from the sun."], "image": "val2014/COCO_val2014_000000504194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80470, "question_id": "EAmFDnQtxtX6Rm2RXQGS8U", "question": "What is a horse riding outfit called?", "choices": ["breeches", "jodhpurs", "none", "barbie"], "correct_choice_idx": 1, "direct_answers": ["jodhpurs", "dressage", "bolero", "jumping", "jodhpurs", "jodhpurs", "cowboy outfit", "jodhpurs", "jodhpurs", "jodhpurs"], "difficult_direct_answer": false, "rationales": ["Jodhpurs is what it is.", "The outfit that this horse rider is wearing is modern, tight fitting, and reaches down to the ankles. all of this is characteristics of an outfit called jodhpurs.", "These are worn while riding a horse. they are close fitting around the bottom of the leg."], "image": "val2014/COCO_val2014_000000080470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377543, "question_id": "EAshqbYgRSMrDBvdtHmQC3", "question": "Which direction will the woman swing her racket?", "choices": ["behind her", "toward ball", "leftward", "downwards"], "correct_choice_idx": 1, "direct_answers": ["left", "right", "up", "towards ball", "her right", "toward ball", "underhand", "forward", "underhand", "up"], "difficult_direct_answer": false, "rationales": ["The girl goes toward the ball.", "The ball is coming towards the woman so she will likely swing her racket towards it.", "The girl is going to swing her racket toward the ball to hit it."], "image": "train2014/COCO_train2014_000000377543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456356, "question_id": "EB5ggw57jvyz4cMwqndk7C", "question": "What are the towels on the lower shelf used for?", "choices": ["wiping hands", "covering", "painting", "cleaning grease"], "correct_choice_idx": 0, "direct_answers": ["drying hands", "hands", "face", "hand drying", "face/hands", "hands", "dry hands", "dry hands", "wiping hands", "face"], "difficult_direct_answer": false, "rationales": ["Towels are used to help dry hands.", "They are used to dry the hands after bathing with water.", "The smaller the towel, the more likely they are used for smaller body parts."], "image": "train2014/COCO_train2014_000000456356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329703, "question_id": "EB8AYRaxfk6tSNq84N89bC", "question": "What is the left object on top of the cabinet for?", "choices": ["reading", "burning", "chilling food", "storing object"], "correct_choice_idx": 3, "direct_answers": ["storage", "basket", "picnic lunch", "hold items", "basket", "entertainment", "basket", "basket", "storing object", "serving food"], "difficult_direct_answer": false, "rationales": ["They are baskets to put things in.", "The object is for storage.", "The left object is known for storing items."], "image": "train2014/COCO_train2014_000000329703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395022, "question_id": "EBHPP3n9s9z9CmrCqMJ2rY", "question": "Where are the motorcycles parked?", "choices": ["in backyard", "in lot", "in street", "in garage"], "correct_choice_idx": 2, "direct_answers": ["on road", "garage", "on street", "roadway", "in street", "street", "street", "motorcycle show", "in street", "on street"], "difficult_direct_answer": false, "rationales": ["The yellow double lines are proof. they're often parked this way during rally events.", "The traffic lights and yellow lines on the ground indicate that this is a public street.", "The bikes seems to be in street as they are party."], "image": "train2014/COCO_train2014_000000395022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247264, "question_id": "EBMoVBmywcB6HHM9nw5Guq", "question": "Where do these men want to go next?", "choices": ["cafe", "bed", "home", "ocean"], "correct_choice_idx": 3, "direct_answers": ["ocean", "surfing", "ocean", "ocean", "surfing", "surfing", "home", "water", "beach", "ocean"], "difficult_direct_answer": false, "rationales": ["The men are in wetsuits and one is holding a skateboard. this equipment would be traditional employed in the ocean if it were to be used correctly.", "The men want to go to the ocean.", "The man is wearing a wetsuit."], "image": "val2014/COCO_val2014_000000247264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343852, "question_id": "EBToUGFi7PQmoWweETfQA4", "question": "What abnormal act is the man doing?", "choices": ["wearing hat", "sucking tie", "drinking beer", "wearing suit"], "correct_choice_idx": 1, "direct_answers": ["eating tie", "sucking", "pulling tongue", "chewing tie", "eating tie", "eating", "eating tie", "eating tie", "tounge sticking", "sucking tie"], "difficult_direct_answer": false, "rationales": ["The man has the tie in his mouth. the tie is wet.", "Only one of the options would be considered abnormal. the man has his neckwear in his mouth.", "The act is sucking the tie."], "image": "val2014/COCO_val2014_000000343852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449902, "question_id": "EBVMmG7AyWg9JA8uFwtY3V", "question": "Why is the truck's bed at an angle?", "choices": ["stolen truck", "dump load", "off balance", "broken vehicle"], "correct_choice_idx": 1, "direct_answers": ["strips", "dumping", "not supported", "dumping", "dump load", "testing cables", "dumping", "to empty", "dumping", "dropping off"], "difficult_direct_answer": false, "rationales": ["Answer a is consistent with the purposes of this vehicle and how a load would be delivered.", "The truck is dumping its load.", "This is a faster and more efficient way to empty the contents of the truck bed."], "image": "train2014/COCO_train2014_000000449902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120320, "question_id": "EBZmDSCrDWbWq4YQZBj9Uz", "question": "What is the man using the phone as?", "choices": ["flash light", "speaker", "television", "blender"], "correct_choice_idx": 0, "direct_answers": ["flashlight", "flashlight", "flashlight", "flashlight", "camera/light", "flashlight", "flashlight", "flashlight", "light", "flash light"], "difficult_direct_answer": false, "rationales": ["The man is using the phone as a flash light to see.", "He's trying to see in the dark area under the sink", "The man can't see in the dim area. there is a light on the phone."], "image": "train2014/COCO_train2014_000000120320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292125, "question_id": "EBa6ZpsFNSNYv5tQNXeUhX", "question": "What is next to the yellow sign?", "choices": ["nun", "rat", "cheese", "bicycle"], "correct_choice_idx": 3, "direct_answers": ["bike", "bicycle", "bicycle", "hydrant", "bicycle", "bicycle", "bike", "bike", "bike", "red bicycle"], "difficult_direct_answer": false, "rationales": ["The yellow sign has a bike near it.", "It is a vehicle with pedals, wheels and handlebars.", "The red bike is used for transportation and is packed near the hydrant."], "image": "train2014/COCO_train2014_000000292125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212587, "question_id": "EBeUW9vAbbVHqJUo2JtKxG", "question": "Which colored item here is most tart?", "choices": ["white", "brown", "green", "red"], "correct_choice_idx": 2, "direct_answers": ["hot dog", "pickle", "pickle", "pickles", "pickle", "hotdog", "pickle", "green", "pickle", "pickle"], "difficult_direct_answer": false, "rationales": ["The pickles are sour and green.", "The pickles will have the most sour taste due to their acid.", "The item is green."], "image": "val2014/COCO_val2014_000000212587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295448, "question_id": "EBynstLEtbbzkZ9gjPSjzM", "question": "What is the reason the street and sidewalks are wet?", "choices": ["it's sunny", "it's snowing", "it's dark", "it's raining"], "correct_choice_idx": 3, "direct_answers": ["rain", "rain", "rain", "rainy season", "raining", "it's raining", "it's raining", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["It is raining outside.", "The sky is gloomy.", "A busy street can be seen with an overcast sky above and puddles on the ground."], "image": "train2014/COCO_train2014_000000295448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419171, "question_id": "ECJVQ266zhqBfRqteVbHXc", "question": "Bananas are priced for sale based off what?", "choices": ["how many", "nothing", "height", "weight"], "correct_choice_idx": 3, "direct_answers": ["weight", "weight", "weight", "poundage", "weight", "weight", "weight", "how many", "bunch", "growing conditions"], "difficult_direct_answer": false, "rationales": ["People weigh their produce.", "Most produce is priced by scale", "Bananas are usually charged per pound, so weight would be the pricing basis."], "image": "train2014/COCO_train2014_000000419171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401392, "question_id": "ECUh93QY3w98R9v8oM2Wee", "question": "That the bird is eating?", "choices": ["steak", "another bird", "nothing", "salad"], "correct_choice_idx": 3, "direct_answers": ["food", "seeds", "veggies", "food", "rice", "food", "rice", "rice", "rice", "salad"], "difficult_direct_answer": false, "rationales": ["He's eating a piece of rice from the salad", "There is some green in the food.", "The food on the plate is made out of colorful vegetables."], "image": "train2014/COCO_train2014_000000401392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64339, "question_id": "ECq49UwhCSHDLtWWE8DhiG", "question": "What is this type of job called?", "choices": ["host", "bouncer", "bartender", "accountant"], "correct_choice_idx": 2, "direct_answers": ["bartender", "bartender", "bartending", "bartender", "orchid", "bartender", "bartender", "bartender", "bartender", "bartender"], "difficult_direct_answer": false, "rationales": ["These things are set up at a bar.", "He is behind a counter mixing ingredients together to make a drink.", "There are alcohol bottles behind the counter"], "image": "train2014/COCO_train2014_000000064339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195098, "question_id": "ECrMUii3ZVQsWHJrmf52XP", "question": "What venue is the man in?", "choices": ["home", "restaurant", "bathroom", "hotel lobby"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["There are tables and he is in a booth.", "The lighting and other people sitting at a table behind him engaging in eating, suggests that this is a restaurant.", "A man is in a dimly lit area with tables. restaurants are often dimly lit and have tables."], "image": "val2014/COCO_val2014_000000195098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172874, "question_id": "EDLz6hL7duaoF9mNuEVCDL", "question": "What is the standing man doing with his arms?", "choices": ["lifting", "hiding", "folding", "waving"], "correct_choice_idx": 2, "direct_answers": ["crossing", "folding", "crossing", "crossed", "crossed arms", "crossed", "crossing", "crossing them", "crossing", "crossing"], "difficult_direct_answer": false, "rationales": ["The man has his arms crossed.", "All his hands are attached together at his thoracic cavity.", "The man is folding his arms and crossing them."], "image": "train2014/COCO_train2014_000000172874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140990, "question_id": "EDM5NXKPTynYGPJ4kgHzYw", "question": "What meaning can the number have?", "choices": ["lucky", "unlucky", "unholy", "holy"], "correct_choice_idx": 1, "direct_answers": ["unlucky", "bad luck", "jersey number", "unlucky", "bad luck", "player number", "unlucky", "player number", "birthday", "identification"], "difficult_direct_answer": false, "rationales": ["Traditionally this number is linked to misfortune and bad luck.", "Lawrie is wearing the number 13. this number has a negative connotation.", "The number is 13."], "image": "train2014/COCO_train2014_000000140990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508244, "question_id": "EDat72YcFM2j5ycnHJGixU", "question": "What is he doing?", "choices": ["posing", "drinking", "eating", "playing baseball"], "correct_choice_idx": 0, "direct_answers": ["leaning", "watching ballgame", "hanging out", "leaning", "playing baseball", "loitering", "posing", "taking photo", "relaxing", "leaning"], "difficult_direct_answer": false, "rationales": ["He's technically not actively doing b although he's likely there for that reason. the other two obviously don't apply.", "He is relaxed and smiling to the camera", "The man is striking a pose for the camera."], "image": "train2014/COCO_train2014_000000508244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351622, "question_id": "EDc2EWwy7oCasyD6HKAYUW", "question": "What are the clear pieces underneath the phones?", "choices": ["lights", "memory cards", "chords", "stands"], "correct_choice_idx": 3, "direct_answers": ["last one", "display stands", "stands", "phone stands", "pedestal", "plastic stands", "stands", "stands", "stands", "holder"], "difficult_direct_answer": false, "rationales": ["The pieces are allowing the phones to stand.", "Those plastic pieces help with keeping the phones stable.", "The items have to be placed on something to raise them off of the table."], "image": "train2014/COCO_train2014_000000351622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360306, "question_id": "EDd9vNuTa4bgEwxr2eDF33", "question": "What is the man doing?", "choices": ["cleaning dishes", "eating lunch", "making lunch", "cleaning sink"], "correct_choice_idx": 0, "direct_answers": ["washing dishes", "washing dishes", "cleaning dishes", "washing utensils", "washing dishes", "washing dishes", "washing dishes", "washing dishes", "washing utensils", "washing dishes"], "difficult_direct_answer": false, "rationales": ["The man is standing in front of a sink that appears to have dishes stacked next to it. this is a common orientation of the person and objects that would be consistent with answer a.", "The man is standing at a sink with a pile of dishes visible to his left. based on the combination of these elements, the setting and the level of his hands, answer a is the likely activity.", "He's at the kitchen sink"], "image": "train2014/COCO_train2014_000000360306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540479, "question_id": "EDm65bxmL2b4F2pL6p2Lku", "question": "What direction is the skier going?", "choices": ["down", "up", "left", "right"], "correct_choice_idx": 0, "direct_answers": ["down", "downhill", "down", "down", "down", "downhill", "downhill", "downhill", "south", "down"], "difficult_direct_answer": false, "rationales": ["People ski from a top of a hill downward, using gravity and the lack if friction with the skin on the snow.", "Skiing is done quickly from a higher altitude down to a lower one.", "The person is going downhill."], "image": "train2014/COCO_train2014_000000540479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524594, "question_id": "EEp3KZ8b5ijjUHFgi6Tnkv", "question": "How many cats are shown here?", "choices": ["three", "two", "one", "four"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "two", "one", "one", "two", "one", "one", "two", "one"], "difficult_direct_answer": false, "rationales": ["A cat and a dog are sitting together.", "The picture shows two animals; a dog on the left and a cat on the right.", "A cat is sitting next to a dog."], "image": "train2014/COCO_train2014_000000524594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338153, "question_id": "EEsQegg9teJhXymybmYT2K", "question": "What type of crossing is this?", "choices": ["horse", "train", "railroad", "ship"], "correct_choice_idx": 0, "direct_answers": ["horse", "street", "horse", "horse", "horse", "horse crossing", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["As shown by the icon on the light.", "There is a picture of a man riding this animal on the crosswalk sign. a crosswalk sign always shows one of the means of transportation that crosswalk is meant for.", "The signal is showing a person riding an animal, not a vehicle."], "image": "val2014/COCO_val2014_000000338153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437516, "question_id": "EFKSEHPk3yaKQqSdum9Tvo", "question": "What is the skateboarder not wearing that most serious skateboarders always wear?", "choices": ["safety gear", "chucks", "shorts", "sunglasses"], "correct_choice_idx": 0, "direct_answers": ["helmet", "helmet", "helmet", "safety gear", "helmet", "helmet", "helmet", "scatting", "scatting", "helmet"], "difficult_direct_answer": false, "rationales": ["The skateboarder has no safety pads or helmet on.", "The safety gear is used for protection.", "The skateboarder only has a regular hat on. he really should protect his head in case he falls by wearing a helmet."], "image": "train2014/COCO_train2014_000000437516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255374, "question_id": "EFPzkA7gUcuyZHsLmwyjpv", "question": "What bank is a sponsor of the tennis event?", "choices": ["chase", "wells fargo", "citibank", "abn amro"], "correct_choice_idx": 3, "direct_answers": ["amro", "abn amro", "tennis", "abn amro", "abn amro", "abn amro", "abn amro", "amro", "abn amro", "abn amro"], "difficult_direct_answer": false, "rationales": ["You can see the letters of the name on the items on the court", "The sponsors of the tennis game is usually printed on the sides of the court and the sponsor here is a bank.", "That brand starts with an a."], "image": "train2014/COCO_train2014_000000255374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229936, "question_id": "EFntgXdSCz94QuUNRYh6Ze", "question": "What time will it be in a half hour?", "choices": ["one", "six", "seven", "two"], "correct_choice_idx": 0, "direct_answers": ["one o'clock", "one pm", "one oclock", "100", "one", "100", "one o'clock", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["It's currently thirty minutes to one.", "The little hand is between this and the 12", "It is 12:30 now so in 30 minutes it will be one."], "image": "train2014/COCO_train2014_000000229936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141920, "question_id": "EFvrKVsZrsWeyYsNU9qhgR", "question": "What type of store is this?", "choices": ["comic", "grocery", "beauty", "store"], "correct_choice_idx": 0, "direct_answers": ["comic book", "comic", "comic book", "comic", "comic book", "comic book", "comic book", "comic book", "comic book", "comic"], "difficult_direct_answer": false, "rationales": ["The guy is reading one and there are many on the shelves around him.", "The store is for comics.", "The man is reading a comic book."], "image": "train2014/COCO_train2014_000000141920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71485, "question_id": "EG4jPE25pvK4D9PsmPfFLe", "question": "How is pizza commonly sold here?", "choices": ["whole pie", "by slice", "by bite", "by gross"], "correct_choice_idx": 1, "direct_answers": ["by slice", "by slice", "by slice", "by slice", "slices", "by slice", "slice", "slice", "by slice", "single slices"], "difficult_direct_answer": false, "rationales": ["The pizza is sold by the slice.", "Pizza is cut into a large slice and is in a box that fits it perfectly. an individual box is used for individual servings.", "The pizza is already cut into slices."], "image": "train2014/COCO_train2014_000000071485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214777, "question_id": "EGVEJrnns6yYxY8gg7mf5t", "question": "What type license might one show to get on this bus?", "choices": ["construction workers", "divorce", "wedding", "welders"], "correct_choice_idx": 2, "direct_answers": ["wedding", "wedding", "drivers", "marriage", "bus", "driver's license", "marriage", "bus drivers", "driving", "tickets"], "difficult_direct_answer": false, "rationales": ["A license is needed to be married.", "There is a special sign on the front of the bus. you get a special price if you have this license.", "There is a wedding special sign on the bus."], "image": "train2014/COCO_train2014_000000214777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180445, "question_id": "EGachyucjyhC5H7zzVAyEG", "question": "The cat's cover what religious icon here?", "choices": ["mary", "jesus", "buddha", "cross"], "correct_choice_idx": 2, "direct_answers": ["buddah", "buddah", "buddah", "buddha", "buddah", "buddah", "buddha", "buddha", "buddha", "buddha"], "difficult_direct_answer": false, "rationales": ["Under the cat themed umbrellas sits a statue which closely resembles a buddha.", "This individual is almost always depicted as fat, bald and chubby in a seated position.", "The icon is of a chubby bald man which is a known leader of a religious organization."], "image": "train2014/COCO_train2014_000000180445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98944, "question_id": "EGciiaDbEAmEnk4Mpzwp2P", "question": "How was the meat most likely prepared?", "choices": ["grilled", "raw", "stewed", "fried"], "correct_choice_idx": 0, "direct_answers": ["grilled", "grill", "grilled", "grilled", "chicken", "grilled", "grilled", "grilled", "grill", "grilled"], "difficult_direct_answer": false, "rationales": ["The meat on the plate has grill marks on it from where it was cooked on the hot grill.", "The black marks seared onto the flesh imply this option. these marks are usually the result of it.", "The darker marks on the meat are most consistent with this type of food preparation."], "image": "train2014/COCO_train2014_000000098944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444568, "question_id": "EGiugRouUx9suagz9himTK", "question": "What is this child doing?", "choices": ["eating", "sleeping", "learning", "cooking"], "correct_choice_idx": 2, "direct_answers": ["computer", "computer work", "using computer", "playing", "computer work", "learning", "watching monitor", "on computer", "computer work", "using computer"], "difficult_direct_answer": false, "rationales": ["They are sitting at the computer watching the screen intently because something has their attention.", "The child is looking at a computer screen.", "He's at a computer with a mouse and headphones"], "image": "val2014/COCO_val2014_000000444568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331395, "question_id": "EGw8N5LYBqQaPqtqRFHF8G", "question": "What are the men doing at the table?", "choices": ["writing novels", "grading wine", "grading papers", "drawing comics"], "correct_choice_idx": 1, "direct_answers": ["talking", "drinking", "working", "drinking wine", "wine testing", "wine tasting", "wine tasting", "grading wine", "tasting wine", "wine tasting"], "difficult_direct_answer": false, "rationales": ["A man is writing in a journal as another is viewing glasses of wine.", "They have multiple glasses and bottles in front of them and one is writing in a notebook.", "The men are tasting wine at the table and taking notes."], "image": "val2014/COCO_val2014_000000331395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16403, "question_id": "EGxk35JEjbhdYfSrY4G4Fy", "question": "What are the people at the back waiting for?", "choices": ["cruise boat", "bus", "cab", "van"], "correct_choice_idx": 0, "direct_answers": ["food", "to order", "food", "boat", "ferry", "food", "cruise boat", "food", "food", "food"], "difficult_direct_answer": false, "rationales": ["The people want to board the boat in the water.", "The people want the cruise boat.", "That's what the sign says."], "image": "train2014/COCO_train2014_000000016403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166621, "question_id": "EHAwAwN2ZWcuwN7gtnZNiU", "question": "Why is he standing there?", "choices": ["selling souvenirs", "resting", "stealing bus", "awaiting bus"], "correct_choice_idx": 3, "direct_answers": ["awaiting bus", "bus stop", "catching bus", "catch bus", "boarding bus", "waiting bus", "awaiting bus", "waiting", "bus stop", "bus stop"], "difficult_direct_answer": false, "rationales": ["It is a bus stop.", "By the words on the road it indicates why he is there.", "The writing on the ground indicates the purpose of this location."], "image": "train2014/COCO_train2014_000000166621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226701, "question_id": "EHBsnVzPx4yxdb3VAChKKP", "question": "What type animal does this girl sit beside?", "choices": ["stuffed bear", "moose", "rabbit", "snake"], "correct_choice_idx": 0, "direct_answers": ["bear", "teddy bear", "stuffed bear", "teddy bear", "bear", "bear", "bear", "bear", "teddy bear", "bear"], "difficult_direct_answer": false, "rationales": ["The girl has a stuffed teddy bear near her.", "The animal is a stuffed bear.", "Option a is a common stuffed animal and the size, color and face shape are consistent with that type of animal."], "image": "val2014/COCO_val2014_000000226701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97563, "question_id": "EHUwRZjgPtNz96hAkqKv3H", "question": "What is the player in white attempting to do?", "choices": ["receive pass", "block", "score", "call attention"], "correct_choice_idx": 1, "direct_answers": ["block player", "throw frisbee", "throw", "block", "block", "throw", "block", "catching frisbee", "toss frisbee", "defend"], "difficult_direct_answer": false, "rationales": ["The player wants to block.", "The player in white is on a different team than the one in black. the one in white is holding out his arms to prevent the one in black from passing it to his teammate.", "It's fairly obvious that they're attempting to prevent the other play from throwing the frisbee."], "image": "train2014/COCO_train2014_000000097563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94002, "question_id": "EHmiyRZSRrtvjv5Q78e77d", "question": "Why is he dismounting the skateboard?", "choices": ["wants dinner", "rammed wall", "is home", "is tired"], "correct_choice_idx": 2, "direct_answers": ["going home", "to stop", "crime scene", "stopping", "going inside", "walking", "is home", "stopping", "to walk", "enter building"], "difficult_direct_answer": true, "rationales": ["He is reaching for the doorknob of a residence while dismounting.", "The man is home.", "The man is at his location."], "image": "val2014/COCO_val2014_000000094002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13992, "question_id": "EHtJ63HuJeLP7DpQ26NF5v", "question": "What type of sign is this?", "choices": ["warning", "brand", "regulatory", "directional"], "correct_choice_idx": 3, "direct_answers": ["street", "pedestrian", "pedestrian traffic", "directional", "information sign", "bathrooms", "informational", "toilet", "traffic", "directions"], "difficult_direct_answer": true, "rationales": ["The sign on the post is a directional sign. it tells people which direction to go.", "The arrows on the sign indicate directions.", "The sign has arrows showing which way to go."], "image": "val2014/COCO_val2014_000000013992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48417, "question_id": "EKWa3vzEtEaGpLmesyrzJn", "question": "Why is the boy's head covered?", "choices": ["religion", "safety", "warmth", "costume"], "correct_choice_idx": 2, "direct_answers": ["for warmth", "warmth", "stay warm", "cold", "warm head", "keep warm", "winter", "its cold", "cold weather", "for warmth"], "difficult_direct_answer": true, "rationales": ["It is cold outside and the boy is wearing a hat.", "The boy is also wearing a coat.", "This is the most likely reason given the rest of the winter wear."], "image": "train2014/COCO_train2014_000000048417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241407, "question_id": "EKpYR8W2fMggBL5WrNGtzo", "question": "Red color in the STOP boards indicates what?", "choices": ["danger", "banned", "none", "peace"], "correct_choice_idx": 0, "direct_answers": ["stop", "visibility", "attention", "stop sign", "danger", "warning halt", "stop car", "stop", "danger", "stop"], "difficult_direct_answer": false, "rationales": ["Answer a is not totally accurate but it is the closest approximate based on the answers provided and common knowledge.", "The color is for danger.", "Red signs are used to alert drivers to potentially dangerous situations."], "image": "train2014/COCO_train2014_000000241407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250192, "question_id": "EKuQiLb395ycHWJN4WR5Us", "question": "What is the elephant doing with the ball?", "choices": ["destroying it", "eating it", "kicking it", "throwing it"], "correct_choice_idx": 2, "direct_answers": ["chasing", "kicking it", "playing", "rolling it", "kicking it", "kicking", "rolling", "kicking", "kicking", "kicking"], "difficult_direct_answer": false, "rationales": ["An elephant is walking down the street with a soccer ball near its feet.", "The elephant is with a group of men and a goal. he does not have hands in which to throw the ball, but he does have feet.", "The elephant is kicking the ball."], "image": "train2014/COCO_train2014_000000250192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529126, "question_id": "EL8EuSta8A55PNqPWyDKMf", "question": "What gift did the woman seen here get for Christmas?", "choices": ["white dress", "wii", "cook book", "santa hat"], "correct_choice_idx": 1, "direct_answers": ["nintendo wii", "wii", "gaming system", "wii", "wii", "wii", "game console", "wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["She is holding the remote from this video game system.", "The woman has a white remote.", "She has a white device in her hand that controls the game on the television."], "image": "train2014/COCO_train2014_000000529126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441969, "question_id": "ELFDysiWgPeKeCWU8yauWE", "question": "This outdoor area has heat that is ignited using what?", "choices": ["flint", "wood", "water", "propane"], "correct_choice_idx": 3, "direct_answers": ["propane", "propane", "propane", "propane", "propane", "propane", "solar", "solar", "propane", "propane"], "difficult_direct_answer": false, "rationales": ["A big metal canister is seen on the ground, which looks like a unit holding propane, and it is standing next to a pole that is used to heat the area.", "There is a tank with this in it to keep it warm out there.", "The outdoor seating area has to be heated by the propane tank."], "image": "val2014/COCO_val2014_000000441969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556176, "question_id": "ELNRTVePFvfNqgKhomb9D2", "question": "What kind of remotes are the people holding?", "choices": ["stereo", "air conditioner", "tv", "video game"], "correct_choice_idx": 3, "direct_answers": ["video games", "nintendo wii", "wireless", "nintendo wii", "video game", "wii", "nintendo wii", "wii", "wii", "wii remotes"], "difficult_direct_answer": false, "rationales": ["They are the remotes used for the nintendo wii game system.", "The people are holding white controllers with buttons on them. they are playing a game.", "The people are playing a video game."], "image": "train2014/COCO_train2014_000000556176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575395, "question_id": "ELRWjeEcgGwsifyuBKyHHh", "question": "What green thing does the upper advertisement most relate to?", "choices": ["weed", "dollars", "parrots", "trees"], "correct_choice_idx": 1, "direct_answers": ["rolex watches", "electricity", "watches", "money", "electriciy", "time pieces", "cash", "money", "watches", "dollars"], "difficult_direct_answer": false, "rationales": ["The advertisement literally contains the word \"money\" so it's definitely most related to dollars, which is a form of money.", "The upper advertisement is for ge money. money is not related to trees, weed, or parrots.", "It has to do with money."], "image": "val2014/COCO_val2014_000000575395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227224, "question_id": "ELr32fy3JXWRMpaHRriE9r", "question": "Why is he leaning right?", "choices": ["rounding curve", "falling", "no control", "off balance"], "correct_choice_idx": 0, "direct_answers": ["turning", "turning bike", "turning", "rounding curve", "curve", "riding motorcycle", "turning", "not fall", "to turn", "cornering"], "difficult_direct_answer": false, "rationales": ["He's rounding the curve.", "Leaning in to a turn helps balance the motorcycle and makes the turn faster.", "A second rider is located on the track behind the rider and based on their relative position they are clearly on a curve. in order to turn tighter on a curve while riding a motorcycle one would use their bodyweight and lean."], "image": "val2014/COCO_val2014_000000227224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314515, "question_id": "EMAG67AFiRVNDC9PtExVHL", "question": "The people who gather here are taking part in what?", "choices": ["vacation", "protest", "cycling event", "picnic"], "correct_choice_idx": 2, "direct_answers": ["marathon", "bike race", "cycling event", "bike race", "bike race", "parade", "bicycle race", "festival", "cycling event", "race"], "difficult_direct_answer": false, "rationales": ["There are a lot of bikes on the top of the car.", "There are bikes on the car", "As proven by the bikes and the helmets worn by some."], "image": "val2014/COCO_val2014_000000314515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465723, "question_id": "EMXh3Vj6wp66Tov4uLsfrh", "question": "What is the term used to call this group of horses?", "choices": ["stampede", "herd", "wave", "slide"], "correct_choice_idx": 1, "direct_answers": ["herds", "herd", "herd", "herd", "herd", "herd", "herd", "herd", "wild", "herd"], "difficult_direct_answer": false, "rationales": ["Larger groups of horses are typically herded together.", "It is a large group of horses.", "A commonly used term to describe a multiple horses appearing together would be answer a."], "image": "train2014/COCO_train2014_000000465723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344802, "question_id": "EN85km3dVALWsy5oC5Egqz", "question": "What type of juice is in the glass?", "choices": ["orange", "grapefruit", "tomato", "carrot"], "correct_choice_idx": 3, "direct_answers": ["carrot", "orange juice", "carrot", "carrot juice", "carrot", "orange", "carrot", "tomato mango", "orange juice", "carrot"], "difficult_direct_answer": false, "rationales": ["Juice is typically the same color of the fruit or vegetable it comes from.", "The juice is orange.", "Carrot juice is orange."], "image": "train2014/COCO_train2014_000000344802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151965, "question_id": "ENFZg7mhGkjfLc9eekr2H4", "question": "What can assist in navigating the terrain?", "choices": ["gps", "map", "echo", "snowboard"], "correct_choice_idx": 3, "direct_answers": ["snowboard", "snowboard", "snowboards", "map", "compass cellphone", "guide", "map", "snowboard", "gravity", "snowboard"], "difficult_direct_answer": false, "rationales": ["Snowboards can be used to navigate snow.", "They will use their boards to ride the terrain.", "Because the board has a smooth surface to enable it move easily on the snow."], "image": "val2014/COCO_val2014_000000151965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45464, "question_id": "ENNS7dTsEZfQW3BnnTUBwA", "question": "What is being used to keep warm?", "choices": ["scarf", "fireplace", "t-shirt", "space heater"], "correct_choice_idx": 0, "direct_answers": ["winter clothes", "winter clothing", "scarf", "winter jackets", "winter jackets", "snow", "ski jackets", "scarf", "snow clothes", "coats"], "difficult_direct_answer": false, "rationales": ["The woman is wearing a scarf around her neck.", "The scarf is keeping the woman's neck warm.", "The people use scarves."], "image": "train2014/COCO_train2014_000000045464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391686, "question_id": "ENXKJwJM93LxvKMLa3urZY", "question": "Why is the man's arm out?", "choices": ["balance", "signal", "wave", "break fall"], "correct_choice_idx": 0, "direct_answers": ["balance", "balance", "for balance", "balance", "balance", "balance", "balance", "balance", "balance", "for balance"], "difficult_direct_answer": false, "rationales": ["The other options are obviously not involved. using this type of gear requires a.", "He needs to do this so he doesn't fall off as he maneuvers", "People use their arms to keep from falling."], "image": "train2014/COCO_train2014_000000391686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186962, "question_id": "ENbRNkaPPPz7McN8SiDkyk", "question": "Why is the woman sitting down?", "choices": ["to paint", "to work", "to sew", "to eat"], "correct_choice_idx": 1, "direct_answers": ["using laptop", "working", "at desk", "working", "to work", "working", "maintain energy", "using laptop", "working computer", "using laptop"], "difficult_direct_answer": false, "rationales": ["The laptop implies this, but they could be waiting for food in b as well.", "The woman is sitting down at a desk with a computer and working.", "The woman is sitting at a desk working on a computer."], "image": "train2014/COCO_train2014_000000186962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403004, "question_id": "ENd8HtBUpzJSfCgfayPzGh", "question": "What is the brown structure likely made of?", "choices": ["cotton", "wood", "brick", "metal"], "correct_choice_idx": 1, "direct_answers": ["wood", "ceramic", "wood", "wood", "wood", "wood", "wood", "wood", "bench", "wood"], "difficult_direct_answer": false, "rationales": ["It is carved and you can see the grain in it", "The brown structure is a wooden bench.", "Things made of wood, such as this bench, often has grain evident. brown is a typical color for wooden items."], "image": "train2014/COCO_train2014_000000403004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376381, "question_id": "ENknX7bYd8EsSXHXv9mckr", "question": "What are they doing?", "choices": ["watching videos", "posing", "video games", "exercise"], "correct_choice_idx": 2, "direct_answers": ["playing games", "playing wii", "videogame", "wii", "playing", "playing", "playing game", "video games", "video games", "playing wii"], "difficult_direct_answer": false, "rationales": ["The people are holding nintendo controllers and playing a video game.", "The man and woman have game controls on their hand.", "You can tell by the remotes and their actions what they are playing."], "image": "train2014/COCO_train2014_000000376381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235812, "question_id": "EP3tQDR8pySSzecMb3ZWzx", "question": "What red object is the boy wearing?", "choices": ["jacket", "turtleneck", "scarf", "vest"], "correct_choice_idx": 3, "direct_answers": ["sweater", "vest", "vest", "sweater vest", "vest", "sweater", "vest", "vest", "vest", "sweater vest"], "difficult_direct_answer": false, "rationales": ["The object is a vest.", "The boy is wearing a red sleeveless sweater.", "A vest is usually worn on top of a shirt."], "image": "train2014/COCO_train2014_000000235812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340368, "question_id": "EPB8i3f7Wj7vNJzypzLBBs", "question": "In which space is this person boarding?", "choices": ["inner city", "tundra", "park", "desert"], "correct_choice_idx": 2, "direct_answers": ["park", "park", "park", "middle", "park", "park", "middle", "park", "park", "park"], "difficult_direct_answer": false, "rationales": ["The image contains grass, trees, benches, and a walkway. the buildings are not consistent with an urban development, and the climate appears moderate.", "The open space with apparently intentionally placed skateboarding apparatus would be consistent with answer a.", "The person is in a park with a grassy field."], "image": "val2014/COCO_val2014_000000340368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44954, "question_id": "EPUqKmTMLW9p8Z9gD4bM9q", "question": "What are these women wearing?", "choices": ["rubber", "wet suits", "dry suits", "casual"], "correct_choice_idx": 1, "direct_answers": ["wetsuits", "wetsuits", "wetsuits", "wet suits", "wetsuit", "wetsuits", "wet suits", "wetsuit", "wet suits", "wet suits"], "difficult_direct_answer": false, "rationales": ["These waterproof suits are typical attire for surfing. they keep you warm and dry.", "The people are in wetsuits.", "Wet suits are needed for surfing."], "image": "train2014/COCO_train2014_000000044954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535676, "question_id": "EPYL7Hy9qfkNwznoDjETwJ", "question": "Which nation is the motorcade of police motorcycles celebrating?", "choices": ["ireland", "italy", "ivory coast", "mexico"], "correct_choice_idx": 0, "direct_answers": ["ireland", "ireland", "ireland", "ireland", "ireland", "ireland", "ireland", "india", "india", "ireland"], "difficult_direct_answer": false, "rationales": ["A group of men in uniform on motorcycles have the irish flag with them.", "The motorcade is flying irish flags.", "The flag displayed on the motorcycles"], "image": "train2014/COCO_train2014_000000535676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270716, "question_id": "EPc3RJ9KtTDQ7pm2J4QKW6", "question": "What is a pile of wood used in construction called?", "choices": ["lumber", "pile", "logs", "chips"], "correct_choice_idx": 0, "direct_answers": ["lumber", "lumber", "lumber", "timbers", "building", "planks", "logs", "logs", "wood piling", "wood"], "difficult_direct_answer": false, "rationales": ["Lumber is another word for wood.", "The truck is carting lumber and wooden logs.", "The word lumber differentiates it from regular wood, so that purchases know it is to be used and ready for building or construction."], "image": "train2014/COCO_train2014_000000270716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418372, "question_id": "EPdvUHfQNFc8kgCL72U39u", "question": "What is in the plastic wrap on the bottom right?", "choices": ["bagel", "cupcake", "doughnut", "muffin"], "correct_choice_idx": 0, "direct_answers": ["bagel", "bagel", "bagel", "bagels", "bagel", "bagel", "bagel", "bagels", "bagel", "bagel"], "difficult_direct_answer": false, "rationales": ["The plastic wrap is identifiable as well as the location given in the question. the item enclosed is written on the front.", "There is a label on the item that indicates what it is.", "The label says bagels."], "image": "train2014/COCO_train2014_000000418372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324103, "question_id": "EQ3E6aGbMJSTznDGiqSQYS", "question": "What instrument is the person on the left playing?", "choices": ["banjo", "harp", "violin", "guitar"], "correct_choice_idx": 2, "direct_answers": ["fiddle", "violin", "violin", "fiddle", "violin", "violin", "violin", "violin", "violin", "violin"], "difficult_direct_answer": false, "rationales": ["It is being held on the shoulder in a certain manner that is only used to play a violin.", "The instrument has strings. it is being played with a bow.", "The instrument has strings. the person is using a bow to pluck them."], "image": "train2014/COCO_train2014_000000324103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32451, "question_id": "EQE8aNHVvTqowmQoH5numj", "question": "How was this artwork created?", "choices": ["photographed", "digitally", "painted", "videoed"], "correct_choice_idx": 1, "direct_answers": ["cgi", "computer", "computer", "digitally", "digitally", "cgi", "painting", "computer", "3d", "computer"], "difficult_direct_answer": false, "rationales": ["This was done on a computer by a graphic artist.", "It's clearly not a real human and has been created by a computer.", "It is a computer generated picture"], "image": "train2014/COCO_train2014_000000032451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253250, "question_id": "EQZTKhUMcvHkKoUNuvqv7g", "question": "What part of the boat is the person in the red shirt standing on?", "choices": ["bow", "stern", "transom", "hull"], "correct_choice_idx": 0, "direct_answers": ["front", "front", "bow", "bow", "bowl", "bow", "front", "bow", "bow", "bow"], "difficult_direct_answer": false, "rationales": ["The person is standing at the pointed or front end of the boat. i searched the internet for the proper boat terminology for the front.", "He is on the front of the ship.", "The orientation of the boat is clearly visible based on the shape and the wake trailing behind. the person in question is locatable and in the front which is also known as answer a."], "image": "train2014/COCO_train2014_000000253250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22899, "question_id": "EQeV4hvskcyouGpUBGzaMi", "question": "Which color kite likely has someone still holding it?", "choices": ["none", "green", "black", "yellow"], "correct_choice_idx": 2, "direct_answers": ["black", "dark blue", "black", "black", "black", "black", "blue", "black", "black", "yellow"], "difficult_direct_answer": false, "rationales": ["The kite in the sky is black.", "The color is black.", "Multiple kites are stuck in the tree. the one above the tree is still being controlled by a person."], "image": "train2014/COCO_train2014_000000022899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438477, "question_id": "EQzKWawaYGSfZxH9XrFLpK", "question": "Why is the woman wearing a hat?", "choices": ["costume", "warmth", "safety", "uniform"], "correct_choice_idx": 0, "direct_answers": ["fashion", "fashion", "costume", "costume", "costume", "costume", "fashion", "costume", "costume", "costume"], "difficult_direct_answer": false, "rationales": ["The man and woman are dressed up for halloween or some similar occasion.", "The outfit is not a common outfit and is often worn for holidays or parties.", "They are dressed up for a party"], "image": "train2014/COCO_train2014_000000438477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16957, "question_id": "ER2iq8iL6MhNB5sirYgVw3", "question": "Why is there so much spray in the air?", "choices": ["ducks", "waterfall nearby", "violent fisherman", "bridge collapsing"], "correct_choice_idx": 1, "direct_answers": ["waterfall nearby", "waterfall", "mist", "waterfall", "waterfall", "falling", "water pressure", "dam", "water mist", "dam overflow"], "difficult_direct_answer": false, "rationales": ["The water is falling from above.", "There is a waterfall by the boat.", "Water can be seen gushing downwards ahead, and when it hits the still water, it creates a big spray."], "image": "train2014/COCO_train2014_000000016957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165488, "question_id": "EREer3oH4UGHJynE9HnM8n", "question": "Why is she holding an umbrella?", "choices": ["stop cars", "stop sun", "stop rain", "showing off"], "correct_choice_idx": 1, "direct_answers": ["sunny", "rain protection", "prevent sunrays", "needs shade", "shade", "sun protection", "stop sun", "sunny", "sun", "block sun"], "difficult_direct_answer": true, "rationales": ["It is not raining. she is using the umbrella for shade.", "It is not raining. an umbrella cannot stop cars.", "She is holding it to keep cool from the sun hitting her."], "image": "train2014/COCO_train2014_000000165488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131174, "question_id": "ERKzurtatvP6FMna84Y3BX", "question": "What type of weather event most likely happened here recently?", "choices": ["tornado", "hurricane", "hail", "snow"], "correct_choice_idx": 3, "direct_answers": ["storm", "hail", "snow", "snow", "snow", "snow", "snow storm", "snow", "snow", "snowfall"], "difficult_direct_answer": false, "rationales": ["The weather is snowy.", "There is snow still on the ground.", "There is a layer of snow on the ground."], "image": "train2014/COCO_train2014_000000131174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163970, "question_id": "ERP7rAqPYp2TynHEaWffdU", "question": "What country does this vehicle represent?", "choices": ["great britain", "united states", "portugal", "australia"], "correct_choice_idx": 0, "direct_answers": ["britain", "uk", "uk", "uk", "europe", "england", "uk", "united kingdom", "great britain", "united kingdom"], "difficult_direct_answer": false, "rationales": ["Great britain's flag is on the front.", "The country is britain.", "There is a british flag in the background."], "image": "val2014/COCO_val2014_000000163970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166506, "question_id": "ERpQLbxoday9KCCYdg5Eyr", "question": "What type of skateboarding is this guy doing?", "choices": ["dangerous", "beginner", "extreme", "competition"], "correct_choice_idx": 3, "direct_answers": ["speed skateboarding", "freestyle", "street", "street skating", "freestyle", "competition", "competitive", "competitive", "slalom", "street"], "difficult_direct_answer": false, "rationales": ["The man is doing competition skateboarding.", "The skateboarder is competing.", "The guy is competing with others on his skateboard."], "image": "train2014/COCO_train2014_000000166506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226326, "question_id": "ERr2ppGfEwgudrECt6gsQq", "question": "Where is this parking lot?", "choices": ["shopping mall", "airport", "near harbor", "downtown"], "correct_choice_idx": 2, "direct_answers": ["near harbor", "dock", "large ground", "canada", "simonds", "harbour", "near harbor", "in city", "beside road", "canada"], "difficult_direct_answer": false, "rationales": ["The parking lot is near a harbor since boats are in the background.", "There are boats and water in the background", "To the left of these two vehicles is a body of water. there are some boats that are parked in the marina."], "image": "train2014/COCO_train2014_000000226326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323720, "question_id": "ERwpAtjhfLxJN3gt838CHq", "question": "Where is this photo most likely taken at?", "choices": ["desert", "urban city", "sea", "wilderness"], "correct_choice_idx": 3, "direct_answers": ["farm", "farm", "farm", "wilderness", "plains", "farm", "sheep field", "park", "field", "pasture"], "difficult_direct_answer": false, "rationales": ["There are no signs of manmade architecture here. there is also forest and grass growth which are all consistent with answer a.", "The photo is in the wild.", "There is an open field with woods in the background and animals roaming free."], "image": "train2014/COCO_train2014_000000323720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529300, "question_id": "ESEtBPx2kwoM3z9VHPXhD7", "question": "What beverage is the man at the table drinking?", "choices": ["rum", "juice", "cola", "beer"], "correct_choice_idx": 2, "direct_answers": ["soda", "cola", "coke", "cola", "coca cola", "coca-cola", "cola", "coke", "coke", "coke"], "difficult_direct_answer": false, "rationales": ["The bottle has a red and white logo. the drink is dark.", "The label on the glass bottle is the coca cola brand's label.", "The beverage is coke."], "image": "train2014/COCO_train2014_000000529300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196699, "question_id": "ESFpcMDpvJ6fiX67EhngwD", "question": "Which objects here are the sharpest?", "choices": ["cutting board", "peppers", "knives", "bowl"], "correct_choice_idx": 2, "direct_answers": ["knife", "knife", "knives", "knives", "knife", "knives", "knives", "knife", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["The knife has a blade that is sharp.", "Nothing other than the cutting tools are sharp.", "The named implement is used for cutting things and is therefore sharp - we can see it has cut these peppers."], "image": "val2014/COCO_val2014_000000196699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417547, "question_id": "ESL3npfTFPJPgN2LNsSBD5", "question": "Who conducts baseball league?", "choices": ["mlb", "iit", "tts", "nht"], "correct_choice_idx": 0, "direct_answers": ["officials", "officials", "mlb", "mlb", "mlb", "mlb", "mlb", "mlb", "mlb", "mlb"], "difficult_direct_answer": false, "rationales": ["The major league baseball organization is responsible for professional baseball.", "It stands for major league baseball.", "Professional baseball is overseen by the organization known as major league baseball"], "image": "val2014/COCO_val2014_000000417547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515136, "question_id": "ESRCEastxDuPGFN23VqUjK", "question": "What animal on on the greeting card?", "choices": ["horse", "elephant", "raccoon", "dog"], "correct_choice_idx": 1, "direct_answers": ["elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The animal is an elephant.", "This is obvious given the image.", "The card has a blue elephant on the front of it."], "image": "train2014/COCO_train2014_000000515136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419, "question_id": "ESiNUTqE4FGyGSzzDnNGSJ", "question": "What is the man in red ready to do?", "choices": ["duck", "dribble", "serve", "run"], "correct_choice_idx": 2, "direct_answers": ["serve", "serve", "serve", "hit ball", "serve", "serve", "hit", "serve", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["He is looking up and holding the ball up.", "The man is throwing up the ball in the air.", "He has tossed the ball in the air and is swinging his racket"], "image": "train2014/COCO_train2014_000000000419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39053, "question_id": "ESn4AcKTTn3TJB4EmRfjZh", "question": "This meal is likely for how many people?", "choices": ["two", "five", "thirty", "one"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "two", "three", "two", "two", "two", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["There is too much food for one person, but not really enough for more than two.", "This looks like a date night meal.", "You can tell by the matching phones and number of plates as to how many are eating."], "image": "val2014/COCO_val2014_000000039053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51359, "question_id": "ESqYQusVwUnG3t9CFPBqBa", "question": "What style of pants are these?", "choices": ["white wash", "stripped", "camo", "polka-dot"], "correct_choice_idx": 2, "direct_answers": ["camo", "camo", "camo", "camouflage", "camo", "camouflage", "camo", "camo", "camo", "army pant"], "difficult_direct_answer": false, "rationales": ["Camo pants have a greenish brown pattern.", "The pants are made of a combination of colors used for the purpose of concealment.", "The pants look like army pants."], "image": "train2014/COCO_train2014_000000051359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433373, "question_id": "ESxExgaGRmn85r99u2P6KX", "question": "How many people can sit at this table?", "choices": ["two", "four", "six", "eight"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The table is small and has chairs for two people/.", "The table is small with one chair on each of the short ends for a person to sit in.", "There are two chairs at the table."], "image": "val2014/COCO_val2014_000000433373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368648, "question_id": "ETFLTaAnw8rCb7HcJmbgS9", "question": "What are white lines on road called?", "choices": ["edge mark", "border line", "traffic line", "cutting line"], "correct_choice_idx": 1, "direct_answers": ["crosswalk", "driving lanes", "dividers", "dividing lines", "barrier line", "lane lines", "border line", "crossroad", "center line", "lanes"], "difficult_direct_answer": true, "rationales": ["The lines are border lines.", "The lines are used to mark the line on the roads.", "The line are for the borders."], "image": "val2014/COCO_val2014_000000368648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560943, "question_id": "ETFt6ySgo7ZnugdR77oqcF", "question": "Why is there a barrier near the curb?", "choices": ["for amusement", "by law", "for racing", "for aesthetics"], "correct_choice_idx": 2, "direct_answers": ["parking", "protection", "protection", "blocking", "protection", "protect onlookers", "for racing", "event", "block people", "to protect"], "difficult_direct_answer": false, "rationales": ["The barrier on the street separates the racers and the spectators so no one gets injured if the bike spins out of control.", "The barrier is in place to mark the route of the race.", "It separates spectators from the fast-moving vehicles."], "image": "train2014/COCO_train2014_000000560943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362352, "question_id": "ETjAsJhDr9fMRWFRJbHfVr", "question": "What is the name of this train?", "choices": ["92.l", "l.92", "l line", "unknown"], "correct_choice_idx": 1, "direct_answers": ["l92", "l.92", "l.92", "rail class", "l.92", "locomotive", "l.92", "l 92", "l.92", "l.92"], "difficult_direct_answer": false, "rationales": ["The name of a train is normally marked quite distinctly in the front of it. i92 can be seen clear in the front of this train.", "A number designation is painted in yellow on a train.", "The name is on the front of the train."], "image": "val2014/COCO_val2014_000000362352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30796, "question_id": "ETppMYmyqGtnQCg3dDokSb", "question": "Why is the light on inside the double-decker bus?", "choices": ["visibility", "convenience", "aesthetics", "by law"], "correct_choice_idx": 0, "direct_answers": ["waiting", "sight", "for visibility", "nighttime", "security", "visibility", "night time", "people", "increase visibility", "to see"], "difficult_direct_answer": true, "rationales": ["The light allows people to see better.", "The light is on so people can see.", "So the people inside are able to see what they are doing."], "image": "val2014/COCO_val2014_000000030796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234352, "question_id": "ETtQWzXEU5uztJsSNvW8Jw", "question": "What can the men do here?", "choices": ["ride", "haircut", "compete", "drink"], "correct_choice_idx": 3, "direct_answers": ["hang out", "ride bikes", "ride bikes", "biking", "sit", "eat", "drink", "drink coffee", "eat outside", "sit"], "difficult_direct_answer": false, "rationales": ["The location seemingly in question is a cafe bar based on the sign. answer a is an activity that can be done at a cafe bar.", "They are outside a bar.", "The men can drink."], "image": "train2014/COCO_train2014_000000234352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78614, "question_id": "EU2RS2uDJS2LXu9VpKEnTd", "question": "What is the man tying?", "choices": ["shoelaces", "cord", "tie", "rope"], "correct_choice_idx": 2, "direct_answers": ["tie", "tie", "tie", "necktie", "tie", "tie", "tie", "necktie", "necktie", "tie"], "difficult_direct_answer": false, "rationales": ["The man is tying his necktie.", "The tie is being tied.", "Neckties are worn around the neck."], "image": "train2014/COCO_train2014_000000078614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12109, "question_id": "EU92UBnG2EZcPsVjT3JTeW", "question": "What does this car run on?", "choices": ["wood", "gasoline", "electricity", "solar"], "correct_choice_idx": 2, "direct_answers": ["electricity", "electricity", "feel", "electricity", "electricity", "electricity", "feel", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["There is a electric plug by the car.", "The car needs electricity.", "This is a smart car. it doesn't run on gas."], "image": "train2014/COCO_train2014_000000012109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278796, "question_id": "EUBSaocvB3dojRETMY5N5G", "question": "Why is the woman under the red umbrella holding her hand to her face?", "choices": ["to wave", "to cough", "to block", "to smoke"], "correct_choice_idx": 3, "direct_answers": ["smoking", "smoking", "smoking", "licking juice", "smoking", "smoking", "on phone", "smoking unsure", "to smoke", "fruits"], "difficult_direct_answer": false, "rationales": ["She is holding a cigarette in her lips.", "The person is holding a cigarette in their hand so it is logical to assume they are raising their hand to smoke the cigarette.", "The woman sitting near the food has a cigarette."], "image": "train2014/COCO_train2014_000000278796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406500, "question_id": "EUKTbS8NTNnnz49UwGqj2n", "question": "The old man is wearing what type of hat?", "choices": ["baseball", "sequin", "newsboy", "pork pie"], "correct_choice_idx": 3, "direct_answers": ["fedora", "fedora", "cow boy", "pork pie", "panama", "bucket hat", "top hat", "top", "cowboy", "fishing hat"], "difficult_direct_answer": true, "rationales": ["This kind of hat was popular in the 19th century.", "The man is wearing a top hat which is known as a pork pie.", "The man has a pork pie."], "image": "val2014/COCO_val2014_000000406500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111768, "question_id": "EUTHihnjQySwu2Gdt69vxM", "question": "What would happen if you entered the rightmost text on the bench into a web browser?", "choices": ["crash browser", "visit website", "get error", "nothing happens"], "correct_choice_idx": 1, "direct_answers": ["get information", "website opens", "access website", "website", "website", "president", "visit website", "artstart", "visit website", "car"], "difficult_direct_answer": false, "rationales": ["They would be taken somewhere.", "The text is a link to a site.", "The text would take someone to a website."], "image": "train2014/COCO_train2014_000000111768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392506, "question_id": "EUsCDXu7PAt27tegZMDZEA", "question": "Which of the above fruit is belongs to Cucurbitaceae gourd family?", "choices": ["watermelon", "banana", "lemon", "cucumber"], "correct_choice_idx": 3, "direct_answers": ["cucumber", "cucumber", "cucumber", "cucumbers", "cucumbers", "cucumber", "cucumbers", "cucumber", "gourd", "cucumber"], "difficult_direct_answer": false, "rationales": ["The cucumber is a member of the gourd or cucurbitaceae family. the bananas and limes are from different plant families.", "Traditionally curbitaceae, also called cucurbits belong to the gourd family.", "The member of that family is the cucumber."], "image": "val2014/COCO_val2014_000000392506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124532, "question_id": "EVJcHcQa5Q7QDHSrp5Ujid", "question": "Where is the plane currently located?", "choices": ["mid air", "repair shop", "warehouse", "runway"], "correct_choice_idx": 3, "direct_answers": ["airport", "airport", "runway", "airport", "runway", "runway", "airport", "airport", "runway", "airport"], "difficult_direct_answer": false, "rationales": ["The plane is currently on the runway and is about to take off.", "The plane is on the runway outside.", "Planes use long roads to gain speed for takeoff."], "image": "train2014/COCO_train2014_000000124532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139011, "question_id": "EVKu37fdLVivNzEVZKb2nM", "question": "Who are the people gathering there?", "choices": ["friends", "coworkers", "tourists", "students"], "correct_choice_idx": 2, "direct_answers": ["tourists", "passengers", "bus passengers", "commuters", "tourists", "commuters", "passengers", "tourists", "passengers", "transit users"], "difficult_direct_answer": false, "rationales": ["They are in a city with lots of bags", "Any of the answers could be possible, but the people appear to be lining up to board a bus commonly used by answer a.", "The people gathering in the area are trying to get onto a tourist bus."], "image": "val2014/COCO_val2014_000000139011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48388, "question_id": "EVpDPgw3EqFKhZg4htzT9S", "question": "What does the kite most resemble?", "choices": ["apple", "cookie", "baby", "butterfly"], "correct_choice_idx": 3, "direct_answers": ["butterfly", "butterfly", "butterfly", "bug", "butterfly", "butterfly", "butterfly", "bug", "butterfly", "butterfly"], "difficult_direct_answer": false, "rationales": ["Looks like a very colorful butterfly.", "The kite has many colors and a tail that looks like a butterfly.", "The other options don't even remotely look like this kite."], "image": "train2014/COCO_train2014_000000048388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569021, "question_id": "EW9ywaPt6VmrDvSrsHwf8M", "question": "What is the dark brown object around the top of his pants?", "choices": ["belt", "tape", "holster", "rope"], "correct_choice_idx": 0, "direct_answers": ["zipper", "belt", "pilot", "belt", "belt", "belt", "belt", "pilot", "belt", "belt"], "difficult_direct_answer": false, "rationales": ["In western society, men usually wear belts around their waists to keep their pants up.", "The brown object is made of leather for a belt.", "The brown object is a belt."], "image": "train2014/COCO_train2014_000000569021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400507, "question_id": "EWU67e8awQdWdZDzJYCF9A", "question": "Why are the men's vests green in color?", "choices": ["dress code", "visibility", "camouflage", "fashion"], "correct_choice_idx": 1, "direct_answers": ["color visibility", "safety reflection", "visibility", "safety", "workers", "safety", "visible", "airport employee", "safety", "visibility"], "difficult_direct_answer": false, "rationales": ["They have safety vests.", "They wear yellow in case it's dark.", "The men are wearing bright reflective vests so they can be seen better for safety reasons."], "image": "val2014/COCO_val2014_000000400507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410195, "question_id": "EWWg39nyuEidVQsgoHCWb5", "question": "What type of outfit is the man wearing?", "choices": ["sweat suit", "track suit", "scuba suit", "wet suit"], "correct_choice_idx": 3, "direct_answers": ["wet suit", "wetsuit", "wet suit", "wetsuit", "wet suit", "wet suit", "wetsuit", "wetsuit", "wetsuit", "wet suit"], "difficult_direct_answer": false, "rationales": ["The man is wearing a wetsuit to surf.", "The man is about to go surfboarding. you surfboard in the water.", "The outfit is tight and he is coming in from the water."], "image": "val2014/COCO_val2014_000000410195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21968, "question_id": "EWeWGQeccyNTjz3iZnub6b", "question": "WHat type of animal is fried here?", "choices": ["cow", "chicken", "goat", "frog"], "correct_choice_idx": 3, "direct_answers": ["frogs", "frog legs", "frogs", "chicken", "frogs", "frog", "frog", "frog", "frogs", "frog"], "difficult_direct_answer": false, "rationales": ["Frog legs are being fried.", "The legs look like they're from frogs.", "There is a sign with writing on it that indicates what is being fried."], "image": "train2014/COCO_train2014_000000021968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262985, "question_id": "EWipbDVG83jRy7WjA88ZHh", "question": "The flags indicate that these boats come from which continent?", "choices": ["south america", "africa", "europe", "asia"], "correct_choice_idx": 2, "direct_answers": ["europe", "europe", "europe", "europe", "multiple", "europe", "switzerland", "europe", "europe", "european"], "difficult_direct_answer": false, "rationales": ["The flags are all from european countries.", "A boat has a blue and yellow flag flying from it.", "The flags are european."], "image": "val2014/COCO_val2014_000000262985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90011, "question_id": "EWxCviXdotoJJECQJ7pTrq", "question": "What role is being taken on by the person standing?", "choices": ["hair stylist", "magician", "hair dyer", "blow dryer"], "correct_choice_idx": 0, "direct_answers": ["barber", "haircutted", "hair stylist", "barber", "barber", "cutting hair", "hairdresser", "hair stylist", "stylist", "hair dresser"], "difficult_direct_answer": false, "rationales": ["A person who is standing is holding the hair of a person sitting in front of them between their fingers while also holding scissors. the person holds the hair the way a stylist or barber would.", "The person standing is cutting the other person hair so it looks better.", "The standing person is cutting the hair of the sitting person."], "image": "val2014/COCO_val2014_000000090011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515612, "question_id": "EX7AHC7sLLCsoURX7qGwGQ", "question": "What powers the musical instrument shown here?", "choices": ["solar", "gas", "battery", "oil"], "correct_choice_idx": 2, "direct_answers": ["batteries", "battery", "battery", "batteries", "piano", "batteries", "battery", "battery", "batteries", "batteries"], "difficult_direct_answer": false, "rationales": ["There are no cords attached to the instrument.", "The instrument needs a battery.", "The keyboard is run by batteries."], "image": "val2014/COCO_val2014_000000515612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490505, "question_id": "EXLyrTRUedrELHHSphJnFQ", "question": "What was used to get unique colors on roses here?", "choices": ["rain", "sun", "pencil", "dye"], "correct_choice_idx": 3, "direct_answers": ["color dye", "color dye", "dye", "dye", "dye", "dye", "dye", "dye", "dye", "color dye"], "difficult_direct_answer": false, "rationales": ["Tie dye was used to produce the colors.", "People put color in water for the flowers to soak it up and change to multi colors like this", "The dye was used."], "image": "val2014/COCO_val2014_000000490505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551829, "question_id": "EXN8qYdgBjFa9wbKGST5fC", "question": "What makes these bricks sturdy?", "choices": ["sun baking", "heat firing", "hand stamping", "rebar"], "correct_choice_idx": 1, "direct_answers": ["cement", "cement", "concrete", "cement", "configuration", "cement", "mortar", "clay", "heat firing", "hard clay"], "difficult_direct_answer": false, "rationales": ["Nowadays these are done faster by using fire to solidify the clay", "When baked the bricks become hard.", "These bricks are fired up to solidify."], "image": "train2014/COCO_train2014_000000551829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128275, "question_id": "EXVjtw8keFGrjHFkL3eHEX", "question": "What purpose does the pen and paper serve to track?", "choices": ["wines", "bread", "dessert", "billing"], "correct_choice_idx": 0, "direct_answers": ["menu", "wines", "feedback", "wine taste", "stationary", "wines", "wines", "drink quality", "wines", "wines"], "difficult_direct_answer": false, "rationales": ["The person is serving wine.", "The paper is near some wine that is being poured.", "The purpose is for wine."], "image": "train2014/COCO_train2014_000000128275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465195, "question_id": "EXXJq67mKWA2JaVfH5kHS8", "question": "What led to the cracking of the roads depicted?", "choices": ["earthquake", "heavy traffic", "landslide", "ice expansion"], "correct_choice_idx": 3, "direct_answers": ["frigid temperatures", "ice", "temperature", "soil swelling", "cold", "ice expansion", "frost heaves", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The sign states that frost heaves caused the problem.", "The road cracked because water expands.", "A road with cracks and snow on both sides is shown."], "image": "train2014/COCO_train2014_000000465195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187803, "question_id": "EXacFPFVEhy2vRwHhFSeq9", "question": "What is located in the corner?", "choices": ["trash", "couch", "stairs", "lamp"], "correct_choice_idx": 2, "direct_answers": ["stairs", "spiral staircase", "staircase", "painting", "staircase", "books", "staircase", "spiral staircase", "spiral staircase", "book cupboard"], "difficult_direct_answer": false, "rationales": ["There are ascending steps going in a circular fashion to the next floor.", "There is a winding circular staircase in the corner.", "There are stairs to go up to the top part of the library."], "image": "train2014/COCO_train2014_000000187803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192701, "question_id": "EXcmxnpoN5YixNhzQmdiBD", "question": "What is the physically largest number associated with?", "choices": ["luck", "sin", "misfortune", "greatness"], "correct_choice_idx": 3, "direct_answers": ["teamwork", "greatness", "one", "sex", "race competition", "sun", "entry number", "one", "one", "person"], "difficult_direct_answer": false, "rationales": ["The number one can be seen on the side of a motorcycle and is larger than all of the other numbers on the bike.", "The large number means how great the person is at tricks.", "The largest is for greatness."], "image": "val2014/COCO_val2014_000000192701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307686, "question_id": "EXrk6A7tJ6iKbTkdEe6fm3", "question": "Why is this man sleeping on the bench?", "choices": ["for fun", "being homeless", "being tired", "being sick"], "correct_choice_idx": 1, "direct_answers": ["tired", "man's tired", "homeless", "resting", "homeless", "tired", "tired", "homeless", "homelessness", "being homeless"], "difficult_direct_answer": false, "rationales": ["The man is homeless.", "The person doesn't have a home.", "The man is on a bench on a sidewalk of a busy street laying down."], "image": "train2014/COCO_train2014_000000307686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297910, "question_id": "EXxkuWDUZZFe9oSfWEroVz", "question": "What animal are the people checking out?", "choices": ["goats", "lambs", "donkeys", "horses"], "correct_choice_idx": 3, "direct_answers": ["horse", "horse", "horses", "horses", "horses", "horses", "horses", "horses", "ponies", "horses"], "difficult_direct_answer": false, "rationales": ["The other options aren't in the picture.", "The people are checking out horses since there are so many maned animals in the street.", "The animal is a horse."], "image": "val2014/COCO_val2014_000000297910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136928, "question_id": "EXyuAtMbtdUZRtuS2iwtG5", "question": "The man standing near the buses is probably there to do what?", "choices": ["get directions", "travel", "drive", "sight-see"], "correct_choice_idx": 2, "direct_answers": ["drive", "drive", "drive bus", "drive bus", "drive", "stop bus", "drive", "drive", "drive", "surveying"], "difficult_direct_answer": false, "rationales": ["The man is driving.", "The man is waiting to take the bus out on his route.", "When there are multiple buses parked next to each other, it is probably a gathering place for transportation employees."], "image": "train2014/COCO_train2014_000000136928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47684, "question_id": "EYP6ANQ6sJFfEWMNoXR8Jr", "question": "What physical activity is the man exerting?", "choices": ["smoking", "farting", "washing", "urinating"], "correct_choice_idx": 3, "direct_answers": ["urinating", "urinating", "urinating", "urinating", "urinating", "peeing", "peeing", "urinating", "urination", "peeing"], "difficult_direct_answer": false, "rationales": ["He is in a bathroom standing in front of the urinal.", "The man is in a restroom.", "You can tell by his position and the urinals as to what he is doing."], "image": "train2014/COCO_train2014_000000047684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555833, "question_id": "EYctjNJRTqYpF3bpP6kJ7a", "question": "Where is the cameraman most likely taking a picture from?", "choices": ["car rooftop", "palm tree", "building", "mountain"], "correct_choice_idx": 2, "direct_answers": ["building", "up", "above", "window", "roof", "roof", "roof top", "tall building", "above", "above"], "difficult_direct_answer": false, "rationales": ["The photographer needed to be from a very high angle to take the photo from so far away.", "The photographer is on top of the building.", "Because the image was taken at a higher level that can only be of a tall building structure."], "image": "train2014/COCO_train2014_000000555833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92815, "question_id": "EYqvitGfAjBY6pNhyS9keA", "question": "How many game players are there?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "four", "two", "four", "two", "four", "four", "two", "1 game"], "difficult_direct_answer": false, "rationales": ["It's impossible to see if the other two people have controllers in their hands.", "There are four people. half of of them have controllers.", "Usually, a family will join together to be entertained by some friendly competition."], "image": "val2014/COCO_val2014_000000092815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343165, "question_id": "EYrraPmWU6gNQjZVxHSdQ7", "question": "How many buttons are on the bottom of the controller in her left hand?", "choices": ["none", "one", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["two", "one", "one", "two", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one button in the bottom of the controller in her left hand. this is the b button.", "There is one button on the controller.", "There is one button."], "image": "train2014/COCO_train2014_000000343165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332663, "question_id": "EYxJGqN8ZmULvfWJMYDNiK", "question": "These animals have what classification on IUCN's Red List of Threatened Species?", "choices": ["long gone", "extinct", "vulnerable", "endangered"], "correct_choice_idx": 2, "direct_answers": ["zebra", "not extinct", "zebra", "zebras", "population stable", "near threatened", "endangered", "vulnerable", "endangered", "endangered"], "difficult_direct_answer": false, "rationales": ["They are considered vulnerable.", "The zebra population is decreasing and near threatened according to the iucn's red list.", "Zebras are under threat of extinction."], "image": "val2014/COCO_val2014_000000332663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358576, "question_id": "EZL8efujsmiXZsv6fb3AU7", "question": "Where are these foods being sold?", "choices": ["supermarket", "casino", "mall", "flea market"], "correct_choice_idx": 0, "direct_answers": ["market", "fruits", "market", "supermarket", "market", "farmers market", "market", "fruit", "market", "market"], "difficult_direct_answer": false, "rationales": ["They are arranged in well stocked shelves that can be found in a supermarket.", "There is a supermarket with fruit.", "The store looks like a typical supermarket and has the signs that you would find in a supermarket."], "image": "train2014/COCO_train2014_000000358576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120524, "question_id": "EZMmfph6BW7n4ZgXudrrvo", "question": "Which animal is more likely to eat the other?", "choices": ["cat", "dog", "sheep", "goat"], "correct_choice_idx": 1, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["Dogs are more of a predator than a sheep is.", "The dog is smelling.", "It's obviously the more naturally predatory option."], "image": "train2014/COCO_train2014_000000120524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331031, "question_id": "EZtmg5zw2SKDuFS69i8Lu8", "question": "What is the woman's headband meant to look like?", "choices": ["unicorn horn", "paws", "antlers", "hands"], "correct_choice_idx": 2, "direct_answers": ["antlers", "antlers", "antlers", "horns", "antlers", "antlers", "antlers", "reindeer", "antlers", "antlers"], "difficult_direct_answer": false, "rationales": ["The woman's headband resembles animal antlers.", "The woman has antlers.", "The headband is on her head, so it would not make sense for it to look like hands or paws. there are two horns on the headband, so it does not look like a unicorn."], "image": "train2014/COCO_train2014_000000331031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382401, "question_id": "EZxnuSDk8Znqu82hb6ijv2", "question": "Why does the man on the railing have his hand to his head?", "choices": ["making call", "scratching itch", "to exercise", "blocking noise"], "correct_choice_idx": 0, "direct_answers": ["phone", "cellphone", "using phone", "phone", "talking", "stress", "making call", "cell phone", "phone call", "phone"], "difficult_direct_answer": false, "rationales": ["The man on the railing has his hand to his head because he is talking on the phone.", "He is holding a phone to his ear", "The man is talking on the phone."], "image": "train2014/COCO_train2014_000000382401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457800, "question_id": "EaHZXxW6d22tTG3mH59Lqc", "question": "Who is the old man to the young girl?", "choices": ["teacher", "neighbor", "grandfather", "cousin"], "correct_choice_idx": 2, "direct_answers": ["grandfather", "grandfather", "probably grandfather", "grandfather", "grandfather", "grandpa", "grandfather", "grandfather", "grandfather", "grandfather"], "difficult_direct_answer": false, "rationales": ["The girl is too small to be her daughter and he is taking care of her.", "The child appears to be a grandchild to a man who appears to be a grandfather to the child. grandparents may take their grandkids for a fun kite-flying adventure.", "Based on their relative ages and genders and the familiar embrace, answer a is most likely."], "image": "train2014/COCO_train2014_000000457800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578210, "question_id": "EbAN9pLbJMM8cc6iEmdA9Z", "question": "What is the child who's birthday is being celebrated have a passion for?", "choices": ["texas", "cell phones", "animals", "wax"], "correct_choice_idx": 1, "direct_answers": ["mobile phone", "cell phones", "phones", "music", "cell phones", "phones", "electronics", "cell phones", "music", "cellphones"], "difficult_direct_answer": false, "rationales": ["This is obvious based on the decorations on the cake.", "There is a cell phone as a cake topper.", "The cake has a mobile device and bars on it."], "image": "val2014/COCO_val2014_000000578210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146881, "question_id": "EbCnTpYgEeU4c3K9QNhsUT", "question": "What does the bus say at the top?", "choices": ["red", "shuttle", "open", "closed"], "correct_choice_idx": 1, "direct_answers": ["shuttle", "shuttle", "shuttle", "shuttle", "shuttle", "shuttle", "shuttle", "shuttle", "shuttle", "shuttle"], "difficult_direct_answer": false, "rationales": ["That's what the word is.", "The bus has \"shuttle\" written on it.", "The front top of the bus is clearly visible and the text is readable."], "image": "train2014/COCO_train2014_000000146881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553501, "question_id": "EbLVfpZ3FcVzPyfTJgPFSn", "question": "What will the bananas look like under the skin?", "choices": ["bruised", "dripping wet", "molten", "seedless"], "correct_choice_idx": 0, "direct_answers": ["white", "bruised", "white", "brown", "mushy", "brown", "brown", "brown", "rotten", "brown spots"], "difficult_direct_answer": false, "rationales": ["The bananas will be bruised as they're brown.", "The skin has lots of brown spots which will also be on the edible part of the fruit.", "If the skins are brown then the inside may be too."], "image": "train2014/COCO_train2014_000000553501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456519, "question_id": "Ebz2dWfFWGHfUB2KJNLWB4", "question": "Where does the man want to hit the ball?", "choices": ["above him", "behind him", "on ground", "over net"], "correct_choice_idx": 3, "direct_answers": ["over net", "on racket", "to opponent", "over net", "over net", "court", "over net", "over net", "over net", "over"], "difficult_direct_answer": false, "rationales": ["The man is trying to hit the ball over the net with his tennis racquet.", "The whole goal of tennis is to serve the ball over the net.", "His opponent is on the other side of the divider and he wants to score a point by hitting the ball near him."], "image": "val2014/COCO_val2014_000000456519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514722, "question_id": "Ec8JvxhTKgvrTN3VZ7zwvw", "question": "What is the problem in this area?", "choices": ["water shortage", "traffic congestion", "water pollution", "landslide"], "correct_choice_idx": 2, "direct_answers": ["litter", "smog pollution", "garbage", "pollution", "debris", "water pollution", "litter", "pollution", "trash", "polluted"], "difficult_direct_answer": false, "rationales": ["There is a lot of garbage in the water.", "The water has been contaminated with litters and waste products that are visible.", "The water looks super dirty."], "image": "val2014/COCO_val2014_000000514722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218343, "question_id": "Ec9WHQL3fhdnWuh4rXu2sk", "question": "Which electronic device is likely located in front of the coffee table?", "choices": ["television", "record player", "telephone", "stereo"], "correct_choice_idx": 0, "direct_answers": ["laptop", "television", "television", "fan", "television", "tv", "laptop", "television", "television", "television"], "difficult_direct_answer": false, "rationales": ["It is a normal device for this room and there are remotes on the table", "There are two remotes on the coffee table that are pointed towards where the tv is likely located.", "The tv is usually in front of the couch so you can see it better."], "image": "val2014/COCO_val2014_000000218343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330897, "question_id": "EcGrHLdCnMK7xGcAQ4Bucd", "question": "Why are there so many things on the pizza?", "choices": ["throwing away", "adds flavor", "looks nice", "more money"], "correct_choice_idx": 1, "direct_answers": ["custom toppings", "toppings", "to eat", "supreme", "wanted", "hungry", "toppings", "adding flavors", "toppings", "adds flavor"], "difficult_direct_answer": false, "rationales": ["The things on the pizza are toppings which would each have their own flavor so answer a is intuitive although there is not a specific answer.", "The things on the pizza are food items and not all of them are the same. if additional food items are added to something they increase flavor.", "The toppings add flavor."], "image": "val2014/COCO_val2014_000000330897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372669, "question_id": "EcJ6tDW3xEcdjpmkPcstXM", "question": "What animated series does this person probably enjoy?", "choices": ["simpsons", "spongebob squarepants", "family guy", "animaniacs"], "correct_choice_idx": 2, "direct_answers": ["family guy", "family guy", "family guy", "family guy", "family guy", "family guy", "family guy", "ren", "family guy", "family guy"], "difficult_direct_answer": false, "rationales": ["Chris is the son on family guy.", "Stewie is in the background of the couch.", "The blanket draped behind them looks to feature the character stewie, who is one of the main characters in the show family guy."], "image": "train2014/COCO_train2014_000000372669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25609, "question_id": "EcL6MATZV2WBXQTeTAiNmf", "question": "Which one of these companies makes this type of dessert?", "choices": ["wendy's", "kfc", "subway", "dunkin'"], "correct_choice_idx": 3, "direct_answers": ["dunking donuts", "dunkin", "dunkin donuts", "dunking donuts", "dunkin donuts", "dunking donuts", "dunkin donuts", "dunkin'", "bakery", "bakery"], "difficult_direct_answer": false, "rationales": ["The company makes donuts.", "It is doughnuts.", "Dunkin donuts makes donuts."], "image": "val2014/COCO_val2014_000000025609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60132, "question_id": "EcNhyTCLN5Htzg4jPuim3K", "question": "Where can you find the real counterpart of the graphic on the frisbee?", "choices": ["inside foot", "inside head", "inside chest", "inside arm"], "correct_choice_idx": 1, "direct_answers": ["inside head", "minecraft", "head", "drawing artist", "computer game", "minecraft", "head", "head", "skeleton", "art store"], "difficult_direct_answer": false, "rationales": ["The graphic is of a skull.", "The counterpart of the graphic would be a head since a skull is shown.", "You can find the skull in the head."], "image": "val2014/COCO_val2014_000000060132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207354, "question_id": "Ecka2qFZUw6cT3XVvoGpiu", "question": "What piece of clothing is most dangerous for the boy?", "choices": ["hat", "shirt", "scarf", "pants"], "correct_choice_idx": 2, "direct_answers": ["tie", "handkerchief", "neck tie", "scarf", "scarf", "hankerchief", "scarf", "scarf", "scarf", "yellow scarf"], "difficult_direct_answer": false, "rationales": ["The boy is not wearing a hat. his shirt and pants cannot hurt him.", "The scarf could choke the boy.", "The boy is wearing a yellow scarf for cub scouts."], "image": "train2014/COCO_train2014_000000207354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22440, "question_id": "EdJCM2SCDF22M7yhAGVW2v", "question": "The name of the street at the top of the bus is the name of a team in what sport?", "choices": ["basketball", "soccer", "baseball", "mma"], "correct_choice_idx": 1, "direct_answers": ["soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer"], "difficult_direct_answer": false, "rationales": ["There is a soccer team called liverpool.", "The name of the street is the same as liverpool, the soccer team.", "Soccer or football is a popular sport in all of the uk. liverpool is not exception and has a popular soccer team."], "image": "train2014/COCO_train2014_000000022440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398744, "question_id": "Edj873gnUtac76omJEeMSM", "question": "What color of this fruit is good for eating?", "choices": ["green", "brown", "black", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "yellow", "green", "green", "yellow", "yellow", "green", "green", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["Ripe bananas are yellow.", "They are bananas.", "These are bananas. green bananas are unripened, and black and brown ones are overripened."], "image": "train2014/COCO_train2014_000000398744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102319, "question_id": "EdsMCjqvsyPp9GQnms4Kdv", "question": "For what planned activity is the person modifying the road bicycle?", "choices": ["newspaper delivery", "indoor exercise", "maintenance", "moving"], "correct_choice_idx": 1, "direct_answers": ["marathon", "biking", "indoor exercise", "bike riding", "race", "cycling", "race", "racing", "bike riding", "fixing"], "difficult_direct_answer": false, "rationales": ["The person wants to make the bike a stationary one.", "They are mounting it so the wheels don't move the bike forward", "The person is making the bike stationary."], "image": "train2014/COCO_train2014_000000102319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133034, "question_id": "Ee8GffM6R2oogsyH56GA9e", "question": "How can you tell this person may be in South Korea?", "choices": ["passport", "sign", "smartphone", "won currency"], "correct_choice_idx": 3, "direct_answers": ["money", "money", "won", "money", "currency", "money", "won currency", "won", "won", "their money"], "difficult_direct_answer": false, "rationales": ["The won currency is found only in korea.", "The money that is being used has \"won\" on it. there is also some pamphlet with asian writing.", "The passport is american. the smartphone is made by lg but could be from any country."], "image": "val2014/COCO_val2014_000000133034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24105, "question_id": "EeCbbnPfXh7aw6n4oZ9JtR", "question": "What type of couch is this?", "choices": ["traditional", "sectional", "mid-century modern", "scandinavian"], "correct_choice_idx": 1, "direct_answers": ["sectional", "velvet", "l shaped", "red", "sectional", "sectional", "sectional", "l couch", "online", "sectional"], "difficult_direct_answer": false, "rationales": ["There are many parts to this couch.", "The type of couch is typically curved in to directions.", "The couch is a sectional."], "image": "train2014/COCO_train2014_000000024105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97946, "question_id": "EeJfXwuVT7LYjXiZCtGZ4e", "question": "What are the men doing with the large wooden poles?", "choices": ["exercising", "fighting", "land paddling", "jousting"], "correct_choice_idx": 2, "direct_answers": ["guiding skateboard", "land paddling", "skating", "skateboarding", "skateboarding oddly", "paddling uphill", "pushing", "boarding", "pushing", "paddling"], "difficult_direct_answer": true, "rationales": ["They are using them to move themselves on the skateboards", "This pushes them along on the skateboards instead of using their feet", "They help to steer while on the skateboard."], "image": "val2014/COCO_val2014_000000097946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343558, "question_id": "EeTXeNxWKMwpKS3ArgQXHb", "question": "What is used to keep the horses in one area?", "choices": ["trees", "dogs", "fences", "guns"], "correct_choice_idx": 2, "direct_answers": ["fence", "field", "fence", "fence", "fences", "field", "fence", "fence", "pasture", "fence"], "difficult_direct_answer": false, "rationales": ["The horses are fenced.", "There are fences behind the horses in the image.", "There are fences making the stables for the horses."], "image": "train2014/COCO_train2014_000000343558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316031, "question_id": "Eed6jBDykceWHv23WJJDT6", "question": "What is the name for the cross shaped structure on top of the tower?", "choices": ["field thermometer", "weather vane", "tungsten cross", "metric barometer"], "correct_choice_idx": 1, "direct_answers": ["weather vane", "spire", "weather vane", "cross", "weather vane", "weather vane", "weathervane", "weathervane", "weathervane", "tower"], "difficult_direct_answer": false, "rationales": ["The name is a weather vane.", "The cross shaped structure is a weather vane.", "The name of the cross shaped structure is a weather vane."], "image": "val2014/COCO_val2014_000000316031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163010, "question_id": "EegvLqKh8nLMAabLbJW27e", "question": "Which surfer is more experienced?", "choices": ["larger one", "smaller", "same", "elderly lady"], "correct_choice_idx": 0, "direct_answers": ["older man", "larger one", "adult", "man", "adult", "adult", "big man", "man", "adult", "older"], "difficult_direct_answer": false, "rationales": ["The older person has more years of experience because he's much older than the child", "The adult likely has more skill than the child and possibly teaching the child.", "A small child is on a surfboard with a man in a wetsuit behind him."], "image": "val2014/COCO_val2014_000000163010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36639, "question_id": "EepB7RuHjwkUupSGKeexxR", "question": "Why is the blue bus not moving?", "choices": ["stop sign", "no engine", "no gas", "broke down"], "correct_choice_idx": 0, "direct_answers": ["its stagnant", "broken", "its parked", "stop sign", "engine trouble", "stop sign", "stopped", "stop sign", "stop sign", "parked"], "difficult_direct_answer": false, "rationales": ["A stop sign requires all vehicles to fully stop before they proceed. the van doesn't appear to have a driver either.", "The van has approached a red octagonal traffic sign that requires it to wait, then check if traffic is clear, before proceeding onto the next street.", "This sign tells the people driving that they must quit driving for a bit."], "image": "train2014/COCO_train2014_000000036639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453625, "question_id": "EevXpcRTRkpEavCYgpfSCC", "question": "Which method of note taking is most frequent here?", "choices": ["legal pad", "laptop", "crayola", "sketch pad"], "correct_choice_idx": 1, "direct_answers": ["laptop", "via laptop", "online", "laptop", "computer", "outline note", "laptop", "typing", "typing", "highlighter"], "difficult_direct_answer": false, "rationales": ["Desks are typically used the most when working on a computer.", "The laptops are in a closer and handier position so they are likely used more frequently than the cell phone or desktop computer.", "Multiple laptops are on the table."], "image": "train2014/COCO_train2014_000000453625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62710, "question_id": "EezNaAhPAvx2BvQNFPepvL", "question": "What is likely on the item by the window?", "choices": ["clothes", "television", "food", "painting"], "correct_choice_idx": 3, "direct_answers": ["painting", "painting", "canvas", "painting", "painting", "painting", "ease", "easel", "painting", "painting"], "difficult_direct_answer": false, "rationales": ["When people paint, they use an easel to hold up their canvas.", "An easel is by a window. easels are used for painting.", "The back of the object is an easel, which is what a would rest on."], "image": "train2014/COCO_train2014_000000062710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136373, "question_id": "EfGLJhbsY8jyePxzasfG6E", "question": "The dog on the right side of the bed resembles what breed of dog?", "choices": ["bulldog", "dalmatian", "doberman", "shiba inu"], "correct_choice_idx": 3, "direct_answers": ["corgi", "akita inu", "husky", "collie", "shibu inu", "husky", "corgi", "shiba inu", "shiba inu", "german shepherd"], "difficult_direct_answer": false, "rationales": ["The dog on the right side of the bed has the traits of the breed known as shiba inu.", "It is light-brown in color with a black nose and pointy ears.", "That is the type of dog."], "image": "train2014/COCO_train2014_000000136373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185368, "question_id": "EfZECC5m9TxP9RzEmgnbCs", "question": "What kind of activity with respect to the bike is the man on the floor most likely engaging in?", "choices": ["drawing", "painting", "purchasing", "diagnosing"], "correct_choice_idx": 3, "direct_answers": ["inspecting", "troubleshooting", "motorbike", "studying", "diagnosing", "writing", "sketching", "instructions", "giving directions", "checking bike"], "difficult_direct_answer": true, "rationales": ["His hands are out of view behind the bike with eyes looking down, so he is probably about to examine the engine.", "Two men are looking down at a motorcycle. they are looking at different parts of the bike.", "The man on the floor is reading and looking. he is not painting or drawing."], "image": "train2014/COCO_train2014_000000185368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180327, "question_id": "Eff3XBove7k5z4eRMTK6pH", "question": "What type of bread is being used?", "choices": ["rye", "french", "pumpernickel", "wheat"], "correct_choice_idx": 1, "direct_answers": ["baguette", "hoagie", "french", "french bread", "italian sub", "baguette", "hero", "french", "white", "sub"], "difficult_direct_answer": false, "rationales": ["It is a long white roll, which resembles that of a french style baguette.", "The sandwich uses an entire loaf of that bread.", "French bread is being used."], "image": "train2014/COCO_train2014_000000180327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3837, "question_id": "Efu2gU9ct7qJSN37kAHEYG", "question": "Sliding on the waves using fin boards are called?", "choices": ["skiing", "boating", "surfing", "swimming"], "correct_choice_idx": 2, "direct_answers": ["windsurfing", "surfing", "surfing", "surfboard fin", "surfing", "sliding out", "surfing", "paddle boarding", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The answer is commonly known based on the description of the question and not related to the image necessarily.", "A girl is standing on a surfboard at the water's edge.", "The people are at the beach on surfboards."], "image": "val2014/COCO_val2014_000000003837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119299, "question_id": "EgALuPdCf2hJWLh6ZXbrEe", "question": "The man's strap is likely connected to what?", "choices": ["dog", "fan", "camera", "chair"], "correct_choice_idx": 2, "direct_answers": ["bag", "bag", "camera", "bag", "bag", "bag", "messenger bag", "bag", "messenger bag", "bag"], "difficult_direct_answer": false, "rationales": ["The way that the strap is positioned, it is likely a camera.", "The man is dressed like a tourist and tourist usually keep their cameras strapped to them to take pictures often and easily.", "The positioning of the strap"], "image": "train2014/COCO_train2014_000000119299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102655, "question_id": "EgNw5xnBzxeoLH7nXz6Pmj", "question": "What are the white squares on the stadium seats?", "choices": ["cameras", "seat numbers", "fans' names", "decoration"], "correct_choice_idx": 1, "direct_answers": ["numbers", "seat covers", "seat numbers", "stickers", "seat numbers", "seat numbers", "seat numbers", "seat numbers", "chair numbers", "numbers"], "difficult_direct_answer": false, "rationales": ["The squares are seat numbers.", "The white squares show what the the seat numbers are.", "The white squares indicate which seats are which."], "image": "train2014/COCO_train2014_000000102655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30347, "question_id": "EgViqBpGtj6755cWHnA2kQ", "question": "What company made the silver laptop the man on the couch is using?", "choices": ["microsoft", "apple", "hp", "dell"], "correct_choice_idx": 1, "direct_answers": ["hp", "apple", "apple", "apple", "apple", "apple", "apple", "linux", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["The person is using a macbook.", "You can see the logo on the back of the computer.", "The company is apple."], "image": "val2014/COCO_val2014_000000030347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371503, "question_id": "EgcaiYbxc3xrrFqQzoBHqG", "question": "What brand are the shoes of the boy who is kicking the ball?", "choices": ["diadora", "nike", "mizuno", "adidas"], "correct_choice_idx": 3, "direct_answers": ["adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas"], "difficult_direct_answer": false, "rationales": ["He is wearing adidas.", "That's what brand he's wearing.", "The shoes have adidas stripes on them."], "image": "val2014/COCO_val2014_000000371503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393978, "question_id": "EghDv2KES9YC3XWsJ9evjs", "question": "What are the brand of his skate shoes?", "choices": ["fallen", "vans", "dc", "emerica"], "correct_choice_idx": 1, "direct_answers": ["hanes", "nike", "converse", "converse", "vans", "converse", "vans", "adidas", "vans", "vans"], "difficult_direct_answer": false, "rationales": ["The brand is vans.", "They are a classic vans shoe.", "Vans have the design that the shoes have."], "image": "train2014/COCO_train2014_000000393978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501299, "question_id": "Eh2G8xzQH8z2t9hbQg9xTg", "question": "What is the man using the device in his hand to do?", "choices": ["shave beard", "make call", "cut hair", "brush teeth"], "correct_choice_idx": 1, "direct_answers": ["speak others", "talk", "phonecall", "call", "talk", "cellphone", "talk phone", "make call", "communicate", "talk"], "difficult_direct_answer": false, "rationales": ["He is holding a cell phone to his ear", "He is communicating with another person.", "The man wants to talk to someone."], "image": "train2014/COCO_train2014_000000501299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5360, "question_id": "EhPcjNTN5bztmaUC9wy7yC", "question": "Why is the woman under the lampshade?", "choices": ["shade", "warmth", "for attention", "to hide"], "correct_choice_idx": 3, "direct_answers": ["playing", "hiding", "to hide", "hiding", "hiding", "hiding", "hiding", "hiding", "hiding", "goofing around"], "difficult_direct_answer": false, "rationales": ["A lamp shade can be used to cover something.", "A lampshade can provide some coverage. she doesn't want to be seen.", "The woman is under the lampshade to hide from the photograph."], "image": "train2014/COCO_train2014_000000005360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58435, "question_id": "EhXohsQFd5QLi9wf68EF7u", "question": "Who is this person most likely to be?", "choices": ["burglar", "friend", "pilot", "buyer"], "correct_choice_idx": 2, "direct_answers": ["pilot", "pilot", "pilot", "pilot", "pilot", "pilot", "pilot", "pilot", "pilot", "pilot"], "difficult_direct_answer": false, "rationales": ["The person is standing near a plane.", "He is posing in front of a plane, which indicates he is about to enter the plane, most likely as the pilot because there does not appear to be much space for passengers.", "The person is standing next to the plane."], "image": "train2014/COCO_train2014_000000058435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456261, "question_id": "EhYX7r2Erqhu5vEgZmEEEN", "question": "What is a very normal use for the body part sticking out near the foot of the bed?", "choices": ["opening doors", "walking", "listening", "drinking"], "correct_choice_idx": 1, "direct_answers": ["walking", "walking", "walking", "walking", "walking", "walking", "feet", "feet", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["The body part in question is clearly visible and identifiable as feet. the primary function for feet is answer a.", "The other options don't normally apply. that said, some people have learned to use their feet like hands.", "Feet are often how people walk."], "image": "val2014/COCO_val2014_000000456261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223451, "question_id": "EhfssF2STkyXd5GrhRSAt5", "question": "What is she ready to do?", "choices": ["dunk", "dribble", "juggle", "swing"], "correct_choice_idx": 3, "direct_answers": ["hit ball", "return ball", "hit ball", "return ball", "swing", "hit ball", "hit ball", "hit ball", "return", "hit ball"], "difficult_direct_answer": false, "rationales": ["Because she is playing tennis and in a swinging position it's obvious.", "The woman is swinging.", "The tennis player is ready to hit the ball."], "image": "train2014/COCO_train2014_000000223451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224000, "question_id": "EhzngCwk9Wdra2eh8tMWcd", "question": "Why are the sheep turning right?", "choices": ["avoiding dust", "eating grass", "avoiding car", "following motorcyclist"], "correct_choice_idx": 2, "direct_answers": ["cross road", "to graze", "herded", "herding", "create way", "avoiding car", "being herded", "avoid road", "owner guidance", "being herded"], "difficult_direct_answer": true, "rationales": ["The sheep are crossing the street.", "The sheep are turning to not be hit by the red car.", "The sheep are crossing the road avoiding the road."], "image": "val2014/COCO_val2014_000000224000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496683, "question_id": "EiAvc2P4LXXzDazEpjuhvg", "question": "What must be done to get to Tech Camps?", "choices": ["turn right", "straight ahead", "do u-turn", "turn left"], "correct_choice_idx": 0, "direct_answers": ["turn", "tech", "turn right", "turn right", "travel", "go right", "turn right", "go right", "go right", "right side"], "difficult_direct_answer": false, "rationales": ["The sign is pointing right.", "The camps are at the right.", "The sign has a rightward arrow."], "image": "train2014/COCO_train2014_000000496683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374966, "question_id": "EiBEL6yJBtDQvVDyYiexHt", "question": "What is the area the boys are skating in called?", "choices": ["arena", "pipe", "bowl", "ramp"], "correct_choice_idx": 2, "direct_answers": ["skate park", "skating pool", "half pipe", "skate park", "bowl", "park", "pool", "bowl", "skatepark", "skateboard park"], "difficult_direct_answer": false, "rationales": ["The skate park has a bowl shape.", "The area is a specially designed area for skateboarding that has curved edges all the way around it.", "The boys are skating in a skate bowl."], "image": "val2014/COCO_val2014_000000374966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546877, "question_id": "EiCAB6Bw95tWhoE2BumWqV", "question": "What do the tight ropes off to the side of the blue boat do to it?", "choices": ["secure it", "play games", "signal", "nothing"], "correct_choice_idx": 0, "direct_answers": ["hold position", "dock", "tie down", "anchor it", "secure it", "tether", "anchor", "secure it", "tether", "anchor"], "difficult_direct_answer": false, "rationales": ["The ropes on the sides of the boats keep them steady. the boats cannot drift out onto the water when tethered.", "The rope keeps the boat from floating away.", "The ropes help keep the boats in place."], "image": "val2014/COCO_val2014_000000546877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369180, "question_id": "EiFtQDv5KmUU8GFKqxgvPu", "question": "The elephants are being contained by what?", "choices": ["string", "leash", "arms", "wall"], "correct_choice_idx": 0, "direct_answers": ["rope", "rope", "ropes", "ropes", "rope", "rope", "rope", "rope", "rope", "string"], "difficult_direct_answer": false, "rationales": ["There is a yellow rope that is containing the elephants.", "It is a long, thin object stretched across a distance.", "The elephants are in a string."], "image": "train2014/COCO_train2014_000000369180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366591, "question_id": "EiLhRkcxF2gVVLXCrjQrWx", "question": "Protected areas for these types of animals are known as what?", "choices": ["states", "wildlife reserves", "demilitarized zones", "unions"], "correct_choice_idx": 1, "direct_answers": ["sanctuaries", "giraffes", "wildlife reserves", "sanctuary", "conservation", "fences", "sanctuary", "zoos", "sanctuaries", "sanctuary"], "difficult_direct_answer": false, "rationales": ["This is a wildlife reserve for animals.", "The wildlife here is in a wildlife conservation.", "Protected areas for giraffes are called wildlife reserves."], "image": "train2014/COCO_train2014_000000366591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535278, "question_id": "EiQdkrZYYxcReUA4MCts4G", "question": "What is the item below the stuffed frog called?", "choices": ["storage file", "book binder", "brief case", "purse"], "correct_choice_idx": 3, "direct_answers": ["lunch box", "lunch box", "purse", "suitcase", "lunchbox", "desk", "purse", "handbag", "lunchbox", "lunch box"], "difficult_direct_answer": false, "rationales": ["The woman seems to have a purse that she puts her items like other ladies.", "Traditionally these type of items have large handles and are square in shape.", "The item is a purse."], "image": "train2014/COCO_train2014_000000535278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92122, "question_id": "EiitCwfpvq4jbwFAEsx5Fr", "question": "What is 19 trying to do?", "choices": ["sleep", "touch base", "get ball", "avoid player"], "correct_choice_idx": 1, "direct_answers": ["slide", "score", "be safe", "slide", "hit base", "steal base", "touch base", "slide home", "score", "slide"], "difficult_direct_answer": false, "rationales": ["Player #19 is going to slide so that he can reach the base safely before the other player catches the ball and tags him out.", "He is trying to get to the base before the other player catches the ball", "He is a baseball player."], "image": "train2014/COCO_train2014_000000092122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363902, "question_id": "Ej2pZFuyCovoqfnWbjs7kX", "question": "What does the child hope to catch in his glove?", "choices": ["fly", "foul", "home run", "tennis ball"], "correct_choice_idx": 2, "direct_answers": ["ball", "ball", "ball", "fly ball", "home run", "foul ball", "ball", "baseball", "baseball", "ball"], "difficult_direct_answer": false, "rationales": ["The kid is at a baseball game trying to catch a ball.", "The child is behind home plate, so any ball that would come his way would be known as a fly ball.", "The game makes him to hope to catch the home run."], "image": "val2014/COCO_val2014_000000363902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519407, "question_id": "Ej7Mz69FJbPqTfH9yz4LDh", "question": "Which of these vegetable is popular in Asia?", "choices": ["cabbage", "daikon", "onion", "broccoli"], "correct_choice_idx": 1, "direct_answers": ["white", "taro", "arracacha", "daikon", "turnips", "radish", "daikon radish", "radishes", "arracacha", "white radish"], "difficult_direct_answer": true, "rationales": ["Daikon is known to be used in many asian cuisines but not used at the same frequency on the other continents. things are used more frequently when they are popular.", "Traditionally these types of vegetables are popular in japan.", "The long white vegetables in the photo are called daikon"], "image": "train2014/COCO_train2014_000000519407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412094, "question_id": "EjCUjyjKxN3RefkdxdBZBE", "question": "What is the train near?", "choices": ["cow", "building", "wheelbarrow", "freeway"], "correct_choice_idx": 1, "direct_answers": ["train station", "signal", "tracks", "station", "building", "station platform", "train station", "station", "station", "buildings"], "difficult_direct_answer": false, "rationales": ["The train is near the building.", "The train is near a large building.", "It is near a place."], "image": "val2014/COCO_val2014_000000412094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54718, "question_id": "EjQsQxYRdKaJURHnbPwP7r", "question": "What is the name given to the type of sound the animals above produce?", "choices": ["neighs", "chatters", "trumpets", "roars"], "correct_choice_idx": 2, "direct_answers": ["honk", "trumpet", "elephant", "roo", "trumpet blast", "elephant", "trumpeting", "roar", "trumpeting", "trumpets"], "difficult_direct_answer": false, "rationales": ["The elephants lift their trunk and make a loud and distinctive noise to communicate. it is not unlike the sound of the musical instrument, the trumpet.", "Elephants make a trumpeting sound with their trunks.", "The elephants make a trumpeting sound with their trunks."], "image": "val2014/COCO_val2014_000000054718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231671, "question_id": "EjYxpTGofSGMt7uEqn2w6N", "question": "On what surface is this plate of pancakes placed upon?", "choices": ["kitchen counter", "dining table", "desk", "park bench"], "correct_choice_idx": 3, "direct_answers": ["picnic table", "wood", "wooden", "park bench", "paper plate", "picnic table", "wood table", "burger juice", "wood table", "picnic table"], "difficult_direct_answer": false, "rationales": ["It is placed on a picnic table, which is a table where food is eaten.", "There are pancakes on the bench.", "There are thick wooden slats laying next to each other."], "image": "train2014/COCO_train2014_000000231671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109, "question_id": "EjZ6X2HAnnSNXqAuwvqNp9", "question": "On what sort of license can people use this image?", "choices": ["creative commons", "public domain", "wtfpl", "copyright"], "correct_choice_idx": 0, "direct_answers": ["copyright", "copyright", "copyright", "photo", "drone", "creative commons", "copyright", "some rights", "noncommercial", "photo"], "difficult_direct_answer": false, "rationales": ["The license information says \"cc\" which stands for creative commons.", "This photo is for the public to use as the cc indicates at the bottom left.", "A photo depicts the logo for the photographer in the corner."], "image": "train2014/COCO_train2014_000000000109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203021, "question_id": "Ejgo2kCDDJpZMthtEsdVYm", "question": "From what source does this water emanate?", "choices": ["water balloon", "water bottle", "fire hydrant", "hose"], "correct_choice_idx": 2, "direct_answers": ["fire hydrant", "fire hydrant", "firehydrant", "fire hydrant", "hose", "fire hydrant", "fire hydrant", "fire hydrant", "hydrant", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["The water is coming from the fire hydrant.", "Water is spraying all over the place. the device where it comes from is used by fire fighters to put out fires.", "Lots water is sprayed out from the side of the street from a fire hydrant."], "image": "train2014/COCO_train2014_000000203021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81372, "question_id": "EjwZzaCz7vr55ezasRJCGU", "question": "What is the line of string meant to be?", "choices": ["fishing pole", "leash", "dental floss", "bandage"], "correct_choice_idx": 0, "direct_answers": ["fishing line", "fishing line", "fishing rod", "fishing pole", "fishing line", "fishing line", "fishing pole", "fishing line", "fishing line", "fishing string"], "difficult_direct_answer": false, "rationales": ["It is so the bears look like they are fishing", "The line of string in front of the bears is meant to look like fishing line and a fishing pole.", "A group of bears are posed together on a bench and the smallest bear is holding a rod with a string attached."], "image": "train2014/COCO_train2014_000000081372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196341, "question_id": "EkNSC2Cgp59mBmJkbr3NQz", "question": "What time is depicted in the photo?", "choices": ["815", "945", "215", "145"], "correct_choice_idx": 2, "direct_answers": ["two fifteen", "two thirteen", "two", "215 pm", "nine fifty", "nine am", "215", "215pm", "945", "one forty-five"], "difficult_direct_answer": true, "rationales": ["The hands depict 2:15 but in reverse since they are inside the clock.", "The hands on the clock indicate that it is 2:15 if looking at it from outside.", "The big hand is pointing to iii. the small hand is pointing to just past ii."], "image": "val2014/COCO_val2014_000000196341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493815, "question_id": "EkXuDKR4bJvfCA9CaLFDnS", "question": "What street sign is the man standing under?", "choices": ["ellis", "oswald", "canterbury", "earl"], "correct_choice_idx": 0, "direct_answers": ["ellis", "ellis", "ellis", "ellis", "ellis", "ellis", "ellis", "ellis", "ellis", "ellis"], "difficult_direct_answer": false, "rationales": ["The street is ellis.", "The man is standing under the sign \"ellis.\"", "He is standing right under the name of the street."], "image": "val2014/COCO_val2014_000000493815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229342, "question_id": "EknYzhVfYRjv2sUYKpWDfX", "question": "Where is the person most likely working in the living room while the pets lounge?", "choices": ["floor", "sofa", "desk", "table"], "correct_choice_idx": 0, "direct_answers": ["table", "floor", "floor", "floor", "floor", "ground", "floor", "on floor", "on carpet", "sofa"], "difficult_direct_answer": false, "rationales": ["The person has the laptop on the floor.", "It looks like the floor since the laptop is sitting close to the ground", "There is a laptop on the ground by the pets and therefore indicates that a person was using the laptop while on the floor."], "image": "val2014/COCO_val2014_000000229342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349559, "question_id": "EkvjruF5Mf33gbEMHzR3K5", "question": "Why is he crouched down so low?", "choices": ["riding skateboard", "is lost", "is resting", "holding bag"], "correct_choice_idx": 0, "direct_answers": ["luging", "falling", "sitting down", "riding skateboard", "speed", "performing maneuver", "riding skateboard", "skateboarding", "performing trick", "riding skateboard"], "difficult_direct_answer": false, "rationales": ["The other options don't match what he's doing in the picture.", "The man is crouched low because he is riding on a skateboard.", "A man is sitting on a skateboard on the ground."], "image": "val2014/COCO_val2014_000000349559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312772, "question_id": "EmC8YS9F3dCXPBD4matPKC", "question": "What type of weather is likely to occur next?", "choices": ["snow", "sun", "rain", "hurricane"], "correct_choice_idx": 0, "direct_answers": ["snow", "snowing", "snowstorm", "snow", "snow", "snow", "snow", "snow", "snow", "storm"], "difficult_direct_answer": false, "rationales": ["It is likely to snow because of the temperature and cloud coverage.", "There is already snow on the ground and from the look of the clouds in the offing, more may be coming down soon.", "There is snow on the ground already."], "image": "val2014/COCO_val2014_000000312772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111819, "question_id": "EmMs7YZXwYTywSiMsmeNNA", "question": "What does the elephant seek?", "choices": ["friendship", "food", "mate", "baby elephants"], "correct_choice_idx": 1, "direct_answers": ["peanuts", "food", "food", "food", "food", "food", "food", "food", "food", "food"], "difficult_direct_answer": false, "rationales": ["The elephant wants a treat.", "The elephant appears to be reaching in to some kind of bowl that might be containing answer a.", "The woman has treats in the bowl"], "image": "val2014/COCO_val2014_000000111819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73837, "question_id": "EmQqfERbQDae9sX3oPkSqw", "question": "What is coming out of the red and white tower?", "choices": ["water", "smoke", "people", "fire"], "correct_choice_idx": 3, "direct_answers": ["fire", "fire", "flame", "fire", "fire", "fire", "fire", "fire", "flame", "fire"], "difficult_direct_answer": false, "rationales": ["The fire is coming out.", "The tower looks like it is part of an oil refinery system. fire is commonly know to come out of these towers.", "There is a torch at the top of the tower."], "image": "train2014/COCO_train2014_000000073837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550702, "question_id": "EmUavU8iJMDszrNxtn4aRE", "question": "What type vegetable is the basis for the soup here?", "choices": ["beans", "basil", "pea", "tomato"], "correct_choice_idx": 3, "direct_answers": ["tomato", "carrot", "tomato", "tomato", "tomato", "tomato", "carrot", "tomato", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["Do to its rich red color that is brought by the tomato vegetable.", "The soup is a reddish color and smooth looking.", "The soup is red."], "image": "val2014/COCO_val2014_000000550702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211246, "question_id": "EmiV8bekr8Zhzui6VrXHPj", "question": "What word begins with the letter that is at the front of the top of the bus?", "choices": ["food", "koala", "lemon", "moon"], "correct_choice_idx": 1, "direct_answers": ["korea", "kite", "kite", "knead", "kite", "letter k", "koala", "kangaroo", "kangaroo", "kitchen"], "difficult_direct_answer": false, "rationales": ["A sign with a letter on it is in the corner of a bus window.", "The letter k is on the bus.", "The letter k appears at the front of the bus."], "image": "val2014/COCO_val2014_000000211246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205086, "question_id": "En9F3HrLn2evEMJA6pSCad", "question": "Why is the child wearing the yellow shirt crouching downward?", "choices": ["doing pushups", "lost keys", "having fit", "learning trick"], "correct_choice_idx": 0, "direct_answers": ["doing pushups", "touching ball", "push ups", "tired", "stretching", "kissing ball", "push up", "fell down", "playing", "exercise"], "difficult_direct_answer": true, "rationales": ["The child is on all fours up in the air.", "The child is trying to do a pushup with the help of the soccer ball.", "She is pushing herself toward the ground."], "image": "train2014/COCO_train2014_000000205086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332609, "question_id": "EnKq9GgJPjCCR2yAtg6Xh7", "question": "How much water does the plant shown here require?", "choices": ["none", "minimal", "100 gallons", "daily"], "correct_choice_idx": 2, "direct_answers": ["little", "little", "little", "little", "little", "little", "moderate", "little", "half cup", "100 gallons"], "difficult_direct_answer": false, "rationales": ["This looks like a plant that does not require a lot of water at all.", "100 gallons are required.", "Cactus grows in desert areas and need only small amounts of water."], "image": "val2014/COCO_val2014_000000332609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87249, "question_id": "EnUY4wNpZGWTCQytaAWQr5", "question": "What is beach sand made of?", "choices": ["calcium carbonate", "pebbles", "fish scales", "fish poop"], "correct_choice_idx": 0, "direct_answers": ["grains", "sand grains", "silica", "dirt", "silica", "minerals", "crushed rock", "calcium carbonate", "sand", "shells"], "difficult_direct_answer": true, "rationales": ["It is made up of calcium carbonate that's why it often looks white.", "There is a sandy beach with many loungers dotted side by side.", "The sand has calcium carbonate since that's what makes it soft."], "image": "train2014/COCO_train2014_000000087249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264805, "question_id": "EndNkSsDUvBKcyeWFpzo5a", "question": "Which setting on his camera phone will cause harm to his eyes when it is on?", "choices": ["flash", "lightbulb", "speaker", "camera"], "correct_choice_idx": 0, "direct_answers": ["flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash", "flash"], "difficult_direct_answer": false, "rationales": ["Flash can harm a person's eyes.", "Flash lights are super bright.", "The flash was used."], "image": "val2014/COCO_val2014_000000264805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172151, "question_id": "EninctgDhhKpock29kpEev", "question": "The sign is notifying drivers that what is closed between midnight and 7AM?", "choices": ["stores", "street", "lightbulbs", "sidewalks"], "correct_choice_idx": 1, "direct_answers": ["road", "street", "exit", "exit", "no exit", "street", "street", "road", "exit", "parking"], "difficult_direct_answer": false, "rationales": ["The sign is saying the street is closed since there is no exit.", "The sign is for the street.", "The street has no exit after midnight."], "image": "train2014/COCO_train2014_000000172151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167084, "question_id": "Enjx3EzTRUswZ6iM9SPERa", "question": "In steam locomotive which part blow smoke?", "choices": ["extinguisher", "chimney", "exhauster", "outlet"], "correct_choice_idx": 1, "direct_answers": ["engine", "smoke stack", "chimney", "smoke stack", "smokebox", "smokebox", "chimney", "chimney", "smokestack", "smoke box"], "difficult_direct_answer": false, "rationales": ["Answer a is universally used to describe an upward extending tube where smoke is expelled. in this image such a tube is visible with smoke coming out of it.", "This is the most obvious answer. it's also known as a smokestack.", "Of the given answers, only answer a makes sense and is consistent with how smoke is commonly expelled."], "image": "val2014/COCO_val2014_000000167084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93730, "question_id": "EnrMNwgYXxzm6b3PbPDRTs", "question": "Why is the man wearing a headband?", "choices": ["as punishment", "keep cold", "catch sweat", "dress code"], "correct_choice_idx": 2, "direct_answers": ["for sweat", "absorb sweat", "sweat protection", "absorb sweat", "tennis", "catch sweat", "sweat protection", "playing tennis", "to play", "improve vision"], "difficult_direct_answer": false, "rationales": ["The band is used to stop sweat.", "The woman is wearing it to stop sweating.", "The man is playing tennis and looks serious."], "image": "val2014/COCO_val2014_000000093730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100001, "question_id": "Enzo9JYxSnZ9nwjbuzATT6", "question": "What countries flag can be seen in the air?", "choices": ["france", "ireland", "scotland", "poland"], "correct_choice_idx": 2, "direct_answers": ["sweden", "scotland", "sweden", "norway", "scotland", "italy", "scotland", "scotland", "scotland", "scotland"], "difficult_direct_answer": false, "rationales": ["It is a blue flag with white crossing diagonal bars.", "The kite in the air is blue in color with a white diagonal cross in it. this is the flag of scotland.", "By the colors and design you can tell what country it is."], "image": "val2014/COCO_val2014_000000100001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242089, "question_id": "EoTk44R85VCs6mqZbXMceb", "question": "How many fire hydrants are in the picture?", "choices": ["nine", "ten", "11", "eight"], "correct_choice_idx": 2, "direct_answers": ["eleven", "eleven", "11", "eleven", "eleven", "eleven", "nine", "twelve", "ten", "eleven"], "difficult_direct_answer": false, "rationales": ["There are that many on the ground.", "There are eleven hydrants.", "There are ten hydrants standing and one additional hydrant on its side."], "image": "train2014/COCO_train2014_000000242089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22135, "question_id": "EpHGRseSfeBMTntNuEEr6S", "question": "What flag has the colors found on the bottom of the skateboard?", "choices": ["united kingdom", "canada", "united states", "guinea"], "correct_choice_idx": 3, "direct_answers": ["jamaica", "mexico", "jamaica", "jamaica", "jamaica", "jamaica", "bolivia", "mexico", "guinea", "jamaica"], "difficult_direct_answer": false, "rationales": ["The canadian flag is red and white, and the us and uk flags are red, white and blue. this leaves only one other option which has a red, yellow and green flag according to an internet search.", "That is one of the countries that have the red, yellow, and green colors.", "Those colors represent that country."], "image": "train2014/COCO_train2014_000000022135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99629, "question_id": "EpMgm9JE9tmGmZFdRbYeHV", "question": "To travel over the area behind this rider nearing what would be safest for the horse?", "choices": ["mid shore", "water", "rocks", "boulders"], "correct_choice_idx": 1, "direct_answers": ["near water", "land", "on sand", "water", "shoreline", "left", "water", "beach", "usehorse", "water"], "difficult_direct_answer": false, "rationales": ["The horse is walking near the water.", "The travel requires water.", "It would be safest for the horse to walk closer to the water."], "image": "train2014/COCO_train2014_000000099629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176038, "question_id": "EpSrjXRbG5NVShJvCmP2vu", "question": "Which beverage seen here has least calories?", "choices": ["wine", "soda", "water", "beer"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "beer", "ice tea", "water", "ice tea", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The drink on the table that is lowest in calories is the glass of water. water has no calories.", "Water is a beverage without any calories. among a group of other beverages, water would likely be the least caloric especially compared to the visible soda and beer.", "Water doesn't have any calories."], "image": "train2014/COCO_train2014_000000176038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254600, "question_id": "EpxonP5mFtYGYgKMwrfXSP", "question": "What is near the television?", "choices": ["chair", "keyboard", "playpen", "cat"], "correct_choice_idx": 1, "direct_answers": ["keyboard", "remote", "keyboard", "black keyboard", "remotes", "keyboard", "remote", "keyboard", "keyboard", "remote"], "difficult_direct_answer": false, "rationales": ["There is a keyboard to the left.", "None of the other options are on or near the stand.", "A device used for typing is to the left of the television."], "image": "train2014/COCO_train2014_000000254600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210791, "question_id": "Eqm6TSj9gddVTP9TTZCzBe", "question": "Who makes the bat?", "choices": ["easton", "koho", "nike", "spaulding"], "correct_choice_idx": 0, "direct_answers": ["easton", "easton", "easton", "easton", "easton", "easton", "easton", "easton", "easton", "easton"], "difficult_direct_answer": false, "rationales": ["The manufacturer's name is on the side of the bat.", "The company that makes the bat is named in big letters on the barrel of the bat.", "There are several brands of bats. metal bats are frequently manufactured by easton, a common brand."], "image": "train2014/COCO_train2014_000000210791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440762, "question_id": "EqyaCsjrvo7PkHMn3ojPuG", "question": "Why are there eyes on the screen?", "choices": ["people watching", "getting glasses", "playing solitaire", "customizing avatar"], "correct_choice_idx": 3, "direct_answers": ["character customization", "avatar", "avatar selection", "game", "laptop", "avatar face", "game", "customizing avatar", "choosing avatar", "customization"], "difficult_direct_answer": true, "rationales": ["They are creating a character they want to see in a game", "The person is making their logo for the game system.", "There is a cartoon person with eye choices on the screen"], "image": "train2014/COCO_train2014_000000440762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516775, "question_id": "ErGpbYB583FwfyfwBS5yrH", "question": "What sign should I follow if I have lost my wallet?", "choices": ["police", "boatramp", "city focus", "backpackers"], "correct_choice_idx": 0, "direct_answers": ["police", "police", "police", "police", "police", "police", "police sign", "police sign", "police", "police sign"], "difficult_direct_answer": false, "rationales": ["The sign for the police station is to the left. people who lose wallets may check to see if anyone has turned in a wallet that was found.", "If you have lost an item you would need to file a report so that if it is found it would be returned to you and this is the only service listed that could do that for you.", "When something is missing you should try going to the authorities."], "image": "val2014/COCO_val2014_000000516775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150594, "question_id": "ErQ3Jt9S4pNhfg5UdmV5Rb", "question": "What might a person make on the black and silver item on the back left?", "choices": ["jewellery", "clothing", "food", "music"], "correct_choice_idx": 2, "direct_answers": ["gm", "eggs", "cake", "food", "eggs", "eggs", "hot tea", "food", "food", "breakfast dinner"], "difficult_direct_answer": false, "rationales": ["There is a compartment for cooking hot edible items. the top can be used to fry edibles items.", "A stove and oven are the most commonly used kitchen appliances to cook food with.", "Food is stored in the machine to keep it cool."], "image": "train2014/COCO_train2014_000000150594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324336, "question_id": "ErWANAMijv6wBmsK8u84tP", "question": "What is the weather doing?", "choices": ["sunny", "cold", "raining", "snowing"], "correct_choice_idx": 2, "direct_answers": ["raining", "raining", "raining", "raining", "raining", "raining", "raining", "raining", "raining", "raining"], "difficult_direct_answer": false, "rationales": ["He is holding an umbrella and the ground is wet.", "The man is holding an umbrella because it is raining.", "The man has an umbrella to stay dry."], "image": "train2014/COCO_train2014_000000324336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426712, "question_id": "EreruUz5xSR8vdj22hqd2W", "question": "Apart from meat what else does the animal in the picture above provide?", "choices": ["wool", "eggs", "water", "none"], "correct_choice_idx": 0, "direct_answers": ["wool", "wool", "wool", "milk", "wool", "milk", "milk", "milk", "wool", "milk"], "difficult_direct_answer": false, "rationales": ["Sheep can provide wool.", "A sheep is in a arena with a woman.", "The animal can be sheared to produce wool which is used to make clothes."], "image": "train2014/COCO_train2014_000000426712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368595, "question_id": "ErshZkyAZwPMW5SezWavTo", "question": "What type of sign is shown in the image?", "choices": ["stop", "yield", "pedestrians", "train crossing"], "correct_choice_idx": 3, "direct_answers": ["street", "traffic", "train crossing", "traffic", "don't enter", "location traffic", "warning", "don't enter", "road", "no entry"], "difficult_direct_answer": false, "rationales": ["A sign with tracks and a line through it as well as lights are on a street.", "It is not safe for cars to be on tracks when trains come.", "The tracks with a red mark mean do not cross."], "image": "val2014/COCO_val2014_000000368595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170700, "question_id": "EsEn68xf5mT4BFhtCNuvXZ", "question": "Where does this scene take place?", "choices": ["cafe", "house", "condo", "club"], "correct_choice_idx": 0, "direct_answers": ["tropics", "cafe", "hostel", "restaurant exterior", "restaurant", "restaurant", "outdoors", "restaurant", "oceania", "outdoors"], "difficult_direct_answer": false, "rationales": ["You can tell by the tables, chairs and setting where they are at the moment.", "People are seen in the area having good meals.", "It is a small restaurant."], "image": "val2014/COCO_val2014_000000170700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549002, "question_id": "Estivp83eZhLsGfRKKn98v", "question": "What dessert is shown?", "choices": ["ice cream", "donut", "cupcake", "cannoli"], "correct_choice_idx": 1, "direct_answers": ["donuts", "doughnuts", "donuts", "donut", "donuts", "donut", "frosted cheerios", "doughnuts", "donut", "chocolate cheerios"], "difficult_direct_answer": false, "rationales": ["They are tiny little donut shaped.", "The are in a round shape with a hole missing in the middle.", "These have holes in the middle and the design and sprinkles of a donut."], "image": "train2014/COCO_train2014_000000549002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266151, "question_id": "EswAbWRVk7zqh9JaYfiYoy", "question": "What are the men near the spraying water doing?", "choices": ["photographing", "laughing", "repairing", "dancing"], "correct_choice_idx": 2, "direct_answers": ["fixing hydrant", "repairing leak", "cleaning", "walking", "working", "out fire", "carrying ladder", "controlling it", "repair", "repairing"], "difficult_direct_answer": true, "rationales": ["Men are near a leading water supply.", "They have hard hats and bright vests on and are working together doing the same task.", "The men near the spraying water are dressed like construction workers that are fixing the pipe."], "image": "val2014/COCO_val2014_000000266151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79544, "question_id": "EswWAVxRcQrn5Gqhq6zTbZ", "question": "What is the woman with the baby involved in?", "choices": ["burping", "banking", "travelling", "marketing"], "correct_choice_idx": 2, "direct_answers": ["travelling", "packing", "travelling", "unpacking", "vacationing", "seeing", "packing", "motherhood", "packing suitcases", "selling luggage"], "difficult_direct_answer": false, "rationales": ["The woman is likely involved in travel with her baby.", "It looks like she is going to travel, since she has some suitcases.", "The woman is traveling with her luggage."], "image": "val2014/COCO_val2014_000000079544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203450, "question_id": "EtBdGtnQkQPA3Lt5X4enNT", "question": "What is needed for this activity?", "choices": ["sun", "sand", "snow", "rain"], "correct_choice_idx": 2, "direct_answers": ["snow", "skis", "snow", "skis", "snow", "snow", "winter", "stamina", "ski poles", "skill"], "difficult_direct_answer": false, "rationales": ["Snow is needed for skiing.", "The men are skiing and that is the only way to ski.", "The other options aren't depicted and they wouldn't work in winter."], "image": "train2014/COCO_train2014_000000203450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399841, "question_id": "EtJEJBRXfB3o3uPRnuj7qK", "question": "Where is this scene most likely taking place?", "choices": ["date", "promotion", "holiday", "family event"], "correct_choice_idx": 0, "direct_answers": ["fancy restaurant", "restaurant", "date", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "fancy restaurant"], "difficult_direct_answer": false, "rationales": ["The meal and clothing are fancy so it's probably a date night.", "By the setting and how they are dresses you could tell it is a dating scene.", "The woman is on a date since she is so dressed up."], "image": "train2014/COCO_train2014_000000399841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149646, "question_id": "EtUdSdveqMBTTa7fGwA8iE", "question": "What causes the cats glowing eyes?", "choices": ["deep anger", "batteries", "light reflection", "demonic possession"], "correct_choice_idx": 2, "direct_answers": ["camera flash", "photo flash", "reflection", "reflection", "flash", "light reflection", "camera flash", "camera flash", "light refraction", "flash"], "difficult_direct_answer": false, "rationales": ["The camera flash is shown back in the cat's eyes.", "The cat's eyes are reflecting light.", "The camera light is reflecting in the cat's eyes."], "image": "val2014/COCO_val2014_000000149646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287920, "question_id": "EtsQgyCVWDiUbzQ6UTAp3F", "question": "What threw this man aloft?", "choices": ["bellhop", "string", "enemy", "mattress springs"], "correct_choice_idx": 3, "direct_answers": ["mattress springs", "elevation", "bed", "himself", "himself", "bed", "bed", "jumping", "camera", "bed"], "difficult_direct_answer": false, "rationales": ["The man is in the air above a bed. beds have springs.", "The man is above a bed. he jumped on it.", "The bed has springs."], "image": "train2014/COCO_train2014_000000287920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55575, "question_id": "EuEWXF5rYDyzMmc7XqhsbT", "question": "What is the animal alignment mean?", "choices": ["flood", "drought", "playing", "happy"], "correct_choice_idx": 0, "direct_answers": ["noah's ark", "noah arc", "noah's ark", "arc placement", "easily viewed", "flood", "noah's arc", "parade", "noah's ark", "noah ark"], "difficult_direct_answer": false, "rationales": ["The animals are preparing to get on noah's ark.", "It looks like noah's ark.", "The animals entered the ark two by two when the floods where warned about."], "image": "train2014/COCO_train2014_000000055575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56455, "question_id": "EuNSnjWAdR2884YAoLD69G", "question": "What is the purpose of the holes behind him?", "choices": ["ventilation", "seeing outside", "decoration", "hanging things"], "correct_choice_idx": 3, "direct_answers": ["pegboard", "air ventilation", "cork board", "air", "to hold", "eliminate noise", "hang objects", "vent", "peg board", "hanging things"], "difficult_direct_answer": true, "rationales": ["The holes allow things to be hung from them.", "The wall has bead boards for hooks.", "The holes could be used to ventilate air."], "image": "val2014/COCO_val2014_000000056455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186461, "question_id": "EuQGmcS5nXcy3vgJtm8pWF", "question": "Why does he have his arm out?", "choices": ["hold", "wave", "balance", "break fall"], "correct_choice_idx": 0, "direct_answers": ["holding kite", "item", "flying kite", "hold", "kite string", "flying kite", "fly kite", "holding kite", "control kite", "kite"], "difficult_direct_answer": false, "rationales": ["The boy has his arm out to hold onto the kite and not let it fly away.", "He is holding on to the string that is holding the kite", "The man is holding the kite."], "image": "train2014/COCO_train2014_000000186461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349676, "question_id": "EuSno6uMBxbcKK2mhVo8ga", "question": "What is the man here doing?", "choices": ["photographing", "yelling", "protesting", "eating"], "correct_choice_idx": 0, "direct_answers": ["taking picture", "photographing", "taking photo", "taking picture", "taking pictures", "photographing", "photographing giraffes", "taking pictures", "photographing giraffes", "photographing"], "difficult_direct_answer": false, "rationales": ["The man has his hands in the air, that is the best way to take pictures.", "The man is taking photos.", "His right hand is positioned to where he's using either a camera or smartphone to snap pictures while at the zoo."], "image": "val2014/COCO_val2014_000000349676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348134, "question_id": "EucKfPGtZKpjVHGqm6xYwo", "question": "What are the largest rectangular clothing item storage pieces called?", "choices": ["suitcases", "valises", "baskets", "trunks"], "correct_choice_idx": 3, "direct_answers": ["trunks", "trunks", "trunks", "trunks", "trunk", "trunk", "trunks", "trunk", "trunks", "luggage"], "difficult_direct_answer": false, "rationales": ["They are usually heavy and difficult to move because they only have small handles on the sides.", "The objects are of a shape and style consistent with answer a. the handle and lock placement matches.", "The large rectangular piece of storage is known as a trunk."], "image": "train2014/COCO_train2014_000000348134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11569, "question_id": "EuuaM74K7RAveBY5MLcDrF", "question": "What civil rights association in associated with the colour of this bus?", "choices": ["disabled rights", "lbgt rights", "black rights", "jewish rights"], "correct_choice_idx": 1, "direct_answers": ["women", "womens", "women", "lgbtq", "too vague", "womans", "breast cancer", "lbgt rights", "no clue", "susan komen"], "difficult_direct_answer": true, "rationales": ["Possibly lgbt rights.", "The bus is pink indicating women's rights.", "The bus is pink and pink is the color most frequently adopted by glbt activist organizations."], "image": "train2014/COCO_train2014_000000011569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405250, "question_id": "EvBuiqqscbd5FQmoZJxP74", "question": "What is being used for light on the desk?", "choices": ["pendant light", "strobe light", "lava lamp", "candle"], "correct_choice_idx": 2, "direct_answers": ["lava lamp", "lava lamp", "lava lamp", "lava lamp", "lamp", "lava lamp", "lamp", "lava lamp", "lava lamp", "lava lamp"], "difficult_direct_answer": false, "rationales": ["The lamp is powered by electricity and is emitting light. with this particular lamp, once it gets heated enough, the inner contents swish around based on what is comparatively cooler, creating a \"lava\" effect - that is what is happening inside the lamp here.", "The light on the desk is clearly visible and the size, shape and contains is consistent with answer a.", "It is a special decorative light that uses wax to create moving shapes within."], "image": "train2014/COCO_train2014_000000405250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115680, "question_id": "EvL4qz7vfMZunrj5cWUPs5", "question": "What animals are present?", "choices": ["dog", "giraffe", "deer", "bull"], "correct_choice_idx": 3, "direct_answers": ["buffalo", "bulls", "american bison", "texas cattle", "bull", "bulls", "long horns", "steer", "cows", "bulls"], "difficult_direct_answer": false, "rationales": ["They eat grass and have hooves and long horns.", "There are bulls in the picture.", "The animal is a bull."], "image": "val2014/COCO_val2014_000000115680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344065, "question_id": "EvTWLLXq98giivcQyHsa6f", "question": "Why are there so many fruits?", "choices": ["fell there", "hungry man", "showing off", "for sale"], "correct_choice_idx": 3, "direct_answers": ["fruit store", "food stand", "market", "market", "fruit store", "to sell", "for sell", "sales presentation", "selling", "for sale"], "difficult_direct_answer": false, "rationales": ["They are selling them to customers.", "The people are selling the fruit.", "This is a market selling fruit and various other items."], "image": "train2014/COCO_train2014_000000344065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342650, "question_id": "EvWJPYRAZjQRFYPZMCm3bR", "question": "What company is known for making the item that is on the counter?", "choices": ["domino's", "mcdonald's", "nathan's", "gorton's"], "correct_choice_idx": 0, "direct_answers": ["pizza company", "domino's", "papa johns", "digorno", "digiorno", "pizza hut", "pizza hut", "pizza hut", "pizza in", "pizza"], "difficult_direct_answer": false, "rationales": ["The company is domino's.", "Dominos is known for making pizzas.", "Domino's is a pizza chain."], "image": "val2014/COCO_val2014_000000342650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435312, "question_id": "EvWLUAM8cNre2Yk7WJJWR6", "question": "Why is she pointing her tool at the traffic light?", "choices": ["fixing it", "hacking it", "change light", "take picture"], "correct_choice_idx": 3, "direct_answers": ["picture", "taking picture", "take picture", "for photography", "taking picture", "taking picture", "photographing", "taking picture", "take picture", "take photo"], "difficult_direct_answer": false, "rationales": ["The woman is taking a photo.", "The woman is holding and pointing a camera used to take photos.", "The tool she is holding is a camera, and she is looking through it."], "image": "val2014/COCO_val2014_000000435312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138814, "question_id": "EvdT5r2cWweewihMBdi4za", "question": "What is the man in white attempting to do?", "choices": ["jumping jacks", "throw ball", "serve", "sit"], "correct_choice_idx": 2, "direct_answers": ["serve", "hit ball", "serve", "hit ball", "hit", "hit ball", "serve", "play tennis", "tennis", "serve tennis"], "difficult_direct_answer": false, "rationales": ["The man in white is swinging the tennis racquet and attempting to serve the ball over the net.", "The man is playing tennis. he already threw the ball and is about to hit it.", "The man's stance is one that is similar to a tennis player who has thrown the ball up in the air and is ready to hit it, thus serving the ball."], "image": "val2014/COCO_val2014_000000138814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202843, "question_id": "EvigKWHVHoP3vkvKueRnmH", "question": "Why are so many people wearing orange?", "choices": ["supporting team", "required uniform", "visibility", "distraction"], "correct_choice_idx": 0, "direct_answers": ["giants fans", "fans", "funs", "fans supporting", "rain", "team color", "favorite team", "supporting team", "support team", "support team"], "difficult_direct_answer": true, "rationales": ["The team's uniforms are visible and the team itself can be inferred. fans wear colors of a team to show their pride.", "The people are wearing orange because they are fans of the orange team.", "The giants are playing at home. the fans are not required to wear a specific color."], "image": "val2014/COCO_val2014_000000202843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8657, "question_id": "EvzQ4sbgpFTJcy9LtvhAfT", "question": "What are these people called?", "choices": ["officers", "conductors", "staff", "passengers"], "correct_choice_idx": 3, "direct_answers": ["passengers", "passengers", "passengers", "passengers", "passengers", "passengers", "passengers", "passengers", "passengers", "passengers"], "difficult_direct_answer": false, "rationales": ["Because they are on board a transportation vehicle.", "The people are riders on a subway train car that is what the people who are not driving the trains are called.", "These people are passengers since they're riding a subway."], "image": "train2014/COCO_train2014_000000008657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220819, "question_id": "Ew2npM7pi8gNgVaZeWWfut", "question": "What element is needed for the contents of the pots to extend their lives?", "choices": ["cement", "water", "milk", "juice"], "correct_choice_idx": 1, "direct_answers": ["water", "water", "water sun", "water", "water", "water", "dirt", "water", "water", "sunlight"], "difficult_direct_answer": false, "rationales": ["Plants need water to live.", "Plants need to be watered to live and grow", "All life needs liquid to survive."], "image": "val2014/COCO_val2014_000000220819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223726, "question_id": "EwJrVMD5Ugh4dNrHM37aa6", "question": "Who is the second man from the left?", "choices": ["barak obama", "ronald reagan", "ben franklin", "george bush"], "correct_choice_idx": 0, "direct_answers": ["bill clinton", "obama", "barrack obama", "obama", "barack obama", "obama", "obama", "barak obama", "obama", "obama"], "difficult_direct_answer": false, "rationales": ["He was the current president when the photo was taken. the other men were past presidents.", "Barak obama is the only african american president. the man second from the left is african american.", "He is the first black and white president."], "image": "train2014/COCO_train2014_000000223726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558167, "question_id": "EwvfqQfbqSbjHqrSXskCuA", "question": "What tells you that this is warm year round?", "choices": ["palm tree", "stoplight", "short sleeves", "beach"], "correct_choice_idx": 3, "direct_answers": ["sunlight", "sunshine", "light clothing", "palm tree", "palm tree", "palm trees", "palm trees", "its sunny", "beach", "shirts"], "difficult_direct_answer": false, "rationales": ["The area is a beach town.", "Because of the sunny beach and the presence of the bright sun.", "There is a beach near the water."], "image": "train2014/COCO_train2014_000000558167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58351, "question_id": "ExTRhjLkrfV973v7hNZ6Xq", "question": "Pet care Helpline number?", "choices": ["966", "952", "822", "911"], "correct_choice_idx": 1, "direct_answers": ["952", "952-help-pet", "nowhere", "one", "no clue", "google", "not here", "911", "466-4325", "unknown"], "difficult_direct_answer": true, "rationales": ["This is common knowledge that 952 is the pet helpline number.", "Not sure what the number for this would be. i don't see a number anywhere or any clue.", "The helpline is posted on the cage."], "image": "train2014/COCO_train2014_000000058351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482127, "question_id": "ExY8xNPViRNNKv5rVQBgBD", "question": "What was used to make the yellow coloring on the page?", "choices": ["pencil", "highlighter", "paint", "crayon"], "correct_choice_idx": 1, "direct_answers": ["highlighter", "highlighter", "highlighter", "highlighter", "highlighter", "marker", "highlighter", "high lighter", "highlighter", "highlighter"], "difficult_direct_answer": false, "rationales": ["The yellow markings came from a highlighter.", "Only a highlighter would color the page while still allow you to see the text.", "These are markers used to cover things on paper but not make them hidden"], "image": "train2014/COCO_train2014_000000482127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69081, "question_id": "EyEdBD9SYGYXfKT9vmoPcy", "question": "What important property does this blue outfit have?", "choices": ["breathable", "washable", "lightweight", "waterproof"], "correct_choice_idx": 0, "direct_answers": ["aerodynamic", "breathable", "one piece", "onesie", "glitter", "painting", "fullbody", "helmet", "see through", "reflective"], "difficult_direct_answer": true, "rationales": ["Since the person's face is covered they need breathable material in order to wear the costume very long.", "The fabric is breathable.", "For the person to survive they will need to be able to breath."], "image": "train2014/COCO_train2014_000000069081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336166, "question_id": "EyEr3BTNdtJd5YQuEtwgN4", "question": "What does the boarder here wear near their hands?", "choices": ["wrist guards", "helmet", "chin guard", "knee pads"], "correct_choice_idx": 0, "direct_answers": ["wrist guard", "braces", "wraps", "wrist guards", "wrist braces", "straps", "wrist guards", "pads", "wrist protectors", "wrist protectors"], "difficult_direct_answer": false, "rationales": ["He has on wrist protection to keep his wrist from getting hurt.", "To prevent severe arm injuries that may result from the activity he does.", "The boarder has wrist guards to keep himself safe."], "image": "val2014/COCO_val2014_000000336166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327961, "question_id": "EyUhPMgxmbqnydq8PK6SN9", "question": "What are the horses being forced to do?", "choices": ["carry luggage", "eat grass", "free roam", "drink water"], "correct_choice_idx": 0, "direct_answers": ["carry items", "carry", "pack supplies", "carry items", "transport", "pack animals", "pack animals", "carry luggage", "carry goods", "carry goods"], "difficult_direct_answer": false, "rationales": ["A pile of luggage is in the grass near a horse that is carrying some.", "They're also known as pack animals.", "They have large bags placed on their backs in order to be transported."], "image": "val2014/COCO_val2014_000000327961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115003, "question_id": "EyVMz6eZhkLBpfD76PkX74", "question": "Why is a chain hooked to the fire hydrant?", "choices": ["retaining cover", "dog leashing", "leash left", "display"], "correct_choice_idx": 0, "direct_answers": ["retaining cover", "protect parts", "prevent tampering", "hold cap", "cap-loss prevention", "prevent theft", "prevent stealing", "secure it", "safety", "secure cap"], "difficult_direct_answer": true, "rationales": ["The chain is covering the lid so it won't be open by vandals.", "A chain is hooked on the fire hydrant to retain its cover.", "To prevent theft or loss of the cover."], "image": "train2014/COCO_train2014_000000115003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155125, "question_id": "EykKCz8cMvkiMEtMvjrJgL", "question": "What type of diet does the person shown have?", "choices": ["atkins", "omnivorous", "vegan", "vegetarian"], "correct_choice_idx": 1, "direct_answers": ["omnivorous", "no idea", "pizza", "no diet", "omnivore", "average", "pizza", "meat", "pizza", "junk food"], "difficult_direct_answer": false, "rationales": ["The pizza has meat and veggie.", "The person is eating pizza, so he does not have an atkins diet. the pizza has meat, so his diet is not vegetarian or vegan.", "The person is about to eat a slice of pizza topped with both vegetables and meat (chicken), so the person would seem to be an omnivore--one who eats both plant-based and animal-based food--and therefore he's someone who has an omnivorous diet."], "image": "val2014/COCO_val2014_000000155125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349437, "question_id": "Eyv4hf6hndNnhXsADbUhV8", "question": "The costume of the persons in the image called as?", "choices": ["crafty", "superhero", "cupcake", "cowboy"], "correct_choice_idx": 3, "direct_answers": ["cowboy", "cowboys", "charro", "cowboy", "biker", "cowboy indians", "cowboy", "cowboys", "tradition", "breechcloths"], "difficult_direct_answer": false, "rationales": ["Cowboy hats are being worn.", "He has a cowboy hat on.", "The people on the bike are wearing leather jackets and cowboy hats."], "image": "val2014/COCO_val2014_000000349437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359156, "question_id": "EyyUmSPQwdRLLPxTYnihsj", "question": "What name is associated with the clock tower?", "choices": ["ben", "chad", "jim", "george"], "correct_choice_idx": 0, "direct_answers": ["big ben", "building", "ben", "ben", "ben", "big ben", "clocktower", "big ben", "big ben", "big ben"], "difficult_direct_answer": false, "rationales": ["This is a famous clock in london", "Big ben is a famous tower in london.", "The name is big ben."], "image": "train2014/COCO_train2014_000000359156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550540, "question_id": "EzD8FxzBQYYSykeKtvk78m", "question": "How many dogs could he play this game with simultaneously?", "choices": ["four", "one", "six", "three"], "correct_choice_idx": 0, "direct_answers": ["one", "four", "four", "four", "four", "four", "four", "four", "four", "one"], "difficult_direct_answer": false, "rationales": ["The man has four frisbees so if each dog had one there could be four dogs.", "Four dogs could play.", "Four dogs could be played with since he has four frisbees."], "image": "train2014/COCO_train2014_000000550540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479950, "question_id": "EzLQD7n34YTrWNRBotQA2m", "question": "What setting is shown here?", "choices": ["game room", "library", "office desk", "conference room"], "correct_choice_idx": 3, "direct_answers": ["work conference", "conference room", "meeting", "meeting", "work", "office", "conference room", "office", "conference room", "office meeting"], "difficult_direct_answer": false, "rationales": ["They are in a meeting.", "This is a room where people can go to have a work meeting about whatever topics they need to discuss with a bigger group of people.", "Office workers are gathered at a large table with work in front of them."], "image": "val2014/COCO_val2014_000000479950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359825, "question_id": "Ezo3zHPdsVn3vBKGnEggAK", "question": "What zone is this area likely to be?", "choices": ["shopping", "tourist", "residential", "business"], "correct_choice_idx": 1, "direct_answers": ["tourist", "horse area", "downtown", "loading", "horse", "tourist", "downtown", "usa", "crosswalk", "town square"], "difficult_direct_answer": false, "rationales": ["This type of vehicle serves no effective purpose for modern transportation but still exists for the purposes of answer a.", "The horse is seen to be taking around the tourist.", "Most of the buildings are made for tourists."], "image": "train2014/COCO_train2014_000000359825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151371, "question_id": "EzvCzya33LAhn5rx4YyzYP", "question": "Who is the manufacturer of the large truck?", "choices": ["volvo", "peterbilt", "mack", "daimler"], "correct_choice_idx": 2, "direct_answers": ["mack", "mack", "mack", "mack", "mack", "mack", "mack", "mack", "mack", "mack"], "difficult_direct_answer": false, "rationales": ["Mack's logo is on the truck.", "The manufacturer is mac.", "The name of the manufacturer is on the front of the truck in silver."], "image": "train2014/COCO_train2014_000000151371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460821, "question_id": "EzyAg34aSoXYW5StpLFSww", "question": "How will this man descend this place?", "choices": ["ski lift", "taxi", "via ski", "uber"], "correct_choice_idx": 2, "direct_answers": ["skiis", "ski", "climbing", "ski", "skis", "ski", "via ski", "bye ski", "skis", "with skiis"], "difficult_direct_answer": false, "rationales": ["The man is in snow and he is holding skis.", "The man has skiis.", "He is high on a snowy mountain and he has skis with him"], "image": "train2014/COCO_train2014_000000460821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514559, "question_id": "F2f4WphGqe4wy7DJHPjfgd", "question": "What are the couple likely standing on?", "choices": ["lily pads", "rocks", "bridge", "fish"], "correct_choice_idx": 1, "direct_answers": ["rocks", "rocks", "river", "rocks", "rocks", "river", "creek", "rocks", "stones", "rocks"], "difficult_direct_answer": false, "rationales": ["The couple is standing on rocks.", "In a shallow body of water, it's common practice--if they're spaced appropriately and sufficient enough in number--to use rocks as a way to get across.", "The couple must not be directly in the water and rocks are around them."], "image": "train2014/COCO_train2014_000000514559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430215, "question_id": "F2ocBEBSNWYusZweAphzTx", "question": "What food on the table contains the highest amount of fat?", "choices": ["bacon", "rice", "pancake", "egg"], "correct_choice_idx": 0, "direct_answers": ["bacon", "pancake", "bacon", "bacon", "potatoes", "bacon", "meat", "bacon", "bacon", "bacon"], "difficult_direct_answer": false, "rationales": ["Meat has more fat than the other food items.", "Bacon is high in fat.", "Any food from a pig has a high amount of fat content."], "image": "train2014/COCO_train2014_000000430215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110460, "question_id": "F33KyygWfMoo5AD3icYCE9", "question": "Where is the water coming from?", "choices": ["natural spring", "bucket", "hose", "fire hydrant"], "correct_choice_idx": 3, "direct_answers": ["hydrant", "fire hydrant", "hydrant", "hydrant", "hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "hydrant", "hydrant"], "difficult_direct_answer": false, "rationales": ["You can see the silhouette", "The thing is in the ground. the water is spraying in all directions.", "It is a fixture with a water supply on a sidewalk in a city"], "image": "val2014/COCO_val2014_000000110460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530896, "question_id": "F3CU6pPwtVckzJJn52qegk", "question": "The green handlebars in the bottom left belong to what?", "choices": ["bicycle", "wheelbarrow", "walker", "shopping cart"], "correct_choice_idx": 0, "direct_answers": ["cycle handlebars", "bicycle", "one man", "bike", "bike", "bike", "bike owner", "10 speed", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["The handles belong to a bicycle that they will ride on.", "They are bicycle handlebars.", "That is the design of an adult bicycle handlebars. there is no cart in front of the handlebars so it's not a shopping cart or wheelbarrow and there are no walker legs present."], "image": "train2014/COCO_train2014_000000530896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311378, "question_id": "F3PMSfiSEyaxy2kPBKajZD", "question": "What type of crust is this called?", "choices": ["cheese", "grilled", "thin", "thick"], "correct_choice_idx": 3, "direct_answers": ["gram cracker", "pizza crust", "deep dish", "thick", "thick", "deep dish", "shortbread crust", "pie crust", "extra-thick", "thick crust"], "difficult_direct_answer": false, "rationales": ["You can tell by its thickness what type of crust it is.", "This is a thick crust on the pizza.", "The crust is tall."], "image": "val2014/COCO_val2014_000000311378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262658, "question_id": "F3W8FMtjMyjbQim4YRstg6", "question": "Where is the couple most probably dining?", "choices": ["home", "restaurant", "park", "boat"], "correct_choice_idx": 3, "direct_answers": ["riverboat", "boat", "boating", "restaurant", "boat", "boat", "riverboat", "city", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["There is a body of water near the people, so they are likely on a boat.", "They are by water.", "There is water in the background and it is in motion."], "image": "val2014/COCO_val2014_000000262658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465137, "question_id": "F3XCXUg2CecJVAvSktYjxs", "question": "What event is being celebrated in the living room?", "choices": ["halloween", "new year's", "birthday", "christmas"], "correct_choice_idx": 2, "direct_answers": ["wii playoff", "wii", "birthday", "nintendo wii", "game night", "birthday", "birthday", "unknown", "birthday", "wii game"], "difficult_direct_answer": false, "rationales": ["There are balloons by a fireplace.", "There are balloons in front of the fireplace.", "There are balloons."], "image": "train2014/COCO_train2014_000000465137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567973, "question_id": "F3bUhWr9XbrcVVkZk8DnGM", "question": "What commonly goes on the long light yellow food here?", "choices": ["soy sauce", "wasabi", "ketchup", "oyster sauce"], "correct_choice_idx": 2, "direct_answers": ["ketchup", "meal", "ketchup", "ketchup", "mustard", "ketchup", "ketchup", "ketchup", "mustard", "ketchup"], "difficult_direct_answer": false, "rationales": ["The food is french fries and many people like to dip them in ketchup.", "The ketchup goes with the fries.", "Ketchup is the common condiment for fries."], "image": "train2014/COCO_train2014_000000567973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195275, "question_id": "F3pniTdzeLVnEe377MPaFL", "question": "Why does he have his forearms wrapped?", "choices": ["is injured", "to strengthen", "keep on", "showing off"], "correct_choice_idx": 1, "direct_answers": ["wipe sweat", "bands", "sweat bands", "to play", "to strengthen", "sweat", "tape", "sweat", "sweat", "sweat"], "difficult_direct_answer": false, "rationales": ["He wants to be more powerful", "He might have weak forearms that need bandaging.", "The wraps are used for protection after an injury."], "image": "val2014/COCO_val2014_000000195275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156296, "question_id": "F3qrZZ32BNRaxaLdkqJs42", "question": "Why are they walking like this?", "choices": ["are hungry", "heavy umbrellas", "relaxing", "is raining"], "correct_choice_idx": 3, "direct_answers": ["is raining", "raining", "educational depression", "it's raining", "rain", "raining", "raining", "rain", "rain cover", "raining"], "difficult_direct_answer": false, "rationales": ["The people are walking with umbrellas over their heads and it is clearly wet all around them. these conditions imply it is raining.", "They are holding objects over their heads trying to stay dry.", "There is water falling from the sky"], "image": "train2014/COCO_train2014_000000156296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78477, "question_id": "F47J5r8oDz2mBpKxSiby3K", "question": "Where do lemons originally come from?", "choices": ["wales", "unknown", "ethiopia", "france"], "correct_choice_idx": 1, "direct_answers": ["trees", "northwestern india", "tree", "india", "unknown", "trees", "northwest india", "northwestern india", "china", "india"], "difficult_direct_answer": false, "rationales": ["They come from ethipoia.", "No one knows the origin of lemons.", "Accounts regarding the origins of lemons are conflicting."], "image": "train2014/COCO_train2014_000000078477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377496, "question_id": "F4FBMqJvkvjhBLuanmfCEy", "question": "What does the woman use her phone for?", "choices": ["mirror", "call 911", "calls", "weight reduction"], "correct_choice_idx": 0, "direct_answers": ["taking photos", "taking pictures", "looking", "text", "communication", "friends", "mirror", "share pictures", "watching videos", "read texts"], "difficult_direct_answer": true, "rationales": ["The woman is looking at herself in her phone.", "The woman is looking into the phone to take a selfie.", "It's a cell phone and she might be texting"], "image": "train2014/COCO_train2014_000000377496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358293, "question_id": "F4LwqNDTKYBbhsH97RqvVz", "question": "What kind of a company is the company whose name appears on the left side of the wall?", "choices": ["dessert", "restaurant", "bank", "computer"], "correct_choice_idx": 2, "direct_answers": ["bank", "bank", "financial", "jpmorgan", "jpmorgan", "banking", "financial company", "bank", "financial institution", "banking"], "difficult_direct_answer": false, "rationales": ["The name on the blue banner is the famous bank jp morgan.", "Jp morgan is a financial institution.", "They offer financial services"], "image": "val2014/COCO_val2014_000000358293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13983, "question_id": "F4TFUjUi3yoAWTj3JpKh67", "question": "What is the name of the pizza shop?", "choices": ["pizza", "med", "slice", "mistic"], "correct_choice_idx": 1, "direct_answers": ["med pizza", "med pizza", "med", "med pizza", "med", "med pizza", "med pizza", "med pizza", "med", "med pizza"], "difficult_direct_answer": false, "rationales": ["The pizza shop has the word \"med\" on it.", "The place is called med pizza.", "The name is on the neon sign."], "image": "train2014/COCO_train2014_000000013983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106375, "question_id": "F4biHwwoJJvaEikaU3TSwS", "question": "Why does he have a bike on the back of his vehicle?", "choices": ["transporting it", "stealing it", "found it", "selling it"], "correct_choice_idx": 0, "direct_answers": ["transport", "transporting it", "broken", "going bicycling", "transporting it", "towing it", "stupid", "broken", "broken", "hauling"], "difficult_direct_answer": false, "rationales": ["The bike looks like it is to be used in professional competition, and needs to be transported to the starting line, as he is not the one who is about to be using it.", "Putting the bike on the back of his vehicle makes it convenient to take it somewhere.", "He is moving the bike to another spot to ride it."], "image": "val2014/COCO_val2014_000000106375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560898, "question_id": "F4kqdvwC7QPnkdmjRBy3Lr", "question": "What larger makeup group owns this company?", "choices": ["maybelline", "cover girl", "l'oreal", "lancome"], "correct_choice_idx": 2, "direct_answers": ["sos", "sos", "la roche-posay", "l'oreal", "la roche", "lancome", "laroche", "sos", "laroche-posay", "sos"], "difficult_direct_answer": false, "rationales": ["Logos are on a group of umbrellas.", "L'oreal owns this skincare line.", "It's what google confirmed for me."], "image": "train2014/COCO_train2014_000000560898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85706, "question_id": "F4wepozt4G7hJAiWQZCBLw", "question": "What sort of skill is required at the slope in the foreground here?", "choices": ["olympic", "professional", "beginner", "hot dog"], "correct_choice_idx": 2, "direct_answers": ["skiing", "balance", "beginner", "skiing", "balance", "beginning", "ski knowledge", "snow skiing", "skiing", "beginner skill"], "difficult_direct_answer": false, "rationales": ["There is a child skiing down the foreground slope.", "The mountain level is very flat for starters.", "It looks like a bunny slope."], "image": "train2014/COCO_train2014_000000085706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99736, "question_id": "F4yVYQd5asEi5TFVManzY6", "question": "What is real among those things?", "choices": ["bra", "broccoli", "bed", "pearls"], "correct_choice_idx": 1, "direct_answers": ["broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli"], "difficult_direct_answer": false, "rationales": ["The real item is a green vegetable with little soft balls at the ends of the branches. the entire vegetable, as it is in this photo and how it is eaten, is a series of branches.", "The broccoli is real.", "The broccoli is the real thing on the doll furniture."], "image": "val2014/COCO_val2014_000000099736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455672, "question_id": "F5GB9W2RWhacmsMhLRaYiX", "question": "What does the small girl wear around her neck?", "choices": ["lift pass", "her name", "wallet", "fanny pack"], "correct_choice_idx": 0, "direct_answers": ["lift pass", "tag", "scarf", "badge", "lanyard", "lanyard", "scarf", "hat", "ski pass", "tag"], "difficult_direct_answer": false, "rationales": ["A girl who is skiing has a card on a string around her neck.", "A girl is on skis with a white square object attached to her jacket. lift tickets are attached to the jackets of skiers.", "The girl has a lift pass around her neck."], "image": "train2014/COCO_train2014_000000455672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29501, "question_id": "F5KyVDrzD4oQ7dau8BoD22", "question": "Why are the boats without a driver?", "choices": ["too many", "off hours", "broken", "weather"], "correct_choice_idx": 1, "direct_answers": ["not moving", "docked", "lake", "off hours", "at dock", "docked", "docked", "docked", "it's late", "docking"], "difficult_direct_answer": false, "rationales": ["Boats are parked at a dock and it is dark outside.", "They are not in use and don't require a driver.", "The boats are off hours."], "image": "train2014/COCO_train2014_000000029501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290174, "question_id": "F5MAwmrxTdJDYoTqeB3nAP", "question": "Why is the man wearing a reflective jacket?", "choices": ["visibility", "on team", "fashion", "received free"], "correct_choice_idx": 0, "direct_answers": ["signal presence", "for safety", "visibility", "safety", "high visibility", "visibility", "visibility", "safety", "be seen", "safety visibility"], "difficult_direct_answer": false, "rationales": ["The yellow jacket allows people to see him at night so he can be near traffic.", "The reflective surface makes this apparent. it's also legally required in many areas.", "It is a safety jacket."], "image": "train2014/COCO_train2014_000000290174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559483, "question_id": "F5Y4uDSfqAYvaWeTqBvsMM", "question": "Why is the man in the yellow shirt on the mountain?", "choices": ["to hike", "to eat", "to ski", "to sleep"], "correct_choice_idx": 2, "direct_answers": ["guide", "skiing", "player", "to ski", "to ski", "climbing", "gathering skis", "skiing", "to skii", "to ski"], "difficult_direct_answer": false, "rationales": ["The man is is going to be skiing at some point.", "He is placing his skis down on the snow and he has ski boots on", "He is placing them on the snow"], "image": "val2014/COCO_val2014_000000559483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562030, "question_id": "F5ZQhCvAgfoKUGyeC2W7Y7", "question": "What is the blue handled object used to do?", "choices": ["cut", "stir", "tenderize", "draw"], "correct_choice_idx": 0, "direct_answers": ["pruning", "basin", "cut", "cut", "cut", "cut", "cut", "cut", "cut", "cutting"], "difficult_direct_answer": false, "rationales": ["The scissors cut things.", "These are scissors", "These are scissors."], "image": "val2014/COCO_val2014_000000562030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47735, "question_id": "F5eKCLD7bdo4BG7N8kCaMr", "question": "How many people in this picture need to see an optometrist regularly?", "choices": ["five", "four", "ten", "thirteen"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "five", "four", "five", "eight", "one", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four people.", "There are four people in the picture wearing glasses.", "There's only a few people that wear eye glasses."], "image": "train2014/COCO_train2014_000000047735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72002, "question_id": "F5jgKcgLn5RznMDs5NTvSP", "question": "The man in blue is in what?", "choices": ["trouble", "church", "motion", "dmv"], "correct_choice_idx": 2, "direct_answers": ["track suit", "jogging suit", "track suit", "catching frisbee", "uniform", "college", "running", "dancing", "running", "motion"], "difficult_direct_answer": false, "rationales": ["The man is running.", "The man in blue is moving toward the frisbee in order to catch it and then proceeds in the same direction.", "He is walking around getting ready to throw a frisbee"], "image": "train2014/COCO_train2014_000000072002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429672, "question_id": "F64uMhvmk3viKm7Fg6rS8Q", "question": "What kind of fuel did this company's vehicles first use?", "choices": ["diesel", "coal", "electric", "ligroin"], "correct_choice_idx": 3, "direct_answers": ["diesel", "diesel", "diesel", "diesel", "diesel", "ligroin", "diesel", "diesel", "diesel", "diesel"], "difficult_direct_answer": false, "rationales": ["The company's vehicles likely used diesel.", "They use diesel.", "This is an actros truck which runs on ligroin fuel."], "image": "train2014/COCO_train2014_000000429672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501493, "question_id": "F6UagkjABFXPJKQftb7F5d", "question": "What is the person in the orange cap doing?", "choices": ["scoring game", "yelling insults", "spotting cheaters", "establishing rhythm"], "correct_choice_idx": 3, "direct_answers": ["directing", "coaching", "giving orders", "establishing rhythm", "steering", "coxswain", "calling strokes", "sitting", "yelling", "leading"], "difficult_direct_answer": true, "rationales": ["The person sitting at the front of the boat is playing drums to establish rhythm.", "The person in orange is a coxswain, not a referee or scorer.", "The people are engaging in a rowing competition. the person in orange is not rowing, but is at the front of the boat serving a role known to be for the purposes of answer a in this setting."], "image": "train2014/COCO_train2014_000000501493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116004, "question_id": "F6WfAnUAwFHqLPyXXn6B6m", "question": "Why are people in the middle of the street?", "choices": ["parade passing", "marathon", "mass protest", "fire sale"], "correct_choice_idx": 0, "direct_answers": ["parade passing", "parade", "parade", "walking", "avoiding fire", "watching accident", "festival", "parade", "striking", "parade"], "difficult_direct_answer": false, "rationales": ["The people are lining the streets to watch the celebratory procession that is taking place there. santa can be seen riding on the top of the truck in the procession.", "The people are gathered and watching a parade.", "There's a parade going."], "image": "train2014/COCO_train2014_000000116004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255459, "question_id": "F6izvZ2kif2VikCeSCP5nu", "question": "Which team does the player in blue play for?", "choices": ["twins", "orioles", "red sox", "yankees"], "correct_choice_idx": 0, "direct_answers": ["at bat", "twins", "cleveland indians", "mets", "minnesota twins", "tennis", "twins", "minnesota twins", "boston", "opposite"], "difficult_direct_answer": false, "rationales": ["You can tell what is printed on the jersey as to what team it is.", "The player in blue must play for the twins since he's wearing their color.", "The minnesota twins wear a blue jersey."], "image": "train2014/COCO_train2014_000000255459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235090, "question_id": "F6jvdhxA5hwDtpNyk2vUxL", "question": "Where are these people going?", "choices": ["grocery store", "beach", "park", "pool"], "correct_choice_idx": 1, "direct_answers": ["beach", "beach", "going surfing", "beach", "surfing", "hotel", "surfing", "beach", "surfing", "beach"], "difficult_direct_answer": false, "rationales": ["The people are going to the beach.", "The people have surfboards.", "They are wearing bathing suits and carrying surf boards which can only be used at the beach."], "image": "train2014/COCO_train2014_000000235090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337194, "question_id": "F6ymEFuN33aAbdzmM5amiv", "question": "What depth of water do these birds feel most comfortable in?", "choices": ["shallow water", "deep water", "breaker water", "peaking water"], "correct_choice_idx": 1, "direct_answers": ["deep water", "shallow", "deep", "shallow", "shallow", "shallow", "shallow", "deep", "deep", "shallow"], "difficult_direct_answer": false, "rationales": ["These birds like deep water so they can dive.", "A single bird with a large beak is in the water. the water is darker around it that indicates it is deep.", "Pelicans dive into the water so they don't like shallow places."], "image": "val2014/COCO_val2014_000000337194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198240, "question_id": "F7Hg5JyJEYrXnmr8mEyjn6", "question": "Why is this person so messy?", "choices": ["is misbehaving", "bad manners", "baby", "is blind"], "correct_choice_idx": 2, "direct_answers": ["messy kid", "finger eating", "they're immature", "using hands", "baby", "baby", "baby", "baby", "cake", "child"], "difficult_direct_answer": false, "rationales": ["The little tyke doesn't even know he's being messy.", "Most toddlers don't know about being clean when eating.", "Children develop slowly and manners are learned as they get older."], "image": "train2014/COCO_train2014_000000198240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569332, "question_id": "F7QYavMNeQQcR5pEwcWDpc", "question": "What could pop that's attached to the bench?", "choices": ["bubble", "balloon", "tire", "ball"], "correct_choice_idx": 1, "direct_answers": ["balloon", "balloon", "balloon", "balloon", "balloon", "balloon", "balloon", "balloon", "balloon", "balloon"], "difficult_direct_answer": false, "rationales": ["That's the only thing that could pop.", "The balloon could pop", "The balloon can easily be popped with a sharp object."], "image": "train2014/COCO_train2014_000000569332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394447, "question_id": "F7sQvwJxGKBc5GwPPy8LcF", "question": "What is the man holding along in his hands with his sandwich?", "choices": ["camcorder", "camera", "tablet", "phone"], "correct_choice_idx": 0, "direct_answers": ["camera", "camera", "camcorder", "camera", "camera", "camera", "camera", "camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The man is recording with his food.", "The man has a camcorder.", "He has a video recorder in his other hand."], "image": "train2014/COCO_train2014_000000394447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337246, "question_id": "F83RDGvs5qZzwPMdKkHppa", "question": "What is on the other end of these sails?", "choices": ["boats", "dogs", "birds", "paragliders"], "correct_choice_idx": 3, "direct_answers": ["person", "ocean", "rope", "surfers", "ocean", "humans", "paragliders", "string", "water", "water"], "difficult_direct_answer": false, "rationales": ["Parasailers are in the water.", "These people use something similar to these when flying through the air", "Paragliders are hanging from the sails."], "image": "val2014/COCO_val2014_000000337246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514800, "question_id": "F8CayK4xwXtBGbqbm2sDkV", "question": "What brand of car does this person drive?", "choices": ["volkswagen", "honda", "toyota", "ford"], "correct_choice_idx": 0, "direct_answers": ["unknown", "volkswagen", "vw", "volkswagen", "volkswagen", "volvo", "volks wagon", "volkswagen", "vw", "vw"], "difficult_direct_answer": false, "rationales": ["The key fob for the car has a vw logo on it.", "The keys have a vw logo on them.", "The car keys contain the volkswagen logo."], "image": "train2014/COCO_train2014_000000514800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475142, "question_id": "F8DH9Fm7nyfycEPSy5XxM3", "question": "What is the pink layer on top of the donut?", "choices": ["ice cream", "syrup", "sprinkles", "frosting"], "correct_choice_idx": 3, "direct_answers": ["yoghurt", "icing", "frosting", "icing", "frosting", "frosting", "icing", "frosting", "strawberry frosting", "icing"], "difficult_direct_answer": false, "rationales": ["The donut on top has pink icing.", "The pink layer has been added after baking and has solidified.", "These doughnuts are iced and have different flavors that are associated with the color."], "image": "train2014/COCO_train2014_000000475142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84821, "question_id": "F8HWgDHbhU6pFrTimdsvFj", "question": "What does the line near the hydrant signify?", "choices": ["hopscotch boundary", "road intersection", "turn here", "handicap parking"], "correct_choice_idx": 1, "direct_answers": ["crosswalk", "crosswalk", "cross walk", "crosswalk", "road intersection", "stop line", "stop fire", "crosswalk", "crosswalk", "sidewalk"], "difficult_direct_answer": false, "rationales": ["A single white line is painted across a street.", "The line is the intersection.", "The line on the hydrant signifies the intersection of a road."], "image": "train2014/COCO_train2014_000000084821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95185, "question_id": "F8JuxwRuuwd4uvoAaXULiS", "question": "Which chair would someone most likely bump into if they entered through the door?", "choices": ["far-right one", "red one", "white one", "rocking one"], "correct_choice_idx": 1, "direct_answers": ["white", "red", "pink chair", "red", "red one", "chair", "white chair", "white chair", "red one", "blue one"], "difficult_direct_answer": false, "rationales": ["The red chair is right in front of the door.", "The red chair is more close.", "A room is shown with chairs in it and a red chair is closest to the door."], "image": "train2014/COCO_train2014_000000095185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130816, "question_id": "F8RkhRXehXDuCEQ9eFQBc8", "question": "At which location are the Cardinals playing?", "choices": ["home field", "wrigley", "dodger stadium", "shea"], "correct_choice_idx": 0, "direct_answers": ["busch stadium", "field", "arizona", "home field", "ballpark", "arizona", "home", "phoenix", "home", "baltimore"], "difficult_direct_answer": false, "rationales": ["The baseball team is at home as seen by all the fans in red.", "There is a lot of red in the crowd, so it must be a home game.", "I'd say a given the colors in the stands, but it's hard to be certain."], "image": "train2014/COCO_train2014_000000130816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227893, "question_id": "F8ZFwraHQCbvovK56vEUFx", "question": "What items does the shop with blue trim sell?", "choices": ["postcards", "cigarettes", "pottery", "hats"], "correct_choice_idx": 2, "direct_answers": ["pottery", "not visible", "pottery", "crockery", "food", "handmade items", "mouse", "household goods", "gifts cards", "porcelain"], "difficult_direct_answer": true, "rationales": ["There are plates and such in the window of the store.", "There are many pieces of pottery in the window.", "As seen in the window and confirmed with google search."], "image": "val2014/COCO_val2014_000000227893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554598, "question_id": "F8jr8phSJjcrghidgeeaXS", "question": "What is the man using in the room?", "choices": ["sofa", "computer", "dresser", "couch"], "correct_choice_idx": 1, "direct_answers": ["computer", "computer", "computer", "computer", "computer", "computer", "computer", "computer", "computer", "computer"], "difficult_direct_answer": false, "rationales": ["The man has a computer set up on his desk.", "The man is clearly visible and regarding a screen that is the size and shape of answer a.", "The man in the room is using computer on the working desk."], "image": "train2014/COCO_train2014_000000554598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559619, "question_id": "F8mqcqpYn5Rj3LUob9iPBh", "question": "What country is this?", "choices": ["india", "canada", "ireland", "mexico"], "correct_choice_idx": 0, "direct_answers": ["saudi arabia", "china", "foreign country", "india", "india", "japan", "asia", "egypt", "saudi arabia", "iran"], "difficult_direct_answer": false, "rationales": ["India is showed due to the language on the stop sign.", "The writing on the sign is in the language of the country.", "This sign is in the north of india."], "image": "train2014/COCO_train2014_000000559619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380182, "question_id": "F8v83ahRLTvK9A72MopBBZ", "question": "What animals are on top of the cake?", "choices": ["bears", "birds", "dogs", "foxes"], "correct_choice_idx": 1, "direct_answers": ["monkey", "owls", "owls", "owls", "owls", "owls", "birds", "owls", "owls", "owls"], "difficult_direct_answer": false, "rationales": ["There are two owls on top of the cake.", "The animals can be identified from their eyes and ears as being owls, which is a species of bird.", "The cake topper is a pair of owls which are feathered animals that fly."], "image": "train2014/COCO_train2014_000000380182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138054, "question_id": "F93QXWhWmtCTk3M7WbxRMZ", "question": "What base is number 33 defending?", "choices": ["home plate", "second base", "third base", "first base"], "correct_choice_idx": 3, "direct_answers": ["ball", "home", "first", "first base", "first", "first", "first", "first", "first", "first base"], "difficult_direct_answer": false, "rationales": ["The location of 33 is at first base.", "Number thirty three is defending first base.", "He is at the base a batter runs to right after hitting the ball"], "image": "val2014/COCO_val2014_000000138054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467662, "question_id": "F99v2tP3oo2br47KzJzHBR", "question": "How many animals can be seen here?", "choices": ["one", "twelve", "six", "four"], "correct_choice_idx": 0, "direct_answers": ["four", "three", "two", "three", "three", "three", "one", "one", "three", "elephant"], "difficult_direct_answer": false, "rationales": ["There is one single elephant.", "Only a single animal can be seen", "A lone elephant is seen walking away behind a shrub."], "image": "train2014/COCO_train2014_000000467662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238290, "question_id": "F9EGTUgQqTSpxDbu6PM78f", "question": "What is in the air?", "choices": ["tennis ball", "airplane", "balloon", "flying saucer"], "correct_choice_idx": 1, "direct_answers": ["aeroplane", "airplane", "airplane", "aeroplane", "airplane", "aeroplane", "airplane", "airplane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["The aircraft can be seen flying.", "A plane is flying.", "This is a flying vehicle that has wings"], "image": "train2014/COCO_train2014_000000238290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352117, "question_id": "F9Fjib7cACc753ciHaUsYV", "question": "What are the bottles on top of the fridge very likely to contain?", "choices": ["preserves", "vinegar", "alcohol", "juices"], "correct_choice_idx": 2, "direct_answers": ["wine", "wine", "liquor", "alcohol", "wine", "alcohol", "beverages", "alcohol", "wine", "alcohol"], "difficult_direct_answer": false, "rationales": ["People tend to put alcoholic beverages up high and out of the hands of little kids.", "By the shape and labeling you can tell what they are.", "The color of the bottles as well as the size and shape indicate they may contain a variety of alcoholic beverages."], "image": "train2014/COCO_train2014_000000352117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311471, "question_id": "F9HwBpaYEZHvqeHgACjAJB", "question": "In which direction will the pink car go?", "choices": ["turn right", "turn left", "back up", "go straight"], "correct_choice_idx": 0, "direct_answers": ["right", "forward", "turn around", "straight", "south", "making turn", "opposite", "turn right", "turn right", "left"], "difficult_direct_answer": true, "rationales": ["The direction is right.", "The pink car is in the turning lane to the right.", "The pink car is in the lan to do so."], "image": "val2014/COCO_val2014_000000311471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187450, "question_id": "F9W2f6Qt9mb4R9B5rycRP5", "question": "What is the sum of the three digits on the train?", "choices": ["12", "82", "four", "25"], "correct_choice_idx": 0, "direct_answers": ["192", "twelve", "twelve", "twelve", "twelve", "twelve", "192", "12", "twelve", "12"], "difficult_direct_answer": false, "rationales": ["The sum of the number 1, 9 and 2 would be 12.", "As long as you add and subtract you can get the correct answer.", "The numbers are 1, 9, 2 and added together make twelve."], "image": "val2014/COCO_val2014_000000187450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34854, "question_id": "F9YhejLiLnsSURxEcfm8Zp", "question": "What is most likely the base of this soup?", "choices": ["spinach", "broccoli", "oranges", "tomato"], "correct_choice_idx": 3, "direct_answers": ["tomato", "onion", "tomato", "tomatoes", "tomato", "tomato sauce", "tomato", "tomato", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["The base of the red soup is tomato.", "It's red so it's probably a tomato base.", "A red colored soup is in a bowl. tomato soup is red."], "image": "train2014/COCO_train2014_000000034854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269228, "question_id": "F9bfV2VUg9iHbshpRrPBsn", "question": "What is preventing the dog from being submerged in the water?", "choices": ["surf board", "owner", "leash", "collar"], "correct_choice_idx": 0, "direct_answers": ["surfboard", "surfboard", "surfboard", "board", "surfboard", "surfboard", "board", "paddle board", "raft", "surf board"], "difficult_direct_answer": false, "rationales": ["He is standing on top of it as it floats", "The man is balancing on a long board on the water.", "A surfboard floats and the dog is standing on it."], "image": "train2014/COCO_train2014_000000269228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46056, "question_id": "F9dPxNHPcdvMpUdzP7EKWY", "question": "What does the dog leave in the sand every time he takes a step?", "choices": ["water", "pawprints", "footprints", "toys"], "correct_choice_idx": 1, "direct_answers": ["footsteps", "footprints", "pawprints", "paw print", "footprint", "pawprints", "foot prints", "footprints", "pawprints", "paw prints"], "difficult_direct_answer": false, "rationales": ["The dog will leave pawprints as its tracks.", "The dog leaves pawprints.", "The dog is walking in the sand. his paws leave marks since the sand is wet."], "image": "train2014/COCO_train2014_000000046056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132147, "question_id": "F9ybxUAarwDCLdFqkac88b", "question": "What profession deals with the item the baby is using?", "choices": ["police officer", "dentist", "fireman", "cab driver"], "correct_choice_idx": 1, "direct_answers": ["dentistry", "dentistry", "dentist", "dentist", "dentist", "dentist", "dentist", "dentist", "dentist", "dentistry"], "difficult_direct_answer": false, "rationales": ["The baby has a toothbrush in their mouth, the most popular item used to maintain oral health, and the professional of oral health is known as a dentist.", "A baby is holding a toothbrush. dentists take care of teeth.", "Dentists deal in matters of dental hygiene, such as those dealt with by toothbrushes."], "image": "train2014/COCO_train2014_000000132147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36907, "question_id": "FACVJnR4KTU9SgCiHQWkYx", "question": "Who will be riding those planes?", "choices": ["airforce", "animals", "passengers", "stunt pilots"], "correct_choice_idx": 3, "direct_answers": ["pilots", "pilots", "stunt pilots", "pilots", "stunt pilots", "pilots", "stunt pilots", "pilots", "pilots", "stunt pilots"], "difficult_direct_answer": false, "rationales": ["This is a stunt plane at an air show and is piloted by a stunt pilot.", "The plane is demonstrating a stunt while flying through the air, which means the plane is being flown by a stunt pilot.", "The planes are doing tricks in the air and only have enough space one or two people."], "image": "train2014/COCO_train2014_000000036907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203024, "question_id": "FADRJqHrCFS8Ffhekg5jgQ", "question": "What does the man in glasses pretend to play?", "choices": ["piano", "trumpet", "guitar", "organ"], "correct_choice_idx": 2, "direct_answers": ["guitar", "guitar", "guitar", "guitar", "guitar", "air guitar", "electric guitar", "guitar", "guitar", "guitar"], "difficult_direct_answer": false, "rationales": ["The guy is holding his hands like he's playing a guitar.", "The man is mimicking playing on a stringed instrument with a long neck.", "The man is playing air guitar with his fingers."], "image": "val2014/COCO_val2014_000000203024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373024, "question_id": "FAG8riZj4PViATBLpbTNYT", "question": "What is the man doing with the black cat?", "choices": ["feeding it", "petting it", "bathing it", "combing it"], "correct_choice_idx": 1, "direct_answers": ["scratching", "cuddling", "petting cat", "rubbing it", "scratching it", "scratching neck", "holding", "petting it", "petting", "holding it"], "difficult_direct_answer": true, "rationales": ["He has his hands on the cat and is smiling", "The man has his hands on the cat.", "Cats love to be pet. people also love to pet cats."], "image": "train2014/COCO_train2014_000000373024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42441, "question_id": "FAUVm4WFYCLkh57xmwH5bP", "question": "Who controls the horse?", "choices": ["man", "woman", "boy", "girl"], "correct_choice_idx": 0, "direct_answers": ["reigns", "man", "man", "man", "green shirt", "driver", "crop", "man", "driver", "man"], "difficult_direct_answer": false, "rationales": ["The man has access to the controls that make the horse go or stop.", "Several people are inside a carriage as a guy in front pulls on the reigns. it helps direct the speed and direction of the horse.", "A man holds the reigns attached to a horse. reigns are used to control horses by humans."], "image": "train2014/COCO_train2014_000000042441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35948, "question_id": "FAZjzwiuLCHZ8MNK4oRJT3", "question": "What is she getting ready to do?", "choices": ["serve", "duck", "dunk", "paddle"], "correct_choice_idx": 0, "direct_answers": ["serve ball", "serve", "serve", "serve", "serve", "serve ball", "serve", "target ball", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The person is serving.", "This woman is preparing to throw her tennis ball in the air and strike it. this would be called a serve in the game of tennis.", "She has a tennis racquet in one hand and a tennis ball in the other. she is about to hit the ball."], "image": "train2014/COCO_train2014_000000035948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22979, "question_id": "FAtyX933qKopWtSAnyuyQm", "question": "What activity do the persons near the bus take part in?", "choices": ["harvest", "tourism", "sales", "marketing"], "correct_choice_idx": 1, "direct_answers": ["party", "talking", "christmas party", "touring", "touring", "standing", "touring", "cheering", "sightseeing", "tourism"], "difficult_direct_answer": false, "rationales": ["The persons are near the bus taking part in some tourisms.", "The people are all going on tour in the city.", "They are visiting different places to site see."], "image": "val2014/COCO_val2014_000000022979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469165, "question_id": "FB6EjEVRocQZeZSQ3XTogE", "question": "Over which shoulder will the batter watch the pitcher?", "choices": ["his right", "pitchers", "left", "catchers"], "correct_choice_idx": 0, "direct_answers": ["left", "right", "right", "left", "left", "his right", "left", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["The batter will look to his right.", "He bats left-handed so it would be the other shoulder.", "This shoulder is the one closer to the pitcher"], "image": "val2014/COCO_val2014_000000469165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51396, "question_id": "FBACMMfSMUM3vrn2pYemXf", "question": "What is the rack on the far left used for?", "choices": ["skis", "hats", "coats", "goggles"], "correct_choice_idx": 0, "direct_answers": ["lifting skiers", "protect", "support", "storage", "ski's", "ski rack", "ski storage", "skis", "skis", "skis"], "difficult_direct_answer": false, "rationales": ["Given the skiers and skis in this mountain scene we can conclude this rack holds their skis.", "You can leave your skis here.", "The rack is where skiers can leave their gear."], "image": "train2014/COCO_train2014_000000051396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501138, "question_id": "FBKGGSiEXjjZXiyWWC6ebV", "question": "Where would this set up occur?", "choices": ["office/workplace", "bedroom", "closet", "attic"], "correct_choice_idx": 0, "direct_answers": ["office", "office/workplace", "office", "cubicle", "office", "office", "office building", "office", "office", "office building"], "difficult_direct_answer": false, "rationales": ["The other options don't match a cubicle setting.", "There are office equipment seen in the area.", "It's a cubical at a place of employment."], "image": "train2014/COCO_train2014_000000501138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320643, "question_id": "FBQz9VFPNv5YGEJupfYUrp", "question": "What sound source can be found above the clock here?", "choices": ["intercom", "choir", "boom box", "bell"], "correct_choice_idx": 3, "direct_answers": ["bell", "bell", "belltower", "chime", "bell", "bell", "bells", "bell", "bell", "bell"], "difficult_direct_answer": false, "rationales": ["The bell makes noise.", "There is a bell in the church's tower.", "There are archway openings in the steeple to allow for the sound get out."], "image": "train2014/COCO_train2014_000000320643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549539, "question_id": "FBUXspvJAK3wwVwGqAWisP", "question": "Who have deciduous teeth?", "choices": ["adults", "animals", "babies", "birds"], "correct_choice_idx": 2, "direct_answers": ["unknown", "babies", "toddler", "child", "babies", "babies", "child", "babies", "babies", "babies"], "difficult_direct_answer": false, "rationales": ["That's what they call the primary teeth.", "Babies have these teeth.", "Teeth that are deciduous are found in newborns."], "image": "train2014/COCO_train2014_000000549539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72514, "question_id": "FBfSabf6CJvVCpFSsrXM7B", "question": "Cows belongs to which food classification group?", "choices": ["carnivores", "omnivores", "none", "herbivores"], "correct_choice_idx": 3, "direct_answers": ["meat", "tree", "beef", "proteins", "herbivores", "bovine", "meat", "bovine", "meat", "dairy"], "difficult_direct_answer": false, "rationales": ["They're herbivores.", "Cows eat grass.", "Cows do not eat other animals."], "image": "val2014/COCO_val2014_000000072514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555797, "question_id": "FBoUuro3bzCmKvdKUBAm6G", "question": "What is the large black object used for?", "choices": ["watching television", "eating", "cooking", "storage"], "correct_choice_idx": 0, "direct_answers": ["watching", "watching television", "watching television", "watching television", "television", "watching", "watch television", "tv", "watch television", "watching programs"], "difficult_direct_answer": false, "rationales": ["The large black object is a television set that people can sit around and watch.", "The glossy black screen and square shape identify this item as a television. this item is used by powering it on and watching it.", "You view shows and movies on this"], "image": "train2014/COCO_train2014_000000555797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301648, "question_id": "FC44kSmLJVx8kXvTVjkn95", "question": "What surface is the man playing on?", "choices": ["clay", "carpet", "hard", "grass"], "correct_choice_idx": 2, "direct_answers": ["court", "hard", "cement", "court", "hard", "tennis court", "tennis court", "hard court", "clay", "tennis court"], "difficult_direct_answer": false, "rationales": ["It is a solid surface that isn't soil or grass", "The man is on a tennis court. tennis courts are hard.", "The man is on a solid tennis court."], "image": "train2014/COCO_train2014_000000301648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540460, "question_id": "FCJe7CJYepsh7xJjmVEmaq", "question": "What is the tallest item?", "choices": ["microwave", "refrigerator", "plant", "cabinet"], "correct_choice_idx": 3, "direct_answers": ["dresser", "refrigerator", "cabinet", "cabinet", "picture", "rear cupboard", "cabinet", "cabinet", "cabinet", "cabinet"], "difficult_direct_answer": false, "rationales": ["The refrigerator is tall, but the thing in the corner is taller.", "The cabinet is very tall.", "The other items are much shorter, but the fridge is almost as tall."], "image": "train2014/COCO_train2014_000000540460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70527, "question_id": "FCLMtQ57exFwqGf7fUwRzU", "question": "What type feast is being served here?", "choices": ["burger grill", "barbeque", "fish fry", "clam bake"], "correct_choice_idx": 1, "direct_answers": ["dinner", "meat", "barbecue", "thanksgiving", "cookout", "chicken", "barbecue", "barbeque", "holiday", "barbeque"], "difficult_direct_answer": false, "rationales": ["As indicated by the grilled parts of meat and many sauces. the foods necessary for the other options aren't in the image.", "Due to the meat's red color and the kinds of sauces on this table we can conclude it is barbeque.", "The meat is saucy like barbequed food would be."], "image": "val2014/COCO_val2014_000000070527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243569, "question_id": "FCQWyk5XUMjRSdQ9ELS4rB", "question": "What art form provides the greatest coverage on these walls?", "choices": ["photography", "mosaic tiles", "painting", "sculpture"], "correct_choice_idx": 0, "direct_answers": ["photography", "pictures", "photos", "photography", "photos", "polaroids", "photographs", "photos", "photography", "photos"], "difficult_direct_answer": false, "rationales": ["The pictures look like they're taken from a camera.", "People usually put pictures on their walls because they're removable.", "The pictures are covering the most area on the wall."], "image": "val2014/COCO_val2014_000000243569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102843, "question_id": "FCTVcYU3n8NSoEauU6cYjz", "question": "What purpose does the red tower serve?", "choices": ["toll taking", "aquarium", "warning ships", "prison"], "correct_choice_idx": 2, "direct_answers": ["visibility", "visibility", "lighthouse", "warn vessels", "lighthouse", "lighthouse", "lighthouse", "warning ships", "lighthouse", "lighthouse"], "difficult_direct_answer": false, "rationales": ["That is a lighthouse which helps ships see the shore.", "The red tower is on a jetty that sticks out into the water. the red tower is a light house.", "The purpose is to warn ships."], "image": "val2014/COCO_val2014_000000102843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283548, "question_id": "FCnQYRaLpuezbwFhtbuoC2", "question": "What was the boy doing?", "choices": ["bathing", "hiding", "sleeping", "eating"], "correct_choice_idx": 1, "direct_answers": ["hiding", "goofing", "inside suitcase", "hiding", "hiding", "hiding", "hiding", "hiding", "hiding", "inside briefcase"], "difficult_direct_answer": false, "rationales": ["He is inside the suitcase", "He was playing and got in there to not be seen.", "He is popping up from inside a suitcase"], "image": "train2014/COCO_train2014_000000283548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30222, "question_id": "FCsGcw7pmPASG2NJkSYfis", "question": "What kind of room lies behind these closed doors?", "choices": ["recording studio", "personal bedroom", "airport lounge", "hotel room"], "correct_choice_idx": 3, "direct_answers": ["hotel room", "hotel room", "hotel", "hotel", "hotel room", "hotel rooms", "hotel", "hotel", "hostels", "hotel"], "difficult_direct_answer": false, "rationales": ["This is a hallway of a hotel and there are several rooms on this floor.", "The rooms behind the doors are hotel rooms for travelers.", "There is a hotel room behind these doors."], "image": "val2014/COCO_val2014_000000030222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109050, "question_id": "FDe6PKFYiz8eJYKAVyApkX", "question": "How are they showing their pride?", "choices": ["raised fists", "american flag", "chinese flag", "rainbow flag"], "correct_choice_idx": 1, "direct_answers": ["flag", "american flag", "flying flag", "boating", "standing", "flags", "flying flag", "flag", "sailing", "american flag"], "difficult_direct_answer": false, "rationales": ["A boat has an american flag on it. flags are flown to show pride.", "They have their country's symbol waving on the boat.", "They have the stars and stripes on the back of the boat"], "image": "train2014/COCO_train2014_000000109050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528729, "question_id": "FDwXZ9y6GbrLXmj4Y7hRQK", "question": "What kind of establishment is the brown building?", "choices": ["church", "doctor office", "apartment", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "housing", "restaurant", "church", "hotel", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["The establishment is a restaurant.", "A building has a sign out front that advertises salads. restaurants serve salads.", "It has food listed on the sign."], "image": "val2014/COCO_val2014_000000528729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477906, "question_id": "FEGLFKpZFrxWAx5NjAp3hN", "question": "What is the man standing on?", "choices": ["escalator", "skateboard", "box", "horse"], "correct_choice_idx": 1, "direct_answers": ["skate board", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "snowboard"], "difficult_direct_answer": false, "rationales": ["The man is riding a skateboard on a plank.", "The man is on a skateboard.", "The item has a deck and four wheels."], "image": "val2014/COCO_val2014_000000477906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436492, "question_id": "FEHkwmMhQSvvmTdU8Fu8Rs", "question": "What is the purpose of the sign?", "choices": ["diversion", "solicit funds", "reroute traffic", "apologize inconvenience"], "correct_choice_idx": 3, "direct_answers": ["advertisement", "inform people", "maintaince", "directions", "pedestrian crossing", "advertise", "information", "construction promises", "apologize inconvenience", "creating jobs"], "difficult_direct_answer": true, "rationales": ["They are letting everyone know what is going on there.", "It's up so people are aware of the construction going on.", "The sign explains the inconvenience of road work and explains why it is occurring. major legislation lead to more manufacturing activity and the sign explains that."], "image": "val2014/COCO_val2014_000000436492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439770, "question_id": "FF9pjskRNYPSz2KxdhtpWG", "question": "What is below the paddle shown here?", "choices": ["whale", "person", "dolphin", "land shark"], "correct_choice_idx": 1, "direct_answers": ["person", "waves", "wave", "water", "person", "person", "water", "water", "water", "paddle board"], "difficult_direct_answer": false, "rationales": ["The person below the paddle is likely a human submerged in water.", "A person would have ridden the surfboard out into the water, and it is likely they have just fallen off and are holding on to the paddle under the water.", "There is a surfboard. there are no wild animals."], "image": "val2014/COCO_val2014_000000439770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455384, "question_id": "FFmUuRsTBS5DksV64eL45i", "question": "Why does the van have a realistic photo on the side?", "choices": ["by law", "advertising", "for fun", "for aesthetics"], "correct_choice_idx": 1, "direct_answers": ["advertisement", "publicity", "advertisement", "television advertisement", "advertisement", "why not", "advertising", "advertising", "advertisement", "advertisement"], "difficult_direct_answer": false, "rationales": ["It is promoting a show by showing the actors in the show.", "The van has a realistic photo for advertisement purposes on its side.", "The name of a television program with the time and network to watch it on is present on this truck. this truck is serving as an advertisement for the program."], "image": "val2014/COCO_val2014_000000455384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45292, "question_id": "FG3MCbS24HUy9f9HrqRkxt", "question": "What is he doing?", "choices": ["reading", "resting", "eating", "sleeping"], "correct_choice_idx": 0, "direct_answers": ["reading", "reading", "reading", "looking", "recording", "reading", "reading", "reading", "reading", "not man"], "difficult_direct_answer": false, "rationales": ["The women standing by the exit sign is looking down with a book in her hand.", "He is looking down at a document of some sort, and the action in a matches what people looking at documents do.", "He is holding a book in his hands and looking at it while he turns the pages."], "image": "train2014/COCO_train2014_000000045292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42977, "question_id": "FGAxA7HapEthz2d6VXSd7n", "question": "Persons here are viewing part of what?", "choices": ["future ride", "mall", "sale", "exhibition"], "correct_choice_idx": 3, "direct_answers": ["airplane", "plane", "plane wing", "airplane", "plane wing", "airplane", "plane", "airplane", "airplane show", "exhibition"], "difficult_direct_answer": false, "rationales": ["There are people milling about, sitting in chairs, showing that the plane is not about to take off but is part of a show where people can view it and others.", "This is a static display model at an airshow, which is an exhibition.", "These seated people are enjoying an air show"], "image": "train2014/COCO_train2014_000000042977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365739, "question_id": "FGFaoLLMLTeXnGL6vdWvEr", "question": "These animals are known for producing what?", "choices": ["eggs", "silk", "milk", "wool"], "correct_choice_idx": 2, "direct_answers": ["milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["Cows will produce milk that humans will drink.", "Cows give milk to humans.", "Cows are used to get milk from."], "image": "train2014/COCO_train2014_000000365739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527558, "question_id": "FGPJVu7tGzGNQ2tGDGPEZa", "question": "What do the men hope to bring home?", "choices": ["car", "dog", "fish", "women"], "correct_choice_idx": 2, "direct_answers": ["fish", "fish", "fish", "fish", "fish", "fish", "fish", "fish", "fish", "fish"], "difficult_direct_answer": false, "rationales": ["Men are standing on the shore with fishing poles.", "Several men are standing on the edge of the banks. they are currently fishing in hopes of getting food.", "The men are standing at the water's edge and have visible fishing poles. the object of fishing is most often to bring fish home."], "image": "val2014/COCO_val2014_000000527558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487375, "question_id": "FGtGKQaHVYrEeG5mwi5eeR", "question": "What kind of vegetables are held in the bag on the table?", "choices": ["bananas", "tomatoes", "leeks", "potatoes"], "correct_choice_idx": 3, "direct_answers": ["potatoes", "potato", "potatoes", "potatoes", "potato", "unknown", "potatoes", "potatoes", "potatoes", "apples"], "difficult_direct_answer": false, "rationales": ["These are chips and chips are made of potatoes.", "It's a bag with potatoes in it.", "One can see a bag on chips on the table."], "image": "val2014/COCO_val2014_000000487375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451284, "question_id": "FGta9pzfhUmF2XW3nmQkWL", "question": "What is brewing in the mug in front of the laptop?", "choices": ["kombucha", "juice", "coffee", "tea"], "correct_choice_idx": 3, "direct_answers": ["coffee", "tea", "tea", "tea", "tea", "tea", "tea", "tea", "tea", "tea"], "difficult_direct_answer": false, "rationales": ["There is a teabag floating in the mug.", "There is a bag with leaves brewing that is floating in the top of the mug.", "A mug with a tea bag is on a table. tea bags are used to brew tea."], "image": "val2014/COCO_val2014_000000451284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289276, "question_id": "FH66V3LPzzhrLVz6skauqH", "question": "What room is this woman standing in?", "choices": ["bedroom", "bathroom", "garage", "nursery"], "correct_choice_idx": 2, "direct_answers": ["garage", "garage", "garage", "kitchen", "garage", "kitchen", "garage", "kitchen", "basement", "garage"], "difficult_direct_answer": false, "rationales": ["The woman is in a room with a garage door.", "The room is the garage.", "The woman is in a garage since the room is so cluttered."], "image": "train2014/COCO_train2014_000000289276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18627, "question_id": "FHAyNAWewuqWpexgpXB3CU", "question": "What do the signs point to?", "choices": ["destinations", "buildings", "buses", "sales"], "correct_choice_idx": 0, "direct_answers": ["streets", "different towns", "towns", "locations", "landmarks", "streets", "roads", "left", "destinations", "frank house"], "difficult_direct_answer": true, "rationales": ["Signs designating local attractions are pointing in different directions.", "The signs point to cities.", "The signs point to different places that are of interest in the air such as anne frank's house."], "image": "train2014/COCO_train2014_000000018627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106119, "question_id": "FHNMiRqCsnJPkUz9FSG5jr", "question": "What is the nickname of the motor vehicle company advertised?", "choices": ["prius", "chevy", "hummer", "beamer"], "correct_choice_idx": 1, "direct_answers": ["chevy", "chevy", "chevy", "chevrolet", "chevy", "chevy", "chevy", "chevy", "chevy", "chevy"], "difficult_direct_answer": false, "rationales": ["An ad for chevrolet can be seen at a baseball stadium.", "The nickname is chevy.", "Chevy's logo is listed."], "image": "train2014/COCO_train2014_000000106119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366470, "question_id": "FHYWjnkiDSAMxdk4mhP3Lq", "question": "What religion does the man in the black hat seem to be?", "choices": ["catholic", "christian", "jewish", "atheist"], "correct_choice_idx": 2, "direct_answers": ["jewish", "jewish", "amish", "jewish", "jewish", "jewish", "jewish", "judaism", "jewish", "jewish"], "difficult_direct_answer": false, "rationales": ["Jewish people wear that type of hat.", "The man has a hat and beard and is wearing a black suit.", "The man is wearing the traditional garb of a rabbi."], "image": "train2014/COCO_train2014_000000366470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162867, "question_id": "FHYqhpEDjWVZSpFsfqsKRS", "question": "Where is this bike located?", "choices": ["driveway", "mechanic", "museum", "parking lot"], "correct_choice_idx": 2, "direct_answers": ["showroom", "museum", "bike shop", "garage", "museum", "convention", "museum", "display", "museum", "museum"], "difficult_direct_answer": false, "rationales": ["The motorcycle is on a red tarp on display for onlookers to observe.", "The bike is at a museum.", "The room is at a muesum."], "image": "val2014/COCO_val2014_000000162867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337705, "question_id": "FHuNLtjmaue7k22L8JW8TT", "question": "This man looks most similar to what historical figure?", "choices": ["mahatma gandhi", "date masamune", "cesare borgia", "benito mussolini"], "correct_choice_idx": 0, "direct_answers": ["gandhi", "gandhi", "grandi", "gandhi", "dali lama", "indira grandi", "mahatma gandhi", "grandi", "dali lama", "gandhi"], "difficult_direct_answer": false, "rationales": ["He does resemble the great philosopher of nonviolence.", "The man has brown skin and looks to be of indian, not italian or japanese, descent. he is wearing glasses.", "The man is like gandhi."], "image": "val2014/COCO_val2014_000000337705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335984, "question_id": "FHveBseLGJzw9mTAWULLJZ", "question": "What is the person who will take longest to begin skiing wearing?", "choices": ["red jacket", "green jacket", "brown jacket", "blue jacket"], "correct_choice_idx": 2, "direct_answers": ["beginner", "brown jacket", "last man", "brown jacket", "begginer", "brown", "last person", "backplate", "jacket", "coat"], "difficult_direct_answer": true, "rationales": ["That person is at the back of the line.", "The skier in beige and black is the only one to not have both his skis to the ground.", "The person doesn't have both skis on, and they're not prepared to go like everyone else."], "image": "train2014/COCO_train2014_000000335984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3602, "question_id": "FJPkQSRagHTGedZbGQibJK", "question": "Who should a crime be reported to?", "choices": ["pedestrian", "police division", "motorcyclist", "car driver"], "correct_choice_idx": 1, "direct_answers": ["westerville police", "police division", "police division", "police", "police", "police", "police", "police", "police division", "police division"], "difficult_direct_answer": false, "rationales": ["A police sign is hanging above a sidewalk. crimes are reported to the police.", "There is a sign that says \"westerville police division\".", "The crimes should be reported to the police division."], "image": "train2014/COCO_train2014_000000003602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200567, "question_id": "FJRoJnbtbJGnmcwYWoBCWM", "question": "Which player last had the baseball?", "choices": ["batter", "game official", "catcher", "pitcher"], "correct_choice_idx": 3, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["Based on his stance, one can see that he has just thrown the ball.", "He just threw it to the batter", "The pitcher has his hand extended as if he just threw the ball."], "image": "val2014/COCO_val2014_000000200567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271591, "question_id": "FJmca7RDFSy5TsUxXxzGeq", "question": "Why is the food in plastic containers?", "choices": ["to cook", "to sell", "to carry", "to marinate"], "correct_choice_idx": 2, "direct_answers": ["carbohydrates", "traveling", "to store", "lunch box", "keep fresh", "preserve freshness", "to eat", "transportation", "lunch storage", "to carry"], "difficult_direct_answer": true, "rationales": ["It's in them to store or take with you easier.", "This container is consumer tubberware. this item is used to transport food.", "This appears to be someone's lunch for work or school. they brought it with them."], "image": "train2014/COCO_train2014_000000271591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311015, "question_id": "FJofbxZZPiu4SgFjDaQbi4", "question": "What region of the United States does this team play in?", "choices": ["northwest", "southwest", "midwest", "northeast"], "correct_choice_idx": 2, "direct_answers": ["midwest", "east", "chicago", "california", "western", "illinois", "chicago", "midwest", "midwest", "los angeles"], "difficult_direct_answer": false, "rationales": ["The person is wearing a hat that has a cubs logo. the cubs play in chicago.", "This team most likely plays in the chicago cubs team of the midwest.", "The region is the midwest."], "image": "val2014/COCO_val2014_000000311015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151492, "question_id": "FJzFG2mpsCaEZ6YNtNz4HL", "question": "What is this type of phone called?", "choices": ["unibody", "smart", "flip", "micro"], "correct_choice_idx": 2, "direct_answers": ["flip", "flip phone", "flip", "flip phone", "flip", "flip", "cell phone", "flip", "flip", "flip phone"], "difficult_direct_answer": false, "rationales": ["The phone can be flipped.", "The phone does not support applications, consists of multiple pieces, and is regular sized. it has a hinge in the middle that allows it to be folded in half.", "A man is holding a phone with a hinge in the middle."], "image": "val2014/COCO_val2014_000000151492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54959, "question_id": "FK42fjwBgKukKxKH67mCd4", "question": "This type of oven is used to cook food in what manner?", "choices": ["fry it", "burn it", "broil it", "toast it"], "correct_choice_idx": 3, "direct_answers": ["toast it", "microwave", "microwave", "microwave", "microwave", "microwave", "fast", "fast", "microwave", "fast"], "difficult_direct_answer": false, "rationales": ["These type of machines use microwaves to cook or heat food.", "You can toast it in a toaster oven.", "This is a toaster oven you place food in it and it gradually warms it up and makes it a crunchy texture."], "image": "val2014/COCO_val2014_000000054959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139072, "question_id": "FKAJMbbGHWsAvg9vw3d5GF", "question": "In what year was a car first produced under the name on the right?", "choices": ["1955", "1926", "1915", "1906"], "correct_choice_idx": 1, "direct_answers": ["1926", "1885", "1885", "1926", "1926", "1926", "1926", "1926", "1926", "1926"], "difficult_direct_answer": false, "rationales": ["An internet search revealed the year that the first car was produced by the company on the right, mercedes benz.", "Benz started making cars in 1926.", "It was in 1926"], "image": "val2014/COCO_val2014_000000139072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427046, "question_id": "FKD5fCHsCrDciificMknYh", "question": "The objective is to move where in relation to the cones?", "choices": ["between them", "behind", "left", "right"], "correct_choice_idx": 0, "direct_answers": ["between them", "between", "around", "around", "in between", "between each", "inbetween", "zigzag", "between them", "between"], "difficult_direct_answer": false, "rationales": ["There is the objective to move in between the cones on either sides.", "The cones on the ground are there for the skater to ride between during the race.", "These two men are competing on skateboards and the cones are placed for an obstacle course."], "image": "train2014/COCO_train2014_000000427046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522243, "question_id": "FKKmzMAgFpCP42njtvUsTS", "question": "Why does the man in white have his arm out?", "choices": ["to wave", "to catch", "for balance", "to reach"], "correct_choice_idx": 1, "direct_answers": ["catch frisbee", "catch frisbee", "to catch", "catch frisbee", "catching", "catch frisbee", "frisbee", "play frisbee", "catch", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["The man opposite to him is about the throw the frisbee.", "The other man is about to throw a frisbee.", "The other man is about to throw a frisbee his way"], "image": "train2014/COCO_train2014_000000522243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420120, "question_id": "FKkKCrKETLnXHZAmxSCWdK", "question": "Where are these people located?", "choices": ["home", "reception hall", "office", "restaurant"], "correct_choice_idx": 0, "direct_answers": ["home", "home", "living room", "in house", "house", "couch", "living room", "living room", "home", "family room"], "difficult_direct_answer": false, "rationales": ["They appear to be sitting in a living room in a home.", "People are sitting on a couch. people have couches in their homes.", "They are on a couch."], "image": "val2014/COCO_val2014_000000420120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576818, "question_id": "FKoxRuWkmJKjKP7fGmEn2Z", "question": "What type of pepper is on the plate?", "choices": ["jalapeno", "cherry pepper", "pepperoncini", "red pepper"], "correct_choice_idx": 2, "direct_answers": ["yellow peppers", "pepperoncini", "green", "pepperocinni", "pepperoncini", "green", "ghost pepper", "pepperoncini", "pepperoncini", "jalapeno"], "difficult_direct_answer": false, "rationales": ["The pepper is a sweet pepper that is similar to the banana pepper.", "The pepper is light green, long, with a long stem.", "Pepperoncinis are light green."], "image": "train2014/COCO_train2014_000000576818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255598, "question_id": "FKuHwyErvDRidY9Q8ZNZzB", "question": "The color of the animal is the same as the color of what?", "choices": ["robin", "flamingo", "blue jay", "rhinoceros"], "correct_choice_idx": 3, "direct_answers": ["rocks", "dust", "black", "rhinoceros", "ground", "gray skies", "cement", "rocks", "rock", "stone"], "difficult_direct_answer": true, "rationales": ["Their color looks similar to a rhino.", "Elephants and rhinos are both gray.", "The color is the same as the rhino."], "image": "train2014/COCO_train2014_000000255598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84724, "question_id": "FKxLJxxoY6Q2buBMwGVwkb", "question": "What type of transportation is shown?", "choices": ["air", "water", "rail", "road"], "correct_choice_idx": 1, "direct_answers": ["water skiing", "boat", "water ski", "wakeboarding", "boat", "pontoon boat", "water", "boat", "wakeboard", "surfboard"], "difficult_direct_answer": false, "rationales": ["The person is skiing while being pulled by a boat.", "He is riding a board being pulled by a boat.", "A person is on a ski in the water holding onto a rope. the person is water skiing."], "image": "train2014/COCO_train2014_000000084724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228000, "question_id": "FL99vKt2QUefReFnT2Axeh", "question": "What class of pet do they have?", "choices": ["bovine", "equine", "canine", "feline"], "correct_choice_idx": 2, "direct_answers": ["dog", "labrador", "dog", "dog", "labrador", "dog", "dog", "dog", "labrador", "canine"], "difficult_direct_answer": false, "rationales": ["A canine is a dog.", "They have a dog.", "Dogs are classified under the canine family."], "image": "train2014/COCO_train2014_000000228000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219177, "question_id": "FLcv6bU4MVe5U3MEhawdRm", "question": "The animals seen most clearly here originate from what?", "choices": ["male organs", "vaginas", "eggs", "magic"], "correct_choice_idx": 2, "direct_answers": ["bird", "pelicans", "water", "eggs", "bird", "bird", "bird", "mallard", "birds", "water"], "difficult_direct_answer": false, "rationales": ["Most birds are hatched.", "The animals are birds which are hatched.", "Birds lay eggs. there are several birds lining the water."], "image": "train2014/COCO_train2014_000000219177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240378, "question_id": "FLmAh23vEcJgsmSARrT556", "question": "What is the woman helping to do?", "choices": ["burn", "tie", "cut", "feed"], "correct_choice_idx": 1, "direct_answers": ["tie", "tie tie", "tie tie", "tie tie", "tie", "button shirt", "knot tie", "tie tie", "tie", "tie tie"], "difficult_direct_answer": false, "rationales": ["The woman has her hands near the man's neck which indicates she's helping with his necktie.", "The woman is helping the man by forming a knot in his neckwear.", "The woman's helping to tie the tie."], "image": "train2014/COCO_train2014_000000240378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549048, "question_id": "FLuWgVaaxS2DComASCu2FW", "question": "What has caused the multiple spots on the road in front of the bus?", "choices": ["gasoline", "gum", "diesel", "motor oil"], "correct_choice_idx": 3, "direct_answers": ["motor oil", "potholes", "oil leaks", "grease", "bus stop", "spills", "oil", "oil", "stop", "lights"], "difficult_direct_answer": true, "rationales": ["Motor oil has dripped onto the road.", "These don't wash away easily so they remain on the street after cars have lost the fluid", "This substance leaves dark stains on asphalt"], "image": "val2014/COCO_val2014_000000549048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300663, "question_id": "FMHftZYwV4thLVyWohPqW5", "question": "What color is the vehicle on the right?", "choices": ["red", "green", "blue", "purple"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "There are two vehicles in the image and looking at it, it is clear which one is on the right and the color of that vehicle is red.", "Two motorcycles are on the road and the one on the right is red."], "image": "train2014/COCO_train2014_000000300663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41138, "question_id": "FMcpBZk3SNqunjBngzkxhZ", "question": "What type of transit hub are these people standing in?", "choices": ["taxi stand", "bus station", "airport", "train station"], "correct_choice_idx": 3, "direct_answers": ["train station", "subway", "train station", "subway", "train station", "rail", "metrolink", "train", "subway", "subway"], "difficult_direct_answer": false, "rationales": ["There is a train on the left side at the station.", "These passengers board and disembark a recently arrived train.", "There are rails on the right, and the vehicle on the left says what it is on the side."], "image": "train2014/COCO_train2014_000000041138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65383, "question_id": "FMhuUp5aRLiQkKkcN2diqd", "question": "How did the ball get there?", "choices": ["kicked", "dropped it", "wind blown", "found it"], "correct_choice_idx": 0, "direct_answers": ["kicked", "kick", "kick", "kicking", "kicked", "kicked forward", "kick", "was kicked", "kicked", "ball"], "difficult_direct_answer": false, "rationales": ["They are playing the game of soccer where hands are not allowed to be used. they need to use their feet to control the movement of the ball.", "The people are playing soccer. they use their feet to move the ball around.", "In soccer they kick the ball with their feet."], "image": "val2014/COCO_val2014_000000065383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540135, "question_id": "FMrHUMk9ahiPLptryNjRxS", "question": "Why is the dog in the air?", "choices": ["catching frisbee", "falling", "thrown there", "bouncing"], "correct_choice_idx": 0, "direct_answers": ["catching frisbee", "catching", "jumping", "catching frisbee", "leaping", "frisbee", "catch frisbee", "it jumped", "catching frisbee", "catching frisbee"], "difficult_direct_answer": false, "rationales": ["The dog is using his mouth to capture the flying disc.", "The dog wants the frisbee.", "The dog is catching a frisbee in its mouth."], "image": "train2014/COCO_train2014_000000540135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49378, "question_id": "FNHNSLHdNURm6EML4DVaA5", "question": "Upon what does the elevated man stand?", "choices": ["pitchers mound", "manure pile", "ant hill", "gopher run"], "correct_choice_idx": 0, "direct_answers": ["mound", "pitcher's mound", "mound", "mound", "pitchers mound", "pitcher's mound", "pitchers mound", "pitcher's mound", "mud", "mound"], "difficult_direct_answer": false, "rationales": ["The man is elevated upon a pitcher's mound.", "He is standing on a small hill where he will pitch the ball to the batter.", "That's the coveted spot for players."], "image": "train2014/COCO_train2014_000000049378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278418, "question_id": "FNM7dovexCkAvrakRfYFUr", "question": "What are they about do?", "choices": ["clean up", "go home", "argue", "collide"], "correct_choice_idx": 3, "direct_answers": ["collide", "collide", "kick ball", "playing soccer", "collide", "collide", "play", "kick", "collide", "collide"], "difficult_direct_answer": false, "rationales": ["They're running towards each other, so there's a chance they will run into each other.", "They're about to collide.", "They are running towards each other."], "image": "train2014/COCO_train2014_000000278418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575017, "question_id": "FNx8gMaTLKwnQPCrEDEA4o", "question": "What bathroom fixture is located in front of the woman at waist height?", "choices": ["sink", "hamper", "toilet", "towel rack"], "correct_choice_idx": 0, "direct_answers": ["towel rack", "towel rack", "mirror", "towel", "mirror", "mirror", "sink", "towel holder", "sink", "towel rack"], "difficult_direct_answer": false, "rationales": ["The woman is near her bathroom mirror which is usually near the sink.", "The woman is in a bathroom, and most bathrooms have washbasins under the mirror.", "A woman is standing in front of a mirror in a bathroom. most sinks in bathrooms have mirrors above them."], "image": "train2014/COCO_train2014_000000575017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390238, "question_id": "FNyWSQ2ZWfzMGjAfavAdx9", "question": "What type person would enjoy this pizza?", "choices": ["no one", "vegan", "omnivore", "vegetarian"], "correct_choice_idx": 2, "direct_answers": ["kid", "meat lovers", "pizza lover", "meat eater", "single", "hungry person", "omnivore", "individual", "almost anyone", "single"], "difficult_direct_answer": true, "rationales": ["The pizza has meat and veggies.", "A person that eats meat.", "An omnivore eats meat and vegetables. a pizza has pepperoni and olives on it."], "image": "val2014/COCO_val2014_000000390238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30133, "question_id": "FNzsXi3USjo5tmRABUZXyD", "question": "Which entity controls the conveyance seen here?", "choices": ["military", "toy stores", "movie studios", "post office"], "correct_choice_idx": 0, "direct_answers": ["driver", "driver", "driver", "army", "military", "military", "military", "military", "general", "french military"], "difficult_direct_answer": false, "rationales": ["The vehicle on the road is painted with colors used by the military for camoflauge.", "The man is wearing a uniform. the vehicle is armoured.", "The military is in control of this space."], "image": "train2014/COCO_train2014_000000030133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467332, "question_id": "FP2nPVna2Bqi2gTAj969zF", "question": "What is the young girl doing?", "choices": ["sleeping", "playing", "crying", "eating"], "correct_choice_idx": 0, "direct_answers": ["napping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "napping", "sleeping", "sleeping"], "difficult_direct_answer": false, "rationales": ["The girl has her eyes closed and is lying down so she must be sleeping.", "The child is stretched out on her back with her eyes closed.", "The girl is sleeping."], "image": "train2014/COCO_train2014_000000467332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566955, "question_id": "FPNhgLR4cN5iETTFRzaDer", "question": "What is the item on the floor called?", "choices": ["staple gun", "machete", "power strip", "bolt cutter"], "correct_choice_idx": 2, "direct_answers": ["power strip", "power strip", "surge protector", "power strip", "extension", "surge protector", "extension cable", "surge protector", "power bar", "power strip"], "difficult_direct_answer": false, "rationales": ["One can see the familiar multiple plus that are used to give electricity to an item.", "It has sockets for you to plug in cords for generating electricity to turn on a device.", "The item is a power strip."], "image": "train2014/COCO_train2014_000000566955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345944, "question_id": "FPeCqfJVUKNiovfvh84Ae7", "question": "What is the orange ball floating in the water behind a parked boat?", "choices": ["medicine ball", "beach ball", "anchor", "safety float"], "correct_choice_idx": 3, "direct_answers": ["helps navigating", "buoy", "buoy", "buoy", "buoy", "buoy", "safety float", "buy", "buoy", "buoy"], "difficult_direct_answer": false, "rationales": ["It marks the channel area where it's safe to move on a boat", "These are made to make the water safer for others and can be held on to in case of falling in the water.", "This is floating so the boats see it."], "image": "train2014/COCO_train2014_000000345944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4551, "question_id": "FPhvzPEuEyN4FCpdPUPqBT", "question": "What type of dinosaur does this represent?", "choices": ["triceratops", "t-rex", "brontosaur", "paradactyl"], "correct_choice_idx": 2, "direct_answers": ["brontosaurus", "stegosaurus", "big dinosaur", "brachiosaurus", "brontosaur", "brontosaurus", "brontosaurus", "sauropods", "brontosaurus", "brontosaurus"], "difficult_direct_answer": false, "rationales": ["This looks like the shape and neck style of a brontosaurus.", "The dinosaur is tall with a very long neck.", "The dinosaur is a brontosaur."], "image": "val2014/COCO_val2014_000000004551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573179, "question_id": "FPjSxNbVVBe5426whxX7Hb", "question": "How many total feet are making contact with the ground?", "choices": ["two", "six", "none", "four"], "correct_choice_idx": 1, "direct_answers": ["six", "six", "six", "six", "six", "six", "six", "six", "six", "six"], "difficult_direct_answer": false, "rationales": ["The horse has 4 and the boy has 2", "The horse has 4 feet and the boy has 2.", "The boy has two feet and the horse has four."], "image": "val2014/COCO_val2014_000000573179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231675, "question_id": "FPnZDBDnz9PbjdzF3Ycmmp", "question": "How are the two children seen here related?", "choices": ["siblings", "enemies", "mentor mentee", "parent child"], "correct_choice_idx": 0, "direct_answers": ["siblings", "siblings", "siblings", "siblings", "siblings", "siblings", "siblings", "siblings", "siblings", "siblings"], "difficult_direct_answer": false, "rationales": ["An adult is holding two children.", "The kids are siblings.", "The two children are both with a woman that is probably their mother, making them siblings."], "image": "train2014/COCO_train2014_000000231675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501153, "question_id": "FQKiaQy5TpakvHQLPvrQp2", "question": "Who did they want to be Vice President?", "choices": ["pence", "palin", "clinton", "biden"], "correct_choice_idx": 3, "direct_answers": ["biden", "biden", "joe biden", "biden", "joe biden", "biden", "biden", "biden", "biden", "biden"], "difficult_direct_answer": false, "rationales": ["The current us president was obama's running mate in 2008.", "The vp is biden.", "Biden was the running mate of obama and who these people would want to be the vice president."], "image": "val2014/COCO_val2014_000000501153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283101, "question_id": "FQpNefmkpH8BCXHKXARuCQ", "question": "What type of object powers these boats?", "choices": ["engine", "battery", "paddle", "sun"], "correct_choice_idx": 2, "direct_answers": ["oars", "oars men", "paddles", "people and oars", "paddle", "oars", "oars", "rowing", "oars", "oars"], "difficult_direct_answer": false, "rationales": ["You can tell by the oars that the people are using as to what is propelling the boat.", "The people are using long paddles to make the boat move.", "The people in the boats are holding a stick with a blade on the end and they use it and their physical strength to move the boat."], "image": "val2014/COCO_val2014_000000283101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124182, "question_id": "FQsCcGry33KRGkCKHjPVar", "question": "What kind of fruit is the yellow one?", "choices": ["mango", "apple", "pineapple", "pear"], "correct_choice_idx": 2, "direct_answers": ["pineapple", "pineapple", "pineapple", "pineapple", "pineapple", "pineapple", "pineapple", "pineapple", "pineapple", "pineapple"], "difficult_direct_answer": false, "rationales": ["The yellow fruit is a pineapple.", "Slices of a yellow fruit are on a plate. pineapple is yellow.", "The fruit is a citrus fibrous kind of plant."], "image": "train2014/COCO_train2014_000000124182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523412, "question_id": "FR46pdFHsx8cHS8mMCAXy5", "question": "What is the boy reaching for?", "choices": ["wine", "blanket", "pan", "scissors"], "correct_choice_idx": 2, "direct_answers": ["towel", "towel", "pan", "food", "towel", "towel", "towel", "pans", "pan", "towel"], "difficult_direct_answer": false, "rationales": ["The baby tries to reach the pan as shown on the pan.", "Blankets, scissors and wine are not pictured", "The boy is leaning on the oven and reaching towards the cookware that is on top of the stove burners."], "image": "train2014/COCO_train2014_000000523412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248250, "question_id": "FR7d5VBgFMViBNpJSRqJZF", "question": "What is probably stinking up the market area?", "choices": ["limburger cheese", "cow", "manure", "dead fish"], "correct_choice_idx": 3, "direct_answers": ["fish", "fish", "dead fish", "fish", "fish", "fish", "fish", "dead fish", "fish", "dead fish"], "difficult_direct_answer": false, "rationales": ["The red and blue container is holding seafood.", "As indicated in the foreground. the other options aren't displayed.", "The dead fish probably smell."], "image": "train2014/COCO_train2014_000000248250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195561, "question_id": "FRDFMAxkT9nf5EVymMVEgG", "question": "When does this season take place?", "choices": ["spring", "summer", "fall", "winter"], "correct_choice_idx": 3, "direct_answers": ["fall", "winter", "december", "rainy", "winter", "winter", "christmas", "christmas", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["The christmas tree indicates it's winter.", "The tree is for a holiday called christmas, which only happens during the cold season.", "There are christmas trees out."], "image": "val2014/COCO_val2014_000000195561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114674, "question_id": "FRLVxEhnsQkrPigSPnw5k2", "question": "What is happening to the young man's hat?", "choices": ["getting frozen", "getting soaked", "getting hot", "getting dirty"], "correct_choice_idx": 1, "direct_answers": ["on head", "on head", "getting wet", "wet", "getting wet", "getting wet", "wet", "getting soaked", "getting wet", "getting wet"], "difficult_direct_answer": false, "rationales": ["The woman is holding an umbrella with rain drops on it. it is raining.", "The young man is getting rained on.", "The man's hat is not under an umbrella and is exposed to rain."], "image": "train2014/COCO_train2014_000000114674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200109, "question_id": "FRPMSk5ihwCDXzYMhVjFHb", "question": "What kind of food is the person eating?", "choices": ["sandwich", "donut", "pizza", "taco"], "correct_choice_idx": 2, "direct_answers": ["pizza", "pizza", "burrito", "burrito", "mobile phone", "bread", "ice cream", "pizza", "pizza", "chicken"], "difficult_direct_answer": false, "rationales": ["You can see the crust and cheese...and maybe a small amount of tomato sauce. that's all the information we need.", "The white and yellow cheese, golden crust and specks of visible red sauce identify this food as pizza.", "The food is pizza."], "image": "val2014/COCO_val2014_000000200109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115718, "question_id": "FRh7pRbs7tLEpVm43QZEyV", "question": "What part of the body does Dr. Eng work on?", "choices": ["heart", "teeth", "ears", "lungs"], "correct_choice_idx": 0, "direct_answers": ["heart", "mouth", "teeth", "teeth", "teeth", "teeth", "teeth", "teeth", "mouth", "mouth"], "difficult_direct_answer": false, "rationales": ["The body part is the heart.", "This doctor labels themselves a dentist. the dentist takes care of teeth.", "Dentists work on teeth."], "image": "train2014/COCO_train2014_000000115718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392666, "question_id": "FRozuDqnAizQgBwkrZ8aJh", "question": "What animal statues are sitting on the chair?", "choices": ["cat", "mouse", "bird", "dog"], "correct_choice_idx": 2, "direct_answers": ["birds", "birds", "birds", "birds", "birds", "birds", "birds", "bird", "birds", "birds"], "difficult_direct_answer": false, "rationales": ["They have the shape of this animal", "The round stout shape and tucked in wigs of these statues identify them as birds.", "The statues are made to look like colorful birds."], "image": "train2014/COCO_train2014_000000392666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330652, "question_id": "FRtkQ3k2gVjr4WPMyPV7NS", "question": "Who will pay for this meal?", "choices": ["parent", "no one", "child rightmost", "leftmost child"], "correct_choice_idx": 0, "direct_answers": ["parent", "parents", "parents", "parents", "parents", "photographer", "parents", "parents", "parent", "adult"], "difficult_direct_answer": false, "rationales": ["The children are too young to pay for their own meal. the parent will likely pay.", "The parent will pay.", "The children pictured are too young to work and would not have money. they are also too young to be unattended so the meal would be paid for by the adult that accompanied them."], "image": "train2014/COCO_train2014_000000330652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360165, "question_id": "FSHenhwXGtKcQzkZAtgg3x", "question": "What drug might be contained in this cup?", "choices": ["cocaine", "meth", "weed", "caffeine"], "correct_choice_idx": 3, "direct_answers": ["sleeping drug", "caffeine", "caffeine", "caffeine", "caffeine", "caffeine", "coffee", "coffee", "caffeine", "caffeine"], "difficult_direct_answer": false, "rationales": ["It is a coffee cup. coffee does not contain narcotics or illegal stimulants.", "A coffee cup is on a dogs head. coffee has caffeine in it typically.", "This is a coffee cup"], "image": "train2014/COCO_train2014_000000360165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97315, "question_id": "FSRFMtr46gcHu7sswJpHuL", "question": "What has been reflected on the glass tabletop?", "choices": ["car", "river", "seagull", "palm tree"], "correct_choice_idx": 3, "direct_answers": ["tree", "palm tree", "tree", "palm tree", "sunlight", "glass", "palm tree", "palm tree", "sky", "palm trees"], "difficult_direct_answer": false, "rationales": ["The tree fronds are visible in the reflection", "You can see the tree on the table top.", "The reflection matches a type of common and well recognized tropical tree."], "image": "val2014/COCO_val2014_000000097315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138871, "question_id": "FSh5gyPCZhJr5bvbswGuti", "question": "What is sitting on the desk on the right is seen very little since the computer age?", "choices": ["typewriter", "mouse pad", "mouse", "corded phone"], "correct_choice_idx": 0, "direct_answers": ["typewriter", "computer mouse", "typewriter", "typewriter", "typewriter", "typewriter", "typewriter", "typewriter", "typewriter", "computer mouse"], "difficult_direct_answer": false, "rationales": ["The typewriter is evident in the picture.", "There is a typewriter on the desk against the wall.", "There is a typewriter next to the laptop."], "image": "val2014/COCO_val2014_000000138871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137339, "question_id": "FSuLVrWg5TYjxsMwm63fsX", "question": "What fruit is the same colour as the roundish flower on the cover?", "choices": ["apple", "plum", "orange", "damson"], "correct_choice_idx": 2, "direct_answers": ["orange", "pomegranate", "plum", "orange", "lime", "tomato", "strawberry", "orange", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["The flower on the sheet is orange. a common fruit that is orange in color is an orange.", "The fruit is an orange.", "The roundish flower is not red, green, purple, or blue."], "image": "train2014/COCO_train2014_000000137339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140664, "question_id": "FSuVSgj5QjzPUKgKSoV3US", "question": "What item here can sometimes be poisonous?", "choices": ["berry", "ant", "frog", "mushroom"], "correct_choice_idx": 3, "direct_answers": ["mushroom", "mushrooms", "mushrooms", "mushroom", "mushroom", "mushroom", "mushrooms", "mushrooms", "mushroom", "snails"], "difficult_direct_answer": false, "rationales": ["There are no fruits, amphibious animals, or insects in this dish. there are versions of the fungus that are poisonous.", "Most varieties of this are not edible", "Items b-d are not pictured"], "image": "val2014/COCO_val2014_000000140664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62652, "question_id": "FSw4FiCGnwHvMEdsBt3dKJ", "question": "What network is being shown on the television?", "choices": ["nbc", "cbs", "abc", "fx"], "correct_choice_idx": 0, "direct_answers": ["nbc", "espn", "nbc", "nbc", "nbc", "nbc", "nbc", "nbc", "nbc", "espn"], "difficult_direct_answer": false, "rationales": ["You can see the network logo on the tv screen.", "The colored logo signifies the peacock network.", "A nos logo is on a television."], "image": "train2014/COCO_train2014_000000062652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307249, "question_id": "FT4nUo2FMZiwsTPu48K4Du", "question": "Who buys the food prepared here?", "choices": ["animals", "their boss", "they do", "restaurant diners"], "correct_choice_idx": 3, "direct_answers": ["chef", "chef", "restaurant customers", "customers", "restaurant customers", "restaurant patrons", "kitchen", "manager", "customers", "restaurant diners"], "difficult_direct_answer": false, "rationales": ["This is a commercial restaurant kitchen.", "The food is for restaurant diners.", "This is a commercial kitchen. the employees and their boss are selling, not buying, the food being prepared."], "image": "train2014/COCO_train2014_000000307249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545082, "question_id": "FTJYza2DoYnPcB9XKmriou", "question": "What word can be seen on the sign to the right?", "choices": ["red", "new", "blue", "old"], "correct_choice_idx": 1, "direct_answers": ["new", "new", "new jhatoye", "new", "new jhatoye", "new hotoyo", "new", "new", "new", "new"], "difficult_direct_answer": false, "rationales": ["The word has three letters and is not old or red.", "The word \"new\" is visible on the side sign to the right.", "The word \"new \" appears on the sign."], "image": "train2014/COCO_train2014_000000545082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501247, "question_id": "FTZttKefzan3bVrR9xLqh2", "question": "How many years ago was this picture taken?", "choices": ["nine", "eight", "seven", "ten"], "correct_choice_idx": 0, "direct_answers": ["nine", "few years", "unknown", "nine", "nine", "nine years", "eight years", "nine", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["You can tell by the small watermark on the bottom right as to when it was taken.", "We are now in the year 2022, so if we subtract the year 2012 which is on the photo, we get this total.", "There is a watermark in the bottom right corner with the number 2012 visible. watermarks on photos often contain the year the photo was taken and this year was 9 years ago."], "image": "train2014/COCO_train2014_000000501247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556954, "question_id": "FTkx7UtYtMsuWAyLuFVTZs", "question": "Where is he brushing his teeth in the house?", "choices": ["kitchen", "bedroom", "living room", "bathroom"], "correct_choice_idx": 0, "direct_answers": ["kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["A refrigerator, flour, cabinets and a drying rack are present in this room. these items are found in the kitchen.", "A refrigerator, cabinets, and a drying rack for dishes are behind the man.", "He's in the kitchen."], "image": "train2014/COCO_train2014_000000556954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279849, "question_id": "FTmQuer4pwuXVCunLdzYNp", "question": "Where is the person traveling?", "choices": ["forest", "river", "subway", "roadway"], "correct_choice_idx": 3, "direct_answers": ["street", "white lines", "to work", "on road", "straight", "uptown", "on freeway", "shoulder", "away", "roadway"], "difficult_direct_answer": true, "rationales": ["The person is driving a motorcycle down the roadway.", "This person is traveling along a roadway as evidenced by the two lanes of traffic on the right.", "A motorcycle couldn't possibly be used in a subway or a river and it would be nearly impossible to move for very long in a forest."], "image": "train2014/COCO_train2014_000000279849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467318, "question_id": "FTrpCNsWsQeDSQrSf3UCLG", "question": "What type of shop is the woman near?", "choices": ["gas station", "florist", "car wash", "barber"], "correct_choice_idx": 1, "direct_answers": ["flower", "florist", "secondhand shop", "flower shop", "floral shop", "flower shop", "florist", "florist", "flower shop", "flower"], "difficult_direct_answer": false, "rationales": ["As indicated by the plants in the boxes in the window.", "There are a lot of flowers in the background.", "There are flowers behind the lady. flowers are usually sold at a flower shop."], "image": "train2014/COCO_train2014_000000467318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289128, "question_id": "FU4muz5KHnjjQs4KMD67qB", "question": "What drink has the owner of this cooler stocked up on?", "choices": ["beer", "soda", "water", "wine"], "correct_choice_idx": 2, "direct_answers": ["beer", "beer", "water", "beer", "sodas", "beer", "beer", "beer", "beer", "sodas"], "difficult_direct_answer": false, "rationales": ["All the beverages in the cooler are alcoholic.", "You can tell by the name brands and height of the bottles to what they are.", "Various glass bottles can be seen in a cooler."], "image": "val2014/COCO_val2014_000000289128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535393, "question_id": "FU7CWyX3gSgV434fTe3ppv", "question": "What type of food item does the graffiti spell out?", "choices": ["fruit", "pasta", "bread", "cheese"], "correct_choice_idx": 1, "direct_answers": ["pasta", "ziti", "pasta", "pasta", "pasta", "pasta", "ziti", "ziti", "apple", "ziti"], "difficult_direct_answer": false, "rationales": ["The graffiti says \"ziti\" which is a common shape of pasta.", "Ziti is a type of pasta noodle that is popular in many casserole-type dishes in italian cuisine.", "This is the name of a noodle"], "image": "train2014/COCO_train2014_000000535393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516500, "question_id": "FUR6qQjGVfQF4LnbXhoin3", "question": "Who took this picture?", "choices": ["judge", "parent", "reporter", "officer"], "correct_choice_idx": 1, "direct_answers": ["parents", "parent", "parent", "mother", "unknown", "person", "parent", "photographer", "someone else", "parent"], "difficult_direct_answer": false, "rationales": ["The picture is blurry. the person in the focus of the picture is a child.", "The childs mom or dad is with them and taking the picture.", "The parent took the photo."], "image": "train2014/COCO_train2014_000000516500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454894, "question_id": "FUhGhVE9RFdmBKPDb2xaBa", "question": "What is the lady's expression for the camera?", "choices": ["shock", "head down", "smile", "hand raised"], "correct_choice_idx": 0, "direct_answers": ["smile", "worried", "surprise", "smile", "smiling", "smiling", "uncertainty", "shock", "surprised", "worry"], "difficult_direct_answer": false, "rationales": ["Her hand and head are in normal positions. she is showing her teeth, but she is not smiling.", "She looks surprised.", "Her expression in the mirror tells the tale of her mood."], "image": "train2014/COCO_train2014_000000454894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319591, "question_id": "FUoG9YaHcQUzYTxEy8cAwC", "question": "What might people do in the blue structure?", "choices": ["sleep", "sell food", "sail", "cook"], "correct_choice_idx": 0, "direct_answers": ["sleep", "sleep", "sleep", "sleep", "sleep", "sleep", "sleep", "sleep", "sleep", "sleep"], "difficult_direct_answer": false, "rationales": ["There is a tent to shade people from the sun.", "The people will sleep.", "The blue structure is a tent designated for sleeping activity."], "image": "train2014/COCO_train2014_000000319591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211051, "question_id": "FV4t5rXYU7T4n4fih3TWV8", "question": "What would happen if you connected a microphone to the box turned it on and put it near the box?", "choices": ["short circuit", "explosion", "loud noise", "nothing"], "correct_choice_idx": 2, "direct_answers": ["squawk", "screech", "feedback screeching", "static noise", "noise", "feedback", "amplified sound", "squeal", "loud noise", "interference"], "difficult_direct_answer": true, "rationales": ["The microphone would produce a loud sound.", "There is an amp on the floor and it would make a loud noise if someone put a microphone near it.", "Loude noise as the microphone is big."], "image": "train2014/COCO_train2014_000000211051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65177, "question_id": "FVXCpdzD8xNiPprGh5wCnS", "question": "Why is the man in the silver vehicle?", "choices": ["to work", "to travel", "to eat", "to dance"], "correct_choice_idx": 1, "direct_answers": ["transportation", "riding", "going somewhere", "commuting", "transportation", "traveling", "transportation", "going somewhere", "to travel", "waiting"], "difficult_direct_answer": false, "rationales": ["This is a train", "This is obviously a sort of subway or metro car which is used to transport humans from one place to another.", "He is on the train so he can be taken to his destination."], "image": "train2014/COCO_train2014_000000065177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164864, "question_id": "FVcRhKy8fYRUeXwpKERFn3", "question": "What type of insect is the little girl's ride supposed to be?", "choices": ["dragonfly", "ant", "lady bug", "bee"], "correct_choice_idx": 2, "direct_answers": ["beetle", "ladybug", "ladybug", "beetle", "ladybug", "lady bug", "lady bug", "ladybug", "ladybug", "lady bug"], "difficult_direct_answer": false, "rationales": ["The girl's suitcase is red with black spots and resembles a ladybug.", "The ride is red and has black spots.", "The girl's suitcase is red with black dots."], "image": "train2014/COCO_train2014_000000164864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305901, "question_id": "FViAFJ6GVyhcqoshBxdogr", "question": "What is this person getting ready to do?", "choices": ["make call", "change channel", "eat dinner", "check temperature"], "correct_choice_idx": 0, "direct_answers": ["bike ride", "make call", "text", "make call", "make call", "using phone", "call", "call somebody", "bikeride", "bicycle"], "difficult_direct_answer": false, "rationales": ["This is a cellphone", "The phone is held as if dialing a number.", "A mobile phone is quite common for people to have today. its most important function is to make and receive calls."], "image": "train2014/COCO_train2014_000000305901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161725, "question_id": "FVwkaE9x5nR4wFcdArJRYk", "question": "What instrument is the man in the straw hat playing?", "choices": ["saxophone", "clarinet", "trumpet", "guitar"], "correct_choice_idx": 2, "direct_answers": ["trumpet", "trumpet", "trumpet", "trumpet", "trumpet", "trumpet", "trumpet", "trumpet", "trumpet", "trumpet"], "difficult_direct_answer": false, "rationales": ["The man is playing a brass instrument with a flared bell and three valves.", "A man is sitting on a bench with a gold colored instrument up to his pursed lips.", "One can make out the familiar shape of this brass instrument."], "image": "train2014/COCO_train2014_000000161725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263292, "question_id": "FW5iV7idPTxvsySSpE3Q4K", "question": "What is on top of the arch above the clock face?", "choices": ["door", "number", "face", "rooftop"], "correct_choice_idx": 2, "direct_answers": ["statue", "head", "face", "woman's head", "face sculpture", "face", "building name", "sign", "building name", "face"], "difficult_direct_answer": false, "rationales": ["A carved head appears above a clock on a building.", "The top has the face.", "There is a carved facial part of a head."], "image": "val2014/COCO_val2014_000000263292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194756, "question_id": "FWHoQpZBjbRawjgsJXGnPM", "question": "What will be parked in front of the people?", "choices": ["boats", "nothing", "yachts", "ships"], "correct_choice_idx": 1, "direct_answers": ["boats", "nothing", "boat", "boat", "boat", "boat", "boats", "nothing", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["There is a large \"p\" with a diagonal line slashed through it on the wall in front of the people showing that no parking is allowed there.", "People are sitting at benches in front of the water. a street is behind the benches.", "It looks like there is a no parking sign on the wall in front of the people."], "image": "val2014/COCO_val2014_000000194756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572840, "question_id": "FWPhdXieyBuAASrX36PLwZ", "question": "Which one of these cities might that train visit?", "choices": ["honolulu", "istanbul", "lima", "paris"], "correct_choice_idx": 3, "direct_answers": ["switzerland", "oslo", "paris", "china", "paris", "unknown", "paris", "mont louis", "paris", "san francisco"], "difficult_direct_answer": false, "rationales": ["Sncf is operated in france.", "The trains are located in france.", "The brand of train, sncf is native to france, and paris is the most populated city in france."], "image": "train2014/COCO_train2014_000000572840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443864, "question_id": "FWgd5CithDaiGsTysERMsu", "question": "How is this boat powered?", "choices": ["wind", "engine", "sun", "paddle"], "correct_choice_idx": 3, "direct_answers": ["paddle", "pole", "human strength", "oar", "by wind", "child", "kinetic", "paddle", "paddles", "paddle"], "difficult_direct_answer": false, "rationales": ["The boy in the back of the boat is paddling to supply power for the boat. there is no sail for wind or engine or solar panels.", "It's visible here, unlike a motor, solar panels and sails.", "The boat does not have an engine, solar panels, or a sail. the person in the back is using an item to power the boat."], "image": "train2014/COCO_train2014_000000443864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412914, "question_id": "FWjzJqJhnLHYXivPzwQaGy", "question": "What kind of a forest is this?", "choices": ["evergreen", "tropical", "deciduous", "maritime"], "correct_choice_idx": 0, "direct_answers": ["snowy", "snowy forest", "boat", "mountainous", "thick", "mountainous", "evergreen", "alpine", "snowy", "tropical forest"], "difficult_direct_answer": false, "rationales": ["This is a big evergreen forest.", "This is a large evergreen forest.", "The green part shows that it is the evergreen."], "image": "val2014/COCO_val2014_000000412914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501506, "question_id": "FWzeqRZQ63SjNdbu2u6PXW", "question": "What type of sign is shown?", "choices": ["scoreboard", "warning", "brand", "regulatory"], "correct_choice_idx": 0, "direct_answers": ["shoes", "competitors", "score", "scoreboard", "score board", "score", "scoreboard", "hitting tennisball", "score board", "scoreboard"], "difficult_direct_answer": false, "rationales": ["As indicated by the numbers and names.", "The sign shows the players names and numbers from the matches.", "A board behind a tennis court lists players name and numbers. score boards are used at sporting events."], "image": "val2014/COCO_val2014_000000501506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115005, "question_id": "FX4DrEMib7KG66doyQY4Md", "question": "What type of bicycle is located behind the desk?", "choices": ["hybrid", "road", "fixed gear", "mountain"], "correct_choice_idx": 2, "direct_answers": ["two wheeler", "race bike", "street", "cycle", "beach cruiser", "fixed gear", "mountain bike", "mountain bike", "road bike", "racing bike"], "difficult_direct_answer": true, "rationales": ["There is only one speed on this bike.", "The tires are wrong for c and there's no indication of d. it might be b as well.", "A bike with no gears on the handles is shown."], "image": "train2014/COCO_train2014_000000115005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174229, "question_id": "FXQvWKbVwx33HPmfTm5fSS", "question": "What is he holding in his right hand?", "choices": ["frying pan", "swatter", "lid", "racquet"], "correct_choice_idx": 3, "direct_answers": ["tennis racket", "racket", "tennis racket", "tennis racket", "tennis racket", "tennis bat", "racket", "racquet", "tennis racquet", "tennis racket"], "difficult_direct_answer": false, "rationales": ["The item has a handle, frame, and netting. it is used by tennis players.", "A guy has a stringed object with a handle in his hand.", "He's holding a racquet."], "image": "train2014/COCO_train2014_000000174229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445892, "question_id": "FXUPbDgRDKPdLDgVaoordK", "question": "For what do the people here wait?", "choices": ["taxis", "santa claus", "christmas", "train"], "correct_choice_idx": 3, "direct_answers": ["train", "travel", "train", "subway", "subway", "subway", "subway", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["The people are waiting by train tracks.", "The people have luggage and are waiting alongside tracks.", "They are standing at the edge of a set of tracks."], "image": "train2014/COCO_train2014_000000445892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538054, "question_id": "FXiAgHFX3ms5FGShuRJC7B", "question": "The woman eating the fruit is likely on the path for what reason?", "choices": ["surveying", "tourism", "assisting", "commuting"], "correct_choice_idx": 1, "direct_answers": ["tour", "touring", "hiking", "pick fruit", "tourism", "healthy", "tourist", "plantation tour", "tourist", "sight seeing"], "difficult_direct_answer": true, "rationales": ["She has a camera dangling from her wrist.", "The woman can be seen wearing a camera around her neck, which is often used by tourists to document their adventures.", "The fruit is used for tourisms."], "image": "val2014/COCO_val2014_000000538054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316075, "question_id": "FXwUYxMeKgVVt4byMGYXtZ", "question": "Why is the boy kicking his leg back?", "choices": ["for speed", "to fight", "to flip", "to roll"], "correct_choice_idx": 0, "direct_answers": ["pushing skateboard", "skateboarding", "for speed", "skateboarding", "momentum", "concrete", "skateboarding", "skateboarding", "to move", "moving skateboard"], "difficult_direct_answer": false, "rationales": ["He is pushing him self to get going on the skateboard.", "Kicking on a skateboard increases speed.", "A boy is pulling his leg back and using his foot to generate the skateboard across the road."], "image": "train2014/COCO_train2014_000000316075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575027, "question_id": "FXzBFV3Fz2eKwaydiGtdXo", "question": "What purpose does the green building to the left of the skate park serve?", "choices": ["parking area", "grocery store", "gas station", "convenience store"], "correct_choice_idx": 2, "direct_answers": ["gas station", "concessions", "gas station", "gas sales", "gas station", "gas station", "gas station", "play", "gas", "gas station"], "difficult_direct_answer": false, "rationales": ["The green building to the left of the park is for fueling cars.", "The purpose is a gas station.", "There is a be sign near the building. pumps are near the sign."], "image": "train2014/COCO_train2014_000000575027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92644, "question_id": "FXzTLSaF2TX3pPuUHHeijE", "question": "What does the man in blue shorts have in his hands?", "choices": ["tape recorders", "beer", "magic gloves", "ropes"], "correct_choice_idx": 3, "direct_answers": ["unknown", "boat", "nothing", "nothing", "ropes", "flotation device", "nothing", "gloves", "string", "nothing"], "difficult_direct_answer": false, "rationales": ["The man in blue shorts has apparently a rope inside of his hands.", "The man has ropes.", "Looks like he is holding a rope that is attached to the boat."], "image": "val2014/COCO_val2014_000000092644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575029, "question_id": "FY3E8jXiNTNuzCqykeqtYR", "question": "Who is the man feeding the banana to?", "choices": ["man riding", "himself", "horse", "man standing"], "correct_choice_idx": 2, "direct_answers": ["horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The man is holding out the banana for a horse to eat.", "He is offering it to the horse so he can eat if he wants.", "The man is feeding the horse."], "image": "train2014/COCO_train2014_000000575029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448851, "question_id": "FY3o8T5CCUE79CPLmcmwUv", "question": "The cats on the desk are illuminated by what source of light?", "choices": ["sun", "desk lamp", "moon", "overhead light"], "correct_choice_idx": 1, "direct_answers": ["desk light", "lamp", "sunlight", "lamp", "desk lamp", "artificial", "lamp", "lamp", "desk lamp", "overhead"], "difficult_direct_answer": false, "rationales": ["One can see that the light is extremely close to them and direct and they are resting on a desk.", "Because they are inside and it's dark outside you can tell what illuminates the desk.", "The cats are lit up with a desk lamp at night."], "image": "train2014/COCO_train2014_000000448851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469793, "question_id": "FY4pWp5atBpBbpt4AgiQvo", "question": "Which character wears a similar hat to this person?", "choices": ["freddy krueger", "link", "chef boyardee", "mario"], "correct_choice_idx": 2, "direct_answers": ["swedish chef", "chef", "chef", "swedish chef", "cartoon", "gordon ramsey", "chef boyardee", "chef louis", "chef", "bird"], "difficult_direct_answer": false, "rationales": ["Chef boyardee wears a chef hat.", "This is a chef's hat", "Chef boyardee wears a chef's hat."], "image": "val2014/COCO_val2014_000000469793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319088, "question_id": "FY7cNLeJ8gM52tAjFmjnKM", "question": "What is on the far left of the room?", "choices": ["lamp", "television", "chair", "baby"], "correct_choice_idx": 0, "direct_answers": ["lamp", "lamp", "nightstand", "nightstand", "lamp", "heater", "nightstand", "lamp", "nightstand", "nightstand"], "difficult_direct_answer": false, "rationales": ["By having a shade on it and the position of where it is you can tell what the appliance is.", "A light fixture is on a small table", "A lamp is on the table stand in the corner."], "image": "train2014/COCO_train2014_000000319088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413446, "question_id": "FY9ss6pMzo3iVxmWqWyHGT", "question": "What natural feature do the surfers like but the sailors of the boat here hate?", "choices": ["big waves", "tornados", "tidal wave", "calm waves"], "correct_choice_idx": 0, "direct_answers": ["wind", "waves", "waves", "waves", "waves", "waves", "waves", "waves", "waves", "big waves"], "difficult_direct_answer": false, "rationales": ["Surfers love huge ocean waves but people on sailboats hate them.", "The boat could be sunk by the big waves.", "These are fun on surfboards but dangerous for boats"], "image": "val2014/COCO_val2014_000000413446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290269, "question_id": "FYGvxgHzDeT7SBwLv3N7sD", "question": "What are the blue marks from?", "choices": ["paint", "crayon", "chalk", "oil"], "correct_choice_idx": 0, "direct_answers": ["dye", "paint", "chalk", "water", "paint", "spray paint", "identifying paint", "paint", "paint", "spray paint"], "difficult_direct_answer": false, "rationales": ["Livestock is marked for various reasons such as breeding or illness", "There are two sky colored marks on a sheep that is not natural which indicates that it was put onto the sheep.", "The marks are called sit marks and are used to help identify the sheep."], "image": "val2014/COCO_val2014_000000290269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475621, "question_id": "FYetpzsshg5cRWukBZjZtR", "question": "What type of mirror is the small circular one referred to as?", "choices": ["circular", "make-up", "extra", "round"], "correct_choice_idx": 1, "direct_answers": ["makeup", "magnifying", "face mirror", "round mirror", "magnifying", "magnifier", "hand mirror", "vanity", "makeup mirror", "make-up"], "difficult_direct_answer": true, "rationales": ["The small mirror is used for makeup application.", "This mirror can be used to put your make up on and look closely at your face or hair.", "Traditionally those shape of mirrors are used to get a closeup look at your face."], "image": "train2014/COCO_train2014_000000475621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111612, "question_id": "FYnuhb7RLLDhsYnWjn7nE4", "question": "Which rider is catching the wave the best?", "choices": ["boogie boarder", "skim boarder", "kite surfer", "surfer"], "correct_choice_idx": 3, "direct_answers": ["surfer", "right", "man", "man", "right rider", "far right", "man", "on right", "right", "surfer"], "difficult_direct_answer": false, "rationales": ["The surfer in green is on the wave.", "The surfer works with the wave in order to ride it. the surfer needs to be skilled in the areas of balance and timing.", "The surfer is catching the wave."], "image": "train2014/COCO_train2014_000000111612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59358, "question_id": "FYyPmHHFaG7TJtPjgvHCux", "question": "How many of the giraffes are attentive to the child?", "choices": ["three", "two", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["one", "two", "one", "two", "two", "one", "two", "one", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two giraffes are facing a child at a fence.", "Two giraffes are near the child.", "There are two giraffes."], "image": "train2014/COCO_train2014_000000059358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351352, "question_id": "FYzBbFE6qk8kkJaXd3nwDw", "question": "The first powered controlled Aero plane to fly is what?", "choices": ["mono plane", "satellite", "rocket", "biplane"], "correct_choice_idx": 3, "direct_answers": ["wright bothers", "wright flyer", "unknown", "biplane", "wright flyer", "wright flyer", "plane", "propeller", "airplane", "wright flyer"], "difficult_direct_answer": false, "rationales": ["Biplane is the name of the controlled airplane that was the first powered.", "The biplane was the first to fly", "The first powered airplanes had two sets of wings."], "image": "train2014/COCO_train2014_000000351352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555668, "question_id": "FZKnEGxmfY3zNj82CpfCqo", "question": "What type of diet does these giraffes have?", "choices": ["scavenger", "carnivore", "omnivore", "herbivore"], "correct_choice_idx": 3, "direct_answers": ["herbivore", "vegetarian", "plants", "herbivore", "straw", "plant based", "eat leaves", "grass", "not food", "vegetarian"], "difficult_direct_answer": false, "rationales": ["While we can't tell from the picture, the internet says giraffes eat only plants like leaves.", "Giraffes eat only plants.", "They are herbivore and eat leaves or plants."], "image": "train2014/COCO_train2014_000000555668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13874, "question_id": "FZjmoPJXdxYtWB3f4TWnCg", "question": "What is the most likely purpose for the glasses on the girls face?", "choices": ["hide hangover", "cool color", "extra weight", "blocking sun"], "correct_choice_idx": 3, "direct_answers": ["blocking sun", "sun protection", "shade", "sunblock", "shade eyes", "black", "sunglasses", "block sunlight", "sun protection", "eye protection"], "difficult_direct_answer": true, "rationales": ["It's a sunny day so it's safe to assume that she's wearing glasses for sun-blocking purposes.", "The shades are dark.", "She is wearing special glasses whose purpose is to protect her eyes from harmful uv rays and prevent glare."], "image": "train2014/COCO_train2014_000000013874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461917, "question_id": "FZntAvyKkcwpbr3vB5n2ty", "question": "What is near the cars?", "choices": ["cat", "turkey", "building", "dog"], "correct_choice_idx": 2, "direct_answers": ["asphalt", "building", "people", "buildings", "truck", "people", "sidewalk", "lamps store", "car", "building"], "difficult_direct_answer": false, "rationales": ["There are no non-human animals near the cars.", "The building is near.", "There isn't a single animal in sight."], "image": "train2014/COCO_train2014_000000461917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476706, "question_id": "FZoP5naBNRy7d7dEVvY5Rd", "question": "What sort of wearable item is available for sale within?", "choices": ["watch", "neck gear", "scarf", "socks"], "correct_choice_idx": 0, "direct_answers": ["watch", "clothes", "women's clothes", "dress", "watch", "watch", "jewelry", "clothes", "watch", "cothing"], "difficult_direct_answer": false, "rationales": ["Omega is a timepiece brand", "The store has a sign for omega watches in their window which are for sale within.", "There are watch brands advertised in the store windows. there is a large clock directly over the door, and the store appears to be somewhat fancy."], "image": "train2014/COCO_train2014_000000476706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49495, "question_id": "FaRKmKAdfMZnYA6UCLhbsP", "question": "Why is the Road Work Ahead sign upside down?", "choices": ["photo upsidedown", "hurried installer", "missing nail", "work finished"], "correct_choice_idx": 2, "direct_answers": ["misassembled", "fell down", "nail fell", "missing nail", "fell", "fell", "mistake", "rotating board", "loose", "loose nail"], "difficult_direct_answer": true, "rationales": ["The sign has fallen due to a faulty nail.", "There is a nail missing to hold the sign.", "The sign needs to have something tacked at the top."], "image": "train2014/COCO_train2014_000000049495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162358, "question_id": "FaZ76LafzCJQYHB929Lpw5", "question": "What large body of water is nearest this location?", "choices": ["arctic ocean", "atlantic ocean", "amazon river", "mississippi river"], "correct_choice_idx": 1, "direct_answers": ["atlantic ocean", "ocean", "atlantic ocean", "lake", "atlantic ocean", "ocean", "ocean", "ocean", "ocean", "atlantic ocean"], "difficult_direct_answer": false, "rationales": ["Maine is on the east coast of the united states.", "The sign identifies being in maine, which is nearest the atlantic ocean.", "The atlantic ocean is near this place."], "image": "val2014/COCO_val2014_000000162358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110551, "question_id": "Fb3ASugL7BhKDMkrHupmQ9", "question": "Why is he grabbing the board?", "choices": ["remove it", "hold on", "stop stealing", "keep it"], "correct_choice_idx": 3, "direct_answers": ["air trick", "to land", "safety", "doing trick", "trick", "trick", "keep it", "skateboard trick", "doing trick", "midair trick"], "difficult_direct_answer": false, "rationales": ["A skateboarder is holding onto the tip of a skateboard as he goes over a jump.", "The man wants to keep it.", "Maintaining possessing on the board is key in this activity"], "image": "val2014/COCO_val2014_000000110551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557678, "question_id": "Fb4hoG4qbEjVLZBrNwEZSD", "question": "What does the white mouse on the table do?", "choices": ["control computers", "food", "listening device", "glasses"], "correct_choice_idx": 0, "direct_answers": ["controls computer", "point", "help computer", "control laptop", "control pointer", "control", "control cursor", "apple laptop", "moves cursor", "control computers"], "difficult_direct_answer": true, "rationales": ["The mouse controls computers.", "A mouse is in front of a bunch of computers. a mouse is used with a computer.", "The white mouse ensure that the computers can be commanded."], "image": "train2014/COCO_train2014_000000557678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82263, "question_id": "FbF74VqHPS8MfLpE3cpPgC", "question": "The name of the boat might be short for what longer name?", "choices": ["vivienne", "victoria", "virginia", "veronica"], "correct_choice_idx": 2, "direct_answers": ["virginia", "virdie", "fishing boat", "virginia", "fishing", "virginia", "ship", "virgil", "virginia", "virginia"], "difficult_direct_answer": false, "rationales": ["The name of this boat is most likely short for virginia out of the options listed; mostly due to the presence of the letter g.", "The name appears to be \"virgie\". many nicknames that end with \"ie\" are shortened from longer names that include the same letters up to the ending \"ie\", but then end differently.", "The boat might be short for the virginia."], "image": "val2014/COCO_val2014_000000082263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171932, "question_id": "FbP54QjRZ942A6KYha5Q4a", "question": "What is the vessel called that's parked in the harbor?", "choices": ["ferry", "cruise ship", "battleship", "cargo ship"], "correct_choice_idx": 1, "direct_answers": ["cruise ship", "cruise ship", "cruise ship", "cruise ship", "ship", "cruise ship", "cruise ship", "ship", "cruise ship", "ship"], "difficult_direct_answer": false, "rationales": ["The ship is very large with many individual stories/floors to house guests", "These large ships are designed to carry a large number of passengers to seaside locations. they are used to entertain along the way have spaces large enough to accommodate people comfortably.", "The ship is white with colorful lining."], "image": "val2014/COCO_val2014_000000171932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383309, "question_id": "FbVrz9GHRCghkyJR44STyb", "question": "What does the Cola lack?", "choices": ["flavor", "sugar", "gas", "water"], "correct_choice_idx": 1, "direct_answers": ["sugar", "sugar", "sugar", "calories", "sugar", "sugar", "sugar", "sugar", "calories", "calories"], "difficult_direct_answer": false, "rationales": ["The d on the can identifies it as a diet drink. the item in a is typically left out of diet drinks.", "Diet soda doesn't taste as good as regular soda.", "The cola is diet coke. it uses artificial sweeteners instead of a real one."], "image": "train2014/COCO_train2014_000000383309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118150, "question_id": "FbXEwycwByveLfoh3ic6D5", "question": "Which character wore a similar head covering to this man?", "choices": ["tyrion lannister", "garfield", "tweety bird", "tir mcdohl"], "correct_choice_idx": 3, "direct_answers": ["tupac shakur", "jackie chan", "teppan", "tomb", "jack sparrow", "dashboard", "tommy oliver", "jack sparrow", "rosie riveter", "tir mcdohl"], "difficult_direct_answer": true, "rationales": ["The first character is known for wearing a bandana.", "Tir mcdohl is a character who wears a head bandana.", "Tir mcdohl wore a similar thing on this head."], "image": "train2014/COCO_train2014_000000118150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358223, "question_id": "FbacFcqQhFWQt5uoCNag5d", "question": "What is the type of cake?", "choices": ["layer cake", "shortcake", "sheet cake", "bundt cake"], "correct_choice_idx": 3, "direct_answers": ["bunt", "bundt", "marble cake", "bundt cake", "coffee", "bundt", "bundt", "bunt", "spice cake", "bundt"], "difficult_direct_answer": false, "rationales": ["The cake was baked in a bundt pan.", "The consistency and color of this cake identifies it as bundt.", "The cake is bundt cake."], "image": "train2014/COCO_train2014_000000358223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67788, "question_id": "Fc5HpsB95ZJvXFmpdpt4mL", "question": "What do many of the boats shown here normally use but lack here?", "choices": ["pirates", "sails", "steam shovels", "motors"], "correct_choice_idx": 1, "direct_answers": ["seven", "sails", "wind", "wind", "six", "six", "sails", "sails", "sails", "sails"], "difficult_direct_answer": false, "rationales": ["There are many sailboats.", "The sails are used to help move the boats while it is windy but it looks like nobody put them on the boats today.", "Most boats use the wind to propel across the water."], "image": "val2014/COCO_val2014_000000067788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440822, "question_id": "Fc9brjYtPJZ5AQfFnVBpB4", "question": "What is the primary vegetable in the salad?", "choices": ["lettuce", "tomato", "pumpkin", "cabbage"], "correct_choice_idx": 3, "direct_answers": ["cabbage", "cabbage", "mungbean sprouts", "cabbage", "cabbage", "mungbean sprouts", "cabbage", "cabbage", "cabbage", "veg fries"], "difficult_direct_answer": false, "rationales": ["A bin of vegetables has slices of a white leafy vegetable. cabbage is leafy and white.", "The veggies are cabbage.", "The vegetables in this salad are primarily cabbage."], "image": "train2014/COCO_train2014_000000440822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558194, "question_id": "FcCxmYVLtAEtByhEApQzXD", "question": "What genus is a sheep in?", "choices": ["ovis", "aries", "alium", "linnaeus"], "correct_choice_idx": 0, "direct_answers": ["ovis", "ovis", "ovis", "ovis", "sheep", "animal", "ovis", "mammal", "ovis", "ovis"], "difficult_direct_answer": false, "rationales": ["Sheep belong to the ovis family.", "Sheep are in the ovis genus.", "Ovis covers many breeds of domestic and wild sheep."], "image": "train2014/COCO_train2014_000000558194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391456, "question_id": "FcNp8ugPGG5sCGRmrPdqut", "question": "What flag is flying next to the clock?", "choices": ["united kingdom", "ireland", "scotland", "england"], "correct_choice_idx": 0, "direct_answers": ["england", "british", "united kingdom", "united kingdom", "uk flag", "united kingdom", "great britain", "england", "united kingdom", "british flag"], "difficult_direct_answer": false, "rationales": ["The flag has a red cross and blue and white colors.", "The flag of the united kingdom is flying.", "The clock tower is from that country."], "image": "train2014/COCO_train2014_000000391456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220607, "question_id": "FcfBr2FHwo4RdFVjRyJXUa", "question": "What is the camera setup on in the middle of the room?", "choices": ["chair", "desk", "computer", "tripod"], "correct_choice_idx": 3, "direct_answers": ["tripod", "tripod", "tripod", "projector", "tripod", "capture images", "standing", "tripod", "tripod", "yes"], "difficult_direct_answer": false, "rationales": ["The setup is the tripod.", "This camera setup uses three legs. generally desks, computers and chair do not have just three legs.", "There is a camera sitting on top of a triangular stand."], "image": "val2014/COCO_val2014_000000220607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439781, "question_id": "Fci6EKAUAGpgvvbRQPRcZn", "question": "What is the ground the elephants are walking on made from?", "choices": ["stone", "metal", "grass", "mud"], "correct_choice_idx": 0, "direct_answers": ["dirt", "sand", "concrete", "soil", "cement", "dirt", "dirt", "stone", "sand", "dirt"], "difficult_direct_answer": false, "rationales": ["The ground is stone.", "The ground is the same color and physical makeup as stone. it look strong and uniform.", "The elephants are walking on stone."], "image": "val2014/COCO_val2014_000000439781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34597, "question_id": "FcoYFUWmYiQNWn4HnDEX9u", "question": "What is the term for the maneuver the skaters will do next?", "choices": ["ollie", "manual", "dropping in", "kick flip"], "correct_choice_idx": 2, "direct_answers": ["downward slope", "skateboarding", "downhill", "skate drive", "dropping in", "jump", "downhill", "drop-in", "skateboard jumping", "drop in"], "difficult_direct_answer": true, "rationales": ["The men will go down the ramp.", "The skaters are ready to start falling on the ramp and are dropping in.", "The term is dropping in."], "image": "val2014/COCO_val2014_000000034597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553879, "question_id": "FcovaCYoCZJTdzEQCevSkQ", "question": "What structure is uppermost here?", "choices": ["underpass", "overpass", "cloverleaf", "ramp"], "correct_choice_idx": 1, "direct_answers": ["bridge", "overpass", "bridge", "bridge", "bridge", "bridge", "bridge", "bridge", "bridge", "bridge"], "difficult_direct_answer": false, "rationales": ["The structure is an overpass.", "There is another road which crosses perpendicularly above the main one.", "The only thing that is off of the ground is the elevated roadway."], "image": "val2014/COCO_val2014_000000553879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503945, "question_id": "FdBwYYPfoUE8JQFUbB7giF", "question": "The disk in the center of the pies here serve what preventive purpose?", "choices": ["none", "decorative only", "crushing", "mixing up"], "correct_choice_idx": 2, "direct_answers": ["box protection", "topping protection", "crushed", "holding", "pizza saver", "crushing", "collapsing lid", "smashing", "smearing", "pizza box"], "difficult_direct_answer": true, "rationales": ["No one wants the pizzas to get crushed, hence the center disk.", "Disks are in the center of pizza. plastic disks are used to keep boxes from crushing pizza when delivered.", "The white piece in the center keeps the lid from sticking to the cheese on the pizza"], "image": "train2014/COCO_train2014_000000503945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148353, "question_id": "FdJh6amrrgpPe2cRvcZcw3", "question": "What is this kind of bridge called?", "choices": ["river", "antique", "street", "overhead"], "correct_choice_idx": 3, "direct_answers": ["footbridge", "walkway", "pedestrian", "foot", "walkway", "pedestrian", "pedestrian bridge", "pedestrian bridge", "arch bridge", "overhead"], "difficult_direct_answer": false, "rationales": ["The overhead bridge is constructed on the highways.", "The bridge crosses over a road, and therefore is called an overhead bridge.", "The bridge goes over a road."], "image": "val2014/COCO_val2014_000000148353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287140, "question_id": "FdJmyqrPG25Bmab3YvEDPJ", "question": "Why are the bikes in this lot?", "choices": ["for sale", "parked", "stolen", "broken down"], "correct_choice_idx": 1, "direct_answers": ["maintenance", "racing", "designated for", "parking", "preparing race", "accident", "parked", "bike parking", "parked", "bike show"], "difficult_direct_answer": true, "rationales": ["The bikes are in a parking area.", "They are not moving and some are left with the kickstand down and no riders on them.", "The bikes are parked."], "image": "train2014/COCO_train2014_000000287140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367742, "question_id": "FdLxpvxwPaSEgBgv9LbNhM", "question": "How is this train powered?", "choices": ["gas", "electricity", "coal", "steam"], "correct_choice_idx": 1, "direct_answers": ["electric", "electricity", "electricity", "electricity", "electricity", "electricity", "electric", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["The train goes by electricity.", "The myriad overhead cables stretching as far as the eye can see make it a sure bet that this train runs on electricity.", "The power lines above the train provide power"], "image": "train2014/COCO_train2014_000000367742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36614, "question_id": "FdduTZJnNrSjNtWMewkDLE", "question": "What item is causing a condensation puddle on the table?", "choices": ["pizza", "burger", "water", "milk"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water glass", "water", "water glass", "glass", "water", "drink", "water glass", "water"], "difficult_direct_answer": false, "rationales": ["There is some water causing condensation rings on the table.", "The water cup is cold, causing condensation on the table.", "Condensation of moisture outside the glass has created this puddle of water."], "image": "val2014/COCO_val2014_000000036614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369082, "question_id": "FdiL6GWNhEkTadYRV2C9gW", "question": "What large appliance is shown?", "choices": ["dishwasher", "compost", "refrigirator", "stove"], "correct_choice_idx": 0, "direct_answers": ["stove", "stove", "washing machine", "dishwasher", "stove", "dishwasher", "stove", "stove", "dishwasher", "dishwasher"], "difficult_direct_answer": false, "rationales": ["A dishwasher is kept in the kitchen and most kitchens have them.", "The appliance is a dishwasher.", "The stove is the largest appliance."], "image": "val2014/COCO_val2014_000000369082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533739, "question_id": "FdwqZA6RRAn75M2pQogqm7", "question": "Who designed the first building's logo?", "choices": ["maurice finn", "doug thomas", "beth finkelstein", "paula scher"], "correct_choice_idx": 3, "direct_answers": ["paula scher", "citibank", "bank", "paula scher", "john glenn", "city bank", "citibank", "citibank", "citibank", "citibank"], "difficult_direct_answer": false, "rationales": ["The citibank logo was designed in 1998 by paula scher, a well known designer.", "The first building is citibank. i searched the internet for the name of the person who designed the logo.", "That is the designer of the logo."], "image": "train2014/COCO_train2014_000000533739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379350, "question_id": "Fe9YT9NshVwBox3pLXuqmL", "question": "What is the man on the left drinking?", "choices": ["juice", "wine", "water", "beer"], "correct_choice_idx": 3, "direct_answers": ["beer", "beer", "beer", "beer", "beer", "bud light", "beer", "bud light", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["The man on the left is drinking a can of beer.", "The man has beer.", "The man is holding a can, and beer comes in cans."], "image": "train2014/COCO_train2014_000000379350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405657, "question_id": "FeBeouE4WnLr8DgpRYRdvT", "question": "Why is he looking at the other guy's laptop?", "choices": ["is curious", "is teaching", "stealing information", "is learning"], "correct_choice_idx": 3, "direct_answers": ["working", "comparing work", "watching repair", "studying together", "learning", "interested", "learning", "is learning", "helping", "watching"], "difficult_direct_answer": true, "rationales": ["The guy seems to be taught something by the way he looks.", "The man is showing him something on his lap top", "They are trying to learn."], "image": "val2014/COCO_val2014_000000405657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151587, "question_id": "FeYDSjrNHkNPSwPjhEZVRD", "question": "What is the long vehicle for?", "choices": ["transporting children", "transporting cars", "transporting horses", "transporting goods"], "correct_choice_idx": 0, "direct_answers": ["school transport", "carrying schoolkids", "delivering children", "school transportation", "transporting children", "transporting children", "school bus", "bus", "school bus", "transport students"], "difficult_direct_answer": false, "rationales": ["The long vehicle is a yellow school bus.", "A school bus is yellow and in the street. school buses transport children.", "The long vehicle is a school bus, and it says so on the side. schools are usually reserved for young people."], "image": "train2014/COCO_train2014_000000151587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122871, "question_id": "FepbTsxNoXwW2miN5Vc5aG", "question": "Who did the British copy this game from?", "choices": ["dutch", "indians", "french", "turkish"], "correct_choice_idx": 1, "direct_answers": ["indians", "persia iran", "t-shirt", "persia", "france", "spanish", "iran", "persians", "persia", "persia"], "difficult_direct_answer": false, "rationales": ["A person in a polo uniform is on a horse near a trailer with a polo team logo on it.", "This game was taken from the indians.", "The british used it from indians."], "image": "val2014/COCO_val2014_000000122871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495903, "question_id": "Ff3MiQriH37Jbjo9P9bVqJ", "question": "What makes the stove here hot?", "choices": ["coal", "propane", "electricity", "gas"], "correct_choice_idx": 2, "direct_answers": ["electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["It has burners that produce fire", "An oven with coil burners is in a kitchen.", "The coils on the stovetop indicate the type of energy required by this stove."], "image": "train2014/COCO_train2014_000000495903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55467, "question_id": "FfLsZp3FczQLTeDDEd6eWA", "question": "Which teddy bear is playing the role of a sailor?", "choices": ["cream bow", "red scarf", "white hat", "spectacles"], "correct_choice_idx": 2, "direct_answers": ["captain bear", "brown one", "small one", "white hat", "baby bear", "light brown", "tiny foreground", "small one", "floor", "red bandanna"], "difficult_direct_answer": true, "rationales": ["The hat the bear is wearing is a traditional hat that sailors have worn on the past.", "This is typical of a navy hat", "People in the navy wear a white and blue hat."], "image": "val2014/COCO_val2014_000000055467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392269, "question_id": "FfPEEgBkE5X6FhLjFT7B9Y", "question": "What could be used to cover the cupcakes to keep them fresh?", "choices": ["paper", "bag", "plastic wrap", "ziploc bag"], "correct_choice_idx": 2, "direct_answers": ["saran wrap", "glass", "lid", "plastic wrap", "cake dome", "foil", "glass", "cake dome", "lid", "plastic wrap"], "difficult_direct_answer": false, "rationales": ["The cupcakes should be covered with plastic wrap.", "The plastic wrap keeps the air away.", "You use a plastic wrap to keep it fresh."], "image": "train2014/COCO_train2014_000000392269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372230, "question_id": "FfSxZqjZpQ88kBFNpZu84G", "question": "What is she ready to do next?", "choices": ["juggle", "swing", "dribble", "dunk"], "correct_choice_idx": 1, "direct_answers": ["hit ball", "hit ball", "hit", "hit ball", "swing", "hit tennisball", "hit tennisball", "swing", "swing", "hit ball"], "difficult_direct_answer": false, "rationales": ["The woman wants to swing her racquet.", "A tennis player pulls back her racket to hit a ball that's in the air.", "The position and alignment of her suggest that her next move is to hit the ball with the tennis racket in order to do this she must swing the racket which will cause the ball to propel across the net to her opponent."], "image": "val2014/COCO_val2014_000000372230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472614, "question_id": "Ffoh3xzegbPjniYK9zy3ZA", "question": "Where does it seem like this photo was taken based on the umbrellas?", "choices": ["uk", "italy", "germany", "france"], "correct_choice_idx": 0, "direct_answers": ["englad", "england", "busy street", "united kingdom", "uk", "canada", "england", "united kingdom", "france", "uk"], "difficult_direct_answer": false, "rationales": ["The umbrellas have the flag of the united kingdom on them.", "People are walking with umbrellas with the uk flag on it.", "This is the design of the union jack flag"], "image": "train2014/COCO_train2014_000000472614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301634, "question_id": "FfvGi2BZNxh6a5Hg97D2JV", "question": "Which car has violated the law?", "choices": ["silver car", "white car", "grey car", "black car"], "correct_choice_idx": 0, "direct_answers": ["silver car", "no parking", "middle", "small sedan", "first one", "small silver", "silver", "van", "silver", "silver ford"], "difficult_direct_answer": true, "rationales": ["The silver car isn't parked correctly.", "The silver car is parked in a firetruck space.", "The silver car violated the law."], "image": "val2014/COCO_val2014_000000301634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454915, "question_id": "FgTECfiAvBLgtn7G436Kex", "question": "What is the mouse next to?", "choices": ["cat", "goat", "keyboard", "cherry"], "correct_choice_idx": 2, "direct_answers": ["keyboard", "keyboard", "keyboard", "keyboard", "keyboard", "keyboard", "keyboard", "keyboard", "computer", "keyboard"], "difficult_direct_answer": false, "rationales": ["The mouse is beside the keyboard on the desk.", "It is roughly rectangular shaped and has lettered keys on it", "The mouse is next to piece of equipment used to type data, thus matching the item listed in option a."], "image": "train2014/COCO_train2014_000000454915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147506, "question_id": "FgbYpVVLsvhhhqjwb3YV4G", "question": "What type of area is fenced off behind the children?", "choices": ["villa", "boardwalk", "construction", "roadway"], "correct_choice_idx": 1, "direct_answers": ["boardwalk", "office", "street", "street", "sand", "boardwalk", "clock tower", "commercial buildings", "plaza", "building"], "difficult_direct_answer": false, "rationales": ["The boardwalk is fenced off.", "A place for people to walk is behind the fence.", "The children are in a sandy beach area. most beaches are separated from structures by a pathway called boardwalk."], "image": "train2014/COCO_train2014_000000147506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143737, "question_id": "Fgf8qQHUGqDK5mAvDThbUp", "question": "What type of vehicle are the men riding?", "choices": ["train", "motorcycle", "car", "bus"], "correct_choice_idx": 1, "direct_answers": ["motorcycles", "motorcycles", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycles"], "difficult_direct_answer": false, "rationales": ["The men are riding on a motorcycle.", "Each vehicle has only 2 wheels, not 4 or more.", "The men are riding on yellow motorcycles."], "image": "val2014/COCO_val2014_000000143737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128201, "question_id": "Fgm3cVKvD7TbJqGp6969Sg", "question": "Why are the chairs arranged this way?", "choices": ["for cleaning", "protect floors", "group gathering", "for sale"], "correct_choice_idx": 2, "direct_answers": ["make room", "meeting", "to socialize", "group gathering", "circle", "around table", "meeting", "group setting", "for meeting", "for conference"], "difficult_direct_answer": true, "rationales": ["There are several chairs facing each other.", "They're put in a shape that people can sit around and talk to each other.", "The cup on the floor next to the chair indicates someone was sitting there and put the cup on the floor when finished. this set up is common for people sitting in a sort of circle to talk."], "image": "val2014/COCO_val2014_000000128201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562101, "question_id": "Fgq2opQqqPsZoQiMyB6YLn", "question": "What caused the white lines and blurry lines in the sky?", "choices": ["photo shop", "gulls", "airplanes", "conspiriters"], "correct_choice_idx": 2, "direct_answers": ["contrails", "wind", "airplanes", "airplanes", "airplanes", "airplanes", "plane contrails", "clouds", "airplanes", "planes"], "difficult_direct_answer": false, "rationales": ["The lines in the sky are contrails caused by vehicles moving fast in the sky.", "Airplanes flying all over the sky made the lines in it.", "Airplanes flew across the sky and left behind white trails."], "image": "val2014/COCO_val2014_000000562101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376828, "question_id": "Fh2irYNb8qNJL3jpi2RUvk", "question": "What is the plant sitting on?", "choices": ["table", "trunk", "microwave", "refrigerator"], "correct_choice_idx": 3, "direct_answers": ["fridge", "fridge", "fridge", "fridge", "fridge", "mini frig", "refrigerator", "refrigerator", "refrigerator", "fridge"], "difficult_direct_answer": false, "rationales": ["There is a potted plant on the fridge.", "The thing is white and has a door inthe front of it.", "The plant is on the fridge."], "image": "train2014/COCO_train2014_000000376828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33536, "question_id": "FhEZeMfFcUSqG2xWZtp6PB", "question": "What fruit is this man going to eat?", "choices": ["apples", "strawberries", "bananas", "olives"], "correct_choice_idx": 3, "direct_answers": ["olives", "pizza", "olive", "pear", "pineapple", "pineapple", "orange", "unknown", "pineapple", "olive"], "difficult_direct_answer": false, "rationales": ["There are olives on the pizza.", "This pizza slice has round black slices on it. these black slices are likely olives.", "He is eating a slice of pizza, and olives can be seen as a topping on it, and it is also a popular choice of pizza topping, unlike the other fruits mentioned."], "image": "train2014/COCO_train2014_000000033536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321716, "question_id": "FhGb4fCQJw4FHBBDsZ4mxW", "question": "What team's hat is the skater wearing?", "choices": ["mets", "rams", "yankees", "cardinals"], "correct_choice_idx": 2, "direct_answers": ["yankees", "skating bored", "yankees", "yankees", "yankees", "new york", "yankees", "yankees", "skating", "yankees baseball"], "difficult_direct_answer": false, "rationales": ["The man's hat says ny on it.", "The hat has the yankees logo.", "The letters are on the front of it"], "image": "train2014/COCO_train2014_000000321716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360517, "question_id": "FhftJPgMLHitiV7saFdUd7", "question": "What is being sorted in this area?", "choices": ["papers", "laundry", "students", "colors"], "correct_choice_idx": 1, "direct_answers": ["clothes", "laundry", "laundry", "clothing", "laundry", "clothes", "clothes", "laundry", "laundry", "laundry"], "difficult_direct_answer": false, "rationales": ["There is a pile of clothes.", "There are clothes laid out to be folded on the chair and ironing board.", "Items can be seen lying around in someone living room. these most likely indicate that someone is either going to wash them or has just completed laundry."], "image": "train2014/COCO_train2014_000000360517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116326, "question_id": "FhkGgSvXgaV7HsLmgJzypv", "question": "Who is pictured in the painting in the background?", "choices": ["bob marley", "tupac shakur", "michael jackson", "snoop dog"], "correct_choice_idx": 0, "direct_answers": ["bob marley", "bob marley", "bob marley", "bob marley", "bob marley", "bob marley", "bob marley", "bob marley", "bob marley", "bob marley"], "difficult_direct_answer": false, "rationales": ["There is a portrait of bob marley in the background.", "A picture of a man with long dreadlocks and jamaican colors is on the wall. bob marley was jamaican and had long dreadlocks.", "The painting in the picture is of the well known musician bob marley."], "image": "train2014/COCO_train2014_000000116326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267885, "question_id": "FhmPtqa3JvDVya8egN9XPG", "question": "What are the men doing on top of the elephants?", "choices": ["feeding them", "riding them", "selling them", "cleaning them"], "correct_choice_idx": 1, "direct_answers": ["riding them", "riding", "sitting", "riding", "sitting", "riding", "riding", "riding", "riding", "sitting"], "difficult_direct_answer": false, "rationales": ["The elephants are being used for transportation.", "The men are riding.", "The men are sitting on the elephant so they can control them."], "image": "train2014/COCO_train2014_000000267885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136200, "question_id": "Fi3SgWhZUmTEgy9FKrWig2", "question": "What season are the ladies here celebrating?", "choices": ["earth day", "easter", "halloween", "boxing day"], "correct_choice_idx": 2, "direct_answers": ["halloween", "summer", "holidays", "fall", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween"], "difficult_direct_answer": false, "rationales": ["The season is halloween.", "They are both wearing costumes.", "The one woman has a costume."], "image": "train2014/COCO_train2014_000000136200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128072, "question_id": "FiSQaBeZnE2ohNDnYC64Jr", "question": "What is the seafood called that's in this dish?", "choices": ["lobster", "fish filet", "shrimp", "crab"], "correct_choice_idx": 2, "direct_answers": ["shrimp", "eel", "crab", "shrimp", "shrimp", "fish", "shrimp", "shrimp", "shrimp", "shrimp"], "difficult_direct_answer": false, "rationales": ["This dish has a serving of shrimp.", "The seafood is small and curved", "The seafood is shrimp."], "image": "train2014/COCO_train2014_000000128072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361139, "question_id": "FigQH8PwUtrmc6h4D9TtZ9", "question": "Why are the men shaking hands?", "choices": ["baseball rule", "being friendly", "sportsmanship", "distraction"], "correct_choice_idx": 2, "direct_answers": ["sportsmanship", "camaraderie", "congratulating", "sportsmanship", "sportsmanship", "sportsmanship", "sportsmanship", "sportsmanship", "camaraderie", "sportsmanship"], "difficult_direct_answer": false, "rationales": ["They are being good sports.", "They seems to be getting along and being friendly.", "Though any of these options could be correct, most likely they are showing there agreement on a good game played."], "image": "train2014/COCO_train2014_000000361139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362879, "question_id": "Fj2d9BnB2sJiJ42gDHvZr9", "question": "What are the two blond haired people doing?", "choices": ["arguing", "eating", "playing guitar", "cleaning room"], "correct_choice_idx": 2, "direct_answers": ["playing game", "playing guitar", "playing game", "playing game", "playing guitar", "playing game", "playing guitar", "playing instruments", "playing", "playing game"], "difficult_direct_answer": false, "rationales": ["The people are playing guitar.", "Two people with blond hair are holding guitars.", "They are both holding the instruments in their hands."], "image": "train2014/COCO_train2014_000000362879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531844, "question_id": "FjAosz2EDCmJtkrCZy6G2p", "question": "What type of sign is on the building?", "choices": ["informational", "directional", "warning", "brand"], "correct_choice_idx": 0, "direct_answers": ["advertisement", "sales", "sale signs", "sales", "sale", "informational", "sale sign", "advertising", "sale", "sale"], "difficult_direct_answer": false, "rationales": ["The sign mentions sale prices and other information.", "The sign tells about a sale on items.", "The sign is indicating the prices."], "image": "train2014/COCO_train2014_000000531844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431874, "question_id": "FjB7eNu2CyUtq9LM5rC6Xv", "question": "What is on top of the bun?", "choices": ["seeds", "cheese", "ketchup", "nachos"], "correct_choice_idx": 0, "direct_answers": ["sesame seeds", "sesame seeds", "sesame seeds", "seeds", "sesame seeds", "sesame seeds", "sesame seeds", "sesame", "sesame seeds", "sesame"], "difficult_direct_answer": false, "rationales": ["These are sesame seeds and these are found on many different styles of buns.", "Sesame seeds are on top of the bun.", "These are the sesame variety used on fancy buns"], "image": "train2014/COCO_train2014_000000431874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176736, "question_id": "FjDYoqZDUSSGMhwwjW3RPW", "question": "What are the people on the blue boards doing?", "choices": ["sleeping", "eating", "practicing", "tanning"], "correct_choice_idx": 2, "direct_answers": ["practicing", "laying", "standing", "learning surfing", "learning surfing", "practicing surfing", "practicing learning", "practicing surfing", "practicing", "teaching"], "difficult_direct_answer": false, "rationales": ["People are all on the blue boards practicing to surf.", "Due to the fact that the people aren't in the water you can safely tell what they are doing.", "They are learning and rehearsing how to surf so they know what to do when they go in the water."], "image": "train2014/COCO_train2014_000000176736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151594, "question_id": "FjHwNtUpKKFbmZSrx6xGQF", "question": "The Red white and blue circular emblem on the visible billboard here advertises for what company?", "choices": ["keds", "pepsi", "coke", "bank america"], "correct_choice_idx": 1, "direct_answers": ["pepsi", "america", "pepsi", "pepsi", "america", "bank", "pepsi", "pepsi", "pepsi", "pepsi"], "difficult_direct_answer": false, "rationales": ["The pepsi is the ad.", "The swirl red, white and blue logo belongs to this beverage company.", "The soft drink's logo is very easily recognized in this picture."], "image": "train2014/COCO_train2014_000000151594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468457, "question_id": "FjTxRgVUMGDmoUgp2kqQFF", "question": "What style of pizza is being eaten?", "choices": ["think crust", "deep dish", "pretzel crust", "french bread"], "correct_choice_idx": 0, "direct_answers": ["supreme", "flat bread", "breakfast", "round", "breakfast pizza", "thin crust", "brazilian", "round", "think crust", "thin crust"], "difficult_direct_answer": false, "rationales": ["Remove the accidental k from the answer, and a is the only option that makese sense.", "One can see the edge of the crust and that it is not very thick.", "With this pizza cut in half, we get to take a good look at the crust and its relative size. this pizza would qualify as thick crust."], "image": "val2014/COCO_val2014_000000468457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89150, "question_id": "FjksvJE5gmNAXUDPyuKFMv", "question": "What activity is this group preparing for?", "choices": ["running", "whale watching", "sailing", "hiking"], "correct_choice_idx": 3, "direct_answers": ["birdwatching", "hiking", "hiking", "hiking", "hiking", "hike", "hiking", "hike", "hiking", "hiking"], "difficult_direct_answer": false, "rationales": ["The activity is hiking.", "The group has hiking poles.", "The group wants to go out for a hike in the sun since they have hats and hiking boots on."], "image": "val2014/COCO_val2014_000000089150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440032, "question_id": "FjqAL9pVsJWJCY972F9exd", "question": "What type of passenger service is available on this street?", "choices": ["tram", "subway", "ferry", "bus"], "correct_choice_idx": 0, "direct_answers": ["bus", "bus", "bus", "tram", "bus", "bus", "delivery", "cycling", "bus", "train"], "difficult_direct_answer": false, "rationales": ["A tram is about to turn the corner.", "The passenger is for the tram.", "A train is on the far right. it is above ground, so it is not a subway train."], "image": "train2014/COCO_train2014_000000440032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91917, "question_id": "FjxEvZE2bAYHPoJby7Ptyb", "question": "Which mode of transportation shown here is most economical?", "choices": ["truck", "semi", "car", "motor cycle"], "correct_choice_idx": 3, "direct_answers": ["motorbike", "bike", "motorcycle", "motorcycle", "motorcycle", "motorbike", "moped", "bike", "motor cycle", "e-bike"], "difficult_direct_answer": false, "rationales": ["A motorcycle is small and doesn't use much gas.", "The mode is by motorbike.", "Motorcycles and cars are driving on a street. motorcycles are much smaller than cars."], "image": "train2014/COCO_train2014_000000091917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46580, "question_id": "Fk8JepqhtFTyVigJudKJ4d", "question": "What process is used to make that cap?", "choices": ["weaving", "knitting", "crocheting", "sewing"], "correct_choice_idx": 1, "direct_answers": ["knitting", "knitting", "reading", "knitting", "knit", "knitting", "knitting", "beanie template", "knitting", "knitting"], "difficult_direct_answer": false, "rationales": ["The material used for the hat looks like wool based on the texture and there appears to be patterns made from string which is made by knitting.", "The hat pattern is unique to knitting", "The cap is made from wool and it can be stitched together with knitting needles."], "image": "train2014/COCO_train2014_000000046580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383930, "question_id": "FkAfbKNKrbkC2PqLsAVuPQ", "question": "What does the man have on?", "choices": ["bow tie", "hat", "scarf", "suspenders"], "correct_choice_idx": 1, "direct_answers": ["boots", "boots", "riding boots", "hat", "hat", "helmet", "jacket", "boots", "jacket", "boots helmet"], "difficult_direct_answer": false, "rationales": ["The man has a hat on his head.", "The man has a hat.", "The man is wearing a black cap."], "image": "train2014/COCO_train2014_000000383930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73731, "question_id": "FkRU5NhAUvEg2NdVWZ2aiP", "question": "What group of people is the design on the bag associated with?", "choices": ["pirates", "lawyers", "teachers", "doctors"], "correct_choice_idx": 0, "direct_answers": ["pirates", "pirates", "pirates", "goth people", "pirates", "pirates", "pirates", "bell's angels", "pirates", "pirate"], "difficult_direct_answer": false, "rationales": ["The people on the bag are associated with pirates since the graphic shows a skull.", "The skull with swords are associated with pirates.", "The picture of the skull on the bag is part of the logo for them."], "image": "val2014/COCO_val2014_000000073731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76956, "question_id": "FkSHpmyLJaCkqcTtSoThas", "question": "Based on the photo which base is safe from being stolen?", "choices": ["first", "home", "third", "second"], "correct_choice_idx": 1, "direct_answers": ["ball", "third", "third", "third", "third base", "home", "third", "home base", "home plate", "third"], "difficult_direct_answer": false, "rationales": ["The pitcher is throwing towards the hitter. the hitter stands in a box at the base before first base.", "There is no player on third base. the next base is safe from being stolen.", "The pitcher is throwing the ball towards the batter. a player would be foolish to try to steal the base that the ball is being thrown to."], "image": "val2014/COCO_val2014_000000076956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491943, "question_id": "Fkz2yhKYYNTAFqdpuMVzXK", "question": "What might this food attract in this location?", "choices": ["flies", "cheetah", "crocodile", "snakes"], "correct_choice_idx": 0, "direct_answers": ["flies", "bees", "bears", "bugs", "bees", "bears", "flies", "bees", "flies", "bugs"], "difficult_direct_answer": false, "rationales": ["Flies get attracted to food.", "Flies can land on the meat.", "The food is far away from the water to only attract flies outside."], "image": "train2014/COCO_train2014_000000491943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358060, "question_id": "Fm3kvbqSjkx3LEfxtuuCG2", "question": "What type of animal is present on the grass?", "choices": ["dogs", "cats", "sheep", "cows"], "correct_choice_idx": 3, "direct_answers": ["cow", "cow", "calf", "cows", "cow", "cow", "calf", "cow", "bull", "cow"], "difficult_direct_answer": false, "rationales": ["There are some fluffy white cows on the grass.", "A cow is in the grass.", "A cow is standing in a grassy area with another in the distance."], "image": "train2014/COCO_train2014_000000358060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548167, "question_id": "Fm8fwsoirKE758Mb7oNCRb", "question": "What skill level does the young skier exhibit here?", "choices": ["intermediate", "beginner", "pro", "olympic"], "correct_choice_idx": 1, "direct_answers": ["novice", "novice", "beginner", "beginner", "beginner", "novice", "novice", "beginner", "beginner", "beginner"], "difficult_direct_answer": false, "rationales": ["The skier is too young to be at an intermediate, pro, or olympic level.", "The skill level is beginner.", "Based on the child's age it is unlikely they have enough experience to be at an advanced skill level. the way they are holding their poles and their stance looks like they are inexperienced and just beginning."], "image": "train2014/COCO_train2014_000000548167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102420, "question_id": "FmSbExbVTxvYsoogsqX8sf", "question": "What kind or area is being shown?", "choices": ["residential", "private", "commercial", "rural"], "correct_choice_idx": 2, "direct_answers": ["city", "shopping", "commercial", "street", "downtown", "street", "city", "street", "urban", "downtown"], "difficult_direct_answer": false, "rationales": ["There is a paved street and sidewalk with a line of commercial buildings judging by the storefront and the signs over the door. these elements are consistent with answer a.", "A commercial downtown area with shops is shown.", "This is a commercial area as evident by the storefronts adjacent to the sidewalk."], "image": "train2014/COCO_train2014_000000102420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351287, "question_id": "Fmb3nZuqhd6U3wkerGFsi8", "question": "Where did this dog breed originate from?", "choices": ["norway", "scotland", "ireland", "denmark"], "correct_choice_idx": 1, "direct_answers": ["america", "united states", "newfoundland", "england", "scotland", "canada", "england", "uk", "wolves", "united kingdom"], "difficult_direct_answer": true, "rationales": ["The dog was bred for the cold and hunting.", "The dog comes from scotland since it looks built for cold temperatures.", "Golden retrievers originally came from scotland."], "image": "val2014/COCO_val2014_000000351287.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151563, "question_id": "Fmf7tFP2cgx8WHSMqgUZqr", "question": "Why did he put flour on the counter?", "choices": ["he's messy", "was accident", "snacking", "prevent sticking"], "correct_choice_idx": 3, "direct_answers": ["prevent sticking", "prevent sticking", "pizza dough", "prevent sticking", "stick proofing", "kneading", "help cook", "make pizza", "prevent sticking", "to pick"], "difficult_direct_answer": false, "rationales": ["During the cooking of pizza, flour is commonly used to ensure that dish will not stick to pan.", "So when he is making the dough it does not get stuck on the counter.", "It keeps the dough from sticking to the counter."], "image": "train2014/COCO_train2014_000000151563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310202, "question_id": "FmjDddwjD8H8FcCjihRcyB", "question": "What baked good might be the only use for the leftmost bananas?", "choices": ["rye bread", "banana bread", "pumpkin bread", "white bread"], "correct_choice_idx": 1, "direct_answers": ["banana bread", "bread", "banana bread", "bread", "banana bread", "banana bread", "banana bread", "banana bread", "eating purpose", "bread"], "difficult_direct_answer": false, "rationales": ["Pumpkin, white, and rye are not made with overripened yellow fruits.", "The bananas on the left are overripe and would be good to use for banana bread.", "The item shown is a banana which is used as an ingredient in baking option a and not the other options."], "image": "val2014/COCO_val2014_000000310202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262505, "question_id": "FnRhB4qYLYc57Ebx8fHv5f", "question": "Where could the Limo seen here most readily top off their tank?", "choices": ["aamoco", "mile away", "shell", "bowling alley"], "correct_choice_idx": 2, "direct_answers": ["shell", "shell", "shell", "shell station", "shell", "shell station", "shell station", "shell", "shell", "shell station"], "difficult_direct_answer": false, "rationales": ["The shell station could top the gas tanks.", "The shell gas station has gas available.", "The car is driving straight through the intersection and could easily turn right into the shell, which is a well known gas station."], "image": "val2014/COCO_val2014_000000262505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41155, "question_id": "FnV5ZoMmBKpQ58H2DVewWr", "question": "What video game controller does the girl have in her hand?", "choices": ["xbox one", "nintendo wii", "sega genesis", "nintendo 64"], "correct_choice_idx": 1, "direct_answers": ["nintendo wii", "wii", "wii", "wii", "wii", "wii", "nintendo wii", "nintendo wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["The girl is holding a wii-mote.", "She is holding a wii game controller in her hand.", "The wii's controller is white and it has a rectangle shape."], "image": "train2014/COCO_train2014_000000041155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439991, "question_id": "FnqGLouag7Xbx4ec3kZQUK", "question": "What is the man with mustache about to do?", "choices": ["board bus", "sightseeing", "block people", "get off"], "correct_choice_idx": 3, "direct_answers": ["leave", "get off", "stop bus", "board", "get off", "deboard", "disembark", "board bus", "depart", "get off"], "difficult_direct_answer": false, "rationales": ["The man is about to get off the bus.", "He is leaning out the door as the bus is about to stop", "The man is in the doorway of the bus and he is leaning out."], "image": "train2014/COCO_train2014_000000439991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491879, "question_id": "FoJ7d62aWSTjjPsh4UyCR6", "question": "What is the man probably applying on the surf?", "choices": ["soap", "powder", "wax", "salt"], "correct_choice_idx": 2, "direct_answers": ["wax", "fiberglass", "wax", "green", "wax", "wax", "wax", "wax", "wax", "wax"], "difficult_direct_answer": false, "rationales": ["A man is rubbing something on a surfboard. wax is put on surfboards.", "Surfboards use the material in option a to make them go faster in the water.", "This coating is applied by wiping it on the board as it helps protect the board and reduce drag."], "image": "train2014/COCO_train2014_000000491879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526441, "question_id": "FoSHNM8ujjsiSbBTvhRtyD", "question": "Seeing dark clouds in the sky will remind you to bring what accessory that would be helpful if it starts to rain?", "choices": ["necklace", "eyeglasses", "umbrella", "watch"], "correct_choice_idx": 2, "direct_answers": ["umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["It looks cloudy and dark like it may rain.", "The umbrella is needed.", "Umbrellas are used to block rain during bad weather. dark clouds can indicate imminent storms."], "image": "train2014/COCO_train2014_000000526441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59367, "question_id": "FoSbVmdDEo5KU6gimryqCr", "question": "Where would you most likely see those long orange things?", "choices": ["classroom", "doctor's office", "pool", "cafe"], "correct_choice_idx": 2, "direct_answers": ["swimming pool", "pool", "swimming pool", "pool", "pool", "pool", "pool", "swimming pool", "pool", "pool"], "difficult_direct_answer": false, "rationales": ["The pool is where the orange things would be.", "These items are used to help keep you afloat.", "There are foam noodles that can be used for flotation."], "image": "train2014/COCO_train2014_000000059367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190680, "question_id": "FoYgfbJaNZdmTdHZetyQN8", "question": "Which way does one go to get to the airport?", "choices": ["turn around", "straight", "turn left", "turn right"], "correct_choice_idx": 1, "direct_answers": ["straight", "straight", "straight", "left", "straight", "left", "straight", "straight", "straight", "flight"], "difficult_direct_answer": false, "rationales": ["The sign on the left indicates where the aerodrom is located.", "The arrows on sign are facing upward/directly ahead.", "The car can go straight."], "image": "val2014/COCO_val2014_000000190680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257109, "question_id": "FogUuTuzMKc8zoJLpjGCks", "question": "What city are these buses in?", "choices": ["barcelona", "london", "new york", "paris"], "correct_choice_idx": 1, "direct_answers": ["london", "london", "london", "oxford", "oxford", "london", "oxford", "london", "london", "london"], "difficult_direct_answer": false, "rationales": ["The city is london.", "The bus is going to oxford circus which is in england.", "Oxford circus, marble arch, and edgware road are all locations listed on the front of the bus and call all be found in the capital city of england."], "image": "train2014/COCO_train2014_000000257109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263453, "question_id": "FoxJ6AGC8cEFQVw3bSvmgx", "question": "What is happening here?", "choices": ["umbrella sale", "going home", "surprise rain", "u2 concert"], "correct_choice_idx": 3, "direct_answers": ["rain", "raining", "raining", "rain", "raining", "rain", "raining", "u2 concert", "raining", "raining"], "difficult_direct_answer": false, "rationales": ["People are in an audience hold flags with the band name u2 on them. u2 gives concerts.", "A concert is going on and you can see their name on the umbrella.", "This is a u2 concert, as the umbrella says u2."], "image": "train2014/COCO_train2014_000000263453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401061, "question_id": "FoxwTRe6ySRPw8j8EKNCbz", "question": "What is the colour of their vehicle?", "choices": ["orange", "yellow", "blue", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color can be seen in the rearview mirror.", "The color in a matches the color seen in the mirror.", "The color of the vehicle is reflected in the side-view mirror."], "image": "val2014/COCO_val2014_000000401061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5860, "question_id": "FpPSaUGoaH8UqhqRbSm92G", "question": "What keeps the temperature inside the glass here?", "choices": ["nothing", "dry towel", "ice", "warmer"], "correct_choice_idx": 2, "direct_answers": ["ice", "ice", "ice", "ice", "ice", "ice", "ice", "ice", "ice", "ice"], "difficult_direct_answer": false, "rationales": ["Majority of people use ice to keep their drinks at a cold temperature inside of a cup.", "There is frozen water inside of the glass.", "There are clear cube like things in the soda. it keeps the liquid cold."], "image": "val2014/COCO_val2014_000000005860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388902, "question_id": "FpbEk7nP6sy3erUwdDFL58", "question": "What kind of birds are on the ledge?", "choices": ["pigeon", "bluejay", "wood pecker", "cardinal"], "correct_choice_idx": 3, "direct_answers": ["robin", "cardinal", "cardinal", "cardinals", "cardinal", "cardinals", "oriole", "cardinal", "cardinal", "cardinals"], "difficult_direct_answer": false, "rationales": ["These red birds are called cardinals.", "Cardinal birds are small and red.", "The distinct red colouring indicates that these are cardinals."], "image": "val2014/COCO_val2014_000000388902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182005, "question_id": "Fpc3nh8TCLDoycam3nSY2a", "question": "The flag of which country is flying on the boat?", "choices": ["austria", "canada", "lebanon", "peru"], "correct_choice_idx": 1, "direct_answers": ["canadian", "canada", "canada", "canada", "canada", "japan", "canada", "canada", "canada", "usa"], "difficult_direct_answer": false, "rationales": ["The canadian flag is attached to the back of the boat.", "You can tell by red maple leaf on the flag as to what country it belongs.", "There is a white and red flag with a big leaf in middle."], "image": "train2014/COCO_train2014_000000182005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85941, "question_id": "Fpjht546beNWwkyg7dsnLp", "question": "What sail position utilized here minimizes boats damage during winds?", "choices": ["doubly high", "half up", "down", "fully furled"], "correct_choice_idx": 2, "direct_answers": ["folded", "down", "down", "dock", "down", "down", "no sail", "sails furled", "no sail", "sailess"], "difficult_direct_answer": false, "rationales": ["The position is downward.", "A sailboat is at a dock. sailboat sails are put down during bad weather.", "Wind would make the boats move even more and could turn over a boat."], "image": "train2014/COCO_train2014_000000085941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79720, "question_id": "FpwQoM4aeyC9NzXx4hxVjp", "question": "What is her weight range?", "choices": ["200-300lbs", "100-200lbs", "300-400lbs", "500-600lbs"], "correct_choice_idx": 1, "direct_answers": ["100-105 lbs", "100-200lbs", "healthy", "hundred", "115-120", "one hundred", "60-65", "130ish", "small", "55"], "difficult_direct_answer": true, "rationales": ["She is not overweight.", "She's a skinny lady. she can't be more than 200 pounds.", "She's very tiny."], "image": "train2014/COCO_train2014_000000079720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112065, "question_id": "Fq7FWornhugFNhmztkxziw", "question": "The person in the pink hat pretends they have what type ball?", "choices": ["tennis", "bowling", "baseball", "basketball"], "correct_choice_idx": 1, "direct_answers": ["bowling", "bowling", "bowling", "bowling", "bowling", "bowling", "bowling", "bowling", "bowling", "bowling"], "difficult_direct_answer": false, "rationales": ["The person is bowling.", "The person is playing wii. there's a bowling lane on the screen.", "The person is holding a game controller and is standing in front of a screen showing a bowling alley."], "image": "val2014/COCO_val2014_000000112065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203539, "question_id": "Fq8V5MmhZ2bxnygtXwkFyn", "question": "Why do they all have blood on them?", "choices": ["were attacked", "mass murder", "for film", "killed pig"], "correct_choice_idx": 2, "direct_answers": ["zombies", "halloween", "for film", "decoration", "costumes", "halloween", "costumes", "extras movie", "costume party", "costumes"], "difficult_direct_answer": false, "rationales": ["These people are wearing film make up.", "The blood is for the film.", "They look to be characters in a movie or tv show."], "image": "val2014/COCO_val2014_000000203539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469085, "question_id": "FqCKTPSvoyFrFgp4SRKEHw", "question": "If your teeth are dirty which thing here would be most useful?", "choices": ["hat", "fingers", "mouth wash", "toothbrush"], "correct_choice_idx": 3, "direct_answers": ["toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush"], "difficult_direct_answer": false, "rationales": ["Toothbrushes brush teeth.", "Tooth brush is used to clean the teeth.", "There is no mouth wash. hats or fingers would not be useful for cleaning teeth."], "image": "val2014/COCO_val2014_000000469085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95670, "question_id": "FqaCVSXko8ePGRwy9r4QS4", "question": "How many persons paragliding?", "choices": ["seven", "four", "three", "one"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "plenty", "multiple", "many", "many", "few dozen", "three", "twenty six", "26"], "difficult_direct_answer": false, "rationales": ["There are three people paragliding.", "Two people are on the water. an additional person is midair above these two.", "There is one person in the air and two people in the water."], "image": "val2014/COCO_val2014_000000095670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338439, "question_id": "FqfqbyiK6qPA6utCWZUpst", "question": "Why is she holding the racquet with both hands?", "choices": ["confused", "unsteady", "is anbry", "hit ball"], "correct_choice_idx": 3, "direct_answers": ["strength", "swing", "backhand", "power hit", "maximum power", "grip", "hit harder", "hit ball", "control", "hitting ball"], "difficult_direct_answer": true, "rationales": ["She is holding the racket with both hands in order to hit the ball.", "Due to the type of sport being played and the ball coming to him you can tell what he is about to do.", "A person is approaching a tennis ball with a racket. rackets are used to hit tennis balls."], "image": "train2014/COCO_train2014_000000338439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580057, "question_id": "Fqt78DYWK5SZwqmbhzaBdN", "question": "Why are the people looking up?", "choices": ["hear noise", "seeing bird", "for photo", "shaking heads"], "correct_choice_idx": 2, "direct_answers": ["posing", "camera", "posing", "person", "camera", "camera", "for photographer", "camera", "for photo", "photographer"], "difficult_direct_answer": false, "rationales": ["They are pointing to the camera", "The people are looking towards the camera lens.", "People are staring up at someone with either a security camera or regular camera."], "image": "train2014/COCO_train2014_000000580057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578523, "question_id": "Fr84z4fGxg3TVaS8k9CtAS", "question": "The discarded wax candles present at the base of the cakes are the result of what event?", "choices": ["new year's", "birthday celebration", "power outage", "mood setting"], "correct_choice_idx": 1, "direct_answers": ["birthday celebration", "blowing candles", "blowing out", "birthday", "birthday", "melting", "cake cutting", "birthday party", "birthday party", "birthday"], "difficult_direct_answer": false, "rationales": ["Candles are often placed on a cake when celebrating the date of your birth. the candles are lit, a wish is made, a song is sung, and the candles are blown out and discarded.", "There are cakes near the candles. the people are smiling.", "People traditionally put candles on a cake to celebrate and then they are blown out by the person who is turning one year older."], "image": "train2014/COCO_train2014_000000578523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567403, "question_id": "FrNMzje7kZCJ7dhvTbj8Py", "question": "What is wrong with this image?", "choices": ["blurry", "too close", "too far", "broken"], "correct_choice_idx": 0, "direct_answers": ["blurry", "blurry", "blurry", "blurry", "blurry", "its blurry", "too blurry", "blurry", "blurry", "blurry"], "difficult_direct_answer": false, "rationales": ["The image is blurry and makes for a bad photo.", "You cannot clearly make out the cows.", "The image is blurry."], "image": "train2014/COCO_train2014_000000567403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523473, "question_id": "FrVJ8tKUMdcbnLeEaCo8m4", "question": "Why are some trees here leafless?", "choices": ["they're dead", "spring", "sap suckers", "summer"], "correct_choice_idx": 0, "direct_answers": ["pines", "they're dead", "autumn season", "fall", "getting old", "winter", "fall time", "they're dead", "disease", "coniferous"], "difficult_direct_answer": true, "rationales": ["The trees are dying.", "The trees are browning because they're dying.", "Trees are dead."], "image": "train2014/COCO_train2014_000000523473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508722, "question_id": "FrXUDepqaAftgtw458pMTf", "question": "What is the important part of this sport?", "choices": ["looks", "fun", "security", "fame"], "correct_choice_idx": 1, "direct_answers": ["race", "going downhill", "going downhill", "fun", "going downhill", "snow", "high speed", "snow", "adventure", "having snow"], "difficult_direct_answer": false, "rationales": ["The sport gives a sense of adrenaline and increases endorphins, and would not be so popular if it was not fun.", "The sport is played for enjoyment.", "People are skiing. people participate in sports for fun."], "image": "train2014/COCO_train2014_000000508722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574009, "question_id": "Frrdf3YR4dY5VZAUCbmsU3", "question": "Why is the man wearing a vest with a number on it?", "choices": ["dress code", "fashion", "participating event", "for halloween"], "correct_choice_idx": 2, "direct_answers": ["horse racing", "competition", "37", "contestant", "identification", "participating event", "training", "competition", "in race", "competition"], "difficult_direct_answer": false, "rationales": ["A man has a vest because he is competing with other people on the beach.", "This indicates his number in a contest", "He and his horse both have the same number on them."], "image": "train2014/COCO_train2014_000000574009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238828, "question_id": "FsA2RPQNwbsssoudyrtfiU", "question": "The person reading is likely what kind of person?", "choices": ["married", "agoraphobic", "bachelor", "toddler"], "correct_choice_idx": 0, "direct_answers": ["bookworm", "married", "smart", "bookworm", "intellectual", "reader", "relaxed", "tourist", "reader", "educated"], "difficult_direct_answer": false, "rationales": ["The person is married.", "He has a wedding ring on his finger.", "He's wearing the ring on his left ring finger."], "image": "train2014/COCO_train2014_000000238828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251251, "question_id": "Fsi2tg8pdGMNFEvuTaCFKz", "question": "What is he holding over his head?", "choices": ["skateboard", "skiis", "skates", "snowboard"], "correct_choice_idx": 3, "direct_answers": ["snowboard", "snowboard", "snowboard", "snowboard", "surfboard", "snowboard", "snow board", "surfboard", "snow board", "snowboard"], "difficult_direct_answer": false, "rationales": ["This single, wheel-less piece of equipment is appropriate for the snow", "It is a long board, and a hill of snow can be seen in the background, indicating that he will use it on the hill.", "The guy is proudly holding up his board for riding on snow."], "image": "train2014/COCO_train2014_000000251251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135336, "question_id": "FspYZUTh5mDckFLbpjSf5m", "question": "What is the man in pink shorts holding a black stick doing?", "choices": ["swimming", "metal detecting", "sailing", "running"], "correct_choice_idx": 1, "direct_answers": ["metal detecting", "metal detecting", "hunting treasure", "metal detecting", "walking", "mining", "metal detecting", "detecting metal", "metal detecting", "walking"], "difficult_direct_answer": false, "rationales": ["This is a metal detector and he's looking for missing items in the sand.", "The man in pink shorts is holding a metal detector in his left hand.", "The coiled wire and disk shaped bottom of this black thin device identify it as a metal detector."], "image": "train2014/COCO_train2014_000000135336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104676, "question_id": "FtPNZsQGsugWydfmvfRNad", "question": "A fleece is a kind of hair getting from which mammal?", "choices": ["bear", "goat", "sheep", "deer"], "correct_choice_idx": 2, "direct_answers": ["sheep", "sheep", "sheep", "sheep", "sheep", "sheep", "goats", "goats", "sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["Shorn sheep wool is the source of fleece.", "Traditionally fleece is manufactured from sheep fur.", "That's the only animal we get fleece from."], "image": "val2014/COCO_val2014_000000104676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148077, "question_id": "FtjaW6fTRBGFTCqycd2FNb", "question": "The bottom pieces are made to land on what surface?", "choices": ["water", "snow", "tarmac", "grass"], "correct_choice_idx": 0, "direct_answers": ["water", "ground", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["It is a seaplane so instead of wheels it has floats.", "The plane does not have the traditional wheels, but has pontoons instead. these are used for water craft.", "The bottom part of the plane is made to land on water."], "image": "val2014/COCO_val2014_000000148077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273378, "question_id": "Fu7aXnctRA9N3vX6Wh7Qmk", "question": "What might the man do on the banana peel?", "choices": ["spin", "slip", "jump", "dance"], "correct_choice_idx": 1, "direct_answers": ["slip", "slip", "slip", "slip", "slipping", "slip", "slip", "slip", "slipping", "slip"], "difficult_direct_answer": false, "rationales": ["A man's foot is on top of a banana peel as he walks in the snow.", "Bananas are slippery. if you step on one you might fall.", "The man can slip."], "image": "train2014/COCO_train2014_000000273378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551418, "question_id": "FuEdP3WmeXQx7zAsDB6v9u", "question": "What Sort of union are these people members of?", "choices": ["boating", "airline", "farm", "concession workers"], "correct_choice_idx": 1, "direct_answers": ["airline", "work", "airline union", "aviation", "airline pilots", "airline union", "alpha", "aviation", "teamsters", "teams"], "difficult_direct_answer": false, "rationales": ["They are wearing pilots uniforms and are referencing airlines on their signs so they are most likely to be in airlines. their signs also reference management interference which is a concern of a union.", "The signs say pilot", "The people are protesting an airline."], "image": "train2014/COCO_train2014_000000551418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357943, "question_id": "FuGm2xDKBuD28ef3Kfo832", "question": "What animals activity is being judged here?", "choices": ["bird", "dog", "man", "sheep"], "correct_choice_idx": 1, "direct_answers": ["dog", "sheepdog", "sheep herding", "herding", "running", "corralling", "shepherding", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The dog is rounding up the sheep.", "This is a competition to see how well the animal will herd the sheep.", "The dog is being judged."], "image": "val2014/COCO_val2014_000000357943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414289, "question_id": "FuMCUkZvxwFB6X7uDujiaP", "question": "What pick me up is found in this woman's cup?", "choices": ["soy milk", "foam", "caffeine", "milk"], "correct_choice_idx": 2, "direct_answers": ["coffee", "coffee", "coffee", "coffee", "caffeine", "coffee", "coffee", "coffee", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["The cup is indicative of coffee, which has a natural stimulant.", "The thermal wrap on the cup indicates the beverage is hot", "The size and style of the cup is most associated with coffee or tea. these products would both include caffeine in most cases."], "image": "val2014/COCO_val2014_000000414289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233453, "question_id": "FuS4HjrBiWqZbPd9CDpFTb", "question": "What is the purpose of this signage?", "choices": ["multiple stoppages", "destruction", "lightning rods", "art display"], "correct_choice_idx": 3, "direct_answers": ["stopping", "decor", "stop people", "art display", "art", "to stop", "stop", "stop driving", "to wait", "stop drivers"], "difficult_direct_answer": true, "rationales": ["This many traffic signs would not naturally be placed thus close together, so an art display is is reasonable assumption.", "It is not in the middle of a road, and too many stop signs together would create chaos, so it is assumed this is for art.", "Having them clustered together means it is not for functional use but some artistic purpose."], "image": "train2014/COCO_train2014_000000233453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428305, "question_id": "Fus2co3jNS9K3DENYtZU7W", "question": "What do the little bottles on the lower counter contain?", "choices": ["perfumes", "moisturizers", "drinks", "bath soaps"], "correct_choice_idx": 3, "direct_answers": ["soaps", "soaps", "juice", "bath soaps", "soap", "shampoo", "candles", "soap", "bathing products", "condiments"], "difficult_direct_answer": false, "rationales": ["The little bottles on the lower counter have liquids in them near the bath.", "These are sample size toiletries", "Small bottles are on the side of a tub. people keep soap near the bathtub."], "image": "val2014/COCO_val2014_000000428305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91520, "question_id": "FvFuwJXGwyHhLwxizhZqYb", "question": "What are the cars doing in the enclosed animal area?", "choices": ["hunting", "touring", "racing", "capturing"], "correct_choice_idx": 1, "direct_answers": ["observing", "parked", "tour", "driving", "slowing down", "touring", "tourists", "hills", "driving", "sight seeing"], "difficult_direct_answer": true, "rationales": ["Cars are driving through a fenced in area with giraffes walking about.", "A car is driving through an area with giraffes. people go on driving tours through nature preserves.", "The cars are touring."], "image": "val2014/COCO_val2014_000000091520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140990, "question_id": "FvWzkPVvGncgUeyDjvFLRW", "question": "What is 13 about to do?", "choices": ["sit", "run", "break", "quit"], "correct_choice_idx": 1, "direct_answers": ["run", "run", "hit ball", "run", "run", "hit ball", "run", "run", "run", "hit ball"], "difficult_direct_answer": false, "rationales": ["He just hit the ball so he will throw his bat down and run to as many bases as he can.", "13 was just at bat and hit the ball.", "You can tell by his body language that he has hit the ball and is ready to run."], "image": "train2014/COCO_train2014_000000140990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524470, "question_id": "FvdDczUQQ9LgXqR3kdjooX", "question": "What is the person in the street using to get around?", "choices": ["scooter", "skateboard", "car", "bike"], "correct_choice_idx": 3, "direct_answers": ["bicycle", "bike", "bicycle", "bike", "bicycle", "bike", "bike", "bike", "bike", "bicycle"], "difficult_direct_answer": false, "rationales": ["It has a seat, handlebars and two wheels.", "The person is on a two-wheeled vehicle. the item in option a fits the description.", "Looking at the wheels tells us the answer."], "image": "train2014/COCO_train2014_000000524470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474499, "question_id": "FvenzQsukBcYWxt653MUrv", "question": "What material is the sink made of?", "choices": ["wood", "plastic", "stainless steel", "porcelain"], "correct_choice_idx": 2, "direct_answers": ["stainless steel", "metal", "granite", "steel", "metal", "wood", "stainless steel", "steel", "steel", "iron"], "difficult_direct_answer": false, "rationales": ["It is a shiny silver metal.", "The sink is made out of a shiny metal.", "A silver sink is in a kitchen. sinks are made of stainless steel."], "image": "val2014/COCO_val2014_000000474499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267289, "question_id": "Fvn9VbtgzcoRHrjFPGfgce", "question": "What kind of camera does the sponsor hope was used to take this photo?", "choices": ["canon", "kodak", "nikon", "polaroid"], "correct_choice_idx": 2, "direct_answers": ["nikon", "nikon", "nikon", "nikon", "dslr", "nikon", "nikon", "clear camera", "clear camera", "nikon"], "difficult_direct_answer": false, "rationales": ["The sponsor's name is on the banner that's visible.", "There is a nikon photo in the background.", "The corporation has a sponsorship sign on the field and are known for making cameras."], "image": "train2014/COCO_train2014_000000267289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198557, "question_id": "FvrdSEwh7p5AYWF4QkeBJd", "question": "This transit tunnel referred as what?", "choices": ["subway", "train station", "marine way", "metro tunnel"], "correct_choice_idx": 3, "direct_answers": ["station", "terminal", "downtown", "downtown tunnel", "metro tunnel", "railway", "downtown transit", "bus station", "downtown", "downtown"], "difficult_direct_answer": false, "rationales": ["A subway tunnel is shown. subways are sometimes referred to as metro.", "This underground transportation hub is downtown.", "The tunnel is the metro."], "image": "train2014/COCO_train2014_000000198557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112377, "question_id": "FwG8JSjFNrt96b8YqQQW8i", "question": "Why is the bus near the curb?", "choices": ["getting cleaned", "getting passengers", "refueling", "changing tires"], "correct_choice_idx": 1, "direct_answers": ["passenger deboarding", "dropping passengers", "stop preparation", "unloading", "parked", "pickup people", "access passengers", "unloading", "transportation", "getting passengers"], "difficult_direct_answer": true, "rationales": ["The passengers stand near the curb so the bus driver can see them.", "Busses stop to pickup and drop off passengers at stops", "A bus picks up passengers from the curb."], "image": "train2014/COCO_train2014_000000112377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443868, "question_id": "FwN5NRJKbW2KKLG8AAxNA6", "question": "Why is the bus parked near the curb?", "choices": ["for passengers", "for safety", "to race", "for display"], "correct_choice_idx": 0, "direct_answers": ["pick passanger", "boarders", "people", "allow boarding", "bus stop", "stopping", "bus stop", "boarding passengers", "broken", "for passengers"], "difficult_direct_answer": true, "rationales": ["The bus is waiting for passengers.", "Busses park near curbs when they pick up passengers.", "The bus is at a bus stop picking up riders."], "image": "val2014/COCO_val2014_000000443868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195567, "question_id": "Fwntga9WzaJXfh8e3EUigs", "question": "What is this child's parent doing?", "choices": ["taking photograph", "working", "escaping", "abandoning it"], "correct_choice_idx": 0, "direct_answers": ["taking picture", "taking photograph", "taking photo", "pictures", "photographing", "taking photo", "taking picture", "calling", "taking picture", "photographing child"], "difficult_direct_answer": false, "rationales": ["The child is getting their picture taken.", "The child in the photo does not look scared or crying. this would mean that the parents are nearby and probably the ones taking the photograph.", "The parent is taking the photo."], "image": "val2014/COCO_val2014_000000195567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349101, "question_id": "Fx6BH4GGhB8otEPPVmSNBg", "question": "Which vehicle will cost most to repair?", "choices": ["white", "unrepairable", "equal", "black"], "correct_choice_idx": 0, "direct_answers": ["white", "charger", "bus", "car", "car", "car", "car", "trolley", "train", "bus"], "difficult_direct_answer": false, "rationales": ["The white vehicle is a trolley. trolleys are larger and their materials cost more.", "The white vehicle will cost most.", "Due to the car being a high end luxury care it may cost more to repair."], "image": "val2014/COCO_val2014_000000349101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198447, "question_id": "FxNt6NvisJnrEiWhVPvdCH", "question": "How did these skiers get to this location?", "choices": ["ski lift", "train", "uber", "bus"], "correct_choice_idx": 1, "direct_answers": ["skis", "train", "skis", "train", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["There is a train parked behind the people.", "They took a train that they are standing next to.", "Behind these skiers we can see their method of transportation on it's tracks."], "image": "val2014/COCO_val2014_000000198447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50106, "question_id": "Fxe4b6sBPTXrNcN2TwG9nE", "question": "What are the people likely doing?", "choices": ["playing games", "reading", "waiting", "running"], "correct_choice_idx": 2, "direct_answers": ["waiting taxi", "messaging", "texting", "waiting", "waiting", "seeing movie", "waiting", "waiting bus", "waiting", "waiting bus"], "difficult_direct_answer": false, "rationales": ["The people are standing on the side of the road because they are waiting for a bus.", "Looks like they are waiting for a bus to pick them up.", "The people look like they are bored. they are standing near the road, possible waiting for a bus."], "image": "train2014/COCO_train2014_000000050106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30962, "question_id": "FxfF8iVk3nbrZPrb6oixx6", "question": "What type of railway system is the train on?", "choices": ["elevated", "heritage train", "monorail", "trolley"], "correct_choice_idx": 0, "direct_answers": ["monorail", "monorail", "electricity", "bridge railway", "elevated", "bridge", "tracks", "train", "elevated", "elevated"], "difficult_direct_answer": false, "rationales": ["The railway is in the air. the train is on the track there.", "It is above the ground running on rails.", "The rails are on pillars off the ground."], "image": "train2014/COCO_train2014_000000030962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399058, "question_id": "FxvGk6dx8rhi9Y2BQzWEXW", "question": "Why is the man riding the elephant holding a spear above his head?", "choices": ["for amusement", "for protection", "to attack", "for control"], "correct_choice_idx": 3, "direct_answers": ["for control", "bored", "guide elephant", "predators", "for protection", "for balance", "hunting", "intimidation", "hit elephant", "poke preparation"], "difficult_direct_answer": true, "rationales": ["An elephant could get out of control. a spear is a way to get an animal to do what he wants.", "The spear is used to control the elephant in case it goes off course and to help the men balance on the back.", "The man is trying to control the elephant's movements."], "image": "val2014/COCO_val2014_000000399058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503500, "question_id": "Fy8ZyXYwL7m5ABA2gejUcx", "question": "What do these people ride around transporting?", "choices": ["food", "clothing", "money", "blood"], "correct_choice_idx": 3, "direct_answers": ["people", "blood", "blood", "blood", "blood", "blood", "motorcycle", "bike", "motorcycles", "blood"], "difficult_direct_answer": false, "rationales": ["The motorcycle says blood.", "The motorcycle says blood on it.", "The word blood is on the vehicles showing that they transport blood."], "image": "train2014/COCO_train2014_000000503500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535777, "question_id": "FyE75mTEDrCXx5b2sfLA74", "question": "In what type setting does the skateboarder skate here?", "choices": ["desert", "farm", "suburban", "urban"], "correct_choice_idx": 3, "direct_answers": ["court house", "court house", "court house", "court house", "courthouse", "skate tricks", "court house", "urban", "urban", "government"], "difficult_direct_answer": false, "rationales": ["The setting is urban.", "The skater is in front of the court house in a cement courtyard.", "A court house is usually in a more busier area."], "image": "train2014/COCO_train2014_000000535777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170390, "question_id": "FyFXHGzckiAexMQGCsWdPg", "question": "What language is the text below the English written in?", "choices": ["spanish", "african", "european", "asian"], "correct_choice_idx": 3, "direct_answers": ["asian", "korean", "chinese", "italian", "chinese", "chinese", "cantonese", "cantonese", "japanese", "chinese"], "difficult_direct_answer": false, "rationales": ["The text is written in chinese characters.", "The language of the below text is an asian language.", "This is one of the asian languages"], "image": "train2014/COCO_train2014_000000170390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445388, "question_id": "FyGn4kG2YquFnWJqmLBM3N", "question": "What does the sign showing the arrow with the line through it mean?", "choices": ["no parking", "no crossing", "no exit", "no turns"], "correct_choice_idx": 3, "direct_answers": ["no turn", "no turning", "no turn", "no left", "right turn", "no turns", "do not", "one way", "no turn", "no turning"], "difficult_direct_answer": false, "rationales": ["That means you can not make a left turn there.", "Your automobile may not execute a movement in the indicated direction.", "The sign says no turns."], "image": "train2014/COCO_train2014_000000445388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364939, "question_id": "FyqYm7seKaZveVbgK9rRqU", "question": "What make is the white car?", "choices": ["toyota", "nissan", "honda", "kia"], "correct_choice_idx": 2, "direct_answers": ["volkswagen", "sedan", "sedan", "toyota", "toyota", "honda", "sedan", "honda", "sedan", "sedan"], "difficult_direct_answer": false, "rationales": ["The white car is a honda.", "The white car is a honda.", "If one looks close enough, the logo above the white sedan is an \"h\". this is the logo used by honda."], "image": "train2014/COCO_train2014_000000364939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59933, "question_id": "Fyr26Q9XEBWryoPXMD84yP", "question": "What is in the dog's mouth?", "choices": ["nothing", "bone", "2 bottles", "water bottle"], "correct_choice_idx": 0, "direct_answers": ["bottles", "nothing", "plastic bottle", "water bottles", "bottles", "water bottle", "bottle", "bottles", "water bottles", "bottles"], "difficult_direct_answer": false, "rationales": ["A dog has two clear bottles in its mouth. water bottles are often clear.", "He is holding water containers in his mouth.", "You can clearly see the dog has 2 water bottles in his mouth."], "image": "train2014/COCO_train2014_000000059933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177125, "question_id": "Fyrw8T9SoxqRB5crdtLHtj", "question": "Where are these policemen patrolling?", "choices": ["inside park", "at airport", "in city", "tourist zone"], "correct_choice_idx": 0, "direct_answers": ["park", "park area", "park", "park", "park", "inside park", "park", "park", "park", "street"], "difficult_direct_answer": false, "rationales": ["In looking at the background with its trees, grass, benches and people congregating, it is apparent that it is a park. the police are policing inside this park.", "They are on patrol in a local park", "They patrol the park."], "image": "train2014/COCO_train2014_000000177125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340771, "question_id": "Fyzc65CYqsQAD6XbkXyJt8", "question": "Which fish is mentioned on the top street sign?", "choices": ["pollock", "salmon", "pickerel", "halibut"], "correct_choice_idx": 1, "direct_answers": ["salmon", "salmon", "salmon", "salmon", "salmon", "salmon", "salmon", "salmon", "salmon", "salmon"], "difficult_direct_answer": false, "rationales": ["A street sign lists salmon run as the street name.", "A street sign lists the name salmon run and salmon is a fish.", "The fish is a salmon."], "image": "train2014/COCO_train2014_000000340771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433822, "question_id": "Fz247rBxtzbNygjViBFPaY", "question": "What kind of print is on his phone?", "choices": ["medium", "jumbo", "large", "fine"], "correct_choice_idx": 3, "direct_answers": ["fingerprint", "engraving", "tiny", "fine", "small", "small print", "tiny", "fine print", "very small", "black"], "difficult_direct_answer": true, "rationales": ["The print is fine.", "He needs a magnifying glass because the print is so small", "The phone print is tiny."], "image": "val2014/COCO_val2014_000000433822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429319, "question_id": "FzMEpr4EA5cDgZQHtreiDg", "question": "What is this man's job?", "choices": ["lawyer", "doctor", "priest", "waiter"], "correct_choice_idx": 3, "direct_answers": ["waiter", "waiter", "server", "waiter", "waiter", "server", "server", "server", "waiter", "server"], "difficult_direct_answer": false, "rationales": ["The man is carrying a tray of food. the most common occupation where people carry trays of food is answer a.", "The man is carrying food on a tray, an action performed by the profession in option a.", "He is a waiter bring guests their food."], "image": "train2014/COCO_train2014_000000429319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16173, "question_id": "FzPm7DFpAJv7TKFqsuqFoK", "question": "What is this man celebrating?", "choices": ["funeral", "his birthday", "retirement", "wedding"], "correct_choice_idx": 1, "direct_answers": ["birthday", "birthday", "birthday", "happy birthday", "birthday", "birthday", "his birthday", "70th birthday", "birthday", "birthday"], "difficult_direct_answer": false, "rationales": ["It has candles to blow out", "The candle on the cake indicates that he has just turned 70.", "The number seventy is on a cake with lit candles and an elderly man blowing them out."], "image": "train2014/COCO_train2014_000000016173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528493, "question_id": "FzaBfRjfQUzhvUcrHCDyJU", "question": "Why are they so close together?", "choices": ["curious", "friendly", "accidental", "angry"], "correct_choice_idx": 1, "direct_answers": ["talking", "friends", "date", "friends", "friendly", "talking", "talking", "friends", "posing", "friends"], "difficult_direct_answer": false, "rationales": ["They're friendly.", "Based on the heads tilted toward each other and the posed genial vibe, these people know each other and enjoy it.", "They are at a stop and talking to each other"], "image": "train2014/COCO_train2014_000000528493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568469, "question_id": "FzcF4wfRFaysAong449R3Q", "question": "What are drivers told is forbidden?", "choices": ["go straight", "turn left", "u-turns", "yield"], "correct_choice_idx": 2, "direct_answers": ["law violation", "u-turns", "u turn", "u turns", "u turn", "u turns", "upturn", "u turn", "right turns", "u-turns"], "difficult_direct_answer": false, "rationales": ["You can tell by the symbols on the sign as to what is not allowed at that street.", "There is a u with a line through it.", "There is a sign with arrows forming a \"u\" shape and a red line through it. this road sign with this symbol is used to instruct drivers that this type of action is not permitted."], "image": "train2014/COCO_train2014_000000568469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361392, "question_id": "Fzhgc6azdtHq7Qpo3Kf3it", "question": "One can board a train in which region after they disembark from this bus?", "choices": ["northern", "london midland", "scottish", "western"], "correct_choice_idx": 3, "direct_answers": ["slough", "slough station", "slough", "western", "slough", "slough station", "slough station", "i do", "slough station", "slough station"], "difficult_direct_answer": false, "rationales": ["These double-decker buses are found in this main uk city.", "It is hard to tell based on this image but maybe western. there is a double decker bus with some foreign signage on the license plate.", "You can go to the west as indicated by the bus sign."], "image": "train2014/COCO_train2014_000000361392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22051, "question_id": "Fzt8LE2Ufvi6ykNJSCC5Lm", "question": "What event is being carried out?", "choices": ["tennis training", "tennis competition", "badminton training", "badminton competition"], "correct_choice_idx": 0, "direct_answers": ["tennis training", "tennis game", "tennis", "tennis match", "tennis match", "tennis", "match", "tennis games", "tennis match", "tennis"], "difficult_direct_answer": false, "rationales": ["There are no spectators, and they are all wearing the same uniform, so it is assumed that this is a practice session. the bright yellow balls indicates that the sport is tennis.", "The players are clearly playing tennis based on the rackets, court and balls and are all wearing the same color jerseys as if they are on a team and there are no spectators. when teammates play amongst themselves they are likely training.", "The contrast of young students with adult instructors and pail full of balls suggests a tennis training camp."], "image": "val2014/COCO_val2014_000000022051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292590, "question_id": "G22XoHL4SMQZfmjv7P4rGk", "question": "What direction is this person going?", "choices": ["backwards", "up", "uphill", "downhill"], "correct_choice_idx": 3, "direct_answers": ["downhill", "down", "down", "downhill", "downhill", "down", "downhill", "downhill", "down", "downhill"], "difficult_direct_answer": false, "rationales": ["A skier can only ski down a hill.", "The person is doing a sport that relies on gravity to slide on a snowy hill.", "This person is skiing. when you ski, you go down the mountian."], "image": "val2014/COCO_val2014_000000292590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339914, "question_id": "G2MUR7aWaL2PMTCupfHeXZ", "question": "What are these people doing?", "choices": ["kiteboarding", "hang gliding", "parasailing", "flying kites"], "correct_choice_idx": 0, "direct_answers": ["sailboarding", "wakeboard parasailing", "kiteboarding", "paraskiing", "kiteboarding", "parasailing", "wakeboarding", "kiteboarding", "wind surfing", "parasailing"], "difficult_direct_answer": false, "rationales": ["They are on surfboard type of item and have sails up in the air to propel them forward", "The people have kites flying.", "It's the only option that requires you to stay in the water. the rest require you stay in the air."], "image": "train2014/COCO_train2014_000000339914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323695, "question_id": "G2hiH34bpvkqxapFXAfVLd", "question": "Why are the cars so close together?", "choices": ["race", "red light", "accident", "gathering"], "correct_choice_idx": 1, "direct_answers": ["traffic jam", "traffic", "traffic", "traffic", "traffic jam", "stop light", "traffic", "red light", "red light", "rush hour"], "difficult_direct_answer": false, "rationales": ["The cars are stuck at a red light.", "They are all at a stop waiting for it to change to green", "A stoplight can be seen in the background and it is on it's \"stopping color\" so the cars are lined up waiting for it to turn green."], "image": "train2014/COCO_train2014_000000323695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36764, "question_id": "G2iE23uJdwfgxe92mZsbrn", "question": "What is the silverware on the plate wrapped in?", "choices": ["newspaper", "tinfoil", "napkin", "bow"], "correct_choice_idx": 2, "direct_answers": ["napkin", "napkin", "napkin", "fork", "napkin", "napkin", "napkin", "napkin", "napkin", "paper towel"], "difficult_direct_answer": false, "rationales": ["This is common in restaurants to make it easy to get the items quickly to customers", "A white fabric can be seen and is customary to wrap around utensils to protect from bacteria and dirt.", "This paper product helps diners stay clean"], "image": "train2014/COCO_train2014_000000036764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527897, "question_id": "G2su8z3Hd6jz2h84ricJuP", "question": "What type of location is this?", "choices": ["tropical", "country", "city", "desert"], "correct_choice_idx": 1, "direct_answers": ["farm", "farm", "farm", "country", "rural", "chicken coop", "rural", "farm", "chicken coop", "farm"], "difficult_direct_answer": false, "rationales": ["A truck is parked in a wooded rural area.", "There are chickens on the grounds.", "The location is the country."], "image": "train2014/COCO_train2014_000000527897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325237, "question_id": "G3CPdvPtumkDDXBrUXJNfs", "question": "What time of day is it?", "choices": ["midnight", "dusk", "noon", "mid morning"], "correct_choice_idx": 1, "direct_answers": ["evening", "night", "nightime", "evening", "dusk", "night", "night", "night", "night", "evening"], "difficult_direct_answer": false, "rationales": ["The lights are on because it is dark outside.", "You can tell by how the building is lit up the time of day.", "The day is at dusk."], "image": "val2014/COCO_val2014_000000325237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55764, "question_id": "G3HQ5aiZwgLKbvkrYao3Do", "question": "What is the likeliness of the batter hitting this ball?", "choices": ["guaranteed", "impossible", "likely", "unlikely"], "correct_choice_idx": 1, "direct_answers": ["unlikely", "zero", "low", "not likely", "0%", "unlikely", "unlikely", "zero chance", "missed", "impossible"], "difficult_direct_answer": false, "rationales": ["The ball is above the batter.", "The likeness is impossible.", "The ball is already in the catcher's glove."], "image": "train2014/COCO_train2014_000000055764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445014, "question_id": "G3QV78nBwH8W8AYSuuVuG2", "question": "What's most likely to stop him from getting bitten?", "choices": ["taser", "glass", "fence", "self restraint"], "correct_choice_idx": 3, "direct_answers": ["rail", "self restraint", "board", "distance", "fence", "string", "fence", "fence", "railing", "height"], "difficult_direct_answer": false, "rationales": ["His hand is inside the giraffe's enclosure.", "The fence is there to protect them.", "The man is close to the giraffe so could be bitten."], "image": "val2014/COCO_val2014_000000445014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187324, "question_id": "G3cChEoScXbhCDPsR23ALr", "question": "What work does this bus need to have done on it?", "choices": ["wheels rotated", "paint roof", "reverse signaling", "retraining"], "correct_choice_idx": 1, "direct_answers": ["paint", "paint roof", "cleaning", "ceiling paint", "roof repair", "paint", "cleaning", "cleaning", "bus washed", "body"], "difficult_direct_answer": false, "rationales": ["The bus needs the roof cleared and painted.", "Based on the color scheme of the rest of the bus the roof does not appear to be its original color and appears to be disrepair. if a painted surface looks this way it is in need of new paint.", "The roof needs painting."], "image": "train2014/COCO_train2014_000000187324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259149, "question_id": "G3xwHE3G4foLzmWvDnUWEd", "question": "Where is this animal located?", "choices": ["kitchen", "bedroom", "bathroom", "dining room"], "correct_choice_idx": 1, "direct_answers": ["bed", "bedroom", "bed", "bed", "bed", "girls chest", "bed", "bed", "girls chest", "bed"], "difficult_direct_answer": false, "rationales": ["The animal is perched on a comforter on someone's bed.", "The animal is laying on top of the comforter that is covering the human who is laying on their pillow and underneath the covers preparing to sleep.", "There is a comforter and a pillow near the cat. the woman is trying to rest."], "image": "train2014/COCO_train2014_000000259149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299474, "question_id": "G49XCJQVHwzPGr9H4xj9cx", "question": "What are the people wearing?", "choices": ["bathing suits", "coats", "gloves", "boots"], "correct_choice_idx": 0, "direct_answers": ["bathing suits", "bathing suit", "swimsuits", "swimsuit", "surfing", "bathing suits", "goggles", "bathing suits", "bathing suits", "bathing suits"], "difficult_direct_answer": false, "rationales": ["The people are body surfing and are wearing items appropriate for playing in the water.", "They are boogie boarding in the ocean. special lightweight clothing makes it easier to swim.", "They are swimming and wearing bathing suits for this activity."], "image": "train2014/COCO_train2014_000000299474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343403, "question_id": "G4FhmxiuRy2PhpkFq2qDGN", "question": "What is his team's home state?", "choices": ["ontario", "michigan", "alaska", "maine"], "correct_choice_idx": 1, "direct_answers": ["maryland", "to", "michigan", "massachusetts", "michigan", "michigan", "maryland", "not clear", "new york", "mlb team"], "difficult_direct_answer": false, "rationales": ["Michigan is the home state of the dodgers.", "The logo on this baseball players helmet belongs to the detroit tigers. detroit is a big city in the state of michigan.", "This person's team is based in detroit."], "image": "train2014/COCO_train2014_000000343403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226190, "question_id": "G4JpX8MmQjooQmNePcxQAb", "question": "Why is the small child in the water?", "choices": ["enjoys playing", "is lost", "cleaning up", "left home"], "correct_choice_idx": 0, "direct_answers": ["wading", "playing", "playing", "enjoys playing", "wading", "playing", "playing", "playing", "playing", "surfing"], "difficult_direct_answer": false, "rationales": ["This child's bent over stance with one foot in the air suggests he is playing in the waves and surf.", "The child wants to play.", "The child appears to be in movement and possible engaging with a toy or at least the water. usually children at the beach are there to play and particularly in the water."], "image": "train2014/COCO_train2014_000000226190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110105, "question_id": "G4TAve67qCkzYrS5B2mAN4", "question": "Why are the children wearing numbers on their jackets?", "choices": ["for fun", "for style", "as punishment", "for competition"], "correct_choice_idx": 3, "direct_answers": ["identity", "for competition", "race ids", "identification", "race", "competition", "race numbers", "racing", "competition", "identify"], "difficult_direct_answer": true, "rationales": ["The kids are competing.", "They are competing in an event.", "Those are their entrant numbers which allow the judges and spectators to easily spot them."], "image": "train2014/COCO_train2014_000000110105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424146, "question_id": "G4suqMKxXqkSoHTruHXdd9", "question": "The black device in front of the white keyboard indicates that this room has what type of service?", "choices": ["internet", "landline telephone", "cellular telephone", "fax"], "correct_choice_idx": 1, "direct_answers": ["cellphone", "cell service", "monitor", "walkie talkie", "phone", "cell", "phone service", "landline telephone", "landline", "telephone"], "difficult_direct_answer": true, "rationales": ["The device is a landline phone.", "There is a cordless phone in front of the keyboard.", "It's a landline telephone."], "image": "val2014/COCO_val2014_000000424146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51484, "question_id": "G53JuptaENiVk3mZ4wrRrv", "question": "How is the small blue board the man is holding called?", "choices": ["short surf", "surf", "small surf", "shortboard"], "correct_choice_idx": 3, "direct_answers": ["paddle board", "surfboard", "board", "boogie", "boogie board", "paddle board", "body board", "shortboard", "paddle board", "smaller"], "difficult_direct_answer": false, "rationales": ["It's a smaller board then the other ones he is holding.", "Though it looks like a standard surfboard, it's smaller size lets you know what it is.", "The shortboard is blue."], "image": "val2014/COCO_val2014_000000051484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365218, "question_id": "G5DdcNUVnpL6rDC9rckcna", "question": "What kind of stand is the man with the newspaper standing beside?", "choices": ["newspaper stand", "fruit stand", "fish stand", "meat stand"], "correct_choice_idx": 1, "direct_answers": ["fruit", "fruit", "fruit stand", "fruit stand", "food stand", "fruit stand", "fruit", "produce", "banana stand", "banana"], "difficult_direct_answer": false, "rationales": ["He is standing next to a fruit stand.", "There are various fruits on display next to the man.", "The man is near a bunch of bananas."], "image": "train2014/COCO_train2014_000000365218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342401, "question_id": "G5LVxLe7JanRcoBmNzeCyW", "question": "What time of the year is it?", "choices": ["spring", "solstice", "summer", "winter"], "correct_choice_idx": 3, "direct_answers": ["spring", "spring", "spring", "spring", "spring", "spring", "fall", "baseball season", "2021", "winter"], "difficult_direct_answer": false, "rationales": ["The players are dressed warmly and many of the trees in the background have no leaves. in winter, the trees have often lost their leaves and the weather is frequently cold causing people to dress warmly.", "They are playing baseball, a sport associated with the season in option a.", "The ball players are wearing long sleeves"], "image": "train2014/COCO_train2014_000000342401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30290, "question_id": "G5w6ShqPKf7qKuuNfPuYNN", "question": "What fabric is the stuffed animal made of?", "choices": ["leather", "nylon", "denim", "synthetic fiber"], "correct_choice_idx": 3, "direct_answers": ["stuffed sponge", "fleece", "synthetic fiber", "fluffy fabric", "fur", "cotton", "fur", "cotton", "cotton", "faux fur"], "difficult_direct_answer": false, "rationales": ["It's made of soft fiber.", "The fabric is synthetic.", "The stuffed animal is made of synthetic fiber."], "image": "val2014/COCO_val2014_000000030290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157526, "question_id": "G68FCFkv7qRm8Rgm2mj2Kh", "question": "The diner here had what sort of milestone today?", "choices": ["birthday", "jail release", "passed bar", "none"], "correct_choice_idx": 0, "direct_answers": ["54", "birthday", "birthday", "54th birthday", "54", "birthday", "birthday", "birthday", "birthday", "birthday"], "difficult_direct_answer": false, "rationales": ["There is an age number on the breakfast plate.", "The bacon is laid out to spell 54 which is presumably the person's age.", "A breakfast plate is arranged so that the bacon makes the shape of the numbers fifty four."], "image": "train2014/COCO_train2014_000000157526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91998, "question_id": "G68UzJ6E3MAKbw8SWSMDbd", "question": "What exercise equipment is seen on the right?", "choices": ["rower", "stationary bike", "elliptical", "treadmill"], "correct_choice_idx": 1, "direct_answers": ["stationary bike", "stationary bike", "exercise bike", "stationary bike", "exercise bike", "bicycle", "bike", "stationary bike", "stationary bike", "exercise bike"], "difficult_direct_answer": false, "rationales": ["There is one sitting in front of the chair.", "This piece of equipment has pedals, a bike seat and handlebars. it doesn't have wheels though so we can conclude it's a stationary bike.", "The equipment is a bike."], "image": "train2014/COCO_train2014_000000091998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96654, "question_id": "G6K7ng8vLoshbxf9ZX7Uvy", "question": "How many more animals would be needed to make a dozen?", "choices": ["eleven", "two", "four", "nine"], "correct_choice_idx": 3, "direct_answers": ["nine", "nine", "nine", "nine", "nine", "nine", "nine", "nine", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["There are currently three animals visible. subtracting a dozen (12) from this is nine.", "There are three animals shown", "Nine more animals are needed."], "image": "val2014/COCO_val2014_000000096654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225299, "question_id": "G6jjH6ffuHhQceM6vjum2Q", "question": "Skiing on the sloped area allows the skier to what change in elevation?", "choices": ["same", "higher", "lower", "none"], "correct_choice_idx": 1, "direct_answers": ["higher", "downhill", "higher altitude", "increase", "height", "snow", "higher", "higher", "jump", "go up"], "difficult_direct_answer": false, "rationales": ["This allows them to get height when skiing.", "The skier goes up the slope and then continues with inertia past the highest end of the slope.", "A skier is skiing up a ramp. ramps are used to get elevation."], "image": "val2014/COCO_val2014_000000225299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301381, "question_id": "G6nu7RfLre4cMc9oFth2Sd", "question": "What political party does the mentioned politician belong to?", "choices": ["libertarian", "republican", "independent", "democrat"], "correct_choice_idx": 1, "direct_answers": ["republican", "republican party", "republican", "republican", "republican party", "republican", "republican", "republican", "republican", "republican party"], "difficult_direct_answer": false, "rationales": ["As seen on the tv, the politician is in the republican party.", "The article states what party the politician belongs to.", "They belong to the republican party."], "image": "train2014/COCO_train2014_000000301381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245654, "question_id": "G75p7eUUwjYakbE3eCJP5e", "question": "What are they preparing for?", "choices": ["parade", "war", "riot", "fashion show"], "correct_choice_idx": 2, "direct_answers": ["parade", "racing", "policing", "riot", "ride", "walk streets", "patrol", "work", "work", "patrolling"], "difficult_direct_answer": true, "rationales": ["These people are preparing for a riot with gear and protection.", "They look like they are getting ready for a parade", "They look to be preparing for a parade to ride their horses in or guide traffic."], "image": "train2014/COCO_train2014_000000245654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559950, "question_id": "G7B7HdmrPAoUDCeiYGjPnq", "question": "What kind of pet is looking out the window?", "choices": ["cat", "rabbit", "dog", "hamster"], "correct_choice_idx": 0, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "cat"], "difficult_direct_answer": false, "rationales": ["It is larger than a cat and is a domesticated furry pet", "A cat is looking out the window.", "The animal is on a leash and has a black nose."], "image": "val2014/COCO_val2014_000000559950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61992, "question_id": "G7DFQNWkL4hwN9BmzBCxyG", "question": "What is one location along the buses route?", "choices": ["stadium", "college", "mall", "downtown"], "correct_choice_idx": 1, "direct_answers": ["college pt", "bus stop", "college", "college point", "college point", "college", "city", "college", "college pt", "college point"], "difficult_direct_answer": false, "rationales": ["A bus has a lit sign on the top window advertising the next stop to be a college.", "The display above the windshield of the bus indicates college.", "The location is a college."], "image": "train2014/COCO_train2014_000000061992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353536, "question_id": "G7QouxC2BZ5dzAdvNzPbUu", "question": "Why would someone sit at this table?", "choices": ["to eat", "to work", "to paint", "to sew"], "correct_choice_idx": 0, "direct_answers": ["dining", "to eat", "to eat", "eat", "eat dinner", "to eat", "to eat", "to eat", "to eat", "to eat"], "difficult_direct_answer": false, "rationales": ["It's obvious that someone has dined here.", "The table holds food and dining implements", "It has dishes and food on it"], "image": "val2014/COCO_val2014_000000353536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417330, "question_id": "G7tbRXj7ANNmyCacHxRLMf", "question": "What are the people riding on?", "choices": ["roller blades", "skateboard", "ice skates", "surfboard"], "correct_choice_idx": 1, "direct_answers": ["skateboards", "skateboards", "skateboard", "skateboards", "skateboard", "skateboard", "skateboards", "skateboards", "skateboard", "skateboards"], "difficult_direct_answer": false, "rationales": ["They are riding on boards that have wheels on them.", "They have a deck and four wheels", "They have a deck and four wheels"], "image": "train2014/COCO_train2014_000000417330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469237, "question_id": "G8GdhTzeqD2HfXfa5XFopk", "question": "What is the item closest to the green sign on the left that has the number 4 on it?", "choices": ["train", "track", "briefcase", "clock"], "correct_choice_idx": 3, "direct_answers": ["clock", "clock", "bench", "clock", "clock", "clock", "train", "post", "track assignment", "clock"], "difficult_direct_answer": false, "rationales": ["The clock is near the sign with the four on it.", "It's near the clock.", "A green street sign is to the left of a train and a square object with hands on it is beside the sign."], "image": "train2014/COCO_train2014_000000469237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285651, "question_id": "G8Q7iE9eAtNZMnZmEgLQzG", "question": "What is the greatest danger for the skateboarder right now?", "choices": ["rocks", "other person", "car", "falling"], "correct_choice_idx": 3, "direct_answers": ["cars", "oncoming traffic", "hitting car", "traffic", "black car", "cars", "cars", "falling", "oncoming traffic", "car"], "difficult_direct_answer": false, "rationales": ["A skateboarder is skating down the street. skateboarders fall sometimes.", "There are vehicles with their headlights towards us on the middle of the road.", "The danger is falling."], "image": "val2014/COCO_val2014_000000285651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364380, "question_id": "G8V8BACMDkN4kVQQfvq5rb", "question": "What is the yellowish hard item sticking out from the animal?", "choices": ["sword", "handle", "tusk", "beating stick"], "correct_choice_idx": 2, "direct_answers": ["tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk"], "difficult_direct_answer": false, "rationales": ["The animal has a white hard item sticking out of its body, and the term in a matches the description.", "It's a tusk that's sort of dirty.", "A man is on an elephant in the water. elephants have tusks."], "image": "train2014/COCO_train2014_000000364380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175745, "question_id": "G8jTpez8U8Vuugh6M9xYjS", "question": "What sport might the woman holding the ball be going to play?", "choices": ["jogging", "weightlifting", "hockey", "volleyball"], "correct_choice_idx": 3, "direct_answers": ["volleyball", "volleyball", "volleyball", "volleyball", "volleyball", "volleyball", "volleyball", "volleyball", "volleyball", "volleyball"], "difficult_direct_answer": false, "rationales": ["The sport is volleyball.", "The texture and markings on this ball identify it as one intended for volleyball.", "The woman on the left is carrying a leather multi-colored ball. based on its size and pattern, it would qualify as a volleyball."], "image": "train2014/COCO_train2014_000000175745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161738, "question_id": "G8ka59qPd387ZVRHUcBuki", "question": "The signs are expressing their love for which city?", "choices": ["manchester", "moscow", "milan", "manila"], "correct_choice_idx": 1, "direct_answers": ["france", "moscow", "paris", "paris", "kockby", "mockby", "mockby", "kosovo", "paris", "russia"], "difficult_direct_answer": false, "rationales": ["The signs are written in russian.", "The signs are for moscow.", "The vehicles and the architecture imply this event is taking place there."], "image": "train2014/COCO_train2014_000000161738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93964, "question_id": "G8wNExTsDeWM3bsyBvpP9S", "question": "What is the word closest to the left?", "choices": ["three", "city", "blue", "five"], "correct_choice_idx": 1, "direct_answers": ["city", "city", "city", "city", "city", "city", "city", "walmart", "city", "bus"], "difficult_direct_answer": false, "rationales": ["In english, one reads left to right, so the left-most word s the first word.", "The word before coach is closest to the left.", "The word is a city."], "image": "val2014/COCO_val2014_000000093964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463533, "question_id": "G92EBGSmhdpj6NWa7gNqiv", "question": "What is the man on one leg doing?", "choices": ["singing", "hopping", "pitching", "hiding"], "correct_choice_idx": 2, "direct_answers": ["pitching", "throwing", "pitching", "pitching", "pitching", "pitching", "pitching", "pitching", "throwing", "pitching"], "difficult_direct_answer": false, "rationales": ["The man is trying to get the ball.", "The man is in a baseball uniform and standing on the pitcher's mound.", "This man is on the pitchers mound of a baseball field having just thrown a ball towards a batter. pitcher is the name of the position that throws balls to the batter."], "image": "train2014/COCO_train2014_000000463533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513319, "question_id": "G95TuBzShkVVazJrxMV62e", "question": "The Vax Platinum solution in the cleaner targets on which microbe?", "choices": ["virus", "fungi", "protozoa", "bacteria"], "correct_choice_idx": 3, "direct_answers": ["dust", "visitors", "wood", "vaccuum", "germs", "bacteria", "bacteria", "bacteria", "bacteria", "sars-cov2"], "difficult_direct_answer": false, "rationales": ["The solution cleans germs.", "This is a knowledge based question and can only be answered if researching.", "Vacuums clean up dirt and debris which contains bacteria."], "image": "val2014/COCO_val2014_000000513319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390374, "question_id": "G9Lycm4JbWdFppjVchm8hF", "question": "How is the man on the right feeling?", "choices": ["nervous", "scared", "amused", "angry"], "correct_choice_idx": 2, "direct_answers": ["happy", "happy", "happy", "excited", "happy", "amused", "excited", "excited", "happy", "happy"], "difficult_direct_answer": false, "rationales": ["He looks like he is having fun playing a game on the wii", "The man is amused.", "The man on the right is smiling with a huge grin."], "image": "train2014/COCO_train2014_000000390374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5430, "question_id": "G9at9KMEaNsvFZ89SywUfp", "question": "What is the person in the heavy coat using the phone for?", "choices": ["texting", "mirror", "calculator", "talking"], "correct_choice_idx": 1, "direct_answers": ["selfie", "take picture", "mirror", "selfie", "selfie", "take picture", "selfie", "photo", "observing face", "selfie"], "difficult_direct_answer": false, "rationales": ["The person's getting a closeup of their features.", "You can see her face in the phone.", "The person with the heavy coat is using their phone as a mirror to see their reflection."], "image": "train2014/COCO_train2014_000000005430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157519, "question_id": "G9kgpZZWGkywDNnSxfao7A", "question": "Which vehicle shown in the photo goes the fast?", "choices": ["motorcycle", "skateboard", "bike", "bus"], "correct_choice_idx": 3, "direct_answers": ["bus", "truck", "bicycle", "skateboard", "bus", "truck", "skating device", "bike", "bus", "bike"], "difficult_direct_answer": false, "rationales": ["A blurry large vehicle can be seen. objects appear blurry when there is fast movement.", "There is a large passenger vehicle in the picture.", "Far in the background at the upper left corner looks like it might be a. otherwise, it would be b."], "image": "train2014/COCO_train2014_000000157519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353803, "question_id": "GA2RZhdnFbSNcgVSGCp5bJ", "question": "What is on the plate with the two wrapped forks?", "choices": ["butter jelly", "mayonnaise", "ketchup", "mustard"], "correct_choice_idx": 0, "direct_answers": ["butter", "jellies", "jam", "strawberry jam", "condiments", "butter", "butter", "butter", "butter", "butter jelly"], "difficult_direct_answer": false, "rationales": ["The plate has butter and jelly.", "These condiments are appropriate for toast", "Because its near wheel the bread is located and packaged as butter."], "image": "train2014/COCO_train2014_000000353803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432624, "question_id": "GAKRq7JwMr88SjrnGCFASo", "question": "Where is the man located?", "choices": ["forest", "mountains", "desert", "beach"], "correct_choice_idx": 1, "direct_answers": ["on snow", "ski resort", "snow", "outdoors", "snow", "ski resort", "ski slope", "mountains", "ski slope", "snow"], "difficult_direct_answer": false, "rationales": ["The man is near mountains.", "The man is skiing in the mountains.", "The background of where this man is a good indicator that he is part of a larger mountainous range."], "image": "train2014/COCO_train2014_000000432624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385598, "question_id": "GAehFYhKCxHev8t6Bmrmn6", "question": "What is the conveyer belt for?", "choices": ["align skiers", "train skiers", "resting area", "transporting skiers"], "correct_choice_idx": 0, "direct_answers": ["ski lift", "ski lift", "transporting skiers", "transportation", "movement", "transport", "align skiers", "moving skiers", "transportation", "ski lift"], "difficult_direct_answer": false, "rationales": ["The conveyor belts move these people to new areas", "Skiers take ski lifts to get up mountains at ski resorts.", "The belt is to align skiers."], "image": "val2014/COCO_val2014_000000385598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101088, "question_id": "GAwpMYgXJKH2NX6ePdMxrf", "question": "How will these people standing in water leave where they are?", "choices": ["uber", "lyft", "swim", "fly"], "correct_choice_idx": 3, "direct_answers": ["by seaplane", "by plane", "fly", "plane", "plane", "fly", "plane", "seaplane", "plane", "fly plane"], "difficult_direct_answer": false, "rationales": ["There is a plane in the background that is being prepared. it is likely that a person standing this close to a plane would leave in this vehicle as well as the other options not seeming likely because of the remoteness of this setting.", "They will fly out of there on the plane.", "They will go in the airplane that will take off from the water."], "image": "val2014/COCO_val2014_000000101088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564301, "question_id": "GB62fy6UpsGwfgRxGkU4zB", "question": "What working job does the dog shown have?", "choices": ["shepherding", "junkyard protection", "lap dog", "eating"], "correct_choice_idx": 0, "direct_answers": ["sheep herding", "herding", "sheepherder", "herd", "herder", "sheep harder", "sheep dog", "shepherding", "herding", "herd"], "difficult_direct_answer": false, "rationales": ["The job is a shepherd.", "Sheep are standing together in a grassy area. dogs are used to move and keep sheep together.", "The dog will herd the sheep and make them go where they need to go."], "image": "val2014/COCO_val2014_000000564301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263251, "question_id": "GBV9z3PuUtnRjcFMaHusGc", "question": "Why is the ball on the parasol?", "choices": ["is random", "landed there", "is trick", "fell there"], "correct_choice_idx": 2, "direct_answers": ["trick", "trick", "tricks", "balancing", "entertainment", "is trick", "entertaining", "trick", "playing", "placed there"], "difficult_direct_answer": false, "rationales": ["The man is balancing the ball and has a crowd.", "It looks like the person with the umbrella is entertaining the other people.", "There is a ball doing a trick around the parasol."], "image": "val2014/COCO_val2014_000000263251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16732, "question_id": "GBYcwQ6Equwb2KZEbegcBs", "question": "What common game played by children is depicted by the sculpture?", "choices": ["rock-paper-scissors", "connect four", "tic-tac-toe", "checkers"], "correct_choice_idx": 0, "direct_answers": ["rock-paper-scissors", "rock paper", "rockpaper scissors", "rock paper", "hand game", "rock-paper-scissors", "paper scissors", "rock paper", "rockpaperscissors", "rock/paper/scissors"], "difficult_direct_answer": false, "rationales": ["There is a sculpture with scissors on top of paper on top of a rock.", "A rock is on the bottom, a sheet of paper is in the middle. scissors are on the top.", "The game is rock, paper scissors."], "image": "val2014/COCO_val2014_000000016732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548729, "question_id": "GBmyukzNX7DukULW7GB5Gg", "question": "What animal name does the first three letters on the wall spell?", "choices": ["hen", "pig", "dog", "cat"], "correct_choice_idx": 3, "direct_answers": ["cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The first two letters are \"c\" and \"a\".", "The word \"cat\" appears.", "The animal is a cat."], "image": "val2014/COCO_val2014_000000548729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276626, "question_id": "GBpL9RsTfKDV5GsiTpTUjS", "question": "What will keep the water from flooding the tracks?", "choices": ["grass", "rocks", "sand", "metal"], "correct_choice_idx": 1, "direct_answers": ["rocks", "rocks", "rocks", "rocks", "rocks", "rocks", "stones", "stones", "rocks", "rocks"], "difficult_direct_answer": false, "rationales": ["The rocks are creating a barrier for the waves to crash on.", "The rocks are blocking the water from rolling onto the tracks.", "The rocks are used as a barrier to keep the water at bay."], "image": "train2014/COCO_train2014_000000276626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301817, "question_id": "GC6TyDFiQ3WuF7hHP427Wm", "question": "What is that thing on top of the building called?", "choices": ["gargoyle", "antennae", "weathervane", "signal"], "correct_choice_idx": 2, "direct_answers": ["clock", "clock tower", "weather vane", "tower", "clock tower", "clock tower", "clock tower", "weathervane", "clocktower", "clocks"], "difficult_direct_answer": false, "rationales": ["There is a weathervane for wind direction on top of the cathederal.", "It's used to show which way the wind is blowing.", "Used to show which way the wind is blowing but also for decoration."], "image": "val2014/COCO_val2014_000000301817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136932, "question_id": "GCCe6Xyq6tmcPc2xG5WqVz", "question": "What sort of festival do these men attend?", "choices": ["july 4th", "santa village", "oktoberfest", "thanksgiving"], "correct_choice_idx": 2, "direct_answers": ["dutch", "oktoberfest", "october fest", "oktoberfest", "oktoberfest", "german", "germany", "octoberfest", "german", "oktoberfest"], "difficult_direct_answer": false, "rationales": ["Men are dressed in white shirts and red smocks. people dress up for oktoberfest.", "The men are dressed in traditional festive german clothing because they are at oktoberfest.", "The men are wearing overalls and eating sausage."], "image": "train2014/COCO_train2014_000000136932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232769, "question_id": "GCMGqxP9Vnvk3CBMus2she", "question": "Where are these girls playing?", "choices": ["arena", "school yard", "rink", "stadium"], "correct_choice_idx": 1, "direct_answers": ["soccer field", "school yard", "soccer", "soccer", "soccer field", "soccer field", "soccer", "soccer", "soccer", "soccer"], "difficult_direct_answer": false, "rationales": ["These girls are playing soccer in the school yard.", "The other options don't match the background wall.", "They are on the grass surrounded by a chain link fence and a wooden fence with some trees around it."], "image": "val2014/COCO_val2014_000000232769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473095, "question_id": "GCT6xo8mu7WRSjSwvtVamj", "question": "What type of building does this seem to be?", "choices": ["police station", "university", "mall", "carwash"], "correct_choice_idx": 1, "direct_answers": ["college", "university building", "school", "school", "school", "school", "school", "university", "university", "school"], "difficult_direct_answer": false, "rationales": ["There are people who look to be in their 20s, playing.", "It is a brick and metal building with many windows that is at least five stories tall and wider than it is tall. there are many healthy young adults lounging and engaging in recreational activities outside the building.", "The building is large and brick with a big lawn in front and many young people present. this would be typical of a school setting."], "image": "train2014/COCO_train2014_000000473095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579226, "question_id": "GCTfeww5yn2PtiWhHLgqzG", "question": "What language are the words on the clock written in?", "choices": ["greek", "hebrew", "latin", "spanish"], "correct_choice_idx": 2, "direct_answers": ["latin", "latin", "roman", "latin", "latin", "latin", "latin", "latin", "latin", "latin"], "difficult_direct_answer": false, "rationales": ["They are in a foreign language.", "This is the old language that not many people know or use now", "This language is latin which was the common language of the catholic church many years ago."], "image": "val2014/COCO_val2014_000000579226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527631, "question_id": "GCUCxJMg4AE7w4EovqR7am", "question": "Which of these foods on the plate are highest in fat?", "choices": ["fries", "bread", "cheese", "onions"], "correct_choice_idx": 2, "direct_answers": ["fries", "fries", "fries", "fries", "fries", "cheese", "sandwich", "fries", "fries", "meat"], "difficult_direct_answer": false, "rationales": ["Bread, fries, and onions are mostly carbohydrates.", "Most of the fat stays with this when it melts", "This plate contains many fried foods which are high in fat. however it is a well known fact that cheese has a high content of saturated fat."], "image": "val2014/COCO_val2014_000000527631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345973, "question_id": "GCcNwBz8cDgJSRboKmbZQY", "question": "What setting to the animals shown here prefer?", "choices": ["desert", "tropical", "savannah", "tundra"], "correct_choice_idx": 2, "direct_answers": ["savannah", "grass", "shade", "savannah", "park", "zoo", "wild", "nature", "shade", "forest"], "difficult_direct_answer": false, "rationales": ["These are african animals", "Giraffes are grazing. giraffes graze in the savannah.", "Giraffes are from the african savannah and would be more at home in said location."], "image": "train2014/COCO_train2014_000000345973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7274, "question_id": "GCxXj8DBYMGue2ghodw3w8", "question": "What meal are the people most likely eating at the table?", "choices": ["dessert", "breakfast", "dinner", "lunch"], "correct_choice_idx": 2, "direct_answers": ["dinner", "dinner", "dinner", "lunch", "omelette", "dinner", "dinner", "lunch", "dinner", "pizza"], "difficult_direct_answer": false, "rationales": ["The people are eating a meal that consists of a flatbread dish and salad. these food items typically would not be served at breakfast or lunch.", "They are having dinner.", "The room looks like it's dark."], "image": "val2014/COCO_val2014_000000007274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451169, "question_id": "GDF7tybR6wkUUkQbvXDqrP", "question": "What is in the man's hand?", "choices": ["baseball bat", "basketball", "egg", "tennis racquet"], "correct_choice_idx": 3, "direct_answers": ["racket", "tennis racquet", "tennis racket", "racket", "racket", "tennis racquet", "racket", "tennis racket", "racquet", "racket"], "difficult_direct_answer": false, "rationales": ["He is holding a tennis raquet.", "The man is swinging a racquet on a tennis court.", "The man is holding something to hit a tennis ball."], "image": "train2014/COCO_train2014_000000451169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439865, "question_id": "GDGDNN7y3r7jZUDphGnWiJ", "question": "What is the bike being used to transport?", "choices": ["mail", "hamper", "dogs", "water"], "correct_choice_idx": 1, "direct_answers": ["laundry", "laundry basket", "laundry", "hamper", "laundry basket", "basket", "laundry", "laundry", "dog", "people"], "difficult_direct_answer": false, "rationales": ["There is a laundry basket on the back of the bike.", "The bike has a hamper.", "The bike has a laundry hamper attached to it while the dog and water are on the ground, and there is no mail in sight."], "image": "train2014/COCO_train2014_000000439865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50404, "question_id": "GDW89f34MCGpikLQ2oETmw", "question": "What is the opposite of the first word found on the blue sign?", "choices": ["frog", "yes", "down", "go"], "correct_choice_idx": 1, "direct_answers": ["yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes"], "difficult_direct_answer": false, "rationales": ["The word on the sign is no", "The opposite of yes is no.", "The blue sign indicates no outlets in the area but the opposite is that there are outlets in the area."], "image": "train2014/COCO_train2014_000000050404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370522, "question_id": "GDZfg9qDaxnhMzPC49pBrk", "question": "The thing that is under the desk belongs to what family?", "choices": ["canidae", "addams", "partridge", "bovidae"], "correct_choice_idx": 0, "direct_answers": ["canidae", "pet lovers", "canine", "dog", "happy family", "canidae", "canidae", "home owners", "canidae", "canine"], "difficult_direct_answer": false, "rationales": ["The animal under the desk is a dog and belongs to the canine family.", "The thing is a canidae.", "There is a dog under the table."], "image": "train2014/COCO_train2014_000000370522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518303, "question_id": "GDvSG59L44NoNBDYjGsHEP", "question": "What has been done to the license plate?", "choices": ["made dirty", "burned", "number erased", "cut"], "correct_choice_idx": 2, "direct_answers": ["erased", "drawn", "painted over", "blurred out", "painted over", "blurred", "painted over", "blank", "number erased", "whited out"], "difficult_direct_answer": false, "rationales": ["The license plate is supposed to have letters and numbers, but someone removed them.", "The vehicle identification code that is expected to be on the plate is missing.", "The license plate is clean and is not damaged. the unique identifier is missing."], "image": "train2014/COCO_train2014_000000518303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501225, "question_id": "GDvWqbbBfWt7dEiBAEVu7R", "question": "What should this person be wearing?", "choices": ["helmet/kneepads", "cap", "sweater", "gloves"], "correct_choice_idx": 0, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet/kneepads", "helmet"], "difficult_direct_answer": false, "rationales": ["Safety gear is important because they might fall.", "The person on the skateboard should be wearing a helmet and knee pads in case they fall.", "He could easily fall while skateboarding and land on his knees or head and would have no protection."], "image": "val2014/COCO_val2014_000000501225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477831, "question_id": "GEhadFfw8hQyduy9fCp5GN", "question": "What is near the hydrant?", "choices": ["sign", "egg", "baby", "cow"], "correct_choice_idx": 0, "direct_answers": ["sidewalk", "street", "sign", "road sign", "sign", "sign", "sign", "traffic sign", "road", "sidewalk"], "difficult_direct_answer": false, "rationales": ["The 14-10 on the pole is the closest thing.", "There is a yellow sign that is stating the height of a place cars will have to pass through.", "There is a board with numbers indicating height restrictions for the road"], "image": "train2014/COCO_train2014_000000477831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2774, "question_id": "GF3GK3fnEiPvhF5B7RjgvB", "question": "The man in brown Controls what?", "choices": ["grass", "kite", "flowers", "dog"], "correct_choice_idx": 1, "direct_answers": ["kite", "kite", "kite", "kite", "kite", "kite", "kite", "kite", "kite", "kite"], "difficult_direct_answer": false, "rationales": ["He is looking up at the kite and you can see a string in his hand.", "The man is the only one flying a kite.", "The man is holding a control string. flowers and grass cannot be controlled, and there is no dog."], "image": "train2014/COCO_train2014_000000002774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433938, "question_id": "GFwNmhRejkiqTopWTygAZr", "question": "What are the boys doing in the sand near the shoreline?", "choices": ["building castles", "tunneling", "fighting", "eating"], "correct_choice_idx": 0, "direct_answers": ["building sandcastles", "building castles", "building castles", "talking", "sandcastle", "building sandcastles", "building sandcastles", "sand castles", "building castle", "making castles"], "difficult_direct_answer": false, "rationales": ["The boys are building a structure with the sand.", "We see piles of sand and neat pail shaped sand structures next to these boys.", "The boys are pushing the sand into large structures."], "image": "train2014/COCO_train2014_000000433938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122839, "question_id": "GG2SDrxATqApEYwcYMia6c", "question": "What is the statue holding?", "choices": ["torch", "chicken", "cane", "pistol"], "correct_choice_idx": 2, "direct_answers": ["hat", "hat cane", "hat", "cane", "hat", "hat", "hat cane", "hat", "hat", "hat cane"], "difficult_direct_answer": false, "rationales": ["The statue has a cane in his hand.", "It is easy to surmise what the statue is holding, because of the shape,", "A statue of a man is holding a long stick with one end curved into a handle. people use canes with a handle on one end."], "image": "val2014/COCO_val2014_000000122839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149812, "question_id": "GGAZcCyyNjWUUoCbZsDh37", "question": "What did this tennis player just do?", "choices": ["returned ball", "lost", "quit", "served"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "hit ball", "hit ball", "smash ball", "hit ball", "hit ball", "return", "returned ball", "hit ball", "swing"], "difficult_direct_answer": false, "rationales": ["The players is hitting the green ball back.", "The way they're positioned seems to indicate this. if they were b, they would have a different form.", "The tennis player just hit a ball."], "image": "train2014/COCO_train2014_000000149812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496522, "question_id": "GGWsDkyd6vpztho3wCwnGb", "question": "What is the smaller bird in between the two larger birds?", "choices": ["pigeon", "duck", "parakeet", "parrot"], "correct_choice_idx": 1, "direct_answers": ["duck", "baby duck", "duck", "not clear", "duck", "small", "duck", "baby swan", "duck", "duck"], "difficult_direct_answer": false, "rationales": ["There are two swans in between a smaller bird with a small beak.", "Two swans are in the water. a smaller bird with a bill is between two swans in the water.", "The small bird is a duck."], "image": "train2014/COCO_train2014_000000496522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109370, "question_id": "GGaxnWTuc9MZiTD26gjGFt", "question": "What game stage is the man involved in?", "choices": ["congratulating winner", "quitting", "return", "serving"], "correct_choice_idx": 3, "direct_answers": ["tennis", "set point", "tennis", "serving", "finals", "start", "serve", "serve", "serving", "tennis"], "difficult_direct_answer": false, "rationales": ["The game will be served.", "The player bounces the ball before they serve.", "He is getting ready to start the game by hitting the ball to his opponent."], "image": "val2014/COCO_val2014_000000109370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395975, "question_id": "GGdPnhJ4D7hjXAYqLQNCd4", "question": "What natural element decorates most fully vertically here?", "choices": ["fur", "stone", "velvet", "hair"], "correct_choice_idx": 1, "direct_answers": ["rock wall", "stone", "rocks", "stone", "stones", "stone", "stone", "rock", "stone", "stone"], "difficult_direct_answer": false, "rationales": ["Stone furnishes the fireplace shown vertically here.", "The wall with the fireplace is made from stones.", "There is stone on the wall."], "image": "val2014/COCO_val2014_000000395975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227512, "question_id": "GGqC5jzDFEgQkhmLRZsDe7", "question": "What vegetable is the unusual one on the pizza?", "choices": ["onions", "green pepper", "mushrooms", "corn"], "correct_choice_idx": 3, "direct_answers": ["corn", "bell pepper", "onion", "pizza", "corn", "corn", "green onion", "green onion", "mushroom", "corn"], "difficult_direct_answer": false, "rationales": ["Normally corn is not on pizza.", "Mushrooms, onions and green peppers are all very popular pizza ingredients. i have never seen yellow kernels on a pizza or as a pizza ingredient option.", "It's not a common pizza topping."], "image": "train2014/COCO_train2014_000000227512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200643, "question_id": "GGwB75GAwQ8p8X6Tq9oVFY", "question": "What country does this flag represent?", "choices": ["arab emirates", "italy", "egypt", "romania"], "correct_choice_idx": 0, "direct_answers": ["mexico", "algeria", "arab emirates", "unknown", "arab emirates", "united arab emeritus", "arab emirates", "monaco", "foreign", "iraq"], "difficult_direct_answer": false, "rationales": ["The uae flag is white red and green.", "The flag contains the pan-arab colors red, green, white and black.", "The flag of the uae is on the top of the boat."], "image": "val2014/COCO_val2014_000000200643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155811, "question_id": "GH4HfMgpgZetp7sBJ74pNS", "question": "What alerts people here of a safe crossing time?", "choices": ["policeman", "crossing guard", "walk light", "horses"], "correct_choice_idx": 2, "direct_answers": ["crosswalk light", "traffic signs", "light", "walking light", "traffic light", "light", "traffic lights", "traffic lights", "judgment", "walk light"], "difficult_direct_answer": false, "rationales": ["It is a crosswalk and there are walking lights to tell people when to go.", "In this city scenario lights indicate when it is safe to cross.", "The walk light alerts the crosses to safety when crossing."], "image": "val2014/COCO_val2014_000000155811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238448, "question_id": "GH4shAm8q6rqfNnYPKjXsQ", "question": "What is on the right and left of the clock?", "choices": ["cats", "reptiles", "birds", "statues"], "correct_choice_idx": 3, "direct_answers": ["figures", "small figurines", "statues", "figures", "building", "figurines", "angels", "statues", "angels", "statues"], "difficult_direct_answer": false, "rationales": ["There are 3d depictions of people.", "These are statues of people and on both sides of the clock.", "You can tell by the architecture as to what is near the clock faces."], "image": "train2014/COCO_train2014_000000238448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463021, "question_id": "GHLxvqfPc8sZFix7WTvFda", "question": "What is an unusual concern that people at this beach have?", "choices": ["sand quality", "temperature", "tide schedule", "time"], "correct_choice_idx": 3, "direct_answers": ["waves", "time", "closing time", "time", "on time", "being late", "time", "crabs", "time", "time"], "difficult_direct_answer": false, "rationales": ["It is hard to carry a watch and not lose it at the beach. there is a clock in the sand.", "Usually you don't care much about time when you go to the beach.", "The concern is time."], "image": "train2014/COCO_train2014_000000463021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312344, "question_id": "GHSiFRK55iQyysimDiz6Gk", "question": "According to the sign what can people do in this downtown?", "choices": ["dine", "fly", "skateboard", "bungee jump"], "correct_choice_idx": 0, "direct_answers": ["dine", "bicycle", "stop", "ride bikes", "cycle", "stop", "ride bikes", "ride bikies", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["A purple sign to the right reads 'downtown dining'. we can conclude there are restaurants nearby.", "The flag says dining on it", "The banner on the far right mentions this type of activity."], "image": "train2014/COCO_train2014_000000312344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375504, "question_id": "GHT5eb6m79LRsNw9nvyzen", "question": "What kind of view might be seen from the windows on the house?", "choices": ["dirt view", "sky view", "office view", "sea view"], "correct_choice_idx": 3, "direct_answers": ["ocean", "beach view", "beautiful shore", "oceanview", "sea view", "waterfront", "oceanic", "waterfront", "beach", "ocean"], "difficult_direct_answer": false, "rationales": ["A man is caring a surfboard on a beach. the ocean can be seen from many beaches.", "The house in the distance has windows that face the beach and the ocean.", "This area is beachy because of the sand."], "image": "train2014/COCO_train2014_000000375504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550760, "question_id": "GHZxFkmMWu79AeU5U2EnCd", "question": "What is the man helping the kid do?", "choices": ["chew gum", "brush teeth", "eat candy", "makeup"], "correct_choice_idx": 1, "direct_answers": ["brush teeth", "brush teeth", "brush teeth", "brushing", "brush teeth", "brush", "brush teeth", "brushing teeth", "eat", "eat"], "difficult_direct_answer": false, "rationales": ["He is holding a tooth brush in his hand", "The man is holding a toothbrush and putting it inside the child's mouth.", "The man is inserting a toothbrush in the child's mouth"], "image": "train2014/COCO_train2014_000000550760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134383, "question_id": "GHcHnjsyVB4Gj2ueasG5DF", "question": "What is the yellow bus about to do?", "choices": ["back up", "stop", "go", "park"], "correct_choice_idx": 2, "direct_answers": ["go", "stop", "turn", "make turn", "turn", "drive", "turn right", "turn left", "turn left", "stop"], "difficult_direct_answer": false, "rationales": ["The yellow bus is about to go over the intersection.", "The bus is stopped so it's next move is to go.", "The buses are waiting to go."], "image": "train2014/COCO_train2014_000000134383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19456, "question_id": "GHfWSXYjujMUqmRPrNbxsg", "question": "What is the man using to cook?", "choices": ["veggies", "chocolate", "meat", "fruit"], "correct_choice_idx": 1, "direct_answers": ["baking sheet", "chocolate", "baking sheet", "gloves", "baking sheet", "baking sheet", "sheet", "chocolate", "baking sheet", "baking sheet"], "difficult_direct_answer": false, "rationales": ["This is an ingredient in the dish he is making", "He is touching it while it is on a pan. he is a cook.", "He is making sweets."], "image": "val2014/COCO_val2014_000000019456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479805, "question_id": "GHxXdfTfzvbrzA9hYtfytD", "question": "What is making it difficult to see?", "choices": ["smoke", "glare", "darkness", "snow"], "correct_choice_idx": 1, "direct_answers": ["sun", "sun", "sunset", "sunlight", "dark", "sun", "sun", "sun", "sun", "glare"], "difficult_direct_answer": false, "rationales": ["The sun is low in the sky and it is causing a glare that makes it hard to see around it.", "The sun's glare makes it hard to see properly.", "A sun is setting and shining over a town. sun causes glare."], "image": "val2014/COCO_val2014_000000479805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102630, "question_id": "GJVEp9i7NWGRNQvmFKMQhC", "question": "What is the man doing in the air?", "choices": ["tumble", "spin", "grab", "flip"], "correct_choice_idx": 2, "direct_answers": ["snowboarding", "skiing", "snowboarding", "grab", "snowboarding", "snowboarding", "skating", "snowboarding", "snowboard trick", "jumping"], "difficult_direct_answer": false, "rationales": ["Though he is doing a trick, he is grabbing the snowboard.", "The man is grabbing his snowboard midair.", "The man is right side up. he is touching his board."], "image": "train2014/COCO_train2014_000000102630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481039, "question_id": "GJrGsMhLjHTQekcSR3bomo", "question": "What is near the laptop?", "choices": ["sandwich", "book", "cheese bowl", "lamp"], "correct_choice_idx": 3, "direct_answers": ["mouse", "monitor", "lamp", "mouse", "light", "lamp", "light", "monitor", "mouse", "computer monitor"], "difficult_direct_answer": false, "rationales": ["A lightbulb in a lamp's fixture allows us to see this laptop on a desk.", "The lamp is hanging overhead.", "The lamp is near."], "image": "train2014/COCO_train2014_000000481039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431569, "question_id": "GJsfjGTKBCPqMjKZ23rYvh", "question": "What is the name of the animals present?", "choices": ["elephants", "cattle", "bears", "dogs"], "correct_choice_idx": 0, "direct_answers": ["elephants", "elephant", "elephants", "elephant", "elephant", "elephant", "elephants", "elephants", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["They are very large and have trunks and tusks", "Through the foliage we can see the leathery grey skin, the tusks and trunks which belong to an elephant.", "The animals are visible behind the tree and have visible attributes (size, trunks, tusks, color) that are known to be of elephants."], "image": "train2014/COCO_train2014_000000431569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530750, "question_id": "GJtLCiAbGty7ZZYdvpCrCA", "question": "What other sport might be undertaken in this situation?", "choices": ["rugby", "skydiving", "snowboarding", "tennis"], "correct_choice_idx": 2, "direct_answers": ["sking", "snowboarding", "skiing", "snowboarding", "happy", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboard"], "difficult_direct_answer": false, "rationales": ["People love to ski on the run but they also love snowboards.", "Skiing isn't the only recreation sport that can be done in snow nowadays.", "It is on a hill covered in snow, which is optimized for skiing and for snowboarding."], "image": "train2014/COCO_train2014_000000530750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376882, "question_id": "GKGJMgLvAt49cTJNgV3UGx", "question": "Where is this bench located?", "choices": ["park", "station", "stadium", "gym"], "correct_choice_idx": 1, "direct_answers": ["subway", "station", "city sidewalk", "train station", "sidewalk", "train station", "subway station", "station", "town", "train station"], "difficult_direct_answer": false, "rationales": ["A man is reading a book on the bench. there is a train that is blurred out behind it going fast.", "With the subway cars in the background you can tell where he i.", "There is cement on the ground and a train is zooming past."], "image": "train2014/COCO_train2014_000000376882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549612, "question_id": "GKTjkT4778ZzRyyo7n8YVB", "question": "What type of animals are present?", "choices": ["cattle", "giraffe", "deer", "sheep"], "correct_choice_idx": 1, "direct_answers": ["giraffe", "giraffe", "giraffe human", "giraffes", "giraffe", "giraffe", "giraffe", "giraffe human", "giraffe", "giraffe"], "difficult_direct_answer": false, "rationales": ["The animal is orange and white and has a long neck.", "A giraffe in the cage.", "The animal is very tall with a very long neck. its skin has a very distinct brown and tan pattern."], "image": "train2014/COCO_train2014_000000549612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269394, "question_id": "GKmxCXxaN2wr6qfY8edt9a", "question": "What company makes the cards associated with the machine?", "choices": ["apple", "google", "visa", "amazon"], "correct_choice_idx": 2, "direct_answers": ["usa", "visa", "visa", "visa", "visa", "visa", "mastercard", "visa", "mastercard", "mastercard"], "difficult_direct_answer": false, "rationales": ["Their name is written on it", "Visa is a major credit card company.", "Visa is a credit card company."], "image": "val2014/COCO_val2014_000000269394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42529, "question_id": "GKnkc69u3PwEmjDbr26Brp", "question": "Which amusement allows the greatest view here?", "choices": ["ferris wheel", "clock base", "car drive", "ball player"], "correct_choice_idx": 0, "direct_answers": ["ferris wheel", "ferris wheel", "ferris wheel", "ferris wheel", "clock tower", "ferris wheel", "ferris wheel", "big ben", "ferris wheel", "eye"], "difficult_direct_answer": false, "rationales": ["The ferris wheel in the background affords the best view because of its height.", "The amusement park ride seen in the background on the left side of this image would allow the highest vantage point of any here listed.", "There is a wheel behind the building."], "image": "train2014/COCO_train2014_000000042529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508403, "question_id": "GL3Fhr6FEGP8KPgipzDXTb", "question": "Why is he wearing a glove?", "choices": ["to catch", "fashion", "warmth", "costume"], "correct_choice_idx": 0, "direct_answers": ["playing baseball", "playing baseball", "catch baseball", "to catch", "catch ball", "baseball", "protect hand", "catch ball", "to catch", "catching ball"], "difficult_direct_answer": false, "rationales": ["His uniform and surroundings suggest that he is playing the game of baseball, and a glove is needed to catch the baseball as it would hurt the bare hand.", "A younger boy is playing baseball as he slides down to grass to grab a ball in mit.", "Baseball players wear a glove to pad and protect their hand from the ball."], "image": "train2014/COCO_train2014_000000508403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576684, "question_id": "GLJ9LeX94oaaWZQNvhSyM5", "question": "Which one of these companies makes that type of headgear that he's wearing?", "choices": ["stetson", "speedo", "adidas", "canada goose"], "correct_choice_idx": 0, "direct_answers": ["stetson", "cowboy apparel", "unknown", "resistor", "stetson", "wilson", "cowboy hat", "stetson", "cowboy hat", "stetson"], "difficult_direct_answer": false, "rationales": ["He is wearing a cowboy hat on his head, not a coat, bathing suit, or shoes.", "There are stetson cowboy hats.", "This is specifically a hat brand."], "image": "train2014/COCO_train2014_000000576684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197780, "question_id": "GLZ7w6UhRhJE456tAK439J", "question": "Which character is known for wearing a similar item on their head to this man?", "choices": ["han solo", "darth vader", "beaver cleaver", "rambo"], "correct_choice_idx": 3, "direct_answers": ["john mcenroe", "mri t", "richard simmons", "mana", "rambo", "agassi", "headband", "karate kid", "karate kid", "bret michaels"], "difficult_direct_answer": true, "rationales": ["Rambo wears a headpiece.", "Sylvester stalone wore a handkerchief around his forehead in a movie.", "The character is rambo."], "image": "train2014/COCO_train2014_000000197780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514860, "question_id": "GLmJgxN7S4fa7E9WD4dcon", "question": "What will this couple use to dine here?", "choices": ["chopsticks", "knives", "forks", "spoons"], "correct_choice_idx": 0, "direct_answers": ["plates", "chopsticks", "chopsticks", "chop sticks", "money", "plates", "chopsticks", "chopsticks", "chopsticks", "chopsticks"], "difficult_direct_answer": false, "rationales": ["These utensils are sitting on both plates.", "They are laying on the plate with the food", "Chopsticks have been placed on their dishes."], "image": "val2014/COCO_val2014_000000514860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29327, "question_id": "GLyoyEgFmbaa8n9aLmaz9R", "question": "How would the animal in the middle be described in relation to the other two?", "choices": ["fatter", "smaller", "wider", "taller"], "correct_choice_idx": 1, "direct_answers": ["child", "smaller", "child", "shorter", "child", "in between", "shorter", "predator", "kid", "baby giraffe"], "difficult_direct_answer": false, "rationales": ["The animal is smaller.", "The animal in the middle is a zebra and the others are giraffes. visually both the giraffes are larger than the zebra.", "The giraffe in the center is a baby giraffe and has not yet reached its full height."], "image": "train2014/COCO_train2014_000000029327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445495, "question_id": "GMBXYnnuj5WE5mRHLLYWQS", "question": "This animal will have how many teeth when it is an adult?", "choices": ["60", "42", "50", "25"], "correct_choice_idx": 1, "direct_answers": ["42", "42", "42 permanent", "22", "42", "42", "forty two", "forty", "42", "42"], "difficult_direct_answer": false, "rationales": ["This is the normal healthy number", "The animal is a dog. it will have 20 teeth on the top and 22 on the bottom.", "The animal visible is a dog and dog's have 42 adult teeth on average."], "image": "train2014/COCO_train2014_000000445495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563743, "question_id": "GMLu5e79erwdypg4GAfTWP", "question": "What is the woman standing in front of?", "choices": ["vending machine", "slot machine", "claw machine", "wall"], "correct_choice_idx": 0, "direct_answers": ["fridge", "camera", "beverages", "vending machine", "vending machine", "vending machine", "soft drinks", "cooler", "teddy bear", "drinks"], "difficult_direct_answer": false, "rationales": ["There are drink selections on the screen", "A girl is standing in front of a machine with sodas in it. vending machines are used to sell drinks and snacks.", "The woman is in front of a vending machine since it has so many drinks and beverages."], "image": "train2014/COCO_train2014_000000563743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447130, "question_id": "GMY8pk4sD7JXiURwhNsoUs", "question": "What type of park is this?", "choices": ["dog", "car", "skateboard", "aquatic"], "correct_choice_idx": 2, "direct_answers": ["playing park", "skate", "skate park", "skate", "skateboard", "skateboard", "skate", "skateboarding park", "skate park", "skate park"], "difficult_direct_answer": false, "rationales": ["The man is skating in the skatepark.", "There are skateboarders and ramps.", "This is a course that was based off originally using empty pools"], "image": "train2014/COCO_train2014_000000447130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376084, "question_id": "GMsv4x5tAdQphptCNq8Agh", "question": "When the driver continues going straight what is at risk of getting run over?", "choices": ["traffic cones", "nothing", "tires", "worker"], "correct_choice_idx": 0, "direct_answers": ["cones", "orange cone", "cones", "pylons", "traffic cones", "cones", "cone", "cones", "cones", "cones"], "difficult_direct_answer": false, "rationales": ["The driver wants to avoid the cones.", "The driver is by traffic cones.", "The driver is driving a truck towards road work indicators that are directly in front of the truck in the roadway."], "image": "train2014/COCO_train2014_000000376084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441116, "question_id": "GN3v9Qqm7jnXJyKoRaHST5", "question": "What language seems to be on the back of the train?", "choices": ["italian", "german", "asian", "american"], "correct_choice_idx": 2, "direct_answers": ["chinese", "asian", "chinese", "japanese", "japanese", "chinese", "chinese", "japanese", "chinese", "japanese"], "difficult_direct_answer": false, "rationales": ["The language looks chinese.", "The language is asian.", "There are characters on the train that are used in asia."], "image": "train2014/COCO_train2014_000000441116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532976, "question_id": "GNN9aa2GFc7Shxsbcpmbjy", "question": "What is the yellow truck doing?", "choices": ["transporting goods", "receiving donation", "mobile library", "selling food"], "correct_choice_idx": 3, "direct_answers": ["selling food", "selling food", "selling food", "selling food", "parking", "selling", "serving food", "selling food", "selling food", "selling food"], "difficult_direct_answer": false, "rationales": ["It's a food truck and it's there making and selling food.", "A vehicle is parked in a public area by a food truck. there are people gathered around eating.", "This is a classic food truck at a park where there is another food truck in front selling tacos."], "image": "val2014/COCO_val2014_000000532976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492399, "question_id": "GNSMJJo4rfjgDzAKuNbf3q", "question": "The woman in the blue sweater is holding a device matching which console?", "choices": ["xbox", "nintendo switch", "playstation", "nintendo wii"], "correct_choice_idx": 3, "direct_answers": ["wii", "wii", "wii", "wii", "wii", "wii", "wii", "wii", "nintendo wii", "wii"], "difficult_direct_answer": false, "rationales": ["The device is white. it is shaped like a remote.", "The controller is a proprietary controller for a specific video game console.", "The woman is holding a white rectangular controller."], "image": "train2014/COCO_train2014_000000492399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524627, "question_id": "GNxrcacr3t9LK6HeezdkDb", "question": "What is the main type of food being served?", "choices": ["seafood", "pastry", "fruit", "fondu"], "correct_choice_idx": 1, "direct_answers": ["donuts", "pizza", "sweets", "pizzas", "junk food", "donuts", "sundae", "pastry", "donuts", "pizza"], "difficult_direct_answer": false, "rationales": ["There is a sweet treat being served.", "The people are eating doughnuts and other desserts.", "The food is a sweet bread item."], "image": "val2014/COCO_val2014_000000524627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537968, "question_id": "GPF99gtWtD7wPBPBaafJBs", "question": "The orange rimmed building probable sells which of these products?", "choices": ["shoes", "gas", "flowers", "televisions"], "correct_choice_idx": 1, "direct_answers": ["magazines", "gas", "gas", "gas", "gas", "gas", "gasoline", "gasoline", "gas", "things"], "difficult_direct_answer": false, "rationales": ["This building is a gas station as cars can be seen filling up.", "A small building on a busy street has cars lined up at it.", "The rim is for gas."], "image": "train2014/COCO_train2014_000000537968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549898, "question_id": "GPJYxZuXidBW5qKMpF6Zv6", "question": "What should the man wearing a white helmet do?", "choices": ["turn right", "back up", "speed up", "slow down"], "correct_choice_idx": 3, "direct_answers": ["pull over", "save", "stop bike", "police", "slow down", "yield", "slow down", "helmet", "slow down", "drive straight"], "difficult_direct_answer": false, "rationales": ["The man needs to slow down.", "The man wearing a white helmet is going straight but should slow down because someone is turning and might not be paying attention.", "The man is on a motorcycle. motorcycles are dangerous. accidents are less dangerous at slower speeds."], "image": "train2014/COCO_train2014_000000549898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236818, "question_id": "GPJq7M43JUdZQrqGLQqcBC", "question": "Where is the man in the uniform walking from?", "choices": ["football field", "parking lot", "tennis court", "baseball field"], "correct_choice_idx": 3, "direct_answers": ["field", "field", "field", "field", "bullpen", "baseball field", "field", "dugout", "baseball field", "baseball field"], "difficult_direct_answer": false, "rationales": ["He is carrying a glove and a bag filled with training equipment which indicates that he is likely a pitcher who was warming up before the start of a game.", "The man is at a diamond.", "The man is wearing a uniform and a baseball field is visible in the background based on the baselines and interior grass."], "image": "train2014/COCO_train2014_000000236818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484362, "question_id": "GPXoGA8h83qKvxhmGDAwKw", "question": "What is the man with dreadlocks attempting?", "choices": ["stop frisbee", "make drink", "pat back", "say hello"], "correct_choice_idx": 0, "direct_answers": ["block frisbee", "block", "block", "block", "block", "defense", "blocking", "blocking", "block", "stop frisbee"], "difficult_direct_answer": false, "rationales": ["The person with dreads is trying to steal the frisbee.", "You can tell by the activity they are doing as to what the man is attempting to do.", "Because his left hand and arm are extended outward and in line with the direction that the frisbee is being aimed."], "image": "train2014/COCO_train2014_000000484362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563243, "question_id": "GQTjuCosbb2FAp8CeWfBn3", "question": "What item is in the officer's breast pocket?", "choices": ["phone", "badge", "taser", "walkie talkie"], "correct_choice_idx": 3, "direct_answers": ["radio", "walkie talkie", "pen", "walkie talkie", "badge", "walkie talkie", "walkie talkie", "walkie talkie", "walkie talkie", "walkie talkie"], "difficult_direct_answer": false, "rationales": ["The small black square device with antenna is a radio this officer uses to stay in touch with his colleagues.", "The officer has a radio in his shirt pocket.", "The item is the walkie talkie."], "image": "train2014/COCO_train2014_000000563243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289644, "question_id": "GQUvo5BtAQkXg4PEiPUg6x", "question": "What protein is in this dish?", "choices": ["beef", "venison", "bison", "chicken"], "correct_choice_idx": 3, "direct_answers": ["chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["Chicken is a white meat and the others aren't.", "The meat is white. beef, venison, and bison are red meats.", "The meat is white"], "image": "train2014/COCO_train2014_000000289644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404033, "question_id": "GQfWgmTWVTCPiwH283Yuhu", "question": "What is the man attempting to block with his fingers?", "choices": ["sound", "dust", "pollen", "water"], "correct_choice_idx": 0, "direct_answers": ["ears", "noise", "noise", "sound", "ears", "noise", "noise", "noise", "noise", "ears"], "difficult_direct_answer": false, "rationales": ["The man wants to block sound.", "There is a room full of people making noise.", "The annoyed look on his face and phone to his other ear let's us conclude he is trying to block out the noise from the crowd to take his call."], "image": "train2014/COCO_train2014_000000404033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384496, "question_id": "GRW4TpZ8gBp2fBi5vS8sq7", "question": "What is he doing?", "choices": ["stealing horses", "plowing field", "feeding horses", "riding horses"], "correct_choice_idx": 1, "direct_answers": ["plowing field", "plowing", "plowing", "plowing field", "plowing", "plowing field", "plowing", "plowing field", "plowing", "ploughing"], "difficult_direct_answer": false, "rationales": ["He's plowing.", "Draft horse are very helpful for tilling.", "The horses are ploughing the field for the man."], "image": "val2014/COCO_val2014_000000384496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109778, "question_id": "GRYRUtVYeNgXVQUTvRpttz", "question": "What other menu item are they eating besides pizza?", "choices": ["steak", "potatoes", "salad", "fries"], "correct_choice_idx": 2, "direct_answers": ["salad", "salad", "salad", "salad", "salad", "salad", "salad", "salad", "greens", "salad"], "difficult_direct_answer": false, "rationales": ["There is a plate on the table with lettuce and other toppings that looks like a salad.", "They're eating salad.", "The rectangular plate near the pizza does not have fries, potatoes, or steaks."], "image": "train2014/COCO_train2014_000000109778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280423, "question_id": "GRsN27n7EtJ2D29PuqQVRz", "question": "What session of the day is shown in the photo?", "choices": ["dawn", "afternoon", "morning", "evening"], "correct_choice_idx": 1, "direct_answers": ["train", "afternoon", "afternoon", "morning", "afternoon", "afternoon", "bullet train", "afternoon", "afternoon", "afternoon"], "difficult_direct_answer": false, "rationales": ["The clock on the platform says the time is 14:55.", "The time on the clock is in the afternoon.", "The sign shows that it is 14:55 which is 2 in the afternoon."], "image": "train2014/COCO_train2014_000000280423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300753, "question_id": "GRt92gH5DXboWPAfa9Lj3a", "question": "What did the batter just do?", "choices": ["ran home", "hit ball", "struck out", "missed"], "correct_choice_idx": 1, "direct_answers": ["hit ball", "hit ball", "hit", "swing", "hit", "hit", "just batted", "swing bat", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The batter just hit the ball.", "Batters go up to try to hit the ball. this man is running and the ball is on the ground halfway toward the pitcher.", "He swung the bat at it"], "image": "val2014/COCO_val2014_000000300753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99511, "question_id": "GSBjWNqzbgpRCKi5iUnu5D", "question": "Who was famous for doing what the person with the name tag is doing?", "choices": ["ansel elgort", "hansel robles", "ansel adams", "emmanuel"], "correct_choice_idx": 2, "direct_answers": ["photographer", "ansel adams", "photography", "peter parker", "ansel adams", "andy warhol", "scientist", "ansel adams", "peter parker", "photography"], "difficult_direct_answer": false, "rationales": ["A person with a name tag is taking pictures. ansel adams was a famous photographer.", "Ansel adams was famous.", "He was a famous photographer."], "image": "train2014/COCO_train2014_000000099511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422432, "question_id": "GSWecamoemymKMM3nfn4Kn", "question": "What's the name for the body type of the red vehicle?", "choices": ["pickup", "hatchback", "wagon", "sedan"], "correct_choice_idx": 0, "direct_answers": ["truck", "pickup truck", "pickup truck", "truck", "truck", "truck", "truck", "pickup", "pickup truck", "truck"], "difficult_direct_answer": false, "rationales": ["The pickup truck is the vehicle that's red.", "The truck has a bed in the back. trucks of this type are known as answer a.", "It is a vehicle with a small cab and large area in the back for carrying items"], "image": "val2014/COCO_val2014_000000422432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339521, "question_id": "GSXpE9zJmg2z25WfmZvrHZ", "question": "What is the coldest place in the area?", "choices": ["by clock", "by building", "by tree", "shaded area"], "correct_choice_idx": 3, "direct_answers": ["shaded area", "shade", "snow ground", "left", "snow place", "ground", "ground", "ground", "ground", "outside"], "difficult_direct_answer": false, "rationales": ["The coldest place is the shaded area without sunlight.", "The shaded area will be colder when you are out of the sun.", "The sun can't shine in the shade. sun creates warmth."], "image": "train2014/COCO_train2014_000000339521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553640, "question_id": "GSbcrtab7RsvQu3Pc8erDr", "question": "What are the people in the middle of?", "choices": ["wedding", "birthday", "funeral", "graduation"], "correct_choice_idx": 0, "direct_answers": ["wedding", "courtyard", "wedding", "sidewalk", "church", "wedding", "wedding", "wedding", "wedding", "church"], "difficult_direct_answer": false, "rationales": ["The people are getting married.", "The woman is wearing a dress that is traditional for this event.", "She is wearing a wedding dress and holding a bouquet."], "image": "train2014/COCO_train2014_000000553640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514860, "question_id": "GSoJwmwgATGHbZSxpA7YXB", "question": "How is the man's sight without assistance?", "choices": ["colorblind", "blind", "perfect", "impaired"], "correct_choice_idx": 3, "direct_answers": ["poor", "poor", "needs glasses", "not good", "poor", "poor", "impaired", "bad", "so so", "probably bad"], "difficult_direct_answer": false, "rationales": ["A man wears glasses. people use glasses to see better.", "Humans wear glasses because we all agreed to run the world seeing with 20/20 vision. without his glasses the man would miss out on the sizes of lettering the world commonly uses.", "Option a describes a condition that occurs unless one wears glasses."], "image": "val2014/COCO_val2014_000000514860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168758, "question_id": "GSpg8igt6LbTqGrSxLEEsY", "question": "What is the number 16064 written on?", "choices": ["bathroom wall", "seat", "train", "poster"], "correct_choice_idx": 2, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["The only vehicle depicted is a train.", "The number is written on a vehicle which travels on rails, thus corresponding to the item identified in option a.", "16064 is written on a blue vehicle. it travels on tracks."], "image": "train2014/COCO_train2014_000000168758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293362, "question_id": "GT37eHr6XsD88nVs9nrbFL", "question": "What type of bread is being used here?", "choices": ["hoagie bun", "croissant", "bagel", "hamburger bun"], "correct_choice_idx": 1, "direct_answers": ["croissant", "crossaint", "croissant", "croissant", "flat bread", "croissant", "croissant", "croissant", "croissant", "baguette"], "difficult_direct_answer": false, "rationales": ["The shape of this baked good along with its flaky crust can be seen.", "The bread is flaky and appears to have been folded before baking it, and it is an oblong shape that is slightly curved on the long side and has a dull point at each end.", "It has a somewhat moon shape and the folds can be seen on the top from rolling it before it was cooked"], "image": "train2014/COCO_train2014_000000293362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172408, "question_id": "GT97HZjDHY9ZXrrFLa63im", "question": "How will the skateboard land?", "choices": ["on end", "sideways", "won wheels", "upside down"], "correct_choice_idx": 3, "direct_answers": ["upside down", "on board", "on ground", "upside down", "broken", "down", "upside down", "badly", "upside down", "on skateboard"], "difficult_direct_answer": false, "rationales": ["The wheels of the skateboard are facing the sky, in the direction identified in option a.", "This skilled skater will successfully flip his board", "The person flipped their skateboard in the air. it is now falling back to the ground with the part that is ordinarily facing up, facing the ground."], "image": "val2014/COCO_val2014_000000172408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324521, "question_id": "GTAcciPYsHejbmwfVsu3E8", "question": "What type of cat is this?", "choices": ["long hair", "siamese", "short hair", "sphynx"], "correct_choice_idx": 2, "direct_answers": ["ginger tabby", "white tabby", "house cat", "domestic shorthair", "domestic", "short hair", "dangerous", "lazy", "cat", "domestic"], "difficult_direct_answer": true, "rationales": ["That cat's fur does not stick out very much from its body.", "The cat has short hair.", "The cat has short and fuzzy hair."], "image": "train2014/COCO_train2014_000000324521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125672, "question_id": "GTRKUuNmLBpXT5Hffug8K9", "question": "What is everyone looking at?", "choices": ["ball", "bat", "fence", "field"], "correct_choice_idx": 0, "direct_answers": ["ball", "ball", "softball", "batter", "ball", "ball", "ball", "batter", "baseball", "softball"], "difficult_direct_answer": false, "rationales": ["The catcher, umpire and batter are all looking at the ball as it sails above the catcher's mitt.", "They are playing baseball. the main focus is the ball.", "They are trying to look at the ball."], "image": "train2014/COCO_train2014_000000125672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182523, "question_id": "GTjeqQ9HgaE3cP2kjFuHib", "question": "What protective gear should the woman wear?", "choices": ["knee pads", "ear muffs", "helmet", "scarf"], "correct_choice_idx": 2, "direct_answers": ["goggles", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "goggles"], "difficult_direct_answer": false, "rationales": ["This protects your head in a fall", "They need to wear safety gear on their head so they don't hurt themselves.", "The woman is with two others on a mountain with sky equipment. the woman is the only one not wearing a helmet. helmets help reduce head injuries."], "image": "val2014/COCO_val2014_000000182523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574946, "question_id": "GTwYMPxB2pqmtbySz8k7BC", "question": "What mythical creature does the person who owns the pink boat favor?", "choices": ["peter pan", "mermaids", "tinkerbell", "pinnocchio"], "correct_choice_idx": 1, "direct_answers": ["snail", "mermaids", "mermaid", "snail", "mermaid", "mermaid", "snail", "mermaid", "mermaid", "mermaid"], "difficult_direct_answer": false, "rationales": ["There is one painted on the side of the boat.", "A mermaid can be seen painted in white on the side of the boat, as a human torso but a fin can be seen on it.", "A mermaid is painted on the side of the boat, as opposed to none of the other options listed, so by process of elimination thar can be assumed as the answer."], "image": "val2014/COCO_val2014_000000574946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481841, "question_id": "GTzfHKCi8jc32RXxR8NVXt", "question": "The brown liquid substance on the bottom of the plate is probably?", "choices": ["gravy", "salad dressing", "syrup", "oil"], "correct_choice_idx": 2, "direct_answers": ["sugar", "caramel", "syrup", "syrup", "no", "sauce", "soup", "sauce", "syrup", "syrup"], "difficult_direct_answer": false, "rationales": ["This is a donut dessert so it will be a sweet sauce", "Fried baked goods resembling donuts are on a plate with a brown liquid. syrup is brown.", "These are breads in syrup."], "image": "train2014/COCO_train2014_000000481841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330522, "question_id": "GU2ZkvLeyrSm4d53ERjy75", "question": "What might you buy in this kiosk?", "choices": ["soda", "parking time", "game tokens", "stamps"], "correct_choice_idx": 1, "direct_answers": ["parking", "tickets", "parking time", "parking time", "parking ticket", "parking pass", "parking pass", "parking permit", "parking time", "parking time"], "difficult_direct_answer": false, "rationales": ["This is a kiosk to buy time to park here.", "This is where you can pay for your parking spot at", "This kiosk is design for allow drivers to pay for parking by using a credit card or other means but it indicates at the top that their is a two hour parking limit."], "image": "val2014/COCO_val2014_000000330522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443006, "question_id": "GUMmWiS623paagqaA7qEnw", "question": "These animals are doing what?", "choices": ["walking", "sleeping", "eating", "racing"], "correct_choice_idx": 0, "direct_answers": ["walking", "walking", "walking", "walking", "walking", "walking", "pilgrimage", "walking", "pilgrimage", "walking"], "difficult_direct_answer": false, "rationales": ["Giraffes are in a line together.", "They have a leg forward as they move", "Four giraffes are in an imperfect line facing away from a small body of water."], "image": "val2014/COCO_val2014_000000443006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425175, "question_id": "GUUP25YF9JJyoQo834K7nh", "question": "Which object would have the least amount of flavors in it?", "choices": ["donut", "water glass", "brown food", "soda glass"], "correct_choice_idx": 1, "direct_answers": ["water", "tortilla", "water", "unknown", "brown food", "water", "water", "water", "water glass", "water"], "difficult_direct_answer": false, "rationales": ["Water can seem to be basically flavorless.", "The object is a water glass.", "Water doesn't taste like anything, it's flavorless."], "image": "train2014/COCO_train2014_000000425175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387208, "question_id": "GUXe4Db2Je3iX6WyoVdn8e", "question": "In what place was the bread eaten here cooked?", "choices": ["grill", "oven", "deep fryer", "fryer"], "correct_choice_idx": 1, "direct_answers": ["bakery", "bakery", "steamer", "restaurant", "bakery", "oven", "restaurant", "restaurant", "oven", "restaurant"], "difficult_direct_answer": false, "rationales": ["The bread is not cooked in oil, only using heat, so the oven is the best option.", "Bread is baked in an oven.", "Bread is usually baked to be cooked."], "image": "val2014/COCO_val2014_000000387208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211771, "question_id": "GUaZs6KtpRjCbeHTZsyiDs", "question": "Where are these items being stored?", "choices": ["refrigerator", "cabinet", "shed", "box"], "correct_choice_idx": 1, "direct_answers": ["pantry", "cupboard", "cabinet", "cupboard", "pantry", "cabinet", "cabinet", "pantry", "pantry", "cabinet"], "difficult_direct_answer": false, "rationales": ["It's also likely a kitchen one.", "There are plates and glasses stacked in a closet like room.", "There are plates stacked up in a closet looking area."], "image": "train2014/COCO_train2014_000000211771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187852, "question_id": "GUc3PsECzVLDy4UYTdhbwX", "question": "What job do many of the people shown here share?", "choices": ["fire fighters", "journalism", "baby photographers", "politicians"], "correct_choice_idx": 1, "direct_answers": ["photos", "journalism", "journalism", "photographer", "media", "news", "reporter", "photography", "media people", "camera man"], "difficult_direct_answer": true, "rationales": ["A crowd of people have cameras and microphones. journalists have microphones and go on camera.", "These people have television newscameras and are reporting on an event.", "The people are journalists and have cameras in order to get photos of an event that will be in the news."], "image": "val2014/COCO_val2014_000000187852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452027, "question_id": "GUemkRG3c9ZWUxSVTwLfLe", "question": "How many distinct species of animals are in the field?", "choices": ["two", "four", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "four", "four", "three", "four", "three", "4 species", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are giraffes, deer, a pig and some kind of other animal.", "There are four species.", "There are three giraffes, several antelopes, a zebra, and a wildebeest."], "image": "val2014/COCO_val2014_000000452027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385517, "question_id": "GUfhfHsdoiEnoTMZxmBsXe", "question": "What is being held in this room?", "choices": ["conference", "church", "aa meeting", "paper route"], "correct_choice_idx": 0, "direct_answers": ["lecture", "meeting", "class", "wii sport", "class", "class", "gaming", "game session", "conference", "video games"], "difficult_direct_answer": false, "rationales": ["Looks like a meeting where they are doing different things.", "The conference is in the room.", "Seems as if there is meeting that was going on here."], "image": "train2014/COCO_train2014_000000385517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352357, "question_id": "GUnue5FG5ZNRT2QX9kssoU", "question": "From where did the most fragrant plant originate here?", "choices": ["tulip", "tree", "daffodil", "rose bush"], "correct_choice_idx": 3, "direct_answers": ["rose bush", "england", "garden", "england", "holland", "rose bush", "ground", "persia", "britain", "roses"], "difficult_direct_answer": false, "rationales": ["The white flower seen in the vase is identified as a rose, which comes from a rose bush.", "Roses have a strong smell and they come from a rose bush.", "The biggest flower is a rose."], "image": "train2014/COCO_train2014_000000352357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20136, "question_id": "GUrZnP67Ug3RKzLcojavaq", "question": "What might someone do if they sit at the table shown?", "choices": ["eat", "play cards", "protest", "gamble"], "correct_choice_idx": 0, "direct_answers": ["eat", "eat", "pose", "eat", "take picture", "take photo", "eat", "have lunch", "eat", "nothing"], "difficult_direct_answer": false, "rationales": ["The table is in a hallway of a commercial area. it has a menu on it, and the table and chairs around it are all dining style.", "You would eat at a table.", "The place is a food court. people usually at eat at the table when they're at a food court."], "image": "train2014/COCO_train2014_000000020136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196192, "question_id": "GVC3wWp6Y72bWkUNpudksN", "question": "What is written on the tags?", "choices": ["hotel", "destination", "good luck", "itinerary"], "correct_choice_idx": 1, "direct_answers": ["destination", "owner's name", "owner name", "price", "prices", "prices", "price", "clearances", "names", "names"], "difficult_direct_answer": false, "rationales": ["There are luggage tags on the luggage.", "Luggage tags are used to identify where the bags are going.", "Tags on luggage either refer to the owner's identification or will provide information regarding the next location that the luggage should be sent to."], "image": "val2014/COCO_val2014_000000196192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449463, "question_id": "GVE3YLGYjH2HijMhjBudED", "question": "What is the chairs on the elephant called?", "choices": ["stool", "howdah", "spinner", "recliner"], "correct_choice_idx": 1, "direct_answers": ["bench", "howdah", "stools", "seats", "howdah", "howdah", "carriage", "benches", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["That's what they call the elephant seat in southeast asia.", "Traditionally these chairs are called howdahs.", "It is a word originating in south asia, and the practice of riding elephants is popular there."], "image": "train2014/COCO_train2014_000000449463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32625, "question_id": "GVMw6CQv8y93UadRfWVQpt", "question": "What does the nearby metal utensil excel at?", "choices": ["sipping", "cutting", "scooping", "jabbing"], "correct_choice_idx": 3, "direct_answers": ["spearing", "forking", "lifting food", "poking spearing", "picking up", "eat food", "eating", "jabbing", "poking food", "puncturing"], "difficult_direct_answer": true, "rationales": ["The utensil is a fork. its tines are used to pierce the food so that it can then be brought up to the mouth to eat.", "There are prongs on the utensil to pick up food.", "The utensil on the table is a fork that is used for jabbing foods."], "image": "val2014/COCO_val2014_000000032625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50101, "question_id": "GVNaAxUvy54EgT6grcMBR4", "question": "What type of animal is shown?", "choices": ["domestic", "aquatic", "nocturnal", "wild"], "correct_choice_idx": 3, "direct_answers": ["giraffe", "giraffe", "4 giraffes", "giraffe", "4 giraffes", "wild", "giraffe", "giraffe", "giraffe", "giraffe"], "difficult_direct_answer": false, "rationales": ["The animal is wild.", "These animals are not pets nor fish", "There are several giraffes which are wild animals."], "image": "train2014/COCO_train2014_000000050101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268512, "question_id": "GVZ2r3zhYTsacdofvEFSqz", "question": "What type of area is shown?", "choices": ["residential", "rural", "private", "commercial"], "correct_choice_idx": 3, "direct_answers": ["street", "skate park", "street", "commercial", "downtown", "skatepark", "city", "city", "street", "skateboard stunts"], "difficult_direct_answer": false, "rationales": ["The area is commercial.", "This is an area with many cars and people skateboarding, so it's commercial.", "A city street is shown with many buildings. commercial areas are in cities."], "image": "train2014/COCO_train2014_000000268512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375021, "question_id": "GVjTZiG8ys2gkTzVT4BNfP", "question": "What person played a similar sport to this person?", "choices": ["alex morgan", "bo jackson", "jim kelly", "martina navratilova"], "correct_choice_idx": 3, "direct_answers": ["serena williams", "serena williams", "martina navratilova", "jiyugaoka", "serena williams", "serena williams", "serena williams", "serena williams", "serrina", "roger federer"], "difficult_direct_answer": false, "rationales": ["Martina is a famous tennis player, and the court and racquet used by this person suggests that they are playing tennis.", "The person is playing tennis and martina navratilova played tennis as well.", "She is also a tennis player."], "image": "val2014/COCO_val2014_000000375021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98322, "question_id": "GVmYL6FRYGHa8JuXmCpSMH", "question": "What would be the most appropriate beverage for the family to have?", "choices": ["milk", "iced tea", "cola", "coffee"], "correct_choice_idx": 3, "direct_answers": ["hot cocoa", "hot chocolate", "hot chocolate", "hot chocolate", "hot coffee", "cocoa", "coffee", "hot chocolate", "hot chocolate", "hot cocoa"], "difficult_direct_answer": false, "rationales": ["Coffee would be a hot beverage which is suitable to drink when it's cold out.", "The family should drink coffee to warm up.", "It is cold outside so a warm liquid can help beat the cold"], "image": "val2014/COCO_val2014_000000098322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453930, "question_id": "GVppbr6BkRNCYgnLKYvY7j", "question": "What is the woman holding the umbrellas hat shaped like?", "choices": ["dog", "red panda", "fox", "cat"], "correct_choice_idx": 1, "direct_answers": ["bear", "red panda", "fox", "bear", "bear", "teddy bear", "bear", "panda", "bear", "tanuki"], "difficult_direct_answer": false, "rationales": ["As indicated by the patterns on the face and ear tufts.", "The hat looks like a red panda.", "It has the markings and colors of the animal"], "image": "train2014/COCO_train2014_000000453930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144018, "question_id": "GVyrFsVQK4mNzp5K5JhdKV", "question": "What material are these boats made out of?", "choices": ["oak wood", "bamboo", "eucalyptus", "birch wood"], "correct_choice_idx": 1, "direct_answers": ["bamboo", "bamboo", "bamboo", "bamboo", "bamboo", "wood", "bamboo", "bamboo", "bamboo", "wood"], "difficult_direct_answer": false, "rationales": ["The boats are made out of long circular pieces of wood.", "These boats are made up of long thin wooden rods. this shape is associated with bamboo wood.", "They are very narrow poles which is how the wood grows"], "image": "train2014/COCO_train2014_000000144018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300008, "question_id": "GW8CK5LUFsuGm7yNxw6Z8u", "question": "In which Country do these bovines recline?", "choices": ["belgium", "germany", "united states", "india"], "correct_choice_idx": 3, "direct_answers": ["pakistan", "brazil", "india", "india", "brazil", "usa", "india", "brazil", "india", "indonesia"], "difficult_direct_answer": false, "rationales": ["These bovines recline in india.", "The country is india.", "They reside in india"], "image": "val2014/COCO_val2014_000000300008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180071, "question_id": "GWCmmYmQwNqoFoRVhdn8dp", "question": "Why is the man holding an umbrella?", "choices": ["to swim", "for cosplay", "to dance", "for shade"], "correct_choice_idx": 3, "direct_answers": ["for shade", "sunlight", "shade", "funny", "ironic effect", "shade", "reading", "shade", "balancing", "raining"], "difficult_direct_answer": false, "rationales": ["When you're outside on a sunny day, you want to be protected from the sunlight.", "The man wants some shade from the sun.", "It is a sunny day so you can tell why he is using the umbrella."], "image": "train2014/COCO_train2014_000000180071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118227, "question_id": "GWKhFvFaQkC2aopuaCw2vV", "question": "Is this toilet is wall hung type?", "choices": ["surface mount", "yes", "no", "impressed type"], "correct_choice_idx": 2, "direct_answers": ["no", "mounted floor", "nop", "no", "tv", "floor mounted", "no", "no", "no", "yes"], "difficult_direct_answer": false, "rationales": ["It is one that is placed on the ground. the body against the wall and the lid area needs something to support them from the bottom.", "It is mounted on the floor", "A standard toilet is in a bathroom. standard toilets are not connected to the wall."], "image": "train2014/COCO_train2014_000000118227.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457478, "question_id": "GWgoVaMRtiqkJ5F4ZZR6Yd", "question": "What is the man in the blue shirt looking at?", "choices": ["sky", "dog", "cameraman", "pool"], "correct_choice_idx": 1, "direct_answers": ["watching", "dog", "dog", "pool dog", "dog", "pool", "swimming dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The man's looking at a dog.", "The man is facing and watching an animal swim through the water.", "He is watching the canine swim in the water."], "image": "train2014/COCO_train2014_000000457478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232341, "question_id": "GWnmC4vyM7WnxXqWemFWCw", "question": "What are the fruits arranged to resemble?", "choices": ["bike", "car", "dog", "face"], "correct_choice_idx": 3, "direct_answers": ["face", "smile", "face", "face", "smile", "smile", "smiling face", "smiley face", "happy face", "smiley face"], "difficult_direct_answer": false, "rationales": ["The fruits are like a face.", "The two apples represent eyes. the banana represents a smile.", "The curve of a banana looks like a smile and the stem of an apple may resemble the pupils in an eye."], "image": "train2014/COCO_train2014_000000232341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20487, "question_id": "GWpAKQ2224KioZUygq5s7D", "question": "What are the two men doing?", "choices": ["painting", "watching scenery", "flying kite", "watching sunrise"], "correct_choice_idx": 2, "direct_answers": ["kite flying", "flying kites", "flying kites", "flying kites", "flying kite", "looking up", "flying kites", "kiting", "flying kite", "flying kites"], "difficult_direct_answer": false, "rationales": ["They are looking in the sky. something is flying in the air.", "They are looking at the kite in the sky they are flying", "Two men are standing together and looking up at two kites in the air."], "image": "val2014/COCO_val2014_000000020487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482910, "question_id": "GWtRuB8Gc3GXHavibZcraG", "question": "Player with what number threw the frisbee?", "choices": ["17", "one", "ten", "seven"], "correct_choice_idx": 1, "direct_answers": ["one", "seven", "seven", "seven", "seven", "seven", "one", "one", "one", "seven"], "difficult_direct_answer": false, "rationales": ["Number 1 looks like she is releasing the frisbee.", "The player with the number 1 threw it.", "The frisbee appears in front of the player and their arms are in the position one's would be after releasing the frisbee. the number of that player is visible on the front of their jersey."], "image": "val2014/COCO_val2014_000000482910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1637, "question_id": "GXEvwooXgWz5yJduxtts8v", "question": "What is the first terrain on the right?", "choices": ["cliff", "savanna", "valley", "mountain"], "correct_choice_idx": 0, "direct_answers": ["shoreline", "horse", "sand", "beach", "beach", "cliff", "klippe", "cliff", "sand", "cliff"], "difficult_direct_answer": false, "rationales": ["In the distance behind the water to the right we see the topology defining a cliff.", "The area to the right has big rocks overlooking the water.", "The terrain is a cliff."], "image": "train2014/COCO_train2014_000000001637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93329, "question_id": "GXLkkGUvNyQrt7MV5aNDFV", "question": "What style of facial hair is the man sporting?", "choices": ["extended goatee", "soul patch", "horseshoe", "mutton chops"], "correct_choice_idx": 3, "direct_answers": ["beard", "beard", "beard", "mutton chops", "mutton chops", "mutton chops", "mutton chops", "scruff", "beard", "beard"], "difficult_direct_answer": false, "rationales": ["A man has a beard that extends from his temples down into a full beard.", "By the shape of the hair growth on the mans face you can tell what type it is.", "Mutton chops is what's on his face."], "image": "val2014/COCO_val2014_000000093329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38539, "question_id": "GXbmETKS7PfGzCurWUzR38", "question": "What us reflecting in the glass?", "choices": ["fruit", "dogs", "lights", "clocks"], "correct_choice_idx": 2, "direct_answers": ["window", "light fixtures", "fluorescent lighting", "lights", "lights", "shade", "lights", "fluorescent lights", "flourescent lights", "lights"], "difficult_direct_answer": false, "rationales": ["There are fluorescent lamps being reflected.", "Lights are shown as blurry streaks.", "It is showing lit fixtures from the room"], "image": "val2014/COCO_val2014_000000038539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470976, "question_id": "GXnQK2nsk8zxikcxXwMYNZ", "question": "What is the name of dining candles?", "choices": ["tea light", "votives", "taper candles", "pillar candles"], "correct_choice_idx": 1, "direct_answers": ["candlesticks", "candelabra", "tapers", "votives", "candlesticks", "pillar candles", "pillar", "taper candles", "dinner candles", "candlesticks"], "difficult_direct_answer": false, "rationales": ["Long dining candles are called votive candles.", "These are called taper candles because they are more narrow at the top as compared to the bottom.", "These are long taper candles."], "image": "train2014/COCO_train2014_000000470976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332844, "question_id": "GXxMmNq2JyHjkbW3BtYctw", "question": "What is the woman doing with the poodle?", "choices": ["feeding it", "grooming it", "hitting it", "playing fetch"], "correct_choice_idx": 3, "direct_answers": ["walking", "playing", "playing fetch", "playing fetch", "fetch", "playing fetch", "playing fetch", "playing", "playing catch", "walking beach"], "difficult_direct_answer": false, "rationales": ["There is a stick in the water. the woman is playing a game where she throws the stick for the dog to retrieve and bring it back.", "She is about to pick up an item to throw for the dog", "The woman and dog are looking at a stick."], "image": "val2014/COCO_val2014_000000332844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409906, "question_id": "GXz5rJsdXNZ28DA76jtqnP", "question": "What is the yellow object on the billboard to the right?", "choices": ["paper", "rose", "candy", "fruit"], "correct_choice_idx": 1, "direct_answers": ["rose", "rose", "rose", "rose", "rose", "rose", "flower", "indicate", "rose", "rose"], "difficult_direct_answer": false, "rationales": ["It's an open one with the first indication of wilting.", "A flower is on the billboard.", "The flower has the style and shape of a rose."], "image": "train2014/COCO_train2014_000000409906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289883, "question_id": "GYBar2LbwWZ5p7speWK9Jt", "question": "What style design connects the two bus parts here?", "choices": ["invisible", "bolts", "accordion", "rope"], "correct_choice_idx": 2, "direct_answers": ["accordion", "tow", "accordion", "accordion", "accordion", "accordion", "accordion", "extended", "accordion", "accordion"], "difficult_direct_answer": false, "rationales": ["Two buses are connected by a section in the middle.", "The buses have a flexible black rubber piece connecting them.", "The style is an accordion."], "image": "val2014/COCO_val2014_000000289883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24974, "question_id": "GYEPn5DUAdcC2Wyf53jzdW", "question": "What type of surface is holding this vase?", "choices": ["table", "desk", "bench", "porch"], "correct_choice_idx": 0, "direct_answers": ["tile", "granite", "table", "table", "marble", "marble", "table", "flat", "marble", "granite"], "difficult_direct_answer": false, "rationales": ["Beneath the vase we see a small square surface holding it up with it's base visible. it is too small to be a desk and would be called a table.", "You can tell by the surface as to what is holding up the vase.", "The surface is too small to be a desk, porch, or bench."], "image": "train2014/COCO_train2014_000000024974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109403, "question_id": "GYJakE96jTEZkksEr8Jk5e", "question": "What type activity does the man here take part in?", "choices": ["barrel racing", "bull riding", "car race", "roping"], "correct_choice_idx": 0, "direct_answers": ["rodeo", "horse riding", "rodeo", "barrel racing", "rodeo", "barrel race", "rodeo", "barrel racing", "rodeo", "rodeo"], "difficult_direct_answer": false, "rationales": ["A man is in an arena with a cowboy hat and is on a horse. bull riders wear cowboy hats and ride horses.", "The man is on a horse and going around the barrel.", "He is dressed in the attire and riding a horse around the red and white barrel."], "image": "val2014/COCO_val2014_000000109403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526043, "question_id": "GYKn6wwnNVUhJRWZ2wFfzE", "question": "What is placed in the underneath of a plane storage?", "choices": ["passangers", "luggage", "flight attendant", "pilot"], "correct_choice_idx": 1, "direct_answers": ["ladder", "luggage", "trucks", "baggage", "luggage", "cart", "luggage", "luggage", "luggage", "luggage"], "difficult_direct_answer": false, "rationales": ["Luggage is loaded into the bottom of planes. vehicles are parked nbear a plane at an airport.", "The plane is getting loaded for a trip to another city", "There is an area to hold suicases."], "image": "train2014/COCO_train2014_000000526043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179186, "question_id": "GYXptqj4nYYmEzKmf96v73", "question": "How was the item on the plate cooked?", "choices": ["microwave", "open flame", "oven", "stovetop"], "correct_choice_idx": 2, "direct_answers": ["oven", "baked", "baked", "oven", "baked", "baked", "oven", "baked", "baked", "baked"], "difficult_direct_answer": false, "rationales": ["The cake was baked.", "The item was baked.", "Cakes need to be baked."], "image": "train2014/COCO_train2014_000000179186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279104, "question_id": "GYY2rcc5dMWt3R3bh24cAc", "question": "Where is this meal being served?", "choices": ["home", "school", "restaurant", "mc donalds"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["This looks like a typical restaurant dish and the frilly toothpick wouldn't be found at home, mcdonalds or at school.", "There are two half sandwiches as well as soup. there is a colorful toothpick holding toppings together.", "The food is restaurant quality in appearance."], "image": "train2014/COCO_train2014_000000279104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222015, "question_id": "GYiwiuQKtHPnmFmPzKHAYs", "question": "This modern cutlery design is invented specially for?", "choices": ["airports", "hospital", "school", "kitchen"], "correct_choice_idx": 3, "direct_answers": ["clock", "kitchen", "eating", "children", "chefs", "kitchen", "eggs", "telling time", "eating", "kitchens"], "difficult_direct_answer": false, "rationales": ["There are forks and spoons which are used for eating.", "It's in the kitchen.", "There are forks and spoons around the clock."], "image": "train2014/COCO_train2014_000000222015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559442, "question_id": "GYnk5Rb4ZxCJnLEU5vr7qd", "question": "What God is worshiped here?", "choices": ["jesus", "satan", "zeus", "buddha"], "correct_choice_idx": 0, "direct_answers": ["christian", "christian", "christianity", "christian", "jesus", "catholic", "christian", "god", "christian", "jesus"], "difficult_direct_answer": false, "rationales": ["You can tell by the architecture as to what kind of religion they are.", "Jesus is part of the church.", "There are crosses on the structure which typically symbolize jesus being nailed to a cross."], "image": "val2014/COCO_val2014_000000559442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473531, "question_id": "GZQV92NMzppFZgkwCU9q5W", "question": "This animal is frequently used as transportation by what profession?", "choices": ["plumber", "police officer", "butcher", "baker"], "correct_choice_idx": 1, "direct_answers": ["police", "police", "cowboy", "cowboy", "cowboy", "police officer", "cowboy", "cowboy", "farming", "cowboy"], "difficult_direct_answer": false, "rationales": ["The animal is with the police officer.", "Police officers usually ride horses.", "Many officers in new york city use these animals instead of cars."], "image": "train2014/COCO_train2014_000000473531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82787, "question_id": "GZS6dyLQYjJP88G65mti3q", "question": "What type of traffic is allowed on the overpass?", "choices": ["busses only", "pedestrian", "taxis", "skateboard only"], "correct_choice_idx": 1, "direct_answers": ["pedestrian", "no overpass", "walking", "bus", "human walking", "people walking", "pedestrian", "pedestrian", "pedestrian", "pedestrian"], "difficult_direct_answer": false, "rationales": ["This is a bridge for walking only", "Pedestrian traffic is allowed as two people can be seen walking across the overpass.", "A person is walking on the overpass."], "image": "val2014/COCO_val2014_000000082787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490051, "question_id": "GZkUhzToiP9fr5VFFy54dc", "question": "How many of the 4 kids are holding skies?", "choices": ["1/4", "4/4", "2/4", "3/4"], "correct_choice_idx": 3, "direct_answers": ["three", "two", "three", "three", "four", "four", "three", "three", "3/4", "four"], "difficult_direct_answer": false, "rationales": ["There are 3 kids out of 4 who are holding skis.", "You can tell by the shape and design as to what the boys are holding.", "Three of the kids have skis."], "image": "val2014/COCO_val2014_000000490051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186145, "question_id": "GZn7bHcRG6E7q5uqM8Knbs", "question": "How is the kitchen counter by the stove illuminated?", "choices": ["led light", "incandescent light", "halogen light", "fluorescent light"], "correct_choice_idx": 3, "direct_answers": ["overhead lamp", "overhead lighting", "fluorescent lights", "light", "fluorescent light", "lighting", "mounted light", "cabinet light", "florescent lighting", "overhead light"], "difficult_direct_answer": true, "rationales": ["There is a light up above the sink.", "There is a light on the ceiling", "The light shows the light itself."], "image": "train2014/COCO_train2014_000000186145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119529, "question_id": "GZpZbYBmBepm3UEpmLR5Ju", "question": "How long is a giraffe's neck?", "choices": ["4 feet", "7 feet", "5 feet", "6 feet"], "correct_choice_idx": 2, "direct_answers": ["long", "very long", "six feet", "six feet", "very long", "six feet", "8 feet", "6 feet", "5 feet", "8 feet"], "difficult_direct_answer": false, "rationales": ["The giraffe's neck is 5 feet.", "The animal has a long neck. it is reaching the leaves in the tree.", "A giraffe has a long neck."], "image": "train2014/COCO_train2014_000000119529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562045, "question_id": "Ga2ZBQSF79EGP29zf59SNp", "question": "What does he use to build momentum?", "choices": ["pedal", "foot", "remote", "rope"], "correct_choice_idx": 1, "direct_answers": ["legs", "foot", "feet", "feet", "feet", "foot", "foot", "weight", "foot", "his feet"], "difficult_direct_answer": false, "rationales": ["A guy is on a skateboard as he cruises down the street. he uses bottom part of leg to propel speed forward.", "Pushing with your foot makes a skateboard roll on the wheels.", "If on a flat surface you must use your legs to propel further on a skateboard."], "image": "val2014/COCO_val2014_000000562045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140630, "question_id": "Ga3CFsvdL9hRYqzUB3gAFS", "question": "How many people must stop at the intersection?", "choices": ["two", "four", "three", "one"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "one", "all", "all", "everyone", "all", "all", "four", "everyone"], "difficult_direct_answer": false, "rationales": ["All four ways should stop at the intersection.", "It says all way on it so all cars will have to stop.", "Intersections are four ways. therefore, four drivers must stop."], "image": "train2014/COCO_train2014_000000140630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574350, "question_id": "GaSddYwYYkYisQUdLGRG8f", "question": "What are they using the umbrellas to protect themselves from?", "choices": ["sun", "moon", "rain", "air"], "correct_choice_idx": 0, "direct_answers": ["sun", "sun", "sun", "sun", "sun", "sun", "sun heat", "sun", "wind", "hot sun"], "difficult_direct_answer": false, "rationales": ["The people are using their umbrellas to shield themselves from the sun.", "No rain can be seen, but their shadows can be seen in the sun. the sun has harmful uv rays, and an umbrella that block that and protect people's skin.", "There are only two things an umbrella will protect us from and it doesn't appear to be raining."], "image": "val2014/COCO_val2014_000000574350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353009, "question_id": "GaYiwPgeVmS6fSNHAowDbr", "question": "What game is being played here?", "choices": ["pickle ball", "racket ball", "squash", "tennis"], "correct_choice_idx": 0, "direct_answers": ["tennis", "tennis", "tennis", "tennis", "tennis", "pickle ball", "tennis", "tennis", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["The man is holding a pickleball paddle so that's what he's playing.", "The racket is a little different on a court similar to tennis", "The people are on what looks to be a small tennis court."], "image": "train2014/COCO_train2014_000000353009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152273, "question_id": "GacbL4FfuGNJpCLWT8oVYc", "question": "What is decade are the phones most likely from?", "choices": ["2010's", "2020's", "1990's", "1970's"], "correct_choice_idx": 2, "direct_answers": ["nineties", "nineties", "2000s", "1990's", "2000's", "2000s", "2000", "1980s", "early 2000s", "aughts"], "difficult_direct_answer": false, "rationales": ["The phones have a screen but they are not touch screen. most phones made now don't have a keypad.", "These look like really older phones.", "The cell phones on the table are very old-fashioned like they are from the 90s."], "image": "train2014/COCO_train2014_000000152273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18375, "question_id": "GakW7LXQ6cqQ67b9Bfpkds", "question": "What is the role of Bank of America to the game?", "choices": ["loan provider", "site provider", "sponsor", "fund provider"], "correct_choice_idx": 2, "direct_answers": ["sponsor", "sponsor", "umpire", "umpire", "sponsor", "sponsor", "sponsor", "sponsor", "sponsor", "sponsor"], "difficult_direct_answer": false, "rationales": ["The role is a sponsor.", "Many companies have their logos in professional sports games.", "They have a banner in the back along the fence."], "image": "train2014/COCO_train2014_000000018375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124471, "question_id": "Gauub2jwydZn8hz36NAqDX", "question": "Why is the wall here?", "choices": ["prevent flooding", "trap giraffes", "people barrier", "random"], "correct_choice_idx": 1, "direct_answers": ["sequester animals", "stop animals", "prevent runaways", "trap giraffes", "protect giraffes", "contain animals", "habitat", "enclosure", "barrier", "fence"], "difficult_direct_answer": true, "rationales": ["The wall is there to keep the giraffes in the enclosure.", "The wall is designed to keep giraffes from leaving their designated area at the zoo.", "The top wall stops the giraffes to the left."], "image": "train2014/COCO_train2014_000000124471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458286, "question_id": "GaxXMmzYUXjxWo3McdbQgY", "question": "What does one do when sitting at this piece of furniture?", "choices": ["exercise", "sleep", "work", "eat"], "correct_choice_idx": 2, "direct_answers": ["type", "using computer", "work", "computer activities", "type", "use computer", "sit", "type", "compute", "use computer"], "difficult_direct_answer": false, "rationales": ["This desk holds a computer used for work", "With the desks computer shown in the photo, most likely you would work while there.", "You would sit at a desk to get things done that you need to do."], "image": "train2014/COCO_train2014_000000458286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231654, "question_id": "Gb3LMPCuNF7ekQ697EMYaT", "question": "What needs to be fixed here on an urgent basis?", "choices": ["ground", "traffic lights", "parking lot", "house"], "correct_choice_idx": 1, "direct_answers": ["traffic sign", "traffic lights", "traffic lights", "traffic light", "stoplight", "traffic light", "street light", "traffic light", "lights", "light"], "difficult_direct_answer": false, "rationales": ["There is a stop light leaning on a pole.", "A long black pole is ripped out of the ground. it is in danger of falling on cars.", "The light is falling over and it could hurt someone. it also will be hard for drivers to see it to know when to stop or go."], "image": "train2014/COCO_train2014_000000231654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367610, "question_id": "GbDTpXb99JdofdYi2bPxoK", "question": "What side of the rest is usually best for passing?", "choices": ["under", "right", "left", "over"], "correct_choice_idx": 2, "direct_answers": ["right", "left", "left", "right", "left", "right", "right", "right", "left", "right"], "difficult_direct_answer": false, "rationales": ["We drive on the right and pass on the left.", "The side is the left.", "A man is herding sheep along a highway with the right side of the road remaining clear."], "image": "val2014/COCO_val2014_000000367610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331616, "question_id": "GbTdxvFj4eMtZ6MVifVLLB", "question": "What type of event is this?", "choices": ["reception", "wedding", "shower", "competition"], "correct_choice_idx": 3, "direct_answers": ["competition", "skateboard competition", "skateboard competition", "skateboard competition", "skiing", "skateboard", "skateboarding competition", "skateboard", "skateboarding", "skateboard event"], "difficult_direct_answer": false, "rationales": ["There are banners with sponsors names on them in the background which would be present for answer a at this location.", "There are banners for sponsors, which only happen during events, and everyone is skateboarding.", "Looks like many people are waiting for their turn in to compete."], "image": "train2014/COCO_train2014_000000331616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394133, "question_id": "GbUW5qF7MXvPvkCHa5Q7pb", "question": "The bags were likely placed on the bed by whom?", "choices": ["unknown", "guests", "owner", "staff"], "correct_choice_idx": 1, "direct_answers": ["traveler", "traveler", "owner", "customers", "guests", "person", "traveler", "hotel guests", "guest", "hotel staff"], "difficult_direct_answer": false, "rationales": ["It looks like a hotel room, so odds are it is the guest who checked into the room.", "The bed and curtain resemble that of one seen in a hotel, so the people in the room are likely temporary guests.", "When checking into a hotel room, bags are often thrown on the bed."], "image": "val2014/COCO_val2014_000000394133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580851, "question_id": "Gbbom5Q3skhPAtxj9Edds9", "question": "Where are the brushes place?", "choices": ["in cup", "on floor", "on table", "beside cup"], "correct_choice_idx": 0, "direct_answers": ["cup", "metal cup", "in cup", "cup", "cup", "metal cup", "silver cup", "cup", "cup", "bathroom"], "difficult_direct_answer": false, "rationales": ["A metallic container is holding the brushes.", "People place their toothbrushes in a cup beside the sink.", "The bottoms of the brushes cannot be seen as they are being held in place by a cylindrical item, which is the cup."], "image": "train2014/COCO_train2014_000000580851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175112, "question_id": "GbdrQ5LiZ2XEEsGsuGcmDJ", "question": "What is the green and yellow helmet made out of?", "choices": ["metal", "aluminum", "cloth", "carbon fiber"], "correct_choice_idx": 3, "direct_answers": ["metal", "plastic", "carbon fiber", "plastic", "plastic", "plastic", "plastic", "metal", "plastic", "metal"], "difficult_direct_answer": false, "rationales": ["It's made out of carbon fiber.", "The helmet has a plastic look to it.", "According to an internet search all mlb helmets are made of aerospace grade carbon fiber."], "image": "train2014/COCO_train2014_000000175112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529187, "question_id": "GbpEfbA7E2LYerZKhDFF3B", "question": "What relationship does the large animal have with the smaller one?", "choices": ["parent", "enemies", "adversarial", "friends only"], "correct_choice_idx": 0, "direct_answers": ["mother", "parent", "parent", "parental", "mother/child", "mom/baby", "parent", "mother", "parent child", "mother"], "difficult_direct_answer": false, "rationales": ["The giraffe is bigger than the other one and standing in close proximity.", "With how the giraffes are interacting it's safe to assume there relationship.", "The bigger giraffe is the mother of the small giraffe."], "image": "val2014/COCO_val2014_000000529187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483817, "question_id": "Gbs7nm399RE5NjwrXXBXte", "question": "What phenomenon do these surfers hope for?", "choices": ["tranquility", "doldrums", "eclipse", "large tides"], "correct_choice_idx": 3, "direct_answers": ["wave curl", "wave", "waves", "large waves", "huge waves", "large tides", "ocean waves", "big waves", "wave", "waves"], "difficult_direct_answer": false, "rationales": ["They are waiting for big waves.", "Tranquility or doldrums would reduce the size of the waves and would make it harder to surf. an eclipse would make it harder to see.", "These surfers all together hope for large tides."], "image": "train2014/COCO_train2014_000000483817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167851, "question_id": "GbwSYSRxZ5WeGgDrR24YEu", "question": "Why are the buses lined up?", "choices": ["awaiting passengers", "racing", "heavy traffic", "are lost"], "correct_choice_idx": 0, "direct_answers": ["pickup passengers", "loading passengers", "awaiting passengers", "boarding", "leaving station", "parking", "travel", "bus stop", "parked", "traffic"], "difficult_direct_answer": true, "rationales": ["Based on the background and the fact that these buses are lined up like this, they are probably parked near a tourist destination of some kind. there are no visible passengers on the buses, so if they are present, the buses are likely waiting for them to return to the bus.", "They carry people to different locations", "The buses are lined up so that passengers can get on at the bus stop."], "image": "train2014/COCO_train2014_000000167851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69856, "question_id": "GcPZ4RF2ZjL6kTMYVjaDhp", "question": "What is the capital of the state depicted here?", "choices": ["manhattan", "buffalo", "albany", "poughkeepsie"], "correct_choice_idx": 2, "direct_answers": ["nyc", "albany", "albany", "albany", "albany", "madison", "new york", "albany", "albany", "new york"], "difficult_direct_answer": false, "rationales": ["The capital is albany.", "Madison avenue and east 42nd street are famous streets in new york city. therefore, i chose the capital of the state of new york.", "It's the capital of new york city."], "image": "train2014/COCO_train2014_000000069856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291140, "question_id": "GcQUtZ3tK2mHXzRzoQeeYS", "question": "Why do you need to frequently repair beach houses?", "choices": ["law", "nosy neighbors", "environmental wear", "beauty"], "correct_choice_idx": 2, "direct_answers": ["hurricanes", "frequent flooding", "wind damage", "environmental wear", "keep clean", "weather damage", "water damage", "salt water", "safety", "water"], "difficult_direct_answer": true, "rationales": ["Beachfront property is more subject to wear and tear.", "With the large amounts of wind and moisture near beaches, houses need to be repainted and repaired more often than houses inland.", "There is a lot of wind and water erosion on coasts"], "image": "train2014/COCO_train2014_000000291140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414324, "question_id": "GcyKpdQuazrWdmmz6pzrQU", "question": "What area of the building is this?", "choices": ["kitchen", "lobby", "restroom", "dining room"], "correct_choice_idx": 2, "direct_answers": ["bathroom", "bathroom", "restroom", "restroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["Here we see soap dispensers, sinks, mirrors and tissues. these items are normally found in a bathroom.", "The area is the bathroom.", "There are bathroom sinks and mirrors along one wall."], "image": "val2014/COCO_val2014_000000414324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204273, "question_id": "Gd89KyvvPfxFdQd6JF8gm9", "question": "What does Northwestern Medicine provide in this game?", "choices": ["medical service", "medical advice", "drugs", "sponsor"], "correct_choice_idx": 3, "direct_answers": ["advertising", "money", "advertising", "sponsor", "money", "sponsorship", "sponsorship", "money", "sponsorship", "sponsorship"], "difficult_direct_answer": false, "rationales": ["There is a northwestern medicine sign near the dugout. this company paid to put the sign there.", "Their logo on the wall means they've donated money.", "The group is a sponsor."], "image": "train2014/COCO_train2014_000000204273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289995, "question_id": "GdLWCH33zyaErQFV8fwrB3", "question": "What do the items shown here come from originally?", "choices": ["seeds", "retailers", "boxes", "tv"], "correct_choice_idx": 0, "direct_answers": ["farm", "seeds", "farm", "farms", "farms", "farm", "asia", "farms", "middle east", "africa"], "difficult_direct_answer": false, "rationales": ["The items are fruits in vegetables. fruits and vegetables start as seeds.", "They are produce. produce grow from the ground.", "Plants grow from a small embryonic plant in a protective out covering."], "image": "val2014/COCO_val2014_000000289995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292480, "question_id": "GdQYDEJLYhmBADUCw7en48", "question": "What is sprinkled on the donuts?", "choices": ["sesame seeds", "peanuts", "pistachios", "sunflower seeds"], "correct_choice_idx": 1, "direct_answers": ["nuts", "peanuts", "peanuts", "peanuts", "nuts", "peanuts", "nuts", "nuts", "nuts", "peanuts"], "difficult_direct_answer": false, "rationales": ["Nuts are on the donuts.", "One can tell what kind of nut it is based on the size, shape and color.", "The donuts have nuts."], "image": "val2014/COCO_val2014_000000292480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6066, "question_id": "Gdgw9KxEg3DqGWYvAWrfBe", "question": "What nation's flag are the birds flying towards?", "choices": ["colombia", "romania", "switzerland", "canada"], "correct_choice_idx": 3, "direct_answers": ["canada", "canada", "canada", "canada", "canada", "canada", "canada", "canada", "canada", "canada"], "difficult_direct_answer": false, "rationales": ["The flag has two vertical red bars around a white center with a red maple leaf. this is the flag of canada.", "As indicated by the red maple leaf.", "The red and white colors with a leaf in the center identifies the national flag of canada."], "image": "train2014/COCO_train2014_000000006066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562160, "question_id": "GdwmyhBA6uR3DeB29Pgy9E", "question": "The third book from the left that has a title on the spine would be used by who?", "choices": ["programmer", "fireman", "dancer", "singer"], "correct_choice_idx": 0, "direct_answers": ["realtor", "rails", "unix", "computer programmer", "computer programmer", "kids", "programmer", "academic", "engineer", "slue estimation"], "difficult_direct_answer": true, "rationales": ["The book is about rails so that's who would be interested in using it.", "This third book as well as most the books nearby all have to do with computer programming.", "The word on the book is synonymous with computer programming, and would not have much use or interest to anyone else."], "image": "val2014/COCO_val2014_000000562160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186605, "question_id": "Ge6y3rdfHb7meC5yz2A4Cr", "question": "What type of board are the two standing on?", "choices": ["shuffle board", "short board", "long board", "hover board"], "correct_choice_idx": 2, "direct_answers": ["skate", "skateboard", "skateboard", "skateboard", "long board", "skateboard", "skateboard", "skate", "skate", "skateboard"], "difficult_direct_answer": false, "rationales": ["They are on a longer board since it will for both of them.", "Long boards are extra long and can fit more people than just one.", "It's a longer then normal board."], "image": "train2014/COCO_train2014_000000186605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402020, "question_id": "GeBwJjEgSFr6UXummj4WYb", "question": "What is the dark food product on the sandwich?", "choices": ["caviar", "cheese", "gravy", "pepper"], "correct_choice_idx": 0, "direct_answers": ["nutmeg", "cranberry sauce", "blueberry jam", "chocolate", "olives", "caviar", "meat", "olives", "cabbage", "mushrooms"], "difficult_direct_answer": true, "rationales": ["The black item on this sandwich is made up of small black eggs. caviar is small black fish eggs.", "This is a dark food used on sandwiches and sometimes crackers or as a stand alone. it is derived from fish eggs and is considered a delicacy in certain areas of the world, often consumed by the rich. the setting of this dish suggests it is in an upscale restaurant.", "The other options aren't in this image."], "image": "train2014/COCO_train2014_000000402020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527535, "question_id": "GeGQYMu2EPxwtxRh2BMyqp", "question": "What would be the most fitting name for this custom dessert?", "choices": ["crumb cake", "dessert pizza", "sorbet", "flambe"], "correct_choice_idx": 1, "direct_answers": ["dessert pizza", "sweet pizza", "creamy pizza", "whipped cream", "dessert pizza", "banana crumble", "sweet pizza", "custard pie", "banana pizza", "banana heaven"], "difficult_direct_answer": false, "rationales": ["It is round like a pizza, and cut into slices the same way a pizza would be.", "The pizza has a lot of sweet toppings on it.", "The banana sundae is in the shape of a new york pie."], "image": "val2014/COCO_val2014_000000527535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136323, "question_id": "GeLpnDLg8JWfJ6HTw5m5Rx", "question": "What do the tarps shown on these vessels do for the inside of the boats?", "choices": ["hold water", "nothing", "keep dry", "signal"], "correct_choice_idx": 2, "direct_answers": ["uv protection", "keeps dry", "keep dry", "keep dry", "keep dry", "protection", "shield rain", "keep dry", "cargo transporting", "ensure dryness"], "difficult_direct_answer": false, "rationales": ["The tarps are waterproof and are covering the vulnerable areas.", "Tarps repel water.", "They cover spaces. tarps are water resistant."], "image": "train2014/COCO_train2014_000000136323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401109, "question_id": "GeMArxoR5gJkL7xPXZrr5q", "question": "What are the guys in the background doing for jobs?", "choices": ["waiters", "maitre d", "cooks", "comics"], "correct_choice_idx": 2, "direct_answers": ["cooking", "cooking", "cooking", "cooks", "cooks", "cooks", "cooking", "cooking food", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["The kitchen is in the background.", "People can be seen in a kitchen of a restaurant behind customers sitting at a table.", "The guys are cooks."], "image": "train2014/COCO_train2014_000000401109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359239, "question_id": "GeiAUp6hWZ7HBvXa8Nc9tD", "question": "What type of restaurant serves this food?", "choices": ["fast food", "italian", "chinese", "mexican"], "correct_choice_idx": 1, "direct_answers": ["italian", "pizzeria", "pizzeria", "pizzaria", "italian restaurant", "pizzeria", "pizzeria", "pizza shop", "pizzeria", "italian"], "difficult_direct_answer": false, "rationales": ["This is a pizza which is from italy", "A pizza is on a table on a pan. italian restaurants serve pizza.", "This is a pizza which is considered italian."], "image": "val2014/COCO_val2014_000000359239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534044, "question_id": "GetVG3byBHrUoocn5wWWNX", "question": "This train is moved by what energy?", "choices": ["magnetic force", "coal", "gas", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "electric", "electricity", "electricity", "electric", "electricity", "electric", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["The train is connected to power lines and runs on electricity.", "The train is connected to electricity lines.", "The train is connected to a cable that provides electricity for it to move."], "image": "train2014/COCO_train2014_000000534044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196413, "question_id": "GfDbYtLMiEoCeiSDTTWbnX", "question": "Where are these cars located?", "choices": ["driveway", "garage", "road", "parking lot"], "correct_choice_idx": 3, "direct_answers": ["parking lot", "parking lot", "truck", "parking lot", "parking lot", "parking lot", "parking lot", "parking lot", "city", "city"], "difficult_direct_answer": false, "rationales": ["The cars are outside and are not moving. they are near a meter.", "The cars are in a lot.", "The area is for people to park their cars."], "image": "val2014/COCO_val2014_000000196413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508896, "question_id": "GfGkzJoTsqGqrk2CZb8Sw9", "question": "What kind of trick is the man doing on the half pipe?", "choices": ["manual", "flip trick", "lip trick", "hand spin"], "correct_choice_idx": 2, "direct_answers": ["grind", "skateboard", "standing", "airs", "lip trick", "fakie", "grind", "nose slide", "grinding", "grind"], "difficult_direct_answer": false, "rationales": ["The section of the half pipe that the board is in contact with is called a lip. this type of skateboarding in this apparatus would be for doing tricks so he is currently doing a trick on the lip or lip trick.", "He is doing a flip on his skateboad.", "The man is trying to do a lip trick on the pipe."], "image": "train2014/COCO_train2014_000000508896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206394, "question_id": "GfQnupttxnNyhzttdV4Xvh", "question": "What is the circular silver object on the boat used for?", "choices": ["diving", "steering", "sitting", "fishing"], "correct_choice_idx": 1, "direct_answers": ["steering", "steering", "steering", "boat", "safety", "steering", "steering", "steering", "steering", "attachment"], "difficult_direct_answer": false, "rationales": ["This object helps with navigation", "There is a circular steering wheel coming out of the top of the boat.", "This turns the boat"], "image": "train2014/COCO_train2014_000000206394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242301, "question_id": "GfWsaXtVhUFQFYJAigyNAg", "question": "Where are these people?", "choices": ["car dealership", "spa", "hotel room", "outside"], "correct_choice_idx": 2, "direct_answers": ["hotel room", "bedroom", "hotel", "bed", "hotel", "bedroom", "hotel", "hotel room", "hotel", "bedroom"], "difficult_direct_answer": false, "rationales": ["The room is in a hotel.", "There are two beds side by side", "The people are inside a building. there are no massage tables or cars near the people."], "image": "val2014/COCO_val2014_000000242301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532605, "question_id": "GfcTUxQGPLWiW7vMMzc8Rj", "question": "What does the number on his back signify?", "choices": ["participation number", "location", "age", "speed"], "correct_choice_idx": 0, "direct_answers": ["participation number", "racing bib", "race placement", "competitor identifier", "participant", "race participant", "race", "competition", "entrance number", "bib number"], "difficult_direct_answer": true, "rationales": ["Participants in athletic competitions are often given an identification number.", "The person is engaged in a competitive event and is wearing something used to identify people in the event, thus matching the item in option a.", "This is the number that the person is competing with."], "image": "train2014/COCO_train2014_000000532605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392166, "question_id": "Gfddkfq86Enny8W3tg96Us", "question": "What is the key to getting the dog to stay in place here?", "choices": ["rope", "balance", "collar", "getting view"], "correct_choice_idx": 1, "direct_answers": ["balance", "leash", "balance", "leash", "training", "trust", "leash", "leash", "leash", "training"], "difficult_direct_answer": false, "rationales": ["The man is maintaining his balance in order to hold the dog in that position.", "This man is balancing himself on his skateboard as well as balancing the dog on his outstretched arm.", "The rope is keeping the dog tied down."], "image": "train2014/COCO_train2014_000000392166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83067, "question_id": "GgQVDZshKN4QZjxy7pBUri", "question": "Why is he in the middle of the lake?", "choices": ["is captive", "enjoys sailing", "is lost", "no map"], "correct_choice_idx": 1, "direct_answers": ["sailing", "having fun", "sailing", "pleasure boating", "enjoys sailing", "sailing", "sailing", "enjoying boat", "sailing", "sailing"], "difficult_direct_answer": false, "rationales": ["He is in a sailboat so it would seem that he likes it.", "A person is on a boat on the water. people like to sail.", "He is in a sail boat"], "image": "val2014/COCO_val2014_000000083067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416625, "question_id": "GgVK2Xz3NP9JX7PHq5Zibx", "question": "Why is the boy reaching for the ball?", "choices": ["to throw", "to show", "to hit", "to catch"], "correct_choice_idx": 2, "direct_answers": ["return attempt", "return", "hit", "to hit", "serving", "return serve", "hit", "return ball", "hit it", "to hit"], "difficult_direct_answer": false, "rationales": ["The boy wants to hit the ball.", "The boy is reaching to hit the little tennis ball.", "The boy is playing tennis and has his racket close to the ball indicating that he is going to hit it."], "image": "train2014/COCO_train2014_000000416625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301305, "question_id": "GgkkuPsCLm2LKLXRKLpCFm", "question": "What will the people shown here have for dessert?", "choices": ["waffles", "ice cream", "pie", "cheesecake"], "correct_choice_idx": 3, "direct_answers": ["cheesecake", "pie", "tart", "cheese cake", "cheesecake", "cheesecake", "cheesecake", "cheesecake", "cheesecake", "cake"], "difficult_direct_answer": false, "rationales": ["A person is cutting a cake that is cheese colored.", "A cheesecake is on a table that is surrounded by people. people eat cheesecake for dessert.", "They will eat the cake the person is cutting"], "image": "val2014/COCO_val2014_000000301305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283054, "question_id": "GgqsfDwJMNCqMfdY86g4So", "question": "How fast can an automobile travel on this street?", "choices": ["50mph", "75mph", "30mph", "10mph"], "correct_choice_idx": 2, "direct_answers": ["30 mph", "thirty mph", "thirty miles", "30", "30mph", "30 mph", "30 mph", "30 mph", "twenty five", "thirty mph"], "difficult_direct_answer": false, "rationales": ["You can tell what the speed limit is by looking at the speed limit sign in the background.", "This is a residential area where high speeds are not limited", "The sign gives the speed limit as 30mph."], "image": "val2014/COCO_val2014_000000283054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272101, "question_id": "GgvTexnqZyHY8UzcdyZen6", "question": "What is the toddler doing?", "choices": ["exercising", "posing", "surrendering", "dancing"], "correct_choice_idx": 1, "direct_answers": ["skiing", "skiing", "cheering", "saying hello", "posing", "posing", "scatting", "standing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["The toddler is posing.", "This toddler is not in position for actively skiing but is striking a pose for the camera.", "The toddler has her hands up and is looking at the camera."], "image": "train2014/COCO_train2014_000000272101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494113, "question_id": "Gh69JChb5NAuhPTP8Qa6ZB", "question": "Why is he standing like that?", "choices": ["ball coming", "is fighting", "is afraid", "is falling"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "serving", "playing", "serving tennis-ball", "serving", "serving", "hit", "food ball", "ball coming", "overhead serve"], "difficult_direct_answer": false, "rationales": ["The person is waiting for something to fall to him so he can hit it with a racquet. option a matches the item and its action.", "He is playing tennis and is in the process of serving.", "The person is standing like this because the ball is coming back down."], "image": "train2014/COCO_train2014_000000494113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22222, "question_id": "GhDTb82D2qwNvUCLX5jJ4f", "question": "The clear umbrellas used by the people on this street is indicative of which country's culture?", "choices": ["japan", "south korea", "china", "vietnam"], "correct_choice_idx": 0, "direct_answers": ["japan", "japan", "japan", "japan", "japanese", "japan", "asian", "japan", "korean", "japan"], "difficult_direct_answer": false, "rationales": ["Japanese people use clear umbrellas.", "The umbrellas are from japan.", "There is japanese lettering on the signs."], "image": "train2014/COCO_train2014_000000022222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234595, "question_id": "GhJY7KQPjxaiDgDVye9FdL", "question": "What activity might most people here do on this day?", "choices": ["get refunds", "sell things", "swim", "eat sharks"], "correct_choice_idx": 2, "direct_answers": ["sun bathe", "swim", "sun bath", "swimming", "swim", "tanning", "swim", "swimming", "swim", "sun bathe"], "difficult_direct_answer": false, "rationales": ["These people are on the beach.", "There are many people sitting at an umbrella at the beach. they are enjoying the ocean as well.", "The people can swim on the beach."], "image": "val2014/COCO_val2014_000000234595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205116, "question_id": "Ghc8zyiRmEnt288GpM9nsH", "question": "What is the old man doing with the scissors?", "choices": ["cutting hair", "cutting string", "cutting paper", "cutting fruit"], "correct_choice_idx": 0, "direct_answers": ["cutting hair", "cutting hair", "cutting hair", "cutting hair", "cutting hair", "cutting hair", "cutting hair", "cutting hair", "trimming hair", "trimming hair"], "difficult_direct_answer": false, "rationales": ["He has scissors right next to her hair to cut it.", "The man is holding scissors up to another's hair while in a barber shop.", "The scissors are near the younger man's head. the younger man does not have paper, string, or fruit on his head."], "image": "train2014/COCO_train2014_000000205116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84123, "question_id": "GhfPssSpBBJi6VU5AsPZQM", "question": "What type of signs are these?", "choices": ["directional", "warning", "brand", "regulatory"], "correct_choice_idx": 0, "direct_answers": ["information signs", "street signs", "street signs", "directional", "road", "directional", "direction", "street", "road", "street signs"], "difficult_direct_answer": false, "rationales": ["The signs are pointing to different areas.", "The signs are pointing in specific directions.", "The signs point directions."], "image": "val2014/COCO_val2014_000000084123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282224, "question_id": "GhnAMDBFSgcg25v2F2B9BL", "question": "What is the relationship of the man wearing white checker shirt to the woman wearing white skirt in this situation?", "choices": ["coworker", "competitor", "teammate", "coach"], "correct_choice_idx": 2, "direct_answers": ["players", "doubles partners", "married", "partners", "teammates", "tennis partner", "friendship", "teammate", "teammate", "teammates"], "difficult_direct_answer": false, "rationales": ["The man in the checker shirt and the woman in white are both on the same team.", "The man in the checker shirt is teammates with the woman since they are on the same side of the net.", "They're teammates."], "image": "val2014/COCO_val2014_000000282224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488571, "question_id": "GhpfrveNTrkTiuvszxz4Kd", "question": "What kind of fruits might be said to sit on the items being prepared here besides tomatoes?", "choices": ["oranges", "cheese", "pepperoni", "olives"], "correct_choice_idx": 3, "direct_answers": ["olive", "pizza", "olives", "strawberries", "olives", "pineapple", "pineapple", "olives", "olives", "olives"], "difficult_direct_answer": false, "rationales": ["They are also more savory than other fruits", "Olives are common toppings for sandwiches.", "Olives are on the sandwiches."], "image": "val2014/COCO_val2014_000000488571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348790, "question_id": "Gi6GXm9CFWhQ6ADTRFTtTN", "question": "What are the colorful lights used for?", "choices": ["parades", "decoration", "controlling traffic", "dancing"], "correct_choice_idx": 2, "direct_answers": ["street lights", "stop lights", "control traffic", "controlling traffic", "traffic", "traffic control", "navigation", "traffic", "traffic stops", "traffic"], "difficult_direct_answer": false, "rationales": ["The colored lights in this street scene are the red yellow and green of traffic lights. these lights allow and disallow traffic through intersections.", "Where there are intersections in the road, the colorful lights need to be used to direct traffic to avoid collisions.", "They are used to control to run areas that are very congested with vehicles."], "image": "train2014/COCO_train2014_000000348790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322059, "question_id": "GiZ9WXFfsXxzWw2VHx7cyJ", "question": "What might live in this environment?", "choices": ["fish", "birds", "worms", "cats"], "correct_choice_idx": 0, "direct_answers": ["fish", "fish", "fish", "fish", "fish", "sharks", "fish", "sharks", "fish", "fish"], "difficult_direct_answer": false, "rationales": ["This is a water area. sea creatures would live here.", "There is water in this environment. birds, cats, and worms live on land.", "This is water and they have gills to breathe in it"], "image": "train2014/COCO_train2014_000000322059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80521, "question_id": "GjNQERYLqj6nAjRUL8LocZ", "question": "What part of this image is added post shooting?", "choices": ["tracks", "text", "men", "darkened corners"], "correct_choice_idx": 3, "direct_answers": ["over hill", "color", "filter", "person", "foreground", "filter", "sleds", "lighting", "credits", "darkened corners"], "difficult_direct_answer": true, "rationales": ["The colors got darkened after the shot was taken.", "This image is edited for darker contrasts at corners.", "The edges wouldn't be darker without manipulation"], "image": "train2014/COCO_train2014_000000080521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277461, "question_id": "GjU5ntsKU4GccT2mcnHuPM", "question": "Who is the CEO of this airline?", "choices": ["michael rogers", "susan mcdowell", "timothy farrell", "ed bastian"], "correct_choice_idx": 3, "direct_answers": ["owner", "ed bastian", "unknown", "ed bastian", "not clear", "ed bastian", "bill gates", "ed bastian", "ed bastian", "ed bastian"], "difficult_direct_answer": false, "rationales": ["This businessman leads a team of people in one of the major airlines.", "It is delta.", "The airplane has a delta livery."], "image": "train2014/COCO_train2014_000000277461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563058, "question_id": "GjYivX5RoKP7y2F7EdqpLU", "question": "What is the orientation of this vehicle?", "choices": ["rhombus", "circle", "square", "straight line"], "correct_choice_idx": 3, "direct_answers": ["left", "forward", "forward", "train", "horizontal", "forward", "travelling", "west", "up", "straight line"], "difficult_direct_answer": false, "rationales": ["A train is not bendable nor does it have any geometric shape on the tracks.", "It's in a straight line.", "The train moves in a straight line."], "image": "train2014/COCO_train2014_000000563058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230537, "question_id": "GjZjXJVGh28YC8mUKCKawi", "question": "How would this man defend himself if attacked?", "choices": ["gun", "knife", "karate", "he wouldn't"], "correct_choice_idx": 0, "direct_answers": ["gun", "shoot gun", "gun", "gun", "gun", "gun", "gun", "gun", "gun", "shoot gun"], "difficult_direct_answer": false, "rationales": ["The man has a firearm on his waist. if someone tries to harm him he'll use his firearm.", "The man would use the gun.", "Guns are frequently used as self defense items and there is a gun handle visible protruding from his shirt and pants. if attacked this, being in arms reach, would likely be the most effective and most practical item to use for defense."], "image": "train2014/COCO_train2014_000000230537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554273, "question_id": "GkDbyDzMSFF5L6pBhFS7cH", "question": "What does the person in mid air want to do with the frisbee?", "choices": ["throw it", "bite it", "avoid it", "catch it"], "correct_choice_idx": 3, "direct_answers": ["catch it", "catch", "catch", "catch", "catch", "catch", "catch it", "catch it", "catch", "catch it"], "difficult_direct_answer": false, "rationales": ["The person wants to grab ahold of the frisbee.", "The person is trying to get a handle on the frisbee.", "The person wants to catch."], "image": "val2014/COCO_val2014_000000554273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203989, "question_id": "GkJ4BnqyBwrQu4rSPDwURM", "question": "What is the person on the skateboard wearing?", "choices": ["backpack", "samurai sword", "guitar case", "gas mask"], "correct_choice_idx": 0, "direct_answers": ["backpack", "black", "jeans", "jeans", "backpack", "pants", "skating", "jeans", "jacket", "bag"], "difficult_direct_answer": false, "rationales": ["He has a pack on his back to carry things in so his hands can be free to balance.", "The person has a backpack.", "He is wearing a cloth bag on his back that has shoulder straps."], "image": "train2014/COCO_train2014_000000203989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21281, "question_id": "GkLsUhamRkczaUhPnWUzcS", "question": "How is this image created?", "choices": ["collage", "cgi", "photography", "watercolor"], "correct_choice_idx": 1, "direct_answers": ["photoshop", "digitally", "computer", "digital technology", "photoshop", "computer imaging", "computer", "photoshop", "cgi", "computer"], "difficult_direct_answer": false, "rationales": ["It's how most images are created now.", "It could of been painted, but it looks like it was made on a computer.", "The image uses cgi."], "image": "train2014/COCO_train2014_000000021281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93866, "question_id": "GkUPZ255GfuBUcPSDZ4Et5", "question": "Whose leg is visible on the background?", "choices": ["giraffe", "zebra", "human", "elephant"], "correct_choice_idx": 0, "direct_answers": ["giraffe", "giraffe", "giraffe", "zebra", "zebras", "giraffe", "giraffe", "giraffe", "giraffe", "zebra"], "difficult_direct_answer": false, "rationales": ["The pattern of the leg is a giraffe's.", "The animals in front are zebras. the non-human animal leg in the background belongs to a different animal and is too skinny to belong to an elephant.", "That is a leg of a giraffe."], "image": "train2014/COCO_train2014_000000093866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41826, "question_id": "GkY7HdgBURbgcnGrNqCy4v", "question": "Why is this man with these animals?", "choices": ["wash them", "herd them", "sell them", "kill them"], "correct_choice_idx": 1, "direct_answers": ["herding them", "herding", "herding", "heading", "herding", "herder", "herding", "herding cattle", "shepherd", "herd them"], "difficult_direct_answer": false, "rationales": ["The man is behind the cows and bulls grazing them.", "A person is standing with a bunch of cows. people herd animals.", "You can tell since the animals are grouped together as to what the mans profession is."], "image": "train2014/COCO_train2014_000000041826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45084, "question_id": "GkjR9UA4HMWwhQenxjCNVE", "question": "Why is the white lines on the glass?", "choices": ["visibility", "height restriction", "decoration", "measurement"], "correct_choice_idx": 0, "direct_answers": ["decoration", "visibility", "decoration", "design", "alert", "stand out", "stop", "separation", "warning", "visibility"], "difficult_direct_answer": false, "rationales": ["The white lines assist in visibility of the clear glass pane.", "The lines are their for visibility.", "The white lines are to increase visibility on the translucent glass preventing people or birds from colliding into it."], "image": "train2014/COCO_train2014_000000045084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331785, "question_id": "Gksvru7JJ3J9SiFdUBcY2P", "question": "Why is the woman wearing a white bandana holding a phone up?", "choices": ["buying items", "playing games", "taking pictures", "calling 911"], "correct_choice_idx": 2, "direct_answers": ["taking picture", "fashion style", "taking picture", "taking picture", "taking pictures", "taking pictures", "hat", "protect hair", "craovd", "selfie"], "difficult_direct_answer": false, "rationales": ["The man is looking at her phone as if aiming the camera at something.", "A phone has a camera on it. there are lots of views on a ride like this.", "The woman is taking photos."], "image": "val2014/COCO_val2014_000000331785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404163, "question_id": "Gkz8m9ckZ7ho5hEACvNBLv", "question": "Where is this dog's owner?", "choices": ["at work", "another building", "inside", "overseas"], "correct_choice_idx": 2, "direct_answers": ["bar", "house", "inside", "bar", "in bar", "inside", "inside", "in restaurant", "bar", "inside"], "difficult_direct_answer": false, "rationales": ["The dog is sitting outside of a restaurant while its owner eats inside.", "The dog's owner isn't available to come do greetings outside.", "Because the dog remains inside the room, his owner must be inside."], "image": "train2014/COCO_train2014_000000404163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382563, "question_id": "GmAwg7ZyHtf52sLRyyi2zA", "question": "What would normally be stored in these cases?", "choices": ["cleaning supplies", "clothes", "water", "dishes"], "correct_choice_idx": 1, "direct_answers": ["clothing", "clothes", "records/ clothes", "luggage", "clothing", "clothes", "clothing", "clothing", "clothing", "clothes"], "difficult_direct_answer": false, "rationales": ["These cases resemble suitcases, which are used to transport people's clothes when they are traveling.", "These are suitcases being made into something else.", "These cases are suitcases. they are not cabinets or tanks."], "image": "val2014/COCO_val2014_000000382563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33417, "question_id": "GmE7oSDS6HubtNf7aDUWvN", "question": "What is the spoon used for with the red paste?", "choices": ["to spread", "to cook", "to fling", "to boil"], "correct_choice_idx": 0, "direct_answers": ["spread", "to spread", "spreading", "spreading", "to spread", "slathering", "serving", "serve", "spreading", "add paste"], "difficult_direct_answer": false, "rationales": ["The spoon is used to spread the red pepper paste.", "The sauce in the brown bowl goes on the bread.", "The spoon is for spreading."], "image": "train2014/COCO_train2014_000000033417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195927, "question_id": "GmE9jt7GV2N5VZhyEKLcPA", "question": "What is the white bowl with holes in it on the left used for?", "choices": ["mashing", "straining", "mixing", "tenderizing"], "correct_choice_idx": 1, "direct_answers": ["draining vegetables", "straining", "draining", "stain food", "light lamp", "straining", "draining food", "straining", "sieve", "drain water"], "difficult_direct_answer": false, "rationales": ["The white bowl is a colander and is used to drain liquids away from food.", "You can pour things that have liquid that needs drained out into it.", "It is a colander used to strain wet food."], "image": "train2014/COCO_train2014_000000195927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565957, "question_id": "GmEtmBmy2568eERPSTzKox", "question": "What is the man doing in the shadows?", "choices": ["drawing", "sleeping", "exercising", "using phone"], "correct_choice_idx": 3, "direct_answers": ["phone call", "making call", "phoning", "call", "phone call", "phone call", "calling", "making phonecall", "using phone", "phone call"], "difficult_direct_answer": false, "rationales": ["The man is talking on the phone.", "The man has a cellphone to his ear.", "The man is on his phone."], "image": "val2014/COCO_val2014_000000565957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417373, "question_id": "GmijCygEpzfyJPHx8nF5Lm", "question": "What kind of screen is furthest left in this messy room?", "choices": ["crt", "projector", "phone lcd", "computer lcd"], "correct_choice_idx": 0, "direct_answers": ["crt", "old monitor", "crt", "pc", "monitor", "monitor", "old computer", "monitor", "computer", "crt monitor"], "difficult_direct_answer": false, "rationales": ["The screen is for the crt.", "The screen in the corner is an old, boxy monitor.", "The biggest monitor screen is the crt monitor."], "image": "val2014/COCO_val2014_000000417373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105633, "question_id": "GmnzDevweUgyBDvcKjQJ4v", "question": "Which famous painter liked to paint women with hair the colour of the woman on the left's?", "choices": ["donatello", "michaelangelo", "da vinci", "titian"], "correct_choice_idx": 3, "direct_answers": ["titian", "titian", "van gough", "van gogh", "matisses", "rousseau", "titian", "pablo picasso", "titian", "da vinci"], "difficult_direct_answer": false, "rationales": ["The painter is titian.", "Titian enjoyed painting women with orange hair.", "Titian usually painted women with fiery hair."], "image": "train2014/COCO_train2014_000000105633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289919, "question_id": "Gmr76nbd9tS6Y9s3pPEAaw", "question": "What type of loose material is strewn on the floor where the animals are standing?", "choices": ["sawdust", "grain", "leaves", "straw"], "correct_choice_idx": 0, "direct_answers": ["straw", "hay", "sawdust", "insulation", "hay", "straw", "hay", "hay", "hay", "hay"], "difficult_direct_answer": false, "rationales": ["On the ground you can see sawdust which is used to make cleaning the animals waste easier.", "The type is sawdust.", "Sawdust is usually strewn on the ground in barns."], "image": "train2014/COCO_train2014_000000289919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246001, "question_id": "Gn7vvJ7CmyJtQP7goyfg6Q", "question": "What is being displayed on the screen in front of the class?", "choices": ["movie", "powerpoint presentation", "live tv", "weekly news"], "correct_choice_idx": 1, "direct_answers": ["slides", "class guideline", "classwork", "laptops", "powerpoint presentation", "slideshow", "class curriculum", "presentation", "document", "presentation"], "difficult_direct_answer": true, "rationales": ["This is a program which is designed to present an idea to a group in an organized manner. this is projected on to the screen in the case with intentions of being shown to students.", "The screen has a presentation.", "The text and computer display projected in the front mean it is a presentation of slides."], "image": "val2014/COCO_val2014_000000246001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190900, "question_id": "GnrYrJPHdkLppK7qkDdrrV", "question": "What would this vehicle primarily be used for?", "choices": ["war", "racing", "travel", "cargo shipments"], "correct_choice_idx": 2, "direct_answers": ["flying", "passenger travel", "transport", "air transport", "travel", "travel", "flight", "air travel", "air transportation", "travel"], "difficult_direct_answer": false, "rationales": ["The vehicle is a passenger airplane based on the windows on the side.", "This plane doesn't look very aerodynamic and has many small windows for many passengers to look out of.", "The vehicle is a civilian passenger jet that has a us airways livery."], "image": "train2014/COCO_train2014_000000190900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111048, "question_id": "GnxVrTRvCCFWYV9HsfFAJp", "question": "What material do these kite flyers stand upon?", "choices": ["snow", "water", "grass", "sand"], "correct_choice_idx": 3, "direct_answers": ["plastic", "sand", "sand", "plastic", "plastic", "sand", "sand", "sand", "sand", "sand"], "difficult_direct_answer": false, "rationales": ["The kite flyers are on beach sand.", "The kite flyers are at the beach on the sand.", "The kite flyers are performing on a sandy beach."], "image": "train2014/COCO_train2014_000000111048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20774, "question_id": "GoE4VFkRpLSuJ2kihLEyj6", "question": "What problem are the two people on the right facing?", "choices": ["getting soaked", "getting thirsty", "getting tired", "getting sunburned"], "correct_choice_idx": 0, "direct_answers": ["no umbrellas", "getting wet", "getting wet", "getting wet", "rain", "getting soaked", "rain", "getting wet", "no umbrella", "getting wet"], "difficult_direct_answer": false, "rationales": ["The people don't have something to cover themselves.", "The two people on the right are facing getting wet from the rain that is falling.", "Due to the wet texture of the ground and the people on the left being under umbrellas we can conclude the two on the right may get wet."], "image": "val2014/COCO_val2014_000000020774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349193, "question_id": "GoT2Z2dSRMARH384TRGQRM", "question": "What kind of net is shown?", "choices": ["tennis", "beach volleyball", "fishing", "butterfly"], "correct_choice_idx": 1, "direct_answers": ["volleyball", "frisbee", "volleyball", "volleyball", "volleyball net", "beach volleyball", "volleyball net", "volleyball net", "score", "volleyball net"], "difficult_direct_answer": false, "rationales": ["Due to the setting and the the actions that they are doing you can easily tell what kind of net it is.", "A volleyball net in the sand.", "The net is over sand outdoors and the height of it is synonymous with the game of beach volleyball."], "image": "train2014/COCO_train2014_000000349193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488032, "question_id": "Gopddi2bk2RBfK6FwEieTh", "question": "What company is sponsoring the tennis match?", "choices": ["dunlop", "asics", "gamma", "wilson"], "correct_choice_idx": 3, "direct_answers": ["wilson", "wilson", "wilson", "wilson", "wilson", "wilson", "wilson", "american express", "wilson", "wisconsin"], "difficult_direct_answer": false, "rationales": ["The logo of the company, a lowercase letter w, is visible on the tarp covering the court's fence.", "This company wall sponsors wilson.", "There is a large w in the background."], "image": "train2014/COCO_train2014_000000488032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60945, "question_id": "GotunR6Sqowi9npUExfFVu", "question": "What is the birthday person's name?", "choices": ["laura", "josie", "emily", "jonas"], "correct_choice_idx": 1, "direct_answers": ["josie", "josie", "josie", "josie", "josie", "josie", "josie", "josie", "josie", "josie"], "difficult_direct_answer": false, "rationales": ["The cake says 'josie' on it.", "The candles spell out j o s i e.", "You can tell who's party it is by the candles spelling her name."], "image": "train2014/COCO_train2014_000000060945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174019, "question_id": "GowTD9LpFi2QUUJjiGGf5G", "question": "In which country is this picture taken?", "choices": ["china", "luxembourg", "spain", "canada"], "correct_choice_idx": 3, "direct_answers": ["canada", "canada", "usa", "usa", "canada", "canada", "canada", "canada", "united states", "united states"], "difficult_direct_answer": false, "rationales": ["The street sign is in english.", "One can see several of this country's maple leaf flag hanging.", "The canadian flag can be seen hanging from several store fronts in the photo."], "image": "val2014/COCO_val2014_000000174019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265831, "question_id": "GpBHxTJEUFcqhQkTNNGBb7", "question": "What word would best describe their movement?", "choices": ["gallop", "skip", "walk", "sprint"], "correct_choice_idx": 2, "direct_answers": ["walking", "slow", "beautifully", "walking", "walking", "slow", "walk", "slow", "walking", "riding"], "difficult_direct_answer": false, "rationales": ["The horse appears to be moving slowly.", "Only one foot of the horse is off the ground at a time.", "The horses are moving at a slow pace you can tell by their foot placement."], "image": "train2014/COCO_train2014_000000265831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292907, "question_id": "GpGGdZtdGzGvfnD2MdWGsz", "question": "What type of social gathering probably occurs here?", "choices": ["swimming", "worship", "party", "gambling"], "correct_choice_idx": 1, "direct_answers": ["church", "communion", "mass", "praying", "worship", "church", "church", "church worship", "worship", "religious"], "difficult_direct_answer": false, "rationales": ["The room is a church where people gather to worship and pray.", "This is a church.", "It is a church."], "image": "train2014/COCO_train2014_000000292907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510359, "question_id": "GpNQFwJxjNPPTcoVo8vLD5", "question": "What keeps the object in the sky stationary?", "choices": ["orbit", "iron beams", "eclipses", "strings"], "correct_choice_idx": 3, "direct_answers": ["strings", "strings", "wind", "string", "strings", "wind", "string", "strings", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["The item in the sky pictured here would fly away were it not anchored by the person holding a wire on the ground.", "The object in the sky is a kite that stays in its place with strings held by the man on the ground.", "These are attached on one end to the object in the sky, and on the other end they are held by a person and might also be attached to a stationary object that is on the ground."], "image": "train2014/COCO_train2014_000000510359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554952, "question_id": "GpdSrvkvrNrJLBxkm2JeH3", "question": "What video game system are they playing?", "choices": ["nintendo switch", "x box", "wii", "playstation"], "correct_choice_idx": 2, "direct_answers": ["wii", "nintendo wii", "nintendo wii", "wi", "wii", "wii", "will", "wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["The video game system is the nintendo wii.", "The people are playing the nintendo wii.", "They are holding remotes that are white and clearly wii remotes based on their design. if handling wii remotes, they would be playing a wii video game system."], "image": "train2014/COCO_train2014_000000554952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19349, "question_id": "GppeGAyobdB2VxG9kAFetc", "question": "Most of these items are probably used on what?", "choices": ["televisions", "cars", "light fixtures", "air conditioners"], "correct_choice_idx": 0, "direct_answers": ["televisions", "televisions", "televisions", "televisions", "televisions", "televisions", "television", "televisions", "television", "television"], "difficult_direct_answer": false, "rationales": ["The items are for tvs.", "These look like most of them would work best on a tv.", "These devices all have the number of buttons needed and layout suggestive of being television remotes."], "image": "train2014/COCO_train2014_000000019349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95427, "question_id": "GqViEycJCZky6edszjkCLZ", "question": "How was the orange used for display prepared?", "choices": ["grated", "pounded", "sliced", "pulverized"], "correct_choice_idx": 2, "direct_answers": ["cut", "halved", "cut", "sliced", "sliced", "sliced", "sliced", "sliced", "sliced", "half sliced"], "difficult_direct_answer": false, "rationales": ["The orange is cut in half.", "The orange was cut in half.", "The oranges on the top of the piles have been sliced."], "image": "val2014/COCO_val2014_000000095427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285498, "question_id": "GqWREwUxWHDk8htFmXvLyS", "question": "What does the woman use the umbrella for?", "choices": ["flying", "hiding", "rain cover", "shade"], "correct_choice_idx": 3, "direct_answers": ["shade", "shade", "shade", "sun", "shade", "shade", "shade", "block sun", "protection", "sun protection"], "difficult_direct_answer": false, "rationales": ["The woman wants shade.", "Given the weather setting it's easy to understand why she is using it as she is.", "Umbrella are normally used for protection against rain or the sun. since it is not raining, use must be for shade."], "image": "train2014/COCO_train2014_000000285498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370922, "question_id": "GqoAmhFeR6iWnBGaW2sx3x", "question": "The ATVs being carried on the flatbed truck are used by which public agency?", "choices": ["fire department", "police", "city hall", "health department"], "correct_choice_idx": 1, "direct_answers": ["police", "police", "police", "rigid", "police", "police", "police", "police", "police", "rigid"], "difficult_direct_answer": false, "rationales": ["The atvs have flashing lights on them and are blue in color.", "It says who the flat bed truck for is on the side of it.", "Pd sometimes need this type of vehicle."], "image": "train2014/COCO_train2014_000000370922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243021, "question_id": "GqyEfTu5h4pNFu5Gxnbt6W", "question": "Why should they wear gloves?", "choices": ["cold weather", "identifying themselves", "hygiene", "fashion"], "correct_choice_idx": 2, "direct_answers": ["germs", "hygiene", "sanitary use", "promote sanitation", "prevent sickness", "prevent covid", "safety", "handling food", "serving food", "to protect"], "difficult_direct_answer": true, "rationales": ["These plastic gloves protect food from germs", "Germs can be passed when people touch surfaces.", "They want to be hygienic."], "image": "train2014/COCO_train2014_000000243021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166244, "question_id": "GrSNXrtdbL423QvxgWBSkx", "question": "What is the last letter on the last screen to the right?", "choices": ["m", "c", "w", "d"], "correct_choice_idx": 1, "direct_answers": ["see", "letter c", "v c", "letter c", "letter c", "cc", "letter c", "letter c", "capital c", "c"], "difficult_direct_answer": false, "rationales": ["The last letter on this screen is a sideways semi-circle. this shape identifies the letter c.", "The last letter is c.", "The word on the last screen to the right in ingsoc."], "image": "val2014/COCO_val2014_000000166244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550707, "question_id": "GrVroq5duzV88XBrbfZZes", "question": "Why does cut the cake?", "choices": ["punish baker", "make smaller", "easy disposal", "feed friends"], "correct_choice_idx": 3, "direct_answers": ["woman", "birthday", "to serve", "to eat", "birthday", "birthday cake", "feed friends", "to serve", "birthday girl", "birthday party"], "difficult_direct_answer": false, "rationales": ["They have cut it so they can all have a piece.", "The cake is meant to feed friends.", "It's a more sanitary way for everyone to eat some of it"], "image": "val2014/COCO_val2014_000000550707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30391, "question_id": "Grdx7XkjTsMj3FeVDYWNGP", "question": "What type of building might this bathroom be in?", "choices": ["library", "school", "hotel", "house"], "correct_choice_idx": 2, "direct_answers": ["hotel", "hotel", "hotel", "hotel", "airplane", "airport", "hotel", "hotel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["A formal bathroom with commercial components is shown.", "The level of cleanness shows that it is a hotel.", "This looks like it's in a hotel bathroom."], "image": "train2014/COCO_train2014_000000030391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573786, "question_id": "GrsNJxqWBNMQwBpywq7FLA", "question": "What is he getting read to do?", "choices": ["smoke", "sing", "sleep", "eat"], "correct_choice_idx": 3, "direct_answers": ["eat", "eat dinner", "eat", "eat", "eat", "eat", "eat", "eat", "eat", "eat"], "difficult_direct_answer": false, "rationales": ["The person is eating.", "There is food in front of the man.", "The man has a meal in front of him."], "image": "train2014/COCO_train2014_000000573786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344661, "question_id": "Gs42bnJTZXx877iCzdbKMr", "question": "What prevents the motorcycle from falling over?", "choices": ["kickstand", "wheels", "brakes", "curb"], "correct_choice_idx": 0, "direct_answers": ["stand", "kickstand", "kickstand", "kickstand", "bike stand", "kickstand", "kick stand", "kickstand", "kickstand", "kickstand"], "difficult_direct_answer": false, "rationales": ["The motorcycle has a kickstand.", "There is a kickstand under the bike.", "The sides, front, and back of the motorcycle are not touching anything, and would fall over without something extending its body to the right or left."], "image": "train2014/COCO_train2014_000000344661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543263, "question_id": "Gs7d4XChPD8crrvxHVjeks", "question": "What is the process that produces the type of animal depicted on the bench called?", "choices": ["germination", "pollination", "metamorphosis", "mitosis"], "correct_choice_idx": 2, "direct_answers": ["butterfly", "metamorphosis", "metamorphosis", "butterfly", "metamorphosis", "metamorphosis", "metamorphosis", "metamorphosis", "metamorphosis", "metamorphosis"], "difficult_direct_answer": false, "rationales": ["Butterflies use this process to change from a caterpillar into a butterfly.", "A caterpillar will turn into this.", "The bench depicts a butterfly, which are produced by the process known as metamorphosis."], "image": "train2014/COCO_train2014_000000543263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220307, "question_id": "GsDDevqhkWZCL69xNfXiSu", "question": "What are the people on the vehicle to the right involved in?", "choices": ["safari", "hitch hiking", "school ride", "selling"], "correct_choice_idx": 0, "direct_answers": ["safari", "tourism", "touring", "safari tour", "safari", "safari", "feeding animals", "feeding", "safari", "tour"], "difficult_direct_answer": false, "rationales": ["The people are watching the giraffes.", "A white bus is driving through an area with giraffes. people go on safaris to see animals.", "The people are on a safari."], "image": "val2014/COCO_val2014_000000220307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75234, "question_id": "GsHQPskrFr3vN6AnXvt4T6", "question": "What surface are the men playing on?", "choices": ["indoor hard", "grass", "clay", "outdoor hard"], "correct_choice_idx": 3, "direct_answers": ["artificial", "hard", "tennis court", "harcourt", "outdoor hard", "clay", "cement", "tennis court", "court", "hard court"], "difficult_direct_answer": true, "rationales": ["You can tell by the setting and look of the surface of the ground as to what they are playing on.", "A man is in the middle of a tennis court. he has a shadow above him as he plays.", "The natural lighting is created by the suns rays which are not blocked by any sort of enclosure. you can also tell that the surface is not soft because the weight from the man does not create any indentation."], "image": "val2014/COCO_val2014_000000075234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77178, "question_id": "GsL5QayBvhSAwpjyppEvPr", "question": "The clay cooking ware made from hand is at least how old?", "choices": ["20 years", "50 years", "100 years", "500 years"], "correct_choice_idx": 3, "direct_answers": ["1000", "100 years", "500 years", "thousand years", "1000 years", "500 years", "1000 years", "wood product", "ancient", "five hundred"], "difficult_direct_answer": false, "rationales": ["That is how old the plates might be.", "The clay must be super old.", "They were made a long time ago."], "image": "val2014/COCO_val2014_000000077178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515540, "question_id": "GsTJ5wzfDU4Br8fnjWW3Ut", "question": "What flavoured beverage is in the bottle?", "choices": ["soda", "beer", "water", "wine"], "correct_choice_idx": 2, "direct_answers": ["not visible", "beer", "tea", "vitamin water", "fruit punch", "sports drink", "vitamin water", "water", "vitamin water", "water"], "difficult_direct_answer": false, "rationales": ["The beverage is bottled vitamin water.", "The type of beverage appears after vitamin on the label of the bottle.", "The bottle says what it is."], "image": "val2014/COCO_val2014_000000515540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61621, "question_id": "GsVRJnZFRSDCvDxxXf52do", "question": "What kind of chopper is this?", "choices": ["cargo", "attack", "medical", "law enforcement"], "correct_choice_idx": 2, "direct_answers": ["medical", "his zulu", "red cross", "helicopter 474", "ambulance", "helicopter", "rescue", "rescue", "medical helicopter", "medic"], "difficult_direct_answer": true, "rationales": ["There is a red cross on it which is an international sign for medical", "This helicopter or chopper has a white square with a red cross inside which is the symbol for people that help others with health crises.", "The red cross on the side indicates that it's a."], "image": "train2014/COCO_train2014_000000061621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149280, "question_id": "GsWhujkWVtrMxsrAhLwrH4", "question": "Which direction are the people seen riding the lift going?", "choices": ["sideways", "none", "up", "down"], "correct_choice_idx": 2, "direct_answers": ["up", "up", "up", "up", "up", "going up", "uphill", "going up", "up", "up"], "difficult_direct_answer": false, "rationales": ["The people in the lift are seen riding upwards.", "The ski lift only takes people up the mountain. they have to ski down the mountain if they want to come down.", "Ski lifts travel to the top of a mountain."], "image": "train2014/COCO_train2014_000000149280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326863, "question_id": "GsZFkQjaXa2GHNxmu7gysS", "question": "What type terrain is this train passing over?", "choices": ["sand", "rocky", "even", "mountain"], "correct_choice_idx": 2, "direct_answers": ["grass", "gravel", "thickets", "woods", "narrow area", "grass", "forest", "even", "wooded", "grassy"], "difficult_direct_answer": true, "rationales": ["There are no slopes", "The area is grassy.", "The terrain is even."], "image": "val2014/COCO_val2014_000000326863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222771, "question_id": "GszYSnqdfVXfFwrPPfPnWz", "question": "Why are the men wearing a green vest?", "choices": ["fashion", "camouflage", "dress code", "visibility"], "correct_choice_idx": 3, "direct_answers": ["jerseys", "identification", "noticed", "safety", "visibility", "safety race", "for scoring", "to identify", "safety", "racing motorcycles"], "difficult_direct_answer": true, "rationales": ["So drivers can see them on the road", "The bright green colour will allow people to see them, especially out on the road.", "The men are trying to stay visible."], "image": "val2014/COCO_val2014_000000222771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192866, "question_id": "GtNdJvCutr8sQyS6XRFebF", "question": "What temperature is in the orange and white box?", "choices": ["warm", "cold", "room temp", "hot"], "correct_choice_idx": 1, "direct_answers": ["cold", "cold", "cold", "cold", "30 degrees", "cold", "cool", "cold", "cool", "cold"], "difficult_direct_answer": false, "rationales": ["The orange and white box is a cooler. coolers are designed to keep things cold.", "The orange and white box is a cooler. people usually put food and drinks in there to keep them cool.", "The temperature is cold."], "image": "train2014/COCO_train2014_000000192866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286576, "question_id": "GtSkQecT8it4jqL4Ezd5H9", "question": "What color seat does someone handicapped sit on here?", "choices": ["brown", "candy striped", "red", "gray"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "gray", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["There is a sign over two seats of this color that says the seats are for people who are handicapped.", "The seats with the wheelchair sign are gray.", "They can sit on the grey seats."], "image": "train2014/COCO_train2014_000000286576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517973, "question_id": "GtuhQsVxM7eDCxRfNaw7zQ", "question": "What type of car is this?", "choices": ["buggy", "van", "hatchback", "convertible"], "correct_choice_idx": 0, "direct_answers": ["ladybug", "volkswagen bug", "volkswagen", "vw bug", "buggy", "vw", "volkswagen beetle", "vw bug", "volkswagen", "vw beetle"], "difficult_direct_answer": false, "rationales": ["The car is a vw bug.", "The car is small like a bug.", "The car pictured is a volkswagen bug."], "image": "val2014/COCO_val2014_000000517973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367398, "question_id": "GuAEVGnsuzrRLEqK9QUM23", "question": "What is the video game console connected to the television currently doing?", "choices": ["updating", "formatting", "rebooting", "starting"], "correct_choice_idx": 0, "direct_answers": ["loading", "updating", "nintendo wii", "idle", "display", "updating", "loading", "updating", "updating game", "updating"], "difficult_direct_answer": false, "rationales": ["The video game console is currently updating.", "The text on the screen indicates what the console is doing.", "An update screen is shown behind a person holding video game controllers."], "image": "val2014/COCO_val2014_000000367398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426996, "question_id": "GuDCq63fA9Fjbd5sdxZudu", "question": "Why does the boy in yellow cover his head?", "choices": ["religion", "warmth", "protection", "disguise"], "correct_choice_idx": 2, "direct_answers": ["for protection", "safety", "safety", "stay warm", "safety", "protection", "cold", "safety moment", "safety", "protection"], "difficult_direct_answer": false, "rationales": ["The boy is visibly wearing a helmet and skiing. when skiing most people wear helmets and this is for protection because of the risk of head injury during this activity.", "Snow skiing can be a dangerous sport. he wears the helmet as safety equipment.", "The boy needs protection."], "image": "train2014/COCO_train2014_000000426996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430688, "question_id": "GuGFEVFuyWfrBwneMQrZKV", "question": "In which direction is the bear seen here currently moving?", "choices": ["down", "up", "forward", "none"], "correct_choice_idx": 3, "direct_answers": ["right", "up", "cloaks", "none", "forward", "right", "west", "upwards", "up", "downward"], "difficult_direct_answer": false, "rationales": ["It is not a real bear, and it is not actually moving. it is an art installation that remains in the same place.", "The counterweight suspended below and supports on either side of this statue makes sure it doesn't move.", "The bear has its hands out in front with one foot in front indicating that is will be walking ahead."], "image": "train2014/COCO_train2014_000000430688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234717, "question_id": "GuSzyVN2kgnxqMofRMQNj2", "question": "Where would lighting be most likely to hit in this area?", "choices": ["water", "vehicle", "rocks", "lightning rod"], "correct_choice_idx": 3, "direct_answers": ["steeple", "lightning pole", "castle", "building spire", "top castle", "building", "lightning rod", "everywhere", "tower", "church tower"], "difficult_direct_answer": true, "rationales": ["Lightning usually strikes the highest object in an area. here, an item is placed on the highest point possible to attract the lightning, thus keeping it away from other areas.", "The lightning rod would hit.", "The highest point is a metal lightning rod."], "image": "train2014/COCO_train2014_000000234717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230133, "question_id": "GuTW2P2xwCVmS4aoB2DHkV", "question": "What might her religion be?", "choices": ["jew", "muslim", "christian", "buddhist"], "correct_choice_idx": 1, "direct_answers": ["watching", "muslim", "muslim", "islam", "islam", "hindi", "islam", "muslim", "muslim", "islam"], "difficult_direct_answer": false, "rationales": ["Many women of this religion use these type of head covers", "A woman wears a head covering hijab. muslim woman cover their heads.", "The woman is wearing a hijab which is worn by muslims."], "image": "train2014/COCO_train2014_000000230133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20853, "question_id": "GujQUf4Sy2VHDccZAJCifo", "question": "What is the maximum speed of the horse?", "choices": ["88km/h", "75km/h", "50km/h", "80km/h"], "correct_choice_idx": 0, "direct_answers": ["88km/h", "30 mph", "40mph", "55mph", "55 mph", "100kmhr", "fast", "forty mph", "55 mph", "55 mph"], "difficult_direct_answer": false, "rationales": ["Horses are known to be extremely quick.", "Horses can go at eighty eight kilometers an hour.", "The speed is 88."], "image": "train2014/COCO_train2014_000000020853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214937, "question_id": "GusUo4cN98Raq8uSuj7ZdF", "question": "What can be played on here?", "choices": ["trampoline", "sand box", "bounce castle", "slide"], "correct_choice_idx": 3, "direct_answers": ["frisbee", "frisbee", "frisbee", "slide", "slide", "frisbee", "slide", "frisbee", "slide", "frisbee"], "difficult_direct_answer": false, "rationales": ["There is a red slide in the background.", "There is a playground with a slide in the background.", "A slide is in the background."], "image": "train2014/COCO_train2014_000000214937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143436, "question_id": "GvBPyPsBcZqRBxssyVcEEc", "question": "What is the wall on the right made from?", "choices": ["stone", "wood", "steel", "plaster"], "correct_choice_idx": 0, "direct_answers": ["concrete", "brick", "bricks", "bricks", "brick", "bricks", "brick", "bricks", "stone", "bricks"], "difficult_direct_answer": false, "rationales": ["The wall is stone.", "Most outdoor structures are made of stone because they last longer than most other building materials.", "The bricks are made of a hard natural material."], "image": "train2014/COCO_train2014_000000143436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321026, "question_id": "GvEYYFZymBaVZ3e5g6QNeR", "question": "What type byway is shown here?", "choices": ["freeway", "raceway", "nature path", "railway"], "correct_choice_idx": 2, "direct_answers": ["trail river", "river", "nature path", "canal", "canal", "canal", "river", "river", "river", "water"], "difficult_direct_answer": false, "rationales": ["There is a small path next to the water.", "The scene is surrounded by nature with a small paved pave that is to narrow to be fit for vehicles. answers b-d are facilitate vehicles.", "The byway is a nature path for hiking."], "image": "val2014/COCO_val2014_000000321026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54337, "question_id": "GvRKd2j3m5HhstgnTsXdZU", "question": "What are the men taste testing?", "choices": ["milk", "water", "juice", "wine"], "correct_choice_idx": 3, "direct_answers": ["beer", "beer", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "beer"], "difficult_direct_answer": false, "rationales": ["There are glasses of wine on the table.", "There are wine bottles on the table.", "The men taste wine."], "image": "val2014/COCO_val2014_000000054337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575756, "question_id": "GvqpbQmjJHMD3W4co4kvJU", "question": "What body part can you see in the shadows?", "choices": ["head", "hand", "shoe", "finger"], "correct_choice_idx": 0, "direct_answers": ["head", "head", "head", "heads", "head", "head", "head", "hear", "heads", "head"], "difficult_direct_answer": false, "rationales": ["The sun must be shining behind them as their hair and skull are giving off their shadow on the ground.", "A person's head is showing up since the body part is round.", "There is a shadow of a person's head on the floor in front of the skateboard."], "image": "train2014/COCO_train2014_000000575756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357529, "question_id": "Gw2f7qQNWmFA5iVZrY7tjF", "question": "To whom is the ball being thrown?", "choices": ["game official", "batter", "fans", "manager"], "correct_choice_idx": 1, "direct_answers": ["catcher", "batter", "catcher", "batter", "batter", "catcher", "catcher", "batter", "batter", "batter"], "difficult_direct_answer": false, "rationales": ["Because the pitcher always throws the ball to the batter.", "The man is trying to get the batter out.", "This is a pitcher and it's his job to throw it to this type of player"], "image": "val2014/COCO_val2014_000000357529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259753, "question_id": "Gw2fwL3LSu9VHRxtJKhcqq", "question": "What is the woman preparing?", "choices": ["cake", "cookies", "croissants", "cinnamon rolls"], "correct_choice_idx": 3, "direct_answers": ["cinnamon rolls", "cinnamon rolls", "donuts", "cinnamon rolls", "desserts", "cinnamon buns", "buns", "donuts", "donuts", "desserts"], "difficult_direct_answer": false, "rationales": ["The woman is making rolls with white icing on top.", "As indicated by the shape, color, glaze and positioning on the rack.", "There are cinnamon rolls on the cooling rack."], "image": "train2014/COCO_train2014_000000259753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448930, "question_id": "Gw5CUC7mqMaXTgPVxkJ7og", "question": "What airport is this?", "choices": ["san francisco", "los angeles", "new york", "hamilton"], "correct_choice_idx": 0, "direct_answers": ["unknown", "san francisco", "san francisco", "san francisco", "so", "so", "san francisco", "u.s. airport", "busy", "unsure"], "difficult_direct_answer": false, "rationales": ["The airport is in san francisco.", "That city has the busiest airport.", "There are luggage tags with the abbreviation \"sfo\" which is the san francisco airport abbreviation. the tags on the luggage usually represent the end destination."], "image": "train2014/COCO_train2014_000000448930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182765, "question_id": "GwGFUwirXZ5UNW9ZjaF9Q2", "question": "What type of structures are shown?", "choices": ["home", "tent", "hotel", "garage"], "correct_choice_idx": 1, "direct_answers": ["tents", "huts", "tent", "tents", "tents", "tents", "tents", "tents", "huts", "huts"], "difficult_direct_answer": false, "rationales": ["The structures are tents.", "The structures shown here are tents for the kite festival.", "Each structure has a canopy that is supported by poles. the blue one does not have walls, so it is not a home, hotel, or garage."], "image": "train2014/COCO_train2014_000000182765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46298, "question_id": "GwdbwWoiK6bGLKwSt6oKeJ", "question": "What is bad about this food?", "choices": ["high fat", "high carb", "high sugar", "high sodium"], "correct_choice_idx": 3, "direct_answers": ["sodium", "nothing", "high sodium", "processed", "lack nutrition", "sodium", "unnatural", "sodium", "salt content", "raw"], "difficult_direct_answer": false, "rationales": ["It is a partially cured meat which needs this substance to make it effective", "The hot dogs have sodium.", "The food in question is a hot dog. hot dogs are known to have high sodium which is considered bad."], "image": "train2014/COCO_train2014_000000046298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437180, "question_id": "Gwr627MSUaXstrSVefNryx", "question": "The blue bottle is there to satisfy what need?", "choices": ["elimination", "thirst", "medication", "hunger"], "correct_choice_idx": 1, "direct_answers": ["thirst", "thirst", "thirst", "water", "water", "thirst", "thirst", "thirst", "thirst", "thirst"], "difficult_direct_answer": false, "rationales": ["The bottle is for thirst.", "The bottle has water in it.", "The bottle is filled with water."], "image": "val2014/COCO_val2014_000000437180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525865, "question_id": "GwsEog4dkC6JAcDarhwVrZ", "question": "Where is this scene taking place?", "choices": ["house", "store", "cafeteria", "furniture store"], "correct_choice_idx": 2, "direct_answers": ["kitchen", "cafeteria", "kitchen", "cafeteria", "fridge", "break room", "kitchen", "school", "cafeteria", "cafeteria"], "difficult_direct_answer": false, "rationales": ["You can tell by the roll-away coolers as to where this is.", "This is in a place where several people eat.", "The 4 silver appliances are refrigerators. refrigerators store food."], "image": "train2014/COCO_train2014_000000525865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8733, "question_id": "GwypMtnaZWoSU6MD4NTTVb", "question": "Where was this food obtained?", "choices": ["restaurant", "home", "relative's", "school"], "correct_choice_idx": 0, "direct_answers": ["thai restaurant", "thai restaurant", "restaurant", "store", "restaurant", "restaurant", "deli", "cafeteria", "cafeteria", "restaurant"], "difficult_direct_answer": false, "rationales": ["The food is in a take-out container from a restaurant that offers food to-go.", "The food includes a plastic container and a fork individually wrapped.", "A meal is on a table in a takeout container. restaurants use takeout containers."], "image": "train2014/COCO_train2014_000000008733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313993, "question_id": "GwzND2dpwQaizVsfNnb5Fy", "question": "The animal on the right can best be described how?", "choices": ["six-legged", "hairless", "fluffy", "miniature"], "correct_choice_idx": 2, "direct_answers": ["cat", "gray cat", "cat", "fluffy", "domestic cat", "cat", "cat", "cat", "cat", "longhair fluffy"], "difficult_direct_answer": false, "rationales": ["The animal on the right is a cat. it has fur, is regular sized, and has four legs.", "The animal is fluffy.", "The animal on the right is a four-legged cat that has hair and is regular-sized."], "image": "train2014/COCO_train2014_000000313993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71264, "question_id": "Gx3X3njzKENMX5XuTsMgyF", "question": "What type of building does this dog live in?", "choices": ["highrise", "duplex", "bungalow", "trailer"], "correct_choice_idx": 0, "direct_answers": ["skyscraper", "condo", "apartment", "highrise", "highrise", "high rise", "expensive house", "apartment", "high rise", "high rise"], "difficult_direct_answer": false, "rationales": ["You can tell the building is tall by looking out the window. you can see all of the other tall buildings.", "The building is a highrise.", "The outside view indicates that the apartment is on a high floor of the building."], "image": "train2014/COCO_train2014_000000071264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74577, "question_id": "GxDLyN8Y6wvh8y6Zca29mr", "question": "What meal is being served at the white table?", "choices": ["lunch", "breakfast", "dessert", "dinner"], "correct_choice_idx": 2, "direct_answers": ["dessert", "dessert", "pizza", "dessert", "dessert", "desert", "dessert", "dessert", "dessert", "dessert"], "difficult_direct_answer": false, "rationales": ["There are two cakes with two tubs of ice cream on the table.", "The items on the table look to be cake and ice cream, which are popular sweet items served as dessert at celebratory events.", "There are cakes on the table."], "image": "train2014/COCO_train2014_000000074577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123462, "question_id": "Gxrf3QCosixuxuAFeHicDn", "question": "What is the black outfit the surfer is wearing made of?", "choices": ["plastic", "leather", "neoprene", "wool"], "correct_choice_idx": 2, "direct_answers": ["rubber", "neoprene", "neoprene", "synthetic material", "neoprene", "rubber", "neoprene", "neoprene", "spandex", "neoprene"], "difficult_direct_answer": false, "rationales": ["That is what the suit is made.", "The man's wetsuit is made of neoprene.", "The surfer is wearing a leather outfit because it does not penetrate any water."], "image": "train2014/COCO_train2014_000000123462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410378, "question_id": "Gy5S8wemCJPd3gqR2a2Jy3", "question": "What is the man putting on?", "choices": ["tie", "gloves", "armor", "hat"], "correct_choice_idx": 0, "direct_answers": ["tie", "necktie", "tie", "neck tie", "tie", "necktie", "tie", "tie", "necktie", "tie"], "difficult_direct_answer": false, "rationales": ["The man has a tie around his neck that's being fastened.", "The man is pulling on his tie.", "The man is putting on the tie for his neck."], "image": "train2014/COCO_train2014_000000410378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435713, "question_id": "GyAcGHZij8FdoMeavw826T", "question": "Who went on a quest for the item the man has in his hand?", "choices": ["achilles", "hulk hogan", "hercules", "sir galahad"], "correct_choice_idx": 3, "direct_answers": ["harry potter", "indiana jones", "indiana jones", "indiana jones", "indiana jones", "man", "king arthur", "indiana jones", "sir galahad", "indiana jones"], "difficult_direct_answer": false, "rationales": ["The holy grail was well sought after by this night of king arthur's round table.", "That is the name of the traveller.", "Sir galahad went on a mission for this."], "image": "train2014/COCO_train2014_000000435713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485381, "question_id": "GyTUAMHWyZJ6dsLzHKrSg7", "question": "Why are the skis pointing away from each other?", "choices": ["he's unbalanced", "stay still", "wants fall", "no control"], "correct_choice_idx": 1, "direct_answers": ["pushing off", "walking", "stopping", "stay still", "skier accelerating", "bad skier", "build momentum", "balance", "positioning", "for balance"], "difficult_direct_answer": true, "rationales": ["The individual is attempting to generate additional speed and balance.", "The skier has their poles pointing away from each other so they can stay still on the snow.", "The skis are both pointing away from eachother in an effort to remain still."], "image": "train2014/COCO_train2014_000000485381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468956, "question_id": "GyXxQmCvjcfNgxERjHEFqR", "question": "What skill does the child hone here?", "choices": ["tooth brushing", "singing", "reading", "sleeping"], "correct_choice_idx": 2, "direct_answers": ["reading", "reading", "reading", "reading", "reading", "reading", "reading", "reading", "reading", "reading"], "difficult_direct_answer": false, "rationales": ["He has a bunch of books on the bed and one open.", "The child has a bunch of books on his bed. people read books.", "The skill is reading."], "image": "train2014/COCO_train2014_000000468956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329018, "question_id": "GyYrWAhmZXbT2tm55Wp7WE", "question": "Which child is probably the guest of honor?", "choices": ["orange", "yellow dress", "black shirt", "pink dress"], "correct_choice_idx": 1, "direct_answers": ["middle girl", "blond girl", "princess outfit", "center", "yellow dress", "girl", "blonde girl", "girl", "blonde girl", "blonde girl"], "difficult_direct_answer": false, "rationales": ["The child in yellow is being honored.", "Everyone is looking at her.", "The guest of honors gets the first piece of cake"], "image": "train2014/COCO_train2014_000000329018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425472, "question_id": "GycYNcWiy5CrxCT5ZYbJf6", "question": "What is she doing on the bench?", "choices": ["selling umbrella", "resting", "enjoying scenery", "hiding"], "correct_choice_idx": 2, "direct_answers": ["sitting", "enjoying scenery", "sitting", "sitting", "sitting", "sitting", "sitting", "sitting", "sitting", "sitting"], "difficult_direct_answer": false, "rationales": ["The person is enjoying the ocean.", "The woman is taking a look at the water.", "A person is sitting on a bench in front of the ocean. people enjoy watching the ocean."], "image": "val2014/COCO_val2014_000000425472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429456, "question_id": "GyiBbudjirUvFfR9EMefHr", "question": "Why is the person using an umbrella?", "choices": ["sun", "rain", "snow", "costume"], "correct_choice_idx": 0, "direct_answers": ["avoiding sunlight", "sun protection", "sun shield", "shade", "sun", "sun protection", "shade", "rain", "sun", "shade"], "difficult_direct_answer": false, "rationales": ["They are using it to stay cooler in the hot weather.", "In the pictures it's sunny and there's no water on the ground. when the sun makes it too hot some people hold an umbrella over their head.", "The sun is shining onto the boy."], "image": "train2014/COCO_train2014_000000429456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561643, "question_id": "GyiPRVRxSdmxx8P5jsbfeM", "question": "What is the equipment in the background used for?", "choices": ["relaxation", "heat", "filtration", "brewing"], "correct_choice_idx": 3, "direct_answers": ["show display", "brewing", "brewing", "making beer", "brewing", "parties", "brewing beer", "sitting", "brewing", "make alcohol"], "difficult_direct_answer": false, "rationales": ["It's to brew assorted types of beer.", "The equipment is used to brew coffee.", "The tank in the background behind the glass is used to make beer."], "image": "train2014/COCO_train2014_000000561643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426482, "question_id": "GzCFCv9jJWvgUW8d9itEsa", "question": "What continent are these animals naturally found?", "choices": ["asia", "africa", "europe", "north america"], "correct_choice_idx": 1, "direct_answers": ["africa", "africa", "africa", "africa", "africa", "africa", "africa", "africa", "africa", "africa"], "difficult_direct_answer": false, "rationales": ["The continent is africa.", "This habitat houses giraffes and zebras. both these animals are found in africa.", "These animals can be found in africa."], "image": "train2014/COCO_train2014_000000426482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489520, "question_id": "GzPBSkeJsPWYttojhbQDmb", "question": "WHat is the item with wires called?", "choices": ["ski wires", "wire chair", "chair lift", "wiring"], "correct_choice_idx": 2, "direct_answers": ["ski lift", "lift", "chairlift", "chairlift", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift", "chair lift"], "difficult_direct_answer": false, "rationales": ["Also called a skit lift, chair lifts are used on mountain and ski slopes for people to ski off of.", "That's a lift to carry skiers to the top of the mountain.", "People sit in it to be transported up the mountain."], "image": "train2014/COCO_train2014_000000489520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388940, "question_id": "GzRSaUCwgyCtW2xtx6tBme", "question": "Which wood used to make baseball bat?", "choices": ["ash", "pine", "sandal", "maple"], "correct_choice_idx": 3, "direct_answers": ["oak", "ash", "maple", "pine", "oak", "aluminum", "maple bats", "maple", "ash", "maple"], "difficult_direct_answer": false, "rationales": ["The baseball bat used by the batter at the plate is made from maple which is a very strong wood.", "That is the strongest wood for bats.", "Usually bats are made of maple."], "image": "train2014/COCO_train2014_000000388940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155189, "question_id": "GzSPiJc8oj3GcfNwH7xLfd", "question": "What action is the player here about to take?", "choices": ["serving", "return volley", "love", "side out"], "correct_choice_idx": 1, "direct_answers": ["swing", "lop", "hit ball", "serve", "forehand", "hitting ball", "return volley", "hit ball", "hitting ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["He is using both hands to hit the ball.", "The player is about to swing to return the tennis ball over the net.", "By the position of the racket and ball it's plain to see what is going to happen next."], "image": "val2014/COCO_val2014_000000155189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256879, "question_id": "H24b5gMbkwL9xzEAZBenLA", "question": "In which class of the sport does the tennis player compete?", "choices": ["college", "amateur", "juniors", "wheelchair"], "correct_choice_idx": 3, "direct_answers": ["disabled category", "wheel chair", "olympics", "special olympics", "wheelchair", "handicapped", "wheelchair", "special olympics", "handicap", "parathlete"], "difficult_direct_answer": false, "rationales": ["The player is in a chair while competing.", "She is playing in a wheelchair league.", "The tennis player has a mobility impairment and is participating in the paralympics games."], "image": "val2014/COCO_val2014_000000256879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321802, "question_id": "H2ADDatGBrzhjE9itqofb6", "question": "What activity happens near and in this structure?", "choices": ["baseball", "tourism", "office work", "banking"], "correct_choice_idx": 1, "direct_answers": ["worship", "sitting", "hiking", "church activities", "tourism", "contemplation", "sight seeing", "tourism", "wedding", "resting"], "difficult_direct_answer": true, "rationales": ["There is a ruin that people would want to visit to look at.", "The activity is tourism.", "This is a place that tourists would go to."], "image": "train2014/COCO_train2014_000000321802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63231, "question_id": "H2dRq25GBx9NnGUpgV9Dzw", "question": "What sort of oil is available on this table?", "choices": ["canola", "motor", "olive", "sesame"], "correct_choice_idx": 2, "direct_answers": ["olive", "edible", "edible", "olive", "olive", "olive", "olive", "edible", "olive", "olive"], "difficult_direct_answer": false, "rationales": ["It's the most popular oil in many regions.", "Olive oil is in the vases.", "A yellow oil can be seen in the clear glassware."], "image": "val2014/COCO_val2014_000000063231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6896, "question_id": "H2dhpTgVBeGZ3uUb7reR7g", "question": "What state are giraffes in?", "choices": ["free", "hospitalized", "dead", "captive"], "correct_choice_idx": 3, "direct_answers": ["new york", "captive", "captive", "attention", "standing", "standing", "captivity", "domestic", "awake", "present"], "difficult_direct_answer": false, "rationales": ["Two giraffe are in the distance held in by a wired fence.", "The giraffes are in a tall enclosure.", "The animals are at a zoo."], "image": "val2014/COCO_val2014_000000006896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392687, "question_id": "H2dyJtT7tYMcUKeBGBm2eK", "question": "The kite here is designed to resemble what?", "choices": ["butterfly", "house fly", "dog", "bird"], "correct_choice_idx": 0, "direct_answers": ["butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly"], "difficult_direct_answer": false, "rationales": ["Monarchs have yellow base and black accents.", "It has pretty wings that resemble this insect", "The kite looks like a butterfly."], "image": "val2014/COCO_val2014_000000392687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281214, "question_id": "H2mwbw8TiSeQjJ2BfaPVkp", "question": "What shot is the man about to hit?", "choices": ["backhand", "forehand", "drop shot", "serve"], "correct_choice_idx": 1, "direct_answers": ["forehand", "ball", "forehand", "tennis", "forehand", "forearm", "forehand shot", "ball", "underhand", "overhand"], "difficult_direct_answer": false, "rationales": ["It's right in front of him", "The palm of his hand is out to hit the ball.", "The shot is for the forehand."], "image": "train2014/COCO_train2014_000000281214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105291, "question_id": "H3A7UNGk9j5BVVtUytCo8E", "question": "What are the blue signs on the pole showing?", "choices": ["traffic warnings", "animal crossings", "parking prices", "street names"], "correct_choice_idx": 3, "direct_answers": ["guide", "road signs", "street sign", "distance", "school house", "street names", "street names", "street names", "street names", "street names"], "difficult_direct_answer": false, "rationales": ["The blue signs on the poles are showing street names.", "The road signs tell us where this is", "Cities and towns provide names of roads at crossroads which is where you would also find stop signs. these signs are usually the same color all through town."], "image": "val2014/COCO_val2014_000000105291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138166, "question_id": "H3UGbNGmE6nttzoUWiyYcv", "question": "What type of bread was used for the sandwich?", "choices": ["white", "rye", "wheat", "ciabatta"], "correct_choice_idx": 0, "direct_answers": ["sesame", "roll", "french bread", "french", "hoagie", "flatbread", "white", "baguette", "hoagie", "wheat"], "difficult_direct_answer": true, "rationales": ["You can tell by the color of the bread as to what type it is.", "The sandwich was made with a white baguette.", "No darker grain is visible"], "image": "train2014/COCO_train2014_000000138166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43968, "question_id": "H3dHTfqoWBF8VGsjJbv3j9", "question": "Where is the boy playing at?", "choices": ["mountain", "resort area", "ski resort", "neighborhood"], "correct_choice_idx": 3, "direct_answers": ["mountain", "snowy hill", "neighborhood", "ski resort", "snow hill", "course", "above snow", "ski resort", "sky resort", "beach"], "difficult_direct_answer": true, "rationales": ["Residential houses can be seen around.", "A boy is in the snow in front of large apartment buildings. apartment buildings are in neighborhoods.", "The boy is in a neighborhood."], "image": "train2014/COCO_train2014_000000043968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155806, "question_id": "H3ezDmyKAbPgaxnijzfCQv", "question": "Which holiday is being celebrated at this home?", "choices": ["new years", "christmas", "valentine's day", "halloween"], "correct_choice_idx": 3, "direct_answers": ["halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween"], "difficult_direct_answer": false, "rationales": ["The holiday is halloween.", "The people here are celebrating halloween.", "A typical halloween symbol is a pumpkin, which is seen on the flag hanging on the house, meaning it is that holiday which is being celebrated."], "image": "train2014/COCO_train2014_000000155806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432657, "question_id": "H4PgCkzVCYxS8VKBt2aKV5", "question": "What is the position of the player in the middle of the field?", "choices": ["first baseman", "pitcher", "outfielder", "shortstop"], "correct_choice_idx": 1, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "battler", "pitcher", "white", "pitcher"], "difficult_direct_answer": false, "rationales": ["The player in the middle of the field is pitching the ball.", "The pitcher stands in the middle of the field.", "Men are on a baseball field in uniform and one throws the ball from an elevated mound in the center of the diamond."], "image": "val2014/COCO_val2014_000000432657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66708, "question_id": "H4p4wJjsHm2qscSkPvFpLC", "question": "What is the method being used to cook the broccoli?", "choices": ["bake", "steam", "fry", "grill"], "correct_choice_idx": 1, "direct_answers": ["steam", "steam", "steaming", "steaming", "saute", "steam", "steaming", "steamed", "steam", "steam"], "difficult_direct_answer": false, "rationales": ["The broccoli is in a basket that is used to steam veggies over boiling water.", "As indicated by the steaming tray that has holes in it. the other options don't fit with this image.", "The broccoli is in a container with holes in the bottom. vegetables and broccoli in particular are placed in these type of containers when there is a desire to allow steam to come up through and cook the food."], "image": "train2014/COCO_train2014_000000066708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3134, "question_id": "H5GLvX9jjUsDJLnsn9KicF", "question": "Which type of vehicle avoids this road?", "choices": ["tractor", "tank", "bus", "semi"], "correct_choice_idx": 1, "direct_answers": ["bikes", "truck", "bus", "tank", "train", "boat", "semis", "scooters", "bike", "front"], "difficult_direct_answer": true, "rationales": ["More in likely the only time this vehicle will be on streets is during city warfare.", "These are the public roads of a city. a tank would be an unusual vehicle to see on this road.", "The vehicle is a tank."], "image": "val2014/COCO_val2014_000000003134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158272, "question_id": "H5KswMY3xiEXwEqgAQDvHk", "question": "Why is the women using the paper in her hands?", "choices": ["to wrap", "to draw", "to wipe", "for directions"], "correct_choice_idx": 3, "direct_answers": ["for direction", "get directions", "get directions", "for directions", "find directions", "traveling", "read directions", "finding location", "reading map", "navigation"], "difficult_direct_answer": true, "rationales": ["It is a folded map", "She is consulting a map so she probably needs to find her way to a location.", "She is using the map to figure out where she needs to go."], "image": "val2014/COCO_val2014_000000158272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39812, "question_id": "H5gZtSjFEDxdbs7tDT5PSK", "question": "What type of dress code seems to be in place here?", "choices": ["skate wear", "formal wear", "casual attire", "beach wear"], "correct_choice_idx": 1, "direct_answers": ["formal", "formal", "formal", "formal", "black tie", "formal", "formal wear", "formal", "formal", "suite"], "difficult_direct_answer": false, "rationales": ["The dress code is formal.", "Tuxedos and dresses are considered formal wear.", "Tuxedo and fancy dresses indicate formal wear."], "image": "train2014/COCO_train2014_000000039812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227982, "question_id": "H7CbcWteT2y6gNkLbhRFFB", "question": "What is next to the tracks?", "choices": ["cats", "signs", "dogs", "trees"], "correct_choice_idx": 3, "direct_answers": ["grass", "tanker", "walkway", "grass", "pavement", "train", "trees", "tank", "vegetation", "train"], "difficult_direct_answer": false, "rationales": ["Green foliage that is grown wildly.", "There are no signs or non-human animals near the tracks. there are tall green plants behind the trains.", "The leafy green foliage that can be seen on the other side of the train, sticks out above the train indicating that they are tall trees."], "image": "train2014/COCO_train2014_000000227982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477451, "question_id": "H7DtfLj5ZzdbNvkEgfzMAk", "question": "Why is the boy reaching towards the ground?", "choices": ["to exercise", "to stretch", "to sit", "to catch"], "correct_choice_idx": 3, "direct_answers": ["catch ball", "to catch", "catch ball", "catch ball", "grab baseball", "get baseball", "catch baseball", "fielding ball", "catching ball", "catch baseball"], "difficult_direct_answer": false, "rationales": ["A boy is playing baseball. he has his mitt down to get a ball in it.", "He is holding out the glove which means he wants to catch the ball.", "The boy wants to catch."], "image": "train2014/COCO_train2014_000000477451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460916, "question_id": "H7bahyYwnjfiwxRdZiDmQ2", "question": "What size bed is this?", "choices": ["full", "king", "queen", "single"], "correct_choice_idx": 0, "direct_answers": ["queen", "queen", "queen", "queen", "double", "queen", "full", "full", "full", "queen"], "difficult_direct_answer": false, "rationales": ["The bed is a full one.", "The bed is bigger than twin but smaller than a queen.", "It is the size of a full bed."], "image": "train2014/COCO_train2014_000000460916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230381, "question_id": "H7e5rpAotnba7smmyUtcMw", "question": "Which area is viewed here?", "choices": ["bakery", "produce", "butcher", "meat counter"], "correct_choice_idx": 1, "direct_answers": ["store", "produce market", "produce", "fresh vegetables", "fruit stand", "fruit", "produce", "produce area", "produce", "produce"], "difficult_direct_answer": false, "rationales": ["There are only fruits and vegetables visible so this must be a produce section.", "There are fruits and vegetables in boxes.", "The area has produce."], "image": "train2014/COCO_train2014_000000230381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488693, "question_id": "H7txAhUhepCUbSCCZCnvRX", "question": "Why are they shoveling sand in the bucket?", "choices": ["ritual", "take home", "to sell", "to stabilize"], "correct_choice_idx": 3, "direct_answers": ["weighdown bucket", "weight", "weight", "sand castles", "make castle", "sand castles", "make castles", "stability", "to stabilize", "move it"], "difficult_direct_answer": false, "rationales": ["The people are filling buckets with sand in order to use the weight to stabilize something.", "It will hold the umbrella in place better", "They are pushing sand in a bucket so it becomes more compact, likely for a sandcastle."], "image": "val2014/COCO_val2014_000000488693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317622, "question_id": "H7ui8ZzULS2hr6wka3wiUn", "question": "Which building would be hardest to invade?", "choices": ["shortest", "on hill", "with flag", "darkest color"], "correct_choice_idx": 1, "direct_answers": ["on hill", "mountaintop", "castle", "largest", "castle", "fort", "castle", "castle", "mountain building", "top hill"], "difficult_direct_answer": false, "rationales": ["Any hostiles would have to scale a mountain before reaching this building. it's elevated position also gives it superior visibility and tactical advantages.", "The building is on the hill.", "A large mansion is on top of a hill and is visible to all in the city below. large houses, mansions, often have staff and possibly even security."], "image": "val2014/COCO_val2014_000000317622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371315, "question_id": "H87cobfrGTyJXVb3bn5pM7", "question": "What is the image of?", "choices": ["river", "buffet", "forest", "roadway"], "correct_choice_idx": 1, "direct_answers": ["food", "buffet", "food", "salad bar", "salad bar", "saladbar", "salad bar", "food", "salad bar", "food"], "difficult_direct_answer": false, "rationales": ["There are tons of trays of food out which means this is a buffet.", "There is a buffet in the middle of the image.", "The image shows a salad bar."], "image": "train2014/COCO_train2014_000000371315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366157, "question_id": "H87rUMRvqJKmXLVZCB5bZp", "question": "Why might the animals need to be moved into the red vehicle?", "choices": ["to feed", "to transport", "to groom", "to slaughter"], "correct_choice_idx": 1, "direct_answers": ["shearing", "transport", "shearing", "transport", "ramp", "be sold", "transport", "transportation", "to transport", "shipped off"], "difficult_direct_answer": false, "rationales": ["These sheep are being corralled into the back of a truck. this is not a suitable habitat for these animals so it is likely this truck will be taking them somewhere else.", "They are being sent somewhere so they need to get on the truck.", "The animals are being loaded into the red vehicle so they can be transported somewhere else."], "image": "val2014/COCO_val2014_000000366157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268356, "question_id": "H89z2sLGtfE97mJiRoSz3d", "question": "What are the white triangles in the distance surrounded by blue?", "choices": ["people", "ice caps", "birds", "sailboats"], "correct_choice_idx": 3, "direct_answers": ["boats", "boats", "sea", "sails", "boats", "sailboats", "for sailing", "sails", "sailboats", "mountains"], "difficult_direct_answer": false, "rationales": ["The white triangles are boats.", "The white triangles in the distance are the sails of the boats in the water.", "Boats can be propelled by large pieces of material that can catch the wind."], "image": "val2014/COCO_val2014_000000268356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454222, "question_id": "H8BUbw8HJFcVWZjgWndnx6", "question": "What type of room is seen here?", "choices": ["condo", "luxury hotel", "public restroom", "work office"], "correct_choice_idx": 1, "direct_answers": ["luxury hotel", "toilet", "bath room", "bathroom", "bathroom", "toilet", "hotel", "bathroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["Based on the white bathrobe and neatly folded white towels.", "It looks like the bathroom of a fancy hotel.", "The room is a hotel room."], "image": "train2014/COCO_train2014_000000454222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541721, "question_id": "H8BZYCVXDt7LSqYvSeWxv2", "question": "What is the name of the white device in the men's hands?", "choices": ["game controller", "calculator", "tv remote", "phone"], "correct_choice_idx": 0, "direct_answers": ["wii control", "game controller", "wii", "wii controller", "wiimote", "game controller", "wii remote", "nintendo wii", "remote", "remote"], "difficult_direct_answer": false, "rationales": ["A father and son are sitting on the couch playing with a wii remote.", "It's a controller for a wii.", "The men are controlling a video game."], "image": "train2014/COCO_train2014_000000541721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253389, "question_id": "H8PLRsoAh5JWvwoAc9Yaco", "question": "What is the white squared on the upper left used for?", "choices": ["tennis", "football", "basketball", "hanging billboards"], "correct_choice_idx": 2, "direct_answers": ["basketball", "basketball", "basketball", "protect", "basketball net", "backboard", "basketball", "hoops", "basketball", "distance"], "difficult_direct_answer": false, "rationales": ["A backboard with a rim can be found above the man on the far left.", "The white square on the upper left is a basketball hoop.", "The white square above the blacktop with a rim attached is used for the sport basketball."], "image": "train2014/COCO_train2014_000000253389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258344, "question_id": "H8UdUGyPoFWSmyEiSWiTFe", "question": "What is the breed of the cat in the image?", "choices": ["ragdoll", "maine coon", "sphynx", "persian"], "correct_choice_idx": 1, "direct_answers": ["tabby", "domestic", "tabby", "tabby", "tabby", "maine coon", "tabby", "tabby", "domestic", "tabby"], "difficult_direct_answer": false, "rationales": ["Maine coons are a bright orange color.", "The cat has fur, so it is not a sphynx. the fur is orange, not white.", "The cat in the photo matches the description of the animal in a. it does not resemble any of the animals in the other options."], "image": "train2014/COCO_train2014_000000258344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441212, "question_id": "H8ViqR2TkAfU8YGqjZ2XtT", "question": "What is the sign in front of?", "choices": ["stairs", "window", "fire hydrant", "bush"], "correct_choice_idx": 3, "direct_answers": ["fire hydrants", "blurry", "bush", "hydrant", "building", "hydrants", "building", "bush", "door", "red building"], "difficult_direct_answer": false, "rationales": ["The items are fire hydrants used to access the water supply provided by the city.", "The sign is in front of some bushes at the left.", "There are some bushes in the background."], "image": "train2014/COCO_train2014_000000441212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205134, "question_id": "H8ZgQZWJMLWuiMdyqoucBS", "question": "What type media theme might the man cutting the cake enjoy?", "choices": ["love stories", "basketball", "zombies", "romance"], "correct_choice_idx": 2, "direct_answers": ["birthday cake", "horror films", "zombie", "horror", "movies", "movie", "horror films", "zombies", "horror movies", "video games"], "difficult_direct_answer": true, "rationales": ["The man likes zombies and death.", "An old man is in the middle of cutting thru a cake. the caption talks about dead people.", "The word on the side of the cake shows the meaning."], "image": "val2014/COCO_val2014_000000205134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321235, "question_id": "H8ijXgZXgQ9bzvHNKEqVwD", "question": "What kind of fuel does the lamp use?", "choices": ["manure", "fossil", "wood", "solar"], "correct_choice_idx": 1, "direct_answers": ["oil", "oil", "oil", "kerosene", "oil", "kerosene", "kerosene", "fossil", "oil", "kerosene"], "difficult_direct_answer": false, "rationales": ["The lamp is powered by oil.", "The lamp is using gas.", "This is an old fashioned oil lamp"], "image": "train2014/COCO_train2014_000000321235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388740, "question_id": "H93mB9YCRQyDRMcw2BxmGS", "question": "What is this woman doing?", "choices": ["sewing", "folding napkins", "tearing", "plaiting"], "correct_choice_idx": 1, "direct_answers": ["sewing", "sewing", "folding napkins", "sewing", "sewing", "sewing", "sewing", "watching", "knitting", "folding"], "difficult_direct_answer": false, "rationales": ["The woman has a stack of napkins and is folding them.", "She is folding napkin before putting them away.", "The stack of folded cloth in front of her is the product of her work"], "image": "train2014/COCO_train2014_000000388740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572095, "question_id": "H9ENs69QJWGd4F54GZDU7E", "question": "What American state might this location be?", "choices": ["milwaukee", "deleware", "new york", "illinois"], "correct_choice_idx": 3, "direct_answers": ["chicago", "illinois", "chicago", "chicago", "chicago", "illinois", "illinois", "chicago", "illinois", "illinois"], "difficult_direct_answer": false, "rationales": ["There are signs for chicago in the background. milwaukee is a city, not a state.", "The restaurant sign in the back reads \"old chicago\" and chicago is in illinois.", "One of the signs has chicago on it"], "image": "val2014/COCO_val2014_000000572095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380343, "question_id": "H9EeDnJEbV9avALpppWaGg", "question": "Why are the bananas green?", "choices": ["stained", "ripe", "painted", "unripe"], "correct_choice_idx": 3, "direct_answers": ["unripe", "not ripe", "unripe", "unripe", "unripe", "not ripe", "unripe", "unripe", "not ripe", "not ripe"], "difficult_direct_answer": false, "rationales": ["When bananas are are ripe they are yellow. when bananas aren't ripe they are green.", "They are still growing", "When bananas aren't yet ripe, they're colored green so these bananas must be unripe because they're green."], "image": "val2014/COCO_val2014_000000380343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176873, "question_id": "H9JNPcGDadCpGGNn2cudLS", "question": "What is the lane painted green for?", "choices": ["minivans only", "pedestrians only", "bikes only", "keep out"], "correct_choice_idx": 2, "direct_answers": ["pedestrians", "bikes only", "bike safety", "crosswalk", "bikes", "crosswalk", "indicator", "bicycle", "bikes/pedestrians", "bikers"], "difficult_direct_answer": true, "rationales": ["The center line is painted green for bikes to cross.", "It's the bike lane for cyclists.", "The green painted lane also includes images of bicycles. this would indicate that it is reserved for bikes only."], "image": "train2014/COCO_train2014_000000176873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568492, "question_id": "H9PSnZLpZzWTA2z7XhA82S", "question": "What is in the yellow bottle by the sink?", "choices": ["dishwashing liquid", "wine", "olive oil", "soda pop"], "correct_choice_idx": 0, "direct_answers": ["soap", "dish soap", "soap", "sauce", "dish soap", "dishwashing liquid", "dish soap", "dish soap", "dishsoap", "dish soap"], "difficult_direct_answer": false, "rationales": ["The bottles look to contain liquid soap.", "The bottle has liquid for dishes.", "The dishwasher is used to clean the utensils."], "image": "train2014/COCO_train2014_000000568492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424434, "question_id": "H9QKYoU25oPdPUd7hz2siR", "question": "What is the person with the hat on sitting in?", "choices": ["quicksand", "sand box", "mud", "boat"], "correct_choice_idx": 3, "direct_answers": ["boat", "canoe", "boat", "boat", "boat", "boat", "canoe", "boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["The woman is on a canoe type boat.", "It is floating in the water next to the dock", "The person with the hat is sitting in a boat on the river making food."], "image": "train2014/COCO_train2014_000000424434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48632, "question_id": "H9k9L9zXSYB4nqjgarkDPv", "question": "What action caused the dust to fly?", "choices": ["bats", "waving arms", "sliding", "running"], "correct_choice_idx": 2, "direct_answers": ["slide", "player sliding", "player sliding", "sliding", "player sliding", "slide", "slide", "slide", "sliding", "sliding"], "difficult_direct_answer": false, "rationales": ["A baseball player is on the ground with another from the opposing team there as well.", "In baseball when running to the base, sometimes sliding can create speed, and when doing so, it lifts up the dirt on the ground, creating dust.", "A baseball players is on home plate and dust is in the air. sliding causes dust and sand to be stirred up."], "image": "train2014/COCO_train2014_000000048632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462755, "question_id": "H9mcoWoirNrMBfceKAA6Li", "question": "The kite on the left looks like what beast?", "choices": ["gorgon", "cyclops", "chimera", "phoenix"], "correct_choice_idx": 3, "direct_answers": ["dinosaur", "lizard", "phoenix", "orange iguana", "dragon", "dinosaur", "dragon", "dinosaur", "lizard", "dinosaur"], "difficult_direct_answer": false, "rationales": ["The kite looks like a large gecko or bird.", "A bird like kite is up in the air as a man stands on ground and guides it.", "The kite looks like a transformed bird that rises from the ashes."], "image": "val2014/COCO_val2014_000000462755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468321, "question_id": "H9wF9JVJgkCNQFipYRDGLs", "question": "What brand skirt she worn?", "choices": ["adidas", "nike", "asics", "puma"], "correct_choice_idx": 0, "direct_answers": ["adidas", "nike", "nike", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas"], "difficult_direct_answer": false, "rationales": ["There is an adidas logo on the skirt.", "The triangular logo on the skirt belongs to the brand named in option a.", "The logo is on the skirt and the logo is adidas"], "image": "train2014/COCO_train2014_000000468321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254001, "question_id": "HA5uZbB5XBdaudcCGE2L9Y", "question": "Which sense would be stimulated if one sat here?", "choices": ["seeing", "smell", "hearing", "taste"], "correct_choice_idx": 1, "direct_answers": ["smell", "smell", "smell", "smell", "smell", "smell", "smell", "smell", "smell", "smell"], "difficult_direct_answer": false, "rationales": ["One would be able to inhale the scent of the fresh flowers.", "These flowers are very fragrant.", "There are flowers on both sides of the white bench. they give off a certain fragrance."], "image": "val2014/COCO_val2014_000000254001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505818, "question_id": "HA8ePGYG3rr2nQ7AsiDkK8", "question": "Person wearing what color of shirt is playing game with the woman in black?", "choices": ["green", "red", "black", "pink"], "correct_choice_idx": 2, "direct_answers": ["wii", "green", "green", "black", "green striped", "black", "black", "sweat", "black", "black"], "difficult_direct_answer": false, "rationales": ["He is the one holding a game controller.", "Two people wearing black shirts are holding video game controllers.", "The shirt is black."], "image": "val2014/COCO_val2014_000000505818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497464, "question_id": "HAYEx4cDMtzJVdLRJd7jJ3", "question": "How many types of surfboards are there?", "choices": ["five", "nine", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["three", "one", "one", "three", "three", "one", "one", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three surfboards for each person shown.", "You can tell by the designs and colors as to how many types of boards there are.", "There are three different surfboards on the sand."], "image": "train2014/COCO_train2014_000000497464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250749, "question_id": "HAYGrF3kwCVj9VfkNo7nh5", "question": "When did she get married?", "choices": ["that day", "next year", "next day", "next week"], "correct_choice_idx": 0, "direct_answers": ["today", "today", "today", "just married", "today", "today", "that day", "today", "just married", "today"], "difficult_direct_answer": false, "rationales": ["The woman appears to be cutting a wedding cake while in a wedding dress. these two things are usually done on the day of the wedding and no other day.", "The woman is cutting a wedding cake, an action performed on the time event listed in a.", "Cutting the cake is a wedding day tradition."], "image": "train2014/COCO_train2014_000000250749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53142, "question_id": "HAb4M5fzBunxRjZGk3XYbu", "question": "What type of lighting technology is present within the traffic light?", "choices": ["incandescent", "led", "halogen", "fluorescent"], "correct_choice_idx": 1, "direct_answers": ["electric lights", "traffic light", "electric", "led", "led", "darkness", "electric", "led", "led", "led"], "difficult_direct_answer": false, "rationales": ["The lighting is led.", "Traffic lights are known to employ led lighting in order to function, so that would be the kind of lighting technology that is in use here.", "Led light is on the traffic light."], "image": "train2014/COCO_train2014_000000053142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449004, "question_id": "HAc2zG4hDKyHP3ETEWCgDk", "question": "Who is the man in black wearing a hat on the left?", "choices": ["fire marshal", "mail man", "truck driver", "police"], "correct_choice_idx": 3, "direct_answers": ["cop", "sheriff", "mounty", "rain hat", "police", "sheriff", "police", "policeman", "police", "law enforcement"], "difficult_direct_answer": false, "rationales": ["That is a state trooper", "The style of hat with uniform and visible handcuffs allows us to identify the man in black as a state trooper.", "The man is wearing a highway patrol hat."], "image": "train2014/COCO_train2014_000000449004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480587, "question_id": "HAog4acg2NwzwodGeTbnDm", "question": "What type of area is shown?", "choices": ["coastal", "rural", "urban", "forest"], "correct_choice_idx": 2, "direct_answers": ["street", "market", "bac thai", "business street", "street", "urban", "city", "street", "urban", "roadway"], "difficult_direct_answer": false, "rationales": ["This is a more crowded area.", "Urban places have many cyclist.", "There are many people together in a city."], "image": "train2014/COCO_train2014_000000480587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554326, "question_id": "HApDNDxtjHuQwqBeMa4Ghv", "question": "What are bricks mostly made of?", "choices": ["straw", "sand", "clay", "pebbles"], "correct_choice_idx": 2, "direct_answers": ["mud", "clay", "clay", "clay", "clay", "sand", "clay", "clay", "mud", "clay"], "difficult_direct_answer": false, "rationales": ["The bricks are made of clay.", "Bricks are hard and molded into their shape, and clay is easily moldable and hardens when dried.", "Even the color of these bricks can tell you the base element, the reddish, orangish hues are indicative of clay."], "image": "train2014/COCO_train2014_000000554326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188405, "question_id": "HArJVss5Yg3vHcmWWVxjiB", "question": "Where will people located here sleep tonight?", "choices": ["no where", "tents", "limos", "duplexes"], "correct_choice_idx": 1, "direct_answers": ["tents", "tents", "tents", "tent", "tent", "tents", "tent", "tents", "tents", "in tents"], "difficult_direct_answer": false, "rationales": ["We see a multitude of tents set up in the background. we can conclude many people at this event will be staying overnight in them.", "These people are going to sleep in the tents out back.", "There are several set up in the background"], "image": "train2014/COCO_train2014_000000188405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162156, "question_id": "HAw6cVBxz8cWxUqrDnGZz2", "question": "What privacy violation is missing from the bathroom?", "choices": ["pillow", "bath", "door", "toilet"], "correct_choice_idx": 2, "direct_answers": ["no door", "door", "toilet door", "door", "door", "door", "door", "door", "door", "camera"], "difficult_direct_answer": false, "rationales": ["There isn't a door in front of the toilet.", "There seems to be no door to the bathroom.", "There is no barrier separating the bathroom from the rest of the room."], "image": "val2014/COCO_val2014_000000162156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69809, "question_id": "HAwdP5LWTcX7Ra8S5tWAJH", "question": "What are they doing?", "choices": ["claning house", "stealing food", "preparing food", "eating food"], "correct_choice_idx": 2, "direct_answers": ["eating", "eating", "preparing food", "preparing food", "eating", "eating", "eating", "eating", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["The people prepare food.", "We can see a variety of ingredients laid out and these men applying them to food items.", "The table is the location of several food items, as well as bottles and cans of various products, and the people are holding food and one is buttering a roll, so taking all these things account leads to the conclusion that they are preparing food to eat."], "image": "train2014/COCO_train2014_000000069809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37169, "question_id": "HB8KDSDosFaoYqjJdTiogC", "question": "What season is it in the image?", "choices": ["spring", "summer", "spring-summer", "fall-winter"], "correct_choice_idx": 3, "direct_answers": ["winter", "winter", "winter", "fall", "fall", "fall-winter", "spring", "winter", "fall", "winter"], "difficult_direct_answer": false, "rationales": ["The trees are bare. people are wearing long sleeved shirts.", "There are no leaves on the tress and people are wearing jackets.", "There are no leaves on the trees."], "image": "train2014/COCO_train2014_000000037169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149849, "question_id": "HBATc4qDweY3j2ETDCvghF", "question": "What is being hauled on the right?", "choices": ["sofa", "nightstand", "recliner", "mattress"], "correct_choice_idx": 3, "direct_answers": ["mattress", "mattress", "mattress", "mattress", "mattress", "mattress", "matress", "mattress", "mattress", "bed"], "difficult_direct_answer": false, "rationales": ["There is a bed frame on the left. the rectangular object on the right goes on the bed frame.", "The other options don't appear in this image.", "The cart on the right has a large mattress laid over it."], "image": "train2014/COCO_train2014_000000149849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434873, "question_id": "HBPEQ3ApfNhSR7ox4TBnaK", "question": "What is the man ready to do with the ball?", "choices": ["dribble", "serve", "juggle", "dunk"], "correct_choice_idx": 1, "direct_answers": ["kick", "football", "kick", "kick it", "kick", "serve", "play volleyball", "pickup volleyball", "pick up", "kick it"], "difficult_direct_answer": false, "rationales": ["The man wants to serve.", "You can tell by the setting that most likely he is playing volleyball.", "Hes at the back of the court on a volleyball field where one would serve."], "image": "val2014/COCO_val2014_000000434873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554854, "question_id": "HC8Px3VKPjTg5MB9YJr7K7", "question": "What drug is this man ingesting?", "choices": ["marijuana", "cocaine", "mdma", "alcohol"], "correct_choice_idx": 3, "direct_answers": ["alcohol", "alka-seltzer", "alcohol", "alcohol", "beer", "alcohol", "alcohol", "drinking", "alcohol", "beer"], "difficult_direct_answer": false, "rationales": ["The drink is golden and transparent.", "He is drinking a beer.", "The glass holds a light amber color liquid with foam on the top of the liquid."], "image": "train2014/COCO_train2014_000000554854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73174, "question_id": "HCFfmdwEMa4xJcr5yFUDNb", "question": "What is the likeliness this dog is enjoying being groomed?", "choices": ["low", "very low", "high", "very high"], "correct_choice_idx": 1, "direct_answers": ["possibly", "not likely", "liking", "low", "very low", "low", "makeup", "5%", "not likely", "unlikely"], "difficult_direct_answer": false, "rationales": ["The dog has a stressed look on their face.", "It looks scared and sad.", "The dog does not look happy."], "image": "train2014/COCO_train2014_000000073174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296731, "question_id": "HCHuvevyFghVxSdwUmg6qx", "question": "Where are the players going?", "choices": ["downhill", "uphill", "right", "left"], "correct_choice_idx": 0, "direct_answers": ["downhill", "downhill", "sking", "downhill", "sking", "sking", "downhill", "downhill", "downhill", "downhill"], "difficult_direct_answer": false, "rationales": ["They are in the air at the top of a hill so when they land they will be going down the other side of the hill.", "The players are skiing from the top of the ski slope down the hill.", "Based on the way they are leaning, they are moving downward."], "image": "val2014/COCO_val2014_000000296731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507444, "question_id": "HCM5wQ8PgmNnKHW6BQbXmY", "question": "Why is he sitting on a bike?", "choices": ["powering blender", "racing someone", "going somewhere", "burning calories"], "correct_choice_idx": 0, "direct_answers": ["exercise", "for seat", "powering blender", "powering blender", "use blender", "power juicer", "as chair", "for seating", "it's chair", "stopped"], "difficult_direct_answer": true, "rationales": ["He is using an electricity generating bike to power the appliance.", "He is pedaling the bike and holding the top of the appliance.", "This bicyclist as well as at least one other has strangely parked his vehicle in front of a tent. in the test there is a table which has a blender on it. there appears to be some type of connection between this blender and the power it is getting from the bike."], "image": "train2014/COCO_train2014_000000507444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408537, "question_id": "HCZJiKxzPn8UpUJZj4J8eY", "question": "If you need to leave your car for a while and need to go down the narrow street ahead what should you do?", "choices": ["go right", "just leave", "street parking", "turn left"], "correct_choice_idx": 3, "direct_answers": ["park", "park it", "park it", "turn left", "park", "park", "park it", "park", "park", "park"], "difficult_direct_answer": false, "rationales": ["Head where the sign directs you so you can walk down the street", "The car can go left.", "The sign says that parking is available to the left."], "image": "train2014/COCO_train2014_000000408537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456113, "question_id": "HCeHjCxRB7k3Y2VhQaPUKP", "question": "What number president was the man on the cover of the magazine?", "choices": ["12", "66", "44", "31"], "correct_choice_idx": 2, "direct_answers": ["44", "44th", "forty four", "forty-forth", "44", "fourty four", "44", "44", "44", "46th"], "difficult_direct_answer": false, "rationales": ["The man is barack obama, who sometimes goes by \"44\", as he is known as the 44th president of the usa.", "Barack obama was president after the 43rd president.", "Barrack obama is on a magazine. obama was the 44th president."], "image": "train2014/COCO_train2014_000000456113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451550, "question_id": "HCwjZ8q6K7MPPinqrepzuC", "question": "What's the maximum speed that a car's speedometer can read in this area?", "choices": ["20", "15", "60", "35"], "correct_choice_idx": 0, "direct_answers": ["maximum", "20", "twenty mph", "twenty", "20", "twenty five", "twenty", "20", "can't see", "twenty"], "difficult_direct_answer": false, "rationales": ["The highway sign reads 20 mph.", "The speed is 20.", "The car's speed limit is twenty."], "image": "train2014/COCO_train2014_000000451550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273653, "question_id": "HDR3qNw22SUU9qBmS7M7bK", "question": "What kind of team is this celebrating?", "choices": ["nfl football", "mlb baseball", "nhl hockey", "nba basketball"], "correct_choice_idx": 2, "direct_answers": ["hockey", "hockey", "hockey", "hockey", "hockey team", "hockey team", "hockey team", "hockey", "nhl hockey", "hockey team"], "difficult_direct_answer": false, "rationales": ["Stanley cup is for hockey.", "The parade is celebrating the la kings winning the 2012 stanley cup championship. they play the sport that takes place on ice.", "The side of the bus is a logo for a nhl team located in california."], "image": "train2014/COCO_train2014_000000273653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153834, "question_id": "HDbDoKceqTra7kqwqLzggX", "question": "The cow belongs to which genus?", "choices": ["bovinae", "bovidae", "bos", "cattle"], "correct_choice_idx": 2, "direct_answers": ["bos", "bovine", "bos", "bovines", "bos", "bovine", "bos", "bos", "bovine", "bovine"], "difficult_direct_answer": false, "rationales": ["The cow belongs to the genus bos.", "Cows are bovine animals.", "The cow is a bovidae."], "image": "val2014/COCO_val2014_000000153834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416308, "question_id": "HDenfEx38F6dPbKSmU8e5y", "question": "The flying objects are part of what sport?", "choices": ["chess", "kite jumping", "snow skiing", "parasailing"], "correct_choice_idx": 3, "direct_answers": ["kiting", "wakeboard parasailing", "kite", "parasailing", "flying kites", "stunt kite", "kite flying", "kiting", "kite flying", "parasailing"], "difficult_direct_answer": false, "rationales": ["These are similar to what this sport uses", "They are used for gliding in the sky.", "Most of these kites are positioned over the water, in this sport kites are used to propel riders through the water using wind and force. you can also see a man strapped into the kites harness on the left side of the photo, this is a safety measure and helps the rider give direction to the kite."], "image": "val2014/COCO_val2014_000000416308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570019, "question_id": "HE5DdSJ5GTS5gAQHsKUA53", "question": "What is the vessel called that holds the most amount of beer on the table?", "choices": ["stein", "pitcher", "mug", "keg"], "correct_choice_idx": 1, "direct_answers": ["mug", "mug", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "mug", "pitcher", "glass"], "difficult_direct_answer": false, "rationales": ["The vessel that's largest is the pitcher.", "The vessel is the pitcher.", "This container is used to pour beer into a drinking glass."], "image": "train2014/COCO_train2014_000000570019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250162, "question_id": "HE9x3SXtvpnByfRWmfapEq", "question": "What will the man have to do to catch the Frisbee coming at him?", "choices": ["lift hands", "turn around", "lay down", "jump up"], "correct_choice_idx": 0, "direct_answers": ["jump", "raise arm", "raise hand", "lift hands", "jump", "jump up", "jump", "raise hands", "reach", "jump"], "difficult_direct_answer": false, "rationales": ["A frisbee is a fast spinning item, and a human can only accurately and safely catch it with their hands positioned in the air.", "The frisbee is up high so that's what he's have to do to catch it.", "Frisbees are caught with hands."], "image": "train2014/COCO_train2014_000000250162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367905, "question_id": "HECZeY9wsY3SqymdoXT9LA", "question": "What dangerous event might occur?", "choices": ["frowning", "drowning", "vomiting", "crying"], "correct_choice_idx": 1, "direct_answers": ["drowning", "drowning", "drowning", "drowning", "drowning", "drowning", "drowning", "drowning", "drowning", "drowning"], "difficult_direct_answer": false, "rationales": ["One might drown when submerged under this wave.", "The surfer is about to get hit by a crashing wave and could drown.", "If they fall deep enough into the water, their is a chance of them drowning."], "image": "val2014/COCO_val2014_000000367905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42667, "question_id": "HEMC9pJ4oJkmua73EEK4VQ", "question": "What nationality is the young girl?", "choices": ["mexican", "asian", "egyptian", "indian"], "correct_choice_idx": 3, "direct_answers": ["indian", "hispanic", "indian", "indian", "mexican", "indian", "indian", "indian", "indian", "indian"], "difficult_direct_answer": false, "rationales": ["The girl has a dot on her head which is common in india.", "The nationality is indian.", "The coloring and hair type of the little girl indicates that she is most likely from india."], "image": "val2014/COCO_val2014_000000042667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335992, "question_id": "HEVJvdah6xLfv3bWKHVTLA", "question": "The pitcher that is covered here contains what?", "choices": ["ice cream", "vegetable juice", "pistachios", "milk shake"], "correct_choice_idx": 1, "direct_answers": ["veggies", "vegetable juice", "salsa", "green juice", "fresh produce", "fruit shake", "vegetable juice", "smoothie", "veggie juice", "juice"], "difficult_direct_answer": true, "rationales": ["The pitcher has green juice in it.", "The pitcher contains vegetable juice.", "The pitcher has veggie juice."], "image": "val2014/COCO_val2014_000000335992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452005, "question_id": "HEeWfPPkisW6X8mZndTDkH", "question": "What substance is about to be poured into the construction area?", "choices": ["gravel", "cement", "water", "sand"], "correct_choice_idx": 1, "direct_answers": ["concrete", "cement", "concrete", "concrete", "cement", "concrete", "plain", "concrete", "concrete", "cement"], "difficult_direct_answer": false, "rationales": ["The construction workers will pour cement into the area.", "The construction workers are building a foundation. a mixing truck is parked next to the workers.", "The mixer in the middle left part of the image indicates the likelihood of this happening. of course, a is made up of b, c and d."], "image": "train2014/COCO_train2014_000000452005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559270, "question_id": "HEy2D3yTrpbxP56rBcvAh8", "question": "Which one of these natural disasters might this car get caught in?", "choices": ["blizzard", "volcano", "monsoon", "earthquake"], "correct_choice_idx": 3, "direct_answers": ["flood", "hurricane", "flood", "tsunami", "hurricane", "typhoon", "earthquake", "earthquake", "tsunami", "tsunami"], "difficult_direct_answer": false, "rationales": ["California has plenty of earthquakes and california is a surf state.", "It indicates it is on a coastal area because of seafood and surfing", "The surfboard and sunny setting in this scene likely places it in the state of california. this state is subject to frequent tectonic disruption."], "image": "val2014/COCO_val2014_000000559270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72860, "question_id": "HFVKppLTJFtGWWnCzRhx5i", "question": "The boat here moves under what sort of power?", "choices": ["solar", "engine", "wind", "tow"], "correct_choice_idx": 1, "direct_answers": ["byounancy", "steam", "steam", "diesel", "fuel", "engine", "motor", "diesel", "steam", "steam"], "difficult_direct_answer": false, "rationales": ["The boat on the water has an engine that helps it move on the water.", "The boat is propelled by itself.", "Most boats run by engines. the boat is too big for wind or solar power."], "image": "val2014/COCO_val2014_000000072860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60370, "question_id": "HFXwjbqxvmCrFRdGxgqR6C", "question": "What can you use here if you want to withdraw money from your account?", "choices": ["teller service", "nothing", "atm", "drive-through service"], "correct_choice_idx": 2, "direct_answers": ["atm", "atm", "atm card", "atm", "atm card", "atm", "atm", "card", "atm", "atm"], "difficult_direct_answer": false, "rationales": ["There is an atm here to withdraw money when somebody wants it.", "The atm allows someone to take money out.", "Atms are used to withdraw money from your bank account."], "image": "train2014/COCO_train2014_000000060370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355569, "question_id": "HFdHjHsRygrap9MUxz4DX5", "question": "What religion is shared by the turbaned men?", "choices": ["sikh", "christian", "athiesm", "muslim"], "correct_choice_idx": 0, "direct_answers": ["sikh", "muslim", "sikh", "sikhs", "sikh", "sikh", "sikh", "hinduism", "sikh", "islam"], "difficult_direct_answer": false, "rationales": ["Muslims wear this head piece in their religion and culture.", "The men's religion is printed on the patch on the back of their vests.", "This is a common head covering for men of this religion"], "image": "val2014/COCO_val2014_000000355569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581466, "question_id": "HFioPgop68nLcyCZFGUS57", "question": "How much milk can a cow give per day?", "choices": ["10 gallons", "5 gallons", "8 gallons", "6 gallons"], "correct_choice_idx": 2, "direct_answers": ["seven gallons", "seven gallons", "six gallons", "6-7 gallons", "seven gallons", "seven gallons", "seven", "seven gallons", "8 gallons", "seven gallons"], "difficult_direct_answer": false, "rationales": ["Cows can give many gallons of milk a day.", "When calculated and averaged out in previous tests, it is widely concluded that the average cow can comfortably produce 8 gallons per day.", "The average cow gives 8 gallons of milk per day."], "image": "train2014/COCO_train2014_000000581466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494764, "question_id": "HFuHvB6tPeSbKEdcY37qUv", "question": "Who many normally enter this space?", "choices": ["most wealthy", "anyone", "poor", "only couples"], "correct_choice_idx": 1, "direct_answers": ["people", "two", "anyone", "two", "two", "two", "two", "couples", "people", "tourists"], "difficult_direct_answer": false, "rationales": ["The bench is typically as a public park, and anyone who is there for exercise or to enjoy the scenery is welcome to sit there.", "Anyone could enter the public park.", "This outdoor bench is located by an oceanfront and therefore accessible to all."], "image": "train2014/COCO_train2014_000000494764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368717, "question_id": "HFwdjRtgnkxqaV6pwrjGXe", "question": "What are the metal arches used for?", "choices": ["support", "tradition", "light", "style"], "correct_choice_idx": 0, "direct_answers": ["height", "hold ceiling", "safety beam", "roof support", "train station", "roofing supports", "ceiling", "block elements", "support", "cover"], "difficult_direct_answer": true, "rationales": ["The arches are for support.", "The metal arches are beams that are used to hold up the ceiling in the train station.", "The metal beams are used to keep the ceiling up."], "image": "train2014/COCO_train2014_000000368717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574034, "question_id": "HG6iepVyU66kQdKjLDETfB", "question": "What province is this located?", "choices": ["ontario", "alberta", "pei", "bc"], "correct_choice_idx": 1, "direct_answers": ["alberta", "arizona", "alberta", "alberta", "japan", "edmonton", "america", "unknown", "edmonton", "nos"], "difficult_direct_answer": false, "rationales": ["The sign says the city edmonton, which is located in the province of alberta.", "They are in alberta", "The radio station is in edmonton, which is in this province."], "image": "train2014/COCO_train2014_000000574034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303520, "question_id": "HGChoGEtE2mBkoQhsVXDWJ", "question": "What sort of business innovation is being heralded here?", "choices": ["dancing", "manufacturing", "computer", "banking"], "correct_choice_idx": 2, "direct_answers": ["internet browser", "computer", "web browser", "fire fox", "internet", "browser", "new browser", "online website", "software launch", "internet explorer"], "difficult_direct_answer": true, "rationales": ["A cake is decorated and has a wish of congratulations for a computer program printed on the top in icing.", "There is a computer browser icon on the cake.", "This cake has the firefox logo on it."], "image": "val2014/COCO_val2014_000000303520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395957, "question_id": "HGfAPcfUxudQwCRFevsQJm", "question": "What is the man doing at the front of the train car?", "choices": ["painting", "stopping", "chaining", "driving"], "correct_choice_idx": 3, "direct_answers": ["driving it", "conducting", "driving", "driving train", "driving", "driving train", "conducting", "driving", "driving", "engineer"], "difficult_direct_answer": false, "rationales": ["A man in a uniform is standing at the beginning of a train. a conductor drives the train.", "The man in the front of the train car is the conductor that drives the train.", "The man is wearing a conductor's uniform. a conductor sitting at the front of the train is known to be responsible for driving the train."], "image": "train2014/COCO_train2014_000000395957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187199, "question_id": "HGs8CsNrLqgnASFvuhrPXX", "question": "What is on the bed?", "choices": ["pet", "woman", "man", "single rose"], "correct_choice_idx": 0, "direct_answers": ["pet", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The pet is on the bed.", "One can see the outline and fur of the animal.", "A dog is on the bed."], "image": "val2014/COCO_val2014_000000187199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387567, "question_id": "HGuk2narrZjsVsQkK5d8NG", "question": "What other utensil usually goes alongside the one shown?", "choices": ["axe", "spatula", "fork", "ladle"], "correct_choice_idx": 2, "direct_answers": ["fork", "fork", "fork", "fork", "fork", "spoon", "pizza cutter", "fork", "fork", "fork"], "difficult_direct_answer": false, "rationales": ["The knife is used to cut food into smaller pieces and a fork is used to put those smaller pieces into a mouth.", "The item shown is a knife that would frequently be used with a fork when eating.", "There is a fork which usually goes with the knife."], "image": "val2014/COCO_val2014_000000387567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260608, "question_id": "HHBKdSmgTpvDapQM5tJH7g", "question": "What is the aim or end hope area for the girls moving the ball where do they want it to go?", "choices": ["hole", "goal net", "inside basket", "end zone"], "correct_choice_idx": 1, "direct_answers": ["goal net", "net", "net", "hit goal", "goal", "goal net", "netted goal", "goal", "goal", "upfield"], "difficult_direct_answer": false, "rationales": ["The aim is to go in the goal.", "In soccer when someone is trying to score they kick the ball into the net.", "The girls are aiming to get the ball into the goal."], "image": "val2014/COCO_val2014_000000260608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502679, "question_id": "HHCuZhXcsvGTkek3sR3GHj", "question": "What kind of surfboard it is?", "choices": ["fish", "spin", "short board", "fun"], "correct_choice_idx": 2, "direct_answers": ["white", "short board", "noe", "short board", "noe", "noe", "surfing board", "boogy board", "boogy board", "noe"], "difficult_direct_answer": false, "rationales": ["The board is miniature sized.", "The board is very short.", "Its not as long as a normal size board."], "image": "train2014/COCO_train2014_000000502679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187240, "question_id": "HHMhkYuHzqoyjpeZoS5gq4", "question": "Why is the red vehicle stopped here?", "choices": ["protest", "sales trick", "boarding passengers", "accident"], "correct_choice_idx": 2, "direct_answers": ["board passengers", "boarding", "getting people", "boarding passengers", "unloading", "loading", "mechanical trouble", "emergency", "for repair", "passenger pickup"], "difficult_direct_answer": true, "rationales": ["There is a sign on the side of the road and partial writing can be seen in the road lane stating that this is a bus stop.", "The bus is in the process of boarding passengers who are on the sidewalk.", "When a bus is stopped, it is to pick up passengers."], "image": "val2014/COCO_val2014_000000187240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226594, "question_id": "HHXD9rfMgxd58btzUTJDKG", "question": "What color jacket is the leftmost person wearing?", "choices": ["black", "purple", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["blue", "navy blue", "blue", "navy blue", "blue", "blue", "navy blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["It is lighter than the black pants", "The person to the left is wearing a blue jacket.", "As long as you are not colorblind you can tell the color."], "image": "train2014/COCO_train2014_000000226594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549928, "question_id": "HHibnqsK8hX86w7RG4yXty", "question": "Who was a famous version of this animal?", "choices": ["benji", "garfield", "robin hood", "dumbo"], "correct_choice_idx": 3, "direct_answers": ["dumbo", "dumbo", "dumbo", "dumbo", "jumbo", "dumbo", "dumbo", "dumbo", "jumbo", "dumbo"], "difficult_direct_answer": false, "rationales": ["This is the only one of the fictional animals that is also an elephant.", "The famous one is dumbo.", "The animal is an elephant, not a horse, dog, or cat."], "image": "train2014/COCO_train2014_000000549928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68594, "question_id": "HHmEaECuL7ERH4BxHLART4", "question": "Which vegetable contains the most vitamin A?", "choices": ["green bean", "carrot", "beet", "cauliflower"], "correct_choice_idx": 1, "direct_answers": ["carrot", "carrots", "cauliflower", "carrot", "kales", "carrots", "carrots", "cauliflower", "collards", "carrots"], "difficult_direct_answer": false, "rationales": ["The carrots are believing to have vitamins with large quantities.", "Carrots are the vegetable most associated with vitamin a present at this outdoor market.", "Carrots have a lot of vitamin a in them."], "image": "train2014/COCO_train2014_000000068594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511622, "question_id": "HHw7Tz3FT8VyKdDh9m458N", "question": "What's the name of the large pan the woman is using?", "choices": ["wok", "wip", "wik", "wak"], "correct_choice_idx": 0, "direct_answers": ["wok", "wok", "wok", "wok", "wok", "wok", "wok", "wok", "wok", "wok"], "difficult_direct_answer": false, "rationales": ["It is a rounded pan used in asian cooking", "That is a wok you can use on a burner on the stove.", "The woman is using a wok."], "image": "val2014/COCO_val2014_000000511622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270202, "question_id": "HJD2YRnF2pN9C8g8AS8zAn", "question": "How is the person that is standing likely related to the person shown eating?", "choices": ["father", "grandfather", "grandmother", "mother"], "correct_choice_idx": 2, "direct_answers": ["grandmother", "mother", "grandmother", "grandmother", "grandmother", "grandparent", "grandparent", "grandma", "mother", "grandmother"], "difficult_direct_answer": false, "rationales": ["The young girl looks remarkably like the older lady.", "The person standing is likely the girl's grandma.", "The person standing near this child eating is the grandmother."], "image": "train2014/COCO_train2014_000000270202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107389, "question_id": "HJDFacSt8fi2KKQ8LKms7w", "question": "What is the person doing?", "choices": ["returning", "star gazing", "sun spotting", "serving"], "correct_choice_idx": 3, "direct_answers": ["serving ball", "playing tennis", "playing tennis", "serving ball", "playing tennis", "playing tennis", "serving ball", "serving tennis", "playing tennis", "serving"], "difficult_direct_answer": false, "rationales": ["The person is hitting the ball.", "The ball is in the air directly above her as she swings the racket behind her", "The person is serving."], "image": "val2014/COCO_val2014_000000107389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526904, "question_id": "HJEsNLRfiC3JNHmCvyt8Fd", "question": "What is the man using to control the grey device?", "choices": ["desktop", "phone", "laptop", "remote"], "correct_choice_idx": 2, "direct_answers": ["laptop", "sound", "remote", "knobs", "knobs", "knob", "knobs", "fingers", "hands", "controller"], "difficult_direct_answer": false, "rationales": ["The grey device is plugged in to the laptop.", "The portable computer is attached by wire to the grey device this man is using.", "The man uses a laptop."], "image": "val2014/COCO_val2014_000000526904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199783, "question_id": "HJG8DFU6caTfGieibWMTjx", "question": "How many people can this room accommodate?", "choices": ["two", "one", "six", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The beds are small enough for one person each.", "There are two twins beds, each of which accommodates one person.", "There are two beds in the room."], "image": "val2014/COCO_val2014_000000199783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360110, "question_id": "HJRb24Mpe3aKMuYUMgpNSD", "question": "What season is it here?", "choices": ["st. patricks", "ground hogs", "christmas", "easter"], "correct_choice_idx": 2, "direct_answers": ["christmas", "christmas", "christmas", "christmas", "winter", "christmas", "christmas", "christmas", "christmas", "winter"], "difficult_direct_answer": false, "rationales": ["The man on this bus or train wears a hat associated with the christmas season.", "The season is christmas.", "This man is wearing a santa hat."], "image": "train2014/COCO_train2014_000000360110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421536, "question_id": "HJRgFzaxZtZdY9wrhmjkBt", "question": "What group of people are specially accommodated in the bus?", "choices": ["elderly people", "handicapped people", "babies", "pregnant women"], "correct_choice_idx": 1, "direct_answers": ["handicapped", "handicap", "travellers", "old people", "working class", "handicapped people", "handicapped", "disabled", "passengers", "handicap"], "difficult_direct_answer": false, "rationales": ["The bus has a handicap sign on it.", "There is a handicapped access sticker so they are able to board this bus.", "There is a wheelchair decal."], "image": "train2014/COCO_train2014_000000421536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50305, "question_id": "HJd6CNcmGwELSE5TBaLkYU", "question": "Where are these boys on the way to or from?", "choices": ["kitchen", "car wash", "junk yard", "school"], "correct_choice_idx": 3, "direct_answers": ["school", "school", "school", "school", "school", "school", "school", "school", "school", "school"], "difficult_direct_answer": false, "rationales": ["The boys are wearing uniforms and are carrying backpacks.", "The boys are going to school.", "The boys look to be wearing school uniforms since they all match."], "image": "train2014/COCO_train2014_000000050305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171950, "question_id": "HKAHLMoutDwuXcYVTwBd7J", "question": "What stadium is this game taking place in?", "choices": ["mets", "astros", "yankee", "cubs"], "correct_choice_idx": 2, "direct_answers": ["yankee", "yankee stadium", "yankee", "yankee stadium", "yankee", "yankee", "yankee", "yankee stadium", "yankees", "yankee stadium"], "difficult_direct_answer": false, "rationales": ["The stadium is the yankees.", "The stadium has the logo for the new york yankees painted on the grass.", "One can see the famous ny logo painted on the field."], "image": "train2014/COCO_train2014_000000171950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8794, "question_id": "HKAzuYjjYf8VBsR5HyQdqk", "question": "What kind of hat is the man wearing?", "choices": ["ball cap", "fedora", "sunhat", "boater"], "correct_choice_idx": 3, "direct_answers": ["safari", "straw hat", "panama", "straw hat", "top", "boater", "boater", "straw", "bowler", "bowler"], "difficult_direct_answer": false, "rationales": ["The man is wearing a circular hat with a wide brim.", "The man is wearing a boater top hat.", "The man is a boater."], "image": "train2014/COCO_train2014_000000008794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450991, "question_id": "HKBcFV5PTNoDtRkNfpYsK2", "question": "What is the outcome if the ball went over the fence?", "choices": ["hit", "walk", "out", "home run"], "correct_choice_idx": 3, "direct_answers": ["home run", "free run", "homerun", "homerun", "homerun", "homerun", "homerun", "homerun", "home run", "homerun"], "difficult_direct_answer": false, "rationales": ["The outcome is a home run.", "This is the term used for when the ball is hit out of the park.", "The player is playing baseball based on their uniform, the fence in the background and the field. in baseball a ball over the fence would be a homerun."], "image": "train2014/COCO_train2014_000000450991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557084, "question_id": "HKKssX7fzg25J5pA9VWzQa", "question": "What kind of advertisement is the one on the side of the bus?", "choices": ["health plan", "childcare", "feminine hygiene", "housing"], "correct_choice_idx": 0, "direct_answers": ["family deserves", "health plan", "health insurance", "medical", "healthcare", "health insurance", "medicaid", "medical insurance", "health care", "medicaid health"], "difficult_direct_answer": true, "rationales": ["The advertisement is for a version of medicaid offered by va premier.", "A blue banner is currently on the side of a bus. it is advertising medicaid part of your health.", "The sign has the words health plan on it which match the choice."], "image": "train2014/COCO_train2014_000000557084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428231, "question_id": "HKRDKAG8hLXHUUn6uhRGUR", "question": "What will be visible on the rug if the coffee table is moved?", "choices": ["tears", "paint", "splinters", "indentations"], "correct_choice_idx": 3, "direct_answers": ["stains", "stains", "leg marks", "table marks", "indentations", "leg inprints", "spots", "carpet", "indentation marks", "rug"], "difficult_direct_answer": true, "rationales": ["There will be marks from where the table was.", "Indentations will be visible from the weight of the coffee table legs.", "Heavy furniture leaves marks when it is moved."], "image": "val2014/COCO_val2014_000000428231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29642, "question_id": "HKRavr83pSmV99ATCuiaio", "question": "Where is this person working?", "choices": ["office", "library", "bedroom", "school"], "correct_choice_idx": 2, "direct_answers": ["on laptop", "bed", "hotel", "bedroom", "bed", "bedroom", "bedroom", "bedroom", "bedroom", "bedroom"], "difficult_direct_answer": false, "rationales": ["The person is in a bedroom.", "The person is on a bed", "One of the most prominent item in a bedroom is a bed. this person is currently lying on her bed while working on laptop."], "image": "train2014/COCO_train2014_000000029642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568405, "question_id": "HKYPewz7hyebXGDp4RaEJj", "question": "In which country can you find this train?", "choices": ["france", "netherlands", "germany", "italy"], "correct_choice_idx": 1, "direct_answers": ["netherlands", "amsterdam", "netherlands", "netherlands", "germany", "germany", "germany", "germany", "netherlands", "germany"], "difficult_direct_answer": false, "rationales": ["This train has writing in dutch on the side.", "The blue writing on the side of the train is in dutch, not german, french, or italian.", "Life is not for everyone is on the the side of the train in a foreign language."], "image": "train2014/COCO_train2014_000000568405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489617, "question_id": "HKueu4Y2Hn4q4tcZD6UWSz", "question": "How did the skateboarder get so high in the air?", "choices": ["vaulted", "ramp", "trampoline", "spring board"], "correct_choice_idx": 1, "direct_answers": ["momentum", "ramp", "jumped", "ramp", "ramped", "jumped ramp", "jumped", "jumped", "effort", "jumping high"], "difficult_direct_answer": false, "rationales": ["The skater rode off an elevated platform", "There is a ramp directly behind the boy, indicating that he used it to propel himself into the air.", "The skateboarder used the ramp."], "image": "train2014/COCO_train2014_000000489617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258442, "question_id": "HKv8PJNEYp3h93KpNtTw4w", "question": "What birds roost here?", "choices": ["none", "gulls", "chickens", "pigeon"], "correct_choice_idx": 3, "direct_answers": ["pigeon", "pigeons", "roof", "pigeons", "pigeons", "crows", "cock", "pigeons", "pigeons", "pigeons"], "difficult_direct_answer": false, "rationales": ["Most birds in urban areas are of the pigeon variety.", "There are birds on the building. they are too small to be chickens or gulls.", "These are the most common birds found in cities."], "image": "train2014/COCO_train2014_000000258442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407400, "question_id": "HKynCFj4G2E5m8xddiUugE", "question": "What is the man doing?", "choices": ["reading", "sleeping", "drinking", "eating"], "correct_choice_idx": 0, "direct_answers": ["reading", "reading", "reading", "reading", "reading", "reading", "laying down", "reading", "reading", "reading"], "difficult_direct_answer": false, "rationales": ["The man is enjoying a book.", "The man on the bench is holding a book that he is reading.", "The man has a book out in front of him."], "image": "train2014/COCO_train2014_000000407400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237474, "question_id": "HLBHossQDkS3AysoNKmhoa", "question": "Why is the person on the right of the elephants holding a bucket?", "choices": ["catch spit", "throw mud", "help wash", "catch poop"], "correct_choice_idx": 2, "direct_answers": ["for water", "bathing them", "food temptation", "water", "scoop water", "cleaning", "washing", "washing", "help wash", "cleaning them"], "difficult_direct_answer": true, "rationales": ["The person on the right of the elephants has a bucket for washing.", "He picks up water and throws it on the elephants", "Two men are standing around elephants in water. buckets can be used to scoop water."], "image": "train2014/COCO_train2014_000000237474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427555, "question_id": "HLpMKm8wWVfaKSAmEfM53B", "question": "What beverage are these ladies having?", "choices": ["whiskey", "beer", "water", "wine"], "correct_choice_idx": 2, "direct_answers": ["water", "wine", "water", "wine", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["Women are sitting at a table with a clear liquid in their glasses. water is often given at restaurants.", "It looks like a. given the girl's age, this is the most likely answer.", "The ladies are drinking water."], "image": "train2014/COCO_train2014_000000427555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310008, "question_id": "HLrkSGtZeGzdpFajaBgtbh", "question": "In which country is this sport most popular?", "choices": ["belgium", "new zealand", "france", "us"], "correct_choice_idx": 3, "direct_answers": ["usa", "usa", "united states", "japan", "usa", "us", "usa", "america", "america", "united states"], "difficult_direct_answer": false, "rationales": ["Baseball has a professional league in the us and many children grow up learning to play the sport.", "Baseball is more popular in america than anywhere else in the world.", "Baseball is an american pasttime."], "image": "val2014/COCO_val2014_000000310008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430428, "question_id": "HLuNEtr8LqFFa8dtUmyyDU", "question": "What is the man doing with his utensils?", "choices": ["playing", "cutting food", "wiping them", "cleaning them"], "correct_choice_idx": 1, "direct_answers": ["cutting food", "cutting", "slicing food", "cutting", "cutting food", "cutting", "cutting pizza", "cutting", "eating", "cutting"], "difficult_direct_answer": false, "rationales": ["This man holds his knife and fork in a position conducive to cutting it into smaller pieces for easier ingestion.", "The man is holding the knife and form over the food to cut it.", "The man is using a knife, and the pizza is too big to fit in his mouth."], "image": "train2014/COCO_train2014_000000430428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510430, "question_id": "HM3EM7F9GuHcfPsi77RQo3", "question": "What is near the tree?", "choices": ["elephant", "cat", "baby", "house"], "correct_choice_idx": 3, "direct_answers": ["sign", "signpost", "sign", "street signs", "fire hydrant", "sign", "house", "sign", "street sign", "building"], "difficult_direct_answer": false, "rationales": ["The building looks like a home.", "A large building with a roof and windows is buy a tall tree with long limbs.", "There is a building near the tree."], "image": "train2014/COCO_train2014_000000510430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423508, "question_id": "HMBGHqYFvXK7et4LWvaDej", "question": "The sculpture in front of the building is modeled after what common object found on a sidewalk?", "choices": ["fire hydrant", "newspaper box", "parking meter", "garbage can"], "correct_choice_idx": 0, "direct_answers": ["fire hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["It's made to look like a fire hydrant.", "It does look like one.", "The figure looks like something firefighters used."], "image": "val2014/COCO_val2014_000000423508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2742, "question_id": "HMPZhwNEDDJzsRHxBgy9xU", "question": "What type of cheese is generally used on this food?", "choices": ["mozzarella", "brie", "american", "cheddar"], "correct_choice_idx": 0, "direct_answers": ["mozzarella", "mozzarella", "pizza", "mozzarella", "mozzarella cheese", "mozzarella", "mozzarella", "mozzarella", "mozzarella", "pizza"], "difficult_direct_answer": false, "rationales": ["The pizza on the table is topped with the most popular cheese for pizzas which is mozzarella.", "Pizzas typically use this white cheese because it has an elastic texture and melts easily.", "That cheese is used on pizzas."], "image": "train2014/COCO_train2014_000000002742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198775, "question_id": "HMTowgc8DSCv5ufiFtE2FY", "question": "What is unique about the size of this parking spot?", "choices": ["direction", "overly large", "small size", "meter"], "correct_choice_idx": 2, "direct_answers": ["small size", "very small", "fits perfectly", "mini parking", "small vehicles", "very small", "mini compact", "tiny", "ultra compact", "small"], "difficult_direct_answer": true, "rationales": ["The parking spot is just big enough for a smart car. bigger cars would not fit.", "The car is a smart car but the parking spot fits it perfectly.", "The car is very small."], "image": "val2014/COCO_val2014_000000198775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42204, "question_id": "HMZ2xSCjhajxtAT2DapAV2", "question": "What job do those behind the various stands have?", "choices": ["computer programming", "sewing", "vendors", "turking"], "correct_choice_idx": 2, "direct_answers": ["vendors", "vendors", "clerks", "cashier", "sales people", "sales people", "vendors", "sales", "sellers", "merchant"], "difficult_direct_answer": false, "rationales": ["The vendors behind the stands sell food.", "The people behind the stands are selling items.", "These are common in many areas of the world along streets between buildings."], "image": "val2014/COCO_val2014_000000042204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433963, "question_id": "HMgvmbsjxf4FiJVbRX7yKN", "question": "What can you get for the night if you call 210-878-0034?", "choices": ["surprise", "quickie", "bus", "something unmentionable"], "correct_choice_idx": 2, "direct_answers": ["hotel", "help", "bus", "bus", "tourist bus", "bus", "good time", "good time", "ad space", "this bus"], "difficult_direct_answer": false, "rationales": ["The number if called promises a 'good time'. without any more specification of what this means we can assume this is something unsavory or salacious.", "That number is written on the back of the bus to advertise its services.", "A touristy-looking bus has this information printed on the back of it in large font."], "image": "val2014/COCO_val2014_000000433963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280560, "question_id": "HMv8JEJYwoVqh2fpqnE45R", "question": "What is the person practicing?", "choices": ["moves", "law", "medicine", "serve"], "correct_choice_idx": 3, "direct_answers": ["tennis", "tennis", "serve", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["The person is playing tennis and they are holding the ball in their hand getting ready to throw it up in the air to begin play.", "They are getting ready to serve the ball.", "The person is standing on a tennis court; this eliminates all the other choices."], "image": "val2014/COCO_val2014_000000280560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16486, "question_id": "HMwVSTp4a2GRPsrTWoGVGf", "question": "What are the men worrying about?", "choices": ["car accident", "earthquake", "landslide", "tornado"], "correct_choice_idx": 0, "direct_answers": ["broken car", "car accident", "tires", "tires", "flat tire", "car", "broken vehicle", "farming", "broken axle", "broken vehicle"], "difficult_direct_answer": false, "rationales": ["The vehicle has crashed.", "The truck has a lot of damage", "There is a big rut in the road which has caused damaged to the vehicle now missing its front wheels."], "image": "train2014/COCO_train2014_000000016486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134074, "question_id": "HN8Ce5ahaL5HTPR4bhuabn", "question": "What is between the trees?", "choices": ["apples", "children", "cars", "animals"], "correct_choice_idx": 3, "direct_answers": ["cows", "animals", "animals", "cows", "cattle", "cows", "cows", "cattle", "trunks", "animals"], "difficult_direct_answer": false, "rationales": ["There are cows in the pasture.", "Animals are between.", "Just by looking the answer is obvious."], "image": "val2014/COCO_val2014_000000134074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49913, "question_id": "HNLZiGfUrV7EUR9qS7GnBj", "question": "What type of sky is this?", "choices": ["clear", "sunny", "rain", "overcast"], "correct_choice_idx": 3, "direct_answers": ["cloudy", "cloudy", "cloudy", "threatening", "cloudy", "cloudy", "overcast", "cloudy stormy", "cloudy", "cloudy"], "difficult_direct_answer": false, "rationales": ["It's a cloudy day.", "It is somewhat overcast due to all of the clouds in the sky.", "There are many clouds visible in the background. when there are many clouds in the sky the weather can be said to be answer a."], "image": "train2014/COCO_train2014_000000049913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566049, "question_id": "HNRE4KrduCyh7y6VquDvk5", "question": "What might the bucket shown here be used for here?", "choices": ["building sandcastles", "frisbee tossing", "swimming", "carrying gifts"], "correct_choice_idx": 0, "direct_answers": ["sand", "carry shells", "collecting shells", "sandcastle building", "building sandcastles", "sand", "collecting seashells", "sand castle", "sand collection", "shells"], "difficult_direct_answer": true, "rationales": ["Buckets are used to stack sand.", "This bucket is in the hand of a young boy on the beach. this tool used by such a person in such a place is most likely to build small structures out of sand.", "The bucket is a good tool for putting sand into."], "image": "val2014/COCO_val2014_000000566049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94558, "question_id": "HNd2YUtpFus9QTH6CinaPP", "question": "What sport are the men playing?", "choices": ["rugby", "european handball", "ultimate frisbee", "disc golf"], "correct_choice_idx": 2, "direct_answers": ["frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "ultimate frisbee"], "difficult_direct_answer": false, "rationales": ["The man is holding a frisbee.", "You can tell by the setting and what they are doing you can tell what sport is portrayed.", "You can tell by how they are dressed and what the man is reaching for as to what they are doing."], "image": "train2014/COCO_train2014_000000094558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516697, "question_id": "HNdzPGgjmhqNo6au25VDjw", "question": "Why is the man wearing glasses?", "choices": ["fashion", "dress code", "block sunlight", "halloween costume"], "correct_choice_idx": 2, "direct_answers": ["too sunny", "block sunrays", "solar protection", "shade eyes", "block sunlight", "block sunlight", "block sunlight", "sun", "bright sun", "sun rays"], "difficult_direct_answer": false, "rationales": ["The man is blocking sunlight.", "Dark glasses most practical purpose is to mitigate the blinding effects of a bright day.", "The sun is bright on this day."], "image": "train2014/COCO_train2014_000000516697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16166, "question_id": "HNfGorYtSQwxrDExQrM46z", "question": "What material is the boat made of?", "choices": ["carbon", "wood", "metal", "plastic"], "correct_choice_idx": 3, "direct_answers": ["plastic", "rubber", "plastic", "plastic", "rubber", "rubber", "rubber", "plastic", "rubber", "rubber"], "difficult_direct_answer": false, "rationales": ["You can tell by the design and color as to what the boat is made of.", "An orange and black boat is on the water. boats are made of plastic sometimes.", "The boat is made out an inflatable plastic material."], "image": "train2014/COCO_train2014_000000016166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464923, "question_id": "HNoY2PsuKdd7JoRa3zFZsJ", "question": "What type of food is the animal on the bed classified as?", "choices": ["seafood", "beef", "meat", "legumes"], "correct_choice_idx": 0, "direct_answers": ["seafood", "seafood", "fish", "fish", "fish", "fish", "pescatarian", "fish", "fish", "seafood"], "difficult_direct_answer": false, "rationales": ["The other options don't match a salmon, which is a fish.", "Seafood generally includes fish.", "There is a fish on the bed and fish are considered seafood given that they are from the ocean."], "image": "train2014/COCO_train2014_000000464923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423001, "question_id": "HNpjNRRnyfGhgRhJ2xRJZq", "question": "What material is the red jacket made of?", "choices": ["cotton", "pic", "leather", "fleece"], "correct_choice_idx": 3, "direct_answers": ["nylon", "polyester", "fleece", "nylon", "cotton", "vinyl", "down", "woolen", "cloth", "polyester"], "difficult_direct_answer": false, "rationales": ["The jacket is being used bya skiier in the snow and fleece is a warm material.", "This is a warm synthetic fabric and good for cold areas", "It is an insulating fabric good for cold temperatures."], "image": "val2014/COCO_val2014_000000423001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175831, "question_id": "HPHpuBeTjJnDpBnWEhUZt5", "question": "From which floors balcony could someone get the most optimal view here?", "choices": ["second", "first", "fourth", "third"], "correct_choice_idx": 2, "direct_answers": ["second", "top", "bike", "top", "third", "fourth", "fourth", "top", "fourth", "top"], "difficult_direct_answer": false, "rationales": ["You can see the most on the 4th floor because it is highest up.", "Although there are only three balconies, the third one appears to be on the a floor.", "The higher the floor is the more you can see."], "image": "val2014/COCO_val2014_000000175831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360399, "question_id": "HPZViaKEfgD4dn7AGKGoLj", "question": "What is the cause of the visual aberration beneath the skateboard?", "choices": ["heat", "sound", "light", "wind"], "correct_choice_idx": 0, "direct_answers": ["road", "aperture setting", "jumping", "wear", "water", "stunt", "heat", "graffiti", "motion blur", "aperture setting"], "difficult_direct_answer": true, "rationales": ["Its used to soften the wheels.", "The cause is heat.", "Heat emanating from a concrete surface gives off this fuzzy appearance."], "image": "train2014/COCO_train2014_000000360399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301714, "question_id": "HPfZgAEZtGvXQDxrYZHJUa", "question": "These foods belong to what family?", "choices": ["meat", "vegetables", "dairy", "fish"], "correct_choice_idx": 1, "direct_answers": ["root vegetables", "vegetables", "vegetable", "roots", "vegetables", "root", "vegetables", "apiaceae", "root", "root vegetables"], "difficult_direct_answer": false, "rationales": ["The foods are veggies.", "These belong in the veggie family.", "These are different rooted vegetables that you can find at a grocery store."], "image": "train2014/COCO_train2014_000000301714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222251, "question_id": "HPzZRh37rvWBMUyQTLVs5w", "question": "The woman on the left has what above her nose?", "choices": ["jewelry", "glasses", "hat", "dirt"], "correct_choice_idx": 1, "direct_answers": ["glasses", "comfort", "glasses", "glasses", "glasses", "glasses", "glasses", "comfort", "glasses", "glasses"], "difficult_direct_answer": false, "rationales": ["The woman has glasses.", "She has glasses on so she can see to play the game.", "A woman is wearing glasses. glasses rest on the nose of the person wearing them."], "image": "train2014/COCO_train2014_000000222251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58800, "question_id": "HQ43H3ATUcaRqMP96HpifP", "question": "What trick is the man with his hand up doing?", "choices": ["front flip", "ollie", "tail whip", "back flip"], "correct_choice_idx": 1, "direct_answers": ["skateboarding", "flip", "jumping", "skating", "air trick", "skateboard", "ollie", "ollie", "scatting", "jump"], "difficult_direct_answer": true, "rationales": ["By the position of the skater in the air you can safely assume what trick he is attempting.", "That trick is known as an ollie.", "A man is riding his board in the air before landing."], "image": "val2014/COCO_val2014_000000058800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397658, "question_id": "HQY5ZNQXUPC6FCgQinEqAp", "question": "Why is the girl holding her hand to her ear?", "choices": ["posing", "soothing pain", "using phone", "she's sleepy"], "correct_choice_idx": 2, "direct_answers": ["talk", "phone", "cell phone", "phone call", "using phone", "on phone", "phone", "speaking mobile", "phone", "talk"], "difficult_direct_answer": false, "rationales": ["We can tell from her stance and expression that this woman is listening to someone speak through a device held to her ear.", "There is a cellphone held up to her ear.", "The woman is holding up a phone to her ear."], "image": "val2014/COCO_val2014_000000397658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318566, "question_id": "HQakMqeLpMxhJQxhiMrE3C", "question": "What does the woman standing want to take here?", "choices": ["picture", "pulse", "dinner", "statue"], "correct_choice_idx": 0, "direct_answers": ["picture", "photograph", "selfie", "selfie", "selfie", "photo", "photograph", "selfie", "selfie", "photograph"], "difficult_direct_answer": false, "rationales": ["A woman is standing there with her phone taking a selfie.", "The woman is posing while holding a device that is capable of taking photographs.", "Sheis taking a selfie with the statue."], "image": "val2014/COCO_val2014_000000318566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445368, "question_id": "HQi7LY25mchNVC3xbFoCeD", "question": "What is the man doing?", "choices": ["sitting", "sleeping", "working", "cooking"], "correct_choice_idx": 0, "direct_answers": ["siting", "sitting", "sitting", "resting", "feeding cows", "sitting", "watching cows", "digging", "sitting", "sitting"], "difficult_direct_answer": false, "rationales": ["He's sitting down looking at the animals.", "The man is sitting.", "He is resting on the hillside watching the cows."], "image": "train2014/COCO_train2014_000000445368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214046, "question_id": "HQkV9SgyDp6QPJoHawuvTv", "question": "What kind of juice is the woman probably retrieving from the refrigerator?", "choices": ["cranberry", "orange", "grape", "apple"], "correct_choice_idx": 1, "direct_answers": ["fruit", "orange", "milk", "orange", "orange", "orange", "orange juice", "orange juice", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The color of the carton makes a likely.", "The woman is probably retrieving orange juice from the refrigerator.", "The woman seems to be looking for grapes."], "image": "val2014/COCO_val2014_000000214046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555065, "question_id": "HQz56z7jZ2ehhzZpW8MmW2", "question": "What is unusual about the name of the boat?", "choices": ["missing n", "extra l", "extra m", "extra e"], "correct_choice_idx": 1, "direct_answers": ["woman", "nothing", "two names", "actress", "misspelled name", "extra l", "person's name", "unknown", "last name", "first name"], "difficult_direct_answer": true, "rationales": ["Helen is usually spelled with one l.", "There is an extra l.", "Helen is usually spelled with one l."], "image": "train2014/COCO_train2014_000000555065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124591, "question_id": "HRGvytdVscCxTs2LDscFhE", "question": "What allows the surfer to maintain proper body temperature?", "choices": ["surfboard", "gloves", "flippers", "wetsuit"], "correct_choice_idx": 3, "direct_answers": ["wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit", "wetsuit"], "difficult_direct_answer": false, "rationales": ["The surfers have wetsuits.", "The surfer wears a wetsuit to maintain a proper body temperature.", "A surfer usually wears a suit that is meant to be wet while they are surfing in the ocean and to keep them from feeling cold."], "image": "train2014/COCO_train2014_000000124591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113594, "question_id": "HRdtHhrJ4TLdnohvYr79u5", "question": "What is the man's position?", "choices": ["standing", "sitting", "grounded", "midair"], "correct_choice_idx": 3, "direct_answers": ["airborne", "air", "airborne", "off ground", "leaping", "skateboarding", "airborn", "jump", "high above", "midair"], "difficult_direct_answer": true, "rationales": ["The man just jumped and is not touching the ground.", "He is up in the air doing a stunt.", "The man is jumping in the air."], "image": "train2014/COCO_train2014_000000113594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419013, "question_id": "HRiqm4aL8HpGPEKiyRYyHd", "question": "What brand of socks does the man have on?", "choices": ["puma", "nike", "converse", "fila"], "correct_choice_idx": 0, "direct_answers": ["pumba", "puma", "puma", "puma", "puma", "puma", "puma", "pumba", "puma", "puma"], "difficult_direct_answer": false, "rationales": ["The man has on puma socks.", "The brand is puma.", "On the top of the two socks it has the brand logo name of puma."], "image": "train2014/COCO_train2014_000000419013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140738, "question_id": "HS32atnsrm5dryw5oygViy", "question": "Where are these people sitting?", "choices": ["uber", "train", "taxi", "plane"], "correct_choice_idx": 1, "direct_answers": ["bus", "train", "bus", "train", "train", "bus", "bus", "bus", "bus", "train"], "difficult_direct_answer": false, "rationales": ["They are in open land and it has passenger seats", "They are in the middle of nowhere by looking out the window.they are on the ground so it's the only answer would be a locomotive.", "The people are sitting on a train."], "image": "train2014/COCO_train2014_000000140738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218947, "question_id": "HS9vybHtnuGeAYDagVHQXS", "question": "Why is the image mostly green?", "choices": ["bad film", "camera filter", "green snow", "green clouds"], "correct_choice_idx": 1, "direct_answers": ["filter", "filter", "color filter", "frozen lens", "filter", "camera filter", "filter", "camera filter", "photo effect", "filter"], "difficult_direct_answer": false, "rationales": ["The camera filter allows the image to be green.", "There is an image with a camera filter on it, then that is why the image is green.", "The image has a filter."], "image": "val2014/COCO_val2014_000000218947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440937, "question_id": "HSUtMJQkHmzTBMJDiEScXx", "question": "The user of the phone is drinking a beer in which city?", "choices": ["boston", "philadelphia", "pittsburgh", "new york"], "correct_choice_idx": 1, "direct_answers": ["philadelphia", "philadelphia", "philadelphia", "philadelphia", "philadelphia", "philadelphia", "philadelphia", "brotherly love", "brotherly love", "philadelphia"], "difficult_direct_answer": false, "rationales": ["\"the city of brotherly love\" has been the nickname of this city for decades, if not centuries.", "The \"city of brotherly love\" is philadelphia.", "The city of brotherly love is philly."], "image": "val2014/COCO_val2014_000000440937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354584, "question_id": "HSasGRXbdLWQJXyVP7G7TV", "question": "What move is this female player making?", "choices": ["lob", "serve", "receive", "backhand"], "correct_choice_idx": 2, "direct_answers": ["swing", "hitting ball", "save", "hitting", "receive", "backhand", "backhand", "return", "return", "backhand"], "difficult_direct_answer": false, "rationales": ["The move is to receive.", "The woman is looking to hit the ball.", "She's looking up at the ball so she knows where to put her racket."], "image": "val2014/COCO_val2014_000000354584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176229, "question_id": "HSxfHCN8q2os8S2JC5kzeH", "question": "What is the man doing on the skateboard?", "choices": ["walking", "laying down", "kneeling", "standing"], "correct_choice_idx": 2, "direct_answers": ["skating", "riding", "sitting", "riding it", "crouching", "skating", "street luge", "kneeling", "kneeling", "crouching"], "difficult_direct_answer": false, "rationales": ["One can see that he is hunched over and his knees are bent.", "He has his knee almost all the way down on the deck", "All anyone has to do is look at him. he is obviously not standing, laying down or walking."], "image": "train2014/COCO_train2014_000000176229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90732, "question_id": "HSxwaLogdkjaotahDfqyef", "question": "Why is she standing on the stool?", "choices": ["too short", "exercise legs", "her favorite", "floor cold"], "correct_choice_idx": 0, "direct_answers": ["stirring", "too short", "too short", "too short", "to reach", "mixing", "stirring bowl", "be higher", "reach bowl", "to eat"], "difficult_direct_answer": false, "rationales": ["The girl is otherwise too short.", "Small children aren't tall enough to reach things without stools.", "She's too short to reach."], "image": "val2014/COCO_val2014_000000090732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90640, "question_id": "HSzv54V2BLG9THsdT52hfh", "question": "What period of the day is it in the photo?", "choices": ["night", "morning", "evening", "afternoon"], "correct_choice_idx": 3, "direct_answers": ["daytime", "afternoon", "midday", "morning", "daytime", "daytime", "afternoon", "afternoon", "daytime", "afternoon"], "difficult_direct_answer": false, "rationales": ["It appears to be the middle of the day and not one of the other options closer to night time.", "The day must be in the afternoon since the sun is out but it's dull.", "It's actually hard to tell from the image. that said, it's obviously not c or d. that only leaves a and b."], "image": "val2014/COCO_val2014_000000090640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235124, "question_id": "HT4GMMs6ppV3cfZUApyrWT", "question": "What's the name of the wooden structure on the stones?", "choices": ["patio seat", "recliner", "pew", "picnic table"], "correct_choice_idx": 3, "direct_answers": ["stairs", "deck", "table", "bench", "deck", "deck", "picnic table", "deck", "lifeguard station", "bench"], "difficult_direct_answer": false, "rationales": ["There is a picnic table in the background behind the stones.", "The wooden structure is where people have picnics.", "The picnic table allows people to rest."], "image": "val2014/COCO_val2014_000000235124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130914, "question_id": "HT4pLwdn6PSLakSqRsqVWp", "question": "What is the plane flying over?", "choices": ["highway", "ocean", "forest", "desert"], "correct_choice_idx": 0, "direct_answers": ["highway", "highway", "highway", "highway", "road", "landing", "highway", "freeway", "road", "highway"], "difficult_direct_answer": false, "rationales": ["The roadway is clearly a high-speed roadway and the plane is directly above it.", "The airplane is over a four lane street with no traffic lights.", "This is indicated by the cars beneath it. this may be close to a c, but the plane is directly over a."], "image": "train2014/COCO_train2014_000000130914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75775, "question_id": "HT7JPvWxJwMXcvAJBd9Kym", "question": "What game is being played here?", "choices": ["racquet ball", "golf", "pickle ball", "tennis"], "correct_choice_idx": 2, "direct_answers": ["tennis", "tennis", "badminton", "tennis", "pickle ball", "tennis", "tennis", "tennis", "tennis", "badminton"], "difficult_direct_answer": false, "rationales": ["This is a small tennis court.", "Pickleball is played on a court with racquets that are wider.", "The sport is a combination of tennis and table tennis."], "image": "val2014/COCO_val2014_000000075775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4834, "question_id": "HTDvCkDH824UEVJWGhr8pt", "question": "Why all the logs?", "choices": ["for sale", "start fire", "build house", "balance boat"], "correct_choice_idx": 0, "direct_answers": ["for sale", "firewood", "cut firewood", "sales hope", "for sale", "for fire", "to store", "behind man", "taking market", "build fires"], "difficult_direct_answer": true, "rationales": ["A shy girl sits in front of neatly stacked large and small logs as she waits for her next customer. such precise stacking adds to the attractiveness of the product.", "She has them in piles as if selling them.", "All of the logs on the boat are assembled for sale."], "image": "train2014/COCO_train2014_000000004834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431738, "question_id": "HTGVGdENT2nmq5Z5Jid2WY", "question": "Who is the woman in black seated on the court?", "choices": ["host", "official", "relative", "announcer"], "correct_choice_idx": 1, "direct_answers": ["atvacate", "judge", "coach", "referee", "referee", "referee", "referee", "official", "hard work", "coach"], "difficult_direct_answer": false, "rationales": ["Officials of tennis matches sit on the side of the court. officials often wear clothes that distinguish them from others. the person on the side of the court is in all black.", "The woman is the official.", "A person in a uniform is on the side of a tennis court. officials officiate sporting events from the side of the court."], "image": "val2014/COCO_val2014_000000431738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96640, "question_id": "HTermTSY2gysrs2aBa2D7z", "question": "This street is located in what continent?", "choices": ["africa", "europe", "asia", "america"], "correct_choice_idx": 2, "direct_answers": ["asia", "asia", "asia", "unknown", "asia", "india", "thailand", "india", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The bus has writing from an indo-aryan language on its front. the bus is manufactured by ashok leyland, which is an indian company.", "This country looks to be india so it would be that continent.", "The street is in asia."], "image": "train2014/COCO_train2014_000000096640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305000, "question_id": "HTuxW3LZNiufAV8aPqE2gu", "question": "What might the lady standing here purchase?", "choices": ["grains", "produce", "toys", "meats"], "correct_choice_idx": 1, "direct_answers": ["produce", "food", "fruit", "produce", "fruits", "fruit", "produce", "produce", "fruits", "fruit vegetables"], "difficult_direct_answer": false, "rationales": ["There are fruits and veggies.", "Fresh fruits and vegetables are laid out in front of this woman in this marketplace scene.", "There are bananas, leeks, pineapples and lettuce for sale."], "image": "val2014/COCO_val2014_000000305000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181837, "question_id": "HU6P5pHoCY5g2pY23sQasi", "question": "What is the nickname for the first name on the wood sign?", "choices": ["frank", "dick", "gill", "bill"], "correct_choice_idx": 3, "direct_answers": ["will", "bill", "bill", "bill", "bill", "milk", "will", "bill", "dick", "bill"], "difficult_direct_answer": false, "rationales": ["It is short for william.", "The first name on the sign is william.", "The name will could be said as bill."], "image": "train2014/COCO_train2014_000000181837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11487, "question_id": "HUBmii68JomcXSMBg4gJtG", "question": "Why no hands or head?", "choices": ["cut off", "small limbs", "is mannequin", "under dress"], "correct_choice_idx": 2, "direct_answers": ["umbrella covering", "weird angle", "mannequin", "no shown", "covered", "under umbrella", "mannequin", "is mannequin", "umbrella", "mannequin"], "difficult_direct_answer": false, "rationales": ["Her hands are under the sleeves.", "Not a person", "There is a fake woman standing there in the street. she has an umbrella under her."], "image": "train2014/COCO_train2014_000000011487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105052, "question_id": "HUE2o9hB6mwX2Ez7LYn9f9", "question": "What activity is this location used for?", "choices": ["retirement", "grocery shopping", "banking", "sleeping"], "correct_choice_idx": 3, "direct_answers": ["sleeping", "socializing", "shooting", "sleeping", "lodging", "weddings", "conventions", "rest", "sleeping", "weddings"], "difficult_direct_answer": false, "rationales": ["This location is used for sleeping and is a bed and breakfast.", "The other options usually don't apply to a place called an \"inn,\" which indicates lodging.", "Most hotels are generally used to stay overnight."], "image": "val2014/COCO_val2014_000000105052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17869, "question_id": "HUPcUCVYzP3x2g8gANFkBH", "question": "What does the man in white shirt and black and white shorts want to do with the frisbee first here?", "choices": ["toss it", "catch it", "avoid it", "hide it"], "correct_choice_idx": 1, "direct_answers": ["catcher box", "catch", "catch", "catch", "catch", "heand", "exercise", "catch it", "catch", "catch it"], "difficult_direct_answer": false, "rationales": ["He has both hands in the air with his fingers extended so that he can grasp the frisbee as it approaches.", "A man is reaching up towards an incoming frisbee.", "The man is holding up each hand. he is waiting for the frisbee."], "image": "val2014/COCO_val2014_000000017869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509367, "question_id": "HUc5s8wLS4GUWLYWh75Xcf", "question": "How many lower classifications do ostriches has?", "choices": ["five", "one", "four", "two"], "correct_choice_idx": 2, "direct_answers": ["three", "few", "five", "four", "one", "two", "four", "two", "two", "one"], "difficult_direct_answer": false, "rationales": ["If google is correct, then they're bird and reptile.", "Ostriches have four classifications underneath them.", "The classifications are north african, southern, arabian, and masai."], "image": "train2014/COCO_train2014_000000509367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504977, "question_id": "HUc95oRRfPNbPb9QJutVqt", "question": "What activity is the old lady engaging in?", "choices": ["resting", "crying", "sleeping", "praying"], "correct_choice_idx": 0, "direct_answers": ["sitting", "resting", "prayer", "sitting", "sitting", "sitting", "praying", "sitting", "bird watching", "sitting"], "difficult_direct_answer": false, "rationales": ["The woman is hanging out on the bench.", "She is sitting down on a bench in a park.", "She is sitting."], "image": "val2014/COCO_val2014_000000504977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374832, "question_id": "HVWyQ6bDdDvpw3o4dTBT4p", "question": "What is delivered in the brown box?", "choices": ["lamp", "pillows", "computer", "toys"], "correct_choice_idx": 2, "direct_answers": ["computer", "computer", "computer", "computer", "computer", "computer", "computer", "computer", "computer", "computer"], "difficult_direct_answer": false, "rationales": ["Dell is a computer company.", "It says dell on it which is known for this type of electronic", "The box says dell."], "image": "train2014/COCO_train2014_000000374832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87649, "question_id": "HVh8mHWmiVcQMRpx7ek67v", "question": "What is the gold object near the front of the train?", "choices": ["shield", "handle", "bell", "helmet"], "correct_choice_idx": 2, "direct_answers": ["bell", "bell", "bell", "bell", "bell", "bell", "bell", "bell", "bell", "bell"], "difficult_direct_answer": false, "rationales": ["Its a bell that rings to let people know that its coming down the tracks.", "A way to give notice to people on or near the tracks, a train's bell is often gold and shiny.", "The gold object is the bell on the train they ring it to let people know they are coming."], "image": "train2014/COCO_train2014_000000087649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464200, "question_id": "HVkmCKMzkKxKYshJjAE64c", "question": "What does this lady intend to do?", "choices": ["wash disc", "throw disc", "eat food", "drink beverage"], "correct_choice_idx": 1, "direct_answers": ["through", "toss frisbee", "throw", "throw frisbee", "score", "toss frisbee", "throw", "throw frisbee", "throw disc", "frisbee"], "difficult_direct_answer": false, "rationales": ["She is holding the toy in her hand with the intention to throw it into the game piece that has the chains on it.", "The lady is playing a game with the frisbee. she needs to get the frisbee into the basket to score.", "The lady has an object similar to a frisbee in her hands is ready to toss it."], "image": "val2014/COCO_val2014_000000464200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101215, "question_id": "HVoT9vaTJaMMXKrGMTXNTX", "question": "What is she getting ready to do?", "choices": ["stand", "swing", "swim", "sit"], "correct_choice_idx": 1, "direct_answers": ["swing", "hit ball", "hit ball", "hit ball", "move", "hit ball", "hit ball", "play tennis", "hit ball", "hit"], "difficult_direct_answer": false, "rationales": ["A girl in a tennis outfit is hold a racket up and to the side.", "The girl wants to swing her racquet.", "As indicated by her stance and the positioning of the racket."], "image": "train2014/COCO_train2014_000000101215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260, "question_id": "HWVZ9B9SaSD5ibcYfeeVJr", "question": "What is the woman in yellow waiting for?", "choices": ["her pet", "rain", "ride", "check"], "correct_choice_idx": 2, "direct_answers": ["ride", "bus", "train", "bus", "bus", "bus", "bus", "taxi", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["This woman appears to have luggage with her and waiting outside of the baggage claim area. in all likelihood she is on the sidewalk waiting for a ride.", "This is a bus stop likely near an airport or other public transportation depot.", "The bus this woman is presumably waiting for is seen arriving to the right and we can assume she will be boarding it."], "image": "train2014/COCO_train2014_000000000260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532007, "question_id": "HWXsstnyPKJCLnEhFjSWQK", "question": "In which direction will the horses most likely go next?", "choices": ["left", "backwards", "straight", "nowhere"], "correct_choice_idx": 0, "direct_answers": ["along shore", "left side", "into water", "towards water", "left", "left", "left", "left", "forward", "left"], "difficult_direct_answer": false, "rationales": ["There are three people on horses. if they go straight they will go into water so turning will help.", "It would make no sense to go right as it goes into the ocean.", "The horses can not go straight ahead as they would be going into the ocean. they are already slightly angled towards our left so it is most likely that's where they'll trot towards next."], "image": "val2014/COCO_val2014_000000532007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313201, "question_id": "HX5mjnGeuD7rx5qKULVQUg", "question": "Which food is this country famous for?", "choices": ["pizza", "poutine", "sushi", "borsht"], "correct_choice_idx": 2, "direct_answers": ["sushi", "chinese food", "sushi", "sushi", "chinese food", "sushi", "chinese food", "sushi", "sushi", "sushi"], "difficult_direct_answer": false, "rationales": ["The food is sushi.", "Raw fish is famous in a lot of asian countries.", "The signs are in japanese, not italian, russian, or french."], "image": "train2014/COCO_train2014_000000313201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134454, "question_id": "HXC3j2QM2tE9gZ9jgGJYZm", "question": "What league is the player playing in?", "choices": ["college", "major league", "little league", "minor league"], "correct_choice_idx": 3, "direct_answers": ["national", "american", "mba", "amc", "minor", "mlb", "minors", "major", "minor league", "mlb"], "difficult_direct_answer": true, "rationales": ["The player is in the minor league.", "The league is the minor one.", "This is not a major league team so he must be just under that."], "image": "train2014/COCO_train2014_000000134454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395822, "question_id": "HXagcwRy5z6aZ24n6fygnm", "question": "What is inside the tall glasses?", "choices": ["candles", "beer", "wine", "candy"], "correct_choice_idx": 0, "direct_answers": ["candles", "candles", "items", "candles", "items", "items", "candles", "candles", "candles", "candles"], "difficult_direct_answer": false, "rationales": ["There are candles sitting inside of the tall glass.", "The glasses are holding the pillar variety of this type of home decor.", "The glasses contain items typically used to provide lighting for nice dinners, thus matching the items identified in a."], "image": "train2014/COCO_train2014_000000395822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455435, "question_id": "HXb2Cyk33ebs4NqoSzh58J", "question": "What is the source of the water here?", "choices": ["hose", "fire hydrant", "rainstorm", "snow"], "correct_choice_idx": 1, "direct_answers": ["fire hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "rain", "hydrant", "fire hydrant", "hydrant", "fire hydrant", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["The water is spraying upwards on a city street", "The fire hydrant is spouting water.", "The fire hydrant is spraying all the water out."], "image": "val2014/COCO_val2014_000000455435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168162, "question_id": "HXc3gqnQtb4G8RfrVLsRKA", "question": "What event is this horse good at?", "choices": ["running", "prancing", "counting", "jumping"], "correct_choice_idx": 3, "direct_answers": ["steeplechase", "jumping", "horse race", "hurdles", "jumping", "racing", "jumping", "horse race", "jumping", "jumping"], "difficult_direct_answer": false, "rationales": ["The horse is jumping over the hurdle.", "The horse is in the air going over poles", "The horizontal poles between vertical stands are used for horses jumping. this horse has cleared them so far with its front hooves, good so far!."], "image": "val2014/COCO_val2014_000000168162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309314, "question_id": "HXtCHdtCcyCqfVwi8fKyx5", "question": "What is being trimmed here?", "choices": ["wool", "horn", "hoof", "tail"], "correct_choice_idx": 2, "direct_answers": ["nails", "sheep", "toenails", "hoof", "hooves", "hooves", "hooves", "hoof", "hoof", "hoof"], "difficult_direct_answer": false, "rationales": ["The man is holding up the animal's hand and has the clippers in his hand.", "The hoof is being trimmed.", "The llama's hoof is being trimmed."], "image": "train2014/COCO_train2014_000000309314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555904, "question_id": "HYJxV6dyY8hiCAa4eHiQLw", "question": "What is the woman behind the counter's profession?", "choices": ["banker", "bartender", "card dealer", "accountant"], "correct_choice_idx": 1, "direct_answers": ["wine taster", "bartender", "bartender", "bartender", "wine counter", "bartender", "bartender", "bartender", "bartender", "bartender"], "difficult_direct_answer": false, "rationales": ["A woman stands behind a bar. bartenders work in bars.", "The woman is by a bartender.", "The shelves behind this lady are lines with bottles of alcohol. there are barstools up at the counter and several of the customers have alcoholic beverages with them."], "image": "val2014/COCO_val2014_000000555904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365251, "question_id": "HYfdTzLRcuUoQVLnwkpyWA", "question": "What do the buttons to the right of the tissue rolls control?", "choices": ["temperature", "lights", "toilet", "radio"], "correct_choice_idx": 2, "direct_answers": ["bidet", "toilet", "bidet settings", "bidet", "toilet", "bidet", "bidet", "flush", "bidet", "toilet flush"], "difficult_direct_answer": false, "rationales": ["These are high end toilets that you can use the buttons to flush.", "The buttons are for the toilet.", "The buttons are attached to the toilet."], "image": "train2014/COCO_train2014_000000365251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551686, "question_id": "HYfqRo5eo3TAKjF9TzhYBH", "question": "What did the people who wait here just exit?", "choices": ["elephant", "taxi", "plane", "rental car"], "correct_choice_idx": 2, "direct_answers": ["airplane", "more people", "more people", "airplane", "airplane", "airplane", "plane", "airplane", "airplane", "plane"], "difficult_direct_answer": false, "rationales": ["The people waiting at the luggage carousel to pick up their luggage just got off of the aircraft which transported them.", "You can tell by the people being at the baggage claim, what they just exited.", "The people got off the plane."], "image": "train2014/COCO_train2014_000000551686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274685, "question_id": "HYihS9LJ8aZJ3969Rpcobk", "question": "Why is the man in the rear wearing green clothing?", "choices": ["mobility", "horse-riding outfit", "camouflage", "visibility"], "correct_choice_idx": 2, "direct_answers": ["different team", "camouflage", "equestrian look", "military", "playing soldier", "camouflage", "uniform", "hunting", "ranger", "camouflage"], "difficult_direct_answer": false, "rationales": ["The man is wearing it as he's trying to blend in with the grass.", "He is wearing that so he blends in with his surroundings more so.", "The man is camouflaged."], "image": "train2014/COCO_train2014_000000274685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454692, "question_id": "HZHsNE8wk685qfKX4SKxg8", "question": "When does the Children's Online Privacy Protection Act took effect in?", "choices": ["sep 1999", "apr 2000", "may 2000", "aug 1990"], "correct_choice_idx": 1, "direct_answers": ["apr 2000", "1998", "playing", "1996", "1998", "april 2000", "no idea", "2000", "past", "twenty nineteen"], "difficult_direct_answer": true, "rationales": ["That's when it took effect.", "The act was from april 2000.", "The act was passed in 1998 to be effective in the fourth month of the new millennium."], "image": "train2014/COCO_train2014_000000454692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531086, "question_id": "HZKwX9yhNxP3ywmMTWugSN", "question": "Which object here would be the heaviest?", "choices": ["fridge", "oven", "bowl", "water purifier"], "correct_choice_idx": 0, "direct_answers": ["refrigerator", "refrigerator", "fridge", "fridge", "fridge", "fridge", "refrigerator", "refrigerator", "fridge", "fridge"], "difficult_direct_answer": false, "rationales": ["The refrigerator is the heaviest item in this scene which is not also bolted down or attached to the room.", "Fridges are one of the heaviest appliances.", "The fridge would be the heaviest."], "image": "val2014/COCO_val2014_000000531086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459372, "question_id": "HZLAm8XMmgEU7K7fYTDyGV", "question": "This man is wearing a mask to look like a politician from what country?", "choices": ["germany", "australia", "luxembourg", "united kingdom"], "correct_choice_idx": 3, "direct_answers": ["great britain", "england", "uk", "united kingdom", "united kingdom", "england", "america", "america", "united state", "england"], "difficult_direct_answer": false, "rationales": ["David cameron is the former prime minister of the united kingdom.", "The politician is a well known world leader from this country.", "He is trying to look like one from the uk"], "image": "train2014/COCO_train2014_000000459372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163692, "question_id": "HZjeYCu4BpmFoxn6kVmUhE", "question": "Where is the person to whom the shown keys belong now?", "choices": ["errand", "car", "mexico", "here"], "correct_choice_idx": 3, "direct_answers": ["away", "behind computer", "here", "office", "at desk", "desk", "room", "work", "teddy bear", "away"], "difficult_direct_answer": true, "rationales": ["The person is here because they are taking the picture.", "They are in the room, sitting on the other side of the desk.", "This is an office, not mexico. the keys are on the desk, so the person is not in the car or running an errand."], "image": "train2014/COCO_train2014_000000163692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287737, "question_id": "HZoDbnEgmT3pRkTichZqEv", "question": "What is the man doing with the red machine?", "choices": ["making cider", "coring/peeling apples", "cutting vegetables", "making juice"], "correct_choice_idx": 1, "direct_answers": ["peeling apple", "de-coring apple", "coring apple", "peeling apples", "coring/peeling apples", "moving it", "peeling", "coring apples", "peeling apples", "peeling apples"], "difficult_direct_answer": false, "rationales": ["The man is trying to core and peel apples.", "There is an apple attached on a rod. the machine has a cutter so it can core the apples.", "The man is here with the red machine to core and peel the apples."], "image": "train2014/COCO_train2014_000000287737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241728, "question_id": "HZrU49v4ASex24DLNJCDSB", "question": "What type of area is shown?", "choices": ["residential", "tropical", "city", "country"], "correct_choice_idx": 2, "direct_answers": ["city", "street", "city", "food stand", "downtown city", "intersection", "street", "urban", "street corner", "city"], "difficult_direct_answer": false, "rationales": ["A city with a street cart is shown.", "The area is a city.", "The people are buying hot dogs from a vendor on a city street."], "image": "val2014/COCO_val2014_000000241728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41135, "question_id": "Ha5YkSwpUZk7PWL6tJ6iGh", "question": "What is the other successful auto company from this company's country?", "choices": ["audi", "renault", "hyundai", "ford"], "correct_choice_idx": 2, "direct_answers": ["kia", "hyundai", "hyundai", "kia", "hyundai", "kia", "hyundai", "hyundai", "mitsubishi", "hyundai"], "difficult_direct_answer": false, "rationales": ["The car company advertised on the banner is from the same asian country that makes hyundais.", "Hyundai and kia are both run out of south korea.", "Korea is a country that is home to car manufacturers."], "image": "train2014/COCO_train2014_000000041135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519356, "question_id": "Ha7gBnTKT4AY4qKsibTMxn", "question": "Why does the man on the surf board crouch?", "choices": ["stealth hiding", "shark sighting", "improved balance", "he's falling"], "correct_choice_idx": 2, "direct_answers": ["balance", "playing", "balance", "balance", "balance", "for stability", "balance", "improved balance", "maintain balance", "surfing"], "difficult_direct_answer": false, "rationales": ["A surfer is riding a wave on top of the surfboard with knees slightly ben and arms out.", "A man is in a crouched position on a surfboard. people crouch on surfboards to get balance.", "The other options don't apply to surfing or his position."], "image": "train2014/COCO_train2014_000000519356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343769, "question_id": "Ha9kfSV2W3LYo5eyHGDoke", "question": "The dominant color on the shirt is the same color as what food item?", "choices": ["ketchup", "mustard", "salt", "relish"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "banana", "banana", "banana", "banana", "banana", "frisbee", "mustard", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["The dominant color is yellow and only one choice is yellow. the others food item choices are red, white, and green.", "It is yellow", "The shirt is yellow."], "image": "val2014/COCO_val2014_000000343769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579036, "question_id": "HaPSFcZwBH6XsQGQzUrxRV", "question": "What type of persons ride the bus here?", "choices": ["tourists", "city workers", "union workers", "taxi drivers"], "correct_choice_idx": 0, "direct_answers": ["tourists", "tourists", "tourists", "tourists", "passenger", "tourists", "tourist", "tourist", "pedestrians", "tourist"], "difficult_direct_answer": false, "rationales": ["Tourists climb aboard the buses here.", "The bus is a double decker one with an open top to see better.", "This is a double decker bus with a view for tourists."], "image": "val2014/COCO_val2014_000000579036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113977, "question_id": "HafWcCxiCSDqFuMAiDPPJz", "question": "What is the with glasses on her head looking at?", "choices": ["performance", "bird", "grass", "camera"], "correct_choice_idx": 1, "direct_answers": ["camera", "bird", "bird", "photographer", "bird", "at camera", "camera", "eagle", "eagle", "eagle"], "difficult_direct_answer": false, "rationales": ["The lady with the glasses on her head is watching the bird fly.", "The person with glasses on her head is looking at the camera.", "The big bird flying over."], "image": "val2014/COCO_val2014_000000113977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459569, "question_id": "HagxErLMm4qiojnPvuukRj", "question": "Why is the player in white wearing gloves?", "choices": ["grip", "costume", "warmth", "health"], "correct_choice_idx": 0, "direct_answers": ["batter", "for grip", "playing", "better grip", "better grip", "sebastian value", "nine", "batting glove", "grip", "grip"], "difficult_direct_answer": false, "rationales": ["The player is wearing gloves so his hands don't slip.", "The player is gripping the bat.", "Gloves help to hold the wooden handle of the bat tighter."], "image": "train2014/COCO_train2014_000000459569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280230, "question_id": "HakVSgGGBgS9M9c7YiYqdp", "question": "In which manner are the potatoes here prepared?", "choices": ["dried", "baked", "fried", "boiled"], "correct_choice_idx": 2, "direct_answers": ["fried", "french fried", "fried", "fried", "fried", "french fries", "fried", "fried", "french fries", "fried"], "difficult_direct_answer": false, "rationales": ["The other options obviously don't apply. the name fries is also an indication.", "These potatoes are french fries and can be cooked in a fryer.", "These potatoes are prepared by slicing them and deep frying."], "image": "val2014/COCO_val2014_000000280230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415280, "question_id": "HamjD8VjN2uRyYihjXoDcp", "question": "Why is this statue partially white?", "choices": ["age", "style", "bird droppings", "water discoloration"], "correct_choice_idx": 2, "direct_answers": ["chipped paint", "age", "poo", "paint chip", "bird poop", "bird droppings", "peeling", "chipped paint", "bird droppings", "damage"], "difficult_direct_answer": false, "rationales": ["Birds sit on it a lot and they poop everywhere", "The statue is partially white because it is covered with bird droppings.", "The statue is partially white because it is covered in bird droppings."], "image": "train2014/COCO_train2014_000000415280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325593, "question_id": "Hb87CixWgGAeUWzZqSQ9Gx", "question": "Which elevation is this skateboarder likely to go to next?", "choices": ["stay's still", "same", "higher", "lower"], "correct_choice_idx": 3, "direct_answers": ["down", "bowl", "beneath", "up", "lower", "lower", "down", "down", "lower ground", "skating deck"], "difficult_direct_answer": false, "rationales": ["The elevation is lower.", "The only way to go on the ramp is down lower.", "He is at the upper part of the skateboard park"], "image": "val2014/COCO_val2014_000000325593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115618, "question_id": "Hb9Dk2FUdi6xUiQRDtXnmb", "question": "What purpose does the orange netting serve?", "choices": ["control drifts", "decorative only", "cattle control", "property lines"], "correct_choice_idx": 0, "direct_answers": ["safety warning", "storm fencing", "danger zone", "warning", "barrier", "barrier", "border marker", "safety", "safety", "control drifts"], "difficult_direct_answer": false, "rationales": ["The orange netting serve control purposes for the hillside.", "The orange netting controls any drifting snow.", "The purpose is to control drifts."], "image": "val2014/COCO_val2014_000000115618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197888, "question_id": "HbTtCdG3amyXu33Sy3f4z3", "question": "This food is likely high in what?", "choices": ["radon", "vitamin d", "arsenic", "sodium"], "correct_choice_idx": 3, "direct_answers": ["fat", "calories", "sodium", "calories", "fat", "starch", "sodium", "calories", "sodium", "calories"], "difficult_direct_answer": false, "rationales": ["The hot dog is most likely high in sodium.", "The food has sodium.", "Processed foods like the hotdog on the plate are often high in sodium."], "image": "train2014/COCO_train2014_000000197888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205847, "question_id": "Hbpf6HCsY9w5fcqk9VYtvM", "question": "What venue is this?", "choices": ["hospital ward", "hotel room", "bedroom", "apartment"], "correct_choice_idx": 1, "direct_answers": ["hotel", "hotel", "hotel/motel", "hotel", "hotel", "hotel room", "hotel", "hotel", "hotel", "hotel room"], "difficult_direct_answer": false, "rationales": ["The venue is a hotel room.", "Based on the badly framed artwork, common bed situation, and drab lighting, this room is of the type for travelling humans to stay in temporarily.", "Two beds with matching bedspreads, white sheets and a nightstand in between along with a large, framed picture and smoke detector on the wall are all things found in a place where you pay for your lodging."], "image": "val2014/COCO_val2014_000000205847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428851, "question_id": "HbuoAV4HRdcudTuv8NhvCX", "question": "Who is the woman in relation to the child in red?", "choices": ["teacher", "mother", "grandmother", "sister"], "correct_choice_idx": 1, "direct_answers": ["mother", "mother", "mother", "mother", "mother", "mother", "mother", "mother/caretaker", "mother/caretaker", "mother"], "difficult_direct_answer": false, "rationales": ["The woman with the child in red is the child's mother.", "The woman is the mom.", "Mothers hold their children in this exact manner."], "image": "train2014/COCO_train2014_000000428851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423637, "question_id": "HcLsmMy8NVmKhKLke6Y7uM", "question": "What organism was usually transported in the black object?", "choices": ["bird", "baby", "cat", "dog"], "correct_choice_idx": 1, "direct_answers": ["baby", "baby person", "baby", "babies", "baby", "baby", "baby", "baby", "babies", "baby"], "difficult_direct_answer": false, "rationales": ["A bassinet on wheels is meant for a newborn.", "The black item is a stroller for parents to use.", "The organism is a baby."], "image": "train2014/COCO_train2014_000000423637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373750, "question_id": "HcSmtQV6Vz3ZoYidXzAYqa", "question": "What does the girl in blue have on her hand?", "choices": ["sequin glove", "oven mitt", "baseball glove", "paint"], "correct_choice_idx": 2, "direct_answers": ["baseball mitt", "baseball glove", "baseball mitt", "baseball mitt", "baseball mitt", "baseball mitt", "mitt", "mitten", "baseball glove", "baseball glove"], "difficult_direct_answer": false, "rationales": ["The girl has a mitt on her glove that is used for catching baseballs.", "If you are a sports fan you can easily tell what she has on her hand.", "The girl has a glove."], "image": "train2014/COCO_train2014_000000373750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502911, "question_id": "HcbuFnE2Gwb7CQciMWeKUe", "question": "How are these desserts cooked?", "choices": ["baked", "grilled", "sauteed", "fried"], "correct_choice_idx": 3, "direct_answers": ["baked", "fried", "fried", "fried", "fried", "fried", "sugared", "fried", "frying pan", "fried"], "difficult_direct_answer": false, "rationales": ["Donuts are fried in oil.", "The items are friend sugary foods that require the frying processes to look the way they appear in the image. the dough must be fried to be edible in a prepared form.", "These are donuts"], "image": "train2014/COCO_train2014_000000502911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118769, "question_id": "HciKB6aqnqSxgQ2tcQKFmE", "question": "How do these two know each other?", "choices": ["coworkers", "classmates", "spouses", "teammates"], "correct_choice_idx": 2, "direct_answers": ["married", "married", "couple", "descendant", "lovers", "married", "spouses", "married", "husband wife", "married"], "difficult_direct_answer": false, "rationales": ["These people are embracing like husband and wife.", "The people are a couple.", "The people are married."], "image": "train2014/COCO_train2014_000000118769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499369, "question_id": "Hck5ztzdB6nxABpFj4rUdr", "question": "What is the name of the beard style?", "choices": ["round", "pin", "circular", "french"], "correct_choice_idx": 3, "direct_answers": ["goatee", "goatee", "goatee", "goatee", "goatee", "french", "french", "full", "circle beard", "goatee"], "difficult_direct_answer": false, "rationales": ["The man's beard is in the french style.", "A man has a beard that is thin on the sides and does not cover his cheeks and there are no sideburns.", "The man has a french beard."], "image": "train2014/COCO_train2014_000000499369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478016, "question_id": "HcuwU8LUpRTSZEEz4MsgcC", "question": "What is needed to light the candles on the chandeliers?", "choices": ["air", "minerals", "fire", "water"], "correct_choice_idx": 2, "direct_answers": ["ladder", "pole lighter", "matches", "fire", "fire", "ladder", "fire", "flame", "lightbulb", "match"], "difficult_direct_answer": false, "rationales": ["Candle wicks work if they are lit by a fire source rather than other sources of energy.", "It is a candle chandelier.", "They are candles that burn"], "image": "train2014/COCO_train2014_000000478016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458804, "question_id": "HdHexXHhYSdMiyT7jmkiNQ", "question": "What type of animal is the middle toy the child is holding?", "choices": ["pig", "donkey", "tiger", "bear"], "correct_choice_idx": 1, "direct_answers": ["donkey", "donkey", "donkey", "donkey", "donkey", "tiger", "donkey", "eeyore", "stuffed", "donkey"], "difficult_direct_answer": false, "rationales": ["One of them resembles a donkey.", "The little child is holding a stuffed donkey from disney.", "That's eeyore from winnie the pooh."], "image": "train2014/COCO_train2014_000000458804.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274793, "question_id": "HdfMf82pGiUNggX9v884op", "question": "Which part of the animals is abnormal?", "choices": ["fur skin", "legs", "tail", "mane"], "correct_choice_idx": 2, "direct_answers": ["tail", "dark brown", "tail chopped", "short tails", "tails", "tail", "tails", "tail", "tails", "tails"], "difficult_direct_answer": false, "rationales": ["The horses' tails have been chopped off.", "The tails of the animals are abnormally short.", "What would normally be long they have cut their hair on their tails."], "image": "train2014/COCO_train2014_000000274793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454477, "question_id": "HdnUesd4FnB3ySdgZvXFKi", "question": "What protection feature covering the glass on the door is made out of what material?", "choices": ["wood", "metal", "glass", "aluminum"], "correct_choice_idx": 1, "direct_answers": ["metal", "metal", "wood", "iron", "iron", "metal", "metal", "metal", "iron", "metal"], "difficult_direct_answer": false, "rationales": ["The protection is metal.", "The glass door has metal on it.", "Black bars are in a grid on the glass of a door at a restaurant."], "image": "train2014/COCO_train2014_000000454477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402632, "question_id": "HewNRR7tcwTz2Nhnz3fv9y", "question": "What is the occupation of the two men?", "choices": ["driver", "farmer", "hawker", "pan holder"], "correct_choice_idx": 2, "direct_answers": ["food vendors", "hawker", "selling produce", "fruit vendors", "grocer", "grocer", "venders", "fruit vendors", "fruit vendor", "vendors"], "difficult_direct_answer": false, "rationales": ["These two men are shop hawkers.", "Portable stores like this are used as an efficient way to travel and sell products. these people are known to 'hawk their wares'.", "The occupation is a hawker."], "image": "train2014/COCO_train2014_000000402632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368541, "question_id": "Hf66ht53cMecZ4aj9MChb6", "question": "What type of luggage do persons have here?", "choices": ["freight", "cargo", "carryon", "animals"], "correct_choice_idx": 2, "direct_answers": ["suitcases", "suitcases", "carry on", "rolling", "suitcases", "carryon", "rolling suitcases", "suitcases", "leather", "airport"], "difficult_direct_answer": false, "rationales": ["The luggage is carry on.", "The people boarding the plane have suitcases that will be stored in the overhead compartments.", "The passengers are getting on to the plane with these bags rather than having them checked inside the airport"], "image": "val2014/COCO_val2014_000000368541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503407, "question_id": "HfK2hjr3BJE94au4SfFFTR", "question": "What is this type of waterway called?", "choices": ["ocean", "canal", "lake", "stream"], "correct_choice_idx": 1, "direct_answers": ["river", "river", "river", "canal", "river", "river", "canal", "canal", "unknown", "canal"], "difficult_direct_answer": false, "rationales": ["It runs between a prescribed channel.", "Canals are routes with water.", "The waterway is a canal."], "image": "train2014/COCO_train2014_000000503407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425788, "question_id": "HfNCPsFw5yApionrwivenL", "question": "What other utensil is paired with this one?", "choices": ["slicer", "knife", "spoon", "spork"], "correct_choice_idx": 1, "direct_answers": ["knife", "knife", "knife", "knife", "knife", "knife", "spoon", "knife", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["It is a fork.", "A knife is usually paired with a fork.", "A fork and knife are often used at the same time where as a fork and a spoon are usually used separately."], "image": "train2014/COCO_train2014_000000425788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360751, "question_id": "HfNN3M2eA8SE3cagyMReEQ", "question": "What is the group doing at the beach?", "choices": ["wedding", "clam bake", "bon fire", "blanket bingo"], "correct_choice_idx": 0, "direct_answers": ["taking pictures", "leisuring", "photography", "photographing horses", "standing", "wedding photos", "enjoying view", "wedding", "pictures", "wedding"], "difficult_direct_answer": true, "rationales": ["People are dressing formally and taking pictures. during a wedding you have to dress nice and some people like to take pictures.", "The group is dressed in formal attire and there are horses and chairs at the beach.", "The group is at a wedding."], "image": "train2014/COCO_train2014_000000360751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560561, "question_id": "HfXKpezQHFVhMeMGpgywkP", "question": "What is the food held by the person used for?", "choices": ["dumping", "feeding", "eating", "training"], "correct_choice_idx": 3, "direct_answers": ["encouragement", "not food", "eating", "treat", "incentive", "training", "dog treat", "training", "train dog", "eating"], "difficult_direct_answer": false, "rationales": ["To entice the dog to move forward", "A small handheld treat is a common method for encouraging dogs to perform certain behaviors. the dog appears to be learning a new skill and is focused on the treat in the person's hand.", "He is using it to teach the dog a trick with the skateboard."], "image": "train2014/COCO_train2014_000000560561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 731, "question_id": "HfdTicmP4k4xuJFNy9Arkz", "question": "Why is the woman using an umbrella?", "choices": ["rain", "costume", "sun", "snow"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "raining", "raining", "its raining", "rain", "raining", "its raining", "rain", "raining"], "difficult_direct_answer": false, "rationales": ["This woman stands in a flooded field in rubber boots. below her we see the ripples of rain drops hitting the muddy water.", "The woman is using an umbrella for protection from rain in the flooded plain.", "There is water on the ground everywhere and the campground is not near a beach. rain would create this scenario."], "image": "train2014/COCO_train2014_000000000731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56323, "question_id": "HfwTQwgEZLh5yjE6WdWzuT", "question": "What information does such an electronic billboard depict in this scenario?", "choices": ["train", "weather", "stock market", "politics"], "correct_choice_idx": 0, "direct_answers": ["train times", "train", "train times", "information", "train arrival", "train times", "destination", "train times", "destinations", "times"], "difficult_direct_answer": false, "rationales": ["It gives route and times for the next arrivals", "The picture was taken in a transit station. in transit stations, signs of this type usually display information related to departure and arrival times.", "It tells what train is coming in or going."], "image": "train2014/COCO_train2014_000000056323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555748, "question_id": "HfyXe7Nq79YaCsBWZBwxdo", "question": "What is on the advertisement overlooked by the gold clock?", "choices": ["watch", "dress", "food", "perfume"], "correct_choice_idx": 1, "direct_answers": ["black gown", "dress", "dress", "dress", "desire", "dress", "black dress", "dress", "dress", "dress"], "difficult_direct_answer": false, "rationales": ["A silhoutte of an outfit.", "There is an advertisement for a black dress on the wall below the gold clock.", "The advertisement is for a dress."], "image": "val2014/COCO_val2014_000000555748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476843, "question_id": "HgL7JRgHAtgaZLG6B8hS7c", "question": "Why are they dressed in black?", "choices": ["fashionable", "lack money", "easier spotting", "wetsuits"], "correct_choice_idx": 3, "direct_answers": ["sun warmth", "stay dry", "wet suits", "surfers", "wetsuit color", "keep warm", "wetsuits", "stay warm", "keep warm", "fashion statement"], "difficult_direct_answer": true, "rationales": ["The wetsuits and colour black keep the heat inside the body so they do not get too cold in the natural elements of the sea.", "It's a common color for wetsuits.", "Black is the most common color; perhaps because it is a color that can be seen relatively easily from a distance."], "image": "val2014/COCO_val2014_000000476843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469495, "question_id": "HgboD9cmuBxz4h7FdUYNnn", "question": "Besides seating what do the white items shown serve as?", "choices": ["bathrooms", "planters", "fire break", "housing"], "correct_choice_idx": 1, "direct_answers": ["planter", "planters", "planters", "planters", "planter", "planters", "planters", "planters", "planters", "planters"], "difficult_direct_answer": false, "rationales": ["These seats have no space in the bottom to go inside and have too much space in them to be a fire break.", "The white seats have built in planters on the top because there are green plants on the top.", "You can see the plants on top of the white seats."], "image": "train2014/COCO_train2014_000000469495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107753, "question_id": "HgcFUPezFaXtp5F8mNKeGi", "question": "What is near the waves?", "choices": ["people", "dolphins", "elephants", "sharks"], "correct_choice_idx": 0, "direct_answers": ["beach", "surfers", "surfers", "surfers", "surfers", "surfers", "people", "surfers", "surfers", "beach"], "difficult_direct_answer": false, "rationales": ["The people are walking along the water edge with their surfboards.", "People are near.", "There are people with surfboards near the water."], "image": "train2014/COCO_train2014_000000107753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292804, "question_id": "HghaKLZuwtqFsjYW53KwfV", "question": "Dark condition is due to the absence of which molecule?", "choices": ["electrons", "protons", "neutrons", "photons"], "correct_choice_idx": 3, "direct_answers": ["light", "light", "light", "light", "photons", "p680", "lights", "sun", "light", "unknown"], "difficult_direct_answer": false, "rationales": ["Those are in light.", "Darkness is the absence of light. light is not transmitted via protons, neutrons, or electrons.", "The condition is from photons."], "image": "val2014/COCO_val2014_000000292804.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486834, "question_id": "HguTBcv5DdX7b9LmwVQ94q", "question": "Why is the man with the yellow surfboard holding swim fins?", "choices": ["show off", "dress code", "style", "enhance performance"], "correct_choice_idx": 3, "direct_answers": ["weak swimmer", "swim easier", "playing", "enhance performance", "not wearing", "carrying", "snorkeling", "boogie board", "his swum", "swimming"], "difficult_direct_answer": true, "rationales": ["He is holding swim fins to help him push his board faster.", "Fins can help a swimmer go faster. surfers have to swim.", "The fins help the board be more powerful."], "image": "val2014/COCO_val2014_000000486834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292543, "question_id": "HhFoiZTodeWKBphzf73StR", "question": "What is the man doing with the beans?", "choices": ["selling them", "cooking", "eating them", "counting"], "correct_choice_idx": 0, "direct_answers": ["sales beans", "selling them", "selling them", "bagging them", "buying", "selling", "peeling", "selling", "for sale", "selling beans"], "difficult_direct_answer": false, "rationales": ["The man is trying to sell beans at the vegetable market.", "Due to the setting and presentation of the beans it's easy to understand what he is doing there.", "The man is selling."], "image": "train2014/COCO_train2014_000000292543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457766, "question_id": "HhH7rVfiLyqo3AEAiWqH4f", "question": "From where is the snow that is being loaded here?", "choices": ["roads", "rooftops", "trucks", "fields"], "correct_choice_idx": 0, "direct_answers": ["street", "road", "road", "street", "road", "road", "road", "street", "street", "roads"], "difficult_direct_answer": false, "rationales": ["The snow is lifted by the construction equipment off the roads for the purpose of snow removal for roads.", "The trucks are clearing the roads.", "Snowplows are often used in this way during winter along with dump trucks."], "image": "val2014/COCO_val2014_000000457766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132760, "question_id": "HhdjAYkJpTtydSvSTFvhTu", "question": "What is the man in the suit and hat doing?", "choices": ["comic relief", "dancing", "judging horses", "lookalike contest"], "correct_choice_idx": 2, "direct_answers": ["judging", "judging", "judging horses", "judging", "announcing", "judging horse", "watching", "judging", "watching", "judging"], "difficult_direct_answer": false, "rationales": ["With this type of equestrian sport there must be a judge.", "He is watching how the horse is performing.", "A woman with a number on her back is walking a horse by a person watching closely. judges look at horses at competitions."], "image": "train2014/COCO_train2014_000000132760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327234, "question_id": "Hhr9ZFCVbTyYjPgBLc6Hmk", "question": "What activity is he doing?", "choices": ["skiing", "surfing", "hiking", "running"], "correct_choice_idx": 1, "direct_answers": ["surfing", "surfing", "bodyboarding", "surfboarding", "surfing", "surfing", "surfing", "surfing", "boogy boarding", "surfing"], "difficult_direct_answer": false, "rationales": ["The man is surfing on the board.", "He is not on land, so he is not running, hiking, or skiing. he is on water and is paddling on a board.", "The man is surfing in the water."], "image": "train2014/COCO_train2014_000000327234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5967, "question_id": "HiGkQJtqBMxb9g7dSyioZH", "question": "Where is the plane in the foreground from?", "choices": ["china", "brazil", "london", "turkey"], "correct_choice_idx": 2, "direct_answers": ["london", "london", "london", "london", "london", "united kingdom", "london", "london", "united kingdom", "london"], "difficult_direct_answer": false, "rationales": ["The plane is from london.", "A plane at an airport has a logo with london on the side.", "The plane is called ban-air london."], "image": "train2014/COCO_train2014_000000005967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274116, "question_id": "HiJq9EWBSJzdfKazcbBhF7", "question": "The plush bear is dressed to celebrate what occupation?", "choices": ["fisherman", "sailor", "whaler", "lobster fisherman"], "correct_choice_idx": 3, "direct_answers": ["ocean workers", "fishermen", "lobsterman", "lobster", "fisherman", "firefighter", "fishing", "crabber", "fisherman", "lobster fisherman"], "difficult_direct_answer": true, "rationales": ["The bear looks like a lobster.", "His hat shows what he fishes.", "The bear is decorated as a fisherman for lobsters."], "image": "train2014/COCO_train2014_000000274116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52305, "question_id": "HiXjrcb67tXssv7sVvKQjR", "question": "Which mode of transportation seen here would get someone to a location faster?", "choices": ["walking", "airplane", "boat", "pickup"], "correct_choice_idx": 1, "direct_answers": ["airplane", "ship", "flying", "airplane", "plane", "flight", "boat", "plane", "boat", "airplane"], "difficult_direct_answer": false, "rationales": ["A plane is faster than a boat.", "An airplane can travel over any terrain like a boat, and its speeds are unmatched besides space travel.", "Airplanes are the fastest, faster than boats."], "image": "train2014/COCO_train2014_000000052305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272670, "question_id": "HjHHDVQpBuBJuTTNZL8JXL", "question": "The yellow liquid in the bottle with the green cap comes from what item?", "choices": ["grape", "strawberry", "orange", "lemon"], "correct_choice_idx": 2, "direct_answers": ["orange fruit", "oranges", "oranges", "oranges", "orange", "orange", "orange fruit", "oranges", "oranges", "oranges"], "difficult_direct_answer": false, "rationales": ["The bottle with the green cap has a tropicana label. the source fruit is on the label.", "The brand tropicana is famous for creating orange juice.", "The picture on the bottle shows a citrus fruit of the shade of the drink."], "image": "train2014/COCO_train2014_000000272670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276260, "question_id": "HjKtxfnMXuZeGn95LzRW9J", "question": "Why is the woman wearing a neon yellow vest?", "choices": ["visibility", "punishment", "fashion", "cosplay"], "correct_choice_idx": 0, "direct_answers": ["safety", "high visibility", "visibility", "safety vest", "visibility", "visibility", "visibility", "be seen", "keep warm", "worker"], "difficult_direct_answer": false, "rationales": ["People can see the lime green color easily.", "The woman is wearing a safety vest.", "Yellow is a highly visible color."], "image": "val2014/COCO_val2014_000000276260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482706, "question_id": "HjbaJRmCrSEmymX4xkCJPT", "question": "What skill is the small person here learning?", "choices": ["floor mopping", "dental hygene", "smiling", "spitting"], "correct_choice_idx": 1, "direct_answers": ["tooth brushing", "toothbrushing", "teeth brushing", "toothbrushing", "brushing teeth", "toothbrushing", "dental hygene", "teeth brushing", "teeth brushing", "teeth brushing"], "difficult_direct_answer": false, "rationales": ["The skill is a hygienist.", "They are brushing their teeth", "A mother holds her young baby in her arm as they both insert toothbrushes into their mouths. it's never too soon to show your kids good tips for living well!."], "image": "train2014/COCO_train2014_000000482706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119443, "question_id": "HjtCpwPG3GxQzmo6gVj54X", "question": "Why si the board hanging from the box?", "choices": ["is stuck", "bounced there", "showing off", "fell there"], "correct_choice_idx": 2, "direct_answers": ["performing trick", "doing trick", "grinding", "obstacle", "jumping", "showing off", "trick move", "trick", "jumping", "mid-trick"], "difficult_direct_answer": true, "rationales": ["The skateboarder is doing a trick.", "The board is showing off.", "A man is doing a skateboard trip and having it be photographed which means he is showing off."], "image": "train2014/COCO_train2014_000000119443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245002, "question_id": "HkB3YDuRg7DohLtkRttdMT", "question": "Who plays this sport?", "choices": ["serena williams", "john wayne", "sabrina glevissig", "john franco"], "correct_choice_idx": 0, "direct_answers": ["tennis player", "serena williams", "man", "tennis player", "tennis players", "men", "tennis player", "tennis player", "tennis", "tennis players"], "difficult_direct_answer": false, "rationales": ["Williams plays tennis.", "This is the only person of the four who plays tennis.", "The woman is widely known for this game."], "image": "train2014/COCO_train2014_000000245002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511469, "question_id": "HkCXjeXXEBicaYc6eCypnf", "question": "What is located behind the plane?", "choices": ["nothing", "desert", "airport", "tropical island"], "correct_choice_idx": 2, "direct_answers": ["beachgoers", "people", "people", "beachgoers", "people", "people", "people", "people", "airport", "people"], "difficult_direct_answer": false, "rationales": ["The plane looks to be taking off from the airport.", "The plane is taking off from the airport behind it.", "The plane is going to land."], "image": "val2014/COCO_val2014_000000511469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434990, "question_id": "HkH7GbvpsiXZTE7yVHswda", "question": "What can be used for identification here?", "choices": ["sign", "license plate", "snow", "passport"], "correct_choice_idx": 1, "direct_answers": ["badge", "cards", "license plate", "license plate", "bridge", "license", "dental records", "licence plates", "id", "license"], "difficult_direct_answer": false, "rationales": ["The car can be identified by the number on its tag.", "The police car has a license plate on the back with a number that is used to identify it.", "The car has a unique license plate."], "image": "val2014/COCO_val2014_000000434990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110087, "question_id": "HkNX8nt6knTYobujTDHAWa", "question": "How many teeth a human does have?", "choices": ["28", "42", "16", "32"], "correct_choice_idx": 3, "direct_answers": ["thirty two", "32", "32", "thirty two", "28-32", "32", "24", "thirty two", "thirty two", "32"], "difficult_direct_answer": false, "rationales": ["Humans generally have 32 teeth in total.", "An average adult person has 32 teeth in their mouth.", "That is the total amount of top and bottom teeth."], "image": "train2014/COCO_train2014_000000110087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261266, "question_id": "HkU9rfxRSZJzcinkhKFSJ5", "question": "Which objects are out of their natural habitat?", "choices": ["sand", "clouds", "water", "boats"], "correct_choice_idx": 3, "direct_answers": ["boats", "boats", "boats", "boats", "boats", "boats", "boats", "boats", "boats", "boats"], "difficult_direct_answer": false, "rationales": ["There are boats on shore.", "The boats should be in water.", "Boats are normally in water and not on land."], "image": "train2014/COCO_train2014_000000261266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322324, "question_id": "HkgYgjfHrHdEpS3FynLxGd", "question": "What country is known for an annual festival that revolves around the liquid in the glass?", "choices": ["india", "kazakhstan", "nepal", "germany"], "correct_choice_idx": 3, "direct_answers": ["germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany", "georgia"], "difficult_direct_answer": false, "rationales": ["The country is germany.", "Germany is known for its beer.", "Beer is in a glass. germany has beer festivals."], "image": "train2014/COCO_train2014_000000322324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541580, "question_id": "HkkFrnumWcLYVG8ZFdiHQT", "question": "What is probably behind the face of the circle up top?", "choices": ["gears", "nets", "balloons", "kids"], "correct_choice_idx": 0, "direct_answers": ["gears", "more sky", "clock mechanism", "clock face", "hardware", "power", "gears", "wheels gears", "nothing", "motor"], "difficult_direct_answer": true, "rationales": ["The circle is a clock, and gears are needed to help the hands move in order to tell the time correctly.", "Kids are walking around in the background.", "There is a clock on the top, and they need gears to run. and it's not safe for kids to be up there."], "image": "train2014/COCO_train2014_000000541580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201069, "question_id": "HkmiagjzUwFqBhjUxqhTcP", "question": "What is most likely inside of the building next to the cars?", "choices": ["washing machine", "firemen", "clowns", "atm"], "correct_choice_idx": 3, "direct_answers": ["money", "atm", "money", "money", "bank", "bank", "bank", "bank", "bankers", "bank"], "difficult_direct_answer": false, "rationales": ["People use that to take out money if they're in the city.", "The atm is nearby.", "It seems that their are dealers around the building dealing with cars."], "image": "train2014/COCO_train2014_000000201069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501947, "question_id": "HkrY2iaJADbjjGNnTxfH6V", "question": "Which food provides the most vitamin A?", "choices": ["vegetable", "carrot", "meat", "dressing"], "correct_choice_idx": 1, "direct_answers": ["carrots", "carrot", "carrots", "carrot", "carrots", "carrots", "carrots", "carrot", "carrot", "carrots"], "difficult_direct_answer": false, "rationales": ["The carrots give the most nutrients on the plate.", "The carrots have vitamins.", "These vegetables are known to be rich in this nutrient."], "image": "train2014/COCO_train2014_000000501947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130592, "question_id": "Hm5qkdp5jMx5mkY3m9kyYt", "question": "What is the bus primarily used for?", "choices": ["mail delivery", "school transportation", "racing", "tours"], "correct_choice_idx": 3, "direct_answers": ["tourism", "marine tour", "tours", "transportation", "touring", "tour groups", "transporting people", "tour", "passengers", "touring"], "difficult_direct_answer": true, "rationales": ["The bus is used for tours around the city.", "The bus is for tours.", "The bus has tour displayed on it. tour buses typically have large windows for passengers to look out of."], "image": "train2014/COCO_train2014_000000130592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136596, "question_id": "HmDvh9h3PpRpJvR5XTvUGR", "question": "Who utilizes the ramp shown here?", "choices": ["water skiers", "skateboarers", "surfboarders", "snow skiers"], "correct_choice_idx": 0, "direct_answers": ["ski jumper", "athletes", "water skiers", "water skiers", "water skiers", "water skiers", "water athletes", "wakeboarders", "water-skier", "skiers"], "difficult_direct_answer": false, "rationales": ["This is in a body of water for them to ski over", "People who are trying to ski on water use the ramp to lift off from.", "The people have skis on and there is water beneath them."], "image": "val2014/COCO_val2014_000000136596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524044, "question_id": "HmWcCCQ3u9kxvDBJeFGYUZ", "question": "Who uses this toilet located here?", "choices": ["child", "adult", "elder", "no body"], "correct_choice_idx": 3, "direct_answers": ["hopefully nobody", "nobody", "no one", "no one", "no one", "nobody", "no body", "nobody", "exhibitionist", "no one"], "difficult_direct_answer": false, "rationales": ["The toilet has no one on it.", "The toilet is place here as part of an art exhibit.", "No one...it's for an exhibit."], "image": "train2014/COCO_train2014_000000524044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399236, "question_id": "Hn3msLHwnQZ83A94Rjcrda", "question": "What amenity in the bus is shown on the window?", "choices": ["telephone", "movies", "tv", "wifi"], "correct_choice_idx": 3, "direct_answers": ["wifi", "wifi", "transportation", "wi fi", "wifi", "transportation", "wifi", "wifi", "wifi", "wifi"], "difficult_direct_answer": false, "rationales": ["The bus has a wifi logo on its front windshield.", "The sign on the left window indicates the amenity.", "There is a large sign on the front window of the bus indicating wireless access from within the bus."], "image": "train2014/COCO_train2014_000000399236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322056, "question_id": "Hn7TEPrn4Lij3jDCmcSMov", "question": "What pastime does the cigar smoker here take part in?", "choices": ["bowling", "sales", "kite flying", "reenactment"], "correct_choice_idx": 3, "direct_answers": ["war reenactments", "military reenactment", "reenactment", "horse riding", "horseback riding", "horse riding", "war reenactment", "horseriding", "civil war", "war reenactments"], "difficult_direct_answer": false, "rationales": ["He is dressed up to do a war reenactment.", "The man is wearing a civil war uniform.", "He's dressed in a civil war costume"], "image": "val2014/COCO_val2014_000000322056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301797, "question_id": "HnEKrvxVbdwAD4gGxbwww3", "question": "This boy would most likely watch what athlete on TV?", "choices": ["bryce harper", "jaromir jagr", "ernie els", "karl malone"], "correct_choice_idx": 0, "direct_answers": ["ichiro suzuki", "baseball", "bryce harper", "baseball", "baseball player", "lou gerri", "baseball player", "babe ruth", "baseball player", "matt olsen"], "difficult_direct_answer": false, "rationales": ["The only person that plays baseball in this list is bryce harper.", "The boy is bryce harper.", "Bryce harper plays baseball."], "image": "val2014/COCO_val2014_000000301797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450173, "question_id": "HnesyM8MPRejzEnPfnTagp", "question": "How does the person feet contact the bed?", "choices": ["barefoot", "cast", "heels", "socks"], "correct_choice_idx": 3, "direct_answers": ["falling", "landing", "on comforter", "jumping", "it doesn't", "bouncing", "boucing", "fall", "jumping", "socks"], "difficult_direct_answer": true, "rationales": ["Her feet are covered by a non-medical clothing item. she is not wearing shoes.", "This person is wearing socks on her feet.", "The person has socks."], "image": "train2014/COCO_train2014_000000450173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262704, "question_id": "HoDuuWQyWnR2Wj26BPMnnP", "question": "What type of electronic device are the headphones connected to?", "choices": ["ipod", "cd player", "iphone", "microsoft zune"], "correct_choice_idx": 0, "direct_answers": ["ipod", "ipod", "mp3 player", "ipod", "ipod", "ipod", "ipod", "ipod", "ipod", "mp3 player"], "difficult_direct_answer": false, "rationales": ["The device is the ipod.", "They are connected to an ipod.", "The headphones are near a pink ipod nano."], "image": "train2014/COCO_train2014_000000262704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477272, "question_id": "HoUFDznonBssdtLkC985ZL", "question": "Where is this airplane parked?", "choices": ["museum", "tarmac", "hangar", "ship"], "correct_choice_idx": 0, "direct_answers": ["inside building", "museum", "hanger", "museum", "hanger", "museum", "museum", "museum", "museum", "museum"], "difficult_direct_answer": false, "rationales": ["The airplane is inside of a building and is on display.", "The plane is at a museum.", "It is being shown in a museum."], "image": "train2014/COCO_train2014_000000477272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345937, "question_id": "Hopzri69etsnXKZvkZbKbh", "question": "Which animal is a predator of these types of animals?", "choices": ["ant", "rabbit", "eagle", "goat"], "correct_choice_idx": 2, "direct_answers": ["lion", "coyotes", "wolf", "wolf", "wolves", "wolf", "coyote", "eagle", "wolf", "tiger"], "difficult_direct_answer": false, "rationales": ["The animal is an eagle.", "Eagles attack animals on the ground.", "Ants can bite livestock. rabbits and goats are herbivores. an eagle is too small to attack a sheep."], "image": "val2014/COCO_val2014_000000345937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20268, "question_id": "HovQHWReR9eAbRzaMRxZ7R", "question": "How is the measuring cup being heated?", "choices": ["grill", "microwave", "oven", "stove"], "correct_choice_idx": 1, "direct_answers": ["microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave"], "difficult_direct_answer": false, "rationales": ["The measuring cup is in an oven. it does not have heating elements, so it is not a convection oven.", "The appliance heats items quickly.", "They put it in the microwave to be heated."], "image": "val2014/COCO_val2014_000000020268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283129, "question_id": "HovSfjVjYads6mWn99amsS", "question": "How are the fruits transported?", "choices": ["in crates", "in boxes", "in water", "in bags"], "correct_choice_idx": 0, "direct_answers": ["crate", "crates", "in crates", "crates", "crates", "cart", "crates", "crates", "crates", "basket"], "difficult_direct_answer": false, "rationales": ["The fruit is in them", "The oranges are pictured in this type of item, commonly used for carrying.", "They are in plastic baskets."], "image": "train2014/COCO_train2014_000000283129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337707, "question_id": "Hp2qqEeq4rbhdnzbfaBuGR", "question": "Before going aloft what did the man ride?", "choices": ["unicycle", "skateboard", "plane", "car"], "correct_choice_idx": 1, "direct_answers": ["skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["There is a board with wheels on the front and the back.", "The person has a skateboard.", "The man is in the air with a skateboard below him. many skateboard tricks involve jumping."], "image": "train2014/COCO_train2014_000000337707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132816, "question_id": "HpKh8h5gVTU6ZYTMNzgd98", "question": "This child has what on their face?", "choices": ["mud", "vinegar", "sugar", "carrot juice"], "correct_choice_idx": 2, "direct_answers": ["icing", "sugar", "sugar", "crumb", "food", "crumbs", "food", "food", "frown", "donut crumbs"], "difficult_direct_answer": false, "rationales": ["The child is eating a doughnut and has some of the coating on his face. vinegar, carrot juice and mud are not things that coat a doughnut.", "The child is eating a donut and has sugar on his face.", "There is some sugar on the sides of the child's mouth."], "image": "train2014/COCO_train2014_000000132816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42818, "question_id": "HpuHYCzKeDuboGuYUTqwdr", "question": "Why does he hold the string?", "choices": ["his job", "to fly", "control kite", "take away"], "correct_choice_idx": 2, "direct_answers": ["fly kite", "flying kite", "fly kite", "control kite", "kite", "flying", "control kite", "fly kite", "wind", "control kite"], "difficult_direct_answer": false, "rationales": ["He is flying a kite.", "The little boy is flying a kite. the string keeps the kite from flying away.", "This can move the kite different directions as well as make sure it doesn't go too far"], "image": "train2014/COCO_train2014_000000042818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357572, "question_id": "HpueRTofTuwzVcse7wATQZ", "question": "Why is the man holding a large sign?", "choices": ["to protest", "to paint", "to celebrate", "to advertise"], "correct_choice_idx": 3, "direct_answers": ["advertising", "advertising", "to advertise", "he's advertising", "advertising", "advertising", "promoting", "advertising", "political rally", "advertising"], "difficult_direct_answer": false, "rationales": ["The man is holding a large sign to advertise somebody's business.", "This type of sign is typically held up in streets to sell a product or service, thus matching the action in option a.", "This placard has a mans face, name, title and a website address. this placard is advertising for this man."], "image": "train2014/COCO_train2014_000000357572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119428, "question_id": "Hq5tK5cR8A7BFErak4gXa4", "question": "What step of donut creation is this scene at?", "choices": ["frying", "cutting", "adding sprinkles", "adding glaze"], "correct_choice_idx": 0, "direct_answers": ["frying", "frying", "final", "final step", "cooling", "drying", "frying", "icing", "frying", "frying"], "difficult_direct_answer": false, "rationales": ["Donuts are on a metal surface in front of others that are sitting in oil.", "The donuts are being dropped into a frying oil.", "The donut is being fried."], "image": "train2014/COCO_train2014_000000119428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120676, "question_id": "HqHq4rKaswQgXHaWXYmafv", "question": "Why does the woman have her arms out?", "choices": ["measure", "take picture", "wave", "balance"], "correct_choice_idx": 1, "direct_answers": ["taking selfie", "taking pictures", "taking selfie", "phone handling", "taking selfie", "using phone", "taking selfie", "taking picture", "take picture", "taking picture"], "difficult_direct_answer": false, "rationales": ["The woman sitting on the steps has her arms out to take a picture of herself with her cameraphone.", "The woman is taking a selfie while she sits.", "The woman is trying to take a photo."], "image": "train2014/COCO_train2014_000000120676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362852, "question_id": "HqMbuAoD2crrFVp5RinPiR", "question": "What is she doing?", "choices": ["posing", "cleaning chin", "fixing teeth", "brushing teeth"], "correct_choice_idx": 3, "direct_answers": ["brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth"], "difficult_direct_answer": false, "rationales": ["The woman has her toothbrush in her mouth.", "This woman is in a bathroom with a toothbrush in her mouth. this tool is used to clean teeth.", "She's brushing teeth."], "image": "val2014/COCO_val2014_000000362852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560544, "question_id": "HqSMx2w35xzu3QnisXdHVA", "question": "Who is the traffic signs for?", "choices": ["animals", "bicyclists", "pedestrians", "drivers"], "correct_choice_idx": 3, "direct_answers": ["drivers", "drivers", "cars", "drivers", "drivers", "automobiles", "vehicles", "drivers", "drivers", "cars"], "difficult_direct_answer": false, "rationales": ["The traffic sign tells drivers that the right lane can be used for making a right turn or going straight ahead.", "The signs shown off the side of the street are for people who drive their vehicles by them.", "This lets them know what is allowed and what to expect on the road"], "image": "train2014/COCO_train2014_000000560544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327271, "question_id": "HqceQCLgLtAyz2YZT8JP9u", "question": "What are the two objects on the pole above the motorcycles used for?", "choices": ["signaling traffic", "fishing", "giving tickets", "light"], "correct_choice_idx": 3, "direct_answers": ["light", "street lights", "light", "lighting", "marking lane", "light", "light", "car", "street lamps", "lighting"], "difficult_direct_answer": false, "rationales": ["Lights are often located at the top of street poles.", "The objects are lights.", "The poles have lightbulbs."], "image": "train2014/COCO_train2014_000000327271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149498, "question_id": "HqemgLYWHMMBPDRM9nmVTu", "question": "What type food is this man enjoying?", "choices": ["pizza", "soup", "dessert food", "salad"], "correct_choice_idx": 2, "direct_answers": ["dessert", "cake", "cake", "dessert", "desert", "dessert food", "cake", "desserts", "cake", "desert"], "difficult_direct_answer": false, "rationales": ["The item being eaten is solid, not liquid. the man is enjoying a cake, not a pizza or salad.", "The man is eating a stack of french toast. it look like a type of pastry.", "The food is a dessert."], "image": "train2014/COCO_train2014_000000149498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476852, "question_id": "Hqupp5MnVum8ELWHuisyQd", "question": "What are most keys made of?", "choices": ["copper", "plastic", "tin", "steel/brass/iron"], "correct_choice_idx": 3, "direct_answers": ["metal", "metal", "metal", "plastic", "steel/brass/iron", "metal", "metal", "steel", "plastic", "metal"], "difficult_direct_answer": false, "rationales": ["The keys are made of steel.", "The keys are made of thick metal.", "The set of keys on the left have various colors to include silver and brown. when it comes to key manufacturing these colors most likely are made out of steel, brass or iron."], "image": "train2014/COCO_train2014_000000476852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204606, "question_id": "HrSwpwqXcSkximUYAvxW5t", "question": "What energy powers the toothbrush?", "choices": ["solar", "hydropower", "manual force", "battery"], "correct_choice_idx": 3, "direct_answers": ["battery", "battery", "batteries", "battery power", "battery", "battery", "electric", "battery", "battery", "battery"], "difficult_direct_answer": false, "rationales": ["The toothbrush is an electric one which means it has a battery.", "The toothbrush is an electric toothbrush which is plugged in to recharge when needed.", "The toothbrush isn't corded so it would be run on batteries."], "image": "train2014/COCO_train2014_000000204606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405456, "question_id": "Hrjz8p49L5zzTCaG9yGAkQ", "question": "This man most closely resembles who?", "choices": ["moby", "chris rock", "jimmy smits", "sanjay gupta"], "correct_choice_idx": 0, "direct_answers": ["stop", "vin diesel", "bald man", "moby", "vin diesel", "movie star", "neo", "moby", "celebrity", "vin desiel"], "difficult_direct_answer": false, "rationales": ["The bald head and white skin match the person in option a.", "Moby has a bald head and is brown.", "The man is wearing sunglasses as moby does."], "image": "train2014/COCO_train2014_000000405456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163559, "question_id": "HrpvTMEMgYYySucEoZkTG7", "question": "What produce is featured in this image?", "choices": ["string beans", "okra", "lettuce", "celery"], "correct_choice_idx": 2, "direct_answers": ["brocolli", "vegetables", "radish", "fresh", "radish", "lettuce", "beets", "beets", "radishes broccoli", "lettuce beets"], "difficult_direct_answer": false, "rationales": ["Green lettuce is prominent at the middle and top of the image.", "Lettuce is green and comes in a head. you can see some at the top.", "There is no celery, okra or string beans."], "image": "train2014/COCO_train2014_000000163559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574299, "question_id": "HrvhB7yrJfLdo57f6emswD", "question": "What color is the officer riding on the police motorcycle to the left?", "choices": ["black", "green", "white", "purple"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "white", "white", "ride", "ride", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["His skin color is pale normally people who are pale are of european descent.", "The man is caucasian.", "He is caucasian."], "image": "train2014/COCO_train2014_000000574299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432884, "question_id": "HrwV8RHsrPNWSWPYt3rQGQ", "question": "What body type does this woman have?", "choices": ["petite", "athletic", "thick", "husky"], "correct_choice_idx": 1, "direct_answers": ["athletic", "athletic", "athletic", "mesomorph", "muscular", "athletic", "athletic", "athletic", "muscular", "athletic"], "difficult_direct_answer": false, "rationales": ["She has muscular arms and legs", "The woman is in shape and has some muscle definition.", "The woman is fit and works out."], "image": "train2014/COCO_train2014_000000432884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15725, "question_id": "HsBv6u85dGqFnHuwurjaPe", "question": "What is contained in each magnet seen here?", "choices": ["ads", "icons", "logos", "word"], "correct_choice_idx": 3, "direct_answers": ["word", "magnets", "words", "word", "word", "words", "words", "letter", "metal", "letters"], "difficult_direct_answer": false, "rationales": ["There are words on these that you can put together to make sentences.", "Each magnet has a word on it that you can put them together to make a sentence.", "As you look closer you can tell words are on the magnets."], "image": "val2014/COCO_val2014_000000015725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437221, "question_id": "HsSjPtg4mP25wbajH8J8nt", "question": "World skate is the head controller of which sport?", "choices": ["surfing", "swimming", "skiing", "skating"], "correct_choice_idx": 3, "direct_answers": ["skateboarding", "skateboarding", "skating", "skateboarding", "skateboarding", "bapp", "skateboarding", "skateboarding", "skating", "skateboarding"], "difficult_direct_answer": false, "rationales": ["World skate is the governing body for roller sports.", "World skate is the head controller of skating.", "A person is skateboarding and world skate refers to skateboarding."], "image": "val2014/COCO_val2014_000000437221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333083, "question_id": "Hsbugziw5mrtLne4y5Zcgg", "question": "What type of bus is shown?", "choices": ["school", "commuter", "double decker", "shuttle"], "correct_choice_idx": 1, "direct_answers": ["commuter", "city", "city", "double decker", "double decker", "double decker", "city", "city", "city", "city"], "difficult_direct_answer": false, "rationales": ["A commuter bus is shown.", "This bus is of a size and style that is used for transportation purposes. there is a route displayed on the electronic sign meaning this probably has a consistent route and schedule that a commuter could use.", "The bus is used to travel on the streets."], "image": "train2014/COCO_train2014_000000333083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494099, "question_id": "HsoeGCVC4Yf2N9etyynVL2", "question": "Is the water safe for swimming?", "choices": ["unsure", "maybe", "no", "yes"], "correct_choice_idx": 2, "direct_answers": ["no", "no", "no", "yes", "no", "yes", "no", "yes", "no", "no"], "difficult_direct_answer": false, "rationales": ["There are signs out to keep people out of the water.", "The waves look quite choppy, and the red flag raised on the beach is put there to let beachgoers know that the water is not currently safe for swimming.", "There are signs in front of the water."], "image": "val2014/COCO_val2014_000000494099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111873, "question_id": "HstmcVtUsC9Y57tispgY6Z", "question": "The middle athlete looks like an what?", "choices": ["shark", "dog", "bee", "horse"], "correct_choice_idx": 2, "direct_answers": ["soldier", "bee", "bee", "bee", "bumblebee", "animal", "cartoon character", "bee", "bee", "snowboarder"], "difficult_direct_answer": false, "rationales": ["He has a striped shirt on that is yellow and black.", "The middle is a bee.", "He is wearing a black and yellow striped shirt."], "image": "train2014/COCO_train2014_000000111873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319201, "question_id": "HtEK4MdLKzf5bqx4CPDt39", "question": "What is the relationship of the man to the woman?", "choices": ["brother", "friend", "son", "husband"], "correct_choice_idx": 3, "direct_answers": ["husband", "husband", "spouses", "couple", "groom", "spouse", "husband", "husband", "married", "spouse"], "difficult_direct_answer": false, "rationales": ["The couple are dressed like a bride and groom and are cutting the cake. they were just married.", "The woman is wearing a bridal gown, and it is customary for the bride and groom to cut their cake together on their wedding day.", "She is wearing a wedding dress and they are cutting a cake together. cutting the cake together is a wedding day tradition."], "image": "train2014/COCO_train2014_000000319201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51110, "question_id": "HtG8Sph4iMm6xsu3wgdEFt", "question": "What kind of Riesling is possibly being served?", "choices": ["sweet", "semi-sweet", "noir", "dry"], "correct_choice_idx": 3, "direct_answers": ["dry", "dry", "blanc", "dry", "via dry", "dry", "dry", "blanc", "blanc", "dry"], "difficult_direct_answer": false, "rationales": ["On the chalkboard, the word dry is next to the word riesling, indicate that is the type of wine that is being served.", "This is likely a white wine or otherwise dry.", "Riesling should be dry."], "image": "train2014/COCO_train2014_000000051110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328374, "question_id": "HtKmGbzxBES7QgmT5yCCNY", "question": "What is a good age to start skiing?", "choices": ["six", "three", "five", "two"], "correct_choice_idx": 2, "direct_answers": ["five", "eight", "young", "early", "eight", "four", "young age", "five years", "six", "child"], "difficult_direct_answer": true, "rationales": ["That age is old enough to learn about skiing.", "Five year old children can ski.", "A child is big enough for a bunny slope at this age"], "image": "val2014/COCO_val2014_000000328374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570255, "question_id": "HtnYdn4tdWeotSjrXKikxL", "question": "What does the Red skyborne item indicate?", "choices": ["toy kite", "banner", "surrender", "flag"], "correct_choice_idx": 0, "direct_answers": ["wind", "toy kite", "wind conditions", "danger", "kite", "danger", "windy", "kite", "alert danger", "windy"], "difficult_direct_answer": false, "rationales": ["The item is in a diamond shape.", "The boys are flying a kite.", "There is a red kite in the sky."], "image": "train2014/COCO_train2014_000000570255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422851, "question_id": "HuB844MuWEDEXYjydn9SUt", "question": "What is this action called?", "choices": ["water jumping", "seat ejection", "tube surfing", "jet propulsion"], "correct_choice_idx": 3, "direct_answers": ["surfing", "rain", "wakeboarding", "water sking", "water skiing", "sactting ride", "waterboard", "water sking", "hydrofoiling", "jet propulsion"], "difficult_direct_answer": true, "rationales": ["A man is on a machine that pushes water down to keep him the air.", "The person is being shot up by a jet of water.", "The person is being jetted across the water."], "image": "train2014/COCO_train2014_000000422851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312384, "question_id": "HuBM9XwFJCHVTeEYUVxqqR", "question": "What direction are the cars going?", "choices": ["left", "right", "back", "straight"], "correct_choice_idx": 3, "direct_answers": ["straight", "infront", "north", "away", "forward", "straight", "ahead", "straight", "forward", "both"], "difficult_direct_answer": false, "rationales": ["The cars are on a linear path.", "The cars are going forward. they are not turning.", "The cars are going forward."], "image": "val2014/COCO_val2014_000000312384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400516, "question_id": "HuCSTZyBoXdKM2vCD8jK8E", "question": "What type of utensils would you use if you ate in Shanghai restaurant?", "choices": ["knives", "chop sticks", "spoons", "ladles"], "correct_choice_idx": 1, "direct_answers": ["chop sticks", "chop sticks", "chop sticks", "chopsticks", "chopsticks", "chopsticks", "chop sticks", "chop sticks", "chopsticks", "chopsticks"], "difficult_direct_answer": false, "rationales": ["Chop sticks are a common thing to use in china.", "Shanghai is located in china and often people at chinese restaurants use two special sticks of equal length to pick up their food instead of using silverware.", "This is the universal eating utensils for this area"], "image": "val2014/COCO_val2014_000000400516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434976, "question_id": "HuGFC9acoVUmeFh8hvpdN7", "question": "What does the child control symbolically with the white device?", "choices": ["volume", "movie", "father", "bowling ball"], "correct_choice_idx": 3, "direct_answers": ["bowler", "bowling ball", "bowling ball", "remote control", "bowling ball", "bowling ball", "game", "ball", "throwing ball", "bowling ball"], "difficult_direct_answer": false, "rationales": ["The child is bowling on a video game.", "The kid needs a bowling ball.", "The child is playing a game that involves knocking down pins."], "image": "val2014/COCO_val2014_000000434976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155262, "question_id": "HuTmmV3r3eLKyNZ3HKUsUb", "question": "What is the name of this game?", "choices": ["cricket", "basket ball", "tennis", "tennikoit"], "correct_choice_idx": 3, "direct_answers": ["soccer", "tennikoit", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer"], "difficult_direct_answer": false, "rationales": ["This game is called tennikoit.", "The game is played with many players.", "The name is tennikoit."], "image": "train2014/COCO_train2014_000000155262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512166, "question_id": "HuxDpjwkyJDTeiDhV6zYNJ", "question": "Where is she located?", "choices": ["florist", "restaurant", "home", "dentist"], "correct_choice_idx": 1, "direct_answers": ["kitchen", "kitchen", "kitchen", "restaurant", "commercial kitchen", "kitchen", "restaurant", "restaurant kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["She's at a restaurant.", "The woman is plating up food in containers with lids along with individual dipping sauces. the containers will then be placed in a paper bag so that the delivery person or customer can take them home.", "She is in a kitchen."], "image": "val2014/COCO_val2014_000000512166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6603, "question_id": "HvAvzWs3duNUKv5SDgLMsR", "question": "Why is there a chain on this elephant?", "choices": ["health", "balance", "decoration", "control"], "correct_choice_idx": 3, "direct_answers": ["tie-up", "prevent escape", "got stuck", "steering", "enslaved", "control", "elephant", "go nowhere", "for safety", "captive"], "difficult_direct_answer": true, "rationales": ["The elephant is in captivity and the chain can be used to keep it in one spot and not try to escape.", "This chain prevents the elephant from moving fast and lets them control it better.", "These animal abusers maintain control over this elephant via restraints."], "image": "train2014/COCO_train2014_000000006603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324616, "question_id": "HvJdkqhfwQDkpmuHGrLW5F", "question": "Bakers usually charge by the what to make these items?", "choices": ["hour", "slice", "minute", "weight"], "correct_choice_idx": 1, "direct_answers": ["serving", "tier", "slice", "layer", "hour", "hour", "fee", "top fillings", "dollars", "by tier"], "difficult_direct_answer": true, "rationales": ["That said, some also charge by the time it takes to bake, which would mean b and c, and/or by d.", "The bigger the size the more it costs.", "Most bakeries charge by the weight and time it takes for them to make it."], "image": "val2014/COCO_val2014_000000324616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434045, "question_id": "Hvcr4HnzuSHPpNqaRRU2At", "question": "What state does it look like the photographer might be in?", "choices": ["bleeding", "flying", "wet", "super cold"], "correct_choice_idx": 2, "direct_answers": ["california", "new hampshire", "newyork", "wet", "blissful state", "california", "florida", "florida", "california", "florida"], "difficult_direct_answer": false, "rationales": ["The person next to the pool could be wet from the water.", "He is next to a pool.", "The person is near the pool wearing swim shorts."], "image": "train2014/COCO_train2014_000000434045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322698, "question_id": "HvhSKZM3wApc7m8k93PHQt", "question": "Why is his hand stretched out?", "choices": ["catch frisbee", "throw frisbee", "showing off", "stop falling"], "correct_choice_idx": 0, "direct_answers": ["catch frisbee", "throw frisbee", "throwing frisbee", "throwing frisbee", "frisbee", "throwing frisbee", "throwing", "just threw", "throwing", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["You can tell why his hand is stretched out due to the frisbee coming to him.", "He has his hand open so he can grab it", "The man has his arm extended with his hand open indicated that he just tossed the frisbee."], "image": "train2014/COCO_train2014_000000322698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275175, "question_id": "HvpnPPpHZWkTk6UL5oTMSh", "question": "What company is known for selling the green item here?", "choices": ["granny smith", "dunkin donuts", "birds eye", "bega cheese"], "correct_choice_idx": 2, "direct_answers": ["brocolli", "green giant", "birds eye", "green giant", "green giant", "broccoli", "green giant", "brocolli", "green giant", "green giant"], "difficult_direct_answer": false, "rationales": ["This is a large company that grows a lot of produce", "The item is broccoli, something the company in a is known to sell.", "The green item is broccoli and is sold from the grocery store birds eye."], "image": "val2014/COCO_val2014_000000275175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20398, "question_id": "HvpnyxGrQj8gupPJrfw67W", "question": "What item shown here is most likely to be litter?", "choices": ["cap", "water faucet", "hydrant", "drink can"], "correct_choice_idx": 3, "direct_answers": ["can", "can", "drink can", "can", "can", "can", "soda can", "can", "water", "drink can"], "difficult_direct_answer": false, "rationales": ["The item is a can.", "This can has been abandoned.", "The hydrant and its components, including the cap and water faucet, are attached to the ground. the item on top of the hydrant is litter."], "image": "train2014/COCO_train2014_000000020398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279418, "question_id": "HvqtHWxjrtEwTg9ZB99mgd", "question": "What activity is she participating in?", "choices": ["swimming", "frisbee", "kite flying", "fishing"], "correct_choice_idx": 2, "direct_answers": ["kite flying", "flying kite", "kite", "kiting", "flying kite", "kiting", "kite flying", "fly kite", "kite flying", "kite flying"], "difficult_direct_answer": false, "rationales": ["She does not have fishing equipment, a frisbee, or a bathing suit.", "The activity is kite flying.", "The woman is standing with a kite."], "image": "train2014/COCO_train2014_000000279418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87096, "question_id": "Hvt4KKEWx3SYByTT2ot4em", "question": "What is he doing?", "choices": ["eating bananas", "selling bananas", "stealing bananas", "resting"], "correct_choice_idx": 1, "direct_answers": ["smiling", "selling food", "selling bananas", "laughing", "selling bananas", "smiling", "selling fruit", "selling bananas", "laughing", "laughing"], "difficult_direct_answer": false, "rationales": ["He is in a booth with the fruit surrounding him", "The man is at a produce stand and he is selling bananas.", "This man is sitting behind a vendors counter with a bunch of bananas surrounding him."], "image": "train2014/COCO_train2014_000000087096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84183, "question_id": "HwD7QcKTgE9pi9PCA7wcbK", "question": "In which setting is this street?", "choices": ["farm", "rural", "urban", "suburban"], "correct_choice_idx": 2, "direct_answers": ["city", "union square", "city", "urban", "city", "city", "town", "street fair", "busy", "downtown"], "difficult_direct_answer": false, "rationales": ["The street is an urban setting.", "There are lots of people and buildings.", "The street is urban."], "image": "train2014/COCO_train2014_000000084183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292827, "question_id": "HwDob2AUd3w7GTkt5Djqy4", "question": "Where is this cat located?", "choices": ["school", "office", "home", "backyard"], "correct_choice_idx": 3, "direct_answers": ["backyard", "table", "table", "table", "table", "table", "outside", "table", "outside", "table"], "difficult_direct_answer": false, "rationales": ["There is a fence behind the table.", "This cat is located in the backyard where there is a backyard fence.", "The cat is in a backyard."], "image": "train2014/COCO_train2014_000000292827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528275, "question_id": "HwdBnUx5ohWL5o9LiKYzwx", "question": "Which of these horses would stand out in a dark setting?", "choices": ["far left", "second right", "second left", "far right"], "correct_choice_idx": 2, "direct_answers": ["white horse", "piebald", "white", "white horse", "white one", "white one", "white horse", "second left", "white", "white/brown"], "difficult_direct_answer": false, "rationales": ["The second left horse is white.", "The second left stands out most.", "The horse is white and when in the dark it will stand out because all over the other horses are only brown"], "image": "train2014/COCO_train2014_000000528275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386370, "question_id": "Hwr4vko7nb7hUNAogjd2C9", "question": "Why are there leaves on the ground?", "choices": ["it's spring", "it's winter", "it's autumn", "it's summer"], "correct_choice_idx": 2, "direct_answers": ["autumn", "fall", "autumn", "it's fall", "it's fall", "nature", "it's autumn", "fall weather", "fall weather", "tree felling"], "difficult_direct_answer": false, "rationales": ["Leaves fall off trees when the weather starts getting cool.", "The multicolored appearance of the leaves and the fact that many are on the ground means this picture was taken in the season of fall.", "The leaves are on the ground for fall."], "image": "val2014/COCO_val2014_000000386370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164462, "question_id": "HwyZC46CaZSCEWgn68GBen", "question": "What surface are the girls playing on?", "choices": ["grass", "indoor hard", "clay", "outdoor hard"], "correct_choice_idx": 3, "direct_answers": ["cement", "clay court", "court", "clay", "tennis court", "tennis court", "clay", "rubber", "clay", "outdoor hard"], "difficult_direct_answer": false, "rationales": ["People are playing tennis on an outdoor court. outdoor tennis courts are hard.", "Tennis can be played on different types of courts. by looking at the patches of grasses on the perimeter it must be outdoors. also the surface looks hardened and players have the appropriate shoes.", "The surface is outdoors."], "image": "train2014/COCO_train2014_000000164462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147471, "question_id": "HxHQCtZWKi3tK4oc3oQyqv", "question": "In what type of setting do the sitting persons find themselves?", "choices": ["dating game", "park", "zoo", "market"], "correct_choice_idx": 1, "direct_answers": ["forest", "forest", "park", "woods", "forest", "wooded", "wilderness", "trail", "forest", "forest"], "difficult_direct_answer": false, "rationales": ["The people are sitting on the ground in a park surrounded by nature.", "The persons are finding their selves sitting in a park.", "You can tell by the setting and equipment as to where they are in the photo."], "image": "val2014/COCO_val2014_000000147471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485267, "question_id": "HxebpCh9NMwgj27jowZTqV", "question": "What type of field are they playing on?", "choices": ["softball", "soccer", "field hockey", "baseball"], "correct_choice_idx": 3, "direct_answers": ["baseball diamond", "baseball", "baseball diamond", "baseball", "baseball field", "baseball", "baseball", "baseball", "baseball", "baseball field"], "difficult_direct_answer": false, "rationales": ["Softball and baseball fields are synonymous, but as can be seen, the ball approaching the batter is small in design, implying that it is a baseball. softballs are larger, easier to see, harder to throw and catch, and easier to hit.", "There is a diamond then an outfield", "Uniformed players are on a sandy field with bases."], "image": "train2014/COCO_train2014_000000485267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524844, "question_id": "HxreLf7q5AJWHZB9MpGqoW", "question": "Why is the man raising his hands above his head?", "choices": ["for fun", "for exercise", "getting help", "for balance"], "correct_choice_idx": 3, "direct_answers": ["for balance", "balance", "balance", "balance", "keep balance", "balance", "for balance", "maintain balance", "for balance", "maintain balance"], "difficult_direct_answer": false, "rationales": ["The person is on a rail on a skateboard and has his arms lifted in the air.", "The man needs balance.", "He's performing a trick that requires him to leave the surfaces"], "image": "val2014/COCO_val2014_000000524844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326300, "question_id": "HxzX2EWdaG53QyNnKXarLb", "question": "What style of food is being served?", "choices": ["indian", "mexican", "italian", "american"], "correct_choice_idx": 3, "direct_answers": ["american", "hot dog", "fast food", "hotdog", "hot dog", "hot dog", "hot dog", "fast food", "hot dog", "hotdog"], "difficult_direct_answer": false, "rationales": ["The style is american.", "The person in the foreground is eating a hot dog.", "It is a hot dog."], "image": "train2014/COCO_train2014_000000326300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222587, "question_id": "Hy62mcWpHGQwnGSedSsFiD", "question": "What type of sign is the one with a red hand?", "choices": ["directional", "traffic", "brand", "sale"], "correct_choice_idx": 1, "direct_answers": ["don't walk", "walk sign", "pedestrian", "stop pedestrians", "stop", "stop", "farrell", "traffic", "crosswalk sign", "street sign"], "difficult_direct_answer": true, "rationales": ["There is lot of cars on the road.", "This sign is used at crosswalks.", "The sign is for traffic."], "image": "train2014/COCO_train2014_000000222587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92136, "question_id": "Hy8ZR4BmfUsX7FXjsJQsNJ", "question": "What venue is shown on the right?", "choices": ["resort area", "water park", "beach", "reservoir"], "correct_choice_idx": 0, "direct_answers": ["pool", "resort area", "pool party", "resort", "resort", "park", "swimming pools", "resort", "resort", "pool area"], "difficult_direct_answer": false, "rationales": ["The area looks tropical.", "There are several pools and decks with a lot of chairs", "Looks to be a great place to vacation to"], "image": "train2014/COCO_train2014_000000092136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467973, "question_id": "HyB93QH8icLx3Fyq4QYpJj", "question": "What is being served in the tall glass?", "choices": ["wine", "beer", "milk", "juice"], "correct_choice_idx": 1, "direct_answers": ["beer", "beer", "ale", "beer", "ale", "wine", "beer", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["The glass has beer in it since beer comes in these bottles.", "Kerberos is a hops alcoholic beverage.", "It's a type of beer."], "image": "val2014/COCO_val2014_000000467973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157588, "question_id": "HyLJELddyV6p7CWryzEkYK", "question": "What is the name of the trick the man in red is performing?", "choices": ["manual", "grab", "fakie", "grind"], "correct_choice_idx": 1, "direct_answers": ["ollie grab", "ollie", "board grab", "wheelie", "flip", "skating", "grab", "hold", "grab", "airwalk"], "difficult_direct_answer": true, "rationales": ["He is holding on to his board as he is in the air.", "He is holding on.", "The man is holding onto the board."], "image": "train2014/COCO_train2014_000000157588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443547, "question_id": "HyPveEhh9mfZXQ9cf6jiVw", "question": "The bright blue lights are doing what in the dark?", "choices": ["ruining", "blinking", "glowing", "flashing"], "correct_choice_idx": 2, "direct_answers": ["lighting", "glowing", "glowing", "shining", "glowing", "glowing", "glowing", "glowing", "glowing", "glowing"], "difficult_direct_answer": false, "rationales": ["The bright blue lights are glowing.", "The lights are illuminated in the dark.", "The blue lights are shining in the dark."], "image": "train2014/COCO_train2014_000000443547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27907, "question_id": "HygkyyTXBSBAQ4axznzTWd", "question": "How are the people in the stands here likely related to the players on the field here?", "choices": ["unrelated fans", "passersby", "relatives", "enemies"], "correct_choice_idx": 2, "direct_answers": ["parents", "parents", "parents", "parents", "relatives", "parents", "parents", "parents", "parents", "parents"], "difficult_direct_answer": false, "rationales": ["This is a baseball game for kids which is usually watched by families.", "The people in the stand are most likely relatives watching their kids play. there may also be some family friends there who have come out to root for the kids.", "This is little league so their parents and siblings would be in attendance"], "image": "train2014/COCO_train2014_000000027907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315734, "question_id": "HynmqhsBUXkez9usHUigLv", "question": "What is the silver bumper of the truck made of?", "choices": ["chrome", "aluminum", "plastic", "leather"], "correct_choice_idx": 0, "direct_answers": ["metal", "metal", "gmc", "chrome", "steel", "metal", "chrome", "chrome", "chrome", "metal"], "difficult_direct_answer": false, "rationales": ["The silver, shiny surface is indicative that it is some sort of metal.", "The bumper is a shiny piece of steel.", "That is another word for silver."], "image": "train2014/COCO_train2014_000000315734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180779, "question_id": "HyoirfLDkXCbsaxJJnYoZq", "question": "What are they walking on?", "choices": ["sand", "grass", "pavement", "snow"], "correct_choice_idx": 2, "direct_answers": ["pavement", "crosswalk", "zebra crossing", "crosswalk", "asphalt lines", "pavement", "crosswalk", "street", "street", "crosswalk"], "difficult_direct_answer": false, "rationales": ["These people are walking across the street.", "The people with umbrellas are walking on the pavement in the street.", "They are walking on a zebra crossing."], "image": "val2014/COCO_val2014_000000180779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202004, "question_id": "Hytgk7a7BSaGAHwcUC7uoj", "question": "What item here makes the horse go forward focusing?", "choices": ["shoes", "garbage bags", "blinders", "hat"], "correct_choice_idx": 2, "direct_answers": ["blinders", "whip", "blinders", "reins", "blinders", "road", "blinders", "reigns", "reins", "reigns"], "difficult_direct_answer": false, "rationales": ["They put shades on the side of the horse's eyes so they don't get startled by something off to the side.", "A horse is pulling a trailer with blinders on. blinders are used on horses to avoid distractions.", "The horses need blinders of some sort so they focus on moving the people along."], "image": "train2014/COCO_train2014_000000202004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271112, "question_id": "HyxwAkZdtzq9C2RCCgZMAZ", "question": "Which celebrity is known for owning this type of pet?", "choices": ["mike tyson", "ariana grande", "taylor swift", "mahatma gandhi"], "correct_choice_idx": 2, "direct_answers": ["cat lady", "drew barrymore", "katy perry", "unknown", "taylor swift", "taylor swift", "katy perry", "taylor swift", "shakespeare", "taylor swift"], "difficult_direct_answer": false, "rationales": ["The celeb is swift.", "In this image we see a cat. the popstar taylor swift is outspoken about owning a cat.", "Taylor swift is a cat lover."], "image": "train2014/COCO_train2014_000000271112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550529, "question_id": "Hz4njKq9dWMuWx7ycCAUxY", "question": "Where can you legally ride this type of bike?", "choices": ["sidewalk", "city streets", "off road", "around neighborhoods"], "correct_choice_idx": 2, "direct_answers": ["street", "dirt", "race track", "dirt", "private property", "off road", "dirt road", "road", "bike lane", "nowhere"], "difficult_direct_answer": true, "rationales": ["The bike can be ridden off road.", "These types of vehicles generally are for off road activities.", "This kind of bike can't be ridden where cars are."], "image": "val2014/COCO_val2014_000000550529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563370, "question_id": "HzJtWyPaPVFFkQ7JM3jd2u", "question": "What activity are they partaking in?", "choices": ["surfing", "scuba diving", "fishing", "swimming"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "surfing", "fishing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The activity is fishing.", "The men are standing on boards on the water.", "They are standing on boards that are used to ride waves in the water, known as surfing."], "image": "train2014/COCO_train2014_000000563370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86708, "question_id": "HzT5xomT94ZBLy4W8MFTBJ", "question": "What is the man in the back doing?", "choices": ["photobombing", "eating", "writing", "sleeping"], "correct_choice_idx": 0, "direct_answers": ["hiding", "peeking", "hiding", "peeking out", "photobombing", "looking", "hiding", "snooping", "watching", "hiding"], "difficult_direct_answer": false, "rationales": ["The man is sneaking into the photo behind the man. when people sneak into the pictures of others this is called photobombing.", "He snuck up behind the other guy and got in the photo", "The man in the backs strange posture; barely peeking over the shoulder of the man in front who appears to be posing with intention, implies that he is 'spoiling' this picture."], "image": "train2014/COCO_train2014_000000086708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30681, "question_id": "Hzxttpo369nrEC2vjiHVe3", "question": "What will allow the people to see should this scene take place at night?", "choices": ["birds", "lamppost", "reflective pavement", "moonlight"], "correct_choice_idx": 1, "direct_answers": ["street lamp", "lamppost", "lamp post", "lamps", "bench", "street lamp", "street lights", "streetlamp", "lights", "flashlight"], "difficult_direct_answer": true, "rationales": ["The lamp will give off enough light to see all around the area.", "Unless the lamppost is lit up, the moonlight will show off more.", "The lamppost has a light that comes on at night."], "image": "train2014/COCO_train2014_000000030681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144251, "question_id": "J284mbrjKEwx5rcmMaVTX6", "question": "Which animal would weigh more than this vehicle if it had no passengers?", "choices": ["giraffe", "elephant", "bison", "diplodocus"], "correct_choice_idx": 3, "direct_answers": ["diplodocus", "whale", "elephant", "elephant", "whale", "whale", "blue whale", "elephant", "whale", "whale"], "difficult_direct_answer": false, "rationales": ["This is a guess since the animal is currently extinct. that said, the other options don't seem like they would weight more.", "Vehicle in the picture is a bus. looking up the weight of an average bus and comparing it to the weight of answer a confirms that answer a is accurate.", "There isn't any way of knowing this without researching about it. but this is a species of dino."], "image": "val2014/COCO_val2014_000000144251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430731, "question_id": "J29r75sqNvuRZCXgmaDXr7", "question": "Which screen would help one find directions?", "choices": ["left", "none", "middle", "right"], "correct_choice_idx": 3, "direct_answers": ["right", "right", "right", "rightmost", "right", "right", "right one", "right screen", "right", "right"], "difficult_direct_answer": false, "rationales": ["The left and middle screens are not displaying maps.", "The left and middle screens do not have maps. the other one does.", "It has a map on it"], "image": "train2014/COCO_train2014_000000430731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3234, "question_id": "J2GqmS5yEic8BYUoyBeL7Q", "question": "What is the person eating?", "choices": ["eggs", "pudding", "vegetables", "yogurt"], "correct_choice_idx": 2, "direct_answers": ["vegetables", "green vegetables", "vegetables", "greens", "vegetables", "salad", "vegetables", "zucchini broccoli", "vegetables", "broccoli"], "difficult_direct_answer": false, "rationales": ["The person is chowing down on broccoli and cucumbers.", "There is brocolli and cucumber on the plate", "Vegetables are in the plate and green."], "image": "train2014/COCO_train2014_000000003234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346540, "question_id": "J2J6sXaMBmgSdSXJPAysHB", "question": "Where were soft bear dolls invented?", "choices": ["spain", "italy", "wales", "america/germany"], "correct_choice_idx": 3, "direct_answers": ["1897", "germany", "america", "1902", "pa", "germany", "germany", "2000s", "america/germany", "america"], "difficult_direct_answer": false, "rationales": ["They were named after theodore roosevelt and american.", "Teddy bears are from the us since they're named after teddy roosevelt.", "The teddy bear's history (wikipedia) identifies its countries of origin as those listed in a."], "image": "train2014/COCO_train2014_000000346540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575349, "question_id": "J2Lq7uf8r894BavZYXZ96t", "question": "What essential workers wear the same hat that the boy is wearing?", "choices": ["firefighters", "chefs", "doctors", "police officers"], "correct_choice_idx": 0, "direct_answers": ["firefighter", "firefighter", "firefighter", "firemen", "firefighter", "firefighters", "firefighter", "fire fighters", "firefighters", "fire fighters"], "difficult_direct_answer": false, "rationales": ["The man has a fire hat on.", "Traditionally firefighters use the pointed red hat for protection.", "The hat is red and firefighters wear red hats."], "image": "val2014/COCO_val2014_000000575349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509823, "question_id": "J35WGkSbLjE8RqEE7bvykr", "question": "The tanks seen in the background above the building once held what?", "choices": ["butane", "oil", "propane", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["There are old fashioned water tanks, which many buildings used to need before modern modes of water systems were invented.", "The big tanks are water tanks.", "That's what the tanks held or are holding now."], "image": "train2014/COCO_train2014_000000509823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118866, "question_id": "J3AiGUmwBKtz8S6LbH4SgV", "question": "What is in the shaker jar next to the beverage?", "choices": ["hot peppers", "parmesan cheese", "sugar", "salt"], "correct_choice_idx": 1, "direct_answers": ["parmesan cheese", "pizza", "parmesan cheese", "parmesan", "parmesan", "parmesan cheese", "parmesan cheese", "parmesan", "grated cheese", "grated cheese"], "difficult_direct_answer": false, "rationales": ["In this scene a pizza is served. parmesan cheese is usually available to put on top of pizza at restaurants.", "The colour and consistency, as well as the container it is in, indicates that it is cheese. it is next to a pizza, and this is a popular topping many people sprinkle on italian food, such as pizza.", "The picture shows the shaker jar in it."], "image": "train2014/COCO_train2014_000000118866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252137, "question_id": "J3FTfFCr39M49cbneUKZN8", "question": "The longest item here is usually found with what character?", "choices": ["yogi bear", "bugs bunny", "charlie brown", "garfield"], "correct_choice_idx": 1, "direct_answers": ["bunny", "bugs bunny", "bugs bunny", "bunny", "carrots", "rabbit", "bugs bunny", "bugs bunny", "bugs bunny", "rabbit"], "difficult_direct_answer": false, "rationales": ["The carrots are used for bugs bunny.", "The longest item is a carrot. rabbits eat carrots like the cartoon character.", "The carrots are eaten by rabbits."], "image": "val2014/COCO_val2014_000000252137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533137, "question_id": "J3iEZYaH7oHL5aKwggPMkB", "question": "What type of trees are shown?", "choices": ["deciduous", "palm", "evergreen", "fake"], "correct_choice_idx": 2, "direct_answers": ["pine", "pine", "evergreen", "pine", "pine", "evergreen", "evergreen", "pine", "fir pine", "evergreen"], "difficult_direct_answer": false, "rationales": ["These trees stay green because they have needles instead of leaves", "The trees are covered in and surrounded by snow yet still have leaves.", "A snowy climate is shown on the top of a mountain and there are green trees with pine needles all around."], "image": "val2014/COCO_val2014_000000533137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459835, "question_id": "J42yJEmWuw5ptXgb6v8tir", "question": "What countries flag is seen on one of the boats?", "choices": ["united states", "united kingdom", "france", "sweden"], "correct_choice_idx": 1, "direct_answers": ["england", "great britain", "australia possibly", "united kingdom", "great britain", "united kingdom", "britain", "united kingdom", "england", "united kingdom"], "difficult_direct_answer": false, "rationales": ["The flag of the uk is seen on the boat of the left.", "A red, white, and blue flag is painted on boats. the uk flag is red, white, and blue.", "The union jack is on the far left boat."], "image": "train2014/COCO_train2014_000000459835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289192, "question_id": "J4H4NQXev7rmTRhXwCUYpf", "question": "What does the red and white sign prohibit?", "choices": ["stopping", "right turn", "entry", "loitering"], "correct_choice_idx": 1, "direct_answers": ["turning right", "turning", "right turns", "right turns", "right turn", "stop", "right turns", "turning", "right turn", "right turns"], "difficult_direct_answer": false, "rationales": ["The sign prohibits going right.", "The arrow pointing in that direction has a red line through it to indicate that is not allowed.", "The red and white sign has an arrow. it points towards the geo building."], "image": "train2014/COCO_train2014_000000289192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471916, "question_id": "J4cZCwrdyTRKem4ad9Hq3a", "question": "What is the person here most likely to do to the Elephants?", "choices": ["photograph them", "ride them", "eat them", "poke them"], "correct_choice_idx": 0, "direct_answers": ["feed them", "observe", "photograph", "photograph them", "photograph", "pictures", "poach", "watch", "take photos", "photograph"], "difficult_direct_answer": false, "rationales": ["The person will take pictures of them.", "The person is photographing.", "The person is probably on a photo safari and is at a good distance for making photos."], "image": "train2014/COCO_train2014_000000471916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510768, "question_id": "J4faXPXGYqyFUJoeHBx7ub", "question": "What would happen if the lines in the air were damaged?", "choices": ["train accelerates", "train stops", "train crashes", "train continues"], "correct_choice_idx": 1, "direct_answers": ["train stoppage", "train stops", "train stop", "power outage", "train stops", "stop", "train stops", "no electricity", "no brakes", "train shutdown"], "difficult_direct_answer": false, "rationales": ["The train would stop.", "Electricity powers the transportation object to move on the rails. without a source of power, it wouldn't be able to budge.", "The train would stop if the lines were damaged."], "image": "train2014/COCO_train2014_000000510768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468530, "question_id": "J4tz2kWRCVTi3PAWjKYx9f", "question": "What is hanging from the shelf on the right?", "choices": ["hat", "umbrella", "jacket", "plant"], "correct_choice_idx": 0, "direct_answers": ["cap", "cap", "hat", "hat", "hat", "cap", "cap", "hat", "cap", "hat"], "difficult_direct_answer": false, "rationales": ["The brim and the shape match the item identified in option a.", "A plant is sitting on the shelf. a baseball cap is hanging from the side of the shelf.", "A cap is hanging from a shelf."], "image": "train2014/COCO_train2014_000000468530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571384, "question_id": "J56bwbkcGJjyJCBsk2Aosx", "question": "Where are the people waiting to go?", "choices": ["in bus", "home", "in building", "to hotel"], "correct_choice_idx": 0, "direct_answers": ["bus", "bus", "palm's", "on bus", "board bus", "in bus", "sightseeing", "work", "bus", "tour bus"], "difficult_direct_answer": false, "rationales": ["They are waiting in line to get on board.", "The people want to get on the bus.", "The people are waiting to get on a bus."], "image": "val2014/COCO_val2014_000000571384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73339, "question_id": "J5GbikavkxuqDyEnnVB7Ta", "question": "What type of lane is shown?", "choices": ["fire", "middle", "passing", "bike"], "correct_choice_idx": 3, "direct_answers": ["bike lane", "bike lane", "bike lane", "bike lane", "bike", "bike", "bike", "bike", "bike", "bike lane"], "difficult_direct_answer": false, "rationales": ["This is a narrow lane and people are riding bikes on it.", "The person riding the bike is in a bike lane.", "The lane is for bikes."], "image": "train2014/COCO_train2014_000000073339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395382, "question_id": "J5ScahAbQpFZZCcjzRQhuZ", "question": "How is the boy dressed differently from the girl?", "choices": ["t-shirt", "denim shorts", "flip flops", "caps"], "correct_choice_idx": 1, "direct_answers": ["jean shorts", "jean shorts", "denim shorts", "jeans", "denim", "jean shorts", "different pants", "denim shorts", "loger shorts", "denim shorts"], "difficult_direct_answer": false, "rationales": ["The boy is wearing denim shorts, and the girl is wearing polyester shorts.", "The other options don't match the image.", "Though both these kids wear shorts only the boy is wearing jeans."], "image": "val2014/COCO_val2014_000000395382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156652, "question_id": "J5XxXHtgnDZGx6gnGzsKHL", "question": "What type of shirt is the man on the right wearing?", "choices": ["plaid", "tie dye", "flannel", "hippie special"], "correct_choice_idx": 1, "direct_answers": ["checkered", "tie dye", "tie dye", "tie dye", "tie dye", "tie-dyed", "tie dye", "checkered", "tie dye", "tie dye"], "difficult_direct_answer": false, "rationales": ["One can see the telltale markings of the dip dye technique.", "The distinctive pattern on the shirt matches the style listed in option a.", "The man has tie dye on since his shirt has a starburst."], "image": "train2014/COCO_train2014_000000156652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421557, "question_id": "J5bfyhPZ7EZgTC5fjoVtLu", "question": "What kind of strike is he preparing to do?", "choices": ["lower hand", "upper hand", "backhand", "forehand"], "correct_choice_idx": 2, "direct_answers": ["backhand", "forehand", "backhand", "backhand", "backhand", "backhand", "upper", "forehand", "tennis", "backhand"], "difficult_direct_answer": false, "rationales": ["This athlete prepares to swing his racket with knuckles forward. swinging in such a position is known as backhand.", "The man is holding the racket in his hand and shifting his position.", "The hand that is on the bottom of the racquet is the dominant had here, and since the back of the hand is facing out, the move is called a backhand."], "image": "train2014/COCO_train2014_000000421557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245089, "question_id": "J5eFHe5jJSiQwnhNn5uXLL", "question": "What keeps the White teddy bear suspended?", "choices": ["gravity", "fire", "suckers", "string"], "correct_choice_idx": 2, "direct_answers": ["suction cups", "suction cups", "suction", "suction", "suction cups", "suckers", "suction cups", "suction cups", "suction cups", "suction"], "difficult_direct_answer": false, "rationales": ["The suckers keep the bear suspended.", "Yellow suction cups are visible on all four paws.", "The white teddy bear is suspended with suction cups."], "image": "train2014/COCO_train2014_000000245089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500018, "question_id": "J67j3YHs6fwTUAuuiScwXA", "question": "What does the man hold in his left hand?", "choices": ["human scalp", "hair brush", "cookie", "elephant tail"], "correct_choice_idx": 3, "direct_answers": ["tail", "phone", "cellphone", "elephant tail", "elephant hoof", "trunk", "phone", "elephant tail", "trunk", "elephant trunk"], "difficult_direct_answer": false, "rationales": ["An elephant is laying on the ground and a man has a long portion with hair on the end in his hand.", "He is holding the elephants tail and looking at it.", "The man is holding the end of an elephants tail in his hand."], "image": "val2014/COCO_val2014_000000500018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407687, "question_id": "J6AhVtMrUXvSZXBNrGMmJZ", "question": "What is on the plate near the left of the table?", "choices": ["bacon", "berry", "apple", "orange"], "correct_choice_idx": 0, "direct_answers": ["glass", "grill chicken", "bacon", "english muffin", "bacon", "bacon", "bacon", "english muffins", "bacon", "muffin"], "difficult_direct_answer": false, "rationales": ["They are long strips of cooked meat", "There are three slices of bacon near the water glass on the left.", "The food on the plate is in the form of long flat strips and it's brownish-reddish in color, plus it looks crispy and slightly burned, which are all characteristics of bacon."], "image": "train2014/COCO_train2014_000000407687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326168, "question_id": "J6CnZ2jJnsBY6cXFhbYL2U", "question": "How will the people standing in the street mostly travel today?", "choices": ["taxi", "flying", "walking", "by bike"], "correct_choice_idx": 3, "direct_answers": ["bike", "biking", "bikes", "by bike", "bicycle", "bicycle", "bike", "bus", "bicycle", "bikes"], "difficult_direct_answer": false, "rationales": ["Given the notice on the back of the bus and quantity of people on bicycles present we can assume this is a day for biking.", "The people have a bike.", "The people are waiting on their bikes."], "image": "val2014/COCO_val2014_000000326168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52037, "question_id": "J6U98iBN4imDdcvYaX3fC8", "question": "What pattern is the young guy's shirt?", "choices": ["stripes", "tartan", "checked", "plaid"], "correct_choice_idx": 3, "direct_answers": ["plaid", "plaid", "stopped", "plaid", "plaid", "plaid", "checked", "plaid", "checks", "checkered"], "difficult_direct_answer": false, "rationales": ["The blond guy is the younger looking that the man beside him and his shirt has this type of pattern.", "The guy's shirt has plaid markings on it.", "It has different stripes crossing each other in a pattern"], "image": "train2014/COCO_train2014_000000052037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295683, "question_id": "J6mRzYL2Sdvayev457C3JW", "question": "Which direction does branch street run?", "choices": ["one way", "east-west", "north-south", "sideways"], "correct_choice_idx": 2, "direct_answers": ["north", "north south", "north", "north", "north-south", "north south", "north", "north", "north", "north"], "difficult_direct_answer": false, "rationales": ["The sign has an \"n\".", "This branch street runs from the north to the south.", "The sign indicates that this is the 200n block of branch street."], "image": "val2014/COCO_val2014_000000295683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447196, "question_id": "J8DHTDvwjDK4ABuHZFKxwg", "question": "What sport might be being played here?", "choices": ["soccer", "baseball", "badminton", "golf"], "correct_choice_idx": 2, "direct_answers": ["tennis", "badminton", "badminton", "badminton", "racquetball", "badminton", "badminton", "badminton", "badminton", "badminton"], "difficult_direct_answer": false, "rationales": ["She is using a racquet for this game.", "These people could be playing badminton.", "The sport has a racket."], "image": "train2014/COCO_train2014_000000447196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501204, "question_id": "J8bkgMdRkyPVgGFqFW5Tty", "question": "Which vehicle most likely runs on diesel?", "choices": ["white car", "orange car", "monster truck", "blue car"], "correct_choice_idx": 2, "direct_answers": ["monster truck", "monster truck", "monster truck", "monster truck", "monster truck", "truck", "trucker", "truck", "monster truck", "monster truck"], "difficult_direct_answer": false, "rationales": ["The orange, blue, and white cars probably run on gasoline.", "There are cars and a truck on a cake. trucks tend to run on diesel more commonly than a sedan style car.", "The monster truck on the cake is the kind of vehicle that normally runs on diesel fuel."], "image": "train2014/COCO_train2014_000000501204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234203, "question_id": "J8xZkgYuwgdvmxpP5Hbw58", "question": "Where were kites invented?", "choices": ["pakistan", "china", "korea", "france"], "correct_choice_idx": 1, "direct_answers": ["china", "china", "china", "china", "no clue", "china", "china", "foreign country", "1344", "549 ad"], "difficult_direct_answer": false, "rationales": ["I did an internet search on the origin of kites to provide the answer.", "There are many articles on the internet that point to option a being the answer to this question.", "Kites are being flown. kites were invented in china."], "image": "train2014/COCO_train2014_000000234203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8676, "question_id": "J95HNv6Rfu8eBnoXYFhveS", "question": "What is the main reason to stay in this room?", "choices": ["to sleep", "to bathe", "to cook", "to exercise"], "correct_choice_idx": 0, "direct_answers": ["to sleep", "vacation", "vacation", "sleeping", "sleep", "sleep sex", "sleep", "hotel", "sleeping", "natural"], "difficult_direct_answer": false, "rationales": ["There is a bed in the room so it is clear this room is meant for nightly slumber.", "The reason is to sleep.", "That is what you do in a bedroom."], "image": "val2014/COCO_val2014_000000008676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92366, "question_id": "J95pARa9Rhy7MPSsjYTywF", "question": "The player in the image playing which sport?", "choices": ["basket ball", "tennis", "baseball", "cricket"], "correct_choice_idx": 2, "direct_answers": ["baseball", "tennis", "baseball", "softball", "baseball", "tennis", "baseball", "softball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["The baseball cap and bat indicate that baseball is being played, along with the red dirt on the ground is indicative of a baseball diamond.", "The person is holding a baseball bat which is a piece of equipment used in playing baseball.", "A girl is in a baseball cap on a baseball field."], "image": "train2014/COCO_train2014_000000092366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184835, "question_id": "J9UZzdtcyWTCKZBhmTpHJ4", "question": "Why is the man swinging his arm?", "choices": ["to wave", "to hit", "to exercise", "to control"], "correct_choice_idx": 3, "direct_answers": ["playing wii", "video games", "playing game", "remote", "playing game", "playing game", "controlling game", "to control", "playing game", "wii game"], "difficult_direct_answer": false, "rationales": ["The man is trying to hit something", "The man is playing a nintendo wii. the controller he's holding controls the nintendo wii.", "The man is holding a controller for a video game."], "image": "train2014/COCO_train2014_000000184835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400379, "question_id": "J9bEUCtqNecn3P9AhzpV5m", "question": "What sort of diet might the person at the table have?", "choices": ["white food", "carnivore", "vegan", "fasting"], "correct_choice_idx": 2, "direct_answers": ["vegan", "vegetarian", "vegan", "vitamins", "fruit", "vegetarian", "vegan", "vegetarian", "vegan", "vegan"], "difficult_direct_answer": false, "rationales": ["The bowl in front of the person is not empty, so he is not fasting. the bowl has bananas and grapes, not meat.", "There are all plant products in his bowl", "If he truly only eats veggies, then he is a vegan."], "image": "train2014/COCO_train2014_000000400379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6074, "question_id": "J9bmGuGdKMSxsgrAFwLLJh", "question": "What activity is the truck here taking part in?", "choices": ["parade", "boycot", "protest", "strike"], "correct_choice_idx": 0, "direct_answers": ["parade", "parade", "parade", "parade", "christmas parade", "festivity", "parade", "christmas parade", "holiday parade", "parade"], "difficult_direct_answer": false, "rationales": ["There is a wreath on the front of the truck. people are standing on the sides of the road to watch it pass.", "There is a white and blue truck with people standing and going down road. it has a decorative wreath on front.", "The truck is decorated and people are watching."], "image": "val2014/COCO_val2014_000000006074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221223, "question_id": "J9nE9kCEvXcSmiTVhRrd4V", "question": "What program is on the TV?", "choices": ["talk show", "drama", "music video", "news reporting"], "correct_choice_idx": 3, "direct_answers": ["news", "local news", "news", "news", "news", "news reporting", "news", "news", "news", "fox traffic"], "difficult_direct_answer": false, "rationales": ["Fox news is on the tv.", "The person on the news is a reporter. the time and station identifier are in the corner of the television.", "The show is about news report."], "image": "train2014/COCO_train2014_000000221223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168801, "question_id": "J9o6mz3NgnViPWEKftjuUv", "question": "How does the young boarder balance himself?", "choices": ["head bobbing", "holding hands", "foot flipping", "inner ear"], "correct_choice_idx": 1, "direct_answers": ["holding hand", "holding on", "holding hands", "holding hands", "dads hand", "holds hand", "hand holding", "fathers hand", "dad's hand", "his arms"], "difficult_direct_answer": true, "rationales": ["The young boarded is seen holding an adult's hand while attempting to balance on the skateboard.", "The young boarder is holding hands with a walking adult.", "The boarder is holding onto his dad's hand."], "image": "val2014/COCO_val2014_000000168801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247146, "question_id": "J9rFzvwjoj7cRQqHUvPqWW", "question": "What sort of skill level do the opposing teams have at this game?", "choices": ["both novices", "even", "lopsided", "senior masters"], "correct_choice_idx": 2, "direct_answers": ["novice", "equal", "professional", "moderate", "lopsided", "juvenile", "novice", "amateur", "baseball", "semi pro"], "difficult_direct_answer": true, "rationales": ["The people do not appear to be skilled.", "This is little league so they are amateurs", "There is a scoreboard in the background. it indicates that the score is 12 to 1, so one team is significantly better than the other."], "image": "train2014/COCO_train2014_000000247146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45222, "question_id": "J9zADLJgaCkvPLxKfdsChN", "question": "In what city is the company that makes this beverage located?", "choices": ["philadelphia", "des moines", "newport", "miami"], "correct_choice_idx": 2, "direct_answers": ["ashland oregon", "newport", "brussels", "newport", "seattle", "los angeles", "newport", "america", "newport", "newport oregon"], "difficult_direct_answer": false, "rationales": ["An internet search revealed that rogue beer is brewed at their headquarters in newport, oregon.", "The beer brand is rogue. i searched the company on the internet and discovered their headquarters located in this oregon city.", "It's located in newport."], "image": "train2014/COCO_train2014_000000045222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395445, "question_id": "JAA2KqDRPB2LoaXzJWdh5R", "question": "What alcoholic beverage is being consumed here?", "choices": ["wine", "margaritas", "whiskey", "beer"], "correct_choice_idx": 3, "direct_answers": ["beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["The men are drinking dark brown beverages.", "The bottles are for beer.", "Two distinct characteristics of beer in a glass is it's brown color and foam on the top."], "image": "train2014/COCO_train2014_000000395445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399741, "question_id": "JABTUmEpUat2kBeFpns6cq", "question": "The stuffed doll has four what?", "choices": ["tails", "talons", "paws", "noses"], "correct_choice_idx": 2, "direct_answers": ["paws", "legs", "paws", "paws", "paws", "paws", "bear", "bear", "paws", "paws"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this type of stuffed toy, at least not in this image.", "The stuffed animal appears to be a bear. live bears have 4 paws and the sutffed counterpart has 4 paws visible.", "The stuffed doll is not a bird. it has one nose and no tail."], "image": "val2014/COCO_val2014_000000399741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13465, "question_id": "JAMLHXaWc45mL2SXA2oquy", "question": "What is the boy in the green shirt's hands touching?", "choices": ["basketball hoop", "chair", "pillow", "skateboard"], "correct_choice_idx": 3, "direct_answers": ["skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["His hand is on the skateboard.", "The person is using a board.", "The person's hand is clearly visible and the object they are touching is a deck with two sets of wheels that another person has their feet on top of and can be confirmed to be a skateboard."], "image": "val2014/COCO_val2014_000000013465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493959, "question_id": "JAnMBHMBuBEqb6eyvnLtcH", "question": "What subject is mentioned on the cover of the book?", "choices": ["gerontology", "genome", "geology", "genealogy"], "correct_choice_idx": 1, "direct_answers": ["genome", "genome", "genome", "genome", "genome", "genome", "genome", "genome", "genome", "genome"], "difficult_direct_answer": false, "rationales": ["The man in the middle here is reading a book called \"genome\" while the guy on the left chats on the phone and the guy on the right is clicking the remote to the television. in the fields of molecular biology and genetics, a genome is all genetic information of an organism.", "The subject is a genome.", "If one looks closely, one can make out the title of the book."], "image": "train2014/COCO_train2014_000000493959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44952, "question_id": "JBoMdXKLkd5K6yo3fNeh4H", "question": "Who are these men riding on horses?", "choices": ["soldiers", "royal people", "policemen", "athletes"], "correct_choice_idx": 1, "direct_answers": ["rangers", "cowboys", "cowboys", "chiefs", "royal people", "sheriffs", "cowboys", "mexican cowboys", "soldiers", "cowboys"], "difficult_direct_answer": false, "rationales": ["These men look like soldiers since they're wearing uniforms.", "There are a couple of people with cowboy hats sitting on horses.", "They look like soldiers dressed up"], "image": "val2014/COCO_val2014_000000044952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401602, "question_id": "JBpyYGER2wZrJde7VxNNL6", "question": "What word can describe the bun best?", "choices": ["raw", "over toasted", "perfect", "doughy"], "correct_choice_idx": 1, "direct_answers": ["burned", "over toasted", "toasted", "toasted", "sandwich sub", "burnt", "hot dog", "toasted", "hotdog", "toasted"], "difficult_direct_answer": false, "rationales": ["One can see the scorch marks on the bun, so it was grilled for too long.", "The bun has a golden toast to it.", "The bun with the hot dog in it has been toasted too long and looks brown and slightly burnt."], "image": "train2014/COCO_train2014_000000401602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90985, "question_id": "JBsjsPQiakJDuni24Vb54P", "question": "What vegetable is shown in the picture?", "choices": ["broccoli", "lettuce", "tomato", "spinach"], "correct_choice_idx": 2, "direct_answers": ["tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["The red vegetable is a tomatoe as shown.", "The red color and spherical shape matches the vegetable named in option a.", "Oranges and a tomato are shown in this picture. oranges are universally considered fruits and tomatoes are often considered vegetables."], "image": "train2014/COCO_train2014_000000090985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131330, "question_id": "JBzrEVpNgaxfKZUCYaApQu", "question": "What kind of vegetable is pictured?", "choices": ["watermelon", "tomato", "broccoli", "spinach"], "correct_choice_idx": 2, "direct_answers": ["broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "brocolli", "broccoli", "brocolli", "broccoli"], "difficult_direct_answer": false, "rationales": ["The word on the stick indicates which vegetable is pictured.", "The pot is labeled as being such.", "The stick says broccoli."], "image": "train2014/COCO_train2014_000000131330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36750, "question_id": "JC678mmTQJgXcLkiCEZZuv", "question": "What are the bottles to the right of the sink used for?", "choices": ["maintaining hairstyles", "drinking", "brushing teeth", "washing hands"], "correct_choice_idx": 3, "direct_answers": ["wash hands", "washing hands", "cleaning hands", "washing", "soap", "wash hands", "hand washing", "soap sanitizer", "washing hands", "washing hands"], "difficult_direct_answer": false, "rationales": ["To the immediate right of the sink are soap dispensers. these devices disperse soap to wash hands.", "They are soap bottles, and it is customary to wash hands when soap when using the bathroom in order to maintain hand hygiene.", "Hand soap is on the righthand side of a bathroom sink."], "image": "train2014/COCO_train2014_000000036750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56068, "question_id": "JC6W67RvAvJLSNWeX4K3h8", "question": "What type of water are they riding by?", "choices": ["lake", "river", "pond", "ocean"], "correct_choice_idx": 3, "direct_answers": ["ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["The waves shown can only be one thing out of these.", "We can see waves breaking on a sandy beach in this scene. these features occur in and next to oceans.", "The body of water is very large."], "image": "val2014/COCO_val2014_000000056068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262810, "question_id": "JC82ZwuexNhvYwQ8ZUxryd", "question": "What will the couple looking at the cake do now?", "choices": ["dance", "feel themselves", "sing", "cut it"], "correct_choice_idx": 3, "direct_answers": ["cut it", "cut cake", "cut cake", "cut cake", "cut cake", "eat", "slice eat", "dancing", "cut it", "celebrate marriage"], "difficult_direct_answer": false, "rationales": ["They are posing so it can be served.", "They are dressed in bridal attire, and it is customary for the bride and groom to cut their cake together on their wedding day.", "The couple are standing in front of a table in formal clothes. wedding cakes are usually cut by the couple before being served."], "image": "val2014/COCO_val2014_000000262810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366907, "question_id": "JC8FxEoHJqWd5XQunuCMae", "question": "Where is the owner of this bicycle now?", "choices": ["on bus", "home", "on sidewalk", "running"], "correct_choice_idx": 0, "direct_answers": ["in bus", "bus", "inside bus", "on bus", "on bus", "on bus", "inside bus", "bus driver", "riding bus", "bus"], "difficult_direct_answer": false, "rationales": ["A bike is on the front of the bus as the man is in the bus.", "The bus is carrying the bike for the traveller.", "The bike is on the front of the bus."], "image": "train2014/COCO_train2014_000000366907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155167, "question_id": "JCdat8SqppzstR4RZvFBig", "question": "Who are the street signs for?", "choices": ["drivers", "directions", "downtown", "pedestrians"], "correct_choice_idx": 0, "direct_answers": ["people", "drivers", "directing traffic", "cars", "one way", "cars", "drivers", "drivers", "people", "drivers"], "difficult_direct_answer": false, "rationales": ["There are one way and no stopping anytime signs. pedestrians are not affected by these signs.", "The street signs tell the vehicles when a road is a one way street. signs hang over and near roads.", "The signs are for drivers."], "image": "train2014/COCO_train2014_000000155167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238086, "question_id": "JChjWKV76oHE5RjJ27TE4P", "question": "Where is the person on the bike headed?", "choices": ["garage", "candy shop", "tea shop", "market"], "correct_choice_idx": 3, "direct_answers": ["market", "produce stand", "market", "market", "to market", "marketplace", "market", "market", "market", "market"], "difficult_direct_answer": false, "rationales": ["The man looks like he sells fruit.", "The man on the bicycle is hauling fruits and veggies to which he most likely is head to a stand where he cans sell them along with other vendors.", "The person is going to the market."], "image": "train2014/COCO_train2014_000000238086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516998, "question_id": "JCp5pJAUPUEUs4zAJuk8rQ", "question": "Which gang wears similar colors to these shirts?", "choices": ["one-niners", "bloods", "crips", "mayans"], "correct_choice_idx": 1, "direct_answers": ["working", "bloods", "bloods", "bloods", "bloods", "bloods", "bloods", "rose", "bloods", "bloods"], "difficult_direct_answer": false, "rationales": ["Bloods wear red shirts too.", "A gang called the bloods wears red.", "The shirts are he same color as blood."], "image": "val2014/COCO_val2014_000000516998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357943, "question_id": "JD7SeEkRhgHYpMU63R4S9s", "question": "Where is the dog supposed to get the sheep to go?", "choices": ["roadway", "no where", "white pen", "barn"], "correct_choice_idx": 2, "direct_answers": ["herding", "chase them", "fenced box", "pen", "pen", "pen", "white pen", "barking", "in pen", "pen"], "difficult_direct_answer": false, "rationales": ["The sheep are supposed to go in the white pen.", "The dog is in the white pen.", "Based on the space and the spectators this looks like some kind of event. in an event such as this where the dog, sheep and handler are standing the objective would be to demonstrate herding skills and get the sheep to the visible target."], "image": "val2014/COCO_val2014_000000357943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549924, "question_id": "JD93N9VBEpw7EeCHXhmc8m", "question": "What can is on the table?", "choices": ["fanta", "coke", "pepsi", "sprite"], "correct_choice_idx": 2, "direct_answers": ["pepsi", "pepsi", "pepsi", "diet pepsi", "pepsi", "diet pepsi", "diet pepsi", "pepsi", "pepsi", "pepsi"], "difficult_direct_answer": false, "rationales": ["A can of pepsi is on the table.", "Pepsi is on the table.", "The bottle with the label of pepsi is seen on the table."], "image": "train2014/COCO_train2014_000000549924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291557, "question_id": "JDEzCBVjat8fRWrsZBKHFw", "question": "What unique feature do the childs pants have?", "choices": ["springs", "lights", "none", "pleats"], "correct_choice_idx": 1, "direct_answers": ["lights", "sweater", "shiny", "reflective strips", "reflective pants", "lights", "glowing parts", "glow", "reflection tape", "reflective"], "difficult_direct_answer": true, "rationales": ["Shines are showing on the bottom of the pants.", "One can see the glowing orbs attached to his pant legs.", "The child's pants have lights on the back of them so they are easy to see in the dark."], "image": "train2014/COCO_train2014_000000291557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43305, "question_id": "JDLzBo9VvmboFKJBoM8U6Y", "question": "Why is the boy wearing a glove?", "choices": ["fashion", "warmth", "catch", "dress code"], "correct_choice_idx": 2, "direct_answers": ["finished batting", "to catch", "to catch", "to catch", "catcher", "catch", "catch ball", "safety", "catching", "he's catcher"], "difficult_direct_answer": false, "rationales": ["The glove is used to catch an item and in this case it's a baseball for the game they are playing.", "The glove is used to help him in catching any balls that come his way.", "The boy wants to catch."], "image": "val2014/COCO_val2014_000000043305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4066, "question_id": "JDRUqTbxxt26zbHfanPVyw", "question": "What type of clothing are the people wearing?", "choices": ["water wear", "surf uniforms", "hoodies", "wetsuits"], "correct_choice_idx": 3, "direct_answers": ["wetsuits", "wetsuits", "wet suits", "wetsuit", "wet suits", "wet suits", "wetsuit", "wetsuit", "wet suits", "wetsuit"], "difficult_direct_answer": false, "rationales": ["One can see that they are wearing the skintight outfits that go by this name.", "The clothing is a wetsuit.", "The women are surfing so they are wearing wetsuits."], "image": "val2014/COCO_val2014_000000004066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109450, "question_id": "JDuYpZjePL9XjxnsnxvG52", "question": "WHat is the price of the coffee?", "choices": ["1.39", ".99", "1.29", "1.09"], "correct_choice_idx": 1, "direct_answers": ["one dollar", "99c", ".99", "99 cents", "99 cent", "no idea", "150", "ninety nine", "99 cents", "99 cents"], "difficult_direct_answer": false, "rationales": ["Coffee is usually cheaper.", "The price of the coffee is 0.99.", "There is a sign with the price"], "image": "train2014/COCO_train2014_000000109450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221350, "question_id": "JDyJxCzTuVDcvnr7CxszM2", "question": "The handwriting on the design of the mans tie is written in what form?", "choices": ["cursive", "calligraphy", "lower case", "bubble letters"], "correct_choice_idx": 0, "direct_answers": ["cursive", "joining letter", "cursive", "cursive", "cursive", "sandfolks", "joining letter", "cursive", "cursive", "cursive"], "difficult_direct_answer": false, "rationales": ["Cursive letters are connected to each other and written in a curly fashion.", "The writing is in script.", "The handwriting is cursive."], "image": "val2014/COCO_val2014_000000221350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351868, "question_id": "JE2BGknopgdMJPLUeQguqv", "question": "What are the girls holding?", "choices": ["books", "candy", "umbrella", "clothes"], "correct_choice_idx": 2, "direct_answers": ["umbrella", "umbrella", "umbrellas", "umbrellas", "umbrellas", "umbrellas", "umbrellas", "umbrella", "umbrellas", "umbrellas"], "difficult_direct_answer": false, "rationales": ["They are sheltering themselves from the rain.", "Each girl is holding an object that allows her to avoid the rain.", "The girls are holding umbrellas."], "image": "val2014/COCO_val2014_000000351868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356384, "question_id": "JE4WfRfBrotVNZnVKXraqF", "question": "What is the person on the board trying to maintain?", "choices": ["dryness", "height", "speed", "balance"], "correct_choice_idx": 3, "direct_answers": ["balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["Surfboards require balance in order to stay on them which is the objective. the stance being displayed is a stance one would do to maintain balance.", "The person needs balance.", "The woman does not want to fall off of her surfboard."], "image": "train2014/COCO_train2014_000000356384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152397, "question_id": "JE4tyzFKEdYtjx7L2mAhtS", "question": "The topping on the pizza falls under what food group?", "choices": ["meats", "grains", "vegetables", "fruits"], "correct_choice_idx": 2, "direct_answers": ["fungi", "fungus", "vegetables", "vegetable", "fungus", "vegetable", "vegetable", "veggies", "vegetables", "vegetable"], "difficult_direct_answer": false, "rationales": ["The topping is vegetarian.", "The pizza has mushrooms on it. they are fungi, not meats, grains, or fruits.", "There are no meats, grains or vegetables on the pizza."], "image": "train2014/COCO_train2014_000000152397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390671, "question_id": "JEJvCCwqTfur3oBznuzDwP", "question": "Where are the plates?", "choices": ["cabinet", "bed", "office desk", "table"], "correct_choice_idx": 0, "direct_answers": ["cupboard", "cabinet", "bottom shelf", "bottom shelf", "kitchen drawer", "bottom shelf", "cupboard", "cabinet", "bottom shelf", "bottom shelf"], "difficult_direct_answer": false, "rationales": ["The plates are sitting in an overhead compartment in a kitchen. the item in a performs the function described.", "The plates are being stacked in the cabinet.", "The object contains a door, different shelves, and is above the kitchen counters."], "image": "val2014/COCO_val2014_000000390671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421086, "question_id": "JERpfLLCoprrm6MkyYvF5R", "question": "Why is the man wearing gloves?", "choices": ["fashion", "grip", "health", "warmth"], "correct_choice_idx": 2, "direct_answers": ["safety", "handling food", "food safety", "food safety", "hygiene", "health", "keep hygiene", "safety", "safety", "cleanliness"], "difficult_direct_answer": false, "rationales": ["Restaurant workers must cover their hands so they do not have their hands all over your food.", "In the food industry when a person wears gloves its to protect the health of the customers and the employee to help prevent the spread of germs.", "This makes sure nothing from his hands gets in the food"], "image": "train2014/COCO_train2014_000000421086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484208, "question_id": "JEifGkj5DyaMouxiNQSLoq", "question": "What product might these animals produce without causing the animal's deaths?", "choices": ["ivory", "hay", "silk", "mohair"], "correct_choice_idx": 3, "direct_answers": ["wool", "wool", "sweaters", "milk", "wool", "wool", "meat", "mohair", "wool", "wool"], "difficult_direct_answer": false, "rationales": ["The sheep have been sheered of their fur.", "Their hair can be shaved off safely, and is used for wool and mohair.", "The product is mohair."], "image": "train2014/COCO_train2014_000000484208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78194, "question_id": "JEk7AHjqrjgEh7Bam3BkjG", "question": "What is this child being told to do?", "choices": ["brush teeth", "eat vegetable", "wash dog", "clean room"], "correct_choice_idx": 0, "direct_answers": ["nap", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush", "stay", "brush teeth", "brush teeth", "brush teeth"], "difficult_direct_answer": false, "rationales": ["The child is told to brush his teeth and holds a toothbrush.", "He is holding a toothbrush in his hand so that would be the most likely command.", "The child needs to brush."], "image": "val2014/COCO_val2014_000000078194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285405, "question_id": "JEmTpBHtDhaPkdYFK4unQL", "question": "What road is this school on?", "choices": ["davis", "bourne", "laurel", "narrows"], "correct_choice_idx": 3, "direct_answers": ["ludmilla", "ludmilla primary", "ludmilla primary", "ludmilla", "ludmilla", "parking lot", "narrows", "ludmilla", "ludmilla", "ludmilla"], "difficult_direct_answer": false, "rationales": ["Ludmilla primary school is on narrows road.", "The school is on a narrow street.", "The narrows road goes to the opera house."], "image": "train2014/COCO_train2014_000000285405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199136, "question_id": "JFHzthxnAVemX8PoWDhrbq", "question": "What animal is the man on the horse facing?", "choices": ["bull", "boar", "bear", "panther"], "correct_choice_idx": 0, "direct_answers": ["bull", "bull", "bull", "bull", "bull", "bull", "bull", "bull", "bull", "bull"], "difficult_direct_answer": false, "rationales": ["The animal is a bovine creature with horns and male characteristics.", "There is a bull several metres away from the horse.", "He is facing a bull in a ring."], "image": "train2014/COCO_train2014_000000199136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515364, "question_id": "JFayA5GXmR7By7pkJ7ps3r", "question": "How many species likely share this bed including the owner?", "choices": ["five", "none", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["three", "one", "two", "three", "two", "two", "two", "two", "two", "three"], "difficult_direct_answer": false, "rationales": ["Two species share this bed, which include the dogs and the human owner.", "There are dogs on the bed which would mean that two species share the bed including the owner.", "The owner is human and these are dogs"], "image": "train2014/COCO_train2014_000000515364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279984, "question_id": "JGGiioACbZGdTgLkhHhYHt", "question": "The flags share the same colors as the flag of what other country?", "choices": ["spain", "brazil", "united kingdom", "lithuania"], "correct_choice_idx": 2, "direct_answers": ["uk", "united kingdom", "united kingdom", "usa", "ukraine", "ukraine", "liberia", "united kingdom", "united states", "america"], "difficult_direct_answer": false, "rationales": ["The two countries use similar colors but different designs.", "The usa and the uk have the same color scheme in their flags of red, white and blue.", "The usa and the uk both have red, white, and blue as colors."], "image": "val2014/COCO_val2014_000000279984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470746, "question_id": "JGJfcsc29zaubP3yktQ8Pd", "question": "The decor is reminiscent of what public building?", "choices": ["courthouse", "city hall", "school", "library"], "correct_choice_idx": 3, "direct_answers": ["library", "library", "moma", "cafe", "library", "library", "hotel lounge", "library", "library", "library"], "difficult_direct_answer": false, "rationales": ["This building looks like a library because there are large bookshelves.", "The bookshelves looks like it's like a library", "The decor is for a library."], "image": "val2014/COCO_val2014_000000470746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477477, "question_id": "JHNeskz7G55oDkvt9XfVDs", "question": "When might the most recent snow have been in this locale?", "choices": ["last night", "never", "long ago", "today"], "correct_choice_idx": 2, "direct_answers": ["yesterday", "long ago", "two weeks", "week", "days ago", "not recent", "yesterday", "yesterday", "yesterday", "night before"], "difficult_direct_answer": false, "rationales": ["A lot of the snow has melted.", "The snows seems to have been around for awhile, thus matching the time frame in option a.", "The snow is probably old since it's partially melted."], "image": "val2014/COCO_val2014_000000477477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449731, "question_id": "JHVy6UYXpUzxM79UG3SGNe", "question": "In which ways can one clean their body here?", "choices": ["mopping", "shower/ bath", "antiseptic wipe", "dry cleaning"], "correct_choice_idx": 1, "direct_answers": ["sink shower", "shower", "shower/ bath", "shower", "shower", "shower", "shower", "shower sink", "shower", "water"], "difficult_direct_answer": false, "rationales": ["The other three options clean other things than the body.", "The sinks are used for shower cleaning.", "One could clean their body by using the shower and bath area."], "image": "val2014/COCO_val2014_000000449731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248150, "question_id": "JHoKrzi6ehmpfk5jRqdpWD", "question": "Why is their hand held way back?", "choices": ["stop falling", "swat mosquito", "swing ball", "self defense"], "correct_choice_idx": 2, "direct_answers": ["forehand", "forehand stroke", "add pressure", "swing ball", "returning serve", "swinging", "effort", "hit ball", "getting ready", "exert pressure"], "difficult_direct_answer": true, "rationales": ["The hand is swinging the ball.", "The racquet is held back and the ball is coming.", "A man with a tennis racket in hand is reared back as a ball approaches."], "image": "train2014/COCO_train2014_000000248150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537920, "question_id": "JHprosFUgtkQTpsK4mSDEJ", "question": "What kind of cheese is on top of the pizza?", "choices": ["mozzarella", "cheddar", "bleu cheese", "american cheese"], "correct_choice_idx": 2, "direct_answers": ["mozzarella", "blue cheese", "bread", "blue cheese", "parm", "feta", "mozzarella", "bleu cheese", "mozzarella", "goat"], "difficult_direct_answer": false, "rationales": ["Looks like blue cheese on top.", "The cheese has a bluish tint.", "You can tell by the chunkiness and shape of the cheese as to what kind it is."], "image": "train2014/COCO_train2014_000000537920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237327, "question_id": "JJDr33hCUfyukLZVMgrnWL", "question": "What does the street sign indicate is not allowed?", "choices": ["parking", "turning", "biking", "stopping"], "correct_choice_idx": 0, "direct_answers": ["parking", "parking", "parking", "parking", "parking", "no parking", "no parking", "parking", "no parking", "parking"], "difficult_direct_answer": false, "rationales": ["The text on the sign beneath the crossed out p indicates what is not allowed.", "The street sign is not allowed parking for people on bikes nearby.", "The street sign says no parking."], "image": "train2014/COCO_train2014_000000237327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100303, "question_id": "JJQtcpVR8YPckukPePZAis", "question": "What is on the desk?", "choices": ["laptop", "stuffed doll", "textbook", "candy dish"], "correct_choice_idx": 0, "direct_answers": ["keyboard", "keyboard mouse", "keyboard", "computer", "computer", "laptop", "keyboard", "keyboard", "mouse", "keyboard"], "difficult_direct_answer": false, "rationales": ["A computer keyboard and screen are on a desk.", "A computer is on the desk.", "The desk has a laptop."], "image": "train2014/COCO_train2014_000000100303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288176, "question_id": "JJR5uFwezqtyBs65G3urLY", "question": "What numbered day of the week is it?", "choices": ["two", "three", "five", "seven"], "correct_choice_idx": 3, "direct_answers": ["seven", "unknown", "seven", "six", "six", "sunday", "15", "seventh", "seven", "fifteen sunday"], "difficult_direct_answer": false, "rationales": ["The day is the seventh.", "It is the seventh day of the weke.", "It is a sunday on the clock."], "image": "train2014/COCO_train2014_000000288176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270812, "question_id": "JJURuduZHKyu8Ywsh9YB8r", "question": "What is the name for the tallest building?", "choices": ["clock tower", "library", "station", "pub"], "correct_choice_idx": 0, "direct_answers": ["clock tower", "clock tower", "clock tower", "burn khalifa", "clock tower", "clock tower", "clock tower", "clock tower", "big ben", "big ben"], "difficult_direct_answer": false, "rationales": ["The tallest building is called the clock tower.", "The tower has a giant timepiece.", "The name is the clock tower."], "image": "train2014/COCO_train2014_000000270812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181766, "question_id": "JJbXVie9kEA3ZTsD2Ma4uM", "question": "What kind of 'day' is this known as to hill enthusiast?", "choices": ["powder", "puffy", "wintery", "fluffy"], "correct_choice_idx": 0, "direct_answers": ["packed", "snow day", "mountain day", "good", "powder", "snow day", "snow", "snow day", "powder", "snow"], "difficult_direct_answer": false, "rationales": ["The snow is light so it must be powdery.", "The day is powdery.", "The snow is powdery."], "image": "train2014/COCO_train2014_000000181766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427091, "question_id": "JJqqe8jBUVg5Ey5jRjkrvh", "question": "How is the child crossing the street seating making progress?", "choices": ["being pulled", "mechanized wheelchair", "being pushed", "sheer will"], "correct_choice_idx": 2, "direct_answers": ["stroller", "being pushed", "stroller", "bike", "being pushed", "being pushed", "executive car", "wheels", "crosswalk", "being pushed"], "difficult_direct_answer": false, "rationales": ["The child is being pushed in a stroller.", "There is a child pushed across from behind to cross the zebra stripes.", "The child is in the stroller and the only way for a child in a stroller to move is to be pushed."], "image": "val2014/COCO_val2014_000000427091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523637, "question_id": "JKU3Ffj7MhczBnC8egnkzU", "question": "Who brought the paper objects that are on the table to the house?", "choices": ["fire fighter", "police officer", "postal worker", "sanitation worker"], "correct_choice_idx": 2, "direct_answers": ["mail carrier", "mail carrier", "mailman", "mailman", "mailman", "mail carriers", "mailman", "kid", "youngman", "postal worker"], "difficult_direct_answer": false, "rationales": ["The postal worker did.", "The paper objects are addressed envelopes that the mailman delivered.", "Postal workers deliver mail."], "image": "val2014/COCO_val2014_000000523637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269557, "question_id": "JKi7nsNbBHqrMAGzAFxGX9", "question": "Why is the boy blocking the view of his phone?", "choices": ["visibility", "safety", "in anger", "as joke"], "correct_choice_idx": 0, "direct_answers": ["privacy", "sun", "glare", "visibility", "privacy", "sunlight", "privacy", "too bright", "from sunlight", "over lighting"], "difficult_direct_answer": false, "rationales": ["He is doing that so he can see it and no one else can.", "The boy wants privacy.", "Another person is nearby. the hand is providing privacy."], "image": "train2014/COCO_train2014_000000269557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334769, "question_id": "JKm3NpgFRGckpNk7ao9by3", "question": "What is he looking at?", "choices": ["fan", "bird", "sun", "ball"], "correct_choice_idx": 3, "direct_answers": ["tennis ball", "tennis ball", "ball", "ball", "ball", "tennis ball", "ball", "tennis ball", "ball", "tennis ball"], "difficult_direct_answer": false, "rationales": ["He's looking at the ball.", "The tennis player is aiming for the ball.", "."], "image": "val2014/COCO_val2014_000000334769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268539, "question_id": "JL2DsBNti6TH6eiDNTgKVi", "question": "Why is the skateboard in the air?", "choices": ["showing off", "fell", "broken", "bounced"], "correct_choice_idx": 0, "direct_answers": ["showing off", "doing tricks", "doing trick", "trick", "jumping trick", "ramp", "doing tricks", "preforming skate-tricks", "jumping", "skater leaping"], "difficult_direct_answer": true, "rationales": ["A man is on a skateboard as he soars in the air.", "It appears the skateboard is in the air and the person is heading over a bike. the placement of the bike at the end of a ramp looks intentional and if someone is intentionally jumping over something with their skateboard they are likely showing off.", "The man is doing a trick."], "image": "val2014/COCO_val2014_000000268539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375989, "question_id": "JLCJ9KqypsTdt8HTCjJaev", "question": "What footwear maker is advertised in the outfield?", "choices": ["new balance", "adidas", "reebok", "nike"], "correct_choice_idx": 2, "direct_answers": ["reebok", "frond food", "reebok", "reebok", "reebok", "reebok", "frond food", "reebok", "reebok", "reebok"], "difficult_direct_answer": false, "rationales": ["There are reebok footwear makers all advertised in the outfield.", "The red sign has the brand name on it.", "Reebok is the only company with advertisements here which is known for shoes."], "image": "train2014/COCO_train2014_000000375989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232432, "question_id": "JLJPwxESqiBGR7JChrSxkW", "question": "What type of vegetables are shown?", "choices": ["fruit", "berries", "roots", "flowers"], "correct_choice_idx": 2, "direct_answers": ["peppers", "nightshades", "garden", "carrots", "root", "beets/peppers", "roots", "radishes", "carrots", "carrot"], "difficult_direct_answer": true, "rationales": ["There are carrots and turnips.", "The carrots and other vegetables grow beneath the surface of the ground.", "The veggies are roots."], "image": "val2014/COCO_val2014_000000232432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280643, "question_id": "JLazEG4Frz28wprvWbkDBK", "question": "What does the truck next to the delta jet carry?", "choices": ["air", "fuel", "oil", "water"], "correct_choice_idx": 1, "direct_answers": ["compressed air", "northern air", "gas", "air", "fuel", "fuel", "luggage", "fuel", "fuel", "fuel"], "difficult_direct_answer": false, "rationales": ["There is a truck with a large tank near the plane.", "The truck has a tank to carry liquids, and loading the airplane.", "The truck is there to put fuel in it."], "image": "train2014/COCO_train2014_000000280643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256819, "question_id": "JLu5HLazBRJ6D8xBsCGjdd", "question": "What is attached to the string?", "choices": ["pizza", "donut", "cupcake", "bagel"], "correct_choice_idx": 1, "direct_answers": ["donut", "doughnut", "donut", "donut", "doughnut", "doughnut", "donut", "doughnut", "doughnut", "donut"], "difficult_direct_answer": false, "rationales": ["It is a round piece of fried dough", "Donuts are round with a hole in the middle, which looks like the pastry on the string.", "Donuts are tied to the string."], "image": "train2014/COCO_train2014_000000256819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435069, "question_id": "JLyhZackWQT68fZYX3m4ak", "question": "What kind of animal is this dog?", "choices": ["service dog", "strayed dog", "pet", "police dog"], "correct_choice_idx": 2, "direct_answers": ["mutt", "mut", "shepherd", "terrier", "canine", "dog", "dog", "pet", "mutt", "mutt"], "difficult_direct_answer": false, "rationales": ["He has a collar.", "The dog is tied to the pole and has a bandana collar, so most likely it's someone's pet.", "The dog is a pet."], "image": "val2014/COCO_val2014_000000435069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108708, "question_id": "JM2dsusvXRb8eq7pmmncYj", "question": "What type of video game is the man in black playing?", "choices": ["action", "fighting", "tennis", "field hockey"], "correct_choice_idx": 2, "direct_answers": ["wii tennis", "wii", "minecraft", "wii", "wii", "tennies", "tennis", "tennis", "tennis", "wii"], "difficult_direct_answer": false, "rationales": ["The characters on the screen are on a tennis court.", "He is playing tennis on a video game.", "The man is playing tennis on the screen."], "image": "train2014/COCO_train2014_000000108708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151655, "question_id": "JM7PVQmj5yxciFDPCWT2bL", "question": "What game is being played here?", "choices": ["basketball", "tag", "frisbee golf", "ultimate frisbee"], "correct_choice_idx": 3, "direct_answers": ["ultimate frisbee", "frisbee", "frisbee", "frisbee", "ultimate frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The girl is throwing a frisbee.", "The people are using a flying disc, not a ball. they are not trying to put the flying disc into a golf basket.", "There are no posts with metal nets around so it is a game of frisbee against another."], "image": "train2014/COCO_train2014_000000151655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345520, "question_id": "JMcW3PQAf7nowMKwBFa8Dn", "question": "What are the hats the men are wearing called?", "choices": ["baseball caps", "derby hats", "safari hats", "top hats"], "correct_choice_idx": 2, "direct_answers": ["rice farmer", "straw hats", "wide brim", "pith helmet", "mahout hat", "sun hat", "safari hats", "sombrero", "sun hat", "sun hats"], "difficult_direct_answer": true, "rationales": ["The men would wear these to go on a safari.", "The men are wearing hats on a safari.", "The hats are for safaris."], "image": "val2014/COCO_val2014_000000345520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385918, "question_id": "JNFexkVt7xiZuGTUVVUxuy", "question": "What sport are the people playing?", "choices": ["ultimate frisbee", "baseball", "football", "field hockey"], "correct_choice_idx": 0, "direct_answers": ["frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "ultimate frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["These people are playing ultimate frisbee as evidenced by the white frisbee they are trying to grab.", "By the setting and what is leaving the mans hands you can assume what they are doing.", "The people are in a field and throwing a frisbee around for fun."], "image": "val2014/COCO_val2014_000000385918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200605, "question_id": "JNLaufr4fFNDoUGYXPkVqE", "question": "In what setting does the woman walk?", "choices": ["rural", "circus", "actor's studio", "city"], "correct_choice_idx": 3, "direct_answers": ["city", "city", "city street", "sidewalk", "rain", "rainy neighborhood", "city sidewalk", "city", "urban", "city"], "difficult_direct_answer": false, "rationales": ["The woman is surrounded by buildings, cars, sidewalks, and paved streets. there are no clowns or actors.", "This looks like a business district in a large area.", "The setting is the city."], "image": "val2014/COCO_val2014_000000200605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316534, "question_id": "JNYxnZ2rUPt9MqAdLVxhcq", "question": "Which one of these would make the cows want to leave this location?", "choices": ["birds", "sailors", "seals", "hurricane"], "correct_choice_idx": 3, "direct_answers": ["water", "rain", "tsunami", "water", "sand", "no food", "high tide", "water", "hurricane", "ocean"], "difficult_direct_answer": false, "rationales": ["Hurricanes are dangerous and wildlife usually evacuate the area for these.", "The area is a tropical beach next to the ocean which is prone to storms.", "This storm would bring large waves that would drown the cows if they didn't move from this area."], "image": "val2014/COCO_val2014_000000316534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425269, "question_id": "JNeM4Uq9NCxQVtRrS7yQqJ", "question": "What are they doing?", "choices": ["eating breakfast", "resting", "enjoying scenery", "arguing"], "correct_choice_idx": 2, "direct_answers": ["enjoying view", "watching", "enjoying view", "resting", "sitting", "taking break", "sitting", "enjoying view", "enjoying scenery", "sittin"], "difficult_direct_answer": false, "rationales": ["The people are sitting on the bench and enjoying the view of the scenery.", "The people are enjoying the scenery in front of them.", "The people are taking in the view."], "image": "train2014/COCO_train2014_000000425269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138151, "question_id": "JNfraUdQHdLQHL8nwje3Ak", "question": "What city is this?", "choices": ["new york", "honolulu", "chicago", "pittsburgh"], "correct_choice_idx": 2, "direct_answers": ["chicago", "chicago", "chicago", "chicago", "chicago", "chicago", "chicago", "chicago", "chicago", "chicago"], "difficult_direct_answer": false, "rationales": ["It is the most populated city in illinois.", "A street corner has a map and directional information on it.", "The city is chicago."], "image": "val2014/COCO_val2014_000000138151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140008, "question_id": "JNjFTT68tAhEuhEYWc8PWE", "question": "Where is the school bus in relation to mirror?", "choices": ["in building", "behind", "inside", "in front"], "correct_choice_idx": 1, "direct_answers": ["behind it", "behind it", "behind", "behind", "behind", "behind", "behind", "behind it", "behind", "behind"], "difficult_direct_answer": false, "rationales": ["The mirror s shown from the back.", "The school bus is behind since this is a rearview mirror.", "The bus is behind."], "image": "train2014/COCO_train2014_000000140008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277594, "question_id": "JPCdHgbiniebAHwTPJut9Q", "question": "What one improvement could be made to this truck to make it seem more valuable?", "choices": ["new driver", "moon roof", "painting", "oil change"], "correct_choice_idx": 2, "direct_answers": ["cleaning", "paint", "paint job", "paint job", "paint job", "paint job", "paint", "painting", "paint", "paint"], "difficult_direct_answer": false, "rationales": ["Adding a fresh coating to the truck would hide all the rust stains.", "You can make the truck look cleaner by paint", "The truck is really rusted."], "image": "train2014/COCO_train2014_000000277594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403813, "question_id": "JPL5AVMNW9AWhqEqyVrEq2", "question": "What does the woman need this bag for?", "choices": ["travel", "umbrella", "gift", "laundry"], "correct_choice_idx": 0, "direct_answers": ["store items", "travel", "carry belongings", "travelling", "travel", "carrying luggage", "makeup", "working", "carrying things", "travel"], "difficult_direct_answer": false, "rationales": ["It is a small suitcase, which is customary to use when traveling outside of the home and to carry essential belongings.", "The woman is holding a small suitcase looking bag.", "This is a small suitcase designed for traveling."], "image": "train2014/COCO_train2014_000000403813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328917, "question_id": "JPZMxYEEosNmivh4fU8T6P", "question": "What has the sheep been entered in here?", "choices": ["fair", "car race", "bodybuilding contest", "beauty contest"], "correct_choice_idx": 0, "direct_answers": ["fair", "contest", "competition", "fair", "competition", "contest", "competition", "competition", "contest", "competition"], "difficult_direct_answer": false, "rationales": ["The sheep is at a fair.", "The sheep is in a county fair.", "The sheep is located in a fair contest."], "image": "train2014/COCO_train2014_000000328917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240903, "question_id": "JQ3w9vteXu6LamWyUGmimY", "question": "How will she tell her supervisor about the welfare of the animal?", "choices": ["walkie talkie", "flare", "shouting", "text message"], "correct_choice_idx": 0, "direct_answers": ["healthy", "walkie talkie", "radio", "radio", "happy", "communicating", "walkie talkies", "walkie talkie", "walkie talkie", "radio"], "difficult_direct_answer": false, "rationales": ["She has a handheld device hanging from her pocket", "The person has a walkie talkie in her pocket.", "The woman by the fence has a walkie talkie that she can use to call her supervisor about the animal's welfare."], "image": "val2014/COCO_val2014_000000240903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497207, "question_id": "JQ5zqF2ydSSTWMP2MfmauW", "question": "What is inside of the large cake with green top and bottom?", "choices": ["caramel", "nothing", "chocolate", "coconut"], "correct_choice_idx": 1, "direct_answers": ["icing", "yellow cake", "cake", "cake", "cake", "pistachios", "baby", "nothing", "key lime", "frosting"], "difficult_direct_answer": false, "rationales": ["There is nothing inside.", "The whole cake comprises only of layers of cake.", "There is nothing inside of the large cake with the green pedestal."], "image": "train2014/COCO_train2014_000000497207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454329, "question_id": "JQGdqHfWvdwnLRSjAXV2RK", "question": "How are these types of clocks called?", "choices": ["street clocks", "vintage posts", "clock towers", "post clocks"], "correct_choice_idx": 3, "direct_answers": ["analog clocks", "four", "analog", "town clock", "standing clock", "city clocks", "clock tower", "analog", "post clocks", "analog"], "difficult_direct_answer": false, "rationales": ["The clocks are outside lining the street.", "Clocks are scattered around in a town on tall poles.", "They are on tall posts."], "image": "val2014/COCO_val2014_000000454329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149115, "question_id": "JQUBQefNGkpmwDYDBqjDAZ", "question": "What former Atlanta Brave is pictured in this jersey?", "choices": ["john franco", "bruce sutter", "mark canha", "mark teixeira"], "correct_choice_idx": 3, "direct_answers": ["javier vasquez", "mark teixeira", "joc pederson", "chipper jones", "mark teixeira", "n/a", "justice", "no clue", "brian mccann", "giancarlo stanton"], "difficult_direct_answer": true, "rationales": ["A baseball player is wearing a braves uniform and swinging a bat.", "Mark teixeira is pictured above.", "The player is mark teixeira, nicknamed tex."], "image": "val2014/COCO_val2014_000000149115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210672, "question_id": "JQsmDoe7SiwBkG82zFAnmG", "question": "What is the main ingredient of this artificial sweetener?", "choices": ["maple syrup", "sucralose", "coconut", "agave"], "correct_choice_idx": 1, "direct_answers": ["ascertain", "stevia", "sucralose", "sucralose", "sweeter", "sucralose", "sucralose", "sucralose", "splenda", "aspartame"], "difficult_direct_answer": false, "rationales": ["Sucralose is the main ingredient of artificial sweetener.", "Sucralose is in it.", "There is a splenda box above the toaster oven. agave, coconut, and maple syrup are not artificial."], "image": "train2014/COCO_train2014_000000210672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282568, "question_id": "JQuF9HaETsRWJwFedZjRyc", "question": "What sport are the men playing?", "choices": ["ultimate frisbee", "soccer", "baseball", "hockey"], "correct_choice_idx": 0, "direct_answers": ["frisbee", "frisbee", "frisbee", "ultimate frisbee", "basketball", "frisbee", "frisbee", "ultimate frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The men are jumping up to get a frisbee.", "They are both trying to catch a flat circular disc, not a puck or ball.", "This game is ultimate frisbee as shown by the two players leaping for a frisbee."], "image": "train2014/COCO_train2014_000000282568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339678, "question_id": "JQxg9b5cYYDrLhnrvFsw3u", "question": "What will the person walk on to cross the street here?", "choices": ["dirt", "trolley", "crosswalk", "taxi"], "correct_choice_idx": 2, "direct_answers": ["crossing", "cross walk", "crosswalk", "crosswalk", "crosswalk", "slush", "snow", "sidewalk", "crosswalk", "snow"], "difficult_direct_answer": false, "rationales": ["The person is walking into the crosswalk in order to get to the other side safely.", "The person needs to cross using the crosswalk for pedestrians.", "There are lines painted at the intersection denoting a safe place to walk."], "image": "val2014/COCO_val2014_000000339678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489159, "question_id": "JQzE3SSjgTcajyfFktjKcJ", "question": "Which of these foods on the plate are highest in carbs?", "choices": ["mayonnaise", "sauerkraut", "bread", "meat"], "correct_choice_idx": 2, "direct_answers": ["pasta", "macaroni", "mac cheese", "bread", "potato salad", "meat sandwich", "bread", "potato salad", "potato salad", "sandwich"], "difficult_direct_answer": false, "rationales": ["The food is bread.", "Bread is mostly carbs. the others might have a little, but not like the bread.", "Bread and grains are all high in carbs and the item here that would have the most."], "image": "val2014/COCO_val2014_000000489159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33841, "question_id": "JR3a4enCS4gfQnErVcm68M", "question": "What famous scientific instrument was created by the person's name on the cargo?", "choices": ["internet", "computer", "telephone", "telescope"], "correct_choice_idx": 3, "direct_answers": ["telescope", "telescope", "galileo", "galileo", "telescope", "car", "telescope", "telescope", "telescope", "compass"], "difficult_direct_answer": false, "rationales": ["Galileo is famous for this, and lived in the time before technology such as phone, internet, or computers.", "The instrument is a telescope.", "The name is galileo, not bell, cerf, kahn, or babbage."], "image": "train2014/COCO_train2014_000000033841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45844, "question_id": "JRLNFysXEfGChMiSZMcdSw", "question": "What is the boy in the foreground doing?", "choices": ["eating lunch", "playing tennis", "cleaning park", "threatening others"], "correct_choice_idx": 1, "direct_answers": ["holding racket", "playing tennis", "playing frisbee", "playing tennis", "holding something", "waiting", "playing tennis", "playing tennis", "frisbee", "throwing frisbee"], "difficult_direct_answer": false, "rationales": ["The boy is playing tennis.", "The boy in the foreground is holding a racquet. he is standing near a net.", "He is holding a racquet and facing a boy with a racquet."], "image": "val2014/COCO_val2014_000000045844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235876, "question_id": "JRWoWa9ebf8kkNurEKGCbb", "question": "What is the man standing on the green sign doing?", "choices": ["protesting", "exercising", "photographing", "dancing"], "correct_choice_idx": 2, "direct_answers": ["filming", "videotaping", "photography", "recording video", "shooting pictures", "taking photos", "watching", "photographing", "taking pictures", "taking photo"], "difficult_direct_answer": true, "rationales": ["The man has a camera in his hand. cameras take pictures and record videos.", "The man standing on the green sign is holding a camera and taking a picture.", "He is holding a camera."], "image": "train2014/COCO_train2014_000000235876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133503, "question_id": "JRrH4ouawi6MhLEahtEfyR", "question": "What type of vehicle will be needed if this weather continues?", "choices": ["plow", "bulldozer", "convertible", "garbage truck"], "correct_choice_idx": 0, "direct_answers": ["four wheel", "snow plow", "snowplow", "snow plow", "snow plow", "snow plow", "snowmobile", "plow", "toyota avalon", "snowplow"], "difficult_direct_answer": false, "rationales": ["It is snowing. a vehicle capable of removing the snow would be needed.", "The snow will need to be removed from the roads.", "A plow truck will be needed to move the snow."], "image": "val2014/COCO_val2014_000000133503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119822, "question_id": "JRtWBAsmRirjZMjtWPGDD8", "question": "What countries flag can be seen at the front of the plane?", "choices": ["france", "germany", "united states", "italy"], "correct_choice_idx": 2, "direct_answers": ["usa", "usa", "united states", "usa", "united states", "usa", "united states", "united states", "united states", "usa"], "difficult_direct_answer": false, "rationales": ["The red, white, and blue stars and stripes of the american flag can be seen.", "The united states is known for having a red, blue, and white flag that has stars and stripes.", "There is a flag visible with white stars on a blue corner and interchanging red and white stripes which is known to be the american flag."], "image": "train2014/COCO_train2014_000000119822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244614, "question_id": "JS6KEyLsQufTqWJd73uFj7", "question": "What state does this team come from?", "choices": ["texas", "delaware", "new york", "new jersey"], "correct_choice_idx": 0, "direct_answers": ["texas", "texas", "texas", "texas", "new jersey", "texas", "new jersey", "texas", "new jersey", "texas"], "difficult_direct_answer": false, "rationales": ["The aggies represent an a&m college that is located in the south.", "The player in the middle is wearing an aggies jersey. this college is based in college station.", "The state is texas."], "image": "train2014/COCO_train2014_000000244614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163229, "question_id": "JSPb5aLDgBooiB24KYivqF", "question": "Which one of these will be useful after dinner is finished?", "choices": ["oil", "pepper", "vaseline", "baking soda"], "correct_choice_idx": 3, "direct_answers": ["pie", "pie", "baking soda", "pie", "tray", "pie", "dessert", "pie", "both", "dessert"], "difficult_direct_answer": false, "rationales": ["Baking soda would be a useful addition to the dinner, instead of the rest of all options.", "The baking soda is useful.", "This can be used for heartburn"], "image": "train2014/COCO_train2014_000000163229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478621, "question_id": "JShRYPTm2yDm7F6g9Qzvnh", "question": "If you left a bucket out here what would you most likely get?", "choices": ["fish", "donations", "nothing", "rain water"], "correct_choice_idx": 3, "direct_answers": ["bucket water", "water", "rain water", "rain", "water", "water", "water", "water", "rain water", "water"], "difficult_direct_answer": false, "rationales": ["The bucket would get rain.", "There are not many people here due to the weather, so you would not get donations. there are no fish here.", "It is a downpour so the bucket would get filled."], "image": "val2014/COCO_val2014_000000478621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419212, "question_id": "JSuD9v4ZdP8VGMzt7qMNaK", "question": "World surf league is the highest governing body of which sport?", "choices": ["kiting", "swimming", "skating", "surfing"], "correct_choice_idx": 3, "direct_answers": ["surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The word is in the name", "A organization of surfing would likely be the governing body of the sport of surfing and not any other sport.", "The man is holding a surfboard."], "image": "train2014/COCO_train2014_000000419212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203110, "question_id": "JT2PMYczvXUd6aMTgfe9Yp", "question": "What is the bus type shown in picture?", "choices": ["coach", "none", "single decker", "double decker"], "correct_choice_idx": 2, "direct_answers": ["trolley", "trolley", "streetcar", "cable car", "streetcar", "single decker", "trolley", "trolley", "street car", "old"], "difficult_direct_answer": false, "rationales": ["This vehicle's track and wires identify it as a trolley rather than any kind of bus.", "It only has one level.", "There is a single decker bus shown in the foreground."], "image": "val2014/COCO_val2014_000000203110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256852, "question_id": "JTaso7MLr8MPcUNyVtoc6G", "question": "In which location does this bus run?", "choices": ["rural", "city", "suburbs", "farm"], "correct_choice_idx": 1, "direct_answers": ["washington street", "north", "city street", "city", "london", "san diego", "street", "downtown", "city", "street"], "difficult_direct_answer": false, "rationales": ["There are big buildings and a lot of people in the street.", "It looks like it runs in the city, through the business area.", "Buses always run where there are the most people so it would be in an urban area."], "image": "val2014/COCO_val2014_000000256852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351477, "question_id": "JTbDacA2Btf5Cea6tWSLan", "question": "Where are the adults probably?", "choices": ["nearby", "another state", "in lounge", "in home"], "correct_choice_idx": 0, "direct_answers": ["watching kids", "ahead/on left", "snow", "ahead", "photographing kids", "nearby", "taking picture", "watching", "taking photo", "in back"], "difficult_direct_answer": true, "rationales": ["These are all kids in the photo, but someone had to have taken the picture. it is likely that an adult is behind the camera, as they're kind of in a desolate region and require supervision; also noteworthy is the height at which the photo has been taken, meaning well above the heads of the small children.", "The adults are likely watching the children but are not in viw.", "These are small children who need supervision"], "image": "val2014/COCO_val2014_000000351477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561962, "question_id": "JU66Yuk2oBXkPq9KPx8wH8", "question": "What is under the clock?", "choices": ["garbage bag", "brick square", "scales", "bird"], "correct_choice_idx": 1, "direct_answers": ["pole", "pedestal", "stand", "pole", "brick square", "pedestal", "shrubs", "pole", "pole", "bushes"], "difficult_direct_answer": false, "rationales": ["The clock is easy to see so the answer is easy to figure out.", "It's a pedestal that is square shaped and has red bricks", "The other options aren't in this image. it's actually the base."], "image": "train2014/COCO_train2014_000000561962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49777, "question_id": "JUHADH28bdacoruZbMhVZr", "question": "What type of vehicle are the people boarding?", "choices": ["private", "rental", "commercial", "bicycle"], "correct_choice_idx": 2, "direct_answers": ["bus", "bus", "commercial", "bus", "bus", "tour bus", "bus", "elderly", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["The other options are unlikely given the setting. that said, it might be a mix of a and b.", "Commercial vehicles can accommodate many passengers like this one can.", "They are getting ready to board a commercial bus"], "image": "val2014/COCO_val2014_000000049777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460228, "question_id": "JUHL8Miuxcig2ZGGc8taR8", "question": "What happened to the sandwich along the edge?", "choices": ["split half", "glued together", "broken", "melted"], "correct_choice_idx": 0, "direct_answers": ["burnt", "eaten", "burnt", "cut", "cut", "split half", "cut", "cut", "cut", "cut"], "difficult_direct_answer": false, "rationales": ["This sandwich is in two pieces which means it was cut apart at some point.", "The sandwich has been cut into two with a knife which is traditionally how a restaurant would serve a sandwich.", "The edge of the bread is not shown."], "image": "train2014/COCO_train2014_000000460228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309060, "question_id": "JUN3oDNMmnmvobRSvnn6pf", "question": "What could block the washer's door from opening?", "choices": ["stove", "stool", "window", "sink"], "correct_choice_idx": 0, "direct_answers": ["oven", "oven", "stove", "stool", "stove", "stove", "stovetop", "oven", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["The stove and washer are close together.", "The stove could block.", "The washer door is right in front of the range preventing it from opening."], "image": "train2014/COCO_train2014_000000309060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70346, "question_id": "JUPcbCqzphguVas8WTJJzY", "question": "What shape is the outer edge of the sink?", "choices": ["polygon", "hexagon", "octagon", "heptagon"], "correct_choice_idx": 1, "direct_answers": ["white", "octagon", "octagonal", "rounded", "hexagon", "straight", "triangle", "pentagonal", "heptagon", "hexagon"], "difficult_direct_answer": true, "rationales": ["The sink has 6 edges.", "It has 6 sides", "The sink has six sides."], "image": "train2014/COCO_train2014_000000070346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89770, "question_id": "JURXBoAUyNfVUHHX86B7mN", "question": "Why are the boards there?", "choices": ["holds vehicles", "always there", "sheds rain", "fell truck"], "correct_choice_idx": 2, "direct_answers": ["boardwalk", "pier", "for walking", "walk on", "boardwalk", "sheds rain", "walking", "support pier", "boardwalk", "create walkway"], "difficult_direct_answer": false, "rationales": ["The boards are used to let the rain go through.", "It is good drainage for storms.", "The boards are made to shield rain."], "image": "train2014/COCO_train2014_000000089770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406253, "question_id": "JV5JTdPpssfXHV7ZMAbuKN", "question": "What holds the scooter up when it's parked?", "choices": ["gearshift", "kickstand", "parking rack", "brake pedal"], "correct_choice_idx": 1, "direct_answers": ["stand", "kickstand", "kickstand", "stand", "stand", "kickstand", "stand", "kickstand", "kickstand", "kickstand"], "difficult_direct_answer": false, "rationales": ["There is a small piece of metal at the bottom of the scooter that is propped against the pavement to stabilize it.", "A metal part folds down to balance the scooter", "Because it has 2 wheels, it is hard to stand on its own, so the stand keeps it balanced between the 2 wheels."], "image": "val2014/COCO_val2014_000000406253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64733, "question_id": "JVNBqqqRDn5P6FRWLEqiP9", "question": "The item in the medium plate is often eaten with what?", "choices": ["ketchup", "mustard", "tomato sauce", "applesauce"], "correct_choice_idx": 2, "direct_answers": ["rolls", "tomato sauce", "sauce", "sauce", "sauce", "pizza", "soup", "fork", "salad", "food"], "difficult_direct_answer": false, "rationales": ["Pizza typically has tomato sauce on it.", "Several pizzas are shown and use a red sauce for main ingredient.", "Noodles are usually put with spaghetti sauce to form a type of spaghetti."], "image": "train2014/COCO_train2014_000000064733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470663, "question_id": "JVZ7vjGaE9rTYmd9YVFSwn", "question": "What is making the man's pocket pop up?", "choices": ["billiard balls", "meat balls", "bouncy balls", "tennis balls"], "correct_choice_idx": 3, "direct_answers": ["ball", "boner", "tennis ball", "tennis ball", "wallet", "ball", "tennis balls", "tennis balls", "tennis ball", "phone"], "difficult_direct_answer": false, "rationales": ["The people are holding rackets.", "They are playing tennis.", "The pocket has tennis balls."], "image": "train2014/COCO_train2014_000000470663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430750, "question_id": "JVoBX4hGatLYhgWamDyEV4", "question": "Why is the man near the plane wearing a yellow vest?", "choices": ["punishment", "visibility", "to hide", "fashion"], "correct_choice_idx": 1, "direct_answers": ["increased visibility", "be seen", "visibility", "safety visibility", "safety", "identify himself", "safety", "visibility", "safety", "airport worker"], "difficult_direct_answer": false, "rationales": ["So he can be easily seen.", "He's an employee and needs to be seen by vehicles", "The man near the plane is wearing a yellow vest that has reflectors on it so he will not be crashed into."], "image": "val2014/COCO_val2014_000000430750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191298, "question_id": "JVr8XpX5HZXnHgS3uJAQn4", "question": "What are these creatures doing?", "choices": ["driving", "eating", "swimming", "flying"], "correct_choice_idx": 1, "direct_answers": ["resting", "eating", "grazing", "grazing", "eating", "eating", "wandering", "grazing", "grazing", "eating"], "difficult_direct_answer": false, "rationales": ["They are all eating.", "Their heads are up in the trees eating leaves. they are grazing grass.", "There are some odd looking animals grazing on grass."], "image": "train2014/COCO_train2014_000000191298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498228, "question_id": "JWJt7US9wnmPgefbrwB4TX", "question": "What is the name for the large birds near the shore?", "choices": ["swans", "ducks", "pigeons", "pelicans"], "correct_choice_idx": 3, "direct_answers": ["seagull", "egret", "pelican", "pelicans", "pelicans", "storks", "pelican", "pelican", "seagulls", "pelican"], "difficult_direct_answer": false, "rationales": ["They have long beaks with a pouch for holding fish.", "The large birds have huge beaks.", "The name is a pelican."], "image": "val2014/COCO_val2014_000000498228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493659, "question_id": "JWPjMhNxXPGjqffYZVXhXm", "question": "What is the man wearing on his head?", "choices": ["swim cap", "fedora", "baseball hat", "helmet"], "correct_choice_idx": 1, "direct_answers": ["hat", "hat", "hipster hat", "fedora", "hat", "hat", "fedora", "fedora", "fedora", "hat"], "difficult_direct_answer": false, "rationales": ["The hat has a short brim and a crease in the top.", "The hat has a red ribbon.", "The man is wearing a fedora cap."], "image": "train2014/COCO_train2014_000000493659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441458, "question_id": "JWZ9WfyK99ziqVE2TNsGw9", "question": "What is the tool used to cut a dog's nails?", "choices": ["clipper", "buzzer", "snapper", "tweezers"], "correct_choice_idx": 0, "direct_answers": ["clipper", "clippers", "clipper", "clipper", "nail clipper", "clippers", "clipper", "cutter", "clipper", "clipper"], "difficult_direct_answer": false, "rationales": ["Dogs nails are cut with special nail clippers.", "Clippers can make nails shorter.", "Clippers are universally used to trim and cut nails for most animals and people."], "image": "val2014/COCO_val2014_000000441458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30508, "question_id": "JWgzcwjxYZapn8xAVR7TBa", "question": "What must people refrain from doing for the safety of the animals?", "choices": ["eat them", "shoot them", "feed them", "pet them"], "correct_choice_idx": 2, "direct_answers": ["feed them", "feeding them", "feed", "feeding", "feeding them", "speeding", "hunting", "driving", "feeding", "driving"], "difficult_direct_answer": false, "rationales": ["If one were to shoot an animal that would harm them and violate their safety.", "Cars are parked near animals gathered by the street. people sometimes avoid feeding wild animals.", "The animals next to the van are wild animals and it would be dangerous to feed them."], "image": "train2014/COCO_train2014_000000030508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235189, "question_id": "JWwY5M65PZ6WDzCSB2sXXt", "question": "What would you likely put in the thing on the ground that looks like garbage?", "choices": ["hamburger", "dollar bills", "toys", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "water", "trashcan", "water", "kite", "water", "water", "trashcan"], "difficult_direct_answer": false, "rationales": ["The water is in the garbage.", "There is a plastic water bottle on the ground.", "The plastic bottle on the ground is used to contain water."], "image": "train2014/COCO_train2014_000000235189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54692, "question_id": "JX7Su4fQpt9Bd6DvwTxwjD", "question": "What item other than the eyeglasses is upside down on the table?", "choices": ["statue", "flower", "glass", "cat"], "correct_choice_idx": 2, "direct_answers": ["liquor glasses", "flower pots", "wine glasses", "wine glasses", "wine glasses", "glass", "wine glasses", "glasses", "glasses", "wineglass"], "difficult_direct_answer": false, "rationales": ["The item is glass.", "A see thru wine glass is upside down and no drink in it.", "There are upside down wine glasses on the table."], "image": "train2014/COCO_train2014_000000054692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196483, "question_id": "JXAmdWSkJKY9wbMhyLZFK2", "question": "The persons here are having what?", "choices": ["party", "wake", "baby", "complaint"], "correct_choice_idx": 0, "direct_answers": ["fun", "party", "party", "party", "party", "party", "fun", "game", "party", "fun"], "difficult_direct_answer": false, "rationales": ["The excited position of the woman in the background and colorful streamers through the room mean this is probably a celebration of some sort.", "Games are usually played when groups of people get together for a celebration.", "They appear to be having fun, and the white controller is for a video game which is commonly used in a socializing setting."], "image": "val2014/COCO_val2014_000000196483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443764, "question_id": "JXJS435JKJ59xhn6A45K6S", "question": "Where was this bus before it went into the water?", "choices": ["field", "harbor", "road", "dock"], "correct_choice_idx": 2, "direct_answers": ["roosevelt square", "road", "street", "road", "land", "road", "land", "road", "riverside", "shore"], "difficult_direct_answer": false, "rationales": ["The bus can also go on the road.", "The bus was likely at a dock because that is how the many passengers would have boarded.", "This is a submersible bus that is driven from the road into the water."], "image": "train2014/COCO_train2014_000000443764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11025, "question_id": "JXK2pDvGRWYAhSLtCd3qkz", "question": "What is the brand of sport shoes worn by the man who is performing on the skateboard?", "choices": ["vans", "nike", "dc", "adidas"], "correct_choice_idx": 1, "direct_answers": ["tennis", "nike", "nike", "nike", "tennis", "nike", "nike", "tennis", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["The brand is nike.", "The side of the shoes has the nike swoosh, which is a copyrighted branding of nike only.", "The shoes have the trademark swoosh on the side of them."], "image": "train2014/COCO_train2014_000000011025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434159, "question_id": "JXKvVAdXS4EteCXHYSmc3U", "question": "What instance of building is shown in the image?", "choices": ["tourist spot", "government building", "university", "castle"], "correct_choice_idx": 0, "direct_answers": ["gothic", "castle", "wall", "neon", "olden version", "photograph", "castle", "castle", "tourist spot", "castle"], "difficult_direct_answer": false, "rationales": ["This is a tourist castle.", "The tourist area is being shown", "This medieval castle is part of a tourism location."], "image": "val2014/COCO_val2014_000000434159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315751, "question_id": "JYVxyoGk48hd7Mx7xbSjwT", "question": "What winter sport equipment are the people holding?", "choices": ["luge", "snowboard", "curling", "skiis"], "correct_choice_idx": 3, "direct_answers": ["skiis", "skis", "skiing", "skis", "ski gear", "ski/poles", "skiing", "skis", "skis", "skis"], "difficult_direct_answer": false, "rationales": ["The winter sport equipment is skis.", "They have skis.", "Skis are made up of long poles."], "image": "train2014/COCO_train2014_000000315751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163167, "question_id": "JYaGaBxBzSmUerUvJwDyqw", "question": "Where is this person located?", "choices": ["dentist office", "church", "doctor's office", "home"], "correct_choice_idx": 3, "direct_answers": ["kitchen room", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen room", "home", "kitchen", "kitchen", "kitchen room"], "difficult_direct_answer": false, "rationales": ["This person is in their kitchen.", "They are in a kitchen of a building", "The decor looks to be of the size and style of a home kitchen and lacks professional grade equipment. home kitchens are found in answer a."], "image": "val2014/COCO_val2014_000000163167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497494, "question_id": "JYhNzthUNgrjzuXqYJvVc4", "question": "How are these men related?", "choices": ["lovers", "arch villians", "siblings", "enemies"], "correct_choice_idx": 0, "direct_answers": ["father son", "paternal", "father son", "married", "couple", "lovers", "lovers", "same family", "husbands", "parent child"], "difficult_direct_answer": false, "rationales": ["The men are lovers.", "Two men are comfortable in bed watching television and laying with a dog.", "These men are related as lovers."], "image": "train2014/COCO_train2014_000000497494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192400, "question_id": "JYwnSW6W359LCMyVHgkQkZ", "question": "What cap/apparel company is a sponsor for the stadium?", "choices": ["new era", "adidas", "nike", "cub"], "correct_choice_idx": 0, "direct_answers": ["new era", "new era", "new era", "steel", "new era", "new era", "new era", "block white", "new era", "new erafits"], "difficult_direct_answer": false, "rationales": ["New era fits is shown as a logo.", "You can tell by the logo on the sidelines as to what company is sponsoring the stadium.", "New era has a banner in the background, which is a hat company."], "image": "train2014/COCO_train2014_000000192400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339965, "question_id": "JYxbZxhBrbxvzDJJbuaLaT", "question": "Why is he wearing a crown?", "choices": ["is cold", "confused", "wants attention", "his birthday"], "correct_choice_idx": 3, "direct_answers": ["birthday boy", "his birthday", "birthday", "birthday kid", "birthday", "his birthday", "his birthday", "his birthday", "his birthday", "birthday"], "difficult_direct_answer": false, "rationales": ["There is a cake with candles sitting in front of him.", "He's in front of a cake with lit candles", "A kid is sitting in front of a birthday cake with a paper crown on. kids wear hats and crowns at birthday parties."], "image": "val2014/COCO_val2014_000000339965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403427, "question_id": "JZVBw8g3TCkDWwrsLFFo7n", "question": "How old is the girl at the table?", "choices": ["7 years", "8 years", "5 years", "6 years"], "correct_choice_idx": 3, "direct_answers": ["five", "six", "five", "five", "six", "6 years", "cake", "seven", "five", "six"], "difficult_direct_answer": false, "rationales": ["The girl is six.", "You can tell by the number of candles on the cake.", "Candles on a cake normally indicate that it is someone's birthday. a long time tradition is that the number of candle equate to the age of the person. in this case 6 candles equal to 6 years old."], "image": "train2014/COCO_train2014_000000403427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129663, "question_id": "JZb3MhB7b3BSuXC4uRzqXp", "question": "What style of tennis is being played here?", "choices": ["men's doubles", "womens doubles", "mixed doubles", "men's singles"], "correct_choice_idx": 2, "direct_answers": ["professional", "doubles", "mens", "doubles", "mixed doubles", "doubles", "mixed doubles", "doubles", "professional", "doubles"], "difficult_direct_answer": false, "rationales": ["A man and woman playing together is referred to by such a term.", "Tennis is being played as a doubles sport.", "A man and a woman are playing on the same side of a court."], "image": "val2014/COCO_val2014_000000129663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211325, "question_id": "JZbD8Yrm9BKUTgAniWbBSx", "question": "In which country modern surfing has been initiated?", "choices": ["hawaii", "canada", "china", "taiwan"], "correct_choice_idx": 0, "direct_answers": ["usa", "usa", "united states", "hawaii", "polynesia", "hawaii usa", "american", "australia", "australia", "america"], "difficult_direct_answer": false, "rationales": ["The country is hawaii.", "Surfing comes from hawaii.", "That country was famous for starting the sport before being part of the us."], "image": "train2014/COCO_train2014_000000211325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363181, "question_id": "JZhn3wgoCXLwVjyNAfzNsa", "question": "What century does this picture depict?", "choices": ["tenth", "fourth", "nineteenth", "twenty first"], "correct_choice_idx": 3, "direct_answers": ["20th century", "2000s", "twenty first", "modern century", "twenty first", "recent century", "twenty first", "21st", "21st", "21st"], "difficult_direct_answer": false, "rationales": ["One man has a cell phone", "The presence of modern day clothes and cell phones dates this image to modern times.", "The image has modern clothing."], "image": "val2014/COCO_val2014_000000363181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435075, "question_id": "JZx8yqP4bLaZ49sbTd79Wb", "question": "Which flower blends best with its leaves?", "choices": ["violet flower", "green flower", "pink flower", "yellow flower"], "correct_choice_idx": 1, "direct_answers": ["roses", "green", "rose", "mums", "roses", "lime colored", "green", "green", "green flower", "chrysanthemum"], "difficult_direct_answer": false, "rationales": ["It's a lighter tone, but still matches/blends better than the other options.", "The leaves are green, any other color would not blend but would contrast with these.", "The yellow, violet, and pink flowers have different colors than their leaves. the other flower has the same color for its flowers and leaves."], "image": "val2014/COCO_val2014_000000435075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494341, "question_id": "JZy4ik8WgURWjiKwHGHEPJ", "question": "What part of the woman is hanging over the left side of the couch?", "choices": ["hair", "arm", "knee", "ear"], "correct_choice_idx": 1, "direct_answers": ["arm", "arm", "arm", "arm", "arm", "arm", "arm", "arm", "arm", "arm"], "difficult_direct_answer": false, "rationales": ["The woman's arm is draped over the couch.", "The woman is laying on the couch and her arm is hanging off of the left side.", "She is lying down. her knees, hair, and ears are above the couch."], "image": "train2014/COCO_train2014_000000494341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155299, "question_id": "Ja4UPKycP3UUSizeNGQfBH", "question": "What is the large silver object to the left used to store?", "choices": ["animals", "clothing", "toys", "food"], "correct_choice_idx": 3, "direct_answers": ["food", "perishable food", "cold foods", "light", "food", "food", "perishable foods", "food", "food", "food"], "difficult_direct_answer": false, "rationales": ["It is a fridge.", "A refrigerator keeps our food cold or frozen until we are ready to use it.", "The large silver object on the left is a refrigerator. most people store food in these."], "image": "train2014/COCO_train2014_000000155299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537720, "question_id": "JaE7LZs4tvmtFRyaJKrbwZ", "question": "What country could this elephant come from?", "choices": ["botswana", "tanzania", "zimbabwe", "myanmar"], "correct_choice_idx": 3, "direct_answers": ["africa", "africa", "africa", "india", "thailand", "africa", "myanmar", "india", "india", "asia"], "difficult_direct_answer": false, "rationales": ["The country is myanmar.", "The asian elephant is from myanmar.", "Elephants with small ears are native to asia."], "image": "train2014/COCO_train2014_000000537720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341132, "question_id": "Jab2RcpXBrRmXYRp6nebdF", "question": "Why is the woman with gray hair leaning towards the table?", "choices": ["hiding", "resting", "sitting down", "blowing candles"], "correct_choice_idx": 3, "direct_answers": ["blowing candles", "maintain balance", "looking", "blowing candles", "candles", "blowing candles", "blowing candles", "blowing candles", "blowout candles", "admiring cake"], "difficult_direct_answer": false, "rationales": ["A woman is standing over candles on a cake. people blow out candles.", "The woman with the gray hair is leaning towards the table to blow out the candles on the birthday cake.", "The woman is leaning over a birthday cake, which is usually done to blow out the candles."], "image": "val2014/COCO_val2014_000000341132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270315, "question_id": "JacZhirGLVjNJtY6UpvuKc", "question": "Based on the dog's short legs what is it's most likely breed?", "choices": ["chihuahua", "husky", "dachsund", "corgi"], "correct_choice_idx": 3, "direct_answers": ["corgi", "dachshund", "corgi", "wiener dog", "corgi", "corgi", "pooch", "shepherd", "corgi", "dachshunds"], "difficult_direct_answer": false, "rationales": ["A black short and squatty do with pointy ears is in the middle of catching a frisbee.", "Corgis are short and stocky.", "That is the only breed in the list that has short legs."], "image": "train2014/COCO_train2014_000000270315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144715, "question_id": "Jb25qt4h2zkd2uVcLV6hcW", "question": "What carbon-based mineral powers the engine?", "choices": ["coal", "wood", "pentane", "steam"], "correct_choice_idx": 0, "direct_answers": ["coal", "coal", "coal", "coal", "steam", "coal", "coal", "coal", "coal", "coal"], "difficult_direct_answer": false, "rationales": ["Trains like this were made to run on coal. the other options aren't as good.", "These are coal powered engines because of the black smoke.", "The carbon based mineral is coal."], "image": "val2014/COCO_val2014_000000144715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186308, "question_id": "Jb4hMTZeph3QJKqWcLYGK7", "question": "What part of the flight is the AeroMexico plane in?", "choices": ["landing", "loading", "taxiing", "storage"], "correct_choice_idx": 0, "direct_answers": ["usa", "take off", "departing", "takeoff", "landing", "takeoff", "front", "front", "take off", "take off"], "difficult_direct_answer": false, "rationales": ["The aeromexico flight has its landing gear out and it is just about to land on the runway.", "The plane is partially lifted off the ground. a plane is only partially lifted off the ground in this manner when taking off or landing.", "This airplane is in landing or takeoff."], "image": "train2014/COCO_train2014_000000186308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369504, "question_id": "JbeCbjpe472of8zs7ZpJnV", "question": "What kind of facility can be found nearby?", "choices": ["fish pond", "bike trail", "hiking trail", "picnic area"], "correct_choice_idx": 1, "direct_answers": ["rest stop", "parking lot", "biking", "resort", "cycle shop", "restaurant", "bike renting", "national park", "bike trail", "bike club"], "difficult_direct_answer": true, "rationales": ["There are many bikes there to ride around on trails and biking paths.", "There are tons of bikes out.", "There are many bikes in the compoud."], "image": "train2014/COCO_train2014_000000369504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10785, "question_id": "Jc2gvs7NzxTVhjugAAWS4R", "question": "What does the girl dine on?", "choices": ["broccoli", "cauliflower", "carrots", "beef"], "correct_choice_idx": 0, "direct_answers": ["broccoli", "brocolli", "broccoli", "brocolli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "brocolli"], "difficult_direct_answer": false, "rationales": ["The girl is chewing on broccoli.", "A broccoli has green surface.", "The girl is biting into a big piece of broccoli."], "image": "val2014/COCO_val2014_000000010785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536605, "question_id": "JcHrzCdqbXCKY4NNpZ7Yxv", "question": "Where is the woman getting hair cut?", "choices": ["barbershop", "salon", "school", "home"], "correct_choice_idx": 3, "direct_answers": ["back", "salon", "above shoulders", "home", "salon", "beauty shop", "back", "barber", "beauty salon", "salon"], "difficult_direct_answer": false, "rationales": ["The woman is at home.", "The woman is getting her hair cut at home.", "Her mom is cutting her hair."], "image": "val2014/COCO_val2014_000000536605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240137, "question_id": "JcMdUiKVP3U9b8t9iwT2pW", "question": "Where was the frisbee invented?", "choices": ["greece", "pakistan", "turkey", "america"], "correct_choice_idx": 3, "direct_answers": ["bridgeport connecticut", "america", "usa", "connecticut", "america", "bridgeport connecticut", "italy", "connecticut", "connecticut", "bridgeport connecticut"], "difficult_direct_answer": false, "rationales": ["The frisbee was invented in bridgeport, ct, where william frisbie opened the frisbie pie company in 1871. students from nearby universities would throw the pie pans around yelling \"frisbie!!\", and about a century later, playing \"frisbee\" became a national pastime.", "That sport was made in that country.", "Specifically, it was ct."], "image": "val2014/COCO_val2014_000000240137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464286, "question_id": "JcawEBvVqNdjGcepyt9uFb", "question": "Which way is the black cow with yellow tag facing?", "choices": ["west", "forward", "south", "down"], "correct_choice_idx": 0, "direct_answers": ["right", "right", "upward", "right", "right", "up", "right", "west", "up", "down"], "difficult_direct_answer": false, "rationales": ["The cow with the tag is probably facing west since the sun is in that direction.", "The cow is facing right.", "His head is through the bars so he can eat."], "image": "val2014/COCO_val2014_000000464286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308265, "question_id": "Jchn4zBpAEaDEBcsLUQkFs", "question": "What type of lampshade is on the lamp?", "choices": ["fabric", "tiffany style", "clear glass", "fringe"], "correct_choice_idx": 1, "direct_answers": ["stained glass", "stained glass", "tiffany", "fan", "bowl", "glass", "tiffany style", "vintage", "glass", "umbrella shade"], "difficult_direct_answer": false, "rationales": ["The other options don't match this style.", "The lampshade is made of stained glass.", "A stained glass lamp shade was a symbol of excellence from the famous manufacturer back in the day."], "image": "train2014/COCO_train2014_000000308265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275380, "question_id": "JcvypHeXSfJwMR5z4vQyPn", "question": "What is the small blue cart used for?", "choices": ["luggage", "painting lines", "fuel", "repositioning plane"], "correct_choice_idx": 3, "direct_answers": ["airplane", "luggage transportation", "repositioning plane", "luggage", "fueling", "luggage", "luggage", "pulling", "touring people", "pull airplane"], "difficult_direct_answer": false, "rationales": ["The blue cart helps drag the plane.", "The car is tied to the plane to move it.", "The cart is for the plane."], "image": "train2014/COCO_train2014_000000275380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578651, "question_id": "JcwRSysRVkY6JCqSgzLu7T", "question": "What is the black oval-shaped object mounted above the sink?", "choices": ["hand dryer", "paper holder", "soap dispenser", "air purifier"], "correct_choice_idx": 2, "direct_answers": ["soap dispenser", "dispense soap", "soap", "soap dispenser", "soap", "toilet", "towel holder", "soap dispenser", "soap dispenser", "soap dispenser"], "difficult_direct_answer": false, "rationales": ["This is the common use for these types of dispensers.", "The device is used for soap.", "The item dispenses soap near the sink."], "image": "train2014/COCO_train2014_000000578651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305101, "question_id": "JdJDy9fdEERvinVpaNPnCk", "question": "There are a line of cars at the intersection because of what reason?", "choices": ["emergency vehicle", "traffic light", "approaching train", "traffic jam"], "correct_choice_idx": 1, "direct_answers": ["oncoming train", "railroad crossing", "traffic", "traffic light", "traffic jam", "train", "train crossing", "train coming", "train crossing", "train"], "difficult_direct_answer": false, "rationales": ["The light here is red. you stop on red.", "The light is going for a train to be crossing so the cars are waiting.", "The cars are in traffic."], "image": "val2014/COCO_val2014_000000305101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360877, "question_id": "JdT3AsTaXM7yTjnFUwAJYA", "question": "What is used to weigh the produce before purchasing?", "choices": ["tape measure", "hands", "price tag", "scale"], "correct_choice_idx": 3, "direct_answers": ["scale", "scale", "scale", "scale", "scale", "scale", "weighing machines", "scale", "scale", "weighing machines"], "difficult_direct_answer": false, "rationales": ["Fruit is often sold by weight, which is measured with a scale, and this is a fruit store with an evident scale on premises.", "The produce is weight on a scale before purchasing.", "The scale is used."], "image": "val2014/COCO_val2014_000000360877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556823, "question_id": "JdiEiGKKwRq6RJnPEURVFL", "question": "What type of red sliced topping is on the pizza?", "choices": ["pepper", "olive", "pepperoni", "mushroom"], "correct_choice_idx": 0, "direct_answers": ["pepper", "bellpepper", "pepper", "red pepper", "pepper", "pepper", "pepper", "peppers", "peppers", "red pepper"], "difficult_direct_answer": false, "rationales": ["The only sliced topping on the pizza which is red comes from a pepper.", "The slices are light red like bell pepper.", "The topping is a pepper."], "image": "train2014/COCO_train2014_000000556823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48354, "question_id": "JdnocfadRDekAHAxomLDoT", "question": "Who was a famous player for this team?", "choices": ["bob orton", "karl malone", "jose reyes", "otis nixon"], "correct_choice_idx": 2, "direct_answers": ["keith hernandez", "tom seaver", "dwight golden", "dwight golden", "darrell strawberry", "carlos beltran", "tom seaver", "jose reyes", "mike piazza", "joe dimaggio"], "difficult_direct_answer": false, "rationales": ["The person is a baseball player, not a wrestler or basketball player. he is wearing a mets, not atlanta braves, jersey.", "Jose reyes was a famous baseball player who played for the mlb which is the professional baseball league in the us and canada.", "He is a baseball player, not wrestler or basketball player, who plays for the mets."], "image": "train2014/COCO_train2014_000000048354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23121, "question_id": "JdphmJvnYah4zSxgcFrrNo", "question": "What is bouncing on the floor?", "choices": ["tennis ball", "marble", "jumping bean", "egg"], "correct_choice_idx": 0, "direct_answers": ["tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball"], "difficult_direct_answer": false, "rationales": ["It's the only one of the choices that you'd find on a tennis court.", "The ball is bouncing.", "The ball is bouncing."], "image": "val2014/COCO_val2014_000000023121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477537, "question_id": "JdwvV8LtNMQHJDXn7GX4FT", "question": "Where is this train located?", "choices": ["museum", "bridge", "country", "tunnel"], "correct_choice_idx": 0, "direct_answers": ["museum", "museum", "germany", "museum", "museum", "parking lot", "train station", "outdoor museum", "europe", "museum"], "difficult_direct_answer": false, "rationales": ["The train is not on a track and it has a rope around it to protect it from people and there are people looking at it.", "It has ropes around it so people can see it but not touch it.", "The train is on display for people to look at."], "image": "train2014/COCO_train2014_000000477537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304587, "question_id": "JdxGLf8sSXix96U29agXNB", "question": "The electrical outlets in the room are following the electrical standards of which country?", "choices": ["united states", "germany", "united kingdom", "italy"], "correct_choice_idx": 2, "direct_answers": ["uk", "usa", "united kingdom", "usa", "united kingdom", "usa", "usa", "united states", "america", "usa"], "difficult_direct_answer": false, "rationales": ["The uk has electrical outlets that have this number of holes.", "The electrical outlets in the wall are the ones that are found in the united kingdom.", "The electrical outlets are for the uk."], "image": "train2014/COCO_train2014_000000304587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136539, "question_id": "JdxnpvFoPaZTTvFgbYYjG4", "question": "Why are they in this enclosed case?", "choices": ["anti-theft", "disinfect", "defrost", "keep cold"], "correct_choice_idx": 3, "direct_answers": ["for sale", "keep cold", "keep cold", "need refrigeration", "fridge", "keep cold", "keep cold", "keep cold", "need refrigeration", "drinks"], "difficult_direct_answer": false, "rationales": ["These a beverages which are ideally served cold, so they are kept in this fridge to preserve them.", "The drinks need to be cold.", "Drinks at the store need to be kept at a low temperature so people can enjoy them immediately after buying them. this case is like a \"store fridge\" with the see-through glass."], "image": "train2014/COCO_train2014_000000136539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52957, "question_id": "Je5CbT55P5d9cxhMhp8qRe", "question": "Where is the child practicing?", "choices": ["football field", "hockey rink", "turf field", "batting cage"], "correct_choice_idx": 3, "direct_answers": ["batting cage", "batting range", "field", "batting cage", "baseball", "swinging bat", "cage", "batting cage", "batting cage", "batting cage"], "difficult_direct_answer": false, "rationales": ["The other options dono't match this sport or the setting.", "Baseball players often practice at batting cages and they have nets around them to catch the balls like in the picture.", "He is hitting a baseball while surrounded by nets."], "image": "train2014/COCO_train2014_000000052957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455156, "question_id": "Je8LptCT23C4adB4287jJZ", "question": "Where is this game being played?", "choices": ["gym", "park", "backyard", "stadium"], "correct_choice_idx": 3, "direct_answers": ["stadium", "baseball field", "baltimore", "baseball stadium", "baseball stadium", "stadium", "baseball field", "pennsylvania", "baseball field", "baltimore"], "difficult_direct_answer": false, "rationales": ["The game is a stadium.", "The place is a stadium as people are seen watching.", "The presence of bleachers filled with spectators alongside the dugout and baseball field confirms that this is a professional sporting event in stadium."], "image": "train2014/COCO_train2014_000000455156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384475, "question_id": "JeTaHWmnPk8SdXLypv34qa", "question": "What is the lady about to do?", "choices": ["dance", "board train", "leave platform", "cross rail"], "correct_choice_idx": 1, "direct_answers": ["ride train", "board", "board train", "board train", "board train", "board train", "train ride", "enter bus", "board train", "board subway"], "difficult_direct_answer": false, "rationales": ["She is standing on a platform and a train can be seen in motion passing through, likely about to stop to let people on and off.", "There is a white transportation vehicle under in a subway. it is on a track of some kind.", "She is standing on the boarding platform facing the way a person would do this."], "image": "train2014/COCO_train2014_000000384475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442338, "question_id": "Jepbaqmz5NW4cHme5oyamz", "question": "What are the two people engaging in?", "choices": ["fishing", "paddling", "swimming", "surfing"], "correct_choice_idx": 0, "direct_answers": ["paddleboard", "windsurfing", "paddle boarding", "fishing", "surfing", "surfing", "paddle boarding", "paddle board", "surfing", "rowing"], "difficult_direct_answer": false, "rationales": ["It's not easy to see but it looks like they are holding an object which has to be a pole. since they are in water the answer becomes obvious.", "They are on boards in the water, and the sticks they are holding are paddles, which help to propel them.", "The people are fishing in the sea."], "image": "train2014/COCO_train2014_000000442338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169495, "question_id": "JfU9AAHoM2Bz4P8k9qGWwe", "question": "Why would someone sit at this table?", "choices": ["sew", "work", "talk", "eat"], "correct_choice_idx": 3, "direct_answers": ["eat", "comfort", "drink", "eat", "eating", "to eat", "comfortness", "eat", "relax", "eating"], "difficult_direct_answer": false, "rationales": ["You would sit at this table to eat food such as dinner, breakfast or lunch.", "Most tables of that size are used to eat or drink meals.", "A dining room table is shown."], "image": "train2014/COCO_train2014_000000169495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93599, "question_id": "JftBCo7XzpdTZsYVBfNd3F", "question": "Why does the man have numbers written on his arm?", "choices": ["tattoo", "event participant", "as joke", "for surgery"], "correct_choice_idx": 1, "direct_answers": ["for marathon", "competition", "race", "to compete", "race", "identification", "competitor", "charity race", "tattoo", "event participant"], "difficult_direct_answer": true, "rationales": ["The man is part of a event.", "The man is part of a race.", "Runners have race numbers on their arms."], "image": "train2014/COCO_train2014_000000093599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41700, "question_id": "JgNmNS8zXSiYq7ENnvZd4u", "question": "What's the name of the type of outfit the woman in blue is wearing?", "choices": ["peekaboo", "streetwear", "overalls", "pajamas"], "correct_choice_idx": 2, "direct_answers": ["coveralls", "apron", "overalls", "coveralls", "overalls", "overalls", "overalls", "overalls", "overalls", "coveralls"], "difficult_direct_answer": false, "rationales": ["These are denim like material that are pants with a bib that attaches over the arm.", "The woman is wearing a blue set of coveralls over her clothes.", "The lady has overalls on that are jeans with a bib."], "image": "train2014/COCO_train2014_000000041700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116358, "question_id": "JgPtM3eZzhLdLEMGEehdpE", "question": "What time of day is it during the game?", "choices": ["midnight", "twilight", "dusk", "dawn"], "correct_choice_idx": 2, "direct_answers": ["sunset", "evening", "twilight", "evening", "night time", "evening", "dusk", "dusk", "dusk", "evening"], "difficult_direct_answer": false, "rationales": ["The sun is setting and there are pink and purple in the sky.", "Red at night, sailors delight. baseball games are usually played closer to evening than any other time.", "It looks like the sun is setting."], "image": "val2014/COCO_val2014_000000116358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120431, "question_id": "JgTfLuG5FwfyogrBtBMjBa", "question": "What corporation made the shirt the woman is wearing?", "choices": ["adidas", "new balance", "hanes", "everlast"], "correct_choice_idx": 0, "direct_answers": ["adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "alias", "alias"], "difficult_direct_answer": false, "rationales": ["It has the triangular logo in the middle of her shirt", "Adidas's logo is on the shirt.", "The corporation is adidas."], "image": "train2014/COCO_train2014_000000120431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328, "question_id": "JgVFfPadjapnLNwdUS7a4K", "question": "What footwear are these people wearing?", "choices": ["sneakers", "shoes", "skis", "boots"], "correct_choice_idx": 1, "direct_answers": ["dress shoes", "dress shoes", "shoes", "shoes", "shoes", "dress shoes", "dress shoes", "dress shoes", "dress shoes", "dress shoes"], "difficult_direct_answer": false, "rationales": ["They are wearing shoes.", "They are wearing oxfords and boots.", "They are wearing closed dressy footwear."], "image": "val2014/COCO_val2014_000000000328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58857, "question_id": "JgWckAExTfxJH44DLcSRGu", "question": "What kind of sandwich is the child making?", "choices": ["butter", "meat paste", "peanut jelly", "peanut butter"], "correct_choice_idx": 2, "direct_answers": ["peanut jelly", "pba sandwich", "peanut butter", "peanut butter", "peanutbutter jelly", "peanut butter", "pub j", "peanutbutter jelly", "pba", "peanut butter"], "difficult_direct_answer": false, "rationales": ["The child is spreading peanut butter.", "The child has the ingredients out for make a peanut butter and jelly sandwich and is actively spreading peanut butter on bread.", "The containers of peanut butter and jelly can be seen on the counter, and this is known as a popular combination of items that children like to make a sandwich out of."], "image": "train2014/COCO_train2014_000000058857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307786, "question_id": "JgYyUTX4USRYnb88LDSpJM", "question": "Why do skiers wear suits?", "choices": ["pilgrims", "bikini", "ski suit", "snowsuit"], "correct_choice_idx": 3, "direct_answers": ["warmth", "snowsuit", "for warmth", "keep warm", "warmth", "temperature control", "warmth", "keepwarm", "protect bodies", "stay warm"], "difficult_direct_answer": false, "rationales": ["They wear them to keep warm and keep the snow off of them.", "A snow skier is in a snowsuit in a snowy area. snowsuits are used to keep warm in cold areas.", "Skiers need the item in option a to protect themselves from the snow and to keep warm in the cold."], "image": "val2014/COCO_val2014_000000307786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314370, "question_id": "JgZmBvH3BLZeVpDDZXioLB", "question": "What cereal is flavored with chocolate to make this cereal?", "choices": ["corn flakes", "corn pops", "kix", "rice krispies"], "correct_choice_idx": 2, "direct_answers": ["coco puffs", "coco puffs", "coco puffs", "coco puffs", "coco pops", "coco puffs", "kix", "coco puffs", "coco pops", "coco pops"], "difficult_direct_answer": false, "rationales": ["Kix can be flavored with chocolate to make chocolate corn puffs.", "The cereal is kix.", "You can read the label on the yellow box."], "image": "val2014/COCO_val2014_000000314370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536795, "question_id": "Jh2AehFixV2SydWx5ff7mV", "question": "How many cars are visibly shown in this photo?", "choices": ["four", "two", "five", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are two parked and two in the road", "In looking at the total picture, we have two in the foreground and one in the background for a total of three.", "There are two on this side and one on the other."], "image": "val2014/COCO_val2014_000000536795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486232, "question_id": "Jh2RxAcSQgZpRu9jS5DLWz", "question": "What race is this person holding the apples?", "choices": ["african", "hispanic", "east asian", "white"], "correct_choice_idx": 0, "direct_answers": ["black", "black", "black", "black", "black", "african", "african", "african", "black", "african"], "difficult_direct_answer": false, "rationales": ["A person holding fruit has dark hands. africans have dark skin.", "The skin on the hand is very dark.", "The person has black, not yellow, white, or brown, skin."], "image": "val2014/COCO_val2014_000000486232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148208, "question_id": "Jh5RKcbt5eUtnT65cmfH7X", "question": "Whose birth is being celebrated here?", "choices": ["jesus", "mohammad", "buddha", "zoroaster"], "correct_choice_idx": 0, "direct_answers": ["jeebus", "jesus", "jesus", "jesus", "jesus", "jesus", "jesus", "king", "jesus christ", "jesus"], "difficult_direct_answer": false, "rationales": ["The person whose birth is being celebrated is jesus.", "Christmas decorations are present all around a room. christmas celebrates the birth of jesus.", "Jesus's birth is celebrated at christmas."], "image": "train2014/COCO_train2014_000000148208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62915, "question_id": "JhQw5pL8Yo9CLw2czGLEJr", "question": "What is the blue vehicle doing?", "choices": ["pulling", "overturning", "sinking", "burning"], "correct_choice_idx": 0, "direct_answers": ["pulling", "pulling baggage", "carrying luggage", "towing something", "towing luggage", "pulling carts", "transporting luggage", "loading airplane", "pulling containers", "pulling carts"], "difficult_direct_answer": true, "rationales": ["It has luggage carts hooked to the back", "The vehicle is pulling a trailer.", "The blue vehicle in front is connected to the luggage trailer by a strap and it pulls the luggage trailer between planes and terminal."], "image": "val2014/COCO_val2014_000000062915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527248, "question_id": "JhRUZrYLAYjo7GAXyvHohY", "question": "How is the roof shape of the building called?", "choices": ["pointy roof", "conical roof", "pavilion roof", "tower roof"], "correct_choice_idx": 2, "direct_answers": ["pyramid", "slanted", "steeple", "turret", "pavilion roof", "pyramid", "pagoda", "spire", "flat", "tower"], "difficult_direct_answer": true, "rationales": ["The design of the roof is used for churches.", "A building has a large tower on one end.", "That is what the type of roof on the tower is called."], "image": "val2014/COCO_val2014_000000527248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373199, "question_id": "JhYobweVGQpkd3ooujtxBV", "question": "What made the icing that color?", "choices": ["blueberries", "indigo", "food coloring", "corn flowers"], "correct_choice_idx": 2, "direct_answers": ["confectioners sugar", "food coloring", "food color", "food coloring", "food dye", "egg", "food coloring", "right man", "white cream", "food coloring"], "difficult_direct_answer": false, "rationales": ["The coloring is in frosting that is edible and safe to eat.", "The icing was originally white so something had to be put in it to change the color to blue.", "People used food coloring for the color of icing."], "image": "train2014/COCO_train2014_000000373199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500900, "question_id": "JhbVtFzbzbVzFxSqLsi6yE", "question": "What type job were people in yellow vests recently doing?", "choices": ["erecting bridge", "concrete work", "cone manufacture", "arresting students"], "correct_choice_idx": 1, "direct_answers": ["construction", "construction", "flaggers", "construction", "cementing", "concrete work", "construction", "concrete", "construction", "construction"], "difficult_direct_answer": false, "rationales": ["There are several men standing around a grey slab with cones around it.", "The person is doing work on the sidewalk.", "The people in yellow were cementing the place."], "image": "train2014/COCO_train2014_000000500900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458475, "question_id": "JhkiwNfzLGWm5BGwgJMbGV", "question": "What is the skier ready to do?", "choices": ["sit", "land", "roll", "ascend"], "correct_choice_idx": 1, "direct_answers": ["land", "land", "land", "wear skies", "land", "land", "fly", "to land", "land", "landing"], "difficult_direct_answer": false, "rationales": ["The skier is in the air. what goes up must come down.", "The skier is about to go downhill.", "He is in the air. what goes up must come down."], "image": "train2014/COCO_train2014_000000458475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378048, "question_id": "JhmYSMWDC4eY7DXXytSGQa", "question": "Where are the women on the blankets sitting?", "choices": ["forest", "yard", "park", "beach"], "correct_choice_idx": 2, "direct_answers": ["park", "park", "park", "park", "grass", "park", "grass", "park", "lawn", "field"], "difficult_direct_answer": false, "rationales": ["Due to the setting and the kite flying, you can tell where they are.", "They are on a big grassy area with other people walking around.", "The women are sitting on blankets in the yard."], "image": "val2014/COCO_val2014_000000378048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383934, "question_id": "Ji8VbiTkdoTimPeRgodVGt", "question": "Where is this meal served at?", "choices": ["restaurant", "home", "office cafeteria", "school cafeteria"], "correct_choice_idx": 1, "direct_answers": ["pizzeria", "italian restaurant", "table", "home", "home", "dining table", "pizzeria", "pizzaria", "pizzaria", "restaurant"], "difficult_direct_answer": false, "rationales": ["The plate on the table is someones kitchen.", "This meal is being served in a residential area.", "The meal is at a dining table with a simple chair and plate."], "image": "train2014/COCO_train2014_000000383934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354212, "question_id": "Ji8eLHxvbvaDVfrc8WvBAW", "question": "What useful item is he missing?", "choices": ["mouse pad", "pillow", "head phones", "shoes"], "correct_choice_idx": 0, "direct_answers": ["mousepad", "nothing", "mouse pad", "mouse pad", "mousepad", "monitor", "mouse pad", "screen", "mouse pad", "mouse pad"], "difficult_direct_answer": false, "rationales": ["A mouse pad is missing from the desk where the man is using the computer.", "A lot of people don't use them even tough they're useful.", "The child is using a mouse on the wooden surface."], "image": "val2014/COCO_val2014_000000354212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356147, "question_id": "JiLNvbHvafdUGXmwE6XjW8", "question": "What are the men most probably trying to do to the elephants?", "choices": ["wash", "train", "play", "feed"], "correct_choice_idx": 0, "direct_answers": ["ride them", "wash", "wash", "ride", "ride", "catch them", "train them", "ride them", "ride them", "bathe them"], "difficult_direct_answer": false, "rationales": ["Since they are standing in a large body of water, it is most likely that they want to clean the elephants.", "They are giving them a bath.", "Water is used to wash things are the elephants are in the water."], "image": "train2014/COCO_train2014_000000356147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471686, "question_id": "JiLo7SMrPQ4RxikSisLoNX", "question": "What is the green object with holes in it called?", "choices": ["colander", "noodle maker", "cheese grater", "bowl"], "correct_choice_idx": 0, "direct_answers": ["colander", "colander", "colander", "colander", "strainer", "colander", "strainer", "strainer", "strainer", "colander"], "difficult_direct_answer": false, "rationales": ["Colanders have holes in them to strain food.", "The green object, also known as a strainer, has holes all over it and is used to drain liquid away from a food.", "This bowl has holes in it to drain water from food. this device is known as a colander."], "image": "train2014/COCO_train2014_000000471686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100726, "question_id": "JiTnDtfPnGrJo63cDJciMH", "question": "What do the people here enjoy?", "choices": ["gaming", "exreme sports", "sales", "dancing"], "correct_choice_idx": 0, "direct_answers": ["video games", "games", "playing wii", "wii", "video games", "bowling", "wii bowling", "playing wii", "wii", "gaming"], "difficult_direct_answer": false, "rationales": ["They are playing a game on wii", "The women are playing the wii console.", "They are bowling on the nintendo wii."], "image": "val2014/COCO_val2014_000000100726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465360, "question_id": "JiogdiBG8PPZD8orknCbcZ", "question": "Where is the lady located in?", "choices": ["theme park", "playground", "picnic area", "forest"], "correct_choice_idx": 1, "direct_answers": ["park", "playground", "park", "playground", "park", "park", "park", "park", "park", "playground"], "difficult_direct_answer": false, "rationales": ["The person is at a playground.", "The playground is visible behind the woman.", "Behind her appears to be yellow bars that children would play on."], "image": "train2014/COCO_train2014_000000465360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282042, "question_id": "JjCzSCPLR2yb7AjVnJqdPL", "question": "Why does the woman use an umbrella?", "choices": ["for snow", "for hail", "for rain", "for shade"], "correct_choice_idx": 3, "direct_answers": ["protection", "sun protection", "sunny", "protection", "for shade", "sun protection", "shade", "protection", "shade", "protection"], "difficult_direct_answer": false, "rationales": ["The woman is holding an umbrella during a rainless sunny day, and is thus using the item for the reason shown in option a.", "No moisture is seen on the ground, so she must be using it to block the sun.", "The woman is using an umbrella for shade in the day."], "image": "train2014/COCO_train2014_000000282042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385448, "question_id": "JjDFEPHa8M3bGDnHoKKYmV", "question": "What animal is near the motorcycles?", "choices": ["zebra", "cow", "tiger", "bear"], "correct_choice_idx": 1, "direct_answers": ["bull", "bull", "cow", "cow", "cow", "cow", "bull", "cow", "cow", "cow"], "difficult_direct_answer": false, "rationales": ["The cow is near.", "There is a black cow near the motorcycles.", "A cow is outside and has horns."], "image": "train2014/COCO_train2014_000000385448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119441, "question_id": "JjSfbgu7Kaevw4hr25YEBR", "question": "Why does the woman have the helmet on?", "choices": ["fashion", "protection", "to dance", "cosplay"], "correct_choice_idx": 1, "direct_answers": ["safety", "protection", "helmet", "street ski", "safety", "protection", "protection", "protection", "for protection", "safety"], "difficult_direct_answer": false, "rationales": ["The helmet keeps her head safe.", "She does not want to get hurt.", "It's on her head to protect her head if she falls."], "image": "train2014/COCO_train2014_000000119441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452930, "question_id": "JjbeqkrXBZjCLaDX9UkJoB", "question": "What will this child hold while they play with this toy?", "choices": ["tail", "drone", "string", "control"], "correct_choice_idx": 2, "direct_answers": ["string", "kite", "string", "string", "kite", "string", "string", "string", "string", "kite"], "difficult_direct_answer": false, "rationales": ["The child will hold the string for the kite.", "The child will have the string.", "Kites are controlled by holding the strings. so the kid will hold the string."], "image": "train2014/COCO_train2014_000000452930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279083, "question_id": "Jk2kYSyXueDuYfBSNi8dMm", "question": "What handedness does the batter here have?", "choices": ["right", "left", "none", "both"], "correct_choice_idx": 1, "direct_answers": ["right", "left", "left", "left", "left", "right", "right handed", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["By the position of where the batter is hitting from indicates which is his dominant hand.", "The batter is on the left side of the mound and has therefore a left handed swing.", "The batter is favoring his dominant hand."], "image": "val2014/COCO_val2014_000000279083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499911, "question_id": "JkHSbjE8bStPKkdcBoSSC6", "question": "What style is this bike decorated in?", "choices": ["chef", "golfer", "cowboy", "jock"], "correct_choice_idx": 2, "direct_answers": ["cowboy", "traditional", "antique", "wild west", "western", "vintage", "cowboy", "aboriginal", "native american", "western"], "difficult_direct_answer": false, "rationales": ["The motorcycle has leather decorations and fringe that might be see worn by a cowboy.", "The bike has tassels on it which is popular with cowboys.", "The seats have fringes like boots."], "image": "train2014/COCO_train2014_000000499911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69486, "question_id": "JkSCUetkDbYKoGDRUnKRCQ", "question": "What could you buy here?", "choices": ["boat", "food", "shoes", "car"], "correct_choice_idx": 1, "direct_answers": ["food", "fruits", "fruits", "fruit", "fruit", "fruit", "fruit", "fruits", "fruit", "fruit"], "difficult_direct_answer": false, "rationales": ["This place sells fruit.", "People sit in front of a market with hanging fruit all around.", "Food can be bought."], "image": "train2014/COCO_train2014_000000069486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108240, "question_id": "JkvPmPfMEKkNfopeiMM49d", "question": "What does the blue traffic sign with a stick figure in the center most likely indicate?", "choices": ["bicycle lane", "school crossing", "pedestrian crossing", "barrier"], "correct_choice_idx": 2, "direct_answers": ["pedestrian traffic", "pedestrian crossing", "pedestrians crossing", "crosswalk", "pedestrian", "ice", "crosswalk", "cross walk", "pedestrian crossing", "pedestrian"], "difficult_direct_answer": false, "rationales": ["The sign is telling that people are walking.", "The traffic sign has a pictograph of a man crossing a crosswalk.", "It shows a person walking on zebra stripes"], "image": "train2014/COCO_train2014_000000108240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579229, "question_id": "Jkzgvya9syLPweFmaMWofh", "question": "The man looks like he is doing what kind of move?", "choices": ["crab walk", "leap frog", "worm", "electric slide"], "correct_choice_idx": 0, "direct_answers": ["lying", "scatting", "skateboard", "wipeout", "falling", "skateboard", "skateboard trick", "crab walk", "grind", "horror"], "difficult_direct_answer": true, "rationales": ["He looks like he is walking like one.", "The man has his hands and feet on the ground as if he is doing a crab walk.", "The move in the skateboarding world is called crab walking."], "image": "train2014/COCO_train2014_000000579229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555322, "question_id": "Jm6d2rcJCP69zmB26HQ3vK", "question": "What is the wooden bit around the ceiling called?", "choices": ["frill", "baseboard", "molding", "trim"], "correct_choice_idx": 2, "direct_answers": ["molding", "crown molding", "molding", "molding", "molding", "trim", "crown molding", "crown molding", "frame", "coffee table"], "difficult_direct_answer": false, "rationales": ["Molding is used at the top of the wall to separate the ceiling from wall. the wooden item appears to be molding separating ceiling from wall.", "The room has a beam of decorative wood called molding that runs around the room at the ceiling.", "There are thick beams going around the outside of the ceilig."], "image": "val2014/COCO_val2014_000000555322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227127, "question_id": "JmLe3gkhxqcxXZpvA9Btjr", "question": "What is the man in jeans sitting on?", "choices": ["chair", "mattress", "stool", "bench"], "correct_choice_idx": 1, "direct_answers": ["bed", "mattress", "bed", "mattress", "mattress", "mattress", "bed", "mattress", "mattress", "bed"], "difficult_direct_answer": false, "rationales": ["The man is sitting on a bed.", "There are mattresses in the background as well as a headboard; the man is sitting on a mattress.", "There are several of them in this building"], "image": "train2014/COCO_train2014_000000227127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357992, "question_id": "JmLoSNiaWDB37cgSVs6top", "question": "What is the white rounded object on the right?", "choices": ["mouse", "router", "case", "soap"], "correct_choice_idx": 0, "direct_answers": ["computer mouse", "computer mouse", "pc mouse", "mouse", "computer mouse", "computer mouse", "mouse", "mouse", "mouse", "mouse"], "difficult_direct_answer": false, "rationales": ["The object is next to a computer. it is on a workdesk.", "This object is a mouse used to control the cursor on one of the computers.", "The round palm sized device near this computing equipment is known as a mouse."], "image": "val2014/COCO_val2014_000000357992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497819, "question_id": "JmnQkAwLh37Zeq4vwDbyPN", "question": "Where is the safest place for a skateboarder to cross the street?", "choices": ["sidewalk", "parking lot", "crosswalk", "highway"], "correct_choice_idx": 2, "direct_answers": ["crosswalk", "crosswalk", "crosswalk", "intersection", "intersection", "intersection", "yellow line", "crosswalk", "crosswalk", "crosswalk"], "difficult_direct_answer": false, "rationales": ["Skateboarding on the highway is dangerous. the sidewalk and parking lot are beside the street.", "There are many lines on a street however the crosswalk is there to protect people. this includes pedestrians and skateboarders.", "The skateboarder would be safest if they crossed the road at the crosswalk."], "image": "train2014/COCO_train2014_000000497819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270844, "question_id": "JmrPpCpY4zUaXuVrABWxyk", "question": "Why is the man smiling?", "choices": ["delicious food", "received gift", "quit job", "telling joke"], "correct_choice_idx": 1, "direct_answers": ["happy", "received gift", "happiness", "happy", "happy", "his happy", "received gift", "happy", "happy", "present"], "difficult_direct_answer": false, "rationales": ["A man has some sort of present in his hand.", "He is at a restaurant, holding a gift box in his hand, indicating he is dining there for an occasion and is receiving the gift from a loved one.", "He is opening a present."], "image": "train2014/COCO_train2014_000000270844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52012, "question_id": "JmytGgKHzEhCnQkdgYNgZf", "question": "What might the occasion be?", "choices": ["bar mitzah", "party", "christening", "funeral"], "correct_choice_idx": 1, "direct_answers": ["birthday", "class party", "party", "parent meetings", "party", "party", "potluck", "party", "celebration", "school party"], "difficult_direct_answer": false, "rationales": ["Dishes of food are laid out on a table with a tablecloth. people serve food at parties.", "This might be a party with several appetizers.", "Given the kids pics in the upper right corner, this is most likely a party for kids or teachers."], "image": "train2014/COCO_train2014_000000052012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358517, "question_id": "Jn2nR7HdrmV4PPMiFompVW", "question": "What is the stuffed bear most likely being used as?", "choices": ["disguise", "support animal", "shelter", "gift"], "correct_choice_idx": 3, "direct_answers": ["toy", "gift", "toy", "shield", "present", "toy", "gift", "gift", "gift", "toy"], "difficult_direct_answer": false, "rationales": ["The woman is holding the bear at an airport as if ready to present it to someone.", "The other options don't match the image or the typical use of a stuffed bear.", "Large stuffed animals are adored by most people. giving a stiff bear as a gift is a popular thing to do."], "image": "train2014/COCO_train2014_000000358517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493812, "question_id": "Jn4e3Hi8mfYgCQLwXanjnH", "question": "What ringing item can be found above the clock?", "choices": ["phone tower", "glasses", "phone", "bells"], "correct_choice_idx": 3, "direct_answers": ["bells", "bell", "bell", "bell", "bell", "bells", "bell", "bells", "bell", "bell"], "difficult_direct_answer": false, "rationales": ["This type of clock tower often houses a bell or bells.", "A tower with a clock and a large area above the clock can be seen. towers with clocks often have bells that chime with the turning of the clock hands.", "There is a bell on the tower."], "image": "train2014/COCO_train2014_000000493812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17703, "question_id": "JnB7TrL9tefJrvzmgk2WLA", "question": "What type of hairline does the standing man have?", "choices": ["triangle", "receding", "straight", "widow's peak"], "correct_choice_idx": 1, "direct_answers": ["receding", "receding", "balding", "receding", "balding", "receding", "balding", "bald", "receding", "receding"], "difficult_direct_answer": false, "rationales": ["He is losing his hair and you can see it's farther back on his head.", "His hair is slowly falling out.", "The man is balding."], "image": "train2014/COCO_train2014_000000017703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158846, "question_id": "JnDAMvvKZgHmjxZZofk48c", "question": "What type action is the tennis player here doing?", "choices": ["judging", "return", "serving", "resting"], "correct_choice_idx": 2, "direct_answers": ["serving", "serving", "serving", "jumping", "hitting cork", "serving", "hes serving", "serving", "hitting ball", "serving ball"], "difficult_direct_answer": false, "rationales": ["The stance the guy has is that of the player who is about the serve the ball.", "He is in the position to hit the ball to his opponent.", "The man is trying to serve the ball."], "image": "val2014/COCO_val2014_000000158846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265818, "question_id": "JnPzGnmvMmnBxZWg2w2jjc", "question": "What might come from the right or left at any time?", "choices": ["lava", "train", "tank", "car"], "correct_choice_idx": 3, "direct_answers": ["traffic", "traffic", "traffic", "people", "car", "traffic", "traffic", "vehicle", "turn", "traffic"], "difficult_direct_answer": false, "rationales": ["A car could pass through this intersection from the right or left at any moment.", "This is a road for vehicles", "It is a road that motor vehicles drive on."], "image": "train2014/COCO_train2014_000000265818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189274, "question_id": "JntAWdnYjnbUyGaL9BJGnL", "question": "What has the boy made using the bottle?", "choices": ["bottle rocket", "grape jelly", "lava lamp", "hand sanitizer"], "correct_choice_idx": 2, "direct_answers": ["volcano", "gas", "lava", "lava lamp", "droplets", "bomb", "oil", "lava lamp", "color water", "lava lamp"], "difficult_direct_answer": false, "rationales": ["The bottle contains oil and red food coloring.", "The two tones of the fluid and the appearing different densities make a the most logical choice. this boy also appears to be at the age where this kind of project may be done in school.", "It looks like a volcano exploding."], "image": "train2014/COCO_train2014_000000189274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253229, "question_id": "JnttEGx3xAQCMNriHXwm3u", "question": "What famous Japanese franchise for children is part of the kite on display by the vendor?", "choices": ["anpanman", "pokemon", "dragon ball", "doraemon"], "correct_choice_idx": 3, "direct_answers": ["pokemon", "pokemon", "doraemon", "anime", "playing", "pokemon", "sanrio", "doraemon", "sake", "mario kat"], "difficult_direct_answer": false, "rationales": ["There is a blue cat.", "The cat is named doraemon and is part of a japanese anime series.", "The woman is in front of a blue cat."], "image": "train2014/COCO_train2014_000000253229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499884, "question_id": "Jnw68RSN3NNnAtGPD3goLa", "question": "What is on the ground in front of the group?", "choices": ["console", "mug", "trophy", "bottle"], "correct_choice_idx": 2, "direct_answers": ["trophy", "trophy", "trophy", "trophy", "trophy", "cup", "asphalt", "trophy", "trophy", "trophy"], "difficult_direct_answer": false, "rationales": ["They look like they just wont a competition.", "There is a small trophy placed in front of the group.", "A group of people are standing on a tennis court with a small statue in front of them. trophies are given out in sports."], "image": "val2014/COCO_val2014_000000499884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518522, "question_id": "Jo4t3FRcjpysKEM7BvSTqB", "question": "What period of the day is shown here?", "choices": ["afternoon", "evening", "early morning", "almost noon"], "correct_choice_idx": 3, "direct_answers": ["morning", "afternoon", "noon", "day", "midday", "noon", "midday", "afternoon", "noon", "almost noon"], "difficult_direct_answer": false, "rationales": ["It is almost noon.", "The clock in the background is almost at noon.", "The clock seen on the tower in the back has its small hand close to the 12, and the sun is out, indicated that it is not midnight."], "image": "train2014/COCO_train2014_000000518522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199969, "question_id": "JoKC59tAyBK2JRYPuSqRHd", "question": "What language must someone speak in order to understand what items are offered?", "choices": ["english", "spanish", "french", "italian"], "correct_choice_idx": 0, "direct_answers": ["english", "english", "english", "english", "english", "english", "english", "english", "english", "english"], "difficult_direct_answer": false, "rationales": ["Everything is written down in english.", "Though other languages are present in this market place scene; every one of the food items is labelled by it's name in english.", "The writing is written in english."], "image": "val2014/COCO_val2014_000000199969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318162, "question_id": "JoVqwGy8Ze4zhUKyBZNcbi", "question": "Why is the image blurred?", "choices": ["shaky photographer", "unfocussed", "girl moving", "broken camera"], "correct_choice_idx": 1, "direct_answers": ["camera quality", "unfocused", "camera", "unfocused", "moving", "shaky photographer", "not focused", "unfocussed", "bad focus", "camera"], "difficult_direct_answer": false, "rationales": ["The photo seems to be a bit unfocused.", "When pictures are not focused properly they will be blurry.", "The camera isn't focused."], "image": "train2014/COCO_train2014_000000318162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321238, "question_id": "JoaNSETMWVU6XmnBn7jgip", "question": "What does the man all the way to the right have on?", "choices": ["boots", "hat", "clown nose", "headphones"], "correct_choice_idx": 1, "direct_answers": ["baseball cap", "hat", "ballcap", "red shirt", "hat", "shirts", "jersey", "hat", "hat", "cap"], "difficult_direct_answer": false, "rationales": ["The man in the red shirt is wearing a cap.", "The man is wearing a cap.", "The man has a hat."], "image": "train2014/COCO_train2014_000000321238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413811, "question_id": "JoaQDhGh2MxBf736CutvLK", "question": "Who can stand on their float?", "choices": ["no one", "black suit", "pink suit", "orange suit"], "correct_choice_idx": 3, "direct_answers": ["female pink", "girl", "anyone", "nobody", "surfer", "woman", "woman", "orange suit", "pinkie", "woman"], "difficult_direct_answer": false, "rationales": ["The person in orange can stand on their float.", "It is too light to stand on top of.", "The person in the orange suit can stand on it."], "image": "val2014/COCO_val2014_000000413811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299649, "question_id": "JohNLWaTRHquYpjU7itoeK", "question": "What type employees move the smaller vehicles shown here?", "choices": ["ground crew", "hostesses", "pilots", "stewardesses"], "correct_choice_idx": 0, "direct_answers": ["ground crew", "cargo", "airport", "support", "drivers", "ground crew", "drivers", "airline", "taxi drivers", "airport"], "difficult_direct_answer": false, "rationales": ["The small vehicles in the pictures are used to transport baggage and maintain the aircraft. the people who operate these are called ground crew.", "The ground crew will move it.", "The employees are the crew."], "image": "val2014/COCO_val2014_000000299649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71582, "question_id": "JokecbVUyBCNk4xWyZ9wCZ", "question": "What is the most obvious thing that has been done to the unusual hair?", "choices": ["shaved", "glittered", "dyed", "cut"], "correct_choice_idx": 2, "direct_answers": ["dyed", "hair colored", "dye", "dye", "dyed red", "dyed", "dyed", "dye", "dying red", "dyed"], "difficult_direct_answer": false, "rationales": ["Someone has bright red hair.", "The hair is an unnatural shade of red.", "The persons hair color is an unnatural shade of red."], "image": "train2014/COCO_train2014_000000071582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259715, "question_id": "JopU5yq8QFhQYw4k5TKfqk", "question": "Why are the bikes leaning over?", "choices": ["showing off", "resting", "better turning", "falling"], "correct_choice_idx": 2, "direct_answers": ["centripetal force", "better turning", "cornering", "negotiate corner", "fast turns", "turning", "maintain balance", "speed up", "gravity", "turn"], "difficult_direct_answer": true, "rationales": ["The drivers are using their weight to turn easier.", "The bikes can turn better.", "People are turning around a corner on motorcycles. leaning is required to make turns on motorcycles."], "image": "train2014/COCO_train2014_000000259715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504216, "question_id": "JotLSNNuAmPmr9846D5ido", "question": "What is the music stores name used as a substitute for in the signage?", "choices": ["pool", "mirror", "jail", "dart board"], "correct_choice_idx": 3, "direct_answers": ["steve's", "stevie", "dart", "dartboard", "dart board", "steve's", "steve's", "dart", "owner", "steve's pizza"], "difficult_direct_answer": false, "rationales": ["The thing in the sign looks like it was a target.", "It has a large dart placed in the middle", "It has a large dart stuck to the sign"], "image": "train2014/COCO_train2014_000000504216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5018, "question_id": "JovXPV5BE3xbvvYpZcKS2Z", "question": "Which person has control when riding in the flag adorned vehicle?", "choices": ["left", "neither", "right", "both"], "correct_choice_idx": 2, "direct_answers": ["right side", "bicyclist", "driver", "motorcycle driver", "right side", "driver", "right", "right", "motorcycle driver", "left person"], "difficult_direct_answer": false, "rationales": ["The person in the side car is just a passenger.", "The person is on the right.", "The person riding the motorcycle has total control of the vehicle."], "image": "train2014/COCO_train2014_000000005018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227242, "question_id": "Jowp8P78zS8Ebv9nZ89jKW", "question": "What is crossing the bridge?", "choices": ["train", "car", "bus", "bike"], "correct_choice_idx": 1, "direct_answers": ["train", "train", "train", "train", "car", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["The car is crossing.", "The bridge is used by cars.", "The item on the bridge is long and appears to be a train."], "image": "train2014/COCO_train2014_000000227242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281888, "question_id": "JpBvt8dkEHaHQAi6852RV9", "question": "The restaurant on the banner is named for a location on what island?", "choices": ["sicily", "capri", "malta", "cyprus"], "correct_choice_idx": 1, "direct_answers": ["unknown", "azzura", "staten", "gotta aura", "gotta azzura", "azzura", "capri", "coney island", "azzures", "capri"], "difficult_direct_answer": false, "rationales": ["The restaurant on the banner serves italian food that originates in sicily.", "A green banner is in front of some people walking. it has gotta azzura on side of it.", "This sign is italian and named for a location on the island of capri."], "image": "train2014/COCO_train2014_000000281888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400818, "question_id": "JpCTDFFAxfMWLN6aBaFFBT", "question": "What is the type of tent which is behind the man in the red shirt?", "choices": ["a-frame", "dome", "pop up", "screen house"], "correct_choice_idx": 3, "direct_answers": ["shade", "canopy", "mesh", "outdoor camper", "outdoor tent", "screen house", "gazebo", "family tent", "see through", "frame style"], "difficult_direct_answer": true, "rationales": ["A tent with screen walls is on the beach behind a guy in a t-shirt.", "A tent that has walls made of black netting is behind a group of people. screen is used to keep bugs out when hanging out outside.", "It is for shade and also keeps bugs out but you can still see around you"], "image": "train2014/COCO_train2014_000000400818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231668, "question_id": "JpLsCnHQNQb9pAtDd8TDj5", "question": "How many species can be seen here of mammals?", "choices": ["one", "four", "five", "nine"], "correct_choice_idx": 0, "direct_answers": ["cows", "one", "one", "one", "two", "two", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["Cattle are the only ones apparent in this image.", "There is a group of cows grazing in the field.", "These animals are all cows despite having different colored coats."], "image": "train2014/COCO_train2014_000000231668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352065, "question_id": "JpSQzJ4sDiSfvV7842CaJs", "question": "What is the man carrying with his right arm?", "choices": ["lounge chair", "surfboard", "boogie board", "umbrella"], "correct_choice_idx": 2, "direct_answers": ["board", "surfboard", "surfboard", "surfboard", "surfboard", "boogie board", "boogie board", "boogie board", "flippers", "surf board"], "difficult_direct_answer": false, "rationales": ["The man has a boogie board.", "It is shorter than a surfboard.", "The man is wearing a surf wet suit."], "image": "train2014/COCO_train2014_000000352065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400881, "question_id": "JprqbJDZR2zrvFJxbXTkYB", "question": "What missing connector caused the positioning of this sign?", "choices": ["top fastener", "velcro", "side fastener", "bottom nail"], "correct_choice_idx": 0, "direct_answers": ["screw", "screw", "top", "tie", "screw", "screw", "screw", "top fastener", "bolt", "screw"], "difficult_direct_answer": false, "rationales": ["It is clear that the sign has flipped upside-down, meaning the piece that was supposed to connect it at the top has come undone as the bottom piece is still intact.", "The top fastener has fallen off.", "The bottom of the sign is connected, but the opposing point is not."], "image": "train2014/COCO_train2014_000000400881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282224, "question_id": "JqE328hmmMzQRjopckFZfZ", "question": "How much farther can the red socks be pulled up normally?", "choices": ["to calves", "to head", "not much", "to stomach"], "correct_choice_idx": 2, "direct_answers": ["knee", "knee caps", "knees", "knees", "to knees", "knee", "not much", "close", "knees", "knee"], "difficult_direct_answer": false, "rationales": ["They are almost to the knees", "A man is on a tennis court with socks pulled up nearly to his knees. knee socks come to just below most people's knees.", "The socks looks like they are pulled all the way up."], "image": "val2014/COCO_val2014_000000282224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47093, "question_id": "JqL9pvYMmqDurjcRTGzij6", "question": "What type of phone is being used?", "choices": ["rotary", "pay", "cellular", "landline"], "correct_choice_idx": 2, "direct_answers": ["cell phone", "red cell", "mobile", "smartphone", "cell", "smartphone", "cellular", "cellphone", "cellphone", "mobile phone"], "difficult_direct_answer": false, "rationales": ["The woman seems to be calling while moving.", "The woman has a mobile phone.", "The girl is using a cell phone."], "image": "train2014/COCO_train2014_000000047093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407972, "question_id": "JqLGJNQRLfYerDru4U3FnS", "question": "What is this name for this type of restaurant?", "choices": ["gluten free", "vegan", "fast food", "kosher"], "correct_choice_idx": 2, "direct_answers": ["mcdonalds", "fast food", "mcdonalds", "fast food", "mcdonalds", "fast food", "fast food", "fast food", "mcdonalds", "mcdonalds"], "difficult_direct_answer": false, "rationales": ["It is a mcdonald's restaurant.", "This is mcdonalds and it's a fast food place.", "The name is fast food."], "image": "train2014/COCO_train2014_000000407972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399349, "question_id": "JqMyTWCVFC2BEBeTuZzkZh", "question": "What accessory should the boy wear for better protection?", "choices": ["gloves", "sunglasses", "helmet", "knee pads"], "correct_choice_idx": 2, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["A horse rider should wear a helmet for greater protection while riding a horse.", "The boy could fall from the horse and hit his head.", "In case the little boy falls off, protection on his head would be good."], "image": "val2014/COCO_val2014_000000399349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90594, "question_id": "JqPsXi2djyhE6zKocqTTSM", "question": "How do these people know each other?", "choices": ["rivals", "coworkers", "teammates", "family"], "correct_choice_idx": 3, "direct_answers": ["mother daughter", "same family", "family", "family", "family", "related", "mother daughter", "family", "mother daughter", "family"], "difficult_direct_answer": false, "rationales": ["Given the familiarity and comfort of this child and adult posing together it is most likely they are related.", "This appears to be a mother and child, which matches the category in option a. the relationships in the other options are not logical for an adult and a child.", "The woman is the girl's mom."], "image": "val2014/COCO_val2014_000000090594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203965, "question_id": "JqYmcGLT9ohjYvGacmBCgV", "question": "After this batter finishes what does the person batting next wear on their head?", "choices": ["black helmet", "do rag", "baseball cap", "nothing"], "correct_choice_idx": 0, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "black helmet"], "difficult_direct_answer": false, "rationales": ["The batter is wearing a dark helmet.", "All players are required to have protection on their heads while in the batter's box or running the bases.", "The person batting next would wear a black helmet because that is the color used by the team for which they play."], "image": "val2014/COCO_val2014_000000203965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323726, "question_id": "JqiQSUrdikY96aUsTiDJcH", "question": "What is the brown object in the squatting man's hand?", "choices": ["football", "resin", "glove", "pretzel"], "correct_choice_idx": 2, "direct_answers": ["catchers mitt", "gloves", "catcher's mitt", "catchers mitt", "mitt", "shirt", "glove", "glove", "catchers mitt", "catching glove"], "difficult_direct_answer": false, "rationales": ["The catcher is using a mitt on his left hand.", "The man is holding a leather glove.", "The other options don't match this sport."], "image": "val2014/COCO_val2014_000000323726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403325, "question_id": "Jqj4fMdwpj2Z5QpkkPXNUE", "question": "What does the left jars store?", "choices": ["sugar", "salt", "flour", "pepper"], "correct_choice_idx": 2, "direct_answers": ["sugar", "cooking utensils", "utensils", "flour", "sugar", "food", "spoons", "dry goods", "flour", "herbs"], "difficult_direct_answer": false, "rationales": ["The jar has a label indicating what is inside of it.", "Traditionally these type of container will store confectionery type of foods.", "They store several things like flour and sugar."], "image": "train2014/COCO_train2014_000000403325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438364, "question_id": "JqvtHhXEWAbgQMFJYWbGf6", "question": "In which country is this bus most likely traveling?", "choices": ["great britain", "canada", "mexico", "united states"], "correct_choice_idx": 0, "direct_answers": ["england", "united kingdom", "great britain", "england", "united kingdom", "england", "england", "england", "usa", "london"], "difficult_direct_answer": false, "rationales": ["Double deckers are common in this country.", "It's in great brittain.", "The bus is driving on the left side of the road."], "image": "val2014/COCO_val2014_000000438364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484355, "question_id": "JrFJhcFiBcVNT4FBNLCTEu", "question": "What service are they providing to the kids?", "choices": ["teaching lesson", "offering exchange", "signing contracts", "signing ball"], "correct_choice_idx": 3, "direct_answers": ["autographs", "autograph", "signing ball", "fan", "autographs", "autographing", "autographs", "signature", "autographing tickets", "autographs"], "difficult_direct_answer": false, "rationales": ["He is autographing a ball.", "People love to get autographs of the players.", "The men play a professional sport that according to their logo requires a bat while the kids are wearing gloves. the players are holding pens and are using it to write their name on a round object."], "image": "train2014/COCO_train2014_000000484355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356068, "question_id": "JrH6niuwJPbiCVuDzWrtpw", "question": "What is in the tomato sauce?", "choices": ["steak", "chicken wings", "meatballs", "sloppy joe"], "correct_choice_idx": 2, "direct_answers": ["tomatoes", "meatballs", "tomatoes", "meatballs", "paste", "meatballs", "meatballs", "tomatoes", "meatballs", "meatballs"], "difficult_direct_answer": false, "rationales": ["You can see the shape of them under the sauce.", "There are round lumps under the sauce", "They are round and usually do come with sauce"], "image": "train2014/COCO_train2014_000000356068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143129, "question_id": "JrPg65Mq8sxM7T8CJnwJjr", "question": "Where is the person in?", "choices": ["farm", "barn", "zoo", "wilderness"], "correct_choice_idx": 3, "direct_answers": ["nature preserve", "african plains", "africa", "grass", "meadow", "jungle", "shade", "africa", "wilderness", "nature"], "difficult_direct_answer": true, "rationales": ["The person is sitting with the wild animals which makes it seem like its in the wilderness since there is no enclosure.", "The person is outside. there are giraffes near the person.", "The person is in the wild."], "image": "val2014/COCO_val2014_000000143129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104392, "question_id": "JrS5LNW2yqjnKcnSkSBEm9", "question": "Which appliance is free of thermal conduction?", "choices": ["oven", "microwave", "stovetop", "fridge"], "correct_choice_idx": 3, "direct_answers": ["fan", "refrigerator", "stovetop", "fridge", "stove", "stove", "stovetop", "refrigerator", "fridge", "fridge"], "difficult_direct_answer": false, "rationales": ["The only appliance that doesn't use heat is the fridge in this picture.", "The fridge is free.", "The fridge cools things rather than heats them as is done in in the case of thermal conduction."], "image": "val2014/COCO_val2014_000000104392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387849, "question_id": "JrTuCkwkvayySordfyDSYs", "question": "What does this vehicle travel on?", "choices": ["water", "rails", "air currents", "roadways"], "correct_choice_idx": 1, "direct_answers": ["rails", "tracks", "railroad", "rails", "rails", "rails", "tracks", "rails", "rails", "tracks"], "difficult_direct_answer": false, "rationales": ["This vehicle is a train, not an airplane, car, or boat.", "It travels on the rails.", "It travels on tracks to get from place to place."], "image": "train2014/COCO_train2014_000000387849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208506, "question_id": "JrZT5C9ViBXvYY3rdMyfCy", "question": "What are the two men waving?", "choices": ["game remotes", "phones", "soda cans", "chargers"], "correct_choice_idx": 0, "direct_answers": ["wii remotes", "game controllers", "game controllers", "playing game", "controllers", "wii mote", "remotes", "game remotes", "wii controller", "wii remotes"], "difficult_direct_answer": false, "rationales": ["The two men have remotes.", "The men are both holding white wii controllers.", "They are holding up these controllers for an electronic entertainment activity that involves people moving around. they are looking at or at the level of these objects."], "image": "train2014/COCO_train2014_000000208506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242664, "question_id": "Jrgus69prABC7T4hcBqyUe", "question": "Jaldapara National Park is famous for which animal?", "choices": ["kangaroo", "tiger", "elephant", "lion"], "correct_choice_idx": 2, "direct_answers": ["elephants", "elephants", "elephants", "elephants", "indian rhinoceros", "elephant", "elephant", "elephants", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The park has a lot of indian elephants.", "I'm using the image as the answer. an online search indicates that none of these answers are correct.", "There are a lot of large massive animals with tusk and trunks."], "image": "train2014/COCO_train2014_000000242664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329139, "question_id": "JrhZFr9zytz2mCUHbDNcEJ", "question": "What are the bananas intended for?", "choices": ["planting", "selling", "eating", "disposal"], "correct_choice_idx": 2, "direct_answers": ["runners", "eating", "marathon runners", "marathoners", "snack", "energy", "throwing", "eating", "eating", "runners"], "difficult_direct_answer": false, "rationales": ["The bananas are to eat.", "Bananas are piled on tables and runners run in the background. marathon runners eat bananas to maintain energy.", "The fruit is ready to be eaten after the race and is useful for recovery."], "image": "val2014/COCO_val2014_000000329139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373153, "question_id": "Js4Bn5vsNWbaSsRA3XLQFW", "question": "What type of animal is shown?", "choices": ["domestic", "aquatic", "wild", "stuffed"], "correct_choice_idx": 2, "direct_answers": ["cow", "yak", "yak", "cow", "wild", "cow", "cow", "cow", "yak", "highland cow"], "difficult_direct_answer": false, "rationales": ["He is roaming free.", "Looks to be like some kind of longer hair cow.", "A wild cow is shown."], "image": "train2014/COCO_train2014_000000373153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512022, "question_id": "JseWyCZtGwMquPnifuHwBe", "question": "What could be used to make the walnuts in their current condition?", "choices": ["knife", "strainer", "pitcher", "spatula"], "correct_choice_idx": 0, "direct_answers": ["nutcracker", "knife", "crushed", "hammer blender", "hammer", "mallet", "grinder", "mallet", "knife", "cake"], "difficult_direct_answer": false, "rationales": ["The walnuts can be sliced.", "There are walnut pieces on the top of this cupcake. these pieces looks like they were cut precisely so a knife would fit this criteria.", "The walnuts have been chopped. a tool that can be used to cut whole solid things into smaller pieces would have been used."], "image": "train2014/COCO_train2014_000000512022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444619, "question_id": "Jt3uvRKs3yq2u8AdwSTAoT", "question": "Why are people sitting on the sidewalk?", "choices": ["watching parade", "watching traffic", "as punishment", "resting"], "correct_choice_idx": 0, "direct_answers": ["spectating", "parade", "watching bikers", "spectating", "watching", "watching motorcycles", "watching parade", "watching motorcycles", "watching parade", "watching"], "difficult_direct_answer": false, "rationales": ["Normally people sit on the side of the road to view processions like this one.", "The people are watching the parade.", "There are people on motorcycles in the street driving in formation. there are many people filling the sidewalks, watching the people in the street."], "image": "train2014/COCO_train2014_000000444619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111940, "question_id": "JtKpr8be3QR8beDCYYbTNJ", "question": "How many mufflers are there in the bike?", "choices": ["two", "four", "one", "three"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["You can see one canister that handles the exhaust.", "There is a single muffler that i can see.", "There is one muffler pipe on the back of the motorcycle."], "image": "train2014/COCO_train2014_000000111940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31019, "question_id": "JtUpzVpY8tXM8LNRtsyNrf", "question": "Why are the traffic cones positioned in the location that they are?", "choices": ["directions", "road closure", "art", "construction"], "correct_choice_idx": 1, "direct_answers": ["road closure", "traffic control", "blocking road", "safety", "crowd control", "road closed", "traffic control", "block traffic", "direct traffic", "barrier"], "difficult_direct_answer": true, "rationales": ["There are many pedestrians beyond the road. for so many people to congregate in the middle of the road it would likely be for an arranged event where the road would be closed and need to be marked off as such.", "The traffic cones are for road closure.", "It's to prevent cars from driving where people are walking"], "image": "val2014/COCO_val2014_000000031019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295409, "question_id": "JtYVGXrotWVHoGYvjWXLnY", "question": "What type of video game might the man be playing?", "choices": ["puzzle", "fighting", "bowling", "shooting"], "correct_choice_idx": 2, "direct_answers": ["bowling", "wii", "bowling", "baseball", "bowling", "sports", "wii", "wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["The man might be playing wii bowling.", "The game is for bowling.", "He has his palm up in front of him which is how you throw a bowling ball"], "image": "val2014/COCO_val2014_000000295409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209106, "question_id": "JtqJquKUCoV59YSzqdgjHM", "question": "Which river is shown in picture?", "choices": ["ganges", "nile", "indus", "yamuna"], "correct_choice_idx": 0, "direct_answers": ["indian", "ganges", "denali", "ganga", "ganges", "ganges", "unknown", "unknown", "thai", "ganges"], "difficult_direct_answer": false, "rationales": ["This river is in the ganges.", "That is the river shown here.", "This would be the ganges river."], "image": "train2014/COCO_train2014_000000209106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396542, "question_id": "JtwbX4984TJQuDuawd83cb", "question": "Why is the girl pulling a tree behind her?", "choices": ["to recycle", "to take", "to break", "to paint"], "correct_choice_idx": 1, "direct_answers": ["keeping it", "for christmas", "christmas", "to take", "christmas tree", "to decorate", "christmas", "for christmas", "tree", "christmas tree"], "difficult_direct_answer": false, "rationales": ["The girl is dragging the tree on purpose. it will go with her.", "During christmas people cut down trees and bring them home.", "She will put it in her house and decorate it"], "image": "val2014/COCO_val2014_000000396542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264737, "question_id": "JuYcGn3pWD29r8uUg46Dne", "question": "What does the girl come to this venue for?", "choices": ["raising animals", "feeding animals", "petting animals", "riding animals"], "correct_choice_idx": 2, "direct_answers": ["goats", "pet animals", "showing livestock", "playing", "see animals", "petting", "petting", "pet animals", "petting", "petting animals"], "difficult_direct_answer": false, "rationales": ["These are docile animals you can get close to", "It looks like a zoo where people can pet the animals.", "Here people are inside a cage with animals; seemingly not to feed or clean them. such a scene would normally be found in a petting zoo."], "image": "val2014/COCO_val2014_000000264737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379177, "question_id": "JufwBbiidBitAZdEfARb7h", "question": "The food containing the most potassium is in which row?", "choices": ["second", "third", "first", "fourth"], "correct_choice_idx": 2, "direct_answers": ["first", "first", "row one", "first", "middle", "top", "first row", "top", "top", "first row"], "difficult_direct_answer": false, "rationales": ["The food most known for containing the most potassium is a banana, which can be seen in the middle of the first row.", "Bananas are in the first row.", "Bananas are high in potassium"], "image": "train2014/COCO_train2014_000000379177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516422, "question_id": "JvE9MiRKTcnjkQNj4onvMj", "question": "The man seated on the bench is interested in what?", "choices": ["basketball", "fashion", "news", "cell phones"], "correct_choice_idx": 2, "direct_answers": ["newspaper", "reading newspaper", "newspaper", "reading news", "newspaper", "news", "reading", "sports", "news", "newspaper"], "difficult_direct_answer": false, "rationales": ["The man on the bench is reading the newspaper in his lap because he is interested in the news.", "The man is reading a newspaper.", "He has a newspaper open on his lap."], "image": "val2014/COCO_val2014_000000516422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446307, "question_id": "JvKqzuLrZpf2NpiJF5QTA6", "question": "At what locale do the people stand?", "choices": ["forest station", "train depot", "market", "music studios"], "correct_choice_idx": 1, "direct_answers": ["yellow line", "train station", "train stop", "platform", "train depot", "train station", "railway station", "station", "train station", "tracks"], "difficult_direct_answer": false, "rationales": ["The people are waiting to get on the yellow train.", "The local is a train depot.", "The people are at a train station."], "image": "train2014/COCO_train2014_000000446307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538309, "question_id": "JvSeWMWLubP43AK2oj8UGX", "question": "What is this animals biggest predator?", "choices": ["tigers", "humans", "crocodiles", "hyenas"], "correct_choice_idx": 1, "direct_answers": ["tiger", "lion", "humans", "lions", "humans", "lion", "human", "humans", "lions humans", "lion"], "difficult_direct_answer": false, "rationales": ["The animals fear humans.", "Humans kill the most elephants.", "People kill more elephants for no reason than any other wild animal."], "image": "train2014/COCO_train2014_000000538309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111969, "question_id": "Jvgw2XS5gEqfpNtn6unkn5", "question": "What type of vehicle is shown?", "choices": ["scooter", "train", "airplane", "bus"], "correct_choice_idx": 2, "direct_answers": ["airplane", "plane", "plane", "airplane", "airplane", "plane", "airplane", "airplane", "plane", "plane"], "difficult_direct_answer": false, "rationales": ["A large vehicle with wings and a cockpit is parked.", "The vehicle is a plane.", "The transport has wings."], "image": "train2014/COCO_train2014_000000111969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401603, "question_id": "Jvj2fj2isYTDaB3D5nxfoN", "question": "At what kind of location are they skateboarding?", "choices": ["basement", "gymnasium", "backyard", "skate park"], "correct_choice_idx": 3, "direct_answers": ["park", "skate park", "skateboarding park", "park", "skatepark", "skate park", "skate park", "skate park", "skatepark", "skate park"], "difficult_direct_answer": false, "rationales": ["There are people using skateboards at the park.", "The location is a skate park.", "There is a large cement structure with drops and curves."], "image": "train2014/COCO_train2014_000000401603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443805, "question_id": "JvxHNyG3ym2Zjj6aZtiH6L", "question": "What is on the clock?", "choices": ["cat", "cherub statues", "gravy", "hat"], "correct_choice_idx": 1, "direct_answers": ["numbers", "cherub statues", "hands", "angels", "hands", "hands", "angels", "angels", "angles", "hands"], "difficult_direct_answer": false, "rationales": ["The clock has statues of angels on it.", "They are tiny angels", "There are angels sitting on either side of the middle clock face."], "image": "train2014/COCO_train2014_000000443805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245793, "question_id": "JvxaP26QvAYDHAH5Yvurhh", "question": "Where might someone buy a ticket for this train?", "choices": ["online", "newsboy", "inside building", "train"], "correct_choice_idx": 2, "direct_answers": ["train station", "ticket booth", "bottom right", "train station", "travel", "train station", "station kiosk", "inside building", "ticket booth", "train station"], "difficult_direct_answer": false, "rationales": ["Train boarding passes are traditionally sold at buildings at the station as we can see here.", "Someone wants to go in the building.", "They sell tickets in the station."], "image": "val2014/COCO_val2014_000000245793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390969, "question_id": "Jw95t7N8CYCNcaYmvkyRHk", "question": "What device is the one most probably attached to the man's arm?", "choices": ["game", "music player", "phone", "hard drive"], "correct_choice_idx": 1, "direct_answers": ["music player", "glucose monitoring", "ipod", "activity tracker", "phone", "fitbit", "phone", "phone", "phone holder", "phone"], "difficult_direct_answer": false, "rationales": ["There is a man with a mp3 or something with headphones.", "He has it attached to his arm to listen to through head phones.", "The device is for music."], "image": "train2014/COCO_train2014_000000390969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135666, "question_id": "JwJaSN63pVkWjEw8b6tAge", "question": "Why are the bikes all stopped?", "choices": ["resting", "traffic congestion", "red light", "no gas"], "correct_choice_idx": 2, "direct_answers": ["red light", "red stoplight", "red light", "red light", "red light", "red stoplight", "red light", "red light", "red light", "red stoplight"], "difficult_direct_answer": false, "rationales": ["The bikes have stopped because they're at an intersection and the light is red. when it turns green again, they'll be on their way.", "Bikes are in the street. traffic lights are used to stop traffic.", "The bikes are at a red light."], "image": "val2014/COCO_val2014_000000135666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166173, "question_id": "JwTeo5uWJysA6Mq4eoB8du", "question": "What must make contact with the surface of the ground in order to stick the landing?", "choices": ["board", "sneakers", "wheels", "lining"], "correct_choice_idx": 2, "direct_answers": ["wheels", "wheels", "skateboard", "skateboard", "wheels", "wheels", "wheels", "wheels", "skateboard", "wheels"], "difficult_direct_answer": false, "rationales": ["The wheels need to hit the ground first before the man lands on top.", "The wheels need to contact with the boardwalk for a landing.", "If the other part hits the board will skid to a stop"], "image": "train2014/COCO_train2014_000000166173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525058, "question_id": "JwgcjDvq486X6NZQpq4gqV", "question": "What act are these boys doing?", "choices": ["jaywalking", "trespassing", "joggling", "running"], "correct_choice_idx": 0, "direct_answers": ["standing", "waiting", "waiting", "waiting", "jaywalking", "walking", "standing", "awaiting crossing", "loiter", "standing"], "difficult_direct_answer": false, "rationales": ["Is the most likely given that the light tells them not to cross.", "Given the traffic sign and the position of the boys you can tell what they are doing.", "The boys are crossing the street in the middle of the block instead of at an intersection, which is known as jaywalking."], "image": "val2014/COCO_val2014_000000525058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342259, "question_id": "JxCcUiqcyp7ASTiuHxDUJa", "question": "The popular American brand of hot sauce is made up of what?", "choices": ["chilly", "tabasco peppers", "peppercorn", "capsicum"], "correct_choice_idx": 1, "direct_answers": ["pepper", "tabasco peppers", "hot peppers", "tabasco", "tabasco", "peppers", "peppers", "tabasco", "tabasco peppers", "tabasco sauce"], "difficult_direct_answer": false, "rationales": ["It says on the bottle.", "These are a patented pepper that are only used in this product", "Tabasco peppers is in hot sauce."], "image": "train2014/COCO_train2014_000000342259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358539, "question_id": "JxLQ5XphdjtGTbMNEpwBGp", "question": "What kind of music band do they play music in?", "choices": ["pop", "rock", "mariachi", "country"], "correct_choice_idx": 2, "direct_answers": ["traditional band", "mariachi music", "mexican music", "spanish", "mariachi", "mariachi", "mexican", "spanish music", "mexican", "jam"], "difficult_direct_answer": false, "rationales": ["As indicated by their clothes, they're likely a. that said, they could be playing d music as well, as indicated by the guy's hat.", "They have the style of clothes and look like they are hispanic.", "The outfits and instruments carried by these men suggests a latin flare."], "image": "train2014/COCO_train2014_000000358539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379310, "question_id": "JxgZq3Ut67KvSpiM5JRRWc", "question": "What type of phone is available?", "choices": ["cellular", "landline", "pay", "cordless"], "correct_choice_idx": 1, "direct_answers": ["internet", "office phone", "landline", "landline", "dial", "nothing", "home phone", "landline", "landline", "landline"], "difficult_direct_answer": false, "rationales": ["The phone is a standard landline with a base and a dial tone. the phone is not mobile and does have a cord.", "There is a landline phone sitting next to the computer.", "The phone has a cord attached to the box that has a cord attached to a wall."], "image": "val2014/COCO_val2014_000000379310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411821, "question_id": "Jxjh7uuGAj6NCuD2xNyfKR", "question": "What is the photo showing?", "choices": ["chairs", "beds", "suitcases", "tables"], "correct_choice_idx": 2, "direct_answers": ["suitcases", "suitcases", "suitcases", "suitcases", "suitcases", "travel case", "suitcases", "suitcases", "luggage", "suitcases"], "difficult_direct_answer": false, "rationales": ["One can see the luggage of various sizes and colors that are stacked on each other.", "The photo shows a stack of suitcases.", "This is a stack of luggage."], "image": "val2014/COCO_val2014_000000411821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278461, "question_id": "JxtmY8o2tAwKx4zeGXZ6Fi", "question": "What kind of musical group is this?", "choices": ["girl band", "boy group", "boy band", "man band"], "correct_choice_idx": 2, "direct_answers": ["singing", "band", "christian", "men's choir", "chorus", "folk", "boy band", "boy band", "quintet", "quintet"], "difficult_direct_answer": false, "rationales": ["The group on stage is a boy band consisting of five male singers.", "There are several men singing on stage. they are singing a song of some kind.", "Bands imply that there are instruments. they do not have instruments other than their voices."], "image": "train2014/COCO_train2014_000000278461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116834, "question_id": "JxwQpBxqgXjeFff8EcCL29", "question": "Why is there writing on the boat?", "choices": ["fleet number", "sales ad", "graffiti", "boat name"], "correct_choice_idx": 3, "direct_answers": ["boat name", "name", "boat name", "boat name", "boat name", "boat name", "name", "owner's name", "boat name", "boat's name"], "difficult_direct_answer": false, "rationales": ["Boats are given a name so they can be different from each other.", "It is identification.", "The owners chose a moniker to use to identify their boat."], "image": "train2014/COCO_train2014_000000116834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332006, "question_id": "Jxymwm4Q3QhNkfC8syoJbc", "question": "What are the trees in the background called?", "choices": ["oak", "ash", "pine", "marula"], "correct_choice_idx": 3, "direct_answers": ["oleifera", "joshua", "marula", "big trees", "sawaro", "acacias", "savannah", "acacia", "marula", "acacia"], "difficult_direct_answer": false, "rationales": ["There are marula trees in africa.", "The trees are marulas.", "Marula trees have the shape of trees shown behind a woman."], "image": "train2014/COCO_train2014_000000332006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297419, "question_id": "JyEBmxF2EsH9uX2Cf6cuNv", "question": "What animal is near the flowers?", "choices": ["dog", "rooster", "cat", "cow"], "correct_choice_idx": 1, "direct_answers": ["chickens", "chicken", "rooster", "rooster", "chicken", "chicken", "chicken", "rooster", "chicken", "rooster"], "difficult_direct_answer": false, "rationales": ["The animal as evident is the roast one.", "Roosters are near the daffodils.", "There is a rooster near the flowers that has black feathers and a red thing on his head."], "image": "val2014/COCO_val2014_000000297419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67838, "question_id": "JyQmtaXns6Ru5LnYdJcfv8", "question": "What type of top is the boy wearing?", "choices": ["jacket", "tank", "vest", "polo"], "correct_choice_idx": 1, "direct_answers": ["tank top", "tank", "vest", "tank top", "vest", "tank top", "tank top", "sleeveless tee", "vest", "sleeveless"], "difficult_direct_answer": false, "rationales": ["As indicated by the sleeveless design. c doesn't work with this image.", "The boy is wearing a sleeveless shirt.", "The boy is wearing a top with the sleeves cut off."], "image": "train2014/COCO_train2014_000000067838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317792, "question_id": "JzwdajqbCDyfrgN6hFvkfX", "question": "What is at the top of the pole with the square top?", "choices": ["bird", "flag", "clock", "cat"], "correct_choice_idx": 2, "direct_answers": ["clock", "clock", "clock", "finial", "clock", "clock", "spike", "clock", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["There is a clock face on the pole.", "The pole has a clock.", "There is a circle with a watch face."], "image": "train2014/COCO_train2014_000000317792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200663, "question_id": "K2H5jrXA3vBFUk4KQoAuP5", "question": "What kind of occasion is the light blue clothing for?", "choices": ["informal", "swimming", "cooking", "formal"], "correct_choice_idx": 3, "direct_answers": ["wedding", "formal", "formal", "school", "church", "school uniform", "formal", "celebration", "wedding", "formal"], "difficult_direct_answer": false, "rationales": ["The light blue clothing is a necktie which is worn in a more professional or elegant setting.", "You wear that clothing for business and dances.", "Suits are worn for formal occasions."], "image": "train2014/COCO_train2014_000000200663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71434, "question_id": "K2jQgRQWXV9Uy4jeCpSZpH", "question": "What technique is being utilized to capture movement in this scene?", "choices": ["hdr imaging", "contre-jour", "bokeh", "time-lapse"], "correct_choice_idx": 3, "direct_answers": ["light editing", "slow motion", "laser", "fading", "photo", "light", "camera technique", "light painting", "iso", "time-lapse"], "difficult_direct_answer": true, "rationales": ["By the placement of the people in the photo it is easy to tell what is being done.", "The people are pictured being in multiple places.", "There is a light across and shadows of people."], "image": "train2014/COCO_train2014_000000071434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37514, "question_id": "K2pNXD5utfj4JrYKWcrsT7", "question": "What is the oval object on the bike tire used for?", "choices": ["protection", "esthetics", "carrying things", "mud flap"], "correct_choice_idx": 2, "direct_answers": ["seat", "carry possessions", "basket", "storage", "carrying", "traversing land", "storage", "balance", "basket", "carrying things"], "difficult_direct_answer": false, "rationales": ["It looks like a seat frame.", "Boxes can attach to it.", "An oval rack is on the back of a bike near the tire. the rack is metal. people use racks on their bikes to carry stuff."], "image": "train2014/COCO_train2014_000000037514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422870, "question_id": "K2qJxrhdfobrE9hmd8B227", "question": "What state is this driver's final destination?", "choices": ["minnesota", "texas", "south dakota", "north dakota"], "correct_choice_idx": 2, "direct_answers": ["south dakota", "sturgis", "sturgis", "south dakota", "south dakota", "sturgis", "south dakota", "south dakota", "sturgis", "sturgis"], "difficult_direct_answer": false, "rationales": ["The city of sturgis is in south dakota.", "Sturgis is in south dakota", "I chose the state where the famous sturgis motorcycle rally takes place in the black hills."], "image": "val2014/COCO_val2014_000000422870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142847, "question_id": "K2yR9oHByqibEWiesQUaFd", "question": "On which countries soil does this bus operate?", "choices": ["north korea", "us", "south korea", "japan"], "correct_choice_idx": 2, "direct_answers": ["korea", "japan", "korea", "korea", "korea", "korea", "korea", "south korea", "korea", "japan"], "difficult_direct_answer": false, "rationales": ["The country is south korea.", "The white text on the side of the bus before international travel company indicates the country.", "It says 'korea\" on the bus and it's much more likely that south korea, much more \"business oriented\" than north korea, would be running this business."], "image": "train2014/COCO_train2014_000000142847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503310, "question_id": "K3GizxGsicm6wMjTD8MT7F", "question": "Where are the seats placed inside?", "choices": ["subway", "sedan", "van", "airplane"], "correct_choice_idx": 3, "direct_answers": ["airplane", "near window", "plane", "plane", "airplane", "airplane", "bus", "airplane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["The seats are at an airplane.", "The area is the interior of an passenger plane.", "The sky is the only thing visible outside the windows, so the vehicle is not on the ground or underground."], "image": "train2014/COCO_train2014_000000503310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442487, "question_id": "K3ZW7LvZTjeUJoyyauRzCa", "question": "What is the boy doing?", "choices": ["running", "push ups", "sitting", "standing"], "correct_choice_idx": 3, "direct_answers": ["screaming", "standing", "wii", "playing wii", "playing game", "playing", "laughing", "playing games", "playing", "clapping"], "difficult_direct_answer": true, "rationales": ["The boy's feet are both firmly planted on the ground with his legs and body all straight up and down.", "He is standing up and laughing at something.", "The kid could not take his eyes off of the television. he positioned himself directly in front of it."], "image": "val2014/COCO_val2014_000000442487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101157, "question_id": "K3ZnYpxTCATDReAMWeHYzV", "question": "How many people can most likely sit down to a meal at the dinner table?", "choices": ["four", "two", "six", "eight"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["That is the number of chairs available", "This is a pub table. so, usually they can only handle a or b, but in this case, it's a small one with only a chairs.", "There is one chair to the left of the table and an additional chair to the right."], "image": "train2014/COCO_train2014_000000101157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194729, "question_id": "K3bgZZbPmCnCBJ95scycZi", "question": "What event is most likely memorialized in the display?", "choices": ["pearl harbor", "wwi", "9/11", "wwii"], "correct_choice_idx": 2, "direct_answers": ["nine eleven", "9-11", "war", "911", "9/11", "nine eleven", "life", "war", "tragedy", "9-11"], "difficult_direct_answer": false, "rationales": ["A memorial to 911.", "The pic on the memorial shows a pic of the twin towers that went down.", "There is a rendition of the twin towers on the statue."], "image": "val2014/COCO_val2014_000000194729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165291, "question_id": "K4cMKnLPKaz7fVvrzLM7DA", "question": "What is the relationship of the woman to the girl?", "choices": ["great grandmother", "daughter", "sister", "mother"], "correct_choice_idx": 3, "direct_answers": ["mother", "mother", "mother", "mother", "friend", "friendly", "mother daughter", "friend", "mother", "mother"], "difficult_direct_answer": false, "rationales": ["The women look very similar to each other. based on their relative ages and their similar appearances it is likely that they are mother and daughter with the older woman being the mother.", "They look similar and one is about twice as old as the other.", "A woman in blue is older and looks like the other person."], "image": "train2014/COCO_train2014_000000165291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552962, "question_id": "K4ioWb74g5J9nkvZ3AhFNX", "question": "The clock is reading five minutes before which hour?", "choices": ["eleven", "twelve", "seven", "eight"], "correct_choice_idx": 2, "direct_answers": ["seven", "seven", "seven", "seven", "seven", "7 o'clock", "seven", "six", "7 o'clock", "seven"], "difficult_direct_answer": false, "rationales": ["The hour hand is closest to seven.", "The hour hand is past six. it is almost at the next hour.", "As long as you know how to read an old clock face, you can tell what time it is."], "image": "train2014/COCO_train2014_000000552962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179727, "question_id": "K57e9sz7EPngJpErPTiBmz", "question": "Which shadow is the longest?", "choices": ["pole", "tennis racket", "woman", "man"], "correct_choice_idx": 2, "direct_answers": ["woman", "woman's", "umpire", "line judge", "player shadow", "tennis player's", "girls", "womans", "man's", "fence post"], "difficult_direct_answer": true, "rationales": ["This person is standing on a flat surface further away from the wall than the other person.", "A man and woman are both standing on a tennis court in the sun and the woman's shadow extends beyond that of the man.", "The way the sun is hitting the woman her shadow is the longest."], "image": "val2014/COCO_val2014_000000179727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557539, "question_id": "K5JbfzZMKZ8wLMjxoBVQKm", "question": "What is the item in the middle called?", "choices": ["corn dog", "hot dog", "chicago dog", "philly steak"], "correct_choice_idx": 0, "direct_answers": ["corndog", "corn dog", "corndog", "brat", "corn dog", "corndog", "corndog", "corn dog", "corndog", "corndog"], "difficult_direct_answer": false, "rationales": ["It is a hot dog dipped in corn batter and fried.", "The middle item is fried in corn batter.", "It's a hotdog wrapped with dough and then fried."], "image": "train2014/COCO_train2014_000000557539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435885, "question_id": "K5TihZXETkFpVfPRXbcUwV", "question": "What is the kite above the man with the board used for?", "choices": ["decoration", "air tricks", "surfing", "jumping"], "correct_choice_idx": 2, "direct_answers": ["kite surfing", "parasailing", "kite boarding", "parasailing", "surfing", "parasailing", "windsurfing", "wind surfing", "wakeboard parasailing", "surfing"], "difficult_direct_answer": false, "rationales": ["It helps move people through the water on boards", "These kites help drag a surfer in the water behind a boat.", "The kite is used to surf in the water."], "image": "train2014/COCO_train2014_000000435885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570543, "question_id": "K5UAFwQk35pGWLSJYdnaMr", "question": "What is the vessel in the foreground called?", "choices": ["tugboat", "galley", "catamaran", "rowboat"], "correct_choice_idx": 2, "direct_answers": ["boat", "boat", "sailboat", "catamaran", "boat", "catamaran", "boat", "sailboat", "sailboat", "sailboat"], "difficult_direct_answer": false, "rationales": ["The vessel is in the water and has a sail on it.", "This is a type of sail boat that is up off the water some.", "It is similar to a sailboat but the part in the water is different"], "image": "train2014/COCO_train2014_000000570543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517082, "question_id": "K5ZjaiB8H6LKyUAFecW55p", "question": "What relationship do the girls under the umbrella likely have?", "choices": ["strangers", "newly met", "close friendship", "enemies"], "correct_choice_idx": 2, "direct_answers": ["sisters", "friends", "friends", "close friendship", "sisters", "sisters", "classmates", "friends", "best friends", "sister"], "difficult_direct_answer": false, "rationales": ["These girls are friends since they're so close to each other.", "There are two young girls that are sharing an umbrella. they look happy and glad to be with each other.", "The girls are wearing matching outfits and standing close together."], "image": "val2014/COCO_val2014_000000517082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219525, "question_id": "K5s5q7nLsfaQe4VBPh7XcU", "question": "Why does the woman use pink umbrella?", "choices": ["camouflage", "sunproof", "match clothes", "visibility"], "correct_choice_idx": 2, "direct_answers": ["for style", "match clothes", "raining", "shade", "shade", "rain", "protection", "rain", "stay dry", "prevent sunrays"], "difficult_direct_answer": false, "rationales": ["This woman has a fashion sense and wants to have her umbrella match her shirt.", "The woman is wearing a pink top and seems to coordinate her top with her umbrella.", "The umbrella matches the top color."], "image": "val2014/COCO_val2014_000000219525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416981, "question_id": "K67CAUzFcBnFRrZS5xq2LY", "question": "What type stove is seen here?", "choices": ["electric", "natural gas", "wood", "conductive"], "correct_choice_idx": 1, "direct_answers": ["gas stove", "gas", "gas", "gas", "gas", "natural gas", "gas stove", "gas", "gas stove", "gas stove"], "difficult_direct_answer": false, "rationales": ["The stove is natural gas.", "There are holes on the burners which indicates flames will come up", "The burners are made to run on natural gas."], "image": "train2014/COCO_train2014_000000416981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117759, "question_id": "K68JsGFEvC6FTjxVbK6NDE", "question": "What is she doing?", "choices": ["fixing hair", "cleaning neck", "hiding shirt", "heating head"], "correct_choice_idx": 0, "direct_answers": ["drying hair", "drying hair", "drying hair", "fixing hair", "drying curling", "hair drying", "hair drying", "drying hair", "styling hair", "drying hair"], "difficult_direct_answer": false, "rationales": ["She is using a dryer and brush to style.", "The woman is styling her hair using a blow dryer in her left hand and a round brush in the right.", "A woman is holding a hair dryer and standing in front of a mirror. hair dryers are used to style hair."], "image": "val2014/COCO_val2014_000000117759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289107, "question_id": "K68tt7sRiwnTC5orAxBzSY", "question": "What is the umbrella topped structure on the right supposed to resemble?", "choices": ["traffic light", "bush", "lamp", "tree"], "correct_choice_idx": 3, "direct_answers": ["tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["The umbrella is covered in fake leaves.", "It's to keep the park aesthetics", "It covers and provides shelter some like a tree would."], "image": "train2014/COCO_train2014_000000289107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69344, "question_id": "K6EGaNMgDzeLaPjHaN267r", "question": "What are the kids doing?", "choices": ["count candles", "pull candles", "cut cake", "insert candles"], "correct_choice_idx": 1, "direct_answers": ["pull candles", "placing candles", "eating cake", "eating cake", "removing candles", "eating cake", "removing candles", "removing candles", "removing candles", "eating"], "difficult_direct_answer": false, "rationales": ["They want to lick the frosting off the bottom of the candles.", "They are taking the candles out of the cake so they can eat it.", "They have their hands on them lifting them up"], "image": "train2014/COCO_train2014_000000069344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434135, "question_id": "K6HbHooPBCtWEB2sKRApuG", "question": "Why does the sign have all the strange writing?", "choices": ["graffiti", "amuse tourists", "in india", "confuse elephant"], "correct_choice_idx": 2, "direct_answers": ["foreign country", "in india", "foreign language", "foreign", "indian language", "information", "foreign advertising", "sanscript", "advertisement", "different language"], "difficult_direct_answer": true, "rationales": ["The characters depicted on this storefront belong to the language of hindi.", "This sign is on the front of a shop in india.", "This is in india."], "image": "train2014/COCO_train2014_000000434135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484158, "question_id": "K6KBC7QKF5a58srxbtsvcr", "question": "Which US state is this street situated in?", "choices": ["ohio", "wisconsin", "illinois", "idaho"], "correct_choice_idx": 2, "direct_answers": ["illinois", "new york", "chicago", "new york", "detroit", "chicago", "illinois", "chicago", "illinois", "illinois"], "difficult_direct_answer": false, "rationales": ["That is where chicago is located", "The state is illinois.", "On the far left of the picture, text on the bus mentions the city name of chicago. chicago is located within the u.s. state listed in a."], "image": "train2014/COCO_train2014_000000484158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77745, "question_id": "K6NdrTbHBrEhGdE8b2Bo6P", "question": "When the rainfall total is high the town faces the possibility of what natural disaster?", "choices": ["flood", "fire", "tornado", "earthquake"], "correct_choice_idx": 0, "direct_answers": ["flooding", "flooding", "flood", "flooding", "floods", "boat", "flood", "flooding", "flooding", "flood"], "difficult_direct_answer": false, "rationales": ["There would be a flood.", "High rainfall totals could cause the water around the town to rise. the water could then enter the town and cause damage.", "Heavy rains cause floods."], "image": "train2014/COCO_train2014_000000077745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114946, "question_id": "K6Sr5YZqLMMzo5nWZ9itrt", "question": "What is he observing through the lens?", "choices": ["unseen", "himself", "woman", "umbrella"], "correct_choice_idx": 0, "direct_answers": ["trees", "amberly", "umbrella", "unseen", "unknown", "picture", "camera", "bird", "monument", "birds"], "difficult_direct_answer": true, "rationales": ["The think he is looking at is in the sky and not in the image so it is uncertain.", "We dont know what he is looking at.", "He's trying to be unseen."], "image": "val2014/COCO_val2014_000000114946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208174, "question_id": "K6WwCfUbMYXrVom9MhvmQa", "question": "What organization is the man's outfit from?", "choices": ["firefighters", "paramedics", "boy scouts", "navy"], "correct_choice_idx": 2, "direct_answers": ["mounted police", "scout", "scouts", "police", "girl scouts", "peace corps", "scouts", "boy scouts", "boy scouts", "navy"], "difficult_direct_answer": false, "rationales": ["The boy scouts are the entity that hands out these uniforms.", "Everyone is wearing scouting uniforms by gender so the man would be wearing one for his gender.", "He is in the scouts."], "image": "val2014/COCO_val2014_000000208174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83067, "question_id": "K6eRvPJdR44gy2B3fhqNgr", "question": "Where will the boat go if the wind stops?", "choices": ["west", "nowhere", "east", "north"], "correct_choice_idx": 1, "direct_answers": ["downstream", "downstream", "nowhere", "nowhere", "nowhere", "nowhere", "nowhere", "nowhere", "downstream", "nowhere"], "difficult_direct_answer": false, "rationales": ["The direction in option a matches what where a sailboat goes in the event the winds stops blowing.", "The boat will sit still due to no wind.", "Its a sailboat that moves from the wind so no wind it won't move."], "image": "val2014/COCO_val2014_000000083067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305206, "question_id": "K6h3RBcX5h8h6yrGUaLiGp", "question": "What position is the man in blue on the dirt playing?", "choices": ["first base", "pitcher", "catcher", "second base"], "correct_choice_idx": 1, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["He is standing on the mound in the center of the baseball diamond.", "The position is the pitcher.", "We see this man on a pitchers mound preparing to throw a baseball. the person who fulfills these duties is known as a pitcher in baseball."], "image": "val2014/COCO_val2014_000000305206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39556, "question_id": "K6q5rGiv3aiA5nwc7mavxv", "question": "What surface allows the trains to be mobile?", "choices": ["rails", "cement", "asphalt", "pavement"], "correct_choice_idx": 0, "direct_answers": ["rails", "tracks", "tracks", "track", "rails", "tracks", "tracks", "tracks", "wheels", "rails"], "difficult_direct_answer": false, "rationales": ["Trains are not road vehicles. they cannot travel on pavement, cement, or asphalt.", "Trains have wheels that ride exclusively on rails.", "The train tracks are what help the train move from one place to another."], "image": "train2014/COCO_train2014_000000039556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388026, "question_id": "K7v9Y4xMwApUTDPWCQW65u", "question": "Why has caused traffic to be so light on this roadway?", "choices": ["flooding", "rain", "tornados", "snow"], "correct_choice_idx": 3, "direct_answers": ["snow", "weather", "snow", "bad weather", "ice", "snow", "snow", "snow storm", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The snow caused the lightness.", "It's slippery and causes accidents.", "People don't want to deal with the slow and slippery commute that flurries bring."], "image": "val2014/COCO_val2014_000000388026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368827, "question_id": "K82CpqQyYg3Yex3BPAhPmH", "question": "What are the donuts being stored in?", "choices": ["bottle", "case", "box", "bag"], "correct_choice_idx": 2, "direct_answers": ["box", "box", "box", "box", "box", "box", "box", "box", "box", "box"], "difficult_direct_answer": false, "rationales": ["The donuts are being stored in a cardboard box with lid.", "When you buy donuts they are usually placed in a box. the box protects the donuts.", "The container has a top and sides and is made out of cardboard."], "image": "train2014/COCO_train2014_000000368827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174213, "question_id": "K8CPjkc8w6LiqXDcdbGmyu", "question": "What is the man most likely using to move in the water?", "choices": ["balloon", "raft", "boat", "kite"], "correct_choice_idx": 2, "direct_answers": ["boat", "ski", "boat", "board", "stick use", "rain", "skies", "boat", "skating", "boat"], "difficult_direct_answer": false, "rationales": ["A man is skiing on the water. people are pulled by boats to ski on the water.", "A man is in water on skis holding onto a taught rope. people are normally pulled by boats when they water ski.", "The man is using a boat to pull the other man."], "image": "train2014/COCO_train2014_000000174213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227830, "question_id": "K8Davng3umtBUcwzbgUvdx", "question": "When was the company on the truck founded?", "choices": ["1920", "1492", "1667", "1855"], "correct_choice_idx": 3, "direct_answers": ["1855", "1855", "1855", "no idea", "before years", "recent years", "1855/1973", "1855", "1975", "twentieth century"], "difficult_direct_answer": false, "rationales": ["The miller brewing company is an american brewery and beer company in milwaukee, wisconsin founded in 1855.", "The company is from 1855.", "That's when the miller lite company was founded."], "image": "val2014/COCO_val2014_000000227830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392665, "question_id": "K8zaNfeXa4N5FSrMnHdGkw", "question": "What is the child wearing the pink head covering for?", "choices": ["fashion", "game", "warmth", "concealing identity"], "correct_choice_idx": 2, "direct_answers": ["from cold", "warmth", "stay warm", "wind protection", "warmth", "warmth", "marvin", "keeping warm", "head protection", "keep warm"], "difficult_direct_answer": false, "rationales": ["The child is sitting in snow while engaged in a recreational activity.", "Given the snowy setting this child is in it is likely this covering is for warmth.", "They are in the snow"], "image": "val2014/COCO_val2014_000000392665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183829, "question_id": "K9mmhCd8k42LQdgsc8PNZP", "question": "What colour is the board on the bottom right?", "choices": ["green", "red", "yellow", "orange"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["It is the same color as a banana", "The brightest color is yellow and the board on the bottom right is the brightest, which is caused by its yellow coloring.", "The board is the color of a banana."], "image": "train2014/COCO_train2014_000000183829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345833, "question_id": "K9pouyZi7RqJyfMJkhPquB", "question": "What is in his shirt pocket?", "choices": ["paper", "phone", "pen", "glasses"], "correct_choice_idx": 0, "direct_answers": ["paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper", "paper"], "difficult_direct_answer": false, "rationales": ["We see a folded thin white stiff item in this mans shirt pocket. these are properties of bleached white paper.", "The man has a folded piece of paper in his shirt pocket.", "The pocket has paper."], "image": "train2014/COCO_train2014_000000345833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199883, "question_id": "K9ujKCZ9Xg6X3g4pHZiBui", "question": "The lane closest to the sidewalk is for which person?", "choices": ["black coat", "blue shirt", "striped shirt", "red shirt"], "correct_choice_idx": 1, "direct_answers": ["bike", "bicycle", "bicycles", "bicyclist", "blue shirt", "biclist", "handicapped", "cyclist", "bicyclist", "cyclist"], "difficult_direct_answer": false, "rationales": ["The lane closest to the sidewalk is a bike lane. the person in the blue shirt is on a bike.", "The person in the blue shirt is closest to the lane.", "The person wears that shirt."], "image": "val2014/COCO_val2014_000000199883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264919, "question_id": "KAGyH5kcz7jLEW7ufpc8t8", "question": "What does the man here do?", "choices": ["sails", "listens", "protests", "swims"], "correct_choice_idx": 1, "direct_answers": ["listen", "balloons", "unknown", "parade", "phone conversation", "listens", "phone call", "wear balloons", "blow balloons", "party"], "difficult_direct_answer": true, "rationales": ["The man holds his phone to his ear", "The man listens.", "A man is holding a phone up to his ear. the man is covering his other ear with his hand while surrounded by people outdoors."], "image": "val2014/COCO_val2014_000000264919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110393, "question_id": "KASE8rWzXNbNLMeV8AyjgL", "question": "Where would the contents of this image probably be seen exactly assis?", "choices": ["on floor", "in person", "on painting", "on tv"], "correct_choice_idx": 3, "direct_answers": ["asia", "game show", "mirror", "magazines", "tv show", "asia", "on tv", "japan", "tv show", "game show"], "difficult_direct_answer": false, "rationales": ["The image has subtitles.", "This looks like it was taken from a television show.", "Since there is writing in front of the guy, the person seeing this would be on television."], "image": "train2014/COCO_train2014_000000110393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547431, "question_id": "KAUAnCniJxNXuK3RKJMgp3", "question": "The large blue plane has a logo that looks similar to the logo of what brand?", "choices": ["pepsi", "green giant", "nathan's", "mcdonald's"], "correct_choice_idx": 0, "direct_answers": ["korean air", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi"], "difficult_direct_answer": false, "rationales": ["The largest airplane has a logo that is similar to the pepsi cola logo.", "The plane has a pepsi logo.", "The red, white, and blue logo looks similar to the logo for the soft drink company."], "image": "val2014/COCO_val2014_000000547431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420496, "question_id": "KAZcBmq5TAAHZpLx3yZQcn", "question": "Which photographic technique was used to capture the flow of traffic?", "choices": ["panoramic", "vignetting", "bokeh", "time-lapse"], "correct_choice_idx": 3, "direct_answers": ["unknown", "high resolution", "motion", "black light", "slow motion", "time-lapse", "flash", "motion blur", "light trails", "panning"], "difficult_direct_answer": true, "rationales": ["There are lines above the street of light that shows things are moving.", "It looks like pictures were taken over time.", "The time lapse feature is used."], "image": "train2014/COCO_train2014_000000420496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254653, "question_id": "KAiai5q7mTaHuQBXWnqvxs", "question": "What group of people are accommodated in this bus?", "choices": ["handicapped", "elderly", "blind", "pregnant women"], "correct_choice_idx": 0, "direct_answers": ["tourists", "handicapped", "everyone", "company employees", "handicapped", "non-drivers", "tourists", "passengers", "regular people", "pedestrians"], "difficult_direct_answer": false, "rationales": ["The bus can accommodate handicapped people.", "The group is handicapped.", "The wide entrances of the bus would make it ideal to handle the need of handicapped passengers."], "image": "val2014/COCO_val2014_000000254653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118022, "question_id": "KAmELXhkuMV3fpZQAxZjcy", "question": "What does the red stuff add to this dish?", "choices": ["saltiness", "spiciness", "bitterness", "sourness"], "correct_choice_idx": 1, "direct_answers": ["pepper", "color", "spiciness", "heat/flavor", "flavour", "spice", "color", "spice", "peppers", "spice"], "difficult_direct_answer": false, "rationales": ["The red stuff adds spiciness to the dish in question.", "Makes is spicy", "It is pepper."], "image": "train2014/COCO_train2014_000000118022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424503, "question_id": "KB3y9vtgHEUEqHsZBDeSHB", "question": "What taste does the green food have?", "choices": ["sour", "sweet", "spicy", "bitter"], "correct_choice_idx": 0, "direct_answers": ["sour", "salty sour", "salty sour", "sour", "sour", "sour", "sour", "sour", "salty sour", "sour"], "difficult_direct_answer": false, "rationales": ["The green food is a tart pickle.", "The green food is a pickle.", "The pickle on the plate has a sour taste from being brined."], "image": "train2014/COCO_train2014_000000424503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437944, "question_id": "KBSHEZCveFkX8EpSFxkLVJ", "question": "Which superhero does she admire?", "choices": ["wonder woman", "superman", "spiderman", "xena"], "correct_choice_idx": 0, "direct_answers": ["wonder woman", "wonder woman", "wonder woman", "wonder woman", "wonder woman", "wonderwoman", "wonder woman", "superwoman", "superwoman", "superwoman"], "difficult_direct_answer": false, "rationales": ["The outfit on the cake and breasts indicate a female superhero and the gold indicates a.", "The costume she wears is the cake", "The woman is cutting a cake that is shaped like wonder woman."], "image": "train2014/COCO_train2014_000000437944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178505, "question_id": "KBTcHbsrrvsjiHpAbneRE2", "question": "Where in the world would you most be likely to find a location like the one these people are at?", "choices": ["siberia", "mexico", "iceland", "antarctica"], "correct_choice_idx": 1, "direct_answers": ["pick", "caribbean island", "see", "near ocean", "beach", "beach", "coast", "coast", "mexico", "along coast"], "difficult_direct_answer": false, "rationales": ["The other options are all cold or cool regions.", "Mexico is warm and there are beaches that attract tourists and locals. the setting is warm and not cold.", "The people are wearing swimsuits and playing at the beach. i chose the location that is a warm popular vacation spot."], "image": "train2014/COCO_train2014_000000178505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119513, "question_id": "KBYZjwJtjgwUFWrbVbdG9i", "question": "What type of trick is the boy in grey performing?", "choices": ["manual", "flip", "grind", "grab"], "correct_choice_idx": 3, "direct_answers": ["airwalk", "ollie", "skateboard trick", "jumping", "grab", "skateboard jumping", "skateboarding", "jump", "grab", "jump"], "difficult_direct_answer": false, "rationales": ["The trick the boy on the skateboard is performing is a grab.", "He has his hand on the skateboard as he is jumping.", "The boy is reaching down with one hand and grabbing the edge of the skateboard."], "image": "val2014/COCO_val2014_000000119513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528117, "question_id": "KBxdorRpdCqJTCNBrgeoxz", "question": "What category of animal is their mascot in?", "choices": ["snake", "cat", "bird", "dog"], "correct_choice_idx": 2, "direct_answers": ["bird", "birds", "bird", "bird", "bird", "birds", "bird", "bird", "what animal", "bird"], "difficult_direct_answer": false, "rationales": ["Their team is the orioles.", "Their mascot is bird because they are the baltimore orioles.", "An oriole is an orange and black flying animal."], "image": "train2014/COCO_train2014_000000528117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522252, "question_id": "KCayEyZpaNY2FcSdybqKzE", "question": "Why are the ducks nestling their beaks in this manner?", "choices": ["no reason", "picking bugs", "looking behind", "they sleep"], "correct_choice_idx": 3, "direct_answers": ["warmth", "sleeping", "cold", "keeping warm", "sleeping", "to sleep", "resting", "they sleep", "sleeping", "to preen"], "difficult_direct_answer": false, "rationales": ["Ducks put their heads down to sleep.", "The ducks are resting.", "The ducks are sleeping."], "image": "train2014/COCO_train2014_000000522252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146711, "question_id": "KCh8eSm8FDMNe2MxWmrud2", "question": "Which food had its pit removed?", "choices": ["carrot", "radish", "avocado", "cucumber"], "correct_choice_idx": 2, "direct_answers": ["avocado", "avocado", "avocado", "avocado", "avocado", "avocado", "avocado", "avocado", "avocado", "avocado"], "difficult_direct_answer": false, "rationales": ["The only food here visible that has a pit is an avocado.", "You can see the big hole in the avocado where the seed was.", "The food is the avocado."], "image": "train2014/COCO_train2014_000000146711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134078, "question_id": "KCp6EAchFwFZ4LX8NhpZ3P", "question": "Where is this food located?", "choices": ["gas station", "outside", "home kitchen", "restaurant"], "correct_choice_idx": 2, "direct_answers": ["table", "bowls", "table", "table", "home", "in bowls", "dining table", "kitchen table", "home kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["The food is at home.", "The rest of the building looks like it is in a residential building.", "A woman is standing in front of a table with a dishes on it on top of a plastic tablecloth decorated with holly."], "image": "train2014/COCO_train2014_000000134078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241410, "question_id": "KD5HTCszSf76fBujLMq65R", "question": "What are the white objects near the elephants trunk?", "choices": ["collars", "horns", "tusks", "nails"], "correct_choice_idx": 2, "direct_answers": ["tusks", "tusks", "ivory", "tusks", "tusks", "tusks", "tusks", "tusks", "ivory", "tusks"], "difficult_direct_answer": false, "rationales": ["The objects are tusks.", "Elephants have these long teeth near their trunks made of ivory.", "The white objects are visible next to the trunks and are consistent with the size, shape and apparent material of answer a. elephants are known to have answer a of this type."], "image": "train2014/COCO_train2014_000000241410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296851, "question_id": "KDJsKJmNEongy4GSFuh2ka", "question": "What mode of transport is in the above picture?", "choices": ["railway", "air", "water", "road"], "correct_choice_idx": 2, "direct_answers": ["ship", "ships", "sailboat", "boat", "water", "ship", "sailing ship", "ship", "boats", "boat"], "difficult_direct_answer": false, "rationales": ["Boats can currently only travel on water.", "We see a variety of boats in this scene. boats travel through the medium of water.", "There are boats in the ocean."], "image": "train2014/COCO_train2014_000000296851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21396, "question_id": "KDfCtvrPJzpNrRAa3BgZ3v", "question": "What type of diet are these creatures known to be?", "choices": ["omnivore", "herbivores", "carnivores", "vegan"], "correct_choice_idx": 2, "direct_answers": ["carnivorous", "carnivorous", "carnivores", "carnivorous", "carnivore", "carnivores", "carnivorous", "carnivorous", "omnivores", "carnivore"], "difficult_direct_answer": false, "rationales": ["The cats on the chair are in the feline family and eat meat.", "The diet is carnivorous.", "Cats generally eat mice."], "image": "val2014/COCO_val2014_000000021396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462283, "question_id": "KDtqao4u4eUghjnpcW8Uuh", "question": "Who is the man standing in front of the group?", "choices": ["uncle", "father", "pastor", "instructor"], "correct_choice_idx": 3, "direct_answers": ["instructor", "instructor", "ski instructor", "instructor", "guide", "instructor", "teaching", "instructor", "man", "instructor"], "difficult_direct_answer": false, "rationales": ["The person at the front is facing the group. each person in the group is facing this individual while engaging in an identical body position with a snowboard.", "The people who are learning are siting down. when a teacher is teaching the students are sitting down.", "The man is the teacher."], "image": "train2014/COCO_train2014_000000462283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92847, "question_id": "KEWGadvXbFRsvXdJPeRK7C", "question": "What is happening in the scene?", "choices": ["protest", "riot", "game", "war"], "correct_choice_idx": 2, "direct_answers": ["jousting", "jousting", "fencing", "pretend fighting", "knights jousting", "jousting", "polo", "game", "war reenactment", "battle"], "difficult_direct_answer": false, "rationales": ["The plastic and wooden weapons present and unarmored children suggest this is not real combat and that they are playing around.", "The game is happening.", "They are playing around."], "image": "val2014/COCO_val2014_000000092847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507782, "question_id": "KEfpYUvxWvcY4wzCd8xHmi", "question": "What is the likely relationship of the old woman to the young woman?", "choices": ["mother", "great grandmother", "client", "coworker"], "correct_choice_idx": 2, "direct_answers": ["client", "colleagues", "coworkers", "her boss", "work related", "customer", "coworker", "coworker", "mother", "teacher/student"], "difficult_direct_answer": true, "rationales": ["Since we can see a nametag on the younger woman and the older woman is in her outer clothes, it's safe to assume the younger one works there while the older one is a consumer.", "The two women look like they work together.", "The older lady is most likely a client of this company, and the other woman is an employee helping her."], "image": "val2014/COCO_val2014_000000507782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491727, "question_id": "KEg6DXrebJ7JXixwJeyqP7", "question": "What kind of pizza does the person like?", "choices": ["spinach", "pepperoni", "vegan", "hates pizza"], "correct_choice_idx": 1, "direct_answers": ["pepperoni", "cheese", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "cheese", "pepperoni"], "difficult_direct_answer": false, "rationales": ["There are meat circles on it.", "The person appears to be happy to be eating this slice, and the small red circles resemble that of pepperoni, the most popular pizza topping.", "They are eating a slice with red circles of meat"], "image": "train2014/COCO_train2014_000000491727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266153, "question_id": "KFLNRxit5TrrvTmQn8XXJH", "question": "Who might offer a paid ride to somebody?", "choices": ["frisbee player", "jogger", "biker", "taxi"], "correct_choice_idx": 3, "direct_answers": ["taxi driver", "taxi", "taxi", "taxi", "taxi", "taxi", "cab driver", "taxi", "cab", "taxi"], "difficult_direct_answer": false, "rationales": ["A cab is a paid ride.", "Taxis are used to help people to travel", "There is a taxi in the background."], "image": "train2014/COCO_train2014_000000266153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289100, "question_id": "KFd7KSpKUN8mcTCSMiJcEP", "question": "What type of fruit is hanging from the ceiling?", "choices": ["banana", "orange", "apple", "watermelon"], "correct_choice_idx": 0, "direct_answers": ["bananas", "bananas", "banana", "banana", "bananas", "banana", "banana", "banana", "bananas", "banana"], "difficult_direct_answer": false, "rationales": ["The yellow colour and long length and shape is that of bananas, and they grown in bunches as is shown here.", "There are bananas in an elevated position as compared to other fruit nearby.", "There is a bushel of several yellow long fruits hanging. they have a long stem on one end."], "image": "train2014/COCO_train2014_000000289100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263588, "question_id": "KFmEG6cL8pwuyCcuEeTU6b", "question": "What is the name given to the birds in the picture?", "choices": ["flamingo", "owls", "eagles", "penguins"], "correct_choice_idx": 3, "direct_answers": ["penguin", "penguin", "penguin", "penguins", "penguins", "penguins", "penguins", "penguin", "penguin", "penguins"], "difficult_direct_answer": false, "rationales": ["The birds are black and white and are standing upright.", "This black and white animal is called a penguin.", "They are black and white with short beaks."], "image": "val2014/COCO_val2014_000000263588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459824, "question_id": "KFxowJnGarsMXkRuHqYYwE", "question": "What is the tray made from?", "choices": ["wood", "plastic", "steel", "glass"], "correct_choice_idx": 2, "direct_answers": ["aluminum", "metal", "aluminum", "metal", "metal", "metal", "aluminum", "aluminum", "steel", "metal"], "difficult_direct_answer": false, "rationales": ["It's a serving tray that is made out of metal.", "The silver color means it is probably a metal tray.", "It needs to be this material to be safe in an oven"], "image": "train2014/COCO_train2014_000000459824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174896, "question_id": "KGZrcBCXe9PbeJ3g5p9KRq", "question": "What is in the little green bottle in between the two top bowls?", "choices": ["salad dressing", "steak sauce", "bitters", "alcohol"], "correct_choice_idx": 0, "direct_answers": ["dressing", "salad dressing", "salad dressing", "salad dressing", "liquor", "dressing", "condiment", "sauce", "sauce", "dressing"], "difficult_direct_answer": false, "rationales": ["The bottle is dressing.", "The bottle is next to a salad.", "The little green bottle is dressing for the salad."], "image": "train2014/COCO_train2014_000000174896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459003, "question_id": "KGbrcztQsbUgRacYX2fgDh", "question": "What is the composition of the falling material?", "choices": ["stone", "brick", "cloth", "water"], "correct_choice_idx": 3, "direct_answers": ["snow", "water", "water", "snow", "water", "snow", "snow", "water", "water", "snow"], "difficult_direct_answer": false, "rationales": ["Basic science tells you that when water freezes it turns into ice, hail, and snow.", "The snow is made from water.", "The falling material is snow. it is not made out of stone, cloth, or brick."], "image": "train2014/COCO_train2014_000000459003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435897, "question_id": "KGkNgmBQzKMCrLU7wuNGue", "question": "This photo was taken from inside what?", "choices": ["helmet", "box", "backpack", "car"], "correct_choice_idx": 3, "direct_answers": ["car", "car", "car", "car", "vehicle", "car", "car", "car", "vehicle", "car"], "difficult_direct_answer": false, "rationales": ["The photo is from a car.", "You can see the dashboard of a car from the inside and a windshield.", "You can see part of the dashboard"], "image": "train2014/COCO_train2014_000000435897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391300, "question_id": "KGt59bavP6pQKa5BYNmGeE", "question": "What might the people who ride in the vehicle be returning from?", "choices": ["eating contest", "sales call", "office", "vacation"], "correct_choice_idx": 3, "direct_answers": ["trip", "vacation", "vacation", "vacation", "vacation", "vacation", "trip", "vacation", "vacation", "trip"], "difficult_direct_answer": false, "rationales": ["There are some vacation suitcases in the back of the car.", "The people have suitcases with tourist stickers on them.", "There are two suitcases with travel sticker on them in the back of the car."], "image": "val2014/COCO_val2014_000000391300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283188, "question_id": "KH92ocoVqSifUxBUwzp8XB", "question": "What is on the wall?", "choices": ["portrait", "cabinet", "poster", "bat"], "correct_choice_idx": 1, "direct_answers": ["vanity", "towel rack", "medicine cabinet", "cupboard", "cupboard", "cabinet", "cabinet", "cabinet", "shelve", "cabinet"], "difficult_direct_answer": false, "rationales": ["The only thing on the wall is a medicine cabinet.", "The cabinet is on the wall.", "Here a medicine cabinet is mounted to the wall."], "image": "train2014/COCO_train2014_000000283188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435163, "question_id": "KHECi25T22qnD85naZ2KeX", "question": "Which direction is to Botetourt?", "choices": ["none", "west", "east", "north"], "correct_choice_idx": 1, "direct_answers": ["east", "west", "rightward", "turn right", "right left", "left", "right", "east", "rightward", "right"], "difficult_direct_answer": false, "rationales": ["It is hard to tell which way is north here, but if you are facing north while reading this sign the direction would be east.", "Botetourt street is with the grain of the sun.", "The botetourt sign is pointing in a cardinal direction."], "image": "train2014/COCO_train2014_000000435163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48900, "question_id": "KHYfix7aV56xknuSZZZTqG", "question": "What is sitting next to the laptop?", "choices": ["book", "magazine", "newspaper", "cell phone"], "correct_choice_idx": 3, "direct_answers": ["phone", "cellphone", "phone", "cell phone", "cell phone", "smart phone", "cell phone", "laptop", "cell phone", "cell phone"], "difficult_direct_answer": false, "rationales": ["The phone is to the right of the laptop.", "A small device with a screen is next to a laptop. phones are small devices with a screen.", "The screen is on and it's small and flat"], "image": "train2014/COCO_train2014_000000048900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42683, "question_id": "KHZXtjqUh98GyD7vZrsJm3", "question": "What fruit does the condiment originate from?", "choices": ["cucumber", "tomato", "raspberry", "strawberry"], "correct_choice_idx": 1, "direct_answers": ["tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["The fruit is the tomato.", "There is ketchup beside the sausage. it comes from a red fruit.", "There is a red sauce used with the brat sandwich."], "image": "train2014/COCO_train2014_000000042683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450904, "question_id": "KHsV33hgsoTuxSkj3D2ikc", "question": "What is in the plastic bottle on the right?", "choices": ["mouth wash", "contact solution", "toothpaste", "shampoo"], "correct_choice_idx": 0, "direct_answers": ["mouthwash", "mouth wash", "mouth wash", "mouth wash", "mouthwash", "mouthwash", "listerine", "mouthwash", "mouthwash", "mouthwash"], "difficult_direct_answer": false, "rationales": ["The bottle has mouthwash.", "The bottle is shaped like a typical mouthwash bottle.", "That shape of bottle with that type of cap is usually used for the mouth gargling solution."], "image": "train2014/COCO_train2014_000000450904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549127, "question_id": "KJEVYhrLZAUBHw2TcV4TbH", "question": "What sort of food is being moved here?", "choices": ["seafood", "goat", "chicken", "beef"], "correct_choice_idx": 0, "direct_answers": ["seafood", "seafood", "seafood", "seafood", "seafood", "seafood", "bread", "seafood", "bread", "seafood"], "difficult_direct_answer": false, "rationales": ["Shellfish and other seafood can be clearly seen. beef, chicken, and goat do not come in shells.", "If you look close you can make out the shelled food, which lets you know what type of food is being sold.", "You can tell by the region there in and the food that is shown as to what is being sold here."], "image": "train2014/COCO_train2014_000000549127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518769, "question_id": "KJNNJRSREvtkNFuXFsi9BC", "question": "The man wearing what color of shirt is responsible for rendering decisions on judgment calls?", "choices": ["red", "white", "blue", "black"], "correct_choice_idx": 3, "direct_answers": ["umpire", "umpire", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The umpire of a baseball game is wearing black.", "He is a referee.", "That is the color umpires wear"], "image": "train2014/COCO_train2014_000000518769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352515, "question_id": "KJSYMu5xcgYwTewns5fxVp", "question": "What move is this player employing?", "choices": ["serve", "backhand", "forehand", "receive"], "correct_choice_idx": 3, "direct_answers": ["strike", "serve", "serve", "serve", "serve", "receive", "serve", "returning", "serve", "serving"], "difficult_direct_answer": false, "rationales": ["The man is reaching for the ball.", "He is getting the ball", "The player is about to hit the ball so he is receiving."], "image": "train2014/COCO_train2014_000000352515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151081, "question_id": "KJXNNsw8jgwoXtUjNB3ECr", "question": "What does the flying thing make use of on the ground?", "choices": ["seeds", "runway", "crops", "nest"], "correct_choice_idx": 1, "direct_answers": ["landing strip", "landing gear", "airplane", "runway", "landing strip", "landing", "landing", "runway", "runway", "pesticide"], "difficult_direct_answer": false, "rationales": ["An airplane is in the air above a strip of paved road. airplanes take off and land on runways.", "There is a cement strip wide enough for a plane to land, and it is located in a rural area away from homes and businesses. planes need a place on the ground to land.", "The flying thing is a small airplane that is approaching a runway for landing."], "image": "val2014/COCO_val2014_000000151081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352370, "question_id": "KJZot9zZBbW2VmtXDxAf8V", "question": "What kind of environment is this?", "choices": ["countryside", "mountain vista", "rural", "urban"], "correct_choice_idx": 3, "direct_answers": ["urban", "city", "tropical", "urban", "city", "city", "urban", "urban", "urban", "urban"], "difficult_direct_answer": false, "rationales": ["An intersection is shown with tall buildings in the background and signage for a nearby hospital.", "The traffic lights and tall buildings all close together are that of ones you would find in a highly populated metropolis.", "There are traffic lights, buildings, and hospitals in this environment. there are no mountains."], "image": "train2014/COCO_train2014_000000352370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561527, "question_id": "KJdNMbWVWnCdX2nPcMwHRg", "question": "Why is the train car parked by itself?", "choices": ["to load", "its connecting", "to unload", "its abandoned"], "correct_choice_idx": 3, "direct_answers": ["deserted", "its abandoned", "abandoned", "abandoned", "abandoned", "decommissioned", "it's abandoned", "garage", "broken down", "getting refurbished"], "difficult_direct_answer": false, "rationales": ["There are parts missing on the train car.", "The train car is rusted, destroyed, and has not wheels are windows.", "There is a dumpster right next to it and debris piled up"], "image": "train2014/COCO_train2014_000000561527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297173, "question_id": "KJmbgu5AhbHiomF6CkZtmG", "question": "What is the orange building?", "choices": ["rest room", "kitchen", "dining room", "dugout"], "correct_choice_idx": 3, "direct_answers": ["dugout", "dugout", "dugout", "wall", "dugout", "dugout", "dugout", "dugout", "dugout", "wall"], "difficult_direct_answer": false, "rationales": ["That building is used to hide in the shade.", "There are players visibly waiting in the structure. in baseball, when players are not playing, they wait in a dugout which is a building that looks like the orange one here.", "The building is a roofed shelter dug into the ground for usage by a sports team."], "image": "train2014/COCO_train2014_000000297173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36341, "question_id": "KKM2Mx5ozvy6bQENszDVWC", "question": "What is the man standing in?", "choices": ["school yard", "batting cage", "prison cell", "backyard"], "correct_choice_idx": 1, "direct_answers": ["batting cage", "batting cage", "cage", "batting cage", "batting cage", "batting cafe", "concrete", "batting cage", "batting cage", "batting cage"], "difficult_direct_answer": false, "rationales": ["The man is standing inside of a batters cage where he is practicing ball.", "The fence is made of metal and the guy has a bat.", "The man is at a batting cage."], "image": "train2014/COCO_train2014_000000036341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19828, "question_id": "KKPAJE4xVmCydVJRLbDX2X", "question": "What should the car do when it approaches this light?", "choices": ["turn", "yield", "go", "stop"], "correct_choice_idx": 3, "direct_answers": ["stop", "go", "stop", "stop", "stop", "yield", "stop", "stop", "slow down", "stop"], "difficult_direct_answer": false, "rationales": ["The light in the foreground is red.", "If you see the red hand on the traffic light, it indicates what you can or cannot do.", "The car should stop."], "image": "train2014/COCO_train2014_000000019828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216856, "question_id": "KKPkJVTbdvtc6w7EXFANDX", "question": "What did this person just do with their left hand?", "choices": ["signal ref", "nothing", "threw ball", "hit racquet"], "correct_choice_idx": 2, "direct_answers": ["toss ball", "throw ball", "toss ball", "throw", "throw ball", "throw ball", "threw ball", "throw ball", "throw", "tossed ball"], "difficult_direct_answer": false, "rationales": ["The person just tossed the ball up.", "The person's body language looks like they just heaved something upwards.", "A tennis player is holding a hand in the air. to serve a tennis player throws the ball in the air to hit it."], "image": "train2014/COCO_train2014_000000216856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251111, "question_id": "KKRzJNBv6vgVx69FZaQPiu", "question": "What can be found on the round table?", "choices": ["globe", "candy", "pens", "flowers"], "correct_choice_idx": 0, "direct_answers": ["globe", "globe", "globe", "globe", "globe", "globe", "globe", "globe", "globe", "globe"], "difficult_direct_answer": false, "rationales": ["There is a model of the world.", "There is a round globe on the blue table.", "This is fairly obvious in the picture."], "image": "train2014/COCO_train2014_000000251111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513602, "question_id": "KKfVqdx4Z3Fd68UsLq5UmR", "question": "What is the most common horse competition?", "choices": ["riding", "polo", "racing", "jumping"], "correct_choice_idx": 2, "direct_answers": ["flat racing", "horse racing", "horse racing", "racing", "track race", "jumping", "jumping", "race", "racing", "racing"], "difficult_direct_answer": false, "rationales": ["Horse racing is very popular.", "Horses are most commonly raced.", "Horses generally race in competitions."], "image": "train2014/COCO_train2014_000000513602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41005, "question_id": "KL4bD9nfySAoWvEaNmcNPz", "question": "What are they ready to do here?", "choices": ["descend", "cross", "ascend", "retreat"], "correct_choice_idx": 0, "direct_answers": ["skii", "ski", "ski", "ski", "ski", "descend", "ski downhill", "ski", "ski downhill", "ski"], "difficult_direct_answer": false, "rationales": ["Once a skier is at the top of the mountain, there is nothing else to do but ski down.", "The skiers are ready to go down the mountain.", "The people are at the top of a ski slope and are about to ski down the mountain."], "image": "train2014/COCO_train2014_000000041005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523217, "question_id": "KL8NEytiSN5Asw2jnzskKm", "question": "These people will most likely celebrate what wedding anniversary next year?", "choices": ["fortieth", "tenth", "fifteenth", "first"], "correct_choice_idx": 3, "direct_answers": ["bride groom", "one", "first", "first", "second", "first", "yes", "first", "first", "first"], "difficult_direct_answer": false, "rationales": ["These people are just getting married now.", "These people are just getting married now.", "The people just got married."], "image": "val2014/COCO_val2014_000000523217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400985, "question_id": "KLEpyuyXg3kYrzcm9Xf4Kz", "question": "What type of donuts are these?", "choices": ["chocolate", "glazed", "plain cake", "powdered sugar"], "correct_choice_idx": 2, "direct_answers": ["cronuts", "cake", "cake", "cinnamon", "cake", "cake", "plain", "plain cake", "cake", "plain"], "difficult_direct_answer": false, "rationales": ["These are just plain donuts.", "This pastry does not have any toppings on it, nor coatings of icing.", "They are plain cake donuts."], "image": "val2014/COCO_val2014_000000400985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378747, "question_id": "KLXzqcH55AUsHJYKmHNS7N", "question": "What are the orange cones on the road called?", "choices": ["road cap", "pylons", "highway cone", "safety cones"], "correct_choice_idx": 1, "direct_answers": ["traffic cones", "traffic cones", "pylons", "construction cones", "pylons", "traffic cones", "orange cones", "traffic cones", "traffic cones", "traffic cones"], "difficult_direct_answer": false, "rationales": ["They are bright orange so people can see them", "The cones protect people.", "There are pylons on the roadsides."], "image": "train2014/COCO_train2014_000000378747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242361, "question_id": "KLb3sP5GzBbgBuVogbNnLc", "question": "What company has trademarked the popular name of this toy?", "choices": ["fisher price", "wham-o", "hasbro", "mattel"], "correct_choice_idx": 1, "direct_answers": ["wham-o", "wham-o toy", "frisbee", "wham-o", "frisbee", "mate", "wham-o", "hasbro", "hasbro", "mattel"], "difficult_direct_answer": false, "rationales": ["Whamo is a game that involves frisbees.", "It is a frisbee.", "The toy is a frisbee."], "image": "train2014/COCO_train2014_000000242361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145212, "question_id": "KM5p6hTeiUfr3tRXK6F9XC", "question": "Where are these polar bears being kept?", "choices": ["museum", "zoo", "jail", "backyard"], "correct_choice_idx": 1, "direct_answers": ["zoo", "zoo", "for amusement", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo"], "difficult_direct_answer": false, "rationales": ["The bears are in a zoo.", "The polar bears are inside an enclosure where people are watching them.", "There are people in the background looking at the animals and this is a typical zoo setting."], "image": "train2014/COCO_train2014_000000145212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160341, "question_id": "KM7h6Hqai8gjbdDA7KFWbK", "question": "How does the smoke escape from the building behind the person?", "choices": ["window", "door", "chimney", "antenna"], "correct_choice_idx": 2, "direct_answers": ["chimney", "chimney", "chimney", "chimney", "chimney", "chimney", "chimney", "chimney", "chimney", "chimney"], "difficult_direct_answer": false, "rationales": ["The smoke escapes out of the chimney.", "A fireplace has this to direct the smoke outside", "The smoke goes through the chimney."], "image": "val2014/COCO_val2014_000000160341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58684, "question_id": "KMjhnL49f5HL7mMRpMdbkx", "question": "Who founded the company shown in the building?", "choices": ["bill gates", "tony fauci", "donald trump", "steve jobs"], "correct_choice_idx": 3, "direct_answers": ["steve jobs", "steve jobs", "steve jobs", "steve jobs", "steve jobs", "tim cook", "apple", "steve jobs", "apple", "gates"], "difficult_direct_answer": false, "rationales": ["The company is apple", "Apple is founded by steve jobs.", "The building has the apple logo on it which is a company founded by steve jobs."], "image": "train2014/COCO_train2014_000000058684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92894, "question_id": "KMq4wqu9tZaRmG3nrdT6Yb", "question": "What handedness does the Giants player possess?", "choices": ["none", "right", "left", "normal"], "correct_choice_idx": 2, "direct_answers": ["left", "left", "right", "left", "left", "right", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["Left handed players are usually on the side of home plate that is closer to first base.", "A baseball player is standing on the opposite side of plate. he has right leg forward.", "He is standing on the side of the plate that left handed players typically stand on."], "image": "val2014/COCO_val2014_000000092894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261876, "question_id": "KMtn4i46rSEX7Z4XJLM5M6", "question": "How does the person who lives here relax at bedtime?", "choices": ["bon fires", "reading", "gaming", "singing"], "correct_choice_idx": 1, "direct_answers": ["read", "sleep", "reading", "light", "reading", "read", "read books", "reading", "sleeps", "reading"], "difficult_direct_answer": false, "rationales": ["The person reads.", "A book is on a nightstand near a bed. people read to relax.", "The person reads the book on the nightstand."], "image": "train2014/COCO_train2014_000000261876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360767, "question_id": "KN98DJz84SUeTkKte7z5hw", "question": "Why are some cyclists wearing yellow?", "choices": ["uniform", "costume", "visibility", "style"], "correct_choice_idx": 2, "direct_answers": ["visibility", "safety", "marathoning", "safety", "protection", "be seen", "visibility", "visibility", "be seen", "safety"], "difficult_direct_answer": false, "rationales": ["There are some cyclists wearing the color yellow for additional visibility.", "They are wearing safety vests so they are easier to see.", "So cars can see them easily"], "image": "val2014/COCO_val2014_000000360767.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575029, "question_id": "KP5tru6grPUwUGNgeCATe7", "question": "What profession is the man who is riding the horse?", "choices": ["police", "cook", "professional rider", "stunt man"], "correct_choice_idx": 0, "direct_answers": ["police", "cop", "police", "police", "cop", "policeman", "policeman", "officer", "police", "police officer"], "difficult_direct_answer": false, "rationales": ["The officer riding the horse is wearing a hat that only a policeman could wear.", "By the uniform the man on the horse is wearing it is easy to understand what he does for a living.", "The profession is the police."], "image": "train2014/COCO_train2014_000000575029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447363, "question_id": "KPAFvwrowNovcF9iNnGvC5", "question": "What is the most odd ingredient of this pizza?", "choices": ["jalapeno", "lemon", "cheese", "bacon"], "correct_choice_idx": 1, "direct_answers": ["lemon slices", "lemon", "lemon slices", "smoked reindeer", "lemon", "lemon slices", "lemon", "lemon", "lemon", "lemon"], "difficult_direct_answer": false, "rationales": ["A pizza typically does not contain fruits, especially sour and acidic ones like a lemon, as the toppings are usually salty and savoury.", "Lemon is shown in the diagram.", "This citrus fruit is the most unusual topping present on this pizza."], "image": "train2014/COCO_train2014_000000447363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200681, "question_id": "KPDS7cGWt4wHS7qX2miDdS", "question": "What prevents the dog from biting the frisbee?", "choices": ["tree limb", "man", "other dog", "nothing"], "correct_choice_idx": 0, "direct_answers": ["tree", "too high", "tree limb", "tree", "too far", "tree limb", "tree", "height", "branch", "tree"], "difficult_direct_answer": false, "rationales": ["The frisbee is up in the tree where the dog can't get to it.", "The item identified in a is the same one that is holding the frisbee above the dog's head.", "The man and other dog are behind the dog and are not stopping it from getting the frisbee."], "image": "val2014/COCO_val2014_000000200681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54327, "question_id": "KPPXmSNzUxDn6ubWaNgsKo", "question": "What might you likely do at the building with a clock on it?", "choices": ["boxing", "indoor fishing", "office work", "pray"], "correct_choice_idx": 3, "direct_answers": ["worship", "pray", "tell time", "pray", "pray", "worship", "worship", "tell time", "church", "pray"], "difficult_direct_answer": false, "rationales": ["This building is a church that people go to for worship.", "The building looks like a church, so fishing isn't happening. there will not be a lot of business work happening either.", "Many office buildings have a clock on them."], "image": "train2014/COCO_train2014_000000054327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167968, "question_id": "KPa7Z5eH98SNHJzt5NAdBy", "question": "What does the child do here?", "choices": ["brush teeth", "number 1", "number 2", "read"], "correct_choice_idx": 3, "direct_answers": ["potty", "lift seat", "potty", "use bathroom", "read book", "poop", "read", "sit", "read book", "reading"], "difficult_direct_answer": false, "rationales": ["A child is holding a book and looking at it.", "He is sitting on a potty with his pants around his ankles.", "The child is reading."], "image": "train2014/COCO_train2014_000000167968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158342, "question_id": "KPnwv2cHUf2jdDX2Yf8x29", "question": "If a boat was responsible for their momentum the sport would be called what?", "choices": ["paddle boarding", "water skiing", "sky diving", "bungee jumping"], "correct_choice_idx": 1, "direct_answers": ["water skiing", "parasailing", "boat racing", "sea", "paragliding", "parasailing", "wakeboarding", "water skiing", "water skiing", "water skiing"], "difficult_direct_answer": false, "rationales": ["A skier in the water is being pulled by a parachute.", "This is merely a fact.", "If the boat was keeping the kites going then the sport would be water skiing."], "image": "train2014/COCO_train2014_000000158342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18688, "question_id": "KPs2suRhx9Y7wUaZkCMDyN", "question": "Approximately how many miles are there between the home cities of these two teams?", "choices": ["350", "640", "1012", "940"], "correct_choice_idx": 1, "direct_answers": ["six hundred", "600 miles", "639", "640", "1000", "600", "640", "25 miles", "640", "four hundred"], "difficult_direct_answer": false, "rationales": ["The pitch are mostly not wide on the width.", "The mileage listed in a corresponds to the distance between boston and cleveland.", "The indians play in cleveland and the red sox play in boston. an internet search of the distance between the two cities provided 638 as the answer so i chose the closest option."], "image": "val2014/COCO_val2014_000000018688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391721, "question_id": "KPzhWMqyXRuXmwHaNngxPY", "question": "What is the driver of the blue car participating in?", "choices": ["play", "demolition derby", "race", "parade"], "correct_choice_idx": 3, "direct_answers": ["parade", "parade", "parade", "parade", "parade", "parade", "parade", "parade", "parade", "parade"], "difficult_direct_answer": false, "rationales": ["The people are in a parade.", "The driver is in a parade.", "Based on the crowd of spectators gathered on the sidewalk and the number of people in the bed of the truck, the event that they're participating in here must be a parade."], "image": "train2014/COCO_train2014_000000391721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489799, "question_id": "KQH5rcb2b7KSJhyu6t5oST", "question": "Why is the one guy wearing a yellow uniform?", "choices": ["spectator", "referee", "water boy", "goalie"], "correct_choice_idx": 3, "direct_answers": ["goalie", "goalie", "goalie", "goalie", "goalie", "goalie", "goalie", "goalie", "goalie", "goalie"], "difficult_direct_answer": false, "rationales": ["People are playing soccer and one is in a different uniform from the rest. soccer goalies wear a different shirt from everyone else.", "The man is differentiating himself so he is more visible.", "The person in the yellow uniform has gloves on to catch the ball."], "image": "val2014/COCO_val2014_000000489799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526769, "question_id": "KQHknp8CXfHLfTPdwumjZX", "question": "The man on the motorcycle is pretending to act as what type of person?", "choices": ["policeman", "businessman", "secret serviceman", "fireman"], "correct_choice_idx": 2, "direct_answers": ["motorcycle rider", "business man", "police", "police", "biker", "hbo exec", "gang member", "businessman", "celebrity", "secret serviceman"], "difficult_direct_answer": true, "rationales": ["The man is in a black suit and wearing sunglasses.", "The man on the motorcycle is pretending to be in the secret police.", "He is dressed all in black like the secret service."], "image": "train2014/COCO_train2014_000000526769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293841, "question_id": "KQPLeyzGgVe4deSbHeQPt6", "question": "What is the bus stopped outside of?", "choices": ["gas station", "restaurant", "dentist", "library"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "restaurant", "restaurant", "restaurant", "restaurant stores", "kitchen", "restaurant", "restaurant", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["The bus is stopped outside the gourmet burger kitchen", "The place says they serve burgers.", "The bus is outside of \"gourmet burger kitchen\"."], "image": "val2014/COCO_val2014_000000293841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83986, "question_id": "KQQrMqHs8DEpXFymaudbTZ", "question": "What is the name of the Garage?", "choices": ["polo", "imam", "leke", "akd"], "correct_choice_idx": 0, "direct_answers": ["polo", "polo", "polo garage", "polo garage", "polo", "polo garage", "polo", "polo", "polo", "polo"], "difficult_direct_answer": false, "rationales": ["A garage sign hangs above a business.", "The name is on the cover.", "There is a sign with the name of the garage on it."], "image": "train2014/COCO_train2014_000000083986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282590, "question_id": "KQbsMYhEWzPqPzEZhMUfYg", "question": "What is the woman most likely to be most days of the week?", "choices": ["spelunker", "archaeologist", "animal lover", "princess"], "correct_choice_idx": 2, "direct_answers": ["investment banker", "happy", "working", "park", "working", "animal lover", "work", "work", "teacher", "park"], "difficult_direct_answer": false, "rationales": ["The woman on the left has a dog she plays with which makes her an animal lover.", "She has a dog and plays with it. so it's safe we got the right answer.", "She is playing with a dog"], "image": "train2014/COCO_train2014_000000282590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27656, "question_id": "KQfbBfhj4bjZzosm3vCa2Z", "question": "What animal might be found in this things?", "choices": ["cat", "dog", "beetle", "bee"], "correct_choice_idx": 3, "direct_answers": ["bee", "bee", "bumblebee", "bee", "ox", "bee", "aphid", "bird", "bugs", "bee"], "difficult_direct_answer": false, "rationales": ["This insect gets its nourishment from flowers.", "A bee produces honey from pollen.", "Flowers produce pollen that bees can convert to food for themselves."], "image": "val2014/COCO_val2014_000000027656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329035, "question_id": "KQkU9BaRUHQFk4w2dwb3US", "question": "How do the people know each other?", "choices": ["coworkers", "siblings", "spouses", "neighbors"], "correct_choice_idx": 0, "direct_answers": ["work", "friends", "coworkers", "friends", "coworkers", "mutual training", "classmates", "siblings", "friends", "same work"], "difficult_direct_answer": false, "rationales": ["They are sitting next to each other working on their computers.", "Looks like they are in a break room, eating on paper plates and there are 3 of them so that rules out spouses.", "The people are colleagues."], "image": "train2014/COCO_train2014_000000329035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431890, "question_id": "KQn324G8beFLHhGTmz28ab", "question": "What type of outerwear is the woman wearing?", "choices": ["athletic wear", "wetsuit", "dress", "pajamas"], "correct_choice_idx": 2, "direct_answers": ["sundress", "dress", "dress", "dress", "hat", "dress", "hat", "dress", "dress", "dress"], "difficult_direct_answer": false, "rationales": ["The outerwear is a dress.", "The woman is wearing a dress.", "The top is attached to a skirt"], "image": "train2014/COCO_train2014_000000431890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164663, "question_id": "KQsE4Qa4pqQzSm2C2gDkRS", "question": "What is the man wearing a purple shirt doing?", "choices": ["dancing", "high jumping", "throwing frisbee", "catching frisbee"], "correct_choice_idx": 3, "direct_answers": ["catching frisbee", "playing frisbee", "catching frisbee", "catching frisbee", "catching frisbee", "catching frisbee", "catching", "playing frisbee", "catching frisbee", "playing frisbee"], "difficult_direct_answer": false, "rationales": ["He is trying to catch the disc.", "The frisbee is coming at him through the air", "The man is reaching up as if to catch a frisbee that is approaching."], "image": "train2014/COCO_train2014_000000164663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304997, "question_id": "KQsrh6XTHvGVaYLWomiNC6", "question": "Before 2021 when was the last time this home team won the World Series?", "choices": ["1965", "2012", "2019", "1980"], "correct_choice_idx": 2, "direct_answers": ["2019", "don't know", "1928", "2019", "1928", "2019", "2019", "2019", "2019", "2019"], "difficult_direct_answer": false, "rationales": ["Based on signs on the field and scoreboard, the home team is the washington nationals. before 2021, the last time they won the world series was in 2019.", "The washington nationals won the world series in 2019 over the houston astros.", "The last time they won was in 2019."], "image": "train2014/COCO_train2014_000000304997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216369, "question_id": "KR8GHTeLKkapH5LPE8TZ9Q", "question": "What is on the far left?", "choices": ["bench", "hammock", "mouse", "cat"], "correct_choice_idx": 0, "direct_answers": ["bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["A bench is on the far left.", "There is a bench for sitting on the far left.", "The bench is on the left."], "image": "val2014/COCO_val2014_000000216369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169891, "question_id": "KRYxjBmC3PLE8zGpF3xkvw", "question": "What type of room is this?", "choices": ["studio room", "hotel room", "single house", "university dorm"], "correct_choice_idx": 0, "direct_answers": ["kitchen", "studio room", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["Everything for a home is in the same basic room", "The kitchen and the bedroom are together", "You can see the kitchen is part of the rest of the room. a living area and kitchen are shown as one space."], "image": "val2014/COCO_val2014_000000169891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248476, "question_id": "KRabiRYKQZxVP7VabUQHEY", "question": "Why is the person reaching out to the giraffe?", "choices": ["to bathe", "to feed", "to pet", "to comb"], "correct_choice_idx": 1, "direct_answers": ["feeding", "food", "feeding", "to feed", "to feed", "feeding", "feed it", "feeding", "feeding", "feeding"], "difficult_direct_answer": false, "rationales": ["A person is holding out some food to a giraffe's tongue.", "The person has food.", "By the tongue sticking out and food in the persons hand you can tell what is happening."], "image": "train2014/COCO_train2014_000000248476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404205, "question_id": "KRgmfLKF3Gg3G8yYSQxQZv", "question": "What are these babies considered to be?", "choices": ["foals", "lambs", "kittens", "puppies"], "correct_choice_idx": 1, "direct_answers": ["lambs", "lamb twins", "lambs", "lamb", "lambs", "lambs", "sheep", "lambs", "lambs", "lambs"], "difficult_direct_answer": false, "rationales": ["The mother of the animals is a sheep.", "They are baby sheep.", "The babies are lambs."], "image": "train2014/COCO_train2014_000000404205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142346, "question_id": "KRhhjcoWEaCfzMsPNs9R5j", "question": "Why does the small person in blue hold the stick?", "choices": ["dragging them", "keep balance", "sheer wickedness", "traction"], "correct_choice_idx": 0, "direct_answers": ["still learning", "for guidance", "provides balance", "dragged", "towing forward", "training", "dragging them", "too young", "holding on", "getting pulled"], "difficult_direct_answer": true, "rationales": ["The person in front is still moving forward, and the child isn't putting any effort into going forward.", "It is so the adult can help the child ski", "The angle of the skis pointing up show that they are going uphill. maybe the boy struggles with riding skis and needs help going up."], "image": "val2014/COCO_val2014_000000142346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60182, "question_id": "KS2hihZimMdGB5CSdYxdKU", "question": "What's the long silver object on the bike behind the man's foot?", "choices": ["handlebars", "fender", "brakes", "muffler"], "correct_choice_idx": 3, "direct_answers": ["exhaust pipe", "muffler", "muffler", "muffler", "muffler", "exhaust", "exhaust", "muffler", "exhaust", "muffler"], "difficult_direct_answer": false, "rationales": ["That's the muffler for the bike.", "A large silver cylinder pulls the exhaust through it and out the end as smoke.", "The long object keeps the sound volume down."], "image": "train2014/COCO_train2014_000000060182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529829, "question_id": "KSAaf2nUmP6SYwh9JMtEM5", "question": "What might you find here?", "choices": ["doctor", "mechanic", "lawyer", "merchant"], "correct_choice_idx": 3, "direct_answers": ["food", "herbs", "vegetables", "merchant", "food", "vegetables", "spices", "food", "groceries", "textiles"], "difficult_direct_answer": false, "rationales": ["There are multiple stalls selling items.", "A merchant sells wares.", "This is a market, not a garage, clinic, or law office."], "image": "train2014/COCO_train2014_000000529829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363549, "question_id": "KSGTR9xzhR4bDyJdKZi5Sg", "question": "How is the dog probably traveling?", "choices": ["motorcycle", "bike", "scooter", "skateboard"], "correct_choice_idx": 1, "direct_answers": ["bicycle", "by bicycle", "pouch", "riding", "motorcycle", "bicycle", "riding along", "bicycle pouch", "bike", "passenger"], "difficult_direct_answer": true, "rationales": ["Here we see a dog in a small compartment which is attached to the back of a bicycle.", "This dog is riding inside of a bike equipped to carry a dog.", "The other options don't match this image well."], "image": "train2014/COCO_train2014_000000363549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467091, "question_id": "KSWRhqyA7exQqoiqVe3LEe", "question": "How many stops will the bus make excluding the final destination?", "choices": ["one", "three", "two", "zero"], "correct_choice_idx": 3, "direct_answers": ["zero", "one", "not many", "twohundred fiftyseven", "257", "several", "not many", "zero", "zero", "fifteen"], "difficult_direct_answer": false, "rationales": ["There are no stops left.", "Express buses usually go quickly from one place to another.", "The bus will make no stops since it's an express bus."], "image": "train2014/COCO_train2014_000000467091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315404, "question_id": "KSgGQVJ9kY6fiHs25eAZ5X", "question": "What is the first number on the license plate?", "choices": ["four", "five", "three", "nine"], "correct_choice_idx": 0, "direct_answers": ["four", "number 4", "four", "four", "four", "four", "number 4", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["The license plate has a series of numbers and letters on it and upon investigation, it's apparent that the first number listed on the plate is the number 4.", "License plates are read from left to right. the leftmost number on this plate is a four.", "It is shown on the front"], "image": "train2014/COCO_train2014_000000315404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223612, "question_id": "KSj2dnZCg8jphjKpQLrQqZ", "question": "What kind of loads are the trucks probably used to haul?", "choices": ["gravel", "metal", "trash", "furniture"], "correct_choice_idx": 2, "direct_answers": ["trash", "junk", "junk", "garbage", "cargo", "gravel", "trash", "cargo", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["This looks like a trash company that picks up peoples trash weekly.", "There are piles of it in the warehouse", "There are trucks visible and their style is consistent with a truck used to haul trash. there is also a dump visible that the trucks are in and around and some of the trucks have trash loads visible inside."], "image": "train2014/COCO_train2014_000000223612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267417, "question_id": "KSnPGiut7cDrDgxMCNgwBC", "question": "What type of animal is shown?", "choices": ["domestic", "wild", "aquatic", "stuffed"], "correct_choice_idx": 3, "direct_answers": ["bear", "bear", "bear", "teddy bear", "bear", "stuffed", "teddy bear", "teddy bear", "teddy bear", "bear"], "difficult_direct_answer": false, "rationales": ["The animal on the grass is a stuffed teddy bear.", "The animal is a teddy bear. it is not alive.", "This is a toy depiction of a bear"], "image": "train2014/COCO_train2014_000000267417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451532, "question_id": "KSuZb4KHitFfoAa4K2XXNA", "question": "What are the people looking over?", "choices": ["corner", "ledge", "brilliance", "crest"], "correct_choice_idx": 1, "direct_answers": ["mountain", "mountain", "hill", "cliff", "cliff", "ledge", "cliff edge", "mountain", "mountain", "down"], "difficult_direct_answer": false, "rationales": ["The people are standing near the edge of a steep surface to enjoy the view.", "They are looking down at the trail", "The people are looking over a steep hill."], "image": "val2014/COCO_val2014_000000451532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82889, "question_id": "KTEn7fQea2m3oRwVXdibsG", "question": "What is the white paper with words on it?", "choices": ["advertisement", "receipt", "note", "napkin"], "correct_choice_idx": 1, "direct_answers": ["receipt", "receipt", "receipt", "receipt", "receipt", "receipt", "receipt", "receipt", "receipt", "receipt"], "difficult_direct_answer": false, "rationales": ["It is a computerized printout of the food order.", "They get the paper for their payment.", "The paper is a receipt."], "image": "val2014/COCO_val2014_000000082889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177680, "question_id": "KTF8T28x99pd3U2ZvXqGoV", "question": "How many people are visible in the stands?", "choices": ["thousands", "dozens", "hundreds", "few"], "correct_choice_idx": 3, "direct_answers": ["six", "baseball", "few people", "seven", "few", "seven", "7-8", "8 people", "eight", "six"], "difficult_direct_answer": false, "rationales": ["There are a few people visible in the stands across the field from the batter wearing the red jersey.", "Hardly anyone is sitting in the bleachers. they are almost empty.", "The stands are almost empty so there can't be more than a few people there in total."], "image": "train2014/COCO_train2014_000000177680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205621, "question_id": "KTZBNjixwJMTucovrvspH7", "question": "What does the weather seem like it'd be here?", "choices": ["raining", "scorching", "cold", "hot"], "correct_choice_idx": 2, "direct_answers": ["cold", "cold", "cold", "cold", "sunny", "sunny", "cold", "sunny", "cold", "cold"], "difficult_direct_answer": false, "rationales": ["There is snow on the ground and for it to snow the temperature had to be below freezing.", "There is snow on the ground so it is likely very cold.", "There snow on the ground so it looks cold"], "image": "train2014/COCO_train2014_000000205621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263589, "question_id": "KTapi4WMKyojUFRh2gnLGX", "question": "The sandwiches that are popular here are sourced from what animal?", "choices": ["pigs", "horse", "cows", "sheep"], "correct_choice_idx": 0, "direct_answers": ["pig", "pigs", "pigs", "pig", "pig", "pigs", "pork chop", "pigs", "pork", "pig"], "difficult_direct_answer": false, "rationales": ["Pork is the name of the meat from these animals", "The sign has the name of the meat that comes from the animal listed in option a.", "The sign says that the depot sells pork chop sandwiches."], "image": "train2014/COCO_train2014_000000263589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425418, "question_id": "KTk9shJWkiwa4YKwDssKJh", "question": "If the drinks consist a little amount of alcohol what it will be called?", "choices": ["coffee", "beverages", "cocktail", "soft drinks"], "correct_choice_idx": 2, "direct_answers": ["wine", "alcoholic beverage", "mixed drink", "wine", "cocktail", "nip", "wine", "drunk", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["It is mixed with other ingredients", "That drink often has little amounts of alcohol.", "The drinks on the table are called cocktails if there is any alcohol served with the meal."], "image": "train2014/COCO_train2014_000000425418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350663, "question_id": "KTpRr3ssMVa7JKuTMpY7Ve", "question": "Why is the boy in red kneeling?", "choices": ["to catch", "to hide", "to exercise", "to pray"], "correct_choice_idx": 0, "direct_answers": ["catcher", "catcher", "to catch", "catching", "catching", "catching", "catch ball", "catcher", "catching", "catcher"], "difficult_direct_answer": false, "rationales": ["There is a catcher that has equipment on that is trying to grab a ball.", "He is the catcher to catch the ball if it goes back there.", "The boy in the red has on his catcher's gear and is kneeling so that he can stop the ball if the batter swings and misses or just doesn't swing at all."], "image": "val2014/COCO_val2014_000000350663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478755, "question_id": "KTuDhCzkwedp5QqUM2Tqbr", "question": "Where will the guy on third base run to next?", "choices": ["2nd base", "pitcher's mound", "homeplate", "1st base"], "correct_choice_idx": 2, "direct_answers": ["near player", "home", "home plate", "home plate", "home plate", "homeplate", "home", "home", "home", "home plate"], "difficult_direct_answer": false, "rationales": ["In baseball when a runner is on third base their next objective is to run to homeplate when the ball is in play. the person visible at third is a runner because they ahve the same jersey color as the visible batter and they are in a ready to run stance in the base path.", "Traditionally there is only one place the batter can run to when he is on third.", "Homeplate is next."], "image": "train2014/COCO_train2014_000000478755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581394, "question_id": "KTxZsrUU6PCdx5dHjk2A4S", "question": "What are the people doing near the bus?", "choices": ["dancing", "cleaning", "sleeping", "packing"], "correct_choice_idx": 3, "direct_answers": ["waiting", "loading bikes", "watching", "mustering", "waiting", "packing", "gathering", "talking", "waiting", "reloding"], "difficult_direct_answer": false, "rationales": ["The people near the bus are packing their luggage and getting ready to travel.", "There seems to be a group of men lifting items and bikes to the man standing on top of the bus, indicating that they are loading it with items needed for a trip.", "The people are getting ready to board the bus. they must put their belongings somewhere before doing so."], "image": "val2014/COCO_val2014_000000581394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378118, "question_id": "KU8vHbiiHbEpvu6VFDurrB", "question": "What is the shovel carried here meant to be used for first?", "choices": ["signaling", "grave digging", "snow removal", "coal"], "correct_choice_idx": 2, "direct_answers": ["clear snow", "path-digging", "snow removal", "emergencies", "driveways", "snow", "snow removal", "skiing", "digging avalanche", "snow"], "difficult_direct_answer": false, "rationales": ["In case they got stuck or needed to get through a path.", "The shovel is for snow removal.", "The shovel removes snow."], "image": "train2014/COCO_train2014_000000378118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55039, "question_id": "KUTNmV3QLBi8eQt4BVaFJ5", "question": "What is wrong with this photo?", "choices": ["photoshopped", "too bright", "too dark", "blurred"], "correct_choice_idx": 1, "direct_answers": ["too bright", "very bright", "sunny", "too bright", "too bright", "excessive light", "sun exposure", "lighting", "too light", "brightness"], "difficult_direct_answer": false, "rationales": ["There are also too many shadows as compared to bright spots.", "The sun is much to bright in this picture.", "A group of people are standing at the edge of the water and a glare from the setting sun is present across the center."], "image": "train2014/COCO_train2014_000000055039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62250, "question_id": "KUTsR4LZBDqKPeg5bdJHn4", "question": "How would these animals be described?", "choices": ["canine", "human made", "feline", "bovine"], "correct_choice_idx": 1, "direct_answers": ["elephant sculptures", "fake", "human made", "sculptures", "colorful", "elephants", "elephant sculptures", "elephant sculptures", "elephant sculptures", "plastic"], "difficult_direct_answer": false, "rationales": ["They have a shiny surface rather than the leathery skin that real animals have.", "The animals are fake.", "They're statues made to look as if they are elephants."], "image": "val2014/COCO_val2014_000000062250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232724, "question_id": "KUaMRjrbDMutxQC45ZM6PL", "question": "Why has the man covered his head?", "choices": ["uniform", "religion", "warmth", "protection"], "correct_choice_idx": 3, "direct_answers": ["protection", "protection", "for protection", "protection", "protection", "prevent injury", "protection", "safety", "protection", "protection"], "difficult_direct_answer": false, "rationales": ["Skateboarding can be a particularly brutal sport, because there's nothing to land on except concrete. a smart boarder will protect every reasonable part of his body.", "The man wants protection.", "Some sports have the participant wear a helmet to protect their head in case of a fall."], "image": "train2014/COCO_train2014_000000232724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324591, "question_id": "KUgPjTFjva4LWHSKE5rW6n", "question": "Why are the animals there?", "choices": ["for sale", "were stolen", "for dinner", "for exhibition"], "correct_choice_idx": 3, "direct_answers": ["for showing", "4h competition", "fair", "for exhibition", "fair", "goat show", "win prizes", "judgement", "competition", "goat show"], "difficult_direct_answer": false, "rationales": ["The wool is gone and from the venue it seems that this is an exhibition.", "Sheep are on display with people holding their leashed and marked with numbers. people display livestock at fairs and events.", "The animals are being exhibited."], "image": "train2014/COCO_train2014_000000324591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11856, "question_id": "KUwfL2vJCjaGsDaZxpCyzJ", "question": "What is the large object trying to get you to do?", "choices": ["drive safe", "buy goods", "watch tv", "join army"], "correct_choice_idx": 1, "direct_answers": ["buy stuff", "shop", "buy things", "buy goods", "buy food", "buy", "shop", "shop", "waiting bus", "buy"], "difficult_direct_answer": false, "rationales": ["The large object is advertising items for purchase at essential warehouse.", "A website is shown behind the woman sitting. it advertising buying things for your house.", "Advertising can cause people to purchase things."], "image": "train2014/COCO_train2014_000000011856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184974, "question_id": "KV2sHcKPCPZ4FpwyAqPWzS", "question": "What continent is this?", "choices": ["europe", "south america", "asia", "asia"], "correct_choice_idx": 0, "direct_answers": ["germany", "europe", "europe", "asia", "europe", "europe", "boats", "asia", "europe", "oceania"], "difficult_direct_answer": false, "rationales": ["The language on the sign is european.", "It's the most likely answer given the language on the sign.", "A boat dock can be seen with signs in german. germany is in europe."], "image": "val2014/COCO_val2014_000000184974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265625, "question_id": "KVTtvtpte89V7k4Af8oB9g", "question": "What is the woman looking down at?", "choices": ["plate", "glass", "shaker", "phone"], "correct_choice_idx": 3, "direct_answers": ["phone", "phone", "phone", "phone", "her phone", "her phone", "phone", "mobile phone", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["The woman is holding her cell phone.", "A woman is looking down at a handheld device and pressing on the buttons.", "The woman is looking at her phone."], "image": "train2014/COCO_train2014_000000265625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340145, "question_id": "KVajNrVciGcNBTF54wqyf2", "question": "What setting is this venue?", "choices": ["living room", "hotel lobby", "furniture store", "waiting room"], "correct_choice_idx": 2, "direct_answers": ["furniture store", "indoors", "store", "modern", "livingroom", "furniture store", "furniture store", "living", "furniture store", "nature"], "difficult_direct_answer": false, "rationales": ["The venue is a furniture store.", "There is a lot of furniture here. it is arranged, but there is other arrangements in the background made to look like rooms.", "This is in a store that sells furniture for your home."], "image": "train2014/COCO_train2014_000000340145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507966, "question_id": "KVoebR4jbq4EbKhrJWUMrQ", "question": "Who provided these umbrellas?", "choices": ["beach owner", "homeless people", "salvation army", "beach goers"], "correct_choice_idx": 0, "direct_answers": ["hotel", "beach owner", "resort", "resort", "hotel", "resort owners", "rental service", "beach resort", "resort", "hotel"], "difficult_direct_answer": false, "rationales": ["The beach owner provided the umbrellas.", "The city basically owns the beach.", "They are all uniform in color and style so it's locally owned"], "image": "val2014/COCO_val2014_000000507966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190805, "question_id": "KW7DYZY2jyDgiBAGvCPcDN", "question": "Where is the fruit or vegetable which contains the most potassium?", "choices": ["top", "right", "bottom", "left"], "correct_choice_idx": 0, "direct_answers": ["rear banana", "hanging", "top", "hanging", "spinach", "top", "up top", "banana", "vegetable", "parsnip"], "difficult_direct_answer": false, "rationales": ["The fruit that has a lot of potassium is bananas and they are on top of the picture.", "These are bananas", "Bananas have a lot of potassium."], "image": "train2014/COCO_train2014_000000190805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479213, "question_id": "KWBBcUwTm86JumN7bvzbfq", "question": "How old is the birthday girl?", "choices": ["30", "ten", "20", "40"], "correct_choice_idx": 3, "direct_answers": ["40", "fourty", "40", "fourty", "40", "fourty", "40", "40", "40", "forty"], "difficult_direct_answer": false, "rationales": ["The candles on the cake indicate her age.", "The girl is 40.", "The candles state \"40.\"."], "image": "val2014/COCO_val2014_000000479213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143521, "question_id": "KWK78f9xRYpTEmAgottwjc", "question": "A group of these animals is referred to as what?", "choices": ["pride", "flock", "pack", "herd"], "correct_choice_idx": 3, "direct_answers": ["elephants", "elephant", "herd", "herd", "herd", "herd", "herd", "herd", "herd", "herd"], "difficult_direct_answer": false, "rationales": ["The other options refer to lions, birds and wolves, among other types of animals.", "Elephants live together in herds.", "This is the name given to a group of elephants."], "image": "train2014/COCO_train2014_000000143521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119195, "question_id": "KWTPEbS8hWtWCbBY7iEQGL", "question": "What is being displayed behind glass on the lower shelf?", "choices": ["donuts", "bagels", "sandwiches", "pastries"], "correct_choice_idx": 2, "direct_answers": ["sandwiches", "cake", "sandwiches", "sandwich", "sandwiches", "sandwiches", "sandwiches", "sandwiches", "sandwiches", "sandwiches"], "difficult_direct_answer": false, "rationales": ["There is bread with lettuce, cheese and meat.", "Sandwiches are displayed.", "The food is made out of bread and a filling in layers."], "image": "train2014/COCO_train2014_000000119195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82431, "question_id": "KWV6YNC6iqMHzf63MdQwos", "question": "What kind of location is this most likely to be?", "choices": ["mall", "strip mall", "campus", "amusement park"], "correct_choice_idx": 2, "direct_answers": ["university", "college campus", "school", "college campus", "campus", "park", "campus park", "university", "college campus", "campus park"], "difficult_direct_answer": false, "rationales": ["There are no stores or rides near the people.", "Most universities are large and have a lot of walkways that lead to other buildings on campus.", "There are large buildings around a park area with a clock"], "image": "val2014/COCO_val2014_000000082431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404546, "question_id": "KWWyywQQSVmMtWDmCzquSM", "question": "What is the name of the yellow vehicle the man in red is on?", "choices": ["snow scooter", "snowmobile", "snow quad", "ski truck"], "correct_choice_idx": 1, "direct_answers": ["snow mobile", "snow mobile", "snow mobile", "snowmobile", "snow mobile", "snowmobile", "snowmobile", "snowmobile", "snow mobile", "snowmobile"], "difficult_direct_answer": false, "rationales": ["People on a yellow and black snow machine are stopped in the snow near a skier. snow machines are used by emergency personal and others in cold climates.", "The man in the red jacket is riding a snowmobile.", "The name is a snowmobile."], "image": "val2014/COCO_val2014_000000404546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22118, "question_id": "KWZvjAu48RsrWgTo3TXW3X", "question": "Why are the people laying on blankets?", "choices": ["to dry", "to rest", "to tan", "to sleep"], "correct_choice_idx": 2, "direct_answers": ["avoid sand", "to tan", "sand's hot", "to sunbathe", "sun bathe", "sand", "hot sand", "lounging", "hot sand", "taking sunbath"], "difficult_direct_answer": true, "rationales": ["People like to lie down without getting sand on them.", "The people are tanning.", "They are laying in the sun"], "image": "val2014/COCO_val2014_000000022118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447613, "question_id": "KWbLYdPwcMZRpk5DwzkFLM", "question": "What sort of business has left their wares on the street and sidewalk here?", "choices": ["cooper", "baker", "milliner", "florist"], "correct_choice_idx": 3, "direct_answers": ["flower shop", "florist", "florist", "florist", "florist", "florist", "florist", "flower shop", "florist", "florist"], "difficult_direct_answer": false, "rationales": ["There are flowers outside.", "The florist has a bunch of flowers on the street and sidewalk.", "Many plants and flowers can be seen outside of a business in the background, indicating that the business is selling them."], "image": "val2014/COCO_val2014_000000447613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389205, "question_id": "KXGt7qSa5HsvUiwAndLFRQ", "question": "What branch of a travel company is advertised here?", "choices": ["canadian", "burmese", "japanese", "british"], "correct_choice_idx": 0, "direct_answers": ["travel", "expedia", "expedia", "expedia", "expedia", "expedia", "canadian", "expedia", "expedia canada", "expedia"], "difficult_direct_answer": false, "rationales": ["The branch is canadian.", "The travel company is expedia.ca.", "Expedia.ca is advertised on a banner. .ca domains stand for canada."], "image": "val2014/COCO_val2014_000000389205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306800, "question_id": "KXk39ZHLgR84imPkFbEHtA", "question": "Why is there a bucket by the man playing guitar?", "choices": ["for water", "he's panhandling", "holding picks", "holding toys"], "correct_choice_idx": 1, "direct_answers": ["he's panhandling", "money", "tips", "get money", "taking donations", "panhandling", "for money", "asking money", "collect donations", "yes"], "difficult_direct_answer": true, "rationales": ["The man is homeless.", "There's a bucket next to the man playing guitar because he's playing for tips.", "He's trying to raise money."], "image": "train2014/COCO_train2014_000000306800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330139, "question_id": "KY7DDtqRXSJ7Ez2fFangqF", "question": "What direction are the elephants facing?", "choices": ["down", "right", "left", "up"], "correct_choice_idx": 1, "direct_answers": ["right", "right", "east", "water drinking", "right", "right", "east", "right", "water", "side"], "difficult_direct_answer": false, "rationales": ["The elephants are going the opposite direction of left.", "The elephants seems to be looking right from the river.", "Their right sides are showing."], "image": "train2014/COCO_train2014_000000330139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345142, "question_id": "KYD3zmPRMWjBDLFaV5sNew", "question": "What bank is a sponsor of the tennis match?", "choices": ["beneficial", "geico", "villanova", "park america"], "correct_choice_idx": 0, "direct_answers": ["park america", "america inc", "beneficial", "beneficial", "beneficial", "park america", "park america", "beneficial bank", "beneficial", "beneficial"], "difficult_direct_answer": false, "rationales": ["It says beneficial bank on the bottom.", "Beneficial is the words next to the player and the word bank is right next to that word.", "The name is on the billboard."], "image": "train2014/COCO_train2014_000000345142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128013, "question_id": "KYDgrb5EePrFQg8VcDtxfq", "question": "What shirt brand is represented by the amphibious animal?", "choices": ["polo", "nike", "lulu lemon", "adidas"], "correct_choice_idx": 0, "direct_answers": ["lacoste", "puma", "crocodile", "alligator", "lacoste", "polo", "not visible", "lacoste", "lacoste", "crocodile"], "difficult_direct_answer": false, "rationales": ["The brand usually has an aligator as their logo.", "Polo often has animals as their logos and none of these other companies do.", "Polo is represented by this aligator."], "image": "train2014/COCO_train2014_000000128013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385454, "question_id": "KYJ8TzftxPffhq8qEbcqEg", "question": "How does he block out the noise of the city?", "choices": ["stays inside", "earbuds", "singing", "covers ears"], "correct_choice_idx": 1, "direct_answers": ["earbuds", "earbuds", "earplugs", "earphones", "earbuds", "using earphones", "earbuds", "headphones", "earphones", "headset"], "difficult_direct_answer": false, "rationales": ["The man has earbuds.", "He has little buds in his ears.", "The items in the mans ears help to drown out exterior noise."], "image": "val2014/COCO_val2014_000000385454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130007, "question_id": "KYagZjoygQzkn3KKitVLUD", "question": "What is this sports equipment made of?", "choices": ["cloth", "rods", "grass", "strings"], "correct_choice_idx": 3, "direct_answers": ["graphite", "graphite", "graphite", "graphite", "plastic", "strings", "metal", "metal", "fiberglass", "plastic"], "difficult_direct_answer": false, "rationales": ["The sport being played is tennis based on the racket and the player's attire. tennis rackets, the equipment used, are known to have strings as visible here.", "Traditionally tennis rackets are made of fiberglass and string cloth.", "The equipment has strings."], "image": "train2014/COCO_train2014_000000130007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213592, "question_id": "KZ9w986rJLjcKiDTdB5gWn", "question": "What are these men on horseback holding in their hands?", "choices": ["brooms", "mallets", "clubs", "bats"], "correct_choice_idx": 1, "direct_answers": ["mallet", "polo sticks", "polo mallets", "polo mallets", "riding club", "clubs", "poles", "mallets", "polo mallets", "holding things"], "difficult_direct_answer": false, "rationales": ["Men are on horses on a polo field. people use mallets to play polo.", "You can tell by the sport that they are playing as to what the men are holding.", "The mallets are used in polo games."], "image": "val2014/COCO_val2014_000000213592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434022, "question_id": "KZF56gkeHkEMKYQkSATDsZ", "question": "What powers this train?", "choices": ["diesel", "natural gas", "unleaded", "electric"], "correct_choice_idx": 0, "direct_answers": ["steam", "diesel", "diesel", "coal", "diesel", "electricity", "coal", "fuel", "coal", "diesel"], "difficult_direct_answer": false, "rationales": ["The train has dark exhaust.", "The train uses the diesel to run the engine.", "The train on the tracks is powered by diesel fuel and lets off smoke."], "image": "val2014/COCO_val2014_000000434022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208372, "question_id": "KZn9wESBynmxAmFWMmuvhf", "question": "What type of bus is shown?", "choices": ["shuttle", "school", "double decker", "toy"], "correct_choice_idx": 0, "direct_answers": ["shuttle", "transport bus", "tourist", "tourist bus", "luxury coach", "coach", "travel", "travel coach", "passenger", "metro"], "difficult_direct_answer": true, "rationales": ["You can tell by the design and setting as to what type of bus is shown.", "There is a shuttle on the road.", "The bus has one level and is full sized. it does not appear to be affiliated with an educational institution."], "image": "train2014/COCO_train2014_000000208372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146315, "question_id": "KZnJS3Eu4tsDKTVerH8CpG", "question": "What are the men displaying around their necks?", "choices": ["photo", "drivers license", "passes", "names"], "correct_choice_idx": 2, "direct_answers": ["id passes", "event badges", "badges", "company tags", "lanyards", "name tags", "name tags", "passes", "name tags", "lanyards"], "difficult_direct_answer": false, "rationales": ["They have cards around their neck so they can get into their function they are attending.", "The men have passes on their necks.", "These papers are on a lanyard and indicate they are allowed in certain areas"], "image": "val2014/COCO_val2014_000000146315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158322, "question_id": "KZnnajRtqCtMBkLFfzb8Tj", "question": "What is the relationship status of this man?", "choices": ["married", "divorced", "single", "asexual"], "correct_choice_idx": 0, "direct_answers": ["married", "married", "married", "married", "married", "single", "married", "single", "married", "single"], "difficult_direct_answer": false, "rationales": ["He has a ring on his left ring finger.", "The man has a wedding ring on his left finger which usually symbolizes marriage.", "This man wears a piece of jewelry on his ring finger that is associated with marriage."], "image": "train2014/COCO_train2014_000000158322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107394, "question_id": "KZsqsjMTzXPy8kRAhNzva4", "question": "Which animal makes a food that is advertised here?", "choices": ["bee", "cow", "shrimp", "pig"], "correct_choice_idx": 0, "direct_answers": ["bee", "bee", "bees", "honey", "bee", "bees", "bee", "chicken", "tiger", "bee"], "difficult_direct_answer": false, "rationales": ["The sign on the left is advertising pure honey. cows, pigs, and shrimp do not produce honey.", "The bee makes honey which is advertised to the left.", "There is a sign adversities honey, which is an item produced by bees."], "image": "train2014/COCO_train2014_000000107394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365618, "question_id": "KZv4ReLbmpGE5wnwNPDsPE", "question": "What type of lights are on on the car?", "choices": ["headlights", "siren", "brake", "turn signal"], "correct_choice_idx": 2, "direct_answers": ["highway", "brake lights", "brake lights", "tail lights", "brake lights", "brake lights", "brake lights", "brake", "stop lights", "tail lights"], "difficult_direct_answer": false, "rationales": ["The lights on the back of a car turn red when they use them to stop.", "There are brake lights on the back of the car.", "The lights are brakes."], "image": "val2014/COCO_val2014_000000365618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422289, "question_id": "KaCnM2L4NvimQ2W5LBsbbG", "question": "What might someone need if they are walking by this clock?", "choices": ["umbrella", "dog", "watch", "snacks"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "umbrella", "walking", "dinner", "umbrella", "umbrella", "time", "umbrella", "seat", "time"], "difficult_direct_answer": false, "rationales": ["A hazy rain can be seen in an area near a clock.", "The umbrella could be used.", "It is raining."], "image": "train2014/COCO_train2014_000000422289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466097, "question_id": "KaiYpmkuPKvWMjZzD53xkh", "question": "What might happen below the TV?", "choices": ["calling", "napping", "fire", "dog sleeping"], "correct_choice_idx": 2, "direct_answers": ["start fire", "fire", "fire started", "fire", "fire", "fire", "fireplace", "fireplace", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["The tv is situated on top of the mantel.", "There is a fireplace below the tv.", "There is a fireplace located under the television."], "image": "train2014/COCO_train2014_000000466097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578497, "question_id": "Kb53iZtuFDGtysRSdjgGKu", "question": "What is used to sook the food?", "choices": ["light", "sun", "coal", "stove"], "correct_choice_idx": 3, "direct_answers": ["chop sticks", "oil", "bread", "soy sauce", "plate", "unknown", "knife", "stove", "oil", "water"], "difficult_direct_answer": true, "rationales": ["A stove is used to keep the food hot and warm prior to preparation.", "The other options don't match the frying process.", "You can use the stove to cook your food."], "image": "train2014/COCO_train2014_000000578497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289658, "question_id": "Kb7uZ5Xp4DsmFBC5WJcBxR", "question": "How many watts does a night lamp use?", "choices": ["3-7.5", "5-10", "1-2", "2.5-3.5"], "correct_choice_idx": 0, "direct_answers": ["45 watts", "one half", "varies", "60 watts", "fifteen", "3-7.5", "hundreds", "sixty", "three", "ten"], "difficult_direct_answer": true, "rationales": ["It depends on the type of bulb.", "A night light generally uses small bulbs that require less watts to run.", "This is an led light bulb that uses very small wattage."], "image": "train2014/COCO_train2014_000000289658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325567, "question_id": "KbJV7SDSrmgFgDEQh5oABH", "question": "How many spotted white chickens are there?", "choices": ["three", "one", "two", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "two", "threes", "three", "two", "three", "three", "three", "four"], "difficult_direct_answer": false, "rationales": ["There is one by the green part of the house, one by the grass and one in the dirt.", "There are 3 light colored chickens.", "Chickens are seen together with three being white."], "image": "train2014/COCO_train2014_000000325567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560797, "question_id": "KbUMRsAHw8ACvTSvjUHrvf", "question": "Near what feature does the person throw the ball to the catcher?", "choices": ["mountain", "dugout", "pitchers mound", "sand lot"], "correct_choice_idx": 2, "direct_answers": ["mound", "mound", "pitchers mound", "mound", "mound", "mound", "mound", "mound", "mound", "mound"], "difficult_direct_answer": false, "rationales": ["The pitcher's mound is where the pitcher stands.", "There is a pile of dirt behind them", "The raised pile of dirt in a sports field identifies it as the item in option a."], "image": "val2014/COCO_val2014_000000560797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330253, "question_id": "KbUNXiAZHo2rgJCJKdD9Ap", "question": "What has caused the puddle in front of the bus?", "choices": ["flooding", "rain", "snow", "hose"], "correct_choice_idx": 1, "direct_answers": ["rain", "rain", "bus", "rain water", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["Rain has caused a puddle of water to form under the front of the bus.", "There are clouds above the vehicles. it is not snowing.", "The gray sky and wet surface indicate precipitation, which corresponds to the term in option a."], "image": "train2014/COCO_train2014_000000330253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467135, "question_id": "KbVUF4Z5fNMBKfCQdAd8wG", "question": "What was done to this pizza?", "choices": ["lost", "thrown", "rubberized", "bite"], "correct_choice_idx": 3, "direct_answers": ["eaten", "got eaten", "eaten", "got eaten", "bite", "eaten", "eaten", "eaten", "eaten", "got eaten"], "difficult_direct_answer": false, "rationales": ["There are several bites taken out of the pizza in the person's hand.", "A person is holding a partial slice. it has bite marks in it.", "Part of it is missing and the edge has half moon shapes"], "image": "val2014/COCO_val2014_000000467135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215191, "question_id": "KbgECyhpBC7nFh67W9Stgd", "question": "Which room is this?", "choices": ["ball", "kitchen", "men's restroom", "ladies room"], "correct_choice_idx": 2, "direct_answers": ["toilet", "men's restroom", "bathroom", "bathroom", "men's room", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["A man is standing in from of a urinal that is in a row of urinals and he is standing like he is using the bathroom", "There is a man inside the restroom with toilets lined up indicated that only males go into this room.", "It appears to be a given the lifted seats and the form of the figure in the middle."], "image": "train2014/COCO_train2014_000000215191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549962, "question_id": "KbjYhzBHvXYDddd4CBCPFn", "question": "What type of vegetable is the bowl full of?", "choices": ["peas", "cruciferous", "fruits", "root"], "correct_choice_idx": 1, "direct_answers": ["broccoli", "cauliflower", "broccoli cauliflower", "broccoli", "cruciferous", "vegetables", "cruciferous", "greens", "brocolli", "cruciferous"], "difficult_direct_answer": false, "rationales": ["Broccoli and cauliflower are both cruciferous.", "Vegetables are in the bowl.", "The veggies are cruciferous."], "image": "train2014/COCO_train2014_000000549962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73016, "question_id": "KbsZXKRcDREkza3v92QyTY", "question": "How are the people in the background descending?", "choices": ["elevator", "stairs", "escalator", "ramp"], "correct_choice_idx": 2, "direct_answers": ["escalator", "escalator", "up", "escalator", "escalator", "escalator", "escalator", "escalator", "two", "escalator"], "difficult_direct_answer": false, "rationales": ["They are coming down an escalator.", "They are on steps. these steps are not stationary.", "An escalator is seen in the picture."], "image": "train2014/COCO_train2014_000000073016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107097, "question_id": "Kc6ZKxmoeSE7bNAqUzZvUx", "question": "What is the man holding the bat doing?", "choices": ["practicing", "injuring", "dancing", "fighting"], "correct_choice_idx": 0, "direct_answers": ["batting practice", "practicing", "swinging", "practicing", "practice swing", "swinging", "practicing", "practice swing", "warming up", "swinging"], "difficult_direct_answer": false, "rationales": ["The man is warming up before its his turn to bat.", "The man is holding a bat and swinging but he is not at home plate.", "He is hitting balls to practice while he is waiting to play"], "image": "train2014/COCO_train2014_000000107097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19874, "question_id": "KcUKsB8aGRiQqkoUQennWt", "question": "What is the purpose of the object?", "choices": ["help you", "give water", "provide food", "provide parking"], "correct_choice_idx": 3, "direct_answers": ["meter parking", "payment", "monitor time", "paid parking", "collect fee", "parking fare", "take money", "time parking", "pay parking", "provide parking"], "difficult_direct_answer": true, "rationales": ["This meter is alongside the road and allows motorists to pay to park their car in the provided space for a designated time. the blue sign with the white 'p' on the meter's side shows that it is for parking.", "This is a meter where you put money in it. this is for when cars stop and pulled along curb to go into a building.", "The purpose is for parking."], "image": "train2014/COCO_train2014_000000019874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522819, "question_id": "KcdKatEeXAzXfbSfqnkUfR", "question": "What type of meal will be served later?", "choices": ["formal", "buffet", "casual", "potluck"], "correct_choice_idx": 0, "direct_answers": ["dinner", "dinner", "gourmet", "fancy meal", "formal", "fancy meal", "fancy", "dinner", "hot", "fancy"], "difficult_direct_answer": false, "rationales": ["The table is fully set with silver settings.", "From the fancy polished dinnerware, this is a formal get together most likely.", "It looks fancy."], "image": "train2014/COCO_train2014_000000522819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286101, "question_id": "Kci9UQnTZMZjGR3tJQGsWM", "question": "In what continent is this airport situated at?", "choices": ["america", "africa", "asia", "europe"], "correct_choice_idx": 3, "direct_answers": ["europe", "europe", "europe", "europe", "air europa", "europe", "asia", "europe", "europe", "airplain"], "difficult_direct_answer": false, "rationales": ["We can see an airberlin plane in this picture. berlin is a city in europe.", "The red lettering on the side of the airplane read, \"aireuropa\".", "The word is on the plane"], "image": "train2014/COCO_train2014_000000286101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18377, "question_id": "KciM9M3DLT9pjX4Jw72YHx", "question": "What is different about this pizza than most pizzas?", "choices": ["square shape", "no cheese", "all mushrooms", "missing toppings"], "correct_choice_idx": 0, "direct_answers": ["shape", "its square", "shape", "square", "square shaped", "square", "square", "square shape", "square", "rectangle shape"], "difficult_direct_answer": false, "rationales": ["The pizza is not in its regular round shape but is shaped like a box.", "Most pizzas are round. however, this pizza has no curved edges just sharp corners.", "Most pizzas are served as a circle."], "image": "train2014/COCO_train2014_000000018377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221202, "question_id": "Kd2B8bpFwgtpMGuayLEEq5", "question": "What are the pants called being worn by the equestrians?", "choices": ["peddle pushers", "capris", "tights", "jodhpurs"], "correct_choice_idx": 3, "direct_answers": ["jodhpurs", "jodhpurs", "riding pants", "jumpers", "jodhpurs", "jodhpurs", "riding pants", "jodhpurs", "jodhpurs", "breeches"], "difficult_direct_answer": false, "rationales": ["These type of horse riders do wear like to wear them.", "Traditionally these type of pants are used by jockeys and equestrians.", "The pants are jodhpurs."], "image": "train2014/COCO_train2014_000000221202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236075, "question_id": "Kd362HPfipE5UDqWhVWJiW", "question": "What might the man be using the dog to find?", "choices": ["food", "killers", "squirrels", "drugs"], "correct_choice_idx": 3, "direct_answers": ["drugs", "drugs", "criminal", "drugs", "drugs", "drugs", "drugs", "drugs", "drugs", "criminal"], "difficult_direct_answer": false, "rationales": ["The man is a law enforcement officer and wants to be sure nothing illegal is being transported.", "The man is using the dog to sniff the vehicle for hidden drugs.", "The man is dressed in police uniform, and they have a well known canine unit where dogs find drugs, as their noses can easily sniff them out."], "image": "train2014/COCO_train2014_000000236075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82668, "question_id": "KdQ7amNBfwvb2HfsqZat5b", "question": "What direction is the wind blowing here?", "choices": ["west", "north", "east", "none"], "correct_choice_idx": 3, "direct_answers": ["east", "right", "no wind", "no wind", "no wind", "right", "none", "right", "no wind", "non windy"], "difficult_direct_answer": false, "rationales": ["There is no wind...the flag is hanging down.", "It does not appear to be windy.", "There is no movement."], "image": "val2014/COCO_val2014_000000082668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473604, "question_id": "KddCpC2NFdLkT4EcdkozLT", "question": "What is a mini version of this food called?", "choices": ["petit four", "chipolata", "pizzetta", "slider"], "correct_choice_idx": 2, "direct_answers": ["mini pizza", "personal pizza", "pizzetta", "pizza bites", "pizzetta", "mini pizza", "pizzetta", "personal pizza", "calzone", "pizzetta"], "difficult_direct_answer": false, "rationales": ["The word is a form of the word \"pizza\"", "The mini version is a pizzetta.", "The mini version is the pizzetta."], "image": "train2014/COCO_train2014_000000473604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505038, "question_id": "Kdo2LWv6YHYerz6XQaLz9a", "question": "What is on the couch?", "choices": ["lounge singer", "pillow", "baby", "kitten"], "correct_choice_idx": 1, "direct_answers": ["pillows", "pillows", "pillows", "pillow", "pillow", "pillows", "pillows", "throw pillows", "pillows", "pillows"], "difficult_direct_answer": false, "rationales": ["They are matching cloth squares filled with soft cushioning.", "The pillow is on the couch.", "Pillows are on both ends."], "image": "train2014/COCO_train2014_000000505038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97131, "question_id": "KdxNaKfwiDPAtQrhYjMjph", "question": "How many cars are visible or barely visible around the black car in focus?", "choices": ["three", "five", "four", "two"], "correct_choice_idx": 0, "direct_answers": ["three", "four", "three", "four", "three", "three", "three", "one", "two", "three"], "difficult_direct_answer": false, "rationales": ["There is one car that is close up. you can see part of two cars in the background", "There are at least two cars behind of the car barely visible.", "There are two cars directly in the front and one around the corner"], "image": "val2014/COCO_val2014_000000097131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292432, "question_id": "KeCEG6Q7VFv2UZt4toHFYB", "question": "If you have trouble walking what pictured thing might assist you here?", "choices": ["walker", "railing", "lamp post", "wheelchair"], "correct_choice_idx": 1, "direct_answers": ["handrail", "train", "railing", "hand rails", "wheelchair", "train", "hand rail", "fear game", "hand rail", "railing"], "difficult_direct_answer": false, "rationales": ["There is a safety railing.", "There is a silver railing at the train station for people that have trouble walking.", "There is a railway to assist people walking here."], "image": "train2014/COCO_train2014_000000292432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318099, "question_id": "KeCgPkkHjgs5zxQjHRPuzJ", "question": "Why are their mouths open?", "choices": ["talking", "drinking", "hunting", "hungry"], "correct_choice_idx": 3, "direct_answers": ["hungry", "anticipating food", "hungry", "food", "hungry", "crow", "hungry", "hungry", "hungry", "hungry"], "difficult_direct_answer": false, "rationales": ["Baby birds often open their mouths like this to be fed by the mother bird.", "The birds want to eat.", "The birds want the mother brood to drop food in their mouths."], "image": "val2014/COCO_val2014_000000318099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572725, "question_id": "KeMGcULPPtiVwqvtfEAh8E", "question": "What has been rubbed off the top of the railing shown here?", "choices": ["food", "grass", "paint", "nothing"], "correct_choice_idx": 2, "direct_answers": ["paint", "paint", "paint", "paint", "metal", "paint", "paint", "green paint", "paint", "scatter"], "difficult_direct_answer": false, "rationales": ["It is down to the bare metal", "A green railing is scraped up and a skateboarder is grinding on it.", "The skateboard grinding removes it."], "image": "val2014/COCO_val2014_000000572725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421602, "question_id": "KeV5AME2eheSWLniAWhpU2", "question": "What country is this bus in?", "choices": ["china", "england", "united states", "japan"], "correct_choice_idx": 1, "direct_answers": ["england", "england", "england", "england", "england", "canada", "england", "england", "united kingdom", "wanton"], "difficult_direct_answer": false, "rationales": ["Bournemouth is in england.", "The towns on the bus are from that country.", "The signs tell the cities."], "image": "train2014/COCO_train2014_000000421602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338258, "question_id": "KeowDv63QiAtByodqrcwv5", "question": "Who goes to this place?", "choices": ["surfers", "fishermen", "children", "kayakers"], "correct_choice_idx": 0, "direct_answers": ["surfers", "beachgoers", "tourists", "swimming", "vacationers", "everyone", "people", "swimmer surfers", "surfers", "families"], "difficult_direct_answer": true, "rationales": ["There are large waves to ride", "The people beside the beach have water boards.", "The people are at the ocean which is popular place for surfers to go surfing in the water.1"], "image": "train2014/COCO_train2014_000000338258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408368, "question_id": "KerM3baD6NNEY6XK4PMT9f", "question": "What is the horse on the right staring at?", "choices": ["goose", "tree", "bush", "mouse"], "correct_choice_idx": 0, "direct_answers": ["goose", "ducks", "goose", "duck", "ground", "duck", "ducks", "ducks", "goose", "bird"], "difficult_direct_answer": false, "rationales": ["The trees and bushes are behind the horse. there are no mice.", "The horse is by the goose.", "The goose is the only bird in the area hence the goose."], "image": "train2014/COCO_train2014_000000408368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211891, "question_id": "KfQYdqhz2ZmrfucqkuvyZU", "question": "WHat type of architecture is on the ceiling?", "choices": ["arches", "coverings", "hooks", "semi-circle"], "correct_choice_idx": 0, "direct_answers": ["arch", "curved", "baroque", "arches", "arches", "glass", "arch", "windows", "gothic architecture", "dome"], "difficult_direct_answer": false, "rationales": ["The ceiling is curved.", "The ceiling has a curve to it.", "The ceiling of the train station is made of decorative metal arches."], "image": "val2014/COCO_val2014_000000211891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485440, "question_id": "KfUnrHixMtvWoiaKke9fuL", "question": "Why is he playing the instrument?", "choices": ["for money", "is lost", "amuse people", "practicing"], "correct_choice_idx": 0, "direct_answers": ["hobby", "not clear", "make music", "entertaining people", "for money", "provide entertainment", "make money", "entertain guests", "entertainment", "entertainment"], "difficult_direct_answer": true, "rationales": ["The person wants money.", "He is hoping for tips from the beachgoers.", "He is a vendor that is playing music to make money on the side."], "image": "train2014/COCO_train2014_000000485440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118974, "question_id": "KfZfzgkVdy2nxoRbGoS3Jj", "question": "What would impede a disabled person in a wheelchair from joining the people at the table?", "choices": ["dog", "step", "chairs", "people"], "correct_choice_idx": 1, "direct_answers": ["step", "steps", "raised floor", "cant enter", "step", "step", "steps", "step", "step", "step"], "difficult_direct_answer": false, "rationales": ["There is a heightened area that would make someone in a wheelchair struggle. the wheel would have to be picked up.", "The step would make it hard for a wheelchair to get up onto the table area.", "The floor changes in elevation without a ramp."], "image": "train2014/COCO_train2014_000000118974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570116, "question_id": "Kfet9FfTFinHQe8FkdyPj2", "question": "Why is the man bending over?", "choices": ["steal cild", "child's level", "grab ball", "lost money"], "correct_choice_idx": 1, "direct_answers": ["coaching", "child's level", "help child", "reach ball", "giving instruction", "short child", "help kid", "help child", "helping child", "help boy"], "difficult_direct_answer": true, "rationales": ["To be closer to the child when he helps the child out.", "The boy in the photo appears to be looking at something across the street. it appears that the man next is also looking there but must get to the child's level to see it.", "The man is trying to teach the little boy something so he is bending down to be closer to him."], "image": "val2014/COCO_val2014_000000570116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338478, "question_id": "KfimuEsm5pupmwkAD3GJsM", "question": "Why is the player wearing a glove?", "choices": ["fashion", "warmth", "health", "grip"], "correct_choice_idx": 3, "direct_answers": ["catch ball", "catching mitt", "playing baseball", "catch ball", "baseball", "pitching baseball", "catch ball", "catch ball", "practice pitch", "grip"], "difficult_direct_answer": false, "rationales": ["The person wants to grip the ball.", "The player is in a baseball game, which requires careful control of the baseball so the glove helps the player with the grip of the ball, adding to his control ability.", "The man is trying to hold a grip on the ball."], "image": "train2014/COCO_train2014_000000338478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565575, "question_id": "Kfnzuyr4YRzRZgYHMPGDcs", "question": "What would be the best outcome for this person shown here?", "choices": ["base run", "love point", "strike out", "strike"], "correct_choice_idx": 3, "direct_answers": ["strike", "strike", "strike", "strike", "strike", "strike", "strike", "strike", "strike", "strike"], "difficult_direct_answer": false, "rationales": ["To knock down all the pins they would get the best score possible.", "There is a boy with his hand up in the air playing a sorta bowling game with a remote in hand.", "He is bowling."], "image": "val2014/COCO_val2014_000000565575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279846, "question_id": "KfvwuHQDaHtaJ5M628DfZa", "question": "What animal did the topmost ingredient come from?", "choices": ["cow", "pig", "chicken", "fish"], "correct_choice_idx": 2, "direct_answers": ["chicken", "chicken", "chicken", "hen", "chicken", "chicken", "hen", "hen", "hen", "chicken"], "difficult_direct_answer": false, "rationales": ["Eggs come from chickens.", "Eggs are on the pizza.", "Chickens are known for laying eggs on a regular basis."], "image": "val2014/COCO_val2014_000000279846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252086, "question_id": "Kfwkmi744Pb8TZKQB2VJVp", "question": "What fun activity is shown?", "choices": ["free fall", "water slide", "rollar coaster", "bumper cars"], "correct_choice_idx": 1, "direct_answers": ["surfing", "water slide", "waterslide", "water slide", "wake boarding", "surfing", "boogie boarding", "surfing", "boogie boarding", "surfing"], "difficult_direct_answer": false, "rationales": ["There is water and it slopes down", "This is a fun water slide.", "The slides are blue and have running water on them."], "image": "train2014/COCO_train2014_000000252086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48943, "question_id": "KgFK5jtx66SZDPPw9qdNsH", "question": "What are airplane propellers made of?", "choices": ["kevlar", "metal", "graphite", "aluminum alloy"], "correct_choice_idx": 3, "direct_answers": ["metal wood", "aluminum alloy", "wood aluminum", "metal", "metal", "aluminum", "fiberglass", "steel", "steel", "aluminum"], "difficult_direct_answer": false, "rationales": ["The plane is made of metal.", "This is a common type of propeller", "Airplain propellers are made of a light metal."], "image": "train2014/COCO_train2014_000000048943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248786, "question_id": "KgGKvx8SNCxgVn9rqgBFyQ", "question": "What is on the table that can help them refill their drinks?", "choices": ["chef", "waiter", "plate", "pitcher"], "correct_choice_idx": 3, "direct_answers": ["eating", "pitcher", "pitcher", "tea jug", "wine", "pitcher", "pitcher", "pitcher", "pitcher", "wheel"], "difficult_direct_answer": false, "rationales": ["The table has pitchers of water on it.", "While option a and b are possible and the others are not answers, option b is not something that is on the table as the question asks.", "Most times servers will give tables a pitcher of beer for convenience."], "image": "val2014/COCO_val2014_000000248786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427060, "question_id": "KgdzNG6JSpBfEnPbQDnwGH", "question": "What flag does the woman have on her motorcycle?", "choices": ["american", "spanish", "puerto rican", "canadian"], "correct_choice_idx": 2, "direct_answers": ["puerto rico", "puerto rico", "puerto rico", "puerto rican", "puerto rico", "puerto rican", "haiti", "puerto rico", "african", "puerto rico"], "difficult_direct_answer": false, "rationales": ["The flag is puerto rican.", "A flag with one star and red with white strips belongs to the country puerto rico.", "The woman has a flag that looks like an american flag but has a star inside a triangle."], "image": "train2014/COCO_train2014_000000427060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293749, "question_id": "KgnEWHis9ExWjyi7C4yMkx", "question": "What type of trees are on the left?", "choices": ["palm", "pine", "cherry blossoms", "willow"], "correct_choice_idx": 2, "direct_answers": ["cherry", "cherry", "cherry blossoms", "cherry", "cherry", "cherry blossom", "cherry blossoms", "peach", "cherries", "cherry"], "difficult_direct_answer": false, "rationales": ["The trees have cherry blossoms actively blooming on them.", "The trees have the pink leaves that come with cherry blossoms.", "The flowers are pink which is characteristic of the blooms of this type of tree."], "image": "val2014/COCO_val2014_000000293749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572572, "question_id": "KgwxjPogHTSiAa4GSfUBZF", "question": "What is separating the boy from the polar bear?", "choices": ["rubber", "plastic", "glass", "nothing"], "correct_choice_idx": 2, "direct_answers": ["glass", "glass", "glass", "glass", "glass", "thick glass", "glass", "glass", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["The boy has his hand against the surface and one can see that something see through and solid is holding the water back.", "The boy is in the viewing area underground, where the polar bear can be safely viewed at a zoo.", "There is a thick see through wall there so the animal does not hurt the child and the animal can stay safely in the water."], "image": "train2014/COCO_train2014_000000572572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42481, "question_id": "KhEdpphoWBymwTTMNncbBn", "question": "What sport are these people fans of?", "choices": ["soccer", "football", "basketball", "tennis"], "correct_choice_idx": 1, "direct_answers": ["marathon", "marathon", "football", "football", "football", "football", "football", "football", "marathon", "football"], "difficult_direct_answer": false, "rationales": ["The woman has a sign that says \"pats\" on it and others have new england patriots gear on which is a football team.", "Many people are standing around and some holding a pats sign.", "Some of the people are sporting the team logo on their beanies and a flag, as well as a kid holding up the sign with the name of the team."], "image": "train2014/COCO_train2014_000000042481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123579, "question_id": "KhHhpup7oBmLCMz3rWPhp6", "question": "What company took over that phone company?", "choices": ["tmobile", "verizon", "att", "boost"], "correct_choice_idx": 0, "direct_answers": ["tmobile", "verizon", "verizon", "t-mobile", "t-mobile", "verizon", "t-mobile", "t-mobile", "t-mobile", "t-mobile"], "difficult_direct_answer": false, "rationales": ["Sprint was acquired by tmobile.", "The company is t mobile.", "Tmobile took it over."], "image": "train2014/COCO_train2014_000000123579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289515, "question_id": "KhWYW8u5Yg8dmwPftzbtxV", "question": "Person's who work here report at which time of day to work?", "choices": ["rush hour", "noon", "pre dawn", "nine"], "correct_choice_idx": 2, "direct_answers": ["morning", "early morning", "morning", "morning", "morning", "morning", "early", "morning", "day time", "pre dawn"], "difficult_direct_answer": false, "rationales": ["That type of snack is sometimes eaten as a breakfast food, so people might want them early in the morning.", "There is a curb with a bike and some cars parked on side of road. there is the sun in the distance but not full overhead.", "They need to have the baking done before people are out for breakfast"], "image": "train2014/COCO_train2014_000000289515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236717, "question_id": "KheHaxUJ9ULLzk5MPfvEVR", "question": "What is the luxury division of this motor company?", "choices": ["jaguar", "lexus", "infinity", "acura"], "correct_choice_idx": 2, "direct_answers": ["infinity", "ducati", "altima", "lexus", "mercedes", "nissan", "car", "landrover", "subaru", "infiniti"], "difficult_direct_answer": true, "rationales": ["This is a nissan, not honda, tata, or toyota, pickup truck.", "A nissan truck can be seen parked.", "Nissan and infiniti are owned by the same company."], "image": "val2014/COCO_val2014_000000236717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451969, "question_id": "Ki2YZ6Y7c9X2Da9v2T42i4", "question": "What type of service sponsors this stadium?", "choices": ["banking", "dining", "crafting", "plumbing"], "correct_choice_idx": 0, "direct_answers": ["banking", "bank", "bank", "bank", "financial", "banking", "banking", "banking", "bank", "banking"], "difficult_direct_answer": false, "rationales": ["The sign in the bankground, says bank of america, indicating that company has paid for advertising in this stadium.", "Banking services are the sponsor.", "The bright orange and white sign indicates that one of the sponsors is a bank."], "image": "train2014/COCO_train2014_000000451969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283168, "question_id": "KiCFaEdCNFKs3t339ydNce", "question": "The bread looks like it is filled with what?", "choices": ["mustard", "cabbage", "meat", "butter"], "correct_choice_idx": 2, "direct_answers": ["pulled pork", "pulled pork", "meat", "beef", "pulled pork", "ham", "liquid", "meat", "meat", "meat"], "difficult_direct_answer": false, "rationales": ["The bread has a sloppy joe sandwich in it.", "The bread has meat.", "You can clearly see it in the sandwich."], "image": "val2014/COCO_val2014_000000283168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74412, "question_id": "KiKLdtJF7tM5WPzF4mWimv", "question": "What kind of Panorama photography it is?", "choices": ["circular", "rectangular", "pin", "parallel"], "correct_choice_idx": 2, "direct_answers": ["fish eye", "landscape", "pin", "fish eye", "three sixty", "fisheye", "wide format", "polaroid", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The photography pins down on a central focus.", "The panorama is in pin style.", "The photography style is known as a pin."], "image": "val2014/COCO_val2014_000000074412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175013, "question_id": "KiSeQb9nSPYt6XyQJFpcNA", "question": "The man on the motorcycle is outside of which brand of motorcycle dealer?", "choices": ["ducati", "harley-davidson", "kawasaki", "yamaha"], "correct_choice_idx": 0, "direct_answers": ["dainese", "ducati", "ducati", "dames", "yamaha", "ducati", "dainese", "dainese", "ninja", "ducati"], "difficult_direct_answer": false, "rationales": ["The brand logo is visible on the building in the background. based on the shape and design of the logo it is most likely answer a.", "The name is on the store.", "A man is wearing a black jacket. it has the name of a d brand on side of it."], "image": "train2014/COCO_train2014_000000175013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571842, "question_id": "KiV2HAkRYQt2EBDoX8QBiC", "question": "What animal is the blue balloon on the right shaped as?", "choices": ["whale", "dog", "unicorn", "dolphin"], "correct_choice_idx": 2, "direct_answers": ["unicorn", "horse", "unicorn", "unicorn", "unicorn", "unicorn", "unicorn", "unicorn", "unicorn", "dog"], "difficult_direct_answer": false, "rationales": ["A blue balloon has a pointed object on it's head. unicorns have a horn on their heads.", "The animal is a unicorn.", "The blue one has one horn on the middle of it's head."], "image": "train2014/COCO_train2014_000000571842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511062, "question_id": "KiZ5JmspFZuU257juAShrH", "question": "Where is the woman with the large surfboard?", "choices": ["beach", "park", "forest", "pier"], "correct_choice_idx": 0, "direct_answers": ["beach", "sand", "beach", "beach", "beach", "beach", "sand", "beach", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The ground is covered in sand and the woman is wearing a swimsuit and holding a surfboard, which are clues that this location is a beach.", "There is sand and you use a surfboard near beaches", "The woman is at the beach."], "image": "train2014/COCO_train2014_000000511062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61217, "question_id": "KibHUqPQ8fDYUitHyL8N47", "question": "What type of printing is used to create these items?", "choices": ["4d", "3d", "3b", "d3"], "correct_choice_idx": 1, "direct_answers": ["three d", "3 d", "3d", "3d", "3d", "three d", "3d", "3d printing", "3d", "3d printing"], "difficult_direct_answer": false, "rationales": ["The printing is 3d.", "The items were created with a 3d printer.", "The item looks real but is not and is made with a 3d printer."], "image": "train2014/COCO_train2014_000000061217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485947, "question_id": "KihWNQbtqfpjTdAnvcxiH3", "question": "Who is likely taking this picture in relation to the person who poses?", "choices": ["enemy", "spy", "news team", "intimate friend"], "correct_choice_idx": 3, "direct_answers": ["intimate friend", "photographer", "significant other", "partner", "in front", "wife", "spouse", "spouse", "friend", "wife"], "difficult_direct_answer": false, "rationales": ["The man who is posing is in a back yard and is partially naked. the person taking the picture must be someone he knows and trusts.", "The man is very relaxed as he looks at the camera", "The man is not wearing pants and looks comfortable."], "image": "train2014/COCO_train2014_000000485947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407688, "question_id": "KirUqYGbFuPLfMYrw3V8GG", "question": "Which one of these foods can likely be purchased inside?", "choices": ["chicken", "pork", "beef", "tuna"], "correct_choice_idx": 3, "direct_answers": ["seafood", "seafood", "seafood", "tuna", "seafood", "seafood", "seafood", "seafood", "seafood", "seafood"], "difficult_direct_answer": false, "rationales": ["Tuna can be bought inside since this is a seafood market.", "The food is tuna.", "The sign indicates that this is a seafood restaurant. chicken, beef, and pork are not seafood."], "image": "train2014/COCO_train2014_000000407688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359141, "question_id": "KjEAStyeSraW6oyN5XQCBf", "question": "Why do horses need shoes?", "choices": ["make taller", "look good", "protect hooves", "run"], "correct_choice_idx": 2, "direct_answers": ["prevent chipping", "foot protection", "for protection", "protect feet", "protect hooves", "sore hooves", "protect hooves", "protection", "protect feet", "walk"], "difficult_direct_answer": false, "rationales": ["The shoes protect their hooves.", "Horses need to keep their hooves protected.", "The shoes keep their feet from cracking and hurting."], "image": "train2014/COCO_train2014_000000359141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522191, "question_id": "KjGmumTAWgPHrHXbaBqopT", "question": "What country is depicted in the photo?", "choices": ["non-english speaking", "korean speaking", "chinese speaking", "english speaking"], "correct_choice_idx": 0, "direct_answers": ["ireland", "ireland", "ireland", "germany", "mexico", "ireland", "non-english speaking", "ireland", "germany", "ireland"], "difficult_direct_answer": false, "rationales": ["The text on the bus isn't in english.", "The wording on the bus and elsewhere are in a foreign language to and english speaking person.", "The country doesn't use english."], "image": "train2014/COCO_train2014_000000522191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526371, "question_id": "KjnGPQp3jxr5Ws4GMFSeXc", "question": "Why is the man wearing a covering over his face?", "choices": ["keeping cool", "keeping warm", "hiding acne", "hiding scar"], "correct_choice_idx": 1, "direct_answers": ["keeping warm", "for warmth", "sun protection", "goggles", "sunglasses", "cold", "keep warm", "goggles", "cold", "stay warm"], "difficult_direct_answer": false, "rationales": ["It's a cold climate, hence all the snow on the ground, so the face covering (and all the other clothing) is certainly being worn for warmth.", "Here we see a man skiing in heavy snow. this type of locale is often cold.", "There is snow on the ground"], "image": "val2014/COCO_val2014_000000526371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403943, "question_id": "KjpWqbaHhhbe59wKPjizoT", "question": "If he is going to step out from under this structure he should put on what?", "choices": ["sunblock", "shoes", "suit", "music"], "correct_choice_idx": 0, "direct_answers": ["sunscreen", "sunscreen", "sunscreen", "sunblock", "sunscreen", "sunscreen", "sun block", "beach sand", "sunscreen", "sunscreen"], "difficult_direct_answer": false, "rationales": ["He is in the shade now but will need to protect himself with sunblock if he goes into the sun.", "Sunblock helps block out the sun.", "The person needs sunscreen."], "image": "val2014/COCO_val2014_000000403943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41319, "question_id": "Kjv7d2wpGXPP8rqYRRNdXH", "question": "In which location was the man probably photographed?", "choices": ["school auditorium", "banquet hall", "at home", "workplace"], "correct_choice_idx": 2, "direct_answers": ["living room", "home", "house", "living room", "home", "home", "home", "home", "home", "at home"], "difficult_direct_answer": false, "rationales": ["The man is standing in the living room of a house when his picture was taken.", "He looks to be at his residence before he goes out dressed up.", "A guy is in formal clothes in a casual looking, residential area with wallpaper on the walls and household items visible."], "image": "train2014/COCO_train2014_000000041319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565543, "question_id": "Kk5CJimiEXbXsbd8njp27G", "question": "What is spraying all around?", "choices": ["water", "soda", "silly string", "foam"], "correct_choice_idx": 0, "direct_answers": ["sprinklers", "mist", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["It is spraying to water the grass.", "There is water spraying all around the umbrellas.", "There is water spraying out from the tops of the umbrellas in the lawn."], "image": "val2014/COCO_val2014_000000565543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326073, "question_id": "KkFM8PnLLrLQG5aTCkkgZM", "question": "Where is this cat likely hanging out?", "choices": ["park", "backyard", "playground", "forest"], "correct_choice_idx": 1, "direct_answers": ["yard", "picnic table", "outside", "yard", "bench", "backyard", "picnic table", "park", "table", "outside"], "difficult_direct_answer": false, "rationales": ["There appears to be a house in the background judging by the window and siding and because house cats are often found near the homes they live in. this outside scene directly next to a house would likely be answer a.", "The cat is on a picnic table in front of a house.", "There is a metal picnic table which is common in public areas"], "image": "train2014/COCO_train2014_000000326073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15885, "question_id": "KkSQBLa2Jmi4WpLyo2YDqG", "question": "Why are the men wearing badges?", "choices": ["costume", "uniform", "visibility", "protection"], "correct_choice_idx": 1, "direct_answers": ["policemen", "police", "police officers", "police", "police officers", "mounted police", "policemen", "police", "police", "uniform"], "difficult_direct_answer": false, "rationales": ["The men are in uniform.", "The men are police.", "Police officers usually wear attire that have metal, palm-sized signifiers showing that they are real officers."], "image": "train2014/COCO_train2014_000000015885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199927, "question_id": "KkSVx63ogd5JQoAAZLjSCb", "question": "From what source does the item held in this woman's mouth come?", "choices": ["elephants", "factory", "zoo", "tree"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange tree", "tree", "tree", "orange", "orange", "orange", "tree", "tree", "trees"], "difficult_direct_answer": false, "rationales": ["It is an orange.", "The item held in this woman's mouth is an orange. oranges grow on this plant.", "A woman is eating an orange. oranges grow on trees."], "image": "val2014/COCO_val2014_000000199927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528984, "question_id": "KkWSRQGf7mLVhrH3Mzi4vN", "question": "Why is this hill so small?", "choices": ["for practice", "buried snow", "nothing else", "for children"], "correct_choice_idx": 3, "direct_answers": ["kids area", "for children", "beginners", "bunny hill", "kids slope", "bunny hill", "nature", "prairies", "for kids", "glacial carving"], "difficult_direct_answer": true, "rationales": ["The sign labeling the hill says it is a kids' area.", "The hill is set up for beginner skiers and often that is how children learn.", "Children begin on these bunny slopes and they are much smaller and less steep."], "image": "val2014/COCO_val2014_000000528984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68656, "question_id": "KmDAqaszAZUaBvSbeSBKM9", "question": "In what century was this picture taken?", "choices": ["20th", "19th", "18th", "26th"], "correct_choice_idx": 0, "direct_answers": ["i do", "20th", "20th", "20th", "twentieth", "twentieth", "nineteth", "twentieth", "eighteenth", "eighteenth"], "difficult_direct_answer": false, "rationales": ["The 1900's are the 20th century", "They were taken in 1926", "The picture must have been taken in the 20th century given the date at the top right."], "image": "train2014/COCO_train2014_000000068656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556986, "question_id": "KmEcojWcqegc8TuA9JREis", "question": "What part of his body is he trying to protect with equipment?", "choices": ["knee", "wrist", "head", "elbow"], "correct_choice_idx": 2, "direct_answers": ["head", "head", "head", "head", "head", "head", "head", "head", "head", "head"], "difficult_direct_answer": false, "rationales": ["He is wearing a helmet. he is not wearing any other pads.", "He is wearing a helmet.", "He is wearing a helmet, which is an essential item used to protect the head because head injuries are prevalent in skateboarding."], "image": "train2014/COCO_train2014_000000556986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379987, "question_id": "KmRmUfzZt5HkmGFbF68onL", "question": "Why are the eggs placed in the container?", "choices": ["protection", "to cook", "to eat", "to dye"], "correct_choice_idx": 0, "direct_answers": ["storage", "safety", "protection", "vegetable", "storage", "safety", "not crack", "keep safe", "protection", "carry"], "difficult_direct_answer": false, "rationales": ["Eggs are very fragile and can easily crack, so the container cradles them and keeps them protected until someone wishes to crack them for cooking.", "The styrofoam container helps cushion the eggs against breakage in case the package is handled roughly.", "Egg cartons are used to keep eggs from cracking."], "image": "train2014/COCO_train2014_000000379987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36046, "question_id": "KmfMmW33SVGSzNgEBwdXom", "question": "What is the man in the middle doing?", "choices": ["resting", "using phone", "resting", "shaving"], "correct_choice_idx": 1, "direct_answers": ["talking", "on phone", "talking phone", "using phone", "using phone", "talking phone", "using phone", "riding train", "talking phone", "sitting"], "difficult_direct_answer": false, "rationales": ["The man has a phone up to his face.", "He has his hand up to his ear", "The man has the phone up to his head."], "image": "train2014/COCO_train2014_000000036046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307341, "question_id": "KmvoKzPMQrTPYhZAAXdDdc", "question": "Who owns the dogs shown here?", "choices": ["no one", "farmer", "skiing lady", "pet store"], "correct_choice_idx": 2, "direct_answers": ["lady", "skier", "woman", "woman", "skiing lady", "woman", "lady", "man", "woman", "purple jacket"], "difficult_direct_answer": false, "rationales": ["The dogs are wearing collars, so they belong to someone. their owner is standing beside them.", "There's only one person visible near the dogs, and the dogs seem friendly to her.", "A woman is standing behind the dogs so she likely owns the dogs."], "image": "val2014/COCO_val2014_000000307341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389634, "question_id": "Kmwkm6Co7ownUHwp685Qs5", "question": "What complimentary service does the bus offer on board?", "choices": ["air-conditioner", "restrooms", "wi-fi", "movies"], "correct_choice_idx": 2, "direct_answers": ["rides", "wifi", "wi-fi", "wifi", "wifi", "wifi", "rides", "wifi", "wifi", "wifi"], "difficult_direct_answer": false, "rationales": ["It clearly states on the front of the bus that it offers free wifi. this is a courtesy extended more and more now that internet access is a vital part of so many lives.", "A bus has a sign informing of free internet on it.", "More of a recent amenity with buses, wifi is accessible."], "image": "train2014/COCO_train2014_000000389634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219454, "question_id": "KnDTxtpi2s8pi5udMtEARF", "question": "What is this movie most likely to be?", "choices": ["horror", "comedy", "action", "history"], "correct_choice_idx": 1, "direct_answers": ["funny", "comedy", "safety last", "conmedy", "silent", "bw", "comedy", "comedy", "comedy", "comedy"], "difficult_direct_answer": false, "rationales": ["The person hanging from the clock is being comedic.", "The man is performing an exaggerated act so this should be a comedy film.", "The movie is a comedy."], "image": "train2014/COCO_train2014_000000219454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47552, "question_id": "KnRbhXTGkMMBUTQ6fiW94h", "question": "Who is the bench for?", "choices": ["defendants", "judges", "passengers", "patients"], "correct_choice_idx": 2, "direct_answers": ["passengers", "passengers", "train waiting", "passengers", "people", "train riders", "passengers", "passengers", "passengers", "passengers"], "difficult_direct_answer": false, "rationales": ["The bench on the train platform is for passengers that want to sit while they wait.", "People wait for the train on the bench.", "The passenger seems to be sit here to wait the train."], "image": "val2014/COCO_val2014_000000047552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259467, "question_id": "KnyspRCWsK4gmWvJEVk52v", "question": "What type of course is being taught by the women with the white lab coat?", "choices": ["history", "math", "economics", "science"], "correct_choice_idx": 3, "direct_answers": ["science", "science", "science", "computer science", "chemistry", "science", "system work", "computer course", "science", "science"], "difficult_direct_answer": false, "rationales": ["White coats indicate a scientist is wearing them.", "Scientists wear lab coats.", "The course is science."], "image": "train2014/COCO_train2014_000000259467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577758, "question_id": "Ko79uAWsLs7YRVLhMziiZi", "question": "Which one of these numbers is the route number?", "choices": ["m69", "m69 cyj", "81a", "69"], "correct_choice_idx": 2, "direct_answers": ["81a", "bra", "81a", "eighty one", "81a", "eighty one", "81a", "81", "81a", "81"], "difficult_direct_answer": false, "rationales": ["The route number is lite up on the top of the bus.", "The bus says 81a on the front.", "The bus has an electronic display on the front with answer a visible. this is where buses display their routes."], "image": "train2014/COCO_train2014_000000577758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279421, "question_id": "Ko8PWFj6wwUE8W3cDKQi3K", "question": "What will this vehicle be traveling on?", "choices": ["roads", "rails", "air", "wooden blocks"], "correct_choice_idx": 1, "direct_answers": ["rails", "rail", "track", "tracks", "tracks", "track", "rail", "track", "track", "train tracks"], "difficult_direct_answer": false, "rationales": ["There are rails visible in the image as well as a train. trains typically travel on rails.", "It is a train", "This is a train station, with a train and platform as cues, and a train gets around by traveling across tracks, or rails."], "image": "train2014/COCO_train2014_000000279421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133876, "question_id": "KoCcmjSxDt8GeL6EEGzJKb", "question": "What is the woman laying in?", "choices": ["hospital bed", "couch", "chair", "mri machine"], "correct_choice_idx": 0, "direct_answers": ["chair", "cot", "hospital chair", "patient", "hospital chair", "hospital bed", "hospital recliner", "bed", "hospital bed", "chair"], "difficult_direct_answer": false, "rationales": ["The woman is in the hospital.", "This is obvious given the medical equipment nearby and her wrist dressing.", "There is breathing apparatus and an identifying arm band on her wrist while she is in a bed. hospitals are the kind of place to have life saving resources immediately available."], "image": "val2014/COCO_val2014_000000133876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542931, "question_id": "KoNKWrkCffTjd37EmXH2d7", "question": "In which US city has this place?", "choices": ["peoria", "elgin", "joliot", "chicago"], "correct_choice_idx": 3, "direct_answers": ["washington", "wisconsin", "milwaukee", "chicago", "milwaukee", "wisconsin", "milwaukee", "detroit", "milwaukee", "milwaukee"], "difficult_direct_answer": false, "rationales": ["Though the picture states it is in milwaukee answer \"a\" is the best choice.", "This setting takes place in chicago.", "This is in chicago"], "image": "train2014/COCO_train2014_000000542931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568895, "question_id": "Kob9pwsh8Bdymiezmuk3qy", "question": "The scene is in what color?", "choices": ["blue", "green", "sepia", "red"], "correct_choice_idx": 2, "direct_answers": ["black white", "sepia", "brown", "sepia", "sepia", "sepia", "sepia", "sepia", "sepia", "brown"], "difficult_direct_answer": false, "rationales": ["The tone of the picture is mostly brown.", "The colors are mostly brown, black and white.", "The scene is in sepia."], "image": "train2014/COCO_train2014_000000568895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414113, "question_id": "Kog4ueAizYef3cSDHkomPx", "question": "What are the people on the street side focused on?", "choices": ["snacks", "arrest", "sirens", "parade"], "correct_choice_idx": 3, "direct_answers": ["parade", "bike riding", "crowd", "police", "parade", "parade", "parade", "bike ridemen", "parade", "parade"], "difficult_direct_answer": false, "rationales": ["The people are lined up on the street to watch something.", "The people on the street side are focused on a parade.", "The people on the side of the street are watching a parade that is passing by."], "image": "train2014/COCO_train2014_000000414113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98523, "question_id": "KoxHhW6ofwiesfJMNQGa9Y", "question": "What is the profession of the people that use the plane?", "choices": ["musicians", "actors", "scientists", "managers"], "correct_choice_idx": 0, "direct_answers": ["pilots", "band", "pilots", "pilot", "metal musicians", "musicians", "musicians", "music band", "music band", "pilots"], "difficult_direct_answer": false, "rationales": ["The side of the plane has the logo of a metal band on it and they were musicians.", "The plane says \"iron maiden\" on it which is a band.", "The plane says iron maiden."], "image": "val2014/COCO_val2014_000000098523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132336, "question_id": "KoxMFnQCUweUymbF53oh8E", "question": "What type of company owns the roofless bus?", "choices": ["travel", "city bus", "tourist", "sightseeing"], "correct_choice_idx": 3, "direct_answers": ["sightseeing", "sightseeing", "big bus", "big bus", "sightseeing", "sightseeing", "sightseeing", "tour", "big bus", "sightseeing"], "difficult_direct_answer": false, "rationales": ["A company that gives tours of the city.", "A sightseeing company owns this roofless bus.", "This is open so people can see as it drives around"], "image": "val2014/COCO_val2014_000000132336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382472, "question_id": "Kp6AGgZZwHA8LouXpb8tvY", "question": "Where are the people on the ski lift being taken?", "choices": ["up slope", "front entrance", "hotel lobby", "to lunch"], "correct_choice_idx": 0, "direct_answers": ["mountain top", "uphill", "slope", "mountaintop", "playing area", "up slope", "mountain", "skiing", "hill top", "mountain top"], "difficult_direct_answer": true, "rationales": ["A ski lift is in the background with it primary purpose to bring skiers up the slope.", "This moves people quickly up a mountain then they ski down", "The ski lift will carry people to the top of the mountain then they can ski down."], "image": "train2014/COCO_train2014_000000382472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125270, "question_id": "KpJM7WUhLV2PdyAknSwfBS", "question": "What is the boy doing?", "choices": ["selling juice", "selling blenders", "food demonstration", "playing magoc"], "correct_choice_idx": 0, "direct_answers": ["handle mixer", "cannot see", "blending", "blending", "making drinks", "selling juice", "making drink", "making shakes", "making smoothies", "making drinks"], "difficult_direct_answer": false, "rationales": ["Given the time of day, or rather night, it's most likely a as smoothies. the other options don't fit well.", "There are mixers on the counter.", "A person is standing behind a counter with a lot of people around. blenders are filled with different colored liquids."], "image": "train2014/COCO_train2014_000000125270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506508, "question_id": "KpQTyuWJMFJhZDTUpJZZwx", "question": "What do the tall thin things carry?", "choices": ["graphite", "wind turbine", "fruits", "power lines"], "correct_choice_idx": 3, "direct_answers": ["grass", "young boy", "rider", "hay", "power lines", "hay", "weight", "power", "power lines", "hay"], "difficult_direct_answer": false, "rationales": ["These keep the wires up off the ground for safety", "Long wood structures in the distance holding cables in the air. there are multiple in ground.", "Electric is sent through them."], "image": "train2014/COCO_train2014_000000506508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6943, "question_id": "KparC3opgcLU94wRdYzfhJ", "question": "What does the shop to the right of the girls sell?", "choices": ["subs", "donuts", "bagels", "pizza"], "correct_choice_idx": 0, "direct_answers": ["subs", "grinders", "food", "subs", "sandwiches", "sandwich", "submarine sandwiches", "subs", "subs", "subs"], "difficult_direct_answer": false, "rationales": ["The shop is for subs.", "A picture of the sandwich is on the board.", "There is a subway sign on their right."], "image": "train2014/COCO_train2014_000000006943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376988, "question_id": "KpijS8xS4gEJb6ZmEL8kR2", "question": "What color light filter is being used?", "choices": ["black", "purple", "green", "none"], "correct_choice_idx": 1, "direct_answers": ["purple", "pink", "purple", "pink", "red", "pink", "purple", "purple", "pink", "violet"], "difficult_direct_answer": false, "rationales": ["Unless you are colorblind you can tell what the color is.", "The color is purple around the laptop.", "That color filter makes everything look purplish."], "image": "train2014/COCO_train2014_000000376988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412248, "question_id": "KppCxgeM8R3ev73FW5H6WS", "question": "Why is the woman in the blue top laying on the sand?", "choices": ["to heal", "to eat", "to exercise", "to sunbathe"], "correct_choice_idx": 3, "direct_answers": ["relaxing", "sunbathing", "tanning", "to sunbathe", "tanning", "sunbathing", "sunbathing", "to sunbathe", "sunbathing", "to bask"], "difficult_direct_answer": false, "rationales": ["The woman wants to have a tan.", "She is trying to get a suntan.", "The woman is laying in the sand so she can catch some rays."], "image": "train2014/COCO_train2014_000000412248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191047, "question_id": "Kpt8pbZcapJ6nXtasZLxye", "question": "Why are they all walking in different directions?", "choices": ["different destinations", "being followed", "are confused", "to confuse"], "correct_choice_idx": 0, "direct_answers": ["different goals", "scattering", "different destinations", "crossing road", "different destinations", "escaping", "splitting up", "different destinations", "escape", "not together"], "difficult_direct_answer": false, "rationales": ["They are walking to different places as a group.", "They don't look like they're part of the same group.", "The men are traveling to different places."], "image": "val2014/COCO_val2014_000000191047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83964, "question_id": "KpuX4UG5GjH4owwZZLtZwz", "question": "Why might her skin be darker than the others?", "choices": ["oil", "tan", "tattoo", "paint"], "correct_choice_idx": 1, "direct_answers": ["got tan", "tan", "tanned", "sunbathing", "tanned", "tan", "tanning", "tanning booth", "tanned", "tan"], "difficult_direct_answer": false, "rationales": ["She's been out in the sun a lot", "The gal spends a lot of time on the beach and has a great suntan.", "The woman spends a lot of time out in the sun."], "image": "train2014/COCO_train2014_000000083964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279763, "question_id": "KpzZyQXKP2HmVZaLvJDFx6", "question": "Which color is the person who controls most of these kites wearing?", "choices": ["white", "red", "none", "blue"], "correct_choice_idx": 2, "direct_answers": ["red", "white", "no person", "black", "red", "red", "red", "yellow", "pink", "none"], "difficult_direct_answer": false, "rationales": ["The kites are being attached to long poles in the parking lot.", "There is nobody in the picture that is controlling the kites.", "The kites are visibly anchored to poles that are not currently manned."], "image": "train2014/COCO_train2014_000000279763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143479, "question_id": "Kq5YzThzCGuFhJnRucCCWe", "question": "What event is the man taking part in?", "choices": ["gala", "ramadan", "barbeque", "eating contest"], "correct_choice_idx": 2, "direct_answers": ["barbecue", "cookout", "barbecue", "barbeque", "bbq", "barbecue", "barbecue", "bbq", "hot dog", "cookout"], "difficult_direct_answer": false, "rationales": ["The man just grilled hot dogs.", "He is holding food and there is a grill behind him.", "The event is a bbq."], "image": "val2014/COCO_val2014_000000143479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340778, "question_id": "Kq849g6TJHYnPGnZjq44zM", "question": "What area these officers likely involved in?", "choices": ["bake sale", "police escort", "race", "prostitution"], "correct_choice_idx": 1, "direct_answers": ["escort", "parade", "boston", "patrol", "police escort", "boston", "government", "police escort", "parade", "downtown"], "difficult_direct_answer": false, "rationales": ["The area is a police escort.", "Given the positioning of the police motorcycles relative to the vehicle, one could argue that the policemen are escorting the vehicle in an official capacity.", "The people on motorcycles are providing an escort."], "image": "val2014/COCO_val2014_000000340778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49979, "question_id": "KqPvPVfbgy6E8EZZaDJYdZ", "question": "What do motorcycle riders have the ability to purchase that offers safety in protecting the eyes?", "choices": ["goggles", "all correct", "shield", "sunglasses"], "correct_choice_idx": 1, "direct_answers": ["sunglasses", "goggles", "goggles", "all correct", "sunglasses", "goggles", "goggles", "face shield", "sunglasses", "glasses"], "difficult_direct_answer": false, "rationales": ["The answers are all right.", "These men can buy any sort of eye protection to protect themselves while riding.", "Motorcycle riders can buy shields, sunglasses and goggles."], "image": "train2014/COCO_train2014_000000049979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45204, "question_id": "Kr3hebvYJrmxbEjwELu6Ey", "question": "In what building is this bathroom?", "choices": ["train station", "spa", "home", "hotel"], "correct_choice_idx": 2, "direct_answers": ["hotel", "apartment", "hotel", "hotel", "spa", "hotel", "home", "hotel", "hotel", "house"], "difficult_direct_answer": false, "rationales": ["The bathroom has the fixtures and decor that you would see in a home.", "The bathroom is really fancy so it seems it's in a hotel.", "The fresh white linens and robe hanging up, along with the travel size items and sign on the bench indicate that this is a hotel."], "image": "train2014/COCO_train2014_000000045204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52066, "question_id": "KrCFMFHRLsTdiyzzggyZSa", "question": "What will the man have to grab to board the train?", "choices": ["jacket", "suitcase", "newspaper", "newspaper"], "correct_choice_idx": 1, "direct_answers": ["suitcase", "bag", "ticket", "his luggage", "luggage", "luggage", "bag", "rail", "ticket", "bag"], "difficult_direct_answer": false, "rationales": ["The bag is on the side of him. you can tell he is a traveller.", "A man is standing on a train platform with a suitcase at his feet. people pack suitcases to go on trips on trains.", "The man has a black suitcase next to him that he will have to grab when he boards the train."], "image": "val2014/COCO_val2014_000000052066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268734, "question_id": "KrEXyuZgSPYCwrMR9ggXwb", "question": "What color is the liquid of the beverage?", "choices": ["green", "blue", "black", "white"], "correct_choice_idx": 2, "direct_answers": ["brown", "yellow", "black", "brown", "brown", "clear", "yellow", "brown", "black", "brown"], "difficult_direct_answer": false, "rationales": ["This drink is usually a light shade of yellow or completely clear.", "Pepsi is generally very dark.", "A lot of soda is this color"], "image": "val2014/COCO_val2014_000000268734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276077, "question_id": "KrJyxeuEDr7tfsnWCcjDU7", "question": "Where may this lady be completing the cleaning?", "choices": ["hotel", "nursing home", "residence", "conference center"], "correct_choice_idx": 2, "direct_answers": ["kitchen", "dishes", "residence", "kitchen", "sink", "kitchen", "kitchen", "kitchen", "dishes", "kitchen"], "difficult_direct_answer": false, "rationales": ["There is a dishwasher. hotels don't have dishwashers.", "This looks like a room that is found in a house in a residential home.", "The woman is standing in a kitchen that is most likely in a person's home."], "image": "train2014/COCO_train2014_000000276077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27136, "question_id": "KrVQo9nQVHauwDTmDMXTin", "question": "What headgear is the man on the left wearing?", "choices": ["helmet", "beanie", "beret", "shako"], "correct_choice_idx": 2, "direct_answers": ["beret", "beret", "beret", "beret", "beret", "beret", "beret", "beret", "beret", "beret"], "difficult_direct_answer": false, "rationales": ["He has a beret on his head.", "The person wears a french style hat.", "The poofy rounded hat this man wears is called a beret."], "image": "train2014/COCO_train2014_000000027136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445684, "question_id": "KraBNo5FfcWfiLGp5a8DAT", "question": "What is the person on the left doing?", "choices": ["studying", "eating", "walking", "writing"], "correct_choice_idx": 2, "direct_answers": ["walking", "skating", "walking", "walking", "walking", "walking", "skating", "smart", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["One person is walking and the other is riding a skateboard.", "He's got his feet on the ground", "The person on the left walks on foot."], "image": "train2014/COCO_train2014_000000445684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203669, "question_id": "Krac8AYjtxSDCa7XSUmz97", "question": "The color of the shirt matches the color of what?", "choices": ["sky", "peaches", "elephants", "money"], "correct_choice_idx": 3, "direct_answers": ["money", "trees", "trees", "trees", "trees", "trees", "plants", "plants", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["Paper dollars are green.", "The color green is associated with money.", "The man is wearing a light green shirt that is similar to the color of paper money."], "image": "val2014/COCO_val2014_000000203669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475580, "question_id": "KrpBajwFoN4fdM2AimdWmU", "question": "Other than the bus what method of public transportation is close by?", "choices": ["taxi", "airplane", "metro", "scooter"], "correct_choice_idx": 2, "direct_answers": ["can't see", "subway", "metro", "cars", "underground", "car", "train", "normal", "cars", "cars"], "difficult_direct_answer": false, "rationales": ["The metro is nearby.", "On the pole is the logo for the \"london underground\" which is a public transportation system in london.", "Subway systems are commonly found in city locales such as this and we see signs indicating one is nearby as well."], "image": "train2014/COCO_train2014_000000475580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206975, "question_id": "Ks6bGrVZkywwEP7tFBsM5E", "question": "What mode of transport is in the picture above?", "choices": ["road", "water", "air", "railway"], "correct_choice_idx": 3, "direct_answers": ["trains", "railway", "train", "train", "train", "railroad", "train", "train", "railroad", "train"], "difficult_direct_answer": false, "rationales": ["There is both visibly trains and rails which are associated with the manner of travel in answer a.", "There are trains.", "The mode is the railway."], "image": "train2014/COCO_train2014_000000206975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329336, "question_id": "KsCcUTpYHoXoAKxrN3Wjry", "question": "What type of vehicles are the people riding?", "choices": ["trucks", "wagons", "jeeps", "cars"], "correct_choice_idx": 1, "direct_answers": ["horse cart", "wagons", "cart", "wagons", "traditional vehicle", "wagons", "wagons", "buggies", "chariot", "wagon"], "difficult_direct_answer": false, "rationales": ["The people are riding covered wagons that are pulled by horses.", "The people are on horse drawn wagons.", "You can tell by the enclosed shelter and the design of the vehicle to what it is."], "image": "val2014/COCO_val2014_000000329336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415464, "question_id": "KsCd2gjYQNpw6NamcWYGeu", "question": "Who is the man up to bat?", "choices": ["reggie jackson", "derek jeter", "alexa rodriguez", "mariano rivera"], "correct_choice_idx": 1, "direct_answers": ["mets player", "derek jeter", "derek jeter", "jeter", "derek jeter", "derrick jeter", "baseball player", "derek jeter", "derek jeter", "yankee"], "difficult_direct_answer": false, "rationales": ["The man is wearing a new york uniform and derek jeter plays for new york.", "The face and team are that of yankee legend derek jeter.", "The man is jeter."], "image": "train2014/COCO_train2014_000000415464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256197, "question_id": "KsEx8CA7qStNXzZ2vjgqAQ", "question": "Where is the edible part of the oval food?", "choices": ["only seed", "none edible", "all edible", "inside shell"], "correct_choice_idx": 3, "direct_answers": ["at top", "inside", "inside", "on plate", "egg", "inside", "inside shell", "inside shell", "in shell", "in shell"], "difficult_direct_answer": false, "rationales": ["The inside of the shell is edible in the egg.", "The food is an egg, whose edible portion is located in the section identified in a.", "The oval food is an egg and it is usually shelled before being served."], "image": "val2014/COCO_val2014_000000256197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566568, "question_id": "KsKTQ8jXs23RAZXZBdSUE5", "question": "These types of plants are good in what environment?", "choices": ["snowy", "desert", "temperate", "tropical"], "correct_choice_idx": 1, "direct_answers": ["desert", "dry", "cactus", "dry", "arid", "desert", "desert", "desert", "desert", "desert"], "difficult_direct_answer": false, "rationales": ["This is cactus which needs a lot of sun and little water", "The plants visible are deserts. these are known to commonly excel in desert environments.", "These plants don't need a lot of water and thrive in arid environments."], "image": "train2014/COCO_train2014_000000566568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514902, "question_id": "KsRjcHquJa5V6KwMqs4JXL", "question": "What is the condition of these items?", "choices": ["cooking", "plated", "wrapped", "exposed"], "correct_choice_idx": 2, "direct_answers": ["wrapped", "fresh", "unopened", "good", "new", "good", "rotten", "new", "ripe", "packing apple"], "difficult_direct_answer": false, "rationales": ["Fruit is seen in plastic bags with ties.", "The fruit is put into assorted bags.", "The apples are wrapped up."], "image": "train2014/COCO_train2014_000000514902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236650, "question_id": "KsRrybKMGjMrSPF5iuZWvk", "question": "What did the soccer ball just hit?", "choices": ["girl's head", "leg", "fence", "arm"], "correct_choice_idx": 0, "direct_answers": ["girl's head", "head", "head", "her head", "kid", "girl's head", "girls head", "head", "head", "head"], "difficult_direct_answer": false, "rationales": ["The ball is right above her hair", "She is wincing as it bounces back up", "The soccer ball bounced off the girl in blue's head."], "image": "train2014/COCO_train2014_000000236650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22224, "question_id": "KseEt2QPtKr8jb5zdNpKk2", "question": "The train which carries goods are called?", "choices": ["goods", "cargo", "passenger", "lodge truck"], "correct_choice_idx": 1, "direct_answers": ["freight train", "freight train", "freight train", "cargo", "cargo", "freight train", "goods", "freight train", "cargo", "cargo"], "difficult_direct_answer": false, "rationales": ["The train has cargo.", "A train that carries supplies and such are cargo trains.", "A train without passenger cars is moving along tracks. trains are used to move cargo or passengers."], "image": "train2014/COCO_train2014_000000022224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8781, "question_id": "KsrxcLUyrRSffiU3MoWJoq", "question": "What is the person pictured above doing?", "choices": ["reading", "playing", "riding", "eating"], "correct_choice_idx": 0, "direct_answers": ["reading", "reading", "reading", "sitting", "reading", "reading", "reading", "sitting", "reading", "reading"], "difficult_direct_answer": false, "rationales": ["They have an open book on their lap", "There is a man with his dog sitting on the couch. the man is looking down at a book.", "The man is sitting on the couch and reading a book."], "image": "train2014/COCO_train2014_000000008781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307764, "question_id": "Ksy7cvYTHErXUnJGY5fNv8", "question": "Why is the surfer wearing a wetsuit?", "choices": ["fashion", "prevent sunburn", "visibility", "insulation"], "correct_choice_idx": 3, "direct_answers": ["insulation", "surfing", "insulation", "yes", "warmth", "stay warm", "it's wet", "keep warm", "keep dry", "col water"], "difficult_direct_answer": true, "rationales": ["The ocean can have very cold water, and a wetsuit protects one's body from the elements.", "The surfer is trying to stay warm.", "The surfer is wearing a wetsuit to keep them warm with insulation."], "image": "train2014/COCO_train2014_000000307764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285292, "question_id": "Kt7JHTr2UojHvChYBEsfix", "question": "American multinational footwear manufacturing company is what?", "choices": ["puma", "nike", "adidas", "converse"], "correct_choice_idx": 1, "direct_answers": ["nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["The company is nike.", "Nike has the little checkmark logo that you can see.", "There is only one footwear manufacturing company name that's visible (and it's visible twice), so the shoe company being referred to could only be adidas."], "image": "train2014/COCO_train2014_000000285292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450136, "question_id": "KtXtiRnCXkKDupFbHToYgy", "question": "Which President's life does the resident here know several details about?", "choices": ["gore", "obama", "trump", "clinton"], "correct_choice_idx": 3, "direct_answers": ["clinton", "clinton", "nixon", "bill clinton", "clinton", "bill clinton", "clinton", "bill clinton", "bill clinton", "bill clinton"], "difficult_direct_answer": false, "rationales": ["There are books about clinton.", "The book shelf has a book about president bill clinton on the middle shelf.", "My life is written by clinton."], "image": "train2014/COCO_train2014_000000450136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344974, "question_id": "KtnxT9Hr5SmnCtcSjMNTZM", "question": "What is the area behind the large tree on the left?", "choices": ["school", "bathroom", "baseball field", "playground"], "correct_choice_idx": 3, "direct_answers": ["playground", "playground", "playground", "residential area", "park", "playground", "play ground", "playground", "playground", "playground"], "difficult_direct_answer": false, "rationales": ["A single bench can be seen and a kid playing on a see-saw in the distance.", "There is a playground behind the large tree behind the bench.", "There are items that are used to play hence playinground."], "image": "train2014/COCO_train2014_000000344974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445810, "question_id": "KttNJq5sgDk9GLevGUQxE6", "question": "What industry are these kids trying training for?", "choices": ["legal", "it", "culinary", "medical"], "correct_choice_idx": 1, "direct_answers": ["technology", "computer programming", "it", "accounting", "technology", "technology", "technology", "technology", "computer", "computer"], "difficult_direct_answer": false, "rationales": ["They are on computers.", "That industry is known for using tech.", "You can tell by what the kids are working with, to which field they are interested in."], "image": "train2014/COCO_train2014_000000445810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359136, "question_id": "KtvUvrUTJ5j65wCJZ5vmVF", "question": "What is to the left of the woman's head?", "choices": ["dinner plate", "photo ring", "plunger", "toilet"], "correct_choice_idx": 3, "direct_answers": ["sink", "toilet", "toilet", "sink", "toilet", "toilet", "toilet", "toilet", "basin", "toilet"], "difficult_direct_answer": false, "rationales": ["A woman is laying on the floor near a bathtub and a white, round toilet seat is to her right. toilets have toilet seats.", "It is a low round porcelain bowl. it is in a bathroom.", "A toilet is to the right of the image."], "image": "val2014/COCO_val2014_000000359136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196665, "question_id": "Ku6PhGwTSx8XAoDHegUcYM", "question": "What style of facial hair is the man sporting?", "choices": ["zorro", "goatee", "handlebar", "chevron"], "correct_choice_idx": 1, "direct_answers": ["goatee", "goatee", "goatee", "goatee", "goatee", "goatee", "rolaids", "goalie", "goatee", "goatee"], "difficult_direct_answer": false, "rationales": ["This style is a goatee. it is a beard and moustache together.", "The hair on the moustache and beard is referred to as a goatee.", "He only has hair going around his mouth"], "image": "val2014/COCO_val2014_000000196665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32738, "question_id": "KuSVGGLNWztY8bFbFPBYKe", "question": "What activity is being undertaken by the woman holding the cat?", "choices": ["grumpy contest", "marketing", "dancing", "cat judging"], "correct_choice_idx": 3, "direct_answers": ["judging", "examining", "cat", "check up", "inspecting", "judging", "judging", "cat judging", "cat judging", "vet check"], "difficult_direct_answer": false, "rationales": ["She is looking closing at the cat's confirmation.", "She is inspecting the cat as a judge.", "The woman holding the cat is judging it."], "image": "train2014/COCO_train2014_000000032738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23723, "question_id": "KufYNiWm2xYzBfZGC5ou3R", "question": "How many function keys present in the keyboard?", "choices": ["15", "12", "14", "11"], "correct_choice_idx": 1, "direct_answers": ["twelve", "101", "twenty", "twelve", "many", "12", "twelve", "12", "12", "fourteen"], "difficult_direct_answer": false, "rationales": ["There are 12 function keys on the laptop.", "There are twelve keys.", "There are twelve function keys."], "image": "train2014/COCO_train2014_000000023723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314690, "question_id": "KukshNvEEiJRf337jZUPki", "question": "What small creature is likely living here?", "choices": ["baby", "midget", "mini monster", "monkey"], "correct_choice_idx": 0, "direct_answers": ["child", "cat", "cat", "cat", "dog cat", "child", "spider", "baby", "cat", "toddler"], "difficult_direct_answer": false, "rationales": ["A baby likely lives here since there is a toy.", "There is a gate and toys", "There are kids book on the floor."], "image": "val2014/COCO_val2014_000000314690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23093, "question_id": "KupScZysySvnmfVm23WXEp", "question": "How many different vegetables were used to create the red sauce on the hot dog?", "choices": ["one", "three", "four", "two"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "one", "one", "three", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["Ketchup is seen on this hot dog. ketchup is derived from tomatoes and no other vegetables.", "Tomatoes are the main ingredient for the sauce that we put on our food.", "Tomatoes were used to make the ketchup. that said, it's technically a fruit. corn might have been used as well for syrup and vinegar might have come from a corn source. so, b might also apply."], "image": "train2014/COCO_train2014_000000023093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421618, "question_id": "KuvQPNtg5MR2E4jcYa2dcp", "question": "Where do these grow?", "choices": ["tree", "ground", "bush", "flower"], "correct_choice_idx": 0, "direct_answers": ["tropical", "jungle", "tree", "on trees", "malaysia", "trees", "western", "tropical climate", "farm", "jungle"], "difficult_direct_answer": true, "rationales": ["Bananas are from trees.", "These are bananas that have been grown on trees and cut down to sell.", "There are tree limbs still attached to the bananas in the photo."], "image": "train2014/COCO_train2014_000000421618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188735, "question_id": "KvE9wpRQv8FjZubzVV6gPU", "question": "What's the quickest time they will be able to sit on the bench?", "choices": ["few hours", "few months", "few minutes", "few seconds"], "correct_choice_idx": 0, "direct_answers": ["soon", "few hours", "two seconds", "minute", "4 hours", "1 second", "eight hours", "two hours", "few hours", "1 hour"], "difficult_direct_answer": true, "rationales": ["The people are painting the bench and it will take several hours to dry before anyone can sit on it.", "This bench needs to dry.", "The quickest time the people can sit is in a few hours since the paint needs to dry."], "image": "val2014/COCO_val2014_000000188735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359253, "question_id": "KvGcKMCyWXcCtdNg3fPoR3", "question": "What eatery does someone here frequent?", "choices": ["hardees", "mcdonald's", "tavern greene", "tim horton's"], "correct_choice_idx": 1, "direct_answers": ["mcdonalds", "mcdonalds", "mcdonalds", "mcdonalds", "mcdonalds", "mcdonald's", "mcdonalds", "mcdonald's", "mcdonalds", "mcdonalds"], "difficult_direct_answer": false, "rationales": ["The eatery is mcdonald's.", "The golden arches can be seen in the back of the room.", "There is a sign with a golden arch."], "image": "train2014/COCO_train2014_000000359253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443589, "question_id": "KvZ4B5NNzCpzbbLjpgLjzm", "question": "What is the first name of the person who rode this bike?", "choices": ["mary", "mann", "dick", "manny"], "correct_choice_idx": 2, "direct_answers": ["dick mann", "dick mann", "dick", "dick", "mann", "dick", "mann", "dick mann", "dick mann", "mann"], "difficult_direct_answer": false, "rationales": ["The name is dick.", "The paper says dick mann.", "The name says \"dick\" on the sign."], "image": "train2014/COCO_train2014_000000443589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67042, "question_id": "KvtDGByWMJFGGxo8DBYgLn", "question": "What is he looking at?", "choices": ["his hand", "floor", "his phone", "his pants"], "correct_choice_idx": 2, "direct_answers": ["phone", "cellphone", "cellphone", "phone", "phone", "phone", "his phone", "cellphone", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["You can tell by the design and size of the electronic as to what it is.", "A man is looking down at a device as he stands in place.", "The man is visibly holding a phone and based on his eye line and the level he is holding it, this is what he is looking at."], "image": "val2014/COCO_val2014_000000067042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71295, "question_id": "KvuT348AoGdbbHJGq4vKTD", "question": "What location would this suitcase be scanned at before getting onto an airplane?", "choices": ["airport", "hospital", "bus station", "train station"], "correct_choice_idx": 0, "direct_answers": ["security gate", "terminal", "luggage", "airport", "security", "airport security", "airport", "tsa", "luggage bag", "airport"], "difficult_direct_answer": false, "rationales": ["Security checks it to make sure it's safe", "Airplanes are not found at bus stations, train stations, or hospitals.", "A suitcase is packed. airports scan luggage before it is put on planes."], "image": "train2014/COCO_train2014_000000071295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27486, "question_id": "KvvLwufXfCbwXbaprMjGyw", "question": "What type of field is being played on?", "choices": ["turf", "carpet", "grass", "clay"], "correct_choice_idx": 0, "direct_answers": ["soccer", "astro turf", "soccer", "soccer", "soccer", "turf", "astro turf", "soccer", "astro turf", "soccer"], "difficult_direct_answer": false, "rationales": ["Although the field is green like grass, the length and color make it more likely to be a synthetic material or artificial surface that sports are played on.", "A green artificial product that resembles grass.", "The players are on turf."], "image": "train2014/COCO_train2014_000000027486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278823, "question_id": "Kw6XYvMDDf63gadMUzguPH", "question": "What is the yellow hose connected to?", "choices": ["drainage", "sewers", "outlet", "fire hydrant"], "correct_choice_idx": 3, "direct_answers": ["fire hydrant", "fire hydrant", "fire hydrant", "hydrant", "fire hydrant", "fire hydrant", "hydrant", "fire hydrant", "fire hydrant", "hydrant"], "difficult_direct_answer": false, "rationales": ["Fireman use it to put fires out.", "It is short and red and has valves on the sides for hoses", "On the left side of this image we can see that the yellow hose strewn across the street is screwed on to the side of a roadside hydrant."], "image": "train2014/COCO_train2014_000000278823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457119, "question_id": "KwQYQFyfrBuFyeyzJHo5EL", "question": "What are the pigeons standing on the back of the bench doing?", "choices": ["cooing", "eating", "fighting", "preening"], "correct_choice_idx": 3, "direct_answers": ["nothing", "cleaning", "resting", "cleaning themselves", "sleeping", "preening", "scavenge", "waiting", "pecking food", "preening"], "difficult_direct_answer": true, "rationales": ["They are hanging out and cleaning themselves.", "The pigeons are preening.", "Pigeons preen to show off their feathers."], "image": "val2014/COCO_val2014_000000457119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401917, "question_id": "KwVpXMJ3EM7iYyqRkum5vT", "question": "What type of job does the man in the black suit most likely have?", "choices": ["teacher", "model", "news reporter", "taxi driver"], "correct_choice_idx": 2, "direct_answers": ["interviewer", "news reporter", "reporter", "anchor", "reporter", "reporter", "television personality", "reporter", "news reporter", "news reporter"], "difficult_direct_answer": false, "rationales": ["The man in the black suit is holding a microphone and talking into a camera because he is doing a news report.", "This man in the suit also has a microphone in one hand and papers in the other. he also appears to be in front of a camera stand. this all points to him being a news reporter.", "The man is likely a reporter since he has a microphone."], "image": "train2014/COCO_train2014_000000401917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401123, "question_id": "KweQZXhbZjy6Vdf8fbjesY", "question": "What will the Giraffe have to eat?", "choices": ["hay", "arugula", "watermelon", "meat"], "correct_choice_idx": 0, "direct_answers": ["grass", "hay", "grass", "hay", "grass", "hay", "hay", "hay", "hay", "hay"], "difficult_direct_answer": false, "rationales": ["The man is pushing a wheelbarrow full of hay which is probably for the giraffe.", "The man is carting green plants.", "He will have hay to eat."], "image": "val2014/COCO_val2014_000000401123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378092, "question_id": "KwoZjuL9RedqitZnfFiEpw", "question": "What company manufactures this game?", "choices": ["microsoft", "nintendo", "sony", "sega"], "correct_choice_idx": 1, "direct_answers": ["nintendo", "nintendo", "nintendo", "nintendo", "nintendo", "nintendo", "nintendo", "nintendo", "nintendo", "nintendo"], "difficult_direct_answer": false, "rationales": ["The company is nintendo.", "The word \"wii\" is on the cabinet under the tv and nintendo makes the wii.", "The sign says wii."], "image": "train2014/COCO_train2014_000000378092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495149, "question_id": "Kx3Nrg93D84C5awhiDTsiq", "question": "Where are these young people seated?", "choices": ["library", "church", "airport", "school"], "correct_choice_idx": 3, "direct_answers": ["classroom", "school cafeteria", "school", "classroom", "school", "students", "school room", "school", "library", "table"], "difficult_direct_answer": false, "rationales": ["Young people are sitting in a room with a bunch of tables and chairs.", "The people are at a school due to the laptops.", "The young people are in school."], "image": "train2014/COCO_train2014_000000495149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250827, "question_id": "KxCwyCjNUsMMr6PEkTxRHG", "question": "How would this view be described?", "choices": ["dilapidated", "fancy", "cheap", "stifling"], "correct_choice_idx": 1, "direct_answers": ["water view", "window", "skyscraper view", "fancy", "beautiful", "wide", "high", "aerial", "scenic", "resort"], "difficult_direct_answer": true, "rationales": ["This is a high class hotel with a window view.", "It has a desirable view of attractions.", "The view is a terrific view from a fancy hotel."], "image": "train2014/COCO_train2014_000000250827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48753, "question_id": "KxDbUvSMu7kSyy54555VaL", "question": "At what type of shop an you obtain the above foods?", "choices": ["liquor", "butchery", "grocery", "none"], "correct_choice_idx": 2, "direct_answers": ["market", "fruit stand", "market", "grocery store", "groceries", "produce market", "supermarket", "fruit shop", "grocery", "market"], "difficult_direct_answer": false, "rationales": ["Produce sections like these are found in the type of stores that sell food.", "Grocery stores often have a produce section.", "The shop is a grocery store where people can buy produce and other foods."], "image": "train2014/COCO_train2014_000000048753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428503, "question_id": "KxSvDCmViGLUnFfeU84VUX", "question": "The name of the game shows in the image is?", "choices": ["parachuting", "kiting", "surfing", "paragliding"], "correct_choice_idx": 3, "direct_answers": ["paragliding", "windsurfing", "kite racing", "kite flying", "parasailing", "kite flying", "kiting", "kite flying", "kite flying", "wind surfing"], "difficult_direct_answer": false, "rationales": ["The name is paragliding.", "The beach has multiple long kite looking things in the air.", "Many parachutes can be seen in the air above a beach. paragliding is a common activity near the ocean."], "image": "train2014/COCO_train2014_000000428503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394151, "question_id": "KxVwyfDoqYmRRnrTuFd2md", "question": "Why did the woman open her car trunk?", "choices": ["get spare", "pack luggage", "find jack", "pack groceries"], "correct_choice_idx": 1, "direct_answers": ["for luggage", "packing", "for luggage", "loading", "pack luggage", "add luggage", "move luggage", "luggage", "load luggage", "to park"], "difficult_direct_answer": true, "rationales": ["The woman wants to load her suitcases.", "She has suitcases next to the car", "The woman is packing her car with her suitcases."], "image": "train2014/COCO_train2014_000000394151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524551, "question_id": "KxWbA8FMCMixKX436WtFQB", "question": "From where did the design for the shelter here come from originally?", "choices": ["eskimos", "native americans", "muscovites", "new york"], "correct_choice_idx": 1, "direct_answers": ["native americans", "native americans", "indigenous people", "native americans", "native americans", "native americans", "native americans", "aboriginal housing", "indians", "indians"], "difficult_direct_answer": false, "rationales": ["Teepees are shaped this way.", "The structure is a teepee shape.", "American indians made tpees."], "image": "train2014/COCO_train2014_000000524551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96294, "question_id": "Kxxd5N3HQYc93TGa7F9VJz", "question": "What typically pours out of the brown dispenser mill near the stove?", "choices": ["hot sauce", "blue cheese", "pepper", "lemon juice"], "correct_choice_idx": 2, "direct_answers": ["pepper", "pepper", "soup", "pepper", "pepper", "pepper", "pepper", "oil", "pepper", "pepper"], "difficult_direct_answer": false, "rationales": ["This grinds the peppercorns", "Pepper will come out of the dispenser.", "Typically pepper pours out of the pepper shaker."], "image": "train2014/COCO_train2014_000000096294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335749, "question_id": "KyKRJ2j7nbfYrstCqBUewp", "question": "What is contained in the red fruit that should not be ingested?", "choices": ["skin", "sticks", "juice", "seed"], "correct_choice_idx": 3, "direct_answers": ["pit", "pit", "seeds", "pit", "pit", "seed", "seed", "stem", "pit", "pit"], "difficult_direct_answer": false, "rationales": ["The fruit is a cherry and contains a pit in the center.", "Cherries have pits.", "The red fruit is a cherry that has a pit in the center."], "image": "train2014/COCO_train2014_000000335749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378751, "question_id": "KyNY8AU2oZ5ii8968npQ8W", "question": "What brand are the shoes on the ground?", "choices": ["vans", "adidas", "reebok", "nike"], "correct_choice_idx": 3, "direct_answers": ["nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["The swoosh logo on the shoes belong to the company listed in option a.", "The shoes on the ground have the nike logo on the side.", "The shoes have a swoosh."], "image": "val2014/COCO_val2014_000000378751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184613, "question_id": "KycEGgtbqAd8XwWq8GtM6S", "question": "Why does he have the umbrella?", "choices": ["rain protection", "showing off", "is weapon", "sun protection"], "correct_choice_idx": 3, "direct_answers": ["block sun", "raining", "shade", "raining", "might rain", "sun protection", "shade sun", "drizzling outside", "rain protection", "shade"], "difficult_direct_answer": false, "rationales": ["The sky is cloudy. the boy is outside in a field, and he might want to stay out there or need some time to get back home, regardless of if the weather condition changes.", "The sky is clear", "He has sunblock."], "image": "val2014/COCO_val2014_000000184613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546664, "question_id": "Kycjtfj6nYM9vrWy59MUfs", "question": "Why is he standing like that?", "choices": ["falling", "tired", "slipping", "maintain balance"], "correct_choice_idx": 3, "direct_answers": ["balance", "balancing", "maintain balance", "balance", "balance", "balance", "on surfboard", "maintain balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["The man doesn't want to fall.", "We see a surfer riding on turbulent waters with arms outstretched trying to stay on.", "The other options don't match what he's doing."], "image": "val2014/COCO_val2014_000000546664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43190, "question_id": "Kyfq9vePgNg2Sr56VKrhBs", "question": "Why is the refrigerator covered in papers?", "choices": ["decorative", "hide fridge", "reminders", "for sale"], "correct_choice_idx": 2, "direct_answers": ["advertisment", "positive memories", "reminders", "magnets", "decoration", "store information", "reminders", "magnets hold", "for remembrance", "reminders clutter"], "difficult_direct_answer": true, "rationales": ["The fridge has reminders.", "The fridge has several lists.", "The fridge has all kinds of to do lists."], "image": "train2014/COCO_train2014_000000043190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106166, "question_id": "KykCQvASqyrPSPssMAszCg", "question": "Why is the driver on the left side?", "choices": ["is lost", "is confused", "fearing dog", "is uk"], "correct_choice_idx": 3, "direct_answers": ["is uk", "in england", "uk driver", "different country", "another country", "not usa", "in uk", "japan", "europe", "it england"], "difficult_direct_answer": true, "rationales": ["The steering wheel in cars made for the uk are on the right side of the car. from this perspective a person driving on the left would be behind a right side steering wheel.", "The uk demands that you drive on the left side of the road.", "People that drive on the left side are from the uk."], "image": "val2014/COCO_val2014_000000106166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349616, "question_id": "KyoFf8MsceRhRUzTQ5uUER", "question": "What type of food could be found in this environment?", "choices": ["cockles", "frogs", "lettuce", "rabbits"], "correct_choice_idx": 0, "direct_answers": ["shrub", "insects", "cockles", "fish", "grass", "seafood", "shellfish", "beef", "limited", "crabs"], "difficult_direct_answer": true, "rationales": ["Cockles are sea creatures.", "You can find them in the sand.", "Cockles are found by water and this location is on a beach near water."], "image": "val2014/COCO_val2014_000000349616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49424, "question_id": "KyoQqeF5Xawcefymw8e8us", "question": "What is in the bowl?", "choices": ["chips", "markers", "keys", "marbles"], "correct_choice_idx": 1, "direct_answers": ["markers", "cigars", "pens", "pens", "markers", "markers", "markers", "pens", "sharpies", "markers"], "difficult_direct_answer": false, "rationales": ["The bowl has markers.", "Writing utensils with colored caps are in a bowl. markers have colored caps to indicate the color of the marker.", "There are markers inside of the bowl."], "image": "train2014/COCO_train2014_000000049424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571805, "question_id": "KyzdAXJqhS4GhuWQauBSgP", "question": "What white item is the player wearing that is not a regular part of a tennis uniform?", "choices": ["ring", "brace", "bandage", "patch"], "correct_choice_idx": 2, "direct_answers": ["hat", "baseball cap", "wrist bands/sweatbands", "tape", "baseball cap", "wrist band", "bandage", "tennis", "bandage", "wristband"], "difficult_direct_answer": false, "rationales": ["The person is wearing a bandage which is not usually part of their uniform.", "He has tape wrapped around his middle finger.", "Bandages indicate an injury and aren't part of the uniform."], "image": "train2014/COCO_train2014_000000571805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130851, "question_id": "Kz7HFxUVTWhtcLmXgs3ZmY", "question": "What is the red thing on the bed that is on top of the covers?", "choices": ["book", "collar", "comic", "folder"], "correct_choice_idx": 1, "direct_answers": ["dog collar", "collar", "pillow", "dog", "quilt", "dog collar", "dog collar", "dog collar", "dog collar", "dog collar"], "difficult_direct_answer": false, "rationales": ["This is around the dog's neck and holds tags", "It is on the dogs neck to hold tags and a clip for a leash", "The dog's collar is red and is on top of the bed and the covers."], "image": "train2014/COCO_train2014_000000130851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4424, "question_id": "KzV7MBrj7nbvLqdiYp5QWR", "question": "Why is the man holding up the devices?", "choices": ["to drop", "exercising", "to sell", "showing off"], "correct_choice_idx": 3, "direct_answers": ["phone cover", "picture taken", "new ideas", "showing off", "showing camera", "showing off", "showing off", "demonstrate devices", "showing off", "mobile"], "difficult_direct_answer": false, "rationales": ["The man is aiming the devices at the camera as if wanting people to look at them.", "By the way he is showing the phone and case you can surmise what he is trying to do.", "The smile on the man shows he is happy to have a nice smartphone and case."], "image": "train2014/COCO_train2014_000000004424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184659, "question_id": "KzqYcGpjaiTosuWqeckBAQ", "question": "Why is that band around his finger?", "choices": ["dirty", "cut", "operation", "tired"], "correct_choice_idx": 1, "direct_answers": ["band aid", "ring", "picking", "bandaid", "injured", "cut", "bandage", "bandaid", "bandaid", "toy"], "difficult_direct_answer": false, "rationales": ["The boy cut his finger.", "The band appears to be a bandaid and covering what is likely a cut.", "It is a decorative bandage, which is used to protect small wounds and help them heal."], "image": "val2014/COCO_val2014_000000184659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173797, "question_id": "KzuPf7hqRz7hRHfkciq6K2", "question": "Why is the bus connected to a wire above it?", "choices": ["it's electric", "keep place", "aesthetics", "speed"], "correct_choice_idx": 0, "direct_answers": ["track", "electricity", "electric", "power", "electric power", "power it", "testing communication", "it's electric", "power", "towing"], "difficult_direct_answer": true, "rationales": ["The bus runs on electricity. the wires bring electricity to the bus.", "Some vehicles run off electricity and that is what this bus is using to move.", "The bus runs on electric."], "image": "val2014/COCO_val2014_000000173797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174336, "question_id": "L2Kjg8S8pxRFxxjjKwjLXN", "question": "Which food should the girl eat for intake of more protein?", "choices": ["vegetable", "beans", "lemon", "tomato"], "correct_choice_idx": 1, "direct_answers": ["chicken", "beans", "beans", "chicken", "beans", "shrimp", "beans meat", "potatoes fried", "clams", "calamari"], "difficult_direct_answer": false, "rationales": ["Beans are the food with the most protein content of those listed here.", "These vegetables have protein in them for her.", "Beans are high in protein."], "image": "val2014/COCO_val2014_000000174336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280024, "question_id": "L2XWjNC6kYKVjqwbJPJA4x", "question": "The items on the left and right of the front of the biggest vehicle here are called what?", "choices": ["jet engines", "wheels", "missiles", "cannons"], "correct_choice_idx": 0, "direct_answers": ["port", "wings", "engines", "carriers", "trolley", "luggage cars", "flight", "wings", "jet engines", "jet engines"], "difficult_direct_answer": false, "rationales": ["These are what powers the vehicle and they are called jet engines.", "The big round items on both sides of the airplane are jet engines that power the plane.", "The items on the left and right are jet engines."], "image": "train2014/COCO_train2014_000000280024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271870, "question_id": "L2fVFG4GwEEqX8iRCi2MEa", "question": "Where is the food truck festival taking place?", "choices": ["canada", "jamaica", "mexico", "utah"], "correct_choice_idx": 0, "direct_answers": ["canada", "canada", "canada", "america", "canada", "canada", "canada", "america", "canada", "canada"], "difficult_direct_answer": false, "rationales": ["There are maple leaf flags all over the area.", "The truck is in canada.", "There are canadian flags above the trucks."], "image": "train2014/COCO_train2014_000000271870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73139, "question_id": "L2oEX9hbrFPkok8WMyXiVY", "question": "What type of hats are the dolls wearing?", "choices": ["fedora", "swimmers", "chef", "baseball"], "correct_choice_idx": 2, "direct_answers": ["chefs hat", "chef's cap", "chef", "chef hats", "chef", "bakers", "chef", "chef", "chef", "chef"], "difficult_direct_answer": false, "rationales": ["The campbell's soup kits are wearing hats worn by chefs.", "The hats are for chefs.", "They are cooking something."], "image": "val2014/COCO_val2014_000000073139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51328, "question_id": "L2vHqZtjdevJgnJBQEbuTZ", "question": "What cannot be done in front of this object?", "choices": ["singing", "parking", "eating", "walking"], "correct_choice_idx": 1, "direct_answers": ["park", "park", "parking", "move forward", "park", "water", "park", "parking", "parking", "open"], "difficult_direct_answer": false, "rationales": ["It is a fire hydrant. vehicles are never allowed to block access to a fire hydrant by stopping in front of one.", "Fireman need to be able to access hydrants in case of a fire.", "No one can park in front of fire hydrants."], "image": "train2014/COCO_train2014_000000051328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160597, "question_id": "L3JXeZNawdVXrJuE9SYDNY", "question": "Why do safety workers wear this florescent color?", "choices": ["visibility", "style", "tradition", "cheaper"], "correct_choice_idx": 0, "direct_answers": ["visibility", "visual identification", "visibility", "night reflection", "easily visible", "stand out", "make visible", "easy seeing", "visibility", "jacket"], "difficult_direct_answer": false, "rationales": ["Neon yellow is used so people can be seen.", "This color provides safety to the worker by increasing their visibility to anyone nearby. fluorescent colours provide the greatest contrast against most backgrounds.", "The colors makes the workers stick out much more, especially in low light conditions."], "image": "train2014/COCO_train2014_000000160597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154854, "question_id": "L3PkjMv4QpLK54uTvzC3KY", "question": "Why is the steeple lit better?", "choices": ["is sunset", "is miracle", "is reflective", "is closer"], "correct_choice_idx": 0, "direct_answers": ["sundown", "taller", "shows hope", "is sunset", "sun setting", "sunlight", "sunset", "sunlight", "more lights", "taller"], "difficult_direct_answer": false, "rationales": ["The steeple is still being hit by the sun.", "The steeple has the sunlight reflecting off of its side.", "The steeple is at sunset."], "image": "val2014/COCO_val2014_000000154854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233112, "question_id": "L3SeSBjFw7EQuv6dr3wLxU", "question": "What is near the bottle of alcohol?", "choices": ["toilet", "whistle", "man", "bunny"], "correct_choice_idx": 0, "direct_answers": ["book", "book", "book", "toilet", "book", "card", "book", "books", "book", "magic eye"], "difficult_direct_answer": false, "rationales": ["The bottle is by the toilet.", "The whiskey is on a shelf above the commode.", "The bottle is on a shelf above the commode."], "image": "val2014/COCO_val2014_000000233112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515512, "question_id": "L3go4CdsNL33e6TtBB3HJ6", "question": "What is used making the toppings?", "choices": ["plate", "cream", "butter", "chocolate"], "correct_choice_idx": 3, "direct_answers": ["toffee", "frosting", "powder sugar", "chocolate", "chocolate caramel", "chocolate", "whipped cream", "icing", "nuts", "chocolate"], "difficult_direct_answer": false, "rationales": ["There are brown chips on the white cake.", "As seen on the middle slice.", "There is chopped pieces of chocolate and candy on the pie for presentation."], "image": "train2014/COCO_train2014_000000515512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269483, "question_id": "L3tL7V6NVBbKEuey9o9hk3", "question": "Besides his head what part of his body is the skateboarder on the edge of the ramp protecting?", "choices": ["shoulders", "wrists", "knees", "elbows"], "correct_choice_idx": 2, "direct_answers": ["knees", "knees", "knees", "knees", "knees", "knees", "knees", "knees", "knees", "knees"], "difficult_direct_answer": false, "rationales": ["The skateboarder is wearing pads on the body parts he wants to protect and none are on his shoulders, wrists, or elbows.", "The person is wearing kneepads.", "He's protecting his knees."], "image": "train2014/COCO_train2014_000000269483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259056, "question_id": "L42CADvzd4NxFEm5p3CbCS", "question": "What is he taking pictures of?", "choices": ["snowboard", "snow", "clouds", "sky"], "correct_choice_idx": 0, "direct_answers": ["snowboarder", "skier", "memories", "snowboarder", "snowboarder", "snowboard", "snowboarder", "snowboarder", "snowboarder", "snowboarder"], "difficult_direct_answer": false, "rationales": ["The photographer is braving a cold winter day in order to capture great shots of a snowboarder in action. as the snowboarder virtually flies above him, the odds of getting a great shot are excellent!.", "The man is taking pictures of the snowboarder.", "The person being photographed is using a board for traveling on the snow, thus corresponding to the item listed in a."], "image": "val2014/COCO_val2014_000000259056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536120, "question_id": "L45ZgLPrRfS5yiZ82yDoZ9", "question": "What sport could the red object be used for?", "choices": ["soccer", "baseball", "frisbee golf", "football"], "correct_choice_idx": 3, "direct_answers": ["frisbee", "frisbee", "frisbee", "frisbee", "football", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The red object is a frisbee, which is an item that could easily be used in--and in fact is necessary for--a game of frisbee golf.", "The sport is football.", "The disc could be also be used in football."], "image": "val2014/COCO_val2014_000000536120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373426, "question_id": "L47L9CrW9MiMnJXHuH8WkS", "question": "What does the catcher wear on his legs?", "choices": ["long pants", "flexible shield", "leg guards", "knee pads"], "correct_choice_idx": 2, "direct_answers": ["padded gear", "pads", "leg guards", "pads", "pads", "ball", "leg guards", "shin guards", "pads", "pads"], "difficult_direct_answer": false, "rationales": ["They used the guards on the legs.", "The man has protective items on his knees.", "The catcher at the plate has pads on his legs to guard him from injury."], "image": "train2014/COCO_train2014_000000373426.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391807, "question_id": "L4LuvjFGfj2bjB2sVc3FT9", "question": "What type of sign is shown?", "choices": ["yield", "stop", "train", "animal crossing"], "correct_choice_idx": 3, "direct_answers": ["animal crossing", "cow crossing", "street signs", "street", "cows", "cow crossing", "livestock crossing", "street cow", "road", "livestock present"], "difficult_direct_answer": true, "rationales": ["There is a picture of a cow on the sign.", "There is a picture of a cow on the sign.", "This is a crossing sign for cows."], "image": "train2014/COCO_train2014_000000391807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373309, "question_id": "L4MQsazjhxeGNis5c6JPqD", "question": "How many cows are looking at the camera?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Only two are the rest are eating.", "The cow on the far right is looking at the camera. the one next to it also is looking at the camera.", "Only two cows are looking up."], "image": "train2014/COCO_train2014_000000373309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368148, "question_id": "L4VprQJb9QDjJwm8kqn44H", "question": "What is actually the tallest object in the picture?", "choices": ["boy", "buildings", "bus", "clock tower"], "correct_choice_idx": 1, "direct_answers": ["building behind", "building", "corner building", "building", "building", "building", "building", "building", "clock", "buildings"], "difficult_direct_answer": false, "rationales": ["The buildings in the back are the tallest.", "The buildings are tallest.", "The buildings are the tallest."], "image": "train2014/COCO_train2014_000000368148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131697, "question_id": "L4WuEB3PHE4YwxBscy9cCy", "question": "Who is probably pointing above the students?", "choices": ["teacher", "janitor", "student", "parent"], "correct_choice_idx": 0, "direct_answers": ["teacher", "teacher", "teacher", "teacher", "teacher", "teacher", "teacher", "teacher", "teacher", "teacher"], "difficult_direct_answer": false, "rationales": ["The students look to be sitting at a desk in a classroom, and the pointed finger seems to belong to an older person. the only older person typically in a classroom would be the teacher.", "The kids are in a class. a teacher is in the room with them.", "The teachers are helping out the students."], "image": "train2014/COCO_train2014_000000131697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28480, "question_id": "L4dV3ukzK4p8eY2xiXmuKm", "question": "What animal is the same color as the bike?", "choices": ["panther", "leopard", "swan", "giraffe"], "correct_choice_idx": 2, "direct_answers": ["bear", "polar bears", "polar bear", "swan", "polar bear", "polar bear", "bear", "zebra", "polar bell", "polar near"], "difficult_direct_answer": false, "rationales": ["A swan is the same color as the white on the bikes.", "The bike is a solid white color, like the bird.", "This animal is white also."], "image": "train2014/COCO_train2014_000000028480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21883, "question_id": "L57vaaQQewUzfBVpTtLD9D", "question": "In which state does this man's favorite team headquartered?", "choices": ["louisiana", "arkansas", "california", "north dakota"], "correct_choice_idx": 0, "direct_answers": ["louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana"], "difficult_direct_answer": false, "rationales": ["A man with a beard has a purple lsu hat on.", "The man is wearing an lsu hat. lsu is located in the state of louisiana.", "Those are the letters for the state university"], "image": "train2014/COCO_train2014_000000021883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203865, "question_id": "L5H7zGzd7zraK7JeafrwUp", "question": "Where has the bus pulled up to?", "choices": ["sidewalk", "grass", "sand", "dirt"], "correct_choice_idx": 0, "direct_answers": ["stop", "side walk", "bus stop", "bus stop", "employee parking", "crosstown", "sidewalk", "bus stop", "bus stop", "stop"], "difficult_direct_answer": false, "rationales": ["As indicated by the concrete and the yellow line.", "For the safety and convenience of passengers, busses pull to the side of the roads where there are slabs of concrete for people to walk upon.", "There are raised sections of concrete next to a road"], "image": "val2014/COCO_val2014_000000203865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55809, "question_id": "L5myKL6V6FuWJVhJ4nmysB", "question": "The dog would be able to keep up with the skateboarder at about what speed?", "choices": ["25 mph", "105 mph", "45 mph", "75 mph"], "correct_choice_idx": 0, "direct_answers": ["unknown", "ten mph", "25 mph", "unknown", "15mph", "twenty km", "20 mph", "skateboarder speed", "15 mph", "20"], "difficult_direct_answer": true, "rationales": ["That is how fast they are going.", "The dog can't move as fast as a car.", "This is about the average top speed for dogs"], "image": "train2014/COCO_train2014_000000055809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104800, "question_id": "L5wPe5SnCSrVNHjiPivG6w", "question": "Which wrong accessory has the woman worn for riding bike?", "choices": ["shoes", "hat", "sunglasses", "name tag"], "correct_choice_idx": 0, "direct_answers": ["sandals", "sandals", "sandals", "sandals", "sandals", "sandals", "sandals", "sandals", "shoes", "sandals"], "difficult_direct_answer": false, "rationales": ["Sandals aren't appropriate for pedaling with.", "She needs to be wearing better shoes to protect her feet.", "The accessory is the shoes."], "image": "train2014/COCO_train2014_000000104800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152004, "question_id": "L6Gfjxrv8jSdqGJqhErTWS", "question": "What is the cause of distortion seen here?", "choices": ["camera lens", "water", "glass shape", "glare"], "correct_choice_idx": 2, "direct_answers": ["glass vase", "glass shape", "glass", "glass", "visual", "glass shape", "drinking glass", "glass", "vase", "light"], "difficult_direct_answer": false, "rationales": ["It looks weird because of the way the vase is shaped.", "The curve in the clear material causes the shapes to curve as well.", "The glass is curved, which changes the angle at which the light is reflected."], "image": "val2014/COCO_val2014_000000152004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476552, "question_id": "L6Haamx5DvCy77JuySLyoE", "question": "What is the woman preparing?", "choices": ["bananas", "fritos", "chicken", "papayas"], "correct_choice_idx": 0, "direct_answers": ["food", "lunch", "fried plantains", "food", "fried bananas", "bananas", "lunch", "bananas", "food", "food"], "difficult_direct_answer": false, "rationales": ["The woman is cooking bananas.", "The woman is preparing bananas.", "She is frying a batch of legs."], "image": "val2014/COCO_val2014_000000476552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410056, "question_id": "L6Md5TkUsnbmCgieks4nir", "question": "What activity is the woman engaging in?", "choices": ["surfing", "canoeing", "kayaking", "paddling"], "correct_choice_idx": 3, "direct_answers": ["paddle boarding", "paddling", "paddle boarding", "paddleboarding", "paddle boarding", "paddle boarding", "paddling", "paddle board", "water paddling", "rowing"], "difficult_direct_answer": false, "rationales": ["The woman has a paddle and a board.", "A woman has board with a flat part on the end. she is pushing it thru the water to move her board.", "She is holding a paddle and standing on the paddle board."], "image": "val2014/COCO_val2014_000000410056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112581, "question_id": "L6TyQxGL5tXSyHuRiLYB8v", "question": "What sort of thing does this man hold?", "choices": ["rain protection", "condiments", "tickets", "tribe totem"], "correct_choice_idx": 1, "direct_answers": ["condoment holder", "peppershaker", "condiment stand", "condiments", "condiments", "condiment holder", "condiments", "condiments", "condiments", "collectibles"], "difficult_direct_answer": false, "rationales": ["There are ketchup and mustard bottles on it", "The items are condiments.", "This item has ketchup and mustard."], "image": "val2014/COCO_val2014_000000112581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253031, "question_id": "L6XrM5SaUyhXW8BdPtUpRV", "question": "What is the URL here for?", "choices": ["online shopping", "sports team", "video games", "social media"], "correct_choice_idx": 1, "direct_answers": ["marketing", "dodgers", "dodgers", "dodgers", "dodgers team", "dodgers", "sports team", "dodgers team", "baseball team", "dodgers"], "difficult_direct_answer": false, "rationales": ["The url is for the dodgers team.", "It's likely the home team's website.", "It is the website for the team playing the game."], "image": "train2014/COCO_train2014_000000253031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432637, "question_id": "L6pkTYU2oaDoFU3ntkYv5S", "question": "Are these horses real?", "choices": ["yes", "maybe", "unsure", "no"], "correct_choice_idx": 3, "direct_answers": ["no", "no", "no way", "no", "no", "no", "not real", "no", "no", "no"], "difficult_direct_answer": false, "rationales": ["The horses are attached to poles. they are part of a carousel ride.", "People are sitting on the horses of a carousal.", "This is a carousel"], "image": "train2014/COCO_train2014_000000432637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70870, "question_id": "L74XHivpzZFg692AWDqMvw", "question": "What is the person waiting to do?", "choices": ["board", "speak", "check out", "eat"], "correct_choice_idx": 0, "direct_answers": ["enter car", "get on", "board subway", "board train", "enter train", "ride train", "board train", "board train", "board train", "board"], "difficult_direct_answer": false, "rationales": ["The person is waiting to get on the subway.", "The train is at the station and the person is waiting at the platform.", "The person wants to board."], "image": "val2014/COCO_val2014_000000070870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512308, "question_id": "L7CqzkErChvEfd5pTHPN6n", "question": "What is this toy called?", "choices": ["lego house", "dollhouse", "wobbler house", "barbie hotel"], "correct_choice_idx": 1, "direct_answers": ["doll house", "doll", "dollhouse", "dollhouse", "dollhouse", "milk", "dollhouse", "dollhouse", "dollhouse", "doll"], "difficult_direct_answer": false, "rationales": ["There are dolls in the structure.", "This is a small house with small items in it.", "This is a doll house as can be seen with the tiny furniture and the presence of the doll."], "image": "train2014/COCO_train2014_000000512308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229383, "question_id": "L7WKZ6yCEKAPYUeVMRCNrU", "question": "If one was standing what would assist in maintaining their balance?", "choices": ["chair", "phone", "window", "pole"], "correct_choice_idx": 3, "direct_answers": ["pole", "pole", "metal pole", "pole", "pole", "pole", "pole", "pole", "metal pole", "pole"], "difficult_direct_answer": false, "rationales": ["The pole allows people to hold on for balance.", "He would use the pole.", "The metal pole is attached to the floor and ceiling for stability."], "image": "val2014/COCO_val2014_000000229383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424907, "question_id": "L7qJmuoNxcsK8iNrxWxhsj", "question": "Why are they standing on the sidewalk?", "choices": ["lost", "taking bus", "taking cab", "socializing"], "correct_choice_idx": 1, "direct_answers": ["bus stop", "bus stop", "bus stop", "bus stop", "bus stop", "waiting", "waiting", "waiting", "taking bus", "bus stop"], "difficult_direct_answer": false, "rationales": ["The people are waiting on the sidewalk so they are likely about to board the bus.", "The vehicle is parked next to the curb, with the stop there as well.", "These people are waiting to board the bus at this bus stop."], "image": "train2014/COCO_train2014_000000424907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131465, "question_id": "L7qMExXnbB7WbR4CGcocvr", "question": "What is the brand advertising along the sides of the court?", "choices": ["condiment", "sun glasses", "cars", "clothes"], "correct_choice_idx": 0, "direct_answers": ["vegeta", "vegeta", "vegeta", "vegeta", "vegeta", "vegeta", "condiment", "vegeta", "vegeta", "vegeta"], "difficult_direct_answer": false, "rationales": ["It is a seasoning put on foods", "According to wiki, vegeta is a season mix so \"a\" is the most appropriate answer.", "The word says vegeta."], "image": "train2014/COCO_train2014_000000131465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100010, "question_id": "L85TuwrMMkcBgpvrNwKtLz", "question": "Where does the white light come from?", "choices": ["sun", "lamp", "star", "moon"], "correct_choice_idx": 1, "direct_answers": ["moon", "snow", "light pole", "sun", "sun", "sun", "sun", "sun", "lamp", "airplane"], "difficult_direct_answer": false, "rationales": ["There is a round celestial body in the sky.", "This is a common thing that happens with dusk and night photos. the light diffuses in a blurry way. there appears to be a pole as well, but it's hard to tell. if not for that pole, this could also be b.", "There is a cloudy sky. clouds can mute light."], "image": "val2014/COCO_val2014_000000100010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215553, "question_id": "L8ALCMSgPZNja9DSZ997fe", "question": "If she were barefoot what would she most likely be feeling right now?", "choices": ["grass", "sand", "pavement", "water"], "correct_choice_idx": 0, "direct_answers": ["get prickled", "happy", "grass", "grass", "grass", "grass", "wet", "wet grass", "wet", "grass"], "difficult_direct_answer": false, "rationales": ["She would be standing in grass.", "As indicated by the ground in the background. if she were closer to water, then it would be c.", "The area immediately behind the woman is green."], "image": "train2014/COCO_train2014_000000215553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267499, "question_id": "L8CDhmkfnKaMJvZMiGM52h", "question": "What did the boy most likely just do to the ball with his racket?", "choices": ["returned it", "served it", "missed it", "launched it"], "correct_choice_idx": 2, "direct_answers": ["missed it", "swing", "serve", "hit it", "hit it", "hit", "missed it", "return", "hit", "hit it"], "difficult_direct_answer": false, "rationales": ["You can tell by the position of the ball that he missed it.", "The boy has attempted to hit something due to his forward motion and location of racket. however below him is a ball which indicates that he probably missed it.", "The boy just missed the ball since it's on the ground."], "image": "train2014/COCO_train2014_000000267499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169048, "question_id": "L8EeMYiLVMDroeMisBeZ7E", "question": "How many functional keys in laptop keyboard?", "choices": ["15", "14", "13", "11"], "correct_choice_idx": 3, "direct_answers": ["11", "two", "unknown", "hundred four", "105", "twelve", "12 keys", "12", "twelve", "twelve"], "difficult_direct_answer": false, "rationales": ["There are 11 keys that actually work.", "There are 12 functional keys on a keyboard.", "There are 12 function keys"], "image": "val2014/COCO_val2014_000000169048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561953, "question_id": "L8N7qhdUkrL7x8RxeVg4VK", "question": "Which round items have the most strong flavor?", "choices": ["onions", "spinach", "olives", "mushrooms"], "correct_choice_idx": 0, "direct_answers": ["onions", "scallions", "onions", "banana", "green onions", "onions", "onions", "onion", "green options", "onions"], "difficult_direct_answer": false, "rationales": ["There is not too many round items in the dish except those that resemble onions. the onions also match the criteria of having a strong flavor.", "Onions have a very distinct flavor and smell and are overpowering sometimes.", "The onions on the pizza are very strong in flaveor."], "image": "train2014/COCO_train2014_000000561953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571702, "question_id": "L8PKj5RnWXWfcQzRZ7uaY3", "question": "What is the baby playing with?", "choices": ["cellphone", "remote control", "calculator", "toy"], "correct_choice_idx": 1, "direct_answers": ["remote", "remote", "remote", "remote", "tv remote", "remote control", "remote control", "remote", "remote", "tv remote"], "difficult_direct_answer": false, "rationales": ["The baby is playing with a remote control to a tv or game.", "This looks like the back of many tv remotes.", "The baby is holding a black remote control for a tv and playing with it."], "image": "train2014/COCO_train2014_000000571702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525340, "question_id": "L8hGyQqfpAMWqRu7RhMQZ9", "question": "Why are the roofs at street level?", "choices": ["street missing", "houses sank", "broken camera", "street below"], "correct_choice_idx": 3, "direct_answers": ["street below", "embankment", "lower elevation", "club", "hilly terrain", "bridge", "hill", "single floor", "unique style", "no reason"], "difficult_direct_answer": true, "rationales": ["The road is actually elevated and the whole houses are below.", "The street is built above the houses.", "We are in the point of view of a bridge."], "image": "train2014/COCO_train2014_000000525340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7125, "question_id": "L8jeSJUJB2S7bKEwDkHkoU", "question": "Which city does this person bike in?", "choices": ["washington dc", "melbourne", "baton rouge", "austin"], "correct_choice_idx": 0, "direct_answers": ["usa", "washington dc", "urban", "washington dc", "washington dc", "washington dc", "chicago", "texas", "washington dc", "washington"], "difficult_direct_answer": false, "rationales": ["The washington monument is visible in the background.", "The person on the bike is in washington dc and the washington monument is in the distance.", "The washington monument is across the river and it is only located in one city"], "image": "val2014/COCO_val2014_000000007125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209232, "question_id": "L8uk9ZAFxDbgepJGLT5jna", "question": "Why are these people waiting for?", "choices": ["lunch", "leader", "sunshine", "green light"], "correct_choice_idx": 3, "direct_answers": ["green light", "light change", "red light", "traffic light", "light", "clear passage", "stoplight change", "crossing", "cross street", "light"], "difficult_direct_answer": true, "rationales": ["They are waiting for the light to tell them it's ok to cross the street.", "The light is red and is causing all these people to wait to go.", "The people are waiting at a crosswalk."], "image": "train2014/COCO_train2014_000000209232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393384, "question_id": "L9JR6DuNuJoK247vybnzzN", "question": "What venue is it likely to be?", "choices": ["football field", "airfield", "zoo", "park"], "correct_choice_idx": 1, "direct_answers": ["airport", "parking lot", "boardwalk", "playing ground", "drive in", "parking lot", "beach", "beach", "airfield", "street soccer"], "difficult_direct_answer": false, "rationales": ["It looks like they are on a tarmac.", "Many people are walking around a concrete platform. there is a fence that separates both sides.", "You can tell by the people and what they are doing there as to where they are."], "image": "train2014/COCO_train2014_000000393384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180329, "question_id": "L9MTV7meBUpuUFqMzvMBAP", "question": "What are they doing outside the cab?", "choices": ["cleaning cab", "selling cab", "talking driver", "stealing cab"], "correct_choice_idx": 2, "direct_answers": ["buying", "selling fruit", "talking", "selling fruit", "talking", "vending", "talking", "talking driver", "selling", "talking"], "difficult_direct_answer": false, "rationales": ["The man is talking to the cab driver.", "It looks like they are talking to the driver about something.", "They are facing the drivers door and looking at them"], "image": "val2014/COCO_val2014_000000180329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67080, "question_id": "L9NFvCJ586rPoYL9mYhvym", "question": "What is a common term given to this type of motorcycle?", "choices": ["touring", "moped", "scooter", "cruiser"], "correct_choice_idx": 3, "direct_answers": ["bike", "harley", "cruiser", "harley", "cruiser", "soft tail", "harley", "hog", "harley", "chopper"], "difficult_direct_answer": false, "rationales": ["The motorcycle is a cruiser.", "This bike sits the rider upright or leaning slightly back with the rider's feet in front of them.", "This is a cruiser."], "image": "train2014/COCO_train2014_000000067080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338025, "question_id": "L9SCn84rbkwFE3jMyGJLTw", "question": "What breed of dog is this?", "choices": ["greyhound", "doberman", "pit bull", "chow chow"], "correct_choice_idx": 3, "direct_answers": ["bulldog", "unknown", "terrier", "chow chow", "chow chow", "chow", "shitzu", "shiatsu", "pug", "chow"], "difficult_direct_answer": false, "rationales": ["That's what the beautiful dog is.", "The breed is a chow chow.", "The dog is very small and fluffy."], "image": "train2014/COCO_train2014_000000338025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283093, "question_id": "L9XmNfUozyXswupvxU7rSf", "question": "What does the red and yellow flag allow?", "choices": ["bathing", "wrestling", "parking", "eating"], "correct_choice_idx": 0, "direct_answers": ["dogs", "swimming", "surfing", "lifeguard", "on beach", "ocean swimming", "bathing", "danger", "swimming", "swimming"], "difficult_direct_answer": false, "rationales": ["A flag at the beach with red on top and yellow on the bottom signifies that swimming is allowed and a lifeguard is on duty.", "The flag allows people to be on the beach.", "The flag allows for bathing."], "image": "train2014/COCO_train2014_000000283093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331386, "question_id": "L9orwLFRDT8oT92SFnDScC", "question": "What country are these people in?", "choices": ["united kingdom", "italy", "germany", "france"], "correct_choice_idx": 1, "direct_answers": ["england", "britain", "italy", "uk", "england", "england", "britain", "united kingdom", "england", "england"], "difficult_direct_answer": false, "rationales": ["The country is italy.", "The stores across the road have a flag from the united kingdom.", "There is a britain rocks sign."], "image": "val2014/COCO_val2014_000000331386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97922, "question_id": "L9qb9ijaAuZJAi6GS2TyGa", "question": "Where is this kind of plate normally used?", "choices": ["diner", "picnic", "bar", "wedding"], "correct_choice_idx": 1, "direct_answers": ["table", "parties", "barbecues", "conventions", "vegetables", "picnics barbecues", "cafeteria", "table", "home", "picnic"], "difficult_direct_answer": true, "rationales": ["Because its a disposable plate.", "The plate is paper.", "The plate is made out of paper and is disposable. fancier non-disposable plates are used at diners, bars, and weddings."], "image": "val2014/COCO_val2014_000000097922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112358, "question_id": "LA6qDHA9aCS2Q6Fviw4Cvc", "question": "Where is the person controlling the glider located?", "choices": ["ocean", "under water", "shore", "air"], "correct_choice_idx": 0, "direct_answers": ["ocean", "on water", "on ocean", "water", "water", "in water", "ocean", "surfboard", "in sea", "in ocean"], "difficult_direct_answer": false, "rationales": ["The person is on the surface of the water.", "They are on the water on a board", "They are on a board letting the kite propel them forward"], "image": "train2014/COCO_train2014_000000112358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532827, "question_id": "LAAuR9WbZRrpCtgNYoBAEi", "question": "The string like items seen here are sourced from which animal?", "choices": ["chickens", "fish", "cows", "sheep"], "correct_choice_idx": 3, "direct_answers": ["cow", "sheep", "sheep", "sheep", "sheep", "sheep", "sheep", "sheep", "sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["The string-like items are sourced from the wool of sheep.", "Yarn comes from sheep's wool.", "This would be from sheep"], "image": "val2014/COCO_val2014_000000532827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72769, "question_id": "LABDPEmgahXrLasP9U6aNg", "question": "What relationship might these two children with the birthday cakes likely have?", "choices": ["random strangers", "enemies", "they're twins", "none"], "correct_choice_idx": 2, "direct_answers": ["friends", "party", "twins", "friends", "friends", "they're twins", "friend", "friends", "birthday", "friends"], "difficult_direct_answer": false, "rationales": ["The other options don't really fit except for d. they might merely be children in the same class.", "They might just have birthdays on the same day.", "They are having the same birthday party."], "image": "train2014/COCO_train2014_000000072769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133994, "question_id": "LANsnbzdyk4LC5kZhBcEWj", "question": "This vehicle was made during what era?", "choices": ["baroque", "renaissance", "enlightenment", "romanticism"], "correct_choice_idx": 3, "direct_answers": ["modern", "1950s", "romanticism", "before years", "1827", "industrial era", "industrial", "steam", "19th century", "nineteenth century"], "difficult_direct_answer": true, "rationales": ["The vehicle is from the romantic era.", "The first trains appeared during the romanticism era and were invented to make land travel faster.", "The modern train was developed during the late 18th century. this period of time was also known is history as romanticism."], "image": "train2014/COCO_train2014_000000133994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42819, "question_id": "LAPxSz7TRxfzrCdKr7Gsxp", "question": "What can be seen to the left of the train?", "choices": ["ocean", "roads", "trees", "mountains"], "correct_choice_idx": 2, "direct_answers": ["screen", "trees", "trees", "trees", "passengers boarding", "trees", "picture", "trees", "trees", "forest"], "difficult_direct_answer": false, "rationales": ["The objects to the left of the train have green leaves.", "There is thick green foliage at a height that is above the train, indicating that they are tall trees.", "The plant seems to be a potrait."], "image": "train2014/COCO_train2014_000000042819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282310, "question_id": "LAdQr4k8ys7BmSUBEySkhk", "question": "What base is this?", "choices": ["third", "first", "home plate", "second"], "correct_choice_idx": 1, "direct_answers": ["home", "home", "first", "home plate", "home", "first", "home plate", "home", "home plate", "home"], "difficult_direct_answer": false, "rationales": ["The base is first because the players are two.", "It is home plate since the catcher is right by it.", "That is the first base."], "image": "train2014/COCO_train2014_000000282310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345104, "question_id": "LArWEhJJKx94ykdixzC2Wi", "question": "What are the small yellow objects on the tree?", "choices": ["flowers", "butterfly", "lights", "bugs"], "correct_choice_idx": 2, "direct_answers": ["lights", "lights", "lights", "lights", "lights", "lights", "lights", "lights", "lights", "lights"], "difficult_direct_answer": false, "rationales": ["The trees have glowing lights", "These are tiny bulbs on a wire plugged in", "These objects are giving off a luminous glow you can see in the color lights are generally put around trees during the holiday season as decoration."], "image": "train2014/COCO_train2014_000000345104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496618, "question_id": "LAv8LGaeq4RGo2FoHf5ipA", "question": "What did the chef do to this egg?", "choices": ["pickle", "fry", "scramble", "boil"], "correct_choice_idx": 1, "direct_answers": ["cook", "fry", "turn", "fry it", "cook", "fried", "fry", "fry", "fried", "fried it"], "difficult_direct_answer": false, "rationales": ["This egg was fried.", "Cooking the egg flat in oil gives it this shape and texture.", "The egg is fried."], "image": "val2014/COCO_val2014_000000496618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16672, "question_id": "LAwoPDK59mCBKHox3rKTo8", "question": "Which vehicle shown gets the best mileage?", "choices": ["biker", "silver car", "white car", "red car"], "correct_choice_idx": 0, "direct_answers": ["motorbike", "biker", "motorbike", "moped", "motorcycle", "moped", "bike", "motor scooter", "scooter", "motorcycle"], "difficult_direct_answer": false, "rationales": ["It is smaller, lighter and requires little or no gasoline.", "It has a very small engine that needs only a little gas", "The motorcycle will get the best mileage because it is smaller."], "image": "train2014/COCO_train2014_000000016672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157158, "question_id": "LBBGoiDPfEzzLKEBmKuLFt", "question": "Why is the bread of the sandwich have black on it?", "choices": ["natural color", "slightly burnt", "mold", "dirt"], "correct_choice_idx": 1, "direct_answers": ["grilled", "grilled", "burned", "toasted", "burned", "toasted", "grilled", "grilled", "slightly burnt", "burned"], "difficult_direct_answer": false, "rationales": ["The bread is slightly overburned.", "A sandwich is on plate and has dark spots on it. sandwiches are often grilled in a pan and served warm.", "It is slightly burnt but still good to eat."], "image": "train2014/COCO_train2014_000000157158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74316, "question_id": "LBBUDZpp7X7J7XtSTH69hh", "question": "Which two species often share this space?", "choices": ["humans dogs", "none", "snakes alligators", "hobbit ents"], "correct_choice_idx": 0, "direct_answers": ["dogs humans", "humans dogs", "humans dogs", "canines humans", "dog human", "dogs humans", "human canine", "dogs", "humans plants", "human dog"], "difficult_direct_answer": false, "rationales": ["You can tell by the picture of the dog that he occupies this space with a owner.", "Humans and dogs normally share this space.", "There is a pet in the living room."], "image": "train2014/COCO_train2014_000000074316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484666, "question_id": "LBG7NKi9oXVbgavQHUabB8", "question": "Where could someone buy a used appliance on this street?", "choices": ["off curb", "pawn shop", "7 11", "garbage can"], "correct_choice_idx": 1, "direct_answers": ["pawn shop", "pawnshop", "pawn shop", "pawn shop", "used computer", "pawn shop", "pawnshop", "pawn shop", "pawn shop", "pawn shop"], "difficult_direct_answer": false, "rationales": ["The sign above the clock indicates that there is a particular type of store behind the person. it sells appliances.", "Someone could go to the pawn shop.", "Someone could go to the pawn shop."], "image": "train2014/COCO_train2014_000000484666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535669, "question_id": "LBL7uecXQJawK2gmKRcZUr", "question": "What does the first initial stand for?", "choices": ["aeronautical", "arithmetic", "arts", "agricultural"], "correct_choice_idx": 3, "direct_answers": ["age", "show up", "agricultural", "win success", "something", "group", "name", "texas", "agricultural", "name"], "difficult_direct_answer": false, "rationales": ["The university. texas a&m, was originally know for agriculture and mechanical sciences.", "The men play for texas a & m. i chose the option that the a in the name represents.", "The initial is for agricultural."], "image": "train2014/COCO_train2014_000000535669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433499, "question_id": "LBRVvvfqUq6Q9uJcHETRqY", "question": "What kind of energy moves this train?", "choices": ["electricity", "coal", "manual force", "gas"], "correct_choice_idx": 0, "direct_answers": ["diesel", "gas", "kinetic", "steam", "coal", "electricity", "kinetic", "electricity", "steam", "steam"], "difficult_direct_answer": false, "rationales": ["This is an old train", "The train doesn't use coal, but doesn't use electric.", "It its a model train that is most likely powered by battery."], "image": "val2014/COCO_val2014_000000433499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146091, "question_id": "LBY3WH5kTQ86ho9RxY3hfh", "question": "Who has the right of way?", "choices": ["pedestrians", "car", "truckers", "cyclist"], "correct_choice_idx": 3, "direct_answers": ["cyclist", "biker", "cyclist", "biker", "pedestrian", "biker", "bicyclist", "cyclist", "cyclist", "cyclist"], "difficult_direct_answer": false, "rationales": ["The cyclist had stop signs at the crossing.", "The cyclist is in the crosswalk.", "The biker is almost at the other side of the path."], "image": "train2014/COCO_train2014_000000146091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547148, "question_id": "LBeLzZ8CyLAiXNXzBcAHV4", "question": "What is the large clear area on the front of the vehicle called?", "choices": ["door", "windshield", "bumper", "shutter"], "correct_choice_idx": 1, "direct_answers": ["windshield", "windshield", "windshield", "windshield", "windshield", "windshield", "windshield", "windshield", "bumper", "dashboard"], "difficult_direct_answer": false, "rationales": ["They are located on the top front and for drivers to see when they drive. it also protects one from rain and debri.", "The doors are on the sides, not front. the bumper is on the front but is not clear.", "It is a window in front."], "image": "train2014/COCO_train2014_000000547148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327843, "question_id": "LBiyTW7FgCYgEn5LC6DoUa", "question": "What is the paper in front of the men at the table?", "choices": ["book", "menu", "poster", "magazine"], "correct_choice_idx": 1, "direct_answers": ["international agreement", "menu", "menus", "document", "menu", "government treaty", "contract", "business agreement", "document", "menu"], "difficult_direct_answer": false, "rationales": ["They are in a restaurant where menus are given out.", "The paper is a menu.", "The people order food from the menu."], "image": "train2014/COCO_train2014_000000327843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356394, "question_id": "LBkFbocoDbYmFVpk5UfPbg", "question": "What type of action is allowed by the traffic light?", "choices": ["straight travel", "left turn", "pedestrian crossing", "right turn"], "correct_choice_idx": 1, "direct_answers": ["left turn", "pass", "walk", "turn", "left turn", "take left", "left turn", "left turn", "turn", "left turn"], "difficult_direct_answer": false, "rationales": ["The action is to turn left.", "The traffic light is red but there is a green arrow pointing left which indicates a turn is allowed.", "Although there is a red light, there is also a green arrow which means that traffic may proceed in the direction that the arrow is pointing."], "image": "val2014/COCO_val2014_000000356394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335415, "question_id": "LBkkfHkKjytLWhTCdK5BsD", "question": "What kind of parasols in this picture?", "choices": ["patio", "wicker", "bamboo parasols", "straw parasol"], "correct_choice_idx": 2, "direct_answers": ["straw", "beach umbrellas", "bamboo parasols", "straw", "straw", "straw", "bamboo", "straw", "straw", "ocean"], "difficult_direct_answer": false, "rationales": ["These are made of round hollow tubes that look like the material that bamboo is.", "Bamboo parasols you often see at the beach or a beach resort.", "The top of the umbrellas are made of bamboo."], "image": "train2014/COCO_train2014_000000335415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117667, "question_id": "LBozB6VsRWU66dhzMDi2qF", "question": "The shape of the kite in the image is called?", "choices": ["box", "circular", "delta", "bow"], "correct_choice_idx": 2, "direct_answers": ["triangle", "triangle", "triangle", "diamond", "delta", "bird", "triangle", "triangle", "triangle", "triangle"], "difficult_direct_answer": false, "rationales": ["This shape is called a delta.", "The kite has delta wings.", "The kite on the ground is in the shape of a delta or a triangle."], "image": "train2014/COCO_train2014_000000117667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18370, "question_id": "LBqzM3EkeJU2rEvsJRPmtD", "question": "Why are the cars parked on the grass?", "choices": ["to show", "to fix", "to clean", "to paint"], "correct_choice_idx": 0, "direct_answers": ["sales event", "for sale", "for show", "display", "display", "show", "to show", "car show", "event", "not used"], "difficult_direct_answer": true, "rationales": ["They are lined up with their hoods open and large yellow paper on their windshields.", "The cars are parked at a carshow on the grass.", "There are multiple cars with flyers saying what kind of vehicle they are."], "image": "train2014/COCO_train2014_000000018370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520879, "question_id": "LBtvgUoJQuvhHA3rL8sjUY", "question": "Why is the man seated by a table of fruit?", "choices": ["he's painting", "he's photographing", "he's selling", "he's eating"], "correct_choice_idx": 2, "direct_answers": ["yes", "vendor", "sell fruit", "fruit stand", "selling", "he's selling", "to sell", "selling them", "to sell", "he's tired"], "difficult_direct_answer": true, "rationales": ["The multitude of fruits with price signs in front and a lockbox nearby suggests they are for sale.", "The man is selling fruit at a stand.", "He rests as he waits for customers"], "image": "train2014/COCO_train2014_000000520879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62263, "question_id": "LBu6Upr8SFeMKvmQKAKkmt", "question": "What type of animals are present on the dirt behind the elephants body?", "choices": ["jaguar", "zebra", "tiger", "giraffe"], "correct_choice_idx": 1, "direct_answers": ["zebra", "zebras", "zebras", "zebras", "zebra", "zebra", "zebras", "zebra", "zebras", "zebra"], "difficult_direct_answer": false, "rationales": ["Zebras are behind the elephants.", "The animals behind the elephants are black and white striped.", "Behind the elephants are the black and white striped animals belonging to the equine family."], "image": "train2014/COCO_train2014_000000062263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153677, "question_id": "LC2Pr3WS53hDGP6PDDPEd8", "question": "What style of sweater is she wearing?", "choices": ["turtleneck", "crewneck", "v-neck", "cardigan"], "correct_choice_idx": 3, "direct_answers": ["cardigan", "cardigan", "cardigan", "knit", "knitted", "cardigan", "cardigan", "shawl", "knitted", "crochet"], "difficult_direct_answer": false, "rationales": ["The style is a cardigan.", "Some woman enjoy the looks and comfort of a cotton wrap with a pattern of holes in them. this is also called a cardigan.", "It has an open front."], "image": "train2014/COCO_train2014_000000153677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451364, "question_id": "LCH2qcuhAa5NGD3uNDrk9m", "question": "For which purpose are bikes parked indoors?", "choices": ["sales room", "racing mark", "intimidation", "easy getaway"], "correct_choice_idx": 0, "direct_answers": ["sales room", "keep safe", "for display", "safety", "sales floor", "exhibition dealership", "keep dry", "for display", "security", "display"], "difficult_direct_answer": true, "rationales": ["The bikes are located in this room so they can be sold.", "The purpose is a sales room.", "There are several motorcycles as if being sold."], "image": "train2014/COCO_train2014_000000451364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327507, "question_id": "LCMemr78ogcULiHTreqTYE", "question": "Where is the white substance coming out from on the train?", "choices": ["post", "hose", "ceiling", "smokestack"], "correct_choice_idx": 3, "direct_answers": ["engine", "chimney stack", "smoke stack", "steam", "top", "steam", "up", "smokestack", "steam", "smoke"], "difficult_direct_answer": false, "rationales": ["Steam engines puff out smoke from the smokestack.", "There is smoke coming from the engine's smokestack.", "The smokestack is put on the train as a place where the smoke can easily be released and keep the train moving."], "image": "train2014/COCO_train2014_000000327507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127807, "question_id": "LCSuJBbdYcF4i2sk4JjNhf", "question": "What is the jugglers greatest interference right now?", "choices": ["darkness", "contacts", "glasses", "his nose"], "correct_choice_idx": 0, "direct_answers": ["gravity", "darkness", "camera", "tree", "nighttime", "darkness", "light", "darkness", "flash", "darkness"], "difficult_direct_answer": false, "rationales": ["The juggler is trying to juggle at night so it would harder to see without good light.", "The jugglers are in the dark.", "The juggler can't see well in the dark."], "image": "train2014/COCO_train2014_000000127807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456886, "question_id": "LCfoJqPy7wwZAWXoGZfGgg", "question": "What motorcycle brand can be seen advertised?", "choices": ["toyota", "outlaw", "rough rider", "harley-davidson"], "correct_choice_idx": 3, "direct_answers": ["harley", "harley", "harley", "harley davidson", "harley davidson", "harley", "harley", "harley-davidson", "harley davidson", "harley davidson"], "difficult_direct_answer": false, "rationales": ["The harley davidson logo is seen on the sign above \"we are corin\".", "The sign on the side of the building is widely recognized as the most popular motorcycle brand.", "Harley-davidson is the most well known manufacturer of motorcycles; which this image has many of."], "image": "train2014/COCO_train2014_000000456886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340129, "question_id": "LD6wVSoegFr2pZj8yuSPqy", "question": "The man in the blue shirt is posing next to what Philadelphia Phillies player?", "choices": ["ryan howard", "david wright", "jim those", "chase utley"], "correct_choice_idx": 2, "direct_answers": ["friendly one", "professional player", "jim thome", "debrand", "debrano", "oebrand", "unknown", "wheeler", "jim those", "brock"], "difficult_direct_answer": true, "rationales": ["A man poses with a baseball player wearing a phillies uniform.", "If you are a baseball fan you can tell who the celebrity is.", "Jim those is posing."], "image": "train2014/COCO_train2014_000000340129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573744, "question_id": "LE4EgnKd6RhvDXZE5fGsah", "question": "What is the object behind the truck?", "choices": ["block wall", "motorhome", "trailer", "ufo"], "correct_choice_idx": 2, "direct_answers": ["trailer", "haul truck", "trailer", "dog", "light cargo", "trailer", "trailer", "trailer", "trailer", "trailer"], "difficult_direct_answer": false, "rationales": ["The red color and metal top is normal for this type of equipment/vehicle.", "There is a large red object following the car.", "This is to carry more things than the truck can"], "image": "train2014/COCO_train2014_000000573744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541901, "question_id": "LEbMGtdRjnnVHvcEAJk6Gp", "question": "Which group allegedly invented skateboards?", "choices": ["scientists", "football players", "surfers", "baseball fans"], "correct_choice_idx": 2, "direct_answers": ["californians", "surfers", "surfers", "surfers", "larry stevenson", "kids", "california surfers", "surfers", "surfers", "surfers"], "difficult_direct_answer": false, "rationales": ["People in the 60's and 70's invented skateboarding, and they were former surfers.", "The skateboard looks a bit like a surfboard.", "Since surfboards are used in the ocean, skateboards have the same shape only wheels are added to be ridden, so their inventor must of surfed before thinking to skateboard."], "image": "train2014/COCO_train2014_000000541901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178411, "question_id": "LEiJja5gL628hXa7gmBVbu", "question": "Why is there green tape on the pickup?", "choices": ["repairs", "vandalism", "advertising", "decorative"], "correct_choice_idx": 2, "direct_answers": ["repair damaged", "advertising", "sale", "provide information", "caution tape", "makes signs", "warning message", "advertising", "signage", "advertising"], "difficult_direct_answer": false, "rationales": ["The tape is for advertising.", "The bed of the truck is full of boxes that appear to be from a vendor. there are people going through boxes.", "The green tape is meant for advertising."], "image": "train2014/COCO_train2014_000000178411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203036, "question_id": "LEn9tgNLizC8REji5MoYbz", "question": "What is the man on the rightmost learning?", "choices": ["wearing belt", "wearing shirt", "playing magic", "tying tie"], "correct_choice_idx": 3, "direct_answers": ["wear tie", "tie tying", "trying tie", "tying tie", "tie", "adjust tie", "tie knot", "tying tie", "tie tie", "to wear"], "difficult_direct_answer": true, "rationales": ["As indicated by how some of them are watching each other and looking at and holding ties.", "The man is holding his tie.", "Men with ties are standing around and watching one guy tie one for another."], "image": "train2014/COCO_train2014_000000203036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492057, "question_id": "LEu5qmy49wXCqcKiBJnXt7", "question": "Who took the photo of this man?", "choices": ["this man", "professional photographer", "another woman", "blackmailer"], "correct_choice_idx": 0, "direct_answers": ["himself", "himself", "this man", "himself", "himself", "himself", "he idd", "himself", "he did", "this man"], "difficult_direct_answer": false, "rationales": ["The man is taking a selfie.", "The man took the picture of himself.", "The man took a photo of himself."], "image": "val2014/COCO_val2014_000000492057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198164, "question_id": "LFP8Edh6u549bmbreiyLrt", "question": "How many turn options do cars entering this intersection have?", "choices": ["one", "two", "five", "none"], "correct_choice_idx": 1, "direct_answers": ["three", "two", "two", "two", "2 options", "two", "three", "three", "three", "two"], "difficult_direct_answer": false, "rationales": ["The intersection is three-way type so if the car doesn't keep going straight, it can either turn right or left, which equals a total of two turn options.", "Cars can turn left or right.", "There are two options."], "image": "val2014/COCO_val2014_000000198164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188044, "question_id": "LFi8F4yRzU2kpGhZf8Vw84", "question": "In what geographic location are they waiting for the train?", "choices": ["usa", "western europe", "south africa", "asia"], "correct_choice_idx": 1, "direct_answers": ["train station", "belgium", "kust wet", "underground", "station", "underground", "western europe", "underground", "train station", "brussels"], "difficult_direct_answer": false, "rationales": ["The people are waiting for a train in a station in western europe in a city with a german name.", "These are european people and the writing on the wall isn't english. if it was in the usa it would be in english.", "The sign says \"kunst-wet\" which is in some kind of european language."], "image": "val2014/COCO_val2014_000000188044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536720, "question_id": "LFr6gPjC2CAav23irDiiGi", "question": "Why is the short haired woman wearing a khaki shirt?", "choices": ["fashion", "staying cool", "dress code", "keeping warm"], "correct_choice_idx": 2, "direct_answers": ["uniform", "employee uniform", "comfort", "farm", "zoo keeper", "uniform", "uniform", "zookeeper", "dress code", "uniform shirt"], "difficult_direct_answer": false, "rationales": ["This looks like she works in a zoo of some sort and this is part of the staffs unirform.", "A khaki polo shirt worn with brown pants and boots looks like a uniform when combined with the name tag she is wearing.", "This is a common material for people working with animals"], "image": "val2014/COCO_val2014_000000536720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561731, "question_id": "LFuuWPcfcXNB8MkmcH3cD2", "question": "What is the boy doing with the food on the plate?", "choices": ["throwing it", "cooking it", "eating it", "decorating it"], "correct_choice_idx": 2, "direct_answers": ["eating", "eating", "picking food", "holding", "eating it", "eating", "eating", "eating it", "eating", "eating it"], "difficult_direct_answer": false, "rationales": ["The boy is holding the plate close to his mouth and using a fork to grab some.", "The boy is using a utensil and putting it into the food. when one is holding a plate with food on it and using a utensil their intention is likely to eat it.", "The boy has a fork in the food. a fork is a utensil with tines that are used to stab the food and then bring food up to the mouth for eating."], "image": "train2014/COCO_train2014_000000561731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324852, "question_id": "LFzKd3S7v7GpTZqiAmh2VP", "question": "What type of street is shown?", "choices": ["residential", "public", "private", "dirt"], "correct_choice_idx": 1, "direct_answers": ["public", "city", "asphalt", "commercial vehicle", "urban", "city", "city street", "downtown street", "urban", "urban"], "difficult_direct_answer": false, "rationales": ["The type is public.", "The paved street is open to everyone and does not have houses.", "The street is large."], "image": "train2014/COCO_train2014_000000324852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116244, "question_id": "LG2cygjbXoWGH7d7KnS2sL", "question": "What type of area is shown?", "choices": ["slope", "hillside", "desert", "beach"], "correct_choice_idx": 1, "direct_answers": ["pasture", "hills", "pasture", "pasture", "field", "hills", "pasture", "hillside", "hills", "pasture"], "difficult_direct_answer": false, "rationales": ["The other options don't match entirely. that said, d could also be argued as relevant here given the a is d.", "The area is hilly and green for animals to graze.", "Animals are grazing on a hill with mountains in the background."], "image": "val2014/COCO_val2014_000000116244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272129, "question_id": "LGEUTgNCe6EPEy4JxNbB8N", "question": "What type of vehicle are the creatures depicted rumored to travel in?", "choices": ["submarine", "flying saucer", "snowmobile", "helicopter"], "correct_choice_idx": 1, "direct_answers": ["spaceship", "flying saucer", "flying saucer", "ufo", "spacecrafts", "space ship", "spaceships", "flying saucers", "ufo", "flying saucer"], "difficult_direct_answer": false, "rationales": ["The vehicle has a flying saucer.", "The creatures are green and are not from earth. they use spacecraft to travel around.", "Ufos are depicted on the kites and they are rumored to travel in this type of spaceship."], "image": "val2014/COCO_val2014_000000272129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270198, "question_id": "LGEjKiwJwv9gr6NuXs6LRU", "question": "What are the circular patterns on the ground?", "choices": ["abstract drawing", "paint", "tire tracks", "cracks"], "correct_choice_idx": 2, "direct_answers": ["tire marks", "eights", "donut tricks", "tire marks", "tire rubber", "tire tracks", "truck", "doughnuts", "vehicle donuts", "burnouts"], "difficult_direct_answer": true, "rationales": ["The marks on the ground happen when something with rubber tires drives really fast in a circle and then brakes hard.", "The circular patterns are left by the tires after doing wheelies and donuts.", "The circular patterns are from rubber coming off on asphalt."], "image": "train2014/COCO_train2014_000000270198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403190, "question_id": "LGNHQLaiiRRRbuZ7G2pmx3", "question": "Who are those people riding on the elephants?", "choices": ["migrants", "refugees", "workers", "visitors"], "correct_choice_idx": 3, "direct_answers": ["tourists", "visitors", "tourists", "tourists", "tourists", "tourists", "tourists", "tourists", "tourists", "tourists"], "difficult_direct_answer": false, "rationales": ["They are dressed and have the physical features of people who don't live in areas where these animals live.", "Elephants like these are often used to give rides to tourists and visitors, which is what it's doing now.", "This type of event is typical for asia, and the people on the backs of the elephants do not look like locals, and they are partaking in a tourist activity."], "image": "train2014/COCO_train2014_000000403190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376828, "question_id": "LGffKGaiHzdL6KCLS8Mp6H", "question": "What is sitting on the mini fridge in the center of the room?", "choices": ["potted plant", "small child", "adult male", "hammer"], "correct_choice_idx": 0, "direct_answers": ["jade plant", "potted plant", "plant", "plant", "potted plant", "plant", "bouquet", "planter", "flower", "plant"], "difficult_direct_answer": false, "rationales": ["A small plant in a pot is on top of a small white appliance in a home.", "A terra cotta bowl-shaped item that contains something sits on the mini fridge. items with this description can be alive but are not people.", "The plant is on the fridge."], "image": "train2014/COCO_train2014_000000376828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554658, "question_id": "LGpuACfFYugfoxpXkZfQRo", "question": "What are the letters on the license plate?", "choices": ["zgv", "jyr", "aty", "nlz"], "correct_choice_idx": 3, "direct_answers": ["299nlz", "nlz", "nlz", "nlz", "nlz", "299", "nlz", "nlz", "nlz", "nlz"], "difficult_direct_answer": false, "rationales": ["The letters on the blue license plate are nlz", "The license plate has three letters which are nlz.", "The plate on the fridge has nlz on it."], "image": "train2014/COCO_train2014_000000554658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258270, "question_id": "LH5AqzMaLc5X3c8AHkfSCP", "question": "What is the traditional name for what's in the white cup?", "choices": ["pozole", "humus", "chowder", "menudo"], "correct_choice_idx": 0, "direct_answers": ["curry", "chili", "pozole", "stew", "sauce", "soup", "unknown", "salsa", "chickpea curry", "beans"], "difficult_direct_answer": true, "rationales": ["The look of the picture shows the name itself.", "This is a dish that is regional and goes by the name of pozole.", "It is a type of soup"], "image": "train2014/COCO_train2014_000000258270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546148, "question_id": "LHFC9jYdZaTqEAVqWunEok", "question": "What ingredient on the donut is usually unseen in donuts?", "choices": ["cream", "dough", "frosting", "pretzel"], "correct_choice_idx": 3, "direct_answers": ["frosting", "pretzel", "frosting", "label", "eggs", "chocolate", "pretzel", "pretzel", "pretzel", "baker's salt"], "difficult_direct_answer": false, "rationales": ["Donuts are usually a baked good with icing, and a pretzel is typically a separate entity of a snack.", "The dough is not seen in donuts.", "Normally only dough, frosting, cream and jelly or any combination of those can be found on a donut."], "image": "train2014/COCO_train2014_000000546148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268734, "question_id": "LHdnLWMBgTawUb9YD7Wdf4", "question": "What sort of diet does the person biting this sandwich have?", "choices": ["vegetarian", "vegan", "omnivore", "piscadarian"], "correct_choice_idx": 2, "direct_answers": ["omnivore", "carniverous", "carnivore", "omnivore", "no diet", "high calorie", "omnivore", "unhealthy", "fast food", "meat"], "difficult_direct_answer": false, "rationales": ["There are plant and animal products here.", "They are eating meat.", "The food in the person's hand is a sandwich with meat which would be eaten by an omnivore."], "image": "val2014/COCO_val2014_000000268734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138922, "question_id": "LHjvsVwwnntksoEKnouD6J", "question": "Why did he cover his head?", "choices": ["warmth", "protection", "religion", "costume"], "correct_choice_idx": 1, "direct_answers": ["protection", "safety", "skateboarding", "protection", "protection", "protection", "protection", "skateboarding", "protection", "protect head"], "difficult_direct_answer": false, "rationales": ["The person here is using the helmet to protect their head from injury.", "The man wants protection.", "A boy is skateboarding and has a hard, plastic helmet on his head."], "image": "train2014/COCO_train2014_000000138922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316298, "question_id": "LJ33mqXtfSaQLPn7sdsZKL", "question": "What does the person have in their hand?", "choices": ["teacup", "baby", "marble", "jelly bean"], "correct_choice_idx": 0, "direct_answers": ["coffee cup", "keyboard", "coffee mug", "tea", "cup", "teacup", "coffee", "keyboard", "cup", "coffee cup"], "difficult_direct_answer": false, "rationales": ["The person has a teacup.", "The person is drinking chai tea.", "The item is a cup to drink a beverage out of and it could be used for tea."], "image": "train2014/COCO_train2014_000000316298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486449, "question_id": "LJ7LcvCN3FaHLqX5J9rP3M", "question": "What is this place?", "choices": ["car dealer", "subway", "biker bar", "walmart"], "correct_choice_idx": 2, "direct_answers": ["garage", "amusement park", "restaurant", "resort", "parking lot", "town", "bar", "tourist trap", "park", "biker bar"], "difficult_direct_answer": true, "rationales": ["There is a large row of motorcycles parked out front. when there are this many motorcycles parked out front of a location it is frequently a place that caters to their owners.", "The place is likely a bar that people who ride bikes like to frequent.", "The profusion of motorcycles indicates the the type of establishment identified in option a."], "image": "train2014/COCO_train2014_000000486449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330023, "question_id": "LJd2KjJWBr63NJtRfU22WC", "question": "What type of couch is it?", "choices": ["sectional", "divan", "futon", "scandinavian"], "correct_choice_idx": 2, "direct_answers": ["futon", "futon", "wooden", "futon", "futon", "futon", "futon", "wooden", "futon", "futon"], "difficult_direct_answer": false, "rationales": ["The couch shown is a simple folding couch known as a futon.", "This is a couch that can pull out to be a bed too.", "This is the most likely options given the shape of the cushions is designed to double as bed."], "image": "val2014/COCO_val2014_000000330023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327575, "question_id": "LJeTkNiAAKWGqufNX2vHha", "question": "Where is this animal located in this picture?", "choices": ["ocean", "jungle", "enclosure", "forest"], "correct_choice_idx": 1, "direct_answers": ["elephant", "jungle", "elephant", "middle", "residential area", "in pen", "zoo", "inside fencing", "yard", "enclosure"], "difficult_direct_answer": true, "rationales": ["There is a fence around them", "An elephant is standing in an urban area but tropical trees can be seen throughout the area.", "A single elephant with large tusks to the ground is standing within a fenced in area."], "image": "train2014/COCO_train2014_000000327575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64149, "question_id": "LJeumkLkGKcp8ExPFw8Ln4", "question": "What will they clean with the item in the pink box?", "choices": ["face", "floor", "table", "screen"], "correct_choice_idx": 0, "direct_answers": ["nose", "nose", "nose", "table", "their nose", "kleenex", "nose", "face", "can't see", "nose face"], "difficult_direct_answer": false, "rationales": ["They'll use the face.", "Kleenex is used to blow or wipe your nose.", "The pink item is a tissue box."], "image": "train2014/COCO_train2014_000000064149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476660, "question_id": "LKF2vQtYr2vKAwe2mb2piv", "question": "The colors on the pants resemble what animal?", "choices": ["flamingo", "zebra", "tiger", "seal"], "correct_choice_idx": 2, "direct_answers": ["tiger", "tigers", "tiger", "tiger", "bengal", "tiger", "tiger", "tiger", "bengal", "tiger"], "difficult_direct_answer": false, "rationales": ["The colors are like a tiger.", "Both the pants and tigers are orange and black.", "Orange is the color of a tiger."], "image": "train2014/COCO_train2014_000000476660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370448, "question_id": "LKGzayzmGQZAfM6AwzTtiC", "question": "What is the yellow object on the sidewalk connected to?", "choices": ["wall", "water lines", "mayor", "police"], "correct_choice_idx": 1, "direct_answers": ["pipes", "water line", "joinwith floor", "water main", "water lines", "fire hydrant", "firehydrant", "ground", "water pipe", "water"], "difficult_direct_answer": true, "rationales": ["Firefighters use the yellow object, which is a fire hydrant, to put of fires.", "This is a fire hydrant that is used by firefighters to connect to water lines to put out fires nearby.", "Fire hydrants use water to put fires out so there would need to be a source of water that goes to the hydrant."], "image": "val2014/COCO_val2014_000000370448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273855, "question_id": "LKau27AQtaSXtGQ6t5v9aN", "question": "What commodity has this young man exhausted?", "choices": ["oats", "milk", "wine", "none"], "correct_choice_idx": 1, "direct_answers": ["milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["The milk container in front of the boy is empty and there is some in his bowl.", "The milk carton is empty so has been exhausted by the young man.", "The milk bottle is empty, indicating that all the milk has been consumed."], "image": "val2014/COCO_val2014_000000273855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27616, "question_id": "LKeSAryjZMHJaDf84hZ9yZ", "question": "When place is it?", "choices": ["garage", "car show", "car dealer", "auto shop"], "correct_choice_idx": 0, "direct_answers": ["garage", "garage", "garage", "garage", "garage", "garage", "garage", "garage", "garage", "garage"], "difficult_direct_answer": false, "rationales": ["There is a car and motorbike so it's likely a garage.", "The place looks like someone's large garage.", "There are a variety of items in side here such as snowblower, car, and other items."], "image": "train2014/COCO_train2014_000000027616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301109, "question_id": "LKhKyc6GiKRYvhcnVuGhqp", "question": "What silent movie star does the resident of this apartment like?", "choices": ["charlie chaplin", "theda", "none", "fay wray"], "correct_choice_idx": 0, "direct_answers": ["charlie chaplin", "charlie chaplin", "charlie chaplin", "charlie chaplin", "pictured", "charlie chaplin", "charlie chaplin", "pictured", "charlie chaplin", "mime"], "difficult_direct_answer": false, "rationales": ["The star is chaplin.", "The man on the poster has a mustache and bowler hat.", "That's who's on the poster on the wall."], "image": "train2014/COCO_train2014_000000301109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484200, "question_id": "LMMgzk23Y9RFJLnMbyhaC8", "question": "What caused the brown marks on the crust?", "choices": ["air fryer", "stove", "grill", "oven"], "correct_choice_idx": 3, "direct_answers": ["heat", "fire", "food", "baking", "oven", "baking", "oven", "burn", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["The food is pizza, which is typically and traditionally baked.", "Pizza is made in the oven and often the pizza gets too hot and the pizza gets burned, making brown marks.", "An oven bakes the pizza with the brown marks."], "image": "train2014/COCO_train2014_000000484200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167152, "question_id": "LMNpi2YEWk9MD8szSnmULr", "question": "What is in the bottom of the picture?", "choices": ["water", "fence", "zebra", "path"], "correct_choice_idx": 3, "direct_answers": ["grass", "dusty ground", "path", "dead grass", "zebras", "dirt", "brush", "grass", "dry plants", "grass"], "difficult_direct_answer": false, "rationales": ["There is a path on the ground.", "There is a smoothed out piece of dirt.", "There is a path formed in the grass."], "image": "val2014/COCO_val2014_000000167152.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112226, "question_id": "LMRejbweGztrXSt7Ge9zr9", "question": "What is the woman doing with the object in her hand?", "choices": ["throwing it", "passing it", "trading it", "eating it"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eating it", "eating", "eating", "eating", "eating", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The woman is eating.", "The woman is holding a sandwich near her mouth, and sandwiches are usually consumed when in this position.", "The woman is eating the sandwich."], "image": "train2014/COCO_train2014_000000112226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571048, "question_id": "LMg4YzVfuwr4pvpvhue3xH", "question": "What type of seating is in front of the yellow plane?", "choices": ["chair", "bench", "bed", "sofa"], "correct_choice_idx": 0, "direct_answers": ["metal chairs", "single", "metal chairs", "single", "chairs", "single", "single", "chairs", "chair", "metal chairs"], "difficult_direct_answer": false, "rationales": ["Chairs are lined up in front of a plane. chairs are used for sitting.", "The front portion of the yellow place is a small area for someone to sit in. through the window it is apparent that a chair is located there.", "There are chairs to sit on."], "image": "train2014/COCO_train2014_000000571048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483288, "question_id": "LMpA2HTGJBtAyW9Zdnedei", "question": "What is the plane flying over?", "choices": ["mountain", "building", "lake", "forest"], "correct_choice_idx": 1, "direct_answers": ["building", "building", "building", "building", "building", "building", "building", "building", "city", "building"], "difficult_direct_answer": false, "rationales": ["The plane is flying over a tall man-made object that has lots of windows.", "A plant is seen in the air above a large building.", "There are floors and windows"], "image": "train2014/COCO_train2014_000000483288.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140432, "question_id": "LMwGvbG7vDFXAMpVQCoGnp", "question": "What type of emergency is happening?", "choices": ["assault", "crash", "flood", "fire"], "correct_choice_idx": 3, "direct_answers": ["fire", "fire", "fire", "airplane fire", "airplane fire", "fire", "fire", "fire", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["There is billowing smoke in the background. where there is smoke there is fire and fire is considered an emergency.", "The plane in the distance had a fire and there is black smoke rising from it.", "There is smoke in the background. something is burning."], "image": "train2014/COCO_train2014_000000140432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126204, "question_id": "LNGrQkZkwnraxeRFbT4LRQ", "question": "The restaurant the man is sitting down at specializes in which general food item?", "choices": ["hot dogs", "hamburgers", "ribs", "pitas"], "correct_choice_idx": 1, "direct_answers": ["beer", "burgers", "burgers", "seafood", "hamburger", "hamburgers", "hamburgers", "fast food", "burgers", "burgers"], "difficult_direct_answer": false, "rationales": ["The restaurant has burgers.", "Red robin has burgers.", "That place makes burgers."], "image": "train2014/COCO_train2014_000000126204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191096, "question_id": "LNHoSsz5Je2XwfAWF5mEpD", "question": "Which direction will number 15 run toward?", "choices": ["back", "right", "left", "nowhere"], "correct_choice_idx": 1, "direct_answers": ["right", "first", "first base", "right", "up", "right", "first base", "right", "first base", "right"], "difficult_direct_answer": false, "rationales": ["A baseball player is standing at home plate with a bat down by his side and facing towards the right.", "The batter will want to go right.", "The man will go on the right."], "image": "val2014/COCO_val2014_000000191096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454916, "question_id": "LNgzPGb8gopHS5Y7nrEnnS", "question": "The stripe on the wall is the same color as what logo?", "choices": ["mcdonald's", "white castle", "nathan's famous", "popeye's"], "correct_choice_idx": 0, "direct_answers": ["yield", "mcdonald's", "mcdonald's", "mcdonalds", "mcdonald's", "denny's", "mcdonalds", "nickelodeon", "mcdonald's", "mcdonald's"], "difficult_direct_answer": false, "rationales": ["The stripe is yellow, not orange, white, or green.", "That fast food joint has that shade of color of its arch in its sign.", "The other options don't have this matching color and shape."], "image": "train2014/COCO_train2014_000000454916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239448, "question_id": "LNj2RS3Mt54jXfQMoQfmRj", "question": "What is the device shown in the image?", "choices": ["compass", "painting", "clock", "photography"], "correct_choice_idx": 0, "direct_answers": ["compass", "clock", "clocktower", "clock", "compass", "compass", "clock", "barometer", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["There is a compass on the side of the tower that shows the directions north, south, east, and west.", "It has the letters for directions", "This object shows the cardinal directions."], "image": "val2014/COCO_val2014_000000239448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243251, "question_id": "LNyXg3gwH2YP6LRxAwVT7L", "question": "What does the white arrow sign usually mean?", "choices": ["stop", "one way", "no turns", "yield"], "correct_choice_idx": 1, "direct_answers": ["go left", "tun left", "one way", "crossing", "one way", "this way", "one way", "one way", "one way", "concentrate"], "difficult_direct_answer": false, "rationales": ["The white arrow means one way.", "A white arrow on a black background is a well-known sign showing that traffic can only turn in the direction that the arrow is pointing.", "The white arrow means people can only go in one way."], "image": "train2014/COCO_train2014_000000243251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533454, "question_id": "LP7E8dUoNEVUm9jfgaCN7i", "question": "Why is the woman raising the tennis ball?", "choices": ["to serve", "to rub", "to pocket", "to inspect"], "correct_choice_idx": 0, "direct_answers": ["getting ready", "serve ball", "to hit", "serving", "throw it", "serving", "playing", "to serve", "to serve", "to serve"], "difficult_direct_answer": false, "rationales": ["The woman wants to serve.", "The woman is trying to serve the ball.", "She is raising the ball to toss it over her head and complete the serve to her opponent."], "image": "train2014/COCO_train2014_000000533454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45593, "question_id": "LP7UdtBMxjJ4KxRNSGgApS", "question": "What is the white bag on the sign used to do?", "choices": ["anchor", "throw", "kick", "sell"], "correct_choice_idx": 0, "direct_answers": ["weight", "anchor", "hold sign", "anchor", "hold sign", "hold it", "weight", "anchor sign", "hold place", "hold it"], "difficult_direct_answer": false, "rationales": ["The sandbag holds the signs down.", "It's probably full of sand, which will help hold down the sign.", "The sandbag is heavier than the sign and will keep it from being blown or moved out of place."], "image": "train2014/COCO_train2014_000000045593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217269, "question_id": "LPDxCWUKu2Lzqo7uCcFpMg", "question": "What type of station is this?", "choices": ["train", "work", "bus", "fire"], "correct_choice_idx": 1, "direct_answers": ["computer station", "desktop", "workstation", "work", "work", "workstation", "work", "computer", "office station", "work"], "difficult_direct_answer": false, "rationales": ["There are several computers in the room", "There are computers and a desk here which are common office items.", "The area is set up as a workstation."], "image": "val2014/COCO_val2014_000000217269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112342, "question_id": "LPX5i3aTAbENp8ypPMLMxb", "question": "What would happen to the purple item if it was dropped?", "choices": ["float", "sink", "disappear", "fly"], "correct_choice_idx": 0, "direct_answers": ["fall down", "float", "dog", "float", "get wet", "sink", "sink", "get wet", "float", "float"], "difficult_direct_answer": false, "rationales": ["The purple item is circular and light.", "The item would float.", "Plastic disks won't sink"], "image": "val2014/COCO_val2014_000000112342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376457, "question_id": "LPgKMjUReHNYue7JLujDng", "question": "What are these white objects used to hold?", "choices": ["popcorn", "urine", "cats", "bread"], "correct_choice_idx": 1, "direct_answers": ["urine", "urine", "urine", "urine", "urine", "urine", "urine", "urine", "urine", "pee"], "difficult_direct_answer": false, "rationales": ["These are men's toilets", "The white objects are urinals found in mens rooms used to relieve themselves.", "This is a scene in a men's restroom, with the equipment shown serving to receive the item listed in option a."], "image": "train2014/COCO_train2014_000000376457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169331, "question_id": "LPuibPnzKex4KxN2vLKiBo", "question": "Which make of vehicle is parked nearest to the meter?", "choices": ["honda", "jaguar", "bmw", "toyota"], "correct_choice_idx": 1, "direct_answers": ["chrysler", "jaguar", "jaguar", "saturn", "jaguar", "jaguar", "chrysler", "chrysler", "jaguar", "jaguar"], "difficult_direct_answer": false, "rationales": ["The vehicle is a jaguar.", "The car is made by jaguar.", "The car parked near the meter is a silver jaguar."], "image": "val2014/COCO_val2014_000000169331.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358523, "question_id": "LQ6cqTPb9St8bqpdqNhzuP", "question": "What is this image?", "choices": ["puzzle", "photo", "photoshopped picture", "drawing"], "correct_choice_idx": 3, "direct_answers": ["beach", "beach", "beach", "drawing", "altered photograph", "beach scene", "painting", "beach activities", "beach", "abstract"], "difficult_direct_answer": false, "rationales": ["You can see the brush strokes", "This is not a real photo, it was made with art supplies.", "It shows the strokes of the pencils"], "image": "train2014/COCO_train2014_000000358523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420629, "question_id": "LQT5uhsRLHV4V8uuWPBJUa", "question": "What kind of transportation is shown?", "choices": ["water", "road", "air", "rail"], "correct_choice_idx": 1, "direct_answers": ["motorcycle", "scooter", "road", "cycles", "motorcycles", "motorcycle", "bicycles motorbikes", "bikes", "motorcycle", "two wheeled"], "difficult_direct_answer": false, "rationales": ["The vehicles are bicycles and motorcycles. these cannot travel on tracks, cannot fly, and cannot float on water.", "They are bicycles and motorcycles with wheels that can only be moved on the ground, across a land road.", "Motorcycles are driven on the road."], "image": "train2014/COCO_train2014_000000420629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195798, "question_id": "LRAmtfdMbSTvJ8FHABHHgz", "question": "From what does the umbrella held here offer protection?", "choices": ["sun", "snow", "prying eyes", "rain"], "correct_choice_idx": 0, "direct_answers": ["sun", "sun", "sun", "sun", "sun", "shade", "sun", "sun", "sunlight", "sunlight"], "difficult_direct_answer": false, "rationales": ["The umbrella is being held by a man sitting outside. there isn't any precipitation and there aren't any security issues, but an umbrella is useful under another circumstance.", "The umbrella blocks sun.", "It can protect him from getting too hot or sunburnt."], "image": "val2014/COCO_val2014_000000195798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510949, "question_id": "LREqPz6MJpXKvbg9yAnRL9", "question": "What is the status of the red truck?", "choices": ["parked", "going", "waiting", "broken down"], "correct_choice_idx": 0, "direct_answers": ["good condition", "parked", "parked", "parked", "sitting", "parked", "stand", "parked", "parked", "parked"], "difficult_direct_answer": false, "rationales": ["The red truck is parked on the side of the road and not moving.", "A truck is stopped along the side of a road, next to the sidewalk. it is common to park along the side of the road.", "A red truck is on the side of the road near the sidewalk and no one is in it. people park vehicles on the sides of roads."], "image": "train2014/COCO_train2014_000000510949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222245, "question_id": "LRcHWUKkP3EWVPXyDTUmT8", "question": "Who is the advertiser in the background?", "choices": ["sharp", "lg", "samsung", "sony"], "correct_choice_idx": 2, "direct_answers": ["samsung", "old navy", "samsung", "samsung", "phone", "samsung", "samsung", "retail store", "samsung", "samsung"], "difficult_direct_answer": false, "rationales": ["It looks like samsung is advertising a product in the background.", "The ad is for a smart phone. the logo has white text inside a blue oval.", "Samsung is on the billboard."], "image": "train2014/COCO_train2014_000000222245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314904, "question_id": "LRpEo8NN2YPy3q8yeWrPJd", "question": "Who originally created this sport?", "choices": ["shaq o'neal", "james naismith", "michael jordan", "wayne gretzky"], "correct_choice_idx": 1, "direct_answers": ["people", "requester", "naismith", "james naismith", "jordan", "james naismith", "james naismith", "james naismith", "james naismith", "james naismith"], "difficult_direct_answer": false, "rationales": ["The sport is basketball.", "James naismith created basketball.", "The man who originally created the sport of basketball is named james naismith."], "image": "train2014/COCO_train2014_000000314904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255271, "question_id": "LS6S72h6BLdbioWZA7hJr7", "question": "The numbers on the sign are informing the people of what?", "choices": ["population", "signs", "cars", "temperature"], "correct_choice_idx": 3, "direct_answers": ["temperature", "temperature", "eleven", "temperature", "11", "temperature", "temperature", "signal", "temperature", "temperature"], "difficult_direct_answer": false, "rationales": ["The numbers indicate the temperature.", "The numbers on the sign are to indicate that the temperature is negative 11 degrees outside.", "The numbers are showing the temperature on the sign."], "image": "train2014/COCO_train2014_000000255271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95999, "question_id": "LS74UwxYu4BzL4GDwxZpBN", "question": "Why is the man holding something up in front of the aircraft?", "choices": ["to signal", "to show", "to give", "to photograph"], "correct_choice_idx": 3, "direct_answers": ["picture taking", "for photography", "to photograph", "taking photo", "taking picture", "taking pictures", "taking picture", "take picture", "camera", "taking picture"], "difficult_direct_answer": false, "rationales": ["The man is holding a camera. he is using it to take a picture.", "The man is taking a photo.", "The man is taking a picture of the vintage airplane."], "image": "train2014/COCO_train2014_000000095999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326712, "question_id": "LSBcFE4bmrHUMwwz7476Qe", "question": "What is needed for this activity?", "choices": ["rain", "snow", "water", "wind"], "correct_choice_idx": 3, "direct_answers": ["wind", "wind", "wind", "wind", "string", "wind", "string", "wind", "wind", "string kite"], "difficult_direct_answer": false, "rationales": ["Wind is needed.", "A kite is flown in the air and needs gusts to stay up and flying.", "Wind is needed to keep the kite afloat."], "image": "train2014/COCO_train2014_000000326712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551494, "question_id": "LSCGcstTPpMQhEhazeTEnD", "question": "Why is she smiling?", "choices": ["for camera", "stolen toy", "is confused", "has baby"], "correct_choice_idx": 3, "direct_answers": ["taddy", "she's happy", "baby", "for camera", "happiness", "baby", "new mother", "has baby", "baby", "shes happy"], "difficult_direct_answer": false, "rationales": ["She is smiling because she is holding onto her baby.", "The woman has a new baby in her arms.", "The baby looks to be newly born, and she is likely meeting a new family for the first time, causing happiness."], "image": "val2014/COCO_val2014_000000551494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245609, "question_id": "LSP2DTw7tyqCHraeK4M87h", "question": "Why is he not on the surfboard?", "choices": ["fell off", "jumped off", "fell asleep", "too cold"], "correct_choice_idx": 0, "direct_answers": ["crashing wave", "fell off", "he fell", "fell off", "fell off", "swimming", "fell off", "crash", "fell off", "he fell"], "difficult_direct_answer": false, "rationales": ["Surfing is a difficult sport that requires a lot of balance.", "The person fell off of the board.", "His position in the water is not normal unless you fall off."], "image": "train2014/COCO_train2014_000000245609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4704, "question_id": "LSVtmmmFqqTyWqr25hvvMC", "question": "What keeps the snowboarder's feet to the board?", "choices": ["bindings", "tape", "tape", "magnets"], "correct_choice_idx": 0, "direct_answers": ["bindings", "straps", "shake", "feet straps", "shoe holds", "foot straps", "straps", "straps", "clips", "bindings"], "difficult_direct_answer": false, "rationales": ["The snowboarder used binding clips to hold their feet onto the board.", "The other options don't fit with this type of equipment.", "There are straps on the board."], "image": "train2014/COCO_train2014_000000004704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325184, "question_id": "LSbBaDZCqDca2eZNqoqZPp", "question": "At which elevation are the elephants compared to those looking at them?", "choices": ["same", "lower", "changing", "higher"], "correct_choice_idx": 1, "direct_answers": ["lower", "lower elevation", "below", "much lower", "low", "lower", "below", "lower", "lower", "lower"], "difficult_direct_answer": false, "rationales": ["The people are behind a fence which has a drop and a stone wall showing them higher than the elephants.", "The picture is at eyelevel with the elephant so the bottom of the elephant is lower than the picture point.", "The elephants are slightly lower to the ground than the person that took the picture."], "image": "val2014/COCO_val2014_000000325184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408978, "question_id": "LSdXysR6ZE3vwCZv6oFWce", "question": "What purpose does the person in blue standing at the back serve?", "choices": ["super fan", "police", "ball retrieval", "ticket taker"], "correct_choice_idx": 2, "direct_answers": ["ball girl", "ball girl", "ball girl", "referee", "retrieve balls", "ball retrieval", "monitor game", "repree", "ball catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["They are back there to get the tennis balls so they don't roll on the court.", "The purpose is to get the ball.", "The person in the back is there to get the balls as they go back by her."], "image": "train2014/COCO_train2014_000000408978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571821, "question_id": "LSicT4vZ6B99u896Hk5X2e", "question": "Why do the persons in front of the bus stand closely?", "choices": ["in line", "photo posing", "protest", "for warmth"], "correct_choice_idx": 1, "direct_answers": ["photo opp", "photograph", "photo posing", "taking photo", "pose", "photograph", "picturing", "picture", "taking picture", "sidewalk"], "difficult_direct_answer": true, "rationales": ["This is the most common reason given how they're positioned and smiling.", "Their group is standing to have their picture taken.", "They are smiling and standing together."], "image": "val2014/COCO_val2014_000000571821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99025, "question_id": "LSraDkTqGpTSmy2DVBB5WP", "question": "Why are the clouds dark?", "choices": ["storms coming", "its snowing", "hurricane", "it's night"], "correct_choice_idx": 0, "direct_answers": ["rain", "storms coming", "raining", "storm", "rain", "storm approaching", "storm", "storm", "stormy weather", "storm"], "difficult_direct_answer": false, "rationales": ["Dark clouds in the sky often indicate bad weather coming.", "Dark clouds indicate either rain or a storm.", "It looks like it will rain."], "image": "train2014/COCO_train2014_000000099025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117987, "question_id": "LTCwi9A3VcAQdPvAKLfCR6", "question": "What style of sunglasses does the man seem to be wearing?", "choices": ["mountain", "cat eye", "shield", "aviator"], "correct_choice_idx": 3, "direct_answers": ["aviators", "sunglasses", "aviator", "aviator", "aviators", "aviator", "aviators", "dark", "aviators", "sun glasses"], "difficult_direct_answer": false, "rationales": ["The large frames are like those in the aviator style of glasses.", "Aviator sunglasses have this cool, bug-eyed look.", "The man is wearing aviator sunglasses that are popular with airline pilots."], "image": "train2014/COCO_train2014_000000117987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440381, "question_id": "LTMUQugkLGcP7XQEeMbzrW", "question": "What item is dipped in the liquid in the cup?", "choices": ["cracker", "cookie", "banana", "spoon"], "correct_choice_idx": 2, "direct_answers": ["banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["The item is a banana.", "A long pale yellow fruit is in a cup with orange juice.", "The item has the shape and size of a typical banana."], "image": "train2014/COCO_train2014_000000440381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254930, "question_id": "LV2ZgCuUvtTpWxSM7UbJes", "question": "What company might the man in the blue shirt work for?", "choices": ["apple", "microsoft", "samsung", "sony"], "correct_choice_idx": 1, "direct_answers": ["microsoft", "microsoft", "microsoft", "microsoft", "microsoft", "microsoft", "microsoft", "microsoft", "microsoft", "microsoft"], "difficult_direct_answer": false, "rationales": ["As indicated by the logo and company name on the shirt.", "The mans shirt says microsoft.", "Many companies provide shirts for their employees with their logo on them. microsoft is one such company."], "image": "val2014/COCO_val2014_000000254930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328662, "question_id": "LVRWkjNAWuduZgACK7NeZ4", "question": "A person who goes by the nationality that is written on the bike is from what continent?", "choices": ["europe", "australia", "south america", "asia"], "correct_choice_idx": 3, "direct_answers": ["india", "india", "asia", "asia", "north america", "asia", "asia", "usa", "spanish", "asia"], "difficult_direct_answer": false, "rationales": ["It's a bike that is indian.", "The person would be from asia which is where india is located.", "Indians are considered asians."], "image": "train2014/COCO_train2014_000000328662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149657, "question_id": "LVcN253idJQtsYg3JEqw7Y", "question": "What type of animal is on the TV screen?", "choices": ["aquatic", "reptile", "domestic", "wild"], "correct_choice_idx": 3, "direct_answers": ["bear", "bear", "bear", "bear", "cat", "wild", "bear", "cat", "bear", "bear"], "difficult_direct_answer": false, "rationales": ["The scene on the tv is taking place outdoors and the animal is likely wild.", "This is a large and dangerous animal", "This animal is a bear that lives in the woods."], "image": "train2014/COCO_train2014_000000149657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289591, "question_id": "LVy7KQjKmRFvDyb4fRY85y", "question": "What state is this city in?", "choices": ["illinois", "new zealand", "new york", "ohio"], "correct_choice_idx": 3, "direct_answers": ["ohio", "ohio", "ohio", "ohio", "cleveland", "ohio", "ohio", "ohio", "not known", "ohio"], "difficult_direct_answer": false, "rationales": ["The word above fire on the truck is cleveland. this city is in the united states of america but is not in illinois or new york.", "Cleveland is in ohio.", "The state is ohio."], "image": "train2014/COCO_train2014_000000289591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116771, "question_id": "LW8KGeLJFmUh6tdHQfMXku", "question": "How were the purple characters written?", "choices": ["spray can", "paintbrush", "crayon", "pen"], "correct_choice_idx": 0, "direct_answers": ["spray paint", "spray can", "graffiti", "spray painted", "spraypaint", "spray paint", "spray paint", "spray paint", "painted", "graffiti"], "difficult_direct_answer": false, "rationales": ["The man was using a can.", "The purple letters were sprayed on by graffiti artist.", "There is graffiti on the back of the stop sign. graffiti is done with paint in an areso container."], "image": "train2014/COCO_train2014_000000116771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366795, "question_id": "LWCJomqiVnzfRVKPo7yWWQ", "question": "Why is he wearing a bib?", "choices": ["helps sleep", "store things", "protect clothing", "stylish"], "correct_choice_idx": 2, "direct_answers": ["baby", "eating", "protect clothing", "stay clean", "keep clean", "messy eater", "prevent messes", "keep clean", "protect clothing", "eating"], "difficult_direct_answer": false, "rationales": ["Due to a lack of mature coordination skills and an underdeveloped motor control ability, this young child is very likely to get messy when eating food so his bib is being obviously employed as a means of protecting his clothing from stains and spills.", "He is trying to protect his cloths from stains.", "His parents don't want him to spill any food on his outfit because babies are less steady when it comes to feeding themselves."], "image": "train2014/COCO_train2014_000000366795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174603, "question_id": "LWQYCd96DC7bbfVkmXguo2", "question": "What color shirt does the person most likely to catch the frisbee wear?", "choices": ["black", "orange", "purple", "red"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "purple", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The shirt color is purple.", "The person leaping closest to the frisbee is in purple.", "The person is wearing purple."], "image": "val2014/COCO_val2014_000000174603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3716, "question_id": "LWW5NdXTkukKKQxPeNhH9m", "question": "Where is the girl located?", "choices": ["library", "museum", "home", "office"], "correct_choice_idx": 2, "direct_answers": ["home", "den", "couch", "bed", "sitting room", "bed", "on sofa", "home", "living room", "livingroom"], "difficult_direct_answer": false, "rationales": ["A girls is sitting on a couch in a casual room and she is wearing casual clothes. people relax at home in causal clothes.", "Items surrounding the girl include a coach, kitchen, bookshelf, and family pictures. these are all commonly found in a home.", "The girl is at home."], "image": "val2014/COCO_val2014_000000003716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409944, "question_id": "LWYexjAFsMbkLTSa7Zh9g9", "question": "How many times has he won Wimbledon?", "choices": ["one", "eight", "six", "four"], "correct_choice_idx": 1, "direct_answers": ["unknown", "five", "four times", "8 wins", "eight", "eight", "once", "eight", "eight", "12"], "difficult_direct_answer": false, "rationales": ["According to an internet search, roger federer won wimbledon titles in 2003, 2004, 2005, 2006, 2007, 2009, 2012, and 2017.", "That person won that many games.", "There have been eight times."], "image": "val2014/COCO_val2014_000000409944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98616, "question_id": "LWnC6JAm5VdJkXiLd8yDQq", "question": "Which sort of art is the elephant practicing?", "choices": ["stone stacking", "pottery", "carving", "painting"], "correct_choice_idx": 3, "direct_answers": ["painting", "abstract", "painting", "line art", "painting", "painting", "painting", "painting", "painting", "panting"], "difficult_direct_answer": false, "rationales": ["The elephant is grasping a brush and has applied paint to a piece of paper in front of it. this action and tool are associate with painting.", "The elephant has a paint brush.", "It has a paintbrush and an easel"], "image": "train2014/COCO_train2014_000000098616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149394, "question_id": "LWsj4FyPkcUMdWWCs7CZrz", "question": "What is this player doing?", "choices": ["judging", "resting", "returning", "serving"], "correct_choice_idx": 3, "direct_answers": ["serving", "serving", "serving", "serving", "serving ball", "serving ball", "serving", "serving ball", "playing tennis", "serving"], "difficult_direct_answer": false, "rationales": ["He has just thrown the ball in the air with his hand which is the only time a player touches a ball", "His racket is in the air as he is getting ready to serve the ball.", "The tennis player is stretching his arms to serve the ball over the net."], "image": "train2014/COCO_train2014_000000149394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143445, "question_id": "LWvMC3X2ri38rJkzhJpP5Q", "question": "What is the thing around the drinking lady's neck good for?", "choices": ["purely aesthetic", "making calls", "physical protection", "identification"], "correct_choice_idx": 3, "direct_answers": ["identify her", "entrance", "cards keys", "identification", "badge", "lanyard", "hold tag", "access", "identification", "hanging"], "difficult_direct_answer": true, "rationales": ["The woman has an id tag.", "The thing is for id.", "People use lanyards to wear id around their necks at work functions and other similar places."], "image": "val2014/COCO_val2014_000000143445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580255, "question_id": "LWwE7yTAsnzAMR6N4UVWoJ", "question": "What term is used to refer to these animals?", "choices": ["pooch", "reptile", "amphibian", "kitty cat"], "correct_choice_idx": 0, "direct_answers": ["pitbulls", "dogs", "pooch", "dog", "dogs", "dogs", "dog", "dogs", "dogs", "dog"], "difficult_direct_answer": false, "rationales": ["Dogs are refereed as that name.", "The animals are dogs.", "The word is a nickname for dogs."], "image": "val2014/COCO_val2014_000000580255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12522, "question_id": "LWxWkpKzaevRWBSnz9so2H", "question": "If the boy is unsuccessful in doing what he is doing three times in a row what is it called?", "choices": ["flag", "strikeout", "penalty", "base hit"], "correct_choice_idx": 1, "direct_answers": ["strike out", "strike", "strikeout", "strike out", "strikeout", "strike out", "strike", "striking out", "strikeout", "strike out"], "difficult_direct_answer": false, "rationales": ["This game would happen in baseball. he's holding a bat, there's a fence behind him to prevent balls from hitting the audience, and he's wearing those baseball helmets.", "The boy will strike out if he misses the ball.", "The sport the boy is playing is baseball."], "image": "train2014/COCO_train2014_000000012522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258680, "question_id": "LWyN4wLxiQNNM7DBcTC8DS", "question": "The beaded man is celebrating what?", "choices": ["christmas", "armistice day", "thanksgiving", "mardi gras"], "correct_choice_idx": 3, "direct_answers": ["mardi gras", "festival", "mardi gras", "mardigras", "festival", "festival", "mardi gras", "mardi gras", "mardi gras", "mardi gras"], "difficult_direct_answer": false, "rationales": ["These are widely known to be used for the celebration", "The man has colorful necklaces around his neck which is typical of mardi gras.", "The man has mardi gras beads."], "image": "val2014/COCO_val2014_000000258680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382673, "question_id": "LXMpDTkgJSQGF7QZh3rWBN", "question": "What is in the blue can?", "choices": ["contact solution", "shaving gel", "toothpaste", "hair spray"], "correct_choice_idx": 1, "direct_answers": ["shaving cream", "garbage", "shaving cream", "deodorant", "shaving cream", "shower jell", "shaving cream", "shaving cream", "shaving gel", "deodorant"], "difficult_direct_answer": false, "rationales": ["The can has the brand of a shaving company on it indicating that it is shaving gel. plus saving gel would be found in a bathroom.", "The blue can is shaving cream based on the brand and label.", "The cylinder shape of this can and visible logos identifies it as a can of shaving cream."], "image": "train2014/COCO_train2014_000000382673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109950, "question_id": "LXQhpNJpBvGiZrA6EdeccR", "question": "The yellow ball in this game is made up of what?", "choices": ["cotton", "wool", "nylon", "polyester"], "correct_choice_idx": 1, "direct_answers": ["rubber", "rubber", "leather", "rubber", "rubber", "rubber", "rubber", "rubber", "wool", "rubber"], "difficult_direct_answer": false, "rationales": ["The ball is covered with wool.", "The exterior of the ball causes drag on purpose so that the ball won't fly out of the court.", "The ball is made of wool."], "image": "train2014/COCO_train2014_000000109950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406292, "question_id": "LXTqg3CssBAXbxs5Nd85tU", "question": "What is this venue?", "choices": ["savanna", "desert", "animal market", "animal farm"], "correct_choice_idx": 2, "direct_answers": ["farm", "farm", "farm", "rodeo", "sheep auction", "animal market", "animal selling", "livestock market", "fair", "market"], "difficult_direct_answer": false, "rationales": ["There are several sheep for sale.", "People are here to buy and sell animalss.s.", "There are multiple animals lined up and people talking and inspecting them."], "image": "train2014/COCO_train2014_000000406292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413446, "question_id": "LXc2ePF7is25vMKaKXgTtY", "question": "How is this boat powered?", "choices": ["motor", "wind", "whale", "coal"], "correct_choice_idx": 1, "direct_answers": ["sails", "motor", "sail", "wind", "sail", "wind", "engine", "wind", "sail", "wind"], "difficult_direct_answer": false, "rationales": ["The boat has a sail and is in the water.", "The boat has sails.", "It has a sail out to catch the breeze"], "image": "val2014/COCO_val2014_000000413446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39089, "question_id": "LYN83E4b4dxgpzKSQenjfN", "question": "The man is trying to make the representation of Harry Potter in the video game perform what action?", "choices": ["cast spell", "play quidditch", "talk", "learn magic"], "correct_choice_idx": 0, "direct_answers": ["waving", "spellcasting", "cast spell", "cast spell", "cast spell", "magic trick", "spell", "cast spell", "cast spells", "graffics"], "difficult_direct_answer": false, "rationales": ["The man wants to cast a spell.", "He is pointing the magic wand in the air.", "The man is holding his control like a wand."], "image": "train2014/COCO_train2014_000000039089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278237, "question_id": "LYTWTcZfSZNP6bvLigSFrz", "question": "How many directions are the vehicles shown going in?", "choices": ["seven", "one", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The car in the foreground is travelling right. in the background another car and a bus are travelling left.", "There is one car going right and a car and a bus going left.", "There are two directions."], "image": "val2014/COCO_val2014_000000278237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184215, "question_id": "LYcZDeypAV7USwTwULTEmX", "question": "These people are dressed as what?", "choices": ["tools", "food", "electronics", "animals"], "correct_choice_idx": 3, "direct_answers": ["chipmunks whales", "woodland critters", "actors", "animals", "beavers", "fish beavers", "animals", "beavers", "animals", "animals"], "difficult_direct_answer": false, "rationales": ["The costume heads have mouths and eyes. some have noses and teeth.", "The faces with the long teeth and colouring resemble that of a beave,r which is an animal.", "These people have on beaver and fish costumes and those are both living creatures"], "image": "train2014/COCO_train2014_000000184215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506508, "question_id": "LZ66s6e5RyvXNtZG9UFfaJ", "question": "What animal food is being handled here?", "choices": ["horse chow", "oats", "wheat", "hay"], "correct_choice_idx": 3, "direct_answers": ["hay", "grass", "hay", "hay", "straw", "hay", "alfalfa", "hay", "hay", "hay"], "difficult_direct_answer": false, "rationales": ["This is a type of grass used for farm animals", "Hay is being carted around.", "Hay is handled here by the people with pitchforks."], "image": "train2014/COCO_train2014_000000506508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200103, "question_id": "LZWUdzmV6wdbEL6TuFKoYb", "question": "In what kind of terrain do persons enjoy skiing here?", "choices": ["flats", "mountain", "desert", "tropical forest"], "correct_choice_idx": 1, "direct_answers": ["ice", "mountains", "trails", "cross country", "mountain", "snow", "snowy mountain", "flat", "snowy mountains", "hilly snowy"], "difficult_direct_answer": true, "rationales": ["The terrain is a mountain.", "They are skiing down a snowy hill.", "The people are skiing down a slope on a mountain covered in snow."], "image": "train2014/COCO_train2014_000000200103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174282, "question_id": "LZXo3FQwvfKRuc6BDCEDEE", "question": "What number are both hands of the front-facing clock on?", "choices": ["eight", "twelve", "nine", "seven"], "correct_choice_idx": 0, "direct_answers": ["seven", "eight", "eight", "viii", "eight", "eight", "eight", "eight", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["Both hands are on the number 8 on this tower clock", "The time reads 8:40 and both hands are on the 8.", "There is a large watch tower with hands pointing southwest."], "image": "train2014/COCO_train2014_000000174282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548423, "question_id": "LZxcuACYisHNaqepehCN9s", "question": "What is the object next to the bookcase?", "choices": ["speaker", "jukebox", "computer tower", "karaoke machine"], "correct_choice_idx": 3, "direct_answers": ["stereo", "stereo", "stereo", "radio system", "laptop", "stereo", "karaoke machine", "stereo", "stereo", "speaker"], "difficult_direct_answer": false, "rationales": ["It's a machine that will show you the words to different songs. you can use the microphone to follow along and sing with the music.", "The box next to the bookcase you can use to pretend to sing with.", "It has knobs and a microphone"], "image": "val2014/COCO_val2014_000000548423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400238, "question_id": "La65bdLirb2w4WGKibbruF", "question": "What item does the device held in the man's hand control?", "choices": ["video game", "spoon", "mop", "spatula"], "correct_choice_idx": 0, "direct_answers": ["video game", "video game", "video game", "switch", "nintendo wii", "video game", "wii", "stereo", "game", "video game"], "difficult_direct_answer": false, "rationales": ["They are holding a wii remote so they can play video games.", "The item is the game.", "The device is a wii controller. wii is a game console. a game console plays games."], "image": "val2014/COCO_val2014_000000400238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292697, "question_id": "LaVccyu5mDpAF5wZ48EctX", "question": "What is the layout of this picture called?", "choices": ["photography", "finger paint", "square match", "collage"], "correct_choice_idx": 3, "direct_answers": ["collage", "card", "montage", "collage", "cricket layout", "collage", "group photos", "collage", "collage", "baseball"], "difficult_direct_answer": false, "rationales": ["The photos are arranged in a grid. they have a certain layout so more photos can be seen.", "There is a collage since a bunch of photos are woven together.", "The layout is a collage."], "image": "val2014/COCO_val2014_000000292697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482300, "question_id": "LaXBEQ6zaxT7qp3GL5RdNc", "question": "Why does it say safety truck?", "choices": ["is safe", "sell trucks", "for sale", "is stolen"], "correct_choice_idx": 1, "direct_answers": ["saves lives", "sell trucks", "emergency vehicle", "brand message", "extra safety", "advertisement", "protection", "wireless-camera attached", "saves lives", "saves lives"], "difficult_direct_answer": false, "rationales": ["It's a truck used for safety.", "The marketing department pushes safety to sell more trucks.", "This truck does something that causes it to need to be safer than other vehicles."], "image": "train2014/COCO_train2014_000000482300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5038, "question_id": "Laa3eWpvuhVkNR7wag2ViF", "question": "Who is this woman?", "choices": ["hotel guest", "housekeeper", "hotel manager", "janitor"], "correct_choice_idx": 0, "direct_answers": ["guest", "wife", "rented room", "hotel guest", "unknown", "unknown", "visitor", "unknown", "mother", "actress"], "difficult_direct_answer": false, "rationales": ["She is sitting relaxed on a bed that has all white bedding in a bedroom.", "The lady has the keys in her hand and is not wearing a uniform.", "The woman is a guest."], "image": "val2014/COCO_val2014_000000005038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208548, "question_id": "LavpTXwqjKJvm7dtzepXWH", "question": "What general term can we give to the type of meal above?", "choices": ["drinks", "beverages", "fruit salad", "appetizer"], "correct_choice_idx": 2, "direct_answers": ["healthy", "healthy", "breakfast", "healthy", "healthy", "fruit", "fruit", "fruit salad", "breakfast", "light lunch"], "difficult_direct_answer": false, "rationales": ["It's a mixture of fruits all together.", "There are different types of fruit and this is often called a fruit salad.", "The item is fruit salad."], "image": "train2014/COCO_train2014_000000208548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502366, "question_id": "LayCybQ4r5Yzsrmw99vEWF", "question": "Which two probably shop in the same place?", "choices": ["orange/pink", "pink/green", "orange/green", "all three"], "correct_choice_idx": 0, "direct_answers": ["store", "pink orange", "pink yellow", "under umbrella", "orange/pink", "pink orange", "middle two", "umbrella couple", "pink orange", "pink/orange"], "difficult_direct_answer": false, "rationales": ["The pink and orange umbrellas have the same pattern.", "Orange and pink have the same style umbrellas, but with different colors.", "The orange and pink went to the same place."], "image": "train2014/COCO_train2014_000000502366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300741, "question_id": "LayMebpHAzTTWveNKbh3pw", "question": "The animal can be referred to as what?", "choices": ["avian", "equine", "insect", "bovine"], "correct_choice_idx": 1, "direct_answers": ["horse", "horse", "horse", "brown", "horse", "horse", "brown", "brown", "horse", "equine"], "difficult_direct_answer": false, "rationales": ["This animal is a horse.", "This is the classification for horses", "This is an equine horse."], "image": "train2014/COCO_train2014_000000300741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484884, "question_id": "Lb3uksKfcDrjxywMmJkPEC", "question": "What is usually found in this room?", "choices": ["plunger", "refrigerator", "bookcase", "television"], "correct_choice_idx": 0, "direct_answers": ["soap", "shower", "plunger", "toilet paper", "bath tub", "towels", "towels", "clean", "hygiene products", "toilet"], "difficult_direct_answer": true, "rationales": ["Plungers are kept near toilets.", "The pictured room is a bathroom. the only option that belongs in the bathroom is the one that could unclog the toilet or drain.", "The item mentioned in option a is used to unclog toilets, such as the one found in the photo."], "image": "train2014/COCO_train2014_000000484884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256549, "question_id": "LbCw5GmbJMMurEwFu4XtYZ", "question": "What is being sold to the right of the speed limit sign?", "choices": ["books", "used cars", "toys", "burgers"], "correct_choice_idx": 1, "direct_answers": ["tires", "used cars", "cars", "vehicles", "used vehicles", "tires", "cars", "used cars", "cars", "cars"], "difficult_direct_answer": false, "rationales": ["There is a sign for certified used vehicles by the car lot on the right side of the street.", "The sign says used cars.", "The sign on the right side of this image advertises cars for sale."], "image": "train2014/COCO_train2014_000000256549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321588, "question_id": "LbyV7TGdSbz89AdAE8c43d", "question": "What type of surf is seen closest to the camera?", "choices": ["hybrid", "fish", "gun", "longboard"], "correct_choice_idx": 2, "direct_answers": ["blue surfboard", "low tide", "gun", "board", "beach", "low tide", "flat", "gun board", "plunging", "lady"], "difficult_direct_answer": true, "rationales": ["The surfboard in the foreground is a longer version of a regular surfboard.", "There is a long surf board that is at least 7 feet long.", "This is a long wide board that is good for taking on big waves."], "image": "train2014/COCO_train2014_000000321588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463702, "question_id": "Lc26gS3xdPu8g6oRqrodx3", "question": "What type of furniture are the flowers placed on?", "choices": ["desks", "bookshelves", "tables", "chairs"], "correct_choice_idx": 2, "direct_answers": ["table", "vase", "tables", "table", "table", "table", "table", "table", "tables", "tables"], "difficult_direct_answer": false, "rationales": ["The flowers are on a table.", "It is a flat wooden surface with chairs around it", "The flowers are put on each unit for ambiance."], "image": "train2014/COCO_train2014_000000463702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67252, "question_id": "Lc4bYpjCnRWuoviLTo74fE", "question": "What past time is the rightmost person involved in?", "choices": ["wakeboarding", "skate boarding", "photography", "beer drinking"], "correct_choice_idx": 2, "direct_answers": ["photography", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "photography", "skateboarding", "photography", "skateboarding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The person the right has a camera.", "The person has a skateboard.", "The pastime is photography."], "image": "val2014/COCO_val2014_000000067252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301582, "question_id": "LcGv8mXDhrtQbMQxueRyVB", "question": "Why is the man wearing a badge on his shirt?", "choices": ["for halloween", "dress code", "style", "fashion"], "correct_choice_idx": 1, "direct_answers": ["employee", "he's working", "name tag", "identification", "name tag", "dress code", "employee identification", "for identity", "name tag", "identification"], "difficult_direct_answer": false, "rationales": ["The company insignia on this man's nametag implies he is an employee of that company. it is likely wearing these badges is required while on the clock.", "He works for the company with the logo on the badge and it is a dress code.", "The man is wearing a name tag which is part of his uniform for his job."], "image": "val2014/COCO_val2014_000000301582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167411, "question_id": "LcSvMmta7reBMW2TJWP7Ha", "question": "For what purpose are apples displayed?", "choices": ["wine", "fair judging", "for sale", "lunch buffet"], "correct_choice_idx": 2, "direct_answers": ["sale", "for sale", "marketing", "sale", "for sale", "for sale", "for sale", "for sale", "selling", "for sale"], "difficult_direct_answer": false, "rationales": ["They are set up with information cards and pricing.", "They are there so people can buy them", "The person touching the apple is trying to see which one they'll take home."], "image": "train2014/COCO_train2014_000000167411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38892, "question_id": "LcXYUDp6E6k66hr9MhyHVq", "question": "What holiday is most likely closest?", "choices": ["christmas", "halloween", "easter", "thanksgiving"], "correct_choice_idx": 0, "direct_answers": ["christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas"], "difficult_direct_answer": false, "rationales": ["The date on the photo shows that st. nick will soon be coming.", "She is next to a christmas stocking so christmas is coming.", "The child is seen sitting with a christmas sock next to her."], "image": "val2014/COCO_val2014_000000038892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97514, "question_id": "Ld6aygXeZ3yLy47g4NrAAM", "question": "Where did most elements of this meal have to cook?", "choices": ["oven", "deep fryer", "garden", "grill"], "correct_choice_idx": 0, "direct_answers": ["pizza", "oven", "oven", "oven", "oven", "oven", "crust", "oven", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["The elements are in the oven.", "Most elements of this meal are oven cooked goods.", "The pizza and pastries need to be baked."], "image": "train2014/COCO_train2014_000000097514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319521, "question_id": "Ld863TyoPzyoD43i9oTTnw", "question": "What is the black man's occupation?", "choices": ["doctor", "salesman", "lifeguard", "officer"], "correct_choice_idx": 1, "direct_answers": ["retailer", "sales", "vendor", "salesman", "speaker", "vendor", "souvenir seller", "vendor", "singer", "sells trinkets"], "difficult_direct_answer": false, "rationales": ["The occupation is a salesman.", "The man is sitting next to a table with things for sale.", "The black man is selling his wares."], "image": "train2014/COCO_train2014_000000319521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412681, "question_id": "LdK8cVwiNin9GFTR2np5Si", "question": "What Toronto sporting team is represented on his sweatshirt?", "choices": ["raptors", "nationals", "maple leafs", "blue jays"], "correct_choice_idx": 3, "direct_answers": ["basetkball", "blue jays", "blue jays", "blue jays", "blue jays", "blue jays", "blue jays", "blue jays", "basetkball", "blue jays"], "difficult_direct_answer": false, "rationales": ["The shirt says blue jays.", "A woman with shorthair is wearing a long sleeve that says the jays on it.", "The blue jays play in toronto."], "image": "train2014/COCO_train2014_000000412681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477263, "question_id": "LdS8zDYNyKfw7w7gwpFby9", "question": "What can happen if too much of this liquid is ingested?", "choices": ["floatation", "sickness", "strangulation", "hypnosis"], "correct_choice_idx": 1, "direct_answers": ["drunkenness", "get drunk", "drunk", "drunk", "alcohol poisoning", "drunk", "drunk", "sickness", "drunk", "getting drunk"], "difficult_direct_answer": false, "rationales": ["The kidneys can't work fast enough to remove excess liquid and can cause nausea, diarrhea and vomiting.", "Wine is known to make people drunk and sick.", "You can get sick if you drink too much."], "image": "train2014/COCO_train2014_000000477263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202667, "question_id": "Le35xtmxct2zWWia6WRHku", "question": "The man put the red flag on what tool he is using to catch fish?", "choices": ["basket", "thermometer", "cup", "fishing pole"], "correct_choice_idx": 3, "direct_answers": ["buoy", "net", "earthworm", "trap", "flag", "trout line", "netting", "boat", "net", "fishing pole"], "difficult_direct_answer": true, "rationales": ["A fishing pole can be used to catch fish.", "This is so he can see when it moves when a fish is on the line", "Looks to be on his bobber so he can see it better."], "image": "train2014/COCO_train2014_000000202667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504023, "question_id": "Le3kH9FUPUW6fSTgSnGpav", "question": "Make is the make of the black car?", "choices": ["infiniti", "lexus", "chevrolet", "honda"], "correct_choice_idx": 0, "direct_answers": ["infiniti", "aston martin", "infiniti", "acura", "lexus", "no clue", "infinity", "infiniti", "lexus", "infinity"], "difficult_direct_answer": false, "rationales": ["As indicated by the logo on the grill.", "It has a logo on the front of the car, which is that of the brand infiniti.", "The logo on the front of the car tells you what it is."], "image": "train2014/COCO_train2014_000000504023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26577, "question_id": "LeHYuZrXr5KjCSiE7HJ7cE", "question": "Who invented a device related to the type of activities the people standing are doing?", "choices": ["nolan bushnell", "eli whitney", "guglielmo marconi", "jonas salk"], "correct_choice_idx": 0, "direct_answers": ["nintendo", "nintendo", "nintendo", "shigeru miyamoto", "nintendo", "ralph bear", "nintendo", "nolan bushnell", "nintendo", "shigeru miyamoto"], "difficult_direct_answer": false, "rationales": ["The nintendo wii is similar to a device invented by nolan bushnell.", "That device is made from that person.", "That is the man who made it."], "image": "train2014/COCO_train2014_000000026577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85481, "question_id": "LePcjhetLY2msSTvLV6doP", "question": "Which item is probably the coldest?", "choices": ["can", "plate", "bowl", "tray"], "correct_choice_idx": 0, "direct_answers": ["beverage", "beverage", "can", "drink", "drink", "drink", "beverage", "drink", "beverage", "soda"], "difficult_direct_answer": false, "rationales": ["Most cans are stored in the fridge.", "The bowl and plate have cooked food. the tray probably is at room temperature.", "The other options are likely hot either from being warmed/cooked or recently washed."], "image": "val2014/COCO_val2014_000000085481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12047, "question_id": "Ley7qBjpr2wLvNXbS5GPUR", "question": "What position is the payer whos feet are in the air?", "choices": ["umpire", "coach", "catcher", "pitcher"], "correct_choice_idx": 3, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "runner", "runner", "pitcher", "pitcher", "batter"], "difficult_direct_answer": false, "rationales": ["Coaches and umpires are not players. the catcher's feet are on the ground.", "He puts his foot up to keep balance when he throws", "The person on the mound throws the ball at the catcher."], "image": "val2014/COCO_val2014_000000012047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132042, "question_id": "Lf9uS5yg78p7GFSaBDJ473", "question": "What type of furniture is the boy sitting on?", "choices": ["bench", "sectional", "chaise", "futon"], "correct_choice_idx": 3, "direct_answers": ["sofa", "sofa", "futon", "couch", "sofa", "couch", "chair", "futon", "sofa", "couch"], "difficult_direct_answer": false, "rationales": ["It is a couch that resembles a mattress and is low to the ground. it is a common piece of furniture used by people this age.", "Judging from its appearance (low to the ground and has cushions), the piece of furniture upon which the person is seated is known as a futon.", "It is a small couch shaped item"], "image": "val2014/COCO_val2014_000000132042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316014, "question_id": "LfLsNY35QurvPLDgWehYTB", "question": "Who is known for playing a similar sport to these people?", "choices": ["ken shamrock", "mike tyson", "serena williams", "otis nixon"], "correct_choice_idx": 2, "direct_answers": ["venus williams", "serena williams", "serena williams", "serena", "serena williams", "ping pong", "andre agassi", "serena williams", "madam irene", "williams"], "difficult_direct_answer": false, "rationales": ["The other people aren't tennis stars.", "These people are swinging rackets at flying tennis balls. serena williams is a popular athlete in this sport.", "They are playing tennis, not boxing or baseball."], "image": "train2014/COCO_train2014_000000316014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416088, "question_id": "LfMUDdE6sCwGbCTmZii9YW", "question": "What type of building is this most probably looking at the people in the courtyard?", "choices": ["school", "museum", "housing", "government office"], "correct_choice_idx": 0, "direct_answers": ["university", "school", "school", "state house", "palace", "college library", "church", "university", "school", "school"], "difficult_direct_answer": false, "rationales": ["This is most likely a college campus with students.", "There are several young people who might attend here.", "The building is a college campus."], "image": "val2014/COCO_val2014_000000416088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161051, "question_id": "LfjWtZnedGqMe9pVxJ5Lim", "question": "What color are most of the post-it notes?", "choices": ["red", "orange", "blue", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "yellow", "yellow", "green", "yellow", "green", "green", "chartreuse", "green", "yellow"], "difficult_direct_answer": false, "rationales": ["There are two colours of post-it notes. most of them are not blue.", "There are some post its on the desk itself as well as backwall. they are the same color as the flower stems.", "The post-it notes are a bright lime-green"], "image": "train2014/COCO_train2014_000000161051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158883, "question_id": "LfmsJ56ExVWcBdLaurfqXb", "question": "What is the white part called?", "choices": ["sprinkle", "tooth", "tusk", "sugar"], "correct_choice_idx": 2, "direct_answers": ["tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk"], "difficult_direct_answer": false, "rationales": ["The white things coming out of their face.", "The white part near the elephants mouths are tusks and are made of ivory.", "The white parts are tusks."], "image": "train2014/COCO_train2014_000000158883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104340, "question_id": "LfqQLsmo8nGzrVCtZr7bZm", "question": "What does the seated person look like they are dressed as?", "choices": ["red mm", "dog", "mime", "clown"], "correct_choice_idx": 0, "direct_answers": ["blood drop", "mm", "heart", "mm", "mm", "heart", "photographer", "mm", "red mm", "strawberry"], "difficult_direct_answer": false, "rationales": ["The person seated has a red circular dome for a head.", "The person in the chair is dressed as a red m&m based on the shape and color.", "They look like they are dressed as a red m&m."], "image": "val2014/COCO_val2014_000000104340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360870, "question_id": "Lg2aLZARsJvJ9MhPD9EnCu", "question": "At what stage of preparation are the two nearest plates?", "choices": ["cleanup", "pickup", "salad", "garnish"], "correct_choice_idx": 0, "direct_answers": ["wash plate", "finished", "washing", "finished", "already eaten", "beginning", "cleanup", "end", "plating", "post eating"], "difficult_direct_answer": true, "rationales": ["Plates are normally used to hold food while people eat. when it become empty or close to it, it is time for cleanup.", "Someone has already eaten off them and just crumbs remain", "The plates' contents have been eaten and they need to be washed."], "image": "train2014/COCO_train2014_000000360870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331203, "question_id": "LgJGnUE2F6ys3ySXMnjvRz", "question": "The contractors that build bridges always need to ensure that they are than the train?", "choices": ["higher", "equal", "wider", "smaller"], "correct_choice_idx": 0, "direct_answers": ["higher", "higher", "higher", "higher", "higher", "taller", "higher", "taller", "higher", "higher"], "difficult_direct_answer": false, "rationales": ["The contractors need to ensure the train doesn't hit the bridge.", "The train needs to have enough clearance to not hit the bridge.", "Bridges must be taller than the trains below them, otherwise, damage will occur to both the train and the bridge when the train passes underneath."], "image": "train2014/COCO_train2014_000000331203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143151, "question_id": "LgJQr7ecBb3azQ5RKKTq2C", "question": "What is sold from the red box on the sidewalk?", "choices": ["umbrellas", "raincoats", "newspapers", "ball caps"], "correct_choice_idx": 2, "direct_answers": ["newspapers", "cake", "newspapers", "cake", "newspapers", "newspapers", "newspapers", "newspapers", "newspapers", "cake"], "difficult_direct_answer": false, "rationales": ["Newspapers are sold.", "The box is an old fashioned news stand machine.", "Those boxes are usually used to sell the daily news bulletin."], "image": "train2014/COCO_train2014_000000143151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400431, "question_id": "LgKCucmydQjcS4KbaqGZax", "question": "What is the most likely income level for most people living in this area?", "choices": ["middle", "high", "wealthy", "low"], "correct_choice_idx": 3, "direct_answers": ["low", "sea side", "lower tier", "low", "low", "poor", "fishing", "low", "fishing", "low income"], "difficult_direct_answer": false, "rationales": ["The boats in the picture look old.", "The boats look rusty and dilapidated.", "Fishing is not a lucrative business for a single fisherman."], "image": "val2014/COCO_val2014_000000400431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159889, "question_id": "LgMzLbYYZmsA2o7bjuup4m", "question": "What is the girl holding?", "choices": ["cell phone", "remote", "kindle", "book"], "correct_choice_idx": 0, "direct_answers": ["phone", "phone", "phone", "phone", "cellphone", "cellphone", "phone", "cell phone", "cellphone", "cellphone"], "difficult_direct_answer": false, "rationales": ["The girl is holding her cell phone.", "The girl is holding a black cell phone in her hand.", "She is holding her phone and looking at it."], "image": "train2014/COCO_train2014_000000159889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78865, "question_id": "LgQjNobSFsgZtF28wo2cEV", "question": "How would you travel through this area?", "choices": ["by bike", "by kayak", "by boat", "by train"], "correct_choice_idx": 3, "direct_answers": ["by train", "train", "by train", "by train", "train", "train", "train", "by train", "train", "train"], "difficult_direct_answer": false, "rationales": ["People would use a train.", "There is a railway with an engine on it.", "There is one traveling along the nearby track."], "image": "val2014/COCO_val2014_000000078865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442019, "question_id": "LghDjNpnFbmFEXURTzx5LB", "question": "What did 10 just do to the ball?", "choices": ["missed it", "ran it", "bunted", "hit it"], "correct_choice_idx": 3, "direct_answers": ["hit", "hit it", "hit", "hit it", "hit", "swing bat", "hit", "hit ball", "baseball", "hit it"], "difficult_direct_answer": false, "rationales": ["The player hit the ball.", "The baseball player belliard has his bat behind him and is lunged forward indicating that he just hit a baseball.", "They swung the bat as the ball came to them"], "image": "val2014/COCO_val2014_000000442019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382316, "question_id": "LgtS2M5P52gvXhbKbcmZrY", "question": "Why is he walking the bike?", "choices": ["stolen bike", "bike broken", "flat tire", "holding umbrella"], "correct_choice_idx": 3, "direct_answers": ["taking break", "flat tire", "flat tire", "uphill", "tired", "tired", "tired", "too hot", "holding umbrella", "hot"], "difficult_direct_answer": false, "rationales": ["The man has an umbrella.", "You need to hands to ride a bike. the man is holding an umbrella with one hand.", "The man can't ride when he has the umbrella."], "image": "train2014/COCO_train2014_000000382316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490491, "question_id": "LgvLEcdmzYAzztHG8Q4KN4", "question": "What is disallowed around this area?", "choices": ["waiting", "turning left", "horning", "parking"], "correct_choice_idx": 2, "direct_answers": ["horns", "noise", "no horn", "horn honking", "horns", "alto", "horn honking", "horning", "playing", "music"], "difficult_direct_answer": false, "rationales": ["The sign says not to do it.", "You can not play a horn or trumpet near this area.", "Horning around this area is strictly prohibited."], "image": "val2014/COCO_val2014_000000490491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182571, "question_id": "Lh6YT7DFNfo4ZDAGt8t69Q", "question": "What is the woman wearing on her head?", "choices": ["baseball cap", "bandana", "straw hat", "headband"], "correct_choice_idx": 3, "direct_answers": ["santa hat", "hat", "santa hat", "headband", "santa hat", "santa hat", "hat", "santa hat", "hat", "hat"], "difficult_direct_answer": false, "rationales": ["The woman is wearing a headband.", "The woman has a headband.", "The woman is wearing a headband around her hat."], "image": "train2014/COCO_train2014_000000182571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124128, "question_id": "LhMfFFNn2Whpz4oDudgMXp", "question": "Which part of the symbol was crossed out?", "choices": ["stop", "go forward", "turn left", "turn right"], "correct_choice_idx": 1, "direct_answers": ["arrow", "top", "left", "middle", "sidewalk", "up arrow", "symbol", "arrows", "normal", "go forward"], "difficult_direct_answer": true, "rationales": ["The up arrow typically indicates that a driver, pedestrian, etc. go in the a direction.", "A street sign with arrows pointing to each side is shown with black covering and area in the middle.", "The go forward part of the symbol was crossed out."], "image": "val2014/COCO_val2014_000000124128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253262, "question_id": "LhNkprNj9ADbXwoDAZKQR7", "question": "What does the truck with yellow and red and white on it serve to do here?", "choices": ["issue citations", "provide snacks", "fuel plane", "sell icecream"], "correct_choice_idx": 2, "direct_answers": ["fuel plane", "provide fuel", "fuel", "fuel plane", "fuel airplane", "fueling", "fuel planes", "fuel supply", "gasoline fill", "gas"], "difficult_direct_answer": true, "rationales": ["The shell logo can be seen on the truck. shell is an oil and gas company.", "A truck with the logo for a gas company is parked beside a plane. planes are fueled before takeoff.", "The truck is fueling the plane."], "image": "val2014/COCO_val2014_000000253262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322562, "question_id": "LhTsNw4vQmEVvMRjcDdbE3", "question": "Which type action propels the nearest boat forward?", "choices": ["poking", "motor", "none", "rowing"], "correct_choice_idx": 0, "direct_answers": ["pushing", "paddling", "paddling", "oaring", "setting pole", "poking", "paddling", "sea", "rowing", "rowing"], "difficult_direct_answer": false, "rationales": ["The action is poking.", "The water is fairly shallow so the poles are used to push from the bottom", "In shallow waters with shallow punts such as this, poles are used to stab the bottom of the river to propel it along."], "image": "train2014/COCO_train2014_000000322562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61566, "question_id": "LhgGRsUZaaZAgZRYVUCiiL", "question": "What is tied to the surfers foot?", "choices": ["jewels", "surf board", "ankle monitor", "dog"], "correct_choice_idx": 1, "direct_answers": ["leash", "leash", "rope", "rope", "surfboard", "surfboard", "surf board", "line", "string", "tether"], "difficult_direct_answer": false, "rationales": ["The strap on the man's ankle is attached to the surfboard under him.", "It's a tether so the board doesn't get lost in the water", "It is a strap that keep the surfboard with him if he falls off."], "image": "val2014/COCO_val2014_000000061566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147409, "question_id": "Lhtify5Qc9bjtRmsWHLxKy", "question": "Who is the road for?", "choices": ["pedestrians", "trucks", "bicycles", "drivers"], "correct_choice_idx": 3, "direct_answers": ["traffic", "everyone", "cars", "cars", "buses", "pedestrian", "drivers", "buses", "busses", "pedestrians"], "difficult_direct_answer": false, "rationales": ["Cars drive on the road.", "The road is for vehicles.", "The road is used for truckers."], "image": "val2014/COCO_val2014_000000147409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118173, "question_id": "LiDFbEJhnnki6VuVrwUMQj", "question": "Where is this bathroom found?", "choices": ["school", "hospital", "hotel", "home"], "correct_choice_idx": 3, "direct_answers": ["house", "indoors", "apartment", "home", "in house", "house", "home", "home", "home", "in house"], "difficult_direct_answer": false, "rationales": ["Homes have bathrooms.", "This bathroom is found inside somebody's home.", "The bathroom is in someone's home."], "image": "train2014/COCO_train2014_000000118173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11567, "question_id": "LiZANC4VnubKeeyPerxNc6", "question": "Why does the man have bulging pockets?", "choices": ["smuggling fruit", "carrying balls", "comfort", "fashion"], "correct_choice_idx": 1, "direct_answers": ["tennis bal", "tennis balls", "tennis balls", "tennis balls", "tennis balls", "carrying balls", "balls", "tennis balls", "balls", "extra balls"], "difficult_direct_answer": false, "rationales": ["The man has the balls.", "He has extra ones so he can be ready to serve if he loses one", "There are balls in his pockets. he is playing tennis."], "image": "val2014/COCO_val2014_000000011567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380734, "question_id": "Liut38333gfU7Qtufd8eX5", "question": "What kind of meal are they having?", "choices": ["steak", "chicken", "lobster", "vegetarian"], "correct_choice_idx": 2, "direct_answers": ["lunch", "lunch", "lobster", "lobster", "lunch", "lunch", "lobster", "lunch", "lunch", "lunch"], "difficult_direct_answer": false, "rationales": ["There are people wearing plastic bibs over there shirts which is associated with eating lobster as it is a food that is messy and sprays. there is also a red food on the nearest plates that looks like a lobster.", "They are wearing bibs showing an animal on it with claws.", "The red shell is on the plate"], "image": "val2014/COCO_val2014_000000380734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57138, "question_id": "Liwdx45nw74oACXPgtj8Hv", "question": "What type of fruit is this exotic variation most related to?", "choices": ["kiwi", "raspberry", "banana", "grape"], "correct_choice_idx": 2, "direct_answers": ["banana", "banana", "banana", "bananas", "bananas", "banana", "banana", "banana", "banana", "bananas"], "difficult_direct_answer": false, "rationales": ["The fruit grows on a tree. the fruit is elongated.", "It has the same basic shape and peel", "These grown in bunches"], "image": "train2014/COCO_train2014_000000057138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553443, "question_id": "Lj7JhUPWhQ7FKSWdLB2mb9", "question": "What is being eaten?", "choices": ["plum", "apple", "jawbreaker", "orange"], "correct_choice_idx": 1, "direct_answers": ["apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["An apple is being eaten.", "You can see a little bit of red on the fruit.", "The man has a red fruit."], "image": "val2014/COCO_val2014_000000553443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395040, "question_id": "LjCpYAxqqN88bdAvjwuQ8P", "question": "Which weapon in usage most resembles the object on her back?", "choices": ["spear", "rocket launcher", "mace", "crossbow"], "correct_choice_idx": 3, "direct_answers": ["shotgun", "sword", "sword", "rifle", "rifle", "crossbow", "gun", "gun", "rifle", "rifle"], "difficult_direct_answer": false, "rationales": ["A crossbow looks like the object on the woman's back.", "Crossbows are the same shape as the object on her back.", "The weapon is the crossbow."], "image": "train2014/COCO_train2014_000000395040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56638, "question_id": "LjD3ek9vpFgKzd7vFyZbsv", "question": "Where do the bench and chairs appear to be located?", "choices": ["city square", "indoors", "sidewalk", "park"], "correct_choice_idx": 1, "direct_answers": ["indoors", "indoors", "distance chair", "outside", "downtown", "office patio", "sun room", "inside", "outside", "inside"], "difficult_direct_answer": false, "rationales": ["They look to be outside in a public place.", "The ground is made out of cement and you can see the light shining from the sun.", "The bench and chairs appear to be in a building."], "image": "val2014/COCO_val2014_000000056638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395745, "question_id": "LjSxC4LAHgaqYLGJkvkS8v", "question": "What vegetable was used to make the side dish for this sandwich lunch?", "choices": ["peas", "endive", "celery", "potato"], "correct_choice_idx": 3, "direct_answers": ["potato", "potatoes", "burger", "potato", "potatoes", "potato", "potato", "tomato", "lettuce", "potato"], "difficult_direct_answer": false, "rationales": ["The side dish is potato chips which are made of potatoes.", "The sandwich on the plate has a side of chips that are made from potatoes.", "Potatoes are used for the chips."], "image": "val2014/COCO_val2014_000000395745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268711, "question_id": "LjcptyPxVsCztGchTtkUsd", "question": "What is the destination of the bus?", "choices": ["rose mountain", "rosemont", "197", "mont"], "correct_choice_idx": 1, "direct_answers": ["rosemont", "rosemont", "rosemont", "rosemont", "rosemont", "rosemont", "rosemont", "rosemont", "rosemont", "rosemont"], "difficult_direct_answer": false, "rationales": ["It's what is on the destination display of the bus.", "The destination is rosemont.", "On the top of the bus in lights it says where the bus will be going next."], "image": "train2014/COCO_train2014_000000268711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551172, "question_id": "LkNB5pCbFbak96uLS27U56", "question": "What is the person on the right's hand touching?", "choices": ["cat", "snow", "donkey", "tree"], "correct_choice_idx": 1, "direct_answers": ["snow", "snow", "snow", "snow", "snow", "snow", "snow", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The person on the right has their hand near the snow.", "It's white and frozen and they are on snowboards", "It is frozen water that turns white."], "image": "train2014/COCO_train2014_000000551172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129926, "question_id": "LkPnYLeobeowEFBsqf7anx", "question": "What is the object on the beach used for?", "choices": ["kiteboarding", "saving people", "surfing", "windsurfing"], "correct_choice_idx": 1, "direct_answers": ["surfing", "surfing", "saving lives", "surfing", "saving people", "surfing", "rescue", "surfboarding", "surfing", "saving person"], "difficult_direct_answer": false, "rationales": ["This is a lifeguard's surfboard.", "The object is to save people.", "A yellow, large surfboard is propped up on the beach and has a red cross on it. a red cross is designated to indicate medical personel."], "image": "train2014/COCO_train2014_000000129926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54363, "question_id": "LkU2tzK7bj3a9sbzZX9oHf", "question": "What dark fruit was used to top the salad?", "choices": ["peaches", "plums", "pears", "raisins"], "correct_choice_idx": 3, "direct_answers": ["raisin", "raisins", "grapes", "raisins", "cranberries", "raisin", "dried cranberries", "prunes", "raisin", "raisins"], "difficult_direct_answer": false, "rationales": ["The raisins are on the top of the salad.", "This looks like a dried fruit and is either a craisin or a raisin.", "There are dried purple fruits."], "image": "train2014/COCO_train2014_000000054363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561346, "question_id": "LkZH9cegyk7baJphhkZk44", "question": "What type of remote is the man holding?", "choices": ["xbox", "playstation 5", "tv", "nintendo wii"], "correct_choice_idx": 3, "direct_answers": ["wii", "tv remote", "nintendo wii", "tv", "wii", "wii", "wii", "wii", "wiimote", "modern"], "difficult_direct_answer": false, "rationales": ["The remote is for wii.", "He is holding a video game controller that is white. it is shaped like a remote.", "Remotes for the wii are white with the string on the back."], "image": "train2014/COCO_train2014_000000561346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408418, "question_id": "LkdesyMNHcCgL4HZJVxMKm", "question": "The canned beverages for sale in the lower left corner were produced in which country?", "choices": ["vietnam", "china", "japan", "thailand"], "correct_choice_idx": 2, "direct_answers": ["japan", "japan", "thailand", "philippines", "japan", "mexico", "mexico", "japan", "china", "japan"], "difficult_direct_answer": false, "rationales": ["The beverages are from japan.", "The letters are from japan.", "Goya dry is made by helios distillery co. in okinawa."], "image": "train2014/COCO_train2014_000000408418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354608, "question_id": "LksFhtfKJsoh3hbK43wWZR", "question": "What professional would most likely have this collection of books?", "choices": ["veterinarian", "ornithologist", "pest control", "marine biologist"], "correct_choice_idx": 1, "direct_answers": ["bird keeper", "ornithologist", "vet", "ornithologist", "ornithologist", "teacher", "birdwatcher", "biology professional", "bird professional", "ornithologist"], "difficult_direct_answer": false, "rationales": ["The books are about birds, not marine animals, pests, or other animals.", "The books on the shelf are about birds which would be interesting for an ornithologist or bird-lover.", "These books are all about birds."], "image": "train2014/COCO_train2014_000000354608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152633, "question_id": "Lky3L7UAxp84sDDvScs9QS", "question": "What is the player ready to do?", "choices": ["swing", "kick", "dunk", "tackle"], "correct_choice_idx": 1, "direct_answers": ["kick ball", "kick ball", "kick", "kick ball", "kick", "kick", "kick ball", "kick ball", "kick", "kick ball"], "difficult_direct_answer": false, "rationales": ["When playing soccer you are not allowed to use your hands or intentionally harm the other players.", "The player has set the ball up so it can be kicked to make a goal or to another player.", "He's running behind a moving soccer ball"], "image": "train2014/COCO_train2014_000000152633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525555, "question_id": "Lm6fBy6tZeTSrAjN9UmPMv", "question": "What type of seating is under the umbrella?", "choices": ["sofa", "rocking chair", "lounge chair", "adirondack chair"], "correct_choice_idx": 2, "direct_answers": ["lawn chairs", "beach", "lounge chair", "lounge chair", "beach chairs", "beach chairs", "lounge chair", "lounge", "lawn chairs", "beach chair"], "difficult_direct_answer": false, "rationales": ["There is a lounge chair.", "There is a lounge chair sitting underneath of the umbrella.", "These chairs you can put in different positions to lay down or sit up."], "image": "train2014/COCO_train2014_000000525555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524248, "question_id": "Lm8o4Vp25tXAJKCz42gJgh", "question": "What province in this location?", "choices": ["maine", "alberta", "victoria", "ontario"], "correct_choice_idx": 2, "direct_answers": ["melbourne", "victoria", "melbourne", "flinders", "victoria", "flinders", "australia", "ontario", "victoria", "flinders"], "difficult_direct_answer": false, "rationales": ["The province is victoria.", "Flinders street station is in victoria, australia.", "It's a well known location in canada"], "image": "train2014/COCO_train2014_000000524248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288576, "question_id": "LmEbcafduFxCR2BDCH7kzk", "question": "What weapon works similar to the item the man is looking at?", "choices": ["warhammer", "rapier", "chakram", "mace"], "correct_choice_idx": 2, "direct_answers": ["chakram", "boomerang", "javelin", "wheel", "throwing star", "grenade", "boomerang", "ninja star", "boomerang", "chakram"], "difficult_direct_answer": false, "rationales": ["The weapon is a disc shaped blade meant for throwing.", "The chakram is a circular weapon thrown through the air like a frisbee.", "The man is looking at a frisbee. a ranged weapon works in a similar manner."], "image": "val2014/COCO_val2014_000000288576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67748, "question_id": "LmQdwRVMMY9R9Ga6WUNeey", "question": "What is under the blanket?", "choices": ["rabbit", "balloon", "child", "cat"], "correct_choice_idx": 2, "direct_answers": ["girl", "child", "child", "child", "bed", "child", "stool", "child", "kid", "child"], "difficult_direct_answer": false, "rationales": ["A little girl is lying on a small bed under the covers.", "The girl is under.", "A kid is wrapped up in a blanket."], "image": "train2014/COCO_train2014_000000067748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562241, "question_id": "LmZfQQiqX4AvWqQaq34uq8", "question": "What ski skill level have the line of young people shown here?", "choices": ["beginner", "olympic", "intermediate", "pro"], "correct_choice_idx": 0, "direct_answers": ["skating", "beginner", "beginner", "beginner", "novice", "advanced", "beginners", "novice", "basic", "beginner"], "difficult_direct_answer": false, "rationales": ["The kids are in a skiing glass so they have just starting skiing.", "The children appear to be in a lesson based on their orientation on the hill in a line. if one is receiving lessons and they are young, they are likely to be a beginner.", "The level is for beginners."], "image": "val2014/COCO_val2014_000000562241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524216, "question_id": "LmsHvENHSSK8fPaQSJxcQ3", "question": "What does the business sell?", "choices": ["animals", "books", "gas", "electronics"], "correct_choice_idx": 2, "direct_answers": ["gas", "gasoline", "fuel", "fuel", "gas", "gas", "gas", "gasoline", "gas", "petroleum gas"], "difficult_direct_answer": false, "rationales": ["The sign has a royal dutch shell logo. prices per gallon are below the logo.", "The sign is for a shell station where fuel can be purchased to power vehicles.", "The shell sign has gasoline prices."], "image": "val2014/COCO_val2014_000000524216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14677, "question_id": "LnJnUrZ9UgVFnC4EKJpVFR", "question": "What are the men using in their hands?", "choices": ["baseball glove", "hockey stick", "boxing glove", "skiis"], "correct_choice_idx": 3, "direct_answers": ["skies", "skiing", "ski poles", "poles", "ski poles", "ski poles", "balancing", "poles", "ski poles", "skiis"], "difficult_direct_answer": false, "rationales": ["The men are trying to keep their skis moving along.", "The men are skiing.", "The men are on a hill that is covered in snow. they are not playing hockey, baseball, or boxing."], "image": "train2014/COCO_train2014_000000014677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352739, "question_id": "LnNv26rqyyaEen4QU5d9N9", "question": "What game often played in bars is played by the occupant here?", "choices": ["cornhole", "win draw", "darts", "quarters"], "correct_choice_idx": 2, "direct_answers": ["darts", "darts", "darts", "darts", "pool", "billiards", "darts", "interesting game", "darts", "video game"], "difficult_direct_answer": false, "rationales": ["There is a circular board to throw darts at.", "A circular board of even thickness of about 3\" is in this room. the circle is divided into pie shapes and there are numbers around the edges.", "There is a board on the table."], "image": "train2014/COCO_train2014_000000352739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12228, "question_id": "LnRFcZsNiWXiN8JhsP8xtJ", "question": "It is safest for dogs to sit in which car seat?", "choices": ["drivers seat", "front seat", "back seats", "trunk"], "correct_choice_idx": 2, "direct_answers": ["back", "backseat", "back", "backseat", "back seat", "back", "passenger", "back seats", "back seat", "kids"], "difficult_direct_answer": false, "rationales": ["It's far too easy for them to hit a hard dashboard when in the front and they can distract a driver. the trunk is also bad because they won't get enough air and knock around too much.", "Dogs should sit in the back so they don't get impacted as much by accidents.", "There are airbags in the front seat which could harm an animal and they cannot drive. the trunk is also an illegal place to put a living thing."], "image": "train2014/COCO_train2014_000000012228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176925, "question_id": "LnWQECy6ueMx38RHNpXPDC", "question": "What is the long fabric item hanging down the elephant's side?", "choices": ["blanket", "curtain", "banner", "flag"], "correct_choice_idx": 2, "direct_answers": ["banner", "blanket", "banner", "canvas", "banner", "banner", "banner", "banner", "banner", "banner"], "difficult_direct_answer": false, "rationales": ["The long fabric item on the side of the elephant contains a set of letters and words. when these words are put together and displayed for others to see, it most likely part of a banner.", "As indicated by the text on it.", "The elephant has a banner on it's back."], "image": "train2014/COCO_train2014_000000176925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399759, "question_id": "LndmBFch3PpwLPsshYGi73", "question": "Why is the woman holding the hot dog in her hand?", "choices": ["to cook", "to sell", "to eat", "to throw"], "correct_choice_idx": 2, "direct_answers": ["hungry", "eating", "taking picture", "to eat", "to pose", "eating it", "eating", "to eat", "to eat", "to eat"], "difficult_direct_answer": false, "rationales": ["She looks like she is getting ready to eat it.", "She does not have more than one hotdog so selling them would be useless. the hotdog is already cooked and throwing it would be wasteful.", "The woman wants to eat."], "image": "val2014/COCO_val2014_000000399759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503278, "question_id": "Lntay9FhvCTJmZ33oqReYz", "question": "They are dancing where?", "choices": ["bar mitzvah", "wedding reception", "retirement party", "birthday party"], "correct_choice_idx": 1, "direct_answers": ["wedding", "ballroom", "wedding", "wedding", "wedding", "ballroom", "wedding", "reception", "wedding reception", "wedding"], "difficult_direct_answer": false, "rationales": ["There is a wedding cake on the table. some girls have on bridemaids' dress and one woman is wearing a wedding gown.", "There is a large cake in the back.", "There is a cake and a ring."], "image": "val2014/COCO_val2014_000000503278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475610, "question_id": "LoJiBFkS8EiudLoEc5gPZh", "question": "What tourist attraction is this likely to be?", "choices": ["circus", "zoo", "reserve", "savannah"], "correct_choice_idx": 1, "direct_answers": ["zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "zoo"], "difficult_direct_answer": false, "rationales": ["The elephants look to be in an area that is mimicked after their natural environment, but a fence can be seen surrounding them, indicating they are in captivity at a zoo.", "Many elephants are wading around water and metal fencing can be seen. zoos use fencing.", "This is an animal that is commonly seen at a zoo. there is a fence around these animals which would be consistent with these animals appearing in a zoo."], "image": "train2014/COCO_train2014_000000475610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248142, "question_id": "LogXWu2RM8K8dA3qzpYsuA", "question": "What is inside the green bottle on the table?", "choices": ["juice", "beer", "champagne", "wine"], "correct_choice_idx": 1, "direct_answers": ["beer", "beer", "beer", "beer", "beverage", "beer", "beverage", "beer", "beverage", "beer"], "difficult_direct_answer": false, "rationales": ["The beer is inside.", "Beer comes in green bottles and the brand name makes beer.", "This is a type of beer that is served in this bottle."], "image": "val2014/COCO_val2014_000000248142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568873, "question_id": "LozkhhJgWrNhusVsQqLsAv", "question": "What type of court is in the background of the photo?", "choices": ["lacrosse", "badminton", "basketball", "baseball"], "correct_choice_idx": 2, "direct_answers": ["basketball", "basketball", "basketball", "basketball", "basketball", "basketball court", "basketball", "basketball", "basketball court", "basketball court"], "difficult_direct_answer": false, "rationales": ["There is a basketball court sitting in the background.", "The background court is a basketball court.", "There are hoops mounted on backboards attached to a tall pole."], "image": "train2014/COCO_train2014_000000568873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168500, "question_id": "LpHpKGWFFWuSWQvvsSNUdp", "question": "What type of dog is this?", "choices": ["chow", "black lab", "poodle", "chihuahua"], "correct_choice_idx": 1, "direct_answers": ["pomerian", "rottweiler", "lab", "black lab", "hungarian", "labrador", "black lab", "black labrador", "black labrador", "black dog"], "difficult_direct_answer": false, "rationales": ["The dog in the bathroom is a large black labrador.", "The dog is a black lab.", "A dog is in a bathroom of a home. the dog is black and a large sized dog. black labs are common family pets."], "image": "train2014/COCO_train2014_000000168500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251079, "question_id": "LpYt3RgjmoJTkK4X9MjRaz", "question": "What is the same color as the door?", "choices": ["orange", "lime", "strawberry", "carrot"], "correct_choice_idx": 1, "direct_answers": ["bicycle", "grass", "green", "green", "plant", "bike", "plant", "lime", "bicycle", "plant"], "difficult_direct_answer": false, "rationales": ["The color is like a lime.", "The door is green, not red or orange.", "The door is green. strawberries are red, oranges are orange, and carrots are also orange."], "image": "train2014/COCO_train2014_000000251079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393486, "question_id": "Lpa7Xwf7pLKkzX5iCdavPS", "question": "What purpose does the cord connecting to the shower faucet provide?", "choices": ["defense", "bend air", "maneuverability", "hold towel"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "extension", "more reach", "shower sprayer", "maneuverability", "spray", "toilet", "water hose", "moves water"], "difficult_direct_answer": true, "rationales": ["The cord allows one to move the shower head.", "The cord allows the faucet to move.", "The cord allows the person to move the faucet around and get hard-to-reach areas of their body."], "image": "train2014/COCO_train2014_000000393486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123579, "question_id": "LpaVJVnCHD4mwy4ED682Bf", "question": "What's the long thing on the phone for?", "choices": ["reading", "decoration", "drawing", "signal"], "correct_choice_idx": 3, "direct_answers": ["connection", "reception", "antenna", "antennae", "antenna", "antenna", "antenna", "antenna", "signal", "reception"], "difficult_direct_answer": false, "rationales": ["The antenna was needed to use the phone.", "The man is trying to get a signal.", "There is an antenna on the phone."], "image": "train2014/COCO_train2014_000000123579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127360, "question_id": "LqWdAWMowazQcD8UWBb3rL", "question": "This plane is away from the airport so the pilot must be preparing for what?", "choices": ["take off", "attack", "cruise around", "landing"], "correct_choice_idx": 0, "direct_answers": ["takeoff", "take off", "takeoff", "raising", "takeoff", "take off", "takeoff", "takeoff", "take off", "take off"], "difficult_direct_answer": false, "rationales": ["It is being assisted to another spot at the airport", "It is taxi-ing in the opposite direction of the airport, indicating that it is leaving, not coming, and to leave it must take off.", "When a plane is landing, it faces towards the building. this plane is being towed by a smaller vehicle and so is most likely preparing for take off."], "image": "train2014/COCO_train2014_000000127360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45923, "question_id": "LqgMLGov8p7uGrnpwb2K9i", "question": "What type of hat is the man wearing?", "choices": ["chef", "baseball", "top", "fedora"], "correct_choice_idx": 3, "direct_answers": ["fedora", "trilby", "fedora", "fedora", "fedora", "homburg", "straw", "fedora", "cowboy", "straw"], "difficult_direct_answer": false, "rationales": ["The brim of a fedora hat goes all the way around the hat.", "It has a distinct short brim and crease in the top.", "That's what the hat is called."], "image": "train2014/COCO_train2014_000000045923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426754, "question_id": "LqingdRPgUzrcW6Sxr4vXj", "question": "What nationality were the founders of this helmet company?", "choices": ["italian", "swiss", "french", "russian"], "correct_choice_idx": 0, "direct_answers": ["italian", "usa", "italian", "american", "british", "americans", "dutch", "italian", "italian", "americans"], "difficult_direct_answer": false, "rationales": ["The nationality is italian.", "The founders of boeri helmets were from milan, italy.", "This particular one is from italian makers."], "image": "val2014/COCO_val2014_000000426754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516902, "question_id": "LqqMNZbHNV89PRcBr72QKD", "question": "Why is she holding the ball?", "choices": ["will serve", "for sale", "hiding it", "will throw"], "correct_choice_idx": 0, "direct_answers": ["serving", "will serve", "serving", "tennis", "to serve", "to serve", "serving", "to serve", "to hit", "serving"], "difficult_direct_answer": false, "rationales": ["Holding and throwing the ball up at the beginning of the match is to hit it over the net; this is called 'serving'.", "This woman holds the ball on the tips of her fingers underhanded; in her other hand is a tennis racket. it is most likely she will throw this ball into the air and swing at it aiming for somewhere opposite the tennis net.", "The woman is serving."], "image": "val2014/COCO_val2014_000000516902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576652, "question_id": "Lr5jfX8Si25v6tARtCgjpp", "question": "What's the name of the spot the player is standing on?", "choices": ["home grass", "out field", "ball field", "pitcher's mound"], "correct_choice_idx": 3, "direct_answers": ["baseball", "pitcher's mound", "not clear", "pitcher's mound", "pitchers mound", "mound", "pitchers mound", "pitcher's mound", "mound", "mound"], "difficult_direct_answer": false, "rationales": ["The guy is pitching the ball and the space is called the pitcher's mound.", "The player is pitching from the mound.", "The name is the mound."], "image": "val2014/COCO_val2014_000000576652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2867, "question_id": "LrDKRGuNVEHKLv8KAryyec", "question": "What is taking place here?", "choices": ["pep speech", "skiing lessons", "protest", "punishment"], "correct_choice_idx": 1, "direct_answers": ["skiing", "skiing lessons", "ski trip", "ski preparation", "skiing", "skiing", "skiing", "plan making", "conversing", "skiing lessons"], "difficult_direct_answer": false, "rationales": ["We see a small group assembled on skis. they appear to be awaiting instruction pertaining to their skiing.", "They are taking a skiing class.", "There are a group of skiers standing together."], "image": "val2014/COCO_val2014_000000002867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575228, "question_id": "LrMZhHLAxKLQ6MB7uxsAqM", "question": "In what nation is this apartment likely to be situated?", "choices": ["england", "denmark", "france", "canada"], "correct_choice_idx": 1, "direct_answers": ["poland", "italy", "italy", "denmark", "poland", "america", "italy", "italy", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["The language that the sign on the fridge is written is that of the country.", "The apartment is in denmark.", "The word on the sticker is in the language spoken in the country identified in option a."], "image": "train2014/COCO_train2014_000000575228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67994, "question_id": "LrSsf8HmMXtCVwxWSRCXgP", "question": "This University is affiliated with what denomination?", "choices": ["baptist", "lutheran", "mormon", "methodist"], "correct_choice_idx": 3, "direct_answers": ["high point", "unknown", "methodist", "roman", "church", "mormon", "anglican", "christian", "high point", "welch"], "difficult_direct_answer": true, "rationales": ["The university is methodist.", "The chapel exists to help hpu live out its methodist heritage in meaningful ways for today's world.", "The clock has high point university markings. this university is not affiliated with mormons, baptists, or lutherans."], "image": "val2014/COCO_val2014_000000067994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168909, "question_id": "LrZPzmvHemjEEDSNGngrND", "question": "How many Muskmelons are there?", "choices": ["four", "three", "one", "two"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "1 melon", "one", "one", "one", "one", "1 melon", "one", "1 melon"], "difficult_direct_answer": false, "rationales": ["There is only one melon.", "Only one melon is shown with the fruit.", "A single melon can be seen closest to the camera in a basket."], "image": "val2014/COCO_val2014_000000168909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527440, "question_id": "Lra7x6eAPENA9xjreYcLPU", "question": "Why is the man wearing a bucket hat?", "choices": ["his style", "sun protection", "as joke", "dress code"], "correct_choice_idx": 1, "direct_answers": ["sun protection", "sun protection", "sun", "farmer", "sun protection", "sun protection", "sun protection", "farmer", "protection sun", "shield sun"], "difficult_direct_answer": false, "rationales": ["The man wants sunblock.", "A man is farming with a large brimmed hat on a sunny day. hats are used to block sun.", "He is wearing it to keep the sun out of his eyes."], "image": "val2014/COCO_val2014_000000527440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249675, "question_id": "Lra7zAR4pkiemqPN5JXoGF", "question": "Who does this bus have particular accommodations for?", "choices": ["handicap people", "students", "athletes", "performers"], "correct_choice_idx": 0, "direct_answers": ["handicapped", "handicapped", "wheelchairs", "disabilities", "handicapped", "wheelchairs", "wheelchairs", "handicap people", "handicap", "wheelchair users"], "difficult_direct_answer": false, "rationales": ["It has a depiction of a person in a wheelchair", "Handicapped people can use the bus.", "The bus has an orange handicap sign."], "image": "train2014/COCO_train2014_000000249675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451798, "question_id": "LrdCMoPniRBbDZgEsKt8K5", "question": "Who does this room belong to?", "choices": ["man", "girl", "woman", "baby"], "correct_choice_idx": 0, "direct_answers": ["man", "man", "man", "man", "man", "man", "man", "man", "man", "man"], "difficult_direct_answer": false, "rationales": ["The room belongs to a man.", "There are ties which when usually wear.", "There are many ties hanging on the wall which would indicate a man's room."], "image": "val2014/COCO_val2014_000000451798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465223, "question_id": "Lrh6yHRJ8TQB6PazSxkPaX", "question": "At which point in the match are these players?", "choices": ["mid way", "beginning", "half time", "end"], "correct_choice_idx": 3, "direct_answers": ["end", "finals", "end", "game point", "final", "ending", "end", "finished", "match point", "end"], "difficult_direct_answer": false, "rationales": ["After a tennis match is over, the players traditionally shake hands.", "The players are close to the end since they're shaking hands.", "As indicated by their handshake."], "image": "val2014/COCO_val2014_000000465223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178941, "question_id": "LrnZZxj6FxcafQL6CfrkbF", "question": "Cats love what kind of feeling?", "choices": ["cold", "warmth", "hot", "freezing"], "correct_choice_idx": 1, "direct_answers": ["sun", "warmth", "warmth", "warmth", "warmth", "warmth", "warmth", "calmness", "warmth", "petting"], "difficult_direct_answer": false, "rationales": ["Cats love to be warm and cuddly.", "Cats love to be warm.", "The cat is feeling toasty."], "image": "val2014/COCO_val2014_000000178941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405060, "question_id": "LrpePGc7fRZFVuAu3bVLyB", "question": "Who is the woman on the red and white poster?", "choices": ["courtney cox", "patrician arquette", "gillian anderson", "roma downey"], "correct_choice_idx": 1, "direct_answers": ["model", "model", "patricia arquette", "patricia arquette", "patrician arquette", "actress", "model", "medium", "patricia arquette", "patricia arquette"], "difficult_direct_answer": false, "rationales": ["The woman is arquette.", "Patricia arquette was the star of medium and can be seen on the package in the photo.", "The red and white poster is for medium, not friends, the x-files, or touched by an angel."], "image": "train2014/COCO_train2014_000000405060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559790, "question_id": "LsVKDMfgBP7o8PFQhiCvqo", "question": "What sort of energy propels the trains here?", "choices": ["electric", "diesel", "coal", "water"], "correct_choice_idx": 0, "direct_answers": ["electricity", "electricity", "electric", "electricity", "electricity", "gas", "electric", "electric", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["The trains are hooked to power lines above them.", "The train has electric polls on top.", "There are wires running to the train."], "image": "val2014/COCO_val2014_000000559790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337814, "question_id": "LsWSUPPiHNwz68iBV6Z9xo", "question": "What do the two people holding white objects stare at?", "choices": ["mirrors", "video screen", "enemies", "each other"], "correct_choice_idx": 1, "direct_answers": ["television", "screen", "television screen", "video screen", "projection screen", "tv", "television", "television", "tv", "video game"], "difficult_direct_answer": false, "rationales": ["The people are at a video screen.", "The people are holding joysticks.", "The devices in these two's hands are used for playing games on screens."], "image": "val2014/COCO_val2014_000000337814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443360, "question_id": "LsfKQG9jYSVB9F5tZcm3Li", "question": "Why do the birds seek high up places?", "choices": ["like flying", "safety", "see more", "exercise"], "correct_choice_idx": 1, "direct_answers": ["for safety", "good", "safety", "flying", "safety", "to hide", "safety", "vision", "safety", "resting"], "difficult_direct_answer": false, "rationales": ["The birds feel safe high up in the air.", "Small birds are prey to larger animals.", "Birds in general have many predators to be on the alert for. when they are flying or sitting in a high place, it is probably for safety reasons."], "image": "train2014/COCO_train2014_000000443360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77851, "question_id": "LsfrqwVefB77YKPtv5ign8", "question": "What are these people likely waiting for to take them to their destinations?", "choices": ["train", "plane", "taxi", "bus"], "correct_choice_idx": 1, "direct_answers": ["airport", "flights", "airplane", "airplane", "airplane", "airplane", "plane", "plane", "flights", "plane"], "difficult_direct_answer": false, "rationales": ["They are waiting in an airport for their flight.", "It appears that these people are waiting at the gate for their ride in the air. they are on their electronic devices and sitting in chairs in front of large windows.", "These people are going on a plane."], "image": "train2014/COCO_train2014_000000077851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348577, "question_id": "Lsidm5Eto8JQCm9HCkLBAk", "question": "What is this area for?", "choices": ["claiming baggage", "boarding flight", "waiting", "security"], "correct_choice_idx": 0, "direct_answers": ["traveling", "baggage claim", "plane", "claiming baggage", "airport", "travel", "airport", "baggage claim", "luggage claim", "baggage claim"], "difficult_direct_answer": false, "rationales": ["There is a round conveyor belt that would turn to get luggage to people.", "People pick up their bags at this area of the airport.", "The area is for luggage."], "image": "train2014/COCO_train2014_000000348577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409217, "question_id": "LsjsuM249cStfL8weRA7xu", "question": "In which way are both the green and white foods similar?", "choices": ["both meat", "both vegetables", "both fruits", "same species"], "correct_choice_idx": 1, "direct_answers": ["cruciferous", "vegetables", "vegetables", "shape", "both vegetables", "vegetables", "cruciferous", "cruciferous", "vegetables", "colorful way"], "difficult_direct_answer": false, "rationales": ["Cauliflour and brocolli are both vegetables.", "They are broccoli and cauliflower.", "Broccoli and cauliflower are both veggies."], "image": "val2014/COCO_val2014_000000409217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474119, "question_id": "Lt6SF5QFa8X3zRvtwL95NK", "question": "What is he doing?", "choices": ["cleaning up", "speaking phone", "hiding himself", "hiding drink"], "correct_choice_idx": 1, "direct_answers": ["talking", "using phone", "talking", "talking", "talking", "speaking phone", "talking", "phoning someone", "talking phone", "talking phone"], "difficult_direct_answer": false, "rationales": ["The man is sitting on the toilet seat and talking on the telephone.", "He is holding the receiver to his ear.", "He's on the phone."], "image": "val2014/COCO_val2014_000000474119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268385, "question_id": "LtABwsJyLD345so6tSHX6Z", "question": "What is the building made of?", "choices": ["porcelain", "plastic", "metal", "wood"], "correct_choice_idx": 2, "direct_answers": ["glass", "glass", "metal", "steel", "metal", "metal", "steel glass", "glass", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["As indicated in the image. the other options don't match.", "The room appears to be tin. when it rains it will produce a sound.", "The building is built out of metallic material."], "image": "train2014/COCO_train2014_000000268385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200710, "question_id": "LtELZaWg5opjHADCq3yqnd", "question": "For what type of formal event are the plush bears being used as decoration?", "choices": ["prom", "anniversary", "wedding", "birthday"], "correct_choice_idx": 2, "direct_answers": ["wedding", "wedding", "wedding", "wedding", "wedding", "wedding", "wedding", "wedding", "wedding", "wedding"], "difficult_direct_answer": false, "rationales": ["The bears are representing a bride and groom.", "They are in formal gear, and one of them is wearing a white dress, which is most commonly known as a bridal gown.", "The bears are dressed as a bride and groom."], "image": "val2014/COCO_val2014_000000200710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51606, "question_id": "LtHEvvVefdFBqx99Yban3A", "question": "Why are they approaching each other?", "choices": ["making up", "have discussion", "random encounter", "asking directions"], "correct_choice_idx": 1, "direct_answers": ["to meet", "talking", "have discussion", "shake hands", "talking", "teammates", "talking", "match over", "sportsmanship", "shake hands"], "difficult_direct_answer": false, "rationales": ["The two woman are on a tennis court and they seem to be competitors, so the only reason they would be close to each other is to talk about something.", "The tennis players look like their about to have a conversation.", "They are trying to talk."], "image": "train2014/COCO_train2014_000000051606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128750, "question_id": "LtNrrja5tjM9hPah3Sp3Kf", "question": "What is the month depicted in the image?", "choices": ["november", "december", "january", "february"], "correct_choice_idx": 1, "direct_answers": ["december", "december", "december", "december", "december", "december", "december", "december", "december", "december"], "difficult_direct_answer": false, "rationales": ["These types of decorated trees are usually on display during the christmas season.", "The month is december.", "A christmas tree is decorated behind a man playing a video game."], "image": "train2014/COCO_train2014_000000128750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128480, "question_id": "Ltao8w8CyAmk6fG73XJ66G", "question": "What is making the women with the orange and white outfit smile?", "choices": ["tv show", "cat video", "picture", "mobile game"], "correct_choice_idx": 2, "direct_answers": ["pictures", "photos", "picture", "photo", "phone", "picture", "phone", "photos", "picture", "photos"], "difficult_direct_answer": false, "rationales": ["The woman in the orange and white outfit is smiling for the picture.", "The woman is looking at a photo on the camera.", "The woman is looking at the other's camera."], "image": "train2014/COCO_train2014_000000128480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27041, "question_id": "LtiPSroHjAYr7dSVRSxW8Z", "question": "Why is she working here?", "choices": ["left home", "sunny outside", "power source", "is hiding"], "correct_choice_idx": 2, "direct_answers": ["nice weather", "sunlight", "multitasking", "power source", "outlet access", "warm", "nice out", "near outlet", "fresh air", "laptop"], "difficult_direct_answer": true, "rationales": ["There is a common electrical power outlet located in the wall that the woman has set the computer on.", "She has her lap top plugged in there.", "The computer needs an outlet."], "image": "train2014/COCO_train2014_000000027041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85045, "question_id": "LtxfUecJtoedumP9zZepop", "question": "The wheeled vehicle parked ahead is used for what?", "choices": ["food truck", "train car", "city trolley", "oil tanker"], "correct_choice_idx": 0, "direct_answers": ["sell food", "food truck", "transportation", "transportation", "transportation", "transportation", "food truck", "transportation", "food truck", "food truck"], "difficult_direct_answer": false, "rationales": ["A sandwich is held up in front of a truck that is also a store. we can presume this sandwich was obtained from the truck.", "City trolleys run in the city and usually have open doors and windows.", "It has condiments on a shelf on the side"], "image": "val2014/COCO_val2014_000000085045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279030, "question_id": "LtzUiph8qAG2rFPJSXi9yM", "question": "What maneuver is the man trying to do?", "choices": ["swerve", "serve", "swivel", "back hand"], "correct_choice_idx": 1, "direct_answers": ["smash", "overhand serve", "hit ball", "serve", "backhand", "tennis", "serve", "serve", "forehand", "serve"], "difficult_direct_answer": false, "rationales": ["The maneuver is to serve.", "The man is playing tennis where a person with their body configured like this would be engaging in a serve shot and likely no other shot in the game.", "By the mans position on the court it is easy to ascertain what he is doing."], "image": "val2014/COCO_val2014_000000279030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4546, "question_id": "LuPwQWqdVLMLK2THVBCJ3u", "question": "What is this flying called?", "choices": ["random", "day flying", "formation", "lost"], "correct_choice_idx": 2, "direct_answers": ["airplane", "airplanes", "formation", "airplane", "formation", "aerial combat", "airshow", "synchronized", "formation", "flock"], "difficult_direct_answer": false, "rationales": ["They are all doing the same thing.", "The other options don't fit with this image. teamwork among pilots is often tested in this way.", "The planes are at equal distance from each other."], "image": "train2014/COCO_train2014_000000004546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34404, "question_id": "LuXvNT4Fg5Rew32ZHe3tW8", "question": "What other circumstances might the yellow thing on the boy on the left be used?", "choices": ["online gaming", "flying", "biking", "shopping"], "correct_choice_idx": 2, "direct_answers": ["riding bikes", "accident", "riding motorcycle", "biking", "preventing injury", "accident", "skateboarding", "biking", "skating", "safety"], "difficult_direct_answer": false, "rationales": ["The yellow thing on the boy is a helmet. helmets are not used when flying, shopping, or online gaming.", "His helmet could protect his head when on a bicycle", "The yellow helmet could also be used for biking."], "image": "train2014/COCO_train2014_000000034404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474387, "question_id": "LvGJarH8yc2B2FAEpQYjRa", "question": "Why is she there?", "choices": ["get dinner", "awaiting train", "find friend", "use phone"], "correct_choice_idx": 1, "direct_answers": ["awaiting train", "waiting embark", "ride train", "awaiting train", "waiting", "to travel", "ride train", "train rider", "waiting", "waiting"], "difficult_direct_answer": false, "rationales": ["This is the most likely reason given she's on a platform. she might be doing b if her friend is on the a transportation.", "She is standing there so she can get on the train when allowed.", "The woman is waiting for the train."], "image": "train2014/COCO_train2014_000000474387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178268, "question_id": "LvK4iqnCFmMu6QdAgqdfzP", "question": "What does the man have on?", "choices": ["sandals", "goggles", "biking helmet", "belt"], "correct_choice_idx": 3, "direct_answers": ["cowboy hat", "hat", "cowboy hat", "jeans", "denim", "cowboy hat", "cowboy hat", "cowboy hat", "blue clothing", "belt"], "difficult_direct_answer": false, "rationales": ["The man has a belt.", "He is wearing a belt to keep his pants up.", "A belt is used to support while he is riding."], "image": "train2014/COCO_train2014_000000178268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149, "question_id": "LvmTEdTCCVajHFLrhEb7rY", "question": "What sport could be played on this field easiest?", "choices": ["boxing", "basketball", "ultimate frisbee", "golf"], "correct_choice_idx": 2, "direct_answers": ["soccer", "soccer", "soccer", "ultimate frisbee", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer"], "difficult_direct_answer": false, "rationales": ["This is a good wide open area to throw a frisbee in.", "The sport is ultimate frisbee.", "Ultimate frisbee only requires a frisbee and no other equipment."], "image": "train2014/COCO_train2014_000000000149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399258, "question_id": "LvqrjF9PhvVFVb6X7BZjWP", "question": "Why do the horses wear leg coverings?", "choices": ["parade elevation", "armor", "decoration", "digging tools"], "correct_choice_idx": 1, "direct_answers": ["decoration", "bugs", "safety", "protection", "for protection", "protection", "support", "protection", "armor", "protection"], "difficult_direct_answer": false, "rationales": ["Horses are part of a police unit and wear stuff on legs to give them added protection.", "The horses are wearing protection.", "The horses are wearing plating on their ankles and heels."], "image": "train2014/COCO_train2014_000000399258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33240, "question_id": "LwT2SXyFAbWF3VHHodPqC3", "question": "What wave maker is likely very near here?", "choices": ["chicken", "slurpee machine", "ocean", "snow machine"], "correct_choice_idx": 2, "direct_answers": ["ocean water", "ocean", "tsunami", "high tide", "ocean", "ocean", "ocean", "ocena", "ocean", "beach"], "difficult_direct_answer": false, "rationales": ["This is a beach so the answer becomes obvious.", "The area is a beach, which are adjacent to this large body of water.", "This looks like it is taking place on a beach and oceans often have beaches and make waves."], "image": "train2014/COCO_train2014_000000033240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349199, "question_id": "LwxWRzLgHpV3uBKKxXmGoh", "question": "Why is the boy holding up his shoe?", "choices": ["disposing sand", "avoiding water", "beating people", "tying shoe"], "correct_choice_idx": 0, "direct_answers": ["sandy", "dump sand", "showing dirt", "showing friend", "drying", "disposing sand", "unknown", "sand inside", "view", "sand"], "difficult_direct_answer": true, "rationales": ["The boy is wearing his other shoe and is one a beach so he is likely walking on the beach and getting sand in his shoes that he may want to empty. he is also tilting his held shoe at such an angle that something inside it would pour out.", "The boy is holding his shoe upside down as if to get something out of it.", "When a person is wearing sneakers at the beach sand will get inside and they have to take off the shoe to shake it out."], "image": "val2014/COCO_val2014_000000349199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512334, "question_id": "Lx2rRE5G39ZJzxaeMfj4w7", "question": "Why are the men wearing yellow vests?", "choices": ["visibility", "punishment", "style", "peer pressure"], "correct_choice_idx": 0, "direct_answers": ["tsa", "safety", "safety", "increased visibility", "workers", "visibility", "safety", "safety measures", "visibility", "visibility"], "difficult_direct_answer": false, "rationales": ["The men are trying to be visible.", "The bright colour makes it easier to see the workers.", "The men on the tarmac are wearing yellow vests to make them visible in the dark."], "image": "val2014/COCO_val2014_000000512334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534318, "question_id": "LxZr4vY3bdznVQFEPjNgvy", "question": "What group of people is the white area on the platform built for?", "choices": ["elderly", "pregnant women", "blind", "handicapped"], "correct_choice_idx": 2, "direct_answers": ["passengers", "passengers", "train passengers", "whites", "passengers", "passengers waiting", "blind", "travellers", "passengers", "maintenance"], "difficult_direct_answer": false, "rationales": ["Blind people use the grooved white area as a way to know when to stay back.", "Is so people that cant see can feel the bumps and not go past it unless there is a train there.", "The blind need the platform."], "image": "train2014/COCO_train2014_000000534318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331667, "question_id": "LxhrzYGYzBz2tSZdF867Rq", "question": "What does Stafford indicate?", "choices": ["school name", "next passenger", "passenger's name", "stop name"], "correct_choice_idx": 3, "direct_answers": ["station", "stop name", "train station", "station location", "stop name", "destination", "train stop", "station name", "destination", "city"], "difficult_direct_answer": false, "rationales": ["It is a big sign on a train platform next to the train, indicating to passengers the location of where they currently are.", "One can see the train in the background, so the sign must refer to that location's name.", "It appears to be a sign at the train station shop, as there is a train pictured."], "image": "val2014/COCO_val2014_000000331667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494019, "question_id": "Ly6WB6sK7ettbznVoVRQne", "question": "Why does he have on that type of vest?", "choices": ["avoiding bullet", "blending in", "standing out", "carry items"], "correct_choice_idx": 1, "direct_answers": ["for protection", "warmth", "camouflage", "soldier", "warmth", "camouflage", "blending in", "army", "hunting safety", "military personal"], "difficult_direct_answer": false, "rationales": ["The man's clothes are the same color as the snow.", "It is camo so along with his white clothing he melts into the area more because there is snow and leafless vegetation", "The vest is camouflage and meant to blend someone into their surroundings in forested areas."], "image": "train2014/COCO_train2014_000000494019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521613, "question_id": "LyNQ3Q69jLiZj9iRtxLPEr", "question": "What is the main source of light in the room?", "choices": ["lantern", "window", "fireplace", "torch"], "correct_choice_idx": 1, "direct_answers": ["sunshine", "natural sunlight", "window", "window", "sun", "sunlight", "window", "window", "window", "sun"], "difficult_direct_answer": false, "rationales": ["There is a large window in the room where most of the light is coming from.", "The source is the window.", "The sun is coming in the room through the glass window."], "image": "val2014/COCO_val2014_000000521613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123777, "question_id": "LyParHZeAGXeLAHiPeKbWi", "question": "What is inside the green and yellow object on the sidewalk?", "choices": ["mud", "water", "beer", "candy"], "correct_choice_idx": 1, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["A dark colored hydrant can be seen where someone can open it to put out fires.", "A fire hydrant is the main focus.", "It is a fire hydrant."], "image": "train2014/COCO_train2014_000000123777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272632, "question_id": "LyUUAmLmqHkuQKQAfyS8oH", "question": "Which one was born last?", "choices": ["they're triplets", "middle bunk", "bottom bunk", "top bunk"], "correct_choice_idx": 2, "direct_answers": ["top bunk", "red pj's", "bottom child", "bottom boy", "bottom child", "bottom", "lowest bunk", "bottom baby", "bottom bunk", "bottom"], "difficult_direct_answer": false, "rationales": ["The baby is on the bottom bunk.", "The child on the lowest bunk is the youngest child.", "The younger child is on the bottom."], "image": "train2014/COCO_train2014_000000272632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545163, "question_id": "Lyc5n6bvPXnj4deUdbznhU", "question": "What traffic is not allowed behind the fence here?", "choices": ["cane assisted", "automobile", "foot", "elderly walkers"], "correct_choice_idx": 1, "direct_answers": ["bus", "public", "automobile", "trucks", "semis", "cars", "automobile", "trucks", "motor vehicals", "trucks"], "difficult_direct_answer": false, "rationales": ["The traffic prohibited is cars.", "There is a sign by the fence area with a vehicle and a red crossed out mark over it.", "Cars are not allowed behind the fence."], "image": "train2014/COCO_train2014_000000545163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402541, "question_id": "LygSgUSUAdYFcjdNB9LXvw", "question": "Why do these bikers all have numbers on their bikes?", "choices": ["insurance", "driver iq", "racing numbers", "vehicle registrations"], "correct_choice_idx": 2, "direct_answers": ["racers", "race numbers", "competition", "racing numbers", "riding", "competition", "racing", "competing", "race", "racing"], "difficult_direct_answer": false, "rationales": ["Racing numbers so they know who is on each bike.", "That's what given to the riders who enter the competition.", "They are about to have a competition"], "image": "train2014/COCO_train2014_000000402541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407992, "question_id": "Lz4T4QvLa6QzUez477EAQR", "question": "How is this automated kiosk powered?", "choices": ["solar energy", "gas", "coal", "manual cranking"], "correct_choice_idx": 0, "direct_answers": ["solar power", "solar energy", "solar energy", "sunlight", "solar powered", "solar", "solar power", "solar power", "solar", "electric"], "difficult_direct_answer": false, "rationales": ["An automated kiosk needs electricity. there is a photovoltaic panel on top of the kiosk.", "The kiosk has solar energy.", "There is a solar panel on the top of the kiosk."], "image": "train2014/COCO_train2014_000000407992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476233, "question_id": "Lz99afmX2b2s295hBT5bWf", "question": "What are the boys doing with the orange ball?", "choices": ["painting it", "dribbling it", "throwing it", "kicking it"], "correct_choice_idx": 3, "direct_answers": ["kicking it", "soccer", "playing soccer", "playing soccer", "kicking", "soccer", "soccer", "kicking", "playing soccer", "soccer"], "difficult_direct_answer": false, "rationales": ["They are in soccer gear, and it is a soccer ball, and the game is played by kicking it, as using your hands is not allowed in the game.", "The boys are playing soccer where it is against the rules to touch the ball with your hands. that would rule out throwing, dribbling and painting.", "The boys are playing soccer with the ball so they kick it."], "image": "val2014/COCO_val2014_000000476233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118404, "question_id": "LzZiiJjGZRNjCqaNvzxre3", "question": "Which vehicle has violated the law?", "choices": ["white car", "black car", "bicycle", "bus"], "correct_choice_idx": 0, "direct_answers": ["bicycle", "white car", "car", "suv", "bicycle", "white car", "bicycle", "bike", "minivan", "car"], "difficult_direct_answer": false, "rationales": ["There is a white car in the background that has parked on top of the sidewalk which is a pedestrian zone. it is typically illegal to park a vehicle on the sidewalk which is intended for pedestrian use.", "The bus and black car are driving normally. the bicycle is parked properly.", "It is parked on the sidewalk"], "image": "train2014/COCO_train2014_000000118404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336046, "question_id": "LznDg9HR5H3eGpNijVXnan", "question": "What item does the person cut?", "choices": ["cloth", "paper", "markers", "chalk"], "correct_choice_idx": 0, "direct_answers": ["fabric", "fabric", "fabric", "cloth", "fabric", "fabric", "cloth", "fabric", "cloth", "cloth"], "difficult_direct_answer": false, "rationales": ["The item is for cloth.", "There is cloth near the scissors.", "The woman is holding the scissors to wrapping paper."], "image": "val2014/COCO_val2014_000000336046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74887, "question_id": "Lzxib2SujKjx4tjnXbcznM", "question": "What are the animals in the pasture doing?", "choices": ["mating", "running", "sleeping", "eating"], "correct_choice_idx": 3, "direct_answers": ["grazing", "grazing", "goats", "feeding", "eating", "grazing", "grazing", "grazing", "eating grass", "grazing"], "difficult_direct_answer": false, "rationales": ["These sheep's heads are all down in the grass to graze.", "The animals are eating.", "The sheep are eating grass given their heads near the grass and mouths making contact with the ground."], "image": "train2014/COCO_train2014_000000074887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504606, "question_id": "M27JfBWYHWyQsNPJUNV498", "question": "What is the woman having for her meal?", "choices": ["sandwich", "pizza", "liquor", "salad"], "correct_choice_idx": 0, "direct_answers": ["sandwich", "taco", "sandwich", "sandwich", "sandwich", "pizza", "sandwich", "sandwich", "sandwich", "sandwich"], "difficult_direct_answer": false, "rationales": ["A woman is sitting in front of a plate with bread on it. sandwiches are made of bread and are a popular food item.", "The other options don't appear in the image.", "Sandwiches are 2 slices of bread with other toppings in between. this is what the woman has in both hands."], "image": "train2014/COCO_train2014_000000504606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365066, "question_id": "M28oxJMsStDDQpAVkcj2kA", "question": "What is keeping the giraffes confined?", "choices": ["fence", "cliff", "forest", "river"], "correct_choice_idx": 0, "direct_answers": ["fence", "fence", "eyelid", "fence", "eyelid", "eyelid", "fence", "fence", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["In the background of this photo the boundary between the pen and outside area is divided by metal. this fence also keeps the giraffes from getting loose.", "The giraffe is in an enclosure.", "As seen in the background of the image."], "image": "train2014/COCO_train2014_000000365066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156833, "question_id": "M2EmiLd27gkspmQHfTRu4h", "question": "What is this person doing at this moment?", "choices": ["drinking", "typing", "playing game", "thinking"], "correct_choice_idx": 3, "direct_answers": ["thinking", "typing", "on computer", "typing", "reading", "thinking", "thinking", "working", "thinking", "think"], "difficult_direct_answer": false, "rationales": ["The person is working.", "The person has his hand near his face as if he having a though provoking moment.", "They have their hand to their chin which and are not typing"], "image": "train2014/COCO_train2014_000000156833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66082, "question_id": "M2ZY9Ytd8ecvmx2MQLLeMF", "question": "What is the light brown building in the background?", "choices": ["car garage", "college", "motel", "residential building"], "correct_choice_idx": 0, "direct_answers": ["parking garage", "parking", "parking garage", "jail", "parking structure", "parking garage", "parking garage", "car garage", "parking garage", "parking garage"], "difficult_direct_answer": false, "rationales": ["The building is a car garage.", "The light brown building has no windows and has openings for car fumes to escape.", "The multistory set up and what little of the entrance we see behind the pink bus matches a parking structure."], "image": "train2014/COCO_train2014_000000066082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442268, "question_id": "M2hb5qjK7nsUfk2YSonSSA", "question": "What direction is the sun with respect to the boy?", "choices": ["back", "left", "right", "front"], "correct_choice_idx": 3, "direct_answers": ["left", "right", "front", "his left", "right", "in front", "north", "front", "in front", "infront"], "difficult_direct_answer": false, "rationales": ["There is light shining on his face.", "There is light shining on his face.", "It isn't showing behind him."], "image": "train2014/COCO_train2014_000000442268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105200, "question_id": "M2iMVF4W5ozPYqCfCbozSg", "question": "Where does the player with the ball want it to go?", "choices": ["in net", "behind him", "in forest", "in hands"], "correct_choice_idx": 0, "direct_answers": ["goal", "make goal", "goal", "in net", "in net", "goal", "kick goal", "goal", "goal", "goal"], "difficult_direct_answer": false, "rationales": ["He wants to make a goal with the ball.", "The object of the game is to score a goal.", "This is the goal that means points to win"], "image": "train2014/COCO_train2014_000000105200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500884, "question_id": "M2om7ToRooitggjkvVYxa3", "question": "What has made the ground shiny?", "choices": ["snow", "water", "wax", "oil"], "correct_choice_idx": 1, "direct_answers": ["rain", "rain", "water", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The rain has made the ground wet.", "Water is on the ground from the rain coming down.", "The ground is wet from the rain."], "image": "val2014/COCO_val2014_000000500884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500179, "question_id": "M2pwvkdxpAL2Q9hJUoJfph", "question": "What is the woman doing with the device in her hands most likely?", "choices": ["carrying", "filming", "calling", "playing music"], "correct_choice_idx": 3, "direct_answers": ["texting", "listening", "texting", "texting", "playing music", "finding music", "playing music", "texting", "texting", "texting"], "difficult_direct_answer": false, "rationales": ["The woman has visible headphones connected to her ears and to the device she is holding. the device is capable of playing music and headphones would be used to listen.", "She has a headphone wire coming from it", "The woman has headphones on."], "image": "train2014/COCO_train2014_000000500179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412604, "question_id": "M2vc46iWoAUnx3iHpoHT8p", "question": "What is the name of this dessert?", "choices": ["tart", "cookie", "eclair", "cupcake"], "correct_choice_idx": 2, "direct_answers": ["donut", "eclair", "eclair", "eclair", "eclair", "donut", "donut", "chocolate eclair", "eclair", "eclair"], "difficult_direct_answer": false, "rationales": ["The name is an eclair.", "It is the only choice that is an oblong shaped pastry filled with a cream and topped with chocolate icing.", "The dessert is rectangular. cupcakes, tarts, and cookies are round."], "image": "val2014/COCO_val2014_000000412604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123831, "question_id": "M2zmtkrNSXWmcrsFBSse7c", "question": "How are these items ordered?", "choices": ["by size", "by name", "alphabetically", "by color"], "correct_choice_idx": 0, "direct_answers": ["size", "size", "by size", "by size", "size", "size order", "by size", "stack", "small largest", "amazon"], "difficult_direct_answer": false, "rationales": ["The items are ordered by size.", "The items are largest on the bottom and smallest on the top.", "The suitcases are arranged in size order with the smallest on top and largest on the bottom."], "image": "train2014/COCO_train2014_000000123831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316993, "question_id": "M33bCcEmDwKoemFADz29dW", "question": "Which city in addition to San Jose is noted on the sign for the interstate?", "choices": ["alameda", "oakland", "san francisco", "hollywood"], "correct_choice_idx": 1, "direct_answers": ["oakland", "oakland", "oakland", "oakland", "oakland", "oakland", "oakland", "oakland", "oakland", "oakland"], "difficult_direct_answer": false, "rationales": ["Oakland is not that far away from san jose, so they would be next to each other.", "The city listed above san jose on the sign is not san francisco, hollywood, or alameda.", "The city is seen on the sign board."], "image": "train2014/COCO_train2014_000000316993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33697, "question_id": "M3JjqzpDN4JDbUzeFdmBc9", "question": "What is the person whose belongings can be seen here now doing?", "choices": ["nothing", "working", "swimming", "driving"], "correct_choice_idx": 2, "direct_answers": ["swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "boating", "swimming", "swimming", "swimming"], "difficult_direct_answer": false, "rationales": ["The scene is a beach and an ocean so the person is likely in the water.", "Their belongs are keeping safe under the umbrella, as they are at the beach and the most common activity one would partake in here would be swimming.", "The person left their belongings near the water, so they likely went in."], "image": "val2014/COCO_val2014_000000033697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250592, "question_id": "M3METqQZ9H8LaHt4wupTUK", "question": "What's the term for how this boat is parked?", "choices": ["docked", "waiting", "anchored", "setting"], "correct_choice_idx": 0, "direct_answers": ["docked", "docking", "quality", "docked", "docked", "docked", "docked", "docked", "docked", "moored"], "difficult_direct_answer": false, "rationales": ["The term is docked.", "That's what they called it when a boat is parked in the harbor.", "When it is tied to the wooden pier, it is given this name."], "image": "val2014/COCO_val2014_000000250592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108034, "question_id": "M3pzRe535NardFBSBZmTjr", "question": "The white rectangular wired boxes on the desk are what kind of electronic product?", "choices": ["game consoles", "computers", "speakers", "tablets"], "correct_choice_idx": 2, "direct_answers": ["speakers", "project", "speaker", "speakers", "monitors", "speakers", "computer speakers", "monster", "speakers", "game consoles"], "difficult_direct_answer": false, "rationales": ["You can tell by the desktop computer setup as to what the white boxes are.", "The boxes are wired for sound.", "The boxes are for speakers."], "image": "train2014/COCO_train2014_000000108034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574672, "question_id": "M46nGgv5Qxc5pEhgnNpkYw", "question": "At what degrees in Fahrenheit will the surface shown here melt?", "choices": ["zero", "33", "15", "25"], "correct_choice_idx": 1, "direct_answers": ["thirty three", "33", "32", "forty", "thirty three", "thirty-two", "thirty two", "32 degrees", "twenty five", "33"], "difficult_direct_answer": false, "rationales": ["The surface will start to melt when it's 33 degrees out.", "Ice melts at a temperature of 33 degrees fahrenheit.", "Snow melts at 33 degrees."], "image": "val2014/COCO_val2014_000000574672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532471, "question_id": "M4STpNFhbEGA4RfLuXQsJe", "question": "What activity are the people engaging in?", "choices": ["watching sunset", "crossing border", "swimming", "fishing"], "correct_choice_idx": 0, "direct_answers": ["sightseeing", "watching sunset", "watching sunset", "sunset watching", "tourism", "watch sun", "sightseeing", "swimming", "sunset viewing", "surfing"], "difficult_direct_answer": false, "rationales": ["The people are watching the sunset.", "It's a tour bus and they are watching the beautiful sun set on the beach.", "The people are gathered to the sun set down over the ocean."], "image": "train2014/COCO_train2014_000000532471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124230, "question_id": "M4VWpkp65WU7Un9u8aYNZ6", "question": "How many more animals need to be added to the animals closest to the camera to make a dozen?", "choices": ["ten", "five", "seven", "six"], "correct_choice_idx": 0, "direct_answers": ["ten", "ten", "ten", "ten", "ten", "ten", "ten", "ten", "ten", "ten"], "difficult_direct_answer": false, "rationales": ["There are twelve things in a dozen.", "There are only two in the front so you would need ten more to make a dozen.", "There are currently two animals close to the camera."], "image": "train2014/COCO_train2014_000000124230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85225, "question_id": "M4YnMowf7XqrvQSfH4KRQB", "question": "What animal would you be most unlikely to see in this setting?", "choices": ["tiger", "dog", "horse", "cat"], "correct_choice_idx": 0, "direct_answers": ["cat", "tiger", "moose", "bear", "alligator", "black bear", "bear", "bear", "deer", "moose"], "difficult_direct_answer": false, "rationales": ["You don't see a tiger on the streets.", "Tigers are not in urban areas.", "Even on the outskirts of a major city, as we are here, it would be virtually impossible to see a tiger in these environs."], "image": "train2014/COCO_train2014_000000085225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548647, "question_id": "M56A4R2N9moA6hSmJoQ5TM", "question": "What is the man speaking into?", "choices": ["cup", "microphone", "megaphone", "telephone"], "correct_choice_idx": 1, "direct_answers": ["microphone", "microphone", "microphone", "microphone", "microphone", "microphone", "microphone", "microphone", "microphone", "microphone"], "difficult_direct_answer": false, "rationales": ["The man is talking into a microphone.", "A man wearing a suit and surrounded by oddly-dressed people stands on a small stage and speaks into a microphone.", "The man talks into the mic."], "image": "train2014/COCO_train2014_000000548647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469130, "question_id": "M5bg3XXcENoQh7NjDYQr5t", "question": "The symbol on the planes looks like what logo?", "choices": ["pepsi", "mcdonald's", "starbucks", "nbc"], "correct_choice_idx": 0, "direct_answers": ["pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi"], "difficult_direct_answer": false, "rationales": ["The logo is a circle that is red, white, and blue. a soda company uses a similar logo.", "The color and design is very similar", "The symbol is a red, white, and blue circular shape, not golden arches, a rainbow, or a green and white mermaid."], "image": "val2014/COCO_val2014_000000469130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507979, "question_id": "M5hwArcomvpsaZeU7HVTjk", "question": "The persons here are doing what?", "choices": ["comic con", "selling wine", "wine tasting", "drunken binge"], "correct_choice_idx": 2, "direct_answers": ["laughing", "wine tasting", "drinks", "wine tasting", "wine tasting", "tasting wine", "drinking wine", "drinking wine", "wine tasting", "wayne tasting"], "difficult_direct_answer": false, "rationales": ["They have glasses with many different colored liquids in them", "The preponderance of wine glasses would suggest that this is a winery and the people are taking part in a wine tasting.", "Rows of wine glasses with sample sized amounts of different liquid in them."], "image": "val2014/COCO_val2014_000000507979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513863, "question_id": "M5wwQNGkrTyDFDHAuKxu5L", "question": "How many strings in Guitar?", "choices": ["three", "five", "six", "four"], "correct_choice_idx": 2, "direct_answers": ["six", "six", "six", "six", "five", "many", "six", "six", "six strings", "four"], "difficult_direct_answer": false, "rationales": ["Most guitars have six strings.", "A guitar has mostly 6 strings.", "This is the typical number. it's hard to tell in this image."], "image": "train2014/COCO_train2014_000000513863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193332, "question_id": "M65cqAgfcQkXiEUvS6drxP", "question": "What are the two using the silver object to do?", "choices": ["steer", "cut cake", "dance", "take photo"], "correct_choice_idx": 1, "direct_answers": ["cut cake", "cut cake", "cut cake", "cut cake", "slice", "cut cake", "cut cake", "cut cake", "cut cake", "slice"], "difficult_direct_answer": false, "rationales": ["The silver object is a knife. it could not be used to take a picture, steer, or dance.", "The silver object is a knife. they are using it to on their wedding cake to make individual portions to serve to their guests.", "They are about to eat dessert at their wedding reception."], "image": "val2014/COCO_val2014_000000193332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246434, "question_id": "M6EWtfKdVYF36sWWmyWSPE", "question": "Where is this student studying?", "choices": ["dormitory", "apartment", "library", "library"], "correct_choice_idx": 0, "direct_answers": ["in school", "at school", "school", "table", "table", "library", "room", "for exams", "table", "dormitory"], "difficult_direct_answer": false, "rationales": ["A man is sitting at a table as he studies on the laptop.", "The student is studying in a college dorm room.", "The student is in his dorm."], "image": "train2014/COCO_train2014_000000246434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537456, "question_id": "M6J6u6dqptUNHgZtWC2hwo", "question": "Why is there so much stuff here?", "choices": ["is house", "is stolen", "is trash", "for sale"], "correct_choice_idx": 3, "direct_answers": ["store", "retail shop", "for sale", "convenience", "market", "for sale", "selling", "goods sold", "small newsstall", "store"], "difficult_direct_answer": false, "rationales": ["It is a stall that is open for business, and since it is a small establishment, it will look crowded to fit all of the items they wish to sell.", "The shop is used to sell items.", "This is a vendor's cart that sells things"], "image": "val2014/COCO_val2014_000000537456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54294, "question_id": "M6VtJZVcqjBCA3rRBYCdEH", "question": "What soda does she like to drink?", "choices": ["fanta", "coca-cola", "sprite", "mountain dew"], "correct_choice_idx": 1, "direct_answers": ["coca cola", "coca cola", "coca cola", "coca cola", "coca cola", "coca-cola", "cola", "coke", "coca-cola", "coca cola"], "difficult_direct_answer": false, "rationales": ["There is a bottle of one brand of soda on the counter but none of the others.", "There is a large bottle of coca cola on the counter.", "The soda is coke."], "image": "train2014/COCO_train2014_000000054294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35328, "question_id": "M6o3Mx7jg5nh6zq6bvNkqL", "question": "What is the clear piece the railing on the right is attached to made of?", "choices": ["ceramic", "brick", "glass", "wood"], "correct_choice_idx": 2, "direct_answers": ["glass", "plexiglass", "plexiglass", "glass", "glass", "glass", "plexiglass", "glass", "glass", "plexiglass"], "difficult_direct_answer": false, "rationales": ["The railing is attached to large panes of glass.", "The piece is glass.", "The railing is attached to glass."], "image": "val2014/COCO_val2014_000000035328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61672, "question_id": "M6rGzoVcWr2CTiALqHXuJB", "question": "What food came in a plastic cup with foil lid?", "choices": ["pudding", "yogurt", "soup", "cereal"], "correct_choice_idx": 1, "direct_answers": ["soup", "yogurt", "yougurt", "yogurt", "yogurt", "yogurt", "yogurt", "yogurt", "eatable food", "yogurt"], "difficult_direct_answer": false, "rationales": ["This dairy product often comes in a similar packaging with a plastic cup and foil lid like you finding individual servings of pudding and jello.", "The plastic container with the foil lid contains white yogurt.", "There is a creamy substance in a plastic cup that is ready to eat."], "image": "train2014/COCO_train2014_000000061672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539932, "question_id": "M7H44cmhEHtGELbL828FF6", "question": "Why is the man holding sticks while skiing?", "choices": ["to fight", "style", "balance", "to flip"], "correct_choice_idx": 2, "direct_answers": ["balance", "support", "stability", "balance", "locomotion", "balance", "balance", "stabilization", "support", "go forward"], "difficult_direct_answer": false, "rationales": ["The man is holding sticks for balance while he skis.", "He will hold those to help him move and stay up on his skiis.", "The man is trying to have balance."], "image": "train2014/COCO_train2014_000000539932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372790, "question_id": "M7bi9PT8CZopuJxZFY4vZa", "question": "Where do these people pose?", "choices": ["paris", "sacramento", "vegas", "new york"], "correct_choice_idx": 0, "direct_answers": ["paris", "holding hands", "in paris", "eiffel tower", "eiffel tower", "taking photo", "paris", "eiffel tower", "france", "field"], "difficult_direct_answer": false, "rationales": ["The eiffel tower is behind them.", "The eiffel tower is in back.", "The eiffel tower is behind the people."], "image": "train2014/COCO_train2014_000000372790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365738, "question_id": "M7jC6aaY67CJiRhFW2hiW6", "question": "What does the red flag mean?", "choices": ["hot weather", "swimming prohibited", "victory", "tsunami"], "correct_choice_idx": 1, "direct_answers": ["safety", "danger", "high wind", "rough conditions", "high surfs", "stop", "danger", "swimming prohibited", "danger", "dangerous water"], "difficult_direct_answer": false, "rationales": ["Flags like this are used to bring peoples attention to some danger they should be aware of. at a beach a warning flag near the water is likely warning people not to swim.", "The flag means they can't go in the water.", "The red flag means that no swimming is allowed."], "image": "train2014/COCO_train2014_000000365738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184431, "question_id": "M7nMAQF7JS9ErzkJZu4LCr", "question": "What is the vehicle on the right called?", "choices": ["van", "train", "bike", "car"], "correct_choice_idx": 1, "direct_answers": ["train", "train", "train", "train", "train", "train", "bus", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["The vehicle is a train.", "Because the train is running on the train tracks.", "It runs on a track instead of along a road."], "image": "train2014/COCO_train2014_000000184431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521200, "question_id": "M7pPARk6BFwZ4Bvx5ftSWh", "question": "What usually happens in this room?", "choices": ["pool playing", "sleeping", "hand washing", "cooking"], "correct_choice_idx": 2, "direct_answers": ["shower", "toilet", "personal hygiene", "bathe", "shower", "bathingpeeing", "showering", "hand washing", "bathroom", "restroom"], "difficult_direct_answer": true, "rationales": ["You can wash your hands at the sink. you can also use the shower to clean your body off.", "People often wash their hands in the sink.", "This room contains a sink, soap and towels so, extrapolating from that, it's clear that at least one of the activities that (hopefully) takes place here is hand washing."], "image": "train2014/COCO_train2014_000000521200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432527, "question_id": "M8LZ92A3M9onPJxGVWb99s", "question": "Graphite is used in the making of what?", "choices": ["ball", "shoe", "net", "racket"], "correct_choice_idx": 3, "direct_answers": ["tennis raquets", "pencils", "lining", "tennis racket", "tennis", "tennis racket", "pencils", "tennis rackets", "racket", "tennis rackets"], "difficult_direct_answer": false, "rationales": ["Graphite is used for the racquet.", "The racquet is made from graphite and the net on it.", "Even if you don't know the answer, it certainly can't be used to make shoes, a net or a ball."], "image": "train2014/COCO_train2014_000000432527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316503, "question_id": "M8RjWVdRGEBnUpxsmGkKM9", "question": "Which fruit is the most prominent topping here?", "choices": ["basil", "olives", "pepper", "cherries"], "correct_choice_idx": 1, "direct_answers": ["alives", "peppers", "olive", "olives", "cheese", "peppers", "olive", "pine apple", "olive", "olives"], "difficult_direct_answer": false, "rationales": ["The olives are easily seen on the pizza and olives are fruits.", "The fruit on top of the pizza are unsliced olives.", "There are several green and black ones"], "image": "val2014/COCO_val2014_000000316503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575441, "question_id": "M8UXyi4Uz8x84aHGR3P4bY", "question": "What sort of method is used to secure these vessels to the shore?", "choices": ["knots", "oars", "motors", "dolphins"], "correct_choice_idx": 0, "direct_answers": ["rope", "anchor", "anchors", "rope", "cords", "knots", "rope", "ropes", "ropes", "tying"], "difficult_direct_answer": false, "rationales": ["The boat has ropes on the sides that can be tied in knots to secure it to the dock.", "They tie them to the shore with huge ropes.", "Traditionally knots are used for boats because they are the most secure."], "image": "val2014/COCO_val2014_000000575441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563676, "question_id": "M8XAuKRupzmX9W9m49SmVr", "question": "What is the rectangular object with a chord seen in the mirror used for?", "choices": ["phone calls", "drying hair", "gaming", "powering television"], "correct_choice_idx": 0, "direct_answers": ["phone", "phone calls", "cupboard", "calling", "talk", "phone calls", "phone calls", "phone calls", "phone calls", "phone"], "difficult_direct_answer": false, "rationales": ["The rectangular object in the mirror is a phone.", "A phone is there for phone calls.", "The coiled wire and cradle shape of this wall mounted device identify it as a phone."], "image": "train2014/COCO_train2014_000000563676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261587, "question_id": "M8n4La55wHdLfrE9BFsdPH", "question": "What form of exercise is this?", "choices": ["jet skiing", "surfboarding", "water skiing", "water boarding"], "correct_choice_idx": 2, "direct_answers": ["skiing", "wakeboarding", "surfing", "water skiing", "wakeboarding", "cardio", "skiing", "surfing", "water skiing", "water sailing"], "difficult_direct_answer": false, "rationales": ["This is water skiing.", "They're on skis in the water being pulled.", "The person is water skiing."], "image": "val2014/COCO_val2014_000000261587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92216, "question_id": "M9iqCWsrj2PrhZ3JdaZMnm", "question": "What holds the beverages for the players in the dugout?", "choices": ["coolers", "shelves", "umpire", "fans"], "correct_choice_idx": 0, "direct_answers": ["cooler", "cooler", "coolers", "bottles", "coolers", "coaches", "coolers", "containers", "manager", "coolers"], "difficult_direct_answer": false, "rationales": ["The coolers have drinks.", "The beverages are in gatorade coolers.", "The containers are used to put things in."], "image": "train2014/COCO_train2014_000000092216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318678, "question_id": "M9nKgkPQvgS6HnfghnNxhz", "question": "What is she ready to do?", "choices": ["swing", "sit", "swim", "eat"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "hit ball", "swing", "hit ball", "hit ball", "hit ball", "hit ball", "return volley", "hit ball", "play tennis"], "difficult_direct_answer": false, "rationales": ["The tennis player is getting ready to swing and hit the ball.", "The woman wants to swing her racquet.", "The woman is ready to swing."], "image": "train2014/COCO_train2014_000000318678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192838, "question_id": "MA594Xz8hgNTp5Fafxh79G", "question": "Which one of these skills is required to practice this sport?", "choices": ["balance", "perfect pitch", "memory", "intelligence"], "correct_choice_idx": 0, "direct_answers": ["balance", "balance", "balance", "swimming", "surfing", "water riding", "surfing", "balancing", "swimming", "scatting"], "difficult_direct_answer": false, "rationales": ["Balance is needed.", "Balance is required to surf.", "A girl is balancing on a surfboard in the water. balance is required for surfing."], "image": "val2014/COCO_val2014_000000192838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328289, "question_id": "MAAUxkzRWammT2xuyW3f3w", "question": "What is this land used for?", "choices": ["kite flying", "ranch", "gardening", "wheat farming"], "correct_choice_idx": 1, "direct_answers": ["riding", "horses", "farm animals", "grazing", "grazing", "ranch", "runing", "ranch", "horseback riding", "horseback riding"], "difficult_direct_answer": false, "rationales": ["This land is used as a ranch for the horses.", "The animals are on a ranch.", "The land is a ranch."], "image": "val2014/COCO_val2014_000000328289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286414, "question_id": "MAqJxu9fbMdMMje5UayZJx", "question": "The cat on the left is most likely not engaging with the one sniffing it because it is what?", "choices": ["asleep", "dead", "inanimate", "bored"], "correct_choice_idx": 2, "direct_answers": ["feeling mad", "statue", "ceramic", "inanimate", "not amused", "statue", "female", "statue", "fake", "inanimate"], "difficult_direct_answer": false, "rationales": ["The cat on the left is an inanimate statue.", "The cat looks more stand offish and doesn't want to get into it with the other cat.", "The cat is not moving."], "image": "train2014/COCO_train2014_000000286414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448533, "question_id": "MBMYEP8aW7jtR6Bqv9B6Xt", "question": "What sound are you most likely to hear if you went in this shop?", "choices": ["metal music", "tick tock", "meow", "piano noises"], "correct_choice_idx": 1, "direct_answers": ["tick tock", "cuckoo", "tick tock", "tick", "tick tock", "chime", "tick tock", "cuckoo", "chimes", "clicking"], "difficult_direct_answer": false, "rationales": ["The sound will come from the clocks.", "The display window is for a clock shop, a place where you would hear tick tock.", "The sound is tick tock."], "image": "train2014/COCO_train2014_000000448533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185476, "question_id": "MBQE3p8q5i4fHrrVC4rpVN", "question": "How will the people here likely propel themselves upon waves?", "choices": ["running", "scuba diving", "kiting", "surfing"], "correct_choice_idx": 3, "direct_answers": ["surfboard", "surfboards", "kinetic", "paddling", "surfboards", "push", "wind", "paddle", "surfing", "swim towards"], "difficult_direct_answer": true, "rationales": ["They have boards used to ride waves", "The people propel themselves by surfing.", "Using their surfboard they can be able to to curve through?."], "image": "val2014/COCO_val2014_000000185476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437967, "question_id": "MBhHBUKXRUZqrbxMKDBXT6", "question": "What is the meat on the dish?", "choices": ["chicken", "beef", "pork chop", "salmon"], "correct_choice_idx": 0, "direct_answers": ["chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["The meat is chicken.", "It is very lightly colored meat", "A white protein that has a thicker texture."], "image": "train2014/COCO_train2014_000000437967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499068, "question_id": "MBhcQRUTeDdqNKsU2XyXEY", "question": "What type of trees can be seen near the pink building?", "choices": ["maple trees", "pine trees", "elm trees", "palm trees"], "correct_choice_idx": 3, "direct_answers": ["palm", "palm", "palm", "palm", "palm trees", "palm trees", "palm trees", "palm trees", "palm", "palm"], "difficult_direct_answer": false, "rationales": ["The long thin nature of these trees' leaves and they're marked up wispy trunks identifies them as palm trees.", "These trees can be found in warmer climates.", "There are palm trees between the sandy beach and the pink building ."], "image": "val2014/COCO_val2014_000000499068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46077, "question_id": "MC8TqakBoprbZYoqhxB8PU", "question": "What does the passenger lack that the driver has?", "choices": ["helmet", "shoes", "pants", "shirt"], "correct_choice_idx": 0, "direct_answers": ["helmet", "helmet", "handlebars", "helmet", "helmet", "handlebars", "handlebars", "helmet", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["The younger passenger has a helmet to protect their head.", "The driver is wearing protective head gear and the passenger is not.", "There is nothing on the passenger's head."], "image": "val2014/COCO_val2014_000000046077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268306, "question_id": "MCNg6ETNykqvrNfeQQuDzZ", "question": "What is the large glass object on the bike called?", "choices": ["headlight", "turn light", "break light", "night light"], "correct_choice_idx": 0, "direct_answers": ["headlight", "light", "headlight", "headlight", "light", "headlight", "headlight", "headlight", "light", "seat"], "difficult_direct_answer": false, "rationales": ["The headlight is the large piece of glass.", "The object mounted on the bike serves to shine out during night, thus performing the function of the equipment listed in option a.", "The item is a light on the front of the motorcycle and helps the motorist see at night."], "image": "train2014/COCO_train2014_000000268306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243875, "question_id": "MCQrkVnsKs3vVReBRmG64W", "question": "What is the man planning to use to hit what is thrown at him?", "choices": ["club", "rock", "ball", "bat"], "correct_choice_idx": 3, "direct_answers": ["baseball", "bat", "bat", "baseball bat", "baseball", "bat", "bat", "baseball", "bat", "bat"], "difficult_direct_answer": false, "rationales": ["They appear to be playing the game of baseball, and he is holding a bat in his hand, which is the only thing that can be safely and most optimized to use to hit the ball.", "The man uses his bat to hit the ball.", "The man's holding a bat and he's in a batter's position, so it's obvious that the bat is what he'll use to hit the ball when it's pitched."], "image": "val2014/COCO_val2014_000000243875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168854, "question_id": "MCR6Npr2hy3Px5dBLzt2yX", "question": "The orange object is made of what material?", "choices": ["paper", "cotton", "foam", "polyester"], "correct_choice_idx": 3, "direct_answers": ["kite", "rubber", "plastic nylon", "nylon", "nylon", "nylon", "polyester", "plastic", "latex/nylon-fabric", "nylon"], "difficult_direct_answer": false, "rationales": ["The size of the kite indicates that it must be made out of synthetic material that can easily stay together and also be flimsy so it can fly in the air.", "This material makes it stronger", "The kite is made out of a synthetic material so it is most likely polyester."], "image": "train2014/COCO_train2014_000000168854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135086, "question_id": "MCYWibZcRfGKgTKDbRNEHD", "question": "What number is on the train?", "choices": ["31233", "97256", "45398", "78256"], "correct_choice_idx": 0, "direct_answers": ["31233", "21253", "thirtyonethousand twohundredthirtythree", "31233", "31233", "31233", "three thousand", "31233", "31233", "31233"], "difficult_direct_answer": false, "rationales": ["The five-digit number is beneath the windows near the front of the train.", "The number is below the window on the side of the train.", "The number is shown on the side."], "image": "train2014/COCO_train2014_000000135086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373487, "question_id": "MCc24RxX8AJrxFXwdQXZ4t", "question": "This dealership serves what region?", "choices": ["south florida", "northern california", "central ohio", "west texas"], "correct_choice_idx": 0, "direct_answers": ["marooned", "south florida", "marooned", "marine", "marooned", "marooned", "marooned", "southeast us", "truck", "marooned"], "difficult_direct_answer": false, "rationales": ["You can tell by the signage as to what area they serve.", "According to an internet search, maroone chevrolet is located in west palm beach. also, the palm trees eliminated texas and ohio as options.", "The sign is for maroone chevrolet. it is located in west palm beach."], "image": "train2014/COCO_train2014_000000373487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114726, "question_id": "MCeijcNNn9DgUUWSBjHTyG", "question": "What old device can be seen on the left end of the shelf?", "choices": ["camera", "television", "pager", "phone"], "correct_choice_idx": 0, "direct_answers": ["radio", "camera", "camera", "radio", "camera", "camera", "camera", "radio", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["A black, small object with a lens is on a shelf. cameras have lenses.", "It's an old camera that is box shaped.", "A camera has a lens and the object on the shelf has a lens, so it's easy to conclude that the object is indeed a camera."], "image": "val2014/COCO_val2014_000000114726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279846, "question_id": "MCgpGNWQKLAbYXfo38UBxp", "question": "What kind of bird created something that sits on this pizza?", "choices": ["quail", "sparrow", "chicken", "ostrich"], "correct_choice_idx": 2, "direct_answers": ["hen", "chicken", "hen", "chicken", "chicken", "chicken", "hen", "chicken", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["There is an egg in the middle of the pizza, and birds are known to lay eggs.", "The bird is a chicken.", "Chickens lay eggs. humans eat the eggs in all sorts of different ways."], "image": "val2014/COCO_val2014_000000279846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528112, "question_id": "MCnqRqTvVfBmdUARGbG6gQ", "question": "What is the boat using to navigate?", "choices": ["paddles", "sail", "engine", "oars"], "correct_choice_idx": 1, "direct_answers": ["sail", "sail", "sail", "sail", "sail", "sail", "sail", "sail", "sail", "sail"], "difficult_direct_answer": false, "rationales": ["It has a red sail on it and there isn't any motor visible on the boat.", "The boat can sail across the water.", "The boat uses a sail."], "image": "train2014/COCO_train2014_000000528112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260927, "question_id": "MD4H4Eb4wm8AE2iVDJsy6w", "question": "Who is the figure depicted in the statue?", "choices": ["lloyd", "davv", "dav", "davy"], "correct_choice_idx": 3, "direct_answers": ["super man", "davy", "davy", "davy crockett", "davy", "davy crockett", "davy", "famous person", "davy", "davy"], "difficult_direct_answer": false, "rationales": ["The word on the side of the statue indicates who is being depicted.", "Davy crockett is in the photo.", "The figure is davy."], "image": "val2014/COCO_val2014_000000260927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113801, "question_id": "MD5FCxRBKAN6dyv3EZ8YcT", "question": "What is the ladder for?", "choices": ["entering plane", "entering hangar", "entering roof", "entering bus"], "correct_choice_idx": 0, "direct_answers": ["boarding", "airplane", "entering plane", "boarding", "metal", "for plane", "passengers", "entering plane", "boarding", "passengers"], "difficult_direct_answer": false, "rationales": ["The bus and hangar are accessible at ground level. the ladder is too short to access the roof.", "The ladder helps people board.", "The ladder helps people board."], "image": "val2014/COCO_val2014_000000113801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180824, "question_id": "MDAargdmx5BZ7gdigCYaM6", "question": "What is the NYSE symbol of this bank?", "choices": ["mdb", "bom", "bdm", "bmo"], "correct_choice_idx": 3, "direct_answers": ["bmo", "bom", "name", "bmo", "unclear", "bmo", "bmo", "bmo", "bmo", "bmo"], "difficult_direct_answer": false, "rationales": ["Bmo harris bank sign can be seen which has the nyse symbol bmo.", "The nyse symbol is bmo.", "Bank of montreal, also goes by bmo and it says bmo on the sign as well, so it is easily used as the 3 letter stock exchange title."], "image": "val2014/COCO_val2014_000000180824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564294, "question_id": "MDBEwQ6Sy5maoocAVg3vRd", "question": "Why are the children's heads covered?", "choices": ["visibility", "fashion", "protection", "religion"], "correct_choice_idx": 2, "direct_answers": ["safety", "cold weather", "for safety", "stay warm", "from cold", "for protection", "keep warm", "head safety", "safety", "protection"], "difficult_direct_answer": true, "rationales": ["The children are skiing. skiing is done in cold places with snow where exposed skin can be problematic and head injuries could happen which is why most skiers and especially beginners where helmets.", "These helmets keep heads safe in a fall", "The children are skiing. they are wearing safety helmets."], "image": "train2014/COCO_train2014_000000564294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237894, "question_id": "MDCt3vbruR23HRJE3nZQxw", "question": "How is the street in the picture?", "choices": ["wet", "snowy", "dry", "dirt"], "correct_choice_idx": 0, "direct_answers": ["wet", "rainy", "wet", "wet", "wet", "wet", "wet", "rainy", "wet", "wet"], "difficult_direct_answer": false, "rationales": ["The street is wet.", "The street is wet from rain.", "The shine of the street shows us it's wet."], "image": "train2014/COCO_train2014_000000237894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171737, "question_id": "MDDah6yVU5w8vXyPDEFiR4", "question": "Why is the man holding the racket back?", "choices": ["to itch", "to swing", "to drop", "to block"], "correct_choice_idx": 1, "direct_answers": ["swinging", "velocity", "swinging", "to swing", "hand", "hit tennisball", "preparing shot", "backhand stroke", "hit ball", "awaiting ball"], "difficult_direct_answer": true, "rationales": ["He is preparing to perform a back swing.", "He is ready to swing the racquet at the ball.", "The man wants to swing."], "image": "train2014/COCO_train2014_000000171737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46013, "question_id": "MDULTDB6RE9eN49M4PHA2r", "question": "What is he doing?", "choices": ["attacking", "charging", "dancing", "posing"], "correct_choice_idx": 3, "direct_answers": ["posing", "posing", "dancing", "dancing", "standing", "standing", "standing", "smiling", "dancing", "standing"], "difficult_direct_answer": false, "rationales": ["The man is smiling for the camera.", "The man is standing still for a picture.", "He is standing and smiling with his arms outstretched."], "image": "train2014/COCO_train2014_000000046013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134639, "question_id": "MDXc4GDToPnetRzdRESqBW", "question": "What is the man who crouches doing?", "choices": ["judging", "racing", "resting", "serving"], "correct_choice_idx": 0, "direct_answers": ["readyto retrieve", "ball boy", "ball retrieval", "waiting", "take exercise", "observing", "judging", "waiting", "watching ball", "waiting"], "difficult_direct_answer": false, "rationales": ["The man is watching the match.", "The man who is crouching is at the middle of a tennis court. he is staring intently at the court so he must be there to officiate or is judging the match.", "The man is looking at who might win the game."], "image": "train2014/COCO_train2014_000000134639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550601, "question_id": "MDd9jbP3qvb24wcnnxTbnU", "question": "By what method do the people become aloft?", "choices": ["sheer will", "ramp", "magic", "trick photography"], "correct_choice_idx": 1, "direct_answers": ["waves", "night", "ramp", "ramp", "air", "air", "jump lift", "by boat", "ramp", "ramp"], "difficult_direct_answer": false, "rationales": ["The skiers are pulled by a boat that carries the over a ramp, and it sends them flying in the air.", "The people can use a ramp to lift off.", "The method is the ramp."], "image": "val2014/COCO_val2014_000000550601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503682, "question_id": "ME5j7kkeZkCbfNHhYBjtcp", "question": "What will happen to the water in the hose?", "choices": ["evaporate", "freeze", "melt snow", "horses drink"], "correct_choice_idx": 3, "direct_answers": ["horses drink", "will freeze", "shot out", "into buckets", "spray", "freeze", "freeze", "freeze", "freeze", "freeze"], "difficult_direct_answer": false, "rationales": ["The man is putting out water for the horses to have.", "Humans need to tend to animals they keep in captivity. an easy way to give water is through a hose like the one seen in the picture.", "It's being put in containers near the animals"], "image": "train2014/COCO_train2014_000000503682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166420, "question_id": "ME8R6HfzVHW7RpCScTWyGY", "question": "Why is the man swinging his arms?", "choices": ["to catch", "to wave", "to hit", "to exercise"], "correct_choice_idx": 2, "direct_answers": ["playing tennis", "tennis", "hit ball", "kicking ball", "tennis", "to hit", "hitting ball", "playing tennis", "tennis", "playing"], "difficult_direct_answer": false, "rationales": ["The man wants to hit.", "You move your arms forward to hit when the ball comes towards you.", "He is going to try to hit the ball."], "image": "train2014/COCO_train2014_000000166420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430125, "question_id": "MEQbrCSWx7BoWmoAmTCvpU", "question": "What sort of area is the man eating pizza in?", "choices": ["cow pasture", "midway", "ski lodge", "front yard"], "correct_choice_idx": 1, "direct_answers": ["carnival", "fair", "fair", "food court", "carnival", "carnival", "fair", "food court", "fair", "midway"], "difficult_direct_answer": false, "rationales": ["This is an artificial street created in a carnival or festival", "It is daylight and it does not appear that it is dusk or dawn.", "These are rolling vender vehicles set up temporarily"], "image": "val2014/COCO_val2014_000000430125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362340, "question_id": "MEaVPdVoAJFfy7s9MdozEm", "question": "What is the name for this sort of event?", "choices": ["fight", "tailgate", "stakeout", "drive"], "correct_choice_idx": 1, "direct_answers": ["tailgate", "tailgate", "rodeo", "tailgate", "tailgating", "tailgate", "tailgating", "rally", "truck show", "rally"], "difficult_direct_answer": false, "rationales": ["People using the back of their trucks for entertainment area.", "People do this thing called tailgating to have fun before a big event.", "The back of the trucks trunk that folds down is called a tailgate, and when people put that down when they park up and eat and celebrate, it is called a tailgate after that."], "image": "train2014/COCO_train2014_000000362340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188310, "question_id": "MEircvjFp7HjfM3jx969vq", "question": "Where in the South does the pitcher go to school?", "choices": ["florida", "texas", "alabama", "north carolina"], "correct_choice_idx": 3, "direct_answers": ["north carolina", "duke", "kentucky", "north carolina", "durham", "duke", "duke", "duke", "duke", "duke"], "difficult_direct_answer": false, "rationales": ["North carolina is the state.", "The pitcher goes to school at duke university located in durham.", "Duke is in north carolina."], "image": "train2014/COCO_train2014_000000188310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349358, "question_id": "MFBfgGyfuFyBMfRgqfhrXo", "question": "What does the man holding his glove out want to catch?", "choices": ["soccer ball", "birdie", "football", "baseball"], "correct_choice_idx": 3, "direct_answers": ["baseball", "ball", "baseball", "ball", "baseball", "ball", "baseball", "ball", "baseball", "ball"], "difficult_direct_answer": false, "rationales": ["The baseball diamond is visible and the only person holding a glove is in the catcher's position. the catcher's job description is to catch the baseball thrown by the pitcher in his glove.", "The man has his glove out because he is going to try to catch the ball.", "They are playing this sport which uses a bat, so naturally they want to catch that type of ball."], "image": "train2014/COCO_train2014_000000349358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424091, "question_id": "MFByrtp98eN27vUeccyoPK", "question": "In which city do these cars drive?", "choices": ["boston", "little rock", "sacramento", "new york"], "correct_choice_idx": 3, "direct_answers": ["nyc", "new york", "new york", "new york", "new york", "brooklyn", "manhattan", "nyc", "brokilliar", "new york"], "difficult_direct_answer": false, "rationales": ["Manhattan and brooklyn are in new york.", "The signs list new york locations such as brooklyn and manhattan so it is reasonable to assume that the cars are driving through new york city.", "It is a dense city scape, and the signs are places that are found in new york, such as manhattan, and the brooklyn bridge."], "image": "train2014/COCO_train2014_000000424091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487498, "question_id": "MFM45LUtjwn24mjVAqKqKG", "question": "Why is the girl holding an umbrella?", "choices": ["for cosplay", "to buy", "to sell", "it's raining"], "correct_choice_idx": 3, "direct_answers": ["rain protection", "avoid rain", "rain protection", "rain", "rain", "it's raining", "it's raining", "rain", "its raining", "repel rain"], "difficult_direct_answer": false, "rationales": ["The girl is trying to keep rain out.", "Umbrellas are employed when there is rain and there are visible drops of water on the umbrella that the girls are huddling under.", "Drops of water can be seen on top of the umbrella, indicating that it is raining and the umbrella is protecting her from the rain."], "image": "val2014/COCO_val2014_000000487498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189885, "question_id": "MFYjbapADrrZNmomP8X6T6", "question": "What has caused the road to look slick?", "choices": ["snow", "rain", "ice", "wax"], "correct_choice_idx": 1, "direct_answers": ["water", "water", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The road looks like it is wet from the elements.", "There are puddles on the road. this allows us to see that it has rained.", "The item in a is the most logical choice, as b and c are identified with cold weather, unlike the condition in the photo. item d is not typically applied to roads."], "image": "val2014/COCO_val2014_000000189885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352507, "question_id": "MFdd9s2R4JuGkxqWBFdqLV", "question": "How many pounds of load are the pillars holding up?", "choices": ["1000", "zero", "500", "5000"], "correct_choice_idx": 1, "direct_answers": ["no idea", "zero", "thousands", "tons", "thousand pounds", "four hundred", "unknown", "unknown", "many", "many"], "difficult_direct_answer": false, "rationales": ["There are no pounds.", "Those pillars are for decoration only and to look nice in the room.", "They're just for decor."], "image": "val2014/COCO_val2014_000000352507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365168, "question_id": "MFiXVUynJR9z5xUUxqWXu8", "question": "What is the protein in this dish?", "choices": ["carrots", "chicken", "shrimp", "broccoli"], "correct_choice_idx": 2, "direct_answers": ["shrimp", "shrimp", "shrimp", "shrimp", "shrimp", "shrimp", "shrimp", "2.5 g", "shrimp", "2.5 g"], "difficult_direct_answer": false, "rationales": ["The protein is the shrimp.", "The white color shows that it is the shrimp.", "There is seafood mixed with the vegetables."], "image": "val2014/COCO_val2014_000000365168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474616, "question_id": "MFnPmQfhQ5hfeJgHdKGzJM", "question": "What is the sum of each individual digit on the boy's shirt?", "choices": ["14", "five", "three", "41"], "correct_choice_idx": 1, "direct_answers": ["five", "five", "five", "five", "five", "five", "number 5", "number 5", "five", "five 5"], "difficult_direct_answer": false, "rationales": ["This is simple addition of one plus four.", "The numbers are one and four. they add to be one more than four.", "The boy has a 1 and a 4 and those add up to 5."], "image": "train2014/COCO_train2014_000000474616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102076, "question_id": "MFnkQ5qsjb4rkRTY5EyT9q", "question": "What style of bathing suit is she wearing?", "choices": ["one piece", "boy short", "bikini", "blouson"], "correct_choice_idx": 2, "direct_answers": ["bikini", "bikini", "bikini", "bikini", "bikini", "bikini", "bikini", "bikini", "bikini", "bikini"], "difficult_direct_answer": false, "rationales": ["Her bathing suit has two pieces.", "The woman is wearing a two-piece bathing suit which consists of a triangular shaped top and bottom.", "It has a separate top and bottom"], "image": "train2014/COCO_train2014_000000102076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310703, "question_id": "MFyvHaBLhyjqFmxtyJ5Wnh", "question": "What is the train stopped at?", "choices": ["ball park", "fire hydrant", "bistro", "fountain"], "correct_choice_idx": 1, "direct_answers": ["depot", "fire hydrant", "yukon", "desert", "track end", "train station", "not working", "train station", "depot station", "warehouse"], "difficult_direct_answer": true, "rationales": ["There is a yellow hyrdant in front of the engine.", "The train has stopped next to the fire hydrant.", "The yellow object is used for water."], "image": "val2014/COCO_val2014_000000310703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477097, "question_id": "MGZ59kNktpaNEqUY5G6HJL", "question": "Which elevation did the skier ride the lift from to this point?", "choices": ["higher", "lower", "same", "mount everest"], "correct_choice_idx": 1, "direct_answers": ["ground level", "lower", "bottommost", "low", "bottom", "lower", "5000 feet", "down elevator", "lower", "low elevation"], "difficult_direct_answer": false, "rationales": ["The elevation is lower.", "The skier is going downhill.", "The skier started at the bottom of the hill and used the lift to get to the top."], "image": "train2014/COCO_train2014_000000477097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381117, "question_id": "MGbd4jdrvPY2qbj77McwJa", "question": "What is the grey object used for?", "choices": ["parking", "gaming", "gambling", "sight seeing"], "correct_choice_idx": 0, "direct_answers": ["parking fee", "speed meter", "meters", "track time", "parking meter", "meter", "money", "parking meter", "parking", "parking"], "difficult_direct_answer": false, "rationales": ["The object is for parking.", "This is a meter that takes change in exchange for time to park on a public street.", "The gray device is a parking meter that a person puts money into to pay for parking."], "image": "train2014/COCO_train2014_000000381117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67414, "question_id": "MGcRAAc64A7MDYzkNyFZLu", "question": "Actual color of zebra's stripe are?", "choices": ["purple", "white", "green", "black"], "correct_choice_idx": 1, "direct_answers": ["white", "white black", "black", "black", "white", "black", "black", "white", "black", "black"], "difficult_direct_answer": false, "rationales": ["A typical zebra with black stripes is leaning into a car.", "The zebra's stripes are white and black.", "The zebra by the silver car is black and has white stripes."], "image": "train2014/COCO_train2014_000000067414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291347, "question_id": "MGhMCjKhacNULdyabEio7N", "question": "What is the piece of fruit with black seeds called?", "choices": ["banana", "grape", "apple", "kiwi"], "correct_choice_idx": 3, "direct_answers": ["kiwi", "kiwifruit", "kiwi", "kiwi", "kiwi", "fruits", "kiwi", "fruits", "kiwi", "kiwi"], "difficult_direct_answer": false, "rationales": ["The fruit is the kiwi.", "This fruits green color with row of seeds towards middle identify it as a piece of kiwi.", "The green fruit with black seeds is a kiwi."], "image": "train2014/COCO_train2014_000000291347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127788, "question_id": "MGpfbZNnh629Z3YMd6jQe2", "question": "What type of transportation is shown?", "choices": ["air", "rail", "road", "water"], "correct_choice_idx": 2, "direct_answers": ["road", "bikes", "bike", "bikes", "bicycle", "bicycle", "bike", "bikes", "bike", "bike"], "difficult_direct_answer": false, "rationales": ["The bikes are on a street.", "Bikes are on pavement with white stripes down it. roads are marked with white lines.", "These are all bicycles you can ride on the street."], "image": "val2014/COCO_val2014_000000127788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468789, "question_id": "MGzHaQhXzUmu5Hgdh9EQF2", "question": "What is the man doing with the girl?", "choices": ["coaching", "competing", "babysitting", "playing tennis"], "correct_choice_idx": 0, "direct_answers": ["coaching her", "practicing tennis", "practicing tennis", "playing tennis", "practicing tennis", "coaching", "practicing tennis", "playing tennis", "playing tennis", "playing"], "difficult_direct_answer": false, "rationales": ["Within the picture both man and girl are in tennis playing attire. since the man is interacting with the kid in a teaching fashion, it would most likely be part of coaching.", "She is young and seems new to the sport. there are many balls strewn across the court, indicating that they have been using them to practice.", "She is too small to be his opponent, so he must be teaching her the game."], "image": "train2014/COCO_train2014_000000468789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254879, "question_id": "MH5j8cHjaTktcDSvFt3HWi", "question": "What are the activities which take place in this room called?", "choices": ["physical training", "food preparation", "tv watching", "mathematics teaching"], "correct_choice_idx": 1, "direct_answers": ["cooking", "cooking", "food preparation", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["This room is mainly used to make food and get food prepared.", "The activities are for food prep.", "There is a stove, oven, and a refrigerator making the room a kitchen. a kitchen is used for cooking."], "image": "train2014/COCO_train2014_000000254879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504211, "question_id": "MHAfrv7YonA4mC6giLbYXe", "question": "How many important roles in baseball game?", "choices": ["11", "five", "nine", "four"], "correct_choice_idx": 1, "direct_answers": ["nine", "nine", "three", "three pictured", "eleven", "nine", "five", "multiple", "unkown", "three"], "difficult_direct_answer": false, "rationales": ["There are five roles.", "The important roles in baseball are; pitcher, catcher, first baseman, second baseman, third baseman, shortstop, left fielder, center fielder and right fielder.", "Each team has nine players (p, c, 1b, 2b, 3b, ss, rf, cf, lf) take the field while on defense and each person plays an important role in the game."], "image": "train2014/COCO_train2014_000000504211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66241, "question_id": "MHNtpkVmzmuYWWwEsbat2a", "question": "What kind of reaction the person shows?", "choices": ["smiling", "laughing", "ordering", "confusion"], "correct_choice_idx": 3, "direct_answers": ["suprise", "confusion", "disgust", "confusion", "frustration", "confused", "confusion", "confused", "happy", "confused"], "difficult_direct_answer": false, "rationales": ["The person shows confusion as he looks at his luggage.", "The person has his hand up and has an unpleasant expression.", "The person is confused."], "image": "train2014/COCO_train2014_000000066241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523955, "question_id": "MHPkpWGEEqUkiwCUubdU9q", "question": "What sport is the individual engaging in?", "choices": ["skiing", "snowshoeing", "sledding", "snowboarding"], "correct_choice_idx": 3, "direct_answers": ["skiing", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboard", "snowboarding", "snowboarding", "scatting", "snowboarding"], "difficult_direct_answer": false, "rationales": ["The man is standing on one board on the snow.", "As indicated by the board on their feet.", "The people are standing on snowboards which are wide single boards used on snowy surfaces."], "image": "val2014/COCO_val2014_000000523955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172509, "question_id": "MHnMYYs57K9eLnqtPRqWLg", "question": "What type of building might this be?", "choices": ["daycare", "school", "courthouse", "church"], "correct_choice_idx": 1, "direct_answers": ["office", "office", "food hall", "food court", "school", "school", "classroom", "school", "restaurant", "office"], "difficult_direct_answer": false, "rationales": ["There is a whiteboard and desks.", "There are desks in the background.", "In the distance you can see some classroom desks. on front table you have platters of food."], "image": "train2014/COCO_train2014_000000172509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518816, "question_id": "MJ76vViYnrHZnL9mDevhVn", "question": "Where could these buildings be?", "choices": ["russia", "netherlands", "china", "japan"], "correct_choice_idx": 1, "direct_answers": ["farming village", "farm", "houses", "farm", "farm", "netherlands", "houses", "farm", "nightly rentals", "cabins"], "difficult_direct_answer": false, "rationales": ["Netherlands has many rural areas.", "These are common building types in this country", "The area is a temperate area with an animal native to the country."], "image": "train2014/COCO_train2014_000000518816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167920, "question_id": "MJMZhSy8Ecnjarg4DbF3xD", "question": "In which direction with the airborne skateboarder go next?", "choices": ["his left", "down", "upwards", "backwards"], "correct_choice_idx": 1, "direct_answers": ["down", "down", "west", "down", "west", "down", "west", "down", "down", "down"], "difficult_direct_answer": false, "rationales": ["He is already in the air, and rules of gravity mean that he must now be on his way down.", "Gravity will pull him back to earth", "He is in the air."], "image": "val2014/COCO_val2014_000000167920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7594, "question_id": "MJcPyCVc6fhfiieHMvwvKq", "question": "What is the bear doll's mouth touching?", "choices": ["cushion", "table", "chair", "food"], "correct_choice_idx": 1, "direct_answers": ["table", "table", "table", "table", "table", "table", "boys hand", "table", "table", "boys hand"], "difficult_direct_answer": false, "rationales": ["The boy holding the bear is sitting at a chair. the bear's mouth is touching a round wooden object that is next to the chair.", "It's touching the table.", "The bear is touching a table."], "image": "train2014/COCO_train2014_000000007594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206013, "question_id": "MJco49rhmGnE5VFA9dFTQX", "question": "What topping is the blue-green one on the left slice of bread?", "choices": ["cheese", "peanut butter", "jam", "candy"], "correct_choice_idx": 3, "direct_answers": ["pop rocks", "pop rocks", "pop rocks", "pop rocks", "pop rocks", "pop rocks", "candy", "pop rocks", "pop rocks", "peanut butter"], "difficult_direct_answer": false, "rationales": ["The packaging is next to the blue-green is next to the plate.", "The package is partially on the plate", "The blue-green topping is from the pop rocks package."], "image": "train2014/COCO_train2014_000000206013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331191, "question_id": "MJfPPUPFDmpnejB94vAV6o", "question": "What is most likely to be found inside this store?", "choices": ["blankets", "fish", "food", "jeans"], "correct_choice_idx": 0, "direct_answers": ["blankets", "beds", "beds", "bedroom furniture", "linens", "bedroom furniture", "beds", "beds", "beds", "bedroom furniture"], "difficult_direct_answer": false, "rationales": ["The company sign uses the word bed.", "Blankets could be found.", "A store name has the word bed in it. store windows are lit and beds can be seen on display behind. the beds have blankets on them."], "image": "train2014/COCO_train2014_000000331191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96421, "question_id": "MJxasPnk3YTY2LEmyYFNjJ", "question": "The wall decoration and props here are modeled after which location?", "choices": ["garage", "living room", "bedroom", "kitchen"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "kitchen", "kitchen", "kitchen", "comic", "kitchen", "comic", "kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["It's obvious by the fridge that the kids are look inside of.", "There is a refrigerator in the room.", "A fridge is seen in the room."], "image": "train2014/COCO_train2014_000000096421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205505, "question_id": "MJzt7h5DHAwhQt44wjNqWb", "question": "What is this venue likely to be?", "choices": ["wilderness", "themed park", "barn", "zoo"], "correct_choice_idx": 3, "direct_answers": ["farm", "zoo", "farm", "zoo", "farm", "farm", "farm", "farm", "forest", "farm"], "difficult_direct_answer": false, "rationales": ["There are sheep inside of an enclosure.", "Animals are together behind a fence.", "The animals are in a fenced in enclosure."], "image": "train2014/COCO_train2014_000000205505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539631, "question_id": "MKkankjDfwyLrAE6ugQ6Pe", "question": "Who is the man wearing the red tie?", "choices": ["martin luther", "michael blake", "barack obama", "anthony fauci"], "correct_choice_idx": 2, "direct_answers": ["obama", "barack obama", "president", "obama", "obama", "president", "barack obama", "barack obama", "barack obama", "barack obama"], "difficult_direct_answer": false, "rationales": ["The man is like barack.", "Barack obama was the president of the united states. behind him it says the white house.", "The man wearing the red tie was the 44th united states president and is an easily recognizable famous individual."], "image": "train2014/COCO_train2014_000000539631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434866, "question_id": "MKsBWDnWrnXFbMKjAb6hay", "question": "Which country has elephant as national animal?", "choices": ["africa", "france", "thailand", "germany"], "correct_choice_idx": 2, "direct_answers": ["thailand", "thailand", "thailand", "thailand", "india", "africa", "india", "thailand", "india", "thailand"], "difficult_direct_answer": false, "rationales": ["It is used as a symbol for fortune in this country", "Elephants are prominent in thailand.", "The national animal of thailand is the elephant."], "image": "train2014/COCO_train2014_000000434866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137666, "question_id": "MLuy9fFbLDfj5VKjboVnK5", "question": "What are the towels made from?", "choices": ["paper", "steel", "wood", "metal"], "correct_choice_idx": 0, "direct_answers": ["clothes", "clothes", "paper", "paper", "paper", "paper", "paper", "cotton", "paper", "paper"], "difficult_direct_answer": false, "rationales": ["The towels are disposable. it would cost too much to make them out of metal, steel, or wood.", "The towels are paper towels.", "A roll of towels can be seen on the floor. they towels are white."], "image": "train2014/COCO_train2014_000000137666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40912, "question_id": "MMAfoM9uqDHLLFzx2tWmQE", "question": "Why is the man holding an umbrella?", "choices": ["to dance", "keep dry", "to give", "to sell"], "correct_choice_idx": 1, "direct_answers": ["stay dry", "raining", "avoid rain", "rain", "raining", "repel rain", "rain protection", "rain", "keep dry", "it's raining"], "difficult_direct_answer": false, "rationales": ["The street is wet because it is raining. the man is using the umbrella to protect his clothing and body from the raindrops.", "As indicated by the wet sidewalk and streets, which means it's raining.", "There is proof of water droplets forming on the sidewalk. that shows that it's raining."], "image": "train2014/COCO_train2014_000000040912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417849, "question_id": "MMDEwxRBQSuNa9nq3avK8K", "question": "What is used to milk cows here?", "choices": ["machine", "hands", "cows", "goats"], "correct_choice_idx": 0, "direct_answers": ["person", "machine", "milking machine", "buckets", "machine", "milker", "milker", "machines", "machine", "technology"], "difficult_direct_answer": false, "rationales": ["The man is trying to connect tubes to the cow's udder to suck out the milk into the buckets.", "They use a machine to help milk the cows quicker.", "The man is hooking it up to the udders"], "image": "val2014/COCO_val2014_000000417849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457754, "question_id": "MMU6ougjRtVNFxNxvVDfnP", "question": "What type of event is this?", "choices": ["meeting", "funeral", "shower", "wedding"], "correct_choice_idx": 0, "direct_answers": ["signing", "signing", "signing", "bill signing", "meeting", "signing ceremony", "signing", "political", "bill-signing", "signing"], "difficult_direct_answer": false, "rationales": ["The men are in business attire.", "The event is a meeting.", "The men are in a business meeting."], "image": "val2014/COCO_val2014_000000457754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522416, "question_id": "MMh6ipuNNTtkhc8eyUdsk2", "question": "How does the child in the green shirt feel?", "choices": ["joyful", "amused", "sad", "happy"], "correct_choice_idx": 2, "direct_answers": ["sad", "sad", "sadness", "sad", "sad", "sad", "sad", "large", "large", "sad"], "difficult_direct_answer": false, "rationales": ["The child has the bottom lip out like she is pouting.", "The lower lip protruding indicates they are close to crying", "They are frowning, and these facial cues would let anyone with emotional intelligence know that they are feeling sad."], "image": "train2014/COCO_train2014_000000522416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51720, "question_id": "MMvXQza2JmD7Bw5UqVLfB8", "question": "What sort of lesson might the short people be getting?", "choices": ["beginner ski", "olympic ski", "marshmallows", "kittens"], "correct_choice_idx": 0, "direct_answers": ["skiing", "skiing", "skiing", "skiing lesson", "skiing lesson", "beginner ski", "skiing", "skiing", "skiing", "skiing lessons"], "difficult_direct_answer": false, "rationales": ["Learning to ski starts by not using poles.", "The kids are learning the basics of skiing.", "The lesson is for beginners."], "image": "train2014/COCO_train2014_000000051720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362766, "question_id": "MN6hHkcwggspbLK2e3wZJd", "question": "What do the black flags mark?", "choices": ["player", "danger", "course", "avalanche"], "correct_choice_idx": 2, "direct_answers": ["play", "hitting spots", "course", "direction", "race course", "racing sticks", "terrain", "route", "boundary", "pathway"], "difficult_direct_answer": true, "rationales": ["The flags mark the course.", "They show where to ski", "Flags are used to mark the ski path on courses."], "image": "train2014/COCO_train2014_000000362766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75581, "question_id": "MN7aPkXT7UmMPehXA8JMEN", "question": "What is he doing?", "choices": ["showing off", "taking shortcut", "tricks", "falling"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "skating", "skateboarding", "playing", "grinding", "grinding skateboard", "tricks", "skateboarding trick", "grinding", "ramp skateboarding"], "difficult_direct_answer": false, "rationales": ["He is on a bar with a skateboard.", "Here we see a man grinding his skateboard down a bannister for onlookers. this would be classified as a skateboarding trick.", "He is sliding his skateboard down a railing"], "image": "train2014/COCO_train2014_000000075581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450107, "question_id": "MNPqzTVXnDzatE2Zdiwepc", "question": "What type of shop is the person wearing the tie in?", "choices": ["deli", "restaurant", "grocery store", "book store"], "correct_choice_idx": 3, "direct_answers": ["bookshop", "book", "library", "book", "book store", "library", "bookshop", "book store", "book", "book"], "difficult_direct_answer": false, "rationales": ["You can tell by all of the bookshelves as to where the photo was taken.", "The shelves behind the guy are stacked high with various books.", "The person has a lot of books behind them."], "image": "train2014/COCO_train2014_000000450107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156232, "question_id": "MNmEBpyk7FDYsRwsP5r9TV", "question": "For what reason do the persons wear vests?", "choices": ["cammo", "fashion", "warmth", "visibility safety"], "correct_choice_idx": 3, "direct_answers": ["reflective lights", "safety", "reflection", "working gear", "visibility", "visibility", "visibility safety", "visibility", "visibility safety", "visibility"], "difficult_direct_answer": false, "rationales": ["The colorful vest makes it easy to see them.", "These vests are used for safety and visibility because of their reflective ability.", "The bright color allows others to see them easily."], "image": "val2014/COCO_val2014_000000156232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29405, "question_id": "MNqyTHCwUHUainmEepoed3", "question": "How much would everything here cost approximately?", "choices": ["275", "50", "310", "300000"], "correct_choice_idx": 3, "direct_answers": ["ten thousand", "thousand", "1000000", "one million", "alot", "lot", "lots", "300000", "200000 dollars", "100 thousands"], "difficult_direct_answer": true, "rationales": ["The room and the furnishings in it would cost several hundreds of thousands of dollars.", "The room is decorated with very expensive furnishings that could cost $300000.", "There are a lot of quality items in the room"], "image": "train2014/COCO_train2014_000000029405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521014, "question_id": "MNxPsSasB8bmVwWeqzFXfc", "question": "What age group is the person who designed the room most likely in?", "choices": ["20-30", "10 -20", "50-60", "70-80"], "correct_choice_idx": 2, "direct_answers": ["50-60", "old", "middle age", "40s", "30s", "50s", "their 30s", "older", "middle aged", "forty's"], "difficult_direct_answer": true, "rationales": ["Someone older in mid life range.", "These are old colors from the 70s when these people were first adults", "These have colors from the 70s and these people were adults then"], "image": "train2014/COCO_train2014_000000521014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247421, "question_id": "MP6f29FJLCnPLREkVPZ9FZ", "question": "Why does the man wear shirt and tie?", "choices": ["personal preference", "fashion", "virtual conference", "in office"], "correct_choice_idx": 2, "direct_answers": ["fashion statement", "looks better", "professionalism", "for work", "zoom meeting", "dressing up", "work", "virtual conference", "work", "video meeting"], "difficult_direct_answer": true, "rationales": ["The man has a laptop in front of him so the conference must be virtual.", "He wants to people in his zoom conference to think that he is professionally dressed and they can only see his top half.", "The man is at a virtual conference."], "image": "train2014/COCO_train2014_000000247421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475571, "question_id": "MPS8735tq9jNhugT7T4QZW", "question": "What kind of media is she consuming?", "choices": ["television", "film", "digital", "print"], "correct_choice_idx": 2, "direct_answers": ["iphone", "social media", "online", "social media", "social", "social", "cellphone", "mobile", "mobile phone", "digital"], "difficult_direct_answer": false, "rationales": ["The woman has her hands on her phone.", "The woman is consuming digital media on her phone.", "The other options don't fit unless she's watching c or d on her phone, which would then mean a plus one of those options."], "image": "train2014/COCO_train2014_000000475571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7149, "question_id": "MPayYT9phEBg3diZWDWHZ8", "question": "Why is the woman hosing the animal off?", "choices": ["it's cold", "it's thirsty", "it's hot", "it's dirty"], "correct_choice_idx": 3, "direct_answers": ["dirty", "cleaning", "washing", "dirty", "dirty", "bath time", "bathing", "washing", "dirty", "it's dirty"], "difficult_direct_answer": false, "rationales": ["The woman wants to clean the animal.", "The fur is matted and dull in color.", "It needs to be cleaned"], "image": "val2014/COCO_val2014_000000007149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106635, "question_id": "MPe5TrxmaQm3kgAGJLhdWP", "question": "Why is there a bright circle?", "choices": ["sun light", "edited in", "bright lamp", "laser light"], "correct_choice_idx": 0, "direct_answers": ["sun", "sun", "sun light", "sun halo", "sun", "sun", "sun", "sun", "sun halo", "sun"], "difficult_direct_answer": false, "rationales": ["The sun is shinning so bright in the sky.", "This is common in photography. the light is simply too bright for the lens to capture it, especially when focused on the foreground.", "The sun is a large ball of flame in space, and it shines in the sky very intensely."], "image": "train2014/COCO_train2014_000000106635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469635, "question_id": "MPzRQYPvHqEexQhazpyLSr", "question": "Why are the people using umbrellas?", "choices": ["blocking sun", "to dance", "wind blocking", "keeping dry"], "correct_choice_idx": 0, "direct_answers": ["sun protection", "sun protection", "shade", "covering sun", "hot sun", "blocking sun", "sun", "provide shade", "hot", "sun protection"], "difficult_direct_answer": false, "rationales": ["The people are trying to block the rays of the sun.", "It's a sunny day and they are using it to block themselves from the sun.", "It is not raining outside but it is very sunny"], "image": "val2014/COCO_val2014_000000469635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285497, "question_id": "MQ2CMuffd7dAWhSNPigEWz", "question": "Why are they wearing so much stuff?", "choices": ["showing off", "is cold", "is stylish", "is windy"], "correct_choice_idx": 1, "direct_answers": ["cold", "cold", "for warmth", "stay warm", "regulate body-temperatures", "cold day", "keeping warm", "cold", "snowboarding", "is cold"], "difficult_direct_answer": false, "rationales": ["There is snow all over, which is very cold.", "Wearing a lot of layers including jackets protects you from the cold as it gives you more insulation.", "We can presume this snowy mountain scene is cold."], "image": "val2014/COCO_val2014_000000285497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432201, "question_id": "MQNGm9UZizTfpLbvZNGwaT", "question": "What is on the left side of the room?", "choices": ["sink", "computer", "television", "monkey"], "correct_choice_idx": 0, "direct_answers": ["sink", "sink", "sinks", "sink", "sink", "sinks", "sinks", "sink", "sink", "sink"], "difficult_direct_answer": false, "rationales": ["There is a faucet over it", "The left side of the room has all the sinks and can be used for washing.", "There is a faucet over it"], "image": "train2014/COCO_train2014_000000432201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220464, "question_id": "MQU2Qv87RgW9j83Rzaz6mU", "question": "WHat type of phone is the man holding?", "choices": ["corded", "smart phone", "iphone", "flip"], "correct_choice_idx": 3, "direct_answers": ["mobile", "flip phone", "iphone", "cell", "cellular", "flip", "flip", "cell phone", "cell phone", "flip"], "difficult_direct_answer": false, "rationales": ["You can see the little bump by his thumbs.", "The man is holding a flip cell phone.", "The phone has two sides."], "image": "train2014/COCO_train2014_000000220464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14864, "question_id": "MQbnWNCf3cuznAeFu4h6H7", "question": "What shot is the girl hitting?", "choices": ["forehand", "slice", "serve", "backhand"], "correct_choice_idx": 3, "direct_answers": ["tennis shot", "backhand", "backhand", "smashing", "flat", "mid", "backhand", "tennis ball", "underhand", "side arm"], "difficult_direct_answer": false, "rationales": ["The person is using the hand at the back.", "She is standing with her right hand in front instead of behind her.", "The girl is holding the racket in her right hand. the fact that she is swinging the racquet to the right of the body means that she is hitting a backhand."], "image": "train2014/COCO_train2014_000000014864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244171, "question_id": "MQc8PPpBJCZ7wMvwuEULxG", "question": "What are these people observing?", "choices": ["festival", "fireworks", "firestorm", "bonfire"], "correct_choice_idx": 0, "direct_answers": ["fire", "fires", "festival", "bonfire celebration", "fire", "fire", "fire", "fire", "ceremony", "fireworks"], "difficult_direct_answer": false, "rationales": ["Lights are ablaze in a festive way.", "There are a lot of lights out in observance of a special occasion.", "The people are looking at a conflagration at night, thus corresponding to what is mentioned in option c."], "image": "train2014/COCO_train2014_000000244171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235245, "question_id": "MQsxjvNYGnSV9iJS9Q8JLe", "question": "What is a baseball glove called?", "choices": ["mitt", "envelope", "cover", "gloves"], "correct_choice_idx": 0, "direct_answers": ["glove", "catcher's mitt", "mitt", "mitt", "mit", "mitt", "catcher mitt", "mitt", "mitt", "mitt"], "difficult_direct_answer": false, "rationales": ["This is the standard name for this piece of sporting equipment.", "That is what a glove in baseball is called.", "A baseball glove is commonly called a mitt."], "image": "train2014/COCO_train2014_000000235245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289392, "question_id": "MRTKX4zHhe7XgTU5naxBwH", "question": "What is the operating system being projected?", "choices": ["mac os", "ms dos", "linux", "windows"], "correct_choice_idx": 0, "direct_answers": ["wall", "window ten", "mac os", "office", "linux", "mac", "mac", "mac", "mac os", "mac os"], "difficult_direct_answer": false, "rationales": ["The operating system being shown is an apple system.", "Icons are in a line in a black bar along the bottom of a screen.", "The icons are shown across the bottom"], "image": "val2014/COCO_val2014_000000289392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161856, "question_id": "MRVuoqUtqrmbWkA37K8kuy", "question": "In which location might this umbrella be appropriate?", "choices": ["veterans parade", "klan rally", "volcano", "pride parade"], "correct_choice_idx": 3, "direct_answers": ["rainy place", "rainy", "rain", "beach", "rain", "snow/rain", "pride parade", "snowing", "outdoors", "any location"], "difficult_direct_answer": true, "rationales": ["The location is the pride parade.", "The umbrella is rainbow colored which is a symbol that is used by the lgbtq community. they have a yearly outdoor celebration to support their community.", "The colors that represent this sect of america are rainbow colored."], "image": "train2014/COCO_train2014_000000161856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511301, "question_id": "MRWu6SYDyidaD6UsRg2Eo5", "question": "What make of car can be seen next to the parking meter?", "choices": ["mercedes", "acura", "audi", "bmw"], "correct_choice_idx": 0, "direct_answers": ["mercedes", "mercedes benz", "mercedes benz", "mercedes", "mercedes", "mercedes", "stop", "mercedes", "mercedes", "mercedes"], "difficult_direct_answer": false, "rationales": ["An older, boxy mercedes is parked by the meter.", "The car has a logo symbol on the back that is the logo of this brand.", "One can see their familiar three- pronged logo on the back of the car."], "image": "train2014/COCO_train2014_000000511301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327864, "question_id": "MRqPCAFLXocXUW5YTR3BDt", "question": "Besides his own food what specialized food does this person have in his home?", "choices": ["homeless grub", "bird seed", "ferret chow", "cat food"], "correct_choice_idx": 3, "direct_answers": ["cat", "cat food", "cat food", "cat food", "cat food", "cat", "cat", "cat food", "cat food", "cat food"], "difficult_direct_answer": false, "rationales": ["This is the most likely answer if this animal is in fact his pet.", "The man has cat food in his home.", "He would have food for his pet."], "image": "val2014/COCO_val2014_000000327864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566488, "question_id": "MRqhH52GUdEGgrxt8hGVdR", "question": "What is the most accurate name for the boy's hair style?", "choices": ["dreadlocks", "mohawk", "pony tail", "mullet"], "correct_choice_idx": 1, "direct_answers": ["mohawk", "new style", "mohawk", "mohawk", "mohawk", "mohawk", "mohawk", "jane", "mohog", "mohawk"], "difficult_direct_answer": false, "rationales": ["The hair is much longer in a line from front to back", "The boy has his hair styled in a spiky mohawk.", "The boy has a mohawk."], "image": "train2014/COCO_train2014_000000566488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449071, "question_id": "MRvH23AQGkMRjqArhdQvzp", "question": "What is the tail shape of these surfboards?", "choices": ["fish", "square", "squash", "pin"], "correct_choice_idx": 0, "direct_answers": ["fins", "feather", "dolphin", "fin shaped", "swallow", "fish tail", "fin shaped", "fish", "red", "mermaid"], "difficult_direct_answer": true, "rationales": ["The board uses that as it floats in the water.", "The other options don't match a tail.", "The surfboards are shaped like a fish."], "image": "train2014/COCO_train2014_000000449071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5123, "question_id": "MTYUqfw5TYgAsZgGYUoMdy", "question": "What type of trash goes in these trash cans?", "choices": ["recycling", "general waste", "horse manure", "greenery"], "correct_choice_idx": 1, "direct_answers": ["road based", "recycling", "any", "household", "edible trash", "general waste", "non-recyclable trash", "recycling", "garbage", "none recyclables"], "difficult_direct_answer": true, "rationales": ["The trash is general waste.", "Any waste product that do not have any value are placed in the trash can.", "General waste goes into these trash cans."], "image": "val2014/COCO_val2014_000000005123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543203, "question_id": "MUWPfbybCQkvrW6CpJoEwH", "question": "What is painted on the tail of the green plane?", "choices": ["face", "rose", "shamrock", "tulip"], "correct_choice_idx": 2, "direct_answers": ["shamrock", "clover", "clover", "clover", "shamrock", "clover", "clover", "clover", "clover", "clover"], "difficult_direct_answer": false, "rationales": ["One can see the green three leaf clover painted on the tail.", "It is a plant with three leaves on it and it is green.", "There is a three leaf clover on the tail of the green plane."], "image": "val2014/COCO_val2014_000000543203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502854, "question_id": "MUnkuVFjXtRLpExHuZhYoH", "question": "What type of merchant is this?", "choices": ["beauty", "food", "decor", "vehicle"], "correct_choice_idx": 2, "direct_answers": ["flower", "florist", "flower merchant", "florist", "flower", "florist", "flower", "decor", "flower", "flower"], "difficult_direct_answer": false, "rationales": ["The vases are decorative.", "The florals and vases are used to decorate.", "The merchant sells decor."], "image": "val2014/COCO_val2014_000000502854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185225, "question_id": "MUobfoTfPf3kV4dU7UYRhP", "question": "Where will the meat be placed?", "choices": ["in buns", "in foil", "in plate", "in glass"], "correct_choice_idx": 0, "direct_answers": ["buns", "grill", "buns", "in buns", "buns", "buns", "hot dogs", "hotdog buns", "grill", "in buns"], "difficult_direct_answer": false, "rationales": ["If hot dogs are being served, we can assume they will be eaten with the bread currently being toasted.", "The meat will go in buns.", "Hot dogs such as those pictured here are usually eaten on folded bread. we can see these pieces of folded bread being toasted in the background."], "image": "train2014/COCO_train2014_000000185225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181367, "question_id": "MV2WPFwpKXUvJvXiYA98sS", "question": "What is the woman on the left near?", "choices": ["monkey", "bicycle", "egg", "bear"], "correct_choice_idx": 1, "direct_answers": ["bicycle", "bike", "bike", "playground", "river", "bicycle", "bicycle", "bike", "lake", "park"], "difficult_direct_answer": false, "rationales": ["The woman on the left is standing near a vehicle that has two wheels.", "She is near a bike.", "The woman on the left is near a bike."], "image": "train2014/COCO_train2014_000000181367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570037, "question_id": "MVEZXX8HLUhBZsCxHjGpUy", "question": "What is the person on the left standing next to?", "choices": ["airplane", "car", "baby", "umbrella"], "correct_choice_idx": 1, "direct_answers": ["church", "car", "car", "church", "car", "car", "car", "road", "car", "automobile"], "difficult_direct_answer": false, "rationales": ["The person is by a car.", "The object next to him has wheels, windshield, and headlights.", "There is a vehicle next to the person."], "image": "train2014/COCO_train2014_000000570037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125476, "question_id": "MVdfz5cviZdmRPfCgYoPTR", "question": "What design is on each arm?", "choices": ["cat", "eagles", "dog", "bat"], "correct_choice_idx": 1, "direct_answers": ["birds", "eagle", "eagles", "eagle", "eagle", "birds", "bird", "cast iron", "bird", "eagles"], "difficult_direct_answer": false, "rationales": ["The design of the arms has a bird.", "Those are bald eagles gracing the arms of this bench, which is getting overrun by nature in this photo. and though the wood in the bench may rot with time, those eagles will persevere through the ages, no matter the weather.", "The design is the eagle."], "image": "val2014/COCO_val2014_000000125476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250194, "question_id": "MVn8XQdQPZ9WQ8DSr2nxKE", "question": "What is the name of the street?", "choices": ["suzuran", "mulberry", "yancy", "green"], "correct_choice_idx": 0, "direct_answers": ["suzuran", "suzuran", "suzuran", "suzuran", "suzuran", "suzuran", "suzuran", "suzuran", "suzuran", "suzuran"], "difficult_direct_answer": false, "rationales": ["The street is suzuran.", "The street is called suzuran.", "The name is on top of the entryway."], "image": "train2014/COCO_train2014_000000250194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146509, "question_id": "MVnd7ez26auPv5FUcKGJW7", "question": "What stuffed animal is sold here?", "choices": ["dogs", "bears", "cats", "rabbit"], "correct_choice_idx": 1, "direct_answers": ["teddy bears", "teddy bears", "teddy bears", "teddy bears", "teddy bear", "teddy bears", "bears", "teddy bear", "teddy bears", "bear"], "difficult_direct_answer": false, "rationales": ["There is a sign above the display window.", "There are many bears visible in a store window. the name of the store is \"teddy bears\".", "As indicated on the sign and by the large bear standing next to the window."], "image": "val2014/COCO_val2014_000000146509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62706, "question_id": "MVwbP2Se4YnL4VQaEagFot", "question": "What relation does the person holding the umbrella have to the bride?", "choices": ["thunder stealer", "bride's maid", "child", "stranger"], "correct_choice_idx": 1, "direct_answers": ["husband", "groom", "friend", "friend", "groom", "friend", "groom", "friends", "groom", "bride's maid"], "difficult_direct_answer": false, "rationales": ["Her friend who is in her wedding party is helping her out.", "The people have a black suit and white dress.", "The person is a bridesmaid."], "image": "val2014/COCO_val2014_000000062706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241294, "question_id": "MW4M2cJ5huoTg7gNSNPo7i", "question": "Why is there a bell on the counter?", "choices": ["cow", "assistance", "fire", "cat"], "correct_choice_idx": 1, "direct_answers": ["alert staff", "assistance", "for service", "service", "placing orders", "call server", "call help", "service", "call server", "service"], "difficult_direct_answer": false, "rationales": ["For customers to ask for a.", "The chef can ring the bell when an order is ready.", "Customers can ring this bell if there is not someone at the counter to wait on them."], "image": "train2014/COCO_train2014_000000241294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494579, "question_id": "MW5sDip2EGtvgF3VCMFXYb", "question": "How many eyes does the animal on the left have?", "choices": ["two", "eight", "three", "six"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["These animals have two eyes.", "The animal on the left is a horse which has an eye on each side of its face.", "Horses are known for having two eyes as they are mammals."], "image": "train2014/COCO_train2014_000000494579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65399, "question_id": "MWBoX7EG2HRLfqHZ2ZZSKs", "question": "Which vehicle is an antique?", "choices": ["silver car", "blue car", "black truck", "red truck"], "correct_choice_idx": 3, "direct_answers": ["red car", "red car", "truck", "truck", "truck", "red truck", "car", "truck", "truck", "truck"], "difficult_direct_answer": false, "rationales": ["The red pickup truck looks to be the oldest vehicle.", "The vehicle in the front looks older and is red and a truck.", "The vehicle is the red truck."], "image": "val2014/COCO_val2014_000000065399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512583, "question_id": "MWKqUdfEVSxasQ6awkHiZS", "question": "What sort of condiment might be used in this meal?", "choices": ["barbeque sauce", "salad dressing", "mustard", "catsup"], "correct_choice_idx": 1, "direct_answers": ["dressing", "pepper flakes", "hot sauce", "salad dressing", "salad dressing", "dressing", "spices", "grated parmesan", "dressing", "ranch"], "difficult_direct_answer": false, "rationales": ["There is salad.", "Ranch dressing is good with pizza.", "The pizza is being served with a salad and that usually has some kind of dressing on it."], "image": "train2014/COCO_train2014_000000512583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273579, "question_id": "MWWcaRqZdCt4yqGhek8ucr", "question": "Who is the artist here?", "choices": ["elephant", "no one", "unseen man", "lady smiling"], "correct_choice_idx": 0, "direct_answers": ["elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The large grey animal is painting.", "The painting is facing the animal, so it is probably the artist.", "As indicated by what it's doing with its trunk."], "image": "val2014/COCO_val2014_000000273579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544655, "question_id": "MWiJbTPZon6EgtYbBhRFw6", "question": "What mens wear does the girl wear?", "choices": ["shoes", "neck tie", "top hat", "tuxedo"], "correct_choice_idx": 1, "direct_answers": ["tie", "tie", "tie", "tie", "neck tie", "neck tie", "neck tie", "neck tie", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["This girl is wearing a tie around her neck", "A girl is wearing a tie around her neck. men generally wear ties.", "A girl is smiling and wearing a neck tie. ties are generally worn by men."], "image": "val2014/COCO_val2014_000000544655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521752, "question_id": "MWtAzHQg7q2Xb6cARi8wUH", "question": "What type of ceiling is there?", "choices": ["rectangular", "glass", "arched", "flat"], "correct_choice_idx": 2, "direct_answers": ["wooden beams", "wooden", "arched", "arched", "cathedral", "wooden", "vaulted", "wooden", "arched", "arched"], "difficult_direct_answer": false, "rationales": ["The lights are going over like a rainbow.", "There is a multilayered curved ceiling above the people sitting at the couch.", "The ceiling is made out of wood, not glass. it is not flat or rectangular."], "image": "train2014/COCO_train2014_000000521752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516462, "question_id": "MWuQwuM692Zq8zfTmRFmWP", "question": "What does the child have all over her hands?", "choices": ["crayon", "snow", "paint", "food"], "correct_choice_idx": 3, "direct_answers": ["mashed potatoes", "frosting", "cake", "cake", "food", "food", "food", "food", "frosting", "cake"], "difficult_direct_answer": false, "rationales": ["The girl is eating food.", "This toddler is at a birthday party. he really has done his best to utilize his spoon, but that's not working, so he must use his hands to get some food to his mouth!.", "The kid is eating potatoes."], "image": "train2014/COCO_train2014_000000516462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330265, "question_id": "MWzP8UaaoaBDqA68BnMJqT", "question": "How is the toilet flushed?", "choices": ["pressure activated", "ir beam", "sound activated", "manual handle"], "correct_choice_idx": 1, "direct_answers": ["handle", "automatically", "by sensor", "motion detection", "automatic", "ir beam", "flushing knob", "automatic", "automatic sensor", "motion sensor"], "difficult_direct_answer": true, "rationales": ["There is no handle to flush it with, so it must use a sensor of some kind.", "There is no handle seen to flush the toilet, but the silver item attached to it with the black circle is a sensor that can detect when someone stands up, and will automatically flush the toilet.", "There is a sensor above the toilet. it is self flushing."], "image": "val2014/COCO_val2014_000000330265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454296, "question_id": "MX9Hr7NhhZCSW7zKHW4P53", "question": "What is this structure used for?", "choices": ["changing floors", "riding", "sleeping", "resting"], "correct_choice_idx": 0, "direct_answers": ["going upstairs", "changing floors", "changing floors", "ascension", "floor connections", "changing floors", "reaching floors", "climbing upward", "moving between-levels", "changing stories"], "difficult_direct_answer": false, "rationales": ["A long staircase made of marble leads to a landing. after walking across the landing, another flight of stairs awaits, leading to the building's second floor.", "Stairs serve that purpose. the other options don't fit.", "The stairs are used to change floors."], "image": "val2014/COCO_val2014_000000454296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471280, "question_id": "MX9JLQKnVTa3VRTYkHjBks", "question": "What do people put on train tracks?", "choices": ["ballast", "asphalt", "concrete", "wires"], "correct_choice_idx": 0, "direct_answers": ["trains", "trains", "trains", "trains", "pennies", "trains", "pennies", "trains", "ballast", "trains"], "difficult_direct_answer": false, "rationales": ["On most train tracks ballasts are used to bear the load from railroad ties which are quite visible.", "Ballast are put on the tracks. the trains run on the rails.", "There are rocks under the tracks."], "image": "train2014/COCO_train2014_000000471280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534155, "question_id": "MXFA33MKfYVL3pGxQ7aZSz", "question": "In what decade of the twentieth century was this vehicle first used?", "choices": ["fifth", "third", "seventh", "first"], "correct_choice_idx": 3, "direct_answers": ["recently", "first", "twenty ometh", "fourth", "not sure", "twenties", "first decade1903", "second", "1980", "twenties"], "difficult_direct_answer": true, "rationales": ["The vehicle is a airplane which was first used in answer a relative to the decades of the twentieth century.", "Airplanes first came out in the first decade.", "Wright brothers flew one in 1903 and centuries are always one number higher."], "image": "train2014/COCO_train2014_000000534155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5688, "question_id": "MXSYv2kurMfTfM7vdHNxUr", "question": "What is the unique feature of the parasol?", "choices": ["protection", "grip", "none", "shadow"], "correct_choice_idx": 3, "direct_answers": ["shadow", "folding design", "sun protection", "tuborg", "advertising", "it's mounted", "folds down", "green", "green color", "logo"], "difficult_direct_answer": true, "rationales": ["The umbrella is used for shade.", "The feature is a shadow.", "The parasol creates a shadow which is good for shade."], "image": "train2014/COCO_train2014_000000005688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288751, "question_id": "MXW55NDLrT4AvG3LNTkbnc", "question": "What item is blue here?", "choices": ["blueberry", "sky", "smurf", "orchid"], "correct_choice_idx": 1, "direct_answers": ["water", "water", "water", "water", "sky", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["There is blue above the buildings. sky is something that would appear above buildings.", "The water is only reflecting the color", "The sky and water are blue."], "image": "train2014/COCO_train2014_000000288751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488942, "question_id": "MYANWcNFnXHu44qbHqxsgQ", "question": "What types of buildings are these?", "choices": ["mobile", "high rise", "historical", "religious"], "correct_choice_idx": 1, "direct_answers": ["apartments", "high rises", "office buildings", "office buildings", "high rise", "business offices", "skyscrapers", "skyscrapers", "office", "office"], "difficult_direct_answer": false, "rationales": ["These are tall buildings in the city.", "The buildings here are high rise buildings that are found in cities.", "The buildings are highrises."], "image": "val2014/COCO_val2014_000000488942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283097, "question_id": "MYUuerDzgjEGWrof2FHJUH", "question": "Why is he struggling?", "choices": ["too young", "is disabled", "needs help", "is weak"], "correct_choice_idx": 2, "direct_answers": ["heavy", "heavy", "needs help", "baggage", "clean", "heavy luggage", "heavy luggage", "heavy", "it's heavy", "heavy luggage"], "difficult_direct_answer": false, "rationales": ["The young lad pushing the cart could use some assistance because it is very heavy due to the large suitcases.", "The suitcases are too heavy.", "The roller is stacked high with suitcases."], "image": "val2014/COCO_val2014_000000283097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447607, "question_id": "MYW6pVZnpsHNZcJniMKmgK", "question": "Where is the house located?", "choices": ["beach", "lake", "hill", "valley"], "correct_choice_idx": 2, "direct_answers": ["on hill", "hill", "hill", "top hill", "hill", "by street", "top left", "hill", "corner", "hill"], "difficult_direct_answer": false, "rationales": ["You can see that the house is not on steady land.", "It is at a higher elevation than the streetlight", "The other options don't match this image whatsoever."], "image": "train2014/COCO_train2014_000000447607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404759, "question_id": "MYcJ3u3zyNxSk6VQNGpUfz", "question": "How does the man lying on the bench feel?", "choices": ["hot", "cool", "sick", "cold"], "correct_choice_idx": 1, "direct_answers": ["tired", "cool", "tired", "sad", "tired", "sad", "tired", "tired", "tired", "tired"], "difficult_direct_answer": false, "rationales": ["The man is cooling off in the shade.", "The man is underneath a tree which provides shade.", "He's in the shade of the tree"], "image": "train2014/COCO_train2014_000000404759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499480, "question_id": "MYq8AiyQiz6X6DkfYn8EdR", "question": "What is the towel for?", "choices": ["throeing out", "clean dishes", "dry person", "dry dishes"], "correct_choice_idx": 3, "direct_answers": ["drying hands", "cleaning", "drying hands", "dry dishes", "dishes", "wiping", "dry dishes", "dry dishes", "cleaning up", "wapping"], "difficult_direct_answer": false, "rationales": ["The towel is a dish towel.", "The towel is meant to dry the dishes from the sink or dishwasher.", "The towel dries dishes."], "image": "val2014/COCO_val2014_000000499480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219487, "question_id": "MZ35WsfDJ2hsWxZLwpQVG6", "question": "Which car companies logo can be seen on the back of the man's shirt?", "choices": ["bmw", "acura", "chevrolet", "seat"], "correct_choice_idx": 2, "direct_answers": ["chevrolet", "chevrolet", "chevrolet", "chrysler", "chevrolet", "chevrolet", "chevrolet", "ford", "chevrolet", "chevrolet"], "difficult_direct_answer": false, "rationales": ["The plus looking symbol on his back is that of chevy.", "Chevrolet's logo is shown.", "The company is chevrolet."], "image": "val2014/COCO_val2014_000000219487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444571, "question_id": "MZDhTDd3fKQbrVLXzDnP2X", "question": "The red color of fire hydrant represents what?", "choices": ["fire level", "water quality", "water force", "none"], "correct_choice_idx": 2, "direct_answers": ["fire", "fire station", "fire", "stronger water", "fire", "water force", "fire department", "fire", "recognition", "fire"], "difficult_direct_answer": false, "rationales": ["The red color is a water force.", "A red city fire hydrant has a flow rate below 500 gallons per minute.", "The red means the flow of it flow of the water."], "image": "train2014/COCO_train2014_000000444571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135038, "question_id": "MZDhhyfHwfSTXrkqKv5FUa", "question": "What is the person using the spoon for?", "choices": ["feeding", "stirring", "painting", "cleaning"], "correct_choice_idx": 0, "direct_answers": ["feeding", "feeding", "feed cat", "feed cat", "feet kitten", "feeding kitten", "feed kitten", "feeding cat", "feed animal", "feed cat"], "difficult_direct_answer": false, "rationales": ["They are holding it to the kitten's mouth.", "The person is feeding.", "The person is using the spoon to feed the small kitten."], "image": "train2014/COCO_train2014_000000135038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342403, "question_id": "MZNqnPZzXDaGvu9hRq4pNU", "question": "What aspect of the game is being shown here?", "choices": ["cheering", "fielding", "hitting", "umpiring"], "correct_choice_idx": 1, "direct_answers": ["batting", "outfielder", "outfield", "outfield", "outfield", "fielding", "cricket", "fielding", "outfield", "outfield"], "difficult_direct_answer": false, "rationales": ["The aspect is fielding.", "The boy is playing in the outfield and not near the bases.", "The game is at a field."], "image": "val2014/COCO_val2014_000000342403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230777, "question_id": "MZexADy3upEXzCpQW2fhJw", "question": "What kind of lens produced this image?", "choices": ["zoom", "wide angle", "short", "long"], "correct_choice_idx": 1, "direct_answers": ["fish eye", "camera", "fisheye", "scope", "bird's eye", "fisheye", "camera", "wide angle", "camera lens", "camera"], "difficult_direct_answer": false, "rationales": ["The lens makes a wide view of the picture.", "The lens is a wide angle.", "Wide angle lenses produce this kind of effect."], "image": "train2014/COCO_train2014_000000230777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492605, "question_id": "MZhAfhPgp6GcVQv5k8nRnk", "question": "Who will next cause the balls direction to change?", "choices": ["18", "pitcher", "coach", "shortstop"], "correct_choice_idx": 0, "direct_answers": ["18", "hitter", "batter", "batter", "batter", "batter", "batter swing", "batter", "batter", "batter swing"], "difficult_direct_answer": false, "rationales": ["The batter is wearing number 18 and he will hit the ball next.", "18 is batting.", "The pitcher has thrown the ball to the batter. the batter is getting ready to swing to hit the ball and change the direction."], "image": "val2014/COCO_val2014_000000492605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291476, "question_id": "MZhTLB8Nyt8hzkwYwd7WL5", "question": "What type of shot is this man making?", "choices": ["double", "forehand", "backhand", "none"], "correct_choice_idx": 2, "direct_answers": ["backhand", "backhand", "strike", "backhand", "strike", "strike", "backhand", "backhand", "backhand", "backhand"], "difficult_direct_answer": false, "rationales": ["The man is hitting a backhand in the tennis match.", "The man had his hand position so that the back of his hand would be facing the ball.", "The hand that is on the racquet is facing out with the back of his hand towards the ball, therefore it is called a \"backhand\""], "image": "train2014/COCO_train2014_000000291476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125661, "question_id": "MZweibCdJWN4wdDyUVfNZi", "question": "The man on the right holding the beer can is wearing what?", "choices": ["bowtie", "glove", "fedora", "boa"], "correct_choice_idx": 0, "direct_answers": ["tuxedo", "vest", "bowtie", "bow tie", "black suit", "vest", "vest", "vest", "tuxedo", "bow tie"], "difficult_direct_answer": false, "rationales": ["The man on the right has a bowtie on.", "You can tell by the suit the man is wearing as to what he is wearing in particular.", "The tie the man is wearing is shaped like a bow."], "image": "val2014/COCO_val2014_000000125661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32818, "question_id": "MZyZqdrqFGxHmTBQEzKnaZ", "question": "What are the benches for?", "choices": ["washed ashore", "resting", "for sale", "surfing"], "correct_choice_idx": 1, "direct_answers": ["sitting down", "resting", "sitting", "sitting", "resting", "sitting", "resting", "sitting", "sitting", "watching boats"], "difficult_direct_answer": false, "rationales": ["The benches are permanently mounted so they could not have washed ashore or be for sale and they would sink if placed in water so they can't be surfed on.", "The benches are placed here in a public place so that people visiting have a place to comfortably sit.", "Benches placed in public spaces like this beach appears to be are placed to provide people a place to rest."], "image": "train2014/COCO_train2014_000000032818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321942, "question_id": "MaHHKSh98WPJhseLgo4oiB", "question": "What are the toys in the window called?", "choices": ["footballs", "barbie dolls", "horseshoes", "teddy bears"], "correct_choice_idx": 3, "direct_answers": ["dolls", "teddy bears", "stuffed animals", "teddy bears", "teddy bears", "stuffed animals", "teddy bears", "teddy bears", "teddy bear", "teddy bears"], "difficult_direct_answer": false, "rationales": ["There are many popular dolls but the one that imitates bears, or the teddy bear, is very easy to spot.", "The toys are teddy bears.", "The toys are teddies."], "image": "train2014/COCO_train2014_000000321942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234046, "question_id": "MaRSu5CKfH8Ym2j2htFVcD", "question": "What do you usually find in the object that shares the same name as the third word on the sign?", "choices": ["cow", "lawnmower", "fork", "blender"], "correct_choice_idx": 1, "direct_answers": ["tools", "tools", "lawnmower", "tools", "lawn mower", "shovel", "tools", "storage", "shed", "tools"], "difficult_direct_answer": false, "rationales": ["A shed is for storage of things like a mower.", "People store yard equipment in these", "The lawnmower can be found."], "image": "train2014/COCO_train2014_000000234046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296931, "question_id": "MadPy5QTK28MbKqDhfKLBi", "question": "What item is under the bright lights attached to the wall?", "choices": ["oven", "washing machine", "fireplace", "cat"], "correct_choice_idx": 2, "direct_answers": ["mantle", "clock", "clock", "mantle", "mantle", "mantle", "fireplace", "fireplace", "mantle", "mantle"], "difficult_direct_answer": false, "rationales": ["The item is the fireplace.", "The brick and mantle is typical of a fireplace as is the metal guard in front of it as well.", "This is a living room, not a kitchen or laundry room. there is no cat."], "image": "train2014/COCO_train2014_000000296931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415218, "question_id": "MaxWaAMLuZkNVQu7N5q5gX", "question": "Other than cooking what household activity occurs in this room?", "choices": ["sleeping", "radio listening", "laundry", "watching television"], "correct_choice_idx": 2, "direct_answers": ["eating", "washing", "washing", "eating", "eating", "washing", "eating", "eating", "eating", "laundry"], "difficult_direct_answer": false, "rationales": ["They can also wash clothes in it.", "There is a sink for laundry.", "They seems to be laundry by the show."], "image": "val2014/COCO_val2014_000000415218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165298, "question_id": "Mb4ES4Bzab9iQKdSgFRpmP", "question": "From the moss growing on the tree and pole which cardinal direction is the stop sign facing?", "choices": ["east", "west", "south", "north"], "correct_choice_idx": 3, "direct_answers": ["north", "north", "north", "south", "north", "north", "right", "right", "south", "north"], "difficult_direct_answer": false, "rationales": ["The moss is growing in the direction of the sun which is north.", "Moss grows to the north", "Moss usually grows on the north side."], "image": "val2014/COCO_val2014_000000165298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124178, "question_id": "MbHbxpnugZaABhpjRFLJAe", "question": "What kind of building are they probably hanging around outside of?", "choices": ["cinema", "school", "government", "tourist"], "correct_choice_idx": 1, "direct_answers": ["business", "office", "university classroom", "college", "school", "school", "apartment", "university", "school", "school"], "difficult_direct_answer": false, "rationales": ["Given their ages and styles of clothing, this is the most likely answer.", "The building looks like a business or institution. there are many young adults outside. many students are young adults.", "These are young people who may be students."], "image": "train2014/COCO_train2014_000000124178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557556, "question_id": "MbM2BidPX2pjmZjfCgEi4k", "question": "What act are her hands doing?", "choices": ["stretching", "pointing", "catching", "throwing"], "correct_choice_idx": 2, "direct_answers": ["catching", "catching", "grasping", "catching frisbee", "catching", "catching", "catching", "catching", "catching", "frisbee"], "difficult_direct_answer": false, "rationales": ["The frisbee is in the air directly in front of her. her fingers are outspread.", "Her hands are moving towards her body, capturing the frisbee.", "A woman is grabbing a frisbee from someone that threw it on the other side."], "image": "val2014/COCO_val2014_000000557556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153829, "question_id": "MbRFtgFWCiw5PKSsDHRfFk", "question": "During which season is this child skating on the sidewalk?", "choices": ["summer", "fall", "spring", "winter"], "correct_choice_idx": 3, "direct_answers": ["winter", "fall", "spring", "winter", "fall", "winter", "winter", "fall", "fall", "winter"], "difficult_direct_answer": false, "rationales": ["The season is winter.", "The winter is the best in the area.", "The child is wearing a tuque on his head. the trees do not have leaves."], "image": "train2014/COCO_train2014_000000153829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250380, "question_id": "MbizqyU7UVk28DR84MVVmW", "question": "What is the animal doing?", "choices": ["eating", "being trained", "attacking", "fleeing"], "correct_choice_idx": 1, "direct_answers": ["playing", "playing", "prancing", "being trained", "catching frisbee", "dog", "holding frisbee", "running", "playing frisbee", "playing frisbee"], "difficult_direct_answer": false, "rationales": ["The animal is being trained", "The dog is being trained.", "The animal is learning to fetch."], "image": "train2014/COCO_train2014_000000250380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252054, "question_id": "MbkM6wGM5XCgFrPhb49ZdX", "question": "Where is the girl located at?", "choices": ["home", "donut shop", "school", "library"], "correct_choice_idx": 1, "direct_answers": ["donut shop", "kitchen", "restaurant", "bakery", "table", "table", "restaurant", "restaurant", "restaurant", "bakery"], "difficult_direct_answer": false, "rationales": ["The girl is likely located at a donut shop because of the donut.", "The girl is eating a donut.", "The girl has a donut."], "image": "val2014/COCO_val2014_000000252054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534107, "question_id": "MbsMXCKGbw37Z4PwoYxnkV", "question": "What is the giraffe doing?", "choices": ["eating ants", "building hill", "posing", "resting"], "correct_choice_idx": 0, "direct_answers": ["eating ants", "eating", "eating", "eating", "licking", "standing", "looking", "standing", "eating", "playing"], "difficult_direct_answer": false, "rationales": ["There is a tall ant hill by the giraffe and the giraffe's face is by the top of the ant hill with it's mouth open indicating that the giraffe is eating the ants.", "The giraffe is eating.", "There is a giant ant hill."], "image": "train2014/COCO_train2014_000000534107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72650, "question_id": "McChPJ8XmFAKcqPbM7CYgG", "question": "Foam present in the surf board helps in?", "choices": ["slide", "float", "soak", "swim"], "correct_choice_idx": 1, "direct_answers": ["float", "helps floating", "floatation", "flotation", "bouncy", "float", "keeping buoyant", "floating", "buoyancy", "floating"], "difficult_direct_answer": false, "rationales": ["The surfboard floats on the foam.", "Foam helps keep the surfboard up since it's full of oxygen.", "Foam allows floating."], "image": "train2014/COCO_train2014_000000072650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22236, "question_id": "Mck6gx5JoZYM8oRCaCZQMS", "question": "This bus shares a name with what sandwich?", "choices": ["double check", "open-faced", "reuben", "blt"], "correct_choice_idx": 0, "direct_answers": ["submarine", "don't know", "subway", "sub", "double decker", "no idea", "double check", "subway", "double decker", "subs"], "difficult_direct_answer": false, "rationales": ["A reuben is a corn beef sandwich.", "This is a double decker bus.", "The name comes from both having two layers."], "image": "val2014/COCO_val2014_000000022236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110704, "question_id": "Mco4pwFQYeGRyyAQtGTPmx", "question": "Where is the man laying?", "choices": ["couch", "beach", "floor", "hospital bed"], "correct_choice_idx": 3, "direct_answers": ["hospital bed", "hospital", "hospital bed", "hospital", "hospital bed", "hospital bed", "hospital bed", "hospital bed", "hospital bed", "hospital bed"], "difficult_direct_answer": false, "rationales": ["When we are very sick we normally have to go to a hospital. in most cases we have to lay on a hospital bed which is surrounded by medical equipment and personnel.", "He has equipment and tubes around him", "The man is in the hospital. the nurse is taking care of him."], "image": "train2014/COCO_train2014_000000110704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177090, "question_id": "McsqouPb422CSdqC3eho9X", "question": "How many slices of toast can be cooked at once here?", "choices": ["four", "one", "none", "two"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four slices.", "The toaster has slots for four.", "The toaster can hold 4 slices."], "image": "train2014/COCO_train2014_000000177090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539573, "question_id": "Md7WHg35SKqKFzUqHLTaQe", "question": "What food he is eating?", "choices": ["chocolate", "pancake", "burger", "pizza"], "correct_choice_idx": 1, "direct_answers": ["pancakes", "pancakes", "pancakes", "pancakes", "pancake", "pancakes", "pancakes", "pancakes", "pancakes", "pancakes"], "difficult_direct_answer": false, "rationales": ["This is obvious by the shape and appearance.", "A man is sitting in front of a plate of pancakes with silverware in his hands.", "The man has pancakes on his plate."], "image": "train2014/COCO_train2014_000000539573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99354, "question_id": "Md7sDWSEdEosJQKqBSzQsG", "question": "What ingredient would you find in her drink?", "choices": ["citric acid", "strawberries", "sugar", "milk"], "correct_choice_idx": 0, "direct_answers": ["aspartame", "caffeine", "caffeine", "water", "citric acid", "aspartame", "diet coke", "soda", "coke", "sugar"], "difficult_direct_answer": false, "rationales": ["She is drinking diet coke. it uses artificial sweeteners instead of sugar.", "Citric acid would be found.", "There is no sugar, strawberries, or milk in the diet coke."], "image": "train2014/COCO_train2014_000000099354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187952, "question_id": "MdPmJ7rkZsJSBmXNhq4zqx", "question": "What type of light source is next to the laptop?", "choices": ["chandelier", "lantern", "sunlight", "lamp"], "correct_choice_idx": 1, "direct_answers": ["lantern", "camping lamp", "lamp", "lantern", "oil lamp", "lamp", "lantern", "kerosene lamp", "kerosene lamp", "lantern"], "difficult_direct_answer": false, "rationales": ["This is a lantern that is on the desk.", "There is a glowing lantern next to the laptop.", "The type of shape of lamp with an handle on top of it is usually seen in campsites."], "image": "train2014/COCO_train2014_000000187952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67090, "question_id": "MdXqE8Jjq82JtRQ9BRopBr", "question": "What is on the item in the man's right hand?", "choices": ["nothing", "kite string", "dog bone", "memo"], "correct_choice_idx": 1, "direct_answers": ["kite", "kite", "board", "kite", "string", "spool", "kite", "kite", "kite", "kite string"], "difficult_direct_answer": false, "rationales": ["The kite is being held up by the string.", "He is holding a small kite in his right hand. a child with him appears to be waiting to fly that kite. the kite would need to have this to be flown.", "The man has a kite string."], "image": "train2014/COCO_train2014_000000067090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154597, "question_id": "Mdh9mPGoPBYwjHa5neg6BU", "question": "How many people could fit comfortably in each booth?", "choices": ["four", "fifteen", "eight", "ten"], "correct_choice_idx": 0, "direct_answers": ["four", "six", "four", "four", "four", "three", "six", "three", "three", "four"], "difficult_direct_answer": false, "rationales": ["This would give people room to move their arms as they eat", "Four people can sit by the table.", "The booth could comfortable seat two people on each bench."], "image": "train2014/COCO_train2014_000000154597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266375, "question_id": "MdpdyZ3rUrKyHTFNX7cKLd", "question": "What type meat might creatures eating this grass create?", "choices": ["beef", "horse", "pork", "mutton"], "correct_choice_idx": 3, "direct_answers": ["beef", "lamb", "steak", "lamb", "lamb", "mutton", "lamb", "mutton", "mutton", "mutton"], "difficult_direct_answer": false, "rationales": ["They are sheep and that's the appropriate term.", "Mutton is a name for the meat from these animals", "The creatures shown are sheep and their meat is called mutton."], "image": "train2014/COCO_train2014_000000266375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542933, "question_id": "MdxLWTsLgE9ohEPyMdRESz", "question": "Which team is up now?", "choices": ["none", "pitchers", "both", "batters"], "correct_choice_idx": 3, "direct_answers": ["angels", "away team", "batting team", "red sox", "away", "home team", "white", "away", "dodgers", "batters"], "difficult_direct_answer": true, "rationales": ["The team that's up is batting.", "The man in gray is currently at bat, meaning that the batters are the ones who are \"up\" at the moment.", "The team is the batters."], "image": "val2014/COCO_val2014_000000542933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539430, "question_id": "MeDFGxzRL8AQmzKxNkfLv5", "question": "What is this group ready to do?", "choices": ["descend", "run", "ascend", "duck"], "correct_choice_idx": 0, "direct_answers": ["ski", "ski", "ski", "ski", "ski", "freeze", "freeze", "descend", "learn skiing", "ski"], "difficult_direct_answer": false, "rationales": ["The group is at the bottom of a relatively flat area. they are wearing skis and look prepared to go skiing.", "The group wants to go down.", "The group is getting ready to ski. when you ski you ride down a"], "image": "train2014/COCO_train2014_000000539430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284789, "question_id": "MeSYRcBbYmy7Mr5MUWRCHX", "question": "Where do the persons controlling the sails in the sky stand?", "choices": ["ocean", "shore", "boat", "island"], "correct_choice_idx": 0, "direct_answers": ["outside sea", "skis", "on surfboards", "surfboards", "on surfboard", "water", "water", "in water", "in water", "ocean"], "difficult_direct_answer": false, "rationales": ["There are people riding boards in the sea as they are being pulled by sails in the sky.", "All of the people are paraskiing in the ocean.", "People can go to the ocean."], "image": "val2014/COCO_val2014_000000284789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466913, "question_id": "MeU7iE9oBRWqHHedTsuRVW", "question": "What mythological creature is most similar to the one the man is riding on?", "choices": ["phlegon", "medusa", "cerberus", "airavata"], "correct_choice_idx": 0, "direct_answers": ["unicorn", "phlegon", "seabiscuit", "seabiscuit", "seabiscuit", "unicorn", "unicorn", "unicorn", "unicorn", "unicorn"], "difficult_direct_answer": false, "rationales": ["Phlegon is similar to the horse.", "A phlegon has furs in its neck.", "Although it doesn't have three heads it is rideable."], "image": "train2014/COCO_train2014_000000466913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137931, "question_id": "MeYNQpvWXAJMMgA4Hmswpr", "question": "What type of shoes does the woman have on?", "choices": ["boots", "sneakers", "high heels", "sandals"], "correct_choice_idx": 2, "direct_answers": ["high heels", "heels", "flat", "heels", "pumps", "pumps", "heels", "high heels", "heels", "high heels"], "difficult_direct_answer": false, "rationales": ["The woman has high heels on.", "The shoes are raised because of the heel.", "A woman in an orange dress is rolling a dog in a suitcase and has a raise in back of her shoes."], "image": "train2014/COCO_train2014_000000137931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319991, "question_id": "Meksx2dSGdj9D8Z7miUYtB", "question": "What is the man preparing to do outdoors with the food items in his hands?", "choices": ["barbecue", "eat", "feed dog", "throw out"], "correct_choice_idx": 0, "direct_answers": ["grill", "cook them", "barbecue", "barbeque", "barbeque", "grill", "grill", "grill", "barbeque", "grill"], "difficult_direct_answer": false, "rationales": ["He is going to barbeque some hot dogs that are on the tray.", "The man is preparing to barbecue these hot dogs.", "Hot dogs are commonly cooked outdoors on a grill."], "image": "train2014/COCO_train2014_000000319991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107183, "question_id": "Mf9K4RjtHQzDaXzngzNHa7", "question": "Which mode of transport in use here uses less gasoline?", "choices": ["elephants", "motorcycle", "van", "bus"], "correct_choice_idx": 0, "direct_answers": ["elephants", "elephants", "motorcyle", "motorbike", "elephant", "motorcycle", "elephants", "elephant", "elephant caravan", "elephant"], "difficult_direct_answer": false, "rationales": ["The transportation is the elephants.", "Elephants are economical. they do not run on gasoline.", "Elephants don't use any gas."], "image": "val2014/COCO_val2014_000000107183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97722, "question_id": "MfGKqmJoFaRH4YLsgqNL9u", "question": "What season is this?", "choices": ["fall", "spring", "winter", "summer"], "correct_choice_idx": 2, "direct_answers": ["winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["The people are bundled in warm clothing and there is snow on the ground.", "There is snow on the ground.", "The season is winter."], "image": "train2014/COCO_train2014_000000097722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331642, "question_id": "MfHYnYeDWZPBSs2DYD8CmQ", "question": "What sort of things are sold at the well lighted business shown?", "choices": ["flowers", "cars", "radios", "food"], "correct_choice_idx": 3, "direct_answers": ["unknown", "food stuff", "food", "food", "deli meats", "convenience items", "deli food", "food", "essentials", "food"], "difficult_direct_answer": false, "rationales": ["The business has packaged items and beverages lining its shelves and also displays food imagery on its signage, so the establishment clearly makes its money by selling food.", "It's a restaurant.", "Food is sold at the lighted business."], "image": "val2014/COCO_val2014_000000331642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48548, "question_id": "MfdgRE5PZb8Lxdso4MrbZi", "question": "Why is the squatting man holding his hand out?", "choices": ["to throw", "to hit", "to congratulate", "to catch"], "correct_choice_idx": 3, "direct_answers": ["catch ball", "increase speed", "catcher", "catch ball", "catcher", "catcher", "catching balls", "to catch", "catching pitch", "catch ball"], "difficult_direct_answer": false, "rationales": ["The man that is squatting has his hand out so he can catch the ball if the batter misses.", "He has mitt on his hand to catch any balls that come to him.", "The man is holding a mitt that is positioned behind a batter to catch a missed ball."], "image": "train2014/COCO_train2014_000000048548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455508, "question_id": "Mfv66o2sdRpF5EfAFEvDvi", "question": "What are the large blocks for?", "choices": ["fishing", "shore protection", "decoration", "shore extension"], "correct_choice_idx": 1, "direct_answers": ["docking", "prevent erosion", "support", "shore protection", "separation", "protect shore", "sea levels", "barrier", "wave barrier", "protection"], "difficult_direct_answer": true, "rationales": ["To protect the pier.", "The large blocks visible are manmade and concrete based on their color, shape and size. structures of this kind are placed intentionally by people for answer a.", "The blocks create a barrier."], "image": "train2014/COCO_train2014_000000455508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435682, "question_id": "Mg6kf6SA2uvZtw5EUTGRhC", "question": "What is this person doing?", "choices": ["interview", "lunch break", "inspection", "maintenance"], "correct_choice_idx": 1, "direct_answers": ["eating texting", "eating donut", "eating", "eating", "eating", "eating", "eating", "eating", "eating donut", "lunch break"], "difficult_direct_answer": false, "rationales": ["He's eating a burger", "The person is eating so this may be his midday meal.", "The man is not working. he is eating."], "image": "val2014/COCO_val2014_000000435682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495692, "question_id": "Mg9s94yqGJZECCZaohrzQN", "question": "What electronic device is likely to be in front of the couch?", "choices": ["television", "record player", "telephone", "computer"], "correct_choice_idx": 0, "direct_answers": ["television", "television", "remote", "remote", "television", "television", "remote", "remote", "television", "remote"], "difficult_direct_answer": false, "rationales": ["These are remotes to change the channel", "The collection of remotes visible would be used on a television based on the button designs and layout. television remotes are usually kept near the television they operate and the couch would likely be pointed at the tv in order to watch.", "By the settings and remotes on the table you can tell what is in front of the cats."], "image": "train2014/COCO_train2014_000000495692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225126, "question_id": "MgAASHRdYBi9NFvfCUZPyd", "question": "What propels these people across the water?", "choices": ["boats", "swimming", "wind sails", "ski do"], "correct_choice_idx": 2, "direct_answers": ["kite", "parachute", "kite", "sail", "wind", "kites", "wind", "wind", "wind sail", "wind sails"], "difficult_direct_answer": false, "rationales": ["The guy is waterboarding with a parasail.", "The people on the water are being propelled by holding onto wind sails.", "Wind sails keep these people moving."], "image": "train2014/COCO_train2014_000000225126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175757, "question_id": "MgqUD7Mahh2tyP8cU3Loi2", "question": "What kind of expression does the man have on his face?", "choices": ["gleeful", "jovial", "serious", "terrified"], "correct_choice_idx": 2, "direct_answers": ["annoyance", "serious", "pensive", "serious", "thinking", "cold", "thoughtful", "serious", "annoyance", "serious"], "difficult_direct_answer": false, "rationales": ["The man's face is clearly visible and lacking features characteristics consistent with the other answers.", "The man has a solemn expression.", "He is not smiling so he is not cheerful. plus he doesn't look scared."], "image": "train2014/COCO_train2014_000000175757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478712, "question_id": "Mh3WrSDDxvvJmr5GdBk34b", "question": "Why are the men sitting down?", "choices": ["to eat", "to wait", "to paint", "to work"], "correct_choice_idx": 3, "direct_answers": ["to work", "working", "meeting", "working", "to work", "see computers", "working", "to work", "working", "laptops"], "difficult_direct_answer": false, "rationales": ["The men have their electronic equipment in front of them to work.", "Each of them seem occupied with work using the laptops.", "Often people use computers to work."], "image": "train2014/COCO_train2014_000000478712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72769, "question_id": "Mh7MJVuSWMBhVbr8zv9rof", "question": "Where is this party taking place?", "choices": ["formal restaurant", "home", "kid's restaurant", "club"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "restaurant", "party place", "room", "party place", "playhouse", "cake shop", "school", "resturant", "kid's restaurant"], "difficult_direct_answer": false, "rationales": ["The setting in the background behind the kids is colorful and the cakes look professional which would be consistent with answer a.", "The kids are at a restaurant.", "The colorful carpet and the long table suggests that it is a restaurant which does kid's birthday parties."], "image": "train2014/COCO_train2014_000000072769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538517, "question_id": "MhDSEuMkUHkodUuznR9k3z", "question": "Which person seems most out of place?", "choices": ["security guard", "hat wearer", "jeans wearer", "kimono wearer"], "correct_choice_idx": 3, "direct_answers": ["old woman", "lady", "kimono lady", "kimono wearer", "women", "woman kimono", "front woman", "woman", "wearing geta", "old woman"], "difficult_direct_answer": true, "rationales": ["Everyone else is wearing street clothes.", "The kimono is out of place", "The woman in the kimono is the only one not wearing modern clothing."], "image": "train2014/COCO_train2014_000000538517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461467, "question_id": "MhDuGf8wL8Qus4gTWqKw2N", "question": "What type of meat fruit or vegetable is most popular on pizza?", "choices": ["olives", "pepperoni", "mushrooms", "onions"], "correct_choice_idx": 1, "direct_answers": ["pepperoni", "straws", "pepperoni", "tomato", "pepperoni", "mushrooms", "bacon olives", "pepperoni", "pepperoni", "pepperoni"], "difficult_direct_answer": false, "rationales": ["Pepperoni is a popular pizza topping.", "Most people like to eat meat on their pizza.", "The pizza has a specific kind of sliced sausage that's red with spices."], "image": "val2014/COCO_val2014_000000461467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439481, "question_id": "MhPD76eJR5WtuWhUhJ4KTg", "question": "Which way do these beasts prefer to travel?", "choices": ["swim", "walk/gallop", "fly", "slither"], "correct_choice_idx": 1, "direct_answers": ["walk/gallop", "running", "horses", "right", "right side", "running", "running north", "forward", "galloping", "walking"], "difficult_direct_answer": true, "rationales": ["These beasts are horses, not fish, birds, or snakes.", "Their legs bent at certain angles show them moving swiftly. the dust flying behind them shows speed.", "The horses run on the field."], "image": "val2014/COCO_val2014_000000439481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502300, "question_id": "MhXr2ExGcDkJk726fCrS4p", "question": "What are the two rectangular objects on each side of the monitor used for?", "choices": ["exercising", "sorting", "stapling", "sound"], "correct_choice_idx": 3, "direct_answers": ["displaying information", "speakers", "speakers", "sound", "playing sound", "sound", "display information", "volume", "sound", "sound"], "difficult_direct_answer": false, "rationales": ["The objects are speakers.", "The speakers provide sound.", "They are used to hear things."], "image": "train2014/COCO_train2014_000000502300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284464, "question_id": "MhZtDD2vHMMasvFBeNzryq", "question": "What is the cat doing?", "choices": ["resting", "hunting", "eating", "drinking"], "correct_choice_idx": 0, "direct_answers": ["resting", "resting", "resting", "sitting down", "resting", "sitting", "resting", "sitting", "sitting", "sitting"], "difficult_direct_answer": false, "rationales": ["The cat is lying down on the chair in the sun.", "The cat is resting.", "The cat is just hanging out laying there on the bench."], "image": "train2014/COCO_train2014_000000284464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493187, "question_id": "MhbSvi7nZoWTMuG3pXqqes", "question": "To which State does 80 and 189 lead to?", "choices": ["florida", "new york", "utah", "arkansas"], "correct_choice_idx": 2, "direct_answers": ["salt lake", "wyoming", "utah", "salt lake", "wyoming", "utah", "utah", "utah", "rock springs", "florida"], "difficult_direct_answer": false, "rationales": ["Salt lake and rock springs are cities in the 'industry' state.", "Those highways will end up converging in utah since salt lake is there.", "The state is utah."], "image": "train2014/COCO_train2014_000000493187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122549, "question_id": "MhercytRFckpzdgGPZHBKb", "question": "What extra item is visible here?", "choices": ["shirt", "beer", "paper towels", "necktie"], "correct_choice_idx": 3, "direct_answers": ["paper towel", "tie", "tie", "papertowels", "paper towel", "paper towel", "necktie", "tie", "neckties", "tie"], "difficult_direct_answer": false, "rationales": ["The man has one tie already on and tied around his neck. he doesn't need another one.", "The man has multiple accessories around his neck.", "The man is wearing a necktie and has another one draped over his shoulders."], "image": "val2014/COCO_val2014_000000122549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220215, "question_id": "MhgsbwfqTnmkXuJi8muyTL", "question": "What do the people in different boats do?", "choices": ["race", "fish", "nap", "sail"], "correct_choice_idx": 0, "direct_answers": ["row", "canoeing", "row", "row", "paddle", "race", "row", "row", "row", "row"], "difficult_direct_answer": false, "rationales": ["There appears to be lane markers which would be used for racing and the boats appear to have the same number of people all trying to move rapidly as if racing.", "Traditionally rowing requires other boats to compete in races.", "The people are racing."], "image": "val2014/COCO_val2014_000000220215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474319, "question_id": "Mi7iHtXTDrdgGuG6CDdcDC", "question": "What type of outfits are the two girls wearing?", "choices": ["sweatsuits", "wetsuits", "beachsuits", "boardsuits"], "correct_choice_idx": 1, "direct_answers": ["wetsuits", "wetsuits", "wet suits", "wetsuits", "wetsuit", "wetsuits", "swimsuit", "wet suits", "wetsuit", "wetsuits"], "difficult_direct_answer": false, "rationales": ["The outfits are wetsuits.", "Wetsuits are for surfing.", "They are holding surfboards and the suits are tight."], "image": "train2014/COCO_train2014_000000474319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39009, "question_id": "MiBtrSkLLcpTDJyxZ6qVSM", "question": "What's the name of the food the people are holding?", "choices": ["meat popsicle", "corndog", "franks", "dog pop"], "correct_choice_idx": 1, "direct_answers": ["corn dog", "corn dog", "corndog", "corndog", "corndog", "corndogs", "corndogs", "corndog", "corndog", "corn dog"], "difficult_direct_answer": false, "rationales": ["The name is a corn dog.", "The objects are clearly visible and appear to be coated hot dogs with a stick out of them. these features are consistent with answer a.", "It's a cornmeal covered hot dog on a stick."], "image": "val2014/COCO_val2014_000000039009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274022, "question_id": "MiPYWB43VHjHXowS5jPdr2", "question": "What brand coffee is most readily available here?", "choices": ["starbucks", "burger king", "mcdonald's", "peets"], "correct_choice_idx": 0, "direct_answers": ["starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks"], "difficult_direct_answer": false, "rationales": ["Starbucks is available.", "The brand of coffee most readily available is the starbucks on the street intersection.", "As indicated by the logo and sign in the center right of the image in the background."], "image": "train2014/COCO_train2014_000000274022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121046, "question_id": "MitjvkJCWgJ5oo5FsUmzC2", "question": "Who probably drew the picture on the fridge?", "choices": ["lady", "visitor", "turkey", "child"], "correct_choice_idx": 3, "direct_answers": ["kid", "child", "her child", "child", "child", "child", "kid", "child", "child", "kid"], "difficult_direct_answer": false, "rationales": ["A child likely drew the picture based on the queen quality of the artwork.", "One of the kids that live in the house did.", "Many parents will showcase their child's artwork on the fridge."], "image": "train2014/COCO_train2014_000000121046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409192, "question_id": "Mj8U7D6MVbCpoHy7aSxSmV", "question": "What kind of luggage is this?", "choices": ["antique", "new", "fresh", "gucci"], "correct_choice_idx": 0, "direct_answers": ["vintage", "clothes", "trunk", "trunk luggage", "trunks", "suitcases", "trunk", "antique", "trunk", "antique"], "difficult_direct_answer": false, "rationales": ["The stack of luggage consists of several antique trunks that look old.", "Newer luggage is usually made of more modern materials such as cloth or plastic.", "The luggage is really old."], "image": "train2014/COCO_train2014_000000409192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206268, "question_id": "MjXESscszVHiF6PKCg447L", "question": "What holiday is the person gaming here celebrating?", "choices": ["valentines day", "halloween", "st. patricks", "mother's day"], "correct_choice_idx": 2, "direct_answers": ["st patricks", "st patrick's", "st patricks", "sta patricks", "independence", "st. patricks", "sta patrick's", "st patricks", "sta patrick", "st patricks"], "difficult_direct_answer": false, "rationales": ["The hat is green and has shamrocks on it which is typical of st. patrick's day.", "The man is wearing a green hat with shamrocks.", "The hat has green clovers on it"], "image": "val2014/COCO_val2014_000000206268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543525, "question_id": "MjjQpbK6xNrv6cFiCc3yQz", "question": "What pattern is on the floor?", "choices": ["tiled pattern", "zigzag pattern", "squiggle pattern", "curvy pattern"], "correct_choice_idx": 0, "direct_answers": ["checkered", "clean", "squares", "checkerboard", "squares", "square", "squares", "squared", "tiled", "tiled pattern"], "difficult_direct_answer": false, "rationales": ["This is tile pieces on the floor", "There are tiles on the floor.", "The tile features squares next to each other."], "image": "val2014/COCO_val2014_000000543525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324845, "question_id": "MjsqsDkazrMYYv9MnFUuxB", "question": "Which food on the sandwich is highest in protein?", "choices": ["tomato", "spinach", "chicken", "cucumber"], "correct_choice_idx": 2, "direct_answers": ["chicken", "chicken", "chicken", "meat", "meat", "chicken", "chicken", "meat", "meat", "chicken"], "difficult_direct_answer": false, "rationales": ["Chicken has more protein than veggies.", "The food is chicken.", "Traditionally meat products are packed with the most protein."], "image": "val2014/COCO_val2014_000000324845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60387, "question_id": "MjxEA6MfEqjFHT62mFfGPG", "question": "Why is the sidewalk ahead yellow?", "choices": ["marketing scheme", "dog curb", "bus route", "elevation change"], "correct_choice_idx": 3, "direct_answers": ["curb cut", "elevation change", "caution", "caution", "caution", "blind person", "crossing light", "wheelchair ramp", "crossing stub", "indicator"], "difficult_direct_answer": false, "rationales": ["It is a visual marker to warn people so that they do not trip.", "The yellow marks are used for traffic.", "This is to warn blind people of an elevation change in the sidewalk."], "image": "train2014/COCO_train2014_000000060387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152668, "question_id": "MkKzkdeGwi6PGKnN8yjGDG", "question": "What is a plant that is commonly used in hedges?", "choices": ["yew", "roses", "box", "oak"], "correct_choice_idx": 2, "direct_answers": ["boxwood", "boxglove", "buxus", "box", "bushes", "kniaaple", "bushes", "box brush", "shrub", "tree"], "difficult_direct_answer": true, "rationales": ["The plant is a box.", "The box is mostly known to be used for hedges.", "The boxwood plant is commonly used in bushes."], "image": "train2014/COCO_train2014_000000152668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147442, "question_id": "Mkkm6CwtJUYbCGXm5Ncj9D", "question": "The tail has what bright color?", "choices": ["blue", "green", "red", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "white", "yellow", "yellow", "yellow", "yellow", "yellow", "blue", "yellow", "blue"], "difficult_direct_answer": false, "rationales": ["The tail is yellow.", "The tail has a bright yellow color.", "A large jumbo jet has a blue and bright color like the sky."], "image": "train2014/COCO_train2014_000000147442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11742, "question_id": "Mkm2WdXKYYvg5Vrifv6xAC", "question": "What year was this wine bottled?", "choices": ["2020", "2018", "2017", "2019"], "correct_choice_idx": 3, "direct_answers": ["2019", "2010", "shiraz", "2010", "2010", "in 2010", "2010", "2019", "2010", "2010"], "difficult_direct_answer": false, "rationales": ["The bottle says 2019.", "The year was 2019.", "The year on the bottle is 2019."], "image": "val2014/COCO_val2014_000000011742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509419, "question_id": "MkoL7DNmpwsW3nemJUxYzw", "question": "What is the object being looked at?", "choices": ["monitor", "aquarium", "tv", "stereo"], "correct_choice_idx": 1, "direct_answers": ["aquarium", "aquarium", "fish", "fish tank", "aquarium", "fishtank", "desk", "wall art", "poster", "fish tank"], "difficult_direct_answer": false, "rationales": ["The object looked at is an aquarium.", "This person is looking at the fish tank.", "The object is a clear and rectangular. it contains fish and water."], "image": "train2014/COCO_train2014_000000509419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124694, "question_id": "MkpFUzTrKJoRhfgJSRahBV", "question": "What kind of edible meat can be produced from the smaller mammals in this photo?", "choices": ["poultry", "mutton", "pork", "beef"], "correct_choice_idx": 1, "direct_answers": ["lamb", "mutton", "mutton", "mutton", "sheep", "lamb", "veal", "lamb", "mutton", "lamb"], "difficult_direct_answer": false, "rationales": ["Mutton comes from sheep.", "Such meat comes from lambs and rams, of which one is pictured.", "These are sheep"], "image": "train2014/COCO_train2014_000000124694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447379, "question_id": "MkqGg3LHcoCdyW3enM42U4", "question": "What are people along the wall watching?", "choices": ["parade", "motorcycle race", "fair", "bicycle racing"], "correct_choice_idx": 1, "direct_answers": ["motocross", "race", "race", "motorcycle race", "racing", "race", "race", "race", "motocross", "race"], "difficult_direct_answer": false, "rationales": ["Motorcyclists are competing in a race.", "The bike has a competition number on it", "The man is on a motorcycle."], "image": "train2014/COCO_train2014_000000447379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566612, "question_id": "Mm5urqACdm9Nq34GZ2gaAV", "question": "What is the most usual way to ignite the thing the man is holding?", "choices": ["grill lighter", "normal lighter", "camp fire", "torch"], "correct_choice_idx": 1, "direct_answers": ["matches", "stove", "fire", "match", "normal lighter", "blow torch", "match", "lighter", "lighter", "ignite it"], "difficult_direct_answer": false, "rationales": ["The man should use a lighter.", "It is a cigarette so it needs something small to light", "A normal lighter would be used to light a cigarette like this."], "image": "train2014/COCO_train2014_000000566612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474378, "question_id": "MmeGwLT9dAEML7FT5YpZB5", "question": "What type of parking is available?", "choices": ["parallel", "diagonal", "valet", "lot"], "correct_choice_idx": 0, "direct_answers": ["parallel", "metered parking", "metered parking", "meter", "street side", "meter", "street", "street parking", "parallel", "paid parking"], "difficult_direct_answer": false, "rationales": ["Everyone has to park the same way as the sidewalk faces", "Cars are parked along the sidewalk, also known as parallel parking.", "The parked cars are facing the same direction as the traffic that passes by them."], "image": "train2014/COCO_train2014_000000474378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34657, "question_id": "MnChYs7CWwrtWiHVCF5xqq", "question": "What are these people engaging in?", "choices": ["singing karaoke", "watching movie", "reading books", "video game"], "correct_choice_idx": 3, "direct_answers": ["tv", "playing wii", "watching tv", "television", "video game", "playing game", "laughing", "video games", "watching tv", "wii gaming"], "difficult_direct_answer": true, "rationales": ["The man on the right is holding a controller that is used for playing nintendo wii, a popular video game console.", "They are holding nintendo wii remotes.", "The people watch a game."], "image": "val2014/COCO_val2014_000000034657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145606, "question_id": "MnV35Tz2PkStwK8Lb3fvyD", "question": "What item might the shopper purchase here?", "choices": ["napkin", "slacks", "tie", "dress"], "correct_choice_idx": 2, "direct_answers": ["necktie", "clothes", "ties", "ties", "tie", "tie", "ties", "ties", "ties", "ties"], "difficult_direct_answer": false, "rationales": ["The man seems to be looking at different styles of ties here.", "The man is holding ties.", "This store sells ties"], "image": "train2014/COCO_train2014_000000145606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246280, "question_id": "MnbRQRZKKSsxrDfiZUrasM", "question": "Why is he grabbing his foot?", "choices": ["wants fall", "is stretching", "showing off", "performing trick"], "correct_choice_idx": 1, "direct_answers": ["he's not", "he isn't", "just served", "it hurts", "balance", "is stretching", "for balance", "follow through", "stretching", "pain"], "difficult_direct_answer": true, "rationales": ["Athletes often stretch before games.", "He is stretching his muscles before he plays tennis.", "He's keeping his balance after hitting the ball"], "image": "train2014/COCO_train2014_000000246280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44782, "question_id": "MnfUV5YT3X9sBCMyhUvyvg", "question": "What animal is mentioned on one of the signs?", "choices": ["cat", "eagle", "dog", "bunny"], "correct_choice_idx": 3, "direct_answers": ["bunny", "rabbit", "bunny", "bunny", "fish", "bunny", "bunny", "bunny", "rabbit", "bunny"], "difficult_direct_answer": false, "rationales": ["The word \"bunny\" appears in red and white.", "On the right side of the image along the side of the road the word 'bunny' is seen.", "Bunnies are shown on the sign."], "image": "train2014/COCO_train2014_000000044782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441470, "question_id": "Mni8MUrFZp9SwjhzCojEe6", "question": "What type of dog is this?", "choices": ["golden retriever", "pincher", "cocker spaniel", "bull dog"], "correct_choice_idx": 3, "direct_answers": ["bull dog", "pug dog", "bull dog", "pug dog", "bulldog", "bulldog", "bulldog", "bulldog", "bull dog", "bulldog"], "difficult_direct_answer": false, "rationales": ["The dog is the size, shape and has the unique features consistent with answer a.", "The dog is of the right size and style to be of the answer a breed.", "Traditionally this type of dog is recognized by it stocky frame and pushed in face."], "image": "train2014/COCO_train2014_000000441470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542442, "question_id": "Mnq3hjiwxv33yge64hwzDo", "question": "What are the men engaging in?", "choices": ["attending concert", "working", "attending college", "sightseeing"], "correct_choice_idx": 1, "direct_answers": ["walking", "working", "business", "business transaction", "business", "walking", "walking", "meetings", "meeting", "business"], "difficult_direct_answer": false, "rationales": ["The men are in business suits.", "The men are working.", "Since the men are dressed in suits and ties it is reasonable to assume they are on the job here, as opposed to sightseeing or just socializing."], "image": "train2014/COCO_train2014_000000542442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90914, "question_id": "Mnq6yjArjrN2TJbo4STfJE", "question": "What is the woman ready to do?", "choices": ["roll", "bat", "juggle", "catch"], "correct_choice_idx": 3, "direct_answers": ["catch frisbee", "catch frisbee", "catch frisbee", "catch frisbee", "catch frisbee", "catch frisbee", "catch", "catch frisbee", "catch frisbee", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["The woman is playing ultimate frisbee. the frisbee is moving towards her.", "She is ready to grab the frisbee as it comes down out of the air.", "She is looking up to catch the frisbee."], "image": "train2014/COCO_train2014_000000090914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51898, "question_id": "MoDnDVXDNEZBVyNRG3GSrA", "question": "What is the white circular disc on the tower used for?", "choices": ["window", "spotting ships", "jailing prisoners", "telling time"], "correct_choice_idx": 3, "direct_answers": ["tell time", "tower clock", "telling time", "telling time", "tell time", "telling time", "telling time", "telling time", "time", "time"], "difficult_direct_answer": false, "rationales": ["The disc is for time.", "Traditionally clock faces are circular.", "The white circular disk on the tower is a clock."], "image": "train2014/COCO_train2014_000000051898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340393, "question_id": "Moj66Kwyde3wN3Vkui4X9w", "question": "In which continent is this place found?", "choices": ["north america", "africa", "europe", "asia"], "correct_choice_idx": 3, "direct_answers": ["asia", "sink", "america", "france", "north america", "asia", "asia", "china", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["Most likely asia, due to the odd toilet", "The toilet is found on the floor.", "This could be on any continent but it is most likely found in an asian country."], "image": "train2014/COCO_train2014_000000340393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363827, "question_id": "MpDCTNHnM4HxttSuJh9Tkt", "question": "What kind of tower is this?", "choices": ["lattice", "water", "cell", "clock"], "correct_choice_idx": 3, "direct_answers": ["clock", "clock tower", "clock", "clock", "clock", "clock tower", "clock tower", "clock", "clock", "clock tower"], "difficult_direct_answer": false, "rationales": ["The tower has a clock.", "The tower has a timepiece on it.", "It's possible to tell time by looking at it,"], "image": "train2014/COCO_train2014_000000363827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235481, "question_id": "MpJJ7ERaQ9PLrjJc3MMGyG", "question": "What word is written before service?", "choices": ["out of", "special", "secret", "in"], "correct_choice_idx": 1, "direct_answers": ["special", "special", "special", "special", "special", "special", "special", "special", "special", "special"], "difficult_direct_answer": false, "rationales": ["The sign on the front of the bus says \"special service\" in yellow.", "The word \"special\" appears.", "The word special is written above service."], "image": "train2014/COCO_train2014_000000235481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20794, "question_id": "MpLjDDcK6soHHLBCN2faSZ", "question": "What activity are the men involved in?", "choices": ["gaming", "gambling", "sports", "writing"], "correct_choice_idx": 0, "direct_answers": ["video gaming", "wii", "video games", "wii game", "wii", "playing wii", "wii", "games", "gaming", "walking"], "difficult_direct_answer": false, "rationales": ["The men are holding video game controllers based on the size and design of the objects. people holding video game remotes and regarding something together are likely doing answer a.", "The men are holding wii-motes. wii is a game console.", "The activity is a game."], "image": "train2014/COCO_train2014_000000020794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184099, "question_id": "MpgUwpsGvGj39cvUc2M9t8", "question": "What is the chair next to?", "choices": ["dining table", "bed", "statue", "laptop"], "correct_choice_idx": 3, "direct_answers": ["laptop", "desk", "seeing", "desk", "dustbin", "monitor", "computer desk", "desk", "computer", "desk"], "difficult_direct_answer": false, "rationales": ["A laptop is on a desk beside a desktop computer and a chair is closer to the laptop.", "The chair is near a laptop.", "There is a folding computer on a stand"], "image": "train2014/COCO_train2014_000000184099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404243, "question_id": "MpxpeCEYVpPqVCKY956f6N", "question": "How much is a Croissant brie?", "choices": ["3.33", "3.00", "4.44", "4.00"], "correct_choice_idx": 0, "direct_answers": ["couple euros", "three thirty-five", "3.35", "3.33 lira", "3.35", "four euros", "3.35 euros", "3.95 euro", "three thirtythree", "3.33"], "difficult_direct_answer": true, "rationales": ["The price is written underneath the croissant brie on a yellow card. the price tells one how much it is.", "The answer on the tag is 3.33", "The brie is 3.33."], "image": "val2014/COCO_val2014_000000404243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31446, "question_id": "Mq5RzkxnRJxUTGroLwR9pE", "question": "The number of items visible in the sky cane be referred to as what?", "choices": ["quartet", "legion", "platoon", "brigade"], "correct_choice_idx": 0, "direct_answers": ["jet planes", "formation", "quartet", "quad", "quad", "four", "quartet", "jets", "quad", "four"], "difficult_direct_answer": false, "rationales": ["Four planes make up a quartet.", "There are four aircrafts visible in the photo.", "There are planes in this formation."], "image": "val2014/COCO_val2014_000000031446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381192, "question_id": "MqRmVcrv2fdHV3HjeiXYfW", "question": "How many of the computers run on the desk run on battery?", "choices": ["two", "three", "none", "one"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two of the computers on the desk are laptops, and one is a desktop model. laptops typically have an internal batter, so they can be used without being at a desk.", "There are three computers on the desk. one is a desktop that cannot be run on battery.", "The laptops run on batteries."], "image": "train2014/COCO_train2014_000000381192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365340, "question_id": "MqTmXwGB6udbWfAVn5ZtXH", "question": "This skate is wearing safety gear on what part of his body?", "choices": ["knees", "all correct", "elbows", "head"], "correct_choice_idx": 1, "direct_answers": ["head", "all correct", "knees", "head", "head knees", "head", "knees", "head", "head", "arms"], "difficult_direct_answer": false, "rationales": ["The skater is wearing a helmet, knee pads, and elbow pads for protection.", "The person is wearing protective gear on knees, elbows and head.", "The skater is wearing a helmet, elbow pads, and knee pads. these items protect the different parts of his body they cover."], "image": "train2014/COCO_train2014_000000365340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127535, "question_id": "MqUoRXdmkJ5eYnwy4CFLnA", "question": "What pome fruits are shown here?", "choices": ["oranges", "cherries", "bananas", "apples"], "correct_choice_idx": 3, "direct_answers": ["banana", "banana", "apples", "bananas", "banana", "banana", "apple", "apple", "banana", "apples"], "difficult_direct_answer": false, "rationales": ["A basket if filled with bunches of long yellow fruits.", "These are bananas.", "Apples are in the background."], "image": "val2014/COCO_val2014_000000127535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279895, "question_id": "Mr8JJz9T8kL6MrDoUNjBHt", "question": "What role are persons out of uniform here in?", "choices": ["captains", "life guards", "recruits", "prisoners"], "correct_choice_idx": 2, "direct_answers": ["recruit", "police", "training", "navy", "recruits", "enlisted", "recruit", "recruits", "guard", "students"], "difficult_direct_answer": false, "rationales": ["The role is a recruit.", "You can tell by the luggage and the gentleman to the left as to who the guys in the right are.", "The man instructing is in a military uniform. the young men are standing at attention receiving their instructions."], "image": "val2014/COCO_val2014_000000279895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237254, "question_id": "MrKzWKVri8dJvTmnzKJuk3", "question": "Where is the bus's company located?", "choices": ["britain", "israel", "america", "canada"], "correct_choice_idx": 3, "direct_answers": ["london", "fairbanks", "usa", "great britain", "canada", "city", "england", "pennsylvania", "orlando", "london"], "difficult_direct_answer": true, "rationales": ["The bus is driving on the left side of the road. the sign on the back of the bus refers to manchester.", "The magic bus is located in england.", "The company is in canada."], "image": "val2014/COCO_val2014_000000237254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13455, "question_id": "MrgAUgcw8HeRdQWCQPve9S", "question": "What are they doing on the field?", "choices": ["sheep racing", "milking", "shaving", "eating"], "correct_choice_idx": 0, "direct_answers": ["rodeo", "molesting", "rodeo sheep", "cowboy", "wrangling sheep", "sheep racing", "riding sheep", "chasing sheep", "rodeo", "lamb riding"], "difficult_direct_answer": true, "rationales": ["These farmers are trying to wrangle their sheep to race.", "The kid is riding a sheep and appears to be racing with it.", "The person is straddled on top of the sheep that is in motion. this positioning is consistent with answer a."], "image": "train2014/COCO_train2014_000000013455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187611, "question_id": "MrmLo8YPnkgBQHzzaXCczb", "question": "What event is about to begin?", "choices": ["insurrection", "riot", "protest", "marathon"], "correct_choice_idx": 3, "direct_answers": ["protest", "parade", "marathon", "parade", "parade", "protest", "parade", "parade", "march", "marathon"], "difficult_direct_answer": false, "rationales": ["People are seen aligning themselves for the marathon.", "People are holding up a lot of signs", "Racers are lining up."], "image": "train2014/COCO_train2014_000000187611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481354, "question_id": "MroTE7JxDo8Dyn3JpnmkSM", "question": "Why are the plants lined up like that?", "choices": ["just decoration", "for privacy", "for farming", "for hiding"], "correct_choice_idx": 2, "direct_answers": ["planting", "easier harvesting", "growing", "crops", "crops", "farm", "for farming", "crops", "row farming", "rice terrace"], "difficult_direct_answer": false, "rationales": ["The plants appear in straight rows which is consistent with farming practices. there is also a person in the foreground using farming tools so they are likely a farmer and are utilizing this land.", "The farmers line them up like that.", "These people are farming and the plants are in rows to give them room to grow."], "image": "train2014/COCO_train2014_000000481354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57018, "question_id": "MryWDb9dbbXsT9QFLmvtpt", "question": "What type of vehicle is this?", "choices": ["passenger", "commercial", "rental", "emergency"], "correct_choice_idx": 3, "direct_answers": ["fire truck", "fire truck", "truck", "bus", "emergency", "firetruck", "firetruck", "fire truck", "bus", "firetruck"], "difficult_direct_answer": false, "rationales": ["Firetrucks are for emergencies.", "The fire fighter truck is used during emergencies such as fires.", "A large truck is red and white and has fireman in it."], "image": "train2014/COCO_train2014_000000057018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158786, "question_id": "Ms8yiBGPMcMjhKdc98Vr8N", "question": "Who is currently skateboarding?", "choices": ["nobody", "woman", "left man", "middle man"], "correct_choice_idx": 0, "direct_answers": ["nobody", "middle boy", "no one", "no one", "no one", "boy", "boy", "boy", "adolescent", "no one"], "difficult_direct_answer": false, "rationales": ["The only person with a skateboard is holding it up.", "No one is skateboarding.", "The people are standing around and talking and no one is currently skateboarding."], "image": "train2014/COCO_train2014_000000158786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200777, "question_id": "Ms9wnHoHZLiEA5myUrEsKa", "question": "What general category does this type of aircraft belong to?", "choices": ["helicopter", "jet", "airship", "propeller"], "correct_choice_idx": 3, "direct_answers": ["airplane", "plane", "prop plane", "plane", "small aircraft", "two-seater", "propeller", "plane", "airplane", "recreational"], "difficult_direct_answer": false, "rationales": ["There is a spinning gear on the front of the airplane.", "Due to the way the plane is designed and the means for flight you can ascertain the type it is.", "This is a small propeller plane."], "image": "train2014/COCO_train2014_000000200777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89668, "question_id": "MsW2zDK8dnpYP88cgLyJQx", "question": "What fuels this type of animal?", "choices": ["rocks", "plants", "dirt", "meat"], "correct_choice_idx": 1, "direct_answers": ["grass", "plants", "grass", "grass", "water", "cows", "grass", "buffalo", "grass", "plants"], "difficult_direct_answer": false, "rationales": ["These animals are cows that graze on the grass in the pasture or field.", "The fuel is plants.", "Cows eat grass."], "image": "val2014/COCO_val2014_000000089668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568972, "question_id": "Mt9iVFmmL2KLRPv5vbiomT", "question": "What is the child doing on the bike?", "choices": ["stealing it", "riding", "waiting", "holding on"], "correct_choice_idx": 2, "direct_answers": ["riding", "riding it", "waiting", "riding", "steering", "riding", "sitting down", "watching lights", "waiting", "passenger"], "difficult_direct_answer": false, "rationales": ["The people on the bike are not currently riding as determined by the foot on the ground. they are facing a red light which instructs one to wait.", "The child is behind a person who is standing still, and the traffic light is red. it is not permitted to advance through a red light, one must wait for it to change.", "The people in the bike appear to not be in motion while sitting in an intersection facing a red light. all these aspects are consistent with answer a."], "image": "val2014/COCO_val2014_000000568972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280990, "question_id": "MtAxUw6qDL2anp7pKCBDuz", "question": "What emotion does the man seem to be feeling?", "choices": ["sadness", "anger", "happiness", "joy"], "correct_choice_idx": 1, "direct_answers": ["anger", "anger", "anger", "anger", "anger", "angry", "anger", "anger", "anger", "angry"], "difficult_direct_answer": false, "rationales": ["The emotion is anger.", "The man has a tense face and an open mouth like he is yelling.", "His posture and face express his rage"], "image": "train2014/COCO_train2014_000000280990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296766, "question_id": "MtgUioFMZf24jL4btuLDMy", "question": "Where have these people gathered?", "choices": ["library", "home", "reception hall", "church"], "correct_choice_idx": 1, "direct_answers": ["living room", "living room", "home", "living room", "living room", "living room", "apartment", "house", "house", "playing wii"], "difficult_direct_answer": false, "rationales": ["There is a visible sitting area with couches that are normally found in the house.", "This appears to be a living room in a home.", "They are in someones living room playing the wii."], "image": "train2014/COCO_train2014_000000296766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496206, "question_id": "MthVLpuy6NGj4nxFQtUgD2", "question": "What comes through the pipe held here?", "choices": ["smoke", "cookies", "milk", "oil"], "correct_choice_idx": 0, "direct_answers": ["smoke", "smoke", "smoke", "smoke", "smoke", "smoke", "smoke", "smoke", "smoke", "smoke"], "difficult_direct_answer": false, "rationales": ["The pipe is attached to a hookah. it does not emit oil, cookies, or milk.", "The pipe has smoke.", "A man is sitting next to a tall hooka with a tube coming from it. hookas are used to smoke."], "image": "train2014/COCO_train2014_000000496206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274876, "question_id": "MtyLP4yhskMfu8FQEXoNCC", "question": "What is the man's profession?", "choices": ["teacher", "doctor", "athlete", "dentist"], "correct_choice_idx": 2, "direct_answers": ["tennis player", "tennis player", "athlete", "athlete", "tennis", "tennis player", "tennis", "tennis player", "tennis player", "tennis"], "difficult_direct_answer": false, "rationales": ["The man is an advanced tennis player and likely plays at a professional level.", "He is a tennis player.", "He is competing in a tennis match, so he must be a sportsman."], "image": "train2014/COCO_train2014_000000274876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334580, "question_id": "Mu4izXqCcXx5zcRetuufLT", "question": "Where is the kite in relation to the boy?", "choices": ["under", "in front", "behind", "invisible"], "correct_choice_idx": 2, "direct_answers": ["behind", "behind", "behind", "behind", "sky", "above", "above", "behind", "in air", "behind"], "difficult_direct_answer": false, "rationales": ["The kite is flying from behind.", "The boy is running with the kite. he is in front of the kite.", "The kite is flying behind the kid as it faces the kid's back."], "image": "train2014/COCO_train2014_000000334580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258555, "question_id": "MuCiLEuqJGwQufF9SZtwyv", "question": "What is the cat doing?", "choices": ["sleeping", "jumping", "hunting", "eating"], "correct_choice_idx": 0, "direct_answers": ["sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping", "sleeping"], "difficult_direct_answer": false, "rationales": ["The cat is snoozing.", "The cat is napping.", "This cat is curled up with it's eyes closed and likely resting."], "image": "train2014/COCO_train2014_000000258555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20784, "question_id": "Mubpk3cWRx6jz2wky7CyKm", "question": "Which country borders the country that this airplane is from?", "choices": ["siberia", "spain", "kazakhstan", "egypt"], "correct_choice_idx": 1, "direct_answers": ["spain", "mexico", "spain", "spain", "spain", "spain", "mexico", "spain", "spain", "mexico"], "difficult_direct_answer": false, "rationales": ["Portugal borders spain.", "The plane is from portugal, which borders spain.", "Spain in the country that borders most of portugal."], "image": "val2014/COCO_val2014_000000020784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223094, "question_id": "MunK5fmQDrf4g4eN9zB3Hc", "question": "What is under the colorful glasses?", "choices": ["umbrella", "refrigerator", "car", "bed"], "correct_choice_idx": 1, "direct_answers": ["kitchen set", "refrigerator", "shelf", "carrying tray", "fridge", "fridge", "refrigerator", "fridge", "fridge", "fridge"], "difficult_direct_answer": false, "rationales": ["The glasses are on top of a refrigerator.", "This item has two doors for a fridge and a freezer and is the common shape and design of a refrigerator.", "It is taller than the other appliances and has two doors"], "image": "val2014/COCO_val2014_000000223094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439373, "question_id": "MuoD2HMAfgtcV5uMGN3KjF", "question": "What sort of sex is everyone here thinking about?", "choices": ["straight", "gay", "bondage only", "none"], "correct_choice_idx": 1, "direct_answers": ["good sex", "homosexual", "gay", "male", "homosexual", "same sex", "same sex", "samesex", "same", "homosexual"], "difficult_direct_answer": false, "rationales": ["The people have signs about same sex marriage.", "People are carrying signs referring to same sex marriage and being derogatory towards homosexuality.", "The signs the people are holding contain references to this kind of sex."], "image": "train2014/COCO_train2014_000000439373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287235, "question_id": "MurbWtSrKZiS8SwEV3WVKH", "question": "The name of what nation's capital is listed on a sign?", "choices": ["jamaica", "morocco", "mexico", "united states"], "correct_choice_idx": 0, "direct_answers": ["jamaica", "jamaica", "colon panama", "jamaica", "kingston", "panama", "colon", "kingston", "jamaica", "usa"], "difficult_direct_answer": false, "rationales": ["Jamaica's capital is identified.", "Jamaica is listed.", "The name is jamaica."], "image": "val2014/COCO_val2014_000000287235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387480, "question_id": "MvScEVdAu2QcasqaArT6T3", "question": "Which one of the companies listed sells mattresses?", "choices": ["white sign", "green sign", "red sign", "yellow sign"], "correct_choice_idx": 3, "direct_answers": ["sleep's", "elbut", "sleep's", "sleep's", "sleep's", "sleep's", "sleeps", "sleepy s", "sleeps", "yellow sign"], "difficult_direct_answer": false, "rationales": ["There is a yellow sign on the baseball field for sleeps which is a store that sells mattresses.", "Sleepy's sells beds", "Sleepy's sells mattresses."], "image": "train2014/COCO_train2014_000000387480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331680, "question_id": "MvbyEJaW8vq3wiqtwnQmcV", "question": "What are the animals here being used as?", "choices": ["police", "guides", "pets", "pack animals"], "correct_choice_idx": 3, "direct_answers": ["load", "pack horses", "enviremental", "pack animals", "pack animals", "water", "haulers", "transport", "transporters", "hauling"], "difficult_direct_answer": true, "rationales": ["The animals are being used to transport cargo.", "The horses are carrying objects on their backs.", "The animals in the river are horses that are used as pack animals to carry belongings."], "image": "train2014/COCO_train2014_000000331680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481885, "question_id": "MvdMUpZuV8eR6UUkMoYGcv", "question": "To be able to see clearly the people holding the kits will have their backs facing what?", "choices": ["their car", "each other", "sun", "their front"], "correct_choice_idx": 2, "direct_answers": ["sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["The sun is blocking the view.", "People can't see if they're staring into the sun.", "The people are holding their backs to the sun."], "image": "train2014/COCO_train2014_000000481885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352127, "question_id": "MvtyR9zuzgEnQm3rSHhtWk", "question": "What kind of transportation is shown?", "choices": ["rail", "road", "air", "water"], "correct_choice_idx": 1, "direct_answers": ["road", "truck", "truck", "truck", "truck", "semi truck", "truck", "truck transportation", "truck", "truck"], "difficult_direct_answer": false, "rationales": ["The transport is for the road.", "A truck is shown.", "There is traffic lights on the top."], "image": "val2014/COCO_val2014_000000352127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282442, "question_id": "MwCzPGrYJ2NqmTkcHdWhfG", "question": "Which part of the animal is precious?", "choices": ["ivory", "skin", "nose", "cape"], "correct_choice_idx": 0, "direct_answers": ["tusk", "tusks", "tusks", "ivory", "trunk", "ivory tusks", "tusk", "elephant", "entire thing", "all"], "difficult_direct_answer": false, "rationales": ["The elephant has tusks made of ivory that are worth a lot of money.", "The tusk of an elephant is made of ivory, and they are killed by poachers who sell the tusks on the world market. a good rule of thumb is, \"don't buy ivory\".", "It is valued by poachers."], "image": "train2014/COCO_train2014_000000282442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254242, "question_id": "MwH3o3H7NtWg2rJXzrH4PZ", "question": "What letters are on the plane?", "choices": ["ad", "wh", "bc", "wa"], "correct_choice_idx": 3, "direct_answers": ["wa af", "wa af", "wa", "wa", "wa", "wa af", "wa", "wa", "wa", "wa"], "difficult_direct_answer": false, "rationales": ["They are on the tail of the plane", "Wa is printed on the plane.", "The tail of the plane says wa."], "image": "train2014/COCO_train2014_000000254242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130280, "question_id": "MwSRTB3ZPR2eQvRjzEyPYz", "question": "Which is one of the villains that the character on the snowboard fights?", "choices": ["cat woman", "bane", "green goblin", "wolverine"], "correct_choice_idx": 2, "direct_answers": ["green goblin", "green goblin", "doc ock", "green goblin", "snow", "batman", "venom", "spiderman", "spiderman", "mister negative"], "difficult_direct_answer": false, "rationales": ["Green goblin shows up at snowboard fights.", "The green goblin is the villain to spiderman.", "The character on the snowboard is spider-man based on the visible features. this character is commonly known to fight answer a."], "image": "train2014/COCO_train2014_000000130280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274939, "question_id": "Mwk5xscn3Mq6kwbCnqrfR3", "question": "What meal has already happened?", "choices": ["dessert", "breakfast", "lunch", "dinner"], "correct_choice_idx": 1, "direct_answers": ["breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast"], "difficult_direct_answer": false, "rationales": ["Breakfast has happened since it's the afternoon.", "It's almost 11 a.m. since it's daylight", "The clock shows that it is almost 11:00 am so the first meal of the day has already happened. it is almost time for the mid-day meal."], "image": "val2014/COCO_val2014_000000274939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467431, "question_id": "Mwz45z8JHXH9oGDBSyKyD9", "question": "What is the size of the TV in the image?", "choices": ["60 inch", "24 inch", "32 inch", "40 inch"], "correct_choice_idx": 2, "direct_answers": ["forty inch", "large", "small", "32 inch", "large screen", "moderate size", "medium sized", "medium", "forty inch", "forty inches"], "difficult_direct_answer": true, "rationales": ["According to how it has been mount it look as it is 32 inch.", "The tv is on the smaller side but perhaps it's medium sized.", "The size of the tv on the wall is a small screen around 32 inches diagonally."], "image": "train2014/COCO_train2014_000000467431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252025, "question_id": "MwzsMCZPDKXxJTe2ee2nPk", "question": "Why are the people standing on the bridge?", "choices": ["for fun", "less odor", "avoiding attack", "better view"], "correct_choice_idx": 3, "direct_answers": ["watching", "better view", "overlook enclosure", "observing giraffes", "safety", "watching giraffes", "feeding", "watch animals", "observing", "watching giraffes"], "difficult_direct_answer": true, "rationales": ["The elevated vantage point these people observe the giraffes from give them a better view than a ground level view would.", "The people are wanting to get a good look at the giraffes.", "The people want a top down view of the giraffes."], "image": "train2014/COCO_train2014_000000252025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432547, "question_id": "MxKvgkjz4FBNVvKKFc9Xhk", "question": "What is the man doing with the paper?", "choices": ["sailing boats", "decorating table", "making airplanes", "wrapping gifts"], "correct_choice_idx": 3, "direct_answers": ["cutting", "cutting", "wrapping gifts", "wrapping gifts", "wrapping gifts", "wrapping gifts", "cutting", "cutting", "wrapping gifts", "cutting"], "difficult_direct_answer": false, "rationales": ["He is wrapping presents with wrapping paper.", "This is to cover boxes for people to open on special occasions", "He is cutting wrapping paper for gifts."], "image": "val2014/COCO_val2014_000000432547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577912, "question_id": "MxScgu4KqvuJEpNNLZxfqP", "question": "What purpose does the large white bowl sitting in the window likely serve?", "choices": ["tub", "pail", "sink", "foot soak"], "correct_choice_idx": 2, "direct_answers": ["sink", "wash hands", "latin purpose", "sink", "bathroom sink", "handwashing", "sink", "sink", "for bathing", "hand washing"], "difficult_direct_answer": false, "rationales": ["A large vessel is on the counter in a bathroom.", "The large white bowl is under a faucet. it catches the water that leaves the faucet.", "The tub is on the right. the large white bowl is beneath a faucet and is too high to be a foot soak and too fancy to be a pail."], "image": "val2014/COCO_val2014_000000577912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494968, "question_id": "MxVPoLgkqAF2aJ9SxVTFzK", "question": "What city might this bike be ridden in?", "choices": ["san francisco", "new york", "los angeles", "chicago"], "correct_choice_idx": 0, "direct_answers": ["san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "san francisco"], "difficult_direct_answer": false, "rationales": ["The acronym on the bike says sf, standing for san francisco.", "The bike has an sfpd sticker on it so the city would likely be san francisco.", "The motorcycle says san francisco police on it."], "image": "train2014/COCO_train2014_000000494968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101218, "question_id": "MxZ5AviqD49twDVFcrJBci", "question": "He is pretending the tennis racket is what?", "choices": ["guitar", "violin", "cello", "flute"], "correct_choice_idx": 0, "direct_answers": ["guitar", "guitar", "guitar", "guitar", "guitar", "guitar", "guitar", "guitar", "guitar", "guitar"], "difficult_direct_answer": false, "rationales": ["He is holding the racket like a musical instrument.", "A guitar is played in the same position the man is imitating.", "The handle of the tennis racket this man holds as though holding the frets of a guitar board and this other hand plucks the imaginary strings with an imaginary pick."], "image": "train2014/COCO_train2014_000000101218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432216, "question_id": "MxaqYz7wJiFkD3Vb7Lx6WS", "question": "Why have the people gathered here?", "choices": ["work", "rescue", "vacation", "worship"], "correct_choice_idx": 2, "direct_answers": ["skiing", "ski", "ski fun", "ski", "to ski", "skiing", "vacation", "skiing", "to ski", "skiing"], "difficult_direct_answer": false, "rationales": ["This is recreational skiing", "Attending a ski resort as the people here pictured are doing is for rest and relaxation purposes unless they are training for athletic activity which this group is not.", "The people are seen on a ski slope. this is a location people frequently go as a vacation destination."], "image": "train2014/COCO_train2014_000000432216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497194, "question_id": "MxpSMffNp2yo4Fhfb4nb8h", "question": "What are the animals walking along?", "choices": ["trail", "sidewalk", "fence", "rubble"], "correct_choice_idx": 0, "direct_answers": ["trail", "path", "sheep", "path", "path", "path", "footpath", "trail", "path", "path"], "difficult_direct_answer": false, "rationales": ["The sheep are going on a path.", "A simple process of elimination leads us to the correct answer, as does a casual look at the animals.", "There is a worn path from walking"], "image": "train2014/COCO_train2014_000000497194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571867, "question_id": "MyDdEKtGXReYiXtaWu7tn2", "question": "What style food is the lady in pink going to enjoy next?", "choices": ["soul food", "chinese", "pizza", "mexican"], "correct_choice_idx": 3, "direct_answers": ["mexican", "mexican", "mexican", "mexican", "tacos", "mexican", "mexican food", "mexican", "tacos", "mexican"], "difficult_direct_answer": false, "rationales": ["She has salsa on her plate", "There is a taco in front of the woman.", "The food on the plate is tacos."], "image": "train2014/COCO_train2014_000000571867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264828, "question_id": "MybAevDyQi473VBCmg5EgS", "question": "The largest item here is usually found where?", "choices": ["ocean", "hangar", "office building", "cave"], "correct_choice_idx": 1, "direct_answers": ["hangar", "airport", "sky airport", "washington dc", "airplane", "sky", "tarmac", "sky", "airport", "airport sky"], "difficult_direct_answer": false, "rationales": ["The hangar usually is where the plane is.", "Planes are stored in large empty structures", "Planes are very big and the only indoor space they will fit is a hangar."], "image": "train2014/COCO_train2014_000000264828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403619, "question_id": "MyhnSZyykE8eAXrUgfJV9e", "question": "What item with a Heineken logo sits to the left of the laptop computer?", "choices": ["coaster", "can", "koozie", "mousepad"], "correct_choice_idx": 0, "direct_answers": ["coaster", "coaster", "coaster", "coaster", "coaster", "coaster", "coaster", "coaster", "coaster", "coaster"], "difficult_direct_answer": false, "rationales": ["The item is round and foamy so it is likely a coaster.", "The item is thick and round and clearly intended for drinks to rest on.", "The item is a coaster to put drinks on."], "image": "val2014/COCO_val2014_000000403619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434159, "question_id": "Mz54u4VSdP9prnVwTetdLS", "question": "What sort of building stand in could this building substitute for in a movie?", "choices": ["nunnery", "castle", "711", "taxi stand"], "correct_choice_idx": 1, "direct_answers": ["college campus", "buckingham palace", "castle", "castle", "castle", "castle", "castle", "castle", "medieval movie", "jail"], "difficult_direct_answer": false, "rationales": ["By the design and what materials was used to make this structure it's easy to get the correct answer.", "The building is a large old castle made of stone.", "This could be a castle in a movie."], "image": "val2014/COCO_val2014_000000434159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205564, "question_id": "MzAYC2C7eMjnZNr8ohYCHE", "question": "Where is the headquarters of the company that makes the hoops?", "choices": ["sacramento", "denver", "dover", "new york"], "correct_choice_idx": 3, "direct_answers": ["america", "united states", "east aurora", "germany", "new york", "new york", "fisher-price", "aurora ny", "united states", "east-aurora ny"], "difficult_direct_answer": false, "rationales": ["The headquarters are in ny.", "New york is where fisher price is.", "The child are move us so the headquarter is from newyork."], "image": "train2014/COCO_train2014_000000205564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30437, "question_id": "MzKCWmqUwTMgZyeh9HaKcT", "question": "What are the people next to each other seated on?", "choices": ["chair", "ski lift", "bench", "sofa"], "correct_choice_idx": 1, "direct_answers": ["lift", "ski lift", "ski lift", "chairlift", "ski lift", "ski lift", "snow", "ski lift", "chairlift", "lift"], "difficult_direct_answer": false, "rationales": ["The people are on a ski lift.", "The object they are sitting on consists of a chair that is attached to a cable. this carries them up the mountain so they can participate in an extreme winter sport.", "They are sitting here to be lifted in order to see clear."], "image": "train2014/COCO_train2014_000000030437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246532, "question_id": "MzaqPC6CYGZBzbwXwunTca", "question": "Where is the man snowboarding?", "choices": ["on mountain", "inside", "amusement park", "in videogame"], "correct_choice_idx": 0, "direct_answers": ["mountain", "mountains", "mountain", "mountain", "mountains", "on mountain", "mountains", "mountains", "mountains", "mountains"], "difficult_direct_answer": false, "rationales": ["A man is snowboarding down a large snowy hill. he has started at the top of a peak that goes well into the sky and ski down it.", "The man is on a ski trail on the mountain.", "The man is riding the board on a sloped area."], "image": "val2014/COCO_val2014_000000246532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422893, "question_id": "MzbuzyCzif3XxprBg9JesE", "question": "What lies under the people here?", "choices": ["surfboard", "dolphins", "nothing", "manatees"], "correct_choice_idx": 0, "direct_answers": ["ocean", "surfboard", "water", "surfboards", "ocean", "water", "water", "water", "ocean wave", "surfboard"], "difficult_direct_answer": false, "rationales": ["The people are riding on surfboards.", "There might be b or c under a, but a is directly under them.", "People are using boards on the water."], "image": "val2014/COCO_val2014_000000422893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110435, "question_id": "MzpVRKroPbiyCBZoTjPxDG", "question": "Why is the dog in the sink?", "choices": ["to cut", "to sleep", "to bathe", "to heal"], "correct_choice_idx": 2, "direct_answers": ["bath time", "bathing", "bath", "to bathe", "bathing", "being bathed", "bath", "bathing", "bathing", "bath"], "difficult_direct_answer": false, "rationales": ["The animal is in a bucket. the lady is using a cup to scoop and pour water on the animal.", "The dog is bathing.", "This is a place that dogs would commonly only appear intentionally for one reason. the woman also appears to be holding something to the dog to scrub it."], "image": "val2014/COCO_val2014_000000110435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22907, "question_id": "N2W8R2JxqX4ugQbuysXb72", "question": "What animal looks at the bird?", "choices": ["cat", "dog", "no animal", "iguana"], "correct_choice_idx": 0, "direct_answers": ["cat", "human", "chicken", "hen", "chicken", "chicken", "chicken", "hen", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The shape of the ears and head in the foreground are distinctive of a cat. cats naturally prey on birds, and would watch them closely out of a desire to hunt them.", "A cat is watching.", "You can see the ears and the top of the head of the animal."], "image": "train2014/COCO_train2014_000000022907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90278, "question_id": "N2abtPdrGibx99T6DKxhgh", "question": "Who are the group of people on the opposite side of the road?", "choices": ["pedestrians", "tourists", "workers", "protesters"], "correct_choice_idx": 1, "direct_answers": ["tourists", "sightseers", "bus passengers", "pedestrians", "tourists", "bus riders", "passengers", "bus riders", "bus riders", "tourists"], "difficult_direct_answer": false, "rationales": ["A group of people are standing on the sidewalk near two large charter buses. tourists take charter buses to see sights.", "They are waiting to board the busses", "A group of people are waiting to board a tourist bus."], "image": "train2014/COCO_train2014_000000090278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35127, "question_id": "N2jjx82mfApLB7G6M7iKKh", "question": "How is this laptop connected to the network in this building?", "choices": ["cellular modem", "dial-up modem", "wi-fi", "wired ethernet"], "correct_choice_idx": 3, "direct_answers": ["wired", "cable connection", "ethernet", "wireless connection", "wires", "cord", "ethernet", "cord", "wired ethernet", "cable"], "difficult_direct_answer": false, "rationales": ["There are cables connecting the internet to the wi-fi.", "The laptop is connected to the network in the building with the black wire ethernet cable.", "The blue wire going out of the right side of the laptop will connect the laptop to the internet."], "image": "train2014/COCO_train2014_000000035127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409489, "question_id": "N2koBFbuP5XH99vwmp47mC", "question": "What is/are going to be baked?", "choices": ["black cat", "grey cat", "cake", "both cats"], "correct_choice_idx": 2, "direct_answers": ["cats", "cats", "bread", "bread", "bread", "cats", "bread", "cake", "pie", "bread"], "difficult_direct_answer": false, "rationales": ["There is a sweet round pastry on top of the stove waiting to be put in the oven.", "Baking the cats would be cruel. the item on top of the stove is going to be baked.", "It would be cruel to cook the cats. the item on top of the stove is going to be baked."], "image": "train2014/COCO_train2014_000000409489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550874, "question_id": "N2oRQRTQ3asv4wUU6uNFkt", "question": "What is this bus advertising?", "choices": ["political figures", "musicians", "food", "street performers"], "correct_choice_idx": 0, "direct_answers": ["mitt romney", "republican voting", "romney", "social media", "presidential campaign", "romney ryan", "political candidate", "presidency", "political figures", "more jobs"], "difficult_direct_answer": true, "rationales": ["The bus is advertising the campaign of romney and ryan.", "The bus is advertising politicians for president.", "The bus has political figures."], "image": "train2014/COCO_train2014_000000550874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41993, "question_id": "N2odnpZmpCYeQPfbcmSsdf", "question": "What is another animal that has markings this colour?", "choices": ["goldfish", "penguin", "chicken", "dinosaur"], "correct_choice_idx": 1, "direct_answers": ["dalmatian", "cow", "panda bear", "donkey", "okapi", "penguin", "penguin", "snow leopard", "skunk", "skunk"], "difficult_direct_answer": false, "rationales": ["Penguins are black and white.", "These animals are black and white. penguins are (mostly) also black and white in coloration.", "Animals are black and white. penguins are also these colors."], "image": "train2014/COCO_train2014_000000041993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511221, "question_id": "N2tjeu5H5yp8CYeacj4ouW", "question": "Why does she look disgusted?", "choices": ["is sad", "pst fprl", "wants more", "dislikes vegetables"], "correct_choice_idx": 3, "direct_answers": ["bad taste", "many vegetables", "vegetables", "dislikes vegetables", "icky vegetables", "vegetables", "veggies nasty", "eating vegetables", "vegetables", "vegetables"], "difficult_direct_answer": false, "rationales": ["The girl doesn't like peas or carrots.", "She is responding to the look and taste of the peas, which she does not like. .", "It is dinner time and there are veggies in front of her."], "image": "train2014/COCO_train2014_000000511221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393091, "question_id": "N3JBKJWddoWBcrKK5p5bZP", "question": "What purpose are the white remotes serving?", "choices": ["window cleaners", "queue maintainence", "wii controls", "sales objects"], "correct_choice_idx": 2, "direct_answers": ["playing wii", "gaming", "operate game", "game", "game controllers", "wii controls", "game controllers", "wii playing", "control", "wii game"], "difficult_direct_answer": true, "rationales": ["They are used to play a video game.", "The are used to play video games.", "They have game controllers in their hands."], "image": "train2014/COCO_train2014_000000393091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105564, "question_id": "N3KYdKuZLw38AJueKzRubA", "question": "What is the brown item next to the mattress and chair?", "choices": ["leaves", "box", "dirt", "cat"], "correct_choice_idx": 1, "direct_answers": ["box", "box", "box", "cardboard box", "box", "box", "box", "cardboard box", "box", "cardboard box"], "difficult_direct_answer": false, "rationales": ["This is a cardboard cube used to pack things", "There is a cardboard container next to the chair.", "The brown item is a cardboard box."], "image": "train2014/COCO_train2014_000000105564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215024, "question_id": "N3fkpQPsAkb5NCWjdUKfjJ", "question": "Where are the people on the boat ultimately headed?", "choices": ["mexico city", "thailand", "near shore", "laguardia"], "correct_choice_idx": 2, "direct_answers": ["across waterway", "to shore", "to city", "shore", "home", "new york", "shore", "land", "near shore", "shore"], "difficult_direct_answer": false, "rationales": ["The boat is transporting people to a shore near the city.", "The boat is heading towards the city.", "They are headed to land."], "image": "val2014/COCO_val2014_000000215024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187285, "question_id": "N3hLQwzTUjt8MMSqMhm2im", "question": "When the surfer riding the wave looks the other way and the board hits him at full force how badly would he be injured?", "choices": ["severely injured", "moderately injured", "slightly injured", "completely safe"], "correct_choice_idx": 0, "direct_answers": ["very badly", "very badly", "medium amount", "severely injured", "very badly", "badly", "very badly", "very badly", "severely", "very badly"], "difficult_direct_answer": false, "rationales": ["There is no way to tell for sure but judging by the size of the wave and the power of the ocean behind it there is a high chance for bad injuries.", "The other surfer is going at a high speed and this would really hurt someone.", "The surfer is injured."], "image": "val2014/COCO_val2014_000000187285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278829, "question_id": "N4MY8437ZK4un5bC5N8Jgx", "question": "What insurance company is a sponsor of the baseball field?", "choices": ["state farm", "farmers", "nationwide", "geico"], "correct_choice_idx": 3, "direct_answers": ["geico", "geico", "geico", "geico", "geico", "geico", "geico", "geico", "geico", "geico"], "difficult_direct_answer": false, "rationales": ["The insurance company on the wall is geico", "The blue and white sign on the right is an ad for an insurance company.", "There is a large blue insurance company sign."], "image": "val2014/COCO_val2014_000000278829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134075, "question_id": "N4S7DWqkQtNQGhK9joDnqP", "question": "What are the people looking at?", "choices": ["menus", "ads", "magazines", "books"], "correct_choice_idx": 0, "direct_answers": ["menu", "menu", "menus", "menus", "menus", "menu", "dessert menu", "menus", "menus", "menus"], "difficult_direct_answer": false, "rationales": ["The design of the objects and the displays of food are consistent with answer a.", "There is a group of people sitting at a restaurant table to eat and they all are deciding what to order.", "They are looking at their menus to see what they want to order."], "image": "val2014/COCO_val2014_000000134075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508881, "question_id": "N4vy6hmNrBhnf68wiiv5vd", "question": "What region of the world does this plane originate from?", "choices": ["germania", "east europe", "rocky mountains", "scandinavia"], "correct_choice_idx": 3, "direct_answers": ["finland", "scandinavia", "finland", "finland", "finland", "finland", "finland", "finland", "finland", "finland"], "difficult_direct_answer": false, "rationales": ["This looks like a plane from finland and that's in scandinavia.", "Looks like it was a plane from finland.", "The airline says finnair so this has to be around scandinavia."], "image": "train2014/COCO_train2014_000000508881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84230, "question_id": "N52a7HpBfmJ4AmmcbMj3tU", "question": "What type of material makes up a majority of the construction in the area?", "choices": ["stone", "steel", "wood", "mud"], "correct_choice_idx": 0, "direct_answers": ["brick", "breaks", "brick", "bricks", "brick", "clay bricks", "stone", "brick", "stone", "brick"], "difficult_direct_answer": false, "rationales": ["Looks like brick or stone is on the ground.", "The street is stone and the walls are bricks.", "They all have rock like structures that are shaped in order to build."], "image": "train2014/COCO_train2014_000000084230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91909, "question_id": "N5dUD8ytWane7Wm9Vbcngd", "question": "What number does the car advise you to call?", "choices": ["zero", "911", "1-800 number", "411"], "correct_choice_idx": 1, "direct_answers": ["911", "911", "911", "911", "nine-one-one", "911", "ninehundredeleven", "nine omeone", "911", "911"], "difficult_direct_answer": false, "rationales": ["The number is visible on the side of the vehicle and is known to be associated with police.", "The vehicle is a police car. it has the north american emergency phone number near its taillight.", "This number is used for emergencies to call for help."], "image": "val2014/COCO_val2014_000000091909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535445, "question_id": "N5htTtLzjDzB54wbQVKvgo", "question": "Where is the pillow?", "choices": ["refrigerator", "hammock", "bed", "chair"], "correct_choice_idx": 3, "direct_answers": ["couch", "on chair", "chair", "armchair", "on chair", "on chair", "chair", "chair", "chair", "on chair"], "difficult_direct_answer": false, "rationales": ["The pillow is on the chair.", "A stripped rainbow pillow is on the side of a cushioned piece of furniture.", "The pillow is sitting on the armrest of a chair."], "image": "train2014/COCO_train2014_000000535445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505039, "question_id": "N5wecgsUbQTawiPyHS22TF", "question": "How many animal species are shown NOT including the people on shore?", "choices": ["one", "nine", "11", "five"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "two", "one", "lot"], "difficult_direct_answer": false, "rationales": ["Cows are the only animals in this photo", "Just many of the same animal are shown.", "One species is shown."], "image": "val2014/COCO_val2014_000000505039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411138, "question_id": "N5zPWUg2vbaxsrciNksDEV", "question": "The plants are growing in what type of outdoor structure?", "choices": ["barn", "greenhouse", "nursery", "sunroom"], "correct_choice_idx": 1, "direct_answers": ["outhouse", "greenhouse", "greenhouse", "greenhouse", "greenhouse", "outhouse", "greenhouse", "greenhouse", "greenhouse", "outhouse"], "difficult_direct_answer": false, "rationales": ["They are in a greenhouse.", "That's what is used for growing things year round.", "The greenhouse holds plants."], "image": "val2014/COCO_val2014_000000411138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341838, "question_id": "N65cQznrmYcCoPqmBZ3dyz", "question": "What group of people mostly live in this area?", "choices": ["korean", "chinese", "japanese", "indian"], "correct_choice_idx": 1, "direct_answers": ["chinese", "chinese", "citizen", "citizen", "chinese", "chinese", "chinese", "chinese", "chinese", "chinese"], "difficult_direct_answer": false, "rationales": ["This looks to be an asian city.", "The building on the right has a sign on it that tells what area in the city the neighborhood is located.", "There is a building structure with the writing of the towns name."], "image": "val2014/COCO_val2014_000000341838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102111, "question_id": "N6Bbbv7V3FBpd5L3MfVSVz", "question": "What famous bird is also this colour?", "choices": ["parrot", "black bird", "flamingo", "eagle"], "correct_choice_idx": 2, "direct_answers": ["flamingo", "flamingo", "flamingo", "flamingo", "flamingo", "flamingo", "flamingo", "flamingo", "flamingo", "flamingo"], "difficult_direct_answer": false, "rationales": ["Flamingo's are pink. they turn pink from the food they eat.", "These objects are mostly pink in color. flamingos are also mostly pink in coloration.", "These birds have a pink color due to the shrimp that they eat."], "image": "train2014/COCO_train2014_000000102111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94949, "question_id": "N6Le3RY9mQk5fjrVUZg34X", "question": "Where are the people on the wire going?", "choices": ["home", "gift shop", "bus stop", "summit"], "correct_choice_idx": 3, "direct_answers": ["ski lift", "up mountain", "uphill", "snow", "mountaintop", "peak", "up mountain", "slopes", "summit", "up mountain"], "difficult_direct_answer": false, "rationales": ["Based on the setting and the visible equipment, the people are skiing and the visible lift would be there to bring them up the mountain.", "The wire above the skier is part of a chairlift. it is carrying people up the ski hill.", "A ski lift takes people up to a higher elevation."], "image": "train2014/COCO_train2014_000000094949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199923, "question_id": "N6M2vZ7LNKRs4zcxPp7VHY", "question": "Which way should the fan turn to circulate air in the room?", "choices": ["counter clockwise", "downward", "upward", "clockwise"], "correct_choice_idx": 3, "direct_answers": ["round", "right", "clockwise", "clockwise", "right", "right", "right", "clockwise", "clockwise", "clockwise"], "difficult_direct_answer": false, "rationales": ["If the fan turns clockwise air will be moved through the room and the people will feel it's effect.", "The way is clockwise.", "The fan should go in the direction of the clock."], "image": "train2014/COCO_train2014_000000199923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544386, "question_id": "N6Zqxwt5kxncQ4aJtjnr7W", "question": "Why would someone sit at these tables?", "choices": ["to paint", "to work", "to eat", "to sew"], "correct_choice_idx": 2, "direct_answers": ["to eat", "eat", "eat food", "to eat", "to eat", "to eat", "to eat", "eat", "eating", "to eat"], "difficult_direct_answer": false, "rationales": ["The tables are located beside a food truck, so people can sit at and enjoy their food.", "There is a food truck parked next to the tables.", "The tables are set up in front of a food truck and the people at the tables are waiting on their orders."], "image": "train2014/COCO_train2014_000000544386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23311, "question_id": "N6jLJGpc3UAVQD5XRdHZNL", "question": "What is the woman doing near the microwave?", "choices": ["resting", "cleaning", "cooking", "modeling"], "correct_choice_idx": 3, "direct_answers": ["posing", "posing", "posing", "modeling", "posing", "posing", "dress", "posing", "posing", "posing"], "difficult_direct_answer": false, "rationales": ["The woman is smiling and has her head resting on her hand.", "The woman near the microwave is posing for the camera because she is a model.", "The woman is posing near the microwave."], "image": "train2014/COCO_train2014_000000023311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344633, "question_id": "N7BfcTeSUw4GbiCEZ879uY", "question": "Why do the people wear head gear?", "choices": ["fashion", "streamlining", "protection", "to match"], "correct_choice_idx": 2, "direct_answers": ["safety", "protection", "protection", "protect head", "for protection", "protection", "protect heads", "protection", "protect heads", "protection"], "difficult_direct_answer": false, "rationales": ["This type of sport can be dangerous and you need protection.", "The people want to protect their heads.", "If they fall off of a horse, their heads should be protected in case they hit the ground."], "image": "val2014/COCO_val2014_000000344633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545263, "question_id": "N7MjESHYGHjcf9QX9URbYQ", "question": "What is the common term given to the above collection?", "choices": ["people", "congregation", "bears", "dolls"], "correct_choice_idx": 3, "direct_answers": ["collectors", "couture", "teddy bears", "teddy bear", "dolls", "figurines", "fashion", "teddy bear", "teddy bear", "evening wear"], "difficult_direct_answer": false, "rationales": ["These are dolls.", "These are toys", "The costumed stuffed animals all have bear features."], "image": "train2014/COCO_train2014_000000545263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337895, "question_id": "N7bDEYWHDKrSebXHN96q3h", "question": "What kind of cleats is the batter wearing?", "choices": ["puma", "nike", "adidas", "reebok"], "correct_choice_idx": 1, "direct_answers": ["baseball", "nike", "nike", "metal", "nike", "baseball cleats", "metal", "baseball", "black cleats", "nike"], "difficult_direct_answer": false, "rationales": ["A swoosh is on the side of a baseball shoe.", "It's got the iconic swoosh on the shoe.", "You can tell by the logo on the shoe who made that particular cleat."], "image": "train2014/COCO_train2014_000000337895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574114, "question_id": "N7m8iNHvtYqMKGbjC5xa9F", "question": "What is the large blue object under the plane wing?", "choices": ["ladder", "wheel", "engine", "luggage"], "correct_choice_idx": 2, "direct_answers": ["turbine engines", "engine", "engine", "propeller", "engine", "engine", "engine", "fan", "engine", "engine"], "difficult_direct_answer": false, "rationales": ["You can tell by the design and structure as to what is under the planes wing.", "The engine is located under the wing of planes.", "The luggage goes inside the plane, and the wheels are under the plane. the ladder attaches to the doors that are in front of and behind the wings."], "image": "train2014/COCO_train2014_000000574114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359688, "question_id": "N7q3MkiB3YtmQw9mmJxDPo", "question": "What operating system is the man with his feet on the desk a fan of?", "choices": ["linux", "windows", "macos", "android"], "correct_choice_idx": 0, "direct_answers": ["windows", "linux", "windows", "linux", "linux", "windows", "windows", "linux", "linux", "linux"], "difficult_direct_answer": false, "rationales": ["Linux is commonly associated with the computer shown.", "The penguin is the logo for this operating system and he has one on his desk.", "The man has a penguin figurine on his desk which is the mascot for linux systems."], "image": "train2014/COCO_train2014_000000359688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368302, "question_id": "N7wkGnvaSpaiRmE8CVr5h3", "question": "What bulk food item was once processed in the leftmost building?", "choices": ["grain", "potatoes", "tomatoes", "corn"], "correct_choice_idx": 0, "direct_answers": ["flour", "grain", "flour", "meats", "flour", "wheat", "chips", "flour", "food bank", "flour"], "difficult_direct_answer": false, "rationales": ["It used it to make the flour.", "The sign says flour and it looks like a flour mill.", "The building on the left has a sign on the top that says gold medal flour."], "image": "val2014/COCO_val2014_000000368302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295377, "question_id": "N8Jpig3Vu2ZCM2rxNRJbiw", "question": "What should noobs do here?", "choices": ["leave", "try snowboarding", "do stunts", "try skiing"], "correct_choice_idx": 0, "direct_answers": ["leave", "don't go", "avoid area", "not ski", "leave", "turn around", "stay away", "stop", "not skii", "stay off"], "difficult_direct_answer": true, "rationales": ["People that don't know how to ski this run shouldn't try. the sign indicates that it is for experts only.", "The sign says experts only should continue on and a noob is someone without much experience and thus not an expert.", "Noobs should leave."], "image": "val2014/COCO_val2014_000000295377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59225, "question_id": "N8LbVnRWGbWh5nqdNkiMHR", "question": "What sort of For pay area is near this meter?", "choices": ["parking", "grocery", "racing", "bike kiosk"], "correct_choice_idx": 0, "direct_answers": ["phone", "parking", "parking meter", "parking meter", "parking meter", "parking", "skiing", "parking", "parking", "parking meter"], "difficult_direct_answer": false, "rationales": ["One can see the meter and the p decal on it.", "The parking meter is almost covered by snow and no cars are near.", "It's a parking meter that you put money in so you can park there"], "image": "val2014/COCO_val2014_000000059225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83257, "question_id": "N8Wrf3PKukzhFtSRbM7UoL", "question": "Of which movie is the person with the Camera a fan?", "choices": ["avengers", "dumbo", "star wars", "ted"], "correct_choice_idx": 2, "direct_answers": ["star wars", "star wars", "star wars", "star wars", "star wars", "tin man", "star wars", "star wars", "tin man", "tin man"], "difficult_direct_answer": false, "rationales": ["The person is standing next to c-3po.", "The man is standing next to c-3p0.", "The characters next to them are a jawa and c3-po from the famous movie."], "image": "val2014/COCO_val2014_000000083257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178771, "question_id": "N8sJ2KrDuuD7FGB27YaHLa", "question": "To perform this trick the rider is gliding on what?", "choices": ["mosiac", "top", "rails", "court"], "correct_choice_idx": 2, "direct_answers": ["bench", "skateboard", "bench", "wheels", "stone", "bench", "stone top", "edge", "rails", "skateboard"], "difficult_direct_answer": false, "rationales": ["People use a rail to grind with the board.", "It's the a that connects the wheels. on a motor vehicle, they would be known as axles.", "The rails as shown must gild on to perform trick."], "image": "train2014/COCO_train2014_000000178771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251523, "question_id": "N8sSKjQPeAyfk486WKNPuQ", "question": "What is near the donut?", "choices": ["bagel", "egg", "dog", "hand"], "correct_choice_idx": 3, "direct_answers": ["napkin", "napkin", "napkin", "napkin", "napkin", "napkin", "legs", "paper towel", "paper towel", "hand"], "difficult_direct_answer": false, "rationales": ["A hand is holding the donut.", "The donut is near a person, not a dog, egg, or bagel.", "Someone is holding a donut."], "image": "train2014/COCO_train2014_000000251523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325078, "question_id": "N8xoctSvvVZFCBfYvt9YUQ", "question": "What provides privacy in the shower?", "choices": ["towel", "shower door", "shower curtain", "bed sheet"], "correct_choice_idx": 2, "direct_answers": ["curtain", "curtain", "shower curtain", "shower curtain", "shower curtain", "shower curtain", "shower curtain", "curtain", "curtain", "curtain"], "difficult_direct_answer": false, "rationales": ["This curtain displays blue circles, and a few yellow ones, and is made of water-proof plastic.", "The shower has a blue and clear shower curtain hanging in front of it for privacy.", "The shower curtain provides privacy."], "image": "val2014/COCO_val2014_000000325078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389256, "question_id": "N96oU9zsj9ov74K2pphr86", "question": "What color is the person wearing whom is celebrating their birthday here?", "choices": ["teal", "white", "red", "black"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "blue", "blue", "teal", "teal", "blue", "blue", "blue", "teal"], "difficult_direct_answer": false, "rationales": ["Her shirt is a blue color that has hints of green in it", "The woman cutting the cake is celebrating her birthday. her shirt is not black, white, or red.", "The person cutting the cake and with a crown on has a teal shirt."], "image": "val2014/COCO_val2014_000000389256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286388, "question_id": "N9K6Nu8EB9oVwQPFKkgZ9V", "question": "What is the yellow hump sign on top of?", "choices": ["pavement", "concrete", "dirt", "grass"], "correct_choice_idx": 2, "direct_answers": ["speed bump", "dirt", "plant", "sign post", "bush", "pole", "post", "weeds", "dirt", "plant"], "difficult_direct_answer": false, "rationales": ["There is dirt and plants under the sign.", "The sign is on top of dirt.", "The yellow sign is on top of dirt."], "image": "train2014/COCO_train2014_000000286388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534917, "question_id": "N9b4h9acDzaQjXebhE3aqr", "question": "What time of year is it?", "choices": ["autumn", "summer", "spring", "winter"], "correct_choice_idx": 1, "direct_answers": ["summer", "summer", "summer", "summer", "spring", "spring", "spring", "summer", "summer", "spring"], "difficult_direct_answer": false, "rationales": ["The deep green of both the trees and grass are hallmarks of a summertime day. these sheep are grazing on a small hill that has plenty of grass to keep them fed.", "The time is summer.", "The grass is a vibrant shade of green."], "image": "train2014/COCO_train2014_000000534917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328900, "question_id": "N9baTd4AEwvCuLrGp275fi", "question": "What is being told to this woman?", "choices": ["joke", "pleasantries", "nothing", "something serious"], "correct_choice_idx": 3, "direct_answers": ["bad news", "talking", "bad news", "directions", "conversation", "bad news", "bad news", "something serious", "something serious", "bad news"], "difficult_direct_answer": false, "rationales": ["The woman has a solemn expression.", "She has a furrowed brow.", "The woman is talking on the phone and has a serious look on her face as if she is hearing news that isn't funny or happy."], "image": "train2014/COCO_train2014_000000328900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364691, "question_id": "N9rXNEKhrgpqxAhWYd2c3o", "question": "What type of event is the skateboarder taking place in?", "choices": ["deathmatch", "slalom", "best trick", "lap race"], "correct_choice_idx": 1, "direct_answers": ["trick competition", "competition", "demonstration", "trick competition", "competition", "agility contest", "festival", "casual", "slalom", "skateboard competition"], "difficult_direct_answer": false, "rationales": ["Who can do the best stunt.", "The skateboard is weaving around.", "The person does this trick."], "image": "train2014/COCO_train2014_000000364691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492408, "question_id": "N9tz3xeungWwGhnRfq3ZUX", "question": "Why does the man hold 2 umbrellas?", "choices": ["sun protection", "confusion", "photograph pose", "snow prevention"], "correct_choice_idx": 2, "direct_answers": ["raining", "covering something", "photograph pose", "extra dry", "it's raining", "posing", "rain", "its raining", "two hands", "it's raining"], "difficult_direct_answer": true, "rationales": ["No sun nor shine protection is needed in this scene. the confidence he shows indicates a lack of confusion.", "The man is posing for a picture.", "The man appears to be posing for the cameras."], "image": "train2014/COCO_train2014_000000492408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493814, "question_id": "NA2REjoPvjZogpvfamhcBu", "question": "What style of food does this appear to be?", "choices": ["british", "chinese", "american", "mexican"], "correct_choice_idx": 1, "direct_answers": ["chinese", "chinese", "spiced vegetables", "asian", "asian", "veg fiece", "asian", "chinese", "chinese", "stir fry"], "difficult_direct_answer": false, "rationales": ["This happens to be my favorite style but even if it weren't i think i could tell by the vegetables.", "The way the vegetables are cut and arranged together under a sauce of that composition with sesame seeds on top are all consistent with answer a.", "The food is chinese."], "image": "val2014/COCO_val2014_000000493814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133279, "question_id": "NAC9B5fwtiY345R9caq84X", "question": "What are the oval shaped green plants growing by the fence?", "choices": ["elephant plants", "cactus", "weeds", "vines"], "correct_choice_idx": 1, "direct_answers": ["cactus", "cactus", "cactus", "cactus", "cacti", "cactus", "cacti", "cacti", "cactus", "cactus"], "difficult_direct_answer": false, "rationales": ["A cactus is growing by the fence.", "Cactus have oval shaped prickly leaves.", "The item is a cactus."], "image": "val2014/COCO_val2014_000000133279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205560, "question_id": "NARXKjiFHFhhiVhKXt73Fg", "question": "This laptop and desk is located in which country in Europe?", "choices": ["germany", "france", "austria", "switzerland"], "correct_choice_idx": 3, "direct_answers": ["germany", "germany", "uk", "switzerland", "italy", "germany", "germany", "switzerland", "germany", "yes"], "difficult_direct_answer": false, "rationales": ["According to the papers on the desk, switzerland is the location.", "The laptop and desk is located in the country of switzerland.", "They're in switzerland."], "image": "train2014/COCO_train2014_000000205560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548939, "question_id": "NASQ5TAWUGP2gK7wCbyZHr", "question": "How can he bring the board home without riding it?", "choices": ["throw", "remote", "carry", "mail"], "correct_choice_idx": 2, "direct_answers": ["pull it", "carry", "carry", "carrying", "carry", "carry", "carrying", "carrying", "carry", "carry it"], "difficult_direct_answer": false, "rationales": ["He can pick it up with his hands", "The skateboard is small enough to be carried.", "The man can put the board in his arms."], "image": "val2014/COCO_val2014_000000548939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437970, "question_id": "NAc3DNtj4GH2aygZwm6JDa", "question": "What is the main purpose of the ship shown here?", "choices": ["cruise ship", "passenger", "vacationing", "shipping cargo"], "correct_choice_idx": 3, "direct_answers": ["boat", "shipping", "transport cargo", "ship cargo", "unloading", "cargo", "tanker", "transporting cargo", "shipping cargo", "shipping"], "difficult_direct_answer": true, "rationales": ["The boat is large and is carrying a large quantity of containers. aside from crew members, it does not carry people.", "The ship is loaded with shipping containers that are used to transport cargo across the water.", "The ship in the background is used to ship cargo."], "image": "train2014/COCO_train2014_000000437970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186075, "question_id": "NAh57uiw5NdZ9C4tE6gpSp", "question": "What is the bird doing?", "choices": ["falling", "resting", "landing", "eating"], "correct_choice_idx": 2, "direct_answers": ["feeding", "landing", "feeding", "flying", "flying", "flying", "flying", "feeding", "flying", "flying"], "difficult_direct_answer": false, "rationales": ["The bird is flying and about to land.", "The bird is landing.", "The bird is in the air. it's not flying."], "image": "train2014/COCO_train2014_000000186075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357279, "question_id": "NAiYoKddniNwMJdyuXb9pX", "question": "Why are there straps on snowboards?", "choices": ["law", "support/response", "style", "security"], "correct_choice_idx": 1, "direct_answers": ["support/response", "fits feet", "connecting people", "stay put", "hold boots", "foot connection", "stay attached", "hold feet", "adhesion", "keep feet"], "difficult_direct_answer": true, "rationales": ["They may also be b depending on the design, but they're primarily for a.", "The straps help support balance.", "They will keep them on your feet."], "image": "val2014/COCO_val2014_000000357279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225850, "question_id": "NAjkWKL4UzR8JkPX5E3kwp", "question": "This person is most likely going to do what activity?", "choices": ["working", "birdwatching", "gymnastics", "jogging"], "correct_choice_idx": 3, "direct_answers": ["talking", "hike", "jogging", "run", "run", "run", "jog", "running", "jogging", "talk"], "difficult_direct_answer": false, "rationales": ["The person is likely going for a run.", "The person is wearing exercise gear and running shoes.", "The person will run."], "image": "val2014/COCO_val2014_000000225850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54490, "question_id": "NAm6v6koy956jWfNGnYMYG", "question": "What shape are all these objects?", "choices": ["nonagon", "hexagon", "pentagon", "diamond"], "correct_choice_idx": 1, "direct_answers": ["cylinders", "octagon", "cylinders", "hexagon", "hexagon", "hexagon", "octagon", "pentagon", "hexagon", "hexagon"], "difficult_direct_answer": false, "rationales": ["The objects all have six sides.", "These objects have six sides", "The shape is a hexagon."], "image": "val2014/COCO_val2014_000000054490.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450169, "question_id": "NAw9JTR48dcLc7Smfd7pxP", "question": "Why is it okay for the book to be there?", "choices": ["waterproof", "outdated", "won't fall", "cheap"], "correct_choice_idx": 0, "direct_answers": ["waterproof", "waterproof", "waterproof book", "waterproof", "waterproof", "yes", "waterproof", "waterproof", "waterproof", "waterproof"], "difficult_direct_answer": false, "rationales": ["The book is in the bath area because it is waterproof so kids can use it in the water.", "The children's book inside the bathtub is made of plastic that resists water damage.", "The book is made for use in a kid's bath so the water won't hurt it."], "image": "val2014/COCO_val2014_000000450169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195645, "question_id": "NB2ALBu4XFVvSLtTnDpL4r", "question": "What is in the bottle held by the smallest child?", "choices": ["rose", "milk", "wine", "zinfandel"], "correct_choice_idx": 1, "direct_answers": ["water bottle", "milk", "water", "milk", "water", "juice", "water", "milk", "water", "milk"], "difficult_direct_answer": false, "rationales": ["The bottle has milk.", "Babies drink milk.", "The child is holding milk since young babies drink that."], "image": "val2014/COCO_val2014_000000195645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356791, "question_id": "NB3aMbDPqoNTTCywPm7K4X", "question": "Where is the person walking?", "choices": ["subway", "roadway", "forest", "river"], "correct_choice_idx": 1, "direct_answers": ["roadway", "on pavement", "pedestrian line", "street", "luggage", "street", "bus stop", "road", "bus", "streetside"], "difficult_direct_answer": true, "rationales": ["She is walking on the side of a road.", "They are walking along the side of the street.", "They are on the asphalt next to the bus"], "image": "train2014/COCO_train2014_000000356791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161634, "question_id": "NBNzgMcHRZHBW464M8yyDR", "question": "Why is the man wearing wristbands?", "choices": ["camouflage", "style", "injury", "prevent sweat"], "correct_choice_idx": 3, "direct_answers": ["absorb sweat", "sweat", "arthritis prevention", "prevent sweat", "sweat", "wipe sweat", "remove sweat", "stop sweat", "sweat", "prevent sweat"], "difficult_direct_answer": false, "rationales": ["The man has sweatbands on.", "The man is wearing wristbands to ward sweat from his hands.", "The man is wearing wristbands so sweat doesn't get on his hands and make his grip slippery."], "image": "val2014/COCO_val2014_000000161634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337806, "question_id": "NBPJuNQjFB3VSohrR6Pyuj", "question": "The train engine is operating within which European country?", "choices": ["sweden", "germany", "france", "denmark"], "correct_choice_idx": 1, "direct_answers": ["germany", "germany", "germany", "germany", "france", "germany", "germany", "germany", "augsburger", "goods engine"], "difficult_direct_answer": false, "rationales": ["A train is on tracks and has german words printed on the side of it.", "The language on the train is german", "The side of the train has a german city and german language on it."], "image": "train2014/COCO_train2014_000000337806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180341, "question_id": "NBSEtuuisKqscoQ23Up2wR", "question": "How could someone near here gain elevation without expending a lot of energy?", "choices": ["ski lift", "jog", "catch taxi", "ski uphill"], "correct_choice_idx": 0, "direct_answers": ["ski lift", "ski lift", "normal", "ski lift", "ski lift", "ski lift", "chairlift", "ride lift", "chairlift", "chairlift"], "difficult_direct_answer": false, "rationales": ["The ski lift will bring you up while you are sitting down.", "The lift carries people to higher elevations.", "They could use the ski lift to take them to the top of the hill."], "image": "train2014/COCO_train2014_000000180341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64440, "question_id": "NBSfsVbaXw3dEZFXiNKtVP", "question": "What is the very front of the plane where the pilot sits called?", "choices": ["book house", "engine room", "power den", "cock pit"], "correct_choice_idx": 3, "direct_answers": ["cockpit", "cockpit", "cock pit", "cockpit", "cockpit", "cock pit", "cockpit", "cockpit", "cock pit", "cockpit"], "difficult_direct_answer": false, "rationales": ["The front has the cockpit.", "On a plane the pilot sits in a cock pit.", "The vehicle is an airplane. a boat has an engine room."], "image": "train2014/COCO_train2014_000000064440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544577, "question_id": "NBShC47ZrtGR9bcyAEibVg", "question": "What situation occurred here for the red and yellow tape to be taped up?", "choices": ["graduation event", "religious ceremony", "emergency", "construction"], "correct_choice_idx": 2, "direct_answers": ["restriced", "crime", "fire", "crime", "car wreck", "crime", "crime", "accident", "crime", "emergency"], "difficult_direct_answer": false, "rationales": ["The tape is put up to prevent people from entering an area due to a known danger or because it was a crime scene.", "Emergency tape is up.", "There are fire trucks in the background, so something bad must have happened."], "image": "train2014/COCO_train2014_000000544577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494855, "question_id": "NBUJrF85VJFZag77Ne2XiG", "question": "What is the equipment called on the crouching player's left hand?", "choices": ["glove", "mitten", "bat", "catcher's mitt"], "correct_choice_idx": 3, "direct_answers": ["glove", "baseball glove", "catchers mitt", "mitt", "mitt", "catcher's mitt", "mitt", "catcher's mitt", "baseball glove", "catcher's mitt"], "difficult_direct_answer": false, "rationales": ["The person has a mitt they're wearing to catch balls.", "The player in question is a catcher based on the setting and their position behind the batter. a catcher in baseball would be wearing answer a while playing.", "The person is in the catching position with a glove on his hand."], "image": "val2014/COCO_val2014_000000494855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123555, "question_id": "NBhCzJFA86ZEYnhkYKwsQ6", "question": "What does the sign beside the green light forbid?", "choices": ["left turns", "right turns", "u-turns", "going straight"], "correct_choice_idx": 0, "direct_answers": ["left turn", "left turns", "left turn", "skating", "u-turns", "no left-turn", "left turns", "left turns", "turn left", "left turn"], "difficult_direct_answer": false, "rationales": ["The sign beside the green streetlight has a left turn picture with a red cross out mark over it.", "The sign has the direction crossed out, showing it is not allowed.", "There are no left turns."], "image": "val2014/COCO_val2014_000000123555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19608, "question_id": "NBsXf7yACAmryKcHBzg42Q", "question": "At What location is the biker riding by the bird?", "choices": ["chicken hut", "city street", "market", "park"], "correct_choice_idx": 3, "direct_answers": ["park", "park", "lake", "park", "park", "water", "park", "lake", "park", "park"], "difficult_direct_answer": false, "rationales": ["This is a place where children play, adults can walk, and most of the time have small ponds with ducks.", "You can tell by the setting and background as to where the photo was taken.", "The biker is on a concrete bike path next to a lake."], "image": "val2014/COCO_val2014_000000019608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487020, "question_id": "NBwDTqEoZ8E8MGcxbooCkS", "question": "The birds seen out of their cage here are sold for what purpose?", "choices": ["dinner", "hat feathers", "stealing jewels", "pets"], "correct_choice_idx": 3, "direct_answers": ["food", "pets", "pets", "companionship", "eating", "pet purposes", "pets", "pets", "pets", "pets"], "difficult_direct_answer": false, "rationales": ["The birds are in a pet shop.", "Birds on bird stands are usually trained birds since they don't fly away and can be used as companions for people.", "These are tiny birds people buy for their homes"], "image": "train2014/COCO_train2014_000000487020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469046, "question_id": "NC4GAVgxZiLVPJ3fnoDC9H", "question": "What type of animal is in this cage?", "choices": ["reptile", "domestic", "flying", "wild"], "correct_choice_idx": 1, "direct_answers": ["cat", "cat", "domestic", "cat", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["This is a house cat", "A cat is domestic.", "Cats cannot fly and are warm-blooded. this cat seems to be tamed."], "image": "val2014/COCO_val2014_000000469046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253577, "question_id": "NCEWT5Chqms6Er3Qr6gkcy", "question": "What would most people do first before biting their pizza?", "choices": ["slice it", "ice it", "dress it", "cool it"], "correct_choice_idx": 0, "direct_answers": ["cutting", "slice it", "cut it", "cut it", "slice", "slice it", "slice it", "cut it", "cut", "cut it"], "difficult_direct_answer": false, "rationales": ["Pizza is always served in slices.", "The pizza pie the woman in this image bites into is normally cut into smaller personal pieces before eating.", "Most people cut the pizza into slices."], "image": "train2014/COCO_train2014_000000253577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208782, "question_id": "NCesXTsx6f4v2H9tgm6ZPC", "question": "Why the gap underneath the boat?", "choices": ["stability", "speed", "style", "weight reduction"], "correct_choice_idx": 0, "direct_answers": ["aerodynamics", "buoyancy", "buoyancy", "stability", "open space", "aerodynamics", "stability", "floating", "on skiis", "to flow"], "difficult_direct_answer": false, "rationales": ["The gap helps provide stability.", "The gap allows for the boat to spread its weight more.", "The gap is to make the boat float."], "image": "train2014/COCO_train2014_000000208782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177319, "question_id": "NCgtMnXpoEqE63PszJHaRD", "question": "Where are the bike riders riding?", "choices": ["school", "park", "bike path", "trail"], "correct_choice_idx": 2, "direct_answers": ["bike lane", "bike lane", "bike lane", "bike lane", "bike path", "bike lane", "bicycle lane", "bikes", "bikes", "bike path"], "difficult_direct_answer": false, "rationales": ["There is a more narrow lane on the side of the bus where no cars are in.", "The bike riders are in a designated lane.", "The riders are on a path."], "image": "train2014/COCO_train2014_000000177319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54976, "question_id": "NCtr6xUcTEyR8WkrbDbiH9", "question": "The colorful flying objects are made of what material?", "choices": ["polyethylene", "plastic", "aluminum", "paper"], "correct_choice_idx": 1, "direct_answers": ["plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "iron"], "difficult_direct_answer": false, "rationales": ["Frisbees are made with a type of plastic called polyethylene.", "The objects are made of plastic.", "Frisbees are typically only made by one type of material."], "image": "train2014/COCO_train2014_000000054976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362278, "question_id": "NCzeER3QFfgAVVk74HiHxF", "question": "What style is the bathroom?", "choices": ["victorian", "modern", "nature", "retro"], "correct_choice_idx": 3, "direct_answers": ["seventies", "modern", "retro", "fifties", "modern", "retro", "strange", "small", "retro", "retro"], "difficult_direct_answer": false, "rationales": ["These colors and materials are from several decades ago", "The bathroom is retro since it features bright orange.", "The style is retro."], "image": "train2014/COCO_train2014_000000362278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561403, "question_id": "NDVh4UZUMRzH6PmGPftpiw", "question": "What is on the table?", "choices": ["plate", "cat", "laptop", "fork"], "correct_choice_idx": 2, "direct_answers": ["monitor", "computer", "phone", "computer", "laptop", "cellphone", "computer screens", "phone", "computers", "computers"], "difficult_direct_answer": false, "rationales": ["There is a folding computer next to the large screen.", "While not directly on the table, answer a is the only object from the list of answers clearly visible.", "A laptop is sitting next to the computer."], "image": "val2014/COCO_val2014_000000561403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416622, "question_id": "NDdtUCXFvZmG4FPdvhVJjM", "question": "The type of restaurant the man is eating at is more likely a what?", "choices": ["chinese", "mexican food", "steakhouse", "italian cuisine"], "correct_choice_idx": 2, "direct_answers": ["fast food", "fast food", "diner", "deli", "cafe", "diner", "fast food", "sandwich shop", "steakhouse", "diner"], "difficult_direct_answer": false, "rationales": ["The man is eating a sandwich, fries and pickles. those are not the types of items served at a chinese, mexican or italian restaurant.", "He has a sub with fries", "The man is eating a steak sandwich."], "image": "train2014/COCO_train2014_000000416622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176008, "question_id": "NDjrBB3FFjrowjiiwarczj", "question": "Where does this scene take place?", "choices": ["subway", "plane", "bus", "train"], "correct_choice_idx": 3, "direct_answers": ["train", "train", "train", "train", "train", "ship", "train", "train", "ship", "train"], "difficult_direct_answer": false, "rationales": ["This is a sleeper car with beds", "The bed set up, the room dimensions and the people visible outside are consistent with train setups.", "The scene is on a train cart."], "image": "train2014/COCO_train2014_000000176008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570016, "question_id": "NDti9aD2p3uM3vYZATmLwo", "question": "Which vegetable has notable Vitamin A content in it?", "choices": ["drumstick", "cabbage", "carrot", "capsicum"], "correct_choice_idx": 2, "direct_answers": ["carrots", "carrots", "carrot", "carrots", "carrots", "carrots", "carrot", "carrots", "carrots", "carrot"], "difficult_direct_answer": false, "rationales": ["The veggie is carrots.", "Carrots have vitamin a.", "Carrots have a lot of vitamin a in them."], "image": "train2014/COCO_train2014_000000570016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452461, "question_id": "NEAfXuHmwXM7RTnJ9ZgJFR", "question": "By what means is the train powered?", "choices": ["electrical", "gas", "wood", "coal"], "correct_choice_idx": 0, "direct_answers": ["electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "diesel", "electricity", "electrical", "engine"], "difficult_direct_answer": false, "rationales": ["The passenger train is run by electricity.", "There are wires hanging over the train.", "The train is electric."], "image": "val2014/COCO_val2014_000000452461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576871, "question_id": "NESENYgKPK53fEbThV5yQr", "question": "What are the people doing with their motorcycles?", "choices": ["parading", "protesting", "looting", "racing"], "correct_choice_idx": 0, "direct_answers": ["riding", "riding", "riding them", "parading", "riding", "riding", "riding together", "riding", "having fun", "cruising"], "difficult_direct_answer": false, "rationales": ["The people are going on a parade.", "It looks like they could be racing down the road.", "The amount of motorcycles in the group indicated something other than an afternoon ride."], "image": "val2014/COCO_val2014_000000576871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187902, "question_id": "NEc6xMWbM6bJTUs9wSeAMx", "question": "What type of train car do we see?", "choices": ["tank car", "centerbeam", "boxcar", "covered hopper"], "correct_choice_idx": 0, "direct_answers": ["tanker", "tank car", "tank", "caboose", "tanker", "oil rig", "tank car", "oil tanker", "train", "diesel"], "difficult_direct_answer": false, "rationales": ["They are cylindrical and designed to hold liquids.", "Tank cars are long.", "By the design of the train cars you can tell what type they are."], "image": "train2014/COCO_train2014_000000187902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452186, "question_id": "NEnBDX44fP4aTjTGCYUiyu", "question": "What is causing the yellow line?", "choices": ["street lights", "christmas lights", "flood lights", "headlights"], "correct_choice_idx": 3, "direct_answers": ["light", "fast car", "motion blur", "light", "beam lights", "light streak", "headlights", "car", "headlights", "lighting"], "difficult_direct_answer": false, "rationales": ["The cars' headlights are shining down on the streets into the night.", "This is a time lapse picture of a car going past", "The long exposure time for night photography causes the car lights to look like streaks."], "image": "train2014/COCO_train2014_000000452186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568187, "question_id": "NEpHgNuxLzgqciWjRRiea3", "question": "Who is the man sitting by the building entrance?", "choices": ["security guard", "driver", "receptionist", "greeter"], "correct_choice_idx": 0, "direct_answers": ["police", "chair", "police officer", "security", "cop", "security guard", "security guard", "police officer", "firefighter", "police officer"], "difficult_direct_answer": false, "rationales": ["The man is a security guard in uniform.", "The man is wearing a uniform and a badge.", "As indicated by his uniform. he might also double as a b."], "image": "train2014/COCO_train2014_000000568187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559900, "question_id": "NEuyxzbdppNxwpVd4BDHpe", "question": "What is the source of the melted product is in the center of sandwiches shown?", "choices": ["moose", "yak", "dog", "cow"], "correct_choice_idx": 3, "direct_answers": ["cow", "grilled cheese", "cheese", "cheese", "cheese", "grilling", "cow", "cow", "cow", "cheese"], "difficult_direct_answer": false, "rationales": ["We usually make cheese from their milk and this is a grilled cheese sandwich.", "It is grilled cheese.", "The source is a cow."], "image": "train2014/COCO_train2014_000000559900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196841, "question_id": "NFFQahEYmCTn2DFqUHQwDD", "question": "What is the long piece of fabric used for that is on the back of the woman in blue and black?", "choices": ["attach surfboard", "inflate preserver", "pull zipper", "inflate suit"], "correct_choice_idx": 2, "direct_answers": ["strap", "tether", "cord", "fun", "wetsuit", "attached surfboard", "surfboard", "board retrieval", "pull zipper", "sorking"], "difficult_direct_answer": true, "rationales": ["The long piece of fabric on the back of the wetsuit is to help the woman pull the zipper up or down without help.", "The item goes down the woman's back to release her from the suit.", "The fabric is used to pull the zipper down."], "image": "val2014/COCO_val2014_000000196841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445030, "question_id": "NFKgjBiEh9ec2jDEGJwJ5R", "question": "WHat language is on the black shirt?", "choices": ["chinese", "english", "french", "italian"], "correct_choice_idx": 0, "direct_answers": ["chinese", "chinese", "japanese", "asian", "korean", "not visible", "chinese", "english", "japanese", "japanese"], "difficult_direct_answer": false, "rationales": ["The language is chinese.", "The characters on the shirt appear to be.", "Chinese has the characters on the black shirt."], "image": "train2014/COCO_train2014_000000445030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410305, "question_id": "NFxjfZ6R64eqFA94gezzfx", "question": "What are the pink objects above the chairs called?", "choices": ["kites", "planes", "planters", "umbrellas"], "correct_choice_idx": 3, "direct_answers": ["umbrellas", "umbrellas", "parasol", "umbrellas", "umbrellas", "parasol", "umbrellas", "umbrellas", "umbrellas", "umbrellas"], "difficult_direct_answer": false, "rationales": ["There are called umbrella to prevent direct sun.", "The objects are umbrellas.", "You can tell by the design and the setting to what type of items these are."], "image": "train2014/COCO_train2014_000000410305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248104, "question_id": "NFyjfphKEtSVegN7vMFXgR", "question": "What powers the oven here?", "choices": ["gas", "wood", "sun", "grass"], "correct_choice_idx": 1, "direct_answers": ["fire", "fire", "coal", "wood fire", "wood", "wood", "wood", "wood", "wood fire", "wood"], "difficult_direct_answer": false, "rationales": ["The source fueling the fire and thus the oven is visible in the background.", "The wood powers the oven.", "The oven has logs in it."], "image": "train2014/COCO_train2014_000000248104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207513, "question_id": "NFzAJW8vE6CCm5KTUtYvwM", "question": "What part of the sign are these people painting?", "choices": ["top", "middle", "base", "stop"], "correct_choice_idx": 2, "direct_answers": ["white part", "middle", "stop", "letters", "letters", "letter p", "letter", "letters", "white", "base"], "difficult_direct_answer": false, "rationales": ["The woman is holding the paintbrush near the middle of the sign.", "The sign is a base.", "The people are painting the bottom."], "image": "train2014/COCO_train2014_000000207513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273732, "question_id": "NGAtAAiEABrBQHQhXbecdp", "question": "What sort of waves does the boat captain shown here like that the surfer dislikes?", "choices": ["tidal", "high", "calm water", "sound"], "correct_choice_idx": 2, "direct_answers": ["slow", "small", "large waves", "calm waves", "small wave", "small waves", "calm water", "calm water", "white water", "small"], "difficult_direct_answer": false, "rationales": ["Calm water isn't actually the waves in the foreground, but it could describe what a boat captain would like. they certainly wouldn't like the other options.", "Surfers need big waves and a lot of movement.", "The boat likes calm waves so there but the surfer like higher waves."], "image": "train2014/COCO_train2014_000000273732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276565, "question_id": "NGFCKcPDXkFqFwLJCMaCVh", "question": "This scene is likely in what country?", "choices": ["united states", "czech republic", "china", "kazakhstan"], "correct_choice_idx": 0, "direct_answers": ["america", "usa", "usa", "america", "usa", "usa", "united states", "usa", "canada", "united states"], "difficult_direct_answer": false, "rationales": ["The license plates and street names belong to those seen in america.", "The united states is where broadway is located.", "This is in the usa in california."], "image": "train2014/COCO_train2014_000000276565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372714, "question_id": "NGdSTmh6CcwAwS29uQWrzp", "question": "Where is this game being played?", "choices": ["high school", "stadium", "gym", "park"], "correct_choice_idx": 1, "direct_answers": ["pittsburgh", "pittsburgh", "baltimore", "stadium", "baseball", "stadium", "stadium", "baseball stadium", "stadium", "baseball"], "difficult_direct_answer": false, "rationales": ["The game is at a stadium.", "You can tell by the dugout and the sport that is being played as to where the photo was taken.", "These are professional baseball players. professional baseball is played in stadiums."], "image": "train2014/COCO_train2014_000000372714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25009, "question_id": "NH3Hpd3Ynh5rD9yEV8ZuF8", "question": "What food is on the plate?", "choices": ["egg sandwich", "bagel", "quesadilla", "pizza"], "correct_choice_idx": 2, "direct_answers": ["pita", "quesadilla", "pita", "pita", "tortilla", "quesadilla", "quesadilla", "quesadillas", "mexican", "quesadilla"], "difficult_direct_answer": false, "rationales": ["The food is a quesadilla.", "The tortillas has melted cheese in it.", "A quesadilla is on the plate."], "image": "train2014/COCO_train2014_000000025009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564715, "question_id": "NH5dA9sGrzu727dQdhZyeH", "question": "What type of entertainment is commonly held in the building behind the people eating?", "choices": ["movies", "opera", "standup comedy", "hockey"], "correct_choice_idx": 1, "direct_answers": ["singing", "concert", "music performance", "opera", "light show", "concerts", "operas", "musical", "concert", "opera"], "difficult_direct_answer": false, "rationales": ["The opera house in australia is shown.", "It is built for that particular sound.", "This is an opera house"], "image": "train2014/COCO_train2014_000000564715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32300, "question_id": "NHT3NKJ5ir8aZiswwUigvM", "question": "What is the ideal outcome for the boy about to touch the ball?", "choices": ["out", "walk", "base hit", "home run"], "correct_choice_idx": 0, "direct_answers": ["throwto base", "tag runner", "out", "no errors", "out", "grab ball", "pick up", "out player", "get out", "out"], "difficult_direct_answer": false, "rationales": ["To throw it and get someone out.", "The boy wants to help his team.", "The boy wants to get the batter out."], "image": "val2014/COCO_val2014_000000032300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439774, "question_id": "NHZi5AqsfsRxbQF87bQcPU", "question": "What is wrong with his eyes?", "choices": ["black", "closed", "red", "shiny"], "correct_choice_idx": 1, "direct_answers": ["they're closed", "closed", "closed", "they're shut", "closed", "closed", "sleeping", "closed", "closed", "closed"], "difficult_direct_answer": false, "rationales": ["The man in this image looks as though he's drifting off to sleep.", "The man's eyes are clearly visible and the only thing unusual with them is that they are not open.", "The man is sitting and looks really relaxed. his eyes are closed because he is tired."], "image": "val2014/COCO_val2014_000000439774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320353, "question_id": "NHjhywgUKv3yMgtuXHU2zi", "question": "What type of vehicle has a blue light?", "choices": ["ambulance", "police car", "truck", "motorbike"], "correct_choice_idx": 3, "direct_answers": ["scooter", "motorcycle", "police", "motorcycle", "motorcycle", "motorcycle", "police motorcycle", "motorbike", "motorcycle", "police"], "difficult_direct_answer": false, "rationales": ["There is a blue light on the front of the motorcycle.", "This is a police motorcycle.", "The motorbike has the blue light."], "image": "train2014/COCO_train2014_000000320353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281725, "question_id": "NHrEE8vKxpuwzRUDuXAMan", "question": "What winter activity are the people participating in?", "choices": ["ice skating", "luge", "snowboarding", "ice hockey"], "correct_choice_idx": 2, "direct_answers": ["snowboarding", "snowboarding", "skiing", "snowboarding", "snowboarding", "skiing", "snowboarding", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["They have both feet on one wide board in the snow", "The people are on a board on snow, and not skis.", "The people are participating via snowboard."], "image": "val2014/COCO_val2014_000000281725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373219, "question_id": "NHtQg5pAHLNpoUdjZqmdkG", "question": "What is most likely in the glass with the round top?", "choices": ["plasma", "alcohol", "tears", "honey"], "correct_choice_idx": 1, "direct_answers": ["alcohol", "bourbon", "whiskey", "drinks more", "wine", "liquor", "alcohol", "keys", "alcohol", "whiskey"], "difficult_direct_answer": false, "rationales": ["This is a decanter and they are made to hold alcohol most often.", "Alcohol is a liquid typically put in a glass, unlike the other options listed.", "Alcohol is in the glass."], "image": "train2014/COCO_train2014_000000373219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37456, "question_id": "NHyPcmEhVEWnW6DWCfirJm", "question": "What kind of emotion is the male feeling?", "choices": ["anger", "happiness", "sadness", "rage"], "correct_choice_idx": 1, "direct_answers": ["happy", "happy", "happy", "happiness", "happy", "happy", "happiness", "happy", "happy", "happiness"], "difficult_direct_answer": false, "rationales": ["He has a big smile on his face so he is clearly enjoying himself.", "He is smiling and laughing.", "The emotion is happiness."], "image": "val2014/COCO_val2014_000000037456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59732, "question_id": "NJNHUJD8rkXBWxsG3YHRte", "question": "The headlights that shine brighter than the other cars show that the driver is using what feature in the car?", "choices": ["radio", "high beams", "turning signal", "camera"], "correct_choice_idx": 1, "direct_answers": ["brake", "light", "high beams", "brights", "bright lights", "bright lights", "bright", "brights", "high beams", "high beams"], "difficult_direct_answer": false, "rationales": ["The bright lights are on.", "The headlights are high beams.", "The lights are extremely bright so you can tell that they are not regular lights."], "image": "val2014/COCO_val2014_000000059732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532408, "question_id": "NJU9cCpyg7zYCiRYM3qWCH", "question": "The pillow on the couch is the same color as what?", "choices": ["lemon", "lime", "tomato", "orange"], "correct_choice_idx": 2, "direct_answers": ["fire truck", "magazine", "red", "apples", "sheets", "carpet", "tomato", "wall art", "red", "picture"], "difficult_direct_answer": true, "rationales": ["The pillow is a bright red.", "Tomatoes are red", "The pillow is red"], "image": "val2014/COCO_val2014_000000532408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522722, "question_id": "NJg25DCJ6dLx8zJkr9bUgs", "question": "What are the kites in most danger of getting stuck in?", "choices": ["sand", "trees", "water", "rocks"], "correct_choice_idx": 1, "direct_answers": ["trees", "tree", "trees", "trees", "trees", "tree", "trees", "tree", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["The kites are flying close to the branches.", "The kites can be easily attached on the tree branches.", "The kites are closest to a tree and that could get tangled in the lines."], "image": "train2014/COCO_train2014_000000522722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217959, "question_id": "NJjXj3RAFkwLnBxWjDFgLG", "question": "The date this picture was taken have what number that is the same for the month and year?", "choices": ["four", "twelve", "dash", "zero"], "correct_choice_idx": 1, "direct_answers": ["12", "two", "two", "twelve", "twelve", "12", "six", "12", "twelve", "twelve"], "difficult_direct_answer": false, "rationales": ["The photo was taken on the twelfth day of the month in 2012.", "The date is given in the time stamp and the year and month can be inferred based on their position.", "The numbers on the image are readable and based on the way dates are commonly written, answer a is consistent."], "image": "train2014/COCO_train2014_000000217959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296859, "question_id": "NKGnUQVHFE5cKwqCeght5W", "question": "How did the bear get there?", "choices": ["fell", "blew there", "climbed up", "placed there"], "correct_choice_idx": 3, "direct_answers": ["placed there", "carried", "posed", "placed", "was placed", "placed", "owner's mother", "person put", "human intervention", "placed"], "difficult_direct_answer": false, "rationales": ["The teddy bear is sitting on a chair.", "The bear was placed in a sitting position on the chair.", "It is a stuffed animal that cannot move itself."], "image": "val2014/COCO_val2014_000000296859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567721, "question_id": "NKRfDxge9pg3L52RwyaXhB", "question": "What is the left computer engaged in right now?", "choices": ["playing video", "nothing off", "just desktop", "running application"], "correct_choice_idx": 3, "direct_answers": ["work", "running software", "running application", "uploading music", "entertainment", "work", "playing music", "email", "downloading music", "unknown"], "difficult_direct_answer": true, "rationales": ["The computer has an app.", "There are windows open and apparently in use on the left screen. when there are windows open and actions being performed they are visual indications that an app is running.", "The left computer is on. its desktop is partially blocked by a non-video program."], "image": "train2014/COCO_train2014_000000567721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142687, "question_id": "NKVtHZuFDs6W3HxZJt6i9L", "question": "What is the next thing the chef should put on the pizza?", "choices": ["dough", "flour", "cheese", "pepperoni"], "correct_choice_idx": 2, "direct_answers": ["cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["Traditionally when making pizza after the sauce is added to the dough, cheese is next.", "It is the dough to the pizza.", "There is already flour and dough on the pizza. the buyers might not want pepperoni."], "image": "val2014/COCO_val2014_000000142687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543378, "question_id": "NKfGEnxAeyGuCoApKH7HwT", "question": "What dish would be most likely made from this food?", "choices": ["pie", "lasagna", "stroganoff", "tacos"], "correct_choice_idx": 0, "direct_answers": ["smoothie", "fruit salad", "salad", "pie", "fruit salad", "fruit salad", "fruit salad", "dessert", "pie", "salad"], "difficult_direct_answer": false, "rationales": ["This is a fruit stand, which fruit pie can be made from. the other dishes listed are more savory dishes, not made from fruit.", "Fruit is a key ingredient to these recipes", "These are fruits which are good for desserts"], "image": "train2014/COCO_train2014_000000543378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140151, "question_id": "NKfxrQXfrTNxeCorCoRaMV", "question": "Where could the vendor selling bananas here go for either a hot coffee or a slurpee like beverage nearby?", "choices": ["7-eleven", "livestock stall", "home", "bike stall"], "correct_choice_idx": 0, "direct_answers": ["7-eleven", "seven eleven", "7-eleven", "7-eleven", "7 eleven", "seven eleven", "seven eleven", "convenience store", "seven eleven", "7 eleven"], "difficult_direct_answer": false, "rationales": ["There is a seven eleven nearby.", "There is a 7/11 across the street.", "There is a convenience store right across the road."], "image": "val2014/COCO_val2014_000000140151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369023, "question_id": "NKmLQVCKFQ8F7MgNSP2oLJ", "question": "What venue is shown here?", "choices": ["hotel room", "apartment", "studio", "bedroom"], "correct_choice_idx": 0, "direct_answers": ["hotel", "hotel room", "hotel", "hotel", "hotel room", "hotel", "hotel room", "bedroom", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["A large king bed with some pamphlets are located at top of bed.", "The room has pamphlets on the bed and is not arranged like a residential room.", "The bed has cards on it. the room looks like a typical hotel room."], "image": "train2014/COCO_train2014_000000369023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64948, "question_id": "NKmtUty3pKh8yMUuLBzYkg", "question": "What might these people have applied to their bodies?", "choices": ["ink", "butter", "oil", "sunscreen"], "correct_choice_idx": 3, "direct_answers": ["sunscreen", "sun protection", "sunscreen", "sunscreen", "sunscreen", "sunscreen", "sunscreen", "lotion", "sun block", "sunscreen"], "difficult_direct_answer": false, "rationales": ["People typically use sunscreen when they are at the beach and out in the sun.", "On a hot, sunny day at the beach, it's important to apply a sun blocker to avoid burning skin. too much sun in one's lifetime can lead to skin cancer.", "They applied lotion that will protect them from the sun."], "image": "train2014/COCO_train2014_000000064948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27517, "question_id": "NLHz5w48QvYZrWc9MAzP7x", "question": "What is this person a fan of?", "choices": ["movies", "technology", "games", "sports"], "correct_choice_idx": 1, "direct_answers": ["photography", "technology", "technology", "cellphones", "movies", "movies", "flip phones", "movies", "phone", "motorola"], "difficult_direct_answer": false, "rationales": ["The person loves technology since they're using a phone.", "He has a cellphone, and a computer to use.", "The person loves playing with their phone."], "image": "val2014/COCO_val2014_000000027517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8731, "question_id": "NLgt84AisYEPqAXgTyMpmx", "question": "What is used in this room to heat up the metal pots?", "choices": ["solar", "electricity", "friction", "fire"], "correct_choice_idx": 3, "direct_answers": ["fire", "wood", "brick ovens", "fire wood", "fire", "fire", "fire", "charcoal", "brick stoves", "fires"], "difficult_direct_answer": false, "rationales": ["This is a traditional stove with holes under it that could be used for fire.", "This is an old kitchen", "There are fire pits underneath the metal pots."], "image": "train2014/COCO_train2014_000000008731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325532, "question_id": "NLjsueFEh2ucpax8tEBprd", "question": "How are the objects in the sky powered?", "choices": ["gas", "wind", "sun", "electricity"], "correct_choice_idx": 1, "direct_answers": ["wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["Kites fly by wind.", "The objects in the sky are kites. they do not have engines, electrical wires, or solar panels.", "The kites in the sky are being held up by wind."], "image": "train2014/COCO_train2014_000000325532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335144, "question_id": "NMH74rmtgJFsVDNTrEEW6s", "question": "Why would people be seated here?", "choices": ["for massage", "to work", "to eat", "to paint"], "correct_choice_idx": 2, "direct_answers": ["have lunch", "avoid sun", "to relax", "eat", "lunch", "enjoying beach", "vacation", "eat food", "sunbathing", "to eat"], "difficult_direct_answer": true, "rationales": ["There are placemats on the table which resembles a dining table.", "People want to eat.", "The people seated here could eat."], "image": "train2014/COCO_train2014_000000335144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480053, "question_id": "NMdpbUvKNbKxKiXJLnKfFH", "question": "The person standing here wants to do what?", "choices": ["catch cab", "fly kite", "sleep", "cross road"], "correct_choice_idx": 3, "direct_answers": ["catch bus", "catch bus", "cross road", "cross", "catch bus", "cross road", "get ride", "cross road", "board bus", "take bus"], "difficult_direct_answer": false, "rationales": ["They are at an intersection waiting for traffic to clear", "The person is at a crosswalk.", "They are standing at the crosswalk."], "image": "train2014/COCO_train2014_000000480053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391892, "question_id": "NMk39MDSwahTSYn3kDht8b", "question": "Which Airlines is from the land down under?", "choices": ["virgin", "delta", "jet", "quantas"], "correct_choice_idx": 3, "direct_answers": ["qantas", "qantas", "australia", "quantas", "quantas", "qantas", "qantas", "qantas", "yes", "qantas"], "difficult_direct_answer": false, "rationales": ["Quantas airline is based in australia and the term \"down under\" refers to australia.", "The airline is qantas.", "Qantas is from australia."], "image": "train2014/COCO_train2014_000000391892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319766, "question_id": "NMsrgtzw4xMjBQP6ofnSqW", "question": "Who put these benches here?", "choices": ["homeless people", "joggers", "park management", "trash collecter"], "correct_choice_idx": 2, "direct_answers": ["city commissioners", "city workers", "government", "park people", "city", "park management", "humans", "city", "maintenance", "person"], "difficult_direct_answer": true, "rationales": ["This is the most likely answer. the other options wouldn't place benches in a park.", "Parks are usually ran by park managers which they would have the authority to place benches in the park.", "The benches are in a park."], "image": "train2014/COCO_train2014_000000319766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48733, "question_id": "NMwuQZQ5Q9Xet2npvUmFDR", "question": "This umbrella is perfect for the what?", "choices": ["sun", "rain", "snow", "water"], "correct_choice_idx": 1, "direct_answers": ["beach", "shade", "beach", "sun protection", "rain", "shade", "beach", "beach", "sunbathing", "shielding sun"], "difficult_direct_answer": false, "rationales": ["Though some of the answers are usable, but in this setting helping to shade the people is the most logical.", "The umbrella is on a beach on a clear day.", "The umbrella is good for rain."], "image": "train2014/COCO_train2014_000000048733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160181, "question_id": "NNHtiXVBQSfNKjoCgSVnxF", "question": "Why is he holding the doughnut to his face?", "choices": ["to hide", "to steal", "to eat", "to steal"], "correct_choice_idx": 2, "direct_answers": ["eat doughnut", "eating", "eating", "to bite", "eat", "eating", "eating", "eating", "to eat", "to eat"], "difficult_direct_answer": false, "rationales": ["He is hungry and going to eat it.", "A donut is something that you eat and you eat with your mouth.", "The boy wants to eat the donut."], "image": "train2014/COCO_train2014_000000160181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517147, "question_id": "NNUU2QEFLCV7Neg25TapeU", "question": "Who are the people in the wall hanging?", "choices": ["strangers", "friends/family", "colleagues", "missing persons"], "correct_choice_idx": 1, "direct_answers": ["friends/family", "family members", "loved ones", "family members", "friends", "family", "family", "family", "family", "photos"], "difficult_direct_answer": false, "rationales": ["The pictures do not appear professional, but look like candid images. people commonly display pictures of answer a in their homes in this manner.", "The people in the wall hanging are probably those closest to the man.", "Traditionally people make collages of pictures of their loved ones and put it on the wall."], "image": "train2014/COCO_train2014_000000517147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359686, "question_id": "NNfLRGz8prGVwNHbdrq9SV", "question": "What position is held by number 22 during this game?", "choices": ["left field", "hitter", "pitcher", "short stop"], "correct_choice_idx": 2, "direct_answers": ["pitching", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "throwing ball", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["The player is throwing the ball from the mound.", "He is standing on the mound throwing the ball.", "The position is the pitcher."], "image": "train2014/COCO_train2014_000000359686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469795, "question_id": "NNhpdKZbnVCuMR5AuG3ynJ", "question": "What is the most likely value of a house with this size of bathroom?", "choices": ["$7000", "$7000000", "$700000", "$70000"], "correct_choice_idx": 2, "direct_answers": ["200k", "$700000", "millions", "1000000", "millions", "high", "millions", "million", "millions", "millions"], "difficult_direct_answer": false, "rationales": ["This house looks ultra fancy.", "The bathroom is luxury class and likely found in a mansion, so the price tag would be high--definitely over a million and probably closer to seven or eight million.", "The bathroom is quite expansive and would have to be at least a half million or more."], "image": "val2014/COCO_val2014_000000469795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453485, "question_id": "NPPARQvVuLNrZXbNKqsnsF", "question": "How will the water be heated?", "choices": ["sink tap", "pot", "oven", "kettle"], "correct_choice_idx": 3, "direct_answers": ["stove", "stove", "boiled", "stove", "stovetop", "stove", "boiled", "kettle", "electric stove", "kettle"], "difficult_direct_answer": false, "rationales": ["The water is in a kettle.", "The water will be heated with a water kettle.", "The water is in the kettle."], "image": "val2014/COCO_val2014_000000453485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334511, "question_id": "NPSykfFZdqNjJrD7MFrXdh", "question": "Where are the persons here headed?", "choices": ["casinos", "bathrooms", "villas", "market"], "correct_choice_idx": 3, "direct_answers": ["market", "unknown", "market", "sea", "market", "market", "farm market", "market", "market", "dinner"], "difficult_direct_answer": false, "rationales": ["The people are headed to the produce market.", "They have food ready to sell.", "The people have more items than they personally need, and are looking to trade or sell their excess."], "image": "val2014/COCO_val2014_000000334511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174766, "question_id": "NPik3EYoKbxmTzn6JfDCix", "question": "Where is the rider wearing protection?", "choices": ["elbows", "knees", "chest", "head"], "correct_choice_idx": 1, "direct_answers": ["head", "knees", "knees", "head", "knees", "head", "knees", "head", "knees", "head"], "difficult_direct_answer": false, "rationales": ["The skater is wearing pads on his knees to protect them if he falls.", "The rider has kneepads on.", "To prevent injury to the knees as they could easily be hurt."], "image": "train2014/COCO_train2014_000000174766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5715, "question_id": "NQ5xnUSLgBiMVQzT4cgpjP", "question": "What does the man use the umbrella for?", "choices": ["hail", "rain", "decoration", "shade"], "correct_choice_idx": 3, "direct_answers": ["shade", "style", "signify pride", "costume", "shade", "sun protection", "shade", "shade", "sun protection", "sun protection"], "difficult_direct_answer": false, "rationales": ["The day is very sunny, so having some shade to keep cool and keep from getting sunburned.", "There is no visible rain in the image, only sunlight. of the uses for an umbrella, answer a is most consistent with sunny day usage.", "The sky is very sunny and is probably trying to stay cool."], "image": "train2014/COCO_train2014_000000005715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381318, "question_id": "NQHb8tVDNi8L59iXLdveX7", "question": "What cooking method was used to prepare the side dishes seen here?", "choices": ["baking", "deep frying", "broiling", "sun drying"], "correct_choice_idx": 1, "direct_answers": ["frying", "grill", "fryer", "fried", "fryer", "deep fry", "frying", "fryer", "deep frying", "grill"], "difficult_direct_answer": false, "rationales": ["French fries are usually cooked in a fryer so they are crisp.", "The side dishes are french fries, which are sliced potatoes cooked jn hot oil.", "Fried foods are made by deep frying them."], "image": "val2014/COCO_val2014_000000381318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80260, "question_id": "NQbW7ACKYxhjDN3nh2uTba", "question": "What year was Coca-Cola founded?", "choices": ["1843", "1892", "1881", "1890"], "correct_choice_idx": 1, "direct_answers": ["1886", "1880", "1892", "1892", "1892", "1892", "1892", "1892", "1892", "1892"], "difficult_direct_answer": false, "rationales": ["The coca cola company, featured here in the advertisement on the right side of the picture was founded 1892.", "Coca cola was founded in the year 1892.", "Coca cola came into existence in 1892."], "image": "val2014/COCO_val2014_000000080260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135683, "question_id": "NQnXJduYJHK7vEQ3E4tHMg", "question": "What is the name of the bird?", "choices": ["gruiformes", "peacock", "duck", "cranes"], "correct_choice_idx": 2, "direct_answers": ["pelican", "pelican", "crane", "duck", "crane", "pelican", "pelican", "pelican", "pelican", "crane"], "difficult_direct_answer": false, "rationales": ["Those birds have long beaks and go by the sea.", "The bird has a long beak.", "The name is a duck."], "image": "train2014/COCO_train2014_000000135683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500207, "question_id": "NRAYUCZcwkyHhttY5jGvvW", "question": "What does the word on the plane mean?", "choices": ["anger", "happiness", "retribution", "trust"], "correct_choice_idx": 2, "direct_answers": ["enemy", "enemy", "nemesis", "enemy", "enemy", "enemy", "enemy", "enemies", "enemy", "retribution"], "difficult_direct_answer": false, "rationales": ["One of the main meanings is retribution.", "Someones downfall it can mean.", "This would be someones downfall."], "image": "train2014/COCO_train2014_000000500207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536926, "question_id": "NRUViDyrkor5YwUHMoztGK", "question": "What are these people doing?", "choices": ["eating", "waiting", "watching skateboarder", "keeping time"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "watching skateboarders", "standing", "skateboarding", "watching skateboarder", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "watching skateboarder"], "difficult_direct_answer": false, "rationales": ["The people have their eyes on the skateboarder.", "The crowd is observing the guy doing tricks.", "The people in the arena are watching a skateboarder do tricks on a ramp."], "image": "train2014/COCO_train2014_000000536926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16574, "question_id": "NRez2GpCGkojN65KSsbWH4", "question": "What time frame is this image depicting?", "choices": ["modern times", "80's", "medieval times", "b.c"], "correct_choice_idx": 2, "direct_answers": ["midevil", "medieval times", "ancient", "medical", "start", "medieval", "medical times", "medieval", "medieval", "medevel"], "difficult_direct_answer": false, "rationales": ["The time is medieval.", "The people in these photos are on horses in old time clothing.", "Men are dressed as knights."], "image": "val2014/COCO_val2014_000000016574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572233, "question_id": "NRmRcpKbKCiTmcyby5CRJK", "question": "What are they doing?", "choices": ["enjoying show", "ignoring dinner", "returning dinner", "watching traffic"], "correct_choice_idx": 0, "direct_answers": ["celebrating", "attending wedding", "dining", "watching", "watching", "celebrating", "enjoying show", "dining", "celebrating", "celebrating"], "difficult_direct_answer": false, "rationales": ["The people appear to be in some kind of performance or banquet hall and are all looking in the same direction with amusement. people in such an environment all looking in the same direction are probably all watching the same thing which in this space might be a show.", "The audience is enjoying whomever is performing for them.", "They're enjoying the show."], "image": "val2014/COCO_val2014_000000572233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9723, "question_id": "NS48rRM4W8rNLDDhskJVUG", "question": "What type of breakfast restaurant would serve waffles like this?", "choices": ["casual dining", "fast food", "cafe", "upscale"], "correct_choice_idx": 3, "direct_answers": ["breakfast", "diner", "diner", "high end", "hotel", "denny's", "upscale", "ihop", "cafe", "japanese"], "difficult_direct_answer": true, "rationales": ["Fancy restaurants serve unique creations.", "Upscale restaurants serve food on narrow plates and have unique offerings.", "The breakfast is fancy."], "image": "train2014/COCO_train2014_000000009723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468405, "question_id": "NS7mM7tLenEy47VEJJ9CNb", "question": "What is the green truck being used for?", "choices": ["parking", "hiking", "transporting", "crushing"], "correct_choice_idx": 2, "direct_answers": ["construction", "transport", "transport supplies", "haul stuff", "hauling", "transport", "hauling", "transporting goods", "transporting", "hauling cargo"], "difficult_direct_answer": false, "rationales": ["The green truck has a flat bed on the back that is used to transport goods and supplies.", "There are beams of some kind on the back of the truck.", "It has a load strapped on the back"], "image": "val2014/COCO_val2014_000000468405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51054, "question_id": "NSAtz5aAFdQvdmU4CD3sUu", "question": "What is the old lady doing?", "choices": ["watching tv", "talking", "massaging face", "combing hair"], "correct_choice_idx": 1, "direct_answers": ["haunting bedroom", "talking", "making call", "talking phone", "phone call", "chatting", "talking", "talking", "using phone", "phonecall"], "difficult_direct_answer": false, "rationales": ["The lady is using a phone to talk to someone.", "The old lady is holding a phone to her ear. phones are generally used in this manner for conversations.", "The lady is talking."], "image": "val2014/COCO_val2014_000000051054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125252, "question_id": "NSXo9zXtx96vLKXdvfujBP", "question": "What are those glasses designed to protect the wearer from?", "choices": ["sun", "pollen", "water", "cold"], "correct_choice_idx": 0, "direct_answers": ["sun rays", "sun", "sun", "sun", "sun glare", "sun", "safety", "sun", "sun", "sun glare"], "difficult_direct_answer": false, "rationales": ["The glasses next to the phone are sunglasses that are used to protect eyes from the sun.", "Those are sunglasses which protect your eyes from the sun.", "The glasses are known as sunglasses."], "image": "train2014/COCO_train2014_000000125252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490739, "question_id": "NSYftu8GeAkTuLzp5pXGoK", "question": "The man on the left is probably a member of what type of group?", "choices": ["politician", "teacher", "farmer", "clergy"], "correct_choice_idx": 3, "direct_answers": ["muslim", "religious", "clergy", "religious", "church", "ministry", "muslim", "sports", "jew", "jewish"], "difficult_direct_answer": false, "rationales": ["With his unique clothing and large religious necklace, the man is obviously a member of some sect of religion. such representatives of organized religions often offer prayers during public events.", "The man has traditional robes and a holy symbol around his neck.", "There is a picture of jesus on his necklace and he is wearing religious clothes."], "image": "val2014/COCO_val2014_000000490739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527573, "question_id": "NSanid9E7hqdENu8geVMEz", "question": "What recently happened in this location?", "choices": ["it hailed", "it rained", "tornado", "it snowed"], "correct_choice_idx": 1, "direct_answers": ["rain", "rain", "rained", "rain", "it rained", "rained", "rain", "it rained", "rain", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["The ground is covered in water. there is no snow or hail, and this location does not have tornado damage.", "It must have rained since there are puddles.", "There are puddles on the ground and the street is wet."], "image": "val2014/COCO_val2014_000000527573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558697, "question_id": "NSiqPof6JzdxBgCu9JwiJH", "question": "What substance does the animal that is referenced breathe out?", "choices": ["candy", "milk", "fire", "sprinkles"], "correct_choice_idx": 2, "direct_answers": ["fire", "fire", "fire", "fire", "air", "fire", "fire", "fire", "fire", "land food"], "difficult_direct_answer": false, "rationales": ["That's what a dragon is known to breathe out.", "Dragons breathe fire.", "The name has dragon in it"], "image": "train2014/COCO_train2014_000000558697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459665, "question_id": "NSsoYPMmiJQxbeqm9EvpAZ", "question": "What most likely reason might this game end quickly?", "choices": ["fly ball", "tie", "thunderstorm", "lawn sodding"], "correct_choice_idx": 2, "direct_answers": ["thunderstorm", "out", "weather", "rain", "rain", "rain", "rain", "rain", "rain", "storm"], "difficult_direct_answer": false, "rationales": ["Thunderstorms can be dangerous when people are outside.", "This sport is commonly halted for inclement weather. the surface is visibly wet with dark clouds in the background which could likely be related to answer a.", "A ball players just threw a ball in very wet conditions. there is standing water all over the field."], "image": "val2014/COCO_val2014_000000459665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168141, "question_id": "NT7kdyXiFLdC4ne59Jx3zR", "question": "What is the woman in the back holding in her hand?", "choices": ["cell phone", "glass", "video camera", "ipod"], "correct_choice_idx": 2, "direct_answers": ["video camera", "camera", "camera", "video camera", "video camera", "camera", "video camera", "camera", "video camera", "camera"], "difficult_direct_answer": false, "rationales": ["She's filming their actions and it doesn't look like a b.", "The screen is showing it recording people", "A video camera is used to record."], "image": "train2014/COCO_train2014_000000168141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522062, "question_id": "NT9xR89zqmKwL5eSTAcviX", "question": "Why is the man holding the glass?", "choices": ["to drink", "to clean", "to buy", "to sell"], "correct_choice_idx": 0, "direct_answers": ["celebrating", "drinking it", "talking", "toasting", "to drink", "giving toast", "to drink", "drinking", "to drink", "drinking"], "difficult_direct_answer": false, "rationales": ["The man is going to take a sip.", "The man is holding a wine glass that holds a type of beverage.", "The personal glass this man has contains a small amount of liquid and is likely intended to be consumed by him."], "image": "train2014/COCO_train2014_000000522062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298649, "question_id": "NTXV6KULRCbkFDTtWsX2P6", "question": "Where do the persons at the car prefer to visit?", "choices": ["kid's playground", "sand pits", "ocean", "snow mountains"], "correct_choice_idx": 2, "direct_answers": ["cities", "ocean", "outdoors", "beach", "forest", "parks", "ocean", "water", "ocean", "woods"], "difficult_direct_answer": false, "rationales": ["By the surfboard on the car you can tell where he likes to go.", "The people have a paddle board which is perfect for the ocean.", "They have a surf board on top of their car."], "image": "val2014/COCO_val2014_000000298649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244986, "question_id": "NTYwDX9rwnf8UjC4wzCnaD", "question": "Why might someone prefer the vehicle closer to the camera as compared to the other types of vehicle pictured?", "choices": ["more stable", "safer", "cheaper", "carries more"], "correct_choice_idx": 2, "direct_answers": ["thicker wheels", "cheaper", "larger wheels", "maneuverability", "more compact", "more maneuverability", "faster", "faster travelling", "safety", "less gas"], "difficult_direct_answer": true, "rationales": ["The closer vehicle is a motorcycle. one of these costs less than a car.", "These take a lot less gas to run", "The vehicles visibly closer are motorcycles which are known to cost less than a car which is the other type of vehicle present."], "image": "train2014/COCO_train2014_000000244986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40635, "question_id": "NTfGkDxVMJ8f5qsAf2cfKP", "question": "Where does a shopper look to see how much a certain fruit costs?", "choices": ["cardboard sign", "tent flap", "no where", "vendors nametag"], "correct_choice_idx": 0, "direct_answers": ["price tag", "on sign", "signs", "price tag", "cardboard sign", "cardboard sign", "box front", "cardboard sign", "signs", "sign"], "difficult_direct_answer": false, "rationales": ["The prices are on the boxes", "The prices are written on pieces of boxes that the fruit came in on.", "To find out what something costs, it has to be listed on some kind of sign."], "image": "val2014/COCO_val2014_000000040635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124117, "question_id": "NTjqD4oRPCi8jE5uSatvFQ", "question": "If the TV stand suddenly disappeared what would happen?", "choices": ["chair falls", "monitor falls", "laptop falls", "shirt falls"], "correct_choice_idx": 1, "direct_answers": ["broken tv", "tip over", "womango", "fall over", "monitor falls", "expensive crash", "television crashes", "tv breaks", "tv falls", "break tv"], "difficult_direct_answer": true, "rationales": ["The screen would be the largest thing to crash down.", "The monitor would end up falling.", "Based on gravity, when an object is placed on another, if the object below is removed the one on top would fall if not otherwise supported."], "image": "train2014/COCO_train2014_000000124117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461361, "question_id": "NTkyb4BXTvNXpMPcNLYvWT", "question": "What is this meal missing?", "choices": ["condiments", "whip cream", "syrup", "ice cream"], "correct_choice_idx": 0, "direct_answers": ["ketchup", "drinks", "ketchup", "sauce", "ketchup", "targeting", "condiments", "ketchup", "condiments", "lettuce tomato"], "difficult_direct_answer": false, "rationales": ["The meal appears to be a grilled chicken sandwich with fries based on the size, shape and color of the items. these types of food are commonly served with answer a which is not currently visible.", "The meal has no ketchup or mustard.", "The sandwich is dry and sauces are usually what goes inside of a a sandwich."], "image": "train2014/COCO_train2014_000000461361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434145, "question_id": "NTpaqXMDwAQBpYcjVTuGmG", "question": "What letter does the word on the dial begin with?", "choices": ["z", "w", "x", "c"], "correct_choice_idx": 3, "direct_answers": ["letter d", "letter c", "letter c", "c", "c", "letter c", "letter c", "letter c", "c c", "no image"], "difficult_direct_answer": false, "rationales": ["The word on the dial starts with c.", "The letter is c.", "The word on the dial clearly starts with the letter c."], "image": "train2014/COCO_train2014_000000434145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4968, "question_id": "NTrXjApAnLDjTFNieYpVbX", "question": "What type of concrete area is blocked off here?", "choices": ["newly poured", "asphalt", "road turn", "broken"], "correct_choice_idx": 0, "direct_answers": ["crosswalk", "pothole", "newly poured", "corner sidewalk", "sidewalk", "sidewalk", "wet", "wet", "sidewalk", "street corner"], "difficult_direct_answer": false, "rationales": ["It is to keep people off the just made sidewalk.", "The concrete is free of flaws, and needs time to dry in order to prevent getting damaged.", "They have just poured the concrete on the sidewalk and have it blocked off so no one steps in it."], "image": "train2014/COCO_train2014_000000004968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37548, "question_id": "NTrrNaYYDLN76T5E3QxcyR", "question": "What is the lion doing near the downed animal?", "choices": ["saving it", "fighting it", "riding it", "eating it"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eating it", "eating", "eating it", "eating", "feeding", "eating it", "eating it", "eating it"], "difficult_direct_answer": false, "rationales": ["The lion hunts prey to consume", "The animal is laying down, and there appears to be blood and an injury to its side next to the lion. lions are predatory carnivores, they eat other animals to survive.", "The lion is biting into the animal."], "image": "train2014/COCO_train2014_000000037548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321980, "question_id": "NTskowow7fSohZkMF34qns", "question": "What feeling do these cats appear to be portraying?", "choices": ["furious", "irritated", "agitated", "sleepy"], "correct_choice_idx": 3, "direct_answers": ["sleepy", "boredom", "sleepy cats", "awakened", "tired", "relaxation", "sleepy", "sleep", "tiredness", "content"], "difficult_direct_answer": true, "rationales": ["These cats are appearing to portray people who are sleepy.", "They are laying down resting.", "The two cats are laying down in a resting position."], "image": "train2014/COCO_train2014_000000321980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92731, "question_id": "NTxQLuK8KzqeVVKgmYx5dk", "question": "Why do Elephants kneel low down here?", "choices": ["eating food", "humans mount", "passive resistance", "holiday manner"], "correct_choice_idx": 1, "direct_answers": ["passengers off", "acquire passengers", "board passengers", "humans mount", "giving rides", "to carry", "mounted", "mounting/dismounting", "unload", "for riders"], "difficult_direct_answer": true, "rationales": ["These elephants are trained to do this to let people on and off.", "Humans board elephants by climbing onto them when the animals kneel down, which is what the animals are doing here.", "Elephants are very large and this makes it easier for people to climb on"], "image": "train2014/COCO_train2014_000000092731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66487, "question_id": "NUAz5orJ9jQJSqLzB5s6Nx", "question": "What is the sign meant to regulate?", "choices": ["animals", "trees", "weather", "safety"], "correct_choice_idx": 3, "direct_answers": ["people spacing", "skiing accidents", "safety", "lift passengers", "safety protocols", "direction", "ski traffic", "skiing", "skiing", "people skiing"], "difficult_direct_answer": true, "rationales": ["The sign provides warning information to the skiers. it is not capable of controlling the weather, animals, or trees.", "The sign explains proper procedures while on the slope. it is designed to help new visitors to maintain safety as is customary on slopes.", "It is there for safety."], "image": "train2014/COCO_train2014_000000066487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481563, "question_id": "NUefFpTiL5kj5q2azcvZ8C", "question": "What kind of river channel it is?", "choices": ["sea", "canal", "pond", "river"], "correct_choice_idx": 1, "direct_answers": ["large", "canal", "stone", "no idea", "city draining", "city", "fresh water", "canal", "water", "large"], "difficult_direct_answer": false, "rationales": ["The river is a canal.", "The river channel is a canal.", "The river has several canals running underneath of a bridge."], "image": "val2014/COCO_val2014_000000481563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38714, "question_id": "NUis8BufKdKniczLW2Mm9E", "question": "What is the name of this game?", "choices": ["flying", "skydiving", "surfing", "kiting"], "correct_choice_idx": 1, "direct_answers": ["camping", "skydiving", "parasailing", "kiting", "kite flying", "kite flying", "kite flying", "gliding", "sky diving", "parasailing"], "difficult_direct_answer": false, "rationales": ["This game involves flying objects in the air", "People are in the air with parachutes above them.", "Parachutes can be seen in the air in the distance. parachutes are laid out all around on the ground with people standing nearby."], "image": "val2014/COCO_val2014_000000038714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266533, "question_id": "NUxhE2T4yw3CHatAK6RTzB", "question": "What is the woman waiting for?", "choices": ["rain stopping", "bus", "cab", "crossing street"], "correct_choice_idx": 1, "direct_answers": ["walking sign", "light", "bus", "bus", "traffic light", "bus", "green light", "bus", "bus", "light"], "difficult_direct_answer": false, "rationales": ["There is a sign near the woman indicating that this is a public transportation stop, and there's only a road near her.", "The woman is likely sitting at a bus stop.", "The woman is sitting on a bench most likely waiting on the mass transit vehicle."], "image": "train2014/COCO_train2014_000000266533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116737, "question_id": "NV3CR2er5jU2ATUw77iiY5", "question": "Why is the mother cow in a different pen than her calf?", "choices": ["space restrictions", "safety", "feeding", "cruelty"], "correct_choice_idx": 1, "direct_answers": ["milking", "being milked", "industrial farming", "free", "illness", "prevent disease", "forced separation", "different purposes", "safety", "growing"], "difficult_direct_answer": true, "rationales": ["The mom is away from her calf for weaning.", "The baby is probably in a pen with other babies.", "The mother accidentally harm or crush the calf is they are in the same small pen together."], "image": "train2014/COCO_train2014_000000116737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89598, "question_id": "NVLTh3TfjVF7yXzZayNGj9", "question": "What countries flag is on the man in the black shirts hat?", "choices": ["finland", "switzerland", "germany", "united states"], "correct_choice_idx": 3, "direct_answers": ["united states", "usa", "united states", "united states", "united states", "united states", "america", "united states", "usa", "america"], "difficult_direct_answer": false, "rationales": ["Red, white and blue, stars and stripes.", "The man's hat is red, white, and blue and is made up of stars and stripes of the american flag.", "The us flag is red, white and blue."], "image": "train2014/COCO_train2014_000000089598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41461, "question_id": "NVaQWqzRGYUigWCv3iFqs6", "question": "What might this woman shoot the elephant with?", "choices": ["laser", "dart", "camera", "gun"], "correct_choice_idx": 2, "direct_answers": ["camera", "binoculars", "gun", "tranquilizer", "elephant gun", "camera", "camera", "camera", "stun gun", "camera"], "difficult_direct_answer": false, "rationales": ["A woman is in a jeep looking at animals as they drive by.", "The woman has a camera.", "A woman is in a jeep smiling as she looks at animals on a safari."], "image": "train2014/COCO_train2014_000000041461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555206, "question_id": "NVaTKkXRuSqW7bQKHYEeKU", "question": "What are these pools for?", "choices": ["ducks", "dolphins", "athletes", "children"], "correct_choice_idx": 2, "direct_answers": ["swimming", "racing", "competitions", "competition", "swimming", "playing", "swimming", "competitive races", "competitions", "athletes"], "difficult_direct_answer": false, "rationales": ["The pools are for athletes.", "The pools are olympic sized.", "There are barriers in the pool to make lanes for races."], "image": "train2014/COCO_train2014_000000555206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94168, "question_id": "NVd9vqJhHVEBggoTUgLZz7", "question": "During which season is this train transporting open-top hoppers?", "choices": ["summer", "winter", "fall", "spring"], "correct_choice_idx": 2, "direct_answers": ["fall", "any season", "fall", "winter", "winter", "winter", "summer", "summertime", "winter", "fall"], "difficult_direct_answer": false, "rationales": ["This is for grains harvested", "There is no rain in this season. the tops are open, exposing the contents.", "A train is moving along tracks with open-top hoppers. many of the trees in the background are missing leaves."], "image": "val2014/COCO_val2014_000000094168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383322, "question_id": "NVj9Cb7q8ERhJYaaaphuak", "question": "Which one of these company logos is partially obscured?", "choices": ["nesquik", "nescafe", "nestle", "nespresso"], "correct_choice_idx": 1, "direct_answers": ["nestle", "nescafe", "nestle", "nestle", "nestle", "nestle", "nestle", "nestle", "nestle", "nestle"], "difficult_direct_answer": false, "rationales": ["The woman seems to have nescafe in her body as evident.", "It's hard to say which advertised name is correct but by the letters visible \"c\" is the most viable answer.", "This appears to be correct based on the style of the typical logo for a company."], "image": "val2014/COCO_val2014_000000383322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404450, "question_id": "NWEkTXuXkNsRvHTFyhJH8a", "question": "What sort of event is happening here?", "choices": ["watch reset", "church", "nothing", "olympic"], "correct_choice_idx": 3, "direct_answers": ["olympics", "olympics", "sightseeing", "rally", "site seeing", "olympics", "olympic", "olympics", "sport", "countdown"], "difficult_direct_answer": false, "rationales": ["The interlinked circles tell us what type of sporting event this is", "The olympic rings can be seen on the sign, and it works as a countdown.", "The five intertwined rings is the olympic logo."], "image": "train2014/COCO_train2014_000000404450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313091, "question_id": "NWFdzrJX7kpdsKWyJy2Pab", "question": "What does the sheep have in its fur?", "choices": ["food", "vomiting", "blood", "nothing"], "correct_choice_idx": 2, "direct_answers": ["blood", "blood", "blood", "paint", "wool", "blood", "blood", "blood", "blood", "blood"], "difficult_direct_answer": false, "rationales": ["The sheep looks like she has just given birth and the blood would be from that.", "The sheep's fur has red on it.", "It is red and it just gave birth"], "image": "train2014/COCO_train2014_000000313091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477774, "question_id": "NWFqxAUmd8jwHiQZ8AtG8L", "question": "Why do they have a rainbow on their shirt?", "choices": ["was gift", "found it", "fits outfit", "lgbtq"], "correct_choice_idx": 3, "direct_answers": ["gay parade", "pride", "gay", "pride parade", "lgbtq", "pride parade", "pride", "pride parade", "gay pride", "gay pride"], "difficult_direct_answer": false, "rationales": ["She has rainbow pride colors.", "The rainbow is for gay pride.", "The rainbow or pride flag is a symbol of the lgbtq+ community."], "image": "train2014/COCO_train2014_000000477774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8691, "question_id": "NWGBUk8D9uMpvWwMyDSeDk", "question": "What kind of store is this?", "choices": ["vegetable", "fruit", "meat", "cheese"], "correct_choice_idx": 1, "direct_answers": ["fruit store", "fruit stand", "fruit", "fresh", "produce", "fruit", "grocery", "grocery", "fruit stand", "produce store"], "difficult_direct_answer": false, "rationales": ["There are several fruits on the stand.", "The store has many racks of fruits that are for sale.", "The store carries a lot of it."], "image": "train2014/COCO_train2014_000000008691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161384, "question_id": "NWfpoKjpPCG7Rz5V4CrAVc", "question": "What is the green cylindrical object used for?", "choices": ["collecting trash", "target practice", "collecting rain", "storing candy"], "correct_choice_idx": 0, "direct_answers": ["garbage", "garbage", "trash", "garbage", "garbage", "trash", "trash", "collecting trash", "collecting garbage", "throwing trash"], "difficult_direct_answer": false, "rationales": ["The can is used for trash.", "There is a black trash bag showing. it is what is placed inside the object to collect all of the trash.", "It is a cylindrical trash can."], "image": "train2014/COCO_train2014_000000161384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296072, "question_id": "NX4gADgbFD7RhqqYsRBJjG", "question": "What seems to be propelling the man forward?", "choices": ["whale", "rain", "wave", "wind"], "correct_choice_idx": 2, "direct_answers": ["wave", "wave action", "wave", "big wave", "waves", "wave", "wave", "waves", "wave", "wave"], "difficult_direct_answer": false, "rationales": ["This man is surfing", "The wave is propelling.", "The man is on a surfboard which is a sport that uses waves to move forward. there is a wave behind the person and thus, being on a surfboard, it is the wave that moves him."], "image": "val2014/COCO_val2014_000000296072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326021, "question_id": "NXPr7bkGGxpYogbMsDbvjR", "question": "What does this woman do to teddy bears?", "choices": ["nothing", "takes pictures", "steals them", "repairs"], "correct_choice_idx": 3, "direct_answers": ["sells", "sell", "repairs bears", "sells", "bow", "clean", "distributes them", "repairs", "beauty advisor", "advertises"], "difficult_direct_answer": true, "rationales": ["The woman advertises that she can repair teddy bears that have been damaged or injured.", "The woman can sew up teddy bears to repair them.", "In the first picture, it shows a worn teddy then looking much happier in the second pic."], "image": "val2014/COCO_val2014_000000326021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367205, "question_id": "NXWjt6K6UnAFDacP2vKjKs", "question": "What type of company paid to have their product advertised on the bus?", "choices": ["travel", "movie", "video game", "food"], "correct_choice_idx": 2, "direct_answers": ["television", "movie", "video game", "media", "video game", "video game", "video game", "homeland 2", "gaming", "video game"], "difficult_direct_answer": false, "rationales": ["Looks like a new game coming out that is on the side of the bus.", "The ad is for a gaming experience.", "The makers of the video game."], "image": "val2014/COCO_val2014_000000367205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63040, "question_id": "NXbe6NNFQBXfJnKnqFwVZs", "question": "What type of job do the animals here hold?", "choices": ["service dogs", "majorettes", "dog walkers", "cooks"], "correct_choice_idx": 0, "direct_answers": ["comfort", "emotional support", "companion", "service", "service", "comfort", "service dogs", "comfort dogs", "service pets", "therapy animals"], "difficult_direct_answer": false, "rationales": ["Often times animals such as dogs are used to help people who have disabilities and other issues.", "The animals in the room are service dogs that are trained to help the children with special needs.", "The animals help give therapeutic services to the kids."], "image": "val2014/COCO_val2014_000000063040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152662, "question_id": "NXhryaiYpxRHGYsYgv6PFm", "question": "What is the professional name of a person who makes this delicacy?", "choices": ["pizzaiolo", "brewer", "patissier", "pastaiolo"], "correct_choice_idx": 0, "direct_answers": ["pizza chef", "baker", "pizzaiolo", "pizzaiolo", "chef", "chef", "pizza cook", "chef", "paisan", "pizzaiolo"], "difficult_direct_answer": false, "rationales": ["A pizzaiolo is a professional pizza maker.", "The food item is circular. two slices of it are on a plate.", "The items are slices, not pastries, pasta, or beer."], "image": "train2014/COCO_train2014_000000152662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69003, "question_id": "NY3EiBnMqtYaKrRhDc7hdL", "question": "The cat on top of the laptop possess which type of fur pattern?", "choices": ["tortoiseshell", "tabby", "calico", "tuxedo"], "correct_choice_idx": 3, "direct_answers": ["patches", "two tone", "tuxedo pattern", "black white", "tuxedo", "checkered", "spot", "tuxedo", "tuxedo", "black white"], "difficult_direct_answer": false, "rationales": ["The cat is black and white.", "The cat on top of the laptop has a black and white pattern that resembles a tuxedo jacket and shirt.", "His fur is black and white so it resembles a tuxedo."], "image": "train2014/COCO_train2014_000000069003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191925, "question_id": "NY7LD6Zkf9RbTY5fu93pvx", "question": "When is this baseball game being played?", "choices": ["afternoon", "noon", "night", "morning"], "correct_choice_idx": 2, "direct_answers": ["afternoon", "summer", "afternoon", "evening", "midday", "during summer", "night", "summer", "night", "summer"], "difficult_direct_answer": false, "rationales": ["It is light out and it's probably being played in the afternoon.", "It's not dark out yet.", "Looks like it is in the evening."], "image": "val2014/COCO_val2014_000000191925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218774, "question_id": "NYbUTZE8d7RtGtSu6SUUb7", "question": "What item is this animal known for?", "choices": ["wool", "gills", "feathers", "milk"], "correct_choice_idx": 3, "direct_answers": ["milk", "cow", "milk", "milk", "milk", "milk", "milk", "milk", "beef", "beef"], "difficult_direct_answer": false, "rationales": ["This animal is a cow, not a sheep, bird, or fish.", "When cows are lactating they give out milk.", "This is a cow and they give us milk."], "image": "train2014/COCO_train2014_000000218774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287305, "question_id": "NZ6fatrMsyqSiAhkmxjzuP", "question": "What type of area is this?", "choices": ["tropical", "residential", "country", "city"], "correct_choice_idx": 3, "direct_answers": ["city", "city", "city", "urban", "business district", "intersection", "city intersection", "urban", "inner city", "city"], "difficult_direct_answer": false, "rationales": ["It is comprised of many tall buildings in close proximity to each other or even touching each other, and there are many vehicles in the street.", "The area is a city.", "There is a lot of traffic."], "image": "val2014/COCO_val2014_000000287305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566277, "question_id": "NZ6gK8voRktCZb8EXtEBUQ", "question": "The person on the right likely plays what position?", "choices": ["pitcher", "tight end", "safety", "first base"], "correct_choice_idx": 3, "direct_answers": ["batter", "batter", "batter", "batter", "pinch hitter", "first base", "batting", "centerfield", "pitcher", "pinch hitter"], "difficult_direct_answer": false, "rationales": ["The players name is readable and internet searchable. he is also holding a baseball bat in a baseball jersey which would be consistent with answer a only.", "He has a glove", "The shirt number shows that he plays in that position."], "image": "val2014/COCO_val2014_000000566277.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172162, "question_id": "NZBfmtUZ7ZRa4ta8AxczpB", "question": "Which one of the following animals might prey on these ones?", "choices": ["parrot", "buffalo", "giraffe", "lion"], "correct_choice_idx": 3, "direct_answers": ["lion", "lion", "lion", "zebra", "lion", "lion", "lion", "lion", "zebras", "hyenas"], "difficult_direct_answer": false, "rationales": ["A lion would hunt these animals.", "A lion is a predator to these zebras and other animals.", "Buffaloes and giraffes are herbivores. parrots are too small to prey on zebras or wildebeests."], "image": "train2014/COCO_train2014_000000172162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553493, "question_id": "NZKxT44HJzDHjvkpgWaoFn", "question": "What type of battery is best for laptop?", "choices": ["nimh", "peds", "lithium-ion", "nicad"], "correct_choice_idx": 0, "direct_answers": ["lithium-ion", "nimh", "lithium", "lithium", "portable", "lithium", "lithium", "lithium ion", "block battery", "lithium ion"], "difficult_direct_answer": false, "rationales": ["The best battery is one that is for nimh.", "An nimh battery would be most useful.", "The battery is nimh."], "image": "train2014/COCO_train2014_000000553493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343948, "question_id": "NZsgKYKfYsUpSgeUV4RUUA", "question": "What is near the flower pot?", "choices": ["boat", "chair", "anteater", "dog"], "correct_choice_idx": 1, "direct_answers": ["bench", "bench", "deck furniture", "bench", "chair", "bucket", "chair", "bench", "bench", "plant"], "difficult_direct_answer": false, "rationales": ["A chair is near the flower pot", "That's what's near the flower pot.", "A chair is near the flower pot."], "image": "train2014/COCO_train2014_000000343948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279940, "question_id": "NZtABrTXnxVc2rjgzWucKy", "question": "How he is going to get hurt?", "choices": ["flying", "beaten", "shot", "falling"], "correct_choice_idx": 3, "direct_answers": ["falling", "falling forward", "fall", "fall", "falling", "falling", "fall", "falling", "fall", "falling"], "difficult_direct_answer": false, "rationales": ["He's falling.", "It looks like it may happen any second now. i think it's safe to say he doesn't know how to fly. no one is in view that is going to beat him up or shoot him.", "By looking at the board and his body language you can tell what will happen next."], "image": "val2014/COCO_val2014_000000279940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390017, "question_id": "NaGJfcvoh2wspCgegRdbTj", "question": "What weather event has stopped the motorcycle rider?", "choices": ["fog", "flooding", "tornado", "snow"], "correct_choice_idx": 0, "direct_answers": ["fog", "fog", "fog", "fog", "mist", "fog", "fog", "fog", "mist", "fog"], "difficult_direct_answer": false, "rationales": ["The fog makes it hard to see where you are going.", "There is heavy fog out since the image is so cloudy.", "A dense fog has set down in the area."], "image": "val2014/COCO_val2014_000000390017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398494, "question_id": "NaJN7t7hdCfV8XC9nEp4pC", "question": "What type of kitchen is this?", "choices": ["kitchenette", "galley", "peninsula kitchen", "island kitchen"], "correct_choice_idx": 0, "direct_answers": ["apartment", "kitchenette", "cluttered", "small", "residential", "small", "galley", "alley", "compact", "mini"], "difficult_direct_answer": true, "rationales": ["The kitchen is a tiny one.", "The truncated style of this kitchen, fitting wholly within the diameter of the bicycle here pictured, identifies it as a small kitchen.", "This is a very small kitchen with not a lot of space to store anything or cook."], "image": "train2014/COCO_train2014_000000398494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535160, "question_id": "NaWkQRJ847JQQELku9n2vK", "question": "What is the bridge used to cross over?", "choices": ["water", "fire", "trees", "holes"], "correct_choice_idx": 0, "direct_answers": ["river", "water", "small lake", "river", "river", "cross bridge", "river", "water", "river", "water"], "difficult_direct_answer": false, "rationales": ["The bridge is for water.", "Bridges are used to cross over water.", "The waterway."], "image": "train2014/COCO_train2014_000000535160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308739, "question_id": "Nakq4mgJ2ckr9UKn8GX8rV", "question": "What kind of prey do these animals hunt?", "choices": ["large", "deer", "small", "bears"], "correct_choice_idx": 2, "direct_answers": ["small", "birds", "mice", "mice", "mice", "mice", "rodents", "small", "mouse", "mice"], "difficult_direct_answer": false, "rationales": ["These are little domestic animals", "Cats hunt mice.", "The average housecat is too domesticated and singular to bring down big prey. they usually hunt smaller animals like birds and mice."], "image": "train2014/COCO_train2014_000000308739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211107, "question_id": "NarnVLBZ6nW8gzGkAJYmMJ", "question": "What is the best way to cool off in this room?", "choices": ["window", "chair", "fan", "water"], "correct_choice_idx": 2, "direct_answers": ["fan", "fan", "fan", "fan", "use fan", "fan", "fan", "fan", "use fan", "fan"], "difficult_direct_answer": false, "rationales": ["Moving the air is the best option for cooling", "The fan is there to cool off.", "The fan are placed at the corner to air the room."], "image": "val2014/COCO_val2014_000000211107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512449, "question_id": "NbAdm723CLrFYJVSjJmKEy", "question": "What is being used to keep balance?", "choices": ["weights", "rope", "hat", "ski pole"], "correct_choice_idx": 3, "direct_answers": ["ski poles", "ski poles", "ski pole", "poles", "poles", "poles", "ski poles", "poles", "ski poles", "poles"], "difficult_direct_answer": false, "rationales": ["Ski poles help keep the skiier upright and in balance.", "Traditionally the poles are used to help balance the person skiing.", "The pole is being used."], "image": "train2014/COCO_train2014_000000512449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361073, "question_id": "NbdXwKKyuaqAa8y6rDCHDn", "question": "What baseball team name is similar to one of the words on the side of the bus?", "choices": ["padres", "mets", "reds", "dodgers"], "correct_choice_idx": 1, "direct_answers": ["mets", "ny mets", "astros", "mets", "metros", "mets", "padres", "mets", "metro", "ny mets"], "difficult_direct_answer": false, "rationales": ["The word on the side of the bus is go metro.", "The side of the bus has the word metro on it which is similar to the baseball team new york mets.", "The mets sound like \"metro.\""], "image": "val2014/COCO_val2014_000000361073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186201, "question_id": "Nc7BYPjnesxETLpNFHQ53w", "question": "In which country is this airport located?", "choices": ["korea", "japan", "india", "china"], "correct_choice_idx": 1, "direct_answers": ["japan", "japan", "japan", "france", "n/a", "japan", "japan", "japan", "brazil", "japan"], "difficult_direct_answer": false, "rationales": ["The name on the plane indicates this country", "Jal express has its main hub at the toyko international airport.", "The plane has a jal express livery. the writing on the sign in the background is in kanji."], "image": "train2014/COCO_train2014_000000186201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402281, "question_id": "Nc89MbpJfiGCPnmG9e9tWv", "question": "What's the name of the red hat the man is wearing?", "choices": ["bowler", "fedora", "beanie", "cap"], "correct_choice_idx": 2, "direct_answers": ["beanie", "beaniee", "beanie", "stocking make", "beanie", "main", "beanie", "beanie", "beanie", "skycap"], "difficult_direct_answer": false, "rationales": ["You can tell by the cloth and the fact that he is wearing the item on his head to what it is.", "The man on the skateboard is wearing a red hat called a beanie that fits tight around the head.", "It seems to be made of a soft material that forms around his head. he is wearing a sweater, so it is probably chilly weather he's in, and those hats keep people's heads warm."], "image": "train2014/COCO_train2014_000000402281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75299, "question_id": "Ncere2FVERVYfNZNtBHzLi", "question": "What is the color of person's shirt who is inside vehicle?", "choices": ["white", "green", "blue", "pink"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["Well if you are not color blind then you can see the answer.", "Unless you are colorblind you can easily tell what color the person is wearing.", "The color is green."], "image": "train2014/COCO_train2014_000000075299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299838, "question_id": "NcqB2zRDQXLdWcTtEGVeKt", "question": "What is the relationship between the two men?", "choices": ["competitors", "unrelated", "teammates", "classmates"], "correct_choice_idx": 1, "direct_answers": ["teammates", "teammates", "teammates", "partners", "partners", "partners", "tennis partners", "partners", "unrelated", "doubles partners"], "difficult_direct_answer": false, "rationales": ["They are on separate courts.", "They are on the same side of the court playing", "They are on different courts so they are unrelated."], "image": "val2014/COCO_val2014_000000299838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426635, "question_id": "Ncyq4o7eNrSk8Ef2aqhvLX", "question": "What use are the nets here?", "choices": ["decorative", "fishing", "goals", "livestock fencing"], "correct_choice_idx": 2, "direct_answers": ["goals", "goals", "goal", "scoring goals", "catch balls", "goal catchers", "soccer", "goals", "goals", "goal arch"], "difficult_direct_answer": false, "rationales": ["These are where players kick the balls to score points.", "The nets are used for goals.", "The nets are for soccer goals."], "image": "val2014/COCO_val2014_000000426635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474609, "question_id": "NdDUHNKRRvszuRXfUsu6h6", "question": "What type of stove is this?", "choices": ["fire", "electric", "gas", "wood"], "correct_choice_idx": 1, "direct_answers": ["electric", "electric", "electric", "electric", "electric", "electric", "electric", "electric", "electric", "electric"], "difficult_direct_answer": false, "rationales": ["The stove has coils that make heat through electricity.", "It's an electric stove.", "The stove uses electricity to cook."], "image": "val2014/COCO_val2014_000000474609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40846, "question_id": "NdHdAYPGhNkVBjMWbqfQyN", "question": "What is the man doing on the stage?", "choices": ["spelling", "dancing", "stripping", "speaking"], "correct_choice_idx": 3, "direct_answers": ["talking", "talking", "talking", "speaking", "talking", "open mic", "speaking", "standup comic", "instructing", "speaking"], "difficult_direct_answer": false, "rationales": ["He looks to be talking to an audience.", "He is talking to the audience.", "He has a microphone near his mouth which allows for sound for the audience from his mouth."], "image": "train2014/COCO_train2014_000000040846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341062, "question_id": "NdVw7AU7GhdsVHsYngEopd", "question": "Where should you go from the traffic light if you want to go to Fulton Mall?", "choices": ["go back", "turn left", "go straight", "turn right"], "correct_choice_idx": 3, "direct_answers": ["left", "turn right", "straight", "forward", "right", "right", "left", "turn right", "right", "left"], "difficult_direct_answer": false, "rationales": ["Fulton mall is to the right.", "The mall's street sign is at the right.", "The traffic goes right."], "image": "train2014/COCO_train2014_000000341062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139971, "question_id": "Ndidg3sWydnhuToy4pxwwH", "question": "What does the sign on the wall prohibit?", "choices": ["eating", "drinking", "smoking", "cellphones"], "correct_choice_idx": 2, "direct_answers": ["smoking", "smoking", "smoking", "smoking", "smoking", "smoking", "smoking", "smoking", "smoking", "smoking"], "difficult_direct_answer": false, "rationales": ["The sign prohibits smoking.", "The sign has a cigarette on it.", "The sign is crossing out a cigarette."], "image": "val2014/COCO_val2014_000000139971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239263, "question_id": "NdwGLZjYQMivNtLdGeddgx", "question": "Who played the character on the Brady Bunch whose name can be spelled with the first three letters shown on the vehicle?", "choices": ["eve plumb", "christopher knight", "mike lookinland", "susan olsen"], "correct_choice_idx": 0, "direct_answers": ["eve plumb", "eve plumb", "eve plumb", "jan", "jan", "janice", "eve plumb", "eve plumb", "jan", "jan"], "difficult_direct_answer": false, "rationales": ["The first three letters on the vehicle match the character.", "The first three letters on the vehicle spell jan.", "Eve plumb was in the brady bunch."], "image": "train2014/COCO_train2014_000000239263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496239, "question_id": "NdyZLoMxrTQYvveiVMqDki", "question": "What is the girl holding?", "choices": ["book", "controller", "tennis racquet", "baseball bat"], "correct_choice_idx": 1, "direct_answers": ["wii controller", "remote", "wii controller", "remote", "wii controller", "wii remote", "wii remote", "controller", "wii controller", "seeing"], "difficult_direct_answer": false, "rationales": ["The girl has a controller in her hand to play wii.", "The girl has a controller.", "This is a wii controller that she is holding."], "image": "val2014/COCO_val2014_000000496239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550287, "question_id": "NeXeecwqzXrtLHu7ZNanpw", "question": "What type of animals are shown?", "choices": ["stuffed", "aquatic", "wild", "domestic"], "correct_choice_idx": 3, "direct_answers": ["dogs", "dogs", "dog", "dogs", "dog", "dogs", "dogs", "dogs", "dogs", "domestic"], "difficult_direct_answer": false, "rationales": ["The dogs are trained.", "The dogs are on leashes.", "These people are with their dogs, and dogs are a type of domestic (or domesticated) animal."], "image": "val2014/COCO_val2014_000000550287.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251255, "question_id": "Necxpbs2LNVqF6R8WYAzCu", "question": "What word best describes the setting?", "choices": ["sunny", "snowy", "rainy", "tsunami"], "correct_choice_idx": 1, "direct_answers": ["cold", "desolate", "snowy", "snowy", "snowy hillside", "winter storm", "snowy", "remote", "snowy", "snow"], "difficult_direct_answer": false, "rationales": ["You can see all the snow on the ground.", "Flakes are beginning to accumulate on the ground and people are wearing their winter coats.", "The other options definitely don't match. the sky color is also different then what would be expected for the most part with the other options."], "image": "train2014/COCO_train2014_000000251255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179522, "question_id": "NedaRyP7qRLquJXVEMbEJB", "question": "Which one of these items does he avoid using?", "choices": ["razor", "soap", "electricity", "water"], "correct_choice_idx": 0, "direct_answers": ["buttons", "razor", "razor", "shirt buttons", "razor", "razor", "razor", "buttons", "undershirts", "joystick"], "difficult_direct_answer": false, "rationales": ["The man visibly has a lot of body hair. a razor is a tool that is used to remove body hair so the presence of the hair implies he does not actively use a razor.", "The man has a beard.", "It looks like he has a beard growing."], "image": "val2014/COCO_val2014_000000179522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20202, "question_id": "NejtAXaENnGdATiiRjPw6Q", "question": "What type of kitchen is shown?", "choices": ["commercial", "hospital", "food truck", "residential"], "correct_choice_idx": 3, "direct_answers": ["small", "home", "residential", "home", "home kitchen", "residential", "home", "home kitchen", "conventional", "small"], "difficult_direct_answer": false, "rationales": ["The other options would be larger and more metallic.", "Based on the refrigerator size and design and the setting in the background this would not be a professional kitchen and appears to be in a home setting.", "The kitchen shown is a residential kitchen."], "image": "train2014/COCO_train2014_000000020202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35176, "question_id": "Nekk49uKjq8J3kpJcYXKdz", "question": "What type of truck is shown?", "choices": ["delivery", "mail", "food", "moving"], "correct_choice_idx": 2, "direct_answers": ["food truck", "food truck", "food", "food truck", "food truck", "food truck", "food", "food truck", "food", "food truck"], "difficult_direct_answer": false, "rationales": ["You can order things to eat from here", "A menu is printed on the side of a truck and people are gathered around. food trucks are a popular lunch spot in many cities.", "The menu items are written on the side of the truck."], "image": "val2014/COCO_val2014_000000035176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132574, "question_id": "NeqeDbc668NDy3npswxXaV", "question": "What are the girls boots made out of?", "choices": ["cotton", "bamboo", "silk", "rubber"], "correct_choice_idx": 3, "direct_answers": ["rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber"], "difficult_direct_answer": false, "rationales": ["The boots are very shiny and they would squeak.", "That's what all rain boots are made from.", "The boots are rubber."], "image": "train2014/COCO_train2014_000000132574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18743, "question_id": "NfJUCKoBqnR84PPc2NgjDj", "question": "What causes the texturing on the barn?", "choices": ["trees", "paint", "animals", "weathering"], "correct_choice_idx": 3, "direct_answers": ["weather time", "just only", "erosion", "weathering", "weather", "weather", "rain", "paint", "rain", "weathering"], "difficult_direct_answer": false, "rationales": ["The barn is old. the environment, not the animals or trees, weathered the surface of the barn.", "The wood of the born has been worn down by exposure.", "It is up high on the barn, out of reach of animals and trees, and it is natural unpainted wood."], "image": "train2014/COCO_train2014_000000018743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323019, "question_id": "NfNLQRxWjfvYmVD2CsQM9r", "question": "What is the purpose of the green receptacle?", "choices": ["flower pot", "storage", "water collection", "garbage"], "correct_choice_idx": 3, "direct_answers": ["garbage", "trash removal", "trash", "trash", "garbage", "trash", "trashcan", "trash", "hold trash", "garbage can"], "difficult_direct_answer": false, "rationales": ["The green receptacle is for waste items.", "It is where used up materials are disposed.", "The trash can is for people in the area to put their trash in."], "image": "train2014/COCO_train2014_000000323019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344920, "question_id": "Ng7kGhpvhcrkaMit6wZ4hQ", "question": "How many people probably share this room?", "choices": ["two", "six", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "2 people", "two", "two", "2 people"], "difficult_direct_answer": false, "rationales": ["There are two beds in the room.", "The beds are only big enough for a single person, and there's only so many beds shown.", "This beds are specifically designed for one occupant, the fact that there are two suggest that each person has their own bed. there is also a phone on each bed and other separate objects suggesting that there is more than one person."], "image": "train2014/COCO_train2014_000000344920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322051, "question_id": "Ng8rpuY2o9ZY4qcnRG5Cwk", "question": "What is the girl doing?", "choices": ["playing yoyo", "shooting", "feeding pigeons", "dancing"], "correct_choice_idx": 2, "direct_answers": ["standing", "posing", "feeding birds", "feeding pigeon", "feeding pigeons", "feeding birds", "feeding pigeons", "bird feeding", "feeding pigeon", "feeding birds"], "difficult_direct_answer": false, "rationales": ["A girl stands above birds with her hand outstretched.", "The girl is feeding pigeons with her hands.", "Her attention is towards the bird shown, and she is offering it something from her hand."], "image": "train2014/COCO_train2014_000000322051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207749, "question_id": "Ngev3HngjsrfXWNTMDMQ7a", "question": "What construction equipment is visible in the background?", "choices": ["jackhammer", "crane", "bulldozer", "excavator"], "correct_choice_idx": 1, "direct_answers": ["massive crane", "crane", "crane", "crane", "crane", "crane", "crane", "massive crane", "crane", "crane"], "difficult_direct_answer": false, "rationales": ["A large metal object sticks up from behind a building that is under construction.", "A crane is in the background.", "The equipment is a crane."], "image": "train2014/COCO_train2014_000000207749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333684, "question_id": "NgharPpXu7GiKXNE2qtM9a", "question": "What does the girl have on her feet?", "choices": ["sandals", "cowboy boots", "sneakers", "dress shoes"], "correct_choice_idx": 2, "direct_answers": ["tennis shoes", "shoes", "sneakers", "camera", "tennis shoes", "skate shoes", "shoes", "shoes", "sneakers", "tennis shoes"], "difficult_direct_answer": false, "rationales": ["By the design and the laces of the shoe you can tell what she is wearing.", "The shoes are canvas with a rubber like sole and laces.", "The girl is wearing casual tennis shoes."], "image": "train2014/COCO_train2014_000000333684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304012, "question_id": "NhBXDokXropvkxcbY6sbyi", "question": "What type environment is shown?", "choices": ["urban", "tundra", "rural", "desert"], "correct_choice_idx": 0, "direct_answers": ["city", "urban", "narrow street", "urban", "downtown", "city", "urban", "city", "road", "city"], "difficult_direct_answer": false, "rationales": ["There are several cars and buildings.", "This is a city or urban area.", "There are several buildings so this is likely an urban area."], "image": "val2014/COCO_val2014_000000304012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288828, "question_id": "NhEe6ZrLB44TiJkKaAxPj9", "question": "What is the man doing in front of the meter?", "choices": ["photographing", "tumbling", "dancing", "paying"], "correct_choice_idx": 3, "direct_answers": ["talking", "talking", "talking", "paying", "paying", "complaining", "waiting", "staring", "standing", "paying"], "difficult_direct_answer": false, "rationales": ["The man is paying.", "He is putting coins in the meter", "If you don't put coins in a meter you may find a ticket on your windshield."], "image": "train2014/COCO_train2014_000000288828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64474, "question_id": "NhRKwJX5hpffUUd939ZocR", "question": "What is he doing?", "choices": ["dispensing wine", "stealing wine", "cleaning jar", "hiding wine"], "correct_choice_idx": 0, "direct_answers": ["siphoning", "keg", "siphoning liquid", "dispensing wine", "brewing", "transferring", "brewing", "draining container", "toilet", "brewing"], "difficult_direct_answer": false, "rationales": ["The man seems to have wine in a jar.", "He is pouring out wine.", "A jug is set up to remove a liquid. the liquid is dark like wine."], "image": "val2014/COCO_val2014_000000064474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61196, "question_id": "NhZNy2z29kMbpNoyskh8D5", "question": "What caused the deepest mushiest tracks here?", "choices": ["trains", "skiers", "bears", "automobiles"], "correct_choice_idx": 3, "direct_answers": ["melting snow", "vehicles driving", "car", "cars", "wheels", "truck", "vehicles", "snow melt", "automobiles", "car"], "difficult_direct_answer": true, "rationales": ["Cars caused the gray tracks.", "These are tracks from tires from a vehicle.", "The cause was cars."], "image": "train2014/COCO_train2014_000000061196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517946, "question_id": "NhfbEcsV4ep4gcgkSxseFq", "question": "What safety item is the person in Green and blue shirts missing?", "choices": ["life vest", "bear spray", "oar", "whistle"], "correct_choice_idx": 0, "direct_answers": ["life jacket", "life jacket", "life jacket", "lifejackets", "life jacket", "life jacket", "life vest", "life jacket", "lifejackets", "lifejackets"], "difficult_direct_answer": false, "rationales": ["The person in green doesn't have a life vest.", "The child has one, indicating that they are available, but the green and blue shirts aren't wearing one.", "They are not wearing their safety vest."], "image": "val2014/COCO_val2014_000000517946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343322, "question_id": "NhrcNZsSE8QWBt4DhdxZTf", "question": "What is the name of the cantina?", "choices": ["gary's", "fred's", "george's", "bill's"], "correct_choice_idx": 1, "direct_answers": ["fred's", "fred's", "fred's", "fred's", "fred's", "fred's", "fred's", "fred's", "fred's", "fred's"], "difficult_direct_answer": false, "rationales": ["The sign on the cantina says fred's.", "In the background there is a restaurant on the second floor. the sign clearly states fred's mexican cafe and cantina.", "The name of the cantina is written clearly and readable on the building."], "image": "train2014/COCO_train2014_000000343322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347243, "question_id": "NhsXwUHDpYK3y9KfNuhE7M", "question": "What type of truck is this?", "choices": ["pickup truck", "cement truck", "semi truck", "dump truck"], "correct_choice_idx": 2, "direct_answers": ["semi", "semi truck", "semi", "semi truck", "semi truck", "semi truck", "semitrailer truck", "semi", "semi", "semi"], "difficult_direct_answer": false, "rationales": ["It is only the cab and does not have a bed or load attached to back.", "The truck is known for hauling long trailers.", "By the look of thing it is the semi truck."], "image": "train2014/COCO_train2014_000000347243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543084, "question_id": "NhvtczY35Uf9HVtEQjtyge", "question": "What types of leaves do the trees have?", "choices": ["scale", "needles", "broadleaf", "grass"], "correct_choice_idx": 1, "direct_answers": ["needles", "needles", "needles", "needles", "fir tree", "pine", "needles", "needles", "pine needles", "needles"], "difficult_direct_answer": false, "rationales": ["The people are skiing on snow, so it is winter. the trees still have leaves, so they are evergreens.", "The trees are evergreens.", "They don't have leaves on them they have needles that can be sharp."], "image": "train2014/COCO_train2014_000000543084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562084, "question_id": "Nhx8ktzhWuQYQ6jcNhxsRz", "question": "Why are all the umbrellas there?", "choices": ["keep dry", "for sale", "left there", "sun protection"], "correct_choice_idx": 3, "direct_answers": ["for shade", "sun protection", "shade", "shade", "provide shade", "for shade", "sun protection", "sun shade", "shade", "sun protection"], "difficult_direct_answer": false, "rationales": ["The umbrellas block sun.", "People are laying at a beach and swimming and it is a sunny day.", "The umbrellas are on a beach during a nice day so they are providing shade."], "image": "val2014/COCO_val2014_000000562084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300509, "question_id": "Ni6sKfmLcgkyfGBsUVqhmZ", "question": "Whats the womans skin color?", "choices": ["grey", "white", "black", "brown"], "correct_choice_idx": 1, "direct_answers": ["white", "peach", "peach", "white", "peach", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["She has very light skin", "The woman is caucasian.", "She is of european descent."], "image": "val2014/COCO_val2014_000000300509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481856, "question_id": "NiJps72fYKjPCGEoC7t3Gt", "question": "What type of material is the sheep rubbing against?", "choices": ["wood", "stucco", "metal", "brick"], "correct_choice_idx": 3, "direct_answers": ["brick material", "leaf foliage", "brick", "brick", "brick", "brick", "not clear", "wall", "wall", "brick"], "difficult_direct_answer": false, "rationales": ["The sheep is rubbing against brick.", "The material is red. the material consists of rectangular blocks.", "The wall is made of red brick."], "image": "train2014/COCO_train2014_000000481856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345376, "question_id": "NivffxgjZaY4Sb6XEiKz3V", "question": "What caused the dark stains on number 28?", "choices": ["grass", "sliding", "poor laundry", "enemy fans"], "correct_choice_idx": 1, "direct_answers": ["sliding", "dirt", "grass", "sliding", "dirt slide", "dirt", "dirt", "dirt", "dirt", "dirt"], "difficult_direct_answer": false, "rationales": ["The man could be have slide while playing and got stains.", "Number 28 is playing baseball. the stains are brown, not green.", "The player was sliding when playing the game."], "image": "val2014/COCO_val2014_000000345376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533889, "question_id": "Nj2gLbmny6EQyazcwfZT7V", "question": "What type of lines are located above the street?", "choices": ["cell", "power", "water", "sewage"], "correct_choice_idx": 1, "direct_answers": ["street markings", "power", "electrical", "electricity line", "power", "electric", "solid", "electric", "lanes", "power"], "difficult_direct_answer": false, "rationales": ["The lines above are for electricity.", "The lines are for power.", "There are power lines located above the street."], "image": "train2014/COCO_train2014_000000533889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487151, "question_id": "Nj6uCGwzKcTFcCh8RPSSo6", "question": "Who won the World Series that calendar year?", "choices": ["blue jays", "indians", "yankees", "orioles"], "correct_choice_idx": 2, "direct_answers": ["twins", "yankees", "yankees", "yankees", "unkown", "yankees", "yankees", "ny yankees", "na", "yankees"], "difficult_direct_answer": false, "rationales": ["The series is the yankees.", "The non-mets team from new york beat the philadelphia phillies.", "According to google the yankees won the world series in 2009."], "image": "val2014/COCO_val2014_000000487151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269038, "question_id": "Nj8qDH69osd55J5CQxwBPS", "question": "What type of area is shown?", "choices": ["rural", "country", "residential", "commercial"], "correct_choice_idx": 3, "direct_answers": ["urban", "store corner", "urban", "commercial", "city street", "business district", "shopping", "sidewalk corner", "hood", "grocery"], "difficult_direct_answer": true, "rationales": ["A commercial area has businesses in it.", "There is a grocery store and a car dealership, so it is a business area.", "Businesses are at an intersection and signs are all around."], "image": "val2014/COCO_val2014_000000269038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22545, "question_id": "NjBfkDpbNbw2krANjKKSc7", "question": "What kind of bread is used on the long sandwiches on the bottom tier?", "choices": ["potato roll", "croissants", "bagels", "tortes"], "correct_choice_idx": 1, "direct_answers": ["croissant", "croissant", "croissants", "croissant", "croissant", "con bread", "croissant", "crossiant", "croissant", "croissant"], "difficult_direct_answer": false, "rationales": ["The sandwich is visible and on a bread that has a light texture with a specific shape and layered consistency that is frequently found in answer a.", "The objects in question have a distinct shape and size that is consistent with answer a.", "Croissants are used on the sandwiches."], "image": "train2014/COCO_train2014_000000022545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2453, "question_id": "NjLq6yTGARiXbTrWFvKXSY", "question": "What is being experienced here?", "choices": ["forest fire", "drought", "market economy", "flood"], "correct_choice_idx": 3, "direct_answers": ["view", "flood", "flooding", "flooding", "flood", "flood", "flood", "flood", "flood", "rain"], "difficult_direct_answer": false, "rationales": ["The water is above the ground.", "The lake has flooded.", "The whole area is covered in water and it's been flooded."], "image": "val2014/COCO_val2014_000000002453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162396, "question_id": "NjMcL42Mbu9guzJSigKGoF", "question": "What is the job of the man sitting down?", "choices": ["photographer", "coach", "referee", "professor"], "correct_choice_idx": 0, "direct_answers": ["take photo", "photographer", "take photos", "photographer", "photographer", "take pictures", "take photos", "photographer", "photographer", "photographer"], "difficult_direct_answer": false, "rationales": ["The man has a camera with him.", "He has a camera in his hand pointing at the other man", "The man has a camera."], "image": "train2014/COCO_train2014_000000162396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292739, "question_id": "NjzxztcxSVFth2nGr7sPU8", "question": "What are these animals known for?", "choices": ["speed", "flexibility", "memory", "jump height"], "correct_choice_idx": 2, "direct_answers": ["ivory", "trunks", "elephant", "memory", "memory", "large", "being big", "being huge", "big", "long trunks"], "difficult_direct_answer": true, "rationales": ["The animals have memory.", "It is said they will remember things for years", "The animals visible are elephants which are commonly known for answer a and none of the other answers would apply."], "image": "train2014/COCO_train2014_000000292739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515893, "question_id": "Nk2PAY5AqJVqkKmurBV3LS", "question": "What is the silver object on the wall used for?", "choices": ["exercising", "brushing teeth", "singing", "drying hair"], "correct_choice_idx": 3, "direct_answers": ["hair dryer", "drying hair", "drying hair", "hair dryer", "drying hair", "drying hair", "hair drying", "hair drying", "hair dryer", "water"], "difficult_direct_answer": false, "rationales": ["This is an electric hair dryer that is used in bathrooms after someone has washed their hair and it is stored safely when not in use.", "It's a hair dryer used to dry your hair.", "The room is a bathroom, not a studio or gym. the item has a heating element and does not have a brush."], "image": "train2014/COCO_train2014_000000515893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567768, "question_id": "Nk3TojYycSY2PrxpVnbRRW", "question": "What is the elephant doing?", "choices": ["walking", "resting", "running", "nothing"], "correct_choice_idx": 0, "direct_answers": ["walking", "wading", "walking", "walking", "wading", "abuse compliance", "walking", "wading", "wading", "walking"], "difficult_direct_answer": false, "rationales": ["He is walking in the water.", "The people are on an elephant ride, which requires the elephant to be walking around.", "The elephant appears to be upright and on all fours which would be consistent with answer a."], "image": "train2014/COCO_train2014_000000567768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94012, "question_id": "NkA7iP9aLsWXxBs9dQ27xy", "question": "What state is the man in?", "choices": ["riding", "outstretched", "submerged", "resting"], "correct_choice_idx": 1, "direct_answers": ["jumping", "florida", "standing", "texas", "florida", "outstretched", "hitting", "tennis playing", "usa", "active"], "difficult_direct_answer": true, "rationales": ["The man is stretched outward.", "A man has an arm raised above his head and is standing tall.", "He is not underwater, there is nothing to ride on, and he is in active motion."], "image": "val2014/COCO_val2014_000000094012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165431, "question_id": "NkG9KoDQt5tegevokXGrwo", "question": "Which former US President shares the name with the street on the right?", "choices": ["clinton", "washington", "trump", "obama"], "correct_choice_idx": 1, "direct_answers": ["george washington", "george washington", "george washington", "george washington", "george washington", "george washington", "washington", "george washington", "george washington", "george washington"], "difficult_direct_answer": false, "rationales": ["The president would be our very first president whose first name was george.", "The name is listed", "The name of the street is washington."], "image": "train2014/COCO_train2014_000000165431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402779, "question_id": "NkKncE2xRZit3MqFzGkLJv", "question": "What does one need to read the objects in the clear canister?", "choices": ["disc drive", "mp3 player", "projector", "flash drive"], "correct_choice_idx": 0, "direct_answers": ["cd drive", "information", "cd player", "cd player", "cd drive", "cd drive", "cd's", "disc drive", "computer", "disk drive"], "difficult_direct_answer": false, "rationales": ["These are cds that hold data", "These appear to be discs and they would go in a disc drive.", "They are cds."], "image": "train2014/COCO_train2014_000000402779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95124, "question_id": "NkfE6oBuvBd9ppuyU9ob2M", "question": "Why is the man on the left holding the object to his face?", "choices": ["to drink", "to photograph", "to talk", "to view"], "correct_choice_idx": 0, "direct_answers": ["drinking coffee", "talking", "phone call", "on phone", "drinking", "drinking", "to drink", "talking", "to drink", "to drink"], "difficult_direct_answer": false, "rationales": ["The man is holding a cup to his mouth.", "It is a cup", "It is a cup"], "image": "train2014/COCO_train2014_000000095124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313923, "question_id": "Nku2Y3ZfvGPcww7ibRTeC6", "question": "What are the horizontal lines streaks in the sky?", "choices": ["jet streams", "sky slices", "satellite streaks", "photo filter"], "correct_choice_idx": 0, "direct_answers": ["jet streams", "com trails", "jet trails", "planes", "chem trails", "jet lines", "plane exhaust", "jet streams", "plane trails", "vapor trails"], "difficult_direct_answer": true, "rationales": ["Jet streams are fast flowing air currents visible in the sky. they can be caused by natural air flow or exhaust from a jet engine.", "The horizontal streaks are from jets.", "The horizontal lines in the sky are streaks left behind by jets that flew by."], "image": "train2014/COCO_train2014_000000313923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39078, "question_id": "NmG5zWrfABqdy3fUZfnQiC", "question": "This will take you to what metropolis?", "choices": ["dublin", "limerick", "belfast", "cork"], "correct_choice_idx": 0, "direct_answers": ["glendalough", "glendalough", "dublin", "glendalough", "sta kevins", "glendalough", "london", "glendalough", "glendalough", "sta kevins"], "difficult_direct_answer": false, "rationales": ["The bus is in ireland near dublin.", "The display of the bus shows the direction.", "The answer is not apparent by the image, but searching online, answer a appears viable."], "image": "train2014/COCO_train2014_000000039078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281406, "question_id": "NmJfiqaPYXUKJJeKGjLSbp", "question": "What are the red cubic items on the pizza?", "choices": ["tomatoes", "peppers", "onions", "cheese"], "correct_choice_idx": 0, "direct_answers": ["tomato pieces", "tomatoes", "tomato", "tomato", "tomatoes", "bacon bits", "pepper", "pineapple", "tomato", "red peppers"], "difficult_direct_answer": false, "rationales": ["They look like tomatoes and that is a typical topping on a pizza.", "These are diced", "The red food stuff is tomatoes that is used during cooking."], "image": "train2014/COCO_train2014_000000281406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233023, "question_id": "NmM8HRLh2PWKxSKhW4XitV", "question": "What will this man use this stick for regarding the elephant?", "choices": ["conduct music", "hit it", "walking", "milk it"], "correct_choice_idx": 1, "direct_answers": ["riding crop", "prod harass", "tame it", "prodding", "hit it", "guide elephant", "hit it", "instructing", "tame", "hitting"], "difficult_direct_answer": true, "rationales": ["There is no purpose of using a plain stick in a non-veterinarian environment like this with an elephant except to do this.", "The person has a long stick.", "This is the most likely purpose of it."], "image": "val2014/COCO_val2014_000000233023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133827, "question_id": "NmPAzHAHBF5Zjqbkz22jNP", "question": "During which season are the cars on this street parked?", "choices": ["winter", "spring", "summer", "fall"], "correct_choice_idx": 3, "direct_answers": ["fall", "fall", "summer", "fall", "fall", "spring", "summer", "anytime", "summer", "fall"], "difficult_direct_answer": false, "rationales": ["The leaves on the tree appear to be changing from green to orange a red which happens during the autumn season.", "The season is fall.", "The streets are clear of snow."], "image": "val2014/COCO_val2014_000000133827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80690, "question_id": "NmXxtjHSp2jjg2TjBcPSAr", "question": "The pink topping seen here is from what root?", "choices": ["none", "garlic", "onion", "pepper"], "correct_choice_idx": 2, "direct_answers": ["onion", "onion", "beet", "beet", "onion", "beet", "onion", "onion", "onion", "onion"], "difficult_direct_answer": false, "rationales": ["The pink topping is made with chopped onions, so its root must be an onion.", "The topping is onion.", "The hot dog has chopped onion."], "image": "val2014/COCO_val2014_000000080690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313712, "question_id": "NmdAwLrjKAq2MSRQZ2dKaj", "question": "Who uses the circular glass object on the table?", "choices": ["mothers", "smokers", "vegans", "vegetarians"], "correct_choice_idx": 1, "direct_answers": ["ashtray", "ashtray", "smokers", "smoker", "ashtray", "smoker", "smoker", "smoker", "smokers", "smoker"], "difficult_direct_answer": false, "rationales": ["The item on the table is an ashtray which is used to discard cigarette butts.", "There is an ash tray on the table, which denotes an area for smokers.", "The ash tray on the table is used to discard cigarette butts."], "image": "train2014/COCO_train2014_000000313712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428477, "question_id": "NmkAWZCVKJCHYKRTEDpquj", "question": "What is hovering in the air?", "choices": ["airplane", "spaceship", "kite", "weather balloon"], "correct_choice_idx": 2, "direct_answers": ["kites", "kites", "kites", "kite fly", "kites", "kites", "kites", "kites", "kites", "kite"], "difficult_direct_answer": false, "rationales": ["The objects have the same general size and shape options as many kites with visible tails. in addition to these consistent features, the setting is also a place where kites are often flown.", "There are many kites hovering in the air over the crowd that gathered on the lawn", "Kites are floating in the air."], "image": "train2014/COCO_train2014_000000428477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477343, "question_id": "NmoGwbMVvpeDSYJP3fQXmM", "question": "What is the man in red doing with the attached object?", "choices": ["throwing it", "pulling it", "painting it", "kicking it"], "correct_choice_idx": 1, "direct_answers": ["pulling it", "pulling", "skiing", "towing", "pulling", "pulling", "pulling", "pulling", "pulling", "pulling"], "difficult_direct_answer": false, "rationales": ["The man in red has a carrier attached to him. due to the harness that he is wearing and the poles connecting to the object, he would most likely be pulling it.", "The object has attached to the man, and he is visually going forward with some effort.", "He has it strapped around him so he can pull it behind him. this would be for a child or disabled person to ride in and experience skiing with the man."], "image": "train2014/COCO_train2014_000000477343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415458, "question_id": "NmqF2qhTqWj5DMXnucohGm", "question": "What will NOT happen?", "choices": ["walk", "balk", "strike", "hit"], "correct_choice_idx": 1, "direct_answers": ["home run", "bunt", "strike", "balk", "bunt", "bunt", "batter swing", "home run", "pitching ball", "strike"], "difficult_direct_answer": false, "rationales": ["We don't know what wont happen yet.", "A balk is a play where the pitcher makes a pitching movement but pulls out of it before completing the throw. the pitcher has clearly released the ball as it is in mid air and there is nothing in his hands meaning a balk would be physically impossible at this point.", "The ball player won't balk."], "image": "train2014/COCO_train2014_000000415458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574760, "question_id": "NmwH5V5syjVJLd7UevXmR6", "question": "How hot is the air from a hair dryer?", "choices": ["200-250degf", "100-200degf", "10-50degf", "80-140degf"], "correct_choice_idx": 3, "direct_answers": ["hot", "warm", "moderate", "very hot", "moderate", "80-140degf", "very hot", "warm", "hot", "very"], "difficult_direct_answer": false, "rationales": ["The hair dryer won't burn someone.", "The temperature of the dryer is not super hot.", "The air is 80 degrees."], "image": "train2014/COCO_train2014_000000574760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12552, "question_id": "NnPiZ8ve2esQwyvsqP5vwT", "question": "What is number 44 doing?", "choices": ["running away", "sliding down", "hitting ball", "swinging bat"], "correct_choice_idx": 3, "direct_answers": ["swinging bat", "batting", "batting", "swinging bat", "batting", "swinging bat", "batting", "batting", "swinging bat", "swinging bat"], "difficult_direct_answer": false, "rationales": ["The man in the number 44 shirt is standing still and the ball has not been hit but he is trying to hit it", "We don't know if the batter will make contact with the ball, so \"b\" can't be right. the other two options make no sense.", "Forty four is at bat."], "image": "val2014/COCO_val2014_000000012552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111030, "question_id": "No4JsMK63TVkHXULWqUdFP", "question": "What are the blue signs being used for?", "choices": ["decoration", "advertising", "selling", "directing traffic"], "correct_choice_idx": 3, "direct_answers": ["detour", "directions", "directions", "advertising", "traffic signs", "direction", "directing traffic", "directions", "directions", "directions"], "difficult_direct_answer": false, "rationales": ["They have arrows to show drivers and pedestrians where to go.", "The signs direct traffic.", "The blue signs are road signs that tell drivers where to go."], "image": "val2014/COCO_val2014_000000111030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436808, "question_id": "No55xPJ4LRP8QkeY4tRfBZ", "question": "In which country is this park located?", "choices": ["united states", "venezuela", "mexico", "canada"], "correct_choice_idx": 0, "direct_answers": ["usa", "usa", "united states", "united states", "america", "united states", "usa", "united states", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["This is at a national monument in washington, d.c.", "The united states features the national mall.", "The red, white, and blue flags have stars and stripes."], "image": "val2014/COCO_val2014_000000436808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68032, "question_id": "NoBYu85JNXHtHX7wkzwYJZ", "question": "What brand of TV is in the living room?", "choices": ["sharp", "sanyo", "sony", "lg"], "correct_choice_idx": 3, "direct_answers": ["lg", "lg", "samsung", "lg", "lg", "samsung", "lg", "lg", "lg", "lg"], "difficult_direct_answer": false, "rationales": ["The tv has lg's logo on it.", "You can see the logo for lg on it.", "The television is located on the bottom right of picture. if you look at the bottom of the tv, you can see the lg branded icon."], "image": "train2014/COCO_train2014_000000068032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20133, "question_id": "NoGPukkFCSMpGLV5f7oMsz", "question": "How is the phone being powered?", "choices": ["kinetic energy", "solar", "d/c", "a/c"], "correct_choice_idx": 1, "direct_answers": ["solar power", "solar power", "powerbank", "sun", "battery", "sun", "solar", "solar", "solar power", "solar"], "difficult_direct_answer": false, "rationales": ["The object to the right of the phone has photovoltaic panels. it is being used to power the phone.", "The phone is plugged into a charger that has three solar panels that power the phone.", "The phone is powered with solar panels."], "image": "train2014/COCO_train2014_000000020133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419019, "question_id": "NoMBaopBksNKwb8J7JTNcQ", "question": "Where is the girl sitting at?", "choices": ["home", "bank", "restaurant", "library"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "near window", "windowsill", "windowsill", "window seat", "window sill", "window bench", "windowsill", "window", "window"], "difficult_direct_answer": false, "rationales": ["The girl is at a restaurant.", "The girl is in an establishment that has multiple tables in one room.", "The little girl is sitting on a window seal by a dining room table."], "image": "train2014/COCO_train2014_000000419019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215787, "question_id": "NoPsUtT64VpjhiTH6hbruM", "question": "What is she putting on the bun?", "choices": ["apron", "cap", "condiments", "weiner"], "correct_choice_idx": 2, "direct_answers": ["toppings", "sauce", "sauce", "meat", "meat", "condiments", "toppings", "hot dog", "relish", "relish"], "difficult_direct_answer": false, "rationales": ["She is putting mustard and ketchup on the hot dog bun. the bottles are next to her.", "She's holding a hot dog and you put things on it to make it taste even better. there are bottles right next to her for the food.", "It is a hot dog which usually has condiments thar are spread, and the branding of the condiments can be seen on the table."], "image": "val2014/COCO_val2014_000000215787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237783, "question_id": "Np472RiVVKU8FZtVLpufa3", "question": "What is the woman ready to do?", "choices": ["catch", "serve", "dunk", "dribble"], "correct_choice_idx": 0, "direct_answers": ["catch", "catch", "catch frisbee", "catch", "frisbee", "catch", "catch", "catch", "catch", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["The frisbee is flying through the air towards the woman and her hand is outstretched, so she's positioned to catch it when it reaches her.", "She is ready to catch the frisbee that is being thrown to her.", "A white flying disc is moving towards the woman. there are no balls near her."], "image": "val2014/COCO_val2014_000000237783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249158, "question_id": "NpCBikThFUhZn4JdsCCh2S", "question": "This road is good for what type of driver?", "choices": ["speeder", "impatient", "sightseer", "sleepy"], "correct_choice_idx": 2, "direct_answers": ["sightseer", "car", "good", "tourist", "tourist", "scenic drivers", "scenic", "scenic drive", "good driver", "pleasure seeking"], "difficult_direct_answer": true, "rationales": ["There is a sign that says \"scenic drive\".", "The sign tells about the scenes of the road.", "This is for tourists to see the area"], "image": "val2014/COCO_val2014_000000249158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354298, "question_id": "NqDeNkVdJmhYKbjERpNggw", "question": "What are they doing with the food on the plate?", "choices": ["decorating it", "cooking it", "trashing it", "eating it"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eating", "eat", "eating pizza", "eating", "eating it", "dropped it", "eating", "serving it"], "difficult_direct_answer": false, "rationales": ["There are bite marks in the food", "The people are eating slices of pizza that are served on the plate.", "Pizza has been cooked and is ready to eat. don't need to waste it by trashing it and you don't decorate pizza."], "image": "train2014/COCO_train2014_000000354298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368479, "question_id": "NqU3j6ULcytu9UaEznDpEK", "question": "What are the square tubes hooked to side of plane for?", "choices": ["rain insurance", "insulation", "loading baggage", "loading passengers"], "correct_choice_idx": 3, "direct_answers": ["boarding", "loading passengers", "passengers footpath", "passenger boarding", "passengers", "loading", "boarding bridge", "loading passengers", "passenger boarding", "windows"], "difficult_direct_answer": false, "rationales": ["The boarding bridge is used to allow the travelers to board the plane.", "The tubes are attached on the plane where the doors would most likely be based on standard plane design. this is likely to allow passengers to directly enter or exit the plane without walking on the tarmac.", "These metal corridors attach to the doors of the plane for people to board."], "image": "train2014/COCO_train2014_000000368479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333101, "question_id": "NqWTbNsKitigJ8vbgGZ87C", "question": "What material is the girl's wig made of?", "choices": ["denim", "yarn", "wool", "nylon"], "correct_choice_idx": 3, "direct_answers": ["synthetic hair", "ribbons", "ribbon", "fake hair", "nylon", "plastic", "synthetics", "plastic", "hair", "hair"], "difficult_direct_answer": false, "rationales": ["The girl's wig is made of nylon material.", "The material is shiny and flexible and strong and thin.", "The girl's wig is made of artificial fibers made of nylon that look like hair."], "image": "val2014/COCO_val2014_000000333101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436453, "question_id": "NrZmCtNwbf7ycFyLV8cAwA", "question": "Why is this book appropriate for her to read?", "choices": ["school book", "nursery rhymes", "romance novel", "bible"], "correct_choice_idx": 1, "direct_answers": ["children's material", "mother herself", "show book", "nursery rhyme", "age appropriate", "nursery rhyme", "nursery", "mother goose", "fairytales", "nursery rhymes"], "difficult_direct_answer": true, "rationales": ["There are toddlers in bed.", "Two children lay in a bed with a woman holding a children's book sitting on the edge of the bed.", "The book has rhymes."], "image": "train2014/COCO_train2014_000000436453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294702, "question_id": "NrgrvoFt65wsov498jBzSR", "question": "What is the dark sauce in the bowl?", "choices": ["tomato sauce", "bbq sauce", "salsa", "gravy"], "correct_choice_idx": 1, "direct_answers": ["barbeque", "steak sauce", "ketchup", "spaghetti", "bbq sauce", "ketchup", "steak sauce", "ketchup", "barbecue", "barbeque"], "difficult_direct_answer": false, "rationales": ["The sauce is brown and often served with meat.", "It is darker than tomato sauce or salsa", "Barbecue sauce is red and is commonly served with sandwiches and meat."], "image": "train2014/COCO_train2014_000000294702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131780, "question_id": "NriQ3YkfhHyCmTuQSR64gM", "question": "What sea creature is the blue balloon?", "choices": ["eel", "squid", "shark", "octopus"], "correct_choice_idx": 3, "direct_answers": ["octopus", "squid", "octopus", "whale", "whale", "squid", "octopus", "squid", "squid", "squid"], "difficult_direct_answer": false, "rationales": ["The individual tentacles of the octopus along with the suction cups can be seen.", "There is an animal with many tentacles.", "The many tentacles and round head of the kite which is not supposed to be a whale in this image tells us it's an octopus."], "image": "train2014/COCO_train2014_000000131780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389159, "question_id": "NrmTejUF6paHTxWDBt36oR", "question": "This bus is decorated as what?", "choices": ["greenhouse", "school", "weeding venue", "wedding"], "correct_choice_idx": 0, "direct_answers": ["greenhouse", "garden", "nature", "forest", "garden", "plants", "plants", "garden", "garden", "jungle"], "difficult_direct_answer": false, "rationales": ["There are plants all over it with some flowers.", "The bus is a greenhouse.", "There are plants all throughout that would be consistent with answer a."], "image": "train2014/COCO_train2014_000000389159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136699, "question_id": "NropMkGNuhBeafje4R3U2d", "question": "The items above are likely to be owned by a?", "choices": ["child", "grand father", "female", "male"], "correct_choice_idx": 2, "direct_answers": ["teenage girl", "woman", "teenage girl", "teenager", "woman", "woman", "female", "woman", "woman", "woman"], "difficult_direct_answer": false, "rationales": ["Most people who carry purses are female and they're usually the ones wearing the makeup items seen.", "The items visible appear to be feminine in nature and there are many items used for women that are visible.", "The items are for females."], "image": "train2014/COCO_train2014_000000136699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492407, "question_id": "NrvtdcBZt6Lfst5mmqKMtk", "question": "Who likely owns these horses?", "choices": ["circus", "rancher", "zookeeper", "jockey"], "correct_choice_idx": 1, "direct_answers": ["rancher", "farmer", "rancher", "farmer", "stable", "rancher", "rancher", "farmer", "farmer", "rancher"], "difficult_direct_answer": false, "rationales": ["The horses are on a ranch.", "The horses are a rancher's.", "They look to be on a nice farm."], "image": "val2014/COCO_val2014_000000492407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298535, "question_id": "NsFFvRDN98quieFn3arZuw", "question": "What are the red vegetables called?", "choices": ["tomato", "radish", "beet", "carrot"], "correct_choice_idx": 1, "direct_answers": ["radish", "radish", "onions", "radish", "radish", "beets", "radish", "radishes", "radishes", "beets"], "difficult_direct_answer": false, "rationales": ["The object in question is the right size, shape and color to be consistent with answer a.", "Radishes are round red vegetables with a green top.", "The red vegetable is a radish."], "image": "train2014/COCO_train2014_000000298535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18040, "question_id": "NsM9nawEsds5LqDiJzAc78", "question": "Do these animals typically live in the United States?", "choices": ["unsure", "maybe", "yes", "no"], "correct_choice_idx": 3, "direct_answers": ["no", "no", "not typically", "no", "no", "no", "zoo", "no", "yes", "no"], "difficult_direct_answer": false, "rationales": ["The animals are zebras and giraffes which are native to africa and not found in the united states naturally.", "The animals are clearly visible and identifiable and are known to habitat africa and not the united states.", "No they live where the weather is warmer in places like africa."], "image": "train2014/COCO_train2014_000000018040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485526, "question_id": "NsaPbLp3YoQCT6BFFmSBfG", "question": "What is the orange tool used to do?", "choices": ["juice citrus", "peel veggies", "sift grains", "strain liquids"], "correct_choice_idx": 0, "direct_answers": ["juice fruit", "juice", "making juice", "juice citrus", "juicer", "juicing", "juice", "juice fruits", "juice it", "squeeze juice"], "difficult_direct_answer": true, "rationales": ["The tool is used to juice.", "It's used to get the juice out of an orange.", "It has the shape of the fruit"], "image": "train2014/COCO_train2014_000000485526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390608, "question_id": "NskDF8fStFwdMWAXUJGcK9", "question": "What is used the make the fries have that shape?", "choices": ["crinkle cutter", "spatula", "fork", "steak knife"], "correct_choice_idx": 0, "direct_answers": ["machine", "crinkle cutter", "crinkle cutter", "curved cutter", "knife", "cutter", "crinkle cutter", "machine", "slicer", "machine"], "difficult_direct_answer": false, "rationales": ["They are made with a special knife that cuts the indents in them.", "The cutter is used.", "The fries are crinkly."], "image": "train2014/COCO_train2014_000000390608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520657, "question_id": "NsoY6E3ERM3UUJMj9UqFEx", "question": "What is in the bowl on the back right of the tray?", "choices": ["eggs", "veal", "shrimp", "pancakes"], "correct_choice_idx": 0, "direct_answers": ["eggs", "oatmeal", "eggs", "eggs", "eggs", "eggs", "eggs", "oatmeal", "eggs", "oatmeal"], "difficult_direct_answer": false, "rationales": ["The items in the bowl are white and yellow. the yellow parts are yolks.", "There are two sunny side up eggs in the bowl.", "Eggs are fried over easy."], "image": "train2014/COCO_train2014_000000520657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412760, "question_id": "NsyfesrZJqpXkjkYkR5o3E", "question": "Where are these animals located?", "choices": ["mountains", "beach", "desert", "arctic"], "correct_choice_idx": 0, "direct_answers": ["outdoors", "river", "mountain", "mountains", "riverbank", "river", "river", "wild", "creek", "mountains"], "difficult_direct_answer": false, "rationales": ["The animals are in the mountains.", "The animals seem used to the rocky terrain.", "The ground rises above them in the back"], "image": "train2014/COCO_train2014_000000412760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241453, "question_id": "NtLoMvbqjUh6ycPe4HZGGv", "question": "Where are the horses?", "choices": ["rainforest", "forest", "ranch", "beach"], "correct_choice_idx": 2, "direct_answers": ["early 1900s", "outside", "field", "all sold", "ranch", "outside", "no horses", "by house", "no horses", "outside"], "difficult_direct_answer": false, "rationales": ["They are standing outside of a ranch style home.", "There is a building in the background so they are in some sort of domestic setting.", "This type of environment would be a. and these are cows."], "image": "val2014/COCO_val2014_000000241453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116043, "question_id": "Ntr3BDf5TGwAv8jjtwJ4ct", "question": "Which region of the world are the elephants in?", "choices": ["africa", "america", "europe", "asia"], "correct_choice_idx": 3, "direct_answers": ["bangkok", "asia", "africa", "africa", "africa/asia", "asia", "asia", "thailand", "india", "eastern"], "difficult_direct_answer": false, "rationales": ["The man ridding the elephant is asian.", "The region is asia.", "These elephants have smaller ears so they are asian elephants rather than african."], "image": "train2014/COCO_train2014_000000116043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300655, "question_id": "Nty9hWka5jk4X3t87Myp7K", "question": "What does the person in the yellow shirt stand on?", "choices": ["dugout", "bird stand", "visitor stands", "mound"], "correct_choice_idx": 3, "direct_answers": ["catcher's mound", "mound", "mound", "mound", "pitchers mound", "sand", "piture's mound", "pitching mound", "mound", "mound"], "difficult_direct_answer": false, "rationales": ["A pitcher's mound is found on baseball diamonds and is where the pitcher stands to pitch balls.", "The man is pitching", "The person is on the mound."], "image": "val2014/COCO_val2014_000000300655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106335, "question_id": "Ntyiv4CHDtg7ax3FG6V7JB", "question": "Considering the direction of traffic where in Asia is this intersection?", "choices": ["south korea", "china", "japan", "vietnam"], "correct_choice_idx": 2, "direct_answers": ["left side", "middle", "hong kong", "tokyo", "japan", "japan", "america", "hong kong", "japan", "hong kong"], "difficult_direct_answer": false, "rationales": ["Japanese drivers drive in this direction.", "The country is japan.", "There is japanese text on the signs."], "image": "val2014/COCO_val2014_000000106335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463680, "question_id": "Nu2NSuchnCQbXwMQPygazh", "question": "Where is this bathroom most likely located?", "choices": ["school", "apartment", "mansion", "motorhome"], "correct_choice_idx": 3, "direct_answers": ["house", "apartment", "camper", "airplane", "motorhome", "rv camper", "camper", "hotel", "hotel", "bottom left"], "difficult_direct_answer": false, "rationales": ["The bathroom is extremely small compared to most bathrooms, so it is likely to be on a transportation vehicle.", "It is very compact with small fixtures", "Depending on the country, this also could be a c or d bathroom, especially if the latter has a boarding setup."], "image": "train2014/COCO_train2014_000000463680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144796, "question_id": "NuE7eLM6wTrRGAJzqSvRtd", "question": "What culture is this dish from?", "choices": ["french", "american", "british", "asian"], "correct_choice_idx": 3, "direct_answers": ["chinese", "japanese", "asia", "england", "chinese", "indian", "asian", "asian", "china", "asian"], "difficult_direct_answer": false, "rationales": ["The rice and dumplings present in this food tell us it's likely eastern.", "The dish features beef and broccoli, rice and dumplings which are chinese food.", "Filled dumplings are a staple in this cuisine"], "image": "train2014/COCO_train2014_000000144796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167028, "question_id": "NuMV6MujEmDFZbRT9cGMpR", "question": "Why is he holding the bowl?", "choices": ["is selling", "is empty", "is hungry", "is stealing"], "correct_choice_idx": 1, "direct_answers": ["serve food", "passing", "man", "passing food", "serving", "serving", "eating", "moving it", "to serve", "is empty"], "difficult_direct_answer": true, "rationales": ["The bowl is very empty.", "Because he wants to serve himself food.", "When there is no longer anything present, a refill is required."], "image": "train2014/COCO_train2014_000000167028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12421, "question_id": "NuU6RyupPykoDi5eFAA4En", "question": "The colors of the train resemble the typical colors of what?", "choices": ["blue jay", "firetruck", "lemon", "lime"], "correct_choice_idx": 1, "direct_answers": ["fire trucks", "red cross", "firetruck", "flags", "fire truck", "firetrucks", "firetruck", "tomatoes", "apple", "blood"], "difficult_direct_answer": true, "rationales": ["The train is red just like a fire engine.", "Red is the color of a firetruck.", "The colors are a firetruck."], "image": "train2014/COCO_train2014_000000012421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197585, "question_id": "NuzvqLZscsXuEMLcGrXqpa", "question": "Where are they playing a game?", "choices": ["beach", "park", "stadium", "gym"], "correct_choice_idx": 1, "direct_answers": ["field", "field", "park", "frisbee", "field", "field", "park", "field", "outdoors", "field"], "difficult_direct_answer": false, "rationales": ["This is a wide grassy area.", "The people are outside, not inside a stadium or gym. the ground is covered with grass, not sand.", "The place is a green field."], "image": "train2014/COCO_train2014_000000197585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420524, "question_id": "NvDaoRcTwUCKmW2xbtUFQx", "question": "What item is sharpest here?", "choices": ["pizza crust", "cutting board", "pizza cutter", "peppers"], "correct_choice_idx": 2, "direct_answers": ["pizza cutter", "pizza cutter", "pizza cutter", "slicer", "slicer", "pizza cutter", "knife cutter", "pizza cutter", "pizza cutter", "pizza cutter"], "difficult_direct_answer": false, "rationales": ["The pizza cutter has a blade that can pierce the crust.", "That is used to cut the pizza.", "The cutter is sharp."], "image": "train2014/COCO_train2014_000000420524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399354, "question_id": "NvNUB6ndaZPiDuE8QgvqKX", "question": "What will the man read when done texting?", "choices": ["directions", "manual", "paper", "book"], "correct_choice_idx": 2, "direct_answers": ["newspaper", "newspaper", "newspapers", "newspaper", "newspapers", "paper", "newspaper", "newspaper", "newspaper", "paper"], "difficult_direct_answer": false, "rationales": ["The man has brought a newspaper or two, which are visible on the seat next to him, for reading on the train. depending on how long his trip is, he may well go back to his messages before his trip ends.", "The newspaper is. next to him to read when he is ready.", "There is a newspaper next to the man so he will probably read that."], "image": "train2014/COCO_train2014_000000399354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532644, "question_id": "NvXDE7KCa5KWugoxbxJaSn", "question": "Which part of the Elephant's body work to cool their body?", "choices": ["leg", "trunk", "ear", "skin"], "correct_choice_idx": 1, "direct_answers": ["trunk", "skin", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunks", "ears"], "difficult_direct_answer": false, "rationales": ["They pick up water and throw it back", "The elephants use their nose to get water and spray it over themselves to keep cool.", "The elephant's trunk is close to the body of water under it. since the elephant can suck up and blow out liquid, it would make sense that it would use it to cool its body off."], "image": "train2014/COCO_train2014_000000532644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576015, "question_id": "NvXjQMBdvqneaSKrNYhB68", "question": "What does the orange sign ahead indicate?", "choices": ["caution", "yield", "merge", "construction zone"], "correct_choice_idx": 3, "direct_answers": ["construction", "construction", "warning", "road work", "construction", "construction", "caution", "construction", "continue", "construction zone"], "difficult_direct_answer": false, "rationales": ["The sign is for construction.", "Orange is a color that is commonly used to alert drivers of road work or construction.", "The orange sign indicates construction."], "image": "val2014/COCO_val2014_000000576015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516607, "question_id": "NvhPdrDzAHVw3yRPPuSvpY", "question": "What diet are the cows here on?", "choices": ["vegan", "carnivorous", "milk", "fasting"], "correct_choice_idx": 0, "direct_answers": ["vegetarian", "grass", "hay", "hay", "hay", "hay", "grass", "grass", "hay", "vegan"], "difficult_direct_answer": false, "rationales": ["The cows are eating hay.", "Cows are grazing in a pen.", "The cows are eating hay."], "image": "train2014/COCO_train2014_000000516607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546575, "question_id": "NvuRg23mXCTmq7TuukTrTq", "question": "What can be thrown in the green receptacle?", "choices": ["bottles", "electronics", "food", "dirt"], "correct_choice_idx": 0, "direct_answers": ["trash", "bottles", "recyclables", "recycling", "trash", "skateboard", "recycled products", "plastic bottles", "cans", "recycling"], "difficult_direct_answer": false, "rationales": ["The green receptacles have the universal icon of recycling. of the many items that can be recycled, bottles are something that is commonly targeted.", "The sign on the receptacle is for recycling, and bottles can be recycled.", "These are recycling bins"], "image": "train2014/COCO_train2014_000000546575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411962, "question_id": "NvyfjNMBE4NTUt8s2TNkng", "question": "What natural element might be found here?", "choices": ["earth", "air", "water", "fire"], "correct_choice_idx": 3, "direct_answers": ["fire", "fire", "fire", "fire", "light", "wood", "wood", "wood fire", "lights", "fire"], "difficult_direct_answer": false, "rationales": ["There is a wood burning fireplace in the room.", "There is a fireplace in the corner where a fire could be lit.", "There is an alcove with a chimney attached that is meant for burning items to warm the room."], "image": "train2014/COCO_train2014_000000411962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436333, "question_id": "NwEBwSwsTDQsaRiVXm27Y7", "question": "What are the two girls in front doing?", "choices": ["selling animals", "waiting", "standing line", "stealing animals"], "correct_choice_idx": 1, "direct_answers": ["gifts", "standing", "holding toys", "holding toys", "holding bears", "holding toys", "looking camera", "posing", "waiting", "holding bears"], "difficult_direct_answer": false, "rationales": ["The two little girls are standing while the adults in the background are engaging with other adults.", "The two girls are standing in wait with a line.", "The two girls are standing in front and waiting for the grownups before they leave."], "image": "train2014/COCO_train2014_000000436333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529389, "question_id": "NwbNsRvgKX3NkeFe54u9Wi", "question": "What are they looking at?", "choices": ["photographer", "dog", "walls", "table"], "correct_choice_idx": 0, "direct_answers": ["photographer", "camera", "camera person", "camera", "you", "camera", "people", "camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The dog is staring into the camera.", "Both the dog and person in this image's attention are focused at the camera that took this picture and the person who took it.", "The dog is standing in front of the person who's taking the picture and the dog pants, as they stare at the camera person."], "image": "train2014/COCO_train2014_000000529389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271906, "question_id": "NwjTZoXLNTPqYSyuiFqkEW", "question": "What is the man in red ready to do?", "choices": ["dribble", "serve", "dunk", "swing"], "correct_choice_idx": 3, "direct_answers": ["hit baseball", "bat", "swing", "hit", "bat", "bat", "hit baseball", "bat", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The other options apply to other sports.", "That's what batters do when they have a ball pitched to them.", "The man is holding the bat and looking at the ball."], "image": "train2014/COCO_train2014_000000271906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17401, "question_id": "NwobHBTPT9FJ9sxy8hESmC", "question": "What fills the pastry here?", "choices": ["dye", "jelly", "cheese", "honey"], "correct_choice_idx": 1, "direct_answers": ["jelly", "jelly", "jelly", "jam", "jelly", "jelly", "fruit filling", "jelly", "jelly", "jelly"], "difficult_direct_answer": false, "rationales": ["This is a fruit spread", "Just from looking at the color of the inside of this pastry, it's obviously some kind of jelly. if it was a cream filling, we would see an off-white color, but no, this is purple, and it's jelly.", "There is jelly in the middle of this donut."], "image": "train2014/COCO_train2014_000000017401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249397, "question_id": "Nx84GkGe5prvD4yQeTqahV", "question": "Which color animal has hair that is more easily dyed?", "choices": ["white", "patterned", "brown", "black"], "correct_choice_idx": 0, "direct_answers": ["white", "white", "white", "white lamb", "white", "white lamb", "white", "white", "white", "white lamb"], "difficult_direct_answer": false, "rationales": ["Since it's light in color it can take on darker shades", "Dark hair or fur is harder to change the color of.", "The lighter color would be easier to dye."], "image": "train2014/COCO_train2014_000000249397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179960, "question_id": "NxHjiiWvZX4YeHUTSTpTRT", "question": "What type volleyball is being played here?", "choices": ["lawn", "tennis", "beach", "professional surface"], "correct_choice_idx": 2, "direct_answers": ["beach", "beach", "beach", "beach", "beach", "beach", "beach", "beach", "beach volleyball", "beach volleyball"], "difficult_direct_answer": false, "rationales": ["Based on the amount of sand, it is safe to assume there is a body of water near by.", "The playing surface is outside and is made out of sand, not grass or wood. it is not a tennis court.", "Beach volleyball is played in the sand."], "image": "val2014/COCO_val2014_000000179960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65948, "question_id": "NxJnge3dKjFGrKYduwG9mg", "question": "What kind of shirt is the heavier man wearing?", "choices": ["none", "red", "checkered", "long sleeve"], "correct_choice_idx": 2, "direct_answers": ["plaid shirt", "plaid", "plaid", "checkered", "plaid", "checkered", "plaid", "plaid", "flannel shirt", "buttondown"], "difficult_direct_answer": false, "rationales": ["The pattern of the man's shirt is alternating black and white squares.", "The square, multi-colored pattern denotes the type of shirt described.", "The shirt is checkered."], "image": "val2014/COCO_val2014_000000065948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497660, "question_id": "NxV4hkXHrAybXCpaFUsdtd", "question": "Why are the men's vest/coat orange in color?", "choices": ["fashion", "camouflage", "dress code", "visibility"], "correct_choice_idx": 3, "direct_answers": ["attention", "safety", "safety", "worker", "high visibility", "visibility", "visibility", "visibility", "visibility", "trash men"], "difficult_direct_answer": false, "rationales": ["The vest is for visibility.", "The man where the above color to be easily seen by the passenger.", "The man wants to be seen by the truck."], "image": "val2014/COCO_val2014_000000497660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393900, "question_id": "NxZ5RBFHZe2ZmevND4iREs", "question": "This vehicle is most likely from?", "choices": ["turkey", "mexico", "afghanistan", "south korea"], "correct_choice_idx": 3, "direct_answers": ["south korea", "asia", "asia", "asia", "south korea", "asia", "korea", "south korea", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The plane says it's from asia.", "The branding on the side of the airplane states an airline that originates in korea.", "Asian airlines is from seoul."], "image": "train2014/COCO_train2014_000000393900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326823, "question_id": "NxaqCT8bv3UFuqCcfyYoL9", "question": "What fruited type pizza is being cut into here?", "choices": ["olive", "pineapple", "mushrooms", "meat"], "correct_choice_idx": 0, "direct_answers": ["hawaiian", "olive", "olives", "olive", "olive", "pineapple", "hawaiian", "hawaiian", "hawaiian", "fruitless"], "difficult_direct_answer": false, "rationales": ["The fruit is round and black, and is a common pizza topping.", "Olives are scattered among the cheese.", "There are olives on the pizza."], "image": "train2014/COCO_train2014_000000326823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423363, "question_id": "NxdbR4VnkMuSfsndvpNVBN", "question": "Where is the man holding the cooler likely headed?", "choices": ["wedding", "safari", "prison", "olympics"], "correct_choice_idx": 0, "direct_answers": ["beach", "wedding", "wedding", "beach", "travelling", "wedding", "wedding", "wedding", "wedding", "beach"], "difficult_direct_answer": false, "rationales": ["The attire of the couple is appropriate for a wedding.", "The man is in a black tuxedo next to a woman in a white dress. this is a combination of clothing styles that is typically worn at a wedding and nowhere else.", "Their formal dress suggests ceremony attendance"], "image": "train2014/COCO_train2014_000000423363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255339, "question_id": "NxuGocR75cG9293a2D8tQX", "question": "What's the name of the pendant on the woman's necklace?", "choices": ["dagger", "ankh", "staff", "cross"], "correct_choice_idx": 1, "direct_answers": ["ankh", "ankh", "ankh", "cross", "cross", "ankh", "ankh", "female", "cross", "ankh"], "difficult_direct_answer": false, "rationales": ["A woman is wearing a necklace with a charm on it.", "A woman is wearing a golden egyptian necklace around her neck.", "The pendant is an ankh."], "image": "val2014/COCO_val2014_000000255339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409165, "question_id": "NxvuzBBaQ6QQznMDwyCz6Y", "question": "What is he analyzing?", "choices": ["target location", "sand trap", "net height", "opponent's position"], "correct_choice_idx": 3, "direct_answers": ["opponent stance", "serve", "tennis", "opponent", "opponent", "serving zone", "serve", "opponent", "other player", "opponent's position"], "difficult_direct_answer": false, "rationales": ["He is looking over the other player on where they might hit.", "The man is looking at where his opponent is standing on the other side of the net so he can aim his shot.", "The man is looking towards his opponent."], "image": "train2014/COCO_train2014_000000409165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240274, "question_id": "NxwXZYNCJdumdAYFDzVKnH", "question": "Why is he dry?", "choices": ["just modeling", "before surfing", "sand surfing", "dried off"], "correct_choice_idx": 1, "direct_answers": ["poser", "not surfing", "didn't swim", "photos only", "outof water", "on sand", "finished", "before surfing", "not skiing", "not surfing"], "difficult_direct_answer": true, "rationales": ["A man is standing on a beach with a surfboard in jeans and a dry shirt.", "He is just posing for a picture.", "The man is wearing jeans still."], "image": "val2014/COCO_val2014_000000240274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378070, "question_id": "Ny8aiyPsJNnKdyJeFYESk9", "question": "What part of her is probably most cold?", "choices": ["back", "head", "legs", "fingers"], "correct_choice_idx": 3, "direct_answers": ["fingers", "fingers", "fingers", "fingers", "fingers", "nose", "hands", "fingers", "face", "hand"], "difficult_direct_answer": false, "rationales": ["The gloves don't cover her fingers.", "The woman has fingerless gloves on and her fingertips are exposed to the cold air.", "The person's face is not covered, but everything else is."], "image": "train2014/COCO_train2014_000000378070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209493, "question_id": "NyJStXmkdvKWqDvumpSghZ", "question": "Which European language does the person using the mouse speak?", "choices": ["russian", "english", "german", "french"], "correct_choice_idx": 2, "direct_answers": ["german", "ukranian", "german", "german", "french", "german", "german", "german", "german", "italian"], "difficult_direct_answer": false, "rationales": ["German text is on the heart.", "The language is german.", "The german phrase for \"i love you\" is written on the red heart."], "image": "train2014/COCO_train2014_000000209493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91057, "question_id": "NyN7oJ6jhi5DRVGhcyPsUc", "question": "What is the dog doing with the thing in its mouth?", "choices": ["vomiting", "eating", "playing", "choking"], "correct_choice_idx": 2, "direct_answers": ["biting", "chewing", "chewing", "chewing", "chewing", "chewing", "biting", "chewing", "biting", "playing"], "difficult_direct_answer": false, "rationales": ["The dog seems to be eating the slice of pizza shown here.", "The object the dog has is obviously a toy so he is not eating it and he is not in any distress from vomiting or choking", "The thing in the dog's mouth is a dog toy."], "image": "train2014/COCO_train2014_000000091057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236611, "question_id": "NytbBDLt6u66T5dsAQhbEE", "question": "What does the dog want to do with the frisbee?", "choices": ["catch it", "avoid it", "throw it", "eat it"], "correct_choice_idx": 0, "direct_answers": ["catch", "catch", "catch", "catch", "catch it", "chase", "catch it", "catch", "catch it", "catch it"], "difficult_direct_answer": false, "rationales": ["The dog is at a frisbee-catching show.", "The dog wants to catch the frisbee.", "He is jumping up to catch it."], "image": "train2014/COCO_train2014_000000236611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98479, "question_id": "NyyMtQhFVtfWF5DAgo7mCE", "question": "Why are the men wearing headphones?", "choices": ["keep warm", "listen music", "fashion", "protect ears"], "correct_choice_idx": 3, "direct_answers": ["protect ears", "noise control", "engine noise", "too loud", "noise reduction", "on truck", "noise", "hearing protection", "noise control", "hearing protection"], "difficult_direct_answer": false, "rationales": ["In this setting the planes are very noisy due to the large engines.", "They are in at a very noisy location and need some relief.", "The men are wearing headphones to cover their ears and protect them from loud sounds."], "image": "train2014/COCO_train2014_000000098479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309412, "question_id": "NyzyJNvQmjBgEt5wLfzKsY", "question": "What zone is this area?", "choices": ["tourist", "residential", "shopping", "business"], "correct_choice_idx": 0, "direct_answers": ["loading", "railroad crossing", "loading", "tourist", "station", "town", "train station", "train station", "railroad crossing", "train crossing"], "difficult_direct_answer": false, "rationales": ["The main building in this photo is quite colorful and appears to one to attract people. coupled with the train stop in this location, it would seem that this zone was built for tourists.", "This is an area for tourists.", "Visitors are this place's main customers"], "image": "train2014/COCO_train2014_000000309412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209603, "question_id": "NzQLekpHhLTXcVd96aBbTz", "question": "What brand camera does the food photographer prefer?", "choices": ["panasonic", "polaroid", "canon", "nikon"], "correct_choice_idx": 3, "direct_answers": ["nikon", "nikon", "nikon", "nikon", "nikon", "nikon", "nikon", "nikon", "nikon", "nikon"], "difficult_direct_answer": false, "rationales": ["There is a logo on the camera strap. it is not a polaroid, canon, or panasonic logo.", "There is a person taking a photo of the food on the table and the strap that the camera has on it has the name of the camera, which is nikon.", "The brand is nikon."], "image": "train2014/COCO_train2014_000000209603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307758, "question_id": "NzZC6nYJzqFsTih4gMjYN8", "question": "Which topping gives you the most vitamin C?", "choices": ["peppers", "onion", "cheese", "olive"], "correct_choice_idx": 0, "direct_answers": ["pepper", "peppers", "peppers", "tomato", "peppers", "peppers", "peppers", "tomato", "fruits", "peppers"], "difficult_direct_answer": false, "rationales": ["The topping is peppers.", "Peppers are a vegetable with a lot of vitamin c.", "Peppers have a high amount of vitamin c."], "image": "train2014/COCO_train2014_000000307758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375200, "question_id": "NzhY7jmPdt6yaEsH8EzYXe", "question": "What kind of snack can you get at the business on this street corner?", "choices": ["hotdog", "spaghetti", "submarine sandwich", "falafel"], "correct_choice_idx": 0, "direct_answers": ["hotdog", "hot dog", "papaya dog", "hot dog", "hot dog", "hot dog", "hot dog", "papaya", "papaya", "pizza"], "difficult_direct_answer": false, "rationales": ["One that has papaya somehow.", "The store on the corner is papaya dog and they sell hot dogs.", "The cart has hot dogs."], "image": "train2014/COCO_train2014_000000375200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503008, "question_id": "NzxdmAzd3HMmJBBsTQfHqg", "question": "What does this person try to get the cows to do?", "choices": ["move", "dance", "die", "give milk"], "correct_choice_idx": 0, "direct_answers": ["follow line", "attack", "move on", "move", "walk", "keep moving", "stay lined", "wave flag", "move forward", "move"], "difficult_direct_answer": true, "rationales": ["This man encourages this group of cows to proceed out of and down the street with his red flag.", "The person wants the cows to walk.", "A person is running the cows away with a red flag."], "image": "val2014/COCO_val2014_000000503008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199437, "question_id": "P28cxqzsTUeP2d9hDPmdUk", "question": "What does the item on the far right do?", "choices": ["slices meat", "warms room", "destroys dna", "cuts cake"], "correct_choice_idx": 0, "direct_answers": ["slice meats", "slice meat", "slice meat", "bake", "slice meat", "slice", "slices meat", "cut meat", "slicer", "slice meat"], "difficult_direct_answer": false, "rationales": ["The machine is used to slice meat.", "The item slices meat.", "There are meats visible on the item to the right. there is also a blade that is consistent with answer a."], "image": "val2014/COCO_val2014_000000199437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55727, "question_id": "P2HnRqmmAJtkUboAi4Dj6Y", "question": "Why does the room appear curved?", "choices": ["warped wood", "circular building", "fisheye lens", "earthquake"], "correct_choice_idx": 2, "direct_answers": ["camera lens", "fisheye lens", "panorama shot", "photography", "camera filter", "panoramic", "panorama view", "fisheye lens", "panoramic lens", "angle"], "difficult_direct_answer": true, "rationales": ["The room has been shot using a particular view. it is not shaped like this.", "The photographer is using a fisheye lens.", "The room has a lens."], "image": "val2014/COCO_val2014_000000055727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58609, "question_id": "P33ZCdH2iv9qX6whuCzrvX", "question": "What type of candle is on the table?", "choices": ["floating", "votive", "pillar", "taper"], "correct_choice_idx": 1, "direct_answers": ["tea", "wax", "lit candle", "votive", "votive", "wax", "votive", "votive", "small", "votive"], "difficult_direct_answer": false, "rationales": ["The candle is votive and is visible behind the glass of water.", "It is short and inside a glass holder", "The candle is votive."], "image": "train2014/COCO_train2014_000000058609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358466, "question_id": "P36q9ALLUHgGpBf8GgHsCF", "question": "What item is probably at the highest elevation?", "choices": ["grass", "kite", "roofs", "mountain"], "correct_choice_idx": 3, "direct_answers": ["plane", "kite", "mountain", "kite", "airplane", "mountain", "kite", "clouds", "airplane", "kite"], "difficult_direct_answer": false, "rationales": ["The mountain has the highest elevation.", "There is a mountain in the back of the field.", "A woman stand near a couple of buildings with mountains in the background."], "image": "train2014/COCO_train2014_000000358466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381999, "question_id": "P3QyiALvARi3XWZkjd2ma7", "question": "Why is the guy's neck red?", "choices": ["hair dye", "makeup", "blushing", "sunburn"], "correct_choice_idx": 3, "direct_answers": ["sunburn", "sunburn", "sun line", "sunburn", "sunburn", "sun burnt", "sunburn", "sunburn", "sun", "sunburn"], "difficult_direct_answer": false, "rationales": ["The guy is outside and tanned, implying he spends a lot of time outside. spending too much time outside on really hot days can lead to sunburns without the proper protection.", "He has been in the sun too long.", "The man has been burned by the sun."], "image": "val2014/COCO_val2014_000000381999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306249, "question_id": "P3RFaAVGjZERBRCHTRsRw6", "question": "What type of floor has been laid under the kitchen table?", "choices": ["linoleum", "vinyl", "hardwood", "tile"], "correct_choice_idx": 3, "direct_answers": ["wooden", "wood", "wooden", "tile", "wood", "wood", "wood floor", "wood", "hardwood", "wood"], "difficult_direct_answer": false, "rationales": ["Wood grain can be seen in the floor which is long planks.", "The flooring is of a style, color and shape consistent with answer c.", "This is tile floor that has been laid down for them to walk on."], "image": "train2014/COCO_train2014_000000306249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271825, "question_id": "P3SG479Ft5uDYpCxZzyPFF", "question": "What type of boats are moving through the water?", "choices": ["kayaks", "rowboats", "party boats", "sailboats"], "correct_choice_idx": 3, "direct_answers": ["sailboats", "sailboat", "pontoon", "sailboats", "sailboats", "sailboats", "speedboat", "speed boats", "sailboat", "small boats"], "difficult_direct_answer": false, "rationales": ["The boats are sailboats.", "The boats have tall masts, which is where the sail would be attached, which means that these sailboats are defined as sailboats.", "The boats are clearly visible and they have masts and sails which would be consistent with answer a."], "image": "val2014/COCO_val2014_000000271825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488032, "question_id": "P3xDckTsAZ45o5vVVQ6Luu", "question": "What powers the facilities in this area?", "choices": ["pedaling", "hydronics", "solar", "steam"], "correct_choice_idx": 2, "direct_answers": ["solar panels", "sun", "solar", "solar panels", "solar", "unknown", "sun", "tennis", "satellite", "electricity"], "difficult_direct_answer": false, "rationales": ["There are panels up above that catch the rays", "A solar roof is shown.", "There are solar panels in the background which is a source of power and the only one currently visible."], "image": "train2014/COCO_train2014_000000488032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137173, "question_id": "P48J2yUqyqFhmaSnYMAYLh", "question": "What process was used to color her shirt?", "choices": ["spray paint", "brush paint", "tie-dye", "markers"], "correct_choice_idx": 2, "direct_answers": ["tie-dye", "tie-dye", "tie dye", "dye", "dye", "tie dye", "tie dye", "tie dye", "tie dye", "tie dye"], "difficult_direct_answer": false, "rationales": ["A girl is in a white and pink shirt with a splotchy pattern.", "The girl's shirt has been tie dyed.", "It has the markings that appear when you do this special process"], "image": "train2014/COCO_train2014_000000137173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52606, "question_id": "P4CEdd3SH7yuCy83Fix7Xv", "question": "What sort of surface does the man riding a skateboard do a trick on?", "choices": ["rail", "platform", "block", "ramp"], "correct_choice_idx": 0, "direct_answers": ["metal", "metal", "rail", "rail", "rail", "rail", "rail", "beam", "rail", "railing"], "difficult_direct_answer": false, "rationales": ["The surface is a rail.", "The skater is riding on a metal railing.", "The man is on a piece of metal."], "image": "train2014/COCO_train2014_000000052606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458898, "question_id": "P4s8wXVhDJLwVSCv5KZQwn", "question": "What are the surfers in a push up position attempting to do?", "choices": ["exercise", "stand", "roll", "dive"], "correct_choice_idx": 1, "direct_answers": ["get up", "ride wave", "balancing", "stand up", "stand", "stand", "stand up", "stand", "stand", "stand up"], "difficult_direct_answer": false, "rationales": ["The surfers want to get up.", "The surfers want to stand up on the boards.", "The surfers are standing."], "image": "train2014/COCO_train2014_000000458898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142822, "question_id": "P56Dhsq3ZafxXDvZkrqKuu", "question": "What will the man on the bike do next?", "choices": ["race", "lunch break", "sleep", "change oil"], "correct_choice_idx": 0, "direct_answers": ["riding", "go fast", "race", "race", "ride", "start ignition", "race", "race", "race", "race"], "difficult_direct_answer": false, "rationales": ["The man on the bike is wearing professional motorcycle safety gear. he is not a mechanic.", "A biker is wearing a red bull uniform and to right is another biker ready to compete.", "The man is wearing gear and there are sponsors on the bike."], "image": "train2014/COCO_train2014_000000142822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71072, "question_id": "P584pesX2eHDkx96ywCEpK", "question": "What area is to the left of the TV monitor?", "choices": ["cat house", "kitchen", "garden", "fireplace"], "correct_choice_idx": 3, "direct_answers": ["camera", "fireplace", "desk", "fireplace", "fireplace", "fireplace", "fireplace", "camera", "fireplace", "fireplace"], "difficult_direct_answer": false, "rationales": ["There is an indent visible in the brick wall to the left of the tv. this type of alcove commonly houses answer a.", "There is a small portion of an opening visible and the bricks surrounding. this type of structure in a home is most commonly associated with a fireplace.", "The tv monitor is on top of a mantle next to a brick fireplace."], "image": "val2014/COCO_val2014_000000071072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374084, "question_id": "P5DR4MMb5A2tCwQ62AESFP", "question": "What is the women about to do?", "choices": ["throw something", "high five", "play game", "hit someone"], "correct_choice_idx": 2, "direct_answers": ["play game", "play game", "play game", "hit something", "play wii", "dance", "virtual tennis", "throw something", "swing", "hit ball"], "difficult_direct_answer": false, "rationales": ["The women are playing wii.", "The women will play a game.", "She is holding a game controller."], "image": "train2014/COCO_train2014_000000374084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158277, "question_id": "P5NgbmRtbun8vHzjZUaPxD", "question": "What shape is the orange item?", "choices": ["diamond", "rhombus", "square", "circle"], "correct_choice_idx": 3, "direct_answers": ["circle", "circle", "oval", "round", "circular", "round", "round", "round", "circle", "round"], "difficult_direct_answer": false, "rationales": ["The orange item is round and looks like a donut.", "The shape is a circle.", "This is a safety device thrown to someone who may be drowning."], "image": "val2014/COCO_val2014_000000158277.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31904, "question_id": "P5RoRJGPU7WJ7svjEDZk9G", "question": "What does the W on her cap stand for?", "choices": ["women", "wild", "win", "work"], "correct_choice_idx": 1, "direct_answers": ["wild", "wild cats", "wild", "wild cats", "wild", "wild cats", "wild cats", "wild cats", "washington", "team monogram"], "difficult_direct_answer": false, "rationales": ["W is the first letter of wild and wild cats is written on her shirt.", "The name is on her shirt", "The shirt says wild cat"], "image": "train2014/COCO_train2014_000000031904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36448, "question_id": "P5UBYvuyLcg7S5eEBw769q", "question": "What is the reason for the glare on the train?", "choices": ["fire", "flashlight", "explosion", "sunlight reflection"], "correct_choice_idx": 3, "direct_answers": ["sunlight", "sunshine", "sun", "sunlight reflection", "sun", "sunlight", "sunlight", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["The sun is reflecting on the windows.", "The sun is low in the sky and rays are hitting the side of the train", "The sun is low in the sky."], "image": "train2014/COCO_train2014_000000036448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347755, "question_id": "P5dmVDfoGPppuapHu75ze5", "question": "What can you do to the red things to efficiently make them take up less space?", "choices": ["stack them", "fold them", "leave them", "cut them"], "correct_choice_idx": 1, "direct_answers": ["fold them", "fold", "fold them", "fold", "fold", "fold", "fold", "push", "gather orderly", "fold"], "difficult_direct_answer": false, "rationales": ["A man is sitting on and is surrounded by red folding chairs. chairs can be folded and stacked when not in use.", "The chairs are foldable.", "These chairs can be folded up and stacked."], "image": "train2014/COCO_train2014_000000347755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213988, "question_id": "P5jXjSNxY59LHnFwCn82XB", "question": "Where do you usually see boardwalks like this?", "choices": ["park", "mall", "beach", "zoo"], "correct_choice_idx": 2, "direct_answers": ["ocean fronts", "near pier", "pier", "ocean", "along water", "near beaches", "beach", "new jersey", "clock", "shops/restaurants"], "difficult_direct_answer": true, "rationales": ["The boardwalk is the kind that is usually built next to the ocean or beach.", "The main place to see boardwalks is on the beach near an ocean or sea.", "Boardwalks are typically seen near a beach."], "image": "train2014/COCO_train2014_000000213988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374956, "question_id": "P5xfcZ65g3CR2AkYqM77jM", "question": "What is the most common breed of milk cow?", "choices": ["brown swiss", "holstein", "ayrshire", "jersey"], "correct_choice_idx": 1, "direct_answers": ["holstein", "holstein", "holstein", "holstein", "holstein", "brahman cattle", "holstein", "brahman", "holstein", "holstein"], "difficult_direct_answer": false, "rationales": ["The holstein cow is the most common cow used at a dairy farm", "This is the one most have", "The most common breed of milk cow is the holstein."], "image": "train2014/COCO_train2014_000000374956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154329, "question_id": "P6sLr6hXnPjBtrPRwecGWF", "question": "According to the banner ad what kind of Lite beer tastes great?", "choices": ["heineken", "miller", "corona", "bud"], "correct_choice_idx": 1, "direct_answers": ["miller lite", "miller", "miller", "miller", "miller", "miller", "miller", "lite", "miller lite", "miller"], "difficult_direct_answer": false, "rationales": ["The wording is visible with the slogan saying this and the logo of the company is also visible on the ad which connects the two.", "The beer is miller lite.", "One can see the logo of this beer company on the banner."], "image": "train2014/COCO_train2014_000000154329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430654, "question_id": "P7SDay9tUw2a7cqmNjbxXn", "question": "What is the man using the string to do?", "choices": ["control", "secure", "whip", "tie"], "correct_choice_idx": 0, "direct_answers": ["fly kite", "control", "fly kite", "fly kite", "control kite", "fly kite", "fly kite", "flying kite", "fly kite", "control kite"], "difficult_direct_answer": false, "rationales": ["The man is using the string to control the red kite on the other end.", "By yanking. lifting and twisting on the string, the kite flyer has a lot of say in where the kite will go next.", "The string prevents the kite from getting carried away by the wind."], "image": "train2014/COCO_train2014_000000430654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318845, "question_id": "P7T3zwbgAQiZiHEsanEzXy", "question": "What is the main ingredient of the food that the boys are eating?", "choices": ["juice", "sugar", "milk", "starch"], "correct_choice_idx": 2, "direct_answers": ["soda", "milk", "dairy", "ice cream", "milk", "milk", "ice cream", "milk", "milk", "ice cream"], "difficult_direct_answer": false, "rationales": ["This is a frozen dairy treat", "They're eating ice cream; the answer must be milk.", "The food has some milk."], "image": "train2014/COCO_train2014_000000318845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428403, "question_id": "P7hScFQb2BaTWVUd6UUEyq", "question": "What does the black object above the boat provide?", "choices": ["solar power", "cammo", "water", "shade"], "correct_choice_idx": 3, "direct_answers": ["shade", "power", "shade", "shade", "shade", "shade", "shade", "propulsion", "shade", "shade"], "difficult_direct_answer": false, "rationales": ["A cover can block the sun.", "A large black fabric is stretched above a boat. large canopies can provide shade in sunny weather.", "There is a very large yacht or flat bottom boat. at the top is a large canopy that gives protection from sun."], "image": "val2014/COCO_val2014_000000428403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230597, "question_id": "P7tneKWK9uer7LMVSAtLeq", "question": "What part of the harness is the child holding?", "choices": ["bit", "spurs", "saddle", "reins"], "correct_choice_idx": 3, "direct_answers": ["rein", "top part", "lead", "reins", "reins", "top", "reins", "yoke", "front", "leads"], "difficult_direct_answer": false, "rationales": ["The child is visible and positioned on top of the horse with the harness pulled back toward them. based on the contraption on the horse, this would be known as answer a.", "The child is holding the narrow strap attached to the bit that helps to guide the horse.", "This way the can steer the horse."], "image": "train2014/COCO_train2014_000000230597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240403, "question_id": "P7x2y6RLtQRy7nBZzsLMBM", "question": "That company made the pink racket?", "choices": ["williams", "wendell", "wendys", "wonton"], "correct_choice_idx": 0, "direct_answers": ["wilson", "wilson", "wilson", "wilson", "wilson", "wilson", "wilson", "williams", "wilson", "wilson"], "difficult_direct_answer": false, "rationales": ["The w is for wilson", "Wilson is the brand name with that logo.", "Williams makes racquets."], "image": "val2014/COCO_val2014_000000240403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256386, "question_id": "P7ybmKhewMyDt7XeUYktt8", "question": "What will these pedestrians do together?", "choices": ["selling", "writing", "skateboard", "sleep"], "correct_choice_idx": 2, "direct_answers": ["cross street", "skateboard", "walk", "cross street", "skateboard", "skateboard", "skateboard", "skateboard", "walk", "skateboarding"], "difficult_direct_answer": false, "rationales": ["They are carrying their boards in their hands.", "The pedestrians are carrying skateboards that they will use to skate together later.", "The people in question are clearly visible and are holding skateboards which they are likely intending to use."], "image": "train2014/COCO_train2014_000000256386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304158, "question_id": "P8FooUjfc8Nd5XzLiK98ZG", "question": "What kind of vehicle is the yellow thing?", "choices": ["tour bus", "truck", "school bus", "train"], "correct_choice_idx": 0, "direct_answers": ["tiger bus", "bus", "bus", "bus", "bus", "tiger bus", "bus", "bus trolley", "bus", "tour bus"], "difficult_direct_answer": false, "rationales": ["It is a non-tracked road vehicle that is designed to carry passengers. the vehicle is at a zoo and has an animal-themed livery, so it is used by people visiting the zoo.", "It's a bus painted to look like a tiger and used to take visitors around see all the animal areas.", "The vehicle is a bus."], "image": "train2014/COCO_train2014_000000304158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27424, "question_id": "P8Xd7vpvfnD7yVJvxhg3Wb", "question": "Why are they moving strangely?", "choices": ["exercising", "fighting", "dancing", "signaling"], "correct_choice_idx": 0, "direct_answers": ["playing", "playing game", "playing game", "playing wii", "playing game", "room", "exercising", "up hand", "playing games", "playing"], "difficult_direct_answer": false, "rationales": ["They are likely watching an exercise video on tv.", "The people are trying to exercise.", "The people appear to be holding wii remotes based on the size and shape of the objects in their hands. playing wii as intended often leads to one doing answer a."], "image": "train2014/COCO_train2014_000000027424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469870, "question_id": "P8kxvgJPBi3m9w6wMin7FU", "question": "What type potatoes are served here?", "choices": ["baked", "waffle fries", "french fries", "chips"], "correct_choice_idx": 1, "direct_answers": ["chips", "fried", "chips", "chips", "chips", "waffle fries", "potato chips", "chips", "chips", "waffle fries"], "difficult_direct_answer": false, "rationales": ["The fries are in a waffle fry shape.", "One can see the grill marks from the cooking iron.", "The potatoes are waffled."], "image": "train2014/COCO_train2014_000000469870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571441, "question_id": "P8uYRsuQaiB8AxbnL7Ny9u", "question": "Which elephant is likely the youngest of the three?", "choices": ["same age", "back one", "front one", "middle one"], "correct_choice_idx": 2, "direct_answers": ["rightmost", "right", "right", "right", "front one", "darkest one", "smallest one", "baby", "right", "right elephant"], "difficult_direct_answer": false, "rationales": ["The front elephant is smallest.", "It is the smallest", "The one in the front is smaller and likely the youngest."], "image": "train2014/COCO_train2014_000000571441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434316, "question_id": "P8xb6GWo5aVgxZfvkUrSqu", "question": "What is on the floor?", "choices": ["traffic lines", "eggs", "snow", "crawling baby"], "correct_choice_idx": 0, "direct_answers": ["grass", "dirt", "grass", "grass", "grass", "road", "concrete", "traffic lines", "asphalt", "pavement"], "difficult_direct_answer": false, "rationales": ["There are traffic lines painted.", "The painted lines guide traffic.", "Traffic lines are on the floor."], "image": "val2014/COCO_val2014_000000434316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379732, "question_id": "P93YxyXwhZB7Ntx6hXu3tf", "question": "What beverage are most people at this party having?", "choices": ["tomato juice", "wine", "milk", "soda"], "correct_choice_idx": 3, "direct_answers": ["sprite", "sprite", "soda", "beer", "soda", "soft drink", "soda", "soda", "sodas", "soda"], "difficult_direct_answer": false, "rationales": ["The beverages are famous brand name carbonated drinks.", "They are mostly drinking out of cans. many of them have sprite branding.", "There are soda cans visible in and around the people. if there are cans visible it is likely this is what they are drinking."], "image": "train2014/COCO_train2014_000000379732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550028, "question_id": "P97tBDEovVGwNaT6iEGb7s", "question": "What just happened to the ball?", "choices": ["got hit", "foul ball", "caught", "lost"], "correct_choice_idx": 0, "direct_answers": ["got hit", "caught", "swing miss", "got hit", "strike out", "caught", "hit", "baseball", "caught", "got hit"], "difficult_direct_answer": false, "rationales": ["The batter hit the ball.", "The batter swung and connected with it", "The batter swung at the ball."], "image": "train2014/COCO_train2014_000000550028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245367, "question_id": "P9EKXjPio8KF2fNcHggSt2", "question": "What is the large ramp used for?", "choices": ["basketball", "skateboarding", "football", "sledding"], "correct_choice_idx": 1, "direct_answers": ["tricks", "skateboarding", "skateboarding", "grinding", "jumping off", "walking fast", "skate jumps", "skateboarding", "skating", "skateboarding tricks"], "difficult_direct_answer": false, "rationales": ["The half-pipe style of this ramp and it's proximity to skateboarders tell us what it's used for.", "Skateboarders drop into the ramp from the plateau.", "Skateboarders use the ramp."], "image": "train2014/COCO_train2014_000000245367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352288, "question_id": "P9Ji5ZvVYgTYeF4gWJ4dbC", "question": "What is the building used for in the park?", "choices": ["storing kites", "office space", "bathroom", "stage presentations"], "correct_choice_idx": 3, "direct_answers": ["concerts", "stage", "concerts", "assembly", "concerts", "concerts", "concerts", "stage presentations", "live music", "stage"], "difficult_direct_answer": false, "rationales": ["There is a flat stage on the building's front.", "The building is for stage presentations.", "The structure is large, open, simple and does not feature plumbing"], "image": "train2014/COCO_train2014_000000352288.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132057, "question_id": "P9PagCexG7bW3nRkmCz4Vf", "question": "What is the woman drawing?", "choices": ["pigeon", "camel", "peacock", "rooster"], "correct_choice_idx": 3, "direct_answers": ["chicken", "rooster", "animal", "rooster", "rooster", "animal", "cock", "bird", "cock", "hen"], "difficult_direct_answer": false, "rationales": ["The woman is drawing a cock.", "The woman is drawing wings.", "The outline of the woman's drawing appears to have the features, feather pattern and general shape of answer a."], "image": "train2014/COCO_train2014_000000132057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489125, "question_id": "P9n7FT8YZagFNyor2JikBM", "question": "What cut of shirt is she wearing?", "choices": ["t-shirt", "tank top", "crop top", "turtleneck"], "correct_choice_idx": 1, "direct_answers": ["white", "sleeveless", "tank top", "tank top", "sleeveless", "vest", "tank top", "tank top", "tank top", "tank top"], "difficult_direct_answer": false, "rationales": ["She has on a white tank top.", "The woman is wearing a sleeveless top. tank tops do not have sleeves.", "The woman's shirt is sleeveless."], "image": "val2014/COCO_val2014_000000489125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96702, "question_id": "PA5iSLe56DJamiuX4f2pfM", "question": "What is misting up from the mountain?", "choices": ["man's breath", "city smog", "smoke", "fog"], "correct_choice_idx": 3, "direct_answers": ["clouds", "snow", "rocks", "steam", "fog", "mist", "blowing snow", "clouds", "condensation", "clouds"], "difficult_direct_answer": false, "rationales": ["Fog is misting.", "There is a mountain with a white, cloud looking formations in the sky around the peaks. fog is common at higher elevations.", "This is the fog coming up over the mountains."], "image": "train2014/COCO_train2014_000000096702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461156, "question_id": "PAJQZSGrKnYKpBr4FwdMdC", "question": "What type of dog size is this dog a part of?", "choices": ["small dog", "medium dog", "extra large", "large dog"], "correct_choice_idx": 0, "direct_answers": ["small", "small", "miniature", "small", "small dog", "small dog", "mini", "small", "miniature", "small"], "difficult_direct_answer": false, "rationales": ["The dog looks like he weighs under 30 lbs. so he is considered small.", "The dog is so little that it barely takes up space on the chair.", "The size is small."], "image": "val2014/COCO_val2014_000000461156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514131, "question_id": "PAhisdWrEEuBER7kfvz9os", "question": "Indoor plants are used to grow for what purpose?", "choices": ["water purifier", "air purifier", "water filter", "decoration"], "correct_choice_idx": 1, "direct_answers": ["air purifier", "health", "decoration", "decoration", "decoration", "fresh air", "decoration", "pleasure", "pleasure", "beauty"], "difficult_direct_answer": false, "rationales": ["The plants purify air.", "The indoor plants are used to ensure the purity of the air.", "The plants help suck cop from the air."], "image": "train2014/COCO_train2014_000000514131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180129, "question_id": "PAmWrTWHx6TZeQPTkodm8r", "question": "Based on the phone size about what size is the cat sculpture?", "choices": ["5 inches", "1/2 inch", "1 foot", "24 inches"], "correct_choice_idx": 1, "direct_answers": ["half inch", "1 1/2inch", "tiny", "miniature", "small", "15mm", "miniature", "1/2 inch", "small", "tiny"], "difficult_direct_answer": false, "rationales": ["It's really a tiny cat sculpture.", "The cat sculpture is much smaller than the cell phone. cell phones are generally only a few inches long, so it must be quite small.", "The evident is shown of the size."], "image": "train2014/COCO_train2014_000000180129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222564, "question_id": "PBgCTHvtcavqfuHNeeBTjg", "question": "What is being done to the food in the glass fronted box?", "choices": ["fried", "baked", "stored", "chilled"], "correct_choice_idx": 1, "direct_answers": ["baked", "baking", "baked", "heated", "baked", "heated", "baked", "heating", "baking", "broil"], "difficult_direct_answer": false, "rationales": ["The food is baked.", "An oven has a temperature lit on the digital screen on the front. a chef is cooking in a kitchen and items can be seen inside the oven.", "This is an oven"], "image": "val2014/COCO_val2014_000000222564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73786, "question_id": "PBkmq9Fe5x35X37W3jCAb2", "question": "What is she prepared for?", "choices": ["to run", "to serve", "to quit", "receive serve"], "correct_choice_idx": 3, "direct_answers": ["serve", "serve", "serve", "hit ball", "play", "receive serve", "ball", "swing", "game", "return ball"], "difficult_direct_answer": false, "rationales": ["She is ready to receive the tennisball serve from her opponent.", "The player is waiting to hit the ball.", "She hasn't started running for regular hits yet"], "image": "train2014/COCO_train2014_000000073786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146583, "question_id": "PC545XsyCkKPwMMov2Trx7", "question": "How is the pattern of the stripes in the individual zebras?", "choices": ["identical", "alike", "unique", "matching"], "correct_choice_idx": 2, "direct_answers": ["vertical", "different", "different", "thick", "similar", "black/white", "different", "vertical", "striped", "unique"], "difficult_direct_answer": false, "rationales": ["The patterns are visible and while they may be unique, they are similar making c the closest approximate.", "No zebra's stripes are exactly the same.", "The patterns on the zebras are all different to each one."], "image": "train2014/COCO_train2014_000000146583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365116, "question_id": "PCKJ8e6VufpnwyazRtcuC7", "question": "Where is the freezer located on this unit?", "choices": ["side", "bottom", "none included", "top"], "correct_choice_idx": 3, "direct_answers": ["top", "top", "top", "top", "top", "top", "top", "top", "top", "top"], "difficult_direct_answer": false, "rationales": ["The freezer is on top.", "The smaller one is usually the one that has the colder temperature and normally above the fridge.", "The refrigerator visible is a two door design where the top compartment is smaller and neither are pull out doors. this setup would commonly have the freezer above."], "image": "train2014/COCO_train2014_000000365116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532994, "question_id": "PCLPpUu2ks4yZ7xPPCdgvm", "question": "What type of activity is happening here?", "choices": ["olympic contest", "car race", "fair", "cattle call"], "correct_choice_idx": 2, "direct_answers": ["fire truck", "fair", "festival", "festival", "fire fighting", "horse pulling", "county fair", "battle", "parade", "fair"], "difficult_direct_answer": false, "rationales": ["Looks to be a fair that many people are watching and at.", "With the vendors and the costumes, that's what's going on here.", "The multitude of people present in this image and the horsedrawn antique carriage with men in period dress tell us this is some sort of fair."], "image": "val2014/COCO_val2014_000000532994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142182, "question_id": "PCLo7pid7TmV66uswEHDRF", "question": "What does the person who owns the shelves like to collect?", "choices": ["stuffed animals", "clothing", "books", "games"], "correct_choice_idx": 0, "direct_answers": ["stuffed animals", "stuffed animals", "stuffed animals", "stuffed animals", "stuffed animals", "bears", "stuffed animals", "stuffed animals", "stuffed animals", "stuffed animals"], "difficult_direct_answer": false, "rationales": ["The abundance of answer a visible on the shelves implies the owner is interested in collecting the items depicted.", "The person who owns the shelves is collecting a number of stuffed animals there.", "The person that owns the shelves likes to collect stuffed animals. the shelves are all filled with teddy bears and other stuffed animals."], "image": "val2014/COCO_val2014_000000142182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27478, "question_id": "PCbppbmVZZaxYrVcs26XjX", "question": "What material is the Ground made of?", "choices": ["concrete", "plastic", "porcelain", "wood"], "correct_choice_idx": 0, "direct_answers": ["concrete", "asphalt", "asphalt", "concrete", "concrete", "concrete", "concrete", "concrete", "pavement", "cement"], "difficult_direct_answer": false, "rationales": ["This is cement, with cracks typically found in cement.", "It is solid and grey", "The ground is made of concrete."], "image": "val2014/COCO_val2014_000000027478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118514, "question_id": "PCdk6qC3WhB7BMVvU5LnZr", "question": "What is the flying arrangement of the planes called?", "choices": ["formation", "summation", "abstract", "parallel"], "correct_choice_idx": 0, "direct_answers": ["duck formation", "diamond", "flying v", "wheels", "formation", "triangle", "v formation", "formation", "triangle", "triangle"], "difficult_direct_answer": false, "rationales": ["Planes organized in a specific manner is called a formation.", "The arrangement is a formation.", "The planes fly in an agreed-upon pattern"], "image": "train2014/COCO_train2014_000000118514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150091, "question_id": "PCkTsv3BWaZ57W3hfaSJQm", "question": "What type of area is in the background?", "choices": ["forest", "urban", "rural", "mountain"], "correct_choice_idx": 1, "direct_answers": ["city", "city", "urban", "train station", "city", "bad", "city", "city", "apartments", "industrial"], "difficult_direct_answer": false, "rationales": ["The background appears to have apartment buildings and other metropolitan buildings based on their size and design which would be consistent with answer a.", "The area is urban.", "An urban area is shown since there are so many tall buildings."], "image": "val2014/COCO_val2014_000000150091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69196, "question_id": "PCmPHi2ekANNtNwAqkhUrj", "question": "In what building is the organization in question based?", "choices": ["pentagon", "capitol", "white house", "lincoln memorial"], "correct_choice_idx": 0, "direct_answers": ["aircraft hanger", "robins", "airplane", "air force", "plane", "pentagon", "air force", "plane", "tourism", "usa"], "difficult_direct_answer": false, "rationales": ["The pentagon is from this.", "This is the military headquarters for the us", "The plane is owned by the united states air force reserve command. this organization is based out of the military headquarters in washington, d.c."], "image": "val2014/COCO_val2014_000000069196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178258, "question_id": "PD96QFkSv5pKa2ZfsWeS9w", "question": "What role does this man play?", "choices": ["actor", "motorcycle racer", "stuntman", "terrorist"], "correct_choice_idx": 2, "direct_answers": ["stunt driver", "motorbike riding", "motorcycle stunts", "racer", "entertainer", "stuntman entertainer", "stuntman", "bike racer", "stuntman", "stunt"], "difficult_direct_answer": true, "rationales": ["The role is a stuntman.", "The man is riding on one wheel which is a stunt.", "The man is doing tricks on the motorcycle."], "image": "train2014/COCO_train2014_000000178258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312902, "question_id": "PDUNfdfGaU6wMV6NjXVipK", "question": "What can the clinic on the right help you with?", "choices": ["eyes", "feet", "teeth", "back"], "correct_choice_idx": 2, "direct_answers": ["your teeth", "teeth", "your teeth", "teeth", "teeth", "teeth", "teeth", "teeth", "teeth", "teeth"], "difficult_direct_answer": false, "rationales": ["Its a dental clinic.", "A man is in front of a curb selling umbrellas. there is a sign saying dentist that helps you with your mouth problems.", "The clinic can help clean teeth."], "image": "train2014/COCO_train2014_000000312902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426831, "question_id": "PDUP87E9zTVznfuSQXGdfd", "question": "This showroom specializes in which home renovation product?", "choices": ["kitchen counters", "cabinets", "electronics", "lighting"], "correct_choice_idx": 0, "direct_answers": ["shelving", "modern", "kitchen counters", "tables", "eri", "tiles", "counter tops", "kitchen", "eri", "tables"], "difficult_direct_answer": false, "rationales": ["These counters are on display so people can see them and pick out what they like.", "Answer a is visibly present and likely indicates what the store specializes in.", "The showroom has a bunch of counter spaces."], "image": "train2014/COCO_train2014_000000426831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435260, "question_id": "PDXWr5tzuuNqs8DwuRo5j3", "question": "What will make the people laugh?", "choices": ["store sign", "elephants", "patrons", "clowns"], "correct_choice_idx": 3, "direct_answers": ["clowns", "clowns", "show", "clowns", "clowns", "clowns", "clowns", "clowns", "clowns", "clowns"], "difficult_direct_answer": false, "rationales": ["They do funny things to entertain people", "The clowns are funny.", "Clowns are meant to make people laugh."], "image": "val2014/COCO_val2014_000000435260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267210, "question_id": "PDcKc6h7RQfLFHTMt8uYJK", "question": "Why is she using a umbrella?", "choices": ["rain", "snow", "disguise", "sun"], "correct_choice_idx": 0, "direct_answers": ["raining", "stay dry", "rain", "raining", "raining", "its raining", "raining", "tennis", "for joking", "raining"], "difficult_direct_answer": false, "rationales": ["The green part of the court in the background has different values of green - light and dark. it shows that there are puddles of water suggesting it has poured a bit.", "She is playing tennis but trying to keep from getting wet.", "There is water dropping from the sky, and the umbrella keeps it off of her."], "image": "train2014/COCO_train2014_000000267210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414851, "question_id": "PEJ2v6bGrFZTUXP7NQgZpa", "question": "What skill level is the bus driver likely to have at driving this route?", "choices": ["novice", "can't drive", "moderate", "expert"], "correct_choice_idx": 0, "direct_answers": ["good", "skilled", "expert driver", "highly skilled", "novice", "expert", "beginner", "high level", "low", "professional"], "difficult_direct_answer": true, "rationales": ["The bus sign says it's a training vehicle.", "The bus has a sign on the front that says training vehicle which means the driver is new to driving buses.", "Many obstacles and busy looking road. high traffic area."], "image": "train2014/COCO_train2014_000000414851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504159, "question_id": "PEN4DRMzUPVPrEE6ycjfb7", "question": "What is the activity the man is engaging in?", "choices": ["singing", "video game", "playing magic", "working out"], "correct_choice_idx": 1, "direct_answers": ["wii", "video game", "playing wii", "charge", "playing wii", "video game", "wii", "playing wii", "gaming playing", "wii gaming"], "difficult_direct_answer": false, "rationales": ["Some of the physically active games require a remote device to register your arm movements.", "It's specifically a wii.", "The man is holding a game controller in both hands so he would not be singing, doing magic, or working out."], "image": "train2014/COCO_train2014_000000504159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570430, "question_id": "PEZPdUmHLwKhLtQkKtmCGD", "question": "The flag is belongs to which country?", "choices": ["uk", "france", "us", "italy"], "correct_choice_idx": 2, "direct_answers": ["usa", "america", "united states", "u.s.a", "usa", "united states", "america", "usa", "usa", "us"], "difficult_direct_answer": false, "rationales": ["This flag is from the usa>", "The us betsey ross flag is on the car.", "The red and white stripes and white stars on a blue background make this an american flag."], "image": "train2014/COCO_train2014_000000570430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183895, "question_id": "PFEsG4eLphGbKvv4JVNHkT", "question": "Who gives the airplanes guidelines on where to take off and land?", "choices": ["spies", "wardens", "air control", "pilots"], "correct_choice_idx": 1, "direct_answers": ["wardens", "airtraffic controllers", "airtraffic controllers", "tower guy", "traffic controller", "traffic controller", "security personnel", "air-traffic control", "poles", "traffic control"], "difficult_direct_answer": false, "rationales": ["The planes have wardens.", "An air control tower is who directs and guides airplanes on their travels.", "The traffic air controllers are the ones who will guide the planes in and out of the airport."], "image": "val2014/COCO_val2014_000000183895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112065, "question_id": "PFM2mnZMo3UPTgpskZoZdV", "question": "What kind of screen is being used for the game?", "choices": ["projector", "tube tv", "lcd", "plasma"], "correct_choice_idx": 0, "direct_answers": ["movie", "projector", "projector", "projector", "projector", "projector", "projector", "movie", "projector", "movie"], "difficult_direct_answer": false, "rationales": ["The screen is a projector.", "A projector is used to play the game.", "The screen projects the game for viewing."], "image": "val2014/COCO_val2014_000000112065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94678, "question_id": "PFRVRuzMJM3DiZ22C7xjZT", "question": "How many colors in Microsoft Windows logo?", "choices": ["six", "one", "five", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "4 colors", "four", "four", "four", "five", "five", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four colors on the logo.", "There are this many windows", "Microsoft's logo has quadrants."], "image": "val2014/COCO_val2014_000000094678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101013, "question_id": "PFa2P4pnwwj5pdqScgfypm", "question": "What is the oldest method of transportation here?", "choices": ["text", "test", "test", "test"], "correct_choice_idx": 0, "direct_answers": ["horse cart", "horse", "horse cart", "horse riding", "cart", "horse carriage", "wagon", "horse", "text", "horse carriage"], "difficult_direct_answer": false, "rationales": ["This question is bugged.", "The method is texting.", "These are all test responses."], "image": "val2014/COCO_val2014_000000101013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391199, "question_id": "PFi5R5TfsVejX5aAXhNHdb", "question": "What religion are the three men?", "choices": ["buddhist", "catholic", "jewish", "christian"], "correct_choice_idx": 2, "direct_answers": ["judaism", "christianity", "jewish", "jewish", "judaism", "jewish", "jewish", "jewish", "mormon", "amish"], "difficult_direct_answer": false, "rationales": ["Men with this style of hair and headwear are known to be practicers of judaism and not associated with common casual looks or other religions.", "The three men are wearing traditional clothing and hairstyles of orthodox jewish men.", "Jewish people wear hats."], "image": "val2014/COCO_val2014_000000391199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296105, "question_id": "PFmd8SoKKRNn3Gpw7354r9", "question": "Why is the ipod on top of the larger electronic device?", "choices": ["to charge", "as decoration", "to sell", "to display"], "correct_choice_idx": 0, "direct_answers": ["charging", "charging", "to charge", "charge", "charging", "charge", "charging", "charging", "providing content", "charging"], "difficult_direct_answer": false, "rationales": ["The way the ipod interacts with this device when placed in this manner is commonly known and the function it serves can be inferred.", "The ipod is being charged.", "There is a dock on the top of the radio."], "image": "val2014/COCO_val2014_000000296105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384678, "question_id": "PFnyEVi7FaaVQmN5HWURez", "question": "What could potentially puncture the tire?", "choices": ["fur", "claws", "eyes", "ears"], "correct_choice_idx": 1, "direct_answers": ["cat nails", "claws", "claws", "cat claws", "kittens nails", "claws", "claw", "claws", "cat", "cat nails"], "difficult_direct_answer": false, "rationales": ["The cat's sharp claws could make a hole in the tire.", "Felines have retractable claw on the paws and legs which are sharp.", "The cats nails are digging in to the tire which could puncture it."], "image": "train2014/COCO_train2014_000000384678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154452, "question_id": "PFqdetAYpBwUqVN9sbdFPu", "question": "What does the orange sign alert drivers of?", "choices": ["turns prohibited", "bike lanes", "animal crossing", "construction"], "correct_choice_idx": 3, "direct_answers": ["construction", "construction", "construction", "construction", "construction worker", "construction workers", "stop", "road work", "construction", "construction"], "difficult_direct_answer": false, "rationales": ["The sign is used to showcase visually that work is being done by having a silhouette of a man digging.", "Orange is a visible color used in construction. there is a person shoveling on the sign.", "The sign makes that people are working on the road."], "image": "train2014/COCO_train2014_000000154452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451214, "question_id": "PFtJkroiC7FAA8J8PXjRcD", "question": "What is he doing with the pizza?", "choices": ["stealing it", "adding flavor", "eating it", "returning it"], "correct_choice_idx": 1, "direct_answers": ["salt shaker", "adding flavor", "seasoning", "seasoning", "seasoning it", "salting", "seasoning it", "taking photo", "adding cheese", "salting it"], "difficult_direct_answer": false, "rationales": ["The other options don't match this image at all.", "He's sprinkling something on the slice", "The person is shaking a seasoning above the pizza slice."], "image": "val2014/COCO_val2014_000000451214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116100, "question_id": "PFtV9HMBPC6XhWYBfyXzq9", "question": "What type of photographic lens was used for this photograph?", "choices": ["low light", "panoramic", "portrait", "kaleidoscope"], "correct_choice_idx": 1, "direct_answers": ["3d", "panoramic lens", "panoramic", "panoramic", "wide angle", "panoramic", "panoramic", "panoramic", "wide lens", "panoramic"], "difficult_direct_answer": false, "rationales": ["The picture shows the whole room.", "This is a panoramic shot since i get the entire room.", "It is showing a very wide view of the room"], "image": "train2014/COCO_train2014_000000116100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275843, "question_id": "PGZdNYRzSsCFTePW4d8AtP", "question": "What continent would this be in?", "choices": ["south america", "europe", "north america", "asia"], "correct_choice_idx": 1, "direct_answers": ["australia", "europe", "australia", "europe", "europe", "europe", "europe", "europe", "europe", "australia"], "difficult_direct_answer": false, "rationales": ["The continent is europe.", "Double decker buses are in europe.", "The big buses of this deign are from europe."], "image": "val2014/COCO_val2014_000000275843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400371, "question_id": "PGabu33Hz2ebva6vCuAZcY", "question": "What fills the gray item the person in a blue shirt and white shorts sits upon?", "choices": ["air", "oil", "meat", "plastic"], "correct_choice_idx": 0, "direct_answers": ["air", "water", "air", "water", "air", "air", "air", "air", "balloon chair", "air"], "difficult_direct_answer": false, "rationales": ["That is an inflatable object so some sort of gas must fill it.", "The person is on some kind of blow up thing.", "It is a portable blow up pool, which is blown up with air."], "image": "train2014/COCO_train2014_000000400371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487870, "question_id": "PGmMeLn78gktYhLGvaiEQx", "question": "What sound would an animal that obviously went into the food make?", "choices": ["woof", "tweet", "bahh", "oink"], "correct_choice_idx": 3, "direct_answers": ["moo", "mowmow", "moo", "oink", "oink", "growl", "moo", "oink", "moo moo", "moo"], "difficult_direct_answer": false, "rationales": ["There appears to be bacon and that comes from a pig.", "A piece of bacon can be seen hanging out, and bacon is made from a pig.", "There is bacon visible on the sandwich. bacon is from pigs and pigs make the answer a sound."], "image": "train2014/COCO_train2014_000000487870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183898, "question_id": "PGrZYTCXPicvvntZLxayNm", "question": "What is this pan currently being used to create?", "choices": ["entree", "salad", "appetizer", "dessert"], "correct_choice_idx": 3, "direct_answers": ["bananas foster", "fried bananas", "honey bananas", "banana filling", "desert", "fried bananas", "fried bananas", "dessert", "bananas", "dessert"], "difficult_direct_answer": false, "rationales": ["These are bananas which are sweet and used in desserts.", "The pan is being used to make bananas flambe.", "Based on the size and color of the objects in the pan they are bananas and the sauce appears to be a caramel type sauce based on the color and consistency. these ingredients would be consistent with answer a."], "image": "train2014/COCO_train2014_000000183898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527010, "question_id": "PHGcGQodJDCsgJ56Wodq3i", "question": "Why is the child wearing the helmet?", "choices": ["fun", "protection", "fashion", "visibility"], "correct_choice_idx": 1, "direct_answers": ["safety", "avoid injury", "riding skateboard", "skateboarding", "safety", "for safety", "head protection", "protection", "protection", "protection"], "difficult_direct_answer": false, "rationales": ["A kid on a skateboard is wearing a helmet while he rides.", "The child wants to be protected from falls.", "The purposes for wearing a helmet are commonly known and especially important in the circumstances that the boy is currently facing."], "image": "train2014/COCO_train2014_000000527010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348922, "question_id": "PHL4P8tC5H7AztMAkMLUMF", "question": "The young boy is posing for what type of photograph?", "choices": ["painting", "drawing", "portrait", "selfie"], "correct_choice_idx": 2, "direct_answers": ["school portrait", "school picture", "professional", "school", "portrait", "sepia", "school photograph", "formal", "portrait", "portrait"], "difficult_direct_answer": false, "rationales": ["The boy is posing for a portrait.", "The boy is in a portrait.", "A portrait is where someone stands in front of the camera for their photo."], "image": "train2014/COCO_train2014_000000348922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553130, "question_id": "PHUPwGXsGfqet9ptNsBBnw", "question": "When making a slow descent into water what is the safest way here?", "choices": ["rolling headfirst", "dive headfirst", "belly flop", "ladder"], "correct_choice_idx": 3, "direct_answers": ["walking in", "pier", "from sand", "island", "ladder", "ladder", "stairs", "ladder", "ladder", "ladder"], "difficult_direct_answer": false, "rationales": ["One should be cautious entering this water", "It's the object to the right center of the image.", "A paved area juts out into the water and metal steps are attached at one end."], "image": "train2014/COCO_train2014_000000553130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117093, "question_id": "PHpm3QvXmchrPJHDqk76o9", "question": "What is this type of frame called on this scooter?", "choices": ["trellis frame", "backbone", "step-through", "single cradle"], "correct_choice_idx": 2, "direct_answers": ["muffler", "suitcase", "smart architecture", "aluminum", "steel", "gun", "shield", "single", "step-through", "honda"], "difficult_direct_answer": true, "rationales": ["The base of the scooter appears at a lower level with no obstructions on either side which would allow someone to mount and dismount in the manner of answer a.", "The frame has a technical name and that's a stepthrough.", "One person can ride this bike"], "image": "val2014/COCO_val2014_000000117093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300437, "question_id": "PJ8CXi2uQoJTRDMY2jQcWJ", "question": "What caused the large dent in the side of the pizza?", "choices": ["customer", "oven", "pan", "baker"], "correct_choice_idx": 3, "direct_answers": ["bite", "uneven rolling", "hand", "greens", "baking", "oven", "knife", "baker", "teeth", "baking split"], "difficult_direct_answer": true, "rationales": ["The baker misshaped the dough.", "The baker misshaped the dough.", "The baker caused the dent."], "image": "val2014/COCO_val2014_000000300437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336310, "question_id": "PJBKF4HHuM8MK2aoZt7uK6", "question": "What does this person hope to get good at by purchasing the book entitled Premiere?", "choices": ["computer programming", "video editing", "web design", "illustration"], "correct_choice_idx": 1, "direct_answers": ["software", "knowledge", "video editing", "video editing", "editing", "learning", "editing", "editing", "audio engineering", "video editing"], "difficult_direct_answer": false, "rationales": ["The book is a guide becoming skilled at this.", "They want to get better at video editing.", "Premiere is an editing software for rendering videos."], "image": "train2014/COCO_train2014_000000336310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391940, "question_id": "PJLTnFZGrcuFbdXPbSLCMz", "question": "What venue is this place?", "choices": ["farm", "outdoor dining", "park", "local market"], "correct_choice_idx": 3, "direct_answers": ["farmers market", "market", "farmer market", "farmer's market", "farmer's market", "market", "flea market", "market", "local market", "street fair"], "difficult_direct_answer": false, "rationales": ["There are vendor spots with different goods being displayed.", "The people are outside and are not eating. there are stalls that are selling apples and other items.", "This is a little outdoor market selling apples, and by being on a busy street it is very accessible to foot traffic."], "image": "val2014/COCO_val2014_000000391940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111598, "question_id": "PJMFUcAW4GgJxvR6zwkmZR", "question": "What is the most obvious danger here?", "choices": ["car accident", "brain freeze", "rock collision", "shark attack"], "correct_choice_idx": 2, "direct_answers": ["rocks", "rock", "rock", "rocks", "rocks", "rocks", "rocks", "rocks", "rock collision", "rocks"], "difficult_direct_answer": false, "rationales": ["There are many stone, craggy parts there.", "The water is too shallow for sharks to swim in. there are no cars or ice cream near the surfer.", "He's very close to some"], "image": "train2014/COCO_train2014_000000111598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171192, "question_id": "PJQuC9WLyZ6B9XYAzVgzUG", "question": "What is the lady in a red apron doing?", "choices": ["bartending", "wine demo", "waiting tables", "party catering"], "correct_choice_idx": 1, "direct_answers": ["wine", "pouring wine", "pouring wine", "pouring wine", "pouring wine", "filling glasses", "pouring", "pouring wine", "wine demo", "pouring wine"], "difficult_direct_answer": false, "rationales": ["She is pouring wine for people.", "She has lined up all the glasses for the tasting and is pouring the drinks.", "A table with bottles of wine and glasses is formally set."], "image": "val2014/COCO_val2014_000000171192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38034, "question_id": "PJc5Q9jjzQWY9f82Dt5RYY", "question": "What type of creative work does the person using this computer perform?", "choices": ["illustration", "composing", "directing", "writing"], "correct_choice_idx": 0, "direct_answers": ["illustration", "photography", "photography", "video editing", "video making", "writing", "graphic arts", "photography", "photography", "photography"], "difficult_direct_answer": false, "rationales": ["They have a drawing pad in front of the keyboard.", "They have a drawing pad", "The person is illustrating graphics."], "image": "val2014/COCO_val2014_000000038034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104309, "question_id": "PJdjGnbzfxuREzELxmS9XX", "question": "When the pipes need to be worked on plumbers will be blocked from reaching it by what?", "choices": ["sink", "wall", "counter", "microwave"], "correct_choice_idx": 2, "direct_answers": ["counter", "garbage", "trash", "communication", "sink", "trash", "unknown", "spigot", "wrenches", "clutter"], "difficult_direct_answer": true, "rationales": ["There is a counter right in front of the pipe.", "The pipe below the counter is used to drain dirty water.", "There is nothing really blocking the pipes, but the thing that would most hinder would be answer a."], "image": "train2014/COCO_train2014_000000104309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255139, "question_id": "PJqqxA3TqTiZ78pkiCDhnW", "question": "What are the people on the boat looking at?", "choices": ["whales", "sky", "mountains", "dolphins"], "correct_choice_idx": 3, "direct_answers": ["dolphins", "whale/dolphin", "marine life", "dolphin", "water", "dolphin", "dolphins", "dolphin", "shark", "dolphin"], "difficult_direct_answer": false, "rationales": ["There is a dolphin at the water.", "The people in the boat are looking at the dolphin at the surface of the water.", "All the people are looking towards the camera and to the left and that is where a dolphin is seen in the water."], "image": "train2014/COCO_train2014_000000255139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538231, "question_id": "PJsHTtkBP4Xnixj7AM3KCX", "question": "What is the green object on top of the counter to the right of the sink?", "choices": ["fern", "flowers", "grass", "tree"], "correct_choice_idx": 1, "direct_answers": ["plant leaves", "flowers", "flowers", "leaves", "flowers", "flowers", "flowers", "flowers", "flowers", "flowers"], "difficult_direct_answer": false, "rationales": ["The petals are blooming from the stems.", "Right of the sink in this image is a glass vase holding green stemmed plants bearing yellow red flowers.", "They are in a vase and have blooms"], "image": "val2014/COCO_val2014_000000538231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381416, "question_id": "PK7sGo6j6aW9riUNkZ6KU6", "question": "What protective item should the man wear?", "choices": ["ear muffs", "scarf", "knee pads", "helmet"], "correct_choice_idx": 3, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "coat", "coat"], "difficult_direct_answer": false, "rationales": ["He needs to protect his head.", "He should be wearing a safety hat on his head in case he falls or runs into anything.", "The man should wear a helmet."], "image": "val2014/COCO_val2014_000000381416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484504, "question_id": "PKPkH5fWs7j66S8SZ4mdEU", "question": "What is the person holding to his ear?", "choices": ["hearing aid", "headphones", "ear muffs", "cell phone"], "correct_choice_idx": 3, "direct_answers": ["phone", "cellphone", "phone", "cell phone", "phone", "cellphone", "phone", "phone", "phone", "cellphone"], "difficult_direct_answer": false, "rationales": ["The person has a phone.", "The item is outside of the person's ear, so it is not a hearing aid. the item is not covering both ears, so it is not headphones or ear muffs.", "The person is talking to someone with a device."], "image": "train2014/COCO_train2014_000000484504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473590, "question_id": "PKPsgG72BNKNjmLZfM7ATm", "question": "What photography related problem can be observed in this photo?", "choices": ["focus", "noise", "motion blur", "exposure"], "correct_choice_idx": 0, "direct_answers": ["distortion", "focus", "blurry", "blur", "blur", "blurry", "blurry", "blurry", "blur", "blurry"], "difficult_direct_answer": false, "rationales": ["The ladies seem to be out of focus some.", "The picture is blurry.", "The picture is very blurry."], "image": "train2014/COCO_train2014_000000473590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199408, "question_id": "PKbcw5Tr4mn9RGPY87wsux", "question": "What time of day is it?", "choices": ["night", "evening", "morning", "midday"], "correct_choice_idx": 3, "direct_answers": ["mid day", "evening", "daytime", "midday", "morning", "morning", "day", "noon", "morning", "afternoon"], "difficult_direct_answer": false, "rationales": ["Because the sun seems to be shining more bright.", "The amount of sunlight is visible in the image and the shadows indicate the relative positioning of the light source. those things combined imply it is close to answer a.", "It appears the sun is high noon."], "image": "train2014/COCO_train2014_000000199408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424258, "question_id": "PL9ZvAU2FPPgEqJiUKidHR", "question": "What type of seating is available?", "choices": ["bed", "recliner", "couch", "bench"], "correct_choice_idx": 3, "direct_answers": ["picnic table", "bench", "picnic table", "bench", "bench", "bench", "bench", "bench", "picnic bench", "picnic table"], "difficult_direct_answer": false, "rationales": ["A picnic table has benches attached.", "The seating is a bench.", "There is a bench."], "image": "val2014/COCO_val2014_000000424258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196619, "question_id": "PLBevnN5xKshfQNBirw2Vr", "question": "What meal was mot likely just eaten?", "choices": ["breakfast", "dinner", "lunch", "brunch"], "correct_choice_idx": 1, "direct_answers": ["salad", "those", "breakfast", "dinner", "dinner", "dinner", "breakfast", "dinner", "soup", "salad"], "difficult_direct_answer": false, "rationales": ["The lights are on which means it's nighttime.", "It is dark and the lights are on", "Because there does not appear to be any natural sunlight, and breakfast, brunch, and lunch are usually eaten during daylight hours."], "image": "train2014/COCO_train2014_000000196619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210431, "question_id": "PLTHuwGXBffhd95HAABsSK", "question": "What is the likely relationship of the man to the boy?", "choices": ["brother", "father", "son", "great grandfather"], "correct_choice_idx": 1, "direct_answers": ["father", "father", "father/son", "father", "father", "father", "father", "father", "father/son", "father/son"], "difficult_direct_answer": false, "rationales": ["The boy looks younger than the man.", "The age of the man suggests that the boy is likely his son.", "It is most likely a relative because it's very uncommon for strangers to play with children"], "image": "train2014/COCO_train2014_000000210431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20711, "question_id": "PLWizguS7gFh33Fw7HJwrt", "question": "Which EU country might be associated with the colors of the skier's gloves?", "choices": ["france", "croatia", "poland", "netherlands"], "correct_choice_idx": 3, "direct_answers": ["usa", "spain", "dutch", "thuringian", "holland", "australia", "spain", "netherlands", "netherlands", "italy"], "difficult_direct_answer": false, "rationales": ["The country is the netherlands.", "That country has very high snow in the mountains.", "The orange and yellow colors are associated with this country."], "image": "train2014/COCO_train2014_000000020711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117119, "question_id": "PLeEjkrYmV4tgQeJDZ5jtG", "question": "The person here stares at what here?", "choices": ["mountain", "pond", "horses", "ocean"], "correct_choice_idx": 3, "direct_answers": ["sun", "ocean", "ocean", "ocean", "water", "ocean", "ocean", "ocean", "distance", "sun"], "difficult_direct_answer": false, "rationales": ["The person here is staring at what appears to be the ocean.", "He is leaning against a boat that is on rocks next to a large body of water. buildings in the back are built high up normally found near large bodies of water.", "The person is in the ocean."], "image": "train2014/COCO_train2014_000000117119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514563, "question_id": "PLg4K5cqqDE9o26qowusPc", "question": "Which Irish pub can be seen to the left of the traffic light?", "choices": ["paddy's", "mcgillan's", "ernest's", "emmit's"], "correct_choice_idx": 3, "direct_answers": ["emmit's", "emmit's", "emmits", "emmit's", "emmit's", "emmit's", "emails", "emmit's", "emmit's", "email's pub"], "difficult_direct_answer": false, "rationales": ["Emmit's is printed on the pub sign.", "The pub is emmitt's.", "The name of the pub can be seen on the sign."], "image": "train2014/COCO_train2014_000000514563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534275, "question_id": "PLitzg9VkrUkLBPaeUu4pZ", "question": "What type of head covering is the rider wearing?", "choices": ["straw hat", "fedora", "visor", "western hat"], "correct_choice_idx": 3, "direct_answers": ["cowboy hat", "hat", "cowboy hat", "hat", "hat", "cowboy hat", "hat", "cowboy hat", "cowboy hat", "western hat"], "difficult_direct_answer": false, "rationales": ["The rider is wearing a hat that looks to be the right size and shape with hats commonly associated with cowboys who mostly operated in the west.", "This is what people riding horses wear a lot", "The man has a cowboy hat on."], "image": "val2014/COCO_val2014_000000534275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186526, "question_id": "PLopEGWzfkFqhbcjdBptjv", "question": "What does the device on the closer person's arm do?", "choices": ["control console", "calculate angles", "project image", "show time"], "correct_choice_idx": 3, "direct_answers": ["show time", "tell time", "show time", "tells time", "hold drinks", "tell time", "tell time", "tell time", "tell time", "keep time"], "difficult_direct_answer": false, "rationales": ["This is a watch and these items are used to tell time.", "The device shows time.", "The person is wearing a watch."], "image": "train2014/COCO_train2014_000000186526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240449, "question_id": "PM9zvtdWVZEh5MhF9XfgqP", "question": "Why is he wearing a hat?", "choices": ["costume", "safety", "disguise", "warmth"], "correct_choice_idx": 3, "direct_answers": ["warmth", "stay warm", "it's cold", "snowing", "stay warm", "cold", "cold", "cold", "warmth", "cold"], "difficult_direct_answer": false, "rationales": ["There is snow around and it is cold outside.", "The man needs warmth.", "The man wants to keep warm."], "image": "train2014/COCO_train2014_000000240449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270706, "question_id": "PMEctEFiBMZVcXjwXCsBYi", "question": "What is a possible outcome of the video game sport these people are playing?", "choices": ["homerun", "touchdown", "goal", "strike"], "correct_choice_idx": 3, "direct_answers": ["strike", "strike", "victory", "strike", "strike", "strike", "strike", "victory", "strike", "victory"], "difficult_direct_answer": false, "rationales": ["The outcome is a strike.", "The people are playing bowling based on the setting displayed on the screen and answer a is an outcome possible in answer a.", "The two people are bowling and are trying to get all the pins."], "image": "val2014/COCO_val2014_000000270706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506401, "question_id": "PMNXVppSMZGGn6b3Mp38m2", "question": "What type of burger could be eaten here?", "choices": ["none", "big mac", "kfc", "whopper"], "correct_choice_idx": 1, "direct_answers": ["cheeseburger", "mcdonald's", "big mac", "mcdonalds burger", "big mac", "mcdonalds", "big mac", "hamburger", "hamburger", "mcdonalds"], "difficult_direct_answer": false, "rationales": ["The big mac burger is something to be bought at the mcdonalds.", "There is a mcdonald's, not burger king or kfc, sign on the left.", "That is the burger eaten at that place."], "image": "val2014/COCO_val2014_000000506401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503834, "question_id": "PN2WdxsoVfB7gFm6PHKApp", "question": "Why is the bus covered in pictures?", "choices": ["to sell", "to vandalize", "to advertise", "to protest"], "correct_choice_idx": 2, "direct_answers": ["advertising", "advertising", "to advertise", "advertising", "ads", "advertisements", "advertising", "advertisement", "ads", "advertisements"], "difficult_direct_answer": false, "rationales": ["The bus has ads on it.", "The bus is advertising.", "Buses are often used as \"moving advertisements.\"."], "image": "train2014/COCO_train2014_000000503834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232544, "question_id": "PN9PyUhg4JtNhwpESSVHoN", "question": "Why is the frisbee in the air?", "choices": ["fell", "bounced", "guys throwing", "windy day"], "correct_choice_idx": 2, "direct_answers": ["game", "flying", "was thrown", "playing game", "thrown", "was thrown", "thrown", "was thrown", "guys throwing", "tossed"], "difficult_direct_answer": false, "rationales": ["The frisbee is being thrown.", "A player has tossed the frisbee towards a teammate as a small group plays at a park on a summer day. frisbee games are great fun and a great way to get exercise!", "The frisbee is above and in front of a man with his hand extended."], "image": "val2014/COCO_val2014_000000232544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69391, "question_id": "PNGSe3hY72SXAPFUkZCm5i", "question": "What would you get if you reverse the first two letters at the top of the bus?", "choices": ["gj", "rbi", "br", "tm"], "correct_choice_idx": 3, "direct_answers": ["su", "airy mt", "tm", "tm", "tm", "tm", "tm", "airy mt", "tm", "tm"], "difficult_direct_answer": false, "rationales": ["If you reverse mt then you would get tm", "Just the letters in reverse.", "A trademark would be gotten."], "image": "val2014/COCO_val2014_000000069391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448368, "question_id": "PNifdxwEpZEcjFW4FFErCw", "question": "Where are the people in?", "choices": ["cafeteria", "theater", "library", "conference room"], "correct_choice_idx": 3, "direct_answers": ["office room", "meeting", "board room", "office", "meeting", "conference room", "conference room", "conference room", "conference room", "conference room"], "difficult_direct_answer": false, "rationales": ["They are in a bigger room in a meeting.", "It has a long table and chairs", "The setting of the room and style of the table are consistent with answer a. the people also appear to be working or collaborating in some fashion based on their work equipment."], "image": "train2014/COCO_train2014_000000448368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33208, "question_id": "PNitJv9pbBzRAH5Q8jcyYw", "question": "What mammal is this traffic stop trying to keep safe by reminding drivers to drive safe?", "choices": ["human", "dog", "tiger", "elephant"], "correct_choice_idx": 0, "direct_answers": ["humans", "people", "homo sapiens", "humans", "human", "humans", "all pedestrian", "humans", "people", "squirrel"], "difficult_direct_answer": false, "rationales": ["It is a sign about pedestrians", "It's trying to keep people safe while on the road.", "Humans are supposed to stop."], "image": "val2014/COCO_val2014_000000033208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521899, "question_id": "PNxwNMXr787Pjk27zUzFKn", "question": "What are the blue and white sticks on the table?", "choices": ["fireworks", "pens", "pencils", "candles"], "correct_choice_idx": 3, "direct_answers": ["candles", "candles", "candles", "table cloth", "candles", "candles", "candles", "table cloth", "candles", "candles"], "difficult_direct_answer": false, "rationales": ["The white and blue sticks are candles.", "They are candles in a menorah.", "The sticks are candles."], "image": "train2014/COCO_train2014_000000521899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555118, "question_id": "PPBtRqEidvrRNh4AE26BmM", "question": "What would you use to speak to the clerk?", "choices": ["dutch", "spanish", "english", "french"], "correct_choice_idx": 3, "direct_answers": ["telephone", "document", "translator", "intercom", "casefile", "mouth", "buzzer", "door", "french", "mouth"], "difficult_direct_answer": true, "rationales": ["The location is seen as paris based on a label written over the door. answer a is the language spoken in paris.", "The sign on the gray wall says paris and people would need to know french to speak to the clerk,", "The signs are in english."], "image": "train2014/COCO_train2014_000000555118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69284, "question_id": "PPPPhDwWUkQmLuN4skB653", "question": "What type of luggage does the man have?", "choices": ["plastic bag", "backpack", "duffle bag", "suitcase"], "correct_choice_idx": 3, "direct_answers": ["box", "suitcase", "briefcase", "suitcase", "suitcase", "suitcase", "suitcase", "suitcase", "suitcase", "suitcase"], "difficult_direct_answer": false, "rationales": ["The bag is not on his back and is not made out of plastic. it is too rectangular to be a duffle bag.", "The man carries a hard-sided piece of luggage by a single handle.", "The man has a suitcase in his left hand."], "image": "train2014/COCO_train2014_000000069284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155356, "question_id": "PPbSxfZKZnCexgvJv7mkPX", "question": "What type of transportation is this?", "choices": ["air", "road", "water", "rail"], "correct_choice_idx": 3, "direct_answers": ["railroad", "train", "train", "train", "train", "rail", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["It is moving on tracks or a railway.", "The vehicle is clearly visible and is placed on top of parallel metal bars with crossed wooden planks.", "This is a train that transports people or cargo."], "image": "val2014/COCO_val2014_000000155356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5282, "question_id": "PPr2hW9ASLNKNWQobg3ozY", "question": "Where is the boat going?", "choices": ["over bridge", "home", "under bridge", "up river"], "correct_choice_idx": 2, "direct_answers": ["under bridge", "tunnel", "someplace peaceful", "under bridge", "under bridge", "under bridge", "under bridge", "under bridge", "under bridge", "under bridge"], "difficult_direct_answer": false, "rationales": ["You can tell by the structure and how it was built as to what the boat is going under.", "The boat is already in one of the openings.", "The boat goes under the bridge."], "image": "val2014/COCO_val2014_000000005282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189998, "question_id": "PPwyyyZpCm7xsUQDhCAHUL", "question": "This kitchen was specifically designed to be ready for what?", "choices": ["floods", "fires", "earthquakes", "explosions"], "correct_choice_idx": 1, "direct_answers": ["big meals", "cooking", "catering", "fires", "restaurant", "baking", "kitchen", "catering", "large crowds", "cooking food"], "difficult_direct_answer": true, "rationales": ["There are safety things all over for fires.", "There are fire sprinklers attached to the roof of the kitchen.", "The kitchen is equipped with sprinklers to go on in case of a blaze."], "image": "train2014/COCO_train2014_000000189998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575406, "question_id": "PQ5V7PEfGgMy8i8fVTVHvK", "question": "The color scheme of this vehicle represents what flag?", "choices": ["kazakhstan", "djibouti", "france", "mexico"], "correct_choice_idx": 2, "direct_answers": ["usa", "france", "thailand", "american", "australia flag", "english", "usa", "usa", "american", "thailand"], "difficult_direct_answer": false, "rationales": ["The vehicle has a blue, white, and red color scheme. the flags for mexico and djibouti have green on them, and the flag for kazakhstan does not have red.", "The visible colors in the vehicle are red, white and blue. of the given options, answer a is the one with a flag of this same color scheme.", "France has the same colors in their flag."], "image": "val2014/COCO_val2014_000000575406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156740, "question_id": "PQMZeRtEz8dTLobDZY5pmu", "question": "How much does the Turtle cost?", "choices": ["9.99", "8.99", "7.99", "10.99"], "correct_choice_idx": 0, "direct_answers": ["six dollars", "nine ninetynine", "$9.99", "10 dollars", "ten dollars", "$9.99", "9.99", "9.99", "$9.99", "10 dollars"], "difficult_direct_answer": false, "rationales": ["The turtle's price is on the placard in front of it.", "The turtle's price tag is nine, ninety nine.", "The sign on the turtle-shaped bread says that it costs $9.99."], "image": "val2014/COCO_val2014_000000156740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399687, "question_id": "PQkCvjmShDh6UX5Wp97NFM", "question": "What is this woman ready to do?", "choices": ["serve", "dribble", "tackle", "sprint"], "correct_choice_idx": 0, "direct_answers": ["serve", "serve", "serve", "hit ball", "serve", "serve", "serve ball", "to serve", "serve", "serve ball"], "difficult_direct_answer": false, "rationales": ["She is throwing the ball up in the air ready to hit it over to hear opponent.", "The woman wants to serve the tennis ball.", "She has thrown the ball straight up before she swings at it"], "image": "train2014/COCO_train2014_000000399687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108668, "question_id": "PR6mrJseAbqqFU9kSKtQ7p", "question": "What can be built along the back wall?", "choices": ["snow man", "gingerbread", "clothing", "fire"], "correct_choice_idx": 3, "direct_answers": ["chimney", "fire", "fireplace", "flim", "fireplace", "fire", "fire", "shelves", "fireplace", "mantle"], "difficult_direct_answer": false, "rationales": ["There is a fireplace on the back wall and a fire can be built here.", "The area in the back is connected to chimney, and is meant to burn things for warmth.", "Along the back wall there is a fire place. a fire place can have a fire built in it."], "image": "val2014/COCO_val2014_000000108668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399298, "question_id": "PR78TJ5XttGttgBx32rcpB", "question": "What are the long flat green veggies called?", "choices": ["spinach", "broccoli", "asparagus", "snow peas"], "correct_choice_idx": 3, "direct_answers": ["snow peas", "chinese peas", "peas", "peas", "snowpeas", "peas", "snow peas", "peas", "snap peas", "snap peas"], "difficult_direct_answer": false, "rationales": ["They are a type of pea.", "These are edible pods", "The long green veggies are snow peas."], "image": "train2014/COCO_train2014_000000399298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163451, "question_id": "PRUERL7dBXWCcrysSfdG4y", "question": "What type of phone is being used?", "choices": ["cellular", "pay", "landline", "rotary"], "correct_choice_idx": 0, "direct_answers": ["cell", "cellphone", "cellular", "cellular", "cell", "cellphone", "mobile phone", "cellphone", "cellphone", "cell"], "difficult_direct_answer": false, "rationales": ["The phone is wireless. it does not have a rotary dial or a coin slot.", "It is attached to no cords and they are in public", "It is free of any kind of line connected to it and is a freehand smartphone that is rectangular."], "image": "val2014/COCO_val2014_000000163451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292944, "question_id": "PRasTt4o6txYwRcUfxEY2i", "question": "Where does the non bat wielding player want to run?", "choices": ["dugout", "restroom", "home", "second base"], "correct_choice_idx": 3, "direct_answers": ["second base", "second base", "second base", "second base", "first base", "second base", "next base", "left side", "next base", "second base"], "difficult_direct_answer": false, "rationales": ["The batter in is front of first so the person on first wants to go to the next base.", "He's already on first base waiting to see what the batter does", "The batter is at home plate, and the non bat wielding player and defender are at first. the non bat wielding player wants to run to the next one."], "image": "val2014/COCO_val2014_000000292944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351796, "question_id": "PS5MK7VyPif6CtP2tZVmTE", "question": "The woman will hit the fork on what object of she keeps looking straight instead of her plate?", "choices": ["cup", "hand", "food", "table"], "correct_choice_idx": 3, "direct_answers": ["wrist", "table", "cake", "camera", "sweater", "table", "plate", "wrist", "burrito", "table"], "difficult_direct_answer": false, "rationales": ["The woman is at a table.", "She will miss the plate because the fork is above the table and not the plate.", "She is very close to the edge of the plate"], "image": "train2014/COCO_train2014_000000351796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523517, "question_id": "PSV2QQBrFyK7Mt7Tzrbm6Q", "question": "What is the nickname of this player?", "choices": ["closer", "el hombre", "big papi", "slugger"], "correct_choice_idx": 2, "direct_answers": ["batter", "batter", "big papi", "o j", "manny", "baseball player", "orty", "batter", "big papi", "big papi"], "difficult_direct_answer": false, "rationales": ["A name other players call him.", "Big papi is ortiz's nickname.", "The name is big papi."], "image": "val2014/COCO_val2014_000000523517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64612, "question_id": "PShMCmt45ahvsnaKMGHiPC", "question": "Why is the light there?", "choices": ["easily found", "melts snow", "is night", "for filming"], "correct_choice_idx": 2, "direct_answers": ["night skiing", "nite-time skiing", "night", "night time", "for visibility", "lighting", "for advertisment", "safety", "is night", "light"], "difficult_direct_answer": true, "rationales": ["It is dark outside and the light is needed to see.", "The sky is dark and it's night time. the light is used to light up the area.", "People are on a ski run at night. a bright light is on a pole. spectators are all around."], "image": "train2014/COCO_train2014_000000064612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465744, "question_id": "PSyNXuZGeYAcxM7tb7znHQ", "question": "What class of aircraft is seen here?", "choices": ["helicopter", "amphibious", "cargo plane", "fighter jet"], "correct_choice_idx": 1, "direct_answers": ["amphibious", "jump plane", "smallest", "amphibious helicopter", "floatplane", "propeller", "seaplane", "water plane", "amphibious", "seaplane"], "difficult_direct_answer": false, "rationales": ["The aircraft does not have rotors. it has a civilian tail number and is too small to be a cargo plane.", "The aircraft is used in short distances.", "A plane is on the water, resting on baffles in the water."], "image": "train2014/COCO_train2014_000000465744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291724, "question_id": "PSzqbBr9DZ4dxsX3NJASe5", "question": "The bottom portion of the screen that is furthest to the front looks like what video game controller?", "choices": ["ps3", "nes", "ps4", "n64"], "correct_choice_idx": 1, "direct_answers": ["ninetendo ds", "nintendo", "game boy", "nintendo", "nintendo", "x box", "nintendo", "nintendo", "nintendo", "nes"], "difficult_direct_answer": false, "rationales": ["An nes was grey, white and black with the same style.", "The question doesn't make sense but the object visible that looks similar to another object looks like answer a.", "The other types of consoles don't look like the one in the image."], "image": "val2014/COCO_val2014_000000291724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323423, "question_id": "PT8FfkPddCNZpRaDabkANw", "question": "What is causing the person in purple's face to look red?", "choices": ["sunburn", "sunglasses", "goggles", "wind"], "correct_choice_idx": 2, "direct_answers": ["goggles", "goggles", "goggles", "goggles", "goggles", "goggles", "goggles", "goggles", "goggles", "goggles"], "difficult_direct_answer": false, "rationales": ["The goggles have red lenses so the person's face looks like a different color.", "The person has goggles.", "The person's face is covered by a safety item. the person is not wearing sunglasses."], "image": "val2014/COCO_val2014_000000323423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563542, "question_id": "PTEJvKrZHMgAUbgfGAMk8d", "question": "Why are there black chords near the buildings?", "choices": ["for power", "for climbing", "to sell", "for decoration"], "correct_choice_idx": 0, "direct_answers": ["delivering electricity", "power wiring", "telephone wires", "electric lines", "electricity", "electric wires", "electrical lines", "power lines", "electric supply", "for power"], "difficult_direct_answer": true, "rationales": ["The black cords provide power.", "They are utility service lines, allowing electricity to get to the houses.", "The cords provide electricity."], "image": "val2014/COCO_val2014_000000563542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118581, "question_id": "PTbMRU2BVbpP88htNqSuy9", "question": "What area is likely safest for smaller children here?", "choices": ["left", "far seaward", "central", "right most"], "correct_choice_idx": 3, "direct_answers": ["pool", "sand/beach", "land", "shore", "beach", "right most", "inside breaker", "pool", "small lagoon", "beach"], "difficult_direct_answer": false, "rationales": ["The area is the right.", "The water is less deep in the small pond to the right.", "The right has towels and toys."], "image": "train2014/COCO_train2014_000000118581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13182, "question_id": "PU3YrdDzvLa8ZYzTn68wjj", "question": "What is helping someone walk?", "choices": ["crutches", "branch", "walker", "cane"], "correct_choice_idx": 0, "direct_answers": ["crutches", "horse", "parent", "horse", "horse", "crutches", "crutch", "cane", "horse", "cane"], "difficult_direct_answer": false, "rationales": ["The crutches allow the person to walk.", "As indicated on the back of their jacket near their elbow.", "The crutches help the elderly person walk."], "image": "train2014/COCO_train2014_000000013182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345434, "question_id": "PUwwbkkJycP9MT8QKW3tmv", "question": "How many different species of animals besides humans are visible?", "choices": ["six", "three", "none", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "two", "three", "three", "three", "three", "two", "three"], "difficult_direct_answer": false, "rationales": ["There are cats, birds and dogs", "There is a bird, cats and dogs.", "We can see a few cats, a duck, and a couple dogs as species other than the humans in this picture."], "image": "val2014/COCO_val2014_000000345434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264959, "question_id": "PVRMB6357gRSVMV4ttQa93", "question": "What is she doing with the stuffed animal?", "choices": ["showing it", "breaking it", "selling it", "squeezing it"], "correct_choice_idx": 3, "direct_answers": ["squeezing it", "holding", "holding out", "showing", "playing", "playing", "showing", "holding up", "picture taken", "showing"], "difficult_direct_answer": false, "rationales": ["The little girl is squeezing the animal.", "The animal is squeezed.", "The girl is holding the bear towards the camera."], "image": "train2014/COCO_train2014_000000264959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47419, "question_id": "PVhW9cxLajejdZy9UPnvc3", "question": "Which profession would have used the red vehicle?", "choices": ["mailmen", "police", "doctors", "firemen"], "correct_choice_idx": 3, "direct_answers": ["firemen", "fire fighters", "firefighter", "driving", "firefighters", "fireman", "fireman", "firetruck driver", "firefighter", "fire fighting"], "difficult_direct_answer": false, "rationales": ["The profession is a fireman.", "The vehicle is owned by the fire department to be used by firefighters.", "The vehicle has the words west hampton beach fire department written on the side. the color and the objects that are on the vehicle suggest that the drivers would be fire men."], "image": "val2014/COCO_val2014_000000047419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73724, "question_id": "PW5H3RoM7eM4rAdjt3xeJ8", "question": "What color are the speaker cloth coverings?", "choices": ["green", "red", "black", "gray"], "correct_choice_idx": 3, "direct_answers": ["gray", "red", "brown", "grey", "gray", "grey", "gray", "gray", "gray", "gray"], "difficult_direct_answer": false, "rationales": ["Traditionally speakers use black or grey to cover the inside electronics.", "They are grey in color.", "The gray part is seen in large view."], "image": "train2014/COCO_train2014_000000073724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338987, "question_id": "PW8dfLYsS3qLtbCiKZYb8f", "question": "In what event will this thing be used?", "choices": ["flooding", "drought", "fire", "earthquake"], "correct_choice_idx": 2, "direct_answers": ["fire", "fire", "fire", "fire", "fire", "fire", "fire", "fire", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["The hydrant provides water", "This fire hydrant will be used when there's a fire.", "The item in the sidewalk is a fire hydrant that will be used in case of a fire."], "image": "train2014/COCO_train2014_000000338987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149641, "question_id": "PW8gk56p3RqDNiLVHqnWN4", "question": "What type of meal does this appear to be?", "choices": ["meat lovers", "vegetarian", "chinese", "italian"], "correct_choice_idx": 1, "direct_answers": ["vegetarian", "vegetarian", "healthy", "salad", "lunch", "carrot", "salad", "salad", "vegetarian", "traditional foods"], "difficult_direct_answer": false, "rationales": ["The meal is made out of guacamole, carrots, and cabbage.", "The components of the meal are visible and contain no meat which would be consistent with answer a.", "The meal is vegetarian."], "image": "val2014/COCO_val2014_000000149641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135759, "question_id": "PWAQrxbGmjG538zRaZMS9d", "question": "What type of rice is on the dishes?", "choices": ["wild rice", "spanish rice", "brown rice", "white rice"], "correct_choice_idx": 1, "direct_answers": ["dirty rice", "spanish", "spanish", "mexican", "mexican", "spanish rice", "spanish", "saffron", "long grain", "fried"], "difficult_direct_answer": false, "rationales": ["It has been coked till golden brown.", "Spanish rice is yellow.", "That is a mexican type of rice."], "image": "train2014/COCO_train2014_000000135759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65011, "question_id": "PWbXrcomv6ugmgLDSnFath", "question": "What's the name for the hand gesture the man with the mustache is doing?", "choices": ["devil horns", "peace sign", "cowabunga", "thumbs up"], "correct_choice_idx": 0, "direct_answers": ["metal", "devil horns", "peace", "shocker", "bull horns", "rock on", "devil horns", "rock on", "shocker", "horns"], "difficult_direct_answer": false, "rationales": ["The people are rocking out with a casual expression.", "The name is horns.", "The man's fingers form horns."], "image": "train2014/COCO_train2014_000000065011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56932, "question_id": "PWdtLwoHqseLBzfvy85M3R", "question": "What type of transportation is this?", "choices": ["sky", "road", "rail", "water"], "correct_choice_idx": 2, "direct_answers": ["train", "train", "train", "rail", "train", "train", "bus", "train", "train", "light rail"], "difficult_direct_answer": false, "rationales": ["This is a train which travels via the tracks on the ground.", "There is a land vehicle that travels on tracks.", "This is a train station"], "image": "val2014/COCO_val2014_000000056932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489264, "question_id": "PWvuVGB2VFcC5TjCDTYos9", "question": "What country is very famous for the thing on the wood floor?", "choices": ["france", "uk", "south africa", "persia"], "correct_choice_idx": 3, "direct_answers": ["persia", "india", "persia", "india", "orient", "china", "persia", "russia", "europe", "persia"], "difficult_direct_answer": false, "rationales": ["You can get a persian carpet.", "It is a decorative rug, also known as a \"persian rug\"", "The patterns are famous in africa."], "image": "val2014/COCO_val2014_000000489264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246927, "question_id": "PWyeH9dnS2g4Vob8xrCcwa", "question": "What is in the small cream-colored container?", "choices": ["sugar substitute", "coffee creamer", "pepper", "salt"], "correct_choice_idx": 0, "direct_answers": ["sugar packets", "sugar", "sugar substitute", "sugar", "sugar", "alcohol", "sugar", "sugar packets", "butter", "sugar packets"], "difficult_direct_answer": false, "rationales": ["These containers are common to hold small packets of sugar or sugar-like granules to put in coffee or tea.", "Answer a is consistent with the setting of a restaurant where these things are commonly served in this manner.", "The cream colored container has splenda."], "image": "train2014/COCO_train2014_000000246927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478518, "question_id": "PXHVWkhP8SKcigcuDYDv9U", "question": "What type of vehicle is this cup from?", "choices": ["ship", "bus", "plane", "train"], "correct_choice_idx": 0, "direct_answers": ["boat", "boat", "ship", "cruise ship", "cruise", "cruise ship", "ship", "cruise ship", "cruise ship", "cruse ship"], "difficult_direct_answer": false, "rationales": ["The cup says \"norwegian cruise line\".", "The vehicle is a ship.", "A cruise line is a ship."], "image": "val2014/COCO_val2014_000000478518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47837, "question_id": "PXey4n569L9GS72wvQCbRU", "question": "Where was the meat on the table prepared?", "choices": ["oven", "grill", "store", "restaurant"], "correct_choice_idx": 1, "direct_answers": ["pit", "grill", "grilled", "grill", "kitchen", "home", "grill", "stove", "grill", "house"], "difficult_direct_answer": false, "rationales": ["Burgers are usually done on a grill.", "The meat was grilled since the meat resembles burgers.", "It was made on the grill."], "image": "val2014/COCO_val2014_000000047837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110138, "question_id": "PXfnWnkQpUoK2URJizjUVE", "question": "These buses will take you to what province?", "choices": ["manitoba", "ontario", "british columbia", "quebec"], "correct_choice_idx": 2, "direct_answers": ["vancouver", "vancouver", "vancouver", "vancouver", "india", "british columbia", "india", "vancouver", "vancouver", "british columbia"], "difficult_direct_answer": false, "rationales": ["The buses say vancouver.", "These buses will take someone to the province of british columbia in vancouver.", "A large row of red buses are in a line. above the windshield is the name \"victoria.\""], "image": "val2014/COCO_val2014_000000110138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282130, "question_id": "PXk4MMYttQBtMspkHS7xjj", "question": "What is the first name of the one who started this motor company?", "choices": ["ryo", "torakusu", "yoshi", "naoya"], "correct_choice_idx": 1, "direct_answers": ["yamaha", "yamaha", "torakusu", "william", "torakusu", "yamaha", "torakusu", "harley", "yamaha", "torakusu"], "difficult_direct_answer": false, "rationales": ["The name is torakusu.", "Torakusu was the first name of the founder.", "This is a yamaha motorbike. yamaha's first name is torakusu."], "image": "val2014/COCO_val2014_000000282130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198120, "question_id": "PXtWL8ZXgBvKGaUw2Ho8qH", "question": "What setting does this seem to be?", "choices": ["school", "office", "library", "home"], "correct_choice_idx": 1, "direct_answers": ["office", "office", "office", "office party", "office", "office", "office", "office party", "office", "office"], "difficult_direct_answer": false, "rationales": ["There are cubicles and computer work stations in the background.", "The donuts are laid out in an office.", "The setting is a large room with multiple desks, computers and multiple people. these people are at work."], "image": "val2014/COCO_val2014_000000198120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106151, "question_id": "PYEpCkwngR8d5TtWHMMSBw", "question": "What are the cows standing on?", "choices": ["rock", "water", "snow", "sticks"], "correct_choice_idx": 0, "direct_answers": ["dirt", "dirt", "mountain", "rocks", "rocks", "boulder", "rocks", "rocks", "stone", "rock"], "difficult_direct_answer": false, "rationales": ["They are standing on a surface that is rough, very hard, not manufactured by humans, and is outdoors on a hill in a natural undisturbed area, and is partially covered with lichen.", "There are no cows in the image, but the main animal visible is standing on answer a based on the consistency and color visible on the surface.", "The cows are standing on large flat rocks on top of a hill."], "image": "train2014/COCO_train2014_000000106151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79654, "question_id": "PYRqB9zBbwobJ4EHQBNE2V", "question": "These people are on horses in a line as an example of what?", "choices": ["rodeo", "street performing", "parade", "crowd control"], "correct_choice_idx": 2, "direct_answers": ["parade", "parade", "parade", "identity", "police security", "horse", "soliders", "parade", "parade formation", "parade"], "difficult_direct_answer": false, "rationales": ["The horses are getting ready to march.", "People on horses are lined up and people on the sidelines are watching as in a typical parade.", "The people are in a parade."], "image": "train2014/COCO_train2014_000000079654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331326, "question_id": "PYjQML5sbmk9iUhEFPFDm7", "question": "What is in the middle of the room?", "choices": ["stove", "old lady", "cow", "baby"], "correct_choice_idx": 0, "direct_answers": ["microwave", "stove", "stove", "stove", "stove", "microwave", "stove", "stove", "stove", "stove"], "difficult_direct_answer": false, "rationales": ["The cooking machine is at the middle .", "In the center of the wall is the oven range.", "The largest item in the center of the room is a gas stove."], "image": "train2014/COCO_train2014_000000331326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106202, "question_id": "PYpE5cth6hrmbiHJAqMy9v", "question": "Why is he crouching?", "choices": ["to reach", "for speed", "to pull", "to dig"], "correct_choice_idx": 1, "direct_answers": ["balance", "gravity lowering", "skateboarding", "balance", "skateboarding", "speed", "to skate", "for speed", "balance", "speed"], "difficult_direct_answer": false, "rationales": ["The man wants to go faster.", "Getting low makes you aerodynamic", "The man is riding on a skateboard and going slightly downhill."], "image": "train2014/COCO_train2014_000000106202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9895, "question_id": "PYpoKW2LUGGmebdUHwqWCF", "question": "What is the child travelling on?", "choices": ["hummer", "tricycle", "monster truck", "tank"], "correct_choice_idx": 1, "direct_answers": ["sidewalk", "sidewalk", "tricycle", "sidewalk", "tricycle", "tricycle", "playing", "tricycle", "tricycle", "tricycle"], "difficult_direct_answer": false, "rationales": ["The vehicle in question is clearly visible and has three wheels.", "The transport the boy is riding has three wheels.", "The child is on a tricycle."], "image": "train2014/COCO_train2014_000000009895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24591, "question_id": "PZ3FSR8xrzptwxTuGj2Pp7", "question": "To ensure a safe turn the skier looks out for?", "choices": ["all correct", "people", "rocks", "poles"], "correct_choice_idx": 0, "direct_answers": ["trees", "obstacles", "bumps", "all correct", "incoming skiers", "obstacles", "skier behind", "other people", "other skiers", "trees"], "difficult_direct_answer": false, "rationales": ["A rock would cause the skier to collide and fall.", "A skier has to look out for all of the listed obstacles to stay safe.", "The question doesn't make sense, but the thing visibly present that could harm a skier if they ran into without looking would be answer c."], "image": "train2014/COCO_train2014_000000024591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67985, "question_id": "PZB7LTeDPczEufiJGzAAHB", "question": "How many distinct topping types are on this pizza?", "choices": ["two", "one", "four", "three"], "correct_choice_idx": 0, "direct_answers": ["three", "one", "four", "four", "two", "three", "multiple", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is ham and mushrooms", "The pizza is visible and the toppings can be parsed and identified by their color and consistency and then counted.", "The pizza on the table has ham and cheese as toppings."], "image": "train2014/COCO_train2014_000000067985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417261, "question_id": "PZMfvnbswBuNskfoJfvc6Z", "question": "What man made material is produced from the thing the people are standing on?", "choices": ["plastic", "steel", "medicine", "glass"], "correct_choice_idx": 3, "direct_answers": ["glass", "glass", "asphalt", "asphalt", "glass", "diamonds", "glass", "glass", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["Glass is made with sand.", "Sand is a component of glass.", "Glass is used to produce the boards."], "image": "train2014/COCO_train2014_000000417261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64470, "question_id": "PZXhDoLnCFR9mxvh6u6uyZ", "question": "Colloquially is also known as?", "choices": ["wii remote", "joy stick", "game pad", "joy pad"], "correct_choice_idx": 0, "direct_answers": ["linguistic style", "informal", "common", "wii remote", "wii", "familiar", "gaming", "gaming", "unknown", "familiar"], "difficult_direct_answer": false, "rationales": ["The logo writing is visible at the bottom of the remote.", "That is used for a nintendo system.", "Based on the color, design and the way the person is using the item, all features are consistent with answer a."], "image": "train2014/COCO_train2014_000000064470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504894, "question_id": "PZxoxYpQFa7ch4pAq4JRKw", "question": "What type of activity will these people do?", "choices": ["fishing", "running", "gymnastics", "biking"], "correct_choice_idx": 0, "direct_answers": ["fishing", "boating", "fishing", "boating", "boating", "rural", "fishing", "boating", "bike", "fishing"], "difficult_direct_answer": false, "rationales": ["They are fishing in their boat.", "The people on the boat are trying to catch fish in the water.", "They are on the water in a boat"], "image": "train2014/COCO_train2014_000000504894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450970, "question_id": "PaAihW6e4sPustRBr2pXLS", "question": "What building did the people come from?", "choices": ["hotel", "train station", "ferry terminal", "airport"], "correct_choice_idx": 3, "direct_answers": ["airport", "train", "airport", "train station", "airport", "hotel", "hotel", "train station", "airport", "airport"], "difficult_direct_answer": false, "rationales": ["With the luggage, porter, and the area, it looks like they are at a hotel.", "They are all carrying luggages that are used during travels.", "They are seen with suitcase in their hands."], "image": "val2014/COCO_val2014_000000450970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395723, "question_id": "PaEdnqV6JWVBhaVPTAN5e7", "question": "What is the same colors as the animals?", "choices": ["oreo cookie", "lemon drop", "laffy taffy", "keebler fudge"], "correct_choice_idx": 0, "direct_answers": ["car", "black white", "newspaper", "car", "jail uniform", "black white", "oreo cookie", "car", "car", "black white"], "difficult_direct_answer": false, "rationales": ["The animals are visibly black and white. this is the same color combination of answer a.", "Oreos are black and white.", "Oreos are black and white."], "image": "val2014/COCO_val2014_000000395723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200328, "question_id": "PaMhdDZA9xLXyXbYV3BAhW", "question": "Birds seen here are likely doing what?", "choices": ["attacking", "flyover", "migrating", "landing"], "correct_choice_idx": 2, "direct_answers": ["migrating", "reading", "migrating", "flying", "flying north", "flying", "flying", "flying", "migrating", "formation flying"], "difficult_direct_answer": false, "rationales": ["The birds are going to another place to rest.", "The birds are migrating.", "Birds are flying together in a v formation."], "image": "train2014/COCO_train2014_000000200328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44286, "question_id": "PaZP2RiVxiitc7W49T2PKr", "question": "The large speakers next to the monitor suggest someone uses this station for what?", "choices": ["media", "word processing", "web surfing", "picture editing"], "correct_choice_idx": 0, "direct_answers": ["music", "video games", "radio", "music", "music", "music listening", "media", "audio engineering", "music", "music"], "difficult_direct_answer": false, "rationales": ["There are more electronics in the image.", "The station is used for the media.", "They listen to music on the computer"], "image": "train2014/COCO_train2014_000000044286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177953, "question_id": "Pae39BdsnqcWkgemZzk8SU", "question": "What is the woman's job?", "choices": ["pianist", "drummer", "guitarist", "singer"], "correct_choice_idx": 1, "direct_answers": ["drummer", "drummer", "drummer", "drummer", "drummer", "drummer", "drummer", "drummer", "drummer", "drummer"], "difficult_direct_answer": false, "rationales": ["The woman is supposed to be playing the drums.", "She is seated at a drum set", "The job is to drum."], "image": "val2014/COCO_val2014_000000177953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339228, "question_id": "PakMFSZnc6rHbnNsrSy7r2", "question": "What type room is this?", "choices": ["bedroom", "office", "kitchen", "lounge"], "correct_choice_idx": 3, "direct_answers": ["waiting room", "lounge", "waiting room", "public gathering", "community room", "showroom", "waiting room", "community", "lounge", "lounge"], "difficult_direct_answer": false, "rationales": ["There are many armchairs and tables so it's likely a lounge.", "The other options don't apply because they don't usually contain this type of furniture.", "There are many places to sit grouped around tables."], "image": "train2014/COCO_train2014_000000339228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124210, "question_id": "Pb2z2aU8mHDjoeBkKoo6SG", "question": "What will the person who left this gear do with it?", "choices": ["catch butterflies", "go fishing", "have picnic", "water ski"], "correct_choice_idx": 1, "direct_answers": ["fish", "fishing", "go fishing", "fish", "go fishing", "go fishing", "fishing", "retrieve gear", "fishing", "storage"], "difficult_direct_answer": false, "rationales": ["There is a long stick.", "The gear consists of a rod, a net, a tackle box, and a cooler. these items are not used in water skiing, catching butterflies, or having picnics.", "The person wants to catch fish with the net."], "image": "val2014/COCO_val2014_000000124210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269015, "question_id": "Pb3bFzLSwY6ApHbiPJ99aV", "question": "What does the lady use the umbrella for?", "choices": ["hail", "rain", "wind", "shade"], "correct_choice_idx": 3, "direct_answers": ["sun", "block rain", "prevent heat", "sun protection", "keep dry", "block sun", "shade", "stay dry", "shade", "sun block"], "difficult_direct_answer": true, "rationales": ["The woman wants to keep the sun out.", "The umbrella is for shade.", "The lady is sitting outside and needs some protection from the sun and glare in order to read her magazine."], "image": "val2014/COCO_val2014_000000269015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287233, "question_id": "PbDYXPX2bFDbCdPF2YQvWq", "question": "What are the young women doing in front of the tv?", "choices": ["gaming", "sweeping", "debating", "fighting"], "correct_choice_idx": 0, "direct_answers": ["watching", "playing games", "playing", "watching tv", "playing wii", "video games", "playing wii", "playing wii", "gaming", "playing game"], "difficult_direct_answer": false, "rationales": ["The people game.", "She holds a remote to control the activity", "There would be no point in fighting, debating or sweeping near a television set."], "image": "train2014/COCO_train2014_000000287233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222140, "question_id": "PbP9BCbDCYACLufw8h3z6H", "question": "What kind of telephone is being used?", "choices": ["rotary", "cellular", "pay", "landline"], "correct_choice_idx": 1, "direct_answers": ["cellular", "cell", "mobile", "flip phone", "mobile", "cellphone", "cellular", "cellphone", "cellular", "flip phone"], "difficult_direct_answer": false, "rationales": ["The person uses a cellphone.", "The phone doesn't have any wires attached to it.", "The phone is a cell."], "image": "train2014/COCO_train2014_000000222140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279165, "question_id": "Pbj3FeGAqZ2783CWPDazBq", "question": "What type of animals are present?", "choices": ["sheep", "deer", "cattle", "zebra"], "correct_choice_idx": 3, "direct_answers": ["zebra", "zebra", "zebras", "zebras", "zebras", "zibbera", "zebra", "zebra", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["These animals represent zebras.", "This is a large and striped herbivore that is wild and not domestic.", "The distinct stripes identify these animals"], "image": "val2014/COCO_val2014_000000279165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528786, "question_id": "Pbkf9a9BRt2VxnyFkDAqod", "question": "What could they be washing off?", "choices": ["blood", "graffiti", "vomit", "urine"], "correct_choice_idx": 1, "direct_answers": ["graffiti", "graffiti", "graffiti", "grafitti", "poop", "graffiti", "graffiti", "bird poop", "graffiti", "graffiti"], "difficult_direct_answer": false, "rationales": ["Looks like they are cleaning all the writing off of it.", "Objects like this are ruined by so-called art.", "It's white and on the side of the stand where people can easily reach"], "image": "val2014/COCO_val2014_000000528786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441862, "question_id": "PboCHvvT5ezvTRoaUoRojp", "question": "The brand he's advertising on his shirt makes what?", "choices": ["heavy equipment", "clothing", "furniture", "electronics"], "correct_choice_idx": 0, "direct_answers": ["construction equipment", "cat", "caterpillar", "heavy equipment", "trucks", "vehicles", "bulldozers", "construction equipment", "cat", "equipment"], "difficult_direct_answer": false, "rationales": ["The brand cat is known for selling large construction machinery.", "The brand on his shirt is cat, which makes construction machinery--or \"heavy equipment,\" as it's also called.", "The man is wearing a shirt that says cat which is a company that makes construction equipment"], "image": "val2014/COCO_val2014_000000441862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199671, "question_id": "PbshfBGVhf63kz86mqvb99", "question": "What is this type of blade good at?", "choices": ["cutting paper", "cutting bone", "cutting bread", "cutting butter"], "correct_choice_idx": 2, "direct_answers": ["cutting", "slicing", "cutting bread", "cutting", "cutting bread", "cutting", "cutting", "cutting meat", "cutting", "cutting"], "difficult_direct_answer": false, "rationales": ["The knife is serrated.", "It is serrated which helps to saw through things", "The knife is visible and has a serrated blade. this type of blade is commonly used for answer a."], "image": "val2014/COCO_val2014_000000199671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197959, "question_id": "Pbytzz56Spjg4PhVWpSWK8", "question": "What does the round item seen on the floor and plugged into the wall clean?", "choices": ["shoes", "floors", "dishes", "walls"], "correct_choice_idx": 1, "direct_answers": ["floors", "floor", "floor", "floor", "floor", "floors", "floor", "floors", "floor", "floor"], "difficult_direct_answer": false, "rationales": ["The objects are robot vacuums that clean floors.", "The vacuum cleaner cleans the floor.", "The item is the floor."], "image": "train2014/COCO_train2014_000000197959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479711, "question_id": "PcMx2Ba6G6Xp4g4uEFqh6Q", "question": "What animal might have pulled this cart?", "choices": ["monkey", "dog", "kangaroo", "horse"], "correct_choice_idx": 3, "direct_answers": ["horse", "horse", "horse", "oxen", "horse", "oxen", "horse", "oxen", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The other options typically aren't used to pull a cart of this size. it's too large.", "Horses are often used to pull goods and people in carts. this cart is too big to be pulled by hand by a person.", "Horses pull carts."], "image": "train2014/COCO_train2014_000000479711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54355, "question_id": "PcSgtyhT3DfGindVb7Uskh", "question": "Which fruit is most populous on pies seen here?", "choices": ["olives", "pineapple", "apple", "pear"], "correct_choice_idx": 0, "direct_answers": ["olives", "pineapple", "pineapple", "olives", "black olives", "pineapple", "mushrooms", "olives", "olive", "apple"], "difficult_direct_answer": false, "rationales": ["The pie on the right has circular fruit pieces. they are black, not yellow or white.", "Olives are the only fruit on the top of these pizzas.", "Hawaiian pizza contains this fruit"], "image": "val2014/COCO_val2014_000000054355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33643, "question_id": "PccHhKWZizS52VVEQtNiBQ", "question": "How was this treat prepared?", "choices": ["grilled", "sun baked", "baked", "deep fried"], "correct_choice_idx": 3, "direct_answers": ["fried", "deep fried", "fried", "fried", "fried", "fried", "fried", "fried", "fried", "fried"], "difficult_direct_answer": false, "rationales": ["The donut was fried.", "It was fried in oil", "Donuts are cooked in oil."], "image": "train2014/COCO_train2014_000000033643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157006, "question_id": "Pd4CH8yBZ8xUtFMqHaNgHY", "question": "What is the man in the black glasses using the white remote to do?", "choices": ["power tv", "open door", "play games", "control robot"], "correct_choice_idx": 2, "direct_answers": ["play games", "video gaming", "video game", "change channel", "control", "play game", "play games", "play game", "playing game", "play game"], "difficult_direct_answer": false, "rationales": ["The remote is for a wii system.", "The man is holding a game remote with a determined look on his face so he's definitely playing a game.", "It appears to be a wired controller."], "image": "train2014/COCO_train2014_000000157006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386625, "question_id": "Pd5ygEGEdWSSFpXGP9LmzB", "question": "The red vehicle down the street is used for what purpose?", "choices": ["medical emergencies", "public transport", "fire emergencies", "mail delivery"], "correct_choice_idx": 1, "direct_answers": ["cargo", "transport", "transporting passengers", "transport people", "carrying people", "public transport", "public transportation", "tour bus", "transport people", "transportation"], "difficult_direct_answer": true, "rationales": ["This is a bus and its used to bring people around.", "The red vehicle in the distance is a double decker bus. buses drive groups of people along a set route.", "The red vehicle is a bus."], "image": "train2014/COCO_train2014_000000386625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381368, "question_id": "PdfUCgYx5pVji5UQD3jzSq", "question": "Which animal is in most danger here?", "choices": ["rabbit", "cow", "mare", "bird"], "correct_choice_idx": 3, "direct_answers": ["bird", "stork", "bird", "bird", "bird", "bird", "bird", "stork", "bird", "bird"], "difficult_direct_answer": false, "rationales": ["The bird is the smallest.", "It is the smallest one", "The bird is in danger of being injured by the bull."], "image": "val2014/COCO_val2014_000000381368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101969, "question_id": "Pdt7SxYZSF7oZfSYZMkZ8D", "question": "What type of object is this?", "choices": ["model", "set", "lego", "diorama"], "correct_choice_idx": 0, "direct_answers": ["model train", "model train", "toy train", "model train", "model train", "toy train", "model", "model train", "train", "train replica"], "difficult_direct_answer": false, "rationales": ["This is a small scale railroad.", "This is a model trainset.", "The train is part of a model set that someone built for fun."], "image": "val2014/COCO_val2014_000000101969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130361, "question_id": "PeF4P8evf32wuiYwmHXgoY", "question": "These animals live how many years on average?", "choices": ["60", "five", "20", "200"], "correct_choice_idx": 0, "direct_answers": ["30", "30", "17", "fifty four", "70", "forty eight", "60", "30", "60 years", "80"], "difficult_direct_answer": false, "rationales": ["The lifespan of an african elephant is 60-70 years.", "This is the average age for elephants", "Elephants can live as long as people."], "image": "train2014/COCO_train2014_000000130361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424124, "question_id": "PeRD4ZwctxqpwraDDkBuNd", "question": "What water brand is advertised in the dugout?", "choices": ["voss", "dasani", "nestle", "aquafina"], "correct_choice_idx": 3, "direct_answers": ["aquafina", "aquafina", "aquafina", "aquafina", "aquafina", "aquafina", "aquafina", "aquafina", "aquafina", "aquafina"], "difficult_direct_answer": false, "rationales": ["The brand is clearly visible and is the only one that can be seen", "Aquafina is the water brand that is being advertised on the boards.", "Aquafina is advertised."], "image": "train2014/COCO_train2014_000000424124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83149, "question_id": "PeYA8DPfikXnzvEcDrkPb3", "question": "What shiny object is in the foreground here?", "choices": ["man", "bar", "mirror", "napkins"], "correct_choice_idx": 1, "direct_answers": ["metal", "counter", "bar", "plates light", "cooking tool", "oven", "bar top", "stove", "counter", "kitchen pans"], "difficult_direct_answer": true, "rationales": ["It is a stainless steel counter", "The stainless steel counter is a place where people sit at close to the kitchen or bar area.", "It is a long flat surface that multiple people can sit behind."], "image": "train2014/COCO_train2014_000000083149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238001, "question_id": "PeYM95Mm4GBW5kbvo6WXKs", "question": "What brand of suitcase is the woman in white holding on to?", "choices": ["coach", "vera wang", "gucci", "toler"], "correct_choice_idx": 1, "direct_answers": ["carryon luggage", "louis vuitton", "vera wang", "eagle creek", "coach", "carryon", "gucci", "heys", "heys", "unsure"], "difficult_direct_answer": true, "rationales": ["The suitcase is vera.", "It is a signature design of the g brand.", "She is holding a coach bag."], "image": "train2014/COCO_train2014_000000238001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4444, "question_id": "PeadP6LbxWxiPSwYo6Ej5K", "question": "Where is this dining room located in all likelihood?", "choices": ["bus", "plane", "train", "storefront"], "correct_choice_idx": 2, "direct_answers": ["kitchen", "train", "kitchen", "train", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["Some long trains have dining carts and the shape of the room looks like the interior of a train.", "The rows of seats, windows and metal poles for holding when standing tell us this picture was taken inside a train.", "There are train windows on either side."], "image": "train2014/COCO_train2014_000000004444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461378, "question_id": "PecJUYEvDSReX4FewTXts5", "question": "The lighting item seen here most replicated is constructed from what?", "choices": ["wax", "glass", "bulbs", "wood"], "correct_choice_idx": 0, "direct_answers": ["wax", "wax", "wax", "flowers", "wax", "leafs", "camera flash", "candles", "candle", "wood"], "difficult_direct_answer": false, "rationales": ["It is a candle and candles are usually made from wax.", "Candles are made from a melting substance.", "The item is wax."], "image": "val2014/COCO_val2014_000000461378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92301, "question_id": "PeiUywBMaB5HVQufvStuL5", "question": "Why is rain going through his umbrella?", "choices": ["entertainment", "bad film", "heavy rain", "is broken"], "correct_choice_idx": 0, "direct_answers": ["leaks", "prop umbrella", "entertainment", "holes", "holes", "theater play", "hole", "play", "leaking", "pumped water"], "difficult_direct_answer": true, "rationales": ["Normally umbrellas keep rain off of people but in this instance it's made to be funny.", "Rain is going through the umbrella to entertain the audience of some show.", "The man holding the umbrella is putting on a show and is getting wet on purpose to be funny."], "image": "train2014/COCO_train2014_000000092301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446585, "question_id": "Pey5D7yx4eTbeSB3bSfqiZ", "question": "What material is made from their fur?", "choices": ["cotton", "silk", "polyester", "wool"], "correct_choice_idx": 3, "direct_answers": ["wool", "wool", "wool", "wool", "coat", "wool", "wool", "wool", "coat", "wool"], "difficult_direct_answer": false, "rationales": ["It was found that wool could be made form the fleece of sheep in 400-300 bc. since that time, sheep have been providers of both fleece and mutton for their human owners.", "This is the only material that comes from this mammal; the other options come from worms, a plant or are synthetic.", "The sheep have coats made of fur that can be made into wool."], "image": "train2014/COCO_train2014_000000446585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251026, "question_id": "PezYdJKsiDT4MYJkfCdQnY", "question": "Why are there so many different train tracks so close together?", "choices": ["factory/trains", "seaside convergence", "depot/switching", "recycling area"], "correct_choice_idx": 2, "direct_answers": ["trainyard", "city center", "depot/switching", "depot", "railway", "train station", "save space", "major hub", "city", "space management"], "difficult_direct_answer": true, "rationales": ["Trains are often seen in groups at depots.", "The trains are being switched around.", "There are so many train tracks close together to switch at the depot."], "image": "train2014/COCO_train2014_000000251026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509682, "question_id": "PfFB4REMke8SAf87FM2izz", "question": "Why are the luggage bags on the cart?", "choices": ["to destroy", "as decoration", "to sell", "to transport"], "correct_choice_idx": 3, "direct_answers": ["for transport", "to transport", "transported", "load train", "transport", "pick up", "transport", "transport", "easily carried", "transport"], "difficult_direct_answer": false, "rationales": ["The luggage bags are on the cart to help the people transport their luggage without carrying it.", "The luggage is being used for transportation.", "The bags are on the cart to help them move easily."], "image": "train2014/COCO_train2014_000000509682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410573, "question_id": "PfHcPhMcsjgq8JPNRoRyEm", "question": "What is the cat near?", "choices": ["goat", "baby", "piglet", "dog"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "chair", "dog", "chair", "dog", "dog", "dog", "dog", "chair"], "difficult_direct_answer": false, "rationales": ["The cat is lording it over the pup.", "The cat is by a dog.", "The cat is on the back of the chair and the dog is sitting on the seat of the chair."], "image": "val2014/COCO_val2014_000000410573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151330, "question_id": "PfPbxueJE3WCW9jZsNqpDT", "question": "Where is this man going?", "choices": ["track", "pool", "work", "ocean"], "correct_choice_idx": 3, "direct_answers": ["surfing", "to beach", "ocean", "surfing", "ocean", "surfing", "surfing", "ocean", "beach", "surfing"], "difficult_direct_answer": false, "rationales": ["One can surf in in sea", "The man is holding a skate board.", "The man goes to the ocean."], "image": "val2014/COCO_val2014_000000151330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242034, "question_id": "PfSZuwsZygF8aQNrUG75jp", "question": "What is the man in the bottom left holding?", "choices": ["club", "umbrella", "cane", "bat"], "correct_choice_idx": 2, "direct_answers": ["walking stick", "cane", "walking stick", "cane", "cane", "cane", "cane", "cane", "cane", "cane"], "difficult_direct_answer": false, "rationales": ["He is using a stick-like device to support part of his weight while walking.", "A long stick can be seen from his hand to the ground, resembling a walking cane to assist walking.", "This looks like a stick and the guys age makes it likely to be a cane."], "image": "val2014/COCO_val2014_000000242034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256230, "question_id": "PfopEB2fQBh2htxfP2L28u", "question": "What is the boy doing with the cat?", "choices": ["hitting it", "petting it", "grooming it", "feeding it"], "correct_choice_idx": 3, "direct_answers": ["feeding it", "feeding", "feeding it", "feeding", "feeding it", "feeding it", "feeding", "feeding", "feeding", "feeding"], "difficult_direct_answer": false, "rationales": ["The boy looks like he is putting food in the bowl.", "The boy is scooping up food from the bowl and feeding it to the cat.", "The bowl is holding a bowl that has something in it. bowls are frequently used to contain food and the boy and cat both being interested in the bowl means they are likely eating."], "image": "train2014/COCO_train2014_000000256230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28518, "question_id": "PgjZzvMLWwLYzHbkKRshYY", "question": "What place is known for this kind of weather?", "choices": ["las vegas", "london", "egypt", "antarctica"], "correct_choice_idx": 1, "direct_answers": ["london", "seattle", "florida", "seattle", "hawaii", "seattle", "seattle", "seattle", "oregon", "seattle"], "difficult_direct_answer": false, "rationales": ["London is known for rain.", "A woman who wears a raincoat and carries an umbrella is in wet weather. dry or frozen areas include egypt, las vegas and antarctica.", "They get a lot of rain"], "image": "train2014/COCO_train2014_000000028518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442657, "question_id": "PgttJDtrJ9cCLeZbdM4FFR", "question": "This room looks like an old type of what?", "choices": ["school", "church", "hospital", "prison"], "correct_choice_idx": 2, "direct_answers": ["hospital", "hospital", "hospital", "hospital", "hospital", "hospital", "hospital", "hospital", "hospital", "hospital"], "difficult_direct_answer": false, "rationales": ["The room has metallic beds but does not have prison cells. churches and schools do not have beds.", "This was an old room that doctors and nurses would take care of their patients.", "This is how a often looked decades ago. they can also look this way in field varieties, but the frames aren't use these days."], "image": "train2014/COCO_train2014_000000442657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370383, "question_id": "PgxwTVwpVre5zNvqNTBBVc", "question": "Who of these three seemingly has the best vision?", "choices": ["right", "all same", "middle", "left"], "correct_choice_idx": 2, "direct_answers": ["middle person", "firs lady", "middle", "middle", "middle", "middle woman", "middle woman", "middle", "middle", "center lady"], "difficult_direct_answer": false, "rationales": ["The one in the middle doesn't have glasses.", "She doesn't need glasses", "The people on either side are wearing glasses. people wear glasses when they have poor visible that needs correction."], "image": "train2014/COCO_train2014_000000370383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29523, "question_id": "Ph48CdzoeaY5LGMxJj38EY", "question": "Why is the woman carrying an open umbrella?", "choices": ["fashion", "rain", "sun", "joke"], "correct_choice_idx": 2, "direct_answers": ["shade", "sun", "shade", "sunny", "sun", "sun protection", "block sun", "shade", "block sun", "shade"], "difficult_direct_answer": false, "rationales": ["It is a clear day. it is not raining.", "There is no rain, but the sun is shining bright, so the umbrella can help protect from harmful uv rays.", "An umbrella can provide shade."], "image": "train2014/COCO_train2014_000000029523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364221, "question_id": "PhDGyozk4K5rt5MbBHbXrv", "question": "What is the same color as the color of the surfboard?", "choices": ["cherry", "butter", "lime", "grapefruit"], "correct_choice_idx": 1, "direct_answers": ["sun", "banana", "butter", "yellow", "hair", "yellow", "person's hair", "bananas", "bananas", "lemon"], "difficult_direct_answer": false, "rationales": ["This is the only color option that matches the actual color.", "A girl is on a yellow surfboard in the water. butter is yellow.", "Butter has the same color as the board."], "image": "train2014/COCO_train2014_000000364221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322327, "question_id": "PhJwJRQzfZk2y2GMQjmHEt", "question": "According to the surroundings what is this person doing?", "choices": ["dining", "watching tv", "traveling", "working"], "correct_choice_idx": 2, "direct_answers": ["traveling", "travelling", "train traveling", "travelling", "relaxing", "riding train", "train", "drinking", "traveling", "traveling"], "difficult_direct_answer": false, "rationales": ["The person is in an enclosed space that looks like some kind of transport based on the cot, the visible luggage and shelving. someone would be in this setting if they were moving to a new location.", "The luggage and small cramped quarters where the man is sitting indicates a small traveling space.", "The person is sitting on a bunk bed with luggage next to him which means he is traveling somewhere."], "image": "train2014/COCO_train2014_000000322327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440709, "question_id": "PhSe7rLg4EyxwpYKxX7sZ3", "question": "What vitamin does this fruit contain the most?", "choices": ["vitamin", "vitamin b", "vitamin c", "vitamin e"], "correct_choice_idx": 2, "direct_answers": ["vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c"], "difficult_direct_answer": false, "rationales": ["Oranges are a citrus fruit so they contain vitamin c.", "This is obvious. absorbic or citric acid helps.", "Oranges have vitamin c."], "image": "val2014/COCO_val2014_000000440709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430191, "question_id": "PhT8t88NyEuHbfowKAxv9x", "question": "What is the man inside the front of the training doing?", "choices": ["driving", "loading", "boarding", "policing"], "correct_choice_idx": 0, "direct_answers": ["driving", "conducting", "driving", "driving it", "controlling", "operating it", "train conducting", "driving", "conductor driving", "operating"], "difficult_direct_answer": false, "rationales": ["The man is steering the train.", "The man is operating the train.", "A man is sitting in the front cart of a train."], "image": "train2014/COCO_train2014_000000430191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406205, "question_id": "Phbh3ZJmQ3J2zoU2RAMXFf", "question": "Why have they made the trees blue?", "choices": ["visibility", "attract pollinators", "protect leaves", "holidays"], "correct_choice_idx": 3, "direct_answers": ["holiday decor", "christmas", "give color", "pretty", "christmas decoration", "christmas", "holidays", "for christmas", "holidays", "celebrate christmas"], "difficult_direct_answer": false, "rationales": ["People put up coloured lights during the holidays.", "The lights are blue because it's the holidays.", "The lights are used for christmas."], "image": "train2014/COCO_train2014_000000406205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261858, "question_id": "PhgzGPJXAsUofhhp4zDewd", "question": "What items are hanging on the wall?", "choices": ["family crests", "portraits", "posters", "tongs"], "correct_choice_idx": 3, "direct_answers": ["tongs", "holders", "tongs", "tongs", "tongs", "tongs", "food tongs", "tongs", "tongs", "tongs"], "difficult_direct_answer": false, "rationales": ["The kitchen has tongs hanging on the wall next to the man washing pots. the shape and makeup of the tongs is visible and is clear.", "Tongs are hanging on the wall.", "Several metal pinchers are hanging above a man washing dishes."], "image": "train2014/COCO_train2014_000000261858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402221, "question_id": "Phmu7VNB2AAgE4YrUAr3YT", "question": "What is the cat doing?", "choices": ["hunting", "resting", "leaping", "eating"], "correct_choice_idx": 1, "direct_answers": ["sleeping", "laying down", "watching", "resting", "cat", "napping", "sleeping", "napping", "laying down", "resting"], "difficult_direct_answer": false, "rationales": ["The cat sleeps.", "The boy looks tired.", "By the look of the image the cat seems to be resting."], "image": "train2014/COCO_train2014_000000402221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373374, "question_id": "PiPM8SLYptWjoMmssaJUY3", "question": "What type of meat is used in the sandwich?", "choices": ["pork", "beef", "seafood", "poultry"], "correct_choice_idx": 2, "direct_answers": ["crab", "chicken", "tuna", "crab meat", "burger", "turkey", "crab", "tuna", "seafood", "lobster"], "difficult_direct_answer": false, "rationales": ["Crab is in the sandwich.", "Looks like its seafood in the bun.", "It is a roll filled with lobster."], "image": "val2014/COCO_val2014_000000373374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547592, "question_id": "PiUuon2xjHkumXhrjp5kd6", "question": "What shared amusement might people do here most passively?", "choices": ["gamble", "play risk", "sing", "watch tv"], "correct_choice_idx": 3, "direct_answers": ["watching tv", "watch television", "watch tv", "watching tv", "watch tv", "watch tv", "watch tv", "watching tv", "watch tv", "watch television"], "difficult_direct_answer": false, "rationales": ["The furniture supports this activity best", "Many people can watch this at the same time", "The room is situated so that furniture is facing the tv. people commonly like to sit on furniture and may or may not watch the visible tv."], "image": "train2014/COCO_train2014_000000547592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249453, "question_id": "PiZ6A9T3EvGAAgPrcTXqyY", "question": "What group does the woman belong to?", "choices": ["sikhism", "jewish", "islam", "amish"], "correct_choice_idx": 3, "direct_answers": ["amish", "woman", "amish", "cook", "mormon", "amish", "mennonites", "mennonite", "amish", "amish"], "difficult_direct_answer": false, "rationales": ["The woman is dressed in traditional amish clothing and wearing a bonnet.", "A woman is cooking and wearing a white apron over a traditional dress with a white hat that sits on the back of her head.", "The woman is an amish person."], "image": "val2014/COCO_val2014_000000249453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204432, "question_id": "PifLepPQVK8B48LQtQuFc4", "question": "Why is she covering her eyes?", "choices": ["rain protection", "snow protection", "ice protection", "sun protection"], "correct_choice_idx": 3, "direct_answers": ["sunlight", "sun protection", "sunglasses", "sunlight", "bright sun", "sunlight", "sun", "shade", "sun", "sunny day"], "difficult_direct_answer": false, "rationales": ["She is wearing sunglasses.", "She wants a shield from the sun.", "To prevent harmful sun rays or dirt from reaching her eyes."], "image": "val2014/COCO_val2014_000000204432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511844, "question_id": "PinKTZV7ASqaipGifBTSns", "question": "What are the white marks on the horse's legs called?", "choices": ["boots", "shoes", "leggings", "socks"], "correct_choice_idx": 3, "direct_answers": ["pastern", "tape", "hair", "socks", "socks", "unknown", "markings", "stockings", "hooves", "socks"], "difficult_direct_answer": false, "rationales": ["The horse has white feet.", "They are called stockings when they go all the way to the knee.", "The marks are socks."], "image": "val2014/COCO_val2014_000000511844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125120, "question_id": "Pj4dujn4wqw5Mdfm4XU3UG", "question": "What is most likely in the silver containers?", "choices": ["milk", "juice", "coffee", "water"], "correct_choice_idx": 2, "direct_answers": ["coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["Coffee is in the carafes.", "The juice is in the clear container. the silver containers are insulated to keep the liquid warm.", "Coffee is typically stored in those metal containers to keep hot."], "image": "train2014/COCO_train2014_000000125120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490872, "question_id": "Pj8PeyWFWtw6TGZJMKMgja", "question": "What is in the container on the toilet tank?", "choices": ["baby powder", "toothpaste", "bubble bath", "cleanser"], "correct_choice_idx": 3, "direct_answers": ["toilet", "cleanser", "bleach", "cleaning product", "cleaner", "bathroom cleaner", "cleaner", "drinks", "comet cleaner", "cleaning product"], "difficult_direct_answer": false, "rationales": ["This is a cleaning product you can use to clean the tub or sink with.", "There is a can of cleaner on top of the toilet.", "A green bottle of comet or something very similar is used to clean the area. it makes the bathroom look, smell and keeps it nice and clean."], "image": "train2014/COCO_train2014_000000490872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438899, "question_id": "Pj8rziqNvyZhvP4ySjBn9q", "question": "What is stored inside the plastic case in front of the dog statue?", "choices": ["cd", "micro chip", "memory card", "mouse"], "correct_choice_idx": 0, "direct_answers": ["cd", "cd", "compact disc", "cell phone", "cd", "cd", "cd", "cd", "cds", "can't see"], "difficult_direct_answer": false, "rationales": ["The plastic case has a cd in it.", "This is a music device", "The case is too small to hold a mouse and too big to hold a card or chip."], "image": "train2014/COCO_train2014_000000438899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85187, "question_id": "Pj8xTYkmvHhohgPnujdcVW", "question": "What are the vehicles forbidden to do here?", "choices": ["enter street", "stop", "park", "leave street"], "correct_choice_idx": 0, "direct_answers": ["enter", "enter", "park", "park", "park", "enter", "park", "drive", "enter street", "park"], "difficult_direct_answer": false, "rationales": ["The vehicles enter the street.", "There is a circle with a cross in it as if vehicles should not enter.", "This is a no waiting sign."], "image": "val2014/COCO_val2014_000000085187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92710, "question_id": "Pj9uuqCXjRicipA7sgCmrL", "question": "Why is he riding on the sidewalk?", "choices": ["too slow", "more fun", "he's walking", "he's tired"], "correct_choice_idx": 0, "direct_answers": ["safer", "safety", "stay safe", "pulling child", "safety", "more space", "dog carrier", "avoiding vehicles", "safety", "too slow"], "difficult_direct_answer": false, "rationales": ["The man is too slow for the real road.", "He has a large trailer he is pulling.", "It is where the bike path goes"], "image": "train2014/COCO_train2014_000000092710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546782, "question_id": "PjTvtD9nM6yLzwga4WjUYU", "question": "Which type of mirror is in the above picture?", "choices": ["none", "convex", "concave", "regular"], "correct_choice_idx": 1, "direct_answers": ["convex mirror", "blind spot", "blind spot", "rearview", "round", "traffic mirror", "concave", "rear mirror", "reflective", "convex"], "difficult_direct_answer": true, "rationales": ["The objects in the center of the reflection appear to be closer to the viewer.", "The mirror is very curved.", "The curve of the mirror is apparent based on the distortion of the reflection."], "image": "val2014/COCO_val2014_000000546782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31235, "question_id": "PjWPKp8NuCLHoGJCSMbY3w", "question": "What is the weather faced by the woman?", "choices": ["foggy", "sunny", "cold", "stormy"], "correct_choice_idx": 3, "direct_answers": ["windy", "stormy", "windy", "rainy weather", "windy", "rain", "rain", "wind", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["Her umbrella has been blown and broken", "The woman is facing stormy weather.", "The weather is very stormy and broke the umbrella."], "image": "val2014/COCO_val2014_000000031235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139969, "question_id": "PjXgtEFFu4ctjzDBvnAnQW", "question": "What kind of food is most common in this restaurant?", "choices": ["spicy", "taco", "sandwich", "curry"], "correct_choice_idx": 0, "direct_answers": ["rice", "chinese", "chinese", "chinese", "spicy", "rice", "chinese", "chinese", "rice", "chinese"], "difficult_direct_answer": false, "rationales": ["Szechwan is known for being hot and spicy.", "It is a szechuan styled restaurant", "The word szechuan is on the building is which spicy."], "image": "val2014/COCO_val2014_000000139969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193663, "question_id": "PjfSAFLEt8TVoZpfvKf5Z4", "question": "What is this player getting ready to do?", "choices": ["swing", "dunk", "throw", "dribble"], "correct_choice_idx": 0, "direct_answers": ["bat", "run", "run", "hit ball", "run", "bat", "run", "hit", "swing", "bat"], "difficult_direct_answer": false, "rationales": ["The player is getting ready to swing the bat.", "The man is holding a bat and playing baseball. he is using the bat to try to make contact with a ball that was pitched to him.", "A guy is wearing a baseball uniform and is holding a bat up in the air."], "image": "train2014/COCO_train2014_000000193663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65500, "question_id": "PjnwdvHVjRcUiAjd4gE5Rk", "question": "Which one is inappropriately dressed?", "choices": ["brown outfit", "blue tshirt", "green top", "shorts"], "correct_choice_idx": 0, "direct_answers": ["animal", "dog boy", "costume", "dog costume", "on right", "brown outfit", "costume", "animal costume", "dog", "furthest right"], "difficult_direct_answer": true, "rationales": ["The person in the brown outfit isn't wearing athletic clothes.", "The people are standing on the tennis court, and one of the is dressed like a brown dog.", "The people are playing tennis. the green top, blue t-shirt, and shorts are appropriate attire."], "image": "train2014/COCO_train2014_000000065500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137178, "question_id": "PjtHvJ9UvjWj5L5rFL6tQM", "question": "The person taking this picture is sitting behind the fence in which part of the stadium?", "choices": ["field", "bull pit", "pitchers mound", "seats"], "correct_choice_idx": 3, "direct_answers": ["home plate", "front row", "seats", "corner", "home plate", "stands", "bench", "backstop", "backstop", "bleachers"], "difficult_direct_answer": false, "rationales": ["The mound and field are in front of the photographer and the person batting. a baseball field has a bull pen, not a bull pit.", "Spectators sit behind the net at baseball games.", "They are sitting in the stadium seats."], "image": "train2014/COCO_train2014_000000137178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553072, "question_id": "PjzT88wSfR4XKktmUmdBHd", "question": "What are these planes primarily used for?", "choices": ["performances", "rescues", "passengers", "military"], "correct_choice_idx": 0, "direct_answers": ["flying", "military", "military", "military", "air shows", "cargo", "demonstration", "rescue", "performances", "transportation"], "difficult_direct_answer": false, "rationales": ["The shape and size, as well as the ability to house a single pilot all suggest that these are military vehicles. 0", "These are to show off pilot skills", "The planes are show planes and go fast."], "image": "val2014/COCO_val2014_000000553072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65001, "question_id": "Pk4Sd2V4pUKT6KKioQbCQD", "question": "When stopping what body part did most people use to stop their vehicles?", "choices": ["foot", "hand", "eye", "hip"], "correct_choice_idx": 1, "direct_answers": ["hand", "feet", "feet", "hand", "foot", "feet", "hand", "hands", "foot", "feet hand"], "difficult_direct_answer": false, "rationales": ["The brake for the bike is on their handles.", "The motorcycles in the parking lot have brakes on the handlebars that are used by hand.", "The brakes are usually on the handles of motorcycles."], "image": "val2014/COCO_val2014_000000065001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215213, "question_id": "PkALiETjLi8eJVwL3RbWkf", "question": "For what reason do these people share this room?", "choices": ["punishment", "convention", "emergency evacuation", "imprisonment"], "correct_choice_idx": 1, "direct_answers": ["group event", "family reunion", "birthday", "convention", "convention", "celebrating", "convention", "party", "business event", "celebrating"], "difficult_direct_answer": false, "rationales": ["It is one of the main reasons so many people are all together in a room.", "These happy, free people gather to learn", "They are all there to celebrate something"], "image": "val2014/COCO_val2014_000000215213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184667, "question_id": "PkMRxC4DHny8XQKHgm7DXR", "question": "What phobia is generated by the spider?", "choices": ["ecophobia", "polyphobia", "arachnophobia", "nosophobia"], "correct_choice_idx": 2, "direct_answers": ["arachnophonia", "arachnophobia", "arachnophobia", "arachnophobia", "arachnaphobia", "arachnophobia", "arachnophobia", "arachnophobia", "arachnophobia", "arachnophobia"], "difficult_direct_answer": false, "rationales": ["Arachnophobia is a fear of spiders.", "The phobia is of spiders.", "Some people are terrified of spiders."], "image": "val2014/COCO_val2014_000000184667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272059, "question_id": "PkQRzqfNeJwNrgs8696ADs", "question": "When was this teams ballpark built?", "choices": ["1911", "1945", "1936", "1950"], "correct_choice_idx": 0, "direct_answers": ["nineteen twelve", "1912", "what ink", "1912", "1901", "1912", "1912", "1912", "1912", "1911"], "difficult_direct_answer": false, "rationales": ["The baseball player is wearing a red sox uniform. the red sox play at fenway park which was built in 1911.", "The team is identifiable by the jersey visible. an internet search provides the answer.", "That's when fenway park was being built."], "image": "train2014/COCO_train2014_000000272059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4971, "question_id": "PkYTcp8yLM9UNNt73vQo3B", "question": "What vehicles are located in the water?", "choices": ["yacht", "boat", "canoe", "jet ski"], "correct_choice_idx": 1, "direct_answers": ["boats", "boats", "boats", "boats", "boats", "boats", "boats", "boat", "boats", "boats"], "difficult_direct_answer": false, "rationales": ["Boats float in water and are a form of water transportation.", "These are pleasure vehicles to enjoy the water", "There are many vehicles in the water. they each contain a motor on the end that help move it in the water."], "image": "train2014/COCO_train2014_000000004971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138747, "question_id": "Pkg5FiZLpqB7JYJXvt7VtR", "question": "The glare from the sun is distracting for drivers by reflecting off of what surface?", "choices": ["pocket mirrors", "cell phone", "camera flash", "motorcycle shields"], "correct_choice_idx": 3, "direct_answers": ["metals", "glass", "road", "windshield", "motorcycle shields", "glass", "windshield", "car surface", "metal", "metal"], "difficult_direct_answer": false, "rationales": ["The shield is right in front of the drivers face and would be the most distracting spot for a reflection.", "Given the vehicles shown and the small shield they have in front, you can tell what the answer is.", "The other options don't seem to apply in this image."], "image": "train2014/COCO_train2014_000000138747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15757, "question_id": "PkvoxBagJBnHKptowsMMhJ", "question": "Where is the person holding the sail seen here standing?", "choices": ["whale's back", "roadway", "ocean", "beach"], "correct_choice_idx": 2, "direct_answers": ["on beach", "on water", "water", "beach", "ocean", "shoreline", "water", "beach", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The sail is connected to a surf boarder consistent with the activity being displayed. while being used as intended, the person would then be in answer a.", "The person is standing in the ocean.", "The person is in the ocean since they're at the brink of the sand."], "image": "train2014/COCO_train2014_000000015757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577373, "question_id": "Pm4HLp7By2sd7Dtqe8kuid", "question": "What color of shoes does the woman on the left wear on the field?", "choices": ["yellow", "blue", "black", "white"], "correct_choice_idx": 0, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "neon yellow", "yellow", "yellow", "neon green", "yellow", "neon yell"], "difficult_direct_answer": false, "rationales": ["They are the same color as a banana", "She has yellow shoes on her feet.", "They are bright agains the turf color"], "image": "val2014/COCO_val2014_000000577373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124940, "question_id": "Pm5PrkuqN5XpcPm22bnMA3", "question": "How many utensils were used to prepare this sandwich?", "choices": ["three", "seven", "four", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The knife can cut and spread the toppings", "Only a knife is needed to spread and cut.", "They used the same one for the peanut butter and the jelly."], "image": "val2014/COCO_val2014_000000124940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58079, "question_id": "PmW3VGGAJESxdrEQ7tTv24", "question": "What is this brush intended for?", "choices": ["feet", "hair", "teeth", "nails"], "correct_choice_idx": 2, "direct_answers": ["teeth", "teeth", "dental hygiene", "teeth", "clean teeth", "brush teeth", "cleaning teeth", "brushing teeth", "brush teeth", "teeth"], "difficult_direct_answer": false, "rationales": ["It is used to clean inside the mouth", "The brush is of the right size and shape and being used in the setting consistent with answer a.", "The person is holding a small brush under water."], "image": "val2014/COCO_val2014_000000058079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269541, "question_id": "PmdDUdH4dvUtQy6tGahCNx", "question": "What is the umbrella used to avoid?", "choices": ["debris", "sun", "rain", "birds"], "correct_choice_idx": 1, "direct_answers": ["sun", "sun", "sunlight", "sunlight", "sunlight", "sun", "sun", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["The umbrella is over the bananas to keep the sun off of them.", "The umbrella is used to avoid sunlight and is a parsol.", "The umbrella is keeping sun away."], "image": "train2014/COCO_train2014_000000269541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380259, "question_id": "PnJJKAwx4225XDgCX5CFXy", "question": "What can be obtained from the yellow thing on the wall?", "choices": ["gas", "groceries", "water", "blood"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["They are both water spouts.", "The yellow thing is a plug for a hoseline.", "Water can be obtained from the sprinkler."], "image": "train2014/COCO_train2014_000000380259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120282, "question_id": "PnbG2zBu7TKdP6foWbk5g4", "question": "What type of birds are the little ones?", "choices": ["seagull", "swan", "duck", "goose"], "correct_choice_idx": 1, "direct_answers": ["ducklings", "ducks", "goslings", "ducks", "geese", "ducklings", "geese", "ducks", "ducklings", "swan"], "difficult_direct_answer": false, "rationales": ["The little birds are the big swan's children.", "The mother is white which means she is a swan so her babies would be too.", "It's hard to tell in the image. they may be the offspring of a or they may be b."], "image": "val2014/COCO_val2014_000000120282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53095, "question_id": "Pnpc7k4mu87QKsf7zMkPYg", "question": "Why are their hands outstretched?", "choices": ["controlling kites", "getting water", "balancing", "climbing ropes"], "correct_choice_idx": 0, "direct_answers": ["holding cable", "holding parasail", "holding sail", "flying kite", "controlling kites", "windsurfing", "hold cable", "balance", "holding parachutes", "balancing"], "difficult_direct_answer": true, "rationales": ["They are holding on and using them to stay up in the water.", "The people want to control their sails.", "Their hands are connected to ropes which are connected to the kites jn order to guide directions."], "image": "train2014/COCO_train2014_000000053095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423091, "question_id": "PnqmPEW7zerPeRcMkUXMn8", "question": "From which direction did this skateboarder just come?", "choices": ["high", "none", "below", "their right"], "correct_choice_idx": 2, "direct_answers": ["up", "right", "ground", "up", "below", "left", "down", "right", "down", "below"], "difficult_direct_answer": false, "rationales": ["He and the skateboard he is riding on are completely in the air.", "He flew off the top of the ramp", "This skater would skate up the hill and then do a stunt like he's doing."], "image": "train2014/COCO_train2014_000000423091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293825, "question_id": "Pntrb5t7VyDmhcUehUuerR", "question": "What is happening on the road?", "choices": ["protest", "parade", "traffic jam", "car accident"], "correct_choice_idx": 2, "direct_answers": ["traffic jam", "traffic jam", "traffic jam", "traffic", "traffic", "traffic jam", "driving", "traffic jam", "traffic", "traffic jam"], "difficult_direct_answer": false, "rationales": ["The street is full of many cars and looks like rush hour traffic.", "The cars are packed closely together and appear to be at a standstill.", "There is a traffic jam of taxis on the road."], "image": "train2014/COCO_train2014_000000293825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257388, "question_id": "Po7dCPh8FAxUmP67kaZitQ", "question": "What does this woman hold in her right hand?", "choices": ["gun", "napkin", "wine", "umbrella"], "correct_choice_idx": 1, "direct_answers": ["ribbon", "wiper", "umbrella", "gloves", "napkin", "napkin", "red cloth", "napkin", "napkin", "napkin"], "difficult_direct_answer": false, "rationales": ["It is used to prevent food from attaching to her clothes.", "The woman is holding something that is cloth in her right hand. the woman appears to be at a restaurant based on the tables and settings and a cloth item of this consistency in a restaurant is commonly a napkin.", "The woman has a cloth napkin."], "image": "train2014/COCO_train2014_000000257388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75595, "question_id": "PoByPfcYdfaYbUa4NBuMiU", "question": "What type of bag is this man using?", "choices": ["mail", "purse", "messenger", "tote"], "correct_choice_idx": 2, "direct_answers": ["messenger", "messenger bag", "messenger", "messenger", "messenger", "messenger bag", "messenger bag", "messenger bag", "messenger", "messenger"], "difficult_direct_answer": false, "rationales": ["The over-the-shoulder bag is called a messenger bag.", "The man is using a messenger bag.", "A messenger bag is distinguished by the strap going across the body."], "image": "val2014/COCO_val2014_000000075595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383760, "question_id": "PoPN8rJXKHsfbeJHp7ztdC", "question": "What is the surfboard stuck in the sand being used for?", "choices": ["message sign", "advertisement", "buoy", "sun dial"], "correct_choice_idx": 0, "direct_answers": ["shade", "sign", "landmark", "sign", "picture", "board", "message sign", "surfing", "shade", "sun dial"], "difficult_direct_answer": false, "rationales": ["It is being used as a billboard.", "The surfboard has a message.", "The board is being used to relay information to the beachgoers."], "image": "train2014/COCO_train2014_000000383760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91378, "question_id": "PoPe325ba2taHCvjxZK7Jz", "question": "The man in the white shirt is using what to touch the frisbee?", "choices": ["thumb", "wrist", "fingernails", "fingertips"], "correct_choice_idx": 3, "direct_answers": ["hand", "fingertip", "fingertips", "fingertips", "finger", "fingers", "finger", "finger", "finger", "index finger"], "difficult_direct_answer": false, "rationales": ["The man is jumping with his fingers outstretched.", "The edges of his fingers are making contact with the frisbee. they can grab the frisbee to catch it from mid air.", "He uses his fingers to hold the disc."], "image": "train2014/COCO_train2014_000000091378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216325, "question_id": "PoaEHhvX6t8WmQ3asNekC4", "question": "What type of surf is the man on the far left holding?", "choices": ["funboard", "fish", "longboard", "short board"], "correct_choice_idx": 3, "direct_answers": ["wave", "surfboard", "elongated", "board", "short board", "board", "short board", "wake board", "shortboard", "surfboard"], "difficult_direct_answer": false, "rationales": ["The board is a short one since it's smaller than normal surfboards.", "You can tell by the length as to what type of surfboard he is holding.", "The man on the far left is holding onto a short board."], "image": "val2014/COCO_val2014_000000216325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184205, "question_id": "PowTvBu29S2rpxD26onW9Z", "question": "What is going on with the island?", "choices": ["nothing", "remodeling", "being moved", "being assembled"], "correct_choice_idx": 1, "direct_answers": ["renovation", "construction", "renovation", "under construction", "under construction", "renovation", "remodeling", "construction", "building", "remodel"], "difficult_direct_answer": false, "rationales": ["They appear to be building it.", "There is bare board showing instead of a finished product", "The place is being fixed up."], "image": "val2014/COCO_val2014_000000184205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423161, "question_id": "PozUKi2UbuLiBSgRNCG8f8", "question": "The men rowing in white shirts are most likely part of what group?", "choices": ["rowing course", "friends", "rowing team", "tourists"], "correct_choice_idx": 2, "direct_answers": ["school", "team", "crew team", "crew team", "crew team", "rowing team", "rowing team", "rowing team", "crew team", "team"], "difficult_direct_answer": false, "rationales": ["The people are competing with the other boat.", "They seems to be driving the smalls boat.", "People rowing in the same boat all wearing the same uniform would most likely be associated in the manner of answer a."], "image": "val2014/COCO_val2014_000000423161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402528, "question_id": "PpPkekadGkBjGrFSGHEgUi", "question": "Which one of these is a competitor of the company that make's the item in the jar?", "choices": ["mitsubishi", "gulden's", "chiquita", "apple"], "correct_choice_idx": 1, "direct_answers": ["heinz", "heinz", "heinz", "heinz", "heinz", "gulden's", "heinz", "grey poupon", "bread", "pizza"], "difficult_direct_answer": false, "rationales": ["Gulden's is another brand of mustard.", "The jar has a label that indicates that it contains french's mustard. the jar does not contain vehicles, electronic devices, or bananas.", "The company on the jar is clearly labeled and in the category of condiment companies. answer a is another company that produces condiments and would thus be a competitor."], "image": "val2014/COCO_val2014_000000402528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126537, "question_id": "PpbgmpUX6Tt5TwDFBExQjY", "question": "Which country headquarters this airline?", "choices": ["turkey", "italy", "india", "spain"], "correct_choice_idx": 0, "direct_answers": ["turkey", "germany", "america", "us", "turkey", "turkey", "turkey", "turkey", "turkey", "turkey"], "difficult_direct_answer": false, "rationales": ["The airplane has a sunexpress livery. this airline is headquartered in antalya.", "Sun express is an airline based out of turkey.", "It is headquartered in this country"], "image": "train2014/COCO_train2014_000000126537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278646, "question_id": "Ppntv7sd3JeUC5EBDeN8gR", "question": "What kind of meat is in the hot dog?", "choices": ["dog", "chicken", "beef", "bat"], "correct_choice_idx": 2, "direct_answers": ["swine", "beef", "pork", "processed", "beef", "beef", "organ", "pork", "pig", "beef"], "difficult_direct_answer": false, "rationales": ["Hot dogs are made of beef.", "Generally, hotdogs are comprised of beef.", "The coloring and firmness of these hotdog means its probably of higher quality. that would mean meat is probably beef."], "image": "train2014/COCO_train2014_000000278646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164135, "question_id": "PqKiGzDBsW2rscuHgsNiwb", "question": "Where is it safest to skateboard?", "choices": ["grass", "sidewalk", "steps", "road"], "correct_choice_idx": 1, "direct_answers": ["skateboard park", "skateboard park", "sidewalk", "skateboard park", "sidewalk", "sidewalk", "sidewalk", "sidewalk", "sidewalk", "sidewalk"], "difficult_direct_answer": false, "rationales": ["On the cement it is safer because there are not cars driving on it.", "There are cars in the road", "The sidewalk is the safest."], "image": "val2014/COCO_val2014_000000164135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271972, "question_id": "PqQK2XqxbVwiN3YMUrAJ2K", "question": "Which sport requires a greater number of people to play than those that are pictured?", "choices": ["water polo", "ping pong", "tennis", "badminton"], "correct_choice_idx": 0, "direct_answers": ["football", "football", "football", "football", "baseball", "football", "soccer", "baseball", "water polo", "football"], "difficult_direct_answer": false, "rationales": ["The game of water polo requires a greater number of players.", "Water polo requires a whole team.", "Water polo requires more."], "image": "train2014/COCO_train2014_000000271972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462477, "question_id": "PqgNKufhjYhm3wnXoW3HS3", "question": "What do these people do here?", "choices": ["sing", "watch youtube", "watch tv", "cook"], "correct_choice_idx": 3, "direct_answers": ["cook", "make food", "prepare food", "preparing food", "cook", "prepare food", "cook", "prepare food", "cook", "cook"], "difficult_direct_answer": false, "rationales": ["The people are visibly preparing food and putting them in vessels that would be used for the purposes of answer a.", "The people are cooking chicken.", "These men are making food."], "image": "train2014/COCO_train2014_000000462477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240648, "question_id": "Pr4HUsJLr4oGTuudQZ3JJ7", "question": "What may you not do on the curb near the Stop sign?", "choices": ["walk", "sing", "park", "talk"], "correct_choice_idx": 2, "direct_answers": ["park", "walk across", "park", "turn", "park", "drive car", "park", "park", "stand", "park"], "difficult_direct_answer": false, "rationales": ["The curb near the stop sign is painted red. people could talk, walk, or sing here.", "The sign states not to.", "You aren't able to stop and park."], "image": "train2014/COCO_train2014_000000240648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143948, "question_id": "PrJfnqhhzJU9VfKQESgf8T", "question": "Where are these people located?", "choices": ["library", "hospital", "school", "winery"], "correct_choice_idx": 3, "direct_answers": ["winery", "bar", "winery", "bar", "bar", "bar", "winery", "bar", "bar", "bar"], "difficult_direct_answer": false, "rationales": ["People are crowded together in a place with signs about wine above. people are holding glasses.", "The people are in a winery.", "The woman is drinking."], "image": "val2014/COCO_val2014_000000143948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75668, "question_id": "PrVfjJUMAWMe7RUbf3VHLD", "question": "What kind of music comes from this country?", "choices": ["eurodance", "latin", "kpop", "jpop t"], "correct_choice_idx": 2, "direct_answers": ["kpop", "chinese", "k-pop", "korean pop", "korean pop", "korean pop", "j-pop", "k-pop", "bts", "kpop"], "difficult_direct_answer": false, "rationales": ["Street signs with asian writing can be seen in a highly populated area.", "This country produce pop asian music.", "That is what that country has in terms of music."], "image": "train2014/COCO_train2014_000000075668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521879, "question_id": "ProNmhHb8TQKNV6XWRq5qW", "question": "What makes sure the vessel pulled by the horse goes straight?", "choices": ["tracks", "rough estimation", "driver", "nothing"], "correct_choice_idx": 0, "direct_answers": ["tracks", "tracks", "driver", "tracks", "ground tracks", "tracks", "reins", "rails", "driver", "harness"], "difficult_direct_answer": false, "rationales": ["The tracks hold the carriage so the horse doesn't go off course.", "The vessel leaves tracks.", "The vessel pulled by the horse is going straight over the tracks."], "image": "val2014/COCO_val2014_000000521879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343291, "question_id": "PrtVp44NcR5HUjUp4k3iEu", "question": "Why are two of them on that little vehicle?", "choices": ["lacking vehicles", "are fighting", "are hiding", "economical transportation"], "correct_choice_idx": 3, "direct_answers": ["resting", "close companions", "sharing", "travelling", "sharing ride", "economical transportation", "riding", "companionship", "riding together", "resting"], "difficult_direct_answer": true, "rationales": ["The women are sharing the vehicle which is more economical.", "The two of them are sharing a ride.", "The two people are on the little vehicle to save time and money by sharing a scooter."], "image": "train2014/COCO_train2014_000000343291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268293, "question_id": "PrzrJWoTk4Yc4YQuywbzYo", "question": "What technique does this player utilize here?", "choices": ["underhanded", "avoidance", "overhanded", "back handed"], "correct_choice_idx": 3, "direct_answers": ["backhand", "backhand", "underhand", "back hand", "swing", "backhand", "below", "back hand", "back handed", "backhand"], "difficult_direct_answer": false, "rationales": ["The players wrist is backwards and it is a typical motion of a band handed swing.", "He has the racket turned and will swing backwards", "He's about to hit it with a back swing"], "image": "val2014/COCO_val2014_000000268293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176621, "question_id": "Ps4Y5Hdu6a3DppVhkDuPwj", "question": "What type of show is being presented here?", "choices": ["sporting event", "drama", "musical", "horror"], "correct_choice_idx": 2, "direct_answers": ["play", "musical", "play", "musical", "musical", "musical", "musical", "musical", "musical", "movie"], "difficult_direct_answer": false, "rationales": ["The name of the show is shown on the building. the building looks to be a theatre.", "The show is a musical.", "The play is known for its songs."], "image": "train2014/COCO_train2014_000000176621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521101, "question_id": "PsWBQZ3fwZeouHSgvZCPRW", "question": "What flowers seeds are visible here?", "choices": ["daisy", "sesame", "poppies", "sunflowers"], "correct_choice_idx": 2, "direct_answers": ["poppy", "poppy", "poppy seeds", "poppy", "not flower", "poppy", "on bun", "poppy", "sesame seeds", "poppies"], "difficult_direct_answer": false, "rationales": ["The seeds are dark so they are not sesame. daisy seeds and sunflower seeds are not put in bread.", "Poppy seeds are on the bun.", "The seeds are black"], "image": "train2014/COCO_train2014_000000521101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563179, "question_id": "PsY97MHRaFb32UtWH7Ekw3", "question": "Under what circumstance might children wear the red item the man is wearing?", "choices": ["swimming", "school", "military", "recreation"], "correct_choice_idx": 1, "direct_answers": ["important occasion", "halloween", "halloween", "shirt", "school", "formal event", "formal party", "wedding", "school uniform", "formal"], "difficult_direct_answer": true, "rationales": ["It is a necktie and sometimes ties are used as a part of a uniform.", "Children may wear ties in a private learning or boarding school setting.", "The man is wearing a tie. there are many scenarios in which a tie might be an appropriate piece of attire, with school being one of them, particularly the formal kind."], "image": "train2014/COCO_train2014_000000563179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307026, "question_id": "PsbMxgyq9mvgbnNQsPmdgd", "question": "What action is the man taking?", "choices": ["throwing", "dunking", "swinging", "batting"], "correct_choice_idx": 2, "direct_answers": ["overhand", "play", "swinging", "swinging", "serving", "serving ball", "forehand serve", "hitting ball", "playing tennis", "serving"], "difficult_direct_answer": false, "rationales": ["The man is hitting the ball.", "A man is hitting the ball out of the air by swatting at it.", "The man is taking a swing with a tennis racket."], "image": "val2014/COCO_val2014_000000307026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70754, "question_id": "Pseu6vEeLKkVSQXXA4p972", "question": "Where are the parcels on the cart being sent to?", "choices": ["north pole", "brazil", "united states", "russia"], "correct_choice_idx": 0, "direct_answers": ["north pole", "north pole", "north pole", "north pole", "north pole", "north pole", "north pole", "north pole", "north pole", "north pole"], "difficult_direct_answer": false, "rationales": ["The parcel is the north pole.", "The tag has the cages going to the north pole.", "There is a tag addressed to the north pole on the packages. a tag designates where something is being sent to."], "image": "train2014/COCO_train2014_000000070754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226577, "question_id": "PspJqCVyN4wmSQJN9w7Ve4", "question": "Why do they have black suits on?", "choices": ["disguise", "stay cool", "are twins", "stay warm"], "correct_choice_idx": 3, "direct_answers": ["surfing", "stay warm", "to surfboard", "keep warm", "cold ocean", "surfing", "going surfing", "surfing", "surfers", "warmth"], "difficult_direct_answer": false, "rationales": ["The people are going to surf in the cold water.", "Two surfers are approaching water on an overcast day.", "The surfers they have on are wetsuits. staying in cold water for a long time while surfing is cold and dangerous which is why surfers where wetsuits."], "image": "train2014/COCO_train2014_000000226577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138956, "question_id": "PtCLf2QkM3xRMUj3UYsTbd", "question": "Why are the people walking through the outdoor area?", "choices": ["to race", "to compete", "to escape", "to shop"], "correct_choice_idx": 3, "direct_answers": ["shopping", "buy goods", "buy goods", "shopping", "to shop", "shopping", "shopping", "shopping", "shopping", "shopping"], "difficult_direct_answer": false, "rationales": ["The two people are walking through an outdoor market to shop.", "People are here walking through the outdoor area to shop.", "The setting appears to be a marketplace based on the storefronts. people would be walking through an outdoor marketplace if they were intending to answer a."], "image": "train2014/COCO_train2014_000000138956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26147, "question_id": "PtWZBS4KHYRMqdEGbcEmz9", "question": "How many years ago was this photo taken?", "choices": ["two", "nine", "five", "eight"], "correct_choice_idx": 3, "direct_answers": ["eighteen years", "eight", "eight", "eight years", "eight", "five", "recently", "eight", "two", "eight"], "difficult_direct_answer": false, "rationales": ["The photo was taken almost eight year ago since it was done in 2015.", "2013 was 8 years ago.", "The photo is old."], "image": "train2014/COCO_train2014_000000026147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33866, "question_id": "PteZYxNeJvHceXEqTbDsN6", "question": "What are the status of the bear dolls?", "choices": ["used", "damaged", "dirty", "brand new"], "correct_choice_idx": 3, "direct_answers": ["brand new", "new", "brand new", "on roof", "new", "newly bought", "happy", "new", "roof positioning", "new"], "difficult_direct_answer": false, "rationales": ["They still have the tags on them.", "The bears have tags on them.", "The bears have tags on them. they are clean and in good shape."], "image": "train2014/COCO_train2014_000000033866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20389, "question_id": "Ptmxc6fccV3sQKSfhEx8E7", "question": "Why are the plants outside the court?", "choices": ["players", "fence", "sunlight", "gardener"], "correct_choice_idx": 1, "direct_answers": ["provide beauty", "landscaping", "to grow", "ivy", "convenient", "privacy", "trees", "fence", "protection", "fence"], "difficult_direct_answer": true, "rationales": ["To make the area green.", "It's barely visible but in the background.", "The plants are growing on the fence."], "image": "train2014/COCO_train2014_000000020389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515702, "question_id": "PtoBenFgTNVjGgFttoyPPT", "question": "What type of sport is this?", "choices": ["combat", "individual", "team", "partner"], "correct_choice_idx": 2, "direct_answers": ["baseball", "baseball", "baseball", "baseball", "baseball", "team", "baseball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["The other options obviously don't apply to this sport.", "They are playing baseball. nine players are needed for this sport.", "Baseball is a team sport."], "image": "train2014/COCO_train2014_000000515702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10145, "question_id": "Pts3kqxBxmuTnv4p3DaXWv", "question": "What large rectangular-shaped object is nearest to the TV?", "choices": ["cabinet", "console", "speaker", "stereo"], "correct_choice_idx": 2, "direct_answers": ["table", "speakers", "cabinet", "speaker", "speaker", "table", "sofa", "rug", "rug", "speakers"], "difficult_direct_answer": false, "rationales": ["The object is a speaker.", "Speakers are close to the tv.", "There are some large speakers next to the tv."], "image": "train2014/COCO_train2014_000000010145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88449, "question_id": "Pu2BnpmwR2FMJdt25f4EJi", "question": "At which position are these two trains when shown?", "choices": ["parked", "racing fast", "upside down", "slowly moving"], "correct_choice_idx": 0, "direct_answers": ["parallel", "parallel", "parked", "parked", "sides", "parked", "parked", "left", "resting position", "parked"], "difficult_direct_answer": false, "rationales": ["Neither train appears to be moving.", "The compartments are not occupied by drivers. the trains are not moving.", "The position is parked."], "image": "train2014/COCO_train2014_000000088449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436199, "question_id": "PuRxNdXH2cnFmbjqLKNAB7", "question": "What type of building is this?", "choices": ["apartment", "filling station", "house", "hospital"], "correct_choice_idx": 0, "direct_answers": ["brick", "house", "apartment", "apartment", "house", "house", "home", "apartment", "house", "brick"], "difficult_direct_answer": false, "rationales": ["This building is a type of apartment area.", "It has a smaller area so more then likely a smaller place of living.", "The building is an apartment."], "image": "train2014/COCO_train2014_000000436199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218855, "question_id": "PubXJUG3i7EtwXofh6DtsA", "question": "What are they doing with the white devices?", "choices": ["are confused", "is phone", "video game", "is punishment"], "correct_choice_idx": 2, "direct_answers": ["play videogames", "playing game", "video game", "playing wii", "playing nintendo", "gaming", "playing game", "playing game", "playing games", "playing"], "difficult_direct_answer": false, "rationales": ["They're playing.", "They are using the nintendo wii.", "Those remotes are used for gaming."], "image": "val2014/COCO_val2014_000000218855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49458, "question_id": "PuhTx3NWpZqBCmSzbfajtr", "question": "What sound might be made if the men emptied their hands quickly?", "choices": ["woof", "baa", "meow", "roar"], "correct_choice_idx": 1, "direct_answers": ["baa", "squeal", "thud", "bah", "gasp", "bah", "lamp chops", "been", "baa", "baa"], "difficult_direct_answer": false, "rationales": ["The sheep would make that sound.", "Sheep generally baa loudly.", "They are holding sheep, not dogs, cats, or lions."], "image": "train2014/COCO_train2014_000000049458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427958, "question_id": "PvNAonzqHovg9pVxn9Y45Z", "question": "What is she controlling with the remote?", "choices": ["bed", "robot", "car", "game"], "correct_choice_idx": 3, "direct_answers": ["television", "game system", "channel", "game console", "video game", "nintendo wii", "wii screen", "wii game", "game", "wii"], "difficult_direct_answer": true, "rationales": ["A woman is holding a videogame controller.", "The remote is to a video game.", "She is holding a wii-mote which is used to play video games."], "image": "train2014/COCO_train2014_000000427958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86559, "question_id": "PvR4F5VAg8YVEHEjUoU3uh", "question": "These animals are known for their what?", "choices": ["wool", "horns", "wings", "height"], "correct_choice_idx": 3, "direct_answers": ["height", "height", "long necks", "long necks", "necks", "necks", "height", "necks", "graph", "necks"], "difficult_direct_answer": false, "rationales": ["The animals are really tall.", "Giraffes are tall.", "The animals are tall."], "image": "val2014/COCO_val2014_000000086559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393743, "question_id": "PvfbSZPxAvmASLfcQZ8yy6", "question": "What kind of poles stand erect in the background?", "choices": ["telephone", "wind", "electric", "solar"], "correct_choice_idx": 2, "direct_answers": ["electric", "electric", "hydro", "electricity poles", "goal poles", "light poles", "goal", "light", "white poles", "goal posts"], "difficult_direct_answer": true, "rationales": ["The crossed poles hold up electrical wires.", "The poles are for electricity.", "They are used to hold and support electric cables that are running with electric currents."], "image": "val2014/COCO_val2014_000000393743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77873, "question_id": "PvhJ46dZYm7397J9fYiBFx", "question": "What are the children eating?", "choices": ["hot dog", "pizza", "chicken", "hamburger"], "correct_choice_idx": 0, "direct_answers": ["hotdog", "hot dogs", "hot dogs", "hot dogs", "hotdogs", "hot dogs", "hot dogs", "hot dog", "hot dogs", "hotdogs"], "difficult_direct_answer": false, "rationales": ["The kids eat hot dogs.", "The kids are enjoying hot dogs.", "The kids are eating hot dogs since they're long sausages in buns."], "image": "train2014/COCO_train2014_000000077873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295537, "question_id": "PvkBKmEF5awwQpUEqzG5YP", "question": "Where is the man riding through?", "choices": ["yard", "forest", "desert", "parking lot"], "correct_choice_idx": 2, "direct_answers": ["desert", "desert", "desert", "desert", "desert", "desert", "desert", "desert", "desert", "desert"], "difficult_direct_answer": false, "rationales": ["The man is by the desert.", "There is sand everywhere around the man.", "The area is covered in sand."], "image": "train2014/COCO_train2014_000000295537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282115, "question_id": "PvzCBMipPke4YncRN7dPXm", "question": "What is this bus doing?", "choices": ["turning right", "slowing down", "speeding", "backing up"], "correct_choice_idx": 1, "direct_answers": ["driving", "driving", "slowing down", "driving", "driving", "stopping", "stopping", "stopping", "driving", "driving"], "difficult_direct_answer": false, "rationales": ["The bus is slowing down.", "The bus is going slower because there are other vehicles in front.", "This bus has approached the intersection and is slowing down to stop due to cross traffic."], "image": "train2014/COCO_train2014_000000282115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228644, "question_id": "Pw8KMQjGCCpx3NZPpAj5BX", "question": "What activity is the standing person involved in?", "choices": ["gaming", "cooking", "tennis", "striptease"], "correct_choice_idx": 0, "direct_answers": ["wii game", "video game", "video game", "video games", "video games", "gaming", "charging", "dancing", "wii", "virtual boxing"], "difficult_direct_answer": false, "rationales": ["As indicated by the wii controllers that he's using.", "The standing person is using a nintendo wii remote.", "One man is holding a video game remote."], "image": "val2014/COCO_val2014_000000228644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380959, "question_id": "PwGCFTyQeMZ7vzGFmvtCvy", "question": "What is the profession of the person who would drive this vehicle?", "choices": ["fireman", "lifeguard", "officer", "shopper"], "correct_choice_idx": 0, "direct_answers": ["fire fighter", "fire fighter", "firefighter", "firefighter", "fireman", "firefighter", "firefighter", "fireman", "firefighter", "fireman"], "difficult_direct_answer": false, "rationales": ["The fire truck is driven by fire fighters who are emergency first responders.", "A fireman would drive the firetruck.", "This is a firetruck so it's most likely to have a fireman in control of it."], "image": "train2014/COCO_train2014_000000380959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185820, "question_id": "PwP7Q3gTBfh3waLotUSUM3", "question": "What is to the left of the sink?", "choices": ["door", "baby", "dog", "cat"], "correct_choice_idx": 0, "direct_answers": ["hand soap", "soap", "counter", "door", "soap", "counter", "paper towel", "soap", "soap", "kitchen drawers"], "difficult_direct_answer": false, "rationales": ["There is a door with a window to the left of the kitchen sink.", "There is handle on the wood", "The door is to the left of the sink since there's a handle on it."], "image": "train2014/COCO_train2014_000000185820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546052, "question_id": "PwSzYqAkRSWS5dQ5TaJU4V", "question": "The figurines on the shelf can be used to do what?", "choices": ["crack nuts", "puree nuts", "peel nuts", "store nuts"], "correct_choice_idx": 0, "direct_answers": ["entertain children", "nutcracker", "nut crack", "collectibles", "crack nuts", "crack nuts", "crack nuts", "child toys", "play with", "crack nuts"], "difficult_direct_answer": false, "rationales": ["The figurines crack nuts.", "There are nutcrackers on the very top shelf.", "The figurines are called nutcrackers."], "image": "val2014/COCO_val2014_000000546052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343185, "question_id": "PwVs5Ph8Q37KazLtnt3HGG", "question": "What are the Energizers used for?", "choices": ["painting", "eating", "medicine", "power"], "correct_choice_idx": 3, "direct_answers": ["energy", "batteries", "power devices", "energizing", "power", "batteries", "electronics", "devices", "batteries", "battery items"], "difficult_direct_answer": false, "rationales": ["The energizers are batteries.", "These are batteries", "The batteries are used for to power different things."], "image": "train2014/COCO_train2014_000000343185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454457, "question_id": "Pwcu6jpxbjXdJaEbYbTApB", "question": "This person does the same sport as which athlete?", "choices": ["laird hamilton", "t.j. lavin", "lionel messi", "tony hawk"], "correct_choice_idx": 3, "direct_answers": ["tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk"], "difficult_direct_answer": false, "rationales": ["The person is like hawk.", "Tony hawk rides skateboards.", "The man seems to be moving down the stairs just like tony."], "image": "val2014/COCO_val2014_000000454457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165803, "question_id": "Pwf9d8kuaT8dXjiih4Hcaj", "question": "What utensil are they using to eat the cake?", "choices": ["spatula", "forks", "whisk", "knives"], "correct_choice_idx": 1, "direct_answers": ["forks", "fork", "knife", "fork", "forks", "knife", "fork", "knife", "fork", "forks"], "difficult_direct_answer": false, "rationales": ["The items have pointed tips.", "Cake is being served and forks are the customary utensil to eat cake with.", "The plates have forks."], "image": "train2014/COCO_train2014_000000165803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510806, "question_id": "PwjLRF8g9gqqmLz8kYriVh", "question": "What locale does the policeman serve?", "choices": ["bloomington", "lafayette", "shreveport", "monroe"], "correct_choice_idx": 1, "direct_answers": ["lafayette", "lafayette", "local safety", "road safety", "city", "lafayette", "city", "patrolman", "lafayette", "city"], "difficult_direct_answer": false, "rationales": ["The logo on the motorcycle says lafayette.", "The policeman has a motorcycle that says lafayette police.", "The decal on the policeman's motorcycle names this city."], "image": "train2014/COCO_train2014_000000510806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351451, "question_id": "PwvdgYA3n2fkSg3ErZgenc", "question": "Who or what is closest to the ball?", "choices": ["man", "woman", "dog", "cat"], "correct_choice_idx": 0, "direct_answers": ["man", "man", "man", "man", "man", "man", "man", "grass", "man", "grass"], "difficult_direct_answer": false, "rationales": ["The person on the left is closest to the ball. he is apparently male.", "There are no non-human animals near the ball. the woman is farther away.", "Aside from the two humans, no animals are present. the woman is farther away from the ball than the other person."], "image": "train2014/COCO_train2014_000000351451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556278, "question_id": "PxEitvAT87rjKmgNW6BUt6", "question": "How is the street staying illuminated?", "choices": ["fire", "sun", "street lights", "flashlights"], "correct_choice_idx": 2, "direct_answers": ["light", "street lights", "streetlights", "lights", "street lights", "lights", "lights", "streetlights", "lights", "street lights"], "difficult_direct_answer": false, "rationales": ["The streets are illuminated by street lights.", "Lights around the city allow for illuminated roadways while the sun is down and darkness has enveloped the streets.", "It is night time, so the sun is not out. there are no flashlights or fires."], "image": "val2014/COCO_val2014_000000556278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246183, "question_id": "PxajjEjxfAbCnTgjfTtdhi", "question": "What electronics company made the blue balloons?", "choices": ["samsung", "apple", "sony", "microsoft"], "correct_choice_idx": 0, "direct_answers": ["samsung", "samsung", "samsung", "samsung", "samsung", "samsung", "samsung", "samsung", "samsung", "samsung"], "difficult_direct_answer": false, "rationales": ["The balloons are blue like that company.", "Samsung makes the blue balloons.", "Samsung's logo is on the balloons."], "image": "val2014/COCO_val2014_000000246183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233146, "question_id": "Pxc8NJMCBomgHcWzefJk8Z", "question": "What can be done in this room?", "choices": ["bathing", "exercising", "washing dishes", "sleeping"], "correct_choice_idx": 2, "direct_answers": ["cooking", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking", "washing dishes"], "difficult_direct_answer": false, "rationales": ["You can clean your dishes in the sink.", "The kitchen can be used for washing dishes.", "You can wash things in the kitchen sink."], "image": "train2014/COCO_train2014_000000233146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472143, "question_id": "PxjybmjUdyE73c9koBLjUg", "question": "What are the couple who sit facing camera doing?", "choices": ["gaming", "texting", "singing", "eating"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["They have boxes in their laps and the boxes look like the boxes that would normally hold food.", "The couple on the bench are holding boxes of take-out food.", "They have boxes with food in it that they are eating."], "image": "val2014/COCO_val2014_000000472143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449676, "question_id": "PyHtBVLYgft2fhyi99mdmw", "question": "From which wind does the air blow here?", "choices": ["nowhere", "from inland", "from seaward", "upward"], "correct_choice_idx": 2, "direct_answers": ["from seaward", "sky", "right", "ocean", "coastal", "from left", "west", "sea", "east", "east"], "difficult_direct_answer": true, "rationales": ["The wind is blowing from the wind.", "The wind is from the sea.", "The wind comes in from the sea."], "image": "val2014/COCO_val2014_000000449676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384160, "question_id": "PyUpw7BB8kBu9MdtRVEtwz", "question": "What was the child playing with before it fell asleep?", "choices": ["jenga", "dolls", "toy blocks", "basketball"], "correct_choice_idx": 1, "direct_answers": ["dolls", "dolls", "dolls", "dolls", "cartoons", "dolls", "dolls", "dolls", "cartoons", "dolls"], "difficult_direct_answer": false, "rationales": ["A child is sleeping surrounded by dolls. children like to play with dolls.", "The kid used dolls.", "The child is near baby dolls."], "image": "train2014/COCO_train2014_000000384160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494904, "question_id": "PyX5TWNszN3nNMhvoQtxDW", "question": "Which base ball sport equipment is made up with maple wood?", "choices": ["bat", "ball", "net", "cap"], "correct_choice_idx": 0, "direct_answers": ["baseball bat", "baseball bat", "bat", "bat", "bat", "bat", "bat", "bat", "bat", "bat"], "difficult_direct_answer": false, "rationales": ["The man is using a stick to hot the ball.", "The baseball needs a bat.", "Baseball bats are often made from maplewood."], "image": "train2014/COCO_train2014_000000494904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51101, "question_id": "PyYqmMozkPnYc2HWm7FBet", "question": "What does the orange item next to the chain look like?", "choices": ["cat", "frisbee", "bunny", "dog"], "correct_choice_idx": 1, "direct_answers": ["disc", "frisbee", "frisbee", "plate", "frisbee", "flying disc", "frisbee", "frisbee", "coin", "frisbee"], "difficult_direct_answer": false, "rationales": ["The orange item is in a disc shape like a frisbee.", "They are plastic flat disks", "There is a plastic disc in the basket."], "image": "train2014/COCO_train2014_000000051101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456972, "question_id": "PydPBQLKBEuBhUpMNrnZiK", "question": "Why is the person holding pineapple in their hand?", "choices": ["to eat", "to moisturize", "to feed", "to lick"], "correct_choice_idx": 2, "direct_answers": ["feed", "feed bird", "feed bird", "feed bird", "feeding", "for bird", "bird feed", "treat", "feed bird", "to feed"], "difficult_direct_answer": false, "rationales": ["The bird is eating the pineapple out of the person's hand.", "The bird is eating the pineapple.", "A red bird is eating out of a man's hands. the bird is feasting on this small food."], "image": "train2014/COCO_train2014_000000456972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111406, "question_id": "PyvYZpyPcj5kzTdBbVpTRb", "question": "What city is this?", "choices": ["portland", "chicago", "ny", "boston"], "correct_choice_idx": 2, "direct_answers": ["new york", "howard beach", "nyc", "howard beach", "nyc", "new york", "new york", "nomad", "new york", "ny"], "difficult_direct_answer": false, "rationales": ["Some of the cities are from that city.", "Howard beach and jfk are locations in new york.", "On the bus it says jfk."], "image": "train2014/COCO_train2014_000000111406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454399, "question_id": "Pz4HgHGNwLdkFEjVocagw8", "question": "Why is the player wearing gloves?", "choices": ["warmth", "grip", "fashion", "health"], "correct_choice_idx": 1, "direct_answers": ["sustain grip", "grip bat", "safety grip", "grip", "grip bat", "protection", "extra grip", "increase grip", "better grip", "grip"], "difficult_direct_answer": false, "rationales": ["The gloves are athletic and intended for baseball games. a baseball is slippery, so gloves intended for baseball have this feature.", "The player is gripping the bat.", "The man is trying to get a grip."], "image": "train2014/COCO_train2014_000000454399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336058, "question_id": "PzSupnE3Tt8tJuQ5p56Fgt", "question": "Why does the woman have a large belly?", "choices": ["bloat", "gas", "pregnancy", "overweight"], "correct_choice_idx": 2, "direct_answers": ["pregnant", "pregnant", "pregnant", "pregnant", "pregnant", "pregnancy", "pregnant", "pregnant", "pregnant", "pregnant"], "difficult_direct_answer": false, "rationales": ["Her stomach shape and size in relation to the rest of her body only indicates this condition.", "She is carrying a baby", "She is carrying a baby"], "image": "train2014/COCO_train2014_000000336058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417857, "question_id": "PzXAoiLGXnxXidSvG9Je9Z", "question": "What base does the catcher kneel near?", "choices": ["second", "third", "first", "home"], "correct_choice_idx": 3, "direct_answers": ["home", "home", "home", "home plate", "home", "home plate", "home base", "home base", "home plate", "home"], "difficult_direct_answer": false, "rationales": ["The catcher knees near home plate where the batter is swinging at the ball.", "The catcher is kneeling near the batter and umpire at the plate.", "The catcher is at home plate."], "image": "val2014/COCO_val2014_000000417857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433428, "question_id": "Q2REhMHiuVhNnt9c4DNnYj", "question": "What type of vehicles are present in the foremost foreground?", "choices": ["cars", "trucks", "bicycle", "motorcycle"], "correct_choice_idx": 3, "direct_answers": ["scooter", "motorcycles", "motorcycle", "bike", "motorcycles", "motorcycle", "motorcycle", "motorcycles", "cars", "motorcycles"], "difficult_direct_answer": false, "rationales": ["There are motorcycles all present in the foreground.", "The vehicles in the foremost foreground have two, not four, wheels. they have engines.", "The vehicles that are closer to the ground in the front are two wheeler machines which are known as motorbikes."], "image": "train2014/COCO_train2014_000000433428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479613, "question_id": "Q2VVxQ3rZc77hjMnC4EH46", "question": "How does the person in the image tell time?", "choices": ["wrist watch", "wall clock", "phone", "microwave"], "correct_choice_idx": 0, "direct_answers": ["watch", "watch", "wrist watch", "watch", "clock", "watch", "wrist watch", "watch", "wrist watch", "watch"], "difficult_direct_answer": false, "rationales": ["The man is wearing a wristwatch.", "This is evident by the object on his wrist. there may be a c or b nearby, but they're not shown.", "The person could check his watch in order to get a sense for time."], "image": "val2014/COCO_val2014_000000479613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181084, "question_id": "Q2gYn8BLuTqPKxP8iWj6M9", "question": "What is the police monitoring?", "choices": ["accident", "balloon sale", "riot", "parade"], "correct_choice_idx": 3, "direct_answers": ["crowd", "crowds", "parade", "parade", "parade", "crowds", "crowd", "parade", "crowd", "people activities"], "difficult_direct_answer": false, "rationales": ["There are people holding balloons which would indicate a parade.", "It looks like there will be a parade on that street.", "With the people and the balloons this is likely a parade that is going on."], "image": "train2014/COCO_train2014_000000181084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204683, "question_id": "Q2xS78MGmNYXSeEF99tpmM", "question": "What types of muffins are these?", "choices": ["raisin", "poppy", "apple", "blueberry"], "correct_choice_idx": 1, "direct_answers": ["poppyseed", "poppy seed", "poppy", "no image", "poppyseed", "blueberry", "poppy seed", "poppy seed", "poppy seed", "poppy seed"], "difficult_direct_answer": false, "rationales": ["The muffins have seeds, not fruit, on them.", "The muffins have poppy seeds in them.", "The muffins contain poppy seeds."], "image": "train2014/COCO_train2014_000000204683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483817, "question_id": "Q2xYeDrCAvmhBGiJL59qmJ", "question": "Upon what do the boards seen here ride?", "choices": ["road", "wave", "air", "beach"], "correct_choice_idx": 1, "direct_answers": ["surfers", "waves", "waves", "waves", "waves", "waves", "surfers", "wave", "waves", "wave"], "difficult_direct_answer": false, "rationales": ["A surfboard in calm water is nothing but a floaty; in order to actually ride one, one must have movement in the water.", "The ocean has a and it's obvious from the image.", "The water will push them along the shore"], "image": "train2014/COCO_train2014_000000483817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106909, "question_id": "Q35jEL9ZYbqejxtUWv22fq", "question": "What are the colored plastic objects for?", "choices": ["sitting", "hold food", "for sale", "stacking"], "correct_choice_idx": 0, "direct_answers": ["tables chairs", "sitting", "sitting", "chairs", "sitting", "sitting", "tables", "seating", "child seating", "sitting"], "difficult_direct_answer": false, "rationales": ["This is an outdoor dining area that is informal and has plastic tables with chairs.", "Some of them are also for d.", "Some of the colored plastic objects are in use and they have features consistent with answer a."], "image": "val2014/COCO_val2014_000000106909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514650, "question_id": "Q3UY43Qs4GNdBsZbwrEJf3", "question": "Why have the skaters covered their heads?", "choices": ["warmth", "uniform", "fashion", "protection"], "correct_choice_idx": 3, "direct_answers": ["for safety", "protection", "helmets", "helmets", "protection", "helmet", "skin protection", "protection", "safety", "protection"], "difficult_direct_answer": false, "rationales": ["Skateboarding presents risk to the head", "The skaters need safety.", "The skaters are wearing helmets."], "image": "train2014/COCO_train2014_000000514650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465468, "question_id": "Q3iBHSxZLEEBvT6JamCy6r", "question": "What might stop you from using this image in a commercial capacity?", "choices": ["disturbing nature", "sexual nature", "watermark", "offensive nature"], "correct_choice_idx": 2, "direct_answers": ["text", "superimposed words", "watermark", "text", "copyright", "copyright", "reproduction rights", "danger", "watermark", "people"], "difficult_direct_answer": false, "rationales": ["It would be unprofessional to display this image in a publication with the watermark on it because viewers would know you did not have permission to use the image.", "There is writing over the image that interrupts the clarity of the picture and would present issue if someone would want to reuse it and be unable to remove the writing. this writing is known as answer a and is meant to prevent unapproved reproduction.", "It is copyrighted according to the words across it"], "image": "train2014/COCO_train2014_000000465468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106331, "question_id": "Q3jmSsBh4AVnFb8JLedkbD", "question": "Why is the man with the bat upset?", "choices": ["struck out", "he's not", "fined", "threatened"], "correct_choice_idx": 0, "direct_answers": ["he's out", "lost game", "struck out", "strikeout", "missed", "missed ball", "baseball", "bad call", "struck out", "strike out"], "difficult_direct_answer": true, "rationales": ["The man with the bat is upset and is giving up, because he cannot bat anymore.", "The umpire is making the strike sign with his arm.", "The man with the bat is upset because he didn't hit the ball and is out."], "image": "val2014/COCO_val2014_000000106331.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312041, "question_id": "Q3zgqPujtwGNbMwZhUznUA", "question": "What purpose does this item serve to do?", "choices": ["sharpener", "paperweight", "flynt", "candle"], "correct_choice_idx": 1, "direct_answers": ["decorate", "decoration", "hold papers", "decorate", "paperweight", "decorate", "amuse", "paperweight", "decoration", "transport"], "difficult_direct_answer": false, "rationales": ["The object on the desk is meant to be used as a paperweight.", "The metal bike figurine is a paperweight that is used to keep papers from flying off a desk.", "The item is heavy and is on a desk."], "image": "train2014/COCO_train2014_000000312041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82585, "question_id": "Q3zqDcW9c72FbP78Shh6r7", "question": "What type of geographical feature is located near this area?", "choices": ["desert", "ocean", "mountain", "mesa"], "correct_choice_idx": 1, "direct_answers": ["water", "ocean", "ocean", "ocean", "ocean", "ocean", "water", "water", "water", "ocean"], "difficult_direct_answer": false, "rationales": ["When we think of a beach, we almost automatically think of the water nearby.", "The area is an ocean beach.", "The type is the ocean."], "image": "train2014/COCO_train2014_000000082585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200001, "question_id": "Q44TztqTLDer87HPhHwVFv", "question": "What is the boy near?", "choices": ["car", "airplane", "railing", "box"], "correct_choice_idx": 2, "direct_answers": ["railing", "railing", "rail", "ramp", "rail", "rail", "ramp", "rail", "railing", "skate rail"], "difficult_direct_answer": false, "rationales": ["There is a metal pole on supports", "The boy is performing a trick with the railing.", "The skateboarder is near a rail and is jumping off."], "image": "train2014/COCO_train2014_000000200001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181453, "question_id": "Q4df2YgEomTHtxSCRbFXX5", "question": "Why is the bike broken into pieces?", "choices": ["to recycle", "to sell", "to paint", "to transport"], "correct_choice_idx": 3, "direct_answers": ["make fit", "storage", "to transport", "transport", "storage", "portable", "taken apart", "transport", "storage", "bicycle"], "difficult_direct_answer": false, "rationales": ["It is folded or. disassembled in order to fit into the vehicle.", "The bike is put into a shipping container.", "Bikes of this nature are designed to be broken down for convenience. the pieces this bike is designed and the way it is stored it looks intentional and being placed in another vehicle would likely be for answer a."], "image": "train2014/COCO_train2014_000000181453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574953, "question_id": "Q4jVDVUDVFezYabxerTF4x", "question": "Where would you normally find the orange and white thing in the foreground?", "choices": ["playground", "pub", "beach", "road"], "correct_choice_idx": 3, "direct_answers": ["roads", "construction site", "road", "construction zone", "road", "construction site", "yes", "construction", "parking lot", "street"], "difficult_direct_answer": false, "rationales": ["Traffic cones are on the road.", "These orange safety cones are normally found in construction areas of streets and highways.", "The object in the foreground is a traffic or warning cone. these are often placed in roads to advise people of hazards or direct them away from certain places."], "image": "train2014/COCO_train2014_000000574953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258272, "question_id": "Q52BfuLpVGaqnkKZDXs5WC", "question": "What religion is that statue associated with?", "choices": ["islam", "buddhism", "judaism", "christianity"], "correct_choice_idx": 1, "direct_answers": ["buddhism", "hinduism", "buddhism", "christianity", "christian", "asian", "christianity", "satanism", "catholicism", "buddhism"], "difficult_direct_answer": false, "rationales": ["A religious symbol is near a man on the phone.", "The statue is of buddha.", "Buddhists have statues like the one shown."], "image": "train2014/COCO_train2014_000000258272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41818, "question_id": "Q55tbizP5vAZSTseeseaEg", "question": "The drink in each of their glasses usually comes in a bottle that features what word?", "choices": ["homogenized", "concentrate", "caffeine", "proof"], "correct_choice_idx": 1, "direct_answers": ["juice", "orange", "orange", "orange juice", "orange", "concentrate", "quality", "orange", "wine", "orange"], "difficult_direct_answer": false, "rationales": ["Orange juice concentrate is sold in stores.", "The drink is concentrated.", "Orange juice bottles usually say concentrate on them."], "image": "train2014/COCO_train2014_000000041818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432704, "question_id": "Q57F8o7bvAiE96N7zngcY9", "question": "Who are the men wearing yellow?", "choices": ["crew", "doctors", "chefs", "students"], "correct_choice_idx": 0, "direct_answers": ["workers", "fishermen", "divers", "visibility", "diver", "rescue", "rescuers", "rain gear", "lifeguards", "crew"], "difficult_direct_answer": true, "rationales": ["The men wearing yellow are part of the boat crew.", "The yellow is a uniform and is used to visibility in the water when they are doing their job diving.", "They are the crew on the boat."], "image": "train2014/COCO_train2014_000000432704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563730, "question_id": "Q57xXMPckn2dkhWotuBwpT", "question": "What is the name of the green eating utensil next to the orange?", "choices": ["knife", "sporf", "fork", "spork"], "correct_choice_idx": 1, "direct_answers": ["fork", "spork", "spork", "multifunctional", "fork", "sporf", "spoon", "fork spoon", "spork", "fork"], "difficult_direct_answer": false, "rationales": ["A spork since it's a spoon and a fork.", "It is a combination spoon, fork and knife.", "The utensil is half spoon and half fork."], "image": "val2014/COCO_val2014_000000563730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157108, "question_id": "Q5KykHg6E8SHEf8ehibho5", "question": "What is the purpose of the man in yellow?", "choices": ["singing", "entertainment", "traffic control", "dancing"], "correct_choice_idx": 2, "direct_answers": ["crossing guard", "directing traffic", "traffic", "direct traffic", "direct traffic", "traffic control", "directing traffic", "direct traffic", "traffic control", "direct traffic"], "difficult_direct_answer": false, "rationales": ["The purpose is traffic control.", "The person is directing traffic.", "The man in yellow is directing cars."], "image": "train2014/COCO_train2014_000000157108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91989, "question_id": "Q5nm3o32BXAAbWcDZ3aZfr", "question": "What kind of animals are the people interacting with?", "choices": ["zebras", "giraffes", "elephants", "horses"], "correct_choice_idx": 1, "direct_answers": ["giraffes", "giraffes", "giraffes", "giraffes", "giraffes", "giraffe", "giraffe", "giraffes", "giraffes", "giraffe"], "difficult_direct_answer": false, "rationales": ["The people are interacting with giraffes that have walked up to the fence.", "The animals depicted are spotted with long necks, so they're giraffes.", "The animals are clearly visible and identifiable based on their unique features."], "image": "train2014/COCO_train2014_000000091989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5377, "question_id": "Q5xFVccvQyVZ2vmcgo9GRA", "question": "What liquid is disallowed here?", "choices": ["milk", "oil", "blood", "alcohol"], "correct_choice_idx": 3, "direct_answers": ["alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol"], "difficult_direct_answer": false, "rationales": ["An informational sign near a beach informs that alcohol is not allowed.", "Drinking adult beverages is prohibited according to the sign.", "Alcohol is not allowed."], "image": "train2014/COCO_train2014_000000005377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539335, "question_id": "Q64kHXmYMBdtrJ6b768j8p", "question": "What is normally given out at the desk shown?", "choices": ["masks", "hats", "information", "cats"], "correct_choice_idx": 2, "direct_answers": ["answers", "information", "answers", "information", "enquiries", "information", "information", "information", "information", "answers"], "difficult_direct_answer": false, "rationales": ["The desk says enquiries.", "Info is given.", "When someone seeks facts about something."], "image": "train2014/COCO_train2014_000000539335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48229, "question_id": "Q6HovS7BtYLM6uiTXrmdyE", "question": "The portion of this device that has numbers on it looks like what?", "choices": ["chisel", "shark fins", "mice", "cows"], "correct_choice_idx": 1, "direct_answers": ["fin", "shark fin", "shark fins", "surf board", "shark fins", "waves", "shark", "surfboard", "fins", "board"], "difficult_direct_answer": true, "rationales": ["The shark fins are numbered.", "The portion is the fins.", "Shark fins are pointed."], "image": "train2014/COCO_train2014_000000048229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126001, "question_id": "Q6fNimgqXUVmwNCTxiEkGn", "question": "Who's sitting on the elephant's head?", "choices": ["tourist lady", "nobody", "tourist boy", "gentleman guide"], "correct_choice_idx": 3, "direct_answers": ["trainer", "man", "man", "trainer", "tourists", "tour guide", "guide", "gentleman guide", "man", "people"], "difficult_direct_answer": false, "rationales": ["The people who are sitting directly on the heads and not in seats on the rear, are in matching uniforms and have a guiding stick. based on the uniform and their position in what looks to be the drivers seat, they are likely to be answer a.", "A guide is on the elephant's head.", "The man is a professional and does these tours for a living."], "image": "train2014/COCO_train2014_000000126001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285579, "question_id": "Q6pLEYVSEtXBp78f6F8wcP", "question": "The black item with yellow label is meant for what?", "choices": ["recycling", "growing trees", "garbage", "donations"], "correct_choice_idx": 2, "direct_answers": ["trash", "trash", "holding garbage", "trash disposal", "trash", "trash", "protection wiring", "trash", "trash", "garbage"], "difficult_direct_answer": false, "rationales": ["The item is used to put trash in it.", "It is a trash can", "The trash can is meant for garbage."], "image": "train2014/COCO_train2014_000000285579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114579, "question_id": "Q6qye57wYrZKAs2QLncXU7", "question": "What is required for this activity?", "choices": ["sand", "snow", "sun", "wind"], "correct_choice_idx": 1, "direct_answers": ["snow", "snow", "balance", "snow", "ski poles", "skis", "moving legs", "poles", "skis", "snow"], "difficult_direct_answer": false, "rationales": ["The other options don't match the season, climate or sport.", "Snow is required to ski.", "Snow is required to ski."], "image": "val2014/COCO_val2014_000000114579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4823, "question_id": "Q6uKFZik525gog3mEcpV3R", "question": "What passion might someone in this apartment have that involves music?", "choices": ["singing", "drumming", "playing piano", "disc jockeying"], "correct_choice_idx": 3, "direct_answers": ["gaming", "djing", "disc jockeying", "video games", "do", "disc jockey", "do", "disc jockeying", "djing", "do"], "difficult_direct_answer": false, "rationales": ["There is a painting above the sitting man. it features a character who is spinning records.", "The passion is djing.", "The painting on the wall indicates that this person might have this passion."], "image": "train2014/COCO_train2014_000000004823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14886, "question_id": "Q7FdXXCZMif6eNfsnDxBQ9", "question": "What are the numbers above the lettering on the back of the bus?", "choices": ["address", "area code", "to count", "phone number"], "correct_choice_idx": 3, "direct_answers": ["phone number", "phone number", "phone number", "55851380", "phone number", "55851380", "phone number", "55851380", "phone number", "phone number"], "difficult_direct_answer": false, "rationales": ["Traditionally phone numbers use parenthesis and six numbers after the area code.", "There is a symbol beside the number that indicates the purpose.", "The number to call to get to turns tour"], "image": "train2014/COCO_train2014_000000014886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55425, "question_id": "Q7NfweTvXSjNREEYw9vj6e", "question": "What part of Indian does this cuisine come from?", "choices": ["southern", "central", "northern", "western"], "correct_choice_idx": 2, "direct_answers": ["punjabi", "india", "punjabi", "downtown", "n/a", "north", "northern", "northern", "punjab", "vegetarian"], "difficult_direct_answer": false, "rationales": ["The northern part.", "This comes from northern india.", "Northern indian food is vegetarian."], "image": "train2014/COCO_train2014_000000055425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447179, "question_id": "Q7RmBipSLSmDxZmVwkZeWJ", "question": "What type job does the man in black hold?", "choices": ["dairy", "religious", "factory", "sports"], "correct_choice_idx": 1, "direct_answers": ["clergy", "priest", "priest", "religious", "priest", "clergy", "clergy", "pastor", "priest", "priest"], "difficult_direct_answer": false, "rationales": ["The job is religious.", "The man in the robe is a pastor.", "He is dressed in a long gown and there are glasses of wine which are probably for communion."], "image": "train2014/COCO_train2014_000000447179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485887, "question_id": "Q7dfWTNTTF8s9PU953e2mc", "question": "Why does the vehicle have two levels?", "choices": ["for deliveries", "for speed", "for sightseeing", "for decoration"], "correct_choice_idx": 2, "direct_answers": ["passenger", "tourist bus", "sightseeing", "fit passengers", "sightseeing", "for sightseeing", "carries passengers", "passengers", "carry more", "more efficient"], "difficult_direct_answer": true, "rationales": ["So that passengers can be able to view various sites.", "The bus is for tourists.", "They can carry more tourists who want to see the sights."], "image": "val2014/COCO_val2014_000000485887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16383, "question_id": "Q7hQhgwcHPvCmCz3SpgvxL", "question": "What is the use of the plant placed on the kitchen island?", "choices": ["aesthetics", "herb garnishes", "air quality", "scents"], "correct_choice_idx": 1, "direct_answers": ["decorations", "food", "decoration", "cooking herbs", "display plate", "beauty", "herbs", "house plant", "herb garnishes", "seasoning"], "difficult_direct_answer": true, "rationales": ["The plant appears to have the composition of answer a and would be consistent with the setting of a kitchen where answer a is used.", "The plant is an herb.", "There is a plant pot with herbs used for cooking."], "image": "train2014/COCO_train2014_000000016383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552961, "question_id": "Q8JURR5435JrD2EdeMRVBh", "question": "What is the brand of the bike?", "choices": ["hyundai", "honda", "skoda", "bmw"], "correct_choice_idx": 3, "direct_answers": ["bmw", "bmx", "bmw", "bmw", "bmw", "bmw", "bmw", "modern", "bmw", "bmw"], "difficult_direct_answer": false, "rationales": ["The bike is a bmw.", "There is a logo visible on the motorcycle with a blue and white circle broker into fourths. this logo is associated with the company bmw and a logo appearing on a vehicle like this points to the brand.", "The blue and white logo of this company is attached to the bike."], "image": "train2014/COCO_train2014_000000552961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157019, "question_id": "Q8bnEgLsYCGVGQi7tYWcxF", "question": "What kind of vehicle can park in the middle lane?", "choices": ["ups truck", "delivery vehicle", "school bus", "fire truck"], "correct_choice_idx": 3, "direct_answers": ["fire truck", "fire truck", "fire truck", "fire truck", "fire truck", "fire truck", "fire truck", "fire truck", "fire truck", "fire truck"], "difficult_direct_answer": false, "rationales": ["The truck can park.", "Firetrucks can use the fire lane.", "A busy city street has the word fire printed in the middle lane of a multi lane street. fire trucks have a lane to drive in so they can get by in big cities."], "image": "val2014/COCO_val2014_000000157019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59385, "question_id": "Q8ggQrm7wLR4y6kRFqeCir", "question": "To which Ocean did persons owning this baggage travel to reach an Island recently?", "choices": ["sargasso sea", "atlantic", "none", "pacific"], "correct_choice_idx": 1, "direct_answers": ["atlantic", "atlantic", "atlantic", "atlantic", "atlantic", "caribbean", "pacific", "atlantic", "pacific", "atlantic"], "difficult_direct_answer": false, "rationales": ["Stickers on baggage quite often indicate where a person has traveled. multiple stickers have the word jamaica which is an island in the atlantic ocean.", "There are jamaica stickers on the suitcases. jamaica is not in the pacific ocean.", "The person needed to cross the atlantic."], "image": "train2014/COCO_train2014_000000059385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11223, "question_id": "Q8x7qMPLKqRiG5Dd2mrpZF", "question": "Which channel aired this show?", "choices": ["fox", "paramount", "space", "discovery"], "correct_choice_idx": 3, "direct_answers": ["discovery", "discovery", "discovery", "discovery", "discovery", "discovery", "discovery", "discovery", "discovery", "discovery"], "difficult_direct_answer": false, "rationales": ["A slightly tilted view of what looks to be on top of a snowy mountain. in the bottom portion is in name of channel.", "Discovery channel's logo is shown.", "A scene is shown with the discovery channel logo in the bottom corner."], "image": "train2014/COCO_train2014_000000011223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25316, "question_id": "Q9D3crX9vigQiTjdfyGmFa", "question": "Which fruit is too ripe?", "choices": ["apple", "plum", "peach", "banana"], "correct_choice_idx": 3, "direct_answers": ["banana", "banana", "banana", "banana", "banana", "banana", "bananas", "bananas", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["The yellow curved fruit is beginning to get black spots on it as it starts to overripen.", "The item is losing its yellow color and is going brown. the item is also labeled with the brand name chiquita, which produces this kind of fruit.", "The banana is over ripe."], "image": "val2014/COCO_val2014_000000025316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323827, "question_id": "Q9HEadeSsqLxhjpEqyqrNM", "question": "What event do the rings signify?", "choices": ["dolphin show", "square dance", "school play", "olympics"], "correct_choice_idx": 3, "direct_answers": ["olympics", "olympics", "olympics", "olympics", "olympics", "olympics", "olympics", "olympics", "olympics", "olympics"], "difficult_direct_answer": false, "rationales": ["The particular pattern of overlapping rings visible is commonly known to be associated with answer a.", "The rings are for the olympic rings.", "This photo was taken in london in 2012 during the games."], "image": "val2014/COCO_val2014_000000323827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116380, "question_id": "Q9JWYiJr96yoRrvq9XMXBM", "question": "What brand name mentions the thing visible in the toilet?", "choices": ["water world", "hello kitty", "tidy bowl", "goya beans"], "correct_choice_idx": 1, "direct_answers": ["cat", "kitty litter", "caterpillar", "hello kitty", "bobcat", "yankee candle", "cat", "cat", "cat", "meow"], "difficult_direct_answer": false, "rationales": ["A cat is a kitty.", "Hello kitty is a little girls toy.", "The cat is on the toilet."], "image": "train2014/COCO_train2014_000000116380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90293, "question_id": "Q9QfsJHiv2WeezKWenfGMH", "question": "Why does she have the yellow bag over her head?", "choices": ["no umbrella", "debris airborne", "birds overhead", "hiding face"], "correct_choice_idx": 0, "direct_answers": ["raining", "cover head", "prevent rain", "stay dry", "no umbrella", "rain shield", "raining", "raining", "rain protection", "raining"], "difficult_direct_answer": false, "rationales": ["The woman doesn't have an umbrella and it's raining.", "She doesn't have an umbrella.", "She has nothing to cover her head with from getting wet."], "image": "train2014/COCO_train2014_000000090293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426700, "question_id": "Q9i5bmNVB2sPZKuGPciTtX", "question": "What is this sail made to look like?", "choices": ["legal pad", "bread dip", "sheet", "blanket"], "correct_choice_idx": 0, "direct_answers": ["lined paper", "lined paper", "lined paper", "paper", "notebook paper", "notebook paper", "paper", "lined paper", "legal pad", "jet"], "difficult_direct_answer": false, "rationales": ["The sail is yellow and has blue and red lines. it is similar to the paper that lawyers use.", "The sail looks like a piece of paper.", "The sail is yellow with blue evenly spaced horizontal lines just like the popular paper commonly used by attorneys for notetaking."], "image": "val2014/COCO_val2014_000000426700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109836, "question_id": "Q9vEwsDFfDSx7pPuM3hBJr", "question": "What material is the armchair made out of?", "choices": ["cloth", "linen", "leather", "metal"], "correct_choice_idx": 2, "direct_answers": ["leather", "leather", "leather", "leather", "leather", "leather", "wood", "wood", "leather", "wood"], "difficult_direct_answer": false, "rationales": ["The material is smooth and not fuzzy, so it is likely leather.", "The material is leather.", "The chair is very shiny than the couches."], "image": "train2014/COCO_train2014_000000109836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437613, "question_id": "QA3YwTDnV6bZvysyxzWm4f", "question": "What is the black bag on the sidewalk?", "choices": ["tools", "luggage", "furniture", "groceries"], "correct_choice_idx": 1, "direct_answers": ["luggage", "suitcase", "suitcase", "luggage", "luggage", "suitcase", "luggage", "suitcase", "luggage", "suitcase"], "difficult_direct_answer": false, "rationales": ["The people are waiting to board the bus because they are travelling and have their suitcase full of belongings with them.", "The black bag is a suitcase.", "The black bag is a suitcase."], "image": "train2014/COCO_train2014_000000437613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33416, "question_id": "QA6bqSVxLQjW4vguUKc97A", "question": "Which of these men is most likely from a different country?", "choices": ["plaid shirt", "bmw jacket", "yellow shirt", "striped shirt"], "correct_choice_idx": 1, "direct_answers": ["white man", "hat man", "wearing sunglasses", "bmw jacket", "bmw jacket", "black jacket", "with hut", "bmw jacketed", "bmw jacket", "big guy"], "difficult_direct_answer": false, "rationales": ["He has a different skin tone than the rest of the people near him.", "The guy is taking a picture and seems to be a foreigner.", "The men in the yellow, striped, and plaid shirts all have a similar skin colour."], "image": "train2014/COCO_train2014_000000033416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264322, "question_id": "QAPcVxvbyDtVJNtShDjaBe", "question": "Who regularly wore the item the man has over his pants?", "choices": ["man ray", "ray charles", "ray lewis", "rachel ray"], "correct_choice_idx": 3, "direct_answers": ["cooks", "alfredo linguini", "chef", "rachel ray", "cook", "chefs", "apron", "apron", "pizza man", "chef boyardee"], "difficult_direct_answer": true, "rationales": ["Rachel ray is a chef.", "Rachel ray is a chef.", "Rachel ray is a famous chef, and aprons are commonly worn by chefs while cooking."], "image": "train2014/COCO_train2014_000000264322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21404, "question_id": "QAUUutsdGqqkigbtSyJ9Xp", "question": "What numeral system is used for the numbers on the clock?", "choices": ["binary", "roman", "hindu-arabic", "egyptian"], "correct_choice_idx": 1, "direct_answers": ["roman", "roman", "roman numeral", "roman", "roman", "base 12", "roman", "roman numerals", "roman", "roman"], "difficult_direct_answer": false, "rationales": ["The other options don't match these characters. a is commonly used on clocks.", "Roman numerals are ones that feature this kind of i, x and v numbering system.", "The numerals are x and is."], "image": "train2014/COCO_train2014_000000021404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240119, "question_id": "QAiBtEaUTD2cJSqPVR2t4V", "question": "How many types of fire engines are available?", "choices": ["four", "five", "three", "two"], "correct_choice_idx": 0, "direct_answers": ["one", "four", "two", "two", "three", "two", "two", "two", "two", "three"], "difficult_direct_answer": false, "rationales": ["There are two firetrucks visible.", "There are a couple of fire engines.", "I don't see more than this number."], "image": "val2014/COCO_val2014_000000240119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281315, "question_id": "QBEMN4a8UiDj2KRARQo6Pj", "question": "What is the man in the white shirt ready to do?", "choices": ["run", "dribble", "catch", "sit"], "correct_choice_idx": 2, "direct_answers": ["drop arm", "throw", "catch frisbee", "catch frisbee", "catch", "catch", "catch frisbee", "catch frisbee", "catch", "toss"], "difficult_direct_answer": false, "rationales": ["The man in the red shirt just threw or is ready to catch the frisbee.", "The man is catching.", "He has his hand out to catch."], "image": "val2014/COCO_val2014_000000281315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568259, "question_id": "QBEPdHUurKAvTRfQbqoCTs", "question": "What three letters are behind his head?", "choices": ["ghu", "heb", "ful", "rty"], "correct_choice_idx": 1, "direct_answers": ["heb", "heb", "heb", "heb", "heb", "heb", "heb", "heb", "heb", "heb"], "difficult_direct_answer": false, "rationales": ["A sign is behind a tennis player and the letters h-e-b can be seen.", "The letters are written in green on the stands behind the player.", "It is a texas grocery store."], "image": "val2014/COCO_val2014_000000568259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298146, "question_id": "QBGpAbuf2MryxBw5N2oadt", "question": "What sport are the two people in the water participating in?", "choices": ["surfing", "sailing", "swimming", "para waterskiing"], "correct_choice_idx": 3, "direct_answers": ["jet skiing", "kite surfing", "wind surfing", "speedboats", "kiting", "parasailing", "para waterskiing", "water paragliding", "hang gliding", "skiing"], "difficult_direct_answer": true, "rationales": ["The people are para waterskiing.", "The people in the water are para waterskiing which involved using wind sails to pull them on water skis.", "This activity requires a kite-like object."], "image": "train2014/COCO_train2014_000000298146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283611, "question_id": "QBec7ReUeiRUpLHxFQgDJP", "question": "What type of rice is on the plate?", "choices": ["white", "risotto", "brown", "mexican"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange", "spanish rice", "vegetables meat", "yellow", "orange", "mexican", "spiced rice", "mexican", "mexican"], "difficult_direct_answer": false, "rationales": ["The shape of the rice resembles the rice noted.", "Mexican rice is orange.", "The rice is heavily spiced and colored with tomatoes to make it red."], "image": "train2014/COCO_train2014_000000283611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319031, "question_id": "QBtLE7qCFKDvE2CTy5fjdJ", "question": "Where does the kid want to kick the ball?", "choices": ["over fence", "backwards", "past boy", "right"], "correct_choice_idx": 2, "direct_answers": ["goal", "goal", "goal", "net", "into net", "goal", "goal", "friend", "past goalie", "past boy"], "difficult_direct_answer": false, "rationales": ["Based on the ball, the setting and how the boys are using it, they are playing soccer. in line with the objectives of soccer, the boy with the ball would need to score a goal behind the other.", "The kid wants to kick the ball past a boy with a goal.", "The kid wants to kick the ball into the goal behind the goalie boy."], "image": "train2014/COCO_train2014_000000319031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443094, "question_id": "QC8skwdokMX6JXbdnKLqnC", "question": "What studio created the character next to the boy?", "choices": ["lion's gate", "mgm", "paramount", "illumination entertainment"], "correct_choice_idx": 3, "direct_answers": ["illumination entertainment", "disney", "pixar", "pixar", "pixar", "disney", "pixar studios", "illumination entertainment", "pixar", "illumination"], "difficult_direct_answer": false, "rationales": ["A boy is smiling as he reaches in to pose with a minion. the name of the movie company can be found in beginning of minion films.", "The studio who created the character is illumination entertainment for despicable me.", "The boy has a minion on his cake."], "image": "train2014/COCO_train2014_000000443094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237618, "question_id": "QCALmog4KYByRgUUymT3ox", "question": "Why do the men wear hats?", "choices": ["fashion", "prevent sunburn", "dress code", "protect head"], "correct_choice_idx": 1, "direct_answers": ["protect sun", "prevent sunburn", "avoid sunlight", "cowboys", "sun", "sun protection", "block sun", "shade", "protect head", "sun protection"], "difficult_direct_answer": true, "rationales": ["The hats visible have wide brims that are commonly known to block sunshine from the face for the purposes of answer a.", "The men are wearing hats to keep out the sun.", "The men don't want to be sunburned."], "image": "val2014/COCO_val2014_000000237618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498537, "question_id": "QCbAzGqcVmHiLBBx7Np4pd", "question": "Which food offers the most protein?", "choices": ["ginger", "pudding", "bread", "egg"], "correct_choice_idx": 3, "direct_answers": ["egg", "chicken", "egg", "egg", "meat", "egg", "eggs", "eggs", "egg", "egg"], "difficult_direct_answer": false, "rationales": ["It has more protein than almost everything other than meat.", "The yellow yolk is full of protein.", "Most animal products will be higher in protein than most plant based foods"], "image": "val2014/COCO_val2014_000000498537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428992, "question_id": "QD5FGvCTFuSyYJYA6k6vhU", "question": "What meal is being served?", "choices": ["brunch", "dinner", "breakfast", "lunch"], "correct_choice_idx": 1, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "dinner", "pizza"], "difficult_direct_answer": false, "rationales": ["The time on the clock is 5:46 and the food is pizza. pizza is not usually consumed for breakfast or brunch so it must be 5:46 pm which is too late for lunch to be served.", "The time on the clock and the type of food being served means this is likely answer a.", "Based on the time visible on the clock and the food seen, these two things would be associated with answer a."], "image": "val2014/COCO_val2014_000000428992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329753, "question_id": "QDFJ7jDaJHWGXMdMPG7hhP", "question": "What are these people waiting for?", "choices": ["ride", "meth", "eclipse", "lunch"], "correct_choice_idx": 0, "direct_answers": ["bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "ride", "bus"], "difficult_direct_answer": false, "rationales": ["The people want a ride.", "The people are holding their things while standing near the road.", "The people are waiting on the side of the road."], "image": "train2014/COCO_train2014_000000329753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44801, "question_id": "QDTSHaYKQ6J87P2Mn3dmkB", "question": "Why are all the vehicles on the left not moving?", "choices": ["tired", "red light", "parade", "accident"], "correct_choice_idx": 1, "direct_answers": ["stop light", "traffic", "traffic", "at stop", "stoplight", "stop light", "traffic light", "stoplight", "red light", "traffic"], "difficult_direct_answer": false, "rationales": ["The vehicles are at a red light.", "A red traffic light is telling the drivers to stop.", "The vehicles are waiting for a green light."], "image": "val2014/COCO_val2014_000000044801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406646, "question_id": "QDrRZxPNr8cqWrVsGzXaqm", "question": "What can be found here?", "choices": ["cats", "dogs", "bats", "pots"], "correct_choice_idx": 3, "direct_answers": ["plants", "plants", "plants", "green house", "plants", "green house", "plants", "green house", "pots", "plants"], "difficult_direct_answer": false, "rationales": ["There are a lot of plants that are in different types of containers.", "The room shows various potted plants that people can shop for.", "The other options don't apply to a greenhouse or garden store setting."], "image": "train2014/COCO_train2014_000000406646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308316, "question_id": "QDuALV8vjXxC2suy5FHJ4m", "question": "What type of land does this plane fly over?", "choices": ["urban", "farm", "city", "desert"], "correct_choice_idx": 1, "direct_answers": ["farm land", "crop fields", "farm", "plantation", "farmland", "farmland", "farmland", "farmland", "farmland", "farm"], "difficult_direct_answer": false, "rationales": ["The land is divided into rectangles. the land is mostly green.", "There are large fields under the plane.", "The plane is over fields."], "image": "val2014/COCO_val2014_000000308316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131919, "question_id": "QE4qvhSYX94xXXGnkJ3hjJ", "question": "Which country invented sunglasses?", "choices": ["israel", "greece", "china", "italy"], "correct_choice_idx": 2, "direct_answers": ["chinese", "china", "italy", "italy", "chinese", "usa", "china", "china", "usa", "chinese"], "difficult_direct_answer": false, "rationales": ["They were invented in asia.", "A woman is wearing sunglasses on a path on a sunny day.", "The chinese invented the sunglasses."], "image": "val2014/COCO_val2014_000000131919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50028, "question_id": "QE7NXmcyyMQ5vQXMeuEjXR", "question": "Why is the woman wearing a scarf?", "choices": ["dress code", "fashion", "cosplay", "warmth"], "correct_choice_idx": 3, "direct_answers": ["snow", "cold", "cold", "cold", "for warmth", "cold", "warmth", "cold", "warmth", "cold"], "difficult_direct_answer": false, "rationales": ["It's cold out.", "The woman is standing in the snow.", "There is snow in the background which is consistent with a cold environment which would lead someone to wear a scarf to stay warm."], "image": "train2014/COCO_train2014_000000050028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252724, "question_id": "QE9Y62S9dzjnbpWqTMKYkG", "question": "What can the animals on the left do that the animals on the right cannot?", "choices": ["run", "swim", "reach high", "talk"], "correct_choice_idx": 2, "direct_answers": ["standing", "eat leaves", "reach high", "reach high", "viewing heights", "reach high", "reach high", "eat leaves", "jump", "eat leaves"], "difficult_direct_answer": false, "rationales": ["Giraffes are tall", "The left animals are notorious for long necks and visibly have a significant advantage in that area compared to the other visible animals. the other options are less physically observable.", "The animals are tall."], "image": "train2014/COCO_train2014_000000252724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211113, "question_id": "QEm8bAynvbHhWcEfcdoEJW", "question": "Why are the benches black?", "choices": ["natural color", "city requirement", "longer wear", "cheapest paint"], "correct_choice_idx": 2, "direct_answers": ["painted", "paint", "paint", "painted", "preference", "painted", "painted black", "paint", "painted black", "longer wear"], "difficult_direct_answer": false, "rationales": ["These benches are the colors they are due to they get a lot of sunlight and don't want to burn anyone.", "Someone painted them that color to last longer.", "The dark colors help the benches to keep clean and last longer."], "image": "train2014/COCO_train2014_000000211113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25685, "question_id": "QF3H6PztMuogpZGgJk6rNw", "question": "Based on the circumstances of the image what method of transportation currently moves the fastest?", "choices": ["walking", "airplane", "motorboat", "swimming"], "correct_choice_idx": 2, "direct_answers": ["boat", "boat", "boat", "boat", "plane", "boat", "boat", "airplane", "motorboat", "boat"], "difficult_direct_answer": false, "rationales": ["The motorboat is generating waves.", "The plane seems to have fallen from the sky.", "The other option isn't actively moving."], "image": "val2014/COCO_val2014_000000025685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561806, "question_id": "QFEgAgxVHWijCarhxfJGqV", "question": "What is hanging off the bike handlebars?", "choices": ["backpack", "suitcase", "planner", "vest"], "correct_choice_idx": 0, "direct_answers": ["bag", "bag", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "bag"], "difficult_direct_answer": false, "rationales": ["The guy has his backpack hanging off the bars.", "There is a backpack on the handlebars of the bike.", "The backpack is hanging."], "image": "val2014/COCO_val2014_000000561806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526892, "question_id": "QFmLnt9VSBZLVWvBXAzwhm", "question": "What is the guy on bare feet doing with the table?", "choices": ["cleaning", "waxing", "painting", "sanding"], "correct_choice_idx": 1, "direct_answers": ["waxing", "waxing", "wiping", "waxing", "polishing", "wiping", "smoothing", "polishing", "sanding", "waxing"], "difficult_direct_answer": false, "rationales": ["The man is waxing the wood table.", "Applying a substance on a surfboard increase the grip when surfing.", "The guy is waxing."], "image": "val2014/COCO_val2014_000000526892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67956, "question_id": "QFu5Mc4jWTxAwmk3MoMv55", "question": "What must vehicles do when reaching the corner near the blue trash container?", "choices": ["park", "reverse", "yield", "stop"], "correct_choice_idx": 3, "direct_answers": ["stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop"], "difficult_direct_answer": false, "rationales": ["There is a red octagonal sign at the corner telling the traffic not to go before waiting and checking that there is no opposing traffic coming before proceeding.", "There is a sign saying to come to a complete halt.", "There is a sign above the blue trash can which gives instruction to the drivers and is readable."], "image": "val2014/COCO_val2014_000000067956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97097, "question_id": "QGQqy5igwJbC7TLppMtGsY", "question": "What is the danger of partaking in this activity with no jacket?", "choices": ["starvation", "hypothermia", "bear attack", "dehydration"], "correct_choice_idx": 1, "direct_answers": ["hypothermia", "skin injury", "windburn", "being cold", "frostbite", "frostbite", "frostbite", "frost bite", "frost bite", "hypothermia"], "difficult_direct_answer": false, "rationales": ["They are skiing in a cold snow-covered area. doing this without a jacket could lead to frostbite or worse.", "Skiing is a sport that requires snow. skiers are likely to get cold if they do not wear a warm garment.", "A person could get overly-cold in the snow region."], "image": "train2014/COCO_train2014_000000097097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195542, "question_id": "QGZWN59LcuEB3kWeUzxeQv", "question": "What protective material is covering the wood desk that the laptop is on?", "choices": ["plastic", "epoxy", "glass", "lacquer"], "correct_choice_idx": 2, "direct_answers": ["glass", "glass cover", "gloss", "glass", "glass", "table", "plexiglass", "glass", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["The covering is transparent with a rolled edge", "There is see thru material on the desk. it is hard and reflective.", "The way it shines it is the glass."], "image": "val2014/COCO_val2014_000000195542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377235, "question_id": "QGcA6HJLMNsyqJkR6HcjSP", "question": "At what venue are people seated outdoors on yellow chairs?", "choices": ["park", "sporting event", "sidewalk cafe", "bus stop"], "correct_choice_idx": 2, "direct_answers": ["cafe", "sidewalk", "restaurant", "outdoor caffee", "sidewalk cafe", "cafe", "cafe", "cafe", "cafe", "cafe"], "difficult_direct_answer": false, "rationales": ["There is some people eating food.", "The venue is a cafe.", "These people are seated outdoors at a sidewalk cafe."], "image": "val2014/COCO_val2014_000000377235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165820, "question_id": "QGr3iDfZgBJfqp4wPvqV4F", "question": "What type of content does the website advertised in the background provide?", "choices": ["movies", "blogs", "social media", "music"], "correct_choice_idx": 3, "direct_answers": ["music", "music", "music", "radio", "radio", "entertainment", "radio", "music", "radio", "music"], "difficult_direct_answer": false, "rationales": ["The other options wouldn't apply to a radio station.", "The radio station 92.9 is listed.", "The radio offers music."], "image": "train2014/COCO_train2014_000000165820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54849, "question_id": "QH2um9hxdGkuoSF2iExCjT", "question": "What is the man wearing?", "choices": ["sandals", "mask", "hat", "suspenders"], "correct_choice_idx": 0, "direct_answers": ["shorts", "shorts", "headphones", "hoodie", "clothes", "hoodie", "shorts", "hoodie", "shorts", "sandals"], "difficult_direct_answer": false, "rationales": ["The man has flip flops on.", "The man's head and face are not covered. his feet are partially covered.", "He has flip flops on."], "image": "train2014/COCO_train2014_000000054849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504108, "question_id": "QH9keiTxweMZwEcqbWncfh", "question": "What age group normally eats this food?", "choices": ["young adults", "teenagers", "kids", "seniors"], "correct_choice_idx": 2, "direct_answers": ["children", "children", "children", "children", "kids", "children", "children", "children", "children", "children"], "difficult_direct_answer": false, "rationales": ["The cereal is colored. children like colors.", "The food is cereal based on the shape and coloring and it's serving method. this food type is commonly associated with answer a.", "It is a cereal with lots of sugar so younger humans like it better."], "image": "train2014/COCO_train2014_000000504108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156986, "question_id": "QHGWAeMAtVsE4PTYkxFqHN", "question": "What is the skater doing on the rail?", "choices": ["grinding", "flipping", "manualing", "whipping"], "correct_choice_idx": 0, "direct_answers": ["grinding", "grinding", "grinding", "grinding", "gliding", "grinding", "grinding", "gliding", "riding", "grinding"], "difficult_direct_answer": false, "rationales": ["The skater is grinding on top of the rail.", "The skater is grinding on the rail by using the inside of his board to slide across it,", "The skateboard is in a grind position on its trucks."], "image": "train2014/COCO_train2014_000000156986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402967, "question_id": "QHQFqpDyJJuzGdRKpqe2no", "question": "What structure is located here?", "choices": ["cafe", "pagoda", "pavilion", "barn"], "correct_choice_idx": 2, "direct_answers": ["bus stop", "transportation center", "bus station", "bus station", "pavilion", "indoor mall", "bus depot", "busstop", "bus terminal", "bus stop"], "difficult_direct_answer": false, "rationales": ["A pavilion is located here.", "This structure doesn't have walls but has a roof like a pavilion does.", "The bus is parked by a large pavilion that leads to a place where events take place."], "image": "val2014/COCO_val2014_000000402967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365293, "question_id": "QHRdUGdk6nqT5LLkfNZhzL", "question": "Why is the dog on the board?", "choices": ["steering", "put there", "stealing board", "lost"], "correct_choice_idx": 1, "direct_answers": ["put there", "to surf", "surfing", "enjoys water", "companionship", "to paddle", "surfing", "surfing", "surfing", "photo"], "difficult_direct_answer": false, "rationales": ["The dog was placed on the board by the man.", "The dog is standing on the board. his owner is there with him.", "The human is surfing with his dog"], "image": "train2014/COCO_train2014_000000365293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251576, "question_id": "QHUu5tVjjBte396T3FHsrH", "question": "Whats is the child doing?", "choices": ["cooking", "working", "sleeping", "wakeboarding"], "correct_choice_idx": 3, "direct_answers": ["surfing", "swimming", "surfing", "boogie boarding", "swimming", "swimming", "riding board", "playing", "wakeboarding", "surfing"], "difficult_direct_answer": false, "rationales": ["He's on a board on water", "The kid is wakeboarding.", "The child has a board in water."], "image": "train2014/COCO_train2014_000000251576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41357, "question_id": "QHVSAGk3hnR2LHqnQxRwQL", "question": "What type event does 377 participate in here?", "choices": ["prison", "race", "vacation", "holiday"], "correct_choice_idx": 1, "direct_answers": ["skiing", "race", "skiing", "skiing", "skiing", "skiing", "skiing", "cross country", "cross country", "skiing"], "difficult_direct_answer": false, "rationales": ["The person has a bib with an identifying number, usually only used for competitions.", "The person is competing.", "The person is competing in a ski race."], "image": "train2014/COCO_train2014_000000041357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215701, "question_id": "QHmeyJbTfY3nrR9CNTcG7G", "question": "Which food will most likely get eaten with the fork?", "choices": ["sandwich bread", "sandwich contents", "pickle", "fruit"], "correct_choice_idx": 3, "direct_answers": ["fruit", "fruit", "fruit", "fruit", "fruits", "fruit", "fruit", "fruit", "fruits", "fruit"], "difficult_direct_answer": false, "rationales": ["The food is fruit.", "Fruit is often eaten with a fork.", "The cut fruit would be eaten with a fork."], "image": "train2014/COCO_train2014_000000215701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495756, "question_id": "QHnsgBWWV3UZ4NWCdnD2Ud", "question": "What is the Red Forestry truck driving in?", "choices": ["fire", "parade", "auto mall", "forest picnic"], "correct_choice_idx": 1, "direct_answers": ["parade", "road", "parade", "both lanes", "parade", "street", "something one", "parade", "road", "town"], "difficult_direct_answer": false, "rationales": ["There are people on the side of the street like seen in a parade.", "It is used to parade before going to mission.", "The truck is in a parade."], "image": "train2014/COCO_train2014_000000495756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293072, "question_id": "QHsR7nh3WfSBE27AQX4aka", "question": "When will this girl be old enough for Kindergarten?", "choices": ["3 years", "1 year", "2 years", "this year"], "correct_choice_idx": 1, "direct_answers": ["five", "couple years", "few years", "one year", "5 year", "five", "1 year", "five years", "few years", "one year"], "difficult_direct_answer": false, "rationales": ["Kindergarten is for five year olds. there are four candles on the cake.", "She is smaller than the boy in front of the birthday cake and he will be eligible in 1 year", "There are 4 candles on the cake, indicating she is 4 years old. 5 years old is the typical kindergarten age."], "image": "val2014/COCO_val2014_000000293072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518914, "question_id": "QHzAxA9K6sT4YfkrhmbMcP", "question": "The device behind the orange umbrella is used to charge for what service?", "choices": ["battery charging", "bus fares", "street parking", "valet parking"], "correct_choice_idx": 2, "direct_answers": ["parking", "parking", "cell service", "parking", "electricity", "cell phone", "parking", "parking", "street parking", "parking"], "difficult_direct_answer": false, "rationales": ["People use it in case it rains outside.", "The orange umbrella is for street parking.", "The device behind the orange umbrella is near cars that are stationary. it is a meter."], "image": "val2014/COCO_val2014_000000518914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430273, "question_id": "QJCDhGFGeYPsufHknVBCGE", "question": "The trains have what safety feature on the glass to help see visibly in stormy weather?", "choices": ["turning signals", "windshield wipers", "high beams", "fog heater"], "correct_choice_idx": 1, "direct_answers": ["windshield wiper", "anti-glare treatment", "wipers", "windshields", "windshield wipers", "wiper blades", "wipers", "windshield wipers", "wipers", "windshield wipers"], "difficult_direct_answer": false, "rationales": ["When it rains the drops can stay on the windshield. it can be hard to see through them if they are not wiped away.", "Windshield wipers are moving on the glass.", "The trains have wipers."], "image": "val2014/COCO_val2014_000000430273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215346, "question_id": "QJa684kqiJVd5gqmT4fTkS", "question": "What meal is this man going to have?", "choices": ["dinner", "afternoon tea", "breakfast", "lunch"], "correct_choice_idx": 1, "direct_answers": ["cake", "cake", "dessert", "meat pie", "dessert", "dessert", "afternoon tea", "dessert", "cake", "cake"], "difficult_direct_answer": false, "rationales": ["The meal is tea.", "The man has little tea cakes on his plate.", "The man has a piece of cake. the serving tray and dishes for the cake are fancy."], "image": "train2014/COCO_train2014_000000215346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555461, "question_id": "QJmQ8SWsUK3QSFvWE6NvTT", "question": "What is the man's hat called?", "choices": ["felt", "bowler hat", "coach", "trilby"], "correct_choice_idx": 1, "direct_answers": ["top hat", "top hat", "bowler hat", "top hat", "top hat", "tophat", "top hat", "beanie", "tophat", "bowler"], "difficult_direct_answer": false, "rationales": ["The hat is made out of felt.", "The hat is a bowler one.", "The shape of the hate gives it the name"], "image": "val2014/COCO_val2014_000000555461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479769, "question_id": "QJneByBJm828nikDpzTfXy", "question": "The ski lifts travel along what material?", "choices": ["rope", "cable", "string", "branch"], "correct_choice_idx": 1, "direct_answers": ["cable", "metal", "wire", "wire", "wires", "wire", "wires", "cable", "wire", "cable"], "difficult_direct_answer": false, "rationales": ["The lifts are in air.", "You can see the wires hanging from right to left.", "There are two lifts hanging above. they are connected to long black ropes."], "image": "train2014/COCO_train2014_000000479769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132521, "question_id": "QKXEqbbEn932c653DiSn5Z", "question": "What is inside the Chicago Sun-Times box?", "choices": ["magazines", "mail", "maps", "newspaper"], "correct_choice_idx": 3, "direct_answers": ["newspaper", "newspaper", "newspaper", "newspapers", "newspapers", "newspapers", "newspapers", "newspaper", "newspaper", "newspapers"], "difficult_direct_answer": false, "rationales": ["The chicago sun times is the name of a newspaper.", "The box on the street corner has copies of the chicago sun-times newspaper inside.", "That is what is sold by that business"], "image": "train2014/COCO_train2014_000000132521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530551, "question_id": "QKp32BZcbFEuRiZVKX9gQa", "question": "What type of fuel does the truck take?", "choices": ["gas", "lighter fluid", "kerosene", "petroleum"], "correct_choice_idx": 0, "direct_answers": ["unleaded", "gasoline", "diesel", "ethanol", "gas", "diesel", "regular", "diesel", "gas", "diesel"], "difficult_direct_answer": false, "rationales": ["It runs on unleaded gasoline.", "The truck on the street is a pickup truck that takes regular gas to run.", "The truck is a gasoline truck."], "image": "train2014/COCO_train2014_000000530551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60179, "question_id": "QKrcgMhCkp3ASEWeCcTUxG", "question": "What symbol is being displayed here?", "choices": ["carp", "bat", "flying fish", "dragon"], "correct_choice_idx": 3, "direct_answers": ["dragon", "fish", "whale", "winged shark", "fish", "fish", "logo", "bat", "dragon", "dragonfly"], "difficult_direct_answer": false, "rationales": ["It is a fish with wings.", "The symbol is a dragon.", "The creatures has wings and fins"], "image": "val2014/COCO_val2014_000000060179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295565, "question_id": "QKwmUJ7vnYUqcgaMJZcae8", "question": "What is at the front of the train?", "choices": ["bear", "cat", "old lady", "child"], "correct_choice_idx": 3, "direct_answers": ["child", "boy", "child", "kid", "children", "little boy", "boy", "boy", "kids", "dad kids"], "difficult_direct_answer": false, "rationales": ["The kid is in front.", "A little boy is at the front.", "Children are in front of the train."], "image": "train2014/COCO_train2014_000000295565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379636, "question_id": "QKxPxSoQ4oWAHLLvdp9JuX", "question": "What are the most plentiful items on the plate made of?", "choices": ["squid", "deer", "apple", "potato"], "correct_choice_idx": 3, "direct_answers": ["potato", "potatoes", "potatoes", "potatoes", "potatoes", "potato", "potato", "potato", "potato", "potatoes"], "difficult_direct_answer": false, "rationales": ["The item is the potato.", "The most plentiful items are french fries.", "They are french fries."], "image": "val2014/COCO_val2014_000000379636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104631, "question_id": "QLVaukzSuVDM3XNca6fji3", "question": "What singer has the same last name as the word that appears on the board?", "choices": ["eddie money", "gwen stefani", "pink", "johnny cash"], "correct_choice_idx": 3, "direct_answers": ["johnny cash", "johnny cash", "cash", "johnny cash", "johnny cash", "johnny cash", "johnny cash", "johnny cash", "no", "johnny"], "difficult_direct_answer": false, "rationales": ["The other options don't match the word on the board. he sang country and even some rock songs.", "The word is cash and johnny cash is a famous singer.", "The word that appears on the board is cash."], "image": "train2014/COCO_train2014_000000104631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59024, "question_id": "QLXwM2ZvAatGQuzzV8HUFR", "question": "Which occasion is this for?", "choices": ["birthday", "anniversary", "christmas", "easter"], "correct_choice_idx": 1, "direct_answers": ["anniversary", "anniversary", "wedding", "anniversary", "anniversary", "birthday celebrating", "anniversary", "anniversary", "birthday celebrating", "anniversary"], "difficult_direct_answer": false, "rationales": ["There is a 50 on top of the cake. it represents their marriage of 50 years.", "Anniversary cake is being served.", "It looks like they are celebrating a wedding anniversary."], "image": "train2014/COCO_train2014_000000059024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568358, "question_id": "QLaNLDfdAwdDFcDJxoGGBg", "question": "What is the liquid?", "choices": ["milk", "oil", "water", "juice"], "correct_choice_idx": 1, "direct_answers": ["oil", "oil", "oil", "oil", "water", "oil", "oil", "water", "donuts", "oil"], "difficult_direct_answer": false, "rationales": ["This product is used to fry food with and it easily heats to many tempertures.", "Oil is used to cook the donuts.", "The food is frying."], "image": "val2014/COCO_val2014_000000568358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365696, "question_id": "QLspdXD5ciUyuvTn33Zh3C", "question": "What sporting event could these animals participate in?", "choices": ["kentucky derby", "world series", "indianapolis 500", "super bowl"], "correct_choice_idx": 3, "direct_answers": ["racing", "horse racing", "pulling", "horse racing", "kentucky derby", "show jumping", "horse racing", "super bowl", "pulling", "carriage pulling"], "difficult_direct_answer": false, "rationales": ["These animals are horses. they could compete in a horse racing event but not human sporting events.", "The other options don't use horses.", "The horses are clydesdale like what is used for beer commercials."], "image": "train2014/COCO_train2014_000000365696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24159, "question_id": "QLwHjgERnLNDtX9MDujAWC", "question": "What holiday has the woman made the cake for?", "choices": ["labor day", "christmas", "halloween", "spring break"], "correct_choice_idx": 1, "direct_answers": ["christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas"], "difficult_direct_answer": false, "rationales": ["There is a green pine tree with ornaments and presents visible on the cake which are elements consistent with answer a.", "There is a christmas tree on the cake.", "A decorated tree with presents underneath would signify the same holiday that celebrates the birth of jesus."], "image": "train2014/COCO_train2014_000000024159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242745, "question_id": "QMHrQB6Bfdk3jMXVGrxDao", "question": "How is the woman in the tan shorts feeling?", "choices": ["amused", "sad", "happy", "loving"], "correct_choice_idx": 1, "direct_answers": ["extremely upset", "sad", "sad", "upset", "sad", "sad", "sad", "sad", "sad", "sad"], "difficult_direct_answer": false, "rationales": ["The woman is sad.", "She looks like she's about to cry.", "The person appears to have wet, puffy eyes and a frown. these attributes are often associated with crying which is a proponent of being sad."], "image": "train2014/COCO_train2014_000000242745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297676, "question_id": "QMWUuFJAUY6mcKrjPrCtzf", "question": "What handedness is held by the pitcher?", "choices": ["left", "right", "none", "both"], "correct_choice_idx": 0, "direct_answers": ["left", "left handed", "left handed", "left", "left", "left", "left", "left handed", "left", "left"], "difficult_direct_answer": false, "rationales": ["The pitcher is holding the ball visibly in one hand which confirms their likely handedness.", "The pitcher is holding the ball in his left hand and ready to throw it.", "He is holding the ball with his left hand."], "image": "val2014/COCO_val2014_000000297676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509194, "question_id": "QMdzULigtGzPYHKc9G9xNR", "question": "How did the player here perform most recently?", "choices": ["lost", "conceded", "won", "tied"], "correct_choice_idx": 2, "direct_answers": ["great", "won", "he won", "very good", "successfully", "well", "great", "well", "won", "very well"], "difficult_direct_answer": false, "rationales": ["The player won.", "He displays a triumphant fist", "Agassi is doing a fist pump which generally means something good."], "image": "val2014/COCO_val2014_000000509194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563935, "question_id": "QMhAzN4KnoKa8xdYojghK5", "question": "What is this lady doing?", "choices": ["sleeping", "posing", "working", "cleaning up"], "correct_choice_idx": 1, "direct_answers": ["sitting down", "posing", "sitting down", "posing", "posing", "posing", "posing", "posing", "posing", "posing"], "difficult_direct_answer": false, "rationales": ["The woman is posing.", "She has her hand \"postured\" in a way for the picture. she is in a costume not meant for bathrooms.", "The woman is in an unnatural position in a costume which is consistent with someone doing answer a."], "image": "train2014/COCO_train2014_000000563935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576001, "question_id": "QMjnFUDEaNnLmkqdikKGod", "question": "What is the man doing in the snow?", "choices": ["hiking", "plowing", "building snowmen", "shoveling"], "correct_choice_idx": 0, "direct_answers": ["skiing", "hiking", "skiing", "walking", "walking", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["The man is hiking.", "The man is hiking in the snow with a pair of hiking poles.", "He is hiking."], "image": "train2014/COCO_train2014_000000576001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11349, "question_id": "QMrzhYuLQ3xDCXHRh2UpMA", "question": "Skateboarding is which seasonal Olympic game?", "choices": ["summer", "spring", "winter", "autumn"], "correct_choice_idx": 0, "direct_answers": ["summer olympics", "summer", "summer", "summer", "summer olympics", "summer", "summer", "summer olympics", "summer", "summer olympics"], "difficult_direct_answer": false, "rationales": ["There are no spring or autumn olympics. winter weather would not be suitable for skateboarding.", "Traditionally this type of sports are done in winter, in 2024 they will be held in the summer.", "Skateboarding is always a sport that is featured in the summer."], "image": "train2014/COCO_train2014_000000011349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489542, "question_id": "QMzkkMuEJXEjc9r9tFd6Jf", "question": "What type of drink in in the jar?", "choices": ["wine", "coke", "7-up", "iced coffee"], "correct_choice_idx": 3, "direct_answers": ["coffee", "coffee", "coffee", "coffee", "iced coffee", "whiskey", "coffee", "cold beverage", "soda", "coffee"], "difficult_direct_answer": false, "rationales": ["The drink is light brown with ice on it.", "The color and the floating ice cubes show that it is iced coffee.", "Iced coffee is brown and has ice cubes in it."], "image": "train2014/COCO_train2014_000000489542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34423, "question_id": "QNDyJdAYoYV6pjPEFqfqFi", "question": "When is the favorite time to take the above meal?", "choices": ["supper", "any", "breakfast", "lunch"], "correct_choice_idx": 2, "direct_answers": ["morning", "breakfast", "breakfast", "breakfast", "morning", "breakfast", "morning", "breakfast", "morning", "breakfast"], "difficult_direct_answer": false, "rationales": ["Breakfast is the meal.", "Eggs and bacon are served in the morning.", "Meat and pastries are often enjoyed at breakfast."], "image": "train2014/COCO_train2014_000000034423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481635, "question_id": "QNYgcBvZ23MVhVYvMYYQ2Q", "question": "How did she know what to order?", "choices": ["menu", "other patrons", "server", "google"], "correct_choice_idx": 0, "direct_answers": ["menu", "menu", "menu", "menu", "menu", "menu", "menu", "menu", "menu", "menu"], "difficult_direct_answer": false, "rationales": ["There is a menu sitting on the table in front of her.", "This person is sitting in front of a menu.", "The menu is in front of her."], "image": "val2014/COCO_val2014_000000481635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304217, "question_id": "QNbeNwEtTcgYnLWnXdqfj2", "question": "What would the red sign on the outer wall say if it was found in Germany?", "choices": ["achtung", "sayonara", "frau", "katze"], "correct_choice_idx": 0, "direct_answers": ["achtung", "honda", "honda", "achtung", "achtung", "gefahr", "gefahr", "achtung", "danger", "unknown"], "difficult_direct_answer": false, "rationales": ["This is a german word for attention! watch out!.", "The sign is for achtung.", "The sign would say achtung."], "image": "val2014/COCO_val2014_000000304217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263146, "question_id": "QNbwXDLeZzhzGYwoJukTxi", "question": "How fast is the dog driving here?", "choices": ["20 mph", "in reverse", "50 mph", "zero mph"], "correct_choice_idx": 3, "direct_answers": ["fast", "1mph", "slow", "zero", "zero mph", "zero mph", "zero", "slow", "slow", "zero mph"], "difficult_direct_answer": false, "rationales": ["The dog can't possibly be moving while standing unless this is a trick of some sort.", "The dog is not really driving.", "The dog is just standing on the model bike."], "image": "val2014/COCO_val2014_000000263146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31368, "question_id": "QPSDG8TdEdCH3mnXNYEJXq", "question": "Why are the men seated?", "choices": ["to eat", "play chess", "draw", "to work"], "correct_choice_idx": 0, "direct_answers": ["meat", "eating", "eating food", "to eat", "to dine", "eating", "eating", "to eat", "eating lunch", "eating"], "difficult_direct_answer": false, "rationales": ["The men are trying to eat.", "The men are seated because they are eating a meal.", "The men are here seated to eat."], "image": "val2014/COCO_val2014_000000031368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363504, "question_id": "QPhHXwdp6eBvsa8odHd9fX", "question": "Why is the boy wearing a glove?", "choices": ["warmth", "costume", "health", "to catch"], "correct_choice_idx": 3, "direct_answers": ["catch", "play catch", "playing catch", "playing catch", "baseball", "playing catch", "baseball", "playing catch", "to catch", "catch ball"], "difficult_direct_answer": false, "rationales": ["The boy is catching.", "The glove the boy has on is used to catch baseballs while playing the game of baseball.", "The boy has a baseball mitt in his hands which is used to catch a baseball."], "image": "train2014/COCO_train2014_000000363504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204562, "question_id": "QPowFuLuLjZr6ndHmww63m", "question": "What is this item an ingredient in?", "choices": ["tacos", "cheeseburgers", "strawberry shortcake", "banana pudding"], "correct_choice_idx": 3, "direct_answers": ["banana bread", "banana bread", "banana bread", "banana pudding", "ice cream", "banana pudding", "banana bread", "potassium", "banana pudding", "banana bread"], "difficult_direct_answer": false, "rationales": ["Bananas are often used for pudding.", "The bananas are used for pudding.", "Yellow fruit is piled on top of each other on a plate."], "image": "train2014/COCO_train2014_000000204562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574028, "question_id": "QQ98u5r32Aagnyi5J4jyZr", "question": "What is the purpose of this setup?", "choices": ["physical enhancement", "sustenance", "physical pleasure", "entertainment"], "correct_choice_idx": 3, "direct_answers": ["organization", "organization", "watch videos", "home entertainment", "gaming", "entertainment", "entertainment", "gaming", "media storage", "entertainment center"], "difficult_direct_answer": false, "rationales": ["A television is on a stand with a computer keyboard.", "The setup includes a television, sound system, and video games.", "There is a tv, game console, and dvd player so this is likely for entertainment."], "image": "train2014/COCO_train2014_000000574028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521471, "question_id": "QQHpKtTAq5Sowc2NQhXk2g", "question": "What is the purpose of the tall building located behind the plane?", "choices": ["illumination", "traffic control", "passenger boarding", "passenger departures"], "correct_choice_idx": 1, "direct_answers": ["balance", "control", "control tower", "control planes", "control tower", "air traffic", "aeroplane", "light", "traffic control", "traffic control"], "difficult_direct_answer": false, "rationales": ["It's a watch tower to see the planes and direct them.", "The building guides traffic.", "The purpose is traffic control."], "image": "train2014/COCO_train2014_000000521471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233970, "question_id": "QQJ5q3Z67DwQRZxpEa4GAA", "question": "The animal is resting on what?", "choices": ["owner's head", "chair", "tub", "blanket"], "correct_choice_idx": 3, "direct_answers": ["bed", "bed", "bed", "bed", "bed", "bed", "bed", "blanket", "bed", "bed"], "difficult_direct_answer": false, "rationales": ["The animal is on a blanket.", "A blanket is a covering used for beds which is where the cat is resting.", "The comforter is laid down for the cat to rest on."], "image": "train2014/COCO_train2014_000000233970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57617, "question_id": "QQzebDsp8vzB4YArDjrYzS", "question": "What will happen to the car on the back of the tow truck?", "choices": ["trashed", "driven away", "buffed out", "repaired"], "correct_choice_idx": 0, "direct_answers": ["get trashed", "junkyard", "junkyard", "junkyard", "get totaled", "be junked", "junkyard", "trashed", "accident", "disposed of"], "difficult_direct_answer": false, "rationales": ["The car is totalled. there is no way it can be fixed or driven again.", "It is very damaged and will be taken to a junk yard", "The car is totaled."], "image": "train2014/COCO_train2014_000000057617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210728, "question_id": "QRJ9onfnSmXR6X5SV693XG", "question": "What period of the day is shown in the picture?", "choices": ["morning", "evening", "afternoon", "night"], "correct_choice_idx": 0, "direct_answers": ["late morning", "day", "morning end", "midday", "afternoon", "late morning", "noon", "morning", "noon", "morning"], "difficult_direct_answer": false, "rationales": ["The scene looks very bright but the sun is still low in the sky like it is morning.", "The period is morning.", "The sun is out."], "image": "train2014/COCO_train2014_000000210728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383213, "question_id": "QRNJfKjaLmPbL8m38AMT8C", "question": "What would this store likely sell?", "choices": ["tires", "pokemon cards", "gasoline", "paprika"], "correct_choice_idx": 3, "direct_answers": ["spice", "food", "herbs", "paprika", "spices", "pizza", "herbs", "food", "kitchen supplies", "cooking spices"], "difficult_direct_answer": false, "rationales": ["This store sells cooking spices in general.", "The store sells spices.", "This is a spice"], "image": "train2014/COCO_train2014_000000383213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240093, "question_id": "QRcBGw5c8XejVHFbZ9qbr8", "question": "Sanjay D. Ghodawat is owner of which airline?", "choices": ["star", "paradise", "jet", "klm"], "correct_choice_idx": 0, "direct_answers": ["star", "unknown", "star air", "star air", "star air", "star", "star air", "star", "star", "star air"], "difficult_direct_answer": false, "rationales": ["Sanjay d. ghodawat owns star airlines.", "The owner is the star.", "The owner of star airline is this guy."], "image": "train2014/COCO_train2014_000000240093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306363, "question_id": "QRq9fnakvHbRjoAra87PvA", "question": "Why are the sheep difference colors?", "choices": ["breed", "dirty", "gender", "spray-painted"], "correct_choice_idx": 0, "direct_answers": ["genetics", "genetics", "different kinds", "different breeds", "variation", "breed", "different breeds", "different breeds", "fur", "different parents"], "difficult_direct_answer": false, "rationales": ["They are different types of the same animal.", "The sheep's breed is different.", "The sheep's wool looks naturally and intentionally the colors that they are but sheep do not have different wool colors based on gender. the only viable option after eliminating the others is answer a."], "image": "val2014/COCO_val2014_000000306363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208044, "question_id": "QRuQHJ3dHFbB3BLi89P8t3", "question": "What is being photographed?", "choices": ["mirror", "man", "watch", "tie"], "correct_choice_idx": 1, "direct_answers": ["boy tie", "man", "man dressing", "man", "man", "man", "man", "man", "man", "mirror"], "difficult_direct_answer": false, "rationales": ["The man is having his photograph taken in front of the mirror.", "The man is looking at the camera and posing.", "A woman is pointing a camera and a man is posing and in focus."], "image": "val2014/COCO_val2014_000000208044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512797, "question_id": "QRvCoZLbdHMHziBSyReCzt", "question": "What vehicles would you find here?", "choices": ["bikes", "trains", "boats", "cars"], "correct_choice_idx": 1, "direct_answers": ["trains", "trains", "trains", "buses", "trains", "buses", "trains", "trains", "trains", "trains"], "difficult_direct_answer": false, "rationales": ["The building has signs referring to platforms which usually refer to an area alongside the tracks where people wait to board.", "This is a railroad station as indicated by the signs", "The sign says platform. this is where this vehicle is parked."], "image": "train2014/COCO_train2014_000000512797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26635, "question_id": "QSHaoHuyTed4r43gKn3z5v", "question": "What time of day is this?", "choices": ["dawn", "afternoon", "midday", "dusk"], "correct_choice_idx": 2, "direct_answers": ["afternoon", "afternoon", "morning", "noon", "afternoon", "daytime", "daytime", "noon", "midday", "midday"], "difficult_direct_answer": false, "rationales": ["The sun is directly overhead, casting only small shadows", "It must be midday since the sun is out and shining.", "The sun is out and it's light out."], "image": "train2014/COCO_train2014_000000026635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252280, "question_id": "QSUKSQkhhL5WDYVHUAF8LC", "question": "Of vehicles seen here which are greenest in regards to emissions?", "choices": ["motorcycle", "bikes", "bus", "car"], "correct_choice_idx": 1, "direct_answers": ["bikes", "bikes", "bicycle", "bikes", "bikes", "bicycles", "bikes", "bicycles", "bicycles", "bikes"], "difficult_direct_answer": false, "rationales": ["They have no motors", "They only require human pedaling power.", "There is no fuel needed to run the bikes."], "image": "val2014/COCO_val2014_000000252280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43543, "question_id": "QSaRySh7StK74mFRj2fL5H", "question": "Why is the person wearing a heavy jacket?", "choices": ["as cosplay", "fashion", "for work", "cold weather"], "correct_choice_idx": 3, "direct_answers": ["stay warm", "cold", "winter", "cold weather", "it's cold", "cold weather", "cold", "cold", "it's cold", "warmth"], "difficult_direct_answer": false, "rationales": ["The person is snowboarding in a snow-covered area.", "The person is snowboarding, not working or cosplaying. the coat helps the person stay warm.", "The person is cold."], "image": "train2014/COCO_train2014_000000043543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332867, "question_id": "QSn5XKW5w3GySsFVwjbVZ7", "question": "What is distracting the woman from her computer?", "choices": ["television", "dogs", "cats", "cell phone"], "correct_choice_idx": 3, "direct_answers": ["phone", "cell phone", "phone", "text message", "phone", "phone", "phone", "phone", "phone", "cell phone"], "difficult_direct_answer": false, "rationales": ["The cell phone is distracting.", "She is looking at a smaller handheld screen", "Non-human animals are not present. there also is no television."], "image": "train2014/COCO_train2014_000000332867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163575, "question_id": "QTH96tHmZX3wySxpdNGnFy", "question": "What type of dog is the brown and white one?", "choices": ["saint bernard", "standard poodle", "great dane", "bull mastiff"], "correct_choice_idx": 0, "direct_answers": ["st bernard", "st bernard", "bernard", "saint bernard", "saint bernard", "saint bernard", "saint bernard", "left dog", "st bernard", "st bernard"], "difficult_direct_answer": false, "rationales": ["Saint bernards are really big.", "Saint bernard dogs are huge dogs.", "It is a very large dog with the markings of this breed"], "image": "val2014/COCO_val2014_000000163575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321215, "question_id": "QTK6LSoAPe89yan3T99y3T", "question": "What is the Man teaching the child?", "choices": ["steeple chasing", "freebasing", "scooter riding", "sky diving"], "correct_choice_idx": 2, "direct_answers": ["scooter", "scooter riding", "scooter", "scooter riding", "scooter riding", "riding", "ride scooter", "scooter riding", "ride scooter", "ride scooter"], "difficult_direct_answer": false, "rationales": ["He has his hands on the handlebars.", "The child is on a scooter. the man is holding the handlebars.", "The man is teaching the scooter."], "image": "train2014/COCO_train2014_000000321215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555910, "question_id": "QTmMcp35GzuimmKSxu4WhX", "question": "What is inside the bottle sitting to the right of the red tin box?", "choices": ["soy sauce", "ketchup", "balsamic vinegar", "oil"], "correct_choice_idx": 0, "direct_answers": ["soy sauce", "soy sauce", "soy sauce", "soy sauce", "water", "soy sauce", "soy sauce", "cleaning liquid", "unknown", "soy sauce"], "difficult_direct_answer": false, "rationales": ["The liquid is black, not red. the writing on the label is in an asian language.", "The bottle is a soy sauce bottle.", "The bottle has soy sauce."], "image": "train2014/COCO_train2014_000000555910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10149, "question_id": "QU8N5kJfvqvtrsvjwFWTad", "question": "The man is holding an item that is associated with which horror movie character?", "choices": ["freddy krueger", "leatherface", "michael myers", "candyman"], "correct_choice_idx": 2, "direct_answers": ["knife", "scream", "psycho", "chucky", "michael myers", "scream", "michael meyers", "knife", "michael myers", "chucky"], "difficult_direct_answer": false, "rationales": ["The man is holding a knife. michael myers is a horror movie character that appeared many times with a knife.", "The man is holding a knife based on the visible blade and the size and shape of the item. this is commonly known to be the weapon of choice for answer a in horror movies.", "The knife could be associated with a horror movie."], "image": "val2014/COCO_val2014_000000010149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280999, "question_id": "QUBgcySE3rW4VTZSct39Eb", "question": "What color is the shirt underneath of the old man's sweater?", "choices": ["black", "white", "green", "red"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["That is the color of the shirt.", "The color is white.", "It is a monochromatic photo and the shirt is bright against the rest of the image."], "image": "train2014/COCO_train2014_000000280999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8285, "question_id": "QUMm7qNB9YcWAYs85sL4pQ", "question": "Who is in danger of falling?", "choices": ["prop", "father", "ladder", "girl"], "correct_choice_idx": 3, "direct_answers": ["child", "child", "little girl", "rain", "little girl", "girl", "girl", "small girl", "girl", "child"], "difficult_direct_answer": false, "rationales": ["The girl is about to fall.", "From what we can tell from this perspective we would assume that almost all the people are firmly on the ground where there is littler danger of falling for the average person walking. there is a young girl raised above everyone else who may be subject to more risk.", "All people except answer a appear to be on their feet on the ground based on their relative heights. the girl is clearly being raised up in some manner which could result in her falling."], "image": "train2014/COCO_train2014_000000008285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495388, "question_id": "QUagWThSpxBhEs9W65Lg4K", "question": "What main dish is served here?", "choices": ["meat muffins", "chili dog", "fried fish", "meat loaf"], "correct_choice_idx": 1, "direct_answers": ["hot dog", "hot dog", "hotdogs", "chili dog", "hotdogs", "hotdogs", "hot dog", "hotdogs", "hot dog", "hot dog"], "difficult_direct_answer": false, "rationales": ["There is a meat sauce on top of the hot dog.", "The food is clearly visible and based on the visible toppings, answer a is consistent.", "The dish is a chili dog."], "image": "val2014/COCO_val2014_000000495388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289238, "question_id": "QVMkWGSLk9ym2AXrdyqSJo", "question": "What is the man wearing?", "choices": ["jacket", "tie", "suspenders", "hat"], "correct_choice_idx": 1, "direct_answers": ["necktie", "tie", "tie", "necktie", "tie", "necktie", "necktie", "shirt tie", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["The man has a tie around his neck and it hangs down the front of his shirt.", "It is a fabric accessory worn around the neck.", "The man has a necktie hanging from his neck."], "image": "train2014/COCO_train2014_000000289238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300260, "question_id": "QWBpUk7MimYXgTJijoCZHB", "question": "What are these planes emitting?", "choices": ["balloons", "foam", "pesticides", "contrails"], "correct_choice_idx": 3, "direct_answers": ["condensation trails", "contrails", "smoke", "smog", "smoke", "fumes", "pollution", "smoke", "smoke", "exhaust"], "difficult_direct_answer": false, "rationales": ["The planes have contrails.", "These planes are emitting contrails behind their engines.", "The smoke coming out is called contrails."], "image": "train2014/COCO_train2014_000000300260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555282, "question_id": "QWSxi4BWEUH7ffKnnf74ks", "question": "Why is he holding the bat?", "choices": ["clean grounds", "frighten others", "hit ball", "keeping it"], "correct_choice_idx": 2, "direct_answers": ["batting ball", "hit ball", "batter", "to swing", "hit ball", "batting", "swinging", "hit ball", "hit ball", "hitting"], "difficult_direct_answer": false, "rationales": ["He is hitting the ball.", "They are going to hit the ball.", "Typically a batter holds the bat before they hit the baseball."], "image": "train2014/COCO_train2014_000000555282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480594, "question_id": "QWTDySyvJyZkhxqxf7JBos", "question": "What type of room is this?", "choices": ["school", "hotel", "court", "hospital"], "correct_choice_idx": 1, "direct_answers": ["hotel", "hotel", "hotel", "bedroom", "bedroom", "hotel", "bedroom", "bedroom", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["The room is a hotel.", "The room is very nice and there is an employee with a badge", "This is a room people pay to stay the night in when they are traveling."], "image": "train2014/COCO_train2014_000000480594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283269, "question_id": "QWhJwwnebZqcnJnECZr6wZ", "question": "Who is the older woman to the young boy in red?", "choices": ["grandmother", "cousin", "sister", "neighbor"], "correct_choice_idx": 0, "direct_answers": ["grandmother", "grandma", "grandmother", "grandmother", "grandma", "grandmother", "grandmother", "aunt", "grandmother", "grandma"], "difficult_direct_answer": false, "rationales": ["The two people look somewhat alike, as if they are related. they are the right ages in relation to each other to be grandparent and grandchild.", "The woman looks too old to be mother.", "The woman is the grandma."], "image": "train2014/COCO_train2014_000000283269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459184, "question_id": "QWhyymczLygmnYgbmGxw39", "question": "What is the man on the phone sitting behind?", "choices": ["plastic", "glass", "foil", "paper"], "correct_choice_idx": 1, "direct_answers": ["sofa", "glass window", "window", "window", "window", "glass window", "glass", "glass", "glass", "window"], "difficult_direct_answer": false, "rationales": ["He is behind a window.", "In front of the man is a reflection clearly depicting the stores and cars on the opposite side of the street, which indicates that the man is currently sitting behind a pane of glass.", "There is a reflection showing"], "image": "train2014/COCO_train2014_000000459184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219445, "question_id": "QX5cFKfcxK6DXgc95tzooi", "question": "What kind of rider is she?", "choices": ["novice", "intermediate", "professional", "seasoned"], "correct_choice_idx": 0, "direct_answers": ["child", "horse rider", "horse", "novice", "beginner", "western", "young beginner", "novice", "equestrian", "equestrian"], "difficult_direct_answer": false, "rationales": ["She is a young novice rider.", "She is a young child.", "The rider is a novice."], "image": "train2014/COCO_train2014_000000219445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485044, "question_id": "QXHGSU7qTScowNf2GuiYYq", "question": "What is the cat doing?", "choices": ["getting sad", "sleeping", "snoozing", "getting angry"], "correct_choice_idx": 1, "direct_answers": ["sleeping", "resting", "sleeping", "snoozing", "resting", "snoozing", "sleeping", "laying down", "napping", "sleeping"], "difficult_direct_answer": false, "rationales": ["The cat's eyes are closed.", "A black cat is sitting and has its eyes closed. he looks very tired.", "The cat's eyes are closed and it's laying down."], "image": "train2014/COCO_train2014_000000485044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111568, "question_id": "QY5dLxPmPmuPkucLtLJh35", "question": "Which franchise is advertised here?", "choices": ["james bond", "sherlock holmes", "x men", "star wars"], "correct_choice_idx": 0, "direct_answers": ["james bond", "james bond", "james bond", "james bond", "james bond", "james bond", "james bond", "james bond", "james bond", "james bond"], "difficult_direct_answer": false, "rationales": ["The franchise is bond.", "The advertisement is clearly visible and the actors and characters are commonly known.", "The train features an advertisement for 007 on the side."], "image": "train2014/COCO_train2014_000000111568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439271, "question_id": "QYEAQSQc3yey4Ub7xXiJdu", "question": "What sea creature did the woman in black's necklace come from?", "choices": ["white shark", "salmon", "dolphin", "oyster"], "correct_choice_idx": 3, "direct_answers": ["oyster", "oyster", "oyster", "clam", "oyster", "clam", "oyster", "oyster", "clam", "oyster"], "difficult_direct_answer": false, "rationales": ["She is wearing a pearl necklace that comes from oysters and are popular to wear.", "She is wearing a pearl necklace.", "The necklace has pearls on it. they come from oysters."], "image": "train2014/COCO_train2014_000000439271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263428, "question_id": "QYGdKWrXmNd8hBeHeiuXMy", "question": "Why are the people wearing green rubber boots?", "choices": ["dress code", "protection", "visibility", "fashion"], "correct_choice_idx": 1, "direct_answers": ["filthy floor", "goats", "stay clean", "protection", "foot protection", "protection", "shoe protection", "protect shoes", "keep clean", "protection"], "difficult_direct_answer": false, "rationales": ["The rubber boots prevent the mud and poop all around the farm from getting on your feet and pants.", "The rubber boots provide foot protection.", "The people are wearing green rubber boots to keep their feet clean and dry in the animal pen."], "image": "val2014/COCO_val2014_000000263428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523196, "question_id": "QYMDrRCZQj3PyvAMxAW6n3", "question": "Which sign will be easier for someone way down the street to spot?", "choices": ["frame", "rectangle", "octagon", "flyers"], "correct_choice_idx": 2, "direct_answers": ["lion sign", "hanging shingle", "top one", "lion", "stop sign", "illuminated one", "lion", "lion", "lion", "octagon"], "difficult_direct_answer": false, "rationales": ["The sign easier to see would be the sign for \"the lion\" that is shaped like a stop sign with eight sides.", "The most prominent sign is the one that looks like a stop sign.", "A large red sign is on a building."], "image": "train2014/COCO_train2014_000000523196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454236, "question_id": "QYU7bQpcZUKdiQ6rQ3CS9A", "question": "Where will this bus drop passengers off?", "choices": ["factory", "beach", "school", "prison"], "correct_choice_idx": 2, "direct_answers": ["school", "school", "school", "resort", "school", "school", "school", "school", "bus stop", "school"], "difficult_direct_answer": false, "rationales": ["Typically, this color bus applies to a.", "The bus is orange and is a school bus.", "The bus is yellow and has school in a different language."], "image": "train2014/COCO_train2014_000000454236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31604, "question_id": "QYsbd5D7ox7NzRGYPA7xjY", "question": "What is the term for the structure in the middle of the street?", "choices": ["meridian", "grass hut", "toll booth", "gate"], "correct_choice_idx": 0, "direct_answers": ["island", "barrier", "divider", "berm", "median", "only", "median", "lamp post", "meridian", "lamppost"], "difficult_direct_answer": true, "rationales": ["The grass area in the middle is called a meridian.", "There is no fence, so it is not a gate. there is no place to pay a toll.", "A section in the middle of a highway separated by cement curb has trees and flowers growing in it."], "image": "train2014/COCO_train2014_000000031604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213725, "question_id": "QZ85iPM7MngaupE4QZ9pXY", "question": "What location is displaying items?", "choices": ["bank", "restroom", "museum", "car garage"], "correct_choice_idx": 2, "direct_answers": ["store", "museum", "podiums", "storefront", "museum", "museum", "museum", "museum", "store window", "museum"], "difficult_direct_answer": false, "rationales": ["Answer a is a location that would commonly display options on podiums inside glass cases and none of the other answers would.", "There are a great variety of things being displayed. they are arranged on shelves.", "These look like historic artifacts on pedestals for display."], "image": "train2014/COCO_train2014_000000213725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414552, "question_id": "QZE6qnkVML6JfBgoSixkzo", "question": "What sport are they enjoying watching?", "choices": ["golf", "gymnastics", "baseball", "hockey"], "correct_choice_idx": 3, "direct_answers": ["hockey", "hockey", "hockey", "ice hockey", "hockey", "hockey", "hockey", "hockey", "hockey", "ice hockey"], "difficult_direct_answer": false, "rationales": ["The people in the living room are watching an ice rink sport that players shoot pucks into goals for points.", "As indicated by the ice that they're playing on.", "You can see the ice on the tv which means it's hockey"], "image": "train2014/COCO_train2014_000000414552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449847, "question_id": "QZJYDHpsqk77ZEC5AYRpuX", "question": "For What reason does the person on the motorcycle have their right leg on the street?", "choices": ["balance", "tapping angrily", "kicking", "stop bike"], "correct_choice_idx": 0, "direct_answers": ["stopped", "balance", "balance", "maintain balance", "balance", "he's stopped", "to brake", "balance", "stability", "balance"], "difficult_direct_answer": false, "rationales": ["They are stopped and have their leg down so the bike stays up.", "The biker doesn't want his bike to fall.", "The person on the motorcycle has their right leg on the street to balance themselves while they are waiting at the red light."], "image": "train2014/COCO_train2014_000000449847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402820, "question_id": "QZV7p6VCzpoQjwwAJCqWwB", "question": "What are the white flakes on the donuts on the top shelf?", "choices": ["coconut", "granola", "brown sugar", "sugar"], "correct_choice_idx": 0, "direct_answers": ["sugar", "coconut", "coconut", "coconut", "coconut", "sugar raised", "coconut", "coconut", "sugar", "coconut"], "difficult_direct_answer": false, "rationales": ["There are large shredded pieces of white hanging off top of a donut.", "That ingredient is coconut.", "The topping is flakes not granules"], "image": "train2014/COCO_train2014_000000402820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328283, "question_id": "QZamBKhbLQHjoAQcCPMfyd", "question": "What are the white bark trees called?", "choices": ["birch", "willow", "palm", "pine"], "correct_choice_idx": 0, "direct_answers": ["aspens", "birch", "ash", "birch", "sycamore trees", "birch", "river birch", "ash", "birch", "scatting"], "difficult_direct_answer": false, "rationales": ["The trees are birch trees.", "Birch comes in this color.", "The bark is birch."], "image": "val2014/COCO_val2014_000000328283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432981, "question_id": "QZnt79CZHB3ebn6ksiHsJo", "question": "What does one of the people and the cameraman who took this picture have in common?", "choices": ["hand visible", "taking picture", "overweight", "shadow visible"], "correct_choice_idx": 1, "direct_answers": ["brown hair", "cameras", "jeans", "love skateboarding", "have cameras", "love skateboarding", "wearing pants", "both men", "taking picture", "dark hair"], "difficult_direct_answer": true, "rationales": ["They are both photographers", "The person in the background is using a camera for its intended purpose. the cameraman is doing the same thing.", "The person on the curb is holding a camera to their face."], "image": "train2014/COCO_train2014_000000432981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52790, "question_id": "QaEuRmFuiQqgsyWUN7VG83", "question": "If you stuck your hand out the side what would happen?", "choices": ["get electrocuted", "touch people", "touch cats", "touch plants"], "correct_choice_idx": 3, "direct_answers": ["lose hand", "injury", "touch plants", "touch leaves", "broken", "get hit", "beautiful landscape", "touch leaves", "hit tree", "hit tree"], "difficult_direct_answer": false, "rationales": ["There are no cats, people, or sources of electricity outside the train. there are green items.", "Plants line the roadway where a train is moving.", "There is greenery lining the sides of the tracks and that is what would touch your fingers."], "image": "val2014/COCO_val2014_000000052790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259717, "question_id": "QaiDdcrCUwayTbZFRXGyGf", "question": "Why are they cooking on a boat?", "choices": ["no room", "tastes better", "it's home", "feed fish"], "correct_choice_idx": 2, "direct_answers": ["market", "their store", "custom", "make food", "it's home", "to sell", "for tourists", "cultural tradition", "their job", "prepare food"], "difficult_direct_answer": true, "rationales": ["The people have all their things on the boat. they probably live there.", "The women live on the boat.", "They live on the boat"], "image": "train2014/COCO_train2014_000000259717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569747, "question_id": "QbAprAJkNg5QRgmhAriBcD", "question": "What damage has been done in this street?", "choices": ["cracked ground", "illegal construction", "graffiti", "arson"], "correct_choice_idx": 2, "direct_answers": ["name board", "graffiti", "graffiti", "graffiti", "graffiti", "strike", "strike", "graffiti drawings", "grafitti", "graffiti"], "difficult_direct_answer": false, "rationales": ["There is a lot of graffiti work up and down the street.", "There is spray paint on the signs.", "People have painted on signs"], "image": "train2014/COCO_train2014_000000569747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 913, "question_id": "QbGEhaKbYmHDuduPfod2Pd", "question": "Why is the plane parked here?", "choices": ["cleaning", "on display", "maintenance", "for sale"], "correct_choice_idx": 1, "direct_answers": ["no pilot", "on display", "air show", "showing off", "exhibit", "take off", "to fly", "airport", "to fuel", "landed"], "difficult_direct_answer": true, "rationales": ["A plane is parked among others. people are walking around several planes taking pictures.", "The cones suggest that it is not for display but grounded for maintenance.", "This plane is being shown to the public and on display for them to see."], "image": "train2014/COCO_train2014_000000000913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409338, "question_id": "QbZaFsSWo6CzHFXdXJfnAM", "question": "What is the freestanding structure in the middle of the room called?", "choices": ["freezer", "island", "fridge", "oven"], "correct_choice_idx": 1, "direct_answers": ["island", "counter", "counter", "island", "island", "island", "bar", "island", "island", "kitchen island"], "difficult_direct_answer": false, "rationales": ["The freestanding item in the middle of the kitchen is called an island because it is not connected to anything else.", "It's an object that's surrounded by floor and not connected to another structure of the building, similar to how a piece of land is completely surrounded by water.", "Whenever there is a structure alone in the middle of a kitchen, it's references as a kitchen island."], "image": "train2014/COCO_train2014_000000409338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316648, "question_id": "QbgjeHJgfSkMFReH6t85Fc", "question": "The neon signs on the street are located in which city in Asia?", "choices": ["beijing", "hong kong", "tokyo", "taipei"], "correct_choice_idx": 1, "direct_answers": ["beijing", "tokyo", "korea", "hong king", "china", "hong kong", "hong kong", "hong kong", "hong kong", "give way"], "difficult_direct_answer": false, "rationales": ["The signs are in hong kong.", "The neon street signs are reminiscent of hong kong since that's the asian city of lights.", "Some of the lettering is in chinese."], "image": "val2014/COCO_val2014_000000316648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337298, "question_id": "QbnNeTDMcCcySCoom7H5kP", "question": "What type of medium is the woman using to communicate?", "choices": ["diary", "book", "phone", "kindle"], "correct_choice_idx": 2, "direct_answers": ["phone", "electronic", "cell phone", "phone", "smartphone", "cell phone", "phone", "cell phone", "cellphone", "text message"], "difficult_direct_answer": false, "rationales": ["The woman is on a phone and using it to communicate.", "She has a cell phone in her hand.", "The woman is texting."], "image": "train2014/COCO_train2014_000000337298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275997, "question_id": "QcAsTjwYDCzQsTVj7gUv6s", "question": "What is the plastic bag on the door handle being used to collect?", "choices": ["baseball cards", "food", "laundry", "garbage"], "correct_choice_idx": 3, "direct_answers": ["trash", "recycling", "trash can", "garbage", "trash", "garbage", "trash", "trash", "trash", "soda cans"], "difficult_direct_answer": false, "rationales": ["They don't have a bin in the room", "The bag is used to collect trash from the room.", "They are using this as a thing to put trash in."], "image": "train2014/COCO_train2014_000000275997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573874, "question_id": "QcEt36ZkLyxQudD8b3yHat", "question": "The multi color umbrella used for?", "choices": ["uv protection", "rain", "celebration", "children"], "correct_choice_idx": 2, "direct_answers": ["sun protection", "prevent sun rays", "celebration", "sun protection", "attention", "sun protection", "shade", "celebration shade", "provide shade", "sun protection"], "difficult_direct_answer": false, "rationales": ["It is not raining, and the umbrella is festively colored to match the person's hat. the matching display of colors makes this the most likely answer.", "The multi-color umbrella is used to celebrate.", "The people are at a parade or a festival."], "image": "train2014/COCO_train2014_000000573874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412443, "question_id": "QcFa5242vnCAHpJA7HSjk7", "question": "Which building is reddest here?", "choices": ["barn", "post office", "house", "mill"], "correct_choice_idx": 0, "direct_answers": ["barn", "barn", "barn", "closest", "barn", "barn", "barn", "tall barn", "barn", "big building"], "difficult_direct_answer": false, "rationales": ["The barn building is the most red here.", "The red building is for animals.", "It is the building with very large doors"], "image": "val2014/COCO_val2014_000000412443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111972, "question_id": "QcGNng2cD5WpN2Df3aYuMf", "question": "What is the woman in the white shirt doing?", "choices": ["dancing", "stretching", "serving", "yelling"], "correct_choice_idx": 2, "direct_answers": ["serving", "serving", "serving", "spiking tennisball", "serving", "serving", "serving", "serving", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["She threw the ball up in the air so she can hit it over to her opponet.", "She's serving.", "The woman is serving the tennis ball."], "image": "train2014/COCO_train2014_000000111972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267205, "question_id": "QcGWA9jTo54NHLJeEWZYV8", "question": "How are persons here able to read at night?", "choices": ["kerosene", "gas lanterns", "electric light", "candles"], "correct_choice_idx": 2, "direct_answers": ["electric light", "lights", "lightbulbs", "light", "two", "candle light", "candles", "candle", "flashlights", "light"], "difficult_direct_answer": true, "rationales": ["There is a large pole on the left side. it has a light on it that makes it easy to see things at nice in the area.", "The people can use electricity from the poles.", "There is an electric pole on the side of the sidewalk."], "image": "train2014/COCO_train2014_000000267205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286042, "question_id": "Qd6HLHCYhKNC6CQzjg3K3L", "question": "Who caused the water to flood out?", "choices": ["mayor", "fireman", "vandal", "these men"], "correct_choice_idx": 3, "direct_answers": ["worker", "construction worker", "fire hydrant", "workmen", "man", "worker", "these men", "open hydrant", "workers", "tap"], "difficult_direct_answer": true, "rationales": ["The men turned on the hydrant.", "It's unknow, but the yellow pants and boots worn by the person on the left are usually worn by firemen. that said, anyone might have caused the hydrant to flood water.", "Because they opened the fire hydrant hence releasing water."], "image": "train2014/COCO_train2014_000000286042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422850, "question_id": "Qd73ETWEi2EhuHCCx4XkuA", "question": "What is the horse rider doing?", "choices": ["standing", "commanding", "jumping", "sitting"], "correct_choice_idx": 2, "direct_answers": ["traveling", "galloping", "riding beach", "riding", "jumping", "riding", "riding", "sitting up", "riding", "galloping"], "difficult_direct_answer": false, "rationales": ["The horse is jumping.", "The horse rider wants to run and jump since the horse's legs are off the ground.", "The rider is sitting on the house as it runs. 2"], "image": "train2014/COCO_train2014_000000422850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509750, "question_id": "QdDcEhymEqkNYidjwewT2X", "question": "The second animal looks like it is doing what?", "choices": ["dancing", "jumping", "sleeping", "sniffing"], "correct_choice_idx": 3, "direct_answers": ["sniffing butt", "sniffing butt", "sniffing", "sniffing animal", "sniffing butts", "sniffing but", "smelling", "sniffing", "smelling", "ass sniffing"], "difficult_direct_answer": false, "rationales": ["The animal is sniffing the other's buttocks.", "The second animal is awake. it is not moving.", "Many animals are known to identify each other through sniffing. this animal has its nose pressed up against a second animal which is consistent with a sniffing action."], "image": "val2014/COCO_val2014_000000509750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283382, "question_id": "QdGnZpgLcf4P7VZbushQtx", "question": "Where is this kitchen located?", "choices": ["restaurant", "school", "home", "hospital"], "correct_choice_idx": 2, "direct_answers": ["house", "house", "home", "second floor", "modern home", "personal residence", "inside house", "home", "apartment", "wood floor"], "difficult_direct_answer": false, "rationales": ["This kitchen is located in the home.", "The kitchen is at home.", "This kitchen contains family size appliances, limited seating and an island countertop. all commonly found in someone's home."], "image": "train2014/COCO_train2014_000000283382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412339, "question_id": "QdebG7dAkQfFHM5gYEMkjt", "question": "Why is the woman holding a remote in a batter's stance?", "choices": ["fight someone", "playing game", "being funny", "showing off"], "correct_choice_idx": 1, "direct_answers": ["wii game", "playing game", "tv remote", "playing wii", "playing game", "wii sports", "fight", "wii", "wii baseball", "playing game"], "difficult_direct_answer": false, "rationales": ["The woman is holding a controller for the nintendo console.", "The woman is going to swing the baton a video game.", "The remotes are used for a console game."], "image": "val2014/COCO_val2014_000000412339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251801, "question_id": "Qdpx249PgzWJdufx7McVCP", "question": "Where do coffee beans come from?", "choices": ["australia", "south america", "africa/asia", "north america"], "correct_choice_idx": 2, "direct_answers": ["plant", "grown", "africa/asia", "south america", "coffee plant", "rainforest", "south america", "coffee trees", "coffee plant", "south america"], "difficult_direct_answer": false, "rationales": ["Beans are from africa.", "Most coffee beans come from africa or asia. sometimes people combine those names for unknown reasons.", "Coffee beans are from africa and asia."], "image": "val2014/COCO_val2014_000000251801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496401, "question_id": "QeAtXTukmjsJS92KfnpWBZ", "question": "What has been done to the red pole?", "choices": ["drawing", "none", "special design", "graffiti"], "correct_choice_idx": 3, "direct_answers": ["graffiti", "graffiti", "graffiti", "vandalized", "vandalized", "graffiti", "graffiti", "graffiti", "vandalized", "graffiti"], "difficult_direct_answer": false, "rationales": ["There's a drawing of a heart on the pole.", "The red pole has drawings on it.", "It's arguably also b, but a is the name used when b is done on public structures without permission."], "image": "train2014/COCO_train2014_000000496401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574166, "question_id": "QeBp9tFp8GhXxYmUcaYpiv", "question": "What electronic device is embedded within the bathroom mirror in this bathroom?", "choices": ["phone", "television", "heater", "light"], "correct_choice_idx": 1, "direct_answers": ["television", "vent", "television", "ipad", "monitor", "television", "tv", "tv", "screen", "lamp"], "difficult_direct_answer": false, "rationales": ["There is a tv.", "There is a small, rectangular horizontal screen at the bottom which resembles a video device for providing entertainment for someone who wants to bathe.", "There is a small screen in the mirror."], "image": "val2014/COCO_val2014_000000574166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349442, "question_id": "QehpQgj3eUUSdEeVGqe7K7", "question": "What state is this air patrol plane registered in?", "choices": ["florida", "arizona", "arkansas", "alaska"], "correct_choice_idx": 3, "direct_answers": ["alaska", "alaska", "alaska", "alaska", "alaska", "alaska", "alaska", "alaska", "alaska", "alaska"], "difficult_direct_answer": false, "rationales": ["The state is alaska.", "The plane has \"alaska\" written on the tail.", "The tail of the airplane says \"alaska\" on it."], "image": "train2014/COCO_train2014_000000349442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394149, "question_id": "Qg9US52tkvMdFNXCsR5Djs", "question": "Which coat is worn more outwardly?", "choices": ["house", "vest", "suit", "over"], "correct_choice_idx": 3, "direct_answers": ["overcoat", "jacket", "winter coat", "trench", "over", "trench coat", "overcoat", "grey jacket", "topcoat", "jacket"], "difficult_direct_answer": false, "rationales": ["The coat is over.", "The outer coat is worn on top of the other coat.", "The man is not wearing a vest or house coat. the suit is under the more outwardly coat."], "image": "train2014/COCO_train2014_000000394149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150931, "question_id": "QgB6oSdUFoosyjKcuvNE4r", "question": "What is the person on the right doing?", "choices": ["canoeing", "swimming", "paddling", "lying down"], "correct_choice_idx": 3, "direct_answers": ["surfing", "surfing", "getting offshore", "lying down", "surfing", "swiming", "floating", "surfing preparation", "surfing", "boogie boarding"], "difficult_direct_answer": false, "rationales": ["A man is watching another man surf. he is chest down and not upright on the surfboard.", "The person on the right is on his belly on a surfboard.", "The person is trying to get up from the board."], "image": "train2014/COCO_train2014_000000150931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185587, "question_id": "QgD3RDeFEYyvBVmWBRnXcV", "question": "What birthday is someone celebrating?", "choices": ["42nd", "30th", "15th", "28th"], "correct_choice_idx": 1, "direct_answers": ["30th birthday", "30th", "thirty", "thirty", "thirty", "thirty", "thirtieth", "30th", "thirtieth", "thirty"], "difficult_direct_answer": false, "rationales": ["The number thirty can be seen on one of the balloons.", "These people are celebrating someone's 30th birthday.", "There is a balloon with a number visibly written. a balloon of this type and in this setting with a number on it indicates an age someone is turning on their birthday."], "image": "train2014/COCO_train2014_000000185587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73445, "question_id": "QgDKocq7kptYg5hCpZJhnD", "question": "How is the method of locomotion here powered?", "choices": ["gasoline", "hay", "oil", "coal"], "correct_choice_idx": 1, "direct_answers": ["hay", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The locomotion in this case is the horse. horses eat hay which provides them the calories to move and could be said to be their fuel and power.", "The horse eats hay.", "The method is hay."], "image": "val2014/COCO_val2014_000000073445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137778, "question_id": "QgMM4LNmwyXfXK72bgcxk8", "question": "Where do you think this is located?", "choices": ["countryside", "farm", "school", "city"], "correct_choice_idx": 3, "direct_answers": ["boston", "city", "city", "downtown", "downtown", "america", "usa", "city street", "city", "city"], "difficult_direct_answer": false, "rationales": ["The area is located in an urban area.", "There are large buildings and people all around.", "It looks like the business area of a city."], "image": "train2014/COCO_train2014_000000137778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68194, "question_id": "QgkYyZXRXcNi7MHJ5NPjkh", "question": "What famous secret agent franchise is advertised on this train?", "choices": ["red sparrow", "austin powers", "james bond", "mission impossible"], "correct_choice_idx": 2, "direct_answers": ["james bond", "james bond", "james bond", "james bond", "james bond", "james bond", "skyfall", "skyfall", "james bond", "james bond"], "difficult_direct_answer": false, "rationales": ["Skyfall is from james bond.", "Skyfall is the newest film in the 007 series.", "James bond has a movie called skyfall."], "image": "train2014/COCO_train2014_000000068194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112394, "question_id": "QgqsdbMYFYzgNqvX9M75E2", "question": "How is the tennis player feeling?", "choices": ["tired", "angry", "sad", "happy"], "correct_choice_idx": 3, "direct_answers": ["feeling happy", "good", "happy", "tense", "exhilarated", "happy", "happy", "excited", "happy", "strang"], "difficult_direct_answer": false, "rationales": ["The person is happy.", "The tennis player is smiling.", "The player is smiling."], "image": "val2014/COCO_val2014_000000112394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495311, "question_id": "QhdyZt6t8eWE3dRcwxf22P", "question": "What is northstar air express responsible for?", "choices": ["refueling", "cleaning", "luggage", "maintenance"], "correct_choice_idx": 0, "direct_answers": ["flying", "fueling", "to carry", "fuel", "filling plane", "refueling", "flight school", "fuel", "air services", "fuel"], "difficult_direct_answer": false, "rationales": ["There is a fuel truck there to put more fuel in the plane.", "This is obvious based on the shape of the tanker and the hose.", "The vehicle with northstar air express written on it is a tanker truck with a hose visibly running out of the side. planes need fuel and tankers carry fuel and if the hose is extended between the two there is likely refueling intentions."], "image": "val2014/COCO_val2014_000000495311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543347, "question_id": "Qhe2FvmMeYK2hZcneMYJPW", "question": "What is the white article in front of the man's shirt?", "choices": ["bandana", "skirt", "kilt", "apron"], "correct_choice_idx": 3, "direct_answers": ["apron", "bib", "apron", "ball", "apron", "apron", "apron", "apron", "bib", "apron"], "difficult_direct_answer": false, "rationales": ["The man has a white garment strung from his neck.", "The object in question is secured around the mans neck and has the straps to wrap around his body in a manner consistent with answer a.", "A bandana would be worn on the head. kilts or skirts would be worn below the shirt."], "image": "val2014/COCO_val2014_000000543347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151877, "question_id": "Qi3CTeLZ3qNYSMDX7jfQ3b", "question": "What is in danger of being struck?", "choices": ["human", "bike", "car", "pole"], "correct_choice_idx": 1, "direct_answers": ["bike", "bicycle", "bike", "bike", "bicycle", "bicycle", "truck", "bicycle", "bicycle", "bike"], "difficult_direct_answer": false, "rationales": ["The bike is in front of the truck.", "It is located directly in front of a vehicle in the street.", "The bike is in front of the truck"], "image": "val2014/COCO_val2014_000000151877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325545, "question_id": "QiNFnqS5gvgEjs5DGCjNvV", "question": "What are the three bulbs on the left side of the cutting board?", "choices": ["radish", "garlic", "brussel sprout", "potato"], "correct_choice_idx": 1, "direct_answers": ["garlic", "garlic", "cloves", "garlic", "garlic", "garlic", "garlic", "garlic", "cloves", "garlic"], "difficult_direct_answer": false, "rationales": ["The the small objects on the cutting board are garlic.", "The bulbs are garlic.", "Garlic is on the cutting board."], "image": "train2014/COCO_train2014_000000325545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504620, "question_id": "QiXkdYxYvy5hZPA8ohQEAt", "question": "What separates the tennis court from the building in the background?", "choices": ["chain-link fence", "gate", "racquet", "border control"], "correct_choice_idx": 0, "direct_answers": ["fence", "fence", "face", "fence", "net", "fence", "fence", "chain-link fence", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["The fence is for safety from vandalism and to keep the balls from flying and hitting the building.", "The fence has interlocking metal like that in a chain link fence.", "The fence separates the court."], "image": "train2014/COCO_train2014_000000504620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270784, "question_id": "QibM6Z5xHuVYrwaroiZ59r", "question": "What does the No stand for?", "choices": ["number", "none", "north", "nocturnal"], "correct_choice_idx": 0, "direct_answers": ["number", "number", "number", "number", "number", "number", "number", "number", "number", "number"], "difficult_direct_answer": false, "rationales": ["It is an abbreviation.", "That is an abbreviation for that.", "The no is a number."], "image": "train2014/COCO_train2014_000000270784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284540, "question_id": "QidvZmr6BAmdYFhYZYmanP", "question": "Which country invented the donut?", "choices": ["canada", "france", "sweden", "america"], "correct_choice_idx": 3, "direct_answers": ["netherlands", "america", "america", "netherlands", "usa", "dutch", "dutch settlers", "america", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["The country is the us.", "Reports vary from the dutch to the russians, but the internet agrees that donuts are quintessentially american.", "None of the answers are correct based on an internet search, but answer a heavily commercialized donuts."], "image": "train2014/COCO_train2014_000000284540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554354, "question_id": "QienunZRyycQcFxGq6WyvD", "question": "What type of flooring does this room have?", "choices": ["hardwood", "carpet", "dirt", "concrete"], "correct_choice_idx": 0, "direct_answers": ["wooden", "wood", "wood", "hardwood", "wood", "wood", "wooden", "wood", "wood", "wooden"], "difficult_direct_answer": false, "rationales": ["Wood grain can be seen on the floors in the room.", "It has hardwood flooring.", "It is a hard wood floor."], "image": "train2014/COCO_train2014_000000554354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138553, "question_id": "QinER5zVrqAjFyvK4uG5GV", "question": "What is number 46 waiting for?", "choices": ["some rest", "ball pitched", "lunch", "time off"], "correct_choice_idx": 1, "direct_answers": ["ball", "pitch", "pitch", "pitch", "ball", "pitch", "ball pitched", "ball", "pitch", "pitch"], "difficult_direct_answer": false, "rationales": ["Number 46 wants to hit the ball.", "He's a batter waiting to get a hit", "Waiting for the pitcher to throw the ball."], "image": "val2014/COCO_val2014_000000138553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497227, "question_id": "QipCubXJp55CZwZndEnERA", "question": "This umbrellas used in which lamp?", "choices": ["noon", "day", "night", "evening"], "correct_choice_idx": 2, "direct_answers": ["white metal", "ground", "spotlight", "right", "night", "far right", "far right", "desk lamps", "floor lamp", "bottom right"], "difficult_direct_answer": true, "rationales": ["It was used to produce light at night.", "Lamps are used at night.", "The umbrellas are used at night."], "image": "train2014/COCO_train2014_000000497227.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374685, "question_id": "QivUyycPFzyMZ3Hjqgna8P", "question": "What is located in the cages?", "choices": ["animal", "pocket book", "food", "bottle"], "correct_choice_idx": 1, "direct_answers": ["birds", "purses", "bags", "purses", "purses", "purses", "designer handbags", "handbags", "pocket book", "birds"], "difficult_direct_answer": false, "rationales": ["The pocketbook is in the cage.", "They are purses.", "There are purses in the cages."], "image": "train2014/COCO_train2014_000000374685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272405, "question_id": "Qj4G9LJmMBRueErYgsZPFj", "question": "What is the traffic pattern?", "choices": ["intersection", "dead end", "highway", "traffic circle"], "correct_choice_idx": 0, "direct_answers": ["turnabout", "arrows", "roundabout", "roundabout", "oval", "intersection", "circular", "traffic circle", "roundabout", "one way"], "difficult_direct_answer": false, "rationales": ["The roundabout is a form of an intersection.", "The traffic is going in the intersection.", "The pattern is the intersection."], "image": "train2014/COCO_train2014_000000272405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577654, "question_id": "Qj9DXug6RHHQAipAzYvc4i", "question": "What is near the door?", "choices": ["cat", "apple", "mop", "laundry basket"], "correct_choice_idx": 3, "direct_answers": ["refrigerator", "laundry basket", "laundry basket", "laundry basket", "purple light", "bucket", "home application", "laundry hamper", "chair", "laundry basket"], "difficult_direct_answer": false, "rationales": ["There is a hamper basket for clothes parked right in front of the door.", "There is a plastic basket used to carry clothing to and from the washer/dryer.", "A white basket you put laundry in."], "image": "val2014/COCO_val2014_000000577654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518948, "question_id": "QjQjKQdfcdpatDyTtJKbFh", "question": "What herb is the guy on the left's hair often compared to?", "choices": ["thyme", "ginger", "dill", "parsley"], "correct_choice_idx": 1, "direct_answers": ["carrot", "ginger", "ginger", "ginger", "cumin", "carrot", "ginger", "ginger", "ginger", "ginger"], "difficult_direct_answer": false, "rationales": ["People with red hair are often called gingers.", "People with red hair are often called gingers because of the color.", "The guy on the left has red hair and red-headed people are sometimes referred to as gingers."], "image": "train2014/COCO_train2014_000000518948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142826, "question_id": "QjUujow4E3giPYr6pstbzC", "question": "What is this patio located next to?", "choices": ["fountain", "beach", "stream", "pool"], "correct_choice_idx": 2, "direct_answers": ["drainage", "stream", "outside", "stream/ ditch", "bar", "restaurant", "bar", "little canal", "creek", "stream"], "difficult_direct_answer": false, "rationales": ["This might also be called a creek in some areas of the world.", "The patio is near an elongated body of water.", "There is a small river by the table."], "image": "val2014/COCO_val2014_000000142826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208318, "question_id": "QjosdMGfYeHtt2CCYbbHKp", "question": "What are the objects being that are painted?", "choices": ["door stoppers", "fire hydrants", "inflatable toys", "garden statues"], "correct_choice_idx": 1, "direct_answers": ["fire hydrants", "hydrants", "fire hydrants", "fire hydrants", "fire hydrants", "hydrants", "fire hydrants", "hydrants", "characters", "fire hydrants"], "difficult_direct_answer": false, "rationales": ["The shape and the size of the objects make the answer clear.", "The objects are hydrants.", "They have valves on the side to hook hoses to them"], "image": "val2014/COCO_val2014_000000208318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231476, "question_id": "QjwwNLP8VwBAXnxn97arRc", "question": "What is this area called?", "choices": ["mall", "taxi stand", "train depot", "repair shop"], "correct_choice_idx": 2, "direct_answers": ["train station", "subway", "train station", "train station", "train station", "terminal", "station", "train station", "train depot", "train station"], "difficult_direct_answer": false, "rationales": ["As indicated by the train in the foreground. it might also contain a d.", "The train is in the depot waiting.", "There is a train parked in the depot."], "image": "val2014/COCO_val2014_000000231476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479578, "question_id": "Qk97yyJJocGYvBiqZb23rF", "question": "What part of this photo would these animals never encounter in their natural habitat?", "choices": ["trees", "each other", "grass", "pavement"], "correct_choice_idx": 3, "direct_answers": ["road", "road", "road", "road", "road", "road", "road", "road", "pavement", "road"], "difficult_direct_answer": false, "rationales": ["Generally these animals wouldn't encounter anything man-made.", "The part is the pavement.", "The pavement is manmade."], "image": "train2014/COCO_train2014_000000479578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315841, "question_id": "QkEH6GwLMzJtTHbq94J2Fz", "question": "What age elephant is shown here?", "choices": ["12 years", "aged", "baby", "adult"], "correct_choice_idx": 2, "direct_answers": ["baby", "baby", "baby", "baby", "couple months", "youth", "baby", "baby", "infant", "baby"], "difficult_direct_answer": false, "rationales": ["The elephant is very small.", "The elephant is shorter than the vehicle. an adult elephant would be taller.", "The elephant on the grass is a very small baby elephant that is shorter than the car."], "image": "val2014/COCO_val2014_000000315841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445982, "question_id": "QkGS2FsTY5fhwTHrkLjt5G", "question": "Which animal here is in gravest danger?", "choices": ["crow", "hawk", "bear", "cub"], "correct_choice_idx": 0, "direct_answers": ["crow", "polar bear", "bird", "bird", "bird", "polar bear", "polar bear", "crow", "polar bear", "foreground"], "difficult_direct_answer": false, "rationales": ["Climate change is making their natural habitat destabilize.", "The bear is looking intently at the bird.", "The bears are in danger."], "image": "val2014/COCO_val2014_000000445982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59774, "question_id": "QkRYUAQ7RZHFBALq2EyBZz", "question": "Inside what is the umbrella?", "choices": ["toaster", "microwave", "umbrella stand", "dishwasher"], "correct_choice_idx": 1, "direct_answers": ["microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave"], "difficult_direct_answer": false, "rationales": ["You can tell by the shape of the appliance to where the umbrella is in.", "It has a door, turntable and knobs", "The umbrella is located in a kitchen appliance which is known as a microwave."], "image": "val2014/COCO_val2014_000000059774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510955, "question_id": "QkUfWd28vGdmSpTWJc7jBA", "question": "What dessert item was just placed within the appliance?", "choices": ["muffins", "croissants", "brownies", "cookies"], "correct_choice_idx": 3, "direct_answers": ["pie", "cookies", "cookie sheet", "cake", "pie", "muffins", "pie", "cookies", "cookies", "cookies"], "difficult_direct_answer": false, "rationales": ["The little balls in the oven will flatten out and make cookies.", "The cookies were placed.", "There are cookies in the oven."], "image": "val2014/COCO_val2014_000000510955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469798, "question_id": "QkyA5MxTNYYQ8NLRLJ6Jtm", "question": "What country is this airport located in?", "choices": ["japan", "america", "korea", "china"], "correct_choice_idx": 2, "direct_answers": ["korea", "korea", "korea", "korea", "korea south", "korea", "korea south", "korea", "korea", "korea south"], "difficult_direct_answer": false, "rationales": ["There are words on the plane that say where it is from.", "The country is spelled out right on the side of the planes.", "The country is korea."], "image": "train2014/COCO_train2014_000000469798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410114, "question_id": "QkyBwZdcUxtRyrZ2tmSDLV", "question": "What tragedy can happen here?", "choices": ["earthquake", "fire", "cows hit", "volcano eruption"], "correct_choice_idx": 2, "direct_answers": ["parade", "stampede", "trampling", "stampede", "stampede", "stampede", "cows hit", "stampede", "hit", "stampede"], "difficult_direct_answer": false, "rationales": ["There are cows and cars visibly present and using the same road. in such a setting there could be an incidental collision.", "The herd of cows are on a road where cars are driving on.", "The tragedy is a hit."], "image": "val2014/COCO_val2014_000000410114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235410, "question_id": "QmKroL357ms4NMLWQUUEmr", "question": "The item in the shape of a rectangle that is on a plate is called what?", "choices": ["napkin", "pocket protector", "scarf", "fork"], "correct_choice_idx": 0, "direct_answers": ["napkin", "napkin", "napkin", "napkin", "napkin", "napkin", "napkin", "napkin", "napkin", "napkin"], "difficult_direct_answer": false, "rationales": ["Napkins are square or rectangular.", "The item is a napkin.", "The napkin is folded."], "image": "train2014/COCO_train2014_000000235410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522443, "question_id": "QmNwAWmrbTjfmVJQqdyyak", "question": "What kind of nuts are these sweets topped with?", "choices": ["pecans", "peanuts", "pistachios", "almonds"], "correct_choice_idx": 3, "direct_answers": ["almonds", "almonds", "almonds", "almonds", "walnuts", "almonds", "peanut", "almond", "almonds", "almonds"], "difficult_direct_answer": false, "rationales": ["Almonds are in the chocolate.", "Answer a is this color and is thinly sliced like this when used as a topping. the other options are a different shape and are usually prepared differently or left whole.", "They are oval shaped slices"], "image": "val2014/COCO_val2014_000000522443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381721, "question_id": "Qn8jiipD7E7VHExmQ2h2d4", "question": "What is the construction out on the water called?", "choices": ["intersection", "walkway", "pier", "coastway"], "correct_choice_idx": 1, "direct_answers": ["dock", "boat", "pier", "under water", "boat", "pier", "walkway", "dock", "boat", "dock construction"], "difficult_direct_answer": false, "rationales": ["The construction that extends on the walkway is a pier that goes over the water.", "This is a pier that people can walk out on to see the water.", "The pier goes out into the water so you can walk on it."], "image": "val2014/COCO_val2014_000000381721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497870, "question_id": "Qn8kayTLYvuZYPTB5Goh3u", "question": "The flag on the pillow case is for which nation?", "choices": ["france", "canada", "united kingdom", "united states"], "correct_choice_idx": 2, "direct_answers": ["uk", "uk", "britain", "united kingdom", "britain", "united kingdom", "britain", "britain", "england", "united kingdom"], "difficult_direct_answer": false, "rationales": ["This flag represents the uk.", "The pillow in question has a visible design that has the pattern and colors known to be associated with answer a.", "The pillow features the uk flag."], "image": "train2014/COCO_train2014_000000497870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232954, "question_id": "QnEDMU2A6XuhbdMMKFSPCK", "question": "How many days after the Independence Day was this picture taken?", "choices": ["three", "one", "two", "seven"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "eight", "three", "three", "three", "three", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["The photo was taken three days after july 4.", "The picture was taken on 07/07 so 3 days.", "Independence day is on the 4th, and this photograph according to the date stamp in the corner happened on the 7th."], "image": "val2014/COCO_val2014_000000232954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98493, "question_id": "Qna7h2UrsEKpBybFmx7ne3", "question": "What age people mostly utilize this space?", "choices": ["toddlers", "teens", "senior citizens", "adults"], "correct_choice_idx": 1, "direct_answers": ["teenager", "teens", "teenage", "17", "sixteen", "teenagers", "teens", "sixteen seventeen", "teens", "teenagers"], "difficult_direct_answer": false, "rationales": ["Teens use the space.", "This is a high school so teenagers go to school there.", "The largest population of a high school is known to be teenaged people, and this building is labeled as a high school."], "image": "val2014/COCO_val2014_000000098493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361947, "question_id": "QnoLoi4SCXeywcJRrB6cCY", "question": "What type of people might the driver here transport?", "choices": ["salesmen", "prisoners", "children", "tourists"], "correct_choice_idx": 3, "direct_answers": ["students", "tourist", "tourists", "tourists", "tourists", "tourists", "drivers", "tourists", "campers", "locals"], "difficult_direct_answer": false, "rationales": ["Because it has the capacity to carry large amount of people.", "Tourists are usually transported in this type of vehicle.", "This bus transports visitors"], "image": "val2014/COCO_val2014_000000361947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459170, "question_id": "Qnuk2cVo5Refe4q59PywLC", "question": "What is the same color as the flag?", "choices": ["cucumber", "strawberry", "cherry", "carrot"], "correct_choice_idx": 0, "direct_answers": ["grass", "green", "green", "umbrella", "green", "grass", "cucumber", "man's shirt", "no flag", "green"], "difficult_direct_answer": false, "rationales": ["Cucumbers are green and the flag is green.", "Traditionally cucumbers are green or shades of green.", "The color of the flag is green."], "image": "train2014/COCO_train2014_000000459170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509589, "question_id": "Qo7jTkzkfsSM3fxVh5xg2J", "question": "What type of event is happening here?", "choices": ["rodeo", "dog show", "movie", "skateboard expo"], "correct_choice_idx": 3, "direct_answers": ["skateboard expo", "scatting", "skateboard competition", "skateboarding", "festival", "skating", "scatting", "parade", "festival", "concert"], "difficult_direct_answer": false, "rationales": ["A crowd of people are gathered and several people with skateboards are present.", "People are riding them and have helmets", "There are ramps and advertisements"], "image": "val2014/COCO_val2014_000000509589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551654, "question_id": "QoH7LfTBJGLu5EpnR7BAEf", "question": "What allows him to see the contents of the oven when the door is closed?", "choices": ["camera", "magnifying glass", "window", "streaming video"], "correct_choice_idx": 2, "direct_answers": ["window", "oven window", "window", "oven window", "clear glass", "window", "mirror", "window", "window", "window"], "difficult_direct_answer": false, "rationales": ["The oven has a window allowing visibility to its contents.", "There is clear, see through glass on the oven door.", "The person has a window."], "image": "val2014/COCO_val2014_000000551654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33601, "question_id": "QoPFE7K24MGjYXpZ6P65pf", "question": "What job does the woman seen here likely hold?", "choices": ["vendor", "bus driver", "conductor", "meter maid"], "correct_choice_idx": 0, "direct_answers": ["clerk", "food vendor", "vendor street", "domestic", "shop keeper", "vendor", "pole", "vendor", "street vendor", "vender"], "difficult_direct_answer": true, "rationales": ["The woman is standing under the awning of a store owned by a street vendor,", "The woman is selling wares.", "She is standing in front of a local market so she is probably the owner."], "image": "train2014/COCO_train2014_000000033601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537007, "question_id": "Qoao7QAivjCJiyR9sEsz7Q", "question": "Why is the man so close to the child?", "choices": ["likes him", "protecting him", "is game", "stay warm"], "correct_choice_idx": 1, "direct_answers": ["teaching him", "helping", "protecting him", "surfing", "teaching", "surfing", "teaching surfing", "teaching", "safety", "surfing"], "difficult_direct_answer": false, "rationales": ["The man wants to keep the kid safe.", "The man appears to be teaching the child and is there to provide safety by catching him in case he falls off.", "To protect him from falling in the water or getting hurt."], "image": "val2014/COCO_val2014_000000537007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242405, "question_id": "QobZAyU2t6Fd3f6Qsyremu", "question": "What animal does this resemble?", "choices": ["dog", "whale", "bear", "tiger"], "correct_choice_idx": 1, "direct_answers": ["bird", "bird", "whale", "eagle", "bird", "whale", "bird", "whale", "hawk", "shark"], "difficult_direct_answer": false, "rationales": ["It's large and long like a whale's body.", "The object visible is large and rounded similar to answer a and the other answers don't have any similar features.", "The large plane resembles the body of an enormous whale."], "image": "train2014/COCO_train2014_000000242405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141566, "question_id": "QokDkdBAcUBd8wXyhKYBry", "question": "Why would someone come to this location?", "choices": ["massage", "haircut", "eat", "shop"], "correct_choice_idx": 2, "direct_answers": ["food", "eat food", "eat", "live jazz", "eat meal", "eat meal", "to eat", "cross street", "eat", "cross road"], "difficult_direct_answer": false, "rationales": ["This place is a restaurant.", "The sign indicates that this location is a restaurant and restaurants are places people visit in order to eat.", "This is a restaurant which serves food"], "image": "train2014/COCO_train2014_000000141566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151807, "question_id": "QpACa3T84CDYM9JzZ3zDVx", "question": "What sort of craftsman might have wares sold in Hudson Valley Shoefitters?", "choices": ["cobbler", "knitter", "lacer", "pie maker"], "correct_choice_idx": 0, "direct_answers": ["cobbler", "cobbler", "cobbler", "shoemaker", "cobbler", "cobbler", "cobbler", "cobbler", "cobbler", "shoemaker"], "difficult_direct_answer": false, "rationales": ["This is a shoe store. a cobbler makes shoes.", "A cobbler since they make shoes.", "A cobbler makes shoes."], "image": "val2014/COCO_val2014_000000151807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61732, "question_id": "QpBLxbf3QtLiSRotdPqdhY", "question": "People are most likely visiting this general strip to engage in what activity?", "choices": ["shopping", "dining", "gambling", "museums"], "correct_choice_idx": 2, "direct_answers": ["gambling", "gamble", "gambling", "gambling", "travelling", "gambling", "gamble", "gamble", "gambling", "gambling"], "difficult_direct_answer": false, "rationales": ["People will gamble.", "The businesses on this street all look like casinos.", "The las vegas strip is famous for its casinos where although people can shop, dine, and visit museums, the main attractions are slot machines and table games."], "image": "train2014/COCO_train2014_000000061732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200879, "question_id": "QpJYuCY7M9ip8rfSkpcYoc", "question": "Why are her eyes red?", "choices": ["camera filter", "reflected light", "birth defect", "contact lenses"], "correct_choice_idx": 1, "direct_answers": ["reflection", "camera flash", "flash", "glare", "flash", "camera flash", "camera", "camera flash", "camera flash", "reflected light"], "difficult_direct_answer": false, "rationales": ["There is glare from the camera.", "A girl has unnatural colored eyes as she looks forward to pose for a picture.", "Sometimes when you take a picture the glare will make your eyes red."], "image": "train2014/COCO_train2014_000000200879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370513, "question_id": "QpVEaubfYjfr7EPSGr9oPT", "question": "What type of phones are used at this desk?", "choices": ["pay", "rotary", "cellular", "landline"], "correct_choice_idx": 3, "direct_answers": ["corded", "corded", "telephone", "void phones", "landline", "corded telephones", "landline", "landline", "wired", "landlines"], "difficult_direct_answer": false, "rationales": ["The phones on his desk are the kind plugged into the phone outlet in the wall.", "This phone is connected to a wall somewhere .", "A landline phone is used."], "image": "val2014/COCO_val2014_000000370513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89425, "question_id": "QpeGBWNE8fgCHjXz7G4MEC", "question": "Why is the man in red sticking his foot out?", "choices": ["to trip", "to kick", "to stand", "to turn"], "correct_choice_idx": 3, "direct_answers": ["balance", "catch himself", "balance", "balance", "to turn", "steadying himself", "balance", "balance", "turn", "control"], "difficult_direct_answer": false, "rationales": ["The man is trying to turn.", "He is doing that to turn his bike and not fall.", "The man's body orientation shows that he is leaning. when racing on this vehicle one would lean their body and counterbalance with their foot in order to turn."], "image": "train2014/COCO_train2014_000000089425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257817, "question_id": "QpgSrkBcoaEic7F67JfNc5", "question": "What are the black things on the person's food?", "choices": ["pepperonis", "black olives", "sausage", "peppers"], "correct_choice_idx": 1, "direct_answers": ["black olives", "black olives", "olives", "olives", "olives", "olives", "black olives", "olives", "olives", "olives"], "difficult_direct_answer": false, "rationales": ["They are olives on a pizza.", "None of the answers are visibly on the person's foot, but based of the size and shape of the items on the pizza they would be answer a.", "This is obvious by their shape and location on a pizza, which is common."], "image": "train2014/COCO_train2014_000000257817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197167, "question_id": "QpyrAoU6ttQsckhsoS87C8", "question": "The narrowest visible point of the board is pointing in what direction?", "choices": ["west", "south", "north", "east"], "correct_choice_idx": 2, "direct_answers": ["up", "up", "up", "up", "north", "north", "up", "up", "north", "up"], "difficult_direct_answer": false, "rationales": ["Visibly the narrowest point of the board is facing up. answer a is a synonym for up.", "The point is north.", "The board is pointing up, north."], "image": "train2014/COCO_train2014_000000197167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581009, "question_id": "Qq2JN9kT6VFx8HqQJoTEru", "question": "Why is she holding his arm?", "choices": ["leading him", "in love", "prevent leaving", "prevent falling"], "correct_choice_idx": 3, "direct_answers": ["to balance", "balance", "balance", "prevent falling", "don't fall", "balance", "support", "safety", "coming fast", "balanced"], "difficult_direct_answer": false, "rationales": ["It is easy to lose your balance on the skateboard.", "The woman can't fall.", "She's holding his arm to prevent him from falling."], "image": "train2014/COCO_train2014_000000581009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114917, "question_id": "Qq2ePx2Z8xBzEuo2rNvQUE", "question": "What type of oven is behind the man?", "choices": ["gas", "wood", "brick", "electric"], "correct_choice_idx": 2, "direct_answers": ["commerical", "pizza", "pizza oven", "pizza", "otg", "wood", "brick", "brick", "brick oven", "pizza"], "difficult_direct_answer": false, "rationales": ["The man is making pizzas and an oven with bricks can be seen behind him.", "There are red rectangles stacked along the wall.", "The oven is brick."], "image": "train2014/COCO_train2014_000000114917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333605, "question_id": "Qq6Qs2Vv6gUenBkhfNAQPW", "question": "Why are the parking meters likely displayed here?", "choices": ["trash", "parking", "repairs", "art"], "correct_choice_idx": 3, "direct_answers": ["art", "exhibition", "art", "fencing", "art", "art", "art", "art", "decorating", "memorial"], "difficult_direct_answer": false, "rationales": ["It would be uncommon for this scene to have occurred naturally based on their purpose and the setting they are in now. art installations frequently repurpose or take something out of it's intended setting and try to display them in a new way.", "The meters are painted.", "Since they are in grass and too close together for parking it has to be a display"], "image": "val2014/COCO_val2014_000000333605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479944, "question_id": "QqmAaPQGGTcwsScKBnybgd", "question": "What was probably stored in the container on the grass?", "choices": ["detergent", "marbles", "rice", "cake"], "correct_choice_idx": 0, "direct_answers": ["detergent", "mulch", "trash", "trash", "plants", "water", "leaves", "plant food", "detergent", "detergent"], "difficult_direct_answer": false, "rationales": ["A green bottle is in the grass near people picking up trash. the shape of the container is consistent with the shape of laundry detergent bottles.", "The container is a detergent container.", "The container size, shape and design is consistent with ones used for answer a and no other answer on the list."], "image": "train2014/COCO_train2014_000000479944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10265, "question_id": "QrRKFArcYtQwm7Mm8V6cpt", "question": "The sponsors are a clothing company and what else?", "choices": ["bank", "restaurant", "shoe company", "car company"], "correct_choice_idx": 0, "direct_answers": ["bank", "bank", "bank", "polo bank", "bank", "bank", "apparel", "financial", "bank", "polo"], "difficult_direct_answer": false, "rationales": ["Chase is a well known bank that is the sponsor of this game.", "The name brand is a very well-known banking company.", "The other sponsor is chase bank."], "image": "train2014/COCO_train2014_000000010265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387704, "question_id": "QrUGwxySjRVSaUiJ2meQy5", "question": "What country does the flag resemble?", "choices": ["american", "madagascar", "india", "china"], "correct_choice_idx": 0, "direct_answers": ["usa", "america", "usa", "america", "united states", "american", "usa", "united states", "usa", "usa"], "difficult_direct_answer": false, "rationales": ["The flag has stars and stripes on it.", "The flag is clearly visible and has the components and colors known to be associated with answer a.", "It is the stars and stripes"], "image": "train2014/COCO_train2014_000000387704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180466, "question_id": "QrbZMkT6mwFR5QxQAUiKYv", "question": "What player is at bat?", "choices": ["ryan howard", "chris young", "eric roberts", "jim those"], "correct_choice_idx": 0, "direct_answers": ["number six", "phillies player", "ryan howard", "phillies player", "ryan howard", "baseball player", "phillies", "number six", "batter", "bubba"], "difficult_direct_answer": false, "rationales": ["Howard is batting.", "A man in a phillies uniform is at bat.", "The phillies player is at bat."], "image": "train2014/COCO_train2014_000000180466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62745, "question_id": "QrnJyz5Tcrbtb4NS62HvvN", "question": "What game are these characters from?", "choices": ["sims", "angry birds", "farmville", "candy crush"], "correct_choice_idx": 1, "direct_answers": ["angry birds", "angry birds", "angry birds", "angry birds", "angry birds", "angry birds", "angry birds", "angry birds", "angry birds", "angry birds"], "difficult_direct_answer": false, "rationales": ["Angry birds characters are shown.", "The characters are recognizable and associated with answer a as well as they all have beaks as birds would and answer a is the most bird-like option.", "The game is angry birds."], "image": "train2014/COCO_train2014_000000062745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324488, "question_id": "QrnwbdEJUFg82iz2SV2cfw", "question": "In what setting is this meal served?", "choices": ["restaurant", "plane", "train", "home"], "correct_choice_idx": 1, "direct_answers": ["informal", "hotel room", "airplane", "dark", "cafeteria", "plane", "airplane", "restaurant", "hospital", "plane"], "difficult_direct_answer": false, "rationales": ["Planes serve meals on trays.", "There is packaged food on a tray table so it's likely on an airplane.", "The setting is a plane."], "image": "train2014/COCO_train2014_000000324488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388829, "question_id": "QroTujJhmscVcP5LTNS8ou", "question": "How many species of animals are here?", "choices": ["one", "seven", "hundred", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "two", "three", "three", "three", "two", "two", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["Three-- sheep, a dog, and a person", "Theres a dog, human and goats.", "A human, dog and sheep are shown."], "image": "val2014/COCO_val2014_000000388829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119860, "question_id": "QrqXsN72MXaRzm9q9V6brG", "question": "What is the name of the red utensil in the pan?", "choices": ["fork", "knife", "spatula", "spoon"], "correct_choice_idx": 2, "direct_answers": ["slotted spatula", "spatula", "spatula", "spatula", "spatula", "spatula", "spatula", "slotted spatula", "spatula", "spatula"], "difficult_direct_answer": false, "rationales": ["The name is a spatula.", "The tool has a broad blade used for lifting food.", "A flat, slotted serving utensil is laying in a dish that serves many people."], "image": "train2014/COCO_train2014_000000119860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294430, "question_id": "QrvBGJax3wpRiz78X7eZjY", "question": "Why is she bent over?", "choices": ["hitting ball", "tired", "watching others", "hiding ball"], "correct_choice_idx": 0, "direct_answers": ["ready serve", "playing tennis", "waiting", "preparing swing", "to aim", "waiting", "strike", "waiting", "prepared", "hitting ball"], "difficult_direct_answer": false, "rationales": ["This is a stance used in tennis to get one ready to see the ball to hit it from any direction.", "A woman is holding a tennis raquet and is focused on looking at a ball.", "The position and the sport being played you can tell what she is getting ready to do."], "image": "train2014/COCO_train2014_000000294430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11713, "question_id": "Qs45NjoMcosdvCAo9WKvBF", "question": "What is the area marked in blue used for?", "choices": ["laying", "wrestling", "sitting", "jumping"], "correct_choice_idx": 3, "direct_answers": ["see effectively", "danger", "jumping", "landing", "guidance", "tricks", "landing", "warning area", "separation", "skiing"], "difficult_direct_answer": true, "rationales": ["The blue marks the edge of the cliff.", "This marking helps ski jumpers see the contour of a jump as they approach so it does not get lost amongst all the other white of the snow. there are also people visible mid air above the snow who have likely just jumped off.", "This is a boundary marker"], "image": "train2014/COCO_train2014_000000011713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384176, "question_id": "QsM39cyaBQd3mscwaNjjCL", "question": "Why are they all there together?", "choices": ["hiding", "fighting", "eating lunch", "sharing table"], "correct_choice_idx": 3, "direct_answers": ["working", "meeting", "working", "working", "play games", "doing work", "sharing table", "working", "hanging out", "working"], "difficult_direct_answer": false, "rationales": ["They may be c in a shared video game or about to do b together.", "The individuals are all at one table together and sharing the space between the three of them.", "They have their computers on the same surface"], "image": "train2014/COCO_train2014_000000384176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396958, "question_id": "QswFHJDEGurS5tfoLDqFYQ", "question": "Where is the man probably going to take his bike next?", "choices": ["on stairs", "into building", "on elevator", "on train"], "correct_choice_idx": 3, "direct_answers": ["train", "on train", "train", "train", "on train", "train", "on train", "on train", "train", "train"], "difficult_direct_answer": false, "rationales": ["The people on the platform are waiting for their turn to ride the train.", "He is waiting to board.", "The man is going to get on the train."], "image": "train2014/COCO_train2014_000000396958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317254, "question_id": "QtAWFv8KTziKc5QTTKBsEo", "question": "Which former country had a flag which looks similar to these banners?", "choices": ["czechoslovakia", "zaire", "yugoslavia", "rhodesia"], "correct_choice_idx": 2, "direct_answers": ["italy", "russia", "yugoslavia", "india", "yugoslavia", "france", "unknown", "usa", "netherlands", "france"], "difficult_direct_answer": false, "rationales": ["Yugoslavia is similar.", "Yugoslavia had a flag looking similar to these banners.", "A quick google search made me aware of the country that had this flag."], "image": "val2014/COCO_val2014_000000317254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493601, "question_id": "QtnsFCJmwqWVQfW5ZV8cmN", "question": "How many slices of bread will filling be put on?", "choices": ["two", "four", "three", "six"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "two", "three", "four", "five", "five", "four", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 5 slices, and 2 slices are saved to be the top and bottoms", "There are six slices of bread total.", "There are six slices of bread in the photo. most people make sandwiches which contain two slices. since only one of these two slices normally have filling put on them, there will be a total of three that will be filled."], "image": "train2014/COCO_train2014_000000493601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463151, "question_id": "QtxNqhrdZoJJyjCqjskX34", "question": "What is the term for a large group of people watching an event?", "choices": ["gang", "family", "crowd", "colony"], "correct_choice_idx": 2, "direct_answers": ["spectators", "crowd", "crowd", "spectators", "crowd", "crowd", "audience", "audience", "audience", "crowd"], "difficult_direct_answer": false, "rationales": ["Answer a is a common term for a group of people watch an event.", "Public events which attract many people of diverse backgrounds are normally called a crowd.", "This is a large crowd of people."], "image": "val2014/COCO_val2014_000000463151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481879, "question_id": "QuPx3EGVDdF7Rb4jbjZPHb", "question": "How many items qualify as a berry botanically?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "3 types", "three", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Because of its red color and its appearance.", "There are blueberries and raspberries.", "There are blueberries and raspberries here."], "image": "train2014/COCO_train2014_000000481879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503871, "question_id": "Quto3mVJJnAtEwQy2KQPpU", "question": "Why have these people gathered?", "choices": ["to clean", "to work", "to drink", "to swim"], "correct_choice_idx": 2, "direct_answers": ["drink", "to drink", "for drinks", "celebration", "to drink", "drink", "wine tasting", "to drink", "wine tasting", "celebrate"], "difficult_direct_answer": false, "rationales": ["They are gathered at a table with glasses of wine.", "They are holding glasses of wine.", "They have wine glasses with wine"], "image": "train2014/COCO_train2014_000000503871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258237, "question_id": "Qv9QyAkxg4pssrJhsHgo8S", "question": "How might they know each other?", "choices": ["classmates", "rivals", "roommates", "teammates"], "correct_choice_idx": 3, "direct_answers": ["teammates", "friends", "friends", "team mates", "old friends", "very well", "teammates", "friends", "friends", "sisters"], "difficult_direct_answer": false, "rationales": ["They are dressed in the same uniform, so they probably are partners on the court.", "The text indicates that they play doubles tennis together.", "They are wearing the same uniform and are on a tennis court."], "image": "train2014/COCO_train2014_000000258237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271417, "question_id": "QvASQwii9HWr7fCoMGn9JA", "question": "How is this boat powered?", "choices": ["gas", "coal", "wind", "battery"], "correct_choice_idx": 2, "direct_answers": ["wind", "sail boat", "wind", "wind", "wind", "wind", "wind", "wind", "wind", "sail power"], "difficult_direct_answer": false, "rationales": ["This boat is powered by wind.", "A sailboat is at a dock.", "The boat has masts that would hold up sails. sails need wind to be effective."], "image": "train2014/COCO_train2014_000000271417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539413, "question_id": "QvmPbEAJBnqHHzyCxyqujo", "question": "Why are the lower trunks of the trees painted white?", "choices": ["timber marking", "sunscald protection", "fertilizer", "insecticide"], "correct_choice_idx": 1, "direct_answers": ["insects", "protection", "insect repellent", "moisture", "kill insects", "sunscald protection", "pest control", "avoid pests", "sick", "bugs"], "difficult_direct_answer": true, "rationales": ["There is special kind of paint that you can use to keep bases of trees healthy.", "The trees are painted for protection.", "The white trunks keep pests away."], "image": "val2014/COCO_val2014_000000539413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24551, "question_id": "QvtoPc3qEAyJUjS6vvFAcG", "question": "What team do the men on the field play for?", "choices": ["mets", "yankees", "rays", "red sox"], "correct_choice_idx": 3, "direct_answers": ["boston", "red sox", "red sox", "red sox", "boston", "red sox", "boston", "red sox", "boston", "boston"], "difficult_direct_answer": false, "rationales": ["They play at fenway.", "Red sox is the team that plays in boston.", "The team name is clearly seen on their jackets."], "image": "train2014/COCO_train2014_000000024551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381568, "question_id": "QvttG5KtgeGU5ugtTnss4d", "question": "What type of equipment are the people holding?", "choices": ["oars", "water propellers", "rowing sticks", "surf poles"], "correct_choice_idx": 0, "direct_answers": ["surf paddle", "oars", "oars", "paddles", "paddle", "paddleboard", "oars", "paddles", "oars", "oar"], "difficult_direct_answer": false, "rationales": ["They are using the tool to move their boards through the water.", "It has to be the sticks because they are moving through the currents.", "Oars are used."], "image": "train2014/COCO_train2014_000000381568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553931, "question_id": "QvzHTDGfYzFLyspMiKGZ7Y", "question": "What happens if they are left out too long?", "choices": ["moisten", "explode", "go stale", "disappear"], "correct_choice_idx": 2, "direct_answers": ["get stale", "get hard", "get stale", "stale", "get stale", "go stale", "get stale", "stale", "get stale", "get stale"], "difficult_direct_answer": false, "rationales": ["Doughnuts do not stay fresh very long which is why most shops sell day old doughnuts at a highly discounted rate.", "These food items are donuts. they would not explode, disappear, or moisten if they were left out too long.", "The donuts will go stale if they're out."], "image": "val2014/COCO_val2014_000000553931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97778, "question_id": "QvzSuFUiRuTWZJ9p4jGwfF", "question": "What has been served with the chips?", "choices": ["mayo", "creme", "milk", "dip"], "correct_choice_idx": 3, "direct_answers": ["dip", "dip", "sauce", "dip", "dip", "sauce", "sauce", "dip", "dip", "dip"], "difficult_direct_answer": false, "rationales": ["In the middle of the container is a mixture of spinach and cream to be served with the chips.", "It is a mix of ingredients that typically include cheese or vegetables.", "There is a white creamy condiment in the middle"], "image": "train2014/COCO_train2014_000000097778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522755, "question_id": "Qw6gnBBwJKSLCqki4GfN6S", "question": "What animal is the kite modeled after?", "choices": ["pelican", "pigeon", "eagle", "seagull"], "correct_choice_idx": 0, "direct_answers": ["pelican", "pelican", "stork", "pelican", "pelican", "stork", "pelican", "pelican", "stork", "stork"], "difficult_direct_answer": false, "rationales": ["Anyone can easily tell the type of bird shown.", "It is made to look like a pelican.", "The kite is shaped like a bird with white feathers and a long, distinctive beak. pelicans have these features."], "image": "train2014/COCO_train2014_000000522755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241611, "question_id": "QwAbM8KPDqEydpfUu4GEc8", "question": "What can this vehicle likely carry?", "choices": ["horses", "parcels", "trucks", "elephants"], "correct_choice_idx": 1, "direct_answers": ["wood", "car", "car", "cars", "parcels", "car", "car", "towed vehicles", "cars", "other cars"], "difficult_direct_answer": false, "rationales": ["The vehicle has parcels.", "A truck with a flatbed is driving down a street.", "The vehicle has parcels."], "image": "train2014/COCO_train2014_000000241611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143416, "question_id": "QwpjFuvg2ewq8fq2KXCdFq", "question": "What animal has the same colours as the floor tiles?", "choices": ["giraffe", "rhino", "zebra", "elephant"], "correct_choice_idx": 2, "direct_answers": ["zebra", "zebra", "panda", "cow", "skunks", "zebra", "zebra", "zebra", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["The floor is black and white.", "Zebras are black and white.", "Zebras are black and white just like the tiles."], "image": "train2014/COCO_train2014_000000143416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206613, "question_id": "QwuuvHknu2NmUrkGdrxzj3", "question": "What is the purpose of the chalk on the ground?", "choices": ["reflects sunlight", "provide markings", "provides fiction", "is fashionable"], "correct_choice_idx": 1, "direct_answers": ["batters box", "outline boundaries", "field boundary", "provide markings", "marking", "limits", "guide marks", "batting plate", "layout", "mark boxes"], "difficult_direct_answer": true, "rationales": ["In the game, it is important to see the lines to know what is in and out of bounds.", "These lines provide markings for what is in bound on a baseball field.", "They are playing baseball. the lines indicate where players should stand or run."], "image": "val2014/COCO_val2014_000000206613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29724, "question_id": "Qx6Wg2LWDand2VBfVrBXnR", "question": "What are the silver poles being used for?", "choices": ["flying flags", "climbing", "flinging", "swinging"], "correct_choice_idx": 0, "direct_answers": ["flag poles", "flying flags", "flag poles", "flags", "flags", "flags", "flags", "hoist flags", "flags", "flag poles"], "difficult_direct_answer": false, "rationales": ["The silver poles are flag poles.", "There are decorated pieces of cloth hanging from them", "Due to wind, flag poles need to be sturdy. the poles pictured are tall, silver, and sturdy; perfect for displaying flags."], "image": "train2014/COCO_train2014_000000029724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327005, "question_id": "Qx82PHZauCs94p4H8rkAQu", "question": "What tool would be best ideal to propel this type of board?", "choices": ["motor", "paddle", "sail", "wave"], "correct_choice_idx": 1, "direct_answers": ["wrench", "row", "long oar", "paddle", "oar", "oar", "paddle", "paddle", "paddle", "paddle"], "difficult_direct_answer": false, "rationales": ["An oar displaces a lot of water to propel you forward.", "The tool is a paddle.", "A paddle would help move the board to where he wants to go."], "image": "val2014/COCO_val2014_000000327005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299442, "question_id": "QxFAMkQdK5gsB3TcjZBiJM", "question": "Where would you most likely see one of these buses?", "choices": ["minneapolis", "providence", "tokyo", "hamburg"], "correct_choice_idx": 2, "direct_answers": ["tokyo", "japan", "city", "japan", "japan", "japan", "city", "city", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["The bus has asian lettering on the side. this would be most common in a location that speaks an asian language commonly.", "The bus has lettering on it that you would see in japan.", "The side of the bus has asian script on it."], "image": "val2014/COCO_val2014_000000299442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18461, "question_id": "QxJkUWibBexbtShHdyc2L3", "question": "The home that this refrigerator is in is located in which country?", "choices": ["canada", "belgium", "united states", "france"], "correct_choice_idx": 0, "direct_answers": ["canada", "united states", "united states", "united states", "united states", "canada", "united states", "mexico", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["There are bags of milk in pitchers", "The packaging is in english and french.", "The country is canada."], "image": "val2014/COCO_val2014_000000018461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157617, "question_id": "QxM5EPhsDyJEzb7FRyUgyc", "question": "What is her sister's name?", "choices": ["naomi", "serena", "anna", "venus"], "correct_choice_idx": 1, "direct_answers": ["serena williams", "venus", "serena", "serena", "serena", "married", "candice", "serena", "serena williams", "serena"], "difficult_direct_answer": false, "rationales": ["Venus is the sister to serena.", "It's hard to tell in the photo which sister it is, but i believe it's a.", "The player is venus williams."], "image": "val2014/COCO_val2014_000000157617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147482, "question_id": "QxTpuFzn4JF9X3BKbQ93hX", "question": "What is the surface composed of where these kites are flying?", "choices": ["water", "dirt", "sand", "grass"], "correct_choice_idx": 2, "direct_answers": ["air", "dirt", "sand water", "air", "polyester", "sand", "ground", "sand", "wind", "sand"], "difficult_direct_answer": false, "rationales": ["The surface of the beach is composed of sand.", "You need to look carefully but there is only one possible answer.", "The color of the ground and the setting near the water leads one to believe it is a beach that would commonly be composed of answer a."], "image": "val2014/COCO_val2014_000000147482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71563, "question_id": "QxYzoWKVp3rXh7HGn6hcje", "question": "What video game genre is seen on the computer monitor?", "choices": ["real-time strategy", "shooter", "platform", "role-playing"], "correct_choice_idx": 0, "direct_answers": ["farming game", "base builder", "farmville", "barnyard", "farm game", "role-playing", "real-time strategy", "building structure", "world building", "puzzler"], "difficult_direct_answer": true, "rationales": ["Based on the setting of the game and the control bar at the bottom of the screen, this farm-based game is in the real-time strategy genre.", "The game looks like a puzzle.", "A map of a wooded area is shown on a television."], "image": "val2014/COCO_val2014_000000071563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177325, "question_id": "Qxoqs7ZmjcCT6uJjUzqsWg", "question": "What kind of chips are served on the plate?", "choices": ["doritos", "lays", "pringles", "stacys"], "correct_choice_idx": 2, "direct_answers": ["potato chips", "pringles", "pringles", "pringles", "pringles", "potato", "pringles", "breakfast", "pringles", "pringles"], "difficult_direct_answer": false, "rationales": ["The chips are in the shape of pringles.", "The chips are pringles.", "These are all uniform shape and size"], "image": "train2014/COCO_train2014_000000177325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488011, "question_id": "Qy2wZcYHUEowV9rwbExZxB", "question": "What language is most likely spoken here?", "choices": ["chinese", "spanish", "korean", "italian"], "correct_choice_idx": 1, "direct_answers": ["children", "spanish", "spanish", "chinese", "hindi", "hindi", "spanish", "mayan", "malaysian", "mayan"], "difficult_direct_answer": false, "rationales": ["The language is spanish.", "Spanish is likely spoken since the trolley has spanish.", "It looks like that is the type of language on the bus."], "image": "train2014/COCO_train2014_000000488011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490338, "question_id": "Qy78GTGnghQykmJrHQPfDa", "question": "What does the store whose sign has a blue arrow sell?", "choices": ["dvd", "udon", "sushi", "vhs"], "correct_choice_idx": 0, "direct_answers": ["dvds", "dvds", "dvds", "dad's", "dvds", "dvd", "dad's", "dvd", "dvds", "dvds"], "difficult_direct_answer": false, "rationales": ["The store with the blue arrow sells movies on disc.", "The acronym \"dvd\" does not have an equivalent in japanese characters.", "That sign sell videos."], "image": "val2014/COCO_val2014_000000490338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413560, "question_id": "QyCxfwBGsigZKWsB5QqLUJ", "question": "What will the giraffe likely do next?", "choices": ["come out", "throw up", "eat", "bite"], "correct_choice_idx": 2, "direct_answers": ["take food", "eat", "eat", "take food", "eat", "take food", "eat", "eat", "eat", "eat"], "difficult_direct_answer": false, "rationales": ["The giraffe is getting a treat.", "The giraffe will eat.", "The man is holding something in his hand towards the giraffe. the giraffe is leaning his head towards the man's hand."], "image": "train2014/COCO_train2014_000000413560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249482, "question_id": "QyKVftF27aXXYmAnaFLSzG", "question": "Which character is on the television show that adorns this airplane?", "choices": ["bulbasaur", "uhtred uhtredson", "vanessa ives", "jamie lannister"], "correct_choice_idx": 0, "direct_answers": ["pikachu", "pikachu", "bulbasaur", "pikachu", "pikachu", "pikachu", "pokemon", "boeing", "pokemon", "pikachu"], "difficult_direct_answer": false, "rationales": ["There are several pokemon characters on the plane", "Pokemon is depicted.", "The plane has pokemon characters clearly displayed and the name pokemon is also written. answer a is a character from the show."], "image": "val2014/COCO_val2014_000000249482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184201, "question_id": "QyTBqrrey8mZbjkkyz6jZC", "question": "What kind of boat is this?", "choices": ["tow boat", "fishing", "transport", "coast guard"], "correct_choice_idx": 2, "direct_answers": ["transport", "work boat", "john", "floating market", "paddle boat", "fishing boat", "produce", "trade", "canoe", "transport"], "difficult_direct_answer": true, "rationales": ["A small boat has a person and a large amount of produce in it.", "The boat is too small to be used for fishing, coast guard duties, or towing. the boat is carrying vegetables.", "There could be multiple answers possible for this boat, but answer a would apply most closely while there is nothing defining about the boat that would match up with the other answers."], "image": "train2014/COCO_train2014_000000184201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263989, "question_id": "QyZ3dsjJsNkXnbXmCzmNhv", "question": "The man with what color of shirt will get the frisbee?", "choices": ["red", "orange", "white", "grey"], "correct_choice_idx": 1, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "red", "red", "red", "red", "orange"], "difficult_direct_answer": false, "rationales": ["The man with his arms out is wearing orange.", "The man has orange.", "The man is wearing an orange shirt and is jumping to make physical contact with the frisbee."], "image": "train2014/COCO_train2014_000000263989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4984, "question_id": "QyZTwUavZV6yQLhumEovjN", "question": "What force is causing the boards to accelerate forward?", "choices": ["contact force", "inertia", "kinetic force", "friction"], "correct_choice_idx": 0, "direct_answers": ["wave", "waves", "contact force", "water", "waves", "sea wave", "air", "waves", "wave", "waves"], "difficult_direct_answer": false, "rationales": ["The boards are making contact waves and that contact is moving them forward.", "The surf or boogie boards move forward when the waves or moving water meet the resistance of the boards and propel them forward.", "Inertia is a form of gravity."], "image": "train2014/COCO_train2014_000000004984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396212, "question_id": "QyhGm5KUTLNYXAmPi2xZT7", "question": "What is the man doing behind the boy in the cart?", "choices": ["pulling him", "stopping him", "fighting him", "pushing him"], "correct_choice_idx": 3, "direct_answers": ["skiing", "pushing", "steering", "pushing", "pushing him", "pushing", "pushing him", "pushing", "scaring", "skiing"], "difficult_direct_answer": false, "rationales": ["He is pushing the kid down the slope.", "It appears to be a. the other answers really don't make as much sense.", "The man is holding the handle of the cart to push the boy through the snow."], "image": "train2014/COCO_train2014_000000396212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515829, "question_id": "QyhkexLAPTPBCepnJ4D56Y", "question": "The user of this desk works as what type of academic professional?", "choices": ["lawyer", "librarian", "professor", "administrator"], "correct_choice_idx": 1, "direct_answers": ["law firm", "computer science", "professor", "writer", "doctor", "nails", "librarian", "librarian", "nurse", "photography professor"], "difficult_direct_answer": true, "rationales": ["The user of this desk is likely a librarian.", "There is a library card in the drawer.", "The name tag says this"], "image": "train2014/COCO_train2014_000000515829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34827, "question_id": "Qyittmm9unb6z2YRYWNHUc", "question": "Where is the milk?", "choices": ["tabletop", "dog bowl", "cat bowl", "refrigerator"], "correct_choice_idx": 3, "direct_answers": ["top shelf", "shelf", "top shelf", "top shelf", "bottle", "top shelf", "refrigerator shelf", "refrigerator", "top shelf", "top shelf"], "difficult_direct_answer": false, "rationales": ["The equipment seen is the fridge by the look.", "It is on a shelf inside this appliance", "It is in the fridge so it stays fresh and cold."], "image": "train2014/COCO_train2014_000000034827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183988, "question_id": "QynbC74XDag7uUWqEz3Sc5", "question": "Which ball is the person about to strike?", "choices": ["red", "ten", "black", "15"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "cue", "cue ball", "red", "cue", "red solid", "white", "white ball", "red foreground"], "difficult_direct_answer": false, "rationales": ["The white ball and stick are pointed towards the red ball.", "The person is going to hit the red ball.", "The cue ball is aimed at the red ball, so when the cue ball is hit with the cue, the red ball is what it's going to hit."], "image": "val2014/COCO_val2014_000000183988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428816, "question_id": "Qz5MBTLLu6LTVDgNRQWQXN", "question": "Why are these women smiling?", "choices": ["at party", "posing", "playing prank", "love rain"], "correct_choice_idx": 1, "direct_answers": ["picture", "posing", "for photograph", "photograph", "for photo", "happy", "for photo", "vacation", "photo", "having fun"], "difficult_direct_answer": true, "rationales": ["The woman want to take a photo.", "They are smiling for the camera.", "They smile at the person taking the picture"], "image": "train2014/COCO_train2014_000000428816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360725, "question_id": "QzKvra3wKKcJyY8XkutZn3", "question": "What season is this definitely not?", "choices": ["winter", "summer", "autumn", "spring"], "correct_choice_idx": 1, "direct_answers": ["winter", "summer", "winter", "winter", "summer", "winter", "winter", "winter", "winter", "summer"], "difficult_direct_answer": false, "rationales": ["With no leaves on the trees it looks like it can't be summer.", "There are trees in the background. they do not have leaves.", "Since this might be a place that doesn't get snow in winter, but trees definitely have leaves in the summer, we can narrow it down to this scene being definitely not then."], "image": "train2014/COCO_train2014_000000360725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103163, "question_id": "QzKxneaqxdzwA2kEt3nTNp", "question": "Why is the man holding his hand to his mouth?", "choices": ["to itch", "to smoke", "to eat", "to cough"], "correct_choice_idx": 1, "direct_answers": ["smoking", "cigarette", "puffing cigarette", "cigarette", "smoking", "smoking", "smoking cigarette", "smoking", "cigarette", "to smoke"], "difficult_direct_answer": false, "rationales": ["The man smokes.", "The man is smoking a cigarette.", "He has a cigarette in his hand."], "image": "val2014/COCO_val2014_000000103163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515660, "question_id": "QzbSfY5BnPT6FNdCyuyuvE", "question": "What is the yellow substance for?", "choices": ["sweetening beverage", "cleaning plate", "cleaning hands", "dipping sauce"], "correct_choice_idx": 3, "direct_answers": ["dipping", "dipping", "dipping shrimp", "dipping", "dipping", "squeeze", "dipping sauce", "dipping", "dipping shrimp", "for crawfish"], "difficult_direct_answer": false, "rationales": ["The yellow substance in the middle of the plate is for dipping the seafood into while eating.", "The yellow substance is a cocktail sauce for the shrimp.", "The yellow sauce is mustard."], "image": "val2014/COCO_val2014_000000515660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113041, "question_id": "QzwvgHoerxEaJaqQLByeCn", "question": "What could possibly be casting the long shadow?", "choices": ["snake", "airplane", "lamp post", "car"], "correct_choice_idx": 2, "direct_answers": ["guy", "sun", "sun", "pole", "pole", "pole", "stadium", "lamp post", "tennis player", "pole"], "difficult_direct_answer": false, "rationales": ["The shadow is really long and a lamp post is tall.", "Because of the shape and width it is easy to surmise what is casting the shadow.", "The shadow is long, thin, and straight, like a pole."], "image": "val2014/COCO_val2014_000000113041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429407, "question_id": "R25eeEkdjKuYLGY77sKxFm", "question": "What is the giraffe on the left close to?", "choices": ["parasol", "rock", "baby", "car"], "correct_choice_idx": 1, "direct_answers": ["tree", "tree", "rock", "rock", "rock", "rock", "rock", "rock", "rock", "tree"], "difficult_direct_answer": false, "rationales": ["As shown clearly in the image. the other options aren't shown.", "There is a large boulder on the ground", "The rock is near the giraffe."], "image": "train2014/COCO_train2014_000000429407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72007, "question_id": "R2C4pXTWYSVp9yziqztsnt", "question": "What vegetable is used in this sandwich unconventionally?", "choices": ["lettuce", "onion", "cucumbers", "cabbage"], "correct_choice_idx": 3, "direct_answers": ["cabbage", "cabbage", "cabbage", "spice pizza", "onions", "cabbage", "lettuce", "yellow pepper", "artichoke", "raw onion"], "difficult_direct_answer": false, "rationales": ["Cabbage leaves are not usually eaten on a sandwich. lettuce would be a more common choice for a cold cut sandwich.", "Usually, we'd see lettuce here", "Slices of firm looking green leaf are on a sandwich."], "image": "train2014/COCO_train2014_000000072007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349059, "question_id": "R2H6RgdMeUv86qjoeGDxqM", "question": "What will they squeeze the substance in the tube onto?", "choices": ["onto washcloth", "toothbrush", "into sink", "onto soap"], "correct_choice_idx": 1, "direct_answers": ["toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush"], "difficult_direct_answer": false, "rationales": ["Toothpaste is typically applied directly to a toothbrush.", "They'll use toothpaste.", "A toothbrush has toothpaste on it."], "image": "train2014/COCO_train2014_000000349059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456648, "question_id": "R2LfocLeGANGoKNuvCssQY", "question": "Why is he in the middle of the intersection?", "choices": ["bike broken", "is confused", "is turning", "is lost"], "correct_choice_idx": 2, "direct_answers": ["turning right", "crossing road", "crossing road", "driving", "turning", "turning", "is turning", "crossing street", "crossing road", "crossing street"], "difficult_direct_answer": false, "rationales": ["The man is turning on the road.", "People always turn at intersections.", "The man wants to turn."], "image": "train2014/COCO_train2014_000000456648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388677, "question_id": "R2Qu7cKuKWPTFNSfUsJchH", "question": "Why is the man riding a motorcycle?", "choices": ["in parade", "stunt man", "hell's angel", "police duty"], "correct_choice_idx": 3, "direct_answers": ["police", "police duty", "law enforcement", "patrol", "police", "working", "policeman", "he's police", "service", "perform duties"], "difficult_direct_answer": true, "rationales": ["The man's motorcycle has the word \"police\" on it.", "The man is riding a motorcycle that is for police officers that need to get around.", "He is an authority figure. there is a sign on the front of his motorcycle."], "image": "val2014/COCO_val2014_000000388677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479696, "question_id": "R2bkpQN9TB68RyN2kFn5EJ", "question": "Which means of transport is pictured above?", "choices": ["railway", "air", "sea", "road"], "correct_choice_idx": 3, "direct_answers": ["road", "truck", "truck", "truck", "truck", "truck", "truck", "truck", "produce truck", "produce truck"], "difficult_direct_answer": false, "rationales": ["There is a truck. it cannot use tracks, float, or fly.", "The road is the means of transport.", "The vehicle has wheels, so it cannot travel on tracks. the vehicle cannot float or fly."], "image": "train2014/COCO_train2014_000000479696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436580, "question_id": "R2hBKTfNcmrd2xRmYqWLw9", "question": "What is the condition of the sky?", "choices": ["clear skies", "overcast", "mostly sunny", "mostly cloudy"], "correct_choice_idx": 1, "direct_answers": ["cloudy", "gray", "cloudy", "overcast", "cloudy", "cloudy", "cloudy", "gray", "cloudy", "cloudy"], "difficult_direct_answer": false, "rationales": ["The condition is gray.", "The sky is grey and the sun isn't showing", "The color of the sky is gray and it is cloudy."], "image": "train2014/COCO_train2014_000000436580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358980, "question_id": "R2o2oq9LqNrjPqkLhWPyAd", "question": "What position does the man in black play for the team?", "choices": ["short stop", "manager", "umpire", "lead referee"], "correct_choice_idx": 2, "direct_answers": ["umpire", "umpire", "umpire", "umpire", "neutral umpire", "neutral umpire", "umpire", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["The man in black doesn't play for a team. he is supposed to oversee the rules to make the game fair.", "In baseball, the authority wears black.", "The position is the umpire."], "image": "train2014/COCO_train2014_000000358980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573815, "question_id": "R36nDS8xEqd4zTLNVWcjXL", "question": "What is the design behind the character known as?", "choices": ["plaid", "polka dot", "tartan", "sunburst"], "correct_choice_idx": 3, "direct_answers": ["laptop", "cartoon", "rising sun", "sunburst", "starburst", "ninja", "spiral", "stripes", "stripes", "flare"], "difficult_direct_answer": true, "rationales": ["There are rays coming from the character that emphasize the central part of where the rays are coming from.", "The design is called a sunburst.", "The design is a sunburst."], "image": "train2014/COCO_train2014_000000573815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44911, "question_id": "R3KQQ4bLmS2HC45sKoymgy", "question": "What is the state of the blue item?", "choices": ["closed", "invisible", "smashed", "open"], "correct_choice_idx": 3, "direct_answers": ["off", "good", "on", "solid", "opened", "open", "off", "clean", "open", "open"], "difficult_direct_answer": false, "rationales": ["The door is open on it.", "The blue item is visible and is in good shape. its door is not closed.", "The door of an oven is open wide. the inside of the oven is blue in color."], "image": "train2014/COCO_train2014_000000044911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344013, "question_id": "R3Ln2W6nFLbUgWS9o7hfbB", "question": "What letter comes after the last letter in the big sign alphabetically?", "choices": ["e", "m", "t", "p"], "correct_choice_idx": 2, "direct_answers": ["letter t", "letter t", "letter t", "letter t", "letter t", "letter t", "letter t", "letter t", "letter t", "t"], "difficult_direct_answer": false, "rationales": ["The last letter is s, not d, l, or o.", "That letter comes after s.", "The last letter in the sign is \"s\" and the letter after \"s\" is \"t\"."], "image": "val2014/COCO_val2014_000000344013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476925, "question_id": "R3cGH6bQhZKYaQzgfnCPhy", "question": "Why is the bottle sitting on that square object?", "choices": ["prevent falling", "keep cool", "protect table", "easier reach"], "correct_choice_idx": 2, "direct_answers": ["protect wood", "coaster", "protect table", "coaster", "table dry", "to drink", "keep safe", "protect furniture", "coaster", "to protect"], "difficult_direct_answer": false, "rationales": ["Condensation on the bottle may lead to water forming on the surface and dripping downward onto the table. the coaster is used to protect against such incidents.", "The bottle protects.", "The bottle is on a coaster."], "image": "val2014/COCO_val2014_000000476925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426546, "question_id": "R3rzeXAJdLqBURCowmwp9q", "question": "What is the food in the little bowl called?", "choices": ["broccoli slaw", "salad", "pesto", "cole slaw"], "correct_choice_idx": 3, "direct_answers": ["coleslaw", "cole slaw", "coleslaw", "coleslaw", "cole slaw", "coleslaw", "coleslaw", "coleslaw", "cole slaw", "cole slaw"], "difficult_direct_answer": false, "rationales": ["The food is coleslaw.", "A sandwich is served with a small cup of shredded cabbage with dressing on it.", "There is some cole slaw in the white bowl to the right."], "image": "val2014/COCO_val2014_000000426546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466621, "question_id": "R49ygsAkshYcMErs5D8Uma", "question": "Which train is more likely to move first?", "choices": ["none", "middle", "rightmost", "leftmost"], "correct_choice_idx": 2, "direct_answers": ["right", "right one", "far right", "right train", "red", "black train", "black one", "j 515", "rightmost", "right one"], "difficult_direct_answer": true, "rationales": ["The rightmost train has steam coming from it.", "The train is on the right.", "The train is more modern."], "image": "train2014/COCO_train2014_000000466621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30038, "question_id": "R4DUNVpYgzgPNwbpxCHBsb", "question": "From which donut shop have they most likely purchased donuts?", "choices": ["dunkin donuts", "tim hortons", "winchell's", "krispy kreme"], "correct_choice_idx": 0, "direct_answers": ["dunkin donuts", "dunking donuts", "dunkin", "dunkin donuts", "dunkin donuts", "dunkin", "dunking donuts", "dunkin donuts", "dunkin donuts", "dunkin donuts"], "difficult_direct_answer": false, "rationales": ["The box says dunkin donuts and has donuts in it.", "That box is from dunkin donuts.", "The name of the donut shop is on the coffee cup."], "image": "train2014/COCO_train2014_000000030038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532695, "question_id": "R4GEg2WD8aRR3DsWuLV2dM", "question": "Why is she holding her hair?", "choices": ["is cleaning", "is cutting", "is stuck", "is curious"], "correct_choice_idx": 1, "direct_answers": ["brushing", "is cutting", "brushing it", "spraying", "comb hair", "brushing", "brushing", "brushing", "brushing", "being brushed"], "difficult_direct_answer": false, "rationales": ["The person is holding a brush to the hair.", "This is difficult to tell given she's holding a brush. it might be b or c too.", "The woman in blue is cutting hair."], "image": "val2014/COCO_val2014_000000532695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365032, "question_id": "R4UwFWDw6MifujpDYEs3PM", "question": "What is causing a reflection in the image?", "choices": ["solar winds", "mirror", "wax", "windshield"], "correct_choice_idx": 3, "direct_answers": ["windshield", "glass", "glass", "window pane", "dashboard", "glass window", "window", "sun", "windshield", "mirror"], "difficult_direct_answer": false, "rationales": ["Someone is taking the picture from inside a car.", "Part of a vehicle can be seen.", "The person is taking the photo from inside the car."], "image": "train2014/COCO_train2014_000000365032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76822, "question_id": "R4XfEmAGUQhHvkNb3UmoBt", "question": "What's the name for the kind of area the skiers are using?", "choices": ["full pipe", "half pipe", "mega pit", "drop zone"], "correct_choice_idx": 1, "direct_answers": ["half pipe", "ski slope", "slope", "slope", "luge", "slope", "slope", "half pipe", "half pipe", "half pipe"], "difficult_direct_answer": false, "rationales": ["The skiers are using a half pipe.", "The place is shaped like half a pipe.", "A half pipe so they can do assorted tricks on it."], "image": "val2014/COCO_val2014_000000076822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474868, "question_id": "R4h5zsbT6seeuNNHLL7oY6", "question": "Who might likely employ the person wearing the brightest clothes here?", "choices": ["nasa", "nope", "yelp", "whelp"], "correct_choice_idx": 2, "direct_answers": ["qelp", "yelp", "pizza place", "sales office", "phone company", "tech companies", "yelp", "yelp", "yelp", "yelp"], "difficult_direct_answer": false, "rationales": ["The name of the employer is on the red sweatshirt.", "The person in the red jacket has \"yelp\" on their chest.", "The name of a company appears on the man in questions shirt and is readable. the man may be associated with the company on his shirt."], "image": "val2014/COCO_val2014_000000474868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564982, "question_id": "R4sS7dPeG7hiGpJa6qX4Yc", "question": "How has this food been prepared for serving?", "choices": ["grated", "scooped", "sliced", "poured"], "correct_choice_idx": 2, "direct_answers": ["sliced", "sliced", "heated", "baked", "baked", "cooked", "freshly", "baked", "sliced", "oven"], "difficult_direct_answer": false, "rationales": ["Pizza is baked and sliced.", "This is a slice of pizza.", "The food is sliced."], "image": "train2014/COCO_train2014_000000564982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17944, "question_id": "R56SdjXjTsBfNWYCaP6anq", "question": "What is the person on the left wearing?", "choices": ["tie", "scarf", "cowboy hat", "suspenders"], "correct_choice_idx": 0, "direct_answers": ["tie", "neck tie", "board", "tie", "tie", "tie", "tie", "suit clothes", "tie", "necktie"], "difficult_direct_answer": false, "rationales": ["The person in question is clearly visible and answer a is the most distinctly visible article of clothing they have on.", "A person is in formal clothes in front of a building.", "He is wearing a black one."], "image": "val2014/COCO_val2014_000000017944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453310, "question_id": "R5NC39DSXZYbCkYcrhEo3f", "question": "What is this vehicle used for?", "choices": ["hauling", "construction", "emergencies", "passengers"], "correct_choice_idx": 2, "direct_answers": ["emergencies", "emergencies", "fire department", "transport", "rescue", "city work", "fire department", "emergencies", "work", "hauling things"], "difficult_direct_answer": false, "rationales": ["The vehicle has emergency lights on the roof.", "It is brightly coloured vehicle with sirens on the top if they are needed to quickly get through traffic to reach an emergency.", "It is red."], "image": "val2014/COCO_val2014_000000453310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3220, "question_id": "R5UWcSkBofzBQnYAqsZpEZ", "question": "Why does the person in long read clothing wear a large head?", "choices": ["halloween", "helmet", "he's mascot", "lost bet"], "correct_choice_idx": 2, "direct_answers": ["team mascot", "mascot", "team mascot", "mascot", "mascot", "mascot", "mascot", "team mascot", "mascot", "he's mascot"], "difficult_direct_answer": false, "rationales": ["The person on the field in the red outfit has a large head because it is the costume for the team's mascot.", "The person is cheering on the team as a mascot.", "He is to get people excited about the game"], "image": "train2014/COCO_train2014_000000003220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513108, "question_id": "R5o7dSuxYPHybMNrqUbnDJ", "question": "Who is the person holding the knives?", "choices": ["priest", "groom", "chef", "waiter"], "correct_choice_idx": 1, "direct_answers": ["priest", "groom", "cake cutter", "groom", "groom", "cake cutter", "cake cutter", "priest", "groom", "groom"], "difficult_direct_answer": false, "rationales": ["The couple will cut the cake holding the knife together", "The groom and bride are cutting the cake at a wedding. the man is wearing a black tux.", "A person is double clutching a knife in their hand. they are about to cut the cake that's for them."], "image": "train2014/COCO_train2014_000000513108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351233, "question_id": "R5ooBcYa2QBtqBKX2DwmSD", "question": "What would happen if he didn't have on gloves?", "choices": ["no traction", "hands dirty", "injured hand", "nothing"], "correct_choice_idx": 2, "direct_answers": ["road rash", "scrapes", "injured hand", "injury", "scrapes", "hand scraped", "road rash", "scrape", "road rash", "roadrash"], "difficult_direct_answer": false, "rationales": ["Skin getting rubbed across pavement generally suffers road rash, a painfull injury.", "The man would scrape his hands.", "You can get your hand hurt."], "image": "train2014/COCO_train2014_000000351233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528868, "question_id": "R5ynFvMGAXFETmHAxNJEgB", "question": "What are the little ones called?", "choices": ["pups", "kits", "chicks", "cubs"], "correct_choice_idx": 3, "direct_answers": ["cubs", "bear", "cubs", "cubs", "cubs", "cubs", "cubs", "bear", "cubs", "cubs"], "difficult_direct_answer": false, "rationales": ["The babies have the same name as the chicago baseball team.", "The little one is a cub.", "The animal is clearly visible and identifiable based on its size, shape, fur and color. the young of this kind of animal is commonly known."], "image": "train2014/COCO_train2014_000000528868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341067, "question_id": "R67pMKVKP5s2tRMQAd6yqF", "question": "What is he doing?", "choices": ["cleaning up", "chasing ball", "falling", "dropping racquet"], "correct_choice_idx": 1, "direct_answers": ["approaching ball", "hitting ball", "playing tennis", "hit ball", "chasing ball", "chasing ball", "playing tennis", "running", "charging", "playing tennis"], "difficult_direct_answer": false, "rationales": ["A young boy is playing a tennis match. he is trying to hit the ball before it hits ground again.", "The boy is trying to run down the ball.", "He is running in the game to hit the tennis ball."], "image": "val2014/COCO_val2014_000000341067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41020, "question_id": "R69YWBu2KCRTifrH7Uaqfs", "question": "What style mount is this woman shown seated in atop her horse?", "choices": ["dismount", "leading", "side saddle", "cowboy style"], "correct_choice_idx": 2, "direct_answers": ["saddle", "side saddle", "side straddle", "side saddle", "sidesaddle", "two legged", "side saddle", "saddle", "side saddle", "side mount"], "difficult_direct_answer": false, "rationales": ["The woman has both her legs on the same side of the horse.", "Both her legs are on the side of the horse.", "Both of her legs are on the same side of the horse."], "image": "val2014/COCO_val2014_000000041020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61503, "question_id": "R6HjLKs3U98eBppaxSyTq9", "question": "The person under the umbrella has what on their sleeve?", "choices": ["notes", "scarf", "button", "dirt"], "correct_choice_idx": 2, "direct_answers": ["button", "button", "button", "botton", "buttons", "button", "button", "patches", "cufflink", "button"], "difficult_direct_answer": false, "rationales": ["The person under the umbrella is wearing a long sleeve jacket. the only object on the sleeve is a white button.", "There is only on person visible under an umbrella and their sleeve is clearly visible. based on the size and shape of the object in question, it would be answer a.", "The person with the umbrella is wearing a jacket with a gold button on the sleeve."], "image": "val2014/COCO_val2014_000000061503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505347, "question_id": "R6KwjtytBT5dY3ZYKtwAth", "question": "What are the horses doing?", "choices": ["plowing", "resting", "pulling man", "exercising"], "correct_choice_idx": 2, "direct_answers": ["walking", "walking", "working", "traversing river", "plowing", "walking", "walking water", "pulling man", "trotting", "walking"], "difficult_direct_answer": false, "rationales": ["That's what it looks like the horses are doing.", "They have no reason to do a unless he needs the exercise and they need it. so, c would be the first reason.", "They are pulling this man."], "image": "train2014/COCO_train2014_000000505347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314297, "question_id": "R6PX6x7tmu65bsNsX8Zfwb", "question": "Why is he sitting on the skateboard?", "choices": ["is stuck", "balancing", "fell down", "showing off"], "correct_choice_idx": 2, "direct_answers": ["fell down", "fell down", "he fell", "skating", "street luging", "fell down", "performing trick", "riding", "he fell", "street luge"], "difficult_direct_answer": false, "rationales": ["The man fell off of it.", "The man fell.", "Others are standing on their boards while one individual is down. the individual appears to have fallen off during the race."], "image": "train2014/COCO_train2014_000000314297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38572, "question_id": "R6cxrVnHcQ6x4SMUdZ7iXg", "question": "What does the person in pink ride?", "choices": ["dolphins", "surf board", "whales", "donkeys"], "correct_choice_idx": 0, "direct_answers": ["dolphins", "dolphins", "dolphins", "dolphins", "dolphins", "dolphins", "dolphins", "dolphins", "dolphins", "dolphins"], "difficult_direct_answer": false, "rationales": ["She is riding on two grey aquatic mammals. they are too small to be whales.", "They are grey with small beaks and flippers on each side.", "He riding dolphins like they were a toy. :("], "image": "val2014/COCO_val2014_000000038572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268774, "question_id": "R6qoJdKvvYiFoiUVMaM9nJ", "question": "What German company is being advertised in the signs?", "choices": ["volkswagen", "bosch", "mcdonald's", "makita"], "correct_choice_idx": 1, "direct_answers": ["bosch", "bosch", "bosch", "bosch", "makin", "bosch", "bosch", "makin", "bosch", "bosch"], "difficult_direct_answer": false, "rationales": ["Germany's largest auto-parts manufacturer.", "Makita is a japanese company, and mcdonald's is an american company. a red and white sign for a german company is on the left.", "There is a sign for makita tools."], "image": "train2014/COCO_train2014_000000268774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73235, "question_id": "R77dZkJyJ2Luv3Ahwy8CJc", "question": "What type of vehicle crosses near the white X?", "choices": ["bus", "plane", "train", "bike"], "correct_choice_idx": 2, "direct_answers": ["train", "train", "railroad crossing", "train", "train", "small vehicle", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["There is a sign telling what is crossing there.", "The x shaped sign signifies a railroad crossing.", "There is railroad sign across intersection."], "image": "train2014/COCO_train2014_000000073235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127284, "question_id": "R7FM25RJGQfECFieBMpMPi", "question": "What type of vehicle is present in the water?", "choices": ["board", "bicycle", "truck", "car"], "correct_choice_idx": 0, "direct_answers": ["boat", "boat", "board", "paddleboard", "surfboard", "paddleboard", "ski boat", "paddle board", "surf board", "paddle board"], "difficult_direct_answer": false, "rationales": ["Vehicles without wheels don't work well in the water.", "The man and dog are on a paddle board.", "A person and a dog are standing on the floating vehicle. trucks, cars, and bicycles cannot float on water."], "image": "train2014/COCO_train2014_000000127284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245806, "question_id": "R7ixX9c86w63THunNry3qh", "question": "What type or style meal is being prepared?", "choices": ["lunch", "snack", "breakfast", "dinner"], "correct_choice_idx": 2, "direct_answers": ["breakfast", "pancakes", "pancakes", "outdoor", "breakfast", "pancakes", "bbq pancake", "breakfast", "breakfast", "pancake"], "difficult_direct_answer": false, "rationales": ["Pancakes are traditionally breakfast items and he is preparing pancakes.", "There are pancakes on the griddle, a breakfast staple.", "The person is making pancakes and that is typical of breakfast."], "image": "train2014/COCO_train2014_000000245806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399028, "question_id": "R8Sd2d56MnVKPTFsUf9zdo", "question": "What kind of pattern is the road?", "choices": ["black", "tiled", "bumpy", "square"], "correct_choice_idx": 1, "direct_answers": ["tile", "brick", "cobblestone", "vector", "brick", "tiled", "cobblestone", "brick", "tiled", "tiled"], "difficult_direct_answer": false, "rationales": ["The pattern is laid with brick.", "The pattern is tiled.", "The road is neatly tiled with paving stones."], "image": "train2014/COCO_train2014_000000399028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170558, "question_id": "R8VK5rTvmKany6q2wQndNC", "question": "What race is the man closest to the camera?", "choices": ["black", "asian", "white", "indian"], "correct_choice_idx": 1, "direct_answers": ["asian", "asian", "ami", "asian", "asian", "asian", "asian", "unknown", "mexican", "laptop working"], "difficult_direct_answer": false, "rationales": ["The man has black hair.", "The person has the characteristics of an asian person.", "This seems to be the case given the facial features."], "image": "train2014/COCO_train2014_000000170558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473466, "question_id": "R8b9Ruox752SaTmYxgURJV", "question": "What property does the black part of the costume have?", "choices": ["sun proof", "cold resistant", "breathable", "waterproof"], "correct_choice_idx": 2, "direct_answers": ["fur", "mouth", "mesh", "mesh", "mouth", "mouth", "mouth", "breathable", "mouth", "felt"], "difficult_direct_answer": false, "rationales": ["The black part is by the wearers mouth and made out of mesh so they can still breathe while wearing it.", "The costume has breathable material.", "The person inside the mask needs to be able to breathe easily without changing the tone of the mask."], "image": "val2014/COCO_val2014_000000473466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301121, "question_id": "R8pL53bscTzgneZvRFc86C", "question": "What does the man seated here await?", "choices": ["sale", "train", "bus", "airplane"], "correct_choice_idx": 2, "direct_answers": ["bus", "bus", "bowling services", "bus", "bus", "bowling services", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["These seats are by the road, and city transit vehicles normally stop for people sitting on these benches.", "The man wants the bus.", "The bench he is sitting on is on a sidewalk. airplanes and trains do not travel on streets."], "image": "train2014/COCO_train2014_000000301121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354734, "question_id": "R8tndSXBYSFmRLkajumXmP", "question": "Who are the roads for?", "choices": ["downtown", "pedestrians", "drivers", "directions"], "correct_choice_idx": 2, "direct_answers": ["motorized vehicles", "drivers", "merging", "motor vehicles", "motor vehicles", "people", "traffic", "vehicles", "cars", "driving on"], "difficult_direct_answer": true, "rationales": ["There are cars, trucks and buses on the roads. people operate cars, trucks and buses.", "The roads are heavily travelled with trucks, buses, and cars. there are three different levels of roads or bridges.", "There are no sidewalks near the roads. the roads are for vehicles."], "image": "train2014/COCO_train2014_000000354734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234978, "question_id": "R994eVv5YQZG6nCZ3rJNLx", "question": "In what French region are they in?", "choices": ["corse", "normandy", "occitanie", "brittany"], "correct_choice_idx": 2, "direct_answers": ["brittany", "alps", "peyragudes", "france", "occitanie", "pyrenees", "peragudes", "alps", "dordogne", "peyraglides"], "difficult_direct_answer": true, "rationales": ["The peyragudes is a large ski resort in the occitanie region.", "People are racing bicycles in a mountainous area.", "The fencing contains wording of peyragudes. this is the name of a large ski resort in the french pyreness in the region of occitanie."], "image": "train2014/COCO_train2014_000000234978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98447, "question_id": "R9GyGKzWUGSQv2ywaGVsTA", "question": "What purpose does the gray canister serve?", "choices": ["decorative", "trash can", "beer barrel", "ballot box"], "correct_choice_idx": 1, "direct_answers": ["trash", "trash can", "holding garbage", "trash receptacle", "garbage", "trash can", "garbage can", "garbage can", "garbage can", "garbage can"], "difficult_direct_answer": false, "rationales": ["The gray canister next to the people is a garbage pail that is used for throwing trash into.", "That is used for trash.", "The gray canister is purposed as a trash can."], "image": "train2014/COCO_train2014_000000098447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250193, "question_id": "R9JBnpveAmLnUYtRYDXjSF", "question": "What is the thing on the front tip of the airplane?", "choices": ["whirl", "tip", "windmill", "propeller"], "correct_choice_idx": 3, "direct_answers": ["propeller", "propeller", "propeller", "propeller", "propeller", "propeller", "rotor", "propeller", "antenna", "propeller"], "difficult_direct_answer": false, "rationales": ["Long thin boards protrude from a circular object on the front of a plane.", "There is a propeller with two blades on the front of the plane.", "Planes use propellers to fly."], "image": "train2014/COCO_train2014_000000250193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388179, "question_id": "R9aaSPXqQvT9NhBDCbqmyG", "question": "Why are the men wearing orange vests?", "choices": ["camouflage", "visibility", "fashion", "costume"], "correct_choice_idx": 1, "direct_answers": ["high visibility", "hazards", "safety", "safety", "visibility", "airline workers", "safety", "safety", "attract attention", "unloading luggage"], "difficult_direct_answer": false, "rationales": ["The men need to be visible.", "The men want to be seen for safety reasons.", "The men want to be seen."], "image": "train2014/COCO_train2014_000000388179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55689, "question_id": "R9cGYQxssyiGp9A4kaxDPi", "question": "This phone is the same color as which object inside of the classroom?", "choices": ["desk", "chalkboard", "drapes", "radiator"], "correct_choice_idx": 1, "direct_answers": ["class board", "chalkboard", "chalkboard", "board", "floor line", "chalkboard", "writing board", "chalkboard", "lines", "chalkboard"], "difficult_direct_answer": false, "rationales": ["They are both shades of green", "The chalkboard is green.", "The phone is in the same color as the chalkboard."], "image": "train2014/COCO_train2014_000000055689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449947, "question_id": "R9jdF3knNQHXBtm3tS7fdJ", "question": "What type of mineral is the child most likely getting enough of?", "choices": ["zinc", "iron", "potassium", "calcium"], "correct_choice_idx": 2, "direct_answers": ["potassium", "potassium", "potassium", "potassium", "potassium", "iron", "potassium", "potassium", "potassium", "iron"], "difficult_direct_answer": false, "rationales": ["The child is standing near bananas.", "The child is near bananas which are high in potassium.", "The child is near a bunch of bananas which are very high in potassium."], "image": "train2014/COCO_train2014_000000449947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26506, "question_id": "R9o3kmpk2VvRcc6FVUNPCB", "question": "Where did these animals find water?", "choices": ["on beach", "near lake", "near pool", "in well"], "correct_choice_idx": 0, "direct_answers": ["stream", "beach", "tide pool", "lake", "ocean", "on beach", "beach", "ocean", "puddle", "pond"], "difficult_direct_answer": false, "rationales": ["These animals found a backwater on a beach.", "They are at the water that is a little inland from the coast", "They are slightly inland from the main body of water"], "image": "train2014/COCO_train2014_000000026506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526004, "question_id": "RA3a4nmzrTYCdeV2eaFBgc", "question": "What food is this child chewing on?", "choices": ["bread stick", "cookie", "fruit", "pizza"], "correct_choice_idx": 3, "direct_answers": ["pizza", "crust", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza crust", "pizza crust", "pizza crust"], "difficult_direct_answer": false, "rationales": ["He is eating the crust on a pizza.", "The child has a pizza crust.", "Although it could be a bread stick, they are generally straighter with no curves. obviously this is not a cookie or a fruit."], "image": "val2014/COCO_val2014_000000526004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498931, "question_id": "RAS9Ryhn5mQgufHFBZVjXh", "question": "What is the elevated metal railway called?", "choices": ["zip line", "sky taxi", "sky elevation", "ski lift"], "correct_choice_idx": 3, "direct_answers": ["lift", "ski lift", "chair lift", "guardrail", "ski lift", "lift", "ski lift", "ski lift", "ski lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["There is snow where people want to do snow sports at a high elevation going down.", "The elevated railway on the mountain is a ski lift that skiers use to go up the slope.", "It transports skiiers to the top of the mountain"], "image": "val2014/COCO_val2014_000000498931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176726, "question_id": "RAvXsS9oyjXETxCksLmeC3", "question": "What type of solution were the cucumbers soaked in?", "choices": ["yogurt", "mustard", "syrup", "brine"], "correct_choice_idx": 3, "direct_answers": ["dill", "brine", "brine", "pickling", "fruits", "vinegar", "spicy", "pickle brine", "snake", "vinegar"], "difficult_direct_answer": false, "rationales": ["The cucumbers are soaked in vinegar.", "They are pickled.", "These cucumbers have been soaked in brine to make a pickle."], "image": "train2014/COCO_train2014_000000176726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297092, "question_id": "RBJgRpsYuP64MxYFeCshPV", "question": "Where does the minor league Red Sox player play?", "choices": ["pawtucket", "martha's vineyard", "nantucket", "boston"], "correct_choice_idx": 0, "direct_answers": ["pawtucket", "field", "boston", "boston", "baseball", "indianapolis", "batter", "boston", "florida", "worcester"], "difficult_direct_answer": false, "rationales": ["They play in boston.", "The red sox are out of boston.", "They play in pawtuket."], "image": "train2014/COCO_train2014_000000297092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149123, "question_id": "RBSPtFBFHzsne6m8nKPoTd", "question": "Where is Sovereign Recovery located?", "choices": ["st albans", "london", "st. louis", "tuscany"], "correct_choice_idx": 0, "direct_answers": ["london", "london", "st albans", "london", "london", "london", "london", "london", "london", "london"], "difficult_direct_answer": false, "rationales": ["The text on the truck says st. albans.", "The truck is in st albans.", "The sovereign recovery is located in st albans of london."], "image": "val2014/COCO_val2014_000000149123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147866, "question_id": "RBxf6jaY3WPin6HAUDChQq", "question": "What are the last four digits visible on the pay toll?", "choices": ["7257", "7753", "7375", "7275"], "correct_choice_idx": 3, "direct_answers": ["7275", "7275", "0195", "7278", "7275", "7275", "7275", "7276", "one ninetyfive", "7275"], "difficult_direct_answer": false, "rationales": ["The numbers are written on the pay toll and are readable.", "Those are the numbers on the meter.", "A single pay toll is by the edge of a car. in the blue portion is the numbers that we need."], "image": "val2014/COCO_val2014_000000147866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423576, "question_id": "RCCVzncdarEj6biJo4uxQ6", "question": "What is near the mug?", "choices": ["cat", "baby", "glasses", "cow"], "correct_choice_idx": 2, "direct_answers": ["glasses", "glasses", "glasses", "eye glasses", "eye glasses", "glasses", "glasses", "glasses", "glasses", "eye glasses"], "difficult_direct_answer": false, "rationales": ["The other options are live animals that are obviously not in this image.", "Someone put their glasses down on the table.", "There's a pair of eyeglasses on the table."], "image": "val2014/COCO_val2014_000000423576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360831, "question_id": "RCeMoaLNVxnzsQDyGPkeCZ", "question": "What is in the tubes?", "choices": ["flowers", "snails", "snakes", "apples"], "correct_choice_idx": 0, "direct_answers": ["plants", "water", "flowers", "plants", "flowers", "plants", "plant clippings", "buds", "flowers", "plants"], "difficult_direct_answer": false, "rationales": ["They will be larger some day.", "Assorted flowers are in the tubes.", "There is a purple bloom in the vase."], "image": "train2014/COCO_train2014_000000360831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503887, "question_id": "RCs46u6GtbjaUWMopqHxjh", "question": "What type of transportation is shown?", "choices": ["water", "rail", "road", "air"], "correct_choice_idx": 1, "direct_answers": ["train", "train", "train", "rail", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["It is a rail train.", "There is a large train car that is sitting on train tracks.", "There are rails visible in the image and a train. those things combine for answer a type of travel."], "image": "train2014/COCO_train2014_000000503887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107225, "question_id": "RCwtkZkQaTxqCDL6c4LKji", "question": "Who owns the billboard illuminated in the most golden lighting above the NY Police dept?", "choices": ["izod", "duane reade", "yahoo", "police"], "correct_choice_idx": 2, "direct_answers": ["yahoo", "izod", "yahoo", "yahoo", "izod", "yahoo", "sports shop", "izod", "luxury sport", "izod"], "difficult_direct_answer": false, "rationales": ["The owner is yahoo.", "The yahoo! billboard is the only one with lighting that's closest to the color gold, which is coming from small bulbs surrounding it.", "The question locates the object and the lettering is clearly visible."], "image": "train2014/COCO_train2014_000000107225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249435, "question_id": "RCxCHt8iy73zqkjXBKLKpY", "question": "What treat do the children share here?", "choices": ["birthday cake", "christmas cake", "marshmallow fluff", "hot dogs"], "correct_choice_idx": 0, "direct_answers": ["cake", "cake", "cake", "cake", "cake", "cake", "cake", "cake", "birthday cake", "cake"], "difficult_direct_answer": false, "rationales": ["The kids are sharing a cake for a birthday.", "The food the boys are eating has the colorful frosting and other components consistent with answer a.", "A birthday party with kids would not be complete without this treat."], "image": "train2014/COCO_train2014_000000249435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494860, "question_id": "RD6Qkk6sUdfgwirisaUtxL", "question": "In which country does this person play tennis here?", "choices": ["spain", "italy", "japan", "united states"], "correct_choice_idx": 0, "direct_answers": ["spain", "madrid", "europe", "spain", "spain", "spain", "spain", "spain", "spain", "europe"], "difficult_direct_answer": false, "rationales": ["The country is spain.", "The location is written on the court and the location is commonly known to be in answer a.", "The tennis court has the word madrid on it so presumably the game occurred in the country where madrid is the capital city."], "image": "train2014/COCO_train2014_000000494860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48863, "question_id": "RD8HHMnmMAfAJWrwPz7z7n", "question": "What is the small red appliance?", "choices": ["blender", "air fryer", "can opener", "toaster"], "correct_choice_idx": 3, "direct_answers": ["toaster", "toaster", "toaster", "toaster", "bread machine", "bread machine", "toaster", "toaster", "toaster", "bread machine"], "difficult_direct_answer": false, "rationales": ["The red appliance is for bread.", "The item has entry points for toast and is in an old throwback artistic design.", "A toaster to make toast with."], "image": "train2014/COCO_train2014_000000048863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15554, "question_id": "RDLT82KJvLvyUorkqLVSrB", "question": "From what does the man with the umbrella protect himself?", "choices": ["rain", "snow", "sun glare", "gunfire"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "raindrops", "from rain", "rain"], "difficult_direct_answer": false, "rationales": ["It is gloomy and the streets are wet.", "The streets are wet. there are droplets on the umbrella.", "The ground is wet"], "image": "train2014/COCO_train2014_000000015554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382670, "question_id": "RDLfn6NuwtzcGgZmvEV97M", "question": "What is a likely purpose of the cattle?", "choices": ["racing", "pulling wheelburrow", "hunting", "friendship"], "correct_choice_idx": 1, "direct_answers": ["pulling wheelburrow", "pull carts", "pulling", "balancing", "food", "food", "pulling carts", "beef", "pull things", "pull cart"], "difficult_direct_answer": true, "rationales": ["The wheelbarrow is big and does not have handles, indicating that it needs an animal attached to it in order to move.", "The cow is doing work.", "The cowers have ropes on them and there is a wagon nearby."], "image": "val2014/COCO_val2014_000000382670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265462, "question_id": "RDMUoqKDNtgRmagr4L9yf8", "question": "What prevents a person in a wheelchair from reaching the background?", "choices": ["skateboards", "trees", "teens", "stairs"], "correct_choice_idx": 3, "direct_answers": ["stairs", "stairs", "incline", "stairs", "stairs", "sliding", "steps", "stairs", "stairs", "steps"], "difficult_direct_answer": false, "rationales": ["Wheels can't go down stairs.", "A person is skateboarding down some stairs.", "The picture includes a skating rink and a set of stairs. due to limitations of the wheelchairs, it will not be able to overcome the stairs."], "image": "val2014/COCO_val2014_000000265462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451915, "question_id": "RDN4EazuwwD5L3PCLLfVxQ", "question": "Why is the beach empty?", "choices": ["pollution", "storm coming", "work day", "lockdown"], "correct_choice_idx": 1, "direct_answers": ["raining", "storm coming", "bad weather", "weather", "no people", "cloudy day", "storming", "getting dark", "storm coming", "storm coming"], "difficult_direct_answer": false, "rationales": ["The storm is coming.", "The clouds are dark and its clear the storm is coming.", "There are black clouds in the sky."], "image": "val2014/COCO_val2014_000000451915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196049, "question_id": "RDWknXiCFfESmaUXJVMHwe", "question": "What century could this be?", "choices": ["21st", "8th", "19th", "20th"], "correct_choice_idx": 2, "direct_answers": ["eighteenth", "nineteenth", "eighteenth", "19th", "19th", "19th", "20th", "horse", "eighteen hundreds", "nineteenth"], "difficult_direct_answer": false, "rationales": ["The buildings are well made, but there are horse drawn carriages.", "The century is the 19th.", "The main form of transportation is horse-drawn carriage."], "image": "train2014/COCO_train2014_000000196049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578453, "question_id": "RDgXTRcMaaEHQETdcNmBca", "question": "Why are these blockades here?", "choices": ["to hide", "style", "comfort", "safety/security"], "correct_choice_idx": 3, "direct_answers": ["water", "stop vehicles", "prevent driving", "police", "safety/security", "block traffic", "block vehicles", "for vehicles", "protection", "block people"], "difficult_direct_answer": true, "rationales": ["It is so no one drives off the pier", "The blockades keep out any dangerous threats.", "The blockades are secure."], "image": "train2014/COCO_train2014_000000578453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391627, "question_id": "RDi7cWEFyktZxe5dHWWeGf", "question": "What material is the post to the left of the umbrella cover stand made out of?", "choices": ["aluminum", "tin", "copper", "brass"], "correct_choice_idx": 3, "direct_answers": ["metal", "iron", "brass", "bronze", "brass", "copper", "bronze", "gold", "metal", "aluminium"], "difficult_direct_answer": false, "rationales": ["It is gold colored", "The metal is almost a gold like color.", "The material is brass."], "image": "train2014/COCO_train2014_000000391627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171997, "question_id": "RDmjTKNaL7zj9itPpa39Sg", "question": "What items were in the purple box?", "choices": ["animals", "oranges", "tickets", "baseballs"], "correct_choice_idx": 3, "direct_answers": ["balls", "balls", "balls", "balls", "baseballs", "balls", "baseballs", "baseballs", "balls", "balls"], "difficult_direct_answer": false, "rationales": ["The box contains baseballs for the game.", "There still appears to be a few balls left in the box and there are baseballs on the ground around it.", "The baseballs were inside."], "image": "train2014/COCO_train2014_000000171997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456082, "question_id": "RDmusmSG3SmtyNcNBaDXCf", "question": "How did the players arrive at this venue?", "choices": ["boat", "helicopter", "car", "train"], "correct_choice_idx": 2, "direct_answers": ["by car", "cars", "drove", "cars", "running", "car", "cars", "cars", "personal vehicles", "automobile"], "difficult_direct_answer": false, "rationales": ["The players are playing on a field next to a parking lot where they left the cars they drove.", "The players have cars behind them.", "The players arrived by driving one of the cars in the parking lot."], "image": "train2014/COCO_train2014_000000456082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504304, "question_id": "RDtW3fK37onXWT2C2N46xn", "question": "Where does the girl want to hit the ball?", "choices": ["behind net", "under net", "up", "over net"], "correct_choice_idx": 3, "direct_answers": ["to lady", "over net", "forward", "tennis racket", "over net", "over net", "over net", "over net", "straight across", "racket"], "difficult_direct_answer": false, "rationales": ["The girl is swinging her racket upwards, to strike a ball into the air. the rules of tennis call for balls to be hit over the net.", "She needs to hit it back to the other player to keep the game going", "The girl is playing tennis. the objective is to send the ball to the other player."], "image": "val2014/COCO_val2014_000000504304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516856, "question_id": "RDwJpUbU2pwFgeCZky7jfq", "question": "Why is he opening his mouth wide?", "choices": ["yawning", "laughing", "screaming", "large sandwich"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eating", "to eat", "eating", "to eat", "large sandwich", "biting sandwich", "to eat", "to eat"], "difficult_direct_answer": false, "rationales": ["He has to open it wide due to the size of the food he is eating.", "The man has a huge cheesesteak.", "He's eating food and you can't do that while you're laughing, screaming or yawning."], "image": "val2014/COCO_val2014_000000516856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372989, "question_id": "RDyoHmpHeivUQq9YYYoLUa", "question": "The grey hat worn by the woman is made of what material?", "choices": ["copper", "plastic", "aluminum", "gold"], "correct_choice_idx": 1, "direct_answers": ["plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic"], "difficult_direct_answer": false, "rationales": ["The hat looks lightweight and flimsy.", "The viking helmet is plastic.", "It looks like it's made from plastic."], "image": "train2014/COCO_train2014_000000372989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574599, "question_id": "REAksmsrHqDsePofFWyqr3", "question": "What are the animals doing in the road?", "choices": ["racing", "eating", "crossing", "fighting"], "correct_choice_idx": 2, "direct_answers": ["crossing", "stampeding", "crossing", "crossing", "crossing", "herding", "running", "crossing", "crossing", "herding"], "difficult_direct_answer": false, "rationales": ["They are walking to the other side of the road.", "The animals are going from one side of the road to the other.", "They are on a road and they are walking over it."], "image": "train2014/COCO_train2014_000000574599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123535, "question_id": "RENg8hgx6cNmbqXpKSTjhQ", "question": "What kind of sheep is nearest the back of the blue trailer?", "choices": ["ewe", "lamb", "ram", "wether"], "correct_choice_idx": 0, "direct_answers": ["ewe", "goat", "white sheep", "furry", "back", "merino sheep", "female", "female", "male", "white"], "difficult_direct_answer": true, "rationales": ["The sheep is a female.", "The ewes are riding in the truck.", "A white sheep is on a trailer."], "image": "train2014/COCO_train2014_000000123535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253266, "question_id": "REjgXZQT9WveMYCuyXRTmY", "question": "What is the height of shuttle Net?", "choices": ["1.9m", "3.78m", "1.55m", "2.0m"], "correct_choice_idx": 2, "direct_answers": ["three feet", "5 feet", "two feet", "low", "five one", "5ft 1in", "1.55m", "3 feet", "5'1", "five ft"], "difficult_direct_answer": true, "rationales": ["That is the height or 5'1\".", "The height is 1.55 meters.", "The shuttle net is relatively short."], "image": "train2014/COCO_train2014_000000253266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136142, "question_id": "REk74vTHqKBN9EWkz2hMYr", "question": "What is the main ingredient in the Kirkland product?", "choices": ["wheat", "quinoa", "oats", "corn"], "correct_choice_idx": 3, "direct_answers": ["flour", "corn", "flour", "corn", "tortilla", "corn", "corn", "corn", "corn", "corn"], "difficult_direct_answer": false, "rationales": ["Most of the part in the table have coins.", "Tortillas are made from this ingredients.", "The ingredient is corn."], "image": "train2014/COCO_train2014_000000136142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191457, "question_id": "REzHziJEhH53sB3KtXH9dQ", "question": "What might delay this planes departure today?", "choices": ["fog", "heat", "political unrest", "ice"], "correct_choice_idx": 0, "direct_answers": ["fog", "weather", "fog", "foggy sky", "fog", "rain", "bad weather", "bad weather", "weather", "fog"], "difficult_direct_answer": false, "rationales": ["When fog is present, pilots can not see the sky properly.", "It's really foggy out.", "The sky is covered in fog."], "image": "train2014/COCO_train2014_000000191457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291319, "question_id": "RFAcN2T4CLCkcSB7D5L5xR", "question": "What is the opposite supernatural being based on the player in the red hat's jersey?", "choices": ["werewolf", "god", "demon", "vampire"], "correct_choice_idx": 2, "direct_answers": ["devil", "devil", "signaling", "angels", "devil", "devils", "demon", "devil", "baseman", "devil"], "difficult_direct_answer": false, "rationales": ["The player plays for the angels. the opposite of an angel is a devil.", "A baseball player wears the uniform of the team the angels.", "The player is wearing an angels, not devils, jersey."], "image": "train2014/COCO_train2014_000000291319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160264, "question_id": "RFBzQjWzJWSvbwrauFvUkX", "question": "What is the name for this style of table?", "choices": ["elongated table", "long table", "rectangle table", "refectory table"], "correct_choice_idx": 3, "direct_answers": ["wood", "banquet", "long table", "dinner", "refectory", "banquet", "refectory table", "picnic", "longtable", "dining room"], "difficult_direct_answer": true, "rationales": ["It is a long wooden table shared by many in a pub", "The table is really long.", "This is a refectory table to be had in the basement or a dive."], "image": "train2014/COCO_train2014_000000160264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147172, "question_id": "RFHXLvbfu63tiaEAQpS9c6", "question": "What hat does the woman have on?", "choices": ["wool", "baseball cap", "fedora", "witch hat"], "correct_choice_idx": 3, "direct_answers": ["hat", "witch hat", "halloween hat", "witch hat", "witch", "panama", "cute hat", "halloween hat", "witches hat", "witch's hat"], "difficult_direct_answer": false, "rationales": ["The hat is a witch's.", "You can tell by the shape of the hat and what else she is wearing what she has on.", "The witch's hat is indicated by the pointed top."], "image": "train2014/COCO_train2014_000000147172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547136, "question_id": "RFqVtf3emHCFeTpyW74MNR", "question": "What is the abbreviation of the credentials that one needs to fly this craft?", "choices": ["atp", "ccl", "cfl", "apl"], "correct_choice_idx": 0, "direct_answers": ["jet engine", "pl", "camel", "faa certificate", "pilot", "fa", "ca", "pilot", "atp", "having control"], "difficult_direct_answer": true, "rationales": ["The abbreviation is atp.", "A pilot has to have a cfl.", "One needs an advanced trained pilot's license."], "image": "val2014/COCO_val2014_000000547136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206907, "question_id": "RFuptyUw4XasJEtZmZV62r", "question": "Who will ultimately decide the fate of the play?", "choices": ["umpire", "runner", "crowd", "fielder"], "correct_choice_idx": 0, "direct_answers": ["running player", "umpire", "umpire", "running player", "umpire", "umpire", "umpire", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["The umpire calls the shots in baseball.", "There is a man judging behind them.", "The umpire decides."], "image": "val2014/COCO_val2014_000000206907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245717, "question_id": "RGBuaYoCxSMHEsiPfTsi6Z", "question": "Why are there chains on the red thing?", "choices": ["prevent break-in", "hold lids", "purely aesthetic", "provides strength"], "correct_choice_idx": 1, "direct_answers": ["keep place", "connection", "keep plugs", "hold lids", "safekeeping", "security", "protecting purpose", "security", "lock screws", "hold nearby"], "difficult_direct_answer": true, "rationales": ["The chains hold lids.", "The object visible is a fire hydrant and the chains are connected to the lids to prevent them from being separated.", "The chains prevent the lid from blasting off."], "image": "train2014/COCO_train2014_000000245717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238472, "question_id": "RGFGLGp8V84vK4JmQfzcxY", "question": "What is the woman in red holding?", "choices": ["eggs", "kittens", "bananas", "skis"], "correct_choice_idx": 3, "direct_answers": ["ski poles", "skis", "ski poles", "ski poles", "ski pole", "ski poles", "poles", "ski pole", "ski pole", "ski poles"], "difficult_direct_answer": false, "rationales": ["The woman in red has skis.", "The woman is on skis.", "The woman is holding skis in her hand."], "image": "val2014/COCO_val2014_000000238472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411824, "question_id": "RGKYHgToS3TTSETwRhPpqL", "question": "For what group of people is the yellow area on the platform built?", "choices": ["handicapped people", "elderly people", "blind people", "pregnant women"], "correct_choice_idx": 2, "direct_answers": ["train", "passengers", "blind people", "passengers", "passangers", "train passengers", "passengers", "waiting passengers", "blind", "passengers"], "difficult_direct_answer": false, "rationales": ["The yellow area helps ensure that these people stay back a sufficient distance to be safe.", "Blind people use the grooves on the yellow area to recognize when to stay back.", "The yellow stripe on the train platform is textured so blind people can detect them with their canes."], "image": "train2014/COCO_train2014_000000411824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446971, "question_id": "RGT9G3TfmdpDhRCymaZFvu", "question": "What is the red building the people are walking towards made from?", "choices": ["glass", "brick", "plastic", "steel"], "correct_choice_idx": 1, "direct_answers": ["bricks", "brick", "bricks", "brick", "bricks", "brick", "bricks", "brick", "bricks", "brick"], "difficult_direct_answer": false, "rationales": ["The building is red, and glass, steel, and plastic do not have red colouring.", "It's a brick building made of all bricks.", "This is indicated by the design of the structure and surface."], "image": "train2014/COCO_train2014_000000446971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252807, "question_id": "RGcVeqTRotgmZDnAoYyovU", "question": "What kind of landscape are the two men seated at?", "choices": ["mountain", "plain", "hill", "tundra"], "correct_choice_idx": 0, "direct_answers": ["hill", "mountain side", "mountainous", "hillside", "slanting", "mountain", "mountain", "grass", "mountain", "hillside"], "difficult_direct_answer": false, "rationales": ["A mountain is in front of the men.", "They sky is too close to the top of the mountain for it to be a small hill. a plain or tundra would be flatter.", "You can tell by the landscape as to where they are."], "image": "train2014/COCO_train2014_000000252807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418087, "question_id": "RGh24gYiY9uNJZ7gSLt7nA", "question": "Where is this game being played?", "choices": ["stage", "park", "stadium", "backyard"], "correct_choice_idx": 2, "direct_answers": ["stadium", "baseball field", "stadium", "baseball", "stadium", "stadium", "baseball field", "ball field", "stadium", "baseball"], "difficult_direct_answer": false, "rationales": ["Dodgers and cubs are facing off in a baseball area. people are sitting in the stands as they watch the game.", "There are a number of spectators sitting in seats behind the baseball players. this is the place baseball is typically played.", "There is a field, advertisements and seats"], "image": "train2014/COCO_train2014_000000418087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61439, "question_id": "RGjWW4jNLs2GWG39KAp4zf", "question": "What material is the platform made of?", "choices": ["cement", "plastic", "wood", "metal"], "correct_choice_idx": 0, "direct_answers": ["cement", "stones", "concrete", "concrete", "cement", "concrete", "cement", "concrete", "concrete", "concrete"], "difficult_direct_answer": false, "rationales": ["The platform is made of cement.", "The platform is made of cement.", "The material is cement."], "image": "train2014/COCO_train2014_000000061439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266773, "question_id": "RHMxePb9R3pNNjwAmLF6Dx", "question": "Why is the man kicking his leg on the ground?", "choices": ["to jump", "to ollie", "to grind", "to move"], "correct_choice_idx": 3, "direct_answers": ["go faster", "speed up", "scatting", "tricks", "gain speed", "propulsion", "to move", "gain speed", "propel skateboard", "go faster"], "difficult_direct_answer": false, "rationales": ["He is doing this to gain speed", "Running ones foot on the ground gives traction and propels to make the skateboard move.", "The man is shown riding a skateboard. since there are no other sources of energy, he must use a leg kick on the ground to move."], "image": "train2014/COCO_train2014_000000266773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514777, "question_id": "RHSVN5EvNNAjReRGiVZipP", "question": "What enclosure is seen in the background?", "choices": ["hut", "gazebo", "tent", "dome"], "correct_choice_idx": 2, "direct_answers": ["tent", "tent", "tent", "tent", "shrubs", "tent", "tent", "tent", "forest", "tent"], "difficult_direct_answer": false, "rationales": ["The enclosure is a tent.", "There is a small tent in the background.", "There is a tent setup in the background."], "image": "train2014/COCO_train2014_000000514777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73215, "question_id": "RHY6vkLghcoLzcyHNRuZLz", "question": "What celebrity is from the country where the bus in the background is from?", "choices": ["elliot page", "sam elliott", "elliott gould", "missy elliott"], "correct_choice_idx": 0, "direct_answers": ["canada", "drake", "ryan reynolds", "eugene levy", "usa", "elliot page", "car", "canada", "ryan reynolds", "alex trebek"], "difficult_direct_answer": false, "rationales": ["The vehicle in the background is from canada. halifax is in canada.", "The train is likely canadian based on the writing readable on the side. the answer a actor is from this country.", "Elliot page is from canada."], "image": "train2014/COCO_train2014_000000073215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108019, "question_id": "RHwzUYTfgPjcxyPczcFxvh", "question": "What will they use to make this small enough to eat?", "choices": ["knife", "food processor", "straw", "spoon"], "correct_choice_idx": 0, "direct_answers": ["pizza cutter", "pizza cutter", "knife", "knife", "personal pan", "knife", "pizza cutter", "fork", "pizza cutter", "pizza cutter"], "difficult_direct_answer": false, "rationales": ["They'll use the knife.", "The knife can cut the pizza.", "The food depicted is a pizza that would be too large on its own to manageably eat. knives are used to cut foods into more manageable eating sizes."], "image": "train2014/COCO_train2014_000000108019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201756, "question_id": "RHy28apXaWQezXoNAw5ZbH", "question": "What is the animal on the tray?", "choices": ["cats", "chickens", "birds", "rabbits"], "correct_choice_idx": 2, "direct_answers": ["birds", "bird", "bird", "bird", "bird", "bird", "bird", "bird", "bird", "bird"], "difficult_direct_answer": false, "rationales": ["The animal has a beak and wings like a bird does.", "The animals have wings and the body shape of birds.", "They have feathers and small wings and also small beaks."], "image": "val2014/COCO_val2014_000000201756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561613, "question_id": "RJ59C9rDkLPzaG9c75vMEq", "question": "Where could you get this food?", "choices": ["fruit vendor", "taco truck", "starbucks", "burger joint"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "restaurant", "cookout", "fast food", "picnic", "sonic", "diner", "burger joint", "burger", "food truck"], "difficult_direct_answer": true, "rationales": ["The food is a burger joint snack.", "Hot dogs and burgers are shown.", "There is literally a burger in the image."], "image": "train2014/COCO_train2014_000000561613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564743, "question_id": "RJFXSfWNzgNRcZpjWU5j2e", "question": "What sort of sport is being learned here?", "choices": ["snow boarding", "water polo", "golf", "baseball"], "correct_choice_idx": 0, "direct_answers": ["snowboarding", "snowboarding", "snowboarding", "skiing", "snowboarding", "snow boarding", "skiing", "snowboarding", "snowboarding", "skiing"], "difficult_direct_answer": false, "rationales": ["The person on the right is skiing. the other people are learning a slightly different winter sport.", "People are on boards with both their feet on one board in the snow.", "The man is standing on a board on the snow and being held by a person on the upside of the hill."], "image": "val2014/COCO_val2014_000000564743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20698, "question_id": "RJM2PSrQXEoisBuoDk4Uyd", "question": "What is the yellow object the boy is staring at?", "choices": ["baseball", "tennis ball", "football", "frisbee"], "correct_choice_idx": 1, "direct_answers": ["tennis ball", "tennis ball", "tennis ball", "ball", "tennis ball", "ball", "tennis ball", "tennis ball", "ball", "tennis ball"], "difficult_direct_answer": false, "rationales": ["The item is a tennis ball and the boy is about to hit it with his racket.", "The boy is staring at a yellow tennis ball.", "Answer a is consistent with the setting, size and color of the ball in the image."], "image": "train2014/COCO_train2014_000000020698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331832, "question_id": "RJQPKJycMXG73EtDfEnrV2", "question": "What type of beverages are being consumed by the pizza eater?", "choices": ["beer", "wine", "milkshakes", "soft drinks"], "correct_choice_idx": 3, "direct_answers": ["coke", "coca cola", "soda", "juice", "coca cola", "cola", "soft drinks", "coca cola", "coke", "soda"], "difficult_direct_answer": false, "rationales": ["There is a a couple men sitting at a table eating pizza. there is an empty bottle of coca cola on the table.", "The soft drink bottle is on the right side of the image.", "The bottle on the far left usually holds this kind of beverage. the glass nearer to the right side of the table, in the center of the picture, contains a liquid which is consistent in appearance with this kind of beverage."], "image": "train2014/COCO_train2014_000000331832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303365, "question_id": "RJQtPhiNhqTmAzSt5TLKgA", "question": "Where is this area located?", "choices": ["suitcase expo", "airport", "barn", "jail"], "correct_choice_idx": 1, "direct_answers": ["airport", "waiting area", "airport", "airport", "terminal", "airport", "airport", "airport", "bus station", "airport"], "difficult_direct_answer": false, "rationales": ["The other options don't match the setting at all.", "The area is in a place that processes luggage.", "Airports have leather benches."], "image": "val2014/COCO_val2014_000000303365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396691, "question_id": "RK7WpwiwZruNMQuxk3bcw7", "question": "How was the sauce for the spaghetti warmed?", "choices": ["stove top", "restaurant", "microwave", "oven"], "correct_choice_idx": 0, "direct_answers": ["microwave", "stovetop", "microwave", "pot", "microwave", "stove", "microwave", "microwave", "stove", "stove top"], "difficult_direct_answer": false, "rationales": ["There is a large sauce pot on the stove so it appears that the stove top is where the sauce was warmed.", "The spaghetti sauce was warmed on a pot on the white stove.", "The spaghetti sauce is in the pot."], "image": "val2014/COCO_val2014_000000396691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152084, "question_id": "RKSLET6HczTiHAkqWeEyUJ", "question": "How do these people know each other?", "choices": ["coworkers", "teammates", "siblings", "rivals"], "correct_choice_idx": 2, "direct_answers": ["siblings", "siblings", "siblings", "siblings", "siblings", "probably siblings", "siblings", "siblings", "siblings", "siblings"], "difficult_direct_answer": false, "rationales": ["These kids are siblings.", "They are both young children.", "A taller and older sister is posing with her younger brother. they resemble each other in the face."], "image": "train2014/COCO_train2014_000000152084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106849, "question_id": "RKVptxn2hWjcTJ9VTTVkiV", "question": "What is the man in the green shirt doing?", "choices": ["explaining excuses", "selling alcohol", "stealing alcohol", "requesting money"], "correct_choice_idx": 1, "direct_answers": ["sales", "showing drinks", "selling wine", "selling alcohol", "describing something", "talking", "explaining", "selling beer", "showing length", "explaining"], "difficult_direct_answer": true, "rationales": ["The man appears to be standing behind a table that is covered with various bottles of alcohol. in this type of setting the man behind the table is often selling what is on the table.", "He is behind the counter at a store", "A man is standing behind a counter with a lot of bottles on display."], "image": "val2014/COCO_val2014_000000106849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47226, "question_id": "RKqaStR8rr5x3UptGFpcRc", "question": "What kind of activity is featured in this snowy weather?", "choices": ["car race", "snowmobile", "ski", "speed boat"], "correct_choice_idx": 1, "direct_answers": ["snowmobiling", "riding snowmachine", "snowmobiling", "snow mobile", "snowmachine", "snowmobiling", "snowmobile riding", "snow machine", "snowmobile riding", "snowmobile"], "difficult_direct_answer": false, "rationales": ["The people are on a motorized sled built for winter off road conditions.", "Snowmobiles are vehicles that are used in the snow.", "They are personal motor vehicles on skis."], "image": "val2014/COCO_val2014_000000047226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375812, "question_id": "RKwEQtzcr9722AmN8SiXn8", "question": "What is the boy looking at?", "choices": ["crowd", "baseball", "clouds", "birds"], "correct_choice_idx": 1, "direct_answers": ["ball", "ball", "sky", "baseball", "baseball", "ball", "ball", "baseball", "baseball", "ball"], "difficult_direct_answer": false, "rationales": ["He looks at the baseball.", "The boy just finished swinging the bat and the ball is in the air.", "The boy is holding a baseball bat and based on the position he has just swung the bat. based on his eye line and if he had just swung the bat he is looking at something rising away from him which would be answer a."], "image": "val2014/COCO_val2014_000000375812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47813, "question_id": "RLAofbVYuvyx6nGXEZEyVj", "question": "What type tennis game is being played here?", "choices": ["mixed doubles", "canadian doubles", "men's singles", "men's doubles"], "correct_choice_idx": 3, "direct_answers": ["duo", "doubles", "doubles", "men's doubles", "doubles", "doubles", "doubles", "professional", "doubles", "duos"], "difficult_direct_answer": false, "rationales": ["Men's doubles is the game that's being played on the court as there are two men.", "The type is men's doubles.", "There are two men on each side of the court."], "image": "train2014/COCO_train2014_000000047813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309322, "question_id": "RLHJU5LMUXTgyiKnWEmLiz", "question": "What business does the company featured by the clock engage in?", "choices": ["motorcycle manufacturing", "energy", "clock manufacturing", "car manufacturing"], "correct_choice_idx": 1, "direct_answers": ["gas", "energy", "gasoline station", "automotive", "lighting", "electronics manufacturing", "gasoline", "gas station", "philips", "time"], "difficult_direct_answer": true, "rationales": ["The business does electric.", "Phillips is a well known engine oil manufacturer.", "Phillips is an energy company."], "image": "train2014/COCO_train2014_000000309322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529314, "question_id": "RLMj2nncoT68BLDPmmRqwT", "question": "How are people in the stands here likely connected to the players?", "choices": ["protesters", "family", "employees", "advertising sponsors"], "correct_choice_idx": 1, "direct_answers": ["family friends", "ten", "family members", "family", "family friends", "family members", "parents", "family members", "family", "parents"], "difficult_direct_answer": false, "rationales": ["These are children playing so spectators are likely family members.", "They are family members, most likely parents.", "The players are kids so the people in the stands are probably their parents."], "image": "val2014/COCO_val2014_000000529314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59012, "question_id": "RLUF658SAekJyY9jAhqkLF", "question": "What's the maximum number of players that can be on the court during this game?", "choices": ["three", "six", "two", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "tennis", "four", "four", "four", "tennis"], "difficult_direct_answer": false, "rationales": ["Depends if it's a singles or doubles game.", "There can be two teams of two each at the most.", "The most is doubles which is for four players."], "image": "train2014/COCO_train2014_000000059012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581683, "question_id": "RLYYhJm5FbBMbaCymY5eJA", "question": "What is the long tool behind the cake used for?", "choices": ["fishing", "exploring", "burning", "music"], "correct_choice_idx": 3, "direct_answers": ["cutting", "cutting", "music", "instrument", "play music", "music", "cake shovel", "instrument", "guitar playing", "cutting"], "difficult_direct_answer": false, "rationales": ["It seems to be an instrument. it's hard to tell in the image. that said, the other options really don't fit this scene.", "It is a musical instrument.", "The tool provides a speaker for music."], "image": "val2014/COCO_val2014_000000581683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318001, "question_id": "RLZJZmd4i5x6r6CMEEtJx3", "question": "What is the man in blue trying to do?", "choices": ["block", "dodge", "tackle", "hug"], "correct_choice_idx": 0, "direct_answers": ["intercept", "playing", "block frisbee", "catch frisbee", "block", "block", "block", "block", "catch frisbee", "block"], "difficult_direct_answer": false, "rationales": ["The man in the blue has his hands outstretched and his legs spread out and is trying to prevent the man in the red with the frisbee from successfully making a toss.", "The man is trying to get the frisbee.", "The boy is trying to throw the frisbee to his teammate in the same color shirt, and the man is trying to prevent it."], "image": "train2014/COCO_train2014_000000318001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223699, "question_id": "RLcyR37iH9UdFfbHusZpSD", "question": "What were these treats cooked in?", "choices": ["water", "milk", "gasoline", "oil"], "correct_choice_idx": 3, "direct_answers": ["oil", "oven", "depp fryer", "oil", "oil", "oil", "oven", "oven", "oven", "oil"], "difficult_direct_answer": false, "rationales": ["The treats are donuts. donuts are fried in oil.", "Most pastries are cooked in some kind of grease.", "Generally those types of confectionery treats are fried."], "image": "val2014/COCO_val2014_000000223699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371807, "question_id": "RLhqVX9VMXuiSwSRdYaPde", "question": "Where is this bathroom likely to be found in?", "choices": ["airport", "shopping mall", "school", "theater"], "correct_choice_idx": 0, "direct_answers": ["airport", "airport", "airport", "house", "airport", "airport", "airport", "public building", "public restroom", "gas station"], "difficult_direct_answer": false, "rationales": ["The bathroom has luggage in it.", "The bathroom is of a style found in public places and the presence of the luggage and the luggage cart suggests this would be a place someone was traveling with lots of luggage like an airport.", "There is luggage in there so in an airport."], "image": "train2014/COCO_train2014_000000371807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546091, "question_id": "RLimTehDwvozbZyLyNJsPz", "question": "What does the tool near the pizza do?", "choices": ["cook pizza", "destroy pizza", "scoop pizza", "store pizza"], "correct_choice_idx": 2, "direct_answers": ["serves", "serve", "serve pizza", "serve pizza", "cut serve", "slice", "scoop pizza", "scoop pizza", "serve pizza", "scoop pizza"], "difficult_direct_answer": false, "rationales": ["The tool is flat and triangular, which is the shape it would need to be in order to securely pick up pizza so that's obviously its intended use.", "The serving utensil has an angle that allows the user to slide it under a pizza.", "The tool cuts pizza."], "image": "val2014/COCO_val2014_000000546091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250608, "question_id": "RLj5UMs95q8mPuW8N5EMvU", "question": "Where will the bus stop next?", "choices": ["north", "burwood", "east", "hospital"], "correct_choice_idx": 1, "direct_answers": ["burwood", "burwood", "unknown", "burwood", "burwood", "burwood", "unknown", "burwood", "burwood", "unknown"], "difficult_direct_answer": false, "rationales": ["The destination is visible at the top of the front side of the bus.", "The bus sign states burwood.", "The sign on the bus says burwood."], "image": "val2014/COCO_val2014_000000250608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556851, "question_id": "RLmk4qD78ALCcssWCx6wxq", "question": "Why are they on the giant wave?", "choices": ["by mistake", "wind blown", "is challenge", "landed there"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "is challenge", "surfing", "surfing", "fun", "to surf", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The surfers want to test out the wave.", "Surfers are on a large wave in the water.", "Surfers love going for a challenging ride."], "image": "train2014/COCO_train2014_000000556851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155219, "question_id": "RLqUM54cWajG9TQcooEcnh", "question": "What are the posts made from on the right?", "choices": ["wood", "plastic", "steel", "concrete"], "correct_choice_idx": 2, "direct_answers": ["metal", "metal", "stainless steel", "metal", "metal", "stopping", "metal", "metal", "metal", "steel"], "difficult_direct_answer": false, "rationales": ["The two poles are made of a shiny metal.", "The posts are shiny. they are made out of metal, not plastic, wood, or concrete.", "The posts are clearly visible and based on the consistency and color and the function they are serving, answer a is most likely."], "image": "train2014/COCO_train2014_000000155219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240911, "question_id": "RM246wZg4yBFoa7kTnQW2V", "question": "From where does the train get it's power?", "choices": ["water", "electricity", "coal", "fuel"], "correct_choice_idx": 1, "direct_answers": ["cables", "electricity", "electricity", "electric", "electricity", "electricity", "lines", "electricity", "overhead cables", "caboose"], "difficult_direct_answer": false, "rationales": ["It gets electricity from the wires it is connected to.", "Most modern day trains are ran by electricity.", "The train is electric powered."], "image": "train2014/COCO_train2014_000000240911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115917, "question_id": "RMaMwytzL2HirfnYSyruFN", "question": "What did the two men shaking hands just do?", "choices": ["had lunch", "played baseball", "went bowling", "played tennis"], "correct_choice_idx": 3, "direct_answers": ["okay tennis", "tennis", "play tennis", "played tennis", "sportsmanship", "play tennis", "play tennis", "played tennis", "play fair", "finish game"], "difficult_direct_answer": false, "rationales": ["The two men just had a match.", "The players are seen with items used to plat tennis.", "They just finished a game."], "image": "train2014/COCO_train2014_000000115917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99416, "question_id": "RMqpGt4PLUuhBzgy2DXHxR", "question": "Why is he smiling?", "choices": ["new tie", "for camera", "like ties", "good food"], "correct_choice_idx": 1, "direct_answers": ["photo op", "other helping", "having fun", "posing", "happy", "for camera", "being silly", "likes cameraman", "for camera", "happiness"], "difficult_direct_answer": true, "rationales": ["The man is posing for the camera.", "He is smiling to the camera for a photograph.", "He is making eye contact with the photographer."], "image": "val2014/COCO_val2014_000000099416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24385, "question_id": "RMwQjpnLanZPqcyHXTJEMd", "question": "Where is this venue likely to be?", "choices": ["home", "conference room", "office", "restaurant"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "business office", "office", "conference room", "home", "office", "home", "public", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["There is a long table with a laptop and a projector on it.", "Looks like they are having a work meeting in a meeting room", "The venue is a conference room."], "image": "val2014/COCO_val2014_000000024385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288099, "question_id": "RMydHnPvvkADcgeVkkPMMY", "question": "Why is only part of the image in color?", "choices": ["faulty camera", "camera filter", "optical illusion", "photo manipulation"], "correct_choice_idx": 3, "direct_answers": ["window", "photoshop", "filters", "filmography", "photo editing", "storeront", "photoshop", "store window", "grab attention", "photo manipulation"], "difficult_direct_answer": true, "rationales": ["A specific part of the photo is the only part that is red, the rest is not in color at all.", "For an image to have a scene in nature with sections that appear in black and white only and some sections in color there was something likely done intentionally by the photo publisher.", "They wanted this part to stand out"], "image": "train2014/COCO_train2014_000000288099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292690, "question_id": "RN5jUSjX9ayhMD6ngxSY7o", "question": "Which corner of the plate contains meat?", "choices": ["top right", "bottom left", "bottom right", "top left"], "correct_choice_idx": 1, "direct_answers": ["bottom left", "bottom left", "bottom left", "lower left", "lower left", "left", "bottom left", "bottom left", "bottom left", "bottom left"], "difficult_direct_answer": false, "rationales": ["A white rectangular plate has different kinds of food on it. there is some pink ham on one corner of plate.", "That area holds thinly sliced carnivore treats. the other three corners have vegetarian fare.", "The bottom left portion has ham."], "image": "train2014/COCO_train2014_000000292690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47624, "question_id": "RNHLfQPJTQFaYZx8RAqMg3", "question": "From what direction is the sun shining?", "choices": ["right", "left", "behind", "front"], "correct_choice_idx": 2, "direct_answers": ["behind kids", "west", "east", "east", "behind", "behind", "north", "east", "behind", "east"], "difficult_direct_answer": false, "rationales": ["The sun is shining behind the girls and their shadows are in front of them.", "The shadows are in front of the girls and the boy.", "There are shadows on the ground. these shadows are in front of the people, so the sun must be shining from the opposite direction."], "image": "val2014/COCO_val2014_000000047624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151611, "question_id": "RNsiqoaYSKjWkHKdLYH3Ni", "question": "How is the man in the t shirt feeling?", "choices": ["hostile", "annoyed", "amused", "angry"], "correct_choice_idx": 2, "direct_answers": ["happy", "excited", "excited", "engaged", "happy", "happy", "happy", "excited", "happy", "amused"], "difficult_direct_answer": false, "rationales": ["The man is amused.", "The man is laughing.", "The man is smiling."], "image": "train2014/COCO_train2014_000000151611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453706, "question_id": "RPTFH4zcJTX38PRBDig68f", "question": "Why is she smiling?", "choices": ["is confused", "has child", "is proud", "selling baby"], "correct_choice_idx": 2, "direct_answers": ["happy", "happy", "photograph", "is proud", "happy baby", "she's happy", "posing", "happy", "for photo", "loves baby"], "difficult_direct_answer": false, "rationales": ["She looks to be happy with the baby.", "The woman is happy she has a new baby.", "Mother and baby are looking at the camera. she is happy and loves the little boy."], "image": "train2014/COCO_train2014_000000453706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93243, "question_id": "RPYQq2oqMydtUeLRUvjeMM", "question": "What is the bat made out of?", "choices": ["wood", "plastic", "metal", "cork"], "correct_choice_idx": 2, "direct_answers": ["metal", "wood aluminum", "metal", "metal", "metal", "metal", "metal", "aluminum", "aluminum", "aluminum"], "difficult_direct_answer": false, "rationales": ["The bat is metal.", "Other than wood, bats are made of metal.", "The bat is shiny."], "image": "train2014/COCO_train2014_000000093243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518224, "question_id": "RPehogHhBHAa8rG2quftVf", "question": "What team is the batter playing for?", "choices": ["red sox", "orioles", "dodgers", "yankees"], "correct_choice_idx": 2, "direct_answers": ["la", "dodgers", "dodgers", "baseball", "dodgers", "dodgers", "baseball", "mets", "la dodgers", "la dodgers"], "difficult_direct_answer": false, "rationales": ["He is playing for the dodgers.", "The batter is wearing the dodgers' uniform.", "He is wearing a blue and grey uniform. there is an la logo on his hat."], "image": "val2014/COCO_val2014_000000518224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24714, "question_id": "RPnhdkRjPGtzaK2U3tn3dx", "question": "Why are they here?", "choices": ["hang out", "sell items", "beggers", "church"], "correct_choice_idx": 1, "direct_answers": ["sell items", "selling", "job", "selling fruit", "selling things", "vendors", "selling food", "selling", "food vendor", "sell produce"], "difficult_direct_answer": true, "rationales": ["They are all here to sell fruit items.", "The lady is selling produce at a street market. she has items on display for people to stop by and purchase as the man with bags has done.", "The produce is on display to be for sale."], "image": "train2014/COCO_train2014_000000024714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353509, "question_id": "RPtQuzQDs7RZqMqhPprC2r", "question": "What is the occupation of the man on the horse?", "choices": ["gardener", "cowboy", "photographer", "shepherd"], "correct_choice_idx": 3, "direct_answers": ["cowboy", "herder", "cowboy", "shepherd", "cowboy/herder", "shepherd", "cowboy", "cowboy", "cowboy", "sheep wrangler"], "difficult_direct_answer": false, "rationales": ["The occupation is a shepherd.", "The man is herding lambs.", "There are a lot of sheep around him so him being a sheppard makes the most sense."], "image": "val2014/COCO_val2014_000000353509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559586, "question_id": "RQCLnamwadVvykDSvjv3QQ", "question": "What method was this dish prepared in?", "choices": ["oven", "grilling", "frying", "stovetop"], "correct_choice_idx": 0, "direct_answers": ["oven", "baking", "baked", "pizza oven", "bake", "oven", "deep dish", "oven", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["The method is the oven.", "This dish was prepared in the oven.", "Pizza is cooked in an oven because it needs to be cooked around all sides and cannot be flipped over in a pan or on a grill or the toppings would fall off."], "image": "train2014/COCO_train2014_000000559586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183044, "question_id": "RQDLdrYX85xT7iFYawdY7M", "question": "Which person was born in the country where these words come from?", "choices": ["james remar", "hideki matsui", "michael jordan", "harry houdini"], "correct_choice_idx": 3, "direct_answers": ["harry houdini", "hungary", "hungarians", "harry houdini", "unknown", "harry houdini", "hungary", "russian", "lathato leszbikusok", "tony curtis"], "difficult_direct_answer": false, "rationales": ["The person is houdini.", "Harry houdini was born in this country.", "Lathato is a hungarian word and hungary was the birthplace of the famous illusionist."], "image": "train2014/COCO_train2014_000000183044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134747, "question_id": "RQJdnCKc32uhK56CebzWDw", "question": "What country is this beach located in?", "choices": ["united states", "canada", "mexico", "india"], "correct_choice_idx": 3, "direct_answers": ["mexico", "india", "india", "cannot tell", "india", "india", "unknown", "madagascar", "india", "mexico"], "difficult_direct_answer": false, "rationales": ["Cows are located in india.", "This is located in india.", "There is a bull on the beach. bulls are known to roam public areas in some parts of the world, including this country."], "image": "train2014/COCO_train2014_000000134747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444598, "question_id": "RQZBhVsGR9pU3SJogMZou5", "question": "What type of coating is found on the paper below the donuts?", "choices": ["food coloring", "glue", "wax", "ash"], "correct_choice_idx": 2, "direct_answers": ["glaze", "wax", "glazed", "paper", "wax", "glaze", "glaze", "glaze", "glaze", "wax"], "difficult_direct_answer": false, "rationales": ["Donuts can get sticky. it is a non-stick coating.", "The coating is wax.", "The donuts are on a white surface. wax paper is used for baked goods."], "image": "train2014/COCO_train2014_000000444598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307949, "question_id": "RQinicCtLVSQ8Fs4piVHTm", "question": "Which rental car agency is advertised on the fence?", "choices": ["avis", "hertz", "alamo", "enterprise"], "correct_choice_idx": 3, "direct_answers": ["enterprise", "enterprise", "enterprise", "enterprise", "enterprise", "enterprise", "enterprise", "montebello", "enterprise", "enterprise"], "difficult_direct_answer": false, "rationales": ["The fence that is by the building has the car rental service brand printed on a sign on the fence.", "The sign is to the left of the traffic light. it is green and white.", "The advertisement is clearly visible and the name of the company is readable."], "image": "train2014/COCO_train2014_000000307949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499826, "question_id": "RQncHKGKXNbjuNa8rENV2A", "question": "What does this animal tend to have?", "choices": ["wings", "two trunks", "sharp teeth", "three hearts"], "correct_choice_idx": 1, "direct_answers": ["tusks", "tusks", "two trunks", "friends", "trunk", "trunk", "trunk", "trunk", "fans", "long trunks"], "difficult_direct_answer": false, "rationales": ["The animal has two tusks.", "Elephants have three hearts - one systemic and two gill hearts.", "An elephant has a trunk."], "image": "val2014/COCO_val2014_000000499826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218012, "question_id": "RQwx8GiRDvTyBmfQdQgH5X", "question": "What type of donut is shown?", "choices": ["munchkins", "boston creme", "cruller", "jelly"], "correct_choice_idx": 2, "direct_answers": ["cruller", "cruller", "chocolate", "sugared", "cruller", "cruller", "cake", "crueller", "cake", "cruller"], "difficult_direct_answer": false, "rationales": ["The donut is a cruller donut.", "The donuts are in a spiral.", "Crullers are twisted."], "image": "val2014/COCO_val2014_000000218012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238332, "question_id": "RRQQGCDJM7oNaBQDHwiVzt", "question": "What keeps the trains wheels stable during travel?", "choices": ["oil", "strict laws", "rubber tires", "train tracks"], "correct_choice_idx": 3, "direct_answers": ["axel", "train tracks", "truck", "pistons", "coupling rod", "track", "tracks", "tracks", "train tracks", "tracks"], "difficult_direct_answer": false, "rationales": ["The train is visibly on the tracks and based on the common knowledge of how trains run, answer a is viable.", "The tracks ensure the train wheels do not lose control.", "All trains must run on tracks, there is no other way for them to get from point a to b."], "image": "val2014/COCO_val2014_000000238332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478077, "question_id": "RRaRUdgbmUBmwvTcCKLnjS", "question": "How could you gain access to an area to place dirty dishes here to be automatically cleaned?", "choices": ["open microwave", "place shelf", "nothing", "pull door"], "correct_choice_idx": 3, "direct_answers": ["open", "open", "open", "pull door", "open door", "open", "open door", "unlatch", "open", "open door"], "difficult_direct_answer": false, "rationales": ["The dishwasher handle can be seen in the image.", "You could pull the door open.", "Dishwashers normally have pull doors in them. this would be the correct place to have dirty dishes cleaned."], "image": "val2014/COCO_val2014_000000478077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23702, "question_id": "RS5S32Bxf7qa5k9MWbuAMe", "question": "What does Humphrey store for later?", "choices": ["air", "water", "skin oil", "chemicals"], "correct_choice_idx": 1, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["They are in a camel suit", "The label on his back says \"water.\".", "He has water that he can pass out to people"], "image": "train2014/COCO_train2014_000000023702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488915, "question_id": "RSX8cqWSwzFtWyUSRbxCPw", "question": "Why is the boy pointing towards the lit up laptop screen?", "choices": ["using touchscreen", "turning off", "has problem", "turning on"], "correct_choice_idx": 2, "direct_answers": ["broken", "to show", "making point", "right hand", "enlarging image", "explaining", "get attention", "has problem", "busy working", "show it"], "difficult_direct_answer": true, "rationales": ["He is showing someone something on the screen.", "His posture and face are angry. an angry person has this.", "He has a problem."], "image": "val2014/COCO_val2014_000000488915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171272, "question_id": "RSfQXoYzpzEwotysaG6B3R", "question": "What type of racing is this?", "choices": ["sailboat", "side car", "race car", "model car"], "correct_choice_idx": 1, "direct_answers": ["motorbike", "pod racing", "side car", "sidecar racing", "sidecar racers", "sidecar", "cart", "motorcycle", "motorcycle", "motorcycle"], "difficult_direct_answer": false, "rationales": ["There is a sidecar attached to the motorcycle.", "A motorcycle with a car attached is being driven and has a passenger both wearing numbers.", "This has a place for a passenger to sit and ride too"], "image": "train2014/COCO_train2014_000000171272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17882, "question_id": "RSkJSvCBETuaA5G3FWz4qy", "question": "What do the triangular barricades do?", "choices": ["prevent parking", "hang things", "decorate street", "block road"], "correct_choice_idx": 0, "direct_answers": ["advertise", "protect pedestrians", "stop walking", "bike racks", "hold bikes", "divide", "prevent parking", "stop", "hold bike", "no parking"], "difficult_direct_answer": true, "rationales": ["They are there so people cant park there.", "It's a no park zone.", "The barricades prevent people from stopping their cars there."], "image": "val2014/COCO_val2014_000000017882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139712, "question_id": "RTKdB9WPWN7J6B4y7oqSvC", "question": "What transportation surface is mentioned with the sign on the fence?", "choices": ["crosswalk", "highway", "sidewalk", "railroad track"], "correct_choice_idx": 0, "direct_answers": ["crosswalk", "train road", "crosswalk", "crosswalk", "train", "crosswalk", "crosswalk", "crosswalk", "walking", "crosswalk"], "difficult_direct_answer": false, "rationales": ["The surface is a crosswalk.", "It is a designated area for pedestrians to walk from one side of the track to the other.", "The sign mentions the crosswalk to the right."], "image": "train2014/COCO_train2014_000000139712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535307, "question_id": "RTYcghDcHGoETiF3TcFECP", "question": "What sport it is?", "choices": ["paragliding", "skiing", "parasailing", "swimming"], "correct_choice_idx": 2, "direct_answers": ["sky diving", "parasailing", "airliner", "parasailing", "wind sailing", "parasailing", "parasailing", "parasuit gaming", "sport kite", "kite flying"], "difficult_direct_answer": false, "rationales": ["There are sails in the sky.", "Parasailing involves kites.", "As indicated by the water, which is why the second half of the word references water."], "image": "val2014/COCO_val2014_000000535307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473323, "question_id": "RTavYJDJBCC3co7YpBewZt", "question": "Why is the guy in purple crouching?", "choices": ["referee", "injured", "catcher's stance", "fielding ball"], "correct_choice_idx": 2, "direct_answers": ["catch ball", "catch ball", "catch ball", "catcher's stance", "catch ball", "catcher", "catcher", "catch ball", "catch ball", "catcher"], "difficult_direct_answer": false, "rationales": ["The man has his mitt ready to catch the ball.", "The guy in purple is in the crouch position so that he can catch the ball when it is thrown by the pitcher across the plate.", "The ball usually drops as it crosses the plate"], "image": "val2014/COCO_val2014_000000473323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161011, "question_id": "RTiKbJnyX4nRcMhC7sS77K", "question": "In which direction are the three here likely to go next?", "choices": ["sideways", "uphill", "nowhere", "downhill"], "correct_choice_idx": 3, "direct_answers": ["right", "down hill", "down hill", "downhill", "downhill", "down", "downhill", "down", "downhill", "downhill"], "difficult_direct_answer": false, "rationales": ["These three are all more likely to go downhill.", "Skiers generally ski to a lower elevation.", "The direction is down."], "image": "val2014/COCO_val2014_000000161011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202423, "question_id": "RTjsR9jJqCzsXrJifbjtAC", "question": "Where is this person likely having food?", "choices": ["park", "cafe", "home", "office"], "correct_choice_idx": 1, "direct_answers": ["bakery", "cruise", "restaurant", "cafe", "restaurant", "cafe", "restaurant", "coffee shop", "cafe", "home"], "difficult_direct_answer": false, "rationales": ["Because the food seems to be well arranged and packed for a customer.", "The food is in a container as you would get from a restaurant.", "This person is most likely having food inside of their office space."], "image": "train2014/COCO_train2014_000000202423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568962, "question_id": "RTqbcdnefddBskzih5QyKo", "question": "In what country is this scene located?", "choices": ["japan", "korea", "thailand", "china"], "correct_choice_idx": 3, "direct_answers": ["korea", "asian country", "japan", "japan", "china", "china", "taiwan", "japan", "china", "korea"], "difficult_direct_answer": false, "rationales": ["The characters on the sign look chinese.", "All the patrons are chinese.", "The hangup characters can be seen on the sign."], "image": "train2014/COCO_train2014_000000568962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12158, "question_id": "RU5Gw5aN8jqGk3XKSEe5X8", "question": "What is needed for this activity?", "choices": ["sun", "wind", "snow", "rain"], "correct_choice_idx": 1, "direct_answers": ["wing", "wind", "kite wind", "wind", "kite", "wind", "wind", "rope", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["Kiting requires an uplifting action, not a downward-pushing action.", "Wind is needed.", "A kite needs blowing air to keep it aloft."], "image": "train2014/COCO_train2014_000000012158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341245, "question_id": "RU75tKxh2PJFwXiruBYpc8", "question": "What number do you get if you add the two numbers on the train together?", "choices": ["38", "22", "15", "56"], "correct_choice_idx": 2, "direct_answers": ["15", "fifteen", "15", "fifteen", "fifteen", "15", "fifteen", "fifteen", "fifteen", "15"], "difficult_direct_answer": false, "rationales": ["The two numbers are six and nine.", "Six plus nine is fifteen.", "You can add 9 + 6."], "image": "train2014/COCO_train2014_000000341245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62293, "question_id": "RUCQu9Nh9gDpogWUj8zfga", "question": "What must be paid to ride this machine?", "choices": ["fee", "tax", "donation", "fare"], "correct_choice_idx": 3, "direct_answers": ["fare", "fare", "money", "fare", "buy ticket", "fare", "fare", "ticket", "ticket fee", "fare"], "difficult_direct_answer": false, "rationales": ["It is money paid to ride", "You need to pay with a fare.", "Riders of the subway need to have paid fare."], "image": "train2014/COCO_train2014_000000062293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166067, "question_id": "RUCdVoq6fHFf8hdmUEC3sN", "question": "Why are all the trucks in back of the boat?", "choices": ["stole them", "hauling them", "part boat", "hiding them"], "correct_choice_idx": 1, "direct_answers": ["hauling them", "first off", "transporting", "ferry transport", "shipping voyage", "crossing water", "transportation", "being transported", "ferry", "transport"], "difficult_direct_answer": true, "rationales": ["They are going to be transported by means of water transport.", "They are getting a ride on the barge.", "The trucks are stacked in such a way that it looks like they are being transported in a way that the trucks could not move themselves. they are all of a similar make and design which implies they are likely being brought for a common and intentional purpose to a new location."], "image": "train2014/COCO_train2014_000000166067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422866, "question_id": "RUdGL5uYqVmduioDUGi8jh", "question": "What is on the left of the plate?", "choices": ["eggs", "apple", "broccoli", "pumpkin"], "correct_choice_idx": 2, "direct_answers": ["vegetables", "vegetables", "vegetables", "broccoli", "broccoli", "salad", "broccoli", "knife", "broccoli", "broccoli"], "difficult_direct_answer": false, "rationales": ["The vegetable that's green is broccoli.", "Broccoli is shown.", "Broccoli is on the plate."], "image": "train2014/COCO_train2014_000000422866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215003, "question_id": "RUnVoTpygcah2ZvrJyaeYQ", "question": "Why is the man holding his arms up?", "choices": ["to block", "to wave", "to tackle", "to catch"], "correct_choice_idx": 0, "direct_answers": ["blocking", "grabbing frisbee", "defense", "blocking play", "block play", "defence", "to block", "blocking throw", "giving up", "blocking"], "difficult_direct_answer": true, "rationales": ["Kids are playing frisbee and one is looking for an open player to pass to.", "The man is holding up his hands to block the frisbee.", "The player in front of him is about to try to pass the frisbee to his teammate."], "image": "train2014/COCO_train2014_000000215003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328075, "question_id": "RUszwV9aC4RkG3ppnaSwKv", "question": "What purpose does the white building serve?", "choices": ["radio", "naval direction", "traffic", "sonar"], "correct_choice_idx": 1, "direct_answers": ["lighthouse", "light", "lighthouse", "warn ships", "lighthouse", "visibility", "lighthouse", "lighthouse", "naval direction", "lighthouse"], "difficult_direct_answer": false, "rationales": ["There is a lighthouse behind the cows.", "Traditionally lighthouses have helped ships navigate the shores as to avoid ships running aground on rocks.", "It is a lighthouse to help boats when it's dark and stormy"], "image": "train2014/COCO_train2014_000000328075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306095, "question_id": "RV3wXLL48ZvkhWgXmK9Mq9", "question": "The legs of the table and chairs contain which one of these elements?", "choices": ["hydrogen", "chromium", "gold", "uranium"], "correct_choice_idx": 1, "direct_answers": ["chromium", "metal", "metal", "good", "metal", "metal", "good", "metal", "metal", "metal"], "difficult_direct_answer": false, "rationales": ["It is shiny and silver", "The legs are made out of a shiny silver metal that is not particularly expensive and not radioactive.", "The shiny metal seen on the chairs is synonymous with chromium."], "image": "train2014/COCO_train2014_000000306095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539317, "question_id": "RV5pGp8PwwXVBo9zEfA4WM", "question": "What will the large boat do in the sea?", "choices": ["sell floats", "haul weed", "sell cokes", "fish"], "correct_choice_idx": 3, "direct_answers": ["fish", "fish", "fishing", "fish", "fish", "fishing", "float", "fish", "tow", "float"], "difficult_direct_answer": false, "rationales": ["The boat will fish.", "That's a type of boat that fisherman use to catch fish in the sea.", "The boat looks like a fishing boat."], "image": "train2014/COCO_train2014_000000539317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2281, "question_id": "RV94iT8MtgLWyNJ68zRdWV", "question": "What is between two of the cars?", "choices": ["refrigerator", "television", "traffic cop", "wolf"], "correct_choice_idx": 0, "direct_answers": ["fridge", "fridge", "refrigerator", "refrigerator", "fridge", "refrigerator", "refrigerator", "fridge", "fridge", "refrigerator"], "difficult_direct_answer": false, "rationales": ["It belongs in the kitchen to keep the food cool.", "The fridge is between.", "It has 2 doors and handles"], "image": "train2014/COCO_train2014_000000002281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290981, "question_id": "RVAyTC866a8MuBZqiJmEKN", "question": "What would people be likely to do in this area?", "choices": ["ski", "hike", "fish", "boat"], "correct_choice_idx": 1, "direct_answers": ["sheep farmers", "graze livestock", "camp", "farm", "sheep herd", "grazing", "explore", "hike", "tend herd", "graze animal"], "difficult_direct_answer": true, "rationales": ["People are likely to go hiking in the hills and mountains of this area.", "The area is a large mountainside where people might like to go hiking.", "They would hike or walk through the woods."], "image": "val2014/COCO_val2014_000000290981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186013, "question_id": "RVBFb77TrAs4N6JEwGhQc2", "question": "Kites are being flown where today?", "choices": ["island", "forest", "beach", "school"], "correct_choice_idx": 2, "direct_answers": ["sky", "beach", "sky", "beach", "beach", "beach", "kite", "beach", "beach", "on beach"], "difficult_direct_answer": false, "rationales": ["The kites are flown here today around in the beach.", "They are flown at the beach for air circulation.", "Kites are at the beach."], "image": "val2014/COCO_val2014_000000186013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576222, "question_id": "RVFA34BbMdg7PRfptyHoUk", "question": "What kind of food is the man consuming?", "choices": ["steak", "seafood", "pork", "lamb"], "correct_choice_idx": 1, "direct_answers": ["cooked", "steak", "sea food", "restaurant", "stew", "seafood", "seafood", "lobster", "seafood", "dinner"], "difficult_direct_answer": false, "rationales": ["The food is seafood.", "The man is consuming seafood at a restaurant.", "The man is consuming a piece of seafood."], "image": "val2014/COCO_val2014_000000576222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567171, "question_id": "RVfs5yuuKaMpxFA7cXwNmP", "question": "Why is a white shield inside the Green truck windshield?", "choices": ["keeping cooler", "storage", "advertising", "privacy"], "correct_choice_idx": 0, "direct_answers": ["block sun", "shade", "sun shield", "sun blocker", "sitting", "sun block", "sun shield", "keeping cooler", "reflects heat", "protect inside"], "difficult_direct_answer": true, "rationales": ["It keeps the sun out.", "A covering can provide shade. things in the shade remain at a lower temperature than things in the sun.", "There is a white shield inside the green truck to keep the sun out and make the car cool"], "image": "val2014/COCO_val2014_000000567171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25717, "question_id": "RVjc7B7Ncy4L5P7aL677EZ", "question": "What is the black structure against the wall used to contain?", "choices": ["books", "pillows", "fire", "water"], "correct_choice_idx": 2, "direct_answers": ["firewood", "firewood", "fire", "firewood", "fire", "fire", "fire", "fire", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["The black structure against the wall is a fireplace used for making fires.", "A fireplace is what fires are made in, inside the home.", "The black structure is the fireplace."], "image": "val2014/COCO_val2014_000000025717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127035, "question_id": "RVyMaKN5L3hvwiqEFxYzF5", "question": "What is being done to the liquid in the bowl?", "choices": ["hardening", "stirring", "eating", "coloring"], "correct_choice_idx": 1, "direct_answers": ["boiling", "whisking", "boiled", "whisked", "washed", "whisk", "stirred", "it's bubbling", "mixed", "stirring"], "difficult_direct_answer": true, "rationales": ["The tool in the bowl is a whisk. whisks are used to mix things.", "There is a whisk in the bowl.", "The liquid is stirred."], "image": "train2014/COCO_train2014_000000127035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574074, "question_id": "RW72WmxPVB6PkjMc8G8caU", "question": "What does this bird's diet mainly consist of?", "choices": ["swamp vegetation", "insects", "fish", "grubs"], "correct_choice_idx": 0, "direct_answers": ["rodents", "bugs", "fresh", "seeds", "bird", "seeds", "bugs", "swamp vegetation", "seeds", "carcasses"], "difficult_direct_answer": false, "rationales": ["These birds have very big talons with long, sharp claws. they use talons for catching prey. they would catch prey that is large relative to the rest of their body size.", "They like the swamp vegetation.", "The bird eats veggies."], "image": "val2014/COCO_val2014_000000574074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425342, "question_id": "RWJLwykGURsgewMGwheXfj", "question": "What action is the upside down person doing with the frisbee?", "choices": ["eating it", "catching", "throwing", "nothing"], "correct_choice_idx": 1, "direct_answers": ["kicking", "handstand", "handstand", "catching", "throw", "catching it", "cartwheel", "hand stand", "handstand", "catching"], "difficult_direct_answer": false, "rationales": ["They are trying to catch it", "The person has his hand out.", "The upside down person has his hand out and is waiting for the frisbee to come to him."], "image": "train2014/COCO_train2014_000000425342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285131, "question_id": "RWNXCWAT3Xicf4iMppj4DG", "question": "What boulevard is the Jones Theater on?", "choices": ["speer", "12th", "curtis", "spire"], "correct_choice_idx": 0, "direct_answers": ["speer", "speer blvd", "speer blvd", "speer", "speer", "speer blvd", "speer", "speer", "speer blvd", "speer"], "difficult_direct_answer": false, "rationales": ["The street sign has the name of the street that this theater is on.", "There are green street signs that indicate the name of the boulevard.", "The name of the street is on the small green sign by the traffic lights."], "image": "train2014/COCO_train2014_000000285131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83818, "question_id": "RWhoopy4Mwb3Ypwga7jhQp", "question": "What action has she taken?", "choices": ["dribble", "shoot", "serve", "dunk"], "correct_choice_idx": 2, "direct_answers": ["served", "serve", "serve", "serve", "hit ball", "serve", "serve", "hit", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The woman has hit the ball hard as if serving.", "She is playing tennis, not hockey or basketball.", "The tennis player is in a position that implies they just struck the ball really hard to start off the match."], "image": "train2014/COCO_train2014_000000083818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199835, "question_id": "RWjsXrM9eeQfSeiLC5Gm8R", "question": "What is the man doing?", "choices": ["taxes", "running", "eating chili", "shaving"], "correct_choice_idx": 3, "direct_answers": ["shaving", "shaving", "shaving", "shaving", "shaving", "shaving", "shaving", "shaving", "shaving", "shaving"], "difficult_direct_answer": false, "rationales": ["He has shaving cream on and is holding a razor", "The man is using a razor and shaving cream.", "The man shaves."], "image": "train2014/COCO_train2014_000000199835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162282, "question_id": "RX45gJ79TsE4cYccWzJWae", "question": "What is the boy doing with the book?", "choices": ["highlighting it", "tearing it", "burning it", "reading it"], "correct_choice_idx": 3, "direct_answers": ["reading it", "reading", "reading", "reading", "reading it", "reading it", "reading it", "reading", "reading", "reading"], "difficult_direct_answer": false, "rationales": ["You can tell by his position with the book as to what he is doing with it.", "He is perusing a real book.", "The boy is holding the book in his hands. he is looking at the pages, not causing damage to the book."], "image": "train2014/COCO_train2014_000000162282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5906, "question_id": "RXJpn9KGg4u8vkvLR7eqsF", "question": "What is under the television?", "choices": ["books", "cats", "candy", "action figures"], "correct_choice_idx": 0, "direct_answers": ["books", "books", "books", "books", "books", "books", "books", "books", "books", "books"], "difficult_direct_answer": false, "rationales": ["The items are rectangular. they are made out of paper.", "Even if they are not all books, they surely can't be action figures , candy or cats.", "Books are underneath."], "image": "train2014/COCO_train2014_000000005906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278344, "question_id": "RXVstBVWJwMg9PT3BGwyuD", "question": "Omitting which element to the green items here will quickly cause browning in this room?", "choices": ["water", "silver", "copper", "gold"], "correct_choice_idx": 0, "direct_answers": ["light", "sunlight", "sunlight", "sunlight", "water", "water", "sunlight", "water", "sunlight", "couch"], "difficult_direct_answer": false, "rationales": ["Water is needed for the plants.", "The green items here are plants and if they do not get water, they will turn brown.", "Living things need liquid to survive. the green plants are living things."], "image": "train2014/COCO_train2014_000000278344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220076, "question_id": "RXfFQDg47ggf3zxCDcYTNd", "question": "What type of jewelry is in the woman's ear?", "choices": ["spike", "button", "gauge", "diamond"], "correct_choice_idx": 2, "direct_answers": ["plug", "gauge", "earring", "gauge", "earring", "earrings", "ear gauge", "gauge", "gauge", "gauge"], "difficult_direct_answer": false, "rationales": ["The jewelry is round, not diamond or spike shaped. it has a hole in the center.", "The woman is wearing special jewelry that is used to stretch out the piercing in an ear.", "There is a hole in the center of the ear ring."], "image": "train2014/COCO_train2014_000000220076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401132, "question_id": "RY36vhnbfrYx7K3FqzvYxz", "question": "How is the woman in the grey shirt feeling?", "choices": ["depressed", "hostile", "mad", "excited"], "correct_choice_idx": 3, "direct_answers": ["glad", "happy", "happy", "happy", "excited", "glad", "happy", "glad", "happy", "happy"], "difficult_direct_answer": false, "rationales": ["The woman is smiling.", "She is smiling broadly with an open mouth while touching the glass in the direction she is looking.", "The woman in the gray shirt is very excited because she has a big smile on her face."], "image": "val2014/COCO_val2014_000000401132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474208, "question_id": "RYDVjnfoeL2MAeVjnziPmw", "question": "Where is this cat located?", "choices": ["vet", "boat", "vehicle", "house"], "correct_choice_idx": 2, "direct_answers": ["bus", "bus", "vehicle", "bus", "bus", "bus", "office chair", "bus", "seat", "bus"], "difficult_direct_answer": false, "rationales": ["The cat is a passenger in a bus.", "The cat is on a car.", "The cat is inside a vehicle and it is maybe a bus or a van."], "image": "train2014/COCO_train2014_000000474208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299319, "question_id": "RYbBPEmtfxia3UEELggZfB", "question": "What grade are these girls most likely in?", "choices": ["college", "tenth", "fourth", "graduate school"], "correct_choice_idx": 2, "direct_answers": ["first grade", "first", "first", "third", "third", "fifth", "fifth", "first", "rainbow", "fourth"], "difficult_direct_answer": false, "rationales": ["The girls are visible and their relative age can be guessed based on their size and faces. answer a would be the only realistic answer.", "By there age you can surmise they are in elementary school still.", "They are too young to be in high school or college."], "image": "val2014/COCO_val2014_000000299319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147042, "question_id": "RYpgwToYEP6KDSPxU2N9fn", "question": "What is the hanging item used for?", "choices": ["seasoning", "light", "mix soup", "swatting flies"], "correct_choice_idx": 3, "direct_answers": ["kill flies", "swatting flies", "killing flies", "swat flies", "swatting flies", "flipping", "dry", "swatting flies", "killing flies", "swatting flies"], "difficult_direct_answer": false, "rationales": ["The item hanging on the cabinet is a flyswatter.", "The item swats flies.", "It's a fly swatter and it's used for killing flies."], "image": "val2014/COCO_val2014_000000147042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62690, "question_id": "RZ5MrtWDtJEmqBDN2VRASN", "question": "What natural element is being watched most closely here?", "choices": ["water", "roadway", "dirt", "hills"], "correct_choice_idx": 0, "direct_answers": ["ocean", "water", "waves", "water", "water", "water", "water", "water", "water", "hydrogen dioxide"], "difficult_direct_answer": false, "rationales": ["The people are at a bench designed for watching the ocean.", "People sit on a bench near the water.", "They are sitting on the bench so they can look out at the water."], "image": "train2014/COCO_train2014_000000062690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138351, "question_id": "RZGHmPFJ3Pw84FuVCsFMsT", "question": "What body part is causing the most water mist?", "choices": ["tail", "ears", "nose", "mouth"], "correct_choice_idx": 2, "direct_answers": ["nose", "trunk", "elephants trunk", "trunk", "trunk", "trunk", "trunk", "elephant trunk", "trunk", "trunk"], "difficult_direct_answer": false, "rationales": ["The trunk is causing the mist.", "That's what the elephant's trunk is.", "The water is coming out of the elephant's elongated trunk."], "image": "train2014/COCO_train2014_000000138351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43248, "question_id": "RZNNEKqTqCxP9u8MCR5n78", "question": "Which action would be easiest for the skateboarding man to perform immediately?", "choices": ["grab tree", "go uphill", "call home", "grab snow"], "correct_choice_idx": 3, "direct_answers": ["fall", "stop", "rolling", "turn", "snowball", "standing", "stand", "touch snow", "grab snow", "skate"], "difficult_direct_answer": true, "rationales": ["The man is touching the snow. gravity is pulling him downhill.", "The man is nearly touching the snow.", "The action is to grab snow."], "image": "train2014/COCO_train2014_000000043248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529383, "question_id": "RZUU5HAM9rWRHrRq3NGJTX", "question": "What is someone about to do?", "choices": ["board ship", "board train", "board flight", "domestic trip"], "correct_choice_idx": 2, "direct_answers": ["travel", "work", "homework", "pack", "travel", "work trip", "board flight", "pack", "put away", "pack"], "difficult_direct_answer": false, "rationales": ["Someone will board a flight.", "This person is logging exactly what's in their bag so they may be boarding a plane.", "The person's ticket is pictured."], "image": "train2014/COCO_train2014_000000529383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274789, "question_id": "RZYLoxCsRGteAzK8fvRz2X", "question": "What are the animals on the left walking across?", "choices": ["bridge", "farm", "field", "parking lot"], "correct_choice_idx": 2, "direct_answers": ["field", "meadow", "grass", "grass", "field", "field", "field", "field", "field", "field"], "difficult_direct_answer": false, "rationales": ["There is a lot of growing grass in the field.", "They are walking across a field", "The animals are going across the field."], "image": "train2014/COCO_train2014_000000274789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11065, "question_id": "RZcUscjCoBZuZ6HFLnrFQ8", "question": "What type of animals are present?", "choices": ["dog", "elephant", "deer", "horse"], "correct_choice_idx": 3, "direct_answers": ["horse", "horses", "horse", "horse", "clydesdale horses", "farm", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["A horse is shown.", "There are horses outside.", "An animal with hooves and a mane is present."], "image": "train2014/COCO_train2014_000000011065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166223, "question_id": "RZcpjEUMJS7iWuYEigFPuv", "question": "Where are these animals located?", "choices": ["museum", "zoo", "croft", "veterinarian"], "correct_choice_idx": 2, "direct_answers": ["field", "pasture", "coastal area", "field", "croft", "ireland", "field", "field", "fields", "ireland"], "difficult_direct_answer": false, "rationales": ["The animals are cows that are grazing. they are on a small farm.", "An area of land with grass.", "The animals are in croft since they're in a field."], "image": "train2014/COCO_train2014_000000166223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463665, "question_id": "Ra3UZ2nw48dx3TNkx3wxVW", "question": "Which bird is most likely last to grab a bug from the ground?", "choices": ["rightmost bird", "flying one", "front most", "far left"], "correct_choice_idx": 1, "direct_answers": ["crane", "front", "too long", "flying one", "middle one", "third one", "eagle", "back one", "foremost", "flying one"], "difficult_direct_answer": true, "rationales": ["It has it's head in the ground.", "The one that is up in the air is farthest from grabbing a bug.", "The bird that is in the air is the farthest one from the ground. it would take it longer to get to the ground."], "image": "train2014/COCO_train2014_000000463665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125380, "question_id": "Ra6xAYknCAttVPHB5ccQmh", "question": "Who has fallen in the water?", "choices": ["spectator", "officer", "child", "jockey"], "correct_choice_idx": 3, "direct_answers": ["jockey", "jockey", "jockey", "jockey", "jockey", "rider", "rider", "jockey", "jockey", "rider"], "difficult_direct_answer": false, "rationales": ["The jockey fell off the horse.", "The jockey fell.", "There is a riderless horse and a person in the water with a bib and a whip like a jockey might have. the horse is saddled as though there should be a rider and there appears to be on near the horse in the water so that person is most likely the rider of the horse."], "image": "train2014/COCO_train2014_000000125380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484080, "question_id": "RaCULbLcN4na3Z8to8XaWM", "question": "What food group is available here?", "choices": ["fruits", "dairy", "grains", "vegetables"], "correct_choice_idx": 0, "direct_answers": ["fruits", "fruit", "citrus", "fruit", "oranges", "fruits", "fruit", "orange", "fruit", "fruit"], "difficult_direct_answer": false, "rationales": ["These are oranges", "These are oranges", "Oranges are types of fruit."], "image": "val2014/COCO_val2014_000000484080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129699, "question_id": "RaCxKXUWTJ7KD8SLEk5rKs", "question": "What are is the image from?", "choices": ["city", "forest", "sky", "underground"], "correct_choice_idx": 0, "direct_answers": ["new york", "new york", "city", "intersection", "new york", "city", "downtown", "street", "street corner", "east street"], "difficult_direct_answer": false, "rationales": ["The image is a city.", "The tall building is proof that this is a city.", "This street sign is in new york."], "image": "val2014/COCO_val2014_000000129699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104788, "question_id": "RaJEv2SJNkPKGaSFSJs8Js", "question": "What direction does a car go to get to Route 101?", "choices": ["left", "right", "straight", "turn around"], "correct_choice_idx": 2, "direct_answers": ["south", "straight", "straight", "left", "straight", "south", "straight", "tennis", "straight", "straight"], "difficult_direct_answer": false, "rationales": ["The sign on the post shows an arrow indicating go straight to get to 101.", "The car can only go straight based on the sign.", "The cars need to go straight."], "image": "train2014/COCO_train2014_000000104788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320641, "question_id": "RaZytuK4fsPghDXKbASrni", "question": "How many cars can fit side by side on that path?", "choices": ["four", "two", "three", "zero"], "correct_choice_idx": 3, "direct_answers": ["two", "one", "one", "zero", "zero", "one", "zero", "one", "zero", "zero"], "difficult_direct_answer": false, "rationales": ["The path is for bikes and is too narrow for cars.", "There are no cars.", "The path is meant for bikes and pedestrians."], "image": "val2014/COCO_val2014_000000320641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388466, "question_id": "Rac4LTaqnd7isNBhRr7Sqc", "question": "Why does the woman have her arms out?", "choices": ["reach", "greeting", "break fall", "balance"], "correct_choice_idx": 3, "direct_answers": ["enjoying herself", "balance", "balance", "balance", "maintain balance", "keep balance", "posing", "balance", "balancing", "for balance"], "difficult_direct_answer": false, "rationales": ["The woman is on a surfboard.", "The woman balances.", "She is keeping her arms out to be able to stay on the board."], "image": "train2014/COCO_train2014_000000388466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326217, "question_id": "RahniiYVoAcCbJWDvMUzuy", "question": "What purpose do the hats worn serve?", "choices": ["advertising", "status", "style", "sun protection"], "correct_choice_idx": 3, "direct_answers": ["sun protection", "shade", "sun protection", "cover head", "sun protection", "shade", "sun protection", "protection", "block sun", "no sun"], "difficult_direct_answer": false, "rationales": ["Keeps the sun off of their faces.", "It keeps the sun out of their eyes.", "The man is protecting himself from the sun."], "image": "val2014/COCO_val2014_000000326217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437756, "question_id": "Raxk45YWCcfzxS8HsCEjny", "question": "What type of wood floor is used in most homes?", "choices": ["hard", "drift", "particle", "soft"], "correct_choice_idx": 0, "direct_answers": ["oak", "oak", "hard", "laminate", "linoleum", "oak", "hardwood", "oak", "maple", "hardwood"], "difficult_direct_answer": false, "rationales": ["Hardwood is a common floor covering in homes. wood grain can be seen in the flooring.", "There is a hard wood floor used in most homes.", "The wood floor in the home is hardwood flooring."], "image": "train2014/COCO_train2014_000000437756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290620, "question_id": "Rb8eXHE873o8KGgth75ah8", "question": "What is the man with the mustache doing with the black object?", "choices": ["gaming", "paying", "exercising", "calling"], "correct_choice_idx": 3, "direct_answers": ["men", "phone call", "talking", "making calls", "talking", "talking", "speaking into", "talking", "calling", "using it"], "difficult_direct_answer": false, "rationales": ["The man is on the phone talking.", "The man is holding his cell phone up to his ear so he can hear the person on the other end.", "The man is using the phone to call someone to have a conversation. phones are used to call people when up to one's ear."], "image": "train2014/COCO_train2014_000000290620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268725, "question_id": "Rbaq4uamxLfjJ5foT4UKm4", "question": "Why is the front wheel off the ground?", "choices": ["broken bike", "bouncing", "falling", "showing off"], "correct_choice_idx": 3, "direct_answers": ["popping wheelies", "motorcycle tricks", "performing wheelie", "performing trick", "popping wheelie", "wheelie", "showing off", "wheelie", "performing wheelie", "wheely"], "difficult_direct_answer": false, "rationales": ["He's popping a wheelie", "A person in a uniform and on a professional race track is on a motorcycle with the front wheel off the ground and he is not crashing.", "They are doing a trick to impress people"], "image": "val2014/COCO_val2014_000000268725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540176, "question_id": "RbhPjGRE5LobiexoAv8jni", "question": "What is the nickname of the first city?", "choices": ["long time", "wally", "lolo", "gong"], "correct_choice_idx": 3, "direct_answers": ["gong", "gong", "woll", "gong", "gong", "gong", "woll", "gong", "gong", "gong"], "difficult_direct_answer": false, "rationales": ["The name is funny so they just used the last four letters as a nickname.", "Gong is the nickname of wollongong.", "The city of wollongong can be shorted to 'gong' which many refer to it as."], "image": "val2014/COCO_val2014_000000540176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413164, "question_id": "RbmMtaRCHFgCaNUCcZcowr", "question": "What will persons seated here do next?", "choices": ["order", "sing", "pay", "leave"], "correct_choice_idx": 0, "direct_answers": ["order meals", "eat", "order food", "order food", "order food", "order", "order meal", "order food", "order food", "eat"], "difficult_direct_answer": false, "rationales": ["People are ordering.", "The people are looking at menus. they have not received any food yet.", "The persons who are all sitting here are getting ready to order."], "image": "train2014/COCO_train2014_000000413164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452404, "question_id": "RbmRgvNQ5pLyHYYzYayVvn", "question": "A snowblade is made of what?", "choices": ["wood", "plastic", "aluminum", "copper"], "correct_choice_idx": 0, "direct_answers": ["steel", "metal", "fiber glass", "fiberglass", "fiberglass", "carbon fiber", "fiberglass", "steel", "metal", "wood"], "difficult_direct_answer": false, "rationales": ["They are made of wood.", "The snowblade is made of material from trees.", "The blades are wooden."], "image": "train2014/COCO_train2014_000000452404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366967, "question_id": "RbozN9yk9fb5FLTw5GgtEb", "question": "Where does this man?", "choices": ["dugout", "home plate", "outfield", "pitchers mound"], "correct_choice_idx": 3, "direct_answers": ["baseball", "ball field", "pitchers mound", "pitch", "field", "baseball", "mexico", "pitchers mound", "pitcher's mound", "baseball field"], "difficult_direct_answer": false, "rationales": ["A man in a baseball uniform is stepping forward to throw the ball with purpose from an elevated mound of sand. pitchers throw from an elevated mound of sand in the center of baseball diamonds.", "He's in the center of the infield", "The man is standing on the pitchers mound and throwing the ball."], "image": "train2014/COCO_train2014_000000366967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223426, "question_id": "RbxEKvPVYpwnbBKeidCMsD", "question": "What will the blue jug do to the water besides store it?", "choices": ["flavor", "warm", "filter", "freeze"], "correct_choice_idx": 2, "direct_answers": ["drink", "filter", "filter", "filter it", "pour it", "dispense it", "filter", "filter it", "filter it", "filter"], "difficult_direct_answer": false, "rationales": ["The pitcher is equipped with an apparatus to clean water.", "The jug is a filter.", "The blue jug filters the water."], "image": "val2014/COCO_val2014_000000223426.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452229, "question_id": "RcGWiR93HTeVBkJf6tgyR6", "question": "What is in the bottle all the way to the right that is near the brunette woman?", "choices": ["cheese", "cork", "milk", "orange juice"], "correct_choice_idx": 1, "direct_answers": ["wine", "bottle stopper", "cork", "wine", "wine", "wine", "wine", "wine", "cork", "cork"], "difficult_direct_answer": false, "rationales": ["The bottle to the left is corked.", "The wine bottle on the far right has not been opened yet.", "The bottle has a cork."], "image": "train2014/COCO_train2014_000000452229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85101, "question_id": "Rcew9eSk4Adjnx7jk2fnuM", "question": "What touches the feet of the people holding the airborn sails?", "choices": ["rock", "monkeys", "water", "people"], "correct_choice_idx": 2, "direct_answers": ["sand", "sand", "sand", "sand", "experienced trainer", "water", "experienced trainer", "water", "sand", "sand"], "difficult_direct_answer": false, "rationales": ["The water will eventually come in to get their feet wet.", "Due to the setting of the picture it is easy to tell what is touching the people's feet.", "The people are standing in the ocean."], "image": "val2014/COCO_val2014_000000085101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251496, "question_id": "Rcx67xTwHAxbcfLFhJrhZQ", "question": "What is the woman doing while walking the dog?", "choices": ["making call", "reading", "gaming", "eating"], "correct_choice_idx": 0, "direct_answers": ["on cellphone", "talking phone", "using phone", "making call", "talking", "talking cellphone", "talking", "talking", "using phone", "talking"], "difficult_direct_answer": false, "rationales": ["She is talking to someone on her phone, so making a call.", "She is walking and talking on the phone", "The woman is talking on her cell phone while walking the dog."], "image": "train2014/COCO_train2014_000000251496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221416, "question_id": "Rd4TEQmrRREnyeyVSsKVwG", "question": "What vegetables are blooming here with rounded heads?", "choices": ["corn", "broccoli", "onions", "asparagus"], "correct_choice_idx": 2, "direct_answers": ["corn", "onion", "onions", "cabbage", "corn", "sunflowers", "garlic", "tomato", "onions", "garlic"], "difficult_direct_answer": false, "rationales": ["Onions are part of the allium genus, which have flowers that are round like pompoms.", "Corn, broccoli and asparagus don't have rounded heads.", "These are the type of blooms for this plant"], "image": "train2014/COCO_train2014_000000221416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559417, "question_id": "RdCMT2FSWVsTfVLg6EsXDp", "question": "What carved imagery animal is likely found on the amusement ride shown here?", "choices": ["horse", "rabbits", "dogs", "cats"], "correct_choice_idx": 0, "direct_answers": ["horse", "horse", "horse", "unicorn", "horse", "horse", "horse", "animals", "horses", "horse"], "difficult_direct_answer": false, "rationales": ["The ride is a carousel. carousels are known to most commonly feature horses.", "The ride in the distance is a merry-go-round which has horses that go in circles.", "While carousels often have other animals, riders originally rode horses."], "image": "val2014/COCO_val2014_000000559417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551633, "question_id": "RdEk3SnbcY3PH5PyH2GDsa", "question": "Why is the girl reaching down?", "choices": ["to flip", "grab shoe", "to sit", "grab ball"], "correct_choice_idx": 3, "direct_answers": ["getting balls", "ball", "soccer ball", "grab ball", "football", "pick ball", "get ball", "getting ball", "grab ball", "pickup ball"], "difficult_direct_answer": true, "rationales": ["The ball is below her hands.", "She wants the ball.", "The girl is grabbing the ball."], "image": "val2014/COCO_val2014_000000551633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265277, "question_id": "RdFUm2dDFsJZojpvEP2Nhg", "question": "What is used to make the run way?", "choices": ["caol", "metal", "cement", "soil"], "correct_choice_idx": 2, "direct_answers": ["concrete", "concrete", "concrete", "concrete", "concrete", "plane", "lights", "cement", "asphalt", "concrete"], "difficult_direct_answer": false, "rationales": ["Cement is on the runway.", "The roads are tarmac which is asphalt", "The runway is made of smooth solid cement pavement."], "image": "train2014/COCO_train2014_000000265277.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549422, "question_id": "RdG5wbeFvJUce7kZX8GDkL", "question": "While going down a hill the elephants momentum naturally makes him?", "choices": ["turn around", "roll over", "walk slower", "walk faster"], "correct_choice_idx": 3, "direct_answers": ["speed up", "fall susceptible", "accelerate", "run", "bump around", "go forward", "sway", "wobble", "faster", "walk faster"], "difficult_direct_answer": true, "rationales": ["The elephant will naturally walk faster if going down hill because it will gain momentum from gravity.", "Going down a hill increases gravity which will make the elephant go faster.", "The elephant can go faster."], "image": "train2014/COCO_train2014_000000549422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468876, "question_id": "RdSwUzFrECzJrmp3ULJJZr", "question": "How has the girl fastened her shirt?", "choices": ["with glue", "with buttons", "with string", "with zipper"], "correct_choice_idx": 3, "direct_answers": ["zipper", "zipper", "zipper", "zipper", "zipper", "zipper", "zipper", "zipper", "zipper", "with zipper"], "difficult_direct_answer": false, "rationales": ["The girl has a zipper on her jacket that extends all the way to the top.", "The girl has a zipper on her jacket.", "The girl has a zipper."], "image": "train2014/COCO_train2014_000000468876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552947, "question_id": "Rdjgcr2SdL57A92ctPzqb3", "question": "What kind of phone is he using?", "choices": ["rotary", "landline", "cellular", "pay"], "correct_choice_idx": 2, "direct_answers": ["cell", "cell phone", "flip", "cellphone", "cellphone", "flip phone", "cellular", "flip phone", "mobile phone", "cell phone"], "difficult_direct_answer": false, "rationales": ["The phone he's holding is small and has no wires connected to it. the only choice on this list that matches that description is a cell phone. the others need a wire.", "The man is using a cell phone.", "The phone is small and has no wires which means it is a mobile phone."], "image": "val2014/COCO_val2014_000000552947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377213, "question_id": "RdxZv9AWXpmCpNwtd7qpLG", "question": "What is the boy that threw the ball wearing?", "choices": ["scarf", "helmet", "bow tie", "jeans"], "correct_choice_idx": 3, "direct_answers": ["sweater", "sweater", "shirt jeans", "throwing", "white shirt", "baseball glove", "jeans", "jeans", "baseball glove", "white shirt"], "difficult_direct_answer": false, "rationales": ["The boy has jeans.", "The boy is clearly visible and based on the color and texture of his pants, answer a is likely.", "The boy is wearing a pair of jeans."], "image": "train2014/COCO_train2014_000000377213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19123, "question_id": "Re5DtmkTHece3NakbBfTB5", "question": "Why is the lady holding that item?", "choices": ["jumping rope", "to clean", "to sing", "to cook"], "correct_choice_idx": 2, "direct_answers": ["karaoke", "singing", "singing", "to sing", "singing", "singing", "she'll sing", "singing", "microphone", "singing"], "difficult_direct_answer": false, "rationales": ["The item the woman is holding is a microphone based on the size and shape and this would be used primarily for answer a.", "The lady is holding a microphone, not a jump rope, pot, or rag.", "Karaoke is a fun and entertaining way to pass time with friends."], "image": "train2014/COCO_train2014_000000019123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345598, "question_id": "ReFEDTpWGvuMzJeZNuz6MM", "question": "Why did they open up the wall?", "choices": ["leak", "decoration", "styling", "for fun"], "correct_choice_idx": 0, "direct_answers": ["to repair", "leak", "renovation", "repair", "renovation", "repairs", "construction", "bathtub", "renovate", "fix pipe"], "difficult_direct_answer": true, "rationales": ["There is a vacuum to clean up water", "The answer is not knowable based on the image, but answer a is a likely cause for a wall to be opened in this manner around a major plumbing apparatus.", "The wall is opened up so that a leaky pipe can be fixed."], "image": "train2014/COCO_train2014_000000345598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6906, "question_id": "RePYCvkf6biUcJCwHV2WpC", "question": "What type of clothing is this?", "choices": ["casual", "uniform", "work", "play"], "correct_choice_idx": 2, "direct_answers": ["suit", "business suit", "suits", "professional", "work", "business casual", "regular", "formal", "suit", "work"], "difficult_direct_answer": false, "rationales": ["Maybe businessmen will dress formally for work.", "This is a formal suit", "These men are pictured wearing button up shirts with ties, slacks, and one man has on a sport coat or blazer. this is an example of professional clothing."], "image": "val2014/COCO_val2014_000000006906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219294, "question_id": "ReSHFKxzLwpCnaW6VNdNGR", "question": "What is the man in green trying to do?", "choices": ["clothesline", "dance", "tackle", "steal ball"], "correct_choice_idx": 3, "direct_answers": ["steal ball", "steal ball", "kick ball", "steal", "steal ball", "challenging player", "tackle player", "dribble ball", "kick", "kick ball"], "difficult_direct_answer": false, "rationales": ["He has his foot in front of the other player", "A man in a green uniform is reaching out with his foot towards a man in blue dribbling it along a soccer field.", "Two boys are playing soccer and one has the ball."], "image": "train2014/COCO_train2014_000000219294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390524, "question_id": "ReXdw8RSwSMcRp4zwYBz47", "question": "What hobby does the man looking up enjoy?", "choices": ["kites", "painting", "football", "acting"], "correct_choice_idx": 0, "direct_answers": ["flying kites", "kite flying", "kite flying", "kites", "kite flying", "kites", "kite flying", "flying kites", "kites", "flying kites"], "difficult_direct_answer": false, "rationales": ["That's what the man is flying.", "A man is flying a kite on the beach.", "The man is not inside an art gallery, football stadium, or theater. he is outside and is holding control strings."], "image": "train2014/COCO_train2014_000000390524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229172, "question_id": "ReeLUhaHNQrVDXfHzEfS9t", "question": "What is the end table being used for?", "choices": ["step stool", "decoration", "cooking surface", "serving food"], "correct_choice_idx": 3, "direct_answers": ["hold tray", "holding food", "food", "food", "food tray", "serving food", "tea", "eating", "tea", "food"], "difficult_direct_answer": false, "rationales": ["The end table has cake and cups on it.", "There is food on the table.", "The table has food on top."], "image": "train2014/COCO_train2014_000000229172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428718, "question_id": "RemqrNmNV3UqFFA353vD9u", "question": "What would help protect her skin from sun burn?", "choices": ["oil", "sunscreen", "shaving cream", "baby powder"], "correct_choice_idx": 1, "direct_answers": ["skin care", "sunblock", "sunscreen", "sun lotion", "sun screen", "sun lotion", "sunscreen", "sunscreen", "sunscreen", "sunscreen"], "difficult_direct_answer": false, "rationales": ["The sunscreen would help.", "She needs a sunscreen in order to prevent her skin.", "Sunscreen protects skin from uv rays."], "image": "val2014/COCO_val2014_000000428718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317036, "question_id": "ReuMotgupZPk27ZV9mJ2nf", "question": "What sort of business are the autos in all likelihood closest to?", "choices": ["gas station", "restaurant", "auto repair", "dealership"], "correct_choice_idx": 2, "direct_answers": ["antiques", "auto repair", "glass", "bodyshop", "auto body", "auto repair", "auto body", "auto body", "auto shop", "body shop"], "difficult_direct_answer": false, "rationales": ["The sign on the business across the street references both glass and body work available for cars.", "The cars are parked directly across the street from an auto repair business.", "These vehicles are closeby to an auto repair shop."], "image": "train2014/COCO_train2014_000000317036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54374, "question_id": "Rf5xEEYhkD5b7DyZ5xhY8Q", "question": "What type of building is this?", "choices": ["agricultural", "school", "library", "medical"], "correct_choice_idx": 0, "direct_answers": ["market", "water side", "market", "fruit market", "greenhouse", "steel building", "steel building", "agricultural", "greenhouse", "warehouse"], "difficult_direct_answer": false, "rationales": ["There are plants in the area and plants are common in agricultural areas.", "By the plants and the greenhouse setting, you can tell what type of industry it is.", "The building has green plants."], "image": "train2014/COCO_train2014_000000054374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518853, "question_id": "Rf8zVadLh9FgwB8KddBaWb", "question": "What province is she riding in?", "choices": ["manitoba", "alberta", "british columbia", "ontario"], "correct_choice_idx": 2, "direct_answers": ["england", "quebec", "british columbia", "quebec", "bike", "london", "vancouver", "bike", "british columbia", "british columbia"], "difficult_direct_answer": false, "rationales": ["The words on the building window are in french.", "The street name seen on the street sign is a street in british columbia.", "The spelling on the sign is in british english."], "image": "train2014/COCO_train2014_000000518853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436145, "question_id": "RfKi3DrVPvgT2pwodhJyLs", "question": "The wall behind the bed could be described by which one of these adjectives?", "choices": ["modern", "sleek", "futuristic", "rustic"], "correct_choice_idx": 3, "direct_answers": ["fresco", "rustic", "brick", "rustic", "stone", "decorative", "stone", "country", "stone", "rainy time"], "difficult_direct_answer": false, "rationales": ["It is exposed brick or stone that was more commonly used in the past before sheetrock", "The wall behind the bed could be described as rustic.", "The wall behind the bed could be described as rustic because it is made of old-fashioned stone."], "image": "train2014/COCO_train2014_000000436145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532, "question_id": "RfYLcpLsEYuT4QD45Tw4B9", "question": "Who would ride this bus?", "choices": ["students", "sightseers", "prisoners", "commuters"], "correct_choice_idx": 0, "direct_answers": ["hispanics", "students", "students", "red", "tourists", "students", "people", "children", "school children", "all peoples"], "difficult_direct_answer": false, "rationales": ["This is a school bus which is usually used to transport children to school.", "Kids that are in school would ride this bus.", "This looks like a school bus."], "image": "train2014/COCO_train2014_000000000532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243472, "question_id": "RfZuuHuN8UhPUSMhXoPhVx", "question": "What type of crossing does the traffic light allow?", "choices": ["carriage", "turtle", "dog", "horse"], "correct_choice_idx": 3, "direct_answers": ["horse", "horse crossing", "horse riders", "horse", "horse", "horse", "pedestrian", "horse", "horses", "horse crossing"], "difficult_direct_answer": false, "rationales": ["The outline of a horse is shown on the light.", "The crossing light indicates who can use the crosswalk and the light depicts a horse and rider.", "You can tell by the picture of the light as to what traffic it allows."], "image": "train2014/COCO_train2014_000000243472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286176, "question_id": "RfcGGt48yxqvsubhZD249G", "question": "Where is this taking place?", "choices": ["mcdonald's", "street vending", "kfc", "food court"], "correct_choice_idx": 3, "direct_answers": ["food court", "food court", "mall", "food court", "people eating", "mall", "food court", "food court", "mall", "mall"], "difficult_direct_answer": false, "rationales": ["There are other businesses next to it", "One can see the individual restaurants of a mall setting.", "The pizza is from a mall food court."], "image": "val2014/COCO_val2014_000000286176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119016, "question_id": "Rfg5QdNuH3vw6sDESqPbht", "question": "What are the drivers using to see the road?", "choices": ["spotlights", "headlights", "light bars", "flashlights"], "correct_choice_idx": 1, "direct_answers": ["headlights", "headlights", "eyes", "headlights", "headlights", "headlights", "headlights", "head lights", "headlights", "headlights"], "difficult_direct_answer": false, "rationales": ["The drivers use headlights.", "Cars have headlights to see in the dark.", "The lights on the front of the car that shine forward are called headlights."], "image": "train2014/COCO_train2014_000000119016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95371, "question_id": "RfjV9wkGNLZ36WWbZH3rjs", "question": "In which country does this bus travel?", "choices": ["usa", "chile", "england", "canada"], "correct_choice_idx": 2, "direct_answers": ["england", "usa", "england", "united states", "germany", "england", "england", "england", "england", "uk"], "difficult_direct_answer": false, "rationales": ["Exeter is in england.", "The country is england.", "Exeter is in it and this is a double decker bus."], "image": "val2014/COCO_val2014_000000095371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164659, "question_id": "Rfs4RSkGDEd6Y29wGQozaw", "question": "What purpose is the bike serving right now?", "choices": ["exercise", "prop", "travel", "commerce"], "correct_choice_idx": 1, "direct_answers": ["prop", "transportation", "transportation", "visual aid", "transportation", "prop", "backdrop", "posing", "decoration", "picture prop"], "difficult_direct_answer": false, "rationales": ["The bike owner is holding it to one side to create a pictorial detail.", "The bike is being propped up.", "The bike has been used to pose for a photo."], "image": "train2014/COCO_train2014_000000164659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556838, "question_id": "Rfxnkr2ZHkMFYiYGGBSwLh", "question": "Who does this house belong to?", "choices": ["man", "old woman", "no one", "young woman"], "correct_choice_idx": 2, "direct_answers": ["builders", "builder", "older homeowners", "purple woman", "no idea", "woman", "no one", "front woman", "woman", "developer"], "difficult_direct_answer": true, "rationales": ["Places of the house are roped off, indicating it might be a museum or model home.", "The house looks vacant and being worked on.", "It is roped off for an open house"], "image": "train2014/COCO_train2014_000000556838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235564, "question_id": "RhEyduSAiZi9j7boRqacmo", "question": "What is on the plate?", "choices": ["soup", "tomato", "ham", "spoon"], "correct_choice_idx": 1, "direct_answers": ["eggs veggies", "broccoli", "vegetables", "tomato", "eggs", "vegetables", "vegetables", "food", "food", "food"], "difficult_direct_answer": false, "rationales": ["The red item is a tomato.", "The objects on the plate are clearly visible and definable based on their distinct color and shapes. answer a is the only object on the list of answers clearly included on the plate.", "There are slices of red fruit on the plate."], "image": "val2014/COCO_val2014_000000235564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41351, "question_id": "RhYEssneZZWP3S8urwPd5B", "question": "What are the kids doing on the laptop?", "choices": ["gaming", "chatting", "writing", "school work"], "correct_choice_idx": 0, "direct_answers": ["gaming", "video gaming", "playing games", "playing game", "gaming", "playing game", "gaming", "gaming", "gaming", "playing games"], "difficult_direct_answer": false, "rationales": ["The kids are playing games on the laptop.", "The kids are gaming.", "The kids are playing games."], "image": "train2014/COCO_train2014_000000041351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252952, "question_id": "RhjdoQfgEzB633W6N3bdt2", "question": "Where are the people seated with the small child?", "choices": ["restaurant", "museum", "playground", "square"], "correct_choice_idx": 0, "direct_answers": ["table", "outdoor cafe", "outdoor table", "table", "front", "restaurant", "outdoor restaurant", "cafe", "restaurant", "under umbrella"], "difficult_direct_answer": false, "rationales": ["The people are eating out.", "The way the tables are orientated and the settings on the tables with a glimpse into the background of something resembling a restaurant, it is likely the visible people in question sitting at the table are there for answer a.", "These are tables for an eatery"], "image": "train2014/COCO_train2014_000000252952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311846, "question_id": "RhtrydXNuserzDZpHEGzXW", "question": "What time of icing is on all of the cakes?", "choices": ["vanilla", "strawberry", "mint", "chocolate"], "correct_choice_idx": 0, "direct_answers": ["vanilla", "fondant", "white", "fondant", "icing dam", "sugary", "fondant", "white cream", "buttercream", "vanilla"], "difficult_direct_answer": false, "rationales": ["The icing is all white.", "White icing is not usually flavored with chocolate, strawberry, or mint.", "It is a white frosting"], "image": "val2014/COCO_val2014_000000311846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522194, "question_id": "RiXbogEB4ZMDacqNB7QNcW", "question": "What might explain the lack of furniture here?", "choices": ["pathological illness", "he's moving", "poverty", "robbery"], "correct_choice_idx": 1, "direct_answers": ["just moved", "just moved", "new place", "he's moving", "just moved", "just moved", "just moved", "just moved", "new apartment", "just moved"], "difficult_direct_answer": false, "rationales": ["The man is moving.", "The man has boxes in his place.", "Furniture would not be present when someone is moving."], "image": "val2014/COCO_val2014_000000522194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518375, "question_id": "RiePfwSipbpkftEv8Dy94T", "question": "What can one of the shiny silver things do?", "choices": ["cut", "generate electricity", "drive", "compute"], "correct_choice_idx": 0, "direct_answers": ["cut", "cut pizza", "cut", "cut", "cut", "cut", "cut food", "cut food", "eat", "serve"], "difficult_direct_answer": false, "rationales": ["The utensils can cut and eat.", "The utensils can slice the pizza.", "The shiny thing cuts."], "image": "val2014/COCO_val2014_000000518375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347172, "question_id": "RifGfuWKsKK8kMEQozLvGQ", "question": "What should you do if you enter this circular junction?", "choices": ["turn right", "keep left", "turn left", "keep right"], "correct_choice_idx": 3, "direct_answers": ["keep right", "stop", "stop", "keep right", "keep right", "stop", "stop", "keep right", "slow down", "keep right"], "difficult_direct_answer": false, "rationales": ["The signs have arrows going to the right.", "A white and black street sign gives instructions for rotary traffic to keep right.", "Keep to the right."], "image": "val2014/COCO_val2014_000000347172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472842, "question_id": "RinQkJ3TUsPj9mXtKxLQMQ", "question": "What might you do in the thing seem just to the left?", "choices": ["bathe", "cook", "brush teeth", "eat"], "correct_choice_idx": 0, "direct_answers": ["bathe", "bath", "wash", "bathe", "bath", "bathe", "bathe", "bathe", "bathe", "bathe"], "difficult_direct_answer": false, "rationales": ["The other options aren't done in a bath or shower.", "There is a bathtub which humans use to get clean.", "One might bathe in the shower on the left."], "image": "train2014/COCO_train2014_000000472842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244401, "question_id": "RivUUufi5qxxtwNyPei4HR", "question": "Which system connects the producer and consumers within the food system more closely?", "choices": ["who", "ppt", "csa", "nts"], "correct_choice_idx": 2, "direct_answers": ["community-supported agriculture", "csa", "supply chain", "farmers market", "marketplace", "market", "distribution", "csa", "community agriculture", "broccoli"], "difficult_direct_answer": true, "rationales": ["That's what is being promoted here.", "The producer name is clearly written on the pamphlet visible in the image.", "The brochure says csa."], "image": "val2014/COCO_val2014_000000244401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347113, "question_id": "RiwnbTqXeaeZZj7Z8ZX7Cw", "question": "What is in the kitchen but unnecessary for cooking or baking?", "choices": ["blanket", "radio", "bassinet", "cat"], "correct_choice_idx": 3, "direct_answers": ["cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The cat isn't needed to bake or cook.", "The cat is a pet and has nothing to do with meal preparation but more for companionship.", "There is a cat on the kitchen island that is not required for any cooking or baking."], "image": "val2014/COCO_val2014_000000347113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245598, "question_id": "RjJhjfjQAsDBu7AqwddvpT", "question": "Why is the boy in red wearing a glove?", "choices": ["fashion", "health", "warmth", "to catch"], "correct_choice_idx": 3, "direct_answers": ["catch ball", "umpire", "catching", "catcher", "catcher", "catcher", "protection", "to catch", "catcher", "catching"], "difficult_direct_answer": false, "rationales": ["A baseball player is crouched behind the plate holding a glove out to catch with.", "The boy in red is playing baseball and is on the defensive team. he is crouching near the umpire and behind the batter.", "The boy is in catcher gear and behind home base on a baseball field so is playing baseball. in baseball one wears a glove to catch."], "image": "train2014/COCO_train2014_000000245598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507665, "question_id": "RkPRVVU56iNXHsJh6cM3v8", "question": "What is the man with the bat about to do?", "choices": ["sit", "duck", "swing", "sing"], "correct_choice_idx": 2, "direct_answers": ["strike", "strike out", "swing", "hit", "swing", "run", "strike out", "swing again", "run", "run"], "difficult_direct_answer": false, "rationales": ["The man wants to swing his bat at the ball.", "Based on the body position and the position of the ball, the man has already swung and missed the ball potentially. in the rules of baseball, he might have struck out and be returning to the bench.", "The man is clearly a baseball batter based on his equipment and positioning and his intention is to swing and hit the ball."], "image": "train2014/COCO_train2014_000000507665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325065, "question_id": "RkjjHAUkyQcmM2EH9Yy4rY", "question": "This truck is probably based in what state?", "choices": ["louisiana", "connecticut", "colorado", "maine"], "correct_choice_idx": 0, "direct_answers": ["louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana", "louisiana"], "difficult_direct_answer": false, "rationales": ["It is from louisiana according to the menu.", "The truck has southern food for sale.", "The truck is selling cajun food."], "image": "train2014/COCO_train2014_000000325065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513458, "question_id": "RkosyHLccMpQYfomjFFGQx", "question": "What entities likely vandalized the train?", "choices": ["government", "coders", "gangs", "hackers"], "correct_choice_idx": 2, "direct_answers": ["teenagers", "ban people", "youths", "graffiti artists", "criminals", "taggers", "gang", "teenagers", "gangs", "gangs"], "difficult_direct_answer": false, "rationales": ["The graffiti on the trains was likely done by members of a gang.", "When someone does graffiti it's likes a gang that did it.", "The gang easily vandalized the train to steal the train."], "image": "val2014/COCO_val2014_000000513458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218350, "question_id": "RmJWnGhwFEZjvgmMByGivH", "question": "The boy skateboarding is a fan of what baseball team?", "choices": ["atlanta braves", "montreal expos", "detroit tigers", "edmonton oilers"], "correct_choice_idx": 0, "direct_answers": ["astros", "oakland", "astros", "angels", "oakland as", "angels", "braces", "atlanta braves", "braves", "as"], "difficult_direct_answer": false, "rationales": ["The boy on the skateboard has a baseball cap with the atlanta braves logo on it.", "The boy is wearing a hat with the atlanta braves logo on it. people typically wear hats to signify the teams they are fans of.", "The boy has a braves hat on."], "image": "val2014/COCO_val2014_000000218350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544611, "question_id": "RmPRrQU7LcGPbMy78t2hPM", "question": "What is the purpose of the bed being out in the street?", "choices": ["jumping", "resting", "theater", "trash"], "correct_choice_idx": 2, "direct_answers": ["play", "protest", "performance", "contest", "protest", "show", "demonstration", "performance", "dramatic effect", "theater"], "difficult_direct_answer": false, "rationales": ["The other options wouldn't apply to this setting or image.", "These actors are using a bed for their outdoor performance.", "They are dressed in costume and have a boom mic overhead"], "image": "val2014/COCO_val2014_000000544611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453680, "question_id": "RmRMarprc9kugxywXPKhQx", "question": "What item has just been taken out from the plastic package?", "choices": ["food", "dolls", "coke", "bat"], "correct_choice_idx": 1, "direct_answers": ["bat", "doll", "new doll", "doll", "toy", "doll", "dolls", "glove", "bat", "doll"], "difficult_direct_answer": false, "rationales": ["Dolls were just taken out.", "The plastic package is just large enough to contain the doll that the little girl is holding, so it's a safe assumption that she just opened it to get to the doll.", "The child is playing with dolls, which are the same size as the plastic packaging."], "image": "train2014/COCO_train2014_000000453680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23660, "question_id": "RmgNxSs8YyCBg7qR8qRoEA", "question": "Which giraffe left to right has the best chance of getting petted?", "choices": ["second", "very back", "first", "third one"], "correct_choice_idx": 1, "direct_answers": ["tallest", "right", "second", "right", "right", "second", "tallest giraffe", "very back", "right", "far right"], "difficult_direct_answer": false, "rationales": ["The giraffe around the corner with its neck closest to the people and highest reaching has the best chance of getting petted.", "This is the tallest one", "The giraffe in the back seems to have the longest neck."], "image": "val2014/COCO_val2014_000000023660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33529, "question_id": "RmgkiE4RhyJ5bxzZmdoCN2", "question": "What structure is present above the parked trains on the rail tracks?", "choices": ["passenger walkway", "parking garage", "electric line", "waiting area"], "correct_choice_idx": 0, "direct_answers": ["bridge", "bridge", "bridge", "bridge", "passenger walkway", "bridge", "bridge", "cross bridge", "bridge", "cross bridge"], "difficult_direct_answer": false, "rationales": ["The structure is there for passengers to walk on.", "The structure is clearly visible and based on its size and design and placement over the train tracks, answer a is consistent.", "This is normal in a train depot so that people can move from one section to another without crossing the physical tracks at ground level. it can also serve as a d."], "image": "train2014/COCO_train2014_000000033529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557633, "question_id": "RmgqHfr2gtzHSJHysxzfPu", "question": "What are the people holding?", "choices": ["camera", "food", "phone", "umbrella"], "correct_choice_idx": 3, "direct_answers": ["umbrellas", "umbrella", "umbrella", "umbrellas", "umbrellas", "umbrella", "umbrellas", "umbrella", "umbrellas", "umbrella"], "difficult_direct_answer": false, "rationales": ["The people have umbrellas.", "They are preventing rain droplets from falling on them.", "The people are standing in the rain and using umbrellas to stay dry."], "image": "val2014/COCO_val2014_000000557633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297871, "question_id": "RmnoKqZ6AKeAFTSWUNtKmW", "question": "What type of pants is the girl wearing?", "choices": ["cargo", "jeans", "scrubs", "snow pants"], "correct_choice_idx": 3, "direct_answers": ["snowpants", "snowpants", "snow pants", "snow pants", "winter pants", "ski", "snow pants", "ski pants", "ski", "ski"], "difficult_direct_answer": false, "rationales": ["The girl is on a snowy mountain in the cold.", "As indicated by the environment.", "The girl is wearing warm clothes so she can play in the snow."], "image": "train2014/COCO_train2014_000000297871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341678, "question_id": "RmzQVmVyCvSXcGpmQeNcjY", "question": "Which object is in motion?", "choices": ["fence", "chair", "net", "ball"], "correct_choice_idx": 3, "direct_answers": ["ball", "tennis ball", "tennis ball", "ball", "tennis ball", "ball", "ball", "tennis ball", "tennis ball", "ball"], "difficult_direct_answer": false, "rationales": ["It is in midair", "A person is hitting a tennis ball.", "The ball is blurred and not touching any object. the chairs, net and fence are still."], "image": "val2014/COCO_val2014_000000341678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453686, "question_id": "RnSn4f3YViZztroeMtb44h", "question": "What festival was coming soon after the photo was taken?", "choices": ["thanksgiving", "valentine's day", "christmas", "easter"], "correct_choice_idx": 2, "direct_answers": ["christmas", "christmas", "halloween", "christmas", "christmas", "christmas", "new years", "christmas", "christmas", "christmas"], "difficult_direct_answer": false, "rationales": ["There is a wall hanging on the back wall depicting santa with his full sack of toys on the roof of a house.", "The date on the bottom left is a few days before christmas.", "Looks like it was taken days before christmas."], "image": "train2014/COCO_train2014_000000453686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268590, "question_id": "RnUjRCbNurqR87fXEecnnw", "question": "From which plant does the yellow item on the plate here originate?", "choices": ["celery", "tomato", "cucumber", "mustard"], "correct_choice_idx": 3, "direct_answers": ["mustard", "banana plant", "gardens", "mustard", "mustard", "mustard", "mustard tree", "cruciferae", "mustard seed", "wheat"], "difficult_direct_answer": false, "rationales": ["This condiment is made from the seeds of this plan", "The woman is eating a hot dog which is frequently eaten with mustard. there is a yellow item visible on the plate which would likely then be mustard which originates from answer a.", "This is the prepared version of the plant"], "image": "train2014/COCO_train2014_000000268590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185108, "question_id": "Rneo2CxbYo3KF4e9NGSLcC", "question": "From what country did this meal originate?", "choices": ["usa", "mexico", "italy", "spain"], "correct_choice_idx": 0, "direct_answers": ["italy", "america", "united states", "united states", "usa", "usa", "america", "usa", "mexico", "america"], "difficult_direct_answer": false, "rationales": ["The meal is chili and toast, a meal from the east coast of usa.", "This meal is served in usa diners.", "The meal is chili."], "image": "train2014/COCO_train2014_000000185108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390475, "question_id": "RnsHSgywhthZE2MdYTy83S", "question": "What did the man in the blue shirt just do?", "choices": ["served", "quit", "missed ball", "returned ball"], "correct_choice_idx": 3, "direct_answers": ["hit ball", "hit ball", "hit ball", "hit ball", "returned ball", "hit", "hit ball", "serve", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["He can be seen in the air, meaning he just jumped to return the ball to the other side of the net.", "The man just hit the ball.", "The man in the blue shirt just swung his racket and hit the ball back over the net."], "image": "val2014/COCO_val2014_000000390475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186018, "question_id": "Rnv8qoWwPKUZQCaMvs5ZNw", "question": "What are the blue umbrellas being used for?", "choices": ["blocking rain", "blocking snow", "blocking sun", "blocking wind"], "correct_choice_idx": 2, "direct_answers": ["shade", "shade", "sun protection", "shade", "blocking sun", "shade", "shade", "make shade", "block sunlight", "sunshade"], "difficult_direct_answer": false, "rationales": ["They are blocking the bright sun.", "The blue umbrellas are all used together to block out the sun.", "Umbrellas provide shade from the sun."], "image": "train2014/COCO_train2014_000000186018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416596, "question_id": "Rny7jf3Gs2UeGjfLLCXZSL", "question": "Why are those towels in the background?", "choices": ["for coaches", "for players", "for cameramen", "for sale"], "correct_choice_idx": 3, "direct_answers": ["for sale", "sweat", "for sale", "fan merchandise", "sales", "for sale", "selling", "for sale", "merchandise", "bath towels"], "difficult_direct_answer": false, "rationales": ["The towels are sitting on a table and they are for sale.", "There is a vendor truck in the background. the towels are on display.", "The towels are for sale."], "image": "val2014/COCO_val2014_000000416596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99568, "question_id": "Ro397LnV9kxnjcqPBrnFbH", "question": "What type of magazine genre is this person fond of?", "choices": ["entertainment", "horror comic", "cars", "fashion"], "correct_choice_idx": 1, "direct_answers": ["creepy", "horror", "horror", "creepy", "horror", "horror", "creepy", "horror", "horror comic", "horror"], "difficult_direct_answer": false, "rationales": ["The book has the word \"creepy\" on it", "The name of the magazine is clearly visible on the stack and the type of magazine of answer a is associated with the name.", "It has the words creepy on the front of the page."], "image": "train2014/COCO_train2014_000000099568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39150, "question_id": "RoJsZSebkTko8J76yBLeD4", "question": "Which vegetable is included in the image?", "choices": ["leeks", "broccoli", "watercress", "mushrooms"], "correct_choice_idx": 1, "direct_answers": ["broccoli", "broccoli", "artichoke", "artichoke", "seeds", "artichoke", "artichoke", "broccoli", "carrot", "artichoke"], "difficult_direct_answer": false, "rationales": ["There is no broccoli present.", "Watercress is shown in the bottom left.", "There is a lot of broccoli in the image."], "image": "val2014/COCO_val2014_000000039150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24257, "question_id": "RoKUSijbonnsGcDVfXoSDM", "question": "What are the colorful posters on the wall used for?", "choices": ["targets", "advertising", "photographing", "selling"], "correct_choice_idx": 1, "direct_answers": ["advertising", "advertisements", "advert", "advertisements", "advertisements", "advertising", "advertisements", "primarily advertising", "advertising", "advertising"], "difficult_direct_answer": false, "rationales": ["Posters are used to get attention.", "The posters are on the wall in a train station and have products on them.", "They are ads for different things."], "image": "train2014/COCO_train2014_000000024257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246085, "question_id": "RoPSqQTWmiY2DAhavywHDF", "question": "What brand is the bus?", "choices": ["mta", "luthfansa", "synths", "spirit"], "correct_choice_idx": 2, "direct_answers": ["syntus", "synbus", "syntus", "syntus", "syntus", "synths", "syntus", "syntus", "syntus", "syntus"], "difficult_direct_answer": false, "rationales": ["The bus logo beneath the front window indicates the bus brand.", "The brand is below the windshield and above the license plate.", "The name is visible on the front of the bus where most bus companies put their name."], "image": "train2014/COCO_train2014_000000246085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234499, "question_id": "RoQXyeHhLWx8uHNCiaDoX5", "question": "Based on the amount of crust what is going to be the dominant flavor in this pizza?", "choices": ["sauce", "meat", "cheese", "bread"], "correct_choice_idx": 3, "direct_answers": ["sauce", "dough", "tomato", "bread", "sauce", "crust", "bread", "tomato", "dough", "cheese"], "difficult_direct_answer": false, "rationales": ["The crust will be flavored mostly by the bread.", "It's going to be tasting a bit like bread.", "There is a lot of bread visible on the pizza."], "image": "val2014/COCO_val2014_000000234499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138547, "question_id": "RoTLaeFu5TBLZL2qp8bPCt", "question": "What is the thing that people are using to ascend to the aircraft?", "choices": ["escalator", "airplane steps", "passenger stairs", "ladder"], "correct_choice_idx": 2, "direct_answers": ["stairs", "passenger stairs", "stairs", "stairwell", "stairs", "ladder", "stairs", "steps", "staircase", "boarding stairs"], "difficult_direct_answer": false, "rationales": ["They are attached to the ground and the front door of the plane.", "People are using the stairs that passengers can walk on.", "People are walking up the stairs."], "image": "train2014/COCO_train2014_000000138547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135356, "question_id": "Rochn2fJWStMhhevZQvzJi", "question": "What is he doing with the pot?", "choices": ["hiding it", "carrying it", "drying it", "cleaning it"], "correct_choice_idx": 2, "direct_answers": ["drying", "polishing", "cleaning", "drying it", "wiping", "cleaning", "drying", "cleaning it", "drying it", "drying it"], "difficult_direct_answer": false, "rationales": ["The man is drying a pot with a towel.", "The man is wiping the pot with a towel.", "He is holding a dish towel to it which will soak up the moisture."], "image": "val2014/COCO_val2014_000000135356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327314, "question_id": "Roer9m84jBzHs85NB6QBwZ", "question": "How many more wheels does this have than a regular motorcycle?", "choices": ["three", "two", "one", "four"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one more", "one", "one", "one", "one", "one more"], "difficult_direct_answer": false, "rationales": ["There is an extra wheel.", "This vehicle has three wheels. a typical motorcycle has two wheels only.", "It has three wheels instead of two."], "image": "val2014/COCO_val2014_000000327314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363100, "question_id": "RofaUAm8LZLsbXLUz4HQLm", "question": "This establishment most likely sells what?", "choices": ["wine", "tires", "video games", "gwent cards"], "correct_choice_idx": 0, "direct_answers": ["wine", "pizza", "meals", "alcohol", "dinner", "beer", "burgers", "food", "alcohol", "drinks"], "difficult_direct_answer": true, "rationales": ["This is a winery that people go to try out wine.", "This area is a bar.", "The size and shape of the glasses on all the tables is consistent with answer a."], "image": "val2014/COCO_val2014_000000363100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191096, "question_id": "RonbGD5VD4Cq8u53FRzcsC", "question": "What is number fifteen hoping to hit?", "choices": ["homerun", "foul", "strike", "strikeout"], "correct_choice_idx": 0, "direct_answers": ["home run", "homerun", "baseball", "baseball", "ball", "home run", "baseball", "baseball", "baseball", "homerun"], "difficult_direct_answer": false, "rationales": ["Most players of this game strive to hit one out of the park.", "Number fifteen is a batter. he wants to hit something that scores a point for his team.", "He wants to hit a homerun."], "image": "val2014/COCO_val2014_000000191096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325685, "question_id": "Rp2nmiJkiZDWWdimG738nR", "question": "The woman holding the controller is playing a simulation of which sport?", "choices": ["hockey", "baseball", "golf", "tennis"], "correct_choice_idx": 1, "direct_answers": ["shaving", "baseball", "baseball", "baseball", "shaving", "baseball", "baseball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["The woman is holding a wii controller based on the size, shape and color. in wii sports you hold the controller in a way you would in the real version of the sport and based on this stance baseball is the most likely of the options provided.", "A woman holds a game controller up over her shoulder.", "The girl is holding up the controller like a bat."], "image": "train2014/COCO_train2014_000000325685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351004, "question_id": "RpKzeeE2hUtXTCyj8gwYd4", "question": "What is he about to do?", "choices": ["swing", "duck", "sit", "run"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "hit ball", "hit ball", "hit ball", "hit ball", "swing", "hit ball", "hit ball", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["Because his body seems to be in a swinging posture.", "The man wants to hit the ball.", "He has a tennis racquet and is looking at a tennis ball. he is ready to use the racquet to hit the ball."], "image": "train2014/COCO_train2014_000000351004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117834, "question_id": "RpLVynuXRc8gggCSiT2r7v", "question": "What type of land are the horses found on?", "choices": ["mountains", "plains", "hills", "gulfs"], "correct_choice_idx": 1, "direct_answers": ["prairie", "grasslands", "grassland", "grazing pastures", "plain", "steppe", "savannah", "prairie", "prairie", "plains"], "difficult_direct_answer": false, "rationales": ["A bunch of horses are walking on a large, open, flat piece of land.", "Horses graze on flat lands.", "Traditionally wild horses are found in lush grassy regions."], "image": "train2014/COCO_train2014_000000117834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224342, "question_id": "RpViyzp4KjFwk5r2LeECgE", "question": "How are these flowers being used?", "choices": ["corsage", "bouquet", "centerpiece", "border"], "correct_choice_idx": 2, "direct_answers": ["decoration", "centerpiece", "displayed", "decoration", "decor", "decoration", "decoration", "decoration", "decoration", "decoration"], "difficult_direct_answer": false, "rationales": ["The flowers are arranged in a vase and are visibly placed on the middle of the table. these characteristics are consistent with answer a.", "The flowers decorate the middle of the table.", "They're at the center of the table."], "image": "val2014/COCO_val2014_000000224342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301575, "question_id": "RpXtydwLNf9a5oCJ2CssqB", "question": "What piece of furniture is misspelled here?", "choices": ["bed", "table", "stool", "couch"], "correct_choice_idx": 3, "direct_answers": ["couch", "couch", "couch", "couch", "couch", "couch", "couch", "couch", "couch", "couch"], "difficult_direct_answer": false, "rationales": ["The only piece of furniture referenced in the caption is a couch and it is spelled wrong.", "The item is something to sit on.", "Couch is spelled with a k."], "image": "val2014/COCO_val2014_000000301575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233526, "question_id": "RpujRNQtFtUYQveAbLNcGn", "question": "What breakfast cereal is being advertised on the television?", "choices": ["wheaties", "cheerios", "lucky charms", "frosted flakes"], "correct_choice_idx": 2, "direct_answers": ["lucky charms", "lucky charms", "lucky charms", "lucky charms", "lucky charms", "lucky charms", "lucky charms", "lucky charms", "game", "lucky charms"], "difficult_direct_answer": false, "rationales": ["There is a leprechaun on the television.", "You can see the leprechaun on the tv screen.", "The cartoon character visible is recognizable and known to be associated with answer a."], "image": "train2014/COCO_train2014_000000233526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539041, "question_id": "Rq7KX6Ucg6sFDrxnKBxZF2", "question": "What type of counter is shown?", "choices": ["coin", "bathroom", "kitchen", "store"], "correct_choice_idx": 1, "direct_answers": ["stone", "bathroom", "bathroom", "tile", "bathroom", "bathroom counter", "stone", "bathroom", "bathroom", "bathroom counter"], "difficult_direct_answer": false, "rationales": ["Most people keep their toothbrushes in the bathroom.", "There is a toothbrush in a cup", "There are toothbrushes and toiletries on the counter."], "image": "train2014/COCO_train2014_000000539041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554285, "question_id": "RqGxQvbF2aakCepUBoFdJj", "question": "What is the owner a fan of according to this room decor?", "choices": ["space", "celebrities", "animals", "racecars"], "correct_choice_idx": 1, "direct_answers": ["old movies", "james dean", "james dean", "movies", "old movies", "movie legends", "old movies", "celebrities", "man united", "james dean"], "difficult_direct_answer": false, "rationales": ["There are pictures of movie stars on the wall.", "James dean was a movie celebrity.", "The wall is adorned with movie star photographs in frames as well as other famous people black and white photos."], "image": "train2014/COCO_train2014_000000554285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263516, "question_id": "Rr57ZtapAkfmx57gzzLypR", "question": "Based on the candles how long has she been on the planet?", "choices": ["three years", "one year", "four years", "two years"], "correct_choice_idx": 3, "direct_answers": ["2 years", "2 years", "2 years", "2 years", "two years", "one year", "two years", "two years", "two years", "2 years"], "difficult_direct_answer": false, "rationales": ["She is a toddler and only 2 years old.", "There are two candles on the cake.", "The candles are two years."], "image": "train2014/COCO_train2014_000000263516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11291, "question_id": "RrBQ8nniK4a3g2mFPkSVTH", "question": "What do the glasses contain?", "choices": ["grape juice", "champagne", "white wine", "red wine"], "correct_choice_idx": 3, "direct_answers": ["wine", "wine", "red wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["The wine has a red color.", "There is still a little of the red beverage left in the glasses. people don't usually drink grape juice out of those types of glasses and bottles.", "The glasses have wine."], "image": "val2014/COCO_val2014_000000011291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286888, "question_id": "RrFn7mssv33QHM27gaNikj", "question": "What kind of goods or service are the men on bikes probably offering?", "choices": ["delivery", "taxi rides", "performing", "food cart"], "correct_choice_idx": 1, "direct_answers": ["rides", "taxi rides", "transportation", "taxi rides", "carrying passengers", "transportation", "rides", "transportation", "passenger rides", "transport"], "difficult_direct_answer": false, "rationales": ["The men are riding rickshaws which would be used to transport people.", "There are elongated benches on the back of the bikes these men are driving. this would allow for them to carry passengers.", "Because of where they are and he design of the carriage you can surmise what they are selling."], "image": "train2014/COCO_train2014_000000286888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282847, "question_id": "RrWNLbCrgKBsKn8bYAL5rp", "question": "What is the purpose of the orange net?", "choices": ["frisbee launch", "decoration", "animal trap", "soccer goal"], "correct_choice_idx": 3, "direct_answers": ["score goal", "soccer goal", "soccer net", "visibility", "soccer goal", "count goals", "catch frisbee", "goal", "catch balls", "soccer"], "difficult_direct_answer": true, "rationales": ["The purpose is the soccer goal.", "The people are playing soccer. the ball goes into the net.", "The orange net is a soccer goal."], "image": "val2014/COCO_val2014_000000282847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381547, "question_id": "RrWikwoDK4RwAmPGRbzh5S", "question": "What happened to the structure that sat upon these posts?", "choices": ["burned down", "weathered away", "nothing", "stolen"], "correct_choice_idx": 1, "direct_answers": ["weathered away", "no reaction", "eroded", "rotted", "removed", "rotted", "flew away", "rusted away", "eroded", "fell off"], "difficult_direct_answer": false, "rationales": ["Posts extend into the water from the shore.", "The structure sunk away.", "There is a piece of the structure still leaning against the bottom of the piling."], "image": "train2014/COCO_train2014_000000381547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248229, "question_id": "RsRvKviQkoJ4xECjn72DSy", "question": "What do the green things bring to the beach?", "choices": ["minerals", "tiny fish", "salt", "unwanted trash"], "correct_choice_idx": 3, "direct_answers": ["seashells", "unwanted trash", "kelp", "leaves", "nutrients", "garbage", "plant life", "algae", "waves", "vegetation"], "difficult_direct_answer": true, "rationales": ["The green things represent discards.", "There is so much unwanted trash that is on the beach.", "The green things are trash."], "image": "train2014/COCO_train2014_000000248229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102175, "question_id": "RsoJmSeUTzobNt2yn4F8rX", "question": "What physical activity is the man in orange involved in?", "choices": ["tennis", "field hockey", "wrestling", "baseball"], "correct_choice_idx": 0, "direct_answers": ["tennis", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["A man is holding a tennis racket on a tennis court.", "The man has a racquet, not a bat or a stick. he is playing on a clay surface.", "The man is holding a tennis racket and is playing on a tennis court."], "image": "train2014/COCO_train2014_000000102175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213419, "question_id": "RsyApy8ixt6Q8zhCUNTm8R", "question": "Where was this photo taken according to what we read on the boat hulls?", "choices": ["guaymas", "san isidro", "san juan", "hermosillo"], "correct_choice_idx": 1, "direct_answers": ["ocean", "spain", "marlin", "marlin", "marlin", "san isidro", "1952", "unknown", "maryland", "marlene"], "difficult_direct_answer": false, "rationales": ["The name is on the boat", "The photo is from san isidro.", "That's what is written on the boat on the right."], "image": "train2014/COCO_train2014_000000213419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52299, "question_id": "Rt7FuyZgjpXm7JW3pEQLt3", "question": "Why are the people lined up outside the white van?", "choices": ["purchasing food", "buying tickets", "to dance", "to protest"], "correct_choice_idx": 0, "direct_answers": ["waiting food", "ordering food", "buying food", "get food", "ordering food", "purchasing food", "buy food", "ordering food", "getting food", "buy food"], "difficult_direct_answer": false, "rationales": ["The truck says that it is a taco truck.", "This is obviously a food vendor.", "It is a food truck"], "image": "train2014/COCO_train2014_000000052299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234083, "question_id": "RtUXETR9TxVENhe6gkGASm", "question": "What was used to get these small pieces?", "choices": ["fork", "spatula", "spoon", "knife"], "correct_choice_idx": 3, "direct_answers": ["knife", "knife", "knife", "knife", "knife", "knife", "knife", "knife", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["This cuts items", "You use a knife to cut things in to smaller pieces.", "The foods on the counter were prepared by cutting them into small pieces with a knife."], "image": "train2014/COCO_train2014_000000234083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380998, "question_id": "RtbyTGE4VBuX434yunanzA", "question": "Why is the rider's head covered?", "choices": ["fashion", "protection", "religion", "warmth"], "correct_choice_idx": 1, "direct_answers": ["protection", "riding", "protect head", "protection", "safety", "safety", "safety", "security", "protection", "safety"], "difficult_direct_answer": false, "rationales": ["The driver wants to protect against concussions.", "The helmet protects.", "Motorcycle riding is very dangerous so riders are encouraged to wear a helmet to reduce the risk of a head injury."], "image": "train2014/COCO_train2014_000000380998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462676, "question_id": "RthUy3UqsfxSweryjXkV6Y", "question": "Who probably took the picture?", "choices": ["ricky aponte", "trump", "skateboarder", "steve jobs"], "correct_choice_idx": 0, "direct_answers": ["ricky aponte", "ricky aponte", "ricky aponte", "photographer", "skateboarder's friend", "ricky aponte", "friend", "ricky aponte", "street cameras", "people"], "difficult_direct_answer": false, "rationales": ["The name of the photographer is in the lower right corner.", "Name of person is in the corner of the picture on their watermark.", "The name of the photography company is written in the lower right corner of the image."], "image": "train2014/COCO_train2014_000000462676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30357, "question_id": "RuBGNR3CEpVGyzM22mj5Qi", "question": "Why is there a rolling cabinet?", "choices": ["counter space", "exercise", "cooking", "sitting"], "correct_choice_idx": 0, "direct_answers": ["storage", "counter space", "food service", "island", "convenience", "carry foods", "food prep", "efficiency", "to move", "wine"], "difficult_direct_answer": true, "rationales": ["The cabinet can be used as a counter.", "To have moveable counter space.", "Extra space to put things and store things."], "image": "train2014/COCO_train2014_000000030357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86036, "question_id": "RuHuPKzgEvB675XgPb5SsF", "question": "At what event might the women be?", "choices": ["sales convention", "mardi gras", "christmas parade", "cookoff"], "correct_choice_idx": 1, "direct_answers": ["rave party", "mardi gras", "mardi gras", "madi gras", "rave concert", "festival", "mardi gras", "parade", "festival", "fashion"], "difficult_direct_answer": false, "rationales": ["The girls are topless.", "The women have beads all around them.", "The women are wearing beaded necklaces that are thrown at mardi gras parades."], "image": "val2014/COCO_val2014_000000086036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25234, "question_id": "RuJ6JfZVqyJ2zYjLuCtNQG", "question": "What will likely turn black here first?", "choices": ["bananas", "hat", "dog", "wood beams"], "correct_choice_idx": 0, "direct_answers": ["unkown", "bananas", "banana", "banana peel", "first banana", "banana", "bananas", "sand", "bananas", "banana"], "difficult_direct_answer": false, "rationales": ["Black is associated with rot. out of all of the items listed, a will rot the most quickly.", "Those fruits change colors really quickly when in the sun and too much heat.", "They get dark when they get too ripe"], "image": "train2014/COCO_train2014_000000025234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506699, "question_id": "RvipYMFXgyvFD6BUwKNEWg", "question": "Which numbers are fully visible on both the top left and top right of the front of the bus?", "choices": ["350", "399", "810", "220"], "correct_choice_idx": 0, "direct_answers": ["350", "350", "three fifty", "350", "350112", "350", "350 112", "350", "350", "350 112"], "difficult_direct_answer": false, "rationales": ["The combination of 3, 5 and 0 can be seen on both sides.", "The numbers 350 are shown visible on the left and right top part of the bus.", "The numbers are fully visible and readable."], "image": "train2014/COCO_train2014_000000506699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370503, "question_id": "RvzMNdyYFc7XtWWmLsPfzP", "question": "Why is the person putting his hand onto the skateboard?", "choices": ["resting", "doing trick", "jumping off", "balancing"], "correct_choice_idx": 1, "direct_answers": ["balance", "avoid falling", "to balance", "balance", "doing trick", "to balance", "balance", "balance", "balance", "to balance"], "difficult_direct_answer": false, "rationales": ["The person is getting ready to do a flip.", "The person is trying to keep their balance.", "A skateboarder is skating down a sidewalk."], "image": "val2014/COCO_val2014_000000370503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199047, "question_id": "RwGnznNs4wGMqWo75S6rym", "question": "What OS is the computer monitor displaying?", "choices": ["ubuntu", "windows xp", "macos", "windows 95"], "correct_choice_idx": 1, "direct_answers": ["windows", "windows", "microsoft", "microsoft", "windows", "windows", "windows", "windows", "windows", "windows xp"], "difficult_direct_answer": false, "rationales": ["The desktop bar shows a green button.", "The computer screen looks like a typical windows xp screen.", "The computer seems to be running under windows."], "image": "val2014/COCO_val2014_000000199047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229940, "question_id": "RwPh267naQZZnP5Q8HZpWa", "question": "What is the white triangular button used for?", "choices": ["stopping video", "sharing video", "playing video", "pausing video"], "correct_choice_idx": 2, "direct_answers": ["play", "play video", "play", "play video", "playing media", "moving up", "playing video", "play", "play", "play"], "difficult_direct_answer": false, "rationales": ["An arrow pointing right symbolizes around the world that it means \"play\".", "The phone is playing.", "The button is to play the video."], "image": "train2014/COCO_train2014_000000229940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184358, "question_id": "RwfWHMNrBQKQDVtUKhwHA8", "question": "How would it be if she tried to snowboard assis?", "choices": ["too cold", "too hot", "no problem", "too dangerous"], "correct_choice_idx": 3, "direct_answers": ["too dangerous", "lose balance", "unsecured foot", "she'd fall", "difficult", "impossible", "difficult", "she'd fall", "cold", "difficult"], "difficult_direct_answer": false, "rationales": ["It would be very dangerous.", "There are other people with her to keep her safe", "She's not wearing enough equipment to safely snowboard."], "image": "val2014/COCO_val2014_000000184358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559388, "question_id": "RwtXiszgmEqxjC582KK7k2", "question": "How many mammals area shown?", "choices": ["one", "ten", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "one", "two", "two", "three", "three", "two", "three"], "difficult_direct_answer": false, "rationales": ["Two cats and a human equal 3.", "There are two cats and one man", "There are 2 cats and 1 person"], "image": "val2014/COCO_val2014_000000559388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228514, "question_id": "Rx6PQSMWETkRKJBGw4qs83", "question": "How many things can be cooked at once?", "choices": ["six", "four", "eight", "two"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are this many burners on the stove", "There are four burners.", "There are four burners."], "image": "train2014/COCO_train2014_000000228514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491836, "question_id": "RxJLvxTGVtod79MbkbPYLj", "question": "What is needed for the activity shown?", "choices": ["rain", "wind", "water", "snow"], "correct_choice_idx": 2, "direct_answers": ["boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "water", "boat"], "difficult_direct_answer": false, "rationales": ["You need this to float", "Boating requires water.", "The only visible activity is boating. boating requires water."], "image": "val2014/COCO_val2014_000000491836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67966, "question_id": "RxKWx92FzXrEoCS4oaVYSx", "question": "What is closest to the pizza?", "choices": ["dog", "woman", "cat", "child"], "correct_choice_idx": 3, "direct_answers": ["child", "child", "plate", "child", "child", "child", "child", "girl", "girl", "girl"], "difficult_direct_answer": false, "rationales": ["A child is near the pizza.", "The person inches away from the pizza is a juvenile human.", "The little kid is loving this huge pizza."], "image": "val2014/COCO_val2014_000000067966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250427, "question_id": "Rxfduo7GEZ8Bejfs8SmWmL", "question": "Mention the barber shop in the image?", "choices": ["o2abc", "sports", "none", "citycutz"], "correct_choice_idx": 3, "direct_answers": ["cut", "best coast", "citycuts", "barber shop", "barber shop", "citycutz", "aabc", "city cut", "citycuts", "citycutz"], "difficult_direct_answer": false, "rationales": ["Citycutz is where one would go to get their hair cut.", "A shop has a red and white striped pole outside of its front window.", "The company has the word cut in the name."], "image": "val2014/COCO_val2014_000000250427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158713, "question_id": "RxqdryPnJSmABErgWDE5aw", "question": "What is on top of the front wheel of the motorcycle?", "choices": ["basket", "fruit", "vegetables", "bird"], "correct_choice_idx": 0, "direct_answers": ["basket", "basket", "basket", "basket", "basket", "basket", "basket", "wheel guard", "basket", "basket"], "difficult_direct_answer": false, "rationales": ["A basket is on the front wheel.", "The basket is on top.", "You can see the basket in the image and it would be used to carry items in it."], "image": "train2014/COCO_train2014_000000158713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96826, "question_id": "RxsgPTwhcmA7jKRbfGRLra", "question": "What is the green bus doing?", "choices": ["unloading passengers", "is broken", "selling passengers", "loading passengers"], "correct_choice_idx": 3, "direct_answers": ["parked", "loading passengers", "boarding", "loading", "parking", "loading", "making stop", "getting passengers", "picking passengers", "loading passengers"], "difficult_direct_answer": false, "rationales": ["The bus unloads passengers.", "The bus is parked in front of a bus stop so that the person that is waiting can get on the bus.", "The bus is stopped at a bus stop."], "image": "train2014/COCO_train2014_000000096826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353938, "question_id": "RxubJgSumPVfqphnGa39nB", "question": "Which person has the most different sitting posture?", "choices": ["long sleeved", "left short-sleeved", "drink person", "cross legged"], "correct_choice_idx": 2, "direct_answers": ["black shirt", "middle", "middle", "third one", "drink person", "middle", "black shirt", "drinking guy", "middle right", "drink man"], "difficult_direct_answer": false, "rationales": ["He is sitting sideways with a foot on the bench", "The person is drinking.", "All persons beside answer a have their legs facing forward and on the ground while answer a does not. if all other are exhibiting similar body posture than answer a is the most different."], "image": "train2014/COCO_train2014_000000353938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468953, "question_id": "RxxvPfc8AEMrtzSv3LFFS7", "question": "What are they discussing?", "choices": ["traffic", "gas cost", "plans date", "gun cost"], "correct_choice_idx": 0, "direct_answers": ["police issues", "speed", "directions", "traffic", "speeding", "police matters", "issues", "traffic regulations", "donuts", "traffic"], "difficult_direct_answer": true, "rationales": ["They are discussing the traffic.", "More than likely they discussing what to do about all the cars on the road.", "The men are discussing traffic."], "image": "val2014/COCO_val2014_000000468953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209400, "question_id": "Ry6sNeaLkxdFENqKttXyFK", "question": "What scene is this place?", "choices": ["circus", "zoo", "themed park", "horse farm"], "correct_choice_idx": 3, "direct_answers": ["farm", "farm", "farm", "horses grazing", "field", "horse farm", "prairie", "farm", "farm pasture", "farm"], "difficult_direct_answer": false, "rationales": ["The scene is a farm.", "There is horses that are seen in the area.", "The field is in the countryside and there are only horses present."], "image": "train2014/COCO_train2014_000000209400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554006, "question_id": "RyA9fz3FPdr7fZe97T3KQM", "question": "What does the man wear green bow tie?", "choices": ["camouflage", "matches pants", "visibility", "dress code"], "correct_choice_idx": 1, "direct_answers": ["skating", "apron", "to skateboard", "stpatricks day", "matches pants", "costume", "suspenders", "st patricks", "skateboarding", "green"], "difficult_direct_answer": true, "rationales": ["The green bow tie is the same shade as the pants.", "A man wears a green pants and accessories.", "The man seems to have a theme and wants them to match."], "image": "val2014/COCO_val2014_000000554006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418722, "question_id": "RyBXf3fcs3Ej9Bt2NcH7oL", "question": "The orange items are usually eaten by what character?", "choices": ["popeye", "bugs bunny", "garfield", "crash bandicoot"], "correct_choice_idx": 1, "direct_answers": ["bugs bunny", "bugs bunny", "bugs bunny", "bugs bunny", "bugs bunny", "vegetables", "bugs bunny", "bunny rabbit", "bugs bunny", "bugs bunny"], "difficult_direct_answer": false, "rationales": ["Rabbits like carrots and he is a rabbit.", "The cartoon rabbit chews on carrots.", "The orange items are eaten by bugs bunny."], "image": "train2014/COCO_train2014_000000418722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329623, "question_id": "RyDLxya8dJwYyWynwMNVPv", "question": "How will this food be cooked?", "choices": ["barbecue", "fire", "oven", "microwave"], "correct_choice_idx": 1, "direct_answers": ["fire", "grill", "stir fry", "fried", "pan", "grill", "grill", "stir fry", "grilled", "grill"], "difficult_direct_answer": false, "rationales": ["They are outside as evidenced by the grass on the ground and tailgating chair is visible.", "The food is going to be put over flame on the kebab sticks.", "You have to use fire to cook food."], "image": "train2014/COCO_train2014_000000329623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449103, "question_id": "RyPrrhdBhfCUxJG2y5TbhK", "question": "What is the woman in the black boots doing with the can?", "choices": ["throwing", "buying", "drinking", "recycling"], "correct_choice_idx": 2, "direct_answers": ["drinking", "drinking", "drinking", "drinking", "drinking", "drinking", "drinking", "drinking", "drinking", "drinking"], "difficult_direct_answer": false, "rationales": ["It's raised and tipped towards her mouth.", "The woman is taking a sip.", "The woman is drinking."], "image": "train2014/COCO_train2014_000000449103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132565, "question_id": "RyXB8iifwkXaECexHak9dC", "question": "What picture is on the wall?", "choices": ["animal", "forest", "building", "car"], "correct_choice_idx": 2, "direct_answers": ["town", "abstract", "building", "house", "castle", "city", "castle", "water color", "village", "village"], "difficult_direct_answer": false, "rationales": ["A picture of the exterior of a place with blue roofs is on the wall.", "There are homes on the photo.", "The picture shows an immobile manmade structure that people enter and exit through a door. the structure has windows and a chimney."], "image": "train2014/COCO_train2014_000000132565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330133, "question_id": "Ryc6VzoKvwYWNjoDHRTGgt", "question": "What are the women standing in the middle of?", "choices": ["river", "water fountain", "yard sprinklers", "lake"], "correct_choice_idx": 1, "direct_answers": ["water fountain", "sprinklers", "fountain", "fountain", "splash pad", "water", "fountain", "fountain", "fountain", "water fountain"], "difficult_direct_answer": false, "rationales": ["Streams of water flow all around people standing on paved ground.", "The woman are being sprinkled by a water fountain.", "There is water all around the woman, and it's spraying as a fountain would."], "image": "train2014/COCO_train2014_000000330133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84499, "question_id": "RyfHz37h4sLsQhNXsDcytw", "question": "What does the bus say on the front?", "choices": ["delhi", "dubai", "kajal", "hindsa"], "correct_choice_idx": 2, "direct_answers": ["kajal", "kajal", "kajal", "kajal", "kajal", "kajal", "kajal", "kajal", "kajal", "kajal"], "difficult_direct_answer": false, "rationales": ["The writing is visible on a vehicle that is definitely a bus based on its design.", "The letters are blue and easy to see", "The bus says kajal."], "image": "train2014/COCO_train2014_000000084499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56837, "question_id": "Rypsq3sUxMWgMurWxaLC6z", "question": "What are the men on the boards attempting to do?", "choices": ["stand", "lay", "dive", "jump"], "correct_choice_idx": 0, "direct_answers": ["surf", "surf", "stand", "balance", "surf", "hang ten", "surf", "catch wave", "surf", "surf"], "difficult_direct_answer": false, "rationales": ["They are both trying to stand on the surfboards.", "The men on the boards want to stand up.", "They are attempting to stand on the boards and catch the wave."], "image": "train2014/COCO_train2014_000000056837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266520, "question_id": "RyuYCtW7FfvJCRK5VWJc4x", "question": "What is the man doing to the cow?", "choices": ["combing", "bathing", "milking", "feeding"], "correct_choice_idx": 2, "direct_answers": ["milking", "milking it", "milking", "milking", "milking", "milking", "milking", "milking it", "milking", "milking"], "difficult_direct_answer": false, "rationales": ["The man is milking the cow.", "The man is sitting next to the cow because he is milking it and filling the silver bucket.", "The man is milking the cow to get milk."], "image": "train2014/COCO_train2014_000000266520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448046, "question_id": "Rz8EHFYii3WMeAKgdqBCVK", "question": "What did the two people riding the tandem bike just do?", "choices": ["lost bet", "met santa", "got married", "won game"], "correct_choice_idx": 2, "direct_answers": ["get married", "get on", "marriage", "got married", "married", "wave", "got married", "got married", "by cycle", "get married"], "difficult_direct_answer": false, "rationales": ["There is a bride and a groom on the bike so they likely just got married.", "A man and a woman are on the tandem bike. they are wearing a suit and a white dress.", "Two people are on a bike in a wedding gown and tuxedo."], "image": "train2014/COCO_train2014_000000448046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570594, "question_id": "RzQEDCf297KdaX3J3BKCwG", "question": "Which horror movie title is related to what these animals are showing?", "choices": ["fang", "spike", "tusk", "claw"], "correct_choice_idx": 2, "direct_answers": ["witcher", "tusk", "scream", "attack movie", "tusk", "tusks", "dumbo", "tusk", "tusk", "tusk"], "difficult_direct_answer": false, "rationales": ["These animals are elephants. they have large white teeth-like items near their trunks.", "These animals are elephants. they do not have fangs, claws, or spikes.", "This was a movie that came out in 2014."], "image": "val2014/COCO_val2014_000000570594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72052, "question_id": "RzXFxCm3tkLQAHVn3SHzNd", "question": "What does the restaurant most probably have in addition to food?", "choices": ["liquor", "hookah", "cannabis", "casino"], "correct_choice_idx": 0, "direct_answers": ["cocktails", "drinks", "alcohol", "alcohol", "alcohol", "drinks", "alcohol", "liquor", "drink", "alcohol"], "difficult_direct_answer": false, "rationales": ["An establishment has signs out front that advertise a liquor license as well as food.", "The restaurant has a sign that says they are fully licensed to sell liquor.", "It advertises with a picture of a cocktail glass and mentions it's licensed"], "image": "val2014/COCO_val2014_000000072052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389721, "question_id": "RzeLiNbCd4Kqsc6GraQ5ce", "question": "What are the giraffes near?", "choices": ["man", "cow", "car", "building"], "correct_choice_idx": 3, "direct_answers": ["wall", "fence", "building", "wall", "fence", "food", "place", "wire fence", "fence", "food"], "difficult_direct_answer": false, "rationales": ["The giraffes are eating from a basket on the wall of a building.", "The are close to the building.", "The giraffes are close to the building."], "image": "val2014/COCO_val2014_000000389721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540444, "question_id": "RzyfmB9azQFuWyboruqx6F", "question": "Why does the girl on the table look sad?", "choices": ["was hit", "spilled drink", "lost keys", "shoelaces untied"], "correct_choice_idx": 1, "direct_answers": ["glass over", "no wine", "wine's gone", "spilled drink", "acting", "looking down", "unsure", "spilled glass", "posing", "injured"], "difficult_direct_answer": true, "rationales": ["The wine glass on the table is laying on the side with liquid spilled.", "She is staring at an empty glass.", "She knocked a glass over."], "image": "train2014/COCO_train2014_000000540444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4984, "question_id": "S23QoENNMWiW79PmDPpePv", "question": "What are these kids wearing that keeps the water out of their eyes?", "choices": ["blindfold", "goggles", "eyeglasses", "sunglasses"], "correct_choice_idx": 1, "direct_answers": ["goggles", "goggles", "goggles", "goggles", "goggles", "goggles", "goggles", "goggles", "goggles", "swim goggles"], "difficult_direct_answer": false, "rationales": ["The ocean is making large waves that go above the children's heads.", "The kids are surfing in the ocean and to protect their eyes from the environment.", "The kids have goggles."], "image": "train2014/COCO_train2014_000000004984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516570, "question_id": "S27CdW8nYF6HedThDPGLrX", "question": "Why are their hands behind them?", "choices": ["swinging wildly", "falling", "awaiting ball", "just swung"], "correct_choice_idx": 3, "direct_answers": ["swung bat", "just swung", "swinging", "missed swing", "swinging bat", "hit ball", "swing follow-through", "bat swing", "swinging", "missed swing"], "difficult_direct_answer": false, "rationales": ["The batter swung the bat.", "The ball is behind the person, and their body language shows they just exerted effort into hitting it, but failed.", "The person is holding a bat and a ball is behind them, it is an obvious swing and miss which leaves arms behind the person."], "image": "train2014/COCO_train2014_000000516570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388325, "question_id": "S2bPJDisFVWifBAhVsf7gx", "question": "What type of siding is found on the house?", "choices": ["vinyl", "steel", "brick", "mud"], "correct_choice_idx": 0, "direct_answers": ["shingle", "vinyl", "shingles", "vinyl", "wood", "vinyl siding", "wooden", "tile siding", "grey", "wood"], "difficult_direct_answer": false, "rationales": ["The other options are visibly not matching to the sides of this building.", "The siding is visible and the size and shape of the panels is consistent with answer a.", "None of it is faded"], "image": "val2014/COCO_val2014_000000388325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130122, "question_id": "S2fT6E6y5VfU2TByJQBhbq", "question": "What is the man in the white hat doing?", "choices": ["judging", "complaining", "cheer leading", "escaping"], "correct_choice_idx": 0, "direct_answers": ["officiating", "refereeing", "watching", "hitting ball", "being referee", "line umpire", "catching balls", "referee", "judging", "judging game"], "difficult_direct_answer": true, "rationales": ["He is there to judge the game being played and make calls if needed.", "He's looking to make sure the ball stays in the lines", "The man in the hat is refereeing the plays."], "image": "train2014/COCO_train2014_000000130122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212047, "question_id": "S2qPRdRn6LJmjCZrQXZFFa", "question": "This country has what type of government?", "choices": ["republic", "monarchy", "anarchy", "dictatorship"], "correct_choice_idx": 0, "direct_answers": ["communist", "monarchy", "canada", "england", "parliamentary", "democracy", "china", "federal republic", "britain", "republic"], "difficult_direct_answer": true, "rationales": ["The canadian flag is shown.", "This is located in the country of georgia, which was previously a soviet republic and is now a democratic republic.", "The country has a republic."], "image": "train2014/COCO_train2014_000000212047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321418, "question_id": "S37mMDafkyfw2xDrQeJ2MR", "question": "What type activity is enjoyed here?", "choices": ["cattle show", "blood letting", "beef slaughter", "cake walk"], "correct_choice_idx": 0, "direct_answers": ["farming", "cattle auction", "showcasing", "contest", "cattling", "cattle show", "ranching", "cows", "showing cattle", "cow walking"], "difficult_direct_answer": true, "rationales": ["A cattle show is being held.", "Right now they are having a cattle show.", "The activity is a cattle show."], "image": "train2014/COCO_train2014_000000321418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387903, "question_id": "S3H67Yh6PWkY79XqaMChRq", "question": "How many paintings are on the wall?", "choices": ["two", "three", "six", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is a single painting on the wall.", "The rest of the wall is tile", "There are two beds. half of them have a painting."], "image": "train2014/COCO_train2014_000000387903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141228, "question_id": "S3KWBwjgcrQg9U4CSnBDfX", "question": "The gallon sized jug in the refrigerator door holds liquid from which subfamily?", "choices": ["birds", "bovine", "swine", "equine"], "correct_choice_idx": 1, "direct_answers": ["fruit punch", "milk", "juice", "bovine", "can't see", "milk", "big", "fruit punch", "juice", "dairy"], "difficult_direct_answer": false, "rationales": ["Milk comes from cows.", "The container appears to be milk based on the color of the contents and the cap and label color and style. milk that most people drink is from cows who belong to option a.", "The jug on the shelf of the refrigerator door contains milk from cows that belong to the bovine family."], "image": "val2014/COCO_val2014_000000141228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353001, "question_id": "S3RbKKbCBMAMffR2EfZFVH", "question": "For what protection are the pink round items used?", "choices": ["sleet", "rain", "sun", "snow"], "correct_choice_idx": 2, "direct_answers": ["uv rays", "rain sun", "sun", "solar", "sun rays", "sun", "sun", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["These are umbrellas and the people are using them for shade.", "It is not raining, snowing, or sleeting. people are dressed as they would be on a very hot day.", "The pink round items are parasols. there is no precipitation near the people."], "image": "val2014/COCO_val2014_000000353001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54009, "question_id": "S3Uqv6vdRTtdjaVZJbgWcv", "question": "Who would drive these vehicles?", "choices": ["officers", "passengers", "employees", "students"], "correct_choice_idx": 2, "direct_answers": ["pilot", "pilot", "pilots", "staff", "pilot", "pilots", "pilots", "pilots", "employees", "pilots"], "difficult_direct_answer": false, "rationales": ["A commercial plane is being loaded at an airport.", "There will be employs seems to be driving the cars.", "People driving the vehicles work for airlines."], "image": "train2014/COCO_train2014_000000054009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311408, "question_id": "S3sKBuZjaPSwmM8W3jPNfE", "question": "What dressing is the green bowl likely to be?", "choices": ["honey mustard", "thousand island", "balsamic vinegar", "ranch"], "correct_choice_idx": 3, "direct_answers": ["salsa", "ranch", "avocado", "ranch", "ranch", "ranch", "ranch", "gravy", "guacamole", "guacamole"], "difficult_direct_answer": false, "rationales": ["It's ranch dressing.", "The dressing is white.", "Ranch dressing is white."], "image": "train2014/COCO_train2014_000000311408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72898, "question_id": "S3uTZ7dNtLfaWUXW9sjWfD", "question": "What kind of event is taking place?", "choices": ["cook out", "wedding", "fancy dinner", "date"], "correct_choice_idx": 0, "direct_answers": ["barbecue", "barbecue", "cook out", "barbeque", "picnic", "bbq", "outdoor", "barbecue", "barbecue", "cookout"], "difficult_direct_answer": false, "rationales": ["Hot dogs are being warmed on a barbecue. people do not serve hot dogs at fancy dinners, weddings, or dates.", "Grilling hot dogs outside is a traditional form of cooking in the summer.", "The food is being cooked outside."], "image": "train2014/COCO_train2014_000000072898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310278, "question_id": "S42U2DELGRdSiVyxDnfdKf", "question": "Which object is in the greatest danger?", "choices": ["right cyclist", "middle cyclist", "man standing", "silver car"], "correct_choice_idx": 1, "direct_answers": ["bicycle", "bicyclist", "bike rider", "yellow bicycle", "car", "silver car", "biker", "car", "car", "middle cyclist"], "difficult_direct_answer": false, "rationales": ["The middle cyclist is on the phone.", "The middle cyclist is in danger of being hit by car doors, not to mention what other opposing traffic there might be, so he's the most vulnerable one in terms of danger.", "The middle cyclist is in danger of running into the car."], "image": "val2014/COCO_val2014_000000310278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472938, "question_id": "S43SaitZpn9RwPuGoZbqju", "question": "Vacuoles are present in which cell?", "choices": ["plant", "prokaryote", "eukaryote", "animal"], "correct_choice_idx": 0, "direct_answers": ["plants", "window", "plants", "plant", "tree", "plant", "plant", "unknown", "left", "cytoplasm"], "difficult_direct_answer": false, "rationales": ["There are plants present in the cell area under the window.", "They are present in the cell, as the diagram shows.", "Plants are growing all around a stone building."], "image": "train2014/COCO_train2014_000000472938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368676, "question_id": "S4LWfhK3cFeQ4EYebyVQ2n", "question": "What relation is the man to the baby on his back?", "choices": ["brother", "neighbor", "father", "teacher"], "correct_choice_idx": 2, "direct_answers": ["father", "father", "father", "father", "father", "father", "father", "father", "father", "father"], "difficult_direct_answer": false, "rationales": ["A man is walking dogs and has a baby on a pack on his back. parents use various items to carry their babies.", "The child is comfortable with him and the man is the right age to have young children", "The baby is a generation younger and looks similar to the adult."], "image": "val2014/COCO_val2014_000000368676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47684, "question_id": "S4epuJ6ZiTw6nZYAdeSP9M", "question": "What normally goes into these two fixtures?", "choices": ["urine", "coffee", "tea", "juice"], "correct_choice_idx": 0, "direct_answers": ["urine", "urine", "urine", "urine", "urine", "urine", "urine", "urine", "urine", "urine"], "difficult_direct_answer": false, "rationales": ["These types of urinals are used to relieve yourself.", "A man is standing at a urinal in a public background.", "These bathroom fixtures are designed to accept pee from men."], "image": "train2014/COCO_train2014_000000047684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497296, "question_id": "S4iZsguRnceSxFBYJZsjPP", "question": "How do these people know each other?", "choices": ["rivals", "coworkers", "teammates", "classmates"], "correct_choice_idx": 3, "direct_answers": ["from school", "classmates", "students", "classmates", "classmates", "classmates", "sisters", "school", "classmates", "attend school"], "difficult_direct_answer": false, "rationales": ["They are dressed alike in uniforms", "They look like they are in the same school uniform.", "They have backpacks on and school uniforms."], "image": "train2014/COCO_train2014_000000497296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65806, "question_id": "S5PUwPLjEtsq6SvnydazTW", "question": "What would happen if you put paper directly over his hand?", "choices": ["it'd soak", "it'd tear", "it'd burn", "it'd fly"], "correct_choice_idx": 2, "direct_answers": ["burn", "catch fire", "burn", "burning paper", "start fire", "catch fire", "fire", "burn", "catch fire", "it'd burn"], "difficult_direct_answer": false, "rationales": ["He is holding a lighter that is currently ignited and paper is very flammable.", "There is a flame from a lighter.", "The fire would cause the paper to start on fire and it would burn."], "image": "val2014/COCO_val2014_000000065806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383250, "question_id": "S5PZoHSdRyRfryHAbvBtjX", "question": "Why would someone sit here?", "choices": ["to work", "to paint", "to wait", "to eat"], "correct_choice_idx": 0, "direct_answers": ["work", "work", "do work", "to work", "work", "work", "do work", "to work", "working", "work"], "difficult_direct_answer": false, "rationales": ["The desk is for working.", "The object is a desk and people use it as a workstation.", "There are computers, a phone and a cup of coffee on a professional looking desk."], "image": "train2014/COCO_train2014_000000383250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77154, "question_id": "S5W9c8hCRWE2gqzQ72R3vn", "question": "What kind of windows does the building centered here have?", "choices": ["tinted", "bay windows", "doll house", "trasit windows"], "correct_choice_idx": 1, "direct_answers": ["double hung", "bay", "bay", "dormer windows", "mirror window", "bay", "sliding", "double", "french", "bay windows"], "difficult_direct_answer": false, "rationales": ["It protrudes from the building where the windows are located", "They have fancy bay windows.", "The style of window is known as a bay window."], "image": "train2014/COCO_train2014_000000077154.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426877, "question_id": "S5YPaBwcepPVJUx7ZqGoKP", "question": "How many types of Frisbee's are there?", "choices": ["six", "four", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["three", "one", "two", "3 types", "one", "one", "four", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There are this many for the game and other kinds as well", "They are all floating frisbees.", "There are three frisbees."], "image": "train2014/COCO_train2014_000000426877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182335, "question_id": "S5aWiTUUzjw63Rhcfb8ZHC", "question": "What is the pole near the older woman's leg?", "choices": ["telephone pole", "golf club", "cane", "umbrella"], "correct_choice_idx": 2, "direct_answers": ["bench leg", "cane", "cane", "cane", "cane", "white cane", "cane", "blind cane", "cane", "walking cane"], "difficult_direct_answer": false, "rationales": ["A woman holds a thin object with a handle as she sits on a bench with others.", "She's old and needs a cane to walk.", "The pole in question is the size and shape consistent with answer a and someone her age might commonly use such a device."], "image": "train2014/COCO_train2014_000000182335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61672, "question_id": "S66Rn8hdVMbBbhXtVkXHGh", "question": "What food did they already eat?", "choices": ["banana", "yogurt", "pizza", "fish"], "correct_choice_idx": 1, "direct_answers": ["yogurt", "yogurt", "croissant", "yogurt", "yogurt", "yogurt", "yogurt", "breakfast", "yogurt", "yogurt"], "difficult_direct_answer": false, "rationales": ["The plastic container of yogurt next to the bowl is open and has already been eaten.", "There is a container of answer a clearly visible and identifiable and it is not full based on the level of the contents. to lower the level of this type of food, it is likely one ate it.", "The cup holding containing this food has been opened and is almost empty."], "image": "train2014/COCO_train2014_000000061672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340069, "question_id": "S6ARUJ4H5URqwvSf8nbQ2d", "question": "What is inside the cake being cut?", "choices": ["marshmallows", "bread", "ice cream", "angel food"], "correct_choice_idx": 2, "direct_answers": ["knife", "vanilla cake", "ice cream", "ice cream", "knife", "ice cream", "ice cream", "ice cream", "cake", "cake"], "difficult_direct_answer": false, "rationales": ["The cake has ice cream.", "The cake is decorated like ice cream so it likely has ice cream inside.", "There is an ice cream cone on top."], "image": "val2014/COCO_val2014_000000340069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78925, "question_id": "S6EJpyJM4KdyKBYBsiMm6J", "question": "Why are there dark patches on the mountain on the right side?", "choices": ["fire damage", "erosion", "dark soil", "cloud shadows"], "correct_choice_idx": 3, "direct_answers": ["clouds", "lacking foliage", "dirt", "cloud shadows", "cloud shadows", "shade", "shadows", "shadow", "shadow", "old age"], "difficult_direct_answer": false, "rationales": ["The sun is shining. the white aerosols in the sky are blocking some of the sunlight.", "The sky is blue with many clouds.", "There are shadows."], "image": "train2014/COCO_train2014_000000078925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493812, "question_id": "S6M5gr7kEqtSrvpBg6sxkP", "question": "What period of the day is the person in?", "choices": ["afternoon", "night", "evening", "morning"], "correct_choice_idx": 0, "direct_answers": ["noon", "afternoon", "morning", "afternoon", "afternoon", "morning", "afternoon", "afternoon", "afternoon", "morning"], "difficult_direct_answer": false, "rationales": ["There is a clock tower. it indicates that it is just after 3 p.m.", "The clock shows that it is approximately 3:00. the fact that it is still daytime would indicate that it would be 3 in the afternoon versus 3 early morning.", "The sky is bright."], "image": "train2014/COCO_train2014_000000493812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205380, "question_id": "S6NnpDVbF7ituw95TGix45", "question": "What place is shown here?", "choices": ["wilderness", "zoo", "park", "farm"], "correct_choice_idx": 3, "direct_answers": ["farm", "farm", "farmland", "field", "field", "field", "sheep farm", "sheep farm", "farm", "sheep field"], "difficult_direct_answer": false, "rationales": ["The sheep are grazing in a meadow on a farm that is fenced in.", "This is a grazing field for a farm.", "This place shown is a farm of livestock such as sheep."], "image": "train2014/COCO_train2014_000000205380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533097, "question_id": "S6jgsZsWTKGkUSEfuwbxE2", "question": "What are the green items on top of the pizza?", "choices": ["olives", "green onions", "green peppers", "jalapenos"], "correct_choice_idx": 3, "direct_answers": ["peppers", "peppers", "jalapenos", "jalapenos", "peppers", "peppers", "jalapenos", "jalapenos", "jalapenos", "jalapenos"], "difficult_direct_answer": false, "rationales": ["Jalapenos are cut into rings.", "The green items are spicy peppers.", "The shape and size and color indicate that these are jalapenos."], "image": "val2014/COCO_val2014_000000533097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487299, "question_id": "S7g9SookjEJdVSoxjvZeb3", "question": "When the water drains one would expect to see what?", "choices": ["road", "mountain", "grass", "river bed"], "correct_choice_idx": 0, "direct_answers": ["street", "ground", "street", "road", "sand", "street", "street", "street", "road", "street"], "difficult_direct_answer": false, "rationales": ["It is the image that shows the road.", "You can tell buy the signs and setting as to what is under the water.", "We could expect to see a road under the traffic lights."], "image": "val2014/COCO_val2014_000000487299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132874, "question_id": "S7oKusBcQk3tYiczwVyHkX", "question": "Why is the left end of the front bun irregular?", "choices": ["design flaw", "cut funny", "poor workmanship", "took bite"], "correct_choice_idx": 3, "direct_answers": ["bite taken", "hot dog", "it's bite", "eaten", "took bite", "bitten", "bitten", "bitten", "bite taken", "bite missing"], "difficult_direct_answer": false, "rationales": ["There is a piece missing. the remaining bun is in the shape of a row of teeth.", "The end has a tooth-mark imprint, indicating it has been bitten into.", "The left end of the bun on the front had a bite taken out of it."], "image": "train2014/COCO_train2014_000000132874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401312, "question_id": "S7oNFPdCBomfuJfuBsy6uA", "question": "What type of area is shown?", "choices": ["rural", "tropical", "urban", "arctic"], "correct_choice_idx": 2, "direct_answers": ["city", "overpass", "industrial area", "urban downtown", "city", "south dallas", "highway", "urban", "california city", "city"], "difficult_direct_answer": false, "rationales": ["Urban areas are in cities, places with high populations. skyscrapers can be seen as well as highways.", "An overhead shot of a cable car above a cityscape. there are cars traveling down the highway below.", "There is a train, a highway and many buildings."], "image": "train2014/COCO_train2014_000000401312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37205, "question_id": "S7rn6JZuKmVkQHxbbqv6aV", "question": "What can be seen on the front of the poled structure on the far right?", "choices": ["billboard ad", "television screen", "speed camera", "lights"], "correct_choice_idx": 0, "direct_answers": ["traffic light", "light", "sunshine", "billboard ad", "billboard", "traffic light", "sign", "green light", "advertisement", "shadow"], "difficult_direct_answer": true, "rationales": ["The visible object on the poled structure is the right size, shape and configuration to be a billboard. it is also located next to a road where these ads are most commonly displayed.", "You can tell by the color and the words as to what it is.", "There is a large billboard ad on the poled structure up to the right."], "image": "train2014/COCO_train2014_000000037205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179095, "question_id": "S7x57EUCfewhvdC5JDspAG", "question": "Who is giving a gift to the kids here?", "choices": ["child", "mother teresa", "military man", "trump"], "correct_choice_idx": 2, "direct_answers": ["military", "soldier", "army man", "soldier", "military man", "military", "soldier", "soldier", "soldier", "soldier"], "difficult_direct_answer": false, "rationales": ["He is wearing an airforce uniform.", "He is wearing a pilot military uniform.", "The military man gives the gift."], "image": "train2014/COCO_train2014_000000179095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540636, "question_id": "S863Wz83DS4GmFj8pGMBLh", "question": "The animal represented by the float usually lives where?", "choices": ["desert", "plains", "ocean", "snow"], "correct_choice_idx": 2, "direct_answers": ["ocean", "ocean", "ocean floor", "sea", "ocean", "ocean", "water", "ocean", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["Lobsters live in the ocean.", "The animal is a lobster. this animal lives in an aquatic environment.", "The animal is in the ocean."], "image": "val2014/COCO_val2014_000000540636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265799, "question_id": "S8MwXRA8Fz99fqTr7GLcM8", "question": "What is the silver area of the bike made of?", "choices": ["glass", "chrome", "pewter", "plastic"], "correct_choice_idx": 1, "direct_answers": ["chrome", "chrome", "metal", "stainless steel", "chrome", "chrome", "metal", "spokes", "gas tank", "chassis"], "difficult_direct_answer": false, "rationales": ["The silver part of the bike is chrome.", "The area is chrome.", "The silver area of the bike is made of shiny chrome metal."], "image": "train2014/COCO_train2014_000000265799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183620, "question_id": "S8NWGUk5kNkqFZMyc4HT8r", "question": "What's the woman attempting to hit?", "choices": ["boxes", "speakers", "containers", "televisions"], "correct_choice_idx": 3, "direct_answers": ["television", "television", "tv", "televisions", "television", "television", "television", "television", "television", "tv"], "difficult_direct_answer": false, "rationales": ["She's breaking up televisions.", "She is hitting the televisions on the side walk.", "The woman is at a tv bashing event."], "image": "val2014/COCO_val2014_000000183620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302102, "question_id": "S8WNpprG9noucUBT4BConH", "question": "Why are the lights on at this ski resort?", "choices": ["it's raining", "it's cloudy", "it's storming", "it's night"], "correct_choice_idx": 3, "direct_answers": ["night", "nighttime", "to see", "it's dark", "night skiing", "night time", "it's night", "it's nighttime", "it's dark", "nighttime"], "difficult_direct_answer": false, "rationales": ["The lights are one because it is night.", "A ski resort does not use lights on the runs during the day.", "It is dark."], "image": "train2014/COCO_train2014_000000302102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179814, "question_id": "S8kuyG3vjHWzuFNjoyABqY", "question": "What is next to the oven?", "choices": ["spices", "refrigerator", "dishwasher", "sink"], "correct_choice_idx": 0, "direct_answers": ["rack", "shelves", "leg", "spices", "spice rack", "spices", "spices", "spice rack", "shelves", "spices"], "difficult_direct_answer": false, "rationales": ["People use spices to cook. they are kept in bottles and jars.", "You can tell by the colors and the shelf they are on as to what they are.", "There is a rack with many small bottles of dry cooking ingredients."], "image": "train2014/COCO_train2014_000000179814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334743, "question_id": "S8wsdmanJkshrwAzjdLx6X", "question": "Which side of the street do the busses seen here travel when driving forward?", "choices": ["none", "right", "center only", "left"], "correct_choice_idx": 3, "direct_answers": ["back side", "left", "left", "left", "left", "left", "back side", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["This is in london and they drive on that side", "These buses are from the uk and drive on the left.", "These buses belong in the united kingdom so they drive on the left side of the road."], "image": "val2014/COCO_val2014_000000334743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45494, "question_id": "S94rYAvUmNSR35HCTQtDJU", "question": "What does this truck most likely haul?", "choices": ["equipment", "cars", "waste", "trees"], "correct_choice_idx": 2, "direct_answers": ["waste", "waste", "waste", "cargo", "trash dumpsters", "sewage", "waste", "trash", "waste", "waste"], "difficult_direct_answer": false, "rationales": ["The name of the company has this word in it", "There is a sign on the side of the truck.", "It has words of waste which is the prove."], "image": "train2014/COCO_train2014_000000045494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222694, "question_id": "S95EiDCMU2MYgzKLAADAjF", "question": "What is the red object that sits in the basket?", "choices": ["bag", "helmet", "bottle", "lock"], "correct_choice_idx": 2, "direct_answers": ["bottle", "water bottle", "bottle", "bottle", "water bottle", "water bottle", "water bottle", "bottle", "water", "water bottle"], "difficult_direct_answer": false, "rationales": ["The bicycle has a water container in the basket.", "It has the cylindrical shape and snap lid of a water container.", "It is in a cylinder shape and has a top for the removal of liquids."], "image": "train2014/COCO_train2014_000000222694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279108, "question_id": "S95dKSCFPnDGzBpqExnMZQ", "question": "Where does the woman stand with a utensil?", "choices": ["living room", "storefront", "ship", "kitchen"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "kitchen", "kitchen", "left", "kitchen", "left", "kitchen", "left", "kitchen", "in kitchen"], "difficult_direct_answer": false, "rationales": ["There is a stove behind her. she is feeding a man something.", "The woman is in a kitchen.", "The woman stands with a fork in the kitchen."], "image": "train2014/COCO_train2014_000000279108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413971, "question_id": "S9WRvWnwK74kMx2MFHDaTP", "question": "What topic is absent from these hot dogs?", "choices": ["mustard", "ketchup", "onions", "chili"], "correct_choice_idx": 3, "direct_answers": ["chili", "mustard", "relish", "relish", "relish", "nothing", "relish", "relish", "onion", "chili"], "difficult_direct_answer": false, "rationales": ["The hot dogs don't have chili on them.", "There is no chili visible.", "There is no chili."], "image": "val2014/COCO_val2014_000000413971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451574, "question_id": "S9y8YyhdxfuN8KH42wVDRw", "question": "To which location does this plane mainly fly?", "choices": ["new york", "antarctica", "canada", "tahiti"], "correct_choice_idx": 3, "direct_answers": ["papeete", "tahiti", "tahiti/bora bora", "tahiti", "tahiti", "hawaii", "tahiti", "tahiti", "tahiti", "tahiti"], "difficult_direct_answer": false, "rationales": ["This plane has tahiti on its side.", "The plane says air tahiti on it.", "The word is on the plane."], "image": "train2014/COCO_train2014_000000451574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332270, "question_id": "S9z4BbFohccpbVCmz5PS8X", "question": "What animals are seen on the white rectangular shaped envelope?", "choices": ["raccoons", "squirrels", "groundhogs", "skunks"], "correct_choice_idx": 0, "direct_answers": ["raccoons", "raccoons", "racoons", "raccoons", "racoons", "raccoons", "racoons", "raccoons", "racoons", "raccoons"], "difficult_direct_answer": false, "rationales": ["The animals are locatable by the text of the question and have the unique color patterning consistent with answer a.", "These animals have the color and shape of raccoons.", "Its a book laying next to the child."], "image": "train2014/COCO_train2014_000000332270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101882, "question_id": "S9zC4PWc6TRbkDee2DMgih", "question": "What time of day is the bike being rolled into the covered area?", "choices": ["midnight", "noon", "one pm", "dusk"], "correct_choice_idx": 3, "direct_answers": ["dusk", "night", "evening", "night time", "evening", "night", "evening", "evening", "night", "nighttime"], "difficult_direct_answer": false, "rationales": ["It's getting dark outside", "The day is pretty dark.", "It is not overly dark, but you can tell the sun is going down."], "image": "train2014/COCO_train2014_000000101882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426011, "question_id": "SA6iVZ2rxTLWusYwFHhgsz", "question": "What is the man holding the paper wearing?", "choices": ["glasses", "gas mask", "suit", "cowboy hat"], "correct_choice_idx": 2, "direct_answers": ["suit", "sport coat", "sports jacket", "blazer", "suit", "blazer", "sports coat", "blazer", "suit jacket", "suite"], "difficult_direct_answer": false, "rationales": ["He is wearing a tailored jacket and a collared shirt.", "The man is wearing a suit.", "The man with the paper has a brown jacket on with a plaid shirt inside. taken together the nice jacket and shirt comprise a suit."], "image": "val2014/COCO_val2014_000000426011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27980, "question_id": "SA7XvK4EMvL8uaosqQPdUq", "question": "What is the red and white box on the left used for?", "choices": ["shipping", "keeping cool", "solving puzzles", "collecting sand"], "correct_choice_idx": 1, "direct_answers": ["keeping cool", "tools", "food", "tools", "tackle", "cooling", "food drinks", "keeping cool", "cooling food", "carrying items"], "difficult_direct_answer": false, "rationales": ["It is a cooler that keeps food chilly.", "It looks like it might be a cooler.", "It has insulation. insulation can keep items at a lower temperature."], "image": "train2014/COCO_train2014_000000027980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438811, "question_id": "SAMXEXkRkw22baT9gbHpGZ", "question": "This bus belongs to which state?", "choices": ["delhi", "kerala", "punjab", "karnataka"], "correct_choice_idx": 1, "direct_answers": ["germany", "indonesia", "unknown", "thailand", "thailand", "tamil nada", "india", "kerala", "india", "thailand"], "difficult_direct_answer": false, "rationales": ["The bus is in kerala.", "The bus belongs to kerala, it says so on the bus.", "The bus's license plate indicates kerala."], "image": "val2014/COCO_val2014_000000438811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521808, "question_id": "SASwMmzKQv2PzHpqXUr7RC", "question": "Where is the toilet tank?", "choices": ["in floor", "no tank", "off camera", "inside wall"], "correct_choice_idx": 3, "direct_answers": ["wall", "wall", "back toilet", "bottom", "behind wall", "wall", "tankless", "inside wall", "in wall", "tankless"], "difficult_direct_answer": false, "rationales": ["There is no visible tank.", "The tank is hidden inside the wall.", "The water is dispersed on demand."], "image": "train2014/COCO_train2014_000000521808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515853, "question_id": "SAsmZBM6cXgx9C9j92t7tn", "question": "In what city are people traveling on this sightseeing bus?", "choices": ["virginia", "washington d.c", "seattle", "maryland"], "correct_choice_idx": 1, "direct_answers": ["washington dc", "washington d.c", "washington dc", "washington dc", "washington dc", "washington dc", "washington dc", "washington dc", "washington dc", "washington dc"], "difficult_direct_answer": false, "rationales": ["There is writing on the side of the bus that says what it is and where it operates as well as visible monuments in the background known to be located in answer a.", "The name of the tour city is written on the side of the bus.", "The bus says washington dc on it."], "image": "train2014/COCO_train2014_000000515853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196924, "question_id": "SB8XC5v3rmBW25Rxjuwurs", "question": "Based on the door this furniture is most likely located in what?", "choices": ["storage unit", "living room", "barn", "bedroom"], "correct_choice_idx": 0, "direct_answers": ["storage unit", "storage unit", "living room", "storage unit", "storage unit", "storage unit", "storage unit", "garage", "garage", "storage unit"], "difficult_direct_answer": false, "rationales": ["The door is a storage unit.", "With the garage door, the place is probably located at a storage facility.", "This garage-style type of sliding door is typical of a storage space compartment, otherwise known as a storage unit."], "image": "val2014/COCO_val2014_000000196924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429517, "question_id": "SBfJfq6GNaoEeeagyqcuGg", "question": "What costumed character is this lady mimicking?", "choices": ["santa", "humpty dumpty", "clown", "elve"], "correct_choice_idx": 2, "direct_answers": ["clown", "clown", "rudolph", "bozo clown", "clown", "girl", "joker", "clown", "clown", "snow white"], "difficult_direct_answer": false, "rationales": ["The woman is mimicking a clown with her red nose.", "The lady is wearing a clown nose.", "A woman is turned towards another and is wearing a red round nose."], "image": "train2014/COCO_train2014_000000429517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188029, "question_id": "SBtuPLain3G4NTcVKxGi4z", "question": "What is the man in blue t-shirt holding?", "choices": ["telescope", "camera", "binoculars", "microscope"], "correct_choice_idx": 1, "direct_answers": ["camera", "camera", "camera", "camera", "camera", "camera", "camera", "camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The man has a camera.", "He is taking a video of the game.", "It would not be appropriate to take a telescope or microscope to a baseball game. the man is close enough to the field to not need binoculars."], "image": "train2014/COCO_train2014_000000188029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453409, "question_id": "SC5QeihsTgHpyGGqNMPLZH", "question": "The tower behind the center festival decoration is used for broadcasting what?", "choices": ["cellular service", "radar", "television", "radio"], "correct_choice_idx": 0, "direct_answers": ["entertaining", "radio", "radio waves", "cellular service", "radio", "radio", "radio signals", "radio signal", "cell calls", "radio signals"], "difficult_direct_answer": false, "rationales": ["The tower is for cell service", "The tower in the background is used for cell service.", "The tower is for cellular networks."], "image": "train2014/COCO_train2014_000000453409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562348, "question_id": "SCRJGa4rse8noWp6WZVDeL", "question": "What kind of weather is the woman experiencing?", "choices": ["snow", "rain", "sleet", "wind"], "correct_choice_idx": 1, "direct_answers": ["rain", "rain", "rainy", "rain", "rain", "rainy", "rain", "rainy", "rainy", "rain"], "difficult_direct_answer": false, "rationales": ["The woman is under an umbrella and their is water around her which points to rain coming down.", "The weather is rainy.", "The woman is holding an umbrella with wet spots visible in the background which would be consistent with answer a."], "image": "train2014/COCO_train2014_000000562348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38274, "question_id": "SD8GBVeXsY6U6KtGezXoTN", "question": "What kind of instrument is in the case?", "choices": ["stringed", "percussion", "woodwind", "brass"], "correct_choice_idx": 0, "direct_answers": ["guitar", "bass", "stringed", "guitar", "guitar", "guitar", "bass", "guitar", "guitar", "guitar"], "difficult_direct_answer": false, "rationales": ["This is the normal shape for an a instrument.", "The case is in the shape of a guitar.", "An instrument that is used in a lot of country music songs. people who play this instrument will often carry it around with them."], "image": "train2014/COCO_train2014_000000038274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60647, "question_id": "SDCUdu2HavUPvaE623HX3R", "question": "What is the person with the black helmet running towards?", "choices": ["home plate", "motorcycle", "circus cannon", "brick wall"], "correct_choice_idx": 0, "direct_answers": ["home plate", "home base", "home base", "home plate", "home plate", "home plate", "home plate", "home plate", "home base", "home plate"], "difficult_direct_answer": false, "rationales": ["It is shaped like a house", "The plate is shaped like a house and the umpire is there", "The person wants to move toward home plate to score a run."], "image": "val2014/COCO_val2014_000000060647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169576, "question_id": "SDNbcfNH8R9xDrsaHn5YE8", "question": "What status is the person the people are waiting for?", "choices": ["non existent", "medium", "high", "low"], "correct_choice_idx": 2, "direct_answers": ["famous", "high status", "high", "famous", "high", "unknown", "important", "high status", "royal", "late"], "difficult_direct_answer": false, "rationales": ["There are crowds of people lining the roadway.", "The status is high.", "People only usually congregate to see someone famous or important."], "image": "train2014/COCO_train2014_000000169576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477911, "question_id": "SEUqTggRgWzspzYyCA2Tin", "question": "What type of bed is next to the curtain?", "choices": ["cot", "queen", "king", "foldout"], "correct_choice_idx": 0, "direct_answers": ["cot", "cot", "cot", "pull out", "cot", "cot", "pull out", "cot", "cot", "cot"], "difficult_direct_answer": false, "rationales": ["It has folding leg supports and is low to the ground", "The bed next to the curtain is a small folding bed known as a cot.", "A simple cot is shown."], "image": "train2014/COCO_train2014_000000477911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320292, "question_id": "SEemDtnxTa4AorA6unUEmm", "question": "What would you call the metal poles connecting to the front wheel?", "choices": ["spoon", "fork", "knife", "fender"], "correct_choice_idx": 1, "direct_answers": ["risers", "beads", "spikes", "spokes", "fork", "telescopic fork", "telescopic fork", "forks", "unsure", "spokes"], "difficult_direct_answer": false, "rationales": ["This looks like tines on this utensil", "The object in question is a motorcycle and the piece that is visible and connects the body of the motorcycle to the tires is called a fork.", "The front wheel of the motorcycle does not have a fender. motorcycles do not have spoons or knives."], "image": "train2014/COCO_train2014_000000320292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269551, "question_id": "SF7tXo5dfds4TWQvRLWNPD", "question": "The color of the man's shirt matches the color of what?", "choices": ["bark", "pineapple", "cherry blossom", "banana"], "correct_choice_idx": 0, "direct_answers": ["sand", "table", "table", "bark", "chair", "chair", "dirt", "dirt", "chair", "desk"], "difficult_direct_answer": false, "rationales": ["The man's shirt is brown.", "It is brown like a trunk", "His t-shirt is also brown."], "image": "train2014/COCO_train2014_000000269551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354515, "question_id": "SFLbYyRUNj5oxxcysxcQon", "question": "What type of dish would this be categorized under?", "choices": ["vegetarian", "seafood", "chicken", "pork"], "correct_choice_idx": 1, "direct_answers": ["pasta", "seafood", "seafood", "seafood", "seafood", "pasta", "pasta", "seafood pasta", "seafood", "pasta"], "difficult_direct_answer": false, "rationales": ["Shrimp comes from the ocean.", "Shrimp is a type of seafood.", "Shrimp is seafood."], "image": "val2014/COCO_val2014_000000354515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434412, "question_id": "SFMdK6EoWT8zoQ6UNXEi7f", "question": "What does the design on the box that the monkey is sitting on look like?", "choices": ["dice", "lips", "cabbage", "stripes"], "correct_choice_idx": 0, "direct_answers": ["polka dots", "dice", "polkadot", "polka dots", "dell", "dice", "polka dots", "polka dots", "poke dots", "polka dots"], "difficult_direct_answer": false, "rationales": ["The box has the black dots that one would see on the items.", "A white box with black polka dots is under a stuffed monkey.", "Dice have black spots."], "image": "val2014/COCO_val2014_000000434412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454336, "question_id": "SFMvehgCSfH8tVBwHUfZTV", "question": "What is the object above the right of the sink used to dispense?", "choices": ["napkins", "towels", "soap", "condoms"], "correct_choice_idx": 2, "direct_answers": ["soap", "soap", "soap", "soap", "soap", "soap", "soap", "soap", "soap", "soap"], "difficult_direct_answer": false, "rationales": ["There is a closed off dispenser which is located just right of the sink. due to this proximity it is a soap dispenser.", "It is used to hold the bathing soap.", "The object on the right has soap in it."], "image": "train2014/COCO_train2014_000000454336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409208, "question_id": "SFjSdB5PpAjg34FQupmCkS", "question": "Who is probably feeling the most cold?", "choices": ["green pants", "yellow shirt", "black pants", "red jacket"], "correct_choice_idx": 0, "direct_answers": ["shirtless skier", "shirtless skier", "man", "skiier", "skier", "green pants", "coat less", "skier", "shirtless guy", "skier"], "difficult_direct_answer": false, "rationales": ["The person in the green pants has no shirt.", "He does not have a shirt on in the cold.", "The person is green pants is not wearing any shirt or jacket to protect themselves from the cold."], "image": "train2014/COCO_train2014_000000409208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51851, "question_id": "SG3eFzRt69UJQKVskKeTZg", "question": "What do people usually do in this room?", "choices": ["eat", "sleep", "cook", "wash"], "correct_choice_idx": 3, "direct_answers": ["shower", "relieve themselves", "clean themselves", "restroom", "hygiene", "wash", "pee poop", "wash up", "bpee", "bathroom"], "difficult_direct_answer": true, "rationales": ["There are several rows of sinks in the room.", "They can wash up in theis restroom.", "This is a bathroom, not a kitchen, bedroom, or dining room."], "image": "train2014/COCO_train2014_000000051851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244666, "question_id": "SGHyUTuTVAjv8v9CThfCXR", "question": "Why are these containers arranged this way?", "choices": ["for sale", "hiding", "easy access", "esthetics"], "correct_choice_idx": 3, "direct_answers": ["safety", "purpose", "esthetics", "smallest first", "stylish", "art", "organization", "color", "organization", "organization"], "difficult_direct_answer": false, "rationales": ["The containers are arranged for decoration.", "The containers are pretty.", "These containers are arranged to look nice."], "image": "train2014/COCO_train2014_000000244666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243443, "question_id": "SGLCwRZqtncqW7GPBt8smx", "question": "What kind of location are the bikes parked in?", "choices": ["street", "park", "gas station", "home"], "correct_choice_idx": 2, "direct_answers": ["parking lot", "parking lot", "forward", "parking lot", "parking lot", "forward", "gas station", "parking lot", "forward", "parking lot"], "difficult_direct_answer": false, "rationales": ["The prices for gas are displayed on the sign.", "There are gas prices shown.", "As indicated by the yellow sign near the top right corner."], "image": "train2014/COCO_train2014_000000243443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491736, "question_id": "SGamShKVETAzsAZHdZMkCR", "question": "What can the type of material that's being dragged be used to make?", "choices": ["metal blade", "log cabin", "glass bowl", "plastic toy"], "correct_choice_idx": 1, "direct_answers": ["table", "log cabin", "wood", "wood", "house", "house", "fence", "houses", "houses", "furniture"], "difficult_direct_answer": false, "rationales": ["They are pulling logs so something can be built.", "People use logs to make a house.", "It is part of a tree"], "image": "train2014/COCO_train2014_000000491736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272807, "question_id": "SGb9FvrnuH7thZeHcQ6FKE", "question": "What types of bikes are these?", "choices": ["electric", "cruiser", "children's", "mountain"], "correct_choice_idx": 1, "direct_answers": ["four", "polilate", "motorcycle", "motorcycle", "motorcycle", "royal enfield", "cruiser", "motorcycles", "motorcycles", "sports game"], "difficult_direct_answer": false, "rationales": ["The bikes are motorcycles.", "It's another name for motorcycle.", "The bikes are cruisers."], "image": "train2014/COCO_train2014_000000272807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263355, "question_id": "SGxHLzg2HLQYiT4V3cWhqJ", "question": "What action are these people taking?", "choices": ["descending", "rolling", "running", "ascending"], "correct_choice_idx": 0, "direct_answers": ["skiing", "descending", "skiing", "skiing", "downhill skiboot", "skiing", "skiing downhill", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["The skiers are going downhill on their skis.", "The action is going down.", "These people are moving downhill."], "image": "val2014/COCO_val2014_000000263355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193166, "question_id": "SHA4LLgHw6NQbxbfKZJeR3", "question": "How many hours would a person legally be allowed to park here?", "choices": ["seven", "nine", "eight", "five"], "correct_choice_idx": 3, "direct_answers": ["five", "five", "five", "five", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["According to the sign, this is the only amount of time.", "There is a sign that says \"5 hours\".", "There is a large five on the meter."], "image": "train2014/COCO_train2014_000000193166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255096, "question_id": "SHKLDRBZqhg6J2nGLE68n3", "question": "Where do this plans go?", "choices": ["dubai", "america", "qatar", "china"], "correct_choice_idx": 2, "direct_answers": ["qatar", "australia", "qatar", "middle east", "qatar", "qatar", "in air", "qatar", "abroad", "air"], "difficult_direct_answer": false, "rationales": ["The name of the company, qutar airways, can be partially seen from the window.", "The name starts with a q on the plane", "The q on the plane is an abbreviation for the name of the country"], "image": "val2014/COCO_val2014_000000255096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534336, "question_id": "SHs57N7mQo4rUvPeFqtHnz", "question": "Who is the shepherd?", "choices": ["sheep", "dog", "child", "man"], "correct_choice_idx": 3, "direct_answers": ["man", "adult", "man", "man", "man", "boy", "man", "father", "man", "man"], "difficult_direct_answer": false, "rationales": ["The man is and he is carrying a shepherds staff with him", "The shepherd is the guy with the stick.", "The man walking is."], "image": "val2014/COCO_val2014_000000534336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118852, "question_id": "SHuHVGESF2cGFzzv2PMixD", "question": "What animal pattern is the two piece kite using?", "choices": ["cow", "jaguar", "zebra", "leopard"], "correct_choice_idx": 0, "direct_answers": ["cow", "fish", "cow", "cow", "cow", "fish", "fish", "cow", "cow", "cow"], "difficult_direct_answer": false, "rationales": ["That black and white blotchy pattern is often seen on bovine animals.", "Cows are traditionally portrayed with the pattern shown on the kite, the white with black splotches.", "They are black and white."], "image": "val2014/COCO_val2014_000000118852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186060, "question_id": "SHzhSzbiUzajSpSQyuePiU", "question": "What is the best material for a tennis racquet?", "choices": ["nylon", "wood", "lithium", "graphite"], "correct_choice_idx": 3, "direct_answers": ["carbon fiber", "aluminum", "tennis", "aluminum", "graphite", "aluminum", "fiberglass", "nylon", "synthetics", "graphite"], "difficult_direct_answer": false, "rationales": ["Graphite is very light.", "A man is playing tennis on a clay court.", "Most current tennis rackets are made of graphite."], "image": "val2014/COCO_val2014_000000186060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117555, "question_id": "SJ6CEsrigpzAMYhEsjMv4A", "question": "What type of task is the woman working on?", "choices": ["laundry", "paperwork", "mechanical", "culinary"], "correct_choice_idx": 0, "direct_answers": ["ironing", "ironing", "making bed", "housekeeping", "sewing", "ironing", "making bed", "laundry", "laundry", "making bed"], "difficult_direct_answer": false, "rationales": ["The woman is putting a pillow case on.", "There is a shirt on the bed. the woman is pulling at the shirt.", "The task is laundry."], "image": "train2014/COCO_train2014_000000117555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218470, "question_id": "SJmPCRKvEa6kXmGKQbY32b", "question": "What is the worker in the bucket crane examining?", "choices": ["traffic light", "surveillance camera", "streetlamp", "electrical transformer"], "correct_choice_idx": 2, "direct_answers": ["traffic lights", "light pole", "streetlight", "light", "light post", "street lamp", "light", "streetlamp", "street light", "street light"], "difficult_direct_answer": false, "rationales": ["It is a large pole with a light on it", "The worker is for the streetlamp.", "He is probably changing the light bulb."], "image": "val2014/COCO_val2014_000000218470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250991, "question_id": "SJw4dwzhXNDLGa6xARVSA5", "question": "Where is the man on the bicycle possibly going?", "choices": ["work", "school", "gym", "wedding"], "correct_choice_idx": 0, "direct_answers": ["store", "work", "work", "left", "work", "work", "home", "home", "work", "to work"], "difficult_direct_answer": false, "rationales": ["The man is wearing a suit.", "He looks like he is dressed for work and that is where he could be going.", "The man is working."], "image": "train2014/COCO_train2014_000000250991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219958, "question_id": "SKmm3LWtsFNr4hseYA6QLJ", "question": "If the man eats what is around his head what vitamin will he get?", "choices": ["vitamin c", "vitamin w", "vitamin d", "vitamin r"], "correct_choice_idx": 0, "direct_answers": ["vitamin c", "be", "potassium", "vitamin c", "be", "potassium", "vitamin c", "pottasium", "potassium", "potassium"], "difficult_direct_answer": false, "rationales": ["Bananas are known to contain vitamin c.", "They have a little of this vitamin", "The man is wearing bananas around his head which have vitamin c."], "image": "train2014/COCO_train2014_000000219958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126126, "question_id": "SL8fnchqfPuNoWB95KgnLK", "question": "What is the man in the yellow beanie doing?", "choices": ["flipping", "grinding", "filming", "falling"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "filming skateboarders", "filming", "taking video", "skate boarding", "filming skateboarders", "skating", "filming", "recording video", "skate boarding"], "difficult_direct_answer": false, "rationales": ["He is holding a camera.", "A man is crouched over holding a device as it records another man doing tricks.", "The person on the beanie is holding a camera in his right hand."], "image": "train2014/COCO_train2014_000000126126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201018, "question_id": "SLCZE4sYoNxUFVrB8YopXx", "question": "The speed range of para gliders is typically what?", "choices": ["22-37 mph", "52-67 mph", "12-47 mph", "74-80 mph"], "correct_choice_idx": 2, "direct_answers": ["20-75 km/h", "12-47 mph", "fast", "35 mph", "no idea", "slow", "twelve mph", "ten mph", "12-47 mph", "windspeed"], "difficult_direct_answer": true, "rationales": ["The paragliders are going quickly but they're not whizzing.", "Paragliders cannot go that fast.", "That's their normal speed."], "image": "train2014/COCO_train2014_000000201018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500568, "question_id": "SLJ9T62zT8GBSdbTb2UqtJ", "question": "Why can't they watch the television?", "choices": ["no electricity", "old television", "broken television", "solar glare"], "correct_choice_idx": 0, "direct_answers": ["no electricity", "no power", "no electricity", "no power", "no electricity", "no electricity", "no power", "no electricity", "no power", "no power"], "difficult_direct_answer": false, "rationales": ["There are no outlets outside.", "The television is outside. it is not connected to anything.", "These appliances run on electricity."], "image": "val2014/COCO_val2014_000000500568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452031, "question_id": "SLMj7cxMACfkurfjmMa6hC", "question": "The current season is what?", "choices": ["fall", "summer", "spring", "winter"], "correct_choice_idx": 1, "direct_answers": ["spring", "summer", "spring", "summer", "summer", "spring", "summer", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["The sun is out and the leaves on the trees are green.", "The foliage and grass is in full bloom, as it would be in summertime.", "The person is wearing shorts and a tee shirt, the leaves are on the trees, and there are bicycles ready for riding. therefore, i chose the option with warm weather and fully bloomed trees."], "image": "train2014/COCO_train2014_000000452031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486491, "question_id": "SLYMQYsiiDhnfdVJFVtDuG", "question": "What are they looking at?", "choices": ["clock", "fans", "batter", "others"], "correct_choice_idx": 2, "direct_answers": ["batter", "batter", "batter", "batter", "batter", "batter", "batter", "ball", "home plate", "homeplate batter"], "difficult_direct_answer": false, "rationales": ["They are all looking toward the batter.", "Based on their positions in the baseball field these players are playing defense and it is their job to react to the batter. the pitcher, standing on the pitching mound, is in the action of pitching and must be looking in the direction of the batter.", "They are watching the player that is going to hit the ball to see what he does."], "image": "val2014/COCO_val2014_000000486491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277491, "question_id": "SLbSYmvY5QT7SKhGcwkF6i", "question": "What type of room are the people in?", "choices": ["lecture hall", "hallway", "gymnasium", "garage"], "correct_choice_idx": 2, "direct_answers": ["gymnasium", "gym", "gym", "gym", "gym", "gymnasium", "gymnasium", "gymnasium", "gym", "gymnasium"], "difficult_direct_answer": false, "rationales": ["There are basketball hoops in the room.", "The people are in a gym.", "The room is a gym."], "image": "train2014/COCO_train2014_000000277491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344578, "question_id": "SLs9WdiW4BDZWjyG5ewMpS", "question": "What language is the banner on the bus written in?", "choices": ["mexican", "egyptian", "asian", "french"], "correct_choice_idx": 2, "direct_answers": ["asian", "japanese", "chinese", "japanese", "japanese", "japanese", "japanese", "chinese", "chinese", "chinese"], "difficult_direct_answer": false, "rationales": ["While answer a is not a language, based on the structure of the letters visible on the sides, they are of a language from the continent of asia where languages have script that look similar to this.", "It is a type of east asian script. egyptians use arabic, while mexican spanish and french use scripts similar to english.", "The language is in asian characters."], "image": "train2014/COCO_train2014_000000344578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312233, "question_id": "SM7kZSKaKUzakzouUFtfbf", "question": "What type of flooring is shown?", "choices": ["tile", "hardwood", "concrete", "carpet"], "correct_choice_idx": 1, "direct_answers": ["hardwood", "hardwood", "hard wood", "wood", "hardwood", "wood", "wood", "wooden", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["Hardwood flooring is shown.", "Brown flooring with wood grain is under a man kneeling by a cat.", "It is common type of wood flooring."], "image": "train2014/COCO_train2014_000000312233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575222, "question_id": "SM8C4W2dhNqsUHiTX6TfNv", "question": "What is on the front of the train?", "choices": ["bear", "clown", "elephant", "zebra"], "correct_choice_idx": 1, "direct_answers": ["face", "smiley face", "clown face", "face", "clown", "clown", "clown", "clown face", "clown", "clown"], "difficult_direct_answer": false, "rationales": ["There is a clown face on the front of the train.", "A face with white cheeks and red lips is painted on a train.", "A cartoon clown face is on the train's front."], "image": "val2014/COCO_val2014_000000575222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338132, "question_id": "SM8TkkJqNbmHb4z6kLzg7U", "question": "What brass object sits in the tower?", "choices": ["statue", "bells", "cross", "clock"], "correct_choice_idx": 1, "direct_answers": ["bells", "bell", "bell", "bell", "cross", "cross", "bell", "bell", "bell", "bell"], "difficult_direct_answer": false, "rationales": ["Although it could be one instead of several. in game of thrones, the wildfire caused the bell to crash into the street.", "Usually a church tower has bells in them.", "They are brass, have a dong in the middle and make a noise when rung."], "image": "train2014/COCO_train2014_000000338132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255674, "question_id": "SMGJ7cYppjzYZ8aDGp748K", "question": "What's the name of the large white building in the background?", "choices": ["hotel", "lodge", "terminal", "casino"], "correct_choice_idx": 1, "direct_answers": ["ski lodge", "lodge", "lodge", "ski lodge", "ski lodge", "chalet", "resort", "lodge", "cabin", "lodge"], "difficult_direct_answer": false, "rationales": ["A person is on a snowy hill skiing with a large building in the background.", "Every ski resort has this place for people to go warm up.", "The building is a lodge."], "image": "train2014/COCO_train2014_000000255674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407582, "question_id": "SMZg73Z437ZVVDWGRmN3DV", "question": "What is the same color as the umbrella?", "choices": ["watermelon", "cherry", "banana", "orange"], "correct_choice_idx": 2, "direct_answers": ["yellow", "lemons", "sun", "banana", "sun", "sun", "building", "yellow", "sun", "banana"], "difficult_direct_answer": false, "rationales": ["Bananas are yellow as is the umbrella.", "A woman carries a yellow umbrella.", "They are both yellow"], "image": "train2014/COCO_train2014_000000407582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48204, "question_id": "SMsfEG5fM4dMpdvXyM3Fck", "question": "What game might be played here by these two?", "choices": ["tidley winks", "ultimate frisbee", "football", "jenga"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "frisbee", "ultimate frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The man is holding a white disc that can be thrown far in the air.", "Ultimate frisbee is played with a disc that you throw.", "They are outside and are not playing an inside game, like tidley winks or jenga. the man is holding a flying disc, not a ball."], "image": "train2014/COCO_train2014_000000048204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258289, "question_id": "SP5UX48JEwdDGwkR8bNKN4", "question": "Which is not a fur color of one of the animals?", "choices": ["grey", "red", "brown", "black"], "correct_choice_idx": 1, "direct_answers": ["orange", "brown", "green", "brown", "blue", "red", "gray", "yellow", "red", "purple"], "difficult_direct_answer": false, "rationales": ["There are brown, grey, and black animals.", "There are brown, black, grey, and white animals shown in this picture.", "The animals all have neutral and earthy colors."], "image": "train2014/COCO_train2014_000000258289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380516, "question_id": "SP73Lzc6YPo7QZWaAYGr6k", "question": "How many decades must pass before he can play professionally?", "choices": ["one", "three", "five", "two"], "correct_choice_idx": 0, "direct_answers": ["one", "two", "two", "one", "one", "two", "one", "one", "one", "two"], "difficult_direct_answer": false, "rationales": ["14 is the age that one can play professionally. he looks about four years old and a decade is 10 years.", "Professional players have their ages in the double digits, where this child is still in his single digit age group.", "The boy only needs to be a teen to go pro."], "image": "val2014/COCO_val2014_000000380516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75591, "question_id": "SP7FTCsGhrwghawRTHKWoa", "question": "What air quality problem occurs in this bedroom?", "choices": ["mold", "pollen", "low humidity", "high humidity"], "correct_choice_idx": 2, "direct_answers": ["low humidity", "cat odor", "pet allergies", "cat fur", "low humidity", "dander", "dusty", "fur", "animal hair", "pet dander"], "difficult_direct_answer": true, "rationales": ["The problem is low humidity.", "There is a humidifier on in the background.", "There is a humidifier on the table"], "image": "val2014/COCO_val2014_000000075591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138347, "question_id": "SP8FEEQ8rAbVox8P9AiBAw", "question": "Where can you reasonably go to the bathroom here?", "choices": ["behind tree", "male restroom", "outhouse", "female restroom"], "correct_choice_idx": 2, "direct_answers": ["outhouse", "portapotty", "portable potty", "outhouse", "outhouse", "back", "portable toilet", "port-a-potty", "in potty", "porta potty"], "difficult_direct_answer": false, "rationales": ["A blue outhouse is seen in an open area with a parking lot behind. outhouses are used as bathrooms at outdoor venues.", "The very tiny blue building is a bathroom", "In far distance is a blue receptacle. it is the only place because its outside to go bathroom."], "image": "train2014/COCO_train2014_000000138347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75322, "question_id": "SPEtaDvuD7Hh3m5rRNaBHJ", "question": "On what does the child focus here?", "choices": ["words", "dots", "white space", "music"], "correct_choice_idx": 1, "direct_answers": ["reading", "dots", "book", "colors", "book", "reading", "book", "book", "book", "book"], "difficult_direct_answer": false, "rationales": ["The child is looking at multi-colored circles on the paper in front of him.", "The child is looking at the dots.", "The child looks at dots."], "image": "train2014/COCO_train2014_000000075322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278078, "question_id": "SPeNson5ewzKekULqW8X86", "question": "The display is part of the storefront of which store?", "choices": ["walmart", "build-a-bear", "target", "toysrus"], "correct_choice_idx": 1, "direct_answers": ["toy", "toy", "toy", "mall", "toy store", "build-a-bear", "toy", "macy's", "teddy", "toy store"], "difficult_direct_answer": false, "rationales": ["The make bears.", "It is the images around that proves.", "That shop serves plush bears."], "image": "train2014/COCO_train2014_000000278078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273855, "question_id": "SPmoCLd8PoCupcUNBaZAic", "question": "What product does the young diner run out of here?", "choices": ["vinegar", "orange juice", "milk", "water"], "correct_choice_idx": 2, "direct_answers": ["milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["There is no milk left.", "The young person at the table is eating cereal but the milk container is empty.", "The jug is empty and he does not have any for his cereal."], "image": "val2014/COCO_val2014_000000273855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270071, "question_id": "SQ7W56FySD3bwcEXiU8Hz3", "question": "What is a natural danger here?", "choices": ["tigers", "sharks", "bats", "wasps"], "correct_choice_idx": 1, "direct_answers": ["sharks", "tsunami", "sharks", "waves", "sharks", "drowning", "drowning", "tides", "ocean", "sharks"], "difficult_direct_answer": false, "rationales": ["This is their habitat", "A natural danger in the ocean would be sharks.", "There is an ocean, which is a habitat of sharks."], "image": "train2014/COCO_train2014_000000270071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142646, "question_id": "SQABNTRssb2VqZt8eWK8im", "question": "Where is this elephant located?", "choices": ["circus", "water", "zoo", "forest"], "correct_choice_idx": 1, "direct_answers": ["water", "river", "water", "river", "water", "river", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The elephant is getting a bath.", "The elephant is laying down in the water.", "An elephant is laying in murky water as others look on."], "image": "train2014/COCO_train2014_000000142646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250645, "question_id": "SQB2MhH6ZQNhyTQWA86fXC", "question": "Who has right of way here?", "choices": ["dogs", "bus", "taxi", "pedestrians"], "correct_choice_idx": 3, "direct_answers": ["horse", "horse", "people", "police", "police", "pedestrians", "horse", "people", "pedestrians", "pedestrian"], "difficult_direct_answer": false, "rationales": ["There are visible crosswalks in the image that people are walking in. when walking in a crosswalk, people are given the right of way.", "The people crossing the road have the right of way.", "The lines are marked for people to walk."], "image": "train2014/COCO_train2014_000000250645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528713, "question_id": "SRRJuGc8fhTLF6ou9BJ723", "question": "What event is going to take place?", "choices": ["horse parade", "horse racing", "horse trading", "horse show"], "correct_choice_idx": 1, "direct_answers": ["race", "horse race", "horse racing", "horse race", "parade", "horse racing", "race", "horse race", "horse race", "horse racing"], "difficult_direct_answer": false, "rationales": ["There is a horse and jockey clearly visible. jockeys ride horses in horse racing events.", "The horse is getting ready to compete.", "The event is horse racing."], "image": "train2014/COCO_train2014_000000528713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532398, "question_id": "SRWkL6rHtPSZK55eZJjQfa", "question": "What kind of kite it is?", "choices": ["delta", "flat", "box", "bow"], "correct_choice_idx": 3, "direct_answers": ["colorful", "colorful stingray", "parasail", "delta kite", "delta", "bow", "triangular", "flying kite", "colorful", "glider"], "difficult_direct_answer": true, "rationales": ["You can tell by the kites design as to what type it is.", "The kite is in a bow shape.", "The kite is bow as it have shape like bow as evident in the picture."], "image": "train2014/COCO_train2014_000000532398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159421, "question_id": "SRWyLMDVfxDau6VyvqcZGy", "question": "What is the most popular conveyance in this part of town?", "choices": ["bike", "car", "bus", "motorcycle"], "correct_choice_idx": 3, "direct_answers": ["motorcycle", "bike", "chicken", "awning", "scooter", "motorcycles", "motorcycle", "food stands", "scooters", "motorbike"], "difficult_direct_answer": true, "rationales": ["The street is filled with them.", "Most of the vehicles have two, not four, wheels. they are powered by engines.", "Everyone is riding a motor bike."], "image": "train2014/COCO_train2014_000000159421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407291, "question_id": "SRZgMhayoWkxtuVX2okVYS", "question": "What does the W S stand for?", "choices": ["washington/state", "white/scarlet", "world/sport", "western/southern"], "correct_choice_idx": 3, "direct_answers": ["tennis", "watersports", "western southern", "western/southern", "western southern", "western southern", "western southern", "world series", "tennis", "wilson"], "difficult_direct_answer": false, "rationales": ["The ws is for directions.", "Was stands for the west and south.", "The ws standards for a region."], "image": "val2014/COCO_val2014_000000407291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380351, "question_id": "SRt7KJcMM2swrfT45HjfoD", "question": "What are the men doing on the field?", "choices": ["signing autographs", "competing", "practicing", "exercising"], "correct_choice_idx": 2, "direct_answers": ["practicing", "soccer practice", "warmup drill", "warming up", "warming up", "warming up", "warming up", "warming up", "dribbling", "playing soccer"], "difficult_direct_answer": false, "rationales": ["Since there is only one ball on a field during a game, having this many could only mean the players are trying out different tactics to use in games.", "The men practice.", "Each of the players has a ball to practice with."], "image": "train2014/COCO_train2014_000000380351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150372, "question_id": "SS5Wb5k7tZPyvRGLNCkV47", "question": "What is the man to the right trying to do on top of his bike?", "choices": ["ride", "smoke", "sleep", "talk"], "correct_choice_idx": 2, "direct_answers": ["relax", "smoke", "sleep", "rest", "sleep", "sleep", "sleep", "sleep", "sleeping", "sleep"], "difficult_direct_answer": false, "rationales": ["The man wants to snooze.", "He is laying back resting for a bit.", "A man is leaning back on a motorcycle with his eyes closed."], "image": "train2014/COCO_train2014_000000150372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230169, "question_id": "SS8t66sFZyFEdLqt32vyMM", "question": "Where is the occupier of the left chair seen here?", "choices": ["sitting", "at work", "taking photo", "at home"], "correct_choice_idx": 2, "direct_answers": ["beach", "no where", "beach", "swimming", "taking photo", "taking picture", "swimming", "gone", "nowhere", "behind camera"], "difficult_direct_answer": false, "rationales": ["There is no one sitting on the left chair so they must be somewhere else. only viable option is that they are the one taking this photo.", "The person is taking the photo.", "The occupier takes a photo."], "image": "train2014/COCO_train2014_000000230169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222244, "question_id": "SSCNSSb5QXmao37YRM8GWJ", "question": "What is used to give the cake its brown color?", "choices": ["cocoa powder", "chocolate syrup", "food coloring", "brown sugar"], "correct_choice_idx": 0, "direct_answers": ["chocolate", "cocoa powder", "cocoa", "chocolate", "chocolate", "cocoa powder", "chocolate", "chocolate", "chocolate", "chocolate"], "difficult_direct_answer": false, "rationales": ["This is a chocolate cake so it has cocoa powder.", "There is coca powder used to make the cake brown.", "The cocoa gives the color."], "image": "train2014/COCO_train2014_000000222244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310134, "question_id": "SSDQv4PXGDsSqdBRTd37A7", "question": "What gives the square items on the couch their shape?", "choices": ["stuffing", "glass", "plastic", "styrofoam"], "correct_choice_idx": 0, "direct_answers": ["cupboard", "stuffing", "filling", "legs", "foam", "stuffing", "fabric fill", "stuffing", "padding", "cushions"], "difficult_direct_answer": false, "rationales": ["That's how they make pillows.", "The chairs are full of stuffing.", "Pillows normally would be flat if not inserted with stuffing of some sort."], "image": "train2014/COCO_train2014_000000310134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544893, "question_id": "SSXWDSAzZv5kUfF8tJXF4P", "question": "In dollars how much does it cost to park a bike here?", "choices": ["$1", "$15", "$4", "$0"], "correct_choice_idx": 3, "direct_answers": ["zero", "zero", "zero", "zero", "zero", "zero", "zero", "$0", "zero", "zero"], "difficult_direct_answer": false, "rationales": ["The sign indicates free parking.", "The sign says parking is free.", "This is when no money is to be exchanged for a service or product."], "image": "train2014/COCO_train2014_000000544893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390923, "question_id": "SScvwqHPcpFrh5BFqvvJuJ", "question": "What is the filling made of?", "choices": ["ice cream", "frosting", "fruit", "custard"], "correct_choice_idx": 2, "direct_answers": ["fruit", "raspberry", "jam", "jam", "fruit", "jelly", "strawberry", "raspberry", "raspberry", "jam"], "difficult_direct_answer": false, "rationales": ["Fruit pieces can be seen on top of the cake.", "It looks to be made of fruit.", "The cake has filling that is made of a deep red fruit."], "image": "train2014/COCO_train2014_000000390923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233815, "question_id": "SSd67RxuWZ8DE8JuZNtn2Z", "question": "What country is depicted in the photo?", "choices": ["india", "sumatra", "borneo", "inodnesia"], "correct_choice_idx": 1, "direct_answers": ["malaysia", "sumatra", "indonesia", "vietnam", "thailand", "india", "india", "china", "india", "vietnam"], "difficult_direct_answer": false, "rationales": ["Sumatra has elephants.", "The people look asian and indonesian. indonesia is known for having elephant tours.", "You can tell by the peoples ethnicity and the setting, as to where they probably are."], "image": "val2014/COCO_val2014_000000233815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384422, "question_id": "SSv99tJjLXDmNSWZW9Kbtr", "question": "What is the name of the single light on the front of the motorcycle?", "choices": ["warning light", "signal light", "headlight", "spotlight"], "correct_choice_idx": 2, "direct_answers": ["headlight", "headlight", "headlight", "headlight", "headlight", "headlight", "headlight", "headlight", "headlight", "headlight"], "difficult_direct_answer": false, "rationales": ["The name is a headlight.", "The front light is the headlight.", "The light is mounted on the front of the vehicle and is used to illuminate the road ahead. that is where headlights are placed."], "image": "train2014/COCO_train2014_000000384422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525823, "question_id": "STH9vFwAEWEp6mP48iBjm9", "question": "What potential hazard might occur?", "choices": ["vomiting", "choking", "drowning", "dancing"], "correct_choice_idx": 1, "direct_answers": ["choking", "choking", "choking", "choking", "choking", "choking", "choking", "choking", "choking", "choking"], "difficult_direct_answer": false, "rationales": ["The girl appears to be putting a large food into her mouth that would be too large to swallow at once. if consumed in this manner without taking smaller bites, answer a could be likely.", "The girl might be choking after she swallows the food.", "The kid could choke since the food is big."], "image": "train2014/COCO_train2014_000000525823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315719, "question_id": "STTyK6nSCnNcaabY22F8DD", "question": "What type of crossing is this?", "choices": ["pedestrian", "duck", "school", "train"], "correct_choice_idx": 0, "direct_answers": ["pedestrian", "crosswalk", "crosswalk", "bike crossing", "pedestrian", "crosswalk", "pedestrian", "crosswalk", "crosswalk", "zebra crossings"], "difficult_direct_answer": false, "rationales": ["The crossing lines up with the sidewalk, which is used by pedestrians.", "The lines in the road indicate the pedestrian crossing where people can walk across the street.", "This crossing is demarcated by white paint in a distinctive layout that indicates it is intended for walkers to cross here."], "image": "train2014/COCO_train2014_000000315719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408806, "question_id": "STqABMs2XZWdfBpJj49ENZ", "question": "Where were the first free public benches invented?", "choices": ["spain", "morocco", "america", "france"], "correct_choice_idx": 3, "direct_answers": ["france", "paris france", "paris", "paris", "1850", "america", "grands boulevards", "paris", "rome", "united states"], "difficult_direct_answer": false, "rationales": ["France came out with free public benches.", "The first public benches were invented in france.", "A guy sits on a bench in a public area."], "image": "train2014/COCO_train2014_000000408806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3538, "question_id": "STqi6k6nT2rJ39hHsUThh5", "question": "What is the person in the jacket holding?", "choices": ["basket", "kittens", "eggs", "skis"], "correct_choice_idx": 3, "direct_answers": ["stick", "stick", "ski poles", "poles", "ski poles", "ski poles", "ski poles", "skis", "ski poles", "ski poles"], "difficult_direct_answer": false, "rationales": ["The person has skis.", "The person is skiing and is on the slope.", "The person is on a snowy mountain and is skiing."], "image": "train2014/COCO_train2014_000000003538.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157465, "question_id": "STvNXsbHo5BJjYDuv5MbQs", "question": "What type of vehicles are shown?", "choices": ["train", "convertible", "helicopter", "bus"], "correct_choice_idx": 3, "direct_answers": ["buses", "buses", "buses", "buses", "busses", "bus", "bus", "busses", "bus", "transit buses"], "difficult_direct_answer": false, "rationales": ["They are land vehicles with covered tops that are designed to use roads, not tracks.", "These are older style busses.", "The other options aren't in this image."], "image": "val2014/COCO_val2014_000000157465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55074, "question_id": "SU8gAHthLcXBAhK6hePXsN", "question": "What will the batter do now?", "choices": ["call ball", "quit", "strike", "turn around"], "correct_choice_idx": 2, "direct_answers": ["hit ball", "hit ball", "hit ball", "run", "swing", "hit ball", "hit baseball", "run", "hit ball", "strike"], "difficult_direct_answer": false, "rationales": ["The batter is in the process of swinging the bat.", "The batter will miss the ball.", "The batter will strike."], "image": "val2014/COCO_val2014_000000055074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366753, "question_id": "SUK3UEEHoAxZsAL5dtdmyy", "question": "What is the girl using the poles to do?", "choices": ["stand up", "climb", "dig", "reach"], "correct_choice_idx": 0, "direct_answers": ["ski", "ski", "balance", "balance", "balance", "skii", "balance", "ski", "balance", "stand up"], "difficult_direct_answer": false, "rationales": ["The girl is heavily putting her weight on the poles, and her legs aren't positioned to support her.", "The girl wants to stand.", "Poles are used for balance and stability in skiing, and she is standing still on her skis. it looks like she kind of doesn't want to lose control, so she went 'pizza formation' with her skis and planted her poles, so that doesn't happen down the hill."], "image": "train2014/COCO_train2014_000000366753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564095, "question_id": "SUNy3SKX8cjnwxi8N7Yott", "question": "What does the person facing seaward await?", "choices": ["uber", "fish", "boat", "huge waves"], "correct_choice_idx": 3, "direct_answers": ["waves", "wave", "waves", "huge waves", "wave", "waves", "waves", "waves", "waves", "waves"], "difficult_direct_answer": false, "rationales": ["A person is standing near the water with a surfboard.", "The person is looking for waves to surf on.", "They are looking to see when the water is right to surf."], "image": "val2014/COCO_val2014_000000564095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225686, "question_id": "SUWTdcEkUVB2iQyPGfWZHB", "question": "What is the shape of the Wall's logo?", "choices": ["circle", "square", "heart", "triangle"], "correct_choice_idx": 2, "direct_answers": ["hearts", "heart", "heart", "heart", "heart shaped", "heart", "heart", "hearts", "heart", "heart"], "difficult_direct_answer": false, "rationales": ["The logo on the wall is in the shape of a heart.", "You can tell by the shape as to what the logo is.", "The wall's logo is in a heart."], "image": "val2014/COCO_val2014_000000225686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524662, "question_id": "SUoj7aTxgwTAoy9Vk754iN", "question": "Which food out of these is most starchy?", "choices": ["orange", "carrot", "potato", "apple"], "correct_choice_idx": 2, "direct_answers": ["lettuce", "potato", "potato", "potatoes", "potatos", "apple", "potatoes", "potato", "potato", "potato"], "difficult_direct_answer": false, "rationales": ["The potatoes are the most starchy food in the bunch.", "The inside of the round yellow vegetable contains lots of starch.", "Foods with starches have less fiber."], "image": "train2014/COCO_train2014_000000524662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401766, "question_id": "SUqqkCuMxisLKXXVfgV4Qs", "question": "What is probably faulty?", "choices": ["cooling", "heating", "plumbing", "electrical"], "correct_choice_idx": 2, "direct_answers": ["sink", "pipes", "leaky pipe", "plumbing", "pipe", "pipes", "sink", "plumbing", "pipes", "sink"], "difficult_direct_answer": false, "rationales": ["The person is working under the sink so it's probably in the pipes.", "The person is working under the sink, which has pipes under the drain.", "A person is laying on kitchen floor as they tend to the underpart of the sink. there is probably issues with the piping."], "image": "train2014/COCO_train2014_000000401766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127202, "question_id": "SV749ESkEpDvMaZHXidfH8", "question": "What type street does this bus drive on?", "choices": ["brick", "concrete", "tar", "dirt"], "correct_choice_idx": 0, "direct_answers": ["cobblestone", "tiled", "cobblestone", "brick", "cobbled", "brick", "brick road", "brick road", "tiled", "brick"], "difficult_direct_answer": false, "rationales": ["This road is a brick road.", "There are small rectangular sections in the road.", "This is a brick road."], "image": "train2014/COCO_train2014_000000127202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106484, "question_id": "SVEXTn2BBVbZUU3WrYhH8z", "question": "Why is he with the dog?", "choices": ["selling it", "walk it", "exercising", "stealing it"], "correct_choice_idx": 1, "direct_answers": ["walking it", "walking", "owner", "companion", "walking", "his pet", "walking", "walking him", "walk it", "walking it"], "difficult_direct_answer": false, "rationales": ["A man is walking with a dog. it has a leash around its neck so he doesn't get away and is getting some exercise with his owner.", "The dog is attached to a leash which is set up that would be commonly associated with answer a.", "He is holding onto a leash and walking the dog."], "image": "val2014/COCO_val2014_000000106484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479630, "question_id": "SVEfjZzmZZTVHXvmYaV7jw", "question": "What vehicle propels the man forward?", "choices": ["boat", "submarine", "bike", "car"], "correct_choice_idx": 0, "direct_answers": ["boat", "boat", "boat", "boat", "motor boat", "boat", "engine", "boat", "boat", "ski boat"], "difficult_direct_answer": false, "rationales": ["The boat seems to be pulling him very fast in the water.", "This is why he's holding onto the handle to the cable.", "The boat keeps the man moving."], "image": "val2014/COCO_val2014_000000479630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61175, "question_id": "SVFqwcAqjJoajNC7v3FK9k", "question": "What are baseball bats usually made of?", "choices": ["tin", "iron", "wood", "aluminum"], "correct_choice_idx": 3, "direct_answers": ["wood", "wood", "wood", "aluminum", "wool", "wood", "wood", "wood", "wool", "wood"], "difficult_direct_answer": false, "rationales": ["Professional baseball bats are usually made of aluminum.", "Baseball bats are made from aluminum.", "Traditionally baseball bats are made of pine."], "image": "val2014/COCO_val2014_000000061175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22208, "question_id": "SVZLShfhFywvVpDNLERvWk", "question": "What is this large appliance used for?", "choices": ["cooling", "watching", "washing", "cooking"], "correct_choice_idx": 1, "direct_answers": ["entertainment", "tv", "watching", "watching tv", "watching movies", "fireplace", "heat", "watching television", "watching", "television"], "difficult_direct_answer": true, "rationales": ["This is a television and people use it to watch shows and movies.", "A television is turned on in a home.", "The person is watching tv."], "image": "val2014/COCO_val2014_000000022208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46981, "question_id": "SVgo9Fu9wpbTTdcaWLCEGK", "question": "What are most modern baseball bats made of?", "choices": ["wood", "tin", "aluminum", "steel"], "correct_choice_idx": 2, "direct_answers": ["aluminum", "aluminum", "aluminum", "titanium", "aluminum", "aluminum", "maple", "hardwood", "hardwood", "wood"], "difficult_direct_answer": false, "rationales": ["In the past most bats were made of wood however with today's advances, most are made of aluminum today.", "Baseball bats are constructed from aluminum.", "They used to be made primarily of wood. they can also be made of a tin, but it's too easy to damage. and d would be too heavy."], "image": "train2014/COCO_train2014_000000046981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373341, "question_id": "SViGwL2xv7jJaYBnLBjeVb", "question": "Which car is in greatest danger if the FedEx car rushed forward?", "choices": ["blue suv", "silver sedan", "motorcycle", "black truck"], "correct_choice_idx": 0, "direct_answers": ["station wagon", "left", "suv", "suv", "black suv", "dark", "suv", "blue suv", "blue suv", "lexus"], "difficult_direct_answer": false, "rationales": ["The blue car is closest to the fedex truck.", "The blue suv is right in front of the fedex truck.", "The blue suv can cause danger to the area."], "image": "val2014/COCO_val2014_000000373341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491508, "question_id": "SVkkG69ACJHujJ34ty9Umm", "question": "The microbes grow on the tree in cold weather is?", "choices": ["fungi", "lichen", "bacteria", "virus"], "correct_choice_idx": 1, "direct_answers": ["moss", "psychrophiles", "moss", "lichen", "psychrophiles", "fall", "moss", "bacteria", "spores", "moss"], "difficult_direct_answer": false, "rationales": ["Lichen thrives on cold and dark environments.", "There are lichens that can grow.", "This can still grow during the cold periods."], "image": "val2014/COCO_val2014_000000491508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235840, "question_id": "SVm45G6NmMmDfvHJuXauTx", "question": "What part of the giraffe in the front does the giraffe in the back look at?", "choices": ["neck", "legs", "butt", "head"], "correct_choice_idx": 2, "direct_answers": ["butt", "neck", "butt", "butt", "tail", "tail", "tail", "back end", "tail", "green"], "difficult_direct_answer": false, "rationales": ["The little one is looking at the other giraffe's butt.", "The giraffe is looking at the other's behind.", "Its head is at the read of the other giraffe"], "image": "train2014/COCO_train2014_000000235840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426979, "question_id": "SW9UEPhf4Kd2mxaHsAHbQ2", "question": "What year did the biker's state become a part of the union?", "choices": ["1875", "1835", "1861", "1822"], "correct_choice_idx": 2, "direct_answers": ["1861", "eighteen sixtyone", "1861", "1861", "1861", "1861", "unknown", "1861", "eighteen sixty-one", "1861"], "difficult_direct_answer": false, "rationales": ["The biker's state (from the license plate) joined the union in 1861.", "I did an internet search on when kansas, the state on the license plate, became a state.", "Per google, this is the correct date."], "image": "train2014/COCO_train2014_000000426979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466132, "question_id": "SWCGqrSxzkMCxTXrwEsfH5", "question": "What materials are the cabinets made from?", "choices": ["plastic", "metal", "wood", "glass"], "correct_choice_idx": 2, "direct_answers": ["wood-based materials", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood-based materials"], "difficult_direct_answer": false, "rationales": ["The cabinets are not transparent or shiny, so they are not made out of glass or metal. plastic cabinets would not be durable.", "The cabinets in the kitchen are made of wood and painted white.", "Kitchen cabinets are mostly made of wood."], "image": "train2014/COCO_train2014_000000466132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134396, "question_id": "SWEAYKQSB4BXP36f2SA2CM", "question": "Why are the chairs in the water?", "choices": ["cleaning off", "are lost", "they're drunk", "cooling off"], "correct_choice_idx": 3, "direct_answers": ["rising tide", "stay cool", "to sit", "cooling off", "for fun", "feet wet", "cooling off", "sitting", "high tide", "relaxing"], "difficult_direct_answer": true, "rationales": ["The chairs are there so people can dip their feet in the cool water.", "The chairs are cooling.", "This setting is normally during the hot time of the year, so the water helps."], "image": "train2014/COCO_train2014_000000134396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470318, "question_id": "SWQRRoFq9VHRvSprWoZVea", "question": "How were the meats most likely cooked?", "choices": ["bbq grill", "oven", "toaster", "stovetop"], "correct_choice_idx": 0, "direct_answers": ["grilled", "bbq grill", "grilled", "grilled", "on grill", "grilled", "grill", "broiled", "grill", "grilled"], "difficult_direct_answer": false, "rationales": ["They were grilled.", "You can see the marks on the meat", "Hotdogs and hamburgers are often cooked on a bbq."], "image": "val2014/COCO_val2014_000000470318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138891, "question_id": "SWTNHbNxndndVeqy5hacL6", "question": "What is the man doing?", "choices": ["observing pilot", "reading information", "cleaning floor", "killing time"], "correct_choice_idx": 1, "direct_answers": ["reading", "observing", "looking", "observing plane", "reading information", "watching airplane", "gazing", "reading information", "reading", "looking"], "difficult_direct_answer": false, "rationales": ["He is looking down at a plaque, which likely has information to read.", "By his posture and the setting he is in, you can tell what he is doing.", "A man stands in front of a plaque in front of a plane on display."], "image": "train2014/COCO_train2014_000000138891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393664, "question_id": "SWW675U7F8dUneWPnkmh4y", "question": "What kind of lens was used to take this picture?", "choices": ["fish eye", "cell phone", "none", "flat"], "correct_choice_idx": 0, "direct_answers": ["fish bowl", "fisheye", "fishbowl", "fisheye", "fish eye", "fish eye", "fish eye", "fisheye lens", "fish eye", "wide"], "difficult_direct_answer": false, "rationales": ["That is the rounded, four-point perspective kind of camera look, similar to how round gilled animals' eyes look.", "That type of lens will create the rounded look.", "The lens is a fish eye."], "image": "train2014/COCO_train2014_000000393664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193892, "question_id": "SWaSA6nsa3PciVQgCK6Lpf", "question": "On which type of plant do these fruits grow?", "choices": ["low herbs", "shrubs", "trees", "vines"], "correct_choice_idx": 2, "direct_answers": ["orange trees", "tree", "orange tree", "tree", "orange tree", "evergreen tree", "orange tree", "tree", "trees", "tree"], "difficult_direct_answer": false, "rationales": ["The plant is a tree.", "Oranges grow on orange trees, which are larger than shrubs. orange trees are picked using ladders.", "These are tangerines. they grow on perennial woody plants."], "image": "train2014/COCO_train2014_000000193892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520338, "question_id": "SWeezdWgGE996CFH4rSSP4", "question": "What is the man in the light shirt doing?", "choices": ["protecting bank", "selling paper", "resting", "awaiting atm"], "correct_choice_idx": 3, "direct_answers": ["surveying", "awaiting atm", "standing", "standing", "fire hydrant", "standing", "standing around", "waiting", "waiting", "staring"], "difficult_direct_answer": false, "rationales": ["The man is waiting to use the atm.", "He's waiting to deposit money or take money out.", "The man is standing in one place with nothing in front of him."], "image": "val2014/COCO_val2014_000000520338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444304, "question_id": "SWjVB9cLg6gaSC3GDrgYVp", "question": "What video format can this person watch films in?", "choices": ["vhs", "beta", "dvd", "mp4"], "correct_choice_idx": 2, "direct_answers": ["dvd", "video", "dvd", "sitting format", "mpeg", "dvd", "tape", "dvd", "all format", "game"], "difficult_direct_answer": false, "rationales": ["The person is watching a tv that has a dvd player under it for watching dvds.", "This person appears to have a dvd player below and connected to the tv. for them to watch films of the format, they would need the associated player.", "There is a player under the tv."], "image": "val2014/COCO_val2014_000000444304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355537, "question_id": "SWyf4RtAEdLEk3Wgk2QKpZ", "question": "What are the two doing behind the boat?", "choices": ["diving", "swimming", "fishing", "water skiing"], "correct_choice_idx": 3, "direct_answers": ["skis", "water skiing", "water skiing", "pulling", "water skiing", "being pulled", "water skiing", "water skiing", "water skiing", "water skiing"], "difficult_direct_answer": false, "rationales": ["The two people behind the boat are water skiing.", "The people are skiing in the water.", "The two people are water skiing behind the boat."], "image": "train2014/COCO_train2014_000000355537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565041, "question_id": "SWzLZVjSNBgrhahXgaKknY", "question": "How is this man feeling?", "choices": ["excited", "amused", "shy", "angry"], "correct_choice_idx": 3, "direct_answers": ["discouraged", "angry", "upset", "angry", "okay", "sad", "disappointed", "scared", "worried", "serious"], "difficult_direct_answer": true, "rationales": ["The man does not look happy", "Based on the look of the man's eyes and his furrowed brow, it demonstrates his anger.", "He does not look happy"], "image": "val2014/COCO_val2014_000000565041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539746, "question_id": "SX9t3QgCrG5WegKaDwsYv2", "question": "What does the plaque on the back of this bench say?", "choices": ["wet paint", "bus ad", "dedication", "no seating"], "correct_choice_idx": 2, "direct_answers": ["sit down", "unknown memorial", "dedication", "memorial message", "can't see", "stop", "no sleeping", "dedicated by", "cannot see", "dedicated"], "difficult_direct_answer": true, "rationales": ["The plaque on the rear of the bench is a dedication.", "The plaque on the bench has a name on it of the person the bench is dedicated to.", "Benches in a community are sometimes sponsored by loved ones to commemorate those they lost."], "image": "train2014/COCO_train2014_000000539746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409241, "question_id": "SXBQZiV2nzGFAjj3T8w4H9", "question": "What sort of traffic is allowed on the bridge over this train?", "choices": ["foot", "boat", "none", "large trucks"], "correct_choice_idx": 0, "direct_answers": ["foot", "foot", "people", "pedestrian", "pedestrian", "foot", "pedestrians", "pedestrian", "foot", "pedestrian"], "difficult_direct_answer": false, "rationales": ["The little bridge is too small to support vehicles, and by the many heads visible over the top of it, it is open only to pedestrians.", "A small overpass is seen at the top of photo. people are stopping on it to watch the train go by, in fact.", "There is a very narrow crossing over the tracks"], "image": "val2014/COCO_val2014_000000409241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577033, "question_id": "SXDCfCDCaUR9WwCG9bhRsq", "question": "Who are these people to each other?", "choices": ["allies", "teammates", "relatives", "enemies"], "correct_choice_idx": 1, "direct_answers": ["teammates", "teammates", "teammates", "teammates", "crewmates", "teammates", "teammates", "teammates", "crewmates", "teammates"], "difficult_direct_answer": false, "rationales": ["The people are teammates.", "By the activity they are doing and uniforms they are wearing you can tell what they are to each other.", "The people are all in a boat together, rowing, in the same uniform as a rowing team would."], "image": "val2014/COCO_val2014_000000577033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368249, "question_id": "SXNb883hTU8snXpdX2YHrb", "question": "What item might this man be selling?", "choices": ["gyros", "hot dogs", "cotton candy", "peanuts"], "correct_choice_idx": 3, "direct_answers": ["nuts", "peanuts", "nuts", "nuts", "nuts", "hot dogs", "nuts", "hot dogs", "nuts", "nuts"], "difficult_direct_answer": false, "rationales": ["The man's stall has a sign advertising honey-roasted nuts.", "They are honey roasted and that is usually the type of nut for this", "The sign above his head says he is selling honey roasted nuts."], "image": "train2014/COCO_train2014_000000368249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335321, "question_id": "SXW5M3YTkzmTDJZKPpyGtR", "question": "What is the boy in the blue hat about to do?", "choices": ["drop in", "nose grind", "heel flip", "kick flip"], "correct_choice_idx": 0, "direct_answers": ["skate", "drop in", "skateboard", "descend", "skateboard", "skate", "drop in", "skating", "skating", "half pipe"], "difficult_direct_answer": false, "rationales": ["The boy with the blue hat is about to drop in on a skateboard.", "He will go down onto the rest of the course", "He's at the top of the ramp"], "image": "train2014/COCO_train2014_000000335321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276491, "question_id": "SXWaAGjbX4inbSEyZrzfwp", "question": "Where do the tubes from the raised bag go?", "choices": ["patient's arm", "mans toe", "nurses station", "outside window"], "correct_choice_idx": 0, "direct_answers": ["iv", "patients arm", "iv", "iv bag", "man's arm", "intravenous lines", "iv", "iv", "iv", "patient's arm"], "difficult_direct_answer": false, "rationales": ["The tubes are for the arm.", "The tubes are delivering fluid to the person's arm.", "There are tubes in the raised bag going to the man's arm."], "image": "train2014/COCO_train2014_000000276491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355819, "question_id": "SXbepHDPdfdUzzXZzSaNjH", "question": "What would a player need here additionally to play a game with this equipment?", "choices": ["bat", "ball", "grass", "bases"], "correct_choice_idx": 3, "direct_answers": ["field", "protecting", "bases", "hat", "cleats", "cleats", "players", "baseball", "diamond field", "bases"], "difficult_direct_answer": false, "rationales": ["A player would need additional bases to play on this game.", "A bat and ball are already present. the game could be played without grass.", "Bases are needed to know where to run and be able to tell how many runs there are."], "image": "val2014/COCO_val2014_000000355819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576084, "question_id": "SXeRGv4jyPbkdpQ2sFi2UN", "question": "What can be found in the bucket?", "choices": ["grain", "grass", "water", "milk"], "correct_choice_idx": 3, "direct_answers": ["milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["The man is milking the cow", "The milkman milks the and fells in the bucket.", "The man is milking to cow from her teeth, and milk is falling into the bucket."], "image": "val2014/COCO_val2014_000000576084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529667, "question_id": "SXgiNdaiuuSk9h8LUHtBzj", "question": "What vehicle is present?", "choices": ["bicycle", "tank", "airplane", "minivan"], "correct_choice_idx": 0, "direct_answers": ["bicycle", "bicycle", "bicycle", "bicycles boats", "boat", "bicycle", "bicycle", "bicycle", "bike", "bike"], "difficult_direct_answer": false, "rationales": ["The vehicle on the far left has two wheels.", "A bike is shown.", "A bicycle can be seen here."], "image": "train2014/COCO_train2014_000000529667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424979, "question_id": "SY6fGzxbSkVspoUWX8mk2B", "question": "How were the potatoes this man eats prepared?", "choices": ["mashed", "baked", "raw", "fried"], "correct_choice_idx": 3, "direct_answers": ["fried", "fried", "fried", "fried", "fried", "continuous fry", "fried", "fried", "continuous fry", "fried"], "difficult_direct_answer": false, "rationales": ["Two men are snacking on a sandwich. close by is a bag of potato chips that are crunchy.", "The potatoes this man eats are kettle fried.", "Chips are fried."], "image": "train2014/COCO_train2014_000000424979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404655, "question_id": "SY8rJGUps2yPs5cEpGxQrS", "question": "What temperatures are the persons walking experiencing?", "choices": ["below average", "hot", "freezing", "cold"], "correct_choice_idx": 1, "direct_answers": ["hot", "very hot", "hot", "hot", "heat", "hot", "hot", "high", "heat", "hot"], "difficult_direct_answer": false, "rationales": ["The people are walking either topless or in bikini tops with shorts/skirts.", "It is very hot outside.", "The persons walking are experiencing hot weather."], "image": "val2014/COCO_val2014_000000404655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354342, "question_id": "SY8tweJ3LEYtTUb2MCQp4p", "question": "What trick is this skateboarder showing to the crowd?", "choices": ["ollie", "kick flip", "wall ride", "grab"], "correct_choice_idx": 0, "direct_answers": ["ollie", "ollie", "ollie", "jumping skateboards", "long jump", "jump", "jump", "jump", "huge jump", "jumping"], "difficult_direct_answer": false, "rationales": ["The man is trying to jump over all the other boards.", "He is in the air.", "This is what google says is the right answer."], "image": "train2014/COCO_train2014_000000354342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470685, "question_id": "SYLnFT2aojPu8Za5PD8u6A", "question": "What is being done?", "choices": ["dancing", "seasoning", "eating", "cleaning"], "correct_choice_idx": 1, "direct_answers": ["cooking", "seasoning", "cooking", "cooking", "cooking", "seasoning", "seasoning", "sprinkle", "cooking", "spicing"], "difficult_direct_answer": false, "rationales": ["Seasoning is applied to the meat in the skillet.", "The label on the bottle that is being poured indicates that it is pepper.", "The person is putting spices into the dish."], "image": "train2014/COCO_train2014_000000470685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80714, "question_id": "SYQet4jvzTjSuF4DtdTG4k", "question": "How is the image from the laptop able to be shown on the projector?", "choices": ["lan cord", "hand drawn", "cable cord", "a/v cable"], "correct_choice_idx": 3, "direct_answers": ["app technology", "device casting", "a/v cable", "wire", "light", "projected", "technology", "connection cord", "wifi", "wifi"], "difficult_direct_answer": true, "rationales": ["A laptop is sitting at a table with a woman using her hands. on the left side of laptop is a large cord extending from it.", "There's a cable coming from the back of the computer and an av cable is most likely what was used to show the images on the big screen.", "The image is projected with an a/v cable."], "image": "val2014/COCO_val2014_000000080714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351367, "question_id": "SYTKGbyubb7WrtxdexKwND", "question": "What are the people doing in the stands?", "choices": ["knitting", "spectating", "protesting", "gaming"], "correct_choice_idx": 1, "direct_answers": ["watching", "spectating", "watching", "cheering", "watching", "cheering", "spectating", "spectating", "spectating", "spectating"], "difficult_direct_answer": false, "rationales": ["A person is in a uniform on skis and there is a large crowd behind him looking on.", "The person in the ski outfit has a number on their uniform which indicates a race and people would watch that", "The people are watching."], "image": "train2014/COCO_train2014_000000351367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551507, "question_id": "SYdV2LfBf3V8kjMV9Rt7FA", "question": "What kind of novels is this author of the book famous for?", "choices": ["horror", "comedy", "romance", "foreign"], "correct_choice_idx": 0, "direct_answers": ["horror", "horror", "horror", "horror", "horror", "horror", "horror", "horror", "horror", "horror"], "difficult_direct_answer": false, "rationales": ["Stephen king is known for writing scary novels.", "Stephen king writes crime novels.", "King is a well known horror writing master."], "image": "train2014/COCO_train2014_000000551507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112793, "question_id": "SYqB7Hnp2QUPfVqARgcFKv", "question": "What species of trees are closest?", "choices": ["oak", "ash", "crate myrtle", "apple"], "correct_choice_idx": 2, "direct_answers": ["crate myrtle", "cherry", "pine tree", "eastern redbud", "flowering", "magnolia", "cherry", "magnolia", "hazel", "rose"], "difficult_direct_answer": false, "rationales": ["The crate myrtle is closest.", "I believe they are called 'crepe' myrtle but i can't tell by the picture without other knowledge.", "The trees have purple flowers. they are too short to be ash, oak, or apple trees."], "image": "train2014/COCO_train2014_000000112793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453512, "question_id": "SYsw4qiohSkCncFHxbv6bD", "question": "What is the name of the trick the man is doing?", "choices": ["spin", "manual", "flip", "grind"], "correct_choice_idx": 3, "direct_answers": ["rail", "grind", "rail", "sliding", "railing", "grind", "rail-slide", "rail slide", "jump", "grind"], "difficult_direct_answer": false, "rationales": ["It is when the board runs along a surface without the wheels touching the surface.", "The man is grinding on the rail.", "The name is grinding."], "image": "train2014/COCO_train2014_000000453512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495793, "question_id": "SZKabut6WSSXYdLtkUA3uT", "question": "What is her character likely to put something under?", "choices": ["table", "pillow", "castle", "tv"], "correct_choice_idx": 1, "direct_answers": ["pillow", "spell", "pillow", "pillow", "tooth fairy", "pillow", "spell", "pillow", "pillow", "pillow"], "difficult_direct_answer": false, "rationales": ["The character needs a pillow.", "The tooth fairy is a make believe person that removes teeth from under this item.", "The woman appears dressed as a tooth fairy based on the tooth wand, tooth brush and costume. the tooth fairy is known to put something under answer a."], "image": "train2014/COCO_train2014_000000495793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284465, "question_id": "SZcEuyQE9QUE4pq3mY4Qzr", "question": "What do these kids want to do to the ball?", "choices": ["dribble it", "avoid it", "ignore it", "kick it"], "correct_choice_idx": 3, "direct_answers": ["kick", "kick it", "kick it", "kick", "kick", "kick it", "kick", "kick", "kick", "score"], "difficult_direct_answer": false, "rationales": ["Soccer players kick the ball.", "They are playing soccer and the ball must be controlled using their feet.", "They are playing a game of soccer."], "image": "val2014/COCO_val2014_000000284465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150292, "question_id": "SZqGgP8A4TECHwD5s3Yb8a", "question": "In which sort of location was this picture taken?", "choices": ["convention center", "rodeo ground", "classroom", "gym"], "correct_choice_idx": 0, "direct_answers": ["event", "business conference", "hospital", "convention sight", "restaurant", "hospital", "convention", "office", "convention center", "comic con"], "difficult_direct_answer": true, "rationales": ["In the background of this photo we can see it is part of a multi-story building. since this gathering looks like it is semi-formal, chances are that the venue is a convention center.", "This picture was taken in a convention center, not a gym.", "The woman is wearing a name badge that is usually worn for conventions and big events."], "image": "val2014/COCO_val2014_000000150292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48870, "question_id": "SZxbDLKBYKvZjLW35EvFN3", "question": "What is the woman prepared to do?", "choices": ["dunk", "swing", "run", "dribble"], "correct_choice_idx": 1, "direct_answers": ["swing", "return serve", "backhand", "hit ball", "hit ball", "rally", "hit", "hit ball", "return", "hit ball"], "difficult_direct_answer": false, "rationales": ["She is waiting for the ball to get closer before she hits it", "The woman will swing.", "The woman is waiting on the ball. she wants to hit it."], "image": "train2014/COCO_train2014_000000048870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93851, "question_id": "Sa8bSkazgy6eEz7qkgfND7", "question": "What holiday is potentially on this day?", "choices": ["christmas", "easter", "remembrance day", "mother's day"], "correct_choice_idx": 2, "direct_answers": ["remembrance day", "halloween", "halloween", "christmas", "christmas", "remembrance day", "halloween", "remembrance day", "christmas", "halloween"], "difficult_direct_answer": false, "rationales": ["The man is wearing a red poppy, a symbol of remembrance.", "Remembrance day is symbolized by a red flower which the man has on his jacket.", "The man is wearing a poppy. he is wearing a somewhat format outfit."], "image": "train2014/COCO_train2014_000000093851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427844, "question_id": "SaQuvHWzFeoS8TYNgsLMPq", "question": "How would they transport the cattle to the nearest patch of grass?", "choices": ["rope", "boat", "buoys", "swim"], "correct_choice_idx": 1, "direct_answers": ["ship", "walking", "walk them", "boat", "boat", "walking", "boat", "via barge", "herding", "ferry"], "difficult_direct_answer": false, "rationales": ["A boat is needed to cross the water.", "They need a boat.", "The most common way to travel over water due to being able to carry heavy supplies."], "image": "train2014/COCO_train2014_000000427844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526759, "question_id": "SaaKnFSL6rTxyaYvadMoZv", "question": "Which automobile appears to have their own dedicated path on which to travel?", "choices": ["car", "train", "taxi", "trucks"], "correct_choice_idx": 1, "direct_answers": ["train", "black sedan", "train", "train", "van", "train", "train", "alongside", "train", "taxi cab"], "difficult_direct_answer": false, "rationales": ["Taxis, cars, and trucks travel in the regular road lanes. there is a dedicated rail track next to the road lanes.", "There are tracks on the bridge", "Since trains have to travel only on railroad tracks, that is certainly a dedicated path to take."], "image": "train2014/COCO_train2014_000000526759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365206, "question_id": "SbMUwoY5Kr9FoDr9JUA3Vc", "question": "What type traffic is allowed to go through this street at this time?", "choices": ["foot only", "all", "buses", "motorcycles"], "correct_choice_idx": 0, "direct_answers": ["foot only", "foot", "foot", "walkers", "pedestrian", "pedestrian", "pedestrian", "pedestrian", "foot", "pedestrian"], "difficult_direct_answer": false, "rationales": ["People are walking around the street and sitting in chairs on the street.", "Because of the venue that is happening you can tell what traffic is permitted at that time.", "Only people are passing through the street."], "image": "train2014/COCO_train2014_000000365206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136835, "question_id": "SbMiTfTnxRAe2VZt34vmqF", "question": "What danger does the fence at the back of the lot protect the cows from?", "choices": ["dehydration", "insects", "drowning", "starving"], "correct_choice_idx": 2, "direct_answers": ["escaping", "water", "drowning", "railroad tracks", "cars", "road", "escaping", "wetlands", "drowning", "wild animals"], "difficult_direct_answer": false, "rationales": ["A large area has water in the background.", "The fences protect the cows from walking out and drowning.", "There are a lot of cows and water beyond the fencing."], "image": "train2014/COCO_train2014_000000136835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471799, "question_id": "SbQSRwDzCZsKbpuKtVZ5UD", "question": "What is poodle hair called?", "choices": ["soften", "straight", "lion", "curlies"], "correct_choice_idx": 3, "direct_answers": ["fur", "dog", "corded coat", "toy", "curlies", "fur", "fur", "corded", "fur", "curlies"], "difficult_direct_answer": false, "rationales": ["Poodles have curly hair.", "The hair of the dog gets its name from its rounded appearance.", "Poodle hair is curled."], "image": "train2014/COCO_train2014_000000471799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154520, "question_id": "Sba2d2JggwksXuAJWVbBtG", "question": "What is the shape of this kite?", "choices": ["box", "flat", "bow", "delta"], "correct_choice_idx": 2, "direct_answers": ["triangle", "very interesting", "triangle", "triangle", "triangle", "traingle", "bow", "triangle", "triangle", "triangle"], "difficult_direct_answer": false, "rationales": ["The kite is more triangular shaped like delta.", "The image is a delta wing shape in configuration.", "The shape is a bow."], "image": "val2014/COCO_val2014_000000154520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479858, "question_id": "SbrpVSH9daQJYbwBAeFK7m", "question": "What is floating in the sauce in the cup on the right?", "choices": ["beans", "lemons", "limes", "carrots"], "correct_choice_idx": 0, "direct_answers": ["beans", "beans", "beans", "beans", "beans", "stew", "beans", "bean", "beans", "bread"], "difficult_direct_answer": false, "rationales": ["The cup is filled with baked-beans.", "There are some beans floating in the cup on the right.", "There are beans floating in the cup that is next to the sandwich."], "image": "val2014/COCO_val2014_000000479858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438373, "question_id": "SbwHkxJjYm56wimrd9G8Rs", "question": "Who will eat this food?", "choices": ["robot", "human", "fish", "alien"], "correct_choice_idx": 1, "direct_answers": ["person", "human", "people", "person", "me", "person", "diner", "diner", "humans", "person"], "difficult_direct_answer": false, "rationales": ["A person will eat this.", "It is a sandwich for people.", "This is on a plate that a person would eat."], "image": "val2014/COCO_val2014_000000438373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579602, "question_id": "Sc8PvYh3WuZGkRdmEEparb", "question": "What item is most likely to win this race?", "choices": ["kite", "dog", "cat", "airplane"], "correct_choice_idx": 3, "direct_answers": ["aircraft", "plane", "kite", "plane", "plane", "airplane", "airplane", "plane", "airplane", "plane"], "difficult_direct_answer": false, "rationales": ["A plane is in the air with kites nearby.", "The gas powered flying machine can maintain a consistent speed and has steering mechanisms that allow it to move in a controlled and consistent way. the wind-powered kites gain and lose speed and can only be controlled imprecisely with lengths of string.", "The plane can go much faster than the people flying the kites"], "image": "val2014/COCO_val2014_000000579602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132217, "question_id": "ScQ3qrcZMh5QUgAUkiqHV7", "question": "What was the pole design to do?", "choices": ["prevent sliding", "find metals", "propel forward", "attack predators"], "correct_choice_idx": 2, "direct_answers": ["balance skier", "skii", "navigate", "push skier", "traverse snow", "balance", "aid movement", "balancing", "ski", "propel forward"], "difficult_direct_answer": true, "rationales": ["The poles are used to help move the skier forward.", "The pole is made to help you move forward.", "This helps move you when cross country skiing"], "image": "train2014/COCO_train2014_000000132217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494128, "question_id": "ScRRf6QkdwofPqtSUnEMUM", "question": "What is the upside down person doing?", "choices": ["being buried", "falling", "being punished", "doing trick"], "correct_choice_idx": 3, "direct_answers": ["flipping", "practice", "snowboarding", "doing trick", "trick", "headstand", "snowboarding", "head stand", "being silly", "balancing"], "difficult_direct_answer": true, "rationales": ["A man is holding the snowboard up in the air while the other is doing a sorta handstand in snow.", "By the position of the person more than likely it was a trick that went wrong.", "This is not the common way to use a snowboard but appears to be intentional based on the second person helping. if someone is intentionally not using a snowboard the way it is meant to be used and in an unusual position they are likely doing some kind of trick."], "image": "train2014/COCO_train2014_000000494128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18048, "question_id": "SchUiqGLCfmWk2M3f3cWkx", "question": "What direction is the wind blowing?", "choices": ["left", "down", "right", "up"], "correct_choice_idx": 2, "direct_answers": ["right", "left", "left", "west", "right", "east", "right", "right toleft", "to right", "east"], "difficult_direct_answer": false, "rationales": ["All of the kites are headed to the right.", "The kite tails are blowing in the opposite direction", "The wind is blowing to the right."], "image": "train2014/COCO_train2014_000000018048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383106, "question_id": "SchrrwoPEgxUgXYgJpZ9WL", "question": "How many toppings are on the pizza?", "choices": ["three", "one", "two", "none"], "correct_choice_idx": 1, "direct_answers": ["one", "zero", "one", "one", "one", "two", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The only thing on the pizza is cheese.", "A cheese pizza is being eaten.", "Only cheese is on the pizza."], "image": "train2014/COCO_train2014_000000383106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309071, "question_id": "ScnGsv22Q6Z5PQ3Pf4BN5R", "question": "What does the man hold in his right hand?", "choices": ["paper", "banana", "dollar bill", "rag"], "correct_choice_idx": 3, "direct_answers": ["rag", "cycle", "waste cloth", "cloth", "cycle", "bicycle", "rag", "rag", "cloth", "paper"], "difficult_direct_answer": false, "rationales": ["The man is using the item to clean his bike and it appears to be fabric.", "A man is wiping a bike with a cloth.", "The man holds a white rag in his right hand."], "image": "val2014/COCO_val2014_000000309071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238459, "question_id": "ScvF36ZDEC2qZaWJEHb6mt", "question": "What colors are on the cow closest to the camera?", "choices": ["orange", "blue", "brown white", "black"], "correct_choice_idx": 2, "direct_answers": ["black brown", "brown white", "black brown", "brown white", "black brown", "brown white", "brown white", "brown white", "brown white", "brown white"], "difficult_direct_answer": false, "rationales": ["Brown and white", "The cow closest to the camera has two colors. they are a common color combination for cows.", "The cows colors are brown and white."], "image": "val2014/COCO_val2014_000000238459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469599, "question_id": "Sd6t75tAQGC38APsoLqQ54", "question": "Why are there tarps and umbrellas?", "choices": ["construction", "beach", "raining", "market"], "correct_choice_idx": 3, "direct_answers": ["market", "shade", "rain", "market", "block sunlight", "protection", "shade", "block sun", "bazaar", "rain"], "difficult_direct_answer": false, "rationales": ["To give people protection from sun and rain while they shop or eat.", "There are wares being sold.", "Markets are set up like this with tarps over stalls."], "image": "val2014/COCO_val2014_000000469599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243970, "question_id": "Sd7jTdMSuGFG8hh59k55e3", "question": "What kind of food is this?", "choices": ["pescatarian", "vegan", "meat", "vegetarian"], "correct_choice_idx": 2, "direct_answers": ["german", "meat stew", "meat vegetables", "meat dish", "sausage", "sausages", "sausages", "dinner", "bangers", "meat"], "difficult_direct_answer": true, "rationales": ["There are sausages on top of the vegetables.", "There are sausages of some kind in the dish.", "The dish has sausages in it."], "image": "train2014/COCO_train2014_000000243970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185991, "question_id": "Sd9icBeH7PnTi4KRpdWeyE", "question": "What are the buildings in the background likely used for?", "choices": ["offices", "private dwellings", "shops", "schools"], "correct_choice_idx": 1, "direct_answers": ["private dwellings", "housing", "living", "housing", "housing", "renting", "apartments", "living", "apartments", "living"], "difficult_direct_answer": false, "rationales": ["The buildings appear to be apartment buildings based on their design and the repeated pattern. apartments are used as answer a.", "The blue three story and tan four story buildings appear to be apartment buildings.", "The buildings appear to be large, multi-dwelling structures. they do not have advertisements, branding, or architecture consistent with another use."], "image": "train2014/COCO_train2014_000000185991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427679, "question_id": "SdDhZcEzgE4nYU3Dbocaoi", "question": "What process is this rowboat in currently?", "choices": ["launching", "beaching", "navigating", "grounding"], "correct_choice_idx": 1, "direct_answers": ["rest", "drifting", "dock", "beached", "beached", "beaching", "beached", "beached", "landed", "docked"], "difficult_direct_answer": false, "rationales": ["The rowboat does not appear in water as its intended use, but appears on the land near the water consistent with answer a.", "The boat is mostly on the land. there is no one in it.", "The rowboat is on the sand."], "image": "train2014/COCO_train2014_000000427679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377901, "question_id": "SdLbPg5Z7ZSEmrdrLQzfGU", "question": "What is the man in the brown shirt about to do?", "choices": ["swing", "run", "stand", "sit"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "hit ball", "hit ball", "hit ball", "get ready", "hit ball", "hit ball", "swing", "get ready", "hit ball"], "difficult_direct_answer": false, "rationales": ["Swing the ball across the opponent in the area.", "The man is about to swing his racquet.", "The man is swinging."], "image": "train2014/COCO_train2014_000000377901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466615, "question_id": "SdPzbiW3QRg9VyhuDHUZ76", "question": "Why is the woman putting money in the device?", "choices": ["getting change", "parking payment", "investing", "order food"], "correct_choice_idx": 1, "direct_answers": ["parking", "parking meter", "parkingmeter", "parking payment", "pay parking", "parking meter", "parking meter", "parking", "parking", "parking"], "difficult_direct_answer": false, "rationales": ["These type of devices require money so you can park there.", "The woman is paying for parking.", "The woman is trying to pay her parking meter fare."], "image": "train2014/COCO_train2014_000000466615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232675, "question_id": "SdX44wcEYwhGi2Nyxc65Dg", "question": "What does this animal eat?", "choices": ["plants", "meat", "bugs", "everything"], "correct_choice_idx": 0, "direct_answers": ["grass", "grass", "plants", "grass", "grass", "grass", "hay", "vegetation", "trash", "grass"], "difficult_direct_answer": false, "rationales": ["This is a goat. it is a herbivore, not a carnivore.", "This animal grazes on vegetation", "Goats are herbivores."], "image": "train2014/COCO_train2014_000000232675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236267, "question_id": "SdgYhAAaBksMmQmuTVRrnC", "question": "Why is the bus here?", "choices": ["is stolen", "is highway", "is parked", "driver lost"], "correct_choice_idx": 1, "direct_answers": ["broke down", "is highway", "passenger transport", "transportation", "carrying passengers", "travel", "transport", "transport workers", "transporting wood", "travelling"], "difficult_direct_answer": true, "rationales": ["The bus is on a highway.", "This an old fashioned photo of a bus going down a dirt path.", "The bus is travelling. there are markings on the ground showing that it is a travelled roadway."], "image": "train2014/COCO_train2014_000000236267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306672, "question_id": "SdjYg2jmYcoAXgcGJjSRq6", "question": "What is the scientific name for the area protected by the pads?", "choices": ["clavicle", "mandible", "patella", "ulna"], "correct_choice_idx": 2, "direct_answers": ["knees", "patella", "patella", "knees", "genu", "knees", "patella", "patella", "patella", "patella"], "difficult_direct_answer": false, "rationales": ["He has pads on his knee caps.", "The skater has pads on his knees which are also know as the patellas.", "The area protected by the pads are the kneecaps, or patella."], "image": "train2014/COCO_train2014_000000306672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46893, "question_id": "SduHBXbAYGf2UgUroN5GPV", "question": "How was the treat the child bites cooked?", "choices": ["broiled", "deep fried", "it wasn't", "grilled"], "correct_choice_idx": 1, "direct_answers": ["fried", "fried", "fried", "deep fried", "fried", "fried", "fried", "fried", "fried", "fried"], "difficult_direct_answer": false, "rationales": ["The boy is eating a donut. many donuts are fried.", "He is eating a donut, which is commonly known as fried dough.", "The boy is eating a donut. donuts are deep fried."], "image": "train2014/COCO_train2014_000000046893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208612, "question_id": "Se8cUv5xVuhUUwtV3M5ho5", "question": "What is the tube the woman is carrying used for?", "choices": ["storing candy", "drinking", "storing posters", "light"], "correct_choice_idx": 2, "direct_answers": ["beach mat", "woman", "posters", "yoga mat", "blueprints", "stuff", "talking", "papers", "storing posters", "transportation"], "difficult_direct_answer": true, "rationales": ["These tubes allow rolled up paper like posters fit inside without being ruined.", "Posters are rolled up so that they will fit in these types of tubes.", "It is good to roll things up in so they don't get bent"], "image": "train2014/COCO_train2014_000000208612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447485, "question_id": "SeCDwUTqMH4mWmUk2HpEYH", "question": "What is the green thing leaning on the sandwich on the right side?", "choices": ["fascicle", "umbel", "branch", "stem"], "correct_choice_idx": 0, "direct_answers": ["champagne", "pine sprig", "plant", "plant", "pine", "wine", "pine", "fascicle", "tree", "pines"], "difficult_direct_answer": false, "rationales": ["The label says that the green item is \"fascicle.\".", "That is what is leaning on the sandwich.", "The green thing on the sandwich is a fascicle."], "image": "train2014/COCO_train2014_000000447485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139157, "question_id": "SeMER5BwJE5DPnRfehPDmW", "question": "During which season is this amphibious bus operating in the water?", "choices": ["summer", "fall", "winter", "spring"], "correct_choice_idx": 0, "direct_answers": ["spring", "summer", "spring", "summer", "summer", "summer", "summer", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["The season is unidentifiable exclusively by the image, but amphibious tour buses are commonly used for tourists who are common in answer a.", "The season is the summer.", "This water bus gives tours when it's hot out."], "image": "train2014/COCO_train2014_000000139157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99636, "question_id": "SeQKtwXR3R6H4GF2xpZ4Wv", "question": "When was the television remote invented?", "choices": ["1878", "1893", "1891", "1867"], "correct_choice_idx": 1, "direct_answers": ["1950", "1930s", "1950", "1980", "usa", "early 1900s", "1900s", "1893", "modern times", "1950"], "difficult_direct_answer": false, "rationales": ["Nikola tesla made a patent describing the device.", "Tv remotes were invented in 1893.", "The television remote was invented in 1950."], "image": "train2014/COCO_train2014_000000099636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522464, "question_id": "SedXCaHR48TY7DJeHAdvJM", "question": "What is the danger faced by the woman on the left?", "choices": ["getting hit", "tornado", "getting sunburned", "getting splashed"], "correct_choice_idx": 3, "direct_answers": ["cars", "splash", "wet", "slipping", "getting wet", "car", "car", "getting splashed", "puddles", "rain"], "difficult_direct_answer": true, "rationales": ["The woman is near water puddles.", "It is a visibly wet day and the woman is walking in close enough proximity to the street where answer a could be possible from a passing vehicle.", "Because she is near the road ,water could be splashed on her by the passing by vehicles."], "image": "train2014/COCO_train2014_000000522464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449210, "question_id": "SekhjCdTh66YTJoYEaHpfw", "question": "What protective apparel should this person wear?", "choices": ["knee pads", "scarf", "helmet", "sunglasses"], "correct_choice_idx": 2, "direct_answers": ["jacket gloves", "helmet", "helmet", "helmet", "goggles", "goggles", "helmet", "facemask", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["The helmet will prevent him from getting injury to the gead.", "The skier is wearing knee pads to protect themself if they fall", "The person should wear a helmet for their ski hat."], "image": "train2014/COCO_train2014_000000449210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47898, "question_id": "SenD94FbDf42QLtmcFyYdN", "question": "What kind of natural structure can be seen?", "choices": ["boulder", "mountain", "river", "stalagmite"], "correct_choice_idx": 2, "direct_answers": ["hills", "river", "river", "river", "lake", "river bank", "nonsensical question", "sunset", "river", "river"], "difficult_direct_answer": false, "rationales": ["There is a body of water that moves", "The river is the most natural thing here.", "There is water winding between two pieces of land."], "image": "val2014/COCO_val2014_000000047898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467753, "question_id": "SerPTMuuzu8uoW6xSj6mmr", "question": "For what reason is there yellow tape pulled here?", "choices": ["party", "backhoe digging", "repossession", "crime scene"], "correct_choice_idx": 1, "direct_answers": ["caution area", "safety", "hazard", "create barrier", "caution", "backhoe digging", "caution", "caution", "keep out", "construction"], "difficult_direct_answer": false, "rationales": ["The machines used can be dangerous for people to be around.", "Yellow tape is out for construction.", "The reason is digging."], "image": "train2014/COCO_train2014_000000467753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29189, "question_id": "SetsQsnN6gPvkYi2Bj3Hep", "question": "Why are books displayed on tables here?", "choices": ["giving away", "person's hoarding", "meeting people", "for sale"], "correct_choice_idx": 3, "direct_answers": ["sale book", "for sale", "for sale", "for sale", "for sale", "for sale", "for sale", "book store", "selling books", "book sale"], "difficult_direct_answer": false, "rationales": ["The books are for sale.", "This is a book sale, but what we don't know is if this is a retail bookstore or a library. many libraries hold annual book sales to lessen inventory and make a few extra dollars.", "The books are spread out on display."], "image": "train2014/COCO_train2014_000000029189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157001, "question_id": "SfGXdvCSzNSogg7PaXYSHa", "question": "The man looks most like what celebrity?", "choices": ["janina gavankar", "omar epps", "idris elba", "tiny tim"], "correct_choice_idx": 3, "direct_answers": ["russell brand", "bono", "john lennon", "morgan", "tiny tim", "keanu reeves", "al yankovic", "johnny depp", "no idea", "ozzy osborne"], "difficult_direct_answer": true, "rationales": ["He does resemble the \"tiptoe thru the tulips\" guy.", "The other answers don't remotely match this person.", "He has long dark hair."], "image": "val2014/COCO_val2014_000000157001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348621, "question_id": "SfQULXDqMewCv7A7qpctNr", "question": "What is the social status of most people who own these boats?", "choices": ["poor", "broke", "wealthy", "happy"], "correct_choice_idx": 2, "direct_answers": ["rich", "wealthy", "wealthy", "rich", "rich", "rich", "rich", "rich", "wealthy", "wealthy"], "difficult_direct_answer": false, "rationales": ["In order to afford a boat you have to have extra spending money and these people are likely well off.", "Boats of these sizes are generally expensive.", "These are large fancy looking boats."], "image": "train2014/COCO_train2014_000000348621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231682, "question_id": "SfS4bbxSTN4U5UBy6P5xvV", "question": "Why is the crown worn here?", "choices": ["he's royalty", "heir apparent", "for fun", "imposter"], "correct_choice_idx": 2, "direct_answers": ["for fun", "protection", "birthday", "for fun", "birthday boy", "parade", "royalty", "birthday", "for fun", "celebration"], "difficult_direct_answer": false, "rationales": ["The crown is worn just for kicks.", "The person is wearing a crown because they are having fun at the parade.", "The child has it on to dress up some."], "image": "train2014/COCO_train2014_000000231682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556073, "question_id": "SfhZZsgU7uBWPGBzvGUij6", "question": "What is the brand name of the oral care product shown here?", "choices": ["glide", "oral-b", "crest", "colgate"], "correct_choice_idx": 2, "direct_answers": ["crest", "crest", "crest", "glide", "crest", "crest", "glide", "crest", "crest", "glide"], "difficult_direct_answer": false, "rationales": ["The name of the product is written on the container.", "Crest's logo is on the floss.", "There is a container of dental floss. it has a logo that indicates the brand name."], "image": "val2014/COCO_val2014_000000556073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389244, "question_id": "SfjJ3BGHnwb5ZYapmqWAHk", "question": "What type of store are the scooters parked in front of?", "choices": ["bodega", "restaurant", "bakery", "coffee shop"], "correct_choice_idx": 3, "direct_answers": ["coffee shop", "retail stores", "housing", "starbucks", "restaurant", "starbucks", "starbucks", "coffee", "gift shop", "mall"], "difficult_direct_answer": false, "rationales": ["Any answer is feasible here and it is hard to tell what type of building it can be.", "You can tell by the design and signs on the building as to what type it is.", "The store logo features starbucks."], "image": "train2014/COCO_train2014_000000389244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321438, "question_id": "SgSK9CdzrRZ8g5xDWWn6KB", "question": "What are diners here enjoying with their meal?", "choices": ["soda", "milk", "beer", "water"], "correct_choice_idx": 3, "direct_answers": ["pizza", "water", "water", "water", "water", "water", "water", "water", "water", "pizza"], "difficult_direct_answer": false, "rationales": ["The glasses contain a clear liquid that is not carbonated.", "There is a whole mess of food on a table. in the glasses are clear liquids with no bubbles.", "The liquid in the glasses is clear and has no carbonation bubbles."], "image": "train2014/COCO_train2014_000000321438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115251, "question_id": "SgfN2N5V8s2WhjrfCjDTvG", "question": "Why are the little people riding the sheep?", "choices": ["are children", "no horses", "forced to", "going home"], "correct_choice_idx": 0, "direct_answers": ["entertainment", "racing", "fun", "contest", "are children", "fun", "mutton busting", "rodeo", "rodeo", "rodeo"], "difficult_direct_answer": false, "rationales": ["The kids are children competing in a mutton busting contest.", "The people are kids.", "Sheep are smaller animals and only a child could ride one comfortably."], "image": "train2014/COCO_train2014_000000115251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56753, "question_id": "SghpZYwQSKsVYbnSUbDToH", "question": "What is the electrical device on the wall to the left of the window used for?", "choices": ["hvac", "entertainment", "lighting", "storage"], "correct_choice_idx": 0, "direct_answers": ["stove exhaust", "air conditioning", "coffee", "ventilation", "cords", "air venting", "heating", "sockets", "hvac", "electrical outlets"], "difficult_direct_answer": true, "rationales": ["The bulb is used to give light.", "The electrical device on the wall by the window is an air conditioner.", "Hvac is shown."], "image": "train2014/COCO_train2014_000000056753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548882, "question_id": "SgkLgMpap3ybhwgg3ZBt4V", "question": "The yellow item was most likely made in what type of pot?", "choices": ["kettle", "crockpot", "saucepan", "skillet"], "correct_choice_idx": 3, "direct_answers": ["frying pan", "instant pot", "cast iron", "pan", "fry pan", "pan", "skillet", "pan", "frying", "frying pan"], "difficult_direct_answer": false, "rationales": ["A pan is used to prepare eggs.", "These are scrambled eggs", "Omelets are made in a skillet."], "image": "val2014/COCO_val2014_000000548882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127632, "question_id": "SgqzKHy4ADumFqpR3QaR6t", "question": "What is the greatest danger here?", "choices": ["drowning", "tsunami", "hitting rocks", "big waves"], "correct_choice_idx": 2, "direct_answers": ["rocks", "rocks", "rock", "hitting rocks", "rocks", "waves", "surfers", "rocks", "rocks", "rocks"], "difficult_direct_answer": false, "rationales": ["Drowning is fatal so assuming death is the worst case scenario, then drowning would be the most dangerous possibility in this location.", "The danger is hitting rocks.", "Hitting these can make them fall or hit their heads"], "image": "train2014/COCO_train2014_000000127632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349410, "question_id": "Sh8gR2SWBbjXnGyAGBgBPK", "question": "How many people are there in the bus?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 0, "direct_answers": ["two layers", "one", "one", "one", "ten", "twenty seven", "zero", "twenty", "one", "zero"], "difficult_direct_answer": false, "rationales": ["There is one person.", "Only the driver is on the bus.", "The bus is not in service so there would only be the bus driver in it."], "image": "train2014/COCO_train2014_000000349410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230559, "question_id": "ShJPEZwDzybRnGrazW5HZJ", "question": "How many Ossicones do giraffe's has?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "one", "two", "two", "one", "one", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two of them.", "Giraffes have two each of these ossicones.", "Giraffes have two horns."], "image": "train2014/COCO_train2014_000000230559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428391, "question_id": "ShK8dqUXT4g78mGd2RQdX4", "question": "Which animals with four paws can be seen?", "choices": ["bears", "dogs", "cats", "foxes"], "correct_choice_idx": 0, "direct_answers": ["bears", "bear", "bear", "bears", "bear", "bear", "bears", "bear", "bear", "bears"], "difficult_direct_answer": false, "rationales": ["Bears have four paws.", "There are a few brown bears wading in the water nearby.", "The animals with four paws are too big to be cats, dogs, or foxes."], "image": "train2014/COCO_train2014_000000428391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401768, "question_id": "ShSDFgPv7AzmvhP3SKbBN2", "question": "What gives the court its red color?", "choices": ["sand", "crushed brick", "paint", "dye"], "correct_choice_idx": 1, "direct_answers": ["sand", "crushed brick", "clay", "sand", "clay", "clay", "paint", "clay", "clay", "clay content"], "difficult_direct_answer": false, "rationales": ["The court is made of brick that's finely ground.", "Bricks that are crushed are red.", "It can also contain clay and d. clay is usually the source of the color."], "image": "train2014/COCO_train2014_000000401768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33215, "question_id": "ShUdCHeP9xQqejyQG4vrp5", "question": "What word sounds like the first word on the top left sign?", "choices": ["bark", "good", "slop", "toe"], "correct_choice_idx": 3, "direct_answers": ["toe", "tow", "toe", "bow", "tow", "hoe", "know", "toe", "hoe", "mow"], "difficult_direct_answer": false, "rationales": ["It has the same sound as a part of a foot.", "Toe and tow are the same but spelled differently.", "The word rhymes with tow."], "image": "train2014/COCO_train2014_000000033215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329831, "question_id": "ShcVqzXYWrqKZCpLaSsAbW", "question": "What would normally assist the skiers off the snow?", "choices": ["child", "cat", "dog", "officer"], "correct_choice_idx": 2, "direct_answers": ["ski poles", "guides", "lift", "eyesight", "guides", "skis", "guides", "assistant dog", "lift", "dog"], "difficult_direct_answer": false, "rationales": ["The dog would assist.", "The skiers are wearing bibs that indicate that they are blind, so they normally would be assisted by service animals. cats are not normally used as service animals for people who are blind.", "The skiers are blind. they have human guides instead of animal ones in this scene."], "image": "train2014/COCO_train2014_000000329831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28870, "question_id": "ShwSJ8wCmzgnLcPFCXHwRH", "question": "Why are the planes in this hanger?", "choices": ["to display", "to fly", "to repair", "to paint"], "correct_choice_idx": 0, "direct_answers": ["storage", "for storage", "display", "storage", "display", "display", "display", "to display", "storage", "museum"], "difficult_direct_answer": false, "rationales": ["The planes are being shown in a museum.", "The planes are in the hangar because they are on display for visitors and people that like planes.", "This is a museum of airplanes and all of the planes inside are for spectators to see."], "image": "train2014/COCO_train2014_000000028870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581451, "question_id": "ShzUkRZUhq2e5umRxrXS2c", "question": "What is the healthiest ingredient in the photo?", "choices": ["beans", "ketchup", "bun", "sausage"], "correct_choice_idx": 0, "direct_answers": ["peas", "green peas", "beans", "green peas", "peas", "peas", "peas", "peas", "peas", "peas"], "difficult_direct_answer": false, "rationales": ["Peas are healthy.", "The beans are healthier.", "It's a process of elimination. sausage certainly is not healthy (although i like it a lot). bread is not known as a healthy food and ketchup isn't either."], "image": "val2014/COCO_val2014_000000581451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72666, "question_id": "SibvSdCfFL6ZZJXXpDAiU4", "question": "What airway is the closest plane belonging to?", "choices": ["delta", "america airlines", "jet blue", "kuwait airways"], "correct_choice_idx": 3, "direct_answers": ["kuwait airways", "kuwait airways", "kuwait airlines", "kuwait airways", "kuwait airways", "kuwait airways", "kuwait airways", "kuwait airways", "kuwait airways", "kuwait"], "difficult_direct_answer": false, "rationales": ["The airline's name is on the tail of the plane.", "The name of the airway is printed on the side of the plane closest to the camera.", "A logo for an airlines is on a large commercial plane."], "image": "val2014/COCO_val2014_000000072666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283264, "question_id": "Sikdpe5xtGuCTwv4cJ4fCZ", "question": "What handedness does this pitcher possess?", "choices": ["right", "knuckle", "left", "none"], "correct_choice_idx": 2, "direct_answers": ["left", "left hand", "left", "left", "left", "left hand", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["The pitcher is visibly holding a baseball in one hand with a glove in the other. in baseball, one would hold the ball in their dominant hand.", "He is wearing the glove on his right hand and is throwing the ball with his other hand.", "The pitcher is left handed."], "image": "val2014/COCO_val2014_000000283264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123184, "question_id": "SiuFe3JSNbdZEYQQDohKMq", "question": "How did this person arrive at this location?", "choices": ["taxi", "bus", "via motorcycle", "walking"], "correct_choice_idx": 2, "direct_answers": ["motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "via motorcycle"], "difficult_direct_answer": false, "rationales": ["The person has a motorbike.", "The person is in cycling gear so they rode a motorbike.", "There is a two-wheeled vehicle in front of the person. this person is wearing a helmet and a leather safety outfit."], "image": "train2014/COCO_train2014_000000123184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178895, "question_id": "SivQbA67g9z6MVnEQwbuYm", "question": "What speed is the train traveling at?", "choices": ["30mph", "100mph", "0mph", "60mph"], "correct_choice_idx": 2, "direct_answers": ["0mph", "50km/h", "zero", "zero", "fast", "0 mph", "stopped", "stopped", "zero", "zero"], "difficult_direct_answer": false, "rationales": ["A train is stopped at the station.", "The train is stationary.", "The train has stopped at a station and has not accelerated in speed."], "image": "train2014/COCO_train2014_000000178895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200024, "question_id": "SiyUqeJ4UTD4odmLCX4Rph", "question": "What is the woman doing to her teeth while looking in the bathroom mirror?", "choices": ["brushing", "washing", "flossing", "picking"], "correct_choice_idx": 0, "direct_answers": ["brushing", "brushing", "brushing", "brushing", "brushing", "brushing", "brushing", "brushing", "brushing", "brushing"], "difficult_direct_answer": false, "rationales": ["The woman wants to brush her teeth.", "The woman is looking in the mirror and brushing her teeth with the toothbrush in her hand.", "A woman is standing in front of a bathroom mirror with her tongue out and an object with a handle in her hand."], "image": "train2014/COCO_train2014_000000200024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418152, "question_id": "Sj6AChjaNNhLhuFcJmv8UZ", "question": "Where is the photographer who captures this image?", "choices": ["road", "home", "rail train", "air"], "correct_choice_idx": 0, "direct_answers": ["traintrack", "sidewalk", "train station", "under bridge", "under structure", "road", "under bridge", "under overpass", "under bridge", "under bridge"], "difficult_direct_answer": false, "rationales": ["The photographer is on the road.", "The view is from eye level.", "The photographer is on top of the road."], "image": "val2014/COCO_val2014_000000418152.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441504, "question_id": "SjaRTx4tE8Zxgnm5rHzry3", "question": "For what group of people is the black area stepped on by the girl built for?", "choices": ["disabled pe6", "blind", "pregnant women", "elderly people"], "correct_choice_idx": 1, "direct_answers": ["blind", "blind", "passengers", "hiking", "passengers", "boarding train", "passengers", "safety", "blind", "train"], "difficult_direct_answer": false, "rationales": ["The way the small boy looks is if he is not seeing.", "The group is blind people.", "The raised dots on the platform aid the visually impaired to not step over the line for safety."], "image": "train2014/COCO_train2014_000000441504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214274, "question_id": "SkA6iYhCz9BUKsDgWvx2sK", "question": "Is the man wearing a belt?", "choices": ["no", "unsure", "yes", "maybe"], "correct_choice_idx": 2, "direct_answers": ["support pants", "yes", "yes blue", "yes", "no", "yes", "yes", "no", "yes", "yes"], "difficult_direct_answer": false, "rationales": ["There is a piece of leather looping through the top of his pants and connecting in the front.", "The man is wearing jeans and a black leather belt.", "The man's jeans are loose."], "image": "val2014/COCO_val2014_000000214274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245757, "question_id": "SkCtiyYSAy7TKZxGdVZhBE", "question": "What is the black chair oriented to view?", "choices": ["sofa", "painting", "computer", "tv"], "correct_choice_idx": 2, "direct_answers": ["monitor", "window", "computer", "window", "soba", "outside", "computer", "computer", "laptop", "window"], "difficult_direct_answer": false, "rationales": ["The black chair is viewing the computer.", "The black chair is pointed away from the sofa and painting and towards the laptop and monitor.", "A chair sits in front of a desk."], "image": "train2014/COCO_train2014_000000245757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421794, "question_id": "SkHxfbrk2oZWryWD4wEszw", "question": "The animal on the screen is what animal?", "choices": ["fox", "giraffe", "ant", "eagle"], "correct_choice_idx": 0, "direct_answers": ["fox", "fox", "fox", "fox", "fox", "fox", "fox", "fox", "fox", "fox"], "difficult_direct_answer": false, "rationales": ["The logo for firefox is a fox.", "This name and fire before it are the name of a common browser.", "The brown animal is a fox."], "image": "train2014/COCO_train2014_000000421794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516607, "question_id": "SkMsHbniEkXA7Nw5P348rv", "question": "What keeps the cattle from eating the garden here?", "choices": ["man", "fencing", "nothing", "fear"], "correct_choice_idx": 1, "direct_answers": ["gate", "fence", "fencing", "fence", "fence", "fence", "fence", "fence", "fence", "gate"], "difficult_direct_answer": false, "rationales": ["Wooden railings can stop cattle from moving to another area.", "The cattle are kept out of the garden with a bamboo fence.", "Fencing has been erected between the garden and the cattle, which will theoretically keep hungry animals at bay. they seem to be enjoying their hay, meanwhile."], "image": "train2014/COCO_train2014_000000516607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93832, "question_id": "SkSnGgEoLBehviCFj9p9RZ", "question": "What is the banana on top of?", "choices": ["refrigerator", "ice cream", "tray", "plate"], "correct_choice_idx": 0, "direct_answers": ["refrigerator", "refrigerator", "fridge", "fridge", "fridge", "fridge", "refrigerator", "fridge", "refrigerator", "fridge"], "difficult_direct_answer": false, "rationales": ["The fridge is where this family puts their bananas.", "This is taller than other things in the kitchen and has 2 doors", "The bananas are clearly visible and the structure under them is known based on its design and shape."], "image": "train2014/COCO_train2014_000000093832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47654, "question_id": "SkyNbfpatFbrb7L3oJZ5Uz", "question": "What is the likely relationship between the man and the two girls?", "choices": ["great grandfather", "brother", "father", "nephew"], "correct_choice_idx": 2, "direct_answers": ["parent", "father", "father", "father/daughter", "father", "father", "parent", "father", "parent child", "father"], "difficult_direct_answer": false, "rationales": ["The man is older than the kids. he is old enough to be a parent but not a great grandparent.", "A man is posing with two young children.", "The man resembles the girls."], "image": "val2014/COCO_val2014_000000047654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120248, "question_id": "Sm2rujHb4328x8Tzn7Gaan", "question": "On which continent is this boat floating most likely?", "choices": ["north america", "asia", "antarctica", "europe"], "correct_choice_idx": 1, "direct_answers": ["asa", "asia", "mid east", "asia", "asia", "asia", "in water", "asia", "asia", "all"], "difficult_direct_answer": false, "rationales": ["The boat is flying the flag of thailand.", "In the background there are buildings with architecture commonly associated with answer a.", "The boat style and the architecture of the buildings in the background are synonymous with asian style."], "image": "val2014/COCO_val2014_000000120248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453195, "question_id": "SmPTzLpA9iociskfnePZK2", "question": "What is he doing?", "choices": ["watching tv", "making dinner", "using computer", "showing off"], "correct_choice_idx": 2, "direct_answers": ["viewing screen", "typing", "working", "working", "working", "using computer", "working", "working", "on computer", "typing"], "difficult_direct_answer": false, "rationales": ["He has his hand on a mouse and is looking at a screen.", "The man is holding a mouse in his right and staring straight ahead, seemingly transfixed, which are good indicators that he is currently using a computer.", "He is sitting at a desk. he has a mouse in his right hand."], "image": "train2014/COCO_train2014_000000453195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334476, "question_id": "SmYnzfahwUaocd3brjHLA4", "question": "What food shares the name that appears on the blue bottle?", "choices": ["maple syrup", "apple pie", "swiss cheese", "green bean"], "correct_choice_idx": 2, "direct_answers": ["cheese", "cheese", "cheese", "swiss cheese", "swiss cheese", "swiss", "swiss", "cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["Swiss cheese has the same name as swiss.", "The writing on the blue bottle is clearly visible and the food of answer a has a word in common.", "A popular kind of cheese is swiss cheese."], "image": "train2014/COCO_train2014_000000334476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13868, "question_id": "SmYxHFhNwigKvamrrJerwN", "question": "What is the use for the wheeled item with the word lung on it?", "choices": ["handicapped", "skiing", "racing", "luggage moving"], "correct_choice_idx": 3, "direct_answers": ["carry luggage", "luggage moving", "caddy", "luggage transport", "luggage transport", "luggage moving", "bag", "emergency medical", "cart", "breathing better"], "difficult_direct_answer": false, "rationales": ["You can tell by the metal apparatus what it is used for.", "It can transport several heavy items for those who are unable to carry many to their destination. the wheels are for transportation to another location.", "The wheeled cart is in an airport, and already has items on it."], "image": "train2014/COCO_train2014_000000013868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120052, "question_id": "SmpA6GNqUzcrc582Xerb4s", "question": "What objects are being advertised on a billboard?", "choices": ["phones", "bones", "scones", "cones"], "correct_choice_idx": 3, "direct_answers": ["cones", "safety cones", "cones", "traffic cones", "cones", "pylons", "cones", "traffic cone", "cones", "pylons"], "difficult_direct_answer": false, "rationales": ["The object is a cone.", "Safety cones are being shown on the board.", "Cones are on the billboard."], "image": "train2014/COCO_train2014_000000120052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346716, "question_id": "Sn966ZFq9FjyW6z8JPkyrK", "question": "What color car roof here would be most dangerous to fall from?", "choices": ["black", "white", "all same", "yellow"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "black", "white"], "difficult_direct_answer": false, "rationales": ["The color is white.", "The white roof is at the highest tier of the bus and falling from a higher height would injury someone more severely.", "The double decker bus has a white roof."], "image": "val2014/COCO_val2014_000000346716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65616, "question_id": "SnNdApYt83cWYhSXCJ3Vfp", "question": "Where did they purchase the beverage?", "choices": ["target", "kroger", "walmart", "cvs"], "correct_choice_idx": 2, "direct_answers": ["store", "great value", "walmart", "walmart", "walmart", "store", "store", "walmart", "market", "walmart"], "difficult_direct_answer": false, "rationales": ["Great value items are from walmart.", "They purchased the drink from sam's club.", "The bottle says the store brand on it."], "image": "train2014/COCO_train2014_000000065616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396247, "question_id": "SnXAWhHGVY8x4xjo9ztjcr", "question": "What article of clothing are the men showing off?", "choices": ["vests", "jacket", "socks", "scarves"], "correct_choice_idx": 2, "direct_answers": ["socks", "socks", "socks", "socks", "socks", "socks", "socks", "socks", "socks", "socks"], "difficult_direct_answer": false, "rationales": ["They are wearing different colored foot coverings on their feet.", "The men are showing off their socks.", "The men are pulling up their pant legs and making their feet garments visible."], "image": "train2014/COCO_train2014_000000396247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270198, "question_id": "SnaVf6hf8bfwxGTHt9sdbB", "question": "What are the circular patterns on the ground?", "choices": ["tire tracks", "paint", "abstract drawing", "ink"], "correct_choice_idx": 0, "direct_answers": ["tracks", "tire marks", "doughnuts", "tire tracks", "tire tracks", "donuts", "burn outs", "tire marks", "burnouts", "donut marks"], "difficult_direct_answer": false, "rationales": ["The circular patterns on the ground are left from tires when the car spun very fast.", "It appears that the trucks have been doing wheelies on the ground to create the circular patterns.", "A large monster truck has burned tired treads from spinning around in it."], "image": "train2014/COCO_train2014_000000270198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284956, "question_id": "SnbpzoTXKoUtKmRdhZy2FL", "question": "What item is being used to celebrate the child's birthday?", "choices": ["cake", "wax cake", "cheese cake", "chocolatechip cookie"], "correct_choice_idx": 3, "direct_answers": ["cake", "cake", "cookie cake", "cake", "birthday cake", "cookie cake", "cake", "birthday cake", "cookie", "chocolatechip cookie"], "difficult_direct_answer": false, "rationales": ["The chocolate chip cookie makes up the cake.", "The cake is a big cookie.", "A big cookie that looks like a cake."], "image": "train2014/COCO_train2014_000000284956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505309, "question_id": "SnmipSFqtdC9ntxYNPewdv", "question": "The blue flame on the top of the range indicates it is burning what flammable item?", "choices": ["coal", "propane", "natural gas", "charcoal"], "correct_choice_idx": 2, "direct_answers": ["natural gas", "food", "gas", "gas", "gas", "gas", "gas", "gas", "gas", "pot"], "difficult_direct_answer": false, "rationales": ["The other fuels cannot be used for cooking in an enclosed space.", "Natural gas is blue when burned.", "Stoves have two common sources of fuel where one is answer a and the other would not have visible flames in this manner."], "image": "train2014/COCO_train2014_000000505309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538846, "question_id": "SnzNym6s9tkDAxBHAyRgFP", "question": "What is on top of this food?", "choices": ["chocolate", "black olives", "maple syrup", "eggs"], "correct_choice_idx": 1, "direct_answers": ["olive", "olives", "olives", "olives", "tomato toppings", "olives", "cheese", "olives", "olives", "black olives"], "difficult_direct_answer": false, "rationales": ["The black round circles are olives that have been sliced up.", "They are one of the main things on top of the pizza.", "The pizza has olives."], "image": "train2014/COCO_train2014_000000538846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94610, "question_id": "So6JoPMEwK9y22eDN2jPUW", "question": "What form of currency allows cars to park here?", "choices": ["cards", "dollars", "coins", "checks only"], "correct_choice_idx": 2, "direct_answers": ["coins", "change", "coins", "coins", "coins", "coins", "coins", "coins", "parking meter", "coins"], "difficult_direct_answer": false, "rationales": ["The parking machines can only accept coins.", "The currency is coins.", "There are parking meters pictured."], "image": "val2014/COCO_val2014_000000094610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574790, "question_id": "SoVdyGqYj7Nw4VQTa3TpBW", "question": "What type of station is nearby?", "choices": ["gas", "bus", "fire", "train"], "correct_choice_idx": 0, "direct_answers": ["gas station", "gas", "gas", "gas", "gas", "gas", "gas", "gas station", "gas", "gas"], "difficult_direct_answer": false, "rationales": ["Phillips 66 is a gas station so that's the closest type of \"station\" that can be seen.", "There is a sign with prices", "The station serves gas."], "image": "val2014/COCO_val2014_000000574790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470206, "question_id": "SoWTunZxvsfBBzUvcrsT8r", "question": "What is the cat leaning against?", "choices": ["antenna", "turn signal", "horn", "mirror"], "correct_choice_idx": 0, "direct_answers": ["antenna", "stick", "antenna", "antenna", "antenna", "radio antenna", "cat", "antenna", "walk", "antenna"], "difficult_direct_answer": false, "rationales": ["The item is a metallic structure that captures radio signals attached to a car.", "The wire sticking up from the car gives devices better reception.", "The cat is against the antenna."], "image": "train2014/COCO_train2014_000000470206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67422, "question_id": "SopoDXoBSuH3Pg62ChcyMy", "question": "Which direction does 17 want to run?", "choices": ["first base", "work", "right", "left"], "correct_choice_idx": 2, "direct_answers": ["home", "right", "right", "right", "right", "home", "right", "right", "home plate", "home"], "difficult_direct_answer": false, "rationales": ["Player #17 is currently on third base. the next base he will be headed toward is home plate which is located where the pitcher is throwing the ball.", "The direction is right.", "The runner appears to be on a baseball diamond based on the field and the players. based on the standard rotation of a baseball diamond, they would run to the direction of answer a."], "image": "val2014/COCO_val2014_000000067422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231644, "question_id": "Sor6mUBkguGmcqEYeEqCmP", "question": "What kind of boots is the woman wearing?", "choices": ["army", "ugg", "seal", "combat"], "correct_choice_idx": 1, "direct_answers": ["uggs", "uggs", "uggs", "ugg", "uggs", "uggs", "white boot", "white boot", "uggs", "white boot"], "difficult_direct_answer": false, "rationales": ["By the length and design of the boot it's easy to tell what style they are.", "There is fur at the top and the feet area look like \"eskimo\" footwear.", "The boots are uggs."], "image": "val2014/COCO_val2014_000000231644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358719, "question_id": "SoryVvbBubnGncxGkveFRF", "question": "The relationship between these people is most likely what?", "choices": ["enemies", "coworkers", "strangers", "family"], "correct_choice_idx": 3, "direct_answers": ["family", "friendly", "parent child", "family", "mother son", "parent child", "parent", "friends", "brother sister", "parent child"], "difficult_direct_answer": false, "rationales": ["They look alike so it's probably mother and son", "They are very friendly and comfortable with one another and they are in the same home, so this is the most likely relationship.", "They seems they are having fun as they are a family."], "image": "train2014/COCO_train2014_000000358719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485755, "question_id": "SoyA4UsprZm4YQ8rksSguY", "question": "What would be the best tool for a sand castle here?", "choices": ["umbrella", "chair", "only hands", "bucket"], "correct_choice_idx": 3, "direct_answers": ["bucket", "shovel", "shovel", "bucket", "sand", "bucket", "wheels", "shovel", "shovel", "shovel"], "difficult_direct_answer": false, "rationales": ["Buckets are a good tool to scoop up sand.", "To build a sand structure you need something to scoop up and move sand. a bucket or shovel can really come in handy. a bucket especially as you can move water as well, essentially to molding designs.", "A bucket can grab the most sand."], "image": "train2014/COCO_train2014_000000485755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579232, "question_id": "SpSw4y2pFxF2dQZ7oLxbDT", "question": "Who is in danger?", "choices": ["pedestrian", "bikes", "signs", "cars"], "correct_choice_idx": 0, "direct_answers": ["person walking", "pedestrian", "pedestrians", "car", "person", "pedestrian", "pedestrian", "pedestrian", "pedestrian crossing", "pedestrian"], "difficult_direct_answer": false, "rationales": ["The car is dangerously close to the person crossing the street.", "The pedestrian could be hit by a car.", "The person walking could get hit by the car."], "image": "train2014/COCO_train2014_000000579232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487464, "question_id": "SpeHDYmd5EvqUbGBcsLBkA", "question": "What business category is behind advertised on the white sign?", "choices": ["souvenir shop", "ice-cream", "surf", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["transport", "bar", "restaurant", "restaurant", "restaurant", "restaurant", "grill", "restaurant", "restaurant", "bayside grill"], "difficult_direct_answer": false, "rationales": ["The place with the white moniker is that of a grill place, where they sell grilled type cuisine.", "The sign says its a grill. grilling is how some foods are cooked.", "The sign is used for serving food."], "image": "train2014/COCO_train2014_000000487464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529036, "question_id": "SpxZuwqD6ejgxKy5ncXPzn", "question": "What type of tennis are these two players playing?", "choices": ["mixed doubles", "men's doubles", "woman's doubles", "mixed ages"], "correct_choice_idx": 0, "direct_answers": ["doubles", "mixed doubles", "double tennis", "doubles", "game", "doubles", "doubles", "doubles", "doubles", "couples"], "difficult_direct_answer": false, "rationales": ["Mixed doubles since one player is a woman.", "These two players are of opposite sex and are playing together on the same team.", "The team has a man and a woman on it."], "image": "train2014/COCO_train2014_000000529036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339815, "question_id": "Sq6mZf4JtfFgvDbiZCAnri", "question": "Why would someone sit at this table?", "choices": ["sew", "eat", "work", "paint"], "correct_choice_idx": 1, "direct_answers": ["eating", "eat", "to eat", "have dinner", "to eat", "dessert", "dining out", "eat", "feeding", "eat"], "difficult_direct_answer": false, "rationales": ["The person would want to eat the food.", "It is spread with delicious food. so, a sitter would be ready to eat.", "There are plates of food on this table and someone would sit here to eat."], "image": "train2014/COCO_train2014_000000339815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234297, "question_id": "Sq6v9NbBXBSAgpQ4UHHgUN", "question": "What currency is displayed on the bag?", "choices": ["zloty", "usd", "euro", "cad"], "correct_choice_idx": 2, "direct_answers": ["euros", "euro", "euro", "dollars", "euro", "euro", "pound", "euro", "euro", "euro"], "difficult_direct_answer": false, "rationales": ["The euro symbol looks like curved capital \"e\" with an extra line in the middle.", "The symbol is clearly displayed on the bag and consistent with answer a.", "The bag is clearly visible and the symbol on it is commonly known based on its shape."], "image": "train2014/COCO_train2014_000000234297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319122, "question_id": "SqDaHrkYtWPSwBaUDuN4u5", "question": "What was the clear glass jar designed for and typically used for?", "choices": ["steeping tea", "vase", "canning", "drinking coffee"], "correct_choice_idx": 2, "direct_answers": ["canning", "jam", "jelly", "vase", "water", "jam", "drinking liquids", "jam jelly", "jelly", "vase"], "difficult_direct_answer": false, "rationales": ["The clear glass jar is a mason jar based on its design. mason jars are used from preserving and sealing foods if used in the intended manner.", "Jars are used for canning.", "Those types of jars are usually used for preserving fruit and vegetables."], "image": "train2014/COCO_train2014_000000319122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229468, "question_id": "SqRAKKU3nFZ4mceSVjQQ8g", "question": "What would be the outcome of the player missing the ball?", "choices": ["strike", "walk", "home run", "ball"], "correct_choice_idx": 0, "direct_answers": ["striking out", "strike", "strike", "strike", "loss", "strike", "strike", "strike", "strike", "baseball"], "difficult_direct_answer": false, "rationales": ["He would strike out.", "He would strike out.", "In baseball if you don't hit the ball, it's called a strike."], "image": "val2014/COCO_val2014_000000229468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231838, "question_id": "SqpBB8pr7mTz9gy584AqL5", "question": "What sort of room is visible through the door?", "choices": ["bathroom", "meeting room", "market", "pool room"], "correct_choice_idx": 1, "direct_answers": ["meeting room", "dining", "dining", "dining room", "meeting", "dinning room", "meeting", "dining room", "dining/sitting", "dining room"], "difficult_direct_answer": false, "rationales": ["A meeting room with a board table is shown.", "The chairs and long table are appropriate for a meeting room.", "The very long desk would never be seen in a pool room, bathroom or market."], "image": "train2014/COCO_train2014_000000231838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323515, "question_id": "SsjS82PvMZm5pKq3oYNGHk", "question": "What type of utensil is resting in the middle of the bowl in the cup?", "choices": ["fork", "knife", "spoon", "chopsticks"], "correct_choice_idx": 2, "direct_answers": ["spoon", "cooking utensils", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon"], "difficult_direct_answer": false, "rationales": ["It is utensil used for scooping liquid.", "It is a liquid/sauce and is the only reasonable utensil that can pick up those contents.", "The length and the curve of the metal as well as the style of handle all show that it is a spoon."], "image": "val2014/COCO_val2014_000000323515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91747, "question_id": "Ssngwx7FpmoteDBXVHUbGA", "question": "What part of the body is the green item worn on?", "choices": ["ears", "head", "legs", "hands"], "correct_choice_idx": 1, "direct_answers": ["head", "head", "head", "head", "head", "head", "head", "neck", "head", "head"], "difficult_direct_answer": false, "rationales": ["The article of clothing is clearly visible and is known to be worn on one location on the body.", "The other options don't apply to this accessory.", "The green item is a hat to be worn on the head."], "image": "train2014/COCO_train2014_000000091747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43256, "question_id": "SsnsCYYoR9KCxKiSo8bRmW", "question": "What is behind the orange cabinet?", "choices": ["window", "wall", "lamp", "blind"], "correct_choice_idx": 0, "direct_answers": ["green wall", "flammable liquids", "window", "wall", "explosives", "mini-frig", "window", "wall", "refrigerator", "green wall"], "difficult_direct_answer": false, "rationales": ["You can tell by the glare from the sun to what is behind the cabinet.", "The orange cabinet is against a green wall.", "There is light coming from the orange cabinet."], "image": "train2014/COCO_train2014_000000043256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368116, "question_id": "SsttXh7wRbtLv4o6XhLGXY", "question": "What is creating the larger waves?", "choices": ["wind", "kids", "falling objects", "boat"], "correct_choice_idx": 3, "direct_answers": ["boat", "boat", "wind", "boat", "boat", "boat", "photo", "boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["The boat is creating some larger waves.", "There is a moving vehicle in the otherwise calm water. there are no kids or falling objects.", "There is a watercraft that is creating a wake as it passes by."], "image": "train2014/COCO_train2014_000000368116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129765, "question_id": "St8QY3aabaoN9BZD2BN4Y8", "question": "What is the first name of the athlete he's advertising?", "choices": ["usain", "lebron", "michael", "asafa"], "correct_choice_idx": 0, "direct_answers": ["usain", "bolt", "usain", "usain", "hussein", "husain", "bolt", "usain", "bolt", "bolt"], "difficult_direct_answer": false, "rationales": ["The name is usain.", "The athlete is usain bolt.", "The shirt shows the name of the athlete."], "image": "val2014/COCO_val2014_000000129765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101906, "question_id": "StQHjF9RjTKjivakA6CBJS", "question": "What life threatening danger does this kayaker face if the waves get to high?", "choices": ["drowning", "burning", "electrocution", "punch"], "correct_choice_idx": 0, "direct_answers": ["drowning", "drowning", "rapids", "drowning", "drowning", "rapids", "drowning", "drowning", "drowning", "drowning"], "difficult_direct_answer": false, "rationales": ["The kayaker is on water. there are no fires, electrical wires, or other people near the water.", "The man could tumble into the water.", "The man could drown."], "image": "train2014/COCO_train2014_000000101906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377669, "question_id": "StQKJwtz9QuGhbRn3ijLKH", "question": "What kind of bread is this?", "choices": ["pumpernickle", "multi-grain", "rye", "white"], "correct_choice_idx": 1, "direct_answers": ["wheat", "multi-grain", "wheat", "brown", "egg bread", "rye", "whole grain", "raised", "wheat", "chocolate cheese"], "difficult_direct_answer": false, "rationales": ["Two pieces of toast are shown. they are brown in color with some holes in them as well as pieces of white particles on crust.", "The bread has visible grains.", "There are nuts and grains seen in the bread."], "image": "train2014/COCO_train2014_000000377669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275046, "question_id": "StS2wM9RJoDGGThBdYfNW6", "question": "Who is the man wearing the white shirt?", "choices": ["intruder", "zookeeper", "visitor", "farmer"], "correct_choice_idx": 1, "direct_answers": ["zookeeper", "poacher", "owner", "zookeeper", "zookeeper", "elephant tender", "elephant keeper", "elephant handler", "animal person", "zoo worker"], "difficult_direct_answer": false, "rationales": ["The man appears to be preparing something for the elephants.", "The man is taking care of the animals.", "This is at a zoo and he is their keeper."], "image": "train2014/COCO_train2014_000000275046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174721, "question_id": "StUiQ6wco2VqqvmzmYNuLR", "question": "What word is missing from the phrase that ends in America?", "choices": ["good", "wonderful", "productive", "hello"], "correct_choice_idx": 0, "direct_answers": ["united states", "good morning", "good", "good", "good", "good", "good morning", "good", "good morning", "good"], "difficult_direct_answer": false, "rationales": ["Good morning america is a television show.", "The blue sign in the distance is for the morning show good morning america but only the last two words are visible.", "Good morning america is a show on television."], "image": "val2014/COCO_val2014_000000174721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360673, "question_id": "StVQHSzvaN49ik2DmushL4", "question": "What is this country?", "choices": ["india", "united states", "china", "italy"], "correct_choice_idx": 1, "direct_answers": ["united states", "america", "united states", "america", "united states", "usa", "america", "america", "america", "united states"], "difficult_direct_answer": false, "rationales": ["Dunkin donuts is popular in the usa.", "The words are written in english and they are american companies.", "The business is in this country"], "image": "val2014/COCO_val2014_000000360673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381261, "question_id": "StaGfRChAUBJiQd8dqXZQT", "question": "What is the preferred web browser of the user of this desktop computer?", "choices": ["internet explorer", "mozilla firefox", "opera", "safari"], "correct_choice_idx": 1, "direct_answers": ["unknown", "foxfire", "safari", "firefox", "mozilla firefox", "firefox", "firefox", "firefox", "windows", "opera"], "difficult_direct_answer": false, "rationales": ["As indicated by the red b o logo.", "The firefox icon can be seen.", "Safari is the browser that's up."], "image": "train2014/COCO_train2014_000000381261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10211, "question_id": "SthVSdzqEJnD2iErtyUDF2", "question": "What animals are in the background?", "choices": ["zebras", "tigers", "cows", "leopards"], "correct_choice_idx": 0, "direct_answers": ["zebras", "zebras", "zebras", "zebra", "zebras", "zebras", "zebra", "zebras", "zebras", "zebras"], "difficult_direct_answer": false, "rationales": ["They have black and white stripes", "The animal is the background is horse-like and has a striped coat.", "There are zebras in the background of the photo."], "image": "val2014/COCO_val2014_000000010211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64251, "question_id": "StzTd9XjS9kSC5j8cRm4jF", "question": "What must the train do before the man on the left may pass?", "choices": ["stop", "pass by", "reverse", "levitate"], "correct_choice_idx": 1, "direct_answers": ["move", "pass", "go", "stop", "pass by", "pass", "pass", "move", "stop", "move"], "difficult_direct_answer": false, "rationales": ["The train cannot stop here so the man has to wait for it to pass.", "It might also need to do c, but that's the only way the an can cross the tracks.", "The man is approaching a crosswalk that is currently being blocked by the train. the train would have to no longer be in the crosswalk for the man to continue on in the direction he is facing."], "image": "train2014/COCO_train2014_000000064251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542582, "question_id": "SuR7wKTk43QYAy8ogr7SaA", "question": "What is causing traffic to stop?", "choices": ["pedestrians", "stalled car", "oncoming train", "stalled bus"], "correct_choice_idx": 2, "direct_answers": ["railroad crossing", "street light", "train", "lights", "oncoming train", "tran", "traffic light", "train", "oncoming train", "train"], "difficult_direct_answer": false, "rationales": ["There is a train crossing and the arms to prevent traffic are seen down here.", "The traffic is stopping for an oncoming train.", "The vehicles are held in traffic as the train is nearly approaching."], "image": "train2014/COCO_train2014_000000542582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503081, "question_id": "SuYwTpqV99c5rWjYtwXo6L", "question": "What is the military time?", "choices": ["300", "1215", "1200", "1500"], "correct_choice_idx": 3, "direct_answers": ["1500", "1500", "1500", "fifteen-hundred hours", "1500", "1500", "fifteen hundred", "1500", "1500", "three"], "difficult_direct_answer": false, "rationales": ["The clock on the wall says it is 3:00 which is 1500 in military time.", "The time is visible in the image and based on the military method for time keeping, answer a or c are viable.", "The time on the clock says 3:00. three added to 12:00 is 15:00."], "image": "val2014/COCO_val2014_000000503081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382926, "question_id": "SubBC6WmXeio5itoQYEmgw", "question": "What is the cat in control of?", "choices": ["television", "car", "truck", "radio"], "correct_choice_idx": 0, "direct_answers": ["remote control", "remote", "remote", "remote", "remote", "remote control", "television", "remote", "remote", "tv controller"], "difficult_direct_answer": false, "rationales": ["The controller has many buttons including numbers for changing channels.", "The remote control that the cat has changes the channels on the tv.", "The cat watches tv."], "image": "train2014/COCO_train2014_000000382926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279716, "question_id": "Sui7sJyjRcuG6VzPRqPQX3", "question": "Which type of surf board is good for short waves?", "choices": ["long", "fish", "short", "fin"], "correct_choice_idx": 1, "direct_answers": ["short board", "longboard", "paddleboard", "fish", "body board", "short board", "longboard", "shortboard", "longboard", "longboard"], "difficult_direct_answer": false, "rationales": ["A longer in size surfboard is better for surfing shorter waves.", "They are the best because of their increased length and volume.", "Traditionally smaller boards are better for small waves."], "image": "val2014/COCO_val2014_000000279716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94021, "question_id": "Sv3Bp8MwrpwQ2Pkvav3GGy", "question": "These two black cats are most likely what?", "choices": ["couple", "siblings", "strangers", "friends"], "correct_choice_idx": 1, "direct_answers": ["siblings", "siblings", "siblings", "black cats", "cute", "siblings", "related", "sleeping", "relaxing", "related"], "difficult_direct_answer": false, "rationales": ["These cats look alike.", "The two black cats look very similar and could be siblings.", "The way they cuddle shows they are siblings."], "image": "val2014/COCO_val2014_000000094021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432378, "question_id": "SvRXzjTCWeNjvgSKcjafhY", "question": "What type of uniforms are the people wearing?", "choices": ["army", "chef", "school", "stewardess"], "correct_choice_idx": 2, "direct_answers": ["school", "school", "school", "school", "school", "school", "school", "school", "school", "school"], "difficult_direct_answer": false, "rationales": ["The uniforms are for school.", "The people are getting into a school bus in formal clothes.", "The people are wearing a school uniform with their bookbags."], "image": "val2014/COCO_val2014_000000432378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377592, "question_id": "SvWzTP5EMxSwrMQfdUDUT9", "question": "What device is used to ensure people stop here?", "choices": ["camera", "road spikes", "speedbump", "gate"], "correct_choice_idx": 2, "direct_answers": ["stop sign", "sign", "stop sign", "sign", "speedbump", "stop sign", "stop sign", "sign", "sign", "stop sign"], "difficult_direct_answer": false, "rationales": ["A raised section of road will slow a car down.", "The black and yellow striped obstacle on the road is a speedbump.", "There is a raised section of concrete across the road with yellow markings on it."], "image": "train2014/COCO_train2014_000000377592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536494, "question_id": "SvXgt3eB2TY68kmgTZbQ5D", "question": "What is this sport name is called?", "choices": ["surfing", "skate boarding", "skate driving", "parachuting"], "correct_choice_idx": 1, "direct_answers": ["skateboarding", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "skate boarding", "skateboarding", "skateboarding", "skateboarding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The sport involves the use of a skateboard, as shown here, under the athlete's* red sneakers.", "The man is on a board with wheels.", "A man is on a skateboard on the sidewalk."], "image": "train2014/COCO_train2014_000000536494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110482, "question_id": "Svdc4WHjAtgdRdDyaqygZh", "question": "What area of the United States has this type of landscape?", "choices": ["mid atlantic", "eastern", "western", "mid west"], "correct_choice_idx": 2, "direct_answers": ["south", "jackie", "texas", "southwest", "southern", "desert", "southwestern", "southwest", "new mexico", "western"], "difficult_direct_answer": true, "rationales": ["A person on a horse walks through an dry, dessert type area.", "The landscape is open and dry like in cowboy movies.", "The land is a dessert with no trees."], "image": "val2014/COCO_val2014_000000110482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274470, "question_id": "SvryZMmi6gTvpnD9qD8DMU", "question": "What has washed up on the beach?", "choices": ["seaweed", "leaves", "hair", "string"], "correct_choice_idx": 0, "direct_answers": ["seaweed", "seaweed", "seaweed", "boats", "seaweed", "seaweed", "seaweed", "seaweed", "boats", "dog"], "difficult_direct_answer": false, "rationales": ["Seaweed is on the sand.", "It washes up on beaches very frequently.", "The seaweed is on the beach."], "image": "val2014/COCO_val2014_000000274470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161084, "question_id": "SvxnVJUJco2QpXusRVRjvE", "question": "What appliance is next to the microwave?", "choices": ["refrigerator", "toaster", "oven", "dishwasher"], "correct_choice_idx": 1, "direct_answers": ["dish", "toaster", "dish", "toaster", "toaster", "toaster", "toaster", "rice cooker", "toaster", "toaster"], "difficult_direct_answer": false, "rationales": ["You can see the slots where bread goes in on this object and it looks just like a toaster.", "The appliance has two slots on top.", "It is used to toast bread before being eaten."], "image": "train2014/COCO_train2014_000000161084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253955, "question_id": "Sw8Sdqq4SYyDcNke7GVCKP", "question": "What is this cake called?", "choices": ["princess cake", "rose cake", "green cake", "plum cake"], "correct_choice_idx": 0, "direct_answers": ["birthday cake", "princess cake", "swiss roll", "jelly cake", "bunt", "melon cake", "key lime", "cheesecake", "bundt cake", "dessert"], "difficult_direct_answer": true, "rationales": ["It's green and has layers.", "The cake is in the same shape and style as a princess cake. the domed top and colorful outside is indicative of as much.", "This is a special kind of cake that is made."], "image": "train2014/COCO_train2014_000000253955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5394, "question_id": "SwQZQJm3jzkdBnbDH5Ewuu", "question": "How many passengers can this boat carry?", "choices": ["two", "one", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "six", "four", "four", "four", "two", "two", "three", "four"], "difficult_direct_answer": false, "rationales": ["One person can ride in the front of the boat, and two people can sit in the back of the boat.", "The image shows slightly two more chairs to the right side.", "It looks as if it can for four people"], "image": "val2014/COCO_val2014_000000005394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118846, "question_id": "Swo58qBdYBJqjS2vdQKcZq", "question": "What is the man expressing?", "choices": ["sorrow", "confidence", "surprise", "joy"], "correct_choice_idx": 2, "direct_answers": ["surprise", "shock", "shock", "shock", "surprise", "shock", "crows", "surprise", "surprise", "surprise"], "difficult_direct_answer": false, "rationales": ["By what he is doing to his hair and look on his face, you can tell his expression.", "A man is looking straight ahead with his mouth hanging open and his brow wrinkling from his raised eyebrows.", "He looks like he's surprised to be doing this photo"], "image": "val2014/COCO_val2014_000000118846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561066, "question_id": "Swp69seZB7TwH5VbNbCgWz", "question": "What country's flag can we see on top of the roof?", "choices": ["united states", "italy", "spain", "canada"], "correct_choice_idx": 3, "direct_answers": ["canada", "n/a", "mexico", "n/a", "canada", "canada", "canada", "canada", "canada", "canada"], "difficult_direct_answer": false, "rationales": ["The flag is from canada.", "It is a canadian flag.", "The flag has two red stripes and one white stripe with a red maple leaf."], "image": "train2014/COCO_train2014_000000561066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83850, "question_id": "SwzgnY5Seo3qeSwfMQuSN4", "question": "What white items flavor this pizza?", "choices": ["onions", "bacon", "pepperoni", "olives"], "correct_choice_idx": 0, "direct_answers": ["cheese", "onions", "cheese", "onions", "mozzarella", "onion", "onion", "mushroom", "onions", "onion"], "difficult_direct_answer": false, "rationales": ["Onions are on the pizza.", "The onions have been as a topping on the pizza.", "The other options aren't the right color."], "image": "val2014/COCO_val2014_000000083850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210252, "question_id": "SxMxsaJuWVezKgWiTPaHDy", "question": "Which of these men would you call if you were locked out of your car?", "choices": ["hammer man", "no one", "cell phone", "old lady"], "correct_choice_idx": 0, "direct_answers": ["sledgehammer man", "right man", "right", "locksmith", "locksmith", "right", "locksmith", "right", "right side", "hammer man"], "difficult_direct_answer": false, "rationales": ["The man could use the hammer to break a car window and let you in.", "The man with the hammer looks more rugged.", "The man with the hammer is useful."], "image": "train2014/COCO_train2014_000000210252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79893, "question_id": "Sxp8Tfat4PWvvioHKeuntf", "question": "What is required to open the flow of water?", "choices": ["wrench", "saw", "hammer", "drill"], "correct_choice_idx": 0, "direct_answers": ["wrench", "machine", "tool", "wrench", "firefighter", "special tool", "wrench", "firetruck", "wrench", "key"], "difficult_direct_answer": false, "rationales": ["A wrench is needed to open the hydrant.", "A wrench can be used to screw open the hydrant.", "You need a wrench to open it."], "image": "train2014/COCO_train2014_000000079893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212575, "question_id": "Sy32ASzENcMpxdr8wv5k5F", "question": "The glare from the sun may interfere with a persons ability to do what while cooking?", "choices": ["touch", "think", "smell", "see"], "correct_choice_idx": 3, "direct_answers": ["see", "wash dishes", "see", "see clearly", "see", "vision", "wash dishes", "see", "not cooking", "see"], "difficult_direct_answer": false, "rationales": ["Someone who's blinded by the light might not be able to see.", "Sun provides light, and too much light towards one's eyes can make you go slightly blind for a second", "The glare will mess up their ability to see well."], "image": "train2014/COCO_train2014_000000212575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163333, "question_id": "SyDPydZSAz5gmifxLD7dZU", "question": "What mountain range might be seen from this vehicle?", "choices": ["appalachian mountains", "swiss alps", "sierra nevadas", "rocky mountains"], "correct_choice_idx": 1, "direct_answers": ["alps", "alps", "rockies", "alps", "uetliberg", "alps", "alps", "alps", "swiss alps", "swiss alps"], "difficult_direct_answer": false, "rationales": ["The brand name on front of this vehicle is ubs polybahn. this is a well known railway company that operates in the swiss alps.", "The transport is at the mountains.", "The name is in another language"], "image": "val2014/COCO_val2014_000000163333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465969, "question_id": "SyHErZBoyA3Sa7VCU27rSc", "question": "What event might this be?", "choices": ["circus show", "concert", "wrestling show", "award show"], "correct_choice_idx": 0, "direct_answers": ["circus show", "circus", "circus", "circus", "circus", "circus", "circus", "circus", "circus", "circus"], "difficult_direct_answer": false, "rationales": ["There are show elephants", "There is an elephant and a ringleader.", "The event is a circus show."], "image": "train2014/COCO_train2014_000000465969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127450, "question_id": "SyYPQ9ehH67EKEw3HtV5fF", "question": "Why is he in midair?", "choices": ["broken wheells", "showing off", "he fell", "bounced there"], "correct_choice_idx": 1, "direct_answers": ["jumping", "motorcycle", "jumping", "showing off", "jumping", "jumping cars", "jumping motorcycle", "jumping cars", "performing stunt", "ramp"], "difficult_direct_answer": false, "rationales": ["The rider is holding his hands away from the bike as part of a display, demonstrating his prowess and fearlessness. this is a common feature of daredevil stunt performances.", "Motorbike tricks involve the rider in the air.", "The man is trying to show off his skills."], "image": "train2014/COCO_train2014_000000127450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456341, "question_id": "Sz6jbMqADDqGdaFaAY7adW", "question": "What are the two men doing?", "choices": ["heimlich", "hugging", "wrestling", "assisting"], "correct_choice_idx": 2, "direct_answers": ["wrestling", "fighting", "playing", "wrestling", "wrestling", "wrestling", "wrestling wii", "wrestling", "wrestling", "playing"], "difficult_direct_answer": false, "rationales": ["One man has is arms around another man and is tugging on him.", "The two men are wrestling in the living room.", "He has him in a chokehold"], "image": "train2014/COCO_train2014_000000456341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540763, "question_id": "Sz6oVnasfy5Hea4fPrYdUh", "question": "What decorates the wall?", "choices": ["hummer", "monster truck", "sun", "tank"], "correct_choice_idx": 2, "direct_answers": ["sun", "sun", "sun", "tolls", "sun", "sun sculpture", "sun", "home depot", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["There is a brown decoration above the toilet. it is not a vehicle.", "The decoration is not a vehicle. it looks like the star that is closest to earth.", "A picture of the sun is above the toilet."], "image": "val2014/COCO_val2014_000000540763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19318, "question_id": "Sz6r33d78vnAEjBSwCuSoL", "question": "What is the top baseball player doing?", "choices": ["riding opponent", "lashing out", "tagging out", "horsing around"], "correct_choice_idx": 2, "direct_answers": ["winning", "hunched over", "touch base", "tagging runner", "straddling", "stealing base", "leapfrog", "tagging runner", "tagging out", "getting out"], "difficult_direct_answer": true, "rationales": ["The standing player has the ball and its touching the falling player with it.", "The baseball player is a defender and has the ball in his hand next to a base runner so he is tagging the player out.", "The person is tagging."], "image": "train2014/COCO_train2014_000000019318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374666, "question_id": "Sz9afNb6TtYtRAGonJLdEX", "question": "What other people are known to sit in locations like this in this position?", "choices": ["pan handlers", "engineers", "football players", "teachers"], "correct_choice_idx": 0, "direct_answers": ["homeless people", "homeless people", "pan handlers", "stressed people", "yoga teachers", "executives", "homeless people", "gymnasts", "homeless", "homeless people"], "difficult_direct_answer": false, "rationales": ["Panhandlers beg for money on streets.", "They stay where a lot of people walk by.", "A woman is sitting on the sidewalk."], "image": "train2014/COCO_train2014_000000374666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384673, "question_id": "SzTyLtowQyyBJ89LrufLJm", "question": "Who are these grownups?", "choices": ["cow buyers", "medical workers", "farmers", "visitors"], "correct_choice_idx": 1, "direct_answers": ["veterinarian", "scientists", "veterinarians", "doctors", "doctors", "livestock inspectors", "doctors", "medical workers", "doctors", "doctors"], "difficult_direct_answer": false, "rationales": ["The men are wearing white.", "They are dressed like medical workers.", "They are dressed in white coats so they are likely doctors."], "image": "train2014/COCO_train2014_000000384673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512702, "question_id": "SzW5zGAPUmAcgevtJynxJF", "question": "Which direction will the man go next?", "choices": ["left", "up", "right", "forward"], "correct_choice_idx": 3, "direct_answers": ["forward", "forward", "straight ahead", "left", "up ramp", "down south", "straight", "straight", "straight towards", "right"], "difficult_direct_answer": false, "rationales": ["The man is trying to go forward.", "There are lanes on this skate ramp and his lane requires him to go this direction from where he is standing and from the direction he is facing in relation to the skate ramp.", "The direction is forward."], "image": "train2014/COCO_train2014_000000512702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269508, "question_id": "SzZuBsE8kKF2ujRER8jodk", "question": "Which mall business would be most devastated if all crops of this fruit failed?", "choices": ["orange julius", "sonic", "burger king", "w"], "correct_choice_idx": 0, "direct_answers": ["juice vendor", "orange julius", "smoothies", "orange julius", "grocery store", "juice stands", "orange julius", "orange julius", "food court", "orange"], "difficult_direct_answer": false, "rationales": ["The fruit on this tree are oranges. orange julius sells juices containing oranges.", "Orange fruit is hanging from trees.", "These are citrus fruit. burger king and sonic do not use large quantities of citrus fruit."], "image": "train2014/COCO_train2014_000000269508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542717, "question_id": "Szp7xuv3BALbirgyXEFjhV", "question": "Which food gives you the most starch?", "choices": ["tomato", "eggplant", "potato", "green bean"], "correct_choice_idx": 2, "direct_answers": ["potato", "potatoes", "potatoes", "potato", "potatoes", "potato", "potato", "potatoes", "potatoes", "potatoes"], "difficult_direct_answer": false, "rationales": ["This answer can be found without looking at picture.", "Potatoes are very starchy.", "The spud is full of starch."], "image": "val2014/COCO_val2014_000000542717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39106, "question_id": "Szvp26yTtaDxi5c2BFYfo2", "question": "What are these young guys doing?", "choices": ["working out", "fighting", "dancing", "gaming"], "correct_choice_idx": 3, "direct_answers": ["gaming", "gaming", "wii game", "playing wii", "playing wii", "playing game", "gaming", "playing wii", "dancing", "playing wii"], "difficult_direct_answer": false, "rationales": ["The people have a gaming system set up.", "They have wii controllers in their hands", "They are holding controllers that are used only for this activity."], "image": "val2014/COCO_val2014_000000039106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269098, "question_id": "T2xxzE8qHkgSF7B94ak6rm", "question": "For what activity is the standing lamp typically used?", "choices": ["crafting", "dancing", "cooking", "sleeping"], "correct_choice_idx": 0, "direct_answers": ["lighting", "reading", "reading", "reading", "lighting", "reading", "reading", "reading", "reading", "crafting"], "difficult_direct_answer": false, "rationales": ["The lamp is usually used for knitting.", "Standing lamp used to help do more activities that require better visibility such as reading.", "The lamp makes visibility while in the living room area possible. so, it's good for crafters."], "image": "val2014/COCO_val2014_000000269098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60245, "question_id": "T3638zhouuLz8fcCGv5nHx", "question": "What is usually found on this food item?", "choices": ["cherries", "cheese", "chocolate", "mustard"], "correct_choice_idx": 1, "direct_answers": ["pepperoni", "cheese", "cheese", "pepperoni", "tomato sauce", "cheese", "sauce", "meat", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["The food on the tray is pizza which is usually topped with melted cheese.", "Traditionally this food item has always had cheese.", "Cheese is the most common topping on pizza."], "image": "train2014/COCO_train2014_000000060245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223369, "question_id": "T3EaoQymnWWe4xe6Rc26EZ", "question": "How are the papers fastened to the appliance behind the baby?", "choices": ["magnets", "static electricity", "glue", "push pins"], "correct_choice_idx": 0, "direct_answers": ["magnets", "magnet", "magnets", "eating", "magnets", "magnet", "overalls", "magnets", "magnets", "magnets"], "difficult_direct_answer": false, "rationales": ["Papers are attached to a refrigerator in a kitchen.", "The papers are fastened to the refrigerator behind the baby with magnets.", "People put magnets on fridges to hold up pieces of fabric. the items are in similar size and shape to that of magnets."], "image": "train2014/COCO_train2014_000000223369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332025, "question_id": "T3cpueyqPJGF4726Mo2XMx", "question": "What are the 3 people standing together for?", "choices": ["hot chocolate", "photograph", "ski jump", "starting line"], "correct_choice_idx": 1, "direct_answers": ["picture taken", "getting photographed", "photo", "photograph", "posing", "photo", "posing", "photo", "getting photographed", "photograph"], "difficult_direct_answer": false, "rationales": ["The people are standing together because they are posing for a photograph taken by the person on the right.", "They are posing for a photo.", "They are posing for a picture."], "image": "val2014/COCO_val2014_000000332025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121366, "question_id": "T42QY7DyU9mMZg9nv5U9bv", "question": "How many people live here?", "choices": ["hundreds", "nobody", "one", "multiple persons"], "correct_choice_idx": 3, "direct_answers": ["one", "two", "one", "one", "three", "one", "multiple persons", "one", "two", "one"], "difficult_direct_answer": false, "rationales": ["A lot of people live here since there are bulk foods shown.", "There are two people in the picture", "The fridge shows that there are many people people in area."], "image": "train2014/COCO_train2014_000000121366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467778, "question_id": "T45gs2szg54DdCwoptmMpZ", "question": "What is the motorcycle using to stay upright?", "choices": ["ledge", "kickstand", "beam", "bench"], "correct_choice_idx": 1, "direct_answers": ["stand", "kickstand", "kickstand", "kick stand", "kickstand", "kick stand", "kickstand", "kickstand", "kickstand", "kickstand"], "difficult_direct_answer": false, "rationales": ["There is a metal bar with a foot holding the bike up and none of the other items would fit under a bike", "The other options don't apply to a motorcycle.", "The motorcycle is using a silver stand to stay upright."], "image": "train2014/COCO_train2014_000000467778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28402, "question_id": "T4LekeJrN4zAk5mQsFzV5q", "question": "What is the posture of the person in back?", "choices": ["bent over", "crossed legs", "squatting", "standing"], "correct_choice_idx": 1, "direct_answers": ["standing", "sitting", "sitting", "crossed legs", "sitting down", "ball catcher", "stiff", "sitting", "sitting", "lounging"], "difficult_direct_answer": false, "rationales": ["The person's legs are crossed.", "The person is sitting in a chair with one leg over the other.", "The posture is crossed legs."], "image": "train2014/COCO_train2014_000000028402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151778, "question_id": "T4cvmJ39ACpYsqPSmJ8CTe", "question": "From what kind of seed did the item on the back of the motorcycle here first originate?", "choices": ["orange", "plum", "avocado", "strawberry"], "correct_choice_idx": 0, "direct_answers": ["orange", "wheat", "acorn", "orange seed", "southern asia", "too blurry", "corn seed", "orange", "pip", "orange"], "difficult_direct_answer": false, "rationales": ["There is an orange sign on the pole behind the motorcycles.", "There are oranges on the tree.", "The trees behind the motorcycle have oranges growing on them."], "image": "train2014/COCO_train2014_000000151778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391374, "question_id": "T4dqujnBtcGGJqrY232dM6", "question": "What kind of landform extends from the middle of the photo to the right in the background?", "choices": ["marine terrace", "beach cusp", "jetty", "seawall"], "correct_choice_idx": 2, "direct_answers": ["sand", "jetty", "peninsula", "beach", "jetty", "beach", "beach", "war", "pier", "beach"], "difficult_direct_answer": false, "rationales": ["There's a jetty available in the background.", "The woman is near a jetty.", "The landform is the jetty."], "image": "train2014/COCO_train2014_000000391374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105516, "question_id": "T4fantHx8VgxvgUkrnDeqC", "question": "Why is the man swinging his arms?", "choices": ["dancing", "stretching", "swatting flies", "swatting ball"], "correct_choice_idx": 3, "direct_answers": ["hit ball", "playing tennis", "playing tennis", "hitting ball", "hit ball", "hit ball", "hitting ball", "swatting ball", "hitting ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The man is swinging his arms so he can hit the tennis ball back over the net.", "The man wants to hit the ball.", "The man is trying to hit the ball."], "image": "train2014/COCO_train2014_000000105516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564705, "question_id": "T4i2YHyaiNoK7h8rtFsyWt", "question": "Why have these people gathered?", "choices": ["worship", "fly kites", "go swimming", "dance"], "correct_choice_idx": 1, "direct_answers": ["kite flying", "fly kites", "flying kites", "fly kites", "fly kites", "kite flying", "flying kites", "fly kite", "fly kites", "fly kites"], "difficult_direct_answer": false, "rationales": ["Many people are looking up towards the sky. there are many objects that are fluttering in sky with strings attached.", "People are gathered together. there are multiple kites in the air.", "They are all there to fly kites."], "image": "train2014/COCO_train2014_000000564705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410328, "question_id": "T5J2dVRgKC5bpXbYNqVaFF", "question": "What is number 12 doing?", "choices": ["catching ball", "throwing ball", "cleaning area", "hitting batter"], "correct_choice_idx": 0, "direct_answers": ["catch", "catching ball", "catching", "catching", "catching", "catching", "catching ball", "dodging ball", "catching", "catching"], "difficult_direct_answer": false, "rationales": ["He has crouched down and has his hand out to obtain the ball.", "Traditionally this type of player receives the ball from the pitcher in baseball.", "Number 12 is the umpire so he is catching the ball."], "image": "val2014/COCO_val2014_000000410328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11842, "question_id": "T5Swd8mAkAJGtaZEDiGQH2", "question": "What are they hoping to collect?", "choices": ["coins", "points", "ideas", "signatures"], "correct_choice_idx": 0, "direct_answers": ["donations", "change", "change", "change", "money", "coins", "donations", "change", "money", "coins"], "difficult_direct_answer": false, "rationales": ["The sign needs coins.", "The machine has a slot for inserting coins.", "You can put coins into meters. coins are a form of money that is often collected to help people."], "image": "train2014/COCO_train2014_000000011842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114745, "question_id": "T5rXjPgJcVVA3eMAzANiby", "question": "What is the person dressed as?", "choices": ["baby", "cat", "elf", "gorilla"], "correct_choice_idx": 3, "direct_answers": ["gorilla", "gorilla", "gorilla", "gorilla", "gorilla", "gorilla", "gorilla", "gorilla", "gorilla", "gorilla"], "difficult_direct_answer": false, "rationales": ["They have a furry head and hands", "It took me a few seconds to figure it out but all that black fur made the answer obvious.", "The black thick fur of the outfit shows the person is pretending to be a gorilla."], "image": "val2014/COCO_val2014_000000114745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416643, "question_id": "T684SR2QjJjJEGWV6EsYQi", "question": "Why are horses eyes covered?", "choices": ["superstition", "insects", "confusion", "style"], "correct_choice_idx": 1, "direct_answers": ["avoid distractions", "vision restriction", "force forwardfocus", "to guide", "insects", "wearing blinders", "protect eyes", "avoid startle", "blinders", "protect eyes"], "difficult_direct_answer": true, "rationales": ["The eye patches keep away distracting flies.", "The horses bat away flies.", "The horses do not want bugs in their eyes."], "image": "train2014/COCO_train2014_000000416643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209755, "question_id": "T6JuJZKeNtLp3BiSfrQ59k", "question": "The orange item here is frequently pictured with what character?", "choices": ["garfield", "bugs bunny", "super mario", "pikachu"], "correct_choice_idx": 1, "direct_answers": ["bugs bunny", "bugs bunny", "bugs bunny", "rabbit", "rabbit", "bugs bunny", "bugs bunny", "rabbit", "bugs bunny", "bugs bunny"], "difficult_direct_answer": false, "rationales": ["Carrots are the trademark food item of the cartoon character bugs bunny.", "The orange item is a carrot based on its size and shape. this food is known to be commonly seen with answer a.", "Rabbits eat carrots"], "image": "val2014/COCO_val2014_000000209755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282503, "question_id": "T6QMZ2m9hGC59UVqqh7kv7", "question": "Who was drinking from the coke bottle?", "choices": ["couch", "woman", "baby", "computer"], "correct_choice_idx": 1, "direct_answers": ["woman", "mother", "woman", "woman", "woman", "woman", "woman", "woman", "lady", "mom"], "difficult_direct_answer": false, "rationales": ["The woman on the laptop was drinking from the coke bottle and now it is empty.", "Children don't drink this beverage", "There are two visible people and one is a baby. it would be inadvisable for health reasons for a baby to drink a coke."], "image": "train2014/COCO_train2014_000000282503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279824, "question_id": "T6a2JYiQjvpterW8JxKr9g", "question": "What is the style that added the picture of the boy to the image called?", "choices": ["photo", "addition", "superimposed", "added"], "correct_choice_idx": 2, "direct_answers": ["edit", "photo shop", "photoshoped", "huck finning", "editing", "green screen", "editing", "photoshop", "meme", "superimposed"], "difficult_direct_answer": true, "rationales": ["He was added in after the picture was taken", "The picture appears to have things added into the image that were taken from other sources based on their non-relation to each other. this technique would be known as answer a.", "The boy has been inserted over the imagery and is thus superimposed."], "image": "val2014/COCO_val2014_000000279824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297770, "question_id": "T6hy84xYMj7qnFjjmnBcc2", "question": "Which way can those cars turn?", "choices": ["their right", "no turn", "unclear", "their left"], "correct_choice_idx": 3, "direct_answers": ["right", "left", "right", "left", "right", "left", "right", "left", "right", "their left"], "difficult_direct_answer": false, "rationales": ["The sign is facing right which is the car driver's left.", "The cars go left.", "The one way sign points the car to the left."], "image": "train2014/COCO_train2014_000000297770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492236, "question_id": "T6rRuk8dwKapHKctLbuf75", "question": "What is the scene being reflected off of?", "choices": ["window", "mirror", "computer screen", "water"], "correct_choice_idx": 1, "direct_answers": ["wedding", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "party", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["The gold picture frame at the bottom shows what it.", "The scene is reflected off of a mirror on the back of the bar.", "The scene has a mirror."], "image": "train2014/COCO_train2014_000000492236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334125, "question_id": "T6sUcUAZTjuRvsUfm2DKpq", "question": "The items that are square showing a sort of necklace are used for what purpose?", "choices": ["snacks", "mail", "flying", "computer chips"], "correct_choice_idx": 1, "direct_answers": ["key chains", "writing", "mail", "coin purse", "religious", "attaching", "learning", "religion", "water bottle", "writing"], "difficult_direct_answer": true, "rationales": ["The objects in question are stamps which would be used for postage.", "The square items next to the backpack are postage stamps that celebrate a special necklace.", "There are small sticker like items that are shaped as squares, which are stamps."], "image": "val2014/COCO_val2014_000000334125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464845, "question_id": "T6uGJ2dbeyHw3nMCrLVAZw", "question": "What mode of transportation is seen beside the monitor?", "choices": ["airplane", "helicopter", "truck", "car"], "correct_choice_idx": 0, "direct_answers": ["airplane", "airplane", "airplane", "airplane", "airplane", "airplane", "airplane", "airplane", "plane", "airplane"], "difficult_direct_answer": false, "rationales": ["Besides the shape of the model, the name kingfisher is a former airlines.it had its licensed cancelled in the mid otts.", "There is a model airplane next to the monitor.", "There is a model airplane there."], "image": "train2014/COCO_train2014_000000464845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104974, "question_id": "T6v84RpqAZBWr7TfKqufGZ", "question": "Which direction is this skier trying to go?", "choices": ["nowhere", "sideways", "up", "down"], "correct_choice_idx": 2, "direct_answers": ["up", "uphill", "uphill", "right", "uphill", "upward", "up", "up hill", "right", "down"], "difficult_direct_answer": false, "rationales": ["The skier is trying to go up.", "The skiier is faced uphill so he is trying to go up.", "The direction is up."], "image": "train2014/COCO_train2014_000000104974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22863, "question_id": "T78PDGfG7h5vt5BC55RQFX", "question": "With what are these men focusing in on with their devices?", "choices": ["nothing", "videos", "cards", "food"], "correct_choice_idx": 3, "direct_answers": ["food", "food", "food", "cameras", "cameras", "food", "food", "food", "camera", "zoom feature"], "difficult_direct_answer": false, "rationales": ["The men are taking pictures at something on the table. they are in a kitchen.", "They are capturing the plates on camera.", "The men are looking at the sushi."], "image": "train2014/COCO_train2014_000000022863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362005, "question_id": "T7CFCo5DaPeKijFKnZ6fuU", "question": "What is the area the man is skating in made of?", "choices": ["metal", "plastic", "concrete", "wood"], "correct_choice_idx": 2, "direct_answers": ["concrete", "concrete", "concrete", "concrete", "concrete", "concrete", "concrete", "concrete", "concrete", "concrete"], "difficult_direct_answer": false, "rationales": ["The area is concrete.", "The surface is grey and dull like concrete.", "The skate bowl is made of concrete."], "image": "val2014/COCO_val2014_000000362005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228876, "question_id": "T7GN2EQMxo82VWx9j2c6Sx", "question": "What product can you buy from one of the mentioned companies?", "choices": ["food", "clothes", "computers", "medicine"], "correct_choice_idx": 2, "direct_answers": ["computers", "computers", "computer", "computers", "computer", "computers", "dell laptop", "computers", "computer", "computers"], "difficult_direct_answer": false, "rationales": ["Dell sells computers.", "The product is a computer.", "There is a sign behind the sitting man. it has a dell logo on it."], "image": "train2014/COCO_train2014_000000228876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398140, "question_id": "T7HFpPbcWumCEGpL22JR8r", "question": "Why is the train on a bridge?", "choices": ["crossing river", "stolen", "is broken", "lost"], "correct_choice_idx": 0, "direct_answers": ["moving", "passing by", "crossing river", "over lake", "crossing bridge", "transportation", "crossing", "crossing river", "crossing", "it's crossing"], "difficult_direct_answer": false, "rationales": ["Trains can't pass through water.", "It is there to cross the river to the other side of the road.", "To cross the river, the train must use the bridge."], "image": "train2014/COCO_train2014_000000398140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418074, "question_id": "T7au4jif4pjtaqk2e94GJd", "question": "Whose footsteps is she following in?", "choices": ["simone biles", "serena williams", "nancy lopez", "allison felix"], "correct_choice_idx": 1, "direct_answers": ["serena williams", "serena williams", "serena williams", "famous player", "tennis player", "serena williams", "tennis pros", "steffie graf", "tennis player", "player"], "difficult_direct_answer": false, "rationales": ["She is one of the best to ever play the game of tennis.", "The footsteps are williams'.", "She is playing tennis."], "image": "train2014/COCO_train2014_000000418074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472078, "question_id": "T7eyjbBcLMFsW4QPjy6fSS", "question": "Why are sticks stuck into the snow?", "choices": ["tradition", "visibility", "guidance", "style"], "correct_choice_idx": 2, "direct_answers": ["for skiing", "skis", "path", "guide skiers", "guidance", "braking", "for support", "skiers", "ski poles", "to ski"], "difficult_direct_answer": true, "rationales": ["They show how steep the hill is. they provide a path for skiiers.", "The sticks are marking out the correct path because it's easy to get disoriented on the white snow.", "The sticks help guide the people through the snow."], "image": "val2014/COCO_val2014_000000472078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530293, "question_id": "T7jzpRUCWsYquHQmvPUxtY", "question": "Who will try to get the ball from the person who touches it?", "choices": ["yellow person", "21", "coach", "referee"], "correct_choice_idx": 0, "direct_answers": ["player", "opposing player", "opponent", "opponent", "goalie", "opposing team", "yellow person", "opposite team", "opponent player", "other team"], "difficult_direct_answer": true, "rationales": ["The referee can call out the person who touches the ball.", "Two soccer teams are on a field and a player in blue has the ball.", "There are two teams and the people in blue are playing against yellow. the person is blue has the ball."], "image": "train2014/COCO_train2014_000000530293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108838, "question_id": "T7uDDrcfD42LUozjD3pMSz", "question": "What is the woman hoping to do as she holds her bike rather than rides on it?", "choices": ["cross street", "get exercise", "relieve injury", "park"], "correct_choice_idx": 0, "direct_answers": ["cross street", "cross street", "cross street", "cross street", "cross street", "cross walk", "take across", "cross street", "cross street", "ride bike"], "difficult_direct_answer": false, "rationales": ["The woman appears in the street and there is a man crossing towards her which means it is likely a labelled crosswalk.", "The woman is looking like she wants to cross the street.", "She wants to cross the busy street."], "image": "val2014/COCO_val2014_000000108838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109434, "question_id": "T7yqzWLoxbVDupPR3yhvag", "question": "What type of HVAC system conditions the air in the bathroom?", "choices": ["mini-split", "hydronic", "radiant", "central air"], "correct_choice_idx": 3, "direct_answers": ["vents", "central air", "vent", "central air", "air system", "air conditioning", "heater", "central", "floor", "sophisticated system"], "difficult_direct_answer": true, "rationales": ["The system is central air.", "There is a grate visible and identifiable on the ground which would be used with an hvac system of answer a.", "There is central air given the placement of the vent."], "image": "val2014/COCO_val2014_000000109434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531963, "question_id": "T85aLsQBmQY99zdVu7ThQd", "question": "What steel utensil is on the right?", "choices": ["spatula", "skewer", "whisk", "fork"], "correct_choice_idx": 2, "direct_answers": ["whisk", "whisk", "whisk", "whisk", "whisk", "chop sticks", "whisk", "chop sticks", "chop sticks", "whisk"], "difficult_direct_answer": false, "rationales": ["This tool is used to mix things up that are lighter.", "There is a steel utensil whisk on the right.", "It is a large whisk."], "image": "train2014/COCO_train2014_000000531963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531662, "question_id": "T8PoZweCYoEEAZqypvbQcL", "question": "What propels this boat?", "choices": ["electricity", "sail", "oar", "gas engine"], "correct_choice_idx": 2, "direct_answers": ["oars", "propeller", "paddle", "oar", "oar", "oar", "white", "oars", "oar", "oar"], "difficult_direct_answer": false, "rationales": ["The boat is very basic and the only tool present is an oar.", "There are wooden paddles there that you use to row the boat.", "It's sticking out the side and the other options aren't on the boat."], "image": "train2014/COCO_train2014_000000531662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554620, "question_id": "T8bDJzQsouVxtSJarVC4iq", "question": "What is the woman in purple avoiding here?", "choices": ["talking", "sleet", "rain", "press"], "correct_choice_idx": 2, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["She is using the umbrella so she does not get wet.", "The other options don't apply except c, but a is the common answer.", "The woman is holding an umbrella."], "image": "val2014/COCO_val2014_000000554620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333634, "question_id": "T8mbFto5wQF5gTW9vCAbAA", "question": "Which player is determined to be in the right here?", "choices": ["32", "catcher", "none", "20"], "correct_choice_idx": 3, "direct_answers": ["catcher", "20", "right", "base runner", "twenty", "batter", "runner", "pitcher", "safe", "number 20"], "difficult_direct_answer": true, "rationales": ["Player 20 is on the plate.", "The player is 20.", "By the scene and what signal number 32 is showing, he is right in his assessment of what is happening."], "image": "train2014/COCO_train2014_000000333634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315504, "question_id": "T8poRu7GepGmuGyRxKpvSM", "question": "What is this woman trying to do?", "choices": ["push", "carry", "drop", "run"], "correct_choice_idx": 0, "direct_answers": ["move it", "move toilet", "transport object", "push", "transport toilet", "drag toilet", "haul", "move", "transport toilet", "push toilet"], "difficult_direct_answer": true, "rationales": ["A woman is holding onto a dolly that is tipped back towards her with both hands and looking down at it.", "She is trying to move the toilet.", "The woman is pushing the appliance."], "image": "train2014/COCO_train2014_000000315504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357944, "question_id": "T9TcuQYrMUzvXQSchnfi3o", "question": "What does the vehicle belong to?", "choices": ["football team", "fire department", "police department", "baseball team"], "correct_choice_idx": 2, "direct_answers": ["police", "police", "police", "police", "police department", "police", "police", "police", "police", "police"], "difficult_direct_answer": false, "rationales": ["The word above the vehicle's front vehicle indicates the agency it belongs to.", "The vehicle is a police one.", "The door shows a badge on the side."], "image": "val2014/COCO_val2014_000000357944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143510, "question_id": "T9dKZEuKwE7GprXdzoiUtT", "question": "What will go on top of the veggies?", "choices": ["cheese", "dough", "sauce", "mushrooms"], "correct_choice_idx": 0, "direct_answers": ["spinach", "cheese", "spinach", "cheese", "cheese", "cheese", "spinach", "cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["There is none on the pizza yet", "Cheese will go on top of the veggies.", "This is a pizza and that is usually on the top"], "image": "train2014/COCO_train2014_000000143510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261779, "question_id": "T9jcCLFbCS5DwvH89AEzyn", "question": "What is usually held in the item being sat on here?", "choices": ["dogs", "coffee", "bibles", "clothing"], "correct_choice_idx": 3, "direct_answers": ["clothing", "luggage", "clothing", "clothing", "personal belongings", "clothes", "clothing", "hat", "clothing", "clothes"], "difficult_direct_answer": false, "rationales": ["The woman is sitting on a trunk, not a dog crate, bookcase, or gunny sack.", "This is a luggage trunk for traveling", "Clothing is usually held inside of the trunk where the woman is sitting on top."], "image": "val2014/COCO_val2014_000000261779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150911, "question_id": "T9kzbcxf3cfDgMdnQktVL8", "question": "What are the people doing?", "choices": ["swimming", "riding", "flying", "training"], "correct_choice_idx": 1, "direct_answers": ["riding", "riding elephants", "riding elephants", "riding elephants", "riding", "riding elephants", "ride", "riding elephants", "riding", "riding elephants"], "difficult_direct_answer": false, "rationales": ["Several elephants are in the water and all have at least one person on their back.", "They are riding elephants across the river.", "The people on the elephants are not getting wet."], "image": "train2014/COCO_train2014_000000150911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429040, "question_id": "T9zvsbrnEHLdJ7au22ZjAh", "question": "The remote control placed on top of the black cat controls what object?", "choices": ["cable box", "vcr", "dvd player", "television"], "correct_choice_idx": 0, "direct_answers": ["television", "television", "tv", "television", "television", "tv", "tv", "cable box", "tv", "television"], "difficult_direct_answer": false, "rationales": ["It has channel and volume buttons", "That what the remote controls.", "The controller is identifiable by the size, shape and the layout of the buttons."], "image": "train2014/COCO_train2014_000000429040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320432, "question_id": "TAGAQ4xLvmzaTi4mL72iBx", "question": "What's the name of the meeting the group are at?", "choices": ["party", "convention", "lobby", "ruling"], "correct_choice_idx": 1, "direct_answers": ["press conference", "realtors", "convention", "realtor meeting", "business meeting", "press conference", "business", "no idea", "conference", "press conference"], "difficult_direct_answer": false, "rationales": ["They are all dressed up in a room with name tags on", "This can be answered by a process of elimination. all the other choices are not even remotely possible.", "Many people are waiting to hear from a speaker. they have work laptops and lanyards."], "image": "train2014/COCO_train2014_000000320432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373571, "question_id": "TAJyzYiJ36XCwN2c4uqxzT", "question": "What is she doing with the child?", "choices": ["holding captive", "teaching reading", "feeding them", "showing phone"], "correct_choice_idx": 3, "direct_answers": ["picture", "showing phone", "looking phone", "showing phone", "holding", "showing photo", "taking picture", "phone conversation", "showing phone", "talking"], "difficult_direct_answer": false, "rationales": ["The woman is holding her screen so the child can see it.", "The woman is holding a phone. the child is looking at the screen on it.", "The kid is using a phone."], "image": "val2014/COCO_val2014_000000373571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506530, "question_id": "TASzqCr2KBGLtsv47PkjBw", "question": "What is the large brown pastry on the tray?", "choices": ["cruller", "cinnamon roll", "apple fritter", "turnover"], "correct_choice_idx": 1, "direct_answers": ["donut", "cinnamon roll", "danish", "apple fitter", "honey bun", "cinnamon roll", "donut", "donut", "sweet roll", "donut"], "difficult_direct_answer": false, "rationales": ["The texture and consistency of the object with the frosting on top are consistent with answer a.", "To answer this, just look at the pastry and eliminate the other three options. this is not a cruller and is definitely not an apple fitter or turnover.", "The pastry breakfast roll has frosting on top and is flavored with cinnamon."], "image": "train2014/COCO_train2014_000000506530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548773, "question_id": "TAZejXqyzt7CJ3FucM9ztH", "question": "How are these two related?", "choices": ["romantically", "enemies", "parent child", "siblings"], "correct_choice_idx": 0, "direct_answers": ["friends", "brother/sister", "couple", "siblings", "romantically", "married", "spouses", "girlfriend boyfriend", "married", "spouse"], "difficult_direct_answer": true, "rationales": ["Their embrace is suggesting that they know each other intimately.", "The two are very close together with his around around her and her hand on his arm. very couple-like.", "The people are embracing each other in a familiar and loving way that would be associated with two people connected in the manner of answer a."], "image": "train2014/COCO_train2014_000000548773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443075, "question_id": "TAimqYjvLGCbhT4rSez37Y", "question": "What has the boy done with the frisbee?", "choices": ["made it", "tackled it", "caught it", "threw it"], "correct_choice_idx": 2, "direct_answers": ["caught", "caught it", "catch it", "catch it", "caught it", "caught frisbee", "caught it", "caught it", "caught frisbee", "caught it"], "difficult_direct_answer": false, "rationales": ["This appears to be the answer based on the positioning of his hands and body.", "The boy has caught the frisbee.", "The boy has the frisbee in both hands clasped in front of him. for him to be in this position with the frisbee it would be unlikely to be doing any other action known to be involved with frisbee."], "image": "train2014/COCO_train2014_000000443075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559884, "question_id": "TAsNVPF62cKQkwSaPvStcn", "question": "What is he doing?", "choices": ["stealing sign", "eating", "directing traffic", "resting"], "correct_choice_idx": 2, "direct_answers": ["holding stopsign", "holding sign", "holding stopsign", "directing traffic", "displaying stop-sign", "directing traffic", "stopping traffic", "traffic control", "stopping traffic", "directing traffic"], "difficult_direct_answer": false, "rationales": ["This man is most likely a member of a road crew fixing a stretch of asphalt in this area. he is dressed for maximum visibility and directs the traffic to either stop or to proceed slowly.", "The man is directing traffic.", "His vest indicates he is a construction worker, and they are likely doing roadworks blocking a lane of traffic."], "image": "val2014/COCO_val2014_000000559884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165350, "question_id": "TAtUPXZf7vELYAfHH823NL", "question": "What are the women participating in?", "choices": ["exercise class", "work group", "group meeting", "race"], "correct_choice_idx": 3, "direct_answers": ["bike race", "race", "cycling", "cycling", "bicycle race", "bike race", "cycling competition", "race", "biking", "cycling"], "difficult_direct_answer": false, "rationales": ["All of the women are participating in a race.", "The bikers are in a bicycle race.", "The women are racing."], "image": "train2014/COCO_train2014_000000165350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373177, "question_id": "TAvWgoXaj3fSQHdhRungZD", "question": "What are the blue bottles on the ground used for?", "choices": ["drinking", "practicing", "fueling", "tossing"], "correct_choice_idx": 0, "direct_answers": ["tennis balls", "drinking", "water", "hydration", "gatorade", "water", "replenish electrolytes", "replenish electrolytes", "refreshment", "water"], "difficult_direct_answer": false, "rationales": ["The bottles are to drink.", "The bottles are gatorade bottles.", "They are there so the players can keep refreshed."], "image": "train2014/COCO_train2014_000000373177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103575, "question_id": "TB5WTh2jb5YcYXVNuowpt7", "question": "Which fruit is more expensive to buy at the supermarket?", "choices": ["strawberry", "banana", "apple", "orange"], "correct_choice_idx": 3, "direct_answers": ["banana lemon", "papaya", "orange", "mango", "banana", "orange", "orange", "banana", "lemon", "orange"], "difficult_direct_answer": false, "rationales": ["This is the most likely answer. it would depend on the time of year and location of the market.", "Orange crops have had a difficult few years because of a bacterial disease called \"greening.\" this has made the supply less over the years, making the price go up.", "Oranges cost more than bananas."], "image": "train2014/COCO_train2014_000000103575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293587, "question_id": "TBB6vbjGdUCU7rroYhbF5q", "question": "In which one of these cities can you meet the characters shown here?", "choices": ["fargo", "hershey", "trenton", "anaheim"], "correct_choice_idx": 3, "direct_answers": ["orlando", "orlando", "anaheim", "not clear", "not clear", "orlando", "orlando", "orlando", "not clear", "orlando"], "difficult_direct_answer": false, "rationales": ["Disneyland is in this city", "Disney is the creator and owner of these characters and disneyland is located in california, usa.", "Anaheim features disney."], "image": "train2014/COCO_train2014_000000293587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284691, "question_id": "TBGSoLrJfr7RjWmo5876xc", "question": "Where is the owner of these bags on the way to?", "choices": ["airport", "office", "cafe", "work"], "correct_choice_idx": 0, "direct_answers": ["unpack", "down stairs", "airport", "concert", "airport", "airport", "airport", "trip", "airport", "downstairs"], "difficult_direct_answer": false, "rationales": ["The owner goes to the airport.", "They are going to fly on a plane with their luggage checked in.", "There are suitcases on the cart."], "image": "train2014/COCO_train2014_000000284691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74825, "question_id": "TBMwJXqDvkURNQcPpBbf6b", "question": "What does this animal like to eat?", "choices": ["oleander", "fish", "flowers", "chocolate"], "correct_choice_idx": 1, "direct_answers": ["fish", "fish", "tuna", "fish", "cat food", "catnip", "mice", "meat", "food", "cat food"], "difficult_direct_answer": false, "rationales": ["Cats like fish and mice.", "Cats love meat", "The animal visible is a fish and fish are colloquially known to like eating fish."], "image": "train2014/COCO_train2014_000000074825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312958, "question_id": "TBsQFjVtjqPbnN9DxA9jhq", "question": "Why are the bricks there?", "choices": ["keep warm", "keep dry", "natural formation", "solid surface"], "correct_choice_idx": 3, "direct_answers": ["divide roads", "decoration", "walking", "sidewalk", "solid surface", "barrier", "sidewalk", "sidewalk", "sidewalk", "divider"], "difficult_direct_answer": false, "rationales": ["The bricks provide a solid sidewalk surface.", "This is how sidewalks and roads used to be made so that people didn't have to travel in the mud", "Clydesdale are parked at the curb and to the left are some brick parts of the streets. horses use to travel on this because it was easier on their hooves."], "image": "train2014/COCO_train2014_000000312958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288712, "question_id": "TCAkwZbyW86L5jeUTHKm2i", "question": "What is the man doing?", "choices": ["sleeping", "drinking", "working out", "running"], "correct_choice_idx": 3, "direct_answers": ["catching frisbee", "running", "frisbee", "frisbee", "running", "catching frisbee", "playing frisbee", "catching frisbee", "catching frisbee", "playing frisbee"], "difficult_direct_answer": false, "rationales": ["The man is running since his feet are off the ground.", "The man runs.", "The way his feet are apart he looks to be going fast to get somewhere."], "image": "train2014/COCO_train2014_000000288712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377342, "question_id": "TCQQfMAPLro2B9fecJyDkY", "question": "What sport is the boy playing on the beach?", "choices": ["football", "volleyball", "tennis", "basketball"], "correct_choice_idx": 2, "direct_answers": ["tennis", "tennis", "tennis", "badminton", "tennis", "tennis", "tennis", "tennis", "badminton", "tennis"], "difficult_direct_answer": false, "rationales": ["The boy is holding a racket and this is the only sport listed that requires a racket to play.", "The sport is tennis.", "The boy is holding a racket and a ball which are two pieces of equipment that would be associated with answer a."], "image": "train2014/COCO_train2014_000000377342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143764, "question_id": "TCXPdVVbzVGN9JbTjLCLbe", "question": "What is usually displayed on the chalkboard in these restaurants?", "choices": ["taxes", "specials", "tips", "phone numbers"], "correct_choice_idx": 1, "direct_answers": ["menu", "menu", "menu", "menu", "menu items", "menu", "prices", "price", "specials", "specials"], "difficult_direct_answer": false, "rationales": ["Chalkboards are often used by restaurants to advertise changing specials.", "The specials are listed.", "The chalkboard usually has sales."], "image": "train2014/COCO_train2014_000000143764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490022, "question_id": "TCtL2npXhGUKdMozdmRXKV", "question": "What video game system are these men playing against each other on?", "choices": ["nintendo wii", "xbox 360", "nintendo 64", "playstation 4"], "correct_choice_idx": 0, "direct_answers": ["wii", "wii", "wii", "wii", "wii", "wii", "wii", "wii", "nintendo wii", "wii"], "difficult_direct_answer": false, "rationales": ["The black kid is holding a rectangle controller known for the wii. there's a nunchuk on the table.", "The wii is the system.", "The controller shows the evident."], "image": "val2014/COCO_val2014_000000490022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160101, "question_id": "TCyzfn7cGmqBDC2Hu67rch", "question": "Who is the manufacturer of the bat?", "choices": ["wilson", "louisville", "easton", "mizuno"], "correct_choice_idx": 2, "direct_answers": ["easton", "easton", "easton", "easton", "easton", "easton", "easton", "easton", "easton", "easton"], "difficult_direct_answer": false, "rationales": ["The manufacturer's name is written on the bat.", "The brand is on the bat.", "The name is on the bat"], "image": "train2014/COCO_train2014_000000160101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210081, "question_id": "TD3Cke3A6ARWgFxGSmVdPU", "question": "What season brings this weather?", "choices": ["winter", "summer", "fall", "spring"], "correct_choice_idx": 0, "direct_answers": ["winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["The ground is covered in snow.", "Mountains covered with both snow and skiers signifies a chilly winter day.", "The ground is covered in snow. the people are skiing."], "image": "val2014/COCO_val2014_000000210081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248682, "question_id": "TD5bSzCeejbJykAmDGiT7f", "question": "Why is the dog in midair?", "choices": ["grabbing frisbee", "fell", "thrown", "bounced"], "correct_choice_idx": 0, "direct_answers": ["catching frisbee", "playing frisbee", "grabbing frisbee", "catching frisbee", "jumping", "catching frisbee", "catching frisbee", "catching frisbee", "catching frisbee", "catch"], "difficult_direct_answer": false, "rationales": ["The dog has jumped up in the air and also has a frisbee in his mouth. it is logical to assume that he jumped up in the air to catch the frisbee.", "He jumped up to grab it after it was thrown to him", "He had to leap to grab the object out of the air."], "image": "train2014/COCO_train2014_000000248682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137595, "question_id": "TDA6SYzEH3Tdr649NZZprp", "question": "What comic book company do they probably like?", "choices": ["dc", "image", "marvel", "valiant"], "correct_choice_idx": 2, "direct_answers": ["hanna barbara", "marvel", "marvel", "spiderman", "marvel", "marvel", "marvel", "marvel", "spiderman", "marvel"], "difficult_direct_answer": false, "rationales": ["There is a spider-man figurine on the television and that franchise is owned by this company.", "Spiderman is shown.", "A spiderman statue is on the television and spiderman is a marvel character."], "image": "val2014/COCO_val2014_000000137595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194755, "question_id": "TDAvUpyqFtXDw5Ew2qsQdt", "question": "What beverage is the man drinking in the mug?", "choices": ["tea", "chai", "cappuccino", "milk"], "correct_choice_idx": 2, "direct_answers": ["coffee", "cappuccino", "latte", "coffee", "cappuccino", "latte", "coffee", "cappuccino", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["The man is drinking coffee.", "The foam at the top is how a cappuccino looks.", "It is in a coffee cup and has foam"], "image": "val2014/COCO_val2014_000000194755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452050, "question_id": "TDDgQKMidNQv3a2PUpfj5P", "question": "What kind of shirt does the person most fully prepared to grab the frisbee wear?", "choices": ["cammo", "blue", "white", "black"], "correct_choice_idx": 0, "direct_answers": ["camouflage", "camouflage", "cammo", "camouflage", "frisbee", "sleeveless shirt", "sweatshirt", "camouflage", "camouflage", "t-shirt"], "difficult_direct_answer": false, "rationales": ["The pattern looks like that of military cammo.", "This is a design on his shirt that some wear when they want to blend in with their surroundings.", "A man in assorted greens and browns is in the middle of catching frisbee."], "image": "train2014/COCO_train2014_000000452050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420859, "question_id": "TDFDobgiz4EhuRoijkfZqL", "question": "What will happen to this mans feet if he doesn't stop?", "choices": ["nothing", "get wet", "twist", "burn"], "correct_choice_idx": 1, "direct_answers": ["get wet", "get wet", "stops", "wet", "get wet", "move forward", "get wet", "get wet", "get wet", "get wet"], "difficult_direct_answer": false, "rationales": ["He is headed toward the water", "A man is biking towards the water at the beach.", "The metal will get wet."], "image": "train2014/COCO_train2014_000000420859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275668, "question_id": "TDPFDJuUpSRFwdkfKRg9LM", "question": "Who would stay in this room?", "choices": ["resident", "prisoner", "maid", "traveler"], "correct_choice_idx": 3, "direct_answers": ["travelers", "friends", "tourist", "people", "tourist", "tourists", "traveler", "tourist", "family", "guests"], "difficult_direct_answer": false, "rationales": ["A room with two beds and somewhat commercial carpet can be seen. there are no personal items or picture frames around the room.", "It is distinguishable as a hotel room which is a temporary place for travelers to stay.", "The room appears to be a hotel room based on the decor, beds and background. people who most commonly use a hotel room are answer a."], "image": "val2014/COCO_val2014_000000275668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172121, "question_id": "TDi5uuhuBH2PMyzGEeD7Le", "question": "Why would someone sit at this table?", "choices": ["to work", "to talk", "to sew", "to eat"], "correct_choice_idx": 3, "direct_answers": ["eat", "to eat", "eat pizza", "to eat", "eat pizza", "eat pizza", "to eat", "eat pizza", "eat pizza", "eat pizza"], "difficult_direct_answer": false, "rationales": ["And possibly d if they already did a or wanted to do both.", "There is pizza to eat on it.", "The table is set for a meal with a pizza in the middle."], "image": "train2014/COCO_train2014_000000172121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101985, "question_id": "TDnBVTYupLLC9Te87xTU4U", "question": "Why is he kicking the boy in the face?", "choices": ["wants ball", "is accident", "is angry", "is evil"], "correct_choice_idx": 1, "direct_answers": ["missed ball", "accident", "is accident", "regrettable accident", "accidentally", "on accident", "follow through", "accident", "accident", "accident"], "difficult_direct_answer": false, "rationales": ["The boy on the left was going after the ball, and he did indeed kick it. unfortunately, his foot kept going and he clocked the red headed boy in the face!.", "Kicking someone in the face during a soccer game is usually an accident.", "It was not on purpose he was kicking the ball."], "image": "val2014/COCO_val2014_000000101985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36841, "question_id": "TEHwCEohQv8gpsDnXPLM2g", "question": "This is a dorm room of a student majoring in what?", "choices": ["surveying", "biology", "economics", "theater"], "correct_choice_idx": 0, "direct_answers": ["photography", "surveying", "city", "general studies", "english", "design", "photography", "art", "art", "architect"], "difficult_direct_answer": false, "rationales": ["The person must be majoring in theater since there are actors and actresses posted everywhere.", "The room is for surveying.", "There are books on the desk."], "image": "train2014/COCO_train2014_000000036841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359115, "question_id": "TEJXRgsU8ssW3FcvS9V5HC", "question": "What process is used to create the pillow fabric?", "choices": ["embroidery", "painting", "knitting", "quilting"], "correct_choice_idx": 0, "direct_answers": ["sewing", "spinning", "spun", "spinning", "leather", "sew", "tanning", "embroidery", "sewing", "sewing"], "difficult_direct_answer": false, "rationales": ["The pillow fabric has stitches on it.", "The pillow has been embroidered.", "It has intricate designs sewn in"], "image": "val2014/COCO_val2014_000000359115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364201, "question_id": "TEdRdoMuHtJ3YWqMDZZnqh", "question": "The main color of this vehicle is the same color as what?", "choices": ["grass", "flamingo", "daisy", "sky"], "correct_choice_idx": 3, "direct_answers": ["sky", "blue", "sky", "sky", "sky blue", "ocean", "robins egg", "blueberry", "sky", "sky"], "difficult_direct_answer": false, "rationales": ["The bus is blue.", "The train is blue in color just like the sky.", "Almost the same color on certain days and times of day."], "image": "train2014/COCO_train2014_000000364201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531979, "question_id": "TEnpoLDD7E42TW2nKFCZXR", "question": "What sort of weather is this hat usually associated with?", "choices": ["snow", "hurricanes", "rain", "sun"], "correct_choice_idx": 3, "direct_answers": ["sunny", "hot weather", "sunny", "spring", "summer", "sunny", "sunny", "sunny", "sun", "sunny"], "difficult_direct_answer": false, "rationales": ["The weather is sunny.", "The weather is very bright an the woman is trying to stay cool.", "This is used to keep sun out of the eyes and to protect from the rays"], "image": "train2014/COCO_train2014_000000531979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491000, "question_id": "TF2FCuGfgsFBwnYTXuhRh9", "question": "How many different directions may traffic travel here?", "choices": ["four", "three", "two", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "one", "two", "one", "one way", "one", "two"], "difficult_direct_answer": false, "rationales": ["The street is too narrow to accommodate vehicles going in opposite directions.", "It is a one way street only wide enough for one lane of traffic", "The road is wide enough for one vehicle. thinner streets are often one way streets. two cars could not pass each other on a street that is not wide enough."], "image": "val2014/COCO_val2014_000000491000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507362, "question_id": "TFNCe93xFv574uMDCUa6Ns", "question": "Which direction is okay to go?", "choices": ["backwards", "right", "straight", "left"], "correct_choice_idx": 2, "direct_answers": ["east", "straight", "left", "right", "straight", "green", "right", "straight", "straight", "straight"], "difficult_direct_answer": false, "rationales": ["It is okay to go straight.", "Red streaks of light can be seen. this signifies cars going thru an area of road because light is green ahead.", "There are green lights in the middle."], "image": "val2014/COCO_val2014_000000507362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573700, "question_id": "TFQmi5MPXRAWLoTqN63z72", "question": "What do the items in front of the tires here prevent?", "choices": ["reversals", "speeding", "rolling", "advertising"], "correct_choice_idx": 2, "direct_answers": ["rolling forward", "rolling", "rolling", "rolling", "rolling", "moving", "rolling", "vehicle movement", "rolling away", "motion"], "difficult_direct_answer": false, "rationales": ["The bricks are there as a stopper for the tires in case the bus tries to roll forward.", "The red stoppers keep the tires from moving.", "The items under the tires stop the vehicle from moving."], "image": "train2014/COCO_train2014_000000573700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221864, "question_id": "TFQx6KDDy2xtCEp4jBVtV4", "question": "What is she doing?", "choices": ["resting", "cleaning up", "posing", "eating"], "correct_choice_idx": 2, "direct_answers": ["posing", "skiing", "posing", "resting selfie", "posing", "skiing", "women", "smiling", "smiling", "slope skiing"], "difficult_direct_answer": false, "rationales": ["The woman is smiling for the camera.", "The woman is smiling for the camera.", "She is smiling and isn't moving"], "image": "val2014/COCO_val2014_000000221864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62772, "question_id": "TFWFhALLugEB6DGUE6nrnp", "question": "Who lives in these buildings?", "choices": ["soldiers", "students", "teachers", "foster kids"], "correct_choice_idx": 1, "direct_answers": ["students", "students", "college students", "students", "college students", "students", "students", "students", "college students", "students"], "difficult_direct_answer": false, "rationales": ["Halls are where the college kids live.", "Students live in the dorms.", "Students live in college dorms."], "image": "train2014/COCO_train2014_000000062772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53975, "question_id": "TFXDCaBb85TfU6Yf4QzwcT", "question": "The waiting room is segregated by what?", "choices": ["race", "intelligence", "gender", "age"], "correct_choice_idx": 2, "direct_answers": ["metal", "gender", "metal", "gender", "gender", "gender", "gender", "gender", "gender", "gender"], "difficult_direct_answer": false, "rationales": ["The age matter as women are put together and men are put together.", "There is a specific sign for ladies to wait in a separate room.", "By what's written on the sign it tells you what is being segregated."], "image": "train2014/COCO_train2014_000000053975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485954, "question_id": "TFcyAxZbnHtgdDLKxxt595", "question": "What are they doing with the pie?", "choices": ["feeding horses", "eating it", "hitting horses", "throwing out"], "correct_choice_idx": 0, "direct_answers": ["holding", "eating it", "rupees", "feeding horses", "feeding", "smelling", "feeding horses", "feeding horse", "feeding horse", "feeding horses"], "difficult_direct_answer": false, "rationales": ["The people are holding the pie out for the horses to eat.", "A person is holding out a pie in front of a horse. the horse is sniffing a pie being held out in front of him.", "The pie is being given to the horse."], "image": "train2014/COCO_train2014_000000485954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347167, "question_id": "TFnTsS4yynPhmjT8s93mjQ", "question": "What is the temperature like here?", "choices": ["cool", "quite warm", "below average", "freezing"], "correct_choice_idx": 1, "direct_answers": ["hot", "warm", "warm", "cool", "quite warm", "hot", "warm", "hot", "warm", "warm"], "difficult_direct_answer": false, "rationales": ["The temperature is hot.", "The woman is wearing a tank top. tank tops have no sleeves to help people stay cool.", "It is quite warm today as we can see the woman in the foreground wearing a sundress, and a little girl in the background is wearing one too. in addition, people can be seen dining outside, which they surely wouldn't in winter!."], "image": "train2014/COCO_train2014_000000347167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241390, "question_id": "TG56a7yZ3JFbwxhP36ffs7", "question": "Which is the most common seabird?", "choices": ["murres", "skuas", "terns", "gull"], "correct_choice_idx": 3, "direct_answers": ["gull", "seagull", "gulls", "seagull", "gulls", "seagull", "gulls", "seagull", "gulls", "gull"], "difficult_direct_answer": false, "rationales": ["The answer is internet searchable.", "Birds are flying above the ocean.", "Seagulls are poplar at the beach."], "image": "train2014/COCO_train2014_000000241390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324252, "question_id": "TG5JrBqS5ooVFfabpxPt32", "question": "What is on the boys face?", "choices": ["tattoo", "paint", "ski mask", "glasses"], "correct_choice_idx": 3, "direct_answers": ["sunglasses", "sunglasses", "sunglasses", "sunglasses", "glasses", "glasses", "glasses", "sunglasses", "sunglasses", "glasses"], "difficult_direct_answer": false, "rationales": ["These protect the eyes from the sun", "The boy is wearing glasses.", "Glasses are usually worn over eyes which is wear the object is."], "image": "train2014/COCO_train2014_000000324252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55608, "question_id": "TGCq3y4GjG6rXamgoNqfcG", "question": "What type of climate are these people living in based on the amount of plantains?", "choices": ["tropical", "rain forrest", "polar", "arid"], "correct_choice_idx": 0, "direct_answers": ["tropical", "tropical", "tropical", "tropical", "tropical climate", "tropical", "tropical", "tropical climate", "tropical climate", "tropical"], "difficult_direct_answer": false, "rationales": ["Bananas are a well known tropical fruit.", "The place has to be to grow bananas.", "Plantains grow in warm climates."], "image": "train2014/COCO_train2014_000000055608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329304, "question_id": "TGFZv4bbwSnAaSkRKgZqw6", "question": "Why is the long haired boy touching his shoe?", "choices": ["scratching itch", "tying laces", "undressing", "stretching"], "correct_choice_idx": 1, "direct_answers": ["tying", "tie knot", "tying laces", "tying laces", "tie shoe", "tying", "tying", "tying laces", "tying laces", "tying laces"], "difficult_direct_answer": false, "rationales": ["The long haired boy is touching his shoe, attempting to tie his laces.", "The boy puts his foot up to tie his shoes rather than bend down.", "Two boys are standing on a taller slab. another has his shoes up on the slab and is doing something with shoe."], "image": "val2014/COCO_val2014_000000329304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388070, "question_id": "TGjdtLCiCNEZCH6wa45Fp5", "question": "When did rubber bracelets become popular?", "choices": ["2006", "2001", "2010", "2004"], "correct_choice_idx": 3, "direct_answers": ["yesterday", "eighties", "2000", "late 80's", "nineteen eighties", "wear women", "longtimeago", "2000", "never", "2004"], "difficult_direct_answer": true, "rationales": ["Rubber bracelets were all the rage in 2004.", "Rubber bracelets got crazy popular during the livestrong era.", "The bracelet is from 2004."], "image": "train2014/COCO_train2014_000000388070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325487, "question_id": "THE5ZVm9gXN9wBJvCeTAuC", "question": "What does the man in the center of the field want to achieve?", "choices": ["home run", "walk", "foul", "strike"], "correct_choice_idx": 3, "direct_answers": ["no hitter", "score", "win", "out", "out", "strike", "strike out", "homerun", "scoring", "out"], "difficult_direct_answer": false, "rationales": ["The pitcher is going to try to get the batter out.", "We are viewing a major league baseball game. a pitcher is trying to throw the ball past a hitter.", "He wants the other player to strike out."], "image": "train2014/COCO_train2014_000000325487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148766, "question_id": "THdUPZ7DhpBbdUU4bYPcNS", "question": "Where would this style of porcelain item be found in a house?", "choices": ["kitchen", "laundry room", "bathroom", "garage"], "correct_choice_idx": 2, "direct_answers": ["bathroom", "bathroom", "entrance", "toilet", "toilet tools", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["Usually toilets are in the bathroom.", "The style is the bathroom.", "These are toilets."], "image": "val2014/COCO_val2014_000000148766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276854, "question_id": "THkFcnvvbE424XRP65SreJ", "question": "Why are the baseball players so low?", "choices": ["they're kneeling", "they're sitting", "just short", "in dugout"], "correct_choice_idx": 3, "direct_answers": ["in dugout", "in dugout", "in dugout", "dugout", "in dugout", "dugout", "in dugout", "in dugout", "in dugout", "dugout"], "difficult_direct_answer": false, "rationales": ["The baseball players are in the dugout.", "The dugout is lower than the field. it is where the baseball players wait for their turn during the game.", "They are lower so that the audience can see the field"], "image": "train2014/COCO_train2014_000000276854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22492, "question_id": "TJ2eQiqxpaCwZ6YVVfRieY", "question": "What are those french fries made out of?", "choices": ["normal potato", "plastic", "paper", "sweet potato"], "correct_choice_idx": 3, "direct_answers": ["potatoes", "sweet potatoes", "yam", "sweet potato", "potatoes", "potatoes", "sweet potatoes", "sweet potato", "potatoes", "potatoes"], "difficult_direct_answer": false, "rationales": ["The french fries on the plate are a deep orange color because they are made from sweet potatoes.", "Those are a darker orange and made from sweet potatoes.", "Fries are on a plate. the fries are orange in color."], "image": "val2014/COCO_val2014_000000022492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400477, "question_id": "TJicDMYBvRndwtXy6egEyv", "question": "What type of race is this?", "choices": ["dog racing", "cat racing", "sheep racing", "horse racing"], "correct_choice_idx": 3, "direct_answers": ["horse", "horse", "horse", "horse race", "horse", "horse", "horse race", "horse racing", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The race is for horses.", "There is a picture of a horse on the big flag.", "A sign with a horse on it is in an arena where people stand."], "image": "train2014/COCO_train2014_000000400477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244550, "question_id": "TJoz5pG2XC9jJqcYf92paF", "question": "This kind of animal was the star of what TV show?", "choices": ["garfield", "yogi bear", "mister ed", "lassie"], "correct_choice_idx": 2, "direct_answers": ["horse racing", "mister ed", "mri ed", "mister ed", "horsing ride", "mri ed", "horse show", "horse", "mr ed", "riding show"], "difficult_direct_answer": false, "rationales": ["A horse is jumping in a competition.", "Mister ed was a horse.", "A talking horse was the star of a tv show with the horse's name."], "image": "val2014/COCO_val2014_000000244550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251723, "question_id": "TKDwFH4txhfc6fg9d7EqVh", "question": "What video game console is standing upright?", "choices": ["playstation", "dreamcast", "xbox", "wii"], "correct_choice_idx": 2, "direct_answers": ["xbox", "xbox", "xbox", "xbox", "xbox", "wii", "xbox", "xbox", "asus rog", "xbox"], "difficult_direct_answer": false, "rationales": ["The controller has an \"x\".", "You can tell by the design of the controller, as to what gaming console is shown.", "That system has the trademark green light on the button and the box is white. the controller has the dogbone, chunky boomerang look."], "image": "train2014/COCO_train2014_000000251723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105079, "question_id": "TKVC5m5TwWAqztbsBUotMe", "question": "Who crafted the thing on the person's feet?", "choices": ["company", "shoe maker", "hobbyist", "jeweler"], "correct_choice_idx": 0, "direct_answers": ["burton", "snowboard", "company", "snowboard", "burton", "burton", "snowboard", "burton", "burton", "burton"], "difficult_direct_answer": false, "rationales": ["The name of the company is one the snowboard.", "The company made the snowboard.", "As indicated by the name and logo."], "image": "train2014/COCO_train2014_000000105079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535933, "question_id": "TKctRTMveQB3tcyZLH7XaY", "question": "Why is the plate empty?", "choices": ["for customer", "to throw", "mistake", "not hungry"], "correct_choice_idx": 0, "direct_answers": ["haven't started", "for customer", "unused", "no slice", "no food", "serving", "not served", "unused", "not served", "awaiting pizza"], "difficult_direct_answer": false, "rationales": ["The plate is for the customer.", "For the person to use to put a slice of pizza on it.", "The pizza is in the middle of the table because it is a restaurant and people share the pizza."], "image": "val2014/COCO_val2014_000000535933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274894, "question_id": "TKeuffrHrPsDzESLo5FT4v", "question": "What type of phone can be accessed here?", "choices": ["cellular", "landline", "payphone", "cordless"], "correct_choice_idx": 2, "direct_answers": ["pay phone", "payphone", "emergency", "pay", "pay phone", "pay phone", "cell", "landline", "pay phone", "pay phone"], "difficult_direct_answer": false, "rationales": ["A phone in a booth is in a public place.", "There is a red phone booth near the tower.", "The only phone apparatus clearly visible in the image is answer a."], "image": "train2014/COCO_train2014_000000274894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277793, "question_id": "TKpEdut3QrHqooxS5jh3uM", "question": "What is the exterior of the pet cage made of?", "choices": ["cardboard", "steel", "plastic", "glass"], "correct_choice_idx": 2, "direct_answers": ["plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "nylon", "plastic", "grill", "plastic"], "difficult_direct_answer": false, "rationales": ["The door is made out of steel, but the rest of the cage is made out of a different material. glass or cardboard would not be durable enough.", "The entire cage is made of that.", "The exterior is made of plastic."], "image": "val2014/COCO_val2014_000000277793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357342, "question_id": "TLLVFKDmf8sqYThbkXcnSZ", "question": "What are the animals sleeping on?", "choices": ["mousepad", "pillow", "cushion", "magazine"], "correct_choice_idx": 0, "direct_answers": ["mouse pad", "mousepad", "mouse pad", "mousepad", "desk", "table", "mousepad", "desk", "desk", "mouse pad"], "difficult_direct_answer": false, "rationales": ["The animals are laying on the surface on top of the object that the mouse is used on.", "The item the animals are on has a mouse on it and it's a pad for the mouse.", "There is a computer mouse sitting on it."], "image": "train2014/COCO_train2014_000000357342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434425, "question_id": "TLNhobHh7wZqqiZfYCcYvc", "question": "Which vegetable is called starchy tuber?", "choices": ["ridge gourd", "tomato", "carrot", "potato"], "correct_choice_idx": 3, "direct_answers": ["potato", "potato", "yam", "potato", "potatoes", "potato", "potatoes", "potato", "potato", "potato"], "difficult_direct_answer": false, "rationales": ["There is a potato on the plate.", "Potatoes have the most starch.", "The veggie is a potato."], "image": "train2014/COCO_train2014_000000434425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166889, "question_id": "TMEsWMm8Q2CXbbEAgJzcBs", "question": "What is on the woman's lip who is holding the camera in front of the computer?", "choices": ["mole", "lipstick", "chapstick", "glitter"], "correct_choice_idx": 1, "direct_answers": ["mole", "skin mole", "mole", "mole", "lipstick", "mole", "mole", "mole", "skin mole", "lipstick"], "difficult_direct_answer": false, "rationales": ["The woman is wearing some lipstick on her lips.", "The woman has lipstick on.", "The lipstick is on the lip."], "image": "val2014/COCO_val2014_000000166889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44901, "question_id": "TMH4AgorwVcSiGyQHfhFVD", "question": "Which man has judging power?", "choices": ["middle", "rightmost", "none", "leftmost"], "correct_choice_idx": 1, "direct_answers": ["batter", "umpire", "umpire", "umpire", "rightmost", "umpire", "umpire", "umpire", "umpire", "batter"], "difficult_direct_answer": false, "rationales": ["A ballplayer is swinging the bat while the catcher is trying to catch the ball. man behind him has his hands on knees ready to call pitch.", "The person on the right is the umpire.", "The person on the right is the umpire."], "image": "train2014/COCO_train2014_000000044901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418949, "question_id": "TMN35ynCaZnKKZcPLvNYCW", "question": "What professional athlete wore this number?", "choices": ["nikolai khabibulin", "ichiro suzuki", "andruw jones", "wayne gretzky"], "correct_choice_idx": 2, "direct_answers": ["hitter", "unknown", "number 25", "andruw jones", "baseball player", "number 25", "barry bonds", "barry bonds", "mark mcgwire", "barry bonds"], "difficult_direct_answer": false, "rationales": ["The number is twenty-five.", "It is the same number", "Andruw jones was number 25."], "image": "val2014/COCO_val2014_000000418949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343053, "question_id": "TMcvaFRDVtTmgBFoass6vq", "question": "Where did the money come from?", "choices": ["found it", "tip", "his change", "stole it"], "correct_choice_idx": 2, "direct_answers": ["wallet", "mans wallet", "wallet", "wallet", "pocket", "pocket", "his change", "wallet", "wallet", "man"], "difficult_direct_answer": false, "rationales": ["A man sits at a table in a restaurant and money is on the table.", "The money came from his spare change.", "The money is the change the man had in his pocket."], "image": "val2014/COCO_val2014_000000343053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357860, "question_id": "TMkYAsmVYBm6xpVuAFKZ6D", "question": "What is the silver object near the bull's neck?", "choices": ["ring", "fork", "bell", "spoon"], "correct_choice_idx": 2, "direct_answers": ["bell", "bells", "harness", "bell", "bell", "bells", "bell", "bell", "bells", "bell"], "difficult_direct_answer": false, "rationales": ["The object is a bell.", "The cow is wearing a bell so people can hear it coming.", "The bell makes a noise."], "image": "val2014/COCO_val2014_000000357860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403162, "question_id": "TMpgDZ4FPuAphnNrho2LAb", "question": "What country is highlighted in blue?", "choices": ["greenland", "canada", "angola", "iceland"], "correct_choice_idx": 0, "direct_answers": ["greenland", "greenland", "greenland", "greenland", "greenland", "greenland", "greenland", "greenland", "greenland", "greenland"], "difficult_direct_answer": false, "rationales": ["That is the country near canada's ellesmere island.", "The country of greenland is highlighted in blue on the map.", "A map shows a country east of canada in light blue."], "image": "train2014/COCO_train2014_000000403162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43704, "question_id": "TN5i92rNYckyiqf2k68Mns", "question": "What sport is being shown?", "choices": ["basketball", "rodeo", "frisbee", "football"], "correct_choice_idx": 1, "direct_answers": ["bronco riding", "bronc riding", "rodeo", "rodeo", "rodeo", "rodeo", "horse riding", "rodeo", "rodeo", "rodeo"], "difficult_direct_answer": false, "rationales": ["The person is bucking on a horse at a rodeo.", "There is a rodeo shown with a horse.", "The man is almost falling off the horse."], "image": "train2014/COCO_train2014_000000043704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328810, "question_id": "TND79bPDH5cR85b84vhsF5", "question": "The desire to do what is likely driving the woman to rearrange the magnets?", "choices": ["clean", "color sort", "form words", "aesthetics"], "correct_choice_idx": 2, "direct_answers": ["form words", "spell", "ocd", "write", "spell", "message", "be organized", "ocd", "ocd", "spell"], "difficult_direct_answer": false, "rationales": ["She is putting letters together in groups.", "The desire is to form words.", "A woman is arranging alphabet magnets on a fridge in a line."], "image": "train2014/COCO_train2014_000000328810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340922, "question_id": "TNmjVSNKTwLj7FswVYmzfC", "question": "What is the man participating in?", "choices": ["concert", "sale", "protest", "play"], "correct_choice_idx": 2, "direct_answers": ["protest", "protest", "protest", "protest", "protest", "petition", "protest", "protest", "protest", "petition"], "difficult_direct_answer": false, "rationales": ["The man is making signs for people to hold up in a protest he is participating in.", "The man has protest signs near him.", "The sign is being put up in protest."], "image": "val2014/COCO_val2014_000000340922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242592, "question_id": "TNpik4FVTkBgbBQ5uCCnB7", "question": "What are the colorful machines called?", "choices": ["garbage bins", "mailboxes", "vending machines", "storage lockers"], "correct_choice_idx": 2, "direct_answers": ["vending", "vending machines", "vending machines", "capsule machines", "vending machines", "vending", "vending machines", "vending", "vending", "vending machines"], "difficult_direct_answer": false, "rationales": ["These machines give out something when you put money into them.", "Machines full of candy and trinkets are on a sidewalk.", "You can get prizes from the machines."], "image": "train2014/COCO_train2014_000000242592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245750, "question_id": "TNyWFPPJrqsTwJj4wYvLwJ", "question": "What animal is in the photo?", "choices": ["elephant", "monkey", "zebra", "lion"], "correct_choice_idx": 2, "direct_answers": ["giraffes zebra", "giraffes", "giraffe", "giraffe zebra", "zebra", "giraffe", "giraffe", "giraffe", "zebra", "giraffe"], "difficult_direct_answer": false, "rationales": ["There are multiple animals in the photo, and one of them is a striped horse type.", "A zebra stands with the giraffes.", "Black and white striped, horse shaped animals are here."], "image": "val2014/COCO_val2014_000000245750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516365, "question_id": "TPK2jSQCLddTHX5NALfxxQ", "question": "Why is the woman standing behind the boat?", "choices": ["to fish", "to observe", "to waterski", "to dive"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "to waterski", "speed", "waterskiing", "parasailing", "sking", "water skiing", "skiing", "water skiing"], "difficult_direct_answer": false, "rationales": ["She is holding on to a line attached to the boat and standing on slim boards.", "She's trying to ski on the water.", "The woman is waterskiing."], "image": "train2014/COCO_train2014_000000516365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 623, "question_id": "TPTfRPukGBme44ju7TH6gJ", "question": "What is inside the creature being cuddled here?", "choices": ["bear guts", "chicken", "goats", "stuffing"], "correct_choice_idx": 3, "direct_answers": ["stuffing", "stuffing", "stuffing", "human", "stuffing", "stuffing", "cotton", "stuffing", "stuffing", "stuffing"], "difficult_direct_answer": false, "rationales": ["The large teddy bear is filled with stuffing that makes it plush.", "The big creature cuddled here is filled with stuffing.", "The thing with the person is a stuffed animal."], "image": "val2014/COCO_val2014_000000000623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12614, "question_id": "TPgJdpriDAdbEpoCzncGkW", "question": "Why is the outdoors hard to see?", "choices": ["smog", "snow", "window blinds", "rain"], "correct_choice_idx": 0, "direct_answers": ["smog", "cloudy", "high elevation", "heavy windows", "glare", "people obstruction", "blocked view", "behind glass", "windows", "obscured"], "difficult_direct_answer": true, "rationales": ["The city gives off pollution in form of clouds. they give off smoke like pollution when they are making things.", "There must be smog or fog out.", "There is a hazy smoke type substance outside that is making the sky less than clear. in urban settings like this that effect is generally do to answer a."], "image": "train2014/COCO_train2014_000000012614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263859, "question_id": "TPgdtXS3toYhcf32wJXjpx", "question": "Besides Africa what continent can these animals be found naturally on?", "choices": ["south america", "asia", "europe", "australia"], "correct_choice_idx": 1, "direct_answers": ["india", "asia", "asia", "north america", "asia", "asia", "south america", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["Asia is another continent.", "Elephants natural habitat is first at africa and the other country where elephants are common is india which is located in the asian continent.", "Elephants are in asia."], "image": "train2014/COCO_train2014_000000263859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372618, "question_id": "TPrpijybmERhagmh7kPMWe", "question": "Which soft drink does the building in red sell?", "choices": ["fanta", "coca-cola", "pepsi", "dr. pepper"], "correct_choice_idx": 1, "direct_answers": ["coca cola", "coke", "coca cola", "coke", "coca-cola", "coca-cola", "coca cola", "coca cola", "coke", "coca cola"], "difficult_direct_answer": false, "rationales": ["The red building has a sign for coca cola on its drinks stand.", "The logo is has white letters on a red background.", "This drink's emblem is one of the most recognizable ones in the world."], "image": "train2014/COCO_train2014_000000372618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311007, "question_id": "TQ72mmr6o5rZru5aS4cExW", "question": "The person in blue is best described as what?", "choices": ["baby", "toddler", "youth", "elderly"], "correct_choice_idx": 3, "direct_answers": ["getting wet", "wet", "old", "old woman", "old woman", "senior citizen", "elderly", "elderly", "happy", "happy"], "difficult_direct_answer": false, "rationales": ["The white hair and hunched over posture indicate that the women is older.", "The person is old.", "The woman is obviously too old to be a baby, toddler, or youth."], "image": "train2014/COCO_train2014_000000311007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472549, "question_id": "TQEBUFPgwGeT4gqJUgSxfG", "question": "This museum draws visitors to what city?", "choices": ["annapolis", "concord", "birmingham", "topeka"], "correct_choice_idx": 2, "direct_answers": ["birmingham", "barber", "birmingham", "birmingham", "barber", "barber", "detroit", "barber city", "motor city", "birmingham"], "difficult_direct_answer": false, "rationales": ["This tourist attraction can be found in birmingham, alabama.", "They go to birmingham.", "This museum is located in alabama."], "image": "train2014/COCO_train2014_000000472549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442725, "question_id": "TQGs75RGzzdV7GeuZv88Ly", "question": "What food is the heart shaped object made of?", "choices": ["pizza", "donut", "spaghetti", "burger"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "dough", "meat", "heart", "pizza", "pizza", "heart"], "difficult_direct_answer": false, "rationales": ["It is cooked dough with sauce and cheese", "The heart is made from pizza.", "It is breading topped with a layer of red sauce, then cheese, then a thinly sliced meat."], "image": "train2014/COCO_train2014_000000442725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232985, "question_id": "TQZjzPhJnniUrjAgBRJKhs", "question": "What brand are the shorts the player is wearing?", "choices": ["gucci", "nike", "ralph lauren", "adidas"], "correct_choice_idx": 3, "direct_answers": ["adidas", "adidas", "adidas", "reebok", "new balance", "nike", "adidas", "adidas", "adidas", "adidas"], "difficult_direct_answer": false, "rationales": ["The logo is in red.", "A man is wearing a blue shirt with the words \"adi\" on front of it.", "The red logo on his left leg indicates the brand that made these shorts."], "image": "val2014/COCO_val2014_000000232985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65087, "question_id": "TQhqromQK7ZzvMMWzYFkWz", "question": "What sport is being played?", "choices": ["hockey", "soccer", "ultimate frisbee", "football"], "correct_choice_idx": 2, "direct_answers": ["ultimate frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["There is a disk in the air.", "The frisbee is being used.", "The men want to catch a frisbee."], "image": "train2014/COCO_train2014_000000065087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6075, "question_id": "TQummET6xwVHPzNZ8K4hW9", "question": "How many minutes until the train arrives?", "choices": ["21 minutes", "25 minutes", "15 minutes", "30 minutes"], "correct_choice_idx": 2, "direct_answers": ["thirteen", "15 minutes", "13", "13", "thirteen", "13", "thirteen", "thirteen", "9 minutes", "thirteen"], "difficult_direct_answer": false, "rationales": ["On the sign board it has next train scheduled to arrive at 9:28. the adjacent clock shows that it is approximately 9:15. the difference or wait time is about 15 minutes.", "There are 15 minutes.", "The displayed clock says the time is 9:15 and the train is scheduled to arrive at 9:28 which is a difference of 13 minutes. i chose the closest option."], "image": "train2014/COCO_train2014_000000006075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471245, "question_id": "TRCQkg7rQ63HPJotugofRp", "question": "What is likely on top of the green part of this meal?", "choices": ["bread crumbs", "beets", "sugar", "dressing"], "correct_choice_idx": 3, "direct_answers": ["cheese", "sprouts", "cheese", "sprouts", "salad", "dressing", "bean sprouts", "cheese", "cheese", "shredded carrots"], "difficult_direct_answer": false, "rationales": ["Dressing goes on the salad.", "A sandwich is on a plate with salad on the side. dressing is commonly served on salads.", "A salad is on a plate with a sandwich."], "image": "train2014/COCO_train2014_000000471245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197593, "question_id": "TRVQus8Argt5ehx7XXHWXc", "question": "What happened to this players left knee?", "choices": ["break", "sunburn", "nothing", "cut"], "correct_choice_idx": 3, "direct_answers": ["injured", "injured", "scrape", "injury", "injured", "hurt", "cut", "injury", "cut", "injured"], "difficult_direct_answer": false, "rationales": ["The player is wearing a bandage on their knee which might be the result of answer a.", "The bandage is small and the athlete is still able to play.", "There is a bandage on it. it was cut or skinned."], "image": "train2014/COCO_train2014_000000197593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188955, "question_id": "TS7hEhnKzp4zEBCbzr8cBm", "question": "Which hemisphere are the majority of these sport establishments located?", "choices": ["northern", "southern", "eastern", "western"], "correct_choice_idx": 0, "direct_answers": ["northern hemisphere", "northern", "northern", "northern", "north", "snow", "northern", "north", "northern", "northern"], "difficult_direct_answer": false, "rationales": ["The hemisphere is northern.", "There is snow on the ground.", "Skiers are gathered below lifts on a snowy mountain as is needed for downhill skiing."], "image": "train2014/COCO_train2014_000000188955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352185, "question_id": "TSaqUyydLhXUmzst4BLVKE", "question": "Where might Bayshore be based on the flag?", "choices": ["luxembourg", "italy", "canada", "fiji"], "correct_choice_idx": 2, "direct_answers": ["america", "canada", "canada", "canada", "canada", "canada", "canada", "america", "canada", "america"], "difficult_direct_answer": false, "rationales": ["They are out of canada", "The flag is canadian.", "The canadian flag is in the background."], "image": "train2014/COCO_train2014_000000352185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422295, "question_id": "TSbTMVuzLnNPBaX4JjDENB", "question": "Where does this cow live?", "choices": ["city", "zoo", "farm", "fair"], "correct_choice_idx": 0, "direct_answers": ["street", "street", "city", "india", "on street", "india", "roadside", "asia", "street", "city"], "difficult_direct_answer": false, "rationales": ["There are many vehicles, buildings and people in the image.", "The cow is in a city.", "This animal is in the middle of a city road laying down."], "image": "train2014/COCO_train2014_000000422295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42953, "question_id": "TSdDADocLzg2mAs3eXWjbd", "question": "The helicopter assists which type of sports participants?", "choices": ["bowlers", "none", "skiers", "sledders"], "correct_choice_idx": 2, "direct_answers": ["snowboarders", "skiers", "skiers", "skiers", "ski", "skiers", "skiers", "skiers", "skier", "skiing"], "difficult_direct_answer": false, "rationales": ["All of the people standing around have ski poles and snowboards.", "The helicopter is helping them get up and down the mountain.", "They sometimes require transportation to the top of mountains."], "image": "val2014/COCO_val2014_000000042953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469658, "question_id": "TSdRNGvGwh2ioMiJvN7UMj", "question": "When was Heaver Bros. Ltd. founded?", "choices": ["1957", "1967", "1960", "1956"], "correct_choice_idx": 0, "direct_answers": ["1957", "1960s", "1846", "twothousdand two", "1957", "no clue", "1957", "1957", "1980", "1957"], "difficult_direct_answer": false, "rationales": ["This bus company was founded in 1957.", "The founding date was 1957.", "The company was first founded in 1957."], "image": "train2014/COCO_train2014_000000469658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581593, "question_id": "TSu289eCAbTaNMRCreA5D6", "question": "What are most wineglasses made of?", "choices": ["plastic", "glass", "silvered glass", "pewter"], "correct_choice_idx": 1, "direct_answers": ["glasses", "glasses", "glasses", "glass", "glass", "glass", "glass", "glass", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["Wine glasses are usually formal items and made of glass.", "The regular wineglass is made of glass.", "They are drinking cups that aren't made of plastic, metal, or wood."], "image": "val2014/COCO_val2014_000000581593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244692, "question_id": "TT7YD6ygcwnAnAucSnrEL8", "question": "What can people enter that is behind the trees?", "choices": ["tent", "playground", "pyramid", "buildings"], "correct_choice_idx": 3, "direct_answers": ["house", "houses", "building", "field", "cabin", "house", "buildings", "building", "houses", "building"], "difficult_direct_answer": false, "rationales": ["The person can go in buildings.", "There are buildings there that they can go in.", "There are buildings which people can enter behind the trees."], "image": "train2014/COCO_train2014_000000244692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175737, "question_id": "TTJgoXebxjJrzqBvK7xSfx", "question": "What sort of terrain is visible in the background?", "choices": ["city", "desert", "farm", "tropical jungle"], "correct_choice_idx": 1, "direct_answers": ["mountains", "desert", "see valleys", "tropical", "hilly", "mountain", "dry valleys", "dirt", "mountain", "mountain"], "difficult_direct_answer": false, "rationales": ["The terrain in the background is made of desert and large rocky canyons.", "The colours and lack of vegetation and buildings show it must be a desert.", "The terrain looks really dry."], "image": "val2014/COCO_val2014_000000175737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380487, "question_id": "TTKjtY6AJL6Jeu9SapzoZw", "question": "What type of finish or item covers the back wall?", "choices": ["mirror", "silver paint", "another room", "white paint"], "correct_choice_idx": 0, "direct_answers": ["paint", "flowers", "mirror", "flowers", "mirror", "board", "flowers", "stickers", "stickers", "gloss"], "difficult_direct_answer": false, "rationales": ["A girl sitting on a bed is reflected on the wall behind her.", "A reflective surface in the background is repeating the image in the foreground. a reflective surface like this in someones home would be from answer a.", "The wall is reflective."], "image": "val2014/COCO_val2014_000000380487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414904, "question_id": "TTRD7UHadkWX5ocgxKc7xv", "question": "Why is the woman pointing her toes?", "choices": ["to kick", "to exercise", "to dance", "to pose"], "correct_choice_idx": 3, "direct_answers": ["posing coyly", "to pose", "posing coyly", "posing", "posing", "posing", "posing", "posing", "posing", "posing"], "difficult_direct_answer": false, "rationales": ["The woman is having fun with the photographer.", "People do not make this pose for any reason besides getting photographed.", "The woman is stationary. she is not dancing, exercising, or kicking."], "image": "train2014/COCO_train2014_000000414904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319690, "question_id": "TTRVGDNDiGwwpooXZ9TLqJ", "question": "What is the term for this outdoor trading area?", "choices": ["garage sale", "farmers market", "county fair", "swap meet"], "correct_choice_idx": 1, "direct_answers": ["market", "bizarre", "market", "market", "market", "market", "farmers market", "fruit market", "marketplace", "market"], "difficult_direct_answer": false, "rationales": ["This is where growers bring their crops to be sold to people and those are known as farmers markets.", "The vendors are selling produce.", "This is the most common term. c might also feature these types of stalls, but usually not with so much produce."], "image": "train2014/COCO_train2014_000000319690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370260, "question_id": "TThhSRWDVgUPVtv8pCiyRb", "question": "Which team did they cheer on at the Olympics?", "choices": ["great britain", "bahamas", "south africa", "united states"], "correct_choice_idx": 0, "direct_answers": ["soccer", "england", "british", "britain", "england", "great britain", "england", "british", "england", "england"], "difficult_direct_answer": false, "rationales": ["A city in great britain is on the bus.", "The name on top of the bus is a place in great brittain.", "The bus advertises for sainsbury's which is a supermarket chain with headquarters in london, england."], "image": "val2014/COCO_val2014_000000370260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3353, "question_id": "TTruqmucjTBpFk9LoD3AMq", "question": "Cats needs which kind of feel?", "choices": ["hot", "freeze", "cold", "warmth"], "correct_choice_idx": 3, "direct_answers": ["warmth", "soft", "warm feels", "love", "petting", "soft touch", "good", "softness", "calm", "love"], "difficult_direct_answer": true, "rationales": ["Cats need warmth.", "Traditionally mammals need and thrive in moderate temperatures.", "They need to stay warm to live."], "image": "train2014/COCO_train2014_000000003353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127499, "question_id": "TU9oMPERSMyusfXZKRPcy6", "question": "What is the woman in yellow protecting herself from?", "choices": ["snow", "rain", "fire", "lightning"], "correct_choice_idx": 1, "direct_answers": ["rain", "raindrops", "raindrops", "raindrops", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The woman in yellow is attempting to protect herself from the rain.", "The woman in yellow is holding an umbrella. with the wet sidewalk around her it would mean that rain is falling now.", "It's raining."], "image": "train2014/COCO_train2014_000000127499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137271, "question_id": "TUx5NQVn6M7opVXViwRH9f", "question": "How are the flying objects being controlled?", "choices": ["computer", "magic", "remote", "string"], "correct_choice_idx": 3, "direct_answers": ["string", "string", "by strings", "strings", "string", "kite string", "string", "strings", "lines", "cord"], "difficult_direct_answer": false, "rationales": ["Kites are in the air and are being held by people on the ground.", "There is a long thin line attached to the kite.", "Kites need to be tethered down or they will fly away."], "image": "val2014/COCO_val2014_000000137271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16931, "question_id": "TVXL8GngnS83XaMiWhDUtE", "question": "Which vehicle is most camouflaged by the snow?", "choices": ["gray sedan", "red van", "white sedan", "gray van"], "correct_choice_idx": 2, "direct_answers": ["white", "white sedan", "white", "white sedan", "white sedan", "cars", "white one", "white", "white car", "white car"], "difficult_direct_answer": false, "rationales": ["The white car is buried.", "The white sedan is totally buried.", "The car that is covered in snow is harder to see."], "image": "val2014/COCO_val2014_000000016931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134362, "question_id": "TVzNKR2Nm7yjEMKqbPsJzL", "question": "How is the boy controlling the object?", "choices": ["string", "magic", "remote", "battery"], "correct_choice_idx": 0, "direct_answers": ["kite line", "string", "kite line", "string", "string", "string", "string", "string", "kite line", "string"], "difficult_direct_answer": false, "rationales": ["Traditionally kites can only be used and controlled with a string of some sort.", "The boy is flying a kite. it has control lines.", "A young boy is standing on the beach. he is holding the kite with a very thin piece of material that is attached at one end of kite."], "image": "train2014/COCO_train2014_000000134362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151671, "question_id": "TW4hY9K6rPPYxTLNhP6MHQ", "question": "What is the fruit mimicking?", "choices": ["pencil", "telephone", "toothbrush", "earmuffs"], "correct_choice_idx": 1, "direct_answers": ["banana", "telephone", "smile", "banana", "using banana", "banana", "banana", "mobile phone", "telephone", "phone"], "difficult_direct_answer": false, "rationales": ["The fruit is like a phone.", "A woman holds one end of a banana up to her ear and the other in front of her mouth as she pretends to be using her phone.", "The woman is making a call with the banana."], "image": "train2014/COCO_train2014_000000151671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396172, "question_id": "TW6G9q2sRxpteWkxp6WUbM", "question": "What sort of session are they attending?", "choices": ["college orientation", "work training", "wedding rehearsal", "party planning"], "correct_choice_idx": 1, "direct_answers": ["work training", "dinner", "award presentation", "corporate training", "meeting", "wedding reception", "conference", "meeting", "staff training", "presentation"], "difficult_direct_answer": true, "rationales": ["These people are all colleagues attending a work presentation.", "There is a big screen, open laptops and everyone is looking at one person.", "There is a projector at the front and instructor"], "image": "train2014/COCO_train2014_000000396172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207486, "question_id": "TWXAomJum6oghT3mgQQkfX", "question": "Where is the animal currently sitting?", "choices": ["street", "park", "boardwalk", "beach"], "correct_choice_idx": 0, "direct_answers": ["street", "street", "road", "street", "street", "street", "ground", "ground", "street", "road"], "difficult_direct_answer": false, "rationales": ["The cow is in the middle of the road.", "In some places, it's not unusual to see animals lying in the middle of the road.", "The other options don't match this setting at all."], "image": "val2014/COCO_val2014_000000207486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557045, "question_id": "TWmhNcwZVv93Vw3kB459B2", "question": "What type of pitch is this?", "choices": ["fastball", "ceremonial pitch", "knuckleball", "forkball"], "correct_choice_idx": 1, "direct_answers": ["baseball diamond", "opening", "baseball", "first", "fast", "baseball pitch", "playing", "first pitch", "photo op", "ceremonial pitch"], "difficult_direct_answer": true, "rationales": ["The navy officer is throwing the first pitch.", "They have someone throw the baseball to open the game.", "The pitch is for fun."], "image": "train2014/COCO_train2014_000000557045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183562, "question_id": "TWuvmRBvSxDXnAao5AwdUg", "question": "What is the problem with this tie?", "choices": ["too long", "too feminine", "too bright", "too short"], "correct_choice_idx": 0, "direct_answers": ["too long", "too long", "too long", "too long", "too long", "long", "too long", "too long", "long", "too long"], "difficult_direct_answer": false, "rationales": ["A man is wearing a tie that reaches down to his knees. ties are typically only long enough to reach above waist level.", "It's supposed to stop in the belt area", "The tie is way too long."], "image": "train2014/COCO_train2014_000000183562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270388, "question_id": "TX8ScpkD6qhwcgTXUiyVQF", "question": "What is the door on the far left for?", "choices": ["refrigerator", "storage", "oven", "dishwasher"], "correct_choice_idx": 2, "direct_answers": ["oven", "baking", "open cabinet", "fake oven", "cooking", "diaper disposal", "oven", "fake oven", "cupboard", "oven"], "difficult_direct_answer": false, "rationales": ["The oven has it door here to ease and accessibility.", "The door on the far left is a pretend oven.", "It has burners on the top"], "image": "train2014/COCO_train2014_000000270388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3786, "question_id": "TXB89ToDwi6Fc7z4bE8Sm6", "question": "What material is the bench made of?", "choices": ["carbon fiber", "metal", "wood", "plastic"], "correct_choice_idx": 1, "direct_answers": ["wood", "wood", "wood", "plastic", "metal", "metal", "wood", "metal", "wood", "metal"], "difficult_direct_answer": false, "rationales": ["The bench seems to be made of metal since it's gold.", "Traditionally these types of sitting tools are made of wood but in this picture is metal.", "A woman is sitting in a chair that is shiny and hard."], "image": "val2014/COCO_val2014_000000003786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567683, "question_id": "TXCB5bxY5m8qSfuWomyjwx", "question": "What type weather is typical here?", "choices": ["tundra", "freezing", "all", "tropical"], "correct_choice_idx": 3, "direct_answers": ["sunny", "tropical", "warm", "sunny", "tropical", "tropical", "clear", "sunny", "clear", "sunny"], "difficult_direct_answer": false, "rationales": ["It's the only answer that's most likely. the other options wouldn't have the green trees in the background.", "Since the weather looks good there can only be one answer out of these options.", "Tropical weather is typical."], "image": "val2014/COCO_val2014_000000567683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29504, "question_id": "TXc64n9JnGVAxazFWgKN3G", "question": "What does the white product to the back need to grow properly?", "choices": ["manure", "sun", "water", "pollination"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["You need water to grow rice.", "This is grown in it", "The white product in the back is rice. rice needs high degrees of hydration in order to grow properly."], "image": "val2014/COCO_val2014_000000029504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244032, "question_id": "TXdtgbCHBnsxLJy9RTFBGV", "question": "Which one is carrying the most weight?", "choices": ["striped shirt", "grey jacket", "man", "purple shirt"], "correct_choice_idx": 2, "direct_answers": ["front rider", "man", "front", "front", "man", "first bike", "first traveler", "leading bicycle", "man", "black outfit"], "difficult_direct_answer": false, "rationales": ["Since he has two people instead of one.", "The cyclist that is carrying another person on the back of their bike would be the one carrying the most weight.", "The man is carrying the most weight since he has a woman on his bike."], "image": "train2014/COCO_train2014_000000244032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319118, "question_id": "TXgJGhkG2NsT4VR9a4eE6M", "question": "What name was added to this company's name in 2000?", "choices": ["santander", "wells fargo", "fleet", "j.p. morgan"], "correct_choice_idx": 3, "direct_answers": ["jpmorgan", "j.p. morgan", "jpmorgan", "not answerable", "heineken", "bank", "seattle", "chase", "not clear", "chase rewards"], "difficult_direct_answer": true, "rationales": ["This company bought chase in 2000.", "The name is jp morgan.", "Chase bank merged with the company into one in 2000."], "image": "train2014/COCO_train2014_000000319118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81704, "question_id": "TY73k75ikmnNheDu4PcEMw", "question": "What is this person miming out as a joke as being done with the scissors?", "choices": ["smiling", "sitting", "standing", "running"], "correct_choice_idx": 3, "direct_answers": ["running", "running", "running", "running", "edward scissorhands", "running", "running", "funny", "lets go", "edward scissorhands"], "difficult_direct_answer": false, "rationales": ["He is running which is dangerous.", "A man with bleached blond hair is being silly. he has both his one arm out in front of him as well as back leg back.", "The person is holding a foot up and facing forward with scissors in hand."], "image": "train2014/COCO_train2014_000000081704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507434, "question_id": "TYDB3yEmHM6N2tokidgFx2", "question": "Which group of people are most likely to visit Mary's club?", "choices": ["straight men", "women", "all equally", "lesbians"], "correct_choice_idx": 0, "direct_answers": ["men", "old men", "old men", "men", "men", "men", "men", "straight men", "men", "men"], "difficult_direct_answer": false, "rationales": ["The sign indicates that it is an all nude strip club with female dancers. lesbians and especially non-lesbian women would not visit this club.", "The club is for men that like women.", "The partially naked woman on the sign indicates that it is a strip club."], "image": "train2014/COCO_train2014_000000507434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333150, "question_id": "TYTFtoAPxZPsdZi4vt3rRo", "question": "What is the purpose of this event?", "choices": ["relax", "shop", "learn", "exercise"], "correct_choice_idx": 2, "direct_answers": ["meeting", "gain knowledge", "teaching", "conference", "learning", "information", "seminar", "improve sales", "conference", "learn"], "difficult_direct_answer": true, "rationales": ["There is a laptop with a presentation open on it and a group of people sitting listening indicating that something is being taught.", "There is a close up shot of a laptop. people are sitting and waiting for someone to speak on stage.", "The laptop is on a training exercise."], "image": "val2014/COCO_val2014_000000333150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343204, "question_id": "TYWJdo49puJ6x9PnUT7cpP", "question": "Who is using this laptop?", "choices": ["girl", "woman", "man", "boy"], "correct_choice_idx": 0, "direct_answers": ["by video", "girl", "woman", "child", "woman", "woman", "man", "guy", "woman", "on right"], "difficult_direct_answer": false, "rationales": ["A girl is using the laptop.", "The girl is using it.", "Because she's clearly visible at the bottom left of the screen."], "image": "train2014/COCO_train2014_000000343204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262093, "question_id": "TYefbNpvwUdrbrSKaxtGwP", "question": "Persons using this kitchen clean dishes by what manner?", "choices": ["none", "industrial drier", "hand", "dishwasher"], "correct_choice_idx": 2, "direct_answers": ["hand", "hand wash", "hand", "handwashing", "dishwasher", "sink", "by hand", "hand", "hand", "hand"], "difficult_direct_answer": false, "rationales": ["There is no dishwasher seen in the kitchen.", "The dishes are sitting on the counter to dry because they were washed manually by a human and not a machine.", "The dishes shown are air drying after being washed in the sink."], "image": "val2014/COCO_val2014_000000262093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519844, "question_id": "TYmePGZW7cJGA5Tb8ZMttW", "question": "Which one of these countries has a flag that is most similar to the kite?", "choices": ["canada", "bangladesh", "peru", "lithuania"], "correct_choice_idx": 1, "direct_answers": ["wales", "portugal", "spain", "japan", "portugal", "jamaica", "belarus", "italy", "japan", "bangladesh"], "difficult_direct_answer": false, "rationales": ["Bangladesh is a country with a red and green flag.", "The kite is red and green, not yellow, green, and red or red and white.", "It has similar colourings."], "image": "train2014/COCO_train2014_000000519844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478648, "question_id": "TYtuPnqyoDgERAysDNJ2Zt", "question": "What is on the wall?", "choices": ["toasters", "dogs", "frames", "caps"], "correct_choice_idx": 2, "direct_answers": ["pictures", "tv", "frames", "pictures", "tv", "paintings", "television", "pictures", "framed art", "artwork"], "difficult_direct_answer": false, "rationales": ["There are four paintings on the wall. each painting is inside a rectangular structure.", "These square objects are commonly found hanging from walls.", "There are four photo frames."], "image": "train2014/COCO_train2014_000000478648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132706, "question_id": "TYvLhadrYYfJCAhKVFEXPc", "question": "What is causing the smoke above the food?", "choices": ["wind", "cold", "heat", "photoshop filters"], "correct_choice_idx": 2, "direct_answers": ["heat", "steam", "it's hot", "heat", "food's hot", "heat", "hot food", "heat", "heat", "heat"], "difficult_direct_answer": false, "rationales": ["When hot things meet cooler air it can cause steam.", "When food is hot, smoke can be seen rising from it.", "The heat causes smoke."], "image": "val2014/COCO_val2014_000000132706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440334, "question_id": "TZ47ZRBNC5YXTyQMSHegrW", "question": "What are the small white objects on the fruit?", "choices": ["stickers", "spiderwebs", "paint", "bugs"], "correct_choice_idx": 0, "direct_answers": ["stickers", "stickers", "stickers", "stickers", "stickers", "price tags", "price tags", "stickers", "stickers", "stickers"], "difficult_direct_answer": false, "rationales": ["Most of the banana have a white object attached to its skin. since this looks like a marketplace, the object is probably a sticker.", "The objects are stickers.", "The items on the fruit are stickers."], "image": "train2014/COCO_train2014_000000440334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537441, "question_id": "TZ9CHJDrNzfp9Li9SDaAXR", "question": "The toilet lid has been made from what material?", "choices": ["wood", "porcelain", "metal", "glass"], "correct_choice_idx": 0, "direct_answers": ["wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["Toilets are generally made from white porcelain, but not so in this example, as the lid is brown and different in material. the graininess of the lid is visible, which is a consistent property of wood.", "The toilet lid is brown and made from solid wood.", "The toilet lid is brown."], "image": "train2014/COCO_train2014_000000537441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183562, "question_id": "TZbzEWpbUrJUWLzGkTpsjn", "question": "What clothing item is most strangely fitting on this man?", "choices": ["necktie", "belt", "shirt", "pants"], "correct_choice_idx": 0, "direct_answers": ["tie", "business", "business", "necktie", "tie", "tie", "tie", "tie", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["It is much longer than it should be", "The man is wearing a necktie that is unusually longer than normal", "A tie is generally supposed to fall above the beltline for men when worn properly in this photo it is longer than average and therefore it looks out of place."], "image": "train2014/COCO_train2014_000000183562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515360, "question_id": "TZcYB7tm78NCFg3rrhyocX", "question": "What do the persons on the bench await?", "choices": ["cars", "parked train", "future train", "horses"], "correct_choice_idx": 2, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "train", "train", "future train"], "difficult_direct_answer": false, "rationales": ["The people are waiting for the bus.", "The people take the train.", "The people on the bench would probably be standing if they were wanting to get on the parked train. they are most likely waiting for a train that is coming later."], "image": "val2014/COCO_val2014_000000515360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433531, "question_id": "TZiMDx7QqVsqwEvJ9QG6Mg", "question": "Sitting like this allows the snowboarders to avoid doing what with their Snow boards?", "choices": ["losing them", "selling them", "scratching them", "removing them"], "correct_choice_idx": 3, "direct_answers": ["moving", "sliding down", "slide", "removing", "accidentally gliding", "standing", "slide", "slipping", "removing them", "sliding"], "difficult_direct_answer": true, "rationales": ["The snowboarder look like they are taking a short rest. if they were not in this outstretched position, it would mean that they may have to remove them from their feet.", "The snowboarders can take the boards off.", "They can be removed."], "image": "train2014/COCO_train2014_000000433531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39495, "question_id": "TZiTAi7mN4WZAMfr2RVKwF", "question": "What country were his shoes originally created?", "choices": ["canada", "malaysia", "congo", "finland"], "correct_choice_idx": 0, "direct_answers": ["usa", "united states", "canada", "canada", "usa", "netherlands", "canada", "america", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["Crocs were invented in canada.", "Crocs come from canada.", "The shoes are crocs, which had their first factory in the country north of the united states."], "image": "train2014/COCO_train2014_000000039495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410533, "question_id": "TZjdxPciMdzYSaqjckuKaf", "question": "What material is the sink made of?", "choices": ["porcelain", "plastic", "wood", "stainless steel"], "correct_choice_idx": 3, "direct_answers": ["stainless steel", "metal", "steel", "stainless steel", "stainless steel", "metal", "steel", "cat", "steel", "cat"], "difficult_direct_answer": false, "rationales": ["This is a stainless steel sink.", "The sink is built from stainless steel.", "The sink has a silver color. it is shiny."], "image": "val2014/COCO_val2014_000000410533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273067, "question_id": "TZnt68f9wNzyJFPARA9kuY", "question": "What type of professional sport it played in this area?", "choices": ["basketball", "baseball", "soccer", "football"], "correct_choice_idx": 1, "direct_answers": ["cricket", "soccer", "baseball", "football", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["The sport played in the arena is baseball, there are baseball bleachers.", "You can tell by the shape of the stadium and lights, as to what sport is played there.", "The sports park in the background is for baseball games and has a diamond in the field."], "image": "val2014/COCO_val2014_000000273067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181438, "question_id": "TZrw5kE8DivUA7VpDNsu9K", "question": "If you were sitting in a chair what could you put your feet on?", "choices": ["log", "ottoman", "bench", "table"], "correct_choice_idx": 0, "direct_answers": ["ground", "log", "ground", "log", "log", "log", "log", "log", "wooden log", "log"], "difficult_direct_answer": false, "rationales": ["This type of item is pictured in front of the chairs.", "The log is on front of the chairs.", "There is a large tree trunk laying in front of the chairs."], "image": "train2014/COCO_train2014_000000181438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165492, "question_id": "Ta948Bwpe8QptFjtMrCzFo", "question": "What kind of animal is needed to use this boat?", "choices": ["elephant", "cat", "dog", "human"], "correct_choice_idx": 3, "direct_answers": ["human", "humans", "human", "fish", "human", "human", "human", "fish", "fish", "person"], "difficult_direct_answer": false, "rationales": ["A boat is parked at a dock.", "A person would be needed to use the boat.", "A boat is parked at a dock."], "image": "train2014/COCO_train2014_000000165492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183554, "question_id": "TaiaPQWMHNbfQYnJYKMNJF", "question": "What type of sign is the yellow sign?", "choices": ["informational", "warning", "directional", "identification"], "correct_choice_idx": 0, "direct_answers": ["payment reminder", "payment sign", "payment instructions", "payment", "for paying", "info sign", "fare", "pay sign", "license plate", "informational"], "difficult_direct_answer": true, "rationales": ["The sign has information.", "The yellow sign has information on it for the bus riders to read.", "The yellow sign informs to pay at the entry."], "image": "train2014/COCO_train2014_000000183554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443005, "question_id": "TakCYtPY6WSWcjxFdgPmFz", "question": "What skill is the motorcycle doing?", "choices": ["wheelie", "upper", "drag", "lift"], "correct_choice_idx": 0, "direct_answers": ["wheelie", "wheelie", "wheelie", "wheelie", "wheelie", "wheelie", "wheelie", "wheelie", "wheelie", "wheelies"], "difficult_direct_answer": false, "rationales": ["The motorcycle is doing a wheelie.", "The skill is a wheelie.", "He has one wheel off the ground and it looks like a typical wheelie."], "image": "train2014/COCO_train2014_000000443005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539566, "question_id": "TaqdwZnCxbm9cBvkyHCsZK", "question": "Which item is likely most crispy?", "choices": ["black", "lighter green", "red", "dark green"], "correct_choice_idx": 1, "direct_answers": ["lighter green", "broccoli", "broccoli", "lettuce", "broccoli", "spinach", "broccoli", "broccoli", "broccoli", "lettuce"], "difficult_direct_answer": false, "rationales": ["It is raw", "The most crispy item is darker green.", "Cold greens are on a plate next to cooked broccoli and tomatoes on another plate."], "image": "val2014/COCO_val2014_000000539566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208450, "question_id": "TatKXJbPdmGJBixK8RZ8nr", "question": "Why are the people wearing blue outfit?", "choices": ["visibility", "dress code", "uniform", "fashion"], "correct_choice_idx": 2, "direct_answers": ["they're officers", "uniform", "police uniformity", "uniform", "police", "police", "officers", "law enforcement", "uniform", "police"], "difficult_direct_answer": false, "rationales": ["The people are police officers and are wearing the outfit that was issued to them as a part of their job.", "The uniform seems to be of police officer.", "The officers wear blue as a uniform so people know who they are."], "image": "val2014/COCO_val2014_000000208450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51704, "question_id": "Tb77YvaHcav6dn7SbFiAEj", "question": "What does this young man do here?", "choices": ["board trick", "accident", "bell hop", "suicide"], "correct_choice_idx": 0, "direct_answers": ["skate", "skate", "jumps", "board trick", "skateboard trick", "jump", "skateboard", "skating", "skateboard jump", "kick flip"], "difficult_direct_answer": true, "rationales": ["The man is trying to show off tricks.", "By the position of the board and the child you can tell what he is trying to do.", "He is doing some fancy stuff with a skateboard."], "image": "train2014/COCO_train2014_000000051704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381027, "question_id": "TbFE9r7H6955hGCu5fq3tP", "question": "How can one tell where the doors are on the train?", "choices": ["door porter", "people boarding", "big sign", "white color"], "correct_choice_idx": 3, "direct_answers": ["white color", "painted white", "white", "large openings", "white colored", "white", "white paint", "white color", "different color", "white color"], "difficult_direct_answer": false, "rationales": ["They are different colors than the rest of the train", "The body of the train is dark. the doors have been painted differently in order to increase their visibility.", "They are a different color then the rest of the train."], "image": "train2014/COCO_train2014_000000381027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231281, "question_id": "TbFZzLEsF4CyMd5DyZFfxE", "question": "Where does the man holding the bat want the ball to go?", "choices": ["in pocket", "straight up", "backwards", "forward"], "correct_choice_idx": 3, "direct_answers": ["home", "stands ideally", "home run", "homerun", "first base", "over fence", "left field", "outfield", "first base", "forward"], "difficult_direct_answer": true, "rationales": ["The man wants the ball to go straight.", "The man wants to hit the ball so it goes ahead of him.", "The man holding the bat would like the ball to go forward about 400 feet so he could have a home run. babe ruth, in his heyday, once hit a home run that was 575 feet long!."], "image": "val2014/COCO_val2014_000000231281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447432, "question_id": "TbJ42UDNfuYHMg7yYni6q2", "question": "What celebrity shares the same first name as the name of the street on the right sign?", "choices": ["eagle-eye cherry", "yancy butler", "madison davenport", "dave meltzer"], "correct_choice_idx": 2, "direct_answers": ["madison", "jennifer madison", "madison", "madison", "madison davenport", "madison petits", "madison grant", "madison bumgardner", "billy", "dolly madison"], "difficult_direct_answer": false, "rationales": ["A tennis player has the same name.", "The word is the same on each", "Yancy, eagleeye, and dave do not match the name on the sign."], "image": "train2014/COCO_train2014_000000447432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34445, "question_id": "TbWsH8pnb7RDQqtfCxtLw9", "question": "What are the paddles above the overhead light used for?", "choices": ["eliminating odor", "fly control", "cooling", "special visuals"], "correct_choice_idx": 2, "direct_answers": ["cooling", "fan", "fan", "cooling room", "fan", "fanning", "ceiling fan", "move air", "fan", "fanning"], "difficult_direct_answer": false, "rationales": ["The paddles are part of a ceiling fan. they control the temperature by forcing hot air to move elsewhere.", "It is used to cool off.", "The paddles are surrounding a light on the ceiling."], "image": "val2014/COCO_val2014_000000034445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165883, "question_id": "Tbbz7WHBSKLkaXbJhgGDaf", "question": "What tool is printed on the shirt on the railing?", "choices": ["hammer", "chisel", "screwdriver", "wrench"], "correct_choice_idx": 3, "direct_answers": ["wrench", "wrench", "wrench", "wrench", "wrench", "wrench", "wrench", "wrench", "wrench", "wrench"], "difficult_direct_answer": false, "rationales": ["That is what the tool is.", "The shirt on the railing has a silver wrench on it.", "There is a wrench painted on the t-shirt."], "image": "train2014/COCO_train2014_000000165883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328599, "question_id": "TbcHwepqoZTRqd3YDkneRp", "question": "What religion is the man in the white shirt?", "choices": ["christian", "atheist", "jewish", "catholic"], "correct_choice_idx": 2, "direct_answers": ["jewish", "jewish", "jewish", "jewish", "jewish", "jewish", "jewish", "jewish", "judaism", "jewish"], "difficult_direct_answer": false, "rationales": ["He is wearing a yarmulke.", "The man in the white shirt is wearing a jewish hat.", "A man wears a small, round hat on his head."], "image": "train2014/COCO_train2014_000000328599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224111, "question_id": "TbexZqhxygjJSC47ZAwXh5", "question": "What type of shelter is next to the street?", "choices": ["tent", "coffee shop", "overhang", "bus stop"], "correct_choice_idx": 3, "direct_answers": ["bus stop", "bus", "bus stop", "bus", "bus stop", "bus stop", "bus stop", "bus stop", "bus stop", "traffic light"], "difficult_direct_answer": false, "rationales": ["The shelter is the bus stop.", "The structure is traditionally built to be a shelter for people waiting for public transportation.", "A bus stop is shown."], "image": "val2014/COCO_val2014_000000224111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276956, "question_id": "Tbg6fK9Rm4Ugbt4yYTB3GF", "question": "How many blossom roses are there in the cake?", "choices": ["ten", "nine", "eight", "seven"], "correct_choice_idx": 3, "direct_answers": ["seven", "seven", "four", "bunch", "multiple", "seven", "eight", "seven", "seven", "seven"], "difficult_direct_answer": false, "rationales": ["There are seven red roses.", "Seven red flowers can be seen.", "A wedding cake has many flowers on it."], "image": "train2014/COCO_train2014_000000276956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221882, "question_id": "TbpaPKFF4PhhBCy8hcGMJQ", "question": "What is this place famous for?", "choices": ["technology", "tattoos", "soccer", "luxury shopping"], "correct_choice_idx": 3, "direct_answers": ["gold", "work", "religion", "luxury shopping", "gold", "mall", "shopping", "oil", "gold", "gold"], "difficult_direct_answer": false, "rationales": ["This is a store for shopping.", "It's what it's known for.", "The mall's name is the city of gold."], "image": "val2014/COCO_val2014_000000221882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139758, "question_id": "Tbuv67JPfioRWf7dgNPXif", "question": "What is on the table?", "choices": ["ash tray", "flowers", "book", "dog"], "correct_choice_idx": 2, "direct_answers": ["book", "folder", "books", "computer", "book", "book", "box", "books", "book", "book"], "difficult_direct_answer": false, "rationales": ["There is a book on the coffee table.", "There is a book and a laptop on the table.", "There is a book on the table."], "image": "train2014/COCO_train2014_000000139758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151267, "question_id": "Tc7Gn6pezKXYkMsXvy8SxC", "question": "Which side of the road is the fire truck driving on?", "choices": ["middle", "left", "sidewalk", "right"], "correct_choice_idx": 0, "direct_answers": ["right", "right", "middle", "middle", "center", "photo left", "right", "right", "right", "left side"], "difficult_direct_answer": false, "rationales": ["The truck is driving directly over the yellow line which is always in the middle of the road.", "The truck is on the road line.", "It is near the yellow line that is normally located at the middle."], "image": "val2014/COCO_val2014_000000151267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324400, "question_id": "Tc8cqDnSRoE5eyeQPCK9VT", "question": "Which hand is dominant in the batter shown?", "choices": ["right", "left", "neither", "ambidextrous"], "correct_choice_idx": 1, "direct_answers": ["left", "left", "right", "right", "right", "left", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["He is swinging back on his left side, meaning the let hand is dominant", "The bat is on the person's left.", "The dominant hand in baseball is the top one."], "image": "train2014/COCO_train2014_000000324400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356406, "question_id": "TcDXyDypS2gqNrAbdQejQW", "question": "What activity are the people shown involved in?", "choices": ["shopping", "sleeping", "dog walking", "selling"], "correct_choice_idx": 0, "direct_answers": ["shopping", "solving", "shopping", "walking", "shopping", "shopping", "walking", "shopping", "dog walking", "talking"], "difficult_direct_answer": false, "rationales": ["The people are in an outdoor mall.", "The activity is shopping.", "The people are at a mall so they are most liking going to shop."], "image": "val2014/COCO_val2014_000000356406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305170, "question_id": "TdLrhUgYvydJxmif2xAnjW", "question": "What tragic even is this city famous for?", "choices": ["church day", "ww2", "9/11", "olympic games"], "correct_choice_idx": 2, "direct_answers": ["9/11", "september 11th", "accident", "911", "nine eleven", "bombing", "trump's birth", "9/11", "911", "bombing"], "difficult_direct_answer": false, "rationales": ["The purple sign for the new york university theater indicates that the building in nyc.", "There is a banner for nyu which is in new york. 9/11 occurred in new york.", "The tall buildings and yellow taxi cabs distinguish it as nyc, which held the twin towers which were taken down in 9/11."], "image": "train2014/COCO_train2014_000000305170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370278, "question_id": "TdfFvQG3MqGRFnkz7oK6AG", "question": "Which food will most likely be eaten last?", "choices": ["bagels", "casserole", "donuts", "salad"], "correct_choice_idx": 2, "direct_answers": ["dessert", "donuts", "donuts", "donuts", "donuts", "donut", "donuts", "donuts", "dessert", "doughnuts"], "difficult_direct_answer": false, "rationales": ["It is customary to end a meal with a sweet option. so, the donuts will likely get eaten last.", "The donuts are for dessert.", "The donuts are nicely displayed and look appetizing."], "image": "train2014/COCO_train2014_000000370278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6380, "question_id": "TdiFGBLqp73o5VQSmyyE3R", "question": "Where does he want the frisbee to land?", "choices": ["blanket", "grass", "basket", "water"], "correct_choice_idx": 2, "direct_answers": ["chains", "swing", "basket", "inside chains", "in chains", "in basket", "basket", "in basket", "cage", "metal cage"], "difficult_direct_answer": false, "rationales": ["This is a game where the frisbee needs to land in the basket. the frisbee is being thrown towards it.", "The metallic basket is the targeted area of the frisbee.", "He wants the frisbee to go in the gray structure."], "image": "train2014/COCO_train2014_000000006380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318777, "question_id": "TdsUTFTkAWzWBoSRro8wHZ", "question": "Which gate does one enter through if they purchase tickets to the Clubhouse?", "choices": ["eight", "one", "five", "six"], "correct_choice_idx": 0, "direct_answers": ["clubhouse", "stadium gate", "vip", "gate", "td clubhouse", "main gate", "bottom gate", "ten", "back", "eight"], "difficult_direct_answer": true, "rationales": ["A large baseball stadium is shown with several different entrance and exits.", "The gate is number 8.", "You would enter through the clubhouse to get the tickets."], "image": "val2014/COCO_val2014_000000318777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536653, "question_id": "Tdx8hkjQ88VuK4KRmzBn3r", "question": "What is she ready to do next?", "choices": ["juggle", "punt", "serve", "dunk"], "correct_choice_idx": 2, "direct_answers": ["play tennis", "play tennis", "serve", "serve", "play", "serve", "serve ball", "serve ball", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The woman is preparing to serve.", "The woman is holding a racquet. she is playing tennis, not football, juggling, or basketball.", "She has the tennis ball and racket in her hand on the court. there is an audience behind her watching the game."], "image": "val2014/COCO_val2014_000000536653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376819, "question_id": "TeTLTQXqXgR4hremdgvH6Z", "question": "What type of chair is the first chair on the left?", "choices": ["glider", "lazyboy", "office chair", "rocking chair"], "correct_choice_idx": 0, "direct_answers": ["glider", "slider", "stationary chair", "rocking chair", "wood chair", "rocking", "glider rocker", "glider", "rocking", "glider"], "difficult_direct_answer": false, "rationales": ["A wooden chair with a wood block on each side. it slides back and forth without the back leaning at all backwards.", "There is a sliding mechanism on the bottom of the chair.", "The chair can glide since it has wide feet."], "image": "train2014/COCO_train2014_000000376819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482728, "question_id": "TeUytCHrEAt5Jfr4TQE4PG", "question": "From which fruit does the beverage served here come?", "choices": ["banana", "melon", "grape", "kiwi"], "correct_choice_idx": 2, "direct_answers": ["grapes", "grapes", "grape", "grapes", "grape", "grape", "grape", "grapes", "grapes", "grape"], "difficult_direct_answer": false, "rationales": ["Wine is made from grapes.", "The fruit is grapes.", "Wine comes from grapes."], "image": "train2014/COCO_train2014_000000482728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467245, "question_id": "TenTPyRTW7e64xYLDnGArH", "question": "Why are the tops of waves white?", "choices": ["heat", "big bubbles", "cold", "scattering"], "correct_choice_idx": 3, "direct_answers": ["clouds", "science", "cream", "foam", "air", "scattered light", "froth", "scattering", "foam", "foam"], "difficult_direct_answer": false, "rationales": ["The water has scattered.", "When the water moves quickly, it causes bubbles to form at the tops of the waves created.", "The motion of the wave causes air to mix in with the water."], "image": "train2014/COCO_train2014_000000467245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119815, "question_id": "TeravzWCHdKaMnJrhpxz9z", "question": "What is the woman pushing in the carriage?", "choices": ["dog", "cat", "doll", "baby"], "correct_choice_idx": 3, "direct_answers": ["baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby"], "difficult_direct_answer": false, "rationales": ["A stroller is to put a young or newborn in.", "The carriage has a baby placed in it for easy movement.", "The woman has a baby."], "image": "train2014/COCO_train2014_000000119815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307061, "question_id": "TezhcizqGqdwHwj7riVBiE", "question": "What is the texture of the brown object?", "choices": ["wool", "leather", "pic", "fur"], "correct_choice_idx": 3, "direct_answers": ["furry", "soft", "hair", "fur", "soft", "soft", "soft", "fur", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["It's obviously a pet.", "The dog's coat is furry.", "It is the hair of a dog."], "image": "train2014/COCO_train2014_000000307061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328068, "question_id": "Tfj2HZTZuReLu2KE4dZbmm", "question": "Why is the boy reaching for the racquet?", "choices": ["to nap", "to play", "to eat", "to kick"], "correct_choice_idx": 1, "direct_answers": ["to play", "playing tennis", "to play", "his turn", "his turn", "to play", "for tennis", "to play", "to play", "to play"], "difficult_direct_answer": false, "rationales": ["He is wanting to play with it.", "With the setting and the racket she is giving him you can ascertain what he wants to do.", "A boy is approaching a women who is holding out a tennis racket. the boy is reaching out for the racket."], "image": "val2014/COCO_val2014_000000328068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305710, "question_id": "TfkGYAAxQxdZBmTRm9hEhC", "question": "Why do these people need hats?", "choices": ["dress code", "sun", "warmth", "rain"], "correct_choice_idx": 1, "direct_answers": ["sun", "sun burn", "hot sun", "sun", "hot sun", "sun", "sunshine", "shade", "sunny", "sunny"], "difficult_direct_answer": false, "rationales": ["They are wearing hats to keep the sun out of their eyes so they can see better.", "There is sun shining above. the hats protect them.", "To protect their heads from the bright sun."], "image": "train2014/COCO_train2014_000000305710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26780, "question_id": "Tg35X3cjgg9n5Y8pHH8uk9", "question": "What action is the woman ready to take?", "choices": ["hit", "run", "smash", "throw"], "correct_choice_idx": 3, "direct_answers": ["throw", "throwing", "throw frisbee", "throwing", "throw", "throwing frisbee", "pitch frisbee", "throw", "toss frisbee", "throw frisbee"], "difficult_direct_answer": false, "rationales": ["The woman is holding a frisbee and based on her body positioning and the intended use of this object, she is preparing to do answer a.", "Because of the motion she is in.", "The woman is holding a frisbee in the way that a person would throw it."], "image": "train2014/COCO_train2014_000000026780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262235, "question_id": "TgFtRbNbT5h4W52c3RF5oK", "question": "How is she communicating?", "choices": ["shouting", "telegraph", "computer", "phone"], "correct_choice_idx": 3, "direct_answers": ["cell phone", "phone", "phone", "cell phone", "phone", "phone", "phone", "cell phone", "cellphone", "phone"], "difficult_direct_answer": false, "rationales": ["She's has the cellular device up to her ear which means someone is talking on the other line.", "She is holding it up to her ear", "The woman is holding an item to her ear which would not be required to talk on a computer or telegraph and she does not appear to be shouting"], "image": "val2014/COCO_val2014_000000262235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384007, "question_id": "TgcxKe82rtTTXnNomJPaPD", "question": "What animal is facing the herd?", "choices": ["rabbit", "cat", "fox", "dog"], "correct_choice_idx": 3, "direct_answers": ["dog", "picture didn'tload", "sheepdog", "dog", "dog", "dog", "dog", "dog", "border collie", "dog"], "difficult_direct_answer": false, "rationales": ["The dog is facing the herd.", "The animal is a dog.", "The animal has the right general shape and size of a dog in addition to the ear and tail shape. dogs are also the most frequently used animal for herding and this particular animal is facing a herd of sheep."], "image": "val2014/COCO_val2014_000000384007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113455, "question_id": "TggiWNkrgHmC75ewxk93xm", "question": "Who manufactured the SUV on the right?", "choices": ["toyota", "ford", "chevrolet", "honda"], "correct_choice_idx": 0, "direct_answers": ["toyota", "jeep", "toyota", "tahoe", "toyota", "motorists", "too far", "toyota", "honda", "subaru"], "difficult_direct_answer": false, "rationales": ["A logo is on a car to the right of a motorcycle.", "Based on the visible brand logo on hatchback door.", "The logo of the vehicle is visible and belongs to answer a."], "image": "train2014/COCO_train2014_000000113455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514990, "question_id": "TgihiMPkiYZnYxk3Mbqdhn", "question": "What venue is shown in the image?", "choices": ["pizzeria", "ferry", "cruise ship", "hotel"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "cruise", "lunch", "restaurant", "cruise ship", "ship", "boat", "ship cabin", "restaurant", "ship"], "difficult_direct_answer": false, "rationales": ["There is water outside the windows.", "You can tell by looking out the window as to where they are.", "The people are sitting and eating, but they are obviously in a boat because there is water on the side."], "image": "val2014/COCO_val2014_000000514990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556643, "question_id": "TgjWiZWg5wv2uW2MFvXqfF", "question": "What type of skateboarding would this be considered?", "choices": ["park", "street", "vert", "big air"], "correct_choice_idx": 1, "direct_answers": ["extreme", "trick", "kickflip", "amateur", "trick", "wheelie", "street", "kickflip", "freestyle", "street"], "difficult_direct_answer": false, "rationales": ["The person is skating near cars in driveways.", "The skateboarder is on a residential road as opposed to a park set up specifically for the sport.", "The skateboard is on a street."], "image": "train2014/COCO_train2014_000000556643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381948, "question_id": "Th8ExP4napWqWqkDV3ccC4", "question": "What kind of transport aircraft flies above?", "choices": ["cargo", "passenger", "private", "military"], "correct_choice_idx": 3, "direct_answers": ["military", "cargo", "commercial", "passengers", "airplane", "air force", "military", "cargo plane", "airplane", "cargo"], "difficult_direct_answer": false, "rationales": ["This is a military plane.", "A military plane is flying.", "The aircraft above is a military aircraft."], "image": "train2014/COCO_train2014_000000381948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289900, "question_id": "ThBazkkNdNjhDBp83LxoSt", "question": "Why is the woman on the right holding an object in her hands?", "choices": ["hitting balls", "taking photos", "exercising", "playing games"], "correct_choice_idx": 1, "direct_answers": ["taking picture", "photographing", "taking photos", "taking picture", "camera", "pictures", "documenting", "phone", "picture taking", "take picture"], "difficult_direct_answer": true, "rationales": ["The woman is taking photos.", "A woman is holding her phone up, pointed at a crowd of people.", "The object looks like a phone. she seems to be recording the event."], "image": "val2014/COCO_val2014_000000289900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347995, "question_id": "ThW9tryGjMKF8NwknD2SC5", "question": "Which item is located closest to the woman?", "choices": ["book", "dog", "cat", "baby"], "correct_choice_idx": 3, "direct_answers": ["baby", "baby", "baby", "baby", "dog", "baby", "baby", "baby", "baby", "dog"], "difficult_direct_answer": false, "rationales": ["There is a baby on top of the woman.", "The infant is laying on her chest.", "A woman is laying in bed with a baby on her chest."], "image": "val2014/COCO_val2014_000000347995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33871, "question_id": "ThqegzWg9pVVf7mCM5j53V", "question": "If this person wanted to wash their hands where would they have to go?", "choices": ["here", "gas station", "bathroom", "kitchen"], "correct_choice_idx": 2, "direct_answers": ["sink", "sink", "bathroom", "bathroom", "sink", "bathroom", "nowhere", "sink", "sink", "bathroom"], "difficult_direct_answer": false, "rationales": ["This is a display and unlikely connected to water.", "A woman stands at a display in a large showroom area.", "The person is looking at a mock-up sink rather than a real, working sink."], "image": "val2014/COCO_val2014_000000033871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513555, "question_id": "TiCbhb4o74DgCzJYFowKq4", "question": "Who is most likely to be the boat driver?", "choices": ["young man", "boy", "woman", "old man"], "correct_choice_idx": 0, "direct_answers": ["young man", "standing man", "man standing", "black shirt", "blue shirt", "red shirt", "black shirt", "red shirt", "black shirt", "young man"], "difficult_direct_answer": false, "rationales": ["The young man since he is standing by the steering wheel.", "The man looks well-built and the older man has white hair and is sitting down.", "The man in the black shirt will probably be controlling the boat."], "image": "val2014/COCO_val2014_000000513555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507927, "question_id": "TiGQrNvFKLFH4xaFYAAhBZ", "question": "In which type setting are the bikers?", "choices": ["city", "collegiate", "mall", "farm"], "correct_choice_idx": 0, "direct_answers": ["historical setting", "biker rally", "city", "parade", "city", "street", "bike rally", "parade", "city", "rest"], "difficult_direct_answer": false, "rationales": ["It might actually be a large town.", "The buildings and streetlight indicate that this is an urban setting.", "There are buildings lining the street."], "image": "val2014/COCO_val2014_000000507927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165757, "question_id": "TiHtfXDr4Yzr7BjzUbYRv8", "question": "What color does the person who has a birthday wear?", "choices": ["none", "blue", "white only", "purple white"], "correct_choice_idx": 3, "direct_answers": ["purple", "purple", "purple white", "purple", "purple", "purple", "purple", "purple", "purple", "purple"], "difficult_direct_answer": false, "rationales": ["She has a boa on her.", "This seems to be the birthday child.", "She is the one in front of the cake"], "image": "train2014/COCO_train2014_000000165757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450457, "question_id": "TiYxxXTq5Z56z2oMKs3yAF", "question": "These animals are drinking from what type of water resource?", "choices": ["puddle", "watering hole", "river", "flood"], "correct_choice_idx": 1, "direct_answers": ["lake", "pond", "watering hole", "stream", "pool", "pond", "river", "water", "puddle", "pond"], "difficult_direct_answer": false, "rationales": ["They are drinking from water in a hole", "The animals are standing around a large circle of water that formed in the middle of a dry area, and this is known as a watering hole.", "The animals are drinking in the wild."], "image": "train2014/COCO_train2014_000000450457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389112, "question_id": "TjrCQKGbbQzuNMv859ibS9", "question": "What is the person without a skateboard using for transportation?", "choices": ["bicycle", "unicycle", "scooter", "tricycle"], "correct_choice_idx": 1, "direct_answers": ["unicycle", "unicycle", "unicycle", "unicycle", "unicycle", "bike", "bicycle", "unicycle", "dangerous", "unicycle"], "difficult_direct_answer": false, "rationales": ["You can tell by the one wheel design as to what he in the background is riding.", "It has one wheel and no handlebars.", "They have a one wheeled bike"], "image": "train2014/COCO_train2014_000000389112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183905, "question_id": "TkBeEgyEQapkKX5BZFE2ke", "question": "What is she doing with the banana?", "choices": ["cooking it", "selling it", "cleaning it", "stealing it"], "correct_choice_idx": 0, "direct_answers": ["cooking it", "spearing it", "poking it", "spearing", "skewering", "dropping", "bbq", "pierced it", "holding it", "grilling it"], "difficult_direct_answer": true, "rationales": ["She is cooking the banana on a fire.", "She has the banana skewered. when food items are skewered they are usually being cooked.", "She has it on a skewer that is used to heat food"], "image": "val2014/COCO_val2014_000000183905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288313, "question_id": "TkSjdEFaLXRDvtFM2GLKQP", "question": "What is the green stuff called on the plate?", "choices": ["pea", "corn", "pickle", "spinach"], "correct_choice_idx": 2, "direct_answers": ["pickles", "pickles", "pickle", "pickles", "pickles", "pickles", "pickle", "pickle", "pickles", "pickle"], "difficult_direct_answer": false, "rationales": ["Sliced, green circles are on a plate with other vegetables.", "Sliced pickles sit at the top of the plate. as most pickle slices are, these are cut with a \"crinkle\" knife.", "Pickles are made from cucumbers so they are both green."], "image": "val2014/COCO_val2014_000000288313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517570, "question_id": "TkSo5yYrv5QcXaSPSqnkpQ", "question": "What is the goal of the man working on the boat here?", "choices": ["waterproofing", "decorative", "spy craft", "weight loss"], "correct_choice_idx": 0, "direct_answers": ["painting", "waterproofing", "plug pleaks", "paint boat", "repair", "seal wood", "paint it", "painting", "waterproofing", "patch holes"], "difficult_direct_answer": false, "rationales": ["He is fixing the boat so there won't be any leaks", "The man is painting the bottom of the boat to waterproof it and protect it from wear.", "A man is standing beside a boat with a brush and can. boats are sealed and painted to waterproof them."], "image": "val2014/COCO_val2014_000000517570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216278, "question_id": "TkUgYnGdqzVSYM5TvS3Hbp", "question": "On what side of the bus should they go if they want to take the metro?", "choices": ["left", "right", "back", "front"], "correct_choice_idx": 0, "direct_answers": ["left", "right", "left", "left side", "right", "left", "left", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["There is no door on the right so the door must be on the left.", "The side is left.", "The passengers should get off on the sidewalk."], "image": "train2014/COCO_train2014_000000216278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529624, "question_id": "Tkdn2iPJyG4qQbSKD2fWA5", "question": "Why are they clasping hands?", "choices": ["struggling", "afraid", "sportsmanship", "fighting"], "correct_choice_idx": 2, "direct_answers": ["good sports", "congratulations", "sportsmanship", "sportsmanship", "congratulations", "sportsmanship", "sportsmanship", "sportsmanship", "congratulating", "sportsmanship"], "difficult_direct_answer": false, "rationales": ["The players are smiling at each other.", "Although they're competitors, the tennis players are smiling and sharing a handshake, which is an indicator of good sportsmanship.", "The women are happy with how they were playing."], "image": "train2014/COCO_train2014_000000529624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351382, "question_id": "TkkG7gzLevXuczebDG8oQ2", "question": "Why is the water shooting at the plane?", "choices": ["chasing it", "ceremony", "prank", "cleaning"], "correct_choice_idx": 1, "direct_answers": ["hose", "washing plane", "cleaning", "cooling it", "clean", "welcoming", "ceremony", "fire drill", "fire", "extinguish fire"], "difficult_direct_answer": true, "rationales": ["The planes are being cleaned.", "The airplane is at the airport strip and the maintenance crew is around the plane with one crew spraying water on the plane indicating that it is about to get clean.", "The water is for a ceremony."], "image": "train2014/COCO_train2014_000000351382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230780, "question_id": "TkzipmP4YuZJADXSRiGv7o", "question": "What sort of beverages are sold here?", "choices": ["milk", "sodas", "mixers", "alcoholic"], "correct_choice_idx": 3, "direct_answers": ["alcohol", "alcoholic", "alcoholic", "wine", "alcoholic", "alcoholic", "alcohol", "wine", "alcohol", "alcoholic"], "difficult_direct_answer": false, "rationales": ["There are bottles of alcohol on the shelf.", "Alcohol is shown.", "They are all found inside a liquor store with a variety of bottles."], "image": "val2014/COCO_val2014_000000230780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266200, "question_id": "Tm6huTFMHLiRNDxcyk2SoM", "question": "The person under the orange umbrella is whom?", "choices": ["dog walker", "police officer", "queen", "life guard"], "correct_choice_idx": 3, "direct_answers": ["lifeguard", "lifeguard", "lifeguard", "lifeguard", "lifeguard", "lifeguard", "life guard", "lifeguard", "lifeguard", "lifeguard"], "difficult_direct_answer": false, "rationales": ["They are up on a platform so they can see people in the water easier", "The person on the orange chair is elevated and watching over others at the beach.", "The person is on a raised seat on the beach which is a consistent position for answer a to be seated in this setting."], "image": "train2014/COCO_train2014_000000266200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393317, "question_id": "TmEVgMiocXvXuSfYz5kz9X", "question": "Why is the street shiny?", "choices": ["just rained", "newly topped", "bright sunshine", "just painted"], "correct_choice_idx": 0, "direct_answers": ["bus", "ot's wet", "rained", "rain", "rainy time", "wet", "just rained", "just rained", "sun brightness", "recent rain"], "difficult_direct_answer": true, "rationales": ["The surface is shiny due to wetness, because it has recently rained.", "The ground is wet.", "There was a rain shower."], "image": "train2014/COCO_train2014_000000393317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522066, "question_id": "TmFeFpGmwtwfi9M65Jdi5S", "question": "What can you see in the sky?", "choices": ["fireworks", "birds", "sunset", "stars"], "correct_choice_idx": 2, "direct_answers": ["sky", "sunset", "clouds", "sunset", "sunset", "sunset", "clouds", "clouds", "clouds", "clouds"], "difficult_direct_answer": false, "rationales": ["The sun is turning the sky purple.", "You can see the sun setting in the sky.", "The somewhat dark purple color of the coming sunset."], "image": "train2014/COCO_train2014_000000522066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365470, "question_id": "TmVNM9wcPKW5cUUc7hGFSS", "question": "Why is the man using a rope with the animal?", "choices": ["to whip", "to lead", "to restrain", "to punish"], "correct_choice_idx": 1, "direct_answers": ["lead", "leading", "travel", "no straying", "walk it", "prevent runaways", "control", "to guide", "to lead", "guide"], "difficult_direct_answer": true, "rationales": ["The man is using the rope to lead the donkey.", "The rope is used to control the horse.", "The man is leading."], "image": "train2014/COCO_train2014_000000365470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273961, "question_id": "Tn465GbrgCrkJGKyFRxKEJ", "question": "What skateboarding trick is the man trying?", "choices": ["dumbo", "front flip", "pogo", "tail whip"], "correct_choice_idx": 2, "direct_answers": ["flip", "jump", "flip", "nothing", "jumping", "pogo", "jump", "olly", "toe flip", "jumping"], "difficult_direct_answer": false, "rationales": ["The boarder is trying to jump as if he is on a pogo stick.", "The skateboarder is trying to flip the skateboard and stand on one wheel.", "He is jumping up in the air like a pogo stick."], "image": "val2014/COCO_val2014_000000273961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64897, "question_id": "TnP72oSuAvRyMvcztoVLKb", "question": "How long until the man lands?", "choices": ["minutes", "days", "hours", "seconds"], "correct_choice_idx": 3, "direct_answers": ["one second", "one second", "seconds", "soon", "seconds", "seconds", "one second", "one second", "seconds", "one second"], "difficult_direct_answer": false, "rationales": ["The man is doing a skateboarding trick. he is very close to the ground.", "The man is not far from the ground so it will not take him long to fall.", "A man is in the air on a skateboard above a ramp."], "image": "train2014/COCO_train2014_000000064897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214009, "question_id": "TngTspaXDkkWNd29x5QuzA", "question": "What are the birds greatest in number in the water?", "choices": ["crows", "pelicans", "flamingos", "seagulls"], "correct_choice_idx": 2, "direct_answers": ["flamingos", "gulls", "storks", "flamingo", "flamingos", "lot", "flamingos", "flamingos", "100", "seagulls"], "difficult_direct_answer": false, "rationales": ["They are pink and have the body and look of flamingos.", "The large pink birds are in the water. the rest are flying or on the shore.", "There are many long necked pink birds sitting in the water."], "image": "val2014/COCO_val2014_000000214009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397605, "question_id": "TnmsmkpizUAY8p5TSpcRL7", "question": "What did the man in white just do?", "choices": ["kissed wife", "boarded plane", "won game", "struck baseball"], "correct_choice_idx": 3, "direct_answers": ["strike out", "swing bat", "hit ball", "hit baseball", "hit ball", "swing", "swing", "hit", "struck baseball", "broke bat"], "difficult_direct_answer": false, "rationales": ["The man struck the ball.", "The man is a baseball player and he has just hit a ball. his position and the motion indicates as much.", "His hands are are \"gestured\" in a way that shows that he hit the ball and has one leg in front of the other."], "image": "train2014/COCO_train2014_000000397605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486360, "question_id": "TnoA9YH2bWBnsWXv7ehYsV", "question": "What is on the string the men hold?", "choices": ["bird", "kite", "top", "dog"], "correct_choice_idx": 1, "direct_answers": ["kite", "butterfly", "kite", "kite", "kites", "kites", "kite", "kites", "kites", "kite"], "difficult_direct_answer": false, "rationales": ["It holds them to keep from flying away", "The image depicts multiple examples of answer a which is known to require a string to prevent it flying away.", "The image shows a lot of kites and people holding on to the strings typically attached to a kite."], "image": "val2014/COCO_val2014_000000486360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209056, "question_id": "TnrDB5i5Ne6SU2cgJ84tHp", "question": "What can this boarder watch while skateboarding here?", "choices": ["park grass", "ocean", "mall", "road rage"], "correct_choice_idx": 1, "direct_answers": ["ocean", "volleyball", "volleyball game", "beach", "volleyball game", "ocean", "beach", "volleyball", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["There is sand and then far off in the distance water which means he's at the beach.", "The person is skateboarding and the beach and water can be seen in the background.", "A man is skateboarding near the beach."], "image": "train2014/COCO_train2014_000000209056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25997, "question_id": "TnzrCSpTAeJiSLx3RPpL7q", "question": "Why is he holding his hand out?", "choices": ["is pointing", "to balance", "is confused", "to catch"], "correct_choice_idx": 1, "direct_answers": ["for balance", "pretend surf", "posing", "surfing", "balance", "pretending surfing", "balance", "balance", "balance", "to balance"], "difficult_direct_answer": false, "rationales": ["The boy wants to balance on the board.", "The boy wants to balance.", "The boy wants to use his hand for balance to keep standing straight."], "image": "val2014/COCO_val2014_000000025997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416648, "question_id": "To5AkjjZSNnPWB7dqEAqyb", "question": "In what year was this company's home state admitted to the Union?", "choices": ["1900", "1875", "1896", "1912"], "correct_choice_idx": 2, "direct_answers": ["smile", "1896", "years ago", "1896", "1896", "1896", "eighteen ninetysix", "this year", "ink", "1920"], "difficult_direct_answer": false, "rationales": ["Red touch is in utah and that year is when they were admitted into the country.", "Retouch media is headquartered in utah and that state was admitted to the union in this year.", "Red touch's home stated was admitted in 1896."], "image": "train2014/COCO_train2014_000000416648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460897, "question_id": "ToGfuX8MpoZynivZecDbzF", "question": "What is the man in white shirt doing?", "choices": ["racing", "exercising", "working", "touring"], "correct_choice_idx": 2, "direct_answers": ["driving bike", "riding bike", "cycling", "riding bike", "working", "working", "pedaling", "pedaling", "riding wagon", "bicycling"], "difficult_direct_answer": false, "rationales": ["The man is working to transport passengers.", "He is peddling a rickshaw that carries people to their destination.", "The driver of this kind of vehicle is used to give rides to others who sit in the back. if he is currently riding it is likely he is able to be hired and is thus working."], "image": "train2014/COCO_train2014_000000460897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142262, "question_id": "ToL7UUxjM4f9DATTLFeZKq", "question": "Skateboard is made up of what wood?", "choices": ["plum", "maple", "apple", "pine"], "correct_choice_idx": 1, "direct_answers": ["maple", "maple", "oak", "plywood", "maple", "maple", "maple", "maple", "birch", "maple"], "difficult_direct_answer": false, "rationales": ["The skateboard uses maple wood.", "That wood is strong for riding.", "Maple wood is what snowboards are made from."], "image": "train2014/COCO_train2014_000000142262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331419, "question_id": "ToLWwS8QuMQQfkVPNkDLjn", "question": "Why is the dog wearing a muzzle?", "choices": ["prevent biting", "prevent drinking", "prevent eating", "prevent whining"], "correct_choice_idx": 0, "direct_answers": ["it bites", "pound", "aggressive", "prevent barking", "safety", "prevent biting", "aggressive", "stop biting", "safety", "he bites"], "difficult_direct_answer": false, "rationales": ["The dog is being kept from biting others.", "The muzzle is being used as a safety measure.", "There are many dogs locked in a large cage. the dog in front has a round piece that keeps him from getting ahold of someone with their teeth."], "image": "train2014/COCO_train2014_000000331419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224647, "question_id": "ToZTGh4uaK8qvSL7hYV78h", "question": "What type of shoes would be better for this activity?", "choices": ["sneakers", "boots", "flip flops", "heels"], "correct_choice_idx": 0, "direct_answers": ["tennis shoes", "tennis shoes", "sneakers", "sneakers", "tennis shoes", "running shoes", "sneakers", "sneakers", "sneakers", "sneakers"], "difficult_direct_answer": false, "rationales": ["The shoes are sneakers.", "For the sports that she is going to do and most sporting events need robust foot wear to avoid hurting themsleves.", "Sneakers are good for ease of movement during sports."], "image": "val2014/COCO_val2014_000000224647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514180, "question_id": "ToqbKogppB8RH8KEhogtsd", "question": "What type of pizza is in the front?", "choices": ["sicilian", "thin crust", "pan pizza", "flatbread"], "correct_choice_idx": 2, "direct_answers": ["cheese", "ham", "pepperoni", "pepperoni", "pepperoni", "ham", "deep dish", "pan pizza", "pepperoni", "cheese"], "difficult_direct_answer": false, "rationales": ["Looks like pepperoni pan pizza", "The pizza is in a pan.", "The utensil being used explains the answer."], "image": "val2014/COCO_val2014_000000514180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464443, "question_id": "Tp2WhvbgVqYM2DC4JvnnYs", "question": "What is the man wearing over his body?", "choices": ["towel", "shirt", "snuggie", "smock"], "correct_choice_idx": 2, "direct_answers": ["blanket", "blanket", "robe", "snuggie", "blanket", "robe", "snuggie", "blanket", "blanket", "snuggie"], "difficult_direct_answer": false, "rationales": ["The man is snuggling.", "He is wearing a robe type blanket.", "The man is currently wearing a cloth like material which is cross between a blanket and robe. another word for this is snuggle."], "image": "train2014/COCO_train2014_000000464443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445334, "question_id": "Tp8pRky2eLYF7hRcDRsdjK", "question": "What is the animal used for in the sport depicted on the shirt?", "choices": ["riding", "sniffing", "hunting", "catching"], "correct_choice_idx": 0, "direct_answers": ["riding", "horse", "polo", "horse", "polo", "polo", "riding", "polo", "horse", "polo"], "difficult_direct_answer": false, "rationales": ["The girl has a horse on her shirt which is an animal that people can ride.", "The sport on the shirt is polo. people ride horses to play polo.", "People ride horses."], "image": "val2014/COCO_val2014_000000445334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181260, "question_id": "TpRQhShAJvjLpYi52JfweE", "question": "What is the food being stored in?", "choices": ["fridge", "bags", "jars", "cans"], "correct_choice_idx": 2, "direct_answers": ["mason jar", "jars", "jars", "jars", "glass jars", "jars", "jars", "jar", "jars", "vegetables"], "difficult_direct_answer": false, "rationales": ["People like to put food in them to have an entire meal in on vessel.", "The containers are clearly visible and based on the material of their lid and the main body they would be answer a.", "The food is in jars."], "image": "train2014/COCO_train2014_000000181260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271546, "question_id": "Tpbd7LTAPDgniTS3szkj7b", "question": "What is the player in yellow doing?", "choices": ["bunting", "returning ball", "serving", "striking"], "correct_choice_idx": 1, "direct_answers": ["swinging", "hitting ball", "playing tennis", "returning serve", "returning ball", "hitting ball", "tennis", "practicing hit", "lobbing", "playing"], "difficult_direct_answer": true, "rationales": ["A man is holding a racket near a ball and aiming to swing it which means he is returning the ball.", "They are going to return the ball to the other side.", "The player wants to serve the ball back."], "image": "val2014/COCO_val2014_000000271546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445443, "question_id": "Tpd2u7hCoeKobtsXdvGDqt", "question": "What is near the man?", "choices": ["egg", "moose", "werebear", "child"], "correct_choice_idx": 3, "direct_answers": ["child", "child", "kite", "kite", "kite", "kite", "child", "kite", "kite", "child"], "difficult_direct_answer": false, "rationales": ["The man is standing near a child that is watching him work", "There is a small boy near the man.", "A child is standing close to the man."], "image": "train2014/COCO_train2014_000000445443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63307, "question_id": "Tq3zBJxaGY5tEror9N23ss", "question": "What part of his uniform is he least likely to wear if he plays when he's older?", "choices": ["socks", "sneakers", "jeans", "watch"], "correct_choice_idx": 2, "direct_answers": ["seven", "shirt", "pants", "jeans trouser", "jeans", "jeans", "jeans", "play sports", "jeans", "jeans"], "difficult_direct_answer": false, "rationales": ["These will become too small as he gets bigger", "He will have to wear the uniforms older players wear.", "When you play baseball when you are older you get baseball pants the the kid will likely not wear jeans next time."], "image": "train2014/COCO_train2014_000000063307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539147, "question_id": "Tq6yaHYBwACEiou2JG68Sb", "question": "What is the man skating bending his knees?", "choices": ["to jump", "stability", "to flip", "to spin"], "correct_choice_idx": 1, "direct_answers": ["balance", "skateboard", "skateboard", "doing tricks", "balance", "turning", "stability", "skateboard", "balance", "skateboard"], "difficult_direct_answer": false, "rationales": ["The man on the skateboard has his knees bent for stability and balance.", "The man is bending his knees because he is going down a steep slope and might fall.", "The man does not want to fall."], "image": "train2014/COCO_train2014_000000539147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500330, "question_id": "Tq7iitxiMtCsd3A8r7Djcn", "question": "What would the average person need to do to use the microwave here?", "choices": ["ask", "bend down", "stretch up", "turn around"], "correct_choice_idx": 1, "direct_answers": ["bend down", "heat food", "read", "stoop", "bend over", "bend", "bend over", "close dishwasher", "bend over", "bend down"], "difficult_direct_answer": false, "rationales": ["The microwave is located below the drawer and lower than the countertop where microwaves usually are found.", "The microwave is below the counter which means most people would need to bend down.", "They would have to bend over to use the microwave."], "image": "val2014/COCO_val2014_000000500330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501758, "question_id": "Tq9oaN92W2tKGBjk5AQfsV", "question": "Who will dry the dishes here?", "choices": ["woman", "man", "no one", "automatic dishwasher"], "correct_choice_idx": 3, "direct_answers": ["woman", "woman", "woman", "man", "automatic dishwasher", "woman", "man", "woman", "man", "man"], "difficult_direct_answer": false, "rationales": ["The man is standing in front of a standard dishwasher. logic would be that it will be used to wash and dry these dishes.", "The dishes have already been dried.", "The automatic dishwasher that is next to the fridge."], "image": "train2014/COCO_train2014_000000501758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416098, "question_id": "TqSd4gsGhUPD9uiMpSzqQr", "question": "What type weather is the beach setting having here?", "choices": ["snowy", "rain", "sleet", "windy"], "correct_choice_idx": 3, "direct_answers": ["hot", "sunny", "windy", "sunny", "sunny", "sunny", "sunny", "hot", "sunny", "hot"], "difficult_direct_answer": false, "rationales": ["There is windy weather on the beach here.", "There is a kite in the air. the person's hair is blowing.", "The weather is windy."], "image": "val2014/COCO_val2014_000000416098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56109, "question_id": "TqStjGtrTdXqL4Xm5FXZEr", "question": "This bus takes passengers to a stop on what subway system?", "choices": ["montreal metro", "paris metro", "berlin u-bahn", "london underground"], "correct_choice_idx": 1, "direct_answers": ["paris metro", "paris metro", "tube", "paris", "paris", "i do", "paris metro", "no idea", "french", "paris"], "difficult_direct_answer": false, "rationales": ["The destination of the bus is displayed on top of its frontside.", "This is the bus in paris that is used for public transportation.", "It's what is indicated on the bus."], "image": "train2014/COCO_train2014_000000056109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281127, "question_id": "TqYjixLzuegsZ6Lkpt3BG3", "question": "What season does the tree indicate it is?", "choices": ["fall", "summer", "spring", "winter"], "correct_choice_idx": 3, "direct_answers": ["winter", "winter", "winter", "fall", "autumn", "fall", "winter", "winter", "autumn", "winter"], "difficult_direct_answer": false, "rationales": ["The tree with no leaves makes it look like it's cold.", "The tree has lost its leaves which is known to happen in season answer a.", "It seems to be dry as the tree has no twins around it."], "image": "train2014/COCO_train2014_000000281127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484488, "question_id": "TqayD2oGQGZDwxrNdBWbcf", "question": "Why is the woman holding an open umbrella behind her back?", "choices": ["to signal", "to dance", "to fight", "to pose"], "correct_choice_idx": 3, "direct_answers": ["posing", "shade", "posing", "to pose", "wind", "wind", "posing", "posing", "posing", "posing"], "difficult_direct_answer": false, "rationales": ["The woman is striking a pose.", "The woman looks like she is getting ready to take a picture. it is not raining or anything either.", "The woman is standing with the open umbrella because she is posing for the camera."], "image": "train2014/COCO_train2014_000000484488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235832, "question_id": "TqbHZhwS8SSvnxJV3u8bRG", "question": "Where might the truck in yellow be headed?", "choices": ["car lot", "home", "junk yard", "grocery store"], "correct_choice_idx": 2, "direct_answers": ["dump", "dump", "dump", "junk yard", "dump", "collect trash", "garbage dump", "garbage dump", "dump", "city dump"], "difficult_direct_answer": false, "rationales": ["A truck is on the road and is full of garbage.", "The truck is for junk.", "The items in the back of the truck look like junk that is ready to be discarded."], "image": "val2014/COCO_val2014_000000235832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72582, "question_id": "TqypsbL3irF2dgnzkZwwfu", "question": "What temperature beverage is found in the carafe here?", "choices": ["cold", "room temperature", "no beverage", "hot"], "correct_choice_idx": 3, "direct_answers": ["hot coffee", "hot", "room temp", "hot", "hot tea", "hot", "hot", "coffee", "tea", "hot"], "difficult_direct_answer": false, "rationales": ["The beverage is hot.", "Coffee is usually hot.", "Carafes can keep liquids at a high temperature."], "image": "val2014/COCO_val2014_000000072582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284088, "question_id": "TqzWe2uLHYnRK5rujBZVDN", "question": "Which direction are kayakers facing?", "choices": ["upside down", "down stream", "bank wards", "up river"], "correct_choice_idx": 3, "direct_answers": ["downstream", "opposite ones", "right", "right", "downriver", "downstream", "north", "down stream", "up river", "forward"], "difficult_direct_answer": false, "rationales": ["They are going against the current.", "The kayakers are rowing up stream.", "The rapids are going the opposite direction"], "image": "train2014/COCO_train2014_000000284088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297859, "question_id": "TrMuaEAFQnFRQhy327ksDw", "question": "Why are these animals here?", "choices": ["being fed", "on display", "resting", "for sale"], "correct_choice_idx": 1, "direct_answers": ["zoo", "human entertainment", "zoo", "zoo", "on display", "zoo", "preservation", "want food", "zoo", "zoo"], "difficult_direct_answer": false, "rationales": ["The animals appear to be in an enclosure and are of an exotic variety that is not commonly kept as pet or livestock which is consistent with answer a.", "The animals are at a zoo or safari.", "The giraffes are in an enclosure at a zoo or animal park."], "image": "train2014/COCO_train2014_000000297859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504616, "question_id": "TrZghozCnfHpMnusukJdok", "question": "What is he holding on the paddle?", "choices": ["pizza", "taco", "hamburger", "chicken"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["The man is standing in front of a pizza oven.", "This type of paddle being used with an oven would likely be for the purposes of cooking answer a.", "The paddle is designed to hold a large round piece of dough with toppings when you put it in the oven."], "image": "train2014/COCO_train2014_000000504616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298151, "question_id": "TrbTiyBoUyYgK8Pi76rHDA", "question": "What is the name of the nut on the plate?", "choices": ["peanut", "walnut", "cashew", "pistachio"], "correct_choice_idx": 1, "direct_answers": ["walnut", "walnut", "peanut", "walnut", "walnut", "cocoa", "walnut", "walnut", "pecan", "walnut"], "difficult_direct_answer": false, "rationales": ["The hard shell is very distinguishable as a walnut.", "Walnuts are shown on the plate.", "The nut has a walnut shell."], "image": "val2014/COCO_val2014_000000298151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82969, "question_id": "TrePgRB95QTRssMqwXb5Dc", "question": "Why does sh hold the slice with both hands?", "choices": ["prevent theft", "prevent dropping", "stay warm", "stay clean"], "correct_choice_idx": 1, "direct_answers": ["balance", "fall down", "feed", "big", "it's big", "to eat", "eating", "to eat", "keep intact", "prevent dropping"], "difficult_direct_answer": true, "rationales": ["The slice is preventing dropping.", "The pizza slice is big so it needs two hands to eat it properly.", "That is a big piece of food that may fall if not held on to."], "image": "train2014/COCO_train2014_000000082969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398493, "question_id": "Trk7ivQSRSbCC9Lf4Pydq7", "question": "What is this place called?", "choices": ["halfpipe", "skatepark", "sink", "skate tube"], "correct_choice_idx": 1, "direct_answers": ["skate park", "skate park", "skate park", "skate park", "skate park", "skatepark", "skate park", "skate park", "skate park", "skate park"], "difficult_direct_answer": false, "rationales": ["There are skateboarders gathered at the special park designed for exactly this use.", "The person is at a skatepark where people ride skateboards on ramps.", "The place is populated by people with skateboards and the specific style of the features and the texture it appears that it's intended use is for skateboarding. parks built this way to be used by skateboarders are called answer a."], "image": "train2014/COCO_train2014_000000398493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295642, "question_id": "TrnBREmFaM8X7NMpnv8j2t", "question": "Which entity is in the greatest danger?", "choices": ["dog", "tall man", "short man", "right man"], "correct_choice_idx": 0, "direct_answers": ["people", "animal crossing", "crane", "rail", "dog", "person", "people", "black dog", "dog", "man"], "difficult_direct_answer": false, "rationales": ["A dog is in the middle of a track where a train is approaching.", "The dog is crossing in front of the train and might be in danger.", "A dog is on tracks with a train nearby."], "image": "train2014/COCO_train2014_000000295642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201519, "question_id": "TrtxELoFXeAr9Pr3uEDPg5", "question": "Why are these people laying here?", "choices": ["hungry", "hiding", "tired", "good weather"], "correct_choice_idx": 3, "direct_answers": ["napping", "sunbathing", "protest", "good weather", "enjoying day", "relaxing", "relaxing", "sunbathing", "relaxing", "relaxing"], "difficult_direct_answer": false, "rationales": ["The weather is good.", "The people are enjoying the nice sunny day.", "It is a pleasant looking day outside so it's logical to assume that the people are laying outside in the grass like this because they're enjoying the good weather."], "image": "train2014/COCO_train2014_000000201519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295505, "question_id": "TsCLbVCRvBGPCcVkP2Sn3p", "question": "What superhero theme is on the bike?", "choices": ["superman", "wonder woman", "hulk", "captain america"], "correct_choice_idx": 1, "direct_answers": ["wonder woman", "wonder woman", "wonder woman", "captain america", "wonder woman", "captain america", "captain america", "captain america", "wonder woman", "captain america"], "difficult_direct_answer": false, "rationales": ["It is a girl dressed in an american super hero themed outfit.", "She has on a costume similar to this character", "Wonder woman uses this same star and color scheme."], "image": "train2014/COCO_train2014_000000295505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431173, "question_id": "Tse5DFhqTLwb9vhy4ZcRfK", "question": "How was the product being eaten here advertised or labeled?", "choices": ["foot long", "ball park", "mini frank", "brat"], "correct_choice_idx": 0, "direct_answers": ["long", "extra large", "unknown", "footlong hotdog", "foot long", "footlong", "tasty", "hot dog", "as delicious", "hotdog"], "difficult_direct_answer": true, "rationales": ["The hot dog is long.", "It is much longer than a regular hot dog", "This product is commonly referred to as a foot long hot dog."], "image": "val2014/COCO_val2014_000000431173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102665, "question_id": "TsvvHEAMuxDA4BJ7i4Zwbc", "question": "Which one is the African artiodactyl mammal?", "choices": ["elephant", "lion", "giraffe", "tiger"], "correct_choice_idx": 2, "direct_answers": ["right most", "giraffe", "giraffe", "giraffe", "right one", "giraffe", "giraffe", "giraffe", "giraffes", "giraffe"], "difficult_direct_answer": false, "rationales": ["These would be giraffes in the picture.", "Giraffes are found in africa and in the picture.", "The animals shown here are giraffes."], "image": "val2014/COCO_val2014_000000102665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503935, "question_id": "TswbVrNU7JnyFMgm5ecth8", "question": "What type of restaurant is this?", "choices": ["bakery", "indian", "mexican", "chinese"], "correct_choice_idx": 0, "direct_answers": ["donut", "asian", "bakery", "dunkin donuts", "bakery", "donut shop", "donut", "donut shop", "donut shop", "bakery"], "difficult_direct_answer": false, "rationales": ["This is a bakery with donuts.", "The language written on the card is chinese.", "The restaurant is a bakery that sells donuts and other sweet desserts."], "image": "train2014/COCO_train2014_000000503935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32060, "question_id": "TsyKSkCintGQgYBGQGpkZ2", "question": "What are the vehicles parked near the bar used for?", "choices": ["racing", "delivery", "farming", "public transport"], "correct_choice_idx": 2, "direct_answers": ["farm", "farming", "farming", "heavy loads", "farming", "farming", "tractor", "farming", "tractors", "farming"], "difficult_direct_answer": false, "rationales": ["The tractor is used to plow fields.", "Tractors are used in farms to make work easier.", "Those are tractors and they are used for farming."], "image": "train2014/COCO_train2014_000000032060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115911, "question_id": "Tt6zDvSadD7rCNNavWGAvy", "question": "What base will the batter run to next?", "choices": ["home", "first", "third", "second"], "correct_choice_idx": 1, "direct_answers": ["first", "first", "first", "first base", "first", "first", "first", "first", "first base", "first base"], "difficult_direct_answer": false, "rationales": ["The batter is about to hit the ball.", "Once the batter hits the ball he will run to first base.", "The batter is at home based on the presence of the catcher and the shape of the base. the rules of baseball are commonly known and the batter would thus intend to go to answer a next in accordance with the rules."], "image": "train2014/COCO_train2014_000000115911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444344, "question_id": "Tt7pJLWaFGoHYDQmvpgDTJ", "question": "What setting is the outfit of the boy sitting in the middle usually found?", "choices": ["pool", "car race", "horse race", "office"], "correct_choice_idx": 3, "direct_answers": ["church", "office", "official", "school", "restaurant", "house", "party", "school", "restaurant", "school"], "difficult_direct_answer": false, "rationales": ["He is wearing a tie.", "The boy is in formal clothing with a tie which is usually worn in business.", "The boy in the middle is wearing a dress shirt and a tie. this is business attire."], "image": "train2014/COCO_train2014_000000444344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183321, "question_id": "TtM3Hhr8q2HHRCzpQNtd9t", "question": "What are people doing in the water?", "choices": ["swimming", "boating", "fishing", "paragliding"], "correct_choice_idx": 3, "direct_answers": ["kite surfing", "surfing", "jumping", "skiing", "jumping", "hang gliding", "diving", "paragliding", "kite sailing", "kite surfing"], "difficult_direct_answer": false, "rationales": ["The people are paragliding.", "The people are holding onto a rope to a paraglider.", "The people on the water are paragliding with sails and surfboards."], "image": "train2014/COCO_train2014_000000183321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103554, "question_id": "Ttj4udiMzBwQa92BcL6GqH", "question": "The arts in the kites are introduced by whom?", "choices": ["romans", "chinese", "italians", "japanese"], "correct_choice_idx": 1, "direct_answers": ["kite flyers", "japanese", "chinese", "moi", "chinese", "japan", "mexican wrestlers", "darth moll", "chinese", "china"], "difficult_direct_answer": false, "rationales": ["The kites are chinese inventions.", "The kites have a chinese logo.", "The chines have the behavior of discovering new things."], "image": "train2014/COCO_train2014_000000103554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354165, "question_id": "TtvmSqHu2ahBYEu8Kbmjxf", "question": "What is the dominant food group within the dishes?", "choices": ["veggies", "meat", "fruit", "pastries"], "correct_choice_idx": 2, "direct_answers": ["fruit", "fruit", "fruit", "fruits", "fruits", "fruits", "fruits", "fruit", "fruit", "fruit"], "difficult_direct_answer": false, "rationales": ["Several dishes filled with various fruits are arranged together.", "Most of this is stuff like bananas, strawberries and grapes.", "The bowls are made of fruit."], "image": "val2014/COCO_val2014_000000354165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332512, "question_id": "Ttxug6bdPjGsSAGK47Ywv6", "question": "What is the size of the bed called?", "choices": ["queen", "twin", "full", "king"], "correct_choice_idx": 1, "direct_answers": ["twin", "twin bed", "twin", "twin", "twin bed", "twin", "twin", "single", "twin", "twin"], "difficult_direct_answer": false, "rationales": ["The bed is very small.", "The bed is the smallest standard size which comfortable sleeps only one person.", "It is only big enough for one person"], "image": "val2014/COCO_val2014_000000332512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338134, "question_id": "Tu4zokAHs4dAQ2PQY9AXT6", "question": "What is this flowerpot made of?", "choices": ["metal", "plant fiber", "terracotta", "plastic"], "correct_choice_idx": 2, "direct_answers": ["terra cotta", "clay", "clay", "stones", "terra cotta", "clay", "terracotta", "clay", "clay", "terracotta"], "difficult_direct_answer": false, "rationales": ["It is made from red clay", "The pot is made of terracotta since that's good for drainage.", "The flower pot is made of cooked clay known as terracotta that turns orange."], "image": "train2014/COCO_train2014_000000338134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511436, "question_id": "TuAccPzerdYWTi7DqpbtFx", "question": "What is the letter made from on the cake?", "choices": ["sprinkles", "cocoa powder", "glitter", "chocolate chips"], "correct_choice_idx": 0, "direct_answers": ["sprinkles", "sprinkles", "letter e", "letter e", "sprinkles", "using paper", "sprinkles", "sprinkles", "sprinkles", "sprinkles"], "difficult_direct_answer": false, "rationales": ["The things on the cake are small and colorful.", "The cake has sprinkles on top.", "The specks on the cake are colorful bits of sugar."], "image": "train2014/COCO_train2014_000000511436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84131, "question_id": "TuGGVYpAaV2TEDUUzZ5puX", "question": "What would prevent this area from being good farmland?", "choices": ["weather", "elevation", "animals", "rocky"], "correct_choice_idx": 3, "direct_answers": ["rocky", "rocky", "hills", "rocks", "rocks", "rocks", "rocky", "rocks", "rocks", "rock"], "difficult_direct_answer": false, "rationales": ["This area has too many rocks.", "There are too many rocks so there cannot be any planting here until they are removed.", "The rocks would prevent vegetation from growing freely as it needs soil."], "image": "train2014/COCO_train2014_000000084131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455847, "question_id": "TuVLyo3kBGDKYJnUwzNRas", "question": "A balaclava is also known as what?", "choices": ["helmet", "ski mask", "ski muffler", "none"], "correct_choice_idx": 1, "direct_answers": ["helmet", "ski mask", "ski mask", "helmet", "face mask", "dessert", "ski mask", "ski mask", "ski mask", "ski mask"], "difficult_direct_answer": false, "rationales": ["The ski mask that is worn on the face is known as a balaclava.", "The balaclava keeps the face warm while doing cold weather activities like skiing.", "A child with goggles on is standing in skis in front of an adult on skis."], "image": "val2014/COCO_val2014_000000455847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421314, "question_id": "Tum4auVnUtqpkH64mHiWU4", "question": "What is the flat vehicle for in front of the plane?", "choices": ["drink mixing", "moving plane", "taxi cab", "barge driving"], "correct_choice_idx": 1, "direct_answers": ["transporting luggage", "towing plane", "cargo car", "towing plane", "navigation", "assist plane", "guidance", "moving plane", "loading zone", "airplane tow"], "difficult_direct_answer": true, "rationales": ["The flat vehicle is used to manuever the aircraft.", "The flat vehicle helps orient the plane.", "The vehicle appears to be at a height that it could slide underneath the plane and support it. based on the design and setting, answer a is likely."], "image": "train2014/COCO_train2014_000000421314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246879, "question_id": "Tv5Ps8FtqwcUH3fG8m7Xc3", "question": "What is the person standing on?", "choices": ["grass", "surf board", "cement", "sand"], "correct_choice_idx": 3, "direct_answers": ["surfboard", "surfboard", "surf board", "surfboard", "surfboard", "surfboard", "surfboard", "surf board", "sand", "surfboard"], "difficult_direct_answer": false, "rationales": ["The person is standing on the sand under the waves.", "The person is on a stand.", "The bottom of the ocean is covered in this."], "image": "train2014/COCO_train2014_000000246879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405864, "question_id": "TvRrjMc3ZbHeQgB95ns7qT", "question": "Where is the person located?", "choices": ["office", "home", "school", "hospital"], "correct_choice_idx": 1, "direct_answers": ["chair", "desk", "left", "home", "table", "table", "inside house", "chair", "left", "desk"], "difficult_direct_answer": false, "rationales": ["The person is just in socks and is at a desk.", "The person is at home.", "Cats aren't allowed in offices, hospitals nor schools."], "image": "train2014/COCO_train2014_000000405864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151959, "question_id": "TvTt2CVgAik7wkmoouKRUz", "question": "What facial expression is the green ball showing?", "choices": ["fear", "anger", "happiness", "confusion"], "correct_choice_idx": 1, "direct_answers": ["mad", "angry", "anger", "angry", "frown", "frown", "anger", "anger", "angry", "angry"], "difficult_direct_answer": false, "rationales": ["The mouth lines are going down at the corners and the eyebrows are also down", "The eyebrows are slanted downwards in a v and the mouth is straight instead of curved.", "It is frowning and its eyebrows are furrowed."], "image": "train2014/COCO_train2014_000000151959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202653, "question_id": "TvW4bLkumRMzT4g3Z2FKXW", "question": "Why are the people on the bench sitting so close?", "choices": ["for warmth", "cuddling", "tired", "no room"], "correct_choice_idx": 1, "direct_answers": ["affection", "lovers", "love", "couple", "cuddling", "talking", "in love", "they're snuggling", "in love", "married"], "difficult_direct_answer": true, "rationales": ["These people seem to be a couple and together and cuddling.", "Two people are huddled close on a bench. a man has his arms around a girl.", "They are close and cuddling to be intimate."], "image": "val2014/COCO_val2014_000000202653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360274, "question_id": "TviDHBkP35WjanostDVgCQ", "question": "What will the person with the bat do next?", "choices": ["swing", "nothing", "run", "quit"], "correct_choice_idx": 0, "direct_answers": ["hit", "hit ball", "hit ball", "run", "hit ball", "run", "swing", "run", "hit", "run"], "difficult_direct_answer": false, "rationales": ["The person with the bat will next swing at their ball.", "The person wants to hit the ball.", "Given the sport he is playing and the fact that he is a batter."], "image": "val2014/COCO_val2014_000000360274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303735, "question_id": "Tw8crfJ9ExbR7SEghcK4b8", "question": "What might these ladies eat?", "choices": ["banana", "doughnuts", "apples", "hot dogs"], "correct_choice_idx": 0, "direct_answers": ["banana", "banana", "banana", "bananas", "bananas", "bananas", "bananas", "bananas", "banana", "bananas"], "difficult_direct_answer": false, "rationales": ["The women are holding bananas.", "They're all holding bananas in their hands.", "The ladies are holding peeled bananas that they might eat."], "image": "train2014/COCO_train2014_000000303735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460149, "question_id": "Tw9ujYiMsSZ9nQRQg3FyCt", "question": "What is the red zone on the road for?", "choices": ["animals", "cars", "pedestrians", "bicyclists"], "correct_choice_idx": 2, "direct_answers": ["pedestrian crossing", "crossing", "crosswalk", "no parking", "crossing", "caution", "crossing", "crosswalk", "pedestrian crossing", "pedestrians"], "difficult_direct_answer": false, "rationales": ["There is a person crossing sign by the red area on street indicating people could cross the street.", "Its to help pedestrians cross the road.", "The red zones are used for pedestrians to walk."], "image": "val2014/COCO_val2014_000000460149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100439, "question_id": "Tx7gfZFMuqX3F47QDVm8FY", "question": "What is the color of the shirt of the person who can bare a child?", "choices": ["white", "green", "blue", "grey"], "correct_choice_idx": 3, "direct_answers": ["white", "gray", "grey", "grey", "grey", "gray", "grey", "grey", "grey", "tan"], "difficult_direct_answer": false, "rationales": ["There is only one person in the picture who is a woman and can bare a child. she is sitting down on the couch and wearing a grey shirt.", "The woman can have a child. her shirt is not white, blue, or green.", "It is slightly lighter than the black couch"], "image": "train2014/COCO_train2014_000000100439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74549, "question_id": "TxaMtGxXeQEGaMwdrXZrkR", "question": "What type of style of beef called in this sandwich?", "choices": ["sloppy joe", "roast beef", "corned beef", "hamburger"], "correct_choice_idx": 2, "direct_answers": ["corned beef", "corned beef", "corned beef", "corned beef", "corned", "corned", "corned", "corned beef", "corned", "corned"], "difficult_direct_answer": false, "rationales": ["Corned beef is dried beef.", "The beef is corned.", "The beef in the sandwich is compressed together and has a reddish color."], "image": "train2014/COCO_train2014_000000074549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1700, "question_id": "TxdSRmB29S9NzwXBvEih33", "question": "The item the person is standing on was from what century?", "choices": ["18th", "20th", "12th", "17th"], "correct_choice_idx": 1, "direct_answers": ["20th", "twentieth", "twentieth", "twentieth", "20th", "20th", "21st", "20th", "20th", "20th"], "difficult_direct_answer": false, "rationales": ["This was made more recently in time.", "Based on the way the man is dressed he is from this current century.", "The man is current."], "image": "val2014/COCO_val2014_000000001700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386165, "question_id": "TxgihEap5fkXMmktSVfQLY", "question": "What is the purpose of the power cube on the back of the wheelchair?", "choices": ["storage", "for aesthetics", "for weight", "move it"], "correct_choice_idx": 3, "direct_answers": ["charging", "motorize it", "move it", "propels forward", "battery power", "power", "battery", "controls", "energy", "for electricity"], "difficult_direct_answer": true, "rationales": ["It is a battery to run the motor", "Scooters require a mobile power source.", "The purpose is to move."], "image": "val2014/COCO_val2014_000000386165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56040, "question_id": "TxwjhTyDRdmdPwqFPseiu3", "question": "Why would the women be brushing her teeth outside?", "choices": ["fun", "camping", "homeless", "nice weather"], "correct_choice_idx": 1, "direct_answers": ["camping", "camping", "camping", "brushing", "hygiene", "no bathroom", "camping", "camping", "camping", "clean teeth"], "difficult_direct_answer": false, "rationales": ["She is near wooden tables that are used in remote settings", "The woman is camping.", "A woman is sitting at a picnic table brushing her teeth."], "image": "train2014/COCO_train2014_000000056040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95033, "question_id": "Ty8udsyNUzbT4AigKGRMhe", "question": "Which animal is in danger from the other here?", "choices": ["bird", "reflection", "neither", "zebra"], "correct_choice_idx": 2, "direct_answers": ["bird", "bird", "bird", "bird", "zebra", "duck", "bird", "neither", "bird", "bird"], "difficult_direct_answer": false, "rationales": ["The two breeds of animals are docile, friendly animals.", "One is a baby and the other is probably its mother", "Birds and zebras will typically not harm each other. these zebras are larger but not paying or paying attention to the bird, they are each multiple paces away from stepping distance of the bird, and a reflection is not an animal."], "image": "train2014/COCO_train2014_000000095033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388955, "question_id": "TyXXLM7ZqiFX55ffSWLbHk", "question": "What is the surfer doing to the wave?", "choices": ["splitting", "carving", "cutting", "slicing"], "correct_choice_idx": 1, "direct_answers": ["riding it", "surfing it", "carving", "riding wave", "cutting", "riding", "riding", "surfing it", "riding it", "surfing"], "difficult_direct_answer": false, "rationales": ["The surfer is carving.", "The surfer wants to carve the inside.", "The surfer wants to carve the wave."], "image": "train2014/COCO_train2014_000000388955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372489, "question_id": "TyZmfopX9d9fvUny2WHFLe", "question": "What city is associated with the 071 code?", "choices": ["london", "leeds", "edinburgh", "newcastle"], "correct_choice_idx": 0, "direct_answers": ["saigo", "saigo", "london", "saigo", "london", "london", "saigo", "usa", "london", "saigo"], "difficult_direct_answer": false, "rationales": ["The answer is internet searchable and not directly to anything in the image.", "London is associated with the 071 code.", "This would be a store in london with that code."], "image": "train2014/COCO_train2014_000000372489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294838, "question_id": "TzUsxdouHuNFVebmkdTq8t", "question": "What is the silver framed object inside the cabinet used for?", "choices": ["showering", "watching television", "washing hands", "cooking"], "correct_choice_idx": 1, "direct_answers": ["entertainment", "watching tv", "watching tv", "entertainment", "watching television", "watching television", "decoration", "watching shows", "tv", "holding television"], "difficult_direct_answer": false, "rationales": ["The silver object is used to frame the tv.", "The silver object in the cabinet is a large tv for watching television shows.", "The television set has a silver colored frame on it."], "image": "train2014/COCO_train2014_000000294838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357435, "question_id": "Tzrapyi3nojM3Rvq8ieD84", "question": "What is the name of the championship of this sport called in America?", "choices": ["world cup", "uefa cup", "world series", "stanley cup"], "correct_choice_idx": 2, "direct_answers": ["world series", "world series", "world series", "world series", "playoffs", "world series", "world series", "world series", "playoffs", "world series"], "difficult_direct_answer": false, "rationales": ["The world series is for baseball.", "Even people who don't follow baseball closely knows about the event known as the \"fall classic.\".", "In major league baseball everyone wants to win the championship and hose the crown that looks like flags on top."], "image": "train2014/COCO_train2014_000000357435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158861, "question_id": "U26tohqK7LsZE8fVGMDrvC", "question": "What color is the thing you may put garbage inside of here?", "choices": ["teal", "pink", "yellow", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "black", "green", "green", "black", "black", "black", "green", "black", "red"], "difficult_direct_answer": false, "rationales": ["This is an outdoor park where the benches and trash receptacles are green metal.", "There is a green can on the lawn.", "Trash cans are typically green."], "image": "train2014/COCO_train2014_000000158861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227969, "question_id": "U2FgCYagNujGzoQdGYYxYj", "question": "What country did this motorcycle originate from?", "choices": ["japan", "united states", "mexico", "england"], "correct_choice_idx": 0, "direct_answers": ["japan", "japan", "japan", "japan", "japan", "japan", "japan", "japan", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["The color shows that the motorbike is from japane.", "The country is japan.", "Motorcycles came from japan."], "image": "train2014/COCO_train2014_000000227969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238709, "question_id": "U2gqU3br5cGamg3ej5Zp5z", "question": "What will the next thing the pitcher does?", "choices": ["bat", "eat lunch", "pitch ball", "take break"], "correct_choice_idx": 2, "direct_answers": ["throw ball", "throws", "pitch ball", "hit ball", "throw ball", "release ball", "toss ball", "throw ball", "throw", "throw"], "difficult_direct_answer": false, "rationales": ["The pitcher is going to throw the ball.", "The pitcher will throw the ball toward the catcher and try to strike out the batter.", "The batter and catcher are seen in ready position, which means a baseball is likely aboit to be thrown their way from the pitcher."], "image": "train2014/COCO_train2014_000000238709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466390, "question_id": "U2x2tdAyHGBvr4UfLMWAsn", "question": "What transportation device is visible?", "choices": ["bicycle", "tank", "airplane", "pogo stick"], "correct_choice_idx": 0, "direct_answers": ["bicycle", "bicycle", "bicycle", "bicycle", "bike", "bicycle boat", "bicycle", "bike boat", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["The bike is definitely the transportation device.", "The vehicle has two wheels. it does not have wings or armor.", "It has 2 wheels and pedals"], "image": "train2014/COCO_train2014_000000466390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27149, "question_id": "U2y3fqjBovzfhLvJ3j8npd", "question": "What are they doing?", "choices": ["using remotes", "talking phones", "fighting", "playing game"], "correct_choice_idx": 3, "direct_answers": ["playing wii", "play videogames", "playing game", "playing game", "video games", "playing games", "playing", "playing games", "playing wii", "gaming"], "difficult_direct_answer": false, "rationales": ["The people are holding video game controllers and playing a game on the tv.", "There is a game being played on the tv screen based on the graphics visible and they are visibly holding gaming controllers which regarding and motioning towards the tv.", "They have controllers in their hands"], "image": "train2014/COCO_train2014_000000027149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425484, "question_id": "U2ybLiXusRCrsyKBxihveo", "question": "What are the colored items here used for?", "choices": ["advertising", "trash", "racing", "sleeping"], "correct_choice_idx": 3, "direct_answers": ["covering", "camping", "tents", "shade", "sleeping", "keep dry", "shelter", "camping", "tents", "shade"], "difficult_direct_answer": false, "rationales": ["Tents are used for protecting when sleeping outdoors.", "These are tents for shelter", "The items are used in settings in which enclosed space with bedding is necessary to protect from the elements."], "image": "val2014/COCO_val2014_000000425484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280270, "question_id": "U3AYfmLvkaxhQY6t9d6MrW", "question": "What sort of meat is on the plate shown?", "choices": ["venison", "turkey", "none", "beef"], "correct_choice_idx": 2, "direct_answers": ["vegetarian", "vegan", "none", "tofu", "vegan", "fish", "vegan", "cooked", "gyro", "seafood"], "difficult_direct_answer": false, "rationales": ["This is a vegetarian dish featuring soybeans and veggies.", "The plate is showing only vegetarian food.", "There is no meat on the plate."], "image": "train2014/COCO_train2014_000000280270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564301, "question_id": "U3JWDVR7d74gxxxsZg5ins", "question": "By what means is this animal's salable product garnered?", "choices": ["shearing", "beating", "squeezing", "plucking"], "correct_choice_idx": 0, "direct_answers": ["shearing", "shaved", "shearing wool", "bids", "fur", "sheering", "shearing", "sheering", "shearing wool", "sheared"], "difficult_direct_answer": false, "rationales": ["You use razor-like scissors to cut off the wool to make clothing with it.", "A sheep's wool is cut off of them in order to be sold.", "The farmer will use shears to cut off the wool."], "image": "val2014/COCO_val2014_000000564301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1626, "question_id": "U3RAuKyNkrLSrNUiS5H7ag", "question": "What are these people playing?", "choices": ["soccer", "video games", "pool", "bingo"], "correct_choice_idx": 1, "direct_answers": ["wii", "videogames", "wii", "nintendo", "wii boxing", "videogame", "video games", "wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["These people are playing video games on the console.", "They are holding wii remotes.", "They are playing video games."], "image": "val2014/COCO_val2014_000000001626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156572, "question_id": "U3VzYS9CPU2UgHuJtbkTpb", "question": "What are the boys doing in the room?", "choices": ["gaming", "praying", "wrestling", "painting"], "correct_choice_idx": 0, "direct_answers": ["gaming", "playing wii", "playing game", "video games", "playing game", "playing nintendo", "playing games", "playing", "video games", "playing"], "difficult_direct_answer": false, "rationales": ["The boys are using nintendo wii remotes.", "The boys are holding game controllers.", "Two kids are holding game controllers."], "image": "val2014/COCO_val2014_000000156572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47435, "question_id": "U3gXRAZr3Ltgd9FThHp6X3", "question": "Where might the items in the bag be cooked today?", "choices": ["grill", "no where", "bucket", "microwave"], "correct_choice_idx": 0, "direct_answers": ["stove", "grill", "grill", "grill", "grill", "oven", "grill", "grill", "oven", "pot"], "difficult_direct_answer": false, "rationales": ["They are outside and you would not cook with a microwave outside and a bucket cannot be used to cook.", "There are many large tubes of meat in the bag. it can be cooked on an outdoor pit to eat for later.", "The picture is taken outside, which is where a grill is often placed."], "image": "val2014/COCO_val2014_000000047435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388933, "question_id": "U3zhurzcFMPo6AJJyCPCqp", "question": "What is the white chord around the woman's neck?", "choices": ["string", "necklace", "twine", "headphone wire"], "correct_choice_idx": 3, "direct_answers": ["headphone wire", "earbuds", "headphones", "earphones", "headphones", "earphones", "earphones", "ear plugs", "earphones", "headphones"], "difficult_direct_answer": false, "rationales": ["The woman is using earbuds.", "The cord appears to be connected to the ears and is of a color common to headphones.", "The cord is the wire."], "image": "train2014/COCO_train2014_000000388933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324909, "question_id": "U4YqYcFpDKeQeBSmugRjuf", "question": "What is an average deck sizes on PISO skateboards for adults?", "choices": ["7.95inch", "6.75inch", "8.5inch", "7.75inch"], "correct_choice_idx": 3, "direct_answers": ["18x10", "8' skateboard", "dont no", "31", "eight inches", "eight", "unknown", "7.75inch", "eight inch", "8 inches"], "difficult_direct_answer": true, "rationales": ["An average is around 8\" for adults.", "The average size is around the given length for most adults.", "The average deck size is 7.75 inches."], "image": "train2014/COCO_train2014_000000324909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241832, "question_id": "U4xqzjGQfpzSJcpitgsKdf", "question": "What type of wine is likely held in glasses here?", "choices": ["white", "rose", "peach", "burgandy"], "correct_choice_idx": 3, "direct_answers": ["red wine", "burgandy", "red wine", "red", "red", "red", "red wine", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The wine is red.", "The wine is visibly dark red in the glass. burgundy is a shade and type of red wine and more likely than the other possible options which would be lighter in color.", "Red wines are normally darker in color."], "image": "val2014/COCO_val2014_000000241832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166586, "question_id": "U5Uu8FX5ZhRo2nL8E6HGro", "question": "What food group do these plantains belong to?", "choices": ["vegetables", "grains", "fruits", "seeds"], "correct_choice_idx": 2, "direct_answers": ["fruits", "fruit", "fruit", "fruits", "fruit", "fruit", "fruit", "fruits", "fruit", "banana"], "difficult_direct_answer": false, "rationales": ["I chose the food group that bananas belong to.", "Plaintains are almost like bananas.", "Plantains are male bananas, therefor fruits."], "image": "val2014/COCO_val2014_000000166586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314734, "question_id": "U5VFn6osMmRcZ7Bb4DZwq7", "question": "What type bird was killed to create this meal?", "choices": ["chicken", "quail", "duck", "pigeon"], "correct_choice_idx": 0, "direct_answers": ["chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["The meat looks to be chicken and it is the most common meat in a wrap.", "That is chicken on there.", "The bird is chicken."], "image": "train2014/COCO_train2014_000000314734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392659, "question_id": "U5ZhHMPLkEeyH54jZ6Vy7S", "question": "Where is the dog seated while riding on the bike?", "choices": ["handlebar", "lap", "basket", "seat"], "correct_choice_idx": 2, "direct_answers": ["basket", "basket", "basket", "basket", "basket", "basket", "basket", "basket", "basket", "basket"], "difficult_direct_answer": false, "rationales": ["The dog is in the basket.", "The seat is unoccupied. the dog is sitting in a pink container that is attached to the handlebar.", "The dog is placed in the bike pink basket."], "image": "val2014/COCO_val2014_000000392659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468340, "question_id": "U5gtgyVcSMd3fTR2FhQcrh", "question": "What does the man farthest to the right have on his neck?", "choices": ["scarf", "hands", "goggles", "bowtie"], "correct_choice_idx": 3, "direct_answers": ["bowtie", "bow tie", "bowtie", "bowtie", "bowtie", "bowtie", "bow tie", "tie", "bowtie", "bow tie"], "difficult_direct_answer": false, "rationales": ["The man is wearing a bow like tie.", "There is a bowtie on his neck.", "It is much shorter than a regular tie"], "image": "train2014/COCO_train2014_000000468340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172690, "question_id": "U5hEre2rJ5GKyPMiV8Gpwz", "question": "What do they want to prevent the ball from touching?", "choices": ["grass", "sand", "people", "net"], "correct_choice_idx": 1, "direct_answers": ["sand", "net", "net", "sand", "ground", "ground", "sand", "sand", "shoes", "roll"], "difficult_direct_answer": false, "rationales": ["The goal is to keep the ball in the air.", "Volleyball is a game that tries to keep the ball in the air.", "They use sand."], "image": "val2014/COCO_val2014_000000172690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204556, "question_id": "U5qhZaNvsDec5ibzWUv9KA", "question": "This machine is meant to assist what type people in payments?", "choices": ["bikers", "motorists", "unicyclists", "prisoners"], "correct_choice_idx": 1, "direct_answers": ["cash", "parking", "parkers", "drivers", "electronic", "parkers", "parking", "parking", "local merchants", "motorists"], "difficult_direct_answer": false, "rationales": ["The machine is used for motor bikes.", "This is a toll which is used for drivers to pay to park there.", "This machine is meant to assist motorists to pay for parking."], "image": "train2014/COCO_train2014_000000204556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505786, "question_id": "U64teD78tyofNVgadkR97Z", "question": "Why is the person pushing the button?", "choices": ["to blend", "to game", "to light", "to text"], "correct_choice_idx": 0, "direct_answers": ["activate appliance", "stop blender", "to blend", "turning on", "blend", "making smoothie", "to blend", "blend juice", "blend", "blending"], "difficult_direct_answer": false, "rationales": ["In order for a blender to work, the pulse button has to be pressed.", "You use a blender to blend things into liquid.", "The device is similar to a food processor. the person is pushing the button to tell the device to start processing the food in the pitcher."], "image": "val2014/COCO_val2014_000000505786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364160, "question_id": "U68jFxNVw8GWgrLn2ck2gK", "question": "What is the number on the fence referring to?", "choices": ["score", "field", "age", "time"], "correct_choice_idx": 1, "direct_answers": ["sports number", "court", "field", "court number", "court number", "court number", "court number", "six", "court number", "six"], "difficult_direct_answer": false, "rationales": ["The fence refers to the field.", "The number shows people where to go to when meeting at a field.", "The number is a field."], "image": "train2014/COCO_train2014_000000364160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217936, "question_id": "U6FQTnzp7XonZNmJtJAAmJ", "question": "What surface are they playing on?", "choices": ["grass", "clay", "indoor hard", "outdoor hard"], "correct_choice_idx": 3, "direct_answers": ["tennis court", "outdoor hard", "tennis court", "clay", "clay", "clay", "concrete", "artificial", "tennis court", "turf"], "difficult_direct_answer": false, "rationales": ["The surface is hard.", "They are playing on an outdoor tennis court.", "The sky is visible, so they are not inside. the surface is blue, not orange or green, so they are not playing on clay or grass."], "image": "val2014/COCO_val2014_000000217936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83654, "question_id": "U6LaaREpJ7uXoy4vJqhtVN", "question": "Why does this man wear sunglasses?", "choices": ["being blind", "fashion", "protect eyes", "dress code"], "correct_choice_idx": 1, "direct_answers": ["look cool", "look cool", "fashion", "bright", "style", "look cool", "be cool", "style", "cool", "be cool"], "difficult_direct_answer": false, "rationales": ["The man wants to be fashionable.", "The man is indoors and doesn't need the glasses for the sun so they are wearing it for fashion.", "Sunglasses are on the man's face in order to make a statement. it makes him look cool."], "image": "train2014/COCO_train2014_000000083654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539124, "question_id": "U6SJ9Xi7cBZp3TEskjdiH7", "question": "What fruit is required to create the beverage being advertised?", "choices": ["apple", "guava", "lemon", "orange"], "correct_choice_idx": 2, "direct_answers": ["lemon", "lemons", "lemons", "lemons", "lemon", "lemons", "lemons", "lemons", "lemon", "lemon"], "difficult_direct_answer": false, "rationales": ["The fruit name is in the word", "The fruit mentioned on the sign is lemonade which is made from lemons.", "Lemonade is being advertised which requires lemons."], "image": "val2014/COCO_val2014_000000539124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341409, "question_id": "U6hLJoLBQvcjgoPPrBAoad", "question": "Equipment for what sport is readily available here?", "choices": ["springboard diving", "surfing", "sailing", "fishing"], "correct_choice_idx": 1, "direct_answers": ["boating", "surfing", "surfing", "boating", "surfing", "surfing", "surfing", "normal", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["There are many surfboards around the beach.", "A row of surfboards are on a beach.", "The surfboards are waiting for people to rent them."], "image": "val2014/COCO_val2014_000000341409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100312, "question_id": "U6hRnokShfW4CLgM6LLdJJ", "question": "What is the woman pushing?", "choices": ["sled", "stroller", "cart", "wagon"], "correct_choice_idx": 1, "direct_answers": ["stroller", "baby carriage", "stroller", "stroller", "stroller", "stroller", "stroller", "stroller", "stroller", "baby stroller"], "difficult_direct_answer": false, "rationales": ["A woman pushes a buggy down the sidewalk that has a canopy covering it.", "The object the woman is pushing is clearly visible and matches the style and dimensions of a stroller as well as there is a child visible inside which is the purpose of a stroller.", "There is a baby sitting secured behind the plastic cover."], "image": "train2014/COCO_train2014_000000100312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136687, "question_id": "U6mWRAvMPhi6Ss6F3QX6Kp", "question": "What major gaming company made the device the person is holding?", "choices": ["playstation", "microsoft", "sony", "nintendo"], "correct_choice_idx": 3, "direct_answers": ["nintendo", "apple", "nintendo", "wii", "nintendo", "apple", "nintendo", "nintendo", "nintendo", "nintendo"], "difficult_direct_answer": false, "rationales": ["Nintendo is a video gaming company that manufactures the item the person is holding.", "There is a nintendo device in the person's hand.", "A man is outstretched on the couch. he is smiling as he holds a white game wand in one hand and a controller in another."], "image": "train2014/COCO_train2014_000000136687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283445, "question_id": "U6s9HMiibE2r4pagjcy9qw", "question": "What is in the bowl sitting in another bowl?", "choices": ["onion soup", "cole slaw", "au jus", "thick gravy"], "correct_choice_idx": 2, "direct_answers": ["soup", "au jus", "au jus", "jus", "pasta", "french dip", "au jus", "sauce", "soup", "dipping sauce"], "difficult_direct_answer": false, "rationales": ["The dipping au jus is included.", "There is a beef and bread sandwich on the plate. that sandwich is usually dunked in a broth.", "It's to dip the sandwich in."], "image": "train2014/COCO_train2014_000000283445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392961, "question_id": "U7Hx2AbnUKW3GPh8xmU3zq", "question": "What is the brown desk the laptop is on made of?", "choices": ["steel", "plastic", "wood", "glass"], "correct_choice_idx": 2, "direct_answers": ["wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["There is a brown desk which means it's made of wood.", "Desks are often made from wood.", "The desk is made of wood."], "image": "val2014/COCO_val2014_000000392961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247951, "question_id": "U8avbDs4QP7DSuZrLtDAf6", "question": "What object can be seen underneath one of the restroom sinks?", "choices": ["basket", "scale", "rack", "cabinet"], "correct_choice_idx": 1, "direct_answers": ["scale", "scale", "scale", "scale", "scale", "scale", "garbage can", "scale", "different", "scale"], "difficult_direct_answer": false, "rationales": ["The object is a scale.", "There is a weigh scale underneath the sink.", "Traditionally these types of weight measuring devices are found in the bathroom."], "image": "train2014/COCO_train2014_000000247951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250124, "question_id": "U8h3fXVzWfcdGbEGWCftsV", "question": "How many monitors are present this room?", "choices": ["five", "two", "three", "one"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The desk in the corner has two computer monitors on it.", "There is one monitor by the speakers and one by the tower.", "There are 2 screens on the desktop."], "image": "train2014/COCO_train2014_000000250124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165476, "question_id": "U8jbZ9nTPUnqWb4hJeojnm", "question": "What does the white tag here allow the skier to board?", "choices": ["trolley car", "luggage rack", "plane", "ski lift"], "correct_choice_idx": 3, "direct_answers": ["ski lifts", "race number", "ski lift", "lifts", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["The tag is of a size, and shape and being worn in a manner consistent with answer a in this setting.", "Traditionally you need a pass of some sort to take lifts while skiing.", "The person is about to participate in an activity that involves going down a hill. the tag allows the person to use the device with chairs in the background to reach the top of the hill."], "image": "val2014/COCO_val2014_000000165476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260025, "question_id": "U8rdCSXD5uwMdG39KTbufK", "question": "What is happening here?", "choices": ["making cake", "graduation ceremony", "funeral", "wedding ceremony"], "correct_choice_idx": 3, "direct_answers": ["cake cutting", "cake cutting", "wedding reception", "wedding", "wedding", "wedding cake", "cutting cake", "wedding ceremony", "cutting cake", "wedding reception"], "difficult_direct_answer": false, "rationales": ["The couple are dressed as a bride and groom and are cutting the cake at their reception.", "The bride and groom are cutting into the cake.", "The picture shows a bride and groom cutting a cake on their wedding day."], "image": "val2014/COCO_val2014_000000260025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99856, "question_id": "U92FCDvuNTSQAivQjJKktR", "question": "How fast is this plane flying now?", "choices": ["zero mph", "100 mph", "mach 5", "250 mph"], "correct_choice_idx": 0, "direct_answers": ["zero mph", "motionless", "0km/h", "it's not", "stopped", "zero mph", "zero", "not flying", "zero mph", "zero mph"], "difficult_direct_answer": false, "rationales": ["The plane is not flying, it is on display.", "The plane is a stationary model.", "The plane isn't moving."], "image": "train2014/COCO_train2014_000000099856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257971, "question_id": "U97PXNjCoToYqU4X3HBQYV", "question": "What is the man standing up pointing at?", "choices": ["dog", "door", "window", "t.v"], "correct_choice_idx": 3, "direct_answers": ["t.v", "tv", "television", "television screen", "tv", "tv", "television monitor", "game console", "television", "wii"], "difficult_direct_answer": false, "rationales": ["He is playing a game pointing the remote at it.", "The man holds a wireless game controller that needs to communicate with the game system.", "A man holds a game controller out in front of his body as he looks forward."], "image": "val2014/COCO_val2014_000000257971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215815, "question_id": "U9EYaJLKEsuqGmrgf6aDXB", "question": "Why are the planes so close?", "choices": ["illusion", "bad judgment", "sighting", "showing off"], "correct_choice_idx": 3, "direct_answers": ["air show", "exhibition", "airshow", "stunts", "trick show", "doing grick", "showing off", "photo manipulation", "air show", "show"], "difficult_direct_answer": true, "rationales": ["There is a person visible who appears to have their arms in the air like they are celebrating and these types of planes are usually associated with air shows. i would think these are two show pilots performing a trick.", "The planes are showing off for onlookers.", "The planes are flying so close because they are showing off a trick that they can do."], "image": "val2014/COCO_val2014_000000215815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322848, "question_id": "U9Wx5UbSGiB4A9wtCh7gAe", "question": "What is number 25 tapping?", "choices": ["ball", "dirt", "sod", "home base"], "correct_choice_idx": 3, "direct_answers": ["home base", "home base", "homeplate", "plate", "base", "home plate", "base", "plate", "base", "bat"], "difficult_direct_answer": false, "rationales": ["This is commonly done in baseball by players.", "Number 25 is batting. he is hitting the plate near the batter's box.", "The baseball player is tapping the plate."], "image": "val2014/COCO_val2014_000000322848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51335, "question_id": "U9szUfx3Sdr8rHwpGNbJoD", "question": "What condiment is in the white paper bowl?", "choices": ["mayo", "mustard", "honey", "barbeque sauce"], "correct_choice_idx": 1, "direct_answers": ["mustard", "mustard", "mustard", "mustard", "cookie", "mustard", "potato chips", "cookie", "mustard", "mustard"], "difficult_direct_answer": false, "rationales": ["Mustard is yellow.", "Yellow can be seen in the paper bowl, which goes well with the hot dog the man is holding. mustard is a popular additional to hot dogs.", "The condiment is yellow, not gold, brown, or white. the man is eating a hot dog."], "image": "val2014/COCO_val2014_000000051335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196418, "question_id": "UA6GKz5mPNRS3dRBYPZFFQ", "question": "Why is he leaning?", "choices": ["is scared", "is falling", "maintaining balance", "is searching"], "correct_choice_idx": 2, "direct_answers": ["ride skateboard", "skating", "skateboarding trick", "falling", "maintaining balance", "tricking", "skateboarding", "trick", "skating", "fell off"], "difficult_direct_answer": true, "rationales": ["The man is on the skateboard and it is splitting into two parts. he won't be able to stay on.", "One of the maneuvers to ride a skateboard involves leaning different ways to maintain balance and move along.", "The man doesn't want to fall off his board."], "image": "train2014/COCO_train2014_000000196418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319908, "question_id": "UA6Rif25tjibNZx8wg7ZFM", "question": "How is this area kept cool in warming weather?", "choices": ["furnaces", "hot fans", "air conditioning", "shade"], "correct_choice_idx": 2, "direct_answers": ["blown air", "snow", "refrigeration", "machinery", "indoors", "air conditioning", "snow", "air conditioning", "air conditioner", "from facility"], "difficult_direct_answer": false, "rationales": ["Ac produces cool air.", "Since it's indoors they would have to keep it cool with a/c units.", "A man is in a building with snow inside."], "image": "val2014/COCO_val2014_000000319908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87091, "question_id": "UASSHjdikFZZ4JdVdMpdrV", "question": "This bank is affiliated with what church?", "choices": ["baptist", "mormon", "methodist", "catholic"], "correct_choice_idx": 1, "direct_answers": ["zions", "mormon", "zion", "zion", "zion's", "mormon", "zion", "zionism", "zion", "catholic"], "difficult_direct_answer": false, "rationales": ["I'm really not sure but maybe zions have something to do with mormons.", "The bank and the religion have the same name.", "Zion is a common term within the lads church and utah."], "image": "train2014/COCO_train2014_000000087091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93534, "question_id": "UAToccrhthQ35YX5rSsBGK", "question": "Why are there lit candles on the cake?", "choices": ["light room", "burning house", "showing off", "child's birthday"], "correct_choice_idx": 3, "direct_answers": ["birthday celebration", "birthday party", "child's birthday", "cake", "celebrating birthday", "birthday celebration", "birthday celebration", "birthday celebration", "birthday", "symbolize years"], "difficult_direct_answer": false, "rationales": ["It is a birthday.", "The candles represent the kid's age.", "A woman can be seen carrying a cake with about 10 lit candles on it. in many cultures this is used to celebrate someone's birthday with the candles matching years of age."], "image": "val2014/COCO_val2014_000000093534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439907, "question_id": "UAWfrf3hYuNP5vVrBdfJ9t", "question": "What kind of bathroom is this?", "choices": ["public", "school", "home", "hotel"], "correct_choice_idx": 3, "direct_answers": ["hotel", "hotel", "hotel", "hotel", "hotel", "hotel", "hotel", "modern", "private", "fancy kind"], "difficult_direct_answer": false, "rationales": ["It looks like a bathroom in a place with paid rooms.", "The towel and the mini items are typical of a hotel bathroom.", "The bathroom is a hotel."], "image": "val2014/COCO_val2014_000000439907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3532, "question_id": "UAoGcNwQxWstvqi8b72YVV", "question": "What could have caused the puddles in the mud?", "choices": ["buckets", "rain", "snow", "hoses"], "correct_choice_idx": 1, "direct_answers": ["rain shower", "rain", "rain", "rain", "rain", "water", "enjoy", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The field is outside where the most common cause of surface water would be answer a.", "The people are playing baseball, so it is not winter. the water likely came from nature, not buckets or hoses.", "The water is formed in non-geometric areas on the ground."], "image": "train2014/COCO_train2014_000000003532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142815, "question_id": "UApeNSayCEhnbBpte2Cgd2", "question": "The subject being filmed most here wears what color shirt?", "choices": ["red", "none", "white", "gray purple"], "correct_choice_idx": 3, "direct_answers": ["gray", "grey", "blue", "gray", "grey", "grey", "gray", "gray purple", "grey", "gray"], "difficult_direct_answer": false, "rationales": ["It's the color of concrete with a logo on it", "The subject is lounging on the bed and wearing a gray shirt with purple and black design.", "Unless you are colorblind it is easy to tell what colors he has on."], "image": "val2014/COCO_val2014_000000142815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505579, "question_id": "UBfnTsV7KiBnCiesrw98Pu", "question": "How many wheels do all visible vehicles here have?", "choices": ["none", "one", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "road", "two", "two", "two", "two", "two", "two", "road"], "difficult_direct_answer": false, "rationales": ["The motorcycles and bikes have two wheels.", "All the motorcycles and bikes have two wheels.", "There is a lone cyclist that is waiting behind another motorcycle. there are many other motorcycle randomly around the area."], "image": "train2014/COCO_train2014_000000505579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409438, "question_id": "UBjA4iXsWhTGugPfKbk4uq", "question": "What kind of sink is this?", "choices": ["kitchen", "bathroom", "laundry", "workstation"], "correct_choice_idx": 0, "direct_answers": ["aluminum", "kitchen", "kitchen sink", "metal", "stainless steel", "metal", "wood", "divided undermount", "kitchen", "kitchen sink"], "difficult_direct_answer": false, "rationales": ["The cupboards and counter with other kitchen appliances suggest that this is a kitchen.", "The room shows a coffee machine.", "The room is open and has many cupboards and appliances."], "image": "train2014/COCO_train2014_000000409438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59202, "question_id": "UBjWvoySbnRAzUQ8hEmNfw", "question": "What is the man skateboarding on?", "choices": ["half pipe", "training ramp", "full pipe", "tech deck"], "correct_choice_idx": 0, "direct_answers": ["skateboard", "skateboard park", "plaything", "ramp", "halfpipe", "skateboard deck", "ramp", "half pipe", "half pipe", "skateboard"], "difficult_direct_answer": false, "rationales": ["The skating surface curves sharply upward at least at one end of it, and is roughly the height of an adult person at its highest point.", "It is called this because it looks like the shape of the bottom half of a large one", "The man is in a half pipe."], "image": "val2014/COCO_val2014_000000059202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572978, "question_id": "UBx6qFhqLotki7mDzCXJrz", "question": "What is the stuffed animals on top of the book shelf supposed to be?", "choices": ["pigeon", "tiger", "ox", "snake"], "correct_choice_idx": 3, "direct_answers": ["block draft", "snake", "snake", "snake", "snake", "snake", "snake", "snake", "snake", "cartoon"], "difficult_direct_answer": false, "rationales": ["The stuffed animal is long and thin, with a protruding forked tongue. these are well-known characteristics of snakes.", "It has a forked tongue", "The animal is long and has its tongue out as a snake would."], "image": "train2014/COCO_train2014_000000572978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102461, "question_id": "UCM5KB5ZXREboWTxPtWZ2S", "question": "What do the officers here observe?", "choices": ["party", "horse race", "candled cake", "protest"], "correct_choice_idx": 3, "direct_answers": ["public gathering", "riot", "protest", "protests", "parade", "crowd", "curfew", "line formation", "protest", "event"], "difficult_direct_answer": true, "rationales": ["The officers are wearing their uniforms and are mounted on their horses lined up ready to protect and defend the innocent.", "The police are in riot gear and are likely at a protest.", "As indicated by their helmets, this is the most likely answer."], "image": "val2014/COCO_val2014_000000102461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433311, "question_id": "UCPv7Bsgyw5vVJXpWiMik3", "question": "How many people are waiting to enter the river?", "choices": ["ten", "six", "eight", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are people visibly carrying surfboards by the river who are presumably waiting for their turn to enter. these people are countable.", "Four people are standing to the side of the river.", "There are four that are waiting."], "image": "val2014/COCO_val2014_000000433311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200464, "question_id": "UCWTMrgQpx6JTyvdxXeYYj", "question": "What are they doing?", "choices": ["awaiting bus", "eating lunch", "resting", "seeking food"], "correct_choice_idx": 1, "direct_answers": ["hanging out", "eating lunch", "eating", "organizing frisbee", "playing frisbee", "playing", "frisbee", "talking", "eating", "frisbee"], "difficult_direct_answer": false, "rationales": ["The people are resting from their game.", "They are standing off to the side after playing frisbee.", "They appear to be playing a rigorous game of frisbee, and the bench is there to take rests between games"], "image": "train2014/COCO_train2014_000000200464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288021, "question_id": "UCztcCwrvr8isLhHvJTZr3", "question": "Where is the bus headed to?", "choices": ["harlem", "library", "mall", "trenton"], "correct_choice_idx": 0, "direct_answers": ["harlem", "harlem", "harlem", "harden", "harden", "harlem", "harlem", "harlem", "harlem", "harlem"], "difficult_direct_answer": false, "rationales": ["The bus goes to harlem.", "The bus says harlem on front.", "The bus says harlem on it and that is likely where it's heading."], "image": "train2014/COCO_train2014_000000288021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252629, "question_id": "UDTAKF4CzhYoVb6hpruJuK", "question": "Why are the people celebrating?", "choices": ["birthday", "graduation", "baby shower", "anniversary"], "correct_choice_idx": 2, "direct_answers": ["birthday", "birthday", "birthday", "birthday", "baby shower", "baby shower", "birthday", "bithday", "birthday", "baby"], "difficult_direct_answer": false, "rationales": ["They are celebrating a baby shower.", "The words on the cake state said celebration, and the pink plates suggest those gender reveal type things.", "The cake says baby."], "image": "val2014/COCO_val2014_000000252629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139469, "question_id": "UDTvBPerFhUG4xum96oHLS", "question": "Which letter is sewn in red on the top of the man's hat in the center?", "choices": ["d", "p", "b", "e"], "correct_choice_idx": 1, "direct_answers": ["p", "p", "letter p", "letter p", "letter p", "p letter", "letter p", "pp", "p there", "letter p"], "difficult_direct_answer": false, "rationales": ["It is a letter in the english alphabet", "The letter that's in red is for the phillies.", "The letter is p."], "image": "train2014/COCO_train2014_000000139469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202739, "question_id": "UDcbPJ3evPcLR8uaz64QE8", "question": "What is the woman engaging in?", "choices": ["watching tv", "posing", "drying hair", "meditating"], "correct_choice_idx": 2, "direct_answers": ["hair drying", "hair styling", "grooming", "drying hair", "hair drying", "posing", "blowdrying", "hair drying", "hair care", "hair drying"], "difficult_direct_answer": false, "rationales": ["The woman is posing and primping in a silly way as her raised arm blow drys her wet hair.", "The woman is holding a hair drier over her wet hair.", "The woman is drying her hair."], "image": "train2014/COCO_train2014_000000202739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461445, "question_id": "UDgaqHsJAR6AynxqKpsB7B", "question": "Which animal is closest to the camera?", "choices": ["cow", "horse", "dog", "duck"], "correct_choice_idx": 1, "direct_answers": ["horse", "cow", "horse", "horse", "cow", "horse", "horse", "horse", "horse", "cow"], "difficult_direct_answer": false, "rationales": ["A horse is grazing in a field.", "The closest animal to the bottom of the image is clear and has a size, tail, color and shape consistent with a horse.", "The horse is closest to the camera."], "image": "train2014/COCO_train2014_000000461445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136143, "question_id": "UDji9ZrNvDuezZAMK4XJWf", "question": "What will this man do with the frisbee first?", "choices": ["hide it", "take home", "catch it", "throw it"], "correct_choice_idx": 3, "direct_answers": ["throw it", "throw", "throw", "toss", "throw", "frisbeefirst caching", "throw it", "throw", "throw", "throwfrisbee"], "difficult_direct_answer": false, "rationales": ["The man is trying to throw the frisbee.", "The man's body language shows he is about to put forward motion into the frisbee.", "The boy is playing with a frisbee."], "image": "val2014/COCO_val2014_000000136143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291780, "question_id": "UDqGtywHFzQGeYxYzzPqhY", "question": "Where is the headquarter of this airline company?", "choices": ["italy", "france", "netherlands", "canada"], "correct_choice_idx": 2, "direct_answers": ["netherlands", "klm", "enjoy game", "netherlands", "netherlands", "royal dutch", "dublin", "britain", "dona", "qatar"], "difficult_direct_answer": false, "rationales": ["A logo is on the side of a plane on a runway.", "Klm is based in the netherlands.", "Klm is in the netherlands."], "image": "train2014/COCO_train2014_000000291780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126798, "question_id": "UE7KogddKN2SqWjWcMLvUA", "question": "Which city is the team in gray from?", "choices": ["oakland", "cleveland", "colorado", "cincinnati"], "correct_choice_idx": 2, "direct_answers": ["colorado", "denver", "colorado", "pittsburgh", "colorado", "colorado", "colorado", "colorado", "colorado", "colorado"], "difficult_direct_answer": false, "rationales": ["The gray baseball team is from co based on the name on their uniforms.", "The player's shirt says colorado on it.", "The city is colorado."], "image": "train2014/COCO_train2014_000000126798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496215, "question_id": "UENXo4UH3WdQL4X6Wk6EPr", "question": "Where are the people located?", "choices": ["mall", "store", "restaurant", "home"], "correct_choice_idx": 3, "direct_answers": ["home", "in home", "home", "house", "bedroom", "house", "home", "home", "home", "home"], "difficult_direct_answer": false, "rationales": ["The people are at home with their pets.", "The girl is at home brushing her pet's fur.", "The girl is at home with her bird."], "image": "train2014/COCO_train2014_000000496215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467000, "question_id": "UEQ6kNqmtZh3UHBiTSCVuS", "question": "What does this man wish would stop?", "choices": ["rain", "plane", "daylight", "traffic"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The man is looking out of the window at the rain and holding an umbrella as he looks on.", "The man is likely not liking the rainfall", "The man wants the rain to stop."], "image": "val2014/COCO_val2014_000000467000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376790, "question_id": "UEbHKzg9KNcPuQpWQDZobT", "question": "To get away from the dock's edge most quickly what method would one use?", "choices": ["swimming", "hang gliding", "unicycle", "boat"], "correct_choice_idx": 3, "direct_answers": ["boat", "boat", "boat", "drive", "surfing", "boat", "accelerate speed", "speedboat", "boat", "run"], "difficult_direct_answer": false, "rationales": ["A unicycle would sink in the water, and the area is too flat for hang gliding. swimming would be the slower way of leaving.", "A dock extends into the water with various marine vehicles parked there.", "You could go away from it on any kind of boat."], "image": "train2014/COCO_train2014_000000376790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232011, "question_id": "UEgAVj7V5sfnJsxmJrHa27", "question": "What does the item the person on the far right is holding protect against?", "choices": ["rain", "vampires", "bears", "mosquitos"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The plastic sheet cover keeps out rain.", "The other options are obviously not happening in this image and an umbrella would offer zero defense unless the handle had a stake pointed end, pest spray inside of it or bear mace.", "The man has an umbrella."], "image": "train2014/COCO_train2014_000000232011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569960, "question_id": "UEjB3jfWr5yPv2xCH6VX4o", "question": "The red meat that is shown here comes from what animal?", "choices": ["lemming", "crane", "pig", "badger"], "correct_choice_idx": 2, "direct_answers": ["pig", "pig", "pigs cows", "pigs cows", "pig", "pig", "pig", "pig", "pig", "pig"], "difficult_direct_answer": false, "rationales": ["The item is question is a pizza and a red circular topping is traditionally pepperoni. pepperoni is a product that comes from pigs.", "A large party style cut pizza contains pepperoni.", "Pepperoni is a pork product."], "image": "val2014/COCO_val2014_000000569960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211521, "question_id": "UEzxfkcUHFfzKsHi5icyVj", "question": "What can be done with the appliance in this room?", "choices": ["cooling", "cooking", "washing", "viewing"], "correct_choice_idx": 3, "direct_answers": ["watch tv", "watch tov", "watch tv", "watch tv", "perfection", "viewing", "turn off", "watching tv", "watching shows", "television"], "difficult_direct_answer": false, "rationales": ["It is a television", "The appliance in the room is a television. televisions are watched.", "Inside a hotel room with a large flat screen tv. turning it on turns on a video where people can watch things."], "image": "train2014/COCO_train2014_000000211521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408621, "question_id": "UF3oi2RDQa8pbQtPqaqE4F", "question": "What item will you find inside this facility with more duplicates?", "choices": ["trophies", "microphones", "paintings", "books"], "correct_choice_idx": 3, "direct_answers": ["books", "books", "books", "books", "books", "books", "books", "books", "books", "books"], "difficult_direct_answer": false, "rationales": ["The facility in question is a library based on the writing above the door. this type of building would have a multitude of answer a and duplicates.", "Libraries house books.", "A library holds items that can be read."], "image": "val2014/COCO_val2014_000000408621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144817, "question_id": "UFX2H6FojqF9akhMRUFVtv", "question": "What instrumental music might this couple listen to later?", "choices": ["piano", "saxophone", "guitar", "fiddle"], "correct_choice_idx": 2, "direct_answers": ["guitar instrument", "guitar", "guitar", "jazz", "guitar instrument", "guitar", "guitar", "guitar", "guitar", "guitar"], "difficult_direct_answer": false, "rationales": ["A man in the background is holding one", "That's what the man is holding in the background.", "The person playing has a guitar in their hand."], "image": "train2014/COCO_train2014_000000144817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91547, "question_id": "UFaz3qyMeUqnQatgtRpF3p", "question": "Why are they here?", "choices": ["like water", "are lost", "are resting", "are stuck"], "correct_choice_idx": 0, "direct_answers": ["thirsty", "to swim", "ducks", "love water", "water", "like water", "drink water", "swim", "to swim", "swim"], "difficult_direct_answer": false, "rationales": ["Ducks love to swim", "Ducks love to be in water.", "Ducks like to swim."], "image": "train2014/COCO_train2014_000000091547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274337, "question_id": "UFeYhX9npj5DthqRasvg5g", "question": "Why does the man on the motorcycle have his arm out?", "choices": ["to itch", "to signal", "to arrest", "to wave"], "correct_choice_idx": 3, "direct_answers": ["turning", "waving", "waving", "waving", "to wave", "glove", "waving", "waving", "waving", "waving"], "difficult_direct_answer": false, "rationales": ["The man is waving.", "Based on the smile on his face and onlookers who are taking photographs, it is clear that this is a happy gathering, so he's probably waving to someone that caught his attention. his motorcycle has lights that can signal.", "The man is waving to onlookers."], "image": "train2014/COCO_train2014_000000274337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165798, "question_id": "UFnrLRH6zRBr4qunToRvMX", "question": "What do people usually feed these animals?", "choices": ["bread", "dogs", "cats", "chili"], "correct_choice_idx": 0, "direct_answers": ["bread", "bread", "bread", "bread", "bread", "bread", "bread", "bread", "bread", "bread"], "difficult_direct_answer": false, "rationales": ["It shows the type of breeds of the bird.", "The people feed bread.", "Seagulls get fed bread."], "image": "train2014/COCO_train2014_000000165798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475674, "question_id": "UG56EoMkqWdvLLAj3y3hkK", "question": "What kind of sport are the people pictured above playing?", "choices": ["ice skating", "broomball", "sledding", "skiing"], "correct_choice_idx": 3, "direct_answers": ["skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["The people are using skis.", "The people are on a mountain with snow. they have skiis on their feet.", "The people are on snow. they are not wearing skates and are not using sleds or brooms."], "image": "train2014/COCO_train2014_000000475674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376132, "question_id": "UG58TXoPZjySjfuKsh3jE3", "question": "What most likely motivates the cat to do what it's doing?", "choices": ["it's hungry", "found bird", "fleeing danger", "enjoys heights"], "correct_choice_idx": 3, "direct_answers": ["enjoys heights", "reading", "reach floor", "curiosity", "curiosity", "animal instinct", "curiosity", "curiosity", "curiosity", "having fun"], "difficult_direct_answer": false, "rationales": ["It can see everything from the top", "He wants to get higher.", "The cat in the photo appears to be climbing a bookshelf to get to the top. the most likely reason since no other animal or person is present, is that it enjoys heights."], "image": "train2014/COCO_train2014_000000376132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333434, "question_id": "UGAZgh6SaivSzG9SFkPYpA", "question": "What language is seen on these signs?", "choices": ["spanish", "braille", "finnish", "asian"], "correct_choice_idx": 3, "direct_answers": ["chinese", "chinese", "japanese", "asian", "chinese", "chinese", "chinese", "chinese", "japanese", "japanese"], "difficult_direct_answer": false, "rationales": ["By the symbols on the signs it is easy to to tell what region they are from.", "Asian languages use characters.", "There are asian characters on the signs."], "image": "val2014/COCO_val2014_000000333434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29793, "question_id": "UGAhPCg9RkFr25cJ7PhZpW", "question": "For what reason are these people sitting in bed?", "choices": ["they're lazy", "wealth", "sleepiness", "they're ill"], "correct_choice_idx": 3, "direct_answers": ["sick", "sick", "sick", "sick", "they're ill", "sick", "illness", "tired", "sick", "feeling sick"], "difficult_direct_answer": false, "rationales": ["This is what old hospitals look like", "The style of the room and the begging, as well as the appearance of the people, suggests they are in a hospital or other healthcare facility. people generally only rest in hospital beds when they're sick.", "The design of the room and placement of the beds is consistent with an old-fashioned hospital or infirmary."], "image": "val2014/COCO_val2014_000000029793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355161, "question_id": "UGCwkoEM3T86sJu3uKJXKw", "question": "The item advertised on the sign is usually made from what?", "choices": ["beef", "walnuts", "pizza", "rice"], "correct_choice_idx": 0, "direct_answers": ["beef", "plastic", "burgers", "beef", "beef", "beef", "beef", "cow", "beef", "beef"], "difficult_direct_answer": false, "rationales": ["Burgers are generally always made from beef.", "The diner is for burgers.", "You can tell by the word \"burger\" as to what they sell."], "image": "train2014/COCO_train2014_000000355161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455005, "question_id": "UGDUfBVJdRtGN39NgzF6cc", "question": "Why is there black netting behind the players?", "choices": ["special event", "to climb", "protect spectators", "decoration"], "correct_choice_idx": 2, "direct_answers": ["protect fans", "for protection", "safety", "protecting fans", "protect spectators", "protect fans", "catch ball", "audience safety", "stop ball", "prevents fouls"], "difficult_direct_answer": true, "rationales": ["The players are participating in baseball based on the uniforms, field and the bat in hand and are at home base. in baseball it is common for a ball to be hit outside the field of play where spectators sit and in this particular location they are closest to the hitter so a net is in place to prevent them being hit by a ball.", "Netting keeps people who are watching the game safe from flying balls.", "The net separates the playing field from the people watching the game in the stands. the netting is necessary to prevent the ball from going into the stands in a dangerous way."], "image": "val2014/COCO_val2014_000000455005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44813, "question_id": "UGgoXakrdGM8YsSXKyazaT", "question": "Which player is controlling the avatar with the blue gloves?", "choices": ["black shirt", "off screen", "dress shirt", "blue jeans"], "correct_choice_idx": 0, "direct_answers": ["photo left", "right", "red hat", "right", "left side", "right one", "black shirt", "left man", "blue shirt", "left man"], "difficult_direct_answer": false, "rationales": ["Black shirt guy is on the left just like the blue character.", "The person looks older and so does the character", "The left player is matching the human on the left. they are both using wii remotes and when they move their arm the character in video game does their movement."], "image": "train2014/COCO_train2014_000000044813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312433, "question_id": "UHSYfSu33NR3qJJ2YjPWHA", "question": "What is the red vegetable inside this sandwich?", "choices": ["radish", "tomato", "chili pepper", "beet"], "correct_choice_idx": 3, "direct_answers": ["cabbage", "cabbage", "onions", "beet", "cabbage", "cabbage", "onions", "cabbage", "cabbage", "beet"], "difficult_direct_answer": false, "rationales": ["Beets have been shredded.", "Some people like it in their sandwiches.", "The red vegetable in the sandwich is a beet which naturally grows a deep red color."], "image": "train2014/COCO_train2014_000000312433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408270, "question_id": "UHWqMdK4sJD72maf7ADQPs", "question": "What is the man reading?", "choices": ["book", "paper", "text message", "tv message"], "correct_choice_idx": 2, "direct_answers": ["newspaper", "phone", "newspaper", "text message", "newspaper", "newspaper", "newspaper", "phone", "text", "paper"], "difficult_direct_answer": false, "rationales": ["He is looking at the screen of his cellphone.", "He is looking down at his phone. communications with words can be seen on a phone.", "The man is looking at his phone screen so he is likely reading a text."], "image": "train2014/COCO_train2014_000000408270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449029, "question_id": "UHXDGfjmShbaBQ2GfcEab8", "question": "What flag can be seen here?", "choices": ["france", "china", "united states", "germany"], "correct_choice_idx": 2, "direct_answers": ["america", "american flag", "united states", "united states", "american", "american", "united states", "american", "american", "american"], "difficult_direct_answer": false, "rationales": ["The flag has stars on the side.", "There is an american flag hanging from a hole on the building on the right.", "The flag of the usa"], "image": "train2014/COCO_train2014_000000449029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351161, "question_id": "UHfRyYEWNsNLVdidoFVcQh", "question": "What is the traffic light preventing?", "choices": ["racing", "flipping", "crossing", "stopping"], "correct_choice_idx": 2, "direct_answers": ["cars passing", "car traffic", "crossing", "walking", "cars moving", "cars moving", "walking", "crossing", "accidents", "cars"], "difficult_direct_answer": false, "rationales": ["There is a traffic light visible with a red light and a red hand symbol. this light color and symbol are used at intersection to direct people not to cross.", "The light is for crossing.", "The traffic light is red."], "image": "train2014/COCO_train2014_000000351161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80517, "question_id": "UHfZjiTxXjrVekDF5MPxXV", "question": "From what organism did this person get the green items?", "choices": ["mammal", "fish", "dolphin", "plant"], "correct_choice_idx": 3, "direct_answers": ["plant", "tree", "banana tree", "banana tree", "tree", "tree", "tree", "banana tree", "banana tree", "tree"], "difficult_direct_answer": false, "rationales": ["Bananas grown on trees.", "These grow on trees", "A banana is a plant."], "image": "val2014/COCO_val2014_000000080517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454959, "question_id": "UHoARrS5syxEiEM4SLgRHy", "question": "Is there a fence in this image?", "choices": ["unsure", "no", "maybe", "yes"], "correct_choice_idx": 3, "direct_answers": ["no", "yes unsure", "yes", "yes", "yes", "yes", "yes", "no", "yes", "no"], "difficult_direct_answer": false, "rationales": ["There is no fence.", "There is a fence in the background.", "There are wooden posts with wires connected in between them that surround the area where the cows are so they can not wander off."], "image": "train2014/COCO_train2014_000000454959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234234, "question_id": "UHxJu9uh45QutjokfAx4nv", "question": "What is the name of the large yellow pole?", "choices": ["foul pole", "first pole", "base pole", "batting pole"], "correct_choice_idx": 0, "direct_answers": ["homerun pole", "foul pole", "homerun pole", "foul pole", "foul pole", "foul pole", "foul pole", "foul pole", "foul pole", "homerun pole"], "difficult_direct_answer": false, "rationales": ["This is a pole that if the ball goes past it then it will be out.", "The name is a foul ball.", "The yellow pole is used to mark the line that shows when a ball has gone out of bounds in baseball."], "image": "train2014/COCO_train2014_000000234234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179045, "question_id": "UJKBPoCpW78KMPfDufTqWt", "question": "What's inside the animal the child plays with?", "choices": ["beef", "stuffing", "money", "animal intestines"], "correct_choice_idx": 1, "direct_answers": ["stuffing", "stuffing", "stuffing", "cotton", "stuffing", "stuffing", "stuffing", "cotton", "cotton", "stuffing"], "difficult_direct_answer": false, "rationales": ["A kid is playing with a plush toy.", "The stuffed animals have stuffing in them. usually it's some form of cotton.", "The child is holding a teddy bead which is a stuffed animal."], "image": "val2014/COCO_val2014_000000179045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20671, "question_id": "UJRmgWVJoHwAkGUqzF42bk", "question": "What is being given to the dog here?", "choices": ["water", "melon", "burgers", "nothing"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The man is talking to the dog and rubbing his head.", "There is a blue portable bowl for liquid in front of the dog and the lady is holding a bottle of liquid.", "The dog is being given water from the water bottle in the person's hand."], "image": "val2014/COCO_val2014_000000020671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560384, "question_id": "UJS2hmnEdzsEDczWaNTtJF", "question": "Who made her racquet?", "choices": ["wilson", "avia", "head", "sportscraft"], "correct_choice_idx": 0, "direct_answers": ["wilson", "wilson", "wilton", "wilson", "wilson", "wilton", "wilson", "wilson", "wilson", "wilson"], "difficult_direct_answer": false, "rationales": ["Wilson is known for its tennis racquets.", "Wilson made the racquet.", "The head of the racket has a w on it which is the logo for a well-known sporting goods company."], "image": "train2014/COCO_train2014_000000560384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433156, "question_id": "UJg5Ks7sG4RHTSAzvAAoTw", "question": "This shop is situated in which country?", "choices": ["france", "netherlands", "britain", "italy"], "correct_choice_idx": 1, "direct_answers": ["netherlands", "amsterdam", "usa", "united states", "netherlands", "netherlands", "united states", "usa", "united states", "us"], "difficult_direct_answer": false, "rationales": ["The shop is in the netherlands.", "The shop is situated in the country of netherlands, as we could probably tell from the heineken t-shirt.", "You can tell by the names on the clothing that this shop is somewhere in europe."], "image": "val2014/COCO_val2014_000000433156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262937, "question_id": "UJy8aBcE6cqVoocMcnHz7W", "question": "What are these vehicles called?", "choices": ["planes", "cars", "tanks", "buses"], "correct_choice_idx": 3, "direct_answers": ["lories", "trucks", "trucks", "jitneys", "buses", "buses", "buses", "buses", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["The vehicles have many seats for holding passengers.", "The vehicles are larger than cars, but not on tracks like tanks. they are on the ground, not in the air.", "The vehicles are buses."], "image": "train2014/COCO_train2014_000000262937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471669, "question_id": "UJzh9afHRhE7EZ8ZaF5iTn", "question": "How long does would the pedestrian have to cross here in seconds?", "choices": ["zero", "five", "229", "22"], "correct_choice_idx": 3, "direct_answers": ["twenty two", "22 seconds", "twenty two", "twenty two", "22", "twenty two", "22", "twenty two", "sixty", "22 seconds"], "difficult_direct_answer": false, "rationales": ["There is a signal to the right of the crosswalk. it indicates the waiting time.", "There are 22 seconds left.", "There are 22 seconds remaining in the walk sign."], "image": "train2014/COCO_train2014_000000471669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322897, "question_id": "UK2Z8QE77srVezbhRSU8hQ", "question": "What is being held up by two of the giraffes?", "choices": ["vase", "pot", "box", "jug"], "correct_choice_idx": 3, "direct_answers": ["feeder", "bottle", "water", "bottle", "water container", "jug", "water jug", "container bottle", "water jug", "jug"], "difficult_direct_answer": false, "rationales": ["The giraffes have a jug.", "It is a large clear plastic container.", "There is an object between two of the giraffes that is clear and of the shape as something that would be called answer a."], "image": "train2014/COCO_train2014_000000322897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516193, "question_id": "UKGmqR2o6jUDdFtnDJZq7U", "question": "What is the most popular pizza topping?", "choices": ["pineapple", "pepperoni", "mushroom", "olive"], "correct_choice_idx": 1, "direct_answers": ["pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "cheese", "cheese", "cheese", "pineapple", "pepperoni"], "difficult_direct_answer": false, "rationales": ["Pepperoni is the most popular topping on pizza.", "The topping is pepperoni.", "A lot of people love pepperoni pizzas."], "image": "train2014/COCO_train2014_000000516193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186791, "question_id": "UKVj4D4WNtkRKhzuRrxnvT", "question": "What is the man with the glove ready to do?", "choices": ["juggle", "catch", "throw", "dunk"], "correct_choice_idx": 1, "direct_answers": ["catch baseball", "catch ball", "catch", "catch ball", "catch", "catch ball", "catch ball", "catch ball", "catch", "catch"], "difficult_direct_answer": false, "rationales": ["The man is going to catch a ball.", "The man is holding his glove out in front of him in a manner consistent with answer a and no other aspect of baseball.", "His left arm is extended straight out from his body, and, since he's wearing a mitt, this player is ready to catch any baseballs coming his way."], "image": "train2014/COCO_train2014_000000186791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265125, "question_id": "UKiEsQSx6KMDUhmFpyt6kL", "question": "What can keep her head dry besides the umbrella?", "choices": ["glasses", "hood", "trunk", "sidewalk"], "correct_choice_idx": 1, "direct_answers": ["jacket cap", "trunk", "hood", "roof", "hood", "hood", "hood", "hood", "hood", "hood"], "difficult_direct_answer": false, "rationales": ["The lady is wearing a hoodie to stay dry.", "A woman is holding an umbrella and wearing a hooded jacket.", "She has a good on the back of her jacket."], "image": "val2014/COCO_val2014_000000265125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503961, "question_id": "UKuWaJg9wzJuXQNAwRfTg9", "question": "Why does the horse have a bright yellow covering?", "choices": ["keep ward", "natural covering", "ceremonial", "keep dry"], "correct_choice_idx": 2, "direct_answers": ["festivities", "royal", "skin protection", "festive", "parade", "ceremonial", "decoration", "unknown", "protection", "decorative"], "difficult_direct_answer": true, "rationales": ["There is a wedding taking place.", "A lot of people are wearing similar clothing to the horse's saddle.", "The horse is part of a celebration."], "image": "train2014/COCO_train2014_000000503961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122394, "question_id": "ULS6WqX3kpVmvXtKUiVEBu", "question": "What kind of event is being advertised?", "choices": ["educational sermon", "bdsm", "flower festival", "music festival"], "correct_choice_idx": 1, "direct_answers": ["dance", "bdsm", "dance", "so dance", "dance", "dance", "dance", "dance", "so dance", "so dance"], "difficult_direct_answer": false, "rationales": ["The poster in the background shows two people wearing fetish type clothing.", "There is leather gear to wear and suggestive poses", "Bdsm involves bondage as advertised."], "image": "train2014/COCO_train2014_000000122394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313983, "question_id": "ULVsXFS5j4Jzn2xVhQwDBc", "question": "A method of horse training is called?", "choices": ["driving", "boarding", "lunging", "carrying"], "correct_choice_idx": 2, "direct_answers": ["reiki", "gentle breaking", "longing", "horsemanship", "natural horsemanship", "unknown", "gentle breaking", "halter training", "lunging", "equestrian"], "difficult_direct_answer": true, "rationales": ["There is no training.", "The method is lunging.", "The method of horse training here is called lunging."], "image": "train2014/COCO_train2014_000000313983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11576, "question_id": "ULfg2Fy5XZHVB48tnLGzwg", "question": "What kind of weather is this?", "choices": ["tsunami", "rainy", "sunny", "clear skies"], "correct_choice_idx": 1, "direct_answers": ["sunny", "rainy", "sunny", "hot weather", "sunny", "rainy", "sunny", "rainy", "rainy", "sunny"], "difficult_direct_answer": false, "rationales": ["It seems to be rainy as they used the umbrella.", "People are carrying umbrellas", "The weather is rainy."], "image": "train2014/COCO_train2014_000000011576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38709, "question_id": "ULiwhG2zERFFLQEjQaPftE", "question": "What is the giraffe smelling?", "choices": ["hair", "balloon", "grain", "cotton candy"], "correct_choice_idx": 1, "direct_answers": ["balloon", "people", "balloon", "balloon", "balloon", "balloon", "balloon", "balloon", "balloon", "people"], "difficult_direct_answer": false, "rationales": ["He is smelling the balloon that someone is holding", "He is looking at and smelling hte balloon.", "The giraffe has the balloon."], "image": "val2014/COCO_val2014_000000038709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463649, "question_id": "ULjeH3A9BXM4yeR849cTFK", "question": "What other sport also requires a certain player to wear a glove similar to this?", "choices": ["basketball", "ice hockey", "karate", "swimming"], "correct_choice_idx": 1, "direct_answers": ["baseball", "softball", "cricket", "softball", "softball", "softball", "ice hockey", "softball", "cricket", "softball"], "difficult_direct_answer": false, "rationales": ["They wear big gloves to protect their hands from the cold.", "Hockey goalies wear gloves for catching pucks.", "In ice hockey the goalie wears a glove similar to this one."], "image": "train2014/COCO_train2014_000000463649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245062, "question_id": "UMKBN3mpfXugjyBCUsmojM", "question": "Why is the man wearing a glove?", "choices": ["germs", "fashion", "warmth", "grip"], "correct_choice_idx": 3, "direct_answers": ["catcher", "to catch", "catch ball", "catch ball", "catch balls", "catcher", "safety", "catch ball", "protection", "grip"], "difficult_direct_answer": false, "rationales": ["The man is wearing a glove to grip the ball.", "This makes a bigger surface area to catch the ball", "To catch a ball."], "image": "train2014/COCO_train2014_000000245062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314849, "question_id": "UMLNZBRkT9FsehcDfECiN3", "question": "What is the design of the staircase called?", "choices": ["tall staircase", "l-shaped staircase", "turning staircase", "curved staircase"], "correct_choice_idx": 1, "direct_answers": ["spiral", "glass wall", "wrap around", "diester", "l-shaped staircase", "floating", "modern", "unknown", "split", "modern"], "difficult_direct_answer": true, "rationales": ["It is called an ishape case.", "This is a l shaped staircase because of how it turns at the bottom.", "The design is an i-shaped staircase."], "image": "train2014/COCO_train2014_000000314849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247338, "question_id": "UMdMZNGauhq8mw7LKRBuk7", "question": "What state of mind is the giraffe most likely in?", "choices": ["angry", "anxious", "upset", "relaxed"], "correct_choice_idx": 3, "direct_answers": ["relaxed", "relaxed", "relaxed", "daydreaming", "peaceful", "relaxed", "unhappy", "tired", "tired", "resting"], "difficult_direct_answer": false, "rationales": ["The giraffe is lying on the ground, resting so it would be in a docile, happy mood.", "The giraffe is resting.", "A giraffe is laying down."], "image": "train2014/COCO_train2014_000000247338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11313, "question_id": "UMgaiMDS9RvKMce4E5XTui", "question": "In what style was this room designed and decorated?", "choices": ["elizabethan", "contemporary", "modern", "art deco"], "correct_choice_idx": 1, "direct_answers": ["bedroom", "modern", "stripes", "contemporary", "modern", "modern", "modern", "modern", "modern", "simple"], "difficult_direct_answer": false, "rationales": ["The bedding and decor are more contemporary.", "The colors are modern and the furniture is minimalistic.", "The style is contemporary."], "image": "val2014/COCO_val2014_000000011313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481214, "question_id": "UMofVzViHBEkUaPTeppY5c", "question": "What holiday is this boy likely celebrating?", "choices": ["easter", "christmas", "thanksgiving", "halloween"], "correct_choice_idx": 3, "direct_answers": ["halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween"], "difficult_direct_answer": false, "rationales": ["He is dressed up as a famous book character", "He's dressed up like harry potter", "He is dressed as the character harry potter."], "image": "val2014/COCO_val2014_000000481214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99828, "question_id": "UMvnJHFwnejJXgXsZt7YbL", "question": "What does the flower look like it is inside of?", "choices": ["candy cane", "bird", "plate", "coconut"], "correct_choice_idx": 3, "direct_answers": ["rock", "coconut", "vase", "coconut", "stone", "rock", "rock", "coconut", "rock", "rock"], "difficult_direct_answer": false, "rationales": ["The flower is in a coconut shell.", "The vase is brown, rounded and natural-looking.", "The shape is similar and the other options don't match."], "image": "val2014/COCO_val2014_000000099828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27975, "question_id": "UMyp8HmJPZRxYYgwtjX4Rj", "question": "Is tattoo allowed in this place?", "choices": ["maybe no", "no", "absolutely no", "yes"], "correct_choice_idx": 3, "direct_answers": ["yes", "maybe", "yes", "yes", "yes", "yes", "no", "no", "yes", "no"], "difficult_direct_answer": false, "rationales": ["The tattoo shop advertises quite visibly.", "There is an ink shop on the corner of the street.", "There are no signs saying it's not allowed."], "image": "val2014/COCO_val2014_000000027975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225133, "question_id": "UMzYDJomL8aCVdjtjHmEPz", "question": "What is near the colorful items?", "choices": ["tree", "tiger", "onion", "beach house"], "correct_choice_idx": 0, "direct_answers": ["trees", "trees", "trees", "tree leaves", "tree", "tree", "umbrella", "leafs", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["The umbrellas are tied to trees.", "These umbrellas are up in the trees.", "The items are in trees."], "image": "val2014/COCO_val2014_000000225133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238557, "question_id": "UN3yXFcUAfPvESW6MSwvQZ", "question": "What can the heart do as it is drawn?", "choices": ["eat", "archery", "swim", "fly"], "correct_choice_idx": 3, "direct_answers": ["fly", "fly", "feel better", "fly", "fly", "feel better", "feel better", "fly", "fly", "fly"], "difficult_direct_answer": false, "rationales": ["Fly as there is wind in the area.", "The heart can fly.", "The heart has wings on the umbrella."], "image": "train2014/COCO_train2014_000000238557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473776, "question_id": "UN8dEUyd6utaPrmWKH6293", "question": "Who is the man in grey behind the batter?", "choices": ["catcher", "pitcher", "goalie", "referee"], "correct_choice_idx": 0, "direct_answers": ["catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["The man is the catcher.", "He catches the ball that the pitcher throws.", "The man is the catcher."], "image": "train2014/COCO_train2014_000000473776.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517737, "question_id": "UN97DaH7Lc56wzdgCKky6S", "question": "Why are the two skiers so close to each other?", "choices": ["they're fighting", "they're friends", "they're siblings", "they're partners"], "correct_choice_idx": 2, "direct_answers": ["family", "they're siblings", "posing", "safety", "training", "posing", "teacher/student", "posing", "training", "they're family"], "difficult_direct_answer": false, "rationales": ["The taller boy is the big brother.", "One is older and one is younger, one is taller and one is smaller.", "They are both youngsters and probably related"], "image": "val2014/COCO_val2014_000000517737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223726, "question_id": "UNDrS3vmNecWE9ECAUAdjs", "question": "What do these men have in common?", "choices": ["chess players", "sports team", "presidency", "corporate ownership"], "correct_choice_idx": 2, "direct_answers": ["presidents", "president position", "presidents", "presidents", "former presidents", "presidents", "presidents", "ex-presidents", "former presidents", "presidency"], "difficult_direct_answer": false, "rationales": ["All of them are former presidents of the united states.", "The men are presidents.", "These men all are presidents."], "image": "train2014/COCO_train2014_000000223726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218701, "question_id": "UNhfkN29RKkGF6aSRyPZ7U", "question": "What is offscreen to the bottom right and likely to be casting a shadow onto the snow?", "choices": ["sign", "tree", "fence", "house"], "correct_choice_idx": 0, "direct_answers": ["flag", "sign", "sign", "flag", "sign", "sign", "slalom flags", "shadow", "sun", "poster"], "difficult_direct_answer": false, "rationales": ["The item casting the shadow is relatively small. it has two posts that hold up a rectangular area.", "You can see the rectangle shape of a sign's shadow in the snow.", "A sign is partly seen."], "image": "train2014/COCO_train2014_000000218701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153931, "question_id": "UNiwviroGYU47SufEqCxUy", "question": "What is he using the rectangular device to do?", "choices": ["record", "communicate", "massage", "keep warm"], "correct_choice_idx": 1, "direct_answers": ["phone calls", "phone calls", "phone", "call", "communicate", "talk", "call someone", "call mom", "phone", "cellphone"], "difficult_direct_answer": false, "rationales": ["He's talking.", "You can talk to people on cell phones that are rectangular.", "The man is making a call."], "image": "val2014/COCO_val2014_000000153931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368845, "question_id": "UNnzKFAgxvcsWg3nouJk2A", "question": "Why is the yellow line painted on the ground?", "choices": ["decoration", "safety", "vandalism", "amusement"], "correct_choice_idx": 1, "direct_answers": ["stand behind", "barrier", "safety", "safety line", "warning", "safety", "safety", "safety", "caution line", "safety barrier"], "difficult_direct_answer": false, "rationales": ["The line is for safety.", "The yellow line indicates where it's safe to stand.", "The yellow line is painted parallel to the edge of the train platform. bright lines are often used to attract attention to something dangerous so this line is to warn of the dangers of the edge of the platform."], "image": "train2014/COCO_train2014_000000368845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239016, "question_id": "UNvhenEffWPG8qYKnBuT2t", "question": "What sound maker can be found above the clock?", "choices": ["bell", "organ", "speaker", "bird"], "correct_choice_idx": 0, "direct_answers": ["bell", "bell", "bell", "bell", "bell", "bell", "bell", "bell", "bell", "bell"], "difficult_direct_answer": false, "rationales": ["The object makes noise by swinging back and forth.", "A large tower with a bell, which commonly makes sound during a certain time of day, can be seen.", "This rings out how many numbers the hour is"], "image": "train2014/COCO_train2014_000000239016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462011, "question_id": "UP5ozdPxcpkZ55pS2tfDBY", "question": "The pictograms on the tie show how to do what?", "choices": ["make it", "iron it", "tie it", "wash it"], "correct_choice_idx": 2, "direct_answers": ["tie tie", "tying tie", "tie tie", "tie tie", "tie tie", "tie", "tie tie", "tie tie", "tie necktie", "tie it"], "difficult_direct_answer": false, "rationales": ["The diagrams show how to tie a simple knot in a tie.", "It is a funny way to show how to use the tie with the two ends and to make a \"knot\"", "The pictures are telling to tie."], "image": "val2014/COCO_val2014_000000462011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7035, "question_id": "UPBrbkGFYN3qoYh7zG7pf5", "question": "How many total legs are here even if only partially visible?", "choices": ["20", "six", "12", "four"], "correct_choice_idx": 2, "direct_answers": ["seven", "ten", "eight", "twelve", "twelve", "ten", "12", "12", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["There are two horses and two people.", "The person has two legs and each horse has four.", "There are 12 legs."], "image": "train2014/COCO_train2014_000000007035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95249, "question_id": "UPDezG9Cs5v8QSHywS6RYC", "question": "Why is the man raising the object to his mouth?", "choices": ["to lick", "to drink", "to kiss", "to eat"], "correct_choice_idx": 3, "direct_answers": ["to eat", "eating soon", "eat donut", "eating", "to eat", "eat it", "to eat", "cruller", "eat it", "pastry"], "difficult_direct_answer": false, "rationales": ["The man is eating his donuts.", "The man is holding a food object and opening his mouth. this is how food would be consumed.", "He is hungry and getting ready to eat."], "image": "val2014/COCO_val2014_000000095249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380717, "question_id": "UPM744SkkNejZeBNvyuUda", "question": "What is the large blanket on the right used for?", "choices": ["wearing", "sleeping", "playing", "drying"], "correct_choice_idx": 1, "direct_answers": ["sleeping purpose", "sleeping", "keeping warm", "sleeping", "warmth", "sleep", "warmth", "sleeping", "sleeping", "sleeping"], "difficult_direct_answer": false, "rationales": ["The piece of furniture on the right is a bed and blankets are used on beds for sleeping.", "The blanket is used for sleeping.", "The blanket is on the bed."], "image": "train2014/COCO_train2014_000000380717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243029, "question_id": "UPM9zA5zjAzw3DkQSE5wMi", "question": "What is the man carrying to the water?", "choices": ["buckets", "blankets", "chairs", "surfboards"], "correct_choice_idx": 3, "direct_answers": ["surfboards", "surfboards", "surfboard", "surfboards", "surfboard", "surfboard", "surfboards", "surfboards", "surfboard", "surfboards"], "difficult_direct_answer": false, "rationales": ["The man is going surfing and is walking to the water.", "The man has surfboards.", "The man is holding long boards to go to the water."], "image": "train2014/COCO_train2014_000000243029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436963, "question_id": "UPQfWgsRRqamzxTkXhE7Zt", "question": "What does the spindle across from the cordless phone hold?", "choices": ["coasters", "donuts", "memory cards", "cds"], "correct_choice_idx": 3, "direct_answers": ["discs", "discs", "cds", "antennae", "cds", "cds", "yarn", "cds", "discs", "discs"], "difficult_direct_answer": false, "rationales": ["The discs are sitting in the container.", "The object is a cd holder and there are cds present on it. a cd holder is a common object to encounter in an office near a computer.", "Traditionally these plastic cases help these items to be sold and or clear from damage."], "image": "val2014/COCO_val2014_000000436963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543551, "question_id": "UPSi7S98umuuDH36RiG65s", "question": "What country is this bridge located in?", "choices": ["china", "australia", "britain", "italy"], "correct_choice_idx": 1, "direct_answers": ["usa", "no idea", "australia", "england", "new zealand", "non-third world", "melbourne", "united kingdom", "australia", "australia"], "difficult_direct_answer": false, "rationales": ["This bridge is an australian landmark, which means that this location must be the continent and country of australia.", "This bridge is famous in australia.", "The country is australia."], "image": "val2014/COCO_val2014_000000543551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114025, "question_id": "UPtsrEx3iRUkho2rvcQ5fy", "question": "Why does the woman need string?", "choices": ["knit", "fly kite", "sew", "tie ends"], "correct_choice_idx": 1, "direct_answers": ["kite flying", "fly kite", "flying kite", "flying kite", "fly kite", "fly kite", "keep kite", "fly kite", "control kite", "fly kite"], "difficult_direct_answer": false, "rationales": ["The woman is holding a kite.", "The woman wants to fly the kite.", "Since the woman is outside and holding a kite, she needs string to control and fly it."], "image": "val2014/COCO_val2014_000000114025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262463, "question_id": "UPwAcwsk3VSBZx2gDTpFyH", "question": "What is the mood of this group?", "choices": ["worried", "fun-loving", "angry", "celebratory"], "correct_choice_idx": 0, "direct_answers": ["serious", "consider", "focused", "nervous", "upset", "worried", "tense", "confused", "very worried", "intense"], "difficult_direct_answer": true, "rationales": ["The people look somber and no one is smiling.", "Their brows are furrowed and mouths slightly open", "The people's facial expressions best reflect this emotion."], "image": "train2014/COCO_train2014_000000262463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423602, "question_id": "UQbxVwWp6QZkTPRDC9HiU9", "question": "What name is on the bottom of the ski board?", "choices": ["jackson", "yamaha", "goose", "jones"], "correct_choice_idx": 3, "direct_answers": ["ones", "nes", "sence", "jones", "jones", "ones", "jones", "ones", "ones", "jones"], "difficult_direct_answer": false, "rationales": ["The name is jones.", "The ski board says jones.", "Jones is on the bottom of the red and white board."], "image": "train2014/COCO_train2014_000000423602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518375, "question_id": "URFe5NJHkB38ZFvydE2yvm", "question": "What Leavening was used in this dish?", "choices": ["yeast", "none", "rye", "sour kraut"], "correct_choice_idx": 0, "direct_answers": ["yeast", "spoon", "yeast", "spoon", "spoon", "yeast", "yeast", "yeast", "yeast", "yeast"], "difficult_direct_answer": false, "rationales": ["It is a flat bread with no leavening", "Yeast is used in pizza dough.", "Yeast is used in dough."], "image": "val2014/COCO_val2014_000000518375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391735, "question_id": "URNWBFCkuq5qd69LgTdeeE", "question": "What type of traffic is allowed at this street here at this time?", "choices": ["boats", "pedestrian", "cars", "trucks"], "correct_choice_idx": 1, "direct_answers": ["pedestrian crossing", "crosswalk", "pedestrian", "pedestrian", "foot traffic", "vehicle", "pedestrians crossing", "walking", "cars", "vehicle"], "difficult_direct_answer": false, "rationales": ["The traffic is for people.", "The person walking on the traffic light shows pedestrians can walk.", "There is a sign with a person on it that is lit up. that indicates it is clear for a walker to cross the street."], "image": "val2014/COCO_val2014_000000391735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262009, "question_id": "URaw4wskxcCRkU9uFbJSp9", "question": "Why are so many suitcases together?", "choices": ["collection", "trash", "to sell", "storage"], "correct_choice_idx": 3, "direct_answers": ["storage", "storage", "storage", "being sold", "for sale", "for sale", "lost/found", "storage", "warehouse", "lost found"], "difficult_direct_answer": false, "rationales": ["It could arguably be b or c depending on if this is a warehouse, but the most likely reason, given the travel tags, is a. that said, it could be a lost luggage location.", "The suitcases are being kept for their owners.", "They all have different luggage and travel tags on them."], "image": "train2014/COCO_train2014_000000262009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474817, "question_id": "URdvojeU7ffznib8MTm7j9", "question": "The artistically displayed items here are normally connected to what?", "choices": ["solar panels", "roofs", "electricity", "plumbing"], "correct_choice_idx": 3, "direct_answers": ["bathrooms", "plumbing", "duchamp", "plumbing", "plumbing", "think", "lemonade", "plumbing", "pipes", "toilets"], "difficult_direct_answer": false, "rationales": ["These are toilets", "These toilets need to be connected to plumbing.", "Toilets have to be connected to pipes to drain."], "image": "train2014/COCO_train2014_000000474817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238641, "question_id": "URjRF5fUqTRrLRJMgbsADV", "question": "What utensil is missing from this table?", "choices": ["knife", "fork", "plate", "spoon"], "correct_choice_idx": 0, "direct_answers": ["knife", "knife", "spoon", "fork", "knife", "glass", "knife", "knife", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["There are forks and spoons.", "There is a knife missing.", "There is already a fork, spoon and plate on the table."], "image": "train2014/COCO_train2014_000000238641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105468, "question_id": "US4aHZ34SqMdLSnnrGF4N2", "question": "How is this type of bridge called?", "choices": ["collapsing bridge", "up bridge", "bascule bridge", "triangle bridge"], "correct_choice_idx": 2, "direct_answers": ["drawbridge", "bascule bridge", "retractable", "drawbridge", "raised", "open bridge", "drawbridge", "drawbridge", "movable bridge", "lift"], "difficult_direct_answer": false, "rationales": ["The bridge is lifting for ships.", "An internet search for other names for a moveable or draw bridge was used to determine the correct answer.", "The bridge is a bascule."], "image": "train2014/COCO_train2014_000000105468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318783, "question_id": "USBA9ZtyckuCxe35X55gTt", "question": "Why are the boys holding gloves to the ground?", "choices": ["to dance", "to catch", "to clean", "to trick"], "correct_choice_idx": 1, "direct_answers": ["to catch", "rolling ball", "to catch", "fielding grounders", "catch ball", "catch balls", "catching", "practicing", "retrieving balls", "playing"], "difficult_direct_answer": true, "rationales": ["The boys are playing baseball.", "The boys are holding gloves to the ground in order to catch the balls.", "This is a drill to practice for games"], "image": "train2014/COCO_train2014_000000318783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133964, "question_id": "USMqyRKL6gwuFAkf3WepKh", "question": "What type of bird is on the red white and green barrel?", "choices": ["eagle", "swan", "pelican", "rooster"], "correct_choice_idx": 2, "direct_answers": ["pelican", "pelican", "seagull", "pelican", "pelican", "pelican", "pelican", "seagull", "seagull", "pelican"], "difficult_direct_answer": false, "rationales": ["The bird is a water bird. it is a pelican and likes fish.", "The birds have large wings and beaks.", "A pelican has a broad mouth."], "image": "val2014/COCO_val2014_000000133964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463600, "question_id": "UST6N67qVZEoP8Guazxafi", "question": "How many wheels total are there among these vehicles?", "choices": ["ten", "four", "six", "eight"], "correct_choice_idx": 3, "direct_answers": ["eight", "eight", "four", "eight", "eight", "eight", "four", "four", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["The two vehicles has 4 wheels each...4+4=8", "Each vehicles has four wheels on them so the total wheels would be the answer to four plus four.", "There are 8 wheels."], "image": "train2014/COCO_train2014_000000463600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151231, "question_id": "USbNwgjcPLuhv6xV2MfyFf", "question": "What is the name of the street parallel to the stop sign?", "choices": ["chestnut", "washington", "david", "blackberry"], "correct_choice_idx": 2, "direct_answers": ["david", "david", "david", "david", "david st", "david", "david", "david", "david st", "david"], "difficult_direct_answer": false, "rationales": ["The name is on the sign", "The name has been built on top of the sign.", "The name of the street is on the black sign above the red sign."], "image": "val2014/COCO_val2014_000000151231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479042, "question_id": "USdiVvrHFwHJDEB9b5Ty9x", "question": "In order to be authentic this beverage must be produced in what country?", "choices": ["israel", "france", "italy", "denmark"], "correct_choice_idx": 1, "direct_answers": ["france", "france", "france", "italy", "france", "france", "italy", "italy", "not clear", "france"], "difficult_direct_answer": false, "rationales": ["Champagne needs to be produced in france to be authentic.", "Champagne is supposed to come from france.", "The bottle likely contains champagne. champagne originates in, and is named for, a region of france."], "image": "val2014/COCO_val2014_000000479042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322507, "question_id": "USmzvKr68vYfaM2N4QW9fz", "question": "Why are the skiers wearing numbers on their shirts?", "choices": ["for fun", "competing", "for fashion", "to count"], "correct_choice_idx": 1, "direct_answers": ["racing numbers", "ski race", "they're racing", "identification", "competition", "race", "racing", "race", "competing", "for identification"], "difficult_direct_answer": true, "rationales": ["The people are in a race so they need to be identified.", "The skiers are in competition.", "The skiers compete."], "image": "train2014/COCO_train2014_000000322507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442417, "question_id": "USnxc8HENzdx5f4HwtAyfv", "question": "What products can be purchased at this store?", "choices": ["televisions", "mobile phones", "financial services", "food"], "correct_choice_idx": 1, "direct_answers": ["phone", "phone accessories", "electronics", "phones", "electronics", "mobile phones", "electronic", "cellphones", "phones", "electronics"], "difficult_direct_answer": false, "rationales": ["This is a cellular kiosk so they likely cell phones.", "The store is a tech store.", "The products are phones."], "image": "val2014/COCO_val2014_000000442417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361670, "question_id": "UTXoFkYMD96DCnGkxgLZ5L", "question": "What do the kids with the guitars stare at?", "choices": ["monitor screen", "mirror", "window", "other kids"], "correct_choice_idx": 0, "direct_answers": ["tv", "game", "video game", "television", "screen", "monitor screen", "screen", "cant see", "waiting", "tv"], "difficult_direct_answer": false, "rationales": ["They are playing a console game. they would be looking at a television screen.", "The guitars are controllers for a video game. they are staring at the video being outputted by the game.", "They are playing guitars that go with a video game. the video is on the tv."], "image": "train2014/COCO_train2014_000000361670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109351, "question_id": "UThTkZ6j5pXWxb8aJ78TVJ", "question": "What is the best shape for a kite?", "choices": ["square", "triangular", "rectangle", "diamond"], "correct_choice_idx": 3, "direct_answers": ["diamond", "diamond", "triangle", "diamond", "delta", "triangle", "triangular", "diamond", "diamond", "square"], "difficult_direct_answer": false, "rationales": ["Triangle shaped kites are the most common and know to be the best for flight.", "The best shape for a kite is a diamond because it helps it fly through the air", "Diamond shaped kites are common."], "image": "train2014/COCO_train2014_000000109351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413585, "question_id": "UTicZX2jyFGb3rMxPUgwjM", "question": "Are the girls wearing matching necklaces?", "choices": ["unsure", "no", "yes", "maybe"], "correct_choice_idx": 1, "direct_answers": ["no", "no", "no", "no", "no", "no", "no", "no", "no", "no"], "difficult_direct_answer": false, "rationales": ["One girl has a gold dolphin and the other a white object on their necklaces.", "The girls' necklaces are different.", "One is a character and one is a gold dolphin"], "image": "train2014/COCO_train2014_000000413585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410808, "question_id": "UUBC6hP3w38B5HHrfnsKzW", "question": "What is the highest number that is visible?", "choices": ["34", "12", "68", "22"], "correct_choice_idx": 1, "direct_answers": ["12", "12", "twelve", "twelve", "twelve", "12", "twelve", "twelve", "twelve", "twelve"], "difficult_direct_answer": false, "rationales": ["Twelve is the highest number.", "The numbers one through twelve are seen on the clock. twelve is the greatest of these numbers.", "The number is 12."], "image": "train2014/COCO_train2014_000000410808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258182, "question_id": "UUDogpN6jFetbCZxpKSEUP", "question": "What is required of people facing this traffic light?", "choices": ["stop", "yield", "go", "pull over"], "correct_choice_idx": 0, "direct_answers": ["stopping", "stop", "stop", "stop", "stopping", "stopping", "stop", "stop", "stop", "stop"], "difficult_direct_answer": false, "rationales": ["The traffic lights are required for the cars to stop.", "Red means this to drivers.", "Red light means stop."], "image": "train2014/COCO_train2014_000000258182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498890, "question_id": "UUFvdbt78FYHtRWSQftMMA", "question": "What brand is this vehicle?", "choices": ["honda", "ford", "mitsubishi", "toyota"], "correct_choice_idx": 2, "direct_answers": ["mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi"], "difficult_direct_answer": false, "rationales": ["The car's logo is for mitsubishi.", "There is a logo above the license plate. it is not a toyota, honda, or ford logo.", "The logo is visible on the front of the car in a place where a car company would indicate they were the makers of the vehicle. the logo is of a mitsubishi meaning they would be the brand of the car."], "image": "train2014/COCO_train2014_000000498890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107960, "question_id": "UUW9N3vzkp6iL49AYac3dV", "question": "How does this vehicle connect to the ground?", "choices": ["rail", "hooves", "track", "wheels"], "correct_choice_idx": 3, "direct_answers": ["tires", "gravity", "spinning tires", "wheels", "wheels", "wheels", "wheels", "tires", "tires", "tires"], "difficult_direct_answer": false, "rationales": ["The vehicle has wheels.", "The motorcycle has two wheels.", "The person is riding a motorcycle, not a horse, tank, or train."], "image": "val2014/COCO_val2014_000000107960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530848, "question_id": "UUkdCU9HBoRYwuevqSbHVc", "question": "What is the pen used to do on the paper?", "choices": ["stab", "write", "poke", "massage"], "correct_choice_idx": 1, "direct_answers": ["write", "write", "write", "write", "writing", "write", "write", "to write", "write", "write"], "difficult_direct_answer": false, "rationales": ["This is pretty commonly accepted and known application for a pen on paper.", "A pen is resting on a the corner of a piece of paper.", "The answer is commonly known."], "image": "train2014/COCO_train2014_000000530848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164594, "question_id": "UUmybEaZLKx5d8zWHLqMUu", "question": "The person riding on the chair on the elephant is doing so because he is a what?", "choices": ["tour guide", "commuter", "tourist", "safety inspector"], "correct_choice_idx": 2, "direct_answers": ["passenger", "getting ride", "touring traveler", "guide", "tourist", "tour guide", "tourist", "tourist", "traveler", "passenger"], "difficult_direct_answer": false, "rationales": ["The men seems to be tourist in the area.", "The man is a tourist.", "The man looks like he's not a native of the area."], "image": "val2014/COCO_val2014_000000164594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89355, "question_id": "UUoAnDndMcRz7cB6qmTL5R", "question": "Where do these ladies walk to?", "choices": ["ocean", "swimming pool", "snow field", "ice berg"], "correct_choice_idx": 0, "direct_answers": ["beach", "beach", "surf", "beach", "ocean", "water", "ocean", "beach", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["The women are wearing wet suits and are carrying surfboards. they are walking to a body of water that is suitable for surfing.", "Since they are carrying surfboards they are likely headed for the ocean to do some surfing.", "The woman want to go surfing."], "image": "val2014/COCO_val2014_000000089355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231471, "question_id": "UUpfTDgbdvu4WsFSoD5vD6", "question": "What animals are shown in the photo?", "choices": ["frog", "dog", "cat", "bird"], "correct_choice_idx": 3, "direct_answers": ["geese", "ducks", "geese", "ducks", "bird", "geese", "ducks", "ducks", "geese", "ducks"], "difficult_direct_answer": false, "rationales": ["These animals fly and love water", "These are waterfowl", "The animals visible have wings and beaks which are features of birds and no other option."], "image": "val2014/COCO_val2014_000000231471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367335, "question_id": "UUqNXfLYGytvy9J9spKssv", "question": "What can be purchased at this business?", "choices": ["boat", "car", "art", "food"], "correct_choice_idx": 1, "direct_answers": ["cars", "cars", "cars", "cars", "automobile", "saab vehicles", "car", "cars", "cars", "car"], "difficult_direct_answer": false, "rationales": ["The dealership has a sign for saab and sells automobiles.", "This is a brand name for passenger vehicles", "Saab is a car dealership."], "image": "train2014/COCO_train2014_000000367335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378163, "question_id": "UVQhBTsDrc2vEEc6iKGPV9", "question": "Why is the man's face green?", "choices": ["natural color", "dirt", "illness", "face paint"], "correct_choice_idx": 3, "direct_answers": ["grinch", "costume", "paint", "costume", "grinch facepaint", "costume", "grinch", "costume", "paint", "face paint"], "difficult_direct_answer": false, "rationales": ["He's dressed as the grinch.", "The man is wearing a costume.", "The man has face paint."], "image": "val2014/COCO_val2014_000000378163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155012, "question_id": "UVVD9hfM87iDQNcSDegeDv", "question": "What type of waste material is recycled in the blue bin to the left of the dog?", "choices": ["garbage", "paper", "cardboard", "aluminum"], "correct_choice_idx": 1, "direct_answers": ["paper", "paper", "paper plastic", "electronics", "paper", "paper", "paper", "paper", "recyclable", "paper"], "difficult_direct_answer": false, "rationales": ["Paper is in the wastebasket.", "There is an envelope in the top of the waste bin.", "Paper is the material that gets recycled."], "image": "val2014/COCO_val2014_000000155012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343878, "question_id": "UVWodYcmVDTetNk25zFa9U", "question": "What type property is this?", "choices": ["private", "public", "corporate", "borderlands"], "correct_choice_idx": 1, "direct_answers": ["public", "oceanfront", "public", "beach", "park", "public", "public", "public park", "park", "state"], "difficult_direct_answer": false, "rationales": ["It's a public area to be used by anyone.", "This is a roadway. you can see the lines on it.", "This property is not fenced off. it is open to everyone."], "image": "train2014/COCO_train2014_000000343878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547168, "question_id": "UVcJer4wn8YUvkaKyQ3vqi", "question": "Where was modern skiing invented?", "choices": ["switzerland", "russia", "china", "scandinavia"], "correct_choice_idx": 3, "direct_answers": ["french alps", "russia", "scandinavia", "scandinavia", "russia", "scandinavia", "6000bce", "scandinavia", "america", "scandanavia"], "difficult_direct_answer": false, "rationales": ["It was first done in scandinavia.", "The first modern skiers were scandinavian.", "Scandinavia is credited for skiing."], "image": "train2014/COCO_train2014_000000547168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287402, "question_id": "UWLPB4p7ztmmLgENDfNXcP", "question": "What color pants does the person who threw the frisbee wear?", "choices": ["black", "none", "white", "tan"], "correct_choice_idx": 3, "direct_answers": ["beige", "gray", "tan", "tan", "beige", "grey", "white", "gray", "tan", "tan"], "difficult_direct_answer": false, "rationales": ["The man is wearing khaki style pants which are in that color.", "They look tan from the distance in the photo.", "The color is tan."], "image": "val2014/COCO_val2014_000000287402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147897, "question_id": "UWXjdWouHRQybmEvSpqi2Q", "question": "What type of camera is he using?", "choices": ["film", "dslr", "phone", "digital"], "correct_choice_idx": 3, "direct_answers": ["digital", "digital", "digital", "digital", "digital", "nikon", "digital", "small", "digital", "mini camera"], "difficult_direct_answer": false, "rationales": ["A small silver camera with a small lens is being aimed by a man in a bathroom.", "It is really compact and has that small lense off-center.", "The camera is digital."], "image": "val2014/COCO_val2014_000000147897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321663, "question_id": "UWgqwSJDLc8zpoH9uPPbTm", "question": "What type of event do the people skiing take part in?", "choices": ["bakeoff", "beauty contest", "eating contest", "race"], "correct_choice_idx": 3, "direct_answers": ["skiing", "racing", "nordic", "downhill", "cosplay", "race", "halloween", "race", "crosscountry", "dr suess"], "difficult_direct_answer": true, "rationales": ["A woman is skiing with a number on her shirt.", "A is the only one of the listed activities which is traditionally performed on skis.", "The participants have numbers and are competing on a course."], "image": "val2014/COCO_val2014_000000321663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568880, "question_id": "UWwkkE7FtsN3sdaSXsHdiA", "question": "How were the potatoes seen here cooked?", "choices": ["fried", "mashed", "raw", "baked"], "correct_choice_idx": 0, "direct_answers": ["boiled", "fried", "fried", "fried", "french fries", "fried", "french fried", "french fried", "fried", "fried"], "difficult_direct_answer": false, "rationales": ["The potatoes were fried.", "They look golden brown which shows they were thrown in grease, and potatoes are normally \"white\"-looking.", "A large plate of french fries is seen on the upper left side of the table. they are, of course, fried in oil and are a much-beloved food for millions of people!."], "image": "train2014/COCO_train2014_000000568880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328494, "question_id": "UX2R7SbXrFAdHL4e2vK9Ld", "question": "What is above the car?", "choices": ["airplane", "balloon", "zeppelin", "traffic light"], "correct_choice_idx": 3, "direct_answers": ["stoplight", "traffic light", "traffic light", "stoplight", "traffic light", "traffic light", "traffic light", "traffic light", "traffic light", "stoplight"], "difficult_direct_answer": false, "rationales": ["Stop lights hang on wire above cars over a road at an intersection", "There is a traffic light above the car.", "The traffic light is above the car that's blurry."], "image": "train2014/COCO_train2014_000000328494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239616, "question_id": "UXAR8beW9EM5Lqvi9JBWwu", "question": "What item is mimicked behind her head?", "choices": ["bow tie", "ear", "belly button", "nose"], "correct_choice_idx": 0, "direct_answers": ["bow", "bow tie", "bow tie", "bowtie", "bow", "teddy bear", "ribbon", "bow tie", "bow", "bowtie"], "difficult_direct_answer": false, "rationales": ["The bear statue is wearing a bow tie.", "Her head is lower than the bear's nose and ears and higher than the bear's belly button. her head is near the bear's neck.", "The bear is wearing a bowtie."], "image": "train2014/COCO_train2014_000000239616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361428, "question_id": "UXHUjwiqUq2FZLaTinqAPZ", "question": "What is in the air?", "choices": ["birds", "balloons", "kites", "airplane"], "correct_choice_idx": 0, "direct_answers": ["seagulls", "birds", "birds", "seagulls", "seagulls", "seagulls", "birds", "bird", "birds", "seagull"], "difficult_direct_answer": false, "rationales": ["There are seagulls in the air.", "The white bird is flying with its wings spread.", "The objects in the air are flying animals."], "image": "train2014/COCO_train2014_000000361428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401455, "question_id": "UXU9rUdEbpKHa7UybTHwer", "question": "What are they doing?", "choices": ["eating sheep", "stealing sheep", "cleaning sheep", "sheering sheep"], "correct_choice_idx": 3, "direct_answers": ["shearing", "shearing sheep", "shaving", "sheering sheep", "sheering sheep", "sheering sheep", "shearing sheep", "shearing sheep", "shearing sheep", "shearing sheep"], "difficult_direct_answer": false, "rationales": ["They are shearing the wool off the sheep.", "The people are sheering sheep's wool.", "A woman is grabbing a sheep and wool is all over a table nearby."], "image": "train2014/COCO_train2014_000000401455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569774, "question_id": "UYC7N2DSt9McAYUguFfLEj", "question": "What appliance will they use to cook this dish?", "choices": ["broiler", "oven", "grill", "stove"], "correct_choice_idx": 1, "direct_answers": ["oven", "oven", "oven", "pizza", "toaster", "oven", "oven", "oven", "oven", "pane"], "difficult_direct_answer": false, "rationales": ["The appliance is the oven.", "Pizzas are baked.", "Pizzas are cooked in the oven."], "image": "train2014/COCO_train2014_000000569774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401032, "question_id": "UYeXAME9y85csFs7KkjZU2", "question": "Who won the game?", "choices": ["girl", "boy", "man", "woman"], "correct_choice_idx": 3, "direct_answers": ["girl", "woman", "woman", "woman", "girl smiling", "woman", "woman", "girl", "woman", "woman"], "difficult_direct_answer": false, "rationales": ["The man looks disappointed and the female is doing a triumphant stance.", "She is raising her arms and smiling.", "The person on the left is celebrating. the person on the right is unhappy."], "image": "train2014/COCO_train2014_000000401032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454813, "question_id": "UZMpQsfesAiHvxoNttWP2h", "question": "What are they doing?", "choices": ["cleaning up", "signing agreement", "selling goods", "checking documents"], "correct_choice_idx": 1, "direct_answers": ["signing", "signing agreement", "signing documents", "signing agreement", "signing", "signing contracts", "signing bills", "signing contracts", "writing", "signing document"], "difficult_direct_answer": false, "rationales": ["The man is holding a pen, hovering over the bottom of the paper where signature usually goes.", "The men are signing paperwork.", "The men are signing contracts with their pens."], "image": "train2014/COCO_train2014_000000454813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245383, "question_id": "UZUFoLoSuzrdmcbXwZsas5", "question": "What type of vehicles are most shown here?", "choices": ["bicycles", "trains", "cars", "motorcycles"], "correct_choice_idx": 3, "direct_answers": ["motorcycle", "motorcycles", "motorcycles", "motorcycles", "motorcycles", "motorcycles", "motorcycle", "motorcycles", "motorcycle", "motorcycles"], "difficult_direct_answer": false, "rationales": ["The vehicles are motorbikes.", "People are using motorcycles around the roads.", "They are small and the rider is exposed"], "image": "val2014/COCO_val2014_000000245383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552545, "question_id": "UZUrke5peTPUvVXZ9FA6Tq", "question": "What is the person doing here?", "choices": ["working", "sleeping", "counting", "planning"], "correct_choice_idx": 1, "direct_answers": ["sleeping", "sleeping", "sleeping", "watching television", "sleeping", "sleeping", "watching tv", "sleeping", "sleeping", "napping"], "difficult_direct_answer": false, "rationales": ["The person is in a horizontal position which is most associated with sleep. there is also a floral pattern underneath them that is likley sheets that would be on a bed where one would sleep.", "They are on a bed and not moving", "The person fell asleep while holding the remote."], "image": "val2014/COCO_val2014_000000552545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41669, "question_id": "UZqgrbRH6jNN6mnfnsU9Rv", "question": "The flag of what country is placed in the berry cake?", "choices": ["united kingdom", "france", "sweden", "germany"], "correct_choice_idx": 0, "direct_answers": ["england", "england", "great britain", "united kingdom", "england", "england", "england", "england", "united kingdom", "united kingdom"], "difficult_direct_answer": false, "rationales": ["The cake has a british flag on it.", "The flag is for the uk.", "This is the union jack flag"], "image": "train2014/COCO_train2014_000000041669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53410, "question_id": "UZtSattiHoScn4p8LLsJfM", "question": "What is closest to the computer screen?", "choices": ["cup", "nose", "printer", "cat"], "correct_choice_idx": 0, "direct_answers": ["cup", "cup", "monitor", "cup", "cup", "cup", "keyboard", "elbow", "pamphlet", "cup"], "difficult_direct_answer": false, "rationales": ["There is a vessel for drinking next to the screen.", "It is almost touching the side of it", "A cup is the closest object to the computer screen."], "image": "val2014/COCO_val2014_000000053410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303281, "question_id": "UZyzqXUMAS7UdByhjuEogW", "question": "What fruit is the tree bearing most likely?", "choices": ["plums", "apples", "dragon fruit", "pomegranates"], "correct_choice_idx": 1, "direct_answers": ["apples", "apples", "apples", "apple", "apples", "apple", "apples", "apples", "apples", "apple"], "difficult_direct_answer": false, "rationales": ["The fruit in the tree is round and red.", "These are round and red", "Apples are on the trees."], "image": "train2014/COCO_train2014_000000303281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114363, "question_id": "UaM7c67fSAVibn9ZMtxFeR", "question": "What vehicle type is not allowed to park on the street?", "choices": ["motorcycles", "bicycles", "buses", "trucks"], "correct_choice_idx": 3, "direct_answers": ["large vans", "truck", "trucks", "delivery trucks", "delivery truck", "trucks", "van", "trucks", "large trucks", "trucks"], "difficult_direct_answer": false, "rationales": ["That's what is crossed out in the sign.", "The type is a truck.", "There are trucks not allowed to park in this street."], "image": "train2014/COCO_train2014_000000114363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112409, "question_id": "UaTM3qrTKmeYwscNqTY8Tw", "question": "Who is in the greatest danger?", "choices": ["old woman", "young woman", "girl", "boy"], "correct_choice_idx": 0, "direct_answers": ["blonde woman", "older woman", "woman", "surfer", "old woman", "right person", "boarder", "no board", "left woman", "woman"], "difficult_direct_answer": true, "rationales": ["She isn't holding the board", "An older woman is in the greatest danger.", "The old woman does not have a board to keep her afloat."], "image": "train2014/COCO_train2014_000000112409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371861, "question_id": "UaZJyTXQXUzc6a7KTG73LP", "question": "What type of animal is being depicted on the plate with the food on it?", "choices": ["horse", "elephant", "donkey", "pig"], "correct_choice_idx": 0, "direct_answers": ["horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["Horses are on the plate.", "Fried food is being served on a plate with a cowboy scene on it.", "Most people don't eat horses, elephants, or donkeys."], "image": "val2014/COCO_val2014_000000371861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39082, "question_id": "UbQvKcGu3dhx9zMdGqBTat", "question": "Why is she holding the racquet behind her?", "choices": ["hiding it", "keep sage", "hit ball", "wants scare"], "correct_choice_idx": 2, "direct_answers": ["hit ball", "for momentum", "hit ball", "swatting", "serving ball", "power", "hit ball", "power", "getting power", "power"], "difficult_direct_answer": false, "rationales": ["She is ready to swing and hit the ball.", "She is trying to hit the ball.", "The person is playing tennis and based on their handedness, body position and the relative position of the ball, for her to be in this position she would be attempting to hit the ball which is the object of the game of tennis."], "image": "train2014/COCO_train2014_000000039082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463722, "question_id": "UbRgWjsXi6SM9MfhThcSue", "question": "What are professional horse riders called?", "choices": ["jockeys", "gunners", "riders", "trotters"], "correct_choice_idx": 0, "direct_answers": ["equestrian", "cowboys", "jockeys", "equestrian", "jockeys", "jockeys", "equestrian", "jockey", "equestrians", "jockeys"], "difficult_direct_answer": false, "rationales": ["Jockeys ride horses for a living.", "A person that rides a horse is called a jockey.", "This is the term for the riders"], "image": "val2014/COCO_val2014_000000463722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103137, "question_id": "UbW7NE6GF9rccaVwy7Xbzs", "question": "Which Giants team does he play for?", "choices": ["calgary", "yomiuri", "new york", "san francisco"], "correct_choice_idx": 1, "direct_answers": ["giants", "padres", "new york", "other team", "baseball", "san francisco", "yomiuri", "san francisco", "san francisco", "san francisco"], "difficult_direct_answer": false, "rationales": ["The writing on the sleeve that we can see is \"yomi\" which narrows down our options for the correct answer.", "There is either the yankees or that team for that location.", "The player is yomiuri."], "image": "train2014/COCO_train2014_000000103137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33718, "question_id": "UbmtczhwTjrKopDvesrPQ8", "question": "What did the man in the air just do?", "choices": ["bounce", "land", "jump", "fall"], "correct_choice_idx": 2, "direct_answers": ["skateboard stunt", "trick", "jump", "jump", "skateboard trick", "trick", "skateboarding trick", "jumped", "wheelie", "jump"], "difficult_direct_answer": false, "rationales": ["The man in the air just jumped his skateboard.", "The man just jumped into the air.", "He's doing a trick on the skateboard"], "image": "train2014/COCO_train2014_000000033718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402420, "question_id": "Uc54Xsj7Sdy3KxBwgRNuE3", "question": "What is the man wearing over his shirt?", "choices": ["sweater", "scarf", "vest", "suit"], "correct_choice_idx": 2, "direct_answers": ["vest", "vest", "vest", "vest", "vest", "vest", "vest", "vest", "vest", "vest"], "difficult_direct_answer": false, "rationales": ["The clothing over the man's shirt has no sleeves.", "This item has buttons and has no sleeves.", "The man has a black vest on that is sleeveless."], "image": "train2014/COCO_train2014_000000402420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125548, "question_id": "Uc6wZBNcdjRLh6kjtvrB7B", "question": "What happened to this image?", "choices": ["blurred", "photoshopped", "too dark", "too bright"], "correct_choice_idx": 1, "direct_answers": ["photoshopped", "superimposed", "photoshopped", "photoshop", "photoshop", "photoshop", "graphics", "photoshopped", "edited", "illusion"], "difficult_direct_answer": false, "rationales": ["Someone put two different pictures together", "The lighting of the lady and the elephant don't align and are poorly blended together. there is light on the back of the elephant but not her.", "The image was photoshopped."], "image": "train2014/COCO_train2014_000000125548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550055, "question_id": "UcFm57bZCM9mvbvn9VA5PN", "question": "What is the circular object under the wing?", "choices": ["aileron", "jet engine", "compartment", "trash can"], "correct_choice_idx": 1, "direct_answers": ["jet engine", "engine", "engine", "engine", "engine", "engine", "engine", "engine", "engine", "engine"], "difficult_direct_answer": false, "rationales": ["The circular object under the wing signifies this is a jet engine operated aircraft.", "The object is the engine.", "A jet engine is under the wing."], "image": "val2014/COCO_val2014_000000550055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314313, "question_id": "UcSBpyrtS3aFcodqotRgCX", "question": "Where is the company from that makes the white truck?", "choices": ["france", "germany", "kazakhstan", "japan"], "correct_choice_idx": 3, "direct_answers": ["japan", "japan", "america", "japan", "mistsubishi", "mitsubishi", "g", "japan", "mitsubishi", "japan"], "difficult_direct_answer": false, "rationales": ["The brand is visibly mitsubishi, both written on the front and in the logo, which is made in japan.", "Mitsubishi is a japanese company.", "The name of the company that is on the front of the truck is mitsubishi. mitsubishi is a japanese company."], "image": "val2014/COCO_val2014_000000314313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186968, "question_id": "UciV4X7KxDDcdptCfWSAPi", "question": "Which color item qualifies as dairy?", "choices": ["orange", "green", "pink", "yellow"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "orange", "yellow", "orange", "orange", "orange", "orange", "yellow", "orange"], "difficult_direct_answer": false, "rationales": ["The orange item on the crackers is a type of cheese and cheese often looks yellow or orange.", "The cheese qualifies as dairy. the yellow banana, green leaves, and pink meat are not dairy.", "The item is orange."], "image": "train2014/COCO_train2014_000000186968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111436, "question_id": "UcjLZ8ZaiQWcSLCeLhbxW8", "question": "What is the man in the foreground doing?", "choices": ["hiding bike", "eating lunch", "stealing bike", "repairing bike"], "correct_choice_idx": 3, "direct_answers": ["sitting", "sitting", "sitting", "sleeping", "sitting", "sitting", "sitting", "park place", "sitting", "repairing bike"], "difficult_direct_answer": false, "rationales": ["The man in the foreground is tinkering around with a bicycle.", "The man is repairing.", "The man is fixing his bike."], "image": "train2014/COCO_train2014_000000111436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432912, "question_id": "Uck47RvvQisutxEWASC88E", "question": "Why are the triangular cones orange in color?", "choices": ["camouflage", "visibility", "design", "random pick"], "correct_choice_idx": 1, "direct_answers": ["play", "barriers", "more visible", "to warn", "boundary setting", "visibility", "visibility", "safety", "more visible", "for warning"], "difficult_direct_answer": false, "rationales": ["The orange color sticks out amongst the green of the grass.", "So the players will see them", "The triangular cones are orange because that color makes them much easier to see."], "image": "train2014/COCO_train2014_000000432912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428754, "question_id": "Uczn5tnAgMe8V68C73WiMr", "question": "What is the person standing on?", "choices": ["snow", "water", "sticks", "concrete"], "correct_choice_idx": 3, "direct_answers": ["cement", "pavement", "asphalt", "pavement", "asphalt", "asphalt", "asphalt", "concrete", "black top", "pavement"], "difficult_direct_answer": false, "rationales": ["The man is standing in a parking lot.", "This person is standing in a parking lot on the hard cement.", "It actually looks like it might be blacktop instead of just a, but the other options don't apply except maybe b since it does look wet as well."], "image": "val2014/COCO_val2014_000000428754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280069, "question_id": "UczyPbsmVu8hn8a3QQXXtQ", "question": "Why are these sheep in pens?", "choices": ["show", "safety", "transport", "petting zoo"], "correct_choice_idx": 1, "direct_answers": ["farmer controlled", "corralled", "grazing", "to shear", "on farm", "prevent escape", "safety", "safety", "stay home", "farm animals"], "difficult_direct_answer": true, "rationales": ["These are prey animals that need protection form humans.", "They are fenced in so they don't run off or get lost. also being fenced in will help keep some predators out that could hurt them.", "To keep them safe from predators."], "image": "train2014/COCO_train2014_000000280069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469147, "question_id": "Ud5URAVrns6fFVzPyw8rpZ", "question": "Which fruit pictured is a good source of vitamin C?", "choices": ["lemons", "tangerines", "watermelon", "oranges"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange", "oranges", "orange", "orange", "oranges", "orange", "oranges", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["Crates of orange fruit are on display.", "Oranges are seen in the stand and no other fruits are visible. oranges are also a good source of vitamin c.", "Oranges have vitamin c."], "image": "train2014/COCO_train2014_000000469147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127842, "question_id": "Ud97fytk7A6uEyJU9M4gQr", "question": "What body of water is this activity more commonly done in?", "choices": ["pool", "lake", "ocean", "river"], "correct_choice_idx": 2, "direct_answers": ["ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["Rivers typically have fast streams like this.", "There are larger areas for waves to break there", "Surfing is more commonly done in an ocean instead of a river."], "image": "val2014/COCO_val2014_000000127842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116023, "question_id": "UdBhtK2gohbpQWZ3qdGP8K", "question": "What is the player doing here?", "choices": ["congratulating", "quitting", "serving", "returning ball"], "correct_choice_idx": 3, "direct_answers": ["hit ball", "returning volley", "playing tennis", "hitting ball", "returning ball", "forehand", "serve", "playing tennis", "hitting ball", "playing tennis"], "difficult_direct_answer": false, "rationales": ["He needs to hit the ball back to the other person", "The player returns the ball.", "The ball was hit to him by his opponent so he needs to send it back."], "image": "val2014/COCO_val2014_000000116023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142934, "question_id": "UdCUyeaJA8Ei2CGdvtidDh", "question": "What is the weather?", "choices": ["warm", "snowy", "sunny", "rainy"], "correct_choice_idx": 1, "direct_answers": ["clear", "snowy", "cold", "clear cold", "cold", "cold", "chilly", "freezing temperatures", "snowy", "sunny"], "difficult_direct_answer": false, "rationales": ["People are walking up the side of a mountain that is covered in snow. snow covers the mountain and those nearby.", "There is snow in the ground.", "The weather is snowy."], "image": "val2014/COCO_val2014_000000142934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143621, "question_id": "UdKs9Qu6jr8bvg8cxaXH9Y", "question": "What is on the plate?", "choices": ["pear", "apple", "sausage", "cake"], "correct_choice_idx": 3, "direct_answers": ["dessert", "cheesecake", "cheesecake", "cheesecake", "dessert", "cheesecake", "strawberry cheesecake", "cake", "dessert", "cheesecake"], "difficult_direct_answer": false, "rationales": ["It is a popular dessert made with cream cheese.", "There is cheesecake.", "The cake is plated."], "image": "train2014/COCO_train2014_000000143621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570945, "question_id": "UdWn55aaMUJ7tNVv22CYjc", "question": "The kites are flying above what?", "choices": ["forest", "park", "school", "beach"], "correct_choice_idx": 3, "direct_answers": ["girl", "head", "people", "child", "girl", "girl", "her head", "person", "everyone", "beach"], "difficult_direct_answer": false, "rationales": ["The girl's hair is wet and she is wearing a bathing suit.", "She looks like she might be in a swimsuit and that is commonly worn at a beach.", "The person has very small clothing on so it is summer"], "image": "train2014/COCO_train2014_000000570945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539934, "question_id": "Udh7DQXtgLzaC9cUs6dUEp", "question": "The clover in the grass beneath the trees is blooming during which season?", "choices": ["fall", "summer", "spring", "winter"], "correct_choice_idx": 2, "direct_answers": ["spring", "spring", "spring", "spring", "spring", "spring", "auto", "spring", "summer", "spring"], "difficult_direct_answer": false, "rationales": ["Clovers usually grow in the spring.", "Clover grows best in rainy cool conditions. this season is accompanied by flowers.", "A lot of plants and flowers will bloom in spring."], "image": "train2014/COCO_train2014_000000539934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442979, "question_id": "UdtkZHjBFtFYKQFKrkP9cn", "question": "What restricts their movements?", "choices": ["farmer", "other sheep", "fence", "trees"], "correct_choice_idx": 2, "direct_answers": ["fence", "fence", "branches", "fence", "trees", "fence", "fence", "enclosure", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["The fence is restrictive.", "A fence can be seen behind several sheep. fencing is used to keep animals in or out of an area.", "The sheep cannot get past the metal enclosure."], "image": "val2014/COCO_val2014_000000442979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393943, "question_id": "UdywhA9qJRRtRmiNrDMiFC", "question": "What do people put around train tracks?", "choices": ["gravel", "ballast", "metal", "cement"], "correct_choice_idx": 1, "direct_answers": ["rocks", "fences", "ballast", "crosses", "gravel", "rocks", "train", "gravel/rocks", "stations", "rocks/gravel"], "difficult_direct_answer": true, "rationales": ["As can be seen in the image.", "Ballast surrounds the tracks.", "The rocks are small."], "image": "train2014/COCO_train2014_000000393943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321363, "question_id": "UeRvh7L9rJiF3ry99rwsTF", "question": "What type of frosting is on the donut?", "choices": ["chocolate", "mint", "vanilla", "strawberry"], "correct_choice_idx": 3, "direct_answers": ["pink", "pink", "pink", "strawberry", "pink", "icecream", "pink", "pink", "sprinked frosted", "homemade frosted"], "difficult_direct_answer": false, "rationales": ["The frosting is pink.", "Pink icing is often flavoured with strawberry.", "The frosting is pink which suggests it is flavored like a pink food."], "image": "train2014/COCO_train2014_000000321363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557434, "question_id": "UfHcJgHNnQebFHcjekJyyV", "question": "What kind of product is most probably being transported in the last and tallest box on the truck?", "choices": ["clothing", "home appliance", "food", "auto parts"], "correct_choice_idx": 1, "direct_answers": ["refrigerator", "dryer", "refrigerator", "washing machine", "fridge", "dishwashers", "dishwasher", "refrigerator", "dishwasher", "home appliance"], "difficult_direct_answer": false, "rationales": ["The box says whirlpool.", "There is a \"whirlpool\" logo. whirlpool makes home appliances.", "The box has the word 'whirlpool' written on it. whirlpool is a manufacturer of household appliances."], "image": "train2014/COCO_train2014_000000557434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1536, "question_id": "UfPPxFvsKuzUzGhiRufzFT", "question": "The pizza came out of the oven powered by which fuel source?", "choices": ["propane", "charcoal", "electricity", "natural gas"], "correct_choice_idx": 3, "direct_answers": ["natural gas", "gas", "gas", "gas", "gas", "electric", "gas", "gas", "gas", "electric"], "difficult_direct_answer": false, "rationales": ["The oven is using natural gas.", "The pizza is from a gas oven.", "An oven burner with an ignitor is seen under a pizza on a pan. an ignitor is needed for gas stoves."], "image": "train2014/COCO_train2014_000000001536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354771, "question_id": "UfRuvLUCt4wozYGuuNBLsU", "question": "Why is he holding the bat like that?", "choices": ["showing off", "hit ball", "is angry", "exercise"], "correct_choice_idx": 1, "direct_answers": ["hitting ball", "hit ball", "hit ball", "hitting ball", "swinging", "hit ball", "swinging", "hit ball", "she", "swinging"], "difficult_direct_answer": false, "rationales": ["He is holding the bat to hit the ball like this.", "Both hands at the base of the bat makes for a solid grip when the ball comes.", "He wants to hit."], "image": "train2014/COCO_train2014_000000354771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102555, "question_id": "UgLngGtkcpoaXnMXBAkPGH", "question": "What is the man doing with the dog?", "choices": ["feeding", "petting", "washing", "brushing"], "correct_choice_idx": 1, "direct_answers": ["petting", "petting", "petting", "petting", "petting", "petting", "petting", "petting it", "petting it", "petting"], "difficult_direct_answer": false, "rationales": ["He has his hand on the head", "The man is gently touching the dog.", "The man is petting the dog."], "image": "val2014/COCO_val2014_000000102555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522880, "question_id": "UgQMnfJ5oYMKutYEzwZphg", "question": "What is the floor of the elephants pen made of?", "choices": ["dirt", "cement", "steel", "carpet"], "correct_choice_idx": 0, "direct_answers": ["soil", "dirt", "sand", "sand", "dirt", "three", "sand", "dirt", "sand", "dirt"], "difficult_direct_answer": false, "rationales": ["The elephant pen floor has dirt.", "It is dirt and they can walk on it without hurting their feet.", "The floor is dirt."], "image": "val2014/COCO_val2014_000000522880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495357, "question_id": "UgZK4tLRKPnEcS9F4bxJg4", "question": "What does the woman all the way to the left have?", "choices": ["purse", "umbrella", "cane", "parasol"], "correct_choice_idx": 0, "direct_answers": ["purse", "purse", "baby", "purse", "baby", "purse", "bag", "baby", "purse", "baby"], "difficult_direct_answer": false, "rationales": ["She has a bag hanging on her by a strap", "The woman has a purse.", "The woman has her bag with her."], "image": "val2014/COCO_val2014_000000495357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448275, "question_id": "UggJYHmNocFEUCxoP2Lrra", "question": "What body part does the blue bowl nearest to the man represent?", "choices": ["heart", "liver", "lungs", "brains"], "correct_choice_idx": 0, "direct_answers": ["eye", "lungs", "elbow", "unknown", "flower", "heart", "heart", "lips", "eyes", "elbow"], "difficult_direct_answer": false, "rationales": ["Bowls come in many shapes and size. the shape of the bowl most closely resembles a heart.", "The body part would be the heart.", "It has rounded edges that from an angle look like this but it's a flower"], "image": "val2014/COCO_val2014_000000448275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423818, "question_id": "UgnSruDuFHBGVvQkbbg32R", "question": "Why are the men leaning to one side?", "choices": ["to turn", "to dance", "to exercise", "to wrestle"], "correct_choice_idx": 0, "direct_answers": ["turning", "balance", "to turn", "balance", "balance", "turning snowboard", "speed", "balance", "turning", "turning"], "difficult_direct_answer": false, "rationales": ["The men are changing direction.", "This helps them maneuver", "The snowboarders are taking a turn to the right, so they must lean into it before returning to an upright position."], "image": "train2014/COCO_train2014_000000423818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465740, "question_id": "UgnqTLfaYPai3DSwifC4W9", "question": "What does the person have in their hand?", "choices": ["ski pole", "plate", "scimitar", "wallet"], "correct_choice_idx": 0, "direct_answers": ["ski poles", "ski pole", "pole", "ski pole", "ski pole", "pole", "ski pole", "pole", "ski pole", "pole"], "difficult_direct_answer": false, "rationales": ["The person is trying to ski.", "The person is traveling via skis.", "This helps them keep balance and move forward on the skis"], "image": "train2014/COCO_train2014_000000465740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9286, "question_id": "Ugr59x5wVUFoot2AMhPRFg", "question": "Who are the yellow buses designed for?", "choices": ["auto racers", "administrators", "students", "teachers"], "correct_choice_idx": 2, "direct_answers": ["students", "students", "kids", "transportation children", "school kids", "school", "school", "school", "transporting students", "students"], "difficult_direct_answer": false, "rationales": ["There are many school buses parked in a gated area. it is used to pick up people for school.", "These buses are made for school children.", "Yellow buses are used by school districts to provide transportation to the children they educate. these yellow buses are school buses and are used for students."], "image": "val2014/COCO_val2014_000000009286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167123, "question_id": "UgsCAJ3iB3Nax7UgoAgUam", "question": "What is the plane doing that requires it to be perpendicular to the runway?", "choices": ["taking off", "boarding", "taxiing", "landing"], "correct_choice_idx": 2, "direct_answers": ["stopping", "turning", "parking", "parked", "taxiing", "moving", "taxiing", "waiting", "approaching runway", "taxiing"], "difficult_direct_answer": false, "rationales": ["A plane is on a runway and is turned towards the side. the wheels of the plane are turned.", "The airplane is currently taxiing.", "The plane is taxiing."], "image": "train2014/COCO_train2014_000000167123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158421, "question_id": "UgsGaVCdPH5psRpKULR2jA", "question": "What kind of Olympic game it is?", "choices": ["summer", "winter", "spring", "autumn"], "correct_choice_idx": 1, "direct_answers": ["skiing", "winter", "skiing", "downhill skiing", "skiing", "skiing", "snowboarding", "skiing", "skiing", "downhill skiing"], "difficult_direct_answer": false, "rationales": ["This is a winter olympic game.", "The ground is covered in snow. the people are skiing and snowboarding.", "There is snow out."], "image": "val2014/COCO_val2014_000000158421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438196, "question_id": "UhFusPLBd9R9nesDjW3BoB", "question": "The elephants and people are causing what to form behind them?", "choices": ["stampede", "traffic jam", "circus", "riot"], "correct_choice_idx": 1, "direct_answers": ["traffic jam", "traffic lines", "lineup", "traffic jam", "traffic jam", "traffic jam", "traffic", "traffic", "traffic", "traffic"], "difficult_direct_answer": false, "rationales": ["Stalled vehicles can be seen behind the elephants and people.", "This is indicated by the traffic congestion.", "The elephants are walking slowly in a street car lane without a way for the vehicles behind the elephants to pass them."], "image": "train2014/COCO_train2014_000000438196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506470, "question_id": "UhHu4DmeniDBxgoZJLZUQU", "question": "What kind of programming is currently playing on the television most probably?", "choices": ["news", "kids", "sports", "reality"], "correct_choice_idx": 0, "direct_answers": ["news", "news", "news", "news", "news", "news", "news", "news", "news", "news"], "difficult_direct_answer": false, "rationales": ["There is a man in a suit sitting at a desk", "You can tell by his clothing and sitting in front of a desk.", "The television is most likely playing the news."], "image": "train2014/COCO_train2014_000000506470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277542, "question_id": "UhHyxdtBMUd3JKc2WTprFL", "question": "Why is the man on the horse here?", "choices": ["seeding employment", "herding animals", "selling livestock", "is curious"], "correct_choice_idx": 1, "direct_answers": ["herd cattle", "herding animals", "herding", "riding it", "directing steer", "cows", "herding", "riding", "herding cattle", "guiding herd"], "difficult_direct_answer": true, "rationales": ["He's keeping the cows moving in one direction", "The rancher is moving the cows forward.", "A man on a brown horse gently keeps his brown cows moving down the road as they get relocated to a new feedlot."], "image": "val2014/COCO_val2014_000000277542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157901, "question_id": "UhPAQy9zaEy8FXQbwpkYh3", "question": "Why do sheep graze in a field?", "choices": ["socialize", "stimulate plants", "forage", "relaxation"], "correct_choice_idx": 1, "direct_answers": ["eat", "to eat", "eating", "love grass", "hungry", "to eat", "hungry", "stimulate plants", "eat grass", "hungry"], "difficult_direct_answer": false, "rationales": ["The sheep are there to eat.", "The sheep need to eat grass to stay full and healthy. grass is the easiest food for them find in the pastures.", "The sheep simulate plants."], "image": "train2014/COCO_train2014_000000157901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380591, "question_id": "UhcsP6yeSSyVoDJwMxy8Zk", "question": "The nearest shadow belongs to the man wearing what color of shirt?", "choices": ["black", "red", "white", "orange"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange", "orange", "red", "orange", "red", "red", "orange", "orange", "red"], "difficult_direct_answer": false, "rationales": ["It could be argued that in that light the color is a and b.", "The shadow is being created by the man in orange.", "His shirt is visible and is orange in colour."], "image": "train2014/COCO_train2014_000000380591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97379, "question_id": "Uhg7MyHAQLdA3HGpqDttdQ", "question": "What missing items allows kites to be easily flown here?", "choices": ["kids", "power lines", "trains", "parents"], "correct_choice_idx": 1, "direct_answers": ["power lines", "wind", "wind", "string", "string", "wind", "trees", "fans", "wind", "fans"], "difficult_direct_answer": false, "rationales": ["There are no power lines in the way.", "The power lines are not present, so the kites cannot get stuck.", "Power lines usually cause kites to get tangled."], "image": "val2014/COCO_val2014_000000097379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143980, "question_id": "UhijcKJyerT9irN6wdfRXs", "question": "What fruit is in the middle?", "choices": ["oranges", "grapes", "watermelon", "mangoes"], "correct_choice_idx": 0, "direct_answers": ["orange", "oranges", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["Oranges are in the middle.", "The oranges have green apples to their left and red apples to their right. it's impossible, due to their singular bright color, to mistake them for anything else!.", "The fruit is oranges."], "image": "train2014/COCO_train2014_000000143980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367641, "question_id": "UhuiLLuuWPfXkSWtcCZ6JD", "question": "How many person can be seen?", "choices": ["three", "two", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two different people in the image. one looking forward and one looking back.", "There is a man buried in the snow and another man bending over.", "There are two people."], "image": "train2014/COCO_train2014_000000367641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251427, "question_id": "UhzfoQXbCdA9WWThSiUfj6", "question": "Why can't they travel?", "choices": ["no wind", "too rainy", "no gas", "no water"], "correct_choice_idx": 3, "direct_answers": ["beached", "no water", "no water", "sandlogged", "beached", "beached", "low tide", "beached", "on beach", "on beach"], "difficult_direct_answer": false, "rationales": ["There is no water visible.", "Boats need water to travel - these are on sand.", "The boats are in the sand and do not appear to be close to the water."], "image": "train2014/COCO_train2014_000000251427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42260, "question_id": "UiaYVmLKUwCXBnkPuNiXSM", "question": "What are the people under the umbrella fearing?", "choices": ["sunburn", "wind", "rain", "itching"], "correct_choice_idx": 0, "direct_answers": ["sun burn", "being sunburned", "sunburn", "sun", "sunburn", "fearing", "sunburn", "sunburn", "sun", "sunburn"], "difficult_direct_answer": false, "rationales": ["The people are sitting under umbrellas on a clear day, and there is no sign of wind blowing. there is nothing in the picture which would likely cause itching.", "It is a clear and calm day. the people are on a beach.", "The people are sitting on a beach. the weather is clear, not raining or windy."], "image": "val2014/COCO_val2014_000000042260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446354, "question_id": "Uj9R4KnRzpp2VtunVp5MtZ", "question": "How is this food cooked?", "choices": ["baking", "sauteing", "boiling", "grilling"], "correct_choice_idx": 1, "direct_answers": ["in oil", "stir fry", "by heat", "sauteing", "browned", "sauteed", "fried", "stir fried", "stir fry", "boiled"], "difficult_direct_answer": true, "rationales": ["The other options aren't taking place in this pan, which is specifically used for this purpose.", "The food is in a frying pan and sauteing is the only listed method that uses a frying pan.", "These vegetables are in a sauce pan and it looks like it is going to be sauteed."], "image": "val2014/COCO_val2014_000000446354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528710, "question_id": "UjCTVYVK7iP984nfVkpSMA", "question": "Where is this group headed?", "choices": ["disney world", "down", "no where", "up"], "correct_choice_idx": 3, "direct_answers": ["top", "up mountain", "up mountain", "top mountain", "hilltop", "up", "mountaintop", "uphill", "mountaintop", "flying"], "difficult_direct_answer": false, "rationales": ["The ski lift is used to take people up so they can ski down.", "The group goes up.", "The people are skiing. people start skiing from the top of the mountain."], "image": "train2014/COCO_train2014_000000528710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221067, "question_id": "UjHoPHY4ZQEEzUese53REJ", "question": "What does the man's sandwich most resemble?", "choices": ["submarine", "cookie", "croissant", "bagel"], "correct_choice_idx": 3, "direct_answers": ["burger", "bagel", "doughnut", "bagel", "bagel", "life preserver", "burger", "bagel", "doughnut", "bagel"], "difficult_direct_answer": false, "rationales": ["The sandwich is in a ring shape.", "The man has a bagel.", "The sandwich is round with a hole in the center, just like a bagel."], "image": "val2014/COCO_val2014_000000221067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482187, "question_id": "UjHwSLT6EhXuEMZTbuuSU9", "question": "What are the people focusing at?", "choices": ["calculator", "remote control", "walkie talkie", "cellphone"], "correct_choice_idx": 3, "direct_answers": ["cigar", "phone", "cellphone", "phone", "phone", "phone", "phone", "cigar", "mans phone", "phone"], "difficult_direct_answer": false, "rationales": ["The object the men are regarding can be known by their eye-lines and the object is the size, shape and being held in the way that one would use answer a.", "The people are on the phone.", "The people are looking at a phone."], "image": "val2014/COCO_val2014_000000482187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423951, "question_id": "UjLDZxpochsE7rWHLQJ2tp", "question": "What site is the water shown in here?", "choices": ["stream", "fish lake", "harbor", "pond"], "correct_choice_idx": 2, "direct_answers": ["marina", "boat", "bay", "city", "river", "business area", "port", "inlet", "harbor", "building"], "difficult_direct_answer": true, "rationales": ["This is a harbor because you see ships accosted", "There is a bunch of boats and it's near a city so it is most likely a harbor.", "The boats are docked."], "image": "val2014/COCO_val2014_000000423951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158130, "question_id": "UjLEoR9boN74uUFmiL8Cp6", "question": "What do those riding this vessel use to do their jobs?", "choices": ["air", "milk", "water", "singing"], "correct_choice_idx": 2, "direct_answers": ["fire coat", "water", "firehouse", "water", "water hoses", "firefighters", "firefighting", "fire hose", "firefighters", "firehouses"], "difficult_direct_answer": false, "rationales": ["The firetruck uses water.", "The large truck carries water to put out fires.", "The automobile is a firetruck, used by fireman."], "image": "train2014/COCO_train2014_000000158130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381350, "question_id": "UjLurppA4QSqajSigmaYzg", "question": "What material are the two bears to the right of the desk clock made from?", "choices": ["glass", "plastic", "jade", "ceramic"], "correct_choice_idx": 2, "direct_answers": ["jade", "jade", "jade", "porcelain", "stone", "ceramic", "stone", "jade", "ceramic", "rock"], "difficult_direct_answer": false, "rationales": ["This was a common material used to make these types of collectibles historically. that said, c was also then eventually used to mimic a.", "The greenish color indicates jade.", "The two bears are green and look like they are made out of the gemstone jade."], "image": "train2014/COCO_train2014_000000381350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68657, "question_id": "UjQfykhDZJW9b7hQFsAJ7a", "question": "What are the blurry boxes in the background most likely to contain?", "choices": ["seeds", "action figures", "video games", "raisins"], "correct_choice_idx": 2, "direct_answers": ["video games", "dvds", "dvds", "movies", "dvds", "books", "movies", "games", "movies", "dvds"], "difficult_direct_answer": false, "rationales": ["The boxes have games.", "That's the size and shape of video game boxes.", "The boxes are for video games."], "image": "train2014/COCO_train2014_000000068657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12307, "question_id": "UjU6Dqa97pkt5BTQ6F4K3y", "question": "What is a behavior that is found in this animal species?", "choices": ["flying", "barking", "hibernating", "trilling"], "correct_choice_idx": 3, "direct_answers": ["scratching", "purring", "stealth", "cat", "curiosity", "stretching", "curiosity", "trilling", "catching mice", "curiosity"], "difficult_direct_answer": false, "rationales": ["They are cats and they purr", "Cats generally trill.", "Usually cats are trill when they're happy."], "image": "train2014/COCO_train2014_000000012307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255494, "question_id": "UjZoJKQmX6oVnAkNqmjn7N", "question": "What is their relationship?", "choices": ["coworkers", "siblings", "couple", "classmates"], "correct_choice_idx": 2, "direct_answers": ["couple", "significant other", "couple", "romantic", "romantic", "romantic", "romantic", "couple", "lovers", "lovers"], "difficult_direct_answer": false, "rationales": ["They are about to kiss", "The two are kissing each other on the motorcycle.", "The two people are about to kiss. siblings, coworkers, and classmates do not kiss each other."], "image": "train2014/COCO_train2014_000000255494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235721, "question_id": "Ujic7dz792K54bpQ56Zd27", "question": "What profession is shared by these people?", "choices": ["cooks", "boat captains", "pilots", "boaters"], "correct_choice_idx": 2, "direct_answers": ["pilots", "pilots", "pilots", "pilot", "pilots", "aviation", "fireman", "pilots", "pilots", "pilots"], "difficult_direct_answer": false, "rationales": ["Pilot uniforms have wings on the left breast pocket.", "You can tell by their uniforms and hats with the wings on them that they are pilots.", "The people work on planes."], "image": "train2014/COCO_train2014_000000235721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547509, "question_id": "Ujm9ZzEe8BzS23iFnHC7mj", "question": "What game are these two kids going to play?", "choices": ["pickle ball", "softball", "racquet ball", "volleyball"], "correct_choice_idx": 1, "direct_answers": ["softball", "softball", "softball", "softball", "softball", "baseball", "softball", "t-ball", "softball", "softball"], "difficult_direct_answer": false, "rationales": ["The two girls are wearing baseball caps and there is a mitt between them.", "Two girls are sitting on a bench in matching shirts and hats and there is a glove between them.", "The partial writing on their shirts tells the sport they are likey to engage in as well as the visible equipment."], "image": "train2014/COCO_train2014_000000547509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463334, "question_id": "UjqKfV8QCyAFg5i45PoDJ7", "question": "What are the two people located in?", "choices": ["home", "train", "library", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "diner", "restaurant", "restaurant", "restaurant", "diner", "restaurant", "booth", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["The two people are sitting in a restaurant booth.", "The two people are at a restaurant table.", "The decor and seating arrangement visible in the image is consistent with answer a."], "image": "train2014/COCO_train2014_000000463334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79333, "question_id": "Uk7ky5ZTAWQo8Xwwb8THXt", "question": "What is happening to the woman in white?", "choices": ["getting soaked", "getting hit", "getting hot", "getting sunburned"], "correct_choice_idx": 0, "direct_answers": ["getting wet", "rained on", "getting wet", "getting wet", "getting wet", "getting soaked", "biking", "getting soaked", "riding bike", "getting soaked"], "difficult_direct_answer": false, "rationales": ["It is dark and raining out. she, unlike the other people, is not carrying an umbrella.", "The woman in white is getting soaked on her bike.", "There is rain occurring based on the wetness visible and the people holding umbrellas. the women in white is on a bicycle and exposed to the elements and would thus be answer a."], "image": "train2014/COCO_train2014_000000079333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101530, "question_id": "UkAuWjXyc92cH3rpFM2TvE", "question": "Where are these people located?", "choices": ["residence", "museum", "hospital", "office"], "correct_choice_idx": 0, "direct_answers": ["den", "home", "living room", "living room", "indoors", "in home", "smiling", "living room", "residence", "livingroom"], "difficult_direct_answer": false, "rationales": ["The people are playing the game at home.", "These people are in someone's living room.", "It's the most likely option given the other ones don't usually contain a full living room"], "image": "train2014/COCO_train2014_000000101530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364455, "question_id": "UkBa5vnMPM5ajHvfzAuoRT", "question": "Fermentation of grains fruits or other sources of sugar produces what?", "choices": ["citric acid", "juices", "alcoholic beverages", "hcl"], "correct_choice_idx": 2, "direct_answers": ["alcohol", "alcoholic beverages", "alcohol", "wine", "wine", "alcohol", "alcohol", "alcohol", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["When grains are fermented, alcohol is produced.", "When the fruits fermat they becomes the alcohol.", "The question is not related to the image, but the answer is commonly known."], "image": "train2014/COCO_train2014_000000364455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198959, "question_id": "UkS6ZCRvNN3AdX6ZA8zWoT", "question": "Why is she holding the bat in front of her?", "choices": ["hit catcher", "exercising", "is angry", "hit ball"], "correct_choice_idx": 3, "direct_answers": ["hit ball", "hitting", "hit ball", "hit ball", "hitting ball", "hit ball", "hit something", "hit ball", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["Her body is making forward motions as if to hit something.", "The girl is moving the bat toward the ball in front of her.", "It is a game of softball where you use a bat to make the ball go in the other direction really far."], "image": "val2014/COCO_val2014_000000198959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456554, "question_id": "UkwB87n67HMBnLq6J8u7S7", "question": "What kind of knife is the woman using to cut the knife?", "choices": ["cleaver", "serrated", "filleting knife", "peeling knife"], "correct_choice_idx": 1, "direct_answers": ["serrated", "serrated", "wedding", "serrated knife", "cake", "sharp", "serrated", "bread knife", "serrated", "cake"], "difficult_direct_answer": false, "rationales": ["This is the most likely option given the setting.", "The knife has a saw like edge for cutting.", "There are grooves on the knife."], "image": "train2014/COCO_train2014_000000456554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87429, "question_id": "UmPmhqTAaNaXittppet9Zt", "question": "What country are these cars manufactured in?", "choices": ["poland", "japan", "usa", "germany"], "correct_choice_idx": 3, "direct_answers": ["germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["The logos on the backs of the cars are for audi and bmw. audi and bmw are german companies.", "The car brand with four interlocking circles is audi. audi is a german car manufacturer.", "These cars are made in germany since they're benz cars."], "image": "val2014/COCO_val2014_000000087429.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53825, "question_id": "UmXs962snGwLEFY4e5EsCw", "question": "What is the baseball most likely to hit next?", "choices": ["pitcher", "wall", "audience", "baseball bat"], "correct_choice_idx": 3, "direct_answers": ["bat", "outfield", "bat", "baseball bat", "baseball", "bat", "bat", "bat", "ball", "bat"], "difficult_direct_answer": false, "rationales": ["The position of the pitcher shows he just released the ball towards the batter.", "A ball is in the air between the pitcher and batter.", "The ball is on it's way to the batter."], "image": "train2014/COCO_train2014_000000053825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396681, "question_id": "UmZGbC2vGGTx3kMrj6ZWgc", "question": "What are in the bottles on the right?", "choices": ["wine", "water", "gin", "beer"], "correct_choice_idx": 1, "direct_answers": ["water", "water bottles", "water bottles", "water", "water", "water", "water", "water", "water", "water bottles"], "difficult_direct_answer": false, "rationales": ["The bottles have water.", "The bottles on the right hold water.", "Water bottles are shown."], "image": "train2014/COCO_train2014_000000396681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19789, "question_id": "UnC3mduzRgRYF9oMioAVzv", "question": "What action will the man take next?", "choices": ["run", "swing", "sit", "dunk"], "correct_choice_idx": 1, "direct_answers": ["hit ball", "hit ball", "hit ball", "hit ball", "hit ball", "hit ball", "hit ball", "swing", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The tennis player will swing at the ball.", "His arm is holding the tennis racket back. the ball is heading towards him while he looks at it.", "The man will swing."], "image": "train2014/COCO_train2014_000000019789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445269, "question_id": "UnGHHxBextU4tWYu6tUnEz", "question": "What is in the tube behind the person's ears?", "choices": ["water", "nitrous oxide", "poison", "oxygen"], "correct_choice_idx": 3, "direct_answers": ["air", "oxygen", "breathing tube", "oxygen", "oxygen", "oxygen tube", "oxygen", "oxygen", "oxygen", "oxygen"], "difficult_direct_answer": false, "rationales": ["People sometimes need help breathing through their noses.", "Invisible air is going into nose. helps with breathing problems.", "The person needs help breathing."], "image": "train2014/COCO_train2014_000000445269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157726, "question_id": "UnqVuqZUHi6XkmYkjXoj8e", "question": "What state is this location?", "choices": ["nevada", "california", "maine", "ohio"], "correct_choice_idx": 1, "direct_answers": ["california", "california", "california", "california", "california", "california", "california", "california", "california", "california"], "difficult_direct_answer": false, "rationales": ["The state is california.", "This location is in san jose of california.", "These are cities in that state"], "image": "val2014/COCO_val2014_000000157726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119608, "question_id": "Uo9NjPRJffP2rF4Nj4dfTd", "question": "What type of phone is not included in the collection of phones?", "choices": ["smart phone", "conventional phone", "cell phone", "flip phone"], "correct_choice_idx": 0, "direct_answers": ["iphone", "smart phone", "iphone", "iphone", "landline", "smart phone", "smartphone", "flip", "smart phone", "home phone"], "difficult_direct_answer": false, "rationales": ["There are no smart phones in the photo.", "These are examples of older generation phones prior, or so it would appear, to full touch screen capability.", "There is not a landline phone."], "image": "train2014/COCO_train2014_000000119608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8483, "question_id": "UoJGTc44tNLPxGLPeXfWCK", "question": "What is the company 2M?", "choices": ["it company", "manufacturing company", "biotechnology company", "news broadcaster"], "correct_choice_idx": 3, "direct_answers": ["paper", "financial", "turf wholesale", "newspaper", "news broadcaster", "oil company", "media", "water products", "sponsor", "no idea"], "difficult_direct_answer": true, "rationales": ["2m is a tv channel in morocco.", "The company 2m is a major news broadcaster.", "The company is for broadcasting."], "image": "val2014/COCO_val2014_000000008483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103954, "question_id": "UoTkJiwU7Nm3nks7MYsfu8", "question": "Where is this vehicle able to drive?", "choices": ["sky", "rail", "water", "street"], "correct_choice_idx": 1, "direct_answers": ["on tracks", "rails", "trolley", "on tracks", "rail", "road", "on tracks", "rail", "tracks", "trolley tracks"], "difficult_direct_answer": false, "rationales": ["The trolley is on its own railroad.", "The vehicle is a railcar.", "The vehicle is a trolley. it cannot travel on regular roads, float on water, or fly in the sky."], "image": "train2014/COCO_train2014_000000103954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228175, "question_id": "UoUkU3LF8J5KCqbgGNw8Jj", "question": "What is the man doing on the bench?", "choices": ["sitting", "jumping jacks", "board slide", "grabbing"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "skateboarding", "board slide", "skateboarding", "skating", "skateboarding", "skateboarding trick", "skateboard moves", "grinding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The man is sliding.", "The man is in a skateboard and doing a board slide.", "The man is doing a board slide at the corner of the bench."], "image": "train2014/COCO_train2014_000000228175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262162, "question_id": "UokKe5vqz6NV4gVbcu4P4t", "question": "What type of printing technology does the printer next to the waste bin utilize?", "choices": ["laser", "dye sublimation", "inkjet", "thermal"], "correct_choice_idx": 0, "direct_answers": ["dot matrix", "ink", "ink", "surface", "photocopying", "laser", "laser", "laser", "inkjet", "inkjet"], "difficult_direct_answer": false, "rationales": ["The type is a laser.", "This looks like a laser printer and has the design of common laser printers.", "The printer is a home printer."], "image": "val2014/COCO_val2014_000000262162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452558, "question_id": "Up5TsCeDZQYmrRyG8skcXA", "question": "In addition to pizza what is very likely to be available here?", "choices": ["soup", "pasta", "salad", "fish"], "correct_choice_idx": 1, "direct_answers": ["salad", "pasta", "pasta", "pasta", "salad", "pasta", "pasta", "pasta", "salad", "salad"], "difficult_direct_answer": false, "rationales": ["They serve pasta.", "Usually places have pizza and pasta that they will serve.", "The sign says \"pizza and pasta\"."], "image": "val2014/COCO_val2014_000000452558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334609, "question_id": "UpXryJoKchFzLjUBhEm3DU", "question": "What is the woman using the white object in her right hand to do?", "choices": ["smoke", "brush teeth", "talk", "eat"], "correct_choice_idx": 0, "direct_answers": ["navigate", "make call", "smoke", "walking", "smoke", "find location", "transport", "smoke", "smoke", "smoke"], "difficult_direct_answer": false, "rationales": ["The white object is a cigarette, not a toothbrush, food item, or cell phone.", "The woman has a cigarette.", "The woman is smoking."], "image": "val2014/COCO_val2014_000000334609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177538, "question_id": "UpopDffgVtusTj8JxMeBve", "question": "In a game or rock paper scissors which items beats what the woman has in her hands?", "choices": ["paper", "rock", "spoon", "scissors"], "correct_choice_idx": 1, "direct_answers": ["rock", "paper", "rock", "rock", "rock", "rock", "rock", "scissors", "rock", "rock"], "difficult_direct_answer": false, "rationales": ["She is holding a scissors and rock crushes scissors.", "The woman is holding scissors. scissors cut paper, spoons are not part of the game, and scissors can't beat themselves.", "It smashes the scissors"], "image": "train2014/COCO_train2014_000000177538.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34508, "question_id": "Uq3fxyYJN9HvFrB3VkjpaZ", "question": "This event is most likely to take place where?", "choices": ["cameroon", "rwanda", "egypt", "siberia"], "correct_choice_idx": 3, "direct_answers": ["mountains", "canada", "siberia", "skiing", "mountain", "mountain", "mountain", "mountain", "ski slope", "mountains"], "difficult_direct_answer": false, "rationales": ["The skiing is in siberia since there is snow.", "The event is in siberia.", "Siberia has snow."], "image": "val2014/COCO_val2014_000000034508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525695, "question_id": "Uq42bSP3EqXZxCaxnMDsu3", "question": "What is the brown stuff on the board?", "choices": ["dirt", "oil", "sand", "water"], "correct_choice_idx": 2, "direct_answers": ["dirt", "dirt", "dirt", "dirt", "dirt", "dirt", "sand", "dirt", "dirt", "dirt"], "difficult_direct_answer": false, "rationales": ["The stuff is sand.", "There is a lot of dirt and sand on the board.", "Surfboards are used at the beach. beaches have sand."], "image": "train2014/COCO_train2014_000000525695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166424, "question_id": "Uq7cMeAyPUrajdRuZQ9rcf", "question": "What are they doing?", "choices": ["resting", "waiting ski", "cleaning up", "eating dinner"], "correct_choice_idx": 1, "direct_answers": ["waiting ski", "skating", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["They are waiting for the skiers below them to finish.", "The people are waiting for their turn to go skiing.", "The ski hill seems crowded, so they are waiting until there is more room for movement."], "image": "val2014/COCO_val2014_000000166424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527086, "question_id": "UqE45AJX3ynwCe7zwQ4Evn", "question": "Where do these men enjoy their snack?", "choices": ["emt room", "morgue", "police station", "fire house"], "correct_choice_idx": 3, "direct_answers": ["break room", "firehouse", "meeting", "table", "table", "table", "fire house", "meeting", "meeting room", "break room"], "difficult_direct_answer": false, "rationales": ["They're in a firehouse.", "These men enjoy their snack at the fire department with its chief.", "The men are in police uniforms."], "image": "val2014/COCO_val2014_000000527086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304614, "question_id": "UqYkTtx9iLkZaLQELiwkNd", "question": "Where will these men go next?", "choices": ["up hill", "nowhere", "leftward", "down hill"], "correct_choice_idx": 3, "direct_answers": ["downhill", "down mountain", "down hill", "downhill", "down mountain/hill", "down hill", "down hill", "downhill", "down", "downhill"], "difficult_direct_answer": false, "rationales": ["The men will go down.", "The men are heading downhill since they're at the peak.", "He is on skis and on top of a hill and likely to go downhill next."], "image": "train2014/COCO_train2014_000000304614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459827, "question_id": "UqfCfnmsbgLgbpobEe7DJ5", "question": "What does the horse hold in it's mouth here?", "choices": ["leather", "hand", "bit", "acid"], "correct_choice_idx": 2, "direct_answers": ["bridle", "bit", "bit", "bridle", "bit", "bit", "bridle", "bit", "bit", "bit"], "difficult_direct_answer": false, "rationales": ["The horse has a bit.", "The horse has an part of the harness in its mouth to be directed by the person mounted on the horse.", "There is a metal bar in its mouth with rings on each end."], "image": "train2014/COCO_train2014_000000459827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88634, "question_id": "UqyrQDrRjWKETaAVVcd3CC", "question": "What are the appliances made of?", "choices": ["glass", "plastic", "steel", "wood"], "correct_choice_idx": 2, "direct_answers": ["metal", "stainless steel", "stainless steel", "stainless steel", "steel", "black surfaces", "stainless steel", "steel", "stainless steel", "stainless steel"], "difficult_direct_answer": false, "rationales": ["The appliances are silver and shiny.", "The appliances are stainless steel.", "The appliances are shiny. the material is metal, not plastic or wood."], "image": "val2014/COCO_val2014_000000088634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396460, "question_id": "UrBJpV2UxG4BHzPzxboRL5", "question": "What are the people queueing up for?", "choices": ["boarding bus", "climbing mountain", "entering museum", "boarding car"], "correct_choice_idx": 0, "direct_answers": ["bus", "bus ride", "boarding bus", "boarding", "boarding bus", "bus", "board bus", "bus", "sightseeing bus", "bus"], "difficult_direct_answer": false, "rationales": ["People are standing in line with at a bus stop.", "There is a large yellow transport vehicle waiting for them.", "The people are at a station. the bus is pulling up under the covered area."], "image": "train2014/COCO_train2014_000000396460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285470, "question_id": "Urczqh3fXocrxd2HKCyTqY", "question": "What type of bird is in the image?", "choices": ["hawk", "toucan", "finch", "parrot"], "correct_choice_idx": 2, "direct_answers": ["finch", "thirsty", "crow", "wild", "finch", "cuckoo", "finch", "large", "unsure", "falcon"], "difficult_direct_answer": false, "rationales": ["A finch is in the image since it's small and gray.", "A finch is drinking from the cup.", "The bird is identified by its bright yellow coloring on its head and grey coloring on the rest of its body."], "image": "train2014/COCO_train2014_000000285470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208815, "question_id": "Ure9HH8mmpxNBmq6uQURTH", "question": "What is on the table?", "choices": ["cookie", "egg", "baby", "purse"], "correct_choice_idx": 3, "direct_answers": ["display", "greeting card", "purse", "vase", "small gifts", "purse", "envelopes", "vases", "vases", "cards"], "difficult_direct_answer": false, "rationales": ["The handbag is one of the things on the table.", "There is a bag behind the glass objects. there are no humans or food items on the table.", "The table has a purse."], "image": "train2014/COCO_train2014_000000208815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191325, "question_id": "UskiG753g7LZnQeJuYom5w", "question": "What are the yellow barrels next to the road for?", "choices": ["safety", "construction tools", "parking designators", "speed designators"], "correct_choice_idx": 0, "direct_answers": ["crash", "safety", "crash cushion", "emergency stop", "barriers", "blocking traffic", "barrier", "barriers", "barricade", "safety"], "difficult_direct_answer": false, "rationales": ["The bright color alerts people to stay within the lane that they're confined to.", "The barrels are used to keep people out of certain places.", "The barrels are for safety."], "image": "train2014/COCO_train2014_000000191325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306681, "question_id": "Uso6HfmTADHe5sopX8hjG4", "question": "Why do they have umbrellas?", "choices": ["sleet", "snow", "hail", "rain"], "correct_choice_idx": 0, "direct_answers": ["snowfall", "snowing", "snow", "snow", "sleet", "cover rain", "sleet", "snowing", "its raining", "raining"], "difficult_direct_answer": false, "rationales": ["The people are protecting themselves from sleet.", "There is very wet and frozen precipitation falling", "There are pellets of rain containing snow."], "image": "train2014/COCO_train2014_000000306681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185634, "question_id": "UtEQE44UiyMHYAukcSFop2", "question": "What brand wristband the player worn?", "choices": ["nike", "adidas", "reebok", "puma"], "correct_choice_idx": 1, "direct_answers": ["adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas"], "difficult_direct_answer": false, "rationales": ["The brand is adidas.", "The man is wearing an adidas wristband.", "Their logo is on the band."], "image": "train2014/COCO_train2014_000000185634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341809, "question_id": "UtFXRzED7z28GxV3rnDpYz", "question": "Why are the flamingos looking in the water?", "choices": ["for bugs", "to bathe", "for rocks", "for fish"], "correct_choice_idx": 3, "direct_answers": ["food", "wanting fish", "eating", "for fish", "hungry", "food", "fishing", "fishing", "finding food", "getting food"], "difficult_direct_answer": false, "rationales": ["The flamingoes want to find food.", "The flamingos are looking around in the water for fish.", "Birds are standing in the shallow water with heads bent down towards the water."], "image": "train2014/COCO_train2014_000000341809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497261, "question_id": "UtMHaNr5xJZnPDdbg5CSq2", "question": "What role does Bank of America play to this game?", "choices": ["site provider", "loan provider", "sponsor", "fund provider"], "correct_choice_idx": 2, "direct_answers": ["sponsor", "sponsoring", "sponsor", "sponsor", "sponsor", "sponsor", "sponsor", "sponsor", "sponsor", "sponsor"], "difficult_direct_answer": false, "rationales": ["The bank has a sign on the field because they provided funding to make the game possible, and in return the poster serves as advertising.", "The sign advertises bank of america in a public setting. companies like bank of america often sponsor sports facilities for publicity for their products or services.", "The bank of america logo is visible on the playing field. logos visible on playing fields in this manner represent answer a."], "image": "train2014/COCO_train2014_000000497261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166477, "question_id": "UtXR8Np6ydadT4UuMRwvT7", "question": "What is the man with his hands in the air doing?", "choices": ["fighting", "directing traffic", "swimming", "tricks"], "correct_choice_idx": 3, "direct_answers": ["skateboarding", "skateboarding", "skateboarding", "tricks", "skateboarding", "skateboarding", "tricks", "skateboarding", "skating", "balancing"], "difficult_direct_answer": false, "rationales": ["The skateboard and man are in the air which is slightly scary.", "The man is trying to show off his moves and tricks on his skateboard.", "As indicated by the skateboard and skate park."], "image": "train2014/COCO_train2014_000000166477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221446, "question_id": "UtaqUoHFUkNTAkw4cwSZPC", "question": "What is a collective name given to the food options above?", "choices": ["fruits", "meat", "nuts", "veggies"], "correct_choice_idx": 3, "direct_answers": ["veggies", "vegetables", "bunch", "vegetables", "vegetables", "vegetables", "bunch", "vegetables", "vegetables", "vegetables"], "difficult_direct_answer": false, "rationales": ["All of these produce options are called vegetables.", "Carrots, broccoli and leeks are all plants that are used in savory cooking.", "The name is veggies."], "image": "train2014/COCO_train2014_000000221446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174188, "question_id": "UthLHG9BkjstEnUHVgtesa", "question": "What famous movie character could ride these floating devices?", "choices": ["ariel", "bugs bunny", "mary poppins", "tupac shakur"], "correct_choice_idx": 2, "direct_answers": ["mary poppins", "mary poppins", "mary poppins", "mary poppins", "mary poppins", "mary poppins", "mary poppins", "mary poppins", "mary poppins", "mary poppins"], "difficult_direct_answer": false, "rationales": ["Mary poppins floated into the air on one of these.", "The character is mary.", "Mary poppins rides umbrellas."], "image": "train2014/COCO_train2014_000000174188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173231, "question_id": "UtmtJCL79RJQaF97rNBz3X", "question": "Which company is known for making the object the person on the right has on their feet?", "choices": ["carhartt", "chanel", "gucci", "estee lauder"], "correct_choice_idx": 0, "direct_answers": ["muckboots", "wellington", "uggs", "muck boots", "carhartt", "ugg", "uggs", "boot company", "dry martens", "uggs"], "difficult_direct_answer": false, "rationales": ["Woman walk near train tracks and one is wearing boots.", "Carhartt is a company that makes apparel for outdoor use.", "Carhartt sells boots."], "image": "train2014/COCO_train2014_000000173231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548731, "question_id": "UtoP8VwRsTnbrh4AJc4DFj", "question": "Why are the cones orange in color?", "choices": ["camouflage", "design", "visibility", "appealing color"], "correct_choice_idx": 2, "direct_answers": ["safety", "visibility", "stand out", "visibility", "see easily", "visibility", "caution", "traffic barrels", "visibility", "visibility"], "difficult_direct_answer": false, "rationales": ["The cones are for visibility.", "The orange cones are directing traffic.", "The cones are a bright color so that motor vehicle operators can easily spot them even in the dark."], "image": "train2014/COCO_train2014_000000548731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411669, "question_id": "UuJkW28igJTjFemXPx7tk2", "question": "What shot is this female player making?", "choices": ["serve", "forehand", "lob", "backhand"], "correct_choice_idx": 1, "direct_answers": ["backhand", "forehand", "forehand", "return shot", "lob", "forehand", "underhand", "serve", "forehand", "underhand"], "difficult_direct_answer": false, "rationales": ["The woman is trying to hit with a forehand.", "The player is in the position of holding the front of their hand and wrist out.", "The player is dipping their arm to hit the ball."], "image": "train2014/COCO_train2014_000000411669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271488, "question_id": "UudLnCKgMZmLfUgZRVKtjs", "question": "What is the shape of the dark inserts on the backsplash?", "choices": ["oval", "triangle", "diamond", "square"], "correct_choice_idx": 2, "direct_answers": ["diamond", "diamond", "diamonds", "rhombus", "rhombus", "diamond", "diamond", "diamond", "diamond", "diamond"], "difficult_direct_answer": false, "rationales": ["The visible inserts have four equal straight sides and points on top and bottom. this shape is described as a diamond.", "Those are diamond shaped and used to give a design to the backsplash.", "The shapes are squares turned on a point."], "image": "train2014/COCO_train2014_000000271488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62337, "question_id": "UuetTJNZuDGLeFvWwMDcMa", "question": "What type of animals are present?", "choices": ["deer", "dog", "horse", "giraffe"], "correct_choice_idx": 2, "direct_answers": ["horses", "horses", "horse", "horses", "horses", "horses", "horses", "horses", "horses", "horses"], "difficult_direct_answer": false, "rationales": ["Horses are in the field.", "The horses are grazing in the pasture. they are large and healthy.", "Horses are present."], "image": "train2014/COCO_train2014_000000062337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365419, "question_id": "Uuoa9kE2UKB92WfDAoVois", "question": "A few people in the stands are wearing what?", "choices": ["clown noses", "sunglasses", "raincoats", "rabbit ears"], "correct_choice_idx": 1, "direct_answers": ["sunglasses", "clothes", "sunglasses", "yellow shirts", "football", "sunglasses", "yellow t-shirt", "jerseys", "yellow shirts", "sunglasses"], "difficult_direct_answer": false, "rationales": ["It is clear and dry, so they are not wearing raincoats. their noses and ears are not covered.", "It is a sunny day and people need to wear eye protection so they can see the game. people wear sunglasses to shade their eyes from the sun.", "It is not raining. some people in the stands have covered eyes."], "image": "train2014/COCO_train2014_000000365419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353836, "question_id": "UvD4chjDFgDbuE5qxBTqFr", "question": "Where could someone put their garbage?", "choices": ["cab", "kitchen", "rubbish bin", "forest ground"], "correct_choice_idx": 2, "direct_answers": ["trash bin", "trash can", "trash can", "trash can", "rubbish bin", "trash", "trash can", "waste basket", "bin", "trash cans"], "difficult_direct_answer": false, "rationales": ["People could put their garbage inside of the garbage bin on the left.", "There is a garbage bin right next to the car.", "There is a can with a bag in it next to the truck"], "image": "val2014/COCO_val2014_000000353836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431808, "question_id": "UvD8GaGcLVegLFEWyaikNr", "question": "What tree produced the uniquely green fruit seen here?", "choices": ["oak", "tomato", "avocado", "pine"], "correct_choice_idx": 2, "direct_answers": ["avocado", "avocado", "avocado", "avocado", "avocado", "avocado", "avocado", "avocado", "avocado", "avocado"], "difficult_direct_answer": false, "rationales": ["This is a fatty savory fruit", "They are light green and typically served as a side dish.", "There are slices of avocado on the sandwich."], "image": "train2014/COCO_train2014_000000431808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22656, "question_id": "UvUFgWEQqNoHq8s4CKjGP7", "question": "A man that uses the items on the counter is referred to as what?", "choices": ["metrosexual", "septuagenarian", "heathen", "alpha male"], "correct_choice_idx": 0, "direct_answers": ["feminine", "milenial", "clean", "stylish", "makeup", "man", "metrosexual", "groomed", "metrosexual", "cool"], "difficult_direct_answer": true, "rationales": ["A man is standing in front of a mirror with a lot of products on the counter.", "It's more of a woman's product and men who use items like this can sometimes be called a metrosexual.", "The man cares about his appearances a lot."], "image": "val2014/COCO_val2014_000000022656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400075, "question_id": "UvVr9NnJb9C9nRNJCLsx4K", "question": "If he is taking a video what kind of quality will it be?", "choices": ["60fps", "4k", "mediocre", "1080p"], "correct_choice_idx": 2, "direct_answers": ["low", "mediocre", "bad", "decent quality", "low", "poor quality", "poor", "bad", "poor", "mediocre"], "difficult_direct_answer": false, "rationales": ["The person is recording with an old flip phone.", "This is a flip phone; it's probably old and the camera isn't that great.", "He's taking a bad video."], "image": "train2014/COCO_train2014_000000400075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561004, "question_id": "UvaKJMQvv7bovSy6dEKfSW", "question": "What must one do to the original meat item to get it in this form?", "choices": ["julienne", "shred", "mince", "slice"], "correct_choice_idx": 3, "direct_answers": ["bake it", "slice", "cure", "dry it", "slice", "dry", "slice them", "bake it", "cook it", "slice"], "difficult_direct_answer": false, "rationales": ["They will slice the pepperoni so they can put it all on the pizza.", "The pizza needs slicing.", "Pepperoni is usually in sausage form and must be cut into thin flat even parts."], "image": "val2014/COCO_val2014_000000561004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423498, "question_id": "UvbWCTbcSppF989pBDmtv9", "question": "What is the man wearing red shirt doing?", "choices": ["losing balance", "falling", "stopping", "posing"], "correct_choice_idx": 3, "direct_answers": ["skateboarding", "skateboarding", "skateboarding", "skateboarding", "leaning", "posing", "skateboarding", "skateboarding", "skateboarding", "riding skateboard"], "difficult_direct_answer": false, "rationales": ["The man is posing.", "The man in the red t-shirt is posing on the road.", "The man is riding a skateboard and leaning too far to one side. there is no reason to lean that far unless one is posing and having fun."], "image": "val2014/COCO_val2014_000000423498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94179, "question_id": "UvkqbzdDjs2mo8gD3tVwcn", "question": "What do the men's uniforms typically represent?", "choices": ["sports", "school", "graduation", "work"], "correct_choice_idx": 1, "direct_answers": ["students", "school", "lads members", "business", "school", "mormon missionaries", "jehovah witness", "school uniforms", "missionary", "school"], "difficult_direct_answer": false, "rationales": ["Higher end schools require students to wear uniforms.", "In many countries, uniforms are standard in the educational system.", "Uniforms are worn in school."], "image": "train2014/COCO_train2014_000000094179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295441, "question_id": "UvqPJb9i8cjvcwuuaDEuAX", "question": "What period of the day is it?", "choices": ["afternoon", "evening", "night", "morning"], "correct_choice_idx": 0, "direct_answers": ["afternoon", "afternoon", "afternoon", "afternoon", "afternoon", "afternoon", "afternoon", "afternoon", "afternoon", "afternoon"], "difficult_direct_answer": false, "rationales": ["The period is the afternoon.", "Shadows are being cast which means it's the afternoon.", "The clock says 2:58 and it would be dark if it were morning."], "image": "val2014/COCO_val2014_000000295441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350468, "question_id": "UvqoQU8hY7ywtY4hTe9MSo", "question": "The shiny bottle with white label was used to serve what?", "choices": ["beer", "cocktails", "aperitif", "wine"], "correct_choice_idx": 3, "direct_answers": ["wine", "wine", "cook", "beer", "whine", "wine", "wine", "ketchup", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["The stemmed bottle is holding a dark red liquid.", "This is the most likely answer given that this beverage is often service in tall bottles that are dark. that said, perspective may come into play and it might be b.", "The shiny bottle holds alcohol and it's a wine bottle."], "image": "train2014/COCO_train2014_000000350468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73467, "question_id": "Uw3nm4naQkpmnUwioAvQJe", "question": "What is the black/red item with the white cap?", "choices": ["glue", "lip gloss", "battery", "oil"], "correct_choice_idx": 0, "direct_answers": ["glue", "glue stick", "glue stick", "glue", "chapstick", "chapstick", "chapstick", "chapstick", "glue", "compressed air"], "difficult_direct_answer": false, "rationales": ["Glue comes in many forms like white and super. the glue pictured is called a 'glue stick'.", "The item is glue.", "There is a glue stick on the desk."], "image": "val2014/COCO_val2014_000000073467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328818, "question_id": "UwH4bW9b9k6RmgpDzVUCrv", "question": "What is the woman doing to her sneaker?", "choices": ["changing", "removing pebble", "cleaning", "tying laces"], "correct_choice_idx": 3, "direct_answers": ["tying", "tying it", "tying laces", "tying laces", "tying", "tying laces", "tying laces", "tying laces", "tying it", "tying shoelace"], "difficult_direct_answer": false, "rationales": ["A woman is stopped at a bench with her foot resting on the bench while she is bent over it.", "The woman is holding on to the shoelaces.", "People often put their leg up to reach the laces and tie them."], "image": "val2014/COCO_val2014_000000328818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461715, "question_id": "UwQT4NvCe4DB52CJMwswHg", "question": "What items on the table could feed the tree in the plant pot?", "choices": ["icing", "donut", "water", "human"], "correct_choice_idx": 2, "direct_answers": ["water", "chair", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["Plants take water.", "Trees are not carnivores. they cannot directly consume sugary foods.", "Actually, b could also because of the sugar."], "image": "val2014/COCO_val2014_000000461715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273503, "question_id": "UwWCyZ9CkRZSDBo2dsCKsQ", "question": "What color is the motorcycle helmet on top of the red headed woman?", "choices": ["gray", "white", "black", "green"], "correct_choice_idx": 0, "direct_answers": ["silver", "gray", "silver", "gray", "silver", "gray", "silver", "silver", "gray", "silver"], "difficult_direct_answer": false, "rationales": ["The motorcycle helmet is silver.", "A woman is on a motorcycle in a helmet that matches her silver motorcycle.", "The motorcycle helmet is gray."], "image": "train2014/COCO_train2014_000000273503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254925, "question_id": "UwxWDENrVFTtcmjb4KP2EG", "question": "The person's head here is in which position?", "choices": ["sleeping", "inside out", "upside down", "rightside up"], "correct_choice_idx": 2, "direct_answers": ["downward", "alpine", "upside down", "upside down", "downward", "resting", "downward", "downward", "upside down", "resting"], "difficult_direct_answer": false, "rationales": ["The person's head isn't right side up.", "The person is upside down.", "His head is lower than his chest."], "image": "train2014/COCO_train2014_000000254925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6004, "question_id": "UxHybjBfSiwjPSkrQ5bp2U", "question": "Which direction is the large gray ship going?", "choices": ["south", "no where", "east", "north"], "correct_choice_idx": 1, "direct_answers": ["straight", "west", "forwards", "docked", "left", "left", "ashore", "no where", "left", "west side"], "difficult_direct_answer": false, "rationales": ["The large gray ship is anchored. it is stationary.", "It's easy to see that it's docked.", "The gray ship is going no where since it's anchored."], "image": "train2014/COCO_train2014_000000006004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48685, "question_id": "UxTY7f9LkMho3YtUCpHxf8", "question": "What does this player practice?", "choices": ["ball carrying", "serving", "pitching", "returning"], "correct_choice_idx": 1, "direct_answers": ["tennis", "serving", "tennis", "serving", "tennis", "tennis", "tennis", "hitting", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["The person is on a tennis court. the person is reaching high up in the air to hit a tennis ball with his racket. serving in tennis involves tossing the ball up high and hitting it over the net.", "The player is serving.", "The player is reaching up to hit the ball."], "image": "train2014/COCO_train2014_000000048685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448439, "question_id": "Uy7LkhzzRMXcBhEwT42kUd", "question": "Where are the three people seated?", "choices": ["taxi cab", "airplane", "subway", "uber"], "correct_choice_idx": 2, "direct_answers": ["bus", "subway", "subway car", "subway", "train", "bus", "train", "sofa", "bus", "train"], "difficult_direct_answer": false, "rationales": ["It's a resting spot while the vehicle is moving", "The people are on a bus.", "This type of seating arrangement and style is commonly used on the subway."], "image": "train2014/COCO_train2014_000000448439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369362, "question_id": "Uy9zdM7zXGwUk8CbXPznvS", "question": "What do giraffes have that no other animals have?", "choices": ["hooves", "black tongue", "spots", "ossicones"], "correct_choice_idx": 3, "direct_answers": ["ossicones", "long neck", "long neck", "long neck", "long neck", "long neck", "ossicone", "long neck", "long necks", "black tongues"], "difficult_direct_answer": false, "rationales": ["The horns on giraffes are covered by fur.", "They have horns.", "Those things protruding out from the top of their heads are like their own version of \"antlers\" for their species."], "image": "train2014/COCO_train2014_000000369362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23392, "question_id": "UyFnFJ9fXQsEsWaeWdijyb", "question": "What is likely to come by at any moment?", "choices": ["train", "bike", "boat", "duck"], "correct_choice_idx": 0, "direct_answers": ["train", "bus", "train", "train", "bus", "bus", "train", "train", "rain", "bus"], "difficult_direct_answer": false, "rationales": ["There is a train likely to come by at any minute.", "The other options don't make sense in this setting. that said, b could come by and not stop.", "The woman is sitting near the train tracks."], "image": "train2014/COCO_train2014_000000023392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155774, "question_id": "UyjJTrKvzjH6f6ML63eVsc", "question": "What is the woman in the black dress holding in her hand?", "choices": ["book", "folder", "purse", "envelope"], "correct_choice_idx": 2, "direct_answers": ["clutch purse", "clutch", "purse", "clutch", "wallet", "purse", "purse", "wallet", "clutch", "purse"], "difficult_direct_answer": false, "rationales": ["It looks like a smaller purse and has the design pattern that is fitting with purses.", "This is a bag with a zipper enclosure", "The object has a flashy pattern and is large enough to hold essentials."], "image": "val2014/COCO_val2014_000000155774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448690, "question_id": "UysFfb6NZLAT8tVfzBSGgs", "question": "Who owns this plane?", "choices": ["us military", "chinese", "fed x", "spanish"], "correct_choice_idx": 0, "direct_answers": ["u.s.a", "us military", "usa", "us airforce", "air force", "arc", "america", "america", "military", "air force"], "difficult_direct_answer": false, "rationales": ["There is the us flag on the tail, and military jets are usually non white.", "The us owns the plane.", "The plane is a large military plane. the plane has an american flag on it."], "image": "val2014/COCO_val2014_000000448690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126925, "question_id": "UyvLeHMuGjcFLvHZ7k4UaD", "question": "This area is based on which author's works?", "choices": ["lewis carroll", "agatha christie", "mark twain", "stephen king"], "correct_choice_idx": 0, "direct_answers": ["lewis carrot", "lewis carrot", "lewis carrot", "lewis carroll", "lewis carroll", "lewis carroll", "lewis carroll", "lewis carroll", "lewis carroll", "lewis carroll"], "difficult_direct_answer": false, "rationales": ["Lewis carroll wrote through the looking glass.", "Alice through the looking glass features this clock", "Alice in wonderland is a lewis carroll story."], "image": "val2014/COCO_val2014_000000126925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279476, "question_id": "Uz33e4HT2vGNbtXjY5agZ6", "question": "What is the man doing to the bird?", "choices": ["injuring it", "feeding it", "capturing it", "hunting it"], "correct_choice_idx": 1, "direct_answers": ["feeding it", "feeding", "feeding it", "feeding", "feeding it", "feeding", "feeding it", "feeding", "feeding", "feeding it"], "difficult_direct_answer": false, "rationales": ["He is holding a piece of food out", "The man is holding out a piece of food for the bird to eat.", "The man is holding a piece of bread. the bird is using its beak to grab the bread."], "image": "train2014/COCO_train2014_000000279476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556516, "question_id": "Uz9VhrcQ2bLTpPY55WNkqW", "question": "Where could you stretch out and watch TV here?", "choices": ["bed", "no where", "couch", "kitchen chair"], "correct_choice_idx": 2, "direct_answers": ["sofa", "on couch", "sofa", "couch", "couch", "couch", "couch", "sofa", "sofa", "sofa"], "difficult_direct_answer": false, "rationales": ["The couch is available.", "There is a couch facing the tv where one could stretch out and watch.", "It is big enough to lay down on"], "image": "val2014/COCO_val2014_000000556516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176938, "question_id": "UzKmkpjfDymSPKv3MWqgz7", "question": "What brand is his shirt?", "choices": ["puma", "new balance", "adidas", "nike"], "correct_choice_idx": 2, "direct_answers": ["adidas", "adidas", "adidas", "can't see", "sports", "adidas", "nike", "adidas", "adidas", "not clear"], "difficult_direct_answer": false, "rationales": ["The logo for adidas is on the front of the shirt.", "There are three stripes on the arm of the shirt. the brand's logo is on the chest of the shirt.", "The company logo of the brand can be seen on the upper right side of his shirt."], "image": "train2014/COCO_train2014_000000176938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97705, "question_id": "Uzv7kSjpj3Y6RnmqaTS8rg", "question": "What is the little bird on the right side standing on?", "choices": ["plants", "dirt", "rocks", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "sand", "water", "water", "algae", "water", "sandbar", "water"], "difficult_direct_answer": false, "rationales": ["The little bird on the right is standing on the surface of the water.", "These birds are said to be able to stand on a. it could also be on b.", "You can only see the surface and not what is underneath"], "image": "train2014/COCO_train2014_000000097705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563584, "question_id": "UzvS5P4xzqkqe8kyHRHPKH", "question": "What subject are the students studying?", "choices": ["chemistry", "biology", "mathematics", "social studies"], "correct_choice_idx": 3, "direct_answers": ["geography", "geography", "social studies", "english", "geography", "geography", "computing", "computers", "computers", "geography"], "difficult_direct_answer": false, "rationales": ["There are maps on the boards.", "The subject is social studies.", "The students are all studying social sciences."], "image": "train2014/COCO_train2014_000000563584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281387, "question_id": "V2HUwpE3jGk485Z7os4oVN", "question": "What is the green object on top of the food?", "choices": ["to season", "to cook", "decoration", "to eat"], "correct_choice_idx": 2, "direct_answers": ["rosemary", "rosemary", "decoration", "spice", "decoration", "decoration", "rosemary", "mint", "herbs", "sprig rosemary"], "difficult_direct_answer": false, "rationales": ["The object is decoration.", "A bit of greenery is on top of a piece of meat plated in a fancy manner on a plate.", "Sometimes whole spices are used to make a dish look more interesting."], "image": "val2014/COCO_val2014_000000281387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545987, "question_id": "V2RcNM5kATD44jk5ku2czn", "question": "What is the man in the middle doing?", "choices": ["selling phone", "checking phone", "paying bill", "getting help"], "correct_choice_idx": 1, "direct_answers": ["watching cell", "using phone", "checking phone", "texting", "texting", "phone looking", "checking phone", "texting", "checking phone", "using phone"], "difficult_direct_answer": false, "rationales": ["The man is holding his cellphone.", "The man seems to look at phone for calls or messages as evident.", "He is checking his phone."], "image": "train2014/COCO_train2014_000000545987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413277, "question_id": "V2YVy5Jb8Eg5WswNt9b4ot", "question": "What is the man adjusting?", "choices": ["computer", "his pants", "fire hydrant", "his seat"], "correct_choice_idx": 2, "direct_answers": ["fire hydrant", "fire hydrant", "bolt", "hydrant", "fire hydrant", "fire hydrant", "hydrant", "fire hydrant", "fire hydrant", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["This is a firefighter doing maintenance on the valves", "The man wants to fix the fire hydrant on the sidewalk.", "The man is adjusting some bolts on a fire hydrant."], "image": "train2014/COCO_train2014_000000413277.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533553, "question_id": "V2hJmW7UgcRhrnih4MKd7N", "question": "For what reason are buildings here elevated high above ground?", "choices": ["flooding", "avoiding monsters", "pest control", "earthquakes"], "correct_choice_idx": 0, "direct_answers": ["prevent flooding", "flooding", "flood", "flood", "flood prevention", "flood", "prevent water", "floods", "flooding", "repairs"], "difficult_direct_answer": false, "rationales": ["There is a boat here, so it is probably near the shore and in a flood zone.", "In flood prone areas, houses are usually built on stilts for protection from the water.", "The boat suggests the building is near a water source."], "image": "val2014/COCO_val2014_000000533553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203509, "question_id": "V2ryNDHPNp8Cvv33vP3Ryh", "question": "What happen to the eyes of the man who is drinking?", "choices": ["wearing makeup", "blindness", "light reflection", "closed eyes"], "correct_choice_idx": 2, "direct_answers": ["open", "rolling", "light", "bad timing", "light reflection", "rolling", "nothing", "rolled up", "spin back", "blinking"], "difficult_direct_answer": true, "rationales": ["The lights and camera can change eyes in photos", "Light is reflecting off the eyes of the drinking man.", "The man's eyes are white."], "image": "val2014/COCO_val2014_000000203509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38693, "question_id": "V2xaqPR7Mi75ggGPsLUMQR", "question": "What sort of meat is going to be consumed here?", "choices": ["bird", "sausage", "egg", "fish"], "correct_choice_idx": 1, "direct_answers": ["hotdog", "sausage", "hotdog", "pork sausage", "pork", "sausage", "sausage", "sausage", "sausage", "sausage"], "difficult_direct_answer": false, "rationales": ["The item on the plate is a sausage on a bun.", "Even if we aren't certain what the meat is, it is obvious that this is not fish or egg and it certainly is not bird.", "The meat is sausage."], "image": "val2014/COCO_val2014_000000038693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551961, "question_id": "V3Be9NfuEKXb4yifg3JKHi", "question": "What style tennis is going to be played by these girls?", "choices": ["mixed doubles", "ladies doubles", "canadian doubles", "singles"], "correct_choice_idx": 1, "direct_answers": ["doubles", "doubles", "doubles", "ladies doubles", "doubles", "doubles", "doubles", "doubles", "doubles", "doubles"], "difficult_direct_answer": false, "rationales": ["They are on the same side of the court", "There are two people.", "The ladies are competing in a set of two."], "image": "train2014/COCO_train2014_000000551961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482081, "question_id": "V3Pqk7s2FqyHTh8PFq2NiA", "question": "How many giraffes do you see in the picture above?", "choices": ["four", "one", "five", "none"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "four", "four", "four", "five", "four", "five", "five", "four"], "difficult_direct_answer": false, "rationales": ["There are four giraffes very clearly visible in the image but upon closer inspection, a fifth giraffe can been seen sticking their neck out and extending just their head into the image, which means the total number of giraffes that are visible is five.", "There are five animals shown.", "There are two groups of two"], "image": "val2014/COCO_val2014_000000482081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367259, "question_id": "V3WEMy9DcvreXvADNU7rRJ", "question": "What location are these children in?", "choices": ["classroom", "home", "mall", "diner"], "correct_choice_idx": 0, "direct_answers": ["school", "library", "school", "school", "school", "school", "classroom", "school", "school", "library"], "difficult_direct_answer": false, "rationales": ["The kids are in a class.", "The location is a classroom.", "There are multiple supplies around the room that's the just of what location the children are in the clock on the wall the decoration the desk and the collection of supplies in the background are used during school."], "image": "train2014/COCO_train2014_000000367259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190670, "question_id": "V3dpc2W4HesQtcMi4AqrKS", "question": "What can you obtain from the website advertised?", "choices": ["baseball bats", "sports drinks", "team hat", "hubcaps"], "correct_choice_idx": 2, "direct_answers": ["hats", "hats", "hats", "game", "hat", "team hat", "news", "hats", "hats", "news"], "difficult_direct_answer": false, "rationales": ["New era cap makes sports hats.", "It has cap in the name", "The website is neweracap.com. it sells headwear."], "image": "train2014/COCO_train2014_000000190670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15621, "question_id": "V3snHb2DWnrRjYtXehXECx", "question": "What is the condition outside?", "choices": ["raining", "snowing", "sunny", "overcast"], "correct_choice_idx": 2, "direct_answers": ["sunny", "clear", "sunny", "sunny", "sunny", "sunny", "sunny", "clear", "sunny", "clear"], "difficult_direct_answer": false, "rationales": ["There is light reflecting off of the train", "The conditions outside are sunny and midday.", "It is reflecting off the train"], "image": "train2014/COCO_train2014_000000015621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436759, "question_id": "V3twBGXy6uYhqx2zXeDuHE", "question": "What use is the train here mostly?", "choices": ["engineer training", "passenger business", "freight", "child's amusement"], "correct_choice_idx": 3, "direct_answers": ["children's ride", "fun", "train station", "transport", "fun", "fun", "child's amusement", "kid rides", "decorative", "children"], "difficult_direct_answer": false, "rationales": ["The train is tiny and used to give kids a ride for fun.", "The train is too small for adults but just large enough for small passengers.", "The children are using the train here for the purposes of amusement, mostly."], "image": "train2014/COCO_train2014_000000436759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119614, "question_id": "V48WCgG6WYUHhtJPb3JGob", "question": "What kind of cat is this?", "choices": ["persian", "orange tabby", "bengal", "maine coon"], "correct_choice_idx": 1, "direct_answers": ["domestic shorthair", "orange tabby", "tabby", "tabby", "house", "domestic", "orange", "ginger", "tabby", "tabby"], "difficult_direct_answer": false, "rationales": ["The cat is orange.", "The cat is a tabby.", "The cat is orange with stripes."], "image": "train2014/COCO_train2014_000000119614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285260, "question_id": "V52kFsXhwH6D82A2GyMTG8", "question": "What's covering most of the people here?", "choices": ["shadows", "rain", "paint", "street lighting"], "correct_choice_idx": 0, "direct_answers": ["shade", "trees", "bike", "trees", "bikes", "standing", "shadows", "trees", "bikes", "leather"], "difficult_direct_answer": false, "rationales": ["The sky isn't cloudy, it is daytime, and the people are standing under and near tall trees that have branches that grow horizontally far from their trunks.", "There are a lot of people underneath the shadows.", "The other options aren't in this image and don't make sense with the setting or time of day."], "image": "train2014/COCO_train2014_000000285260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337354, "question_id": "V5BrKiDyLoBX5GfMPEzBRo", "question": "Where can they toast an item?", "choices": ["brown door", "white appliance", "blue screen", "silver appliance"], "correct_choice_idx": 3, "direct_answers": ["toaster", "toaster oven", "silver appliance", "toaster oven", "toaster oven", "toaster oven", "toaster oven", "toaster oven", "toaster oven", "toaster oven"], "difficult_direct_answer": false, "rationales": ["The white appliance and blue screen are a microwave and a laptop, respectively. the brown door also cannot be used to toast an item.", "The silver appliance is a toaster oven.", "The toaster can be used."], "image": "val2014/COCO_val2014_000000337354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138573, "question_id": "V5DNHsmZ6Roo495v6aXi7p", "question": "Where is this array of teddy bears being displayed?", "choices": ["car dealership", "medical office", "restaurant", "movie theater"], "correct_choice_idx": 2, "direct_answers": ["brick wall", "restaurant", "restaurant", "wall", "wall", "on wall", "brick wall", "restaurant", "restaurant", "kitchen"], "difficult_direct_answer": false, "rationales": ["There is a table with a salt shaker and sugar packs next to the bears.", "The array is a restaurant.", "There is a table beside the bears."], "image": "val2014/COCO_val2014_000000138573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6981, "question_id": "V5NmrR4T4mJjjV8nnmzuE2", "question": "What is the child doing with the object in his hand?", "choices": ["throwing it", "painting it", "squishing it", "eating it"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eating", "eating", "eating", "eating", "eating it", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["A young girl is smashing food in her mouth to digest.", "The child is eating a donut with his hadn.", "This child is putting the food in their mouth."], "image": "train2014/COCO_train2014_000000006981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390001, "question_id": "V5jghE82aqAERz5ZSNktBj", "question": "Why are there triangles on the road?", "choices": ["falling rocks", "pedestrian lane", "bike crossing", "one way"], "correct_choice_idx": 3, "direct_answers": ["front", "show direction", "signalling", "direction markers", "one way", "traffic sign", "crosswalk", "guide traffic", "yield", "directional arrows"], "difficult_direct_answer": true, "rationales": ["There are triangles in the road for a one way lane.", "There are white triangles to signify this is a one way street on the road.", "The road only goes one way."], "image": "train2014/COCO_train2014_000000390001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425622, "question_id": "V5nC5RaoihAGv7R2j2PpDa", "question": "What is he hoping to score?", "choices": ["netball", "ace", "volley", "foul"], "correct_choice_idx": 1, "direct_answers": ["ace", "love", "point", "point", "points", "point", "point", "love", "ace", "points"], "difficult_direct_answer": false, "rationales": ["The man wants to win an ace in tennis.", "All tennis players hope to achieve an \"ace.\".", "He wants an ace."], "image": "train2014/COCO_train2014_000000425622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438575, "question_id": "V62CZ4SzFvHo9fDmYg9DnQ", "question": "Why does smoke come from front of train here?", "choices": ["fireworks", "coal power", "electrical discharge", "heating passengers"], "correct_choice_idx": 1, "direct_answers": ["exhaust", "for show", "coal power", "steam", "steam engine", "coal", "steam powered", "fuel", "steam", "firebox"], "difficult_direct_answer": true, "rationales": ["Traditionally old locomotives are fueled by burning fossil fuels.", "This train is a steam locomotive, which burns coal to heat steam to provide power to turn the wheels.", "This looks like an older model of train that would likely be using coal. smoke billowing out of a train occurs when heat from the engine is being projected up, so if coal was the power source, it would also be the source of the smoke."], "image": "train2014/COCO_train2014_000000438575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334234, "question_id": "V68yQ2BRvyprSPLFuHkV2E", "question": "What is lit up?", "choices": ["desk", "traffic lights", "bar", "tree"], "correct_choice_idx": 1, "direct_answers": ["traffic signs", "stoplight", "traffic lights", "traffic lights", "street lights", "traffic lights", "traffic lights", "traffic lights", "traffic lights", "light"], "difficult_direct_answer": false, "rationales": ["There are traffic lights that are lit up.", "The lights are lit.", "The traffic lights are lit since they're glowing."], "image": "train2014/COCO_train2014_000000334234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203466, "question_id": "V6Z3xMexQhB2WfLVWvCfmR", "question": "What does the player wipe away with his towel?", "choices": ["gel", "steroids", "paint", "sweat"], "correct_choice_idx": 3, "direct_answers": ["blood", "sweat", "sweat", "sweat", "sweat", "his fingers", "sweat", "sweat", "sweat", "tennis"], "difficult_direct_answer": false, "rationales": ["He needs his hands dry to keep a grip on the racket", "The player is wiping his wet hand.", "The player is wiping away the sweat from his hands with a white towel."], "image": "val2014/COCO_val2014_000000203466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47981, "question_id": "V78b8KqkubAgay3L6RjUnZ", "question": "Why is his foot raised behind him?", "choices": ["is kicking", "is angry", "is deformed", "is tired"], "correct_choice_idx": 0, "direct_answers": ["kicking", "to kick", "kicking ball", "to kick", "kicking ball", "kicking", "to kick", "kicking ball", "is kicking", "to kick"], "difficult_direct_answer": false, "rationales": ["He needs more force to get the ball across the field", "He has his foot raised behind him because he's kicking.", "The player has his leg bent in the exact way a soccer player does to kick a soccer ball."], "image": "train2014/COCO_train2014_000000047981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140180, "question_id": "V7925M7rq7pPXV8zsRqhZp", "question": "What likely powers these scooters?", "choices": ["electricity", "pentane", "propane", "hydro"], "correct_choice_idx": 0, "direct_answers": ["gas", "electricity", "gas", "gas", "gas", "battery", "gas", "electricity", "gasoline", "battery"], "difficult_direct_answer": false, "rationales": ["The scooters have an engine.", "The scooters do not have internal combustion engines. they instead are powered by batteries.", "The electricity powers them."], "image": "train2014/COCO_train2014_000000140180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397701, "question_id": "V7PZSvmpS59tFeoJgn3yKh", "question": "Who helps keep the person riding the motorcycle dry?", "choices": ["no one", "driver", "passenger", "police"], "correct_choice_idx": 2, "direct_answers": ["passanger", "passenger", "passenger", "passenger", "umbrella", "umbrella", "passenger", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["The driver has an umbrella overhead which is intended to keep one driver. the drivers hands are both visible and not holding the umbrella so it must be the passenger performing this service.", "There is a passenger on the bike.", "They would be the one holding the umbrella."], "image": "train2014/COCO_train2014_000000397701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397132, "question_id": "V7TCuAQHPdDfGchBdUhB35", "question": "What type weather is likely to cheer up most people we see here?", "choices": ["doldrums", "ice storm", "dead calm", "windy"], "correct_choice_idx": 3, "direct_answers": ["sunny", "windy", "sunny", "sunny", "windy", "warm", "windy", "windy", "warm", "sunny"], "difficult_direct_answer": false, "rationales": ["The people are flying kites.", "They are flying kites. doldrums, an ice storm, or a dead calm would not help with this activity.", "This type of weather is perfect for flying kites and necessary to get them off the ground."], "image": "train2014/COCO_train2014_000000397132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21650, "question_id": "V7TvcVdKmDUBSgRWPPC2E5", "question": "How do you know this is a residential area?", "choices": ["houses", "welcome mats", "signs", "apartment buildings"], "correct_choice_idx": 3, "direct_answers": ["houses", "homes", "houses", "houses", "houses", "houses", "no traffic", "residential buildings", "apartment buildings", "residential"], "difficult_direct_answer": false, "rationales": ["There are homes.", "There are lots of apartments in this residential area.", "The area has apartments."], "image": "train2014/COCO_train2014_000000021650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580414, "question_id": "V7g6etx7ufB5tkZuCgtkdN", "question": "What is the large triangular object in the distance?", "choices": ["sculpture", "cloud", "mountain", "mall"], "correct_choice_idx": 2, "direct_answers": ["mountain", "mountain", "building", "volcano", "logo", "mta fuji", "volcano", "mountain", "mountain", "mountain top"], "difficult_direct_answer": false, "rationales": ["A large mountain with a pointed peak can be seen behind buildings.", "There is a snow capped mountain.", "The mountains are in the back."], "image": "train2014/COCO_train2014_000000580414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204147, "question_id": "V7huS4K36arS8LRXKoHEhs", "question": "The person here focuses on what?", "choices": ["screen", "axe", "mirror", "door"], "correct_choice_idx": 0, "direct_answers": ["tv", "video game", "screen", "game", "game", "television", "tv screen", "screen", "television", "game"], "difficult_direct_answer": false, "rationales": ["He's playing a video game", "The person is playing a video game.", "He appears to be looking up at a monitor of some sort."], "image": "val2014/COCO_val2014_000000204147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520043, "question_id": "V7ki7xEbMU3AnBJ4fi82Dj", "question": "What should the drivers do in this situation?", "choices": ["hurry up", "be patient", "press horn", "call police"], "correct_choice_idx": 1, "direct_answers": ["call insurance", "remain calm", "stay calm", "be patient", "be patient", "zipper merge", "paint job", "honk", "be patient", "drive"], "difficult_direct_answer": false, "rationales": ["They need to wait until other cars move out of the way first", "The drivers should wait for the road to clear up.", "There is an apparent traffic jam based on the vehicles in the road and the available space. traffic jam situations are eventually cleared up if one is patient."], "image": "val2014/COCO_val2014_000000520043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454814, "question_id": "V7t3u75R2mKL8ZSqwMNpD5", "question": "What type of phone is being used?", "choices": ["pay", "cellular", "rotary", "landline"], "correct_choice_idx": 1, "direct_answers": ["smartphone", "mobile", "cellular", "smartphone", "mobile", "cellular", "mobile", "smartphone", "smartphone", "smartphone"], "difficult_direct_answer": false, "rationales": ["Cellular phones are being used here.", "The object is the size and shape of answer a and is being used outside of a home or building with no visible cord.", "The phone in the image is a modern day cell phone."], "image": "train2014/COCO_train2014_000000454814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507080, "question_id": "V7yFpYSbKHQEYArcpj9RYA", "question": "How do persons here dine?", "choices": ["indoors", "al fresco", "buffet style", "barnward"], "correct_choice_idx": 1, "direct_answers": ["with cutlery", "outside", "casually", "al fresco", "outdoors", "al fresco", "on porch", "alfresco", "outdoors", "outside"], "difficult_direct_answer": false, "rationales": ["People are seen dining outdoors, on a patio.", "The persons here all dine at al fresco.", "The people are outside. there is no buffet."], "image": "val2014/COCO_val2014_000000507080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345879, "question_id": "V8W4iC25bhLwdKbwbAMFbj", "question": "What is across from the water?", "choices": ["trees", "nature", "land", "sand"], "correct_choice_idx": 2, "direct_answers": ["boat", "house", "house", "dock", "house", "land", "house", "house", "house", "docks"], "difficult_direct_answer": false, "rationales": ["The picture shows water and a boat in the foreground, and land covered in grass in the background.", "There is grass across the water.", "There is greenery and buildings there"], "image": "train2014/COCO_train2014_000000345879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122281, "question_id": "V9aDaJj7ZrwRT2y7LEPMfV", "question": "How will most of these men get off the mountain they stand upon?", "choices": ["ski lift", "skis", "snow board", "sherpa"], "correct_choice_idx": 2, "direct_answers": ["glide down", "snowboarding", "skiing downhill", "ski", "snowboard", "snowboard", "snowboard down", "snow board", "ski down", "ski lift"], "difficult_direct_answer": true, "rationales": ["They have boards on them to use to go down.", "They are holding snowboards.", "Three men are posing with snowboards on the top of a mountain."], "image": "val2014/COCO_val2014_000000122281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90884, "question_id": "V9cGJP6Lte4k6itVmDPY9K", "question": "What is the orange object called?", "choices": ["bone", "hammer", "bat", "sickle"], "correct_choice_idx": 0, "direct_answers": ["femur", "bone", "bone", "bone", "bone", "bone", "bone", "bone", "dog born", "femur"], "difficult_direct_answer": false, "rationales": ["The object has the shape of a bone.", "The object is a bone.", "The man is holding an orange bone in his hand."], "image": "val2014/COCO_val2014_000000090884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19694, "question_id": "V9nZMBxSPzwecksq2BQVfq", "question": "Based on the discarded fixtures which part of the building is undergoing renovations?", "choices": ["garage", "kitchen", "office", "bathroom"], "correct_choice_idx": 3, "direct_answers": ["bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "restrooms", "bathroom", "restrooms", "bathroom", "restrooms"], "difficult_direct_answer": false, "rationales": ["These are toilets", "Those ceramic backs are part of a toilet. the toilet lid and seat area are visible from our view.", "There are toilets outside. these would not normally be found in a kitchen, office, or garage."], "image": "train2014/COCO_train2014_000000019694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555639, "question_id": "V9rohu5Dgghetze9N7UsPi", "question": "What game is being played here?", "choices": ["chess", "golf", "frisbee golf", "ultimate frisbee"], "correct_choice_idx": 3, "direct_answers": ["ultimate frisbee", "ultimate frisbee", "frisbee", "frisbee", "ultimate frisbee", "frisbee", "frisbee", "frisbee", "ultimate frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The man is getting ready to throw the frisbee.", "The people are playing ultimate frisbee.", "There are guys competing against each other in frisbee. and guarding each other."], "image": "train2014/COCO_train2014_000000555639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229067, "question_id": "VAG5u8zVhscx7ZPuDBFsF5", "question": "What is the likely relationship of the woman to the kids?", "choices": ["mother", "sister", "school principal", "great grandmother"], "correct_choice_idx": 0, "direct_answers": ["parent", "instructor", "mother", "mother", "teacher", "green jacket", "mother", "teacher", "mom", "mother"], "difficult_direct_answer": false, "rationales": ["The woman is likely the kids' mom.", "Nuclear families often ski together and very often parents ski with their children at the same time.", "The relationship is the mom."], "image": "val2014/COCO_val2014_000000229067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401320, "question_id": "VAartue7dJbinH7DAD34XG", "question": "What does this train carry?", "choices": ["cattle", "cars", "passengers", "cargo"], "correct_choice_idx": 3, "direct_answers": ["cargo", "cargo", "cargo", "cargo", "coal", "people", "cargo", "cargo", "carriage", "freight"], "difficult_direct_answer": false, "rationales": ["The train has cargo carts.", "The train is not a passenger train so it must carry cargo. the cars are large vessels to hold cargo rather than cars with seats for passengers.", "The train has cargo."], "image": "val2014/COCO_val2014_000000401320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94350, "question_id": "VAoHAYWixju2J9kdLjKtSd", "question": "Where is the woman likely heading?", "choices": ["vacation", "work", "dinner", "school"], "correct_choice_idx": 0, "direct_answers": ["hotel", "to airport", "vacation", "away", "home", "hotel", "airport", "vacation", "airport", "airport"], "difficult_direct_answer": false, "rationales": ["The woman is holding a suitcase.", "She has bags packed for travel", "She has a suitcase so she probably is going on holiday"], "image": "train2014/COCO_train2014_000000094350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360899, "question_id": "VAy4D5MTdvTfGT77g9tM75", "question": "How did the child most likely get to the spot she stands?", "choices": ["bike", "motorcycle", "walked", "bus"], "correct_choice_idx": 3, "direct_answers": ["bus", "motorcycle", "walking", "walked", "bus", "motorcycle", "walked", "by bus", "walked", "walked"], "difficult_direct_answer": false, "rationales": ["The child probably took the private bus.", "He rode on the bus.", "They are in the city. the girl is too young to ride a bike."], "image": "val2014/COCO_val2014_000000360899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384036, "question_id": "VB9MkpqKCAFbvBcoKsnDoP", "question": "What are the devices on the lower wall called?", "choices": ["urinal", "soap dispenser", "toilet", "sink"], "correct_choice_idx": 0, "direct_answers": ["urinal", "urinals", "urinals", "urinals", "urinals", "urinal", "urinals", "urinals", "urinals", "urinals"], "difficult_direct_answer": false, "rationales": ["Men stand to urinate to use the restroom.", "These objects are attached to the wall to receive liquid, not distribute it.", "The device is the urinal."], "image": "val2014/COCO_val2014_000000384036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111933, "question_id": "VBEsXhSQS3mTmAJSMmvN4r", "question": "What is this place?", "choices": ["waste disposal", "picnic", "outdoor restaurant", "farmers market"], "correct_choice_idx": 2, "direct_answers": ["pizzeria", "restaurant", "restaurant", "restaurant", "restaurant", "cafe", "restaurant", "outdoor restaurant", "restaurant", "hotel"], "difficult_direct_answer": false, "rationales": ["The place is serving food outside.", "You can see the sky so they are outside.", "The place is a restaurant."], "image": "train2014/COCO_train2014_000000111933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358608, "question_id": "VBPTbe6tP3Uyhd996pPjsA", "question": "What is the man doing in the front of the blue plane?", "choices": ["selling it", "repairing it", "flying it", "washing it"], "correct_choice_idx": 2, "direct_answers": ["flying", "flying it", "flying", "flying", "flying", "flying", "flying it", "piloting", "controlling plane", "piloting"], "difficult_direct_answer": false, "rationales": ["The man in the front of the plane is the pilot and he is the one steering the plane.", "He's the pilot and it's off the ground", "The man is flying the plane."], "image": "train2014/COCO_train2014_000000358608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574215, "question_id": "VBXM9VgD3rZVrFAUDiidZf", "question": "What sauce is on this pizza?", "choices": ["cheese", "dyed", "white", "tomato"], "correct_choice_idx": 2, "direct_answers": ["white", "pesto", "white sauce", "cheese", "alfredo sauce", "white sauce", "alfredo", "cheese", "white sauce", "white"], "difficult_direct_answer": false, "rationales": ["The sauce is white.", "The sauce on the pizza is white, possibly an alfredo sauce.", "The sauce isn't red and made out of tomatoes."], "image": "train2014/COCO_train2014_000000574215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377195, "question_id": "VBfExUwUnVi3uZxuk9gvB6", "question": "What's the term for the man seated in the tall blue chair?", "choices": ["coach", "host", "official", "guide"], "correct_choice_idx": 2, "direct_answers": ["announcer", "referee", "guide game", "referee", "official", "referee", "referee", "umpire", "judge", "referee"], "difficult_direct_answer": false, "rationales": ["He needs to make sure the ball doesn't touch out of bounds", "A person is sitting in an elevated chair overseeing a tennis court in which a game is being played.", "The person is the official for the game."], "image": "val2014/COCO_val2014_000000377195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435091, "question_id": "VBgC9uRqZLGp3SUuFfarg5", "question": "Where did he come from?", "choices": ["nail salon", "shower", "school", "grocery store"], "correct_choice_idx": 1, "direct_answers": ["shower", "inside room", "bedroom", "bathroom", "pool", "shower", "shower", "shower", "shower", "pool"], "difficult_direct_answer": false, "rationales": ["The man just showered since he has a towel around him.", "He came from the shower.", "There is a towel around his waste and it looks like he is coming from a place where he got wet."], "image": "train2014/COCO_train2014_000000435091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326584, "question_id": "VBj268hA3GDLVA2ytq2xgb", "question": "What is the girl using the wooden stick to do?", "choices": ["stir", "mold", "paint", "play"], "correct_choice_idx": 0, "direct_answers": ["stir", "stir", "mixing", "stir veggies", "stir", "stirring", "stir pot", "mix", "cook", "stirring"], "difficult_direct_answer": false, "rationales": ["The girl is stirring.", "The little girl has a stick that is in a pot.", "The girl is using the spoon to combine the ingredients in the pot."], "image": "train2014/COCO_train2014_000000326584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477243, "question_id": "VBxyBszS3butpzc25mptwb", "question": "How is the dog in midair?", "choices": ["falling", "has wings", "bounced", "biting frisbee"], "correct_choice_idx": 3, "direct_answers": ["biting frisbee", "hanging", "hanging", "frisbee", "playful", "jumped", "holding frisbee", "biting frisbee", "frisbee", "biting frisbee"], "difficult_direct_answer": false, "rationales": ["He is hanging on to the frisbee.", "The dog is biting on a disc.", "He jumped up to grab the frisbee."], "image": "train2014/COCO_train2014_000000477243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276863, "question_id": "VCBXuoFTFxzFaERtds5FrG", "question": "Why is the red hat sitting on the briefcase?", "choices": ["to sell", "to wear", "to buy", "for tips"], "correct_choice_idx": 3, "direct_answers": ["hat", "for money", "for tips", "for tips", "for tips", "cap", "for tips", "tips", "donations", "top hat"], "difficult_direct_answer": false, "rationales": ["The musical instruments and cases indicate that people are busking here.", "The hat is for tips.", "There is a red hat sitting on the briefcase to collect tips."], "image": "val2014/COCO_val2014_000000276863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219229, "question_id": "VCFgHABfM9Ni3nuH5JKgKM", "question": "What is the baby called?", "choices": ["kitten", "cub", "baby", "calf"], "correct_choice_idx": 3, "direct_answers": ["lamb", "sheep", "sheep", "lamb", "lamb", "lamb", "lamb", "lamb", "lamb", "calf"], "difficult_direct_answer": false, "rationales": ["The animal is a sheep and a baby sheep is a calf.", "The baby is a calf.", "This is a sheep calf."], "image": "train2014/COCO_train2014_000000219229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525024, "question_id": "VCKrJLS5h2ms7UtcDUiHiK", "question": "What does the most visible kite appear most as?", "choices": ["insect", "mammal", "human", "fish"], "correct_choice_idx": 0, "direct_answers": ["butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "insect", "butterfly", "butterfly", "butterfly", "butterfly"], "difficult_direct_answer": false, "rationales": ["The kite looks like a butterfly and this is what they're classified as.", "The most visible kite most obviously appears as an insect.", "It looks like a butterfly"], "image": "val2014/COCO_val2014_000000525024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161567, "question_id": "VCUeq6ZfQKMoKazABD5gjx", "question": "This picture was likely taken in what decade?", "choices": ["1970's", "1920's", "1940's", "1990's"], "correct_choice_idx": 0, "direct_answers": ["1970s", "1970s", "sixties", "1970s", "80s", "1980s", "seventies", "sixties", "1970's", "1970s"], "difficult_direct_answer": false, "rationales": ["The picture quality is high, so it probably was not taken in the 1920's or 1940's. the picture is in black and white, so it probably was not taken in the 1990's.", "The picture is in black and white. the people are wearing jeans.", "The clothing and appliance indicates this decade"], "image": "val2014/COCO_val2014_000000161567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344033, "question_id": "VCYarfBp6kgpmXJJ8k8HNM", "question": "What can this feline do most of the day?", "choices": ["play", "attack rats", "sleep", "run"], "correct_choice_idx": 2, "direct_answers": ["sleep", "sleep", "sleep", "sleep", "sleep", "sleep", "sleep", "sleep", "sleep", "sleep"], "difficult_direct_answer": false, "rationales": ["The cat is resting.", "Cats sleep.", "Cats sleep 12-16 hours a day."], "image": "train2014/COCO_train2014_000000344033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431832, "question_id": "VCkTBTmsXWN2N3HcwyekFK", "question": "Who's pastime is this sport?", "choices": ["romania's", "germany's", "america's", "russia's"], "correct_choice_idx": 2, "direct_answers": ["america's", "america", "lou gerri", "america's", "players", "america's", "america's", "americans", "baseball", "americans"], "difficult_direct_answer": false, "rationales": ["The past time is america's.", "They are playing baseball. this sport is primarily associated with the united states.", "It is one of the top sports next to american football in the united states."], "image": "val2014/COCO_val2014_000000431832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441379, "question_id": "VD2EEsVVgn8cdeVpiQmJoD", "question": "What is the number on outfit worn by the goats?", "choices": ["group number", "age", "bib number", "height"], "correct_choice_idx": 2, "direct_answers": ["number 3", "three five", "bib number", "three five", "entrant number", "three", "three", "contestant number", "three", "number 3"], "difficult_direct_answer": false, "rationales": ["Each goat is wearing a jersey in an identical style, like a uniform, while performing a jump over a bar. each goat's jersey is a different color with a different number and few or no other differences between jerseys.", "This is their number on their outfit.", "The number is on the bibs."], "image": "train2014/COCO_train2014_000000441379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206583, "question_id": "VDHjypHp4oHrzcrtjcYnaN", "question": "How many people are wearing spectacles?", "choices": ["all", "three", "none", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["All four people are wearing glasses.", "There are four people with glasses.", "This appears to be a family gathering and poor eyesight is genetic so it goes there would be more than three but less than five of them."], "image": "train2014/COCO_train2014_000000206583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270789, "question_id": "VDHwyhsPEcbByoxCgiHCXE", "question": "Who applied the blue paint to the wall here?", "choices": ["graffiti artist", "portraitist", "auto painter", "building owner"], "correct_choice_idx": 0, "direct_answers": ["graffiti artist", "graffiti artist", "graffiti artist", "graffiti artist", "vandals", "graffiti artist", "graffiti artists", "artist", "kids", "graffiti artist"], "difficult_direct_answer": false, "rationales": ["A graffiti artist applied it.", "This art work is typical of that created by a graffiti artist.", "The paint looks like typical graffiti."], "image": "train2014/COCO_train2014_000000270789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11426, "question_id": "VDiXfxhrrf3XfS3XtVRATn", "question": "Who owns Brough Superior motorcycles?", "choices": ["mark upham", "george brough", "lawrence", "nicholace"], "correct_choice_idx": 0, "direct_answers": ["thierry henriette", "mark upham", "brough", "bike", "boxer design", "mark upham", "boxer design", "thierry henriette", "bough", "george brough"], "difficult_direct_answer": false, "rationales": ["Mark upham owns brough superior motorcycles.", "According to an internet search, although george brough started the company, in 2008 mark upham acquired the rights to the name.", "Upham owns them."], "image": "train2014/COCO_train2014_000000011426.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392726, "question_id": "VDrjsiPAZ44N2x4qMyf4Gu", "question": "What kind of artwork is framed on the left side of the screen on the wall?", "choices": ["abstract", "contemporary", "impressionism", "american pop"], "correct_choice_idx": 3, "direct_answers": ["collage", "photography", "collage", "art work", "new art", "abstract", "patchwork", "magazine cutout", "mosaic", "american pop"], "difficult_direct_answer": true, "rationales": ["American pop art is usually bold and daring.", "The walls are covered in pop art.", "The artwork is pop."], "image": "train2014/COCO_train2014_000000392726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263677, "question_id": "VE9d6eEGLHRdWRxLkYwTQ6", "question": "What is on a carousel?", "choices": ["utensils", "condiments", "mugs", "spices"], "correct_choice_idx": 3, "direct_answers": ["spices", "spices", "spices", "spices", "seasonings spices", "microwave", "microwave", "spices", "spices", "spices"], "difficult_direct_answer": false, "rationales": ["Next to the stove is a rack of spices on a spinning contraption like a carousel.", "The unrefrigerated carousel is holding small jars. utensils and mugs are not stored in small jars, and condiments usually are refrigerated.", "The spices are on a carousel that spin."], "image": "train2014/COCO_train2014_000000263677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518109, "question_id": "VEnkpB9B65meUxyboAmNSq", "question": "What is this called?", "choices": ["giveaway", "museum", "electronics store", "media center"], "correct_choice_idx": 2, "direct_answers": ["laptop", "laptop", "laptop", "working", "working", "laptop", "electronics store", "working", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["A man is standing at a laptop that is on display with ads above it.", "The person is at a tech store.", "It's an electronics store."], "image": "val2014/COCO_val2014_000000518109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218057, "question_id": "VEwYzxWowFsxGapnofS4HG", "question": "What are the girls doing with the white remotes?", "choices": ["karaoke", "playing games", "changing channel", "cosplaying"], "correct_choice_idx": 1, "direct_answers": ["playing wii", "playing", "playing", "gaming", "video game", "playing wii", "playing games", "playing games", "playing games", "playing game"], "difficult_direct_answer": false, "rationales": ["The girls are holding a wii-mote, so they are playing games on the wii console.", "The remote is for a wii game.", "The girls are gaming."], "image": "train2014/COCO_train2014_000000218057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306811, "question_id": "VExSwfi7C3jaTtmsk52MCZ", "question": "What is she doing?", "choices": ["changing phone", "using phone", "eating lunch", "stealing phone"], "correct_choice_idx": 1, "direct_answers": ["smiling", "texting", "talking", "texting", "smiling", "using phone", "smiling", "smiling", "smiling phoning", "using cellphone"], "difficult_direct_answer": false, "rationales": ["She's using the phone.", "The women has a cellphone in her hands and it is opened for use.", "She is using the phone to talk."], "image": "train2014/COCO_train2014_000000306811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416862, "question_id": "VEy5UdZ7Z2HXimAxhbzeek", "question": "What sort of facility is seen here?", "choices": ["livestock", "lab", "food sales", "cubicle"], "correct_choice_idx": 1, "direct_answers": ["laboratory", "laboratory", "lab", "lab", "lab", "laboratory", "lab", "laboratory", "laboratory", "kitchen"], "difficult_direct_answer": false, "rationales": ["A small area with clean tables and scientific equipment is very clean.", "There are desks with computers. it looks like a sterile area.", "A science lab is shown."], "image": "val2014/COCO_val2014_000000416862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553116, "question_id": "VF3Bw79VGPHPXazRMUAGYD", "question": "What is the player standing on the base ready to do?", "choices": ["tackle", "dribble", "swing", "dunk"], "correct_choice_idx": 2, "direct_answers": ["hit", "hit", "hit ball", "swing", "hit ball", "play", "swing", "run", "hitting baseball", "through"], "difficult_direct_answer": false, "rationales": ["He is a batter. he is set with a bat in his hand ready for the pitch.", "The player standing on the base is likely to swing his bat.", "The player is swinging."], "image": "train2014/COCO_train2014_000000553116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492639, "question_id": "VGGZ6G8uhigH89jK3vJMnL", "question": "As a foreigner how could somebody know when to cross the street?", "choices": ["bus flashing", "people shouting", "traffic lights", "walk sign"], "correct_choice_idx": 3, "direct_answers": ["crosswalk sign", "crossing lights", "traffic lights", "walk sign", "lights", "walk signal", "green light", "light", "uraffic sign", "traffic light"], "difficult_direct_answer": true, "rationales": ["The symbol is somewhat universal and does not require an understanding of english.", "The sign indicates that pedestrians can walk.", "The universal symbol for walking appears in countries around the world."], "image": "train2014/COCO_train2014_000000492639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402844, "question_id": "VGfsP9gmF73afx2KvV9nZD", "question": "What is the object in his right hand traditionally made of?", "choices": ["gold", "rubber", "wood", "glass"], "correct_choice_idx": 2, "direct_answers": ["wood", "wood", "wood", "wood", "wood", "wood", "foam", "wood", "wood", "aluminum"], "difficult_direct_answer": false, "rationales": ["Bats are made from trees.", "Official baseball bats are made from maple, ash, or hickory.", "This is a bat used in baseball"], "image": "train2014/COCO_train2014_000000402844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26430, "question_id": "VGq7FTNd9x9etm7Ef9DPZV", "question": "Where are the people who set up the umbrella and blanket here now?", "choices": ["rodeo", "grocery store", "parade", "swimming"], "correct_choice_idx": 3, "direct_answers": ["in ocean", "water", "somewhere else", "water", "swimming", "water", "ocean", "ocean", "in water", "swimming walking"], "difficult_direct_answer": false, "rationales": ["The umbrella and blanket are on a beach. there is a body of water next to the beach.", "There are people playing in the water.", "The people are in the ocean."], "image": "val2014/COCO_val2014_000000026430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574370, "question_id": "VGzXNgYC8MWuYwML3v7WXF", "question": "What material is used to make the balls in the dogs mouth?", "choices": ["leather", "plastic", "horsehair", "rubber"], "correct_choice_idx": 3, "direct_answers": ["synthetic", "rubber", "rubber", "rubber", "felt", "natural rubber", "cloth", "plastic", "felt", "rubber felt"], "difficult_direct_answer": false, "rationales": ["Tennis balls are made of rubber in order to bounce effectively off the surface of a tennis court.", "Rubber is used to make tennis balls.", "The material is rubber."], "image": "train2014/COCO_train2014_000000574370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222332, "question_id": "VHGzPTvoUGLY9frRNKwGy9", "question": "What is the job of the man behind the player?", "choices": ["musician", "coach", "painter", "waiter"], "correct_choice_idx": 1, "direct_answers": ["referee", "referee", "coach", "coach", "umpire", "line guy", "line judge", "ball catcher", "referee", "catch balls"], "difficult_direct_answer": false, "rationales": ["That is the mans coach and he is watching so he can tell him what he needs to do to improve.", "The man behind the player is a tennisball coach in a cap.", "The job is the coach."], "image": "val2014/COCO_val2014_000000222332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94055, "question_id": "VHW6MK7D6c3uzYjNyTLoBm", "question": "Which animal has bigger eyes than any other mammal that lives on land?", "choices": ["horses", "tiger", "giraffe", "zebras"], "correct_choice_idx": 0, "direct_answers": ["tarsiers", "tarsiidae", "elephant", "tarsiers", "horse", "horse", "horse", "horses", "horses", "horse"], "difficult_direct_answer": false, "rationales": ["It's types of birds who have the largest and these have the second largest", "The people are riding on the horses.", "Traditionally horses have much larger eyes than humans."], "image": "val2014/COCO_val2014_000000094055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235130, "question_id": "VHyUVUqJMq5dDRz55tUxBG", "question": "In what direction is the sun with respect to the person wearing a blue jacket?", "choices": ["front", "right", "back", "left"], "correct_choice_idx": 2, "direct_answers": ["behind", "behind", "south", "back", "east", "behind", "behind", "behind", "east", "behind"], "difficult_direct_answer": false, "rationales": ["The sun is shining behind the person.", "The sun is behind the man in the blue jacket.", "The direction is in back."], "image": "val2014/COCO_val2014_000000235130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361205, "question_id": "VJ2sQMXaK2zAovX2x3QG6W", "question": "What type of people obviously live here?", "choices": ["children", "middle aged", "elderly", "infirm"], "correct_choice_idx": 0, "direct_answers": ["children", "kids", "kids", "kids", "parents", "children", "children", "children", "kids", "kids"], "difficult_direct_answer": false, "rationales": ["Kids have a lot of toys so they must live here.", "This type of person would play with the toys in this photo.", "This room contains kitchen appliances. they are smaller than normal."], "image": "train2014/COCO_train2014_000000361205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332863, "question_id": "VJ33zoqttgU8vZuZrfVuyu", "question": "What TV show would this kind of vehicle be found in?", "choices": ["black sails", "star trek", "devs", "battlestar galactica"], "correct_choice_idx": 0, "direct_answers": ["boat show", "fishing", "fantasy", "gilligan island", "pirate show", "black sails", "below deck", "titanic", "black sails", "deadliest catch"], "difficult_direct_answer": true, "rationales": ["This would be in a pirate movie or tv show.", "It would be found in a movie that is set on the ocean and may have pirates in it.", "This is a boat that would be seen on a tv show that takes place at sea."], "image": "train2014/COCO_train2014_000000332863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11172, "question_id": "VJBbNFae8kLHXKfepZRvaN", "question": "What is the bowl on the table made of?", "choices": ["steel", "wicker", "glass", "plastic"], "correct_choice_idx": 1, "direct_answers": ["wicker", "wood", "wicker", "wicker", "straw", "wicker", "wicker", "wicker", "straw", "wicker"], "difficult_direct_answer": false, "rationales": ["The pattern, color, and texture visually represent the material listed.", "The bowl is made of wicker.", "This is a wicker basket."], "image": "val2014/COCO_val2014_000000011172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528900, "question_id": "VJBdyWzD26CukSV55xATvg", "question": "World skate is the head controller of which game?", "choices": ["kiting", "swimming", "skating", "snowboarding"], "correct_choice_idx": 2, "direct_answers": ["skating", "skateboarding", "skateboarding", "unknown", "skateboard", "xgames", "world games", "unknown", "roller sports", "tony hawk"], "difficult_direct_answer": false, "rationales": ["Skating is used for gaming.", "World skate controls skating.", "World skate implies it is a skating controller."], "image": "train2014/COCO_train2014_000000528900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351240, "question_id": "VJJ5kdZUy4xTgyoK4N68PL", "question": "What color shirt does the person at bat wear?", "choices": ["gray", "none", "orange", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["Unless you are colorblind it's obvious as to what color they are wearing.", "The player wearing 22 is running to the next base, so he is on the batting team. his shirt is not grey or orange.", "The man wears the black shirt that is seen."], "image": "train2014/COCO_train2014_000000351240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88470, "question_id": "VJN2RtYjsUpHSwDc5dgwJq", "question": "Why is the cage filled with the same type of bird?", "choices": ["to train", "to eat", "to sell", "to cook"], "correct_choice_idx": 2, "direct_answers": ["for sale", "space", "love birds", "to sell", "to sell", "for sale", "pet store", "selling them", "pet storr", "pet shop"], "difficult_direct_answer": false, "rationales": ["The cage is for sale.", "The birds are for sale.", "The birds are available to buy."], "image": "train2014/COCO_train2014_000000088470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213780, "question_id": "VJc7qQjTE6BXYEDHrHbUyA", "question": "What is the make of the dark colored car on the left?", "choices": ["chevy", "ford", "honda", "toyota"], "correct_choice_idx": 1, "direct_answers": ["ford", "ford", "taurus", "ford", "traffic light", "ford taurus", "volvo", "taurus", "ford", "ford"], "difficult_direct_answer": false, "rationales": ["The car on the left says taurus and has a ford logo.", "The logo on the car is for ford.", "The make is ford."], "image": "train2014/COCO_train2014_000000213780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283097, "question_id": "VJiFQvePsjwVYAiwQn6ifB", "question": "Where is he most likely pushing the things to?", "choices": ["forest", "temple", "grocery store", "airport taxi"], "correct_choice_idx": 3, "direct_answers": ["car", "car", "car", "airport", "airport", "airport", "terminal", "airport taxi", "airport", "cleanliness"], "difficult_direct_answer": false, "rationales": ["He'll go to the taxi.", "This is an airport luggage taxi.", "This person is most likely to push these things to the airport taxi."], "image": "val2014/COCO_val2014_000000283097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395339, "question_id": "VJnhPELDcT4WyrxkGCckZX", "question": "What type of shop are people congregated in front of?", "choices": ["flag", "coffee", "resale", "bike"], "correct_choice_idx": 2, "direct_answers": ["resale", "resale shop", "resale shop", "resale shop", "resale shop", "resale shop", "resale shop", "resale", "resale", "resale"], "difficult_direct_answer": false, "rationales": ["The sign says 'resale'.", "The people are in front of a resale shop.", "The word is on the sign"], "image": "train2014/COCO_train2014_000000395339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408818, "question_id": "VJrpRdh74FYRMFcQBJzrGk", "question": "What type passengers board this train?", "choices": ["commuters", "none", "tourists", "engineers only"], "correct_choice_idx": 2, "direct_answers": ["tourist", "tourists", "tourists", "tourists", "tourist", "tourist", "commercial", "tourists", "commercial", "tourists"], "difficult_direct_answer": false, "rationales": ["The passengers who commonly board this train are tourists.", "People going on a trip that want to site see would go on this train.", "The type is a tourist."], "image": "val2014/COCO_val2014_000000408818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227048, "question_id": "VK4hKfu6t2xiRX9TZ3bD8i", "question": "What is the boy in green ready to do?", "choices": ["dunk", "roll", "fall", "swing"], "correct_choice_idx": 3, "direct_answers": ["hit ball", "hit ball", "swing", "hit ball", "serve", "serve", "hit ball", "serve", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The is trying to hit the ball.", "The boy is holding is racquet in a swinging motion to hit the ball.", "The boy in green is holding a tennis racquet. a tennis ball is coming towards him."], "image": "val2014/COCO_val2014_000000227048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153368, "question_id": "VKEavjvV7dA3igYwK3EvG9", "question": "What does the animal in the picture above do?", "choices": ["chatters", "neighs", "barks", "brays"], "correct_choice_idx": 2, "direct_answers": ["sit", "protect", "pant", "sit", "lays", "barks", "be friendly", "bark", "bark", "watch"], "difficult_direct_answer": false, "rationales": ["The animal barks.", "It's the thing dogs are most known for.", "This big breed has a long nose, pointy ears, long tail, and is usually a domesticated animal that lives under a roof."], "image": "train2014/COCO_train2014_000000153368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437073, "question_id": "VKaHCDjYKMrpEp5YAeKg5E", "question": "What is the man carrying in both hands?", "choices": ["weights", "briefcase", "banks", "batteries"], "correct_choice_idx": 1, "direct_answers": ["suitcase", "briefcases", "suitcases", "suitcase", "suitcases", "suitcases", "cases", "briefcase", "suitcase", "suitcases"], "difficult_direct_answer": false, "rationales": ["The man has briefcases.", "The man is carrying briefcases in his hands.", "They are small pieces of luggage"], "image": "train2014/COCO_train2014_000000437073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19157, "question_id": "VKrzborGy5q24we4UYwjkf", "question": "What company made the shit the man on the left wearing a hat has on?", "choices": ["hanes", "amazon", "target", "nike"], "correct_choice_idx": 3, "direct_answers": ["nike", "nike", "cow", "nike", "nike", "nike", "john deer", "nike", "nike", "coors"], "difficult_direct_answer": false, "rationales": ["Nike's swoosh is shown.", "Nike makes the shirt based on the logo.", "The company is nike."], "image": "val2014/COCO_val2014_000000019157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256067, "question_id": "VKs5fHYRNdEv2U42aBBKt2", "question": "What type of board is the man in the hat pulling?", "choices": ["paddleboard", "waterboard", "bodyboard", "surfboard"], "correct_choice_idx": 2, "direct_answers": ["boogie board", "kids board", "kick board", "paddle board", "wake board", "paddle", "wakeboard", "boogie board", "boogie", "bodyboard"], "difficult_direct_answer": true, "rationales": ["The board is a bodyboard.", "The man has a board the kid can lie on.", "The size and shape makes it a body board."], "image": "val2014/COCO_val2014_000000256067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392033, "question_id": "VLScQQcbSG4CTceCWzrYNm", "question": "What item was probably used in creating the bench?", "choices": ["crane", "cnc machine", "saw", "kiln"], "correct_choice_idx": 2, "direct_answers": ["logs", "wood", "logs", "saw", "wood", "saw", "trees", "saw", "wood", "saw"], "difficult_direct_answer": false, "rationales": ["The bench is made out of pieces of wood. they were cut.", "The bench is comprised of thin logs that have been cut and then assembled in a bench shape. a sharp tool cuts logs.", "Wood has to be cut before it is used. a saw is a small tool that can be used on logs."], "image": "train2014/COCO_train2014_000000392033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573898, "question_id": "VLdpU46Jfyf4mENZ5X7Abf", "question": "What makes visibility here seem gray and dark?", "choices": ["thunder", "rain clouds", "sun", "nothing"], "correct_choice_idx": 1, "direct_answers": ["clouds", "no sun", "filter", "raining", "clouds", "rain clouds", "black white", "cloudiness", "rain", "fog"], "difficult_direct_answer": true, "rationales": ["The clouds take away a lot of the light and make it darker out and the skies grey.", "The clouds are.", "There are clouds in the sky."], "image": "train2014/COCO_train2014_000000573898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4498, "question_id": "VLgcPBKuaAeP8DDYMcyuCH", "question": "What city are the headquarters of this child's shoes?", "choices": ["calgary", "herzogenaurach", "mumbai", "new york"], "correct_choice_idx": 1, "direct_answers": ["no idea", "herzogenaurach", "portland", "new york", "new york", "herzogenaurach", "oregon", "adidas", "herzogenaurach germany", "new york"], "difficult_direct_answer": false, "rationales": ["The kid is wearing adidas sneakers which their headquarters is in germany.", "The shoes are adidas.", "A kid is wearing adidas shoes."], "image": "val2014/COCO_val2014_000000004498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306663, "question_id": "VLipaajBxxTgtis3aKFCwK", "question": "What are the white posts called in front of the building?", "choices": ["lightposts", "structurals", "columns", "signposts"], "correct_choice_idx": 2, "direct_answers": ["pillars", "columns", "columns", "columns", "beams", "posts", "pillars", "columns", "porch", "columns"], "difficult_direct_answer": false, "rationales": ["The white poles hold up the porch overhang.", "The posts are columns.", "The white posts are columns."], "image": "train2014/COCO_train2014_000000306663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123371, "question_id": "VLjY9drVJZfCRnaV3r7hZy", "question": "What species of cow is black and white?", "choices": ["holsteins", "salers", "black baldy", "shorthorn"], "correct_choice_idx": 0, "direct_answers": ["holsteins", "holstein", "american", "holsteins", "jersey", "frisian", "holstein", "heifer cows", "homestead", "heifer"], "difficult_direct_answer": false, "rationales": ["This is an easily identifiable type of cow", "Holstein cows are the most popular cow used for milk production. a cow is in a populated area, laying in the street.", "The species is holsteins."], "image": "train2014/COCO_train2014_000000123371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56267, "question_id": "VLk9CUgKbJaWdV4kBs9ieX", "question": "In what country would this attire cause a person to sweat?", "choices": ["iceland", "russia", "australia", "norway"], "correct_choice_idx": 2, "direct_answers": ["mexico", "cuba", "australia", "india", "mozambique", "australia", "hot countries", "africa", "africa", "mexico"], "difficult_direct_answer": false, "rationales": ["Australia is the only warmer country on the list and the one where a jacket is something that isn't needed most likely.", "The country is australia.", "This country has a very hot summer"], "image": "val2014/COCO_val2014_000000056267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174967, "question_id": "VLqCAhhdYsjHSwgEih7SCf", "question": "What type of food is served here?", "choices": ["italian", "thai", "korean", "chinese"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "italian", "italian", "pizza", "italian", "pizza", "italian"], "difficult_direct_answer": false, "rationales": ["Pizza is on a white plate on a table.", "Pizza is italian food.", "The pizza is originally from the italian."], "image": "train2014/COCO_train2014_000000174967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429236, "question_id": "VM7XPYDZXDKSJ5tVg6zwsr", "question": "WHat kind of dog is this?", "choices": ["pitbull", "dobermin", "yorkie", "lab"], "correct_choice_idx": 2, "direct_answers": ["small", "miniature", "terrier", "situ", "chihuahua", "yorkie", "terrier", "poodle", "shih tzu", "terrier"], "difficult_direct_answer": false, "rationales": ["A small dog with short brown hair is on a skateboard.", "The dog is a yorkie terrier.", "It's actually called a yorkshire terrier. the other options don't match."], "image": "train2014/COCO_train2014_000000429236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533123, "question_id": "VM93F8pVrP9wJFmfcN3e6w", "question": "What is placed on top of the mantle?", "choices": ["ashes", "figurine", "clock", "picture"], "correct_choice_idx": 2, "direct_answers": ["clock", "clock", "clock", "clock", "clock", "clock", "clock", "clock", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["There is a mantle behind the people with a clock on it.", "It has a face and hands", "The only thing on the mantle is a small timekeeper."], "image": "val2014/COCO_val2014_000000533123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364719, "question_id": "VMQwXvzPr4o2FsgCG88hk2", "question": "How did this sheep die?", "choices": ["beheading", "cut throat", "poison", "strangling"], "correct_choice_idx": 1, "direct_answers": ["beheaded", "cut throat", "murder", "killed", "throat cut", "cut", "throat cut", "throat slit", "cut throat", "slaughter"], "difficult_direct_answer": false, "rationales": ["They are letting the blood after butchering it", "The head is deattach from the body.", "A sheep is slit at the throat and the blood is pouring out."], "image": "train2014/COCO_train2014_000000364719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492215, "question_id": "VMiRGMAWRwXpG2tWQ3rwe9", "question": "What is the woman in the costume depicted as?", "choices": ["waiter", "maid", "alice", "goth"], "correct_choice_idx": 1, "direct_answers": ["maid", "maid", "maid", "maid", "maid", "maid", "maid", "maid", "maid", "maid"], "difficult_direct_answer": false, "rationales": ["The woman is dressed in a typical black and white uniform outfit.", "The woman is a maid.", "She is wearing an apron."], "image": "train2014/COCO_train2014_000000492215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153764, "question_id": "VN5YGQDnfaQdc5iVsR6SWV", "question": "What will the man below the tennis ball do now?", "choices": ["serve", "wait", "nothing", "return"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "serve", "serve", "hit it", "hit ball", "swing", "hit ball", "hit ball", "serve ball", "serve"], "difficult_direct_answer": false, "rationales": ["A man on a tennis court is tossing the ball up while standing behind the white line.", "A man is standing at the back line of a tennis court and has thrown the ball into the air to prepare to hit it.", "The man is playing tennis."], "image": "val2014/COCO_val2014_000000153764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377411, "question_id": "VN6hKEAG2zqGjaKiy8y7DG", "question": "The item hanging on the wall is called what?", "choices": ["poster", "brick", "portrait", "cross"], "correct_choice_idx": 3, "direct_answers": ["cross", "christian cross", "cross", "cross", "church", "cross", "cross", "cross", "cross", "cross"], "difficult_direct_answer": false, "rationales": ["This is inside a christian church.", "The item hanging is a cross.", "The item is a cross."], "image": "train2014/COCO_train2014_000000377411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560361, "question_id": "VNDZqAFEpgyKqpamLjoviE", "question": "Where are these elephants located?", "choices": ["zoo", "circus", "captivity", "wild"], "correct_choice_idx": 3, "direct_answers": ["river", "watering hole", "water", "water", "thailand", "asia", "wild", "water", "river", "asia river"], "difficult_direct_answer": false, "rationales": ["These are elephants living wild in the lake.", "They are in a river in a jungle", "The elephants are standing in a jungle area with water and a lot of trees and brush."], "image": "val2014/COCO_val2014_000000560361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140068, "question_id": "VNG3zwSpv5hTURDjThTPri", "question": "Why is the man wearing a glove?", "choices": ["fashion", "warmth", "to catch", "health"], "correct_choice_idx": 2, "direct_answers": ["catch baseball", "catch", "catch baseball", "catching ball", "to catch", "catch balls", "catching ball", "to catch", "baseball", "catching balls"], "difficult_direct_answer": false, "rationales": ["The man is in a baseball uniform on a baseball field.", "A baseball player is on the field in uniform and with his glove.", "The man is playing a sport wear you try to get the baseball."], "image": "val2014/COCO_val2014_000000140068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463120, "question_id": "VNcmcQgcqRyKDMVT9tdN5H", "question": "What is on the dock?", "choices": ["people", "cat", "firefighter", "car"], "correct_choice_idx": 0, "direct_answers": ["boat", "people", "people", "women", "people", "two women", "sea", "two people", "boat", "people"], "difficult_direct_answer": false, "rationales": ["People are standing alongside the dock.", "There are women standing on it", "People are there."], "image": "train2014/COCO_train2014_000000463120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147972, "question_id": "VNzrzF682JJhGtfDk4dPAW", "question": "What is the means of riding available here if you must ride without wheels?", "choices": ["bike", "car", "elephant", "rickshaw"], "correct_choice_idx": 2, "direct_answers": ["elephant", "elephant", "elephant", "elephant", "motorcycle", "elephant", "motorcycle", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The means is an elephant.", "An elephant is walking on the street.", "Someone is being transported on an elephant, indicating it is a possible method of traveling in this location."], "image": "train2014/COCO_train2014_000000147972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35045, "question_id": "VPH8HrDTMcA2awo4aUZwNt", "question": "What powers the device being held here?", "choices": ["gas", "electric outlet", "solar", "batteries"], "correct_choice_idx": 3, "direct_answers": ["batteries", "batteries", "batteries", "electricity", "batteries", "batteries", "batteries", "battery", "battery", "battery"], "difficult_direct_answer": false, "rationales": ["The device is a nintendo wii remote. it is wireless, so it is not powered by an electrical outlet, solar, or gas.", "Batteries power the console.", "The sigh in the items shows that this is battery."], "image": "train2014/COCO_train2014_000000035045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431139, "question_id": "VPVkkfAXUa6DfKaqBdyXPi", "question": "From what location does this train draw or complete an electrical circuit?", "choices": ["wires above", "engine train", "battery caboose", "gas motor"], "correct_choice_idx": 0, "direct_answers": ["wires", "wire above", "conductor", "wires", "top", "top", "wires above", "conductor", "wires", "bayern"], "difficult_direct_answer": false, "rationales": ["The train is in contact with wires above that look to be electrical wires based on their set up. if a train is in contact with electrical wires above it is likely drawing its power from the wires.", "There are power lines.", "Electrical wires are above a train at a station."], "image": "train2014/COCO_train2014_000000431139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190293, "question_id": "VPdndFkjMeGo9EGKR6wsyb", "question": "What is planted in the sand?", "choices": ["net", "tent", "flag", "umbrella"], "correct_choice_idx": 3, "direct_answers": ["umbrella", "umbrella pole", "umbrella post", "parasol", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "beach umbrella"], "difficult_direct_answer": false, "rationales": ["The umbrella is there to let people sit in the shade.", "A beach umbrella is in the sand.", "It has a pole and a canopy"], "image": "train2014/COCO_train2014_000000190293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125694, "question_id": "VPhParJ6Ajj5aCp5Xo5suu", "question": "What is the man holding?", "choices": ["his hair", "forks", "baby", "apple"], "correct_choice_idx": 1, "direct_answers": ["two forks", "forks", "forks", "forks", "forks", "forks", "plastic forks", "silverware", "two forks", "eating"], "difficult_direct_answer": false, "rationales": ["The man has forks.", "The man is holding two utensils to help him eat.", "The man is getting ready to eat."], "image": "train2014/COCO_train2014_000000125694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124419, "question_id": "VPy2JQx8nHiq3LhMkQRGCV", "question": "Who is the man at the end of the train?", "choices": ["postman", "official", "repairman", "conductor"], "correct_choice_idx": 3, "direct_answers": ["brake man", "conductor", "tar", "conductor", "police", "conductor", "police", "police officer", "conductor", "conductor"], "difficult_direct_answer": false, "rationales": ["The conductor will drive the train.", "The man is a conductor.", "Helps ensure the passengers have their ticket payed for their transport."], "image": "train2014/COCO_train2014_000000124419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377157, "question_id": "VQXRYMRgsp7uzZD549PgWN", "question": "How did that elephant get there?", "choices": ["erosion", "fossilization", "walked", "carved"], "correct_choice_idx": 3, "direct_answers": ["walked", "it's cement", "built", "from jungle", "built", "led", "placed", "walked", "carved", "sculptured"], "difficult_direct_answer": false, "rationales": ["The elephant was carved on the plaza.", "A girl is sitting on an statue of an elephant. statues are often carved.", "The elephant was carved."], "image": "train2014/COCO_train2014_000000377157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150032, "question_id": "VQpV8A73VP6Baz8cPoXakE", "question": "What is in the can on the counter?", "choices": ["fake tan", "paint", "air freshener", "hairspray"], "correct_choice_idx": 2, "direct_answers": ["air freshener", "shaving cream", "air freshener", "room spray", "air freshener", "air freshener", "air freshener", "air freshener", "air freshener", "nothing"], "difficult_direct_answer": false, "rationales": ["The can is an air freshener.", "This is a bathroom. the can has a logo on it side that indicates that it is capable of wicking smells.", "The logo on the can denotes a brand of air purifying mist."], "image": "train2014/COCO_train2014_000000150032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144965, "question_id": "VQunjdT8WPX9HMmmGFQHBu", "question": "What is inside the bun being bitten?", "choices": ["hot dog", "horse", "peanuts", "flowers"], "correct_choice_idx": 0, "direct_answers": ["hot dog", "hot dog", "hot dog", "hot dog", "hotdog", "hotdog", "sausage", "food", "hot dog", "hot dog"], "difficult_direct_answer": false, "rationales": ["The meat inside the bun is a hot dog.", "A person is biting into a slender, white bun and a fence typical of a sports field is behind him.", "This is an elongated bun the same general shape as the meat"], "image": "train2014/COCO_train2014_000000144965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323930, "question_id": "VR3RKKEQ4DdgWz3ZnvkoyJ", "question": "The man throws with the same hand as what athlete?", "choices": ["clayton kershaw", "max scherzer", "jacob degrom", "ian anderson"], "correct_choice_idx": 0, "direct_answers": ["sandy koufax", "pitcher", "player", "mike trout", "babe ruth", "sandy koufax", "babe ruth", "babe ruth", "clayton kershaw", "lefty grove"], "difficult_direct_answer": false, "rationales": ["The man is throwing with his left hand as clayton does.", "The pitcher is throwing with his left hand while the mitt is on his right.", "The man is kershaw."], "image": "val2014/COCO_val2014_000000323930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559754, "question_id": "VRRx8mmWtb7eHkAjWADTQ2", "question": "What would someone need to do to use this device?", "choices": ["ask", "dance", "crime", "park"], "correct_choice_idx": 3, "direct_answers": ["pay fare", "change", "insert coins", "park", "insert money", "insert coins", "have cellphone", "coins", "coins", "put coin"], "difficult_direct_answer": false, "rationales": ["A person would need to park.", "This device is a meter. it is near a street.", "This machine gives a person the right to park."], "image": "train2014/COCO_train2014_000000559754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446536, "question_id": "VRUPBATFPXmXxQLYHoNNYf", "question": "What type of clientele does the book store have?", "choices": ["gay", "nazis", "jamaicans", "african americans"], "correct_choice_idx": 0, "direct_answers": ["students", "gay", "book readers", "artists", "university", "people", "lgbtq", "university students", "students", "university students"], "difficult_direct_answer": false, "rationales": ["The type is gay.", "Rainbows often symbolize gay support.", "This is a possibility since there is a rainbow of colors"], "image": "train2014/COCO_train2014_000000446536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125305, "question_id": "VRhqav9Qu9yAGapsRywMwG", "question": "What is the main factor keeping the planes on the dirt?", "choices": ["aerodynamics", "age", "production", "gas"], "correct_choice_idx": 1, "direct_answers": ["mechanical problems", "landed", "protection elements", "landed", "in place", "landing gear", "age", "gravity", "gravity", "parked"], "difficult_direct_answer": false, "rationales": ["These planes appear to be de-commissioned and not flying on a regular basis. the biggest reason they are grounded is probably due to their age.", "The factor is age.", "The planes are too old to lift off since they're rusted."], "image": "train2014/COCO_train2014_000000125305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113852, "question_id": "VRqXY4CmWhbooiAdGYNghP", "question": "What are these animals known for producing?", "choices": ["fur", "eggs", "silk", "milk"], "correct_choice_idx": 3, "direct_answers": ["milk", "milk", "milk", "milk", "milk", "milk", "milk", "beef", "beef", "milk"], "difficult_direct_answer": false, "rationales": ["The animals are dairy cows.", "The produce milk", "Cows are raised by dairy farmers."], "image": "train2014/COCO_train2014_000000113852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302606, "question_id": "VS2dKaUigwcxe7tuBBXBsb", "question": "What is most likely in the smallest bowl shown?", "choices": ["sauce", "soup", "chicken", "wontons"], "correct_choice_idx": 0, "direct_answers": ["soup", "sauce", "sauce", "fish sauce", "sauce bowl", "condiment", "by woman", "soup bowl", "soup", "condiments"], "difficult_direct_answer": false, "rationales": ["The bowl has sauce.", "Each of the two main dishes can be dipped in a flavorful liquid that is designed to enhance the taste. duck sauce is a liquid used in this instance for these types of dishes.", "The small bowl holds a thick, colorful liquid which suggests it should be used sparingly."], "image": "val2014/COCO_val2014_000000302606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571059, "question_id": "VSMbCm4AiByoezsNzFAhTU", "question": "What are the people holding?", "choices": ["diapers", "babies", "bats", "hands"], "correct_choice_idx": 3, "direct_answers": ["hands", "hands", "surfboards", "hands", "hands", "hands", "hands", "hands", "hands", "hands"], "difficult_direct_answer": false, "rationales": ["The people are each holding out a hand.", "The people in the waves are holding hands.", "The people hold hands."], "image": "val2014/COCO_val2014_000000571059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350837, "question_id": "VSQZ6PGBjZQ4S5hd5KPeEQ", "question": "When did the company that made this shirt get it's current name?", "choices": ["1949", "2020", "1900", "1950"], "correct_choice_idx": 0, "direct_answers": ["founders name", "1949", "1949", "1949", "adds", "1949", "1963", "1971", "1949", "1949"], "difficult_direct_answer": false, "rationales": ["The brand is adidas based on the logo. the year name of the company was given is internet searchable.", "A tennis player is wearing a shirt with the adidas logo on it.", "The brand on the shirt is adidas based on the logo and the three line design. this brand originated and got its name in answer a."], "image": "train2014/COCO_train2014_000000350837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345084, "question_id": "VSi6w9hMBPCeLswFnSpnVa", "question": "According to the sign beneath the stop sign what are motorists not allowed to do at this corner?", "choices": ["turn right", "stop", "idle", "u-turn"], "correct_choice_idx": 0, "direct_answers": ["turn", "turn right", "turn", "stop", "turn", "turn", "turn right", "turn right", "turn right", "turn right"], "difficult_direct_answer": false, "rationales": ["There is a black arrow with a red circle-scratchout symbol which means you can't turn in that direction.", "The sign says you can't turn.", "It is an arrow turning towards that way and it has a red circle and line through it"], "image": "train2014/COCO_train2014_000000345084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490044, "question_id": "VStFNgkLLDeC83j7bUWZrv", "question": "What type of event is this?", "choices": ["rally", "competition", "farmer's market", "concert"], "correct_choice_idx": 2, "direct_answers": ["farmer's market", "trading", "farmer's market", "farmers market", "farmer's market", "farmers market", "farmer's market", "market", "market", "market"], "difficult_direct_answer": false, "rationales": ["The farmer's market has veggies.", "There is produce being sold.", "The various produce and food stalls along with people surrounding the stalls indicate that a market is set up."], "image": "val2014/COCO_val2014_000000490044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372554, "question_id": "VTGJPcToWEKoBqBQXiHjiS", "question": "What does the green stuff here need?", "choices": ["water", "salt", "gas", "electricity"], "correct_choice_idx": 0, "direct_answers": ["water", "mowing", "water sunlight", "sunlight water", "water", "water", "kite", "mowing", "water", "cut"], "difficult_direct_answer": false, "rationales": ["The grass requires water in order to grow.", "It needs this to live and grow", "The green stuff needs water."], "image": "train2014/COCO_train2014_000000372554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375782, "question_id": "VTXNY5YcFAoZC3nYB4ZVoZ", "question": "What sort of relationship do the two people seated have with one another?", "choices": ["strangers", "best friends", "married", "sworn enemies"], "correct_choice_idx": 0, "direct_answers": ["strangers", "platonic", "strangers", "strangers", "coworkers", "strangers", "coupl", "strangers", "strangers", "strangers"], "difficult_direct_answer": false, "rationales": ["The two people sitting on the bench are complete strangers.", "The people are sitting far from each other like they do not know each other.", "They're strangers."], "image": "train2014/COCO_train2014_000000375782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526536, "question_id": "VU9BYYTDWsTKFccfUXBSeY", "question": "What style of pizza is on the plate?", "choices": ["deep dish", "neapolitan", "thin crust", "stuffed crust"], "correct_choice_idx": 0, "direct_answers": ["deep dish", "deep dish", "mini", "deep dish", "personal", "deep dish", "deep dish", "deep pizza", "olive", "deep dish"], "difficult_direct_answer": false, "rationales": ["Pizza that has a very thick, deep crust is called deep dish.", "The crust is like a mile high, thus the \"deep\" in the name.", "The crust is tall like a pie crust."], "image": "val2014/COCO_val2014_000000526536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141517, "question_id": "VUAoUtEpc2wutj8VQL9hjH", "question": "What are the poles of the fence made of?", "choices": ["bones", "steel", "wood", "metal"], "correct_choice_idx": 2, "direct_answers": ["wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["This pasture is enclosed enclosed with wire fencing and posts made from trees and lumber.", "The poles are made of wood.", "Fence posts line a pasture and the posts are brown."], "image": "val2014/COCO_val2014_000000141517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429357, "question_id": "VUCirQq7T7VZ6zbiFLko7k", "question": "What type of storm is coming?", "choices": ["rain", "snow", "tropical", "sand"], "correct_choice_idx": 1, "direct_answers": ["snow storm", "blizzard", "snow", "snowstorm", "snow storm", "cold storm", "snow", "thunderstorm", "snow", "snowstorm"], "difficult_direct_answer": false, "rationales": ["The storm is snowy.", "There is snow on the ground and they appear to be in a region where lots of snow is normal based on the trees. in the distance there are heavy clouds with precipitation which would likely result in snow at this place and altitude.", "The whole area has snow over it."], "image": "train2014/COCO_train2014_000000429357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81200, "question_id": "VUVZM5CQyxtcNg98nXdQ6i", "question": "The man is most likely closest to what birthday?", "choices": ["thirtieth", "first", "twentieth", "seventieth"], "correct_choice_idx": 3, "direct_answers": ["90", "ninetieth", "90", "seventieth", "ninety", "ninety", "eighty", "80", "nintyeth", "90"], "difficult_direct_answer": false, "rationales": ["The man is on the older side.", "Because the man looks aged in appearance.", "The man is older."], "image": "train2014/COCO_train2014_000000081200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78642, "question_id": "VUYdgmTDkM3qv2vEuXzHH2", "question": "Why is the foil being used?", "choices": ["retain moisture", "missing tray", "added flavor", "easy cleanup"], "correct_choice_idx": 3, "direct_answers": ["heat tortillas", "easy cleanup", "easy cleanup", "protect sheet", "wrap food", "baking", "protect pan", "cover pan", "keep clean", "to cook"], "difficult_direct_answer": true, "rationales": ["Any spills will not land on the tray.", "The foil is for cleanup.", "The foil is on top of a backing pan with food being places on it. the foil would keep the food separated from direct contact with the pan while cooking and make answer a viable."], "image": "train2014/COCO_train2014_000000078642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23051, "question_id": "VUacGGqcz8qmXv3TufPW4y", "question": "What was used to create the colorful art on the metro car?", "choices": ["markers", "pencil", "spray-paint", "crayons"], "correct_choice_idx": 2, "direct_answers": ["spray-paint", "paint", "spray paint", "spray paint", "spray paint", "spray paint", "paint", "spraypaint", "spray paint", "paint"], "difficult_direct_answer": false, "rationales": ["Spray paint is used for the drawing.", "These cans of paint are portable and easy to use", "The spray was sprayed using the spray-paint."], "image": "val2014/COCO_val2014_000000023051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142098, "question_id": "VUvGMQnC2jRZX7LCNNZEgB", "question": "What did the person in white just do?", "choices": ["rob someone", "drop bat", "buy shoes", "hit baseball"], "correct_choice_idx": 3, "direct_answers": ["bat", "hit ball", "struck ball", "swing", "hit baseball", "hit ball", "hit ball", "hit ball", "hit baseball", "hit baseball"], "difficult_direct_answer": false, "rationales": ["They are a batter and just swung at a pitch", "The player is still holding the bat. he just used it to perform his duty as a batter.", "He is still holding the bat. it would not be possible to rob someone or buy shoes in this part of the stadium."], "image": "train2014/COCO_train2014_000000142098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256983, "question_id": "VV452P3pATqapQ9M8VrRrQ", "question": "What is the smiling woman doing?", "choices": ["daydreaming", "talking", "listening", "singing"], "correct_choice_idx": 2, "direct_answers": ["talking phone", "talking", "on phone", "talking", "talking cell", "on phone", "communicating", "talking", "talking", "listening"], "difficult_direct_answer": false, "rationales": ["She has a flip phone up to her ear, and she's trying to hear what the person on the phone is saying.", "The woman is chatting away on her phone and listening to the conversation.", "A woman is smiling and hears a conversation."], "image": "val2014/COCO_val2014_000000256983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86282, "question_id": "VVPyCpwXZvWap7x7UdV4ey", "question": "The red envelopes on the table indicate that there is what electronic device in the room?", "choices": ["dvd player", "vhs player", "cd player", "cable box"], "correct_choice_idx": 0, "direct_answers": ["dvd player", "television", "remote", "dvd player", "dvd player", "dvd", "dvd player", "dvd player", "disc player", "dvd player"], "difficult_direct_answer": false, "rationales": ["These types of envelopes were commonly used with movie discs at one point.", "These are netflix dvds and to play the movies you need to have a dvd player to watch them.", "The envelopes are from netflix's movie rental service. the movies come on discs."], "image": "train2014/COCO_train2014_000000086282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274612, "question_id": "VVR74FuwjZWJ5Vbf2rA28o", "question": "Why are the bicycles lined up in a row?", "choices": ["to photograph", "to clean", "to rent", "to paint"], "correct_choice_idx": 2, "direct_answers": ["to rent", "style", "for rent", "parked", "parked", "parking", "for parking", "rentals", "parking", "parking"], "difficult_direct_answer": false, "rationales": ["The yellow umbrella suggests money must be exchanged before using the bikes.", "Bikes that all look alike are on display in front of a business.", "The bikes are available for people to rent and use."], "image": "val2014/COCO_val2014_000000274612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510719, "question_id": "VVSZdEYhvAm6tL4PqR2n42", "question": "What kind of product is likely hauled by the green truck?", "choices": ["furniture", "electronics", "waste", "wood"], "correct_choice_idx": 2, "direct_answers": ["trash", "waste", "trash", "trash", "waste", "trash", "trash", "waste", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["Green trucks generally carry garbage.", "The truck is a dump truck and would carry garbage away.", "The word before management on the side of the truck indicates what it hauls."], "image": "val2014/COCO_val2014_000000510719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406848, "question_id": "VVWYFVPNbJ2zCB6ZGfZNgD", "question": "What are the purplish strips on the pizza?", "choices": ["red carrots", "red onions", "eggplant", "cabbage"], "correct_choice_idx": 1, "direct_answers": ["onions", "onions", "onion", "onions", "onion", "onions", "red onion", "red onions", "onions", "onion"], "difficult_direct_answer": false, "rationales": ["This are the red onions on the pizza.", "The strips are red onions.", "The purple strips are red onions."], "image": "val2014/COCO_val2014_000000406848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25654, "question_id": "VVXr5GUSxzGJcihuJUhp5m", "question": "What are they doing to the green object?", "choices": ["inflating it", "cleaning it", "waxing it", "cutting it"], "correct_choice_idx": 3, "direct_answers": ["cutting it", "cutting it", "cutting it", "puncturing", "cutting", "cutting", "cutting", "cutting open", "cutting", "cutting"], "difficult_direct_answer": false, "rationales": ["One person is using scissors on it", "These are scissors they use", "That's what scissors are used for."], "image": "val2014/COCO_val2014_000000025654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120918, "question_id": "VVZrK5aAYtyZfMde7bRzN3", "question": "What animals are present?", "choices": ["deer", "elephant", "giraffe", "dog"], "correct_choice_idx": 1, "direct_answers": ["elephants", "elephants", "elephants", "elephants", "elephants", "elephant", "elephants", "elephants", "elephants", "elephants"], "difficult_direct_answer": false, "rationales": ["Elephants are shown.", "The height, size, tail, and coloring is of an elephant.", "Great and large animals are standing in a line. they have long tails and trunks."], "image": "val2014/COCO_val2014_000000120918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564726, "question_id": "VVjVSvQZA3vVMZVhawX3Y7", "question": "What kind of donuts are in the box?", "choices": ["pumpkin", "vanilla", "chocolate", "cinnamon"], "correct_choice_idx": 2, "direct_answers": ["chocolate glazed", "chocolate", "chocolate", "chocolate", "glazed donuts", "chocolate", "chocolate", "chocolate covered", "chocolate", "chocolate"], "difficult_direct_answer": false, "rationales": ["The donuts are dark brown, the same color of the flavor.", "The donuts have a dark brown coating, which is most consistent with donuts of this particular variety.", "Donuts that are this color are chocolate flavored"], "image": "val2014/COCO_val2014_000000564726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4129, "question_id": "VVvUmcdgvHS9smNzNbwXp4", "question": "What does the umbrella provide here?", "choices": ["signaling", "air protection", "shade", "rain protection"], "correct_choice_idx": 2, "direct_answers": ["shade", "shade", "shade", "protect sun", "shade", "shadow", "shade", "shade", "shade", "shade"], "difficult_direct_answer": false, "rationales": ["The umbrella is providing shade for the group.", "The umbrella has shade.", "They are on the beach and it is a sunny day"], "image": "train2014/COCO_train2014_000000004129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442301, "question_id": "VW2mPeAwpVFoVyhYt2qdTW", "question": "Vitamin A is rich in which vegetable?", "choices": ["tomato", "carrot", "beetroot", "broccoli"], "correct_choice_idx": 1, "direct_answers": ["carrots", "carrot", "carrots", "carrots", "carrots", "carrot", "carrot", "carrot", "carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["Carrots have vitamin a.", "Carrots are high in vitamin a.", "Carrots have vitamins."], "image": "val2014/COCO_val2014_000000442301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406723, "question_id": "VWCP7AmabrutZhgSHNar7R", "question": "Which of the woman's limbs is connected more directly to her surfboard?", "choices": ["right leg", "left arm", "left leg", "right arm"], "correct_choice_idx": 0, "direct_answers": ["legs", "right leg", "right foot", "feet", "right leg", "right leg", "feet", "left", "right leg", "feet"], "difficult_direct_answer": false, "rationales": ["The woman's right leg is connected.", "The woman's limbs are clearly visible and the one most in contact with the board is identifiable.", "The woman surfer has two of her limbs, or legs connected to the surfboard. the left foot need to be flexible so it can turn while the right foot remains directly fixed to the board for stability."], "image": "val2014/COCO_val2014_000000406723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489463, "question_id": "VWDgHsh2Vi7u7XQN7hdk7q", "question": "What is the design of the chair?", "choices": ["dotted", "striped", "checkered", "solid"], "correct_choice_idx": 1, "direct_answers": ["rainbow", "striped cushion", "striped", "striped", "highjack", "stripes", "pillow", "striped", "striped", "stripped"], "difficult_direct_answer": false, "rationales": ["The design of the chair is striped.", "It has lines going vertical and different colors.", "It has many colorful lines on the fabric"], "image": "train2014/COCO_train2014_000000489463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72714, "question_id": "VWEAk7KHazvtKY6Wunv85v", "question": "What are the cows walking on?", "choices": ["river", "forest", "roadway", "subway"], "correct_choice_idx": 2, "direct_answers": ["on road", "pavement", "road", "road", "road", "road", "road", "roadway", "street", "road"], "difficult_direct_answer": false, "rationales": ["The cows are in the road.", "This is an asphalt pathway", "The cows are walking together down a paved roadway."], "image": "val2014/COCO_val2014_000000072714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403389, "question_id": "VWWhGY4btseuEbyHQ6bqyM", "question": "What has caused the food on the plate to look shiny?", "choices": ["air", "sauce", "glare", "soda"], "correct_choice_idx": 1, "direct_answers": ["sauce", "oil", "oil", "glaze", "light", "glaze", "sauce", "gloss", "butter", "oil"], "difficult_direct_answer": false, "rationales": ["The plate is covered in sauce.", "The food is in sauce.", "The food is wet because of the liquid on it."], "image": "val2014/COCO_val2014_000000403389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269325, "question_id": "VWdbyZuG4b3Jy3Rp42h4XC", "question": "The green domed building serves which purpose?", "choices": ["food sales", "housing", "worship", "grocery store"], "correct_choice_idx": 2, "direct_answers": ["church", "church", "religious", "worship", "church", "church", "church", "parliament", "worship", "worship"], "difficult_direct_answer": false, "rationales": ["The green domed building is a religious place because of the crosses.", "There is a cross on top of the green domed building. the existence of the cross indicates that the building is associated with christianity.", "The domed building is a church."], "image": "train2014/COCO_train2014_000000269325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217471, "question_id": "VWmZbbzWTJyC4zJSijFF9b", "question": "What state is this field most likely in?", "choices": ["kansas", "maine", "florida", "new jersey"], "correct_choice_idx": 3, "direct_answers": ["cricket", "new jersey", "illinois", "new jersey", "ball field", "japan", "georgia", "new jersey", "new jersey", "new jersey"], "difficult_direct_answer": false, "rationales": ["There is a sign for manasquan bank. this bank is located in that state.", "There is a sign for a radio station of the state on the bleachers.", "There is a radio station billboard with a station called \"jersey\" on it which is likely in reference to the state. radio stations often only advertise at stadiums within their state."], "image": "train2014/COCO_train2014_000000217471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553531, "question_id": "VWrCYYFYMccaS8uNMnFnhk", "question": "What would be the best explanation for why someone parked an old bus indoors?", "choices": ["exhibit", "storage", "weather", "repair"], "correct_choice_idx": 0, "direct_answers": ["display", "display", "exhibition", "museum", "display", "museum", "exhibit", "display", "history", "display"], "difficult_direct_answer": false, "rationales": ["It is no longer used and there for people to look at", "It looks like it's in a museum and on display. there are other exhibits around it.", "The explanation is an exhibit."], "image": "train2014/COCO_train2014_000000553531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15906, "question_id": "VWtjCFv5VKTFYATr6nFS4f", "question": "What time will it be in twenty five minutes?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 0, "direct_answers": ["300", "259", "three oclock", "three", "three", "three", "three o'clock", "work time", "three", "eight"], "difficult_direct_answer": false, "rationales": ["The clock is currently showing 2:35.", "The time is 3.", "It will be three o'clock in twenty five minutes."], "image": "train2014/COCO_train2014_000000015906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471043, "question_id": "VXQgwYEqt4DyzHXVCvbLzT", "question": "What diet Soda is served here?", "choices": ["so ho", "diet coke", "diet pepsi", "fresca"], "correct_choice_idx": 2, "direct_answers": ["pepsi", "diet pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi", "pepsi"], "difficult_direct_answer": false, "rationales": ["The logo is on the sign.", "The soda is pepsi.", "The red, white, and blue logos on the menu boards indicate which soda is served here."], "image": "val2014/COCO_val2014_000000471043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388658, "question_id": "VXWtuQV7xWcpXAp3yMsEVV", "question": "What drink are the animals sitting next to?", "choices": ["soda", "coffee", "water bottle", "juice"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "water", "water", "water bottle", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The drink is water.", "There is a bottle of water next to each of the animals.", "Animals drink water."], "image": "val2014/COCO_val2014_000000388658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189153, "question_id": "VXqYXAHWaDxQ2mPyFQnt2Q", "question": "What is the thing on the baseball bat for?", "choices": ["warming up", "cheating", "just aesthetics", "making noise"], "correct_choice_idx": 0, "direct_answers": ["weight", "reduce impact", "protection", "protection", "weight", "warmup", "warm ups", "warming up", "bat protection", "weight"], "difficult_direct_answer": false, "rationales": ["It's thought to help increase speed after a.", "It looks like a weight maybe to enhance the batters swing. i honestly am not sure what the purpose is but the alternative answered do not seem to fit.", "It's on there just while the player warms up before playing"], "image": "train2014/COCO_train2014_000000189153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23639, "question_id": "VXwGuehUrzzLEX6x6amdLG", "question": "Why is he wearing a glove?", "choices": ["fashion", "health", "warmth", "catching"], "correct_choice_idx": 3, "direct_answers": ["catch balls", "to catch", "catch baseball", "catching", "hand protection", "protect hand", "pitcher", "catch ball", "catch ball", "to catch"], "difficult_direct_answer": false, "rationales": ["The person is in a baseball uniform and is throwing a baseball.", "The glove is to protect his hand from the ball hitting it.", "The man has a glove so he can catch."], "image": "train2014/COCO_train2014_000000023639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409591, "question_id": "VXx99EQSjkpyWBmtsHfZUh", "question": "What move does this player use?", "choices": ["forehand", "backhand", "serve", "lob"], "correct_choice_idx": 1, "direct_answers": ["backhand", "backhand", "hitting", "backhand", "backhand", "backhand", "tennis racket", "backhand", "backhand", "tennis racket"], "difficult_direct_answer": false, "rationales": ["The man has the racquet in a backhand move.", "Based on the orientation of the man's hand and the hand the man is using the racket, answer a is accurate.", "The back of his hand can be seen place in the direction of the hit, so it is called backhand."], "image": "train2014/COCO_train2014_000000409591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121162, "question_id": "VY39SrQ3tyrw9dJ9jQs6Dx", "question": "What is the back tire made out of?", "choices": ["metal", "plastic", "rubber", "resin"], "correct_choice_idx": 2, "direct_answers": ["rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber"], "difficult_direct_answer": false, "rationales": ["The back is made of rubber.", "The back tire is made of rubber.", "They're made of rubber."], "image": "val2014/COCO_val2014_000000121162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8647, "question_id": "VY3mwBFSU6LvqudtMLsWzR", "question": "What kind of location is this?", "choices": ["church", "retail", "office", "residential"], "correct_choice_idx": 1, "direct_answers": ["market", "shopping", "retail", "downtown", "city", "shopping center", "market", "storefront", "public", "urban"], "difficult_direct_answer": true, "rationales": ["A city sidewalk is lined with retail storefronts and products displayed outside.", "This is a retail location because there are stores and shops prominently situated in close proximity.", "There are many different businesses"], "image": "val2014/COCO_val2014_000000008647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35945, "question_id": "VYFPfTficy5xLCy8zpy3kL", "question": "Where does the young boy have his hands on?", "choices": ["cup", "table", "chair", "his knee"], "correct_choice_idx": 3, "direct_answers": ["knees", "chair", "behind", "his lap", "lap", "seat", "lap", "his knee", "chair", "chair"], "difficult_direct_answer": false, "rationales": ["The object is on his knee.", "A boy is visible in the picture and it is clear where his hands are.", "He is relaxing in a chair"], "image": "train2014/COCO_train2014_000000035945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135714, "question_id": "VYMoJw8hqM3hZcfX8reenG", "question": "What kind of road do we call this place?", "choices": ["freeway", "expressway", "one way", "intersection"], "correct_choice_idx": 3, "direct_answers": ["city street", "suburban", "street", "intersection", "residential", "seventh", "street", "public road", "intersection", "street"], "difficult_direct_answer": false, "rationales": ["The road is an intersection since it goes in different directions.", "We call this road an intersection.", "The road is an intersection."], "image": "train2014/COCO_train2014_000000135714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204036, "question_id": "VYUSUQfdeerNEfRMJo5iXY", "question": "What type of transportation is shown?", "choices": ["road", "air", "rail", "water"], "correct_choice_idx": 0, "direct_answers": ["public", "bus", "bus", "road", "bus", "bus", "public", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["A driveway of some sort.", "The transportation is a bus.", "These vehicles have rubber tires used on asphalt"], "image": "val2014/COCO_val2014_000000204036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467256, "question_id": "VYVioPqkVwVqLCwGrfeTUF", "question": "Why is she forbidden to go past the barrier?", "choices": ["impossible", "dangerous", "country border", "must pay"], "correct_choice_idx": 1, "direct_answers": ["danger", "its dangerous", "danger", "danger", "danger", "it's dangerous", "dangerous", "danger", "dangerous", "it's dangerous"], "difficult_direct_answer": false, "rationales": ["The red and white sign indicates that the area behind the barrier is not safe.", "The sign states the word, 'danger'.", "It's dangerous."], "image": "train2014/COCO_train2014_000000467256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78737, "question_id": "VYmvpPM2dhGHz2BdfFWqU6", "question": "What type of print is the man raising his hands wearing on his shirt?", "choices": ["animal", "psychedelic", "plaid", "circular"], "correct_choice_idx": 2, "direct_answers": ["plaid", "flannel", "plaid", "plaid", "checked", "plaid", "plaid", "plaid", "plaid", "plaid"], "difficult_direct_answer": false, "rationales": ["He is wearing a plaid shirt.", "The man is wearing a shirt that has a particular geometric pattern that is known as answer a.", "The print consists of rectangles, not circles. the print is not psychedelic or animal."], "image": "train2014/COCO_train2014_000000078737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132953, "question_id": "VZ9JTYDBNBDALd9N7vunH3", "question": "What is on the surfboard in the middle?", "choices": ["wheels", "cat", "person", "dog"], "correct_choice_idx": 2, "direct_answers": ["surfer", "man", "water", "person", "surfer", "man", "man", "surfer", "surfer", "person"], "difficult_direct_answer": false, "rationales": ["You can tell the species of this animal because of the ability to stand as well as the arms and legs and overall shape.", "This is a human riding a board", "He is writing it on the rough water"], "image": "val2014/COCO_val2014_000000132953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432410, "question_id": "VZoXcpouv2PEE4epyArGUQ", "question": "What made the tracks here?", "choices": ["elephants", "mince", "horses", "cars"], "correct_choice_idx": 2, "direct_answers": ["footprints", "horses", "horses hooves", "horses", "horses", "horses", "horses", "horses", "birds", "horses"], "difficult_direct_answer": false, "rationales": ["There are four-legged animals, not vehicles, in the background. they are too small to be elephants.", "The people are all riding them so they would not be leaving the tracks, the animal would.", "Horses all made the tracks over here."], "image": "train2014/COCO_train2014_000000432410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552630, "question_id": "Va4YT3qiam2MRzt5kqB2az", "question": "The person wearing what color of shirt enforces the game rules?", "choices": ["red", "white", "black", "blue"], "correct_choice_idx": 2, "direct_answers": ["black", "black", "umpire", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The umpire of the match wears black.", "The person in black is the umpire.", "Man in back of catcher is officiating the game."], "image": "train2014/COCO_train2014_000000552630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170476, "question_id": "VaKfBjer2BFgrQznkrnSWr", "question": "Where is the man while he is swimming?", "choices": ["in ocean", "in pool", "in river", "in lake"], "correct_choice_idx": 1, "direct_answers": ["pool", "above water", "hotel resort", "pool", "in pool", "pool", "pool", "pool", "hotel", "jumping"], "difficult_direct_answer": false, "rationales": ["The man is in the pool.", "The borders of the pool can be seen and deck chairs can be visible around.", "This is a man made water structure"], "image": "train2014/COCO_train2014_000000170476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142969, "question_id": "VaLhZzHNdXnZPdEtHXcPGg", "question": "Which train is accepting passengers?", "choices": ["left train", "neither", "right train", "both trains"], "correct_choice_idx": 0, "direct_answers": ["left", "left one", "left", "left train", "left", "left", "capitals und", "left train", "left train", "photo left"], "difficult_direct_answer": false, "rationales": ["There are people waiting on the left platform but none on the right.", "The train on the left is.", "Passengers are boarding on the left train."], "image": "train2014/COCO_train2014_000000142969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487487, "question_id": "VaZjEymVirVbVuEjt4hXoB", "question": "What is the form of cake is on the left?", "choices": ["cat", "teddy", "sheep", "fish"], "correct_choice_idx": 1, "direct_answers": ["bread", "bear", "bear", "bear", "bear", "bear", "bear", "bear", "teddy", "bear"], "difficult_direct_answer": false, "rationales": ["The cake is a teddy.", "The cake has the shape, features and look of answer a.", "The cake looks like a stuffed bear."], "image": "train2014/COCO_train2014_000000487487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470984, "question_id": "Vb6C6pJufe2B7F4azdoUPN", "question": "What is the most efficient way back up the hill?", "choices": ["running", "walking", "ski lift", "drive"], "correct_choice_idx": 2, "direct_answers": ["chairlift", "lift", "ski lift", "ski life", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["The ski lift is fastest.", "The ski lift is helpful.", "The ski lift is powered by electricity to move faster."], "image": "val2014/COCO_val2014_000000470984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463175, "question_id": "VbKHVNjhFc4KpbbTy8cjSC", "question": "What will this man have for dessert?", "choices": ["steak", "birthday cake", "watermelon", "ice cream"], "correct_choice_idx": 1, "direct_answers": ["cake", "cake", "cake", "cake", "birthday cake", "cake", "cake", "cake", "cake", "cake"], "difficult_direct_answer": false, "rationales": ["The man is wearing a party hat and has consumed the entire meal apart from desert. the last part of the meal will be desert.", "It's the man's birthday.", "The man has cake."], "image": "train2014/COCO_train2014_000000463175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95020, "question_id": "VbSeGPGNxh2TR5EpSQYFjK", "question": "What can this person obtain via the grille?", "choices": ["water", "light", "electricity", "heat"], "correct_choice_idx": 3, "direct_answers": ["heat", "information", "information", "cool air", "warmth", "laptop", "heat", "heat", "heat", "phone"], "difficult_direct_answer": false, "rationales": ["This is a radiator used to warm a room", "The grill like appliance is a heater that warms up a room also know as a boiler.", "This is there to keep the place warm when its cold."], "image": "train2014/COCO_train2014_000000095020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569050, "question_id": "VbT3yo6CYt47RGvnMWbpzi", "question": "Which country's flag is the furthest left in the group?", "choices": ["united states", "cuba", "canada", "czech republic"], "correct_choice_idx": 3, "direct_answers": ["czech republic", "czech republic", "czech republic", "france", "philippines", "taiwan", "czechoslovakia", "czech republic", "czech republic", "czechia"], "difficult_direct_answer": false, "rationales": ["The cuba flag is on the furthest to the left.", "I had to look this one up on google.", "The czech flag is the one that's the most distant."], "image": "train2014/COCO_train2014_000000569050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8095, "question_id": "VbWVVgRRM9vr84654TdbBA", "question": "The player wearing what color of shirt is likely to catch the frisbee?", "choices": ["white", "brown", "blue", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The person in black is closer to the frisbee and his hands are closer to it as well.", "There are two players. the frisbee is farther away from the player wearing the white shirt.", "The frisbee is in between the hands of the man on the left."], "image": "train2014/COCO_train2014_000000008095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403723, "question_id": "VbXWWCs9WtiucoYY5MPrAS", "question": "The men here prefer to stop at which elevation to start their day of fun?", "choices": ["here", "lower", "higher", "same"], "correct_choice_idx": 2, "direct_answers": ["higher", "hilltop", "peak", "ground", "high", "higher", "top", "low", "mountain", "high"], "difficult_direct_answer": false, "rationales": ["There is a snowy mountain in the back, and they are dressed warm with their skis in tow.", "They have skiing gear with them and skiers tend to ski from a high elevation to a lower height.", "They need to be up on the mountain to ski down it"], "image": "train2014/COCO_train2014_000000403723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261945, "question_id": "VbcL5xyon35u7g92GRatMr", "question": "What does the woman in blue laying down intend to do?", "choices": ["push ups", "crunches", "sunbath", "make out"], "correct_choice_idx": 2, "direct_answers": ["sunbath", "read", "read", "sunbathe", "sunbathe", "read book", "read books", "read", "busk", "tanning"], "difficult_direct_answer": false, "rationales": ["She is wearing just a bathing suit on a towel in the sun with sunglasses on", "A woman is laying in the sun with her shoulders and back exposed.", "She is laying in the sun to get a tan."], "image": "train2014/COCO_train2014_000000261945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168413, "question_id": "Vbgsjebf7WekYpckmCxbbv", "question": "What kind of transportation is this?", "choices": ["water", "highway", "rail", "air"], "correct_choice_idx": 2, "direct_answers": ["train", "train", "railway transportation", "train", "rail", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["A train is shown.", "Trains are rail transportation.", "That's what the trains run on."], "image": "train2014/COCO_train2014_000000168413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382731, "question_id": "VbismcVZzFqZUu5EsNbgk3", "question": "What are the players here likely celebrating?", "choices": ["break time", "raise", "homerun", "bonus"], "correct_choice_idx": 2, "direct_answers": ["point", "win", "homerun", "score", "winning", "run", "run", "run", "run", "home run"], "difficult_direct_answer": false, "rationales": ["The players are high fiving each other.", "They are batting and the person is coming into the dugout. he must have done something to score a run.", "The person is a batter for baseball, and they wouldn't be happy if he struck out."], "image": "train2014/COCO_train2014_000000382731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578856, "question_id": "VboC4T4EZZyRYXXvdFB9gU", "question": "Why is the man raising his hands?", "choices": ["to wave", "to dance", "to exercise", "controlling kite"], "correct_choice_idx": 3, "direct_answers": ["flying kite", "holding strings", "flying kite", "controlling kite", "flying kite", "flying kite", "flying kite", "flying kite", "holding strings", "holding strings"], "difficult_direct_answer": false, "rationales": ["The man is holding strings that are attached to a kite.", "He has the strings to keep it from flying away", "The man is guiding the kite with his hands."], "image": "train2014/COCO_train2014_000000578856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469199, "question_id": "Vc4Dr8PFGuY7QvunSDmtHA", "question": "What could help fix the color of this surface?", "choices": ["stucco", "crayons", "paint", "water"], "correct_choice_idx": 3, "direct_answers": ["fertilizer", "fertilizer", "fertilizer", "fertilizer", "water", "reseeding", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["Grass on a baseball field is green and brown.", "Water could make the grass green.", "The lawn needs water to be green."], "image": "train2014/COCO_train2014_000000469199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313372, "question_id": "VcBNgvwnsbevRiNucUnoVX", "question": "The symbol of which popular sporting event can be seen here?", "choices": ["superbowl", "indy 500", "olympics", "world cup"], "correct_choice_idx": 2, "direct_answers": ["olympics", "tennis", "olympics", "olympics", "tennis", "olympics", "olympics", "olympics", "tennis", "olympics"], "difficult_direct_answer": false, "rationales": ["People are playing tennis and a symbol with multiple circles is on display.", "There is a symbol on the magenta wall. it has five white rings.", "It has five interlocking rings."], "image": "val2014/COCO_val2014_000000313372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581542, "question_id": "VcBejpkLLGKf3Jsea6THwd", "question": "What cell phone company does this man support most recently?", "choices": ["att", "sprint", "t-mobile", "verizon"], "correct_choice_idx": 3, "direct_answers": ["verizon", "verizon", "verizon", "verizon", "verizon", "verizon", "verizon", "verizon", "verizon", "blackberry"], "difficult_direct_answer": false, "rationales": ["The man has a tag on his shirt and you can see the name of the company.", "He made a lot of commercials", "There is a company logo on his jacket."], "image": "val2014/COCO_val2014_000000581542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36606, "question_id": "VcGniayk9cbZumKSvZ9R6X", "question": "What are the Asian men trying to find?", "choices": ["hats", "luggage", "jackets", "meal"], "correct_choice_idx": 1, "direct_answers": ["luggage", "luggage", "luggage", "luggage", "suitcases", "luggage", "luggage", "their luggage", "luggage", "suitcase"], "difficult_direct_answer": false, "rationales": ["People stand at baggage claim at an airport.", "They are from the port hence looking the luggage.", "They were in front of a carousel with suitcases on it"], "image": "train2014/COCO_train2014_000000036606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279221, "question_id": "VcJSHHTmmwLBF8rSLNEC62", "question": "What type of sign is on the pole?", "choices": ["regulatory", "informational", "brand", "directional"], "correct_choice_idx": 3, "direct_answers": ["wanted ad", "one way", "directional", "wanted ad", "one way", "one way", "one way", "one way", "one way", "wanted ad"], "difficult_direct_answer": false, "rationales": ["There is a furniture company.", "This is an arrow that is pointing in a direction that is only one way.", "It has an arrow on it"], "image": "val2014/COCO_val2014_000000279221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175863, "question_id": "VcPEFzNHZ2GBVVrkUsoTFz", "question": "What state does the text on his shirt sound like?", "choices": ["new mexico", "new york", "utah", "california"], "correct_choice_idx": 1, "direct_answers": ["new york", "old", "new york", "old", "new york", "new york", "new york", "new york", "new york", "new york"], "difficult_direct_answer": false, "rationales": ["The state is new york.", "The state on his shirt sounds like 'new york'.", "The shirts word ends in york and is close to the state of new york."], "image": "val2014/COCO_val2014_000000175863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302353, "question_id": "VcddJJU3CfFsLT2W4jv9yH", "question": "In what continent is this hotel likely to be located?", "choices": ["asia", "africa", "europe", "north america"], "correct_choice_idx": 0, "direct_answers": ["usa", "asia", "tour area", "africa located", "europe", "europe", "asia", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The continent is asia.", "There is an image of an asian character on the wall and an asian sculpture on the table.", "There is an eastern style statue on the table."], "image": "train2014/COCO_train2014_000000302353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392177, "question_id": "VceKQf6uiEcKtZt9ejHprf", "question": "Why is he perfectly safe?", "choices": ["can fly", "photo manipulation", "wearing harness", "no gravity"], "correct_choice_idx": 1, "direct_answers": ["her foot", "surrealism", "below danger", "couple relationship", "rope", "balanced", "harnessed", "photo manipulation", "not real", "hes not"], "difficult_direct_answer": true, "rationales": ["The photo has been photoshopped.", "This photo is not actually really.", "The photo manipulation is safe."], "image": "train2014/COCO_train2014_000000392177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219943, "question_id": "VcejUvwmSiHG2Jddss67Xk", "question": "Which food element here is likely most sour?", "choices": ["fries", "meat", "bread", "pickle"], "correct_choice_idx": 3, "direct_answers": ["lemon", "pickle", "pickle", "pickle", "pickle", "lemon", "pickle", "pickle", "pickle", "pickle"], "difficult_direct_answer": false, "rationales": ["Pickles are made with vinegar.", "Green comes from something with bitter taste.", "Cucumbers fermented in vinegar are often sour. there are pickles in this picture and they are probably sour."], "image": "train2014/COCO_train2014_000000219943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575627, "question_id": "Vcj26Cf87ihaPtiDAnzQNg", "question": "What is a restaurant that specialises in this food?", "choices": ["kfc", "dominoes", "mcdonalds", "burger king"], "correct_choice_idx": 1, "direct_answers": ["pizzeria", "pizza place", "pizzeria", "pizzaria", "pizza parlor", "pizzeria", "italian", "pizza place", "italian", "dominoes"], "difficult_direct_answer": false, "rationales": ["That place is known for serving pizza.", "Mcdonald's and burger king serve burgers, while kfc serves fried chicken. dominos is the only one that serves pizza.", "Pizza is being served to children at a table."], "image": "train2014/COCO_train2014_000000575627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148781, "question_id": "VcqGKnRa8wjBDH6NJprEHR", "question": "Why are his hands stretched out?", "choices": ["balance", "grab ball", "falling", "new player"], "correct_choice_idx": 0, "direct_answers": ["swing racket", "playing tennis", "balance", "in motion", "for hitting", "balance", "balance", "hitting ball", "balance", "swing"], "difficult_direct_answer": false, "rationales": ["He's doing this to stay upright as he's about to swing", "Tennis players use their arms and legs to balance themselves when playing.", "The man is trying to stand up."], "image": "train2014/COCO_train2014_000000148781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510290, "question_id": "VctTS8P7YJXbQbqgXpVVyE", "question": "What could a person normally do in the small glass structure to the right?", "choices": ["superhero change", "sell fruit", "buy cokes", "phone call"], "correct_choice_idx": 3, "direct_answers": ["phone calls", "phone call", "make call", "phone call", "use payphone", "telephone call", "phone call", "phone", "phone call", "use phone"], "difficult_direct_answer": false, "rationales": ["Years ago these types of buildings housed pay phones.", "This is a phone booth", "There is a pay phone in the booth."], "image": "train2014/COCO_train2014_000000510290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150164, "question_id": "Vd4URiHhiUjCPEJoLdH4dd", "question": "The two people sharing an umbrella here are what to each other?", "choices": ["lovers", "police convict", "enemies", "boss employee"], "correct_choice_idx": 0, "direct_answers": ["friends", "friends", "family", "sweethearts", "lovers", "partners", "friends", "lovers", "couple", "couple"], "difficult_direct_answer": false, "rationales": ["They're lovers.", "They are embracing each other.", "The lady is holding onto the man's arm."], "image": "train2014/COCO_train2014_000000150164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550273, "question_id": "Vd8BGNCDneQXACnoAXSj7s", "question": "In what year did this company exit bankruptcy?", "choices": ["2016", "2014", "2008", "2021"], "correct_choice_idx": 3, "direct_answers": ["1999", "2021", "2021", "2021", "2021", "this year", "2021", "2005", "2021", "2009"], "difficult_direct_answer": false, "rationales": ["Ruby tuesday was able to exit bankruptcy this year.", "Th paper on the table refers to ruby tuesday. this company filed for bankruptcy protection in 2020.", "Ruby tuesday exited bankruptcy in 2021."], "image": "val2014/COCO_val2014_000000550273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282679, "question_id": "VdAac3mPEPe5XnCsXtc4aU", "question": "The umbrella is made of what material?", "choices": ["plastic", "bamboo", "denim", "polyester"], "correct_choice_idx": 1, "direct_answers": ["plastic", "straw", "bamboo", "straw", "sticks", "straw", "wooden sticks", "cloth", "fabric material", "sticks"], "difficult_direct_answer": false, "rationales": ["The umbrella is made of wood. a is the only option which is a type of wood.", "The umbrella is brown and is made out of a wood-like material. it is a naturally occurring material, not a synthetic one.", "The umbrella is made of wood."], "image": "train2014/COCO_train2014_000000282679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164128, "question_id": "VdCBafYypo272kotnvJaZF", "question": "What team is featured on her hat?", "choices": ["tampa bay", "baltimore", "atlanta", "boston"], "correct_choice_idx": 3, "direct_answers": ["something b", "braves", "boston", "red sox", "boston", "red sox", "red sox", "red sox", "red sox", "red sox"], "difficult_direct_answer": false, "rationales": ["The hat has a large stylized letter \"b\" appliqued on the front of it. it is a logo for a sports team from this city.", "Her hat has a red sox, not orioles, braves, or rays, logo.", "The team is boston."], "image": "train2014/COCO_train2014_000000164128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176571, "question_id": "VdEZbj4aSyYcujBXHMQNY9", "question": "Why are the people holding umbrellas?", "choices": ["to buy", "it's raining", "to dance", "it's snowing"], "correct_choice_idx": 1, "direct_answers": ["raining", "it's raining", "need shade", "raining", "raining", "raining", "raining", "rain", "it's raining", "rain"], "difficult_direct_answer": false, "rationales": ["The ground is wet and the people don't want to get wet.", "This hand-held device protects people from becoming wet and it can be used to block the sun.", "It's raining out."], "image": "train2014/COCO_train2014_000000176571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468224, "question_id": "VeRuQYyeMHVzKVqWXJ6zbH", "question": "What condiment is in the small white container next to the dish?", "choices": ["mustard", "blue cheese", "ranch", "butter"], "correct_choice_idx": 3, "direct_answers": ["butter", "mayonnaise", "butter", "mayo", "mayo", "butter", "mayonnaise", "butter", "mayo", "mayonnaise"], "difficult_direct_answer": false, "rationales": ["This is a yellowish fatty spread", "The condiment is yellowish and creamy, plus it's alongside a sweet roll which would likely have butter spread on it, so those clues indicate that the condiment is indeed butter.", "The butter is in the small bowl next to the plate."], "image": "train2014/COCO_train2014_000000468224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71122, "question_id": "VeyZJdW87rRJ9qDap6Zomj", "question": "What type of sign is on the building?", "choices": ["directional", "informational", "brand", "warning"], "correct_choice_idx": 1, "direct_answers": ["ski patrol", "ski patrol", "ski patrol", "ski patrol", "skii patrol", "ski patrol", "skii patrol", "ski patrol", "ski patrol", "informational"], "difficult_direct_answer": false, "rationales": ["The sign says \"ski patrol\" and is directly over the exterior entrance to the building.", "The sign tells the skiiers who is in the building.", "The sign is informational."], "image": "train2014/COCO_train2014_000000071122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313315, "question_id": "Vf92gquDB8a64gewfAAbss", "question": "What type of plane is behind the man?", "choices": ["drone", "airliner", "biplane", "jet"], "correct_choice_idx": 2, "direct_answers": ["airplane", "private plane", "cessna", "cessna 172rg", "cessna", "cessna", "small aircraft", "cessna 172rg", "crop duster", "biplane"], "difficult_direct_answer": false, "rationales": ["The plane has two wings.", "This is called a biplane.", "It is called a biplane because of the type it is."], "image": "val2014/COCO_val2014_000000313315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68595, "question_id": "VfBGz5VGBuhbwu49uNMFUn", "question": "What is the name of the street?", "choices": ["flint", "green", "exchange", "apple"], "correct_choice_idx": 2, "direct_answers": ["exchange", "exchange", "exchange street", "exchange", "exchange", "exchange street", "exchange", "exchange street", "exchange", "exchange"], "difficult_direct_answer": false, "rationales": ["The street says exchange on it.", "The street sign says exchange on it.", "The name is an exchange."], "image": "train2014/COCO_train2014_000000068595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54594, "question_id": "VfSebjJztR95KFAq5fBVvp", "question": "What are the animals doing?", "choices": ["running", "sleeping", "resting", "working"], "correct_choice_idx": 2, "direct_answers": ["herding", "grazing", "walking", "resting", "resting", "hanging out", "grazing", "grazing", "resting", "eating"], "difficult_direct_answer": false, "rationales": ["The animals are taking a breather.", "Some are laying down and some are eating", "Cows typically take a load off in big groups while laying down."], "image": "train2014/COCO_train2014_000000054594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44165, "question_id": "VfXbKqJpnaFxSf9N2NbT3q", "question": "What beverage is being enjoyed here?", "choices": ["beer", "milk", "soda", "water"], "correct_choice_idx": 1, "direct_answers": ["milk", "milk", "milk", "milk", "water", "milk", "milk", "milk", "water", "milk"], "difficult_direct_answer": false, "rationales": ["Milk is being enjoyed.", "The animal is an elephant, not a person. the beverage is white, not brown or clear.", "The elephant is drinking from a milk carton."], "image": "val2014/COCO_val2014_000000044165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560644, "question_id": "VfacsCJSAAXnQzmyoZrQEE", "question": "What is the possible danger shown in the scene?", "choices": ["drowning", "drug overdose", "intoxication", "cardiac arrest"], "correct_choice_idx": 3, "direct_answers": ["overdose", "drowning", "overdoes", "cardiac arrest", "overdose", "drowning", "alcohol pills", "overdose", "alcohol pills", "overdose"], "difficult_direct_answer": false, "rationales": ["Though a few of the answers are possible, do too many of the things pictured can lead to death.", "The possible danger of these drugs are supposedly cardiac arrest.", "A person is sleeping in a tub and pills are spilled on the floor nearby."], "image": "train2014/COCO_train2014_000000560644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497265, "question_id": "VfhKiRHkEZtHvJnNGfu742", "question": "What geographical feature is likely visible from here?", "choices": ["wading pool", "ocean", "pond", "none"], "correct_choice_idx": 1, "direct_answers": ["ocean", "beach", "beach", "beach", "ocean", "beach", "beach", "hill", "beach", "sand dunes"], "difficult_direct_answer": false, "rationales": ["The woman is on the ocean beach.", "The ocean must be close to the sand.", "This is a lot of sand so probably at a beach"], "image": "train2014/COCO_train2014_000000497265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52901, "question_id": "Vfi8paxA7fuWg3Z6tvuXsb", "question": "How do these people know each other?", "choices": ["coworkers", "spouses", "teammates", "siblings"], "correct_choice_idx": 3, "direct_answers": ["same family", "siblings", "siblings", "siblings", "by names", "siblings", "siblings", "siblings", "siblings", "siblings"], "difficult_direct_answer": false, "rationales": ["These people must be siblings since they're young kids.", "They are siblings because they are wearing shirts that says so", "These girls look alike and are likely sisters."], "image": "train2014/COCO_train2014_000000052901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491942, "question_id": "VfqYu52xZgxDU4Ajhh5SyX", "question": "Where will the passengers enter?", "choices": ["rear", "side", "top", "front"], "correct_choice_idx": 1, "direct_answers": ["side door", "through door", "street door", "side door", "door", "door", "side", "right door", "side", "bus"], "difficult_direct_answer": false, "rationales": ["The passengers go on the side.", "The passengers will use the side door.", "There is only one door and it's easy to see."], "image": "train2014/COCO_train2014_000000491942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360566, "question_id": "VfxqWCS3CpDg6JgQmoEkZc", "question": "What gives the water that color?", "choices": ["oil spill", "dye", "sewage", "algae"], "correct_choice_idx": 3, "direct_answers": ["algae", "algae", "plants/algae", "leaves", "vegetation", "algae", "moss", "vegetation", "lilly pads", "algae"], "difficult_direct_answer": false, "rationales": ["The water is colored by green algae.", "Water becomes greenish when it has an abundance of algae, which matches the condition of this body of water.", "There is a lot of growth in the water because it's sunny"], "image": "train2014/COCO_train2014_000000360566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172925, "question_id": "VgABucbPGH8NazCgr4Nr6H", "question": "What province did this event take place?", "choices": ["alberta", "british columbia", "saskatchewan", "pei"], "correct_choice_idx": 1, "direct_answers": ["british columbia", "british columbia", "vancouver", "vancouver", "vancouver", "british columbia", "british columbia", "vancouver", "british columbia", "british columbia"], "difficult_direct_answer": false, "rationales": ["The province is bc.", "On the podium it clearly states the city (vancouver) and year in which this olympic event took place. as many people know, vancouver is part of the british columbia province.", "The 2010 olympics took place in the southwest corner of canada."], "image": "train2014/COCO_train2014_000000172925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213725, "question_id": "VgAywJFATBRwxAin4ga898", "question": "Where are these objects probably from?", "choices": ["south america", "turkey", "vietnam", "china"], "correct_choice_idx": 3, "direct_answers": ["china", "asia", "abroad", "asia", "oriental country", "store", "history", "asia", "china", "japan"], "difficult_direct_answer": false, "rationales": ["These oriental objects are likely from china.", "The pattern on the umbrella is asian. chinese people were very creative in the arts.", "The umbrella is from asian cultures."], "image": "train2014/COCO_train2014_000000213725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284160, "question_id": "VgP4es8Ftx7XcZxV2wSMhj", "question": "What persons most frequently ride this conveyance?", "choices": ["cable workers", "prisoners", "tourists", "commuters"], "correct_choice_idx": 2, "direct_answers": ["tourists", "tourists", "tourist", "tourists", "passengers", "operator", "tourists", "tourists", "tourists", "kids"], "difficult_direct_answer": false, "rationales": ["The bus is brightly colored. there are a lot of advertisements on it.", "People that want to see the sites.", "Tourists would ride this bus to look around."], "image": "val2014/COCO_val2014_000000284160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313818, "question_id": "VgUTvJAy7nmpAb2Dj9i8sJ", "question": "Why is there snow up there?", "choices": ["storm coming", "high altitude", "is mirage", "not shoveled"], "correct_choice_idx": 1, "direct_answers": ["altitude", "precipitation", "snowy peaks", "high elevation", "cold", "mountain", "winter", "high altitude", "cold", "mountain"], "difficult_direct_answer": false, "rationales": ["The mountains appear quite tall and mountains at this height are known to have snow at the top", "The mountain is at a higher elevation than the rest of the town.", "In the foreground of this photo there is a village with no snow. in the background is a mountain range with snow on its top. the biggest different is that mountaintops are at a much higher altitude."], "image": "train2014/COCO_train2014_000000313818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337621, "question_id": "VgX8Hs38qFJxeXbtosgnWu", "question": "The skiers will have difficulty concentrating on skiing because distracts them?", "choices": ["snow", "lift", "their costumes", "their boots"], "correct_choice_idx": 2, "direct_answers": ["no snow", "costumes", "red hat", "child", "outfits", "costumes", "costumes", "their costumes", "costume", "silly outfits"], "difficult_direct_answer": false, "rationales": ["The skiers are wearing very wild costumes so they can be a distraction.", "One of the kids is wearing a set of wings and another has a cat in the hat style hat on their head. they are dressed up perhaps for halloween.", "The skiers are in costume."], "image": "train2014/COCO_train2014_000000337621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555254, "question_id": "VgZThpFzXv5SwKxx7DgK6X", "question": "Who is wearing the most safety gear?", "choices": ["bike", "no one", "man", "woman"], "correct_choice_idx": 2, "direct_answers": ["driver", "man", "front rider", "man", "driver", "driver", "man", "driver", "guy", "driver"], "difficult_direct_answer": false, "rationales": ["The man is wearing a helmet, gloves, elbow pads and knee pads.", "The woman doesn't have a helmet or any pads. the man has all the safety gear on.", "The man has the gear."], "image": "train2014/COCO_train2014_000000555254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578369, "question_id": "VgeR7UHSJBbapxB5FxgaSe", "question": "Why have these people gathered?", "choices": ["to compete", "to eat", "to play", "to work"], "correct_choice_idx": 1, "direct_answers": ["to eat", "restaurant", "meal", "meal", "to chat", "lunch", "breakfast", "to eat", "lunch time", "breakfast"], "difficult_direct_answer": false, "rationales": ["The people are eating.", "They are all gathered at a table with food and drinks.", "The people are gathered to eat a meal."], "image": "train2014/COCO_train2014_000000578369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347829, "question_id": "VgnHd2oXbn9TNmzKZTVDqi", "question": "What country is this?", "choices": ["south korea", "japan", "canada", "china"], "correct_choice_idx": 1, "direct_answers": ["japan", "usa", "china", "japan", "china", "japan", "asia", "japan", "china", "japan"], "difficult_direct_answer": false, "rationales": ["The writing on the bus is japanese.", "The bus driver is driving on the right side of the bus.", "Mioggi group can be found in country with red dot flag."], "image": "train2014/COCO_train2014_000000347829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353321, "question_id": "Vgtx9d4EfzjffXzMRoijeq", "question": "Why are they so close together?", "choices": ["save money", "little space", "protection", "friends"], "correct_choice_idx": 3, "direct_answers": ["talking", "bench", "talking", "couple", "talking", "sharing food", "dating", "in relationship", "friends", "talking"], "difficult_direct_answer": false, "rationales": ["They're friends.", "They wouldn't sit that close if they were not friends.", "These people likely know each other and are friends with each other."], "image": "val2014/COCO_val2014_000000353321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384811, "question_id": "VgwQQ4cnGJVFuU3fdfU49L", "question": "What can be prevented by the man holding onto the top of the blender?", "choices": ["loud sound", "overflowing", "spilling", "falling"], "correct_choice_idx": 2, "direct_answers": ["splashing", "mess", "food everywhere", "spilling", "spillage", "spillage", "spilling", "spilling", "spill", "splashes"], "difficult_direct_answer": false, "rationales": ["Spilling is prevented.", "Blenders spill easily.", "If there is not a lid on the blender, when it is turned on the contents would likely explode all over. sometimes the lid doesn't stay on tightly so by holding on to it he makes sure it stays sealed."], "image": "val2014/COCO_val2014_000000384811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27805, "question_id": "Vh6CECjY79g39Cqewrzshn", "question": "How many teeth does the baby have?", "choices": ["32", "20", "15", "ten"], "correct_choice_idx": 1, "direct_answers": ["one", "zero", "zero", "zero", "zero", "20", "one", "zero", "one", "zero"], "difficult_direct_answer": false, "rationales": ["The baby has about 20 baby teeth.", "Adults have 32 teeth. babies have fewer, but more than 15, teeth.", "The baby has at most 20 teeth."], "image": "val2014/COCO_val2014_000000027805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560150, "question_id": "VhDasAwjo3mt5VFESEfpn3", "question": "Why has he covered his eyes?", "choices": ["disguise", "shame", "protection", "fashion"], "correct_choice_idx": 2, "direct_answers": ["snow protection", "sun glare", "snow", "skiing fast", "sun protection", "protection", "protection", "protection", "snowing", "sunshine"], "difficult_direct_answer": false, "rationales": ["The man is skiing. the goggles prevent things from injuring his eyes.", "The man needs protection.", "This helps with the bright sun and snow"], "image": "train2014/COCO_train2014_000000560150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495578, "question_id": "VhFc9qCAypMYUkaQEF2Qns", "question": "What is the player ready to do?", "choices": ["dunk", "swing", "bounce", "run"], "correct_choice_idx": 1, "direct_answers": ["hit ball", "return ball", "hit", "return ball", "hit ball", "hit ball", "score", "hit", "play", "swing"], "difficult_direct_answer": false, "rationales": ["The player in ready to return the ball.", "The player is ready to swing his tennis racket.", "The other options don't apply to the sport of tennis."], "image": "val2014/COCO_val2014_000000495578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59593, "question_id": "VhPNeDG2DLCCuEXEa5x2Zq", "question": "What is the young girl doing with the white object?", "choices": ["singing", "playing game", "exercising", "dancing"], "correct_choice_idx": 1, "direct_answers": ["playing nintendo", "video games", "playing game", "playing game", "playing game", "playing game", "playing", "video games", "playing wii", "playing game"], "difficult_direct_answer": false, "rationales": ["A girl is holding a video game controller in her hand.", "That is a controller in her hand", "The girl is trying to play wii."], "image": "train2014/COCO_train2014_000000059593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247543, "question_id": "VhR3d4sNm5cp9sBhsgJiqQ", "question": "What is on the grass?", "choices": ["women", "animals", "babies", "men"], "correct_choice_idx": 1, "direct_answers": ["cows", "cow", "cow", "cow", "cows", "animals", "cow", "cow", "cow", "down grass"], "difficult_direct_answer": false, "rationales": ["There are calfs on the grass.", "Cattle can be seen feeding on the grass", "Cows are in the grass."], "image": "val2014/COCO_val2014_000000247543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25668, "question_id": "Vi3bLttfheANeVKDag8z3J", "question": "How are the two people under the umbrella likely related?", "choices": ["parent child", "strangers", "siblings", "enemies"], "correct_choice_idx": 2, "direct_answers": ["family", "siblings", "friends", "couple", "siblings", "friends", "siblings", "siblings", "siblings", "siblings"], "difficult_direct_answer": false, "rationales": ["The boy and girls are siblings.", "Sisters and brothers are usually close sharing many things.", "Friendly enough to share an umbrella, but contemptuous enough to frown, this is a sister and brother."], "image": "val2014/COCO_val2014_000000025668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424303, "question_id": "Vi6enmhjSaW7tnEApBXyn3", "question": "What condition of this place is favorable to this sport?", "choices": ["clean water", "fine sand", "big waves", "deep water"], "correct_choice_idx": 2, "direct_answers": ["windy", "big waves", "water", "temperature", "big waves", "good weather", "ocean", "big waves", "waves", "waves"], "difficult_direct_answer": false, "rationales": ["Waves are needed to surf.", "A person is surfing on a board on the water.", "The ocean is a great place for this especially when it is more windy out."], "image": "train2014/COCO_train2014_000000424303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343264, "question_id": "ViK8RQd979e5f7Rvvc9QEv", "question": "What type of team is this?", "choices": ["pack", "little league", "crowd", "league"], "correct_choice_idx": 1, "direct_answers": ["little league", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer"], "difficult_direct_answer": false, "rationales": ["They are all kids playing the game of soccer.", "The players are all just kids.", "They're playing soccer."], "image": "train2014/COCO_train2014_000000343264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561979, "question_id": "ViPYzLQHZ8CmBu5HVmTWG9", "question": "What is on the sand?", "choices": ["seals", "boats", "artichokes", "humans"], "correct_choice_idx": 1, "direct_answers": ["boats", "boats", "boats", "boats", "dog", "boats dog", "boats", "dog", "boats", "boats"], "difficult_direct_answer": false, "rationales": ["The two vessels docked on the sand are boats.", "You will never sea a seal or an artichoke on sand and there are no humans here.", "There are vehicles, not seals, people, or vegetables, on the sand."], "image": "val2014/COCO_val2014_000000561979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496897, "question_id": "ViWyA6wSsC5gxnfFQKUQXk", "question": "To whom is this man going to buy stuff from this shop?", "choices": ["his kids", "his parents", "his dogs", "his wife"], "correct_choice_idx": 2, "direct_answers": ["dogs", "his dogs", "cashier", "dogs", "shop owner", "his pets", "dog", "dogs", "dogs", "dogs"], "difficult_direct_answer": false, "rationales": ["The exterior sign of the shop identifies it as being for cats and dogs, and the man is being followed by a number of dogs. it is not uncommon for people to bring their dogs to a dog supply store.", "The shop has a sign on its window that indicates that it sells items for cats and another type of animal. there are four of these animals behind the man.", "The man is entering a specialized pet store so unless he has cats at home he is probably buying for the four animals that he has with him."], "image": "val2014/COCO_val2014_000000496897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440752, "question_id": "ViZeWJ6N4FLi49FMszPwkt", "question": "Which lens is used in bus side mirror?", "choices": ["convex", "pin point", "cortex", "concave"], "correct_choice_idx": 0, "direct_answers": ["mirror", "reflective", "glass", "wide angle", "sideview", "circle", "digital camera", "convex", "round", "convex"], "difficult_direct_answer": true, "rationales": ["The convex lens is used in the mirror.", "The opposite of this type is concave and curves inward.", "Convex lens is round."], "image": "train2014/COCO_train2014_000000440752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39592, "question_id": "VisapjqY7Rnn9ry2jn7atJ", "question": "Why might the zebras be gathering here?", "choices": ["treats", "fear", "curiosity", "attention"], "correct_choice_idx": 0, "direct_answers": ["treats", "food", "food", "food", "for food", "for food", "food", "for food", "food", "food"], "difficult_direct_answer": false, "rationales": ["There are spectators, which typically give the zebras food.", "The zebras want snacks.", "Zebras are all standing near a fence wear people have gathered."], "image": "val2014/COCO_val2014_000000039592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381683, "question_id": "Vj2sP6aXM6uBTah3fq2BtV", "question": "What is unusual about the horses?", "choices": ["color", "tails", "legs", "hooves"], "correct_choice_idx": 1, "direct_answers": ["short tails", "cut tails", "tail", "tails", "tails", "short tail", "short tails", "rodeo", "short tails", "tails"], "difficult_direct_answer": false, "rationales": ["The horses have unusually short tails.", "Their tails are chopped.", "They're missing tails."], "image": "train2014/COCO_train2014_000000381683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578506, "question_id": "VjEDErkXSARjCFAM6ZxVES", "question": "What kind of building is the one with blue rails?", "choices": ["police station", "school", "bus station", "government"], "correct_choice_idx": 0, "direct_answers": ["police station", "police headquarters", "police station", "police station", "police station", "police station", "police headquarters", "police headquarters", "police station", "police station"], "difficult_direct_answer": false, "rationales": ["The building is for police.", "This is a police station.", "The building with the blue rails has a sign that says \"barnet police station\"."], "image": "train2014/COCO_train2014_000000578506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577169, "question_id": "VjTCxbkf8gz8Eoxy9iNmpP", "question": "What are this group of people doing?", "choices": ["sightseeing", "queueing", "attending conference", "watching sunset"], "correct_choice_idx": 0, "direct_answers": ["looking", "looking outside", "looking", "overlooking city", "looking", "sightseeing", "sightseeing", "sightseeing", "looking out", "sightseeing"], "difficult_direct_answer": false, "rationales": ["They might also be doing b or c in addition to a, but a alone is the most likely answer.", "The people are looking through the clock.", "The people here are most likely tourist viewing from the clock the scenery below."], "image": "val2014/COCO_val2014_000000577169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172498, "question_id": "Vjb4M2j3xxJgc5Bmgdbuch", "question": "Where does the ladder on the bus give access to?", "choices": ["undercarriage", "engine", "roof", "cab"], "correct_choice_idx": 2, "direct_answers": ["roof", "window", "roof", "roof", "roof", "top", "roof", "roof", "bus top", "bus roof"], "difficult_direct_answer": false, "rationales": ["The roof has a rack for storing luggage so the ladder is used to get up on the roof.", "A person uses the ladder at the end of the bus to climb onto the roof.", "The ladder is for the roof."], "image": "val2014/COCO_val2014_000000172498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70558, "question_id": "VjeEvXPi8QQDmzJDbMTcEB", "question": "What is the binomial classification of these birds?", "choices": ["ara ararauna", "ara glaucogularis", "ara ambiguus", "ara macao"], "correct_choice_idx": 0, "direct_answers": ["ara ararauna", "parrots", "parrot", "parrot", "parrot", "parrot", "parrot", "parrots", "parrots", "parrot"], "difficult_direct_answer": false, "rationales": ["These birds are called ara ararauna.", "The image depicts macaws who have a option d as the binomial classification.", "The animal is ara ararauna."], "image": "train2014/COCO_train2014_000000070558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294537, "question_id": "VjgjtBz89d6AmM2JWdELCM", "question": "Who is skiing with the man in front?", "choices": ["his grandmother", "no one", "someone unseen", "those behind"], "correct_choice_idx": 3, "direct_answers": ["two others", "children", "photographer", "brother", "child", "boy", "kids", "two people", "people", "those behind"], "difficult_direct_answer": true, "rationales": ["There are companions following.", "Since they're all so close together compared to how open the trail is, this would imply they are all skiing together.", "The person is in front of the others."], "image": "val2014/COCO_val2014_000000294537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361009, "question_id": "VkGG65eVL3GqKqRPKioEKo", "question": "What does this food truck only accept for payment?", "choices": ["credit", "debit", "bitcoin", "cash"], "correct_choice_idx": 3, "direct_answers": ["cash", "cash only", "cash", "cash", "cash", "cash", "cash only", "cash", "cash", "cash"], "difficult_direct_answer": false, "rationales": ["The truck has cash.", "The truck indicates it's a cash only business.", "The sign says cash only."], "image": "train2014/COCO_train2014_000000361009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51618, "question_id": "VkSffi8kQQ8WrwmZxkSfqW", "question": "What is she doing with the sandwich?", "choices": ["stealing it", "cleaning it", "sharing it", "eating it"], "correct_choice_idx": 3, "direct_answers": ["eating photographing", "eating", "eating", "eating", "eating it", "eating it", "eating", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The meal is brought at the table to the customer to be feed.", "There are bites out of it", "There are bite marks in it and she has it pulled up to her mouth"], "image": "train2014/COCO_train2014_000000051618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314570, "question_id": "VkUy5zkcQmjjRpRrDVnbjj", "question": "Where are they going?", "choices": ["in lake", "around lake", "behind bus", "on bus"], "correct_choice_idx": 3, "direct_answers": ["south", "bus", "mountains", "tour", "exploring", "south", "onto bus", "to bus", "on bus", "touring"], "difficult_direct_answer": true, "rationales": ["These people are getting onto the bus which is nearby them.", "There is a large somewhat rectangular vehicle that can carry many passengers", "The people are walking to the vehicle."], "image": "train2014/COCO_train2014_000000314570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240944, "question_id": "VkhkYtfXzM8pgUhqx39TVB", "question": "What is the woman ready to do?", "choices": ["serve", "dribble", "run", "punt"], "correct_choice_idx": 0, "direct_answers": ["serve", "serve", "serve", "serve", "serve", "serve", "serve", "serve", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The woman is positioned to throw the ball in the air and hit it to her opponent. this is the typical first move in a tennis game.", "She has the ball in her hand and this is the only time that happens", "With the setting she is in and the outfit she is wearing you can tell what is about to happen."], "image": "train2014/COCO_train2014_000000240944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356362, "question_id": "Vkz7wSQ2VSJsYmYMyE3Ht2", "question": "What is the general term give to the place above?", "choices": ["railway", "packing", "station", "airport"], "correct_choice_idx": 3, "direct_answers": ["airport", "airport", "airport", "sky", "airport", "airport", "airport", "airport", "airport", "germany"], "difficult_direct_answer": false, "rationales": ["The plane is landed at an airport.", "The term is an airport.", "A plane is using a runway to take off. a second plane is taxiing."], "image": "train2014/COCO_train2014_000000356362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48319, "question_id": "VmHsMA88YCkfrewkR2eEjS", "question": "What type of event is this?", "choices": ["zoo", "dog show", "theme park", "sporting event"], "correct_choice_idx": 1, "direct_answers": ["dog frisbee", "dog frisbee", "pet", "dog show", "frisbee competition", "dog show", "frisbee", "frisbee tournament", "dog", "dog show"], "difficult_direct_answer": false, "rationales": ["The one competing is the dog. everyone else is watching.", "The other options are unlikely and wouldn't match what is shown with the dog and frisbee.", "A dog is jumping in the air while other people watch him."], "image": "train2014/COCO_train2014_000000048319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456525, "question_id": "VmMcdkkKW6Bga7MJ8qi2yf", "question": "The man lying down uses the umbrella for what?", "choices": ["rain protection", "signaling", "shade", "advertising"], "correct_choice_idx": 2, "direct_answers": ["protection", "shade", "shade", "shade", "shade", "for shade", "give shade", "shade", "shade", "shade"], "difficult_direct_answer": false, "rationales": ["It surely isn't raining and without this protection he is going to get a nasty sunburn.", "The umbrella is used for protection.", "The man wants shade."], "image": "val2014/COCO_val2014_000000456525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179322, "question_id": "VmPE76Ye7kcr2AwLEZmHDG", "question": "What does the man hold?", "choices": ["check mark", "glasses", "microwave part", "dishes"], "correct_choice_idx": 2, "direct_answers": ["plastic", "plastic part", "broken piece", "door clasp", "oven part", "plastic piece", "microwave part", "plastic", "plastic piece", "latch"], "difficult_direct_answer": false, "rationales": ["The piece in the man's hand looks much like the black piece still on the door of the microwave. a microwave should have two of these pieces to hold the door closed while in operation.", "The man is holding onto a plastic microwave prong.", "He is standing near a microwave. there is a piece of plastic in his hand."], "image": "val2014/COCO_val2014_000000179322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136542, "question_id": "VmkcurxEX6GBaM8a93dhrd", "question": "What is the chopped object near the baby elephant?", "choices": ["food", "tree trunk", "potatoes", "car tires"], "correct_choice_idx": 1, "direct_answers": ["log", "trees", "tree stump", "wood", "tree trunk", "stump", "tree", "tree stump", "tree", "wood"], "difficult_direct_answer": false, "rationales": ["The object is a trunk.", "There are stumps of wood.", "There is a chopped piece of wood near the baby elephant."], "image": "train2014/COCO_train2014_000000136542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195857, "question_id": "VmmS8L2cHZxWbjAhgdBJBW", "question": "What animals can be seen behind the closed doors?", "choices": ["cows", "rats", "chickens", "horses"], "correct_choice_idx": 3, "direct_answers": ["horse", "horse", "horses", "horses", "horses", "horse", "horses", "horses", "horses", "horses"], "difficult_direct_answer": false, "rationales": ["Horses could be seen sitting behind the closed doors.", "There are many different horses in the picture and it's a horse stable.", "These are stalls in a barn"], "image": "train2014/COCO_train2014_000000195857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364571, "question_id": "VmnLaQcMKsoKGWYMjo7phW", "question": "What kind of animals are these in relation to their diets?", "choices": ["nonmajors", "omnivores", "carnivores", "herbivores"], "correct_choice_idx": 3, "direct_answers": ["carnivores", "herbivores", "herbivores", "herbivores", "herbivores", "carnivores", "herbivores", "herbivores", "herbivores", "herbivores"], "difficult_direct_answer": false, "rationales": ["These animals are zebras and wildebeests. they have plant-based diets.", "The animals are eating grass.", "The animals seen here are herbivores, and they get their nourishment from grasses and plants as they spend their days endlessly grazing."], "image": "train2014/COCO_train2014_000000364571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248655, "question_id": "VmpMWPxtRhQ4psLwf2TjLM", "question": "Why is she not moving?", "choices": ["is posing", "is lost", "is confused", "is scared"], "correct_choice_idx": 0, "direct_answers": ["posing picture", "taking picture", "posing", "taking photo", "posing", "posing", "posing", "taking photo", "is posing", "posing"], "difficult_direct_answer": false, "rationales": ["The woman is standing at the bottom of the slope smiling as she stares forward indicating that someone is taking her picture.", "She's posing.", "She is smiling for the camera."], "image": "val2014/COCO_val2014_000000248655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178951, "question_id": "VmsirKXrs6Fpcy5nm8GjDF", "question": "Where are all these suitcases most likely on display?", "choices": ["train station", "museum", "airport", "store"], "correct_choice_idx": 3, "direct_answers": ["being sold", "store", "store", "super market", "baggage claim", "department store", "at market", "store", "to sell", "department store"], "difficult_direct_answer": false, "rationales": ["They are mostly all alike and there are other items in the room", "This is a store as indicated by the shelves with price stickers in the background.", "The items are on display in a store and are for sale in the public."], "image": "train2014/COCO_train2014_000000178951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485808, "question_id": "Vn7GWvuTYawpjFsHyu6LUm", "question": "Which direction was the board traveling in?", "choices": ["up", "left", "down", "right"], "correct_choice_idx": 2, "direct_answers": ["up", "up", "down", "up", "down", "upwards", "right", "right", "up", "up"], "difficult_direct_answer": false, "rationales": ["The skateboarder is going down the ramp.", "He has just done a trick at the top and the board is still on its way down to connect with the surface again", "He just flipped it at the top and is now landing"], "image": "train2014/COCO_train2014_000000485808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461256, "question_id": "Vn9eN8zEfkp7Nq9LYUbqtY", "question": "What is happening in the pot the woman stirs?", "choices": ["boiling", "freezing", "cold storage", "nothing"], "correct_choice_idx": 0, "direct_answers": ["food cooking", "steaming", "boiling", "cooking", "boiling", "cooking", "cooking", "food cooking", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["It is heated and she is stirring it so it won't burn and it will get mixed up.", "There is a lot of steam coming from it", "The woman is boiling something since there is steam."], "image": "val2014/COCO_val2014_000000461256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354088, "question_id": "VnCjVtVF5Xj3Laybkkxbgp", "question": "What food can you buy as you get on the bus?", "choices": ["bread", "steak", "fish", "fruit"], "correct_choice_idx": 3, "direct_answers": ["fruit", "fruit", "fruit", "tamales", "fruit", "fruits", "fruit", "fruit", "orange", "fruit"], "difficult_direct_answer": false, "rationales": ["There are edible plants with seeds in a stall next to the stop.", "The market carries a lot of different kinds.", "The only product seen is oranges and apples and the like."], "image": "val2014/COCO_val2014_000000354088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488822, "question_id": "VnD2QXqD5pn37xq8sF4pCc", "question": "What are the rugs for?", "choices": ["moisture", "decoration", "seating", "cleaning"], "correct_choice_idx": 2, "direct_answers": ["sitting", "beauty", "sitting", "covering", "cover floor", "keeping clean", "decoration", "covering floor", "carpeting", "seating"], "difficult_direct_answer": true, "rationales": ["The rugs can be sat on.", "The rugs are for seating.", "Rugs can be used as a decorative measure. it is colorful."], "image": "val2014/COCO_val2014_000000488822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223216, "question_id": "VnVWyXVpgt2HLVz8gTuuFU", "question": "What is the woman doing with the knife?", "choices": ["sharpening", "cutting", "scaring", "threatening"], "correct_choice_idx": 1, "direct_answers": ["cutting cake", "cake cutting", "cutting", "cutting cake", "cutting", "cutting cake", "cutting", "cutting cake", "cutting", "cutting"], "difficult_direct_answer": false, "rationales": ["She has a sharp utensil in her hand that she is going to use to slice the cake.", "The knife already is sharp. she is pointing the blade towards the food, not the people.", "The woman is cutting."], "image": "train2014/COCO_train2014_000000223216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556544, "question_id": "VnaLJKdAgCF3GmwTtCFuSC", "question": "What drink did the man in the black jacket have in his now empty glass?", "choices": ["red wine", "cognac", "white wine", "champagne"], "correct_choice_idx": 0, "direct_answers": ["wine", "red wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["It is the kind the other man has", "He has red wine in his glass.", "There are multiple wine bottles on the nearby table. the man has some red liquid in the glass he is holding."], "image": "train2014/COCO_train2014_000000556544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414516, "question_id": "VnbfYo8k6TwJ9Q2zm4h5iF", "question": "The plants that are tallest here produce what edible?", "choices": ["coconuts", "nothing", "bananas", "oranges"], "correct_choice_idx": 2, "direct_answers": ["coconut", "coconuts", "coconuts", "bananas", "bananas", "coconuts", "coconuts", "coconut", "palm trees", "bananas"], "difficult_direct_answer": false, "rationales": ["These trees grow bananas on them.", "There are banana trees.", "These have the type of leaves for this fruit"], "image": "val2014/COCO_val2014_000000414516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463474, "question_id": "VniiYbxwPWzYReUjBGDLP5", "question": "This girl has similar hair color to what actress?", "choices": ["isabelle adjani", "brooke shields", "jessica chastain", "michelle pfeiffer"], "correct_choice_idx": 3, "direct_answers": ["farrah", "renee zellweger", "reese witherspoon", "reese witherspoon", "reese witherspoon", "nicole kidman", "brie larson", "jennifer lawrence", "michelle pfeiffer", "gwyneth paltrow"], "difficult_direct_answer": false, "rationales": ["The girl has blonde hair. jessica chastain has red hair, brooke shields has brown hair, isabelle adjani has black hair and the remaining actress is a blonde.", "The girl's hair is blonde.", "The girl is like pfeiffer."], "image": "train2014/COCO_train2014_000000463474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127296, "question_id": "VnmW7PsAARg5evZArZoh4S", "question": "Who is the man closest to the motorcycle?", "choices": ["cop", "wrestler", "fireman", "shop owner"], "correct_choice_idx": 0, "direct_answers": ["biker", "cop", "police officer", "cop", "cop", "traffic police", "police officer", "bike rider", "policeman", "cop"], "difficult_direct_answer": false, "rationales": ["He has badges on his uniform", "The man has a officer badge on the side of the jacket.", "The man is a cop and has his bike."], "image": "train2014/COCO_train2014_000000127296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413895, "question_id": "VnqumUrEWt9Y7SfM4CtcVx", "question": "What sort of weather is seen here?", "choices": ["tundra", "desert", "alpine", "semi tropical"], "correct_choice_idx": 3, "direct_answers": ["sunny", "sunny", "sunny", "clear", "sunny", "semi tropical", "sunny", "sunny", "clear weather", "sunny"], "difficult_direct_answer": false, "rationales": ["It looks fairly dry and barren out.", "The weather looks hot.", "There are some clouds, but it's not raining."], "image": "train2014/COCO_train2014_000000413895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390130, "question_id": "VoEuy9RqkYHxQSSbmMAGQu", "question": "From where is the water coming?", "choices": ["bottled water", "snow melt", "volcano", "waterfall"], "correct_choice_idx": 1, "direct_answers": ["river", "creek", "snow", "steam", "creek", "snow melt", "mountaintop", "river", "melting ice", "river"], "difficult_direct_answer": false, "rationales": ["It's the runoff from the warmer weather.", "The hill is covered in snow.", "The white substance on the ground produces water."], "image": "val2014/COCO_val2014_000000390130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533955, "question_id": "VoKJKyDvSKadjyT8wHLkDM", "question": "What age group is this person in?", "choices": ["5-13", "35-50", "55-70", "18-30"], "correct_choice_idx": 3, "direct_answers": ["teenager", "twenties", "twenties", "twenties", "twenties", "young adult", "twenties", "twenties", "gingers", "18-30"], "difficult_direct_answer": false, "rationales": ["The person is a young adult.", "She looks to be in her early twenty's.", "The group is a young adult."], "image": "train2014/COCO_train2014_000000533955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502107, "question_id": "VoKXgDyDo5YfTuRY6Se6JU", "question": "The item the person is wearing on their face looks like what?", "choices": ["chicken", "gas mask", "frog", "scarf"], "correct_choice_idx": 1, "direct_answers": ["googles", "gas mask", "mask", "goggles", "goggles", "goggles", "mask", "goggles", "monkey", "mask"], "difficult_direct_answer": false, "rationales": ["Each piece is very large and oversized and everything together resembles that type of equipment", "This is worn to keep the snow out of their face and eyes so they can see to ski.", "A person is wearing clear goggles while they snowboard."], "image": "train2014/COCO_train2014_000000502107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272223, "question_id": "VoU6PqeXW5opNni9uKqw4h", "question": "The person in the bathroom likes which famous classic arcade game?", "choices": ["missile defense", "pac-man", "pong", "space invaders"], "correct_choice_idx": 3, "direct_answers": ["space invaders", "space invaders", "pac man", "asteroids", "space invaders", "space invaders", "pacman", "space invaders", "minecraft", "space invaders"], "difficult_direct_answer": false, "rationales": ["These were pixelized characters in an old arcade game", "There are space invader decals on the wall.", "The background shows the theme for space invaders."], "image": "train2014/COCO_train2014_000000272223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339019, "question_id": "Vp5x4dcb3ovYERMzownn68", "question": "What does the person next to the black banner rent?", "choices": ["umbrellas", "surfing equipment", "swimsuits", "floaties"], "correct_choice_idx": 1, "direct_answers": ["parasol", "surfing equipment", "chairs", "umbrellas", "surfboards", "chairs", "boards", "surfboards", "surfboards", "surfboard"], "difficult_direct_answer": false, "rationales": ["The person has surfing equipment.", "These signs on both sides of him say mojosurf he is advertising and letting people around him know that they can come to him for supplies specifically indicated on the signs.", "The person next to the banner has surfing equipment that's rented."], "image": "val2014/COCO_val2014_000000339019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258395, "question_id": "Vp84HRdp5k9XbErk3ewhYL", "question": "What country's flag can be seen on the truck?", "choices": ["italy", "america", "france", "spain"], "correct_choice_idx": 1, "direct_answers": ["united states", "america", "america", "united states", "america", "american", "united states", "united states", "usa", "american"], "difficult_direct_answer": false, "rationales": ["United states is red white and blue stripes with the stars.", "There are stars and stripes on the flag.", "The flag is american."], "image": "val2014/COCO_val2014_000000258395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298938, "question_id": "VpTYKHR2r4SiWUFK3Vnh5g", "question": "Why is the young child holding poles?", "choices": ["to ski", "to hit", "to dance", "to reach"], "correct_choice_idx": 0, "direct_answers": ["balance", "balance", "skiing", "for balance", "skiing", "skiing", "balancing", "skiing", "skiing", "to ski"], "difficult_direct_answer": false, "rationales": ["The child wants to ski.", "The child is wearing ski clothing. they are on a snowy mountain.", "The kid is trying to ski."], "image": "train2014/COCO_train2014_000000298938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314649, "question_id": "VpV4qzcfGnbFkFpBTKYac5", "question": "Why is the bridge a brownish color?", "choices": ["algae", "chemicals", "rust", "paint"], "correct_choice_idx": 2, "direct_answers": ["rust", "rust", "rust", "rust", "rust", "rust", "rust", "rust", "rust", "rust"], "difficult_direct_answer": false, "rationales": ["The metal on the bridge turned brown from rust.", "It looks like it is metal and old.", "The bridge is rusting."], "image": "val2014/COCO_val2014_000000314649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208031, "question_id": "Vpog27WGyHW2aPpHpxE77t", "question": "In which setting is this person?", "choices": ["factory", "farm", "city", "beach"], "correct_choice_idx": 1, "direct_answers": ["woods", "photograph", "equestrian park", "riding horses", "horse trail", "on trail", "horse track", "farm", "pasture", "horse arena"], "difficult_direct_answer": true, "rationales": ["The man is riding a horse so a barn must be nearby.", "The setting is a farm.", "The person is riding a horse. there are no buildings or bodies of water near them."], "image": "val2014/COCO_val2014_000000208031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297517, "question_id": "VppWjtVvfhpaqJisZKrYP6", "question": "What kind of information is on this train car?", "choices": ["warning", "brand", "directional", "regulatory"], "correct_choice_idx": 1, "direct_answers": ["food company", "origins", "product", "advertisement", "brand", "ad", "advertising", "business name", "ad", "german"], "difficult_direct_answer": true, "rationales": ["This advertises a product or service", "There is an advertisement on the side of the car.", "Companies use train cars to advertise information about their products because it is constantly in motion."], "image": "train2014/COCO_train2014_000000297517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29943, "question_id": "Vq4WSRUuwtXyyEQPVYS8Eo", "question": "What is this sport name?", "choices": ["skating", "surfing", "skiing", "swimming"], "correct_choice_idx": 2, "direct_answers": ["skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["As indicated by the skis on their feet.", "The man is skiing downhill.", "The people are attempting to ski."], "image": "train2014/COCO_train2014_000000029943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467285, "question_id": "VqVPvYKBrns4FivHkCkBXd", "question": "What is about to be hit here?", "choices": ["home base", "enemy", "catcher", "batter"], "correct_choice_idx": 0, "direct_answers": ["ball", "baseball", "baseball", "ball", "baseball", "baseball", "home base", "baseball", "ball", "baseball"], "difficult_direct_answer": false, "rationales": ["The man is looking down with the bat that looks like it's about to touch the white plate because he seems to be trying to prepare himself.", "He is ready to hit the ball when it's thrown to him.", "The batter is preparing the the ball coming in his direction while holding his bat."], "image": "val2014/COCO_val2014_000000467285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575834, "question_id": "VrBV8t2KVDfnVL7m4jrdT8", "question": "How is the food staying fresh?", "choices": ["gmo use", "farm grown", "refrigeration", "preservatives"], "correct_choice_idx": 2, "direct_answers": ["cold", "fridge", "refrigeration", "freezer", "in fridge", "using refrigerator", "refrigeration", "fridge", "staying cool", "refrigerator"], "difficult_direct_answer": false, "rationales": ["This container is large. there is a lot of cooling food in here.", "Various food items are on shelf in a fridge.", "The food is in the refrigerator to keep it cool and fresh."], "image": "val2014/COCO_val2014_000000575834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326726, "question_id": "VrFng9DiYx6UuujVMqMFn5", "question": "What is the opposite destination based on her sign?", "choices": ["kentucky", "hell", "calgary", "detroit"], "correct_choice_idx": 1, "direct_answers": ["hell", "hell", "hell", "hell", "hell", "hell", "hell", "hell", "hell", "hell"], "difficult_direct_answer": false, "rationales": ["The sign has the destination of heaven.", "The direction is hell.", "The opposite direction of heaven is hell."], "image": "val2014/COCO_val2014_000000326726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315654, "question_id": "Vrncyic8PLMB7StFB2hkmL", "question": "People who sleep here pay in which type period of time?", "choices": ["decades long", "yearly", "monthly", "nightly"], "correct_choice_idx": 3, "direct_answers": ["nightly", "per hour", "24 hours", "per night", "night", "vacation", "ancient", "costly pay", "bedroom", "per day"], "difficult_direct_answer": true, "rationales": ["The room has matching end tables and a mat on the side of the bed. the room is small and there are no personal items such as picture frames.", "People sleep at night.", "There are slippers set by the bed."], "image": "train2014/COCO_train2014_000000315654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106453, "question_id": "VrqtJ3nZFYvMDzZpX62e73", "question": "In which era was this photo taken?", "choices": ["frontier days", "prehistoric", "modern", "victorian"], "correct_choice_idx": 2, "direct_answers": ["american revolution", "21st century", "this era", "fifties", "modern", "today", "current era", "outside", "nineteen hundreds", "modern"], "difficult_direct_answer": true, "rationales": ["There are cars in the background.", "Although the horse and carriage may at first glance seem to be a photo taken long ago, the parking lot behind filled with cars, minivans, pickup trucks and campers shows that the photo is current.", "The horses are near cars. these did not exist in the victorian era or in earlier periods."], "image": "val2014/COCO_val2014_000000106453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261003, "question_id": "VrspiDrPFEveV59UKazb9p", "question": "What type of wares are being stacked into the truck?", "choices": ["fruits", "canaries", "meats", "animals"], "correct_choice_idx": 0, "direct_answers": ["boxes", "food products", "fruit", "fruits", "food", "fruits", "housewares", "bananas", "food", "fruits"], "difficult_direct_answer": false, "rationales": ["There are some fruits stacked up on the truck.", "These are boxes with fruits in them.", "The boxes imply fruits."], "image": "train2014/COCO_train2014_000000261003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255186, "question_id": "VrwJwCk7mpfdCd8oXhXqFe", "question": "What are the young men in uniforms in the foreground a part of?", "choices": ["coaches", "rotc", "cheerleaders", "baseball"], "correct_choice_idx": 1, "direct_answers": ["marines", "rotc", "military salute", "band", "color guard", "color guard", "armed service", "marching band", "military", "marching band"], "difficult_direct_answer": false, "rationales": ["The men are in the rotc.", "The young men are in the rotc since they're in uniform.", "They are a part of a military training group."], "image": "train2014/COCO_train2014_000000255186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226528, "question_id": "Vs5TJUdvvreKsHcvsdJZzq", "question": "What must you grasp to open these doors?", "choices": ["nothing", "man's hand", "fire hydrant", "overhead handle"], "correct_choice_idx": 0, "direct_answers": ["nothing", "handle", "handle", "nothing", "nothing", "nothing", "door handle", "nothing", "nothing", "handle"], "difficult_direct_answer": false, "rationales": ["There is nothing to grasp.", "They are automatic for easy opening when customers enter and exit the store.", "You do not have to do anything but walk to open these doors. they are automatic and will sense you there."], "image": "train2014/COCO_train2014_000000226528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229740, "question_id": "Vs9JrYps7K3fvuEPDVvVgU", "question": "What proves that the cat is allowed on the counter?", "choices": ["catnip", "cat stairs", "cat bed", "food/water dish"], "correct_choice_idx": 3, "direct_answers": ["food bowl", "cat food", "nutriment", "taking photo", "kibbles", "picture taken", "food", "photographer", "on counter", "food/water dish"], "difficult_direct_answer": true, "rationales": ["The owner has encouraged daily counter visits", "There is a brown substance that was put there for the cat. it is not catnip.", "There is food and water available behind the cat."], "image": "val2014/COCO_val2014_000000229740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83770, "question_id": "VsSUxgSrdggS3Ctbx3tehi", "question": "What type markings are shown here?", "choices": ["cross walk", "stop message", "zoo ads", "abbey road"], "correct_choice_idx": 0, "direct_answers": ["crosswalk", "crosswalk", "crosswalk", "crosswalks", "crosswalk", "crosswalk", "pedestrian crossing", "pedestrian crossing", "cross walk", "pedestrian crossing"], "difficult_direct_answer": false, "rationales": ["The white lines indicate a crosswalk.", "The markings are a crosswalk.", "There is a zebra crosswalk."], "image": "train2014/COCO_train2014_000000083770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208963, "question_id": "VsTCHkhiXJTB2UEQJjjHb4", "question": "What is the white circle in front of the man?", "choices": ["pan", "clock", "stool", "drum"], "correct_choice_idx": 3, "direct_answers": ["drum", "drum", "drum", "drum", "drum head", "drum", "drum", "drum", "drum", "board"], "difficult_direct_answer": false, "rationales": ["It is a fabric material stretched over taut so it will produce a sound", "The man is playing the drum.", "The item is clearly recognizable as a drum. clocks have numbers and hands on them, stools are vertical, and pans have an opening."], "image": "train2014/COCO_train2014_000000208963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278731, "question_id": "Vsuzx2JhVmzDbYDgYvBGP3", "question": "What country does this bus originate from?", "choices": ["italy", "russia", "germany", "hong kong"], "correct_choice_idx": 3, "direct_answers": ["uk", "hong kong", "hong kong", "germany", "united states", "canada", "holland", "germany", "america", "hong kong"], "difficult_direct_answer": false, "rationales": ["The nlb on the bus indicates this country", "This bus is from hong kong.", "The bus company is visible on the side of the bus as nlb. nlb is a bus company that is based out of hong kong."], "image": "train2014/COCO_train2014_000000278731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121521, "question_id": "VtFpUhkBKZFj47aJQoirPz", "question": "The pets are not eating or drinking because they are likely what?", "choices": ["unreal", "full", "sleeping", "newborns"], "correct_choice_idx": 0, "direct_answers": ["unreal", "not real", "statues", "fake", "not real", "statues", "fake", "ceramic", "fake", "artificial"], "difficult_direct_answer": false, "rationales": ["They're obviously not real.", "These animals do not appear to be living based on their size and lack of fur and other characteristics of real animals. they would thus not be real animals and not need to eat or drink.", "The animals are sculptures."], "image": "train2014/COCO_train2014_000000121521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181118, "question_id": "VtGvKB7EszScZXpcNaJjUJ", "question": "Why is the man holding a camera?", "choices": ["buying it", "selling it", "taking pictures", "weighing it"], "correct_choice_idx": 2, "direct_answers": ["pictures", "elephant photos", "recording elephants", "taking pictures", "picture", "photograph elephants", "take pictures", "picture", "taking pictures", "taking pictures"], "difficult_direct_answer": false, "rationales": ["To have everlasting memories of the vacation", "Typically people hold cameras so that they can take photos.", "He's a tourist and wants to remember this"], "image": "val2014/COCO_val2014_000000181118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373988, "question_id": "VtYJSkSMCkw7iEGr7ojhJb", "question": "Why is there a line forming by the building?", "choices": ["for fun", "its mandatory", "its popular", "its raining"], "correct_choice_idx": 2, "direct_answers": ["its popular", "to enter", "get inside", "going skiing", "food", "getting skis", "need drinks", "lunch time", "restaurant", "drink service"], "difficult_direct_answer": true, "rationales": ["A large group of people has gathered at a ski resort.", "The ski lodge can be quite a popular place especially after dark. these skiers are probably looking for a place to wind down and willing to stand in line to get into a place with lots of other people.", "The large group of people are waiting in line at the busy resort."], "image": "val2014/COCO_val2014_000000373988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170211, "question_id": "VtzuU7m5Y3FGZJzuXyiw7Q", "question": "The people are using what device?", "choices": ["samsung galaxy", "laptop", "carriage", "nintendo wii"], "correct_choice_idx": 3, "direct_answers": ["wii", "wii", "nintendo wii", "nintendo wii", "wii remote", "wii remote", "controller", "wii remote", "controller", "wii remote"], "difficult_direct_answer": false, "rationales": ["The people are holding video game controllers. they are white and are shaped like remotes.", "The gaming system uses a non-traditional gaming controller in the shape of a television remote.", "The people are all holding nintendo wii controllers."], "image": "val2014/COCO_val2014_000000170211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305647, "question_id": "VuZKurEBdNta4EtigosXGr", "question": "What is the horse doing?", "choices": ["pulling skier", "helping man", "eating", "resting"], "correct_choice_idx": 0, "direct_answers": ["walking", "walking pulling", "walking", "pulling skier", "walking", "pulling", "pulling", "pulling skier", "walking", "pulling skier"], "difficult_direct_answer": false, "rationales": ["The horse is pulling the skier on the snow.", "The person is on skis and is holding on to ropes attached to the horse. the horse appears to be in motion based on the leg bend.", "This is obvious in the picture. the other options aren't taking place. that said, if that's a man on the skis, c might also apply."], "image": "train2014/COCO_train2014_000000305647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534417, "question_id": "VugDagirFcrS4s7YU6sNDE", "question": "What phobia is associated with these kind of waves?", "choices": ["arachnophobia", "cymophobia", "gatophobia", "tokophobia"], "correct_choice_idx": 1, "direct_answers": ["cymophobia", "hydrophobia", "fear drowning", "thalassophobia", "cymophobia", "claustaphobia", "water", "thalassophobia", "cymophobia", "cymophobia"], "difficult_direct_answer": false, "rationales": ["The phobia is cymophobia.", "People that have this fear are afraid of the sea.", "If you google it you will find that option \"a\" is the fear of swells and waves."], "image": "train2014/COCO_train2014_000000534417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430375, "question_id": "Vum3ah8gwgvjXRrDDq3ki4", "question": "What is the person in red most likely awaiting a turn for?", "choices": ["snow blower", "snow cat", "ski lift", "bus"], "correct_choice_idx": 2, "direct_answers": ["chairlift", "chairlift", "chair lift", "lift", "ski lift", "skiing downhill", "to ski", "down slope", "ski lift", "lift"], "difficult_direct_answer": false, "rationales": ["With the setting the man is in and the garb and equipment he is wearing, it's easy to understand what he is waiting for.", "The other options don't apply to a queue for a ski resort.", "The person in red has on skis and ski/snow suit with glove and ski poles. the person shown is ready to ski so he is probably waiting for a ride to the top of the ski slopes."], "image": "train2014/COCO_train2014_000000430375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331366, "question_id": "VvLSh8S4chsfszrLDsRjgY", "question": "How many wheels are on the cycle being ridden here?", "choices": ["two", "four", "three", "one"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "two", "three", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is only one cycle being ridden in the photo and it has a front and back wheel. this is for a total of two wheels.", "There are obviously two wheels on this bicycle. two-wheeled bicycles are the most common form of that vehicle.", "There are two wheels one is way bigger then the other one."], "image": "val2014/COCO_val2014_000000331366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527887, "question_id": "VvdPVJZcuN6kH3KjqxK8bj", "question": "What is the most likely beverage in the cup?", "choices": ["iced tea", "ice water", "iced coffee", "iced juice"], "correct_choice_idx": 2, "direct_answers": ["iced coffee", "ice coffee", "iced coffee", "coffee", "coffee", "coffee", "coffee", "iced coffee", "iced coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["This is a well-known coffee shop drink and it has milk in it", "The cup is from starbuck's and has ice in it.", "Based on the color, the ice in i and the starbucks logo. it could also be b."], "image": "val2014/COCO_val2014_000000527887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358580, "question_id": "Vvg4U5gAGq9gNXQv79ZnH7", "question": "What geographic formation is in the distance?", "choices": ["sand dune", "crater", "glacier", "mountain"], "correct_choice_idx": 3, "direct_answers": ["mountain", "mountains", "mountain", "mountain", "mountain", "mountain", "mountain", "mountain", "mountain", "mountain"], "difficult_direct_answer": false, "rationales": ["The large upraised rock areas are known as mountains, and they are the highest formation one can see.", "It is a large natural formation that rises above everything else", "The formation is a mountain."], "image": "train2014/COCO_train2014_000000358580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110348, "question_id": "Vvhs7NHjLoN444yEpSiGGm", "question": "What are the cows pictured above reared for?", "choices": ["none", "meat production", "dairy production", "both"], "correct_choice_idx": 2, "direct_answers": ["dairy", "milk", "milk", "milk", "meat", "beef", "beef", "beef", "dairy production", "beef"], "difficult_direct_answer": false, "rationales": ["The cows make dairy.", "These cows looks like a typical dairy cow.", "This is a milk cow."], "image": "train2014/COCO_train2014_000000110348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287374, "question_id": "Vvov3him8JBPVPVJfNEhht", "question": "What beverage will they drink after the game?", "choices": ["coca cola", "gatorade", "budweiser", "sprite"], "correct_choice_idx": 1, "direct_answers": ["gatorade", "water", "gatorade", "gatorade", "gatorade", "gatorade", "gatorade", "gatorade", "water", "gatorade"], "difficult_direct_answer": false, "rationales": ["The beverage is gatorade.", "There is a container in the background filled with a sports drink.", "Gatorade is shown."], "image": "train2014/COCO_train2014_000000287374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15725, "question_id": "Vw3Kn9PhX79ta8tRvw3aaM", "question": "What is the person doing at the front of her fridge?", "choices": ["complaining", "writing poetry", "eating", "scouring it"], "correct_choice_idx": 1, "direct_answers": ["sitting", "moving words", "making sentences", "organizing magnets", "rearranging magnets", "adjusting magnets", "cleaning", "reading writing", "writing poetry", "playing magnets"], "difficult_direct_answer": true, "rationales": ["The woman is arranging word magnets and using them to write sentences.", "The person is moving poetry magnets on the fridge.", "The person is writing."], "image": "val2014/COCO_val2014_000000015725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246815, "question_id": "Vw3qYhrxYeW3CgRVaJCpqV", "question": "What do the windsurfers here depend on most?", "choices": ["boats", "wind", "sharks", "ferry"], "correct_choice_idx": 1, "direct_answers": ["sails", "sails", "wind", "sails", "wind", "wind", "wind", "wind", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["The windsurfers need wind.", "Surfing with a sail requires a windy environment in order to move the board in the water.", "They need wind."], "image": "train2014/COCO_train2014_000000246815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167075, "question_id": "Vw4ABTw92qHBjqPRgt6m4V", "question": "What surface is the man playing on?", "choices": ["clay", "carpet", "hard", "grass"], "correct_choice_idx": 2, "direct_answers": ["tennis court", "tennis court", "clay", "court", "clay", "asphalt", "artificial", "tennis court", "hard", "asphalt"], "difficult_direct_answer": false, "rationales": ["He uses a hard surface.", "Tennis is played on a compact surface.", "The man is playing tennis outside. the surface is grey, not orange or green, so he is not playing on clay, grass, or carpet."], "image": "train2014/COCO_train2014_000000167075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105786, "question_id": "Vw8xE95TDrsBEk9JPMorzf", "question": "Of what use are the tables and chairs here?", "choices": ["makeup", "rodeo riding", "contest sitting", "dining"], "correct_choice_idx": 3, "direct_answers": ["hotel", "dining", "dining", "dining", "outdoor dining", "eating", "eating", "resting", "dining", "cafe"], "difficult_direct_answer": false, "rationales": ["The tables are for dining.", "They have tablecloths on them like a restaurant", "The tables are set up like a place to eat at and like there in a dinner to be served here."], "image": "val2014/COCO_val2014_000000105786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68953, "question_id": "VwHPLReCy7U3jBGE8eGRAb", "question": "Why has the woman covered her head?", "choices": ["health", "costume", "warmth", "protection"], "correct_choice_idx": 2, "direct_answers": ["cold outside", "cold", "knit hat", "warmth", "raining", "cap", "weather", "cap", "warmth", "weather"], "difficult_direct_answer": false, "rationales": ["The woman has covered her head for warmth.", "The woman is also wearing a jacket.", "The woman wants warmth."], "image": "train2014/COCO_train2014_000000068953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250880, "question_id": "VwcpK3vJ398EbX4krhBzb2", "question": "Which of those country's flags has the largest land area?", "choices": ["britain", "germany", "italy", "spain"], "correct_choice_idx": 1, "direct_answers": ["germany", "united kingdom", "africa", "germany", "uk", "germany", "second", "germany", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["Of the flags displayed answer a is a flag clearly visible and has the largest land mass of the list.", "Multiple countries flags are represented on the side of a plane.", "That country is the largest out of the flags."], "image": "train2014/COCO_train2014_000000250880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287839, "question_id": "VwdnuLQihytUnXZCzSeYJj", "question": "At which building does the purple train stop?", "choices": ["depot", "bus stop", "airport", "school"], "correct_choice_idx": 0, "direct_answers": ["train station", "left", "terminal", "train station", "station", "depot", "train station", "train station", "train station", "train stop"], "difficult_direct_answer": false, "rationales": ["A train is stopped at a small building along the tracks. places along train tracks for people to board have a place to buy tickets and wait for the train out of the weather.", "Trains do not stop at schools, airports, or bus stops.", "This is where passengers can get on and off"], "image": "train2014/COCO_train2014_000000287839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488747, "question_id": "VwrvZM5wmyu9fvEtkoYtjc", "question": "What is in the bucket?", "choices": ["food/water", "money", "gas", "free shirts"], "correct_choice_idx": 0, "direct_answers": ["feed", "grain", "feed", "cow food", "feed", "food", "feed", "food/water", "cow food", "feed"], "difficult_direct_answer": false, "rationales": ["The cow is getting food and water.", "The bucket is being used as a bowl to feed the horse or let it get a drink.", "Food is in the bucket."], "image": "train2014/COCO_train2014_000000488747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443887, "question_id": "Vx3kjeBcrrVJwLPhwtEvw6", "question": "What is the little angel in the flower pot called?", "choices": ["raphael", "gabriel", "cherub", "michael"], "correct_choice_idx": 2, "direct_answers": ["cherub", "cherub", "cute angel", "cherub", "cherub", "angel", "cherub", "cherub", "cherub angel", "cherub"], "difficult_direct_answer": false, "rationales": ["The angel is a cherub.", "That's what they call the littlest angel.", "The angel is a plant cherub."], "image": "val2014/COCO_val2014_000000443887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387977, "question_id": "VxU9Zk6bgbiJNGRFBj43i8", "question": "What type of settlements are near the airport?", "choices": ["sky scrapers", "tennis clubs", "farm", "power plant"], "correct_choice_idx": 2, "direct_answers": ["houses", "farm", "farm", "villages", "houses", "farm", "farm", "farm", "farms", "farms"], "difficult_direct_answer": false, "rationales": ["Lots of land that you can grow crops on.", "There are cows in the background", "There is a lot of land."], "image": "train2014/COCO_train2014_000000387977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226899, "question_id": "VxuR5AZucGyY3S5MtxqEid", "question": "What material is the cage made of?", "choices": ["porcelain", "steel", "plastic", "wood"], "correct_choice_idx": 1, "direct_answers": ["steel", "metal", "steel", "metal", "metal", "steel", "metal", "steel", "steel", "steel"], "difficult_direct_answer": false, "rationales": ["The bars are made of strong metal.", "The cage is for cows. porcelain, wood, and plastic would not be strong enough to hold cows.", "The cage is made of steel."], "image": "train2014/COCO_train2014_000000226899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565245, "question_id": "Vy4mayvqWYe2GxP99xBRxJ", "question": "What body part do you need to be most careful with here?", "choices": ["knees", "toes", "fingers", "nose"], "correct_choice_idx": 2, "direct_answers": ["hands", "hands", "fingers", "fingers", "hands", "hands", "fingers", "fingers", "hands", "hands"], "difficult_direct_answer": false, "rationales": ["Metal machines as pictured here, have moving parts and it could be easy to lose a body part if not careful.", "The food may be hot so you have to be careful.", "Be careful handling the donuts"], "image": "val2014/COCO_val2014_000000565245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213790, "question_id": "VyDjYVeWXyJiWhwQcRTo2m", "question": "Where is this table located?", "choices": ["classroom", "restaurant", "home", "library"], "correct_choice_idx": 2, "direct_answers": ["dinning room", "kitchen", "home", "house", "house", "kitchen", "kitchen", "home", "dining room", "dining room"], "difficult_direct_answer": false, "rationales": ["Based on the setting in the background and the utensils and plate being used, this would be a home setting.", "There is a package of toilet paper in the background and that wouldn't show in a public place", "This table is located in a home, not a restaurant."], "image": "train2014/COCO_train2014_000000213790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105812, "question_id": "VyHuvDcqiGWJPRkEzVP9bP", "question": "What is the salad in the bowl called?", "choices": ["cole slaw", "potato salad", "ambrosia", "macaroni salad"], "correct_choice_idx": 0, "direct_answers": ["cole slaw", "coleslaw", "coleslaw", "coleslaw", "cole slaw", "cole slaw", "cole slaw", "cole slaw", "coleslaw", "coleslaw"], "difficult_direct_answer": false, "rationales": ["The bowl is creamy with some greenery.", "The salad is made of cabbage.", "Coleslaw is usually paired with sandwiches."], "image": "val2014/COCO_val2014_000000105812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386370, "question_id": "VyL4rm9N5kNwitiLsPMW2B", "question": "If you kept walking forward from where the camera is what would happen to you?", "choices": ["ignite", "get slapped", "hit fence", "get wet"], "correct_choice_idx": 3, "direct_answers": ["get wet", "reach river", "hit water", "get wet", "get wet", "hit water", "get wet", "fall in", "fall", "see things"], "difficult_direct_answer": false, "rationales": ["You'd get wet.", "Walking forward would land you into the large body of water underneath the bridge.", "The water is in front of the camera so you would get wet if you keep walking."], "image": "val2014/COCO_val2014_000000386370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183327, "question_id": "VybPZwms7ZAoofdFcL4AMQ", "question": "What is keeping the road closed sign from falling?", "choices": ["sandbag", "rocks", "bricks", "tire"], "correct_choice_idx": 3, "direct_answers": ["tire", "tire", "tire", "tire", "tire", "tire", "tire", "tire", "tire", "tire"], "difficult_direct_answer": false, "rationales": ["The sign is sitting in a round black object.", "The tires keeps the road closed.", "You can see the tread on it and it is round and rubber"], "image": "train2014/COCO_train2014_000000183327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288336, "question_id": "VyxZMd4NeUXUdxHqm4fgMS", "question": "What will happen if someone parks here?", "choices": ["yelled at", "towed away", "beaten", "nothing"], "correct_choice_idx": 1, "direct_answers": ["towed", "accident", "tow", "tow", "towed away", "towed away", "towing", "arrested", "tow away", "tow away"], "difficult_direct_answer": false, "rationales": ["The red and white no parking sign indicates what would happen if someone were to park here.", "The red and white no parking sign indicates what happens to people who refuse to not park here.", "The white and red sign on the sidewalk indicates what would happen if someone were to park here."], "image": "val2014/COCO_val2014_000000288336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257821, "question_id": "W2Qz59WXakBzBKnK4GZbiB", "question": "What move has the tennis player just done?", "choices": ["jumped", "served ball", "kicked", "received ball"], "correct_choice_idx": 3, "direct_answers": ["served", "forehand hit", "served", "serve", "return", "striking", "received ball", "jumping", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["His are in the position of just hitting the ball underhand.", "Based on where this player is standing on the court and the position of their leg and hand follow throughs they would have just served the ball and their forward momentum has brought them to this position.", "He has just hit from overhead"], "image": "train2014/COCO_train2014_000000257821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84536, "question_id": "W2S6cZ7dD3FshaAC6AeDLJ", "question": "What is fake in this photo?", "choices": ["bears", "plants", "grass", "trees"], "correct_choice_idx": 0, "direct_answers": ["bears", "bears", "bears", "statues", "bears", "all bears", "bear", "bears", "bears", "bears"], "difficult_direct_answer": false, "rationales": ["The bears are made of wood.", "The bears are made of stone", "The bears are fake."], "image": "train2014/COCO_train2014_000000084536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246285, "question_id": "W2TMcxJ7P2Nqjd7P8r2qrY", "question": "What shapes are in the grass?", "choices": ["letters", "numbers", "circled", "stripes"], "correct_choice_idx": 2, "direct_answers": ["rectangles", "rectangles", "semicircle", "lines", "stripes", "rows", "rectangles", "stripes", "stripes", "circled"], "difficult_direct_answer": false, "rationales": ["The grass gets mowed one way and then the opposite.", "There are stripes in the baseball field.", "A baseball field is being played and the outfield has been mowed and the rows are visible."], "image": "train2014/COCO_train2014_000000246285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68176, "question_id": "W2U5mAyM2gGGWzwYMBAhAB", "question": "Which street could this biker refill his prescription on most quickly?", "choices": ["main", "none", "15th", "dirt road"], "correct_choice_idx": 2, "direct_answers": ["15th street", "15th", "cvs pharmacy", "fifteenth", "15th street", "15th", "15th", "15th street", "cvs", "15th"], "difficult_direct_answer": false, "rationales": ["There is a pharmacy called cvs on that given street.", "The street sign indicates that 15th street is nearby.", "There is a cvs pharmacy near the biker. the street sign near the traffic lights indicates the street's number."], "image": "train2014/COCO_train2014_000000068176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323851, "question_id": "W2iLchDg3yiZNZeyG2AMSC", "question": "What does the dog have to do to keep from falling?", "choices": ["hold button", "nothing", "keep balance", "grab rope"], "correct_choice_idx": 2, "direct_answers": ["balance", "keep balance", "balance", "balance", "balance", "balance", "stay put", "balance", "balance", "stand"], "difficult_direct_answer": false, "rationales": ["The dog needs to keep its balance.", "The man is trying to keep a balance.", "It's a precarious position and on an incline, so he has to stand a certain way."], "image": "val2014/COCO_val2014_000000323851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448117, "question_id": "W2oT6bsjte38MNHbYN3Cxx", "question": "From what do the eyesare being worn here protect the wearers from?", "choices": ["wind", "cold", "snow glare", "rain"], "correct_choice_idx": 2, "direct_answers": ["snow glare", "glare", "glare", "sunlight brightness", "aganist wind", "snow blindness", "snow", "glare", "blowing snow", "coat"], "difficult_direct_answer": false, "rationales": ["When the sun hits the snow it can result in some intense glare that people wear sunglasses or some tinted eye covering to protect them from", "Sunglasses are worn by skiers walking in an area covered in snow. sunglasses can help with reflection when in areas with snow.", "The snow is white and bright."], "image": "val2014/COCO_val2014_000000448117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467978, "question_id": "W2ptJjjJMbJvTKVtqFLcfc", "question": "Other than the dog how are the sheep being controlled?", "choices": ["invisible fence", "fires", "holes", "metal fence"], "correct_choice_idx": 3, "direct_answers": ["fence", "metal fence", "nine", "running", "fencing", "fence", "fence", "fence", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["The sheep are being blocked that is very shiny around the field.", "The fence is there.", "The chain link fence keeps the animals from getting out."], "image": "val2014/COCO_val2014_000000467978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71988, "question_id": "W2vnA6P7BDFefJbgn4GmNK", "question": "What is the person on the left sitting on?", "choices": ["chair", "car", "bench", "floor"], "correct_choice_idx": 2, "direct_answers": ["bench", "bench", "bench", "bench", "wife", "bench", "wife", "bench", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["This is easy to determine from the picture.", "The person is sitting on a bench overlooking the ocean.", "The person is on a rigid seat for two or three"], "image": "val2014/COCO_val2014_000000071988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3967, "question_id": "W2xTDVKWmqWjS4hUdqrsHF", "question": "What amuses the pink phoned person?", "choices": ["nothing", "tourist", "grandmother", "monkey"], "correct_choice_idx": 3, "direct_answers": ["monkey", "dolly", "monkey", "monkey", "cat", "monkey", "monkey", "monkey", "monkey", "monkey"], "difficult_direct_answer": false, "rationales": ["She is smiling and taking a picture of the animal on a leash.", "It's obvious in the photograph when looking at the bottom left corner.", "The monkey is keeping the person with the phone entertained."], "image": "train2014/COCO_train2014_000000003967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560623, "question_id": "W378hnkQLbS8GYmj9LRuoS", "question": "What would be the most likely cause of a travel delay for this airport?", "choices": ["rain", "snow", "wind", "clouds"], "correct_choice_idx": 3, "direct_answers": ["rain", "clouds", "weather", "weather", "rain", "inclement weather", "rain", "bad weather", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The clouds might make for a foggy flight.", "Planes can easily fly in clouds and rain but high winds could keep them grounded", "The clouds are the cause."], "image": "val2014/COCO_val2014_000000560623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226102, "question_id": "W3NdUEYM9cQonwEXtonFiD", "question": "What part of the day is this meal usually eaten?", "choices": ["snack", "dessert", "breakfast", "dinner"], "correct_choice_idx": 2, "direct_answers": ["breakfast", "breakfast", "breakfast", "lunch", "breakfast", "lunch", "lunch", "breakfast", "breakfast", "breakfast"], "difficult_direct_answer": false, "rationales": ["And english muffin and eggs are seen, which are common breakfast items around the world.", "This meal consists of an egg sandwich with a vegetable beverage.", "That is a breakfast sandwich that has that kind of bread and eggs which you eat in the morning."], "image": "train2014/COCO_train2014_000000226102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345960, "question_id": "W3YkF8vD2N7ws5GdSERfNj", "question": "What is the person doing in the kitchen?", "choices": ["sleeping", "meddling", "cleaning", "cooking"], "correct_choice_idx": 2, "direct_answers": ["cleaning", "wiping counters", "cleaning", "cleaning", "cleaning", "wiping counters", "cleaning", "cleaning", "cleaning", "cleaning"], "difficult_direct_answer": false, "rationales": ["The person has a rag in their hand. they are wiping the counter.", "The person cleans.", "There is a spray bottle and she's holding a sponge for wiping surfaces."], "image": "val2014/COCO_val2014_000000345960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496099, "question_id": "W3avQLa9PogQAfgHz5DYBd", "question": "What is most likely holding the images into the flat white surface?", "choices": ["nails", "tape", "magnets", "glue"], "correct_choice_idx": 2, "direct_answers": ["table", "magnets", "magnets", "magnets", "magnets", "table", "magnets", "magnets", "table", "magnets"], "difficult_direct_answer": false, "rationales": ["The doors are metal and these stick to it", "Because the fridge is metal, you can hangs things with the use of magnets.", "I do not see any tape so i would think they used some kind of glue."], "image": "val2014/COCO_val2014_000000496099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468063, "question_id": "W3fWCx8efvVsNSD6KnjdL3", "question": "Why are they so close together?", "choices": ["to talk", "need directions", "afraid alone", "to fight"], "correct_choice_idx": 0, "direct_answers": ["for picture", "friends", "friends", "friends", "friends", "taking picture", "horses close", "to talk", "friends", "picture together"], "difficult_direct_answer": false, "rationales": ["The men look like they are friends.", "The people are riding horses near the ocean. in this setting with this activity there might be a lot of ambient noise which might force the man to lean closer together to hear each other as depicted.", "The two people are leaning toward each other and theirs mouths are partially opened."], "image": "val2014/COCO_val2014_000000468063.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470770, "question_id": "W3fbW9btQDeFcKicUzJ87x", "question": "What is the color of road?", "choices": ["red", "pink", "green", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "grey", "black", "black", "grey", "gray", "grey", "black"], "difficult_direct_answer": false, "rationales": ["Woman are walking through a paved parking lot.", "The road is made of cement and is a black dark and uniform color.", "The road is black."], "image": "train2014/COCO_train2014_000000470770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46990, "question_id": "W3uJg4uE6QU4fGXWnpTDhQ", "question": "What is strapped to the body?", "choices": ["purse", "backpack", "briefcase", "duffel bag"], "correct_choice_idx": 1, "direct_answers": ["backpack", "backpack", "snowboard", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "snowboard"], "difficult_direct_answer": false, "rationales": ["A man is snowboarding with a pack on his back and the straps are over his shoulders.", "This item straps on the back with straps over the shoulders that can be seen here.", "There is a black bag on his back with straps coming around his front."], "image": "train2014/COCO_train2014_000000046990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536950, "question_id": "W44332ZFCuaEiSSndFCUDA", "question": "How many visible stripes are in his right shoe?", "choices": ["three", "zero", "one", "two"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The shoes have three stripes each going down the side.", "The man is wearing adidas.", "There are 3 stripes."], "image": "train2014/COCO_train2014_000000536950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185881, "question_id": "W4FJMRykXViLC5ud3M5WjR", "question": "Where will this bus stop next?", "choices": ["downtown", "uptown", "school", "georgetown"], "correct_choice_idx": 3, "direct_answers": ["georgetown", "georgetown", "bus station", "georgetown", "georgetown", "union station", "georgetown", "georgetown", "georgetown", "georgetown"], "difficult_direct_answer": false, "rationales": ["The sign on the front of the bus indicates where it will stop next.", "The bus sign says georgetown.", "The name is displayed on the side."], "image": "train2014/COCO_train2014_000000185881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230966, "question_id": "W4TMP6bZGobBQbpVmqhHUi", "question": "What is the last name of the creator of the big vase?", "choices": ["ben", "anna", "hanford", "owen"], "correct_choice_idx": 3, "direct_answers": ["hanford", "hanford", "owen", "owen", "queen", "owen", "handford", "owen", "owen", "ben owen"], "difficult_direct_answer": false, "rationales": ["The name is on the placecard.", "Owen is an artist that makes vases.", "The name is owen."], "image": "train2014/COCO_train2014_000000230966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14514, "question_id": "W4bXWkU9W92ZhSbYXGMNgX", "question": "Durable and nontoxic kitchen cabinets are made of what?", "choices": ["copper", "stainless steel", "wood", "aluminum"], "correct_choice_idx": 1, "direct_answers": ["steel", "steel", "stainless steel", "stainless steel", "stainless steel", "stainless steel", "steel", "steel", "stainless steel", "stainless steel"], "difficult_direct_answer": false, "rationales": ["Many of the counters here are made of stainless steel.", "These are easy to keep clean", "A certain type of metal is the preference for non-toxic kitchens."], "image": "train2014/COCO_train2014_000000014514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328181, "question_id": "W4rnXuk4pp4ZUQjhPcMjSJ", "question": "What type trip are people standing here going on?", "choices": ["train", "taxi", "car", "boat"], "correct_choice_idx": 3, "direct_answers": ["boat", "boat", "ferry ride", "boat ride", "tourist", "ferry", "boat", "day", "boat", "ferry ride"], "difficult_direct_answer": false, "rationales": ["The people are standing on the dock and are getting ready to board the approaching ferry to get across the water.", "The boat is the trip.", "The people are standing on a dock and a boat is approaching."], "image": "val2014/COCO_val2014_000000328181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298242, "question_id": "W54yM7JRRLmtYutQwWKRYn", "question": "What is creating the white long cloudlike lines in the sky?", "choices": ["photo shop", "special effect", "plane", "skis"], "correct_choice_idx": 2, "direct_answers": ["plane", "plane", "airplane", "airplane", "jet", "jet stream", "skier", "airplane", "aircraft", "sky writer"], "difficult_direct_answer": false, "rationales": ["The plane is streaking.", "An airplane is in front of the trail.", "There is visibly a plane in the sky in front of the white lines. the exhaust from planes in the sky is observed to create these lines."], "image": "train2014/COCO_train2014_000000298242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149842, "question_id": "W5DdbLNe4PRzEX83zSRLTx", "question": "What does the man intend to do to the giraffe?", "choices": ["feed", "itch", "pinch", "poke"], "correct_choice_idx": 0, "direct_answers": ["feed it", "feed it", "feed it", "feed", "feed", "feed", "feed", "feed", "feed", "feed it"], "difficult_direct_answer": false, "rationales": ["Usually if you are somewhere a giraffe is you generally have feed in your hand which is why they come up to you.", "He is holding something to the mouth of the animal", "The man wants to give the giraffe food."], "image": "train2014/COCO_train2014_000000149842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217205, "question_id": "W5ExiFJqhU4s9srRFAzLfY", "question": "What are the pillars for?", "choices": ["holding house", "merely decoration", "holding tent", "holding pier"], "correct_choice_idx": 3, "direct_answers": ["pier", "sea boarding", "stability", "holding pier", "support", "support pier", "hold pier", "support pier", "supporting pier", "support"], "difficult_direct_answer": false, "rationales": ["The beams hold up the pier.", "Structural integrity.", "It's located above the surfers and extends out into the ocean."], "image": "val2014/COCO_val2014_000000217205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78026, "question_id": "W5eJpY3XnVfwKfbaNpS84K", "question": "What covers the top of the treat the cat bites?", "choices": ["icing", "onions", "cheese", "bacon"], "correct_choice_idx": 0, "direct_answers": ["sprinkles", "sprinkles", "frosting sprinkles", "sugar glaze", "sprinkles", "icing", "sprinkles", "sprinkles", "frosting sprinkles", "icing"], "difficult_direct_answer": false, "rationales": ["White frosting is on top of a donut.", "The treat is a donut. donuts commonly have icing on them.", "You can tel by the sprinkles and the color of substance on the donut as to what is on the donut."], "image": "val2014/COCO_val2014_000000078026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524577, "question_id": "W5tZFiH57UtTTPKkogBqs2", "question": "What is the white box near the green door used for?", "choices": ["playing music", "holding mail", "making icecubes", "talking"], "correct_choice_idx": 3, "direct_answers": ["mailbox", "suggestion", "call box", "talking", "intercom", "intercom", "post", "communication", "intercom", "intercom communication"], "difficult_direct_answer": false, "rationales": ["This is a speakerbox to announce your arrival", "People can talk to the residents upstairs.", "The white box is a telecom to speak to people inside."], "image": "val2014/COCO_val2014_000000524577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28525, "question_id": "W6DoNBmKagSTvWeuCgByK7", "question": "This man most closely resembles what actor?", "choices": ["eddie murphy", "edward woodward", "eddie redmayne", "james edwards"], "correct_choice_idx": 1, "direct_answers": ["trump", "stanley tucci", "bill murray", "duvall", "edward woodward", "michael caine", "telly salvalas", "larry david", "cain", "michael caine"], "difficult_direct_answer": true, "rationales": ["The man sort of looks like actor edward woodward.", "The man closely resembles edward woodward.", "The man is woodward."], "image": "train2014/COCO_train2014_000000028525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178166, "question_id": "W78hSNuZkuLUnz8tTMnsZt", "question": "How does the donut taste?", "choices": ["sweet", "salty", "sour", "spicy"], "correct_choice_idx": 0, "direct_answers": ["good", "sweet", "sweet", "sweet", "good", "great", "good", "good", "sweet", "sweet"], "difficult_direct_answer": false, "rationales": ["The donut is sweet.", "The pastry is coated with sugar.", "Donuts are full of sugars."], "image": "train2014/COCO_train2014_000000178166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499951, "question_id": "W79RghAhNekNKVWq36yHVb", "question": "What medium was the art on the wall done with?", "choices": ["spray paint", "pencil", "crayons", "etchings"], "correct_choice_idx": 0, "direct_answers": ["spray paint", "spraypaint", "spraypaint", "spray paint", "spray paint", "spraypaint", "spray paint", "spray paint", "spray paint", "spray paint"], "difficult_direct_answer": false, "rationales": ["The art on the wall is called graffiti and is traditionally done using aerosolized paint from a can.", "The other options aren't commonly used for making graffiti.", "The art on the wall was done with spray paint."], "image": "train2014/COCO_train2014_000000499951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235905, "question_id": "W7Qfht5jGx42aWfq9fFqRa", "question": "Which street lane or lanes is the bus traveling in?", "choices": ["left lane", "both lanes", "right lane", "neither lane"], "correct_choice_idx": 1, "direct_answers": ["both lanes", "both", "right lane", "right lane", "middle", "left", "both", "both", "both", "middle"], "difficult_direct_answer": false, "rationales": ["Both lanes are open.", "The bus looks like it is riding between two lanes of traffic.", "The bus is going in the middle of the lanes."], "image": "train2014/COCO_train2014_000000235905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383616, "question_id": "W7R3H8bUefg3vkfRFYpABh", "question": "What is the sink made out of?", "choices": ["plastic", "stainless steel", "glass", "wood"], "correct_choice_idx": 1, "direct_answers": ["metal", "steel", "stainless steel", "metal", "stainless steel", "stainless steel", "stainless steel", "metal", "steel", "stainless steel"], "difficult_direct_answer": false, "rationales": ["The sink is in a kitchen and is made out of a shiny metal.", "As indicated by the color. the other options also wouldn't make sense.", "The other materials are too fragile, damaged by water or not durable enough for heavy use."], "image": "train2014/COCO_train2014_000000383616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337827, "question_id": "W7W5zAas8XSGK89eKqpdt9", "question": "Which of these bananas will be edible longer?", "choices": ["yellow", "gray", "greenish ones", "brown"], "correct_choice_idx": 2, "direct_answers": ["green", "greenish ones", "green", "green ones", "green", "green", "yellow", "green", "green", "green bananas"], "difficult_direct_answer": false, "rationales": ["The greenish bananas indicate that they are not ripe and are the furthest from spoiling.", "The bananas that are green aren't ripe.", "The bananas that are green aren't ripe."], "image": "val2014/COCO_val2014_000000337827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532714, "question_id": "W7jnXuP88Sp9yK6DTWvCqh", "question": "Where is this person located at?", "choices": ["playground", "waiting room", "park", "platform"], "correct_choice_idx": 3, "direct_answers": ["train station", "trainstop", "subway", "station", "train station", "train depot", "train station", "train station", "platform", "train station"], "difficult_direct_answer": false, "rationales": ["The person is sitting on a bench that is near a solid warning line. there is a train near the person.", "The person is on a platform.", "There is a train visible in the background. the place where people wait for trains that has this setup is known as a platform."], "image": "train2014/COCO_train2014_000000532714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90433, "question_id": "W7m2TQKsvhedheKpQtYrto", "question": "What are cows doing in the lake?", "choices": ["swimming", "playing", "running", "eating"], "correct_choice_idx": 3, "direct_answers": ["drinking water", "swimming", "drinking water", "grazing", "cooling down", "cooling off", "hot", "eating", "getting water", "grazing"], "difficult_direct_answer": false, "rationales": ["Cows are grazing near a body of water.", "The cows are munching on grass.", "This is a grassy area, so the cows are likely grazing."], "image": "train2014/COCO_train2014_000000090433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293466, "question_id": "W8EFGdFfDhP75ZNtmWyYfB", "question": "Where did this person purchase this edible item?", "choices": ["door dash", "luxury restaurant", "roadside", "automat"], "correct_choice_idx": 2, "direct_answers": ["store", "food truck", "food truck", "roadside", "taco stand", "roadside dinner", "hotel", "restaurant", "food truck", "hungry"], "difficult_direct_answer": false, "rationales": ["The item depicted is a serving of french fries, which is a fast food item. there is a highway depicted, but no vendor or restaurant, and delivery services would not usually stop here.", "This gyro came from a cart on the road.", "The person got it roadside."], "image": "train2014/COCO_train2014_000000293466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294051, "question_id": "W8EeGZR8NeGiqipysZrYf2", "question": "Why are they so rowdy?", "choices": ["they're colleagues", "music", "drinks", "weather"], "correct_choice_idx": 2, "direct_answers": ["drinks", "intoxication", "drink", "drunk", "partying", "wine", "drinking", "party", "drunk", "at party"], "difficult_direct_answer": true, "rationales": ["Both appear to hold cups of beer.", "People are in a crowded area holding beer in plastic cups. people tend to be less inhibited when drinking alcohol.", "They have alcohol in their cups"], "image": "train2014/COCO_train2014_000000294051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491736, "question_id": "W8HE4jmTR5kUvwbrUCerL6", "question": "What is the source of the item being drug by elephants?", "choices": ["animal", "water", "rock", "plant"], "correct_choice_idx": 3, "direct_answers": ["trees", "wood", "wood", "tree", "trees", "tree", "trees", "forest", "plant", "rope"], "difficult_direct_answer": false, "rationales": ["Elephants are hauling a log.", "The animals are dragging wood from a tree.", "The item is a tree which was once planted."], "image": "train2014/COCO_train2014_000000491736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290305, "question_id": "W8VmFfaKSyc6crZErb8izp", "question": "The shape of the paragliding inflatable wing is?", "choices": ["round", "circular", "triangle", "elliptical"], "correct_choice_idx": 3, "direct_answers": ["curved", "curved rectangle", "banana peel", "elliptical", "oblong", "crescent", "curved", "birds", "oblong", "moon"], "difficult_direct_answer": false, "rationales": ["This is the only shape that will use the wind correctly to pick up the person.", "The shape of the wing is an ellipsis.", "The shape of the paragliding wing is elliptical so it is able to take off with the wind"], "image": "train2014/COCO_train2014_000000290305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72923, "question_id": "W8biFmszaTt8gsLCKk8XCe", "question": "What's the weather like for these skiers?", "choices": ["clear", "stormy", "cloudy", "rainy"], "correct_choice_idx": 0, "direct_answers": ["cold", "clear", "sunny", "cold", "sunny", "nice", "clear", "clear", "clear skies", "perfect"], "difficult_direct_answer": false, "rationales": ["There is not a cloud in the sky.", "The sky is so blue and no clouds at all.", "The weather is clear."], "image": "train2014/COCO_train2014_000000072923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297191, "question_id": "W8eqs4dboC2JBg6qWkJDQf", "question": "What major type of infrastructure is located close by?", "choices": ["train station", "parking garage", "airport", "bus terminal"], "correct_choice_idx": 2, "direct_answers": ["airport", "highway", "airport", "airport", "airport", "airport", "water", "aviation", "airport", "airport"], "difficult_direct_answer": false, "rationales": ["A large commercial aircraft is parked in an open, cemented area.", "There is an airplane in the background.", "There is an airport located close by to the fire hydrant."], "image": "train2014/COCO_train2014_000000297191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381629, "question_id": "W8kgAdGe2jti232p3hNsAS", "question": "What is the electronic device in this room used for?", "choices": ["cooling", "watching", "listening", "computing"], "correct_choice_idx": 1, "direct_answers": ["entertainment", "viewing", "music", "watching television", "watch tv", "television", "watching content", "tv", "watching tv", "watching"], "difficult_direct_answer": true, "rationales": ["The electronic is tv used to watch different programme.", "The device is for watching.", "It is a television"], "image": "train2014/COCO_train2014_000000381629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166358, "question_id": "W8tJdkcnB3pLfLjmu7gANA", "question": "What are the overhead cables for?", "choices": ["guiding skiers", "carry electricity", "decorative only", "carry skiers"], "correct_choice_idx": 3, "direct_answers": ["electricity", "carry skiers", "ski lifts", "carrying people", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["The cables are for the ski lift which carries skiiers.", "Ski slopes have ski lifts that take people from the bottom of the slope to the top", "Skiers need to jump from a lift to begin snow skiing."], "image": "val2014/COCO_val2014_000000166358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285755, "question_id": "W9ESZqtrh7qWXT5obW2jxY", "question": "How many different breeds of cows are shown here?", "choices": ["three", "five", "one", "six"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The cows are together and they all look the same.", "There is one type.", "All of the cows look the same. they belong to the same breed."], "image": "train2014/COCO_train2014_000000285755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344040, "question_id": "W9ZYEjyc6Xtr3h4sKx5jku", "question": "Where is this event most likely being held?", "choices": ["military base", "public park", "college campus", "convention center"], "correct_choice_idx": 2, "direct_answers": ["school", "museum", "airplane event", "aviation fair", "museum", "college campus", "air show", "airport", "vermont", "parking lot"], "difficult_direct_answer": true, "rationales": ["People have backpacks with them.", "The people are young adults. the building in the back looks like a school.", "There are a lot of bikes nearby a large building"], "image": "train2014/COCO_train2014_000000344040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343708, "question_id": "W9pBkPqpTky4oPno4ucUYE", "question": "What is on the lift raised in front of the clock tower?", "choices": ["oil can", "milk jug", "car tire", "cement block"], "correct_choice_idx": 1, "direct_answers": ["milk jug", "container", "bucket", "boom", "tank", "scaffold", "basket", "crane", "platform", "container"], "difficult_direct_answer": true, "rationales": ["The lift has a milk jug on it.", "It's actually an old fashioned a pail.", "A jug is placed on the interior of the lift."], "image": "val2014/COCO_val2014_000000343708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419017, "question_id": "WA8gQMgk2mURd4gSgsMSEZ", "question": "Most of the visible trees here are what?", "choices": ["pine", "hardwood", "softwood", "evergreen"], "correct_choice_idx": 0, "direct_answers": ["pines", "pine trees", "pine", "pine", "pine trees", "pine", "pine trees", "pine", "bare", "pine"], "difficult_direct_answer": false, "rationales": ["They are shaped like pine trees.", "They are straight up with uniform branches.", "The trees are pines."], "image": "train2014/COCO_train2014_000000419017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451008, "question_id": "WAJAS4btCSXsYA8KRMtEFs", "question": "Why is the man spreading a substance on his surf board?", "choices": ["sun protection", "bug repellent", "grip", "water protection"], "correct_choice_idx": 2, "direct_answers": ["protection", "grip", "waxing", "protection", "waxing", "increase grip", "wax", "waxing", "protection", "increase grip"], "difficult_direct_answer": false, "rationales": ["The substance makes it easier to hold onto the board.", "The man is putting wax on his board.", "This is wax that helps them stay on the board in water"], "image": "val2014/COCO_val2014_000000451008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561735, "question_id": "WATAY6Uo6UtMUmzw4AQpFE", "question": "What language is probably spoken in this locale?", "choices": ["spanish", "italian", "french", "portuguese"], "correct_choice_idx": 2, "direct_answers": ["german", "spanish", "french", "french", "french", "french", "spanish", "german", "french", "spanish"], "difficult_direct_answer": false, "rationales": ["The signs are in the french language.", "The language on the signs is in french.", "Signs with french words are outside of a building."], "image": "train2014/COCO_train2014_000000561735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126606, "question_id": "WATgYN7uAshyCBEnsNZvY3", "question": "In what group are the people with Black Racing jackets?", "choices": ["school class", "family", "club", "volunteers"], "correct_choice_idx": 2, "direct_answers": ["three bikes", "motorcycle club", "club", "motorcycle club", "motorcycle", "bikers", "motorcycle group", "motorcycle", "racing", "motorcycle gang"], "difficult_direct_answer": false, "rationales": ["They are in a club and all have matching jackets so you know they are all together.", "They are all in a biker gang and wear the same jackets.", "The people are in a club because they are wearing jackets where their affiliation, racing, is written"], "image": "val2014/COCO_val2014_000000126606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38701, "question_id": "WAYUuBMqa3xpV2fSu68ZGL", "question": "What will probably be added to this food?", "choices": ["cherry", "butter", "frosting", "spices"], "correct_choice_idx": 2, "direct_answers": ["icing", "icing", "icing", "frosting", "frosting", "frosting", "icing", "icing", "frosting", "icing"], "difficult_direct_answer": false, "rationales": ["Frosting is added.", "Cakes get frosted.", "This is a cake that someone will add a toping to."], "image": "val2014/COCO_val2014_000000038701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399415, "question_id": "WAd3TcdjoBSTpWE9t6FAYj", "question": "What type of footwear are the two wearing?", "choices": ["crocs", "cleats", "boots", "martins"], "correct_choice_idx": 1, "direct_answers": ["spiked shoes", "cleats", "soccer shoes", "cleats", "cleats", "sneakers", "cleats", "cleats", "cleats", "cleats"], "difficult_direct_answer": false, "rationales": ["The men are wearing cleats.", "You can tell by the prongs sticking from the bottom of there shoes.", "The footwear is cleats."], "image": "val2014/COCO_val2014_000000399415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425447, "question_id": "WAoEJ5xUt6wjtLifMBSu7v", "question": "What does he do?", "choices": ["parks vehicles", "cleans trucks", "haircuts", "landscaping"], "correct_choice_idx": 3, "direct_answers": ["construction", "drive", "4 wheeling", "fire protection", "drive truck", "plumbing", "construction", "landscaping", "check hydrant", "works"], "difficult_direct_answer": true, "rationales": ["The man takes care of trees.", "The name of the company is on the side of the truck.", "He has a water jug for working outside"], "image": "val2014/COCO_val2014_000000425447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490099, "question_id": "WBQFARtjajJtLnKoVuZHJB", "question": "What will the woman with the empty purple plate eat?", "choices": ["roast duck", "sandwich", "deviled eggs", "egg plant"], "correct_choice_idx": 1, "direct_answers": ["sandwich", "sandwich", "sandwiches", "sandwich", "sandwiches", "sandwich", "sandwiches", "sandwich", "sandwiches", "sandwich"], "difficult_direct_answer": false, "rationales": ["The woman with the purple plate is picking out a sandwich.", "She'll have a sandwich.", "She is reaching to grab a sandwich from a tray."], "image": "train2014/COCO_train2014_000000490099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40205, "question_id": "WBWL8Gjtx37MHAhQFNwxr8", "question": "What course is being served?", "choices": ["dessert", "entree", "soup", "salad"], "correct_choice_idx": 1, "direct_answers": ["dinner", "entree", "cooking", "cooking course", "main", "home science", "first", "entree", "dinner", "entree"], "difficult_direct_answer": false, "rationales": ["Soup would be all liquid in a bowl, salad would have lettuce and other vegetables on it, and a dessert would be a sweet.", "There is a protein, vegetables and a starch.", "Entrees are being served."], "image": "train2014/COCO_train2014_000000040205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36951, "question_id": "WBZiUsEQrmvMsC5q2no8jM", "question": "What is sh doing?", "choices": ["eating", "posing", "sleeping", "resting"], "correct_choice_idx": 1, "direct_answers": ["posing", "posing", "lying down", "reclining pose", "posing", "staring", "looking", "posing", "chilling", "lying down"], "difficult_direct_answer": false, "rationales": ["She is laying on one arm with the other arm cocked over her waist.", "She is posing for a photograph.", "She is staying still and looking at the camera"], "image": "train2014/COCO_train2014_000000036951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194421, "question_id": "WBcaddt95zgUJPyKnqLsFe", "question": "What type of skate maneuver is the boy in white performing?", "choices": ["flip trick", "nose slide", "grind", "grab"], "correct_choice_idx": 0, "direct_answers": ["flip", "flip", "flip trick", "flip", "jump", "kickflip", "ollie", "wheelie", "jump", "skateboard trick"], "difficult_direct_answer": false, "rationales": ["The skateboarder is jumping up and has flipped his skateboard.", "It's a flip trick.", "It is where the skateboard spins."], "image": "train2014/COCO_train2014_000000194421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435686, "question_id": "WBekUCQ75QSyLT3Q5eidCE", "question": "These animals like to eat food from where?", "choices": ["pig pens", "garbage disposals", "high trees", "lakes"], "correct_choice_idx": 2, "direct_answers": ["trees", "trees", "high trees", "trees", "trees", "trees", "trees", "trees", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["Giraffes have long necks that allow them to eat from tall trees.", "They like to eat the leaves off the trees.", "These animals are giraffes, not fish or pigs. garbage disposals are not a source of food for wild animals."], "image": "train2014/COCO_train2014_000000435686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248163, "question_id": "WCPn7MQPzf47YezV3ZZHYS", "question": "Where is the fire hydrant mounted?", "choices": ["tree", "landscape", "sidewalk", "parking lot"], "correct_choice_idx": 1, "direct_answers": ["grass", "land", "ground", "planter", "roadside", "grass", "in bushes", "landscape", "ground", "rocks"], "difficult_direct_answer": false, "rationales": ["A fire hydrant is surrounded by bushes near the sidewalk.", "There are plants growing all around it.", "It is mounted to the ground next to the sidewalk."], "image": "val2014/COCO_val2014_000000248163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539056, "question_id": "WCXx3UGHnm7jUpx8Jd8Sg4", "question": "What is the oval thing on the desk called?", "choices": ["mouse", "phone", "mouse pad", "speakers"], "correct_choice_idx": 0, "direct_answers": ["mouse", "mouse", "mousepad", "mouse", "mouse", "mouse", "mousepad", "mouse", "computer mouse", "mouse"], "difficult_direct_answer": false, "rationales": ["The oval object is the mouse.", "It is a computer accessory with this name", "The oval thing is a mouse for a computer."], "image": "train2014/COCO_train2014_000000539056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162161, "question_id": "WCYTqLY9eyH2T857dYRvZy", "question": "The kites perform was motion in order to move across the sky?", "choices": ["they jump", "they skip", "they glide", "they catch"], "correct_choice_idx": 2, "direct_answers": ["swooping", "hover", "glide", "upward", "upward", "upward", "flying", "flying", "they glide", "fly"], "difficult_direct_answer": false, "rationales": ["The wind lifts the material in a smooth motion.", "The kites glide.", "The kites glide through the air."], "image": "train2014/COCO_train2014_000000162161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50431, "question_id": "WCdY4CNKyVUj8KJQtQ3iHT", "question": "Why are there horses in front of the carriage?", "choices": ["to pull", "to pet", "to ride", "to eat"], "correct_choice_idx": 0, "direct_answers": ["pulls carriage", "transporting", "to pull", "pulling carriage", "pulling cart", "pulling", "romantic", "pulling cart", "pulling it", "to pull"], "difficult_direct_answer": false, "rationales": ["They are puling the carriage.", "The horses are pulling on the carriage.", "The carriage is being moved by the horses."], "image": "val2014/COCO_val2014_000000050431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451679, "question_id": "WCfPFCKskw4WVFpSNCKnxt", "question": "In what location was this oven built?", "choices": ["here", "mexican factory", "mall", "farm"], "correct_choice_idx": 0, "direct_answers": ["here", "backyard", "outdoors", "backyard", "patio", "africa", "outside", "kiln", "trailer", "outside"], "difficult_direct_answer": false, "rationales": ["The oven is stuck in place.", "It would be too heavy to move so it is a fixed oven", "This oven was built on site because it is too large and heavy to move and might break during transport."], "image": "val2014/COCO_val2014_000000451679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240836, "question_id": "WDRmyWW6cbLuZgKiRoz8Fw", "question": "The pattern on this animal most closely resembles the pattern on what other animal?", "choices": ["robin", "donkey", "cheetah", "zebra"], "correct_choice_idx": 2, "direct_answers": ["giraffe ostrich", "cheetah", "giraffe", "cheetah", "leopard", "leopard", "cheetah", "leopard", "leopard", "tiger"], "difficult_direct_answer": false, "rationales": ["The animal most closely resembles a cheetah.", "The animal is a giraffe. it does not have black and white stripes, brown and white areas, or brown and red areas.", "This spotted pattern does not exist on a zebra as they are striped. a donkey and a robin have more of a plain coloring without designs."], "image": "train2014/COCO_train2014_000000240836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31984, "question_id": "WDbuY7R9fVaEcd5guZHJt4", "question": "What company seems to have sponsored this event?", "choices": ["honda", "toyota", "yamaha", "sony"], "correct_choice_idx": 2, "direct_answers": ["yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha", "yamaha"], "difficult_direct_answer": false, "rationales": ["The company's name appears on the back wall.", "The logo is on the bike.", "The company is yamaha."], "image": "val2014/COCO_val2014_000000031984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326690, "question_id": "WDvsomCE6jqCLhgkNRhdUk", "question": "Why is the vehicle braking?", "choices": ["police", "traffic", "animals", "fast food"], "correct_choice_idx": 2, "direct_answers": ["elephants", "elephants", "elephants", "elephant", "elephants", "elephants", "elephants", "elephants", "elephants", "animals"], "difficult_direct_answer": false, "rationales": ["These animals are strong and dangerous, so stopping around them is smart.", "The vehicle stops for animals.", "Elephants are walking in the street behind a car."], "image": "train2014/COCO_train2014_000000326690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292296, "question_id": "WEAg6KteLoW8k3yV9GCeWU", "question": "Why did the bike riders stop?", "choices": ["to sleep", "to rest", "to eat", "to dance"], "correct_choice_idx": 2, "direct_answers": ["have picnic", "to eat", "to eat", "eat", "eat", "hungry", "eat", "have lunch", "for lunch", "have lunch"], "difficult_direct_answer": false, "rationales": ["As indicated by the food between them.", "The people were hungry.", "They are taking a break to grab some food because they are hungry."], "image": "train2014/COCO_train2014_000000292296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228910, "question_id": "WEBPwZ3gUx5USpWBgSgSrA", "question": "What hobby might the person shown here have?", "choices": ["competitive eating", "photography", "weight lifting", "gaming"], "correct_choice_idx": 1, "direct_answers": ["photography", "photographer", "photography", "photography", "photography", "photography", "photographer", "photography", "photography", "taking photos"], "difficult_direct_answer": false, "rationales": ["The person has a camera hanging around their neck.", "The person has a camera around their neck.", "It looks like she has a camera around her neck."], "image": "val2014/COCO_val2014_000000228910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14472, "question_id": "WETXxFcTNXiCyCje4bH4A2", "question": "Where is the blue item most likely to be used?", "choices": ["rwanda", "london", "cameroon", "egypt"], "correct_choice_idx": 1, "direct_answers": ["rainy areas", "rain", "rain", "rain", "outside", "outside", "block rain", "rainy day", "outdoors", "london"], "difficult_direct_answer": false, "rationales": ["It rains a lot there", "The item is from london.", "Not sure what city this is but it's to keep out of the rain."], "image": "train2014/COCO_train2014_000000014472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14698, "question_id": "WEayqCfxaTb5abK7Rx6DpQ", "question": "What type of building might that be?", "choices": ["grocery store", "dealership", "school", "department store"], "correct_choice_idx": 0, "direct_answers": ["grocery store", "residential building", "store", "restaurant", "grocery store", "scatting", "grocery", "museum", "grocery store", "storey"], "difficult_direct_answer": false, "rationales": ["The building has pictures of fruit on it.", "There are pictures of produce on it", "Photos of food are on the sides of the building."], "image": "train2014/COCO_train2014_000000014698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547509, "question_id": "WEbF3vsdxrCyDVPBV66XFt", "question": "What is the relationship between the two people?", "choices": ["sisters", "teammates", "coworkers", "strangers"], "correct_choice_idx": 1, "direct_answers": ["teammates", "friends", "teammates", "friends", "teammates", "teammates", "teammates", "teammates", "friends", "teammates"], "difficult_direct_answer": false, "rationales": ["The girls are wearing baseball shirts.", "The two girls have the same shirt and baseball caps on.", "The relationship is teammates."], "image": "train2014/COCO_train2014_000000547509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264155, "question_id": "WEedDbXeWc7gGw3fQDMnpo", "question": "What is this type of pizza called?", "choices": ["pepperoni pizza", "hawaiian pizza", "vegan pizza", "cheese pizza"], "correct_choice_idx": 3, "direct_answers": ["cheese pizza", "margherita", "cheese", "cheese pizza", "cheese pizza", "cheese pizza", "cheese", "cheese", "cheese", "cheese pizza"], "difficult_direct_answer": false, "rationales": ["There are no additional toppings on the pizza.", "There are no toppings besides cheese.", "The pizza has cheese."], "image": "val2014/COCO_val2014_000000264155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424412, "question_id": "WFPQjuqgYB5FTURkcay4kb", "question": "What food place did the children get their food from?", "choices": ["wendys", "starbucks", "mcdonalds", "dunkin donuts"], "correct_choice_idx": 1, "direct_answers": ["burger", "starbucks", "starbucks", "starbucks", "starbucks", "burger", "starbucks", "starbucks", "burger", "starbucks"], "difficult_direct_answer": false, "rationales": ["There is the whipped-coffee-in-plastic cup next to the girl.", "The food place is starbucks.", "The bag has a mermaid logo on it."], "image": "train2014/COCO_train2014_000000424412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23446, "question_id": "WFuHfMkjqNgN9QYdNv6HQ6", "question": "What brand could the red sticker on the laptop stand for?", "choices": ["mars", "welch's", "gatorade", "brach's"], "correct_choice_idx": 2, "direct_answers": ["gatorade", "gatorade", "gatorade", "gatorade", "gatorade", "gatorade", "google", "gatorade", "apple", "gay"], "difficult_direct_answer": false, "rationales": ["A sticker with a large g on it is on the back of a laptop.", "It is a drink logo", "The logo is the letter \"g\"."], "image": "val2014/COCO_val2014_000000023446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450860, "question_id": "WFuxRRAyRYHaiMCVACZLiT", "question": "Why does the man stand on one leg?", "choices": ["running bases", "pitching ball", "stork pose", "yoga move"], "correct_choice_idx": 1, "direct_answers": ["pitch ball", "pitching", "balance", "pitching", "released pitch", "pitching ball", "to pitch", "balance", "pitching", "pitching"], "difficult_direct_answer": false, "rationales": ["The man is pitching.", "The person is in a baseball uniform and is standing on the pitcher's mound.", "The power of his throw moves his weight forward throwing up his back leg."], "image": "train2014/COCO_train2014_000000450860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8536, "question_id": "WGRYptQSc9edwuAsX2Bjqv", "question": "What kind of residence is this?", "choices": ["mansion", "apartment", "villa", "school"], "correct_choice_idx": 2, "direct_answers": ["beach house", "beach house", "farmhouse", "villa", "house", "house", "country home", "ranch", "hotel", "beach"], "difficult_direct_answer": false, "rationales": ["It looks like a villa where you would stay on vacation.", "It's a villa.", "This is a large residential home that is detached."], "image": "train2014/COCO_train2014_000000008536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485788, "question_id": "WGRZVZtixbiuPxDiyUT766", "question": "People sit in what area?", "choices": ["alleyway", "stripper bar", "public park", "mall"], "correct_choice_idx": 3, "direct_answers": ["inside courtyard", "lobby", "ballroom", "dining area", "lobby", "lounge", "waiting area", "dining", "mall", "tables/chairs"], "difficult_direct_answer": true, "rationales": ["The people are seated in an area of a mall where people can sit down", "There are windows with clothes in them", "There are shops near the people."], "image": "train2014/COCO_train2014_000000485788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409985, "question_id": "WH5YEjvW5YUfC9iXMmBwcT", "question": "What is the man saying with his hand gestures?", "choices": ["situation approval", "angry", "hello", "hatred"], "correct_choice_idx": 0, "direct_answers": ["approval", "happy", "situation approval", "it's good", "good", "good", "thumbs up", "awesome", "i approve", "love it"], "difficult_direct_answer": true, "rationales": ["This is a thumbs-up which means you like something", "The man is posing for a photo with his thumbs up. this is a gesture known to be of approval and especially when doubled, matched with a smile, and done for a photo.", "This is a thumbs up which means someone likes something"], "image": "val2014/COCO_val2014_000000409985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320978, "question_id": "WHAcW86Rp3nNPYxqaZRkLw", "question": "Which vegetable is likely the most expensive item by piece or pound?", "choices": ["artichoke", "celery", "corn", "turnips"], "correct_choice_idx": 0, "direct_answers": ["brussel sprouts", "brussel sprouts", "artichoke", "tomatoes", "tomato", "artichokes", "artichoke", "brussell sprouts", "brussel sprouts", "pepper"], "difficult_direct_answer": false, "rationales": ["The most expensive item is artichokes.", "The veggie is artichokes.", "Artichokes are known for being gourmet."], "image": "val2014/COCO_val2014_000000320978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406710, "question_id": "WHBGq3YMYMWfiRi3nxsjSo", "question": "Why is the woman on the left standing behind the table of pastries?", "choices": ["she's buying", "she's looking", "she's selling", "she's baking"], "correct_choice_idx": 2, "direct_answers": ["selling them", "saleswoman", "she's selling", "selling them", "salesperson", "she's serving", "selling them", "serving them", "serving them", "seller"], "difficult_direct_answer": false, "rationales": ["The man on the opposite side is taking money out of his wallet to buy the food from the woman, while she is holding the bag of pastries to give to the man after he pays.", "The woman is selling.", "She is talking to a customer that is ready to pay her for the pastries."], "image": "train2014/COCO_train2014_000000406710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224554, "question_id": "WHCztEMDNMrF3SHhRDWSyB", "question": "What wrestler is named after the long item with the wing logo?", "choices": ["tugboat", "tank abbott", "a-train", "refrigerator perry"], "correct_choice_idx": 2, "direct_answers": ["trainwreck", "hawk", "ultimate warrior", "tom train", "moth", "hawk", "ice train", "trainwreck mcfarklin", "a-train", "mark benz"], "difficult_direct_answer": true, "rationales": ["A-train is the name of the wrestler.", "Mathew bloom was known as a-train which is what the first car would be.", "The wrestler is a train."], "image": "val2014/COCO_val2014_000000224554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495626, "question_id": "WHatdcUUosMT5aXHAEzfkU", "question": "What type of street sign is near the people crossing?", "choices": ["u turn", "pedestrian crossing", "billboard", "elephant crossing"], "correct_choice_idx": 1, "direct_answers": ["walk", "busy", "not visible", "one way", "road name", "traffic lights", "pedestrian crossing", "parking", "street name", "crosswalk"], "difficult_direct_answer": true, "rationales": ["There is a crossing by the cars.", "A pedestrian crossing allows people to cross without accidents.", "The street crossing must be governed by a pedestrian crossing."], "image": "val2014/COCO_val2014_000000495626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393027, "question_id": "WHoP8mh9YSxBJjGGRFkHpr", "question": "What handedness does the batter here exhibit?", "choices": ["both", "none", "right", "left"], "correct_choice_idx": 3, "direct_answers": ["left", "open", "left", "left", "left", "right", "left", "right", "left", "left"], "difficult_direct_answer": false, "rationales": ["The baseball player is a left-handed batter.", "This batter exhibits a left handed bat.", "The man has right hands."], "image": "train2014/COCO_train2014_000000393027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406530, "question_id": "WHwCuKAMU5AqvGbqQ59pDy", "question": "What is the bear here doing?", "choices": ["running", "hanging", "eating", "jumping"], "correct_choice_idx": 1, "direct_answers": ["hanging out", "hanging", "hanging", "sleep", "hanging", "walking", "hanging purse", "hanging", "string suspension", "hanging around"], "difficult_direct_answer": false, "rationales": ["Strings are used to tie the bear up. the strings are attached to a point that is higher than the bear.", "It is attached to strings", "It is suspended by strings"], "image": "train2014/COCO_train2014_000000406530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182227, "question_id": "WJDzxvEDT5qQYbtAEo82oE", "question": "What is the maximum speed of this type of animal in miles per hour?", "choices": ["15", "25", "40", "55"], "correct_choice_idx": 3, "direct_answers": ["fifty", "many miles", "55", "ten", "55 mph", "fifty five", "twenty", "40", "55", "twenty"], "difficult_direct_answer": false, "rationales": ["The horse can run at 55 miles an hour.", "The animal is a horse. it has a maximum speed of about 88 kilometers per hour.", "They are used in racing"], "image": "val2014/COCO_val2014_000000182227.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390125, "question_id": "WJEdQzMmAeVPmFEM4xLyJq", "question": "What are in the tank against the wall?", "choices": ["frog", "lobster", "octopus", "fish"], "correct_choice_idx": 3, "direct_answers": ["fish", "fish", "fish", "fish", "fishes", "fish", "fish", "fish", "fish tank", "fish"], "difficult_direct_answer": false, "rationales": ["There are many orange and white common aquatic animals of the same species swimming in the tank.", "The tank has fish.", "That is an aquarium"], "image": "train2014/COCO_train2014_000000390125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217009, "question_id": "WJQFA8tS5KwrjQnoVUb643", "question": "What vehicle on the north side of the street will the car traveling west pass second?", "choices": ["motorcycle", "light 4-door", "white van", "black 4-door"], "correct_choice_idx": 2, "direct_answers": ["black", "small sedan", "black car", "white van", "van", "unclear north", "car", "minivan", "black car", "white van"], "difficult_direct_answer": false, "rationales": ["The white van is going to be passed second.", "It is behind the dark car that would be considered first.", "The vehicle is a white van."], "image": "val2014/COCO_val2014_000000217009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197273, "question_id": "WJYwRaDPAwDKN35kJENMmE", "question": "Who might this man be?", "choices": ["officer", "doctor", "teacher", "parent"], "correct_choice_idx": 3, "direct_answers": ["father", "father", "father", "father", "father", "father", "father", "her father", "her father", "parent"], "difficult_direct_answer": false, "rationales": ["The man is holding the girl with care.", "The man is a parent.", "The man is the dad of the little girl."], "image": "train2014/COCO_train2014_000000197273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428885, "question_id": "WJzRXmrvFYAw3rRDnho9yQ", "question": "What is the man doing to the building?", "choices": ["remodeling", "cleaning", "adding on", "breaking down"], "correct_choice_idx": 3, "direct_answers": ["destroying", "demolishing", "tearing apart", "demolishing", "demolishing", "breaking down", "demolition", "demolition", "repairing", "sledgehammering"], "difficult_direct_answer": false, "rationales": ["He is using a hammer to break apart parts of the building.", "The man is breaking down.", "The man is holding a sledge hammer and helping to demolish the building."], "image": "val2014/COCO_val2014_000000428885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231631, "question_id": "WKCW4QBFTDpkvUeJJPCjAD", "question": "What kind of pattern is the road?", "choices": ["square", "flat", "bumpy", "black"], "correct_choice_idx": 1, "direct_answers": ["wet", "plain", "square", "none", "asphalt", "checkered", "solid black", "flat", "rainy", "wet"], "difficult_direct_answer": true, "rationales": ["The gradient of the road is clear and visible.", "A wet, flat road can be seen under produce goods.", "Tables can only be placed on a flat surface so things don't roll or fall off of them."], "image": "train2014/COCO_train2014_000000231631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285470, "question_id": "WKCdWVABuqtt9VckL4ELhu", "question": "What type of dish is the bird drinking from?", "choices": ["bowl", "saucer", "plate", "cup"], "correct_choice_idx": 3, "direct_answers": ["tea cup", "cup", "tea cup", "cup", "tea cup", "coffee cup", "coffee cup", "coffee cup", "tea cup", "cup"], "difficult_direct_answer": false, "rationales": ["The vessel shown is a cup since it's rounded and can hold some volume.", "This is what he drink tea or coffee out of", "The dish is a cup."], "image": "train2014/COCO_train2014_000000285470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490306, "question_id": "WKK8QwxNysTaKZG7jATVNd", "question": "The girl is positioning her body in the way a model does by doing what?", "choices": ["walking", "posing", "crying", "twirling"], "correct_choice_idx": 1, "direct_answers": ["shift weight", "hips", "posing", "posing", "pose", "hips", "throwing hip", "pose", "cat walking", "posing"], "difficult_direct_answer": false, "rationales": ["She's posing.", "The girl is striking a pose.", "Her hand is on her hip like a model would do."], "image": "val2014/COCO_val2014_000000490306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478759, "question_id": "WKKK8nVpSvJw52CFcjguQM", "question": "What leavening allows the dough to rise on this dish?", "choices": ["sour dough", "salt", "yeast", "none"], "correct_choice_idx": 2, "direct_answers": ["yeast", "yeast", "huh", "huh", "yeast", "yeast", "huh", "yeast", "yeast", "yeast"], "difficult_direct_answer": false, "rationales": ["This is the leavening agent used in this type of dough", "When using this in bread it will rise.", "The leavening is yeast."], "image": "val2014/COCO_val2014_000000478759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365802, "question_id": "WKTZ2SdAmqZWu7qFqRQpe7", "question": "What is the author creating?", "choices": ["novel", "poem", "dictionary", "essay"], "correct_choice_idx": 0, "direct_answers": ["book", "novel", "program", "document", "book", "novel", "research paper", "novel", "text document", "novel"], "difficult_direct_answer": false, "rationales": ["He's writing a book.", "The format of the text on the screen looks similar to what one would find in a book.", "The sign advertises a month celebrating a specific type of writing."], "image": "train2014/COCO_train2014_000000365802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56116, "question_id": "WKcsFyaNZ4VtTy2cHM3DkQ", "question": "Where is the white cup in front of the second chair from the left from?", "choices": ["wal mart", "dunkin", "wawa", "starbucks"], "correct_choice_idx": 3, "direct_answers": ["guests", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "coffee"], "difficult_direct_answer": false, "rationales": ["The logo is on the cup.", "It has the green logo on it for this company", "There is a logo visibly on the cup. companies put their logo on the cups when you purchase products from their stores and this logo is from starbucks."], "image": "train2014/COCO_train2014_000000056116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44267, "question_id": "WKskSS6ZrBwtKDRGBoMXdm", "question": "What white fluid is often found here?", "choices": ["milk", "semen", "paint", "conditioner"], "correct_choice_idx": 0, "direct_answers": ["milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["The white fluid found in a refrigerator would be milk.", "Milk is the most common white liquid in a fridge. it's a staple food and many people have it.", "The fluid is milk."], "image": "train2014/COCO_train2014_000000044267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254357, "question_id": "WLCkUDBjRNKpsBwMHnnpoC", "question": "What country is this spot in?", "choices": ["united states", "australia", "italy", "britain"], "correct_choice_idx": 1, "direct_answers": ["australia", "australia", "country", "australia", "australia", "england", "australia", "australia", "england", "england"], "difficult_direct_answer": false, "rationales": ["The country is australia.", "The flag is on the top of the trolley", "There is an australian flag raised up."], "image": "train2014/COCO_train2014_000000254357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427070, "question_id": "WLRGnXmkyukiMwQHWEiwkZ", "question": "What does the kneeling person hold in their hand?", "choices": ["mop", "kite string", "dog leash", "rattle"], "correct_choice_idx": 1, "direct_answers": ["kite string", "kite string", "kite string", "kite string", "child", "child", "kite", "string", "kite", "kite string"], "difficult_direct_answer": false, "rationales": ["There is a kite flying above the person.", "The person has a string.", "The person has a kite string in their hand."], "image": "val2014/COCO_val2014_000000427070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562474, "question_id": "WLyiDUnpCcdBPSpZVwomus", "question": "What country is the black and white bear's real version from?", "choices": ["russia", "china", "peru", "japan"], "correct_choice_idx": 1, "direct_answers": ["india", "china", "australia", "china", "china", "united states", "china", "china", "china", "china"], "difficult_direct_answer": false, "rationales": ["Panda bears are from this large asian country.", "China is where pandas come from.", "This specific bear design and book is originated from china."], "image": "train2014/COCO_train2014_000000562474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222322, "question_id": "WMZsXVnhZw69RGdS23uHmW", "question": "Where are all the chairs setup?", "choices": ["on beach", "in park", "near lake", "backyard"], "correct_choice_idx": 0, "direct_answers": ["beach", "beach", "beach", "beach", "beach", "beach", "beach", "on beach", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The chairs are all on top of sand and facing the water.", "Multiple lines of chairs are set up in a sandy area with a large body of water visible in the background.", "As indicated by the sand and ocean."], "image": "val2014/COCO_val2014_000000222322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544485, "question_id": "WMnTLWL2TS64d3mEgXw8Yp", "question": "What is he doing?", "choices": ["playing baseball", "practicing", "posing", "selling bat"], "correct_choice_idx": 2, "direct_answers": ["posing", "sitting", "taking picture", "resting", "holding bat", "sitting", "resting", "sitting", "sitting", "sitting"], "difficult_direct_answer": false, "rationales": ["The man is posing.", "The man is striking a pose for the camera.", "His legs are crossed in a chair while holding a bat."], "image": "train2014/COCO_train2014_000000544485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157981, "question_id": "WN3WfGgwYCCZSbwyJ9kNdG", "question": "What is the red item inside the holder?", "choices": ["comb", "floss", "razor", "toothbrush"], "correct_choice_idx": 0, "direct_answers": ["comb", "comb", "comb", "comb", "comb", "comb", "comb", "comb", "comb", "comb"], "difficult_direct_answer": false, "rationales": ["It has one row of plastic teeth.", "The red item has a shaft and teeth. it is near hair brushes and has a similar use case.", "There is a comb in the toothbrush holder."], "image": "train2014/COCO_train2014_000000157981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404395, "question_id": "WN99wQXnFzzoH74ekU35PL", "question": "The number of figures on the TV match the number of members of what band?", "choices": ["nirvana", "green day", "beatles", "hall oates"], "correct_choice_idx": 2, "direct_answers": ["partridge family", "beatles", "metallica", "bts", "beatles", "beatles", "beatles", "beatles", "blue man", "beatles"], "difficult_direct_answer": false, "rationales": ["This band was called the fab four.", "There are four figures on the tv. john lennon, paul mccartney, george harrison, and ringo starr are four people.", "There are 4 people on the screen and also 4 in the fab four."], "image": "train2014/COCO_train2014_000000404395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272426, "question_id": "WNByBQmsvT9aHLJB8jBXdX", "question": "What kind of bread is this?", "choices": ["corn", "matzoh", "sliced", "pita"], "correct_choice_idx": 3, "direct_answers": ["pita", "pita", "flatbread", "pita", "panera", "pita", "pita", "sandwich", "pita", "pita"], "difficult_direct_answer": false, "rationales": ["It is similar to that of a taco in which it has the bread thing with lettuce and tomatoes.", "The bread is pita.", "The bread is flat and surrounds gyro ingredients."], "image": "train2014/COCO_train2014_000000272426.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440546, "question_id": "WNPpFTLgCs7bufXMcpfuUm", "question": "What are the two white people riding the elephant called?", "choices": ["residents", "tourists", "locals", "natives"], "correct_choice_idx": 1, "direct_answers": ["tourists", "harness", "tourists", "tourists", "tourists", "passengers", "tourists", "parents", "tourists", "tourists"], "difficult_direct_answer": false, "rationales": ["Tourists often visit other countries and go on elephant rides while there.", "The people are tourists.", "That is an indian elephant and the indian man is leading it. the riders are kind of sight-seeing."], "image": "train2014/COCO_train2014_000000440546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420054, "question_id": "WNTdT7P22RyxKjPpfqcoeL", "question": "The person wearing what color of shirt officiates the game?", "choices": ["blue", "white", "grey", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "umpire", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The color of the clothes indicates the person is a neutral third party.", "The ump is wearing black.", "Baseball umpires always wear black."], "image": "train2014/COCO_train2014_000000420054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478407, "question_id": "WNZdZf7ZRWPEEpUiLeH6nM", "question": "What side dish does the man have with his meal?", "choices": ["salad", "enchiladas", "tacos", "water"], "correct_choice_idx": 0, "direct_answers": ["salad", "salad", "salad", "salad", "salad", "salad", "salad", "grapes", "grapes", "salad"], "difficult_direct_answer": false, "rationales": ["There is a plastic container beside his plate. it contains lettuce.", "The man has a sandwich in his hand as his main course and to the side he has a green salad to accompany his sandwich.", "There is a container of chopped vegetables and a sauce"], "image": "val2014/COCO_val2014_000000478407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561152, "question_id": "WNsw49PnePerywBKs9HyRX", "question": "Which of this animals body part is trying to grasp food here?", "choices": ["nose", "tusk", "mouth", "tail"], "correct_choice_idx": 0, "direct_answers": ["trunk", "trunk", "nose", "trunk", "trunk", "trunk", "trunk", "trunk", "nose", "trunk"], "difficult_direct_answer": false, "rationales": ["An elephant's nose is reaching out.", "The elephant is using its trunk to grab the food. the trunk of an elephant contains the nostrils.", "The nose is also known as the truck, which is the largest and long appendage that can grab and eat food."], "image": "train2014/COCO_train2014_000000561152.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363752, "question_id": "WP42BTN3oCV65Tfz4fE4rM", "question": "Why is he wearing gloves?", "choices": ["health", "fashion", "grip", "warmth"], "correct_choice_idx": 2, "direct_answers": ["grip", "to bat", "better grip", "grip", "holding bat", "grip", "good grip", "extra grip", "better grip", "grip"], "difficult_direct_answer": false, "rationales": ["He's gripping.", "The baseball player does not want the bat slipping out of his hands as he swings.", "A man is in a baseball uniform and swinging a bat with his gloved hands."], "image": "train2014/COCO_train2014_000000363752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498645, "question_id": "WP4DQrvQeHtY8DzY7WkXLR", "question": "The emblem/mascot of the team of number 16 here is what type of creature?", "choices": ["bird", "wolf", "serpent", "cow"], "correct_choice_idx": 0, "direct_answers": ["bird", "bird", "bird", "cardinal", "bird", "bird", "cardinal", "bird", "bird", "cougar"], "difficult_direct_answer": false, "rationales": ["The team's mascot is a small, red bird.", "Cardinals are birds.", "Cardinal is a type of this animal"], "image": "train2014/COCO_train2014_000000498645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437955, "question_id": "WP9aizYmnmHAhdeVYH4nNa", "question": "The woman is controlling her balance by doing what with her legs?", "choices": ["on tiptoes", "running", "toe touches", "crossing them"], "correct_choice_idx": 3, "direct_answers": ["moving", "crossing", "spacing them", "bending", "skiing", "bending", "crossing them", "leaning", "using", "crossing"], "difficult_direct_answer": false, "rationales": ["Balancing is very difficult on water skis so the woman must be doing everything she can to stay balanced, including crossing her legs.", "The woman is crossing.", "The woman is crossing her legs to control her balance on the water."], "image": "train2014/COCO_train2014_000000437955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337675, "question_id": "WPMMQ4J2Sq3VRzgKTS7EaM", "question": "Why is his racquet behind his head?", "choices": ["is falling", "is broken", "hit ball", "is confused"], "correct_choice_idx": 2, "direct_answers": ["for power", "serving", "serving ball", "serving", "elevation", "serve", "serving", "he's serving", "serving", "hit ball"], "difficult_direct_answer": false, "rationales": ["The racquet is for hitting.", "A tennis player is on the court with racket raised and a ball dropping down from in the air.", "The tennis player is standing normally and is aware of his surroundings. the racquet is not broken."], "image": "train2014/COCO_train2014_000000337675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199881, "question_id": "WPttk54npgZqvjsY37L4Xe", "question": "What kind of beard the man has?", "choices": ["royale", "goatee", "petite goatee", "circle"], "correct_choice_idx": 1, "direct_answers": ["van dyke", "goatee", "goatee", "goatee", "short", "circle beard", "goatee", "goatee", "goatee", "goatee"], "difficult_direct_answer": false, "rationales": ["The beard is partly shaved.", "This is the type of beard he has on his face.", "His beard is short, similar to how goats have their beards, thus the name."], "image": "val2014/COCO_val2014_000000199881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381925, "question_id": "WPvucH5rgFSFJuNMHnRdTQ", "question": "What does the girl hold?", "choices": ["rabbit", "dog", "teddy bear", "rat"], "correct_choice_idx": 1, "direct_answers": ["dog", "dog", "bag", "glove", "bear", "bags", "dog", "dog", "bag", "leaf"], "difficult_direct_answer": false, "rationales": ["You can see his paws in the picture of him holding.", "The girl has a puppy in her arms.", "The girl is holding a real animal. it is too big to be a rat or rabbit."], "image": "val2014/COCO_val2014_000000381925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492155, "question_id": "WQ7KoqTtmmEDJQb8JA49bD", "question": "What sound will he make?", "choices": ["meow", "baa", "woof", "quack"], "correct_choice_idx": 1, "direct_answers": ["baba", "mama", "baaaaa", "baa", "baa", "bah", "bahhh bleat", "baba", "baa", "mawwww"], "difficult_direct_answer": false, "rationales": ["Sheeps say baa.", "This is a sheep", "The animal in the image is a sheep."], "image": "train2014/COCO_train2014_000000492155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458637, "question_id": "WQAHRE3oHA8Cis5cgYUcms", "question": "What kind of venue is the athlete most likely performing in?", "choices": ["olympic", "park", "local rink", "ski resort"], "correct_choice_idx": 0, "direct_answers": ["slopes", "ski resort", "professional competition", "winter games", "snowboard competition", "olympic", "snow area", "x-games", "olympics", "ski resort"], "difficult_direct_answer": true, "rationales": ["The person is competing.", "The signage indicates major sponsors for a major event, which would be olympic in nature rather than local or small.", "There are lots of advertisements on the course"], "image": "train2014/COCO_train2014_000000458637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114414, "question_id": "WQBrbizYdj7AtCb5o8LXHm", "question": "What is number 2 going to do now?", "choices": ["sit", "return home", "hit someone", "run"], "correct_choice_idx": 3, "direct_answers": ["hitting ball", "run", "to catch", "run", "run", "run base", "bat again", "run", "run", "run"], "difficult_direct_answer": false, "rationales": ["A baseball player is swinging the bat from home plate of a baseball diamond.", "The batter always tries to get to first base before the ball does.", "The bat is in the swung position, so he has to go towards the bases so he won't be \"out\""], "image": "val2014/COCO_val2014_000000114414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19025, "question_id": "WQqdTpMbacR82FGZpA5Jaa", "question": "In which country's streets does this bus travel?", "choices": ["briton", "usa", "spain", "portugal"], "correct_choice_idx": 0, "direct_answers": ["england", "united kingdom", "foreign country", "britain", "england", "united kingdom", "great britain", "uk", "england", "briton"], "difficult_direct_answer": false, "rationales": ["A double decker bus is driving in the street.", "Britain uses bus travel.", "Britain is known for double decker busses."], "image": "val2014/COCO_val2014_000000019025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238178, "question_id": "WRm7A8vvDkWenp9xKVRwbK", "question": "What is it called when Hulk Hogan assumes the position the woman is in?", "choices": ["leg drop", "headbutt", "flexing", "body slam"], "correct_choice_idx": 2, "direct_answers": ["wrestling", "fist", "flexing", "riding hulk", "flexing", "hunk", "flexing", "grapple", "winner", "hunk"], "difficult_direct_answer": false, "rationales": ["The woman is flexing her muscles.", "To show your muscle is to flex.", "She's showing off muscles"], "image": "train2014/COCO_train2014_000000238178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427823, "question_id": "WRobK4AQ8PJMzq3sV98yVT", "question": "What type of restaurant are the people with normal looking clothing dining at?", "choices": ["themed", "fine dining", "up scale", "casual"], "correct_choice_idx": 3, "direct_answers": ["casual restaurant", "hotel lobby", "upscale italian", "museum cafeteria", "casual", "hotel", "hotel", "casual", "tavern", "hotel"], "difficult_direct_answer": false, "rationales": ["This restaurant welcomes dressed-down people who don't want to make a fuss over dining out. t-shirts, shorts and everyday garb are welcome to come and eat.", "They are dressed down in tshirts since it's just supposed to be a relaxed atmosphere and not some sophisticated place.", "They are wearing casual clothes and not dressed up in anything fancy."], "image": "val2014/COCO_val2014_000000427823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485649, "question_id": "WRrtnJmM4cNcwkQ4hy8Va5", "question": "What's the name for the parked two-wheeled vehicles?", "choices": ["quads", "scooters", "segways", "hovercrafts"], "correct_choice_idx": 1, "direct_answers": ["scooters", "scooter", "scooter", "scooters", "vespa", "scooter", "scooter", "vespa", "scooters", "moped"], "difficult_direct_answer": false, "rationales": ["The vehicles are known as mobile scooters.", "The motorbikes are called scooters.", "These are similar to motorcycles but with less power"], "image": "train2014/COCO_train2014_000000485649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10014, "question_id": "WRvKsVDah5yYYqi3HMjWMb", "question": "What profession is the black man in the blue cap on the left?", "choices": ["fireman", "teacher", "police officer", "lawyer"], "correct_choice_idx": 2, "direct_answers": ["police officer", "police officer", "police officer", "policeman", "police", "police officer", "police", "police officer", "construction", "police officer"], "difficult_direct_answer": false, "rationales": ["The profession is the police.", "The man has a badge.", "He is dressed in an officer's uniform."], "image": "val2014/COCO_val2014_000000010014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511662, "question_id": "WSZ3cikHwwrDocXosogYd4", "question": "What type of ship is this?", "choices": ["container", "cargo", "fishing", "cruise"], "correct_choice_idx": 3, "direct_answers": ["cruise", "cruise", "cruise", "cruise ship", "vacation", "cruise", "cruise ship", "cruise ship", "cruise", "cruise"], "difficult_direct_answer": false, "rationales": ["It is large with many levels to house guests.", "With the design and disney log on the ship you can tell what type of ship it is.", "The ship is for cruises."], "image": "val2014/COCO_val2014_000000511662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115404, "question_id": "WT2D668E8gyhD9wPns3TkH", "question": "What country is this in?", "choices": ["united states", "china", "canada", "japan"], "correct_choice_idx": 2, "direct_answers": ["china", "china", "canada", "china", "hong kong", "china", "usa", "america", "china", "china"], "difficult_direct_answer": false, "rationales": ["There are chinese characters.", "The telephone number on the building has an area code of 416, which is assigned to the city of toronto in canada.", "The telephone number is from that country."], "image": "train2014/COCO_train2014_000000115404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 659, "question_id": "WTFqytZiYmXPs5R7U3wLN3", "question": "Why are the men wearing orange vests?", "choices": ["camouflage", "fashion", "dress code", "visibility"], "correct_choice_idx": 3, "direct_answers": ["workers", "train crew", "conductors", "increased visibility", "safety visibility", "safety", "construction workers", "safety", "poor", "visibility"], "difficult_direct_answer": true, "rationales": ["The orange pattern is very bright and sticks out against most backgrounds.", "So people know they are workers.", "The orange vests are for safety."], "image": "train2014/COCO_train2014_000000000659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523173, "question_id": "WTHFzshDdKt7mKN7VQmKNN", "question": "Why are the people crossing the gate holding umbrellas?", "choices": ["keeping dry", "to dance", "respecting tradition", "for fun"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "raining", "rain", "rain", "rain", "raining", "rain", "keeping dry", "raining"], "difficult_direct_answer": false, "rationales": ["It is raining", "The people want to be dry.", "The people don't want to get wet."], "image": "train2014/COCO_train2014_000000523173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277122, "question_id": "WTPDewo5xvLLkjDuk7U69Z", "question": "The plastic cups are placed on the top of what kind of object to the right of the fire hydrant?", "choices": ["newspaper box", "ticket kiosk", "pay telephone", "parking meter"], "correct_choice_idx": 0, "direct_answers": ["garbage can", "newspaper dispenser", "newspaper box", "plastic table", "newspaper kiosk", "news stand", "newspaper box", "newspaper box", "newspaper machine", "newspaper"], "difficult_direct_answer": false, "rationales": ["Usa today is a newspaper and the box is labeled with a sign which, despite being only partially visible, is definitely the usa today logo so this must be a newspaper box.", "The usa today logo on the side of the object indicates its purpose.", "It's a usa today logo and box."], "image": "train2014/COCO_train2014_000000277122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501210, "question_id": "WTY5bsKCobzMLivLbyfNHK", "question": "What is the person on the opposite end preparing to do?", "choices": ["serve", "receive", "strike back", "observe"], "correct_choice_idx": 1, "direct_answers": ["receive", "receive ball", "return ball", "strike", "return ball", "return ball", "return", "return", "receive ball", "return ball"], "difficult_direct_answer": false, "rationales": ["The person at the opposite end is preparing to receive the serve.", "They will have the ball come to them.", "The person is receiving."], "image": "train2014/COCO_train2014_000000501210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467246, "question_id": "WTZ22jc2wBJWNPj5NJoj4f", "question": "Where is the boy playing?", "choices": ["arena", "park", "gym", "stadium"], "correct_choice_idx": 1, "direct_answers": ["tennis courts", "tennis", "outside", "tennis", "park", "baseball field", "tennis court", "tennis court", "park", "baseball field"], "difficult_direct_answer": false, "rationales": ["The tennis court is a park feature. the fenced in tennis court is commonly found in city parks.", "There is a large grassy area near the court.", "The boy is in a park."], "image": "train2014/COCO_train2014_000000467246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415222, "question_id": "WTauuqeDBveaJWn7ViCJ7F", "question": "Why does this person eat so much fruit?", "choices": ["vegetarian", "lower cost", "weight management", "personal preference"], "correct_choice_idx": 2, "direct_answers": ["dieting", "lose weight", "health conscious", "weight loss", "diet", "weight management", "healthy", "health reasons", "for vitamins", "lose weight"], "difficult_direct_answer": true, "rationales": ["The magazine near the fruit is a health magazine.", "The person manages weight.", "The book is for dieting."], "image": "train2014/COCO_train2014_000000415222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76268, "question_id": "WTc9KPBwAh4TiDxwGDWFb2", "question": "What are the blue coverings on the side windows?", "choices": ["paper", "curtains", "shirts", "pants"], "correct_choice_idx": 1, "direct_answers": ["curtains", "shades", "curtains", "blinds", "curtains", "curtains", "curtains", "curtains", "shades", "mirrors"], "difficult_direct_answer": false, "rationales": ["This looks like a professional coach that has the purpose of transporting people. people traveling on such a vehicle are often given personal curtain to protect from the sun should they choose.", "These cloth items are installed permanently on bus.", "These are curtains used to keep the sun out of the bus."], "image": "train2014/COCO_train2014_000000076268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55973, "question_id": "WTfuFqmsyJa2n7f9Em8PEv", "question": "What civilization is this monument replicating?", "choices": ["greek", "chinese", "egyptian", "chinese"], "correct_choice_idx": 2, "direct_answers": ["egyptian", "ancient egypt", "ancient egypt", "egyptian", "egyptian", "egyptian", "ancient egypt", "ancient egypt", "egyptian", "egyptian"], "difficult_direct_answer": false, "rationales": ["I have an archeological background. even if i didn't, it would be easy to answer this question. in the 1980s, people used hairstyles and clothing replicas in fashion.", "These are typical of ancient artifacts from that country", "These are for mummies"], "image": "train2014/COCO_train2014_000000055973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496115, "question_id": "WTfwDLxqke4oGMGm4D9kgq", "question": "Persons here are likely attending what type reception?", "choices": ["wedding", "retirement", "business", "funeral"], "correct_choice_idx": 0, "direct_answers": ["wedding", "wedding reception", "wedding", "wedding", "wedding reception", "wedding", "wedding", "wedding", "wedding", "wedding"], "difficult_direct_answer": false, "rationales": ["Everyone is dressed up and some people are wearing tuxedos. those wearing tuxedos are part of the bridal party and the others are guests at the event.", "They look like they just came from a wedding.", "The people are dressed up and smiling."], "image": "train2014/COCO_train2014_000000496115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374114, "question_id": "WTrWJfPndUEXJtFtk9FpgT", "question": "Why is the man standing at the open door?", "choices": ["leaving train", "tired", "works there", "is lost"], "correct_choice_idx": 2, "direct_answers": ["checking tickets", "checking clearance", "passenger check", "to depart", "boarding train", "ticket collector", "checking tickets", "works there", "checking platform", "boarding passengers"], "difficult_direct_answer": true, "rationales": ["The man is the conductor.", "The man is wearing a uniform. uniformed people standing the doorway of a train are likely working as this is something workers on trains frequently do and one would wear this type of uniform if at work.", "The man is an employee."], "image": "train2014/COCO_train2014_000000374114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471785, "question_id": "WTvgpyu22FVE6t9PZA6uuj", "question": "What does her earring represent?", "choices": ["wealth", "royalty", "war", "peace"], "correct_choice_idx": 3, "direct_answers": ["peace sign", "peace", "peace", "peace", "eating", "wheel", "summer trip", "tree", "peace", "peace"], "difficult_direct_answer": false, "rationales": ["It is a peace sign.", "She wants peace.", "The earring is a peace sign."], "image": "val2014/COCO_val2014_000000471785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150342, "question_id": "WU7dvMJayX4mdwKNeuYA9E", "question": "What stone lines the flowerbeds?", "choices": ["slate", "granite", "quartz", "marble"], "correct_choice_idx": 3, "direct_answers": ["granite", "granite", "marble", "granite", "marble", "marble", "granite", "marble", "marble", "marble"], "difficult_direct_answer": false, "rationales": ["The stone is shiny and grey with irregularities on it.", "It is very shiny and polished and smooth", "That rock is very shiny."], "image": "val2014/COCO_val2014_000000150342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112040, "question_id": "WUA8VsBTXWsnAYXgJDnopK", "question": "What type of vehicle is this brand most famous for manufacturing?", "choices": ["bus", "motorcycle", "train", "car"], "correct_choice_idx": 1, "direct_answers": ["motorcycles", "motorcycle", "mercedes", "motorcycles", "motorcycles", "motorcycle", "mercedes", "motorcycles", "motorcycles", "motorcycles"], "difficult_direct_answer": false, "rationales": ["The vehicle is a motorbike.", "This brand is known for its motorcycles.", "Kawasaki is a brand that makes bikes."], "image": "train2014/COCO_train2014_000000112040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165953, "question_id": "WUFKAGf5TDCyGorGDjgmVy", "question": "What type of people sit on the train?", "choices": ["train passengers", "business people", "tourists", "freight"], "correct_choice_idx": 2, "direct_answers": ["tourists", "young", "passengers", "passengers", "tourists", "tourists", "tourists", "tourists", "passengers", "passengers"], "difficult_direct_answer": false, "rationales": ["This is a sightseeing train", "Tourists sit.", "Anyone riding a train is known as a passenger."], "image": "train2014/COCO_train2014_000000165953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94544, "question_id": "WUTrvJ3BWt7ughTvAw6wXj", "question": "What is the man in the black ready to do?", "choices": ["swing", "serve", "dunk", "sit"], "correct_choice_idx": 0, "direct_answers": ["swing", "hit ball", "swing", "hit ball", "bat", "hit ball", "baseball", "swing", "bat", "hit ball"], "difficult_direct_answer": false, "rationales": ["The man wants to swing.", "The man wants to swing the bat back and forth.", "By the setting and the bat in the persons hands it is easy to understand what he wants to do."], "image": "train2014/COCO_train2014_000000094544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248565, "question_id": "WUhucAtxwJAk2N3NPJFVNb", "question": "What safety feature does the bus use whenever they make a stop?", "choices": ["cruise control", "flashes headlights", "stop sign", "honks horn"], "correct_choice_idx": 2, "direct_answers": ["stop sign", "brakes", "stop sign", "stop sign", "stop sign", "stop sign", "stop sign", "stop sign", "stop", "brakes"], "difficult_direct_answer": false, "rationales": ["There is a red octagonal sign on the side of the bus that is used at each stop.", "A bus is stopped on the road with an octagon shaped red and white sign sticking out from the side near the driver.", "The bus uses a stop sign whenever stopping."], "image": "train2014/COCO_train2014_000000248565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212429, "question_id": "WUmeLaHUPwrCwAmqypTJTz", "question": "This traffic light sculpture is located in which European country?", "choices": ["france", "germany", "united kingdom", "spain"], "correct_choice_idx": 2, "direct_answers": ["england", "uk", "england", "england", "united kingdom", "england", "uk", "germany", "united kingdom", "united kingdom"], "difficult_direct_answer": false, "rationales": ["The other signs are in english", "This is a popular sight in the uk.", "The light is in the uk."], "image": "train2014/COCO_train2014_000000212429.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541596, "question_id": "WVCd3YMAWX3j4MNovDz82Q", "question": "What are the people near the bus doing?", "choices": ["sitting", "standing", "running", "eating"], "correct_choice_idx": 1, "direct_answers": ["accident drill", "standing", "training", "working", "standing", "listening", "standing", "fixing", "waiting", "construction"], "difficult_direct_answer": false, "rationales": ["They are upright with their legs on the ground.", "They are upright and not moving", "The people are standing near the bus."], "image": "train2014/COCO_train2014_000000541596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348606, "question_id": "WVDUunao9haBsZtPkXCEep", "question": "Where does this person seem to prefer reading?", "choices": ["no where", "shower", "living room", "bed"], "correct_choice_idx": 3, "direct_answers": ["bedroom", "bed", "bed", "bedroom", "writing books", "bed", "beach", "bedroom", "bedroom", "nothing"], "difficult_direct_answer": false, "rationales": ["The person is on a mattress.", "The person is in bed.", "A person has a book open on a blanket and is laying on the blanket."], "image": "train2014/COCO_train2014_000000348606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568979, "question_id": "WVVzYz3MEuXFCHgeBNxrBT", "question": "What is the woman in white shirt likely to be doing?", "choices": ["covering face", "feeding", "taking photo", "waving"], "correct_choice_idx": 2, "direct_answers": ["taking picture", "pictures", "watching", "taking photos", "taking photo", "taking photos", "taking picture", "pictures", "taking picture", "taking picture"], "difficult_direct_answer": false, "rationales": ["Though out of the frame, the woman appears to be holding her arm up in a manner consistent with photography. people tend to take photos when they're in a place that has elephants.", "A woman is looking at an elephant from a distance back.", "The woman is taking a picture of the elephant."], "image": "val2014/COCO_val2014_000000568979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390292, "question_id": "WVzZ2VW9bmHzCp9Ac5diE3", "question": "What service is this lady receiving?", "choices": ["nail treatment", "haircut", "massage", "facial treatment"], "correct_choice_idx": 2, "direct_answers": ["texting messages", "cell", "massage", "wifi", "massage", "pedicure", "pedicure", "massage", "massage", "mobile"], "difficult_direct_answer": false, "rationales": ["The sign above the chair indicates its purpose.", "The woman is in a massage chair.", "The chair likely gives her this benefit. the sign above plainly states this fact."], "image": "val2014/COCO_val2014_000000390292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322625, "question_id": "WW2JdTeY5T965dywbotzmR", "question": "What was the original color of most carrots?", "choices": ["purple", "green", "black", "red"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "orange", "orange", "purple", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["Carrots are on a board next to beans.", "Most carrots were actually purple like beets.", "A lot of carrots were purple."], "image": "train2014/COCO_train2014_000000322625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557628, "question_id": "WWQx2svwbknEHmEyALH7i6", "question": "What type of party might be held here?", "choices": ["christmas", "high school", "masquerade", "saint patricks"], "correct_choice_idx": 2, "direct_answers": ["masquerade", "masquerade", "masquerade", "masquerade", "new years", "halloween", "halloween", "halloween", "halloween", "halloween"], "difficult_direct_answer": false, "rationales": ["The people are all wearing costumes and masks.", "The masquerade party is happening.", "There is a man wearing a mask."], "image": "train2014/COCO_train2014_000000557628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206420, "question_id": "WWcXGQz9YZJai3mi5cks9R", "question": "Why is he throwing the item in the air?", "choices": ["stretching it", "throwing out", "showing off", "collecting dust"], "correct_choice_idx": 0, "direct_answers": ["to flatten", "stretching it", "preparing dough", "stretching dough", "stretch out", "widen dough", "making pizza", "making pizza", "flatten out", "making crust"], "difficult_direct_answer": true, "rationales": ["The man wants to stretch the pizza dough.", "The man wants to stretch it.", "The man is throwing the pizza in the air and using his fists to stretch it when he catches it. this is how cooks make pizza dough thin."], "image": "train2014/COCO_train2014_000000206420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121503, "question_id": "WWnWG9o2a8qXGvDm6vffGt", "question": "In which country is this bus taking on passengers?", "choices": ["japan", "england", "united states", "fiji"], "correct_choice_idx": 1, "direct_answers": ["united kingdom", "uk", "england", "united kingdom", "united kingdom", "london", "united kingdom", "mexico", "mexico", "england"], "difficult_direct_answer": false, "rationales": ["A green and tan public bus is in the street.", "The website shown on the bus ends in uk which stands for united kingdom.", "It has a co.uk website address on the bus and uncommon to see that anywhere other than the uk."], "image": "val2014/COCO_val2014_000000121503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520478, "question_id": "WWsGk5Yg3NQf3fRHo2YR2D", "question": "Racket is made up of what?", "choices": ["graphite", "nylon", "wood", "stick"], "correct_choice_idx": 0, "direct_answers": ["graphite", "strings", "carbon fiber", "metal", "wire", "carbon fiber", "fiberglass", "fiberglass", "string", "plastic"], "difficult_direct_answer": false, "rationales": ["The racquet has graphite.", "The racquet is constructed of graphite.", "Tennis racquets are typically made of graphite."], "image": "val2014/COCO_val2014_000000520478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191585, "question_id": "WWzdSUEhxhWEdpxpx7mJnM", "question": "Why do the women have red bags with them?", "choices": ["they're saleswomen", "they're shopping", "they're traveling", "they're teachers"], "correct_choice_idx": 2, "direct_answers": ["luggage tourists", "traveling", "traveling", "traveling", "they're traveling", "traveling", "carry belongings", "travelling", "travelling", "traveling"], "difficult_direct_answer": false, "rationales": ["This is the most likely reason. the other options don't really fit.", "The luggage is used to hold clothing.", "Suitcases can hold clothing and other possessions for traveling."], "image": "val2014/COCO_val2014_000000191585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214109, "question_id": "WWzfxVrkeryWYCpbFdGxfx", "question": "For what reason were the benches likely sealed off with caution tape?", "choices": ["rain", "private", "broken", "wet paint"], "correct_choice_idx": 3, "direct_answers": ["wet paint", "wet paint", "paint", "wet paint", "fresh paint", "wet paint", "crime", "fresh paint", "wet paint", "just painted"], "difficult_direct_answer": false, "rationales": ["The park benches look too new to be used and look like they were just painted.", "The only thing that could cause these to be dangerous is paint, and they look to have a fresh coat.", "The benches are outside, in good shape, and accessible by the public. the coating on the benches is brand new."], "image": "val2014/COCO_val2014_000000214109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58911, "question_id": "WXKDJAsEimPSGwdJmQ5YLY", "question": "What type of location is this?", "choices": ["commercial", "residential", "educational", "medicinal"], "correct_choice_idx": 1, "direct_answers": ["studio", "living room", "residential", "living room", "living room", "apartment", "living room", "living room", "living room", "studio"], "difficult_direct_answer": false, "rationales": ["There is a sofa and other personal belongings in the room.", "This looks like a place that someone lives in and the residence of a person.", "The location is residential."], "image": "train2014/COCO_train2014_000000058911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268992, "question_id": "WXdScm8JVCHn3gbaGmx9Xz", "question": "The printer to the right of the laptop utilizes what type of printing technology?", "choices": ["laser", "dye sublimation", "thermal", "inkjet"], "correct_choice_idx": 0, "direct_answers": ["ink", "ink", "printing", "laser", "3d", "digital printing", "digital", "wireless", "modern", "inkjet"], "difficult_direct_answer": true, "rationales": ["They are more popular now.", "There is a laser printer.", "The printer is a large laser printer."], "image": "val2014/COCO_val2014_000000268992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147879, "question_id": "WY7BrnnxzjcduNT9h8ws3h", "question": "What is on the ceiling?", "choices": ["balloons", "lights", "bats", "squid"], "correct_choice_idx": 1, "direct_answers": ["umbrellas", "lights", "lights", "lights", "lights", "umbrellas", "lights", "spotlights", "lights", "umbrella"], "difficult_direct_answer": false, "rationales": ["That's what all the dots are.", "These are illuminated bulbs", "There are many lit bulbs hanging from the ceiling."], "image": "val2014/COCO_val2014_000000147879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561903, "question_id": "WY8BZQXoePCCU9YTpAvWpz", "question": "What does this dog like on the TV?", "choices": ["another dog", "toys", "cats", "food"], "correct_choice_idx": 0, "direct_answers": ["other animal", "watch tv", "another dog", "other dogs", "other dog", "other dog", "another dog", "dog", "sound picture", "another dog"], "difficult_direct_answer": false, "rationales": ["The dog is looking at the tv screen.", "Dogs are interested in other dogs, and the dog is looking at the television while there is another dog on the screen.", "There is a canine in the show"], "image": "val2014/COCO_val2014_000000561903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20908, "question_id": "WYt8hwyN4f59kZcaeeYCXL", "question": "Which food is the healthiest?", "choices": ["orange juice", "cake", "grapes", "donuts"], "correct_choice_idx": 2, "direct_answers": ["grapes", "fruits", "grapes", "grapes", "grapes", "fruit", "grapes", "grapes", "grapes", "grapes"], "difficult_direct_answer": false, "rationales": ["The food is grapes.", "Grapes are a fruit.", "Grapes are a fruit."], "image": "val2014/COCO_val2014_000000020908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380500, "question_id": "WZ9ijQzPjLxHHDWfMCEV5d", "question": "What does the giraffe stick it's neck out for?", "choices": ["food", "scratching", "stretching", "humiliation"], "correct_choice_idx": 0, "direct_answers": ["food", "food", "food", "eat", "to eat", "long neck", "food", "food", "treats", "food"], "difficult_direct_answer": false, "rationales": ["He is sticking his tongue out to receive a snack.", "The giraffe wants food.", "With the giraffe having its tongue out and the setting of where it is, you can tell what it wants."], "image": "train2014/COCO_train2014_000000380500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560152, "question_id": "WZEeTyP8v4LAEr7BhjiV4o", "question": "What may be the best type of licorice?", "choices": ["australian", "american", "indian", "british"], "correct_choice_idx": 0, "direct_answers": ["australian", "twizzlers", "panda licorice", "australian", "black", "candies", "black", "black", "black", "panda licorice"], "difficult_direct_answer": false, "rationales": ["The licorice bag on the ground is made in australia.", "This is from australia.", "The brand is from the land down under."], "image": "train2014/COCO_train2014_000000560152.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100010, "question_id": "WZGjnMBc9u3ehfSuu5eZeN", "question": "What does the white light come from?", "choices": ["star", "lamp", "moon", "sun"], "correct_choice_idx": 1, "direct_answers": ["sun", "light pole", "lamp", "sun", "sun", "snow", "airplane", "moon", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["There is a light in the sky that is illuminating the area.", "There is a street lamp above.", "It's coming from a light outside."], "image": "val2014/COCO_val2014_000000100010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177404, "question_id": "WZQGnWBJU6D64kqB4cnHgJ", "question": "What company made the black laptop on the black notebook?", "choices": ["hp", "dell", "apple", "sony"], "correct_choice_idx": 2, "direct_answers": ["mac", "apple", "apple", "mac", "apple", "apple", "apple", "apple", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["There is a picture of a fruit on it", "The multicolored fruit logo is underneath the screen display.", "The logo on the laptop is that of the apple company."], "image": "train2014/COCO_train2014_000000177404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260382, "question_id": "WZb6aCGGhsQoAoQev9ist9", "question": "What type of transport is visible here?", "choices": ["car", "plane", "hotair balloon", "bike"], "correct_choice_idx": 2, "direct_answers": ["hotair balloon", "balloon", "balloon", "air balloon", "van", "balloon", "hot air-balloon", "air balloon", "balloon", "air balloon"], "difficult_direct_answer": false, "rationales": ["The balloon is on the ground.", "There is a large balloon in the background. a balloon this large compared to the people and vehicle would be a hot air balloon which is a type of transport.", "There is a big inflatable in the back"], "image": "val2014/COCO_val2014_000000260382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197705, "question_id": "WZjJTEJ6JkK4HYmBCPJAcj", "question": "Who is famous for doing what these people are doing?", "choices": ["john cleese", "charles lindbergh", "lance armstrong", "james cameron"], "correct_choice_idx": 2, "direct_answers": ["lance armstrong", "lance armstrong", "lance armstrong", "lance armstrong", "lance armstrong", "lance armstrong", "eddie merckx", "riding", "lance armstrong", "marathon"], "difficult_direct_answer": false, "rationales": ["The people are cycling and the first option is a famous professional road racing cyclist who was stripped of his many medals after being caught using performance enhancing drugs.", "Armstrong bikes.", "He was a world famous biker who is now known for using drugs to help boost his biking skills."], "image": "train2014/COCO_train2014_000000197705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400045, "question_id": "WZpLowEQRjkCdx9z2SFb2m", "question": "What brand of wipes are on the table?", "choices": ["mac", "pampers", "huggies", "hp"], "correct_choice_idx": 2, "direct_answers": ["huggies", "huggies", "mel", "mel", "huggies", "huggies", "huggies", "huggies", "mel", "huggies"], "difficult_direct_answer": false, "rationales": ["The brand is huggies.", "It has the brand's name on the front of the wipes and a baby on it.", "The wipies on the desk have huggies written on them."], "image": "train2014/COCO_train2014_000000400045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549849, "question_id": "WZpg7TexJSNJDkJQg2T4rF", "question": "What activity are these people engaged in?", "choices": ["sport", "travel", "debate", "celebration"], "correct_choice_idx": 1, "direct_answers": ["travel", "travel", "travel", "travel", "travel", "traveling", "waiting", "travel", "luggage collection", "travel"], "difficult_direct_answer": false, "rationales": ["The people have their suitcases with them.", "They have a large amount of luggage with them signifying they are taking a trip together.", "They're traveling."], "image": "train2014/COCO_train2014_000000549849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105865, "question_id": "WaHjy2fw65CskPGK3Qo3Ph", "question": "What device might you find near this appliance?", "choices": ["phone", "kindle", "microwave", "tv"], "correct_choice_idx": 2, "direct_answers": ["toaster", "stove", "microwave", "oven", "oven", "microwave", "oven", "stove", "microwave", "electrical outlet"], "difficult_direct_answer": false, "rationales": ["The object in question is a refrigerator which is commonly located in a kitchen. answer a is another kitchen appliance which might be near.", "A microwave is in a kitchen.", "This is a fridge. logically there would be another kitchen appliance here."], "image": "train2014/COCO_train2014_000000105865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384718, "question_id": "WaLDa4DKL5HcbcMYG53XhU", "question": "What animals are present?", "choices": ["dog", "elephant", "giraffe", "deer"], "correct_choice_idx": 1, "direct_answers": ["elephants", "elephants", "elephants", "elephants", "elephants", "elephants", "elephants", "elephants", "elephant", "elephants"], "difficult_direct_answer": false, "rationales": ["Large animals with trunks are in and near the water.", "Two elephants are in the water and one is on the edge of the water.", "By the trunks and the colors of these animals, you can tell what they are."], "image": "val2014/COCO_val2014_000000384718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58651, "question_id": "WaYcxsGJ3hqKcCXSfhMSiJ", "question": "What kind of transportation is this?", "choices": ["road", "water", "air", "rail"], "correct_choice_idx": 3, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "train", "rail", "train"], "difficult_direct_answer": false, "rationales": ["It's also referred to as train travel.", "Traditionally train tracks are used by trains that use rails.", "The vehicle travels on tracks and is not capable of flying, floating, or driving on regular roads."], "image": "val2014/COCO_val2014_000000058651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510577, "question_id": "Wajh9zHggbYtzbZaqVdvo7", "question": "What would be the most appropriate word for the theme of this room?", "choices": ["tropical", "forrest", "desert", "arctic"], "correct_choice_idx": 2, "direct_answers": ["modern", "traditional", "resort", "cozy", "living room", "modern", "desert", "plain", "sitting room", "classy"], "difficult_direct_answer": true, "rationales": ["The desert is appropriate.", "There are sandy colors being used.", "The most appropriate word would be a desert since it has soft beige hues."], "image": "train2014/COCO_train2014_000000510577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572949, "question_id": "WatskgegYqTPsarMkJvPTs", "question": "What kind of gathering is this?", "choices": ["family", "religious", "social", "business"], "correct_choice_idx": 3, "direct_answers": ["business", "work meeting", "study group", "social gathering", "work", "party", "friend gathering", "casual", "study group", "bottle"], "difficult_direct_answer": true, "rationales": ["The people are working on laptops. they are not praying.", "The people are all older men drinking coffee. their computers aren't suitable for gaming, and they're too old to be students, meaning they are having a more serious meeting.", "Business is seen here as a matter of concern because of the laptops."], "image": "train2014/COCO_train2014_000000572949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157172, "question_id": "WbFZFUzm46MQpReaxVqsu3", "question": "Where are the candles placed in the room?", "choices": ["mantle", "table", "dresser", "floor"], "correct_choice_idx": 0, "direct_answers": ["fireplace", "mantle", "mantle", "mantle", "fireplace mantel", "on fireplace", "mantle", "fireplace mantel", "mantle", "chair"], "difficult_direct_answer": false, "rationales": ["The candles are placed on a ledge directly above a fireplace.", "The candles are on the mantle.", "The candles are on the mantle."], "image": "train2014/COCO_train2014_000000157172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479611, "question_id": "WbKNMxDTypvo8g7vvrRFbq", "question": "This animal is closely related to what other animal?", "choices": ["donkey", "bear", "bat", "ant"], "correct_choice_idx": 0, "direct_answers": ["donkey", "horse", "donkey", "donkey", "donkey", "donkey", "zebra", "donkey", "zebra", "horse"], "difficult_direct_answer": false, "rationales": ["This is a quadruped mammal with hooves, which are characteristics that also pertain to a donkey.", "The animal is a horse with equine features.", "The animal is in the horse family."], "image": "train2014/COCO_train2014_000000479611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460183, "question_id": "WbRDWrcp69TVnyQvW98HTg", "question": "What type of cake is this?", "choices": ["garash cake", "circle cake", "cupcake", "bondt cake"], "correct_choice_idx": 3, "direct_answers": ["bundt", "bondt cake", "bunt", "bundt", "bundt", "bundt", "bunt", "bundt", "bundt", "bundt"], "difficult_direct_answer": false, "rationales": ["This is a bundt cake.", "The cake looks like it is upside down.", "It has a classic shape of that type of pan"], "image": "val2014/COCO_val2014_000000460183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211632, "question_id": "WbWzSVScVsYKvBMCgeKjiP", "question": "What method was used to create the picture on the wall?", "choices": ["knitting", "printing", "painting", "embroidery"], "correct_choice_idx": 3, "direct_answers": ["embroidery", "embroidery", "stitching", "embroidery", "crochet", "crochet", "crochet", "cross stitch", "knitting", "cross stitch"], "difficult_direct_answer": false, "rationales": ["The picture on the wall is a piece of embroidery.", "The welcome sign is cross-stitched.", "The method is embroidery."], "image": "train2014/COCO_train2014_000000211632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426421, "question_id": "WbjbxcLeu44Y84m59GYHK9", "question": "What color does the owner of the bed wear?", "choices": ["white", "brown", "none", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The woman has red ones.", "The person lying down on the bed has a red dress on.", "The girl has a red jacket."], "image": "train2014/COCO_train2014_000000426421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136971, "question_id": "WcCjK7uuni8fyboqwJDEar", "question": "Where does the fringe come from?", "choices": ["scarf", "bag", "blouse", "hat"], "correct_choice_idx": 0, "direct_answers": ["yarn", "scarf", "scarf", "scarf", "sweater", "scarf", "scarf", "scarf", "sky", "snow"], "difficult_direct_answer": false, "rationales": ["The item appears to be a scarf underneath the jacket. people wear scarves in cold weather for added warmth.", "It appears to be a blue gray with red fringe.", "The person is wearing a scarf."], "image": "train2014/COCO_train2014_000000136971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174749, "question_id": "WcF3jsyNYPBbiV38cRZuFj", "question": "What type of wrapper is he putting food in?", "choices": ["tortilla", "bun", "lettuce wrap", "sliced bread"], "correct_choice_idx": 0, "direct_answers": ["tortilla", "taco", "tortilla", "saran wrap", "tortillas", "plastic bag", "plastic bag", "tortilla", "dania", "bag"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to what he's holding in his hand.", "The food in his hand is a tortilla. tortillas are usually used as wrappers for food items.", "It is round and made from flour"], "image": "train2014/COCO_train2014_000000174749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515391, "question_id": "WcFGxERUA4rJr3Nw3CkgKf", "question": "What is the area behind this man used for?", "choices": ["prison", "picnics", "sales", "bathroom"], "correct_choice_idx": 1, "direct_answers": ["picnic", "sitting", "eating", "eating", "sitting", "park", "basking", "receiving guests", "eating", "picnics"], "difficult_direct_answer": false, "rationales": ["There are tables in a field that are wooden.", "The area behind the man appears to be a park of some sort, which is a favorite location for families to have cookouts and family reunions.", "The area is filled with picnic tables."], "image": "val2014/COCO_val2014_000000515391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505861, "question_id": "WcXZewPun2SQcLHiw7PvuW", "question": "Who is the woman to the child in front of her?", "choices": ["mother", "neighbor", "sister", "niece"], "correct_choice_idx": 0, "direct_answers": ["mother", "mother", "mom", "mother", "child", "mother", "mother", "tourist", "mother", "mother"], "difficult_direct_answer": false, "rationales": ["The age and looks of the two makes it look like a mother child relationship.", "The woman is the mom.", "The woman is the mother of the small child on the elephant."], "image": "train2014/COCO_train2014_000000505861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43270, "question_id": "WcYn8zDaRzR8r4xc9bTMRB", "question": "Why are there so many televisions?", "choices": ["electronics graveyard", "scientific experiment", "factory", "sales display"], "correct_choice_idx": 3, "direct_answers": ["store display", "display", "sales display", "for sale", "convention expo", "display", "for sale", "sale", "display", "for sale"], "difficult_direct_answer": false, "rationales": ["They are for sales.", "There are sales displays.", "All of the televisions are displaying the same thing. sales displays are the only place where it would be common for this many tvs of this size to be next to each other and all displaying the same stock image."], "image": "train2014/COCO_train2014_000000043270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355342, "question_id": "Wd5Ms8qdEH4X47s3pCyZCd", "question": "What traditionally formed the spokes of these types of umbrella?", "choices": ["clay", "glass", "pearls", "wood"], "correct_choice_idx": 3, "direct_answers": ["shafts", "bamboo", "bamboo", "wires", "metal", "metal", "bamboo", "wood", "wood", "bamboo"], "difficult_direct_answer": false, "rationales": ["Asian woman are performing with umbrellas.", "Although now they are usually made with metal, traditionally this was the sturdiest available material.", "This material supported this japanese device that was used for sun protection and dancing."], "image": "val2014/COCO_val2014_000000355342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193261, "question_id": "Wd9VLSg4UAgVZZknPsGPje", "question": "What weather might be imminent here?", "choices": ["tornado", "sun", "rain", "earthquake"], "correct_choice_idx": 2, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["It is cloudy and the sky is dark.", "The day is very cloudy.", "There are clouds in the sky. they are not funnel clouds."], "image": "val2014/COCO_val2014_000000193261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509536, "question_id": "WdC8hP4pq88atKPPkc3AyH", "question": "What type of establishment in the background is it?", "choices": ["hotel", "furniture room", "restaurant", "bank"], "correct_choice_idx": 1, "direct_answers": ["furniture room", "bank", "building", "city street", "bank", "store", "bank", "building", "commercial", "office"], "difficult_direct_answer": false, "rationales": ["A mirror is on display outside of a business.", "The people are looking in a mirror on the street which shows that furniture is likely being sold.", "By the words on the buildings door you can tell what type of business it is."], "image": "val2014/COCO_val2014_000000509536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457249, "question_id": "WdCK99ZN5L8cch3rSPaWn9", "question": "What is the Blue bag used for?", "choices": ["decoration", "trash", "towel distribution", "ballot collection"], "correct_choice_idx": 1, "direct_answers": ["trash", "garbage", "trash", "trash", "trash", "garbage", "garbage", "trash", "trash", "garbage"], "difficult_direct_answer": false, "rationales": ["The bag is set up and slowly being filled. it is made out of disposable plastic.", "This kind of can is used to put anything in it you don't want it.", "The tag is for trash."], "image": "val2014/COCO_val2014_000000457249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91301, "question_id": "WdMqfzHhySrpr8ekhuoivM", "question": "What is the green item on the tree near the bike?", "choices": ["moss", "pea pods", "banana", "apple"], "correct_choice_idx": 0, "direct_answers": ["moss", "moss", "moss", "moss", "leaves", "moss", "lichen", "moss", "tree branch", "grass"], "difficult_direct_answer": false, "rationales": ["The green item is a small and dense clump in a shady spot.", "The green item is moss growing.", "The thick like green grass that grow on the sides of trees is moss."], "image": "train2014/COCO_train2014_000000091301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540326, "question_id": "WdSqnLK8HwCVHPSdj9cTQ8", "question": "What is the person engaged in?", "choices": ["gaming", "debate", "discussion", "sports"], "correct_choice_idx": 0, "direct_answers": ["playing", "video game", "gaming", "video games", "playing game", "wii boxing", "wii", "game", "video game", "gaming"], "difficult_direct_answer": false, "rationales": ["You can tell by the controllers he is holding and what he was doing as to what the answer is.", "The young boy is standing holding a game controller in front of a tv as a game is being played.", "The person is gaming."], "image": "train2014/COCO_train2014_000000540326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257013, "question_id": "WddnxDYtiNKQPvZCzRQL9X", "question": "In which country is this bus currently driving?", "choices": ["united states", "france", "great britain", "guatamala"], "correct_choice_idx": 2, "direct_answers": ["england", "germany", "england", "uk", "england", "great britain", "uk", "england", "england", "united kingdom"], "difficult_direct_answer": false, "rationales": ["It looks by the plates its from great britain.", "The double-decker bus is driving on the left side of the road. people in france, the united states, and guatemala drive on the right side.", "Double decker busses are common in britain."], "image": "train2014/COCO_train2014_000000257013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440614, "question_id": "WdftAkYLNN3zFFNmwHxS8b", "question": "What negative thing is wrong with the soup?", "choices": ["runny", "salty", "too cold", "too hot"], "correct_choice_idx": 3, "direct_answers": ["hot", "too hot", "green pepper", "hot", "too hot", "juice", "hot", "too hot", "cold", "nothing"], "difficult_direct_answer": false, "rationales": ["The soup is way too hot.", "The person is blowing on the food because it is too warm.", "The woman blows on the soup to cool it."], "image": "train2014/COCO_train2014_000000440614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44616, "question_id": "We49GQzvnARUaQeJ4p3Lk7", "question": "What kind of animal was cooked in order to add the meat on the pizza?", "choices": ["cow", "horse", "pig", "deer"], "correct_choice_idx": 2, "direct_answers": ["pork", "pork", "pork", "pig", "pig", "pig", "pig", "pig", "pig", "pig"], "difficult_direct_answer": false, "rationales": ["There is bacon on the pizza.", "The meat looks like some variant of ham or bacon.", "This is bacon which is pork"], "image": "train2014/COCO_train2014_000000044616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53720, "question_id": "WeXqmUWFzMg2ZsHbXMUZEb", "question": "What role is the man on the red helmet?", "choices": ["pitcher", "umpire", "catcher", "hitter"], "correct_choice_idx": 2, "direct_answers": ["catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "umpire", "catcher"], "difficult_direct_answer": false, "rationales": ["The man with the red helmet is a catcher.", "He catches the ball the pitcher has thrown", "The other options don't apply as b is on the mound, the guy with the bat is c and the d is behind a."], "image": "train2014/COCO_train2014_000000053720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179968, "question_id": "WeeF7UJusLGht5h3SCxunc", "question": "What does this business do?", "choices": ["dentist", "kids' rides", "kids' haircuts", "eye doctor"], "correct_choice_idx": 2, "direct_answers": ["cut hair", "cut hair", "cuts hair", "haircuts", "haircuts", "kids' haircuts", "cut hair", "cuts hair", "children's hair", "cut hair"], "difficult_direct_answer": false, "rationales": ["The business can handle kids' haircuts.", "This is obvious based on the fact that she's cutting his hair and there are hairstyle images in the background and another person cutting hair.", "The people are cutting kids' hair."], "image": "train2014/COCO_train2014_000000179968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477749, "question_id": "WekqdjhYEws4ufALCDPWWu", "question": "Who is the man in the train car?", "choices": ["witness", "operator", "engineer", "host"], "correct_choice_idx": 2, "direct_answers": ["driver", "conductor", "engineer", "conductor", "engineer", "conductor", "conductor", "conductor", "engineer", "engineer"], "difficult_direct_answer": false, "rationales": ["The man is an engineer.", "The man in the train is in charge of operating the engine.", "The man operates the train car."], "image": "train2014/COCO_train2014_000000477749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259342, "question_id": "WennCPiDYuuS8rTiJBcsXv", "question": "What type food is advertised here?", "choices": ["hot dogs", "pizza", "chinese", "cannibal burgers"], "correct_choice_idx": 0, "direct_answers": ["hot dogs", "hot dogs", "hot dogs", "big weenies", "scatting", "hot dogs", "sausages", "hot dogs", "hot dog", "hot dogs"], "difficult_direct_answer": false, "rationales": ["The place is called big weenie. weenie or wiener is another word for hot dogs.", "Hot dogs are advertised.", "Hot dogs are weenies."], "image": "val2014/COCO_val2014_000000259342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416656, "question_id": "WevXt7G5tRUxgL3qbofsgt", "question": "Which country is famous for paintings?", "choices": ["france", "rome italy", "poland", "canada"], "correct_choice_idx": 1, "direct_answers": ["rome italy", "france", "france", "italy", "many", "italy", "japan", "spain", "italy", "paris"], "difficult_direct_answer": false, "rationales": ["Rome is known for famous paintings.", "The paintings in the living room are colorful and abstract. the country which is most famous for this genre type is rome, italy.", "Many countries have famous painters"], "image": "train2014/COCO_train2014_000000416656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268656, "question_id": "Wf9VBPh2gvodhj4PvYQxkx", "question": "Where is this sport most likely taking place?", "choices": ["schoolhouse", "superhighway", "restaurant", "skateboard park"], "correct_choice_idx": 3, "direct_answers": ["skatepark", "skatepark", "skate park", "skateboarding", "skate park", "skating", "skatepark", "empty pool", "skate park", "skateboard park"], "difficult_direct_answer": false, "rationales": ["The person is flipping into the air at a skate park.", "There is a curved surface they are dropping into.", "The sport is at a skateboard park."], "image": "train2014/COCO_train2014_000000268656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13372, "question_id": "WfSCYAkZCjoAFdPLCbNb4e", "question": "What would this location be called?", "choices": ["garage", "dock", "bunker", "hangar"], "correct_choice_idx": 1, "direct_answers": ["marina", "marina", "marina", "marina", "harbor", "ocean", "port", "port", "marina", "dock"], "difficult_direct_answer": false, "rationales": ["The boats are docked at the port.", "The location is a dock.", "This location is a dock with lots of boats tied up."], "image": "train2014/COCO_train2014_000000013372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149679, "question_id": "WfXeTdjrNnr2Xvbt8uNhuN", "question": "How is doing what the photo taker is doing with their leg considered?", "choices": ["spiritual", "slightly rude", "dangerous", "perfectly normal"], "correct_choice_idx": 1, "direct_answers": ["rude", "flirtatious", "rude", "relaxing", "rude", "bad etiquette", "rude", "slightly rude", "bad taste", "stretching"], "difficult_direct_answer": false, "rationales": ["A person has their foot and leg resting on a table.", "The person has their leg on a table which is rude.", "The person's leg is on the table."], "image": "train2014/COCO_train2014_000000149679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555615, "question_id": "WfYpcr5g3QPTr2VVD9UWqK", "question": "What are they doing with her hair?", "choices": ["admiring it", "selling it", "cleaning it", "stealing it"], "correct_choice_idx": 0, "direct_answers": ["brushing", "brushing", "brushing it", "brushing", "admiring it", "brushing", "combing it", "brushing it", "brushing combing", "brushing"], "difficult_direct_answer": false, "rationales": ["The people like her hair.", "They are holding it, looking at it and brushing it for her.", "The kids brushing and liking her hair."], "image": "train2014/COCO_train2014_000000555615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469936, "question_id": "WfdtaxbMvbPpMjJT6Pconx", "question": "In what year did this magazine relaunch?", "choices": ["2016", "2005", "2013", "2012"], "correct_choice_idx": 2, "direct_answers": ["early 1900", "2020", "2013", "2013", "2013", "early 1900", "2013", "2013", "2013", "2013"], "difficult_direct_answer": false, "rationales": ["The magazine domino relaunched in 2013.", "The year of the relaunch could not be determined by the photo, but an internet search provided the answer.", "A home style type magazine cover is shown."], "image": "val2014/COCO_val2014_000000469936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456053, "question_id": "WfmTbj5G8Y2U8SWEgyrPzP", "question": "On which side might people enter the train?", "choices": ["left", "top", "bottom", "facing right"], "correct_choice_idx": 3, "direct_answers": ["left", "left", "platform side", "right", "left", "photo right", "right", "photo-right side", "facing right", "left"], "difficult_direct_answer": false, "rationales": ["They can only board from the platform", "The people boarding will want to be facing right to get into the train.", "The vehicle seems to be facing right as evident."], "image": "val2014/COCO_val2014_000000456053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234910, "question_id": "Wg5BfCLEGQ8eWgVBToPq96", "question": "What type of business is shown here?", "choices": ["car rental", "mini mart", "warehouse", "grocery store"], "correct_choice_idx": 3, "direct_answers": ["grocery", "grocery store", "grocery", "supermarket", "grocery store", "grocery store", "grocery store", "building", "grocery store", "grocery store"], "difficult_direct_answer": false, "rationales": ["The business is for groceries.", "Safeway sells groceries.", "Safeway is a place to get groceries from."], "image": "val2014/COCO_val2014_000000234910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90020, "question_id": "WgE6WsDkRyYcHbaxAMXrp3", "question": "Driving straight ahead might cause what?", "choices": ["accident", "flooded car", "oil leak", "flat tire"], "correct_choice_idx": 0, "direct_answers": ["accident", "accident", "accident", "accident", "accident", "car crash", "accident", "collision", "accident", "collision"], "difficult_direct_answer": false, "rationales": ["There is a one way street.", "An accident might happen.", "Driving straight would lead to oncoming traffic and cause a collision."], "image": "val2014/COCO_val2014_000000090020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379820, "question_id": "WgK87HPCnZoC4MxyKfWFej", "question": "What comic brand owns the franchise depicted here?", "choices": ["pixar", "dc", "disney", "marvel"], "correct_choice_idx": 3, "direct_answers": ["n/a", "marvel", "marvel studios", "marvel", "marvel", "marvel", "marvel", "marvel", "marvel", "spiderman"], "difficult_direct_answer": false, "rationales": ["Spiderman is on the plates.", "The plates have spider-man, which is owned by marvel.", "The plates are spider-man themed."], "image": "train2014/COCO_train2014_000000379820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515106, "question_id": "WgfeCnZr4sGsEAwDeFDYTL", "question": "What does the item hanging on the wall and closest to the broom look like?", "choices": ["cross", "lips", "straw", "pitchfork"], "correct_choice_idx": 3, "direct_answers": ["pitchfork", "horseshoe", "pan", "fireplace poker", "pitchfork", "fork", "fork", "good", "pitchfork", "fireplace poker"], "difficult_direct_answer": false, "rationales": ["It has 2 sharp edges like a pitchfork, and it's just a bit smaller.", "A fire fork does resemble a small version of a pitchfork and is very useful when cooking over a fire.", "The item hanging on the wall close to the broom looks like a pitchfork."], "image": "train2014/COCO_train2014_000000515106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569768, "question_id": "Wh8bL4zsHSy8MBcpCyzd2B", "question": "What sport is the father hoping his daughter might like in the future?", "choices": ["soccer", "softball", "cross country", "football"], "correct_choice_idx": 1, "direct_answers": ["softball", "baseball", "softball", "baseball", "baseball", "baseball", "baseball", "softball", "softball", "baseball"], "difficult_direct_answer": false, "rationales": ["The man is teaching the girls to swing a bat.", "The other options don't use a bat and ball.", "The father is practicing batting with his daughter. this equipment is also used for softball."], "image": "val2014/COCO_val2014_000000569768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241431, "question_id": "WhB4pAnkvksb87m9TU2NuW", "question": "What animal loves this food?", "choices": ["dog", "cow", "rabbit", "cat"], "correct_choice_idx": 2, "direct_answers": ["rabbit", "horse", "rabbit", "rabbit", "rabbit", "rabbit", "rabbit", "horse", "horse", "bunnies"], "difficult_direct_answer": false, "rationales": ["These are the vegetation they eat", "Traditionally this vegetable is eaten by many creatures but especially rabbits.", "Rabbits consume carrots as their primary favorite meal."], "image": "val2014/COCO_val2014_000000241431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367753, "question_id": "WhZzAHBVZ53pkvP7GYBNBR", "question": "What are the men against the wall working on?", "choices": ["ipods", "tablets", "laptop", "desktops"], "correct_choice_idx": 2, "direct_answers": ["laptops", "laptops", "computers", "laptops", "school work", "laptops", "computers", "homework", "laptop", "laptops"], "difficult_direct_answer": false, "rationales": ["The men have laptops.", "The men are on laptops and typing on them.", "The men are all using their laptops."], "image": "train2014/COCO_train2014_000000367753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496198, "question_id": "Whe4tFXmXVcsxt8ern2qLG", "question": "What type of situation is this?", "choices": ["planned", "formal", "emergency", "celebratory"], "correct_choice_idx": 2, "direct_answers": ["celebration", "fire", "emergency", "fire", "celebration", "emergency", "emergency", "emergency", "fire drill", "emergency"], "difficult_direct_answer": false, "rationales": ["A firetruck is there.", "There is a firefighting truck.", "There is a firetruck."], "image": "val2014/COCO_val2014_000000496198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307438, "question_id": "Why7d8VL3V326cmNcsWGjB", "question": "What color light do the cars perpendicular to the camera have?", "choices": ["green", "blue", "red", "yellow"], "correct_choice_idx": 2, "direct_answers": ["red", "green", "white", "green", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["If the traffic light by the camera is the color that signifies go, then the perpendicular light must be the color that signifies stop.", "A green traffic light on a pole is lit.", "If the light straight ahead is green and the cars can go then the lights on either side must be stopping those cars so as not to cause accidents."], "image": "val2014/COCO_val2014_000000307438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132768, "question_id": "WhyAVEzMrSxUhJYnUJbN6R", "question": "What type of television is on the stand to the left of the laptop?", "choices": ["plasma", "lcd", "crt", "oled"], "correct_choice_idx": 2, "direct_answers": ["flatscreen", "crt", "old", "crt", "flatscreen", "crt", "old tv", "old tv", "old tv", "pictures"], "difficult_direct_answer": false, "rationales": ["The tv is a crt.", "A large crt screen is on top of the stand next to the fireplace.", "The tv is old and bulky."], "image": "train2014/COCO_train2014_000000132768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344308, "question_id": "Wi6orzpGXncNBocKnwigk8", "question": "What is the netting on the canopy for?", "choices": ["warmth", "mosquitoes/insects", "smell", "privacy"], "correct_choice_idx": 1, "direct_answers": ["mosquitos", "mosquitoes/insects", "mosquitoes", "bugs", "mosquitos", "mosquito prevention", "decoration", "mosquitos", "mosquitoes", "bug deterrence"], "difficult_direct_answer": false, "rationales": ["This is used in warm weather to keep bugs from landing on you while you sleep", "This is let down when people are sleeping to keep things off them", "The netting is to keep bugs out."], "image": "val2014/COCO_val2014_000000344308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581196, "question_id": "WiAKRVpTR9omP8RXprkgzo", "question": "What does this train carry?", "choices": ["cars", "passengers", "coal", "livestock"], "correct_choice_idx": 1, "direct_answers": ["people", "passengers", "people", "people", "people", "passengers", "people goods", "people", "passenger", "passengers"], "difficult_direct_answer": false, "rationales": ["The train has passengers.", "This looks like the type of train that is utilized by passengers. livestock could not, from a practical standpoint, travel on a train like this.", "As indicated by the windows on the train cars. the other options would look a lot different."], "image": "train2014/COCO_train2014_000000581196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382307, "question_id": "WiFaohconPmfCo7WWqL9gS", "question": "What black item sits on the floor next to the man?", "choices": ["radiator", "computer", "speaker", "table"], "correct_choice_idx": 2, "direct_answers": ["stereo", "speaker", "speaker", "stereo", "speaker", "speaker", "stereo", "speaker", "speaker", "speaker"], "difficult_direct_answer": false, "rationales": ["There is a silver logo at the bottom that shows the technology company that makes that type of equipment.", "The black item is next to a record player, so it is most likely a speaker.", "The woofer is visible in the box"], "image": "val2014/COCO_val2014_000000382307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93068, "question_id": "WiZkZHWDP4xm3phUDRqKDD", "question": "Where are the bananas being transported to?", "choices": ["wood", "market", "zoo", "farm"], "correct_choice_idx": 1, "direct_answers": ["marketplace", "market", "market place", "market", "market", "marketplace", "market", "grocery store", "store", "market"], "difficult_direct_answer": false, "rationales": ["Farms grow bananas to sell.", "The bananas go to market.", "The only reason someone would drag around a pile of bananas that way is to go to a place to sell them."], "image": "val2014/COCO_val2014_000000093068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300629, "question_id": "WibivpXrFFHnRNtwYtx5tv", "question": "What can be done with the cardboard box when done using it?", "choices": ["boil", "recycle", "eat", "burn"], "correct_choice_idx": 1, "direct_answers": ["recycle", "recycle", "recycled", "recycled", "pizza", "recycling", "target practice", "recycled", "recycled", "recycle"], "difficult_direct_answer": false, "rationales": ["Cardboard is a product that is known to be recyclable and would be done only after it is done being used and no longer has food in it.", "You can put it in the recycle bin.", "The cardboard box should be recycled."], "image": "train2014/COCO_train2014_000000300629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112269, "question_id": "WiyCZDzcH7PsvQsrncipwm", "question": "How does he style his hair?", "choices": ["bowl cut", "braids", "crew cut", "dreadlocks"], "correct_choice_idx": 3, "direct_answers": ["dreadlocks", "dreadlocks", "dreadlocks", "dreadlocks", "dreadlocks", "dreads", "no washing", "died locks", "dreads", "dread locks"], "difficult_direct_answer": false, "rationales": ["The mans hair is clearly visible and is styled with long clumps. this style is known as answer b.", "The hairs look like ropes on his head.", "These are similar to braids and they get left in the hair"], "image": "val2014/COCO_val2014_000000112269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43350, "question_id": "Wj8etCtxfgW6PopjtxzsKj", "question": "What doughnut shown in the image appears to be in higher quantities than the rest?", "choices": ["original glazed", "glazed chocolate", "jelly filled", "lemon filled"], "correct_choice_idx": 0, "direct_answers": ["glazed", "glaze", "glaze", "glazed", "glazed", "glazed", "original glazed", "glazed", "original glazed", "glazed"], "difficult_direct_answer": false, "rationales": ["There are plain shiny donuts on 2 trays", "The glazed looks fresh.", "The glazed one has two trays full of food."], "image": "train2014/COCO_train2014_000000043350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470364, "question_id": "WjD6VxT6qYAD8gLKVyJdHy", "question": "What dessert is on the plates?", "choices": ["croissant", "cookies", "donuts", "cake"], "correct_choice_idx": 2, "direct_answers": ["donuts", "doughnuts", "donut", "donut", "donuts", "donuts", "donut", "donuts", "donuts", "donut"], "difficult_direct_answer": false, "rationales": ["Each item is uncut and has a hole in the middle.", "Each dessert is round and has a hole in its center.", "Donuts are on the plates."], "image": "val2014/COCO_val2014_000000470364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126356, "question_id": "WjK483p63HAyeaKPEVECQU", "question": "What is the occupation of the man wearing a black coat?", "choices": ["policeman", "photographer", "film director", "news reporter"], "correct_choice_idx": 3, "direct_answers": ["camera man", "photographer", "cameraman", "photographer", "photographer", "photographer", "photojournalist", "photographer", "news reporter", "filmographer"], "difficult_direct_answer": false, "rationales": ["He has a camera.", "He's a reporter.", "People with cameras can have a variety of careers, but in this case, there is a crowd and police presence. it is likely that something noteworthy happened, and the person with the camera is trying to gather details for a news story."], "image": "train2014/COCO_train2014_000000126356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516582, "question_id": "WjkRowxabHuDJ33D9CwwCa", "question": "What is the relationship of the woman to the man?", "choices": ["audience", "teammate", "referee", "competitor"], "correct_choice_idx": 1, "direct_answers": ["teammates", "family", "teammates", "daughter", "teammate", "competitors", "sister brother", "teammates", "siblings", "sister"], "difficult_direct_answer": false, "rationales": ["These two people are playing together on the same side of the tennis court.", "They are playing tennis together.", "The relationship is a teammate."], "image": "train2014/COCO_train2014_000000516582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454129, "question_id": "WjrQAi86ozU6D5WUazh9aD", "question": "Who had an accident?", "choices": ["black car", "woman", "man", "motorcyclist"], "correct_choice_idx": 3, "direct_answers": ["motorcyclist", "motorcyclist", "motorcycle", "motorcycle", "motorcyclist", "motorcycle", "motorcyclist", "motorcycle", "motorcycle", "motorcycle"], "difficult_direct_answer": false, "rationales": ["The cyclist had an accident.", "A motorcycle is on the crossing that has had an accident.", "The bike is tipped over and slightly damaged."], "image": "val2014/COCO_val2014_000000454129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511438, "question_id": "WjvDMBPW3bizJiiv675Yq2", "question": "Where did this airplane originate?", "choices": ["nevada", "nigeria", "australia", "great britain"], "correct_choice_idx": 2, "direct_answers": ["united kingdom", "australia", "australia", "queensland", "australia", "oantas side", "emirates", "australia", "qantas", "australia"], "difficult_direct_answer": false, "rationales": ["Qantas was founded in queensland according to an internet search.", "The plane is from australia.", "This airplane has a qantas livery. qantas is not based in nevada, great britain, or nigeria."], "image": "train2014/COCO_train2014_000000511438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340318, "question_id": "Wk26GH6PcA3sVpiyN7yfF5", "question": "What is the person in the center wearing?", "choices": ["mattress", "backpack", "monocle", "clown shoes"], "correct_choice_idx": 1, "direct_answers": ["backpack", "backpack", "winter clothes", "backpack", "helmet", "skis", "ski gear", "winter clothing", "blue backpack", "backpack"], "difficult_direct_answer": false, "rationales": ["The person is trekking around with a backpack.", "The man has a backpack.", "The person has a backpack."], "image": "train2014/COCO_train2014_000000340318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95161, "question_id": "Wk4KkC6Z2EBQbidLoVAU7P", "question": "What hour is it?", "choices": ["ten", "seven", "three", "one"], "correct_choice_idx": 1, "direct_answers": ["seven", "seven", "seven", "seven", "seven", "seven", "seven 7", "seven", "705 a.m", "seven"], "difficult_direct_answer": false, "rationales": ["The time is seven.", "The clock shows the time to be about five minutes after 7.", "The hand is pointing at 7."], "image": "val2014/COCO_val2014_000000095161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272323, "question_id": "WkDs2tCH3fCBNHfXH8tKgY", "question": "Why is the man holding his arms out to his sides?", "choices": ["posing", "balance", "to clap", "to wave"], "correct_choice_idx": 1, "direct_answers": ["balance", "balance", "balance", "balance", "maintain balance", "for balance", "balance", "balance", "for balance", "balance"], "difficult_direct_answer": false, "rationales": ["He doesn't want to fall.", "The other options don't apply to this move or sport.", "The man wants to balance."], "image": "val2014/COCO_val2014_000000272323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499371, "question_id": "WkjWN8LUkoRWkaQYpcWRoS", "question": "What type of action are the people taking?", "choices": ["retreat", "ascend", "descent", "attack"], "correct_choice_idx": 2, "direct_answers": ["skiing", "ski", "playing", "downhill sking", "descent", "skiing", "ski", "playing", "caucusing", "skiing"], "difficult_direct_answer": false, "rationales": ["The action is descending.", "The people are skiing and, since it's impossible to ski up, they are skiing downhill or descending.", "The people are going downhill."], "image": "train2014/COCO_train2014_000000499371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98345, "question_id": "WkmUhKKoECWWGtZY2GmPAg", "question": "What is the business depicted in the photo?", "choices": ["growing vegetable", "transportation", "selling vegetable", "fishery"], "correct_choice_idx": 2, "direct_answers": ["vegetable seller", "farmer", "selling vegetable", "fisherman", "fisherman", "vegetable vendor", "greengrocer wholesaler", "market", "farming", "floating market"], "difficult_direct_answer": true, "rationales": ["The business is selling veggies.", "These are farmers bringing their products to market", "They have baskets of greens."], "image": "train2014/COCO_train2014_000000098345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210052, "question_id": "WknKLXtMNvxzn43NsochXD", "question": "Why is the rider wearing gloves?", "choices": ["fashion", "warmth", "health", "grip"], "correct_choice_idx": 3, "direct_answers": ["grip", "protection", "protection", "protection", "grip", "driving", "warmth", "protection", "protection", "safety"], "difficult_direct_answer": false, "rationales": ["The gloves are heavy and appear to be insulated, indicating they are being worn to protect against the cold.", "To keep hold of the handle bars better.", "The gloves make it easier for the rider to hold the handles."], "image": "train2014/COCO_train2014_000000210052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346947, "question_id": "Wkrgv8GW8xrW6vrqV5oCFY", "question": "Where will they put that food after they are done?", "choices": ["stove", "oven", "shelf", "fridge"], "correct_choice_idx": 1, "direct_answers": ["oven", "stomachs", "pepperon bag", "eat it", "belly", "oven", "kitchen top", "pepperoni", "microwave", "oven"], "difficult_direct_answer": false, "rationales": ["The girls are preparing small pizzas that have dough that will be baked before eating.", "The food will be baked.", "They will all put their foods in the oven."], "image": "train2014/COCO_train2014_000000346947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305385, "question_id": "WkrmEPRjJeCzhMF3XtVfXQ", "question": "Why are there so many buses?", "choices": ["tourist destination", "coincidence", "break time", "rush hour"], "correct_choice_idx": 0, "direct_answers": ["transit", "tourist destination", "busy area", "busy city", "tourists", "many people", "tourist groups", "tourist attraction", "transport", "for tourists"], "difficult_direct_answer": true, "rationales": ["There are so many buses going to a tour destination.", "People are visiting the area. to get people into the area, multiple vehicles are necessary.", "This is near buckingham palace which is a top tourist destination."], "image": "train2014/COCO_train2014_000000305385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561891, "question_id": "Wm6jspF4mvPB3EuSxQhxyg", "question": "What sort of beverages will be served here?", "choices": ["coffee", "alcohol", "tea", "milk"], "correct_choice_idx": 1, "direct_answers": ["wet ones", "alcoholic", "cocktails", "alcohol", "chemical beverages", "alcohol", "alcoholic beverages", "alcoholic", "alcohol", "cocktails"], "difficult_direct_answer": false, "rationales": ["This is a dimly lit room with bright neon lights along the walls. there is a counter with a lot of bottles that contain liquid behind it.", "A bar has bottles behind it and is lit up in neon.", "Those are liquor bottles"], "image": "train2014/COCO_train2014_000000561891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117838, "question_id": "Wm9ve9fH29vNZNttXFRi7k", "question": "What are these people waiting to do?", "choices": ["board bus", "see show", "shop sale", "get treatment"], "correct_choice_idx": 0, "direct_answers": ["board bus", "board bus", "board", "board", "board bus", "embark", "board bus", "board bus", "ride bus", "board"], "difficult_direct_answer": false, "rationales": ["They want to get on the bus.", "They are standing behind the person who is now boarding the vehicle.", "The people want the bus."], "image": "train2014/COCO_train2014_000000117838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459721, "question_id": "WmEe4AxtuoZoCBo6SWwGUh", "question": "What did the pair here recently exchange?", "choices": ["clothes", "moms", "cards", "rings"], "correct_choice_idx": 3, "direct_answers": ["vows", "rings", "wedding vows", "vows", "vows", "wedding vows", "vows", "vows", "vows", "vows"], "difficult_direct_answer": false, "rationales": ["Bride and groom cut a cake like this at their wedding and rings are exchanged when they say their vows", "They are cutting their wedding cake while wearing bride and groom attire.", "Wedding ceremonies involve ring exchanges."], "image": "val2014/COCO_val2014_000000459721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573882, "question_id": "WmJRonXn3Mnqe72vsNJutQ", "question": "What is the article of clothing at the base of the console used for?", "choices": ["bulletproofing", "holding weights", "protect eyes", "walking"], "correct_choice_idx": 3, "direct_answers": ["advertising", "protection fashion", "wrist", "game playing", "virtual reality", "walking", "foot protection", "covering feet", "walking", "feet"], "difficult_direct_answer": true, "rationales": ["The clothing is for walking.", "The person took off their shoes to play the game.", "That's what most people do with their shoes."], "image": "val2014/COCO_val2014_000000573882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103822, "question_id": "WmNmczuSiAWnCk8Qg52jvL", "question": "What type of shot is the man about to hit?", "choices": ["serve", "backhand", "slice", "forehand"], "correct_choice_idx": 3, "direct_answers": ["forehand", "forehand", "tennis", "tennis swing", "forehand", "ace", "serve", "forehand", "slap", "backhand"], "difficult_direct_answer": false, "rationales": ["The shot is a forehand one.", "The man seems to hit the ball forward.", "The man has his front hand out."], "image": "train2014/COCO_train2014_000000103822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106244, "question_id": "WmPnjotVUM6FdyLSebqdPZ", "question": "How was the item prepared that is about to be bitten?", "choices": ["deep fried", "broiled", "its raw", "baked"], "correct_choice_idx": 0, "direct_answers": ["fried", "deep fried", "fried", "fried", "baked", "fried", "fried", "doughnut", "fried", "deep fried"], "difficult_direct_answer": false, "rationales": ["The item was fried.", "It is a doughnut, which is cooked in hot oil.", "The man is eating a doughnut, and doughnuts are fried in order to make the dough consumable by customers."], "image": "val2014/COCO_val2014_000000106244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100516, "question_id": "Wmjbza83cWAYCeELyzFD47", "question": "What part of tennis is happening?", "choices": ["serve", "side spin", "block", "backhand"], "correct_choice_idx": 0, "direct_answers": ["serving", "serve", "serve", "hit", "serve", "serve", "serve", "serve", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The player's body is in the position to serve.", "He is hitting the ball to his opponent.", "The man is serving."], "image": "train2014/COCO_train2014_000000100516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224104, "question_id": "WmmzWVCbhvknqN5uqwRvT6", "question": "Why is he on a horse?", "choices": ["stole horse", "showing off", "can't walk", "is patrolling"], "correct_choice_idx": 3, "direct_answers": ["patrol", "mounted police", "transportation", "is patrolling", "to observe", "on patrol", "policing", "good mobility", "patrolling", "patrol"], "difficult_direct_answer": true, "rationales": ["A police officer is keeping the city safe", "The man is in a police uniform and has the trappings of a police officer. police officers that ride horses do so while patrolling.", "This is a police man that is riding around and protecting the city."], "image": "val2014/COCO_val2014_000000224104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309409, "question_id": "WnC5i3LYthE6AwPVty4SSZ", "question": "What sort of weather is happening where these people gather?", "choices": ["tidal wave", "sun", "sleet storm", "rain"], "correct_choice_idx": 3, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The gear present protects people from falling water.", "The people are trying to stay on of the rain.", "A man and woman are outdoors with their heads covered with a hood and an umbrella."], "image": "train2014/COCO_train2014_000000309409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220214, "question_id": "WnUyHVEXvFzHiUnPgnVpqa", "question": "What is the last word on the train?", "choices": ["oak", "level", "pass", "baby"], "correct_choice_idx": 2, "direct_answers": ["pass", "pass", "pass", "pass", "pass", "pass", "pass", "pass", "pass", "pass"], "difficult_direct_answer": false, "rationales": ["The last word is pass.", "It says it on the train.", "A train has a company logo on the side."], "image": "val2014/COCO_val2014_000000220214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372319, "question_id": "Wnn23FJkuytbFfrV4w4eWB", "question": "Why is the woman holding the bottle?", "choices": ["to buy", "to sell", "to drink", "to collect"], "correct_choice_idx": 2, "direct_answers": ["quenching thirst", "avoid spill", "drinking", "to drink", "water bottle", "thirsty", "drinking", "to drink", "she's thirsty", "drinking"], "difficult_direct_answer": false, "rationales": ["A girl drinks a beverage when she is thirsty.", "She has liquid in it to quench her thirst.", "She is holding the cap to the bottle in the other hand."], "image": "train2014/COCO_train2014_000000372319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342812, "question_id": "WoQpAJrWC7hrJuM6FXeYBk", "question": "What is the woman near the 8 St. sign wearing?", "choices": ["baseball cap", "shorts", "suit", "leg warmers"], "correct_choice_idx": 1, "direct_answers": ["white t-shirt", "shorts", "shorts", "t-shirt", "shorts", "t-shirt shorts", "t shirt", "shorts", "shorts", "black shorts"], "difficult_direct_answer": false, "rationales": ["A woman is wearing bottoms that expose her legs but have legs rather than a skirt.", "The woman is wearing shorts.", "The woman near the sign has shorts on."], "image": "train2014/COCO_train2014_000000342812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475056, "question_id": "WoeHK2rAkoGzMiU8U6AEad", "question": "Why is he holding the door?", "choices": ["resistance", "balance", "privacy", "visibility"], "correct_choice_idx": 1, "direct_answers": ["balance", "balance", "standing", "won't close", "for balance", "open", "balance", "support", "flooding", "stay balanced"], "difficult_direct_answer": false, "rationales": ["He is only on one foot, meaning that he probably needs a way to better keep his balance, which is where his hand on the door comes in. people tend to grab the closest, sturdiest object to them when they begin to struggle with balance.", "To keep his balance.", "The man wants to keep his balance."], "image": "train2014/COCO_train2014_000000475056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377473, "question_id": "Wp3bGPQKrxx7r7Sp5RAQ2t", "question": "Why is the woman standing in front of the man?", "choices": ["to photograph", "to paint", "to tackle", "to wrestle"], "correct_choice_idx": 0, "direct_answers": ["speak something", "taking picture", "talking", "to photograph", "taking pictures", "taking picture", "taking picture", "video catch", "propose", "photography"], "difficult_direct_answer": false, "rationales": ["The woman is holding a camera. the man is posing.", "She is holding a camera up, facing the man.", "The woman is pointing a camera at the man."], "image": "train2014/COCO_train2014_000000377473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184377, "question_id": "WpDj6iZT44zpgZTkJ6YNSi", "question": "What profession is this man probably in?", "choices": ["horse trainer", "steeplejack", "mounted police", "parade"], "correct_choice_idx": 2, "direct_answers": ["law enforcement", "police", "police", "police", "mounted police", "police", "police", "police", "police", "police"], "difficult_direct_answer": false, "rationales": ["The man is dressed as most police officers would. in addition, police in some cities use horses which are mounted.", "The man seems to have a police uniform on and a police helmet.", "The man is visibly on a horse in an urban setting and is wearing a uniform that is consistent with answer a."], "image": "train2014/COCO_train2014_000000184377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76334, "question_id": "WpEKTFjyiK5mMAdVWgmVi2", "question": "What is the relationship of the man to the woman?", "choices": ["husband", "brother", "son", "father"], "correct_choice_idx": 0, "direct_answers": ["husband", "husband", "husband", "married", "married", "husband", "husband", "married", "husband", "husband"], "difficult_direct_answer": false, "rationales": ["The relationship is the husband.", "They are celebrating their wedding and his wife is in a white dress.", "The man is the husband of the woman in white he just married moments ago. as the rice is flying towards them, he's trying to keep the stuff out of his eyes!."], "image": "val2014/COCO_val2014_000000076334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208479, "question_id": "WpF7jnGja9AnZuLfC9mVWe", "question": "What is the silver object on the man's wrist used for?", "choices": ["telling time", "covering cut", "preventing sweat", "tracking steps"], "correct_choice_idx": 0, "direct_answers": ["tell time", "knowing time", "time", "time", "time", "telling time", "telling time", "telling time", "telling time", "watch"], "difficult_direct_answer": false, "rationales": ["It's a wristwatch obviously.", "The thing on his wrist is a watch.", "The silver object on the man's wrist is a watch."], "image": "train2014/COCO_train2014_000000208479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454181, "question_id": "WpW2jgP5DWwk8q5PREvUfT", "question": "Which one of these towns is in the region described by the book?", "choices": ["honolulu", "las vegas", "juneau", "springfield"], "correct_choice_idx": 3, "direct_answers": ["springfield", "holyoke", "boston", "boston", "pittsfield", "western", "boston", "chicopee", "boston", "pittsfield"], "difficult_direct_answer": false, "rationales": ["The town is springfield.", "The third largest city in massachusetts.", "The book is about western massachusetts and springfield is located there."], "image": "train2014/COCO_train2014_000000454181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309880, "question_id": "WpapLQTPwmsQAG4P9EnEUp", "question": "What famous painting can be seen on the wall?", "choices": ["scream", "american gothic", "mona lisa", "kiss"], "correct_choice_idx": 2, "direct_answers": ["mona lisa", "mona lisa", "mona lisa", "mona lisa", "mary", "mona lisa", "mona lisa", "mona lisa", "mona lisa", "mona lisa"], "difficult_direct_answer": false, "rationales": ["It is an unsmiling woman sitting", "There is a picture of a woman with a secretive smile.", "There is a mona lisa on the wall."], "image": "train2014/COCO_train2014_000000309880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124174, "question_id": "Wpe5foUm5g3NRVkyhaxPh5", "question": "What might the man be attempting to do with the animals?", "choices": ["herd them", "sheer them", "hunt them", "ride them"], "correct_choice_idx": 0, "direct_answers": ["herd", "heard", "herd", "herd", "herd", "herd them", "herd them", "herd", "herd sheep", "herd"], "difficult_direct_answer": false, "rationales": ["The photo includes a mountain side which has a lot of sheep scattered around. there is a man in the foreground who is holding a staff and has a dog on the far left. these are all signs that he is trying to herd them.", "The man would herd them.", "He is trying to gather them all to move them to another spot."], "image": "train2014/COCO_train2014_000000124174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467939, "question_id": "WqUJicDJ3rN7Ndxt3qs6r2", "question": "What is being used to keep the sandwiches from falling apart?", "choices": ["tape", "knives", "toothpicks", "glue"], "correct_choice_idx": 2, "direct_answers": ["toothpicks", "toothpicks", "toothpicks", "toothpicks", "toothpicks", "toothpicks", "toothpicks", "toothpicks", "toothpicks", "tooth picks"], "difficult_direct_answer": false, "rationales": ["The small wooden sticks in the sandwiches are used to keep them from falling apart.", "Traditionally these items are used to keep large sandwiches together.", "The sandwich has toothpicks."], "image": "train2014/COCO_train2014_000000467939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523711, "question_id": "Wqdw8sznGVxFTCLPf4HXgh", "question": "Where would these animals most likely come face to face?", "choices": ["farm", "barbecue", "zoo", "rodeo"], "correct_choice_idx": 0, "direct_answers": ["farm", "zoo", "farm", "farm", "zoo", "farm", "farm", "farm", "farm", "zoo"], "difficult_direct_answer": false, "rationales": ["The animals are found in barns.", "The animals are on a farm.", "A cow and pig are on a car. they belong on a farm."], "image": "train2014/COCO_train2014_000000523711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500076, "question_id": "Wqe35d4nc2GPS55adSNCv4", "question": "What hotel is in the background behind the policemen and horses?", "choices": ["best western", "hilton", "wyndham", "hyatt"], "correct_choice_idx": 3, "direct_answers": ["hyatt", "hyatt", "military", "hyatt", "hyatt", "hyatt regency", "hyatt", "hyatt", "hyatt", "hyatt"], "difficult_direct_answer": false, "rationales": ["You can see some of the letters on the sign.", "Hyatt is in the background.", "A partially visible sign is behind the horses. the hotel's name also appears on the banner on one of the poles."], "image": "train2014/COCO_train2014_000000500076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233311, "question_id": "WqrxUkkddPSExjQnU5AkWB", "question": "Which object is most likely the softest?", "choices": ["orange", "plush monkey", "table", "letter"], "correct_choice_idx": 1, "direct_answers": ["bear", "toy", "monkey", "toy", "toy monkey", "stuffed monkey", "monkey", "plush monkey", "monkey doll", "toy monkey"], "difficult_direct_answer": false, "rationales": ["The stuffed animal is stuffed with stuffing and has soft fur.", "The monkey is soft.", "The stuffed animal sitting on the table would be soft because it is filled with stuffing."], "image": "val2014/COCO_val2014_000000233311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538057, "question_id": "WqsWjfDwGbAaqZe3JvCBZD", "question": "What might be taking place in the building to the right?", "choices": ["baseball game", "roller derby", "worship service", "property auction"], "correct_choice_idx": 2, "direct_answers": ["church", "celebration", "service", "religious communion", "church service", "worship service", "event", "church", "church", "church service"], "difficult_direct_answer": false, "rationales": ["It is a church with a tall steeple.", "The design of the building matches that iof a church, where worship takes place every day.", "It appears that the building to the right is a church so it is likely that some church event is occurring."], "image": "train2014/COCO_train2014_000000538057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166864, "question_id": "Wr4eLgAAiWDnPKLfAfCEqF", "question": "What are the boats made out of?", "choices": ["metal", "legos", "plasma", "ceramic"], "correct_choice_idx": 1, "direct_answers": ["steel", "metal", "wood metal", "legos", "legos", "steel", "lego", "plastic material", "legos", "plastic"], "difficult_direct_answer": false, "rationales": ["The boats are miniatures made from plastic bricks.", "The water is noticeably shallow and the boats are to scale with a lego figurine in the foreground.", "This is a recreation of an actual harbor using a."], "image": "train2014/COCO_train2014_000000166864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165686, "question_id": "WrRk7ZyUpZMc8AffAebZfR", "question": "What is in the bowl by the beef?", "choices": ["garlic butter", "mayonnaise", "au jus", "horseradish"], "correct_choice_idx": 0, "direct_answers": ["soup", "soup", "soup", "garlic butter", "brocolli", "food", "soup", "chai", "soup", "gravy"], "difficult_direct_answer": false, "rationales": ["The plate has butter.", "The steak is served with butter.", "There is some garlic butter on by the beef."], "image": "val2014/COCO_val2014_000000165686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502134, "question_id": "WrgYnXVmbav3mvEs4EHoMe", "question": "Who is riding on the motorcycle?", "choices": ["easter bunny", "freddy krueger", "jason vorhees", "santa claus"], "correct_choice_idx": 3, "direct_answers": ["santa", "santa", "santa claus", "santa claus", "santa claus", "santa", "santa claus", "motorcycle driver", "santa", "santa"], "difficult_direct_answer": false, "rationales": ["Santa is wearing his red suit.", "Santa claus is known for dressing in a red suit and wearing a long white beard.", "Santa is riding."], "image": "train2014/COCO_train2014_000000502134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569767, "question_id": "WsecUSDprubbCfcLa7Gjhz", "question": "What animals are present?", "choices": ["horse", "dog", "deer", "giraffe"], "correct_choice_idx": 0, "direct_answers": ["horses", "horses", "horses humans", "horses", "horses", "horse", "horses humans", "horses humans", "horses", "horses"], "difficult_direct_answer": false, "rationales": ["Two horses are a little ways up the dirt road, and both are being ridden today by their respective owners.", "A horse is shown.", "The animals are large, four-legged, have long hair tails, and are being ridden by humans. horses are a common riding animal."], "image": "val2014/COCO_val2014_000000569767.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173198, "question_id": "Wsk5qAKzfU6zTNCEijVeTs", "question": "What are the black and yellow object on the ground for?", "choices": ["to wear", "to discard", "to extinguish", "to throw"], "correct_choice_idx": 0, "direct_answers": ["firefighter uniform", "suit", "life jackets", "firefighter uniform", "safety", "fire protection", "body protection", "visibility", "outfit", "to wear"], "difficult_direct_answer": true, "rationales": ["The yellow and black objects are to wear.", "Traditionally fire fighters wear these colors for safety.", "There is a fire suit on the ground."], "image": "train2014/COCO_train2014_000000173198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448425, "question_id": "WsnY3SWnVFHsyHiEJEJNmM", "question": "What sandwich does this bus share a name with?", "choices": ["open-face", "double decker", "reuben", "blt"], "correct_choice_idx": 1, "direct_answers": ["double decker", "double decker", "double decker", "double decker", "double decker", "double decker", "double decker", "double decker", "double decker", "double decker"], "difficult_direct_answer": false, "rationales": ["The sandwich is the double decker.", "The bus is two-story as is the sandwich.", "Double decker sandwiches have two stacks of bread and meat."], "image": "val2014/COCO_val2014_000000448425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516119, "question_id": "Wt4BonuCmcf7SMjzpfXS4S", "question": "Concave lens is used in which device?", "choices": ["none", "mirror", "camera", "specs"], "correct_choice_idx": 2, "direct_answers": ["eyeglasses", "camera", "lens", "no idea", "phone camera", "eyeglasses", "glasses", "phone camera", "contact lens", "eyeglasses"], "difficult_direct_answer": false, "rationales": ["A man is holding a phone up to capture a picture.", "There is a lens in the small camera.", "Cameras use curved lenses."], "image": "train2014/COCO_train2014_000000516119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217636, "question_id": "WuBTujmWERAVqffNwbe9WS", "question": "What material is this road made of?", "choices": ["asphalt", "concrete", "pavement", "cobblestone"], "correct_choice_idx": 3, "direct_answers": ["brick", "cobblestone", "cobblestone", "bricks", "brick", "brick", "brick", "bricks", "brick", "bricks"], "difficult_direct_answer": false, "rationales": ["The road is paved with small stones.", "The street the horse is on is cobblestone.", "The material is cobblestone."], "image": "train2014/COCO_train2014_000000217636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182715, "question_id": "WuFyvHNW3MM4jihb74oKbf", "question": "How is this food being prepared?", "choices": ["scooped", "sliced", "boiled", "poured"], "correct_choice_idx": 1, "direct_answers": ["knife", "with knife", "cut", "chopped", "sliced", "sliced", "chopped", "kid", "chopped", "diced"], "difficult_direct_answer": false, "rationales": ["The food is sliced.", "The child is holding a knife and there is food in front of her that has been cut up into pieces. knives are known for being able to slice food into pieces.", "She is using a knife"], "image": "val2014/COCO_val2014_000000182715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86512, "question_id": "WuGjYbcwH663iDBq94ixQK", "question": "Where is this game being played?", "choices": ["park", "gym", "stadium", "school"], "correct_choice_idx": 2, "direct_answers": ["new york", "nyc", "nyc", "new york", "baseball field", "stadium", "new york", "new york", "stadium", "stadium"], "difficult_direct_answer": false, "rationales": ["Baseballs games are played in large buildings where fans can watch.", "Professional players are on a baseball diamond.", "Because the ground looks like a pitch and players are also visible."], "image": "train2014/COCO_train2014_000000086512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307683, "question_id": "WuLUqd3b7xHw4gZ2JhiiPw", "question": "What is the sweetest food?", "choices": ["meatballs", "bread", "apple", "egg"], "correct_choice_idx": 2, "direct_answers": ["apple", "apple", "its cake", "brownie", "cake", "apple", "apple", "apple", "cake", "apple"], "difficult_direct_answer": false, "rationales": ["The food is the apple.", "Fruit is sweeter than bread and eggs. an apple can be seen among an egg and a sandwich with a whole grain bread.", "Eggs, bread, and meatballs are savory, not sweet. fruit are sweet."], "image": "val2014/COCO_val2014_000000307683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309371, "question_id": "WufSrSUkTpZMbZDjid2Pxd", "question": "What type of attraction seems to be setup in this location?", "choices": ["car wash", "debate", "concert", "circus"], "correct_choice_idx": 3, "direct_answers": ["carnival", "circus", "circus", "circus", "kids party", "circus", "carnival", "circus", "circus", "circus"], "difficult_direct_answer": false, "rationales": ["There is a circus tent.", "There is a large tent", "Circuses usually take place in tents. car washes, debates, and concerts usually take place in more permanent buildings."], "image": "val2014/COCO_val2014_000000309371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149185, "question_id": "WukqbmrKyYUTf2MMEjLT8v", "question": "What type of skate maneuver is the man attempting?", "choices": ["grab", "grind", "manual", "flip trick"], "correct_choice_idx": 3, "direct_answers": ["kick flip", "kick flip", "trick jump", "ollie", "jump", "kickflip", "jump", "ollie", "flip trick", "jump"], "difficult_direct_answer": false, "rationales": ["The man is attempting to flip the board.", "The boy is trying to flip the skateboard on a side.", "The type is a flip trick."], "image": "train2014/COCO_train2014_000000149185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427166, "question_id": "WusKJxzK6gNbeZPinPeAXp", "question": "Why is she turned around?", "choices": ["saying goodbye", "changed mind", "is afraid", "forgot something"], "correct_choice_idx": 0, "direct_answers": ["saying goodbye", "saying bye", "photograph", "waving goodbye", "saying bye", "for picture", "picture", "photo", "picture", "being photographed"], "difficult_direct_answer": false, "rationales": ["She is walking towards an airplane and is turned back.", "She is walking towards the plane and turning around to say bye.", "She's saying bye."], "image": "train2014/COCO_train2014_000000427166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565264, "question_id": "Wv9h2eUA9sGryTUMcdxQw6", "question": "Why are some bottles horizontal?", "choices": ["for mixing", "moisten cork", "keep warm", "less space"], "correct_choice_idx": 1, "direct_answers": ["storage", "for display", "wine rack", "on rack", "moisten cork", "moisten cork", "moistens cork", "storage", "storage", "better stacking"], "difficult_direct_answer": false, "rationales": ["Wine bottles are stored sideways to moisten the cork to make the bottle easier to open.", "It can't get dry or it crumbles and ruins the liquid", "The bottles are in a rack."], "image": "train2014/COCO_train2014_000000565264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478212, "question_id": "WvPYrtV3qznERwPDKwL2da", "question": "What types of trees are these?", "choices": ["oak", "willow", "ash", "evergreen"], "correct_choice_idx": 3, "direct_answers": ["evergreen", "pine", "pine", "pines", "pine", "pines", "pine", "pine", "pines", "pine"], "difficult_direct_answer": false, "rationales": ["The trees in the background are pine trees.", "The trees are called evergreens.", "The types are evergreens."], "image": "val2014/COCO_val2014_000000478212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567858, "question_id": "Wvd9NVumYmVneCEcg87ZUJ", "question": "What kind of building is it on the left?", "choices": ["hotel", "government building", "residential building", "shopping mall"], "correct_choice_idx": 1, "direct_answers": ["arch", "columned", "courthouse", "government building", "bank", "coliseum", "gate", "castle", "court house", "courthouse"], "difficult_direct_answer": true, "rationales": ["The building is for the government.", "It has stone pillars with intricate designs", "With people dressed in official clothing on horses seem to be entering a building that is designed to look like government building."], "image": "train2014/COCO_train2014_000000567858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355040, "question_id": "WvovjdHTozdLj3RUz3ZB8U", "question": "The word in red means what in English?", "choices": ["devil", "ghost", "angel", "yeti"], "correct_choice_idx": 0, "direct_answers": ["devil", "devil", "devil", "devil", "devil", "devil", "devil", "devil", "devil", "devil"], "difficult_direct_answer": false, "rationales": ["The sign says diablo.", "The word is devil.", "The word in red is diablo. this is the satanic figure who is in hell."], "image": "val2014/COCO_val2014_000000355040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218931, "question_id": "WvqXAHMdFBH9V83qG9Zqt8", "question": "What kind of label is on their shirts?", "choices": ["warning", "instructional", "brand", "name tag"], "correct_choice_idx": 3, "direct_answers": ["name", "name tag", "nametags", "nametag", "not clear", "name", "introductory label", "name tags", "nametag", "name tag"], "difficult_direct_answer": false, "rationales": ["The label is a name tag.", "They have a white, rectangular sticker on the top of their shirt with words on it, normally to help people to know who they are.", "These people are at some kind of meeting. they might not know each other."], "image": "train2014/COCO_train2014_000000218931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92134, "question_id": "Wvrr8PqQiH6Qne7QKEinCQ", "question": "What type of vessels is the white one?", "choices": ["cargo ship", "cruise ship", "ferry", "navy battleship"], "correct_choice_idx": 1, "direct_answers": ["cruise ship", "ship", "cruise ship", "ship", "cruise ship", "cruise ship", "cruise ship", "cruise ship", "ship", "cruise ship"], "difficult_direct_answer": false, "rationales": ["The ship has many windows and decks. it is aesthetically pleasing.", "It is a very large ship with a lot of upper decks", "The white vessel is for a cruise."], "image": "val2014/COCO_val2014_000000092134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273324, "question_id": "WvxDWdykrza3mgdBLTAD7S", "question": "What is the tallest item called here?", "choices": ["urinal", "closet", "shed", "inlet"], "correct_choice_idx": 0, "direct_answers": ["urinal", "urinal", "urinal", "urinal", "urinal", "urinal", "urinal", "urinal", "urinal", "urinal"], "difficult_direct_answer": false, "rationales": ["This is a place in a mens bathroom they can use instead of a toilet when they have to urinate.", "The urinal is very tall.", "The tallest item is a urinal."], "image": "train2014/COCO_train2014_000000273324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482526, "question_id": "Ww4ZwM2bWsz3RoAKjFzJP8", "question": "What is on the branch?", "choices": ["cat", "birds", "man", "baby eel"], "correct_choice_idx": 1, "direct_answers": ["birds", "birds", "birds", "birds", "birds", "birds", "birds", "birds", "birds", "birds"], "difficult_direct_answer": false, "rationales": ["The branch has birds.", "The animals have beaks and are resting on it similar to how feathered animals sit on branches.", "These items are in the shapes of birds and have the body of a bird and a beak."], "image": "val2014/COCO_val2014_000000482526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519676, "question_id": "WwjNj5Ca4CSjM66eNsGWpt", "question": "How many wheels are visible on the vehicle that is front and center?", "choices": ["two", "three", "seven", "six"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "four", "four", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is one in the front and one in the back", "Two wheels are visible.", "There is one front wheel and one back wheel visible, which equals two."], "image": "val2014/COCO_val2014_000000519676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36607, "question_id": "WwnQsWFERQ5vALtR7ALARn", "question": "What is the quickest way to turn off the phone?", "choices": ["press 0", "shut it", "dial 911", "email code"], "correct_choice_idx": 1, "direct_answers": ["close", "close flap", "close it", "close it", "shut it", "power button", "close", "close it", "close screen", "flip shut"], "difficult_direct_answer": false, "rationales": ["The phone will shut down once it's shut off.", "With these older flip phones the best way to turn it off is closing it.", "A person is holding a flip phone and the top part can be closed."], "image": "val2014/COCO_val2014_000000036607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66828, "question_id": "WwtNWwu8Cs6sUsndwFEnq5", "question": "Why are the people riding the elephants?", "choices": ["to perform", "circus act", "to race", "cross river"], "correct_choice_idx": 3, "direct_answers": ["traveling", "crossing river", "poor infrastructure", "cross river", "transportation", "tour", "crossing river", "cross river", "go across", "for fun"], "difficult_direct_answer": false, "rationales": ["They are trying to cross a river.", "The people don't want to get wet.", "The people here are riding on the elephants to cross the river."], "image": "train2014/COCO_train2014_000000066828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92998, "question_id": "Wwy4EBMyVKYxpESm9mQWco", "question": "This style of furnishing would be most appropriate for a home in what setting?", "choices": ["rural", "tropical", "urban", "suburban"], "correct_choice_idx": 0, "direct_answers": ["countryside/rural", "country", "visitors", "rural", "rustic", "farmhouse", "evening", "country", "country", "rural"], "difficult_direct_answer": false, "rationales": ["It is simplistic and easy to dust.", "The colors of the room and strong use of woodwork are most consistent with the farmhouse decorating style.", "That said, it could also be appropriate in a b setting, depending on the homeowner's tastes."], "image": "val2014/COCO_val2014_000000092998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285893, "question_id": "WxTXz94hikfj3mJDioUTfN", "question": "The man has what on his head?", "choices": ["umbrella", "bonnet", "helmet", "cap"], "correct_choice_idx": 3, "direct_answers": ["cap", "hat", "hat", "hat", "cap", "cap", "hat", "hat", "baseball cap", "ballcap"], "difficult_direct_answer": false, "rationales": ["You can tell by the shape of the garment to what it is.", "The man has a cap.", "It has a a bill at the front and fabric around the sides, unlike the other options mentioned."], "image": "train2014/COCO_train2014_000000285893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100166, "question_id": "WxW9HpeYsxuo23wu4rDR9v", "question": "What type of drinking vessels are on the bench?", "choices": ["steins", "teacups", "lowball", "espresso cups"], "correct_choice_idx": 0, "direct_answers": ["mugs", "cups", "beer stein", "mug", "steins", "mugs", "steins", "steins", "mugs", "mugs"], "difficult_direct_answer": false, "rationales": ["Steins are on the bench.", "The vessel is a stein.", "Steins have handles like the vessels shown."], "image": "val2014/COCO_val2014_000000100166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319933, "question_id": "WxeBuL2iwKcpMTVugqpGWg", "question": "What are the people in the background doing?", "choices": ["serving customers", "buy donuts", "delivering drinks", "delivering donuts"], "correct_choice_idx": 1, "direct_answers": ["analysing", "ordering", "buying doughnuts", "ordering", "frying donuts", "ordering", "ordering", "ordering donuts", "making donuts", "buy donuts"], "difficult_direct_answer": false, "rationales": ["The people in the background are standing on the opposite side of a counter from the visible register which is the place a customer would stand. there appears to be donuts cooking in the foreground so the people making purchases are likely buying donuts.", "Donuts are being cooked.", "In the far background... there's also likely someone doing b."], "image": "val2014/COCO_val2014_000000319933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344005, "question_id": "WyAvVQt3FqotxBE4puMcqB", "question": "What is the woman near the frisbee wearing?", "choices": ["scarf", "parka", "bikini", "hat"], "correct_choice_idx": 2, "direct_answers": ["bra", "bikini top", "bathing suit", "bikini top", "bikini", "beachwear", "bikini top", "bikini top", "bra", "bikini top"], "difficult_direct_answer": false, "rationales": ["The woman is wearing a piece of clothing at the beach that is held by a strap in the back. this article of beach clothing for women is called a bikini.", "The woman is wearing a bikini top.", "The woman has a bikini."], "image": "val2014/COCO_val2014_000000344005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223127, "question_id": "WyfsKfN2y9ZDRFrRyte6J5", "question": "The skateboarder leaping over the skateboards in the park is doing it during which season?", "choices": ["spring", "summer", "winter", "fall"], "correct_choice_idx": 0, "direct_answers": ["summer", "summer", "summer", "summer", "summer", "summer", "spring", "spring", "spring", "spring"], "difficult_direct_answer": false, "rationales": ["The trees have their leaves and it looks warm.", "The season is spring.", "There are light green leaves on the trees and he is wearing a long-sleeved shirt, so it's not very warm outside yet."], "image": "train2014/COCO_train2014_000000223127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82138, "question_id": "WygzrNPZHAT9pdAp8EoiiA", "question": "What is this type of ride called?", "choices": ["coaster", "carousel", "bumper cars", "bounce house"], "correct_choice_idx": 1, "direct_answers": ["carousel", "carousel", "carousel", "carousal", "merry-go-round", "merry-go-round", "carousel", "carousel", "carousel", "carousel"], "difficult_direct_answer": false, "rationales": ["The ride is a carousel.", "The ride shown is a carousel which usually has animal figures for people to ride in circles.", "It goes around in circles"], "image": "val2014/COCO_val2014_000000082138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416271, "question_id": "Wyy2H6gZYRDvV7uVQa6BNF", "question": "What is the woman in the white shirt using in her hands?", "choices": ["calculator", "makeup", "gameboy", "phone"], "correct_choice_idx": 3, "direct_answers": ["phone", "phone", "phone", "phone", "phone", "using phone", "cellphone", "to text", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["The woman has a phone.", "She is texting.", "The woman is looking at her phone."], "image": "val2014/COCO_val2014_000000416271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299871, "question_id": "WyybHpbqKEmnfh6vM5CFWu", "question": "This retail chain was founded in what country?", "choices": ["mexico", "america", "brazil", "colombia"], "correct_choice_idx": 2, "direct_answers": ["brazil", "america", "unknown", "mexico", "america", "brazil", "unknown", "america", "america", "america"], "difficult_direct_answer": false, "rationales": ["I believe this sounds like a company from south america and brazil is the one that fits the best.", "Loras is a brazilian company.", "It is founded there"], "image": "train2014/COCO_train2014_000000299871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243509, "question_id": "WzroENQhakQvVoJFGNbSfC", "question": "What object does the black helmet the batter is wearing protect from?", "choices": ["fists", "football", "baseball", "stones"], "correct_choice_idx": 2, "direct_answers": ["ball", "ball", "ball", "baseball", "impact", "baseball", "ball", "helmet", "baseball", "head"], "difficult_direct_answer": false, "rationales": ["They are playing baseball.", "Sometimes balls can hit people.", "This is in a game with these items"], "image": "train2014/COCO_train2014_000000243509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323442, "question_id": "Wzxtk4t35gv8Ez6fX3T9Gm", "question": "The people are enjoying an outdoor meal during which season?", "choices": ["spring", "fall", "summer", "winter"], "correct_choice_idx": 1, "direct_answers": ["summer", "summer", "fall", "fall", "fall", "summer", "summer", "fall", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["The people are sitting at the table in the fall since the leaves are changing colors.", "The blooms, the sun and the scantiness of the woman's dress suggests hot weather.", "The leaves on the trees are changing."], "image": "val2014/COCO_val2014_000000323442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375382, "question_id": "X27dLBfKAbCHPVC7P2Htax", "question": "What does the dish drainer tell you about this kitchen?", "choices": ["dishwasher missing", "pots missing", "stove missing", "sink missing"], "correct_choice_idx": 0, "direct_answers": ["no dishwasher", "recently cleaned", "nothing", "it's clean", "no dishwasher", "no dishwasher", "heavy workload", "clean", "no dishwasher", "dishwasher missing"], "difficult_direct_answer": false, "rationales": ["The dish drainer is full which means there's no dishwasher.", "Dishes can be dried in a dish drainer after handwashing.", "There is no dishwasher in the kitchen."], "image": "train2014/COCO_train2014_000000375382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534941, "question_id": "X28R5HSBtjLxZDCu6pfRd8", "question": "From what kind of building was this picture taken?", "choices": ["silo", "barn", "house", "trailer"], "correct_choice_idx": 1, "direct_answers": ["stable", "barn", "barn", "barn", "barn", "stable", "stable", "barn", "stable", "barn"], "difficult_direct_answer": false, "rationales": ["There is a large wooden door and gates in front of it", "The picture is looking out onto a pen and some sheep. based on the setting and the style of doors visible, answer a is likely.", "There are animals just outside the structure, and the door is constructed to open in two halves. stone-walled buildings with two-half doors are a common feature of barns."], "image": "train2014/COCO_train2014_000000534941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74340, "question_id": "X2HonukL8ueZJVKgGzyxva", "question": "How are the kids skating on the ice?", "choices": ["ice skates", "skis", "snowboards", "rollerblades"], "correct_choice_idx": 1, "direct_answers": ["roller blades", "ski boards", "skis", "skis", "skiing", "on skis", "on skis", "skies", "skis", "on skis"], "difficult_direct_answer": false, "rationales": ["The kids are using skis.", "Each kid has a distinct object on each leg. the object is not a skate or rollerblade.", "The kids have little skis attached to their feet."], "image": "train2014/COCO_train2014_000000074340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241324, "question_id": "X2WzZxy8BeCEX85tMQrniQ", "question": "What happened to this brown doll?", "choices": ["being dumped", "being displayed", "being owned", "being donated"], "correct_choice_idx": 0, "direct_answers": ["thrown away", "trashed", "sleeping", "thrown out", "thrown away", "trashed", "being dumped", "left outside", "abandoned", "thrown away"], "difficult_direct_answer": false, "rationales": ["The bear appears on a pile of trash on the side of the road where one frequently will leave things they want to throw away or jump.", "The bear is dirty and resting on a trash bag.", "The doll was dumped."], "image": "train2014/COCO_train2014_000000241324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190041, "question_id": "X2iBDU7N5SwPfp6shAdroG", "question": "What type of lens was used to make the warped picture?", "choices": ["turnstile", "hollow", "fish eye", "holographic"], "correct_choice_idx": 2, "direct_answers": ["fisheye", "fisheye", "fisheye", "fish eye", "fisheye", "camera", "filter", "convex", "fisheye", "professional"], "difficult_direct_answer": false, "rationales": ["A scene with a skateboarder is out of focus and off center.", "A fish eye lens creates the warped effect.", "A special lens called a fisheye lens created this skewed effect."], "image": "val2014/COCO_val2014_000000190041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366255, "question_id": "X2iG5Cp6U5gb3cpuj7xwYt", "question": "The hat of this figure suggests it is meant to depict what profession?", "choices": ["dancer", "construction", "clergy", "comedian"], "correct_choice_idx": 1, "direct_answers": ["construction", "construction", "construction", "construction", "miner", "construction", "construction", "mining", "construction worker", "miner"], "difficult_direct_answer": false, "rationales": ["The hard hat is for construction.", "The figure is wearing a hard hat. members of the clergy, dancers, and comedians do not need to wear this kind of personal protective equipment.", "The figure is wearing a hard hat."], "image": "train2014/COCO_train2014_000000366255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289842, "question_id": "X2u6LqRSXrG4iPQ2QKjE9M", "question": "Where is that non-English language mostly spoken?", "choices": ["france", "china", "america", "india"], "correct_choice_idx": 1, "direct_answers": ["japan", "japanese", "chinese", "china", "china", "china", "china", "china", "china", "korea"], "difficult_direct_answer": false, "rationales": ["The characters of the non-english language appear to be chinese which is a language most spoken in china.", "Chinese is spoken in china.", "The other options don't match the writing to the right side of the blue banner in the background, and it's the only non-english related language present."], "image": "val2014/COCO_val2014_000000289842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6488, "question_id": "X2vHZjrGjvf8827KXSkQZm", "question": "A male of this type of animal is called what?", "choices": ["tom", "buffalo", "joey", "ram"], "correct_choice_idx": 3, "direct_answers": ["ram", "no image", "sheep", "ram", "ram", "ram", "ram", "rams", "ram", "ram"], "difficult_direct_answer": false, "rationales": ["A an uncastrated male sheep is called this.", "These are sheep", "It is a sheep with horns."], "image": "train2014/COCO_train2014_000000006488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418196, "question_id": "X37LZXkTx2iJbjTtM33due", "question": "What kitchen cooker is necessary for this treat's preparation?", "choices": ["stove top", "fryer", "open fire", "oven"], "correct_choice_idx": 3, "direct_answers": ["oven", "oven", "oven", "oven", "oven", "oven", "oven", "oven", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["Ovens are needed to bake cakes.", "The cooker is an oven.", "The items, cupcakes, must be baked in order to be prepared correctly and in order to bake something, one must use an oven."], "image": "train2014/COCO_train2014_000000418196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410105, "question_id": "X37x5hhqHUsE7uHCkuegBu", "question": "Why are the clothes on hangers?", "choices": ["for sale", "hiding", "airing out", "cleaning"], "correct_choice_idx": 2, "direct_answers": ["airing out", "drying", "keep clean", "display", "prevent wrinkles", "drying", "drying", "for closet", "shirts", "ironed"], "difficult_direct_answer": false, "rationales": ["The clothes are being dried.", "The fabric looks very delicate and seems like someone doesn't want to ruin it by putting it in the dryer.", "The clothes are drying."], "image": "val2014/COCO_val2014_000000410105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218456, "question_id": "X39r3fDPqRWQULoYbUEGRv", "question": "What vegetable are on the pizza?", "choices": ["broccoli", "squash", "asparagus", "arugula"], "correct_choice_idx": 3, "direct_answers": ["arugula", "herb", "spinach", "spinach", "basil", "spinach", "tomato", "spinach", "spinach", "spinach"], "difficult_direct_answer": false, "rationales": ["The green looks like lettuce.", "The green vegetable on the pizza is arugula", "It's the only vegetable on the pizza."], "image": "val2014/COCO_val2014_000000218456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157933, "question_id": "X3H3fsMBemXj5AY6cFTSYq", "question": "What products are being displayed?", "choices": ["remote controls", "calculators", "landline phones", "mobile phones"], "correct_choice_idx": 3, "direct_answers": ["phones", "mobile phones", "cellphones", "cellphones", "cellphones", "phones", "cellphones", "phones", "cellphones", "phones"], "difficult_direct_answer": false, "rationales": ["Mobile phones are shown.", "The products displayed are mobile phones.", "Cell phones are being displayed."], "image": "train2014/COCO_train2014_000000157933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59278, "question_id": "X3MvT8BFK7pXdiTPYtLEXa", "question": "How do the cry look that are closest to the camera?", "choices": ["blurry", "invisible", "mini", "clear"], "correct_choice_idx": 0, "direct_answers": ["lights", "blurry", "blurry", "in motion", "blurry", "blurry", "in motion", "blurry", "blurry", "blurry"], "difficult_direct_answer": false, "rationales": ["The motions are very blurry.", "The cars are moving quickly and they're blurred out.", "Most things in the picture are in motion"], "image": "train2014/COCO_train2014_000000059278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61566, "question_id": "X3Ru7pToS8LZntDqgWkxRa", "question": "Why is the man all wet?", "choices": ["from sweating", "from showering", "from surfing", "from rain"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "surfing", "surfing", "surfing", "waves", "he's surfing", "ocean", "surfing", "from surfing"], "difficult_direct_answer": false, "rationales": ["The man is in the ocean.", "The man is surfing.", "The man is standing on a board and is surrounded by waves."], "image": "val2014/COCO_val2014_000000061566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106229, "question_id": "X3gVLE5hbzuJRBrzW3PJEL", "question": "What breed of the dog present in the picture?", "choices": ["retriever", "bull dog", "poodles", "shiba inu"], "correct_choice_idx": 2, "direct_answers": ["shiatsu", "yorkie", "tibetan terrier", "shitzu", "terrier", "small dog", "yorkie", "poodles", "pekinese", "shitzu"], "difficult_direct_answer": false, "rationales": ["The breed is a poodle.", "The dog is a type of poodle since its fur is curly.", "The dog is a poodle."], "image": "train2014/COCO_train2014_000000106229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314154, "question_id": "X3k5NSXJo4ssUiUDVXARpc", "question": "Upon which vessel might this person go for a ride soon?", "choices": ["uber", "airplane", "space ship", "steamer"], "correct_choice_idx": 1, "direct_answers": ["airplane", "train", "boat", "plane", "airplane", "plane", "airplane", "plane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["She is dragging luggage. luggage is used to go on long trips through the air.", "Looks like she is in front of an airport with her suitcase.", "She is outside a large building that houses airplane terminals with luggage"], "image": "val2014/COCO_val2014_000000314154.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407434, "question_id": "X3zW7XfxAPGRUG4hXUnDDb", "question": "What color has two post-its?", "choices": ["orange", "green", "blue", "yellow"], "correct_choice_idx": 1, "direct_answers": ["green", "green yellow", "green", "green", "green", "green", "gray", "green", "green", "green yellow"], "difficult_direct_answer": false, "rationales": ["You can tell by the color and size of the paper as to what the color is for the post-its.", "The post-its are green pads.", "There are two green ones."], "image": "train2014/COCO_train2014_000000407434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558992, "question_id": "X44MpGDLv5FrGk2AYkszAo", "question": "What season of the year is it?", "choices": ["winter", "summer", "spring", "autumn"], "correct_choice_idx": 3, "direct_answers": ["fall", "spring", "fall", "fall", "fall", "autumn", "fall", "fall", "spring", "fall"], "difficult_direct_answer": false, "rationales": ["The ground is covered in fallen leaves.", "The trees are starting to turn like they do in the fall.", "The season is fall."], "image": "val2014/COCO_val2014_000000558992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518462, "question_id": "X48SFGzWgpUsR2e7EcNALw", "question": "Which photo mismatches the theme?", "choices": ["top right", "top left", "bottom left", "bottom right"], "correct_choice_idx": 2, "direct_answers": ["bike race", "top right", "three", "motorcycles", "bottom left", "bottom left", "bottom left", "bottom left", "second", "truck picture"], "difficult_direct_answer": false, "rationales": ["The bottom left photo shows boys rather than motorcycles.", "All of the photos are of motorcycles while the photo on the bottom left shows a man in the back of a pickup truck.", "It is from the camera man's angle but there are bikes in all"], "image": "train2014/COCO_train2014_000000518462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85298, "question_id": "X4CduDAn3Wc7dus5xS8SMU", "question": "Which person is holding a racket made by an older company?", "choices": ["back person", "no rackets", "they're equal", "front girl"], "correct_choice_idx": 3, "direct_answers": ["front", "front", "front girl", "front girl", "child behind", "second", "back", "kid behind", "front", "front"], "difficult_direct_answer": false, "rationales": ["The girl in the front has a penn racquet.", "The person is the front girl.", "The front girl is holding a wilson racket which is older."], "image": "val2014/COCO_val2014_000000085298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467840, "question_id": "X4Jw8KWWq6gcU8TqoM2BpE", "question": "These activists probably support which movement?", "choices": ["women's", "pro life", "lgbt", "environmentalist"], "correct_choice_idx": 2, "direct_answers": ["gay rights", "lgbt", "lgbtq", "gay", "mexican", "gay", "lgbtq", "lgbt", "civilisation", "lgbt"], "difficult_direct_answer": false, "rationales": ["The activists are wearing rainbow colors.", "The activists are lgbt.", "The same sex marriage movement often use the rainbow as a symbol to help identify."], "image": "train2014/COCO_train2014_000000467840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231768, "question_id": "X4R88MkpsXdRLGenkpq4pY", "question": "This road is belongs to which country?", "choices": ["germany", "australia", "us", "uk"], "correct_choice_idx": 1, "direct_answers": ["united kingdom", "england", "england", "uk", "united kingdom", "england", "england", "australia", "united states", "england"], "difficult_direct_answer": false, "rationales": ["The borough of wirral is in england.", "The road is australian.", "Wirral is in australia."], "image": "val2014/COCO_val2014_000000231768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221739, "question_id": "X4zTX2oZ7dfNYoxSk6LvhC", "question": "What will the lady at the blue table do next?", "choices": ["sing songs", "take nap", "leave", "arrange flowers"], "correct_choice_idx": 3, "direct_answers": ["arrange flowers", "speak", "play instrument", "preparing florals", "make presentation", "seat customers", "nothing", "sing", "eat", "arrange flowers"], "difficult_direct_answer": true, "rationales": ["The woman will arrange the flowers.", "She has a lot of greenery laying on the table", "A woman is standing at a table with cut flowers. cut flowers are used to make arrangements."], "image": "train2014/COCO_train2014_000000221739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180141, "question_id": "X5Lbmdk9e4fpA825nMKGc3", "question": "What are the two boys in front doing?", "choices": ["singing", "spelling", "debating", "announcing"], "correct_choice_idx": 0, "direct_answers": ["singing", "singing", "singing", "karaoke", "karaoke", "karaoke", "singing", "singing", "singing", "karaoke"], "difficult_direct_answer": false, "rationales": ["They are holding microphones in front of their mouths. microphones are used to enhance sound.", "They are looking at a screen and holding a microphone and likely signing.", "The boys in the front are singing together."], "image": "train2014/COCO_train2014_000000180141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152677, "question_id": "X5WVdmGaRMjy9khkpSh7XL", "question": "What is the poster on the back wall about?", "choices": ["car drivers", "presidents", "roman emperors", "action figures"], "correct_choice_idx": 1, "direct_answers": ["world map", "presidents", "us presidents", "presidents", "us presidents", "us presidents", "presidents", "map", "us presidents", "presidents"], "difficult_direct_answer": false, "rationales": ["The list says presidents.", "A bedroom has posters with peoples portraits on it.", "There are us presidents on the poster."], "image": "train2014/COCO_train2014_000000152677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277746, "question_id": "X5zi6vTvmTAjvwVLhSVFPi", "question": "What is the woman wearing?", "choices": ["dress", "dress", "jeans", "pyjamas"], "correct_choice_idx": 0, "direct_answers": ["dress", "dress", "nametag", "dress", "nametag", "gown", "dress", "dress", "dress", "dress"], "difficult_direct_answer": false, "rationales": ["She's wearing a top that's almost knee length.", "The woman is wearing a dress.", "The item has no pantless"], "image": "train2014/COCO_train2014_000000277746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63918, "question_id": "X6esxqdD97NvAhh7fWvGMy", "question": "What sports equipment is the man holding?", "choices": ["golf", "hockey", "tennis", "cricket"], "correct_choice_idx": 2, "direct_answers": ["tennis racket", "tennis", "racket", "tennis", "tennis racket", "tennis racket", "tennis racket", "tennis racket", "tennis", "racket"], "difficult_direct_answer": false, "rationales": ["A man stands on front of a green chalkboard hugging a tennis racket to his chest.", "A man is holding a stringed racket.", "The man has a tennis racquet."], "image": "train2014/COCO_train2014_000000063918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43657, "question_id": "X6kQvgGLqsN4D8PXP5LaPh", "question": "Where do those who await their turn at bat wait?", "choices": ["mall", "home base", "behind fence", "outfield"], "correct_choice_idx": 2, "direct_answers": ["dugout", "dugout", "dugout", "dugout", "behind fence", "under shade", "dugout", "dugout", "behind fence", "bench"], "difficult_direct_answer": false, "rationales": ["Other players are sitting at bench. there's a protective fence to keep from getting hit.", "The people waiting for a turn at the bat are waiting behind the fence and inside of the dugout.", "They need to stay behind the fence to avoid getting hit by a flying ball or bat."], "image": "val2014/COCO_val2014_000000043657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520047, "question_id": "X6v5o8a4Zdz3eRkCSQxiHs", "question": "What business pays the men in orange here?", "choices": ["groceries", "prisons", "boats", "train"], "correct_choice_idx": 3, "direct_answers": ["railway", "train", "train", "railroad authority", "construction", "railroad", "government", "railway", "freightliner", "railroad"], "difficult_direct_answer": false, "rationales": ["There are land, not water, vehicles that travel on tracks. the men in orange are associated with these vehicles and their tracks.", "They appear to work for the train company.", "They are next to rails and one of the rails has a train and freight boxes behind it."], "image": "val2014/COCO_val2014_000000520047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269263, "question_id": "X79CXzAVA2RjDA9wdgf8iQ", "question": "The ground that the man is standing on is made of what material?", "choices": ["cement", "wood", "ceramic", "soil"], "correct_choice_idx": 0, "direct_answers": ["cement", "wood", "dirt", "wood", "concrete", "concrete", "wood", "cement", "concrete", "concrete/wood"], "difficult_direct_answer": false, "rationales": ["The surface is smooth, dull and grey.", "It is a hard grey surface.", "It is solid and grey"], "image": "train2014/COCO_train2014_000000269263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391252, "question_id": "X7BDmnVAEK4JbgKa32SkPQ", "question": "What is this child likely to do next?", "choices": ["eat", "complain", "scream", "nap"], "correct_choice_idx": 3, "direct_answers": ["sleep", "sleep", "sleep", "sleep", "nap", "sleep", "nap", "roll", "nap", "sleep"], "difficult_direct_answer": false, "rationales": ["Take a nap", "It is likely naptime for the little girl.", "A child is laying down and holding a stuffed animal."], "image": "train2014/COCO_train2014_000000391252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449158, "question_id": "X7Dp66vDyNXSf6PRbaHTzB", "question": "Why is she holding up the item?", "choices": ["listening", "throwing", "afraid", "camera shy"], "correct_choice_idx": 3, "direct_answers": ["hiding face", "block face", "to hide", "cover face", "cover face", "block photographer", "camera shy", "privacy", "shade", "sun protection"], "difficult_direct_answer": true, "rationales": ["She doesn't want her face shown whereever the person plans on uploading the picture.", "Frisbees aren't used for covering a person's face. a photo was being taken and she probably doesn't like having her picture taken.", "The person is hiding her face in the direction of the photographer."], "image": "train2014/COCO_train2014_000000449158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168573, "question_id": "X7UXVCZBsZzEis5mjq4S8e", "question": "If the boy keeps playing this sport whose record can he possibly break?", "choices": ["wayne gretzky", "michael jordan", "rickey henderson", "tiger woods"], "correct_choice_idx": 2, "direct_answers": ["cal ripen", "barry bonds", "rickey henderson", "ripen", "rbi", "home run", "derek jetter", "babe ruth", "aaron", "babe ruth"], "difficult_direct_answer": true, "rationales": ["The record for runs scored is 2,295.", "That is he guy who broke that record.", "Rickey henderson is a baseball player."], "image": "train2014/COCO_train2014_000000168573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153300, "question_id": "X7phZNnUY87jrCMikZLycE", "question": "Which piece of jewelry here would be easier to cut?", "choices": ["earring", "ring", "bracelet", "necklace"], "correct_choice_idx": 2, "direct_answers": ["bracelet", "bracelet", "bracelet", "bracelet", "food", "knife", "diamond", "knife", "bracelet", "bracelet"], "difficult_direct_answer": false, "rationales": ["The bracelet is the thinnest piece of jewelry.", "The bracelet is really thin.", "You can break the bracelet off."], "image": "val2014/COCO_val2014_000000153300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305227, "question_id": "X7uW8VbQEx3xozehkkhKXy", "question": "What's the boy using to float on the water?", "choices": ["boat", "raft", "plywood", "surfboard"], "correct_choice_idx": 1, "direct_answers": ["inflatable raft", "rubber boat", "raft", "raft", "rubber raft", "intertube", "inflatable boat", "raft", "boat", "raft"], "difficult_direct_answer": false, "rationales": ["The boy uses the raft.", "The kids are in an inflatable raft.", "The boy is in the lake in an inflatable boat."], "image": "train2014/COCO_train2014_000000305227.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350640, "question_id": "X7zGQJcB4higpU9gnhpVXt", "question": "What year was this meme originally founded?", "choices": ["2009", "2020", "1939", "1987"], "correct_choice_idx": 2, "direct_answers": ["1939", "2020", "2005", "1939", "2008", "1949", "no idea", "unknown", "2010", "1939"], "difficult_direct_answer": false, "rationales": ["The line was from that year.", "The meme was created in 2009.", "This saying was popularized in england in world war 2 around the time of answer a."], "image": "train2014/COCO_train2014_000000350640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463836, "question_id": "X82veRUG27zdgbAdGyovA3", "question": "What's the Lord doing?", "choices": ["peeing", "playing", "eating", "reading"], "correct_choice_idx": 0, "direct_answers": ["urinating", "peeing", "potty", "nothing", "urinating", "urinating", "urinating", "peeing", "urinating", "pee"], "difficult_direct_answer": false, "rationales": ["The lord is peeing.", "The lord is peeing in a urinal dug in the stone.", "The man is looking down at his crotch and looking into a hole."], "image": "val2014/COCO_val2014_000000463836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226076, "question_id": "X86dhAvc4SaX45fa7KzBZU", "question": "What do the items in the soup look like?", "choices": ["apple pie", "snails", "doritos", "squid"], "correct_choice_idx": 2, "direct_answers": ["bacon", "bacon", "tortilla chip", "doritos", "chips", "chips", "bacon", "fried onions", "chips", "chips"], "difficult_direct_answer": false, "rationales": ["The items in the soup look like tortilla chips. they are orange.", "They are orange and triangular in shape.", "The items are crushed orange chips."], "image": "val2014/COCO_val2014_000000226076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4820, "question_id": "X8AWiKt6uazhj4p7H8oG2p", "question": "What artist is famous for the type of artwork that is depicted on the side table?", "choices": ["rothko", "da vinci", "mondrian", "van gogh"], "correct_choice_idx": 2, "direct_answers": ["matisses", "klimt", "andy warhol", "mondrian", "no idea", "andy warhol", "vincent gogh", "andy warhol", "thank you", "warhol"], "difficult_direct_answer": false, "rationales": ["Mondrian is known for the artwork.", "The artist is known for his style and this kind of artwork.", "Mondrian usually creates art with colored blocks."], "image": "train2014/COCO_train2014_000000004820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325992, "question_id": "X8Bp4YKyfBa3wbswmextp9", "question": "Who manufactured this laptop?", "choices": ["apple", "compaq", "hp", "dell"], "correct_choice_idx": 0, "direct_answers": ["apple", "apple", "macbook", "mac", "can't see", "company", "macbook", "toshiba", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["Apple's logo is shown.", "This company manufactures the macbook, which is the product name displayed on the laptop.", "It's a macbook, which makes a the obvious answer."], "image": "val2014/COCO_val2014_000000325992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226983, "question_id": "X8FK4VtidaqweuVijoUH8S", "question": "Who is the man in blue behind the batter?", "choices": ["next batter", "coach", "umpire", "referee"], "correct_choice_idx": 2, "direct_answers": ["umpire", "umpire", "umpire", "catcher", "umpire", "catcher", "umpire", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["The umpire must be close behind the ball in order to correctly call the pitches.", "An umpire is seen at baseball games.", "He stands at home plate to decide if a ball is foul."], "image": "train2014/COCO_train2014_000000226983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195809, "question_id": "X8XbNCBWhdz8QPiSSTV54g", "question": "What is the person with the laptop taking?", "choices": ["dictation", "photograph", "autographs", "memo"], "correct_choice_idx": 1, "direct_answers": ["picture", "picture", "pictures", "picture", "picture", "notes", "picture", "picture", "photograph", "picture"], "difficult_direct_answer": false, "rationales": ["The man is adjusting a webcam and orientating it towards the person sitting across from him. a web cam would be used to capture answer a and the man would likely need to re-position it based on the person across from him.", "The person is taking a photo.", "He is taking a picture of the person in front of him."], "image": "train2014/COCO_train2014_000000195809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575900, "question_id": "X8XeGNCx2hhoTiZaBKG8kV", "question": "What is the building behind the red rail car used for?", "choices": ["animal shelter", "homeless shelter", "bible school", "church"], "correct_choice_idx": 1, "direct_answers": ["hospital", "homeless shelter", "hospital", "church", "rescue department", "rescue", "homeless shelter", "rescue center", "homeless shelter", "medical services"], "difficult_direct_answer": false, "rationales": ["The caboose is next to a church.", "It's a rescue shelter for people that have no place to go to sleep.", "The building is for homeless people."], "image": "train2014/COCO_train2014_000000575900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515682, "question_id": "X8e6h5WE9mXKBdUtENtKma", "question": "Where can a kitchenette be found?", "choices": ["truck", "jeep", "car", "trailer"], "correct_choice_idx": 3, "direct_answers": ["trailer", "in camper", "trailer", "trailer", "camper", "rv", "trailer", "trailer", "rv", "trailer"], "difficult_direct_answer": false, "rationales": ["You can find one in a truck.", "The trailer is like a small home.", "The visible trailer is a camper that is used as a small living space. living spaces is where kitchenettes are found and there is no other visible living space in the image."], "image": "val2014/COCO_val2014_000000515682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399744, "question_id": "X8koQr4PSMWukDtBbMZ9dF", "question": "Which fruit is rich in vitamin K?", "choices": ["grape", "pear", "apple", "tomato"], "correct_choice_idx": 3, "direct_answers": ["tomato", "tomato", "oranges", "apples", "apple", "kiwi", "prunes", "apple", "apples", "kiwi"], "difficult_direct_answer": false, "rationales": ["The fruit is the tomato.", "Tomatoes are rich in the vitamin k.", "The tomato is rich in vitamin k."], "image": "val2014/COCO_val2014_000000399744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309261, "question_id": "X98uejuRXE2Cfi5UBqivYF", "question": "What role does the man play on the team?", "choices": ["pitcher", "batter", "catcher", "thrower"], "correct_choice_idx": 0, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["Based on the mans placement on the baseball field and the action he is currently engaging in, answer a is consistent.", "The man is pitching the ball.", "The man is winding up his arm to throw the ball."], "image": "train2014/COCO_train2014_000000309261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535106, "question_id": "X9HFQQ9tf8QfM6ZtJjqgi7", "question": "The outfit the child is wearing was famously featured in ads for what company?", "choices": ["oshkosh", "carhartt", "zappos", "timberland"], "correct_choice_idx": 0, "direct_answers": ["oshkosh", "child", "dunsten", "levis", "osh kosh", "oshkosh", "osh kosh", "oshkosh b'gosh", "oshkosh", "denim company"], "difficult_direct_answer": false, "rationales": ["These are overalls.", "You can see the name on the button of the overalls.", "A child is on the floor in jean overalls. osh kosh is a company famous for making jean overalls."], "image": "val2014/COCO_val2014_000000535106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336406, "question_id": "X9Hv73qyQ7uDpN2LDPeRZZ", "question": "What would a vegetarian order from this restaurant?", "choices": ["monte cristo", "hamburger", "reuben", "tofu dog"], "correct_choice_idx": 3, "direct_answers": ["tofu dog", "tofu dog", "tofu dog", "salad", "tofu dog", "salad", "tofu dog", "tofu dog", "tofu dog", "tofu"], "difficult_direct_answer": false, "rationales": ["The vegetarian would need to get tofu.", "The tofu dog is available.", "Tofu is a vegan protein."], "image": "train2014/COCO_train2014_000000336406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120412, "question_id": "X9RicJqrGFm8xuAmzgLegh", "question": "What type of traffic does this road have?", "choices": ["heavy", "tractor", "herded animal", "light"], "correct_choice_idx": 0, "direct_answers": ["automobile", "medium traffic", "vehicle", "heavy", "lights", "stand still", "broken stoplight", "congested", "moderate", "one way"], "difficult_direct_answer": true, "rationales": ["The traffic is heavy.", "There is a lot of traffic on the road.", "There are many cars on the road."], "image": "val2014/COCO_val2014_000000120412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377949, "question_id": "X9mmJtwGJBgdNV5evSa7M7", "question": "What type of hat is the woman wearing?", "choices": ["yamike", "baseball cap", "bowler", "beret"], "correct_choice_idx": 3, "direct_answers": ["knit", "beret", "beret", "skull cap", "beret", "beanie", "blue hat", "beret", "beret", "beanie"], "difficult_direct_answer": false, "rationales": ["The beret is being worn.", "It is a somewhat flat and circular hat", "The hat mostly resembles that."], "image": "val2014/COCO_val2014_000000377949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524905, "question_id": "XA5rT5WJqwkMrd3wfVFT44", "question": "What kind of structure is the animal all the way to the right looking at?", "choices": ["diamond", "wooden", "brick", "mud"], "correct_choice_idx": 1, "direct_answers": ["wooden", "feeder", "feeder", "gate", "bridge", "stable", "building", "shelter barn", "hut", "wooden"], "difficult_direct_answer": false, "rationales": ["The structure is made of wood logs.", "The poles on the structure are wood.", "The structure is wooden."], "image": "train2014/COCO_train2014_000000524905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128599, "question_id": "XACEVZ8mfdPdBdTfTwfjvn", "question": "What is clipped to the green pants?", "choices": ["carabiner", "paper clip", "bobby pin", "bread clip"], "correct_choice_idx": 0, "direct_answers": ["carabiner keychain", "keys", "carabiner", "clip", "carabiner", "carabiner", "carbine", "carabiner", "carabiner", "carabiner keychain"], "difficult_direct_answer": false, "rationales": ["A carabiner is clipped.", "On the man's green pants we see a metal items which is shackled to one of its loops. this is quite a fad in the country today and called a carabiner.", "The object is round and elongated, and had a moveable latch section. carabiners are not a strange thing to find attached to a belt loop, they can be used to hold keys or other common carry items."], "image": "train2014/COCO_train2014_000000128599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251580, "question_id": "XADxJpttfCt7EwZMbWAzji", "question": "What is the metal cart being used to store?", "choices": ["gum balls", "golf balls", "tennis balls", "baseballs"], "correct_choice_idx": 2, "direct_answers": ["tennis balls", "tennis balls", "tennis balls", "tennis balls", "balls", "balls", "tennis balls", "balls", "ball cart", "tennis balls"], "difficult_direct_answer": false, "rationales": ["The metal cart is stacked with balls for tennis.", "The metal cart has tennis balls.", "The objects within the cart are identifiable based on their size, shape and color and the answer a is consistent with the setting and objects within."], "image": "val2014/COCO_val2014_000000251580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366109, "question_id": "XAXPqXJ2738nApwkmoAEhi", "question": "What problem is being solved here?", "choices": ["none", "long nails", "dull scissors", "broken knife"], "correct_choice_idx": 2, "direct_answers": ["scissor sharpening", "dull scissors", "dull scissors", "dull scissors", "dull scissors", "sharper", "dull scissors", "dull scissors", "sharpen scissors", "sharpening"], "difficult_direct_answer": false, "rationales": ["A sharpener is a handy man to know when any sort of blades are too worn down to work properly. grinding, steeling and stropping are some of the ways professionals keep our tools sharp.", "The man is using a grinder to sharpen the blades.", "A man is running a pair of scissors along a sharpening blade."], "image": "train2014/COCO_train2014_000000366109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448656, "question_id": "XAgD7Fg9563GVtcGriW7T6", "question": "What was the original name of this bank?", "choices": ["shawmut", "fleet", "people", "sovereign"], "correct_choice_idx": 3, "direct_answers": ["sovereign", "santander", "sovereign bank", "sovereign bank", "sovereign", "sovereign", "sovereign bank", "sovereign bank", "sovereign bank", "sovereign bank"], "difficult_direct_answer": false, "rationales": ["Sovereign bank became santander.", "Originally it was sovereign.", "Satander merged from soverign."], "image": "train2014/COCO_train2014_000000448656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191293, "question_id": "XAkUwNrwkc5ZckLsXbas3Y", "question": "Where is this room located?", "choices": ["office", "school", "home", "store"], "correct_choice_idx": 2, "direct_answers": ["home", "living room", "house", "livingroom", "house", "beside kitchen", "living room", "motel", "ground floor", "house"], "difficult_direct_answer": false, "rationales": ["This is a living room with couches and a coffee table", "The furniture suggests the room is used for leisure.", "This is a living room within a residential home."], "image": "train2014/COCO_train2014_000000191293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388893, "question_id": "XBBRr758czLUXXHSpS4cjd", "question": "What is the man wearing a helmet?", "choices": ["style", "warmth", "safety", "laws"], "correct_choice_idx": 2, "direct_answers": ["protection", "safety", "safety", "racing", "safety", "safety", "safety", "safety", "for protection", "skier"], "difficult_direct_answer": false, "rationales": ["This man appears to be a competitive skier, so he will be traveling very fast. if he has an issue and takes a tumble, his head will be protected very well by his helmet.", "He is on skis which can go really fast, and when you go really fast you have to protect yourself from getting hurt.", "The man needs safety."], "image": "train2014/COCO_train2014_000000388893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162757, "question_id": "XBFiRpGzzHjrjnWHTJmKNp", "question": "The young people here are what type players?", "choices": ["beginner", "golf", "retirees", "pros"], "correct_choice_idx": 0, "direct_answers": ["beginner", "tennis players", "tennis", "beginners", "tennis", "tennis", "tennis", "tennis", "beginners", "tennis"], "difficult_direct_answer": false, "rationales": ["They are learning how to play the game.", "The kids look like they may be taking starting lessons.", "You can tell they are just learning as they are standing around watching."], "image": "train2014/COCO_train2014_000000162757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511363, "question_id": "XBNPEtWLNEAsusryPtBrSA", "question": "What is necessary for the toy to be played with properly?", "choices": ["manual", "wind", "directions", "marshall"], "correct_choice_idx": 1, "direct_answers": ["wind", "wind", "wind", "safety", "safety", "wind", "wind", "wind", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["Wind makes the kite fly.", "A kite can be a fun toy to play with but needs the wind to get it into the air.", "This is a kite"], "image": "val2014/COCO_val2014_000000511363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572958, "question_id": "XCSJwNSdcthm8bQF73tshz", "question": "What are people filming and taking pictures of?", "choices": ["accident", "tricks", "skateboards", "model"], "correct_choice_idx": 1, "direct_answers": ["skateboarding", "skateboarders", "skateboarder", "skater tricks", "normal", "skateboarder", "skateboarding", "tricks", "skating", "skateboarders"], "difficult_direct_answer": false, "rationales": ["A person is doing a skateboard trick on the side of a cement surface. several people are holding cameras near a person doing a skateboarding trick.", "The people are filing and photographing the skateboarder.", "The skateboarder looks to be professional and it is a skateboarding event where pictures are taken."], "image": "train2014/COCO_train2014_000000572958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391297, "question_id": "XCjB2kKMdhzemNWBNC5XyP", "question": "What kind of sauce is in the jar?", "choices": ["mayonnaise", "horseradish", "soy", "hot"], "correct_choice_idx": 3, "direct_answers": ["hot sauce", "hot sauce", "hot sauce", "hot sauce", "hot sauce", "xtra hot", "hot sauce", "xtra hot", "hot", "xtra hot"], "difficult_direct_answer": false, "rationales": ["The jar has hot sauce for the food.", "This is extra red hot sauce.", "The label says the sauce is extra hot."], "image": "train2014/COCO_train2014_000000391297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34120, "question_id": "XDEX2d5BKRZzeYxNpyQVKK", "question": "What is the symbol on the red shirt symbolize?", "choices": ["peace", "thor", "loki", "ussr"], "correct_choice_idx": 3, "direct_answers": ["communism", "ussr", "hammer", "ussr", "hammer sickle", "peace", "communism", "union", "ussr", "communist russia"], "difficult_direct_answer": false, "rationales": ["The symbol consists of a hammer and sickle. this symbol is associated with communism.", "This is the logo on their flag of the hammer and sickle.", "The symbol is of the union of soviet socialist republics."], "image": "train2014/COCO_train2014_000000034120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199959, "question_id": "XDFWn8cSzZRyuUkYGyJoLP", "question": "Why are the people standing together on the platform most likely in the area?", "choices": ["work", "school", "relocation", "vacation"], "correct_choice_idx": 3, "direct_answers": ["waiting", "awaiting train", "take train", "board train", "waiting", "tourist", "waiting", "board train", "boarding", "vacation"], "difficult_direct_answer": false, "rationales": ["The people are dressed like tourists and wearing backpacks.", "They have backpacks like students carry", "The people have multiple bags of luggage to carry belongings when traveling."], "image": "val2014/COCO_val2014_000000199959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18270, "question_id": "XDLALcsMiGYcermbKKitZv", "question": "What are the two boys doing?", "choices": ["queueing", "training", "playing", "being punished"], "correct_choice_idx": 2, "direct_answers": ["playing football", "watching", "watching", "watching", "watching soldier", "watching", "playing soccer", "playing soccer", "playing", "watching"], "difficult_direct_answer": false, "rationales": ["The two boys seem to be watching a soldier teach them to fight.", "The boys are playing a game of soccer.", "The boys play."], "image": "train2014/COCO_train2014_000000018270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193951, "question_id": "XDWUhjXVkyeKcvGst9panW", "question": "The first digit of the number that is clipped to the ear is included in what number?", "choices": ["444", "280", "515", "305"], "correct_choice_idx": 3, "direct_answers": ["three", "322", "three", "322", "305", "three", "identification number", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["This is the only number with the number 3 in it.", "The first digit is 305.", "The number 3 is the first digit in the ear tag."], "image": "train2014/COCO_train2014_000000193951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192655, "question_id": "XDecLeGxQ6wN437fYWxqMa", "question": "What is this person preparing for?", "choices": ["work", "dinner", "party", "trip"], "correct_choice_idx": 3, "direct_answers": ["traveling", "war", "travel", "travel", "traveling", "trip", "trip", "trip", "travel", "trip"], "difficult_direct_answer": false, "rationales": ["The person goes on a trip.", "The person is packing a suitcase.", "A well-packed suitcase is essential to a successful trip. it is always best to pack your suitcase well in advance so you don't forget anything."], "image": "train2014/COCO_train2014_000000192655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116696, "question_id": "XDnAYFB7pEehtu6FpohZSd", "question": "What covering is featured in the bowed item?", "choices": ["cloak", "dryer sheets", "cake icing", "diapers"], "correct_choice_idx": 3, "direct_answers": ["baby blanket", "wrapping paper", "teddy bear", "diapers", "diapers", "teddy bear", "ribbon", "diaper", "diapers", "diapers"], "difficult_direct_answer": false, "rationales": ["At baby showers, people make cakes from diapers.", "The diapers are wrapped up in the bow.", "This is a baby shower cake"], "image": "val2014/COCO_val2014_000000116696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20983, "question_id": "XDpRY9YVXbc778tqT4dohH", "question": "Where do these people stand?", "choices": ["photo backdrop", "classroom", "bathroom", "jail"], "correct_choice_idx": 0, "direct_answers": ["red carpet", "photo backdrop", "red carpet", "movie release", "near poster", "picture screen", "ad event", "photo", "china", "ad event"], "difficult_direct_answer": false, "rationales": ["By the setting and the posing they are doing, it is easy to tell where and what they are doing.", "The people are standing in front of a tarp that has logos on it.", "There is a repeated pattern on a backdrop behind the men that is commonly used in association with answer a."], "image": "train2014/COCO_train2014_000000020983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400916, "question_id": "XEQ6rwjKHCQF5f8y9WymXz", "question": "What is the device on the floor under the desk near the wall?", "choices": ["speaker", "fax machine", "computer tower", "printer"], "correct_choice_idx": 3, "direct_answers": ["printer", "printer", "printer", "printer", "printer", "printer", "router", "cpu", "computer tower", "system"], "difficult_direct_answer": false, "rationales": ["There is a computer printer under the desk, near the wall.", "The device is a printer.", "The device is a printer."], "image": "val2014/COCO_val2014_000000400916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545165, "question_id": "XEYra9DaN2VtsXAudoWJkg", "question": "In which country is the city mentioned here located?", "choices": ["austria", "france", "germany", "us"], "correct_choice_idx": 2, "direct_answers": ["berlin", "germany", "germany", "berlin", "germany", "germany", "berlin", "germany", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["Berlin is in germany.", "The city name \"berlin\" is visibly written in the background and berlin is a city known to be in germany.", "It is a city in germany."], "image": "train2014/COCO_train2014_000000545165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328791, "question_id": "XEo7VGc7yAksnBUUYzQGz8", "question": "What kind of food place did the men most likely buy food from?", "choices": ["street cart", "fast food", "restaurant", "take out"], "correct_choice_idx": 0, "direct_answers": ["street side", "cart", "hotdog cart", "stand", "food cart", "food cart", "street cart", "street vendor", "street cart", "stand"], "difficult_direct_answer": false, "rationales": ["They are standing in front of one", "This type of food vendor is visible in the photo.", "The men eating food are standing near a street cart. if eating on the street a person would likely be eating near where they had purchased their food especially if from a street cart which caters to quick and convenient food eating."], "image": "val2014/COCO_val2014_000000328791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3793, "question_id": "XFo2UBF6iYXe4LDTBRZrtH", "question": "Where does this bus stop here?", "choices": ["school", "crosswalk", "outlet mall", "church"], "correct_choice_idx": 2, "direct_answers": ["barton villager", "in street", "bus depot", "factory", "outlet mall", "pickup stop", "shoulder", "bus stop", "village", "bus station"], "difficult_direct_answer": true, "rationales": ["The bus is by an outlet.", "The bus stops next to a building. the building has a large sign that says it contains factory stores. factory stores together in one location are an outlet mall.", "The bus is going to the mall."], "image": "val2014/COCO_val2014_000000003793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159768, "question_id": "XFoLtqcYMXoXjJUmrF89XJ", "question": "What part of the meal is being eaten?", "choices": ["soup", "entree", "salad", "dessert"], "correct_choice_idx": 1, "direct_answers": ["salad", "entree", "main course", "entree", "entree", "main", "entree", "main course", "entree", "main course"], "difficult_direct_answer": false, "rationales": ["There is meat and cooked items on the plate.", "The plates seems to feature a mean and veggies like found in a lot of entries.", "The other options don't fit the image."], "image": "train2014/COCO_train2014_000000159768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346865, "question_id": "XFuexuMeRxb7ijZQ6RCbsb", "question": "The two cats by the window are looking down at which outdoor part of the residence?", "choices": ["deck", "back yard", "driveway", "garage"], "correct_choice_idx": 2, "direct_answers": ["parking lot", "entrance", "lawn", "grass", "high", "ground", "yard", "driveway", "ramp", "front walkway"], "difficult_direct_answer": true, "rationales": ["Cats are sitting at a window with a paved area below.", "The two cats are looking to the front of the house.", "The two cats are checking out the grassy area."], "image": "val2014/COCO_val2014_000000346865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304173, "question_id": "XFzYgEMFwYjUBC5gwNcqis", "question": "What kind of trick is done with the thing the man in red is standing on?", "choices": ["kickflip", "prank", "magic", "yoyo"], "correct_choice_idx": 0, "direct_answers": ["jump", "grinding", "grind", "ollie", "skating", "hardflip", "kickflip", "skateboarding", "flips", "jumps"], "difficult_direct_answer": true, "rationales": ["The trick is a kickflip.", "The man could do a kickflip with the board.", "The man is currently on a skateboard. answer a is a trick that is frequently done on a skateboard."], "image": "train2014/COCO_train2014_000000304173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33835, "question_id": "XGVfkkSx34K4CM55oEfgVv", "question": "The person in the green shirt is most likely to be what?", "choices": ["son", "grandmother", "daughter", "grandfather"], "correct_choice_idx": 0, "direct_answers": ["technician", "watcher", "friend", "son", "next player", "spectator", "talking", "in charge", "boss", "unknown"], "difficult_direct_answer": true, "rationales": ["He is a young man.", "The man look a bit younger than the people at the front.", "The person is the son."], "image": "val2014/COCO_val2014_000000033835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393867, "question_id": "XGn75hrN8fJongWxfsDBAZ", "question": "What time is conducive to take the meal above?", "choices": ["morning", "supper", "none", "lunch"], "correct_choice_idx": 0, "direct_answers": ["midday", "midday", "morning", "noon", "brunch", "lunch", "not long", "15 minutes", "morning", "lunchtime"], "difficult_direct_answer": false, "rationales": ["There are scones and sandwiches which are typically eaten at lunch time.", "The time is the morning.", "The items are club sandwiches and pastries. these normally would not be eaten at lunch or supper time."], "image": "train2014/COCO_train2014_000000393867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87126, "question_id": "XGuBkT6nJSCJVrB3iCKRth", "question": "What does the door to the left of the window lead to?", "choices": ["bathroom", "refrigerator", "pantry", "living room"], "correct_choice_idx": 2, "direct_answers": ["pantry", "outside", "pantry", "outside", "outside", "pantry", "pantry", "outside", "pantry", "outside"], "difficult_direct_answer": false, "rationales": ["There is a door similar to a closet door in a kitchen. pantries are often in or close to the kitchen in a home.", "The door is inside a kitchen and leads to a food storage area, not a living room or bathroom. the refrigerator is to the left of the door.", "The door is to the pantry."], "image": "train2014/COCO_train2014_000000087126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282066, "question_id": "XH4VkfXir2dnn2DLQpzcW2", "question": "Where are kites originally from?", "choices": ["mexico", "china", "korea", "taiwan"], "correct_choice_idx": 1, "direct_answers": ["china", "factories", "china", "china", "china", "china", "china", "china", "china", "factories"], "difficult_direct_answer": false, "rationales": ["The kites are from china.", "Kites are known to be invented in china.", "Kites come from china."], "image": "val2014/COCO_val2014_000000282066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168903, "question_id": "XH4eL5BsWKkBtivXb62Lh5", "question": "What do the symbols on the top yellow sign look like?", "choices": ["numbers", "hieroglyphics", "roman numerals", "hanzi"], "correct_choice_idx": 3, "direct_answers": ["elephant hero", "chinese", "chinese", "hanzi", "dragons", "elephant", "animals", "chines' decoration", "animals", "elephant shrimp"], "difficult_direct_answer": false, "rationales": ["The symbols are hanzi.", "They do look like chinese characters.", "They look like chinese writing."], "image": "train2014/COCO_train2014_000000168903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274815, "question_id": "XH6orqN5uZg55Fn3jWmQ5h", "question": "The painting on the white wall is inspired by what continent?", "choices": ["europe", "asia", "africa", "north america"], "correct_choice_idx": 1, "direct_answers": ["africa", "asia", "asia", "china", "asia", "soba", "asia", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["These are cultures from countries on that continent", "The painting is from china.", "The painting looks asian."], "image": "train2014/COCO_train2014_000000274815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139789, "question_id": "XHPC5EPQkvPtT899aKL3ML", "question": "What item does the person here likely make?", "choices": ["brass rings", "tires", "necklaces", "rings"], "correct_choice_idx": 2, "direct_answers": ["jewelry", "jewelry", "necklaces", "necklaces", "necklaces", "jewelry", "necklaces", "necklaces", "jewelry", "beaded jewelry"], "difficult_direct_answer": false, "rationales": ["A person is walking with a large amount of necklaces on a case on his head.", "The person is selling necklaces.", "The neckless are seen in the cabin being carried."], "image": "train2014/COCO_train2014_000000139789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93725, "question_id": "XHf7iGpPk65Lw4Q97he4FG", "question": "What handedness does this batter possess?", "choices": ["both", "none", "right", "left"], "correct_choice_idx": 2, "direct_answers": ["swing", "right", "right", "right", "swing", "right", "right", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["The person uses their right side.", "The person is swinging the bat to the right so they are right handed.", "The batter is using his right hand."], "image": "val2014/COCO_val2014_000000093725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91349, "question_id": "XJsUyfUJpeohQGpNbmcz4M", "question": "What is happening in the middle of the baseball diamond?", "choices": ["perfect game", "award ceremony", "memorial", "pitching change"], "correct_choice_idx": 1, "direct_answers": ["recording", "award ceremony", "announcement", "ceremony", "ceremony", "speech", "award ceremony", "speech", "ceremony", "speech"], "difficult_direct_answer": false, "rationales": ["Microphones, cameras, and a baseball player waving to the audience are on a baseball diamond.", "A player is waving and walking towards a group of officials on the field to collect a large wooden object.", "This appears to be the answer given the media and podium."], "image": "val2014/COCO_val2014_000000091349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316741, "question_id": "XKE3u4XnBvHxRXSUaUN8fj", "question": "Three brown oval items in the door here are from which animal?", "choices": ["cow", "squirrel", "chicken", "donkey"], "correct_choice_idx": 2, "direct_answers": ["chicken", "hen", "cow", "chicken", "chicken", "chicken", "chicken", "cow", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["They are chicken breasts.", "The items are from chickens.", "Chickens hatch offspring from shells."], "image": "val2014/COCO_val2014_000000316741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88748, "question_id": "XKjTnkoiwuJKK8iXUvENAy", "question": "What is the fountain currently being used as?", "choices": ["bird bath", "animal feeder", "shower", "plant feeder"], "correct_choice_idx": 0, "direct_answers": ["birdbath", "bird bath", "birdbath", "bird bath", "birdbath", "bird bath", "door", "bird bath", "bird bath", "birdbath"], "difficult_direct_answer": false, "rationales": ["The birds are bathing in it.", "The fountain is used for birds.", "A fountain has water and plants in it. a bird is sitting in the water in a fountain."], "image": "val2014/COCO_val2014_000000088748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480717, "question_id": "XKmDaXDYCkZfuWopM7BaEi", "question": "What are these people doing?", "choices": ["protesting", "shop lifting", "racing", "dancing"], "correct_choice_idx": 0, "direct_answers": ["protesting", "protesting", "protesting", "protesting", "cheering", "cheering", "protesting", "protesting", "protesting", "cheering"], "difficult_direct_answer": false, "rationales": ["The mass of people are marching for a cause they believe is just and true. they will usually carry banners, flag and signs expressing their beliefs.", "These people are holding signs and there are cops around, so they are likely protesting.", "Police are keeping an eye on the crowd for safety."], "image": "train2014/COCO_train2014_000000480717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196688, "question_id": "XL4hkMEENxdAFgiPjfDsQr", "question": "What actor is from the state that this batter plays for?", "choices": ["matt damon", "jamie foxx", "kristen stewart", "mel brooks"], "correct_choice_idx": 0, "direct_answers": ["chris evans", "hanks", "matt damon", "no idea", "matt damon", "player", "matt damon", "i do", "dennis leary", "matt damon"], "difficult_direct_answer": false, "rationales": ["Matt damon is from boston.", "The actor is damon.", "The player has \"red sox\" visible on his jersey which is a team that plays in boston, massachusetts. answer a is from that state."], "image": "train2014/COCO_train2014_000000196688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402392, "question_id": "XLLF8UhqcYn6xYmhGRdEB3", "question": "What is this man's profession?", "choices": ["janitor", "cashier", "doctor", "athlete"], "correct_choice_idx": 3, "direct_answers": ["tennis", "tennis", "athlete", "tennis player", "tennis player", "tennis", "tennis player", "tennis player", "tennis", "tennis player"], "difficult_direct_answer": false, "rationales": ["He looks to be a professional in his sport.", "The man is a professional tennis player.", "The man plays sports."], "image": "train2014/COCO_train2014_000000402392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363560, "question_id": "XLgdp5wKyopCSo7522gw3j", "question": "What do the kids need to do next?", "choices": ["practice skills", "pull cart", "dump balls", "compete"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "serve", "practice swing", "practice skills", "cleanup", "hit balls", "hit ball", "learn tennis", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The kids practice.", "The kids are waiting on the balls to come.", "They are learning how to play tennis."], "image": "val2014/COCO_val2014_000000363560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51530, "question_id": "XM5RcepaPNcKRwetG5B5TQ", "question": "What position does this guy have?", "choices": ["executive", "fireman", "security", "waiter"], "correct_choice_idx": 2, "direct_answers": ["dominating", "security", "unknown", "officer", "security", "security guard", "security guard", "security", "security", "pilot"], "difficult_direct_answer": false, "rationales": ["The man is a security guard in uniform.", "He has an earpiece and is keeping an eye on things", "The man is wearing a security uniform and is watching what is going on."], "image": "train2014/COCO_train2014_000000051530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37291, "question_id": "XM9589wHmKd2zWf8DmKgHd", "question": "How many toes does cats are supposed to have?", "choices": ["18", "24", "11", "14"], "correct_choice_idx": 0, "direct_answers": ["four", "18", "eighteen", "five", "eighteen", "four", "four", "20", "four", "four"], "difficult_direct_answer": false, "rationales": ["They have 6 on each foot.", "A cat should have 18 toes.", "You can count toes on the paw and multiply by 4."], "image": "train2014/COCO_train2014_000000037291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455717, "question_id": "XMFty5hEhXMHxodJnQQkjt", "question": "What are the people playing in?", "choices": ["snow", "sand", "water", "box"], "correct_choice_idx": 0, "direct_answers": ["snow", "snow", "snow", "snow", "snow", "snow", "snow", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The ground cover is white and frozen.", "The people are in snow.", "The people are lounging in the snow."], "image": "train2014/COCO_train2014_000000455717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309795, "question_id": "XMSzYQbkK8xVq7GRpmuYcy", "question": "What is the white van used for?", "choices": ["transporting", "racing", "living", "education"], "correct_choice_idx": 0, "direct_answers": ["driving", "transporting", "animal transport", "losing", "deliveries", "transport", "moving storage", "moving stuff", "garbage", "delivery"], "difficult_direct_answer": true, "rationales": ["The white van is for cargo.", "The van is for transporting.", "A white van with a large covered back and door are on a street."], "image": "train2014/COCO_train2014_000000309795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221663, "question_id": "XMbojM83xXvUZvEpUWYSJ6", "question": "What is the woman celebrating with her beads?", "choices": ["easter", "halloween", "christmas", "mardi gras"], "correct_choice_idx": 3, "direct_answers": ["mardi gras", "mardi gras", "mardi gras", "mardi gras", "mardi gras", "mardi gras", "mardi gras", "mardi gras", "mardi gras", "mardi gras"], "difficult_direct_answer": false, "rationales": ["Mardi gras features purple, green and gold beads.", "The lady seems to be having something in her hand.", "Beads of this style and these colors are commonly known to be associated with answer a."], "image": "train2014/COCO_train2014_000000221663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62446, "question_id": "XMgGWcGhxZ545JxacnjizP", "question": "Where is the woman sitting?", "choices": ["bar", "sofa", "desk", "sewing machine"], "correct_choice_idx": 0, "direct_answers": ["resturant", "table", "bar", "table", "bar", "bar", "bar", "table", "restaurant", "bar"], "difficult_direct_answer": false, "rationales": ["The woman has alcohol near her.", "The woman has beer.", "There is a glass of wine and a bottle of wine on a counter."], "image": "train2014/COCO_train2014_000000062446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201873, "question_id": "XMrNRCLMiRTmG8Fii5GVRS", "question": "Why are the people standing behind the yellow line?", "choices": ["fun", "safety", "work", "punishment"], "correct_choice_idx": 1, "direct_answers": ["safety", "safety", "safety", "board train", "safety", "waiting safely", "safety", "safety", "waiting board", "waiting"], "difficult_direct_answer": false, "rationales": ["The people want to be safe.", "The yellow line is near the edge of the platform they are standing on. where the platform ends, trains go through.", "The lines are for visibility."], "image": "train2014/COCO_train2014_000000201873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194448, "question_id": "XMsovm2P9u9sWoAEf5Zd65", "question": "What is the contraption on the right used for?", "choices": ["climbing slopes", "descending slopes", "shoveling snow", "creating snow"], "correct_choice_idx": 0, "direct_answers": ["carry skiers", "support", "ski lift", "lifting", "ascending", "getting uphill", "lifting skiers", "traveling", "climbing slopes", "transportation"], "difficult_direct_answer": true, "rationales": ["There person on the right uses the contraption for climbing slopes.", "It is a ski lift.", "This is known as a ski lift, its main objective is to provide transport to skiers from the bottom of the hill to the top. once at the top of the hill skiers generally descend via ski or snowboard."], "image": "train2014/COCO_train2014_000000194448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210241, "question_id": "XMxfUb2cVjPVgmVmu7Ck8g", "question": "What infrastructure is necessary for the transportation here to move?", "choices": ["roads", "train tracks", "taxis", "airports"], "correct_choice_idx": 1, "direct_answers": ["train tracks", "tracks", "tracks", "rails", "rails", "railroad tracks", "rails", "railroad tracks", "tracks", "tracks"], "difficult_direct_answer": false, "rationales": ["The train needs to run on tracks.", "The trains use train tracks to move.", "Trains are parked next to each other."], "image": "val2014/COCO_val2014_000000210241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46924, "question_id": "XN2S3hAKQqpcpm9qCt6v5U", "question": "Of what use is the small silver lined hole on this device?", "choices": ["hacking device", "antenna", "camera lens", "charging portal"], "correct_choice_idx": 2, "direct_answers": ["safe", "camera", "camera", "camera", "camera lens", "taking pictures", "camera", "camera", "camera", "taking pictures"], "difficult_direct_answer": false, "rationales": ["There is a camera lens in the hole.", "Based on its location and on the type of device in question, the small hole with the silver border is certain to be a camera lens.", "The hole is the lens."], "image": "val2014/COCO_val2014_000000046924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278100, "question_id": "XN38ieh7PuoniAcYPrA698", "question": "Which one of these terms could be used to describe the motorcycle rider?", "choices": ["patriotic", "traitor", "apolitical", "anarchist"], "correct_choice_idx": 0, "direct_answers": ["patriotic", "patriotic", "patriotic", "flag", "patriot", "patriotic", "patriotic", "male", "white", "patriot"], "difficult_direct_answer": false, "rationales": ["The motorcycle rider has multiple american flags to showcase their loyalty.", "There are two american flags hanging from the motorcycle.", "He displays multiple flags on his motorcycle."], "image": "train2014/COCO_train2014_000000278100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277623, "question_id": "XNLcsAASDUEf9JpoWwJdim", "question": "What would most likely explain why these dogs look similar?", "choices": ["dog farm", "clone", "family", "optical illusion"], "correct_choice_idx": 2, "direct_answers": ["siblings", "related", "brothers", "siblings", "same pack", "same litter", "family", "same breed", "twins", "genetics"], "difficult_direct_answer": true, "rationales": ["The dogs may be related given the same breed and same geographical location.", "The dogs look very similar and are friendly towards each other.", "They are the same breed and maybe from the same litter"], "image": "train2014/COCO_train2014_000000277623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334118, "question_id": "XNYfLxLwTqvHowWT4fRNPB", "question": "What type of cuisine is being served?", "choices": ["japanese", "italian", "korean", "indian"], "correct_choice_idx": 0, "direct_answers": ["asian", "asian", "chinese food", "chinese", "japanese", "slop", "japanese", "asian", "chinese food", "suppergetti"], "difficult_direct_answer": false, "rationales": ["You can tell by the noodles, rice and other items that they are part of a asian cuisine.", "The food in the picture is popular foods amongst asian people.", "These are the types of entrees one would get at a japanese restaurant, indicating that this is japanese cuisine."], "image": "val2014/COCO_val2014_000000334118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112031, "question_id": "XNsivbeitYc97ErUWVhf3Y", "question": "What company name appears?", "choices": ["aol", "mcdonald's", "time warner", "hbo"], "correct_choice_idx": 0, "direct_answers": ["aol", "aol", "aol", "aol", "aol", "aol", "aol", "aol", "aol", "aol"], "difficult_direct_answer": false, "rationales": ["The letters are in the picture", "It is written in the lower left corner.", "The letters aol are at the bottom left."], "image": "train2014/COCO_train2014_000000112031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301397, "question_id": "XNtYcctZsxXvzA7ibckFG3", "question": "The man is doing what?", "choices": ["walking", "eating", "running", "talking"], "correct_choice_idx": 3, "direct_answers": ["taking", "talking", "talking", "talking", "talking", "using phone", "phone call", "talking", "talking", "communicating"], "difficult_direct_answer": false, "rationales": ["The man is talking.", "The man has a cell phone held near his face which implies he is talking on a phone call.", "The man has his phone to his ear."], "image": "val2014/COCO_val2014_000000301397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136704, "question_id": "XPEkYABfVKhuetG6GCNaVE", "question": "What are the passengers told to wait behind?", "choices": ["yellow line", "glass door", "velvet rope", "pylon"], "correct_choice_idx": 0, "direct_answers": ["line", "yellow line", "line", "line", "yellow line", "yellow line", "yellow line", "yellow line", "yellow line", "line"], "difficult_direct_answer": false, "rationales": ["The line and color is frequently used in train stations to bring riders attention to the fact they should wait a fair distance away from the approaching train to avoid injury.", "The yellow line is for safety.", "The passengers are all told to wait behind the yellow line."], "image": "train2014/COCO_train2014_000000136704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232511, "question_id": "XPHLaBUkWovELPD5THTRCr", "question": "What does the man who sits have trouble doing?", "choices": ["magic", "breathing", "walking", "riding"], "correct_choice_idx": 2, "direct_answers": ["walking", "seeing", "walking", "wakeup", "run trouble", "walking", "driving scooter", "travelling", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["The man can't walk.", "The vehicle being used is for people who cannot use their legs properly and need it for assistance in being mobile.", "The scooter that the man is using is usually used for people having trouble getting around."], "image": "val2014/COCO_val2014_000000232511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539557, "question_id": "XPQPW6wucjkprDaYYLtGJ5", "question": "How many kind of ships available mostly?", "choices": ["three", "two", "seven", "four"], "correct_choice_idx": 2, "direct_answers": ["one", "cargo ship", "one", "one", "one", "one", "cargo", "one", "seven", "eleven"], "difficult_direct_answer": false, "rationales": ["There are seven major types of ships.", "There are seven ships.", "You can count them in the picture."], "image": "val2014/COCO_val2014_000000539557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564183, "question_id": "XPRGnpcBpzXUSJzE7GKn8i", "question": "Where is number fifty one running to?", "choices": ["second base", "outfield", "third base", "first base"], "correct_choice_idx": 3, "direct_answers": ["first base", "first base", "first base", "first base", "1st base", "first base", "1st base", "first base", "first base", "first base"], "difficult_direct_answer": false, "rationales": ["He is going to the right and that is wear the batter is supposed to run after hitting the ball.", "The number fifty is running to first base.", "The number is going to first base."], "image": "train2014/COCO_train2014_000000564183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414777, "question_id": "XPrgJb7rqdRa9cFnnESoxg", "question": "Which food company produces these bananas?", "choices": ["dole", "bayer", "coca-cola", "chiquita"], "correct_choice_idx": 0, "direct_answers": ["dole", "agricultural", "dole", "chiquita", "dole", "dole", "dole", "dole", "dole", "dole"], "difficult_direct_answer": false, "rationales": ["Each banana has a sticker that contains this company's name.", "Dole produces bananas.", "They are a well known banana brand."], "image": "val2014/COCO_val2014_000000414777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434142, "question_id": "XQ7pDZD73VKKCA8UYPkJxF", "question": "What category of pizzas would this be considered?", "choices": ["vegan", "large", "meat lovers", "vegetarian"], "correct_choice_idx": 3, "direct_answers": ["white", "veggie", "vegetarian", "vegetarian", "vegetarian", "spinach", "vegetarian", "breakfast", "thin crust", "white"], "difficult_direct_answer": false, "rationales": ["This pizza doesn't have any meat.", "The pizza has no meat.", "The pizza has veggies."], "image": "val2014/COCO_val2014_000000434142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190907, "question_id": "XQ7qcAeEY46DiVfgssXFa4", "question": "What type of death might the Wreath commemorate?", "choices": ["iraqi president", "soldier", "gay person", "enemy fighter"], "correct_choice_idx": 2, "direct_answers": ["war", "killing gays", "gay person", "homicide", "murder", "hate crime", "murder", "bombing", "gays", "murder"], "difficult_direct_answer": false, "rationales": ["The sign near the window advocates against killing people with this sexual orientation.", "The death of gay people", "A sign hangs in protest in a street above a wreath."], "image": "val2014/COCO_val2014_000000190907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172439, "question_id": "XQEJ6SVaQGAwbYvSYQbqkp", "question": "All riders rely on each other to safely ride at the same what?", "choices": ["hour", "level", "speed", "house"], "correct_choice_idx": 2, "direct_answers": ["time", "speed", "rally", "speed", "event", "road", "time", "speed", "speed", "speed"], "difficult_direct_answer": false, "rationales": ["The riders need to go at the same speed to not crash.", "They have to maintain this or the ones behind them will wreck", "The bikers must be sure to go at the same pace to be safe."], "image": "val2014/COCO_val2014_000000172439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404071, "question_id": "XQF96hceXeQB4iau7GvGy2", "question": "What type of transportation is this?", "choices": ["water", "rail", "road", "air"], "correct_choice_idx": 1, "direct_answers": ["train", "train", "train", "train", "train", "train", "rail", "train", "passenger train", "train"], "difficult_direct_answer": false, "rationales": ["The train is on traintracks.", "A train rail is shown.", "The train is on a railway."], "image": "val2014/COCO_val2014_000000404071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334423, "question_id": "XQKgsYY36pUjsjceJXTpqX", "question": "What dish is the person assembling these food items to mimic?", "choices": ["sushi", "burritos", "tacos", "pizza"], "correct_choice_idx": 0, "direct_answers": ["sushi", "kimbab", "sushi", "salad", "sushi", "sushi", "sushi", "sushi roll", "sushi", "sushi"], "difficult_direct_answer": false, "rationales": ["They are wrapping the food inside of seaweed paper and rice.", "There are sliced carrots, egg strips, and other strips of veggies along with a person assembling a sushi roll with a seaweed wrap and a sushi roll mat.", "The person is putting vegetables and meats inside rice. there are no tacos, tortillas, or pizza crusts."], "image": "train2014/COCO_train2014_000000334423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292283, "question_id": "XQnWH5vmiMdVbq3HoiXBWP", "question": "What is this type of vehicle at the very front of the image referred to?", "choices": ["motorcycle", "truck", "bicycle", "car"], "correct_choice_idx": 0, "direct_answers": ["motor scooter", "motorcycle", "ebike", "scooter", "scooter", "scooter", "moped", "scooter", "scooter", "scooter"], "difficult_direct_answer": false, "rationales": ["The vehicle is a motorbike.", "A motorcycle is at the front.", "A moped is a type of motorcycle as opposed to a bicycle since it is always powered by a motor."], "image": "train2014/COCO_train2014_000000292283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5947, "question_id": "XQxk3xWYNzHq9rxSSAe3jJ", "question": "How do these planes get their main thrust?", "choices": ["front propeller", "push", "ropes", "jet engines"], "correct_choice_idx": 0, "direct_answers": ["propeller", "engine", "engine", "front propeller", "propellers", "propeller", "propeller", "propeller", "propeller", "propellers"], "difficult_direct_answer": false, "rationales": ["This is connected to the engine", "The spinning thing in the front of the plane helps it move. there are no jet engines.", "The planes use their front propellers."], "image": "train2014/COCO_train2014_000000005947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356445, "question_id": "XRGPRKQXoQrMSsknXuzoh9", "question": "The arm band in the player hand represent which brand?", "choices": ["reebok", "puma", "adidas", "nike"], "correct_choice_idx": 3, "direct_answers": ["nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["The brand is nike.", "This brand is the only brand that has that check logo.", "The swoosh check is like the trademark image of that brand."], "image": "train2014/COCO_train2014_000000356445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546577, "question_id": "XRpJ38rhGNgTiC4gbUmEyL", "question": "What is he excited about?", "choices": ["movie", "music", "video game", "sports"], "correct_choice_idx": 2, "direct_answers": ["playing", "video game", "video game", "gaming", "game", "game", "video game", "video game", "game", "video game"], "difficult_direct_answer": false, "rationales": ["The equipment he is holding provides the reasoning.", "The boy has a wii controller.", "He loves gaming."], "image": "val2014/COCO_val2014_000000546577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409138, "question_id": "XRrYuUBZ7stqASWa6qjgR9", "question": "What is the bus driving in?", "choices": ["freeway", "emergency lane", "hov lane", "middle lane"], "correct_choice_idx": 2, "direct_answers": ["bus lane", "road", "street", "hov lane", "city", "road", "city", "street", "road", "pavement"], "difficult_direct_answer": false, "rationales": ["The bus is in the hov lane.", "The bus lane is an hov lane.", "There is a bus in its own separate lane. a lane separate from the rest can also be the hov lane."], "image": "val2014/COCO_val2014_000000409138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406145, "question_id": "XSHo8xeWWocnZvfH6qNjZS", "question": "The batter has dated what celebrity?", "choices": ["camila cabello", "kiernan shipka", "rihanna", "zendaya"], "correct_choice_idx": 2, "direct_answers": ["rihanna", "rihanna", "no idea", "marilyn monroe", "unknown", "rihanna", "rihanna", "khloe kardashian", "anne-sophie fury", "rhianna"], "difficult_direct_answer": false, "rationales": ["This man once called rihanna his girlfriend.", "This player once dated rihanna while playing for the dodgers.", "The batter dated rihanna."], "image": "train2014/COCO_train2014_000000406145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15954, "question_id": "XSMN28pEwZkUTzJP3sWasC", "question": "Why are the people behind rails?", "choices": ["see better", "protect giraffes", "protect them", "keep clean"], "correct_choice_idx": 2, "direct_answers": ["zoo", "seeing giraffes", "fence", "safety", "see animals", "protect them", "visitors", "wild animals", "zoo visitors", "protection"], "difficult_direct_answer": true, "rationales": ["Giraffes are wild animals and cannot be in close proximity to people as it is not known how they would interact with them up close.", "The people are protected.", "The giraffes are at a zoo."], "image": "val2014/COCO_val2014_000000015954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347019, "question_id": "XSQeMjyVrTG4XbcbPXRve9", "question": "What type of bikes are these?", "choices": ["dirt", "tandem", "vespa", "cruiser"], "correct_choice_idx": 2, "direct_answers": ["scooters", "mopeds", "scooter", "motor scooters", "scooter", "motor scooters", "mopeds", "scooters", "vespa", "scooters"], "difficult_direct_answer": false, "rationales": ["The bike is a vespa.", "Two, two wheeled vehicles with seats and windshields are red. vespas are italians scooters that can be found in red.", "These are a type of motor scooter that can be ridden."], "image": "train2014/COCO_train2014_000000347019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289861, "question_id": "XSUewJXXUuX9Pqq9rYhoxM", "question": "Who is the cartoon characters companion on the backpack?", "choices": ["eeyore", "piglet", "goofy", "tigger"], "correct_choice_idx": 1, "direct_answers": ["dog", "pooh bear", "eeyore", "tigger", "winnie-the-pooh", "winnie poo", "child", "christopher robin", "winnie thepooh", "piglet"], "difficult_direct_answer": true, "rationales": ["Piglet is pooh's best friend in the winnie the pooh story.", "The character is piglet.", "The character on the backpack is winnie-the-pooh. goofy is from a different cartoon series, and winnie-the-pooh was only acquaintances with eeyore and tigger."], "image": "train2014/COCO_train2014_000000289861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181699, "question_id": "XSZah3Hniugq6tcCWtQCN5", "question": "What company is known for making the items on top of the cake?", "choices": ["subway", "mcdonald's", "popeye's", "dunkin donuts"], "correct_choice_idx": 3, "direct_answers": ["krispy kreme", "dunkin donuts", "dunkin donuts", "dunkin doughnuts", "cake decorator", "krispy kreme", "dunking donuts", "cake", "dunking donuts", "unknown"], "difficult_direct_answer": false, "rationales": ["Dunkin donuts make donuts.", "The items on the cake are donuts, which is the main product offering of dunkin' donuts franchises.", "This company makes pastries."], "image": "train2014/COCO_train2014_000000181699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128829, "question_id": "XSa2GfbHStmfQq4TEFMBXh", "question": "What do you enter in to this object?", "choices": ["grains", "vegetables", "meat", "money"], "correct_choice_idx": 3, "direct_answers": ["coins", "coins", "coins", "coins", "coins", "coins", "coins", "money", "coins", "money"], "difficult_direct_answer": false, "rationales": ["A parking meter has a small slot in it.", "The object is a parking meter. a person would have to pay to park here.", "You put coins in these in order to pay for your parking space for a set amount of time."], "image": "val2014/COCO_val2014_000000128829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375296, "question_id": "XSa7tqZBnLoLsnNX5Wqjhe", "question": "What was this dog bred for?", "choices": ["herding", "rescue", "hunting", "tracking"], "correct_choice_idx": 0, "direct_answers": ["searching", "herding", "border collie", "herding sheep", "sheepdog", "retrieving", "herding", "hunting", "hunting", "fetching"], "difficult_direct_answer": false, "rationales": ["This breed of dog is normally found in a pasture with horses or cows", "This type of dog is very good at herding sheep.", "The dog is a border collie. these were used by farmers to control sheep."], "image": "train2014/COCO_train2014_000000375296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425917, "question_id": "XSkhmsFfAZVYc9aMMPU5gm", "question": "What type of soccer match is this?", "choices": ["peewee league", "professional", "city park", "neighborly"], "correct_choice_idx": 1, "direct_answers": ["international", "euro league", "professional", "mens", "professional", "international", "professional", "final", "professional", "world cup"], "difficult_direct_answer": false, "rationales": ["They are in a stadium with advertisements", "There are a lot of spectators and there are sponsors of the field.", "They are being watched by spectators in a large stadium. the players' uniforms and the stadium have advertising."], "image": "train2014/COCO_train2014_000000425917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35974, "question_id": "XSm2XgVJWopdw752pCfEYn", "question": "What athlete is this?", "choices": ["james blake", "dev patel", "pele", "pete sampras"], "correct_choice_idx": 0, "direct_answers": ["jackson", "tennis", "james blake", "sports", "tennis player", "tennis player", "tennis player", "tennis player", "tennis player", "james blake"], "difficult_direct_answer": false, "rationales": ["The man must be james blake since he's playing tennis.", "The athlete is blake.", "James blake is a tennis player."], "image": "val2014/COCO_val2014_000000035974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241076, "question_id": "XSn74hYui6eZwzbZvwLzVS", "question": "What can be purchased in the yellow building?", "choices": ["skateboards", "bikes", "sneakers", "lottery tickets"], "correct_choice_idx": 3, "direct_answers": ["lotto", "lottery", "lotto tickets", "lottery tickets", "lottery tickets", "lotto tickets", "lotto", "lottery tickets", "lottery tickets", "lotto tickets"], "difficult_direct_answer": false, "rationales": ["Some lottery tickets could be purchased inside the lotto building.", "There is a sign on the building that says \"lotto\".", "The lottery tickets can be bought."], "image": "train2014/COCO_train2014_000000241076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234522, "question_id": "XSo5EDmeE9gpc5kyU7NAtp", "question": "What is the largest item here?", "choices": ["sea", "dog", "bird", "cat"], "correct_choice_idx": 0, "direct_answers": ["building", "building", "ocean", "building structures", "sea", "building", "buildings", "water", "building", "plane"], "difficult_direct_answer": false, "rationales": ["The sea is by far larger than anything else in the store.", "The sea is massive and expansive.", "The item is the sea."], "image": "train2014/COCO_train2014_000000234522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393617, "question_id": "XTVYMVsUT5rUcayNV5GTkp", "question": "What is the small brown structure in the back left of the yard?", "choices": ["shed", "outhouse", "phonebooth", "church"], "correct_choice_idx": 0, "direct_answers": ["shed", "shed", "shed", "outbuilding", "shed", "shed", "barn", "shed", "utility shed", "outbuilding"], "difficult_direct_answer": false, "rationales": ["The small brown structure in the backyard is a backyard shed.", "The structure is a shed.", "The building is very small."], "image": "train2014/COCO_train2014_000000393617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12020, "question_id": "XTfd3xPnGZtd3VgkysjkNJ", "question": "What could he wear on his head for protection while skateboarding?", "choices": ["hat", "sunglasses", "helmet", "headband"], "correct_choice_idx": 2, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "glove", "helmet", "helmet", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["It provides protection for your head and they come in different levels of protection they provide.", "He could have a helmet.", "A helmet is used by skateboarders to cover their head."], "image": "val2014/COCO_val2014_000000012020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62483, "question_id": "XThRVDooDyG9aW8pz7Ennc", "question": "What clothes are the people wearing?", "choices": ["uniform", "costume", "underwear", "pajamas"], "correct_choice_idx": 1, "direct_answers": ["costume", "costumes", "military costumes", "uniforms", "halloween costumes", "costumes", "military", "modern", "overalls", "costume"], "difficult_direct_answer": false, "rationales": ["The people are dressed up for halloween.", "This is obvious given the woman's front and the teddy bear.", "The people are dressed as cosplay characters."], "image": "val2014/COCO_val2014_000000062483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409092, "question_id": "XURJsTk2YRxHE49Ru5obsJ", "question": "What is a common brand of cat food?", "choices": ["rain", "ebony", "bones", "meow mix"], "correct_choice_idx": 3, "direct_answers": ["friskies", "meow mix", "fancy feast", "meow mix", "friskies", "purina", "meow mix", "friskies", "purina", "purina"], "difficult_direct_answer": false, "rationales": ["Meow mix is made for cats.", "Meow mix is a popular brand of cat food.", "The other options aren't brands of cat food."], "image": "train2014/COCO_train2014_000000409092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27326, "question_id": "XUVgSyG8r5H2SrQCfJJKJE", "question": "What are the white blocks in the soup called?", "choices": ["mozzarella cheese", "cabbage", "turnip", "tofu"], "correct_choice_idx": 3, "direct_answers": ["chicken", "mushrooms", "tofu", "tofu", "tofu", "cheese", "tofu", "tofu", "tofu", "cauliflower"], "difficult_direct_answer": false, "rationales": ["The food looks processed and is used in asian cooking.", "This protein is usually a tarnish to white square formed cube", "A food with this texture that is prepared in squares like those visible is likely to be answer a."], "image": "val2014/COCO_val2014_000000027326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185334, "question_id": "XUqRKhWY2PNrxpW6M4hJRz", "question": "Why are the bikes covered with junk?", "choices": ["for sale", "showing off", "confused", "find easily"], "correct_choice_idx": 1, "direct_answers": ["carry items", "decoration", "showing off", "sale", "drying clothes", "decoration", "decorated", "decoration", "protesting", "decoration"], "difficult_direct_answer": false, "rationales": ["They might be drying their clothes.", "The bikes are showing off.", "The bikes are decorated."], "image": "train2014/COCO_train2014_000000185334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334259, "question_id": "XUwTyYfSau6saNBvByoYee", "question": "These animals have an impressive what?", "choices": ["stinger", "quill", "wingspan", "tusk"], "correct_choice_idx": 2, "direct_answers": ["wingspans", "wingspan", "wingspan", "speed", "wingspan", "wingspan", "wing span", "honk", "wings", "wing span"], "difficult_direct_answer": false, "rationales": ["The span of the wings on these birds is impressive.", "They have big wingspans.", "These animals can spread their wings really wide."], "image": "train2014/COCO_train2014_000000334259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38899, "question_id": "XUwkunQ7vA8g8HJELnQ34Q", "question": "What is the man wearing on his head?", "choices": ["bandana", "hat", "headband", "helmet"], "correct_choice_idx": 0, "direct_answers": ["bandana", "bandanna", "bandana", "bandana", "bandana", "bandana", "bandanna", "bandana", "bandana", "bandana"], "difficult_direct_answer": false, "rationales": ["A bandana to keep the sweat off his face.", "The man has a bandana and it has the typical bandana colors and shape.", "The cloth is wrapped around his head."], "image": "train2014/COCO_train2014_000000038899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510998, "question_id": "XUxbR8ETxTgkjAxUk8WCnq", "question": "What might this umbrella normally be used for?", "choices": ["signaling", "rain protection", "nothing", "sun protection"], "correct_choice_idx": 3, "direct_answers": ["sea", "shade", "umbrella", "sun", "shade", "shade", "blocking sun", "as shade", "sun protection", "shading beachgoers"], "difficult_direct_answer": false, "rationales": ["This is on a beach so it is for shade", "The umbrella provides shade.", "The umbrella keeps the sun away."], "image": "train2014/COCO_train2014_000000510998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342591, "question_id": "XUzMiShLGNp4hxtXZ7Vn9Y", "question": "Where are the people on the bus going?", "choices": ["to school", "home", "to work", "sightseeing"], "correct_choice_idx": 3, "direct_answers": ["destination", "visit animals", "animal watching", "tour", "park", "sightseeing", "niagara falls", "traveling", "lake", "tourist attraction"], "difficult_direct_answer": true, "rationales": ["The bus shown is for tourists.", "Double decker buses are taken by tourists. tourists come to see the sights.", "A tourist bus is on the street."], "image": "train2014/COCO_train2014_000000342591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244421, "question_id": "XVATrgu2vfVt7XaT2zDwcb", "question": "What is the area code shown in bus?", "choices": ["190", "910", "970", "999"], "correct_choice_idx": 1, "direct_answers": ["910", "910", "910", "910", "910", "910", "910", "910", "910", "910"], "difficult_direct_answer": false, "rationales": ["The bus has the digits 910 on the front.", "The area code is 910.", "The numbers are on the front and side of the vehicle."], "image": "val2014/COCO_val2014_000000244421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146963, "question_id": "XVCqGnV7wYJSBEDrPJRqU5", "question": "The fish on the pillow goes by what name?", "choices": ["finbar", "dory", "nemo", "flounder"], "correct_choice_idx": 2, "direct_answers": ["nemo", "nemo", "nemo", "nemo", "nemo", "nemo", "nemo", "dory", "nemo", "dory"], "difficult_direct_answer": false, "rationales": ["An orange clown fish is on a pillow on a couch.", "The fish is from pixar's finding nemo movie.", "Finding nemo is the name of the clownfish movie."], "image": "val2014/COCO_val2014_000000146963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525021, "question_id": "XVEwTZ37z5m8NgqfenZcEN", "question": "Who is sitting behind the man in the motorcycle?", "choices": ["boy", "woman", "girl", "man"], "correct_choice_idx": 2, "direct_answers": ["child", "little girl", "little girl", "child", "woman", "child", "child", "child", "girl", "little girl"], "difficult_direct_answer": false, "rationales": ["A little girl is behind the man.", "A girl is behind the man.", "She is wearing a pink helmet so she is a she or gay."], "image": "val2014/COCO_val2014_000000525021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331198, "question_id": "XVFpQAERqmLGYbfQyCmgyA", "question": "What phrase best describes this place?", "choices": ["bus depot", "circus", "football stadium", "zoo"], "correct_choice_idx": 0, "direct_answers": ["packed", "depot", "bus depot", "bus station", "parking lot", "bus garage", "parking lot", "bus depot", "bus depot", "parking lot"], "difficult_direct_answer": false, "rationales": ["This would be a place they can park a bunch of busses.", "This place is full of vehicles, not animals or football players. they are parked and have four or more wheels.", "There are tons of buses parked."], "image": "train2014/COCO_train2014_000000331198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29155, "question_id": "XVYCJgx2T9gimygz7MyDek", "question": "How was something added to this sign most recently?", "choices": ["nailed", "tape", "ironed on", "painted"], "correct_choice_idx": 1, "direct_answers": ["sticker", "tape", "tape", "duct tape", "tape", "sticker", "sticker", "sticker", "taped on", "person"], "difficult_direct_answer": false, "rationales": ["There is a sticker attached to the sign.", "There are silver pieces of duck tape attaching the paper to the sign.", "A grey piece of tape is seen on the sign."], "image": "train2014/COCO_train2014_000000029155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82940, "question_id": "XVYfomLWHwqWGPBPrqiQUc", "question": "What are the women holding?", "choices": ["wakeboards", "surfboards", "skis", "paddles"], "correct_choice_idx": 3, "direct_answers": ["rows", "paddles", "paddles", "rows", "paddles", "paddles", "paddles", "paddles", "paddles", "paddles"], "difficult_direct_answer": false, "rationales": ["The women are paddle boarding, holding long sticks with a large part at the end, propelling themselves in the water.", "The women have paddles.", "The women are paddle boarding in the water."], "image": "val2014/COCO_val2014_000000082940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431671, "question_id": "XVsGu2RxHTpQrURN948cH7", "question": "Where is Coca-Cola's headquarters located?", "choices": ["arizona", "georgia", "utah", "maine"], "correct_choice_idx": 1, "direct_answers": ["georgia", "atlanta ga", "atlanta", "atlanta", "atlanta", "atlanta georgia", "atlanta ga", "new york", "atlanta", "atlanta ga"], "difficult_direct_answer": false, "rationales": ["Information can be found online but in a georgia", "In georgia where you can visit it.", "Coca-cola is based in atlanta."], "image": "val2014/COCO_val2014_000000431671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14283, "question_id": "XWE3zQJVvZi6gxVVGnj6BK", "question": "What type of sport is this?", "choices": ["team", "aquatic", "combat", "individual"], "correct_choice_idx": 0, "direct_answers": ["baseball", "baseball", "baseball", "team", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["Men in two different color uniforms are on a baseball field.", "A baseball team has nine players, indicating that baseball is indeed a team sport.", "The players are on opposite teams as evidenced by difference colored uniforms. each baseball team has nine players."], "image": "train2014/COCO_train2014_000000014283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494622, "question_id": "XWokRLUgwqy4cHnFqLrHGe", "question": "What style of environment is this?", "choices": ["russian", "libertarian", "victorian", "cajun"], "correct_choice_idx": 2, "direct_answers": ["antique", "dining room", "victorian", "wine tasting", "classical", "hotel", "inside", "winery", "victorian", "mansion"], "difficult_direct_answer": true, "rationales": ["There are elaborately designed items, patterned wallpaper and antique paintings.", "As indicated by the style of chairs, rugs, other furniture and wallpaper. the other options don't match.", "The chairs and wallpaper would suggest it's this style."], "image": "train2014/COCO_train2014_000000494622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479595, "question_id": "XWqep4kkcSh6cvXjovWZDS", "question": "Where is this lap top set up?", "choices": ["fire house", "church", "school", "cemetery"], "correct_choice_idx": 2, "direct_answers": ["park", "on ground", "garden", "grass", "on grass", "courtyard", "on grass", "school", "at park", "grass"], "difficult_direct_answer": false, "rationales": ["The laptop is for school.", "You can see all the students sitting on the grass.", "The person is in front of a large building with lots of other young adults."], "image": "train2014/COCO_train2014_000000479595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203734, "question_id": "XWyRdTrtDgCVVdPp9s6ec5", "question": "Why are their hands raised?", "choices": ["want more", "greetings", "afraid", "want impress"], "correct_choice_idx": 1, "direct_answers": ["answer question", "greetings", "posing", "happy", "answer", "playing", "approval", "happy", "playing game", "happiness"], "difficult_direct_answer": true, "rationales": ["The hands are greeting.", "These children have their hands waved in greetings.", "Kids raise their hands to be counted. these kids are sitting in front of plates with pizza with their hands raised."], "image": "val2014/COCO_val2014_000000203734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288229, "question_id": "XWzYJGAYgf2Jj98mo9uFFt", "question": "What type of area is this?", "choices": ["commercial", "rural", "country", "residential"], "correct_choice_idx": 0, "direct_answers": ["urban", "hotel square", "city", "city", "urban", "urban", "mall", "hilton", "commercial", "hotel"], "difficult_direct_answer": false, "rationales": ["There are shop signs and this is a place where people buy stuff.", "This is indicated by the hotel chain and restaurant chain names.", "The area must be commercial since there's a hilton's."], "image": "val2014/COCO_val2014_000000288229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54149, "question_id": "XXDsPgngonvDchyVmyz34G", "question": "What substance is covering the turf?", "choices": ["salt", "sand", "ash", "snow"], "correct_choice_idx": 3, "direct_answers": ["ice", "snow", "snow", "snow", "mud", "snow", "snow", "snow", "snow", "water"], "difficult_direct_answer": false, "rationales": ["The players appear to be outside and it's not uncommon for snow to fall.", "It is white.", "It has snow on the ground."], "image": "train2014/COCO_train2014_000000054149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5617, "question_id": "XXNGS9qXMBayCg9eTVCLoi", "question": "How many cats are on pillows?", "choices": ["two", "six", "eight", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is a calico cat on one pillow and a tabby cat on the other pillow.", "There is one on each pillow in the middle of the bed", "There are two cats."], "image": "val2014/COCO_val2014_000000005617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136300, "question_id": "XXR9u7c2bFTFUR9WCsYWbM", "question": "What do these animals have?", "choices": ["horns", "quills", "hooves", "gills"], "correct_choice_idx": 2, "direct_answers": ["jobs", "riders", "riders", "hooves", "manes", "hooves", "hoofs", "hooves", "strength", "riders"], "difficult_direct_answer": false, "rationales": ["The horses have hooves.", "Traditionally horses don't have feet or claws like other mammals, but instead they have hooves.", "The animals are horses. they are ungulates."], "image": "train2014/COCO_train2014_000000136300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564848, "question_id": "XXSqHt4WhbTvdmdyivAz57", "question": "Why does the man seated have his arm up?", "choices": ["measure", "balance", "gesture", "break fall"], "correct_choice_idx": 2, "direct_answers": ["meditation", "building", "getting comfortable", "talking", "taking selfie", "gesturing", "unknown", "talking friend", "reaching out", "gesture"], "difficult_direct_answer": true, "rationales": ["The man is seated with his arm up to gesture at the alleyway.", "This man is speaking and emphasizing topics with his hand.", "The man is waving."], "image": "train2014/COCO_train2014_000000564848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125815, "question_id": "XXXLLuMuYuoCkb82tj5ck7", "question": "What might the red light mean?", "choices": ["ready", "loading", "unavailable", "stop"], "correct_choice_idx": 2, "direct_answers": ["back", "stop", "stop", "stop", "stop", "stop", "stop", "unavailable", "back", "stop"], "difficult_direct_answer": false, "rationales": ["It shows that there might be a problem and it is not available.", "Red usually means stop.", "The train is waiting."], "image": "val2014/COCO_val2014_000000125815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515590, "question_id": "XXaCWQr4vsWo5tRT24f8ZG", "question": "What language is mainly spoken here?", "choices": ["japanese", "taiwanese", "mandarin", "korean"], "correct_choice_idx": 0, "direct_answers": ["chinese", "mandarin", "japanese", "japanese", "mandarin", "japanese", "japanese", "japanese", "mandarin", "japanese"], "difficult_direct_answer": false, "rationales": ["The language depicted is written with symbols common of asian culture, the environment and anime are indicators of which asian culture.", "There is writing on the trains.", "B, c and d writing looks distinctively different."], "image": "train2014/COCO_train2014_000000515590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419645, "question_id": "XXd8iV7co5GtcJVFeDp2Zy", "question": "What is this red truck for?", "choices": ["refurbishment", "rent", "sale", "repair"], "correct_choice_idx": 2, "direct_answers": ["owner preference", "pickup", "driving", "show off", "hauling", "show", "sale", "drive", "car show", "transportation"], "difficult_direct_answer": true, "rationales": ["The truck is an antique.", "It is parked in a row of other trucks. it has a sign on the windshield.", "It is shown in a yard in really good condition along with a whole bunch of other older-looking trucks."], "image": "train2014/COCO_train2014_000000419645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456782, "question_id": "XY2d4FB6XcN8EenaZVYPzc", "question": "What type of station is in view?", "choices": ["bus", "gas", "fire", "train"], "correct_choice_idx": 1, "direct_answers": ["gas", "gas station", "gas station", "square", "gas", "gas", "gas", "gas", "gas", "pizza station"], "difficult_direct_answer": false, "rationales": ["A business is on the corner at an intersection and is the only business open at night.", "There is a island for cars to park and the price is listed on the sign", "The price is advertised in large numbers"], "image": "train2014/COCO_train2014_000000456782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8571, "question_id": "XYBRN9qM6m9wkHkGHqhqFY", "question": "How do the women in black know each other?", "choices": ["neighbors", "teammates", "coworkers", "rivals"], "correct_choice_idx": 1, "direct_answers": ["friends", "play together", "teammates", "teammates", "teammate", "tennis", "teammates", "played tennis", "teammates", "teammates"], "difficult_direct_answer": false, "rationales": ["They are partners in tennis doubles.", "The people are wearing matching clothes and are on the same side of the court.", "They are on the same side of the court"], "image": "train2014/COCO_train2014_000000008571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87113, "question_id": "XYBhYkLJfXZV2VCXYrqc4H", "question": "What is the man in white apron holding to his ear?", "choices": ["cellphone", "banana", "bean bag", "ice pack"], "correct_choice_idx": 0, "direct_answers": ["phone", "cellphone", "phone", "phone", "phone", "phone", "cellphone", "phone", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["The man in the white apron is holding an electronic device, not a banana, ice pack, or bean bag, to his ear.", "The man is holding an electronic device, not a food item, medical item, or bean bag, to his ear. he is using it to have a conversation with someone else.", "He is talking to someone on the phone."], "image": "train2014/COCO_train2014_000000087113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387759, "question_id": "XYCiGrZhdRh8b5mEguWi9y", "question": "What item does the business with banners in front of it sell?", "choices": ["cars", "nothing", "car repair", "milkshakes"], "correct_choice_idx": 0, "direct_answers": ["cars", "cars", "cars", "used cars", "used cars", "cars", "cars", "cars", "used cars", "cars"], "difficult_direct_answer": false, "rationales": ["The sign in the yellow says what is sold there.", "They have a bunch of cars in their lot", "These businesses often have something colorful just played to get people's attention and there are many vehicles in the parking lot"], "image": "train2014/COCO_train2014_000000387759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126906, "question_id": "XYNXMUr3Fwy7JEUxq3irxq", "question": "What is the man on the couch doing?", "choices": ["eating", "gaming", "working", "watching tv"], "correct_choice_idx": 3, "direct_answers": ["watching tv", "watching television", "watching tv", "watching television", "watching tv", "watching television", "watching tv", "watching television", "watching television", "watching"], "difficult_direct_answer": false, "rationales": ["The mans legs are stretched while relaxed.", "The man is checking out the tv.", "A person is laying on a couch with a television on on the wall."], "image": "train2014/COCO_train2014_000000126906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54003, "question_id": "XYNjLtq4qEj4D6ov3QGrTY", "question": "Why is the woman in grey carrying a bag?", "choices": ["buying fruit", "for style", "selling fruit", "discarding trash"], "correct_choice_idx": 0, "direct_answers": ["carry groceries", "shopping", "shopping", "buying fruit", "shopping", "shopping", "purchased items", "keep safe", "store purchases", "carry fruits"], "difficult_direct_answer": false, "rationales": ["The woman buys fruit.", "The people are shopping at an outdoor market selling different types of produce.", "She needs the bag to put the items she will purchase into."], "image": "train2014/COCO_train2014_000000054003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260123, "question_id": "XYVYRfRAAfxrcuTaXSjMBY", "question": "What is the name given to the with lines across the road?", "choices": ["motorcycle crossing", "none", "give way", "zebra crossing"], "correct_choice_idx": 3, "direct_answers": ["crosswalk", "cross", "crosswalks", "crosswalk", "crosswalk", "crosswalk", "crosswalk", "crosswalk", "crosswalk", "zebra crossing"], "difficult_direct_answer": false, "rationales": ["The name is a zebra crossing.", "Crosswalks are also called zebra stripes sometimes.", "The crossings are called zebra stripes."], "image": "train2014/COCO_train2014_000000260123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500514, "question_id": "XYcFK5VJTp2ozQyCACCvKP", "question": "Why dies he wear a tie?", "choices": ["is formal", "scene requires", "keep warm", "impress girls"], "correct_choice_idx": 1, "direct_answers": ["costume", "mimicking actor", "scene requires", "businessman", "costume", "costume", "business man", "costume", "disguise", "being professional"], "difficult_direct_answer": false, "rationales": ["Acting and that is the wardrobe required.", "The scene requires it.", "These people are likely to be part of a movie or tv show and we're dressed to fit the part."], "image": "val2014/COCO_val2014_000000500514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133013, "question_id": "XYfSr2UNdsWgNWUdTiz4tV", "question": "What is the man in yellow beneath the front of the plane making?", "choices": ["surprise", "party hat", "connection", "mess"], "correct_choice_idx": 2, "direct_answers": ["safety assurances", "he's working", "inspection", "connection", "safety", "maintenance", "maintenance checks", "tow line", "tow line", "stop barrier"], "difficult_direct_answer": true, "rationales": ["The crew member is connecting the plane to a trailer with a hatch.", "He is connection something to the wheel.", "The man is connecting."], "image": "train2014/COCO_train2014_000000133013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298405, "question_id": "XYn6FjojjGNewSCTzGEXmN", "question": "Where will this pizza be eaten?", "choices": ["upstairs", "home", "here", "restaurant"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "home", "at home", "home", "at home", "at home", "at home", "restaurant", "home", "restaurant"], "difficult_direct_answer": false, "rationales": ["This pizza will be brought home to be eaten.", "Pizzas are often put in boxes for takeout orders. in many cases, these are than delivered to homes to be eaten.", "It is in a box to take out"], "image": "val2014/COCO_val2014_000000298405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141121, "question_id": "XYuS9Ct64AzPMukPDyYvrV", "question": "Why are they wearing gloves?", "choices": ["to catch", "health", "grip", "warmth"], "correct_choice_idx": 3, "direct_answers": ["cold", "cold", "cold", "for warmth", "cold weather", "warmth", "very cold", "prevent hypothermia", "warmth", "cold"], "difficult_direct_answer": false, "rationales": ["The gloves are warming.", "It is winter outside and dressing requires the man and child to cover up.", "In this winter setting it gets very cold and glove are needed."], "image": "train2014/COCO_train2014_000000141121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163457, "question_id": "XZ9XLnspMqXUNmeDZxSqB8", "question": "What automaker made this truck?", "choices": ["ford", "chevrolet", "mitsubishi", "toyota"], "correct_choice_idx": 2, "direct_answers": ["mitsubishi", "ford", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "mitsubishi", "ford", "ford"], "difficult_direct_answer": false, "rationales": ["Mitsubishi's logo is on the truck.", "The automaker's logo is in between the license plate and the windshield. it is not a chevrolet, ford, or toyota logo.", "The logo on the front of the truck tells you who it is."], "image": "train2014/COCO_train2014_000000163457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330881, "question_id": "XZYzePurCNZK9y6TCMCerC", "question": "Why is the pink paper there?", "choices": ["dropped", "label", "decoration", "reminder"], "correct_choice_idx": 3, "direct_answers": ["reminders", "reminder", "note", "notes", "reminder", "reminder", "reminder", "writting", "reminder", "reminding"], "difficult_direct_answer": false, "rationales": ["The pink paper here is a written reminder note.", "There is a post its with notes on it.", "Post-it's are used for this purpose."], "image": "train2014/COCO_train2014_000000330881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511294, "question_id": "XZexHtsUSYDyC5Sz54pnZ4", "question": "What does the number ten indicate?", "choices": ["starting position", "betting odds", "his age", "his ranking"], "correct_choice_idx": 0, "direct_answers": ["horse", "identification", "horse number", "starting position", "contestant number", "racing number", "entry number", "horse number", "competitor's number", "entry number"], "difficult_direct_answer": false, "rationales": ["The number 10 indicates the starting position of the horse.", "The number on the horse indicates the position it is in coming out of the gate.", "The number is for starting."], "image": "val2014/COCO_val2014_000000511294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310600, "question_id": "Xa89MS6Khg7vRnX8whG6a4", "question": "What's a name for the type of hat the men are wearing?", "choices": ["gambler", "fedora", "boater", "flat cap"], "correct_choice_idx": 3, "direct_answers": ["news", "derby", "driving cap", "flat cap", "ivy ascot", "tabby", "jaxon hat", "cabbie", "tabby", "old fashioned"], "difficult_direct_answer": true, "rationales": ["The men are wearing flat caps.", "These are fedoras they are wearing.", "These hats sit lower to the head."], "image": "val2014/COCO_val2014_000000310600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337265, "question_id": "XaA9FwhrENwAAVWLtrzRZD", "question": "What happened to the sandwich?", "choices": ["gone bad", "fell apart", "overcooked", "partly eaten"], "correct_choice_idx": 3, "direct_answers": ["eaten", "partly eaten", "bitten", "ate sandwich", "eaten", "was eaten", "eaten", "eaten", "eaten", "eaten"], "difficult_direct_answer": false, "rationales": ["The sandwich has been bitten into.", "Someone has taken bites of it.", "There are bites taken out of it"], "image": "train2014/COCO_train2014_000000337265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543713, "question_id": "XaC3EvLJitYMp3ZsyKxRGY", "question": "Why do sheep have horns?", "choices": ["nothing", "playing", "mating", "shock-absorbers"], "correct_choice_idx": 3, "direct_answers": ["defense", "male sheep", "head butt", "weapons", "fighting advantage", "shock-absorbers", "combat rivals", "defense", "male", "sheep fights"], "difficult_direct_answer": true, "rationales": ["They protect the sheep from the their head butting means of interacting with each other.", "The horns are for protection.", "The sheep need shock absorbers."], "image": "val2014/COCO_val2014_000000543713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464515, "question_id": "XaCSrsDKLtrwTC7FahfEJ7", "question": "Who is the happiest in the picture?", "choices": ["back woman", "front man", "front woman", "back man"], "correct_choice_idx": 2, "direct_answers": ["front woman", "woman", "lady", "woman", "green lady", "woman", "riding", "green shirt", "wife", "female"], "difficult_direct_answer": false, "rationales": ["Th front woman is smiling and her teeth can be seen.", "The woman behind the man is very happy.", "The front woman is smiling the most."], "image": "val2014/COCO_val2014_000000464515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335747, "question_id": "XaFVBLsYdhwWTdCNjEMVcS", "question": "What is the name for animals of this type?", "choices": ["caprine", "canine", "bovine", "feline"], "correct_choice_idx": 2, "direct_answers": ["cows", "bull", "bovine", "bull", "cattle", "cow", "cow", "bull", "oxen", "ox"], "difficult_direct_answer": false, "rationales": ["Bovines are cows.", "Cows are members of the bovine family.", "This is a bovine animal."], "image": "train2014/COCO_train2014_000000335747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332158, "question_id": "XaFcHFxAqmREeJ7hm6wT6Q", "question": "What would soon stop here if there were a power outage?", "choices": ["nothing", "all work", "parties", "silence"], "correct_choice_idx": 1, "direct_answers": ["electronics", "electronics", "all work", "electronics", "chargers", "electricity", "all work", "work", "internet use", "work"], "difficult_direct_answer": false, "rationales": ["The people working are dependent on the electricity and cords.", "The people are using electricity for their work.", "Their work would stop."], "image": "train2014/COCO_train2014_000000332158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525567, "question_id": "XaG3L6nuoB5suVcmMZwW2c", "question": "Which charity is featured on the red banners?", "choices": ["world vision", "united way", "red cross", "ms society"], "correct_choice_idx": 1, "direct_answers": ["united way", "united way", "united way", "bike", "united way", "united way", "unicef", "united way", "united way", "united way"], "difficult_direct_answer": false, "rationales": ["Each banner has a logo with a hand, a human, and a rainbow. the name of the charity is below the logo.", "The logo on the red banners is from united way.", "This is the common logo for a."], "image": "train2014/COCO_train2014_000000525567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43344, "question_id": "XaLhUDQ5SSuUEgn8jGsb8j", "question": "Who is the man sitting in the middle?", "choices": ["rafael nadal", "jack perry", "bo jackson", "djimon hounsou"], "correct_choice_idx": 0, "direct_answers": ["player", "tennis player", "tennis player", "player", "tennis player", "player", "tennis player", "tennis player", "rafael nadal", "tennis player"], "difficult_direct_answer": false, "rationales": ["The man is nadal.", "This is a spanish tennis player. the rest either play another sport or are actors.", "Rafael nadal is a tennis player."], "image": "val2014/COCO_val2014_000000043344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28002, "question_id": "Xacg6njxa9xruDrMrwN6XT", "question": "Why are the umbrellas in use?", "choices": ["sun protection", "fashion", "for sale", "rain protection"], "correct_choice_idx": 0, "direct_answers": ["shade", "people eating", "provide shade", "block sun", "it's hot", "sun protection", "for shade", "shade", "sunny", "sun protection"], "difficult_direct_answer": false, "rationales": ["The umbrellas block sun.", "It's really sunny out.", "When it is hot and bright outside, many outdoor establishments have umbrellas up to provide shade to their customers."], "image": "val2014/COCO_val2014_000000028002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572145, "question_id": "XasvYcyBjEr7EhffG2uDsY", "question": "From which position in relation to the pile of rectangular boards did the skateboard start?", "choices": ["right", "top", "left", "bottom"], "correct_choice_idx": 1, "direct_answers": ["left", "flat", "top", "bottom", "top", "ground", "floor", "next to", "in front", "lower"], "difficult_direct_answer": true, "rationales": ["The skateboarder is on top of the boards.", "Motion moves forward.", "The person on the top started."], "image": "train2014/COCO_train2014_000000572145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160433, "question_id": "XavNHrQpBHwYZspU8opQRD", "question": "Why are the train cars blurred?", "choices": ["bad film", "moving fast", "broken train", "bad camera"], "correct_choice_idx": 1, "direct_answers": ["fast motion", "moving fast", "motion", "moving", "going fast", "moving", "train moving", "fast moving", "moving", "motion"], "difficult_direct_answer": false, "rationales": ["The subway trains move quickly through underground tunnels to transport passengers to different subway stops.", "The cars move fast.", "The train cars are zipping along."], "image": "train2014/COCO_train2014_000000160433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303330, "question_id": "XayjKJpjeUbWWnUR5mfami", "question": "What activity does he have the equipment for?", "choices": ["surfing", "scuba diving", "fishing", "boating"], "correct_choice_idx": 0, "direct_answers": ["surfing", "surfboard", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The man is wearing shorts. he is sitting on a board.", "The man uses the board to surf the waves.", "That is a surfboard and he is at the beach"], "image": "train2014/COCO_train2014_000000303330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536939, "question_id": "XazA6XE3AdUorJnayfLRXc", "question": "Why do sanitary items comes in white color?", "choices": ["cleanliness", "none", "code", "protection"], "correct_choice_idx": 0, "direct_answers": ["see blood", "toilet paper", "cleanliness", "toilet", "cleanliness", "cleanliness appearance", "chemicals", "purity", "looks cleaner", "cleanliness"], "difficult_direct_answer": false, "rationales": ["They look clean when they are white and are easy to see the blemishes on them.", "A bathroom has all white fixtures.", "Clean items are white."], "image": "train2014/COCO_train2014_000000536939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510342, "question_id": "Xc8YhRaaHnXswMdX7AtC3T", "question": "What is the most likely setting in this situation?", "choices": ["church gathering", "work office", "aa meeting", "college class"], "correct_choice_idx": 3, "direct_answers": ["school", "college class", "office", "school", "workplace", "casual", "school", "office", "work", "office"], "difficult_direct_answer": false, "rationales": ["The calendar on the wall references different programs and spring and winter quarters. these are terms associated with school.", "There are several younger adults in a small space that appears to be a dorm room.", "Classes are divided into quarters and behind the man you can see a board divided into quarters"], "image": "train2014/COCO_train2014_000000510342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216303, "question_id": "XcB6itaSEvLrwb5eV4nZZp", "question": "Why is the girl blowing on her hand?", "choices": ["signal", "luck", "nails wet", "drying"], "correct_choice_idx": 3, "direct_answers": ["cool down", "cleaning it", "air off", "covering mouth", "soreness", "drying", "hurt", "stings", "luck", "cool down"], "difficult_direct_answer": true, "rationales": ["The girl is drying it.", "A girl with a sweatband on her head is playing tennis and has a sweatband on the wrist she isn't blowing on.", "The girl is drying off her hand."], "image": "val2014/COCO_val2014_000000216303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417006, "question_id": "XcYqYM3qCrnSmoJyyjBdKa", "question": "How many gaming remotes are likely among the bunch?", "choices": ["six", "seven", "one", "two"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["One of these removes is a wii controller. the others are for televisions and not gaming consoles.", "There is a wii game controller among the remotes.", "There is a wii controller out of the 6 tv remotes."], "image": "train2014/COCO_train2014_000000417006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204422, "question_id": "XccW4teGwFkBQncHBHWQb4", "question": "How is one person different from the others?", "choices": ["unhappy", "race", "age", "skis"], "correct_choice_idx": 3, "direct_answers": ["male", "skier", "female", "on skis", "man", "lifting leg", "skis", "one footed", "skis", "male"], "difficult_direct_answer": false, "rationales": ["All have snowboards but one.", "There is one person that is not using a snowboard.", "One person has skis and the others don't."], "image": "train2014/COCO_train2014_000000204422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188009, "question_id": "XcpyC56x9x4HSBAxcLj9t4", "question": "The man is in the midst of what type of timed test of skill?", "choices": ["grind", "ollie", "flip", "slalom"], "correct_choice_idx": 3, "direct_answers": ["skating test", "steering", "skating", "skateboarding", "slalom", "skateboarding", "skateboarding", "agility", "obstacle course", "skating"], "difficult_direct_answer": false, "rationales": ["The man is skating on a slalom course.", "The man is in a slalom contest.", "A man is skateboarding through a line of cones."], "image": "train2014/COCO_train2014_000000188009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353148, "question_id": "Xd298dZDcTgc7ERT3UEpCH", "question": "In what year were blue jeans invented?", "choices": ["1845", "1873", "1867", "1857"], "correct_choice_idx": 1, "direct_answers": ["1873", "eighteen seventythree", "1873", "1873", "eighteen seventythree", "1873", "1873", "1990", "1873", "1873"], "difficult_direct_answer": false, "rationales": ["They are invented a long time ago.", "The year was 1873.", "The levi strauss company opened its doors in 1873. founded in san francisco that year, the company still maintains its headquarters there."], "image": "val2014/COCO_val2014_000000353148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286925, "question_id": "Xd9knWuAf6VWKDU3cAp6AZ", "question": "What type of surfboard does the woman with the black pants have?", "choices": ["fish", "gun", "shortboard", "hybrid"], "correct_choice_idx": 1, "direct_answers": ["gun", "large", "large", "boogieboard", "yellow", "bodyboard", "wet one", "yellow surfboard", "longboard", "black"], "difficult_direct_answer": true, "rationales": ["The woman's board is flatter, thicker, and straighter.", "We can see that the right most woman with blackpants' surfboard is shorter than brighter yellow one to the left.", "The surfboard is like a gun."], "image": "train2014/COCO_train2014_000000286925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384909, "question_id": "XdNPsDD6vE79UGX3SjNjK8", "question": "What year did this company merge with another airline?", "choices": ["2010", "2015", "2012", "2006"], "correct_choice_idx": 2, "direct_answers": ["1983", "1987", "2013", "two thousands", "2010", "unknown", "2012", "2012", "2010", "twenty ten"], "difficult_direct_answer": false, "rationales": ["Continental merged with united in 2012.", "The continental united merger was in 2012.", "Continental merged with united to form united continental holdings."], "image": "train2014/COCO_train2014_000000384909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565975, "question_id": "XdRQSAmuYUGqYQt7FyAYq9", "question": "For what reason are there so many suitcases covering the vehicle most likely?", "choices": ["transportation", "sale", "storage", "decoration"], "correct_choice_idx": 3, "direct_answers": ["travel", "parade", "decoration", "decoration", "tourists luggage", "decoration", "decoration", "decorative", "several passengers", "for sale"], "difficult_direct_answer": false, "rationales": ["They put these on there to show off.", "They have them there to decorate it.", "The suitcases are for show."], "image": "train2014/COCO_train2014_000000565975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487011, "question_id": "XdgsXurHNBYHdWYipAspRM", "question": "Which character has been selected?", "choices": ["fifth", "fourth", "second", "first"], "correct_choice_idx": 2, "direct_answers": ["second", "lea", "lea", "lea", "len", "girl", "len", "girl", "lena", "lan"], "difficult_direct_answer": false, "rationales": ["The child is choosing the second character.", "A child is pointing an remote towards the television and the second person on the screen is highlighted.", "The boy is hovering over the second character."], "image": "train2014/COCO_train2014_000000487011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580082, "question_id": "XdjghG8YL2qXzhGpM5ig2W", "question": "The person wearing what kind of outfit is acting in an abnormal way?", "choices": ["white shirt", "black suit", "tennis dress", "black dress"], "correct_choice_idx": 2, "direct_answers": ["tennis", "tennis", "tennis attire", "tennis", "tennis dress", "tennis outfit", "tennis", "tennis", "tennis attire", "tennis"], "difficult_direct_answer": false, "rationales": ["They are pretending to do a serve", "The young woman wearing a tennis dress is acting in an abnormal way in this location.", "The person in the tennis dress shouldn't have her arms out."], "image": "train2014/COCO_train2014_000000580082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221200, "question_id": "XdqRkX8DgnL2PaJYa4CYPB", "question": "Why does the man at the back hold the driver?", "choices": ["for condolence", "for balance", "for love", "for friendship"], "correct_choice_idx": 1, "direct_answers": ["safety", "hold", "for balance", "safety", "joking", "safety", "steady", "for safety", "to protect", "for safety"], "difficult_direct_answer": false, "rationales": ["Sometimes on a bike it can be hard to stay on. you need something to hold onto.", "The man on back of this motorcycle holds onto the driver so as to not fall off the motorcycle.", "The man is balancing."], "image": "train2014/COCO_train2014_000000221200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223095, "question_id": "XdvvzBN7hMZTbquJRDsGQf", "question": "What kind of ship is the one in the water?", "choices": ["container", "naval", "passenger", "tanker"], "correct_choice_idx": 2, "direct_answers": ["cruise ship", "cruise ship", "cruise ship", "cruise", "cruise", "cruise", "passenger", "cruise ship", "cruise ship", "cruise ship"], "difficult_direct_answer": false, "rationales": ["These type of cruise ships house many people.", "Multiple cruises ships are in the water. people board them and stay on them.", "This is a cruise ship. the coloring is that of carnival."], "image": "train2014/COCO_train2014_000000223095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408424, "question_id": "XeJxeagBVbceDC9rVEUCEJ", "question": "What are the black poles on the front train window?", "choices": ["wipers", "antennae", "handles", "bumpers"], "correct_choice_idx": 0, "direct_answers": ["windshield washers", "wipers", "windshield wipers", "wipers", "wipers", "wipers", "windshield wipers", "wipers", "wipers", "handles"], "difficult_direct_answer": false, "rationales": ["The poles are wipers.", "There are windshield wipers on the front of the train.", "The black poles help to keep the glass of the window clear."], "image": "train2014/COCO_train2014_000000408424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217082, "question_id": "XeM3wNAXqAdjgrAMyoQKyy", "question": "Where is the dog located at?", "choices": ["backyard", "zoo", "picnic area", "wilderness"], "correct_choice_idx": 2, "direct_answers": ["outside", "backyard", "park", "martins beach", "by bench", "on grass", "park", "park", "park", "picnic area"], "difficult_direct_answer": false, "rationales": ["The dog is in a picnic area.", "You can tell by the setting and picnic tables as to where the dog is.", "He is sitting on grass between the picnic tables."], "image": "val2014/COCO_val2014_000000217082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450728, "question_id": "XeQPXfNB2nxJQPpGUqqCGb", "question": "The white man is most likely what?", "choices": ["warrior", "tourist", "prisoner", "native"], "correct_choice_idx": 1, "direct_answers": ["trainer", "tourist", "tourist", "tourist guide", "tourist", "tourist", "tourist", "tourist", "tourist", "caretaker"], "difficult_direct_answer": false, "rationales": ["As he is not the same nationality as the man leading the elephant, it is probably safe to assume he is visiting the area.", "The owners give rides to the country's visitors for a fee.", "Usually locals don't ride elephants; this seems like an activity for people visiting."], "image": "val2014/COCO_val2014_000000450728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526987, "question_id": "XeqP8GYBx6Xzt97ev6DX8U", "question": "What animal does the pink meat on the dish come from?", "choices": ["pig", "chicken", "dog", "cow"], "correct_choice_idx": 0, "direct_answers": ["pig", "pig", "pig", "pig", "cow", "pig", "cow", "pig", "cow", "pig"], "difficult_direct_answer": false, "rationales": ["The plate has bacon which is slices of meat.", "The pink meat is bacon. it does not come from chickens, cows, or dogs.", "The pink meat is bacon. bacon comes from pigs."], "image": "train2014/COCO_train2014_000000526987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247049, "question_id": "XeujRBKqeMi8BoZ3qZhLMi", "question": "What are the metal towers used for?", "choices": ["cell phones", "transportation", "climbing", "gaming"], "correct_choice_idx": 1, "direct_answers": ["move people", "ski lifts", "ski lift", "chair lift", "ski lift", "transportation", "ski lift", "ski lifts", "chairlift", "balancing"], "difficult_direct_answer": false, "rationales": ["The metal towers are used for transporting people on the slope.", "Ski poles are an assistive device used by skiers to help balance and propel them across the snow. the use of skis to get from one place to another is called this.", "This is for a ski life to move people upslope easily"], "image": "train2014/COCO_train2014_000000247049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495980, "question_id": "Xf2NC58jbK97MTV6TQYLgS", "question": "What is the refrigerator currently being used as?", "choices": ["storage", "art", "door blocker", "meme"], "correct_choice_idx": 3, "direct_answers": ["cooler", "meme", "cold space", "haiku", "display", "prop", "food storage", "prop", "leaning post", "decoration"], "difficult_direct_answer": true, "rationales": ["The refrigerator is a meme.", "The refrigerator has a punchline.", "There is some chalk writing in front."], "image": "val2014/COCO_val2014_000000495980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38753, "question_id": "XfBkMciEyu7bRqaiSLjezV", "question": "What item does the young boy snack on here?", "choices": ["mushrooms", "tomatoes", "pizza dough", "sausage"], "correct_choice_idx": 0, "direct_answers": ["mushrooms", "pizza", "pizza", "pizza", "mushroom", "pizza", "pizza", "pizza", "mushrooms", "pizza"], "difficult_direct_answer": false, "rationales": ["The young man is eating mushrooms.", "The boy is eating some sliced mushrooms.", "As indicated by the items being off the pizza and on the table."], "image": "val2014/COCO_val2014_000000038753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167553, "question_id": "XfCqic3cueeBZGWPDrHKMr", "question": "Rawlings are the helmets used by whom?", "choices": ["bbl", "mlb", "ttb", "bsl"], "correct_choice_idx": 1, "direct_answers": ["baseball players", "players", "baseball players", "baseball players", "catcher", "catchers", "empire", "baseball players", "baseball catchers", "mlb"], "difficult_direct_answer": false, "rationales": ["These are for baseball players", "The answer is internet searchable.", "These helmets are used for baseball."], "image": "train2014/COCO_train2014_000000167553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40729, "question_id": "XfDJLGBu7dZHUbowpCe7zT", "question": "How are these two people related?", "choices": ["strangers", "siblings", "enemies", "spouses"], "correct_choice_idx": 3, "direct_answers": ["spouse", "married", "husband wife", "cut cake", "married", "spouses", "partners", "married", "married", "married"], "difficult_direct_answer": false, "rationales": ["They just got married and are cutting their wedding cake.", "A woman in a wedding dress is standing next to a man in a tuxedo and they are cutting a wedding cake. this is done by people who just got married.", "Two people are cutting a wedding cake together."], "image": "val2014/COCO_val2014_000000040729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162488, "question_id": "XfDiizsyf4rvKutq9t4ahN", "question": "What is shown here that could cook a roast for you making it most tender while you are at work so when you return you'll be able to eat it?", "choices": ["stove", "hot plate", "microwave", "crock pot"], "correct_choice_idx": 3, "direct_answers": ["microwave", "microwave oven", "crock pot", "crock pot", "crock pot", "crock pot", "crock pot", "crockpot", "crock pot", "microwave"], "difficult_direct_answer": false, "rationales": ["It is a slow cooker", "You would use the slow-cooking vessel to cook like that.", "Answer a is known to cook in the method described in the question and is identifiable on the shelf based on its shape and design."], "image": "train2014/COCO_train2014_000000162488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32773, "question_id": "XfQZhmHNHGPNRKJTBsdnJA", "question": "What is the name of the safety feature on the front of the bus helps make it visually brighter so no accidents occur?", "choices": ["hd lights", "twinkle lights", "beam lights", "headlights"], "correct_choice_idx": 3, "direct_answers": ["headlights", "reflective paint", "headlights", "sign", "lights", "headlights", "visible paint", "headlights", "yellow colour", "headlights"], "difficult_direct_answer": false, "rationales": ["The name is headlights.", "Headlights are the lights on the front of vehicles so they can see.", "Those lights are used at the front of the train."], "image": "train2014/COCO_train2014_000000032773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493793, "question_id": "XfZwjCZZNAPLaA6nNPJoJk", "question": "What food group is available?", "choices": ["dairy", "grains", "vegetables", "fruits"], "correct_choice_idx": 3, "direct_answers": ["fruits", "fruit", "fruit", "fruit", "vitamins", "fruit", "fruit", "fruit", "fruit", "fruits"], "difficult_direct_answer": false, "rationales": ["The food is a fruit.", "These are oranges and lemons.", "Oranges, bananas and lemons are available. they are all part of this food group."], "image": "train2014/COCO_train2014_000000493793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239444, "question_id": "XfgtUsyimBcC6XbS2w9Dw5", "question": "At which preparation stage is this pizza?", "choices": ["kneading", "fully baked", "chopping", "raw"], "correct_choice_idx": 3, "direct_answers": ["oven heating", "baking", "cooking", "raw", "uncooked", "pre-cook", "raw", "baking", "baking stage", "baking"], "difficult_direct_answer": false, "rationales": ["The pizza doesn't look like it was cooked yet and appears to be still raw.", "The pizza is raw.", "It hasn't been cooked yet."], "image": "val2014/COCO_val2014_000000239444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310094, "question_id": "XfiJQim5GzV8AnzY6FUd9w", "question": "What is the name given to the purple part of the banana above?", "choices": ["flower bud", "leaf", "stem", "sucker"], "correct_choice_idx": 0, "direct_answers": ["stalk", "flower bud", "flowers", "stem", "inflorescence", "stem", "stem", "stem", "flower", "leaf"], "difficult_direct_answer": false, "rationales": ["The purple part of the banana above is the bud of a flower.", "The plant is called a bud.", "The name is the flower bud."], "image": "val2014/COCO_val2014_000000310094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361140, "question_id": "XfkHiGVxdk8jrPB3YJcHRq", "question": "How is the game laying on top of the chess board called?", "choices": ["chess", "monopoly", "life", "ludo"], "correct_choice_idx": 3, "direct_answers": ["parchisi", "checkers", "ludo", "ludo", "parchisi", "trouble", "ludo", "sorry", "parchese", "ludo"], "difficult_direct_answer": false, "rationales": ["The game is known as ludo.", "The game is ludo.", "The other options don't match the game board graphics."], "image": "val2014/COCO_val2014_000000361140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491733, "question_id": "XgYVoyqsGpBpQPi9pXCvaq", "question": "What state is this photograph based in?", "choices": ["ohio", "alabama", "new york", "michigan"], "correct_choice_idx": 3, "direct_answers": ["michigan", "michigan", "michigan", "michigan", "michigan", "michigan", "michigan", "michigan", "michigan", "michigan"], "difficult_direct_answer": false, "rationales": ["There is a description written at the bottom of the image that describes the location of the photo.", "A photograph of a boat dock is labeled with basic information.", "This looks to be a lake up north."], "image": "train2014/COCO_train2014_000000491733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63263, "question_id": "XgfgG2snvjSYVUaYovUAnm", "question": "What is the boy on the right touching his feet to?", "choices": ["base", "bench", "step", "bag"], "correct_choice_idx": 0, "direct_answers": ["base", "base", "base", "base", "base", "base", "base", "base", "tag base", "tag base"], "difficult_direct_answer": false, "rationales": ["The boy is about to reach the base.", "In baseball, usually there are white bases where you touch when you run across. they are in a baseball field.", "The boy is on base."], "image": "train2014/COCO_train2014_000000063263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242163, "question_id": "XgoL5ojfS7v7fkmawzUaug", "question": "What flowers might grow wild in this environment?", "choices": ["roses", "dhalias", "dandelions", "foxglove"], "correct_choice_idx": 2, "direct_answers": ["dandelions", "dandelions", "daisies", "dandelions", "daisys", "daisies", "daisies", "dandelion", "dandelions", "lilies"], "difficult_direct_answer": false, "rationales": ["They are found in a lot of grassy areas", "The flowers that grow here are called dandelions.", "Dandelions are common in fields."], "image": "train2014/COCO_train2014_000000242163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341013, "question_id": "XhP7bFGpRDhtLUmfefpV6A", "question": "What does this item by the curb need to run?", "choices": ["wind up", "gasoline", "solar power", "trampoline"], "correct_choice_idx": 1, "direct_answers": ["gas", "gasoline", "patrol", "car", "motor", "metal icon", "engine", "gasoline", "gasoline", "diesel"], "difficult_direct_answer": false, "rationales": ["A car is parked in the street.", "A motor vehicle is parked by this curb. motor vehicles require gasoline to operate.", "This is an old car which would run on petroleum fuel."], "image": "train2014/COCO_train2014_000000341013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75267, "question_id": "XhXbBCAUThapCvTaWgwSBn", "question": "What language does the name of this airline derive from?", "choices": ["greek", "assyrian", "french", "spanish"], "correct_choice_idx": 0, "direct_answers": ["greek", "english", "greek", "greek", "greek", "greek", "greek", "english", "greek", "english"], "difficult_direct_answer": false, "rationales": ["Delta is one of the greek letters in the alphabet.", "The word is from greek.", "The language is greek."], "image": "train2014/COCO_train2014_000000075267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403294, "question_id": "XhijhFNYSuXrFqCs3m8Eyk", "question": "Who is the man seen in the front of the bus window?", "choices": ["police", "passenger", "driver", "bystander"], "correct_choice_idx": 2, "direct_answers": ["driver", "driver", "driver", "bus driver", "driver", "bus driver", "driver", "driver", "bus driver", "driver"], "difficult_direct_answer": false, "rationales": ["The person behind the steering wheel operates the bus.", "The man visible at the front of the bus is seated in front of the steering wheel and pedals which move the bus.", "The person in the front is driving the bus."], "image": "val2014/COCO_val2014_000000403294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119036, "question_id": "XhwWQD7L6ctCWGvAqjkhsA", "question": "Who got this ball to this place?", "choices": ["catcher", "pitcher", "outfielder", "coach"], "correct_choice_idx": 1, "direct_answers": ["throw", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "baseball player", "pitcher"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to throwing the ball to the batter.", "The baseball depicted was just thrown towards the batter, who is actively batting. traditionally, a is the only person who does that.", "In baseball, the ball is thrown to the batter."], "image": "train2014/COCO_train2014_000000119036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24458, "question_id": "Xi7bS5gRSyptZbmuCicbjf", "question": "Why is the horse wearing this on its face?", "choices": ["fashion", "training", "sick", "biting"], "correct_choice_idx": 1, "direct_answers": ["bug protection", "avoid bugs", "limit vision", "horse flies", "focus vision", "bugs", "training", "mask", "protection", "bugs"], "difficult_direct_answer": true, "rationales": ["The horse is being trained not to be startled.", "The horse is wearing a mask on its face because it helps with its training", "The owners want to get the horse used to doing something for them."], "image": "val2014/COCO_val2014_000000024458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54039, "question_id": "XiXekCTXu2CrciakLBiTPa", "question": "Why are the vehicles covered?", "choices": ["to decorate", "to steal", "protection", "to ship"], "correct_choice_idx": 2, "direct_answers": ["sun", "keep safe", "for protection", "rain protection", "protection", "protection", "prevent damage", "protection", "for protection", "protect motorbike"], "difficult_direct_answer": false, "rationales": ["The vehicles are protected.", "The vehicles are motorcycles based on their size, shape and design. a cover like that depicted on the image would be used for the purposes of answer a.", "The vehicles are outside. outside conditions deteriorate and damage vehicles, vehicles are covered to prevent that."], "image": "val2014/COCO_val2014_000000054039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558371, "question_id": "XicSMvnZ535bAnDRczjCC5", "question": "What is on the pizza?", "choices": ["meatball", "turkey", "peanuts", "broccoli"], "correct_choice_idx": 3, "direct_answers": ["broccoli", "broccoli", "broccoli", "broccoli", "pepperoni vegetables", "broccoli", "broccoli", "brocolli", "cheese", "broccoli"], "difficult_direct_answer": false, "rationales": ["You can see little green trees that is broccoli on the pizza", "Broccoli florets are a topping.", "The pizza is visible and there is a green floret on top that is consistent with broccoli and matches how broccoli is prepared and used as a topping."], "image": "val2014/COCO_val2014_000000558371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509114, "question_id": "XieT3okqYnVWsrBkoBPtUR", "question": "What sort of treats does the lady here like?", "choices": ["pizza", "barbeque", "salad", "baked goods"], "correct_choice_idx": 3, "direct_answers": ["desserts", "donut", "baked goods", "donut", "donuts", "sweet", "donuts", "sweets", "donuts", "donut"], "difficult_direct_answer": false, "rationales": ["The woman is holding a doughnut which is a baked good.", "This is a donut", "The woman is eating a pastry."], "image": "train2014/COCO_train2014_000000509114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516175, "question_id": "Xis3R4TgNidcajChE97HMh", "question": "What piece of equipment is in the window?", "choices": ["speaker", "fan", "air conditioner", "heater"], "correct_choice_idx": 2, "direct_answers": ["air conditioning", "air conditioner", "ac", "air conditioner", "air-conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner"], "difficult_direct_answer": false, "rationales": ["Sometimes people have air conditioners sitting in their windows.", "The equipment is the ac.", "This is a machine to keep the room cool when it is hot outside."], "image": "train2014/COCO_train2014_000000516175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201925, "question_id": "XisBrLnHHKh2c6AWwtZZV7", "question": "What yellow fluid might be paired with this?", "choices": ["paint", "banana milk", "custard", "egg yolk"], "correct_choice_idx": 2, "direct_answers": ["butter", "butter milk", "custard", "butter", "lemonade", "icing", "eggnog", "syrup", "icing", "honey"], "difficult_direct_answer": false, "rationales": ["Eggs are often used in baked goods such as cakes and this item is cakelike, in a cake pan, and being baked in an oven.", "A cake is baking in the oven.", "Custard is usually in cakes."], "image": "val2014/COCO_val2014_000000201925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404990, "question_id": "XjAs3eaP4NpyY2Kso7JY45", "question": "Why is everyone posed so ornately?", "choices": ["school pictures", "bus stop", "dinner waiting", "wedding photo"], "correct_choice_idx": 3, "direct_answers": ["prom night", "prom", "wedding photo", "prom", "formal picture", "taking pictures", "prom picture", "prom", "formal night", "group picture"], "difficult_direct_answer": false, "rationales": ["The group of people are posing together as they are dressed up to go to a wedding.", "The people are visibly wearing formal attire. when wearing formal attired and posing in such a manner, they may be attending an event similar to answer a.", "The people are in a wedding."], "image": "train2014/COCO_train2014_000000404990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28149, "question_id": "Xjp68RBgCMSbMpF5TCmYAB", "question": "In which building is this room located?", "choices": ["gas station", "motel", "prison", "train depot"], "correct_choice_idx": 1, "direct_answers": ["hotel", "bathroom", "apartment", "bathroom", "bathroom", "bathroom", "hotel", "hotel", "motel", "tv"], "difficult_direct_answer": false, "rationales": ["Everything is together very close and the tv remote is on the table", "The room is in a motel since there's a tv and small soaps.", "The picture includes a very small bathroom which is adjacent to living room. with its folded towel and boxed soap, it must be a motel room."], "image": "train2014/COCO_train2014_000000028149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402368, "question_id": "XjwCcVYFdqUHMmrCR78G9X", "question": "What do you call a fire place?", "choices": ["heater", "illuminator", "warmer", "hearth"], "correct_choice_idx": 3, "direct_answers": ["fireplace", "lamp", "hearth", "hearth", "fireplace", "hearth", "fireplace", "hearth", "fire place", "fireplace"], "difficult_direct_answer": false, "rationales": ["That's the area around a fireplace.", "This is another term for part of it", "A fireplace is also called a hearth."], "image": "val2014/COCO_val2014_000000402368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29191, "question_id": "XjwFUiWZ4FSMXJZwgiabJc", "question": "Where does the skateboarder hope to land?", "choices": ["grass", "school", "sidewalk", "home"], "correct_choice_idx": 2, "direct_answers": ["sidewalk", "sidewalk", "on sidewalk", "sidewalk", "on sidewalk", "on sidewalk", "sidewalk", "sidewalk", "sidewalk", "sidewalk"], "difficult_direct_answer": false, "rationales": ["The sidewalk is safest.", "The grass would be too rough to land on and cause a fall. the walkway is smooth.", "The skateboarder wants to be on the sidewalk."], "image": "train2014/COCO_train2014_000000029191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21451, "question_id": "XkLVVbkNjDSdfqsfa7y2CK", "question": "The small yellow pieces on the bread are probably what food?", "choices": ["chickpeas", "corn", "beans", "peppers"], "correct_choice_idx": 1, "direct_answers": ["garlic", "corn", "bread", "corn", "nuts", "corn", "corn", "corn", "corn", "pizza"], "difficult_direct_answer": false, "rationales": ["The veggies are taken from their own cob.", "The yellow pieces are yellow kernels that are usually on a cob.", "There are yellow kernels."], "image": "train2014/COCO_train2014_000000021451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327255, "question_id": "XkmVi943CQgcAfM8rXn2bY", "question": "What do these men plan to do here?", "choices": ["ski", "hike", "run", "swim"], "correct_choice_idx": 0, "direct_answers": ["snowboard", "snowboard", "snowboard", "snowboarding", "ski", "snow board", "snowboard", "snowboard", "snowboard", "ski"], "difficult_direct_answer": false, "rationales": ["This is a ski resort and that's one of the things people come up here to do.", "A guy is standing on a snowboard in a snowy area with other people around.", "The men are going snowboarding."], "image": "val2014/COCO_val2014_000000327255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199021, "question_id": "XmJMik3KjLGLJhrXTzrjTj", "question": "Which of the perishables went through a process of fermentation?", "choices": ["turkey", "tomato", "lettuce", "beer"], "correct_choice_idx": 3, "direct_answers": ["beer", "beer", "beer", "beer", "beer", "drink", "beer", "beer", "drink", "beer"], "difficult_direct_answer": false, "rationales": ["The perishables are from beer.", "Beer is always made from fermented wheat or grain.", "Beer is fermented."], "image": "train2014/COCO_train2014_000000199021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196104, "question_id": "XmQX32NpyAwZzvTahXKrof", "question": "How many players in baseball team?", "choices": ["12", "11", "eight", "nine"], "correct_choice_idx": 3, "direct_answers": ["nine", "nine", "four", "twenty", "nine", "nine", "nine", "nine", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["Although more people can be on the team, there is a maximum number of positions that a team can field at one time, p, c, 1b, 2b, 3b, ss, rf, cf, lf.", "I count 12 players from the batter to the players in and around the dugout.", "6 are infield and 3 outfield"], "image": "train2014/COCO_train2014_000000196104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419216, "question_id": "XmTxypaVn92xauEVTVMyNV", "question": "What sort of dining do those in front of the restaurant enjoy?", "choices": ["none", "diet", "fine", "al fresco"], "correct_choice_idx": 3, "direct_answers": ["al fresco", "casual dining", "outdoor dining", "outdoor", "causal", "casual", "outdoor", "restaurant", "modern grilling", "beer"], "difficult_direct_answer": true, "rationales": ["This is an italian restaurant.", "That is the dining being served.", "The diners are eating outside."], "image": "val2014/COCO_val2014_000000419216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507937, "question_id": "XmkEGpimW5qq6GcXanfccE", "question": "Which car is in the greatest danger?", "choices": ["grey car", "white bus", "black car", "white truck"], "correct_choice_idx": 2, "direct_answers": ["black car", "black car", "black car", "black", "black car", "black", "black car", "black sedan", "black", "black"], "difficult_direct_answer": false, "rationales": ["The car that's black is in danger.", "There is a car with its nose out into the traffic lane.", "The ebony car attempting to go out into the street would be the most likely to be hit in the event of an accident."], "image": "train2014/COCO_train2014_000000507937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365456, "question_id": "XnSec5o9Kb4xJpbGzouLtx", "question": "What configuration of tennis is being played here?", "choices": ["racial", "swoop hawk", "singles", "doubles"], "correct_choice_idx": 3, "direct_answers": ["doubles", "doubles", "doubles", "doubles", "doubles", "doubles", "doubles", "doubles", "doubles", "doubles"], "difficult_direct_answer": false, "rationales": ["Doubles are played with two people to a side.", "There are two players facing the same direction on the same side of the court.", "The configuration is doubles."], "image": "val2014/COCO_val2014_000000365456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539775, "question_id": "XnSupgqTmHTS28qC8ZNBZx", "question": "How many structures that are attached to the orange beam on the middle boat are pointing toward the sky?", "choices": ["six", "ten", "two", "twelve"], "correct_choice_idx": 0, "direct_answers": ["six", "six", "six", "six", "six", "six", "six", "seven", "six", "six"], "difficult_direct_answer": false, "rationales": ["You can easily count all of the structures.", "You can count them.", "In the middle of the picture are multiple orange beamed structures. if you count them, there are six of them."], "image": "val2014/COCO_val2014_000000539775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432212, "question_id": "XnW2DhuRwb2w73WhjAXaLB", "question": "What is the nature of the mouse closest to the cat?", "choices": ["dead mouse", "glass mouse", "computer mouse", "fast mouse"], "correct_choice_idx": 2, "direct_answers": ["computer mouse", "electronic", "electronic", "computer", "computer mouse", "electronic", "computer mouse", "computer", "electronic", "mechanical"], "difficult_direct_answer": false, "rationales": ["The mouse is used with the keyboard.", "The mouse is a electronic gadget used to navigate the computer.", "The nature is a computer mouse."], "image": "train2014/COCO_train2014_000000432212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458027, "question_id": "Xnf7ZhcRCgunRzUd3ox8nj", "question": "What is the sheep in the cage involved in?", "choices": ["sale", "grooming", "trade", "parade"], "correct_choice_idx": 3, "direct_answers": ["protest", "lawn care", "lawn care", "parade", "yard maintenance", "promotion event", "lawn care", "lawn care", "lawn care", "lawn care"], "difficult_direct_answer": false, "rationales": ["The sheep is located in a cage that is aboard a car float used in parades", "The sheep is being shown off to an audience.", "The sheep is on a float."], "image": "train2014/COCO_train2014_000000458027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256605, "question_id": "XnjvNvghac8HJMJKyNsLMW", "question": "What type of fence is shown here?", "choices": ["wood stockade", "none", "electrified", "barbed wire"], "correct_choice_idx": 2, "direct_answers": ["wire fence", "rope", "rope", "electric", "chain", "wire", "wire", "electrified", "wire fence", "string fence"], "difficult_direct_answer": false, "rationales": ["It is an electrified fence so the animals can't run away", "There are electrified wires in front of the cows.", "The fence in the photo is a wire connected to an electric source and it gives the cows a shock if they make contact. this keeps them from escaping the farm."], "image": "train2014/COCO_train2014_000000256605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162558, "question_id": "Xnm9DQpAtcx8SMvZJPkzzA", "question": "Who is maneuvering the flying object?", "choices": ["man", "boy", "girl", "woman"], "correct_choice_idx": 3, "direct_answers": ["woman", "woman", "woman", "mother", "woman", "parent", "kid", "kite", "woman", "woman"], "difficult_direct_answer": false, "rationales": ["The woman maneuvers.", "The woman seems to be maneuvering the object as shown.", "The flying item in this image is a kite. a kite is controlled by the wind as well as a person on the ground holding it's string. this string is held by the woman in this image."], "image": "train2014/COCO_train2014_000000162558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229140, "question_id": "XnnoAbPzrYYbk2i2psZ6rp", "question": "What is built into the cabinet on the left wall?", "choices": ["sinks", "appliances", "seats", "cages"], "correct_choice_idx": 1, "direct_answers": ["appliances", "refrigerator", "refrigerator", "refrigerator", "fridge", "microwave oven", "oven", "oven", "drawers", "refrigerator"], "difficult_direct_answer": false, "rationales": ["There are some kitchen appliances indented into the walls.", "These are a fridge and ovens", "Appliances are built into the cabinet."], "image": "train2014/COCO_train2014_000000229140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52902, "question_id": "Xo3J4hfARJ8gCt4ShsmdEF", "question": "What's the reaction does the girl express?", "choices": ["anger", "laugh", "smile", "nervous"], "correct_choice_idx": 2, "direct_answers": ["happiness", "happy", "happy", "happy", "happy", "smile", "happy", "happy", "happiness", "happy"], "difficult_direct_answer": false, "rationales": ["The person looks very happy with the phone.", "The girl is smiling.", "The woman looks very happy."], "image": "train2014/COCO_train2014_000000052902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160183, "question_id": "Xo48Xx4GzMzqi67aPht2or", "question": "What types of events does this truck usually respond to?", "choices": ["fires", "shootings", "heists", "robberies"], "correct_choice_idx": 0, "direct_answers": ["fires", "fires", "fires", "fires", "fires", "fires", "fires", "fires", "fires", "fires"], "difficult_direct_answer": false, "rationales": ["A firetruck has a ladder and hoses to put out fires.", "The truck has hoses and a ladder. the name of the department on the side of the truck indicates the events that it responds to.", "A firetruck responds to fire emergencies, and has every conceivable tool and accessory needed to quench flames of almost any size."], "image": "val2014/COCO_val2014_000000160183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513670, "question_id": "XoGheDNxePVRN37MxgyKDb", "question": "What is connecting to all the things in the sky?", "choices": ["string", "balloons", "claws", "batteries"], "correct_choice_idx": 0, "direct_answers": ["strings", "string", "kite string", "string", "string", "string", "string", "string", "kite", "string"], "difficult_direct_answer": false, "rationales": ["The things in the sky are kites and kites are controlled by string which connects to the kites and is held by the people operating them.", "String connects the kites.", "The people are flying kites. kites are attached to strings."], "image": "val2014/COCO_val2014_000000513670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139503, "question_id": "XoPZjtcyidXR7XAHpeordg", "question": "How many keys are on a keyboard?", "choices": ["101", "112", "100", "110"], "correct_choice_idx": 0, "direct_answers": ["many keys", "twenty six", "forty five", "101", "many", "101", "101", "101", "seventy five", "100"], "difficult_direct_answer": false, "rationales": ["A typical keyboard has this many on it.", "On a standard keyboard there are 101 keys.", "There are 101 keys."], "image": "train2014/COCO_train2014_000000139503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408243, "question_id": "XoY6ciiuV978yPdx5zAjz3", "question": "What does the 4 letter acronym on the car relate to?", "choices": ["fire department", "video games", "police department", "repair company"], "correct_choice_idx": 2, "direct_answers": ["police", "san francisco", "police department", "san francisco", "police he", "city police", "police", "police department", "police department", "police department"], "difficult_direct_answer": false, "rationales": ["Sfpd stands for san francisco police department.", "The car is for the police's services.", "It is the letters indicating what town"], "image": "train2014/COCO_train2014_000000408243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129397, "question_id": "Xok6g5nMKzfQba7neHtQBN", "question": "The device the man is on has the same number of wheels as what vehicle?", "choices": ["train", "car", "unicycle", "tank"], "correct_choice_idx": 1, "direct_answers": ["car", "car", "car", "car", "car", "car", "car", "car", "scatting", "car"], "difficult_direct_answer": false, "rationales": ["Both have four wheels on them.", "They both have 4", "Both devices have four wheels."], "image": "train2014/COCO_train2014_000000129397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364770, "question_id": "Xos2ZVXDquQWAqKghi6zu3", "question": "Why are the glasses on a rag next to the sink?", "choices": ["to sell", "for dinner", "storage", "just washed"], "correct_choice_idx": 3, "direct_answers": ["washing", "washing", "drying", "drying", "drying", "drying", "drying", "washing", "just washed", "washing"], "difficult_direct_answer": false, "rationales": ["They are there to dry off", "Due to the close proximity of the basin sink with soap water and the laid out rag, it would be logically that these glasses have just been washed and are drying.", "People put cups upside down to dry."], "image": "train2014/COCO_train2014_000000364770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368218, "question_id": "XozbD7yt7Wjax5PkKs6bDU", "question": "What item here has no calories?", "choices": ["tomatoes", "sauce", "diet coke", "hot dogs"], "correct_choice_idx": 2, "direct_answers": ["diet coke", "diet coke", "diet coke", "diet coke", "carrots", "coke", "diet coke", "diet coke", "drink", "diet coke"], "difficult_direct_answer": false, "rationales": ["The other options have a lot of calories.", "Drinks with that label on them usually have zero calories and no added sugar.", "Most diet drinks do not have calories in them."], "image": "val2014/COCO_val2014_000000368218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385307, "question_id": "Xp7XzG5HdNEydEZpMN36iV", "question": "How many pictures are hanging on the wall?", "choices": ["six", "four", "two", "one"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two pictures hanging on the wall on each side of the door", "There is one on each side of the door frame.", "These are mounted on the wall on each side of the door for decoration."], "image": "train2014/COCO_train2014_000000385307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538741, "question_id": "XpBF3pRuyxjn4iT8PChdny", "question": "What level of expertise have the persons on the left?", "choices": ["pros", "beginners", "olympic", "semi professional"], "correct_choice_idx": 1, "direct_answers": ["beginners", "beginner", "beginners", "beginner", "beginners", "beginner", "beginner", "beginner", "beginners", "beginner"], "difficult_direct_answer": false, "rationales": ["This is the most likely answer given the \"school\" portion of the text on the back fo the green jacketed skier.", "They look like they are new to skiis and trying to figure out how to work them.", "Based on the word \"skischool\" on the man on the right and the uncertain position of the people on the left, it can be assumed they are just now learning how to ski."], "image": "val2014/COCO_val2014_000000538741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449177, "question_id": "XpPPtXqAE67NuoNoQFBthW", "question": "The appliance used for multipurpose toasting and grilling is?", "choices": ["toaster", "otg", "oven", "griller"], "correct_choice_idx": 2, "direct_answers": ["oven", "oven", "oven", "oven", "oven", "oven", "oven", "oven", "stove", "tongs"], "difficult_direct_answer": false, "rationales": ["The appliance is an oven.", "An oven serves these purposes.", "The oven can be used for many purposes."], "image": "train2014/COCO_train2014_000000449177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536444, "question_id": "XpQemGvSMEHCxQ6vb2hLdi", "question": "What are the all looking at?", "choices": ["trees", "clouds", "airplane", "bicycle"], "correct_choice_idx": 2, "direct_answers": ["aeroplane", "airplane", "airplane", "airplane", "plane", "plane", "airplane", "airplane", "plane", "airplane"], "difficult_direct_answer": false, "rationales": ["The people are looking up in the air ad not looking at the biker.", "People stand below an airplane flying overhead.", "The airplane is flying over their heads."], "image": "val2014/COCO_val2014_000000536444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442348, "question_id": "XpezQQkKDLz6sc7AYGoTvX", "question": "Where is this lady situated at?", "choices": ["food court", "restaurant", "home", "hotel room"], "correct_choice_idx": 2, "direct_answers": ["table", "restaurant", "home", "dinning table", "table", "dining table", "kitchen", "restaurant", "dining room", "restaurant"], "difficult_direct_answer": false, "rationales": ["There is a glass display with nice dishes behind her", "The lady is eating at home.", "The woman is eating at her kitchen table."], "image": "val2014/COCO_val2014_000000442348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147276, "question_id": "XpjB5tqjmh9UKMg3QPA8yq", "question": "What are the animals standing on?", "choices": ["hillside", "hay", "water", "glass"], "correct_choice_idx": 0, "direct_answers": ["rocky ground", "goat", "hill", "hillside", "ground", "hillside", "ship", "hillside", "hill", "grass"], "difficult_direct_answer": false, "rationales": ["A group of animals are grazing on a sloped, grassy area.", "The animals are on a hill.", "The ground is slanted so they are likely on a hill."], "image": "train2014/COCO_train2014_000000147276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30565, "question_id": "XpjnAfwTTCGNmDCkA7TuHQ", "question": "What body of water is this?", "choices": ["pool", "pond", "ocean", "swamp"], "correct_choice_idx": 1, "direct_answers": ["pond", "concrete pond", "elephant", "pool water", "manmade pond", "pond", "pond", "pond", "pond", "pond"], "difficult_direct_answer": false, "rationales": ["Elephants are near a small body of water with ducks in it.", "The body of water is small and round with ducks swimming in it.", "The body of water is enclosed."], "image": "train2014/COCO_train2014_000000030565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176278, "question_id": "XpjoKdsJKL6ovheLF9CehD", "question": "What is the possible danger that will occur in the scene?", "choices": ["tree falling", "construction collapsed", "wrong signal", "pedestrian hit"], "correct_choice_idx": 2, "direct_answers": ["traffic", "accident", "accident", "car wreck", "traffic accident", "wrong signal", "hit", "accidents", "accident", "car accident"], "difficult_direct_answer": false, "rationales": ["The danger is the wrong signal.", "The wrong signal could lead to an accident.", "The greatest danger faced in the scene is an error in the signaling for traffic direction."], "image": "val2014/COCO_val2014_000000176278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227000, "question_id": "XpxD63bWPxdoXUvwhikjFw", "question": "What would you pay for if you went up to the green machine?", "choices": ["water", "tires", "parking", "ice cream"], "correct_choice_idx": 2, "direct_answers": ["parking", "parking", "parking", "parking", "parking", "parking", "parking", "parking", "parking", "parking"], "difficult_direct_answer": false, "rationales": ["The machine is a parking machine.", "The green machine has a p on it for parking.", "This is where you pay for your parking space that you parked in"], "image": "train2014/COCO_train2014_000000227000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53608, "question_id": "Xq5MdEAQHLFcy2xAyGyPAj", "question": "What is the man in the apron cooking?", "choices": ["lobster", "lamb", "sausage", "salmon"], "correct_choice_idx": 0, "direct_answers": ["lobster", "lobster claws", "crab", "lobster", "crustaceans", "lobster", "lobster", "lobster", "lobster", "lobster"], "difficult_direct_answer": false, "rationales": ["There are lobsters on the table in front of him.", "This animal has claws and a hard shell.", "There are lobster claws on the table in front of the man."], "image": "train2014/COCO_train2014_000000053608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179374, "question_id": "XqMKa8vHago7SbMBjcd6zA", "question": "What is the name of this stopping technique?", "choices": ["cutting", "braking", "carving", "v-stop"], "correct_choice_idx": 3, "direct_answers": ["pizza", "pizza", "snowplow", "pizza", "snow plough", "snow plough", "v-stop", "snowplow", "cross technique", "stop"], "difficult_direct_answer": false, "rationales": ["The skis are orientated similar to the shape of a letter \"v\" which in skiing would cause a person to stop and give the technique its name.", "The skis are pointed towards each other in a downward way making it look like the letter.", "It's for beginners who need help stopping when going down a slope."], "image": "val2014/COCO_val2014_000000179374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116131, "question_id": "XqQsdcEvXFJhq7DPvuPGUm", "question": "What do you call the unusual image disturbance seen here?", "choices": ["lens flare", "noise", "chromatic aberration", "emboss"], "correct_choice_idx": 2, "direct_answers": ["flash", "chromatic aberration", "messy fridge", "three dimensional", "oversaturation", "double exposure", "overexposure", "static", "discoloration", "rainbow"], "difficult_direct_answer": true, "rationales": ["The edges of objects in the image have wavy, colored lines, and a blue haze over some areas.", "Chromatic aberration disturbs the colors.", "The colors of this image are disturbed and incorrectly placed. this is an effect sometimes associated with images that have been prepared for red-blue 3d image viewing."], "image": "train2014/COCO_train2014_000000116131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364590, "question_id": "XqbLgWUS3stGeS6V8FRYto", "question": "What does this person prepare for?", "choices": ["sale", "parade", "bathing contest", "dinner"], "correct_choice_idx": 1, "direct_answers": ["summer", "video shoot", "hot day", "parade", "festival", "parade", "parade", "comedy", "party", "parade"], "difficult_direct_answer": false, "rationales": ["The colorful costume is worn for such an event.", "This looks like a outfit that one would wear at a gay pride parade.", "The person is on a parade float."], "image": "train2014/COCO_train2014_000000364590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567944, "question_id": "Xqbg2F9kNBPu4Ueyu8dvC8", "question": "Why is the motorcycle rider wearing a red helmet?", "choices": ["punishment", "fashion", "visibility", "protection"], "correct_choice_idx": 3, "direct_answers": ["safety", "safety", "protection", "protection", "visibility", "for safety", "protection", "for safety", "for safety", "safety feel"], "difficult_direct_answer": false, "rationales": ["This rider is wearing a helmet probably because he has seen other bikers get really bad skull injuries at some point, so he's going to do what he can to protect his head.", "The rider needs protection.", "Helmet protect brains."], "image": "val2014/COCO_val2014_000000567944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494580, "question_id": "XqdN9C57Zmw26Ciu3Y4tQm", "question": "What country's citizens are famous for eating this food combination?", "choices": ["south africa", "philippines", "fiji", "england"], "correct_choice_idx": 3, "direct_answers": ["united kingdom", "uk", "english", "england", "england", "england", "asia", "england", "fish chips", "british"], "difficult_direct_answer": false, "rationales": ["Fish and chips are very common in the united kingdom.", "The citizens are from england.", "Fish and chips are considered a traditional british food combination although the fish is normally battered."], "image": "val2014/COCO_val2014_000000494580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422157, "question_id": "XrANDjxvTocagvoHygE2oo", "question": "What is the speed limit of school bus?", "choices": ["60mph", "75mph", "70mph", "50mph"], "correct_choice_idx": 3, "direct_answers": ["forty", "50mph", "30mph", "40", "fifteen mph", "not fast", "40 mph", "route dependent", "25", "fifteen mph"], "difficult_direct_answer": true, "rationales": ["The question is not related to the image but is internet searchable.", "Buses can't move that fast since they carry kids.", "The speed limit is pretty slow."], "image": "val2014/COCO_val2014_000000422157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484570, "question_id": "XrRJaGMx4E4vPpwjnCPpxG", "question": "Why might the air they breathe be thinner than normal?", "choices": ["smoke", "high altitude", "cold weather", "hot weather"], "correct_choice_idx": 1, "direct_answers": ["high altitude", "higher altitude", "elevation", "altitude", "altitude", "elevation", "higher", "height", "high elevation", "high altitude"], "difficult_direct_answer": false, "rationales": ["The people are on a mountain.", "They are near the top of a mountain. the bottom of the mountain has a significantly higher air pressure.", "People are standing at the top of a tall mountain with snow on it."], "image": "val2014/COCO_val2014_000000484570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198768, "question_id": "XrWY3edGGFFJGcJamzv8ui", "question": "Where will the tennis ball next go?", "choices": ["up", "bait shop", "down", "pocket"], "correct_choice_idx": 0, "direct_answers": ["over net", "to opponent", "air", "other side", "across court", "up", "air", "unknown", "in air", "opponent"], "difficult_direct_answer": true, "rationales": ["It'll go up.", "The man will throw it in the air in order to hit it over the net.", "The tennis ball needs to be served."], "image": "val2014/COCO_val2014_000000198768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83658, "question_id": "Xrsp6w8aN442cjJPFxbsq2", "question": "This water can be described as what?", "choices": ["clean", "boiling", "dirty", "frozen"], "correct_choice_idx": 2, "direct_answers": ["clear", "shallow pond", "murky", "shallow", "river", "muddy", "drinking", "dirty", "drinking", "still"], "difficult_direct_answer": true, "rationales": ["It is not clear.", "The water is surrounded by mud and has mud in it.", "The water is murky."], "image": "val2014/COCO_val2014_000000083658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325989, "question_id": "Xs89igthVcq4CjtYHJB5Km", "question": "The people sitting on the couch are competing in what on the television?", "choices": ["tekken", "street fighter", "mario kart", "smash brothers"], "correct_choice_idx": 2, "direct_answers": ["video game", "gaming", "game competition", "mario kart", "mario kart", "video games", "racing game", "racing", "racing game", "video game"], "difficult_direct_answer": false, "rationales": ["They play mario kart.", "The people are playing mario kart based on what's on the screen.", "They are playing a kart racing game."], "image": "train2014/COCO_train2014_000000325989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114288, "question_id": "XsCVkuB7BuCK2MgzjdkjtR", "question": "For what reason is the buses door open here?", "choices": ["loading passengers", "stop light", "driving", "collecting donations"], "correct_choice_idx": 0, "direct_answers": ["passenger onboarding", "loading passengers", "bus stop", "loading", "pick up", "boarding passengers", "stopping", "to board", "bus stop", "boarding"], "difficult_direct_answer": true, "rationales": ["It is open so passengers can get on the bus.", "The door is open so the riders can board the bus.", "The bus riders are getting ready to board the bus."], "image": "train2014/COCO_train2014_000000114288.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355552, "question_id": "XsE9HpWhr3yHKEygqkQKS4", "question": "What is the area the people are standing at called?", "choices": ["observation deck", "picnic", "garage", "porch"], "correct_choice_idx": 0, "direct_answers": ["balcony", "ski resort", "deck", "deck", "deck", "chalet", "deck", "deck", "deck", "observation deck"], "difficult_direct_answer": false, "rationales": ["People are standing on the deck of a boat.", "The are is used to observe wide areas in all direction.", "This deck's has a high vantage point atop a ski lift and view of mountains."], "image": "val2014/COCO_val2014_000000355552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62060, "question_id": "XsFGZpS8VVCKw4YdyHHW7u", "question": "Why is the man holding up his leg?", "choices": ["skipping", "stretching", "dancing", "jumping"], "correct_choice_idx": 1, "direct_answers": ["stretching", "itching", "hand", "stretching", "stretching", "scratching", "stretching", "stretching", "scratching", "scratching"], "difficult_direct_answer": false, "rationales": ["It is good to warm up muscles before running or jumping.", "The man is trying to stretch.", "Moving his legs to stretch it out"], "image": "val2014/COCO_val2014_000000062060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74599, "question_id": "XsMkkWevHLRhS67DuDSZHH", "question": "As is where does the ball have zero chance of going after he hits it?", "choices": ["to car", "to girl", "to tree", "down manhole"], "correct_choice_idx": 3, "direct_answers": ["road", "in drain", "behind him", "behind him", "across town", "behind him", "sewer", "behind him", "manhole", "down manhole"], "difficult_direct_answer": false, "rationales": ["The hole in the ground is covered.", "The sewer cover is covered and has very small holes.", "The ball cannot go down the manhole if it is going forward."], "image": "train2014/COCO_train2014_000000074599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117781, "question_id": "Xt6D2ibr8XGR3MuawFKE6o", "question": "What is the woman with a big blue umbrella doing?", "choices": ["sightseeing", "boat racing", "commuting", "selling stuff"], "correct_choice_idx": 3, "direct_answers": ["sailing", "selling", "selling stuff", "selling", "sitting", "hawker", "sitting", "sailing", "looking", "sailing"], "difficult_direct_answer": false, "rationales": ["It's hard to tell from the image, but this appears to be the case.", "The woman with a big blue umbrella has items she's trying to sell.", "The boat loads of product in front of this woman; as well as her outstretched arms appearing to give something to the crowd gathered, suggest she is running a business here."], "image": "train2014/COCO_train2014_000000117781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287884, "question_id": "XtAMb4MxabJKSfx22PZ9ua", "question": "What type of board is this?", "choices": ["topple board", "skate board", "snow board", "balance board"], "correct_choice_idx": 1, "direct_answers": ["skateboard", "skate board", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["The person is on a flat board that doesn't have wheels.", "Is the most likely answer given the background skate area and time of year.", "People ride skateboards in urban areas."], "image": "train2014/COCO_train2014_000000287884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45446, "question_id": "XtbAyeuPCR2sphP7mGWUJ2", "question": "What kind of trash can the company advertising on yellow sign help with?", "choices": ["construction", "appliance", "recyclable", "leaf"], "correct_choice_idx": 3, "direct_answers": ["leaf", "leaves", "yard", "yard waste", "leaf", "leaf", "leaves", "leaf removal", "leaves", "leaves"], "difficult_direct_answer": false, "rationales": ["The trash is for leaves.", "The sign is advertising leaf removal.", "There are words on the yellow sign."], "image": "train2014/COCO_train2014_000000045446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181013, "question_id": "XtgyRp4CePSqUgEdhDfhnB", "question": "What does the guy in black want?", "choices": ["touch other", "buy frisbee", "trip other", "grab frisbee"], "correct_choice_idx": 3, "direct_answers": ["get frisbee", "stop throw", "grab it", "frisbee", "frisbee", "steal", "frisbee", "frisbee", "frisbee", "grab frisbee"], "difficult_direct_answer": false, "rationales": ["The men are clearly playing frisbee and as the man in black is not holding the frisbee currently and is making a gesture to try to block, he is likely trying to gain possession.", "The person is reaching for a frisbee the person in front of him is holding.", "Playing a game with a frisbee involves trying to take it away from your opponent"], "image": "val2014/COCO_val2014_000000181013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271681, "question_id": "Xu7YKoEjKhJaWbQWN7iSe5", "question": "What material are the earrings made of?", "choices": ["crystal", "jade", "metal", "plastic"], "correct_choice_idx": 2, "direct_answers": ["metal", "plastic", "plastic", "metal", "metal", "plastic", "plastic", "plastic", "plastic", "metal"], "difficult_direct_answer": false, "rationales": ["The studs need to be hard enough to pass through the hole in the ear.", "You can tell by the color as they don't fit the other choices.", "They appear to be metallic in design and that is common for earrings."], "image": "val2014/COCO_val2014_000000271681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298306, "question_id": "XuAPhLmNuRqShf5JPvB5Lt", "question": "What surface are they playing on?", "choices": ["carpet", "outdoor hard", "grass", "clay"], "correct_choice_idx": 1, "direct_answers": ["tennis court", "concrete", "tennis court", "tennis court", "artificial", "outdoor hard", "hard", "tennis court", "cement", "clay"], "difficult_direct_answer": false, "rationales": ["They play outdoors.", "They are playing on a hard tennis court.", "They seems to be out having good times."], "image": "train2014/COCO_train2014_000000298306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20106, "question_id": "XuG3gSySm9k3gCHk6TKwLz", "question": "This man likely has origins in what country?", "choices": ["finland", "mexico", "siberia", "rwanda"], "correct_choice_idx": 1, "direct_answers": ["mexico", "india", "mexico", "india", "middle east", "peru", "not clear", "mexico", "asia", "india"], "difficult_direct_answer": false, "rationales": ["Out of the four choices he looks most like he is from mexican decent and has the typical features of a mexican man.", "The man has dark skin.", "The man looks to be hispanic based on his skin color and look and answer a contains many hispanic people compared to the other answers on the list."], "image": "train2014/COCO_train2014_000000020106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243912, "question_id": "XuJ4GMACLxkiUzMMJECx7i", "question": "Where would you normally see the blue thing on the dog?", "choices": ["suit", "forehead", "feet", "hands"], "correct_choice_idx": 0, "direct_answers": ["human neck", "human", "around person", "on men", "around neck", "man", "suit wearer", "suit", "human neck", "man's neck"], "difficult_direct_answer": true, "rationales": ["The blue thing is on a suit.", "The blue thing is a tie. it normally would be worn as part of a formal outfit.", "Ties are worn with suits."], "image": "val2014/COCO_val2014_000000243912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413248, "question_id": "XuNoGfZ5DPBuSTZve5Ly9r", "question": "What type of transportation is shown?", "choices": ["water", "air", "rail", "road"], "correct_choice_idx": 3, "direct_answers": ["scooter", "scooter", "motorcycle", "scooter", "moped", "scooty", "scooter", "scooter", "road", "scooter"], "difficult_direct_answer": false, "rationales": ["There is a motorcycle on the road. motorcycles ride on the road.", "The transportation is on the road.", "The motorcycle is on the road."], "image": "train2014/COCO_train2014_000000413248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414683, "question_id": "Xuc5NVdy7iJgwEbYrgvAnn", "question": "What will number 30 do next?", "choices": ["coach", "bat", "outfield pickup", "catch"], "correct_choice_idx": 1, "direct_answers": ["hit baseball", "swing", "hit ball", "walk", "bat", "await pitch", "batting stance", "bat baseball", "bat", "bat"], "difficult_direct_answer": false, "rationales": ["This baseball player is standing in the \"batter's box.\" he isn't wearing a fielder's glove, it's obvious he is not a coach and outfield pickup makes no sense.", "Number 30 is going up to homeplate to take his turn as the batter.", "The player has a bat to hit the ball."], "image": "train2014/COCO_train2014_000000414683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248709, "question_id": "XuktKuovS3acqABDwLQYMa", "question": "The book was based on a series of what by the author?", "choices": ["songs", "seminars", "dreams", "sermons"], "correct_choice_idx": 1, "direct_answers": ["paul miller", "biography", "essays", "novels", "things", "religion", "paul miller", "seminars", "field notes", "events"], "difficult_direct_answer": true, "rationales": ["The book on the table has words on it that explain the title.", "The were several seminars that were held from the book.", "The series is based on the author's seminar lectures."], "image": "train2014/COCO_train2014_000000248709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315974, "question_id": "XumWDBsweJqPYwNFXaACAp", "question": "What finger is shown on the left side of the photo?", "choices": ["ring", "thumb", "middle", "pointer"], "correct_choice_idx": 3, "direct_answers": ["index", "index", "pointer", "index", "index", "pointer", "index", "pointer", "index", "pointer"], "difficult_direct_answer": false, "rationales": ["The finger is pointing.", "The pointer finger is used to point.", "The person is pointing to the item with their index finger."], "image": "val2014/COCO_val2014_000000315974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116663, "question_id": "Xv6f8SgU97PF82ZNFK47wY", "question": "The device in front of the beige car parked on the side of the street can be used for what purpose?", "choices": ["parking payment", "atm withdrawal", "fire alarm", "police alert"], "correct_choice_idx": 0, "direct_answers": ["parking", "riding", "see rearview", "parking payment", "parking", "steering", "parking", "parking", "rain", "light"], "difficult_direct_answer": false, "rationales": ["A parking meter can be sed to pay for your parking spot while you are parked there.", "This is a parking meter that you can pay for your space.", "The device is for parking."], "image": "train2014/COCO_train2014_000000116663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128093, "question_id": "XvFz3c3cu3aGvso87Lidw3", "question": "What often goes on top of his food?", "choices": ["ketchup", "custard", "frosting", "jam"], "correct_choice_idx": 0, "direct_answers": ["mustard", "mustard", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "ketchup", "mustard", "mustard"], "difficult_direct_answer": false, "rationales": ["The object in question is visibly a hotdog based on the size, shape and composition. this food is commonly known to be served with answer a.", "The ketchup is on top.", "The boy is eating a hot dog. sugary toppings, like jam, custard, and frosting, would not be suitable."], "image": "train2014/COCO_train2014_000000128093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59015, "question_id": "XvPhJzkvC96J4eyCXNz8sB", "question": "Why are there wood planks on the barge?", "choices": ["partying", "decoration", "traction", "building"], "correct_choice_idx": 2, "direct_answers": ["load truck", "load pickup", "transport car", "smoothing surface", "walking surface", "lift car", "distribute weight", "support", "traction", "flooring support"], "difficult_direct_answer": true, "rationales": ["The planks are for traction.", "The planks are needed for traction.", "A car is parked on the boards so it needs traction."], "image": "val2014/COCO_val2014_000000059015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386341, "question_id": "XvVh9taYe9AinKwJo9ACrR", "question": "What is the full name of the building ahead?", "choices": ["basketorvet", "husketorvet", "kesketorvet", "fisketorvet"], "correct_choice_idx": 3, "direct_answers": ["fisketorvet", "fisketorvet", "sketorvet", "fisketorvet", "fisketorvet", "isketorvet", "arena", "fisketorvet", "unknown", "fisketorvet"], "difficult_direct_answer": false, "rationales": ["There is an \"i\" partially visible before the \"s\". answer a is the only option with this combination.", "I did an internet search on copenhagen mall using the portion of the name visible on the building to find the answer.", "The name is fisketorvet."], "image": "train2014/COCO_train2014_000000386341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128137, "question_id": "XvYUfRHn5u78osxFm3qgLR", "question": "How was this type of bike made?", "choices": ["trade", "custom", "retail", "assembly line"], "correct_choice_idx": 1, "direct_answers": ["welding", "by mechanics", "with spikes", "machine made", "custom", "shop", "custom", "custom", "custom design", "specialized mechanic"], "difficult_direct_answer": false, "rationales": ["The bike is custom.", "The bike has intricate painting on it.", "This type of bike is made in custom media."], "image": "train2014/COCO_train2014_000000128137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526432, "question_id": "XvoXbZhu4MFBgB5nogh88W", "question": "What type of worker sits here?", "choices": ["janitor", "construction", "dentist", "clerical"], "correct_choice_idx": 3, "direct_answers": ["analyst", "cubicle", "office worker", "office worker", "office worker", "office", "office", "clerical", "office", "office worker"], "difficult_direct_answer": false, "rationales": ["There are workers who do clerical roles in this cubicle.", "This area is inside an office. there is a computer and a filing cabinet.", "Office workers sit in white collar settings."], "image": "train2014/COCO_train2014_000000526432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61997, "question_id": "XvsF62tZxhpt7WSMPn7xrR", "question": "What activity can you observe here?", "choices": ["wood carving", "dog grooming", "painting", "skiing"], "correct_choice_idx": 2, "direct_answers": ["painting", "art", "painting", "painting", "painting", "painting", "painting", "painting", "drawing", "art"], "difficult_direct_answer": false, "rationales": ["A man stands in front of an easel with a brush in her hand.", "There are a lot of paintings on the walls.", "The activity is painting."], "image": "val2014/COCO_val2014_000000061997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362198, "question_id": "XvspLEJsPyFy9aFsYVm97e", "question": "What material is the sink made of?", "choices": ["stainless steel", "wood", "plastic", "porcelain"], "correct_choice_idx": 0, "direct_answers": ["steel", "metal", "metal", "stainless steel", "metal", "steel", "metal", "stainless steel", "metal", "steel"], "difficult_direct_answer": false, "rationales": ["The material is steel.", "You can see it's silver so it's made of a steel material.", "The silver sink is made of steel."], "image": "train2014/COCO_train2014_000000362198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280761, "question_id": "XwDPP3FKfWqAzkraHSq5h4", "question": "What does the woman appear to be doing outdoors?", "choices": ["biking", "spectating", "waiting", "playing music"], "correct_choice_idx": 3, "direct_answers": ["working", "watching", "work", "using computer", "playing music", "working", "analyzing computer", "computing", "working", "using computer"], "difficult_direct_answer": false, "rationales": ["The woman is regarding some electronics and is holding headphones to her ear. people have headphones on when they are playing music that might be what the electronics are in use for.", "She is a do and playing music.", "She is djing a party"], "image": "val2014/COCO_val2014_000000280761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370210, "question_id": "XweR9Yiktpzm4dXN2WYMUD", "question": "Who might they be watching for?", "choices": ["frogs", "birds", "owner", "cat"], "correct_choice_idx": 2, "direct_answers": ["audience", "owner", "owner", "owner", "owner", "audience", "audience", "owner", "owner", "owner"], "difficult_direct_answer": false, "rationales": ["They're sitting in the car waiting for them to come back.", "The dogs are waiting for their human to come back to the car.", "They want the owner."], "image": "val2014/COCO_val2014_000000370210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370890, "question_id": "XxG5Z7Ro6h8zrwK5TUoupw", "question": "What is most obviously being enacted upon him?", "choices": ["water pressure", "wind", "gravity", "extreme heat"], "correct_choice_idx": 2, "direct_answers": ["gravity", "snow", "gravity", "gravity", "flying", "riding", "gravity", "gravity", "gravity", "gravity"], "difficult_direct_answer": false, "rationales": ["The gravity will pull him down.", "Gravity keeps him falling.", "Gravity is having an effect on him because he is going down the slope and is airborne."], "image": "val2014/COCO_val2014_000000370890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64760, "question_id": "XxLWDh7wyQz5gLpT5rzh7S", "question": "What will these ladies next do with the pizzas?", "choices": ["bake", "serve", "throw away", "eat"], "correct_choice_idx": 0, "direct_answers": ["bake", "bake", "cook", "cook them", "cook pizzas", "bake them", "serve them", "serve them", "serve", "bake"], "difficult_direct_answer": false, "rationales": ["These ladies are baking pizzas.", "The women would bake the pizzas next.", "They are making them and then will put them in the oven to cook."], "image": "val2014/COCO_val2014_000000064760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563061, "question_id": "XxZbhwrcmqVcfG5mbsXQQz", "question": "If this is their natural habitat what continent are they on?", "choices": ["africa", "north america", "australia", "europe"], "correct_choice_idx": 0, "direct_answers": ["africa", "africa", "africa", "africa", "africa", "africa", "africa", "africa", "africa", "africa"], "difficult_direct_answer": false, "rationales": ["The animals in the image are zebras. zebras historically are from the african continent when they are in their natural habitat.", "Zebras are from africa.", "They live in africa."], "image": "train2014/COCO_train2014_000000563061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260513, "question_id": "XxenspBmFc8P3aAtAt97HH", "question": "How much did the owner of this bike put into the parking meter here?", "choices": ["$1", ".25", "penny", "none"], "correct_choice_idx": 3, "direct_answers": ["nothing", "none", "zero dollars", "1dollar", "no money", "nothing", "zero", "zero dollars", "nothing", "no effort"], "difficult_direct_answer": false, "rationales": ["There seemed to be a snow storm and the bike was left on the meter since it is covered in layers of snow. therefore there is no way the owner left any time on the meter.", "A bike is parked at a meter with snow piled up all around.", "Looks like they did not touch it as there is snow covering it all up."], "image": "train2014/COCO_train2014_000000260513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307966, "question_id": "XxnmvyfmSugeQVbqsSMris", "question": "What type of dish is this?", "choices": ["dessert", "side dish", "entree", "appetizer"], "correct_choice_idx": 2, "direct_answers": ["dinner", "healthy", "fish", "seafood", "dinner", "plate", "broccoli", "healthy meal", "entree", "dinner"], "difficult_direct_answer": false, "rationales": ["A plate of rice and vegetables is prepared and garnished with lemon wedges.", "A plate has rice and vegetables on it. the plate is a large size, commonly used for main dishes.", "It is a main course because it has rice, vegetables, lemons, and a piece of cooked meat on top of it."], "image": "train2014/COCO_train2014_000000307966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457587, "question_id": "Xy37Lo92Kz8XNkRAhJQPBn", "question": "What is the gold framed area against the back wall used to hold?", "choices": ["fire", "water", "pictures", "books"], "correct_choice_idx": 0, "direct_answers": ["fire", "fire", "fire", "paintings", "portrait", "fireplace", "fireplace", "fire", "firewood", "fire"], "difficult_direct_answer": false, "rationales": ["The area is the fireplace.", "The gold framed area is the fireplace.", "The place that's framed is a fireplace."], "image": "train2014/COCO_train2014_000000457587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190115, "question_id": "Xy7Mki2rF5DQSm5QsaeDxY", "question": "What type of person could be eating the plain looking pizza?", "choices": ["omnivore", "vegetarian", "carnivore", "pescatarian"], "correct_choice_idx": 1, "direct_answers": ["vegetarian", "man", "man", "hungry person", "hungry person", "normal person", "tourist", "vegetarian", "vegetarian", "vegetarian"], "difficult_direct_answer": false, "rationales": ["A vegetarian could eat a cheese pizza. there's no meat.", "The pizza doesn't have any meat and has a bowl of salad.", "There is no meat on the pizza. vegetarians don't eat meat."], "image": "val2014/COCO_val2014_000000190115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81819, "question_id": "XyNrWmH55wuWZdxBJzSV8S", "question": "What is he doing?", "choices": ["packing up", "resting", "eating lunch", "painting train"], "correct_choice_idx": 0, "direct_answers": ["packing up", "painting", "painting", "standing", "observing", "painting", "painting", "sketching", "photo", "standing"], "difficult_direct_answer": false, "rationales": ["The man is taking down his easel.", "He's packing.", "The man seemed to be painting on the side of the street but is putting their stuff away."], "image": "train2014/COCO_train2014_000000081819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64744, "question_id": "XyUhL63uhBQBSoFqJvTF8b", "question": "What are they doing on the bike?", "choices": ["cleaning p", "fighting", "showing off", "saving money"], "correct_choice_idx": 2, "direct_answers": ["tricks", "parade", "acrobatics", "tricks", "tricks", "stunt", "tricks", "performance", "standing", "showing off"], "difficult_direct_answer": false, "rationales": ["The people show off.", "They are on a bike performing a circus trick.", "The men are showing off their skills."], "image": "train2014/COCO_train2014_000000064744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33751, "question_id": "XyYt8RuAYJmzQfFbTwsDzB", "question": "How is this laptop connected to the network at this location?", "choices": ["dial-up modem", "wired ethernet", "wi-fi", "cellular modem"], "correct_choice_idx": 1, "direct_answers": ["wifi", "cable", "ethernet", "wifi", "wires", "cables", "cord", "internet", "ethernet", "wired ethernet"], "difficult_direct_answer": false, "rationales": ["The laptop is connected by the ethernet.", "The laptop is connected through the wire cables.", "It has a lot of cords and is in an office"], "image": "train2014/COCO_train2014_000000033751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543600, "question_id": "Xyhu8utdzqbZPP8fA8t5KH", "question": "Which force is likely to be a more sustained one acting on the person here?", "choices": ["wave", "sail", "shark", "drone"], "correct_choice_idx": 1, "direct_answers": ["wind", "wind", "gravity", "wind", "wind", "sail", "wave", "tide", "wave", "wind"], "difficult_direct_answer": false, "rationales": ["The sail is catching air the whole time the surfer is moving where as the waves come through and pass.", "The force is the sail.", "You can see how the wind has the sail"], "image": "val2014/COCO_val2014_000000543600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334884, "question_id": "XyjRMtQC72arPPPPBPz94n", "question": "This food would best be described as what?", "choices": ["salty", "dessert", "healthy", "fattening"], "correct_choice_idx": 2, "direct_answers": ["veggie bowl", "asian", "veg food", "healthy", "balanced diet", "healthy", "salad", "healthy", "vegetarian", "salad"], "difficult_direct_answer": false, "rationales": ["The food is healthy.", "The bowl is filled with vegetables and beans which are healthy foods.", "The vegetables and legumes present in this salad type dish would be considered low fat and nutritious by most."], "image": "val2014/COCO_val2014_000000334884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354179, "question_id": "XzBKGFZp3mwsQPw36iHtTH", "question": "Which type of weather event is most likely to happen immediately after this photo takes place?", "choices": ["hurricane", "overcast weather", "snow", "hail"], "correct_choice_idx": 1, "direct_answers": ["rain", "overcast weather", "rain", "sunshine", "sunshine", "rain", "rain", "rain", "sunshine", "rain"], "difficult_direct_answer": false, "rationales": ["It looks like clouds are rolling in and they'll block out the sun when they arrive, so the weather will become overcast at that time.", "The image appears to have clouds on the horizon which might cause answer a in the near future.", "It is an overcast weather because there are many clouds in the sky"], "image": "train2014/COCO_train2014_000000354179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283659, "question_id": "XzGpB2CXD5nwJXVDXmMkeB", "question": "Why is the man holding onto a rope?", "choices": ["flying kite", "climbing", "gliding", "wind surfing"], "correct_choice_idx": 3, "direct_answers": ["windsurfing", "to balance", "parasailing", "wind surfing", "move faster", "balance", "sea gliding", "parasailing", "glide", "surfboard parasailing"], "difficult_direct_answer": true, "rationales": ["This activity only happens in water.", "The man is holding onto a lead for wind surfing.", "The string goes to a kite above which is blown by the wind and he's trying to control it while using his board to navigate the surface of the water."], "image": "val2014/COCO_val2014_000000283659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280266, "question_id": "XzHTNxDPhJPiVCfWgFV3CA", "question": "What should one be good at before one begins to learn this sport?", "choices": ["jumping", "flipping", "dancing", "swimming"], "correct_choice_idx": 3, "direct_answers": ["swim", "swimming", "swimming", "swimming", "swim", "swimming", "swimming", "swimming", "swimming", "swimming"], "difficult_direct_answer": false, "rationales": ["People need to swim to surf.", "One should swim since there's a high chance of falling off the board.", "Swimming is required."], "image": "train2014/COCO_train2014_000000280266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521301, "question_id": "XzVCZJG44xNHmWUMhvMFmn", "question": "This animal is most closely related to what other animal?", "choices": ["mollusk", "donkey", "badger", "possum"], "correct_choice_idx": 1, "direct_answers": ["pony", "donkey", "donkey", "donkey", "mule", "pony", "pony", "donkey", "donkey", "donkey"], "difficult_direct_answer": false, "rationales": ["The animal in the picture is a horse. in looking at its head, ears, mane and feet, it very closely resembles a donkey.", "The animal pulling the cart is a horse. a horse is like a donkey.", "This is an animal that looks almost identical"], "image": "train2014/COCO_train2014_000000521301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197254, "question_id": "Y23KDe6LLKLidrovmBrzFN", "question": "What is held by the person who sits upon the black seat high behind the horses?", "choices": ["train ticket", "reins", "movie ticket", "winning ticket"], "correct_choice_idx": 1, "direct_answers": ["reins", "reins", "reins", "harness", "reins", "reigns", "reigns", "conductor", "reins", "coachman"], "difficult_direct_answer": false, "rationales": ["The other options don't match this setting or method of transportation.", "The person is holding the reigns to the horse.", "The person sitting in the seat has to steer the horses with the reins."], "image": "val2014/COCO_val2014_000000197254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349575, "question_id": "Y2YB2AvTkrfof4bpDUWXjM", "question": "In what session of the day is it?", "choices": ["evening", "morning", "dusk", "afternoon"], "correct_choice_idx": 3, "direct_answers": ["afternoon", "afternoon", "afternoon", "second", "afternoon", "afternoon", "three", "afternoon", "daytime", "afternoon"], "difficult_direct_answer": false, "rationales": ["The sun is out and the clock shows it's the afternoon.", "The time is the afternoon.", "The sun looks like it's high in the sky, given the shadows, so it must be sometime in the afternoon."], "image": "val2014/COCO_val2014_000000349575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438481, "question_id": "Y2hVfEkfKwHnJ37Li74MnR", "question": "What does the blue sign with a white arrow mean?", "choices": ["ahead only", "parking", "stop", "no turns"], "correct_choice_idx": 0, "direct_answers": ["up", "go straight", "forward motion", "look out", "direction", "up", "go straight", "ahead only", "up", "straight ahead"], "difficult_direct_answer": false, "rationales": ["The direction on the sign is the way of the street.", "The arrow only points one direction.", "It might also mean b depending on the location, but typically it just means a."], "image": "val2014/COCO_val2014_000000438481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515508, "question_id": "Y3CQHxCesa4Y8GssoqkuXv", "question": "What might ruin the day of people shown here?", "choices": ["hot weather", "freezing cold", "snow", "nothing"], "correct_choice_idx": 0, "direct_answers": ["warm weather", "hot weather", "hit tree", "white out", "rain", "injury", "fall", "rain", "rain", "blizzard"], "difficult_direct_answer": false, "rationales": ["The sun and got temperatures will melt.", "Skiing must be done on snow, which can only exist in cold weather, otherwise it would melt.", "The people are skiing. if it gets too warm, the snow would melt."], "image": "train2014/COCO_train2014_000000515508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403489, "question_id": "Y3EPyYs8fgWq2tiYc89tJA", "question": "What sort of interruption stopped this person?", "choices": ["police questioning", "phone call", "baby accident", "citizen arrest"], "correct_choice_idx": 1, "direct_answers": ["phone call", "people", "phone call", "phone call", "phone call", "phone call", "phone call", "phone", "phone call", "people"], "difficult_direct_answer": false, "rationales": ["The man is using a phone.", "The person is facing away because he had to take a phone call.", "The person is holding a device that is the size, shape and design of a cell phone and is being held to their head in the way one would do if they were doing answer a."], "image": "train2014/COCO_train2014_000000403489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268742, "question_id": "Y3J2eFSM26TAZBGtsK3iVE", "question": "How many types of ceramics are there?", "choices": ["three", "six", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["four", "six", "five", "four", "twelve", "three", "six", "four", "eight", "china"], "difficult_direct_answer": false, "rationales": ["There are five different types.", "Three ceramics are shown.", "There are this many different colors"], "image": "val2014/COCO_val2014_000000268742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320245, "question_id": "Y3UEuEECE7EyN8nE2phfaH", "question": "How much does the combo cost?", "choices": ["$5", "EUR15", "EUR10", "EUR5"], "correct_choice_idx": 3, "direct_answers": ["5 euro", "5 euros", "five euros", "five lira", "five", "5 euro", "five euros", "5 euros", "five euros", "EUR5"], "difficult_direct_answer": false, "rationales": ["The sign says the combo is five euros.", "A restaurant displays specials on the front windows along with the cost.", "There is a sign saying the sandwich and coffee costs five euros."], "image": "train2014/COCO_train2014_000000320245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242855, "question_id": "Y43oH8uKf4bnLYmiM7kUrU", "question": "Which dairy product is most prominent here?", "choices": ["cottage cheese", "sour cream", "cheese", "milk"], "correct_choice_idx": 1, "direct_answers": ["cream", "butter", "sour cream", "sour cream", "cream", "cheese", "sour cream", "nonveg", "sour cream", "sour cream"], "difficult_direct_answer": false, "rationales": ["A white dollop with potatoes is often sour cream.", "Sour cream is at the top of the plate. it is a dairy product.", "The sour cream on the plate."], "image": "val2014/COCO_val2014_000000242855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135463, "question_id": "Y49seLRDgrTVdR5RDrJKZw", "question": "What are two people on the right going to do next?", "choices": ["walk around", "board bus", "drive car", "cross street"], "correct_choice_idx": 1, "direct_answers": ["board bus", "walking", "board bus", "walking", "board bus", "board bus", "board bus", "walking", "board bus", "board bus"], "difficult_direct_answer": false, "rationales": ["They are standing at a bus stop and the bus has stopped to allow them on.", "These two people are lined up and await the bus in front of them to open it's doors so they can get onto it.", "A city transit vehicle is stopped in the middle of the road. they are in the middle of the sidewalk facing towards it with feet slightly lifted in the air."], "image": "train2014/COCO_train2014_000000135463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466784, "question_id": "Y4WfcBNn3d37uNuvfUUDQC", "question": "What is unusual about their outfits?", "choices": ["pants", "shirts", "material", "ties"], "correct_choice_idx": 3, "direct_answers": ["ties", "ties", "girl ties", "men's neckties", "colors", "ties", "ties", "ties", "all ties", "wearing ties"], "difficult_direct_answer": false, "rationales": ["Women don't wear ties.", "They have ties.", "The women have ties on."], "image": "val2014/COCO_val2014_000000466784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129942, "question_id": "Y4awjsruCNq3o7ALQHxRKS", "question": "What kind of bird are these?", "choices": ["sparrow", "parrot", "crow", "bluebird"], "correct_choice_idx": 1, "direct_answers": ["parrot", "parrots", "macaw", "parrot", "macaw", "parrot", "parrots", "parrot", "parrot", "macaws"], "difficult_direct_answer": false, "rationales": ["The bird is a parrot.", "These colorful birds are parrots.", "Parrots have colorful feathers."], "image": "val2014/COCO_val2014_000000129942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37286, "question_id": "Y4ow4kvKNZEY7K5Cyb3DcC", "question": "Who are bananas being given out to?", "choices": ["zoo animals", "bakers", "marathon runners", "children"], "correct_choice_idx": 2, "direct_answers": ["runners", "marathon runners", "runners", "athletes", "customers", "athletes", "runners", "athletes", "runners", "runners"], "difficult_direct_answer": false, "rationales": ["The people are wearing running attire.", "They are giving them to the competitors.", "The bananas are going to the racers."], "image": "train2014/COCO_train2014_000000037286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278224, "question_id": "Y4sBFwrFeWT3ZtEM3TVDwc", "question": "What are the most acidic red wines?", "choices": ["sancerre", "sauvignon blanc", "champagne", "vouvray"], "correct_choice_idx": 1, "direct_answers": ["cabernet sauvignon", "12", "pinot noir", "pinot noir", "full-bodied wines", "cabernet", "pinot noir", "sauvignon blanc", "cabernet sauvignon", "camembert"], "difficult_direct_answer": false, "rationales": ["The most acidic red wines would be sauvignon wine.", "Only would know this if researching it.", "Sauvignon blanc is known for a high acid content."], "image": "train2014/COCO_train2014_000000278224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444086, "question_id": "Y4sZibJsu5drpJNn2kKQNH", "question": "In which direction is this snowboarder moving?", "choices": ["left", "away", "towards", "right"], "correct_choice_idx": 3, "direct_answers": ["right", "downhill", "downward", "their left", "downhill", "downwards", "downhill", "downhill", "scatting", "to left"], "difficult_direct_answer": false, "rationales": ["The snowboarder seems to be going on the right.", "The direction is right.", "The snowboarder is moving toward the right."], "image": "train2014/COCO_train2014_000000444086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194398, "question_id": "Y4yfEJyVPDDPCBWkahpsTz", "question": "How is the trolley powered?", "choices": ["solar", "gas", "nuclear", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["The apparatus on top of the trolley draws power from an electric wire, which powers the car's electric motor.", "You can see the electric rid on the top.", "The power lines are visible."], "image": "val2014/COCO_val2014_000000194398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453602, "question_id": "Y5HBsSvmb64fHoD8qGNuXX", "question": "What is the vegetable in this dish other than the broccoli?", "choices": ["potatoes", "carrots", "onions", "mushrooms"], "correct_choice_idx": 3, "direct_answers": ["mushrooms", "mushroom", "mushrooms", "mushroom", "broccoli", "mushroom", "mushroom", "mushroom", "mushrooms", "mushrooms"], "difficult_direct_answer": false, "rationales": ["Mushrooms are right next to the broccoli.", "The mushrooms are the other vegetable", "The veggie is mushrooms."], "image": "train2014/COCO_train2014_000000453602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32850, "question_id": "Y5azZ3Q3QpWvGAbWSZYDBG", "question": "What is her hair most likely wet with?", "choices": ["milk", "gel", "water", "paint"], "correct_choice_idx": 2, "direct_answers": ["ocean water", "water", "water", "water", "water", "water", "salt water", "is bathing", "sea water", "water"], "difficult_direct_answer": false, "rationales": ["The woman is standing near the surf. the water probably splashed on her.", "This young girl is on a surf board in the ocean so her hair is wet from the ocean.", "She got it wet in the ocean."], "image": "train2014/COCO_train2014_000000032850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369913, "question_id": "Y5zsVtLCb43dxzcB65XCFQ", "question": "What field of work is this woman in?", "choices": ["academic", "legal", "medical", "technological"], "correct_choice_idx": 2, "direct_answers": ["interior decorator", "medical", "medical", "nursing", "healthcare", "nursing", "nursing", "art", "nurse", "nurse"], "difficult_direct_answer": false, "rationales": ["The woman is in medical work because she is wearing a nurse's outfit.", "The work is in the medical field.", "The woman is wearing medical scrubs and about to cut a bandage."], "image": "val2014/COCO_val2014_000000369913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529344, "question_id": "Y65n8oepRkCwh3bEh7iojP", "question": "Why are they on top the horses?", "choices": ["cleaning them", "riding them", "stealing them", "selling them"], "correct_choice_idx": 1, "direct_answers": ["riding", "riding", "riding them", "riding", "riding", "riding", "riding", "riding them", "ride", "ride"], "difficult_direct_answer": false, "rationales": ["They're riding.", "The people on top of the horses in order to enjoy recreation using the horses.", "The people are riding the horses."], "image": "val2014/COCO_val2014_000000529344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279222, "question_id": "Y6NoqY9dRAgtRp2pU2xN68", "question": "To take the bus tire off you would need to remove about how many lug nuts?", "choices": ["35", "five", "ten", "20"], "correct_choice_idx": 2, "direct_answers": ["three", "twelve", "ten", "ten", "ten", "ten", "ten", "ten", "ten", "nine"], "difficult_direct_answer": false, "rationales": ["The number of lug nuts used, secures the large tires in place.", "Ten lug nuts need to be removed.", "They show very clearly on the wheel"], "image": "train2014/COCO_train2014_000000279222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338189, "question_id": "Y6eDwcN6Pu78TiN3nxCezP", "question": "What are the orange circles used for?", "choices": ["pillows", "flotation", "fishing", "decoration"], "correct_choice_idx": 1, "direct_answers": ["lifesaving devices", "floating", "flotation", "buoys", "saving lives", "floating people", "life preserve", "floats", "life saving", "flotation"], "difficult_direct_answer": true, "rationales": ["They are used as so the boat does not hit up against the dock", "The orange circles are flotation devices used to rescue people in danger of drowning in the water.", "The circles are to float."], "image": "train2014/COCO_train2014_000000338189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516239, "question_id": "Y6eLEwwG39pkokTsj9Hj2B", "question": "Where are the people on the truck likely going?", "choices": ["dance", "work", "shopping", "party"], "correct_choice_idx": 1, "direct_answers": ["to work", "work", "work", "to work", "work", "work", "party", "home", "home", "work"], "difficult_direct_answer": false, "rationales": ["The people are dressed in work clothing and this is a rugged looking truck.", "The people are off to a work site.", "A bunch of working class people are piled into a truck bed."], "image": "val2014/COCO_val2014_000000516239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461017, "question_id": "Y6f8zoawcFNbqAfhKHaB34", "question": "What is the item on the left likely doing?", "choices": ["spinning", "taking off", "submerging", "tricks"], "correct_choice_idx": 1, "direct_answers": ["waiting", "taking off", "taking off", "waiting", "flying", "resting", "landing", "waiting", "sitting", "taking off"], "difficult_direct_answer": false, "rationales": ["It's either a or prepping to do a.", "The plane is above the runway in the air.", "The plane is going up in the air."], "image": "val2014/COCO_val2014_000000461017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425184, "question_id": "Y6gucYusD8Vz5dHvnjpZ2H", "question": "What are the bird cages made of?", "choices": ["steel", "wood", "gold", "plastic"], "correct_choice_idx": 1, "direct_answers": ["wood", "metal", "metal", "wood", "wicker/rattan", "wood", "straw", "wood", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["The structure is seems to be having woods.", "The cages are wooden.", "The cages are brown."], "image": "train2014/COCO_train2014_000000425184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491743, "question_id": "Y6i4CkKhPfREhne27xEM2h", "question": "Why is the building for rent?", "choices": ["new building", "more money", "no tenant", "forgot rent"], "correct_choice_idx": 2, "direct_answers": ["building behind", "no tenants", "empty", "no tenant", "empty", "it's vacant", "abandoned", "unoccupied", "empty", "money"], "difficult_direct_answer": false, "rationales": ["The building has no tenants in it.", "There is no tenant.", "There is a for rent sign which is on the building in the picture. the primary reason that a good room or building would be out for rent is that there is no tenant."], "image": "train2014/COCO_train2014_000000491743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235692, "question_id": "Y7HRjMbXSq2rWKR8sWCGeM", "question": "What are long horn cows called?", "choices": ["extended horn", "horne", "horner", "longhorn cattle"], "correct_choice_idx": 3, "direct_answers": ["steer", "longhorns", "bulls", "bulls", "longhorns", "longhorns", "steer", "longhorn cattle", "longhorns", "longhorn cattle"], "difficult_direct_answer": false, "rationales": ["The cows are known for their long horns.", "Longhorn cattle have horns like that.", "These cattle are named for their long horns, here extending through most of the picture."], "image": "val2014/COCO_val2014_000000235692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136979, "question_id": "Y7Riqc8PAFN8jW3tQgWifC", "question": "What should the bus use to safely move in front of the bicyclist?", "choices": ["wiper blades", "sign", "turning signals", "motor"], "correct_choice_idx": 2, "direct_answers": ["indicators", "turning signals", "turn signal", "signal", "turn signal", "blinker", "turn signal", "wheels", "turning signal", "blinker"], "difficult_direct_answer": false, "rationales": ["The bus uses a light to signal the bike rider.", "The bus is to the left of the bicyclist, so the bus driver should indicate before moving over.", "Without an indicating light, the cyclist has no way of knowing which the direction the bus is going"], "image": "train2014/COCO_train2014_000000136979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444691, "question_id": "Y7keX4EDJT8VcWR3HFnjLy", "question": "What is this yellow bus doing?", "choices": ["turning left", "reversing", "turning right", "parking"], "correct_choice_idx": 2, "direct_answers": ["turning", "turning right", "making turn", "turning corner", "turning right", "unloading", "turning", "going straight", "driving", "turning"], "difficult_direct_answer": false, "rationales": ["The bus is trying to turn.", "The bus is turning.", "The bus is making a right turn."], "image": "train2014/COCO_train2014_000000444691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299621, "question_id": "Y83CjjTU6yZZP6ZmoLTmRQ", "question": "What is near the kite?", "choices": ["box", "human", "apple", "dog"], "correct_choice_idx": 1, "direct_answers": ["sun", "sun", "ocean", "sun", "sun", "human", "owner", "people", "person", "sunset"], "difficult_direct_answer": false, "rationales": ["The human is by the kite.", "A human is flying the kite.", "There is a person that is flying the kite."], "image": "train2014/COCO_train2014_000000299621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497402, "question_id": "Y8DAgyS4KtDbTzyeMohatz", "question": "What is the occupation of this person?", "choices": ["plumber", "electrician", "cable man", "firefighter"], "correct_choice_idx": 0, "direct_answers": ["plumber", "electrician", "plumber", "plumber", "engineer", "construction worker", "technician", "electrician", "construction worker", "janitor"], "difficult_direct_answer": false, "rationales": ["The man looks to be in construction and working in a restroom so he is likely in the plumbing business.", "The person is fixing the toilets.", "He's in a bathroom with construction supplies"], "image": "train2014/COCO_train2014_000000497402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320225, "question_id": "Y8kr7cdozhof3zWP3My5Pp", "question": "What kind of camera shots is the photographer probably planning to take?", "choices": ["skateboarding", "architecture", "nature", "clothing models"], "correct_choice_idx": 0, "direct_answers": ["skateboarding tricks", "action", "action", "moving", "action", "close up", "trick", "skating shots", "skateboarding", "action"], "difficult_direct_answer": false, "rationales": ["A skateboard appears to be the main focus of the photograph.", "A skateboard is shown.", "There is one in the center of the picture"], "image": "train2014/COCO_train2014_000000320225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385141, "question_id": "Y8oXY44ZddLarDWbMhvtki", "question": "When buying something from the cart shown what would you do soon afterwards?", "choices": ["kill it", "play it", "eat it", "throw it"], "correct_choice_idx": 2, "direct_answers": ["walk home", "eat", "eat it", "eat it", "eat", "eat", "eat it", "eat", "eat", "pay"], "difficult_direct_answer": false, "rationales": ["The cart is a food cart. food carts sell food.", "It's a food cart.", "It is selling hot dogs"], "image": "train2014/COCO_train2014_000000385141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283868, "question_id": "Y8z3BmVJHrhq6NDUsxGFxR", "question": "Where is the party located?", "choices": ["beach", "pool", "hall", "playground"], "correct_choice_idx": 3, "direct_answers": ["playground", "park", "park", "playground", "playground", "playground", "park", "park", "playground", "park"], "difficult_direct_answer": false, "rationales": ["People are sitting at tables with slides and swings behind.", "The party is located in the playground area.", "The party is outdoors so it is not in a hall and there is no water visible so it is not at a pool or the beach. there is a park with slides and climbers."], "image": "train2014/COCO_train2014_000000283868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87000, "question_id": "Y966GubNd5tUCEjAFuFfAn", "question": "What is the silver object above the mashed potatoes pan used for?", "choices": ["spoon rest", "medicine", "stirring", "serving"], "correct_choice_idx": 0, "direct_answers": ["scooping", "spooning", "cooking", "spoon rest", "spoon rest", "utensil holder", "spoon rest", "utensil stand", "frying potatoes", "holding utensils"], "difficult_direct_answer": false, "rationales": ["A flat object with a depression in it is on the stove.", "You can set your cooking utensils on it so the stove does not get dirty.", "It keeps spoons from sullying the stovetop."], "image": "train2014/COCO_train2014_000000087000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26521, "question_id": "Y9EBBYbrobfE8UfeLNVAxs", "question": "What kind of races does this animal run?", "choices": ["car", "three legged", "derby", "plane"], "correct_choice_idx": 2, "direct_answers": ["horse", "horse racing", "racka", "horse racing", "races betting", "derby", "horse", "derby", "horse", "horse races"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to horse racing.", "The animal is a horse which is well known for competing in derbies.", "The animals visible are horses based on their visible features. these animals are known to compete in races called answer a."], "image": "train2014/COCO_train2014_000000026521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485149, "question_id": "Y9YLDETyf2oySqZxj4RsNV", "question": "What type of programming is this cat watching?", "choices": ["reality show", "sitcom", "sports", "drama"], "correct_choice_idx": 0, "direct_answers": ["reality", "reality", "reality tv", "reality show", "reality show", "soap opera", "reality show", "hills", "reality tv", "reality tv"], "difficult_direct_answer": false, "rationales": ["The television has an image of spencer on it. he starred on a show that followed the lives of some young people living in los angeles.", "Spencer is from a reality show.", "The image on the screen is captioned with a name and an explanation of who the person is, which is most consistent with a. the image depicted does not include athletic events, and b and c are not usually captioned."], "image": "val2014/COCO_val2014_000000485149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206560, "question_id": "Y9tzmGgbfnvyTwEkTDyTZw", "question": "Why has the skater covered his head?", "choices": ["warmth", "religion", "costume", "protection"], "correct_choice_idx": 3, "direct_answers": ["protection", "protection", "protection", "helmet", "protection", "protection", "protection", "protection", "safety", "safety"], "difficult_direct_answer": false, "rationales": ["He has a helmet on to protect his head.", "It is very cold out for this sport", "He is going at high areas while under him is hard pavement which could hurt his head if unprotected."], "image": "val2014/COCO_val2014_000000206560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443240, "question_id": "YALLtyfBG2rCWkfKKfKmRE", "question": "What kind of street is this?", "choices": ["boulevard", "residential", "city", "commercial"], "correct_choice_idx": 1, "direct_answers": ["residential", "residential", "residential", "residential", "neighborhood", "neighborhood", "neighborhood", "residential", "residential", "residential"], "difficult_direct_answer": false, "rationales": ["This is a street where people live.", "The things on the curb appear that they have come from a home.", "The street is residential."], "image": "val2014/COCO_val2014_000000443240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162322, "question_id": "YANPP3fhPLfzFRVfyFUiSH", "question": "What type of filling is in the donuts?", "choices": ["icing", "custard", "chocolate", "fruit"], "correct_choice_idx": 3, "direct_answers": ["strawberry jelly", "cherry", "jelly", "fruit", "strawberry", "strawberry", "strawberry", "fruit", "strawberry", "strawberry"], "difficult_direct_answer": false, "rationales": ["The donuts have strawberries in them.", "Because the cook is adding strawberry on the donut.", "This is indicated by the action of the baker."], "image": "train2014/COCO_train2014_000000162322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244231, "question_id": "YAPDziX7hEob8h4BJP3iBt", "question": "What are the thin objects the birds are sitting on?", "choices": ["branches", "poles", "power lines", "traps"], "correct_choice_idx": 2, "direct_answers": ["wires", "crow", "power lines", "wires", "telephone wire", "wires", "power lines", "wires", "wires", "wires"], "difficult_direct_answer": false, "rationales": ["The wires are attached to telephone poles.", "These are wires that carry electricity", "The birds are sitting on the lines delivering electricity."], "image": "train2014/COCO_train2014_000000244231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181836, "question_id": "YARMpa8WX7zJPqazQSyTe8", "question": "The street has been cordoned off in celebration of what social cause?", "choices": ["anti-war", "anti-racism", "lgbt rights", "police reform"], "correct_choice_idx": 2, "direct_answers": ["pride", "gay pride", "pride parade", "pride", "lgbt rights", "pride", "gay pride", "lgbtq", "gay pride", "pride"], "difficult_direct_answer": false, "rationales": ["The man is parading for lgbt rights.", "There are rainbow flags", "There is a lgbtq flag at the celebration."], "image": "val2014/COCO_val2014_000000181836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530823, "question_id": "YATP9Ps8TvNd75btcnucLu", "question": "What is the man on the right doing with the object in his hands?", "choices": ["sweeping", "putting", "steering", "passing"], "correct_choice_idx": 0, "direct_answers": ["sweeping", "raking", "sweeping", "sweeping", "raking", "sweeping", "sweeping", "sweeping", "sweeping", "sweeping"], "difficult_direct_answer": false, "rationales": ["The man is holding a broom vertically with his hand on top and the other midway down and appears to be moving it forward. this is what someone would do if they were doing answer a.", "The man on the right is holding a broom. he is using it to clean the area.", "The man is clearing the floor."], "image": "train2014/COCO_train2014_000000530823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284014, "question_id": "YATnwbjdNTmrzwrSccXXoH", "question": "What do these plants need very little of?", "choices": ["sun", "love", "water", "heat"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "leaves", "water", "water", "water", "leaves", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["Cacti require little water and can survive long periods of time without it.", "Cacti grow in deserts.", "These plants are cacti. they live in deserts and similar areas that are sunny and dry."], "image": "train2014/COCO_train2014_000000284014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577558, "question_id": "YAYdD2LDaexerDfKi3TTW7", "question": "What shredded vegetable a favorite of rabbits is on both sandwiches?", "choices": ["carrot", "tomato", "jalapeno pepper", "tomato"], "correct_choice_idx": 0, "direct_answers": ["carrot", "carrot", "carrots", "carrot", "carrots", "carrot", "carrot", "lettuce", "carrots", "carrot"], "difficult_direct_answer": false, "rationales": ["You can tell by the color as to what it is.", "They love to eat this root vegetable", "Carrots are the most common orange vegetable, and they are clearly visible in between the buns of each sandwich. bugs bunny famously and frequently eats carrots, showing how rabbits favor carrots."], "image": "train2014/COCO_train2014_000000577558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19444, "question_id": "YAbtSgX2s29jzByhakLfgK", "question": "What is the white netting shown here normally used for?", "choices": ["trapping butterflies", "base", "protecting property", "soccer goal"], "correct_choice_idx": 3, "direct_answers": ["goals", "hockey goal", "soccer goal", "goal", "goal", "soccer goals", "goal", "goal cage", "goal", "soccer goals"], "difficult_direct_answer": false, "rationales": ["This is how you score points", "A soccer goal is on a field.", "White netting is spread along white posts on a grassy field."], "image": "val2014/COCO_val2014_000000019444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129108, "question_id": "YAgniYRuxYLypJsgdC5cLV", "question": "In which part of town is this crosswalk?", "choices": ["china town", "downtown", "polish town", "italian town"], "correct_choice_idx": 0, "direct_answers": ["china town", "downtown", "downtown", "downtown", "street", "downtown", "chinatown", "chinatown", "city center", "busy"], "difficult_direct_answer": false, "rationales": ["It is in china town.", "You can see the writing on the store in the back looks to be an asian culture.", "There are signs in the background. they use an asian, not european, alphabet."], "image": "train2014/COCO_train2014_000000129108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189353, "question_id": "YAudujtLvKANQwxRuZrxaD", "question": "What is the boy wearing white shorts using to get around?", "choices": ["scooter", "bike", "skateboard", "mower"], "correct_choice_idx": 0, "direct_answers": ["scooter", "skateboard", "scooter", "scooter", "skateboard", "scooter", "scaring", "skateboard", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["The boy in the white shorts is riding a scooter, which has small wheels and a narrow board.", "The thing used to take the boy around is a scooter because you see the rear wheel and the platform of one", "He has one foot on the object and his other foot is pushing the ground to move the object forward with him on it."], "image": "train2014/COCO_train2014_000000189353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191990, "question_id": "YBPus5K7hX3vYfbhgxdEro", "question": "Why is the man holding plastic bottles?", "choices": ["to throw", "to drink", "to juggle", "to sell"], "correct_choice_idx": 1, "direct_answers": ["very thirsty", "drink", "hot", "drinking water", "thirsty", "water", "for family", "for drinking", "to drink", "garbage"], "difficult_direct_answer": true, "rationales": ["The man has a beverage in the bottles.", "He wants a drink.", "The man is going to take a drink."], "image": "val2014/COCO_val2014_000000191990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97633, "question_id": "YBR4D5Gw5C8BdaTmaJG2q7", "question": "What's the name for the style of top the man has on?", "choices": ["tank top", "blazer", "t-shirt", "cutoff"], "correct_choice_idx": 0, "direct_answers": ["tank top", "tank top", "tanktop", "tank", "tank top", "beach", "vests", "tank top", "tank top", "skating"], "difficult_direct_answer": false, "rationales": ["The man is wearing a shirt without any sleeves. this style of shirt is referred to as answer a.", "A shirt without sleeves is a tank top.", "The man is wearing a tank top because it has no sleeves"], "image": "train2014/COCO_train2014_000000097633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24721, "question_id": "YBTm46tjnfKuXf9x7wzYHs", "question": "What is the wooden item here?", "choices": ["paddle", "spoon", "pirate leg", "desk"], "correct_choice_idx": 3, "direct_answers": ["desk", "desk", "desk", "desk", "desk", "desk", "desk", "desk", "desk", "desk"], "difficult_direct_answer": false, "rationales": ["There are computer monitors, speakers, a keyboard, a mouse, and similar items. they are on the wooden item.", "There are devices on top of it, so it's a desk.", "There is a table for computers here."], "image": "val2014/COCO_val2014_000000024721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420889, "question_id": "YBUdAJ2yWQWNPoz2Y3akYe", "question": "What is the most common type of boat in the picture?", "choices": ["trawler", "dinghy", "ski boat", "sailboat"], "correct_choice_idx": 3, "direct_answers": ["sailboat", "big boat", "sailboat", "sailboat", "sailboat", "sailboat", "sailboat", "sailboat", "sailboat", "big boat"], "difficult_direct_answer": false, "rationales": ["This type of boat will be seen with a very long pole at the middle so you can see this mast on a lot of boats here.", "Sailboats are common.", "The poles on the boats are for sails."], "image": "train2014/COCO_train2014_000000420889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394518, "question_id": "YBVQRrJdFPj6e6tGfJCEJZ", "question": "What type of signs are shown?", "choices": ["regulatory", "directional", "warning", "brand"], "correct_choice_idx": 3, "direct_answers": ["brand", "promotional", "sponsors", "advert", "sponsorships", "advertisements", "sponsors", "finish line", "company sponsors", "finish"], "difficult_direct_answer": true, "rationales": ["The signs all contain logos of companies and no instructional language like the other options would have had.", "There are different logos and art of the words to show what brand they represent.", "The signs show brands."], "image": "train2014/COCO_train2014_000000394518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3326, "question_id": "YBZS2WJ9MfuN6V2VhhmACZ", "question": "What do the people have in their hands?", "choices": ["eggs", "paddles", "swords", "spears"], "correct_choice_idx": 1, "direct_answers": ["oars", "oars", "paddle", "paddle", "oars", "oars", "paddle", "oars", "paddles", "oars"], "difficult_direct_answer": false, "rationales": ["Oars to move the boat in the water.", "They have paddles.", "They are using these wooden sticks to move them through the water."], "image": "val2014/COCO_val2014_000000003326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237701, "question_id": "YBhuic8kJERJQChNtQh4nA", "question": "The red and white device behind and to the left of the man serves what function?", "choices": ["fire alarm", "intercom", "light switch", "doorbell"], "correct_choice_idx": 0, "direct_answers": ["fire alarm", "fire alarm", "fire alarm", "fire alarm", "fire emergencies", "emergency", "watching", "block rain", "monitor", "fire alarm"], "difficult_direct_answer": false, "rationales": ["The device is the fire alarm.", "If you pull the lever it makes a loud noise alerting people to danger.", "The device should be pulled in case of fire."], "image": "train2014/COCO_train2014_000000237701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402806, "question_id": "YCNwr6HcPJKSjC3vEJn6db", "question": "What type of sugar is on the baked good?", "choices": ["brown sugar", "powdered sugar", "pure cane", "fake sugar"], "correct_choice_idx": 1, "direct_answers": ["confectionary", "white sugar", "powdered", "powdered", "powdered", "powdered", "powder", "powder sugar", "powdered sugar", "powdered sugar"], "difficult_direct_answer": false, "rationales": ["There are some powdered sugars on top of the baked donut.", "The baked good has confectioners sugar on it.", "It is very white and fluffy"], "image": "train2014/COCO_train2014_000000402806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351483, "question_id": "YCZnrESUXfwhQhbGCv6zWg", "question": "What will passengers use to get across the blue platform?", "choices": ["stairs", "elevator", "ramp", "escalator"], "correct_choice_idx": 0, "direct_answers": ["overpass", "footbridge", "bridge", "stairs", "bridge", "legs", "bridge", "stairs", "bridge", "elevated walkway"], "difficult_direct_answer": false, "rationales": ["There are steps leading up to the overpass.", "They will cross the bridge to get to the other side.", "The passengers will climb the stairs to access to the platform."], "image": "train2014/COCO_train2014_000000351483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245013, "question_id": "YCpVtA8Cr8VBsyA5LPBT5m", "question": "What country is the scene located at?", "choices": ["japan", "iran", "thailand", "china"], "correct_choice_idx": 0, "direct_answers": ["united states", "america", "usa", "united states", "japan", "no idea", "pakistan", "united states", "turkey", "palestine"], "difficult_direct_answer": false, "rationales": ["An asian man is driving in a car.", "There is a south asian person in the car.", "Some of the lettering is visible on the animal attached to the roof. the letter appears to indicate answer a."], "image": "val2014/COCO_val2014_000000245013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152722, "question_id": "YCuxmYS5fANuERgkcrP2wc", "question": "What is the purpose of the black structure?", "choices": ["house planes", "store tools", "restaurant", "police station"], "correct_choice_idx": 0, "direct_answers": ["storing planes", "travel", "storage", "hanger", "house airplanes", "house planes", "store planes", "store planes", "store planes", "store plane"], "difficult_direct_answer": false, "rationales": ["You store planes in the hanger.", "This is a hanger and a place to store planes when not being used.", "The black structure is where the planes are kept."], "image": "train2014/COCO_train2014_000000152722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432593, "question_id": "YD3JZiQN5MsYLhXuoWnmGR", "question": "Which bus is more likely to take you somewhere on this street?", "choices": ["school bus", "airport shuttle", "ipswitch 1", "purple bus"], "correct_choice_idx": 2, "direct_answers": ["left bus", "left", "left", "ipswitch 1", "left", "right one", "left bus", "left bus", "left bus", "left"], "difficult_direct_answer": false, "rationales": ["The other bus isn't in service.", "The 1 bus is running.", "The bus on the right says it is not in service, so it is not likely to take anyone anywhere. the bus on the left has a sign on the top naming the bus and indicating it is in service."], "image": "train2014/COCO_train2014_000000432593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475535, "question_id": "YDTd5rDvf2PMMd73X6bAhR", "question": "What is the traffic light permitting?", "choices": ["jaywalking", "parking", "crossing", "driving"], "correct_choice_idx": 2, "direct_answers": ["to walk", "crossing", "crossing", "walking", "walking", "walk", "crossing", "closing", "walking", "pedestrian crossing"], "difficult_direct_answer": false, "rationales": ["There is a white icon of a person walking in the traffic light.", "The light permits crossing.", "Its permitting people to cross."], "image": "train2014/COCO_train2014_000000475535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226246, "question_id": "YDcCEND2BrUVC95JyBDAkA", "question": "Who is the main actress in the movie advertised?", "choices": ["kerry washington", "susan sarandon", "halle berry", "keri russell"], "correct_choice_idx": 3, "direct_answers": ["keri russel", "keri russell", "keri russell", "keri russell", "keri russell", "keri russel", "keri russell", "keri russell", "keri russell", "keri russell"], "difficult_direct_answer": false, "rationales": ["Russell is the actress.", "Keri russell starred in dark skies.", "The words on the bus seems to be advertising a lot."], "image": "train2014/COCO_train2014_000000226246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130000, "question_id": "YDpZVnYbaYUndsiTF5483N", "question": "What method was used to produce the beverage seen held here?", "choices": ["brewing", "churning", "chilling", "mixing"], "correct_choice_idx": 0, "direct_answers": ["espresso machine", "brewing", "brewing", "brew", "brewed", "brewing", "brewing", "brew", "coffee brewer", "brewing"], "difficult_direct_answer": false, "rationales": ["The man is holding a cup that has a sleeve around it to insulate the heat away from his hand. these sleeves are usually used when a cup of hot coffee has been purchased.", "You can tell by the cup he has as to how the liquid inside was made.", "Its likely hot coffee based on the cup design."], "image": "train2014/COCO_train2014_000000130000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474556, "question_id": "YDspd4puRux4qPkKfv6txn", "question": "What job does the man standing in the street hold?", "choices": ["singer", "traffic police", "music conductor", "magician"], "correct_choice_idx": 1, "direct_answers": ["cop", "traffic", "officer", "transport", "police", "baking", "traffic police", "control traffic", "traffic control", "traffic guard"], "difficult_direct_answer": true, "rationales": ["The man is a traffic cop and is directing cars and people.", "The man in standing on the street is wearing a police uniform.", "He's there to make sure it flows efficiently"], "image": "train2014/COCO_train2014_000000474556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415856, "question_id": "YE2HkMaeUUpjREaT65H5b7", "question": "What is the goal of the persons here regarding the river they stand in?", "choices": ["hiding", "crossing", "diving", "swimming"], "correct_choice_idx": 1, "direct_answers": ["cross", "cross it", "crossing", "crossing river", "cross horses", "walking", "water horses", "cool off", "cross river", "crossing"], "difficult_direct_answer": true, "rationales": ["The goal is to cross.", "They are leading the horses in the river by walking.", "The goal is to get over the river."], "image": "val2014/COCO_val2014_000000415856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407589, "question_id": "YE8Xi3vUH783UZEnbn65Me", "question": "Where are these small boats being kept?", "choices": ["field", "shipyard", "beach", "dock"], "correct_choice_idx": 2, "direct_answers": ["marina", "balance", "beach", "beach", "shore", "shore", "yard", "beach", "on land", "on bay"], "difficult_direct_answer": false, "rationales": ["The boats are on sand.", "The beach as they are used in the sea.", "The boats are at the beach."], "image": "train2014/COCO_train2014_000000407589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197057, "question_id": "YE9AUFJtAF4mQUejAZ2G4U", "question": "Which street name is impressed into the sidewalk?", "choices": ["anza", "charles", "wilmont", "kensington"], "correct_choice_idx": 0, "direct_answers": ["anza", "anza", "anza", "anza", "anza", "anza", "anza street", "anza", "anza", "anza"], "difficult_direct_answer": false, "rationales": ["Anza is.", "The impressed name is the same one that is on the street sign.", "Anza's name is shown."], "image": "train2014/COCO_train2014_000000197057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509811, "question_id": "YEAAyXvok58DqtpAtdKNAU", "question": "Who owns those laptops?", "choices": ["one individual", "library", "multiple individuals", "non profit"], "correct_choice_idx": 1, "direct_answers": ["school", "library", "library", "school", "school", "school", "school", "library", "library owners", "school"], "difficult_direct_answer": false, "rationales": ["The library is in a school and the school gives them the computers.", "The library owns them.", "The library is the owner of these many laptops."], "image": "val2014/COCO_val2014_000000509811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135348, "question_id": "YEBoupGdkv5wZu4ucpsnKc", "question": "What is the name for this kind of shirt?", "choices": ["hawaiian", "american", "russian", "british"], "correct_choice_idx": 0, "direct_answers": ["hawaiian", "hawaiian", "hawaiian", "hawaiian", "hawaiian", "hawaiian", "hawaiian", "hawaiian", "hawaiian", "hawaiian"], "difficult_direct_answer": false, "rationales": ["It has palm trees on it. hawaii is known for palm trees.", "The shirt has plants and floral designs and is typical of the hawaiian shirt design.", "The name is hawaiian."], "image": "train2014/COCO_train2014_000000135348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383090, "question_id": "YEH9U9rtEVWFppy7pQR9fD", "question": "What material is the train constructed from?", "choices": ["fruit", "cake", "plastic", "ice"], "correct_choice_idx": 1, "direct_answers": ["cake", "cake", "cake", "cake", "plastic", "cake batter", "cake", "frosting", "cake", "play dough"], "difficult_direct_answer": false, "rationales": ["The material is cake.", "The train has frosting and candy on it.", "The train is a cake."], "image": "train2014/COCO_train2014_000000383090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510861, "question_id": "YEN7NSijV4QFCy2ow6X7us", "question": "What is the woman holding up the phone for?", "choices": ["watching video", "taking photo", "facetime", "sending message"], "correct_choice_idx": 1, "direct_answers": ["picture", "taking photos", "taking photo", "take photo", "photographing", "recording speech", "mobile phone", "taking photo", "take picture", "photo"], "difficult_direct_answer": true, "rationales": ["She is holding it up to take a picture.", "The woman is looking at her view finder so is likely looking to take pictures.", "The woman is taking a photo."], "image": "val2014/COCO_val2014_000000510861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148955, "question_id": "YEYwzdW3MJpW4MRDiWFcBU", "question": "WHo is the man in blue with the red flower?", "choices": ["prince philip", "prince charles", "prince william", "prince harry"], "correct_choice_idx": 1, "direct_answers": ["prince charles", "prince charles", "prince charles", "prince charles", "prince charles", "honored guest", "prince", "prince charles", "prince", "president"], "difficult_direct_answer": false, "rationales": ["The man is prince charles.", "A man is standing with other royals.", "Prince charles is the one next to the man with the red flower."], "image": "val2014/COCO_val2014_000000148955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210568, "question_id": "YEcaWGHRDectgmXaQnE5wZ", "question": "What is hanging from the bear's wrist?", "choices": ["key", "watch", "scissors", "knife"], "correct_choice_idx": 0, "direct_answers": ["key", "key", "key", "key", "key", "key", "key", "key chain", "key chain", "key"], "difficult_direct_answer": false, "rationales": ["A plush animal has a silver key hanging near it.", "An ornate ring with a silver key attached to it is around the wrist of a stuffed bear.", "The bear has a key."], "image": "train2014/COCO_train2014_000000210568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137281, "question_id": "YEfGFcfjtTaP9aEk7UoUpR", "question": "What type of sign is shown?", "choices": ["historical", "traffic", "brand", "directional"], "correct_choice_idx": 1, "direct_answers": ["speed radar", "speed limit", "speed limit", "speed limit", "speed limit", "speed limit", "traffic", "speed sign", "speed limit", "speed radar"], "difficult_direct_answer": false, "rationales": ["The sign controls speed on the road.", "It is showing the speed limit which is common on roads.", "There is a speed limit sign on the side of the road."], "image": "train2014/COCO_train2014_000000137281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289791, "question_id": "YEtfRnSqgMmbTRHTBjnG5y", "question": "What type of service does this place appear to offer?", "choices": ["drive-through", "delivery", "self-service", "sit-down service"], "correct_choice_idx": 2, "direct_answers": ["food", "buffet", "food service", "buffet", "food", "buffet", "self-service", "food", "buffet", "buffet"], "difficult_direct_answer": false, "rationales": ["The place is a self-service buffet.", "There are utensils for self service.", "There are large restaurant pans with servers in a buffet line"], "image": "train2014/COCO_train2014_000000289791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421254, "question_id": "YEyh6R6EyR6JgYFzEW6nXW", "question": "During what type of emergency would the white object be used?", "choices": ["fire", "flood", "earthquake", "tsunami"], "correct_choice_idx": 0, "direct_answers": ["fire", "burnings", "fire", "fire", "burnings", "burnings", "fire", "fire", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["The hydrant allows first responders to obtain large amounts of water.", "The fire hydrant is used in cases of fire.", "Water from the hydrant extinguishes flames."], "image": "train2014/COCO_train2014_000000421254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473500, "question_id": "YEzhCPRwvvzPAzj5u5d6ZF", "question": "Why caused the objects to be scattered all over?", "choices": ["tornado", "hurricane", "cat", "intruder"], "correct_choice_idx": 2, "direct_answers": ["cats", "cat", "cat", "cat", "rowdy cats", "cat", "cat", "cats", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The cats knocked stuff everywhere.", "Cats are curious by nature and often move objects or knock them on the floor.", "The cat shouldn't be in the sink."], "image": "train2014/COCO_train2014_000000473500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284362, "question_id": "YF6hMJqatyVhGqs3SwwXXr", "question": "The number on the back of the vehicle is two digits smaller than the name of a show what actress was on?", "choices": ["regina king", "kirsten dunst", "anne hathaway", "betty grable"], "correct_choice_idx": 0, "direct_answers": ["vanna white", "ten", "marla gibbs", "227", "marla gibbs", "jackie harry", "jackie", "marla gibbs", "regina king", "regina king"], "difficult_direct_answer": false, "rationales": ["Regina king's age is similar.", "Regina king was associated with a show that was called 227.", "It shows regina was around."], "image": "train2014/COCO_train2014_000000284362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569533, "question_id": "YFLFUDPme9ejANtXa4niaX", "question": "The number of people here is called what?", "choices": ["dozen", "quartet", "quintet", "trio"], "correct_choice_idx": 3, "direct_answers": ["three", "family", "trio", "three", "family", "three", "three", "three", "three", "family"], "difficult_direct_answer": false, "rationales": ["There are three people in the scene.", "There are three people.", "The other number options don't match."], "image": "train2014/COCO_train2014_000000569533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67982, "question_id": "YFRegJXTxUs6DtgQCjEq4P", "question": "What material is the red coat made of?", "choices": ["pic", "nylon", "leather", "cotton"], "correct_choice_idx": 0, "direct_answers": ["leather", "leather", "leather", "plastic", "polyester", "leather", "polyester", "pleather", "leather", "pic"], "difficult_direct_answer": false, "rationales": ["The material is a picture.", "The red coat is leather.", "It gives off a fake leather look in which it's slightly matte shiny but imitates leather."], "image": "train2014/COCO_train2014_000000067982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331082, "question_id": "YFZyWPwjyGwXexkqyejajW", "question": "What are the people doing on the street?", "choices": ["racing", "protesting", "jogging", "shopping"], "correct_choice_idx": 3, "direct_answers": ["shopping", "walking", "shopping", "walking", "shopping", "walking", "walking", "driving", "shopping", "vending"], "difficult_direct_answer": false, "rationales": ["They are shopping.", "There are vendors lining the streets. some people are stopped at the vendors. vendors sell things.", "There are shops along the street."], "image": "train2014/COCO_train2014_000000331082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256276, "question_id": "YFumFr5TcUki5VWFJP2Xmq", "question": "What is the woman doing with the bear?", "choices": ["cleaning it", "feeding it", "smothering it", "fighting it"], "correct_choice_idx": 1, "direct_answers": ["feeding", "feeding", "feeding it", "feeding", "posing", "posing", "feeding", "feeding it", "feeding", "feeding"], "difficult_direct_answer": false, "rationales": ["She has food in her hand and is holding it close to his mouth.", "The woman appears to have something cupped in her hand near the bear's mouth that the bear is showing interest in.", "The woman is giving the bear a snack."], "image": "train2014/COCO_train2014_000000256276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258073, "question_id": "YGAAARTUTUNLyiyzuhnwkj", "question": "What is sold at this market?", "choices": ["produce", "meat", "fish", "clothing"], "correct_choice_idx": 0, "direct_answers": ["fruit", "produce", "produce", "produce", "fruit", "fruit", "fruit", "produce", "fruits", "fruit"], "difficult_direct_answer": false, "rationales": ["Many fruits can be seen on display, which are also referred to as produce.", "The market sells lots of different kinds of fruits and vegetables.", "Fruit is being sold."], "image": "train2014/COCO_train2014_000000258073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39191, "question_id": "YGR7wRxXVDC7SVkRix9RjJ", "question": "How many layers should you wear when snowboarding?", "choices": ["one", "three", "two", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "three", "two", "unknown", "three", "few", "two", "three", "multiple", "many"], "difficult_direct_answer": false, "rationales": ["While discretionary, many people like triple layers for warmth.", "You can wear several to keep warm", "You should wear as many as possible with still being able to move."], "image": "train2014/COCO_train2014_000000039191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536412, "question_id": "YH2VLF3tYBF9SuTZC58KsX", "question": "What is the young zebra doing?", "choices": ["standing", "running", "eating", "laying"], "correct_choice_idx": 3, "direct_answers": ["laying down", "relaxing", "sleeping", "sleeping", "laying", "laying", "resting", "dying", "resting", "sleeping"], "difficult_direct_answer": false, "rationales": ["The young one is resting on the ground.", "The zebra is laying.", "It's entire body is on the ground in a horizontal position and doesn't appear to be moving in any way."], "image": "train2014/COCO_train2014_000000536412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475407, "question_id": "YH3re72QKjKQR9V5P7xKTp", "question": "What animal is painted on the grey wall?", "choices": ["dove", "eagle", "swan", "bat"], "correct_choice_idx": 0, "direct_answers": ["dove", "bird", "doves", "doves", "dove", "doves", "dove", "dove", "doves", "birds"], "difficult_direct_answer": false, "rationales": ["The animals are white birds. they are too small to be swans.", "The animal is a bird and it is white and it's flying in the air.", "Doves are painted on the wall."], "image": "val2014/COCO_val2014_000000475407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20391, "question_id": "YHbBtzKh4jVcSsiNroPG8E", "question": "What kind of building is the one with the black sign?", "choices": ["gym", "restaurant", "bank", "hospital"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "pizza place", "pizza", "pizza restaurant", "restaurant", "pizzeria", "pizza", "restaurant", "pizzeria", "business building"], "difficult_direct_answer": false, "rationales": ["This is to let people know they can get food", "The sign says it's a restaurant.", "The place has a sign that says pizza and pizza is served at restaurants."], "image": "train2014/COCO_train2014_000000020391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267780, "question_id": "YHrjsi4H7X7AfjcDzCAai9", "question": "What letters are obscured from the box?", "choices": ["lo", "na", "pi", "be"], "correct_choice_idx": 2, "direct_answers": ["pi", "pi", "pi", "p i", "pizza", "burger", "pi", "za", "pi", "pi"], "difficult_direct_answer": false, "rationales": ["It should spell out pizza.", "This flatbread dish has an italian name. the last three letters, but not the first two, are visible.", "The letters are hiding the pi in pizza."], "image": "train2014/COCO_train2014_000000267780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343157, "question_id": "YHtWg5VYzwBZLPRxwCiD59", "question": "What type of activity are those standing involved in?", "choices": ["singing", "wii", "juggling", "arm wrestling"], "correct_choice_idx": 1, "direct_answers": ["playing wii", "bowling", "video game", "wii", "wii", "video games", "playing wii", "playing wii", "video game", "gaming"], "difficult_direct_answer": false, "rationales": ["This is indicated by the white controller visible in the hand of the right player.", "These men are all playing the nintendo wii.", "Those standing are holding a wii controller facing a tv."], "image": "val2014/COCO_val2014_000000343157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36157, "question_id": "YHzR39DYuJa6ZdsrjQLRyN", "question": "What is the man with the striped tie doing with the kite?", "choices": ["getting ready", "selling", "squashing it", "painting it"], "correct_choice_idx": 0, "direct_answers": ["flying", "getting ready", "holding up", "holding it", "holding", "holding", "holding it", "holding", "holding", "holding"], "difficult_direct_answer": false, "rationales": ["The man is holding the kite up high as he prepares to launch it upwards into the winds. with his skillful guidance, the kite should be flying high in no time.", "The man is trying to fly the kite.", "He gets ready."], "image": "train2014/COCO_train2014_000000036157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62726, "question_id": "YJ4Wu3ZNu8oHv4sS9jTtRv", "question": "What company uses the vehicles parked near the curb?", "choices": ["tesla", "mta", "huffy", "yamaha"], "correct_choice_idx": 1, "direct_answers": ["public transport", "mta", "local government", "mta", "bus company", "metrobus", "buses", "mta", "google", "metro"], "difficult_direct_answer": false, "rationales": ["The vehicles are buses. the buses are used by the mta.", "The bus company does.", "The vehicle parked near the curb are city buses. the mta normally runs these types of bus lines."], "image": "val2014/COCO_val2014_000000062726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104393, "question_id": "YJANvsPB9zh5ukYVCKKhaA", "question": "At least how many musicians play different instruments here?", "choices": ["two", "eight", "one", "three"], "correct_choice_idx": 3, "direct_answers": ["four", "just two", "two", "three 3", "two", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The mirror makes it look like there are more", "There are a trio of people, each with a different instrument.", "There is a guitar player, a bass player and a pianist."], "image": "val2014/COCO_val2014_000000104393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246066, "question_id": "YJDWjZtseNuNyoxtjtYq9i", "question": "What outdoor area is the woman sitting in?", "choices": ["backyard", "tunnel", "garden", "park"], "correct_choice_idx": 3, "direct_answers": ["park", "park", "park", "park", "park", "park", "park", "park", "park", "park"], "difficult_direct_answer": false, "rationales": ["The woman is sitting on a bench in a grassy area so she is at a park.", "She is in a park.", "The area is a park."], "image": "val2014/COCO_val2014_000000246066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141702, "question_id": "YJEXmHjpeWc5GwxkWErbva", "question": "What office are the likely at?", "choices": ["dentist", "stock", "acting", "construction"], "correct_choice_idx": 0, "direct_answers": ["dentist", "dentist", "school", "dentist office", "dentist", "dentist", "dentist", "dentist", "dentist", "children"], "difficult_direct_answer": false, "rationales": ["They are standing by a huge fake toothbrush.", "The big toothbrush is at a dental office.", "The girls are standing by a toothbrush."], "image": "train2014/COCO_train2014_000000141702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399177, "question_id": "YJSHdEh5oJGR8CSFTa8PZc", "question": "Why are there ropes on the statue?", "choices": ["moving it", "theft", "stability", "design"], "correct_choice_idx": 2, "direct_answers": ["support", "balance", "stability", "holding up", "repair work", "prevent falling", "hold place", "holding it", "holding statue", "hold it"], "difficult_direct_answer": true, "rationales": ["The ropes are attached to the statue for stability.", "The ropes give stability.", "You can tell that the ropes help with balance and stability."], "image": "val2014/COCO_val2014_000000399177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290093, "question_id": "YJYd9q4qUvgcKpiSi5YThi", "question": "What is in the room?", "choices": ["bed", "witch hat", "pool table", "refrigerator"], "correct_choice_idx": 0, "direct_answers": ["desk", "bed desk", "bed desk", "bed", "books", "bedroom", "stereo", "bed", "bed", "bedroom"], "difficult_direct_answer": false, "rationales": ["The room has a bed.", "The room has a bed shown.", "The room has a bed that's in it."], "image": "val2014/COCO_val2014_000000290093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105605, "question_id": "YJjvNvWuX2p877kgKEf3wv", "question": "What is inside the Yahoo Mobile phone?", "choices": ["mickey mouse", "cell phone", "sales flier", "person"], "correct_choice_idx": 3, "direct_answers": ["person", "human", "human", "person", "person", "person", "person", "human", "person", "person"], "difficult_direct_answer": false, "rationales": ["The phone has a person.", "The shape of this phone outfit; the room for arms and hands coming out from it, let's us conclude their is likely someone inside of it.", "This is a person dressed as a yahoo mobile phone. you can see their arms."], "image": "train2014/COCO_train2014_000000105605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67115, "question_id": "YK9Jy9E9Hjfy39ndXQGWhQ", "question": "What do giraffes have in the center of their heads?", "choices": ["sting", "single horn", "ossicones", "cornet"], "correct_choice_idx": 2, "direct_answers": ["horns", "horns", "horns", "horns", "horns", "ossicones", "horns", "ossicones", "antennas", "antlers"], "difficult_direct_answer": false, "rationales": ["From a close look a ossicones is seen.", "The giraffes have horns called ossicones.", "They have horns."], "image": "train2014/COCO_train2014_000000067115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416337, "question_id": "YKLXS2gMYHfXyH3gtEBPPk", "question": "Which fruits are the least sweet?", "choices": ["biggest", "green", "most freckled", "smallest"], "correct_choice_idx": 1, "direct_answers": ["bananas", "lemon", "bananas", "green bananas", "banana", "green", "green bananas", "green bananas", "green", "green ones"], "difficult_direct_answer": false, "rationales": ["Green bananas are not yet ripe.", "All of the fruits are bananas. the unripened ones are the least sweet.", "Bananas of varying ripeness are stacked. ripe fruits are generally sweeter than non-ripe fruits. yellow bananas are ripe."], "image": "val2014/COCO_val2014_000000416337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322725, "question_id": "YKWvr63f3hRSpW4ZY2YNJV", "question": "What is the reason for his face being like that?", "choices": ["gross drink", "putrid smell", "physical handicap", "photo"], "correct_choice_idx": 3, "direct_answers": ["taking picture", "photo", "selfie", "funny face", "silly", "fun photograph", "photo", "taking picture", "silly selfie", "fooling around"], "difficult_direct_answer": false, "rationales": ["The person is positioned in front of a photo-taking device making a funny face which is consistent with answer a.", "The man is posing on a photo.", "The man is taking a selfie."], "image": "train2014/COCO_train2014_000000322725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132883, "question_id": "YKYnFy5vZkJhbk9jBdwMH4", "question": "What are the green markings an example of?", "choices": ["art", "typing", "mural", "graffiti"], "correct_choice_idx": 3, "direct_answers": ["graffiti", "graffiti", "graffiti", "graffiti", "graffiti", "graffiti", "graffiti", "graffiti", "graffiti", "graffiti"], "difficult_direct_answer": false, "rationales": ["The markings are graffiti.", "The green markings are that of hoodlums vandalizing street signs.", "Someone has spray-painted them onto the street sign."], "image": "train2014/COCO_train2014_000000132883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12230, "question_id": "YKeVmFFnAwJ23pzjj3wKXA", "question": "How easy would it be to park on the street at this location?", "choices": ["funny", "timely", "hard", "easy"], "correct_choice_idx": 2, "direct_answers": ["hard", "not very", "hard", "difficult", "hard", "hard", "hard", "hard", "not difficult", "hard"], "difficult_direct_answer": false, "rationales": ["The parking lot is full.", "There are already a lot of vehicles", "From what is visible from this image it does not look like there are many available parking spaces which would make it difficult to park in this spot."], "image": "val2014/COCO_val2014_000000012230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297157, "question_id": "YKftXvjzjBWe8mc5ricw79", "question": "What beverage does the woman in black drink?", "choices": ["iced tea", "milk", "coffee", "club soda"], "correct_choice_idx": 0, "direct_answers": ["water", "alcoholic", "iced tea", "water", "iced tea", "water", "water", "tea", "wine", "iced tea"], "difficult_direct_answer": false, "rationales": ["The woman has a lemon slice.", "The woman is drinking iced tea because there is a tea pot and she is drinking the tea in a glass and not a cup", "The woman is drinking iced tea."], "image": "train2014/COCO_train2014_000000297157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333201, "question_id": "YKjkWsovxHCpKAeiPWzYTk", "question": "What part of the person's body is showing?", "choices": ["feet", "head", "nose", "arm"], "correct_choice_idx": 0, "direct_answers": ["foot", "feet", "feet", "feet", "feet", "foot", "feet", "feet", "feet", "feet"], "difficult_direct_answer": false, "rationales": ["There are toes that are visible on the edge of the bed.", "The person's appendage attached to their legs is showing.", "Bear toes are hanging over the side of a bed."], "image": "train2014/COCO_train2014_000000333201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508954, "question_id": "YKz3P3bjduDvb5ebiB5C5Z", "question": "What is the person in white jacket and black pants doing?", "choices": ["calling 911", "filming girl", "getting uber", "texting school"], "correct_choice_idx": 1, "direct_answers": ["recording", "taking photo", "photographing", "supervising", "taking photo", "taking photo", "standing", "filming girl", "taking picture", "photographing"], "difficult_direct_answer": false, "rationales": ["The person appears to be holding a camera that is pointed at the girl. if someone is pointing a camera at another they are likely capturing that person on film.", "This mom wants to capture this fun, non-emergency event.", "The girl is being recorded."], "image": "train2014/COCO_train2014_000000508954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404687, "question_id": "YLZeMvRPaVzLhQwebv9SHx", "question": "What clothing item does the man have most of?", "choices": ["ties", "jeans", "shirts", "gloves"], "correct_choice_idx": 0, "direct_answers": ["ties", "ties", "ties", "ties", "ties", "ties", "ties", "ties", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["The man has a lot of ties on his shoulder.", "The man has three ties over his shoulder.", "He has 3 draped on his shirt"], "image": "val2014/COCO_val2014_000000404687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454543, "question_id": "YLg8bW8kLZuhMTqqxBHYFm", "question": "What propels the ball into the air here?", "choices": ["mind control", "child", "magic", "blow dryer"], "correct_choice_idx": 3, "direct_answers": ["electricity", "air blower", "hair dryer", "hairdryer", "blow dryer", "dryer", "air", "hairdryer", "air pressure", "blowdryer"], "difficult_direct_answer": true, "rationales": ["A blow dryer is pushing a ball into the air.", "The dryer propels the ball.", "The person is using air from the dryer to keep the ball in the air."], "image": "val2014/COCO_val2014_000000454543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402723, "question_id": "YLsELnMnZHkT28bwzuHbcy", "question": "Why is the man on top of the railing?", "choices": ["to wax", "to grind", "to flip", "to clean"], "correct_choice_idx": 1, "direct_answers": ["to grind", "trick", "skating", "skateboarding", "skateboarding", "skateboarding", "skating", "skateboarding", "skateboarding", "skating"], "difficult_direct_answer": false, "rationales": ["That's the skateboard move he's doing.", "He is doing a stunt.", "The man is grinding his skateboard."], "image": "val2014/COCO_val2014_000000402723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372201, "question_id": "YM5dWFzzZEuntrRxW7UUdM", "question": "What kind of payment is needed for an opportunity to ride this machine?", "choices": ["donation", "fare", "salary payment", "volunteer payment"], "correct_choice_idx": 1, "direct_answers": ["ticket", "ticket", "ticket", "fare", "ticket payment", "train ticket", "monetary", "ticket", "metro card", "train ticket"], "difficult_direct_answer": false, "rationales": ["People need to pay fare.", "The payment is the fare.", "People have to pay to use this transportation."], "image": "train2014/COCO_train2014_000000372201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126610, "question_id": "YMDdQc8RKMKPVoNRaWUETS", "question": "What is seen in the top left corner?", "choices": ["eggs", "cheese", "milk", "butter"], "correct_choice_idx": 3, "direct_answers": ["lime", "butter", "stick butter", "butter", "butter", "butter", "butter", "butter", "butter", "butter"], "difficult_direct_answer": false, "rationales": ["It's yellow and next to a wax cardboard box.", "Butter is at the top left.", "This is a box that holds 4 sticks"], "image": "train2014/COCO_train2014_000000126610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1153, "question_id": "YMtNUCqur9gFF4h9gRdbd9", "question": "What person had a 1995 documentary made about their life that had the name of this food item in the title?", "choices": ["tallulah bankhead", "hailee steinfeld", "clara bow", "carmen miranda"], "correct_choice_idx": 3, "direct_answers": ["carmen miranda", "carmen miranda", "chiquita", "carmen miranda", "carmen miranda", "carmen miranda", "carmen miranda", "carmen miranda", "banana", "carmen miranda"], "difficult_direct_answer": false, "rationales": ["Carmen miranda made a documentary about bananas.", "That woman is known for bananas.", "The documentary is called \"bananas is my business\"."], "image": "val2014/COCO_val2014_000000001153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101013, "question_id": "YMuBBZtbyhko6CDeQbrSXC", "question": "It's impossible for this to be which one of these countries?", "choices": ["united states", "jordan", "yemen", "saudi arabia"], "correct_choice_idx": 0, "direct_answers": ["20th", "unknown", "united states", "usa", "china", "romania", "japan", "no", "afghanistan", "england"], "difficult_direct_answer": true, "rationales": ["Horse drawn carriages are in a city street.", "This is most likely taken in a street of the united states.", "This cannot be the untied states because there is an animal on the road."], "image": "val2014/COCO_val2014_000000101013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427796, "question_id": "YMyUbgygisTac9yRw8BDEP", "question": "What is holding up the knives?", "choices": ["glue", "string", "tape", "magnets"], "correct_choice_idx": 3, "direct_answers": ["hooks", "hooks", "hooks", "hooks", "nails", "hooks", "magnets", "magnets", "cabinet", "hooks"], "difficult_direct_answer": false, "rationales": ["The knives are magnetic.", "They are made of metal", "Each knife is laying flat along one of its long sides against a vertical board. no knives are being supported from underneath."], "image": "val2014/COCO_val2014_000000427796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271996, "question_id": "YN7DWesCVdUZuWZjMkX2sY", "question": "What are the clear circles on the front of the car made of?", "choices": ["rubber", "glass", "cotton", "paper"], "correct_choice_idx": 1, "direct_answers": ["glass", "glass", "glass", "lights", "glass", "glass", "glass", "glass", "glass", "silver rod"], "difficult_direct_answer": false, "rationales": ["The clear circles are headlights which are glass.", "The circles are actually headlights.", "The circles are glass."], "image": "train2014/COCO_train2014_000000271996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578294, "question_id": "YN855CMG8eCo2ja2BedDBU", "question": "Those objects are meant to hold what?", "choices": ["juice", "plants", "bread", "flour"], "correct_choice_idx": 1, "direct_answers": ["flowers", "flowers", "plants", "flowers", "flowers", "flowers", "flowers", "flowers", "flowers", "flowers"], "difficult_direct_answer": false, "rationales": ["The objects are for plants.", "Those are vases. vases are used to put flowers in them.", "The shape of these vases indicate that they are made to hold flowers or plants of that shape."], "image": "train2014/COCO_train2014_000000578294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467763, "question_id": "YNSriC4HuhJ7Jqy8zj6tUo", "question": "What does the store sell?", "choices": ["pizza", "hamburgers", "beer", "noodles"], "correct_choice_idx": 3, "direct_answers": ["noodles", "noodles", "noodles", "noodles", "noodles", "chinese food", "noodles", "chinese food", "food", "noodles"], "difficult_direct_answer": false, "rationales": ["The store sells noodles according to the sign.", "There is a bowl of food in the picture.", "The sign clearly indicates what is being sold."], "image": "train2014/COCO_train2014_000000467763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181805, "question_id": "YNdtbo22s42zK6MsWQHAcP", "question": "What is she doing?", "choices": ["riding wave", "paddling ashore", "stealing board", "cleaning surfboard"], "correct_choice_idx": 1, "direct_answers": ["surfing", "surfing", "paddling ashore", "surfing", "surfing", "paddling surfboard", "surfing", "paddling", "paddling", "surfing"], "difficult_direct_answer": false, "rationales": ["The woman wants to reach the beach.", "There are no boards or waves to be seen.", "She is riding in the direction of the waves which move towards the shore."], "image": "val2014/COCO_val2014_000000181805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283704, "question_id": "YNgVZc8SirMiwAUFodKwhA", "question": "How are the objects in the water being powered?", "choices": ["wind", "battery", "gas", "sun"], "correct_choice_idx": 0, "direct_answers": ["wind", "by motor", "wind", "jetski", "wind", "wind gust", "wind", "waves", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["Surfers are holding a large kite and being moved in water.", "The kites are flying in the wind.", "Wind moves things in water. the objects have no motor and are entirely controlled by wind and humans."], "image": "train2014/COCO_train2014_000000283704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32471, "question_id": "YNphn8KxVvjytv8ekyNszf", "question": "What kind of location is the area with grass and trees across from the parking lot?", "choices": ["bike trail", "botanical gardens", "park", "cemetery"], "correct_choice_idx": 3, "direct_answers": ["cemetery", "cemetary", "cemetery", "cemetery", "cemetery", "cemetery", "cemetary", "cemetary", "graveyard", "cemetary"], "difficult_direct_answer": false, "rationales": ["The location is a cemetery as there are graves.", "There is a cemetery in the background behind the fence.", "There are tombstones in the land."], "image": "train2014/COCO_train2014_000000032471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177011, "question_id": "YNt76BxqeNMswSE4kcgcA9", "question": "What word is on the boy to the left's clothing?", "choices": ["mite", "green", "yellow", "apple"], "correct_choice_idx": 0, "direct_answers": ["mogul mite", "mogul mite", "mogul mite", "mogul mite", "model mite", "mogul", "mite", "mogul", "mogul mite", "mite"], "difficult_direct_answer": false, "rationales": ["You can see it says mogul mite on him.", "The boy's clothes say mogul mite.", "It says mite on his outfit."], "image": "train2014/COCO_train2014_000000177011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67749, "question_id": "YP8S3Wbc4BZYuj7f87ihFM", "question": "What is the woman trying to do?", "choices": ["cross street", "board bus", "board car", "jog"], "correct_choice_idx": 0, "direct_answers": ["cross street", "kras road", "cross road", "cross street", "crossing road", "access bus", "road crossing", "cross street", "cross street", "cross"], "difficult_direct_answer": false, "rationales": ["The woman wants to cross.", "The woman is prepared to cross the street.", "The woman has a bag and is going over the street."], "image": "train2014/COCO_train2014_000000067749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534511, "question_id": "YPGZpQkykqioEYgHPCodqt", "question": "What was the man doing before he stood up?", "choices": ["running", "wrestling", "skating", "biking"], "correct_choice_idx": 3, "direct_answers": ["phone call", "riding bike", "sitting", "cycling", "biking", "texting", "riding bike", "riding bicycle", "riding bike", "biking"], "difficult_direct_answer": false, "rationales": ["There is a bike leaning on the man.", "The man has a bike next to him.", "This is indicated by the red and black bike in the foreground."], "image": "train2014/COCO_train2014_000000534511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18670, "question_id": "YPN9HNzzrSPTn3TtX4Vyje", "question": "Why is the newspaper there?", "choices": ["wipe hands", "protect table", "wrap food", "reading material"], "correct_choice_idx": 1, "direct_answers": ["table cover", "protect table", "table cloth", "protect table", "protection", "protect table", "cover", "tablecloth", "prevent mess", "protection"], "difficult_direct_answer": false, "rationales": ["The newspaper keeps the table clean.", "The newspaper keeps crumbs from falling.", "This keeps the food off of it"], "image": "train2014/COCO_train2014_000000018670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250802, "question_id": "YPUURmbdB8FFaJnRzidMou", "question": "What can this person do in the state of california?", "choices": ["practice medicine", "teach", "fish", "drive"], "correct_choice_idx": 1, "direct_answers": ["drive", "poster", "drive", "drive", "drive", "drive cars", "drive car", "drive", "teach", "drive"], "difficult_direct_answer": false, "rationales": ["This person has a teaching permit.", "There are items that shows the woman can teach.", "There is a california drivers license."], "image": "train2014/COCO_train2014_000000250802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444744, "question_id": "YPUwuX8ehbemuzu8AMZ3G8", "question": "What time of the day this meal is usually eaten?", "choices": ["lunch", "dinner", "snack", "breakfast"], "correct_choice_idx": 1, "direct_answers": ["dinner", "dinner", "dinner", "lunch dinner", "dinner", "evening", "lunch", "dinner", "evening", "evening"], "difficult_direct_answer": false, "rationales": ["This meal of pizza is associated with the evening during which dinner takes place.", "The lighting looks a bit dark.", "The food is hearty and heavy. people make a meal of it."], "image": "train2014/COCO_train2014_000000444744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57535, "question_id": "YPbPj6EzbzfUdPXasuPwtd", "question": "What is the person near?", "choices": ["bed", "table", "ski poles", "cow"], "correct_choice_idx": 2, "direct_answers": ["snow", "snow", "skiis", "top", "ground", "ski slope", "snow", "snow", "another person", "ski poles"], "difficult_direct_answer": false, "rationales": ["There are poles by the person.", "These items are visible in the photo.", "The person is next to some ski poles in the snow."], "image": "val2014/COCO_val2014_000000057535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165862, "question_id": "YPbqxCzeFEnKSnKNHob7Sw", "question": "What type of hat does the man wearing jeans have on his head?", "choices": ["fedora", "bucket hat", "safari hat", "top hat"], "correct_choice_idx": 1, "direct_answers": ["tree", "bucket hat", "fishing", "bucket", "fishing hat", "baseball", "puma", "cowboy hat", "fishing", "cloth"], "difficult_direct_answer": true, "rationales": ["The hat is a bucket hat.", "The man has a bucket hat on.", "The hat is really floppy."], "image": "val2014/COCO_val2014_000000165862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232503, "question_id": "YPsXdJLVKEwTnogSxYmAFP", "question": "Who is the manufacturer of the bus?", "choices": ["man", "new flyer", "tata", "volvo"], "correct_choice_idx": 0, "direct_answers": ["veolia", "man", "man", "veolia", "veolia", "veolia", "man", "veolia", "man", "man"], "difficult_direct_answer": false, "rationales": ["The metal or gray painted lettering at the front center on the red makes this clear.", "A volvo logo is on the front of a bus.", "The brand of the vehicle is written on the front of the bus."], "image": "val2014/COCO_val2014_000000232503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302415, "question_id": "YPwvYqBQB4nsTbU6CGgHeB", "question": "What beverage is contained in the glass?", "choices": ["soda", "beer", "red wine", "juice"], "correct_choice_idx": 2, "direct_answers": ["wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "red wine"], "difficult_direct_answer": false, "rationales": ["The drink is in a wine glass and it is dark.", "There are bottles of wine near the glass and the glass is filled with wine.", "That is a darker wine."], "image": "train2014/COCO_train2014_000000302415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253883, "question_id": "YQ6B4CvnG4wwmbNk7wF3id", "question": "What age class do most people here belong to?", "choices": ["middle age", "seniors", "juniors", "youth"], "correct_choice_idx": 1, "direct_answers": ["seniors", "sixty's", "retirees", "senior", "senior", "senior age", "middle aged", "elder", "fifties sixties", "senior"], "difficult_direct_answer": false, "rationales": ["Most of the people seen look like they are senior citizens.", "Most of the people have gray hair which would mean they are fairly old.", "They all have gray hair."], "image": "train2014/COCO_train2014_000000253883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43431, "question_id": "YQQ6kyfmSYTvUzuNS5krDJ", "question": "What is a big event in this sport?", "choices": ["wimbledon", "indy 500", "superbowl", "world series"], "correct_choice_idx": 0, "direct_answers": ["wimbledon", "wimbledon", "open", "wimbledon", "tennis", "world tour", "tennis", "wimbledon", "atp tour", "open"], "difficult_direct_answer": false, "rationales": ["This is tennis. wimbledon is a big event in the sport of tennis.", "The event is wimbledon.", "Wimbledon is the biggest tennis event."], "image": "train2014/COCO_train2014_000000043431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196378, "question_id": "YQZg9UnygSe6ojiNJScMzD", "question": "What does the headgear of the lady in pink represent?", "choices": ["royalty", "worker", "athlete", "motorcyclist"], "correct_choice_idx": 0, "direct_answers": ["royalty", "princess peach", "crown", "crown", "crown", "royalty", "crown", "crown", "princess", "royalty"], "difficult_direct_answer": false, "rationales": ["The tiny crown on the pink lady's head mimics that of the royalty found in various countries around the world. they invariably include a display of precious gems, as does this one.", "A girl has a crown on her head.", "The woman is wearing a crown."], "image": "val2014/COCO_val2014_000000196378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337993, "question_id": "YQikkNsN82DLjhZPTdSC8C", "question": "Why is the man bringing his arms to his chest?", "choices": ["for speed", "to tackle", "to spin", "to dance"], "correct_choice_idx": 0, "direct_answers": ["keep warm", "go faster", "speed", "reduce friction", "aero dynamic", "warm up", "aerodynamic", "for speed", "speed up", "skiing"], "difficult_direct_answer": true, "rationales": ["The man goes for speed.", "The individual is attempting to generate more kinetic force by reducing air drag.", "He is trying to go faster so he pulls his body in as much as possible."], "image": "train2014/COCO_train2014_000000337993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140585, "question_id": "YQjFVpSNwxjkJt3WXp9pj4", "question": "Who stars in the studio/theater marked Late Show?", "choices": ["mark twain", "conan obrien", "stephen colbert", "agnes morehead"], "correct_choice_idx": 2, "direct_answers": ["samuel", "stephen colbert", "david letterman", "colbert", "stephen colbert", "david sullivan", "stephen colbert", "david letterman", "stephen colbert", "ed silver"], "difficult_direct_answer": false, "rationales": ["The late show stars stephen colbert.", "The other options don't apply. d is known for the tonight show.", "Stephen colbert is the closest answer to being correct, but it is not actually correct because this picture is from when david letterman still had the show and that's his name is on the marquee."], "image": "train2014/COCO_train2014_000000140585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117764, "question_id": "YQxeQcRoRuJ5gcz9VFCjyi", "question": "What kind of action is the boy taking?", "choices": ["typing", "throwing", "running", "kicking"], "correct_choice_idx": 0, "direct_answers": ["typing", "playing game", "writting", "learning", "typing", "typing", "typing", "typing", "game playing", "learning"], "difficult_direct_answer": false, "rationales": ["The action is typing.", "He is using the keyboard on a laptop.", "The boy is using a laptop toy."], "image": "train2014/COCO_train2014_000000117764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452205, "question_id": "YQyr6pFg5JXpxc9XXatxnz", "question": "What is this dog's owner doing?", "choices": ["driving", "shaving", "drinking", "sewing"], "correct_choice_idx": 0, "direct_answers": ["driving", "driving", "smiling", "driving", "driving", "driving", "driving", "smiling", "playing", "driving"], "difficult_direct_answer": false, "rationales": ["The dog's owner drives.", "The dog is in a vehicle. their owner is driving.", "The dog's owner is going for a drive."], "image": "train2014/COCO_train2014_000000452205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430624, "question_id": "YRCGbH5PcSDRjKWFV3enBy", "question": "Items offered here were cooked inside what?", "choices": ["crock pot", "electric skillet", "oven", "fire"], "correct_choice_idx": 2, "direct_answers": ["oven", "oven", "oven", "oven", "oven", "oven", "oven", "packed", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["Bakery items are usually cooked all around, so they have to be inside something for the heat to go around instead of the heat being directly under it only.", "The items are considered pastries which are traditionally cooked in an oven.", "The items were baked."], "image": "train2014/COCO_train2014_000000430624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543895, "question_id": "YRKukX4BNoumiZdrszLe7n", "question": "What just ended here?", "choices": ["fight", "snow", "rain", "argument"], "correct_choice_idx": 2, "direct_answers": ["play", "rain", "frisbee game", "frisbee playing", "deal", "sale", "frisbee", "game", "frisbee game", "relationship"], "difficult_direct_answer": true, "rationales": ["Looks like it was raining and now it's not.", "The ground is all wet.", "We can conclude due to the wet ground and the umbrellas being put away that this area has recently seen precipitation."], "image": "val2014/COCO_val2014_000000543895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186697, "question_id": "YRL6Vy7kGhkxsZH9Nj8Sza", "question": "How many city buses are likely to travel this route?", "choices": ["none", "one", "four", "eight"], "correct_choice_idx": 0, "direct_answers": ["zero", "one", "zero", "zero", "one", "one", "zero", "none", "one", "zero"], "difficult_direct_answer": false, "rationales": ["City buses don't come to forests.", "It is a narrow road. city buses are wide.", "There are no city buses who could fit on this thin road."], "image": "val2014/COCO_val2014_000000186697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112988, "question_id": "YRM5M8MURg8iNSfMvxHFMX", "question": "Which age range may this cake have been for?", "choices": ["child", "grandparent", "teenager", "young adult"], "correct_choice_idx": 0, "direct_answers": ["adolescent", "child", "child", "under five", "child", "young child", "2-8", "young", "toddlers", "kids"], "difficult_direct_answer": false, "rationales": ["It had the picture of a cartoon character on it.", "There is a cartoon on the cake.", "It has peter rabbit on it"], "image": "val2014/COCO_val2014_000000112988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322255, "question_id": "YRW4aPKNBxpa9BCXuNrKzK", "question": "Why is his head covered?", "choices": ["fashion", "religion", "protection", "warmth"], "correct_choice_idx": 2, "direct_answers": ["protection", "protection", "protect head", "protection", "protection", "for protection", "safety", "accident protection", "for protection", "safety"], "difficult_direct_answer": false, "rationales": ["Skateboarding is done on concrete. concrete is hard. a head is not hard. the helmet protects the head if it comes in contact with the concrete.", "If he falls down and hits his head, he can get hurt.", "It will prevent head injuries in a fall"], "image": "train2014/COCO_train2014_000000322255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352841, "question_id": "YRWvFfw8PuXqRg4q62Xaru", "question": "What is on the top shelf?", "choices": ["cups", "plates", "toaster", "oil"], "correct_choice_idx": 0, "direct_answers": ["bowls", "cups", "cups", "cup", "vases", "bowls", "mug", "water bottle", "bottle", "wine glass"], "difficult_direct_answer": false, "rationales": ["The toaster and oil are on the counter and sink. the items on the top shelf are used to hold liquid.", "On the top shelf there is a stack of cups.", "Drinkware can be seen through the glass on the top shelf"], "image": "train2014/COCO_train2014_000000352841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260774, "question_id": "YRdFanhfkguoAiGUYR9tFP", "question": "What is the dog doing on the ground?", "choices": ["laying", "eating", "grooming", "playing"], "correct_choice_idx": 0, "direct_answers": ["resting", "laying", "sleeping", "sleeping", "sleeping", "resting", "sleeping", "sleeping", "resting", "sleeping"], "difficult_direct_answer": false, "rationales": ["This dog's legs are not standing upright but are outstretched and akimbo. this dog is resting and likely asleep.", "The dog is laying on the ground while sleeping.", "The dog is laying."], "image": "val2014/COCO_val2014_000000260774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184138, "question_id": "YRvuzccLL5YUH6J6xq5huL", "question": "What type of shop is this?", "choices": ["auto", "shoe", "music", "food"], "correct_choice_idx": 0, "direct_answers": ["tire shop", "tire", "auto", "mechanic", "autobody", "tire shop", "auto sales", "auto", "car repair", "tire"], "difficult_direct_answer": false, "rationales": ["There is a car and spare tires.", "This is the most likely answer given the garage in the background and tires.", "The shop sells cars."], "image": "train2014/COCO_train2014_000000184138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137694, "question_id": "YRyKeEK6PAeDWELn8HRKhs", "question": "What color is he waiting for?", "choices": ["pink", "blue", "purple", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["This color is often used to mean go, start or okay.", "A green light means go.", "He's waiting for the light to turn green."], "image": "train2014/COCO_train2014_000000137694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64961, "question_id": "YS93N7U325YmJNqJeGssPJ", "question": "The shade held by the teal umbrella pole was crafted in which manner?", "choices": ["carving", "sewing machine", "weaving", "axe"], "correct_choice_idx": 2, "direct_answers": ["thatch", "bamboo", "woven", "weaving", "wicker", "by hand", "weave", "thatched", "sticks", "weaving"], "difficult_direct_answer": true, "rationales": ["The shade is woven.", "This looks like it was hand made and weaved together.", "The umbrella is made with a material that has been folded over and under, again and again. weaving is used to make a variety of items."], "image": "val2014/COCO_val2014_000000064961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180002, "question_id": "YSPzLg6fuFRjvxhtzWSXXh", "question": "What do these people hope for in the ocean today?", "choices": ["red tide", "doldrums", "high waves", "calm water"], "correct_choice_idx": 2, "direct_answers": ["high waves", "waves", "waves", "waves", "waves", "surfing", "waves", "high waves", "many waves", "good waves"], "difficult_direct_answer": false, "rationales": ["Surfers need waves to surf.", "The people want waves.", "These people are surfing. they hope that the water will be rough, not calm."], "image": "train2014/COCO_train2014_000000180002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337452, "question_id": "YSSjiQ4aL5jUP2bz6wDQJ8", "question": "What type of skate maneuver is the boy in white about to perform?", "choices": ["drop in", "wall ride", "flip", "manual"], "correct_choice_idx": 0, "direct_answers": ["grinding", "jump", "drop in", "jump", "regular foot", "wheelie", "skate bowl", "drop-in", "going down", "ollie"], "difficult_direct_answer": true, "rationales": ["Going from a platform into a steep transition is called dropping in.", "He's at the top of the half pipe and has to do that first in order to do tricks inside it.", "He is about to descend."], "image": "train2014/COCO_train2014_000000337452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495433, "question_id": "YSTKjSxWFAD8XmVCCcM8xr", "question": "What are the round objects on the boats used for?", "choices": ["hoisting sails", "going faster", "stopping suddenly", "steering boat"], "correct_choice_idx": 0, "direct_answers": ["hoisting sails", "life preserver", "life saving", "bunkers", "going", "lifesaving", "fishing", "steering", "safety", "life preservers"], "difficult_direct_answer": true, "rationales": ["There are long poles on the boats.", "They move the rope lines easily. rope lines are attached to the material that catches the wind and moves the boat.", "They help to raise the sails."], "image": "train2014/COCO_train2014_000000495433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26310, "question_id": "YSiFePLEdRNEF8w8S3MbTv", "question": "What does the water create for the surf boarders to ride on?", "choices": ["fountain", "salt", "foam", "wave"], "correct_choice_idx": 3, "direct_answers": ["waves", "waves", "waves", "wave", "waves", "waves", "waves", "waves", "waves", "waves"], "difficult_direct_answer": false, "rationales": ["The water creates waves.", "The answer is known and also visible in the image.", "A man is on a surfboard with foam from a wave breaking around him."], "image": "train2014/COCO_train2014_000000026310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549942, "question_id": "YSrGsaiuasrbZQSSf4vpgf", "question": "What's on the ceiling?", "choices": ["fan", "paint", "lamp", "nothing"], "correct_choice_idx": 0, "direct_answers": ["fan", "fan", "fan", "fan", "fan", "fan", "fan", "lamp fan", "fan", "fan"], "difficult_direct_answer": false, "rationales": ["A fan is on the ceiling.", "Many homes have a fan on the ceiling to help with cooling", "Many people use ceiling fans to better ventilate their homes."], "image": "train2014/COCO_train2014_000000549942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11504, "question_id": "YSrXvkuZFa4AViSqBX2Bhk", "question": "How are the sandwich eggs cooked?", "choices": ["hard-boiled", "scrambled", "fried", "poached"], "correct_choice_idx": 0, "direct_answers": ["hard boiled", "steaming", "stove", "hard boiled", "boiled", "hard boiled", "cold", "boiled", "hard-boiled", "hard boiled"], "difficult_direct_answer": false, "rationales": ["The eggs are hard on not runny.", "The eggs are visible on the sandwich. based on the consistency visible in the yolk and white of the egg, answer a is likely.", "They are smooth and the yolk is intact"], "image": "train2014/COCO_train2014_000000011504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42805, "question_id": "YT279dn4W8YSPrEYTxtGSi", "question": "Who use the big umbrellas?", "choices": ["hawkers", "shoppers", "residents", "shop owners"], "correct_choice_idx": 0, "direct_answers": ["business owners", "tourists", "hawkers", "vendors", "people", "people", "diners", "people", "merchants", "people outside"], "difficult_direct_answer": false, "rationales": ["Hawkers are using the big umbrellas.", "People are shopping in the alley. the umbrellas can be used by them.", "They are there to give shade to the shoppers."], "image": "val2014/COCO_val2014_000000042805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124979, "question_id": "YT5hacVb7emknQ7Fsn4aSk", "question": "What is the man doing on the bench?", "choices": ["reading", "napping", "playing", "eating"], "correct_choice_idx": 1, "direct_answers": ["sleeping", "sleeping", "sleeping", "napping", "sleeping", "sleeping", "sleeping", "resting", "sleeping", "resting"], "difficult_direct_answer": false, "rationales": ["This man rests his head on a backpack and is in the fetal position with his eyes closed sleeping.", "The man is sleeping.", "His eyes are closed and he is lying down."], "image": "val2014/COCO_val2014_000000124979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240084, "question_id": "YTBffkW8cmXF5CmCHuEmEb", "question": "When the first traffic light was invented?", "choices": ["1881", "1986", "1896", "1868"], "correct_choice_idx": 3, "direct_answers": ["1923", "modern times", "1868", "1900", "1914", "1868", "1914", "1914", "1890", "1914"], "difficult_direct_answer": false, "rationales": ["The light is from 1868.", "The traffic light was invented in the year 1868.", "The traffic light came into being in the mid 1800s."], "image": "val2014/COCO_val2014_000000240084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99952, "question_id": "YTZyqW3Hu2e54buKkr2dTg", "question": "What is the umbrella being used for?", "choices": ["keeping dry", "decoration", "lighting", "to dance"], "correct_choice_idx": 2, "direct_answers": ["lighting", "photography", "lighting", "taking photos", "lighting", "light", "photography lighting", "photos", "light", "light shade"], "difficult_direct_answer": false, "rationales": ["The umbrella is used for photographic lighting.", "The umbrella is used for lighting because it is a place where they are taking pictures", "The umbrella is used for lighting a photo shoot."], "image": "train2014/COCO_train2014_000000099952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481670, "question_id": "YTt2vcBHQQGHN4G4NQuzPY", "question": "What are the boys using the poles for?", "choices": ["balancing", "hitting", "jousting", "poking"], "correct_choice_idx": 0, "direct_answers": ["balance", "skiing", "balance", "balance", "skiing", "pushing", "balancing", "balance", "skiing", "ski poles"], "difficult_direct_answer": false, "rationales": ["The skiers have poles to help them stay up.", "Two kids are skiing on a hill and are using poles in each hand.", "The boys need to keep from falling."], "image": "val2014/COCO_val2014_000000481670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449798, "question_id": "YU7yQnfqypm4iXhj7yB8Qp", "question": "Which female superhero is on the left corner of the desk?", "choices": ["black widow", "she hulk", "wonder woman", "harley quinn"], "correct_choice_idx": 2, "direct_answers": ["wonder woman", "wonder woman", "wonder woman", "wonder woman", "wonder woman", "wonder woman", "wonder woman", "wonder woman", "wonder woman", "wonderwoman"], "difficult_direct_answer": false, "rationales": ["Has a blue red and white patriotic uniform on her.", "Wonder woman is displayed.", "Only one superhero is defined by her armored bracelets, magic lasso, red/blue/gold outfit and a tiara."], "image": "val2014/COCO_val2014_000000449798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337083, "question_id": "YUWjh2Zj4BvaL5dYnLfucU", "question": "What type cleaning methods might be used here?", "choices": ["natural", "high chemical", "bleach only", "none"], "correct_choice_idx": 0, "direct_answers": ["organic", "pressure wash", "sweeping", "soap detergent", "dry clean", "wiping", "natural", "soap", "window washing", "dry cleaning"], "difficult_direct_answer": true, "rationales": ["Only natural on this.", "This cleaning establishment touts itself as 'organic'. organic is normally associated with natural methods and ingredients.", "Too many chemicals could hurt the surface of materials."], "image": "val2014/COCO_val2014_000000337083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339034, "question_id": "YUaR389GppVrdoQPj3dPWa", "question": "What is the woman holding the phone wearing?", "choices": ["baseball cap", "hoop earring", "headphones", "sunglasses"], "correct_choice_idx": 1, "direct_answers": ["dress", "floral dress", "dress", "ponytail", "dress", "dress", "dress", "floral dress", "phone", "hoop earring"], "difficult_direct_answer": false, "rationales": ["She has big hoop earings on.", "There are large gold circles hanging from her ears.", "The woman is wearing large round earrings."], "image": "train2014/COCO_train2014_000000339034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27307, "question_id": "YUgHHAWsURU9zyjsS9VCfo", "question": "What is most likely to make their clothes dirty sometime soon?", "choices": ["grass", "tomatoes", "blood", "oil"], "correct_choice_idx": 0, "direct_answers": ["grass", "grass", "grass", "grass", "grass", "ground", "play", "outfielders", "sliding", "sliding"], "difficult_direct_answer": false, "rationales": ["These are baseball players playing on a natural, green surface. it is likely that one of them dives to catch a ball or slides across the field and stains the uniform.", "Though all these answers are viable but by the setting and the sport they play, you can find the answer.", "The grass stains."], "image": "train2014/COCO_train2014_000000027307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472478, "question_id": "YUiHicF7RUH26Gq57pXaUX", "question": "When was the beverage brand founded whose name appears on the glass?", "choices": ["1922", "1876", "1947", "1776"], "correct_choice_idx": 1, "direct_answers": ["beer", "1876", "budweiser", "budweiser", "1876", "1876", "1876", "beer", "beer", "1876"], "difficult_direct_answer": false, "rationales": ["The beer is budweiser. budweiser was founded in 1876.", "The year on the bottle is 1876.", "The beverage brand of budweiser was debut in the year 1876."], "image": "val2014/COCO_val2014_000000472478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254615, "question_id": "YVC3QnvYMtBRB68NMoHLS6", "question": "How many months before Christmas was this photo taken?", "choices": ["six", "one", "ten", "nine"], "correct_choice_idx": 3, "direct_answers": ["nine", "nine months", "eight", "two", "nine", "nine", "nine", "nine", "nine", "nine months"], "difficult_direct_answer": false, "rationales": ["The photo was taken in march.", "It is in wintertime so it can't be long before christmas.", "There are 9 months."], "image": "train2014/COCO_train2014_000000254615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561660, "question_id": "YVJy8FZRhZ6BSLN69co7b4", "question": "Where is the mouse plugged in?", "choices": ["surge protector", "monitor", "charging station", "keyboard"], "correct_choice_idx": 3, "direct_answers": ["keyboard", "on mac", "keyboard", "laptop", "computer system", "computer", "keyboard", "computer", "computer", "computer"], "difficult_direct_answer": false, "rationales": ["The mouse looks to be plugged into the keyboard.", "The mouse is near the keyboard.", "The mouse is resting near the keyboard."], "image": "train2014/COCO_train2014_000000561660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20839, "question_id": "YW7bKm8hGWo7JqHAFiWgiu", "question": "What does the number on the sign represent?", "choices": ["speed limit", "car weight", "degree turn", "route number"], "correct_choice_idx": 3, "direct_answers": ["route", "highway number", "highway", "road number", "interstate", "route number", "highway", "time", "route", "road number"], "difficult_direct_answer": false, "rationales": ["There is a route number on the highway marker.", "It is a highway marker sign.", "An intersection has a highway sign on the corner."], "image": "train2014/COCO_train2014_000000020839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364567, "question_id": "YWDosBxgBqbt6AX42ujk9q", "question": "What sort of sporting event is being practiced here?", "choices": ["steeple chase", "barrel racing", "square dancing", "bronco busting"], "correct_choice_idx": 0, "direct_answers": ["horseback riding", "steeple chase", "horse riding", "show jumping", "equestrian", "horse", "horsejumping", "horse jumping", "equestrian track", "equestrianism"], "difficult_direct_answer": true, "rationales": ["This is a horse racing event.", "The horse is jumping over the poles.", "In steeplechase, animals try to jump over obstacles."], "image": "val2014/COCO_val2014_000000364567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257328, "question_id": "YWHYbk5eYrkq4gun2ZtwLs", "question": "What happened to these bananas?", "choices": ["fried", "peeled", "baked", "chopped"], "correct_choice_idx": 1, "direct_answers": ["peeled", "peeled", "peeled", "peeled", "peeled", "peeled", "peeled", "peeled", "peeled", "peeled"], "difficult_direct_answer": false, "rationales": ["The other options don't match. they're still whole and not fried or baked.", "The bananas don't have peels.", "The bananas no longer have skins on them."], "image": "val2014/COCO_val2014_000000257328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226505, "question_id": "YWNzJznAV5ghjjJRoChKAY", "question": "What type of bus is shown?", "choices": ["shuttle", "double-decker", "school", "passenger"], "correct_choice_idx": 3, "direct_answers": ["city", "city", "transport bus", "public transit", "transit bus", "passenger", "metro", "city", "city bus", "transport"], "difficult_direct_answer": false, "rationales": ["This bus does not have the colors of a school bus, the height of a double decker and is too big to be a shuttle.", "The bus carries people from one destination to another.", "A large single level transportation vehicle which is not yellow probably just holds passengers."], "image": "train2014/COCO_train2014_000000226505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250815, "question_id": "YWPTCQtbLPbjVZuJYebgQB", "question": "What are the large pieces of cake supposed to be?", "choices": ["barbies", "legos", "minions", "racecars"], "correct_choice_idx": 1, "direct_answers": ["legos", "legos", "bricks", "lego", "chocolate", "legos", "legos", "legos", "lego", "buildings"], "difficult_direct_answer": false, "rationales": ["The pieces have round top and are rectangle like a lego.", "The pieces are legos.", "The pieces of cake visible are shaped and colored like answer a and are likely intended to resemble them."], "image": "train2014/COCO_train2014_000000250815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349934, "question_id": "YWPfQ5Yn8boRQYWDhyzBhh", "question": "The players are wearing the same shirts because they play in a what?", "choices": ["family reunion", "league", "school", "random match"], "correct_choice_idx": 1, "direct_answers": ["league", "league", "team", "team", "team", "team", "team", "frisbee", "team", "team"], "difficult_direct_answer": false, "rationales": ["The players are part of a team.", "The shirts are matching because they play on the same team.", "The players have matching shirts. they must be teammates."], "image": "train2014/COCO_train2014_000000349934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575287, "question_id": "YWVL9BQfxNZzPDjyWwkPea", "question": "If you needed stamps here what business might you enter?", "choices": ["grocery", "street vendors", "post office", "church"], "correct_choice_idx": 2, "direct_answers": ["post office", "post office", "post office", "post office", "post office", "post office", "post office", "post office", "post office", "post office"], "difficult_direct_answer": false, "rationales": ["The stamps are from the post office.", "If you needed stamps there is a post office on the right.", "Answer a is the most common place to buy stamps and there is a sign to the right with writing that could identify the building as answer a."], "image": "val2014/COCO_val2014_000000575287.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543622, "question_id": "YWbueZj9wZLjxvzrswWTjG", "question": "Where is the truck?", "choices": ["mail delivery", "mall", "trash pickup", "fire"], "correct_choice_idx": 3, "direct_answers": ["on road", "road", "on street", "road", "on road", "mexico", "fire", "road", "road", "street"], "difficult_direct_answer": false, "rationales": ["This is indicated by the color and ladder on top.", "The truck has firefighting equipment since it's red.", "The truck is red and it has a yellow stripe on it."], "image": "train2014/COCO_train2014_000000543622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212083, "question_id": "YWbwJXoF5j9NYkZh7qougq", "question": "What does the silver box on top of the cart do?", "choices": ["play music", "record movies", "project picture", "store money"], "correct_choice_idx": 2, "direct_answers": ["project", "project", "unknown", "projects", "project picture", "viewing", "move", "project", "project images", "project"], "difficult_direct_answer": false, "rationales": ["The silver box on top of the cart projects the picture the men are looking at to play a video game", "The silver box projects a picture on the screen.", "The picture on the screen comes from the silver box on the table."], "image": "train2014/COCO_train2014_000000212083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285462, "question_id": "YWoGFxJgMiTatjjgJbfYST", "question": "What is the woman doing with her arm?", "choices": ["waving", "throwing", "hitting", "stretching"], "correct_choice_idx": 0, "direct_answers": ["raises it", "waving", "posing", "waving", "displaying", "waving", "waving", "waving", "waving", "waving"], "difficult_direct_answer": false, "rationales": ["The outstretched hand is a greeting.", "This woman's arms express a greeting and are posed for the photo.", "The baby seems to be waving to the camera."], "image": "val2014/COCO_val2014_000000285462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120356, "question_id": "YWyvHrcxQYkEe7H8maNokw", "question": "Why does the man have a remote strapped to his wrist?", "choices": ["for control", "by law", "to fight", "for fashion"], "correct_choice_idx": 0, "direct_answers": ["playing wii", "playing game", "playing game", "stability", "playing game", "playing wii", "playing nintendo", "keepin place", "for control", "loss prevention"], "difficult_direct_answer": false, "rationales": ["So when he's playing he doesnt lose it.", "The strap helps the person play without worrying about dropping it.", "He might fling it across the room if the strap isn't used."], "image": "val2014/COCO_val2014_000000120356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175615, "question_id": "YWzpBtb2rrSMWqRejQCFFH", "question": "Why is the woman carrying sandals as she is walking?", "choices": ["they broke", "style", "laziness", "comfort"], "correct_choice_idx": 3, "direct_answers": ["bag", "easier walking", "exercising", "enjoying sand", "beach", "easier", "feel beachsand", "sand", "comfort", "exercising"], "difficult_direct_answer": true, "rationales": ["The woman wants comfort.", "It is easier to walk barefoot on the beach.", "She is carrying them so she does not get sand in them and it's easier to walk barefoot on sand sometimes then with shoes on."], "image": "val2014/COCO_val2014_000000175615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438329, "question_id": "YX8cpYAqWjKn4Cb5BRo53D", "question": "Why is he smiling?", "choices": ["wrote paper", "won prize", "not caught", "for camera"], "correct_choice_idx": 3, "direct_answers": ["happeaness", "for picture", "happy", "picture", "photo", "cool tie", "happy", "for pictures", "for camera", "he's happy"], "difficult_direct_answer": true, "rationales": ["He is looking straight ahead which tells us he is posing for a picture.", "He's looking at the photographer", "The man is posing."], "image": "train2014/COCO_train2014_000000438329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248089, "question_id": "YXNiAYX8m7aYUexXde8esA", "question": "How many keys does a Wii Remote have?", "choices": ["nine", "four", "eight", "11"], "correct_choice_idx": 0, "direct_answers": ["eight", "seven", "nine buttons", "eight", "seven", "nine", "nine", "twelve", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["There are 9 keys.", "The remote has nine keys.", "By the count it seems to be nine."], "image": "val2014/COCO_val2014_000000248089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147520, "question_id": "YXNzPGtEMSeVULeb7vnXt4", "question": "Why is the skier crouching?", "choices": ["is hiding", "less resistance", "is falling", "see more"], "correct_choice_idx": 1, "direct_answers": ["improve aerodynamics", "less drag", "balance", "speed", "aerodynamics", "for speed", "increased speed", "balance", "less resistance", "changing speed"], "difficult_direct_answer": true, "rationales": ["If you pull yourself in as much as possible you can go faster.", "A person is skiing down a hill and is crouched a bit while concentrating and leaning forward.", "The skier wants to move faster."], "image": "train2014/COCO_train2014_000000147520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416912, "question_id": "YXuiZqLuimjeLCkr3tdkeH", "question": "The canopy wing is used for?", "choices": ["kiting", "surfing", "parasailing", "swimming"], "correct_choice_idx": 2, "direct_answers": ["flight", "air surfing", "lift", "wind", "hold surfer", "parasailing", "catch wind", "momentum", "slow down", "surfing"], "difficult_direct_answer": true, "rationales": ["This is used for parasailing.", "The canopy wing is for parasailing.", "When a person is on a board while hold kite string to a parachute they are sailing at sea on a single manned craft."], "image": "train2014/COCO_train2014_000000416912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204232, "question_id": "YXxrjbSoudrhwyu4i9KWxK", "question": "Why is the man jumping over the barrier?", "choices": ["to escape", "to exercise", "for payment", "doing tricks"], "correct_choice_idx": 3, "direct_answers": ["fun", "parkour", "fun", "doing stunts", "skateboard trick", "doing tricks", "skateboard trick", "skating", "thrill seeker", "skateboard trick"], "difficult_direct_answer": false, "rationales": ["The man is riding a skateboard and doing tricks is a common activity when using a skateboard.", "There is a skateboard in between the barrier and the man. the skateboard is upside down.", "The man is on his skateboard."], "image": "val2014/COCO_val2014_000000204232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289260, "question_id": "YYCM5hUNZycFYARsMYqvCK", "question": "In what type building is this located?", "choices": ["gym", "museum", "basketball hall", "residence"], "correct_choice_idx": 1, "direct_answers": ["museum", "hut", "museum", "museum", "house", "old building", "museum", "museum", "museum", "old type"], "difficult_direct_answer": false, "rationales": ["The room has some old items inside.", "The building is a museum.", "The sign providing information on the room is used in museums."], "image": "val2014/COCO_val2014_000000289260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293860, "question_id": "YYK6Xqps9L44PBEH5D6aPQ", "question": "What type passengers ride the ghost bus?", "choices": ["mummies", "tourists", "daddies", "zombies"], "correct_choice_idx": 1, "direct_answers": ["ghosts", "tourists", "tourists", "tourists", "tourists", "tourists", "ghost", "tourists", "tourists", "tourist"], "difficult_direct_answer": false, "rationales": ["The passengers on the tour bus are usually tourists.", "The bus is boarded by tourist around the city.", "A double decker bus is moving down a street."], "image": "train2014/COCO_train2014_000000293860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224017, "question_id": "YYKj4LhjdbkwDSeqZaBCjY", "question": "Who do the blue signs show accommodations for?", "choices": ["deaf", "seniors", "blind", "handicapped"], "correct_choice_idx": 3, "direct_answers": ["handicapped", "wheelchair users", "vehicles", "handicap", "handicapped parking", "wheelchair", "handicapped", "handicapped", "toller", "wheelchairs"], "difficult_direct_answer": false, "rationales": ["A building is shown with multiple blue signs with a person in a wheelchair on them.", "The signs are for handicapped people.", "The blue signs have a person in a wheelchair."], "image": "train2014/COCO_train2014_000000224017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224695, "question_id": "YYgKdUrJtQHobx2UXBQjDm", "question": "What is the woman's orientation in relation to the cows?", "choices": ["in between", "behind", "in front", "below"], "correct_choice_idx": 0, "direct_answers": ["between them", "owner", "middle", "inbetween", "in between", "center", "sideways right", "farmer", "between", "between them"], "difficult_direct_answer": true, "rationales": ["There is a cow on each side of her", "You can tell what position she is in as to where she is in relation.", "There is a cow on each side of the woman."], "image": "train2014/COCO_train2014_000000224695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297645, "question_id": "YYgwEnndzo2c9pUa8qE74a", "question": "What are the people queueing up for?", "choices": ["entering zoo", "entering park", "feeding elephants", "riding elephants"], "correct_choice_idx": 2, "direct_answers": ["see elephants", "feeding elephants", "elephant interaction", "feed elephants", "pet elephants", "feeding elephants", "feed elephants", "petting elephant", "food", "meeting elephants"], "difficult_direct_answer": false, "rationales": ["Several elephants can be seen behind the fence, and their trunks are reaching out for food.", "There are elephants at the end of the queue.", "They are waiting in line to see the elephants and give them a treat"], "image": "train2014/COCO_train2014_000000297645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424105, "question_id": "YYhsvezhNYUyFkmvpeBWHD", "question": "Which one of these items might be in one of the bags?", "choices": ["underwear", "chess board", "towel", "pillow"], "correct_choice_idx": 2, "direct_answers": ["tennis racket", "racket", "towel", "ball", "racket", "water bottle", "racket", "ball", "racket", "racket"], "difficult_direct_answer": false, "rationales": ["Athletes often use towels to wipe off sweat.", "This is a sports equipment bag", "Towels can be in the bags."], "image": "train2014/COCO_train2014_000000424105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390658, "question_id": "YYsWRoibarLwppDAXpBJfc", "question": "Why are there blue lights on the trees?", "choices": ["for racing", "for climbing", "for holiday", "to signal"], "correct_choice_idx": 2, "direct_answers": ["holiday decoration", "decorative", "christmas", "christmas lights", "christmas", "for holiday", "for decoration", "christmas decorations", "christmas", "decoration"], "difficult_direct_answer": false, "rationales": ["There are blue lights on the trees for decoration in celebration of a holiday.", "The trees are decorated for christmas.", "They are decorated for christmas."], "image": "train2014/COCO_train2014_000000390658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320150, "question_id": "YZ2LQxRE62tEJVeZUSMky9", "question": "By which method could someone theoretically grab stuffed animals here?", "choices": ["claw", "betting number", "pleading", "blowing"], "correct_choice_idx": 0, "direct_answers": ["claw grab", "claw", "claw", "claw", "claw", "clamping", "claw", "claw", "claw", "claw"], "difficult_direct_answer": false, "rationales": ["You can tell by the machine and metal claw at the top on how the prizes are won.", "This game requires the metal claw to get a toy.", "A claw will grab the items."], "image": "val2014/COCO_val2014_000000320150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455881, "question_id": "YZ3FpeKKQNpkLoyRTiYU7m", "question": "What powers this means of transportation?", "choices": ["electricity", "coal", "gas", "food"], "correct_choice_idx": 3, "direct_answers": ["horses", "horses", "horse", "horse", "horse power", "horse", "horse", "food", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The food powers it.", "The horse is pulling the buggy. it needs to eat to live.", "There is a horse and carriage. horses run on food for fuel."], "image": "train2014/COCO_train2014_000000455881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280062, "question_id": "YZENYF9WRf3Y2edg8E52XZ", "question": "What does the person in red provide?", "choices": ["admonishments", "snacks", "ski lessons", "grades"], "correct_choice_idx": 2, "direct_answers": ["ski lessons", "instruction", "skateboard riding", "instruction", "directions", "advice", "instruction", "instructions", "instructions", "lessons"], "difficult_direct_answer": false, "rationales": ["The other options don't match the setting.", "The person is an adult and they know what to do on the slopes.", "The person in red needs to give ski lessons."], "image": "train2014/COCO_train2014_000000280062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60532, "question_id": "YZKAGPiUBdE26ygdcAX8sb", "question": "What does the weather seem to be like here?", "choices": ["cold", "mild", "stormy", "hot"], "correct_choice_idx": 0, "direct_answers": ["cloudy", "sunny", "chilly", "cloudy", "cloudy", "cold", "warm", "sunny", "cold", "nice"], "difficult_direct_answer": false, "rationales": ["The woman is bundled up.", "The weather is chilly since people have jackets.", "The woman has a jacket."], "image": "train2014/COCO_train2014_000000060532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406075, "question_id": "YZLTD5kn3SQbDWfazJKZpQ", "question": "What is the white object on the man's waist used for?", "choices": ["floating", "slimming", "dress code", "protecting backbone"], "correct_choice_idx": 0, "direct_answers": ["prevent drowning", "flotation device", "floatation", "life jacket", "floating", "floating", "life jacket", "balancing", "floatation", "support"], "difficult_direct_answer": false, "rationales": ["The object is used to stay afloat.", "There is water visible in the background of the image and around water many people use flotation devices. the object in question appears to be the right size and shape and being worn in a place where a flotation device would be.", "The object is for floating."], "image": "train2014/COCO_train2014_000000406075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207083, "question_id": "YZT8j9Ea2qAgq6bWzV7Xiq", "question": "What type of phone is being used here?", "choices": ["cellular", "rotary", "landline", "pay"], "correct_choice_idx": 0, "direct_answers": ["cellular", "cell phone", "cell phone", "cell", "cellular", "cellphone", "cellphone", "cell phone", "smart phone", "mobile"], "difficult_direct_answer": false, "rationales": ["The phone is handheld being used outside of the home and has no visible cords. these features are all consistent with answer a.", "The person is making a call from a handheld device.", "The type is a cell phone."], "image": "train2014/COCO_train2014_000000207083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90103, "question_id": "YZXME2ES7oWB32oWKBYVGj", "question": "What type of vehicles are prohibited from going beyond the pole?", "choices": ["taxi", "bikes", "van", "bus"], "correct_choice_idx": 0, "direct_answers": ["taxi", "taxis", "bikes", "bicycles", "bicycles", "taxi", "taxis", "taxi", "bikes", "bike"], "difficult_direct_answer": false, "rationales": ["Cabs are not allowed past the pole.", "The slash through the picture of a cab shows they are not allowed past this point.", "The sign prohibits taxis."], "image": "train2014/COCO_train2014_000000090103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292188, "question_id": "YZfa4chxAhRHq23auhco5S", "question": "Why is the dog in the back?", "choices": ["broken truck", "was bad", "no room", "snuck on"], "correct_choice_idx": 2, "direct_answers": ["riding along", "no room", "working", "free space", "safety", "riding", "chilling", "safer", "riding", "riding"], "difficult_direct_answer": false, "rationales": ["There is ample space for the dog in the back.", "A man is riding in a jeep and his dog is in the backseat.", "The person knows the dog is there. there was probably no room in the front."], "image": "val2014/COCO_val2014_000000292188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463454, "question_id": "YZj7Kmw3y2dNDXZpYFWLjD", "question": "What is causing the two to take shelter?", "choices": ["wind", "rain", "snow", "tornados"], "correct_choice_idx": 1, "direct_answers": ["rain", "rainstorm", "rainstorm", "rain", "rain", "rainstorm", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["There is water falling from the sky", "The two have an umbrella.", "The people are standing under the umbrella to stay out of the rai."], "image": "val2014/COCO_val2014_000000463454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361258, "question_id": "YZoXwGuZqnXH5NUeNRADcw", "question": "Why are there most likely so many blue canopies?", "choices": ["same company/event", "misunderstanding", "shortage", "law"], "correct_choice_idx": 0, "direct_answers": ["beach", "vacationers", "beach resort", "hot sun", "rentals", "same company/event", "festival", "resort canopies", "leisure", "peace"], "difficult_direct_answer": true, "rationales": ["Multiple rows of exactly the same tents are set up in the sand on the beach with different umbrellas in the distance.", "The people are part of the same group.", "The coloring is uniform for a company."], "image": "val2014/COCO_val2014_000000361258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578237, "question_id": "YZt5jTLSj5jUpNukoiWoxG", "question": "What is contained inside the brown barrels?", "choices": ["soy sauce", "water", "beer", "wine"], "correct_choice_idx": 3, "direct_answers": ["wine", "nothing", "wine", "wine", "wine", "wine", "wine", "wine", "nothing", "wine"], "difficult_direct_answer": false, "rationales": ["This drink is often kept in brown barrels on racks in an indoor environment, and two of the people in the room are drinking a grape-color liquid out of long stemmed drinking glasses.", "The man has a wine glass.", "This liquid ages in wooden barrels."], "image": "val2014/COCO_val2014_000000578237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560598, "question_id": "YaSbAA3CYQxJA5dLpHJREH", "question": "What is closest to the woman?", "choices": ["box", "plate", "cat", "refrigerator"], "correct_choice_idx": 1, "direct_answers": ["plate", "plate", "plate", "plate", "plate", "cake", "plate", "cake", "eating", "plate"], "difficult_direct_answer": false, "rationales": ["None of the other objects are in this photo.", "The plate is very close to the woman and is almost touching her arm", "The thing near the woman is round and sitting on the table."], "image": "val2014/COCO_val2014_000000560598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199669, "question_id": "YaXySsgx3zhZmQ4F4GiE4V", "question": "What time in the morning does this building open to the public?", "choices": ["nine", "ten", "11", "eight"], "correct_choice_idx": 3, "direct_answers": ["nine", "na", "9am", "early", "early", "nine", "8am", "eight", "early", "8 am"], "difficult_direct_answer": false, "rationales": ["The citrus heights water district is open from 8:00am to 5:30pm.", "The hours are on the door.", "The hours of the building start at 8 in the morning."], "image": "val2014/COCO_val2014_000000199669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50924, "question_id": "Yagyuit4JHPLcnPNjR8fNh", "question": "What is the vegetable the broccoli is being cooked with called?", "choices": ["brussels sprouts", "carrots", "onions", "turnips"], "correct_choice_idx": 2, "direct_answers": ["onion", "onions", "onion", "onions", "onions", "onion", "onion", "onion", "cucose", "garlic"], "difficult_direct_answer": false, "rationales": ["There are several caramelized white pieces which are likely used to flavor the dish.", "There are chopped onions.", "This is likely to flavor the broccoli."], "image": "val2014/COCO_val2014_000000050924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55410, "question_id": "Yapkrz7UpL2J2XsCzAMakc", "question": "What country's cities are listed on the information board?", "choices": ["mexico", "united states", "england", "brazil"], "correct_choice_idx": 1, "direct_answers": ["united states", "united states", "united states", "united states", "united states", "united states", "usa", "usa", "united states", "usa"], "difficult_direct_answer": false, "rationales": ["The cities are in the us.", "Atlanta las vegas and washington dc are in united states.", "The cities listed are those that are located in the us only."], "image": "train2014/COCO_train2014_000000055410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222340, "question_id": "YaqPGkBfsR9gWaYH4HsfM8", "question": "What continent is this road located at?", "choices": ["europe", "africa", "asia", "australia"], "correct_choice_idx": 2, "direct_answers": ["asia", "asia", "asia", "asia", "asia", "iraq", "asia", "iraq", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The bus has filipino writing on it. the philippines are in asia.", "This looks to be in asia by the writing on the bus.", "The text on the bus is in an asian language."], "image": "val2014/COCO_val2014_000000222340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522076, "question_id": "YauSxnFpBoUQwrduqGfLW6", "question": "What does the man standing up have on?", "choices": ["hat", "goggles", "scarf", "scuba gear"], "correct_choice_idx": 0, "direct_answers": ["hat", "hat", "hat", "clothes", "hat", "hat", "hat", "clothes", "hat", "hat"], "difficult_direct_answer": false, "rationales": ["He has a head accessory on for shade.", "There is a man visibly standing up and there is something on his head that has been added and is not natural based on normal human head shapes.", "The man has a hat."], "image": "train2014/COCO_train2014_000000522076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484312, "question_id": "YauniButo7f8WxAFCRhhTt", "question": "Why is he holding the stuffed animal?", "choices": ["hiding it", "for sale", "protecting it", "is lonely"], "correct_choice_idx": 2, "direct_answers": ["prop", "fun", "posing", "posing", "seeking attention", "costume", "photo pose", "protecting it", "picture", "gift"], "difficult_direct_answer": true, "rationales": ["The animal is being held for protection.", "He is holding the stuffed animals to protect it from falling over the edge.", "He's protecting the stuffed animal."], "image": "val2014/COCO_val2014_000000484312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304866, "question_id": "Yaup4YwmsLzfUZsntcU3eq", "question": "What is the man with the red backpack on the right doing?", "choices": ["drawing", "exercising", "taking photo", "playing instrument"], "correct_choice_idx": 2, "direct_answers": ["taking picture", "taking photo", "taking photo", "picture taking", "taking photo", "photographing", "taking pictures", "photographing", "taking picture", "taking pictures"], "difficult_direct_answer": false, "rationales": ["The man is holding a camera visibly and is looking through the lens based on his body position and eye line. someone looking through the lens of a camera is likely to be taking pictures.", "The man is taking a photo.", "The man is taking a picture."], "image": "train2014/COCO_train2014_000000304866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151658, "question_id": "YbXM7tTNzpfEq6JFjemH9L", "question": "What is the string made of?", "choices": ["cotton", "urethane", "linen", "leather"], "correct_choice_idx": 1, "direct_answers": ["wire", "nylon", "nylon", "cord", "boat", "urethane", "hemp", "metal", "nylon", "urethane"], "difficult_direct_answer": false, "rationales": ["It is made of a material so it does not get wet in the water and will last.", "It has to be a thick rope so it doesn't come apart or hurt you.", "The string is urethane."], "image": "train2014/COCO_train2014_000000151658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379434, "question_id": "YbnHzc9dots5CDHp8C2SbE", "question": "For what reason is there clear plastic sheeting over the window?", "choices": ["energy conservation", "uv protection", "damage", "remodeling preparation"], "correct_choice_idx": 0, "direct_answers": ["weather", "stop leaking", "energy conservation", "block sun", "heating", "white", "cover", "protection", "privacy", "cover inside"], "difficult_direct_answer": true, "rationales": ["Energy is being conserved.", "The sheet helps protect the furniture from getting shined on with too much light.", "Energy is conserved."], "image": "train2014/COCO_train2014_000000379434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12943, "question_id": "YbpaB5bKSdA7YeJV6brhLZ", "question": "Which object in the room can create the most noise?", "choices": ["laptop", "chair", "desk", "speakers"], "correct_choice_idx": 3, "direct_answers": ["speakers", "speakers", "speaker", "speakers", "speaker", "computer", "speakers", "speakers", "speakers", "speaker"], "difficult_direct_answer": false, "rationales": ["Sound comes from speakers.", "Electronics are on a desk including a laptop and separate speakers.", "The speakers next to the computer can play music on them the loudest."], "image": "val2014/COCO_val2014_000000012943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281586, "question_id": "YbuSAkJgV6cqBREdV7JUAd", "question": "What is different about this batter from most batters?", "choices": ["gender", "height", "left-handed batter", "age"], "correct_choice_idx": 2, "direct_answers": ["left handed", "good", "lefty", "left handed", "heavyset", "huge body", "left handed", "stance", "left handed", "left-handed batter"], "difficult_direct_answer": false, "rationales": ["By his position on the batters box you can tell he is left handed.", "The batter is left handed.", "He has been in baseball for a longer time than most."], "image": "train2014/COCO_train2014_000000281586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522461, "question_id": "YbvHhFH5SwZYfXcqFQLRQQ", "question": "What are the people sitting in the snow doing?", "choices": ["waiting", "sledding", "gambling", "skiing"], "correct_choice_idx": 0, "direct_answers": ["snowball fight", "waiting", "talking", "taking break", "snowboarding", "talking", "relaxing", "messing around", "sitting", "laughing"], "difficult_direct_answer": true, "rationales": ["These people appear to be prepared to ski but await something before starting.", "They are on snowboards and are looking for the other person to get ready.", "The people are not going anywhere and are taking their time. they are not actively pursuing interests at the time."], "image": "train2014/COCO_train2014_000000522461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30488, "question_id": "Yc3LewDGaD88iefcTnPg6d", "question": "What could potentially impede their vision shortly?", "choices": ["snow storm", "skiers", "goggles", "sun"], "correct_choice_idx": 0, "direct_answers": ["white out", "snow", "snow storm", "snow", "blizzard", "snow", "snow", "snow", "snow", "snowfall"], "difficult_direct_answer": false, "rationales": ["It is snowing and they are about to ski.", "Falling snow can make it hard to see things in front of you.", "It is getting thicker and will block the view"], "image": "train2014/COCO_train2014_000000030488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512156, "question_id": "Yc69k5K7FkH8ZVDLEiYeK2", "question": "The woman next to the camera has what hair the color of what?", "choices": ["lemon", "lime", "apple", "carrot"], "correct_choice_idx": 3, "direct_answers": ["copper", "carrot", "carrots", "carrot", "red", "red", "red", "cooper", "red", "dark orange"], "difficult_direct_answer": false, "rationales": ["The woman's hair is orange.", "The woman nearest the camera has orange hair. orange is also the color of carrots.", "The other options don't match her orange hair."], "image": "train2014/COCO_train2014_000000512156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455325, "question_id": "YcCoUJNMWgM2Y9bDeCeqge", "question": "What are the objects which are placed underneath the church pews?", "choices": ["heaters", "grates", "storage boxes", "lights"], "correct_choice_idx": 0, "direct_answers": ["pamphlets", "baskets", "radiators", "heaters", "speakers", "heater", "heaters", "heaters", "knee rest", "radiators"], "difficult_direct_answer": false, "rationales": ["A heating/cooling device is visible in the lower back pew of this image; grating is placed around it to protect parishioners legs.", "They are there to keep the people warm.", "The objects under the pews are used to heat the seats."], "image": "val2014/COCO_val2014_000000455325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41739, "question_id": "YcTk8zJZefnqMeJSYFKt3X", "question": "What is the likely hazard that is going to happen?", "choices": ["thunderstorm", "earthquake", "car accident", "fire"], "correct_choice_idx": 0, "direct_answers": ["thunderstorm", "thunderstorm", "train accident", "storm", "storm", "storm", "spreading fire", "blackout", "rain storm", "pedestrian crossing"], "difficult_direct_answer": false, "rationales": ["People are walking on a busy street with dark clouds in the sky above.", "The clouds and skies are ripe for a downpour.", "Thunderstorm clouds are rolling in."], "image": "train2014/COCO_train2014_000000041739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156367, "question_id": "YcXR9ezGTZKgnHmpmFv5gn", "question": "What number will show up on the screen next?", "choices": ["twelve", "seven", "four", "one"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "one", "one", "three", "three", "three", "three", "unsure", "three"], "difficult_direct_answer": false, "rationales": ["The number one is going to show up on the screen next.", "Looks like it's a countdown.", "That is the last last number on the countdown."], "image": "train2014/COCO_train2014_000000156367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434262, "question_id": "YcbG2sjRnwohRtNZHZpvyn", "question": "Upon what does the highest statue sit?", "choices": ["column", "bus", "ground", "person"], "correct_choice_idx": 0, "direct_answers": ["high column", "monument", "pillar", "column top", "tower", "column", "tall building", "pedestal", "memorial", "column"], "difficult_direct_answer": true, "rationales": ["The statue is sat atop the column.", "The statue is on a column.", "The statue is raised on top of a column."], "image": "train2014/COCO_train2014_000000434262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283868, "question_id": "YcfQYGkwJ7qQtj6R8Bybzj", "question": "How is the woman serving the food?", "choices": ["dicing", "scooping", "slicing", "pouring"], "correct_choice_idx": 2, "direct_answers": ["hand", "spoon", "content", "one-handed", "knife", "plate", "hands", "knife", "slicing", "cutting"], "difficult_direct_answer": true, "rationales": ["The woman is serving the food by slicing because she has a knife in her hand", "The woman is slicing the big cake.", "The woman is portioning a cake that is in front of her with a knife. when one portions a cake in this way they are said to be slicing it."], "image": "train2014/COCO_train2014_000000283868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395614, "question_id": "YcmEoW5BwLJaydt9Kz6C6Z", "question": "What kind of sign is shown?", "choices": ["regulatory", "brand", "warning", "informational"], "correct_choice_idx": 3, "direct_answers": ["train schedule", "arrivals board", "schedule sign", "digital", "transport", "go", "informational", "schedule", "itinerary", "schedule"], "difficult_direct_answer": true, "rationales": ["An informatics sign is above the train.", "This gives route and time data", "The sign is showing arrival and departure times for a particular train route."], "image": "train2014/COCO_train2014_000000395614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384659, "question_id": "Ycp5kKH746XznwfXc2vLD3", "question": "Where might one of the cars be headed?", "choices": ["city", "mountains", "beach", "country"], "correct_choice_idx": 0, "direct_answers": ["toward city", "callan park", "home", "city", "into town", "work", "buildings", "to work", "park", "work"], "difficult_direct_answer": true, "rationales": ["These cars could be headed into the city ahead down the highway.", "The cars are going to the city given the buildings.", "They are facing the city"], "image": "train2014/COCO_train2014_000000384659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297877, "question_id": "Ycq2vCNoEF3NRTEwZih3Pp", "question": "What baseball position is the stature commemorating?", "choices": ["outfielder", "catcher", "pitcher", "umpire"], "correct_choice_idx": 1, "direct_answers": ["catcher", "batter up", "catcher", "catcher", "hitter", "catcher", "catcher", "catcher", "catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["They are crouched down with protective gear", "The statue is squatting.", "The statue is wearing the pads traditionally worn by a baseball catcher as well as in the position one would assume as a catcher."], "image": "train2014/COCO_train2014_000000297877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66397, "question_id": "Yd3J7tBEr6hbHeDZ5DK57G", "question": "Why are they skiing on flat ground?", "choices": ["are lost", "are adventurous", "cross country", "are confused"], "correct_choice_idx": 2, "direct_answers": ["cross country", "cross-country", "cross-country", "cross-country", "cross country", "practice", "cross country", "cross country", "cross country", "cross country"], "difficult_direct_answer": false, "rationales": ["Cross country skiing is when you ski on flat ground.", "This is cross country skiing. this takes place on flat ground.", "This is a specific type of skiing."], "image": "val2014/COCO_val2014_000000066397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107691, "question_id": "Yd5nLHcehv7LCErf48ZeKB", "question": "What setting is this picture taken in?", "choices": ["cafeteria", "computer lab", "classroom", "laboratory"], "correct_choice_idx": 1, "direct_answers": ["classroom", "computer lab", "classroom", "classroom", "library", "college classroom", "library", "library", "library", "classroom"], "difficult_direct_answer": false, "rationales": ["A classroom with lap tops in it.", "There are computers at all the seats.", "This is the most likely answer. that said, it could be a d with a focus on computers."], "image": "train2014/COCO_train2014_000000107691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441736, "question_id": "Yd7dd3tCRoQeaWLi3EFdhc", "question": "Which person is in the greatest danger?", "choices": ["left man", "right man", "farthest man", "rear man"], "correct_choice_idx": 0, "direct_answers": ["older man", "man jacket", "street man", "man", "left person", "older man", "man", "left man", "older man", "left"], "difficult_direct_answer": false, "rationales": ["The left man might be hit by a car.", "People are walking along a sidewalk next to a busy city street. the person closest to the traffic is most likely to get hit.", "The man in the tan jacket could be hit by the car."], "image": "val2014/COCO_val2014_000000441736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552895, "question_id": "YdJP4dqq2KdzEJTQmJ5spf", "question": "What type of area is the woman waiting in?", "choices": ["lobby", "hotel", "subway", "bus stop"], "correct_choice_idx": 2, "direct_answers": ["train station", "station", "transit stop", "subway", "train station", "baggage claim", "train station", "train station", "station", "subway"], "difficult_direct_answer": false, "rationales": ["The woman is at a subway track.", "The woman is waiting in a subway area.", "The area is a subway."], "image": "train2014/COCO_train2014_000000552895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374818, "question_id": "YdgSbhJuZEErPtsS2qNnYH", "question": "How might passengers walk from the plane itself to the terminal?", "choices": ["taxi", "stairs", "gangway", "truck"], "correct_choice_idx": 2, "direct_answers": ["walkway", "gangway", "zero", "walkway", "gate", "tarmac", "boarding tunnel", "tunnel", "duct", "ramp"], "difficult_direct_answer": true, "rationales": ["They might use a walkway.", "There is a covered platform connected to the airport that is in contact with the plane and based on the position, it is over the plane door. this object would be used to connect airport to plane and connect over the door allowing people to walk to the terminal inside.", "They take a tunnel that is like a tube from the building to the plane"], "image": "train2014/COCO_train2014_000000374818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434381, "question_id": "YdgYkE2NrVYcP8JXVfJnXH", "question": "What is the land like in front of the plane?", "choices": ["mountainous", "flat", "volcanic", "hilly"], "correct_choice_idx": 1, "direct_answers": ["concrete", "flat", "flat", "nice", "flat", "flat", "levelled", "flat", "levelled", "flat"], "difficult_direct_answer": false, "rationales": ["There is a driveway like in front of the plane.", "The back has some tall land, but not very tall.", "The land is flat."], "image": "train2014/COCO_train2014_000000434381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201045, "question_id": "YeBeKRWDaiw3YuFaJSBdV3", "question": "What do these young people pretend to do?", "choices": ["strum guitar", "play soccer", "play tennis", "bike"], "correct_choice_idx": 1, "direct_answers": ["play soccer", "play sport", "play soccer", "play soccer", "play sports", "play football", "play soccer", "play soccer", "play", "play soccer"], "difficult_direct_answer": false, "rationales": ["There are many athletes on the screen so it must be a team sport.", "You can tell by the field and setting as to what game they are playing.", "The people play soccer."], "image": "val2014/COCO_val2014_000000201045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491597, "question_id": "YeVrB7ggiofmHrMGtqVZKS", "question": "What is the man doing on the ground?", "choices": ["resting", "making call", "eating", "fishing"], "correct_choice_idx": 1, "direct_answers": ["talking", "on cellphone", "sitting down", "relaxing", "sitting", "talking phone", "talking phone", "calling", "making call", "talking"], "difficult_direct_answer": false, "rationales": ["He's holding a phone up against his ear, and you talk to people on the phone.", "The man is calling.", "The man on the ground is using a cell phone to talk to someone."], "image": "train2014/COCO_train2014_000000491597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106216, "question_id": "YeYHpYpW5XrMkr8R7fjz9D", "question": "How many tourists are in this carrier?", "choices": ["two", "three", "one", "none"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "zero", "none", "two", "two", "zero", "two", "zero", "two"], "difficult_direct_answer": false, "rationales": ["Tourists sit in the back.", "The tourists would sit in the back of the wagon so there are zero.", "The back of the carrier is empty. there are two people at the front in the driving part of the vehicle."], "image": "val2014/COCO_val2014_000000106216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511734, "question_id": "Yey4aRRPNTRg7caYhjD45c", "question": "How many patriotic motorcycles are there in the image?", "choices": ["three", "two", "four", "five"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "one", "one", "one", "two", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["Two large flags are on the back of vehicles on a street.", "There are a pair of motorcycles carrying american flags on the back.", "Two vehicles are pictured."], "image": "train2014/COCO_train2014_000000511734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433441, "question_id": "YezLMFNR2ZJE5Ri4BvTg5Y", "question": "What are the women doing over the large containers on the ground?", "choices": ["washing", "cleaning", "lighting fire", "cooking"], "correct_choice_idx": 3, "direct_answers": ["cooking food", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking", "cooking", "stirring"], "difficult_direct_answer": false, "rationales": ["Woman are standing over large pots with spoons in their hands and the pots are steaming.", "They are making food over the fire.", "The women are stirring the food that is in the large pots placed over the logs on the fire."], "image": "train2014/COCO_train2014_000000433441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364341, "question_id": "Yf5Wm5ykTjwiXSsgBPLs8Q", "question": "What should be put in the container behind the baseball bat?", "choices": ["sand", "equipment", "gun", "trash"], "correct_choice_idx": 3, "direct_answers": ["trash", "trash", "trash", "trash", "baseball", "trash", "trash", "garbage", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["The container in the background is a garbage bin.", "The container has trash.", "Due to the shape of the container and it's square deposit space at the top we can conclude it is for garbage."], "image": "val2014/COCO_val2014_000000364341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546569, "question_id": "YfBvnPYw8MNcEn5fCuNbg8", "question": "What item is lit up inside the green stand?", "choices": ["laptop", "cellphone", "pager", "television"], "correct_choice_idx": 3, "direct_answers": ["television", "television", "tv", "television", "tv", "television", "television", "television", "television", "tv"], "difficult_direct_answer": false, "rationales": ["The cabinet houses a tv.", "The tv screen is on.", "The object is clearly visible as a television because of the shape and size and the screen. it is located in the green stand."], "image": "val2014/COCO_val2014_000000546569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139211, "question_id": "YfDHsJEfph6P2as7LzVLVv", "question": "What country is associated with the second treat mentioned?", "choices": ["sweden", "france", "ireland", "austria"], "correct_choice_idx": 1, "direct_answers": ["france", "france", "united states", "united states", "france", "france", "france", "france", "france", "france"], "difficult_direct_answer": false, "rationales": ["Crepes are popular in france.", "The country is france.", "Crepes are a french food."], "image": "train2014/COCO_train2014_000000139211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561635, "question_id": "YfQDL87CiRtovgTCbR3J4Z", "question": "What type of device does the remote to the right of the cell phone operate?", "choices": ["record player", "stereo", "clock radio", "dvd player"], "correct_choice_idx": 2, "direct_answers": ["media center", "radio", "clock radio", "remote control", "sound system", "radio system", "telescope", "cd player", "dvd player", "dvd player"], "difficult_direct_answer": true, "rationales": ["You can tell by the buttons of the remote as to what it controls.", "There are buttons for controlling and seeing the music.", "The remote has a clock radio."], "image": "train2014/COCO_train2014_000000561635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504617, "question_id": "YfQbcafSFk99gRBisjLTZf", "question": "Why is he squinting?", "choices": ["it's cloudy", "it's dusty", "it's bright", "it's dark"], "correct_choice_idx": 3, "direct_answers": ["lights", "to see", "bright light", "it's dark", "bright light", "to see", "staring", "bright lights", "concentrating", "look cool"], "difficult_direct_answer": false, "rationales": ["It's too dark to see.", "The lights are on inside. the motorcycle is not in motion.", "He's trying to see something"], "image": "train2014/COCO_train2014_000000504617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429807, "question_id": "YfQsDWAkgaxTWsTgKSv6ui", "question": "The cylindrical object on the floor is there for what purpose?", "choices": ["cleaning", "air freshening", "fire prevention", "painting"], "correct_choice_idx": 2, "direct_answers": ["fire extinguisher", "fire extinguisher", "fire extinguisher", "fire prevention", "fire extinguisher", "extinguish fires", "fire extinguisher", "extinguishing fire", "fight fire", "extinguish fires"], "difficult_direct_answer": false, "rationales": ["A red object with a nozzle is on the floor near a stove.", "There is a fire extinguisher on the floor.", "The red item is an extinguisher that could be used if something were to start burning."], "image": "val2014/COCO_val2014_000000429807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302787, "question_id": "YfS4uZCzPTAAkKLubDaZkZ", "question": "Which beverage company spent the most to advertise near here?", "choices": ["budweiser", "coke", "gallo wines", "pepsi"], "correct_choice_idx": 1, "direct_answers": ["coca cola", "coca cola", "coca cola", "coca cola", "coca cola", "coca-cola", "coca cola", "coca cola", "coke", "coca cola"], "difficult_direct_answer": false, "rationales": ["There is a large soft drink sign at the top of the billboard.", "The company is coke.", "The coke company is sent here to advertise near the city square."], "image": "val2014/COCO_val2014_000000302787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193579, "question_id": "YfmQSLDkNdBs3A2j9Wg8q3", "question": "In which city are these pedestrians walking?", "choices": ["oklahoma", "mexico", "gotham", "san francisco"], "correct_choice_idx": 3, "direct_answers": ["folsom", "san francisco", "european", "folsom", "folsom", "san francisco", "los angeles", "folsom", "folsom", "san francisco"], "difficult_direct_answer": false, "rationales": ["The city is sf.", "Folsom is in san francisco.", "Folsom street is located in san francisco."], "image": "train2014/COCO_train2014_000000193579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62336, "question_id": "YfrNq5DhrLR5E8NQKSp4AN", "question": "Who manufactures the game that the stuffed animal is inspired by?", "choices": ["sega", "atari", "microsoft", "nintendo"], "correct_choice_idx": 3, "direct_answers": ["hasbro", "nintendo", "nintendo", "people", "unknown", "nintendo", "men", "unknown", "nintendo", "nintendo"], "difficult_direct_answer": false, "rationales": ["This is from super mario.", "The stuffed animal is toad from super mario bros. 2.", "The animal is from nintendo."], "image": "train2014/COCO_train2014_000000062336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126020, "question_id": "Yfv4iuU9iZGok9NAFKHWBS", "question": "What type of business is the truck for?", "choices": ["refrigerated items", "food truck", "delivering packages", "gas delivery"], "correct_choice_idx": 2, "direct_answers": ["package delivery", "delivery", "package delivery", "deliveries", "delivering packages", "delivering packages", "delivery", "delivery", "shipping", "fedex deliveries"], "difficult_direct_answer": false, "rationales": ["The brand is famous for delivering packages.", "This is a fedex truck", "The truck is used for the delivery of packages."], "image": "train2014/COCO_train2014_000000126020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82259, "question_id": "YgLTWbALP9dYfespNvRKee", "question": "What will the person in the air do next?", "choices": ["split", "land", "walk", "flip"], "correct_choice_idx": 1, "direct_answers": ["land", "land", "land", "land", "land", "land", "land", "land", "land", "land"], "difficult_direct_answer": false, "rationales": ["They are too close to the ground to do any tricks or stunts.", "The person will land.", "From air the next process is landing."], "image": "val2014/COCO_val2014_000000082259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462847, "question_id": "YgtzbjQgS4Ns6QSurSQ6TK", "question": "What do you call women this age?", "choices": ["seniors", "middle-aged", "toddlers", "teenagers"], "correct_choice_idx": 0, "direct_answers": ["middle", "grand mother", "seniors", "seniors", "elderly", "grandmothers", "senior", "old", "elderly", "older"], "difficult_direct_answer": false, "rationales": ["The woman has graying hair which means she's a senior citizen.", "They have grey hair and stand hunched over a bit.", "I know what the different ages are."], "image": "train2014/COCO_train2014_000000462847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491131, "question_id": "YhRJaxDevnE2gGrRuACE88", "question": "This bench is located where?", "choices": ["ocean front", "city sidewalk", "mall", "park"], "correct_choice_idx": 1, "direct_answers": ["sidewalk", "sidewalk", "on sidewalk", "sidewalk", "city street", "city sidewalk", "sidewalk", "city", "on sidewalk", "city sidewalk"], "difficult_direct_answer": false, "rationales": ["People walk along the street. benches are placed along streets for people.", "This particularly well-maintained bench is on an equally spotless city sidewalk. made popular in paris in the 1850s, park benches are a part of most major cities to this day.", "The bench is clearly outside and appears to be in an urban public setting based on the walkway surface and the building facade behind."], "image": "val2014/COCO_val2014_000000491131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514191, "question_id": "YhTvMB8mjcTy5XT6tehr4z", "question": "What season is this?", "choices": ["summer", "autumn", "winter", "spring"], "correct_choice_idx": 1, "direct_answers": ["fall", "fall", "fall", "autumn", "autumn", "autumn", "autumn", "fall", "fall", "fall"], "difficult_direct_answer": false, "rationales": ["The leaves are orange.", "When trees start to hunker down for winter they withdraw water from the leaves which then turn brown and fall from the branches.", "The leaves are orange."], "image": "train2014/COCO_train2014_000000514191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243728, "question_id": "YhX45sGBKLRW8w6J4ZGt8r", "question": "Where is he practicing his sport?", "choices": ["skate park", "gymnasium", "backyard", "driveway"], "correct_choice_idx": 0, "direct_answers": ["park", "skate park", "skate park", "skate park", "skate park", "skate park", "skate park", "skate park", "skating park", "skate park"], "difficult_direct_answer": false, "rationales": ["He's at a skate park.", "The concrete structures show the skater being at a skateboard park.", "This is a park designed to be skated on."], "image": "train2014/COCO_train2014_000000243728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577464, "question_id": "Yha3EA4o6wRvV2xr5Nmns8", "question": "What does the red and white flag represent?", "choices": ["france", "red cross", "thailand", "mechanic"], "correct_choice_idx": 1, "direct_answers": ["switzerland", "switzerland", "red cross", "hospital", "hospital", "switzerland", "hospital", "switzerland", "swiss", "switzerland"], "difficult_direct_answer": false, "rationales": ["Red cross uses a flag with a cross in the middle.", "The flag has a red cross on it.", "Represents a country and looks like a t."], "image": "train2014/COCO_train2014_000000577464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250808, "question_id": "Yhhcq6M4jmgPSf4MQaaZHb", "question": "Which fast food restaurant is near the lights?", "choices": ["mcdonald's", "burger king", "popeye's", "wendy's"], "correct_choice_idx": 0, "direct_answers": ["mcdonalds", "burger king", "mcdonalds", "mcdonalds", "mcdonald's", "mcdonalds", "mcdonald's", "mcdonald's", "mcdonalds", "mcdonalds"], "difficult_direct_answer": false, "rationales": ["Mcdonald's logo contains the golden arches that are shown in the sign logo near the lights.", "The restaurant is mcdonald's.", "Mcdonald's logo is visible."], "image": "train2014/COCO_train2014_000000250808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30549, "question_id": "YhnDEe6Z4W7ZD5gsqSXXS3", "question": "Do identical twins have 100% the same DNA?", "choices": ["somewhat", "true", "false", "maybe"], "correct_choice_idx": 1, "direct_answers": ["yes", "yes", "yes", "no", "no", "true", "yes", "yes", "yes", "no"], "difficult_direct_answer": false, "rationales": ["Yes, because they are one fertilized egg which has split onto 2.", "They do have the same dna at birth.", "Identical twins have the same dna."], "image": "val2014/COCO_val2014_000000030549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524325, "question_id": "YiL9wqBYUN5SrEHjJUuciL", "question": "The items the people are staring at are likely made of what?", "choices": ["mud", "cloth", "brick", "stone"], "correct_choice_idx": 1, "direct_answers": ["nylon", "cloth", "park", "fabric", "material", "fabric", "plastic", "clorhh", "material", "plastic"], "difficult_direct_answer": false, "rationales": ["The other options are too heave and would not work for a kite.", "The items are cloth.", "The flags are made of fabric."], "image": "train2014/COCO_train2014_000000524325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423037, "question_id": "YiRhe898YUjxCePb8oFJ3E", "question": "What type of body of water is located adjacent to the railway tracks?", "choices": ["canal", "pond", "marsh", "lake"], "correct_choice_idx": 0, "direct_answers": ["river", "canal", "river", "river", "river", "river", "river", "river", "river", "river"], "difficult_direct_answer": false, "rationales": ["The river by the tracks is very long.", "A think expanse of water runs parallel with train tracks into the distance.", "This body of water is usually very straight and narrow"], "image": "train2014/COCO_train2014_000000423037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393224, "question_id": "YiVG54YsqbdzM2FnFDp9Wv", "question": "What facial expression is the man wearing glasses showing?", "choices": ["smile", "grin", "grimace", "frown"], "correct_choice_idx": 3, "direct_answers": ["frustration", "sad", "disappointed", "sad", "disgust", "sad", "pouty", "frown", "scowl", "defeated"], "difficult_direct_answer": false, "rationales": ["The man is grimacing in a frown gesture.", "He does not look happy at all and is not smiling.", "A man is scowling."], "image": "train2014/COCO_train2014_000000393224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146865, "question_id": "YiwaPXBGZD8u6uaEcS5MpB", "question": "What type of park is this?", "choices": ["swim", "national", "golf", "skateboard"], "correct_choice_idx": 3, "direct_answers": ["skatepark", "skate park", "skateboard", "skateboard", "skateboard", "skateboard park", "skate", "skateboard", "skate", "skate park"], "difficult_direct_answer": false, "rationales": ["This is a park for skateboarders to ride their boards.", "This is a small skateboard park.", "The guy is riding a skateboard."], "image": "val2014/COCO_val2014_000000146865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91948, "question_id": "YiyviPC9zscNrXzSqcUAZh", "question": "If he has zero points what is it called?", "choices": ["nothing", "like", "zero", "love"], "correct_choice_idx": 3, "direct_answers": ["neutral", "love", "love love", "love", "love", "love", "love", "love", "love", "love"], "difficult_direct_answer": false, "rationales": ["They are playing tennis. a score of zero in tennis is called love.", "Love equates to zero points.", "In tennis, this is used instead of zero."], "image": "train2014/COCO_train2014_000000091948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422008, "question_id": "Yj28j8SjwiGeAsetdAqM6e", "question": "What type of ingredients can we obtain from these fruits?", "choices": ["vitamins", "none", "proteins", "carbohydrates"], "correct_choice_idx": 0, "direct_answers": ["pie", "apples", "applesauce jelly", "apple", "fruits", "fruit", "apples", "apples", "vitamins", "applesauce"], "difficult_direct_answer": false, "rationales": ["Fruits have lots of nutrition in them that we need.", "Fruits are known and valued for their vitamin content.", "Apples are nutritious."], "image": "train2014/COCO_train2014_000000422008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139008, "question_id": "Yjxtt2Lrg5wNKp4JWWJ6VV", "question": "Who owns these belongings?", "choices": ["woman", "boy", "man", "baby"], "correct_choice_idx": 0, "direct_answers": ["girl", "woman", "woman", "woman", "girl", "young lady", "woman", "female", "young girl", "woman"], "difficult_direct_answer": false, "rationales": ["A scarf, purse, diary and other femine products are spilled out.", "Some females carry purses with a bunch of stuff in it.", "The pink colors, multitude of personal grooming items and purse laid out in this scene suggest femininity."], "image": "train2014/COCO_train2014_000000139008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407259, "question_id": "Yk3gLnvyUG5KUTkdrPvQ84", "question": "What are they doing with the knife?", "choices": ["placing pan", "cutting pie", "cleaning plates", "showing off"], "correct_choice_idx": 1, "direct_answers": ["cutting", "cutting", "cutting", "cutting cake", "cutting cake", "cutting", "cutting", "cutting pie", "cutting cake", "cutting cake"], "difficult_direct_answer": false, "rationales": ["They are getting a slice ready to eat", "These people are cutting a pie with a knife.", "There is a pie on the table and they're slicing into it with the knife."], "image": "val2014/COCO_val2014_000000407259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124462, "question_id": "YkBSkT7J8vyVYhSoCL8B8S", "question": "What brand of car is this?", "choices": ["subaru", "honda", "kia", "toyota"], "correct_choice_idx": 0, "direct_answers": ["acura", "chevrolet", "subaru", "subaru", "subaru", "subaru", "kia", "subaru", "saturn", "chrysler"], "difficult_direct_answer": false, "rationales": ["The assortment of grey four sided stars on the blue oval marking this car's grille identifies it as a subaru.", "That's the make of the car according to its emblem.", "You can see the little logo with stars on the front of the car."], "image": "val2014/COCO_val2014_000000124462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92066, "question_id": "YkmcWt9vVjb83etwwvyeDr", "question": "What flying objects are above the crosswalk?", "choices": ["spaceships", "helicopters", "airplanes", "kites"], "correct_choice_idx": 2, "direct_answers": ["like", "like", "kites", "kites", "kites", "kites", "airplanes", "kites", "kites", "kites"], "difficult_direct_answer": false, "rationales": ["These items are connected to strings held by people.", "By the look it shows wat it is on the sky.", "There are many kites on strings."], "image": "train2014/COCO_train2014_000000092066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257617, "question_id": "YmJbFW6hAesVHLqqxvYUXq", "question": "What is the object plugged into the outlet called?", "choices": ["listening device", "air freshener", "motion sensor", "nightlight"], "correct_choice_idx": 1, "direct_answers": ["air freshener", "air freshener", "air freshener", "night light", "microwave", "air freshener", "air fresher", "socket", "air freshener", "scent diffuser"], "difficult_direct_answer": false, "rationales": ["An air freshener is plugged in.", "It keeps the air smelling good.", "There is an air freshener plugged into the outlet."], "image": "train2014/COCO_train2014_000000257617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374005, "question_id": "YmLzdBFkckt5Wc9UDwLjaW", "question": "The board is used for which sport?", "choices": ["kiting", "skating", "sliding", "surfing"], "correct_choice_idx": 1, "direct_answers": ["snowboarding", "snowboarding", "snowboarding", "snowboard", "snowboarding", "snowboard", "skating", "snowboarding", "skiing", "skateboarding"], "difficult_direct_answer": false, "rationales": ["This looks to be a board that would be used in snowboarding.", "There is a snowboard near the table, which looks like a skateboard.", "The board is for skating."], "image": "train2014/COCO_train2014_000000374005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542359, "question_id": "YmPcbFMiYfyMVyFEEzGnss", "question": "What does the switch between the sink and the dishwasher in the kitchen operate?", "choices": ["garbage disposal", "fan", "cabinet lights", "overhead lights"], "correct_choice_idx": 0, "direct_answers": ["garbage disposal", "lights", "garbage disposal", "garbage disposal", "lights", "light", "garbage disposal", "garbage disposal", "garbage disposal", "garbage disposal"], "difficult_direct_answer": false, "rationales": ["There is often a system set up to break larger things down a sinks drain to be broken up toggled via a small switch near the sink.", "Any switches found near a sink don't control a light, but rather a device in the sink used to grind up biological waste.", "The switch is the disposal."], "image": "train2014/COCO_train2014_000000542359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350467, "question_id": "YmbFXFyefYVKfpirGNjpuU", "question": "In what city did this sport debut at the Olympics?", "choices": ["atlanta", "seoul", "barcelona", "london"], "correct_choice_idx": 0, "direct_answers": ["tokyo", "atlanta", "atlanta", "athens", "florida", "atlanta", "tokyo", "tokyo", "melbourne", "atlanta"], "difficult_direct_answer": false, "rationales": ["The sport is beach volleyball which was first played in the olympics in atlanta.", "They wer ein atlanta.", "This sport has debut in the olympics at atlanta."], "image": "val2014/COCO_val2014_000000350467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101355, "question_id": "YmsST8pPihUK5CDvsadfmK", "question": "Why is the man pouring liquid into the container?", "choices": ["to clean", "to blend", "to fuel", "to cool"], "correct_choice_idx": 1, "direct_answers": ["to blend", "for consumption", "water", "mixing drinks", "cocktail", "balancing", "get drunk", "drink", "make smoothie", "cocktails"], "difficult_direct_answer": true, "rationales": ["He is making a certain alcoholic drink in the blender.", "There is a machine that has a blade that mixes varieties of solids and liquids.", "This man is preparing a cocktail with this blender. pouring into the blender would be unnecessary unless the man was planning to blend it."], "image": "train2014/COCO_train2014_000000101355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41437, "question_id": "YmxihYDcoJrHDkDpiZsnne", "question": "What type of location is being visited?", "choices": ["forest", "swamp", "ocean", "field"], "correct_choice_idx": 3, "direct_answers": ["park", "grassy field", "field", "field", "park", "park", "field", "park", "field", "field"], "difficult_direct_answer": false, "rationales": ["It is a large open grassy area.", "The location is a field where kites are flown by visitors.", "A field with grass is shown."], "image": "train2014/COCO_train2014_000000041437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143696, "question_id": "YnNRYeLWx6ZAMzxwyA9U7U", "question": "What are the boats parked along?", "choices": ["poles", "curb", "dock", "chargers"], "correct_choice_idx": 1, "direct_answers": ["lake side", "pier", "docks", "fence", "curb", "shoreline", "dock", "coastline", "docks", "riverside"], "difficult_direct_answer": true, "rationales": ["The only objects on the waterside that makes sense is a dock and though you can't see it it would make sense that a dock would be there.", "These boats are not in operation and are 'parked'. a dock is the term for where boats are parked or 'docked'.", "The boats are by the curb."], "image": "val2014/COCO_val2014_000000143696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374106, "question_id": "YnRWWb2SjTkwJutMAFmBiC", "question": "What is the center counter top usually referred to as?", "choices": ["island", "bridge", "plaque", "link"], "correct_choice_idx": 0, "direct_answers": ["island", "island", "island", "bar", "bar", "island", "island", "island", "island", "island"], "difficult_direct_answer": false, "rationales": ["In kitchen parlance, the counter space occupying the center of a kitchen is known as an island, which is what this counter area definitely is.", "The center counter is referred to as the island counter.", "The center is an island."], "image": "train2014/COCO_train2014_000000374106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543719, "question_id": "YnWZ5Y76nz6j3BhQaqPEh2", "question": "The cat is cozying up to what animal?", "choices": ["pig", "cow", "dog", "goat"], "correct_choice_idx": 2, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["There is a dog sleeping to the right of the cat.", "The cat is by a dog.", "Next to this cat we see the the telltale ears and paws of a canine though it's tail and snout are obscured."], "image": "val2014/COCO_val2014_000000543719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402674, "question_id": "YnZrZSnEpw2NWLJyR5Kohm", "question": "This person is playing a similar sport to whom?", "choices": ["lennox lewis", "serena williams", "jordan spieth", "bucky dent"], "correct_choice_idx": 1, "direct_answers": ["tennis", "serena williams", "anna kournakova", "rodger federer", "tennis players", "serena williams", "williams", "lender", "roger federer", "anna kourakova"], "difficult_direct_answer": true, "rationales": ["The man is holding a racket on a tennis court.", "The person is playing tennis. serena williams is a tennis player.", "The person is williams."], "image": "val2014/COCO_val2014_000000402674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327526, "question_id": "YnnJddWUWAEB862dxgzcL9", "question": "Why is the horse handler posing?", "choices": ["look cool", "photographer", "calm horse", "impress spectators"], "correct_choice_idx": 1, "direct_answers": ["to queen", "impress queen", "for photo", "queen", "showing guests", "photographer", "video", "picture", "picture taken", "showing horse"], "difficult_direct_answer": true, "rationales": ["He is there for the photographer and people", "There is a person aiming a camera at the horse and horse handler with the intention of taking a picture.", "The horse handler is posing for the man holding the camera that is pointed at him."], "image": "train2014/COCO_train2014_000000327526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137748, "question_id": "YoHdcPFWYeCJBY2FWn3Nkr", "question": "How many cars are moving?", "choices": ["four", "three", "one", "two"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one car.", "Only one car is on the road driving.", "There is one car in motion."], "image": "train2014/COCO_train2014_000000137748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175863, "question_id": "YoKjP68mg5LnWFRXRSriEa", "question": "What does this person hold in their left hand?", "choices": ["dagger", "nothing", "gun", "pizza"], "correct_choice_idx": 1, "direct_answers": ["nothing", "gun", "nothing", "phone", "pocket", "nothing", "nothing", "phone", "pocket", "nothing"], "difficult_direct_answer": false, "rationales": ["Though you may have to look close but he isn't holding anything.", "The person has nothing.", "Not having to carry anything makes it easy and more relaxing."], "image": "val2014/COCO_val2014_000000175863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408680, "question_id": "YoRn6ccTVMzcPhSP4n5PBQ", "question": "Why is the man not standing?", "choices": ["dead", "resting", "fell asleep", "fell down"], "correct_choice_idx": 3, "direct_answers": ["falling", "falling", "fell", "he fell", "fell down", "fell down", "fell down", "fell down", "lost balance", "on board"], "difficult_direct_answer": false, "rationales": ["The man fell in the snow.", "The man fell while skiing.", "He will fell down on standing of the skateboard."], "image": "train2014/COCO_train2014_000000408680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240782, "question_id": "YoYxQK5x2aZCiCmoHQ3xNh", "question": "Someone who eats a lot of these can be said to have what kind of tooth?", "choices": ["salty", "sweet", "snaggle", "sour"], "correct_choice_idx": 1, "direct_answers": ["sweet tooth", "sweet tooth", "sweet", "sweet", "sweet", "sweet tooth", "glass", "sweet", "sweet", "sweet"], "difficult_direct_answer": false, "rationales": ["The plate has some sweet food.", "That's someone that likes a lot of sweets and eats or craves them often.", "Someone who eats a lot of these could be eating sweets."], "image": "train2014/COCO_train2014_000000240782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139456, "question_id": "YofnRKNxRSBS7cbAJaoayp", "question": "What kind of animal is on the top shelf?", "choices": ["mouse", "ant", "bear", "cat"], "correct_choice_idx": 2, "direct_answers": ["teddy bear", "teddy bear", "bear", "panda", "decorative bear", "bear", "bear", "bears", "bears", "bear"], "difficult_direct_answer": false, "rationales": ["Bears are on the top shelf.", "The animal is a bear.", "There are figurines on the top shelf that look like a family of teddy bears."], "image": "train2014/COCO_train2014_000000139456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256155, "question_id": "YoyCz2aZ89NZkB5b2q6Xom", "question": "Which of the following is an area code for this location?", "choices": ["546", "310", "700", "650"], "correct_choice_idx": 1, "direct_answers": ["310", "310/424", "california", "818", "310", "three onezero", "95862", "310", "310", "no idea"], "difficult_direct_answer": false, "rationales": ["That is one of the area codes for venice beach, ca.", "The text refers to venice beach, california. area codes 546 and 700 are unused or reserved, and area code 650 is for san francisco.", "Area codes for venice beach are 310 and 424."], "image": "train2014/COCO_train2014_000000256155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452834, "question_id": "YpCo2hdqTnUBrQubvybRdx", "question": "How many more animals need to be added to all of these to get the number ten?", "choices": ["two", "one", "five", "three"], "correct_choice_idx": 2, "direct_answers": ["five", "five", "four", "five", "four", "five", "five", "5 more", "five", "duck"], "difficult_direct_answer": false, "rationales": ["Five more are needed.", "It's a simple arithmetic problem. there are five animals seen, we need to add to that to reach a total of ten. that's all the info we need.", "They are halfway there to the number ten."], "image": "train2014/COCO_train2014_000000452834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148146, "question_id": "YpHhpVzskHfDU7AWmwnHX3", "question": "What type dressing does this man favor?", "choices": ["ranch", "green goddess", "thousand island", "french"], "correct_choice_idx": 0, "direct_answers": ["sporty", "ranch", "casual", "buffalo", "casual", "ranch", "ranch", "ranch", "ranch", "ranch dressing"], "difficult_direct_answer": false, "rationales": ["You can tell by the wings he is eating as to what type of dipping sauce he prefers.", "The dressing is ranch.", "There is a bottle of white dressing near the cup."], "image": "train2014/COCO_train2014_000000148146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119543, "question_id": "YpUWzrL4sZnkT9uffMGXP3", "question": "Which item represents the grain food group?", "choices": ["brown", "yellow", "orange", "red"], "correct_choice_idx": 1, "direct_answers": ["mac--cheese", "nothing", "macaroni", "yellow", "macaroni", "pasta", "noodles", "macaroni", "mac's cheese", "macaroni"], "difficult_direct_answer": false, "rationales": ["The macaroni is the yellow colored food. macaroni is a type of pasta, usually made of a grain called wheat.", "Yellow items are grains.", "The yellow macaroni and cheese dish is yellow because of a cheese sauce applied to pasta. pasta is derived from grain."], "image": "train2014/COCO_train2014_000000119543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484840, "question_id": "Ypimdj4kSF2GC8oqzDek9n", "question": "What are the horned animals on the road?", "choices": ["goats", "rams", "cows", "bison"], "correct_choice_idx": 0, "direct_answers": ["goats", "goats", "goats", "goats", "goats", "goats", "goats", "goats", "goats", "goats"], "difficult_direct_answer": false, "rationales": ["A herd of goats, some blurry due to their fast movement, are most likely making their way to greener pastures.", "The animals are fuzzy goats.", "These are goats that are on the road."], "image": "train2014/COCO_train2014_000000484840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543534, "question_id": "YpjmbjTGu4MJYCqcB4FwoA", "question": "Which one of these items were added to the pizza without precooking?", "choices": ["corn", "tomato", "meat", "sauce"], "correct_choice_idx": 1, "direct_answers": ["tomatoes", "corn", "corn", "cheese", "tomato", "chicken", "corn", "corn", "basil", "tomatoes"], "difficult_direct_answer": false, "rationales": ["The corn and meat would still be uncooked if only put in the oven for a few minutes. the sauce was cooked from tomatoes before adding as well.", "A tomato can be sliced or diced and added to the pizza with no other preparation.", "Tomatoes were added."], "image": "train2014/COCO_train2014_000000543534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419789, "question_id": "YpyywxxLhVJRmpYd4vYaRY", "question": "The ski lift saves the skiers from a lot of what physical activity?", "choices": ["rollerskating", "swimming", "walking", "skipping"], "correct_choice_idx": 2, "direct_answers": ["climbing", "walking uphill", "climbing", "walking", "walking", "walking", "walking", "walking uphill", "climbing", "climbing"], "difficult_direct_answer": false, "rationales": ["The lift prevents walking.", "The chair lift is faster than walking up the hill.", "They use this to take them up the mountain so they do not have to walk which would take a long time."], "image": "val2014/COCO_val2014_000000419789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420490, "question_id": "Yq4YFsDCZ7ieH49z5hioko", "question": "In which one of these months do people like to eat this fruit?", "choices": ["december", "july", "october", "april"], "correct_choice_idx": 1, "direct_answers": ["summer", "july", "august", "july", "july", "july", "for healthy", "summer", "july", "summer"], "difficult_direct_answer": false, "rationales": ["The people are eating water melon in july because it is the time of year where we pick it", "During the summer months people eat watermelon to cool down.", "The sliced watermelon present in this kitchen suggests the summertime."], "image": "val2014/COCO_val2014_000000420490.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555298, "question_id": "YqXH9AXzzpUKSMwSwf7nSP", "question": "What is in the capped bottle on the left side of the desk?", "choices": ["gum", "vitamins", "breath mints", "prescription pills"], "correct_choice_idx": 3, "direct_answers": ["medication", "medication", "prescription pills", "pills", "medicine", "medicine", "medicine", "medicine", "prescription bottle", "medication"], "difficult_direct_answer": false, "rationales": ["There is an orange bottle of prescription pills on the side of the desk.", "The bottle is the kind you get when you get your medicine from a pharmacist.", "The bottle has pills."], "image": "val2014/COCO_val2014_000000555298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167027, "question_id": "YqZ26Bi5VG8xuBewwns9xw", "question": "What is the capital city of this country?", "choices": ["taipei", "bangkok", "tokyo", "manila"], "correct_choice_idx": 1, "direct_answers": ["bangkok", "bangkok", "bangkok", "bangkok", "bangkok", "bangkok", "bangkok", "bangkok", "bangkok", "bangkok"], "difficult_direct_answer": false, "rationales": ["You can tell by the countries name on the sign as to where this is.", "The sign says \"thailand plaza\". bangkok is in thailand.", "The capital is bangkok."], "image": "train2014/COCO_train2014_000000167027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407348, "question_id": "Yqm9mqNgJFvc7Q9MZwd28e", "question": "What super hero logo design is on the woman's watch?", "choices": ["loki", "superman", "black widow", "batman"], "correct_choice_idx": 3, "direct_answers": ["batman", "batman", "batman", "batman", "batman", "batman", "batman", "batman", "batman", "batman"], "difficult_direct_answer": false, "rationales": ["The woman has a batman logo on the watch.", "The logo on the watch is that of batman, the black bat with the yellow back", "The batman logo is yellow and black."], "image": "train2014/COCO_train2014_000000407348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389542, "question_id": "YqtjqgdfZRSPaixkPdVymn", "question": "What is horse house called?", "choices": ["crown", "hind", "shuttle", "stable"], "correct_choice_idx": 3, "direct_answers": ["stable", "mare hotel", "mare hotel", "stable", "stable", "stable", "stable", "stable", "mare hotel", "stable"], "difficult_direct_answer": false, "rationales": ["Traditionally horses are kept in stables or pens.", "The horses are kept in stables.", "The area is a stable."], "image": "train2014/COCO_train2014_000000389542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529632, "question_id": "Yqx3Nnrj7oALJBN7mVjSU5", "question": "What kind of light is shown?", "choices": ["lamp", "street", "traffic", "strobe"], "correct_choice_idx": 2, "direct_answers": ["green", "green light", "pedestrian", "pedestrian stop", "no walking", "stop", "passenger", "stop", "traffic", "don't walk"], "difficult_direct_answer": true, "rationales": ["A tr-colored street light at the corner of a street is meant to direct traffic.", "A pedestrian stoplight is shown.", "The light indicates whether or not it is safe for rhe pedestrian to cross through the traffic."], "image": "train2014/COCO_train2014_000000529632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369997, "question_id": "YqxYYduGJXgJsCWJSkJe8s", "question": "The man riding the motorcycle is involved in what type of public service?", "choices": ["police", "medical", "fire safety", "military"], "correct_choice_idx": 3, "direct_answers": ["military", "military", "safety", "military", "cleanup", "military", "military", "military", "military", "volunteering"], "difficult_direct_answer": false, "rationales": ["The man is in the military since he's wearing cameo.", "The uniform seems to indicate this fact.", "The men in these helmets are members of the military."], "image": "val2014/COCO_val2014_000000369997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243442, "question_id": "YqzftaNwQ9zA5GB6PykKmD", "question": "What temperature are the items inside this case?", "choices": ["cool", "room", "100 degrees", "hot"], "correct_choice_idx": 0, "direct_answers": ["cool", "forty degrees", "forty degrees", "cold", "cold", "forty degrees", "cold", "forty degrees", "cold", "cold"], "difficult_direct_answer": false, "rationales": ["The items inside this case are cool because the case is a refrigerator.", "A refrigerator keeps things cool so they won't spoil.", "A man is grabbing food from a portable fridge."], "image": "val2014/COCO_val2014_000000243442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25374, "question_id": "Yri7aAwGrxDEmtHZYPMh9Q", "question": "Why is the man riding being the black animals?", "choices": ["to kill", "to herd", "to hunt", "to race"], "correct_choice_idx": 1, "direct_answers": ["corralling them", "watching", "herding animals", "to herd", "to maintain", "jacking", "herding cattle", "to corral", "herding", "heard cows"], "difficult_direct_answer": true, "rationales": ["The man is herding the cows.", "The man is herding.", "The man is herding a bunch of cows down the side of a road."], "image": "train2014/COCO_train2014_000000025374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335076, "question_id": "YryCg22RcHMFVptrRGbzVC", "question": "Where are these people located?", "choices": ["shore", "forest", "beach", "field"], "correct_choice_idx": 3, "direct_answers": ["park", "park", "park", "park", "field", "outside", "park", "park", "park", "outside"], "difficult_direct_answer": false, "rationales": ["You can tell by the background as to where they are in the picture.", "Large grassy areas are known as fields, and they are in a large grassy area.", "There is dirt and grass behind them."], "image": "train2014/COCO_train2014_000000335076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217312, "question_id": "Ys94j7RhkDD6D2ZJhZSqEh", "question": "How will the people here be getting home?", "choices": ["taxi", "flying", "uber", "train"], "correct_choice_idx": 3, "direct_answers": ["train", "train", "train", "train", "train", "train", "airplane", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["They are standing waiting to board transportation by rail.", "They are in a railroad station", "They are waiting at the station for the doors to open on the method of transportation that they are using to get home."], "image": "val2014/COCO_val2014_000000217312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66166, "question_id": "YsNk6qqez2dHkXTWEqx7vZ", "question": "In which country is this food vendor operating in?", "choices": ["vietnam", "japan", "thailand", "china"], "correct_choice_idx": 2, "direct_answers": ["thailand", "thailand", "vietnam", "unknown", "thailand", "thailand", "philippines", "china", "vietnam", "vietnam"], "difficult_direct_answer": false, "rationales": ["The fruit selection present at this establishment and the text present place it in thailand.", "The country is thailand.", "This is from their country."], "image": "train2014/COCO_train2014_000000066166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459058, "question_id": "YsPAgWESya7qMcXjtUGVCM", "question": "The owner of this motorcycle likely identifies as what ethnicity?", "choices": ["aboriginal", "navajo", "jewish", "african"], "correct_choice_idx": 2, "direct_answers": ["jewish", "nam", "indian", "jewish", "no idea", "middle eastern", "jewish", "asian", "indian", "black"], "difficult_direct_answer": false, "rationales": ["The writing is hebrew.", "The words on the license are in hebrew.", "The owner is jewish."], "image": "train2014/COCO_train2014_000000459058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39456, "question_id": "YsnyBR2NFfSAajZWgUgmnr", "question": "What is the name of this dessert?", "choices": ["rugalach", "crepe", "blintz", "croissant"], "correct_choice_idx": 1, "direct_answers": ["crepe", "crepes", "crepe", "crepe", "crepe suzette", "crepe", "crepe", "crepe", "crepe", "breakfast"], "difficult_direct_answer": false, "rationales": ["This is a french dessert.", "The elegant, thin covering for the filling is a french creation.", "It's a very thin pancake."], "image": "train2014/COCO_train2014_000000039456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289236, "question_id": "YtLWiZpaBDeQCbDzVejyBp", "question": "Which brand makes similar products to what the girl has on her feet?", "choices": ["timberland", "prada", "skechers", "gucci"], "correct_choice_idx": 2, "direct_answers": ["nike", "skechers", "adidas", "nike", "adidas", "nike", "nike", "nike", "nike", "adidas"], "difficult_direct_answer": false, "rationales": ["The brand is skechers.", "Skechers produces shoes.", "She is wearing shoes, not boots. they are not designer shoes."], "image": "train2014/COCO_train2014_000000289236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413852, "question_id": "YtPiBHyoPUqdjkZqskChJH", "question": "What type of sport is this?", "choices": ["winter", "tropical", "summer", "aquatic"], "correct_choice_idx": 0, "direct_answers": ["skiing", "skiing", "skiing", "winter", "skiing", "skiing", "skiing", "skiing", "snowboarding", "skiing"], "difficult_direct_answer": false, "rationales": ["The ground is covered in snow.", "The sport is occurring on a snowy mountain which is consistent with answer a and none of the answers.", "There is snow and they are going down a mountain"], "image": "val2014/COCO_val2014_000000413852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148977, "question_id": "YtgnjBDRQbL29mudQeHFVA", "question": "The owner of the teddy bear spends his or her time in what type of online game?", "choices": ["mmorpg", "rts", "puzzle", "fps"], "correct_choice_idx": 0, "direct_answers": ["warcraft", "world warcraft", "warcraft", "mmorpg", "warcraft", "xbox", "warcraft", "warcraft", "battle bears", "warcraft"], "difficult_direct_answer": false, "rationales": ["The bear has a warcraft book on the side of it.", "The owner likes mmorpg.", "Judging by the book on the desk, which is about world of warcraft, the player spends time playing a game that is defined as a massively multiplayer online role-playing game, otherwise known as an mmorpg."], "image": "val2014/COCO_val2014_000000148977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46608, "question_id": "YthxdgrfsZrHteXSHm2SSa", "question": "What profession do the people on top of the planes belong to?", "choices": ["acrobats", "teachers", "lion tamers", "pilots"], "correct_choice_idx": 0, "direct_answers": ["pilot", "stuntman", "aerial performers", "aerial acrobatics", "acrobat", "stunt devils", "daredevils", "acrobats", "performers", "acrobats"], "difficult_direct_answer": true, "rationales": ["There are people doing tricks on top of the planes.", "The people on top of the airplanes do gymnastics.", "The profession is acrobatics."], "image": "train2014/COCO_train2014_000000046608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228340, "question_id": "Ytq7cUoDrZTTcB78TbxcMV", "question": "Who was probably responsible for building the tallest structure?", "choices": ["church", "criminals", "doctors", "farmers"], "correct_choice_idx": 0, "direct_answers": ["church", "construction company", "crane", "steel", "construction workers", "lutheran", "architect", "architects", "church", "church"], "difficult_direct_answer": false, "rationales": ["The church has a cross on it.", "There is a religious cross on that structure.", "The tallest structure is a steeple."], "image": "train2014/COCO_train2014_000000228340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489066, "question_id": "YtsdtEMDf7uJcqeiFbfz4s", "question": "Why does the man wear different socks?", "choices": ["fashion", "camouflage", "visibility", "dress code"], "correct_choice_idx": 0, "direct_answers": ["style", "personality", "doesn't care", "entertainment value", "fun", "be different", "fashion", "fashion", "fashion", "attention"], "difficult_direct_answer": false, "rationales": ["Skateboarders have a goofy style.", "The man wants his socks to be fashionable.", "He is engaged in a free time activity in an environment that doesn't require any specific type of attire."], "image": "train2014/COCO_train2014_000000489066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143346, "question_id": "YtzjXFzEbSNnGhrvsofJPx", "question": "What is the product of each individual number on the back of the jersey?", "choices": ["125", "55", "ten", "25"], "correct_choice_idx": 3, "direct_answers": ["cloth", "ten", "ten", "camera", "25", "25", "player number", "fifty five", "ten", "25"], "difficult_direct_answer": false, "rationales": ["Five times five is twenty five", "Product equals multiplication. five multiplied by five gives this answer.", "Five multiplied by five is 25."], "image": "val2014/COCO_val2014_000000143346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422328, "question_id": "Yu2nJhAzsjsNXSABeuFiZs", "question": "The animal has how many legs?", "choices": ["four", "eight", "six", "two"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["The animal has 4 legs.", "Horses have four legs.", "The horse is standing on four legs in the snow."], "image": "val2014/COCO_val2014_000000422328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143811, "question_id": "Yu3n3dt35kHyGDhUEuXVRD", "question": "What will the woman do with the ball in her left hand?", "choices": ["throw upwards", "sell it", "drop it", "pocket it"], "correct_choice_idx": 0, "direct_answers": ["hit", "hit it", "hit it", "serve", "throw up", "throw upwards", "toss it", "serve ball", "toss", "serve"], "difficult_direct_answer": false, "rationales": ["The woman is about to serve the ball by tossing it up.", "She will toss it in the air so she can swing at it and hit it to her opponent.", "The woman will throw."], "image": "train2014/COCO_train2014_000000143811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498721, "question_id": "YucF8gi7TM3Y934CUqQwTg", "question": "What is in the background?", "choices": ["pizza pie", "ape", "bird", "large building"], "correct_choice_idx": 3, "direct_answers": ["skyscraper", "traffic light", "skyscraper", "skyscraper", "sky", "large building", "traffic light", "tall building", "high-rise building", "buildings"], "difficult_direct_answer": false, "rationales": ["The skyscraper in the background extends from below the frame to past the top of the frame of this picture.", "A skyscraper stands in the background.", "The building is in the background."], "image": "train2014/COCO_train2014_000000498721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139749, "question_id": "Yung2nUoPKhU5TkrXD7Fp8", "question": "What objects are hanging from the ceiling?", "choices": ["umbrella", "lamps", "fans", "bowls"], "correct_choice_idx": 0, "direct_answers": ["umbrellas", "parasols", "umbrellas", "parasols", "umbrellas", "umbrellas", "umbrella", "umbrellas", "lights", "umbrellas"], "difficult_direct_answer": false, "rationales": ["There a a bunch of decorative umbrellas hanging from the ceiling.", "Umbrellas are wide at the top and have extending poles underneath, which is what these shapes are equal to.", "They are there for decoration."], "image": "val2014/COCO_val2014_000000139749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364993, "question_id": "YuzrNUCBymbi2Mf9FMgsYR", "question": "What would you eat if you ate everything in the person's hand?", "choices": ["kiwi", "paper", "frosting", "metal"], "correct_choice_idx": 1, "direct_answers": ["sandwich", "yes", "sandwich", "sub", "sandwich", "sandwich", "paper", "yes", "sandwich", "sandwich"], "difficult_direct_answer": false, "rationales": ["The stuff is paper.", "The white paper wrapping would be consumed just as the sandwich.", "There is wax paper on the sandwich."], "image": "val2014/COCO_val2014_000000364993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306258, "question_id": "Yv5bAk7VZay98KDiBujCQk", "question": "What is the most likely food item wrapped in sandwich wrapping?", "choices": ["hot dog", "sub sandwich", "sushi roll", "meatball sandwich"], "correct_choice_idx": 1, "direct_answers": ["sandwich", "sub", "sandwich", "sandwich", "sub sandwich", "sandwich", "bread", "sandwich", "sub", "sandwich"], "difficult_direct_answer": false, "rationales": ["Sandwiches are sold by jimmy john's.", "Usually sub sandwiches are wrapped like this.", "The item is a sub."], "image": "train2014/COCO_train2014_000000306258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434381, "question_id": "Yv7jR85LFvvmkPxM5hBgjV", "question": "Who is the person wearing a blue shirt?", "choices": ["visitor", "worker", "passenger", "policeman"], "correct_choice_idx": 1, "direct_answers": ["signifies role", "worker", "traffic controller", "worker", "airport crew", "airport staff", "employee", "airport employee", "traffic controller", "worker"], "difficult_direct_answer": false, "rationales": ["The person in a blue shirt appears to be a worker. workers would be the only people allowed at a busy airport.", "A person is walking near a plane parked at a terminal at an airport. people are not allowed to walk near an aircraft unless they are an employee of the airline or airport.", "The person wearing a blue shirt next to the plane is a worker because it is the color of the uniforms"], "image": "train2014/COCO_train2014_000000434381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212574, "question_id": "YvETCpYqeSCRbFvmLpkdqp", "question": "Why are the three men dressed alike?", "choices": ["wearing uniforms", "wearing costumes", "punishment", "for fun"], "correct_choice_idx": 0, "direct_answers": ["officers", "police uniforms", "all policemen", "all policemen", "police officers", "cops", "wearing uniforms", "police officers", "police", "uniforms"], "difficult_direct_answer": false, "rationales": ["These are all police officers that are talking about something.", "The men are wearing uniforms that imply that they work together on the same job.", "Police are gathered together and wear all same thing."], "image": "train2014/COCO_train2014_000000212574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49065, "question_id": "YvFbsdsepxmoouuNGX8BuV", "question": "What skateboard wheels are best for street?", "choices": ["88a-100a", "70a-75a", "90a-100a", "88a-95a"], "correct_choice_idx": 3, "direct_answers": ["street wheels", "rubber", "rubber", "soft", "large", "rubber", "88a-95a", "hard", "fiberglass", "solid wheels"], "difficult_direct_answer": false, "rationales": ["According to wise google this is the answer.", "88a to 95a grip well enough to make them work well enought for rough surfaces and street skating.", "The skateboard wheels that are best and most efficient for street use are 88a95a."], "image": "train2014/COCO_train2014_000000049065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174386, "question_id": "YvcCieJA9F6bNLvVeDEs8Q", "question": "What kind of environment is the fruit hanging in?", "choices": ["outdoor", "underwater", "outer space", "indoor"], "correct_choice_idx": 3, "direct_answers": ["factory", "market", "market", "bizarre", "storage", "warm", "store", "kitchen", "store", "indoor"], "difficult_direct_answer": false, "rationales": ["The room looks like a kitchen.", "The environment is indoors.", "The area is covered. there is furniture around."], "image": "val2014/COCO_val2014_000000174386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572586, "question_id": "Yve2aZMWXfyTJkh4NH3zGE", "question": "What word is on the ski at the bottom?", "choices": ["blue", "skate", "green", "blizzard"], "correct_choice_idx": 3, "direct_answers": ["blizzard", "letter n", "blizzard", "blizzard", "blizzard", "blizzard", "blizzard", "letter n", "blizzard", "blizzard"], "difficult_direct_answer": false, "rationales": ["There is the word \"blizzard\" on the bottom of this ski.", "The word on the board is a name of a storm.", "The word says \"blizzard\" on that ski."], "image": "train2014/COCO_train2014_000000572586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551189, "question_id": "YwiSoPhNQzdgxhE58ut5fB", "question": "During which season is the train traveling over the bridge?", "choices": ["spring", "winter", "summer", "fall"], "correct_choice_idx": 3, "direct_answers": ["summer", "summer", "summer", "fall", "summer", "fall", "early fall", "summer", "spring", "fall"], "difficult_direct_answer": false, "rationales": ["The leaves are orange and yellow.", "The leaves are starting to change color so it would be autumn.", "The leaves are turning colors"], "image": "train2014/COCO_train2014_000000551189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298353, "question_id": "YwqaWyTWbntqBP5GWYBtwK", "question": "What has been dusted onto the food?", "choices": ["dirt", "spices", "sand", "snow"], "correct_choice_idx": 1, "direct_answers": ["pepper", "pepper", "black pepper", "pepper", "pepper", "pepper", "pepper", "spices", "pepper", "spices"], "difficult_direct_answer": false, "rationales": ["Vegetables are speckled in dark colors.", "There is salt and pepper all over the food.", "You can tell by the color on top of the fresh food as to what was put on top of it."], "image": "train2014/COCO_train2014_000000298353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312050, "question_id": "Ywxb4K2YP2pp8oSKGkZ4aA", "question": "Why are the people standing behind the black netting?", "choices": ["watching game", "to wrestle", "to socialize", "to compete"], "correct_choice_idx": 0, "direct_answers": ["spectating", "watching game", "watching game", "watching game", "watching baseball", "to watch", "safety", "watching game", "watch sport", "spectators"], "difficult_direct_answer": false, "rationales": ["Being behind net at a baseball game is for protection for the spectators", "The people are observing a baseball game based on the people in the foreground. the people behind the net are in casual clothes so they are not participants, but merely spectators.", "They are watching the game but that's a safety net so they don't get hit with a ball."], "image": "train2014/COCO_train2014_000000312050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100090, "question_id": "Yx9HNMwqhu5ufa5gdj4vGR", "question": "Why is the leaf on top?", "choices": ["preservative", "color", "seasoning", "garnish"], "correct_choice_idx": 3, "direct_answers": ["lettuce", "garnish", "lettuce", "presentation", "flavor", "broccoli", "garnish", "rabbit food", "topping", "garnish"], "difficult_direct_answer": false, "rationales": ["It could also act as a b.", "Herbs are usually garnishes when added to the top of a dish.", "The leaves help \"seep down\" so as to add flavor to the pasta, and then you can remove it when it has left its traces of flavor."], "image": "val2014/COCO_val2014_000000100090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445075, "question_id": "YxAZu4EcDvtS2GNoLKWPCA", "question": "Why is the traffic stopped?", "choices": ["train crossing", "flooding", "accident", "construction"], "correct_choice_idx": 0, "direct_answers": ["red light", "train crossing", "railroad crossing", "train crossing", "train crossing", "train", "train", "train crossing", "train", "train crossing"], "difficult_direct_answer": false, "rationales": ["Traffic is stopped for a train crossing.", "A car is stopped and a train is moving in front of it. there is a barrier and lights as well.", "The train is crossing."], "image": "train2014/COCO_train2014_000000445075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125244, "question_id": "YxkC5MuMSxVwfnMR9JaSVm", "question": "Why is the train off the tracks?", "choices": ["for repairs", "to display", "to sell", "to purchase"], "correct_choice_idx": 0, "direct_answers": ["broken", "decommissioned", "display", "decommissioned", "not serviceable", "inoperative", "for repairs", "not working", "retired", "decommissioned"], "difficult_direct_answer": false, "rationales": ["The train is being fixed.", "The train is getting fixed.", "It probably has a problem that needs to be fixed before it can run again."], "image": "train2014/COCO_train2014_000000125244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241764, "question_id": "Yxt7odPn5AKPcZtSVcC3Bj", "question": "What's the number on the bottom of the train that the man is stepping in?", "choices": ["779-16", "776-18", "779-15 or", "777-19"], "correct_choice_idx": 1, "direct_answers": ["776", "776-18", "nine", "nine", "nine", "nine", "nine", "776", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["This is the only number printed along the bottom visible part of the train.", "They are black numbers", "The exact numbers are shown, though somewhat blurry, where the guard rails are."], "image": "train2014/COCO_train2014_000000241764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475037, "question_id": "YxxRFYrVnFGnB8ZUDiWVWu", "question": "The ball used in badminton is made up of what?", "choices": ["wool", "cotton", "stone", "wood"], "correct_choice_idx": 0, "direct_answers": ["rubber", "feathers", "rubber felt", "plastic", "rubber", "net", "wool", "cork", "plastic", "rubber"], "difficult_direct_answer": false, "rationales": ["Badminton balls have wool.", "Tennis balls have a wool cover on them and a rubber interior.", "This is tennis, but the ball has a wool covering."], "image": "train2014/COCO_train2014_000000475037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337687, "question_id": "YxyfWAGvycd6fkdyT78fgs", "question": "What state is this sponsor's head office located?", "choices": ["delaware", "georgia", "south dakota", "california"], "correct_choice_idx": 1, "direct_answers": ["california", "georgia", "georgia", "georgia", "california", "georgia", "california", "georgia", "georgia", "georgia"], "difficult_direct_answer": false, "rationales": ["Coke is in atlanta.", "The state is georgia.", "The coca-cola company is famously headquartered in atlanta georgia."], "image": "train2014/COCO_train2014_000000337687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137211, "question_id": "YxzVwM7Fj6n5bRyYWTuPZ3", "question": "What are tarps made of?", "choices": ["metal links", "rubber", "nylon", "cloth/plastic"], "correct_choice_idx": 3, "direct_answers": ["cloth", "plastic", "canvas", "fabric", "unknown", "cloth", "cloth/plastic", "canvas", "truck", "canvas"], "difficult_direct_answer": false, "rationales": ["The tarps are made of plastic.", "They are usually made of plastic to keep the rain out.", "They are made of waterproof fabric."], "image": "val2014/COCO_val2014_000000137211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143769, "question_id": "YyAcM5zvX9BxCuDN5fk7i5", "question": "What type of horse is this?", "choices": ["arabian", "shetland pony", "clydesdale", "mustang"], "correct_choice_idx": 1, "direct_answers": ["shetland pony", "clydesdale", "pony", "clydesdale", "pony", "donkey", "shetland pony", "pony", "shetland pony", "miniature"], "difficult_direct_answer": false, "rationales": ["This is a type of pony.", "Smaller type of horse.", "A shetland pony is short and stocky with lots of mane and tail."], "image": "val2014/COCO_val2014_000000143769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465295, "question_id": "YyCX9vvGjuALvaDCbrwAPq", "question": "Who manufactured this television?", "choices": ["philips", "sony", "hitachi", "sharp"], "correct_choice_idx": 0, "direct_answers": ["philips", "philips", "philips", "philips", "philips", "philips", "philips", "phillips", "phillips", "philips"], "difficult_direct_answer": false, "rationales": ["The brand name is on the front of the television.", "The philips logo is on the tv.", "Philips made it."], "image": "val2014/COCO_val2014_000000465295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269862, "question_id": "YyYi6L54VFLGqn6sB3qETy", "question": "The toppings are primarily from what food group?", "choices": ["meat", "fruit", "vegetable", "grain"], "correct_choice_idx": 2, "direct_answers": ["vegetables", "vegetables", "fruit", "vegteble", "vegetables", "vegetable", "vegetable", "vegetable", "vegetables", "vegetable"], "difficult_direct_answer": false, "rationales": ["There is corn, carrots and peas.", "On this pizza we can see carrots, olives, peppers and possibly peas.", "Carrots and other vegetables can be seen laying atop the item."], "image": "train2014/COCO_train2014_000000269862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122350, "question_id": "YyuU3br2weGbT2JarRrHRY", "question": "What type of food is in the tinfoil?", "choices": ["pita", "tortilla", "laffa", "pie"], "correct_choice_idx": 2, "direct_answers": ["gyro", "fast", "gyro", "wrap", "gyro", "burrito", "laffa", "pita fries", "gyro", "gyro"], "difficult_direct_answer": false, "rationales": ["The pita like bread with meat inside is common with this iraqi cuisine.", "This is a type of pita, which means b would also apply.", "The food is called a pita."], "image": "train2014/COCO_train2014_000000122350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439299, "question_id": "Yz2GKtkcTyDqmz8ofP3M2X", "question": "What has been added to this surfboard?", "choices": ["wheels", "seat", "umbrella", "pole"], "correct_choice_idx": 1, "direct_answers": ["seat", "chair", "chair", "seat", "chair", "chair", "seat", "chair", "chair", "chair"], "difficult_direct_answer": false, "rationales": ["Surfboards usually never have seating on them since you have to use your feet to balance on them.", "A chair is on the surfboard.", "There is a tall red seat on the surfboard."], "image": "train2014/COCO_train2014_000000439299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454162, "question_id": "Yz6VKNUsSkevtx4h3d6jmk", "question": "What might you wear in this sort of weather?", "choices": ["gloves", "shorts", "sandals", "t shirt"], "correct_choice_idx": 0, "direct_answers": ["jacket", "jacket", "jacket", "trench coat", "ski mask", "winter coat", "coat", "gloves", "snow pants", "warm clothes"], "difficult_direct_answer": false, "rationales": ["Gloves are needed.", "The weather clearly tells us that it is winter time; the other 3 options are worn in the summer.", "One might wear gloves in this snow storm."], "image": "train2014/COCO_train2014_000000454162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266451, "question_id": "YzFFC9ws5caZQ5gUqUevZh", "question": "How many green tree in picture?", "choices": ["one", "two", "six", "zero"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is only one tree in this street that still has all its leaves.", "One tree is visible.", "There is only one that is green."], "image": "val2014/COCO_val2014_000000266451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326237, "question_id": "YzFf3W4nhRMQrsVn8UkGCK", "question": "What type of hat is this woman wearing?", "choices": ["police officer", "fedora", "baseball", "chef"], "correct_choice_idx": 0, "direct_answers": ["police", "police officer", "police cap", "police", "police hat", "police", "police hat", "police hat", "police", "police"], "difficult_direct_answer": false, "rationales": ["The officer's hat is present.", "The hat shows a police officer badge.", "The woman is wearing a police hat."], "image": "train2014/COCO_train2014_000000326237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336910, "question_id": "YzTaJVMBtLhfFZnyXQRBqF", "question": "What trade allowed for the surface they are standing on to be inserted?", "choices": ["tiling", "carpeting", "flooring", "roofing"], "correct_choice_idx": 0, "direct_answers": ["brick layer", "tiling", "tile layers", "bus stop", "unknown", "tiler", "tiling", "stone cutters", "carpenter", "construction"], "difficult_direct_answer": true, "rationales": ["The people in the image are standing on tiles which would be put in place by someone who is tiling.", "The trade is for tiling.", "There are tiles on the ground."], "image": "train2014/COCO_train2014_000000336910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421902, "question_id": "Z2FcL2mW9439D8SYRaaVmF", "question": "What type of poster is the girl with the soccer t-shirt carrying?", "choices": ["movie", "band", "propaganda", "art"], "correct_choice_idx": 0, "direct_answers": ["movie", "movie", "movie", "tube", "movie", "movie poster", "rolled up", "movie poster", "paper poster", "soccer poster"], "difficult_direct_answer": false, "rationales": ["The girl has a movie poster with credits.", "The poster is for a movie.", "It looks like film credits on the poster."], "image": "val2014/COCO_val2014_000000421902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133486, "question_id": "Z2Lau82hM5MDGge4wT6LCK", "question": "What is this man's form of employment?", "choices": ["teacher", "fireman", "cook", "doctor"], "correct_choice_idx": 2, "direct_answers": ["chef", "cook", "chef", "cooking", "chef", "cook", "cook", "chef", "chef", "chef"], "difficult_direct_answer": false, "rationales": ["The man is in a kitchen.", "The man is cooking.", "The man is a cook and he is in the kitchen preparing a meal."], "image": "train2014/COCO_train2014_000000133486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355657, "question_id": "Z2MWgSNFF5taYkEPBsyH2v", "question": "What is the man in the Cherry picker doing?", "choices": ["having lunch", "driving", "repairing truck", "trimming tree"], "correct_choice_idx": 3, "direct_answers": ["fixing wires", "elator", "repairing", "trimming tree", "fixing cables", "cutting branches", "cutting branches", "picking cherries", "fixing electricity", "trimming tree"], "difficult_direct_answer": false, "rationales": ["There appears to be recently cut down limbs on the trees and the item would be used to reach those items the best.", "The man is trimming.", "The cherry picker is trimming the tree down."], "image": "train2014/COCO_train2014_000000355657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387895, "question_id": "Z2SBg4yNpjSzuv2kGQehKP", "question": "Why does this person have a bag with them?", "choices": ["lunch", "travelling", "shopping", "hiking"], "correct_choice_idx": 1, "direct_answers": ["travelling", "travel", "travelling", "traveling", "traveling", "traveling", "traveling", "traveling", "travelling", "traveling"], "difficult_direct_answer": false, "rationales": ["They are visiting a place where they do not live.", "They are on a vacation or a business trip and have a packed suitcase.", "The presence of a suitcase indicates this person is a tourist."], "image": "train2014/COCO_train2014_000000387895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83471, "question_id": "Z2pvLshzpdamkPExoKU5Ub", "question": "What country is this city located in based on the signs?", "choices": ["united kingdom", "italy", "portugal", "united states"], "correct_choice_idx": 0, "direct_answers": ["britain", "france", "england", "united kingdom", "england", "uk", "england", "england", "westminster", "england"], "difficult_direct_answer": false, "rationales": ["The sign is in the area of westminster.", "The spelling is indicative of this country", "The spelling is in english, but not american english."], "image": "train2014/COCO_train2014_000000083471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433402, "question_id": "Z39mZjHEzsMgvPH7cTvycd", "question": "What type of transportation is this?", "choices": ["road", "air", "rail", "water"], "correct_choice_idx": 0, "direct_answers": ["motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycles", "bicycle", "road", "motorcycles"], "difficult_direct_answer": false, "rationales": ["They are driving on a paved surface.", "The motorbike only pass on the road.", "This is a road highway."], "image": "train2014/COCO_train2014_000000433402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244095, "question_id": "Z3K7CbFadw9rjcpiMBC4Fv", "question": "What muscle do skater jumps Work?", "choices": ["shoulder", "hand", "elbow", "rump"], "correct_choice_idx": 3, "direct_answers": ["calf muscles", "hamstrings", "quads", "thighs", "all", "leg", "abs", "rump", "leg", "legs"], "difficult_direct_answer": true, "rationales": ["A skateboarder is doing an arial trick on a skateboard.", "The buttocks is a major area that gets a good workout when skateboarding.", "Their rumps need to do a lot of lifting work in order to get them into the air."], "image": "train2014/COCO_train2014_000000244095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193863, "question_id": "Z3XaYCNBkxqwqo8Js56jsH", "question": "What is the woman attempting to do with the ball?", "choices": ["throw it", "sell it", "hit it", "catch it"], "correct_choice_idx": 2, "direct_answers": ["hit", "hit", "hit", "hit", "hit it", "hit", "hit", "hit ball", "hit it", "hit"], "difficult_direct_answer": false, "rationales": ["The woman is attempting to hit the ball because she has a racket in her hands and is aiming at the ball", "The woman is attempting to hit the ball with a tennis racket.", "She is swinging the racket."], "image": "val2014/COCO_val2014_000000193863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102550, "question_id": "Z3bPuztRV4i4sTYnfAN6Tr", "question": "What does the hungry girl have on her face?", "choices": ["glasses", "monocle", "mustard", "ketchup"], "correct_choice_idx": 0, "direct_answers": ["glasses", "glasses", "sandwich", "eyeglasses", "glasses", "glasses", "glasses", "sandwich", "sandwich", "glasses"], "difficult_direct_answer": false, "rationales": ["The girl with glasses has her mouth open wide.", "The girl has glasses.", "The woman is eating while she has her eyeglass on."], "image": "train2014/COCO_train2014_000000102550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6725, "question_id": "Z3okzvppKFTvnUGmBQyPzu", "question": "What is near the laptop?", "choices": ["egg", "chair", "apple", "bacon"], "correct_choice_idx": 1, "direct_answers": ["printer", "mouse", "chair", "chair", "printer", "printer", "printer", "printer", "monitor", "keyboard"], "difficult_direct_answer": false, "rationales": ["A large object on which an individual can sit is located near the laptop and desk.", "The chair is near.", "The laptop is visible and identifiable. answer a is visible and near the laptop while none of the other answers are depicted."], "image": "train2014/COCO_train2014_000000006725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464179, "question_id": "Z49qQPKbWsYLHALvvcXMRv", "question": "What are the stanchions meant to control here?", "choices": ["horse", "owner", "crowd", "traffic"], "correct_choice_idx": 2, "direct_answers": ["horse", "people", "people", "visitors", "restrict access", "hair", "horse", "tourists", "crowd", "horse movement"], "difficult_direct_answer": false, "rationales": ["Those wouldn't stop a horse from moving around and they are meant to keep the people at a safe distance.", "It is meant to tell people where to stand including a large group of people.", "To keep people from getting too close"], "image": "train2014/COCO_train2014_000000464179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352361, "question_id": "Z4DXLKUevtn6TEYcdW5aKL", "question": "What is the man about to do?", "choices": ["land", "roll", "skip", "run"], "correct_choice_idx": 0, "direct_answers": ["land", "land", "land ground", "land", "snowboard", "snowboard", "land", "land trick", "land", "scaring"], "difficult_direct_answer": false, "rationales": ["The man is midair so he will have to come down to the ground.", "The man is jumping in the air.", "The man will land."], "image": "val2014/COCO_val2014_000000352361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85144, "question_id": "Z4TYmUdKJT9PwXyPqNSrnC", "question": "What type of sport is this?", "choices": ["tropical", "winter", "aquatic", "summer"], "correct_choice_idx": 1, "direct_answers": ["skiing", "winter", "skating", "snow", "ski", "skiing", "ski", "ski", "snow sking", "skating"], "difficult_direct_answer": false, "rationales": ["Since the ground is completely covered in snow and contains two skiers in full skiing regalia, we can reasonably deduce that this shot was taken in winter.", "This is done in the winter months.", "Skiing happens in the winter."], "image": "val2014/COCO_val2014_000000085144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186729, "question_id": "Z4dgE4i6XfwsLLsz65rZTQ", "question": "Due to the chemicals and nature of the items on the table what protective gear while working with these items?", "choices": ["all protective", "mask", "apron", "gloves"], "correct_choice_idx": 0, "direct_answers": ["gloves", "gloves", "gloves", "gloves", "all protective", "gloves mask", "gloves goggles", "gloves", "goggles", "gloves"], "difficult_direct_answer": false, "rationales": ["The chemicals could get into the facial area and cause problems. you also don't want to get it on your hands or clothing.", "They're protected.", "With doing taxidermy you must have all of these protective items."], "image": "train2014/COCO_train2014_000000186729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137496, "question_id": "Z4dtRTzh4he52HXLCMLHi2", "question": "Where is the attendant to pump the gas?", "choices": ["there's none", "inside plane", "already left", "inside booth"], "correct_choice_idx": 0, "direct_answers": ["there's none", "on plane", "on wing", "plane", "on plane", "airplane", "not visible", "somewhere else", "wing", "on wing"], "difficult_direct_answer": false, "rationales": ["The is no gas to pump into the plane.", "The gas attendant is partly inside of the plane.", "It is self serve according to the sign"], "image": "val2014/COCO_val2014_000000137496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25147, "question_id": "Z4psAtRnxvqPwZvpre977x", "question": "Why is she wearing boots?", "choices": ["protection", "rain", "style", "uniform"], "correct_choice_idx": 2, "direct_answers": ["cold", "fashion", "protect feet", "style", "match outfit", "it's cold", "look fashionable", "fall weather", "style", "fashion"], "difficult_direct_answer": false, "rationales": ["These have a heel so they are not for comfort", "The woman is wearing boots to display style.", "She is wearing these boots for fashion."], "image": "train2014/COCO_train2014_000000025147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60262, "question_id": "Z54xf9oCFnFU84Rfpvt2k3", "question": "What type of coat does this horse have?", "choices": ["appaloosa", "paint", "gray", "buckskin"], "correct_choice_idx": 0, "direct_answers": ["white", "spotted", "spotted", "short", "appaloosa", "white dappled", "white", "spotted", "spotted", "paint"], "difficult_direct_answer": false, "rationales": ["This kind of horse has spots all over it or on parts of it.", "It has spots on it", "The appaloosa has a spotted pattern."], "image": "val2014/COCO_val2014_000000060262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19142, "question_id": "Z5SsneZvEBP7R5rd6XegJ7", "question": "In which country is this building?", "choices": ["chile", "canada", "usa", "mexico"], "correct_choice_idx": 2, "direct_answers": ["united states", "usa", "usa", "america", "usa", "united kingdom", "usa", "usa", "united states", "usa"], "difficult_direct_answer": false, "rationales": ["The american flag is on the building.", "There is an american flag flying. usa has a red and white striped flag with a blue section containing 50 white stars.", "This country's flag is visible."], "image": "val2014/COCO_val2014_000000019142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566700, "question_id": "Z5x6MZbFvZmHxmKiBYZ8H6", "question": "What are the shattered items on the bear?", "choices": ["cassettes", "8-tracks", "cds", "records"], "correct_choice_idx": 2, "direct_answers": ["colors", "records", "cd's", "cd", "cds", "compact discs", "cds", "cds", "cds", "cds"], "difficult_direct_answer": false, "rationales": ["These flat platters are smaller than records.", "The silver and rainbow accented reflectivity of these shattered discs identifies them as cds.", "More and more people get their music from other sources but people like me still purchase these and these are not records or cassettes."], "image": "train2014/COCO_train2014_000000566700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60081, "question_id": "Z6FqGcjKwTcCvt2vX3E3oJ", "question": "How many people are on the ramp?", "choices": ["four", "two", "one", "many"], "correct_choice_idx": 3, "direct_answers": ["nine", "many", "ten", "ten", "ten", "ten", "ten", "nine", "nine", "ten"], "difficult_direct_answer": false, "rationales": ["There are more than several people on the ramp.", "There are a lot of people up there.", "A lot of people are on the ramp."], "image": "train2014/COCO_train2014_000000060081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504306, "question_id": "Z6RugiPZ5sPariDwZjEyLq", "question": "What is the purpose of the black object on the ceiling?", "choices": ["projecting", "light", "heating", "cooling"], "correct_choice_idx": 0, "direct_answers": ["projecting", "projecting", "projector", "projecting", "conditioning", "show slides", "projecting", "projector", "camera", "project image"], "difficult_direct_answer": false, "rationales": ["The object on the ceiling is a project and can be used to show movies or games.", "This is show you can show slides or a video", "It has a lens on the front to show the screen on the wall. most everyone is look in the direction where it is being shown."], "image": "train2014/COCO_train2014_000000504306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282772, "question_id": "Z7P2oGrbu2n948VpKeghBz", "question": "What room is shown in the photo?", "choices": ["bathroom", "bedroom", "kitchen", "closet"], "correct_choice_idx": 1, "direct_answers": ["bedroom", "bedroom", "sun room", "sun room", "bedroom", "bedroom", "bedroom", "bedroom", "bedroom", "bedroom"], "difficult_direct_answer": false, "rationales": ["You can see a bed adorned with pillows and a blanket. you can also see a nightstand with flowers on it. this is a place where people sleep and rest and is known as this room.", "It has a bed and pillows in it", "There is a bed with pillows on it."], "image": "val2014/COCO_val2014_000000282772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213421, "question_id": "Z7YwwJe7iWXsn6Ecztsikf", "question": "What do the clips on the woman's head do for her?", "choices": ["apply makeup", "tie bread", "relive headaches", "hold hair"], "correct_choice_idx": 3, "direct_answers": ["hold hair", "hold hair", "hold hair", "hold hair", "hold hair", "hold hair", "hold hair", "hold hair", "hold hair", "hold hair"], "difficult_direct_answer": false, "rationales": ["The clips on the woman's head keep her hair in place.", "None of the other options can possibly be true. headaches, bread and makeup have absolutely nothing to do with clips on anyone's head.", "They keep hair off her face"], "image": "val2014/COCO_val2014_000000213421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488014, "question_id": "Z7rpLcpCFf4BizpUcDxUA3", "question": "Where does the dog appear to be standing?", "choices": ["grass", "parking lot", "street", "sidewalk"], "correct_choice_idx": 1, "direct_answers": ["by pole", "near pole", "parking lot", "on road", "on asphalt", "parking lot", "street", "outside", "outside", "near bike"], "difficult_direct_answer": false, "rationales": ["The stripes and the concrete stop indicate this is a place where cars can park.", "There seems to be be a bike beside the dog that is park.", "This is indicated by the yellow markings in the background that are typical in an a setting."], "image": "train2014/COCO_train2014_000000488014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45435, "question_id": "Z8BDNcV53EEtmxA9cCEF7K", "question": "What protects some of the papers in the stack from moisture?", "choices": ["clips", "covers", "sleeves", "binders"], "correct_choice_idx": 2, "direct_answers": ["paper protector", "plastic sheets", "plastic cover", "plastic", "plastic cover", "plastic", "sleeves", "protectors", "plastic envelope", "plastic"], "difficult_direct_answer": false, "rationales": ["The papers are protected by large sleeves.", "There are plastic sleeves on several stacks of paper.", "The sleeves are protective."], "image": "val2014/COCO_val2014_000000045435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405505, "question_id": "Z8FuGk7ZoKUzdznQaqwarn", "question": "What style of paint is on this vehicle?", "choices": ["graffiti", "spray", "watercolor", "camo"], "correct_choice_idx": 3, "direct_answers": ["war", "camo", "camo", "camouflage", "camouflage", "camo", "camouflage", "camouflage", "camo", "camo"], "difficult_direct_answer": false, "rationales": ["This pattern helps disguise vehicles in brush.", "The paint looks military grade.", "The vehicle is painted in a camoflage pattern."], "image": "train2014/COCO_train2014_000000405505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185721, "question_id": "Z8R3bhtRYLo5WFj7Go75Dy", "question": "What time of day is shown here?", "choices": ["late night", "9 am", "noon", "5 pm"], "correct_choice_idx": 0, "direct_answers": ["nighttime", "night", "night", "late night", "night", "night", "night", "nighttime", "night", "night"], "difficult_direct_answer": false, "rationales": ["The sky is dark.", "The time of day is late at night.", "It is completely dark at the time of day shown and the area is deserted which eliminates the other options."], "image": "val2014/COCO_val2014_000000185721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293233, "question_id": "Z8RZctVQV3RbqbJEwxTUsL", "question": "Why cricketers wear white?", "choices": ["reduces heat", "peace", "unique", "dress code"], "correct_choice_idx": 0, "direct_answers": ["uniform", "reduces heat", "reflect heat", "uniforms", "custom", "represent team", "summer sport", "heat/sun", "uniform", "home team"], "difficult_direct_answer": true, "rationales": ["The question is unrelated to the image, but is internet searchable.", "White reduces heat.", "The color white does not absorb heat from sunlight."], "image": "train2014/COCO_train2014_000000293233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460010, "question_id": "Z8gJnCvkRVBYRJv6sHScf8", "question": "The cattle shown in the picture belongs to which group of food habitats?", "choices": ["herbivorous", "none", "carnivorous", "omnivorous"], "correct_choice_idx": 0, "direct_answers": ["herbivores", "herbivorous", "sheep shown", "sheep", "herbivore", "pastures", "sheep", "vegetarian", "sheep", "herbivore"], "difficult_direct_answer": false, "rationales": ["They eat hay and stuff.", "They eat grass and greenery.", "There are a couple of herbivorous goats near the wall."], "image": "val2014/COCO_val2014_000000460010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100245, "question_id": "Z8j64THpSiGE2mYsuZYeRq", "question": "Which Olympics games might this region take place?", "choices": ["autumn games", "winter games", "spring games", "summer games"], "correct_choice_idx": 1, "direct_answers": ["winter olympics", "winter", "winter", "skiing", "winter olympics", "winter games", "switzerland", "winter", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["There is snow and people are wearing warm clothing.", "People are skiing. skiing is an event at the winter games.", "The games are in the winter."], "image": "val2014/COCO_val2014_000000100245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414343, "question_id": "Z8jhCuVRRkgDPRxGdesJXC", "question": "The number of women here can appropriately be referred to as what?", "choices": ["duo", "octagon", "quartet", "trio"], "correct_choice_idx": 3, "direct_answers": ["threesome", "trio", "trio", "trio", "trio", "trio", "three", "triplets", "triple", "trio"], "difficult_direct_answer": false, "rationales": ["Three girls are smiling and posing together.", "The women are a group of three.", "There are three people in the group."], "image": "val2014/COCO_val2014_000000414343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528315, "question_id": "Z93x6rt3aCEQgS8HqErPLM", "question": "What is the horse being used for?", "choices": ["meat", "food production", "racing", "transportation"], "correct_choice_idx": 3, "direct_answers": ["carry people", "pulling", "pulling", "transportation", "pulling", "pulling sleigh", "pulling", "pull buggy", "sleigh rides", "transportation"], "difficult_direct_answer": false, "rationales": ["Two people are petting horse and are about to ride.", "The horse appears to be attached to a sleigh, based on the harnesses and straps, which would be in line with the setting displayed and a common activity done as transportation at this time of year.", "The horse is used for transportation in the mountainside."], "image": "train2014/COCO_train2014_000000528315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254134, "question_id": "Z9FoymXmiG3c5GEJmBpYQN", "question": "Why is there a bright star-shaped aberration in the middle of the laptop screen?", "choices": ["screen damage", "sunlight", "table lamp", "camera flash"], "correct_choice_idx": 3, "direct_answers": ["reflection", "reflection", "camera flash", "camera flash", "reflection", "reflection", "glare", "sunshine", "flash reflection", "flash"], "difficult_direct_answer": false, "rationales": ["The camera is leaving a flash.", "It seems to be the flashlight from a source.", "That is from whomever is taking the picture."], "image": "train2014/COCO_train2014_000000254134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283122, "question_id": "Z9XoADP9uHmmeGZeofH6V4", "question": "Where are these bikes located?", "choices": ["driveway", "mechanic", "indoors", "parking lot"], "correct_choice_idx": 2, "direct_answers": ["inside", "bike shop", "bike shop", "indoors", "store", "museum", "indoors", "inside", "bar", "inside"], "difficult_direct_answer": false, "rationales": ["There are a couple of bikes located indoors.", "Bikes are parked indoors on a showroom floor.", "These bikes are inside a building."], "image": "train2014/COCO_train2014_000000283122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412767, "question_id": "Z9af4Lc8xtqjznUu5A7oij", "question": "What is the name of the container in the background holding coffee?", "choices": ["aeropress coffee", "french press", "drip machine", "coffee maker"], "correct_choice_idx": 1, "direct_answers": ["canister", "coffee pot", "french press", "french press", "cafe press", "french press", "zarf", "french press", "carafe", "mug"], "difficult_direct_answer": false, "rationales": ["This has a metal piece inside that presses coffee grounds down to extract coffee", "A french press stores coffee once brewed.", "The coffee container is a french press."], "image": "val2014/COCO_val2014_000000412767.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444278, "question_id": "ZAEPvGvycznPxk3oZXNPdA", "question": "Why is the man on the board bending his knees?", "choices": ["balance", "to jump", "to sit", "to dive"], "correct_choice_idx": 0, "direct_answers": ["surfing", "balance", "balance", "balance", "balance", "balance", "maintain balance", "surfing", "balance", "maintaining balance"], "difficult_direct_answer": false, "rationales": ["During this type of extreme sport you have to have alot of balance.", "The knees bent are allowing him to get lower on the board.", "Surfer has hands out to keep from falling off."], "image": "train2014/COCO_train2014_000000444278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36990, "question_id": "ZAHJ2MB8gXTtRg2m6Jn4QV", "question": "Where is the bus parked?", "choices": ["curbside", "house driveway", "parking lot", "empty lot"], "correct_choice_idx": 3, "direct_answers": ["empty lot", "outside", "side road", "country road", "road", "under tree", "dirt", "dirt lot", "near store", "street"], "difficult_direct_answer": true, "rationales": ["The bus is the only visible vehicle in the lot so answer a is applicable.", "The bus is near a small building with people by it.", "There are no other buses or cars around."], "image": "val2014/COCO_val2014_000000036990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39464, "question_id": "ZAgvLTWkGmxzjYJA7zajow", "question": "What is the building across the street from the orange sign used for?", "choices": ["education", "business", "religious services", "government office"], "correct_choice_idx": 2, "direct_answers": ["pizza", "library", "food", "columbia library", "church", "municipal building", "church services", "pray marry", "pizzeria", "religious services"], "difficult_direct_answer": true, "rationales": ["The building is for religion.", "The building across the street is identifiable as a cathedral, which is a location in which religious services would often take place.", "The orange sign advertises a library."], "image": "train2014/COCO_train2014_000000039464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403899, "question_id": "ZAuyQY4iMtoGyPQceVkbz4", "question": "Who is the man most likely?", "choices": ["trainer", "matador", "mime", "clown"], "correct_choice_idx": 0, "direct_answers": ["trainer", "trainer", "trainer", "trainer", "trainer", "trainer", "trainer", "trainer", "trainer", "trainer"], "difficult_direct_answer": false, "rationales": ["The man is coaching the elephant for how to perform tricks.", "The man is most likely an elephant trainer.", "They are close to the elephant and it trusts him"], "image": "train2014/COCO_train2014_000000403899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438769, "question_id": "ZBMkFL65oVLcSzAjDRMNh6", "question": "How many flavors available in Rock star energy drink?", "choices": ["ten", "15", "20", "25"], "correct_choice_idx": 2, "direct_answers": ["multiple", "twentyfive", "over 30", "50", "one", "20", "many flavors", "unknown", "many", "three"], "difficult_direct_answer": true, "rationales": ["It's actually more than 30 now. b just happens to be closest.", "According to research they have more than twenty flavors and are in 30 countries.", "The ad on amazon says more than 25."], "image": "train2014/COCO_train2014_000000438769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363191, "question_id": "ZBatu3XvMGjZ8jbuGPXYDf", "question": "Where are these refrigerators being used in?", "choices": ["house", "bar", "convenience store", "restaurant"], "correct_choice_idx": 1, "direct_answers": ["bar", "liquor store", "restaurant", "bar", "bar", "bar", "bar", "bar", "bar", "bar"], "difficult_direct_answer": false, "rationales": ["The fridge is a bar.", "There are many glasses visible that would be found at this volume in a place that serves food and drink on a professional level. the variety of drinks in the fridge and their compactness would be consistent with a bar offering.", "The refrigerators are in a bar."], "image": "train2014/COCO_train2014_000000363191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398534, "question_id": "ZBfkodpq7QZP9zyWCiutsk", "question": "What grain grows near this tennis court?", "choices": ["rye", "wheat", "corn", "oats"], "correct_choice_idx": 2, "direct_answers": ["grass", "corn", "wheat", "wheat", "corn", "corn", "corn", "grass", "wheat", "corn"], "difficult_direct_answer": false, "rationales": ["Stalks of corn are behind a tennis court.", "There are stalks of corn behind the fence.", "Corn is growing near the tennis court because you can see the particular form of this plant"], "image": "val2014/COCO_val2014_000000398534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579461, "question_id": "ZBh7fig3pF2xrXCro4VN8p", "question": "Why are the vehicles in front of the plane?", "choices": ["just waiting", "to load", "carry passengers", "to unload"], "correct_choice_idx": 3, "direct_answers": ["loading luggage", "loading plane", "trams", "loading up", "trucks", "cargo vans", "to unload", "for loading", "luggage carts", "load cargo"], "difficult_direct_answer": true, "rationales": ["These carry the luggage to and from the planes", "The vehicles contain luggage, which has to be loaded onto the plane before it departs.", "The vehicles are unloading luggage."], "image": "train2014/COCO_train2014_000000579461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464682, "question_id": "ZBtKSMJe96wMAQvVnRWmd2", "question": "What's the purpose of the little sticks?", "choices": ["decoration", "keep together", "add flavor", "test doneness"], "correct_choice_idx": 1, "direct_answers": ["hold together", "structure maintenance", "sandwich construction", "hold together", "hold sandwich", "binder", "hold sandwich", "keep together", "hold together", "hold sandwich"], "difficult_direct_answer": false, "rationales": ["The purpose is to hold together.", "The little sticks keep the sandwich from falling apart.", "The toothpicks stop the food from falling apart."], "image": "val2014/COCO_val2014_000000464682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112096, "question_id": "ZC6CQaYQsTUcZDF7d37qLc", "question": "In What room did the appliance seen here was plugged in last?", "choices": ["kitchen", "none", "street alley", "bedroom"], "correct_choice_idx": 0, "direct_answers": ["kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["The object seen in the foreground with a cord is a microwave based on its size, shape and design. this object is commonly used in answer a.", "This is a kitchen microwave and was last plugged in there.", "This is a microwave"], "image": "train2014/COCO_train2014_000000112096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212969, "question_id": "ZCAPWGQ5cAySajvhmQmQTS", "question": "Why does the man have a bottle in his backpack?", "choices": ["giving gift", "for hydration", "for balance", "as decoration"], "correct_choice_idx": 1, "direct_answers": ["thirsty", "drink later", "drink water", "hydration", "quench thirst", "for hydration", "hydration", "balancing", "hydration", "hydrate"], "difficult_direct_answer": false, "rationales": ["He is engaged in an activity that will increase thirst.", "The man doesn't want to be thirsty.", "The man wants hydration."], "image": "train2014/COCO_train2014_000000212969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195394, "question_id": "ZCAvzMyvrnhLZeGNzLg86t", "question": "Transport using skis to glide on snow is called?", "choices": ["surfing", "skiing", "snowboarding", "kiting"], "correct_choice_idx": 1, "direct_answers": ["skiing", "sun kits", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["A person wears skis to either go down a hill or cross country ski and both are done on snow.", "A person using skis on snow is called snow skiing.", "They're skiing."], "image": "train2014/COCO_train2014_000000195394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207564, "question_id": "ZCDe9Wf84WdhVsqY9ZgFfN", "question": "The bird on the water has what type feet?", "choices": ["club", "taloned", "none", "webbed"], "correct_choice_idx": 3, "direct_answers": ["web feet", "webbed", "webbed feet", "webbed feet", "webbed", "webbed", "webbed", "webbed", "webbed", "webbed"], "difficult_direct_answer": false, "rationales": ["The bird on the water is a duck and has feet that are webbed for swimming.", "The bird has webbed feet.", "The bird has webbed feet."], "image": "train2014/COCO_train2014_000000207564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457976, "question_id": "ZCHFCWNerqFvzUZcSoWcCx", "question": "How many legs in this image?", "choices": ["12", "seven", "four", "six"], "correct_choice_idx": 1, "direct_answers": ["seven", "twelve", "12", "six", "six", "eight", "seven", "twelve 12", "seven", "seven"], "difficult_direct_answer": false, "rationales": ["The animal shows seven legs.", "There are three cows but only some legs are showing.", "There are three animals. they each have four legs."], "image": "train2014/COCO_train2014_000000457976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65668, "question_id": "ZCVygPW6iPJYB5TT8BKPxK", "question": "Why does the woman have a huge hat?", "choices": ["disguise", "sun protection", "hide money", "selling hat"], "correct_choice_idx": 1, "direct_answers": ["shade", "sun protection", "shade", "sun protection", "sell", "sun", "sun protection", "for shade", "sun protection", "keep cool"], "difficult_direct_answer": false, "rationales": ["The woman wants to avoid sunburn.", "The woman's conical hat, originating in vietnam, keeps her face in the shade no matter what angle her head might be tilting.", "The woman wants protection."], "image": "train2014/COCO_train2014_000000065668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563938, "question_id": "ZCrBv6SxW26duRqTUqviqA", "question": "In what setting do these people chat?", "choices": ["living room", "mall", "patio", "bathroom"], "correct_choice_idx": 2, "direct_answers": ["outdoors", "bar", "friendly", "patio", "outside", "patio", "patio", "outside", "restaurant", "outside"], "difficult_direct_answer": false, "rationales": ["People are sitting at a table with condiments and drinks and are outside with sunglasses on.", "The people are sitting outdoors likely on a patio.", "The people appear to be outside. living rooms, bathrooms, and malls are inside, not outside."], "image": "val2014/COCO_val2014_000000563938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548420, "question_id": "ZD2M4B6dVPhQXXZYCJyvxv", "question": "What does the stick help the man near the elephant do?", "choices": ["impregnate it", "fight it", "brush it", "control it"], "correct_choice_idx": 3, "direct_answers": ["control it", "guide", "guide elephant", "herd elephant", "guide it", "tame", "walk", "guide elephant", "guide", "guide"], "difficult_direct_answer": false, "rationales": ["The elephant is controlled.", "The stick is used by the man as a prod, hopefully a gentle one, as he guides the elephant left, right and forward while giving the tourists a ride.", "Most trainers use sticks when dealing with large animals."], "image": "val2014/COCO_val2014_000000548420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514605, "question_id": "ZDK2ep9YGvxRgRfdfgDnSL", "question": "What sport can be associated with the above picture?", "choices": ["wake boarding", "paragliding", "surfing", "sailing"], "correct_choice_idx": 1, "direct_answers": ["sky diving", "running", "parasailing", "paragliding", "kiting", "parasailing", "parasailing", "paragliding", "land surfing", "handgliding"], "difficult_direct_answer": false, "rationales": ["There is a large kite in the air. kites are used in parasailing.", "A person is on a beach with a large kite.", "When you use a kite like thing to fly through the air."], "image": "train2014/COCO_train2014_000000514605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554502, "question_id": "ZDSGjBHTpqxdx7RWoS3UPE", "question": "What is the lampshade made of?", "choices": ["metal", "stained glass", "ceramic", "fabric"], "correct_choice_idx": 1, "direct_answers": ["stained glass", "stained glass", "stained glass", "glass", "glass", "stained glass", "glass", "lace", "stained glass", "lead"], "difficult_direct_answer": false, "rationales": ["Decorated glass is shining thru lamp. it has patterns on it.", "As soon as you look at it the answer is clear.", "Based on the color and transparency of the lamp in question, answer a is most likely."], "image": "train2014/COCO_train2014_000000554502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575303, "question_id": "ZDmaxhkywnij8orK6ZRcAf", "question": "What instrument is the man playing here?", "choices": ["keyboard", "harp", "piano", "accordion"], "correct_choice_idx": 3, "direct_answers": ["accordion", "accordion", "accordion", "accordion", "accordion", "accordion", "accordion", "accordion", "accordion", "accordion"], "difficult_direct_answer": false, "rationales": ["There's no mistaking the big box shape of an accordion, and the man here appears to be serenading some cows with his!.", "It is a small piano keyboard with other buttons", "The instrument he's playing is an accordion."], "image": "train2014/COCO_train2014_000000575303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362092, "question_id": "ZDtxshmmbzYcqT7ECMyqn2", "question": "What do people most likely do in the structure?", "choices": ["sleep", "vote", "run", "swim"], "correct_choice_idx": 0, "direct_answers": ["ride it", "boat", "ride", "house boat", "sleep", "sit", "tours", "uses needs", "sail", "sleep"], "difficult_direct_answer": true, "rationales": ["A person has somehow made boat into a house and has curtains for privacy.", "This bus has been turned into a home.", "People sleep on structures like the old van."], "image": "val2014/COCO_val2014_000000362092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208779, "question_id": "ZE977bjjVwnzzTouvVUnNa", "question": "Where is the sheep located?", "choices": ["humane society", "shopping mall", "school", "fair"], "correct_choice_idx": 3, "direct_answers": ["pen", "fair", "inside pen", "in farm", "fenced zone", "pen", "inside pen", "petting zoo", "zoo", "pen"], "difficult_direct_answer": false, "rationales": ["The sheep is at a fair.", "The sheep is at a petting zoo which are often at fairs.", "The sheep are in a petting zoo."], "image": "train2014/COCO_train2014_000000208779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172680, "question_id": "ZEE3War9c5LmUDBtNyPrn9", "question": "What is the original flavor of the beverage?", "choices": ["orange", "grape", "cherry", "lemon-lime"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "lemon lime", "lemon", "no beverage", "lemon-lime", "vanilla", "lemon", "sweet", "plain"], "difficult_direct_answer": false, "rationales": ["There is a box of 7-up which is known to be lemon lime flavored.", "The box reading '7-up' on top of the refrigerator in this image contains or once contained a drink known for it's tangy citrus flavor.", "7-up is not cherry, grape nor orange."], "image": "train2014/COCO_train2014_000000172680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12666, "question_id": "ZEdL2Y56EpMUpFjFGwKyb9", "question": "Where is the van most likely traveling to?", "choices": ["residential places", "insurance places", "education places", "sightseeing places"], "correct_choice_idx": 3, "direct_answers": ["tour site", "haiti", "haiti", "sightseeing places", "to work", "work place", "haiti", "hotel", "work", "sightseeing places"], "difficult_direct_answer": false, "rationales": ["The van is sightseeing.", "There is large text painted on the front center of the van that says \"l'amour du travail\" which means \"the love of travel\" in english. there are other theatrical elements painted on the van, and it is full of people.", "This is a travel van and they are tourist that want to look around."], "image": "val2014/COCO_val2014_000000012666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565500, "question_id": "ZEienjSyYd6zC5a9NKikPj", "question": "What is a symbol for the word in green letters on the bus?", "choices": ["rose", "fist", "2 snakes", "4-leaf clover"], "correct_choice_idx": 3, "direct_answers": ["fourleaf clover", "crossed fingers", "shamrock", "clover", "4-leaf clover", "clover", "horseshoe", "4-leaf clover", "four-leaf clovers", "shamrock"], "difficult_direct_answer": false, "rationales": ["Luck is resembled by a clover.", "It is an irish sign for luck.", "Four leaf clovers are believed to be lucky."], "image": "train2014/COCO_train2014_000000565500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351577, "question_id": "ZFAVAegoCbCWzabduW5Je8", "question": "Why does the player eat banana?", "choices": ["mandatory", "personal preference", "replenish energy", "hungry"], "correct_choice_idx": 2, "direct_answers": ["hungry", "energy", "energy", "energy", "hungry", "hungry", "energy", "energy", "energy", "replenish energy"], "difficult_direct_answer": false, "rationales": ["He is eating it for energy", "The player eats the banana to replenish energy during tennis.", "Bananas are good for fast nutrients and will make them feel refreshed."], "image": "val2014/COCO_val2014_000000351577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389401, "question_id": "ZFopKK62i4ocK3hEpjkBv8", "question": "What is the black object attaching the bike to the pole being used as?", "choices": ["pulley", "wrench", "ramp", "lock"], "correct_choice_idx": 3, "direct_answers": ["bike lock", "lock", "lock", "lock", "lock", "bike security", "lock", "lock", "lock", "lock"], "difficult_direct_answer": false, "rationales": ["The black object is there to keep the bike from being stolen by a thief.", "This secures it to keep people from easily stealing it", "It's a chain used to keep the bike from getting stolen."], "image": "val2014/COCO_val2014_000000389401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335774, "question_id": "ZG5JzPnudeESq8jyM3xpKw", "question": "What type of dish could this be considered?", "choices": ["appetizer", "dessert", "side", "entree"], "correct_choice_idx": 0, "direct_answers": ["buffet", "dinner", "exotic", "antipasto", "lunch", "charcuterie", "hors d'oeuvre", "appetizer", "platter", "appetizers"], "difficult_direct_answer": true, "rationales": ["The dish is an appetizer.", "The dish is hearty and heavy.", "This tray is an appetizer since it's just a few snacks."], "image": "val2014/COCO_val2014_000000335774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485509, "question_id": "ZGTzsLDcgrXvq9HkVeeb6G", "question": "What town is this national park based in?", "choices": ["grand junction", "breckenridge", "aspen", "estes park"], "correct_choice_idx": 3, "direct_answers": ["whistler", "somewhere snowy", "nome", "estes park", "toronto", "yosemite", "yosemite", "estes", "unknown", "yellowstone"], "difficult_direct_answer": true, "rationales": ["Estes park is near the rocky mountains in colorado. mountains can be seen in the distance and there is snow all around.", "This could be aspen.", "Estes park is where the park is."], "image": "val2014/COCO_val2014_000000485509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502748, "question_id": "ZH4GAvrTuAkUozUzxbCdHa", "question": "Which instrument here requires electrical current to be audible?", "choices": ["trombone", "keyboard", "voice", "triangle"], "correct_choice_idx": 1, "direct_answers": ["keyboard", "keyboard", "keyboard", "keyboard", "tuba", "electric keyboard", "keyboard", "keyboard", "keyboard", "piano"], "difficult_direct_answer": false, "rationales": ["The keyboard is an electrical one.", "The keyboardist requires electricity for their music to be heard.", "Men with instruments are sitting around a room including guitars and keyboards."], "image": "train2014/COCO_train2014_000000502748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341060, "question_id": "ZHFvxU9HC6pQN65HQ7v79Q", "question": "What desert is on the clear glass plate on the left of the laptop?", "choices": ["muffin", "crumpet", "scone", "bagel"], "correct_choice_idx": 0, "direct_answers": ["muffin", "cupcake", "cupcake", "muffin", "muffin", "muffing", "stone", "muffin", "muffin", "muffin"], "difficult_direct_answer": false, "rationales": ["A chocolate muffin is shown.", "The object looks soft and smushy. it closely resembles that of a cupcake shape.", "The other options aren't shaped in this form."], "image": "train2014/COCO_train2014_000000341060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320721, "question_id": "ZHLVMZCrj5pcpLKgzaPU92", "question": "What water sport will the bike rider most likely do next?", "choices": ["kayak", "water ski", "wind surf", "surf"], "correct_choice_idx": 3, "direct_answers": ["surfing", "surf", "surfing", "surfing", "surfing", "surf", "surfing", "surf", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["There is a surfboard on the bike.", "The bike rider will want to get up and ride to surf on the board.", "The bike has a board."], "image": "train2014/COCO_train2014_000000320721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292990, "question_id": "ZHVqSsKWXyRTayDqnhYgZP", "question": "Which is the dominant hand for the batter here?", "choices": ["left", "neither", "left foot", "right"], "correct_choice_idx": 0, "direct_answers": ["left", "left", "left", "left", "left", "right", "left", "left", "right", "left"], "difficult_direct_answer": false, "rationales": ["The person is standing on the right side of the plate.", "The batter is standing opposite of the home base indicating that he is left handed.", "The batter's right hand is near the bottom of the bat. his other hand is dominant."], "image": "val2014/COCO_val2014_000000292990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105844, "question_id": "ZHZGd49WS88bqEQCGK6VHw", "question": "Which one has better eyesight?", "choices": ["white top", "black pants", "grey top", "orange stripe"], "correct_choice_idx": 0, "direct_answers": ["white jacket", "left one", "left", "left man", "white top", "white shirt", "left", "left side", "white shirt", "left man"], "difficult_direct_answer": false, "rationales": ["He does not need glasses.", "The person in white has better eyes.", "The one in the white top has a better eyesight."], "image": "train2014/COCO_train2014_000000105844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421218, "question_id": "ZJ8zRZT6tt3r3ow3VwhatJ", "question": "Why is the man midair in the middle of the steps?", "choices": ["stood up", "performing trick", "was thrown", "he fell"], "correct_choice_idx": 1, "direct_answers": ["performing trick", "trick", "performing trick", "performing trick", "performing trick", "jumped", "doing stunt", "jumped", "flying down", "performing trick"], "difficult_direct_answer": false, "rationales": ["You can by the setting and his position in the air as to what he is doing.", "The man is performing the tricks on the stairs.", "He's doing a trick."], "image": "val2014/COCO_val2014_000000421218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78042, "question_id": "ZJBU6GT5FMemT5k3oXo9UC", "question": "What form of travel is this pass intended for?", "choices": ["skateboarding", "walking", "cycling", "skiing"], "correct_choice_idx": 1, "direct_answers": ["walking", "bicycling", "pedestrians", "walking", "walking", "walking", "pedestrian", "walking", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["The travel is walking.", "The road prohibits bikes.", "The path is for pedestrians since cyclists aren't allowed."], "image": "train2014/COCO_train2014_000000078042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61831, "question_id": "ZJDmtAbuGmjSQfsJRtx4iB", "question": "What do the meters display?", "choices": ["colors", "temperature", "language", "time"], "correct_choice_idx": 3, "direct_answers": ["speed", "time", "time", "parking times", "coffee cup", "display weight", "parking time", "time", "time left", "zero"], "difficult_direct_answer": false, "rationales": ["They track how long a car can park there.", "Parking meters display the amount of time left for parking based on the amount of money placed by the driver parking.", "The time is displayed so that you know how long you have to park in the area."], "image": "val2014/COCO_val2014_000000061831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60610, "question_id": "ZJXc4tqd4HsZfrmcSYwJmU", "question": "In which country does this bus drive?", "choices": ["usa", "united kingdom", "belgium", "france"], "correct_choice_idx": 1, "direct_answers": ["uk", "england", "england", "united kingdom", "england", "england", "uk", "england", "england", "uk"], "difficult_direct_answer": false, "rationales": ["Most double deckers are seen in england.", "The bus is going through trafalgar square.", "The country uses that kind of buses."], "image": "val2014/COCO_val2014_000000060610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440045, "question_id": "ZJmphtBieJJKxjzWg6DJYB", "question": "How do these people know each other?", "choices": ["siblings", "competitors", "teammates", "coworkers"], "correct_choice_idx": 0, "direct_answers": ["siblings", "childhood friends", "siblings", "siblings", "friends", "family", "brothers", "siblings", "siblings", "school"], "difficult_direct_answer": false, "rationales": ["They are all brothers.", "All kids are sitting at a beach of similar age, look like each other.", "The people in question visibly seem to be within a few years age of each other and look similarly."], "image": "val2014/COCO_val2014_000000440045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92909, "question_id": "ZKGk8PLFCyuutZACoHycX4", "question": "What material is the flooring?", "choices": ["porcelain", "plastic", "wood", "laminate"], "correct_choice_idx": 3, "direct_answers": ["wood", "tile", "wood", "laminate", "vinyl", "laminate", "tile", "linoleum", "marble", "tile"], "difficult_direct_answer": false, "rationales": ["The material is laminated.", "The floor is laminated since it's so shiny.", "The flooring is clearly visible and based on the color, sheen and composition, answer a is most likely and the other answers are ruled out."], "image": "train2014/COCO_train2014_000000092909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154496, "question_id": "ZL8PdMriLth2buhBfoTWZV", "question": "What sport other that the ball's proper sport does the ball look closest to belonging to?", "choices": ["american football", "volleyball", "golf", "tennis"], "correct_choice_idx": 1, "direct_answers": ["volleyball", "soccer", "volleyball", "kickball", "volleyball", "volleyball", "hacky sack", "rugby", "hat guy", "basketball"], "difficult_direct_answer": false, "rationales": ["They do look similar.", "Volleyballs look like soccer balls.", "The ball is large and white. it is similar size to one hit over net."], "image": "train2014/COCO_train2014_000000154496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182505, "question_id": "ZL8djB9fWKUdg8MXkgXNSf", "question": "What is the man poking at here?", "choices": ["rat", "bat", "pizza", "cat"], "correct_choice_idx": 2, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "fire", "pizza", "pizza oven"], "difficult_direct_answer": false, "rationales": ["The man wants pizza.", "The man is trying to get pizza out of the oven.", "He is using a long handled pizza shovel and there is a pizza on a platter."], "image": "train2014/COCO_train2014_000000182505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170045, "question_id": "ZLYQKbBCHBwDMhDJ25MLHi", "question": "What will be poured over the item in the box?", "choices": ["water", "yogurt", "milk", "ketchup"], "correct_choice_idx": 2, "direct_answers": ["milk", "chocolate", "garbage", "milk", "food", "batter", "water", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["The item in the box is cereal. that is usually eaten with milk.", "Honey oats cereal is good with dairy.", "People put this on their cereal to eat it."], "image": "train2014/COCO_train2014_000000170045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199534, "question_id": "ZLnUVvzNeoeVANLf22EwqD", "question": "What is the bus primarily used for?", "choices": ["tours", "racing", "shipping", "school"], "correct_choice_idx": 0, "direct_answers": ["city views", "tours", "sightseeing", "transport", "sight seeing", "intercity transportation", "public transport", "tours", "transportation", "tourist"], "difficult_direct_answer": true, "rationales": ["The bus says city view on it. it is a tour bus.", "The bus is used for tour guides.", "The bus has advertising on it offering to take people to see the sites and hear the history."], "image": "train2014/COCO_train2014_000000199534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187650, "question_id": "ZLyxmmPQbUgDCyPyPE7uF7", "question": "What type of animals are present?", "choices": ["goat", "deer", "giraffe", "dog"], "correct_choice_idx": 2, "direct_answers": ["giraffe", "giraffe", "giraffe", "giraffe", "giraffe", "giraffes", "giraffes", "giraffes", "giraffe", "giraffes"], "difficult_direct_answer": false, "rationales": ["These animals have the correct spots and are very tall.", "These are giraffes.", "Obvious by length of neck, spots and other features."], "image": "train2014/COCO_train2014_000000187650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534194, "question_id": "ZMLDMwRThEp2uXYUj3ErDG", "question": "What phenomenon happened most recently here?", "choices": ["fire storm", "volcanic eruption", "snow storm", "flood"], "correct_choice_idx": 3, "direct_answers": ["flood", "flood", "flooding", "flood", "flood", "flood", "flood", "typhoon", "flooding", "flood"], "difficult_direct_answer": false, "rationales": ["The water is completely covering area where the bench is located so there was flooding.", "The seat is in water.", "The phenomenon is a flood."], "image": "val2014/COCO_val2014_000000534194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320053, "question_id": "ZMSn63cxL2eeo72TLu2wqY", "question": "What do the kids want to do with the ball?", "choices": ["taste it", "kick it", "hide it", "grab it"], "correct_choice_idx": 1, "direct_answers": ["playing", "kick it", "kick", "kick", "claim possession", "score goal", "kick it", "kick it", "kick it", "kick it"], "difficult_direct_answer": false, "rationales": ["They are chasing it to kick it.", "The kids are playing soccer.", "They are playing soccer so they would use their feet."], "image": "val2014/COCO_val2014_000000320053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101646, "question_id": "ZN8J5KcJEE8YvV7ZUVJkdw", "question": "Who decides if the pitch was good or bad?", "choices": ["catcher", "referee", "umpire", "crowd"], "correct_choice_idx": 2, "direct_answers": ["ump", "ump", "umpire", "umpire", "umpire", "umpire", "umpire", "umpire", "ump", "umpire"], "difficult_direct_answer": false, "rationales": ["The man in black has the best view of the pitch.", "The answer is commonly known in baseball which is the sport depicted.", "The umpire decides."], "image": "train2014/COCO_train2014_000000101646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503926, "question_id": "ZNK2t2dZyCGgRNedgmD5Sc", "question": "What are the metal things at the top left supposed to hold?", "choices": ["sand", "candles", "gasoline", "cookies"], "correct_choice_idx": 1, "direct_answers": ["drapes", "candles", "candles", "candles", "table", "clothes", "lantern covers", "candles", "candles", "candles"], "difficult_direct_answer": false, "rationales": ["The items hold burning objects upright.", "The metal things are candlesticks.", "The objects in question are located based on the question. they are the size, shape and style of something that would be used to hold answer a."], "image": "val2014/COCO_val2014_000000503926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449599, "question_id": "ZNRf5soRthvYCb9mvvd6RD", "question": "What is the cat's paw near?", "choices": ["ruler", "human hand", "monkey's paw", "bagel"], "correct_choice_idx": 0, "direct_answers": ["book", "ruler", "book", "book", "book", "document", "book", "book", "book", "cat"], "difficult_direct_answer": false, "rationales": ["The object near the paw is a ruler.", "There's a ruler on the paper.", "It's near the ruler and books."], "image": "train2014/COCO_train2014_000000449599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444390, "question_id": "ZNmu7fzmuBSZidLop4FpWj", "question": "What is on top of the counter?", "choices": ["cat", "toy horse", "towel", "baby"], "correct_choice_idx": 1, "direct_answers": ["toy horse", "vase", "horse statue", "rag", "coffee machine", "burner", "flowers", "vase", "flowers", "phone"], "difficult_direct_answer": false, "rationales": ["The top of the counter holds a toy horse.", "There is a model of a non-human animal on the counter. the animal has a mane.", "A toy horse sculpture is on the counter."], "image": "val2014/COCO_val2014_000000444390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94513, "question_id": "ZP4SCnS7C9VfWgQ8XwMCdS", "question": "What industry have these animals traditionally helped humans in?", "choices": ["truffle hunting", "farming", "textiles", "metal work"], "correct_choice_idx": 1, "direct_answers": ["farming", "transportation", "farming", "farming", "agriculture", "cattle", "farming", "horseback riding", "farming", "farming"], "difficult_direct_answer": false, "rationales": ["Two horses are standing in the water. a man is on one of them", "Horses are in the water.", "Horses are very popular farm animals."], "image": "val2014/COCO_val2014_000000094513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238562, "question_id": "ZPAdLbat5qjhVJSDd3nmSD", "question": "What would be the major taste biting into the bottom right donut?", "choices": ["sour", "bitter", "salty", "sweet"], "correct_choice_idx": 3, "direct_answers": ["sugar", "sprinkles", "icing", "sweet", "sweet", "sweet", "sugar", "burger", "chocolate", "sprinkles"], "difficult_direct_answer": false, "rationales": ["Donuts are always sugary.", "The sprinkles and frosting of this bottom right donut imply it will taste sweet when eaten.", "The donuts are sweet."], "image": "train2014/COCO_train2014_000000238562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309580, "question_id": "ZPByeJqzfFcc8ik7JYEUWG", "question": "Who controls the channels on this TV?", "choices": ["dog", "human owner", "ferret", "cat"], "correct_choice_idx": 1, "direct_answers": ["person", "person", "person", "human owner", "human", "remote", "person", "person", "buttons", "person"], "difficult_direct_answer": false, "rationales": ["The human owner is the one who has though and knows how to use the remote.", "Animals do not watch television and the human has opposable thumbs to control the tv.", "The human controls."], "image": "train2014/COCO_train2014_000000309580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297444, "question_id": "ZPDBypoDEoYNQ3KJd6Wghv", "question": "What age is the person being feted here?", "choices": ["nine", "newborn", "two", "92"], "correct_choice_idx": 3, "direct_answers": ["ninety two", "92", "92", "92", "92", "ninety two", "ninety two", "92", "92", "92"], "difficult_direct_answer": false, "rationales": ["The age is 92.", "The numbers on the top of the cake indicate the person's age.", "The 92 shaped candles on this cake which has 'happy birthday' written on it in icing strongly suggests someone has recently turned 92."], "image": "val2014/COCO_val2014_000000297444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80013, "question_id": "ZPMBcXmFNfxg45Yn27Q3jV", "question": "Where did the water on the ground come from?", "choices": ["fire hydrant", "ocean", "rain", "bucket"], "correct_choice_idx": 0, "direct_answers": ["hydrant", "hydrant", "fire hydrant", "fire hydrant", "fire hydrant", "hydrant", "fire hydrant", "hydrant", "fire hydrant", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["It is a sunny day in an area that is not near the ocean. first responders are present, and they are using a device that specifically exists to allow them to obtain water.", "The water is from the hydrant.", "The firefighters are messing with the fire hydrant."], "image": "val2014/COCO_val2014_000000080013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40211, "question_id": "ZPNBuGpY5iVo4hNTsVtARS", "question": "Which lens used in side mirror of the car?", "choices": ["macro", "concave", "convex", "zoom"], "correct_choice_idx": 2, "direct_answers": ["dog", "glass", "convex", "camera", "camera", "camera", "convex", "nikon", "camera", "concave lens"], "difficult_direct_answer": false, "rationales": ["The mirror is convex since it slopes outward.", "B is actually a lens or mirror that curves inward.", "The lens is visible and the orientation of the lens is identifiable."], "image": "train2014/COCO_train2014_000000040211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186777, "question_id": "ZPi5aCMHDuXkiih5k9pEJy", "question": "What activity does the posted sign advise is not allowed?", "choices": ["diving", "biking", "fishing", "swimming"], "correct_choice_idx": 3, "direct_answers": ["swimming", "swimming", "swimming", "no swimming", "swimming", "swimming", "swimming", "no swimming", "no swimming", "swimming"], "difficult_direct_answer": false, "rationales": ["Its in red letters and says what you can do with water.", "They are not allowing people to swim in the ocean at the time.", "The posting has words written on it that are readable and confirm what is not allowed in this setting."], "image": "train2014/COCO_train2014_000000186777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297323, "question_id": "ZPswesADMvg6muPE7rNfye", "question": "What vehicles are near the curb?", "choices": ["plane", "bus", "scooter", "bicycle"], "correct_choice_idx": 1, "direct_answers": ["buses", "busses", "buses", "buses", "bus", "vans", "bus", "buses", "bus", "buses"], "difficult_direct_answer": false, "rationales": ["Buses are parked next to the curb.", "There is a bus.", "Buses are near the curb."], "image": "val2014/COCO_val2014_000000297323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311076, "question_id": "ZPtDVJWQZYzN8Xp6ReFurS", "question": "Who captains the team of the jersey in red?", "choices": ["mario iginla", "jonathan toes", "lionel messi", "bill reid"], "correct_choice_idx": 1, "direct_answers": ["not sure", "blackhawks", "no idea", "blackhawks", "jonathan toes", "blackhawk", "jonathan toews", "smith", "jonathan toews", "toews"], "difficult_direct_answer": false, "rationales": ["He has captained the team since the 2008-2009 season.", "The boy in the photos is wearing a chicago blackhawk hockey jersey. the captain of this team is jonathan toews.", "Jonathan toes is the captain of the chicago blackhawks"], "image": "train2014/COCO_train2014_000000311076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103817, "question_id": "ZQ4Mu4GdpvqZbwqHY9GtWV", "question": "What type of parking is available here?", "choices": ["lot", "parallel", "valet", "angle"], "correct_choice_idx": 1, "direct_answers": ["street", "curb", "street", "parallel", "street", "car", "street parking", "street", "limited", "parallel parking"], "difficult_direct_answer": false, "rationales": ["The cars have to park the same way as the sidewalk", "The cars are parked vertically on the curb.", "Cars need to park on the side of the street."], "image": "train2014/COCO_train2014_000000103817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238427, "question_id": "ZQK4wdML7ZAWTPkbt5t8Sx", "question": "What king of game are the people above playing?", "choices": ["ice skating", "soccer", "gliding", "skiing"], "correct_choice_idx": 3, "direct_answers": ["skiing", "skiing", "skis", "skiing", "hill", "skiing", "king ski", "skiing", "scatting", "skiing"], "difficult_direct_answer": false, "rationales": ["The people are skiing because they are on a moutain wearing skis", "The people above are all skiing together.", "There is snow on the ground, so they are not gliding or playing soccer. they also do not have ice skates."], "image": "train2014/COCO_train2014_000000238427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429448, "question_id": "ZQTJCHdgrmyhqdApmTyDYK", "question": "Why are the women filling the table with plates?", "choices": ["to paint", "to cook", "to decorate", "to serve"], "correct_choice_idx": 3, "direct_answers": ["to serve", "buffet", "party", "for birthday", "serving crowd", "serving lunch", "for guests", "party", "guests coming", "serving lunch"], "difficult_direct_answer": false, "rationales": ["This is so people can quickly pick up a plate and keep moving", "The women are filling the table with plates of pizza for dining.", "People will come in and eat this food."], "image": "train2014/COCO_train2014_000000429448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469282, "question_id": "ZR6ygwtwbXwpwXinLMhMgV", "question": "What color is the boat closest to the person taking the photo?", "choices": ["blue", "orange", "purple", "yellow"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "red", "orange", "bright red", "bright red", "red", "red", "bright red"], "difficult_direct_answer": false, "rationales": ["The side of the boat is orange.", "The boat closest to the photographer has an orange hull.", "Unless you are colorblind you can tell what color the boat closest is."], "image": "train2014/COCO_train2014_000000469282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326898, "question_id": "ZRStGFoCy4qpZYqiSaPWvT", "question": "The man holding something is likely to develop what ailment?", "choices": ["text neck", "nosebleed", "broken foot", "torn quad"], "correct_choice_idx": 0, "direct_answers": ["carpal tunnel", "website", "eye strain", "bad vision", "text neck", "diabetes", "neck pain", "carpel tunnel", "carpal tunnel", "carpal tunnel"], "difficult_direct_answer": false, "rationales": ["His head is leaning downward towards the cellphone so he is likely to have problems with his cerebral vertebrae.", "The man is looking down at his phone.", "His neck will be sore from looking down."], "image": "val2014/COCO_val2014_000000326898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203986, "question_id": "ZRd9ZpFuBvhnh6xzeiDAD8", "question": "Besides the ground what hard surface are the spectators sitting on?", "choices": ["marble", "bleachers", "plastic", "pavement"], "correct_choice_idx": 1, "direct_answers": ["stadium", "bleachers", "bleachers", "bleachers", "grass", "visitors place", "bleachers", "bleachers", "grass", "bleachers"], "difficult_direct_answer": false, "rationales": ["There are people spectating from the ground and the bleachers.", "The surface is very hard.", "The spectators are sitting on the metal benches which are known as bleachers."], "image": "train2014/COCO_train2014_000000203986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99416, "question_id": "ZRdBvSJzgcUB94ycXP5xaB", "question": "What kind of sight do the glasses worn by the tie fixer correct for?", "choices": ["none", "blindness", "reversable", "far"], "correct_choice_idx": 3, "direct_answers": ["far", "farsightedness", "bad eyesight", "eye problems", "poor", "blurry", "nearsightedness", "closesighted", "far sighted", "nearsighted"], "difficult_direct_answer": true, "rationales": ["He's farsighted.", "The man cannot see far away.", "He is using the glasses to do close-up work, which means he is most likely farsighted (he sees better far away)."], "image": "val2014/COCO_val2014_000000099416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241350, "question_id": "ZRiQD8zx89vgw8Ge4pBKH5", "question": "What is on the wall directly above the bigger monitor?", "choices": ["mirror", "calendar", "clock", "painting"], "correct_choice_idx": 1, "direct_answers": ["chart", "calendar", "calendar", "calendar", "whiteboard", "calendar", "bulletin board", "calendar", "calendar", "calendar"], "difficult_direct_answer": false, "rationales": ["There is a calendar on front of the wall.", "A wall calendar hangs above the larger monitor.", "It has numbered squares on it"], "image": "val2014/COCO_val2014_000000241350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511906, "question_id": "ZRuZVHjT9Ng4RHwyY3mTLR", "question": "What is in the sandwich?", "choices": ["codfish", "apple", "pickle", "steak"], "correct_choice_idx": 2, "direct_answers": ["vegetables", "vegetables", "vegetables", "cucumber", "pickle", "vegetables", "pickle", "pickles", "vegetable", "vegetables"], "difficult_direct_answer": false, "rationales": ["It appears to only have vegetables inside.", "The sandwich lacks pickles.", "There is a cucumber pickled inside of the sandwich."], "image": "val2014/COCO_val2014_000000511906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254325, "question_id": "ZRzbZj9LRwfAyfspiwbr8Y", "question": "What is on the wall to the left?", "choices": ["teddy bear", "poster", "statue", "television"], "correct_choice_idx": 3, "direct_answers": ["tv", "television", "television", "television", "television", "tv", "television", "television", "television", "television"], "difficult_direct_answer": false, "rationales": ["A television is on the wall.", "The screen being on makes this obvious. the other options aren't in the room.", "It's exactly where you'd expect to see it."], "image": "train2014/COCO_train2014_000000254325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216051, "question_id": "ZS2ztY2MoQAFenzWH6idqi", "question": "Which location does the woman most likely rest in?", "choices": ["zoo", "race track", "farm pen", "dog park"], "correct_choice_idx": 3, "direct_answers": ["dog park", "bench", "dog park", "urban", "park", "park", "park", "house", "dog park", "park"], "difficult_direct_answer": false, "rationales": ["The woman is outside with her dog out in what looks like a park setting.", "The woman is sitting on a bench in an area that is fenced in. she is holding a dog.", "There is a fence behind her and a dog on her lap."], "image": "val2014/COCO_val2014_000000216051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386661, "question_id": "ZS7fA56hWYUMj5QRNmU8DV", "question": "The structures enclosing the blue benches are constructed from which wood?", "choices": ["oak", "mahogany", "pine", "bamboo"], "correct_choice_idx": 3, "direct_answers": ["bamboo", "bamboo", "bamboo", "pine", "pine", "sea", "bamboo", "oak", "bamboo", "bamboo"], "difficult_direct_answer": false, "rationales": ["The wood is very light color.", "This is the wood that will not warp when exposed to water.", "The structures are made of bamboo."], "image": "val2014/COCO_val2014_000000386661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125266, "question_id": "ZSAMRTAr79sSCwTUXSa6kc", "question": "Why is the truck not moving?", "choices": ["wet pavement", "stop sign", "no driver", "no gas"], "correct_choice_idx": 1, "direct_answers": ["shift direction", "stop sign", "stop sign", "parked", "stop sign", "stopped", "stop sign", "stop sign", "stop sign", "stop sign"], "difficult_direct_answer": false, "rationales": ["The sigh simple the tells the truck to not move.", "It's at a stop sign.", "The truck is not moving near to the stop sign."], "image": "train2014/COCO_train2014_000000125266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202880, "question_id": "ZSBktjTgEHJBsyiSc9gYJS", "question": "Why is the animal facing the tree?", "choices": ["to sit", "to sleep", "to water", "to eat"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eat leaves", "eating", "eating", "eat leaves", "eating leaves", "eating", "eating", "to eat"], "difficult_direct_answer": false, "rationales": ["Giraffes like to eat leaves from trees.", "The giraffe is facing the tree in order to eat its leaves that are high up", "Giraffes eat foods from high places, as their long necks are built for it. they generally eat leaves, flowers and fruits, when available."], "image": "val2014/COCO_val2014_000000202880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414492, "question_id": "ZSK4TfYsjaVcsHZcPBYfFk", "question": "What is one of the stops for this bus?", "choices": ["dublin", "edinburgh", "oxford circus", "victoria station"], "correct_choice_idx": 2, "direct_answers": ["oxford circus", "oxford circus", "oxford circus", "oxford", "lambeth building", "oxford circus", "oxford circus", "circus", "oxford", "oxford circus"], "difficult_direct_answer": false, "rationales": ["The sign on the front of the bus indicates where it stops.", "The sign on the bus says oxford circus.", "The sign on the front of the bus says oxford circus, which is its next destination."], "image": "train2014/COCO_train2014_000000414492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188987, "question_id": "ZSMGeiwtZrdS3H7qbRVcSj", "question": "Why are the people on the elephant?", "choices": ["riding it", "both lost", "they're confused", "lost bet"], "correct_choice_idx": 0, "direct_answers": ["adventure riding", "riding it", "tourism", "riding it", "ride", "adventure", "tourists", "riding", "two", "tourists"], "difficult_direct_answer": false, "rationales": ["They're riding it.", "Two people are seated in a chair on top of an elephant. the elephant i sin a wooded area.", "The people are riding in the chair."], "image": "train2014/COCO_train2014_000000188987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148587, "question_id": "ZSbhts8RQMvvwU3cEDteDV", "question": "What sport is the woman playing?", "choices": ["volleyball", "basketball", "badminton", "beach volleyball"], "correct_choice_idx": 3, "direct_answers": ["beach volleyball", "volleyball", "volleyball", "volleyball", "volleyball", "beach volleyball", "volleyball", "beach volleyball", "beach volleyball", "volleyball"], "difficult_direct_answer": false, "rationales": ["She is playing volleyball.", "The woman is engaged in a round of volleyball in the sand.", "The sport is beach volleyball."], "image": "train2014/COCO_train2014_000000148587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399465, "question_id": "ZSdZx4mYf8LvLQtYAiUYEJ", "question": "These people are most likely where?", "choices": ["college campus", "mansion", "lake cabin", "marshland"], "correct_choice_idx": 0, "direct_answers": ["city", "city", "park", "park", "park", "park", "college campus", "park", "park", "park"], "difficult_direct_answer": false, "rationales": ["There are seen carrying books from class.", "The other places would not look like this.", "The people are on a campus."], "image": "train2014/COCO_train2014_000000399465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370701, "question_id": "ZT74563zWWf3fALAbu6S6s", "question": "What sort of life does this man advocate for?", "choices": ["merchants", "canines", "felines", "people"], "correct_choice_idx": 1, "direct_answers": ["animals", "free range", "animal life", "dogs", "happy", "puppies", "pet adoption", "puppy", "canines", "pets"], "difficult_direct_answer": true, "rationales": ["The man wants to prevent dogs from being abused.", "The sign refers to young dogs being bred by irresponsible owners for quick money.", "They're canines."], "image": "val2014/COCO_val2014_000000370701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40446, "question_id": "ZTGCJVzmsYVKVkU7kwrgNb", "question": "What is she doing?", "choices": ["resting", "praying", "hiding", "eating"], "correct_choice_idx": 1, "direct_answers": ["praying", "sitting", "praying", "praying", "praying", "praying", "praying", "praying", "praying", "praying"], "difficult_direct_answer": false, "rationales": ["She's praying.", "If someone is alone in a church, it is most likely because they are under some kind of terrible stress and feel the need to pray and be uplifted. a quiet communing with god can be a great source of comfort in troubled times.", "The woman is in church."], "image": "val2014/COCO_val2014_000000040446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227495, "question_id": "ZTHk3amXAHDpZEcTpHxMJU", "question": "What is the man wearing?", "choices": ["glasses", "clown nose", "headband", "gas mask"], "correct_choice_idx": 2, "direct_answers": ["sports attire", "headband", "tennis shoes", "shorts", "sweatband", "headband", "hat", "headband", "tennis wear", "shorts"], "difficult_direct_answer": false, "rationales": ["The man is wearing one around his head.", "He has a piece of material around his head to catch the sweat.", "There is a white object around his forehead and a shade for a hatpiece behind his hair."], "image": "train2014/COCO_train2014_000000227495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204024, "question_id": "ZTQfnMnRjKpY2yZFHhDAcb", "question": "What is the person skiing on?", "choices": ["ice", "water", "snow", "sand"], "correct_choice_idx": 1, "direct_answers": ["water", "ice", "water", "water", "water", "ice", "frozen water", "water", "snow", "water"], "difficult_direct_answer": false, "rationales": ["There is splashing coming from behind the person.", "He is skiing in the lake.", "A pool of water is in the middle of the snow."], "image": "train2014/COCO_train2014_000000204024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349647, "question_id": "ZTWL4ZeUG2rnRpeeghH2H6", "question": "What will the woman drink with her pizza?", "choices": ["milk", "coke", "beer", "wine"], "correct_choice_idx": 2, "direct_answers": ["beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["A woman is eating pizza and bottles of beer are on the table in front of her.", "The woman will drink the beers on the counter with her pizza.", "Beer is shown by the pizza."], "image": "val2014/COCO_val2014_000000349647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428595, "question_id": "ZThNfCPnxkafBevWs7qzwF", "question": "What do the women here find most interesting?", "choices": ["walkway", "goose", "child", "duck"], "correct_choice_idx": 2, "direct_answers": ["baby", "baby", "baby", "child", "baby", "baby", "baby", "baby", "baby", "baby"], "difficult_direct_answer": false, "rationales": ["They are both looking at the baby.", "Both women sitting on the bench in this image are looking at the bundled up baby in the green pants woman's arms.", "They are looking at the baby"], "image": "train2014/COCO_train2014_000000428595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23047, "question_id": "ZTjZ7CQGPWq9QCDKJaMtZe", "question": "What energy powers the blender?", "choices": ["solar", "electricity", "battery", "manual"], "correct_choice_idx": 1, "direct_answers": ["electric", "electricity", "electric", "electricity", "electric", "electricity", "electricity", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["Electricity keeps the blender going at a high speed.", "The white cord coming from the back of the blender is plugged into an electrical socket. the first electric blender was invented in 1922.", "It has a cord to plug in"], "image": "train2014/COCO_train2014_000000023047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484369, "question_id": "ZTn6b9St2bvSxk5GfHC53b", "question": "What is used for the bear's eye?", "choices": ["lid", "rock", "coin", "button"], "correct_choice_idx": 3, "direct_answers": ["button", "buttons", "button", "button", "buttons", "button", "button", "button", "button", "button"], "difficult_direct_answer": false, "rationales": ["Buttons are over the eyes of the bear.", "A button is used for the eye.", "The eyes are made of buttons."], "image": "train2014/COCO_train2014_000000484369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567220, "question_id": "ZU4mgPuW78sUTTWg2u8fCA", "question": "Which towel has been used recently for a shower?", "choices": ["black", "green", "blue", "grey"], "correct_choice_idx": 3, "direct_answers": ["white", "blue", "white", "white", "grey", "white", "white", "white", "white", "blue"], "difficult_direct_answer": false, "rationales": ["The light colored towel is messy on the rack.", "It's the only towel that looks like it has been recently unfolded and wrinkled while used.", "The one that looks like it was pulled off the rack then thrown back on."], "image": "val2014/COCO_val2014_000000567220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535896, "question_id": "ZU9kLpgT8hzd3vDjo5gCe8", "question": "Why has he stopped?", "choices": ["rest", "clean up", "enjoy scenery", "eat lunch"], "correct_choice_idx": 2, "direct_answers": ["admiring view", "probably tired", "enjoying view", "tired", "assessing path", "avoid falling", "resting", "enjoying moment", "resting", "enjoy scenery"], "difficult_direct_answer": true, "rationales": ["There are interesting rock formations and snow drifts below him.", "He's enjoying the sights.", "The man wants to take in the snow's beauty."], "image": "train2014/COCO_train2014_000000535896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138007, "question_id": "ZUeCHZVHHTUtqWLGcTXQhu", "question": "Where is the cat staring to?", "choices": ["ducks", "down", "right", "up"], "correct_choice_idx": 0, "direct_answers": ["geese", "geese", "forest", "ducks", "forest", "geese", "ducks", "forest", "birds", "watching duck"], "difficult_direct_answer": false, "rationales": ["The cat is visible in the background and based on its eye line, and known interests, answer a is correct.", "The cat is looking at the ducks in the grass.", "The cat looks at the ducks."], "image": "train2014/COCO_train2014_000000138007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374530, "question_id": "ZUk85SLi8JUFPdsVpSmgGi", "question": "What is the name of this palace?", "choices": ["windsor", "balmoral", "parliament", "westminster"], "correct_choice_idx": 3, "direct_answers": ["buckingham", "london", "no idea", "castle", "westminster", "big ben", "big ben", "buckingham palace", "london", "london"], "difficult_direct_answer": false, "rationales": ["There is a clock on the north tower of the palace.", "The big ben is located in westminister.", "This is westminster because it is located on the thames pictured here"], "image": "val2014/COCO_val2014_000000374530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320350, "question_id": "ZUoez46ryvHV8RGXASp6He", "question": "When the people ski here what will the dogs do?", "choices": ["sleep", "eat", "follow them", "go home"], "correct_choice_idx": 2, "direct_answers": ["chsse", "chase", "follow them", "run", "walk alongside", "follow them", "chase", "follow", "follow them", "run along"], "difficult_direct_answer": false, "rationales": ["Walk behind skiis as going down hill.", "These animals can ski so most likely they don't go with there owners on the ski jump.", "Two dogs are with their owners at the top of a snowy hill. since the owners are both wearing skis, it can be safely assumed that their dogs will joyfully chase after them as they ski down the hill."], "image": "val2014/COCO_val2014_000000320350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452836, "question_id": "ZUvMFbHVjHe9df3AkCNhyZ", "question": "Why is she holding the umbrella?", "choices": ["is frightened", "to impress", "defends herself", "is ready"], "correct_choice_idx": 3, "direct_answers": ["shade", "sun protection", "is ready", "shade", "sun protection", "for shade", "might rain", "sun prep", "sun protection", "rain"], "difficult_direct_answer": false, "rationales": ["A famous royal carries an umbrella in a garden.", "She is holding the umbrella because she is ready for the rain.", "She's ready."], "image": "val2014/COCO_val2014_000000452836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54738, "question_id": "ZUyJMwBZY5BsW7kHNvwFzW", "question": "What electric device are the two kids intently focused upon?", "choices": ["television", "cable box", "dvd player", "radio"], "correct_choice_idx": 0, "direct_answers": ["tv set", "television", "tv", "playstation", "television", "television", "television", "nintendo", "wii", "television"], "difficult_direct_answer": false, "rationales": ["The kids are holding a controller so they are looking at a tv.", "The two kids are focused on a television screen for playing video games.", "The kids have video game controllers. video games are not played on radios, dvd players, or cable boxes."], "image": "train2014/COCO_train2014_000000054738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387685, "question_id": "ZV6XPXYxGZFdG5GJXVikzU", "question": "What activity goes on in the chair on the platform?", "choices": ["shoe shining", "beard shaving", "haircuts", "scalp massage"], "correct_choice_idx": 0, "direct_answers": ["shoeshine", "shoe shining", "shoe shining", "shoe shining", "shoe polishing", "shoe shining", "shoe shining", "cover", "shoe shining", "shoe shining"], "difficult_direct_answer": false, "rationales": ["The activity is shoe shining.", "A shoe shining kit is near the chair.", "The chair is high up like \"on a throne,\" shoes are to the side, and two metal bars where you place your feet so the person can shine the footwear."], "image": "train2014/COCO_train2014_000000387685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565941, "question_id": "ZV9Trb7vzR9FSEQiNYjFui", "question": "About how much liquid is in the bottle with the green label?", "choices": ["nine tenths", "third", "full", "none"], "correct_choice_idx": 1, "direct_answers": ["third", "quarter", "liter", "quarter bottle", "half full", "half", "liter", "1/4th", "one third", "third"], "difficult_direct_answer": false, "rationales": ["A third of the tea bottle is filled with fluid.", "The liquid is a third full.", "The bottle is not full and it is not empty. it has a lot less than 9/10."], "image": "val2014/COCO_val2014_000000565941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135664, "question_id": "ZVE9zXGx6CY7Tq73BN2EnY", "question": "Where are the beds that the boys are lying on?", "choices": ["living room", "bedroom", "daycare", "furniture store"], "correct_choice_idx": 3, "direct_answers": ["trundle", "nap beds", "joined together", "store", "kids beds", "ikea", "furniture store", "store", "store", "school"], "difficult_direct_answer": false, "rationales": ["The place seems to be a place that takes care of kids.", "Looks like they are in a store that sells bedding.", "They're in a furniture store."], "image": "train2014/COCO_train2014_000000135664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409105, "question_id": "ZVNxgVZr7csYwyjR7cpMMQ", "question": "What type of transportation is this?", "choices": ["road", "air", "rail", "water"], "correct_choice_idx": 2, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "rail", "train", "train"], "difficult_direct_answer": false, "rationales": ["This is a train. the rail is on the ground.", "It's a train.", "It's a land vehicle that travels on tracks."], "image": "train2014/COCO_train2014_000000409105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196131, "question_id": "ZVY3JWg5SqjJvRGdkyWtjd", "question": "Persons traveling on this street in this direction may turn which way now?", "choices": ["none", "right", "u", "left"], "correct_choice_idx": 0, "direct_answers": ["left", "right", "no turns", "right", "right", "right", "left", "left", "none", "right"], "difficult_direct_answer": false, "rationales": ["This is a one way street and the white sign clearly states turning isn't an option.", "There is a traffic jam.", "People can't turn since the traffic is so bad."], "image": "train2014/COCO_train2014_000000196131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355424, "question_id": "ZWEyDxWGqWEpcX4mmAiPnE", "question": "The dough prepared for pizza by which flour?", "choices": ["pulses", "wheat", "corn", "maize"], "correct_choice_idx": 1, "direct_answers": ["white", "white flour", "whole wheat", "wheat", "semolina", "wheat flour", "white flour", "all purpose", "semolina", "wheat"], "difficult_direct_answer": false, "rationales": ["Crust always has wheat flour in it.", "The dough is made of wheat.", "Most pizza doughs are prepared with wheat flour."], "image": "val2014/COCO_val2014_000000355424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9789, "question_id": "ZWSpyhj48GZjmHuoyXc5T4", "question": "How can the candles be extinguished?", "choices": ["baby's hand", "water", "blowing", "photo flash"], "correct_choice_idx": 2, "direct_answers": ["blowing", "blowing", "blowing it", "snuff it", "breath", "blown out", "blown out", "three", "blown", "blowing"], "difficult_direct_answer": false, "rationales": ["These are birthday candles, and usually people sing happy birthday and then the candles are blown out.", "Expelled breath is used to extinguish candles.", "The candles are small. usually people blow out birthday candles."], "image": "train2014/COCO_train2014_000000009789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386525, "question_id": "ZWTQzFuyLGzBaxuaz7Xkuo", "question": "What word is on the side of the truck?", "choices": ["happy", "omnipotent", "freedom", "bless"], "correct_choice_idx": 2, "direct_answers": ["freedom", "freedom", "freedom", "freedom", "freedom", "freedom", "freedom", "freedom", "freedom", "freedom"], "difficult_direct_answer": false, "rationales": ["There is a slogan on the truck.", "The side of the truck says \"freedom isn't free\".", "The truck has a military theme. the word appears near the tailgate."], "image": "val2014/COCO_val2014_000000386525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202764, "question_id": "ZWUfzzJUvx7Wff6mFb8EYG", "question": "What type dung is most visible here?", "choices": ["goat", "donkey", "ibis", "elephant"], "correct_choice_idx": 3, "direct_answers": ["elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["Elephants are in the area, so their poop is as well.", "Elephants are walking.", "There is a herd of elephants near the water"], "image": "train2014/COCO_train2014_000000202764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25232, "question_id": "ZX5n93bkLaXbvV8ByBDXer", "question": "How did these grooves get set in snow?", "choices": ["natural phenomenon", "freak storm", "magic", "snow groomer"], "correct_choice_idx": 3, "direct_answers": ["snow groomer", "wind", "grooming machine", "melting", "snow machines", "for traction", "special machine", "snow groomer", "rake", "tracks"], "difficult_direct_answer": true, "rationales": ["They reach there by using snow groomer.", "The grooves are from a grooming machine.", "The people have sticks in their hand. they have dragged them across the snow."], "image": "train2014/COCO_train2014_000000025232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484327, "question_id": "ZXAynSfmZoYTBZ4e2SY5xS", "question": "What is the sign discouraging during certain hours?", "choices": ["parking", "loitering", "eating", "turns"], "correct_choice_idx": 3, "direct_answers": ["left turn", "turning", "left turns", "turns", "turns", "no parking", "left turn", "left turns", "turning left", "left turns"], "difficult_direct_answer": false, "rationales": ["The arrow is showing no left turns during certain hours.", "The sign prohibits left turns.", "No turns are allowed."], "image": "train2014/COCO_train2014_000000484327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181929, "question_id": "ZXQJSVzqWVPFTw6VRCr2kL", "question": "Which person wore the apparatus the girl has on her face?", "choices": ["mahatma gandhi", "ernest hemingway", "henry viii", "maya angelou"], "correct_choice_idx": 0, "direct_answers": ["mahatma gandhi", "beard man", "nothing", "left man", "man", "girl", "near sighted", "man", "glasses", "man"], "difficult_direct_answer": false, "rationales": ["Gandhi led a smug lifestyle and existence.", "Mahatma gandhi had glasses.", "Gandhi is a famous person known for wearing glasses, same as this woman is wearing."], "image": "train2014/COCO_train2014_000000181929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305055, "question_id": "ZXYiqwFp2rY7S9FgLiPeZR", "question": "What kind of snow SLED the man have in the image?", "choices": ["seat", "stick", "board", "basket"], "correct_choice_idx": 2, "direct_answers": ["unknown", "snowboard", "snowboard", "snowboard", "snowboard", "board", "snowboard", "snow board", "board", "snowboard"], "difficult_direct_answer": false, "rationales": ["It is flat like a ski", "The man has a snow board next to him on the ski slope.", "The man has a snowboard in his hand."], "image": "train2014/COCO_train2014_000000305055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283290, "question_id": "ZXuvKgXyj9pjxL9Q3CzmEg", "question": "What power allows the man to to airborne?", "choices": ["water pump", "mental", "solar", "magic"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "skis", "water pressure", "hydro", "jets", "water", "jet", "water pump", "jet"], "difficult_direct_answer": false, "rationales": ["A man is in the air with a contraption that is spitting out water below him.", "The man is almost flying by the wet stuff in the lake.", "There is water being pushed out of the jets near his shoulders."], "image": "val2014/COCO_val2014_000000283290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283138, "question_id": "ZXxrWzYWaT8jhVDLVetufk", "question": "Why does the woman have the dog on a leash?", "choices": ["to walk", "to punish", "to guard", "to bathe"], "correct_choice_idx": 0, "direct_answers": ["control", "maintain control", "following law", "getting loose", "walking", "to walk", "prevent running", "supposed to", "walking it", "no running"], "difficult_direct_answer": true, "rationales": ["The woman is taking the dog out for a walk.", "The woman has the dog on a leash and is walking down the street.", "The other options don't appear in this image or setting."], "image": "train2014/COCO_train2014_000000283138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218734, "question_id": "ZXyzHht2L8VizmJYE3a4PT", "question": "What pastry could be made with these?", "choices": ["chocolate balls", "strawberry tart", "pumpkin pie", "banana bread"], "correct_choice_idx": 3, "direct_answers": ["banana bread", "cake", "banana purses", "banana bread", "banana bread", "banana pie", "bread", "banana bread", "banana bread", "banana bread"], "difficult_direct_answer": false, "rationales": ["There are bunches of bananas.", "The fruit in the back are bananas and could be used in a bread.", "The other options don't normally contain bananas. someone could also make bananas foster."], "image": "train2014/COCO_train2014_000000218734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409884, "question_id": "ZY2G58wVYgGAtuZR24botY", "question": "How long ago did the gardener most likely harvest the produce?", "choices": ["1 day", "20 days", "7 days", "45 days"], "correct_choice_idx": 0, "direct_answers": ["this week", "one day", "one day", "yesterday", "recently", "recently", "today", "until ripened", "recent", "1 day"], "difficult_direct_answer": false, "rationales": ["The produce is very fresh.", "The vegetables are fresh and some of them still have dirt on them.", "The vegetables look fresh based on their coloring and no visible wilting or rotting. for them to maintain this look they would likely have been picked recently."], "image": "train2014/COCO_train2014_000000409884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433574, "question_id": "ZYDZSvWffprXqBTcbJiQMx", "question": "How many varieties of DVD discs are used as storage device?", "choices": ["five", "seven", "six", "four"], "correct_choice_idx": 1, "direct_answers": ["many", "unknown", "unknown", "many", "seven", "frame", "seven", "two", "what....", "fifteen"], "difficult_direct_answer": false, "rationales": ["There are several being used as a storage device.", "There are 7 types.", "There are seven types of dvd discs that can function as storage."], "image": "val2014/COCO_val2014_000000433574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432150, "question_id": "ZYKowqeeEeJLtL9PMbdNbK", "question": "Elephant like craft has done with the use of which vegetable?", "choices": ["bottle gourd", "snake gourd", "spring onion", "ridge gourd"], "correct_choice_idx": 3, "direct_answers": ["corn", "straw", "straw", "corn cobs", "corn-on-the cob", "ridge gourd", "gord", "no idea", "squash", "wood"], "difficult_direct_answer": true, "rationales": ["The elephant is like a ridge gourd.", "The gourds look like trunks.", "The end of the gourd curls to look like an elephants trunk."], "image": "val2014/COCO_val2014_000000432150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26164, "question_id": "ZYPTZ3XYrogaSa89smxMqc", "question": "What is the painting on the wall?", "choices": ["traffic signs", "guide stickers", "advertisement", "art work"], "correct_choice_idx": 2, "direct_answers": ["advertising", "advertisement", "advertisement", "advertisement", "advertisement", "advertisement", "s ow", "coke", "advertisements", "advertisement"], "difficult_direct_answer": false, "rationales": ["It's a sign for a pop company.", "There are coca cola advertisements on the wall.", "The painting is an ad."], "image": "train2014/COCO_train2014_000000026164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563809, "question_id": "ZYVqoPGJ5ZWF5YzQL653V6", "question": "What type of Danish is in the middle of the other two?", "choices": ["cheese", "peach", "strawberry", "blueberry"], "correct_choice_idx": 3, "direct_answers": ["chocolate", "chocolate", "chocolate", "blueberry", "chocolate", "croissant", "croissant", "croissant", "chocolate croissant", "chocolate"], "difficult_direct_answer": false, "rationales": ["There is a danish with dark blue jelly in between the other two.", "The danish is blueberry.", "Blueberries are in the danish."], "image": "train2014/COCO_train2014_000000563809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580212, "question_id": "ZYgx6i4nmSGyvfxyizP5xY", "question": "The person on the tv is of what ethnicity?", "choices": ["white", "asian", "black", "native american"], "correct_choice_idx": 2, "direct_answers": ["black", "asian", "african", "black", "black", "black", "black", "black", "korean", "asian"], "difficult_direct_answer": false, "rationales": ["This man's skin tone suggests his ethnicity.", "He has the skin color and eyes of an asian.", "The man has dark skin."], "image": "train2014/COCO_train2014_000000580212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5256, "question_id": "ZYpdHUNUsbYfKYoLy6cADm", "question": "What type of text sign is shown?", "choices": ["brand", "directional", "warning", "regulatory"], "correct_choice_idx": 1, "direct_answers": ["cities", "street sign", "directional", "directions", "directions", "caps", "road sign", "block", "city", "road sign"], "difficult_direct_answer": false, "rationales": ["The sign is directional.", "The signs have arrows for directions.", "The sign is pointing out directions with the arrow."], "image": "train2014/COCO_train2014_000000005256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513616, "question_id": "ZYq5d7n2Z3CeVNEhyGk2a6", "question": "What session of the day is it shown here?", "choices": ["afternoon", "morning", "dawn", "evening"], "correct_choice_idx": 1, "direct_answers": ["midday", "morning", "afternoon", "morning", "afternoon", "morning", "morning", "morning", "late morning", "midday"], "difficult_direct_answer": false, "rationales": ["The sun is shining and a clock can be seen above a fountain.", "The time on the clock says it's the morning.", "The clock shows the time is 10:09. it is daylight outside."], "image": "train2014/COCO_train2014_000000513616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139042, "question_id": "ZZ444By8SbAoxjzNAvTznN", "question": "Who is the same gender as this person?", "choices": ["sandy koufax", "michael learned", "leslie nielsen", "dana andrews"], "correct_choice_idx": 1, "direct_answers": ["serena williams", "britney spears", "ball girl", "female", "woman", "nancy pelosi", "woman", "michael learned", "referee", "marie antoinette"], "difficult_direct_answer": true, "rationales": ["Any woman would fit the bill, starting with leslie nielsen.", "Though the name is masculine answer \"a\" is a female just like the tennis player.", "This person is a woman. leslie nielsen, dana andrews, and sandy koufax are men."], "image": "train2014/COCO_train2014_000000139042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449114, "question_id": "ZZQ8DEAFDCcmM5NJPx45CX", "question": "What move has the player just made?", "choices": ["lob", "backhand", "forehand", "serve"], "correct_choice_idx": 1, "direct_answers": ["serve", "backhand", "serve", "backhand", "backhand", "backhand", "backhand", "backhand", "backhand", "serve"], "difficult_direct_answer": false, "rationales": ["The back of his hand was facing the ball.", "He's swinging his racket from his center outwards", "This tennis player is right handed so that the fact that his swing is finishing on this same side indicates that it was a backhand."], "image": "val2014/COCO_val2014_000000449114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66593, "question_id": "ZZSWGLYQQJ2rTMWW2zZCiP", "question": "The footwear the woman with the umbrella has on is suitable for what place?", "choices": ["russia", "mongolia", "brazil", "siberia"], "correct_choice_idx": 2, "direct_answers": ["beach", "one dollar", "brazil", "warm areas", "sidewalk", "beach", "beach", "beach", "beach", "desert"], "difficult_direct_answer": false, "rationales": ["She is in a warm place.", "The footwear is for brazil.", "The woman is wearing open shoes that would be suitable for warm places where her feet would not get too cold and be uncomfortable. answer a on the list is a warm place and the other answers are not."], "image": "train2014/COCO_train2014_000000066593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82252, "question_id": "ZZavngBJJXhNnPNbS2pNnd", "question": "What pattern is the fir on the animal's head?", "choices": ["blotched", "striped", "scalloped", "spotted"], "correct_choice_idx": 1, "direct_answers": ["striped", "stripes", "mohawk", "straight lines", "hair", "striped", "stripes", "stripe", "stripes", "stripes"], "difficult_direct_answer": false, "rationales": ["Zebras are striped and are black and white.", "There are black and white stripes on the animal.", "This is a zebra"], "image": "val2014/COCO_val2014_000000082252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67116, "question_id": "ZZcDx5xAjciwNTwyQLoBJn", "question": "What are bricks mostly made of?", "choices": ["wood", "rock", "clay", "silt"], "correct_choice_idx": 2, "direct_answers": ["sand", "sand", "adobe", "stone sand", "concrete", "cement", "clay", "clay", "white paint", "clay"], "difficult_direct_answer": false, "rationales": ["The material of the wall looks like stone shaped into squares.", "Bricks are made from shaped clay.", "The bricks are made of clay."], "image": "train2014/COCO_train2014_000000067116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62246, "question_id": "ZaVvo95ovDoQ8AVdg7RTdV", "question": "Sundancer is which direction?", "choices": ["lower right", "down", "top right", "left"], "correct_choice_idx": 2, "direct_answers": ["right", "right", "right up", "photo right", "top right", "north/east", "northeast", "northeast", "right", "north"], "difficult_direct_answer": false, "rationales": ["There is an arrow on the sign showing the way.", "The trail can be found in blue sign in middle portion.", "Directional signs are often seen on ski slopes. from this location, going to sundancer is to the top right."], "image": "train2014/COCO_train2014_000000062246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569143, "question_id": "ZahkDdKJA7BPHwFCu3rPa8", "question": "What describes the situation most accurately about the closest plate?", "choices": ["broken", "half full", "full", "empty"], "correct_choice_idx": 1, "direct_answers": ["salad", "half full", "salad", "dinner time", "half full", "half full", "half full", "no carbs", "dinner", "small food"], "difficult_direct_answer": false, "rationales": ["The plate only has food on half of it.", "There is no food on one side of it", "The plate only has some food on it."], "image": "train2014/COCO_train2014_000000569143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155862, "question_id": "ZajTmAoG4BVU5AQGM8r8E5", "question": "What move is this kid making?", "choices": ["serve", "lob", "forehand", "backhand"], "correct_choice_idx": 2, "direct_answers": ["swinging", "swing", "forehand swing", "volley", "forehand", "tennis", "hitting ball", "hitting", "swing", "forward"], "difficult_direct_answer": true, "rationales": ["The move is a forehand.", "The boy is swinging the racket forward.", "The player's wrist is facing the ball. when performing a forehand shot in tennis, this is the orientation the player would be in."], "image": "train2014/COCO_train2014_000000155862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392991, "question_id": "Zat6ADqCj6bJYEP8mVx8ha", "question": "Another is being added to the cake?", "choices": ["fork", "spoon", "frosting", "layer"], "correct_choice_idx": 3, "direct_answers": ["topping", "topping", "coating", "layer", "layer", "layer", "layer", "cream", "layer", "layer"], "difficult_direct_answer": false, "rationales": ["Chocolate is being poured on top of the last layer of cake. chocolate is used as a layer on cake.", "The person is pouring something onto the cake.", "A layer is added."], "image": "val2014/COCO_val2014_000000392991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120347, "question_id": "ZbJeNoTzmsz2DSD3xRuJhh", "question": "What do skis leave behind in the snow after every movement?", "choices": ["oil", "tracks", "droppings", "steps"], "correct_choice_idx": 1, "direct_answers": ["tracks", "imprints", "tracks", "tracks", "tracks", "tracks", "tracks", "tracks", "imprints", "tracks"], "difficult_direct_answer": false, "rationales": ["Skis are long straight objects and when heavy things move across snow it leaves and indentation.", "The people will leave tracks in the snow.", "Skis leave tracks behind them in the snow."], "image": "val2014/COCO_val2014_000000120347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273035, "question_id": "ZbL5owdEexB9zCDKwjQuzi", "question": "What is the green/yellow item on the right?", "choices": ["tent", "tarp", "canopy", "umbrella"], "correct_choice_idx": 3, "direct_answers": ["umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["It opens up to provide shade'", "The green and yellow item is an umbrella.", "The items has the shape and design of a typical umbrella and is next to seats where one would want to use that item."], "image": "train2014/COCO_train2014_000000273035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500973, "question_id": "ZbP9sbWiichzX4zyNzPsid", "question": "What except for a flag are the highest emanations coming from here?", "choices": ["homes", "cranes", "boats", "bridges"], "correct_choice_idx": 1, "direct_answers": ["cranes", "cranes", "cranes", "cranes", "cranes", "cranes", "asians", "cranes", "asians", "cranes"], "difficult_direct_answer": false, "rationales": ["There are several tall cranes in the background that are higher than everything else.", "Large machinery extends into the sky.", "A marina is filled with boats and cranes."], "image": "train2014/COCO_train2014_000000500973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464340, "question_id": "ZbQSxHoV9VbZiWgnwef5QD", "question": "What type of event is this?", "choices": ["party", "meeting", "funeral", "wedding"], "correct_choice_idx": 0, "direct_answers": ["celebration", "celebration", "grand opening", "birthday", "party", "grand opening", "celebration", "grand opening", "grand opening", "celebration"], "difficult_direct_answer": false, "rationales": ["The event is a party.", "Cakes are eaten at parties.", "A woman is cutting a professionally made cake on a table with other food items."], "image": "train2014/COCO_train2014_000000464340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125275, "question_id": "ZbdJFW86rAkZxDuu859fjP", "question": "The fruit shown contains a high level of what?", "choices": ["magnesium", "potassium", "vitamin", "vitamin b"], "correct_choice_idx": 1, "direct_answers": ["potassium", "potassium", "potassium", "potassium", "potassium", "potassium", "potassium", "potassium", "potassium", "potassium"], "difficult_direct_answer": false, "rationales": ["The fruit has potassium.", "Bananas have potassium.", "Bananas are known to be high in potassium."], "image": "train2014/COCO_train2014_000000125275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292058, "question_id": "ZbfwKhQJ6xrToHrUJn3V2T", "question": "Who created the works seen here?", "choices": ["city planning", "artist", "government offices", "traffic department"], "correct_choice_idx": 1, "direct_answers": ["artist", "artists", "graffiti artist", "artists", "artist", "protesters", "city workers", "artist", "artist", "artist"], "difficult_direct_answer": false, "rationales": ["It is an artist because so many stop signs would not be on the road this close together", "The traffic department created the stop signs seen here.", "These look like outdoor sculputures. roadside ones are very popular in modern art."], "image": "train2014/COCO_train2014_000000292058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142946, "question_id": "ZbhD7x2cH6EhPUME7Rnbae", "question": "What is the original color of the baked beans in the dish?", "choices": ["brown", "gray", "white", "black"], "correct_choice_idx": 2, "direct_answers": ["white", "brown", "brown", "brown", "brown", "brown", "white", "red", "tan", "white"], "difficult_direct_answer": false, "rationales": ["The original color cannot be gleaned from the image, but the beans are a brownish color now and likely started as a slightly lighter shade.", "The beans are covered with sauce. they are light in color.", "The beans were originally white."], "image": "train2014/COCO_train2014_000000142946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567976, "question_id": "ZbmejMbd8LRuFxemarvQq2", "question": "What sort of event is going on in this area?", "choices": ["fire", "field days", "evacuation", "air show"], "correct_choice_idx": 1, "direct_answers": ["field days", "field day", "parade", "fire", "festival", "field days", "parade", "block party", "field days", "parade"], "difficult_direct_answer": false, "rationales": ["There is a sign on the windshield of the fire truck that has the words field days on it and there does not seem to be any urgency in the people as there would be if there was a fire.", "There is a sign on the fire truck that says what is happening.", "Kids are around the fire truck."], "image": "train2014/COCO_train2014_000000567976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576650, "question_id": "ZbnHVH7LV7w7cDy2aNHAuL", "question": "What vehicles are most of the people riding on?", "choices": ["car", "train", "bus", "bicycle"], "correct_choice_idx": 3, "direct_answers": ["bikes", "bicycles", "bicycle", "bicycles", "bicycles", "bicycle", "bikes", "bicycles", "bicycles", "cycle"], "difficult_direct_answer": false, "rationales": ["There are several bicycles in the scene with only a few cars.", "You can tell by the design and lack of motors as to what they are riding.", "The vehicles are bikes."], "image": "val2014/COCO_val2014_000000576650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104537, "question_id": "ZbvZvLgCYQ4rq3FwjuYSii", "question": "Why is the man in the background standing there?", "choices": ["payment", "sleeping", "eating", "watching"], "correct_choice_idx": 0, "direct_answers": ["paying parking", "payment", "telephone", "paying meter", "paying", "get newspaper", "relaxing", "using booth", "relaxing", "pay parking"], "difficult_direct_answer": true, "rationales": ["He looks to be paying for parking at a parking meter.", "The man in the background is paying at the parking toll booth.", "The man is standing in order to pay for his car parking"], "image": "train2014/COCO_train2014_000000104537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216449, "question_id": "Zbz7ouwzVhsVHiVFCkjExk", "question": "What large substance will the youngest child be ingesting?", "choices": ["burrito", "tortilla", "taco", "pizza"], "correct_choice_idx": 0, "direct_answers": ["wrap", "burrito", "water", "wrap", "food", "burrito", "burrito", "burrito", "burrito", "crepe"], "difficult_direct_answer": false, "rationales": ["She is holding a piece of one.", "It's on the plate in front of the child.", "The child has a burrito on their plate."], "image": "val2014/COCO_val2014_000000216449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442445, "question_id": "ZcH7pc23p5amj68KruNSEa", "question": "What are the animals closest to?", "choices": ["sun", "cat", "fence", "house"], "correct_choice_idx": 2, "direct_answers": ["donkeys", "fence", "fence", "horse", "horse", "fence", "fence", "fence", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["The barrier is holding in the animals. it is made out of wood.", "The fence is near.", "The horses are all next to the fencing."], "image": "train2014/COCO_train2014_000000442445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92646, "question_id": "ZcgsAiZ5hoB82oDSAa7Wsp", "question": "What form of vintage media do the people in the living room enjoy?", "choices": ["music", "movies", "paintings", "books"], "correct_choice_idx": 1, "direct_answers": ["movies", "posters", "record", "movies", "vertigo", "hifi", "wii", "movies", "radio", "two"], "difficult_direct_answer": false, "rationales": ["The people in the living room enjoy watching movies.", "A movie poster is on the wall in a living room. people like to collect keepsakes and trinkets from things they are interested in.", "They have a movie poster on the wall"], "image": "train2014/COCO_train2014_000000092646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87199, "question_id": "Zcm8D4Mr2DV4s7zn3vT4vm", "question": "What does the man in brown listen to?", "choices": ["jousting", "dinner bell", "telephone", "romans"], "correct_choice_idx": 2, "direct_answers": ["phone", "cellphone", "phone", "cell phone", "cellphone", "mobile phone", "talking", "skyrim soundtrack", "telephone", "his cellphone"], "difficult_direct_answer": false, "rationales": ["He has a cell phone.", "He has a cellphone at his ear.", "He is holding it to his ear to hear."], "image": "val2014/COCO_val2014_000000087199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95744, "question_id": "Zd668bMGq7impCpp63Dj76", "question": "Encouraging what American Ice cream treat is an obvious choice for these vendors?", "choices": ["banana split", "chocolate malt", "chocolate sundae", "brownie cake"], "correct_choice_idx": 0, "direct_answers": ["vanilla", "banana split", "sundae", "banana splits", "frozen bananas", "banana flavor", "banana split", "banana split", "banana split", "banana"], "difficult_direct_answer": false, "rationales": ["Banana splits can be made.", "Banana splits have bananas.", "The fruits in the picture can be used with ice cream to make a banana split."], "image": "train2014/COCO_train2014_000000095744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126096, "question_id": "ZdDFK6n9aeHGcanhpWvceq", "question": "What Disney character can be seen in the sky?", "choices": ["goofy", "tinker bell", "minnie mouse", "lucy"], "correct_choice_idx": 1, "direct_answers": ["flying girl", "tinker bell", "tinkerbell", "tinkerbell", "flying girl", "tinkerbell", "flying girl", "tinkerbell", "tinkerbell", "tinkerbell"], "difficult_direct_answer": false, "rationales": ["Her wings are her hallmark feature.", "The character in the sky has wings and blonde hair.", "The kite looks like tinkerbell, the fairy in the peter pan movies."], "image": "train2014/COCO_train2014_000000126096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283261, "question_id": "ZdG7Azy5eA6k3gwuferBBL", "question": "Which object is most likely getting shot?", "choices": ["heart", "red pillow", "black bear", "sheep"], "correct_choice_idx": 3, "direct_answers": ["stuffed lamb", "stuffed sheep", "sheep", "sheep", "sheep", "cat", "stuffed sheep", "white", "white sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["You can tell by the position of the gun as to who maybe getting shot.", "The bear is holding a gun pointed towards the sheep.", "Guns fire bullets and the end of the gun that bullets come from is only pointed toward the sheep."], "image": "val2014/COCO_val2014_000000283261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197762, "question_id": "ZdK6DNKQajiVqVPNgMq5Er", "question": "What food is the green item on the plate?", "choices": ["kale", "pepper", "cucumber", "olives"], "correct_choice_idx": 2, "direct_answers": ["pickles", "pickle", "pickle", "gherkin", "pickle", "pickle", "cucumber", "pickles", "pickle", "capsicum"], "difficult_direct_answer": false, "rationales": ["There is only one green item depicted and they are the size and shape consistent with answer a.", "The green food is the cucumber.", "There are some pickled cucumbers in the center of the plate."], "image": "train2014/COCO_train2014_000000197762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472484, "question_id": "ZdbJ7SmysREcCkR59fNzmd", "question": "Why would someone come to this location?", "choices": ["to eat", "to travel", "to learn", "to exercise"], "correct_choice_idx": 1, "direct_answers": ["to travel", "to travel", "ride train", "tourism", "travel", "travel", "board train", "traveling", "travel", "travel"], "difficult_direct_answer": false, "rationales": ["Trains take people from one place to another.", "People take trains to travel.", "The person wants to travel."], "image": "val2014/COCO_val2014_000000472484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149615, "question_id": "ZdkQJEFVF7rjgXkZU7bbA4", "question": "Which country houses the headquarter of the brand company manufacturing the man's shirt?", "choices": ["united states", "italy", "britain", "france"], "correct_choice_idx": 0, "direct_answers": ["united states", "united states", "usa", "usa", "america", "united states", "usa", "usa", "usa", "usa"], "difficult_direct_answer": false, "rationales": ["This is the country it is based in", "The shirt most likely says \"nike sportswear\" and their headquarters are in the usa.", "Nike is headquartered near beaverton, oregon. oregon is a state in america."], "image": "train2014/COCO_train2014_000000149615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446517, "question_id": "Zdm7TuLRpUm6Zc6QkMiN7v", "question": "What might the bird eat in this setting?", "choices": ["grass", "leaves", "person", "dried flowers"], "correct_choice_idx": 3, "direct_answers": ["flower", "seeds", "nectar", "pollen", "greens", "nectar", "dried flowers", "seeds", "outside", "flowers"], "difficult_direct_answer": false, "rationales": ["The bird is eating from the dried flowers.", "The flower are seen in the picture near the man.", "The bird has flowers."], "image": "val2014/COCO_val2014_000000446517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315195, "question_id": "ZdmG8Dw3ykrNg6yQ3K85QT", "question": "What incident is happening in the scene?", "choices": ["fire", "riot", "running race", "water leakage"], "correct_choice_idx": 2, "direct_answers": ["water spraying", "hydrant explosion", "marathon", "hydrant leak", "marathon", "water exploding", "hydrant spray", "sprinkler firing", "running race", "marathon"], "difficult_direct_answer": false, "rationales": ["The people are following each other down the street. they have sneakers and running gear on.", "There are many people jogging in the street. there are people watching from the sidewalk and there are many discarded water containers on the street.", "People with tags on their shirts are running past on the street."], "image": "val2014/COCO_val2014_000000315195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36417, "question_id": "Zdvj2cyTPrSixg9ZUAUfQC", "question": "What language are most words on the banana written in?", "choices": ["english", "japanese", "russian", "french"], "correct_choice_idx": 3, "direct_answers": ["french", "french", "french", "french", "french", "french", "spanish", "spanish", "french", "spanish"], "difficult_direct_answer": false, "rationales": ["Looks like it's french.", "The words on the banana are in french.", "The banana's words are in french."], "image": "train2014/COCO_train2014_000000036417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112574, "question_id": "Ze5ScA3YkqLiVPUe6uuvAX", "question": "Whose glove will next touch the ball?", "choices": ["catcher", "manager", "batter", "pitcher"], "correct_choice_idx": 0, "direct_answers": ["catcher", "catcher", "catcher", "catcher", "catcher", "catchers mitt", "catcher's glove", "catchers glove", "right", "catcher's"], "difficult_direct_answer": false, "rationales": ["The ball is directly headed to his mitt and the batter is moving back.", "The glove will touch the catcher.", "The ball is headed directly for his mitt."], "image": "train2014/COCO_train2014_000000112574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421321, "question_id": "ZeTX3NMAPCaftPPYPgscCk", "question": "What are the zebras doing?", "choices": ["sleeping", "grazing", "running", "drinking"], "correct_choice_idx": 1, "direct_answers": ["eating", "grazing", "eating", "eating", "eating", "eating", "eating", "eating", "grazing", "grazing"], "difficult_direct_answer": false, "rationales": ["They are eating from the ground.", "The zebras are munching on grass.", "The zebras are snacking."], "image": "train2014/COCO_train2014_000000421321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7319, "question_id": "Zeg3MV2GiyuGsbwkkopGF4", "question": "What is this animal about to do?", "choices": ["bath", "drink water", "wash hands", "wash face"], "correct_choice_idx": 1, "direct_answers": ["drink water", "drink water", "drink water", "drink", "drink water", "drink water", "drink", "drink", "drink water", "drink water"], "difficult_direct_answer": false, "rationales": ["The animal wants a sip.", "The animal is looking at a basin of water and leaning forward with their face first. this is the position they would be in to drink water.", "The animal will drink."], "image": "train2014/COCO_train2014_000000007319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271118, "question_id": "ZeqNSdnKYX7jrahVNELyiX", "question": "What came in all those colored boxes?", "choices": ["blankets", "carpet", "food", "presents"], "correct_choice_idx": 3, "direct_answers": ["christmas presents", "gifts", "presents", "christmas gifts", "christmas gifts", "presents", "presents", "presents", "laptop", "gift"], "difficult_direct_answer": false, "rationales": ["The boxes are wrapped in wrapping paper based on the coloring and patterns. boxes wrapped in wrapping paper usually contain presents.", "The colored boxes are gifts.", "These items are wrapped and presented under a christmas tree."], "image": "train2014/COCO_train2014_000000271118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130032, "question_id": "ZetnyCtNCrmuQj76GetABc", "question": "What kind of road is this one?", "choices": ["intersection", "highway", "expressway", "one way"], "correct_choice_idx": 0, "direct_answers": ["intersection", "forked road", "two way", "winding", "winding", "country", "crossroads", "local", "forked", "local road"], "difficult_direct_answer": true, "rationales": ["A small road has crisscrosses and goes to other roads.", "You can tell by the traffic lights and the roads as to what is pictured here.", "The area is where a lot of directions in the street converge."], "image": "train2014/COCO_train2014_000000130032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66825, "question_id": "ZfA8qctZB8uoxee4haKEPJ", "question": "What is inside of the horse sculpture?", "choices": ["dogs", "fish", "food", "humans"], "correct_choice_idx": 3, "direct_answers": ["two people", "people", "humans", "people", "human legs", "people", "human legs", "human legs", "people", "two people"], "difficult_direct_answer": false, "rationales": ["The sculpture has people walking underneath it.", "The feet and legs of two people can be seen from underneath the sculpture.", "There are people in it."], "image": "train2014/COCO_train2014_000000066825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84938, "question_id": "ZfhefU88TGLUoesUgud8vH", "question": "What does the boy in grey pants want to do with the ball?", "choices": ["dodge it", "kick it", "catch it", "throw it"], "correct_choice_idx": 2, "direct_answers": ["catch it", "catch it", "catch", "catch it", "catch", "catch ball", "catch it", "catch it", "catch it", "catch"], "difficult_direct_answer": false, "rationales": ["The boy wants to catch the ball.", "The boy feels pressure to catch the ball, but with the sun in his face and closed eyes, his effort may not be rewarded here!.", "He's obviously jumping up and using the mitt to perform a."], "image": "train2014/COCO_train2014_000000084938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402802, "question_id": "ZfnpsokQmHsXVRzUK4UDiQ", "question": "Where does porcelain originally come from?", "choices": ["italy", "france", "australia", "china"], "correct_choice_idx": 3, "direct_answers": ["china", "china", "no clue", "china", "china", "ground", "china", "europe", "china", "china"], "difficult_direct_answer": false, "rationales": ["Porcelain slowly evolved in china until about 2,000 years ago, when it was deemed to be a functional and beautiful addition to people's lives. today, of course, porcelain is produced internationally, and in great abundance.", "The porcelain was from china.", "The porcelain dishware in this kitchen scene is often called 'china'. this is also the place such dishware is associated with and originally came from."], "image": "val2014/COCO_val2014_000000402802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563048, "question_id": "ZfwvT2hqh8uo4wUM4zeWYy", "question": "What does the store seen in the window sell?", "choices": ["groceries", "cooking utensils", "cosmetics/fragrances", "auto supplies"], "correct_choice_idx": 2, "direct_answers": ["makeup", "senhora", "kid's toys", "cosmetics/fragrances", "makeup", "jewelry", "fashion", "makeup", "toy bears", "dolls"], "difficult_direct_answer": false, "rationales": ["The store shown sells makeup and different perfumes.", "Sephora sells makeup and perfume.", "Senhora is an industry leader in this market."], "image": "train2014/COCO_train2014_000000563048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538581, "question_id": "ZgQZP39XSV4qQY3PzhnPQn", "question": "Which surfer put the others on this board?", "choices": ["man", "tan dog", "black dog", "woman"], "correct_choice_idx": 0, "direct_answers": ["man", "dog owner", "adult male", "man", "man", "man", "man", "dog owner", "pictured surfer", "man"], "difficult_direct_answer": false, "rationales": ["A guy and two dogs are on a surfboard.", "A man would have boarded his dogs on the surfboard.", "Dogs can't carry a surfboard to the ocean. dogs can't paddle a surfboard out into the ocean."], "image": "train2014/COCO_train2014_000000538581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134721, "question_id": "ZgsMAyNK2fuKzgJyzcaACM", "question": "What flavors the item on the brush?", "choices": ["mint", "charcoal", "cherry", "peaches"], "correct_choice_idx": 0, "direct_answers": ["toothpaste", "toothpaste", "mint", "mint", "spearmint", "mint", "mint", "mint", "mint", "chemicals"], "difficult_direct_answer": false, "rationales": ["Mint flavors the paste.", "Mint is usually the flavor of toothpaste.", "Mint is a common flavor in toothpaste and and none of the other options are not."], "image": "train2014/COCO_train2014_000000134721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382715, "question_id": "Zgt2dDiGGpHxAzK8M9Aqcy", "question": "In which lane does the skateboarder travel here?", "choices": ["bike", "bus", "passing", "sidewalk"], "correct_choice_idx": 1, "direct_answers": ["bus lane", "bike lane", "bike", "bike lane", "bike lane", "right lane", "bus", "right", "bike", "bicycle"], "difficult_direct_answer": false, "rationales": ["The skateboarder is in a lane that says bus, so the lane is for busses.", "There is a b painted in the roadway.", "A guy is skateboarding on the far right lane of a street."], "image": "val2014/COCO_val2014_000000382715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297812, "question_id": "Zh4CsgbrccngzhE4HLmwpo", "question": "What style meal is being prepared here?", "choices": ["chinese", "picnic", "wedding reception", "mexican"], "correct_choice_idx": 1, "direct_answers": ["picnic", "vegan", "picnic", "picnic", "sandwiches", "picnic", "lunch", "picnic", "picnic", "picnic"], "difficult_direct_answer": false, "rationales": ["The meal is a picnic.", "The people have food laid out on a blanket on the ground outside which is an eating style known as answer a.", "There is a blanket on grass. it is a sunny day and there is an array of bread and cheese on the blanket. picnics are meals that are eaten outdoor."], "image": "val2014/COCO_val2014_000000297812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151546, "question_id": "ZhRyijMvL3Lux3aUszZhDp", "question": "How has this lunch been arranged?", "choices": ["window serve", "buffet", "picnic", "smorgasbord"], "correct_choice_idx": 2, "direct_answers": ["buffet like", "food", "buffet", "picnic", "in order", "picnic", "by type", "picnic", "picnic", "on table"], "difficult_direct_answer": false, "rationales": ["The lunch is on disposable plates.", "The lunch is spread out on a blanket.", "The lunch is a picnic."], "image": "train2014/COCO_train2014_000000151546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144182, "question_id": "ZhdMhpvBWhbVv85czGjS84", "question": "What time does the analog clock read?", "choices": ["1100", "255", "200", "1110"], "correct_choice_idx": 3, "direct_answers": ["1112", "eleven ten", "eleven ten", "1105", "1110am", "two", "eleven ten", "eleven ten", "1110", "eleven ten"], "difficult_direct_answer": false, "rationales": ["The clock reads eleven.", "The hour hand points to one hour before 12. the minute hand points to 2.", "When hour hand is on the eleven and the minute hand is at 10 minutes past the hour."], "image": "train2014/COCO_train2014_000000144182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167248, "question_id": "ZhggFKnXGxT2DxaqC7zGK3", "question": "What language is found on the newspaper?", "choices": ["french", "german", "russian", "italian"], "correct_choice_idx": 3, "direct_answers": ["italian", "italian", "italian", "italian", "italian", "italian", "english", "spanish", "italian", "swiss"], "difficult_direct_answer": false, "rationales": ["There are visible words that are known to be italian.", "It is a romance language with words that end in \"o\".", "This newspaper is from that country"], "image": "train2014/COCO_train2014_000000167248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135149, "question_id": "ZhnXtnYCRgdRyxewr2eLCB", "question": "What is the silver appliance near the window used to make?", "choices": ["coffee", "donuts", "bread", "ice cream"], "correct_choice_idx": 0, "direct_answers": ["coffee", "coffee", "coffee", "make coffee", "juice", "coffee", "coffee", "coffee", "coffee", "toast"], "difficult_direct_answer": false, "rationales": ["There is a glass kettle in the appliance.", "It is a percolator with a pot to hold the liquid.", "That is a coffee maker and used to make coffee in"], "image": "train2014/COCO_train2014_000000135149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121172, "question_id": "ZiFaXsdA8Fq3EEF58QdSJp", "question": "What is the black hat the man is wearing called?", "choices": ["top hat", "derby", "beanie", "fedora"], "correct_choice_idx": 2, "direct_answers": ["knitted cap", "beanie", "tobagon", "cap", "beanie", "cap", "ski cap", "beanie", "beanie", "ski hat"], "difficult_direct_answer": false, "rationales": ["The man has a beanie on.", "The man is wearing a beanie to keep warm in the snow", "A top hat would be used with a dressier outfit as would a fedora and a derby."], "image": "train2014/COCO_train2014_000000121172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505469, "question_id": "ZiNh7jKZ4E39pJLTmiZwaS", "question": "What is the bird standing above?", "choices": ["fruit", "baby", "egg", "cardboard box"], "correct_choice_idx": 0, "direct_answers": ["apple", "fruit", "apple", "parrot", "apple", "apple", "apple", "peach", "apple", "parrot"], "difficult_direct_answer": false, "rationales": ["The bird is standing above an apple.", "The bird is standing on a apple.", "The bird is clearly visible and their position and the object below it is identifiable."], "image": "train2014/COCO_train2014_000000505469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423214, "question_id": "ZiwjRGSPFxVCSHnnhNDiJD", "question": "What kind of dog is laying on the carpet?", "choices": ["brown lab", "terrier", "pug", "poodle"], "correct_choice_idx": 0, "direct_answers": ["brown lab", "lab", "chocolate lab", "retriever", "brown dog", "chocolate labrador", "weimaraner", "daschhound", "chocolate lab", "flat"], "difficult_direct_answer": true, "rationales": ["The dog has chocolatey fur.", "The dog is brown in color.", "The dog is brown and small."], "image": "train2014/COCO_train2014_000000423214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222625, "question_id": "ZjM3eTwjDsvr3TtJf36XWb", "question": "What sort of event brings a smile to this child's face today?", "choices": ["party", "lunch", "funeral", "nothing"], "correct_choice_idx": 0, "direct_answers": ["birthday", "party", "party", "party", "birthday party", "party", "birthday", "party", "birthday party", "party"], "difficult_direct_answer": false, "rationales": ["The child in the photo is wearing a tie and has a set of paper plates in front of him. at his age, the most likely event he is smiling about is a party.", "The boy looks to be at a party.", "A party is celebratory."], "image": "val2014/COCO_val2014_000000222625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129201, "question_id": "ZjUMyTC3MZrxLZ34cULVUB", "question": "What is one name for the type of headwear the woman is wearing?", "choices": ["veil", "cap", "tie", "hat"], "correct_choice_idx": 0, "direct_answers": ["veil", "headscarf", "headscarf", "scarf", "scarf", "scarf", "wrap", "hijab", "scarf", "hijab"], "difficult_direct_answer": false, "rationales": ["The name is a veil.", "Women in islamic countries are known to wear the head covering shown here.", "The woman is wearing a veil on her head."], "image": "train2014/COCO_train2014_000000129201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458123, "question_id": "ZjjUtu2N8F7t7Gqyim8FDo", "question": "How many airplanes do you see?", "choices": ["two", "five", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["six", "six", "six", "six", "five", "six", "five", "five", "six", "five"], "difficult_direct_answer": false, "rationales": ["There are several planes and you only have to count.", "More than four plane tails are visible.", "Three planes are in the back and two are in the front."], "image": "train2014/COCO_train2014_000000458123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412400, "question_id": "Zjve7hphr8rpxuKf5gq4Nb", "question": "What is in the silver bowl?", "choices": ["soup", "grease", "butter", "au jus"], "correct_choice_idx": 2, "direct_answers": ["broth", "butter", "broth", "dressing", "butter", "soup", "melted butter", "butter", "dressing", "melted butter"], "difficult_direct_answer": false, "rationales": ["You can tell by the color of the liquid and seafood as to what it is.", "The bowl has butter.", "The bowl is full of melted butter."], "image": "train2014/COCO_train2014_000000412400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39542, "question_id": "ZjwL4A6s5Qq4seihQY6bjP", "question": "What is coming from the top of the vehicle?", "choices": ["fire", "steam", "birds", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["There is a stream of water coming from the fire truck.", "The firemen are using their firehose with the stream starting at the top of their truck.", "They are using water to put a fire out."], "image": "train2014/COCO_train2014_000000039542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486457, "question_id": "Zk8tTbtcWnuVFVrtLc8nfz", "question": "What sort of wax item might be on a dessert enjoyed by the person sitting by the balloons today?", "choices": ["waxed nails", "joke teeth", "moon", "birthday candle"], "correct_choice_idx": 3, "direct_answers": ["candle", "candle", "candles", "cake", "candle", "candles", "birthday candle", "candles", "candle", "candle"], "difficult_direct_answer": false, "rationales": ["A birthday cake usually has candles on it.", "Candles on a cake are made out of wax. the balloons say happy birthday on them.", "Birthday candles are made of wax."], "image": "val2014/COCO_val2014_000000486457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471981, "question_id": "ZkAKKMkuVVnurCNWnZ8RgB", "question": "What is taking off?", "choices": ["airplane", "balloon", "helicopter", "kite"], "correct_choice_idx": 0, "direct_answers": ["airplane", "airplane", "plane", "airplane", "airplane", "airplane", "plane", "aeroplane", "plane", "airplane"], "difficult_direct_answer": false, "rationales": ["An airplane is jetting off into the air.", "The plane is pointed upward and is in the sky.", "This is a flying vehicle"], "image": "train2014/COCO_train2014_000000471981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4219, "question_id": "ZkUHkWYruu6kHhgpq2SrGY", "question": "What is the black strap hanging from the bench called?", "choices": ["collar", "necklace", "whip", "leash"], "correct_choice_idx": 3, "direct_answers": ["leash", "leash", "leash", "dog leash", "leash", "leash", "dog leash", "leash", "leash", "harness"], "difficult_direct_answer": false, "rationales": ["The black strap is the dog's leash.", "There is a dog visible on the bench wearing a black collar that appears to be attached to the strap. straps connected to dog's collars are known as leashes.", "This is used for dogs when walking"], "image": "train2014/COCO_train2014_000000004219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429131, "question_id": "ZmEJLYcLDEKH6rE4G2Esm8", "question": "What are modern tents made of?", "choices": ["cotton", "nylon/polyester", "wool", "plastic"], "correct_choice_idx": 1, "direct_answers": ["nylon", "nylon polyester", "plastic", "nylon", "nylon", "canvas", "nylon", "nylon", "nylon", "nylon/polyester"], "difficult_direct_answer": false, "rationales": ["The tents are seen with white nylonpolyester in the area.", "Modern tents are constructed of nylon.", "The tents are made of that stuff nowwdays."], "image": "train2014/COCO_train2014_000000429131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398435, "question_id": "ZmHYq8bepVJmVKqgpC6b35", "question": "Which ocean shares a name with this airline?", "choices": ["atlantic", "arctic", "indian", "pacific"], "correct_choice_idx": 0, "direct_answers": ["atlantic ocean", "atlantic", "atlantic", "atlantic", "atlantic ocean", "atlantic", "atlantic", "atlantic ocean", "atlantic", "atlantic"], "difficult_direct_answer": false, "rationales": ["The atlantic ocean is the second-largest in the world, with the pacific ocean coming in first. and yes, there is also a pacific airlines!.", "They have the same name", "The name printed on this virgin aircraft is also the name of an ocean."], "image": "train2014/COCO_train2014_000000398435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256284, "question_id": "ZnMU9hDGX73wYPAjHagZGh", "question": "What might those houses smell constantly?", "choices": ["bbq steaks", "manure", "flowers", "milk"], "correct_choice_idx": 1, "direct_answers": ["manure", "cow placing", "manure", "cows", "cows", "cow poop", "manure", "manure", "manure", "manure"], "difficult_direct_answer": false, "rationales": ["Cows live nearby and probably poop a lot.", "Horse poops and it stinks.", "Horses always smell of poop."], "image": "train2014/COCO_train2014_000000256284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302433, "question_id": "ZnXZxrKTuf7RcqkCNcETwZ", "question": "What does the man with his picture in the box pretend he is on?", "choices": ["sleeping pills", "tv", "sand", "steroids"], "correct_choice_idx": 1, "direct_answers": ["television", "television", "tv", "tv", "tv", "tv", "tv", "television", "television", "television"], "difficult_direct_answer": false, "rationales": ["It looks like an old television with big knobs", "The man has a tv frame around his face.", "He is pretending he is on the television."], "image": "train2014/COCO_train2014_000000302433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396882, "question_id": "ZnpZ8c89oYTYW79rqFjF9n", "question": "What is the green on the bricks on the ground?", "choices": ["paint", "crayon", "apple", "moss"], "correct_choice_idx": 3, "direct_answers": ["moss", "grass", "lichen", "moss", "no parking", "moss", "moss", "grass", "moss", "grass"], "difficult_direct_answer": false, "rationales": ["It is like a grassy growth that usually occurs widespread on objects like bricks.", "The green stuff is moss.", "There are some mosses growing between the many bricks."], "image": "train2014/COCO_train2014_000000396882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510951, "question_id": "ZnqGU4LtDsGhuu4jmkHsrD", "question": "What are the purplish veggies in the sandwich?", "choices": ["red onions", "eggplant", "purple cauliflower", "turnip"], "correct_choice_idx": 0, "direct_answers": ["onions", "pickled onions", "onions", "pickled onions", "onions", "red onions", "pickled onions", "onion", "onions", "pickled onions"], "difficult_direct_answer": false, "rationales": ["The sandwich has red onions in it because the onions are red", "I wouldn't really call them \"purplish,\" but they certainly are not eggplant, turnips or purple cauliflower.", "There are red onions inside of the sandwiches."], "image": "train2014/COCO_train2014_000000510951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33650, "question_id": "ZnuxCW4qgkYdG5rcQSda3P", "question": "The walls are most likely covered in what material?", "choices": ["slate", "plaster", "wood", "canvas"], "correct_choice_idx": 1, "direct_answers": ["plaster", "wallpaper", "wallpaper", "paint", "velvet", "velvet", "paint", "stucco", "wallpaper", "paint"], "difficult_direct_answer": false, "rationales": ["The walls have plaster on them.", "The texture and color suggests a concrete-like surface.", "Based on the mediterranean style of the room and the uneven colors, it appears the walls are covered with something thicker than paint."], "image": "val2014/COCO_val2014_000000033650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531453, "question_id": "ZoDzmhY5RBcKfmhce72m7k", "question": "What type of shot is the boy about to hit?", "choices": ["backhand", "forehand", "slice", "serve"], "correct_choice_idx": 1, "direct_answers": ["tennis", "backhand", "forehand", "underhand", "serve", "return", "backhand", "backhand", "forehand", "forehand"], "difficult_direct_answer": false, "rationales": ["By the position of the racket and body you can tell what he is trying to do.", "The boy is about to hit a forehand shot of the tennis ball.", "The shot is a forehand one."], "image": "train2014/COCO_train2014_000000531453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534162, "question_id": "ZoM34MpqPobGFzqikz5HHv", "question": "Why do the girls have matching bracelets?", "choices": ["health", "fashion", "visibility", "admission"], "correct_choice_idx": 3, "direct_answers": ["friends", "event", "to drink", "event bracelets", "club", "admission", "at club", "at club", "friends", "required"], "difficult_direct_answer": false, "rationales": ["The women are wearing wristbands.", "Often when entering a festival or concert, attendees are issued a non removable bracelet to signify that they have paid or have been screened to enter the event.", "Two girls are wearing paper bracelets that look exactly the same and are meant to be temporary."], "image": "val2014/COCO_val2014_000000534162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530229, "question_id": "ZoM8rr2r3EvCw6imt7Lkvz", "question": "What is in the room?", "choices": ["bunny", "bed", "couch", "refrigerator"], "correct_choice_idx": 2, "direct_answers": ["furnishings", "living room", "couch", "couches", "couches", "chairs", "furniture", "sofa chair", "furniture", "couch"], "difficult_direct_answer": false, "rationales": ["This is a living room. there are several couches.", "There is couch, loveseat and chair.", "There are places to sit on such as the sofa."], "image": "train2014/COCO_train2014_000000530229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488149, "question_id": "ZoNv4baMJ62bTYBef3q24B", "question": "Which color shirt does the person flying the kite wear?", "choices": ["green", "lavender", "red", "teal"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The person holding the strings to the kite has red on.", "The person with the string is wearing red.", "The man in red is holding the kite string."], "image": "train2014/COCO_train2014_000000488149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448863, "question_id": "ZoP7jMqodT5po8jYE5h8p6", "question": "What is the job of these people?", "choices": ["keep order", "load luggage", "make change", "serve food"], "correct_choice_idx": 1, "direct_answers": ["luggage", "transport luggage", "plane mechanics", "load luggage", "load luggage", "luggage handler", "aircraft crew", "load luggage", "pilot", "air control"], "difficult_direct_answer": false, "rationales": ["The individuals are tasked with loading and transporting luggage at airports. the vests and equipment used are indicative of that.", "The job is to load luggage.", "The people are loading the plane."], "image": "train2014/COCO_train2014_000000448863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346788, "question_id": "ZoVsWGea2axyJYuy8LGDHR", "question": "What is the woman standing in front of?", "choices": ["baby", "counter", "cat", "toilet"], "correct_choice_idx": 1, "direct_answers": ["counter", "sandwich", "microwave", "stove", "stove", "stovetop", "camera", "camera", "holding sandwich", "stove"], "difficult_direct_answer": false, "rationales": ["The woman is in her kitchen.", "She is in a kitchen.", "The woman is by a counter."], "image": "val2014/COCO_val2014_000000346788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110431, "question_id": "Zp4GHGmFbfCUdj63kA9XvD", "question": "What is the color of dates?", "choices": ["red", "green", "pink", "white"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "brown", "purple", "red", "purple", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["They are a delicious type of meal.", "Dates are in a bowl next to a sandwich. the dates are reddish brown.", "The color is red."], "image": "train2014/COCO_train2014_000000110431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20004, "question_id": "ZpLv2wMZHcNZRrR24BwiNE", "question": "What time of day is depicted here?", "choices": ["noon", "3 pm", "midnight", "twilight"], "correct_choice_idx": 3, "direct_answers": ["night", "night", "night", "twilight", "night", "evening", "evening", "evening", "evening", "night"], "difficult_direct_answer": false, "rationales": ["Time is showing its almost 6pm.", "The other options don't really match since this is closer to night but not quite dark entirely.", "It is easy to tell by the setting and light on the building to what time of day it is."], "image": "val2014/COCO_val2014_000000020004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50159, "question_id": "ZpTdpBKA7dMFXACG5SFhGG", "question": "Why are the two wearing sunglasses?", "choices": ["halloween", "protection", "style", "cosplay"], "correct_choice_idx": 1, "direct_answers": ["sunny", "block sunlight", "bright light", "snow blindness", "sun", "sun", "protection", "protection", "sun protection", "protect eyes"], "difficult_direct_answer": false, "rationales": ["The two are wearing sunglasses for protection.", "The people want to shield their eyes from the sun.", "This keeps them from getting snow blind by all the white and the sun"], "image": "val2014/COCO_val2014_000000050159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425989, "question_id": "ZpTn6Wq4horqXvN7YkyAVK", "question": "What is the man doing on the rail?", "choices": ["cleaning", "grind", "waxing", "waning"], "correct_choice_idx": 1, "direct_answers": ["skiing", "jumping", "sliding", "falling", "grinding", "balancing", "balancing", "grinding", "standing", "grind"], "difficult_direct_answer": false, "rationales": ["The man is grinding his skateboard on the rail.", "The man is grinding against the rail.", "This is the name of the trick when they ride the board on metal"], "image": "train2014/COCO_train2014_000000425989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317765, "question_id": "ZpX9UWwfx9MMUognPZ2kMX", "question": "What color are the boards at the back of the truck?", "choices": ["purple", "green", "yellow", "blue"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["They are visible and yellow.", "The boards and truck in question are clearly visible and the color is identifiable.", "They are the same color as bananas."], "image": "train2014/COCO_train2014_000000317765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56326, "question_id": "ZpZvYA3mSwd2T65Zqy9fPp", "question": "On which plant does the vegetable that is reddest here grow?", "choices": ["cauliflower", "carrot", "pepper", "corn"], "correct_choice_idx": 2, "direct_answers": ["vine", "pepper", "pepper", "capsicum annum", "pepper", "pepper", "pepper", "ground", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["There is only one red vegetable. chicken is eaten with peppers.", "The only red vegetable is cut up peppers.", "There is a giant red pepper on the plate."], "image": "val2014/COCO_val2014_000000056326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455211, "question_id": "ZpiJe6LVmSTZbCDVVnRR9Y", "question": "What does the man in green hold?", "choices": ["kite string", "bathing suit", "remote control", "shovel"], "correct_choice_idx": 0, "direct_answers": ["kite string", "kite", "kite", "kite string", "kite string", "kite string", "string", "kite", "kite strings", "kite"], "difficult_direct_answer": false, "rationales": ["The man is holding onto a string controlling the kite.", "The man has the kite in his hand.", "The man is holding string for a kite."], "image": "train2014/COCO_train2014_000000455211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286460, "question_id": "ZpkQbfvmUb2dJCdVHKLVAm", "question": "Judging by the time of day where is the skier probably going?", "choices": ["competition", "hiking", "skiing", "home"], "correct_choice_idx": 3, "direct_answers": ["home", "home", "home", "ski lodge", "mountain top", "home", "home", "down mountain", "home", "home"], "difficult_direct_answer": false, "rationales": ["It's night time and time to go home", "It's nighttime and most skiing takes place during the day, so it's likely that the skier has finished his skiing activities and is now leaving to head for home.", "The skier is likely headed home as it's night."], "image": "val2014/COCO_val2014_000000286460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549435, "question_id": "Zpwmq6JuVmNpL2PQqnhAfh", "question": "What is the rear part of a skateboard commonly called?", "choices": ["dogend", "tail", "caboose", "backside"], "correct_choice_idx": 1, "direct_answers": ["tail", "tail", "tail", "tail", "front", "tail", "back", "rear", "edge", "tail"], "difficult_direct_answer": false, "rationales": ["The tail of the board.", "The back end of a skateboard, is called the tail.", "The tail is called as the back part of the board."], "image": "train2014/COCO_train2014_000000549435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324428, "question_id": "ZqFzbKuryBhrBCs7E9Unh5", "question": "What are the large mechanical device on the back of the boats do?", "choices": ["propel", "hold bait", "anchor", "catch fish"], "correct_choice_idx": 0, "direct_answers": ["powers boat", "propel", "motor", "engine drive", "engines", "propel", "move them", "propel", "power boat", "motors boat"], "difficult_direct_answer": false, "rationales": ["The large devices on the back of the boats are meant to propel them through water.", "These are engines that move the boat", "The engines propel the boats."], "image": "train2014/COCO_train2014_000000324428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381890, "question_id": "ZqJwHhzy6RshnATTW53oat", "question": "What man made feature should be definitely avoided when engaging in this sport?", "choices": ["bridges", "houses", "power lines", "cars"], "correct_choice_idx": 2, "direct_answers": ["electrical line", "power lines", "buildings", "planes", "lighthouse", "power lines", "electrical lines", "electric lines", "electricity", "telephone wires"], "difficult_direct_answer": true, "rationales": ["Power lines can electrocute if they come in contact with water.", "They can get caught in the lines and cause damage or electrocution.", "The man made object is a kite based on the size and design. while all answers are possible, answer a is commonly depicted as a danger for a kite such as this."], "image": "train2014/COCO_train2014_000000381890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536728, "question_id": "ZqMoqkRJkA4NYvAEohjGva", "question": "Which jockey is ahead?", "choices": ["green stripes", "none", "red hat", "yellow hat"], "correct_choice_idx": 2, "direct_answers": ["red hat", "nine", "number 9", "red jockey", "orange", "orange hat", "red", "orange cap", "orange", "red one"], "difficult_direct_answer": true, "rationales": ["Anyone can clearly see the red colored jockey is in the lead.", "The jockey with the red hat is first.", "The jockey in the red hat is leading."], "image": "train2014/COCO_train2014_000000536728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281850, "question_id": "ZqUcBt4732yqjwVAUoSj49", "question": "What is the dog wearing?", "choices": ["dress", "vest", "hat", "sweater"], "correct_choice_idx": 3, "direct_answers": ["sweater", "shirt", "shirt", "sweater", "sweater", "sweater", "sweater", "sweater", "sweater", "sweater"], "difficult_direct_answer": false, "rationales": ["The dog is wearing a covering on the front half of its body.", "The dog has a sweater on.", "The dog has a sweater."], "image": "train2014/COCO_train2014_000000281850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402408, "question_id": "ZqkmyWy358d82dqr2hjtjC", "question": "What is this skateboarding feature?", "choices": ["funbox", "bowl", "rail", "half-pipe"], "correct_choice_idx": 3, "direct_answers": ["ollie", "skate", "bowl", "ramp", "no idea", "decals", "skating trick", "half-pipe", "grab trick", "halfpipe"], "difficult_direct_answer": true, "rationales": ["The half pipe is present.", "The semi-circle shape present at this skatepark is known as a half-pipe.", "The person is in a skate bowl and the trick is called a halfpipe."], "image": "train2014/COCO_train2014_000000402408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285130, "question_id": "ZqxpXqm53faunVPcrNtNV5", "question": "How many berry variety fruits are there?", "choices": ["four", "one", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["four", "three", "five", "two", "four", "four", "five", "four", "three", "three"], "difficult_direct_answer": false, "rationales": ["Strawberries, blackberries and blueberries are shown.", "There are three varieties of fruits, strawberries, blueberries and blackberries", "There are blackberries, strawberries and blueberries."], "image": "train2014/COCO_train2014_000000285130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106736, "question_id": "ZrB5jF32yTmxadJ4xfHpL9", "question": "What is on the food?", "choices": ["bacon", "salsa", "potato chips", "candles"], "correct_choice_idx": 3, "direct_answers": ["candles", "candles mms", "candy candles", "candy", "candles", "mms", "candles", "candles", "chocklete", "candles"], "difficult_direct_answer": false, "rationales": ["It is a birthday treat.", "The candles are on the donuts.", "The pointy things sticking out are made of wax and have tips that can be lit."], "image": "val2014/COCO_val2014_000000106736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453734, "question_id": "ZrMcUxVhDjr2zvfNGiqMEW", "question": "What countries flag is on the person's visor?", "choices": ["sweden", "finland", "italy", "germany"], "correct_choice_idx": 3, "direct_answers": ["germany", "germany", "germany", "jamaica", "germany", "germany", "germany", "germany", "spain", "guinea"], "difficult_direct_answer": false, "rationales": ["There is a german flag hanging on the person's head.", "The flag of germany is on the head of the person drinking beer.", "This european country has a black, orange and yellow flag."], "image": "train2014/COCO_train2014_000000453734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205858, "question_id": "ZrMdGHasFRh6axnmooZhw8", "question": "How many giraffes are interacting with the man?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "one", "2 giraffes", "one", "one", "one", "2 giraffes"], "difficult_direct_answer": false, "rationales": ["The man is only touching one giraffe.", "One giraffe is leaning over.", "There is only one giraffe."], "image": "train2014/COCO_train2014_000000205858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157639, "question_id": "ZrZzPk2967nAAqGfjmF9hx", "question": "What meal was this?", "choices": ["snack", "dinner", "breakfast", "lunch"], "correct_choice_idx": 2, "direct_answers": ["lunch", "breakfast", "breakfast", "laka", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast"], "difficult_direct_answer": false, "rationales": ["There are pancakes and eggs", "The plates have breakfast foods like eggs and has browns on them.", "Pancakes and eggs usually signifies a breakfast."], "image": "train2014/COCO_train2014_000000157639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224079, "question_id": "ZrcAkkuNdypg69mb8SGXas", "question": "What is the young child using the object in his hand to do?", "choices": ["paint", "brush teeth", "play", "eat"], "correct_choice_idx": 1, "direct_answers": ["brushing teeth", "toothbrush", "brush teeth", "brush teeth", "brush teeth", "chew", "brush teeth", "brush teeth", "brush teeth", "brush teeth"], "difficult_direct_answer": false, "rationales": ["The youngest child has a toothbrush.", "The boy is holding a long object that has bristles, and has it inside his mouth.", "He is cleaning his teeth."], "image": "train2014/COCO_train2014_000000224079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12240, "question_id": "ZsC6H8mLDit5Dcj6vvnGm5", "question": "Why is the woman using an umbrella?", "choices": ["snow", "disguise", "sun", "rain"], "correct_choice_idx": 2, "direct_answers": ["sun", "shade", "sun", "shade", "sun", "shade", "shade", "for shade", "shade", "for shade"], "difficult_direct_answer": false, "rationales": ["The woman blocks sun.", "Umbrellas are used to prevent much sunlight.", "The woman doesn't want a sunburn."], "image": "train2014/COCO_train2014_000000012240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507854, "question_id": "Zsb2Lp7xmZoXv6YtFkaofZ", "question": "What do these trains carry?", "choices": ["cars", "animals", "people", "coal"], "correct_choice_idx": 2, "direct_answers": ["people", "people", "passengers", "people", "people", "cages", "people", "people", "passengers", "people"], "difficult_direct_answer": false, "rationales": ["Men and woman can be seen both inside and outside the trains showing that this is a passenger train.", "They have people.", "Based on the amount of humans near the trains, a number on the carriage, and handles on the doors, this appears to be a passenger train."], "image": "train2014/COCO_train2014_000000507854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12156, "question_id": "ZsnTzmazGMcJKnTzuHMBnB", "question": "What adds stability to the skier seen here?", "choices": ["phone", "poles", "snow shovels", "shoes"], "correct_choice_idx": 1, "direct_answers": ["ski poles", "pole", "ski poles", "ski poles", "hooks", "poles", "ski pole", "poles", "poles", "poles"], "difficult_direct_answer": false, "rationales": ["These help to keep balance", "A skier is standing on skis with poles on either side touching the ground.", "The skier has poles."], "image": "train2014/COCO_train2014_000000012156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353006, "question_id": "ZssYLKjJZDB7rTkHCTY572", "question": "What is this device used for?", "choices": ["calling", "viewing", "cutting", "cooling"], "correct_choice_idx": 1, "direct_answers": ["watching media", "media", "viewing", "watching", "television", "watching television", "entertainment", "watching tv", "entertainment", "television"], "difficult_direct_answer": false, "rationales": ["The television is on.", "It is a television for shows and movies", "This device was invented to transmit patterns of light."], "image": "train2014/COCO_train2014_000000353006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221609, "question_id": "ZstdfunoG2dhjRPbNJkyNb", "question": "What kind of company set up the thing with a clock?", "choices": ["running goods", "racecar", "life insurance", "watch"], "correct_choice_idx": 3, "direct_answers": ["watch", "bank", "citizen", "watch", "watch", "bank", "bank", "citizen", "watch company", "bank"], "difficult_direct_answer": false, "rationales": ["Citizen is an a company.", "Citizen is a timekeeping brand.", "Citizen makes time pieces."], "image": "val2014/COCO_val2014_000000221609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312435, "question_id": "Zu2UAmXPGkUdagVqUq6kQF", "question": "What instrument is shown in the picture?", "choices": ["clarinet", "guitar", "keyboards", "drums"], "correct_choice_idx": 2, "direct_answers": ["piano", "keyboard", "keyboard", "keyboard", "keyboards", "instrument", "keyboard", "keyboard", "keyboard", "keyboard"], "difficult_direct_answer": false, "rationales": ["There are keyboards on the side.", "These are keyboards with black and white keys.", "There are several musical keyboards in this picture."], "image": "train2014/COCO_train2014_000000312435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156974, "question_id": "Zu9vhiD7Q7DstUqaxhBghX", "question": "How many giraffes can you see?", "choices": ["four", "none", "three", "two"], "correct_choice_idx": 0, "direct_answers": ["three", "four", "three", "three", "four", "four", "three", "three", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are three giraffes near the fence and one near the rocks.", "Four giraffes are there.", "Two giraffes are in between two others."], "image": "val2014/COCO_val2014_000000156974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121457, "question_id": "ZuUvCPiG8PHVNXqs8YCKzV", "question": "Which boat is most visible from the shoreline?", "choices": ["sailboat", "nothing", "jet ski", "motorboat"], "correct_choice_idx": 0, "direct_answers": ["sailboat", "sailboat", "sailboat", "yacht", "sailboat", "sailboat", "sailboat", "sailboat", "sailboat", "sail boat"], "difficult_direct_answer": false, "rationales": ["It has a tall white piece of material that is easy to see.", "Big pieces of canvas help a boat move through the water.", "You can see the sails up on the boat that is the closest to the shore."], "image": "val2014/COCO_val2014_000000121457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298547, "question_id": "Zv2xJMkedkGMsJAthT3jBq", "question": "What abnormality does the man on the right have?", "choices": ["down syndrome", "being blind", "being short", "overweight"], "correct_choice_idx": 0, "direct_answers": ["down syndrome", "downs syndrome", "downs syndrome", "shooting", "downs syndrome", "cake", "vomiting", "down syndrome", "down syndrome", "down syndrome"], "difficult_direct_answer": false, "rationales": ["The man has down syndrome eyes.", "Down's syndrome is the condition.", "The man on the right has down's with his flat face."], "image": "val2014/COCO_val2014_000000298547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1271, "question_id": "Zv3uyYWWmz3AGZRrMAGAM2", "question": "The man all the way to the right looks most like he would belong on what show?", "choices": ["duck dynasty", "empire", "power", "jeffersons"], "correct_choice_idx": 0, "direct_answers": ["duck dynasty", "pawn stars", "redneck show", "duck dynasty", "wwe smackdown", "redneck", "swamp people", "larry cable", "duck dynasty", "comedy show"], "difficult_direct_answer": false, "rationales": ["He looks like white trash that hunts.", "The man is part of the country folk of the south, just like the people in the show.", "The man has a similar look due to his beard and hat."], "image": "train2014/COCO_train2014_000000001271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442861, "question_id": "Zv4z49WJbvrEWW2hXJVJcJ", "question": "Why are they both on the same side of the net?", "choices": ["confused", "fighting", "cheating", "are team"], "correct_choice_idx": 3, "direct_answers": ["partners", "same team", "teammates", "yes", "tennis partners", "playing doubles", "doubles partners", "are team", "playing doubles", "playing doubles"], "difficult_direct_answer": false, "rationales": ["Players on the same side are teammates.", "They're on the same team.", "The men are playing doubles."], "image": "val2014/COCO_val2014_000000442861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217395, "question_id": "ZvRyHwomZVAZjNTuAN5Lpb", "question": "What mph would officially be speeding in this area?", "choices": ["44", "46", "25", "20"], "correct_choice_idx": 1, "direct_answers": ["over 45", "46", "fifty", "fifty", "forty six", "forty five", "46", "46", "46", "forty-six"], "difficult_direct_answer": false, "rationales": ["The speed limit is 45.", "It is one mile over the posted speed limit.", "The sign has a posted speed limit of 45. in this type of zone any speed over 45 would be considered speeding."], "image": "train2014/COCO_train2014_000000217395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145208, "question_id": "ZvnarEpkxqyuGhrtPUDknT", "question": "What might she be telling him to do?", "choices": ["look here", "go here", "get that", "see this"], "correct_choice_idx": 1, "direct_answers": ["change directions", "go right", "turn right", "move", "go here", "direction", "go right", "turn right", "head left", "go there"], "difficult_direct_answer": false, "rationales": ["She is pointing directions with her finger.", "She'll go ahead.", "The man is being directed."], "image": "val2014/COCO_val2014_000000145208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320045, "question_id": "ZvqgjWhxzYWvhQzBXfZXdk", "question": "Where is this cat located?", "choices": ["restaurant", "home", "vet", "park"], "correct_choice_idx": 0, "direct_answers": ["seat", "seat", "chair", "restaurant", "chair", "chair", "chair", "chair", "chair", "chair"], "difficult_direct_answer": false, "rationales": ["There are many tables in the room.", "The tables are the type that you would find at a restaurant.", "There are many uniform tables and with food trays on them."], "image": "train2014/COCO_train2014_000000320045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172370, "question_id": "Zw3SwNdF6QdFaYq7ZtsYsr", "question": "What year did the founder start making these snowboards?", "choices": ["2000", "1977", "1986", "1999"], "correct_choice_idx": 1, "direct_answers": ["1977", "1977", "1977", "first year", "long ago", "2010", "na", "unknown", "1977", "nineteen seventyseven"], "difficult_direct_answer": false, "rationales": ["They were first made in 1977.", "He started in a barn in vermont in the late 70s.", "The year was 1977."], "image": "train2014/COCO_train2014_000000172370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502307, "question_id": "Zw5ht66eKT6VrfcyLi9oSU", "question": "What is the large white object behind the seat of the scooter used for?", "choices": ["sitting", "storage", "tricks", "towing"], "correct_choice_idx": 1, "direct_answers": ["storage chest", "storage", "storage chest", "storage", "storage", "storage", "storage", "baggage", "storage", "storage"], "difficult_direct_answer": false, "rationales": ["It's a locked container to hold belongings.", "The object is for storage.", "The object is for storage purposes."], "image": "train2014/COCO_train2014_000000502307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314944, "question_id": "Zw8WuDeJYsWq5hDVjSFarR", "question": "What sort of sports jersey is the person in red wearing?", "choices": ["soccer", "basketball", "football", "baseball"], "correct_choice_idx": 1, "direct_answers": ["basketball", "nba", "basketball", "basketball", "nba", "basketball", "basketball", "basketball", "basketball", "basketball"], "difficult_direct_answer": false, "rationales": ["Basketball jerseys like this are usually short sleeved and loose fitting, and made of light breathable material.", "A basketball jersey is usually sleeveless.", "It has the nba logo on the front of the shirt."], "image": "train2014/COCO_train2014_000000314944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273684, "question_id": "ZwT3HnGWhgPk2A3DiJ38bW", "question": "What will this person do next?", "choices": ["quit", "ante up", "catch ball", "throw ball"], "correct_choice_idx": 3, "direct_answers": ["throw ball", "throw ball", "throw ball", "throw", "throw ball", "throw pitch", "pitch", "throw ball", "throw ball", "pitch ball"], "difficult_direct_answer": false, "rationales": ["He'll pitch the ball.", "The person is playing baseball based on the visible uniform and equipment. in this sport while in this position holding the ball, the player would next do answer a.", "This person is about to throw the ball."], "image": "val2014/COCO_val2014_000000273684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200597, "question_id": "ZwbqkxW73Wocxrc6r6uApp", "question": "How might many who listen to this speaker hear his message?", "choices": ["through speakers", "paper", "sign language", "interpreter"], "correct_choice_idx": 0, "direct_answers": ["ears", "through speakers", "microphone", "big audience", "live", "speakers", "via microphone", "speakers", "television", "radio"], "difficult_direct_answer": true, "rationales": ["You can hear him through the speakers.", "The man is holding a microphone in his hand.", "The man speaking in this image holds a microphone. the purpose of the microphone is to amplify the voice to an audience through speakers."], "image": "train2014/COCO_train2014_000000200597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335518, "question_id": "ZwcPNhDD2Er78KUqAghSfM", "question": "What is the sport these two players are engaged in?", "choices": ["egg catch", "ultimate frisbee", "sand running", "basketball"], "correct_choice_idx": 1, "direct_answers": ["ultimate frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The players are involved in a game of ultimate frisbee.", "They are playing ultimate frisbee.", "They are playing a game of frisbee on the beach."], "image": "train2014/COCO_train2014_000000335518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434699, "question_id": "ZwfnmwnA3oUfJLR93rRCkg", "question": "What is inside the small rectangular objects covered in gold foil?", "choices": ["butter", "sanitizer", "salt", "mayo"], "correct_choice_idx": 0, "direct_answers": ["chocolate", "butter", "butter", "butter", "chocolate", "butter", "butter", "butter", "chocolate", "butter"], "difficult_direct_answer": false, "rationales": ["Butter is kept in little packets.", "Pats of butter are available in wrapped units. it would go good with these dishes.", "The butter is inside."], "image": "train2014/COCO_train2014_000000434699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111245, "question_id": "ZwncwLoaWLyLW3sToNdFJi", "question": "What flavor are these donuts?", "choices": ["chocolate", "strawberry", "lemon", "plain glazed"], "correct_choice_idx": 3, "direct_answers": ["glazed", "sugar", "sugar", "sugar", "glazed", "glazed", "plain glazed", "plain glazed", "sugar", "powdered"], "difficult_direct_answer": false, "rationales": ["The donuts have no frosting but they have clear glaze.", "There is nothing additional on them that would give them a certain flavor.", "They are donut colored with a sugary coating."], "image": "train2014/COCO_train2014_000000111245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24285, "question_id": "Zx2Wmdsb24A6bk8SssEv9N", "question": "Why is she holding the racquet like that?", "choices": ["more power", "new player", "tantrum", "hit someone"], "correct_choice_idx": 0, "direct_answers": ["swinging", "hit ball", "reverse serve", "winding up", "more power", "hitting ball", "serve", "hit ball", "playing", "backhand"], "difficult_direct_answer": true, "rationales": ["She is holding with two hands to help with a more powerful swing.", "She is holding her racquet to deliver a more powerful hit.", "She wants more power."], "image": "val2014/COCO_val2014_000000024285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335363, "question_id": "Zx4AfHdZGptRUbkeWjzP8X", "question": "What type of car is parked closest in view?", "choices": ["jeep", "2-door", "4-door", "convertible"], "correct_choice_idx": 1, "direct_answers": ["sedan", "sedan", "sedan", "sedan", "sedan", "blue colored", "sedan", "bus", "many", "2-door"], "difficult_direct_answer": false, "rationales": ["The doorway has two.", "The smallest car is closest.", "A bus is in the middle of the road and a car with one door on the left side is next to it."], "image": "train2014/COCO_train2014_000000335363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317428, "question_id": "Zx6Z3w5BfwasrwXKJYmytc", "question": "What is this type of baby animal callled?", "choices": ["puppy", "colt", "kitten", "cub"], "correct_choice_idx": 3, "direct_answers": ["bear cub", "bear", "bear", "cub", "cub", "cub", "cubs", "cub", "cob", "cub"], "difficult_direct_answer": false, "rationales": ["There are bear babies.", "The other options don't apply to baby black bears.", "These are bear offspring."], "image": "train2014/COCO_train2014_000000317428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262609, "question_id": "ZxQy4YtRVnHcX2czAtJvd4", "question": "What does the symbol on the left building stand for?", "choices": ["apple company", "adidas", "microsoft", "sketchers"], "correct_choice_idx": 0, "direct_answers": ["apple computers", "apple", "apple company", "stop", "apple computers", "apple", "apple", "apple", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["The symbol is for the mac.", "This is the logo for that company.", "The logo on the left is for apple, the computer company."], "image": "val2014/COCO_val2014_000000262609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402470, "question_id": "ZxVPBVt6na8DcMztkgB4ri", "question": "What type of baseball is being played?", "choices": ["minor league", "little league", "major league", "japanese league"], "correct_choice_idx": 0, "direct_answers": ["minor league", "minor league", "american", "major league", "minor league", "through", "major league", "professional", "american", "mlb"], "difficult_direct_answer": false, "rationales": ["The minor league is likely played in this small stadium.", "Field is decent size but attendance isn't as big.", "Minor league ball has smaller turnouts."], "image": "train2014/COCO_train2014_000000402470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173568, "question_id": "ZxYxKimE9UagY9hMDy6Jbp", "question": "What is near the airplane?", "choices": ["truck", "cow", "cardboard box", "baby"], "correct_choice_idx": 0, "direct_answers": ["turk", "truck", "truck", "truck", "truck workers", "truck", "truck", "truck", "truck", "truck"], "difficult_direct_answer": false, "rationales": ["The truck is close to the plane's engine.", "There is a truck near the airplane because it is making a delivery to the plane", "A truck is near the plane."], "image": "train2014/COCO_train2014_000000173568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570601, "question_id": "ZxqSkoNj8VcigizQ4X8q2o", "question": "Where does baseball come from?", "choices": ["sweden", "england", "america", "france"], "correct_choice_idx": 3, "direct_answers": ["player", "new york", "pitcher", "france", "pitcher", "pitcher", "united states", "cincinnati ohio", "mlb", "usa"], "difficult_direct_answer": false, "rationales": ["Baseball comes from france.", "Baseball was created in america, where is remains a very popular sport.", "Baseball is america's past time."], "image": "train2014/COCO_train2014_000000570601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59815, "question_id": "Zy9ZdNh3gf6yuMMF4ducGx", "question": "What is the ancestral animal of the animal represented here?", "choices": ["woolly mammoth", "lion", "asian elephant", "tiger"], "correct_choice_idx": 0, "direct_answers": ["mammoth", "elepant", "mammoth", "woolly mammoth", "elephant", "elephant", "elephant ancestral", "mammoth", "eighty", "elephant"], "difficult_direct_answer": false, "rationales": ["The animal is a wooly mammoth.", "Woolly mammoths appeared on earth before elephants.", "The elephant is the descendant of the woolly mammoth."], "image": "val2014/COCO_val2014_000000059815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543393, "question_id": "ZyFWoxLTasbrU2LZf3HiwF", "question": "What type of vehicle driving on this road could result in a traffic ticket?", "choices": ["bus", "car", "truck", "motorcycle"], "correct_choice_idx": 2, "direct_answers": ["not handicap", "truck", "truck", "truck", "truck", "truck", "truck", "truck", "not handicap", "truck"], "difficult_direct_answer": false, "rationales": ["There is a sign in the background with a truck crossed out. in traffic sign iconography, this indicates the kind of vehicle that is not permitted.", "The truck is the type.", "There is a sign that indicates trucks aren't allowed."], "image": "val2014/COCO_val2014_000000543393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579081, "question_id": "ZyQRZxtWJ89DBTnyX6wDjZ", "question": "What is this type of transportation called?", "choices": ["forklift", "ski lift", "spacelift", "carrier"], "correct_choice_idx": 1, "direct_answers": ["ski lift", "ski lift", "ski lift", "chairlift", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["The lift is above the snow and everyone is dressed and ready to ski.", "It is a ski lift because it is carrying people up a moutain wearing skis", "This is a lift at a ski resort taking them to the top of the mountain so they can ski down."], "image": "val2014/COCO_val2014_000000579081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108470, "question_id": "ZyWyT8yXyTgzoMP9MQjgLp", "question": "The first number on the license plate can be described as what?", "choices": ["infinite", "odd", "even", "negative"], "correct_choice_idx": 2, "direct_answers": ["four", "even", "four", "even", "four", "four", "number four", "four", "even", "four"], "difficult_direct_answer": false, "rationales": ["Four is an even number.", "The number on the license plate is even since it's a four.", "It is the third even number."], "image": "train2014/COCO_train2014_000000108470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181503, "question_id": "ZypEAeQY2W7FDktGUYiFUu", "question": "What white item is atop the greens that sit atop the pizza?", "choices": ["makeup", "milk", "dressing", "marshmallow"], "correct_choice_idx": 2, "direct_answers": ["dressing", "dressing", "vinaigrette", "dressing", "dressing", "vinaigrette", "dressing", "dressing", "dressing", "dressing"], "difficult_direct_answer": false, "rationales": ["Salad greens are covered in dressing generally.", "Ranch dressing is on the arugula.", "Fresh salad with sauce on top of it."], "image": "train2014/COCO_train2014_000000181503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209930, "question_id": "Zz4NtfVpgBZoDRzAz6ovyT", "question": "What type of bread might uniquely be available near this stadium?", "choices": ["hotcross buns", "sourdough", "rye", "cheddar bay"], "correct_choice_idx": 1, "direct_answers": ["sourdough", "buns", "sourdough", "hotdog bun", "your giants", "sourdough", "sourdough", "hotdog bun", "sourdough", "sourdough"], "difficult_direct_answer": false, "rationales": ["The bread is sourdough.", "That type of bread is popular in san francisco.", "This type of bread was in invented in this city."], "image": "val2014/COCO_val2014_000000209930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375400, "question_id": "Zz4foKURDkCoaTkx5QVQX5", "question": "What are the elephants moving through?", "choices": ["village", "sea", "jungle", "desert"], "correct_choice_idx": 2, "direct_answers": ["jungle", "jungle", "jungle", "jungle", "jungle", "forest", "bush", "forest", "jungle", "jungle"], "difficult_direct_answer": false, "rationales": ["There is a lot of vegetation", "The elephants are moving through a dense jungle.", "The animals are visibly in a thick green forest setting which is consistent with answer a and none of the other answers."], "image": "train2014/COCO_train2014_000000375400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343481, "question_id": "ZzGAvsy2AhbyL35CdY2xHh", "question": "What is happening to the sheep?", "choices": ["feeding", "sheering", "cleaning", "transportation"], "correct_choice_idx": 3, "direct_answers": ["stuck", "being transported", "transported", "penned in", "coralled", "grouped", "crammed", "herded", "shear", "transportation"], "difficult_direct_answer": true, "rationales": ["The sheep are in a truck.", "There are in a crate to be moved to somewhere else.", "The sheep are being moved."], "image": "val2014/COCO_val2014_000000343481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85657, "question_id": "ZzGWJ7dDkiLDPq9KZfPMAx", "question": "What do the animals need to do?", "choices": ["pull", "push", "carry", "cross"], "correct_choice_idx": 3, "direct_answers": ["run", "graze", "get inside", "follow shepherd", "run", "move", "cross", "run", "enter corral", "graze"], "difficult_direct_answer": false, "rationales": ["The sheep are travelling in a herd. some are on a pavement that it is a road, and they must go over the road to get to the other side.", "The animals cross.", "There is a road in the way and they are trying to get to the other side with their owner."], "image": "train2014/COCO_train2014_000000085657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74365, "question_id": "ZzKc8JRTd4T63Fk8xZLQUL", "question": "These people are most likely where?", "choices": ["garage", "park", "office", "mall"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "park", "restaurant", "restaurant", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["The people are in a wooden table.", "The people are outside having breakfast.", "They look to be outside eating food."], "image": "train2014/COCO_train2014_000000074365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535871, "question_id": "ZzagAoxtYXoAJA4AG4kSJY", "question": "In which state was the skateboard invented?", "choices": ["south carolina", "california", "michigan", "utah"], "correct_choice_idx": 1, "direct_answers": ["california", "california", "california", "california", "california", "california", "california", "california", "california", "california"], "difficult_direct_answer": false, "rationales": ["The skateboard was invented in california.", "This is where the skateboard was first manufactured. surfers wanted something to do that was similar to their sport when waves were not present or they were unable to surf.", "The state is california."], "image": "train2014/COCO_train2014_000000535871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564627, "question_id": "ZzhwzT68JRDYoc2Vq2oaHX", "question": "What is the man in the white shirt staring at?", "choices": ["tennis racket", "shoes", "tennis ball", "net"], "correct_choice_idx": 2, "direct_answers": ["ball", "ball", "ball", "ball", "ball", "ball", "ball", "ball", "tennis ball", "ball"], "difficult_direct_answer": false, "rationales": ["He is watching the ball and trying to hit it.", "The man in the white shirt is staring at the tennis ball.", "The man in the white shirt is holding a racket. he is about to use it for its intended purpose."], "image": "train2014/COCO_train2014_000000564627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575519, "question_id": "ZzzwsHx47Lhrer3kVh4Fo8", "question": "What are the people doing?", "choices": ["rowing", "eating", "flying", "running"], "correct_choice_idx": 0, "direct_answers": ["rowing", "rowing", "rowing", "fishing", "fishing", "rowing boat", "rowing", "fishing", "fishing", "using oar"], "difficult_direct_answer": false, "rationales": ["They are using the paddles to move the boats.", "The people have paddles in their hand. they are on a boat.", "The people are in boats and using an oar to propel themselves which is consistent with answer a."], "image": "train2014/COCO_train2014_000000575519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139645, "question_id": "a27iyVTVuh2CNi3wKRnTx4", "question": "What type of sink is this child using?", "choices": ["commercial", "bathroom", "laundry", "kitchen"], "correct_choice_idx": 1, "direct_answers": ["faucet", "faucet", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom sink", "bathroom"], "difficult_direct_answer": false, "rationales": ["The child is brushing his teeth.", "Although sinks can be found in many places, the child is brushing his teeth. most toothbrushes are not kept in a kitchen or laundry room and they are used in a home and not a commercial setting.", "The child is brushing their teeth, not washing dishes or laundry."], "image": "train2014/COCO_train2014_000000139645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336397, "question_id": "a2CCZLWNFmpDx4fAKAyYG5", "question": "Why is the skateboard in the air?", "choices": ["has fallen", "is trick", "dropped it", "bounced"], "correct_choice_idx": 1, "direct_answers": ["kickflip", "doing tricks", "is trick", "performing trick", "trick", "tricks", "skateboard trick", "jump", "skating trick", "performing trick"], "difficult_direct_answer": true, "rationales": ["People often jump in the air when doing tricks on their skateboards.", "This is a deliberate action.", "The skateboard is being flipped."], "image": "train2014/COCO_train2014_000000336397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194462, "question_id": "a2EjZRcuF6V7gu4tpqQh3c", "question": "What animal has similar things on their head to these animals?", "choices": ["skunk", "cat", "dog", "goat"], "correct_choice_idx": 3, "direct_answers": ["bulls", "goat", "bulls", "horns", "oxen", "boar", "goat", "sheep", "deer", "deer"], "difficult_direct_answer": false, "rationales": ["Goats have horns, the other animals don't.", "Like bulls, goats also have horns on their head.", "These cows have horns on their heads. dogs, cats, and skunks do not have horns."], "image": "train2014/COCO_train2014_000000194462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574108, "question_id": "a2GJBrdxSthUWWbxkz4BMp", "question": "What material is the backpack made of?", "choices": ["pic", "nylon", "cotton", "leather"], "correct_choice_idx": 1, "direct_answers": ["nylon", "velcro", "canvas", "nylon", "nylon", "polyester", "nylon", "nylon", "nylon", "nylon"], "difficult_direct_answer": false, "rationales": ["The backpack is made from nylon because it protects from the water the objects that are inside", "The backpack is made of a weatherproof and malleable material which is likely nylon.", "The backpack is a thick water resistant material."], "image": "val2014/COCO_val2014_000000574108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441363, "question_id": "a2MFf7neyaY6cYFXRsDQR6", "question": "Where did the tennis racket come from?", "choices": ["opposing player", "official", "tennis outfitter", "red player"], "correct_choice_idx": 3, "direct_answers": ["mans hand", "hand", "france", "red player", "player", "player's hands", "player's hand", "man's hand", "man", "hand"], "difficult_direct_answer": true, "rationales": ["It looks like it flew from his hand.", "The man lost his racket.", "The player wearing a red shirt was holding a tennis racket and threw it."], "image": "val2014/COCO_val2014_000000441363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145761, "question_id": "a2ZyDRuDcBrVTCrWhNHkMc", "question": "What movie is Tom Cruise starring in?", "choices": ["bond", "mission impossible", "city", "oblivion"], "correct_choice_idx": 3, "direct_answers": ["oblivion", "oblivion", "oblivion", "oblivion", "oblivion", "oblivion", "oblivion", "oblivion", "oblivion", "oblivion"], "difficult_direct_answer": false, "rationales": ["Oblivion's ad is shown.", "Tom Cruise is the actor on the poster for the movie oblivion", "The sign on the bus is an ad for the movie oblivion and has tom cruise's name on it."], "image": "train2014/COCO_train2014_000000145761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245517, "question_id": "a2kaU9M86pb4tekBM3JGSc", "question": "What number is closest to the number at the top of the bus?", "choices": ["560", "803", "240", "121"], "correct_choice_idx": 2, "direct_answers": ["230", "230", "240", "twohundred thirty", "228", "230", "229", "229", "229", "229"], "difficult_direct_answer": false, "rationales": ["The number at the top of the bus is 229.", "240 is closest to 229.", "The number on the top is 229"], "image": "train2014/COCO_train2014_000000245517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141158, "question_id": "a3L5rucVrsFqQFUPkoyeC7", "question": "What danger is the man likely to face?", "choices": ["heavy rain", "storm", "strong wind", "lightening"], "correct_choice_idx": 1, "direct_answers": ["hypothermia", "frostbite", "frostbite", "hyperthermia", "freezing", "freezing weather", "storm", "hypothermia", "steep slope", "freezing"], "difficult_direct_answer": false, "rationales": ["The sky is covered in dark clouds.", "There is a lot of snow that looks like a blizzard.", "A man is out in a rural area covered in snow with woods all around. the sky is overcast, it is dark and hazy."], "image": "val2014/COCO_val2014_000000141158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399332, "question_id": "a3psBdQD34sFewRy7nwaoR", "question": "What electronics retailer is present in this commercial space?", "choices": ["circuit city", "best buy", "gamestop", "target"], "correct_choice_idx": 1, "direct_answers": ["best buy", "best buy", "best buy", "best buy", "best buy", "best buy", "bike", "best buy", "bike", "best buy"], "difficult_direct_answer": false, "rationales": ["You can see the yellow sign with the word buy on it.", "Best buy's logo is shown.", "Part of the name is visible"], "image": "train2014/COCO_train2014_000000399332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297876, "question_id": "a3y9AKLVPMSCnxnC337PCd", "question": "Why are there five displays on the desk?", "choices": ["multi-tasking", "for sale", "redundancy", "stolen"], "correct_choice_idx": 0, "direct_answers": ["working", "different devices", "gaming", "different uses", "multitasking", "multi-tasking", "more screens", "work", "workflow optimization", "multiple functions"], "difficult_direct_answer": true, "rationales": ["The five displays allow someone to handle work on multiple screens.", "Multiple computers are up and running on a desk.", "When someone is using more than one machine, they are doing more than one thing at a time."], "image": "train2014/COCO_train2014_000000297876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116181, "question_id": "a3zxenJShUgPDWUgC2HkNh", "question": "Which weather phenomenon is likely to be most frustrating to people seen here at this place?", "choices": ["hard freeze", "snow", "heat wave", "cool wind"], "correct_choice_idx": 2, "direct_answers": ["snow", "snowstorm", "storm", "heat wave", "white out", "snowstorm", "global warming", "blizzard", "blizzard", "avalanche"], "difficult_direct_answer": false, "rationales": ["If it's hot, the snow that they need to ski will melt.", "A heat wave would be bad.", "Snow is needed to ski."], "image": "train2014/COCO_train2014_000000116181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178785, "question_id": "a46xZMs9Uhktbhyki3NKnJ", "question": "What are the glass shelves on the left used for?", "choices": ["exercising", "storage", "climbing", "bathing"], "correct_choice_idx": 1, "direct_answers": ["storage", "storage", "toiletries", "storage", "glasses", "storage", "beauty items", "toilet", "holding cups", "corner shelves"], "difficult_direct_answer": false, "rationales": ["The glass shelves can keep shampoos and soaps and cups.", "There are items currently being placed on the objects in question and based on the setting and the composition, this would be the intended use.", "The shelves are for storage."], "image": "train2014/COCO_train2014_000000178785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530811, "question_id": "a4AFAM4aziyxzuyvk4qZHU", "question": "Which mammal will disturb more species with it's movement?", "choices": ["cat", "rat", "man", "dog"], "correct_choice_idx": 3, "direct_answers": ["cat", "dog", "human", "mouse", "dog", "mouse", "dog", "mouse", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["There is a cat and a mouse on his back.", "A dog will disturb them.", "The cat and the rat are sitting on top of the animal on the ground. if the animal closest to the ground moves it will disturb their balance."], "image": "train2014/COCO_train2014_000000530811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84463, "question_id": "a4HjNgrAJZk6qicdA5qdrd", "question": "What was in the glasses before?", "choices": ["red wine", "orange juice", "pineapple juice", "champagne"], "correct_choice_idx": 0, "direct_answers": ["hands", "wine", "wine", "wine", "wine", "wine", "men", "red wine", "wine", "wine"], "difficult_direct_answer": false, "rationales": ["The glasses are wine glasses. there is a red residue visible at the bottom of each glass.", "There are wine markings.", "The glasses had wine."], "image": "val2014/COCO_val2014_000000084463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37244, "question_id": "a4LsAG2WQL2NwXyanii9kW", "question": "What ingredient in the food from the oven provides the most calcium?", "choices": ["mushroom", "meat", "vegetable", "cheese"], "correct_choice_idx": 3, "direct_answers": ["milk", "milk", "cheese", "cheese", "eggs", "cheese", "cheese", "cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["The topping is made from cows.", "Cheese has calcium in it due to dairy.", "It has dairy in it which can be a great sourse of calcium"], "image": "train2014/COCO_train2014_000000037244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431432, "question_id": "a4e75Yy9Q6SqB2SVdY5KER", "question": "Why are the young elephants behind the wooden posts?", "choices": ["for feeding", "for training", "to punish", "to wash"], "correct_choice_idx": 1, "direct_answers": ["for training", "learning", "keep enclosed", "protection", "training", "captivity", "barricaded", "safety", "safety", "training"], "difficult_direct_answer": false, "rationales": ["The animals are being trained.", "The bar is used for direction. it helps keep the elephants in line.", "Fences are put in front of wild animals when they can't behave in front of people."], "image": "val2014/COCO_val2014_000000431432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24522, "question_id": "a4vqoudKsWec4bJtux86Vp", "question": "In which setting is the bus travelling?", "choices": ["inner city", "rural", "suburb", "desert"], "correct_choice_idx": 1, "direct_answers": ["in mountains", "forward", "bus route", "rural", "country road", "rural", "down road", "rural", "rural", "road"], "difficult_direct_answer": false, "rationales": ["They are in an area with no buildings and open fields.", "This looks like there are not a lot of houses or buildings around", "The vegetation and the absence of buildings suggests a countryside location."], "image": "val2014/COCO_val2014_000000024522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136644, "question_id": "a55J7mjZKAWkAcrrdtZgS7", "question": "What is inside the white cups of the people?", "choices": ["beer", "wine", "tea", "juice"], "correct_choice_idx": 2, "direct_answers": ["tea", "water", "coffee", "water", "tea", "tea", "soup", "water", "tea", "tea"], "difficult_direct_answer": false, "rationales": ["The liquid inside the cups is tea because there are tea pots next to the cups", "The cups are tea cups.", "The containers are keeping the liquids warm."], "image": "val2014/COCO_val2014_000000136644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69148, "question_id": "a5RWfiEvv9qxcSnqEtEmoU", "question": "What is this player about to do?", "choices": ["roll", "juggle", "throw", "dribble"], "correct_choice_idx": 2, "direct_answers": ["pitch", "pitch", "throw pitch", "pitch", "throw pitch", "pitch baseball", "throw ball", "throw ball", "throw", "throw baseball"], "difficult_direct_answer": false, "rationales": ["The player is about to throw the baseball.", "The position of the pitcher's arm, and the fact that he is holding a baseball, makes it clear.", "The player is the pitcher who is ready to pitch the ball."], "image": "train2014/COCO_train2014_000000069148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330609, "question_id": "a5Xj9dMCdFLWQ3GEtp4ecq", "question": "A large herbaceous flowering plant is what?", "choices": ["citron", "orange", "banana", "grapes"], "correct_choice_idx": 2, "direct_answers": ["banana", "shrub", "banana", "flower", "perennial", "banana plant", "banana", "banana", "banana plant", "pretty"], "difficult_direct_answer": false, "rationales": ["The key to the answer is the color of what the man is holding.", "It is the largest herbaceous flowering plant. it is a long curved yellow fruit.", "It is a yellow curved fruit."], "image": "train2014/COCO_train2014_000000330609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51324, "question_id": "a5gtoSFYkxY58UCdKhX3ih", "question": "Where is the woman in?", "choices": ["subway", "ferry", "bus", "train"], "correct_choice_idx": 2, "direct_answers": ["office", "bus", "bus", "tram", "subway", "subway train", "train", "waiting room", "bus", "train"], "difficult_direct_answer": false, "rationales": ["The woman is on a bus.", "The inside is a bus.", "The vehicle is in a residential area so it would be one."], "image": "val2014/COCO_val2014_000000051324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530620, "question_id": "a5oHaSpDxZmEKy268YmF6J", "question": "Where will this basket item be ridden?", "choices": ["in air", "roadway", "truck bed", "plane"], "correct_choice_idx": 0, "direct_answers": ["sky", "road", "sky", "air", "in air", "sky", "air", "air", "sky", "sky"], "difficult_direct_answer": false, "rationales": ["There is a balloon and a heater attached to the basket. it is capable of flying.", "The basket is attached to a hot air balloon. if used as intended, the location of the riding will be answer a.", "This is a hot air balloon. these balloons fly in the air."], "image": "val2014/COCO_val2014_000000530620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414588, "question_id": "a6WT7wWmJYRbCApDGAUPbf", "question": "Which is a specific breed of the animal of the table?", "choices": ["scottish fold", "american lamancha", "german shepherd", "holland lop"], "correct_choice_idx": 0, "direct_answers": ["cat", "cat", "cat", "cat", "cat", "cat", "cat", "scottish fold", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["Scottish folds are cats.", "There is a cat on the table which appears to have folded ears. these are also known as scottish fold breed.", "This is the name of one of the many breeds for cats"], "image": "train2014/COCO_train2014_000000414588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210082, "question_id": "a6Xo2owWJMeadrk9NcM8zq", "question": "The floor of the bathroom is made of what material?", "choices": ["carpet", "wood", "vinyl", "stone"], "correct_choice_idx": 3, "direct_answers": ["stone", "stone", "stone", "stone", "tile", "stone tile", "marble", "stone", "stone", "tile"], "difficult_direct_answer": false, "rationales": ["The irregularly shaped pieces are cemented into place.", "The floor appears to be covered in flat, irregular sized hard rocks that are pieced together with grout.", "We see irregularly shaped and square shaped stone tile making up the floor of this image."], "image": "train2014/COCO_train2014_000000210082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427395, "question_id": "a6YgfT4ytKapikFoCmvLpm", "question": "How many people use this bathroom?", "choices": ["five", "four", "six", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are clearly two tooth brushes.", "Two people use it.", "There are two toothbrushes in the cup. there is one toothbrush per person."], "image": "train2014/COCO_train2014_000000427395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341335, "question_id": "a6YjVFdX5q9bvcdmTg7ciK", "question": "What type of facility is displaying the animal head?", "choices": ["bar", "medical office", "hotel", "home"], "correct_choice_idx": 0, "direct_answers": ["bar", "restaurant", "bar", "bar", "bar", "bar", "bar", "restaurant", "bar", "bull"], "difficult_direct_answer": false, "rationales": ["The bottles of booze in the back indicate that it is a tavern.", "The area is a bar.", "There are alcohol bottles on the wall. alcohol bottles and cattle on the wall are found at restaurants."], "image": "train2014/COCO_train2014_000000341335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523007, "question_id": "a6erUKLPXJwNQLQ4MkyHxq", "question": "What act of sportsmanship is about to occur?", "choices": ["finger wag", "fist pump", "head pat", "handshake"], "correct_choice_idx": 3, "direct_answers": ["handshake", "good sportsmanship", "shaking hands", "handshake", "tennis", "handshake", "handshake", "handshake", "handshake", "handshake"], "difficult_direct_answer": false, "rationales": ["The opponents are shaking hands.", "The people are happy with the game.", "Two men in the same color shirts are extending their hands over a net on a tennis court."], "image": "train2014/COCO_train2014_000000523007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98098, "question_id": "a6gtQbToQvwA9uZXhE388n", "question": "What is the name of this game?", "choices": ["ring throw", "discus throw", "skiing", "surfing"], "correct_choice_idx": 1, "direct_answers": ["ultimate frisbee", "club chase", "frisbee", "frisbee", "ultimate frisbee", "discus throw", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["This game is based on a disc thrown.", "Men are running in an open grassy area.", "This is a disc throwing game."], "image": "train2014/COCO_train2014_000000098098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276507, "question_id": "a74uJyibHGVSbtgd99gZG6", "question": "What is the desk behind the cat used for?", "choices": ["computer work", "cooking", "exercising", "painting"], "correct_choice_idx": 0, "direct_answers": ["computer use", "working", "work", "computer work", "towel", "computer", "work", "computer work", "computer support", "computer"], "difficult_direct_answer": false, "rationales": ["The desk has a computer on it for business purposes.", "There is a desk behind the cat used for completing computer work.", "A keyboard can be seen on a desk behind a cat."], "image": "train2014/COCO_train2014_000000276507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396793, "question_id": "a7E6Cir5a3vgYx4NtkEHdU", "question": "What position does the boy in blue most probably fulfil?", "choices": ["reserve", "spotter", "ball boy", "security"], "correct_choice_idx": 2, "direct_answers": ["ball boy", "ball boy", "bellboy", "ball boy", "ball boy", "ball boy", "ball boy", "bellboy", "ball boy", "ball boy"], "difficult_direct_answer": false, "rationales": ["The boy is standing in the corner of the court waiting to catch the loose balls.", "He is there to pick up the balls.", "In a professional tennis setting, children of that age would only be on the court and standing with that kind of stance if they were fulfilling the role of ball boy or ball girl."], "image": "train2014/COCO_train2014_000000396793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419678, "question_id": "a7nkFByNWFb7SvwsiW4o7n", "question": "What is he doing?", "choices": ["cleaning up", "eating", "playing", "making lunch"], "correct_choice_idx": 2, "direct_answers": ["eating", "opening container", "holding container", "playing", "touching container", "opening", "getting food", "in fridge", "holding tupperware", "opening container"], "difficult_direct_answer": true, "rationales": ["He's doing nothing productive!", "The baby is holding some containers and is playing with them.", "The child is playing."], "image": "train2014/COCO_train2014_000000419678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286376, "question_id": "a7rMWykjo9r4GkMsGnY6Rj", "question": "This board is used for what sport?", "choices": ["skating", "skateboarding", "surfing", "skiing"], "correct_choice_idx": 1, "direct_answers": ["skateboarding", "skateboarding", "skate boarding", "skateboarding", "skateboarding", "skate boarding", "skateboarding", "skateboarding", "skateboarding", "skate boarding"], "difficult_direct_answer": false, "rationales": ["The board is for skateboarding.", "Long boards with wheels and pictures on the boards are stacked up together.", "This board is used for skateboarding."], "image": "train2014/COCO_train2014_000000286376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300221, "question_id": "a86GGsZgxKRGwVw67CiK3w", "question": "Where does cauliflower come from?", "choices": ["israel", "oregon", "china", "cyprus"], "correct_choice_idx": 3, "direct_answers": ["cyprus", "ground", "farm", "ground", "ground", "garden", "seeds", "flower", "ground", "ground"], "difficult_direct_answer": false, "rationales": ["The cauliflower is a vegetable on the right side. it requires a dry and arid environment which is what cyprus has.", "Cauliflower is shown and can be grown in cyprus.", "Cyprus because the cauliflower originated in the mediterrean sea around asia"], "image": "val2014/COCO_val2014_000000300221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204549, "question_id": "a86gEYU2A57ugBwTimgobW", "question": "What side is served along with this meal in addition to the steak mushrooms and green beans?", "choices": ["cauliflower", "potatoes", "radishes", "turnips"], "correct_choice_idx": 1, "direct_answers": ["potatoes", "potatoes", "bread", "potatoes", "potatoes", "potatoes", "potatoes", "potatoes", "potatoes", "potatos"], "difficult_direct_answer": false, "rationales": ["Potatoes are served with steak.", "Potatoes are also on the plate.", "Small, seasoned, white squares of food can be seen next to a steak and vegetables."], "image": "val2014/COCO_val2014_000000204549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348680, "question_id": "a8AfbGQYm5E3F7vGpJ6YkS", "question": "What mode of transport is displayed on the above image?", "choices": ["air", "water", "none", "road"], "correct_choice_idx": 1, "direct_answers": ["ferry", "airplane", "ferry", "cruise ship", "ferry", "boat", "boat", "boat", "water", "ship"], "difficult_direct_answer": false, "rationales": ["The vehicle is a ferry.", "This is a large boat", "You can tell by the large boat she is boarding as to what type of transportation she is using."], "image": "train2014/COCO_train2014_000000348680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565767, "question_id": "a8DSTPSBJZtDTVdUBaY4r7", "question": "What is this boy about to do?", "choices": ["ski", "sled", "slide", "snowboard"], "correct_choice_idx": 3, "direct_answers": ["snowboard", "skateboard", "snowboard", "skate", "snowboard", "descend hill", "snowboard", "play", "snowboard", "skate"], "difficult_direct_answer": false, "rationales": ["A child is standing on a long flat board on a snowy mountain.", "The boy has a snowboard.", "He has both feet on one board in the snow"], "image": "train2014/COCO_train2014_000000565767.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571944, "question_id": "a8HxWQ5WHU6pCjmU2Ebtre", "question": "Why are the animals crowded around the bucket?", "choices": ["to graze", "to bathe", "to rest", "to eat"], "correct_choice_idx": 3, "direct_answers": ["eating place", "eating", "eating", "eating", "feeding time", "eating food", "eating", "goats", "to eat", "they're eating"], "difficult_direct_answer": false, "rationales": ["The animals are snacking.", "There is food in there", "Everyone here is eager to eat food from the tub."], "image": "val2014/COCO_val2014_000000571944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419780, "question_id": "a9MoUH942yL2UXyBCuiemw", "question": "Why does the animal on the right have its head to the ground?", "choices": ["to dig", "to sit", "to eat", "to drink"], "correct_choice_idx": 2, "direct_answers": ["eating", "eating", "eating", "eating grass", "eating", "to eat", "eating grass", "eating", "grazing", "grazing"], "difficult_direct_answer": false, "rationales": ["The animal is eating grass.", "The animal to the right has its head lowered down to graze in the grass.", "The animal is eating grass."], "image": "train2014/COCO_train2014_000000419780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77981, "question_id": "a9RHbcSemw3rS24moc82Kh", "question": "Why has he covered his head?", "choices": ["safety", "fashion", "disguise", "religion"], "correct_choice_idx": 0, "direct_answers": ["protection", "protect head", "safety protection", "protection", "protection", "protection", "safety", "protection", "protection", "protection"], "difficult_direct_answer": false, "rationales": ["He wants to be safe.", "The person is skiing and currently in midair. they are wearing a helmet and because of the activity they are currently engaging in and the general purpose of wearing a helmet, answer a is correct.", "His head is covered for safety purposes."], "image": "train2014/COCO_train2014_000000077981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44065, "question_id": "a9VWnSuU3siwgCMjzQf2EP", "question": "What material is used to make the stick on the woman's shoulder?", "choices": ["metal", "bamboo", "wood", "plastic"], "correct_choice_idx": 1, "direct_answers": ["wood", "bamboo", "wood", "bamboo", "wood", "wood", "bamboo", "wood", "bamboo", "wood"], "difficult_direct_answer": false, "rationales": ["It is a firm but flexible asian wood. it grows commonly where she lives.", "The woman is carrying a pole that is made of bamboo.", "The woman's baskets are made of bamboo woven together."], "image": "val2014/COCO_val2014_000000044065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533134, "question_id": "a9mT2XKijpmvSbyTQ5qLhN", "question": "Where might this child be located?", "choices": ["texas", "california", "florida", "colorado"], "correct_choice_idx": 3, "direct_answers": ["mountains", "snowy state", "mountain", "mountain", "mountain", "ski resort", "colorado", "mountainside", "colorado", "snow"], "difficult_direct_answer": false, "rationales": ["The child is skiing in an area that is blanketed by snow. florida, texas, and most of california are too warm to get significant amounts of snow.", "Only state that has mountains and cold weather.", "Of the four options, it is the state with the most mountains."], "image": "train2014/COCO_train2014_000000533134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561746, "question_id": "a9xPAkeH4VAgRKByFP2hm4", "question": "What move is the tennis player adopting?", "choices": ["lob", "backhand", "forehand", "serve"], "correct_choice_idx": 2, "direct_answers": ["forehand", "swinging racket", "forehand", "serve", "jump", "throw ball", "forehand", "forehand", "playing", "hitting"], "difficult_direct_answer": false, "rationales": ["The tennis player is making a forehand swing.", "The player is doing a fronthand move.", "The tennis player has a forehand swing."], "image": "train2014/COCO_train2014_000000561746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55545, "question_id": "a9ziUCWANwbF2jr4SBDMKi", "question": "Where is the train from?", "choices": ["czech republic", "america", "china", "japan"], "correct_choice_idx": 0, "direct_answers": ["germany", "ceske", "unknown", "israel", "new york", "czech republic", "europe", "spain", "czech republic", "prague"], "difficult_direct_answer": true, "rationales": ["The train is from czech.", "That's where ceske drahy operates.", "It has roman lettering that is not in english."], "image": "val2014/COCO_val2014_000000055545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332543, "question_id": "aA8pkdX2i33kiva5e9rkF6", "question": "What word is on the sign?", "choices": ["go", "look", "driving", "green"], "correct_choice_idx": 2, "direct_answers": ["stop", "stop", "stop", "driving", "stop driving", "stop", "stop", "four way", "stop", "stop driving"], "difficult_direct_answer": false, "rationales": ["Also stop and 4way", "There is a word under stop.", "Besides the normal word \"stop\" the word below is obvious to understand."], "image": "train2014/COCO_train2014_000000332543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256607, "question_id": "aAFWmizc2ygv4UbCaJqXeQ", "question": "What is the bus doing?", "choices": ["going", "backing up", "yielding", "being parked"], "correct_choice_idx": 2, "direct_answers": ["stopping", "yielding", "loading", "boarding", "driving", "stopping", "travelling", "transporting", "driving", "traveling"], "difficult_direct_answer": false, "rationales": ["The bus is letting the pedestrian have the right of way.", "It appears to be doing a because of a pedestrian or guard.", "The bus is yielded to traffic."], "image": "train2014/COCO_train2014_000000256607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468751, "question_id": "aAUToRpxnbK9aVL6ef4YR3", "question": "What is most likely hiding inside the shoes closest to the camera?", "choices": ["paper", "water", "feet", "bugs"], "correct_choice_idx": 2, "direct_answers": ["feet", "feet", "socks", "feet", "socks", "socks", "feet", "socks", "socks", "socks"], "difficult_direct_answer": false, "rationales": ["The feet are hidden.", "The shoes in the photo are worn on the feet of a woman walking on the street.", "We see the leg's of this woman disappearing at the bottom into her boots. it is likely the feet of this woman are inside of these boots."], "image": "val2014/COCO_val2014_000000468751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162753, "question_id": "aAkQMBmwzapwNLUz3CUYNn", "question": "What kind of sport it is?", "choices": ["volley ball", "basket ball", "throw ball", "golf"], "correct_choice_idx": 1, "direct_answers": ["dodgeball", "basket ball", "ball game", "dodge ball", "hitting sport", "dodgeball", "dodgeball", "dodgeball", "dodgeball", "hand ball"], "difficult_direct_answer": false, "rationales": ["The people are actually playing dodgeball.", "The sport is basketball.", "The people are throwing the balls without a net."], "image": "train2014/COCO_train2014_000000162753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542509, "question_id": "aAkSEghyrkQ59bomgpMpgq", "question": "The speaker at the bottom of the left desk is optimized to produce what type of sound frequency?", "choices": ["highs", "mids", "bass", "treble"], "correct_choice_idx": 2, "direct_answers": ["high", "bass", "hertz", "megahertz", "audible", "low", "low-pitched", "hertz", "bass", "bass"], "difficult_direct_answer": false, "rationales": ["It is a special speaker for the low tones.", "Speakers are known for giving loud sounds.", "There are bass speakers on the bottom of the desk."], "image": "val2014/COCO_val2014_000000542509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406196, "question_id": "aAnebFpyrYvJEvRgcg2cPf", "question": "What are the horses standing on?", "choices": ["water", "dirt", "snow", "sticks"], "correct_choice_idx": 1, "direct_answers": ["grass", "field", "ground", "grass", "dirt", "dirt", "ground", "dead grass", "dirt", "grass"], "difficult_direct_answer": false, "rationales": ["The horses are gathered together in their pasture on their farm, and there is dirt beneath their hooves.", "The horses are on brown earth.", "The horses are standing on dirt."], "image": "train2014/COCO_train2014_000000406196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396623, "question_id": "aAsZ9dAKGYBzHQ83jQyhWK", "question": "Why are there slats in the fence behind the fire hydrant?", "choices": ["wind break", "privacy", "advertising space", "sun shade"], "correct_choice_idx": 1, "direct_answers": ["privacy", "show painting", "protection", "prevent vandalism", "privacy", "view picture", "air", "painting", "repel rodents", "prevent graffiti"], "difficult_direct_answer": true, "rationales": ["The slats obscure anyone from peering in.", "The slats give privacy.", "So no one can see through."], "image": "train2014/COCO_train2014_000000396623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490670, "question_id": "aAxaoxazBiRb7JJ885wjub", "question": "The bumpy yellow tile behind the stop sign is part of what infrastructure feature?", "choices": ["bus stop", "sidewalk", "breakdown lane", "pedestrian crossing"], "correct_choice_idx": 1, "direct_answers": ["wheelchair guards", "sidewalk", "sidewalk", "sidewalk", "blind assistance", "blind people", "crosswalk", "nonslip sidewalk", "sidewalk", "traffic regulation"], "difficult_direct_answer": false, "rationales": ["There is a bumpy yellow sidewalk grating.", "The bumpy yellow surface is part of the area used for walking.", "This is part of the sidewalk."], "image": "train2014/COCO_train2014_000000490670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561780, "question_id": "aAy87dkDsGXAA8XLsjrA8V", "question": "For what do people seated here wait?", "choices": ["bus", "taxi", "lunch", "car"], "correct_choice_idx": 0, "direct_answers": ["bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["The map in the background indicates this stop is a bus stop.", "They are waiting for a bus.", "The people here are all sat ready for the bus."], "image": "val2014/COCO_val2014_000000561780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201352, "question_id": "aB37JG46Eu87vsKecDtzBq", "question": "Why is he standing on his hands?", "choices": ["resting", "showing off", "cleaning beach", "bad legs"], "correct_choice_idx": 1, "direct_answers": ["trick", "skateboard trick", "balancing", "tricks", "skateboard trick", "skateboard trick", "showing off", "skating trick", "doing trick", "trick"], "difficult_direct_answer": false, "rationales": ["This is not the traditional way to use a skateboard and takes a lot of skill. people do things requiring skill beyond intended uses to show off.", "Hot tricks like this attract attention.", "This is not the traditional way to use a skateboard, but is part of a trick. if one is doing tricks in a public setting they can be said to be doing answer a."], "image": "train2014/COCO_train2014_000000201352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483069, "question_id": "aBDHBejddmx22q3UZ2qacW", "question": "Gestation period of the cat is what?", "choices": ["80days", "58-68days", "25days", "30-35days"], "correct_choice_idx": 1, "direct_answers": ["9-10 weeks", "64 days", "60 days", "58-68days", "two months", "58-67 days", "60 days", "dog", "two months", "60 days"], "difficult_direct_answer": false, "rationales": ["A cat's pregnancy lasts about 58-68 days.", "It is the correct the time.", "The period is 58 days."], "image": "train2014/COCO_train2014_000000483069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420007, "question_id": "aBFyrDouTHENAjXBqG59BE", "question": "What is being used to maintain the steam engine's momentum?", "choices": ["wood", "coal", "steel", "electricity"], "correct_choice_idx": 0, "direct_answers": ["steam power", "coal", "firetube boilers", "coal", "wood", "coal", "coal", "wood", "coal", "wood"], "difficult_direct_answer": false, "rationales": ["Logs are stacked in the car behind the engine.", "The steam is coming out of the top of the train. coal is burned in a train.", "Wood keeps the engine going."], "image": "train2014/COCO_train2014_000000420007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250893, "question_id": "aBRdKJFbvZLPbqUWRxBMoP", "question": "How do these people know each other?", "choices": ["coworkers", "teammates", "classmates", "family"], "correct_choice_idx": 3, "direct_answers": ["friends", "friends", "family", "family", "family", "neighbors", "neighbors", "family", "family", "family"], "difficult_direct_answer": false, "rationales": ["The exact relationship is impossible to tell from the image, but the group of people are all seen at a home setting so they may be related.", "These people are family members since they're in a backyard.", "The people are family."], "image": "val2014/COCO_val2014_000000250893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398544, "question_id": "aBaxrS4ghyg6ZwnE3EV9Kn", "question": "What is the maximum number of players who can play simultaneously in this image?", "choices": ["16", "four", "eight", "22"], "correct_choice_idx": 0, "direct_answers": ["16", "sixteen", "four", "four", "sixteen players", "16", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four courts.", "Two can play on each side of these four courts", "There are four courts shown and four people can play per doubles match of tennis."], "image": "train2014/COCO_train2014_000000398544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189982, "question_id": "aBerKm7d6JTUHZiEVnuhBR", "question": "Upon which type of moving thing would you normally expect to see someone dressed like the skateboarder?", "choices": ["car", "surfboard", "ship", "scooter"], "correct_choice_idx": 2, "direct_answers": ["pirate ship", "ship", "pirate ship", "ship", "ship", "pirate ship", "ship", "ship", "pirate ship", "pirate ship"], "difficult_direct_answer": false, "rationales": ["A pirate would spend most of his time on a sea-going vessel, coming ashore mostly to resupply dwindling stocks with food/ordnance.", "The skateboarder is dressed like a pirate. pirates do not normally use surfboards, scooters, or cars.", "A skateboarder is holding a sword and wearing a hat with a skull and crossbones on it. the skateboarder is also wearing a jacket and boots."], "image": "train2014/COCO_train2014_000000189982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309618, "question_id": "aBgBAH2kkZt58wUBZDcje9", "question": "Why is the child in the workspace?", "choices": ["keep quiet", "help out", "watch her", "keep clean"], "correct_choice_idx": 2, "direct_answers": ["remote work", "he's sleeping", "sleeping", "on train", "nursery", "on train", "napping", "sleeping", "sick", "watch her"], "difficult_direct_answer": false, "rationales": ["The child is a sleeping baby. she is too young to help out, does not need to be cleaned, and does not need to be kept quiet while asleep.", "The person is watching the child.", "The child is in the workspace being watched."], "image": "train2014/COCO_train2014_000000309618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331475, "question_id": "aBiB7irUeyWJFS9ZuxDr5f", "question": "What do the stains on the middle top metal thing come from?", "choices": ["blood", "juice", "oil", "cigarettes"], "correct_choice_idx": 3, "direct_answers": ["cigarettes", "cigarettes", "cigarettes", "cigarettes", "cigarettes", "smoking", "cigarettes", "cigarette butts", "cigarettes", "grill"], "difficult_direct_answer": false, "rationales": ["There are black ash spots on the metal item.", "That is an ash tray.", "The stains come from cigarette butts."], "image": "val2014/COCO_val2014_000000331475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229827, "question_id": "aBw5jCFRM8hQwjFAnaYsNW", "question": "Why are the remotes strapped to their wrists?", "choices": ["safety", "punishment", "style", "visibility"], "correct_choice_idx": 0, "direct_answers": ["keep inplace", "safety", "playing", "safety", "playing wii", "playing wii", "playing wii", "safety", "safety", "protection"], "difficult_direct_answer": false, "rationales": ["The remotes are for safety.", "The remotes in question are wii remotes based on their size, shape and design. when these remotes are used as intended, they can fly out of someone's hand unless tethered.", "The remotes won't go flinging off their wrists with the straps."], "image": "train2014/COCO_train2014_000000229827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221322, "question_id": "aC6HwuevAJybtubD69uMf3", "question": "What people does the bus drive around?", "choices": ["military personnel", "politicians", "doctors", "rugby players"], "correct_choice_idx": 3, "direct_answers": ["rugby players", "rugby players", "rugby players", "tourists", "athletes", "rugby players", "toulon provence", "athletes", "athletes", "tour"], "difficult_direct_answer": false, "rationales": ["There is writing on the side of the bus that describes what kind of bus it is and the type of people can then be inferred.", "It says what is on the side of bus.", "A bus has a rugby logo on the side of it."], "image": "train2014/COCO_train2014_000000221322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530497, "question_id": "aC6mPoUXiDQQDojHCN4iHK", "question": "The food that was probably recently consumed was of what variety?", "choices": ["fruit", "vegetables", "dessert", "grains"], "correct_choice_idx": 2, "direct_answers": ["dessert", "breakfast", "dessert", "breakfast", "breakfast", "fruit", "breakfast", "breakfast", "dessert", "pancakes"], "difficult_direct_answer": false, "rationales": ["The food was probably sweet since there are fruits and syrup on the plate.", "There is an orange and remnants of syrup on the plate.", "There is a red sauce that looks sweet. after dinner treats are sweet."], "image": "train2014/COCO_train2014_000000530497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221669, "question_id": "aC73KvdjE6EFMwyiszG58H", "question": "In what style park does the skateboarder skate?", "choices": ["skate park", "state park", "store", "aviary"], "correct_choice_idx": 0, "direct_answers": ["skate park", "cement", "in air", "freestyle", "skate park", "skate park", "normal", "amateur", "skateboard park", "slope style"], "difficult_direct_answer": false, "rationales": ["The place is make for skateboards and skaters.", "The man is in a skate park.", "The park is for skating."], "image": "val2014/COCO_val2014_000000221669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302838, "question_id": "aC9zBbLpdPHnTmeaKLvGeA", "question": "What is the woman receiving on the stage?", "choices": ["haircut", "award", "diploma", "book"], "correct_choice_idx": 0, "direct_answers": ["haircut", "shave", "haircut", "haircut", "haircut", "haircut", "haircut", "shave", "haircut", "haircut"], "difficult_direct_answer": false, "rationales": ["The man is a stylist and cutting her hair to show how to do it on stage.", "The man has scissors and she's wearing a cape", "The barber is seen and his work is to cut the hair."], "image": "train2014/COCO_train2014_000000302838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448176, "question_id": "aCCj6P9EgoBjKVPYgQzv4P", "question": "Why are the vehicles on the boat?", "choices": ["cross water", "for sale", "cleaning them", "free ride"], "correct_choice_idx": 0, "direct_answers": ["crossing", "crossing water", "transport", "transport", "cross water", "transporting cars", "ferrying across", "transporting them", "ferry transport", "carrying"], "difficult_direct_answer": true, "rationales": ["The vehicles on the boat can't traverse water so they need the boat to get across, which is what's happening here.", "The vehicles are crossing.", "Vehicles can't swim or drive on water."], "image": "val2014/COCO_val2014_000000448176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282843, "question_id": "aCVJQh8c66EgwpccfNrpQw", "question": "How many people are waiting to do the activity?", "choices": ["ten", "nine", "seven", "eight"], "correct_choice_idx": 2, "direct_answers": ["two", "seven", "two", "seven", "two", "seven", "seven", "seven", "two", "seven"], "difficult_direct_answer": false, "rationales": ["Seven people are waiting to surf because they have surf boards in their hands", "There are seven people waiting on the sidelines.", "The activity taking place is surfing and there are seven people in total who are currently holding their boards and not actively participating."], "image": "train2014/COCO_train2014_000000282843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192955, "question_id": "aCWrsAp4hgbuo23B9PCjju", "question": "What sort of fuel does the driving mechanism for moving the train use?", "choices": ["gas", "hay", "oil", "natural gas"], "correct_choice_idx": 1, "direct_answers": ["grain", "horsepower", "hay", "horse", "horse", "horsepower", "horse", "hay", "horse", "gas"], "difficult_direct_answer": false, "rationales": ["The cart is being pulled by the horse and the horse feeds off of hay.", "The horse eats hay to give it energy to move the train.", "The train is powered by the horse."], "image": "train2014/COCO_train2014_000000192955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182967, "question_id": "aCfdez6iLipvqKbbM3BhUU", "question": "What other sports might one play here?", "choices": ["tennis", "golf", "soccer", "snowboarding"], "correct_choice_idx": 3, "direct_answers": ["snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["In addition to skiing, a wide single board can be used to traverse down a snow covered hill.", "This is a snowy slope where snow sports can be played.", "It is a ski hill, where snowboarding is also completed on for downhill gun."], "image": "train2014/COCO_train2014_000000182967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463319, "question_id": "aCfyjNjpu2veGqfk4gdpPz", "question": "What shape is the object used to play this game?", "choices": ["oval", "disk", "oblong", "round"], "correct_choice_idx": 3, "direct_answers": ["sphere", "round", "round", "round", "diamond", "round", "round", "round", "round", "stick"], "difficult_direct_answer": false, "rationales": ["In this particular sport bats are used to hit circular leather balls.", "The ball is round.", "A baseball is used to play the game."], "image": "train2014/COCO_train2014_000000463319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527163, "question_id": "aD4eowgmd9icxu6i7f7gsM", "question": "What kind of birds are most clearly visible here?", "choices": ["ducks", "geese", "pigeon", "albatross"], "correct_choice_idx": 2, "direct_answers": ["pigeons", "pigeon", "pigeon", "pigeons", "pigeon", "pigeons", "pigeons", "pigeons", "pigeons", "blackbirds"], "difficult_direct_answer": false, "rationales": ["This type of bird is known for lounging in metro areas as is seen here.", "These are all pigeons.", "These birds gather around people with food."], "image": "val2014/COCO_val2014_000000527163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300333, "question_id": "aDVRCeigUpUeV2dTWQvmRf", "question": "What are the little specks on the bun?", "choices": ["mold", "poppyseed", "dirt", "ants"], "correct_choice_idx": 1, "direct_answers": ["poppy seeds", "poppy seeds", "donuts", "seeds", "poppyseed", "sesame seeds", "poppy seeds", "poppy seeds", "poppy seeds", "sesame seeds"], "difficult_direct_answer": false, "rationales": ["The specks are poppy seeds on the bread.", "The specks are poppy seeds.", "The seeds are edible and poppyseed are a type of seed that are on a bun."], "image": "train2014/COCO_train2014_000000300333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390000, "question_id": "aDdXhmpp4PmmB8vFKeAtw3", "question": "What country is this?", "choices": ["canada", "australia", "uk", "usa"], "correct_choice_idx": 2, "direct_answers": ["england", "england", "england", "uk", "england", "england", "usa", "usa", "england", "england"], "difficult_direct_answer": false, "rationales": ["Double decker buses are common in great britain. there is also a british flag on the side of the bus.", "The uk is known for double decker buses.", "This is in the united kingdom."], "image": "train2014/COCO_train2014_000000390000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160968, "question_id": "aDp2PrqTVbJAn6WGuPpy4Y", "question": "What are the metal shapes attached to the front of the horse's breast collar?", "choices": ["id tags", "breed certifications", "police badges", "trophy plaques"], "correct_choice_idx": 2, "direct_answers": ["harness", "badges", "badge", "badge", "harness", "rope", "badges", "police badges", "halter", "harness"], "difficult_direct_answer": false, "rationales": ["Theses horses are police force horses.", "This indicates they are law enforcement", "The metal shapes are part of the police's uniforms."], "image": "train2014/COCO_train2014_000000160968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388161, "question_id": "aDq3N7YYJk4fUAspgNeM3j", "question": "What is the man doing to the horse's tail?", "choices": ["pulling it", "coloring it", "grooming it", "cutting it"], "correct_choice_idx": 2, "direct_answers": ["tying ribbon", "attaching ribbon", "grooming it", "tying it", "braiding", "decorating", "tying bow", "washing", "braiding tail", "tying"], "difficult_direct_answer": true, "rationales": ["Horses are customarily groomed, and it is not a known thing to pull, colour, or cut a horses tail.", "The man is brushing the tail.", "A man is standing at the back end of a horse with a brush."], "image": "val2014/COCO_val2014_000000388161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505530, "question_id": "aEAeFte6MCE5HX4hbkGSch", "question": "What is the white on the grass near the penguins?", "choices": ["sand", "carpet", "foam", "snow"], "correct_choice_idx": 3, "direct_answers": ["snow", "snow", "sand", "snow", "snow", "sand", "snow", "snow", "sand", "snow"], "difficult_direct_answer": false, "rationales": ["Penguins are arctic creatures.", "The white is from snow.", "Penguins live in cold climates. snow can be seen all around."], "image": "val2014/COCO_val2014_000000505530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162682, "question_id": "aEJindDc6V2HZAtAhqWN6V", "question": "What is the brown areas on the barrels?", "choices": ["animal droppings", "rust", "syrup stains", "paint"], "correct_choice_idx": 1, "direct_answers": ["dirt", "rust", "rust", "dirt", "rust", "rust", "rust", "rust", "rust", "rust"], "difficult_direct_answer": false, "rationales": ["The barrels are made out of metal. metal rusts.", "The area is rusty.", "The barrels appear to contain a liquid and based on the pattern of the color and the tone it is most likely the liquids have interacted with metal and rusted."], "image": "train2014/COCO_train2014_000000162682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88854, "question_id": "aEoooMmS7bozbrEbZv3rV2", "question": "What type of event is being held?", "choices": ["lodge party", "race", "ski party", "bunny hop"], "correct_choice_idx": 1, "direct_answers": ["skiing", "race", "skiing", "crosscountry skiing", "skiing", "skiing", "cross-country skiing", "race", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["They have race pinnies on.", "The event is a race.", "The participants are wearing bibs with numbers on them and appear to be using high-quality professional equipment. these features are consistent with an organized competition."], "image": "val2014/COCO_val2014_000000088854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398063, "question_id": "aF4cGx6GVcHsNxkBVMEEw5", "question": "How is this woman preventing grass stains?", "choices": ["rubber sheet", "plastic tarp", "blanket", "vinyl sheet"], "correct_choice_idx": 2, "direct_answers": ["blanket", "blanket", "blanket", "blanket", "blanket", "blanket", "blanket", "blanket", "blanket", "blanket"], "difficult_direct_answer": false, "rationales": ["The pattern on the item is that of a woven fabric, and is an easier item to carry around and transport to a public area.", "The plaid cloth protects her.", "The woman is using a blanket to prevent grass stains on her white dress."], "image": "val2014/COCO_val2014_000000398063.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517455, "question_id": "aFCfkPFDMCQypQ4axRNiW5", "question": "Why are they all looking at the train?", "choices": ["fearful", "want ride", "annoyed", "like colors"], "correct_choice_idx": 1, "direct_answers": ["boarding", "boarding", "boarding soon", "enter train", "awaiting arrival", "waiting", "waiting embark", "waiting", "seeing friends", "want ride"], "difficult_direct_answer": false, "rationales": ["The people are looking at the train because the want to get on and ride it somewhere.", "They are facing that direction because they are waiting for it to stop so they can board it.", "They want a ride."], "image": "train2014/COCO_train2014_000000517455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12698, "question_id": "aFPfMiy3t5FdgEKcyJeQLL", "question": "What type of food is shown?", "choices": ["soup", "fruit", "sandwiches", "donuts"], "correct_choice_idx": 2, "direct_answers": ["subs", "korean", "sub", "sandwich", "sandwich salad", "sandwiches", "sandwich", "korean", "sandwiches", "sandwich"], "difficult_direct_answer": false, "rationales": ["Vegetables and other ingredients are being served on bread.", "The food item on the right of this picture is made up of folded bread with meat and vegetables in between.", "Sandwiches are shown."], "image": "train2014/COCO_train2014_000000012698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318630, "question_id": "aFdjCNVYyBFQU3gfsnB9wh", "question": "What are the people turning to look at?", "choices": ["animal", "camera", "sea", "traffic"], "correct_choice_idx": 1, "direct_answers": ["camera", "camera", "view", "camera", "camera", "camera", "camera", "camera", "camera", "photographer"], "difficult_direct_answer": false, "rationales": ["People are turning back to look at the camera.", "The people are posing.", "They are all looking at the photographer"], "image": "train2014/COCO_train2014_000000318630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10966, "question_id": "aG7m6Mx2VwAPDaPQ4BDQvn", "question": "What's the name of the green holder the man's can is in?", "choices": ["mug", "jacket", "koozie", "folder"], "correct_choice_idx": 2, "direct_answers": ["koozie", "foam sleeve", "cookie", "beer cookie", "bottle", "cookie", "not visible", "cup", "drink sleeve", "koozie"], "difficult_direct_answer": false, "rationales": ["The green holder is called a koozie.", "Koozies were first seen in australian surfer culture in the 1970s, and have spread around the world since then. they're a great way to advertise a product, because they're cheap to produce and relatively visible.", "Cans need covers to protect hands from cold cans."], "image": "val2014/COCO_val2014_000000010966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540093, "question_id": "aG9VNYcipYrfQJ2N5fE72d", "question": "Why is plastic used to cover plants?", "choices": ["draw attention", "protect them", "hide them", "aesthetics"], "correct_choice_idx": 1, "direct_answers": ["protection", "prevent freezing", "protection", "frost", "protection", "insulate", "protection", "frost", "protect them", "protection"], "difficult_direct_answer": false, "rationales": ["Plants are covered in plastic. covering plants protects them from cold weather.", "The plastic is trying to protect the plants.", "When the weather is bad, it's good to cover the plants with plastic."], "image": "val2014/COCO_val2014_000000540093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14630, "question_id": "aGL8JZWhu8XVvc6GWuiq97", "question": "Where is the silver plane on the right being stored?", "choices": ["home", "hotel", "hanger", "shed"], "correct_choice_idx": 2, "direct_answers": ["hanger", "hanger", "airport hangar", "hanger", "store", "garage", "hangar", "hangar", "hanger", "hangar"], "difficult_direct_answer": false, "rationales": ["The other options don't work in this setting and an a is the standard option.", "The plane is in a large garage type area.", "A building where planes are stored is called a hangar."], "image": "train2014/COCO_train2014_000000014630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145693, "question_id": "aGiBarwtkd6jPy54HepLrU", "question": "What is number fifteen doing on the field?", "choices": ["practicing", "attacking", "throwing", "batting"], "correct_choice_idx": 0, "direct_answers": ["warming up", "swinging bat", "swinging bat", "warming up", "swinging bat", "warming up", "stretching", "practicing", "warming up", "warming up"], "difficult_direct_answer": false, "rationales": ["They are warming up for when it's their turn", "He is swinging the bat to warm up for his performance.", "Batters often wave their bats around before they play."], "image": "val2014/COCO_val2014_000000145693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159370, "question_id": "aH2BWuwwJGYGhv3WEtsBpJ", "question": "What move is this male player using?", "choices": ["serve", "forehand", "backhand", "lob"], "correct_choice_idx": 1, "direct_answers": ["forehand hit", "upper", "forehand", "overhand", "overhand", "forehand", "swing", "forward swing", "forehand", "backhand"], "difficult_direct_answer": false, "rationales": ["He is using a forehand move.", "This man's palm is facing the direction he is swinging his racket. this would be classified as a forehand shot in tennis.", "The player is swiping forward and into their body so it's a forehand motion."], "image": "train2014/COCO_train2014_000000159370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276458, "question_id": "aHHjMEJGPHU5giQQ4fJQKs", "question": "What shape is the kite to the left?", "choices": ["square", "octagon", "crescent", "hexagon"], "correct_choice_idx": 2, "direct_answers": ["crescent", "c shaped", "half circle", "crescent", "half round", "semi circle", "moon", "half moon", "horizon", "crescent"], "difficult_direct_answer": false, "rationales": ["They look like croissants.", "The kites are semi circular.", "Kites are being flown and are the shape of half moons."], "image": "val2014/COCO_val2014_000000276458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235644, "question_id": "aHW7jMS6gkankGxgZxQahK", "question": "Which major US city does this train line service?", "choices": ["philadelphia", "new york", "boston", "washington d.c"], "correct_choice_idx": 1, "direct_answers": ["new york", "long island", "long island", "new york", "long island", "long island", "new york", "new york", "new york", "new york"], "difficult_direct_answer": false, "rationales": ["The logo of the train says mta for new york.", "The train is long island rail road so they are in ny.", "The city is new york."], "image": "val2014/COCO_val2014_000000235644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520063, "question_id": "aHuj3wmKCdVTbXA2mTd5W4", "question": "Why are the nearby pedestrians blurry?", "choices": ["heavy post-processing", "they're moving", "earthquake", "camera's moving"], "correct_choice_idx": 1, "direct_answers": ["moving quickly", "moving", "they're moving", "unfocused", "bad focus", "walking", "photographer moved", "movement", "moving", "moving"], "difficult_direct_answer": false, "rationales": ["The pedestrians are walking.", "The people are moving very fast.", "They are walking and in motion."], "image": "train2014/COCO_train2014_000000520063.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28290, "question_id": "aJeY4S7AdLtas7sRBTJg3A", "question": "What are they doing under the canopy?", "choices": ["fighting", "resting", "changing clothes", "eating"], "correct_choice_idx": 2, "direct_answers": ["blocking sun", "talking", "talking", "getting shade", "avoid sunshine", "getting ready", "avoiding sun", "wind shelter", "talking", "changing clothes"], "difficult_direct_answer": false, "rationales": ["With all the garments on the ground, that looks like what they're doing.", "The people change.", "The people are changing into wetsuits."], "image": "val2014/COCO_val2014_000000028290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451976, "question_id": "aJnS68on2hhszsHMBExyXj", "question": "Which animal is most threatened here?", "choices": ["fish", "ducks", "man", "dogs"], "correct_choice_idx": 1, "direct_answers": ["dog", "dogs", "duck", "ducks", "ducks", "ducks", "duck", "ducks", "dog", "ducks"], "difficult_direct_answer": false, "rationales": ["The animal is a duck.", "The dog will chase after them if they get close enough.", "The dog could be a predator to the birds."], "image": "train2014/COCO_train2014_000000451976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442786, "question_id": "aLKe6d8sD2fVBuhLwGJxSF", "question": "The man closest to the right has what kind of pants on?", "choices": ["jeans", "khakis", "shorts", "tights"], "correct_choice_idx": 0, "direct_answers": ["jeans", "jeans", "slacks", "jeans", "black", "jeans", "blue jeans", "jeans", "jeans", "jeans"], "difficult_direct_answer": false, "rationales": ["The man's pants are blue. blue denim is the most common material from which jeans are made.", "He wears jeans.", "The man is wearing blue jeans."], "image": "train2014/COCO_train2014_000000442786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248457, "question_id": "aLNimr576EYXJkJGBVu2tC", "question": "Why does the photographer cast a shadow?", "choices": ["takes photo", "reflects light", "blocks light", "avoids light"], "correct_choice_idx": 2, "direct_answers": ["sunlight", "sun", "shadows", "sun behind", "blocks light", "sun", "sun", "sun", "sun", "sunlight"], "difficult_direct_answer": false, "rationales": ["The person taking the photo is blocking the sun and that is what casts the shadow.", "The sun is behind the photographer. when you block the sun it creates shade in a silhouette of the thing doing the blocking.", "The photographer blocks light."], "image": "val2014/COCO_val2014_000000248457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308735, "question_id": "aLQRT9QUaHFBcjeAmBNkDn", "question": "What sort of product is in Glass spiral container?", "choices": ["dairy", "wine", "spicy", "vinegar"], "correct_choice_idx": 0, "direct_answers": ["grated cheese", "salt", "parmesan", "dairy", "grated cheese", "parmesan", "cheese", "pizza", "cheese", "grated parmesan"], "difficult_direct_answer": false, "rationales": ["Parmesan is in the container.", "The container contains grated cheese and cheese is a dairy product.", "The glass jar holds parmesan cheese, which is an item made from cows milk."], "image": "val2014/COCO_val2014_000000308735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37905, "question_id": "aLQihv3A4ArzAQYkG74ZUA", "question": "What is the red object on the ground capable of?", "choices": ["driving around", "flight", "playing music", "catching fish"], "correct_choice_idx": 1, "direct_answers": ["flight", "flying", "flying", "flying", "flying", "flight", "flight", "flight", "flying", "flying"], "difficult_direct_answer": false, "rationales": ["It is a kite that is fun to use in the wind", "The object is a kite which is meant for flying.", "The object is for flight."], "image": "train2014/COCO_train2014_000000037905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232546, "question_id": "aLjSAuSYiHq65H3yzM5DoP", "question": "What type business is the white sign advertising?", "choices": ["pharmacy", "car shop", "route store", "grocer"], "correct_choice_idx": 0, "direct_answers": ["mattress company", "mattress sales", "mattress", "mattress", "mattress company", "mattress", "mattresses", "mattresses", "pharmacy", "theater"], "difficult_direct_answer": false, "rationales": ["Drugs outside of a building indicate this type of store.", "The business is a pharmacy.", "It is actually a mattress store."], "image": "train2014/COCO_train2014_000000232546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348904, "question_id": "aLsgjwBZjAUtAegQbKCGNn", "question": "What is this animal referred to as?", "choices": ["feline", "bovine", "canine", "equine"], "correct_choice_idx": 3, "direct_answers": ["horse", "horse", "horse", "horse", "equine", "horse", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["Equine animals are horses.", "This animal is not a canine (dog), feline (cat), or a bovine (cattle) so therefore it is the only option left in the list. it is a large brown animal pulling a carriage.", "A horse is pulling a cart. horses pull carts in the street."], "image": "train2014/COCO_train2014_000000348904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316595, "question_id": "aNVCnuPx5zszZLf3UhN9BQ", "question": "What is the old man doing?", "choices": ["having constipation", "scaring people", "cheering up", "getting mad"], "correct_choice_idx": 2, "direct_answers": ["smiling", "cheering", "skateboarding", "encouraging", "cheering", "cheering", "crouching", "bent over", "cheering up", "cheering"], "difficult_direct_answer": false, "rationales": ["He looks like he is cheering them on.", "The old man is cheering up the boy on the skateboard.", "The man is cheering up."], "image": "train2014/COCO_train2014_000000316595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120810, "question_id": "aNYUtCMkoYppEfka9pmkJy", "question": "What are the terms referring to?", "choices": ["road rules", "rider code", "biker terms", "financing"], "correct_choice_idx": 3, "direct_answers": ["payment", "payment", "motorbike purchase", "payment", "payment plans", "horsepower", "loan", "payment", "payment", "financing"], "difficult_direct_answer": false, "rationales": ["The sign says \"12, 15 or 25 months pay\" which is financing.", "The other options don't apply here. the 1/5 down makes it clear it's about payment plans.", "These items refer to the terms of finance."], "image": "val2014/COCO_val2014_000000120810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263176, "question_id": "aP4uBx4BqVeaN5Z5LxhAZW", "question": "What are the people ready to do?", "choices": ["eat", "run", "deplane", "board"], "correct_choice_idx": 3, "direct_answers": ["travel", "boating", "boat ride", "ride boat", "board", "sail", "sail", "sail", "ride boat", "board boat"], "difficult_direct_answer": false, "rationales": ["They appear to be waiting to a. the other options don't fit.", "The people are standing on the dock waiting to get on board the boat that is waiting there.", "The people are untying the boat. this means the boat is getting ready to leave the dock."], "image": "train2014/COCO_train2014_000000263176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326344, "question_id": "aP5dCKNswbnt8ZoE5Zr3BC", "question": "What might this knife here cut into?", "choices": ["knife", "dirt", "orange", "skin"], "correct_choice_idx": 2, "direct_answers": ["oranges", "paper", "orange", "oranges", "orange", "tangerine", "pumpkins", "peel", "oranges", "tangerine"], "difficult_direct_answer": false, "rationales": ["The knife can cut the orange.", "Oranges are near.", "The knife is pointing towards the orange."], "image": "val2014/COCO_val2014_000000326344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347097, "question_id": "aPRd9f5NQQLJFSNUeobkTc", "question": "What is provided by this object?", "choices": ["shelter", "shade", "warmth", "moisture"], "correct_choice_idx": 1, "direct_answers": ["shade", "shade", "shade", "shade", "shade", "shade", "shade", "shade", "shade", "shade"], "difficult_direct_answer": false, "rationales": ["The umbrella is used to block out the sunlight.", "The umbrella helps provide shade.", "The umbrella gives shade."], "image": "train2014/COCO_train2014_000000347097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527554, "question_id": "aPVKpvseFmGww4zJVwPoxn", "question": "What is the man that is playing video games wearing?", "choices": ["tie", "suspenders", "hat", "glasses"], "correct_choice_idx": 2, "direct_answers": ["flap cap", "beret", "beige cap", "tan hat", "hat", "polo shirt", "hat", "cap", "hat", "shirt"], "difficult_direct_answer": false, "rationales": ["The man playing video games has a covered head. his eyes are uncovered.", "The man is identifiable based on the remote that is visible in his hand and his positioning in front of the television screen. the most defining feature of his attire is answer a.", "The man has a hat on his head."], "image": "train2014/COCO_train2014_000000527554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327970, "question_id": "aPXdmobrhpFkone7F9vcNg", "question": "What sort of job is the man standing in yellow doing?", "choices": ["secret shopper", "game official", "cheer leader", "field worker"], "correct_choice_idx": 1, "direct_answers": ["coaching", "game official", "referee", "refereeing", "coach", "referee", "referee", "referee", "refereeing", "referee"], "difficult_direct_answer": false, "rationales": ["The man is refereeing the game.", "The man standing is a game official because his yellow shirt is the one that the referees wear", "The man is the referee."], "image": "train2014/COCO_train2014_000000327970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538640, "question_id": "aPp24WXdjobtdnbdbSy6Rj", "question": "In what year was the tv station seen here founded?", "choices": ["1975", "1981", "1986", "1992"], "correct_choice_idx": 1, "direct_answers": ["1981", "two thousand", "early", "1981", "unknown", "1981", "1970", "1981", "nineteen eighty", "1950"], "difficult_direct_answer": false, "rationales": ["The tv station shown was founded in 1981.", "The only tv station seen is mtv and that's when they began airing.", "The year was 1981."], "image": "train2014/COCO_train2014_000000538640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367584, "question_id": "aPxWRcrEXfFkuUEbC9pTyd", "question": "Who is in danger of being struc?", "choices": ["car", "bus", "building", "parked car"], "correct_choice_idx": 0, "direct_answers": ["car", "bus", "black car", "car", "antique car", "little car", "car", "antique automobile", "car", "car"], "difficult_direct_answer": false, "rationales": ["It is angled in front of the larger vehicle", "The bus is very long and the other vehicle is smaller. the bus will hit it with great impact.", "The car comes across the bus hence chance of being hit."], "image": "train2014/COCO_train2014_000000367584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18473, "question_id": "aQHyDpNoizfrah6BqDcLLU", "question": "What is the dog lying on?", "choices": ["couch cushion", "head rest", "blanket", "dog bed"], "correct_choice_idx": 3, "direct_answers": ["dog bed", "dog bed", "dog bed", "dog bed", "pillow", "pillow", "dog bed", "cushion", "cushion", "cushion"], "difficult_direct_answer": false, "rationales": ["The object is square and allows plenty of space for a large breed to get comfortable on it.", "The dog is in a bed.", "The dog is hanging out on a brown bed."], "image": "train2014/COCO_train2014_000000018473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513093, "question_id": "aQPPQXaV3vLEYnpaD6r3pY", "question": "What type of footwear is the boy wearing?", "choices": ["sneakers", "sandals", "converse", "crocs"], "correct_choice_idx": 3, "direct_answers": ["croks", "sandal", "crocs", "croc", "crocks", "crocs", "crocs", "crocs", "sandals", "clogs"], "difficult_direct_answer": false, "rationales": ["The style and color clearly visible on the shoes are found in crocs and no other style of shoes.", "The footwear is crocs.", "They are clogs with heel straps."], "image": "train2014/COCO_train2014_000000513093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81810, "question_id": "aQaEfHhdb4WG6BrCweFtyT", "question": "What is the appliance in this room used for?", "choices": ["cooking", "watching", "cooling", "washing"], "correct_choice_idx": 1, "direct_answers": ["watching", "watching tv", "watching shows", "watching", "entertainment", "watching tv", "watching entertainment", "watching", "watching tv", "control television"], "difficult_direct_answer": false, "rationales": ["It is what a television what designed for people to do.", "The appliance is for watching.", "This room is used for watching tv."], "image": "train2014/COCO_train2014_000000081810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63565, "question_id": "aQnLziovGKHM2iei6y2ue2", "question": "What is the man in the water doing?", "choices": ["wakeboarding", "jet skiing", "swimming", "surfboarding"], "correct_choice_idx": 3, "direct_answers": ["surfing", "skiing", "windsurfing", "surfing", "paddle board", "surfing", "surfing", "surfing", "parasailing", "surfboarding"], "difficult_direct_answer": false, "rationales": ["The man is surfing.", "You can tell by the oar the man is carrying as to what he is doing.", "The man is wearing a wetsuit and is surfing on a board in the ocean."], "image": "val2014/COCO_val2014_000000063565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309120, "question_id": "aQtUkLhczVCaDqie7vx4XH", "question": "What color stripe is on the ball underneath the chain link fence?", "choices": ["white", "red", "green", "blue"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "red", "red", "red", "red", "soccer goalie", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The ball has a red stripe down the middle.", "It is one of the primary colors", "Unless you are colorblind you can tell what color strip it is."], "image": "val2014/COCO_val2014_000000309120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81251, "question_id": "aR456oNMQHVVeq8zYLwDV3", "question": "Why would someone sit at this table?", "choices": ["to work", "to eat", "to paint", "to sew"], "correct_choice_idx": 1, "direct_answers": ["eat food", "to eat", "outdoors", "eat dinner", "to eat", "feed", "to eat", "eat", "to eat", "eating"], "difficult_direct_answer": false, "rationales": ["If they wanted to eat they would sit down.", "There is food at the table. the food is being served to the diners.", "It's a table with food on it."], "image": "val2014/COCO_val2014_000000081251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544629, "question_id": "aRE78VNxWFzTKiUB3gcp9f", "question": "What is slowly melting from the inside of this cake?", "choices": ["ice cream", "mud slide", "marshmallow fluff", "vanilla drizzle"], "correct_choice_idx": 0, "direct_answers": ["chocolate", "chocolate", "chocolate", "ice cream", "chocolate", "fudge frosting", "ice cream", "chocolate", "icecream", "ice cream"], "difficult_direct_answer": false, "rationales": ["Due to warm temperatures that do not continue making the ice cream.", "It's obvious that this is a frozen cake.", "It looks like a chocolate flavor, and that stuff melts at room temperature. it looks like a frozen kind of desert."], "image": "val2014/COCO_val2014_000000544629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257270, "question_id": "aRQeBvn36RvA5rJaDEesep", "question": "What country is this vehicle licensed?", "choices": ["united states", "germany", "canada", "england"], "correct_choice_idx": 0, "direct_answers": ["united states", "usa", "america", "united states", "usa", "usa", "united states", "usa", "united states", "usa"], "difficult_direct_answer": false, "rationales": ["The motorcycle has an indiana license plate.", "The state on the license plate is a us one.", "It has a state license plate on it"], "image": "val2014/COCO_val2014_000000257270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133060, "question_id": "aReqiKZRuJPKefuWEUFW73", "question": "What is the batter waiting for?", "choices": ["pitch", "drink", "race", "touchdown"], "correct_choice_idx": 0, "direct_answers": ["ball", "pitch", "ball", "pitch", "pitch", "ball", "pitch", "pitcher", "pitch", "pitch"], "difficult_direct_answer": false, "rationales": ["His stance and the way he is holding the baseball bat and the direction he is looking all indicate what he is waiting for.", "They haven't seen the ball yet.", "A batter has the bat on his shoulder. batters wait for the ball to be pitched before swinging."], "image": "val2014/COCO_val2014_000000133060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330665, "question_id": "aRpKy7edg2Y9sTefevUuGx", "question": "What is the woman's occupation?", "choices": ["dentist", "photographer", "judge", "priest"], "correct_choice_idx": 1, "direct_answers": ["photographer", "photographer", "photographer", "photographer", "photographer", "realtor photographer", "photographer", "actress", "photographer", "photographer"], "difficult_direct_answer": false, "rationales": ["The woman is holding a professional grade camera.", "The woman is holding a camera in her hand.", "The women is holding a camera in front of her that looks to be a professional quality. based on the dress and background she may be at some kind of an official event, where if she was bringing a camera, it is likely to be her occupation to take pictures."], "image": "val2014/COCO_val2014_000000330665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165455, "question_id": "aS49QXzChY3VGEuAEoMQmX", "question": "Why is one car such a bright unusual color?", "choices": ["taxi", "fashionable", "highlighter company", "promotion"], "correct_choice_idx": 0, "direct_answers": ["green", "taxi", "taxi visibility", "taxi", "taxi", "taxi", "taxi", "taxi", "taxi", "taxi"], "difficult_direct_answer": false, "rationales": ["It is brightly colored so that it can easily be spotted.", "The taxi is a lime green color for visibility.", "Lime green is an easy color to see. car services try to be unique and easy to pick out of all of the traffic for customers."], "image": "val2014/COCO_val2014_000000165455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378940, "question_id": "aSCAkkgKMWpscH5qQiMJ4y", "question": "What company makes the item the man is looking at?", "choices": ["gucci", "tyson", "shell", "wilson"], "correct_choice_idx": 3, "direct_answers": ["tennis", "tennis", "spalding", "nike", "wilson", "wilson", "nike", "wilson", "wilson", "unknown"], "difficult_direct_answer": false, "rationales": ["Based on the eye line of the man he is looking down at the tennis ball. tennis balls are made by answer a.", "Wilson produces tennis racquets.", "The company is wilson."], "image": "val2014/COCO_val2014_000000378940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418701, "question_id": "aSTz5yZgZCCCY4iJpzwkPo", "question": "Originally what was the glass booth designed for?", "choices": ["police monitoring", "phone calls", "sentry guard", "toll taking"], "correct_choice_idx": 1, "direct_answers": ["phone calls", "privacy", "phone booth", "phone calls", "phone booth", "telephone", "phone booth", "phone calls", "phone booth", "phone booth"], "difficult_direct_answer": false, "rationales": ["The glass booth is for phone calls.", "This is a phone booth", "The glass enclosure is used for phone booths and phone calls."], "image": "val2014/COCO_val2014_000000418701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373283, "question_id": "aSaaq3Z7kJ7UoWJdZqgtGH", "question": "What does this person use the umbrella for?", "choices": ["snow", "hail", "sun", "rain"], "correct_choice_idx": 3, "direct_answers": ["rain", "protect rain", "rain protection", "rain", "sun protection", "rain", "shade", "for rain", "protection", "avoid sun"], "difficult_direct_answer": false, "rationales": ["This person is using the umbrella to protect against the rain.", "There are water droplets beside her.", "The person is by rain."], "image": "train2014/COCO_train2014_000000373283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137230, "question_id": "aSggaA3p5c6pLtVnGUu2qn", "question": "What does the store with the green sign sell?", "choices": ["food", "books", "insurance", "tires"], "correct_choice_idx": 0, "direct_answers": ["fruits", "food", "fruit", "polish food", "food", "groceries", "food", "grocery", "groceries", "groceries"], "difficult_direct_answer": false, "rationales": ["You can see an assortment of fruits and vegetables in the stand it also says in the lettering on the top of the stand groceries.", "The sign states it sells food.", "There is produce at this stand."], "image": "train2014/COCO_train2014_000000137230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475319, "question_id": "aSkifJwzd9LMXxxr2PcCzd", "question": "Why are all these people in midair?", "choices": ["performing tricks", "falling", "confused", "lost"], "correct_choice_idx": 0, "direct_answers": ["tricks", "doing trick", "action shot", "tricks", "performing tricks", "doing flips", "snowboarding images", "photo composite", "same person", "performing tricks"], "difficult_direct_answer": false, "rationales": ["The people are performing a snowboard trick midair.", "The people are all on snowboards.", "The people are doing tricks."], "image": "train2014/COCO_train2014_000000475319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279521, "question_id": "aSzZg8A5LZ6zw8Q3cJwo3n", "question": "What transmits electrical energy to the locomotive here?", "choices": ["cell tower", "gas tank", "overhead line", "furnace"], "correct_choice_idx": 2, "direct_answers": ["wires", "cables", "middle section", "rail", "engine", "overhead line", "overhead line", "rails", "engine battery", "wires"], "difficult_direct_answer": false, "rationales": ["Electric cables are visible.", "These types of public transportation vehicles don't use fossil fuels but instead electricity.", "The overhead line contains electricity which the train runs on."], "image": "val2014/COCO_val2014_000000279521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48710, "question_id": "aToryxJE4JPVWT2SXgavE2", "question": "What kind of room is this?", "choices": ["den", "kitchen", "dining room", "hotel room"], "correct_choice_idx": 3, "direct_answers": ["hotel", "hotel", "hotel", "hotel room", "bedroom", "hotel", "doubles room", "bedroom", "hotel", "bedroom"], "difficult_direct_answer": false, "rationales": ["Hotel rooms often have two matching beds, a business style phone, and an information card and this room has all of those so it's definitely a hotel room.", "This is a small hotel room.", "The room is part of a hotel."], "image": "train2014/COCO_train2014_000000048710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115006, "question_id": "aTrZ9SCMhFWcd7JMzF99Zj", "question": "Who is the man crouched behind the catcher?", "choices": ["umpire", "coach", "batter", "announcer"], "correct_choice_idx": 0, "direct_answers": ["umpire", "umpire", "ump", "umpire", "umpire", "ump", "umpire", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["The men are playing baseball. baseball games require someone to mediate the game.", "The man is in an all black uniform and is crouched behind the catcher.", "There is an umpire crouched behind the batter."], "image": "val2014/COCO_val2014_000000115006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560613, "question_id": "aTrhGknQ33v5vUwjANGCxy", "question": "What are the two walking along?", "choices": ["trail", "pier", "bridge", "dock"], "correct_choice_idx": 2, "direct_answers": ["bridge", "bridge", "bridge", "sidewalk", "lakefront", "seaside", "bridge", "ocean", "sidewalk", "bridge"], "difficult_direct_answer": false, "rationales": ["They are walking across the part that connects one side to the other and goes over the water.", "It's a street over water.", "Answer a is most likely because of the visible water in the background and the street lining in the foreground. it also looks like the people are elevated above the water on this structure which is all consistent with answer a."], "image": "val2014/COCO_val2014_000000560613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511407, "question_id": "aTuyTaXDmZfjFUQRWeyzTo", "question": "What water sport is the object the man is holding used in?", "choices": ["wakeboarding", "windsurfing", "surfing", "bodyboarding"], "correct_choice_idx": 3, "direct_answers": ["ocean", "surfing", "ocean", "surfing", "surfing", "bodyboarding", "surfing", "surfing", "ocean", "surfing"], "difficult_direct_answer": false, "rationales": ["The man is holding a body board that is used instead of a surfboard in the ocean.", "This is the classic shape for this item and it has a shark on it", "The man is holding a board as long as his torso for swimming."], "image": "val2014/COCO_val2014_000000511407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307262, "question_id": "aU79zZa2cJE3UKwDfqmJne", "question": "This wall hanging would be most likely seen in what kind of building?", "choices": ["clinic", "gym", "restaurant", "mosque"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "restaurant", "hotel", "pizza restaurant", "restaurant", "restaurant", "pizzeria", "restaurant", "pizza", "pizza shop"], "difficult_direct_answer": false, "rationales": ["The wall hanging is a pizza ad.", "Pizzas are shown.", "The print is of food so it would fit perfectly in a business that offers food."], "image": "val2014/COCO_val2014_000000307262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333978, "question_id": "aUCG94jCBtfa8Lye9ks5GZ", "question": "What carbonated drink is in the green case on the right?", "choices": ["sprite", "mountain dew", "7-up", "schweppes"], "correct_choice_idx": 3, "direct_answers": ["ginger ale", "ginger ale", "ginger ale", "ginger ale", "schweppes", "carbonated water", "ginger ale", "ginger ale", "soda", "ginger ale"], "difficult_direct_answer": false, "rationales": ["The drink is schweppes.", "There is schweppes written on the green box.", "Though the text is partially cut off due to the package being opened we can read the word 'schweppes' written on the green case."], "image": "train2014/COCO_train2014_000000333978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561500, "question_id": "aUFo77QWJZASAemDgkCydA", "question": "What type area is visible here?", "choices": ["gym", "waiting room", "storage", "bathroom"], "correct_choice_idx": 2, "direct_answers": ["storage", "storage area", "storage", "storage", "alley", "storage", "storage area", "garage", "storage", "storage"], "difficult_direct_answer": false, "rationales": ["There are many different items stacked together in the room.", "Here we see a multitude of items stacked and placed for long term storage rather than day to day use.", "There are a lot of things in this area that are not being used. this area is not visited every day."], "image": "val2014/COCO_val2014_000000561500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351017, "question_id": "aUKNzS7rDV5RsSSZ2ZRn8o", "question": "What interest is shared by those seated here?", "choices": ["long shirts", "oenology", "breadsticks", "writing"], "correct_choice_idx": 1, "direct_answers": ["paperwork", "wine", "oenology", "wine", "wine", "wine tasting", "wine", "wine", "wine", "paperwork"], "difficult_direct_answer": false, "rationales": ["Some that are seated at the table appear to be surrounded by and inspecting wine. the official word for wine studying is oenology.", "Oenology is the science and study of wine and wine making. they are grading the wines on paper.", "The people are doing work."], "image": "val2014/COCO_val2014_000000351017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279689, "question_id": "aUQLwPzFeZsHFduunPLmtk", "question": "Why might the clothing be hung up in a row?", "choices": ["to sew", "to decorate", "to sell", "to dry"], "correct_choice_idx": 3, "direct_answers": ["dry", "to dry", "drying", "to dry", "drying", "to dry", "drying", "drying", "drying", "more room"], "difficult_direct_answer": false, "rationales": ["The clothing could all be hung to dry inside of the bed bunk.", "This lets the air flow through the fabric so water evaporates", "This is a clothes line. clothes lines are used to hang dry clothing."], "image": "train2014/COCO_train2014_000000279689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231310, "question_id": "aUvixs9eYnfo7bPBtk625T", "question": "Who is the man in red trying to reach?", "choices": ["green jacket", "red pants", "grey jacket", "pink jacket"], "correct_choice_idx": 0, "direct_answers": ["skier", "summit", "high", "high", "hilltop", "green jacket", "top skier", "reach goal", "top man", "snow season"], "difficult_direct_answer": true, "rationales": ["The skier in the red jacket is trying to get to the top of the hill where the man in green is about to ski down.", "The man wants to catch up to the man in the green jacket.", "The man in green is reaching."], "image": "train2014/COCO_train2014_000000231310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142352, "question_id": "aV2tAez58wjURJKHeeg9V4", "question": "Where the the women playing soccer?", "choices": ["field", "school", "forest", "stadium"], "correct_choice_idx": 3, "direct_answers": ["pitch", "soccer field", "stadium", "stadium", "outdoor arena", "front", "soccer field", "in stadium", "at field", "stadium"], "difficult_direct_answer": false, "rationales": ["The field is large enough to have sponsors and nice seating.", "There are bleachers and advertisements", "The women are playing soccer in the soccer stadium."], "image": "train2014/COCO_train2014_000000142352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352680, "question_id": "aV3boBsd6MDefzg6oVekmC", "question": "What are is this woman doing?", "choices": ["sports", "campfire", "hike", "camping"], "correct_choice_idx": 3, "direct_answers": ["camping", "camping", "sitting", "camping out", "sitting", "sunbathing", "camping out", "camping", "sitting", "sitting down"], "difficult_direct_answer": false, "rationales": ["The woman is camping with a tent.", "She has a tent out so she can stay the night there.", "A woman is sitting in a grassy area with a folding chair."], "image": "train2014/COCO_train2014_000000352680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485483, "question_id": "aV7m4gwV9z7TN7eqjJehwH", "question": "What does the number 84 represent?", "choices": ["table number", "store number", "order number", "price"], "correct_choice_idx": 2, "direct_answers": ["table number", "table", "restaurant name", "table number", "table number", "order number", "order number", "order number", "table number", "restaurant name"], "difficult_direct_answer": false, "rationales": ["It helps the server get the right food to the right diners.", "The number is there so the waiter can know which table to serve.", "This is a restaurant and there are not any numbers placed on the table, so this is likely the 84th order of the day."], "image": "val2014/COCO_val2014_000000485483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413900, "question_id": "aVQVu3x9AF6fVNp8SbsPVw", "question": "What is creating the image on the screen?", "choices": ["tv", "reflector", "plasma", "projector"], "correct_choice_idx": 3, "direct_answers": ["wii controller", "projector", "projector", "remote control", "projector", "computer", "projector", "wii console", "tv", "remote controls"], "difficult_direct_answer": false, "rationales": ["A projector blows up an image.", "These people are creating an image on the giant projector for their video game.", "This is a temporary screen that shows what the electronic points to it"], "image": "val2014/COCO_val2014_000000413900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92207, "question_id": "aVXgy9eYzB2wbuHwqV85tT", "question": "What is likely the most valuable object shown?", "choices": ["power bars", "photocopier", "cds", "hard drives"], "correct_choice_idx": 1, "direct_answers": ["laptop", "laptop", "photocopier", "laptop", "printer", "computer", "laptop", "laptop", "hard drive", "laptop"], "difficult_direct_answer": false, "rationales": ["The other electronics are for a charity project.", "Most of the objects are parts of other objects, that have limited value on their own. the most valuable object is probably the one that is functional without missing parts and not needing to be part of a larger object to work.", "The items are clearly visible in the image. based on their relative costs, answer a is visible and the most valuable."], "image": "train2014/COCO_train2014_000000092207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286190, "question_id": "aVjExzwXt4XAXMNrC2tTWs", "question": "What are these vehicles commonly used for?", "choices": ["tours", "demolition", "fundraisers", "games"], "correct_choice_idx": 0, "direct_answers": ["transport", "rides", "public transportation", "tours", "transportation", "tour bus", "transportation", "use", "city touring", "travel"], "difficult_direct_answer": true, "rationales": ["The other options don't match the use of double decker buses. that said, they might also be used for a d if the money from the tour is given to charity.", "Buses are usually great for tours.", "Double decker buses are usually used for tourists."], "image": "train2014/COCO_train2014_000000286190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539904, "question_id": "aW2SqdwSiZ3jrvAeiuWaE2", "question": "For which emergency is this item utilized?", "choices": ["none", "rain", "heat", "fire"], "correct_choice_idx": 3, "direct_answers": ["fire", "fire", "fires", "fire", "fire", "fires", "fires", "fire", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["Although fire hydrants in america aren't usually painted yellow, it is nevertheless easy to identify this as a fire hydrant, due to its unique shape and location next to the sidewalk.", "Fire hydrants are for fire.", "The emergency is fires."], "image": "val2014/COCO_val2014_000000539904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383066, "question_id": "aW4wsvVQpszMM8TQq9MTgK", "question": "What kind of headwear is she wearing?", "choices": ["bonnet", "scarf", "hat", "hijab"], "correct_choice_idx": 0, "direct_answers": ["silk hat", "bonnet", "bonnet", "amish", "bonnet", "bonnet", "cap", "amish", "hairnet", "kapp"], "difficult_direct_answer": false, "rationales": ["She is amish and that is what they wear.", "A girl has a piece of material with strings on her head.", "The woman has a bonnet on."], "image": "val2014/COCO_val2014_000000383066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381433, "question_id": "aW7vqUiUDs6s6ZWFLhVYSG", "question": "Where is the street Aldwych located?", "choices": ["belfast", "cardiff", "edinburgh", "london"], "correct_choice_idx": 3, "direct_answers": ["london", "london", "england", "london", "city", "england", "uk", "london", "london", "uk"], "difficult_direct_answer": false, "rationales": ["Aldwych is a street in london. the street is on a double decker bus.", "Aldwych has british spelling.", "Aldwych is a main street in the capitol of england."], "image": "train2014/COCO_train2014_000000381433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41962, "question_id": "aWQtnQzRtxazPsc4TeP4sw", "question": "Who supplied her tennis racket?", "choices": ["wilson", "nike", "prince", "puma"], "correct_choice_idx": 2, "direct_answers": ["prince", "penn", "nike", "sponsor", "prince", "prince", "priority one", "prince", "prince", "prince"], "difficult_direct_answer": false, "rationales": ["The woman is hitting a ball with her racket.", "There is a large p on the racket strings.", "The racket has a \"p\" on the net."], "image": "train2014/COCO_train2014_000000041962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217710, "question_id": "aWUMBFJX2YXsaZLnj2NFXp", "question": "What color is the light all the way to the left?", "choices": ["blue", "green", "yellow", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The light on the left is red colored.", "A street light is lit in order to stop traffic.", "That shade of color would normally be found at the very top of a street light."], "image": "val2014/COCO_val2014_000000217710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236295, "question_id": "aWk55RWEGJDMM6QtQrE2qs", "question": "What grooming was recently done to this animal?", "choices": ["top cut", "shorn", "none", "curlnrinse"], "correct_choice_idx": 1, "direct_answers": ["shave", "shearing", "sheering", "shorn", "sheering", "shaving", "shearing", "shaved", "shave", "sheared"], "difficult_direct_answer": false, "rationales": ["The animal has no fur.", "The sheep has had it wool removed.", "The man is shaving the sheep."], "image": "train2014/COCO_train2014_000000236295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530337, "question_id": "aWthvGfsUZ28EyAp6sh54N", "question": "Where is this tournament based?", "choices": ["london", "flushing meadows", "poughkeepsie", "little rock"], "correct_choice_idx": 1, "direct_answers": ["europe", "new york", "nyc", "tennis", "tennis court", "flushing meadows", "america", "us open", "united states", "usa"], "difficult_direct_answer": true, "rationales": ["The writing on the wall indicates that this is the us open. this tournament is based in new york city.", "The name of the tournament is located on the wall in the background and the location is internet searchable.", "The other options don't apply to this sport or the us open."], "image": "val2014/COCO_val2014_000000530337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530630, "question_id": "aX4NHjH2u75ayY2cC29Eo8", "question": "What type soup is being served?", "choices": ["broth", "chicken noodle", "won ton", "tortilla"], "correct_choice_idx": 3, "direct_answers": ["tomato soup", "tortilla", "tomato", "tomato", "tortilla", "tortilla soup", "tortilla", "tortilla", "tomato", "tomato soul"], "difficult_direct_answer": false, "rationales": ["There are chips in it.", "Tortilla chips are in the soup.", "Looks like a tortilla soup."], "image": "val2014/COCO_val2014_000000530630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86215, "question_id": "aX9UjC35Epet2XyRMXw4gF", "question": "What does the animal closest to the window like to eat?", "choices": ["horses", "humans", "cows", "fish"], "correct_choice_idx": 3, "direct_answers": ["treats", "catfood", "fish", "meat", "cat food", "meat", "meat", "mice", "mice", "fish"], "difficult_direct_answer": false, "rationales": ["Cats are known to enjoy eating fish.", "Cats like seafood.", "Cats love things that are from the sea like tuna."], "image": "val2014/COCO_val2014_000000086215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298464, "question_id": "aXP55w3LTkwK7xfHefVwyf", "question": "How many people must be over the legal drinking age in this jurisdiction?", "choices": ["fifty", "thirteen", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["all", "two", "ten", "twenty one", "many", "eight", "one", "all", "many", "all"], "difficult_direct_answer": false, "rationales": ["There are two drinks being held that have alcohol in them.", "Only two have beers in their hands.", "Out of the three people with visible faces in this image; we can only be sure that two of them are over the age of 18."], "image": "train2014/COCO_train2014_000000298464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260695, "question_id": "aXQDKPEHFRkaHcfswr2pi9", "question": "What sort of game is played here?", "choices": ["trivia", "baseball", "monopoly", "tennis"], "correct_choice_idx": 0, "direct_answers": ["phone game", "trivia", "phone", "tetris", "trivia", "phone games", "trivia", "trivia", "mobile", "trivia"], "difficult_direct_answer": false, "rationales": ["A trivia question is on the screen.", "He's playing trivia.", "This flip phone could only play this game on it."], "image": "val2014/COCO_val2014_000000260695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394892, "question_id": "aXWATdLZPHnxCzanYnD8Ur", "question": "What fun can be had on the blue and white item shown here?", "choices": ["video gaming", "sleeping", "sliding", "swinging"], "correct_choice_idx": 3, "direct_answers": ["swinging", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "swinging", "skating", "swing", "skateboarding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["It is a swingset.", "The item can be used for swinging.", "The blue and white item is outside. it has chains that connect to seats."], "image": "val2014/COCO_val2014_000000394892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4916, "question_id": "aXeLWRpxhhzpMa43dusp2f", "question": "What is the pattern on the drapes?", "choices": ["dots", "stars", "circles", "floral"], "correct_choice_idx": 3, "direct_answers": ["flowers", "floral", "floral", "floral", "flowers", "floral", "stripes", "flowers", "floral", "flowers"], "difficult_direct_answer": false, "rationales": ["The pattern appears to have flowers on it.", "Curtains are covered in small flowers.", "The drapes appear to have flowers on then so it is a floral design."], "image": "val2014/COCO_val2014_000000004916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394165, "question_id": "aYYCPGwjcpFPoJC5dBKW8Z", "question": "What is on the plate?", "choices": ["tortilla chips", "gyro", "taco", "tomato"], "correct_choice_idx": 3, "direct_answers": ["dinner", "tomato", "vegetables", "food", "broccoli", "food", "stir fry", "food", "food", "vegetables"], "difficult_direct_answer": false, "rationales": ["The plate has a sliced tomato on it.", "A tomato is in the salad.", "The other options don't appear on the plate."], "image": "train2014/COCO_train2014_000000394165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28889, "question_id": "aYi5LQaFEDrfRp3TCafUDM", "question": "If these men stole the items behind them what would they be called?", "choices": ["jewel thieves", "car jackers", "dognappers", "bank robbers"], "correct_choice_idx": 0, "direct_answers": ["jewelry thieves", "thieves", "jewel thieves", "thieves", "thieves", "jewelry thieves", "jewelry thieves", "jewelry thieves", "thieves", "thieves"], "difficult_direct_answer": false, "rationales": ["The store behind them is a jewelry store.", "The men are thieves.", "They are in front of a jewelry store."], "image": "train2014/COCO_train2014_000000028889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567368, "question_id": "aZ9apsQyKEGGs3XVJAyegZ", "question": "Where is this group located?", "choices": ["forest", "play", "celebration", "movie"], "correct_choice_idx": 2, "direct_answers": ["porch", "porch", "party", "on deck", "celebration", "deck", "beach house", "boat", "outside", "outside patio"], "difficult_direct_answer": true, "rationales": ["They are at a party.", "He's holding a glass of champagne.", "The girl is wearing a fancy dress."], "image": "train2014/COCO_train2014_000000567368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381330, "question_id": "aZC7TVrUZg6QKPYCQBhtsM", "question": "What minerals are rich in banana?", "choices": ["none", "potassium", "phosphorous", "calcium"], "correct_choice_idx": 1, "direct_answers": ["potassium", "magnesium", "potassium", "magnesium", "potassium", "potassium", "potassium", "potassium", "potassium", "potassium"], "difficult_direct_answer": false, "rationales": ["Potassium is in bananas.", "Potassium is in bananas.", "Bananas are full of potassium."], "image": "train2014/COCO_train2014_000000381330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466971, "question_id": "aZSmx4UHTgiBJJjHnStVTF", "question": "What is on the back of the motorcycle?", "choices": ["suitcase", "animal", "backpack", "person"], "correct_choice_idx": 2, "direct_answers": ["backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "bag", "bag", "backpack", "bag"], "difficult_direct_answer": false, "rationales": ["The object is shaped like a school bag meant to carry things in it.", "Many people on motorcycles keep the stuff in there because it's easy to drive with.", "It's small luggage."], "image": "train2014/COCO_train2014_000000466971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119247, "question_id": "aZVDb7dRbAKSDF6zRzsm3E", "question": "In which city is this fire plug?", "choices": ["vegas", "paris", "los angeles", "san francisco"], "correct_choice_idx": 3, "direct_answers": ["san francisco", "san francisco", "unknown", "no clue", "powell", "powell", "san francisco", "san francisco", "san francisco", "new york"], "difficult_direct_answer": false, "rationales": ["The street is located in san francisco.", "The transamerica building in the background is in this city.", "That tall building in the back is in sf, california."], "image": "train2014/COCO_train2014_000000119247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575465, "question_id": "aZVqqB4UeX8Qc3ytpoTGwK", "question": "Where is this game being played?", "choices": ["sand", "gym", "stadium", "park"], "correct_choice_idx": 2, "direct_answers": ["stadium", "baseball stadium", "ohio", "stadium", "stadium", "ballpark", "stadium", "stadium", "boston", "baseball stadium"], "difficult_direct_answer": false, "rationales": ["This is a professional baseball stadium.", "Baseball is played in a stadium.", "The game is at a stadium."], "image": "val2014/COCO_val2014_000000575465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561885, "question_id": "aZYcasB4KbD39Wsc9hBBFv", "question": "What country is shown here?", "choices": ["singapore", "britain", "america", "australia"], "correct_choice_idx": 1, "direct_answers": ["england", "britain", "america", "england", "britain", "uk", "usa", "usa", "uk", "uk"], "difficult_direct_answer": false, "rationales": ["The bus is in britain because the license plate is from that country", "The vehicle in the image is a double-decker bus which is most commonly associated with answer a.", "Double-decker buses are common in britain and the word \"cinema\" is sometimes used in place of the word \"theaters\", so one can assume this location is britain."], "image": "train2014/COCO_train2014_000000561885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280666, "question_id": "aZovRH3hin8Euvpz5BbNHb", "question": "Which country do bananas originate from?", "choices": ["philippines", "china", "peru", "new guinea"], "correct_choice_idx": 3, "direct_answers": ["india", "india", "india", "india", "india", "india", "africa", "malaysia", "new guinea", "asia"], "difficult_direct_answer": false, "rationales": ["They grow in several places now but are originally from new guinea.", "They come from new guinea.", "It is the new guinea that show the banana."], "image": "train2014/COCO_train2014_000000280666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348929, "question_id": "aZupWTZEFHydciL3rS3DRg", "question": "For what purpose are the items hanging on the lines on the upper levels?", "choices": ["drying", "signaling", "shelter", "privacy shield"], "correct_choice_idx": 0, "direct_answers": ["clothing", "drying", "drying", "left", "drying", "be worn", "drying", "dry", "clothes", "drying"], "difficult_direct_answer": false, "rationales": ["The items on the line above are drying off.", "The purpose is to dry.", "The clothes are on the line to dry in the sun."], "image": "val2014/COCO_val2014_000000348929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92985, "question_id": "aa268gDDFSaS7VgCjbtjNs", "question": "What is this little girl trying to do?", "choices": ["play game", "measure length", "massage foot", "press remote"], "correct_choice_idx": 3, "direct_answers": ["press remote", "engage remote", "open remote", "change channel", "use remote", "use remote", "read", "measure foot", "measure feet", "change channel"], "difficult_direct_answer": false, "rationales": ["The girls is using her toes to select a tv channel.", "The little girl is pressing the remote buttons with her foot.", "The girl is holding an electronic device. she is trying to use the buttons on it."], "image": "val2014/COCO_val2014_000000092985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206147, "question_id": "aa4hgaKdR8BYZeqHvpJ2Lv", "question": "Which one of these processes produced the spread here?", "choices": ["eating", "food prep", "discarding", "baking"], "correct_choice_idx": 1, "direct_answers": ["chopping", "cutting", "blender", "churning", "blender", "cooking", "churn", "churning", "gathering", "food prep"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to the preparation of ingredients.", "There is a food prep area on top of the desk.", "These ingredients were prepared in their individual containers."], "image": "val2014/COCO_val2014_000000206147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459912, "question_id": "aaHUUWDhP4sZrgvUrt5WFR", "question": "The motorbikes on this bustling city street are present in which country?", "choices": ["china", "thailand", "vietnam", "india"], "correct_choice_idx": 3, "direct_answers": ["india", "india", "unsure", "indonesia", "indonesia", "phillipines", "tibet", "india", "nepal", "india"], "difficult_direct_answer": false, "rationales": ["That country is known to have many of those vehicles. there are many very dark-skinned men who don't have that east-asian look, thus the selected country.", "The people here are from this country", "The language on the signs is indian."], "image": "val2014/COCO_val2014_000000459912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246940, "question_id": "aaWpGUVyKwKxF5Qg6garFe", "question": "What is the black rectangular object in front of the blue boat?", "choices": ["pole", "hose", "fence", "ladder"], "correct_choice_idx": 3, "direct_answers": ["wood", "ladder", "ladder", "boat", "ladder", "ladder", "not visible", "wood", "raft", "ladder"], "difficult_direct_answer": false, "rationales": ["This object has rungs.", "There are steps to get out of the water.", "It's what people would climb to get on to the shore."], "image": "train2014/COCO_train2014_000000246940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98689, "question_id": "aaezuckF6pEGnpd5PAA3UG", "question": "What is the main problem of the man wearing brown pants?", "choices": ["underwear exposed", "naked torso", "no wristbands", "no shoes"], "correct_choice_idx": 3, "direct_answers": ["no shirt", "stuck", "no shirt", "broke raquel", "no shirt", "no shoes", "too big", "too big", "barefoot shirtless", "slipping down"], "difficult_direct_answer": false, "rationales": ["He is playing in his socks.", "It is hard to maintain traction on the court without any shoes.", "His pants are falling down."], "image": "train2014/COCO_train2014_000000098689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132298, "question_id": "aahquZDszFsuFmmUGG7esK", "question": "Skateboard is made of what wood?", "choices": ["pine", "palm", "bamboo", "maple"], "correct_choice_idx": 3, "direct_answers": ["maple", "maple", "maple", "maple", "maple", "maple", "maple", "maple", "maple", "maple"], "difficult_direct_answer": false, "rationales": ["Skateboards are often made of maple wood.", "They are made from maple.", "The surrounding shows the type of maple."], "image": "train2014/COCO_train2014_000000132298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21470, "question_id": "ab4zHifuYbNacdUgxKaqn3", "question": "Which handedness does this player possess?", "choices": ["both", "none", "left", "right"], "correct_choice_idx": 3, "direct_answers": ["right", "left", "right", "left", "left", "left", "left", "right handed", "right", "right"], "difficult_direct_answer": false, "rationales": ["The man is right handed.", "The mitt is on the left hand which makes the player righthanded.", "The player is wearing his glove on his left hand which means he throws with his dominant hand, his right."], "image": "val2014/COCO_val2014_000000021470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545022, "question_id": "abBJ4nMfPu8MaWhQ4W2MWU", "question": "What is the guy wearing a black hat doing?", "choices": ["singing", "talking", "resting", "listening"], "correct_choice_idx": 1, "direct_answers": ["driving", "steering", "steering", "steering", "driving carriage", "driving", "talking", "driving", "driving", "driving"], "difficult_direct_answer": false, "rationales": ["The guy in the black hat is speaking to the man next to him.", "The guy is talking.", "The man has a microphone placed near his mouth. this would be done if he was doing answer a."], "image": "train2014/COCO_train2014_000000545022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527277, "question_id": "abEw4bhNGsTC9zh4LSdcof", "question": "What's the purpose of the colorful banners in the outfield?", "choices": ["to advertise", "to educate", "to celebrate", "to distract"], "correct_choice_idx": 0, "direct_answers": ["advertisement", "advertisements", "advertisement", "to advertise", "advertising", "ads", "advertising", "advertisements", "ads", "advertisement"], "difficult_direct_answer": false, "rationales": ["They're all ads.", "There are various businesses on the wall.", "So the field can earn money from the people who want to show something about their businesses."], "image": "train2014/COCO_train2014_000000527277.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198733, "question_id": "abfCZfhLjeKMcx5WBpLg35", "question": "What is touching the skateboard?", "choices": ["dress shoes", "sneakers", "pogo stick", "hands"], "correct_choice_idx": 1, "direct_answers": ["shoes", "feet", "road", "shoe", "sneakers", "shoe", "skaters feet", "right foot", "foot", "toe"], "difficult_direct_answer": true, "rationales": ["A guy is standing on a skateboard and jumping in the air.", "The sneakers are touching toe with the skateboard top.", "This athletes left foot is fully suspended in air, but the shoe of the right foot is at least partially still on his skateboard."], "image": "val2014/COCO_val2014_000000198733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503472, "question_id": "acLwauWAAAAJ7zWQy66kg3", "question": "The item near the table that is a gray color can fit approximately how many people?", "choices": ["twelve", "four", "twenty", "ten"], "correct_choice_idx": 1, "direct_answers": ["four", "12-15", "five", "six", "eight", "two", "six", "five", "four", "12-15"], "difficult_direct_answer": false, "rationales": ["Four people can be on the couch.", "The couch can seat about two couples.", "The couch is carry four people."], "image": "train2014/COCO_train2014_000000503472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459868, "question_id": "acZzLk8VS6RPaLtMQj3kHB", "question": "What are the cranes being used for?", "choices": ["shipping", "decoration", "construction", "amusement"], "correct_choice_idx": 2, "direct_answers": ["raising", "construction", "skyscrapers", "construction", "build buildings", "carrying equipment", "building", "building", "construction", "lifting"], "difficult_direct_answer": false, "rationales": ["The cranes are used for constructive purposes.", "Cranes in an urban setting like this have been observed to be part of construction and would not serve another purpose.", "There is a building getting built"], "image": "train2014/COCO_train2014_000000459868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137156, "question_id": "acb28ZX7umghpqbVSoiQyt", "question": "What area is this photo least likely to be in?", "choices": ["osaka", "new york", "tokyo", "hokkaido"], "correct_choice_idx": 1, "direct_answers": ["tunnels", "not japan", "cleveland", "rural community", "asia", "tunnel", "country", "country", "urban", "new york"], "difficult_direct_answer": true, "rationales": ["The lights are horizontal instead of vertical. the text on signs is asian. even the yellow cab isn't normal for the u.s.", "The area is urban with new york taxi cabs.", "There are tall buildings like new york, but it's in another language."], "image": "val2014/COCO_val2014_000000137156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31092, "question_id": "acb3AeP6sGrY9zNBjoPbyk", "question": "In which lane does the person in the black helmet ride?", "choices": ["dirt lane", "right lane", "median lane", "bike lane"], "correct_choice_idx": 1, "direct_answers": ["bike lane", "right lane", "bike", "bike lane", "close", "right", "bike lane", "bike lane", "bike lane", "bike lane"], "difficult_direct_answer": false, "rationales": ["The person is on the right lane.", "The right lane is typically where bikes ride.", "They are all the way over to this side"], "image": "val2014/COCO_val2014_000000031092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188911, "question_id": "acmbkAhUtmCp4dGtKcj4YY", "question": "Why are there wooden planks?", "choices": ["to stack", "to burn", "to build", "to fence"], "correct_choice_idx": 3, "direct_answers": ["hold sheep", "safety", "fence", "fence", "separate livestock", "fence", "to fence", "pen", "hold sheep", "fence"], "difficult_direct_answer": false, "rationales": ["The planks contain the sheep.", "The sheep are enclosed by the wood.", "The animals appear to be in a pen which means the wood is there to keep them in making a the most logical answer."], "image": "train2014/COCO_train2014_000000188911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178666, "question_id": "acqswuSjRJEJgJHembCAmL", "question": "When does the Pompeii exhibit end?", "choices": ["2010", "2011", "2012", "2013"], "correct_choice_idx": 2, "direct_answers": ["2012", "september six", "crossing", "november 2012", "2011", "five", "six", "november 12", "twenty twelve", "can't see"], "difficult_direct_answer": true, "rationales": ["The start and end dates are on the sign that is above the person with the umbrella.", "The exhibit ends in 2012.", "According to the sign advertising the exhibit, it began in 2011 and ended the following year."], "image": "train2014/COCO_train2014_000000178666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169204, "question_id": "acquUvm2r8eopQNkAVdEbV", "question": "What purpose does the metal around lower part of obelisk serve?", "choices": ["repair scaffolding", "marketing", "child's prank", "pigeon repellant"], "correct_choice_idx": 0, "direct_answers": ["i do", "for repairs", "no climbing", "protection", "playing", "goal post", "protection", "goal", "repair scaffolding", "safety"], "difficult_direct_answer": true, "rationales": ["It's there for construction.", "The metal is just for repairs.", "Scaffolding surrounds the lower part of a tall tower. scaffolding is used for repairs of tall buildings."], "image": "train2014/COCO_train2014_000000169204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433055, "question_id": "ad8HVdoaAGntcUuKhug3sT", "question": "What is the green leafy item used in this salad?", "choices": ["arugula", "lettuce", "kale", "spinach"], "correct_choice_idx": 3, "direct_answers": ["spinach", "spinach", "spinach", "arugula", "arugula", "spinach", "spinach", "spinach", "spinach", "arugula"], "difficult_direct_answer": false, "rationales": ["The leaves are spinach greens.", "Spinach is used.", "It's not lettuce but also green."], "image": "train2014/COCO_train2014_000000433055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548713, "question_id": "ad9JbsVfUFik7DNoKTBL4u", "question": "What turn is forbidden?", "choices": ["u-turn", "right turn", "left turn", "straightaway"], "correct_choice_idx": 2, "direct_answers": ["left", "left", "uturn", "right turn", "right", "left turn", "left", "left", "left", "left turn"], "difficult_direct_answer": false, "rationales": ["A left turn is forbidden on this highway.", "There is a sign visible on the left side that has a left arrow on it crossed off. this type of road sign would be used to indicate that this type of turn would not be permitted.", "They can not turn left."], "image": "val2014/COCO_val2014_000000548713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11401, "question_id": "adSuLGjwicLyZ5DmQNMDax", "question": "What is the breed of this dog?", "choices": ["maltipoo", "chow chow", "boxer", "samoyed"], "correct_choice_idx": 0, "direct_answers": ["cocker spaniel", "irish setter", "cocker spaniel", "golden retriever", "maltipoo", "spatial", "cocker spaniel", "cocker spaniel", "canoodle dog", "golden retriever"], "difficult_direct_answer": false, "rationales": ["The hair on the dog is long and it is furry.", "The wavy fur is that of a multipoo.", "It is a curly long haired dog that shows mixed breeds"], "image": "train2014/COCO_train2014_000000011401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529545, "question_id": "aeCgatMzcWgH7nHd4AoUm9", "question": "Who do the umbrellas belong to?", "choices": ["residents", "store", "city", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "cafe", "business", "restaurant", "restraunt", "restaurant", "store", "store", "restaruant", "restaurant"], "difficult_direct_answer": false, "rationales": ["They are part of the restaurant if people want to sit outside.", "The umbrellas are at the dining tables.", "The umbrellas are in the outdoor dining area of a restaurant and belong to that establishment."], "image": "train2014/COCO_train2014_000000529545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400286, "question_id": "aeHqcAeGEVfqGZPPFgBRv9", "question": "What does the object on the ground need to perform its actions?", "choices": ["water", "air", "fire", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "electricity", "electricity", "electricity", "power", "outlet", "electricity", "electricity", "electricity", "electric outlet"], "difficult_direct_answer": false, "rationales": ["The television will not play without electricity.", "A television with a long cord wrapped around it is in the street.", "The television has an electrical wire connected to it."], "image": "train2014/COCO_train2014_000000400286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468613, "question_id": "aeLF8PEgb5c9iqrt66QUUX", "question": "What period of the day is it in the image?", "choices": ["night", "morning", "evening", "afternoon"], "correct_choice_idx": 3, "direct_answers": ["afternoon", "dinner", "afternoon", "afternoon", "afternoon", "night", "lunch dinner", "lunch", "afternoon", "afternoon"], "difficult_direct_answer": false, "rationales": ["The man is making pizza.", "The clock on the wall of the kitchen says its past 3 o'clock indicating the time of day.", "The day is the afternoon."], "image": "val2014/COCO_val2014_000000468613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304557, "question_id": "aeZrL4UWVFs2XoYUimK4Gv", "question": "What type traffic is permitted here now?", "choices": ["semi trucks", "cars", "none", "food"], "correct_choice_idx": 3, "direct_answers": ["people", "pedestrians", "pedestrian", "food", "foot traffic", "pedestrian", "pedestrian", "pedestrian", "pedestrian", "foot"], "difficult_direct_answer": false, "rationales": ["People are walking here. there are no vehicles.", "There are no cars in the area.", "No vehicles only people are allowed."], "image": "val2014/COCO_val2014_000000304557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510626, "question_id": "aeiuzdHLb69mkvqqwvoUwT", "question": "Where was tennis first invented?", "choices": ["morocco", "england", "ireland", "france"], "correct_choice_idx": 3, "direct_answers": ["france", "europe", "france", "france", "england", "france", "france", "europe", "france", "europe"], "difficult_direct_answer": false, "rationales": ["Tennis is from france.", "Tennis came from france.", "Tennis began in 12th century france."], "image": "train2014/COCO_train2014_000000510626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234941, "question_id": "aekKGt4RB5am4uEL6cmW74", "question": "The surfers are in the water waiting for to form so they can ride?", "choices": ["foam", "towers", "seaweed", "waves"], "correct_choice_idx": 3, "direct_answers": ["waves", "wave", "waves", "waves", "waves", "waves", "waves", "waves", "waves", "wave"], "difficult_direct_answer": false, "rationales": ["They are waiting for a wave to come along.", "The people in the water are waiting for waves to form up so they might surf.", "They need the motion from a to move them."], "image": "train2014/COCO_train2014_000000234941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360771, "question_id": "aeucrNfEjjXQ4ZSK6gyhoq", "question": "What are the two elderly people holding their hands above?", "choices": ["pizza", "sculpture", "cake", "clock"], "correct_choice_idx": 2, "direct_answers": ["cake", "cake", "cake", "cake", "cake", "cake", "cake", "cake", "cake", "cake"], "difficult_direct_answer": false, "rationales": ["A man and woman hold their hands together over a round dessert with frosting and flowers decorating the top.", "The two elderly people are holding their hands above a cake.", "They are about to have dessert for a special occasion, perhaps their anniversary."], "image": "train2014/COCO_train2014_000000360771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249145, "question_id": "af7dmv7sg8SBaEy6va3rdr", "question": "The woman shown here expresses what?", "choices": ["nothing", "sleepiness", "anger", "appreciation"], "correct_choice_idx": 3, "direct_answers": ["peace", "happiness", "sadness", "joy", "relaxation", "peace", "relaxation", "anger", "appreciation", "prayer"], "difficult_direct_answer": false, "rationales": ["The woman's eyes are closed which is something that usually is done when you are sleeping or tired.", "She looks happy.", "She's showing her phone and smiling"], "image": "train2014/COCO_train2014_000000249145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140776, "question_id": "afAnRBpXLGcHYTFs9GV9Kf", "question": "What type of phone is she using?", "choices": ["rotary", "cellular", "payphone", "landline"], "correct_choice_idx": 1, "direct_answers": ["cell", "cell", "cellular", "mobile phone", "cellular", "cellphone", "cellphone", "cell phone", "cellphone", "small phone"], "difficult_direct_answer": false, "rationales": ["There are no wires coming from the phone and it can fit in her hand.", "The woman has the phone in her hand. there is no line coming from the phone.", "A woman is outside on the phone. she is holding a phone up to her ear and there is no cord."], "image": "train2014/COCO_train2014_000000140776.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399582, "question_id": "afC6eU7qqFdvj5TyvCLevv", "question": "What is the guy in blue doing while playing wii that is different from the guy in red?", "choices": ["playing right-handed", "giving up", "playing left-handed", "smiling"], "correct_choice_idx": 2, "direct_answers": ["playing left-handed", "right handed", "smiling", "smiling", "smiling", "smiling", "lefty", "left handed", "smiling", "laughing"], "difficult_direct_answer": false, "rationales": ["The guy in blue is using his left and where as the other guy is using his right.", "He is playing with his opposite hand.", "Each of the men has the wii controller strapped to his wrist, but they are on different sides of their bodies."], "image": "val2014/COCO_val2014_000000399582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242731, "question_id": "afEBw4d5BrWuowwvdfHT8W", "question": "What is held by the person this ball is pitched to?", "choices": ["jacket", "bat", "bowling ball", "sheet"], "correct_choice_idx": 1, "direct_answers": ["pitcher", "glove", "bat", "bat", "bat", "glove", "bat", "glove", "bat", "bat"], "difficult_direct_answer": false, "rationales": ["The ball is being thrown to a player holding a smooth wooden club used in baseball that he will use to attempt to make contact with the ball.", "This person's ball is pitched to a bat.", "The man has a bat."], "image": "train2014/COCO_train2014_000000242731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433984, "question_id": "afMFhupgRAynQf272p8Le5", "question": "Why is his tongue out?", "choices": ["is hungry", "uncontrolled", "is concentrating", "is angry"], "correct_choice_idx": 2, "direct_answers": ["his tired", "mouth wet", "breathing", "concentration", "is concentrating", "dry lips", "focus", "focusing", "uneasy", "teasing"], "difficult_direct_answer": true, "rationales": ["A man is staring straight ahead while on a tennis court.", "People sometimes put their tongue out to concentrate.", "By the sport he is playing and the setting, he must be mentally acute to the game."], "image": "val2014/COCO_val2014_000000433984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123041, "question_id": "afT2rsfnFq77rjUNpyQhGj", "question": "What does this traffic lights mean?", "choices": ["ready", "nothing", "go", "stop"], "correct_choice_idx": 0, "direct_answers": ["caution", "slow down", "caution", "slow down", "caution", "slow", "caution", "caution", "ready", "slow down"], "difficult_direct_answer": false, "rationales": ["The traffic lines mean to be careful.", "The traffic lights mean caution.", "Yellow means slow down."], "image": "train2014/COCO_train2014_000000123041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277396, "question_id": "afTGBjEV22UEAZjZN7zUEu", "question": "What are the children about to do?", "choices": ["buy candy", "go home", "feed elephants", "eat lunch"], "correct_choice_idx": 2, "direct_answers": ["feed elephant", "feed elephant", "feed animals", "feed elephant", "feed elephant", "feed elephant", "feed animals", "feed elephant", "feed elephant", "feed elephants"], "difficult_direct_answer": false, "rationales": ["The children would feed the elephants.", "A man is handing the boy something. the boy is standing in front of the elephant pen. the elephant has his trunk outside the fence.", "They are paying the man and the elephants are nearby."], "image": "train2014/COCO_train2014_000000277396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139240, "question_id": "afWK9okuWgZpDh6nUEhkAc", "question": "Why are the stripes on the man's backpack illuminated?", "choices": ["flashlight", "sunlight", "overhead light", "camera flash"], "correct_choice_idx": 3, "direct_answers": ["reflective", "seen easier", "camera flash", "reflective material", "reflective", "reflective", "sight", "reflective material", "glow", "safety"], "difficult_direct_answer": false, "rationales": ["There are some stripes illuminated on the man's backpack because of a camera flash.", "The camera must have flashed as they took the picture and that would be the best explanation for the lights there is no other sunlight and the other options don't make sense.", "The stripes are for the flash."], "image": "train2014/COCO_train2014_000000139240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274083, "question_id": "afa82KBK2Unq6yJMMm8mMB", "question": "What do those gathered look at here?", "choices": ["tourists", "protestors", "car crash", "markets"], "correct_choice_idx": 2, "direct_answers": ["truck", "camera", "car", "cameraman", "bus", "bus", "bus", "car crash", "car", "car accident"], "difficult_direct_answer": false, "rationales": ["The car crashed into the bus.", "There is a man holding a sign and the onlookers are observing the group of men behind the man holding the sign.", "Looks like the car hit the bus."], "image": "val2014/COCO_val2014_000000274083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418030, "question_id": "afg3FvSASVCgij26goJQu6", "question": "Where are these colorful objects usually found?", "choices": ["business meeting", "underground bunker", "macy's parade", "court room"], "correct_choice_idx": 2, "direct_answers": ["beach", "ocean", "sky", "macy's parade", "skies", "parades", "air", "cartoon television", "sky", "sky"], "difficult_direct_answer": false, "rationales": ["Kites are in parades.", "The objects are in parades.", "These are big balloon floats used as entertainment."], "image": "train2014/COCO_train2014_000000418030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528062, "question_id": "afh5zDYrYuLhaoQXjjyyvq", "question": "What article of clothing are they wearing that is usually removed when entering a home?", "choices": ["panties", "shoes", "shoes", "shirts"], "correct_choice_idx": 1, "direct_answers": ["shoes", "shoes", "shoes", "shoes", "shoes", "shoes", "shoes", "shoes", "shoes", "shoes"], "difficult_direct_answer": false, "rationales": ["The people still have shoes on.", "People are sitting in a living room and all have shoes on.", "The shoes are removed to avoid dirt in the house."], "image": "val2014/COCO_val2014_000000528062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579213, "question_id": "afnKD3xSbpdTFskG456kiE", "question": "What is the plane on?", "choices": ["runway", "grass", "highway", "beach"], "correct_choice_idx": 0, "direct_answers": ["tarmac", "runway", "tarmac", "tarmac", "tarmac", "runway", "tarmac", "runway", "tarmac", "runway"], "difficult_direct_answer": false, "rationales": ["The plane is on a runway.", "Airplanes usually need a long smooth paved area with no vehicle traffic in order to take off and land.", "The plane is on the ground."], "image": "train2014/COCO_train2014_000000579213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495117, "question_id": "afrB4WhLBm8VqvtEMfWxmR", "question": "Where can the people potentially sleep?", "choices": ["car", "trailer", "field", "horse"], "correct_choice_idx": 1, "direct_answers": ["trailer", "buses", "camper", "camper", "campers", "campers", "campers", "grass", "tent", "van"], "difficult_direct_answer": false, "rationales": ["You can't sleep on a horse. you need some sort of shelter and a trailer is the appropriate size, a car is too small. sleeping in the field offers no shelter.", "People could snooze in the trailers.", "The people are in trailers."], "image": "val2014/COCO_val2014_000000495117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426656, "question_id": "afvVukrRjr7Qm9vDV4RtRx", "question": "What shouldn't you climb onto here?", "choices": ["hills", "backpack", "bench", "walls"], "correct_choice_idx": 3, "direct_answers": ["walls", "overweight", "wall", "wall", "wall", "wall", "walls", "stone wall", "dangerous animals", "wall"], "difficult_direct_answer": false, "rationales": ["The walls have a warning sign.", "The walls can't be climbed.", "There is a sign telling people to stay off the walls."], "image": "train2014/COCO_train2014_000000426656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176785, "question_id": "afviD5mtRgFzb3sjxpnmEc", "question": "What attire is the person standing behind the horse in front of the red rope wearing?", "choices": ["australian ranger", "police officer", "native american", "latin american"], "correct_choice_idx": 2, "direct_answers": ["clown", "costume", "native american", "native american", "clown", "clown", "western", "clown", "cowboy", "cap"], "difficult_direct_answer": false, "rationales": ["The man is seen wearing the cap that was used in those area.", "The person has native american clothing.", "The person is wearing leather and traditional clothing."], "image": "train2014/COCO_train2014_000000176785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515300, "question_id": "ag7MFTU7KHmr2rZDR69qoS", "question": "World Skate is highest governing body of which game?", "choices": ["swimming", "kiting", "skateboarding", "surfing"], "correct_choice_idx": 2, "direct_answers": ["roller sports", "skateboarding", "roller sports", "roller sports", "skateboarding", "skateboarding", "skateboarding", "roller sports", "skateboarding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["A guy is doing skateboarding tricks on stairs.", "Since the title of the competition has skate in it.", "The word implies the use of the board and the man is trying to do tricks with his board with wheels."], "image": "train2014/COCO_train2014_000000515300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479734, "question_id": "agHkAdZCRGfpk4u6edemuT", "question": "What is the brown on the man's board?", "choices": ["wax", "algae", "rubber", "syrup"], "correct_choice_idx": 0, "direct_answers": ["dirt", "sand", "mold", "wax", "sand", "sand", "dirt", "grime dirt", "dirt", "sand"], "difficult_direct_answer": false, "rationales": ["Based on the mans feet we are seeing the top of the surfboard. surfers use wax on their boards for grip that becomes brown after it has been used for a while.", "Wax helps to keep the person from sliding off the board.", "There is brown wax on top of this person's board."], "image": "train2014/COCO_train2014_000000479734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560704, "question_id": "agofQdrQx9KCQxABVNcV6Z", "question": "Who usually hangs out here?", "choices": ["homeless", "children", "athletes", "dogs"], "correct_choice_idx": 1, "direct_answers": ["children", "children", "kids", "children", "kids", "kids", "kids", "kids", "kids", "kids"], "difficult_direct_answer": false, "rationales": ["Children play on playgrounds.", "The location is a playground judging by the visible features. playgrounds are intended to be used by answer a.", "A playground is designed for kids."], "image": "train2014/COCO_train2014_000000560704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191632, "question_id": "agqrFxJFLNBE6q5Bab3JEP", "question": "What could the women be cooking on the stove?", "choices": ["spaghetti", "pancakes", "sauce", "whole chicken"], "correct_choice_idx": 1, "direct_answers": ["cookies", "steak", "breakfast", "eggs", "food", "food", "eggs", "anything", "pancakes", "pancakes"], "difficult_direct_answer": false, "rationales": ["She is holding a spatula.", "The woman appears to be using thin pans and is holding a spatula which would be consistent with someone cooking answer a and none of the other answers.", "The women make pancakes."], "image": "train2014/COCO_train2014_000000191632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112201, "question_id": "ahTGiMYxnWfZxjtv7KKTv5", "question": "Where is this man located?", "choices": ["classroom", "hospital", "restaurant", "home"], "correct_choice_idx": 0, "direct_answers": ["classroom", "classroom", "classroom", "classroom", "classroom", "classroom", "classroom", "classroom", "classroom", "classroom"], "difficult_direct_answer": false, "rationales": ["The man is sitting in a classroom.", "There are desks and whiteboards in the room.", "The room has a whiteboard."], "image": "train2014/COCO_train2014_000000112201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212501, "question_id": "ahY4MEqWS44Sqo5Jp2fD7t", "question": "What is the large green sculpture made up of?", "choices": ["paint", "phones", "flash lights", "traffic lights"], "correct_choice_idx": 3, "direct_answers": ["traffic lights", "street lights", "traffic lights", "iron", "traffic lights", "traffic lights", "traffic lights", "lights", "steel", "metal"], "difficult_direct_answer": false, "rationales": ["It has a bunch of traffic lights on the green tower.", "The pole has three different colors of lights on it. the lights are red, yellow, and green.", "There are yellow, red and green lights and it is near a roadway."], "image": "train2014/COCO_train2014_000000212501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385540, "question_id": "ahm4Rz2QNuygKmqUSK5iLY", "question": "In which country does the woman ride?", "choices": ["australia", "china", "south america", "canada"], "correct_choice_idx": 3, "direct_answers": ["canada", "canada", "south carolina", "usa", "america", "canada", "canada", "canada", "canada", "canada"], "difficult_direct_answer": false, "rationales": ["The city name on the flag is calgary which is in the country north of the united states of america.", "The word calgary can be seen on the flag.", "She is wearing her country colors."], "image": "val2014/COCO_val2014_000000385540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66129, "question_id": "ahnVt539ybQ7DcL4pqHWVw", "question": "What are the small candles on the table called?", "choices": ["mini lights", "table lights", "tiny lights", "tea lights"], "correct_choice_idx": 1, "direct_answers": ["accent candles", "tea lights", "lights", "votive candles", "tea candles", "votives", "table lights", "votive", "votives", "birthday candles"], "difficult_direct_answer": true, "rationales": ["The candles are called tea lights.", "The candles are for the table.", "Small candles are called table lights."], "image": "train2014/COCO_train2014_000000066129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46106, "question_id": "aiFevTTntu6XZ55n72xX87", "question": "Which automotive manufacturer made the jeep?", "choices": ["kia", "honda", "toyota", "hyundai"], "correct_choice_idx": 2, "direct_answers": ["toyota", "toyota", "toyota", "toyota", "toyota", "honda", "honda", "toyota", "toyota", "toyota"], "difficult_direct_answer": false, "rationales": ["The toyota logo is visible from the front of the automobile.", "The carmaker is toyota.", "Toyota constructed the jeep."], "image": "train2014/COCO_train2014_000000046106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361356, "question_id": "aiHyNHg895oQvvMQCrhuDH", "question": "What is the name of the person making making this thread fruit designs?", "choices": ["doctor", "none", "designer", "weaver"], "correct_choice_idx": 3, "direct_answers": ["crocheter", "weaver", "martha", "knitter", "sonia", "crocheter", "knitter", "crafter", "seamstress", "edna"], "difficult_direct_answer": false, "rationales": ["This practice is known as knitting, or weaving yarn into shapes. doctors and designers have a different specialty and none is not an option because it is apparent someone is making this objects.", "A person weaves baskets to hold fruit.", "There are baskets that are designed a certain way using a specific technique by those kinds of artists."], "image": "train2014/COCO_train2014_000000361356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443496, "question_id": "ajANfZAWS6CPM7xmanEpHz", "question": "What sort of cuisine is the diner enjoying?", "choices": ["chinese", "fast food", "soul food", "mexican"], "correct_choice_idx": 0, "direct_answers": ["chinese", "asian", "gumbo meal", "supper", "chinese", "chinese", "asian", "chinese", "chinese food", "asian"], "difficult_direct_answer": false, "rationales": ["They are eating chinese.", "The person is shown eating noodles with chopsticks. none of the other types of cuisines listed commonly feature these items.", "There is a lot of rice."], "image": "train2014/COCO_train2014_000000443496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251098, "question_id": "ajUYMZHSHAC4zBvUVxeGVM", "question": "Which celebrity rides the kind of vehicle that is behind the car?", "choices": ["ian mcneice", "fred norris", "maggie smith", "gabourey sidibe"], "correct_choice_idx": 1, "direct_answers": ["steve mcqueen", "tom cruise", "knives", "lot", "keanu reeves", "fred norris", "marine", "marine", "tom cruise", "batman"], "difficult_direct_answer": false, "rationales": ["Fred norris usually always rides motorcycles.", "Fred norris rides motorcycles.", "Fred norris because he rides a triumph motorcycle"], "image": "val2014/COCO_val2014_000000251098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187483, "question_id": "ajepDvaGaEKWsccMfhpR4D", "question": "What kind of building is the bear in?", "choices": ["house", "church", "school", "store"], "correct_choice_idx": 3, "direct_answers": ["store", "antique shop", "toy", "store", "toy store", "shop", "store", "store", "store", "store"], "difficult_direct_answer": false, "rationales": ["You can by the items and price tags as to where the bear is in.", "The bear is standing next to a display of stuffed animals and looking out through a window onto a street with another shop visible. based on the display of stuffed animals like this and the visible location it is most likely a store.", "The bear is in a store because products are being sold there"], "image": "train2014/COCO_train2014_000000187483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19313, "question_id": "ajjNyEn2AAU5ypoLFRoXuu", "question": "What is the most common tennis racquet string material?", "choices": ["cotton", "nylon", "steel", "animal guts"], "correct_choice_idx": 3, "direct_answers": ["nylon", "nylon", "metal wire", "animal guts", "nylon polyester", "nylon", "wire", "nylon", "nylon", "plastic"], "difficult_direct_answer": false, "rationales": ["The girl looks like a beginner and beginner tennis rackets use animal guts for strings.", "The most common racket material is animal guts.", "The tennis racket has strings similar to a guitar."], "image": "val2014/COCO_val2014_000000019313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221328, "question_id": "ajnchyd73aTQe6dZRNw9T9", "question": "What is the type of garment that the woman in blue is wearing?", "choices": ["raincoat", "chut thai", "kimono", "sari"], "correct_choice_idx": 3, "direct_answers": ["sarong", "dress", "sari", "sari", "dress", "sari", "dress", "sari", "robe", "shari"], "difficult_direct_answer": false, "rationales": ["The woman has a sari on.", "The elephant is located in india. women wear clothes in india.", "The woman is wearing a blue sari next to an elephant."], "image": "train2014/COCO_train2014_000000221328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433235, "question_id": "ajrHzDySKigYKqBzfRtioj", "question": "What type of kitchen is this?", "choices": ["island", "residential", "commercial", "galley"], "correct_choice_idx": 2, "direct_answers": ["professional", "industrial", "industrial", "army", "industrial", "commercial", "military", "commercial", "commercial", "industrial"], "difficult_direct_answer": false, "rationales": ["This kitchen contains large steel industrial appliances with controls that are different from controls on home kitchen appliances.", "The kitchen is commercial.", "Due to the setting and the industrial cooking devices you can tell as to where this picture was taken."], "image": "train2014/COCO_train2014_000000433235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344930, "question_id": "ajyngEyd5dj8iqZBGgHv8c", "question": "Why is the black vehicle stopped near the curb?", "choices": ["to load", "to race", "to deliver", "to park"], "correct_choice_idx": 3, "direct_answers": ["to park", "parked", "parked", "parked", "street parking", "parked", "parking", "live locally", "parking", "visiting friends"], "difficult_direct_answer": false, "rationales": ["He is stopped next to the curb to do so.", "A car is near the curb of a street. people park their cars on the side of the road, near the curb.", "A car is still with no one driving and is on the side of the road."], "image": "val2014/COCO_val2014_000000344930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15067, "question_id": "ak2tt22Ds2pDinDpJWZrAg", "question": "Where will the skateboarder go?", "choices": ["no where", "down ramp", "sideways", "up ramp"], "correct_choice_idx": 3, "direct_answers": ["up ramp", "ramp", "ramp", "ramp", "rail", "up ramp", "ramp", "forward", "up ramp", "ramp"], "difficult_direct_answer": false, "rationales": ["The skateboarder wants to go up the ramp.", "The skateboarder will try to ride the board to the top of the ramp.", "The skateboarder is headed in the direction of the ramp."], "image": "val2014/COCO_val2014_000000015067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181156, "question_id": "ak59SRhMehjo7fZEF4KWSY", "question": "Who are the two people?", "choices": ["customers", "government inspectors", "chefs", "shop owners"], "correct_choice_idx": 3, "direct_answers": ["couple", "restaurant workers", "bartenders", "restaurant employees", "shop owners", "business people", "owners", "bartenders", "bartenders", "owners"], "difficult_direct_answer": false, "rationales": ["They are the shop owners because they are behind the counter", "These two people are the owners of a shop specializing in selling liquor products.", "They own the bar."], "image": "train2014/COCO_train2014_000000181156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131115, "question_id": "akG5Jv6Xdx3TwuowjYTwZd", "question": "What kind of game is this?", "choices": ["hockey", "cricket", "tennis", "football"], "correct_choice_idx": 1, "direct_answers": ["baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "cricket"], "difficult_direct_answer": false, "rationales": ["The game is cricket.", "Most of the players are involved in cricket.", "This is baseball. it is close to cricket, as it has a batter, but it is not."], "image": "val2014/COCO_val2014_000000131115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449706, "question_id": "akLB7LzPDQHmb8ZR5xtvbo", "question": "Through what kind of area is he leading the donkey?", "choices": ["street", "market", "town", "trail"], "correct_choice_idx": 1, "direct_answers": ["hall", "market", "bazar", "market", "crowded street", "alley shops", "market", "market", "village", "market"], "difficult_direct_answer": false, "rationales": ["The donkey is being led in a market between the different stands", "A person is leading a donkey through a crowded area with various items being displayed.", "The man is trying to get the donkey through a crowded shop area."], "image": "train2014/COCO_train2014_000000449706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59281, "question_id": "akVDPtJ8Qz4SqXE9tVoCbf", "question": "Why are the people wearing bright yellow hats?", "choices": ["cosplay", "fashion", "visibility", "punishment"], "correct_choice_idx": 2, "direct_answers": ["visibility", "rain cover", "school trip", "security staff", "tour group", "all together", "group", "safety", "school safety", "rain protection"], "difficult_direct_answer": true, "rationales": ["The people want to be seen more easily.", "The hats keep the people visible.", "This is so they can be seen easily"], "image": "train2014/COCO_train2014_000000059281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489452, "question_id": "akeSSr3meVyJ9MH8Myb4hk", "question": "The man about to catch the frisbee wears what color of shirt?", "choices": ["blue", "black", "grey", "white"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "blue", "foreplay", "blue", "foreplay", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["His shirt is the same color as the trash can", "The man in the blue shirt is reaching up into the sky.", "His shirt is blue."], "image": "train2014/COCO_train2014_000000489452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17667, "question_id": "amG9aQdQPE6y9rpi6RHbwP", "question": "How is this vessel being propelled?", "choices": ["tugged", "motor", "wind", "paddle"], "correct_choice_idx": 1, "direct_answers": ["propeller", "propeller", "motor", "motor", "gasoline", "motor", "motor", "motor", "motor", "motor"], "difficult_direct_answer": false, "rationales": ["There is a motor shown at the back of the boat.", "Behind the boat is water splashing showing that a fan-like object is moving the boat forward.", "It has a motor."], "image": "val2014/COCO_val2014_000000017667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397599, "question_id": "amUqCkucBX5cPo7gJfFA4c", "question": "Where are repairs taking place here?", "choices": ["sidewalk only", "street", "automobiles", "parking garage"], "correct_choice_idx": 1, "direct_answers": ["road construction", "right lane", "street", "roadway", "road", "road", "yes", "street", "street", "road"], "difficult_direct_answer": false, "rationales": ["The asphalt and yellow lines indicate that this is a street.", "Street renovations are taking place in the road.", "There are repairs taking place on the street top."], "image": "train2014/COCO_train2014_000000397599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384461, "question_id": "amnevofNHaC9hyN6RW7myQ", "question": "What is in the open baggage?", "choices": ["lunch", "her tools", "clothing", "tiny house"], "correct_choice_idx": 3, "direct_answers": ["music", "selling products", "photo frames", "dollhouse furniture", "tiny house", "diorama", "diorama", "display", "art", "decorations"], "difficult_direct_answer": true, "rationales": ["This open suitcase contains a miniature diorama of a living room scene. a living room is inside of a house.", "The person has many miniature items resembling home decor featured inside the suitcase that were intentionally placed and likely there to make one think of the house they might represent on a smaller scale.", "Miniature furniture is visible."], "image": "val2014/COCO_val2014_000000384461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1915, "question_id": "anYBkHh98aZG3MGapLbxQ8", "question": "How is the engine on the red motorcycle cooled?", "choices": ["air", "oil", "antifreeze", "engine coolant"], "correct_choice_idx": 0, "direct_answers": ["air", "air", "air cooling", "exhaust", "stopping", "air", "air", "coolant", "air cooling", "air"], "difficult_direct_answer": false, "rationales": ["There is no casing around the engine.", "The engine is cooled by natural air.", "Motorcycles are cooled with air."], "image": "train2014/COCO_train2014_000000001915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55681, "question_id": "and3CtzHwGptJE6EtuJ2mW", "question": "What is the status of the horse?", "choices": ["turning right", "going straight", "turning left", "stopped"], "correct_choice_idx": 1, "direct_answers": ["police horse", "going straight", "being rode", "worker", "stop", "occupied", "police horse", "walking", "walking", "mounted police"], "difficult_direct_answer": false, "rationales": ["He is at a red sign that tells him to do something.", "The horse is moving forward.", "The horse is facing forward."], "image": "train2014/COCO_train2014_000000055681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477046, "question_id": "anjF6fWvoY377NTTwCXhfL", "question": "What is the greatest existential threat to these great animals?", "choices": ["excessive heat", "hunger", "drowning", "humans"], "correct_choice_idx": 3, "direct_answers": ["climate change", "poachers", "humans", "people", "hunters", "climate change", "hunting", "hunters", "humans", "poachers"], "difficult_direct_answer": false, "rationales": ["Since this seems to be a zoo or wildlife preserve, the giraffes are safe from hunger, drowning, or excessive heat. their lives are most threatened by the direct action of humans.", "The humans could shoot the giraffes.", "Giraffes are grazing with people looking on."], "image": "train2014/COCO_train2014_000000477046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486328, "question_id": "anmzuZCjf7riQv2fVtPLtu", "question": "What is rented for family enjoyment?", "choices": ["scuba gear", "boats", "benches", "kites"], "correct_choice_idx": 1, "direct_answers": ["paddle boats", "kayak", "boat", "boats", "boat", "paddle boat", "boats", "row boat", "paddleboat", "boat"], "difficult_direct_answer": false, "rationales": ["The boats can be rented for the day.", "The boats are rented.", "They look like the ones people paddle."], "image": "train2014/COCO_train2014_000000486328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266866, "question_id": "anoZQcN4BFE9GyV9jtNvgb", "question": "What is the weather like on this day?", "choices": ["snowing", "windy", "raining", "sunny"], "correct_choice_idx": 2, "direct_answers": ["rainy", "raining", "rainy", "rain weather", "rainy", "rainy", "rain", "rainy", "rainy", "rainy"], "difficult_direct_answer": false, "rationales": ["The weather is rainy.", "The person is wearing a coat and using an umbrella. the ground is wet but there is no visible snow.", "Umbrellas are out because it's raining."], "image": "val2014/COCO_val2014_000000266866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552654, "question_id": "anrLVSGJUXDudZwJP7VFyr", "question": "Where are the men located?", "choices": ["resort", "stadium", "office", "gym"], "correct_choice_idx": 0, "direct_answers": ["snow inn", "snowboarded", "snow", "resort", "mountain", "snowy area", "snow", "snow area", "outside", "city"], "difficult_direct_answer": true, "rationales": ["The men are at a ski resort.", "The men are at a resort.", "The men are at a ski resort."], "image": "val2014/COCO_val2014_000000552654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296797, "question_id": "aoDuue3BDWd8Qz5DXnmRib", "question": "How are people being transported here?", "choices": ["taxi", "cable car", "steam train", "mule"], "correct_choice_idx": 1, "direct_answers": ["rail", "bus", "trolley", "trolley", "trolley", "trolley", "cable car", "cable car", "trolley", "trolley"], "difficult_direct_answer": false, "rationales": ["The people being transported are aboard a cable car", "The group of people are entering the train that moves by cables in the middle of a city.", "The car is being pulled along the cable."], "image": "val2014/COCO_val2014_000000296797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249382, "question_id": "aoHxqVT3jMFyjRMLtwzDyP", "question": "What have these children likely practiced?", "choices": ["boxing", "sprinting", "skiing", "swimming"], "correct_choice_idx": 2, "direct_answers": ["snowboarding", "jumps", "snowboarding", "snowboarding", "snowboard jumping", "skiing", "snowboarding", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["These kids seem to be experts at snow sports.", "These children are likely skiing.", "They have practiced skiing because they are snowboarding presently"], "image": "train2014/COCO_train2014_000000249382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173447, "question_id": "aoJEptMoSmiNAXjUYef7hT", "question": "What are the people who watch the frisbee player doing?", "choices": ["swimming", "sleeping", "protesting", "selling"], "correct_choice_idx": 0, "direct_answers": ["swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "swimming", "swimming"], "difficult_direct_answer": false, "rationales": ["These people are in the water.", "They are in a lake up to their chests in the water while watching the boy.", "The people are swimming in the water."], "image": "train2014/COCO_train2014_000000173447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434371, "question_id": "aoMAoNj6J6KtWwpBAEwBSd", "question": "The person who lives here and owns this entertainment area is likely at least how old?", "choices": ["20", "33", "14", "79"], "correct_choice_idx": 1, "direct_answers": ["fifty", "twenty", "young adult", "thirty", "35", "30's", "33", "50", "fourty", "forty"], "difficult_direct_answer": true, "rationales": ["The person is 33.", "The person's tv show preferences indicate they're a young adult.", "The entertainment area looks like it contains some fairly expensive electronic and modern gaming items. the age group would thus be someone in their thirties."], "image": "val2014/COCO_val2014_000000434371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386162, "question_id": "aoMtCqiQSzGvpbD8Pq9pZt", "question": "What is the purpose of the white furry object?", "choices": ["zoom in", "amplify sound", "hold balls", "soften landing"], "correct_choice_idx": 1, "direct_answers": ["microphone", "clean balls", "microphone", "wind protection", "microphone buffer", "amplify sound", "wind measurement", "microphone", "cleaner", "cleaner"], "difficult_direct_answer": false, "rationales": ["The object helps carry sound further.", "The white object allows mics to pick up sound.", "The purpose is to amplify noise."], "image": "train2014/COCO_train2014_000000386162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158421, "question_id": "aog3DMufd5fGx4mBipH4io", "question": "What do the cars do?", "choices": ["deliver pizza", "clean hills", "lift people", "handle emergencies"], "correct_choice_idx": 2, "direct_answers": ["go uphill", "lift people", "travel", "lift", "transport people", "carry skiers", "move", "transport skiers", "transport people", "lift people"], "difficult_direct_answer": false, "rationales": ["The cars all lift people up the slope.", "The cars carry skiers and snowboarders to the top of the hill.", "This transports people quickly to the top of a high mountain"], "image": "val2014/COCO_val2014_000000158421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477647, "question_id": "aomQkYqasi7vANhq35SkZP", "question": "How many degrees of vision does the cat has?", "choices": ["120", "180", "320", "180"], "correct_choice_idx": 0, "direct_answers": ["200", "360", "nine", "85 degrees", "one", "200", "two hundred", "200", "120", "two hundred"], "difficult_direct_answer": false, "rationales": ["They have a little wider vision than people.", "The answer, based on an internet search is 200, so answer c (and d) are closest.", "The cat has a 120 degree angle."], "image": "train2014/COCO_train2014_000000477647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129886, "question_id": "aomSjUA8siB7gdoLtdgCRH", "question": "How will these people get down from this location?", "choices": ["uber", "ski", "lift", "taxi"], "correct_choice_idx": 1, "direct_answers": ["ski lift", "ski", "ski", "skiing", "ride down", "ski lift", "skiing", "ski", "skis", "chair lift"], "difficult_direct_answer": false, "rationales": ["People are going down hill with long things on feet.", "The people will go skiing down the slope.", "People use skis to get down from the top of this location."], "image": "val2014/COCO_val2014_000000129886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159987, "question_id": "aorAfmin3F3GiZZ3RWWgag", "question": "What is there a lot of here?", "choices": ["sand", "dust", "traffic", "snow"], "correct_choice_idx": 2, "direct_answers": ["buses", "buses", "vehicles", "buses", "bus", "buses", "buses", "buses", "buses", "traffic"], "difficult_direct_answer": false, "rationales": ["There are several vehicles", "Traffic happens when cars and buses are backed up in the street.", "The vehicles are all lined up bumper to bumper in traffic."], "image": "train2014/COCO_train2014_000000159987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147653, "question_id": "aouiCMdaeWDdr9BMVhguae", "question": "What is the man on the board doing on the ledge?", "choices": ["flipping", "grinding", "waxing", "manualing"], "correct_choice_idx": 1, "direct_answers": ["skateboarding", "grinding", "grinding", "skateboarding", "grinding", "grinding", "skateboarding", "skateboarding", "skateboarding", "grinding"], "difficult_direct_answer": false, "rationales": ["The other options don't fit his actions.", "He is performing a stunt.", "That's the name of the trick he's doing on the skateboard."], "image": "val2014/COCO_val2014_000000147653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387857, "question_id": "ap3wrExiYRDTjE6VMf7sX3", "question": "Using the long handled pan shown is handy in what situation of cooking?", "choices": ["fireplace", "electric oven", "hot plate", "microwave"], "correct_choice_idx": 0, "direct_answers": ["fireplace", "open fire", "country", "frying", "open fire", "fireplace", "fireplace", "stirring", "fireplace", "over coals"], "difficult_direct_answer": false, "rationales": ["The scene appears to be an old fashioned kitchen where answer a would be the only available cooking method and the pan would have been designed accordingly.", "The other options don't apply to this type of pan or the historic setting.", "The pan would be useful for a fireplace."], "image": "train2014/COCO_train2014_000000387857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168607, "question_id": "apAqHBaFF4yC4QPov8ZNM9", "question": "How many of the ingredients on the dish were cooked by steaming them?", "choices": ["one", "three", "two", "four"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "one", "zero", "one", "one", "one", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["Rice is only one to steam.", "There is steamed rice on top of the plate.", "You have to steam rice to cook it correctly."], "image": "train2014/COCO_train2014_000000168607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473131, "question_id": "apGH7aR4x9ADyFs2WNpTfJ", "question": "Why are the women collecting food in baskets?", "choices": ["to eat", "to sell", "to cook", "to clean"], "correct_choice_idx": 1, "direct_answers": ["to sell", "selling it", "selling", "to sell", "to sell", "selling food", "to sell", "for sale", "to sell", "will sell"], "difficult_direct_answer": false, "rationales": ["That's how the vendors display their products to customers.", "They are displaying items for people to buy", "The women are all collecting fruits and vegetables for sale in the village."], "image": "train2014/COCO_train2014_000000473131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411875, "question_id": "apH8HSN42M3UzEhkSdhb3s", "question": "What shop is shown in the background?", "choices": ["salon", "pet shop", "electronics shop", "furniture shop"], "correct_choice_idx": 0, "direct_answers": ["shoe", "shoe", "shoe store", "not visible", "pet store", "shoe shop", "pet supply", "shoe", "thrift", "salon"], "difficult_direct_answer": false, "rationales": ["A salon is shown.", "The shop is a salon.", "Out of these choices a makes the most sense but the main object seen are shoes."], "image": "val2014/COCO_val2014_000000411875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163255, "question_id": "apURWyH7NJQ5EtX4oBvrEQ", "question": "What is in the glass?", "choices": ["juice", "wine", "beer", "gin"], "correct_choice_idx": 2, "direct_answers": ["wine", "beer", "ale", "ale", "beer", "beer", "ale", "wine", "ale", "malt beverage"], "difficult_direct_answer": false, "rationales": ["The low shape of the bottle and of the glass plus the color of the drink seem to indicate that this is an ale.", "Beer is in the bottle near the glass.", "There is a beer in the glass."], "image": "val2014/COCO_val2014_000000163255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387216, "question_id": "aph97GfW994hN8JxSoGbbc", "question": "How long has this train been sitting here?", "choices": ["5 hours", "many years", "1 day", "1 hour"], "correct_choice_idx": 1, "direct_answers": ["long time", "decades", "years", "maybe decades", "months", "many years", "years", "years", "many years", "decades"], "difficult_direct_answer": false, "rationales": ["This train has been sitting here for many years and is growing old.", "There is a lot of grass growing around the train. it is starting to rust.", "The trains are getting rusted, which means they have'nt been used for years."], "image": "val2014/COCO_val2014_000000387216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139141, "question_id": "apsvWoA6AvLAWtPthipLSa", "question": "What period of the day is shown here?", "choices": ["evening", "afternoon", "night", "morning"], "correct_choice_idx": 2, "direct_answers": ["evening", "night", "night", "night", "night", "nighttime", "night", "night", "night", "recess"], "difficult_direct_answer": false, "rationales": ["Kids are playing games in front of a dark window with the blinds open.", "It's dark outside of the window.", "It is dark outside the window so the sun is down."], "image": "val2014/COCO_val2014_000000139141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195791, "question_id": "aq5FaQXhCZPtYe8ETkLayP", "question": "What hour does the clock behind the man show?", "choices": ["one", "seven", "twelve", "nine"], "correct_choice_idx": 1, "direct_answers": ["seven", "seven", "735", "735", "seven", "seven", "seven", "seven", "seven", "seven"], "difficult_direct_answer": false, "rationales": ["The small hand is on the 7.", "The digits are visible on the clock behind the man. based on the orientation of the hands on the clock, answer a is the hour.", "The clock hand shows the hour seven."], "image": "train2014/COCO_train2014_000000195791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394486, "question_id": "aqDWytxqHDqHnqupHee5a6", "question": "What is the man engaging in?", "choices": ["vandalizing", "playing game", "repairing phone", "chatting"], "correct_choice_idx": 2, "direct_answers": ["speaking", "illegal calling", "phone call", "phone calls", "conversation", "call", "hacking", "phone call", "repairing phone", "repairing wires"], "difficult_direct_answer": true, "rationales": ["The box on the phone is open and it looks like he is testing it out.", "There is an open box of wires next to him and he is on two phones at the same time.", "The man is engaged in repairing the box of phone wires."], "image": "train2014/COCO_train2014_000000394486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44483, "question_id": "aqHW3qRRpabuLzreRCv6NP", "question": "What do people under the umbrellas here do?", "choices": ["knit", "cheer", "dine", "sleep"], "correct_choice_idx": 2, "direct_answers": ["dine", "dine", "eat", "eat", "eat", "eating", "eat", "eat", "eat", "eat"], "difficult_direct_answer": false, "rationales": ["People are sitting at tables near the water and their are placemats and silverware on the tables.", "There are restaurants lining the waterfront and the people sitting at the tables have plates, menus, and salt shakers.", "The people are eating at cafes."], "image": "train2014/COCO_train2014_000000044483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223718, "question_id": "aqJ8pTrSkRfQc8XSwY975P", "question": "If the ball came this way what would stop it?", "choices": ["field player", "net", "fence", "pole"], "correct_choice_idx": 1, "direct_answers": ["fence", "catcher", "net", "net", "fence", "net", "net", "fence", "catcher", "net"], "difficult_direct_answer": false, "rationales": ["Nets are used on baseball and softball fields.", "The image is seen through a transparent checkered object. based on the sport being played and the object, answer a is consistent.", "There is no fence or pole. field players do not stand behind the catcher."], "image": "train2014/COCO_train2014_000000223718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366262, "question_id": "aqKuCf3ZEq2gyuFyEP5UWC", "question": "What is the main cargo carried by the green train engine?", "choices": ["mail", "automobiles", "passengers", "farm equipment"], "correct_choice_idx": 1, "direct_answers": ["cars", "cars", "cars", "automobiles", "automobiles", "cars", "cars", "automobiles", "cars", "cars"], "difficult_direct_answer": false, "rationales": ["There are several cars in the train.", "There are many vehicles on the train", "There are cars on the train cars behind the engine."], "image": "train2014/COCO_train2014_000000366262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98408, "question_id": "aqLFsj2ADcAcQWizA9Pm6E", "question": "Why are they near the middle of the track?", "choices": ["is random", "shortest distance", "afraid", "prevent falling"], "correct_choice_idx": 1, "direct_answers": ["shortest distance", "to turn", "race lane", "racing", "fastest route", "same speed", "going round", "avoid pass", "turning", "they arent"], "difficult_direct_answer": true, "rationales": ["This makes the most sense given that the spaces shrink toward the center.", "They're the shortest distance.", "It helps them get around the track faster because the outer part would be wider and; therefore, a wider distance."], "image": "train2014/COCO_train2014_000000098408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273560, "question_id": "aqd3EmWFnStp5eHy3Ed8gi", "question": "Which kind of relationship is this cake typically designed for?", "choices": ["friendship", "acquaintanceship", "familial", "romantic"], "correct_choice_idx": 3, "direct_answers": ["wedding", "wedding", "wedding", "marriage", "wedding", "wedding", "wedding", "married one", "romantic", "wedding"], "difficult_direct_answer": false, "rationales": ["The cake is white, multi-layered, and contains floral decorations. this is most consistent with the style of a wedding cake.", "A small three tiered wedding cake with flowers around it.", "The white multi layered style of cake with flowers and pearl shaped flourishes is likely for a wedding."], "image": "train2014/COCO_train2014_000000273560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185291, "question_id": "ar8tNSobvQixbdNooHxTRi", "question": "What is the cow entered into?", "choices": ["dance competition", "livestock fair", "typing contest", "milking contest"], "correct_choice_idx": 1, "direct_answers": ["contest", "contest", "contest", "contest", "competition", "livestock fair", "contest", "cow competition", "contest", "show"], "difficult_direct_answer": false, "rationales": ["A cow is being led by a person with a leash and they are indoors.", "The cow is groomed and inside of a photo yard.", "The cow is in a fair."], "image": "train2014/COCO_train2014_000000185291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517596, "question_id": "arYoqQ9u74HcdJzdWyBnCH", "question": "What powers the green section of train shown?", "choices": ["gas", "coal", "electric", "moss"], "correct_choice_idx": 2, "direct_answers": ["train engine", "cables", "electricity", "electric", "wires", "electricity", "electricity", "engine", "electricity", "train"], "difficult_direct_answer": false, "rationales": ["The power is transferred from the electric wires overhead.", "The electricity is powering.", "Electricity powers the green section."], "image": "val2014/COCO_val2014_000000517596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218397, "question_id": "arkn3BxTMA6ahAc6d4B7XE", "question": "What type toys unite these people today?", "choices": ["trucks", "chairs", "toys", "drones"], "correct_choice_idx": 2, "direct_answers": ["kites", "toys", "kite", "kites", "kite", "kites", "kites", "kites", "kites", "kites"], "difficult_direct_answer": false, "rationales": ["They are flying kites.", "Drones are flying in the sky", "Kites are toys."], "image": "val2014/COCO_val2014_000000218397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105388, "question_id": "asG6t6UhRGAmATQ8pdKJYN", "question": "Who cowrote this book with Tom Clancy?", "choices": ["stephen king", "dan brown", "larry bond", "john grisham"], "correct_choice_idx": 2, "direct_answers": ["larry bond", "putnam", "john irving", "putnam", "dan brown", "putnam", "larry bond", "putnam", "john irving", "no one"], "difficult_direct_answer": false, "rationales": ["The tom clancy book shown is red storm rising and an internet search was used to reveal the name of the co-author.", "The co-author of red storm rising is larry bond.", "The displayed tom clancy novel is red storm rising. i performed an internet search on the co-author of that novel."], "image": "train2014/COCO_train2014_000000105388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314724, "question_id": "asJvpuJyE2sqZhvxjTyLye", "question": "What is the term for how the carrot has been prepared?", "choices": ["diced", "shredded", "chopped", "cubed"], "correct_choice_idx": 1, "direct_answers": ["shredded", "julienne", "julienne", "shred", "julienned", "sliced", "shaved", "fried", "thinly sliced", "julienne"], "difficult_direct_answer": false, "rationales": ["There is shredded carrot on the food.", "The carrot is shredded.", "The term is shredded."], "image": "train2014/COCO_train2014_000000314724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227441, "question_id": "asVEamA6Hos7CYJaLHVRsd", "question": "What items can you find inside all the items displayed on the table?", "choices": ["seeds", "worms", "candles", "peels"], "correct_choice_idx": 0, "direct_answers": ["apples", "seeds", "seeds", "apple peeler", "seeds", "apples", "core", "apples", "seeds", "plate"], "difficult_direct_answer": false, "rationales": ["Apple seeds are in apples.", "That is how these fruits continue life", "All these fruits will have seeds inside of them."], "image": "train2014/COCO_train2014_000000227441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256431, "question_id": "asjydFSpUqorNTX33pHab9", "question": "In which country does this woman walk?", "choices": ["canada", "guatamala", "united states", "mexico"], "correct_choice_idx": 2, "direct_answers": ["united states", "united states", "america", "usa", "usa", "usa", "united states", "america", "america", "usa"], "difficult_direct_answer": false, "rationales": ["There are american road signs", "The woman is walking down a street that has shops and signs in english.", "The highway signs are seen on the street. these are the highway signs seen on the street."], "image": "train2014/COCO_train2014_000000256431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117988, "question_id": "at7J8Udg66UvUQURTamgCw", "question": "What illness does the woman in red shirt have?", "choices": ["stomach cancer", "obesity", "paralysis", "covid-19"], "correct_choice_idx": 1, "direct_answers": ["obesity", "obesity", "obesity", "obesity", "obesity", "obesity", "obesity", "obesity", "obesity", "obesity"], "difficult_direct_answer": false, "rationales": ["The woman in the red shirt is overweight.", "The woman in the red shirt is very large.", "The woman in red is large. she is standing, so she is not paralyzed."], "image": "val2014/COCO_val2014_000000117988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147404, "question_id": "atPzsMPbktvc9Zxtk5NpZQ", "question": "Kite festivals and kite designs are mostly popular in which country?", "choices": ["china", "nepal", "japan", "india"], "correct_choice_idx": 0, "direct_answers": ["china", "pakistan", "pakistan", "china", "china", "china", "china", "china", "china", "china"], "difficult_direct_answer": false, "rationales": ["China invented kites.", "The answer is commonly known and internet searchable.", "The festivals are in china."], "image": "val2014/COCO_val2014_000000147404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199819, "question_id": "atQnA7EnLpgjqfCrtPpDrr", "question": "What is between the bananas?", "choices": ["baby", "pumpkin", "ice cream", "notebook"], "correct_choice_idx": 0, "direct_answers": ["child", "child", "boy", "child eyes", "face", "child", "child", "face", "little boy", "baby"], "difficult_direct_answer": false, "rationales": ["There is a small baby's face in between the bananas.", "A baby's face is between the bananas.", "The baby is between."], "image": "val2014/COCO_val2014_000000199819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529981, "question_id": "atXV57Cy7Zr9gAGpYJaAs8", "question": "What animals are present?", "choices": ["giraffe", "deer", "dog", "horse"], "correct_choice_idx": 3, "direct_answers": ["horses", "horses", "horses", "horses", "horses", "horses", "horses", "horses", "horses", "horse"], "difficult_direct_answer": false, "rationales": ["The animal is a horse.", "There are several horses in the picture.", "The animals are clearly visible and have identifiable features as answer a such as hooves, particular style of tail, muzzles, and hair patterns."], "image": "val2014/COCO_val2014_000000529981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458502, "question_id": "atu8gtiiWPWCMmmGcpp8pX", "question": "What is the purpose of the black device they are on?", "choices": ["waiting place", "keep warm", "buying tickets", "move skiers"], "correct_choice_idx": 3, "direct_answers": ["ski lift", "move skiers", "move them", "move people", "transport", "transport", "transport", "transport", "riding", "transport"], "difficult_direct_answer": false, "rationales": ["Skiers can ride the magic carpet lift to go up the mountain.", "The black device moves skiers around on the mountain.", "A black strip of rubber is moving up a snowy mountain and skiers are standing on it."], "image": "train2014/COCO_train2014_000000458502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350825, "question_id": "atxNgqQELxkEgpgbaZz7Pb", "question": "What word most closely relates to the red and green things very close to the lady?", "choices": ["romance", "power", "wealth", "conflict"], "correct_choice_idx": 0, "direct_answers": ["roses", "flowers", "roses", "roses", "bags", "christmas", "fire dangerous", "romance", "flowers", "flowers"], "difficult_direct_answer": false, "rationales": ["Roses are sitting next to a woman who is looking at her phone. people give romantic partners flowers.", "The objects in question are roses based on their color and shape. roses are traditionally given on valentines day and other special romantic occasions.", "The shopping bags the lady is carrying"], "image": "train2014/COCO_train2014_000000350825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12731, "question_id": "auGa4RL2vYXttpzhEhnVx2", "question": "What type of trains are shown here?", "choices": ["miniature", "electric", "tram", "diesel"], "correct_choice_idx": 0, "direct_answers": ["miniature", "passenger", "passanger", "boxcar", "toy", "passenger type", "toy trains", "model trains", "freight", "toy trains"], "difficult_direct_answer": true, "rationales": ["There are miniature trains on top of the model track.", "This is a small display with mini trains and people.", "This is a play train set"], "image": "val2014/COCO_val2014_000000012731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317458, "question_id": "auXR2FvCWGcDJMsiTerLdD", "question": "What artist is associated with the flowers on the bed?", "choices": ["donatello", "van gogh", "freud", "turner"], "correct_choice_idx": 1, "direct_answers": ["wizard oz", "van gogh", "hotel", "van gogh", "van gogh", "monet", "van gogh", "van gogh", "van gogh", "van gogh"], "difficult_direct_answer": false, "rationales": ["Van gough painted sunflowers.", "This same artist cut off his own ear.", "The artist is van gough."], "image": "train2014/COCO_train2014_000000317458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63275, "question_id": "auZSSigoHec9vLuEBvZzJH", "question": "In what kind of store are these toilets and bathtubs displayed?", "choices": ["toy", "hardware", "appliance", "plumbing"], "correct_choice_idx": 0, "direct_answers": ["home goods", "home improvement", "gag store", "event", "home goods", "toy", "electronics", "joke", "knick knack", "toy"], "difficult_direct_answer": false, "rationales": ["This is a small version that would be used as a toy.", "The toy is present.", "These are obviously objects that kids would use."], "image": "train2014/COCO_train2014_000000063275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52943, "question_id": "aufax8okqHa4XufBGp9kK8", "question": "What is in front of the man in the foreground?", "choices": ["elephant", "camper", "ball", "car"], "correct_choice_idx": 2, "direct_answers": ["soccer ball", "ball", "ball", "soccer ball", "soccer ball", "ball", "soccer ball", "soccer ball", "ball", "soccer ball"], "difficult_direct_answer": false, "rationales": ["The front is a ball.", "The man has a soccer ball in front of him.", "There is a small soccer ball in front of the man in the foreground."], "image": "train2014/COCO_train2014_000000052943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17260, "question_id": "aunFdW9tEnoM8agXBKEkkj", "question": "What are the males called?", "choices": ["roosters", "cocks", "gander", "drakes"], "correct_choice_idx": 3, "direct_answers": ["drake", "drakes", "drakes", "drakes", "drakes", "gander", "drake", "drakes", "drakes", "drake"], "difficult_direct_answer": false, "rationales": ["The male ducks are drakes.", "There are several ducks, and the males are called drakes.", "The males are drakes."], "image": "val2014/COCO_val2014_000000017260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161719, "question_id": "auneuJWfkNg5s8RS4GdpDn", "question": "From which type fruit are juices available here?", "choices": ["grapefruits", "grapes", "orange", "lemons"], "correct_choice_idx": 2, "direct_answers": ["citrus", "oranges", "oranges", "orange", "orange", "orange", "oranges", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["There are cartons of a well-known brand in the top right that shows the citrus fruit with a straw coming out of it.", "There are some orange juices on the table.", "This is made from orange juice."], "image": "train2014/COCO_train2014_000000161719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413320, "question_id": "auzjfZSj9aYtavCRap6gQU", "question": "What energy powers this yellow duck?", "choices": ["wind", "electricity", "manual", "solar"], "correct_choice_idx": 2, "direct_answers": ["human", "human paddling", "human pedals", "manual", "feet", "diesel", "pedaling", "paddling", "feet", "paddling"], "difficult_direct_answer": false, "rationales": ["A paddle boat is in the shape of a duck and two people are riding in it.", "The energy is manual.", "A paddle boat is shaped like a duck. paddle boats are powered by pushing the pedals."], "image": "val2014/COCO_val2014_000000413320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97696, "question_id": "avAZK6mQboDHVXhG7tcabb", "question": "How many functional keys in keyboard?", "choices": ["11", "21", "17", "15"], "correct_choice_idx": 0, "direct_answers": ["104", "12", "twelve", "twelve", "unknown", "twelve", "hundred four", "12", "twelve", "11"], "difficult_direct_answer": false, "rationales": ["Eleven keys are functional on the keyboard.", "There are 11 keys.", "There are only 11 functional keyboard keys."], "image": "val2014/COCO_val2014_000000097696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99179, "question_id": "avMEpfVcjX5Kzx8eM4FRru", "question": "What animals are laying in front of the women?", "choices": ["pigeons", "swans", "hens", "turkey"], "correct_choice_idx": 3, "direct_answers": ["turkey", "turkeys", "sleeping", "chicken", "turkeys", "turkeys", "turkey", "birds", "turkey", "turkey"], "difficult_direct_answer": false, "rationales": ["The animals are turkeys.", "They are large with white and brown feathers and red heads.", "Based on the look of the gobble on the birds chin it indicates that the bird is a turkey."], "image": "val2014/COCO_val2014_000000099179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359290, "question_id": "avTSaNC9jeuED8eWHAMxH4", "question": "What is the name of the fenced off area for these horses?", "choices": ["corral", "dirt road", "parking lot", "stage"], "correct_choice_idx": 0, "direct_answers": ["stables", "stable", "road", "pasture", "corral", "pasture", "pen", "corral", "paddock", "corral"], "difficult_direct_answer": false, "rationales": ["This is just what it's called.", "Corral has a double meaning, which is also to get the animals into one place. this keeps them safely fenced in.", "Horses reside in that field with fences."], "image": "train2014/COCO_train2014_000000359290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443086, "question_id": "avVCwBSAStL6wcoeHkFBgK", "question": "What kitchen appliance is he moving?", "choices": ["fridge", "microwave", "stove", "dishwasher"], "correct_choice_idx": 0, "direct_answers": ["refrigerator", "fridge", "refrigerator", "fridge", "refrigerator", "fridge", "refrigerator", "refrigerator", "fridge", "fridge"], "difficult_direct_answer": false, "rationales": ["The appliance is a refrigerator.", "The boxy rectangular shape of this item; with it's handle doors and two compartments, identify it as a refrigerator.", "The appliance is the fridge."], "image": "train2014/COCO_train2014_000000443086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486977, "question_id": "avVhfopcpXUbbFkRwsqvDp", "question": "The game on the television on the wall is being run by which game system?", "choices": ["nintendo wii", "xbox", "nintendo switch", "playstation"], "correct_choice_idx": 0, "direct_answers": ["wii", "wii", "wii", "wii", "wii", "tennis", "nintendo wii", "wii", "wii", "game console"], "difficult_direct_answer": false, "rationales": ["People are holding wii controllers.", "The game is the wii.", "The men are holding wii controllers to play the game."], "image": "train2014/COCO_train2014_000000486977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513657, "question_id": "avgax5DLpwgRVTYBQYrogr", "question": "What country is this?", "choices": ["mexico", "usa", "japan", "uk"], "correct_choice_idx": 2, "direct_answers": ["japan", "senegal", "china", "china", "china", "china", "japan", "japan", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["The country is japan.", "The bus is in japan. it has a japanese name on the side.", "The word on the bus is from an asian culture."], "image": "train2014/COCO_train2014_000000513657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578137, "question_id": "aviZPEo5gtGz6jCb5h67S4", "question": "What is the blue bin on the left for?", "choices": ["biohazardous waste", "compost", "garbage", "recycling"], "correct_choice_idx": 3, "direct_answers": ["recycling", "recycling", "recyclables", "recycling", "recycling", "recyclables", "recycle", "recycling", "recycling", "recycling"], "difficult_direct_answer": false, "rationales": ["Recycling bins are always blue.", "The bin is for recycling.", "The blue tin is a recycling can."], "image": "train2014/COCO_train2014_000000578137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560412, "question_id": "avno3PqkgqFHfu7Ymgz8qh", "question": "Sam Walton is a founder of what?", "choices": ["trends", "adidas", "walmart", "amazon"], "correct_choice_idx": 2, "direct_answers": ["walmart", "walmart", "walmart", "walmart", "walmart", "walmart", "walmart", "walmart", "walmart", "walmart"], "difficult_direct_answer": false, "rationales": ["Sam walton is the father of walmart.", "He founded walmart.", "He is the founder of a big brick and mortar store."], "image": "train2014/COCO_train2014_000000560412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150048, "question_id": "aw5Nhzz8GmDHrmZAwQ5BE7", "question": "What would prevent the lighter cows from visited the darker cows?", "choices": ["tired", "wall", "leash", "rancher"], "correct_choice_idx": 1, "direct_answers": ["water", "walls", "steep", "nothing", "hills", "fence", "fence", "hills", "wall", "kept seperately"], "difficult_direct_answer": false, "rationales": ["There is a wall between the darker and lighter cows so they cannot visit eachother.", "There is a wall preventing the cows from visiting eachother.", "There is a natural type of this carved into the mountain"], "image": "train2014/COCO_train2014_000000150048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307511, "question_id": "awAEGSiEhmJJaxtEfc33rx", "question": "Who is known for using the items on top of this vehicle?", "choices": ["tiger woods", "lakey peterson", "joe frazier", "bo jackson"], "correct_choice_idx": 1, "direct_answers": ["lakey peterson", "griswalds", "lisa andersen", "surfers", "surfer", "surfboards", "surfers", "surfers", "surfers", "surfers"], "difficult_direct_answer": false, "rationales": ["The person lakey peterson is known for surfboarding.", "The items on the car are surfboards and answer a is a surfer who would use surfboards.", "That person uses that car."], "image": "val2014/COCO_val2014_000000307511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426085, "question_id": "awHexizCzw4ScYrLK5SDPM", "question": "What is the shape of kite in the image?", "choices": ["sphere", "bow", "delta", "box"], "correct_choice_idx": 1, "direct_answers": ["triangle", "triangle", "triangle", "triangle", "bow", "bat shaped", "triangle", "triangle", "triangle", "umbrella"], "difficult_direct_answer": false, "rationales": ["Most kites unless they are a specialty item are bowed shaped.", "The kite is bow shaped since it has a pointed top.", "It has sort of a trapezoid look with a point where the \"top of the trapezoid\" is supposed to be. also, those objects are shaped like an arc."], "image": "train2014/COCO_train2014_000000426085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192221, "question_id": "awHpdikMpY4QJ5qsqLiKAj", "question": "Why is the man with his back turned bent over?", "choices": ["judging plays", "stealing base", "coaching player", "curious fan"], "correct_choice_idx": 0, "direct_answers": ["waiting balls", "better view", "watching", "better look", "concentrating", "officiating", "judging plays", "watching", "better view", "good view"], "difficult_direct_answer": false, "rationales": ["The man is an mlb third base umpire responsible for making calls on the field and is bent over because he needs to pay close attention in order to perform the duties of his position.", "The man is trying to get a better view of the play.", "He is in this stance to get the best view of the ball, as he needs to call if the play is illegal or not."], "image": "train2014/COCO_train2014_000000192221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289693, "question_id": "awP7H6Fdejm8FrW8uL7y7b", "question": "What country is this venue located in?", "choices": ["united states", "britain", "france", "italy"], "correct_choice_idx": 1, "direct_answers": ["america", "united states", "france", "america", "usa", "britain", "usa", "usa", "foreign country", "united states"], "difficult_direct_answer": false, "rationales": ["The country is britain.", "The country is in britain given the virgin signs.", "There is a sponsor sign for virgin radio. virgin radio is based in britain."], "image": "train2014/COCO_train2014_000000289693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577582, "question_id": "ax8R4nX6U9dSXGiwudFHVD", "question": "What is one of the biggest risks in this environment?", "choices": ["drowning", "assault", "dog assault", "asphyxiation"], "correct_choice_idx": 0, "direct_answers": ["water", "drowning", "drowning", "drowning", "tsunami", "drowning", "oil spill", "drowning", "drowning", "drowning"], "difficult_direct_answer": false, "rationales": ["The risk is drowning.", "People on the boats could drown.", "Any time a person is on a deep body of water, drowning is always a possibility. it's always best to avoid tragedy by taking safety precautions when venturing out into them."], "image": "train2014/COCO_train2014_000000577582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14297, "question_id": "axCvEy58jkaerRxvTgkqHr", "question": "What style of boats are there on the water?", "choices": ["yachts", "catamarans", "barges", "houseboats"], "correct_choice_idx": 3, "direct_answers": ["houseboat", "houseboats", "simply", "house boat", "houseboats", "house boat", "house", "houseboats", "houseboats", "passenger boats"], "difficult_direct_answer": false, "rationales": ["The boats have the exteriors consistent with houseboats as well as have bikes and other personal possessions on them that might be present if someone lived there.", "The style is houseboats.", "The boats docked in this image show signs they are lived in and they are not large enough to be yachts."], "image": "val2014/COCO_val2014_000000014297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313432, "question_id": "axMueDrwVJPByLExtHvAip", "question": "Which course are the people on the lift probably being brought to?", "choices": ["professional", "beginner", "advanced", "dare devil"], "correct_choice_idx": 1, "direct_answers": ["ski", "ski slope", "alpine", "ski trip", "ski hill", "slalom", "ski slope", "ski run", "beginner", "ski lesson"], "difficult_direct_answer": true, "rationales": ["The people are quite young.", "They are children so they would not have had enough time to have had more experience.", "The majority of the people on the sky lift are young children. children are too young to use advanced courses safely."], "image": "train2014/COCO_train2014_000000313432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192348, "question_id": "axRSaHG9vBveTy3bgKNmjq", "question": "Which device is likely more powerful?", "choices": ["silver", "they're equal", "cannot tell", "green"], "correct_choice_idx": 0, "direct_answers": ["laptop", "left", "laptop", "grey device", "big computer", "silver", "white", "left laptop", "white one", "left laptop"], "difficult_direct_answer": false, "rationales": ["The bigger computer looks more powerful.", "The silver device is larger, indicating it has a larger battery and processor.", "The other one (in background) is a kids' toy."], "image": "train2014/COCO_train2014_000000192348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571797, "question_id": "axaXBB5K3S3Cg9umYsNCt7", "question": "What US state is this lady likely to live in?", "choices": ["illinois", "new york", "ohio", "wisconsin"], "correct_choice_idx": 0, "direct_answers": ["new york", "illinois", "illinois", "illinois", "israel", "illinois", "illinois", "illinois", "illinois", "iowa"], "difficult_direct_answer": false, "rationales": ["That's what il stands for.", "The woman's sticker says illinois.", "You can tell by the abbreviation as to what state she loves."], "image": "train2014/COCO_train2014_000000571797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4988, "question_id": "axedqVpEmPffxZmLX2B43q", "question": "What reason is the glass structure placed here?", "choices": ["bus stop", "sales kiosk", "advertising only", "telephone calls"], "correct_choice_idx": 0, "direct_answers": ["bus stop", "bus shelter", "bus stop", "bus stop", "passenger shelter", "shelter", "bus stop", "shelter", "shelter", "weather protection"], "difficult_direct_answer": false, "rationales": ["It offers protection for people waiting for public transportation", "The bus stop has a large glass coating.", "People wait in here for the bus to arrive."], "image": "val2014/COCO_val2014_000000004988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358494, "question_id": "axj24kxFPDW5UJj5sgjWG5", "question": "What do the things in the foreground usually wear on their feet?", "choices": ["slippers", "boots", "horseshoes", "sandals"], "correct_choice_idx": 2, "direct_answers": ["horse shoes", "horse shoes", "horseshoes", "horseshoes", "horseshoe", "horse shoes", "horseshoe", "horseshoes", "horseshoe", "horseshoes"], "difficult_direct_answer": false, "rationales": ["They are horses, and they often wear horseshoes to protect their feet and go long distances.", "They will put horseshoes on them.", "The things in the foreground are horses and shoes go on their feet."], "image": "val2014/COCO_val2014_000000358494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124907, "question_id": "axknyR3eLQQ5ZwtbFrnYXk", "question": "What type of hat does the man on the right have on?", "choices": ["bowlers cap", "newsboy cap", "bottle cap", "baseball cap"], "correct_choice_idx": 1, "direct_answers": ["newsboy cap", "flat cap", "derby", "bowler", "driving cap", "cap", "cap", "driving cap", "bolo", "ben hogan"], "difficult_direct_answer": false, "rationales": ["It's a newsboy cap.", "The man on the right is wearing a newsboy cap from the 1920s.", "The man has a newsboy paper cap on."], "image": "train2014/COCO_train2014_000000124907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402759, "question_id": "ayfbTneRtBxPzApjMxamFd", "question": "What will be happening here in the very near future?", "choices": ["school class", "church service", "yacht sailing", "large party"], "correct_choice_idx": 3, "direct_answers": ["party time", "wedding", "wedding", "party", "event", "large party", "reception", "wedding", "meal served", "wedding"], "difficult_direct_answer": false, "rationales": ["The party will be held here.", "Tables are set up outside with tablecloths and formal place settings.", "The tables are set for a party."], "image": "train2014/COCO_train2014_000000402759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102910, "question_id": "aykZkoN26fjSv5PkqahZcN", "question": "For whom was this concrete structure made?", "choices": ["skateboarder", "picasso", "art museum", "city prisoners"], "correct_choice_idx": 0, "direct_answers": ["skateboarder", "skateboarders", "skateboarders", "skateboarders", "city", "skateboarders", "skateboarders", "skaters", "cement", "skateboarders"], "difficult_direct_answer": false, "rationales": ["You can tell by the concrete designs that this place is used for skating.", "Skaters practice on concrete places.", "The area is a skate park."], "image": "train2014/COCO_train2014_000000102910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159598, "question_id": "ays3LMk7oSuLjpk2GkgQv2", "question": "What item is needed to get the food in the machine?", "choices": ["string", "password", "phone", "money"], "correct_choice_idx": 3, "direct_answers": ["money", "money", "money", "money", "coins", "money", "money", "money", "money", "burger"], "difficult_direct_answer": false, "rationales": ["You need to put coins or bills in the machine to get the food out.", "Money is used for vending machines.", "You have to pay to get food"], "image": "val2014/COCO_val2014_000000159598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187286, "question_id": "ayxQFCKVKpj3iWDmwzzt62", "question": "For what reason must the cyclists stop here?", "choices": ["green light", "boredom", "they're tired", "train crossing"], "correct_choice_idx": 3, "direct_answers": ["train crossing", "train passing", "train crossing", "train tracks", "train crossing", "bus", "train crossing", "train", "train passing", "train"], "difficult_direct_answer": false, "rationales": ["There is a train going through town so they have to wait until the train is gone to go forward.", "The reason is the train crossing.", "There are tracks that run perpendicular to the road. a vehicle is currently using the tracks."], "image": "val2014/COCO_val2014_000000187286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422461, "question_id": "az9Ksg8gWELJJfbBdZ3jCb", "question": "What type of snack is on the desk?", "choices": ["vegetables", "candy", "fruit", "chips"], "correct_choice_idx": 1, "direct_answers": ["mins", "candy mints", "candy", "jelly beans", "candy", "candy", "candy", "mms", "laptop", "mom's"], "difficult_direct_answer": false, "rationales": ["A small bowl with multi-colored circular items are visible. the look, color, and shape denotes candy.", "We can see packages labelled 'm&m' on the left of the desk and a dish of jelly bean shaped snacks on the right. these are both candy.", "The snack is candy."], "image": "train2014/COCO_train2014_000000422461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104541, "question_id": "azESKqUgs9miHWGWEPTSDr", "question": "What is the largest pizza on top of?", "choices": ["wooden board", "paper plate", "aluminum tray", "tray"], "correct_choice_idx": 3, "direct_answers": ["mat", "counter", "blanket", "pizza", "tray", "towel", "table", "pan", "pizza tray", "towel"], "difficult_direct_answer": true, "rationales": ["Pizza is served on a flat round pan.", "These types of food are piping hot and need to be served with something that will hold them in place.", "The pizzas are all inside trays, in which they were used to transport and cook inside the oven."], "image": "train2014/COCO_train2014_000000104541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364437, "question_id": "azLKJFzyz5EkwpfU7kvyGy", "question": "What material is the small white cup next to the mouthwash bottle made out of?", "choices": ["metal", "paper", "ceramic", "plastic"], "correct_choice_idx": 1, "direct_answers": ["paper", "paper", "bottle", "paper", "paper", "bottle", "paper", "paper", "paper", "paper"], "difficult_direct_answer": false, "rationales": ["The cup is disposable. it would cost too much money to make disposable cups out of ceramic, plastic, or metal.", "The kinds of cups next to mouthwash are usually single-use, so they are made of disposable materials. this could be located in a hotel or public bathroom.", "The cups are made of paper."], "image": "train2014/COCO_train2014_000000364437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437778, "question_id": "azpAkKFVQUtgeeCs2G4Fhp", "question": "Why is he standing in the middle of the street?", "choices": ["is confused", "is lost", "police officer", "is angry"], "correct_choice_idx": 2, "direct_answers": ["guide officer", "guard", "directing traffic", "guide traffic", "showing direction", "police officer", "directing traffic", "safety", "directing traffic", "instructing"], "difficult_direct_answer": false, "rationales": ["Traffic officers routinely help direct traffic.", "A an in a dark colored uniform is in the street. police direct traffic sometimes.", "He is controlling traffic."], "image": "train2014/COCO_train2014_000000437778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270254, "question_id": "azvZVMGxTbgZ4USGHrPCYR", "question": "What might be used to make something like this go?", "choices": ["oars", "engines", "nuclear power", "fire"], "correct_choice_idx": 0, "direct_answers": ["paddles", "oars", "oars", "oars", "oar", "oar", "wind", "paddle", "oars", "oar"], "difficult_direct_answer": false, "rationales": ["The oars are used.", "There is no engine on the boats. you would move them by rowing.", "These are boats which are controlled with oars."], "image": "train2014/COCO_train2014_000000270254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293647, "question_id": "b27EC7xmaS2JAQMzMqc7ZF", "question": "The Harley police bikes are iconic bikes of police force in?", "choices": ["uk", "australia", "france", "us"], "correct_choice_idx": 3, "direct_answers": ["secret service", "us", "secret service", "classic style", "united states", "united states", "new york", "miami", "cities", "secret service"], "difficult_direct_answer": false, "rationales": ["The text before secret service on the side of the bike indicates the country where these bikes are used.", "Harley are used by motorcycle cops.", "This police vehicle is in the united states, as evidenced by the us license plates and writing on the motorcycle. harley motorcycles are associated with the united states culturally."], "image": "train2014/COCO_train2014_000000293647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329789, "question_id": "b2joc6ickN9XWa8TKN4apo", "question": "What type of meat is being consumed?", "choices": ["pepperoni", "chicken", "goat", "ham"], "correct_choice_idx": 0, "direct_answers": ["pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "pepperoni", "taste", "pepperoni", "pepperoni"], "difficult_direct_answer": false, "rationales": ["The man is eating pepperoni pizza.", "You can tell by the color and shape as to what he is eating.", "Those kinds of slices are very common on pizza."], "image": "val2014/COCO_val2014_000000329789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54304, "question_id": "b2tZUvBbBUV4n54CrCR6fZ", "question": "What is the red object in the food the man is eating?", "choices": ["apple", "pepper", "cherry", "tomato"], "correct_choice_idx": 2, "direct_answers": ["cherry", "cherry", "cherry", "cherry", "cherry", "cherry", "cherry", "cherry", "cherry", "cherry"], "difficult_direct_answer": false, "rationales": ["The red food is a cherry on the whipped cream.", "It's a cherry on top.", "The red object is a cherry on the dessert."], "image": "train2014/COCO_train2014_000000054304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477111, "question_id": "b32fVjsNp4xmJdSkZjJZbr", "question": "What game are they playing?", "choices": ["running", "fetch", "stretching", "tag"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "frisbee", "catch frisbee", "fetch", "board", "frisbee", "board", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The speed shown in the orange dog expresses a need to get the toy back to its owner asap. this is usually done when said human has just thrown the toy for the dog to retrieve.", "Two dogs, one with a frisbee in its mouth are running together.", "The dogs are playing with the frisbee."], "image": "val2014/COCO_val2014_000000477111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569889, "question_id": "b35iDq4bYivrBMQQgSainW", "question": "What kind of bus is the white vehicle?", "choices": ["tourist bus", "school bus", "double decker", "public bus"], "correct_choice_idx": 0, "direct_answers": ["tour", "motorcoach", "passenger bus", "he", "passenger bus", "he", "shuttle", "long distance", "tourist bus", "shuttle"], "difficult_direct_answer": false, "rationales": ["It is to move people around who are visiting the area", "The people are seen waiting to book the bus to different location.", "A tourist bus is shown."], "image": "train2014/COCO_train2014_000000569889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534443, "question_id": "b35vaNQejbQUjJtQSBPwog", "question": "What is the name of the platform used to walk out on the water?", "choices": ["tub", "plank", "pier", "steps"], "correct_choice_idx": 2, "direct_answers": ["pier", "pier", "pier", "dock", "pier", "pier", "dock", "dock", "dock", "pier"], "difficult_direct_answer": false, "rationales": ["A wood walkway extend out onto the water.", "It is a platform extending from the shore over the water and supported by pillars that is used to secure, protect, and provide access to ships or boats.", "A pier is a long wooden platform used to walk out to the water."], "image": "train2014/COCO_train2014_000000534443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202818, "question_id": "b3EW2Vd9YSFBDWbaLcu8cz", "question": "The vehicle used to tow here is meant to be used where normally?", "choices": ["street", "city", "farm", "demolition derby"], "correct_choice_idx": 2, "direct_answers": ["farm", "bus service", "junk yard", "school", "bus service", "farming crops", "school bus", "farm", "tractor", "farmland"], "difficult_direct_answer": false, "rationales": ["There is a tractor pictured.", "This is farm equipment.", "Tractors can be found commonly on farms."], "image": "train2014/COCO_train2014_000000202818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188417, "question_id": "b3FBUHFJYnF8MUK4Dh5kfw", "question": "How will the people here get back down?", "choices": ["taxi", "ski", "boat", "ski lift"], "correct_choice_idx": 1, "direct_answers": ["ski down", "ski", "hike", "skis", "ski", "ski downhill", "ski", "hike", "ski", "hike"], "difficult_direct_answer": false, "rationales": ["The people are at a snow covered hill. they have their skiis with them.", "The people will ski.", "The two people are sitting at the top of a mountain with skis so they would go down using the skis."], "image": "val2014/COCO_val2014_000000188417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376410, "question_id": "b3fTgLCTDTuLH7dsPGvMCC", "question": "What is causing the white smoke on the right?", "choices": ["train", "weather", "firepit", "torch"], "correct_choice_idx": 0, "direct_answers": ["train", "train", "smoke", "train", "train", "fire", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["A train is moving through with smoke billowing above. trains cause smokestacks.", "Railroad tracks are used for trains like one seen in the background. trains are known to burn fuel of which smoke is a byproduct.", "It has a steam engine"], "image": "train2014/COCO_train2014_000000376410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62230, "question_id": "b3rMSyBLQh48d4i4p3ca9x", "question": "What is number 1's first name?", "choices": ["delaney", "amar'e", "patrick", "lebron"], "correct_choice_idx": 1, "direct_answers": ["albert", "basketball", "stoudemere", "amar'e", "amare", "stoundemere", "amar", "amar'e", "amarte", "stonemore"], "difficult_direct_answer": true, "rationales": ["Number one is called amare.", "The person is amare.", "His last name is on the back of his shirt."], "image": "val2014/COCO_val2014_000000062230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391509, "question_id": "b3uTUV3LALm5fCbqyQhm53", "question": "During which season is this person waiting at the bus stop?", "choices": ["fall", "summer", "spring", "winter"], "correct_choice_idx": 3, "direct_answers": ["winter", "fall", "winter", "fall", "winter", "winter", "winter", "winter", "siting", "fall"], "difficult_direct_answer": false, "rationales": ["It's cold out since the person is in a jacket.", "The season is winter.", "This person is waiting at the bus stop in winter."], "image": "train2014/COCO_train2014_000000391509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257513, "question_id": "b487wZofH2rME3FHsXicw5", "question": "Where might these sleeping quarters be located?", "choices": ["home", "train", "bus", "car"], "correct_choice_idx": 1, "direct_answers": ["train", "ship", "bed", "train", "top", "train", "train", "bunk beds", "train", "van"], "difficult_direct_answer": false, "rationales": ["The quarters are on a train.", "Trains have tight sleeping quarters.", "There are sleeping quarter on a train."], "image": "train2014/COCO_train2014_000000257513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51763, "question_id": "b4oEPYm5URdafHUZGoMpeK", "question": "What meal is being served?", "choices": ["breakfast", "lunch", "dinner", "afternoon tea"], "correct_choice_idx": 1, "direct_answers": ["dessert", "lunch", "lunch", "lunch", "lunch", "dessert", "lunch", "dessert", "breakfast", "lunch"], "difficult_direct_answer": false, "rationales": ["There is soda and a sandwich.", "It is a sandwich with a soda", "Lunch consists of sandwiches."], "image": "train2014/COCO_train2014_000000051763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473665, "question_id": "b52psXwzzKite8N3gy8Wm2", "question": "What is the purpose of this vehicle?", "choices": ["school bus", "speed", "carry cargo", "carry passengers"], "correct_choice_idx": 3, "direct_answers": ["transportation", "transportation", "transport people", "transport", "travel", "carry passengers", "transportation", "transport passengers", "transportion", "transportation"], "difficult_direct_answer": false, "rationales": ["The purpose is carrying people.", "This vehicle is used for passenger carrying.", "It has a large number of seats and is a bus."], "image": "train2014/COCO_train2014_000000473665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188510, "question_id": "b5aAMksqrTTMU6YAs87UJi", "question": "Where is the loudest sound coming from?", "choices": ["window", "little boy", "stuffed bear", "piano"], "correct_choice_idx": 3, "direct_answers": ["piano", "piano", "piano", "piano", "piano", "piano", "piano", "piano", "piano", "piano"], "difficult_direct_answer": false, "rationales": ["A piano is likely louder than a calm baby.", "The kid is playing the piano.", "The stuffed bear and window are not capable of making noises. the little boy is playing a loud musical instrument."], "image": "train2014/COCO_train2014_000000188510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519193, "question_id": "b5kjf4nqnVFzM7vST9JVkb", "question": "In which type sales shop do these kids stand?", "choices": ["apple", "ms packman", "grocery", "wii nintendo"], "correct_choice_idx": 3, "direct_answers": ["wii nintendo", "gaming area", "electronics", "video games", "gaming", "electronics", "electronics", "game", "gaming shop", "video game"], "difficult_direct_answer": false, "rationales": ["These kids are playing the nintendo wii.", "The shop sells wii.", "The boy is holding the controller to a wii because it has a nunchuck."], "image": "val2014/COCO_val2014_000000519193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127084, "question_id": "b5psxGGYfgrWexGxfHSPnz", "question": "What is the center piece of the room?", "choices": ["tv", "chair", "couch", "fire place"], "correct_choice_idx": 3, "direct_answers": ["fireplace", "fireplace", "coffee table", "coffee table", "fire place", "table", "coffee table", "coffee table", "fireplace", "coffee table"], "difficult_direct_answer": false, "rationales": ["There is a fireplace in the center of this room.", "The fireplace is in the middle of the room.", "The tv, couch, and chair are off to the sides of the room."], "image": "train2014/COCO_train2014_000000127084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248488, "question_id": "b5rrzwTyyjmMBm6UP7Lwjf", "question": "What type of lot is this?", "choices": ["fairground", "bus depot", "used car", "new car"], "correct_choice_idx": 1, "direct_answers": ["bus", "bus storage", "bus", "bus parking", "parking", "bus depot", "parking lot", "bus", "bus", "bus lot"], "difficult_direct_answer": false, "rationales": ["There are a lot of buses parked.", "The lot is for buses.", "There are only large people transporters parked there."], "image": "val2014/COCO_val2014_000000248488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142570, "question_id": "b5yKYYQjXLdaXrmHPNEvuc", "question": "What food is shown in the boats?", "choices": ["pizza", "hot dog", "hamburger", "tacos"], "correct_choice_idx": 1, "direct_answers": ["hot dogs", "hot dogs", "hot dog", "hot dog", "hot dog", "hot dogs", "hot dog", "hot dogs", "hot dogs", "hot dogs"], "difficult_direct_answer": false, "rationales": ["There are some hot dogs on top of the counter.", "You can tell by the setting and the toppings as to what they are eating.", "Rectangular boxes with long buns and hotdogs are on a counter with drinks."], "image": "train2014/COCO_train2014_000000142570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82676, "question_id": "b63ABNYxXEJMBbtUagDgDR", "question": "What type of road are the small motorcycles created for?", "choices": ["roadways", "highways", "bike lanes", "trails"], "correct_choice_idx": 3, "direct_answers": ["sidewalk", "pavement", "dirt", "street", "dirt", "grass", "trails", "off road", "dirt", "dirt"], "difficult_direct_answer": false, "rationales": ["The bikes are used for roads and trails.", "The tires have large tread. large tread can find better traction in the dirt.", "The motorcycles have versatile rubber tires designed for off-road travel. the type of bike can be seen driving through mud and dirt."], "image": "train2014/COCO_train2014_000000082676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36278, "question_id": "b65UDehuu7k7o5PqFXaCd9", "question": "Why are flowers in the vase?", "choices": ["food", "experiment", "centerpiece", "transport"], "correct_choice_idx": 2, "direct_answers": ["decoration", "daisy", "daisies", "water supply", "decoration", "keep fresh", "decoration", "beautify home", "cheery", "centerpiece"], "difficult_direct_answer": false, "rationales": ["The flowers are a centerpiece.", "The flowers are a centerpiece on the table.", "The flowers in the vase are the centerpiece of this kitchen area."], "image": "train2014/COCO_train2014_000000036278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365271, "question_id": "b6BkomfcabM3iPqYGtz96m", "question": "Which advertiser is a watch company?", "choices": ["armitron", "bud", "hershey's", "kroger"], "correct_choice_idx": 0, "direct_answers": ["armitron", "armitron", "armitron", "bud", "armitron", "armitron", "armitron", "possibly armitron", "armitron", "armitron"], "difficult_direct_answer": false, "rationales": ["Armitron is a watchmaker.", "Armitron makes watches.", "Armitron has been making watches for men and women for many years."], "image": "train2014/COCO_train2014_000000365271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332591, "question_id": "b6Mu2iSaKoLgcZFZnCcSAL", "question": "What type of shot is being taken here?", "choices": ["avoidance", "serve", "backhand", "return"], "correct_choice_idx": 3, "direct_answers": ["small short", "action", "bunt", "backhand", "forehand", "backhand", "front hand", "backhand", "return", "swing"], "difficult_direct_answer": false, "rationales": ["This player is hitting the ball back to his opponet.", "They are hitting the ball back.", "He is standing catty corner to the opposing serving square. a serve is hit cross court."], "image": "train2014/COCO_train2014_000000332591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543897, "question_id": "b6YZDKFuVJ3AZoeD38crCq", "question": "What type of coat does the sleeping cat have?", "choices": ["calico", "solid", "tabby", "brown mackerel"], "correct_choice_idx": 0, "direct_answers": ["calico", "calico", "fur", "fur", "calico", "calico", "patchwork", "calico", "short hair", "fur"], "difficult_direct_answer": false, "rationales": ["The cat is multi-colored and based on the visible colors and patterning, answer a is the term to describe this kind of cat.", "A cat with black, brown, and white fur is sleeping.", "The different colors, black, white, orange, tan, and the distinctive pattern are all common to the calico breed."], "image": "val2014/COCO_val2014_000000543897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418221, "question_id": "b6nxGteMnigCowAPKfEu48", "question": "What is needed for this sport?", "choices": ["wind", "sun", "water", "snow"], "correct_choice_idx": 3, "direct_answers": ["snow", "snowboard", "snow raft", "enjoy snow", "snowboards", "skateboard", "snow board", "snowboard", "snowboard", "snow"], "difficult_direct_answer": false, "rationales": ["Snow is needed for snowboarding.", "You need to have snow on the ground.", "The other three options are not required and can occur without necessarily impact the observed sport."], "image": "train2014/COCO_train2014_000000418221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458953, "question_id": "b6rAZdBqWri2TKQLkVRLC2", "question": "How are the objects in the sky controlled?", "choices": ["string", "computer", "magnets", "remote"], "correct_choice_idx": 0, "direct_answers": ["wind", "remotes", "string", "string", "line", "rope", "strings", "string", "kite lines", "string"], "difficult_direct_answer": false, "rationales": ["The kite is on a string.", "The objects in the sky are kites. kites are controlled with an object that is attached on one end to a kite, and the other end of this object is usually held in a person's hand.", "When flying kites they have to be held by something."], "image": "val2014/COCO_val2014_000000458953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518109, "question_id": "b6tSxSasgnutpfgqxvzDwQ", "question": "In what department does this man stand?", "choices": ["customer service", "housewares", "electronics", "checkout"], "correct_choice_idx": 2, "direct_answers": ["electronics", "electronics", "sales", "computing", "electronics", "electronics", "computer", "electronics", "electronics", "electronics"], "difficult_direct_answer": false, "rationales": ["The man is using a laptop.", "He is standing in front a laptop computer which is for sale, and a laptop is a type of electronic device.", "The man is using a computer which is considered an electronic."], "image": "val2014/COCO_val2014_000000518109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138488, "question_id": "b6xqaDGQwhY6gTSDgPreo9", "question": "What is the type of vehicle the people are riding?", "choices": ["motor horse", "motor scooter", "bicycle", "motorbike"], "correct_choice_idx": 1, "direct_answers": ["motorcycle", "moped", "motor scooter", "bike", "scooter", "scooter", "scooter", "moped", "motor scooter", "moped"], "difficult_direct_answer": false, "rationales": ["You can tell by the width and design as to what type it is.", "The vehicle is a motorbike.", "The people are visibly on a two wheeled vehicle with a built in footrest. this type of vehicle based on these features would be considered answer a."], "image": "val2014/COCO_val2014_000000138488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354248, "question_id": "b6zidGU98sCRBySfKGdeb4", "question": "What street do these people wait on?", "choices": ["main", "elm", "walnut", "fremont"], "correct_choice_idx": 3, "direct_answers": ["fremont", "fremont", "fremont ave", "fremont", "fremont", "fremont avenue", "fremont avenue", "fremont aven", "fremont", "freemont maybe"], "difficult_direct_answer": false, "rationales": ["The street is fremont.", "The man in black is holding a street sign that indicates the name of the street.", "The sign says fremont."], "image": "train2014/COCO_train2014_000000354248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242520, "question_id": "b7UYhEzArygQtZFhD5TgoQ", "question": "What can these animals get to that a dog could not?", "choices": ["windows", "leaves", "walls", "sausages"], "correct_choice_idx": 1, "direct_answers": ["trees", "tree branch", "rooftop moss", "rooftop", "leaves", "eat leaves", "tree", "roof", "roof", "tree leaves"], "difficult_direct_answer": true, "rationales": ["They are tall and have longs necks so they can easily reach the leaves on the branches of the tree. dogs are too short and cannot climb trees.", "These animals could reach the leaves with their giant necks.", "They're high up on trees, which only the giraffes can reach."], "image": "train2014/COCO_train2014_000000242520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568311, "question_id": "b7WwQhjcBNiEsQQqQ2RWJo", "question": "What is the top level of the buses used for?", "choices": ["sleeping", "sightseeing", "shipping", "exercising"], "correct_choice_idx": 1, "direct_answers": ["passengers", "carrying passengers", "passengers", "holding passengers", "more people", "sightseeing", "hold passengers", "people", "sight seeing", "sightseeing"], "difficult_direct_answer": false, "rationales": ["The top level of the bus is for tourists.", "The top is for tourists.", "A much better view of a town is gained by being a few feet above traffic, which double-deckers accomplish. double-deckers were first used in england by horse-drawn buses in 1847."], "image": "train2014/COCO_train2014_000000568311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24189, "question_id": "b7uSHb3EAyQA3WhBGcbxwX", "question": "These animals belong to what family?", "choices": ["bovidae", "felidae", "equidae", "canidae"], "correct_choice_idx": 0, "direct_answers": ["sheep", "sheep", "bovidae", "ungulate", "sheep", "sheep", "bovidae", "sheep", "bovidae", "bovie"], "difficult_direct_answer": false, "rationales": ["Goats are members of the family, which also includes antelopes, cattle and sheep.", "They belong to bovidae family.", "The bovidae comprise the biological family of cloven-hoofed, ruminant mammals."], "image": "train2014/COCO_train2014_000000024189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494552, "question_id": "b82dpjP2hU2jCjbTxfC6kN", "question": "What sort of business do these chairs belong to?", "choices": ["cafe", "table", "chair", "garage"], "correct_choice_idx": 0, "direct_answers": ["restaurant", "party", "restaurant", "cafe", "party", "restaurant", "restaurant", "restaurant", "restaurant", "diner"], "difficult_direct_answer": false, "rationales": ["The chairs are outdoors and next to tables at which one might eat. outdoor eating facilities are often associated with cafes.", "The chairs and table belong to a sidewalk cafe, which attracts both humans and birds! a cup of coffee at a sidewalk cafe is a fine way to people-watch and enjoy the day.", "This looks like a small restaurant."], "image": "train2014/COCO_train2014_000000494552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17409, "question_id": "b8Fj93j4XokFS6CqUkoqQb", "question": "How old is this beverage maker?", "choices": ["150 years", "60 years", "40 years", "200 years"], "correct_choice_idx": 2, "direct_answers": ["40 years", "46 years", "1989", "49 years", "1972", "49 years", "fourty five", "49 years", "100", "not clear"], "difficult_direct_answer": false, "rationales": ["The maker is 40 years.", "Nicolas feuillatte started the company in 1972.", "The beverage maker on the label is 40 years old."], "image": "val2014/COCO_val2014_000000017409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358100, "question_id": "b8HDd8zrF8XNzaxhZX9uct", "question": "What style of vehicle is the taxi cab?", "choices": ["sedan", "truck", "suv", "compact"], "correct_choice_idx": 0, "direct_answers": ["taxi", "yellow car", "compact car", "for hire", "sedan", "sedan", "sedan", "sedan", "sedan", "yellow car"], "difficult_direct_answer": false, "rationales": ["Its a four door sedan.", "The taxi is a four-door car with a trunk and plenty of room. this is known as a sedan.", "There is a small sedan to the front of this big church."], "image": "train2014/COCO_train2014_000000358100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87507, "question_id": "b8SdGcguyhcwX2Q7MZUED2", "question": "What is a person doing behind the shades?", "choices": ["sock puppets", "drumming", "stabbing", "selling burgers"], "correct_choice_idx": 2, "direct_answers": ["stabbing woman", "stabbing woman", "stabbing someone", "stabbing", "commit murder", "singing", "stabbing", "stabbing", "murder", "killing"], "difficult_direct_answer": false, "rationales": ["A person is holding a knife out.", "The scene shows a silhouette of a person being stabbed with a knife.", "The shadow appears to be a man attacking a woman with a knife."], "image": "train2014/COCO_train2014_000000087507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398025, "question_id": "b8UeYvmJBxh9iv7kuDZNrV", "question": "In what country is this street found?", "choices": ["japan", "north korea", "south korea", "china"], "correct_choice_idx": 0, "direct_answers": ["usa", "usa", "japan", "china", "indonesia", "china", "japan", "america", "japan", "china"], "difficult_direct_answer": false, "rationales": ["The man is in japan.", "The people and the wording around the area look like they are from japan.", "This is in japan."], "image": "train2014/COCO_train2014_000000398025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290400, "question_id": "b8hyhky6w2WTwPUac4D35n", "question": "What time is it here?", "choices": ["527 am", "527 pm", "525 am", "525 pm"], "correct_choice_idx": 3, "direct_answers": ["525 pm", "ten pm", "dusk", "five twenty", "525 pm", "four", "525", "1120pm", "1125", "525 pm"], "difficult_direct_answer": false, "rationales": ["The sun is setting.", "The clock reads that it's 5:25.", "The time on the clock is 525 and the time of day is evening."], "image": "train2014/COCO_train2014_000000290400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187187, "question_id": "b956AjRFhNnqSvCM3Gv37A", "question": "The space is most likely holding what?", "choices": ["trial", "class", "conference", "worship service"], "correct_choice_idx": 2, "direct_answers": ["art", "computer conference", "conference", "meeting", "laptop", "desk", "conference", "computers", "college class", "laptop"], "difficult_direct_answer": false, "rationales": ["This space might be holding a conference exhibition.", "There are trifold presentations in the background.", "There are display boards and computers"], "image": "val2014/COCO_val2014_000000187187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440900, "question_id": "b9PitBT38x7kuphvFdhFcH", "question": "What are cows without horns?", "choices": ["belgium blue", "polled livestock", "gelbvieh", "swiss breed"], "correct_choice_idx": 1, "direct_answers": ["polled livestock", "female cows", "heifer", "females", "not bulls", "polled", "polled", "heifers", "milk cows", "polled"], "difficult_direct_answer": false, "rationales": ["Cows are in field with no horns. animals that don't have horns that normally do are referred to as polled.", "They have been de-horned through selective breeding.", "The cows are polled."], "image": "val2014/COCO_val2014_000000440900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438995, "question_id": "b9mWshk4e8h6f8eukbS5Q2", "question": "What sort of institution is shown here?", "choices": ["community center", "hospital", "church", "university"], "correct_choice_idx": 3, "direct_answers": ["health services", "health services", "university", "university", "bed breakfast", "university", "college", "college", "university", "school"], "difficult_direct_answer": false, "rationales": ["A large building has informational signs and large clock on a lamppost.", "The institution is a school.", "A sign with words pertaining to students is visible, and the building is part of the school."], "image": "val2014/COCO_val2014_000000438995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205981, "question_id": "b9q8YHLfKXtVVAdKbwp2ev", "question": "What is this type of snowboard trick called?", "choices": ["grinding", "nollie", "360 flip", "ollie"], "correct_choice_idx": 0, "direct_answers": ["grinding", "alley oop", "airborne", "jump", "rail riding", "grind", "skating", "grinding", "rail grind", "railsliding"], "difficult_direct_answer": true, "rationales": ["The trick is grinding.", "The snowboarder has one side of the snowboard on the rail.", "The snowboard is visible in contact with a rail. this type of trick for board sports is known as a grind."], "image": "train2014/COCO_train2014_000000205981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171997, "question_id": "b9riJUn3AkJBS8LJedeySd", "question": "What base is the nearest blue shirted person close to?", "choices": ["second", "last", "first", "home"], "correct_choice_idx": 3, "direct_answers": ["second", "home plate", "third", "home", "third", "third", "third base", "third", "home", "third"], "difficult_direct_answer": false, "rationales": ["The kids are playing near home base.", "He is near home plate.", "Baseball players are standing next to a base and another can be seen directly behind and to the right a bit."], "image": "train2014/COCO_train2014_000000171997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353610, "question_id": "bAfbcAuHiyEi57YztpvS4o", "question": "What is the man in the beanie using the black device to do?", "choices": ["to exercise", "to game", "take pictures", "to eat"], "correct_choice_idx": 2, "direct_answers": ["take picture", "take picture", "take picture", "take picture", "take picture", "take picture", "take photos", "take pictures", "take photo", "take pictures"], "difficult_direct_answer": false, "rationales": ["The lens of the camera is visible so he is definitely taking pictures.", "The other options don't apply. he's obviously holding a digital camera.", "He is holding a camera up to his eyes. cameras capture images."], "image": "train2014/COCO_train2014_000000353610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470381, "question_id": "bArP6WGRQSPezGjSNxXxtS", "question": "What are they doing?", "choices": ["answering questions", "arguing", "eating pizza", "resting"], "correct_choice_idx": 2, "direct_answers": ["eating pizza", "eating pizza", "eating", "eating", "eating pizza", "eating", "eating pizza", "eating", "eating pizza", "eating pizza"], "difficult_direct_answer": false, "rationales": ["They're having pizza.", "The soldiers are definitely enjoying some pizza during what appears to be a training session of some sort. the man on the left seems to be leading the meeting, and the somber attendees are paying close attention.", "The men have pizza."], "image": "train2014/COCO_train2014_000000470381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452398, "question_id": "bB2PGavhf6WQzEcJavk9Vf", "question": "What is this train built for?", "choices": ["passengers", "speed", "livestock", "freight"], "correct_choice_idx": 0, "direct_answers": ["railway", "tourism", "commuting", "passengers", "transportation", "passengers", "passengers", "passengers", "passengers", "passengers"], "difficult_direct_answer": false, "rationales": ["The train is build for people to ride to their destination.", "The modest length of this train, with windows all along it and seats lining it, tells us it is for passengers.", "Most vehicles of this sort are used for public transportation."], "image": "train2014/COCO_train2014_000000452398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436354, "question_id": "bB66BjkRpCG5ZWuADGXreL", "question": "Which food is usually eaten by athletes after running?", "choices": ["banana", "cucumber", "carrot", "tomato"], "correct_choice_idx": 0, "direct_answers": ["banana", "bananas", "veggies", "bananas", "vegetables", "banana", "bananas", "banana", "carrots", "banana"], "difficult_direct_answer": false, "rationales": ["Bananas have potassium.", "They eat bananas to get extra carbs.", "Most athletes eat bananas to restore stomach acids after running."], "image": "train2014/COCO_train2014_000000436354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 790, "question_id": "bBWAV88GdCYS6amKfiYqvR", "question": "Why are there candles in the cake in front of the woman?", "choices": ["for light", "her graduation", "her birthday", "decoration"], "correct_choice_idx": 2, "direct_answers": ["birthday", "birthday", "birthday", "birthday", "birthday", "birthday", "birthday", "her birthday", "birthday", "birthday"], "difficult_direct_answer": false, "rationales": ["Candles are usually used for birthday celebrations.", "The candles are for her birthday.", "Birthdays are traditionally celebrated with candles on a cake."], "image": "train2014/COCO_train2014_000000000790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86556, "question_id": "bBcKYWDN5NE6bJYp5CtM7N", "question": "Why are they so far apart?", "choices": ["too crowded", "quiet time", "afraid", "strangers"], "correct_choice_idx": 1, "direct_answers": ["both concentrating", "doing work", "each working", "desk", "working individuallly", "big bed", "working", "working", "working", "quiet time"], "difficult_direct_answer": false, "rationales": ["Both people separately seated in this image show deep concentration and attention towards their devices and likely need each other to be quiet.", "They're having quiet time.", "They are taking time to work on their own laptops."], "image": "train2014/COCO_train2014_000000086556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477459, "question_id": "bBdaBfeVmDcaHMHrxojdzN", "question": "Which letter of the book's title appears to be written in with marker?", "choices": ["b", "t", "o", "n"], "correct_choice_idx": 2, "direct_answers": ["o", "letter o", "letter w", "letter o", "letter", "letter o", "letter t", "check mark", "o", "o"], "difficult_direct_answer": false, "rationales": ["The black letter is written in marker.", "The letter o looks like it was written by hand.", "There looks like a red circle, resembling the letter o."], "image": "train2014/COCO_train2014_000000477459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162023, "question_id": "bBjGpG5WfGWR6JGb9yRkMz", "question": "What are these types of birds called?", "choices": ["chickadee", "crow", "raven", "sparrow"], "correct_choice_idx": 0, "direct_answers": ["chicks", "jay", "sparrow", "finch", "chicks", "chickadee", "chickadees", "pigeons", "chickadees", "robins"], "difficult_direct_answer": false, "rationales": ["The birds are chickadees.", "These are small birds.", "These birds are chickadees."], "image": "train2014/COCO_train2014_000000162023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110090, "question_id": "bBw8uR382YRAzbJdLcQmzG", "question": "What is the cat climbing through?", "choices": ["cupboard", "window", "chimney", "door"], "correct_choice_idx": 1, "direct_answers": ["window", "window", "window", "window", "window", "window", "window", "window", "window", "window"], "difficult_direct_answer": false, "rationales": ["You can tell by the setting and area of the house as to what the cat is climbing on.", "The cat is hanging on to the window sill in order to climb through.", "The cat is outside at the window."], "image": "train2014/COCO_train2014_000000110090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85081, "question_id": "bBwyhqiainNogoigsB5qdY", "question": "What activity is this man taking part in?", "choices": ["hiking", "roller skating", "skiboarding", "skiing"], "correct_choice_idx": 2, "direct_answers": ["snowboarding", "skateboarding", "snowboard", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "skiboarding", "snowboarding", "skiing"], "difficult_direct_answer": false, "rationales": ["He is doing skiboarding.", "He has a snowboard in front of him that he is holding.", "The man is in the snow. the equipment that he is using has two bindings on the same base."], "image": "train2014/COCO_train2014_000000085081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525344, "question_id": "bCjS888uSeyRMczZageqTp", "question": "What is the last letter of the name that appears on the plane?", "choices": ["n", "w", "", "e"], "correct_choice_idx": 2, "direct_answers": ["it's", "", "letter", "letter", "delta", "letter", "letter", "letter", "letter", "letter"], "difficult_direct_answer": false, "rationales": ["N and w do not appear in the name. e is the second letter.", "The letter is a", "The plane is for delta air."], "image": "val2014/COCO_val2014_000000525344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308900, "question_id": "bD4wwNMPmkkr83kSBNzGnV", "question": "In which type setting is this ball park?", "choices": ["suburban", "desert", "urban", "rural"], "correct_choice_idx": 2, "direct_answers": ["baseball field", "urban", "city", "baseball field", "urban", "urban", "city", "urban", "city", "city"], "difficult_direct_answer": false, "rationales": ["There are tall buildings surrounding it.", "There are four tall buildings beyond the outfield, so this must be an urban setting.", "There are tall residential buildings in the background. these dwellings are consistent with answer a."], "image": "val2014/COCO_val2014_000000308900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160501, "question_id": "bD8nF4qWULCDvdXLgkwvbA", "question": "What is the purpose of the dog's jacket?", "choices": ["instrumentation", "floatation", "identification", "nutrition"], "correct_choice_idx": 1, "direct_answers": ["safe", "floatation", "warmth", "noticeability", "warmth", "visibility", "warm", "stay afloat", "drowning prevention", "life jacket"], "difficult_direct_answer": true, "rationales": ["The dog is wearing a life jacket, so he can float on the water.", "They are next to canoes, which shows they are by the water. the color and look of the jacket shows that it is a life jacket worn around water.", "The dog has a life jacket."], "image": "val2014/COCO_val2014_000000160501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130132, "question_id": "bDCJEbKyPyUVZL4VYQ9BdE", "question": "What muscle will the aerobics machine stimulate the most?", "choices": ["arms", "stomach", "glutes", "heart"], "correct_choice_idx": 3, "direct_answers": ["leg", "leg muscles", "heart", "legs", "leg", "legs", "thighs", "leg muscles", "legs", "legs"], "difficult_direct_answer": false, "rationales": ["It will get her legs working.", "The equipment is a cardio machine.", "This is for endurance and not muscle building"], "image": "train2014/COCO_train2014_000000130132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246252, "question_id": "bDKq7ENT9kJxU3MBABpkNf", "question": "What type of sports enthusiasts begin their run at the topmost buildings?", "choices": ["skiers", "ice skaters", "roller bladers", "baseball players"], "correct_choice_idx": 0, "direct_answers": ["skiers", "snowboard", "downhill skiing", "skii jump", "skiers", "professional", "high jumpers", "snowboarder", "snowboarders", "snowboarders"], "difficult_direct_answer": false, "rationales": ["This is at canada olympic park which is used for skiing.", "The skiers begin their runs on top of the giant slides.", "The sport is skiing."], "image": "val2014/COCO_val2014_000000246252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475932, "question_id": "bDn3DALCSRF7S2aWu7ouGs", "question": "What are the men sitting on?", "choices": ["bench", "grass", "boulder", "log"], "correct_choice_idx": 0, "direct_answers": ["bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["The men are on a bench.", "The men are on a bench.", "He is sitting on a bench. the bench is on the grass. i see no log or nor a boulder."], "image": "train2014/COCO_train2014_000000475932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515662, "question_id": "bDnUbdyLDcoKNVorCFj4Fi", "question": "What annual event is the company famous for?", "choices": ["butchering contest", "barbecue contest", "eating contest", "cooking contest"], "correct_choice_idx": 2, "direct_answers": ["eating contest", "eating contest", "eating contest", "contest", "hotdog eating", "carnival", "eating contest", "eating contest", "fish chips", "eating contest"], "difficult_direct_answer": false, "rationales": ["The company listed on the cups is nathan's famous. the food items are hot dogs.", "The brand name is visible and associated with the hot dogs also pictured. there is a commonly known event associated with this company and the visible food.", "Nathan's has a hot dog eating contest."], "image": "train2014/COCO_train2014_000000515662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111788, "question_id": "bE294eCUnzyfbGrEW4ZnQc", "question": "What are the people in blue wearing?", "choices": ["rubber", "scuba suits", "wet suits", "running suits"], "correct_choice_idx": 2, "direct_answers": ["white", "scatting", "wet suits", "wet suit", "wet suits", "wetsuits", "wetsuits", "beach", "wet suits", "wet suits"], "difficult_direct_answer": false, "rationales": ["The people are wearing wet suits because they are about to go in the ocean to go surfing", "Surfers wear wetsuits for multiple reasons, and the people are seen holding surfboards on the beach and wearing suits.", "The people have wetsuits."], "image": "val2014/COCO_val2014_000000111788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440298, "question_id": "bE3ctvG3x7fs2t36NwTdgc", "question": "This city's name comes from a Native American word for what?", "choices": ["wind", "onion", "tomato", "lake"], "correct_choice_idx": 1, "direct_answers": ["stinky onion", "unknown", "chicago", "city", "onion", "onion", "striped skunk", "onion", "peninsula", "shikato"], "difficult_direct_answer": false, "rationales": ["Chicago's name meaning in the algonquin language.", "The word is onion.", "The city is chicago and comes from the algonquin language: \"shikaakwa,\" which means onion."], "image": "train2014/COCO_train2014_000000440298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208805, "question_id": "bE898y95dJjXpgyTivTHdJ", "question": "What is the black device next to the lamp on the end table called?", "choices": ["radio", "telephone", "fan", "gramophone"], "correct_choice_idx": 1, "direct_answers": ["telephone", "rotary phone", "telephone", "telephone", "telephone", "telephone", "rotary phone", "rotary phone", "remote", "phone"], "difficult_direct_answer": false, "rationales": ["The device is a phone.", "The black device is a landline phone.", "The black device is a phone that can be used for calls with the earpiece."], "image": "train2014/COCO_train2014_000000208805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245864, "question_id": "bESrUJGsLM9iMxN2w7dYHP", "question": "What type of area is this?", "choices": ["urban", "rural", "forest", "desert"], "correct_choice_idx": 0, "direct_answers": ["city", "urban", "urban", "city", "city", "city", "city", "urban", "urban", "shopping"], "difficult_direct_answer": false, "rationales": ["There is most of the lights that are seen.", "The area is urban.", "There are tall buildings all around with street lights as well."], "image": "train2014/COCO_train2014_000000245864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504621, "question_id": "bETnffUiaTaYne7WgRZhxy", "question": "What is this lady about to do?", "choices": ["eat", "brush teeth", "sleep", "watch tv"], "correct_choice_idx": 1, "direct_answers": ["brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth"], "difficult_direct_answer": false, "rationales": ["The woman is holding up a toothbrush that is used to clean teeth.", "The woman has a toothbrush in her hand.", "The woman is brushing teeth."], "image": "val2014/COCO_val2014_000000504621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219264, "question_id": "bEi63p8tPDgnkyUSYL7Fwj", "question": "What is the pipe used for in the bottom left corner of the picture?", "choices": ["drainage", "conduit", "water pump", "gas"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "fencing", "drainage", "water", "water", "water", "fencing", "fencing", "fencing"], "difficult_direct_answer": false, "rationales": ["The pipe is for drainage.", "Traditionally farms use wells and have to pump out stool from the farm animals.", "This is a metal cylinder pipe that has an opening perfect for draining."], "image": "train2014/COCO_train2014_000000219264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193108, "question_id": "bEkxnokjbebZwCteFxzSWC", "question": "Which player are they looking at?", "choices": ["outfielder", "catcher", "shortstop", "pitcher"], "correct_choice_idx": 3, "direct_answers": ["ball", "25", "25 player", "opposing team", "battler", "cricket", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["The ball is coming toward the batter. it is being thrown by the pitcher.", "The position of these batters in their cage suggests someone who is about to throw a baseball to them off camera.", "They are looking at the pitcher."], "image": "val2014/COCO_val2014_000000193108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31636, "question_id": "bF7YgvXdNoUZTEZuHtmTqF", "question": "Which bowls location is most likely to have more items dipped inside it?", "choices": ["upper left", "center", "upper right", "none"], "correct_choice_idx": 1, "direct_answers": ["middle", "smaller bowl", "middle", "middle one", "smaller one", "center", "vegetables", "big one", "middle bowl", "center"], "difficult_direct_answer": false, "rationales": ["The bowl is in the center.", "The center has a hummus looking substance in it.", "The middle bowl is closer to the dipping vegetables."], "image": "val2014/COCO_val2014_000000031636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447991, "question_id": "bFFwWvNqYt4AFeSArsQuQU", "question": "Which vehicle might transport the largest group of people?", "choices": ["bicycle", "silver sedan", "orange van", "white car"], "correct_choice_idx": 2, "direct_answers": ["van", "bus", "bus", "vw bus", "van", "van", "bus", "orange van", "van", "van"], "difficult_direct_answer": false, "rationales": ["The vehicle is the orange van.", "The orange van can hold several passengers in different rows.", "An orange van brings people around."], "image": "val2014/COCO_val2014_000000447991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494434, "question_id": "bFTaadgJ8t4i9dFqqTXyDg", "question": "Why is the woman using an umbrella?", "choices": ["prevent heat", "prevent sunburn", "snow", "rain"], "correct_choice_idx": 3, "direct_answers": ["raining", "rain", "raining", "rain", "rain", "avoid rain", "rain", "rain", "it's raining", "rain"], "difficult_direct_answer": false, "rationales": ["The woman doesn't want to get wet.", "Umbrellas are used to keep people either dry or out of the sun. it is not sunny here.", "The rain is out."], "image": "train2014/COCO_train2014_000000494434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545566, "question_id": "bFaiLXi7oNE4KUCj9BkZnP", "question": "What beverage is laying to the right of the skateboard?", "choices": ["soda", "beer", "water", "juice"], "correct_choice_idx": 1, "direct_answers": ["beer", "beer", "tecate", "beer", "tecate beer", "beer", "tecate", "beer", "tecate", "tecate"], "difficult_direct_answer": false, "rationales": ["The can has a tecate logo on its side. this is an alcoholic beverage.", "That's what tecate is.", "Beer is shown by the board."], "image": "train2014/COCO_train2014_000000545566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314693, "question_id": "bGBxVFQXybr6DKUmcjDRqQ", "question": "What are the hexagons near the shoreline made of?", "choices": ["wood", "stone", "metal", "ice"], "correct_choice_idx": 1, "direct_answers": ["paving stones", "stone", "stone", "cement", "stone", "concrete", "stone", "stone", "cement", "concrete"], "difficult_direct_answer": false, "rationales": ["The hexagons are stones.", "The hexagons near the shoreline are made of stone.", "They are placed into the soil for you to walk on easily."], "image": "train2014/COCO_train2014_000000314693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341139, "question_id": "bGJSeorjPpzDNwb32URjx7", "question": "What animal has been prepared for consumption?", "choices": ["cow", "crab", "chicken", "pig"], "correct_choice_idx": 0, "direct_answers": ["cow", "cow", "cow", "cow", "cow", "cow", "cow", "cow", "bovine", "cow"], "difficult_direct_answer": false, "rationales": ["Cow's meat is prepared for consumption here.", "There is ground beef in the hot dog.", "The animal is a cow."], "image": "train2014/COCO_train2014_000000341139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59237, "question_id": "bGRB7do5GoWvGAAbY9MzCJ", "question": "What are the people going to do in the wooden object?", "choices": ["eat", "dance", "make out", "kayak"], "correct_choice_idx": 3, "direct_answers": ["go canoeing", "boat ride", "boat ride", "kayak", "go boating", "boat ride", "canoe", "canoe", "sail away", "row"], "difficult_direct_answer": false, "rationales": ["The people will kayak.", "The people are standing next to a kayak.", "The object is made to float. the people will be paddling in it."], "image": "train2014/COCO_train2014_000000059237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84648, "question_id": "bGWTvRXLiKAdSiThGqASWA", "question": "What is the computer sitting on?", "choices": ["desk", "chair", "bed", "bench"], "correct_choice_idx": 1, "direct_answers": ["chair", "chair", "chair", "chair", "chair", "chair", "chair", "chair", "chair", "chair"], "difficult_direct_answer": false, "rationales": ["The computer is open and on a piece of furniture with four legs designed for a single person to sit on.", "The laptop is resting on a chair.", "The computer is on a chair."], "image": "train2014/COCO_train2014_000000084648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548198, "question_id": "bGXpxscpkBLdJ5phYn9mtD", "question": "What is the closest major city from this outdoor area?", "choices": ["portland", "seattle", "vancouver", "edmonton"], "correct_choice_idx": 2, "direct_answers": ["chicago", "dallas", "houston", "san francisco", "vancouver", "no idea", "michigan", "baltimore", "miami", "san francisco"], "difficult_direct_answer": true, "rationales": ["The city is vancouver.", "This area is in canada.", "Vancouver is closest to this area."], "image": "train2014/COCO_train2014_000000548198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424222, "question_id": "bGdg3Q4hcfc3btaJrPofLR", "question": "What type of sign is on the pole?", "choices": ["brand", "directional", "informational", "price"], "correct_choice_idx": 2, "direct_answers": ["bird crossing", "traffic signal", "duck crossing", "informational", "duck crossing", "ducks crossing", "ducks", "animal crossing", "duck crossing", "duck crossing"], "difficult_direct_answer": false, "rationales": ["The sign is of a mother duck and her chicks; which displays information about what type of animal crosses around this area.", "The sign is informational.", "The sign says ducks are crossing."], "image": "train2014/COCO_train2014_000000424222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214527, "question_id": "bGs4jtYF9xtuXb8rg3DVij", "question": "What number is the little hand on the clock closest to?", "choices": ["nine", "seven", "five", "one"], "correct_choice_idx": 3, "direct_answers": ["twelve", "one", "one", "11", "eleven", "one", "one", "one", "eleven", "one"], "difficult_direct_answer": false, "rationales": ["It is almost one o'clock.", "The little hand is pointing directly to the number one.", "The other numbers are farther down on the clock face."], "image": "val2014/COCO_val2014_000000214527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559315, "question_id": "bGu7sVpibzAArjUexhNgKf", "question": "Where does this scene take place?", "choices": ["morocco", "israel", "america", "greece"], "correct_choice_idx": 1, "direct_answers": ["israel", "bus stop", "bus stop", "israel", "russia", "in bustop", "bus stop", "israel", "bus stop", "israel"], "difficult_direct_answer": false, "rationales": ["This scene takes place in israel. the writing is in hebrew.", "There are jewish words on the bus.", "The scene is in israel."], "image": "val2014/COCO_val2014_000000559315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529590, "question_id": "bHUS9qPNGhMXixCbe4xkeP", "question": "What is a special feature of his glasses?", "choices": ["tinted", "has mirrors", "x-ray", "bifocal"], "correct_choice_idx": 3, "direct_answers": ["bifocal", "bifocals", "bifocals", "bifocal", "bifocals", "spectacle", "colourless", "bifocal", "bifocals", "bi focal"], "difficult_direct_answer": false, "rationales": ["A line can be seen on the lens separating the two different prescriptions so the man can see close up and far away.", "His glasses have lines for different types of vision improvement.", "That is the type of glasses the man is wearing."], "image": "val2014/COCO_val2014_000000529590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16297, "question_id": "bHiuxpyCQat2z2VkCDXhdb", "question": "What is the owner giving his dog?", "choices": ["time out", "food", "hug", "medication"], "correct_choice_idx": 2, "direct_answers": ["hug", "hug", "hug", "hug", "hug", "hug", "hug", "hug", "hug", "hug"], "difficult_direct_answer": false, "rationales": ["He has his hands wrapped around him and this is how people give hugs.", "He has his arm around him", "The owner is spooning the dog."], "image": "train2014/COCO_train2014_000000016297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473003, "question_id": "bHwDZzcyPguacukKUetc2R", "question": "Why is the desk by the window?", "choices": ["cooler", "no room", "warmer", "enjoying view"], "correct_choice_idx": 3, "direct_answers": ["view", "good view", "look out", "view", "light", "enjoying view", "light", "view", "view", "computer"], "difficult_direct_answer": false, "rationales": ["The person working at this desk will be able to see the scenic view out the window behind their computers.", "The person can get a view of the skyline while working.", "To give some respite while working"], "image": "train2014/COCO_train2014_000000473003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152844, "question_id": "bJ8zMahv6yaSLmZrUzZVie", "question": "What number does the team mate of 14 wear?", "choices": ["zero", "21", "eight", "none"], "correct_choice_idx": 1, "direct_answers": ["21", "twenty one", "twenty one", "21", "twenty one", "21", "twenty-one", "twenty one", "21", "21"], "difficult_direct_answer": false, "rationales": ["The number is on the back of his shirt.", "The man with 14 on his back has a black and white uniform. the other man wearing a black and white uniform has a larger number on his back.", "The number is 21."], "image": "train2014/COCO_train2014_000000152844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544731, "question_id": "bJABXn32UJHnTiyrCJJsfc", "question": "What does the sign say the company buys?", "choices": ["watches", "dvd's", "dogs", "gold"], "correct_choice_idx": 3, "direct_answers": ["gold", "gold", "gold", "gold", "gold", "gold", "gold", "gold", "gold", "gold"], "difficult_direct_answer": false, "rationales": ["The store purchases gold from customers as is displayed on the sign in the front.", "The sign is gold.", "The sign indicates this company is buying gold."], "image": "train2014/COCO_train2014_000000544731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448137, "question_id": "bJSbsnQxGTUKAMnZhudaiP", "question": "What is the man doing in the small white building?", "choices": ["painting", "cooking", "sleeping", "gaming"], "correct_choice_idx": 1, "direct_answers": ["cooking food", "selling food", "making food", "working", "cooking", "vending", "cooking", "cooking", "selling food", "selling food"], "difficult_direct_answer": false, "rationales": ["The man is awake. the signs on the side of the building indicate that it is a mac & cheese stand.", "The man is cooking.", "The man is a cook who is preparing hot food at a mobile restaurant booth."], "image": "train2014/COCO_train2014_000000448137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11968, "question_id": "bJVcd5WV2sq4JsHKqK8vpH", "question": "What impairs sight here?", "choices": ["blinders", "singing", "eye doctor", "cars"], "correct_choice_idx": 0, "direct_answers": ["blinders", "eyes", "blinders", "blinders", "blinders", "eye covers", "blinders", "house", "muzzle", "mask"], "difficult_direct_answer": false, "rationales": ["The horse has blinders.", "Horses wear blinders to avoid seeing things on their sides that could startle them.", "We can see the square leather feature on this horse's harness obstructing it's eye partially."], "image": "train2014/COCO_train2014_000000011968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17741, "question_id": "bJgL7uYrwGHMrPm5oWodkk", "question": "What period of the day is it in the photo?", "choices": ["late morning", "night", "evening", "afternoon"], "correct_choice_idx": 0, "direct_answers": ["evening", "late morning", "late afternoon", "afternoon", "late afternoon", "morning", "early morning", "noon", "morning", "dusk"], "difficult_direct_answer": false, "rationales": ["The sun is just rising, and it is probably almost noon.", "There is a clock visible in the background. based on the time shown on the clock and the visible light, answer a is consistent.", "It is daytime with a shadow being cast below."], "image": "val2014/COCO_val2014_000000017741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48141, "question_id": "bJmyf9G27zXtR9pGjs7A6d", "question": "What is the electro locomotive for this train?", "choices": ["e7b", "e7a", "e7g", "e5a"], "correct_choice_idx": 1, "direct_answers": ["e7a", "unsure", "red", "diesel", "old", "engine", "gg1-class electric", "horsepower", "engine", "pennsylvania"], "difficult_direct_answer": true, "rationales": ["The first electromotive used for this train is the e7a.", "That is the type of diesel.", "This is the type of elector locomotive for this train."], "image": "train2014/COCO_train2014_000000048141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143277, "question_id": "bJnnPSW5ESL5Djt3FtmAty", "question": "Why are the cars lined up along the sidewalk?", "choices": ["to park", "to wash", "car show", "to race"], "correct_choice_idx": 0, "direct_answers": ["parking", "parked", "parked", "parked", "parking", "parking", "parking", "parked", "parked", "to park"], "difficult_direct_answer": false, "rationales": ["The cars are parked.", "The cars are parking.", "One type of parking is to park along the curb next to the sidewalk, which is how these cars are positioned, so they are where they are because they're parked."], "image": "train2014/COCO_train2014_000000143277.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184139, "question_id": "bJqHprvyzChiLPR7NxmEHn", "question": "What is located directly on top of the desk and is generating light?", "choices": ["flask", "laptop", "sun", "television"], "correct_choice_idx": 1, "direct_answers": ["laptop", "lamp", "laptop", "lamp", "lamp", "lamp", "laptop", "lamp", "laptop", "lamp"], "difficult_direct_answer": false, "rationales": ["One can see the blue light emanating from the notebook computer.", "The device is technology.", "The computer is giving off a blue glow and it is on the desk."], "image": "val2014/COCO_val2014_000000184139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88538, "question_id": "bJtWQMp5Lyic5g7vcznD7u", "question": "What are the umbrellas being used for?", "choices": ["lighting", "blocking sun", "stopping rain", "decoration"], "correct_choice_idx": 0, "direct_answers": ["light diffusion", "lighting", "lighting", "light reflectors", "light", "lighting", "light", "photography", "lighting", "light"], "difficult_direct_answer": false, "rationales": ["These are professional lights that have umbrellas attached to them to help direct the lighting.", "With recording videos you must dim the lights so it won't be so bright.", "They are diffusing it so there is no glare"], "image": "train2014/COCO_train2014_000000088538.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84341, "question_id": "bJvJact2VpqnhYqCepDN9P", "question": "Where was skateboarding invented?", "choices": ["california", "france", "italy", "utah"], "correct_choice_idx": 0, "direct_answers": ["california", "california", "california", "america", "america", "california", "california", "california", "america", "california"], "difficult_direct_answer": false, "rationales": ["This is in california.", "Skateboarding comes from a place with sunny weather.", "Skateboards are invented in california."], "image": "train2014/COCO_train2014_000000084341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35297, "question_id": "bJzYW8obaYi8DnZa8Qvz78", "question": "What continent do these animals naturally live on?", "choices": ["australia", "europe", "africa", "north america"], "correct_choice_idx": 0, "direct_answers": ["africa", "africa", "africa", "africa", "africa", "africa", "australia", "africa", "africa", "africa"], "difficult_direct_answer": false, "rationales": ["These would be quite common in the savannah as evidenced by the dry, desert-like ground they are on.", "Zebras are grazing in a dry open landscape. zebras are from africa.", "The continent is australia."], "image": "train2014/COCO_train2014_000000035297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309152, "question_id": "bKfZ3YU23s44SQ6tnTeQfp", "question": "What is the gray things holding the carrot?", "choices": ["tongue", "tape", "hands", "cord"], "correct_choice_idx": 0, "direct_answers": ["tongue", "tongue", "tongue", "tongue", "tongue", "giraffe", "tongue", "giraffe", "tongue", "tongue"], "difficult_direct_answer": false, "rationales": ["The gray object is the giraffe's tongue.", "The animal is using its mouth to grab the carrot. the hand is holding it as well, but it's not grey.", "The giraffe is grabbing it to eat"], "image": "train2014/COCO_train2014_000000309152.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242979, "question_id": "bKqwZZR8wmqJgyiCawK2XN", "question": "What business are these vehicles in?", "choices": ["movers", "tourism", "logistics", "gas transportation"], "correct_choice_idx": 2, "direct_answers": ["hauling", "logistics", "oil", "gas transport", "logistics", "shipping", "oil transport", "hauling", "trucking", "shipping"], "difficult_direct_answer": false, "rationales": ["That is the business the trucks run.", "The trucks are for transporting cargo.", "Logistics is printed on the vehicles."], "image": "train2014/COCO_train2014_000000242979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425945, "question_id": "bKsNyFnkt7wbukTEADMPhF", "question": "What word is included in the name that is found on the bottom of the label in red?", "choices": ["loud", "mouth", "plumes", "lime"], "correct_choice_idx": 1, "direct_answers": ["plymouth", "plymouth", "plymouth", "plymouth", "mouth", "mouth", "mouth", "plymouth", "plymouth", "plymouth"], "difficult_direct_answer": false, "rationales": ["The word on the label is \"mouth\".", "The word mouth is in plymouth.", "The whole word is plymouth."], "image": "train2014/COCO_train2014_000000425945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487256, "question_id": "bL3MsuSTaiwKYFPamPmb2n", "question": "Where is this room located?", "choices": ["home", "office", "hospital", "store"], "correct_choice_idx": 0, "direct_answers": ["house", "house", "home", "house bedroom", "house", "house", "bedroom", "house", "house", "house"], "difficult_direct_answer": false, "rationales": ["The bedroom has some personal possessions.", "The room is at home.", "It looks like a sparsely furnished room in a house."], "image": "train2014/COCO_train2014_000000487256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67802, "question_id": "bLDVYsqNNSn2LQDjAtZLSH", "question": "What NHL team does this man like?", "choices": ["oilers", "devils", "penguins", "flyers"], "correct_choice_idx": 0, "direct_answers": ["oilers", "oilers", "oilers", "oilers", "oilers", "oilers", "oilers", "oilers", "oilers", "oilers"], "difficult_direct_answer": false, "rationales": ["That is the team on the hat.", "The man has a hat with the team logo.", "The logo on his hat indicates which team he likes."], "image": "train2014/COCO_train2014_000000067802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30973, "question_id": "bLE4Ld3hWwZT3KUfcYB2Yo", "question": "Where should the heavier person sleep?", "choices": ["another room", "floor", "bottom", "top"], "correct_choice_idx": 2, "direct_answers": ["bed", "bottom", "bottom", "bottom", "bottom bunk", "bottom", "bottom", "bottom", "bottom bunk", "bottom"], "difficult_direct_answer": false, "rationales": ["The lighter person should sleep on the top bunk to keep it safer from falling.", "The heavier person is on the bottom.", "A heavier person would be more likely to cause something to fall than a lighter person. it would be less harmful if a light person fell on a heavy person than the other way around."], "image": "train2014/COCO_train2014_000000030973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417219, "question_id": "bLEMPL3FJAgfquXERgitSH", "question": "Which colored jacket does the person have on who is looking away from the camera?", "choices": ["blue", "red", "black", "yellow"], "correct_choice_idx": 2, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["They are off to the left and back", "The person in black is far away.", "A person standing away from a group in colorful snowsuits is turn with only his back visible."], "image": "train2014/COCO_train2014_000000417219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238223, "question_id": "bLXP2dftfb8XVFBJtztMcV", "question": "By what power does the train here move?", "choices": ["electric", "gas", "horse", "water"], "correct_choice_idx": 0, "direct_answers": ["electric", "electric", "electricity", "momentum", "electric", "electric", "electricity", "electric", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["You can tell by the long rods and power lines as to what powers the train.", "The train has power lines connected to it and is run on electricity.", "There are electric poles on top of the train."], "image": "train2014/COCO_train2014_000000238223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557595, "question_id": "bLrgagoJmZ2hqGnCEXumvp", "question": "What is on the item the person is holding?", "choices": ["egg shells", "tattoos", "jewelry", "wheels"], "correct_choice_idx": 3, "direct_answers": ["skateboard", "wheels", "wheels", "wheels", "wheels", "skateboard", "wheels", "skateboard", "wheels", "skateboard"], "difficult_direct_answer": false, "rationales": ["The man has a skateboard.", "A guy is holding a skateboard in one hand.", "The person is holding a skateboard. it has two trucks."], "image": "val2014/COCO_val2014_000000557595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525510, "question_id": "bLyhaZBoNotWGvrjBtWUjb", "question": "What are the vehicles in the left lane attempting to do?", "choices": ["turn", "reverse", "park", "speed"], "correct_choice_idx": 0, "direct_answers": ["turn left", "turn", "turn left", "turn", "turn", "turn left", "turn", "turn", "turn", "turn"], "difficult_direct_answer": false, "rationales": ["They are getting in the left lane so they can turn.", "The vehicles are trying to turn.", "The vehicles want to turn."], "image": "train2014/COCO_train2014_000000525510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541764, "question_id": "bM2MPByDhozX9tVPVEHS9c", "question": "What is the man pushing the cart doing here?", "choices": ["vacationing", "moving", "selling suitcases", "packing"], "correct_choice_idx": 0, "direct_answers": ["vacationing", "carrying luggage", "transporting luggage", "luggage", "carrying luggage", "carrying suitcases", "delivery", "transporting luggage", "moving luggage", "transporting luggage"], "difficult_direct_answer": false, "rationales": ["He is in vacation clothing and pushing suitcases, which indicate he is traveling in that location.", "The man has luggage with him for vacation.", "The man is going for a trip with his suitcases."], "image": "train2014/COCO_train2014_000000541764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418805, "question_id": "bM3ZRVDnzY8KM4D6GVsLN2", "question": "What is the white kite shaped like?", "choices": ["spade", "diamond", "club", "heart"], "correct_choice_idx": 1, "direct_answers": ["diamond", "polygon", "polygon", "diamond", "diamond", "diamond", "polygon", "diamond", "diamond", "diamond"], "difficult_direct_answer": false, "rationales": ["The kite is shaped like two triangles that are touching each other along their longest side.", "The kite is in the shape of a diamond.", "The kite is a diamond."], "image": "train2014/COCO_train2014_000000418805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373266, "question_id": "bM4BFUyaUZaWWbDEBjjfYD", "question": "What is the old man watching?", "choices": ["movie", "operation", "reality show", "drama"], "correct_choice_idx": 1, "direct_answers": ["operation", "screen", "operation", "preformed surgery", "televised surgery", "tv", "video", "monitor", "surgery", "tv"], "difficult_direct_answer": false, "rationales": ["The old man is watching a procedure.", "The man is in scrubs and is surrounded by medical professionals.", "They are in medical scrubs, and the picture on the screen appears to be someone in scrubs over a patient being operated on."], "image": "val2014/COCO_val2014_000000373266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101626, "question_id": "bMBk45GhZEjRkJPmjQruE7", "question": "Who is the woman below the jet?", "choices": ["pilot", "worker", "security", "visitor"], "correct_choice_idx": 3, "direct_answers": ["pilot", "mom", "child", "pilot", "passenger", "mechanic", "mechanic", "visitor", "pilot", "mechanic"], "difficult_direct_answer": false, "rationales": ["There is a visitor under the jet.", "A woman is visiting to take pictures of this large jet.", "The woman is just a visitor."], "image": "train2014/COCO_train2014_000000101626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51470, "question_id": "bMFJce9XgU63pjmyfj2s2c", "question": "What is the person trying to shield their laptop from?", "choices": ["sun", "people", "wind", "water"], "correct_choice_idx": 0, "direct_answers": ["sun", "sun", "sun", "sun", "sun exposure", "sun", "sunlight", "sun", "sunlight", "light"], "difficult_direct_answer": false, "rationales": ["It is hard to see laptop screens when the sun is too bright.", "They are trying to keep the light out so they can see the screen.", "The hoodie provides shade from the bright sun."], "image": "train2014/COCO_train2014_000000051470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315249, "question_id": "bMQwKnUn2MmmJpe26iyW2v", "question": "What color is the person wearing who caught the wave best?", "choices": ["green", "red", "orange", "purple"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "purple", "blue", "blue", "blue", "black", "black", "blue", "brown"], "difficult_direct_answer": false, "rationales": ["You can tell that the person who caught the best wave because he is surfing and still on his board.", "The person wearing the purple outfit is riding the wave and benefiting from the kinetic forces of the water, while others are not.", "The man on the wave has a purple shirt."], "image": "val2014/COCO_val2014_000000315249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410266, "question_id": "bMRHEm9iNqND7zo6ywKbfL", "question": "What action is he about to take?", "choices": ["dunk", "swing", "toss", "dribble"], "correct_choice_idx": 1, "direct_answers": ["swing", "hit ball", "hit tennisball", "return ball", "hit ball", "hit", "hit ball", "hit ball", "hitting ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The boy is holding a racket and the ball is going toward him.", "The boy is playing tennis and with his hand like this and where the visible ball is, in order to achieve the objectives of tennis, he would be preparing to swing.", "He's going to swing."], "image": "train2014/COCO_train2014_000000410266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90573, "question_id": "bMcMj39etz5J8soEfyndte", "question": "Which shirt color does player wants to take over control of the soccer ball from the person near it wear?", "choices": ["green", "yellow", "black", "blue"], "correct_choice_idx": 1, "direct_answers": ["yellow", "blue", "blue", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The person in yellow wants the ball.", "The opponent is wearing a yellow jersey and does not have the ball.", "The color is yellow."], "image": "train2014/COCO_train2014_000000090573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101390, "question_id": "bMpoCapRUTGS6PfMVuBpMC", "question": "What type of store is this?", "choices": ["beauty", "shoe", "pet", "food"], "correct_choice_idx": 3, "direct_answers": ["produce", "fruit store", "produce", "fruit stand", "fruits", "fruit store", "food market", "fruit stand", "food", "fruit"], "difficult_direct_answer": false, "rationales": ["Produce is for sale.", "All types of edibles, such as bananas and oranges, can be seen on display. thus, this is a food store.", "The store sells food."], "image": "train2014/COCO_train2014_000000101390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319275, "question_id": "bN6hcgynMC28NjnCvCFcR2", "question": "What is the weather like in this location?", "choices": ["moderate", "below freezing", "mild", "temperate"], "correct_choice_idx": 1, "direct_answers": ["cold snowy", "snowy", "below freezing", "cold", "cold", "snowing", "snowy", "snowing", "snowy", "snowy"], "difficult_direct_answer": false, "rationales": ["The weather is super snowy.", "There is snow on the ground so it has to be pretty cold or it would melt.", "It's cold and snow is on the ground."], "image": "val2014/COCO_val2014_000000319275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556406, "question_id": "bNZpF3GJpY2zaruh9zKmHt", "question": "What are the zebras doing?", "choices": ["grazing", "drinking", "mating", "searching"], "correct_choice_idx": 0, "direct_answers": ["grazing", "eating", "grazing", "grazing", "eating", "eating", "eating", "grazing", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The zebras are snacking.", "The zebras are walking around and eating.", "There is food on the ground. the zebras are eating it."], "image": "train2014/COCO_train2014_000000556406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510358, "question_id": "bNaLZAxmCK3rcVuDt3DGD6", "question": "What are the occupation of the people featured in the advertisement?", "choices": ["doctor", "lawyer", "teacher", "scientist"], "correct_choice_idx": 0, "direct_answers": ["doctors", "doctor", "doctors", "doctor", "doctors", "doctor", "doctors", "doctors", "doctors", "doctors"], "difficult_direct_answer": false, "rationales": ["The people are wearing white medical jackets and the add is for healthcare and the mayo clinic.", "The person is a doctor.", "The white coat's of the people pictured in this ad; and the word's healthcare, clinic and physicians present in this ad, allow us to conclude it concerns healthcare."], "image": "val2014/COCO_val2014_000000510358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488303, "question_id": "bNe3KeWNCwrxfoSwwBw5Em", "question": "Where are the spoons compared to the stove?", "choices": ["right", "up", "down", "left"], "correct_choice_idx": 3, "direct_answers": ["on side", "next to", "left", "beside stove", "left", "left", "next", "left", "left", "alongside"], "difficult_direct_answer": false, "rationales": ["The objects are wooden and in a white cup which is above the drawers.", "The spoons are in the crock and the crock is next to the stove.", "The spoons are on the left."], "image": "train2014/COCO_train2014_000000488303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395865, "question_id": "bNjdAVLnuz7Cn8dwrwApah", "question": "How many feet away from the red item should one park?", "choices": ["30", "18", "15", "25"], "correct_choice_idx": 2, "direct_answers": ["nine", "six feet", "15", "nine", "ten", "15", "fifteen", "6 feet", "fifteen feet", "ten"], "difficult_direct_answer": false, "rationales": ["They're 15 feet.", "The item is a fire hydrant. by law you must park 15 feet away.", "Traditionally you have to park fifteen feet away from the hydrant so the fireman can use it."], "image": "val2014/COCO_val2014_000000395865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250383, "question_id": "bNzx67FZF5D2TB3deCFHbB", "question": "Where is the lady sitting in?", "choices": ["outdoor area", "food court", "restaurant", "dining room"], "correct_choice_idx": 1, "direct_answers": ["food court", "food court", "food court", "mall", "food court", "food court", "food court", "food court", "mall", "food court"], "difficult_direct_answer": false, "rationales": ["She is eating in a food court.", "The setup and the people eating in the background gives it away.", "The lady is eating pizza in a mall."], "image": "val2014/COCO_val2014_000000250383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364911, "question_id": "bPL7vj4D35Eyn6FwVHQ4M4", "question": "Why is the dessert different colors?", "choices": ["going bad", "multiple layers", "food coloring", "got wet"], "correct_choice_idx": 1, "direct_answers": ["different elements", "multiple layers", "different bases", "layered pastry", "different ingredients", "different flavors", "different flavours", "multiple flavors", "layers", "different flavoring"], "difficult_direct_answer": true, "rationales": ["There are different things on the cake such as the flour part, the other flavored flour part, and the flaky crust put together.", "The dessert has layers.", "The dessert is made of three different layers of items."], "image": "val2014/COCO_val2014_000000364911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198139, "question_id": "bPQSPGe2yLZV3LexvCsXLq", "question": "What type of parking is required at this meter?", "choices": ["diagonal", "longitudinal", "backing in", "parallel"], "correct_choice_idx": 3, "direct_answers": ["metered parking", "paid", "paid parking", "paid", "paid", "toll parking", "parallel", "parallel", "money", "street"], "difficult_direct_answer": false, "rationales": ["The car is parked right next to a wall.", "You need to be able to parallel park in between cars to park here.", "The parking is parallel."], "image": "val2014/COCO_val2014_000000198139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200212, "question_id": "bPdG6qkgqnQARAGjaTSbiG", "question": "These buses are moved by what fuel?", "choices": ["electricity", "coal", "solar", "gas"], "correct_choice_idx": 0, "direct_answers": ["gasoline", "diesel", "electricity", "electricity", "diesel", "gasoline", "diesel", "diesel", "diesel", "electricity"], "difficult_direct_answer": false, "rationales": ["Overhead wires normally carry electricity throughout the city. this includes some buses which used them as fuel.", "The buses are visibly attached to wires above. this mode of transportation gets its power from answer a.", "You can tell by the long rods on top of the buses as to what fuels them."], "image": "val2014/COCO_val2014_000000200212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579862, "question_id": "bQK77YvAWazdCNThB6Kosh", "question": "What type of vehicle are the helmeted people riding?", "choices": ["unicycle", "moped", "bike", "scooter"], "correct_choice_idx": 2, "direct_answers": ["bike", "bikes", "bike", "bikes", "bikes", "bikes", "bike", "bicycle", "bike", "bikes"], "difficult_direct_answer": false, "rationales": ["There are several people with helmets located in the foreground of this picture. each are riding a bicycle.", "They are riding bicycles.", "The people are riding bicycles."], "image": "train2014/COCO_train2014_000000579862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405848, "question_id": "bQbJ7nPKWJyTV3aqtdRXtE", "question": "What is this motorcycle likely part of?", "choices": ["military exercise", "fashion show", "parade", "motorcycle gang"], "correct_choice_idx": 2, "direct_answers": ["parade", "parade", "motorcade", "parade", "motorcade", "parade motorcade", "police department", "police brigade", "parade", "parade"], "difficult_direct_answer": false, "rationales": ["The motorcycle is most likely taking part of a parade with a crowd of observers.", "The rally is a parade.", "There are people standing near barricades watching."], "image": "train2014/COCO_train2014_000000405848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401004, "question_id": "bQoonWYPX66igJ63Jp5C4L", "question": "Who would likely serve you here?", "choices": ["bartender", "chauffeur", "policeman", "maid"], "correct_choice_idx": 0, "direct_answers": ["bartender", "bartender", "bartender", "bartender", "bartender", "bartender", "bar lass", "barista", "bartender", "bartender"], "difficult_direct_answer": false, "rationales": ["Beer would be served at the bar.", "The bartender would serve.", "Drinks are served by bartenders in bars. an establishment with a large counter that people can sit at on stools is a bar."], "image": "val2014/COCO_val2014_000000401004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134339, "question_id": "bQuDmP2TowBuJ3CMscvTZQ", "question": "This event is most likely from what historical period?", "choices": ["ming dynasty", "renaissance", "roman republic", "great depression"], "correct_choice_idx": 3, "direct_answers": ["1800s", "pre wwi", "great depression", "1930s", "gold rush", "great depression", "1800s", "olden days", "nineteenth century", "civil war"], "difficult_direct_answer": false, "rationales": ["The photo is from the great depression as seen from how the people are dressed.", "Most people were killed in this period.", "The event is the depression."], "image": "train2014/COCO_train2014_000000134339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162996, "question_id": "bR6BR8ewiTJKJMQ5NmDNaL", "question": "Why is the man wearing an orange vest?", "choices": ["costume", "fashion", "camo", "visibility"], "correct_choice_idx": 3, "direct_answers": ["controller", "be seen", "visibility safety", "visibility", "safety", "bus worker", "visibility", "visibility", "safety", "visibility"], "difficult_direct_answer": false, "rationales": ["The man needs visibility.", "He wants to stand out and be able to be detected in traffic areas.", "He is a worker and needs to be seem so a vehicle doesn't hit him when he's working."], "image": "train2014/COCO_train2014_000000162996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487167, "question_id": "bRCJBu8Rm8HAF3tWJ67Rg4", "question": "What is this constellation often called in English?", "choices": ["orion", "big dipper", "pisces", "virgo"], "correct_choice_idx": 1, "direct_answers": ["little dipper", "big dipper", "big dipper", "big dipper", "big dipper", "big dipper", "big dipper", "little dipper", "big dipper", "big dipper"], "difficult_direct_answer": false, "rationales": ["The constellation looks like a big dipper.", "The constellation is the dipper.", "The bus has a picture of a constellation that has a large diamond and a tail, which is the big dipper."], "image": "train2014/COCO_train2014_000000487167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540330, "question_id": "bRjd7whcKM8tULMzcV8AfY", "question": "What does the tall thin thing next to the train do at night?", "choices": ["play music", "release water", "direct airplanes", "light up"], "correct_choice_idx": 3, "direct_answers": ["light", "arrival bell", "provide light", "light up", "light up", "illuminates platform", "close", "illuminate", "light", "light up"], "difficult_direct_answer": false, "rationales": ["It lights up.", "It is at a train station, and it gives needed light to people at the station.", "The train is being lit up."], "image": "train2014/COCO_train2014_000000540330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371847, "question_id": "bS37giHyoDTYSuuamGSEPV", "question": "The side dish visible here is seen to contain what?", "choices": ["leaves", "corn", "beets", "okra"], "correct_choice_idx": 0, "direct_answers": ["vegetables", "lettuce", "green leaves", "salad", "lettuce", "lettuce", "cheese", "pizza", "leaves", "vegetables"], "difficult_direct_answer": false, "rationales": ["Aside from the pizza there is a serving of salad visible in the back right of this image.", "The side dish is a salad and the salad looks to have lettuce leaves.", "It is a salad. salad is made from lettuce."], "image": "train2014/COCO_train2014_000000371847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340184, "question_id": "bSQpjYh7wGonjht7xzQf4t", "question": "What is the yellow item?", "choices": ["flower", "banana", "bean", "lemon"], "correct_choice_idx": 0, "direct_answers": ["flowers", "flowers", "flower", "flowers", "flower", "flowers", "flowers", "flower", "flower", "flowers"], "difficult_direct_answer": false, "rationales": ["The yellow item is in a vase of flowers.", "A flower is in there.", "The most prominent yellow object in the image is clear and identifiable based on its unique features, size and shape."], "image": "train2014/COCO_train2014_000000340184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539298, "question_id": "bSUg3QRBy7WhtUsqG9QEof", "question": "Which place was this?", "choices": ["brazil", "china", "paris", "hong kong"], "correct_choice_idx": 3, "direct_answers": ["clocktower", "unsure city", "tower", "clock", "clock tower", "clock tower", "london", "clock tower", "hong kong", "california"], "difficult_direct_answer": false, "rationales": ["The clock tower is a landmark, located on the southern shore of trim sha tsui.", "The clocktower is a landmark in hong kong.", "This is in hong kong."], "image": "val2014/COCO_val2014_000000539298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250724, "question_id": "bSf2VZRbACpiWaJZzKdEaP", "question": "What is causing the image on the television screen?", "choices": ["reflection", "video player", "gaming console", "broadcast tv"], "correct_choice_idx": 0, "direct_answers": ["reflection", "reflection", "reflection", "lighting", "reflection", "reflection", "lighting", "reflection", "reflection", "reflection"], "difficult_direct_answer": false, "rationales": ["The screen is made of glass and reflects.", "Since the light is on in the room, an off tv will act as a mirror and you'll be able to see yourself on it.", "The lighting is causing the glass to show an image."], "image": "train2014/COCO_train2014_000000250724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480489, "question_id": "bTYwHLeuNM9dtdyWFcGSt8", "question": "What is the most popular kite shape?", "choices": ["snoopy", "diamond/delta/box", "dragon", "cat"], "correct_choice_idx": 1, "direct_answers": ["rhombus", "triangular", "diamond", "diamond", "diamond", "diamond/delta/box", "triangular", "diamond", "legs", "v shape"], "difficult_direct_answer": false, "rationales": ["The most popular is the diamond shaped kite.", "It is the most popular shape here.", "The diamond is the most popular shape of a kite."], "image": "train2014/COCO_train2014_000000480489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549366, "question_id": "bTmzR8EjDozqUXuqJvMoNY", "question": "What are the zebras doing?", "choices": ["grazing", "mating", "sleeping", "fighting"], "correct_choice_idx": 0, "direct_answers": ["grazing", "grazing", "eating", "eating", "grazing", "grazing", "grazing", "grazing", "grazing", "grazing"], "difficult_direct_answer": false, "rationales": ["The zebras are bent with their muzzles to the grass which is consistent with how they eat and with answer a.", "They have their heads down to the grass", "The zebras are eating the grass."], "image": "train2014/COCO_train2014_000000549366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564915, "question_id": "bU9ZYXjfabVkeTQ7EQgXnB", "question": "Which individual pieces of candy can be seen on the cake?", "choices": ["rockets", "mms", "smarties", "skittles"], "correct_choice_idx": 1, "direct_answers": ["mms", "mm", "mms", "mandy's", "mom's", "mom's", "mms", "candy", "mms", "mom's"], "difficult_direct_answer": false, "rationales": ["The little white letter ms can be distinguished on the candy, which is a popular marker for the brand.", "The candy has the \"m\" initial on it.", "M&ms are visible."], "image": "train2014/COCO_train2014_000000564915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326359, "question_id": "bULrLMGobkDo3PrtE4KDJD", "question": "The kitchen adheres to the electrical standards set in which region?", "choices": ["australia", "north america", "europe", "south america"], "correct_choice_idx": 2, "direct_answers": ["north america", "europe", "united states", "united kingdom", "europe", "us", "usa", "europe", "european", "city"], "difficult_direct_answer": false, "rationales": ["The kitchen sets standards to europe's.", "The plugins look like they are european.", "You can tell by how the appliances are merged together that this is a european home."], "image": "train2014/COCO_train2014_000000326359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51831, "question_id": "bUSqubLZQtKWQxMtBDdtdx", "question": "What are the glowing lights in the image?", "choices": ["lamps", "glow sticks", "eyes", "string lights"], "correct_choice_idx": 2, "direct_answers": ["eyes", "eyes", "sheep eyes", "sheep eyes", "eyes", "eyes", "sheep's eyes", "eyes", "eyes", "sheep eyes"], "difficult_direct_answer": false, "rationales": ["The lights are eyes.", "Sometimes when the camera has the light option on when taking a picture, it will make the light reflect off that part of the body towards the viewer.", "The eyes of all the sheep are reflecting light and look like they are glowing in the picture."], "image": "train2014/COCO_train2014_000000051831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513796, "question_id": "bUaY5mPBsAXgqVqhQa2Mi8", "question": "What the work of the ight on top of the vehicles?", "choices": ["beauty", "decoration", "signal", "code"], "correct_choice_idx": 2, "direct_answers": ["alert", "signal", "siren", "radio", "meter", "weighting", "police lights", "emergency", "alert", "police"], "difficult_direct_answer": true, "rationales": ["The lights are to grab attention easily.", "The lights are used to signal when they pull people over or if they are parked somewhere that they need to signal people to slow down or caution.", "It's a signal."], "image": "train2014/COCO_train2014_000000513796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426370, "question_id": "bUiypM3XMfNLJyPwxbAzFv", "question": "What are the wheels of the skateboard touching?", "choices": ["sand", "salt", "grass", "road"], "correct_choice_idx": 3, "direct_answers": ["ground", "ground", "pavement", "road", "ground", "road", "street", "ground", "ground", "pavement"], "difficult_direct_answer": false, "rationales": ["The wheels are on the pavement.", "They are touching the street.", "The pavement is noticeable underneath the wheels on the skateboard. also, a skateboard would be difficult or impossible to ride on grass, sand, or salt."], "image": "train2014/COCO_train2014_000000426370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362469, "question_id": "bV3LPEscMNpziD2zAkpZED", "question": "What keeps control of the black and white airborne item?", "choices": ["parrot", "string", "boat", "bird"], "correct_choice_idx": 1, "direct_answers": ["string", "wind", "person", "thread", "sea", "look", "string", "strings", "string", "wind"], "difficult_direct_answer": false, "rationales": ["Strings can keep control of kites.", "Kites have pieces of fabric attaching the kite to a hand-held device controlled by the person flying the kite.", "The string controls it."], "image": "val2014/COCO_val2014_000000362469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86202, "question_id": "bVBSrXa5hA5nZNetTGxhCn", "question": "What direction do you want to travel generally to enjoy this activity?", "choices": ["sideways", "downward", "upwards", "towards water"], "correct_choice_idx": 1, "direct_answers": ["down", "downhill", "northward", "down", "downward", "downhill", "downward", "downhill", "downhill", "north"], "difficult_direct_answer": false, "rationales": ["In order to get speed and enjoy skiing you need to go downhill.", "He is wearing skis", "Skiers pick up speed by travelling down slopes; going up slopes in skis requires more physical exertion and is likely less enjoyable for most."], "image": "val2014/COCO_val2014_000000086202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99536, "question_id": "bVdM2cSsRisHbbhu973j2R", "question": "What is the relationship of the man to the woman?", "choices": ["son", "stranger", "driver", "passenger"], "correct_choice_idx": 3, "direct_answers": ["customer", "passenger", "driver passenger", "passenger", "strangers", "passenger", "passenger", "friends", "wedding", "husband"], "difficult_direct_answer": false, "rationales": ["The woman is conducting the carriage while the man sits in the backseat which makes the man a passenger and the woman the driver.", "The man is in the back.", "The relationship is a passenger."], "image": "train2014/COCO_train2014_000000099536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373395, "question_id": "bVt2tcre6rqrR58y5wuo6R", "question": "Why is there a train here?", "choices": ["abandoned", "is stuck", "is broken", "is station"], "correct_choice_idx": 0, "direct_answers": ["left there", "broken", "abandoned", "abandoned cars", "abandoned", "display", "repair", "lived in", "abandoned", "broken down"], "difficult_direct_answer": false, "rationales": ["There is an abandoned train sitting at the depot.", "A rundown train is parked in a grassy area. incomplete tracks are laid out in front of a train.", "The train is abandoned."], "image": "train2014/COCO_train2014_000000373395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428895, "question_id": "bW2WFBzpwhR3asACKyHXdj", "question": "What color is the end of the man's keychain?", "choices": ["yellow", "blue", "pink", "red"], "correct_choice_idx": 1, "direct_answers": ["blue", "blue", "turquoise", "blue", "blue", "blue", "blue", "blue", "green", "blue"], "difficult_direct_answer": false, "rationales": ["You can see the mans keys hanging off of his beltloop, included with the keys is a chain that connects individual keys to an object so they are easily found. in this photo the mans key chain is a bright blue color.", "The man has a blue keychain.", "It's blue."], "image": "train2014/COCO_train2014_000000428895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251868, "question_id": "bW5ruBFZbAWBFkGi8jjDzM", "question": "What is the relationship of the man to the woman?", "choices": ["teacher", "son", "father", "lover"], "correct_choice_idx": 3, "direct_answers": ["couple", "couple", "husband", "dating", "boyfriend", "lover", "husband", "husband", "husband", "sweethearts"], "difficult_direct_answer": false, "rationales": ["The man and woman are similar age and are sitting very close to one another like they are partners.", "They are sitting thigh to thigh and he has his arm around her shoulder.", "They are a couple and sitting close."], "image": "train2014/COCO_train2014_000000251868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21079, "question_id": "bWEhbasaFksWrSvoiECPK5", "question": "Why does the person with the green shirt have no food?", "choices": ["is through", "is confused", "is dieting", "is sharing"], "correct_choice_idx": 3, "direct_answers": ["allergic", "not hungry", "is sharing", "broke", "waiting", "allergic", "on diet", "none delivered", "already ate", "full"], "difficult_direct_answer": true, "rationales": ["They are all eating together.", "The person shares.", "This person looks like they would like some of the food. it is common for a person to refrain from eating in circumstances where they would normally eat if they are doing this."], "image": "train2014/COCO_train2014_000000021079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397325, "question_id": "bWNpSpKLtSjtL8ZnfEPtA7", "question": "What other animal could help here?", "choices": ["snakes", "bees", "cats", "dogs"], "correct_choice_idx": 3, "direct_answers": ["sheepdog", "dog", "horse", "dogs", "sheep dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The sheep dogs could help", "There is a small herd of sheep visible. answer a is known to be used in association with domestic herd animals.", "Dogs are used for herding livestock like this."], "image": "train2014/COCO_train2014_000000397325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136736, "question_id": "bWVsui5HkWyWVPuZcvchpt", "question": "Why is the door of the train 8343 open?", "choices": ["accepting passengers", "vandalism", "it's broken", "airing out"], "correct_choice_idx": 0, "direct_answers": ["passengers boarding", "awaiting passengers", "boarding passengers", "exit", "stopped", "unloading", "accepting passengers", "awaiting passengers", "exiting passengers", "boarding"], "difficult_direct_answer": true, "rationales": ["The train is letting passengers on.", "The door is taking passengers.", "The door of this train is open to accept passengers in from the side of the road."], "image": "train2014/COCO_train2014_000000136736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281293, "question_id": "bWbwCovAumRAgmqmGZtQzW", "question": "Where is the man trying to hit the ball?", "choices": ["over net", "behind him", "right", "left"], "correct_choice_idx": 0, "direct_answers": ["court", "tennis", "in court", "over net", "across", "tennis", "tennis court", "tennis court", "opposite", "over net"], "difficult_direct_answer": false, "rationales": ["In order to avoid losing the point, he must return the ball to the other side of the court.", "The man is trying to hit the ball over the net.", "The goal of tennis is to get the ball to go over the net."], "image": "train2014/COCO_train2014_000000281293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122252, "question_id": "bWh7FUfXRWQvqbZmnU2aun", "question": "Who is the road for?", "choices": ["trucks", "drivers", "pedestrians", "bicycles"], "correct_choice_idx": 1, "direct_answers": ["vehicles", "cars", "debris", "drivers", "everyone", "vehicles", "drivers", "public", "vehicles", "car crash"], "difficult_direct_answer": false, "rationales": ["The road is for people in vehicles that they can drive.", "The road is for drivers.", "People that drive would use the roadway."], "image": "val2014/COCO_val2014_000000122252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459255, "question_id": "bWwJ32CS9JtVaZZuhcQnHM", "question": "What is the brand symbol in racket?", "choices": ["nike", "puma", "adidas", "reebok"], "correct_choice_idx": 0, "direct_answers": ["nike", "head", "head ti", "head", "oval", "nike", "nike", "nike", "head", "head"], "difficult_direct_answer": false, "rationales": ["There is a nike logo on the front of the racket.", "There is a swoosh symbol, not a mountain lion, three stripes, or union jack, on the racket.", "The racquet has the swoosh logo."], "image": "train2014/COCO_train2014_000000459255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375521, "question_id": "bWyhMMLfmb8LFjtzxMMxuA", "question": "Where is the board the man will use located?", "choices": ["behind him", "no where", "in car", "under him"], "correct_choice_idx": 0, "direct_answers": ["hands", "in front", "between people", "womans hand", "behind him", "snow", "in hand", "his hand", "behind him", "slopes"], "difficult_direct_answer": true, "rationales": ["It's laying there while he adjusts the other one", "The snowboard is behind him.", "He is assisting the woman with her snowboard. in order for him to ride as well he will have to use the one on the ground."], "image": "val2014/COCO_val2014_000000375521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6339, "question_id": "bX4MZqXQVokCCWjkNMu4di", "question": "What does the horse and rider compete in here?", "choices": ["rodeo", "movie tryout", "dressage", "horse race"], "correct_choice_idx": 2, "direct_answers": ["stadium jumping", "jumping", "horse riding", "horse show", "dressage", "dressage", "show", "racing horses", "equestrian", "dressage"], "difficult_direct_answer": false, "rationales": ["The man is riding on a horse. they are in a stadium. the horse's steps are high.", "This is showing harmony and trust between horse and rider,.", "The horse is trying to compete in that event."], "image": "train2014/COCO_train2014_000000006339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439273, "question_id": "bXP5nVy6kqEBWLh8D4T6GC", "question": "Where are they located?", "choices": ["house", "church", "hospital", "classroom"], "correct_choice_idx": 0, "direct_answers": ["indoors", "house", "by door", "hallway", "home", "house", "home", "by porch", "at home", "house"], "difficult_direct_answer": false, "rationales": ["There is a small hanging light reflected behind them and a porch outside.", "The boys are in front of a residential deck.", "The background indicates they are in a home with a deck."], "image": "train2014/COCO_train2014_000000439273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435389, "question_id": "bXWAkn9nFLwQ5ffACukB4A", "question": "What is the purpose of the colorful bag?", "choices": ["decoration", "holding gifts", "storing goods", "hiding secret"], "correct_choice_idx": 1, "direct_answers": ["gift", "gifting", "hold gift", "gift bag", "gift holder", "gift", "hold gift", "carry gift", "holding gifts", "gift"], "difficult_direct_answer": false, "rationales": ["The colorful bag is designed to hold gifts.", "The bag is on a shelf near a card and is brightly colored with handles like a gift bag often is.", "Bags are used for many things but in this instance it is used for gifts."], "image": "val2014/COCO_val2014_000000435389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493751, "question_id": "bXWcTGi2Hoa2nKDPHWX6z4", "question": "How many of the people are kneeling near the floor?", "choices": ["four", "five", "six", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["Three people are in a wooded area and one is bent down lower than the others.", "Only one person is kneeling.", "Only one person is crouching."], "image": "val2014/COCO_val2014_000000493751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330082, "question_id": "bY6suNGKSHyHBon9oV6HbU", "question": "What airline is advertised on the train?", "choices": ["southwest", "united", "delta", "american"], "correct_choice_idx": 0, "direct_answers": ["southwest", "southwest", "southwest", "southwest", "south west", "southwest", "southwest", "southwest", "south west", "southwest"], "difficult_direct_answer": false, "rationales": ["You can see the words next to the train door.", "Southwest is advertised on the side of the train.", "Southwest's colors are shown."], "image": "train2014/COCO_train2014_000000330082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297488, "question_id": "bYPAkQrqWxsfadhX6oad6Q", "question": "What animals are present in the image?", "choices": ["elephant", "giraffe", "tiger", "bear"], "correct_choice_idx": 0, "direct_answers": ["elephants", "elephants", "elephant", "elephants", "elephants", "elephant", "elephants", "elephants", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The trunk is a sure sign that these are elephants.", "Elephants are seen.", "The size, colour, and distinguishable trunk cannot be seen on any other animal."], "image": "train2014/COCO_train2014_000000297488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10138, "question_id": "bYRHP2SYymcT8WCpvPHZ4z", "question": "When does this take place?", "choices": ["someone's birthday", "chanukah", "christmas", "easter"], "correct_choice_idx": 0, "direct_answers": ["birthday", "morning", "birthday", "today", "nighttime", "birthday", "birthday", "birthday", "birthday", "someone's birthday"], "difficult_direct_answer": false, "rationales": ["By the candles on the cake and the position of the person, you can tell what event is happening.", "It's someone's birthday.", "In front of the man on the table there is a cake with lit candles on it. a cake with candles is used to celebrate one's birthday and not associated with other holidays."], "image": "val2014/COCO_val2014_000000010138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243896, "question_id": "bYnioFqjLjhffYCVPKcPuA", "question": "What should beginners do when approaching this area?", "choices": ["go down", "turn back", "speed up", "outrun patrol"], "correct_choice_idx": 1, "direct_answers": ["stop", "stop", "scatting", "turn back", "stop", "stop", "stop", "stop", "stop", "stop"], "difficult_direct_answer": false, "rationales": ["The trail is marked at the top with a sign that says only intermediate and advanced should proceed. if one is a beginner they would not be intermediate or advanced as required for this trail and should not go further.", "It's only for intermediate and advanced skiers.", "Beginners should leave the area."], "image": "val2014/COCO_val2014_000000243896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380138, "question_id": "bYoHuUGSt74f6CYuZ2pNsK", "question": "What are the green veggies in the bowl called?", "choices": ["celery", "green beans", "asparagus", "peas"], "correct_choice_idx": 0, "direct_answers": ["celery", "peppers", "broccoli", "celery", "kiwi", "celery", "celery", "celery", "celery", "celery"], "difficult_direct_answer": false, "rationales": ["There are chopped up c shaped vegetables.", "That is celery on the plate.", "All the options are green but the shape and size tell us that these veggies are."], "image": "train2014/COCO_train2014_000000380138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70134, "question_id": "bYsXQkJUuQGkaWtQxsNu8P", "question": "What animal is in the team logo?", "choices": ["bear", "lizard", "wolverine", "bird"], "correct_choice_idx": 3, "direct_answers": ["cardinal", "cardinal", "cardinal", "bird", "parrot", "bird", "bird", "bird", "cardinal", "cardinal"], "difficult_direct_answer": false, "rationales": ["That's what a cardinal is.", "You can tell that the animals on the uniform are cardinals.", "The team is named cardinals, which is a well known red bird."], "image": "val2014/COCO_val2014_000000070134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61602, "question_id": "bZCvthSuXAEMUq9tt3gvMD", "question": "What does this door lead to?", "choices": ["dining area", "walkin cooler", "exit", "bathroom"], "correct_choice_idx": 1, "direct_answers": ["refrigerator", "commercial fridge", "refrigerator", "refrigerator", "fridge", "food storage", "industrial freezer", "walkin cooler", "freezer area", "refrigerator"], "difficult_direct_answer": false, "rationales": ["The door goes to a giant refrigerator.", "The door is metal and the area is sealed. there is produce and food in the room.", "This stores food to keep it cool"], "image": "val2014/COCO_val2014_000000061602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51258, "question_id": "bZQjTv9Ea3LHRQuQ2wfVDf", "question": "What did the man do with the bat?", "choices": ["sell it", "catch it", "throw it", "block it"], "correct_choice_idx": 2, "direct_answers": ["slipped out", "threw", "threw it", "throw it", "hit", "toss bat", "throw it", "drop it", "throw", "hit ball"], "difficult_direct_answer": true, "rationales": ["He lost his grip on it", "He threw it to run.", "The batter swung the bat and it appears to have come out of his hands and went out in front of him. the bat can be seen flying through the air."], "image": "val2014/COCO_val2014_000000051258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401435, "question_id": "bZo8MGuqTLW7EFgr5jhcHf", "question": "Why is he leaning over?", "choices": ["is falling", "uncontrolled", "prevent falling", "is afraid"], "correct_choice_idx": 2, "direct_answers": ["balance", "turning skateboard", "for balance", "balance", "turning", "prevent falling", "balance", "balancing", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["The person prevents falling.", "The man doesn't want to fall over.", "He is trying to balance to avoid falling to the road."], "image": "train2014/COCO_train2014_000000401435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463633, "question_id": "baBn9t8XcmUJ87ASsofitE", "question": "Where does the baseball team mentioned hail from?", "choices": ["new york", "los angeles", "boston", "denver"], "correct_choice_idx": 2, "direct_answers": ["boston", "boston", "boston", "boston", "boston", "boston", "redsot", "boston", "redsot", "boston"], "difficult_direct_answer": false, "rationales": ["The red sox are from boston.", "The team is in boston.", "The red sox come from boston."], "image": "val2014/COCO_val2014_000000463633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51501, "question_id": "baCD5GmToR5LquVdTT2MeK", "question": "Why is woman using an umbrella?", "choices": ["decoration", "sun protection", "exercise", "rain"], "correct_choice_idx": 1, "direct_answers": ["shade", "shade", "sun protection", "shade", "sun protection", "shade", "sunny", "hot sun", "sun protection", "for shade"], "difficult_direct_answer": false, "rationales": ["Umbrellas are usually used for either rain or sun protection. there is no visible rain, but there is sun so it is likely that is what they are trying to protect themselves from.", "It is a bright clear day", "There are shadows of the tree and some light on the object showing that she's trying to cover herself as to not get too hot."], "image": "train2014/COCO_train2014_000000051501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399510, "question_id": "baQ4dYcNRhZk2RJv8hD8RJ", "question": "The greenish aspect of this meal comes from what?", "choices": ["spice", "mustard", "ketchup", "relish"], "correct_choice_idx": 3, "direct_answers": ["relish", "pickle relish", "pickles", "pickles", "relish", "cucumbers", "relish", "relish", "relish", "relish"], "difficult_direct_answer": false, "rationales": ["The greenish part of the hot dog is the relish.", "The green stuff is relish.", "The green colored item on the hot dog is pickle relish."], "image": "val2014/COCO_val2014_000000399510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53589, "question_id": "baRu3puvg8iCirvfX2UxhU", "question": "What equipment/ item does the person seen here like to look at while relaxing in bed?", "choices": ["phone", "camera", "none", "printed book"], "correct_choice_idx": 3, "direct_answers": ["printed book", "book", "book", "book", "novel", "book", "book", "book", "novel", "book"], "difficult_direct_answer": false, "rationales": ["The person is reading. the book is in their hands.", "The person is reading in bed, and they are doing it the non-electronic old-fashioned way.", "A woman is holding a book while laying in bed."], "image": "train2014/COCO_train2014_000000053589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546642, "question_id": "baUmRGEHtJcoxShHaCTws9", "question": "Who is in the greatest danger?", "choices": ["woman", "boy", "man", "dog"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The dog is not secured in any way, and is not wearing any protective equipment in the case of an accident.", "There is a dog visible standing in the basket on the back of a motorcycle with no other security measures. it would not take much for the dog to be dislodged and injured.", "The animal in the back could easily jump off and get hurt."], "image": "val2014/COCO_val2014_000000546642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209989, "question_id": "baV5ShP5NSEWVVpcktF92Q", "question": "How would they get to the top of this hill?", "choices": ["trolley", "skateboard", "ski lift", "bicycle"], "correct_choice_idx": 0, "direct_answers": ["trolley", "ski lift", "ski lift", "skihilll mount", "ski lift", "ski lift", "lift", "skii lift", "ski lift", "ski lift"], "difficult_direct_answer": false, "rationales": ["They take the trolley.", "Trolleys help people get up.", "One way to get to the top of this hill is to use the monorail trolley."], "image": "val2014/COCO_val2014_000000209989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488539, "question_id": "bafFfhPpzQJhdRxSEnXVMo", "question": "What company makes the item on the left?", "choices": ["samsonite", "moen", "green giant", "burger king"], "correct_choice_idx": 0, "direct_answers": ["samsonite", "luggage company", "samsonite", "luggage", "cloth", "samsonite", "samsonite", "samsonite", "samsonite", "samsonite"], "difficult_direct_answer": false, "rationales": ["The thing on the left is a bag and this is the only brand that makes those.", "The company is samsonite.", "Samsonite is a luggage company."], "image": "val2014/COCO_val2014_000000488539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156324, "question_id": "bagqAYsaFP9oqr6dMwuyLN", "question": "What type of shower head is in the background?", "choices": ["removable", "wall mount", "rainfall", "underwater"], "correct_choice_idx": 1, "direct_answers": ["silver color", "wall mount", "mounted", "overhead", "tall", "wall mounted", "silver shower", "round", "metal", "overhead"], "difficult_direct_answer": true, "rationales": ["The shower head is mounted to the wall.", "The shower head is on the wall.", "There is a wall mount for the shower heads in the background."], "image": "val2014/COCO_val2014_000000156324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574703, "question_id": "baye5Y5DUUfyZhtVkFg52P", "question": "Who is on the boat?", "choices": ["escapees", "convicts", "tourists", "swimmers"], "correct_choice_idx": 2, "direct_answers": ["passengers", "tourists", "passengers", "people", "riders", "passengers", "tourists", "yankees", "people", "tourists"], "difficult_direct_answer": false, "rationales": ["This is a tour boat. the people on the boat are tourists.", "This is a boat that is used for leisure, and for these purposes, it is most likely utilized by tourists.", "A large commercial passenger boat is in a river."], "image": "val2014/COCO_val2014_000000574703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467131, "question_id": "bb27G3SdXqdjexEvLipT3p", "question": "What animals are in the photo?", "choices": ["bear", "cheetah", "giraffe", "jaguar"], "correct_choice_idx": 2, "direct_answers": ["giraffe", "giraffes", "giraffes", "giraffe", "giraffes", "giraffe", "giraffe", "giraffes", "giraffe", "giraffes"], "difficult_direct_answer": false, "rationales": ["There is a spotted yellow animal with a long neck.", "The animals are giraffes.", "A baby and adult giraffe are seen with another nearby."], "image": "train2014/COCO_train2014_000000467131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446036, "question_id": "bb6dj4soZh3zwcPA3yzahX", "question": "The meat shown here was grown in what medium?", "choices": ["water", "underground", "air", "field"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "greenhouse", "fish", "small", "sea", "dirt", "water", "sea", "water"], "difficult_direct_answer": false, "rationales": ["It's grown in water.", "The food is fish sticks.", "The meat is fish."], "image": "train2014/COCO_train2014_000000446036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60140, "question_id": "bbCsjD8HgZjoQujHEqhabf", "question": "What drink brand is seen on the chairs?", "choices": ["coca cola", "pepsi", "sprite", "canada dry"], "correct_choice_idx": 0, "direct_answers": ["coca cola", "coke cola", "coca cola", "coca-cola", "coca-cola", "coca cola", "coca-cola", "coke", "coca cola", "coco-cola"], "difficult_direct_answer": false, "rationales": ["It is red and white with the logo", "Their logo is on the chairs.", "The brand is coke."], "image": "train2014/COCO_train2014_000000060140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55594, "question_id": "bbGPbm55uhztqNeL9cYZ3E", "question": "From what do most of the items sold here come from?", "choices": ["plants", "people", "factory", "animals"], "correct_choice_idx": 0, "direct_answers": ["farms", "plants", "jungle", "plants", "tropical regions", "farmers", "farm", "ground", "plants", "farms"], "difficult_direct_answer": false, "rationales": ["These are fruits and vegetables.", "There are many fruits here where people are buying at an open market.", "Most come from plants."], "image": "val2014/COCO_val2014_000000055594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458673, "question_id": "bbHz8wensKFN7izkp9hSMg", "question": "Why are the pillars green?", "choices": ["paint", "moss", "mold", "rust"], "correct_choice_idx": 1, "direct_answers": ["moss", "moss", "mold", "moss", "moss", "mold", "moss", "moss", "moss", "moss"], "difficult_direct_answer": false, "rationales": ["The pillars are green because they are in a dark humid place and moss is growing on the surfaces.", "There is vegetation on the pillars.", "There is visible plant matter growing on the pillars, which are broken and pitted, and not part of an actively-maintained structure. the growth of moss on abandoned buildings is common."], "image": "train2014/COCO_train2014_000000458673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68130, "question_id": "bbUVVpvKBTcPkTn8aWE6Md", "question": "What are the people doing in the street on the right?", "choices": ["crossing", "protesting", "repairing", "dancing"], "correct_choice_idx": 0, "direct_answers": ["crossing", "walking", "crossing street", "walking", "walking", "walking", "crossing street", "walking", "crossing street", "walking"], "difficult_direct_answer": false, "rationales": ["They are going from one side of the road to the other by walking.", "They are trying to get across.", "They are on the crosswalk in the middle of the road"], "image": "train2014/COCO_train2014_000000068130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170784, "question_id": "bc5TBRN8UAJiDHabQSoHyn", "question": "What is this bus for?", "choices": ["display", "advertisement", "storage", "transportation"], "correct_choice_idx": 0, "direct_answers": ["transport", "parties", "tours", "rides", "tourists", "bus passanger", "passengers", "transportation", "transporting people", "display"], "difficult_direct_answer": true, "rationales": ["It's parked on the sidewalk for display.", "The bus is showing off ads for santander.", "There are other answers possible, but the most common purpose for a bus is known to be answer b."], "image": "val2014/COCO_val2014_000000170784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12372, "question_id": "bc8zt6AZSAiCCPDCX2rGdN", "question": "What does the umbrella keep out here?", "choices": ["flies", "rain", "mosquitos", "sun"], "correct_choice_idx": 3, "direct_answers": ["sun", "patio umbrella", "sun", "sun", "patio umbrella", "sun", "sun", "patio umbrella", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["The umbrella keeps the sun off of someone's head.", "The umbrella blocks sun.", "The umbrella is on a patio. there is probably a lot of sun there."], "image": "train2014/COCO_train2014_000000012372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184275, "question_id": "bcEzswamoVobLkDz9QZ5Zy", "question": "What kind of shoes does the catcher have on?", "choices": ["asics", "k swiss", "nike", "allbirds"], "correct_choice_idx": 2, "direct_answers": ["cleats", "nike", "cleats", "cleats", "cleats", "spiked", "nike", "cleats", "cleats", "cleats"], "difficult_direct_answer": false, "rationales": ["The shoes have a swoosh.", "On the catcher's shoes we see the checkmark shaped swoop of the nike logo.", "The catcher's shoes feature the swoosh."], "image": "train2014/COCO_train2014_000000184275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552723, "question_id": "bcGYfNN4FHdNRvjh4GwSQr", "question": "Where was this food placed during cooking?", "choices": ["counter top", "deep fryer", "microwave", "oven"], "correct_choice_idx": 3, "direct_answers": ["oven", "oven", "oven", "oven", "oven", "oven", "oven", "oven", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["Ovens are commonly used for cooking pizza.", "Pies are on a table.", "Pizzas are baked."], "image": "train2014/COCO_train2014_000000552723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499611, "question_id": "bcJaKy5SJuXQNa97LUuZ35", "question": "Why does the boy have his arm out?", "choices": ["signal", "reach", "balance", "gesture"], "correct_choice_idx": 2, "direct_answers": ["balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["The boy is balancing.", "They are just learning the sport of skiing and arms can help achieve balance when practicing.", "Putting your arms out moves mass away from the body and creates a more stable stance. skis require one to remain upright and this boy is using his arm to do this."], "image": "train2014/COCO_train2014_000000499611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296544, "question_id": "bcLxocdyasCXWZLXQYRReR", "question": "What main dish is served here?", "choices": ["flower soup", "pizza", "flower cake", "meat stew"], "correct_choice_idx": 1, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["There are flat types of this kind of food surrounding the cake which means the main course would be the flat food.", "The dish is pizza.", "There are several of the round pizzas on the table."], "image": "val2014/COCO_val2014_000000296544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580613, "question_id": "bcQEAeJnqbakNSxbvmpmri", "question": "What does the word out refer to in her case?", "choices": ["energy", "personality", "location", "sexual orientation"], "correct_choice_idx": 3, "direct_answers": ["sexual orientation", "outward", "driving", "outside", "equal", "gay pride", "going", "gay", "sexual orientation", "lesbian"], "difficult_direct_answer": true, "rationales": ["It's the energy.", "Being out means she's a lesbian.", "The decal has a rainbow pattern. there are pride flags in the background."], "image": "val2014/COCO_val2014_000000580613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506151, "question_id": "bcg6ZierpQBQVmtcxocueS", "question": "What is illuminating the cat and the table?", "choices": ["led light", "halogen light", "fluorescent light", "sunlight"], "correct_choice_idx": 3, "direct_answers": ["sunlight", "sunlight", "sitting", "lamp", "microwave", "outside light", "daylight", "microwave", "sun", "sunlight"], "difficult_direct_answer": false, "rationales": ["The light is not coming from one place, rather it is light in general.", "Light is coming in from a window.", "The lamp light isn't on, and the way the shadows are up against the wall suggest natural lighting coming from outside a window."], "image": "val2014/COCO_val2014_000000506151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486123, "question_id": "bchTcDeXbq2wjGDVj6Jtqy", "question": "What street is marked by the traffic light?", "choices": ["google", "apple", "facebook", "alphabet"], "correct_choice_idx": 0, "direct_answers": ["google", "google", "google", "google", "google", "google", "google", "google", "google", "google"], "difficult_direct_answer": false, "rationales": ["There is a street sign on the light pole above the street.", "There is a street sign hanging over the road. street signs indicate what the name of the street is.", "Google's street is shown."], "image": "train2014/COCO_train2014_000000486123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557118, "question_id": "bcvNLBtaVFs6P37aeTXaAa", "question": "What action is he taking?", "choices": ["stop", "ascend", "descend", "retreat"], "correct_choice_idx": 2, "direct_answers": ["skiing downhill", "moving forward", "skiing", "skiing", "skiing", "skiing", "skiing", "ski", "descend", "turning"], "difficult_direct_answer": false, "rationales": ["The man is going down the hill.", "The man is looking down and his skis are pointing down.", "He's heading downhill"], "image": "train2014/COCO_train2014_000000557118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419718, "question_id": "bdJn2EuhgWospGui8KWHY6", "question": "In which city is this car driving?", "choices": ["san antonio", "nella", "san francisco", "little rock"], "correct_choice_idx": 2, "direct_answers": ["venice", "san francisco", "venice", "san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "venice", "san francisco"], "difficult_direct_answer": false, "rationales": ["This is a twisting street well known in this city.", "This is usa's twistiest road and is located in san francisco, california.", "It's driving in san francisco."], "image": "train2014/COCO_train2014_000000419718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343200, "question_id": "bdMMSPKTmBuNJytZADvtUV", "question": "What is the profession of this man?", "choices": ["doctor", "athlete", "fireman", "waiter"], "correct_choice_idx": 1, "direct_answers": ["tennis player", "coach", "coach", "tennis player", "tennis player", "tennis player", "tennis player", "tennis player", "athlete", "tennis player"], "difficult_direct_answer": false, "rationales": ["You can tell by the court and equipment he is using as to what his profession is.", "The man is hitting a tennis ball with a racket. he is standing in a tennis court wearing shorts. people who play sports would be dressed like this and found in this type of environment.", "A man is swinging a tennis racket on a court. athletes play sports."], "image": "train2014/COCO_train2014_000000343200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462037, "question_id": "bdPLUM9eScxCi7QrAtNutY", "question": "What do the girls steering wheels control?", "choices": ["car", "barbie house", "video game", "toy truck"], "correct_choice_idx": 2, "direct_answers": ["video game", "car", "cars", "cars", "video game", "video game", "video game", "game", "game", "video game"], "difficult_direct_answer": false, "rationales": ["The visible tv screen is showing a recognizable video game. video games are played on tvs and with a connected controller and as they are playing a racing game, the steering wheels likely control the game seen.", "They are playing video games.", "You can tell by what's on the screen a to what the girls are doing."], "image": "train2014/COCO_train2014_000000462037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387338, "question_id": "bdTUF9raNZEg6Pkx627An5", "question": "What kind of violence is it?", "choices": ["psychological", "physical", "sexual", "emotional"], "correct_choice_idx": 1, "direct_answers": ["extreme", "severe", "domestic", "domestic", "physical violence", "physical", "bullying", "physical", "physical", "domestic"], "difficult_direct_answer": false, "rationales": ["He is hurting the person", "A person is using force to injure another person.", "The man is being overpowered and threatened with dismemberment, which is defines as physical violence."], "image": "train2014/COCO_train2014_000000387338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184929, "question_id": "bdTsqd8RQD2nWkxGKEvCG6", "question": "What does the hair cap prevent?", "choices": ["flies", "dandruff", "stray hairs", "distraction"], "correct_choice_idx": 2, "direct_answers": ["hair fall", "infestation", "contamination", "hairs", "hair falling", "hair falling", "hair falling", "germs", "stray hairs", "contamination"], "difficult_direct_answer": false, "rationales": ["The hair cap prevents stray hairs from landing on the fruits.", "A woman wears a hair cap while handling produce. people wear hair nets in the food industry for cleanliness.", "Hair from getting on the food"], "image": "val2014/COCO_val2014_000000184929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189936, "question_id": "bdWH9GLX3iV8CChENfjDB2", "question": "How do those in the audience here feel about the speaker?", "choices": ["bored", "negative", "positive", "angry"], "correct_choice_idx": 2, "direct_answers": ["admiration", "admiration", "happy", "devoted", "happy", "support him", "like", "happy", "good", "positive"], "difficult_direct_answer": false, "rationales": ["Those in the audience have a positive opinion of the speaker.", "They indicate this by smiling and clapping.,", "They are all smiling and seem to be happy to be there watching."], "image": "val2014/COCO_val2014_000000189936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495553, "question_id": "bdfnPTXb8gdkYshWAnkBgS", "question": "What is the yellow stuff made from?", "choices": ["mincemeat", "pepper", "bananas", "milk"], "correct_choice_idx": 3, "direct_answers": ["milk", "milk", "milk", "milk", "milk", "mustard", "cheese", "milk", "aged milk", "milk"], "difficult_direct_answer": false, "rationales": ["Cheese is made of milk.", "The stuff is cheese.", "Cheese comes from milk."], "image": "train2014/COCO_train2014_000000495553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421876, "question_id": "bdiK45YNHcTsMgJweqgLbi", "question": "How many girls are playing the game?", "choices": ["five", "two", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["boxing", "two", "2 women", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two people with long hair and clothing are standing in front of the tv playing video games.", "There are two women playing.", "There are two people playing the games."], "image": "train2014/COCO_train2014_000000421876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571584, "question_id": "bdy2PJWWCehEJoqs4uCMqv", "question": "What do people do inside the building with the spire on it?", "choices": ["play chess", "worship", "party", "eat out"], "correct_choice_idx": 1, "direct_answers": ["pray", "worship", "ring bell", "pray", "pray", "worship", "pray", "worship", "pray", "church"], "difficult_direct_answer": false, "rationales": ["The building with a spire is a church. people pray inside it.", "The people worship.", "The people are going to church."], "image": "val2014/COCO_val2014_000000571584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57361, "question_id": "be6zcdcBKRnVd3mMTRYPBf", "question": "The store of what US company is found in this street?", "choices": ["nike", "coach", "apple", "macdonald"], "correct_choice_idx": 2, "direct_answers": ["mcdonalds", "apple", "apple", "mcdonalds", "apple", "apple", "apple", "mac", "iowa", "apple"], "difficult_direct_answer": false, "rationales": ["A gray apple is on a sign in front of a building in a busy city.", "There is one english word out of all the japanese writings on the sign to the right referring to the fast food joint.", "Their logo is on the sign."], "image": "train2014/COCO_train2014_000000057361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128441, "question_id": "beHLiQr3hysVivZYKYN8V2", "question": "How many dimensions in traffic light?", "choices": ["five", "four", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["three", "two", "one", "three dimensions", "three", "3 dimensions", "three", "three", "two", "three"], "difficult_direct_answer": false, "rationales": ["There is red for stop, yellow for caution, and green for go.", "It has a length, depth and width.", "There are three in the light."], "image": "val2014/COCO_val2014_000000128441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295630, "question_id": "beK2BFi6V6YJDYzKTDMhqZ", "question": "Where is the woman sitting in?", "choices": ["bedroom", "office", "hotel room", "library"], "correct_choice_idx": 2, "direct_answers": ["bed", "bed", "bed", "chair", "hotel room", "chair", "motel room", "motel room", "hotel room", "hotel room"], "difficult_direct_answer": false, "rationales": ["The woman is in a room that has a mirror and tv, which indicates she's in a hotel.", "The woman is in a hotel room since it has a tv.", "The woman is in a hotel."], "image": "train2014/COCO_train2014_000000295630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10591, "question_id": "beLr3qGMSgv5tBRgTY27db", "question": "Who is the young man near the tennis player?", "choices": ["towel boy", "umpire", "coach", "his nephew"], "correct_choice_idx": 0, "direct_answers": ["son", "bellboy", "his son", "towel boy", "ball boy", "player", "ball boy", "player", "towel boy", "ball boy"], "difficult_direct_answer": false, "rationales": ["He is used to hand a towel to the player to wipe his sweet.", "He's waiting for the player to dry off before he gets the fabric back", "The boy to the right of the athlete appears poised to take away the athlete's towel once he is done with it."], "image": "val2014/COCO_val2014_000000010591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80053, "question_id": "bew5Txy5sEqvWYAaZAaHsi", "question": "What is the object to the left of the laptop?", "choices": ["mouse", "phone", "router", "speaker"], "correct_choice_idx": 0, "direct_answers": ["mouse", "mouse", "mouse", "mouse", "mouse", "compute mouse", "compute mouse", "mouse", "mouse", "mouse"], "difficult_direct_answer": false, "rationales": ["There is a corded mouse by the computer.", "The object is a mouse.", "There is a mouse near the laptop."], "image": "val2014/COCO_val2014_000000080053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272396, "question_id": "bfmAvNhcvLBytrJUAgNWYT", "question": "Which animal can you see in the above picture?", "choices": ["lizard", "dog", "none", "cat"], "correct_choice_idx": 1, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["There is a dog on the beach.", "A dog is in the sand.", "A dog has four legs like the one in the photo."], "image": "train2014/COCO_train2014_000000272396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413996, "question_id": "bg4djVJ3okHZzjtTtHyQPp", "question": "The place where these ships are docked is known as?", "choices": ["harbor", "quay", "port", "wharf"], "correct_choice_idx": 2, "direct_answers": ["ship yard", "shipping port", "pier", "port", "barge", "pier", "port", "harbor", "pier", "dock"], "difficult_direct_answer": false, "rationales": ["There are several shipping vessels near the shore.", "Ports are near the shore. boat park right on the edge at a port.", "The ships are at port."], "image": "train2014/COCO_train2014_000000413996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95841, "question_id": "bgCb8zEDeNUUxfEupja8uB", "question": "Why is the person in the red shirt holding their hand out?", "choices": ["break fall", "catching ball", "hitting batter", "grabbing batter"], "correct_choice_idx": 1, "direct_answers": ["catch ball", "catcher", "catching ball", "to catch", "catch ball", "signaling pitcher", "mitt", "catching ball", "catch ball", "catch baseball"], "difficult_direct_answer": false, "rationales": ["This person's title is catcher, when the batter strikes out or does not hit the ball it is the catchers job to receive the ball.", "The person is catching.", "This is the catcher and they will get the ball if the batter does not hit it."], "image": "val2014/COCO_val2014_000000095841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537367, "question_id": "bgMCkhRT3WrwddQEmXtov6", "question": "What sport it is?", "choices": ["badminton", "cricket", "table tennis", "soccer"], "correct_choice_idx": 0, "direct_answers": ["tennis", "tennis", "tennis", "tennis", "badminton", "tennis", "tennis", "tennis", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["Badminton involves racquets and birdies.", "Badminton racquets are shown.", "Looks like badminton."], "image": "train2014/COCO_train2014_000000537367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185158, "question_id": "bgPPQLzzFc6p56b2g8hJRg", "question": "What kind of location is this?", "choices": ["outdoor", "residential", "historic", "commercial"], "correct_choice_idx": 1, "direct_answers": ["living room", "living room", "home", "house", "apartment", "home", "residential", "apartment", "living room", "home"], "difficult_direct_answer": false, "rationales": ["There is a living room and kitchen in the same area", "Houses are decorated with couches, chairs, etc. residential areas have things like magnets on fridges and personal items where as commercial areas do not.", "There is a living room right next to a kitchen"], "image": "train2014/COCO_train2014_000000185158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127702, "question_id": "bgdb2f5ZTAgbKGvEC5Mgor", "question": "What might this person be photographing?", "choices": ["snow", "birds", "sun", "snowman"], "correct_choice_idx": 1, "direct_answers": ["bird", "bird", "birds", "birds", "birds", "birds", "bird", "bird", "birds", "birds"], "difficult_direct_answer": false, "rationales": ["Looking up and taking a picture of birds flying.", "This person is pointing a camera in the trees looking for birds.", "Because they are pointing the camera up towards the trees where birds are."], "image": "val2014/COCO_val2014_000000127702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328462, "question_id": "bgkcfCss9FycvWWQ5gmypq", "question": "What is the size of TV?", "choices": ["26inches", "22inches", "40inches", "32inches"], "correct_choice_idx": 1, "direct_answers": ["small", "thirteen inch", "small", "22inches", "13 inch", "small", "small", "13' inch", "small", "small"], "difficult_direct_answer": false, "rationales": ["It's a smaller 22 inches.", "The television is a smaller size. it has to be smaller than 32 inches.", "It's hard to tell the exact size but it is definitely smaller than 32 and on the smaller side."], "image": "val2014/COCO_val2014_000000328462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164683, "question_id": "bgpWAnUQKNLWKHWc8FcztQ", "question": "What causes separation from the land mass in the background and the location of the elephants?", "choices": ["wall", "clothes", "fence", "planters"], "correct_choice_idx": 0, "direct_answers": ["wall", "wall", "fence", "walls", "giant wall", "long fence", "fence", "wall", "wall", "wall"], "difficult_direct_answer": false, "rationales": ["A barrier surrounds the elephants and tourists in this scene from the background hills.", "The separation is the wall.", "The wall keeps the elephants from the background."], "image": "train2014/COCO_train2014_000000164683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493435, "question_id": "bgzA55cujeCzHwXHZB7cGb", "question": "Which object would be most useful if there was a rainstorm?", "choices": ["on seat", "upside down", "wooden item", "greenery"], "correct_choice_idx": 1, "direct_answers": ["umbrella", "umbrella", "upside down", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["Umbrellas protect from the rain.", "There is an opened umbrella laying behind the bench.", "The object is for rain."], "image": "val2014/COCO_val2014_000000493435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310348, "question_id": "bhBC6M85icdn9sR53zt3fF", "question": "What drug will be ingested momentarily?", "choices": ["alcohol", "marijuana", "cocaine", "pcp"], "correct_choice_idx": 0, "direct_answers": ["wine", "alcohol", "alcohol", "champagne", "alcohol", "alcohol", "alcohol", "alcohol", "liquor", "alcohol"], "difficult_direct_answer": false, "rationales": ["The glasses depicted generally hold this kind of beverage. none of the other drugs listed are usually consumed by drinking.", "The glasses contain champagne.", "The drug is alcohol."], "image": "train2014/COCO_train2014_000000310348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326113, "question_id": "bhetvWpnkJUZp28qCaHgsR", "question": "Whose chair is the woman seated on at the beach?", "choices": ["security", "hers", "stranger", "lifeguard"], "correct_choice_idx": 3, "direct_answers": ["life guard", "lifeguard", "life guard", "lifeguard", "life guards", "lifeguard", "life guard", "lifeguard", "life guards", "life guard"], "difficult_direct_answer": false, "rationales": ["The chair is for the person who is protecting swimmers at the beach.", "This is the lifeguards seat.", "The woman is sitting on a lifeguard post."], "image": "train2014/COCO_train2014_000000326113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343878, "question_id": "bhv2D9CmSiwxyHePtftQ8m", "question": "Of conveyances seen here how many does the ones with the most wheels have?", "choices": ["none", "two", "three", "four"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["Some of the bikes have three wheels.", "There are three conveyances.", "The row bikes have one wheel in the front and two wheels in the back."], "image": "train2014/COCO_train2014_000000343878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248767, "question_id": "bhxBdb74ZhsFD4xcUWSkTB", "question": "What zone is shown in the photo?", "choices": ["shopping", "residential", "business", "tourist"], "correct_choice_idx": 3, "direct_answers": ["traffic", "tourist", "urban", "intersection", "pedestrian", "urban", "hotel", "traffic area", "construction", "crossing"], "difficult_direct_answer": true, "rationales": ["There are a lot of tourists in the area.", "The zone is for tourists.", "The zone is for tourists."], "image": "val2014/COCO_val2014_000000248767.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437776, "question_id": "bi3NVxjbJuzyZy9YC8MVAF", "question": "What type of surface does the blue vehicle run on?", "choices": ["air current", "road", "water", "rail"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "water", "boat", "water", "gas", "water", "water"], "difficult_direct_answer": false, "rationales": ["It is a boat", "It runs on water.", "The blue vehicle is a boat. it cannot fly or travel on land."], "image": "train2014/COCO_train2014_000000437776.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434488, "question_id": "bi9tUXZZEvq6Bo9GMbo2Q9", "question": "What unit of measurement is the tower used for?", "choices": ["volume", "time", "temperature", "height"], "correct_choice_idx": 1, "direct_answers": ["time", "measuring time", "time", "time", "time", "time", "time", "tell time", "clock time", "time"], "difficult_direct_answer": false, "rationales": ["The tower has a clock.", "The unit is time.", "The tower has two large clocks on the top indicating the hour of the day."], "image": "val2014/COCO_val2014_000000434488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462065, "question_id": "biATyS8cKcfoKKQXYfo8Cs", "question": "In which type space does this man spin his frisbee?", "choices": ["beach", "jail", "urban", "park"], "correct_choice_idx": 3, "direct_answers": ["park", "city park", "grassy", "park", "park", "park", "park space", "outdoors", "park", "park"], "difficult_direct_answer": false, "rationales": ["The space is a park.", "By setting of the background, it is easy to tell where he is.", "Most likely the man is in a park. a basketball court is right behind him, and grassy areas abound, making this a great place to run and catch frisbees."], "image": "train2014/COCO_train2014_000000462065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173948, "question_id": "biM28syeMy4vh4WiHUC67Q", "question": "What is this person practicing?", "choices": ["stargazing", "bird watching", "eclipse spotting", "serving"], "correct_choice_idx": 3, "direct_answers": ["tennis", "tennis", "tennis", "tennis", "tennis serve", "serving", "serving", "tennis stance", "tennis serve", "serving"], "difficult_direct_answer": false, "rationales": ["The person is holding a tennis racket and appears to have thrown a ball above their head and is about to make an overhead swing based on their body positioning. these aspects are all consistent with answer a.", "The person is holding a tennis racquet and is looking at a tennis ball, not stars, an eclipse, or birds.", "The person serves."], "image": "train2014/COCO_train2014_000000173948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440885, "question_id": "biZxcTQ88bHtkdfu46A9Xp", "question": "What country is the scene in?", "choices": ["australia", "france", "united states", "united kingdom"], "correct_choice_idx": 3, "direct_answers": ["uk", "england", "europe", "england", "great britain", "england", "uk", "uk", "united kingdom", "england"], "difficult_direct_answer": false, "rationales": ["A double decker bus is on the street. double decker buses are used in the uk.", "The vehicles are driving on the left side of the road. the green bus has an url from this country on its side.", "There are double-decker buses."], "image": "train2014/COCO_train2014_000000440885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313696, "question_id": "bigk36dos7QiF3QFyBdKMY", "question": "This bus is transporting passengers in which geographic region?", "choices": ["asia", "australia", "north america", "europe"], "correct_choice_idx": 0, "direct_answers": ["asia", "usa", "asia", "china", "cooling geography", "asia", "asia", "germany", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The bus is in asia.", "There is a man on the bus that is reading a newspaper with headlines using characters instead of the alphabet. the assumption would then be that the bus holds asian passengers.", "You can see the writing on the newspaper the man is reading."], "image": "train2014/COCO_train2014_000000313696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6051, "question_id": "bivEf2DupmiSigjP7ShcLF", "question": "Where are these people located?", "choices": ["public transportation", "work", "home", "airport"], "correct_choice_idx": 0, "direct_answers": ["on bus", "bus", "bus", "bus", "public transportation", "public transport", "bus", "subway", "on chairs", "bus"], "difficult_direct_answer": false, "rationales": ["The seating and the yellow baseboard suggest they are on a public bus.", "The yellow lines and seating are the giveaway. it's surprising that the dog is allowed so close to the other passenger.", "The people are sitting in bus seats."], "image": "train2014/COCO_train2014_000000006051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224166, "question_id": "bj9pKjtzokZpt5TPrcPjhr", "question": "While being kissed what does the woman do?", "choices": ["smokes", "protests", "sleeps", "complains"], "correct_choice_idx": 0, "direct_answers": ["smoke", "smile", "smoke cigarette", "smoke", "hold cigarette", "smoke", "smoke", "smokes", "smoke", "smoke"], "difficult_direct_answer": false, "rationales": ["They're smoking.", "She has a cigarette in her hand.", "The woman has a cigarette."], "image": "train2014/COCO_train2014_000000224166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358296, "question_id": "bjcZVfdHHi7QJ37mLLhtWz", "question": "Who is the name of the biblical character whose husband is referenced on the ski lift?", "choices": ["mary", "monica", "sarah", "eve"], "correct_choice_idx": 3, "direct_answers": ["eve", "eve", "eve", "adam", "eve", "eve", "eve", "eve", "eve", "adam"], "difficult_direct_answer": false, "rationales": ["Adam and eve are the couple referenced.", "The ski lift refers to adam, not abraham, joseph, or patricius.", "That's who adam was with."], "image": "val2014/COCO_val2014_000000358296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143576, "question_id": "bjg9oiAi9eaQJFiSz9m9YX", "question": "What can be found underneath the pot being stirred?", "choices": ["flame", "cheese", "ice", "animals"], "correct_choice_idx": 0, "direct_answers": ["flames", "stove", "stove", "flame", "stovetop", "stove", "coils", "flame", "stove", "cart"], "difficult_direct_answer": false, "rationales": ["The food in the pot would not get cooked unless it was over a heat source, and this appears to be a gas powered stove.", "The flame is seen in the kitchen.", "A flame is heating the pot."], "image": "val2014/COCO_val2014_000000143576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50355, "question_id": "bjnUfeDBDvyjc7cszDnXQm", "question": "What does the red sign with a minus symbol on it usually mean?", "choices": ["free parking", "no parking", "no entering", "crossing permitted"], "correct_choice_idx": 2, "direct_answers": ["stop", "no entry", "don't enter", "no entry", "wrong way", "no entering", "no entry", "no entry", "no entry", "caution"], "difficult_direct_answer": false, "rationales": ["The sign means no entering.", "The red sign with the minus sign is a universal sign for no entry.", "The color grabs your attention and the minus usually denotes something against, so going through isn't allowed."], "image": "val2014/COCO_val2014_000000050355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149961, "question_id": "bjtdYyNeYFLov7rCFNGv5s", "question": "What is the man wearing white shirt waiting for?", "choices": ["rain stopping", "crossing street", "his kid", "bus"], "correct_choice_idx": 0, "direct_answers": ["bus", "bus", "cab", "taxi", "rain stopping", "taxi", "taxi", "taxi", "work", "rain"], "difficult_direct_answer": false, "rationales": ["The man is waiting under a roof so he doesn't get wet.", "The ground is wet, and the sky is cloudy. the other people, unlike the man, are holding umbrellas.", "The man doesn't want to get wet."], "image": "train2014/COCO_train2014_000000149961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345833, "question_id": "bjzFsYUi2hF5GS5yq7stWZ", "question": "What style of sunglasses does the man holding the bun have on?", "choices": ["wrap around", "scavenger", "aviator", "cats eye"], "correct_choice_idx": 2, "direct_answers": ["sunglasses", "aviators", "aviators", "aviator", "aviators", "cop", "aviator", "pilot", "aviators", "sun"], "difficult_direct_answer": false, "rationales": ["The shape and look of them are what pilots will wear and are named after that.", "This shape lens and the rims are popularized by pilots during the war period, in aviation, which is where the name comes from.", "The glasses look like those commonly known as aviator sunglasses."], "image": "train2014/COCO_train2014_000000345833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311715, "question_id": "bkK8Ndd5aQUyibRPc2KEsJ", "question": "What type of vehicle is the man in the gray hat riding?", "choices": ["wagon", "four wheeler", "trike", "sports bike"], "correct_choice_idx": 2, "direct_answers": ["taxi", "motorcycle", "motorcycle", "motorcycle", "trike", "motorcycle", "motorcycle", "bike", "motorcycle", "trike"], "difficult_direct_answer": false, "rationales": ["The man is riding a bike with three wheels.", "The bike has three wheels.", "The vehicle driven by the gray hatted man has three wheels."], "image": "train2014/COCO_train2014_000000311715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305105, "question_id": "bkU5ayTg3J4wSZV5qx8p2H", "question": "What time of year is it in this household gathering?", "choices": ["easter", "christmas", "valentine's", "thanksgiving"], "correct_choice_idx": 1, "direct_answers": ["winter", "winter", "christmas", "winter", "christmas", "winter", "anytime", "xmas", "christmas", "winter"], "difficult_direct_answer": false, "rationales": ["You can tell by the decorations in the background as to what time of year it is.", "There are christmas decorations on the wall.", "There is a santa claus blanket on the chair by the people in this room indicating the holiday."], "image": "train2014/COCO_train2014_000000305105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438979, "question_id": "bkWcnfirU688fcefXjrcpm", "question": "These animals are in an area that looks like it is what?", "choices": ["wet", "dry", "submerged", "metropolitan"], "correct_choice_idx": 1, "direct_answers": ["prairie", "farm", "dessert", "barren", "ranch", "desert", "warm", "ranch", "desert", "dry"], "difficult_direct_answer": false, "rationales": ["These sheep are in a rural area. the grass is not green.", "The ground is all brown or yellow.", "There is mostly dirt everywhere save for a few stray bushes here and there. there are rocks as well."], "image": "train2014/COCO_train2014_000000438979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261805, "question_id": "bkyae4e63zQfdhDD3JkaJF", "question": "What type of building are they walking towards?", "choices": ["train station", "metro stop", "taxicab station", "airport"], "correct_choice_idx": 3, "direct_answers": ["terminal", "airport", "airport", "airport", "airport", "airport", "airport", "airport", "airport", "airport terminal"], "difficult_direct_answer": false, "rationales": ["People with luggage go to the airport.", "People with luggage are walking towards a glass building with very tall ceilings. people take luggage when traveling by plane.", "The large scale of the structure and luggage point to this location."], "image": "val2014/COCO_val2014_000000261805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368154, "question_id": "bm94xddWrCNrACS637oiYy", "question": "What could this intersection be called instead of all way?", "choices": ["three-way", "two-way", "one-way", "four-way"], "correct_choice_idx": 3, "direct_answers": ["four way", "four way", "junction", "four way", "four way", "four way", "four way", "four-way", "junction", "four way"], "difficult_direct_answer": false, "rationales": ["The sign is a 4 way.", "This could be called a four way intersection.", "There are four roads leading to an intersection."], "image": "val2014/COCO_val2014_000000368154.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119966, "question_id": "bmPtDyuafneVjTuwHbfLC7", "question": "Which area of the house is this?", "choices": ["kitchen", "basement", "attic", "bathroom"], "correct_choice_idx": 0, "direct_answers": ["kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["The room contains several items associated with the production of food, which is usually not prepared in the other types of rooms.", "The sink, knives, cabinets and cooking paraphernalia present in this room identifies it as a kitchen.", "There is a sink and knives"], "image": "train2014/COCO_train2014_000000119966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52256, "question_id": "bmXjL7fW2wq3DduM9beEPW", "question": "What street intersects Rhode Island Avenue?", "choices": ["14th", "12th", "11th", "4th"], "correct_choice_idx": 0, "direct_answers": ["not clear", "14th", "14 street", "northwest street", "fourteen street", "14th street", "forteenth", "14 st", "14th st", "14th"], "difficult_direct_answer": true, "rationales": ["The sign for rhode island avenue is on a pole. the sign for the intersecting street is lower on the pole.", "The street intersection is visible on another sign sign in the image.", "There is a sign that indicates the name of the street."], "image": "train2014/COCO_train2014_000000052256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73588, "question_id": "bmhGRVszZJuwcKAmXk7wRr", "question": "What fairly important item is missing from this child?", "choices": ["bolo tie", "diaper", "jeans", "wrist bands"], "correct_choice_idx": 1, "direct_answers": ["clothes", "diaper", "clothing", "clothes", "clothing", "diaper", "clothes", "clothes", "diaper", "clothing"], "difficult_direct_answer": false, "rationales": ["Most babies at this age need diapers.", "A baby is on a blanket with no clothes on. babies wear diapers because they are not potty trained.", "The child is naked."], "image": "val2014/COCO_val2014_000000073588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58143, "question_id": "bn5D7w9oyB8aSfztoH5vQU", "question": "What sound will the occupants on the apartments hear through their windows?", "choices": ["people talking", "bikers", "man walking", "train"], "correct_choice_idx": 3, "direct_answers": ["train", "roar", "train", "train", "train", "train", "buses passing", "train", "subway rail", "train"], "difficult_direct_answer": false, "rationales": ["The bicycle is parked, so it is not making noise. the other vehicle is louder than the people.", "It is a large, fast-moving vehicle on tracks.", "Trains whoosh by and make noise."], "image": "train2014/COCO_train2014_000000058143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116224, "question_id": "bnRqimAojQDasSU4itmzpP", "question": "What food category is this object in?", "choices": ["grain", "fruit", "vegetable", "sweet"], "correct_choice_idx": 2, "direct_answers": ["fruit", "vegetable", "vegetable", "vegetable", "fibre", "vegetables", "vegetables", "vegetable", "vegetable", "carrot"], "difficult_direct_answer": false, "rationales": ["It is a carrot.", "This is a carrot. it is classified as a root vegetable.", "That is what a carrot is."], "image": "train2014/COCO_train2014_000000116224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76681, "question_id": "bnsP8vpXo5XbrzNpbEg5jF", "question": "What outdoor activity has the man taken a break from?", "choices": ["baseball", "basketball", "frisbee", "racing"], "correct_choice_idx": 2, "direct_answers": ["frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["He is holding a plastic disk in his hands", "The man is sitting in a chair and holding a plastic disk in his hands which is what is used to play the game.", "The man is sitting and holding a round plastic object with a rim which indicates the activity he likely was participating in prior to sitting down."], "image": "train2014/COCO_train2014_000000076681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421825, "question_id": "bo8GuPfnYgoZPZQUVqZQCS", "question": "The child here is ready for which weather?", "choices": ["hurricane", "wind", "sun rain", "tornado"], "correct_choice_idx": 2, "direct_answers": ["summer", "rain", "rain", "sunny", "sun rain", "cool", "sun", "rainy", "rain", "cool"], "difficult_direct_answer": false, "rationales": ["A large umbrella is being held over a baby. umbrellas are used to block rain and sun.", "The child here is ready for sun rain with a parasol.", "There is an umbrella over the child"], "image": "train2014/COCO_train2014_000000421825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131450, "question_id": "boZCVs275rTM7Qdhmfv8sb", "question": "What type of area is this?", "choices": ["mountain", "residential", "forest", "beach"], "correct_choice_idx": 1, "direct_answers": ["urban", "sidewalk", "sidewalk", "sidewalk", "skiing", "residential", "side street", "residential", "snow area", "residential"], "difficult_direct_answer": false, "rationales": ["A person is on the sidewalk and cars are parked with trees lining the street that is not busy. residential areas are not as busy and have more trees than commercial.", "There are cars parked along the road amongst trees and fences around properties", "The yard fences and sidewalks make it look like an area for homes."], "image": "train2014/COCO_train2014_000000131450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165610, "question_id": "bocNoMYfFq8xrcQo5V9yp4", "question": "What will they serve to drink?", "choices": ["orange juice", "milk", "wine", "grape juice"], "correct_choice_idx": 0, "direct_answers": ["orange juice", "coffee", "orange juice", "juice", "water", "orange juice", "cofee", "orange juice", "juice", "coffee"], "difficult_direct_answer": false, "rationales": ["The eggs on their plates indicate they are likely eating breakfast, and orange juice is a popular breakfast drink. an orange liquid can also be seen in a jug on the table.", "There is a bowl of orange liquid which can be poured into several cups for people to drink.", "All place setting have an oj cup by it."], "image": "train2014/COCO_train2014_000000165610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244713, "question_id": "boduGoeZPLxRZHFsq2NUJ7", "question": "What does the person in the white hat power?", "choices": ["nothing", "blender", "forward movement", "aerobic"], "correct_choice_idx": 1, "direct_answers": ["electricity", "bike", "blender", "cycle", "bike", "blender", "electricity", "bike", "bicycle", "stamina"], "difficult_direct_answer": false, "rationales": ["The person in the white hat has a blender on his bike.", "He is on a bike obviously trying to go forward.", "The person is bending."], "image": "train2014/COCO_train2014_000000244713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13603, "question_id": "bp8URDnJgLV2JgxWwRVXfz", "question": "Where are people enjoying their pizza?", "choices": ["pizzeria", "factory", "work office", "bakery"], "correct_choice_idx": 2, "direct_answers": ["work office", "office", "office", "in cubicle", "office", "office", "office", "in cubicle", "office", "work"], "difficult_direct_answer": false, "rationales": ["It appears to be in a cubicle, and a computer with work files can be seen on the right side of the screen.", "This guy is relaxing and enjoying pizza at his job.", "There are square ceiling tiles, a desk, stuff all over the wall, and a computer that show that she must be in a cubicle."], "image": "train2014/COCO_train2014_000000013603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193171, "question_id": "bpDGn6HR2nyHc9axb82nQU", "question": "What color is the canopy?", "choices": ["red/white", "white/yellow", "green/white", "blue/white"], "correct_choice_idx": 0, "direct_answers": ["red/white", "blue", "blue", "blue white", "blue white", "blue white", "blue white", "blue white", "blue white", "blue grey"], "difficult_direct_answer": false, "rationales": ["It has these stripes", "The awning is blue and white striped.", "There is only one canopy seen by the tennis court and the color is blue and white strips."], "image": "train2014/COCO_train2014_000000193171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314497, "question_id": "bpthJvHZPrZZXgWzgiBRF2", "question": "How many people are likely enjoying the dessert?", "choices": ["three", "one", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "one", "one", "one", "multiple", "eight", "one", "one", "zero"], "difficult_direct_answer": false, "rationales": ["There is one plate.", "The food in question is cake and the serving portion is a slice which is traditional for cake. cake is usually served in a slice intended to be eating by an individual.", "There is one slice on the plate."], "image": "train2014/COCO_train2014_000000314497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375750, "question_id": "bptwqwPWawHj7vMQVdGKF2", "question": "Where is this person playing?", "choices": ["sand", "park", "court", "playground"], "correct_choice_idx": 2, "direct_answers": ["tennis court", "indoors", "tennis court", "tennis court", "tennis court", "tennis court", "tennis court", "tennis", "tennis court", "court"], "difficult_direct_answer": false, "rationales": ["The person is on a court.", "The person is playing tennis based on the lines, their uniform and equipment. the playing surface for tennis is known as answer a.", "Tennis is played on a court."], "image": "train2014/COCO_train2014_000000375750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574332, "question_id": "bpw2DwNr3naYd4qGi2Nghq", "question": "Why is the man's coat yellow in color?", "choices": ["dress code", "fashion", "visibility", "camouflage"], "correct_choice_idx": 2, "direct_answers": ["safety", "worker", "safety", "visibility", "visibility", "easy visiblity", "visibility", "reflector", "increased visibility", "visibility"], "difficult_direct_answer": false, "rationales": ["The man's yellow coat on the left is to provide visibility at work.", "The man is wearing yellow to make him visible to cars.", "It' s so people can see them."], "image": "val2014/COCO_val2014_000000574332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308257, "question_id": "bpy4fzF9ToAedKRmXFtqUL", "question": "How might this person easily look up the phone number for a taxi?", "choices": ["check purse", "ask passersby", "google it", "yellow pages"], "correct_choice_idx": 2, "direct_answers": ["google", "internet search", "computer", "computer", "google it", "web search", "uber", "on computer", "internet", "internet search"], "difficult_direct_answer": false, "rationales": ["This person might google the taxi phone number.", "She has a laptop. she could use a search engine to find a phone number for a taxi service.", "She can do a search on the computer."], "image": "train2014/COCO_train2014_000000308257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61624, "question_id": "bqaWhNRHMBABG4aneHqiM8", "question": "Why is the man holding up an orange object?", "choices": ["direct traffic", "to eat", "to fight", "to dance"], "correct_choice_idx": 0, "direct_answers": ["signaling plane", "directing airplanes", "direct traffic", "safety reasons", "direct planes", "give directions", "directing planes", "guidance", "directing traffic", "directing traffic"], "difficult_direct_answer": true, "rationales": ["The man has an orange object so the planes can see him.", "The man is directing traffic.", "Only persons with relevant jobs are allowed on the tarmac."], "image": "val2014/COCO_val2014_000000061624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171845, "question_id": "bqhjPP9v8jUV76GkqCafkT", "question": "What type of exhibition is this?", "choices": ["gun", "art", "sports", "animal"], "correct_choice_idx": 1, "direct_answers": ["toilet", "art", "toilet", "art", "art", "art", "art", "toilets", "art", "art"], "difficult_direct_answer": false, "rationales": ["The toilets are painted. there are no animals, athletes, or weapons.", "Decorative toilets that have been painted are on display.", "The toilets are sculptures."], "image": "train2014/COCO_train2014_000000171845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483027, "question_id": "bqnwECy3jmFing4cDofKR3", "question": "Why are they here?", "choices": ["to rest", "clean up", "to eat", "selling food"], "correct_choice_idx": 2, "direct_answers": ["eating", "eating", "lunch", "eating", "eating", "lunch", "lunch", "to eat", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The people are at a restaurant.", "They are there to eat.", "Based on the place settings, the tables and the fact that most people are holding utensils it is likely they are at a restaurant. people attend restaurants most commonly with the purpose of eating."], "image": "val2014/COCO_val2014_000000483027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493544, "question_id": "br2YLvAo85iuRH6Hi6oApW", "question": "What venue is this place?", "choices": ["ski resort", "business zone", "residential zone", "shopping zone"], "correct_choice_idx": 0, "direct_answers": ["ski resort", "ski resort", "ski resort", "ski resort", "ski slope", "ski resort", "ski resort", "ski hill", "ski resort", "ski report"], "difficult_direct_answer": false, "rationales": ["The ground is covered in snow. the person is participating in a winter sport that uses poles.", "The person is skiing.", "The man is skiing down the hill so it does make sense."], "image": "val2014/COCO_val2014_000000493544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56656, "question_id": "brc7et7bWDe63FQ8YL9zvn", "question": "What is sometimes substituted for the item the woman is holding?", "choices": ["cummerbund", "glove", "sock", "bow tie"], "correct_choice_idx": 3, "direct_answers": ["bowtie", "bowtie", "bowtie", "bowtie", "bowtie", "clip on", "bowtie", "scarf", "bow tie", "bow tie"], "difficult_direct_answer": false, "rationales": ["Bowties can be subbed for ties.", "The bowtie is subbed.", "A bow tie is used on occasion instead of a regular tie."], "image": "train2014/COCO_train2014_000000056656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437353, "question_id": "brgmoPWUrQVjhdTcTweZfN", "question": "What is knocked over?", "choices": ["light post", "fire hydrant", "parking meter", "stop sign"], "correct_choice_idx": 2, "direct_answers": ["parking meter", "parking meter", "parking meter", "parking meters", "parking meters", "parking meters", "parking meter", "meter", "meters", "parking meter"], "difficult_direct_answer": false, "rationales": ["There are two time keeping pods at the end of the pole next to the curb where the car is.", "There is a visible parking meter on the sidewalk grate with a piece of grate pulled up at its base which would not be its normal upright position. it is clear a parking meter based on the dials and design.", "The parking meter is on the ground."], "image": "train2014/COCO_train2014_000000437353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342279, "question_id": "bs2so7phu5rfaoCTB6jgH8", "question": "In what decade were these reflective ceiling decorations first used?", "choices": ["1920s", "1970s", "1950s", "1960s"], "correct_choice_idx": 0, "direct_answers": ["seventies", "seventies", "1970s", "sixties", "seventies", "1970-80's disco", "70s", "1898", "1920s", "70s"], "difficult_direct_answer": false, "rationales": ["The object in question is a disco ball based on its shape and composition. based on an internet search, answer a is accurate.", "A disco ball hangs above people dancing.", "These reflective ceiling decorations were used during the disco era of the 1970s."], "image": "train2014/COCO_train2014_000000342279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511179, "question_id": "bsApacNPFmfe4LRWPJ3E54", "question": "What kind of seafood did they most likely eat at the restaurant?", "choices": ["calamari", "oysters", "mussels", "clams"], "correct_choice_idx": 1, "direct_answers": ["clams", "oysters", "clams", "oysters", "oysters", "oysters", "oysters", "fresh", "oysters", "oysters"], "difficult_direct_answer": false, "rationales": ["There is a plate full of empty oyster shells on the table. as they are empty, it is likely that they have been eaten by the people also at the table.", "Oyster shells are shown.", "There are empty shells on the plate"], "image": "train2014/COCO_train2014_000000511179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483316, "question_id": "bsBYKshrhVGQP5o7RHjNeT", "question": "What will the train do next?", "choices": ["evacuate passengers", "move", "close doors", "power off"], "correct_choice_idx": 2, "direct_answers": ["depart", "move", "leave", "close doors", "close doors", "leave station", "depart", "leave station", "start moving", "stop"], "difficult_direct_answer": false, "rationales": ["It will close the doors and leave.", "The doors will stay open until the train is ready to leave from the station.", "Since nobody is there or everyone who needed a ride has boarded, it is ready to move, so it has to shut the doors for safety."], "image": "train2014/COCO_train2014_000000483316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22487, "question_id": "bsJRf5YnHQ9ftyCgs8Q7hJ", "question": "The men holding instruments are wearing uniforms that resemble the colors of what police force?", "choices": ["mounties", "nypd", "scotland yard", "lapd"], "correct_choice_idx": 0, "direct_answers": ["blue black", "air force", "usa", "mounties", "motorcycle force", "ramp", "russian", "canadian police", "canadian", "ramp"], "difficult_direct_answer": true, "rationales": ["Mounties wear red also.", "They are the color of mounties.", "There is a red uniform with the people playing instruments."], "image": "train2014/COCO_train2014_000000022487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266910, "question_id": "bsXund8W78w3gqTJ8h4C9x", "question": "What is the person wearing the blue coat about to do?", "choices": ["board train", "wave goodbye", "serve lunch", "get off"], "correct_choice_idx": 3, "direct_answers": ["exit train", "exit train", "exit train", "exit train", "get off", "jump out", "board", "get off", "disembark train", "board"], "difficult_direct_answer": false, "rationales": ["He's leaning forward in the way of the doors as he's about to leave", "The man is near the exit.", "The person is standing at the door and the door is open."], "image": "train2014/COCO_train2014_000000266910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292722, "question_id": "bsiuzkvcXgQ2To3fu5DX6n", "question": "What is the window covering called?", "choices": ["panels", "curtains", "blinds", "shades"], "correct_choice_idx": 2, "direct_answers": ["blinds", "blinds", "blinds", "blinds", "blinds", "blinds", "blinds", "blinds", "blinds", "blinds"], "difficult_direct_answer": false, "rationales": ["There are horizontal lines across the window that can be adjusted.", "The window uses blinds.", "The windows have horizontal slats on them which are used to provide shade."], "image": "train2014/COCO_train2014_000000292722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544516, "question_id": "bsszZfE6TfZ2EZ6W93Z4Ws", "question": "Why are the horses behind a fence?", "choices": ["fun", "fear", "aesthetic appeal", "security"], "correct_choice_idx": 3, "direct_answers": ["containment", "prevent escaping", "containment", "security", "unsure", "can't escape", "containment", "contain them", "horse", "enclosed"], "difficult_direct_answer": false, "rationales": ["Horses tend to roam. this is how they are kept from being lost.", "The horses are for security.", "Horses are kept behind fences so they do not get out and no one else goes in unless authorized."], "image": "train2014/COCO_train2014_000000544516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426118, "question_id": "bt3M7gnZH7zokTGTSujCCQ", "question": "Why is the plane so small?", "choices": ["model airplane", "racing", "manufacturing error", "for pets"], "correct_choice_idx": 0, "direct_answers": ["model airplane", "miniature", "model airplane", "model plane", "model", "model plane", "model", "model", "miniature-aircraft show", "studying"], "difficult_direct_answer": false, "rationales": ["It is to play with", "The plane is just a model.", "It's just a replica of a bigger one. usually a toy or just to look at."], "image": "train2014/COCO_train2014_000000426118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556345, "question_id": "btikPcNaiTsXEB4mwGKyKZ", "question": "What is this country?", "choices": ["canada", "france", "italy", "united states"], "correct_choice_idx": 3, "direct_answers": ["usa", "united states", "usa", "america", "america", "united states", "united states", "usa", "colorado usa", "canada"], "difficult_direct_answer": false, "rationales": ["The country is the us.", "The west is in the us.", "Golden is in california. california is in the united states."], "image": "val2014/COCO_val2014_000000556345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157269, "question_id": "btnHT5T9hYmrUu7EeEjcox", "question": "What are the signs for?", "choices": ["traffic signs", "maps", "selling stuff", "missing dogs"], "correct_choice_idx": 3, "direct_answers": ["missing dogs", "traffic light", "watch", "pets", "looking animals", "information", "museum", "animals zone", "information", "safety"], "difficult_direct_answer": true, "rationales": ["There are dogs on the signs.", "There appears to be a dog on the signs. when dogs are missing, signs are often put up with a picture of the dog and pertinent information to help reunite them with the owner.", "The signs are for dogs."], "image": "val2014/COCO_val2014_000000157269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168804, "question_id": "btpu2s6HSWwXuWs69M4yMe", "question": "What caused the dark stains on the container?", "choices": ["oil", "dirt", "paint", "ink"], "correct_choice_idx": 0, "direct_answers": ["baking", "grease", "oil", "grease", "stored", "grease", "oil", "grease", "grease", "oil"], "difficult_direct_answer": false, "rationales": ["The grease from the pizza.", "Oil from the cheese caused the stain.", "Pizza is greasy and releases oil from the cheese."], "image": "val2014/COCO_val2014_000000168804.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281840, "question_id": "bttNhU5pcXTFGHpFvsGnf2", "question": "The man in the vest and blue jeans looks like he could be a member of what group?", "choices": ["zz top", "new edition", "jackson 5", "spice girls"], "correct_choice_idx": 0, "direct_answers": ["zz top", "lumberjacks", "zz top", "grandparents", "band", "zz top", "lumberjack", "curl", "church", "lumberjacks"], "difficult_direct_answer": false, "rationales": ["All the members of this band have long beards.", "The man has a top.", "No one with a beard like that was ever in the jackson five or new edition. it's obvious that no one with a beard could ever be in the spice girls although it's a humorous thought; especially with \"scary spice.\" but we get the answer here by a process of elimination."], "image": "train2014/COCO_train2014_000000281840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271429, "question_id": "btyBjkLGAb2WbNdQGs8JQj", "question": "What is the relationship between the two men sitting on the bench in this situation?", "choices": ["classmates", "coworkers", "strangers", "teammates"], "correct_choice_idx": 3, "direct_answers": ["teammates", "teammates", "teammates", "teammates", "teammates", "teammates", "same team", "teammates", "players", "teammates"], "difficult_direct_answer": false, "rationales": ["The man are both wearing the same jersey.", "By the uniforms they are wearing and the sport they are playing you can tell what they are in relation to each other.", "The relation is a teammate."], "image": "val2014/COCO_val2014_000000271429.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312051, "question_id": "buLJd2jBTvAQW6rAczT5rG", "question": "Where is this elephant standing?", "choices": ["car wash", "cow barn", "rodeo stand", "circus grounds"], "correct_choice_idx": 3, "direct_answers": ["circus ring", "sand", "circus ring", "circus ring", "ring", "stadium", "circus grounds", "dirt", "circus ring", "circus ring"], "difficult_direct_answer": false, "rationales": ["There is a colorful ring and stands", "There are colorful rings around the elephant.", "The elephant is inside a circus ring."], "image": "val2014/COCO_val2014_000000312051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287084, "question_id": "buPRyzAsT9FgnrWkZkNSkc", "question": "Where was this man very recently?", "choices": ["far inland", "gaming", "ocean", "in bed"], "correct_choice_idx": 2, "direct_answers": ["in water", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "beach", "in water"], "difficult_direct_answer": false, "rationales": ["The man is carrying a surfboard and wearing a wetsuit, so yes, he is just in from his good runs today on some great waves.", "He is in a wetsuit and holding a surfboard, and he can be seen with water dripping on his head, indicating he just left the ocean.", "The man is surfing."], "image": "train2014/COCO_train2014_000000287084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435620, "question_id": "bukoko46fYFuW9bjqRqD2y", "question": "What country invented the type of bread used on this sandwich?", "choices": ["france", "italy", "greece", "usa"], "correct_choice_idx": 0, "direct_answers": ["austria", "italy", "france", "france", "france", "france", "france", "austria", "france", "france"], "difficult_direct_answer": false, "rationales": ["This is from france.", "This is a croissant which is a french pastry.", "Croissants are a french food."], "image": "train2014/COCO_train2014_000000435620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149440, "question_id": "bv7PDwjddDbeEvY6vfEdND", "question": "What kind of area is shown?", "choices": ["forest", "rural", "coastal", "urban"], "correct_choice_idx": 3, "direct_answers": ["walk way", "urban", "market", "alley", "city", "europe", "shade", "store", "shaded", "decoration area"], "difficult_direct_answer": true, "rationales": ["The buildings are close togther and it's more urban area.", "There are many buildings spaced very close together, indicating it is a city or urban setting.", "The is a street depicted lined with buildings."], "image": "train2014/COCO_train2014_000000149440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518844, "question_id": "bvDyahkkP3GVYmfceihXNe", "question": "The giraffes are made of what kind of fabric?", "choices": ["fur", "nylon", "denim", "water resistant"], "correct_choice_idx": 3, "direct_answers": ["water resistant", "papier mache", "fabric", "polyester", "cotton", "cloth", "plush", "velvet", "polyester", "plastic"], "difficult_direct_answer": true, "rationales": ["The fabric appears smooth and light.", "They look to be made of some kind of plastic that would not let water soak through.", "The materials used are sure not to be destroyed by rain."], "image": "train2014/COCO_train2014_000000518844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353344, "question_id": "bvEqfvBGmaSUEztbMvFic5", "question": "How many German Shepherds shown in the image?", "choices": ["two", "one", "six", "five"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one german shepherd playing in the train tracks.", "One german shepherd is walking along.", "Only one dog is in the picture."], "image": "train2014/COCO_train2014_000000353344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357057, "question_id": "bvJZtXbrGByXfHzhDWuPXD", "question": "How hot is the air from a hair dryer?", "choices": ["200-300f", "100-120f", "500-600f", "80-120f"], "correct_choice_idx": 3, "direct_answers": ["hot", "125 f", "warm", "450 degrees", "warm", "very", "80-120f", "140 celsius", "morten", "80-140 degrees"], "difficult_direct_answer": true, "rationales": ["Hair dryers are still safe for human use.", "The hair dryer can get a maximum of 140 degrees.", "The hair dryer doesn't get so hot it could be deadly."], "image": "train2014/COCO_train2014_000000357057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88084, "question_id": "bvKY8z3zAewDpjAhdgTMTk", "question": "The multiple markings in front of the hydrant on the asphalt alert drivers to what item?", "choices": ["breakdown lane", "passing lane", "crosswalk", "bicycle lane"], "correct_choice_idx": 2, "direct_answers": ["crosswalk", "crosswalk", "crosses", "pedestrian crossing", "crosswalk", "crosswalk", "cross walk", "not park", "crosswalk", "crossword"], "difficult_direct_answer": false, "rationales": ["The markings indicate people can cross there.", "The yellow lines on the road are the easiest colour to see and it lets pedestrians know they can safely cross there.", "Lanes would not be that wide."], "image": "val2014/COCO_val2014_000000088084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515878, "question_id": "bvWhxkJNakzmAFvESQ2dry", "question": "What material is the floor made of?", "choices": ["carpet", "vinyl", "wood", "tile"], "correct_choice_idx": 1, "direct_answers": ["ceramic", "linoleum", "tile", "linoleum", "vinyl", "linoleum", "linoleum", "ceramic", "tile", "tile"], "difficult_direct_answer": false, "rationales": ["Most kitchens you a surface that is easy to clean and won't rot or rust.", "This style of tiling is likely vinyl based on the texture and color.", "Vinyl floors create tile."], "image": "train2014/COCO_train2014_000000515878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371928, "question_id": "bvw52DcoqWyKmzKp43sYoQ", "question": "What liquid have they been given for breakfast?", "choices": ["banana juice", "mango juice", "pineapple juice", "orange juice"], "correct_choice_idx": 3, "direct_answers": ["juice", "orange juice", "orange juice", "orange juice", "orange juice", "orange juice", "juice", "juice", "orange juice", "orange juice"], "difficult_direct_answer": false, "rationales": ["You can tell by the color and other foods around it as to what is in the bottle.", "Orange juice is bottled up.", "There is oj in the jar."], "image": "train2014/COCO_train2014_000000371928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409143, "question_id": "bwDLX6znuTw6eByS8evgfG", "question": "The animals shown here give birth to what?", "choices": ["calves", "lambs", "kids", "children"], "correct_choice_idx": 1, "direct_answers": ["lamb", "kids", "lamb", "lambs", "lambs", "lamb", "kids", "lambs", "lamb", "babies"], "difficult_direct_answer": false, "rationales": ["The animals are sheep based on their size, shape and the appearance of their coats. sheep give birth to answer a.", "The animal is sheep and their young ones are know to be lamp.", "Sheep are grazing in a field."], "image": "train2014/COCO_train2014_000000409143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243336, "question_id": "bwERgtEKtZzHvdcbmXcMYQ", "question": "Who has the same color hair as the child on the left?", "choices": ["carrot top", "natalie portman", "jessica biel", "jessica simpson"], "correct_choice_idx": 0, "direct_answers": ["fiona", "man", "red", "carrot top", "prince harry", "surfboard", "ron weasley", "kathy griffin", "middle person", "carrot top"], "difficult_direct_answer": true, "rationales": ["The hair is the same color as a carrot.", "A famous comedian with a lot of red hair.", "The little boy is redheaded."], "image": "train2014/COCO_train2014_000000243336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99681, "question_id": "bwXhJdfhWe3Fk2sPZnLreK", "question": "When working in the environment which procedure is most important?", "choices": ["lunch", "schedule", "timesheet", "safety"], "correct_choice_idx": 3, "direct_answers": ["safety", "safety", "rules", "safety", "safety", "sorting", "listening", "safety", "safety", "safety"], "difficult_direct_answer": false, "rationales": ["When working around any machinery you have to be careful.", "It is a warehouse with lots of tools ans machinery that can harm someone if proper precautions are not followed.", "It appears to be a factory where there is a lot of dangerous machinery that can harm someone without the proper training and precaution."], "image": "train2014/COCO_train2014_000000099681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525971, "question_id": "bww6yW2795yk9fShEvvMtD", "question": "Which color is the safest to wear to help stand out?", "choices": ["gray", "brown", "orange", "black"], "correct_choice_idx": 2, "direct_answers": ["orange", "orange", "neon yellow", "bright green", "orange", "yellow", "white", "red", "orange", "yellow"], "difficult_direct_answer": false, "rationales": ["Orange is the most eye catching of the colors listed here.", "Orange is a common color of reflective vests worn by highway workers and others who need to be visible for safety reasons and that is because orange is a color that stands out and is noticeable.", "The orange in the picture is the first colour that stands out, and it can be seen first before any other colour."], "image": "val2014/COCO_val2014_000000525971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340440, "question_id": "bx5apnS4KZ6YrpUjWJK7oo", "question": "Why is the room so small?", "choices": ["temporary arrangement", "college dorm", "small tenants", "low rent"], "correct_choice_idx": 1, "direct_answers": ["dorm", "cluttered", "impoverished occupants", "small unit", "dorm", "small apartment", "its bedroom", "small apartment", "studio", "college dorm"], "difficult_direct_answer": false, "rationales": ["There is a small piece of furniture closest with a small desk and laptop and pairs of tennis shoes.", "This looks like the setup of a college dorm and the rooms are smaller in dormitories.", "Looks to be a room that a college student would stay in if they lived on campus."], "image": "train2014/COCO_train2014_000000340440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62880, "question_id": "bx7iJ9rwJmHLsC6iAqKDD5", "question": "What did the motorcycle near the crosswalk just do?", "choices": ["turn", "fall", "break", "crash"], "correct_choice_idx": 0, "direct_answers": ["turn right", "turn", "turn", "turn right", "turn", "turn", "illegally go", "cross", "turn", "cross intersection"], "difficult_direct_answer": false, "rationales": ["The motorcycle turns.", "The motorcycle is turning.", "The front wheel is turning."], "image": "train2014/COCO_train2014_000000062880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9800, "question_id": "bxWftHgCtvVm9j7QLR9s3x", "question": "What is the hair on the side of the man's cheek called?", "choices": ["sideburn", "mustache", "soul patch", "goatee"], "correct_choice_idx": 0, "direct_answers": ["sideburns", "chops", "buzz cut", "sideburn", "side burns", "sideburns", "sideburn", "sideburn", "mustache", "sideburn"], "difficult_direct_answer": false, "rationales": ["The man has sideburns on the side of his face.", "The hair is called a sideburn.", "The hair on the side is his sideburns."], "image": "val2014/COCO_val2014_000000009800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104800, "question_id": "bxsWqDxGTyEfBSDAjSoGfL", "question": "What kind of accessory should the woman wear?", "choices": ["scarf", "wristbands", "gloves", "sports shoes"], "correct_choice_idx": 3, "direct_answers": ["helmet", "shoes", "helmet", "helmet", "helmet", "helmet", "helmet", "sports shoes", "sneakers", "shoes"], "difficult_direct_answer": false, "rationales": ["The woman should have sneakers to bike.", "Sneakers would protect the feet while riding a bike. sandals could fall off.", "This would be a better choice for bike riding than sandals."], "image": "train2014/COCO_train2014_000000104800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169040, "question_id": "byGrmiwujRuwRDnK4x8MFb", "question": "What do people use who enter the building shown here?", "choices": ["beds", "movie cameras", "kitchen", "farm"], "correct_choice_idx": 0, "direct_answers": ["clothes", "credit card", "hotel key", "keys", "resting", "beds", "beds", "hotel key", "beds", "overnight rooms"], "difficult_direct_answer": false, "rationales": ["The people have beds.", "The building is a hotel. hotels are places where people sleep, and this piece of furniture is where people sleep.", "A hotel allows travelers a place to stay overnight when away from home."], "image": "train2014/COCO_train2014_000000169040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578418, "question_id": "byPXnERvu2G8UmwEaRKwYA", "question": "What type of cubed cheese is in the salad?", "choices": ["bleu", "cheddar", "american", "feta"], "correct_choice_idx": 3, "direct_answers": ["feta", "ricotta", "mozzarella", "feta", "feta", "mozzarella", "feta", "goat cheese", "feta", "feta"], "difficult_direct_answer": false, "rationales": ["This is a salad with feta cheese on it.", "It is a greek salad. cheddar, bleu, and american cheeses are not used in greek salads.", "Feta cheese is located on top of the salad."], "image": "val2014/COCO_val2014_000000578418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239053, "question_id": "byRDvf8pTfpfNqU6GXzht8", "question": "What sport are they playing?", "choices": ["soccer", "ultimate frisbee", "cricket", "rugby"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "frisbee", "frisbee", "discgolf", "frisbee", "ultimate frisbee", "frisbee", "discgolf", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["There is a very thin white object that's floating near the ground and a man getting lower to try to get it.", "There is a man throwing a frisbee.", "They are playing frisbee."], "image": "train2014/COCO_train2014_000000239053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528358, "question_id": "bysXdyK75SQwTJN8sA5Wmg", "question": "What is the name of this sandwich?", "choices": ["monte cristo", "club sandwich", "reuben", "hoagie"], "correct_choice_idx": 1, "direct_answers": ["tomato", "club", "club", "blt", "club sandwich", "club", "turkey sandwich", "blt", "blt", "club"], "difficult_direct_answer": false, "rationales": ["That's a club sandwich on the plate.", "The club sandwich has fries and some cheese.", "Club sandwiches have bacon and lettuce."], "image": "train2014/COCO_train2014_000000528358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576622, "question_id": "bz3GsNwMB8vD4mwdqcxg7D", "question": "What breed dog it is?", "choices": ["poodle", "pug", "dachshund", "cane corso"], "correct_choice_idx": 3, "direct_answers": ["doberman", "don't know", "black lab", "cane corso", "fly kite", "shepherd", "doberman", "bull", "labrador", "labrador"], "difficult_direct_answer": false, "rationales": ["The dog is black with upright ears which is typical of a cane corso dog.", "The dog is a cane corso.", "The dog is a cane corso."], "image": "train2014/COCO_train2014_000000576622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175461, "question_id": "bz6RZgSvt68rrhmVYqtwJm", "question": "What country is this?", "choices": ["australia", "canada", "uk", "us"], "correct_choice_idx": 2, "direct_answers": ["england", "germany", "uk", "england", "england", "uk", "unknown", "uk", "england", "great britain"], "difficult_direct_answer": false, "rationales": ["It's from the united kingdom", "The uk has double decker buses.", "The words on the bus are in english and the bus is driving on the left side."], "image": "train2014/COCO_train2014_000000175461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436696, "question_id": "bz7Ly4bNDPgRMMG9gy7GvE", "question": "What setting is this sidewalk in?", "choices": ["forest", "urban", "rural", "farm"], "correct_choice_idx": 1, "direct_answers": ["urban", "urban", "residential area", "urban", "residential", "playing area", "city", "city", "city", "city"], "difficult_direct_answer": false, "rationales": ["The setting is urban.", "The place seems to be urban by the presences of the people and building.", "The sidewalk is in an urban area."], "image": "val2014/COCO_val2014_000000436696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4462, "question_id": "bzGCHW8m32hr4Gm56BKg6G", "question": "What is the longest word on the signs?", "choices": ["prepare", "less", "great", "caution"], "correct_choice_idx": 0, "direct_answers": ["prepare", "prepare", "prepare", "prepare", "prepare", "prepare", "prepare", "prepare", "prepare", "prepare"], "difficult_direct_answer": false, "rationales": ["Prepare is the longest word with 7 letters, compared to 2 and 4 letters.", "The word \"prepare\" has the most letters.", "This is obvious both given what is written on the sign and context."], "image": "train2014/COCO_train2014_000000004462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528966, "question_id": "bzxFeqiYCWNR4cyYZ8Tdj8", "question": "Which cake character figure is in most danger?", "choices": ["none", "black", "orange", "candle"], "correct_choice_idx": 2, "direct_answers": ["shark", "dragon", "shark", "cake topper", "shark", "unknown", "orange", "smaller ones", "shark", "mouse"], "difficult_direct_answer": false, "rationales": ["The character is orange.", "The orange figure is going to get eaten.", "The item has some fire on top."], "image": "val2014/COCO_val2014_000000528966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29096, "question_id": "c2HGX8LRJ6EqvEgajAsdWG", "question": "What sort of environment is this vehicle parked in?", "choices": ["snowy", "wet", "arid", "oceanic"], "correct_choice_idx": 2, "direct_answers": ["dessert", "sandy", "sand dune", "desert", "sand dune", "desert", "desert", "desert", "arid", "desert"], "difficult_direct_answer": false, "rationales": ["The vehicle is parked in a desert looking area with sand, dust, and not water.", "The environment is arid.", "The vehicle is parked in an arid desert."], "image": "train2014/COCO_train2014_000000029096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17350, "question_id": "c2JHNP3WL9EqU9q2zpeHKo", "question": "Why are the dogs on leashes?", "choices": ["playing game", "for protection", "as punishment", "for style"], "correct_choice_idx": 1, "direct_answers": ["to stay", "safety", "keep safe", "for protection", "on boat", "keep safe", "safety", "constrain dogs", "control them", "prevent escape"], "difficult_direct_answer": false, "rationales": ["They don't want them to jump out of the boat and into the water.", "Due to the setting that they are in they need to be restrained.", "The dogs do not need to run away."], "image": "val2014/COCO_val2014_000000017350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399282, "question_id": "c2fQiBLpCfukHEJAfiwduw", "question": "The man uses his body to stay on top of the elephant by squeezing his?", "choices": ["mouth", "neck", "elbows", "legs"], "correct_choice_idx": 3, "direct_answers": ["legs", "legs", "legs", "legs", "legs", "back", "legs", "back", "legs", "legs"], "difficult_direct_answer": false, "rationales": ["The muscles in his lower body can help grip the elephants back in order to stay mounted.", "He holds on with his thighs.", "The rider has his knees turned inwards in order to keep his balance on the elephant. this also helps give commands to the elephant."], "image": "train2014/COCO_train2014_000000399282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198068, "question_id": "c2kCSLzRCwACLSEAz923Sn", "question": "What is the player in red doing?", "choices": ["running", "celebrating", "hitting", "cleaning mound"], "correct_choice_idx": 2, "direct_answers": ["pitching", "smoothing dirt", "looking down", "waiting", "kicking dirt", "hitting", "pitching", "pitching", "pitching", "pitching"], "difficult_direct_answer": false, "rationales": ["The player is going to clear the mound.", "The player in the red is standing on the mound. he is kicking dirt away.", "The player is hitting."], "image": "train2014/COCO_train2014_000000198068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446303, "question_id": "c2pxFi9ktmP2NxhzEJDi9X", "question": "What type of golden brown rolls are these?", "choices": ["sourdough", "croissants", "french bread", "crescent"], "correct_choice_idx": 1, "direct_answers": ["bagels", "crescent", "croissant", "croissants", "bagel", "bagels", "croissant", "donuts", "cronuts", "biscuits"], "difficult_direct_answer": false, "rationales": ["A flaky baked good is on a table. croissants are golden brown and flaky.", "The rolls are croissants.", "Since i don't know what these are i'll say they could be croissants."], "image": "train2014/COCO_train2014_000000446303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127967, "question_id": "c2wKkxm5NBUup2UerDv3ec", "question": "The truck is traveling on the railroad tracks for what reason?", "choices": ["rail repair", "electrical repair", "debris cleanup", "new construction"], "correct_choice_idx": 2, "direct_answers": ["for maintenance", "fix electric", "cleaning", "matenice", "debris cleanup", "maintenance", "repairs", "acquire something", "railway maintenance", "maintenance"], "difficult_direct_answer": true, "rationales": ["The man is standing on the front to pick up any trash they see.", "Trucks like these with special rail tires are used to keep the tracks clear so the trains don't hit anything.", "The truck is cleaning debris."], "image": "train2014/COCO_train2014_000000127967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401707, "question_id": "c37R68konHSghb2C3H2gxv", "question": "Which person can get to the bottom of the hill first?", "choices": ["white helmet", "red top", "maroon bottoms", "full black"], "correct_choice_idx": 3, "direct_answers": ["in front", "full black", "black one", "furthest away", "dark jacket", "fastest", "skier", "man", "any", "far right"], "difficult_direct_answer": true, "rationales": ["The person is in black.", "The people visible are on a hill with the top being closest to the bottom of the image. the people closest to the top of the image are therefore closer to the bottom of the hill and the person closer to the top is in all black.", "He is in front of all of the people."], "image": "train2014/COCO_train2014_000000401707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30401, "question_id": "c3Qn7iBevF2vEcSiRWNgsM", "question": "What is the most likely floor level for this room?", "choices": ["first/second", "fifth/sixth", "third/fourth", "seventh/eighth"], "correct_choice_idx": 0, "direct_answers": ["second", "first/second", "second", "upstairs", "second floor", "second floor", "first", "plant", "15 feet", "ground level"], "difficult_direct_answer": false, "rationales": ["There is a deck just outside the window and they are only on these floors", "The are different levels.", "This is likely on the first or second story of the housing area."], "image": "train2014/COCO_train2014_000000030401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514668, "question_id": "c42k27MZ5oGGerBjuUbHFS", "question": "Which section of the car is the cat by the window sitting at?", "choices": ["passenger backseat", "front passenger", "driver seat", "driver backseat"], "correct_choice_idx": 1, "direct_answers": ["front passenger", "passenger", "door", "passenger seat", "passenger seat", "passenger seat", "passenger seat", "right door", "front", "center console"], "difficult_direct_answer": false, "rationales": ["The cat is looking out the front passenger window.", "Based on seeing the side rearview mirror, we can tell these cats are sitting in the front seat next to the driver's seat.", "A cat is standing up on the passenger door of a vehicle."], "image": "val2014/COCO_val2014_000000514668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454708, "question_id": "c486tWBFwHyGxWjPis7Dpb", "question": "Which one of these brands manufactures items like the ones in the blue box?", "choices": ["ikea", "hasbro", "kleenex", "rubbermaid"], "correct_choice_idx": 2, "direct_answers": ["kleenex", "kleenex", "kleenex", "pen stand", "kleenex", "kleenex", "kleenex", "pen stand", "pen stand", "kleenex"], "difficult_direct_answer": false, "rationales": ["Kleenex is known for making tissues.", "That is the brand of tissues.", "Kleenex makes tissues."], "image": "train2014/COCO_train2014_000000454708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78466, "question_id": "c4EvvbbcKSZD8D6bjjt4LL", "question": "What game brand is sponsoring this facility?", "choices": ["wii", "x-box", "playstation 3", "nintendo"], "correct_choice_idx": 2, "direct_answers": ["skanska", "playstation", "playstation 3", "sony eriksson", "tennis", "playstation", "play station", "visostar", "playstation", "skanska"], "difficult_direct_answer": false, "rationales": ["You can tell by the companies logo and name as to who is sponsoring the game.", "Playstation 3's logo is shown.", "The name playstation 3 shows up on the banner."], "image": "train2014/COCO_train2014_000000078466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460407, "question_id": "c4KAdTGXpTxdKWQpq4LXFu", "question": "What are the people going to take?", "choices": ["ship", "speed boat", "cruise", "ferry"], "correct_choice_idx": 2, "direct_answers": ["cruise", "cruise", "cruise", "boat", "cruise", "cruise", "cruise", "tour", "cruise", "boat"], "difficult_direct_answer": false, "rationales": ["The people take the cruise.", "The people are lined up approaching a boat. people line up in approaching vehicles when they are getting ready to take them.", "The people are boarding on a cruise ship."], "image": "val2014/COCO_val2014_000000460407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224375, "question_id": "c4KksNPFWK3cvhhhf4KiJp", "question": "Which brothers originally invented this flying device?", "choices": ["mario brothers", "wayne brothers", "trump brothers", "wright brothers"], "correct_choice_idx": 3, "direct_answers": ["wright", "wright brothers", "wright", "wright brothers", "wright", "wright brothers", "wright", "right brothers", "airplain", "wilbur orville"], "difficult_direct_answer": false, "rationales": ["The brothers were known for this historical invention.", "The wright brothers started it.", "They were the first in flight"], "image": "train2014/COCO_train2014_000000224375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153655, "question_id": "c4gUTPPxesHCphE2cjejew", "question": "Where is the man on the bench sitting?", "choices": ["on street", "backyard", "in zoo", "in park"], "correct_choice_idx": 3, "direct_answers": ["park", "building", "in park", "park", "park", "park", "park", "carton", "park", "man"], "difficult_direct_answer": false, "rationales": ["He is relaxing in the park.", "The man is in a park.", "There is an intentional landscape with public amenities that would be expected within a public park."], "image": "train2014/COCO_train2014_000000153655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81081, "question_id": "c54v5LTKBZhhGAkuVGjTsd", "question": "Red on this bears face comes from it's what?", "choices": ["tail", "prey", "sunburn", "cub"], "correct_choice_idx": 1, "direct_answers": ["dinner", "blood", "injury", "animal's blood", "meal", "animal", "food", "blood", "eating", "prey"], "difficult_direct_answer": true, "rationales": ["The polar bear must have just eaten a meal of another animal which is its prey.", "The red comes from blood.", "Bears eat other animals. animals have blood in them. blood is red."], "image": "val2014/COCO_val2014_000000081081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369221, "question_id": "c5NPfwR62oc8AS9uXUX4zd", "question": "What landscape is this location?", "choices": ["beach", "plain", "sand dune", "desert"], "correct_choice_idx": 0, "direct_answers": ["beach", "beach", "sand", "beach", "sand", "sand", "beach", "beach", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The ground in this natural area is covered in sand. there are people lounging, and two people wearing body suits who have surfboards with them.", "There are boards in the sand.", "The landscape is the beach."], "image": "val2014/COCO_val2014_000000369221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551284, "question_id": "c5WyNQkk6BzAHVGXjGv2XV", "question": "What is this appliance for?", "choices": ["cooking soup", "making coffee", "making juice", "making tea"], "correct_choice_idx": 2, "direct_answers": ["blending", "blending", "making juice", "blending", "blending", "making drinks", "blending", "blending", "blending", "making drinks"], "difficult_direct_answer": false, "rationales": ["The appliance is for juice.", "This device is used to mix fruits and vegetables to make juice.", "The blender is making juice."], "image": "train2014/COCO_train2014_000000551284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414201, "question_id": "c5cJFwGZ6qCtXJvjbw8kht", "question": "What are these two celebrating?", "choices": ["boxing day", "pastry day", "christmas", "anniversary"], "correct_choice_idx": 3, "direct_answers": ["anniversary", "wedding anniversary", "anniversary", "wedding anniversary", "anniversary", "anniversary", "anniversary", "anniversary", "anniversary", "wedding"], "difficult_direct_answer": false, "rationales": ["These two are cutting an anniversary cake.", "The couple is older and they are cutting a cake together while holding hands.", "They're celebrating an anniversary."], "image": "val2014/COCO_val2014_000000414201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397365, "question_id": "c6GsUdkrHciKzQZdtQXbuS", "question": "What is behind the cow?", "choices": ["cars", "people", "trains", "boats"], "correct_choice_idx": 3, "direct_answers": ["boats", "canoes", "boats", "boats", "canoes", "canoes", "canoes", "boats", "boats", "boats"], "difficult_direct_answer": false, "rationales": ["There are boats stacked up behind him.", "The items are vehicles that are designed to be buoyant and waterproof.", "The objects are vessels. they are on their sides and made to float."], "image": "train2014/COCO_train2014_000000397365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206701, "question_id": "c6VzM6Jqz9Fp5gYm4tVvLd", "question": "What is sitting on the dresser?", "choices": ["monitor", "tablet", "phone", "kindle"], "correct_choice_idx": 0, "direct_answers": ["computer", "monitor", "monitor", "imac computer", "monitor", "monitor", "monitor", "computer", "monitor", "television"], "difficult_direct_answer": false, "rationales": ["There is a large screen for a computer.", "The monitor is sitting.", "It is connected to a mouse and computer keyboard."], "image": "val2014/COCO_val2014_000000206701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177150, "question_id": "c6iGcRptR3A7zYY4fxjPpk", "question": "Where is this bathroom located?", "choices": ["hotel", "home", "hospital", "park"], "correct_choice_idx": 0, "direct_answers": ["hotel", "in house", "hotel", "hotel", "inside", "building", "in hotel", "motel", "hotel", "building"], "difficult_direct_answer": false, "rationales": ["From the products, that's where it looks like it's from.", "The bathroom is in a hotel.", "The bathroom has fancy soaps."], "image": "train2014/COCO_train2014_000000177150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221746, "question_id": "c6iH352mRs3Ji9vtryJKuM", "question": "What will happen to the train after people board it?", "choices": ["enter station", "nothing", "departure", "cleaning"], "correct_choice_idx": 2, "direct_answers": ["leave station", "lifting", "take off", "leaving", "depart", "move", "departure", "take off", "it leaves", "leave station"], "difficult_direct_answer": false, "rationales": ["The train will leave.", "The train will take off.", "Based on the general procedures of trains, they would stop to load passengers, who are visible, and then move to the next destination."], "image": "train2014/COCO_train2014_000000221746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507471, "question_id": "c6tQMroUsGAczLK536Jt5o", "question": "What office equipment is on the shelf?", "choices": ["stapler", "fax", "copier", "computer"], "correct_choice_idx": 1, "direct_answers": ["printer", "fax machine", "printer", "printer", "printer", "fax", "fax machine", "printer", "fax machine", "fax machine"], "difficult_direct_answer": false, "rationales": ["The office equipment is an electronic device the size of a printer that has far more buttons than a printer and has an input and output for printer paper.", "A fax machine is on the shelf.", "It's a fax machine."], "image": "train2014/COCO_train2014_000000507471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238498, "question_id": "c6tnZCqD4HRniv7gT82usX", "question": "What is the spiciest item one could place on their hotdog shown here?", "choices": ["jalapenos", "relish", "cheese", "ketchup"], "correct_choice_idx": 0, "direct_answers": ["jalapeno", "jalapenos", "jalapeno peppers", "jalapeno", "jalapenos", "vegetable", "jalapeno", "pepper", "jalapeno peppers", "jalapenos"], "difficult_direct_answer": false, "rationales": ["Of the visible items on the hot dog and on the side there are jalepenos visible which are known to be spicy in flavor and spicier that the other items listed which are not known to have spice.", "A hotdog has toppings including various peppers and relish.", "Jalapenos are very spicy."], "image": "val2014/COCO_val2014_000000238498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466261, "question_id": "c6xmqw5viTyrkRP4phUDQV", "question": "Why do kites have tails?", "choices": ["luck", "preference", "efficiency", "style"], "correct_choice_idx": 2, "direct_answers": ["visibility", "fly", "adds weight", "balance", "help fly", "efficiency", "stabilization", "fly high", "fly better", "wind"], "difficult_direct_answer": true, "rationales": ["Kites have tails to help them fly more evenly and efficiently.", "Like the wings and tail of a plane, the tail helps with the aerodynamic movement of the kite.", "The tail makes it more stable and perform well."], "image": "train2014/COCO_train2014_000000466261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256603, "question_id": "c75LzX9eJPNEw6s92QEKR6", "question": "What is wrapped around his wrist?", "choices": ["yarn", "string", "cord", "ribbon"], "correct_choice_idx": 2, "direct_answers": ["bracelet", "remote", "wrist strap", "cord", "wii cable", "controller", "wii cord", "wrist strap", "cord", "wii controller"], "difficult_direct_answer": false, "rationales": ["The cord is connected to the wii remote, and it is wrapped around his wrist so that the remote does not fall if he lets go of it.", "The man in the photo is carrying wii controllers in his hand. connected to these are cords which wrap around his wrists.", "A cord is on the console."], "image": "train2014/COCO_train2014_000000256603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12891, "question_id": "c78iHQkQZhMJ6oaZKN5C55", "question": "What type of socks is the woman pushing the stroller wearing?", "choices": ["ankle", "knee-high", "white", "crew cut"], "correct_choice_idx": 1, "direct_answers": ["stockings", "red", "stockings", "red", "kneehighs", "knee-high", "knee high", "red", "knee", "thigh highs"], "difficult_direct_answer": false, "rationales": ["The socks are knee high.", "The stroller pushing woman in this scene wears socks that extend past her knees.", "A woman with socks that reach mid-leg height is walking along a path."], "image": "train2014/COCO_train2014_000000012891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459997, "question_id": "c7Psho2xHZXkFUE8nkDG8j", "question": "What kind of vehicle is shown here?", "choices": ["tour bus", "shuttle bus", "double decker", "public transportation"], "correct_choice_idx": 0, "direct_answers": ["bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "tour bus"], "difficult_direct_answer": false, "rationales": ["The bus has tinted windows. the people inside don't want to be seen.", "The vehicle is a tour bus.", "The vehicle shown is a tour bus."], "image": "train2014/COCO_train2014_000000459997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540039, "question_id": "c8Hy9SDs7CAAQ8wNoqQkUL", "question": "Which painter often painted this style of image?", "choices": ["magritte", "mondrian", "picasso", "renoir"], "correct_choice_idx": 0, "direct_answers": ["magritte", "rene magritte", "van gogh", "matisses", "georgia o'keeffe", "magritte", "van gogh", "rembrandt", "magritte", "dali"], "difficult_direct_answer": false, "rationales": ["Magritte was known for this image style.", "The painter is magritte.", "A picture of a woman taking a picture of a man in a suit is depicted and the style is that of magritte."], "image": "train2014/COCO_train2014_000000540039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138501, "question_id": "c8JFoTmuZRky5hPhqESuyD", "question": "What does the woman refer to here?", "choices": ["school notes", "phone", "menu", "book"], "correct_choice_idx": 2, "direct_answers": ["menu", "menu", "menu", "menu", "menu", "menu", "plate", "food", "being alone", "food"], "difficult_direct_answer": false, "rationales": ["The woman has the menu.", "The woman is reading a menu.", "Eating establishments provide these as a way of letting guests know what foods and drinks they serve."], "image": "train2014/COCO_train2014_000000138501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311408, "question_id": "c8MXphqnPWH4zdueufrZg3", "question": "What dressing is the white thing likely to be?", "choices": ["honey mustard", "balsamic vinegar", "thousand island", "ranch"], "correct_choice_idx": 3, "direct_answers": ["blue cheese", "ranch", "ranch", "ranch", "cream sauce", "ranch", "gravy", "ranch", "salsa", "avocado"], "difficult_direct_answer": false, "rationales": ["The white dressing is most likely ranch, which is white.", "Ranch is a common white dressing.", "The dressing is ranch."], "image": "train2014/COCO_train2014_000000311408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29241, "question_id": "c8PhMA8swhvkPSTCvYpo7r", "question": "Which of these colors are the double lines on the street most likely to be?", "choices": ["teal", "yellow", "pink", "blue"], "correct_choice_idx": 1, "direct_answers": ["yellow", "yellow", "black", "yellow", "brick road", "white", "yellow", "white", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["Street lines are boundaries set for drivers and double lines on the road are usually in the color yellow.", "The color is yellow.", "Traffic lines are often yellow to enhance driver visibility."], "image": "train2014/COCO_train2014_000000029241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124408, "question_id": "c8RxD5yjCLnUrMazP5qxQU", "question": "What colors are the largest kite?", "choices": ["hot colors", "white", "cool colors", "black"], "correct_choice_idx": 2, "direct_answers": ["cool colors", "purple", "purple", "blue purple", "blue", "blue purple", "blue", "blue purple", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The colors of the kite are more associated with winter or cold colors.", "The kite is blue, light blue and light purple.", "The kite is blue."], "image": "train2014/COCO_train2014_000000124408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294134, "question_id": "c8TqReM4nww4A7mRfvBJcq", "question": "This fine pale ale malt is made from?", "choices": ["millets", "barley", "dhal", "pearl"], "correct_choice_idx": 3, "direct_answers": ["fawcett", "oat", "barley", "spring barley", "yeast", "oat", "pearl", "wheat", "hops", "grain"], "difficult_direct_answer": true, "rationales": ["Pale ale malt comes from barley.", "It's from pearls.", "The brand of the alcohol is visible on the side of the truck and this brand and type of alcohol uses answer b."], "image": "val2014/COCO_val2014_000000294134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536834, "question_id": "c8WwgpaonMSVrBFJ2Br4HZ", "question": "What is the green substance near the shoreline?", "choices": ["shellfish", "plastic", "moss", "dirt"], "correct_choice_idx": 2, "direct_answers": ["algae", "moss", "seaweed", "algae", "algae", "moss", "seaweed", "lichen", "algae", "moss"], "difficult_direct_answer": false, "rationales": ["This is a moss that grows in wet area because of water.", "This is seaweed near the shore.", "There is moss on the rocks."], "image": "train2014/COCO_train2014_000000536834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532633, "question_id": "c8f8DSCJXEdnA4PuSL3yHA", "question": "What is the person in red trying to accomplish?", "choices": ["goal", "touchdown", "homerun", "basket"], "correct_choice_idx": 2, "direct_answers": ["homerun", "hit baseball", "hit ball", "home run", "hit ball", "hit ball", "hit ball", "home run", "home run", "hit ball"], "difficult_direct_answer": false, "rationales": ["He wants to hit the ball and score", "The person is trying to hit the ball.", "The person wants to hit a home run."], "image": "train2014/COCO_train2014_000000532633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279083, "question_id": "c8xyZnwzmr3aBpNYHSypA3", "question": "What does the pitcher here cause the batter to do?", "choices": ["quit", "go home", "wait", "swing"], "correct_choice_idx": 3, "direct_answers": ["swing bat", "swing", "home run", "swing", "swing", "swing bat", "swing", "swing", "swing", "strike"], "difficult_direct_answer": false, "rationales": ["The pitcher causes the batter to swing, who hopefully doesn't actually hit the ball, thus giving the pitcher a \"strike!\".", "The pitcher is swinging the ball.", "He will throw the ball at the batter to try to get them out of the game."], "image": "val2014/COCO_val2014_000000279083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581332, "question_id": "c9EbTxkchg82pgxJeB93wp", "question": "What type of location would this activity be found at?", "choices": ["gas station", "beer bar", "garage", "winery"], "correct_choice_idx": 3, "direct_answers": ["wine cellar", "wine bar", "restaurant", "winery", "winery", "bar", "bistro", "resturant", "winery", "bistro"], "difficult_direct_answer": false, "rationales": ["The text on the paper indicates that people are tasting alcohol. the alcohol is not beer.", "Glasses of wine are lined up with cards under the glasses. wineries have wine tastings.", "Wine tastings often happen where the wine is made."], "image": "val2014/COCO_val2014_000000581332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107840, "question_id": "c9GefnZt5t5SduHkx5dLDj", "question": "Where is the bus located?", "choices": ["bus lane", "crosswalk", "bike path", "dog path"], "correct_choice_idx": 0, "direct_answers": ["side street", "bus lane", "america", "crosswalk", "by streetlight", "highway", "city", "tracks", "road", "on road"], "difficult_direct_answer": true, "rationales": ["The bus is in a lane.", "The bus is pulled over to the right in the lane that is for busses.", "A bus is on the side of the street on the right. bus lanes are on the right side."], "image": "val2014/COCO_val2014_000000107840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469650, "question_id": "c9avJpibHDfuVGRYhuJ69p", "question": "What is the kite shaped like?", "choices": ["bird", "cow", "mushroom", "dog"], "correct_choice_idx": 1, "direct_answers": ["cow", "diamond", "cow", "cow", "cow", "diamond", "diamond", "diamond", "cow", "cow"], "difficult_direct_answer": false, "rationales": ["The kite is like a cow.", "The kite has udders and white and black patterning that is very common on cows.", "The kite has black and white spots and udders."], "image": "train2014/COCO_train2014_000000469650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63974, "question_id": "c9rUpYNYqzw6aDNY3AyYFh", "question": "What is likely sold here?", "choices": ["video games", "action figures", "newspaper", "model airplanes"], "correct_choice_idx": 2, "direct_answers": ["food", "food", "beer", "drinks", "newspaper", "cafe items", "drinks", "soda", "food", "breakfast"], "difficult_direct_answer": false, "rationales": ["The small shop sells newspapers that are sitting on a rack by the counter.", "A large amount of newspapers can be seen in the holder attached to the counter, and it is nearby to a register, indicating that they are sold here.", "Newspapers are the only item listed that are visible in this image and likely for sale."], "image": "train2014/COCO_train2014_000000063974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244471, "question_id": "cA8JcCmkqXRtU67jHf5iUA", "question": "Where is this giraffe located?", "choices": ["wild", "zoo", "circus", "museum"], "correct_choice_idx": 3, "direct_answers": ["museum", "museum", "museum", "museum", "inside", "museum", "inside", "museum", "museum", "museum"], "difficult_direct_answer": false, "rationales": ["The giraffe is inside and is not alive.", "One can see the halls of the building and the information displays.", "The giraffe is a display."], "image": "train2014/COCO_train2014_000000244471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482940, "question_id": "cADbYZaARRmKHBwk9f5E4B", "question": "For what purpose are all the suitcases organized here?", "choices": ["for sale", "traffic control", "advertising promotion", "giveaway"], "correct_choice_idx": 0, "direct_answers": ["purchases", "to travel", "for sale", "travel", "for sale", "for sale", "selling", "for sale", "sales presentation", "to sell"], "difficult_direct_answer": false, "rationales": ["They are selling them", "There are price tags on the suitcases indication that they are being sold.", "Suitcases are lined up and on display. goods for sale are displayed in order to attract attention."], "image": "train2014/COCO_train2014_000000482940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224112, "question_id": "cAgVSCFcp4qo247K2B2JSu", "question": "The woman here considers the flavor of what?", "choices": ["muffins", "salad", "soup", "water"], "correct_choice_idx": 2, "direct_answers": ["soup", "soup", "gravy", "soup", "soup", "soup", "gravy", "soup", "soup", "soup"], "difficult_direct_answer": false, "rationales": ["The woman is tasting a soup.", "The woman eats soup.", "She is looking at a variety."], "image": "train2014/COCO_train2014_000000224112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216930, "question_id": "cAwcrpRkpUzi99aHSdzriQ", "question": "Why is he in the air above the skateboard?", "choices": ["bouncing", "showing off", "confused", "falling"], "correct_choice_idx": 1, "direct_answers": ["jump", "sliding down", "doing trick", "jumping", "race", "trees", "skateboarding trick", "showing off", "jumping", "trick"], "difficult_direct_answer": true, "rationales": ["The man is showing off tricks.", "The man is in the air showing off before he falls on his skateboard.", "He is doing a trick in front of people who are watching."], "image": "train2014/COCO_train2014_000000216930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507278, "question_id": "cBBkxwsgyd733kSobg9kW5", "question": "From which plants was the food eaten here harvested?", "choices": ["corn", "bamboo", "grass", "soybeans"], "correct_choice_idx": 2, "direct_answers": ["grass", "grass", "hay", "hay", "grass", "hay", "hay", "grass", "wheat", "hay"], "difficult_direct_answer": false, "rationales": ["The animal is a sheep. it is eating hay.", "There are some grasses here in the feeder.", "It is dried out and now hay for them to eat."], "image": "train2014/COCO_train2014_000000507278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268397, "question_id": "cBFoVj8QeAkvuKojweUaro", "question": "What does this person want to do?", "choices": ["pay cashier", "make dinner", "change channel", "take shower"], "correct_choice_idx": 2, "direct_answers": ["turn on", "watch tv", "watch tv", "switch channel", "power tv", "change channel", "watch television", "watch tv", "watch television", "watch tv"], "difficult_direct_answer": false, "rationales": ["A person is pointing a remote at a television. people use remotes to change the television channel.", "The person is holding a remote control. they are pointing it at a television.", "They are holding a remote towards the television."], "image": "train2014/COCO_train2014_000000268397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418226, "question_id": "cBbMk7mLbcQsJCVE6jjzzw", "question": "Who is this room meant for?", "choices": ["animals", "men", "women", "girls"], "correct_choice_idx": 1, "direct_answers": ["men", "men", "men", "men", "men", "men", "men", "men", "men", "men"], "difficult_direct_answer": false, "rationales": ["Because there are urinals you can tell which bathroom this is.", "The room is for men.", "Men use these urinals."], "image": "val2014/COCO_val2014_000000418226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256628, "question_id": "cBnA6b9kQb8SmUWmeFdBFs", "question": "What is the blue bowl behind the dog used for?", "choices": ["cooking", "painting", "garbage", "drinking"], "correct_choice_idx": 3, "direct_answers": ["eating", "water", "water", "water", "water", "drinking", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["That bowl is used to put water in for the dog.", "Dogs drink out of bowls on the ground and this looks like a typical dog water bowl.", "The bowl is for drinking."], "image": "val2014/COCO_val2014_000000256628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536175, "question_id": "cC6JzSQdS5knDXmGZRCpxz", "question": "What animal does the cat see on TV?", "choices": ["dog", "bird", "donkey", "monkey"], "correct_choice_idx": 1, "direct_answers": ["hummingbird", "bird", "hummingbird", "bird", "bird", "bird", "bird", "bird", "bird", "bird"], "difficult_direct_answer": false, "rationales": ["The animal on the screen is visible and identifiable by its wings. only answer a on the list of options has wings.", "The cat sees a bird.", "There is a bird flying."], "image": "train2014/COCO_train2014_000000536175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195163, "question_id": "cCEDUKQZivat5tpkLFehJP", "question": "The boy is most likely doing what?", "choices": ["cooking", "dreaming", "exercising", "running"], "correct_choice_idx": 1, "direct_answers": ["sleeping", "sleeping", "sleeping", "resting", "dreaming", "reading", "sleeping", "sleeping", "sleeping", "watching tv"], "difficult_direct_answer": false, "rationales": ["The boy is in bed and the items associated with the other given options are not visible.", "The boy is most likely dreaming.", "The boy is probably dreaming."], "image": "train2014/COCO_train2014_000000195163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33488, "question_id": "cCbVVaZT3Bmn4Va6UZoFBK", "question": "What can possibly happen next in this scene?", "choices": ["touchdown", "penalty", "goal", "home run"], "correct_choice_idx": 3, "direct_answers": ["homerun", "homerun", "home run", "homerun", "hit", "homerun", "batter runs", "homerun", "home run", "homerun"], "difficult_direct_answer": false, "rationales": ["The batter could hit a home run for baseball.", "The home run can be hit.", "A home run can occur in a baseball game when the ball is hit out of the park."], "image": "train2014/COCO_train2014_000000033488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142815, "question_id": "cCcbenqJdqt2t5uJe6B9vj", "question": "What is the cameraman sitting on?", "choices": ["ladder", "bed", "step", "wall"], "correct_choice_idx": 1, "direct_answers": ["bed", "knee", "bed", "bed", "bed", "bed", "knee", "bed", "bed", "bed"], "difficult_direct_answer": false, "rationales": ["The man is on the mattress.", "It appears to be a mattress with bedding on it, with a headboard at the back, indicating that it is a bed.", "The furniture looks soft and it has a headboard."], "image": "val2014/COCO_val2014_000000142815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128849, "question_id": "cCnpBMkPaEAM3VZevCLFeT", "question": "This animal is featured in what movie?", "choices": ["dumbo", "lassie", "benji", "free willy"], "correct_choice_idx": 0, "direct_answers": ["dumbo", "horton", "dumbo", "dumbo", "horton", "horton", "dumbo", "dumbo", "dumbo", "dumbo"], "difficult_direct_answer": false, "rationales": ["If anyone watched disney you would know the movie with the elephant.", "The animal is dumbo.", "The animal is an elephant which is a character featured in a famous disney movie taking place at a circus. the other movies feature a whale or a dog."], "image": "val2014/COCO_val2014_000000128849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41649, "question_id": "cCs42va9HobJioMCJy9ZBP", "question": "What is on the building?", "choices": ["moss", "water", "leaves", "snow"], "correct_choice_idx": 0, "direct_answers": ["ivy", "moss", "moss", "trees", "moss", "moss", "elephant", "moss", "mold", "moss"], "difficult_direct_answer": false, "rationales": ["The building has moss.", "The green growth on the outside of the building is mossy buildup.", "There is a grass-like substance that makes the building look old, including the white being greyer over the years."], "image": "train2014/COCO_train2014_000000041649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496496, "question_id": "cCzouCKniMGdLepnMo5mzU", "question": "What type of surf is the man carrying?", "choices": ["longboard", "hybrid", "shortboard", "fish"], "correct_choice_idx": 3, "direct_answers": ["fish", "fishboard", "surfboard", "swallow tail", "shortboard", "surfboard", "broad surfing", "board", "surf board", "shortboard"], "difficult_direct_answer": false, "rationales": ["The surf is a fish board.", "The man is clearly visible and is carrying a board, that compared to his body height, is shorter than a traditional board.", "The way the ends of the board are tapered resemble the same shape as a fish."], "image": "train2014/COCO_train2014_000000496496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536791, "question_id": "cD5SQ4uair8Tso37WuuQir", "question": "Why is the skateboarder on the wall as opposed to being on the ground?", "choices": ["levitating", "avoiding danger", "cleaning", "wall riding"], "correct_choice_idx": 3, "direct_answers": ["performing trick", "trick", "performing trick", "stunting", "doing tricks", "wall riding", "no reason", "performing trick", "doing tricks", "performing trick"], "difficult_direct_answer": false, "rationales": ["Wall riding is a move where you ride your skateboard on the wall.", "The skateboarder is on the wall.", "The man is doing a trick."], "image": "val2014/COCO_val2014_000000536791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528870, "question_id": "cDTDraCAraB8SEETR8NsJS", "question": "What type of pants does the boy in the blue shirt have on?", "choices": ["drawstring bottoms", "jeans", "sweatpants", "trousers"], "correct_choice_idx": 1, "direct_answers": ["jeans", "jeans", "jeans", "jeans", "jeans", "jeans", "jeans", "jeans", "jeans", "jeans"], "difficult_direct_answer": false, "rationales": ["The boy is wearing denim pants which are called jeans.", "A guy in a shirt is wearing blue pants that are heavy material.", "The fabric is blue, faded, and textured cotton."], "image": "train2014/COCO_train2014_000000528870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79930, "question_id": "cDTwi7wE5RaqibhAhYE7RS", "question": "Why are three people on the bike?", "choices": ["dare", "bet", "cheap transportation", "in hurry"], "correct_choice_idx": 2, "direct_answers": ["going shopping", "being transported", "poor", "sharing ride", "going somewhere", "family", "going somewhere", "only transportation", "transportation", "cheap transportation"], "difficult_direct_answer": true, "rationales": ["That is how many poorer country's citizens get around.", "The people are sharing a ride to reduce costs.", "That is the cheapest transportation."], "image": "val2014/COCO_val2014_000000079930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52038, "question_id": "cDUcpe8PsQ5JJsSYVz2ym7", "question": "Why would someone sit at this table?", "choices": ["to eat", "to craft", "to work", "to sew"], "correct_choice_idx": 0, "direct_answers": ["to eat", "to eat", "eat", "to eat", "dining", "to eat", "eat", "eat dinner", "eat", "to eat"], "difficult_direct_answer": false, "rationales": ["The person would want to eat the sushi.", "Someone would eat.", "They would eat dinner at the table with their family or friends."], "image": "train2014/COCO_train2014_000000052038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373395, "question_id": "cDZfuUKyG264VGUpRiNKdW", "question": "Where do the train tracks that the train here sits on lead to?", "choices": ["new york", "nowhere", "sacramento", "reno"], "correct_choice_idx": 1, "direct_answers": ["nowhere", "private land", "nowhere", "dead end", "trainstop", "to right", "train station", "nowhere", "nowhere", "nowhere"], "difficult_direct_answer": false, "rationales": ["They come to a dead end just in front of the train.", "The tracks go nowhere.", "The tracks end because the train is only for display."], "image": "train2014/COCO_train2014_000000373395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477679, "question_id": "cDbnsHmFf8toZ7wPBJmwhX", "question": "What kind of an area is this?", "choices": ["metropolitan", "coastal", "desert", "jungle"], "correct_choice_idx": 1, "direct_answers": ["countryside", "coastal", "ocean front", "coastal", "oceanfront", "water", "plains", "seaside", "lowlands", "beach"], "difficult_direct_answer": true, "rationales": ["The area is coastal.", "The area has water and land.", "It is located along the water."], "image": "train2014/COCO_train2014_000000477679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42008, "question_id": "cDgkupjAjyvHTDhH9eoT23", "question": "In what environment are the Jeep and bus travelling?", "choices": ["savannah", "forest", "desert", "tundra"], "correct_choice_idx": 0, "direct_answers": ["savannah", "off road", "desert", "savannah", "plains", "desert", "desert", "desert", "speed", "african savanna"], "difficult_direct_answer": false, "rationales": ["The area is an open savannah.", "A flat expanse of land with few trees can be seen behind a jeep and another vehicle. jeeps are used in the savannah.", "The environment is a savannah."], "image": "val2014/COCO_val2014_000000042008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41671, "question_id": "cDnVeqhX4ZjZ4semTGFPym", "question": "What is surrounding the cake?", "choices": ["ice cream", "gummy bears", "cookies", "strawberries"], "correct_choice_idx": 3, "direct_answers": ["strawberries", "sauce", "strawberry", "strawberries", "strawberries", "syrup", "strawberries", "strawberries", "strawberries", "strawberries"], "difficult_direct_answer": false, "rationales": ["The slices of the fruit can be seen on the plate.", "Red slices of fruit are on a plate with cake.", "You can tell by the color and shape as to what fruit is being shown here."], "image": "val2014/COCO_val2014_000000041671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61899, "question_id": "cEAoPkvUWgMwkXjVd2umCg", "question": "What number president is the white building dedicated to?", "choices": ["one", "16", "33", "45"], "correct_choice_idx": 1, "direct_answers": ["forty six", "16", "first", "sixteenth", "first", "don't know", "16th", "sixteenth", "sixteenth", "forty five"], "difficult_direct_answer": false, "rationales": ["It is the lincoln memorial.", "That is the number on the building.", "The building is the lincoln memorial. lincoln served after james buchanan, who was the 15th president of the united states."], "image": "val2014/COCO_val2014_000000061899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544198, "question_id": "cEEJdAyBbstGmV8ioWHKoi", "question": "Which oven counting from the top is best for baking a raw pizza?", "choices": ["first", "none", "fourth", "second"], "correct_choice_idx": 3, "direct_answers": ["middle", "second", "conventional oven", "middle bottom", "second", "second", "second", "two", "baking rack", "baking oven"], "difficult_direct_answer": false, "rationales": ["The second oven can get the hottest.", "The second one is counting.", "The one above is a microwave for heating up foods quickly. the one under it is one that takes plenty of time to cook."], "image": "val2014/COCO_val2014_000000544198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36482, "question_id": "cEGAzYFbjmqp9Eass2ejn9", "question": "What countries flag can be seen as a patch on the man's uniform?", "choices": ["russia", "italy", "france", "united states"], "correct_choice_idx": 3, "direct_answers": ["usa", "america", "america", "usa", "usa", "usa", "usa", "united states", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["The us flag is shown.", "The man has a patch with red and white stripes and a small blue patch.", "The flag on his arm is from the usa."], "image": "train2014/COCO_train2014_000000036482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190722, "question_id": "cER6sooZbiGxK47UrMLeQG", "question": "What are the two standing rectangular devices?", "choices": ["microphone", "speakers", "power supply", "phone"], "correct_choice_idx": 1, "direct_answers": ["system laptop", "speakers", "speakers", "computers", "speakers", "keyboard", "monitors", "speakers", "speakers", "monitor laptop"], "difficult_direct_answer": false, "rationales": ["There are speakers by the laptop.", "The speakers are hooked up to the computer.", "The devices are called speakers."], "image": "val2014/COCO_val2014_000000190722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257187, "question_id": "cEhusCCTuPq4iiQvBbBfQe", "question": "What emotion are the woman exhibiting?", "choices": ["joyful", "surprised", "scared", "fearful"], "correct_choice_idx": 0, "direct_answers": ["happiness", "happiness", "excitement", "happiness", "happiness", "joyful", "happiness", "happiness", "joy", "happy"], "difficult_direct_answer": false, "rationales": ["They are happy and having a fun time.", "The women are both smiling and look like they are celebrating.", "The women are smiling."], "image": "val2014/COCO_val2014_000000257187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136458, "question_id": "cEr2AEg6dV32eeD2mAuQLX", "question": "How many laptops do you see?", "choices": ["none", "one", "three", "two"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "two", "one", "one", "one", "one", "one", "two", "one"], "difficult_direct_answer": false, "rationales": ["There is one.", "Though there are several electronic devices present in this image, there is only a single full fledged portable computer with keyboard and monitor attached.", "There is one laptop computer on the ground."], "image": "val2014/COCO_val2014_000000136458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492132, "question_id": "cErbjwVbnUUNXhQrsRwtNr", "question": "What does this building house?", "choices": ["candy shop", "airport", "museum", "train depot"], "correct_choice_idx": 2, "direct_answers": ["transportation items", "equipment", "transportation artifacts", "planes", "museum", "museum", "artifacts", "war relics", "planes", "planes"], "difficult_direct_answer": false, "rationales": ["This is a museum. you can see the plane is hung up and on display for people to look at.", "This building houses an aircraft museum.", "You can tell by the displays as to what type of place it is."], "image": "val2014/COCO_val2014_000000492132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487521, "question_id": "cFHQ7xgWqVAJZZz5wWYEV2", "question": "If the skateboard kept this orientation how would his landing be?", "choices": ["easy", "normal", "dangerous", "soft"], "correct_choice_idx": 2, "direct_answers": ["like it", "crash", "fall off", "down earth", "dangerous", "fall", "on wheels", "hard", "bad", "sitting walk"], "difficult_direct_answer": true, "rationales": ["The wheels need to point down.", "The board would land upside down and the individual would fall and injure themselves. the orientation is dangerous.", "It's dangerous to land on the stairs."], "image": "train2014/COCO_train2014_000000487521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176938, "question_id": "cFMr2rXkGRdjfamUULRfME", "question": "What is he looking at?", "choices": ["ball", "racquet", "his shoes", "ground"], "correct_choice_idx": 0, "direct_answers": ["ball", "ball", "tennis ball", "ball", "ball", "tennis ball", "tennis ball", "ball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["In order to be successful, he must look at the item that he is trying to hit with his racket.", "The tennis a is coming toward him.", "The person is playing tennis based on the equipment and uniform and his eye line is visible. based on this sport, his body positioning and eye line, he would be looking at answer a."], "image": "train2014/COCO_train2014_000000176938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478182, "question_id": "cG6MfW9fPwWLSgAjy9GeyA", "question": "What service is available when riding this bus?", "choices": ["free lunch", "wifi", "hand towels", "heated seats"], "correct_choice_idx": 1, "direct_answers": ["wifi", "wifi", "wifi", "food", "wifi", "bus ride", "wifi", "wifi", "meal", "wifi"], "difficult_direct_answer": false, "rationales": ["Wifi is available as indicated by the sign on the front.", "The bus has internet.", "They have wifi available."], "image": "train2014/COCO_train2014_000000478182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114868, "question_id": "cGCLtbMRvjYXQ49gpAp8on", "question": "How many players are on this tennis court?", "choices": ["four", "three", "none", "two"], "correct_choice_idx": 3, "direct_answers": ["one", "two", "one", "one", "one", "two", "two", "one", "two", "two"], "difficult_direct_answer": false, "rationales": ["There needs to be another person out of frame, for this girl to be volleying back and forth with.", "Two players are on the court.", "The girl has to be playing with someone else."], "image": "val2014/COCO_val2014_000000114868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406303, "question_id": "cGGYK8hgEvXjyENG8bvqn4", "question": "What is this game likely to involve?", "choices": ["platform hopping", "shooting", "conductor", "sword fighting"], "correct_choice_idx": 2, "direct_answers": ["music", "music", "music", "conductor", "orchestrating", "conducting", "music", "music", "skill", "music orchestra"], "difficult_direct_answer": false, "rationales": ["There is an orchestra on screen.", "The game features an orchestra.", "The game shows an orchestra."], "image": "train2014/COCO_train2014_000000406303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424533, "question_id": "cGS3tWq3m2xJzi86cs5c3S", "question": "What position is played by the person with the ball?", "choices": ["catcher", "short stop", "batter", "pitcher"], "correct_choice_idx": 0, "direct_answers": ["catcher", "umpire", "catcher", "catcher", "catcher", "catcher", "catcher", "umpire", "catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["He is wearing a mask and pads and is standing near home plate.", "The position is the catcher.", "He has a catching mitt and is wearing a face mask and padded chest to keep the ball from hurting him when he receives it."], "image": "train2014/COCO_train2014_000000424533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264676, "question_id": "cGbt9dUYCZt4FpnGrvwkH5", "question": "How is the ramp able to go up and down?", "choices": ["hand pumping", "pulleys", "leverage", "hydraulics"], "correct_choice_idx": 3, "direct_answers": ["hydraulics", "hydraulics", "hydraulics", "hydraulics", "hydraulics", "hydraulics", "hydraulic lift", "lever", "truck", "truck lift"], "difficult_direct_answer": false, "rationales": ["Hydraulics allow the ramp to move freely.", "There is a hydraulics system to raise and lower the ramp.", "It uses hydraulics."], "image": "train2014/COCO_train2014_000000264676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36498, "question_id": "cGd4F2zcWUXRUswiyczWNK", "question": "What surface is this woman playing on?", "choices": ["asphalt", "clay", "grass", "rubber"], "correct_choice_idx": 0, "direct_answers": ["tennis court", "asphalt", "tennis court", "tennis court", "clay", "concrete court", "asphalt", "court", "tennis court", "clay"], "difficult_direct_answer": false, "rationales": ["The dark color of the surface indicates that it is asphalt; it is not the color of clay, grass or rubber.", "The surface is asphalt.", "The woman is on a solid court."], "image": "train2014/COCO_train2014_000000036498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107970, "question_id": "cGxEqm9UcYwRTJVt3DMmvn", "question": "Where do palm trees come from?", "choices": ["maine", "artic", "tropical/subtropical regions", "antarctica"], "correct_choice_idx": 2, "direct_answers": ["seeds", "tropics", "tropical/subtropical regions", "south america", "tropical climates", "coconuts", "south", "india", "ground", "seeds"], "difficult_direct_answer": true, "rationales": ["The trees are tropical.", "Palm trees are all around. palm trees grow in tropical areas.", "Palm trees grow in warm weather."], "image": "train2014/COCO_train2014_000000107970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578131, "question_id": "cH2Z3Ds2fNeFvmJZGgwahs", "question": "What activity is the horse shown here taking part in?", "choices": ["steeple chase", "racing", "barrel racing", "roping"], "correct_choice_idx": 0, "direct_answers": ["jumping", "jumping", "jumping", "jumping", "jumping", "jumping", "jumping", "jumping", "steeple chase", "jumping"], "difficult_direct_answer": false, "rationales": ["The activity is steeplechase.", "The horses are in a steeplechase.", "The horse is jumping over objects not unlike human runners in the specified chase."], "image": "val2014/COCO_val2014_000000578131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44017, "question_id": "cHDiZE6RayHmAa9JvB5UTA", "question": "What type of area is this?", "choices": ["desert", "country", "beach", "city"], "correct_choice_idx": 3, "direct_answers": ["big city", "town", "urban", "city", "city", "downtown", "city street", "city", "city", "market"], "difficult_direct_answer": false, "rationales": ["The traffic and the buildings would indicate the urban area of the place.", "The lights are bright and there are a lot of buildings.", "The area is a city."], "image": "train2014/COCO_train2014_000000044017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1261, "question_id": "cHG4Cqe5myPnoykabFYd2m", "question": "The item on the left is most likely sold in what?", "choices": ["hand", "soup", "loaf", "pod"], "correct_choice_idx": 2, "direct_answers": ["bakery", "loaf", "bakery", "grocery store", "bakery", "loaves", "bakery", "loaf", "loaf", "loafs"], "difficult_direct_answer": false, "rationales": ["Bread, pasta, and fruit are on plates on a table.", "There is bread on the plate.", "Bread is baked in a loaf form and then cut into slices to be easily consumed."], "image": "train2014/COCO_train2014_000000001261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271051, "question_id": "cHaa272wQVKCq3U3Z55Azh", "question": "How many people can this room accommodate?", "choices": ["one", "two", "three", "six"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "two", "two", "three", "three", "three", "three", "two"], "difficult_direct_answer": false, "rationales": ["By the number of beds, it can tip you off as to the occupancy limit.", "You can count the number of beds.", "Typically the number of people a bedroom can accommodate matches the number of beds, if the beds are single. the beds by their dimensions appear to be singles and there are three of them visible."], "image": "train2014/COCO_train2014_000000271051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432317, "question_id": "cHbAU6Fz5smF9eVS5A5kfg", "question": "What event might be about to occur here?", "choices": ["bank robbery", "yard sale", "horse show", "protest"], "correct_choice_idx": 3, "direct_answers": ["parade", "protest", "riot", "protest", "queen", "parade", "protest", "protest", "protests", "parade"], "difficult_direct_answer": false, "rationales": ["Since the police are lined up and blocking the road, they are anticipating turning back a good-sized group of protestors from coming any further. if they were there providing guidance to townspeople, they wouldn't be lined up in this formation.", "Police are lined up in a town, some on horseback. police use horses and large numbers when protests are occurring.", "Given the police presence with helmets, this is the most likely answer."], "image": "val2014/COCO_val2014_000000432317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507908, "question_id": "cHwGseWAapnmmbrb5D6bwf", "question": "What animals are near the fence?", "choices": ["zebras", "giraffe", "tigers", "gorillas"], "correct_choice_idx": 1, "direct_answers": ["giraffe", "giraffes", "giraffe", "giraffes", "giraffe", "giraffes", "giraffes", "giraffes", "giraffes", "giraffe"], "difficult_direct_answer": false, "rationales": ["It's a giraffe.", "They have long necks and giraffe faces.", "Giraffes have long necks."], "image": "train2014/COCO_train2014_000000507908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54335, "question_id": "cJraTASyuRgsauRaFDeXJ5", "question": "What can be seen by looking through the clock?", "choices": ["boats", "field", "city", "ocean"], "correct_choice_idx": 2, "direct_answers": ["city", "city", "buildings", "skyline", "city", "city", "cityscape", "time", "skyline", "buildings"], "difficult_direct_answer": false, "rationales": ["The city can be seen.", "You can see all the buildings in the background", "A cityscape can be seen through the interior of the clock tower."], "image": "val2014/COCO_val2014_000000054335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409856, "question_id": "cJwKcQXXtg5MLZtYe5N8x9", "question": "If the camera man moved a little to the right what would most readily run into them?", "choices": ["train", "bicycle", "car", "motorcycle"], "correct_choice_idx": 0, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["To the right are tracks and an engine can be seen through the tunnel nearing the camera man's location.", "He would possibly fall off onto the tracks and the train would hit him", "There is a locomotive coming down the tracks."], "image": "val2014/COCO_val2014_000000409856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557105, "question_id": "cJxtM62retxjeNiZfLwrFd", "question": "Beta carotenoid is the reason for what quality of broccoli flower?", "choices": ["mass", "shape", "stem", "color"], "correct_choice_idx": 3, "direct_answers": ["color", "vitamin", "green color", "high nutrients", "green", "vitamin", "yellow", "color", "flowering", "yellow"], "difficult_direct_answer": false, "rationales": ["The color helps.", "The broccoli is green because of beta carotenoid.", "The broccoli has a bright green color."], "image": "train2014/COCO_train2014_000000557105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385492, "question_id": "cK9bzvNHkocabwPnmzc7vb", "question": "Where is this desk setup?", "choices": ["in hallway", "on train", "at work", "in library"], "correct_choice_idx": 2, "direct_answers": ["office cubicle", "office", "work cubicle", "office", "office", "office", "office", "in office", "office cubicle", "at work"], "difficult_direct_answer": false, "rationales": ["This desk is set up inside a cubicle. there are personal belongings, so this building is not a library.", "The desk is located in an working office.", "The desk setup is in the office."], "image": "train2014/COCO_train2014_000000385492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402041, "question_id": "cK9mU8z7ZufcyKZQASCFZo", "question": "Who is the trainer?", "choices": ["first woman", "man", "second woman", "third woman"], "correct_choice_idx": 1, "direct_answers": ["man", "man", "man", "man", "man", "circus master", "man behind", "ring leader", "center circle", "man"], "difficult_direct_answer": false, "rationales": ["The person the ground directing the elephant show is usually the one who has trained the elephants and they will listen to them.", "The trainer is the man on the ground with the petticoat.", "You can tell by what the man is wearing and his position with the elephants as to who is the trainer."], "image": "train2014/COCO_train2014_000000402041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186699, "question_id": "cKMk2AV2qKgTP2aGUAm6fT", "question": "Where is the rice planted?", "choices": ["desert", "water", "land", "sacks"], "correct_choice_idx": 1, "direct_answers": ["plate", "plate", "rice field", "plate", "by broccoli", "water", "house", "clay soil", "under broccoli", "farm"], "difficult_direct_answer": false, "rationales": ["An internet search on rice cultivation and where it is planted shows rice planted in water.", "It is a plant that needs a lot of fluid", "Rice is planted by the water."], "image": "train2014/COCO_train2014_000000186699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151287, "question_id": "cKMkYgbKkE3BJ49qbCvxuF", "question": "Why is the man wearing a shirt with patches on it?", "choices": ["for amusement", "for protection", "for style", "he's sponsored"], "correct_choice_idx": 3, "direct_answers": ["tennis shirt", "advertising", "sponsorship", "he's sponsored", "sponsors", "sponsors", "sponsors", "advertising", "uniform", "advertisement"], "difficult_direct_answer": false, "rationales": ["He is a professional athlete who gets paid to play by companies, so he must advertise for them on his uniform.", "A man is playing tennis on a court with an audience.", "The patches are logos for brands."], "image": "train2014/COCO_train2014_000000151287.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550477, "question_id": "cKSWdU3N3vQmxD8pTqnq7b", "question": "Why drag the yellow object around?", "choices": ["detect metal", "gym exercise", "solve puzzle", "move belongings"], "correct_choice_idx": 3, "direct_answers": ["travel", "traveling", "his stuff", "carry", "travel", "has wheels", "move belongings", "suitcase", "wheels", "suitcase"], "difficult_direct_answer": false, "rationales": ["Suitcases are used to store clothing and other possessions and transport them.", "It is a luggage carrier.", "The person has a suitcase."], "image": "val2014/COCO_val2014_000000550477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389021, "question_id": "cKTpqNh6VSMiEHsPNB72ZC", "question": "What is the purpose of the large sign?", "choices": ["advertisement", "warning", "identification", "direction"], "correct_choice_idx": 0, "direct_answers": ["advertising", "advertisement", "advertisement", "advertise", "advertisement", "advertisement", "advertising", "advertising", "advertisement", "advertisement"], "difficult_direct_answer": false, "rationales": ["It's an ad for ipod.", "It is a billboard showing an item available for purchase.", "There is an person using an ipod on the poster."], "image": "train2014/COCO_train2014_000000389021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458463, "question_id": "cKVvihfKrsBs9fkQRk8ZXS", "question": "Who is famous for using the vehicle in the black and white photo?", "choices": ["orville wright", "gregor mendel", "tank abbott", "john cena"], "correct_choice_idx": 0, "direct_answers": ["pilots", "man", "wrights brothers", "amelia earheart", "aviator", "pilots", "orville wright", "wright brothers", "bush", "pilot"], "difficult_direct_answer": true, "rationales": ["He was part of the first flight", "The photo on the wall shows a plane model designed by the wright brothers.", "He is one of the brothers who was first in flight"], "image": "train2014/COCO_train2014_000000458463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311448, "question_id": "cKZ3sNds8MHCL5J9stvbdE", "question": "What kind of lettuce is used in this dish?", "choices": ["iceberg", "red", "green leaf", "romaine"], "correct_choice_idx": 0, "direct_answers": ["romaine", "iceberg", "iceberg", "romaine", "iceberg", "iceberg", "iceberg", "iceberg", "iceberg lettuce", "iceberg"], "difficult_direct_answer": false, "rationales": ["That is the type of greenery on the salad.", "There are some green and white pieces of iceberg lettuce.", "This type of lettuce is iceberg."], "image": "train2014/COCO_train2014_000000311448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95018, "question_id": "cKeggAwzfGzRcW72GUsPvy", "question": "What is the man in the yellow shirt standing in the door of?", "choices": ["dugout", "bleachers", "ref box", "batting cage"], "correct_choice_idx": 0, "direct_answers": ["exit", "dugout", "watching", "dugout", "dugout", "dugout", "dugout", "dugout", "dugout", "entrance"], "difficult_direct_answer": false, "rationales": ["That is what the man is standing on.", "He is standing and watching. he is in the doorway of where players stay while they wait for their turn in the game.", "This is wear the players sit."], "image": "train2014/COCO_train2014_000000095018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99342, "question_id": "cKp6SJkP3Fq7gnoGsMqC5p", "question": "Who most likely sleeps here?", "choices": ["prisoner", "grandmother", "baby", "landscaper"], "correct_choice_idx": 0, "direct_answers": ["prisoner", "prisoner", "prisoner", "prisoner", "prisoner", "boys", "prisoner", "prisoner", "inmate", "prisoner"], "difficult_direct_answer": false, "rationales": ["An inmate stays in this cell.", "The bed looks uncomfortable.", "This is a cell usually occupied by a prisoner."], "image": "val2014/COCO_val2014_000000099342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439248, "question_id": "cL8rsVTsWLWHsQR2qRvg7n", "question": "Why do they have their phones out?", "choices": ["selling them", "talking together", "bored", "taking photos"], "correct_choice_idx": 2, "direct_answers": ["checking emails", "bored", "using them", "checking phones", "bored", "showing photos", "take pictures", "checking them", "texting", "festival"], "difficult_direct_answer": true, "rationales": ["People will surf through the internet when there is nothing else to do.", "The people have their phones on out of boredom.", "They are laughing and looking at them"], "image": "train2014/COCO_train2014_000000439248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226588, "question_id": "cLBMjxuSnXnnA4G53tFspo", "question": "What is the woman holding?", "choices": ["eggs", "skis", "shovel", "bunnies"], "correct_choice_idx": 1, "direct_answers": ["stick", "ski pole", "skis", "ski poles", "ski poles", "poles", "ski poles", "ski poles", "ski poles", "ski poles"], "difficult_direct_answer": false, "rationales": ["The woman has skis.", "The woman is holding onto a pair of skis.", "The woman is holding ski poles."], "image": "val2014/COCO_val2014_000000226588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447302, "question_id": "cLkjJhju34ZhqNqEwMvcMn", "question": "What type of display technology does the television on top of the entertainment center utilize?", "choices": ["crt", "lcd", "oled", "plasma"], "correct_choice_idx": 0, "direct_answers": ["dvd player", "crt", "mountain", "decoder", "analog", "crt", "crt", "cable", "lcd", "crt"], "difficult_direct_answer": false, "rationales": ["The crt is the tv.", "That type of tv uses crt technology.", "Crt technology is utilized."], "image": "train2014/COCO_train2014_000000447302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208623, "question_id": "cLmTpSXjxBKX89bXUxjU9K", "question": "What are the people assembled around?", "choices": ["laptop", "dinner table", "barbecue grill", "movie screen"], "correct_choice_idx": 0, "direct_answers": ["computer", "laptop", "laptop", "computer", "laptop", "computer", "laptop", "laptop", "laptop", "laptop computer"], "difficult_direct_answer": false, "rationales": ["The computer can be seen on the desk.", "The people are near a laptop.", "The people are by a laptop."], "image": "val2014/COCO_val2014_000000208623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116431, "question_id": "cLwSvBhsVBn8b44dGEKmXZ", "question": "Where is the man playing?", "choices": ["sand", "track", "field", "court"], "correct_choice_idx": 3, "direct_answers": ["court", "tennis court", "tennis court", "tennis court", "tennis court", "tennis court", "tennis court", "tennis court", "tennis court", "tennis court"], "difficult_direct_answer": false, "rationales": ["The man is on a court.", "He is playing tennis on a clay-color flat, hard surface with a tennis net on the same surface and bleachers behind him.", "The man is on the court."], "image": "train2014/COCO_train2014_000000116431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529139, "question_id": "cLzLFcZ2soPjPx845VNeWP", "question": "What item in the room would glow in the dark?", "choices": ["t-shirt", "keyboard", "necklace", "mouse"], "correct_choice_idx": 2, "direct_answers": ["necklace", "necklace", "blue lights", "blue lights", "wii controller", "necklace", "necklace", "necklace", "necklace", "necklace"], "difficult_direct_answer": false, "rationales": ["The item is a necklace.", "It has that neon-green look and is one of those toy plastic things that would light up if the lights were out.", "The man has a glow in the dark neck ring around his neck."], "image": "val2014/COCO_val2014_000000529139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418034, "question_id": "cLzNpjKVETxMuJka6RKKgU", "question": "Where else would his hand held tool be somewhat suitable?", "choices": ["dining room", "kitchen", "math class", "boat"], "correct_choice_idx": 3, "direct_answers": ["home", "canoe", "creek", "rowing", "river", "camping", "move", "mountains", "boat", "pond"], "difficult_direct_answer": true, "rationales": ["It is an oar. oars help propel things forward in the water.", "Traditionally oars are better suited to rowing boats.", "A boat is usually propelled by a paddle."], "image": "train2014/COCO_train2014_000000418034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20182, "question_id": "cM5CkbPN4DK2pQ8yMs2g75", "question": "What does the man in blue chain?", "choices": ["backhoe", "car", "driver", "children"], "correct_choice_idx": 0, "direct_answers": ["fixing", "tractor", "cargo", "backhoe", "worker", "equipment", "tow", "driver", "truck driver", "construction equipment"], "difficult_direct_answer": true, "rationales": ["The man has a backhoe.", "The man is grabbing onto large machinery.", "The man is chaining up a machine that has large wheels like a tank."], "image": "train2014/COCO_train2014_000000020182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391538, "question_id": "cMEBZUFXLN6XvKns98gKxn", "question": "What is the bus parked on?", "choices": ["sand", "dirt", "grass", "asphalt"], "correct_choice_idx": 0, "direct_answers": ["sand", "sand", "park", "sand", "sand", "sand", "sand", "dirt", "concrete", "beach"], "difficult_direct_answer": false, "rationales": ["The bus is parked in the sand.", "The bus is on sand.", "Sand is seen in the foreground of the image and right up to where the bus is located."], "image": "train2014/COCO_train2014_000000391538.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120827, "question_id": "cMELSvusbZd28EyNCSBdLM", "question": "What do the items in the center appear to be made of?", "choices": ["mud", "steel", "cotton", "brick"], "correct_choice_idx": 1, "direct_answers": ["steel", "steel", "steel", "steel", "metal", "steel", "metal", "iron", "metal", "steel"], "difficult_direct_answer": false, "rationales": ["The items are made of hard metal.", "They are made of metal.", "The parts are sturdy looking and are shiny and greyish in color."], "image": "train2014/COCO_train2014_000000120827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105358, "question_id": "cMGRYLMgEmAnV2jJj5Yupd", "question": "What is the man holding?", "choices": ["remote", "frisbee", "cup", "bowl"], "correct_choice_idx": 0, "direct_answers": ["controller", "controller", "game controller", "remote", "wii remote", "game controller", "wii remote", "controller", "remote", "wii controller"], "difficult_direct_answer": false, "rationales": ["You can tell by the design that he is holding a gaming remote.", "Based on the size, shape, color and design, answer a is accurate and consistent with how he is standing and using the object.", "It has a wrist string and looks like a wii remote. people are watching what he is doing with it."], "image": "train2014/COCO_train2014_000000105358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307442, "question_id": "cML6uuEGB4gk9QXAUVBQEH", "question": "Where are these people probably waiting to catch a bus?", "choices": ["airport", "street", "terminal", "underground"], "correct_choice_idx": 2, "direct_answers": ["bus stop", "bus stop", "bus stop", "terminal", "bus stop", "bus station", "bus station", "bus station", "bus stop", "bus stand"], "difficult_direct_answer": false, "rationales": ["The people are at a terminal.", "It is a shuttle.", "Bus depots are known as terminals."], "image": "val2014/COCO_val2014_000000307442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319747, "question_id": "cMjmWQj49f6X85QWWVws2Q", "question": "What is in the street?", "choices": ["bus", "bulls", "traffic cone", "police car"], "correct_choice_idx": 0, "direct_answers": ["stores", "vans", "vans", "motor vehicles", "buses", "vans", "vans", "bus", "buses", "vans"], "difficult_direct_answer": false, "rationales": ["A bus is driving in the road.", "There are several vehicles for transporting multiple people.", "The street has a bus."], "image": "train2014/COCO_train2014_000000319747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538809, "question_id": "cMrwngTiWCPj9RB7pwXDF2", "question": "What does this store sell?", "choices": ["running clothes", "bikes", "doughnuts", "running shoes"], "correct_choice_idx": 3, "direct_answers": ["running shoes", "sporting goods", "running necessities", "running gear", "running gear", "running equipment", "running shoes", "running shoes", "exercise stuff", "clothings"], "difficult_direct_answer": false, "rationales": ["The store mentions running.", "The store has shoes.", "The store is called running room."], "image": "train2014/COCO_train2014_000000538809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257729, "question_id": "cNC82v8ZEh4BnQSKDLP7Dk", "question": "What type of oil is shown?", "choices": ["grapeseed", "canola", "olive", "vegetable"], "correct_choice_idx": 0, "direct_answers": ["grapeseed", "grapeseed", "grapeseed", "grapeseed", "grapeseed", "grapeseed", "grapeseed", "grapeseed", "grapeseed", "grapeseed"], "difficult_direct_answer": false, "rationales": ["The label of the oil shows grapeseed oil. the bottle is clearly labeled.", "This plastic container contains the phrase '100% grapeseed oil' so we can assume that this is what it contains.", "That is what the label says."], "image": "train2014/COCO_train2014_000000257729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74241, "question_id": "cNLyVt8RhWx5AYTzXHUod9", "question": "Why are the men reaching forward while on skis?", "choices": ["to swim", "to wave", "waterskiing", "to dance"], "correct_choice_idx": 2, "direct_answers": ["waterskiing", "being propelled", "holding rope", "hold on", "balance", "waterskiing", "holding lines", "hold rope", "increased speed", "holding stick"], "difficult_direct_answer": true, "rationales": ["The men are waterskiing.", "The men are reaching forward to hold on their waterskiing lines.", "The men are waterskiing."], "image": "val2014/COCO_val2014_000000074241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543869, "question_id": "cNWgy3Qoo3dDBnvnvRZYaa", "question": "What type of light do electronic screens emit?", "choices": ["white", "vu", "bvu", "uv"], "correct_choice_idx": 3, "direct_answers": ["led", "blue", "blue", "blue", "soft backlight", "uv", "blue", "blue", "bright", "led"], "difficult_direct_answer": false, "rationales": ["It's also referred to as blue light.", "The electronic screens give out uv lights", "The light from the computer will emit uv light."], "image": "train2014/COCO_train2014_000000543869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71301, "question_id": "cNXSiMmYMVctbQgPnWsYdj", "question": "Which skier is at most risk of getting hit against the blue wall?", "choices": ["middle skier", "right skier", "left skier", "upcoming skier"], "correct_choice_idx": 2, "direct_answers": ["left skier", "players", "left", "female skier", "left one", "right side", "left skier", "left skier", "man", "black shirt"], "difficult_direct_answer": false, "rationales": ["[", "The left skier is most at risk of hitting the blue wall to the left.", "The person on the left is closest to the wall."], "image": "val2014/COCO_val2014_000000071301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188148, "question_id": "cNdbKFa8Asd9etw7Qzoh5b", "question": "What font is used in Apple logo?", "choices": ["slab serif", "sans", "helvetica", "serif"], "correct_choice_idx": 2, "direct_answers": ["helvetica", "bitten apple", "apple", "apple", "san francisco", "san francisco", "san francisco", "medium", "custom font", "san francisco"], "difficult_direct_answer": false, "rationales": ["This helvectica is used as the font for apple.", "The font is helvetica.", "The older apple logo font is helevtica."], "image": "train2014/COCO_train2014_000000188148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156939, "question_id": "cNfJQ8F5RoFhbg4ELJNXVr", "question": "What material are the brown boots made of?", "choices": ["nylon", "pic", "cotton", "leather"], "correct_choice_idx": 3, "direct_answers": ["leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather"], "difficult_direct_answer": false, "rationales": ["The material is leather.", "The boots are shiny and they're cowboy boots which indicates they're leather ones.", "The boots are made out of a shiny brown material. leather is a common material for boots to be made out of."], "image": "train2014/COCO_train2014_000000156939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450263, "question_id": "cNjvynfrt39u8FQAywRLWq", "question": "At what kind of landmark are these people at?", "choices": ["amusement park", "wharf", "beach", "city park"], "correct_choice_idx": 1, "direct_answers": ["stern wheeler", "road", "bridge", "river", "bridge", "bridge", "ferry", "wharf", "riverboat", "war"], "difficult_direct_answer": false, "rationales": ["The people are near water given the boat.", "The people are on land, but there is a waterway that is running past.", "The water and boats being docked there indicating it is a harbour or wharf where boats come and go from the city."], "image": "val2014/COCO_val2014_000000450263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561037, "question_id": "cNni3ZtH4HMLEpSrjdyYxK", "question": "What kind of organization is the white square sticker featuring in the motorcycle?", "choices": ["motors club", "hospital", "insurance", "bank"], "correct_choice_idx": 2, "direct_answers": ["car insurance", "triple", "insurance", "roadside service", "travel club", "motor club", "insurance", "travel", "aka", "insurance"], "difficult_direct_answer": false, "rationales": ["Aka is for insurance.", "Aaa is an accident insurance company.", "It's a common a carrier. it's also considered a travel club."], "image": "val2014/COCO_val2014_000000561037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513282, "question_id": "cNw9jwodT4TyfgDASCepxk", "question": "In what country is this river in?", "choices": ["italy", "britain", "france", "spain"], "correct_choice_idx": 1, "direct_answers": ["england", "uk", "columbia", "columbia", "england", "britain", "usa", "columbia", "england", "england"], "difficult_direct_answer": false, "rationales": ["The sign says columbia wharf.", "The flag of the uk is in the river.", "There are english words on the buildings"], "image": "train2014/COCO_train2014_000000513282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435029, "question_id": "cP4YY6JBgzzQ7ecm4FCqSP", "question": "The man in blue wants to do what to the frisbee holder?", "choices": ["assist her", "block her", "nothing", "embarrass her"], "correct_choice_idx": 1, "direct_answers": ["take frisbee", "toss it", "block", "block her", "block", "block", "give orders", "grab", "block", "hold"], "difficult_direct_answer": false, "rationales": ["His body language with his arms out indicate that he is trying to stop her from moving forward.", "The man wants to block.", "The man wants to keep the woman from throwing the frisbee."], "image": "train2014/COCO_train2014_000000435029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163775, "question_id": "cP9aZAsxgB49gQFCLRrvv2", "question": "What grain is featured in the bread shown here?", "choices": ["beans", "barley", "semolina", "corn"], "correct_choice_idx": 3, "direct_answers": ["corn", "corn", "beans", "corn", "corn", "corn", "corn bread", "sauce", "rice", "sause"], "difficult_direct_answer": false, "rationales": ["Cornbread goes with chili.", "The color is a huge indicator. it's also a common bread eaten with chili, which is what appears to be in the bowl.", "The grain is corn."], "image": "val2014/COCO_val2014_000000163775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404145, "question_id": "cPDv7SAgsdwMYnLsDYR6w2", "question": "Why is his mouth wide open?", "choices": ["is happy", "is angry", "is excited", "iseating"], "correct_choice_idx": 2, "direct_answers": ["proud", "yelling", "yelling", "yelling", "yelling", "yelling", "excitement", "is excited", "yelling", "excited"], "difficult_direct_answer": false, "rationales": ["The man looks shocked.", "The mouth is excited.", "The man is on a tennis court with his mouth open and is holding his hands out on both sides. his fists are clenched."], "image": "val2014/COCO_val2014_000000404145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7873, "question_id": "cPUGBeXCVhqMET8QVvyZJs", "question": "What is the name of this woman's jewelry?", "choices": ["nose stud", "button", "nose ring", "pin"], "correct_choice_idx": 0, "direct_answers": ["bracelet", "bracelet", "nose stud", "bracelet", "bracelet", "bracelets", "bracelet", "nose stud", "nose stud", "nose stud"], "difficult_direct_answer": false, "rationales": ["She has a piece of jewelry on the side of her nostril.", "The woman is wearing a nose stud.", "The name is a nose stud."], "image": "val2014/COCO_val2014_000000007873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79356, "question_id": "cPfbmBMdALZyCmWnvEkmh7", "question": "Which person controls the elephant?", "choices": ["remote holder", "rear", "distant woman", "front most"], "correct_choice_idx": 3, "direct_answers": ["front man", "front", "front most", "driver", "front", "boy", "front", "front one", "in front", "boy"], "difficult_direct_answer": false, "rationales": ["He signals the animal where to turn and walk", "The front person has the most control.", "The person in the front in a saddle is usually controlling the reigns of an animal. the people behind the front most person look to be just riding."], "image": "val2014/COCO_val2014_000000079356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391046, "question_id": "cPmZ5ozxAiYHEecaxGNMx2", "question": "What will this person do here overnight?", "choices": ["play frisbee", "wander around", "catch fireflies", "camp"], "correct_choice_idx": 3, "direct_answers": ["camp", "sleep", "campout", "camp", "camp", "camp", "camp", "camp", "playing", "camp"], "difficult_direct_answer": false, "rationales": ["There are tents set up so it makes sense that they would be sleeping in them.", "The person is camping.", "Tents are behind this person."], "image": "train2014/COCO_train2014_000000391046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484386, "question_id": "cPreSW34bNQntVmvtfEWjo", "question": "For what purpose are tires on the side of the boat?", "choices": ["helping float", "docking against", "flat repair", "good luck"], "correct_choice_idx": 1, "direct_answers": ["bumpers", "safety", "docking against", "protects cord", "bumpers", "for docking", "port", "cost-effective shock-absorbers", "bumpers", "mooring"], "difficult_direct_answer": false, "rationales": ["The tires create space between the boat and the dock, preventing the side of the boat from banging into the dock and being damaged.", "Boats will sometimes have tires on the sides because it provides some cushion so the boat doesn't get damaged from rubbing against another object.", "The answer is internet searchable. based on the positioning the tires would be used to cushion any impact between boat and dock."], "image": "train2014/COCO_train2014_000000484386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40989, "question_id": "cQ3rHdAYV2qh2QB2idpU7Q", "question": "Why is the man under the yellow and purple umbrella laying down?", "choices": ["to sleep", "to eat", "to tan", "to exercise"], "correct_choice_idx": 2, "direct_answers": ["sunbathing", "tanning", "sunbathing", "to relax", "sunbathing", "relaxing", "sunbathing", "to tan", "relaxing", "tired"], "difficult_direct_answer": false, "rationales": ["He is relaxing in the sand.", "The man under the yellow and purple investment agreements could be barry and could want some time to unwind.", "He is not laying under the protection of the umbrellas or wearing clothing, so he is purposely wanting the sun to touch his skin."], "image": "val2014/COCO_val2014_000000040989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249309, "question_id": "cQ4kPzDrHnJwUCDhaEynVf", "question": "The traffic light in this intersection is operating during which season?", "choices": ["spring", "fall", "summer", "winter"], "correct_choice_idx": 1, "direct_answers": ["fall", "fall", "fall", "fall", "spring", "fall", "fall", "fall", "spring", "fall"], "difficult_direct_answer": false, "rationales": ["The trees are yellow and orange.", "The leaves on the trees are orange like in fall.", "The leaves on the trees in the background appear to have turned yellow, which happens when they are about to fall in the fall."], "image": "train2014/COCO_train2014_000000249309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157350, "question_id": "cQHUgKBauZspVSTvnTEvwC", "question": "How was the drawing on his shoulder made?", "choices": ["marker", "laser", "paint", "tattoo"], "correct_choice_idx": 3, "direct_answers": ["tattoo gun", "tattoo", "tattoo artist", "tattoo gun", "tattoo", "tattoo", "tattoo", "tattoo", "tattoo", "needle"], "difficult_direct_answer": false, "rationales": ["It is a permanent mark made with ink and a needle.", "The drawing is from a tattoo.", "A person is laying shirtless on a bed and a moon and stars can be seen on the shoulder."], "image": "train2014/COCO_train2014_000000157350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327813, "question_id": "cQJb8BxvWotu4teBLqGohV", "question": "What is in the center?", "choices": ["couch", "dog", "baby", "pogo stick"], "correct_choice_idx": 0, "direct_answers": ["furniture", "couch", "couch", "couch", "couch", "couch", "sofa", "couch", "sofa", "couch"], "difficult_direct_answer": false, "rationales": ["A couch is in the middle of the room.", "It is furniture that at least 3 people can sit on", "There is a large piece of furniture in the middle of a room."], "image": "train2014/COCO_train2014_000000327813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568459, "question_id": "cQMzjj3mXGDLYwUBnXSvT5", "question": "What is the bus doing near the sidewalk?", "choices": ["stopping", "backing up", "racing", "accelerating"], "correct_choice_idx": 0, "direct_answers": ["discharging passengers", "standing", "boarding", "parking", "stopping", "stopping", "dropping off", "stop", "parked", "picking up"], "difficult_direct_answer": true, "rationales": ["The bus has stopped near the sidewalk.", "It is stopping to pick up.", "The bus is stopping on the street."], "image": "train2014/COCO_train2014_000000568459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384822, "question_id": "cQaGPx4AMVWkVmT7y84S8h", "question": "What ingredients are on the pizza?", "choices": ["spinach", "pepperoni", "pineapple", "bacon"], "correct_choice_idx": 0, "direct_answers": ["veggies", "spinach cheese", "spinach", "vegetarian", "spinach cheese", "cheese spinach", "cheese", "spinach", "cheese spinach", "cheese veggies"], "difficult_direct_answer": false, "rationales": ["There green veggie on the pizza is spinach because it withered away after cooking.", "There is some spinach on top of the pizza.", "The ingredients are spinach."], "image": "val2014/COCO_val2014_000000384822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290192, "question_id": "cQbc6ScrXEDoA9CbkwE9KC", "question": "What is the white trail behind the plane called?", "choices": ["banner", "net", "cloud", "contrail"], "correct_choice_idx": 3, "direct_answers": ["contrails", "contrail", "contrail", "smoke", "smoke", "contrails", "function", "exhaust", "contrails", "contrail"], "difficult_direct_answer": false, "rationales": ["The lines are in the sky", "Airplanes leave these during an air show.", "It is the vapor trail created by the plane exhaust."], "image": "train2014/COCO_train2014_000000290192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490992, "question_id": "cQkpWAKqef8VJEdiRM2f9R", "question": "Why is there a rope around this statue?", "choices": ["prevent damage", "law", "style", "religious reasons"], "correct_choice_idx": 0, "direct_answers": ["for cover", "off limits", "rope", "protection", "prevent touching", "protection", "prevent damage", "object protection", "don't touch", "protection"], "difficult_direct_answer": false, "rationales": ["The rope is used to keep people at a safe distance from the statue so they don't accidentally damage it.", "This keeps people from touching it", "It is a display."], "image": "train2014/COCO_train2014_000000490992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17364, "question_id": "cR5fCyxY3MGKq3o6rNRwMH", "question": "What could be found beneath the grates in the street here?", "choices": ["mole people", "sewer", "oz", "second city"], "correct_choice_idx": 1, "direct_answers": ["pipes", "sewer", "sewer", "sewer", "machinery", "subway", "sewers", "subway", "air vents", "sewer"], "difficult_direct_answer": false, "rationales": ["The sewer can be found.", "There aren't any ditches in big cities like this.", "The grates cover sewers."], "image": "val2014/COCO_val2014_000000017364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332038, "question_id": "cRPpnGbUVrxTSNb79nguVT", "question": "At which location does the child hold the tree?", "choices": ["car wash", "dairy barn", "playground", "mall"], "correct_choice_idx": 2, "direct_answers": ["playground", "playground", "play ground", "park", "trunk", "park", "playground", "in park", "trunk", "park"], "difficult_direct_answer": false, "rationales": ["The location is the playground.", "There is a grassy area and there are toys in the background.", "There is a lot of grass. you can see a trampoline in the background."], "image": "train2014/COCO_train2014_000000332038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267022, "question_id": "cRyhiu4H46LSPPnDf45mz2", "question": "The sporting event taking place on the grounds is most likely which one?", "choices": ["golf", "swimming", "tennis", "cycling"], "correct_choice_idx": 0, "direct_answers": ["bike racing", "football", "baseball", "golf", "polo", "soccer", "polo", "polo", "soccer", "polo"], "difficult_direct_answer": false, "rationales": ["There are clubs on the grass.", "A golf club is seen on the bottom.", "The chairs are on a golf course."], "image": "train2014/COCO_train2014_000000267022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193936, "question_id": "cSBcrNidLfCmrGh3sFnT6V", "question": "What type of group does this seem to be?", "choices": ["criminals", "wizards", "athletes", "classmates"], "correct_choice_idx": 3, "direct_answers": ["class", "schoolchildren", "children", "classmates", "school", "classroom", "class", "students", "class", "kindergarten class"], "difficult_direct_answer": false, "rationales": ["The children are posed in uniforms that look like school uniforms with two older people in professional academic attire. they are positioned as a group of children this age would be if they were classmates and taking a school photo.", "With the uniforms worn and the ages you can safely tell that this is a picture for people in the same class.", "There are several kids with two grown ups which are teachers."], "image": "val2014/COCO_val2014_000000193936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197492, "question_id": "cSYpELTBTnfpkdwwy7ggas", "question": "What kind of throw is that called?", "choices": ["pitch", "underhand", "hail mary", "hurl"], "correct_choice_idx": 0, "direct_answers": ["fast ball", "pitch", "overhand", "overhand", "pitch", "pitch", "pitch", "overhand pitch", "fast throw", "overhand"], "difficult_direct_answer": false, "rationales": ["The person is throwing the ball from the mound in the middle of a baseball diamond.", "The man is pitching the ball.", "The baseball player is on the defensive team. he is throwing the ball from the mound to the catcher at home plate."], "image": "val2014/COCO_val2014_000000197492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399835, "question_id": "cSts8uVU8zTLjiFS2rqmky", "question": "What animals are present?", "choices": ["zebra", "giraffe", "ostrich", "deer"], "correct_choice_idx": 0, "direct_answers": ["zebras", "zebras", "zebras", "zebra", "zebra", "zebra", "zebras", "zebra", "zebras", "zebras"], "difficult_direct_answer": false, "rationales": ["They are black and white striped", "The animals look like horses and have black and white stripes.", "You can tell by the stripes and the colors as to what these animals are."], "image": "train2014/COCO_train2014_000000399835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329127, "question_id": "cSvrJKzWtMptV8iEueMoUp", "question": "What is being done behind the glass showcase?", "choices": ["repairing", "baking", "construction", "painting"], "correct_choice_idx": 1, "direct_answers": ["donuts", "baking", "selling doughnuts", "donuts", "baking", "baking", "baking", "baking", "baking", "doughnut manufacturing"], "difficult_direct_answer": false, "rationales": ["The items are baked.", "There are rows of donuts in a display case. this is a bakery.", "There are lots of doughnuts in the foreground and the kitchen used to make them is behind them."], "image": "train2014/COCO_train2014_000000329127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22854, "question_id": "cTdqoFg6bd4WhXi4sdLUjn", "question": "What is the model of phone?", "choices": ["cherry", "razr", "googler", "blackberry"], "correct_choice_idx": 1, "direct_answers": ["razr", "motorola", "motorola", "motorolla flip", "motorola razr", "razor", "flip", "motorola", "motorola", "2005"], "difficult_direct_answer": false, "rationales": ["That's what the phone is called.", "The brand that made this flipphone which is shown in the top of the item was known to have that flat and wide design, like a sharp razor.", "It is a popular brand of motorola form when cell phones first became mainstream."], "image": "train2014/COCO_train2014_000000022854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293605, "question_id": "cThLsHvkZ7asPUk34VXXFK", "question": "How is the black bicycle able to stand on its own?", "choices": ["kickstand", "rack", "leaning", "mount"], "correct_choice_idx": 0, "direct_answers": ["kick stand", "kickstand", "kickstand", "kickstand", "stand", "kickstand", "kickstand", "kickstand", "kickstand", "stand"], "difficult_direct_answer": false, "rationales": ["A kickstand keeps the bike up on it's own.", "The black bike is using a kickstand to keep it upright.", "There is a small piece of metal on the bottom of the bike that is engaged against the sidewalk to balance the bicycle."], "image": "train2014/COCO_train2014_000000293605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475370, "question_id": "cTsdXriRJYLyefsxFZJz2m", "question": "What type of area is nearby?", "choices": ["rural", "country", "urban", "tropical"], "correct_choice_idx": 2, "direct_answers": ["residential", "city", "city", "residential", "city", "city", "city", "urban", "city", "city"], "difficult_direct_answer": false, "rationales": ["There are buildings and trains nearby. the trees are not palm trees.", "There is a city in the background.", "The area has a lot of tall buildings like in a city, which is urban."], "image": "train2014/COCO_train2014_000000475370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92336, "question_id": "cTyWxfD8FdXJ4s3dWtS7cM", "question": "Which rider is in the best position to win?", "choices": ["dark blue", "yellow", "normal blue", "light blue"], "correct_choice_idx": 3, "direct_answers": ["came", "three", "first blue", "1st", "first", "first", "light blue", "third", "front", "light blue"], "difficult_direct_answer": false, "rationales": ["The person in light blue is closest to the edge.", "The rider is in blue.", "A line of motorcycle riders are riding on a racetrack and the first rider in the line is wearing a light blue uniform."], "image": "train2014/COCO_train2014_000000092336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522163, "question_id": "cU3Mw3yUvavDnJ5Qgn2cja", "question": "What is the tallest item here?", "choices": ["tree", "leopard", "wooden board", "man"], "correct_choice_idx": 2, "direct_answers": ["surfboard", "surfboard", "surfboard", "wooden board", "surfboard", "surfboard", "surfboard", "surfboard", "surfboard", "surfboard"], "difficult_direct_answer": false, "rationales": ["There is a large wooden board standing next to the man.", "The item is the wooden board.", "The guy is shorter than the surfboard."], "image": "val2014/COCO_val2014_000000522163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114541, "question_id": "cU3bLCcqTd8vUaR7gT2jgG", "question": "What is in the cup with liquid?", "choices": ["matzo ball", "mozzarella cheese", "dumpling", "bun"], "correct_choice_idx": 1, "direct_answers": ["mozzarella cheese", "sour cream", "ice cream", "milk", "egg", "mozzarella cheese", "ice", "juice", "cheese", "milk"], "difficult_direct_answer": false, "rationales": ["An italian sandwich and italians style sides are being served with a white substance in a cup with the rest of the food.", "There is a thick white substance in the cup.", "It is a white ball in some liquid. there is bread, prosciutto, and tomato on the table. this kind of cheese goes extremely well with the other ingredients."], "image": "train2014/COCO_train2014_000000114541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364722, "question_id": "cUHoLSHfiuM6ZzNY7GBUM4", "question": "What is in the green container?", "choices": ["juice", "vinegar", "wine", "butter"], "correct_choice_idx": 0, "direct_answers": ["apple juice", "apple juice", "juice", "apple juice", "apple juice", "glass", "apple juice", "apple juice", "apple juice", "sparkling water"], "difficult_direct_answer": false, "rationales": ["The bottle says it contains 100% apple juice.", "The contents of the sparkling apple beverage are listed on the green container.", "The text on the side of the green container indicates what is inside."], "image": "train2014/COCO_train2014_000000364722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440284, "question_id": "cULnCVhFMz38TzqULXVvXd", "question": "Where is the person who is going to be riding the 4th bike right now?", "choices": ["store", "bathroom", "taking photo", "eating"], "correct_choice_idx": 2, "direct_answers": ["restroom", "taking photo", "taking picture", "at race", "behind camera", "inside", "taking photo", "taking photo", "unknown", "taking photo"], "difficult_direct_answer": false, "rationales": ["This person is taking the picture.", "His friends probably asked him to snap a picture of them, and you need someone to take a picture.", "This person isn't in the photo since he's taking it."], "image": "val2014/COCO_val2014_000000440284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34500, "question_id": "cUNBhkpkTSvmGx8MvUMEjy", "question": "How many 'points' are required to win a round in this sport?", "choices": ["four", "five", "ten", "two"], "correct_choice_idx": 0, "direct_answers": ["15", "4 points", "60", "four", "four", "50", "40", "four", "four", "forty"], "difficult_direct_answer": false, "rationales": ["This sport is tennis. a person who wins a round starts at love and then increases their score to 15, 30, 40, and then game.", "Four points are needed in tennis to win.", "In order to win a tennis round, a player needs 4 points"], "image": "val2014/COCO_val2014_000000034500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450, "question_id": "cUfBLrVK8ovBNCWSh8qFXz", "question": "Which one of these cheeses is rarely seen on this dish?", "choices": ["mozzarella", "american", "parmesan", "provolone"], "correct_choice_idx": 1, "direct_answers": ["cheddar", "swiss", "swiss cheese", "cheddar", "cottage", "goat", "tasty", "american", "gorgonzola", "cheddar"], "difficult_direct_answer": false, "rationales": ["American is usually not see as a cheese on italian pizza.", "They wouldn't use american.", "American cheese doesn't go on pizza."], "image": "train2014/COCO_train2014_000000000450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64240, "question_id": "cUhkzkbkazjzV7aoJDMbhm", "question": "What are the motorcycles riding on?", "choices": ["asphalt", "concrete", "water", "dirt"], "correct_choice_idx": 0, "direct_answers": ["track", "motorcycle", "track", "dirt", "pavement", "gravels", "dirt", "gravel", "gravel", "asphalt"], "difficult_direct_answer": false, "rationales": ["There is multiple of the same person on a motorcycle. it is a dark surface on a track.", "They're on the street.", "A motorcycle needs a smooth surface to ride upon."], "image": "val2014/COCO_val2014_000000064240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80720, "question_id": "cUs4xsKncZMSP4uz7URjpi", "question": "What appliance can be found beneath the Microwave?", "choices": ["bottle opener", "wine freezer", "stove", "toaster"], "correct_choice_idx": 2, "direct_answers": ["stove", "stove", "stove", "stove", "sink", "stove", "stovetop", "stove", "counter", "sink"], "difficult_direct_answer": false, "rationales": ["There is a stove beneath the microwave in the picture.", "The oven is visible.", "The appliance is the stove."], "image": "train2014/COCO_train2014_000000080720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146150, "question_id": "cUsuHdqNwbxnT6Nx4596oe", "question": "What month is it here?", "choices": ["august", "june", "september", "december"], "correct_choice_idx": 3, "direct_answers": ["december", "december", "december", "december", "december", "december", "december", "december", "december", "december"], "difficult_direct_answer": false, "rationales": ["The month is december.", "There is a visible christmas tree in the corner white snowflake ornaments. christmas occurs in december.", "There is a christmas tree displayed in the photo, and the holiday is in the month of december."], "image": "val2014/COCO_val2014_000000146150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140209, "question_id": "cVEu8EhpcAzHrJF2yAqgKs", "question": "Which country headquarters the brand company of this man's shoes?", "choices": ["china", "india", "united states", "italy"], "correct_choice_idx": 2, "direct_answers": ["usa", "no clue", "usa", "usa", "no clue", "usa", "united states", "usa", "usa", "no clue"], "difficult_direct_answer": false, "rationales": ["The united states made the shoes since the man is playing baseball and that is america's sport.", "They are from the united states.", "They are in the u.s."], "image": "train2014/COCO_train2014_000000140209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185358, "question_id": "cVGWfcSyJBcV9A3X8ti75x", "question": "If you lost your cell phone where could you make a call anyway?", "choices": ["drug zone", "phone stand", "atm", "street corner"], "correct_choice_idx": 1, "direct_answers": ["payphone", "phone stand", "phone booth", "phone booth", "phone booth", "phone booth", "payphone", "phone booth", "phone booth", "phone booth"], "difficult_direct_answer": false, "rationales": ["There is a sign on the side of the road with an image of a phone on it.", "The phone stand can help.", "Any phone booth has a telephone available."], "image": "train2014/COCO_train2014_000000185358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578396, "question_id": "cVMcZfcHS2g39b9UKnC6ZB", "question": "The human-shaped decorations are made of what material?", "choices": ["metal", "wood", "cement", "plastic"], "correct_choice_idx": 0, "direct_answers": ["metal", "metal", "metal", "metal", "metal", "metal", "metal", "metal", "metal", "metal"], "difficult_direct_answer": false, "rationales": ["The decorations are metal.", "These art sculptures are made of a steel material based on their texture and color.", "The shape is metal."], "image": "train2014/COCO_train2014_000000578396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539355, "question_id": "cVQQ8nXkX4T6YgfbDi37jx", "question": "What famous museum is near this?", "choices": ["louvre", "british museum", "smithsonian", "guggenheim"], "correct_choice_idx": 1, "direct_answers": ["british museum", "british", "unknown", "british museum", "no idea", "freemasonry", "tintin", "pied bull", "british museum", "louvre"], "difficult_direct_answer": false, "rationales": ["This is in brittain near there.", "The british museum is near this clock.", "The museum is british."], "image": "val2014/COCO_val2014_000000539355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300705, "question_id": "cVUP9Q42Qw3eb6nBDeNdQW", "question": "If tiny pieces were found in the glass one would assume they are what?", "choices": ["bugs", "marbles", "beans", "pulp"], "correct_choice_idx": 3, "direct_answers": ["pulp", "pulp", "pulp", "pulp", "pulp", "dipping", "pulp", "bread", "bread", "work"], "difficult_direct_answer": false, "rationales": ["The pieces are pulp.", "The glass contains orange juice. the flesh that contains the juice before the orange has been squeezed often remains in the finished juice product.", "There is a glass of orange juice. orange juice sometimes has pulp."], "image": "val2014/COCO_val2014_000000300705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477682, "question_id": "cVdpyYzcCNGcBLEpEeSiQE", "question": "What is the rabbit doing on the plate?", "choices": ["mating", "playing", "sleeping", "eating carrot"], "correct_choice_idx": 3, "direct_answers": ["eating", "eating", "eating carrot", "sitting", "eating", "looking", "sitting", "sitting", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["He is getting ready to eat a carrot.", "The rabbit has an orange vegetable.", "The rabbit is placed next to a carrot and is an animal well known for eating carrots."], "image": "train2014/COCO_train2014_000000477682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313437, "question_id": "cW6PGidsm3MntRiCYABvJb", "question": "What animal is the man in the hat carrying?", "choices": ["cat", "rabbit", "lamb", "puppy"], "correct_choice_idx": 2, "direct_answers": ["lamb", "lamb", "lamb", "sheep", "lamb", "sheep", "lambs", "lamb", "sheep", "goat"], "difficult_direct_answer": false, "rationales": ["The animals are furry, dry, four legged, floppy-eared, curly-haired, woolen animals that are in a pasture. as babies, the man can hold and carry one of them in each arm.", "The animal is a lamb.", "There are animals under his arms. the animals are young and have a lot of fur on them."], "image": "train2014/COCO_train2014_000000313437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130984, "question_id": "cWBgWjVbzXG9xb2A7JGLWA", "question": "What kind of service is this?", "choices": ["rail", "internet", "cable", "baseball"], "correct_choice_idx": 0, "direct_answers": ["train", "rail services", "cargo transport", "train", "train", "train service", "rail", "train", "rail", "direct rail"], "difficult_direct_answer": false, "rationales": ["The trains go on a set of rails.", "It's a train service.", "There are multiple trains all sitting on their tracks. also, the type of service is displayed on the side of two of the trains."], "image": "val2014/COCO_val2014_000000130984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486606, "question_id": "cWMd2dgqpMjybpcpMVfA3G", "question": "Where does tennis come from?", "choices": ["england", "france", "belgium", "russia"], "correct_choice_idx": 1, "direct_answers": ["france", "france", "usa", "france", "united kingdom", "france", "france", "france", "france", "england"], "difficult_direct_answer": false, "rationales": ["It is believed a form of tennis was first played in france, where the ball was struck with the palm of the hand.", "This questions was searched on the internet and the answer found was answer a.", "Tennis began in france."], "image": "train2014/COCO_train2014_000000486606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254379, "question_id": "cWbWi58eHN4SmJeQq8PqxX", "question": "How many players can play?", "choices": ["three", "one", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["two players", "two", "two 2", "one", "one", "one", "two", "one", "one", "two"], "difficult_direct_answer": false, "rationales": ["There are two players.", "Four players could play with these devices.", "Two players can play the wii."], "image": "val2014/COCO_val2014_000000254379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514245, "question_id": "cWkQyERaTFEm3CBVL9NoXK", "question": "Where are these people most likely traveling?", "choices": ["australia", "france", "new zealand", "united states"], "correct_choice_idx": 0, "direct_answers": ["australia", "australia", "australia", "austruala", "australia", "country", "australia", "australia", "australia", "australia"], "difficult_direct_answer": false, "rationales": ["An airplane has an australian logo on it.", "This is the airline for that country", "The airplane says \"australia\" on it."], "image": "train2014/COCO_train2014_000000514245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381556, "question_id": "cWqT3HRRB6w6w9itJWLDWG", "question": "Where is this bike displayed?", "choices": ["garage", "used lot", "showroom", "home"], "correct_choice_idx": 2, "direct_answers": ["motorcycle", "exhibition", "showroom", "store", "sales room", "bike show", "showroom", "showroom", "show", "for sale"], "difficult_direct_answer": false, "rationales": ["There is carpet and flooring as well as several other bikes", "In the background there appears to be many other fancy looking bikes with people walking around looking at them. they appear to be on display with tags potentially with more information or price details.", "A bike is displayed with others behind it and people gathered all around."], "image": "train2014/COCO_train2014_000000381556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270661, "question_id": "cWxiRzKso3NqQY3rR3ZtQ8", "question": "What material is the suitcase made of?", "choices": ["plastic", "denim", "nylon", "leather"], "correct_choice_idx": 2, "direct_answers": ["canvas", "nylon", "cloth", "nylon", "cloth", "canvas", "polyester", "canvas", "canvas", "polyester"], "difficult_direct_answer": false, "rationales": ["The suitcase has a flat color.", "The material is nylon.", "The suitcase is constructed of nylon."], "image": "train2014/COCO_train2014_000000270661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554607, "question_id": "cXBtM8TuT7jdD44vHY7qi6", "question": "What would the food in the containers be used with the sausage to make?", "choices": ["bread", "hamburgers", "french fries", "hotdogs"], "correct_choice_idx": 3, "direct_answers": ["hot dog", "hotdog", "loaded hotdog", "hotdogs", "hotdogs", "hot dogs", "hotdogs", "sandwich", "hotdogs", "tasty food"], "difficult_direct_answer": false, "rationales": ["They have the same ingredients.", "Sausages are often used in hot dogs.", "These are toppings for the meat"], "image": "val2014/COCO_val2014_000000554607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301602, "question_id": "cXCdFaYsxDKWYSNrwSW6bx", "question": "Why are the bike riders wearing yellow?", "choices": ["as prank", "style", "visibility", "camouflage"], "correct_choice_idx": 2, "direct_answers": ["visibility", "safety", "safety", "visibility", "for safety", "safety", "safety vests", "safety", "increased visibility", "visibility"], "difficult_direct_answer": false, "rationales": ["They want visibility.", "The color is easy to see at night.", "They want to be seen to avoid accidents."], "image": "train2014/COCO_train2014_000000301602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564153, "question_id": "cXebETBK9MNVYC78rqaM7q", "question": "This play is most likely what?", "choices": ["home run", "foul ball", "double", "walk"], "correct_choice_idx": 1, "direct_answers": ["home run", "foul ball", "home run", "foul ball", "strikeout", "fly ball", "home run", "home run", "foul ball", "strike"], "difficult_direct_answer": false, "rationales": ["Foul ball, as the ball is moving far to the right and the batter has swung.", "The guy did not hit the ball in bounds so it will be an out.", "The play is a foul ball."], "image": "train2014/COCO_train2014_000000564153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46251, "question_id": "cXnH2LgVL4CPKDcxjDbf9b", "question": "What is located near this area?", "choices": ["sheep farm", "airport", "jail", "high rise"], "correct_choice_idx": 1, "direct_answers": ["airport", "airport", "airport", "airport", "airport", "trees", "airport", "airport", "airstrip", "airport"], "difficult_direct_answer": false, "rationales": ["That plane is flying rather low indicating that an airport is nearby.", "The airplane is low to the ground. it is probably landing.", "There is a plane going for a landing."], "image": "train2014/COCO_train2014_000000046251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253962, "question_id": "cYAZrtS4bcYdRdVD5iiUdT", "question": "Which food is rich in vitamin A?", "choices": ["cilantro", "melon", "carrot", "tomato"], "correct_choice_idx": 2, "direct_answers": ["carrot", "carrot", "carrots", "carrot", "carrot", "tomatoes", "carrot", "carrots", "carrot", "carrots"], "difficult_direct_answer": false, "rationales": ["Carrots are rich in vitamin a", "Carrots are known for being rich in vitamin a.", "Carrots are known for vitamin a."], "image": "train2014/COCO_train2014_000000253962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263834, "question_id": "cYLsoHvpRSj5WCJtNzg4JP", "question": "How many people ride this one motorcycle?", "choices": ["one", "three", "none", "two"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "two", "three", "two", "two", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is a man, a woman and a baby.", "Three people are sharing the motorcycle.", "The person driving, the child on the back can be seen. the child on the back also appears to be holding a baby, whose legs can be seen on the right side."], "image": "val2014/COCO_val2014_000000263834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204423, "question_id": "cYMVouxSLcJBKUZvd5vuSN", "question": "What are the crowds at the banks along the water observing?", "choices": ["swimming competition", "foliage", "fishing event", "rowing competition"], "correct_choice_idx": 3, "direct_answers": ["boat racing", "race", "rowing race", "rowing", "race", "rowing competition", "race", "rowing", "boat race", "rowers"], "difficult_direct_answer": false, "rationales": ["There are boats in the water.", "The crowds are rowing.", "Teams are rowing their boats in a competition. they are watching and cheering on the teams."], "image": "train2014/COCO_train2014_000000204423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50667, "question_id": "cYUhWFswdTQzHyfB59D3VP", "question": "What company is famous for making that style lamp?", "choices": ["tiffany", "ikea", "osram", "ashley"], "correct_choice_idx": 0, "direct_answers": ["tiffany", "amazon", "tiffany", "unknown", "ge lighting", "chicago lamps", "tiffany", "tiffany", "tiffany", "tiffany"], "difficult_direct_answer": false, "rationales": ["The company is tiffany.", "Tiffany makes fancy lamps.", "Because the lamp is emitting unique light as a result of the fabric used."], "image": "train2014/COCO_train2014_000000050667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34761, "question_id": "cYdtPP9KQPaNAQK6erawgF", "question": "What kind of terrain is this?", "choices": ["plain", "beach", "desert", "savanna"], "correct_choice_idx": 3, "direct_answers": ["savannah", "flat", "serengeti", "grassy", "savanna", "drought stricken", "savanna", "barren", "savannah", "plains"], "difficult_direct_answer": false, "rationales": ["It is a large grassy flat area.", "The dry grasslands and sparse trees is indicative of the savannah.", "This is a savannah plain with elephants and trees."], "image": "train2014/COCO_train2014_000000034761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490620, "question_id": "cYrRq5eLFbJ2eu93WujftE", "question": "What type of pizza is this?", "choices": ["mushroom", "sausage", "pepperoni", "vegetarian"], "correct_choice_idx": 3, "direct_answers": ["spinach", "spinach", "spinach", "spinach", "spinach", "spinach", "spinach", "vegetarian", "spinach", "spinach"], "difficult_direct_answer": false, "rationales": ["It contains spinach as a topping.", "The discernible toppings on this pizza appear to be greens and vegetables. if a pizza is topped only with vegetables, it is commonly known as a vegetarian pizza.", "The is no meat visible on the pizza."], "image": "val2014/COCO_val2014_000000490620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436130, "question_id": "cZ3rPX6S6Z68g3ZSdvXqxR", "question": "What energy source can replace electric appliances?", "choices": ["water", "nuclear", "steam", "propane"], "correct_choice_idx": 3, "direct_answers": ["gas", "gas", "solar", "gas", "gas", "solar", "solar", "gas", "propane", "power"], "difficult_direct_answer": false, "rationales": ["The source is propane.", "Modern appliances commonly come in gas or electric options. if not using electric, one could use gas instead.", "Propane could power the gas oven."], "image": "val2014/COCO_val2014_000000436130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82551, "question_id": "cZR67WXAHWfuTagmsu3pEu", "question": "What is the can to his side primarily used for?", "choices": ["soda", "trash", "cookies", "money"], "correct_choice_idx": 1, "direct_answers": ["trash", "trash", "trash", "trash", "trash", "trash", "garbage", "trash", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["There is a trash can to the left of the man in the armchair.", "The can is for garbage.", "You can tell by the bad inside and size of it as to what it is used for."], "image": "val2014/COCO_val2014_000000082551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360798, "question_id": "cZRQGx728Yx4pe85F5t5Hs", "question": "What causes the black markings on the stones?", "choices": ["paint", "grease", "oil", "smoke"], "correct_choice_idx": 3, "direct_answers": ["ash", "design", "spores", "soot", "paint", "stain", "fire", "smoke", "smoke", "fire"], "difficult_direct_answer": false, "rationales": ["The smoke causes the markings.", "There is a fireplace below the stones.", "The stone is on a fireplace. smoke causes things to turn darker in color."], "image": "val2014/COCO_val2014_000000360798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433151, "question_id": "cZXcmutkN49zMkjF3w8JtY", "question": "What type activity was this building designed for?", "choices": ["movie showing", "racing", "prison", "making shirts"], "correct_choice_idx": 0, "direct_answers": ["movies", "plays", "movie showing", "movies", "moviegoing", "movies", "theater", "music programs", "theatre", "concerts"], "difficult_direct_answer": false, "rationales": ["The activity is for movies.", "The place is a theater.", "The building appears to have signage on the front that would be consistent with a theater. answer a is an activity that would happen in a theater."], "image": "val2014/COCO_val2014_000000433151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135577, "question_id": "cZzLx7qbmAtZHFN6tnDPr4", "question": "What is the silver object on the table in front of the couch used to control?", "choices": ["lights", "garage door", "fan", "tv"], "correct_choice_idx": 3, "direct_answers": ["television", "remote", "television", "television", "remote", "television", "remote", "remote", "tv", "television"], "difficult_direct_answer": false, "rationales": ["The object is the tv.", "Based on the size, shape, design and the location on a coffee table in front of a couch, answer a is consistent.", "The sliver device is a remote controller that is used on a television."], "image": "train2014/COCO_train2014_000000135577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33066, "question_id": "cZzU7X5ZZCFFbaRz4X8QMw", "question": "Who most likely put the bear with this child?", "choices": ["stranger", "mom", "cashier", "grocer"], "correct_choice_idx": 1, "direct_answers": ["mothe", "parent", "parent", "mom", "mom", "parents", "his mom", "parent", "parents", "parent"], "difficult_direct_answer": false, "rationales": ["The baby is on a chair but too small to be sitting there.", "A baby is posed with a large stuffed bear. a young baby could not walk and get a bear on their own.", "The mom put the bear with it."], "image": "val2014/COCO_val2014_000000033066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271338, "question_id": "ca52fAPwpJHtDpdCDViBKB", "question": "What is the man trying to get?", "choices": ["date", "phone reception", "cab", "tan"], "correct_choice_idx": 1, "direct_answers": ["selfie", "photo", "cell reception", "cell service", "reception", "picture", "picture", "signal", "phone reception", "reception"], "difficult_direct_answer": false, "rationales": ["The man gets reception.", "A man is holding his phone up in the air and staring at it. people sometimes move their phones around to get a better signal.", "The man wants cell service."], "image": "train2014/COCO_train2014_000000271338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299066, "question_id": "caZRiYhZogxLzvo7QiUJ6J", "question": "What are the cats doing near the stone bench?", "choices": ["playing", "eating", "sleeping", "fighting"], "correct_choice_idx": 1, "direct_answers": ["eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["They are eating.", "Someone has provided some dry food for these feral cats. the humane society says there are approximately 50 million feral cats in the us alone.", "There is food near the cats and one of them is licking its mouth."], "image": "train2014/COCO_train2014_000000299066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380812, "question_id": "capJjZZcM6NzhWqrprDkiy", "question": "What fuel does the plane require?", "choices": ["coal", "jetfuel", "diesel", "electricity"], "correct_choice_idx": 1, "direct_answers": ["jet fuel", "jet fuel", "plane fuel", "petrol", "gas", "jetfuel", "plane fuel", "gasoline", "gas", "jet"], "difficult_direct_answer": false, "rationales": ["These are airplanes", "Airplanes all use jet fuel.", "The answer is internet searchable and known."], "image": "train2014/COCO_train2014_000000380812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140352, "question_id": "cb3XnBR9CMqpL3kb7CYXCG", "question": "Where are the vases most likely being displayed?", "choices": ["store", "museum", "market", "home"], "correct_choice_idx": 0, "direct_answers": ["store", "gift shop", "shop", "store", "shelves", "for sale", "shop", "museum", "antique shop", "store"], "difficult_direct_answer": false, "rationales": ["The vases are being sold.", "The vases are in a store.", "They are on display next to other for sale items."], "image": "train2014/COCO_train2014_000000140352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449350, "question_id": "cbEmGv2G9UrH2VeyUPoGBP", "question": "What design is on the woman's shirt?", "choices": ["tree", "bumble bee", "boat", "ostrich"], "correct_choice_idx": 0, "direct_answers": ["decal", "tree", "tree", "branches", "owls", "owls", "tree", "gnomes", "tree", "owl"], "difficult_direct_answer": false, "rationales": ["There is a tree on the woman's shirt.", "That is what the owls are sitting on.", "It's a tree on the shirt."], "image": "train2014/COCO_train2014_000000449350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158326, "question_id": "cbHRjuupCkJu5ztTo9Nhi5", "question": "What region/continent is likely to appear here?", "choices": ["australia", "arctic", "africa", "asia"], "correct_choice_idx": 1, "direct_answers": ["arctic", "antarctica", "north america", "artic", "arctic", "arctic", "arctic", "artic", "arctic", "antarctica"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to polar bear habitats.", "Polar bears are found in extremely cold climates of the world. the artic area seems like a fitting place for these bears to hang out.", "These are polar bears found in the arctic."], "image": "train2014/COCO_train2014_000000158326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321181, "question_id": "cbKGcCQDGvrHWiNuG8DiUC", "question": "Why are pink ribbons tied on the string?", "choices": ["cancer awareness", "wind direction", "girl's night", "visibility safety"], "correct_choice_idx": 3, "direct_answers": ["boundary line", "markers", "marks track", "safety", "marker", "visibility", "snow season", "visibility safety", "visibility", "location"], "difficult_direct_answer": true, "rationales": ["People are skiing and the path they are walking up the mountain is bordered by poles with pink ribbons tied on them.", "These bright ribbons are tied to the boundary wires on this ski track because they would be hard to see against the snowy background otherwise.", "Pink is an easy color to see in the snow."], "image": "train2014/COCO_train2014_000000321181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477861, "question_id": "cbcQ38Dp8rsB9qvxU6CFe2", "question": "From which location did this skateboarder just begin this maneuver?", "choices": ["leftward ramp", "right", "front", "tall buildings"], "correct_choice_idx": 0, "direct_answers": ["ground", "left", "top", "air", "city", "left", "leftward ramp", "deck", "ramp", "below ramp"], "difficult_direct_answer": true, "rationales": ["He needed this to catch air", "He went up the ramp in order to be able to perform this stunt.", "This man just began his maneuver from the ramp to the left."], "image": "train2014/COCO_train2014_000000477861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13172, "question_id": "cboN5C7HCN6HFAVNV3mXfG", "question": "How does this person feel about the photographer?", "choices": ["likes alot", "wary", "hates", "spiteful"], "correct_choice_idx": 0, "direct_answers": ["cheerful", "smitten", "flirty", "likes alot", "likes", "likes", "love", "loving", "safe", "happy"], "difficult_direct_answer": true, "rationales": ["The person likes it.", "A person smiles for a pictures. people smile when they are happy.", "The person is smiling and feeling giddy."], "image": "train2014/COCO_train2014_000000013172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339545, "question_id": "cbqvYCs9qCBPWamuQ3Kdt7", "question": "What is the breed of this cat?", "choices": ["scottish fold", "maine coon", "persian", "ragdoll"], "correct_choice_idx": 3, "direct_answers": ["tabby", "gray cat", "ragdoll", "tabby", "tabby", "tabby", "gray cat", "tabby", "tabby", "tabby"], "difficult_direct_answer": false, "rationales": ["It's a ragdoll by looking at its coloring.", "The breed is a ragdoll.", "This cat has stripes adn the coloring is that of a maine coon."], "image": "train2014/COCO_train2014_000000339545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30276, "question_id": "cbrp2hFHa7QtBituMLWeRR", "question": "What is the general theme of items in the cup?", "choices": ["office supplies", "cleaning supplies", "construction equipment", "computer equipment"], "correct_choice_idx": 0, "direct_answers": ["homemade", "office supplies", "office", "office", "art", "office supplies", "creative", "office utensils", "orange", "office supplies"], "difficult_direct_answer": false, "rationales": ["It has scissors, pens and pencils that are used in offices", "Items in the cup include scissors, highlighter, pencil and pens. all these are common supplies used in an office.", "The theme is office supplies."], "image": "train2014/COCO_train2014_000000030276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111024, "question_id": "cbudau6gJ3nta7XehLjszD", "question": "A type of leavened fried dough is called?", "choices": ["chocolate", "pizza", "burger", "donut"], "correct_choice_idx": 3, "direct_answers": ["doughnut", "donut", "doughnut", "doughnut", "donut", "glazed", "donut", "doughnut", "donut", "doughnut"], "difficult_direct_answer": false, "rationales": ["It's called a donut.", "There are several donuts in the middle of the picture which uses leavened flour dough and is fried.", "The donuts are on the plate."], "image": "val2014/COCO_val2014_000000111024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566931, "question_id": "cbzxQJnS6BYksTtmHnAZSj", "question": "If the woman in the water wants to copy what the other girls are doing what does she need?", "choices": ["headband", "surfboard", "bracelet", "anklet"], "correct_choice_idx": 1, "direct_answers": ["climb surfboard", "to fall", "surfboard", "surfboard", "surfboard", "surfboard", "balance", "surfboard", "wet suit", "lay down"], "difficult_direct_answer": false, "rationales": ["There are two other girls visible aside from the one in the water and both have surfboards visible. if the person in the water without the surfboard wanted to be like the others she would need one.", "She needs something to stand on to ride the wave", "She will need a surfboard and be able to surf in the water."], "image": "val2014/COCO_val2014_000000566931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132509, "question_id": "ccNCDpre4LNqFqVER7ovm8", "question": "What sport is the person involved in?", "choices": ["tennis", "baseball", "bowling", "surfing"], "correct_choice_idx": 3, "direct_answers": ["surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The person is holding a very long object on her head while beside the ocean which is where you'd put the board.", "The person is surfing.", "The person is carrying a surfboard."], "image": "val2014/COCO_val2014_000000132509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331140, "question_id": "ccqPtWo92XxzgTpSmX95Ax", "question": "What will they use the orange ring for?", "choices": ["hula hoop", "anchor boat", "grab dock", "save drowners"], "correct_choice_idx": 3, "direct_answers": ["flotation", "markers", "lifesaving", "life saver", "anchor", "life preserver", "floatation", "floatation", "save drowners", "lifesaver"], "difficult_direct_answer": true, "rationales": ["The orange ring is a lifesaver.", "The orange ring floats and can rescue people that fall into the water.", "The ring is to save people."], "image": "train2014/COCO_train2014_000000331140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42312, "question_id": "cctkYMCdocnD74Y6GsP5kd", "question": "What morning beverage is this company famous for?", "choices": ["oatmeal", "lemonade", "coffee", "fruit punch"], "correct_choice_idx": 2, "direct_answers": ["coffee", "coffee", "donut chocolate", "coffee", "coffee", "coffee", "donut chocolate", "coffee", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["You eat donuts with coffee.", "The company makes donuts as well as coffee.", "Dunkin's is known for coffee."], "image": "train2014/COCO_train2014_000000042312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220763, "question_id": "cdL6MtFdqXfWAmcmM6mSx3", "question": "Where are the people seated?", "choices": ["restaurant", "theater", "gym", "home"], "correct_choice_idx": 0, "direct_answers": ["outdoor cafe", "restaurant", "outside", "cafe", "table", "dock", "table", "near water", "shore", "dock"], "difficult_direct_answer": false, "rationales": ["The umbrellas over the table are similar to those used on restaurant patios. the branding on the umbrellas indicates it is not for private use.", "The people are eating food outside and are under an umbrella that has advertising on it, so they are not in a theater, home, or gym.", "The people are seated alongside the water, under an umbrella, and at a table covered with a tablecloth with dishes at every place setting. this would be expected in an area set up for outdoor dining."], "image": "train2014/COCO_train2014_000000220763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55942, "question_id": "cdL8Twe8XQiEenHbUzinvV", "question": "What sport is the animal playing?", "choices": ["frisbee", "soccer", "fishing", "basketball"], "correct_choice_idx": 3, "direct_answers": ["basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball", "basketball"], "difficult_direct_answer": false, "rationales": ["The elephant is trying to score baskets.", "The elephant threw the ball into the hoop.", "A hoop is in front of an elephant who is hitting a ball."], "image": "train2014/COCO_train2014_000000055942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130485, "question_id": "cdYbPYmSymKtSo8AJnvXRR", "question": "Why does the man have the glass in his hand?", "choices": ["to give", "to drink", "to show", "to clean"], "correct_choice_idx": 1, "direct_answers": ["drinking", "drinking", "drinking wine", "gripping", "drinking", "toast", "drinking wine", "drinking", "to drink", "he's drinking"], "difficult_direct_answer": false, "rationales": ["He is getting ready to drink it.", "The man is drinking.", "The man has a wine glass held to drink."], "image": "val2014/COCO_val2014_000000130485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250234, "question_id": "cdfNuorWdNRw3qe9Dix84P", "question": "What shape are these fruits?", "choices": ["rectangle", "triangle", "square", "circles"], "correct_choice_idx": 3, "direct_answers": ["circle", "round", "round", "round", "circles", "round", "round", "circle", "circle", "round"], "difficult_direct_answer": false, "rationales": ["The fruits do not have any hard edges and have a uniform shape going around.", "They are round.", "They have no hard edges or points, therefore cannot be squares, rectangles, or triangles."], "image": "train2014/COCO_train2014_000000250234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463101, "question_id": "ce47vWDZMjjqYEZMF9SZYs", "question": "What relation is the man to the boy in his backpack?", "choices": ["pastor", "neighbor", "teacher", "father"], "correct_choice_idx": 3, "direct_answers": ["father", "father", "father", "father", "father", "father", "father", "father", "father", "father"], "difficult_direct_answer": false, "rationales": ["The man is the boy's dad.", "The relation is the dad.", "The man is a parent and he is carrying his child on his back."], "image": "train2014/COCO_train2014_000000463101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441338, "question_id": "ce5p9mETvneJu4Jus85NTB", "question": "How was the man who is standing in the statue killed?", "choices": ["cancer", "beaten", "drowned", "shot"], "correct_choice_idx": 3, "direct_answers": ["shot", "old age", "shot", "shot", "shot", "assassinated", "assassinated", "shot", "shot", "slavery"], "difficult_direct_answer": false, "rationales": ["He is abraham lincoln, and he died from that event.", "President lincoln was assassinated with a gun. the statue depicts lincoln.", "Abraham lincoln was shot."], "image": "train2014/COCO_train2014_000000441338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351466, "question_id": "cedZdWPahKczjvwrEecDih", "question": "What period of the day is it in the image?", "choices": ["evening", "night", "morning", "afternoon"], "correct_choice_idx": 2, "direct_answers": ["morning", "morning", "afternoon", "455", "noon", "morning", "morning", "late morning", "morning", "morning"], "difficult_direct_answer": false, "rationales": ["The time of the day before 12 is considered morning.", "The time shows that it's already morning.", "I'ts in the morning."], "image": "val2014/COCO_val2014_000000351466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393508, "question_id": "cf6RXr94fpehHMkP62QXAb", "question": "What does the cardboard box tell us about this situation?", "choices": ["shops amazon", "likes coffee", "wrong room", "moving in"], "correct_choice_idx": 3, "direct_answers": ["new tenants", "moving", "moving in", "moving", "moving", "moving out", "moving", "moving", "kitchen location", "moving"], "difficult_direct_answer": false, "rationales": ["The box says \"kitchen\" on it and that's what people do when they move to another place.", "A box is open and top of a counter in a kitchen.", "A box marked \"kitchen\" is on a stove. people pack in boxes and unpack when they move in."], "image": "train2014/COCO_train2014_000000393508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121897, "question_id": "cfCFqzDHjgaRyzAmTCuAqp", "question": "What is the job title of the person standing outside the first base line?", "choices": ["outfielder", "batter", "catcher", "coach"], "correct_choice_idx": 3, "direct_answers": ["short stop", "linesman", "coach", "umpire", "coach", "umpire", "1st base", "baseball coach", "first baseman", "umpire"], "difficult_direct_answer": false, "rationales": ["The person standing by first base is one of the coaches for the team that's up at bat.", "The person is the coach.", "The title is the coach."], "image": "val2014/COCO_val2014_000000121897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514073, "question_id": "cfJQWnRKST2BaEg6WjcyUL", "question": "What sort of tide is noticed here?", "choices": ["low tide", "tidal wave", "laundry tide", "high tide"], "correct_choice_idx": 0, "direct_answers": ["low tide", "low tide", "low tide", "low tide", "low tide", "low tide", "ocean tide", "low", "hightide", "low"], "difficult_direct_answer": false, "rationales": ["Surfers are walking near water that has receded and they are carrying surfboards.", "The beach scene shows wet sand in the foreground with dryer sand in the background. wet, flat sand leading up to dryer sand implies the tidal zone.", "There is no visible ocean on the flat part of the beach, but the sand is wet up to a certain point. the wet sand likely reflects that at one point the tide was higher, but as the ocean is not currently visible it would be considered low."], "image": "train2014/COCO_train2014_000000514073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470839, "question_id": "cffk3T74sM9r5Hd6xJD56Q", "question": "How does the woman know the girl?", "choices": ["mother", "teacher", "coach", "doctor"], "correct_choice_idx": 0, "direct_answers": ["mother", "mom/daughter", "child", "mother", "mother", "child", "mother", "mother", "mother", "parent"], "difficult_direct_answer": false, "rationales": ["There is a woman pushing a girl in a baby stroller down street.", "It's a closer relationship than the other three; she's alone with her, casually pushing her in stroller.", "The woman is the mom."], "image": "train2014/COCO_train2014_000000470839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139728, "question_id": "cfmSfPdYNGUs3kF87zzFrX", "question": "What kind of pasta is on the left?", "choices": ["bowtie", "macaroni", "spaghetti", "penne"], "correct_choice_idx": 1, "direct_answers": ["round", "macaroni", "elbow macaroni", "macaroni", "salad", "macaroni salad", "macaroni", "macaroni", "macaroni", "macaroni"], "difficult_direct_answer": false, "rationales": ["The dish on the left is a traditional macaroni salad.", "There is macaroni in the dish.", "The pasta is macaroni."], "image": "train2014/COCO_train2014_000000139728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145039, "question_id": "cfpSbR5dTS4VUXwuSs3Ct5", "question": "What number do you get if you add 10 to the number at the bottom of the meter?", "choices": ["8610", "2210", "445", "3750"], "correct_choice_idx": 1, "direct_answers": ["2200", "2210", "2210", "large number", "twentytwo ten", "2210", "large number", "twentytwo ten", "2210", "2210"], "difficult_direct_answer": false, "rationales": ["You get that number if you add 10.", "The number on the bottom says 2200.", "The number that is currently on the meter is 2200."], "image": "val2014/COCO_val2014_000000145039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325758, "question_id": "cgWT2HGo4KijxeNcZL8xes", "question": "What is going on with this room?", "choices": ["dirty", "being painted", "being remodeled", "no occupant"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "tidy", "new owner", "cooking", "kitchen", "nothing", "empty", "kitchen", "moving", "no occupant"], "difficult_direct_answer": false, "rationales": ["There is no cooking.", "This room has zero occupants inside of the living area.", "You can easily tell by the picture as to what or who is missing from it."], "image": "train2014/COCO_train2014_000000325758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443601, "question_id": "cgtoNXhdb4pcBqZgwAfMCx", "question": "Who is the greatest female athlete in this sport of all time?", "choices": ["venus williams", "andrea agassi", "anna kournikova", "serena williams"], "correct_choice_idx": 3, "direct_answers": ["sloane stephens", "serena williams", "serena williams", "serena williams", "egret", "serena williams", "billie-jean king", "serena williams", "nadia", "serena williams"], "difficult_direct_answer": false, "rationales": ["The greatest female tennis athlete is serena williams.", "The athlete is williams.", "Serena is one of the goats of tennis."], "image": "train2014/COCO_train2014_000000443601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136129, "question_id": "chD8UWmRzt3vgBdwUKm5z2", "question": "What does this machine run on for energy?", "choices": ["batteries", "diesel", "sunlight", "carbon dioxide"], "correct_choice_idx": 0, "direct_answers": ["battery", "battery", "batteries", "battery", "battery", "electricity", "batteries", "battery", "batteries", "battery"], "difficult_direct_answer": false, "rationales": ["Electronic wheelchairs utilize batteries to power their functions.", "The machine is for energy.", "This can be plugged in and charged at night so it will run on rechargeable system."], "image": "train2014/COCO_train2014_000000136129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386029, "question_id": "chWQMNLkQjnuZDRmx3wK59", "question": "What other animal is this creature related to?", "choices": ["dog", "elephant", "tiger", "frog"], "correct_choice_idx": 2, "direct_answers": ["lion", "lion", "tiger", "wild feline", "tiger", "cat", "tiger", "tiger", "lion", "lion"], "difficult_direct_answer": false, "rationales": ["It's a cat.", "Tigers and felines are both types of cats.", "Cat's are descendants of this asian big cat."], "image": "train2014/COCO_train2014_000000386029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86217, "question_id": "chXYZzBigm2DJPJDJamAjP", "question": "What is next to the red vehicle?", "choices": ["cat", "motorcycle", "elf", "dog"], "correct_choice_idx": 1, "direct_answers": ["motorcycle", "motorcycle", "fence", "motorcycle", "motorcycle", "motorcycle", "fence", "motorcycle", "motorcycle", "motorcycle"], "difficult_direct_answer": false, "rationales": ["The mode of transport has two wheels.", "Between these red trailers or buses we see a two wheeled vehicle with handlebars.", "There is a different type of vehicle, not an animal or elf, near the red one. it has an engine and two wheels."], "image": "train2014/COCO_train2014_000000086217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275608, "question_id": "chmwhYuG5K2FbpXPtWMQqD", "question": "What type of weather is there at the beach today?", "choices": ["snowy", "rainy", "windy", "calm"], "correct_choice_idx": 2, "direct_answers": ["windy", "windy", "windy", "sunny windy", "windy", "windy", "sunny windy", "windy", "windy", "windy"], "difficult_direct_answer": false, "rationales": ["The kites are flying in the strong wind.", "Kites are flying in the sky so it must be windy.", "There is no precipitation. the kites in the sky cannot fly when the weather is calm."], "image": "train2014/COCO_train2014_000000275608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534772, "question_id": "chqw7fHtZgEweZGce3BKqB", "question": "What are these people trying to do?", "choices": ["run", "attack", "duck", "eat"], "correct_choice_idx": 2, "direct_answers": ["pout", "duck", "cover themselves", "flinch", "avoid bat", "dodge bat", "duck", "protect themselves", "avoid projectiles", "avoid bat"], "difficult_direct_answer": false, "rationales": ["The people are trying to duck.", "You can tell by there body language and the bat coming at them what they are doing.", "Some crazy person has thrown a bat towards a section of the bleachers during a baseball game, and the spectators are doing everything they can to avoid being hurt!."], "image": "train2014/COCO_train2014_000000534772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114141, "question_id": "chzvKUW28VS254boCeWhzU", "question": "What activity is the woman engaged in with the child on the bed?", "choices": ["singing", "playing", "story time", "drawing"], "correct_choice_idx": 2, "direct_answers": ["studying", "reading", "book reading", "reading book", "story time", "reading", "reading", "reading", "reading", "reading"], "difficult_direct_answer": false, "rationales": ["The activity is storytime.", "The woman is holding a book while looking at it near the child.", "They are reading books."], "image": "train2014/COCO_train2014_000000114141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569251, "question_id": "ciBpuEFcKduRiTPWxRKgng", "question": "What creatures might be kept in the glass item below the yellow cabinet?", "choices": ["fish", "mice", "vampires", "snakes"], "correct_choice_idx": 0, "direct_answers": ["fish", "fish", "fish", "fish", "cat", "fish", "fish", "cat", "fish", "fish"], "difficult_direct_answer": false, "rationales": ["The glass item is suitable for small pets, not vampires. it is filled with water, so it would not be suitable for land animals.", "There might be fish in glass tank.", "The item is an aquarium. water is in this aquarium."], "image": "val2014/COCO_val2014_000000569251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259809, "question_id": "ciJy5qpKMZagGC98Fc9zXZ", "question": "What clue is the person flipping pancakes watching in the batter to alert them as to flipping time?", "choices": ["heat indicator", "bubbles", "blackening", "thermometer"], "correct_choice_idx": 1, "direct_answers": ["bubbling", "bubbles", "brown dough", "bubbles", "bubbles", "color", "color", "bubbles", "bubbles", "bubbles"], "difficult_direct_answer": false, "rationales": ["It is impossible to see what the underside of the pancake looks like without flipping them, but bubbles appearing on the top are a well documented indicator of doneness on the other side.", "You can tell when they get darker on the sides.", "Bubbles indicate when a pancake is done."], "image": "train2014/COCO_train2014_000000259809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22056, "question_id": "ciyMKpqWdTfbDbTsUkfFTu", "question": "What period of the day is it in the image?", "choices": ["morning", "afternoon", "night", "evening"], "correct_choice_idx": 0, "direct_answers": ["midday", "eleven thirty", "1127", "late morning", "late morning", "late morning", "morning", "morning", "morning", "day"], "difficult_direct_answer": false, "rationales": ["It must be morning as the streets are empty.", "There is a clock visible that gives the hour of the day. based on the visible light of the day it would make sense that answer a is correct.", "The period is the morning."], "image": "train2014/COCO_train2014_000000022056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15302, "question_id": "cizgXfmnbftgwghZF4RLbX", "question": "Why is there a tall fence behind the batter?", "choices": ["confuse players", "stop spectators", "stop intruders", "stop balls"], "correct_choice_idx": 3, "direct_answers": ["protection", "protect viewers", "stop balls", "safety", "protection", "loss prevention", "protection", "fan protection", "catch balls", "protection"], "difficult_direct_answer": false, "rationales": ["The fence keeps balls from flying over.", "The fence stops balls.", "Sometimes a foul ball will go backwards toward spectators."], "image": "train2014/COCO_train2014_000000015302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303586, "question_id": "cjQMTKCJDKsMK2Fhw96EnY", "question": "What is the green veggie on the dog?", "choices": ["pickles", "green peppers", "jalapenos", "lettuce"], "correct_choice_idx": 2, "direct_answers": ["peppers", "jalapenos", "chili peppers", "green peppers", "peppers", "jalapeno peppers", "jalapeno", "olive", "jalapenos", "pepper"], "difficult_direct_answer": false, "rationales": ["The green veggie is some spicy jalapenos.", "The veggies are jalapenos.", "Jalapenos are on the dog."], "image": "train2014/COCO_train2014_000000303586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404710, "question_id": "cjVpDBfnXpQFPzVfdMxgEQ", "question": "What is this sport called?", "choices": ["jumping", "hurling", "vaulting", "sailing"], "correct_choice_idx": 0, "direct_answers": ["equestrian", "horse race", "dressage", "show jumping", "horseback", "equestrian", "jumping", "equestrian", "horse jump", "horse jumping"], "difficult_direct_answer": false, "rationales": ["The sport is jumping.", "The woman is on a horse and obviously in some type of competition. it would appear that jumping is the primary goal of this sport.", "A rider is jumping an obstacle on a horse."], "image": "val2014/COCO_val2014_000000404710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148855, "question_id": "cjWVJQbVRqqxGw9Q8YbNZW", "question": "What is the profession of the man?", "choices": ["waiter", "cashier", "athlete", "coach"], "correct_choice_idx": 2, "direct_answers": ["tennis player", "tennis player", "tennis player", "athlete", "tennis player", "tennis player", "tennis", "tennis player", "tennis player", "athlete"], "difficult_direct_answer": false, "rationales": ["The man is holding a tennis racket in athletic gear and the writing on the backstop is of a professional tennis tournament. if a person is playing tennis on the court of a professional tennis tournament, they are likely a professional tennis player which is a type of athlete.", "The man plays tennis.", "He's a tennis player"], "image": "val2014/COCO_val2014_000000148855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397036, "question_id": "cjiS4ikiDW6QtkGC2Xpu9F", "question": "What is the sign all the way to the right for?", "choices": ["caution", "meter", "stop", "panhandling"], "correct_choice_idx": 1, "direct_answers": ["parking", "clock", "clock", "clock", "meter", "parking", "parking", "parking", "parking", "parking"], "difficult_direct_answer": false, "rationales": ["It has the words \"pay here\" indicating that it is a parking meter where you can pay to park.", "It is a meter because you can read on the sign that you can pay here", "The sign all the way to the right is for buses."], "image": "train2014/COCO_train2014_000000397036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109602, "question_id": "cjoij5Bs3J8hU2ebtSCJqA", "question": "What is the dark colored wall made from?", "choices": ["pine", "mud", "steel", "bricks"], "correct_choice_idx": 3, "direct_answers": ["shade", "bricks", "brick", "brick", "brick", "bricks", "brick", "brick", "brick", "bricks"], "difficult_direct_answer": false, "rationales": ["The walls are made of brick.", "There are bricks on the wall.", "The wall behind the old folks is made of brick. the bricks, made mostly out of cement, are adhered to each other with mortar, and many brick walls are decades old."], "image": "train2014/COCO_train2014_000000109602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160421, "question_id": "cjyhMsPGH6QrBAZuUUXzgF", "question": "Which plant family does the green vegetable belong to?", "choices": ["solanaceae", "brassicaceae", "rosaceae", "cucurbitaceae"], "correct_choice_idx": 1, "direct_answers": ["cabbage", "broccoli", "cabbage", "brassicca", "cruciferous", "broccoli", "mustards", "cabbage", "brassicaceae", "brassicaceae"], "difficult_direct_answer": false, "rationales": ["The green vegetables are broccoli, not nightshades, gourds, or roses.", "Broccoli is in the brassicacae family.", "The green veggie is a brocolli."], "image": "val2014/COCO_val2014_000000160421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188510, "question_id": "ckYXPXnZnB9UEj6Lv8bsnC", "question": "What sort of book is seen here?", "choices": ["romance", "cook", "novel", "music"], "correct_choice_idx": 3, "direct_answers": ["music", "piano book", "piano", "music", "music", "sheet music", "piano music", "child book", "music", "textbook"], "difficult_direct_answer": false, "rationales": ["The boy is playing a piano. the book contains notes.", "This is a book you can use to play songs on the piano.", "It's a book that goes along with the piano to play from."], "image": "train2014/COCO_train2014_000000188510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443591, "question_id": "ckrE43NTmFTCUrVdUsLx5c", "question": "What type of vegetation is this?", "choices": ["grassland", "mountain", "woods", "rainforest"], "correct_choice_idx": 0, "direct_answers": ["grass", "grass", "grass", "grass", "grassland", "grass", "grassland", "grassland", "grass", "grassland"], "difficult_direct_answer": false, "rationales": ["Zebras live in grassy areas like the savannah.", "The vegetation is grassy.", "Zebras commonly live in this type of region in africa."], "image": "val2014/COCO_val2014_000000443591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334840, "question_id": "cmXwHgoF3Yaj3pNnThQZpv", "question": "What is the person aiming her phone at?", "choices": ["selfie mirror", "bath mirror", "rearview mirror", "side mirror"], "correct_choice_idx": 2, "direct_answers": ["mirror", "mirror", "self", "rearview mirror", "motorcyclists", "rearview mirror", "rear windshield", "motorcycles", "mirror", "motorcycle riders"], "difficult_direct_answer": false, "rationales": ["She is taking a video or picture of the motorcycles that are behind her.", "It is attached to the front windshield.", "She is taking a picture of the bikes behind her."], "image": "train2014/COCO_train2014_000000334840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99295, "question_id": "cmc57T7kvR2of8LMYJK96a", "question": "What can be made with the beans available?", "choices": ["soup", "plants", "sprouts", "coffee"], "correct_choice_idx": 3, "direct_answers": ["coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["These appear to be coffee beans and their best use is to make coffee.", "Coffee can be made with the beans.", "The beans are for coffee."], "image": "train2014/COCO_train2014_000000099295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52233, "question_id": "cnMnfNQDTwuakJq2TKrgx4", "question": "What do the red things prevent from getting to your body?", "choices": ["rain", "bullets", "sound", "mosquitos"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain water", "rain", "rain", "rain", "rain water", "rain", "rain water", "rain"], "difficult_direct_answer": false, "rationales": ["They are umbrellas, which create a cone of protection around the body and are waterproof to water getting through them.", "The umbrellas are used to cover you from the rain.", "The people are holding umbrellas."], "image": "val2014/COCO_val2014_000000052233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159963, "question_id": "cnhs2FdUCPGaADdFVUsrXy", "question": "What is the profession of he man with the dog?", "choices": ["officer", "attendant", "porter", "mechanic"], "correct_choice_idx": 0, "direct_answers": ["cop", "police", "policeman", "customs", "policeman", "police officer", "security", "police", "officer", "police"], "difficult_direct_answer": false, "rationales": ["The dog is with a police officer.", "He's an officer.", "He is dressed in a uniform that a policeman would wear."], "image": "train2014/COCO_train2014_000000159963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453711, "question_id": "cnnyh9dSG4yjcqF45Pw3R6", "question": "While washing dishes in which position to those seated at the bar is the washer?", "choices": ["facing", "under", "sideways", "backwards"], "correct_choice_idx": 0, "direct_answers": ["left", "facing", "to side", "forward", "front", "top right", "left", "left", "facing them", "facing"], "difficult_direct_answer": false, "rationales": ["The washer is to the left facing the chairs on the sink island.", "A kitchen is shown with a countertop and sink facing a table and chairs. people wash dishes in the sink.", "The faucet of the sink is facing in the opposite direction, meaning the person would have to stand on the other side of the counter to access it."], "image": "train2014/COCO_train2014_000000453711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569431, "question_id": "cnwAYv7PKgucNnxumewaSX", "question": "What style of skirt is she wearing?", "choices": ["mini", "pleated", "midi", "peasant"], "correct_choice_idx": 0, "direct_answers": ["mini", "mini", "mini", "mini", "mini", "mini", "jean", "jean", "mini", "jean"], "difficult_direct_answer": false, "rationales": ["Her skirt is short and reaches above her knees.", "The skirt is cut far above the knee.", "A girl is wearing a short skirt."], "image": "train2014/COCO_train2014_000000569431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461435, "question_id": "cocZgY5vHDGjufXPdKx6AJ", "question": "Which bananas should the man pick for eating?", "choices": ["none", "lower ones", "upper ones", "middle ones"], "correct_choice_idx": 2, "direct_answers": ["ripe", "upper ones", "ripe", "yellow", "yellow", "ripe", "yellow", "yellow one", "yellow bananas", "ripe"], "difficult_direct_answer": false, "rationales": ["The ones that are yellow and more ripe.", "The ones on the top are the most yellow. yellow bananas are ripe.", "The bananas on top are yellow rather than green like the ones below."], "image": "val2014/COCO_val2014_000000461435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296233, "question_id": "cohW3iDz7D5Vz2u89t59f6", "question": "Which wrestler would be most likely to wear the garb the men on the left have on?", "choices": ["john cena", "drew mcintyre", "jinder mahal", "kofi kingston"], "correct_choice_idx": 1, "direct_answers": ["hulk hogan", "rick flair", "drew mcintyre", "irish wrestler", "scottish", "rick flair", "roddy riper", "triple h", "drew mcintyre", "roddy piper"], "difficult_direct_answer": false, "rationales": ["The men on the left are wearing kilts. a scottish wrestler might wear a kilt.", "Mcintyre sounds like a scottish name, and clothes like these are traditional wear in scotland.", "The man is wearing a kilt."], "image": "train2014/COCO_train2014_000000296233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470462, "question_id": "corUKE2ACHFCWAdXdZEb4B", "question": "What does the leather on the horse here form?", "choices": ["skirt", "chaps", "apron", "harness"], "correct_choice_idx": 3, "direct_answers": ["bridle tack", "lead", "banner", "reins", "name", "harness", "reins", "owner", "harness", "circle"], "difficult_direct_answer": false, "rationales": ["The horse uses a harness.", "The leather part is on the harness of the horse.", "The leather forms a muzzle."], "image": "train2014/COCO_train2014_000000470462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229688, "question_id": "cpKQ2fuLQhFTtMWXHcg5SK", "question": "What is held in the red and yellow containers on the table?", "choices": ["lettuce", "eggs", "condiments", "salad dressing"], "correct_choice_idx": 2, "direct_answers": ["condiments", "condiments", "condiments", "ketchup mustard", "sauce", "ketchup mustard", "food", "ketchup mustard", "cheese", "ketchup mustard"], "difficult_direct_answer": false, "rationales": ["There is mustard and ketchup in the condiment bottles.", "One is red for ketchup and one is yellow for mustard.", "Ketchup and mustard are red and yellow condiments."], "image": "val2014/COCO_val2014_000000229688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72800, "question_id": "cpNUBr7imw5VsKE4ot8wH9", "question": "What animal is on top of the books?", "choices": ["bird", "no animal", "chameleon", "dog"], "correct_choice_idx": 0, "direct_answers": ["bird", "parakeet", "bird", "bird", "parakeet", "bird", "bird", "bird", "bird", "bird"], "difficult_direct_answer": false, "rationales": ["The animal is the bird.", "This is a winged colorful creature with tail feathers, feathers, and a beak. all characteristics point to this particular answer.", "A bird is on the books."], "image": "train2014/COCO_train2014_000000072800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353919, "question_id": "cpdDcszvLMY2cdAjQmf5qb", "question": "What would describe the atmosphere of this office?", "choices": ["competitive", "artistic", "conventional", "degrading"], "correct_choice_idx": 1, "direct_answers": ["casual", "casual", "casual", "entertaining", "laid back", "casual", "chill", "artistic", "playful", "initech"], "difficult_direct_answer": false, "rationales": ["This office has many decorative items on the walls and desk.", "The man has decorations in his office.", "The office is artsy."], "image": "train2014/COCO_train2014_000000353919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529549, "question_id": "cpwEff2S4vq2VBGGUkZDP8", "question": "What is the white part one of these animals is showing called?", "choices": ["talon", "snout", "tusk", "horn"], "correct_choice_idx": 2, "direct_answers": ["tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk", "tusk"], "difficult_direct_answer": false, "rationales": ["The hard white is called a tusk, and is made from ivory.", "The elephants have tusks.", "The elephants have small ivory pieces near their trunks which are tusks."], "image": "val2014/COCO_val2014_000000529549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96463, "question_id": "cq6kCBmJbTaBLttGDHThz8", "question": "Whom employs the person standing here?", "choices": ["military", "police", "icecream truck", "train company"], "correct_choice_idx": 3, "direct_answers": ["train company", "train company", "train company", "government", "train", "train company", "transit company", "railroad", "city", "rail station"], "difficult_direct_answer": false, "rationales": ["The person is working with the train.", "The train company is the employer.", "The person is the employee."], "image": "train2014/COCO_train2014_000000096463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457725, "question_id": "cqGP8WLHKntDkde2FpzFiF", "question": "Where is the officer riding here?", "choices": ["parade route", "brazil", "fire", "perp. capture"], "correct_choice_idx": 0, "direct_answers": ["street", "road", "parade route", "parade route", "motorcycle", "right", "street", "street", "motorcycle", "motorcycle"], "difficult_direct_answer": false, "rationales": ["There are traffic cones that are being set up or taken down without any road work being done, with one officer on a motorcycle and one walking.", "The officer is on a route.", "He is riding slowly without the sirens on."], "image": "train2014/COCO_train2014_000000457725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379211, "question_id": "cqJQrw4BMDKxJ83nMyJ5WF", "question": "What is the finger everyone is holding up commonly called?", "choices": ["big finger", "ring finger", "index finger", "thrust finger"], "correct_choice_idx": 2, "direct_answers": ["index", "number one", "index finger", "pointer", "index finger", "fore", "index finger", "hand sign", "index finger", "forefinger"], "difficult_direct_answer": false, "rationales": ["The finger next to their thumb.", "People are pointing their second fingers.", "The index finger is used."], "image": "train2014/COCO_train2014_000000379211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445187, "question_id": "cqstSvkydg7BT3vHRBjiiF", "question": "What is the breed of this cat?", "choices": ["ragdoll", "persian", "maine coon", "scottish fold"], "correct_choice_idx": 0, "direct_answers": ["domestic shorthair", "tabby", "ragdoll", "red tabby", "abyssinian", "american curl", "tabby", "tabby", "tabby", "tabby"], "difficult_direct_answer": false, "rationales": ["The cat is striped and orange. it is very close to a tabby cat.", "This is an orange cat.", "These cats come in the orange tabby color."], "image": "train2014/COCO_train2014_000000445187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384605, "question_id": "cqyZoQCM7pP9JSQXMQZMxW", "question": "What is the little boy doing?", "choices": ["flying in", "falling in", "dropping in", "pushing in"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "skateboarding", "skateboarding", "trick", "skate boarding", "dropping in", "skateboarding", "skateboard jump", "showing off", "skating"], "difficult_direct_answer": false, "rationales": ["The boy dropping in.", "The boy is dropping into the skate bowl.", "The boy is dropping into the skate park."], "image": "train2014/COCO_train2014_000000384605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341857, "question_id": "cr7RKCXKtHgqgvjBUGZsAj", "question": "What type of hat does the man in white have on?", "choices": ["captains", "skating", "athletic", "fisherman"], "correct_choice_idx": 0, "direct_answers": ["captain", "captains", "captains hat", "captain", "captain's", "sailor hat", "sailors", "sailer", "captain's hat", "captains hat"], "difficult_direct_answer": false, "rationales": ["The man is wearing a sailor hat.", "The hat has the classic captains symbol and the white and black design that is seen on many ships.", "This is a uniform"], "image": "val2014/COCO_val2014_000000341857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87740, "question_id": "crNPotfqpsSBcLN3xg8rWx", "question": "What keeps the seat from falling?", "choices": ["balance", "back shape", "ropes", "glue"], "correct_choice_idx": 2, "direct_answers": ["ropes", "rope", "rope", "ropes", "rope", "rope", "string", "rope", "basket", "rope"], "difficult_direct_answer": false, "rationales": ["One can see that the seats are tied to the elephants.", "Seats are tied with ropes to elephants.", "There are some ropes tied around the elephants to fasten the seats on place."], "image": "val2014/COCO_val2014_000000087740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386712, "question_id": "crS8AYRQi3exm8Tq2QWYtu", "question": "What form of art was used to preserve these dead animals?", "choices": ["taxidermy", "metalworking", "sculpting", "drawing"], "correct_choice_idx": 0, "direct_answers": ["taxidermy", "taxidermy", "taxidermy", "abstract", "taxidermy", "taxidermy", "taxidermy", "abstract", "taxidermy", "taxidermy"], "difficult_direct_answer": false, "rationales": ["These are stuffed dead animals which is called taxidermy.", "The art is taxidermy.", "Answer a is known to be the process used to preserve animals in their natural form."], "image": "train2014/COCO_train2014_000000386712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198880, "question_id": "crsbcFUiQdEiDrMjENT3HC", "question": "What style tennis is being played?", "choices": ["twins", "pairs", "standard", "triple"], "correct_choice_idx": 1, "direct_answers": ["pairs", "doubles", "floor", "doubles", "doubles", "counter punch", "doubles", "doubles", "backhand", "men's doubles"], "difficult_direct_answer": false, "rationales": ["There are two people on one side of the court.", "Here we see two tennis players on the same side of the net. this only happens in a doubles match.", "The style is in pairs."], "image": "train2014/COCO_train2014_000000198880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192970, "question_id": "cs2eP6TXSPGD2bQ7uMVRpN", "question": "What is the giraffe hair is called?", "choices": ["horn", "skin", "verticones", "ossicones"], "correct_choice_idx": 3, "direct_answers": ["fur", "fur", "mane", "ossicones", "mane", "fur", "mane", "fur", "fur", "mane"], "difficult_direct_answer": false, "rationales": ["The hair along the giraffes neck is called the mane.", "The small horn shaped but soft antenna looking features on top of a giraffe's head are know as ossicones.", "They are covered in black hair and are on top of their heads."], "image": "val2014/COCO_val2014_000000192970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126097, "question_id": "cs3JWo7bMQss7S48nbiQKM", "question": "What numbers are visible on the meter closest to the man?", "choices": ["2410", "1903", "3000", "2701"], "correct_choice_idx": 3, "direct_answers": ["2701", "2701", "2701", "two seven", "2701", "parking meters", "2701", "2701", "2791", "two one"], "difficult_direct_answer": false, "rationales": ["They are show on the white panel on the meter", "They are black numbers on a white plate", "Looking at the meter you can make out a 2701 on the front."], "image": "train2014/COCO_train2014_000000126097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336935, "question_id": "csHLQYGvVScSvAJH9aKNXC", "question": "What are they both running towards?", "choices": ["referee", "gatorade", "sidelines", "ball"], "correct_choice_idx": 3, "direct_answers": ["ball", "ball", "ball", "ball", "tennis ball", "tennis ball", "ball", "ball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["They want the ball.", "Both players are running towards where the ball is so they can hit it.", "They are playing tennis. the goal of tennis is to hit the ball."], "image": "train2014/COCO_train2014_000000336935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147466, "question_id": "csVqFx6Tn4hFef8aAJ44fV", "question": "What are the children making?", "choices": ["muffins", "live bunny", "teddy bear", "cake"], "correct_choice_idx": 2, "direct_answers": ["crafts", "doit yourself", "stuffed animals", "teddy bear", "crafts", "crafts", "cotton candy", "cotton candy", "stuffed animals", "stuffed animal"], "difficult_direct_answer": false, "rationales": ["The kids make a bear.", "The children have a bag of cotton in front of them.", "They are stuffing a stuffed animal."], "image": "train2014/COCO_train2014_000000147466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111525, "question_id": "cscU3oUTZMM9GPERENuJ9A", "question": "What religion is common in this area?", "choices": ["christianity", "judaism", "islam", "hinduism"], "correct_choice_idx": 3, "direct_answers": ["hinduism", "christianity", "india", "hinduism", "hinduism", "buddhism", "buddhism", "hinduism", "hinduism", "buddhism"], "difficult_direct_answer": false, "rationales": ["The religion is hinduism.", "The people nearby are indian and hinduism is a common religion there.", "A street and building is shown with a picture of hindu goddess on the wall."], "image": "train2014/COCO_train2014_000000111525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411832, "question_id": "csfDf49qWhXynZCnZuzKwQ", "question": "What is the most popular type of apple?", "choices": ["ladybug", "honey crisp", "red delicious", "granny smith"], "correct_choice_idx": 2, "direct_answers": ["red", "gala", "apple crisp", "gala", "gala", "granny smith", "red", "macintosh", "gala apple", "red delicious"], "difficult_direct_answer": false, "rationales": ["The red delicious apple is one that most people know.", "People most often buy red delicious apples.", "The apple is a red delicious."], "image": "val2014/COCO_val2014_000000411832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207491, "question_id": "csfKAtujuSN4vugFkDeFsn", "question": "In what setting does this bus drive?", "choices": ["rural", "sand desert", "urban", "industrial"], "correct_choice_idx": 2, "direct_answers": ["urban", "metropolitan", "urban", "downtown", "urban", "urban", "downtown", "downtown", "city streets", "suburban"], "difficult_direct_answer": false, "rationales": ["There are a lot of buildings and cars.", "The setting is urban.", "A bus is driving on a city street. public transportation runs in urban areas."], "image": "val2014/COCO_val2014_000000207491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356569, "question_id": "cshrsXJRSPmSWH2xXHmZZv", "question": "What should be the distance between eyes and computer screen?", "choices": ["30inches", "5inches", "40inches", "20inches"], "correct_choice_idx": 3, "direct_answers": ["24 inches", "20 inches", "twenty-forty inches", "three feet", "20inches", "25 inches", "20 inches", "twenty inches", "18 inches", "arm's length"], "difficult_direct_answer": true, "rationales": ["There should at least be 20 inches of clearance between eyes and a computer screen.", "That leaves you plenty of space to see but not have to struggle to figure out what the words say on it.", "Scientifically the safest distance to protect eyes is considered 20 inches."], "image": "train2014/COCO_train2014_000000356569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191729, "question_id": "csobK8V8ZsMbJqNbo2eYAw", "question": "The color blue represents commonly what in automobiles?", "choices": ["none", "focus", "dependable", "driving style"], "correct_choice_idx": 2, "direct_answers": ["company automobile", "emergency", "road worker", "dependable", "dump truck", "garbage", "paint", "utility work", "safety", "dependable"], "difficult_direct_answer": true, "rationales": ["The blue color represents a quality automobile.", "The color is dependable.", "The color blue is a common color for a dependable dump truck."], "image": "train2014/COCO_train2014_000000191729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156104, "question_id": "cspP6EUaxuMaJXGX6N6KnD", "question": "What can you do directly related to the place on the sign?", "choices": ["learn driving", "pray", "study", "go camping"], "correct_choice_idx": 1, "direct_answers": ["praying", "worship", "worship", "worship", "worship", "pray", "worship", "worship", "praying", "worship"], "difficult_direct_answer": false, "rationales": ["The sign is about a baptist church.", "The sign can be for prayer.", "Many people come to churches to communicate with god."], "image": "val2014/COCO_val2014_000000156104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277237, "question_id": "ct8YRtS74RjWzbhNyEMa7f", "question": "Why is his right foot in the air?", "choices": ["kicking ball", "is drunk", "to balance", "showing off"], "correct_choice_idx": 2, "direct_answers": ["momentum", "to balance", "running", "moving", "balancing", "stance", "follow through", "hitting ball", "running", "speed"], "difficult_direct_answer": true, "rationales": ["The man is running and trying to maintain his balance.", "He is trying to get balance.", "He has just finished hitting a ball with a tennis racket, while twisting his body."], "image": "train2014/COCO_train2014_000000277237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62459, "question_id": "ctF76pbgQyt5VY2X2gNZcn", "question": "What is the red and white object used for?", "choices": ["stopping fires", "police work", "changing tires", "fishing"], "correct_choice_idx": 0, "direct_answers": ["fire", "water", "fire plug", "fighting fires", "fire water", "fire hydrant", "water supply", "fires", "stopping fires", "fires"], "difficult_direct_answer": true, "rationales": ["The object is a hydrant based on its size, shape and design. these are commonly known to be used for answer a.", "Water comes out from it and is used to spray on the fires.", "These hydrants are used by firefighters to battle the flames by ejecting water."], "image": "val2014/COCO_val2014_000000062459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304930, "question_id": "ctQXEKKxqmxsX5ZKusdvrX", "question": "What OS is the laptop on the right displaying on its screen?", "choices": ["windows xp", "macos", "windows 10", "windows vista"], "correct_choice_idx": 3, "direct_answers": ["windows", "windows", "windows", "windows", "windows", "windows", "windows", "windows vista", "windows", "windows"], "difficult_direct_answer": false, "rationales": ["The laptop says windows 10 on top of the screen.", "The logo is visible.", "The os is the vista."], "image": "train2014/COCO_train2014_000000304930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148639, "question_id": "ctUDD9tnwnVEfgiYbECaVE", "question": "What cut the grass here?", "choices": ["lawn mower", "scissors", "scythe", "cow"], "correct_choice_idx": 0, "direct_answers": ["mower", "ball", "lawnmower", "mower", "mower", "lawn mower", "mower", "reel mowers", "lawnmower", "lawn mower"], "difficult_direct_answer": false, "rationales": ["Traditionally mechanical devices are best suited to cut grass.", "A baseball field is too large to cut with anything other than a power mower. it's most likely a riding mower as well, to cover that huge amount of space.", "It's a lawn mower"], "image": "train2014/COCO_train2014_000000148639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192825, "question_id": "ctijQqP2nYJKrfntb2he9X", "question": "Why is the man swinging his right arm?", "choices": ["playing baseball", "throwing ball", "playing game", "waving"], "correct_choice_idx": 2, "direct_answers": ["playing wii", "playing wii", "playing wii", "playing game", "video game", "playing wii", "playing game", "spin something", "playing game", "playing wii"], "difficult_direct_answer": false, "rationales": ["He has a video game controller", "The man is holding a wii controller and that is used to play video games.", "The man is playing the wii."], "image": "train2014/COCO_train2014_000000192825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407067, "question_id": "ctwFhMtZDNhedLmFroBV3m", "question": "Why is the boy on the skateboard raising his hands in the air?", "choices": ["to balance", "to clap", "to celebrate", "getting help"], "correct_choice_idx": 0, "direct_answers": ["to balance", "balance himself", "balance", "balance", "balance", "for balance", "balance", "maintain balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["He's balancing.", "The boy doesn't want to fall.", "The boy's body is moving at a very fast pace, and if he doesn't throw his arms in the right direction often and consistently to maintain balance, he will be thrown to the ground."], "image": "val2014/COCO_val2014_000000407067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272615, "question_id": "cu2YoPr7ue32xszL8c2tTX", "question": "What is the only food group that appears to be missing?", "choices": ["dairy", "grain", "fruit", "vegetable"], "correct_choice_idx": 1, "direct_answers": ["grains", "meat", "dairy", "grains", "meat", "meat", "meat", "meat", "meat", "grain"], "difficult_direct_answer": false, "rationales": ["The group is grains.", "Grain appears to be missing because you have fruits, vegetables, meat and dairy", "There aren't any grains."], "image": "val2014/COCO_val2014_000000272615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102377, "question_id": "cuTtHuWfhSmRHUgwqLZ4f4", "question": "What sandwich does the bus share a name with?", "choices": ["reuben", "double decker", "blt", "submarine"], "correct_choice_idx": 1, "direct_answers": ["double decker", "double decker", "double decker", "decker", "double decker", "double decker", "double decker", "double decker", "double decker", "double-decker"], "difficult_direct_answer": false, "rationales": ["The bus has two \"decks\" as does some sandwiches.", "A double decker sandwich.", "The red bus is a double decker."], "image": "train2014/COCO_train2014_000000102377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574562, "question_id": "cubNWPDHkMhw4kUpenfmDy", "question": "Why is the woman carrying luggage?", "choices": ["to travel", "to buy", "to sell", "to trade"], "correct_choice_idx": 0, "direct_answers": ["to travel", "traveling", "traveling", "travelling", "travelling", "arriving home", "travelling", "travelling", "trip", "traveling"], "difficult_direct_answer": false, "rationales": ["This wheeled single suitcase the woman is holding is suggestive of travel.", "People use luggage to carry their belongings when they travel.", "The primary reason why people carry luggage is because they are in some process of travelling."], "image": "train2014/COCO_train2014_000000574562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21500, "question_id": "cue3ekV4XCPQPZH7GMPGCd", "question": "Which person bit the donut?", "choices": ["leftmost", "baker", "none", "rightmost"], "correct_choice_idx": 3, "direct_answers": ["rightmost", "right", "right hand", "right", "left hand", "right side", "another", "right person", "rightmost", "right"], "difficult_direct_answer": false, "rationales": ["Probably the person on the right.", "The person on the right probably bit the donut before handing it off.", "The person on the right is handing the donut to the person on the left. the person on the right had the donut first."], "image": "train2014/COCO_train2014_000000021500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459185, "question_id": "cujRueXPUC7DbbjfRhmNaK", "question": "What type of event is this rider in?", "choices": ["polo", "dance", "show jumping", "race"], "correct_choice_idx": 2, "direct_answers": ["show jumping", "riders", "equestrian jumping", "jumping", "race", "horse riding", "horse", "equestrian", "jump", "equestrian"], "difficult_direct_answer": true, "rationales": ["The rider is jumping over the log.", "The rider is jumping.", "He is jumping his horse over the bar."], "image": "train2014/COCO_train2014_000000459185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539694, "question_id": "cuqEdcCK2smFuFS53dwLVm", "question": "What does one of the applications on the desktop allow directly you to open?", "choices": ["web browser", "music creator", "card game", "flash cards"], "correct_choice_idx": 0, "direct_answers": ["web browser", "internet", "center application", "google", "setting", "adobe", "google chrome", "adobe", "internet browser", "chrome"], "difficult_direct_answer": true, "rationales": ["The chrome browser is the first icon at the bottom.", "The web browser can come from chrome.", "There is a mozilla firefox shortcut on the desktop."], "image": "val2014/COCO_val2014_000000539694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43774, "question_id": "cuzJBhN6BUXifwYnzqNsTJ", "question": "The birthday boy has a name that refers to what kind of animal?", "choices": ["salamander", "cat", "dog", "bird"], "correct_choice_idx": 0, "direct_answers": ["lion", "newt", "newt", "lizard", "reptile", "newt", "lizard", "newt", "salamander", "newt"], "difficult_direct_answer": false, "rationales": ["The name is \"newt\". newts are a species of salamanders.", "Salamanders and newts are the same.", "The name is salamander."], "image": "train2014/COCO_train2014_000000043774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162223, "question_id": "cv36CMt6756EKwTpJvoYkw", "question": "How many kilometers distance is there between the capital cities of the countries these planes represent?", "choices": ["852", "681", "400", "250"], "correct_choice_idx": 1, "direct_answers": ["500", "681", "eight hundred", "500", "8000", "research required", "681.7", "thousands", "14235 km", "1ookm"], "difficult_direct_answer": true, "rationales": ["There are 681.", "The distance in kilometers between the capital of austria (vienna) and the capital of germany (berlin) is approximately 681.", "The writing on the sides of the two visible planes provides their home locations. the answer is then internet searchable."], "image": "train2014/COCO_train2014_000000162223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134870, "question_id": "cvUvHLpgFnLMY35LcHmkNe", "question": "How should I go if I want to go to McDonald's?", "choices": ["turn right", "turn back", "turn left", "go straight"], "correct_choice_idx": 3, "direct_answers": ["go straight", "forward", "left", "right straight", "follow sign", "follow signs", "straight", "straight", "straight", "turn right"], "difficult_direct_answer": false, "rationales": ["Keep going on the street to get there.", "The people go straight.", "The shape of the arrow below the mcdonalds arches suggest going straight first before making a right then left turn."], "image": "val2014/COCO_val2014_000000134870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248797, "question_id": "cvgxu2uhahruutrHD8nXWW", "question": "The item in the sky looks most like what?", "choices": ["dog", "house", "wheel", "cat"], "correct_choice_idx": 2, "direct_answers": ["round kite", "wheel", "fan", "pinnwheel", "ferris wheel", "kite", "fan", "wheel", "windmill", "wheel"], "difficult_direct_answer": false, "rationales": ["The item in the sky is round. it has spokes.", "The kite looks like a wagon wheel.", "It is round and has spirals that look like rims."], "image": "train2014/COCO_train2014_000000248797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456077, "question_id": "cwHF3Eifk6h8hYS35MWoLa", "question": "What type of area is this event taking place at?", "choices": ["rural", "city", "residential", "country"], "correct_choice_idx": 1, "direct_answers": ["city", "broadgate", "swimming pool", "broadgate", "city", "tourist", "water park", "city center", "urban", "city"], "difficult_direct_answer": false, "rationales": ["The size of the many-windowed brown building in the background of this scene places it in a metro area.", "The sign says it's a city.", "You can see the building in the background that would suggest what area this is in."], "image": "val2014/COCO_val2014_000000456077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9002, "question_id": "cwQ3rzSzjwmGvBmhzNdtdp", "question": "What is the venue shown in the image?", "choices": ["restaurant", "pizzeria", "dining room", "kitchen"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "pizzeria", "kitchen", "kitchen", "kitchen", "pizza parlor", "kitchen", "kitchen", "kitchen", "pizza restaurant"], "difficult_direct_answer": false, "rationales": ["The pizza looks homemade, and the two pot holders appear to come from a personal home kitchen.", "A person is holding cooked foods with pot holders in hand.", "There is food visible that seems to be placed on a countertop. these aspects would occur within the venue of answer a."], "image": "val2014/COCO_val2014_000000009002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292498, "question_id": "cwR7SVWP4WCK9BenRsWKPp", "question": "What country does this bus operate in?", "choices": ["canada", "mexico", "norway", "ireland"], "correct_choice_idx": 3, "direct_answers": ["ireland", "switzerland", "ireland", "ireland", "uk", "europe", "ireland", "ireland", "england", "america"], "difficult_direct_answer": false, "rationales": ["This appears to be a european bus. ireland is a european country.", "The brand of the bus is visible on the front. the bus brand operates in the country answer c.", "The country is ireland."], "image": "train2014/COCO_train2014_000000292498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54733, "question_id": "cwwYeF8Pj5kTuNJbTF2nTV", "question": "What are the two men walking in?", "choices": ["surf", "desert", "river", "meadow"], "correct_choice_idx": 0, "direct_answers": ["water", "surf", "water", "ocean", "ocean", "waves", "ocean", "water", "surf", "water"], "difficult_direct_answer": false, "rationales": ["The two men are walking in waves.", "The men are surfing.", "Two men are walking in the beach surf."], "image": "train2014/COCO_train2014_000000054733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277945, "question_id": "cx4Dhvs7v7xHugS4KEqqbM", "question": "Why is the man wearing gloves?", "choices": ["warmth", "fashion", "health", "grip"], "correct_choice_idx": 3, "direct_answers": ["hold bat", "protect hands", "handling bat", "better grip", "protect hands", "playing baseball", "slippage protection", "grip", "grip", "playing baseball"], "difficult_direct_answer": false, "rationales": ["He is using it to hold the bat.", "When using the bat, it can slip in bares hands.", "Gloves can help hold the bat without slipping."], "image": "val2014/COCO_val2014_000000277945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408364, "question_id": "cxhQfuJicwuaJ568NFnkuF", "question": "Why is everything red and white?", "choices": ["get reward", "coincidence", "camouflage", "team colors"], "correct_choice_idx": 3, "direct_answers": ["team color", "red sox", "team colors", "car", "baseball team", "team colors", "logo colors", "team colors", "sports", "team colors"], "difficult_direct_answer": false, "rationales": ["The team colors are those.", "The red sox logo is pictured.", "Race cars have team colors based on advertisers."], "image": "val2014/COCO_val2014_000000408364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258035, "question_id": "cxiD9orv6K85EhSPyqGwRL", "question": "Why is he holding the frisbee like that?", "choices": ["examining it", "taunt friend", "offer friend", "to toss"], "correct_choice_idx": 3, "direct_answers": ["to throw", "throwing", "to toss", "to throw", "throwing it", "throwing", "to throw", "playing", "throw it", "throwing"], "difficult_direct_answer": false, "rationales": ["He is ready to throw.", "The man is tossing.", "The person wants to toss the frisbee in the air."], "image": "train2014/COCO_train2014_000000258035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13220, "question_id": "cyN8AVZmi2uiZzTH2wfeMN", "question": "What is holding the red napkin together?", "choices": ["napkin ring", "friction", "bracelet", "tape"], "correct_choice_idx": 0, "direct_answers": ["napkin ring", "napkin ring", "metal ring", "tape", "napkin ring", "holder", "holder", "napkin ring", "napkin ring", "napkin ring"], "difficult_direct_answer": false, "rationales": ["There is a silver circle encompassing the red material to keep its shape.", "The ring holds the napkin.", "There are red napkins on the table with circular items around them."], "image": "val2014/COCO_val2014_000000013220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374249, "question_id": "cyU9g52Wmjz9dnnfc8j4sc", "question": "In what country would you find these shoji doors most often?", "choices": ["canada", "japan", "mexico", "france"], "correct_choice_idx": 1, "direct_answers": ["japan", "japan", "japan", "usa", "japan", "japan", "usa", "japan", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["These type of doors can be found in japan.", "This country is known to have those window-style sliding doors. the flowers on the table and back have that oriental feel as well.", "Shoji doors are japanese."], "image": "val2014/COCO_val2014_000000374249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335581, "question_id": "cyUwQXineMiWPAc6UW8MRs", "question": "What is the big mitten called?", "choices": ["oven mitt", "snow mittens", "fishing mittens", "fashion mittens"], "correct_choice_idx": 0, "direct_answers": ["oven mitt", "oven mitt", "mitt", "mitt", "oven mitt", "oven mitt", "oven mitt", "oven mitt", "mitt", "oven mitt"], "difficult_direct_answer": false, "rationales": ["This is a kitchen. people use the mittens to get things out of the stove and oven.", "The mitten is an oven mitt.", "The item is located in a kitchen. it allows a person to touch hot things without being burned."], "image": "train2014/COCO_train2014_000000335581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51628, "question_id": "cyaeBgW9XvJvxhiBaacpiG", "question": "What color socks are preferred by TV watchers who live here?", "choices": ["white", "black", "none", "argyle"], "correct_choice_idx": 0, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["There are four white socks.", "The absence of color is called 'white' and that's the color of these socks.", "People are wearing white socks while they sit in front of a television."], "image": "val2014/COCO_val2014_000000051628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508972, "question_id": "cybTpBkDVgkmDEnxnD9eHR", "question": "What phone feature is she using?", "choices": ["flip", "slide", "open", "zoom"], "correct_choice_idx": 1, "direct_answers": ["slide", "text", "iphone", "sidekick", "text", "text", "text", "texting", "screen", "text"], "difficult_direct_answer": false, "rationales": ["The phone appears to be held sideways and slid up to be able to type a message on a physical keyboard.", "The girl's phone has two different levels.", "The girl is holding the phone horizontally and it appears there is a piece that fits over the other. this style of phone was able to slide one piece over the other and that is what it would be called."], "image": "val2014/COCO_val2014_000000508972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417751, "question_id": "cyrxQ7XzQsZztrihn48jou", "question": "What design is painted onto the plate?", "choices": ["crisscross", "checkers", "chevron", "spider web"], "correct_choice_idx": 3, "direct_answers": ["spiderweb", "spider web", "spider web", "spider web", "spider web", "web", "spiderweb", "spider web", "spiderweb", "spider web"], "difficult_direct_answer": false, "rationales": ["A black plate has white stretched across it in a geometric pattern.", "The black plate with the white stripes is very similar to that of a spiderweb.", "It is white with the lines that go around and around with the eight edges and eight lines going towards the center."], "image": "train2014/COCO_train2014_000000417751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67847, "question_id": "czFEykkHUuLVxi97ZErE86", "question": "If you laid down exactly where the cameraman is what would give you the most speed?", "choices": ["just wait", "roll left", "roll right", "crawl forward"], "correct_choice_idx": 2, "direct_answers": ["air", "falling", "downhill", "frisbee", "downhill", "downhill gravity", "gravity", "wings", "roll right", "rolling"], "difficult_direct_answer": true, "rationales": ["The hill goes down to the right.", "If you rolled down the hill.", "The grass is sloped down to the right and that would cause someone to go the fastest."], "image": "train2014/COCO_train2014_000000067847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146202, "question_id": "czFfZcDshDhVbbw95AhhGF", "question": "What company is famous for making the type of vehicle here?", "choices": ["chrysler defense", "amazon", "ford", "boeing"], "correct_choice_idx": 2, "direct_answers": ["john deer", "ford", "ford", "ford", "ford", "john deer", "ford", "ford", "john deer", "ford"], "difficult_direct_answer": false, "rationales": ["Ford has been making trucks for a long time, and this vehicle in the field is clearly one of them.", "This was one of the first companies to manufacture pickup trucks.", "The company is ford."], "image": "train2014/COCO_train2014_000000146202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493797, "question_id": "czGW4jcszbJqDfWvAzPA9R", "question": "What body of water is shown here?", "choices": ["harbor", "river", "open sea", "stream"], "correct_choice_idx": 0, "direct_answers": ["lake", "ocean", "lake", "lake", "harbor", "fresh water", "harbor", "inlet", "lake", "lake"], "difficult_direct_answer": false, "rationales": ["The water is the harbor.", "There are many boats attached to docks visible in this image. the body of water where boats are docked in this manner is called a harbor.", "There are several boats parked in the water."], "image": "val2014/COCO_val2014_000000493797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194273, "question_id": "czH56akZo2PDbpS9DTDzV7", "question": "What public service does the structure seen here serve?", "choices": ["time keeping", "security", "policing", "cleaning"], "correct_choice_idx": 0, "direct_answers": ["time telling", "telling time", "tell time", "tower", "tourist landmark", "tell time", "time keeping", "clocktower", "clock tower", "local time"], "difficult_direct_answer": true, "rationales": ["There is a clock near the top of the structure.", "It tells the time.", "The little clock tower structures are used for time keeping."], "image": "train2014/COCO_train2014_000000194273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262099, "question_id": "czQdvDZnAdivrQifBujRoZ", "question": "To whom does the cart shown here belong?", "choices": ["bus depot", "shopping mall", "airport", "grocery store"], "correct_choice_idx": 2, "direct_answers": ["airport", "traveler", "passenger", "store", "traveler", "traveler", "traveler", "woman", "airport", "passenger"], "difficult_direct_answer": false, "rationales": ["The cart is owned by the airport.", "It has luggage on it with tags, and the tags are put on to alert which plane and destination the bags are to go to.", "The luggage belongs to the airport given the airport logos on each item."], "image": "train2014/COCO_train2014_000000262099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99938, "question_id": "czVa2zcX2Azmiz3g8VJGXU", "question": "Which actress has a famous uncle with a first name that matches the name on the book the boy is reading?", "choices": ["adelaide kane", "linnea quigley", "natalie portman", "samara weaving"], "correct_choice_idx": 3, "direct_answers": ["samara weaving", "jeanne hugo", "samara weaving", "samara weaving", "samara weaving", "no idea", "no clue", "emma roberts", "boss", "samara weaving"], "difficult_direct_answer": false, "rationales": ["The actress is samara.", "Samara weaving has a similar name to the uncle of hugo.", "Her uncle hugo was featured in films like the matrix, lord of the rings, and v for vendetta."], "image": "val2014/COCO_val2014_000000099938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121040, "question_id": "czayKrJQDGRnY7dXBRXojj", "question": "The man will be safe if he avoids getting hit by what?", "choices": ["camera", "birds", "kites", "air"], "correct_choice_idx": 2, "direct_answers": ["kites", "string", "kites", "kites", "kites", "kites", "string", "kites", "kites", "kites"], "difficult_direct_answer": false, "rationales": ["They're obviously in the background behind him and capable of falling on or flying into him.", "The large billowing kites behind this man are the only thing we can see that could soon collide with him.", "There are quite a few flying behind him."], "image": "train2014/COCO_train2014_000000121040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478811, "question_id": "czeKFpYZWMwtxfRDhtNNgT", "question": "What is the white food being stored in?", "choices": ["paper", "silicone", "jars", "plastic bags"], "correct_choice_idx": 3, "direct_answers": ["bags", "bags", "plastic bags", "plastic bags", "plastic bags", "plastic bags", "bags", "plastic bags", "plastic bags", "onions"], "difficult_direct_answer": false, "rationales": ["They are wrapped in clear to white objects and tied at the top.", "The mushrooms are tied up in plastic bags.", "A clear bag can be seen secured around the white food, with ties at the top of it."], "image": "train2014/COCO_train2014_000000478811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24169, "question_id": "czpW6Pf4TSHFxUAbPf77kY", "question": "The man in the back has what on his feet?", "choices": ["nothing", "socks", "shoes", "sandals"], "correct_choice_idx": 3, "direct_answers": ["truck", "flipflops", "flip flops", "flip flops", "flip flops", "sandals", "flip flops", "sandals", "sandals", "flipflops"], "difficult_direct_answer": false, "rationales": ["The man has footwear that has exposed toes.", "The man in question is clearly visible and located based on the text of the question. his footwear is visible and is consistent in shape and design with answer a.", "He is wearing open-toed footwear. his bare feet are visible."], "image": "train2014/COCO_train2014_000000024169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335633, "question_id": "d23FswHUHCtVQZEh5QzpYp", "question": "What kind of cat is it?", "choices": ["strayed cat", "farm cat", "domestic pet", "mountain cat"], "correct_choice_idx": 2, "direct_answers": ["tabby", "tabby", "happy cat", "tabby", "small cat", "striped", "domestic pet", "tabby", "tabby", "house"], "difficult_direct_answer": false, "rationales": ["The woman is at a restaurant in town.", "The cat is friendly and sitting on the woman's lap.", "The cat is domestic."], "image": "train2014/COCO_train2014_000000335633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326734, "question_id": "d23QSZ9M9Dbb9QhM3QXEjW", "question": "What food on the plate has the sweetest taste?", "choices": ["peppers", "strawberries", "meat", "vegetables"], "correct_choice_idx": 1, "direct_answers": ["strawberries", "strawberries", "strawberries", "strawberries", "strawberries", "strawberries", "strawberries", "strawberries", "strawberries", "strawberries"], "difficult_direct_answer": false, "rationales": ["The fruit on the table is the sweetest food.", "The food is berries.", "These are fruit which have sugar"], "image": "train2014/COCO_train2014_000000326734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355578, "question_id": "d2698T24rnpfdkZiMRXUJL", "question": "The flowers were placed in items that people wear on what part of their body?", "choices": ["feet", "arms", "nose", "head"], "correct_choice_idx": 0, "direct_answers": ["feet", "feet", "no image", "feet", "feet", "feet", "feet", "feet", "feet", "foot"], "difficult_direct_answer": false, "rationales": ["Boots belong on the bottom of legs.", "The flowers are in boots.", "The clothing items are boots. people do not wear boots on their head, arms, or nose."], "image": "train2014/COCO_train2014_000000355578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129293, "question_id": "d2BzLiLFvtoHzDmYqgPHVx", "question": "What type of protein is in the salad?", "choices": ["chicken nuggets", "beef", "ham", "tuna"], "correct_choice_idx": 2, "direct_answers": ["healthy", "ham", "ham", "ham", "healthy", "ham", "ham", "healthy", "ham", "ham"], "difficult_direct_answer": false, "rationales": ["The salad is ham.", "That is the only meat in the salad.", "The ham is full of protein."], "image": "train2014/COCO_train2014_000000129293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465418, "question_id": "d2EpdHkrJSBgPqF55pWGkx", "question": "What is the mesh bin intended for?", "choices": ["oil", "garbage", "recycling", "compost"], "correct_choice_idx": 1, "direct_answers": ["garbage can", "trash", "trash", "trash", "trash", "garbage", "garbage can", "trash", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["The bin is for garbage.", "It is for garbage because recycling bins usually are labeled with what you can put in them. compost is not done in the city and oil would leak out.", "The mesh bin is intended to recycle garbage."], "image": "val2014/COCO_val2014_000000465418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309599, "question_id": "d2VtmzRYc2x3KEuG3qxbKC", "question": "Who is the yellow line there to guide?", "choices": ["air marshall", "traffic control", "passengers", "pilot"], "correct_choice_idx": 3, "direct_answers": ["pilot", "pilot", "pilot", "plane", "pilots", "plane parking", "plane", "plane centering", "airplane pilot", "airplanes"], "difficult_direct_answer": false, "rationales": ["The yellow line guides the pilot's landing.", "The yellow line ensures the pilot steers properly.", "An airplane is being loaded at a gate at an airport."], "image": "train2014/COCO_train2014_000000309599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369139, "question_id": "d2fCDqtU6cyfK7JBWrm3xV", "question": "What might possibly flow outwards from the chrome devices?", "choices": ["oil", "water", "gas", "milk"], "correct_choice_idx": 1, "direct_answers": ["water", "water", "water", "water", "water", "chrome browsers", "water", "fire retardant", "water", "water"], "difficult_direct_answer": false, "rationales": ["Water flows from the chrome devices.", "The chrome devices are fire hydrants. oil, gas, or milk would not flow outwards from them.", "Water is likely to come out from these chrome devices."], "image": "train2014/COCO_train2014_000000369139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29094, "question_id": "d2hhhdxzE9VguwuuGM3fwz", "question": "What needs to be done for the sheep to feel cooler?", "choices": ["feeding", "grazing", "herding", "shearing"], "correct_choice_idx": 3, "direct_answers": ["cut wool", "shaved", "shearing", "shearing", "sheared", "sheer", "shearing", "shave wool", "shear wool", "sheer wool"], "difficult_direct_answer": false, "rationales": ["Shearing is the process of removing the wool from the sheep. wool insulates and keeps the sheep warm, so if it was removed, they would feel cooler.", "The sheep all have a lot of wool which is probably uncomfortable in warm weather, so they will be shaved soon.", "They have very long hair. long hair makes it hot for them."], "image": "val2014/COCO_val2014_000000029094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356755, "question_id": "d3CLUXFd5X3GnC6gLKMQnT", "question": "How many giraffes are engaging with one another?", "choices": ["none", "three", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["four", "three", "three", "four", "two", "three", "four", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three giraffes.", "There are three other giraffes.", "They are all doing their own thing and looking different directions"], "image": "train2014/COCO_train2014_000000356755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97492, "question_id": "d3DeyA7c5nj2fNpAFRhhdT", "question": "The white cylinder with a wire on the wall between the picture frames is used to control what device?", "choices": ["floor fan", "lamp", "radiator", "desktop computer"], "correct_choice_idx": 2, "direct_answers": ["thermostat", "lights", "radiator", "heat", "smoke detector", "managing device", "lights", "thermostat", "temperature", "temperature"], "difficult_direct_answer": false, "rationales": ["The lighting device on the table is remote capable.", "You can tell by the round shape of the device that is controls temperature.", "The cylinder is the radiator."], "image": "train2014/COCO_train2014_000000097492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301207, "question_id": "d3ooDQBeYHtRq4DsvxYPZB", "question": "What is the top lip of the structure here decorated with?", "choices": ["mire", "paint", "tile", "brick"], "correct_choice_idx": 2, "direct_answers": ["tile", "tiles", "tile", "tile", "tile", "tiles", "ceramic", "tile", "tape", "tile"], "difficult_direct_answer": false, "rationales": ["You can see the little squares near the top and these look like tiles.", "There are small glass squares ceramic pieces around top.", "The lip is a tile."], "image": "train2014/COCO_train2014_000000301207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58834, "question_id": "d46m45mqRJkoJnvDdMVNjp", "question": "Who took this photo?", "choices": ["girl", "boy", "toddler", "professional photographer"], "correct_choice_idx": 3, "direct_answers": ["hagen", "homeowner", "professional photographer", "hagenphotographer.com", "hagen", "owner", "hagen", "hagen", "homeowner", "hagan photographer"], "difficult_direct_answer": false, "rationales": ["The writing on the side of the picture gives credit to a professional photography company.", "The picture is very good and takes a lot of skill.", "There is a website of a professional photographer listed on the side of the picture."], "image": "train2014/COCO_train2014_000000058834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134715, "question_id": "d4BhxzibTWtMm36mSFbjWs", "question": "What is the blue/white/red item by the sink?", "choices": ["toothbrush", "bikini trimmer", "toilet brush", "nail clippers"], "correct_choice_idx": 0, "direct_answers": ["toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "tooth brush", "tooth brush"], "difficult_direct_answer": false, "rationales": ["The item is a toothbrush.", "The blue, white and red item standing sideways near the bathroom sink can be identified as a toothbrush due to its shape, size and bristles visible through the semi-translucent red protective case on top.", "Toothbrushes are kept by the sink."], "image": "val2014/COCO_val2014_000000134715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273771, "question_id": "d4GVU7KJkef8eF5iMThGDW", "question": "How might you describe the figurine's character?", "choices": ["soldier", "baker", "office worker", "dancer"], "correct_choice_idx": 2, "direct_answers": ["scary", "cat", "bat", "cat", "cat like", "dilbert", "dilbert", "office worker", "comedic", "miniature"], "difficult_direct_answer": false, "rationales": ["This is a character from a comic strip", "It is a dilbert figurine. he is an engineer.", "He is at a desk sitting in front of a computer."], "image": "train2014/COCO_train2014_000000273771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173825, "question_id": "d4LLvm7fhSAQmtDPez4355", "question": "What animal is on the couch?", "choices": ["iguana", "rabbit", "cat", "dog"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The head of a labrador or golden retriever is peeking out.", "The furry, floppy eared animal is a large pet located in a living room and appears clean and comfortable there.", "A dog is lying down."], "image": "val2014/COCO_val2014_000000173825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474118, "question_id": "d4gx2b22okt4hw5BNqfJQJ", "question": "What does the woman have on her feet?", "choices": ["slippers", "dress shoes", "boots", "sneakers"], "correct_choice_idx": 3, "direct_answers": ["shoes", "shoes", "shoes", "tennis shoes", "sneakers", "tennis shoes", "sneakers", "tennis shoes", "shoes", "shoes"], "difficult_direct_answer": false, "rationales": ["The woman is wearing sneakers.", "The woman is wearing canvas shoes with laces on her feet. these are features of sneakers.", "They are fabric shoes with laces"], "image": "train2014/COCO_train2014_000000474118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448317, "question_id": "d4t99QC9FuWtJBwda6jRqP", "question": "What brand of tennis racket is she using to play?", "choices": ["sportscraft", "wilson", "head", "nike"], "correct_choice_idx": 2, "direct_answers": ["head", "wilson", "wilson", "wilson", "wilson", "winston", "winston", "wilson", "wilson", "winston"], "difficult_direct_answer": false, "rationales": ["The tennis racket has the logo for wilson on it.", "The brand is head.", "This woman is using a wilson brand tennis racket."], "image": "train2014/COCO_train2014_000000448317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221701, "question_id": "d4vLTqup5eghhFttHE6p6M", "question": "What is/are contained inside the wood barrel?", "choices": ["wine", "coffee beans", "water", "melon juice"], "correct_choice_idx": 0, "direct_answers": ["wine", "wine", "wine", "alcohol", "wine", "booze", "wine", "wine", "wine", "alcohol"], "difficult_direct_answer": false, "rationales": ["There are wine bottles and people with wine glasses, and none of the other items in view.", "A barrel has bottles of wine on top of it. wine is kept in barrels.", "The barrel has wine."], "image": "train2014/COCO_train2014_000000221701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41687, "question_id": "d5KRg7TuMKxb6g9nhAds39", "question": "Where do these people ski?", "choices": ["lake", "sand", "private home", "large resort"], "correct_choice_idx": 2, "direct_answers": ["slope", "ski lodge", "in snow", "private home", "in snow", "ski slopes", "hills", "mountain", "hill", "on snow"], "difficult_direct_answer": true, "rationales": ["The people have a home.", "A two story structure with a fence around it and a residential looking yard.", "Looks like they are skiing in their backyard of their home."], "image": "val2014/COCO_val2014_000000041687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63043, "question_id": "d5KsLzrEdHgTrEWAws5Q7Z", "question": "Why has the man attached ropes to the cattle?", "choices": ["to lead", "to ride", "to punish", "to kill"], "correct_choice_idx": 0, "direct_answers": ["lead them", "move them", "leashes", "leading them", "to pull", "to lead", "rangel them", "reins", "pull them", "control"], "difficult_direct_answer": true, "rationales": ["The man is located in front of the cows and appears to be pulling them based on his body-positioning.", "The man is leading.", "He is using the ropes to pull the cows in a specific direction."], "image": "train2014/COCO_train2014_000000063043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534711, "question_id": "d64B6C56XgGfUbck5P5rpy", "question": "On which continent does this airport appear to be from?", "choices": ["asia", "south america", "north america", "europe"], "correct_choice_idx": 0, "direct_answers": ["asia", "japan", "japan", "asia", "japan", "asia", "china", "china", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The people seen in the airport are of asian decent. people of asian decent go to the airport in china to fly.", "The continent is asia.", "You can tell by the writing on the wall and ethnicity of the people as to where they are."], "image": "train2014/COCO_train2014_000000534711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183469, "question_id": "d6MjNtdc5nMYRhgRxrpX4b", "question": "The parrot on the right is what kind?", "choices": ["african gray", "stork", "seagull", "budgie"], "correct_choice_idx": 0, "direct_answers": ["cockatoo", "gray", "grey", "maca", "grey", "african gray", "cockatoos", "african grey", "gray parrot", "photoshopped"], "difficult_direct_answer": true, "rationales": ["The parrot on the right is gray colored.", "That is the type of parrot.", "The parrot is gray so african gray is a fitting name."], "image": "val2014/COCO_val2014_000000183469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256250, "question_id": "d6YhpfkYyr5c55VgsYGMLn", "question": "The giraffe in the front is probably related to the one behind in what way?", "choices": ["sibling", "parent", "partner", "none"], "correct_choice_idx": 1, "direct_answers": ["partner", "mother", "parent", "parent", "mother", "parent", "related", "mother", "mother", "mother"], "difficult_direct_answer": false, "rationales": ["The bigger giraffe is probably the mom.", "It is much larger than the small one", "The tall animal is possibly the parent."], "image": "val2014/COCO_val2014_000000256250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245675, "question_id": "d6iJkwnrSBnJbfwKNgkrWq", "question": "Which snowboarder is in the most danger?", "choices": ["straight legs", "blue board", "sitting down", "nobody"], "correct_choice_idx": 0, "direct_answers": ["blue board", "right snowboarder", "right one", "on right", "in black", "blue board", "right", "left", "straight legs", "colliding"], "difficult_direct_answer": true, "rationales": ["It is most likely that the blue board would not face the snow where it's rider to fall to the ground in the position they are now suspended in the air in. falling in such a way is more likely to injure the rider than were it to reach the ground underside of the board first.", "The snowboarder with straight legs has the least good balance.", "The snowboarder has legs."], "image": "train2014/COCO_train2014_000000245675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541627, "question_id": "d6jWsxCmXqrGySGVskEscs", "question": "What does the vehicle that will be used to move the plane face?", "choices": ["nothing", "plane", "airport side", "tow truck"], "correct_choice_idx": 1, "direct_answers": ["plane", "right", "truck", "plane", "plane", "plane", "airplane", "danger", "luggage", "plane"], "difficult_direct_answer": false, "rationales": ["This pushback will connect to the wheel of this large flying vehicle to move it.", "This is self-explanatory.", "Because it has to be positioned that way in order to help the plane move"], "image": "val2014/COCO_val2014_000000541627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402384, "question_id": "d6k2VwQjpq5FHsLw4yiLau", "question": "What would be the best name for the activity the skateboarder is doing?", "choices": ["downhill", "park", "half pipe", "street skating"], "correct_choice_idx": 0, "direct_answers": ["skateboarding", "gliding", "skateboarding", "downhill", "skating", "skateboarding", "street luge", "riding", "skateboarding", "downhill skateboarding"], "difficult_direct_answer": false, "rationales": ["The person is riding down the street.", "They are going downhill", "The person is on a board and they are skating on the street."], "image": "train2014/COCO_train2014_000000402384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131533, "question_id": "d6pmdCEKjXP5kzGKjkBSoR", "question": "What kind of a person is usually found in a building like this?", "choices": ["prisoner", "christian", "atheist", "shaolin monk"], "correct_choice_idx": 1, "direct_answers": ["pope", "priest", "priest", "christian", "priest", "priest", "religious", "priest", "priest", "religious"], "difficult_direct_answer": false, "rationales": ["The person is christian.", "Churches generally accommodate christ-centered people.", "Crosses and cathedrals are sure signs that christians are included."], "image": "train2014/COCO_train2014_000000131533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249856, "question_id": "d6v3kDvD3N3jSuT5QeKR9Z", "question": "What is the purpose of the stick in the sandwich?", "choices": ["eat it", "keep together", "hold it", "garnish"], "correct_choice_idx": 1, "direct_answers": ["stability", "hold together", "clean teeth", "hold sandwich", "main structure", "cutting", "hold sandwich", "keep together", "hold together", "keep together"], "difficult_direct_answer": false, "rationales": ["The purpose holds it together.", "The sandwich would slide and fall apart if nothing was holding it.", "It keeps the ingredients from falling out of the bread."], "image": "train2014/COCO_train2014_000000249856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566189, "question_id": "d6xy8Zgjw5ZvYH8NbnefHv", "question": "What motion must one take if someone wants to flush?", "choices": ["nothing", "kick", "crouch", "reach up"], "correct_choice_idx": 3, "direct_answers": ["push down", "reach up", "pull", "stand", "downward", "push", "reach", "pull", "button", "reach"], "difficult_direct_answer": false, "rationales": ["They must pull the strings by the tanks.", "You have to press up if you want to flush.", "The motion is upward."], "image": "val2014/COCO_val2014_000000566189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140115, "question_id": "d6yyFjrV84Fc393E8q2TtZ", "question": "What is there a picture of on the yellow shirt?", "choices": ["elves", "swords", "cows", "shoes"], "correct_choice_idx": 3, "direct_answers": ["shoes", "sneakers", "shoes", "shoes", "shoes", "shoe", "shoes", "shoes", "converse", "shoes"], "difficult_direct_answer": false, "rationales": ["The picture is shoes.", "There is a pair of black high tops on the shirt.", "Sneakers are shown on the shirt."], "image": "train2014/COCO_train2014_000000140115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30235, "question_id": "d7KoQ6CmqZWsUbjN2n6zr6", "question": "Which country is this plane based in?", "choices": ["mexico", "great britain", "belize", "usa"], "correct_choice_idx": 1, "direct_answers": ["england", "england", "uk", "great britain", "america", "uk", "united kingdom", "united kingdom", "britain", "britain"], "difficult_direct_answer": false, "rationales": ["The flags on the plane are for that country.", "There is a union jack near the front of this plane. this flag is associated with the united kingdom.", "By the flag and the name on the plane, it lets you know what country the plane is from."], "image": "train2014/COCO_train2014_000000030235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18930, "question_id": "d7Rh84WYxs6PT9ZUG4Q7V8", "question": "Which item does the player in red primarily want to control here?", "choices": ["baseball", "football", "minds", "sun"], "correct_choice_idx": 1, "direct_answers": ["football", "football", "football", "football", "football", "football", "football", "football", "football", "football"], "difficult_direct_answer": false, "rationales": ["The ball is elongated.", "The player is wearing a football uniform and is carrying a football. points are scored in football by moving the ball into the end zone.", "The football will be controlled."], "image": "train2014/COCO_train2014_000000018930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232919, "question_id": "d84ddmmtRpNLHAZ7YEzhj5", "question": "What sort of image is in the frame mounted on the wall?", "choices": ["drawing", "collage", "photograph", "painting"], "correct_choice_idx": 2, "direct_answers": ["person motorcycle", "sports", "picture", "photograph", "motorbike racer", "photograph", "photograph", "picture", "outdoors", "man"], "difficult_direct_answer": false, "rationales": ["It would appear to be the a option. it's hard to see it behind the guy.", "There is a framed photograph of a person hanging on the wall.", "It's a picture someone took."], "image": "train2014/COCO_train2014_000000232919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27006, "question_id": "d8835zqRNS8XEatk9j8tCU", "question": "How do these people know each other?", "choices": ["coworkers", "rivals", "spouses", "teammates"], "correct_choice_idx": 2, "direct_answers": ["partners", "married", "married", "married", "spouses", "married", "married", "married", "couple", "couple"], "difficult_direct_answer": false, "rationales": ["The people are spouses.", "These people are bride and groom.", "You can tell by what they are wearing and the scene, as what there relationship to eachother is."], "image": "train2014/COCO_train2014_000000027006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50514, "question_id": "d8Sz9LEssYjNZZ7T3FXvKW", "question": "What location are people strolling in?", "choices": ["mall shops", "race track", "bazaar", "plaza"], "correct_choice_idx": 3, "direct_answers": ["square", "old square", "city", "city plaza", "city square", "courtyard", "plaza", "courtyard", "plaza", "castle"], "difficult_direct_answer": false, "rationales": ["The people are on an outdoor courtyard.", "The location is a plaza.", "There are no stores, race cars, or vendors in this location."], "image": "val2014/COCO_val2014_000000050514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281475, "question_id": "d8hXvX7kAA9VChXNz3CrFv", "question": "In which country is this photo taken?", "choices": ["bolivia", "canada", "el salvador", "usa"], "correct_choice_idx": 3, "direct_answers": ["usa", "united states", "usa", "america", "america", "usa", "united states", "america", "united states", "usa"], "difficult_direct_answer": false, "rationales": ["The country is the usa.", "The answer is unknowable, but the girl has an american flag on her shirt.", "The woman has an american flag on her shirt so implication is that photo taken here in the usa."], "image": "val2014/COCO_val2014_000000281475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24253, "question_id": "d8mKL5VgHzZjmP4RB428kn", "question": "What is done is this room?", "choices": ["sleeping", "eating", "bathing", "cooking"], "correct_choice_idx": 0, "direct_answers": ["decorated", "decorated", "sleep", "sleep", "sleeping", "sleeping", "bed", "sleeping", "sleeping", "sleeping"], "difficult_direct_answer": false, "rationales": ["Sleeping can be doen.", "There is a bedroom in the room.", "People sleep on a bed at night or nap during the day."], "image": "train2014/COCO_train2014_000000024253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37433, "question_id": "d8odxHE24ycDn7HmTQqQvR", "question": "What beverage is probably in the bucket?", "choices": ["cider", "champagne", "wine", "water"], "correct_choice_idx": 1, "direct_answers": ["wine", "champagne", "champagne", "champagne", "wine", "champagne", "wine", "champagne", "wine", "champagne"], "difficult_direct_answer": false, "rationales": ["There might be a bottle of champagne in the bucket.", "The beverage is champagne.", "Champagne is usually stored in ice buckets."], "image": "train2014/COCO_train2014_000000037433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223914, "question_id": "d8qk6CTDa4GfhhpX8GNxfp", "question": "What does Adidas do to the game?", "choices": ["provides funding", "provides venue", "provides transportation", "sponsors apparels"], "correct_choice_idx": 3, "direct_answers": ["sponsor", "sponsor", "sponsor", "sponsor", "footbal", "sponsors apparels", "supports players", "sponsor", "sponsor", "sponsor"], "difficult_direct_answer": false, "rationales": ["There are players that have the adidas logos on their jerseys distinct by both the logo itself and the writing underneath. when a sports apparel's company logo is on a jersey they are said to sponsor the apparel.", "They help make sport apparel.", "Adidas is the sponsor."], "image": "train2014/COCO_train2014_000000223914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66397, "question_id": "d8sKVtKj4xj5ARFg5fyAHX", "question": "What is behind the front skier?", "choices": ["post", "photographer", "cat", "body"], "correct_choice_idx": 0, "direct_answers": ["wooden fence", "post", "person", "trees", "fence", "fence", "snow", "fence", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["The post is in front.", "The skier is near a post.", "There are sign posts and fence posts near the skier."], "image": "val2014/COCO_val2014_000000066397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135139, "question_id": "d8yYU34U37rbh86BLo2JWz", "question": "What is the brown post behind the green fruit made of?", "choices": ["sand", "concrete", "wood", "plastic"], "correct_choice_idx": 2, "direct_answers": ["wood", "wood", "wood", "wood", "wood", "wood", "wood", "pine", "pine", "wood"], "difficult_direct_answer": false, "rationales": ["The post is wood.", "The post is brown and grainy. it is able to be nailed and it is sturdy.", "Wood is brown while the rest are fruits."], "image": "val2014/COCO_val2014_000000135139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531146, "question_id": "d97hhrG3qDP3uzVBgWzQRj", "question": "Why is the windshield on the bus so large?", "choices": ["reinforcement", "aerodynamics", "visibility", "safety"], "correct_choice_idx": 2, "direct_answers": ["big bus", "for visibility", "visibility", "visibility", "safe", "visibility", "driver visibility", "better view", "long jurney", "maximum vision"], "difficult_direct_answer": false, "rationales": ["So the driver can see all around them.", "The driver needs to be able to see in all directions.", "Bus drivers need to be able to see in all directions. the positioning of the seat through and size of the vehicle make it difficult. it would be more so if the windshield wasn't so large."], "image": "train2014/COCO_train2014_000000531146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219663, "question_id": "d9SQeiH64cqgnQmhZjnT7k", "question": "What sport are the two men playing?", "choices": ["soccer", "disc golf", "basketball", "baseball"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "frisbee golf", "frisbee", "frisbee", "frisbee", "ultimate frisbee", "disc golf", "disc golf", "frisbee golf", "disc golf"], "difficult_direct_answer": false, "rationales": ["The throw of the ball, by the man shows the type of game being played.", "The men are throwing frisbees at a goal with a chain net.", "The people are using and holding frisbees. baseball, basketball, and soccer use balls, not frisbees."], "image": "train2014/COCO_train2014_000000219663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369221, "question_id": "d9eq7oe2uUrLk7caYcR7r3", "question": "What do these people come to this area for?", "choices": ["animal catching", "hunting", "ocean", "tree searching"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "surfing", "to surf", "beach", "ocean", "beach", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The people appear to be on a beach and have surfboards nearby. surfboards are most commonly deployed in the ocean.", "The sandy features, people laid out sunbathing on towels, surfboards and wet suits in this image tell us a large body of water must be nearby.", "Some of them are wearing wetsuits and standing next to surf boards, and surf boards are deemed useless unless used in a body of water with waves, such as the ocean."], "image": "val2014/COCO_val2014_000000369221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369961, "question_id": "d9ov7qXAc467QTNp5BhSVX", "question": "What is this area used for?", "choices": ["growing food", "parties", "frisbee golf", "escape"], "correct_choice_idx": 0, "direct_answers": ["farm", "cultivation", "farming", "growing crops", "planting crops", "straw", "growing food", "planting crops", "farming", "farming"], "difficult_direct_answer": false, "rationales": ["There is a tractor there and there are dead crops.", "The area is a field.", "The boy is running in a field."], "image": "train2014/COCO_train2014_000000369961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355111, "question_id": "d9rPqknCuka9jF5wWKR4CU", "question": "Which duo is burning the most calories?", "choices": ["standing", "middle sitting", "back sitting", "front sitting"], "correct_choice_idx": 0, "direct_answers": ["sledders", "upright duo", "skiers", "standing one", "standing", "standing", "standing one", "fat one", "skiers", "standing"], "difficult_direct_answer": false, "rationales": ["The people who are sitting are not burning many calories. the other people are burning more.", "The ones standing are doing the most.", "It takes more muscle power to stay upright."], "image": "train2014/COCO_train2014_000000355111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430155, "question_id": "dA25MnCBP8joT8n6KGY2XG", "question": "What are the overhead wires for?", "choices": ["powering homes", "telephone lines", "streetcars", "powering businesses"], "correct_choice_idx": 2, "direct_answers": ["electric buses", "streetcars", "trolley", "conduct electricity", "electricity", "streetcars", "electricity", "connecting power", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["The overhead wires that are pictured are not that high off the ground. that would mean that it would have to frequently connecting to something on the ground like a streetcar.", "Large wires extend down and across a street in a town.", "The overhead wires guide streetcars."], "image": "val2014/COCO_val2014_000000430155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550980, "question_id": "dAbMnBwdgRwrDAoW34AnYr", "question": "Where is this train heading?", "choices": ["8th avenue", "central pk", "rockaway", "pike ave"], "correct_choice_idx": 2, "direct_answers": ["rockaway", "rockaway", "rockaway", "rockaway", "rockaway", "rockaway", "rockaway", "rockaway", "rockaway", "rockaway"], "difficult_direct_answer": false, "rationales": ["There is a sign saying they are on their way to rockaway on the back of the train.", "This is indicated by both the decorative banner and the label on top of the train.", "The train is going to rockaway."], "image": "val2014/COCO_val2014_000000550980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258019, "question_id": "dAmB6RqvByGVJc6NF97ysZ", "question": "Why has this person sat down?", "choices": ["eat", "felt faint", "pet dog", "tie shoe"], "correct_choice_idx": 0, "direct_answers": ["eating", "eating", "to eat", "to eat", "eat", "to eat", "eating", "to eat", "eating", "to eat"], "difficult_direct_answer": false, "rationales": ["A person is eating a sandwich. people sit down to eat.", "The person is holding a sandwich to their mouth.", "He has food in his hands"], "image": "val2014/COCO_val2014_000000258019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501786, "question_id": "dB8mDfHkEG5brxAJJN6XYG", "question": "What type bear pelt is seen or imitated here?", "choices": ["polar", "panda", "grizzly", "brown"], "correct_choice_idx": 0, "direct_answers": ["polar", "pillow", "polar bear", "polar bear", "polar bear", "polar", "polar", "white", "polar bear", "polar"], "difficult_direct_answer": false, "rationales": ["The pelt is a white color. that is specific to one kind of bear.", "The fur is white and fluffy which means it comes from a white polar bear.", "This type of bear usually has white fur as this pelt does."], "image": "train2014/COCO_train2014_000000501786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134137, "question_id": "dBBzGahSWnQgpwQeZnWTe7", "question": "What is the next number in the sequence?", "choices": ["one", "ten", "two", "eight"], "correct_choice_idx": 2, "direct_answers": ["6544", "two", "seven", "seven", "two", "two", "seven", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The number is the next number in the list.", "The signs show numbers going down starting from 6 going to 3. the next number would be 2.", "The number is 2."], "image": "train2014/COCO_train2014_000000134137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84851, "question_id": "dBX52heKb9U4TUoH6sxVBT", "question": "Which item is the wrong color?", "choices": ["bat", "helmet", "pants", "ball"], "correct_choice_idx": 3, "direct_answers": ["baseball", "ball", "bat", "baseball", "baseball", "baseball", "baseball", "ball", "pants", "yellow"], "difficult_direct_answer": false, "rationales": ["The ball should be white.", "Baseballs are usually white, not yellow.", "The color is the ball."], "image": "train2014/COCO_train2014_000000084851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276417, "question_id": "dBaqVWPLaG5PPUV8n5N9bW", "question": "What is the black crate used for?", "choices": ["holding gloves", "holding bats", "holding food", "holding balls"], "correct_choice_idx": 3, "direct_answers": ["holding balls", "balls", "holds balls", "hold balls", "hold baseballs", "hold baseballs", "hold balls", "balls", "balls", "store baseballs"], "difficult_direct_answer": false, "rationales": ["The black crate stores balls.", "The crate is full of round spheres used for playing baseball.", "A crate can hold items."], "image": "train2014/COCO_train2014_000000276417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278528, "question_id": "dC25kJKBAf4pfQtsDAYDJL", "question": "The flying object is moved by what power?", "choices": ["electricity", "wind", "manual force", "solar"], "correct_choice_idx": 1, "direct_answers": ["wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["The object is by wind.", "Wind is keeping the kite moving.", "The place seems to have free moving wind hence the power comes from the wind."], "image": "train2014/COCO_train2014_000000278528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225608, "question_id": "dC6PQNgU4PEPgZAsMKps4M", "question": "What color on the bottom sign is out of place?", "choices": ["black", "red", "yellow", "silver"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The red sign is misplaced.", "The sign should be in yellow and black. the other color is painted.", "An orange street sign is spray painted with red spray paint."], "image": "train2014/COCO_train2014_000000225608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513811, "question_id": "dCPrsRD4NykMDWxyvxsQXV", "question": "What are the green vegetables next to the red tomatoes on the left-side pizza?", "choices": ["snap peas", "jalapenos", "green tomatoes", "broccoli"], "correct_choice_idx": 2, "direct_answers": ["spinach", "peppers", "green tomatoes", "peppers", "green tomatoes", "peppers", "green tomatoes", "peppers", "spinach", "green tomatoes"], "difficult_direct_answer": false, "rationales": ["They are green but in the same shape and size as the red ones.", "The veggies are tomatoes.", "The green vegetables are sliced tomatoes."], "image": "train2014/COCO_train2014_000000513811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95611, "question_id": "dCRpSvGdSAoYfGA4CC3N8L", "question": "What are the horses being used for?", "choices": ["petting", "field work", "stomping", "riding"], "correct_choice_idx": 1, "direct_answers": ["plowing", "plowing", "working field", "field work", "farming", "pulling", "plow", "farming", "planting", "tilling land"], "difficult_direct_answer": false, "rationales": ["The horses work.", "These horses are used to plow the fields.", "The horses are being used to plow the dirt."], "image": "train2014/COCO_train2014_000000095611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561599, "question_id": "dCawdjyYBZRmJZq8GSDEYM", "question": "What can be used to exercise behind the cat?", "choices": ["skateboard", "roller skates", "scooter", "bicycle"], "correct_choice_idx": 3, "direct_answers": ["bicycle", "bike", "bicycle", "bike", "bike", "bike", "bicycle", "bicycle", "bicycle", "bike"], "difficult_direct_answer": false, "rationales": ["There is a bicycle used for exercises behind the cat.", "This outdoor sport increases cardio fitness and it is considered low impact.", "There is a two, not four, wheeled vehicle parked behind the cat."], "image": "val2014/COCO_val2014_000000561599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340047, "question_id": "dCjvPbfCRGb9EKNfjyamYf", "question": "Who spoke the language that these words are in?", "choices": ["jay thomas", "burt reynolds", "jackie robinson", "albert einstein"], "correct_choice_idx": 3, "direct_answers": ["albert einstein", "hitler", "hitler", "hitler", "albert einstein", "germans", "germans", "german", "german", "hitler"], "difficult_direct_answer": false, "rationales": ["That language is spoken by that german person.", "One of the greatest physicists of all time was german.", "The words are in german. he was german."], "image": "val2014/COCO_val2014_000000340047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367823, "question_id": "dDEYkFofoSKAdqF5vYUr3x", "question": "Who makes the gloves the man is wearing?", "choices": ["mizuno", "gucci", "derek zoolander", "rawlings"], "correct_choice_idx": 3, "direct_answers": ["rawlings", "rawlings", "baseball", "player", "riddell", "rawlings", "rawlings", "rawlings", "rawlings", "glows"], "difficult_direct_answer": false, "rationales": ["The logo is from the rawlings company.", "There is the logo on the back of the glove.", "Rawlings is what the r stands for."], "image": "train2014/COCO_train2014_000000367823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24411, "question_id": "dDKy9sSkctmMqAY4AscizB", "question": "How many females in the photo eating pizza?", "choices": ["four", "two", "none", "six"], "correct_choice_idx": 2, "direct_answers": ["zero", "one", "0 females", "one", "none", "one", "zero", "zero", "zero", "zero"], "difficult_direct_answer": false, "rationales": ["There are none.", "The people eating pizza are male.", "All pizza eaters appear to be men."], "image": "train2014/COCO_train2014_000000024411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242673, "question_id": "dEVMmigGqhKhyZsBjs2du3", "question": "What time is it likely to be?", "choices": ["noon", "845 am", "midnight", "dusk"], "correct_choice_idx": 1, "direct_answers": ["rush hour", "4pm", "rush hour", "rush hour", "rush hour", "500", "845 am", "rush hour", "500", "rush hour"], "difficult_direct_answer": false, "rationales": ["This is rush hour traffic as people try to get to work", "It appears by the look of the traffic that it is rush hour.", "People are commuting in rush hour traffic."], "image": "val2014/COCO_val2014_000000242673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543676, "question_id": "dEXCRqMta4jHMrSHYW82vy", "question": "What does the lattice in front of the man prevent?", "choices": ["glare", "animals", "escaping", "falling"], "correct_choice_idx": 3, "direct_answers": ["falling out", "derailing", "falling out", "falling", "falling", "falling out", "falling", "falling", "falling", "accidents"], "difficult_direct_answer": false, "rationales": ["The lattice prevents falls.", "Lattice is on the bottom half of the doorway of a train. a barrier near a drop off helps people avoid falling on accident.", "The lattice keeps people and trains from falling."], "image": "val2014/COCO_val2014_000000543676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550652, "question_id": "dEXknwkYNgw5CPyi2rMx98", "question": "What style tennis will these men play today?", "choices": ["mixed doubles", "canadian doubles", "singles", "mens doubles"], "correct_choice_idx": 3, "direct_answers": ["doubles", "doubles", "doubles", "doubles", "mens doubles", "doubles", "doubles", "doubles", "doubles", "doubles"], "difficult_direct_answer": false, "rationales": ["There are two people on each side of the net making a total of four players. the players are all the same sex meaning they are not mixed.", "There are two men on each side of the court. having this amount of men gives equal teams.", "There are four tennis players. they all belong to the same sex."], "image": "train2014/COCO_train2014_000000550652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528006, "question_id": "dFE2TcjiogNCm67vX38Byv", "question": "What is the vertical back fin piece on the plane called?", "choices": ["flap", "rudder", "slat", "aileron"], "correct_choice_idx": 1, "direct_answers": ["tail", "rudder", "tail", "tail", "tail", "stabilizer", "tail", "tail", "tail", "tail"], "difficult_direct_answer": false, "rationales": ["That is the rudder used to steer the plan in the direction they want to go", "It is well known to be called a rudder to help with the motion of the plane.", "Just like a boat, a plane needs to change direction while in motion. it uses a rudder, but i couldn't learn this from the picture."], "image": "train2014/COCO_train2014_000000528006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389921, "question_id": "dFEa4JfKYEhMGnRvcvAeyq", "question": "What is the possible danger faced by the man?", "choices": ["concussion", "broken hip", "broken backbone", "broken wrist"], "correct_choice_idx": 0, "direct_answers": ["falling", "hitting head", "falling", "falling over", "fall", "concussion", "falling", "head injury", "falling", "tripping"], "difficult_direct_answer": false, "rationales": ["A man is jumping and flipping over and his head is down near the ground.", "He looks like he's about to fall on his head.", "The danger is a concussion."], "image": "train2014/COCO_train2014_000000389921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327810, "question_id": "dFGo6R5x2xRdT2CrxRn4XF", "question": "The robots on the field are modeled after which animal?", "choices": ["sheep", "dog", "cat", "rabbit"], "correct_choice_idx": 1, "direct_answers": ["dog", "dogs", "dogs", "dog", "dog", "dogs", "dogs", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The robots on the field have four legs and look like dogs.", "They are made to look like dogs as they compete.", "The robots are dogs."], "image": "train2014/COCO_train2014_000000327810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516414, "question_id": "dFHGCvpk5BwFCRGeTHJTvT", "question": "What yellow items sits on the boat?", "choices": ["stockings", "bananas", "net", "mustard"], "correct_choice_idx": 2, "direct_answers": ["nets", "fishing nets", "yellow nets", "bananas", "clams", "seaweed", "nets", "net", "rope", "bananas"], "difficult_direct_answer": false, "rationales": ["The item looks to be made of material formed in the shape of a net.", "Piles of yellow netting are on a boat. netting is used on boats.", "The items are nets."], "image": "val2014/COCO_val2014_000000516414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151352, "question_id": "dFJ5Q42zhtoC9pN4A2KJDw", "question": "What are these skiers involved in?", "choices": ["race", "waxing", "clothes drying", "shredding"], "correct_choice_idx": 0, "direct_answers": ["competition", "race", "race", "competition", "competion", "race", "racing", "race", "cross county", "ski race"], "difficult_direct_answer": false, "rationales": ["The skiiers are wearing numbers on them and are trying to go fast.", "The skiers are racing.", "The skiers are all involved in a big race."], "image": "train2014/COCO_train2014_000000151352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483234, "question_id": "dFJVpTquMxGGse9hDvvqMx", "question": "Where do people store their boards when they remove them here?", "choices": ["ski lift", "shed", "ground", "against tree"], "correct_choice_idx": 3, "direct_answers": ["against tree", "trees", "against tree", "vehicle", "garage", "near tree", "inside building", "against tree", "against tree", "tree"], "difficult_direct_answer": false, "rationales": ["When people are done with their outdoor objects, they put them up in a shed to avoid them being stolen or destroyed.", "The people are against a tree.", "They are standing them up so they stay out of the way."], "image": "val2014/COCO_val2014_000000483234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507551, "question_id": "dFPMbFhABz9tH4xyaKP25i", "question": "Why is the bat resting on his shoulder?", "choices": ["hiding bat", "hit ball", "resting", "stealing bat"], "correct_choice_idx": 1, "direct_answers": ["hitting ball", "waiting", "technique", "swinging", "swinging stance", "to ready", "hit ball", "swinging", "taking aim", "swinging"], "difficult_direct_answer": false, "rationales": ["A baseball player normally taking a stance prior to hitting a baseball. also, he will most often rest his bat on his shoulder before he hits ball.", "The bat is to hit.", "The boy is about to bat."], "image": "val2014/COCO_val2014_000000507551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197407, "question_id": "dFeiPtQ7axCfgPu34Q4cYZ", "question": "What kind of sausage is the bacon wrapped around?", "choices": ["hot dog", "bratwurst", "polish sausage", "knockwurst"], "correct_choice_idx": 0, "direct_answers": ["kielbasa", "hot dog", "hotdogs", "hot dog", "hotdog", "kielbasa", "hot dog", "hot dog", "american", "hot dog"], "difficult_direct_answer": false, "rationales": ["Hot dogs are wrapped.", "The hot dog has condiments.", "The bacon is wrapped around an elongated and skinny item on the grill. the hot dog would most likely be that item as it is a crowd favorite."], "image": "train2014/COCO_train2014_000000197407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251600, "question_id": "dFisE55EcyTA7LDQ4esbpp", "question": "In which country were these vintage motorcycles manufactured?", "choices": ["united kingdom", "united states", "japan", "germany"], "correct_choice_idx": 3, "direct_answers": ["germany", "united states", "germany", "germany", "germany", "united states", "germany", "germany", "united states", "germany"], "difficult_direct_answer": false, "rationales": ["Motorcycles are from germany.", "The name on the bottom right of the image is in german.", "Each motorcycle has a bmw logo. the b in bmw stands for bayerische."], "image": "train2014/COCO_train2014_000000251600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63595, "question_id": "dFrQrFBrAFACQTdKU6zmZE", "question": "This athlete is most likely to face who in a match?", "choices": ["lennox lewis", "dennis rodman", "bo jackson", "roger federer"], "correct_choice_idx": 3, "direct_answers": ["opponent", "tennis player", "male", "tennis player", "roger federer", "tennis", "tennis", "tennis", "another player", "andre agassi"], "difficult_direct_answer": false, "rationales": ["Roger federer is a tennis player.", "Federer plays tennis.", "Roger federer plays tennis."], "image": "val2014/COCO_val2014_000000063595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118679, "question_id": "dFuVUJpB53bQvyjKLBBBJb", "question": "What might have happened to the tennis player for him to react like this?", "choices": ["gained point", "gastro intestinal", "lost point", "fear"], "correct_choice_idx": 2, "direct_answers": ["lost", "poor play", "poor performance", "lost match", "he lost", "missed shot", "bad serve", "missed shot", "miss ball", "lost point"], "difficult_direct_answer": true, "rationales": ["He has an angry look on his face. he is snarling.", "It's hard to win when you don't get the numbers.", "He looks like he realizes he made a mistake."], "image": "val2014/COCO_val2014_000000118679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380348, "question_id": "dFxT2TsptiYkYMxn4anUr6", "question": "What is intended to rock back and forth?", "choices": ["chair", "cabinet", "table", "stool"], "correct_choice_idx": 0, "direct_answers": ["chair", "chair", "chair", "rocking chair", "rocking chair", "chair", "rocking chair", "chair", "rocking chair", "chair"], "difficult_direct_answer": false, "rationales": ["There is a white rocker on the porch.", "The chair has a curved bottom so it can rock.", "The chair is rocking."], "image": "val2014/COCO_val2014_000000380348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397742, "question_id": "dG8snULxDyugvcp2WzwVt6", "question": "What does the white portion of the food offer the most?", "choices": ["calcium", "fat", "carbohydrate", "protein"], "correct_choice_idx": 3, "direct_answers": ["dairy", "eggs", "protein", "protein", "protein", "carbohydrates", "protein", "egg white", "cheese", "egg"], "difficult_direct_answer": false, "rationales": ["The white stuff on the food is cheese. cheese has a lot of calcium.", "The white part is attached to the yellow part, indicating it is an egg, which is filled with protein.", "The portion has protein."], "image": "val2014/COCO_val2014_000000397742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515765, "question_id": "dGCUEj2m4f3nmumxezaeMA", "question": "What hour does the clock face show?", "choices": ["three", "six", "five", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["The small hand is pointed to 4.", "The hour is four o'clock.", "Clock hands are pointing at the four."], "image": "val2014/COCO_val2014_000000515765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129175, "question_id": "dGPutzBDDrVwbLiBqeVMvo", "question": "What are these boats called?", "choices": ["gondola", "tugboat", "rowboat", "putter"], "correct_choice_idx": 0, "direct_answers": ["gondolas", "gondola", "gondolas", "gondolas", "blak gondolas", "gondola", "cannot", "rondola", "gondolas", "gondala"], "difficult_direct_answer": false, "rationales": ["This type of vessel is used to traverse canal waters in venice. this boat is carrying four passengers.", "The boats are gondolas.", "These italian boats are called gondola."], "image": "val2014/COCO_val2014_000000129175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261521, "question_id": "dGeDbLDy9kjNKbh3aytXUB", "question": "How do these people know each other?", "choices": ["spouses", "coworkers", "teammates", "rivals"], "correct_choice_idx": 0, "direct_answers": ["married", "married", "dating", "couple", "married", "spouses", "married", "married", "husband wife", "married"], "difficult_direct_answer": false, "rationales": ["They are in an intimate pose.", "They're spouses.", "The people are holding each other in a familiar embrace and looking at each other lovingly which would be consistent with answer a."], "image": "train2014/COCO_train2014_000000261521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370475, "question_id": "dGf5EHfrsaSor2ajwmYSCs", "question": "From which direction did this person come?", "choices": ["left up", "below", "in front", "no where"], "correct_choice_idx": 0, "direct_answers": ["leftside", "up", "from uphill", "up", "left up", "west", "above", "north", "mountain top", "left"], "difficult_direct_answer": true, "rationales": ["This person is skiing right and down. they came from the opposite direction.", "The orientation of the person on the slope and the trail their skis have made in the snow implies they came from answer a.", "The person is sliding downhill from the top left."], "image": "val2014/COCO_val2014_000000370475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456076, "question_id": "dGfX7xfqzywMfGSn3b93LM", "question": "What is an alcohol percentage of hand sanitizer?", "choices": ["90%", "60%", "50%", "100%"], "correct_choice_idx": 1, "direct_answers": ["60%", "60%", "sixty", "90", "no idea", "table", "ten", "under 1", "70%", "60 percent"], "difficult_direct_answer": true, "rationales": ["Hand sanitizer has a little over half of its content as alcohol.", "Good hand sanitizer is at least 90% efective.", "It's 60% of hand sanitizer."], "image": "train2014/COCO_train2014_000000456076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92761, "question_id": "dH5wHWBF2GCcdHA3KeRAGL", "question": "Who is in the greatest danger?", "choices": ["right man", "no one", "both men", "left man"], "correct_choice_idx": 3, "direct_answers": ["left surfer", "surfer", "waves", "left person", "fallen person", "left man", "sharks", "surfer", "surfer", "surfer"], "difficult_direct_answer": false, "rationales": ["The person that is knocked down with their head barely above water would appear to be in the most danger.", "The man on the left side of the image appears to be at water level with waves potentially crashing over him which could present danger.", "He's down in the water as the wave crashes"], "image": "train2014/COCO_train2014_000000092761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374395, "question_id": "dHDUELDvUaJPrJ6n85yiN5", "question": "What is found on the surfboard to allow the surfer to stay on it?", "choices": ["straps", "glue", "tape", "surfboard wax"], "correct_choice_idx": 3, "direct_answers": ["wax", "grips", "wax", "wax", "texture", "surfboard wax", "rip cord", "wave", "wax", "resin"], "difficult_direct_answer": false, "rationales": ["The coating makes the board easier to grip.", "This is commonly used on surfboards to help keep the rider from slipping off the board while paddling and riding waves.", "The surfer could have used wax for grip."], "image": "train2014/COCO_train2014_000000374395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351322, "question_id": "dHDpkbdCPpdQY47k26HcDE", "question": "What do the kids play here?", "choices": ["monopoly", "car racing", "skateboarding", "nintendo wii"], "correct_choice_idx": 3, "direct_answers": ["wii", "video games", "nintendo wii", "game", "video games", "video games", "video game", "nintendo wii", "video game", "wii"], "difficult_direct_answer": false, "rationales": ["They have the white remote in their hands.", "The kids are holding white controllers. they are shaped like remotes.", "The boys have white rectangular remotes for video games."], "image": "train2014/COCO_train2014_000000351322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463599, "question_id": "dHPHpBLoxCbgGcXAwNDXFA", "question": "What is the man doing who took this picture?", "choices": ["sleeping", "photography class", "brushing teeth", "combing hair"], "correct_choice_idx": 2, "direct_answers": ["brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brush teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth"], "difficult_direct_answer": false, "rationales": ["The man is brushing.", "The man is holding something in his mouth.", "The man is standing in front of a bathroom mirror with a handled item in his mouth. people brush their teeth in the bathroom."], "image": "val2014/COCO_val2014_000000463599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154868, "question_id": "dHSvbroqz3XXGkjka3wAQc", "question": "Who is selling these items?", "choices": ["banker", "car repairman", "grocer", "lawyer"], "correct_choice_idx": 2, "direct_answers": ["shopkeeper", "market", "grocer", "grocer", "vendor", "market", "market", "vendor", "vendor", "shopkeeper"], "difficult_direct_answer": false, "rationales": ["Produce is on display with prices listed on signs all around them.", "The grocer is selling them.", "A grocer is selling produce."], "image": "train2014/COCO_train2014_000000154868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566175, "question_id": "dHT7F8bpmSk9nxixBA4WZz", "question": "What major bottled water company advertises here?", "choices": ["dasani", "evian", "poland spring", "fiji"], "correct_choice_idx": 2, "direct_answers": ["poland spring", "poland spring", "poland spring", "poland spring", "island spring", "poland spring", "poland spring", "poland spring", "poland spring", "poland springs"], "difficult_direct_answer": false, "rationales": ["The company is poland spring.", "Poland spring is shown on the boards, which means that they are a sponsor.", "Poland spring's logo is on the wall."], "image": "train2014/COCO_train2014_000000566175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342279, "question_id": "dJ2ZajsHszzhQcFruCeS24", "question": "What is the man on the left doing?", "choices": ["juggling", "jumping", "laughing", "running"], "correct_choice_idx": 3, "direct_answers": ["laughing", "laughing", "laughing", "singing along", "laughing", "laughing", "laughing", "laughing", "laughing", "running"], "difficult_direct_answer": false, "rationales": ["You can tell by how he is leaning back with mouth open as to what he is doing.", "The man is running.", "He's watching the dancer who looks really awkward."], "image": "train2014/COCO_train2014_000000342279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486168, "question_id": "dJDNTR5zR4CsuSdTkLe4Fj", "question": "How raw is the inside of the egg?", "choices": ["slightly raw", "completely raw", "fully cooked", "slightly cooked"], "correct_choice_idx": 2, "direct_answers": ["somewhat", "cooked", "unknown", "not raw", "boiled", "fully cooked", "hard boiled", "not raw", "boiled", "fully cooked"], "difficult_direct_answer": false, "rationales": ["The egg is served in a traditional hard boiled egg serving dish. it is on a plate with cooked food. cooked food isn't raw.", "The egg is fully cooked.", "The inside of the eggs are completely cooked."], "image": "train2014/COCO_train2014_000000486168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460045, "question_id": "dJHD7xpuCkhXTLmTFsuBNy", "question": "What are the clear areas on the front of the plane made out of?", "choices": ["glass", "aluminum", "rock", "stone"], "correct_choice_idx": 0, "direct_answers": ["plexiglass", "glass", "glass", "glass", "glass", "metal", "glass", "glass", "glass", "silicone"], "difficult_direct_answer": false, "rationales": ["The glass is used by the pilot to see through.", "An airplane is shown from the front. glass is clear.", "The area is glass."], "image": "train2014/COCO_train2014_000000460045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356145, "question_id": "dKaFKEUGP2zWakuqLyqwao", "question": "Why are they stopping?", "choices": ["enjoy view", "no gas", "are hungry", "are lost"], "correct_choice_idx": 0, "direct_answers": ["sight see", "resting", "sightseeing", "to chat", "admire view", "scenic views", "pictures", "ticket", "sight seeing", "enjoy view"], "difficult_direct_answer": true, "rationales": ["They appear to be stopped to look at the waterfall, which is a beauty of nature to marvel at.", "Two men are standing on the side of the road and are looking towards a waterfall.", "They are all stopping to enjoy the view of the waterfall."], "image": "val2014/COCO_val2014_000000356145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111777, "question_id": "dKbVbMB58F7DAtaoNpsgcs", "question": "Which one is the superior officer?", "choices": ["can't tell", "in trailer", "facing camera", "back turned"], "correct_choice_idx": 2, "direct_answers": ["saluted", "general", "commander", "right", "by motorcycle", "sunglasses man", "facing camera", "facing camera", "facing camera", "on right"], "difficult_direct_answer": false, "rationales": ["The officer faces the camera.", "The man with his arms on his hips looks more assertive.", "The other man is addressing him as such."], "image": "train2014/COCO_train2014_000000111777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412757, "question_id": "dKkQFvgfobm7tqPEpTwnWg", "question": "What is the sum of each individual digit on the top of the bus?", "choices": ["23", "five", "223", "32"], "correct_choice_idx": 1, "direct_answers": ["five", "five", "five", "five", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["Two plus three equals.", "The sum is 5.", "The number on the bus is 23. if you add two and three you get five."], "image": "val2014/COCO_val2014_000000412757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354144, "question_id": "dKnsWkJwqaC8kRn9Qu8ozC", "question": "What energy is powering the white cable cars?", "choices": ["solar", "electricity", "wind", "gas"], "correct_choice_idx": 1, "direct_answers": ["electricity", "electricity", "electricity", "electricity", "hep", "electric", "electricity", "electric", "electric", "electricity"], "difficult_direct_answer": false, "rationales": ["Electricity is what goes through the lines on the cable.", "The energy is electricity.", "They are on a string which is moved by electrical energy to get them where they need to get."], "image": "train2014/COCO_train2014_000000354144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187424, "question_id": "dKoaYvRKoJUmKBZgHFiPcB", "question": "What is between the car and the cube truck?", "choices": ["mirror", "toilet", "sink", "tub"], "correct_choice_idx": 1, "direct_answers": ["toilet", "toilet", "toilet", "toilet", "toilet", "toilet", "toilet", "toilet", "toilet", "toilet"], "difficult_direct_answer": false, "rationales": ["This has a tank and a lid", "The white seat with a hinged cover with a tank in the back identifies this item as a toilet.", "In the foreground there is a car to the left and on the right a cube truck. between these two is a toilet."], "image": "val2014/COCO_val2014_000000187424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140563, "question_id": "dKsJk3TqNF5s9AKC5VN2NL", "question": "What kind of man is found under the orange square?", "choices": ["plastic", "handicapped", "dwarf", "living"], "correct_choice_idx": 0, "direct_answers": ["politician", "doll", "doll", "mascot", "plastic", "businessman", "mannequin", "business man", "artificial", "business"], "difficult_direct_answer": true, "rationales": ["The man is under a square.", "The man is placed and seen to be a plastic.", "They look like ken dolls."], "image": "train2014/COCO_train2014_000000140563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435187, "question_id": "dL7SoQABC94XwwW6EyccYq", "question": "What is in the back of the truck?", "choices": ["tigers", "horses", "chickens", "cattle"], "correct_choice_idx": 3, "direct_answers": ["machinery", "cows", "cows", "cattle", "cows", "animals", "cows", "cattle", "cows", "cow"], "difficult_direct_answer": false, "rationales": ["Cattle can be transported.", "There are cows, not chickens, horses, or tigers, in the back of the truck.", "Cows are in the back of the truck."], "image": "val2014/COCO_val2014_000000435187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376958, "question_id": "dLDVNwedZPNPxmDY7VsxrZ", "question": "Which film industry likely produced this movie?", "choices": ["nollywood", "hollywood", "ghollywood", "bollywood"], "correct_choice_idx": 3, "direct_answers": ["comedy movie", "bollywood", "hollywood", "bollywood", "television station", "bollywood", "malayalam", "bollywood", "mexican", "indian"], "difficult_direct_answer": false, "rationales": ["The people look like they're from india.", "The movie looks like it is from india.", "The actors appear to look indian, and the movie industry in india is based out of bollywood."], "image": "train2014/COCO_train2014_000000376958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324286, "question_id": "dLWd6JAUFV6VSnZtEAvAcs", "question": "What is partially behind the surf board?", "choices": ["tree", "dog", "boat", "bird"], "correct_choice_idx": 3, "direct_answers": ["bird", "bird", "bird", "ocean", "bird", "bird", "bird", "bird", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["The wings and tail of a bird can be seen enough to distinguish what type of animal it is.", "There are four surfboard visible but only one item which is partially viewable behind it. this would be a bird in flight.", "There is a bird flying behind the surf boards."], "image": "train2014/COCO_train2014_000000324286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108217, "question_id": "dLhUN8ERwitzzDMxwUATJM", "question": "What is the black circular object near the top of the tower used for?", "choices": ["telling time", "cooking pizza", "feeding birds", "looking out"], "correct_choice_idx": 0, "direct_answers": ["telling time", "telling time", "clock", "bell", "telling time", "tell time", "tell time", "telling time", "telling time", "time"], "difficult_direct_answer": false, "rationales": ["The circular object is a clock.", "The circular object is for time.", "The black clock with white hands perched on top of the tower lets villagers know what time it is, day or night. clock towers first came into use during the 11th century."], "image": "train2014/COCO_train2014_000000108217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438550, "question_id": "dLvvgueVvkded7mbMipRTL", "question": "What is the rope for?", "choices": ["anchor", "towing boat", "safety harness", "towing skier"], "correct_choice_idx": 3, "direct_answers": ["pull waterskier", "waterskiing", "pull him", "pulling skier", "towing skier", "towing", "pull waterskiier", "pull skier", "water skiing", "waterskiing"], "difficult_direct_answer": true, "rationales": ["The person is holding the rope. the rope is tied to the boat. the person is moving because they are attached to the boat.", "There is a rope connecting the man to the boat.", "A waterskier holds on to a rope pulled by a boat."], "image": "train2014/COCO_train2014_000000438550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224670, "question_id": "dLweZxL4HghHCsC6JzFrgN", "question": "What creates the blurry reflection seen in this photo?", "choices": ["steam", "heat", "cat breath", "glass"], "correct_choice_idx": 3, "direct_answers": ["movement", "glass", "reflection", "movement", "moving camera", "ceiling lights", "laptop", "glare", "glare", "glass"], "difficult_direct_answer": false, "rationales": ["Glass is reflective which can reflect images.", "The glass is blurry.", "Sometimes reflections can be seen through those transparent objects when light hits it."], "image": "train2014/COCO_train2014_000000224670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219756, "question_id": "dLxoe32vTNgM3oo5FxwEQi", "question": "What is beneath the Green Bags?", "choices": ["horse posts", "food menus", "road signs", "parking meters"], "correct_choice_idx": 3, "direct_answers": ["parking meter", "parking meter", "pay terminal", "pay terminal", "parking meter", "pay terminal", "parking meter", "parking meters", "parking meter", "parking meter"], "difficult_direct_answer": false, "rationales": ["The bags have meters.", "There is an out of order parking meter.", "Drivers usually would add coins into these objects, but the text on each one indicates that the payment is now done at a different location."], "image": "train2014/COCO_train2014_000000219756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37563, "question_id": "dM5qgeV53NSz4trKJJmQyv", "question": "What material is this statue made of?", "choices": ["metal", "wood", "clay", "pic"], "correct_choice_idx": 0, "direct_answers": ["bronze", "bronze", "concrete", "bronze", "iron", "iron", "metal", "bronze", "stone", "copper"], "difficult_direct_answer": false, "rationales": ["The statue is made of tough long lasting material.", "The statue is outside. clay or wood would not be durable enough to withstand the elements.", "The statue is made of a really strong kind of metal that is used for displaying in public places."], "image": "train2014/COCO_train2014_000000037563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185487, "question_id": "dM7JFUvMn7PJmguomkCAXs", "question": "Why is the person's outfit green in color?", "choices": ["dress code", "visibility", "camouflage", "matching color"], "correct_choice_idx": 3, "direct_answers": ["matching color", "bright", "be seen", "for protection", "visibility", "match kite", "match board", "match snowboard", "flying", "be noticed"], "difficult_direct_answer": true, "rationales": ["The person has a matching outfit and snowboard.", "A person is on a green snowboard and is wearing a matching coat.", "The outfit is green in color to match."], "image": "val2014/COCO_val2014_000000185487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27562, "question_id": "dMtvsMH2MLs5qSqHjukHXZ", "question": "In which direction from the man will he throw the disc?", "choices": ["behind him", "straight ahead", "his right", "his left"], "correct_choice_idx": 2, "direct_answers": ["right", "north", "his right", "away", "left", "right", "away", "away", "away", "left"], "difficult_direct_answer": false, "rationales": ["He will use the hand it is in to throw it away from him.", "The man will throw the frisbee to the right.", "He is holding it in his right hand with his right side turned in the direction in which the frisbee will be thrown."], "image": "train2014/COCO_train2014_000000027562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30836, "question_id": "dMvJ2NkTaEfCc82S4uAmij", "question": "What might sit in the glass?", "choices": ["dentures", "nothing", "wine", "pencils"], "correct_choice_idx": 0, "direct_answers": ["dentures", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The glass has water in it already.", "The dentures sit in the glass.", "People put their false teeth in glasses."], "image": "train2014/COCO_train2014_000000030836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91465, "question_id": "dMwEmWmjJ5qWQYTPF5e75U", "question": "What toy is in the crib with the child?", "choices": ["dog", "teddy", "doll", "cat"], "correct_choice_idx": 2, "direct_answers": ["doll", "doll", "cat", "doll", "doll", "doll", "doll", "doll", "doll", "play doll"], "difficult_direct_answer": false, "rationales": ["The toy is a doll.", "There is a small toy doll next to the cat and the child.", "Although a cat is in the bed, a cat is not a toy. however, the child does have a toy above her head and part of the head and body can be seen through the bed rails."], "image": "train2014/COCO_train2014_000000091465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547498, "question_id": "dN4vMKwjowFVUL9yCMdBa4", "question": "Why is the man laying under the elephant?", "choices": ["sleeping", "showing off", "napping", "exercising"], "correct_choice_idx": 1, "direct_answers": ["show", "trick", "trick", "training", "showing off", "magic trick", "entertainment", "animal show", "performance", "performing trick"], "difficult_direct_answer": true, "rationales": ["He is showing off for a trick with the elephant.", "The man is dressed in a costume. the elephant is trained, and they are probably part of a circus or other entertainment.", "The man is showing off."], "image": "train2014/COCO_train2014_000000547498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533140, "question_id": "dNEgiaGbPCJpYCjWtE4z6S", "question": "What is the foremost sheep doing?", "choices": ["sleeping", "walking", "working", "sitting"], "correct_choice_idx": 1, "direct_answers": ["resting", "walking", "standing", "walking", "walking", "looking camera", "standing", "standing", "standing", "walking"], "difficult_direct_answer": false, "rationales": ["A sheep is standing with a leg raised in a field.", "The sheep is walking.", "Its head is facing the viewer with one foot up and another back showing movement to a different place."], "image": "val2014/COCO_val2014_000000533140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351620, "question_id": "dNag6rLbuSEi45zuXgJ78h", "question": "What kind of protein is present in cow horn?", "choices": ["melanin", "gelatin", "casein", "keratin"], "correct_choice_idx": 3, "direct_answers": ["calcium", "keratin", "bone", "red meat", "calcium", "keratin", "keratin", "keratin", "keratin", "animal"], "difficult_direct_answer": false, "rationales": ["That is what most of the horn is made of, which surrounds the core.", "They are mostly made up of keratin.", "That is what the horn is made of."], "image": "val2014/COCO_val2014_000000351620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536244, "question_id": "dNgHUb7d4HpEzRxvW7JRdX", "question": "What is the dog getting ready to do?", "choices": ["eat", "sit", "run", "lay down"], "correct_choice_idx": 2, "direct_answers": ["run", "run", "run", "run", "run", "charge man", "run", "charge man", "run", "get picture"], "difficult_direct_answer": false, "rationales": ["The dog is leaping like it wants to go faster.", "The dog wants to go for a trot since its legs are off the ground.", "He is traveling with the man on the skateboard. the skateboard is moving."], "image": "train2014/COCO_train2014_000000536244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418770, "question_id": "dNtojegxn8NbnEgfmftwxK", "question": "What is the brown animal sitting on?", "choices": ["wood chips", "dirt", "carpet", "sand"], "correct_choice_idx": 0, "direct_answers": ["hay", "hay", "straw", "wood shavings", "wood chips", "wood chips", "wood chips", "hay", "litter", "shavings"], "difficult_direct_answer": false, "rationales": ["There are wood chips under the cow.", "You can tell by the cut shavings and color as to what the baby cow is sitting on.", "A cow is sitting in a pen with natural colored bedding under him."], "image": "train2014/COCO_train2014_000000418770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471946, "question_id": "dNxpQurAkWHMDv4hPm9McK", "question": "What sort of outing are they embarking on?", "choices": ["skiing", "snowboarding", "camping", "beach"], "correct_choice_idx": 0, "direct_answers": ["ski trip", "skiing", "ski trip", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["The outing is skiing.", "It is winter and each of them has a set of wooden planks that they will use to move through the snow.", "There are several sets of skis sitting on the snow."], "image": "train2014/COCO_train2014_000000471946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92096, "question_id": "dPbRap4rPkYczJQAz7Yhas", "question": "Approximately how many people are watching the event?", "choices": ["hundreds", "thousands", "dozen", "two"], "correct_choice_idx": 2, "direct_answers": ["25", "many", "twenty", "15", "twenty", "twentyfive", "dozen", "twelve", "fourteen", "fifteen"], "difficult_direct_answer": true, "rationales": ["There are a dozen.", "There are lots of people against the guardrail but not too much where it's extremely crowded.", "There is a small crowd of about 12 people watching the event."], "image": "train2014/COCO_train2014_000000092096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382406, "question_id": "dQSLM449rwwRYNLLwoTNFd", "question": "What color vehicle is closest to the mailbox?", "choices": ["black", "silver", "white", "blue"], "correct_choice_idx": 1, "direct_answers": ["silver", "gray", "silver", "silver", "grey", "van", "silver", "silver", "silver", "silver"], "difficult_direct_answer": false, "rationales": ["A silver car is parked on the side of the street.", "The blue mailbox is seen on the sidewalk, directly next to the silver car.", "It's the only vehicle that is near it."], "image": "val2014/COCO_val2014_000000382406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426980, "question_id": "dQZjSZYMXMhFJrCWoguSLe", "question": "What are they small trucks called?", "choices": ["delivery vans", "mobile cafes", "food trucks", "shuttles"], "correct_choice_idx": 2, "direct_answers": ["trailer", "trailer", "food trucks", "trailers", "food trucks", "food trucks", "campers", "camper", "revs trailers", "food trucks"], "difficult_direct_answer": false, "rationales": ["The small trucks are serving food.", "These are food trucks, and the advertising on their outsides gives you an idea of the types of food they're selling. although they've been around for decades, today's food trucks really started gaining popularity in the 2000s, with no sign of letting up.", "These small trucks normally are used for selling food."], "image": "train2014/COCO_train2014_000000426980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451102, "question_id": "dQm6CP9eTxijMWjYLyckBS", "question": "These animals have an average lifespan of how many years?", "choices": ["thirty", "twelve", "five", "forty"], "correct_choice_idx": 1, "direct_answers": ["no image", "twelve", "ten", "ten", "twelve years", "10-12 years", "ten", "10-12 years", "fifteen", "10-12 years"], "difficult_direct_answer": false, "rationales": ["Sheep live for twelve years.", "There are 12 animals.", "These animals are sheep and they have an average lifespan of about 12 years or slightly less."], "image": "val2014/COCO_val2014_000000451102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273592, "question_id": "dRB2U328WMrCGBjHtwbVmw", "question": "What is the man's profession?", "choices": ["coach", "doctor", "officer", "teacher"], "correct_choice_idx": 2, "direct_answers": ["police", "police officer", "police", "officer", "police", "policeman", "police officer", "policing", "police", "police"], "difficult_direct_answer": false, "rationales": ["He is a policeman.", "A man is on a motorcycle with red and blue lights and is wearing a blue uniform.", "The man is wearing a blue police outfit."], "image": "train2014/COCO_train2014_000000273592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401812, "question_id": "dRgRcGKyzftseR6zX3PjT3", "question": "What is the man on the right doing?", "choices": ["stealing horses", "beating horses", "feeding horses", "controlling horses"], "correct_choice_idx": 3, "direct_answers": ["pushing", "watching", "herding horses", "plowing", "controlling horses", "plowing", "grabbing saddle", "commanding horses", "holding reins", "pushing cart"], "difficult_direct_answer": true, "rationales": ["These are powerful working animals that need to be guided by a human.", "A man is grabbing onto the reigns attached to a horse and trailer.", "The man is holding a strap that appears to be connected to the horses. a strap connected to the horses in this fashion would be used to control and direct their movement."], "image": "train2014/COCO_train2014_000000401812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554169, "question_id": "dRkmzGhggfPmwoEeiMdmqs", "question": "In what area of the kitchen is the boy standing with the door open?", "choices": ["dishwasher", "refrigerator", "cabinet", "pantry"], "correct_choice_idx": 1, "direct_answers": ["refrigerator", "refrigerator", "refrigerator", "refrigerator", "refrigerator", "refrigerator", "fridge", "fridge", "refrigerator", "fridge"], "difficult_direct_answer": false, "rationales": ["The refrigerator holds cold foods.", "He is standing near a fridge; we can see the foods in there and by looking at it we know that this kind of kitchen equipment is a fridge.", "The boy is standing in front of a white appliance that contains food."], "image": "train2014/COCO_train2014_000000554169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229422, "question_id": "dS2bcW7GkWR6Bgoz8FMUGG", "question": "What are the skiers doing with each other?", "choices": ["posing", "arguing", "racing", "fighting"], "correct_choice_idx": 2, "direct_answers": ["skiers all", "racing", "race", "racing", "avoidance", "racing", "racing", "racing", "racing", "racing"], "difficult_direct_answer": false, "rationales": ["The skiers are going very fast down the slope and have numbers on them.", "The skiers are racing.", "The people are racing on the slopes."], "image": "train2014/COCO_train2014_000000229422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9698, "question_id": "dSRRL2atcrsbuyTAsDmU6F", "question": "How many Omnivores in the picture?", "choices": ["three", "five", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["one", "two", "two", "two", "one", "two", "two", "one", "two", "two"], "difficult_direct_answer": false, "rationales": ["They eat meat and other foods.", "There are two omnivores, the man and the dog.", "There are two."], "image": "train2014/COCO_train2014_000000009698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478859, "question_id": "dSUYrFSJPz6eiTrjzgSewY", "question": "What animal is related to the animal that is wearing a hat in the poster?", "choices": ["jellyfish", "wolf", "tiger", "ant"], "correct_choice_idx": 2, "direct_answers": ["panther", "tiger", "cat", "garfield", "tiger", "tiger", "cat", "tiger", "lion", "kitten"], "difficult_direct_answer": false, "rationales": ["Tigers belong to the cat family.", "They are both felines", "There is a cat in the poster, and a tiger is one of the biggest wild cats in the world."], "image": "train2014/COCO_train2014_000000478859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290416, "question_id": "dSswcCTfWkEniPj5mfBzCS", "question": "What grade is this skier in?", "choices": ["beginner", "professional", "intermediate", "amateur"], "correct_choice_idx": 1, "direct_answers": ["advanced", "middle school", "professional", "seventh", "pro", "12th grade", "fifth", "top level", "high grade", "high"], "difficult_direct_answer": true, "rationales": ["He is doing jumping stunts that are not easily performed by any skier.", "The grade is professional.", "The skier is performing a difficult aerial trick based on their body positioning and elevation above the ground. someone doing such a trick is likely very experienced and may be answer a."], "image": "val2014/COCO_val2014_000000290416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75001, "question_id": "dSurPefEoDbUmKo3RJzpKH", "question": "What type fruits might grow on trees shown here?", "choices": ["peaches", "blackberries", "bananas", "cherries"], "correct_choice_idx": 2, "direct_answers": ["coconuts", "coconut", "coconuts", "coconuts", "bananas", "bananas", "coconut", "bananas", "coconuts", "coconut"], "difficult_direct_answer": false, "rationales": ["The fruits are bananas since they grow on palm trees.", "The fruits are bananas.", "Bananas are grown in these trees."], "image": "val2014/COCO_val2014_000000075001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454457, "question_id": "dT59qffmq83nS3BWeWZqbm", "question": "What material is the roof made of?", "choices": ["metal", "vinyl", "brick", "wood"], "correct_choice_idx": 0, "direct_answers": ["tin", "metal", "plastic", "metal", "tin", "metal", "aluminum", "metal", "pipes", "metal"], "difficult_direct_answer": false, "rationales": ["The material is made of metal.", "It is an industrial space, and the silver roof indicates that it is made of metal.", "The roof is shiny so it is likely made of metal."], "image": "val2014/COCO_val2014_000000454457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471642, "question_id": "dTLzRspWEDJDuM8sE4qd76", "question": "What should a rider stand behind to be safe when the train arrives?", "choices": ["yello triangles", "train door", "yellow line", "crossed patterns"], "correct_choice_idx": 2, "direct_answers": ["yellow line", "yellow area", "yellow line", "yellow line", "yellow line", "yellow line", "yellow line", "line", "yellow line", "yellow area"], "difficult_direct_answer": false, "rationales": ["The yellow line is there to indicate the safety demarcation.", "They stand behind the line that is marked to be safe.", "The rider is behind the line."], "image": "val2014/COCO_val2014_000000471642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433170, "question_id": "dU4HoS645LTxdgDzCijhkj", "question": "For what is this man most prepared?", "choices": ["hurricane", "earthquake", "protest", "rain"], "correct_choice_idx": 3, "direct_answers": ["rain", "rain", "reading newspaper", "rain", "rain", "rain", "work", "rain", "rain", "newspaper"], "difficult_direct_answer": false, "rationales": ["He has an umbrella under his arm in case the weather gets bad.", "The man is carrying an umbrella.", "Rain is prepared for."], "image": "train2014/COCO_train2014_000000433170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300772, "question_id": "dU6KC2NVNcCn7t2tLEWYSy", "question": "What makes it obvious that the boys in the background are just observers?", "choices": ["too small", "no uniform", "too big", "laughing"], "correct_choice_idx": 1, "direct_answers": ["behind fence", "no attention", "laughing", "fence", "joking around", "no gear", "no uniform", "in dugout", "no uniform", "they're watching"], "difficult_direct_answer": true, "rationales": ["There is no uniform.", "The boys in the back are wearing street clothes.", "Children in casual clothes are watching a batter in uniform swing the bat. observers do not wear uniforms."], "image": "train2014/COCO_train2014_000000300772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431526, "question_id": "dU7KFJ9QQjFnP66Yw4EWKv", "question": "What is the structure perpendicular to the plane used for?", "choices": ["boarding", "fueling", "cleaning", "loading luggage"], "correct_choice_idx": 0, "direct_answers": ["passenger transfer", "boarding", "terminal walkway", "boarding passengers", "passenger entrance", "boarding", "passage", "passage", "loading passengers", "boarding"], "difficult_direct_answer": false, "rationales": ["The structure is connected to the plane and forms a tunnel. it helps people get onto the plane.", "That is so people can get on and off the plane.", "The tunnel gets the passengers on and off the plan without having to go down to the ground."], "image": "train2014/COCO_train2014_000000431526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495670, "question_id": "dUCJj2eYzA32YY24HXxXUn", "question": "What hand gesture are the two doing?", "choices": ["hang ten", "thumbs up", "devil horns", "peace sign"], "correct_choice_idx": 3, "direct_answers": ["peace", "peace sign", "peace", "peace", "peace sign", "peace", "peace sign", "peace", "peace", "peace"], "difficult_direct_answer": false, "rationales": ["The hand gesture is a peace sign.", "The people are making a peace sign since their fingers are jutting out.", "They are making the peace sign with their fingers."], "image": "train2014/COCO_train2014_000000495670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103465, "question_id": "dUCXHCjaVwRm8TTjWBtHUN", "question": "What number is represented by a foreign symbol here?", "choices": ["five", "eight", "nine", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "twelve", "two", "twelve", "two", "two", "twelve", "two", "two"], "difficult_direct_answer": false, "rationales": ["The second number is represented by something other than a number two, so that is the number represented with a \"foreign symbol.\".", "A clock is shown and some spots are filled with pictures rather than the typical digit normally present in that location of the clock face.", "The number is 2."], "image": "train2014/COCO_train2014_000000103465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271122, "question_id": "dUMeAAZYUu9jVKXUeG2PEv", "question": "What do the colored flags show?", "choices": ["sea warnings", "directions", "decoration", "weather"], "correct_choice_idx": 0, "direct_answers": ["sea warnings", "wind presence", "wind direction", "windy", "red", "country", "landmark", "countries", "nationality", "wind"], "difficult_direct_answer": true, "rationales": ["This lets people know there is a danger on the coast", "This is a beach area. flags on the beach show the water condition.", "The colored flags show warnings."], "image": "train2014/COCO_train2014_000000271122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290114, "question_id": "dUV7KtW6WsVCu6Pvxx4zCJ", "question": "What is the man using the phone to do?", "choices": ["play games", "take picture", "make call", "text"], "correct_choice_idx": 1, "direct_answers": ["take picture", "selfie", "taking photos", "take picture", "take selfie", "take picture", "picture", "pictures", "take picture", "take picture"], "difficult_direct_answer": false, "rationales": ["They are taking a selfie in a mirror", "The phone has a camera feature visible and the people are posing which would be consistent with their intention to do answer a.", "A man and woman are posing as the man holds his phone up."], "image": "train2014/COCO_train2014_000000290114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62387, "question_id": "dUp4QYHtMrWPUVNpaHVUUk", "question": "Which direction may the cars moving forward turn at this exact time?", "choices": ["right", "straight", "u turn", "left"], "correct_choice_idx": 1, "direct_answers": ["one way", "straight", "left", "left", "right", "straight only", "left", "left", "right", "left"], "difficult_direct_answer": false, "rationales": ["The cars are seen to be arranging themselves in a straight manner forward.", "There is a no left turn sign on the light pole. there is a one way sign on the closer light pole.", "The cars are going straight."], "image": "train2014/COCO_train2014_000000062387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287882, "question_id": "dV9YFfQyRARHsH7piY6EfA", "question": "Who would hold the racket in a similar hand to this person?", "choices": ["arodys vizcaino", "archie bradley", "james harden", "bryse wilson"], "correct_choice_idx": 2, "direct_answers": ["lefty", "tennis player", "left hander", "opponent", "tennis player", "james harden", "left handed", "left handed", "williams", "tennis player"], "difficult_direct_answer": false, "rationales": ["The man is holding this racket in his left hand.", "A person is holding a tennis racket in their left hand. harden is left handed.", "Harden can hold the racquet."], "image": "val2014/COCO_val2014_000000287882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163474, "question_id": "dVBQnFRmBgTb8FACnidmNz", "question": "How many motorcycles do you see?", "choices": ["four", "three", "six", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "1 motorcycle", "one", "one", "one", "1 motorcycle", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one bike.", "There is only one two wheeled vehicle visible.", "There is only one two wheeled motor vehicle present in this city traffic scene."], "image": "train2014/COCO_train2014_000000163474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1307, "question_id": "dVidLW4FTsE9Br3BqjyQnG", "question": "Where do the vegetables here produce their greatest mass?", "choices": ["grass", "tree", "underground", "bloom"], "correct_choice_idx": 2, "direct_answers": ["gardens", "underground", "roots", "california", "carrots", "underground", "ground", "root", "carrots", "carrot"], "difficult_direct_answer": false, "rationales": ["The veggies are underground.", "Carrots are root vegetables and the part that is eaten is grown in the ground.", "Carrots grow underground."], "image": "train2014/COCO_train2014_000000001307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476975, "question_id": "dVwFfisELtc864V2NAznFQ", "question": "In what setting are the animals?", "choices": ["park", "wilderness", "zoo", "farm"], "correct_choice_idx": 1, "direct_answers": ["forrest", "bush", "savannah", "forest", "nature", "wilderness", "savanna", "wild", "wild", "nature"], "difficult_direct_answer": false, "rationales": ["The animals are in the wild.", "There's no fence or cage bars or any signs of buildings, so this is definitely a wilderness area.", "They are in the wilderness."], "image": "val2014/COCO_val2014_000000476975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348874, "question_id": "dW33keWB6QMEbLpkRS6FA4", "question": "What are signage placers here most concerned with?", "choices": ["nothing", "sustainability", "going quickly", "forcing stopping"], "correct_choice_idx": 3, "direct_answers": ["stopping", "stopping", "safety", "forcing stopping", "pedestrians stopping", "stopping", "stopping", "cars", "stopping", "traffic"], "difficult_direct_answer": false, "rationales": ["The signs all stay \"stop.\"", "To make sure people don't go forward", "The signs both say stop so no one passes."], "image": "train2014/COCO_train2014_000000348874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35473, "question_id": "dW5vD7EkZbMzw3eTxMn7q2", "question": "What is the overhead wire for?", "choices": ["power streetcars", "guides streetcar", "electric utility", "phone lines"], "correct_choice_idx": 0, "direct_answers": ["power", "power streetcars", "electricity", "electric", "supplies electricity", "electricity", "power", "electric supply", "power", "electricity"], "difficult_direct_answer": false, "rationales": ["Thick wires extend across the street in a town.", "The wire transmits electricity to the cars.", "We can see a streetcar on the right. it is running on tracks, and the overhead wires are directly above them."], "image": "train2014/COCO_train2014_000000035473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75032, "question_id": "dW5yZ58qebnEXcMcSNW8Wm", "question": "What are the men riding on?", "choices": ["roller blades", "scooter", "skateboard", "bike"], "correct_choice_idx": 2, "direct_answers": ["skateboards", "skateboard", "skateboard", "skateboard", "skateboard", "longboard", "skateboards", "skateboard", "skateboards", "skateboard"], "difficult_direct_answer": false, "rationales": ["The men are riding skateboards down the road.", "The men are standing on boards that have wheels on them.", "The men are on boards."], "image": "val2014/COCO_val2014_000000075032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24107, "question_id": "dWMXiYbKACfUxPigGSW3Uo", "question": "Why is the man wearing the silver helmet?", "choices": ["for halloween", "safety", "for amusement", "style"], "correct_choice_idx": 1, "direct_answers": ["head protection", "protection", "protection", "safety", "protection", "head protection", "protection", "protection", "protective", "protection"], "difficult_direct_answer": false, "rationales": ["He is skiing. the helmet protects his head.", "The man is engaged in a physically dangerous activity, downhill skiing. wearing safety equipment is a prudent measure when engaged in dangerous activity.", "Helmets are for safety."], "image": "train2014/COCO_train2014_000000024107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75560, "question_id": "dWqCLLcryTetcNHHtGZpvH", "question": "What is the yellow structure in the background used for?", "choices": ["lifting things", "throwing things", "climbing", "holding things"], "correct_choice_idx": 0, "direct_answers": ["building", "lifting", "crane", "construction", "crane", "construction", "hotel", "lifting", "lifting things", "warning"], "difficult_direct_answer": false, "rationales": ["The yellow structure is a crane. it can be used to move objects from the ground to higher heights.", "The structure lifts.", "This device is used as a crane for loving and lifting heavy items."], "image": "val2014/COCO_val2014_000000075560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265160, "question_id": "dXANKfrdeyafYy8VC29DKz", "question": "Why are these people using umbrellas?", "choices": ["rain", "disguise", "snow", "sun"], "correct_choice_idx": 0, "direct_answers": ["raining", "raining", "rain", "protection", "block rain", "its raining", "avoiding rain", "raining", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The weather is wet and dreary.", "The ground is wet, and the sky is cloudy. there is no snow.", "The floor is wet."], "image": "train2014/COCO_train2014_000000265160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324635, "question_id": "dXB5LDiVJVDrLvd9Jit3Dc", "question": "Which bird contributed to ingredients seen here?", "choices": ["none", "chicken", "pheasant", "ostrich"], "correct_choice_idx": 1, "direct_answers": ["chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["Eggs are visible and birds like chickens lay eggs.", "There are eggs. they are yellow and white.", "It is a type of common meat."], "image": "train2014/COCO_train2014_000000324635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112922, "question_id": "dXZYNEBW7JLTkS2Y9YkfU7", "question": "What player will kick the ball first?", "choices": ["15", "20", "one", "none"], "correct_choice_idx": 0, "direct_answers": ["center", "15", "soccer player", "number fifteen", "number fifteen", "number fifteen", "player 15", "15", "number 15", "fifteen"], "difficult_direct_answer": false, "rationales": ["The player is 15.", "The number on the player's shorts is 15.", "15 is in control of the ball."], "image": "train2014/COCO_train2014_000000112922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63022, "question_id": "dXaSbEJDJmZLBJDP7cbyGo", "question": "What is the man is trying to do?", "choices": ["parachuting", "paragliding", "kiting", "surfing"], "correct_choice_idx": 1, "direct_answers": ["fly kite", "fly kite", "paragliding", "kiteboard", "fly kite", "parasail", "fly kite", "fly parachute", "fly kite", "fly kite"], "difficult_direct_answer": false, "rationales": ["He is trying to get up in the air.", "They are trying to paraglide in the air.", "The big kite at the top plus the harness attached to him suggest that he is trying to have it pull him."], "image": "train2014/COCO_train2014_000000063022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90393, "question_id": "dXuYaakvZrpXGRSvYWcS73", "question": "What type of trees are shown in the foreground?", "choices": ["evergreen", "conifers", "christmas", "deciduous"], "correct_choice_idx": 3, "direct_answers": ["dead", "maple", "birch", "maple", "tall", "deciduous", "deciduous", "oak", "oak", "oak"], "difficult_direct_answer": false, "rationales": ["The trees are visible and appear leafless. trees that lose their leaves are known as answer a.", "The foreground trees have leaves that fall off in autumn.", "Trees with no leaves are in a park area where someone is throwing a frisbee."], "image": "train2014/COCO_train2014_000000090393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343394, "question_id": "dY5emUoTAfSBPvNLzznv8p", "question": "What is the kite above the girl shaped like?", "choices": ["seal", "ferret", "butterfly", "giraffe"], "correct_choice_idx": 2, "direct_answers": ["butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly", "butterfly"], "difficult_direct_answer": false, "rationales": ["The kite is visible and identifiable. the object appears to have wings and is the style and color of answer a.", "A colorful kites shaped like wings is above a girl who is jumping.", "That's what the kite is shaped like."], "image": "val2014/COCO_val2014_000000343394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485757, "question_id": "dY8BktfRkTgzA99NToTorM", "question": "What metal object is on the cake?", "choices": ["gate", "key", "handcuffs", "sword"], "correct_choice_idx": 2, "direct_answers": ["handcuffs", "hand cuffs", "handcuffs", "handcuffs", "hand cuffs", "handcuffs", "handcuffs", "handcuffs", "handcuffs", "handcuffs"], "difficult_direct_answer": false, "rationales": ["The object is a handcuff.", "Cuffs that police officers use are on the cake.", "Handcuffs are made of medal and they are on the cake."], "image": "train2014/COCO_train2014_000000485757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225174, "question_id": "dY8Y6hRF5n3DRU2GJZe6vf", "question": "What is this part of the plane known as?", "choices": ["cockpit", "stern", "runway", "first class"], "correct_choice_idx": 0, "direct_answers": ["cockpit", "cockpit", "nose", "nose", "cockpit", "cockpit", "cockpit", "cockpit", "nose", "cockpit"], "difficult_direct_answer": false, "rationales": ["It is the front of the plane.", "The front of the plane is a cockpit.", "Traditionally the front part of an airplane is called the cockpit."], "image": "train2014/COCO_train2014_000000225174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387410, "question_id": "dYHTxZirCZoLTi7uvK9Mqh", "question": "Why do they lay on the benches?", "choices": ["are hiding", "are dead", "are tired", "are confused"], "correct_choice_idx": 2, "direct_answers": ["to rest", "to rest", "sleeping", "are tired", "to rest", "sleeping", "tired", "tired", "tired", "tired"], "difficult_direct_answer": false, "rationales": ["The people are snoozing.", "The people are tired.", "They don't appear to be agitated or injured and are laying down."], "image": "train2014/COCO_train2014_000000387410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223276, "question_id": "dYJHFFX6uJudrsGSuPhDAi", "question": "What store is this man sitting outside of?", "choices": ["starbucks", "wal mart", "target", "hot topic"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "specialty store", "hot topic", "hot topic", "store", "hot topic", "club", "hot topic", "bar", "goth store"], "difficult_direct_answer": false, "rationales": ["The sign says hot topic.", "The sign by the window indicates the name of the store.", "The store is hot topic."], "image": "val2014/COCO_val2014_000000223276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28246, "question_id": "dYRUGwntL77LZuqec9YZ75", "question": "Based on the hanging flags where is this?", "choices": ["france", "italy", "sweden", "america"], "correct_choice_idx": 0, "direct_answers": ["netherlands", "france", "netherlands", "united states", "france", "italy", "france", "croatia", "france", "netherland"], "difficult_direct_answer": false, "rationales": ["There are french flags flying along the route.", "The flags are in france.", "These are french flags hanging from the rail."], "image": "val2014/COCO_val2014_000000028246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460567, "question_id": "dZ9eqQpN9m7jPcRF2TzE7a", "question": "How many different species of animals seem to drinking in the area?", "choices": ["three", "four", "one", "two"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "1 species", "three", "three", "four", "three", "1 species", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "There are three types of animals drinking.", "There are antelopes, zebras, and giraffes."], "image": "val2014/COCO_val2014_000000460567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141003, "question_id": "dZaoFDipQ9tTyoAJfEQr54", "question": "What is the man standing there to observe?", "choices": ["trains", "birds", "ocean", "planes"], "correct_choice_idx": 2, "direct_answers": ["ocean", "ocean view", "water", "surfing", "ocean", "bad", "waves", "waves", "vehicles", "ocean"], "difficult_direct_answer": false, "rationales": ["He is a surfer. surfers watch the water for wave patterns.", "The man is looking at the water.", "He is wearing shorts and standing in front of a surfboard. he is watching the waves."], "image": "train2014/COCO_train2014_000000141003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294866, "question_id": "dZdMf6NtpcXqi9CKpBQDc4", "question": "These people most likely speak with what accent?", "choices": ["new yorker", "valley girl", "southern", "midwestern"], "correct_choice_idx": 0, "direct_answers": ["new yorker", "new york", "english", "midwestern", "american", "new york", "new york", "american", "english", "new york"], "difficult_direct_answer": false, "rationales": ["These people are in a cold climate, which is associated with new york. also, there is a business in the background with the name of \"new york smiles dental\", which gives the location away.", "If the weather and ny on the bag match the location.", "People stand on a city street corner. new york is a city."], "image": "train2014/COCO_train2014_000000294866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203867, "question_id": "dZgPn9AfL6aVS6TbHm9gaZ", "question": "What does the smiling lady do?", "choices": ["dances", "milks", "hobbles", "runs"], "correct_choice_idx": 1, "direct_answers": ["milk cow", "milking cow", "milk cow", "milk", "milks", "milk cows", "milking cow", "milk cows", "milk cow's", "milk cows"], "difficult_direct_answer": false, "rationales": ["The woman is milking the udders.", "The liquid is coming from the cow's udder and being collected in the jar.", "The smiling lady appears in front of a cow with one hand holding a glass bottle with white liquid. it would be logical to believe that the other hand is milking the cow."], "image": "train2014/COCO_train2014_000000203867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34455, "question_id": "daLyX6VTUv2Nt7Vkv2DDka", "question": "What household object can one assume is within a few feet of this?", "choices": ["microwave", "lamp", "television", "rug"], "correct_choice_idx": 2, "direct_answers": ["television", "television", "television", "tv", "television", "tv", "television", "tv", "television", "television"], "difficult_direct_answer": false, "rationales": ["The object is the tv.", "A remote for a television can be seen.", "There are remote controls in the foreground."], "image": "val2014/COCO_val2014_000000034455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171539, "question_id": "daZbtLcGo4tioLhZAwqngx", "question": "What is the part holding the tire to the wheel called?", "choices": ["knob", "wheel", "rim", "stub"], "correct_choice_idx": 2, "direct_answers": ["dog", "wheel studs", "rim", "wheel stud", "hubcap", "car", "rim", "axle", "rim", "rim"], "difficult_direct_answer": false, "rationales": ["The part holding the tire is called the rim.", "The part is the rim.", "There is a rim holding the tire on the wheel."], "image": "val2014/COCO_val2014_000000171539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215456, "question_id": "dapbqjEr3YaM5FkbX9s968", "question": "What would the opposite of this sign be?", "choices": ["speed up", "halt", "god", "slow"], "correct_choice_idx": 2, "direct_answers": ["go", "go", "go", "god", "go", "go", "go", "go", "go", "go"], "difficult_direct_answer": false, "rationales": ["The sign says stop.", "The opposite is god.", "The sign says to stop."], "image": "val2014/COCO_val2014_000000215456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314131, "question_id": "db2TwrtZKPKMUaxjghY5W8", "question": "What famous actor does he resemble?", "choices": ["marilyn monroe", "jason statham", "mel gibson", "charlie chaplin"], "correct_choice_idx": 3, "direct_answers": ["charlie chaplin", "charlie chaplin", "charlie chaplin", "unknown", "unknown", "chaplin", "charlie chaplin", "charley chaplin", "charlie chaplin", "charlie chaplin"], "difficult_direct_answer": false, "rationales": ["This comedian/actor famously wore a black fedora and had a small black mustache.", "The man is dressed in black and white and is wearing glasses and has facial hair.", "He sure doesn't look like any of the other options!."], "image": "train2014/COCO_train2014_000000314131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460507, "question_id": "dbEpuiwFXMxFtkme36Vjr2", "question": "What is a traditional filling for the triangular items?", "choices": ["potatoes", "cheese", "onions", "pilchards"], "correct_choice_idx": 1, "direct_answers": ["cheese", "tuna", "cheese", "cheese", "cheese", "feed", "cheese", "tuna", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["Cheese is often found in sandwiches.", "It looks like a sandwich that has been grilled, and grilled cheese is a very popular sandwich item.", "Cheese goes in grilled cheese."], "image": "val2014/COCO_val2014_000000460507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395752, "question_id": "dbYN4xvs5zcQyTbDMKUWgr", "question": "Why is the man near the front of the trolley?", "choices": ["to fight", "visibility", "getting in", "to talk"], "correct_choice_idx": 2, "direct_answers": ["getting on", "getting on", "driving", "getting on", "driver", "boarding", "boarding", "passenger", "getting on", "getting in"], "difficult_direct_answer": false, "rationales": ["A man is barely seen at the front of the trolly and appears to be boarding.", "He is entering the trolly", "He looks to be boarding the trolley."], "image": "val2014/COCO_val2014_000000395752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244026, "question_id": "dbjmaxP3AB8YUstwS7CaCV", "question": "What food group are they snacking on?", "choices": ["vegetables", "fruits", "meats", "grains"], "correct_choice_idx": 1, "direct_answers": ["vitamins", "fruit", "fruit", "fruits", "fruit", "fruit", "fruit", "fruit", "fruit", "bananas"], "difficult_direct_answer": false, "rationales": ["There are bananas, oranges, and apples on the plate and bananas in the girls' hands.", "People are sitting around a table and a bowl with apples, oranges, and bananas are in it.", "The bowl has fruit."], "image": "train2014/COCO_train2014_000000244026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541293, "question_id": "dbnbdiieJHEVj7qJrfCi3F", "question": "What body part of the man is hidden from view?", "choices": ["arm", "feet", "toes", "leg"], "correct_choice_idx": 0, "direct_answers": ["torso", "torso", "chest", "torso", "arm", "torso", "chest", "stomach", "top", "torso"], "difficult_direct_answer": false, "rationales": ["The other choices on the list can all be seen in the image.", "The man's arms are out of view.", "You can see both of this mans feet, legs and all of his toes, the only thing you can not see is his upper half which includes his arms."], "image": "train2014/COCO_train2014_000000541293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549117, "question_id": "dc2fEtw6DYLhDrPb4mmkys", "question": "What liquid cooks the dough?", "choices": ["milk", "cooking oil", "paint", "water"], "correct_choice_idx": 1, "direct_answers": ["vegetable oil", "oil", "oil", "water", "cooking oil", "oil", "oil", "oil", "milk", "oil"], "difficult_direct_answer": false, "rationales": ["Donuts are shown on display. donuts are fried in oil.", "Cooking oil is used to fry and cook donuts.", "Donuts are displayed in a case."], "image": "train2014/COCO_train2014_000000549117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398575, "question_id": "dcSi6tkmx8CVJgq9drsK48", "question": "What is the proper orientation for the sign?", "choices": ["vertical", "none", "horizontal", "diagonal"], "correct_choice_idx": 0, "direct_answers": ["standing", "upright", "upright", "vertical", "vertical", "upright", "vertical", "vertical", "upright", "upright"], "difficult_direct_answer": false, "rationales": ["The orientation is vertical.", "The sign should be upright so that the words can be read.", "The sign has fallen on the pavement ,and lacks its initial orientation which is firmly upright."], "image": "train2014/COCO_train2014_000000398575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5373, "question_id": "dcX7PfZA7yrKPhYLDqgTv7", "question": "For whom does this woman prepare pizza?", "choices": ["restaurant", "family", "bake sale", "street vendor"], "correct_choice_idx": 1, "direct_answers": ["family", "family", "group", "family", "herself", "her family", "family", "her family", "family", "herself"], "difficult_direct_answer": false, "rationales": ["The pizza is for family.", "The pizza is very large for others to eat.", "She is in a home kitchen. families eat in a home."], "image": "train2014/COCO_train2014_000000005373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579035, "question_id": "dd3MjUofPvwaALZ5BvjWjL", "question": "What store is near the sign?", "choices": ["sears", "dunkin donuts", "mcdonald's", "coach"], "correct_choice_idx": 3, "direct_answers": ["coach", "coach", "coach", "coach", "coach", "coach", "coach", "coach", "coach", "coach"], "difficult_direct_answer": false, "rationales": ["A coach logo is shown.", "The sign is for coach since that's what the letters spell.", "There is a sign that has \"coach\"."], "image": "train2014/COCO_train2014_000000579035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420852, "question_id": "dd9hEeL9sSZUf6hsRCS23w", "question": "What are the two items in the sky?", "choices": ["birds", "ufo's", "planes", "balloons"], "correct_choice_idx": 3, "direct_answers": ["balloons", "air balloons", "ballons", "balloons", "hot-air balloons", "balloons", "air balloons", "parasite", "balloons", "ballons"], "difficult_direct_answer": false, "rationales": ["They are powered by continuous hot air in order to float.", "Two large inflatables with baskets underneath are in the air near each other.", "The objects are balloons blown up with hot air."], "image": "val2014/COCO_val2014_000000420852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66246, "question_id": "ddh94tnELV76oYUvVWEUbC", "question": "Why is the man in a hat wearing a glove?", "choices": ["to catch", "health", "warmth", "fashion"], "correct_choice_idx": 0, "direct_answers": ["to play", "baseball", "hand protection", "to catch", "catch ball", "catch ball", "protect hand", "playing baseball", "pitcher", "baseball"], "difficult_direct_answer": false, "rationales": ["People use a glove to catch balls.", "To catch the ball if it comes near him", "He is trying to stop or retrieve the ball after it is hit."], "image": "train2014/COCO_train2014_000000066246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26448, "question_id": "de9fpCSXXhu574X7SS2BAB", "question": "What material was used to depict the snow in this art piece?", "choices": ["bubbles", "cotton", "feathers", "yarn"], "correct_choice_idx": 1, "direct_answers": ["cauliflower", "styrofoam", "cauliflower", "cauliflower", "cauliflower", "cotton", "cotton", "cauliflower", "cauliflower", "cauliflower"], "difficult_direct_answer": false, "rationales": ["Thick spools of cotton were aggregated.", "Models of skiers are on white, fluffy material. cotton is white and fluffy.", "It is fake snow and it most closely resembles cotton that can be bunched up like this."], "image": "val2014/COCO_val2014_000000026448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141017, "question_id": "deBEAcHiX5CVneNL7NoruZ", "question": "What setting is this scene?", "choices": ["apartment", "childcare center", "office", "restaurant"], "correct_choice_idx": 0, "direct_answers": ["dining room", "apartment", "home", "hotel suite", "kitchen", "kitchen", "apartment", "mama", "apartment", "hotel room"], "difficult_direct_answer": false, "rationales": ["This is a living area that's small.", "The rooms seems to have properties that most apartments have.", "It's in someones apartment."], "image": "val2014/COCO_val2014_000000141017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576855, "question_id": "der5j3UBtQLaMuzFrLAMyU", "question": "The man on the bench is reading the newspaper during which season?", "choices": ["winter", "fall", "summer", "spring"], "correct_choice_idx": 3, "direct_answers": ["spring", "summer", "summer", "spring", "fall", "summer", "spring", "summer", "spring", "fall"], "difficult_direct_answer": false, "rationales": ["The man on the bench is reading newspapers during the spring because there are leaves growing on the trees.", "Everything is blooming.", "The bench is in the spring."], "image": "train2014/COCO_train2014_000000576855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457217, "question_id": "dfcjoCtVsdhE8sWUbtbzvh", "question": "What does the elephant here seek?", "choices": ["love", "nothing", "food", "mate"], "correct_choice_idx": 2, "direct_answers": ["food", "food", "food", "bowl", "food", "food", "food", "food", "ladies lunch", "food"], "difficult_direct_answer": false, "rationales": ["She is holding a bowl of food and the elephant is reaching for it.", "The elephant wants a treat.", "The only practical answer is that the elephant wants food. none of the other options make much sense."], "image": "val2014/COCO_val2014_000000457217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299001, "question_id": "dftBmK9ttCckdUyMsSkaVJ", "question": "What is the name of the red apples?", "choices": ["ladybug", "red delicious", "dark red", "savory"], "correct_choice_idx": 1, "direct_answers": ["red delicious", "red apple", "red delicious", "red delicious", "malus", "honeycrisp", "macintosh", "red delicious", "granny smith", "red delicious"], "difficult_direct_answer": false, "rationales": ["The red apples are called red delicious apples given their color.", "The apples are bright red.", "The name is red delicious."], "image": "val2014/COCO_val2014_000000299001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200627, "question_id": "dfy9UKTzAuX3Ah7w6UhAck", "question": "What are the long white veggies in the dish?", "choices": ["radish", "turnip", "onion", "bean sprouts"], "correct_choice_idx": 3, "direct_answers": ["bean sprouts", "bean sprouts", "bean sprouts", "sprouts", "onions", "lean sprouts", "bean sprouts", "parsnip", "veg fries", "bean sprouts"], "difficult_direct_answer": false, "rationales": ["Bean sprouts are long like this.", "They are long and white.", "Bean sprouts are served in mixed veggies. they are a light color."], "image": "val2014/COCO_val2014_000000200627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430369, "question_id": "dfyLXdYQh7dqRVK6BKFpYr", "question": "Where would the most likely place be for this person to be walking?", "choices": ["walking trail", "residential area", "road", "runway"], "correct_choice_idx": 0, "direct_answers": ["trail", "park", "park", "walking trail", "park", "park", "park", "trail", "trail", "park"], "difficult_direct_answer": false, "rationales": ["The person is most likely walking on a trail.", "A person is walking on a paved path in a wooded area. walking paths in parks are popular places to walk.", "They are in a park area with benches and vegetation"], "image": "train2014/COCO_train2014_000000430369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301649, "question_id": "dg69VFQaYJJqpvbdHVZKoB", "question": "Wet yeast is used to make?", "choices": ["pizza", "cake", "dough", "bread"], "correct_choice_idx": 1, "direct_answers": ["bread", "bread", "bread", "bread", "cake", "dough", "cake", "bread", "bread", "brownies"], "difficult_direct_answer": false, "rationales": ["There are used to make cake as evident on the picture.", "Yeast is used to raise the cake.", "The yeast makes cake."], "image": "train2014/COCO_train2014_000000301649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9062, "question_id": "dgY8ChedRkU4zLq95eiTnQ", "question": "Why is the yellow item stuck to the sandwich?", "choices": ["toothpick", "spread", "glue", "melted"], "correct_choice_idx": 3, "direct_answers": ["melted", "flavor", "melted cheese", "cheese", "cheese", "cheese", "melted", "melted cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["The cheese has melted in the sandwich.", "The cheese was put next to a hot burger.", "The item is melted."], "image": "train2014/COCO_train2014_000000009062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343140, "question_id": "dgnE9vi4qkwMUULSedP3Lg", "question": "The young buy is in danger of slipping because he needs what item of clothing?", "choices": ["shirt", "helmet", "socks", "belt"], "correct_choice_idx": 2, "direct_answers": ["shoes", "shoes", "socks", "shoes", "shoes", "shoes", "shoes", "shoes", "shoes", "shoes"], "difficult_direct_answer": false, "rationales": ["The boy needs some socks.", "The kid has no socks.", "This apparatus should not be used when you are barefooted."], "image": "train2014/COCO_train2014_000000343140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197063, "question_id": "dgq64suXxVJtFi87RK5UNq", "question": "What is the asian man with the skateboard applying to the bench?", "choices": ["tape", "filler", "wax", "gum"], "correct_choice_idx": 2, "direct_answers": ["tape", "wax", "support", "sticker", "sticker", "wax", "gum", "wax", "wax", "feeling"], "difficult_direct_answer": false, "rationales": ["Wax is applied by skateboarders to make it easier to perform tricks over surfaces.", "Wax is a type of polish.", "The man seems to be sticking something in the skateboard."], "image": "train2014/COCO_train2014_000000197063.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404351, "question_id": "dgxVBz3yPeCTdZNnxCe5SK", "question": "Who pays the persons in orange?", "choices": ["train company", "jails", "parks", "police"], "correct_choice_idx": 0, "direct_answers": ["construction company", "railway", "government", "train company", "train company", "company", "train company", "city", "state", "local county"], "difficult_direct_answer": false, "rationales": ["The train company pays the people who are staffed.", "They work for the train company as they are working on the train.", "The orange colours indicate they are wearing a work uniform, and they appear to be working on the train on the railroad."], "image": "train2014/COCO_train2014_000000404351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461884, "question_id": "dhBAgrmD9JbQuQNhSrtoN4", "question": "Why is this man holding his arms out?", "choices": ["to gesture", "for balance", "he fell", "it's hot"], "correct_choice_idx": 1, "direct_answers": ["balance", "for balance", "maintain balance", "balance", "balance", "balance", "maintain balance", "maintain balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["While doing this extreme sport you have to avoid falling off.", "The man is holding out his arms to keep himself from falling.", "He can use his arms to add a weight that will counteract any leaning to one side."], "image": "val2014/COCO_val2014_000000461884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248744, "question_id": "dhJEUrxhvZM3tbZSJQXpxe", "question": "Where is the item being grilled normally prepared?", "choices": ["grill", "griddle", "sauce pot", "oven"], "correct_choice_idx": 3, "direct_answers": ["oven", "oven", "oven", "oven", "pizza", "pizza", "oven", "pizza oven", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["A pizza is on a grill. pizzas are traditionally cooked in ovens.", "The other options don't apply. that said, c can apply, but only if used with a.", "The item is a pizza and they are usually made in an oven."], "image": "train2014/COCO_train2014_000000248744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382413, "question_id": "dhPSXtWJkQEHWfWMuJJWCV", "question": "What side of the picture is the sun on?", "choices": ["right", "top", "left", "bottom"], "correct_choice_idx": 1, "direct_answers": ["top side", "top", "top", "top", "left top", "left", "top", "left", "up", "top"], "difficult_direct_answer": false, "rationales": ["The sun is on the top.", "The sun is shining down from the top as the rays are coming downward.", "There are planes at the bottom. the sun is above them."], "image": "train2014/COCO_train2014_000000382413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226938, "question_id": "diA4Pccisnu8zzVAFVyTmF", "question": "Why are the cats resting?", "choices": ["excited", "tired", "angry", "playful"], "correct_choice_idx": 1, "direct_answers": ["habit", "sleepy", "they're cats", "tired", "tired", "tired", "tired", "sleepy", "tired", "very tired"], "difficult_direct_answer": false, "rationales": ["Cats will sleep a lot and often. they like to relax all the time.", "Two cats are laying together in the sunlight from a window.", "Cats are always lying around and sleeping."], "image": "val2014/COCO_val2014_000000226938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392302, "question_id": "diF2n88oEtEb2Uu4a4P7b8", "question": "The drink on the table is likely from what country?", "choices": ["italy", "turkey", "russia", "poland"], "correct_choice_idx": 0, "direct_answers": ["italy", "italy", "italy", "italy", "italy", "italy", "italy", "italy", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["According to an internet search, peroni is an italian beer.", "It has an italian name.", "Peroni is in italy."], "image": "train2014/COCO_train2014_000000392302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317997, "question_id": "diH6AbdxB9VAn9dd7ZYVR9", "question": "Why are they covering their mouths?", "choices": ["are laughing", "are eating", "conceal conversation", "are coughing"], "correct_choice_idx": 2, "direct_answers": ["disappointment", "disappointment", "secret talking", "conceal conversation", "talking strategies", "reacting", "talking", "coughing", "accident happened", "whispering"], "difficult_direct_answer": true, "rationales": ["They don't want the other team to hear their plans. they are dressed the same and are playing doubles.", "They are obviously trying to talk to each other privately.", "The men are talking and concealing their mouths."], "image": "train2014/COCO_train2014_000000317997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39152, "question_id": "diMgvQWZu3DGHXG32YV5Bb", "question": "What are the kids learning to make?", "choices": ["valentine card", "doll clothes", "kites", "dollhouse"], "correct_choice_idx": 2, "direct_answers": ["kites", "kite", "kites", "kites", "kites", "kites", "kites", "kite", "kite", "kites"], "difficult_direct_answer": false, "rationales": ["Kids have small pieces of wood laid in a cross shape taped to decorative paper.", "The kids are crafting up some kites.", "The children are all learning how to put together kites."], "image": "val2014/COCO_val2014_000000039152.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216910, "question_id": "diPQsrSU3bx3cuMZGJjbhn", "question": "What is the best type of surf board?", "choices": ["soft top", "long wave", "fish board", "fun board"], "correct_choice_idx": 0, "direct_answers": ["osprey", "kialoa", "this type", "plastic", "polished board", "soft top", "waxed", "fiberglass", "roxy", "waxed"], "difficult_direct_answer": true, "rationales": ["The surfboard appears to be padded. the man carrying it does not seem to be struggling at all.", "A soft top is the best kind of board.", "The type is a soft top."], "image": "val2014/COCO_val2014_000000216910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125286, "question_id": "diW7wYosM7KmuNi9rzS4NW", "question": "Which of the following is useful to wear in this weather?", "choices": ["tank top", "boots", "swim trunks", "sandals"], "correct_choice_idx": 1, "direct_answers": ["hat", "coat", "jacket", "hat gloves", "coat", "snow pants", "parka", "coat", "coat", "boots"], "difficult_direct_answer": false, "rationales": ["There is a coating of snow on everything so it is cold. sandals, swim trunks, and tank tops are all things worn in warm weather.", "Boots are useful.", "It is snowing here. you would need something to cover up your body and feet."], "image": "val2014/COCO_val2014_000000125286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453020, "question_id": "dignafdxfWYs9o5BEggdko", "question": "What type of location is this?", "choices": ["residential", "public", "private", "theatrical"], "correct_choice_idx": 1, "direct_answers": ["bathroom", "men's bathroom", "toilet", "public restroom", "public", "bathroom", "bathroom", "public bathroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["There are several stalls and urinals", "There are a number of toilets and urinals. you would not have this many in a private place.", "The bathroom has multiple toilet stalls along with many urinals. this would be found at a location that many people frequent at the same time such as a mall or a sporting arena."], "image": "val2014/COCO_val2014_000000453020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571738, "question_id": "dikyW6YfMCLe6EK5A5Mu6L", "question": "What shape is the pickle cut in?", "choices": ["spear", "chunks", "sliced", "cubed"], "correct_choice_idx": 0, "direct_answers": ["spear", "lengthwise", "spear", "spear", "length", "spear", "spear", "wedge", "spears", "quartered"], "difficult_direct_answer": false, "rationales": ["The pickle shape is long.", "The shape is a spear.", "This is a common shape for this treat"], "image": "val2014/COCO_val2014_000000571738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87588, "question_id": "dixquiVgkNczWTDCeRD9ha", "question": "What facility is shown here?", "choices": ["prison", "bazaar", "school", "market"], "correct_choice_idx": 2, "direct_answers": ["school", "school", "school", "school", "museum", "school", "university", "school", "school", "university"], "difficult_direct_answer": false, "rationales": ["Based on the style of the architecture and the backpack being worn by the person in the foreground answer a is most likely of the options. this does not have the same setting or layout as answers b-d would have.", "A school is depicted.", "He has a lot of bikes and large buildings"], "image": "val2014/COCO_val2014_000000087588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288442, "question_id": "dj28cS4JGkVKWKTJkarx7G", "question": "How did this person get to this point?", "choices": ["ski lift", "taxi", "walk", "uber"], "correct_choice_idx": 2, "direct_answers": ["hike", "walked", "dedication", "walk", "hiked", "walking", "climbing", "walking upward", "climbing", "hiked"], "difficult_direct_answer": false, "rationales": ["The person walked.", "The man seems to have walk on the snow until he reached the high place.", "The person is walking up the mountain."], "image": "val2014/COCO_val2014_000000288442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356651, "question_id": "dj6GXJLTLe5mZfmbbr9w5T", "question": "What is the man attempting to do?", "choices": ["serve", "flip", "sit", "spin"], "correct_choice_idx": 0, "direct_answers": ["serve", "serve", "hit ball", "serve", "hit ball", "serve", "serve ball", "hit ball", "hitting ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The man is attempting to serve a ball falling in midair.", "The only time a person will touch a ball in tennis is at the beginning when they throw it up in the air and hit it to the other side of the court.", "A man is jumping up to hit a ball near the backline on a tennis court."], "image": "train2014/COCO_train2014_000000356651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187822, "question_id": "djE6tuWPTRMYeHDmkqyx8W", "question": "What will the person wearing red shorts do?", "choices": ["quit", "go down", "go home", "skate up"], "correct_choice_idx": 1, "direct_answers": ["skateboard", "skate", "skate", "skate", "skate", "skate", "skate", "skateboard", "skateboard", "go down"], "difficult_direct_answer": false, "rationales": ["The person will go down.", "He is standing on part of his board getting ready to move and do a stunt.", "The person wearing red shorts is standing at the top edge of a steep slope. he is facing the slope and standing on a skateboard."], "image": "val2014/COCO_val2014_000000187822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487467, "question_id": "djNsWXFmyfS2fCW4PAc6oW", "question": "These birds are most probably in what kind of location?", "choices": ["wild", "backyard", "reserve", "zoo"], "correct_choice_idx": 0, "direct_answers": ["park", "meadow", "field", "greenfield", "park", "park", "rainy", "wild", "grassland areas", "fields"], "difficult_direct_answer": false, "rationales": ["These birds don't look to be in cages.", "The birds are wild.", "They look to be in a park walking around free."], "image": "train2014/COCO_train2014_000000487467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433896, "question_id": "djbZRth3mTcHRWDJzYzgyr", "question": "What type of media do the people in the bedroom use to listen to music?", "choices": ["mp3s", "cds", "cassettes", "vinyl records"], "correct_choice_idx": 1, "direct_answers": ["cd player", "cds", "cd player", "cds", "cd player", "radio", "cd player", "cds", "cd player", "cds"], "difficult_direct_answer": false, "rationales": ["The media is cds.", "The people are listening to music on the cd discs.", "Cds are shown for music."], "image": "train2014/COCO_train2014_000000433896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451463, "question_id": "dk4T8R46JeXhHDAsXGXBtu", "question": "What period of the day is it likely to be?", "choices": ["evening", "dawn", "afternoon", "morning"], "correct_choice_idx": 0, "direct_answers": ["night", "evening", "evening", "night", "night", "evening", "evening", "evening", "evening", "evening"], "difficult_direct_answer": false, "rationales": ["The sky is getting dark and the streetlights are coming on.", "It is getting dark but it isn't pitch dark, must be evening.", "The period is evening."], "image": "val2014/COCO_val2014_000000451463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240062, "question_id": "dkAGn2pXgwffNyQdztG6RM", "question": "What type of telephone is he using?", "choices": ["pay", "cellular", "rotary", "landline"], "correct_choice_idx": 1, "direct_answers": ["cell", "cellular", "cell", "cellphone", "cell phone", "cellphone", "mobile phone", "cell phone", "cellphone", "walki talki"], "difficult_direct_answer": false, "rationales": ["Because he's outdoors and not near a power source.", "That is the type of phone the man has.", "This is a cellular phone."], "image": "train2014/COCO_train2014_000000240062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373653, "question_id": "dkEQM9qGgmbk96Lb3DU5cS", "question": "What type of information is on the digital bus sign?", "choices": ["brand", "informational", "directional", "warning"], "correct_choice_idx": 1, "direct_answers": ["destination city", "informational", "route", "location number", "place", "destination number", "stop", "destination location", "destination route", "destination"], "difficult_direct_answer": true, "rationales": ["The sign is informational.", "The route number and place name of the buses destination are quite visible above the windshield. these characters are easily changed digitally as routes and place names change as well.", "The number seen provides the little information of the bus."], "image": "train2014/COCO_train2014_000000373653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218283, "question_id": "dkGLr4Js5qnjKzGmiE8F9Z", "question": "What do they drink?", "choices": ["beer", "tea", "coffee", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["That's the only thing elephants drink.", "Animals need water.", "Their trunks are for drinking water."], "image": "train2014/COCO_train2014_000000218283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546424, "question_id": "dkunMDAZGLK8hLp6G4kDAt", "question": "In which location is this couple?", "choices": ["garage", "church", "outdoors", "market"], "correct_choice_idx": 1, "direct_answers": ["church", "church", "church", "church", "wedding", "church", "chapel", "church aisle", "wedding chapel", "church"], "difficult_direct_answer": false, "rationales": ["The couple is getting married.", "The man and the woman are going down an aisle with pews on the side.", "It's a fact that many people get married in a church. no one gets married in a market or a garage and we can see that this picture is indoors."], "image": "val2014/COCO_val2014_000000546424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403351, "question_id": "dkxCnsgMd68VmwAdwCtJSD", "question": "What type of meat could be harvested from these creatures?", "choices": ["beef", "pork", "mutton", "chicken"], "correct_choice_idx": 2, "direct_answers": ["lamp chops", "lamb", "mutton", "mutton", "mutton", "mutton", "mutton", "veal", "mutton", "mutton"], "difficult_direct_answer": false, "rationales": ["Mutton could be harvested from sheep.", "The other options aren't names for lamb.", "Mutton comes from sheep."], "image": "train2014/COCO_train2014_000000403351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196944, "question_id": "dmG6NkkDGdvQoJD7qSrXpV", "question": "Why is the zebra wearing a hat?", "choices": ["stay warm", "for photograph", "showing off", "fashion"], "correct_choice_idx": 1, "direct_answers": ["publicity stunt", "for show", "amusement", "pictures", "circus", "for photograph", "for picture", "decoration", "pictures", "humor"], "difficult_direct_answer": true, "rationales": ["They are having fun for the camera.", "The zebra is for the photo.", "These animals don't wear clothing normally."], "image": "train2014/COCO_train2014_000000196944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481314, "question_id": "dmKHbWaxTqnUbUbfXH43Ya", "question": "In regular surfing which leg should be first?", "choices": ["both", "foot", "left", "right"], "correct_choice_idx": 3, "direct_answers": ["left", "right", "weaker", "right", "left", "right", "left", "left", "right", "right"], "difficult_direct_answer": false, "rationales": ["The answer is not evident in the picture, but the internet says the right leg is forward on a surfboard.", "The surfing is on the right.", "The right leg should go first in surfing."], "image": "val2014/COCO_val2014_000000481314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151934, "question_id": "dmQwvTCfpysjzYymqSZAFY", "question": "What food item is the color the letters on the top of the bus spell?", "choices": ["orange", "apple", "eggplant", "banana"], "correct_choice_idx": 2, "direct_answers": ["eggplant", "egg plant", "eggplant", "orange", "eggplant", "purple onion", "eggplant", "orange", "mustard", "beetroot"], "difficult_direct_answer": false, "rationales": ["The color is purple on the top of the bus and an eggplant is purple", "The item is eggplant.", "Eggplant is purple in colour."], "image": "train2014/COCO_train2014_000000151934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449634, "question_id": "dmX6NDsQkjEBnbJao9wHps", "question": "What are the overhead wires for?", "choices": ["power trains", "internet", "phone lines", "electrical utility"], "correct_choice_idx": 0, "direct_answers": ["conducting electricity", "power", "power", "electricity", "energy", "electricity", "electricity", "power trains", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["These lines help the trains go by providing electricity.", "These vehicles run on electricity. they are attached to the wires above them.", "The wires are for the trains."], "image": "val2014/COCO_val2014_000000449634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450151, "question_id": "dmvDsaUjsfCP7C9CQ5DnC3", "question": "What type of bench is this?", "choices": ["blue", "park", "bus", "chair"], "correct_choice_idx": 1, "direct_answers": ["park bench", "wood bench", "park", "park bench", "park", "park bench", "park", "wood", "wooden iron", "park"], "difficult_direct_answer": false, "rationales": ["The greenery and nature surrounding the bench indicate that it is in a pastoral setting.", "It is located in a park.", "The bench is in a park."], "image": "val2014/COCO_val2014_000000450151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495252, "question_id": "dmzHcCs7CAaSbQf7eCmutZ", "question": "What is he using to propel himself down the street?", "choices": ["roller blades", "bicycle", "scooter", "skateboard"], "correct_choice_idx": 3, "direct_answers": ["skateboard", "foot skateboard", "skateboard", "skateboard", "scatting", "feet", "feet", "foot", "foot", "skateboard"], "difficult_direct_answer": false, "rationales": ["He is standing on top of a skateboard, and the board has wheels to reduce traction on the ground and propel him forward.", "The man is on a skateboard.", "The board has four wheels and is otherwise flat."], "image": "val2014/COCO_val2014_000000495252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455073, "question_id": "dn8oimW3TasJnPavD5ey6e", "question": "The man on the phone has what kind of facial hair?", "choices": ["man bun", "sideburns", "goatee", "mutton chops"], "correct_choice_idx": 2, "direct_answers": ["goatee", "goatee", "goatee", "goatee", "goatee", "goatee", "goatee", "beard", "goatee", "goatee"], "difficult_direct_answer": false, "rationales": ["His beard and mustache are connected to one another.", "A man has facial hair that surrounds his mouth but does not extend up onto his cheeks.", "It's not a full beard."], "image": "train2014/COCO_train2014_000000455073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293435, "question_id": "dnbA2WLV3PcWzaZvqfhQRm", "question": "Why do cats sleep so much?", "choices": ["helps stalking", "too cold", "too warm", "evolution"], "correct_choice_idx": 3, "direct_answers": ["tired", "evolution", "tired", "their nature", "unknown", "tired", "tired", "table", "conserve energy", "resting"], "difficult_direct_answer": false, "rationales": ["The cats evolved.", "Their ancestors had to sleep a lot to have energy to look for food and domestic cats have kept it up.", "Cats sleep a lot as an evolved trait."], "image": "train2014/COCO_train2014_000000293435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63075, "question_id": "dnbc5bTGjh9ycfJH46q4xk", "question": "What thing here would it be bad to look at directly?", "choices": ["ocean", "sand", "laser light", "sun"], "correct_choice_idx": 3, "direct_answers": ["sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["The sun is bright and can damage your eyes, so it would be bad to look directly at it.", "The sun is the only item listed here which is powerful enough to cause eye damage and also present in the beach image.", "It is harmful to the eyes to stare at solar rays."], "image": "train2014/COCO_train2014_000000063075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478092, "question_id": "do2hQpC4Cxt54eZ46khFxU", "question": "What is causing the distortion to the image?", "choices": ["high winds", "window screen", "photoshop filter", "heavy rain"], "correct_choice_idx": 1, "direct_answers": ["window screen", "glass", "window", "window screen", "bug net", "screen", "window screen", "wide angle", "window screen", "screen"], "difficult_direct_answer": false, "rationales": ["The picture is taken from above at an angle, as in the room of a building. there is probably a screen in the window that they are taking it from.", "It's a grid pattern much closer to the lens than the actual street.", "They are taking the picture through a window with a screen on it."], "image": "train2014/COCO_train2014_000000478092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1053, "question_id": "do2uHeJFvRorgvpF5wL4uL", "question": "If you're in a car coming from this way what is forbidden?", "choices": ["going forwards", "turning right", "waiting", "turning left"], "correct_choice_idx": 3, "direct_answers": ["going", "left turn", "crossing", "turning left", "parking", "continuing", "turning left", "turning left", "going forward", "parking"], "difficult_direct_answer": false, "rationales": ["A car has to yield to the cyclists. they can't run into them.", "A street sign under a traffic light has a line through an arrow pointing to the left.", "There are no left turns as indicated by the sign."], "image": "train2014/COCO_train2014_000000001053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392809, "question_id": "doCXpmxjYvNmAJnh59MRme", "question": "What industry might this animal be associated with?", "choices": ["pottery", "knitting", "mutton", "farming"], "correct_choice_idx": 3, "direct_answers": ["renaissance fair", "horse riding", "horse racing", "equine", "renaissance entertainment", "racing", "farming", "horse racing", "fair", "entertainment industry"], "difficult_direct_answer": true, "rationales": ["The animal is visible and identifiable a horse. horses are commonly used in association with answer a.", "The industry is farming.", "The animal seen is a horse. horses are both kept on farms and used in farming."], "image": "train2014/COCO_train2014_000000392809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270270, "question_id": "doaghktpxc83ikLFk8TKmL", "question": "The video game console in this boy hand is called?", "choices": ["wii remote", "joy stick", "mobile game", "magic stick"], "correct_choice_idx": 0, "direct_answers": ["wii", "wii", "wii", "wii", "wii", "wii remote", "wii", "wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["The boy is holding a remote for the nintendo game console.", "A boy is holding a white controller with a strap.", "This is obvious by the shape of the controller. the other options also don't match."], "image": "train2014/COCO_train2014_000000270270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498137, "question_id": "dobuaqYyUDsYDbMGynyLa3", "question": "What is he doing?", "choices": ["stealing motorcycles", "riding motorcycle", "selling motorcycles", "viewing motorcycles"], "correct_choice_idx": 3, "direct_answers": ["inspecting motorcycle", "viewing motorcycles", "looking motorcycles", "preparing motorcycle", "viewing motorcycles", "shopping", "shopping", "checking bikes", "looking", "standing"], "difficult_direct_answer": false, "rationales": ["He's looking at motorbikes.", "The man is checking out the motorcycles.", "The person is holding a light so most likely he is inspecting the bikes."], "image": "train2014/COCO_train2014_000000498137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269249, "question_id": "dok3eHLWSGL9MedspWfJQz", "question": "The man with the red trunks has what body type?", "choices": ["gangly", "willowy", "svelte", "husky"], "correct_choice_idx": 3, "direct_answers": ["obese", "portly", "strong", "heavyset", "hefty", "overweight", "husky", "travel", "portly", "medium"], "difficult_direct_answer": true, "rationales": ["The man is a husky.", "The man in red trunks is slightly overweight.", "The man with the red trunks is overweight."], "image": "train2014/COCO_train2014_000000269249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248818, "question_id": "dotTUup4VDsVx3GZYfJxmF", "question": "What does the paint job help the vehicle do?", "choices": ["blend in", "stay dry", "avoid rust", "drive fast"], "correct_choice_idx": 0, "direct_answers": ["came", "drive straight", "blend in", "camelflage", "camouflage", "blend in", "camouflage", "camouflage", "blend in", "army"], "difficult_direct_answer": false, "rationales": ["The truck has a camouflaged paint job.", "A truck is painted in camo. camo is used to blend in.", "The camo colour helps it to blend into forestry when in war situations."], "image": "train2014/COCO_train2014_000000248818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129257, "question_id": "doukrfoPJcyC2UfVUiWpiF", "question": "What did the cream on top come out of?", "choices": ["bag", "can", "bottle", "jar"], "correct_choice_idx": 1, "direct_answers": ["can", "can", "can", "tube", "can", "can", "can", "can", "cow", "spray can"], "difficult_direct_answer": false, "rationales": ["Whip cream is kept in a metal container.", "The whipped cream came from a pressurized can.", "The cream is canned."], "image": "train2014/COCO_train2014_000000129257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422326, "question_id": "dpPHTyxyU4tefEa79tT4mX", "question": "How many cylinders does the engine in this custom tricycle have?", "choices": ["six", "eight", "four", "12"], "correct_choice_idx": 1, "direct_answers": ["eight", "three", "three", "five", "four", "four", "five", "two", "eight", "six"], "difficult_direct_answer": false, "rationales": ["The tricycle has 8 cylinders in total.", "The trike has eight noticeable cylinders.", "There are 8 cylinders."], "image": "val2014/COCO_val2014_000000422326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439427, "question_id": "dpf62NHF4PYppxezoxyVtG", "question": "What is the mound of snow used as?", "choices": ["ramp", "seat", "bed", "castle"], "correct_choice_idx": 0, "direct_answers": ["obstacle", "jump", "ramp", "landing ground", "mogul", "trick jumping", "jump point", "ramp", "leg", "jump"], "difficult_direct_answer": false, "rationales": ["It is piled up to mimic the shape of this", "The snow is in a pile. the person on the snow board is jumping off of it.", "The person is snowboarding and jumping."], "image": "val2014/COCO_val2014_000000439427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40130, "question_id": "dpsGTeACbznX8K8XTfnDb7", "question": "What type of meat is in the sandwich?", "choices": ["tuna", "roast beef", "ham", "chicken"], "correct_choice_idx": 0, "direct_answers": ["tuna", "tuna", "tuna", "tuna", "tuna", "tuna", "tuna", "bacon", "tuna", "bacon"], "difficult_direct_answer": false, "rationales": ["The receipt has the food item listed on it.", "It has tuna in it.", "Tuna as the receipt shows the prove."], "image": "train2014/COCO_train2014_000000040130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11737, "question_id": "dpsnhkCwqGJz5WRaaetDE7", "question": "Where does this player stand?", "choices": ["stands", "dugout", "first base", "pitcher's mound"], "correct_choice_idx": 3, "direct_answers": ["pitcher's mound", "on mound", "pitcher's mound", "pitcher's mound", "field", "pitcher's mound", "field", "mound", "pitcher's mound", "mound"], "difficult_direct_answer": false, "rationales": ["The person is ready to throw the ball. he is standing in the dirt and surrounded by grass.", "Here we see a baseball player preparing to throw a ball from an elevated dirt mound in the center of the field.", "He's obviously the pitcher given that he's throwing the ball."], "image": "train2014/COCO_train2014_000000011737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356116, "question_id": "dpuZTk2DXrSKEULKWmP7wF", "question": "Where has this person been most recently?", "choices": ["inland", "water", "undersea", "air"], "correct_choice_idx": 0, "direct_answers": ["surfing", "waxing salon", "beach", "surfing", "water", "beach", "surfing", "inland", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The person was inland.", "This person has most recently been inland.", "They are not wet so have not been in the water yet."], "image": "train2014/COCO_train2014_000000356116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241281, "question_id": "dqTnVm2TPxRtVyk4eNHudg", "question": "When was this picture captured?", "choices": ["1201", "2013", "2001", "2020"], "correct_choice_idx": 1, "direct_answers": ["2013", "2013", "2013", "2013", "during rain", "afternoon", "2013", "2013", "daytime", "2013"], "difficult_direct_answer": false, "rationales": ["The copyright text at the bottom left indicates when the photo was taken.", "The bus is from the 2013 era.", "The date is printed on the picture in the bottom left hand corner."], "image": "train2014/COCO_train2014_000000241281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335624, "question_id": "dqYs9jptCFhB2dPPBZnhqT", "question": "The cows main food source for nutrition is brought out in what shape?", "choices": ["boxed", "bundles", "taped", "barrels"], "correct_choice_idx": 3, "direct_answers": ["cylinder", "bales", "hay", "rectangle", "shredded", "bales", "round", "grass", "long grass", "barrels"], "difficult_direct_answer": true, "rationales": ["The hay is brought in barrels.", "Hay is usually bundled before they are brought out.", "It's brought out in barrels."], "image": "train2014/COCO_train2014_000000335624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164224, "question_id": "dqdjWWLXP9CNnxuanDBFDk", "question": "What is the paper item on the bed?", "choices": ["hotel letter", "key holder", "hotel bill", "welcome card"], "correct_choice_idx": 3, "direct_answers": ["paper note", "note", "newspaper", "welcome card", "card", "card", "book", "napkin", "card", "bussiness materials"], "difficult_direct_answer": false, "rationales": ["This looks like a hotel room, based on the style and the luggage in the picture. when arriving in a hotel room there is often a piece of paper of this style on the bed.", "The item is a card.", "The paper item is a welcome greeting."], "image": "train2014/COCO_train2014_000000164224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564283, "question_id": "dqgZJWq8vUdEPoniTkJ2sv", "question": "How many ducklings stickers are there?", "choices": ["four", "one", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "3 stickers", "three", "3 stickers", "three", "3 stickers"], "difficult_direct_answer": false, "rationales": ["There are three duck stickers.", "There are three duckling stickers in a row on the top of the fridge.", "There are three stickers."], "image": "val2014/COCO_val2014_000000564283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424300, "question_id": "dqoH6kHcfYTZfoxJ2jsigY", "question": "What type of area is shown?", "choices": ["field", "jungle", "beach", "forest"], "correct_choice_idx": 3, "direct_answers": ["woods", "forest", "forest", "forest", "forest", "forest", "woods", "forest", "forest", "forest"], "difficult_direct_answer": false, "rationales": ["A forest is shown as there are many trees.", "The area is a forest.", "This is a forest area where there are birds flying around the trees."], "image": "train2014/COCO_train2014_000000424300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479621, "question_id": "dqpmMVQPqco3XkJ754s8vV", "question": "How is the girl related to the Man?", "choices": ["mother", "his daughter", "wife", "sister"], "correct_choice_idx": 1, "direct_answers": ["daughter", "daughter", "his daughter", "daughter", "his daughter", "daughter", "daughter", "his daughter", "his daughter", "daughter"], "difficult_direct_answer": false, "rationales": ["Due to the age difference between these two on the board it is most likely he is her father out of the relations listed.", "The girl is younger than the man.", "She's very small compared to the man"], "image": "train2014/COCO_train2014_000000479621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270570, "question_id": "dr7KXBaiUoYjGVbBtBmSXh", "question": "What type of phone can this person use at the desk?", "choices": ["cellular", "landline", "payphone", "rotary"], "correct_choice_idx": 1, "direct_answers": ["void phone", "landline", "cord phone", "landline", "touch dial", "landline", "landline", "landline", "landline", "desk phone"], "difficult_direct_answer": false, "rationales": ["The person could use a landline phone.", "The phone is attached to the wall and is a typical landline phone.", "The phone is a landline."], "image": "val2014/COCO_val2014_000000270570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368521, "question_id": "drPStbjhoFXzoccxj4gRiS", "question": "Why is he wearing a glove?", "choices": ["health", "fashion", "catching", "warmth"], "correct_choice_idx": 2, "direct_answers": ["catching", "catching ball", "catch ball", "to catch", "catch ball", "catch baseball", "protect hand", "baseball player", "catching ball", "catch ball"], "difficult_direct_answer": false, "rationales": ["This player wears a glove that is conducive to catching baseballs safely.", "He is playing baseball.", "This gives more surface area to grab a ball"], "image": "train2014/COCO_train2014_000000368521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269103, "question_id": "dsSfW6aVgehACGQ8xZjVj2", "question": "What type of doll is in the front of the picture?", "choices": ["barbie", "beanie babies", "bratz", "elsa"], "correct_choice_idx": 2, "direct_answers": ["bratz doll", "plastic", "suzie doll", "plastic", "teenage doll", "creepy", "young girl", "bratz", "girl doll", "tahiti doll"], "difficult_direct_answer": true, "rationales": ["The big eyes and face shape indicate which type of popular doll is being held.", "The doll is a bratz.", "The doll has features and a design consistent with answer a."], "image": "train2014/COCO_train2014_000000269103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19090, "question_id": "dsbaSR7tC9HcXW36YXVLcx", "question": "What are they capturing?", "choices": ["cute dogs", "dangerous animals", "scenery", "each other"], "correct_choice_idx": 2, "direct_answers": ["city views", "scenery", "city sites", "picture", "images", "photos", "selfies", "city scape", "pictures", "city scenes"], "difficult_direct_answer": true, "rationales": ["Two people are aiming their cameras at the view from an elevated position where the scenery is visible.", "The other options don't apply to the actions in the image.", "They are photographing the scenic view."], "image": "val2014/COCO_val2014_000000019090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395259, "question_id": "dscqdyaeVNSnMpuL5G3keU", "question": "Where is this room likely located?", "choices": ["dorm", "house", "hospital", "hotel"], "correct_choice_idx": 3, "direct_answers": ["hotel", "hotel", "hotel room", "hotel", "hotel", "hotel", "hotel room", "hotel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["It has a lot of luggage which is being used to store belongings rather than unpacking them.", "There is luggage in the room.", "There is a lot of luggage and a tiny fridge"], "image": "train2014/COCO_train2014_000000395259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248911, "question_id": "dska9LhMHbXJNSMRnjEWcn", "question": "What type persons are shown here?", "choices": ["customer service", "reinactors", "phone workers", "salesmen"], "correct_choice_idx": 1, "direct_answers": ["horse", "knights", "knight", "running razes", "reinactors", "knights", "knights", "knights", "knights", "armour"], "difficult_direct_answer": false, "rationales": ["The word is \"reenactors.\" they're usually history buffs.", "Medieval times are hundreds of years ago, and they are dressing up and participating in a fun event around that theme.", "These people are historical reinactors of the medieval period."], "image": "val2014/COCO_val2014_000000248911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221803, "question_id": "dsqM5VFx72yBZrw5zwNTSE", "question": "How many clock faces can be seen on the clock tower?", "choices": ["two", "four", "one", "three"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two clock faces are on the front and side.", "There are two clocks on the tower.", "A clocktower has clocks on two sides that face the street."], "image": "train2014/COCO_train2014_000000221803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195296, "question_id": "dsvWmaz5qFtPtiG7Pa6CjY", "question": "What are the bananas doing on the yellow cloth?", "choices": ["being dried", "being cooked", "being sold", "being eaten"], "correct_choice_idx": 2, "direct_answers": ["ripening", "sitting", "on display", "keep", "sitting", "ripening", "displayed", "ripening", "being sold", "ripening"], "difficult_direct_answer": false, "rationales": ["Fruit stands are easy to set up and allows consumers the ease of purchasing from local vendors.", "They are on display in a market stall.", "They're for sale."], "image": "train2014/COCO_train2014_000000195296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265971, "question_id": "dt2R9gQhmekkFuHX7PSVff", "question": "What is he doing?", "choices": ["throwing racquet", "hitting ball", "catching ball", "serving ball"], "correct_choice_idx": 3, "direct_answers": ["hitting ball", "serving", "serving", "serving", "serving", "serving", "smashing", "serving", "serving ball", "serving"], "difficult_direct_answer": false, "rationales": ["The over head strike this athlete is performing would be called a serve in tennis.", "The tennis player is stretching his racket up high to hit a ball to serve it.", "He is jumping in the air, indicating he just threw the ball in the air to get the most momentum for hitting the ball across the net."], "image": "val2014/COCO_val2014_000000265971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515234, "question_id": "dt4H9N6ik3fMfDmrx9puaQ", "question": "The fence is placed in front of what part of the stadium to stop the pitchers fastball from hitting it?", "choices": ["fans", "all correct", "stands", "audience"], "correct_choice_idx": 1, "direct_answers": ["home plate", "backstop", "home plate", "all correct", "stands", "bleachers", "playing field", "corner", "bench", "dugout"], "difficult_direct_answer": true, "rationales": ["The ball could hurt people and damage things if it flew out of the field.", "The fence is to protect the audience who sits in the stadium seats.", "The fence keeps everything and everyone safe from fastballs."], "image": "train2014/COCO_train2014_000000515234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278673, "question_id": "dt4emSNnf3N3PoyidJ4FKU", "question": "What is the freshest food available to this woman?", "choices": ["flour", "chips", "eggs", "ice cream"], "correct_choice_idx": 2, "direct_answers": ["chicken", "turkey", "chicken", "chicken", "eggs", "eggs", "chicken", "eggs", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["There are many live chickens walking around, and chickens can lay eggs for humans to consume.", "She is elderly and so it would be unsafe for her to kill a chicken, but there are eight chickens around her and chickens produce these, which are food for humans.", "They can collect them from the chickens every day."], "image": "train2014/COCO_train2014_000000278673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147746, "question_id": "dtHAzRkRxNYPdv9pVujti4", "question": "Which speed during the turn caused this to happen?", "choices": ["stopping", "decelerating", "high", "low"], "correct_choice_idx": 2, "direct_answers": ["high speed", "high", "sudden speed", "65", "high speed", "high speed", "high speed", "high", "too much", "high speed"], "difficult_direct_answer": false, "rationales": ["The truck is tipped over as if speeding.", "Turning very fast can cause a vehicle to flip.", "They were going too fast around the corner"], "image": "train2014/COCO_train2014_000000147746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261999, "question_id": "dtJvz7CVsqRMqUrpTYkrUa", "question": "What are the two feeling right now?", "choices": ["attraction", "disgust", "amusement", "hate"], "correct_choice_idx": 0, "direct_answers": ["love", "love", "attraction", "love", "love", "love lust", "love", "love", "friendly", "love"], "difficult_direct_answer": false, "rationales": ["The people are embracing in a loving hug.", "The couple is kissing which presumably someone would only do with someone that they like.", "They are obviously in \"mid-kiss\" so that's a sure sign of attraction. people have been kissing each other like that for all time."], "image": "val2014/COCO_val2014_000000261999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28759, "question_id": "dtPSZSszEz5QuN6xxF69j9", "question": "What number comes sequentially after the number on the big sign?", "choices": ["20", "25", "28", "42"], "correct_choice_idx": 3, "direct_answers": ["42", "forty two", "forty two", "42", "42", "forty two", "42", "forty-two", "42", "42"], "difficult_direct_answer": false, "rationales": ["The number is 42.", "The number 42 comes after 41.", "The sign says 41. 42 comes after."], "image": "train2014/COCO_train2014_000000028759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325328, "question_id": "dthpEhMERtpW7DXHjpH3hC", "question": "The expensive ingredients suggest this is what type of pizza restaurant?", "choices": ["middle-quality", "fine dining", "low-quality", "high-quality"], "correct_choice_idx": 3, "direct_answers": ["high-quality", "upscale", "italian", "upscale", "upscale", "upscale", "italian", "upscale", "italian", "italian"], "difficult_direct_answer": false, "rationales": ["When their are expensive ingredients being used on a menu item, it is likely associated with quality.", "Only high quality restaurants use pricy ingredients.", "The ingredients are high quality."], "image": "val2014/COCO_val2014_000000325328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214574, "question_id": "dtuRW2pH3Bsm3vjNXquLLm", "question": "Which two of the giraffes from left to right appear to be the youngest ones?", "choices": ["left", "right", "end", "middle"], "correct_choice_idx": 3, "direct_answers": ["middle", "left", "middle two", "middle two", "middle", "two center", "second", "middle", "middles", "second"], "difficult_direct_answer": false, "rationales": ["The two giraffes in the middle are visibly the smallest which likely implies their age.", "The two giraffes in between the other two are smaller than the other giraffes.", "The two in the center are smaller."], "image": "val2014/COCO_val2014_000000214574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558253, "question_id": "duEJBdEXzJZQTUxUyJF384", "question": "Where are these cattle most likely headed?", "choices": ["mexico", "auction", "sears", "disco"], "correct_choice_idx": 1, "direct_answers": ["parade", "farm", "rodeo", "pasture", "ranch", "farm", "auction", "fair", "slaughterhouse", "to market"], "difficult_direct_answer": true, "rationales": ["The man looks like he's going there to sell off the animals.", "The cattle are auctioned.", "The animals are cows which are often sold in auction."], "image": "val2014/COCO_val2014_000000558253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376371, "question_id": "duEWy43kvMTKA2L8VXh8P7", "question": "Manchego and Roquefort are cheeses got from which animal's milk?", "choices": ["deer", "cow", "sheep", "goat"], "correct_choice_idx": 2, "direct_answers": ["drinking", "sheep", "sheep", "sheep", "sheep", "drinking", "sheep", "sheep", "sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["Sheeps milk produce these kinds of cheeses.", "The cheese comes from sheep.", "The sheep are ones that are seen on the camera."], "image": "val2014/COCO_val2014_000000376371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561028, "question_id": "duGBDaz2wFarT9zq6oS6cd", "question": "What do the sunglasses worn here serve as?", "choices": ["trauma protection", "nothing", "glare protection", "fashion only"], "correct_choice_idx": 2, "direct_answers": ["protection", "protection", "glare protection", "eye protection", "eye protectors", "shade", "shade eyes", "aides", "block sun", "sun protection"], "difficult_direct_answer": true, "rationales": ["Whether at a tennis match or driving, sunglasses are used to prevent glare from the sun.", "They can see clearer with sunglasses.", "The glasses help protect from glare."], "image": "train2014/COCO_train2014_000000561028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208718, "question_id": "dv96tQAD9f5YmscwYNTqWj", "question": "What are the black receptacles used to collect?", "choices": ["trash", "candy", "water", "plants"], "correct_choice_idx": 0, "direct_answers": ["trash", "trash", "trash", "trash", "trash", "trash", "trash", "trash", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["The black objects are used for trash.", "These are used to collect any garbage you may have instead of throwing it on the ground.", "People throw their garbage in those."], "image": "train2014/COCO_train2014_000000208718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293741, "question_id": "dvPEQu3wuYd5pmYgonQRQ4", "question": "What is the state of the colors here?", "choices": ["inverted", "normal", "black/white", "super saturated"], "correct_choice_idx": 2, "direct_answers": ["grayscale", "black white", "black white", "black white", "black white", "black white", "black white", "sepia", "black", "black/white"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this type of photograph.", "This is all in black and white.", "The colors are black and white."], "image": "train2014/COCO_train2014_000000293741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324952, "question_id": "dveJNf2coCJHFMg938yPLj", "question": "Where was the castle themed birthday cake most likely created?", "choices": ["restaurant", "food bank", "home kitchen", "bakery"], "correct_choice_idx": 3, "direct_answers": ["bakery", "bakery", "bakery", "bakery", "bakery", "bakery", "cake", "bakery", "cake shop", "america"], "difficult_direct_answer": false, "rationales": ["Cakes are made in bakeries.", "It looks to be professionally baked, and professional bakers typically work in a bakery.", "This cake is decorated with fondant and has decorative elements that are usually considered too complicated to do at home. it is not a standard-looking cake that would be widely available."], "image": "train2014/COCO_train2014_000000324952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394874, "question_id": "dwASYYHnUi8X5qb2Pr3MUa", "question": "How are the elephants most likely to cross this river?", "choices": ["swim", "walk across", "ride boat", "fly"], "correct_choice_idx": 1, "direct_answers": ["walking", "swim", "walk", "walking", "walk", "walk", "walk across", "ten", "walking", "walking across"], "difficult_direct_answer": false, "rationales": ["Elephants cannot fly. the water is too shallow for the elephants to swim in.", "The river is shallow. the elephants are tall.", "The river at this location is extremely shallow, and large rocks are seen protruding from the bottom. we can tell by where the water reaches the elephants' feet that it's not more than a few inches deep."], "image": "train2014/COCO_train2014_000000394874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292375, "question_id": "dwJrnDcn77cnRdAKbm7nGS", "question": "Why does the man wear a yellow vest?", "choices": ["visibility", "dress code", "camouflage", "fashion"], "correct_choice_idx": 0, "direct_answers": ["worker", "visibility", "visibility", "airport worker", "visibility", "direct", "safety", "visibility safety", "airport worker", "caution"], "difficult_direct_answer": false, "rationales": ["So people can see him.", "The man needs to stay safe.", "Yellow is easy to see so it is a color worn in high risk areas."], "image": "train2014/COCO_train2014_000000292375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96208, "question_id": "dwKJzTFQZ2niP6gALx6LSu", "question": "What is the man in the red jacket doing?", "choices": ["descending", "rolling", "ascending", "falling"], "correct_choice_idx": 0, "direct_answers": ["skiing", "skiing", "skiing", "skiing", "descending", "skiing", "skiing", "skiing", "playing", "skiing"], "difficult_direct_answer": false, "rationales": ["The man is going down.", "The man is going downhill.", "He is going downhill so he is descending. you cannot ski uphill and he is skiing without falling or rolling since he is on his feet."], "image": "train2014/COCO_train2014_000000096208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332461, "question_id": "dwWhkJdo3A4qtojF9si8hQ", "question": "What part of the vehicle is being shown?", "choices": ["right", "back", "front", "left"], "correct_choice_idx": 1, "direct_answers": ["trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "back", "trunk", "trunk", "trunk"], "difficult_direct_answer": false, "rationales": ["The trunk of the van is being viewed.", "The trunk is open so you can tell it is the back of the car.", "This is the trunk of the vehicle."], "image": "val2014/COCO_val2014_000000332461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457068, "question_id": "dx4DYPVcyu24vzt4ASFpXK", "question": "What is the beverage in the glass with the lemon?", "choices": ["iced tea", "water", "soda pop", "milk"], "correct_choice_idx": 0, "direct_answers": ["iced tea", "iced tea", "iced tea", "ice tea", "tea", "iced tea", "iced tea", "tea", "tea", "iced tea"], "difficult_direct_answer": false, "rationales": ["The glass has iced tea.", "The glass with the lemon has iced tea since it's brown.", "Tea has lemons."], "image": "train2014/COCO_train2014_000000457068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334075, "question_id": "dx8yB5TEhfwhjiibFVgXbj", "question": "What state is the batter's team located in?", "choices": ["new jersey", "minnesota", "new york", "illinois"], "correct_choice_idx": 1, "direct_answers": ["texas", "minnesota", "minnesota", "minnesota", "pittsburg", "minnesota", "minnesota", "minnesota", "minneapolis", "minnesota"], "difficult_direct_answer": false, "rationales": ["The twins are a baseball team that play in minnesota.", "They are the twins.", "The batter's shirt indicates that he plays for the twins, not the yankees, mets, or cubs."], "image": "val2014/COCO_val2014_000000334075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399628, "question_id": "dxDoFhj2G8eZFwSAyMuVBZ", "question": "What does the round platform shown here do?", "choices": ["sit still", "turn upsidedown", "nothing", "rotate"], "correct_choice_idx": 3, "direct_answers": ["train", "rotate", "display locomotive", "display train", "turns train", "rotates", "train", "display train", "protect ground", "rotate"], "difficult_direct_answer": false, "rationales": ["The platform rotates.", "Children look at a train that is on display on a round platform.", "The round platform is for rotation."], "image": "val2014/COCO_val2014_000000399628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153727, "question_id": "dxMLmG2yKpMB9jWprEeAJs", "question": "What has caused some of the sheep to have pink fur?", "choices": ["crayons", "pencils", "dye", "rain"], "correct_choice_idx": 2, "direct_answers": ["identification", "marker", "paint", "paint", "property value", "human painted", "blood", "dye", "bleeding", "dye"], "difficult_direct_answer": false, "rationales": ["The sheep have been dyed.", "They are marked according to health status", "Farmers sometimes dye the hair of their animals to tell which is which."], "image": "train2014/COCO_train2014_000000153727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87328, "question_id": "dxNkbfyq9MWoRzSULME6aZ", "question": "What happened to the background?", "choices": ["edited out", "slightly overexposed", "blurred", "left unchanged"], "correct_choice_idx": 0, "direct_answers": ["removed", "white", "edited out", "cut out", "removed digitally", "removed/deleted", "erased", "edited out", "removed", "deleted"], "difficult_direct_answer": false, "rationales": ["The background is completely white.", "The background is edited.", "There is a background that is edited out in the photo."], "image": "val2014/COCO_val2014_000000087328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245049, "question_id": "dxmWJec8AYWXRZr8S5zZp2", "question": "Red color indicates what in traffic signal?", "choices": ["stop", "none", "start", "go"], "correct_choice_idx": 0, "direct_answers": ["stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop"], "difficult_direct_answer": false, "rationales": ["The sign is red and it has the word stop on it.", "Red always means don't go.", "A red sign in an octagon shape is near a traffic light and has the word stop on it."], "image": "val2014/COCO_val2014_000000245049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65162, "question_id": "dxzhUuXstkyMhq3USSU8DN", "question": "What are the two black players doing here?", "choices": ["high fiving", "yelling", "protesting", "throwing"], "correct_choice_idx": 0, "direct_answers": ["high-fiving", "high fiving", "high fives", "high fiving", "high five", "high fiving", "exchanging positions", "high five", "clapping hands", "high fiving"], "difficult_direct_answer": false, "rationales": ["Two baseball players raise their hands to each other as they pass on a baseball diamond.", "Each has a hand up to slap the other's.", "Two men in baseball uniforms are smacking their hands together in congratulations."], "image": "train2014/COCO_train2014_000000065162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509034, "question_id": "dy87mxc6vrFsNBdrDmXqTv", "question": "This man is in a similar profession to what character?", "choices": ["homer simpson", "pikachu", "chef boyardee", "garfield"], "correct_choice_idx": 2, "direct_answers": ["remy", "gordon ramsey", "chef", "alfredo linguini", "chef boyardee", "chef", "carl casper", "gordon ramsey", "chef", "iron chef"], "difficult_direct_answer": false, "rationales": ["The person is holding a pot in a kitchen setting which likely means his profession is in cooking which would be similar to answer a.", "He is in the kitchen cooking.", "The man is in the kitchen and wearing an apron holding cooking tools and would therefore likely be a chef. chef boyardee is a character that is also a chef."], "image": "train2014/COCO_train2014_000000509034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31286, "question_id": "dyV84HMM9EDLG9ZnUMJe83", "question": "How do you get potatoes to this consistency?", "choices": ["dicing", "deep frying", "mashing", "slicing"], "correct_choice_idx": 2, "direct_answers": ["mashing", "mashing", "smash", "mash", "mash", "mash", "mash", "mash", "mash them", "mash them"], "difficult_direct_answer": false, "rationales": ["Potatoes are getting to this consistency by mashing.", "This can be done with a simple fork or by using a hand-held mixer.", "You have to mash them to get mashed potatoes."], "image": "train2014/COCO_train2014_000000031286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442685, "question_id": "dydtuLmNfGWXCGjNkMdeX2", "question": "What is the man relying on to hold him up?", "choices": ["two horses", "person", "board", "string"], "correct_choice_idx": 0, "direct_answers": ["horses", "horse", "horse", "two horses", "horse", "horses", "two horses", "horses", "balance", "horses"], "difficult_direct_answer": false, "rationales": ["The man is standing on two white horses as they support him.", "This is obvious given that they're right under him.", "The man is on two horses."], "image": "train2014/COCO_train2014_000000442685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559877, "question_id": "dyxccwHfkJbLM3bsNGLAEi", "question": "Why is there a stick stuck in the cheeseburger?", "choices": ["appearance", "joke", "hold together", "check temp"], "correct_choice_idx": 2, "direct_answers": ["design", "hold together", "keep together", "keep together", "hold together", "hold together", "keep together", "hold together", "maintain structure", "melting"], "difficult_direct_answer": false, "rationales": ["There is a cheeseburger in the picture with a large toothpick going through it. some restaurants do this for the reason of holding entire burger together.", "There is some cheese sticking the cheeseburger together.", "The stick holds it together."], "image": "train2014/COCO_train2014_000000559877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371958, "question_id": "dz7wsSM4XJfwbMA5GSFUcH", "question": "What is the outer layer of the building made of?", "choices": ["stone", "steel", "gold", "wood"], "correct_choice_idx": 0, "direct_answers": ["bricks", "brick", "bricks", "stone", "stone", "brick", "brick", "bricks", "brick", "stone"], "difficult_direct_answer": false, "rationales": ["The outer layer of the building is stone.", "You can see each individual block on the building.", "The layer is stone."], "image": "val2014/COCO_val2014_000000371958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281665, "question_id": "dzBzERx5KhTDwX7GUCzgXF", "question": "What is inside the seating directly under the support beam?", "choices": ["pillow", "metal", "wood", "beans"], "correct_choice_idx": 3, "direct_answers": ["foam pellets", "bean bag", "polystyrene", "cushioning", "bean bag", "cover", "beans", "beanbag chair", "beans", "beans"], "difficult_direct_answer": false, "rationales": ["The support beam is in the center of the room. at the bottom of it is a beans bag.", "Beans are inside.", "Beans fill the seating."], "image": "train2014/COCO_train2014_000000281665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45031, "question_id": "dzrQTtDutJWAC7Sj6MS2P7", "question": "What use is the fencing shown here?", "choices": ["boundary guideline", "decorative", "livestock containment", "crop protection"], "correct_choice_idx": 0, "direct_answers": ["separation", "prevent falling", "protection", "mark dropoffs", "warning", "boundary guideline", "boundary", "stop skiers", "separating", "define path"], "difficult_direct_answer": true, "rationales": ["People need to stay on one side of the fence to go down the hill.", "An orange fence lines an area near where people are skiing.", "The fence keeps skiers in the boundary."], "image": "train2014/COCO_train2014_000000045031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142129, "question_id": "dzurQuAksofxwxKwMSJ4ir", "question": "Why is there a stuffed toy on the parking meter?", "choices": ["to eat", "is marker", "stay warm", "for sale"], "correct_choice_idx": 1, "direct_answers": ["someone goofing", "for fun", "decoration", "unknown", "is marker", "left there", "humor", "unknown", "doll lost", "humor"], "difficult_direct_answer": false, "rationales": ["The stuffed toy is marking the meter.", "A small plush animal is on top of a parking meter.", "The stuffed animal is acting as a placeholder so the person knows where to go."], "image": "val2014/COCO_val2014_000000142129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270809, "question_id": "dzvLkAMR2vWs7zNNRC2MiL", "question": "Why is he on the ground?", "choices": ["fell", "is sliding", "likes ground", "was pushed"], "correct_choice_idx": 1, "direct_answers": ["play", "circe", "sliding", "pass base", "is sliding", "sliding", "sliding", "attempting slide", "playing", "sliding"], "difficult_direct_answer": false, "rationales": ["He is sliding on the ground to make it to the next goal.", "He is trying to reach the base without being hit and tagged out.", "It is common for baseball players to do this when they are about to reach a new base."], "image": "train2014/COCO_train2014_000000270809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301554, "question_id": "e25WtTi9ZynFZoQTSvGUGf", "question": "What other animal is this animal traditionally afraid of?", "choices": ["tigers", "rhinos", "cats", "mice"], "correct_choice_idx": 3, "direct_answers": ["mice", "mice", "mice", "mice", "mice", "mouse", "mouse", "mouse", "mice", "tiger"], "difficult_direct_answer": false, "rationales": ["The animal is a mouse.", "Elephants, though large, tend to get scared of really, really, really fast small things.", "Elephants are known to be scared of mice."], "image": "train2014/COCO_train2014_000000301554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152505, "question_id": "e28HgCaht49edRSZsEbjkz", "question": "Why are all the women wearing hats?", "choices": ["fashion", "dress code", "visibility", "warmth"], "correct_choice_idx": 3, "direct_answers": ["head warmth", "warmth", "snow", "cold", "its cold", "keep warm", "warm retention", "winter weather", "cold outside", "cold weather"], "difficult_direct_answer": true, "rationales": ["Woman are walking through snow in hats. it is cold when there is snow around or the snow would melt.", "It is winter and it is cold outside.", "The women want warmth."], "image": "train2014/COCO_train2014_000000152505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363764, "question_id": "e2Ek5ikLMAYifcQqVbn25w", "question": "Why are the kids reaching for the basket?", "choices": ["grabbing food", "getting balls", "to defecate", "to throw"], "correct_choice_idx": 1, "direct_answers": ["getting balls", "grab balls", "playing", "getting balls", "for balls", "balls", "balls", "balls", "tennis ball", "get balls"], "difficult_direct_answer": false, "rationales": ["A bunch of kids are on a tennis court with adults and tennis rackets. the kids are reaching towards a bucket in the middle of the court. tennis balls are needed to play tennis.", "The kids want balls.", "The kids want to get the balls."], "image": "train2014/COCO_train2014_000000363764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91833, "question_id": "e3858rQkijqMuJctPg5wkN", "question": "What kind of sign is shown?", "choices": ["regulatory", "protest", "brand", "directional"], "correct_choice_idx": 1, "direct_answers": ["occupy oakland", "protest", "protest", "protest", "hella occupy", "sucks", "protest sign", "occupy oakland", "goverment signs", "protest"], "difficult_direct_answer": false, "rationales": ["The sign is for protests.", "The sign has \"occupy\" on it for occupy wall street.", "The sign says occupy oakland."], "image": "val2014/COCO_val2014_000000091833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291366, "question_id": "e3NsLUbwSybrnWZ7fZ7M5q", "question": "What is the man in the chair known as?", "choices": ["line judge", "referee", "adjudicator", "umpire"], "correct_choice_idx": 1, "direct_answers": ["referee", "announcer", "referred", "chair umpire", "umpire", "referee", "chair umpire", "umpire", "referee", "line judge"], "difficult_direct_answer": false, "rationales": ["The man in the chair is assessing the fairness of the plays as the referee.", "That man is in charge of the scores.", "The man is judging the event."], "image": "train2014/COCO_train2014_000000291366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394635, "question_id": "e3qroqBtXP5eTPz2BwizFG", "question": "What's the name of the man at the front of the vehicle?", "choices": ["engineer", "conductor", "boss", "expert"], "correct_choice_idx": 0, "direct_answers": ["engineer", "conductor", "conductor", "conductor", "engineer", "conductor", "conductor", "conductor", "engineer", "engineer"], "difficult_direct_answer": false, "rationales": ["The person who pilots the train goes by a number of names but generally these types of vehicles they are engineers.", "The driver is called an engineer or conductor", "He drives the train."], "image": "val2014/COCO_val2014_000000394635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212199, "question_id": "e3tY4yaXNXcyMDbef2dCsT", "question": "The colors of the bottom stripe on the vehicle resemble what flag?", "choices": ["poland", "nepal", "spain", "mexico"], "correct_choice_idx": 0, "direct_answers": ["latvia", "china", "american", "denmark", "china", "american", "usa", "latvia", "poland", "japan"], "difficult_direct_answer": false, "rationales": ["It is white with a red stripe.", "The country is poland.", "The polish flag has a red and and white stripe."], "image": "train2014/COCO_train2014_000000212199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522101, "question_id": "e3u74aYHiKJiyAz7AQbwqD", "question": "What is the purpose of the bananas?", "choices": ["to discard", "to decorate", "to mash", "to feritize"], "correct_choice_idx": 1, "direct_answers": ["decorative clothing", "decoration", "decoration", "to decorate", "to eat", "food", "decoration", "honor dead", "offering", "to eat"], "difficult_direct_answer": false, "rationales": ["The bananas decorate the small statue.", "There is a fake skull next to it and other leaves to make the whole thing look cool or pretty.", "They are placed among other items on the table"], "image": "train2014/COCO_train2014_000000522101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500271, "question_id": "e423nNyLE9g8gPn4W4sZ2x", "question": "What does the lady use the toothbrush for?", "choices": ["grooming", "brushing teeth", "tickling", "attacking"], "correct_choice_idx": 0, "direct_answers": ["comb fur", "grooming parrot", "combing bird", "grooming", "comping", "comb bird", "grooming bird", "pet brush", "brushing bird", "brush bird"], "difficult_direct_answer": true, "rationales": ["The lady is using the toothbrush to groom a bird.", "She is cleaning the bird with a small brush.", "The fibers are easier to use on bird feathers."], "image": "train2014/COCO_train2014_000000500271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223751, "question_id": "e45scLifUuvM5tVxZ44Xzs", "question": "How is the surfboard likely connected to this surfer?", "choices": ["collar", "necklace", "sonar powered", "ankle rope"], "correct_choice_idx": 3, "direct_answers": ["cable", "ankle", "cable", "ocean", "band", "stomach", "ankle rope", "leg rope", "tether", "his feet"], "difficult_direct_answer": true, "rationales": ["A rope is around their body so it does not get away from them.", "This is a tether so the board doesn't get lost", "They tie it to their ankle so it will not get away from them."], "image": "val2014/COCO_val2014_000000223751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85813, "question_id": "e4YBkTvD3yX2UTMWuQUocz", "question": "What temperature will lengthen the use of this ski area?", "choices": ["heat", "warming sun", "freezing", "heavy rain"], "correct_choice_idx": 2, "direct_answers": ["snow", "forty degrees", "cold temperatures", "freezing", "below freezing", "below freezing", "cold", "cold", "freezing", "low"], "difficult_direct_answer": false, "rationales": ["People are skiing on a snowy mountain. freezing temperatures are required to keep snow frozen.", "If it is cold out then the snow will stay longer.", "Freezing temperatures is much better for skiing than hot weather and heavy rain will harm the ski conditions."], "image": "val2014/COCO_val2014_000000085813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147331, "question_id": "e4xg3JiAyMgFvkpL3GmMkc", "question": "What is the man doing?", "choices": ["singing", "walking", "getting directions", "eating"], "correct_choice_idx": 2, "direct_answers": ["reading", "reading", "reading", "read book", "reading", "reading", "reading", "sitting", "getting directions", "reading"], "difficult_direct_answer": false, "rationales": ["The man gets directions.", "The man is looking up directions on his map.", "The man is getting directions from a book."], "image": "train2014/COCO_train2014_000000147331.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150385, "question_id": "e54i7ynAiMz28sdcxKjEMv", "question": "When is it safe to cross here as a pedestrian?", "choices": ["now", "5 seconds", "never", "5 minutes"], "correct_choice_idx": 0, "direct_answers": ["now", "now", "now", "sign lit", "walk signal", "white walk", "white", "green sign", "walk light", "at day"], "difficult_direct_answer": false, "rationales": ["It's safe now.", "The sign is showing it is safe to cross right now with the white person picture on it.", "A light is lit at a street corner showing a person walking. a crosswalk lights to let people know when it is safe to walk."], "image": "train2014/COCO_train2014_000000150385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252968, "question_id": "e5BFa6skwL6zEcThCpEzPW", "question": "How is this train powered?", "choices": ["steam", "battery", "gas", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "electric", "electricity", "electric", "engine", "electricity", "electricity", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["There are lines overhead which are used to power the train.", "A train is moving along tracks with traffic lights above.", "We can see the wires which carry electric current to and power this train on top of it."], "image": "val2014/COCO_val2014_000000252968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554255, "question_id": "e5BdP7rFgpXrZwM67qkWc9", "question": "People often wear the color of the players on the left to support those with what disease?", "choices": ["heart disease", "cancer", "autism", "diabetes"], "correct_choice_idx": 1, "direct_answers": ["breast cancer", "breast cancer", "cancer", "breast cancer", "breast cancer", "breast cancer", "breast cancer", "breast cancer", "breast cancer", "breast cancer"], "difficult_direct_answer": false, "rationales": ["A pink ribbon is an international symbol of breast cancer awareness.", "The players on the left are wearing pink, not red, blue, grey, or yellow.", "The color the people in question are wearing is pink. this color is commonly known to be associated with particular forms of answer a."], "image": "val2014/COCO_val2014_000000554255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367018, "question_id": "e5MepwpRoXbpg5NTh2eCuw", "question": "Which ingredient in the dish is inedible?", "choices": ["noodles", "shells", "mussels", "pepper"], "correct_choice_idx": 1, "direct_answers": ["shells", "shells", "shells", "shells", "shells", "shell", "clam shell", "shells", "shells", "shells"], "difficult_direct_answer": false, "rationales": ["There are a bunch of shells on top of the pasta.", "Pasta with oysters in shells are being served in a white dish.", "They are too hard to eat and would not taste good."], "image": "val2014/COCO_val2014_000000367018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286234, "question_id": "e5N4Udnpp6XwQLbScifwfQ", "question": "While practicing the tennis player is surrounded by nets because she is playing against?", "choices": ["nobody", "audience", "player", "machine"], "correct_choice_idx": 3, "direct_answers": ["machine", "machine", "opponent", "herself", "machine", "machine", "machine", "tough opponent", "yes", "machine"], "difficult_direct_answer": false, "rationales": ["A machine is spitting out the balls.", "A person is in a netted cage swinging a tennis racket.", "A woman stands in an area surrounded by nets with an audience. the woman is swinging a racket."], "image": "train2014/COCO_train2014_000000286234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9946, "question_id": "e5d7vkiHXHxr2WNxLXsUWo", "question": "Why can't we see the man as clearly as we should be able to?", "choices": ["distortion effect", "blur", "watermark", "picture overexposed"], "correct_choice_idx": 2, "direct_answers": ["waves", "watermark", "watermark", "arm", "watermark", "watermark", "logo", "watermark", "logo", "watermark"], "difficult_direct_answer": false, "rationales": ["The man is watermarked.", "A photographer's watermark covers the photo.", "The watermark is covering the photo."], "image": "train2014/COCO_train2014_000000009946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160009, "question_id": "e5dy9FosHXecTHuaYHg8iq", "question": "In which country is this street located?", "choices": ["united states", "china", "england", "japan"], "correct_choice_idx": 0, "direct_answers": ["united states", "japan", "china", "malaysia", "japan", "china", "china", "china", "china", "china"], "difficult_direct_answer": false, "rationales": ["The language in the main highway sign is english.", "The country is the usa.", "The traffic signs depicted appear to be consistent with ones used in answer a."], "image": "train2014/COCO_train2014_000000160009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479658, "question_id": "e6MVSYmkp8qsnjRqZMC4NL", "question": "What is the red item with the wheels?", "choices": ["space shuttle", "tank", "crane", "sedan"], "correct_choice_idx": 2, "direct_answers": ["crane", "crane", "crane", "crane", "crane", "crane", "tow-truck", "crane", "tow-truck", "crane"], "difficult_direct_answer": false, "rationales": ["A crane has a very distinctive look; the main part is what lifts items up. we clearly see that here.", "The item in red is a crane.", "The machinery clearly has hydraulic properties, and extends/curves beyond where it is mounted. there is a large, heavy object that has fallen over nearby."], "image": "train2014/COCO_train2014_000000479658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271106, "question_id": "e6NLx4JyefiMWrPSkDBdYE", "question": "What is the plant coming out of the toilette bowl basin?", "choices": ["morea lily", "flax", "daylily", "agapanthus"], "correct_choice_idx": 1, "direct_answers": ["purple plant", "fern", "flax", "unknown", "ivy", "spider plant", "fern", "purple spider", "don't know", "ivy"], "difficult_direct_answer": false, "rationales": ["The plant is flax.", "The plant with long leaves is flax.", "Flax has long leaves."], "image": "train2014/COCO_train2014_000000271106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114282, "question_id": "e6PzgSVTMjMabSgsQ6yKjk", "question": "What language is the sign in?", "choices": ["english", "chinese", "egyptian", "french"], "correct_choice_idx": 1, "direct_answers": ["chinese", "chinese", "chinese", "chinese", "chinese", "mandarin", "taiwanese", "chinese", "chinese", "chinese"], "difficult_direct_answer": false, "rationales": ["The sign is in chinese.", "You can tell by the characters written on the glass, as to where they are.", "Symbols can be seen on a sign in an airport and asian people are all around."], "image": "val2014/COCO_val2014_000000114282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378204, "question_id": "e6Sosg8Zx7F4bhYyjJYApQ", "question": "What is wrong with the woman's outfits?", "choices": ["sleeveless shirt", "long jeans", "wrong socks", "entangled necklaces"], "correct_choice_idx": 2, "direct_answers": ["wrong socks", "no shoes", "mismatched socks", "different shoes", "mismatched socks", "no shoes", "different socks", "no shoes", "socks", "too revealing"], "difficult_direct_answer": false, "rationales": ["She is wearing the wrong socks.", "The pieces of clothing on her feet don't match each other. this item of clothing comes in pairs, and is usually worn in an identical matching pair.", "The woman's socks don't match."], "image": "val2014/COCO_val2014_000000378204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49643, "question_id": "e6ahtS7wWr7APCNuwwMrSy", "question": "Why are the men wearing yellow vests?", "choices": ["as punishment", "for fun", "visibility", "fashion"], "correct_choice_idx": 2, "direct_answers": ["city workers", "urban workers", "safety", "safety reflection", "visibility", "safety", "increased visibility", "working", "visibility", "safety"], "difficult_direct_answer": false, "rationales": ["The bright colors will be easily spotted by motorists and lessen the chance of them getting hit.", "People wear yellow to be more visible for safety and road workers follow safety precautions.", "They are wearing a bright reflective color so people can see them better for their own safety."], "image": "train2014/COCO_train2014_000000049643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563713, "question_id": "e6hdHeyzCp4yDUt9mF7GgT", "question": "Which country would this vase typically originate from?", "choices": ["china", "ethiopia", "greece", "denmark"], "correct_choice_idx": 2, "direct_answers": ["egypt", "egypt", "egypt", "bangkok", "egypt", "greece", "greece", "greece", "egypt", "egypt"], "difficult_direct_answer": false, "rationales": ["The vase is from greece.", "There are depictions of greek people on it.", "The artwork looks like it is from the greeks."], "image": "train2014/COCO_train2014_000000563713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524255, "question_id": "e6qbd7j4YcWVxTY2q2GP6E", "question": "What entertainment do these people have to amuse them?", "choices": ["opera", "tv", "horseback tricks", "singing"], "correct_choice_idx": 2, "direct_answers": ["stunt riding", "performers", "animal cruelty", "horseback tricks", "somersaults", "tricks", "stunt people", "horse riding", "circus", "men"], "difficult_direct_answer": true, "rationales": ["There are two people on the same horse and one is on the other one's shoulders", "The two people in the front of the images are performing tricks while on a horse.", "These people are doing tricks on horseback."], "image": "train2014/COCO_train2014_000000524255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6580, "question_id": "e6sWgUhBgVusV9KWqfUkNS", "question": "What object is present but probably going to be used in an unusual way?", "choices": ["shirt", "bed", "sign", "camera"], "correct_choice_idx": 1, "direct_answers": ["bed", "bed", "bed", "bed", "bed", "bed", "bed", "bed", "bed", "bed"], "difficult_direct_answer": false, "rationales": ["Normally this piece of furniture is used inside a house.", "There is a rectangular shape with pillows and a blanket.", "That usually can't be found on the street."], "image": "val2014/COCO_val2014_000000006580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345787, "question_id": "e6wPLUu7KrMQW3CpbgDuF7", "question": "What color hair does the man have who is annoying the bride?", "choices": ["black", "brown", "blonde", "grey"], "correct_choice_idx": 1, "direct_answers": ["brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The hair is brown.", "The man with the brown hair appears to be taunting the bride.", "The man has brown hair."], "image": "val2014/COCO_val2014_000000345787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12145, "question_id": "e77nojcZEEPtQga92QBSkB", "question": "What country is this taking place in?", "choices": ["canada", "england", "usa", "italy"], "correct_choice_idx": 3, "direct_answers": ["spain", "car", "italy", "italy", "italy", "britain", "italy", "spain", "russia", "spain"], "difficult_direct_answer": false, "rationales": ["The country is italy.", "Italy's colors are red and blue.", "The football club they are fans of is located in italy and these people look italian."], "image": "train2014/COCO_train2014_000000012145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315094, "question_id": "e797UZ3xZarnqL2wQeuC6e", "question": "The people flying kites are wearing sunglasses to prevent what medical condition?", "choices": ["conjunctivitis", "sunburn", "frostbite", "snow blindness"], "correct_choice_idx": 3, "direct_answers": ["sunlight", "snow blindness", "blindness", "snow blindness", "retinal damage", "sun damage", "retina damage", "white out", "snow blindness", "eye strain"], "difficult_direct_answer": false, "rationales": ["The sun reflecting off the white ground can cause damage to eyes.", "The people don't want to be blinded by the snow's brightness.", "They're blinded by snow."], "image": "train2014/COCO_train2014_000000315094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65793, "question_id": "e7A96d5m9vzPoSZUksn4VG", "question": "What are the toys in the picture called?", "choices": ["pokemon", "stuffed animals", "board games", "video games"], "correct_choice_idx": 1, "direct_answers": ["teddy bears", "teddy bears", "stuffed animals", "stuffed", "teddy bears", "stuffed animals", "stuffed animals", "teddy bear", "dolls", "stuffed animals"], "difficult_direct_answer": false, "rationales": ["The toys are teddy bears. they are not games.", "Most toys are teddy bears.", "There are many kinds of teddy bears"], "image": "train2014/COCO_train2014_000000065793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136922, "question_id": "e7AJJBhwnBaMGPZyW2nfh9", "question": "What does this animal use to scoop up water?", "choices": ["its head", "its trunk", "its paws", "its mouth"], "correct_choice_idx": 1, "direct_answers": ["trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk", "its trunk", "trunk"], "difficult_direct_answer": false, "rationales": ["Elephants drink with their trunks.", "The elephant uses its trunk to drink.", "A man is riding an elephant."], "image": "train2014/COCO_train2014_000000136922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185302, "question_id": "e7CcD74RxbAbx7kH2qNe9B", "question": "What is available to get from the boats to the ground level?", "choices": ["rope", "stairs", "ladder", "elevator"], "correct_choice_idx": 2, "direct_answers": ["cables", "ladder", "latter", "ladder", "rope", "deck", "ladder", "ladder", "lift", "ladder"], "difficult_direct_answer": false, "rationales": ["Ladders can help people raise the boats.", "There is a ladder.", "There is a ladder near the boats extending from the bottom to the top of the ledge."], "image": "train2014/COCO_train2014_000000185302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541290, "question_id": "e7E8HnFVufSucazaNFpmWA", "question": "What is this sport name?", "choices": ["swimming", "skiing", "sky diving", "skating"], "correct_choice_idx": 1, "direct_answers": ["freestyle skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "skiing", "ski", "snowboarding"], "difficult_direct_answer": false, "rationales": ["The sport name is skiing.", "Skiing uses poles.", "The people are on snow and are not using skates."], "image": "train2014/COCO_train2014_000000541290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559657, "question_id": "e7KD2W7kegDfL7SMaUXWVT", "question": "What type of object is in the forefront of the image?", "choices": ["skateboard", "car", "building", "bench"], "correct_choice_idx": 3, "direct_answers": ["memorial sign", "bench", "bench", "memorial", "sign", "bench", "dedication", "dedication", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["It looks like the top of a bench with a dedication on it.", "It is used for sitting.", "A wood planked backrest with an inscription is in front of a field."], "image": "train2014/COCO_train2014_000000559657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65447, "question_id": "e7KFG9r37JWfwcHbZZPNQ7", "question": "Banana's are rich in which nutrient?", "choices": ["calcium", "minerals", "potassium", "vitamins"], "correct_choice_idx": 2, "direct_answers": ["potassium", "potassium", "potassium", "calcium", "potassium", "potassium", "potassium", "potassium", "calcium", "calcium"], "difficult_direct_answer": false, "rationales": ["It's a known fact.", "Bananas have potassium.", "These are good if you get too much salt in your diet"], "image": "train2014/COCO_train2014_000000065447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118870, "question_id": "e7MGbR4h6Zn8TGcEGxNGCz", "question": "Why was the bike parked here?", "choices": ["hide it", "random", "showing off", "stay clean"], "correct_choice_idx": 3, "direct_answers": ["stay clean", "shade", "storage", "shelter", "show", "cover", "shelter", "photograph", "shade", "for owner"], "difficult_direct_answer": false, "rationales": ["A person has their bike parked here to keep it from the weather elements.", "The bike is parked in a covered parking spot to keep it clean and dry.", "It's under a roof for this reason. they might also be trying to b to prevent street theft."], "image": "train2014/COCO_train2014_000000118870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409050, "question_id": "e7gJZGxtrpa9fws2U27jC3", "question": "What type of area is outside of the door?", "choices": ["patio", "deck", "porch", "balcony"], "correct_choice_idx": 3, "direct_answers": ["tropical", "vacation", "patio", "living room", "patio", "balcony", "deck", "tropical yard", "balcony", "patio"], "difficult_direct_answer": false, "rationales": ["There is a white floor that leads to an open area. there is a lip out there with a fencing to keep people from going over the side.", "The railing indicates it is put there for safety as it is high above the ground, so it is a small balcony.", "There is a small area with a fence around it outside."], "image": "train2014/COCO_train2014_000000409050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553039, "question_id": "e7uKnJZPRaYeFr8ppewKi8", "question": "What fruit is shown on the plate?", "choices": ["apple", "kiwi", "strawberry", "orange"], "correct_choice_idx": 2, "direct_answers": ["strawberry", "strawberry", "strawberry", "strawberry", "strawberry", "strawberries", "strawberry", "strawberries", "strawberry", "strawberries"], "difficult_direct_answer": false, "rationales": ["The fruit is red with seeds.", "The fruit is red, not orange, white, or green.", "There is a red piece of fruit that accompanies many salads."], "image": "train2014/COCO_train2014_000000553039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129438, "question_id": "e7yWWi6BGNV66cdxSVK9cd", "question": "What comes out of the large cone?", "choices": ["bubbles", "news", "music", "water"], "correct_choice_idx": 2, "direct_answers": ["sound", "sound", "sound", "sound", "sound", "music", "sound", "music", "music", "music"], "difficult_direct_answer": false, "rationales": ["The item has a large speaker for music to come out of.", "There is some music coming out of the large audio phone.", "The cone has music."], "image": "train2014/COCO_train2014_000000129438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32544, "question_id": "e857xwTV4L9zC3nKnYfvVX", "question": "What type talk is being given here?", "choices": ["panel", "debate", "argument", "barnburner"], "correct_choice_idx": 0, "direct_answers": ["political", "speech", "policy", "politics", "political", "lecture", "interview", "political", "american progress", "panel"], "difficult_direct_answer": false, "rationales": ["This is a political debate. there is \"center for american progress\" on the wall.", "The woman has a book with her in the meeting.", "People are seated together in front of a display and flag"], "image": "train2014/COCO_train2014_000000032544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196365, "question_id": "e8APjr3JmDh9ygVX6W2G3j", "question": "Which one of these countries is a main location for the company on the right?", "choices": ["russia", "canada", "japan", "germany"], "correct_choice_idx": 3, "direct_answers": ["new zealand", "germany", "usa", "new zealand", "france", "america", "new zealand", "netherlands", "switzerland", "netherlands"], "difficult_direct_answer": false, "rationales": ["Germany is a main location.", "Tennis planet is advertised which is located in europe.", "Tennisplanet is primarily a german company."], "image": "train2014/COCO_train2014_000000196365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536369, "question_id": "e8NLzifMD9A2hZa3aamsNN", "question": "What source of heat is used to cook here?", "choices": ["wood", "solar", "propane", "electric"], "correct_choice_idx": 0, "direct_answers": ["oven", "wood", "wood", "wood", "wood", "fire", "fire", "oven heat", "coal", "wood"], "difficult_direct_answer": false, "rationales": ["The stove uses wood.", "The other options aren't used with this type of cast iron stove.", "The stove in the photo is heated with wood fire."], "image": "val2014/COCO_val2014_000000536369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534896, "question_id": "e8UgjEAAB4tgFSkUV4rBLw", "question": "Which flag has colors most similar to these flags?", "choices": ["japanese", "chinese", "american", "italian"], "correct_choice_idx": 3, "direct_answers": ["sweden", "mexico", "italian", "sierra leone", "italy", "italy", "russia", "mexico", "holland", "germany"], "difficult_direct_answer": false, "rationales": ["Red, white, and green flags hand on a building.", "Italian has blue and green in the flag.", "There are italian flags similar to those found on the left side of the train station."], "image": "train2014/COCO_train2014_000000534896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226472, "question_id": "e8gaDSRCwaSfdw3qna3fS5", "question": "What is the job of these horses?", "choices": ["carry", "push", "pull", "count"], "correct_choice_idx": 2, "direct_answers": ["transportation", "pulling", "transport", "carrying", "pull carriages", "pulling wagon", "cart pulling", "pull", "pull wagon", "pull wagon"], "difficult_direct_answer": true, "rationales": ["The horses are pulling the carts.", "There is a wagon attached to them", "They pull a wagon with their body weight."], "image": "val2014/COCO_val2014_000000226472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16775, "question_id": "e9EyhXuvrGoaHRVqBrnkFJ", "question": "For what are these people queued up?", "choices": ["gas", "motors", "protest", "food"], "correct_choice_idx": 3, "direct_answers": ["food truck", "food", "food", "food", "buy food", "food", "food purchase", "food", "food", "food truck"], "difficult_direct_answer": false, "rationales": ["They are queuing for food as evident by the photo.", "They are standing in line to get food.", "People are lined up outside of a food truck."], "image": "val2014/COCO_val2014_000000016775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158420, "question_id": "e9h2sBEgQaTqzAG4cLarTo", "question": "What piece of clothing does the boy have on that are meant for adults?", "choices": ["belt", "tie", "his shirt", "pants"], "correct_choice_idx": 1, "direct_answers": ["necktie", "tie", "necktie", "tie", "tie", "neck tie", "tie", "tie", "tie", "necktie"], "difficult_direct_answer": false, "rationales": ["A young child wears a tie. adults generally wear ties more often than children.", "He's dressed up in suit clothing", "It is normally worn by adults."], "image": "train2014/COCO_train2014_000000158420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119808, "question_id": "e9nhityt5oHRmVeKWUXA5t", "question": "In what nation is this street located?", "choices": ["india", "korea", "china", "japan"], "correct_choice_idx": 3, "direct_answers": ["korea", "china", "japan", "asia", "japan", "japan", "japan", "japan", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["The writing on the signs appears to be in japanese, indicating that this is taking place in japan.", "The text on the signs in this image which are not in english identifies this scene as taking place in japan.", "Japan has signs like that on its streets. the writing on the signs is japanese."], "image": "train2014/COCO_train2014_000000119808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563040, "question_id": "e9r5Yr8ozWmf8fQZ22gDcD", "question": "Why is there a pink square on the windshield of the car behind the police car?", "choices": ["parking violation", "litter", "aesthetics", "advertisement"], "correct_choice_idx": 0, "direct_answers": ["ticket", "ticket", "parking violation", "parking ticket", "parking ticket", "ticket", "ticket", "parking ticket", "ticket", "traffic ticket"], "difficult_direct_answer": false, "rationales": ["The car has received a ticket.", "It's a ticket.", "The pink square inserted in the windshield is a parking citation."], "image": "val2014/COCO_val2014_000000563040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62276, "question_id": "eA8FTam8vqF2FkVEPUPYk3", "question": "Why is he running on the bridge?", "choices": ["stay dry", "less windy", "being followed", "shorter run"], "correct_choice_idx": 0, "direct_answers": ["exercise", "exercise", "crossover stream", "exercise", "exercise", "exercise", "exercise", "exercise", "crossover stream", "stay dry"], "difficult_direct_answer": false, "rationales": ["He appears to be purposely on a run for exercise, and running makes it faster.", "To stay dry to prevent him falling to river.", "The man is running on the bridge to stay dry."], "image": "train2014/COCO_train2014_000000062276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371330, "question_id": "eA8p5w2kxZMuVa7CXcuY4e", "question": "Where do the people wearing white shirts work?", "choices": ["government", "airplane", "sewer", "golf course"], "correct_choice_idx": 1, "direct_answers": ["at home", "charter airline", "airport", "airport", "airline", "airport", "airport", "airport", "airplane", "airport"], "difficult_direct_answer": false, "rationales": ["The airplane staff is dressed in uniform, which is a white shirt and shorts as people are boarding the airplane.", "The people in white shirts are staff members of the plane.", "They work for the airlines."], "image": "val2014/COCO_val2014_000000371330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433647, "question_id": "eAA4EoqpMuETUmSdHaMc3J", "question": "What does this store sell?", "choices": ["soup", "eggs", "desert", "chicken"], "correct_choice_idx": 2, "direct_answers": ["baked goods", "pastry's", "baked goods", "goodies", "pastry", "cakes", "cakes", "deserts", "cakes", "desert"], "difficult_direct_answer": false, "rationales": ["There are cakes and other sweets on display", "Cakes and cookies are sold.", "There are many cakes and sweets available."], "image": "train2014/COCO_train2014_000000433647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405605, "question_id": "eAUQVTiKnveydyeEeQtDM3", "question": "What brand of skis does the skier use whose eyes are uncovered?", "choices": ["rossignol", "head", "atomic", "parson"], "correct_choice_idx": 2, "direct_answers": ["atomic", "atlantic", "atomic", "atomic", "atomic", "atomic", "atomic", "atlantic", "atomic", "atomic"], "difficult_direct_answer": false, "rationales": ["These are atomic brand skis.", "The skiers without goggles on his eyes is using a pair of atomic skis.", "The name is on the skis."], "image": "train2014/COCO_train2014_000000405605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127971, "question_id": "eAaEiyXy3E7RfWuVmTsa6a", "question": "What province does this fire crew reside in?", "choices": ["alberta", "nwt", "ontario", "pei"], "correct_choice_idx": 2, "direct_answers": ["hamilton", "hamilton", "hamilton", "hamilton twp", "hamilton", "hamilton", "ontario", "hamilton", "ontario", "hamilton twp"], "difficult_direct_answer": false, "rationales": ["The town of hamilton is in a city of ontario.", "The city name of hamilton is lettered onto this truck. hamilton is located in the province listed in a.", "Hamilton twp is in this canadian province."], "image": "val2014/COCO_val2014_000000127971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404088, "question_id": "eAciyMDWh6qPTBoPyHiTVh", "question": "What does the young man in White and blue hope to catch?", "choices": ["bat", "baseball", "chicken", "basketball"], "correct_choice_idx": 1, "direct_answers": ["ball", "ball", "ball", "ball", "baseball", "ball", "ball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["He's a catcher in a game", "The man has a baseball mitt on.", "The player is a catcher in a baseball game."], "image": "val2014/COCO_val2014_000000404088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446557, "question_id": "eAz9ENNX8m5vfq4ijzbG8F", "question": "What is the child wearing on his hand?", "choices": ["gardening glove", "workout glove", "baseball glove", "batting glove"], "correct_choice_idx": 2, "direct_answers": ["glove", "glove", "baseball glove", "baseball glove", "baseball glove", "baseball glove", "baseball glove", "glouse", "baseball glove", "baseball glove"], "difficult_direct_answer": false, "rationales": ["This is a baseball glove to play the sport and you can see a baseball inside of it.", "The kid has a ball glove.", "It is used to catch during games."], "image": "train2014/COCO_train2014_000000446557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391141, "question_id": "eBSpZsqf5ydqD2tnF4xnB5", "question": "What did this lady do on the day she holds this knife?", "choices": ["butcher pigs", "divorce", "become imprisoned", "marry"], "correct_choice_idx": 3, "direct_answers": ["got married", "get married", "get married", "married", "cut cake", "married", "marry", "her birthday", "marriage", "wedding attendee"], "difficult_direct_answer": false, "rationales": ["The cake is a wedding cake.", "The lady is at a marriage party.", "They are about to cut a wedding cake"], "image": "train2014/COCO_train2014_000000391141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482696, "question_id": "eBXdqDRtjMWJ6hBjUkSoT2", "question": "What is the general theme of the objects on the top rack?", "choices": ["baking agents", "seasoning", "sweets", "cutting tools"], "correct_choice_idx": 1, "direct_answers": ["seasoning", "spices", "spices", "seasonings", "knives", "silver", "knives", "spice", "sharp", "metal"], "difficult_direct_answer": false, "rationales": ["There is a lot of seasonings on the top rack.", "The theme is seasoning.", "The objects are spices."], "image": "train2014/COCO_train2014_000000482696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327754, "question_id": "eBeUapuUpkxdKPTT8Humew", "question": "Why is the man holding his arms out?", "choices": ["to dance", "to dive", "to wave", "to balance"], "correct_choice_idx": 3, "direct_answers": ["balancing", "balance", "balance", "balance", "balance", "balance", "balance", "balance", "for balance", "to balance"], "difficult_direct_answer": false, "rationales": ["Doing this type of extreme sport you need to stay on your board.", "He is surfing and trying not to fall into the waves he is riding.", "He is trying to stay standing on a surf board that is moving in the waves."], "image": "train2014/COCO_train2014_000000327754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249531, "question_id": "eBfSmodzTgphE3vtNdxyHf", "question": "What shot is the male player employing?", "choices": ["lob", "backhand", "serve", "forehand"], "correct_choice_idx": 0, "direct_answers": ["lob", "swing", "hard shot", "overhand", "smash", "overhead", "tennis", "overhand", "dive", "overhand"], "difficult_direct_answer": false, "rationales": ["The male player is hitting the green ball in the pitch.", "The man is lobbing the ball.", "The shot is the lob."], "image": "train2014/COCO_train2014_000000249531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62821, "question_id": "eBjpQnVs6UktUmfRiXoJ89", "question": "What made the wavy lines in the sand in front of the trucks?", "choices": ["tires", "snakes", "water", "wind"], "correct_choice_idx": 0, "direct_answers": ["construction vehicle", "trucks", "tires", "tire tracks", "tracks", "truck tires", "tires", "truck tires", "truck", "tracks"], "difficult_direct_answer": false, "rationales": ["As vehicles drive over sand, their tires leave prominent marks as they go. the bigger the vehicle, the larger these tracks will be.", "These lines were made from tires and vehicles turning around and parking in this area.", "The lines are from tires."], "image": "train2014/COCO_train2014_000000062821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362094, "question_id": "eBpK26T44tBVeGSpKpJhpM", "question": "What type of items are on the rack in front?", "choices": ["raw", "day old", "overdone", "freshest"], "correct_choice_idx": 1, "direct_answers": ["doughnuts", "bun donuts", "breads", "pastry", "baked goods", "discounted", "baked goods", "pasteries", "day old", "bread"], "difficult_direct_answer": true, "rationales": ["That is some old bread at the front.", "The older food is presented at front for quick sale.", "The items are a day old and discounted."], "image": "val2014/COCO_val2014_000000362094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72583, "question_id": "eBrnnJJQPJEwyAfDxqVo7F", "question": "What food group is shown?", "choices": ["meats", "dairy", "fruits", "vegetable"], "correct_choice_idx": 2, "direct_answers": ["fruit", "fruit", "fruit", "fruit", "fruits", "fruit", "fruits", "fruit", "fruits", "fruits"], "difficult_direct_answer": false, "rationales": ["A bowl of colorful apples, bananas and grapes carried on a mythical bird's back is enticing a hungry chimp to come and eat!.", "Fruit is depicted.", "Fruit is shown."], "image": "train2014/COCO_train2014_000000072583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208511, "question_id": "eC3pSgUAZ6LtVLEnE2ME3A", "question": "What is there a netting behind the batter?", "choices": ["safety", "practice", "decoration", "style"], "correct_choice_idx": 0, "direct_answers": ["catch balls", "safety", "safety", "catch ball", "ball", "protect crowd", "for balls", "safety", "safety netting", "stop balls"], "difficult_direct_answer": false, "rationales": ["The ball can hurt people watching if it flies back.", "There are spectators behind the netting and this protects them in case there is a fly ball that the batter accidentally hits backwards.", "The netting keeps balls from flying out."], "image": "train2014/COCO_train2014_000000208511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356940, "question_id": "eCUCu3pXwPPyVc3n5s5raT", "question": "Why does the cow have flowers on her head?", "choices": ["hiding her", "found them", "growing there", "won contest"], "correct_choice_idx": 3, "direct_answers": ["celebration", "decoration", "festive occasion", "decoration", "celebration", "festival", "winner", "won contest", "celebration", "competition"], "difficult_direct_answer": false, "rationales": ["Wreaths are usually shown as a sign of winning a competition.", "This is an award wreath", "This is a cow contest."], "image": "train2014/COCO_train2014_000000356940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323311, "question_id": "eCaMLQo4RYqrzC3rWYWtMA", "question": "What is the highest view point a person could see from?", "choices": ["kite", "rooftop", "tower", "stroller"], "correct_choice_idx": 2, "direct_answers": ["tower", "castle", "tower", "clocktower", "clocktower", "tower", "clock tower", "balcony", "tower", "building"], "difficult_direct_answer": false, "rationales": ["There is a tower in the background of the building.", "The highest point where a person could stand is the top of the building.", "The tower is taller than the other places in this area."], "image": "train2014/COCO_train2014_000000323311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406873, "question_id": "eD4V8dn5h2UswZzx4JVmAD", "question": "Why is the elephant forced down low on it's belly?", "choices": ["belly itches", "tired", "punishment", "boarding passenger"], "correct_choice_idx": 3, "direct_answers": ["pick passenger", "mounting", "lift person", "for seating", "give ride", "helping rider", "sit passengers", "someone mounting", "rider mounting", "boarding passenger"], "difficult_direct_answer": true, "rationales": ["The elephant is allowing the man on.", "This tourist would not be able to mount the elephant this easily if the animal were fully upright.", "The elephant has a passenger."], "image": "val2014/COCO_val2014_000000406873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486936, "question_id": "eDHyRSuw7NV7PTKAT8JYwV", "question": "What are the plastic lids used for when covering these trays of food?", "choices": ["transport", "heat", "cold", "protection"], "correct_choice_idx": 3, "direct_answers": ["freshness", "keeping moist", "protection", "stop flies", "protect food", "keep fresh", "freshness", "protection", "protection", "covering food"], "difficult_direct_answer": false, "rationales": ["The lids can protect from flies.", "They protect the food from being exposed too long to the air and drying out or getting stale.", "Food is covered with lids. lids are used to protect food."], "image": "train2014/COCO_train2014_000000486936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144345, "question_id": "eDY3V6t6syusnYXh4EMhqu", "question": "Where is sir writing?", "choices": ["paper", "laptop", "floor", "parchment"], "correct_choice_idx": 1, "direct_answers": ["computer", "computer", "laptop", "desk", "laptop", "lol", "desk", "serve ware", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["The man is writing on a laptop keyboard.", "He's on a laptop.", "You can write things on a laptop."], "image": "train2014/COCO_train2014_000000144345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335405, "question_id": "eDfULoL3ckwywjifyg8PD3", "question": "What keeps the sheep on the side of the road where they graze presently?", "choices": ["wolves", "nothing", "electrical charges", "shepherd"], "correct_choice_idx": 1, "direct_answers": ["grass", "grass", "nothing", "nothing", "nothing", "pasture", "road", "road", "nothing", "grass"], "difficult_direct_answer": false, "rationales": ["Sheep roam freely on both sides of a road.", "The sheep aren't fenced in.", "There is nothing."], "image": "train2014/COCO_train2014_000000335405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35897, "question_id": "eDxmaefuhhKg8QYkeFyF8a", "question": "Why is the SUV moving over?", "choices": ["lane ending", "wrong way", "lost control", "being silly"], "correct_choice_idx": 0, "direct_answers": ["merge sign", "changing lanes", "driving", "cars", "lane ending", "changing lanes", "traffic", "change lane", "merging lanes", "changing lanes"], "difficult_direct_answer": false, "rationales": ["The suv crosses the lane.", "The lane is merging.", "The street is narrowing."], "image": "val2014/COCO_val2014_000000035897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209844, "question_id": "eDy3hosUWvrJfgWG3SH64Y", "question": "What does the boy want to do with the ball?", "choices": ["catch it", "bounce it", "hit it", "dodge it"], "correct_choice_idx": 2, "direct_answers": ["hit it", "hit", "hit it", "hit", "hit it", "hit it", "hit it", "hit it", "hit it", "hit"], "difficult_direct_answer": false, "rationales": ["He is reaching out his racquet and stepping toward the ball.", "He is playing tennis and the goal is to hit the ball.", "That's why he's moved his racket outward."], "image": "train2014/COCO_train2014_000000209844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123731, "question_id": "eEJZTokvQnEEqctAeD2q9W", "question": "What is the profession of this woman?", "choices": ["athlete", "doctor", "librarian", "janitor"], "correct_choice_idx": 0, "direct_answers": ["tennis player", "college student", "tennis player", "tennis player", "college student", "tennis player", "athlete", "tennis player", "tennis player", "tennis player"], "difficult_direct_answer": false, "rationales": ["The woman is playing a sport.", "She's an athlete.", "She is playing sports."], "image": "train2014/COCO_train2014_000000123731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399939, "question_id": "eEaTPteui3m8SwispWXpjQ", "question": "What is the person in the photo wearing?", "choices": ["uniform", "robe", "dress", "apron"], "correct_choice_idx": 3, "direct_answers": ["apron", "apron", "apron", "dress", "dress", "dress", "dress", "apron", "apron", "dress"], "difficult_direct_answer": false, "rationales": ["The person in the photo is wearing a long apron.", "The woman has a white coverall on.", "She has a piece of material tied around her middle to cover the lower part of her."], "image": "train2014/COCO_train2014_000000399939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308947, "question_id": "eEqsDWvx2sC2TjqGXTvQvM", "question": "What are these people most likely attempting to put out?", "choices": ["flyers", "light", "fire", "dinner"], "correct_choice_idx": 2, "direct_answers": ["fire", "fire", "fire", "firetruck", "fire", "fire", "fire", "firetruck", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["There are firefighters and firetrucks visible. when those things are at a particular scene they are likely there to put out a fire.", "Firetrucks are lined up in the street. firetrucks are used to put out fires.", "The firetrucks are out."], "image": "val2014/COCO_val2014_000000308947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299930, "question_id": "eFEW2wF2oLfDDJyfTaHoQ2", "question": "How many of the vases are made from something other than glass?", "choices": ["five", "four", "one", "two"], "correct_choice_idx": 3, "direct_answers": ["all", "five", "zero", "all", "two", "32", "one", "two", "all", "five"], "difficult_direct_answer": false, "rationales": ["Other vases are made from pottery and are different shapes.", "These are ceramic", "There are 2."], "image": "train2014/COCO_train2014_000000299930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378203, "question_id": "eFMPVkeKY99ZG6T8U3kStC", "question": "What conveyance are the people going to get on?", "choices": ["airplane", "taxi", "none", "bus"], "correct_choice_idx": 0, "direct_answers": ["train", "plane", "plane", "bus", "train", "bus", "bus", "airport", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["This type of luggage is usually used by people on their way to a airport to ride a plane.", "The people are boarding a plane.", "They are getting on a plane."], "image": "train2014/COCO_train2014_000000378203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156969, "question_id": "eFZRHBdtbFv8f3tn7QJ5rS", "question": "What mode of transportation is upside-down?", "choices": ["bicycle", "skateboard", "scooter", "moped"], "correct_choice_idx": 0, "direct_answers": ["bicycle", "bike", "bike", "bicycle", "bike", "bicycle", "bicycle", "bike", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["They have turned it upside down to look at it and probably try to fix it.", "The mode is a bike.", "Besides buses in the background the only other visible transportation mode is in the foreground. with wheels facing the sky, it is apparent that the bicycle is upside down."], "image": "train2014/COCO_train2014_000000156969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531967, "question_id": "eGAHEEBC6zCEm8guAiEoPN", "question": "What foreign language could this woman probably know?", "choices": ["japanese", "indian", "korean", "chinese"], "correct_choice_idx": 0, "direct_answers": ["sign language", "chinese", "chinese", "chinese", "chinese", "japanese", "japanese", "chinese", "japanese", "japanese"], "difficult_direct_answer": false, "rationales": ["You can tell by the symbols as to where the language is from.", "The language on the paper shows japannese.", "The woman's banner is in japanese."], "image": "val2014/COCO_val2014_000000531967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327209, "question_id": "eGEtxUqvXvAFcETLpx5P75", "question": "How does the boy in the light blue jacket feel?", "choices": ["angry", "scared", "upset", "amused"], "correct_choice_idx": 3, "direct_answers": ["amused", "amused", "happy", "happy", "happy", "happy", "happy", "awkward", "happy", "happy"], "difficult_direct_answer": false, "rationales": ["The boy is amused.", "He is in a good mood and happy.", "The boy in blue is smiling. the smile makes him look happy."], "image": "train2014/COCO_train2014_000000327209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525702, "question_id": "eGLpCmZPfMHTAehwSRhafM", "question": "In which country does this woman talk on the phone?", "choices": ["portugal", "canada", "spain", "united states"], "correct_choice_idx": 3, "direct_answers": ["america", "united states", "united states", "usa", "usa", "united states", "usa", "usa", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["The woman is near a us flag.", "There are red and white stripes on the flag", "The woman is seated next to an american flag. american flags are most commonly displayed inside buildings within the united states."], "image": "val2014/COCO_val2014_000000525702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296933, "question_id": "eGgbRWku4tmeQDYnpW7She", "question": "Why are they stopped?", "choices": ["eating lunch", "lost", "resting", "at summit"], "correct_choice_idx": 3, "direct_answers": ["taking break", "posing", "at summit", "taking pictures", "talking", "enjoy view", "admiring view", "admire view", "ski top", "take photo"], "difficult_direct_answer": true, "rationales": ["People on skis are standing together at the top of a mountain.", "There is nowhere higher for these people to ski to or from.", "They are at the top of the hill."], "image": "train2014/COCO_train2014_000000296933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552054, "question_id": "eGk422FTBRKuTdasDBtbsr", "question": "What zone is this area likely to be?", "choices": ["business", "tourist", "residential", "shopping"], "correct_choice_idx": 1, "direct_answers": ["commercial", "city", "hotel zone", "passenger loading", "downtown", "tourist", "tourist", "horse carriage", "tourist", "pick up"], "difficult_direct_answer": false, "rationales": ["The area is for tourists because of the stagecoach.", "The area has a lot of fancy buildings.", "There is a horse carriage visible on a main street judging by the road lines and the background. if a horse carriage is in this type of area they are likely trying to be paid to take people on rides and trying to place themselves in an area where there are tourists who would be interested."], "image": "train2014/COCO_train2014_000000552054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455588, "question_id": "eGkm34t2RVTrPwQNwXgZ9p", "question": "What is the term for how the boat is situated?", "choices": ["mooring", "lassoing", "docked", "anchored"], "correct_choice_idx": 2, "direct_answers": ["docked", "docked", "dock", "docking", "docked", "docked", "dock", "docked", "docked", "docked"], "difficult_direct_answer": false, "rationales": ["It is at a rest next to a pier", "The boat is parked by the dock.", "A boat is pulled up to the dock at a marina. the boat is not moving."], "image": "val2014/COCO_val2014_000000455588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424542, "question_id": "eGowveW3k2VdC3HDzJQf9L", "question": "What activity is the person most likely engaging in while using the laptop?", "choices": ["writing", "printing", "singing", "drawing"], "correct_choice_idx": 0, "direct_answers": ["writing", "work", "business report", "work", "writing", "work", "typing", "work", "writing", "typing"], "difficult_direct_answer": false, "rationales": ["The person is editing a document.", "A word document can be seen open with text on the screen, indicating it was typed by the owner of the computer.", "The program open on the screen is used for writing."], "image": "train2014/COCO_train2014_000000424542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480454, "question_id": "eH7gaLyQJHks4AZGn4a9rX", "question": "What kind of sign is the red sign on the wall?", "choices": ["no entry", "emergency", "stop", "exit"], "correct_choice_idx": 0, "direct_answers": ["no entry", "stop", "no entry", "no driving", "stop sign", "caution sign", "no entry", "traffic sign", "no vehicles", "stop"], "difficult_direct_answer": false, "rationales": ["The sign has a red circle and a white line. this tells drivers not to go in there.", "It is red with a white line in it which means prohibited", "The red sign has a horizontal white line. the sign does not have text."], "image": "train2014/COCO_train2014_000000480454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13893, "question_id": "eHCeX5Gz637rwQzaw6ahzk", "question": "The fruits in the raised wooden baskets seen here are all what?", "choices": ["citrus", "apples", "cherries", "melons"], "correct_choice_idx": 0, "direct_answers": ["citrus", "citrus", "citrus", "citrus", "citrus", "citrus", "tomato", "citrus", "citrus", "tomato"], "difficult_direct_answer": false, "rationales": ["The oranges, lemons and limes are all citrus fruits.", "There are a bunch of citrus fruits inside of the baskets.", "They're oranges."], "image": "train2014/COCO_train2014_000000013893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266273, "question_id": "eJV6rbUveKobRAXtgmSSaU", "question": "Who's grabbing the broom?", "choices": ["lady", "no broom", "child", "no one"], "correct_choice_idx": 2, "direct_answers": ["kid", "kid", "kid", "baby", "baby", "child", "child", "baby", "kid", "kid"], "difficult_direct_answer": false, "rationales": ["The child is trying to sweep.", "The kid is grabbing.", "The child in the center of this image appears to be dragging a push broom behind him."], "image": "train2014/COCO_train2014_000000266273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169500, "question_id": "eJexgTBc5Pw3dQ8AvGsmKq", "question": "Why is the woman sitting?", "choices": ["to eat", "tie shoes", "have conversation", "to work"], "correct_choice_idx": 0, "direct_answers": ["eating cake", "to eat", "dining", "eating", "eating", "eating", "eating", "she's eating", "to eat", "eating"], "difficult_direct_answer": false, "rationales": ["There is food on a plate in front of the person on a table, and the backdrop is a restaurant.", "A woman is sitting at a table next to a dessert. people sit to eat.", "The woman is eating."], "image": "val2014/COCO_val2014_000000169500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423173, "question_id": "eK8ezn8BVA5RFTSxBFygoS", "question": "Why is the man wearing a yellow jacket?", "choices": ["visibility", "dressed down", "style", "dress up"], "correct_choice_idx": 0, "direct_answers": ["visibility", "security", "for visibility", "higher visibility", "working", "vision awareness", "cold", "protection", "visibility", "jacket"], "difficult_direct_answer": true, "rationales": ["This is so cars can see him", "So that he can easily be spotted by motorists.", "The vest is a bright color which helps people see the man better."], "image": "val2014/COCO_val2014_000000423173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539791, "question_id": "eKaHS86TXAPQSAbGdoSfis", "question": "Why are all the pigeons around the woman?", "choices": ["like her", "coincidence", "feeding them", "trained pigeons"], "correct_choice_idx": 2, "direct_answers": ["feeding time", "feeding them", "want food", "feeding them", "food", "bread", "getting food", "eating", "for food", "eating"], "difficult_direct_answer": false, "rationales": ["The pigeons are getting fed.", "The pigeons want snacks.", "If you give a bird something to eat you have a friend for life."], "image": "val2014/COCO_val2014_000000539791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316759, "question_id": "eKf4QvHNx89NqSsF4qxhEN", "question": "What type of recreational activity is the man involved in?", "choices": ["surfing", "boogie boarding", "kayaking", "paddle boarding"], "correct_choice_idx": 3, "direct_answers": ["paddle boarding", "paddle boarding", "surfing", "surfing", "paddling", "surfing", "paddle boarding", "paddle boarding", "paddle boarding", "paddle boarding"], "difficult_direct_answer": false, "rationales": ["The man is using a board and a paddle.", "The man is standing up on a board in the water. he has a paddle in his hand.", "You can tell by the setting and board he is on as to what pastime he is involved in."], "image": "train2014/COCO_train2014_000000316759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69944, "question_id": "eKwt6UcoTE7i8z47wewzgj", "question": "Who is the queen of this territory?", "choices": ["milena trump", "elizabeth ii", "lukashenko", "ivanka trump"], "correct_choice_idx": 1, "direct_answers": ["queen victoria", "elizabeth ii", "elizabeth ii", "mary", "queen elizabeth", "women", "elizabeth", "elizabeth", "victoria", "victoria"], "difficult_direct_answer": false, "rationales": ["The double decker buses are typical of london. elizabeth ii is the queen of england, where london is.", "The queen is elizabeth.", "Due to the double decker buses it's easy to surmise what country this is."], "image": "val2014/COCO_val2014_000000069944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510522, "question_id": "eKzJtXPCLKE7YdVwwdnYkZ", "question": "Which color of the rainbow is missing from this kite?", "choices": ["brown", "blue", "green", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "blue", "orange", "indigo", "purple", "voilet", "orange", "orange", "red", "orange"], "difficult_direct_answer": false, "rationales": ["Unless you are colorblind you can easily tell what color is missing.", "The rainbow doesn't have yellow.", "Red, orange, green blue and purple are all depicted."], "image": "train2014/COCO_train2014_000000510522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450878, "question_id": "eL2caRPFjeddU6vxztbScs", "question": "What position is this player currently in?", "choices": ["outfielder", "batter", "pitcher", "shortstop"], "correct_choice_idx": 1, "direct_answers": ["home", "batting", "batter", "batter", "leaning forward", "batter", "batter", "home base", "batter", "cricket"], "difficult_direct_answer": false, "rationales": ["The player is in the batter's box and is holding the bat.", "The position is the batter.", "The player is holding a bat in their hands at the moment and on a baseball diamond. the only position in baseball where one holds a bat is answer c."], "image": "train2014/COCO_train2014_000000450878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343967, "question_id": "eLBoYGTDqAbtfRjpWSaB2h", "question": "This room can be described as what?", "choices": ["dilapidated", "organized", "haphazard", "melting"], "correct_choice_idx": 1, "direct_answers": ["classic", "living room", "classic", "living room", "living room", "organized", "living room", "living room", "living room", "living room"], "difficult_direct_answer": false, "rationales": ["The room is organized.", "The other options don't match.", "The sitting room looks neat with nothing misplaced."], "image": "val2014/COCO_val2014_000000343967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408624, "question_id": "eLC42cASoxtg7pDJzBaxqw", "question": "What do the small floats on the boats sides here meant to prevent the boats doing?", "choices": ["getting lost", "soaring", "bumping", "sinking"], "correct_choice_idx": 2, "direct_answers": ["bumping", "clash", "hitting dock", "hitting boats", "sinking", "sinking", "tipping over", "crashing together", "bumping", "tipping over"], "difficult_direct_answer": false, "rationales": ["They will allow the boats to rock in the water without hitting each other and scratching each other.", "This helps prevent damage if waves make the boats move together", "The boats would bump without the floats."], "image": "val2014/COCO_val2014_000000408624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163091, "question_id": "eLRkyg2HyVYBxVj4TWusUs", "question": "What type of transportation is shown?", "choices": ["air", "water", "land", "rail"], "correct_choice_idx": 0, "direct_answers": ["jet", "airplane", "airplanes", "air transport", "airplane", "air", "planes", "spacecraft", "plane", "jet"], "difficult_direct_answer": false, "rationales": ["The planes fly in the air.", "These are all airplanes.", "They are airplanes."], "image": "train2014/COCO_train2014_000000163091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364009, "question_id": "eLrZMWj4dh9bJWipwuhm3C", "question": "Who is the man on the sidewalk?", "choices": ["truck driver", "pedestrian", "passenger", "bus driver"], "correct_choice_idx": 1, "direct_answers": ["average joe", "older man", "passenger", "local", "passenger", "man", "pedestrian", "unknown", "senior citizen", "pedestrian"], "difficult_direct_answer": false, "rationales": ["He is walking on the sidewalk with a bag", "The man walking with a bag in his hand is a person who can cross a road at any time.", "He is walking."], "image": "val2014/COCO_val2014_000000364009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202201, "question_id": "eMAiUDbtRjoAR4Wc9QqzZb", "question": "How do people keep their hands clean while picking out donuts?", "choices": ["water", "tablecloth", "tissue", "shirt"], "correct_choice_idx": 2, "direct_answers": ["tissue", "hold napkin", "glove", "tissue", "tissues", "tissue", "cellophane", "gloves", "wash", "tissue"], "difficult_direct_answer": false, "rationales": ["The people use tissues.", "People use tissue to pick up donuts.", "The box of tissues is on the counter. the man has a tissue in his hand."], "image": "val2014/COCO_val2014_000000202201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15750, "question_id": "eMMCRMJiD9ZE4AMEYLPwtF", "question": "At what room in the house can you fix the following items?", "choices": ["living room", "bedroom", "toilet", "store"], "correct_choice_idx": 2, "direct_answers": ["bathroom", "toilet", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["Toilets are located in the bathroom.", "Toilets are in the bathroom.", "These are toilets on display."], "image": "train2014/COCO_train2014_000000015750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255992, "question_id": "eMTALR6HcLtv9eFL2PdzuD", "question": "What is the man doing with his eyes?", "choices": ["squinting", "rolling them", "sleeping", "winking"], "correct_choice_idx": 0, "direct_answers": ["squinting", "watching camera", "looking", "starring", "squinting", "looking", "squinting", "smiling", "crinkling", "staring"], "difficult_direct_answer": false, "rationales": ["A man is looking forward with eyes pulled partly shut. people squint when it is sunny out.", "He has them half closed", "The man's face has light shining on it and human's squinch the muscles around their eyes to protect them from brightness."], "image": "train2014/COCO_train2014_000000255992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306782, "question_id": "eMYsqgdRKoLEFn6wQSZof8", "question": "What is behind the television?", "choices": ["dog", "wall", "mouse", "cat"], "correct_choice_idx": 1, "direct_answers": ["wall", "wall", "wall", "wall", "wall", "wall", "wall", "wall", "wall", "wall"], "difficult_direct_answer": false, "rationales": ["An empty wall sits behind the tv set.", "The items are placed against the white long object which connects to the floor and ceiling. to the left of the white object is an opened door.", "The television is inside a room. there are no animals behind the television."], "image": "train2014/COCO_train2014_000000306782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64800, "question_id": "eMf72zyYMCSFoZnd9sLDcL", "question": "What seems to be sold outside the silver trailer?", "choices": ["ice cream", "cupcakes", "hamburgers", "hot dogs"], "correct_choice_idx": 1, "direct_answers": ["cupcakes", "cupcakes", "cupcakes", "cupcakes", "cupcakes", "cupcakes", "cupcakes", "cupcakes", "cupcakes", "cupcakes"], "difficult_direct_answer": false, "rationales": ["There is a large cupcake on top of a silver trailer and many people are gathered around.", "There is a giant cupcake on top of it, indicating it is an advertisement letting people know they can purchase cupcakes there.", "There is an object on the trailer. it looks to be a sweet in a cupcake wrapper."], "image": "train2014/COCO_train2014_000000064800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426915, "question_id": "eMuL8ZoQL22MBfTitsGibB", "question": "What is the color of the sail boat?", "choices": ["red", "orange", "white", "black"], "correct_choice_idx": 3, "direct_answers": ["yellow", "black", "black", "black", "white", "black", "black", "blue", "black", "blue"], "difficult_direct_answer": false, "rationales": ["The main color is not red white or orange.", "The boat is black.", "A large boat with hardware for sales is dark in color."], "image": "train2014/COCO_train2014_000000426915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 781, "question_id": "eMv3JTcKgk6omGwfPSoT9F", "question": "Is this a kite?", "choices": ["maybe", "yes", "unsure", "no"], "correct_choice_idx": 3, "direct_answers": ["no", "yes", "yes", "no", "no", "no", "yes", "yes", "no", "yes"], "difficult_direct_answer": false, "rationales": ["This is actually a parasail.", "This is a parasail.", "It's not a kite."], "image": "train2014/COCO_train2014_000000000781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362739, "question_id": "eN6Fh8KXqVHGgXfuwNHT4q", "question": "Why is it that structure there in the middle?", "choices": ["warning", "intimidation", "prank", "commemoration"], "correct_choice_idx": 3, "direct_answers": ["statue", "memorialize person", "statue", "park decoration", "protection", "commemoration", "history", "display", "decoration", "remembrance"], "difficult_direct_answer": true, "rationales": ["A statue is in front of a large tree in the middle of an area. statues are used to commemorate people and events.", "The structure is an honor.", "Statues always honor people who have done something significant."], "image": "train2014/COCO_train2014_000000362739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194276, "question_id": "eNPPWQGNTVX6HrXzCXDvdX", "question": "Who would probably use the thing that is red brown and blue near the bottom of the photo?", "choices": ["teen", "woman", "man", "small child"], "correct_choice_idx": 3, "direct_answers": ["child", "toddler", "toddler", "toddler", "kitchen", "toddler", "baby", "small child", "baby", "small child"], "difficult_direct_answer": false, "rationales": ["The rocking horse is so tiny, that someone bigger wouldn't fit on it.", "A child would use the toy.", "There is a rocking horse which is popular with children."], "image": "train2014/COCO_train2014_000000194276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370602, "question_id": "eNRyeFCFoise3Jiw9ELQAj", "question": "What activity is prohibited here?", "choices": ["eating", "busses", "taxis", "boarding"], "correct_choice_idx": 2, "direct_answers": ["solicitation", "parking", "standing", "parking", "parking", "solicitation", "standing", "solicitating passengers", "taxis", "parking"], "difficult_direct_answer": false, "rationales": ["Taxis are prohibited from entering here, buses only.", "People are gathered on a sidewalk at a bus stop.", "This is for buses only"], "image": "val2014/COCO_val2014_000000370602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320085, "question_id": "eNTFf8gtAEVBgJgqtYDBPR", "question": "What would happen if you cut the top wires?", "choices": ["nothing", "can't call", "laundry falls", "people injured"], "correct_choice_idx": 3, "direct_answers": ["lifts fall", "fall down", "tragedy", "fall", "chaos", "people fall", "crash", "they fall", "people injured", "everything drops"], "difficult_direct_answer": true, "rationales": ["People would fall off the trams.", "The people on the ski lift are far above the ground and if the wires were cut they would fall to the ground and get hurt.", "The people could fall."], "image": "train2014/COCO_train2014_000000320085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345014, "question_id": "eNgtzsSQgtoQjVssf9xgmv", "question": "What is this train hauling?", "choices": ["gravel", "wood chips", "dust", "steel"], "correct_choice_idx": 0, "direct_answers": ["cargo", "rock", "gravel", "gravel", "coal", "gravel", "gravel", "gravel", "gravel", "gravel"], "difficult_direct_answer": false, "rationales": ["The train has gravel.", "Gravel comes in small chunks and is formed into piles.", "The train has gravel."], "image": "train2014/COCO_train2014_000000345014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434192, "question_id": "ePC2fp7FHynwVa2MLPXJtY", "question": "Why are life preservers brightly colored?", "choices": ["pretty", "more buoyant", "visibility", "style"], "correct_choice_idx": 2, "direct_answers": ["visibility", "visibility", "safety", "for visibility", "easily spotted", "easily seen", "enhanced visibility", "visibility", "visibility", "visibility"], "difficult_direct_answer": false, "rationales": ["People need to be able to rescue those using lifesavers.", "Those colors will be easy to spot against the dark water.", "In case someone is drowning they'll more easily see a brightly colored preserver."], "image": "val2014/COCO_val2014_000000434192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383390, "question_id": "ePCn6JhrECAQ97EFAKG2LN", "question": "A famous player of this game was Babe what?", "choices": ["ruth", "emma", "rosie", "anna"], "correct_choice_idx": 0, "direct_answers": ["ruth", "ruth", "ruth", "ruth", "ruth", "ruth", "ruth", "ruth", "ruth", "ruth"], "difficult_direct_answer": false, "rationales": ["This is the only babe who played baseball", "Babe ruth played for the yankees and was the most famous babe.", "They are playing baseball."], "image": "train2014/COCO_train2014_000000383390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474401, "question_id": "ePLVhQcFLMMtUZ7sZJtW3y", "question": "Why would someone sit here?", "choices": ["to paint", "to eat", "to wait", "to work"], "correct_choice_idx": 3, "direct_answers": ["to compute", "work laptop", "do work", "to work", "laptop work", "use laptop", "work", "doing work", "work", "work"], "difficult_direct_answer": false, "rationales": ["The desk has a computer and a pen, which indicates that work is done at this location. the location appears to be an office.", "Most times laptops are used for work related things.", "Due to the laptop and general setting, you can easily surmise what the setting is used for."], "image": "train2014/COCO_train2014_000000474401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45292, "question_id": "ePbP6YRfNGZjTKeWvadroR", "question": "Who is the woman in the suit?", "choices": ["jockey", "flight attendant", "cashier", "announcer"], "correct_choice_idx": 1, "direct_answers": ["pilot", "flight attendant", "employee", "flight attendant", "flight attendant", "flight steward", "stewardess", "flight attendant", "flight attendant", "flight attendant"], "difficult_direct_answer": false, "rationales": ["She is the flight attendant.", "They are holding a clipboard at the door of a plane.", "The woman is on a plane. she is in uniform because she is working."], "image": "train2014/COCO_train2014_000000045292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214007, "question_id": "ePdWWjddfQiqtFzP4r7WP6", "question": "Which seasoning appears to have the tallest dispenser here?", "choices": ["allspice", "pepper", "vinegar", "cinnamon"], "correct_choice_idx": 1, "direct_answers": ["pepper", "pepper", "pepper", "pepper", "pepper", "pepper", "pepper", "pepper", "pepper", "pepper"], "difficult_direct_answer": false, "rationales": ["Pepper is the tallest.", "The seasoning is pepper.", "The mill is the biggest vessel there."], "image": "val2014/COCO_val2014_000000214007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183204, "question_id": "ePg2oUz3GfwCwNppdTy8Bd", "question": "Items here are laid out due to what purpose?", "choices": ["display wealth", "packing backpack", "moving sale", "manic behaviour"], "correct_choice_idx": 1, "direct_answers": ["collection", "yard sale", "trip planning", "arranging", "display", "selling", "packing backpack", "display", "display", "packing"], "difficult_direct_answer": false, "rationales": ["These are all items you would need if you were going somewhere you can see in the back there is a white bag that looks like a backpack the person logically would put all of the small items in the backpack so that they are contained throughout their trip.", "The empty backpack can be seen at the top of the image.", "Someone is trying to fit as much as possible into a small carry-on bag."], "image": "val2014/COCO_val2014_000000183204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340454, "question_id": "ePtzikECbSYfTVJ8CMw2Bh", "question": "What is the large red object in the background called?", "choices": ["crane", "drill", "ladder", "skyscraper"], "correct_choice_idx": 0, "direct_answers": ["scaffolding", "crane", "crane", "scaffolding", "crane", "tower crane", "crane", "crane", "crane", "crane"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this image.", "There is just one large red object which is the middle of the background. it is known as a crane.", "It lifts and moves heavy things."], "image": "train2014/COCO_train2014_000000340454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137134, "question_id": "ePxkQkFn6n6ZjZz3TZAFo8", "question": "What sport requires this man to lay almost flat to his boards?", "choices": ["snowboarding", "ski jumping", "ski climbing", "ski crossing"], "correct_choice_idx": 1, "direct_answers": ["ski jumping", "surfing", "ski jumping", "skiing", "ski jump", "ski jump", "skiing", "skiing", "hurling", "distance jumping"], "difficult_direct_answer": false, "rationales": ["This is the normal stance for this sport only.", "Jumping in skiis.", "Ski jumping requires a man to lay flat almost to his boards."], "image": "train2014/COCO_train2014_000000137134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25134, "question_id": "eQ6jyWqSQ4fh7cPgZ8EjWd", "question": "How many rows of buses are visible?", "choices": ["six", "four", "three", "five"], "correct_choice_idx": 0, "direct_answers": ["seven", "six", "seven", "six", "six", "six", "six", "six", "seven", "six"], "difficult_direct_answer": false, "rationales": ["There are six rows of busses visible.", "Many bus rows are visible.", "There may be more, but that's the only number in the photo."], "image": "val2014/COCO_val2014_000000025134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447694, "question_id": "eQ8RADgA3qhrWhEddjQZHL", "question": "What can you buy from the shop next to the bar?", "choices": ["laptops", "books", "shoes", "jeans"], "correct_choice_idx": 1, "direct_answers": ["books", "books", "books", "books", "books", "books", "books", "books", "books", "books"], "difficult_direct_answer": false, "rationales": ["The shop in the question has an awning with writing on it that identifies what is sold within.", "The sign says that they sell books.", "The books can be bought."], "image": "train2014/COCO_train2014_000000447694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464812, "question_id": "eQBxtPLzDtfCZTFWzvS3qa", "question": "What league would they want to play in?", "choices": ["nba", "mlb", "nfl", "nhl"], "correct_choice_idx": 1, "direct_answers": ["major league", "major league", "mlb", "mlb", "mens league", "mlb", "mlb", "mlb", "major", "mlb"], "difficult_direct_answer": false, "rationales": ["The type of uniform is exclusively for baseball players and the visible bat eliminates any doubt.", "The league play baseball.", "Major league baseball is the pinnacle of the sport."], "image": "train2014/COCO_train2014_000000464812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256648, "question_id": "eQDAXszAU8hUHMQaM4FwE5", "question": "What metal is visible here?", "choices": ["aluminum", "copper", "nickel", "steel"], "correct_choice_idx": 1, "direct_answers": ["bench", "silver steel", "necklace", "watch", "silver", "copper", "rusty", "wrist watch", "watch/necklace", "silver"], "difficult_direct_answer": true, "rationales": ["The pole in front of her looks to be made out of a metal.", "The metal visible here is nickel in the watch.", "Looks like copper on her sunglasses."], "image": "train2014/COCO_train2014_000000256648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252929, "question_id": "eQWz28qfVLC4Hbe9FUt8iT", "question": "What are the letters for?", "choices": ["child's candy", "writing messages", "mark problems", "luck"], "correct_choice_idx": 1, "direct_answers": ["kite", "decoration", "decoration", "decoration", "decoration", "writing messages", "name", "decorative", "decoration", "decoration"], "difficult_direct_answer": false, "rationales": ["They have words on them.", "You spell words with them", "These letters on kites seem to be a project to send missives into the sky."], "image": "val2014/COCO_val2014_000000252929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427180, "question_id": "eQYkyw32UB4K9dNZ5DHRzx", "question": "What type of kitchen would this be called?", "choices": ["vintage", "colonial", "commercial", "home"], "correct_choice_idx": 2, "direct_answers": ["industrial kitchen", "modern kitchen", "commercial", "industrial", "industrial", "convenience", "modular", "school kitchen", "commercial", "soup kitchen"], "difficult_direct_answer": false, "rationales": ["It is full of stainless steel furniture and has a very large fridge", "This kitchen can prepare large quantities of food.", "The kitchen has wide countertops."], "image": "train2014/COCO_train2014_000000427180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506736, "question_id": "eQbhekGJAYKNqtokKSPwep", "question": "How many of the vegetables are unnecessary to peel before consumed?", "choices": ["one", "none", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["seven", "one", "one", "one", "two", "one", "one", "one", "two", "two"], "difficult_direct_answer": false, "rationales": ["Neither vegetable needs to be peeled to be eaten and the third food is a fruit.", "There is no need to peel carrots or tomatoes before eating.", "The carrots and tomatoes do not have to be peeled before being consumed."], "image": "val2014/COCO_val2014_000000506736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306420, "question_id": "eQeSCatjN8YcafrzE6EVRP", "question": "What is the scarf called being worn by the women?", "choices": ["neckies", "dupatta", "hijab", "dickies"], "correct_choice_idx": 1, "direct_answers": ["dupatta", "sarong", "shawl", "scarf", "sash", "khimar", "dupatta", "dupatta", "sari", "brown"], "difficult_direct_answer": false, "rationales": ["The scarf is a dupatta.", "A woman has a long scarf hung over her shoulder in a traditional way.", "Women in india wear this scarf or shawl."], "image": "train2014/COCO_train2014_000000306420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103431, "question_id": "eQi652muAGx5sLw8LjTtNz", "question": "Whose idea was it to take the picture of these boys?", "choices": ["leftmost boy", "rightmost boy", "all boys", "photographer"], "correct_choice_idx": 3, "direct_answers": ["photographer", "photographer", "teacher", "teacher", "photographer", "teacher", "photographer", "photographer", "teacher", "teacher"], "difficult_direct_answer": false, "rationales": ["It was the photographer who put the boys up to take a picture.", "None of the boys seem happy about it so it has to be the expert", "It is a school picture. a professional is hired to take the students pictures."], "image": "val2014/COCO_val2014_000000103431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20524, "question_id": "eQnUtSXNYRdATddxNkJwC5", "question": "What is most likely in the colorful object?", "choices": ["thumbtacks", "tape", "liquid", "candy"], "correct_choice_idx": 2, "direct_answers": ["coffee", "mug", "mug", "coffee", "coffee", "coffee", "coffee", "liquid", "coffee", "mug"], "difficult_direct_answer": false, "rationales": ["This is a mug on a table. in all probability, a liquid is in it; maybe coffee or hot chocolate.", "It is a coffee mug.", "The mug likely has coffee in it."], "image": "train2014/COCO_train2014_000000020524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284758, "question_id": "eQpmqak5ZJMVaLRT8K3YG9", "question": "The person is viewed through what?", "choices": ["pc screen", "window", "mirror", "naked eye"], "correct_choice_idx": 0, "direct_answers": ["computer", "computer", "mirror", "screen", "screen", "mirror", "pc screen", "mirror", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["The person has a pc screen.", "This is a monitor.", "The controls for a monitor can be seen under a girl brushing her teeth."], "image": "train2014/COCO_train2014_000000284758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147971, "question_id": "eQwVUv26x35u62F9zhGVPd", "question": "What animal mimics the pattern of the plane to the left?", "choices": ["cheetah", "snow leopard", "frog", "dog"], "correct_choice_idx": 1, "direct_answers": ["cheetah", "giraffe", "snow leopard", "zebra", "lepeord", "dalmation", "cheetah", "leopard", "leopard", "zebra"], "difficult_direct_answer": false, "rationales": ["The animal is a leopard.", "The spots on the plane are like those of a snow leopard", "Most of the other animals listed don't have the same type of spots in that style."], "image": "train2014/COCO_train2014_000000147971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206757, "question_id": "eQzbeUGareAJ6KNWd3zhBz", "question": "What sort of traffic is allowed in the narrow street ahead?", "choices": ["cars", "busses", "foot only", "vans"], "correct_choice_idx": 2, "direct_answers": ["foot", "pedestrian", "foot", "pedestrian", "pedestrian", "pedestrian", "pedestrian only", "pedestrian", "trolley", "foot only"], "difficult_direct_answer": false, "rationales": ["Everyone is walking.", "Only pedestrians are walking.", "People are walking and no cars are present."], "image": "val2014/COCO_val2014_000000206757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85803, "question_id": "eRFuzEpd99XRjFnfieVZsX", "question": "Why is the red object in the sink?", "choices": ["to sell", "to purchase", "to wax", "to clean"], "correct_choice_idx": 3, "direct_answers": ["bin", "to wash", "bowl", "pot", "to clean", "cooking pot", "bown", "washing", "dirty", "container"], "difficult_direct_answer": true, "rationales": ["Dirty dishes are kept in the sink to be washed after use.", "The pot is dirty.", "The object is for cleaning."], "image": "val2014/COCO_val2014_000000085803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202657, "question_id": "eRZyfsDrdaBnW4ucRB9i25", "question": "What other situation might the standing items be useful for?", "choices": ["hurricane", "snow", "rain", "wind"], "correct_choice_idx": 2, "direct_answers": ["rain", "rain", "rain", "rain", "dining", "rain", "rain", "during rain", "rain", "sunlight"], "difficult_direct_answer": false, "rationales": ["These umbrellas can help keep them cooler by keeping the heat off of them. they also can keep the other elements off of them if it storms.", "The umbrellas on the beach are being used to shield the sun but it can also be used when it's raining.", "The situation is rainy."], "image": "train2014/COCO_train2014_000000202657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428973, "question_id": "eSErgg7GAEr7bgcbRjq9VV", "question": "The umbrella is being used as a safety measure to protect the kids from getting what?", "choices": ["wet", "tired", "sunburn", "cold"], "correct_choice_idx": 2, "direct_answers": ["sunburn", "sunburn", "sun", "sunburn", "sunburn", "sunburnt", "sunburn", "wet", "sunburned", "rain"], "difficult_direct_answer": false, "rationales": ["The umbrella protects the children from the sun.", "Kids are sitting under an umbrella on a sunny day.", "The dry dirt under them suggests it's to protect against sunlight."], "image": "val2014/COCO_val2014_000000428973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222442, "question_id": "eSRPKvxkmhNoYqJbsyxqtk", "question": "What is the name of the secret identity of the logo on the hat?", "choices": ["bruce wayne", "peter parker", "clarke kent", "rock"], "correct_choice_idx": 0, "direct_answers": ["bruce wayne", "batman", "bruce wayne", "bruce wayne", "bruce wayne", "batman", "batman", "batman", "batman", "bruce wayne"], "difficult_direct_answer": false, "rationales": ["It is the batman symbol of batman, whose real name is bruce.", "The batman's secret identity is that of bruce wayne.", "It's bruce wayne."], "image": "train2014/COCO_train2014_000000222442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335515, "question_id": "eSTSB29nsoLrmdZJtLH634", "question": "What kind of cuisine does this restaurant serve?", "choices": ["american", "italian", "japanese", "chinese"], "correct_choice_idx": 3, "direct_answers": ["chinese", "american italian", "chips", "asian", "asian", "slop", "chinese", "asian", "asian", "chinese food"], "difficult_direct_answer": false, "rationales": ["On the napkin at the tips of the chopstick there are the words \"golden china.\" there are noodles and vegetables next to them on the plate with dumplings in a bowl suggesting asian cuisine.", "There are chopsticks and this type of food", "This is an asian restaurant as you can see from the menu."], "image": "val2014/COCO_val2014_000000335515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352247, "question_id": "eSck38e6cduG6ZXWD9aQsq", "question": "What is the specialty of those larger containers?", "choices": ["preserve temperature", "hold wine", "hold plants", "transporting goods"], "correct_choice_idx": 0, "direct_answers": ["water", "drinks", "milk", "preserve temperature", "drinks", "water", "no clue", "hold beverage", "water", "drinks"], "difficult_direct_answer": false, "rationales": ["These containers have cold beverages inside.", "Large coolers are on a bench with spickets on them. people have drinks ready for kids playing sports.", "The specialty is to preserve the temperature."], "image": "train2014/COCO_train2014_000000352247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124629, "question_id": "eSekXpW8jJTpKLKThyozjF", "question": "What type of sporting area are girls most likely playing on?", "choices": ["tennis court", "soccer field", "basketball stadium", "baseball diamond"], "correct_choice_idx": 0, "direct_answers": ["playground", "tennis court", "sidewalk", "tennis court", "court", "playground", "concrete", "tennis court", "concrete", "park"], "difficult_direct_answer": false, "rationales": ["The girls are playing on a tennis court.", "The area is a tennis court.", "The girls have tennis racquets."], "image": "val2014/COCO_val2014_000000124629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346472, "question_id": "eSozF5W9SQy7EkmRWWeNDM", "question": "What are is the image from?", "choices": ["sky", "city", "underground", "forest"], "correct_choice_idx": 1, "direct_answers": ["streetview", "camera", "downtown", "city", "downtown", "city", "city", "city", "nighttime downtown", "downtown"], "difficult_direct_answer": false, "rationales": ["The area has bright lights and many buildings.", "They took this pic from the air.", "The image shows an overhead view of a city street at night."], "image": "train2014/COCO_train2014_000000346472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501571, "question_id": "eTBghufft6cgZEQwfm7rEY", "question": "What setting do these mounted persons stand in?", "choices": ["park", "riot", "mall", "grocery store"], "correct_choice_idx": 0, "direct_answers": ["park", "public", "outdoors", "road", "crowd", "park", "park", "park", "public park", "park"], "difficult_direct_answer": false, "rationales": ["The setting is a park.", "The setting is a public, outdoor space. based on the layout of the setting and the activities people are engaged in, answer a is likely and the other answers are not viable.", "The area has the greenery and landscaping consistent with public parks."], "image": "train2014/COCO_train2014_000000501571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155644, "question_id": "eTHGCAzfFiCe767kPSLeoP", "question": "What are the shoes that the girl is wearing a good use for?", "choices": ["ballet", "snowboarding", "running", "swimming"], "correct_choice_idx": 2, "direct_answers": ["running", "running", "sports shoes", "running", "sports shoes", "running", "running", "running", "running", "walking"], "difficult_direct_answer": false, "rationales": ["The girl has sneakers on.", "These are tennis shoes and good for exercising.", "A girl is sitting on a bench in athletic shoes."], "image": "val2014/COCO_val2014_000000155644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212679, "question_id": "eTMYaEVvzQLnzFcnsCzUwG", "question": "What are the napkins folded to look like?", "choices": ["boats", "cars", "plane", "fans"], "correct_choice_idx": 0, "direct_answers": ["boats", "boats", "boats", "ships", "ships", "boats", "boats", "boats", "boats", "boats"], "difficult_direct_answer": false, "rationales": ["The napkin is a boat.", "The napkins are folded in a boat shape.", "The napkins look like sailboats."], "image": "train2014/COCO_train2014_000000212679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140500, "question_id": "eTMgfaVrCQf3Q2FguuKULT", "question": "What are these animals commonly called?", "choices": ["llamas", "alpacas", "sheep", "cattle"], "correct_choice_idx": 3, "direct_answers": ["cows", "cattle", "cows", "cows", "cows", "cattle", "cows", "cows", "cattle", "cattle"], "difficult_direct_answer": false, "rationales": ["The animals are cattle.", "The word cattle originates from its latin form meaning head \"one head\" this term later became another name for the cow as they are a livestock animal and used primarily for milk production.", "The animals are big and have four legs. they are farm animals."], "image": "train2014/COCO_train2014_000000140500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54869, "question_id": "eTqJQYz8RvRd5X9xVAZEPY", "question": "How many surfaces can this vehicle adjust to?", "choices": ["one", "two", "four", "none"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The bus is driving in the water. it also can drive on a street.", "This vehicle can traverse water and land.", "The bus can go on water and land."], "image": "train2014/COCO_train2014_000000054869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80652, "question_id": "eTxy39NUr4r3dSke7sLpFi", "question": "How many people ate dinner on this table for lunch today?", "choices": ["four", "ten", "none", "12"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four seats at the dining table which could lead on to believe that potentially four people ate there.", "A table is shown with four chairs.", "There are 4 people."], "image": "val2014/COCO_val2014_000000080652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237463, "question_id": "eU6ZupWqiJnDK5cUVKhezf", "question": "Who has lighter hair than this person?", "choices": ["margaret qualley", "penelope cruz", "taylor swift", "natalie portman"], "correct_choice_idx": 2, "direct_answers": ["blondes", "blondes", "taylor swift", "blonde people", "girl", "blondes", "dennis themenace", "unknown", "taylor swift", "unknown"], "difficult_direct_answer": false, "rationales": ["Taylor swift has blonde hair.", "Swift has blonde hair.", "The person has light brown hair answer a is the only option with a lighter shade of hair while all other options have dark brown or black hair."], "image": "val2014/COCO_val2014_000000237463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131678, "question_id": "eUEDbmtkmJFETVYgXEpBG8", "question": "The inaugural flight of this airline left what city?", "choices": ["madrid", "rome", "hamburg", "geneva"], "correct_choice_idx": 3, "direct_answers": ["unknown", "jerusalem", "singapore", "israel", "israel", "switzerland", "can't see", "geneva", "unsure", "unknown"], "difficult_direct_answer": false, "rationales": ["The airlines in the foreground is el al, based on the writing on the side. after discerning the airline, the answer is internet searchable.", "The first flight of the company whose logo is on the side of the plain left from geneva.", "It left form geneva"], "image": "train2014/COCO_train2014_000000131678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387362, "question_id": "eUZTYmE65XeBEpuGLpnQqU", "question": "What type of mirror is on the wall?", "choices": ["rearview mirror", "compact", "foldable", "pull out"], "correct_choice_idx": 3, "direct_answers": ["pull out", "reflective", "vanity", "makeup mirror", "rectangular", "vanity mirror", "mounted", "cabinet", "wall mirror", "bathroom mirror"], "difficult_direct_answer": true, "rationales": ["The mirror on the right has an extension arm for pulling out.", "The mirror pulls out.", "The round mirror is mounted on an expandable base."], "image": "val2014/COCO_val2014_000000387362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406488, "question_id": "eUZkKWoZWZCjRiNzRE22cB", "question": "What do the large white lines allow pedestrians to do?", "choices": ["loiter", "speed", "park", "cross"], "correct_choice_idx": 3, "direct_answers": ["cross street", "cross", "cross safely", "cross", "crosswalk", "cross", "cross street", "cross", "cross street", "cross street"], "difficult_direct_answer": false, "rationales": ["The lines indicate that the people can move across the road at that point.", "Vehicles yield to pedestrians standing in this walkway.", "The lines tell people where to walk in the crosswalk."], "image": "train2014/COCO_train2014_000000406488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566941, "question_id": "eUiJk9wrHj7JFpht3x837Q", "question": "What item of clothing does the elephant hold?", "choices": ["dress", "hat", "shoes", "pants"], "correct_choice_idx": 1, "direct_answers": ["hat", "hat", "hat", "hat", "hat", "shirt", "hat", "hat", "hat", "hat"], "difficult_direct_answer": false, "rationales": ["The elephant has a cap.", "He took the persons hat off their heat.", "He is putting it on his head."], "image": "val2014/COCO_val2014_000000566941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34785, "question_id": "eUxE4KdrD9CoWUGSwMq8P7", "question": "Consuming which one of these items will make it dangerous to drive?", "choices": ["in box", "in bottle", "on plate", "in bun"], "correct_choice_idx": 1, "direct_answers": ["bear", "beer", "beer", "beer", "bear", "beer", "budweiser beer", "in bottle", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["There is beer in the bottle. you should not drink and drive.", "The bottle says budweiser.", "The drink in a bottle is beer, which has alcohol. drinking this can impair your brain."], "image": "train2014/COCO_train2014_000000034785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114326, "question_id": "eV7Z8NpBWHjYZkc65WHGwT", "question": "Where is the table the boy is sitting at?", "choices": ["police station", "library", "post office", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "restaurant", "restaurant", "cafeteria", "restaurant", "restaurant", "inside", "restaurant", "restaurant", "inside"], "difficult_direct_answer": false, "rationales": ["You can tell by the tables and other customers around that it is.", "There are many patrons seen at tables in the background, indicating this is not someone's home but is a kitchen.", "There are several tables in the room. other people are dining here as well."], "image": "train2014/COCO_train2014_000000114326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102843, "question_id": "eVBz8CF49CudhBTKcqMeiT", "question": "How did the people standing near the lighthouse structure arrive here?", "choices": ["uber", "walking", "by boat", "lyft"], "correct_choice_idx": 1, "direct_answers": ["recently", "boat", "boat", "boat", "walked pier", "walking", "walking", "boat", "pier path", "boats"], "difficult_direct_answer": false, "rationales": ["Only a boat could reach a place in the middle of the water.", "The people walked.", "They were able to walk down to it on the cemented path."], "image": "val2014/COCO_val2014_000000102843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103315, "question_id": "eVCsNSHJtH7vzhMc6Ciadk", "question": "Why do they have their heads covered?", "choices": ["religion", "safety", "fashion", "uniform"], "correct_choice_idx": 1, "direct_answers": ["protection", "protection", "protection", "safety", "protection", "for safety", "safety", "safety", "protection", "safety"], "difficult_direct_answer": false, "rationales": ["People want their heads safe.", "Most people wear a helmet so that if they hit their head they don't get injured. and he is wearing a helmet while skateboarding which is dangerous.", "When skateboarding, you can fall easily and hit your head, so it is paramount to have head safety."], "image": "train2014/COCO_train2014_000000103315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187537, "question_id": "eVEk2L3WyEfVkXyogjvJQC", "question": "What type of bread is on the sandwich?", "choices": ["white", "whole wheat", "light rye", "sourdough"], "correct_choice_idx": 2, "direct_answers": ["wheat", "rye", "rye", "light rye", "rye", "rye", "sandwich bread", "wheat", "white", "rye"], "difficult_direct_answer": false, "rationales": ["The bread is light rye.", "Light rye has little studded seeds in it.", "The sandwich has rye seeds."], "image": "train2014/COCO_train2014_000000187537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517534, "question_id": "eVSuqnqenmEy7Lwtvw2Kz7", "question": "What are the Elephants standing on?", "choices": ["sticks", "water", "concrete", "snow"], "correct_choice_idx": 2, "direct_answers": ["ground", "dirt", "ground", "ground", "concrete", "dirt", "ground", "ground", "ground", "dirt"], "difficult_direct_answer": false, "rationales": ["You can tell by all of the tree branches on the ground as to what the elephants are standing on.", "The animals are in an area near the dirt and grass. it is a solid area.", "It is on the sidewalk which is made from concrete."], "image": "train2014/COCO_train2014_000000517534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516913, "question_id": "eVXP7EVyKUhXKCUbGeN6Mw", "question": "This athlete is using an instrument that is similar to one found in what other sport?", "choices": ["soccer", "hockey", "badminton", "baseball"], "correct_choice_idx": 2, "direct_answers": ["badminton", "tennis", "squash", "badminton", "tennis", "badminton", "pickleball", "badminton", "badminton", "racket"], "difficult_direct_answer": false, "rationales": ["He is holding a racquet, not a bat, ball, or stick.", "The man is playing tennis based on the equipment, attire and setting. as tennis is a racket-based sport, answer a is most similar as another racket-based sport and none of the answers are similar.", "This sport also uses a similar type of racket as tennis."], "image": "val2014/COCO_val2014_000000516913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207043, "question_id": "eVXTWNkQQKNByyog3hRj3E", "question": "What vehicle is equipped to carry bananas?", "choices": ["bicycle", "motorcycle", "scooter", "car"], "correct_choice_idx": 0, "direct_answers": ["bike", "bicycle", "bicycle", "bicycle", "bicycle", "bicycle", "bike", "bicycle", "bike", "bike"], "difficult_direct_answer": false, "rationales": ["A bicycle is used to carry the bananas.", "There are two tires and handlebars and it's human powered", "Bikes carry bananas."], "image": "train2014/COCO_train2014_000000207043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277991, "question_id": "eVY2hXJcW7mDdnyGcBukJM", "question": "What is in the boy's glass?", "choices": ["wine", "juice", "champagne", "coke"], "correct_choice_idx": 3, "direct_answers": ["pop", "soda", "soda", "coke", "water", "soda pop", "coke", "cola", "coke", "water"], "difficult_direct_answer": false, "rationales": ["It is dark colored", "The boy has a dark brown drink in his cup.", "The young man in the teal shirt looks under age to be drinking therefore he has soft drink in his glass."], "image": "val2014/COCO_val2014_000000277991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313733, "question_id": "eW4BRNL2jinvvDiAoMpg9p", "question": "How many wheels does the vehicle here have?", "choices": ["four", "two", "none", "three"], "correct_choice_idx": 1, "direct_answers": ["three", "two", "three", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two wheels.", "Motorbikes have two wheels.", "A motorcycle has one wheel in the front and one wheel in the back."], "image": "val2014/COCO_val2014_000000313733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293133, "question_id": "eWDZSoXjehfAPiDdqimiAr", "question": "Why is she holding his head?", "choices": ["angry", "her job", "cutting hair", "punishing him"], "correct_choice_idx": 2, "direct_answers": ["drying hair", "drying hair", "blowdring hair", "cutting hair", "keeping still", "drying hair", "drying", "keep still", "keep steady", "working hair"], "difficult_direct_answer": false, "rationales": ["She is using equipment on the back of his head. she pushes his head forward so she can see better.", "She is holding his head to cut his hair.", "She is shaving his head."], "image": "val2014/COCO_val2014_000000293133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460575, "question_id": "eWj2gPpYaBEDLhzUAdyAFo", "question": "What is the white substance in the left shaker?", "choices": ["icing", "salt", "creme", "milk"], "correct_choice_idx": 1, "direct_answers": ["salt", "salt", "salt", "tissue paper", "salt", "its salt", "salt", "salt", "salt", "salt"], "difficult_direct_answer": false, "rationales": ["There is a bunch of salt in the left salt shaker.", "The substance is salt.", "The contents are solid and granular and served next to a pepper shaker."], "image": "train2014/COCO_train2014_000000460575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555900, "question_id": "eWjhUqmd95pdJyaD9DBZXH", "question": "What brown foodstuff is common in these round things?", "choices": ["marmite", "beef", "chocolate chips", "mushrooms"], "correct_choice_idx": 2, "direct_answers": ["brown sugar", "muffins", "chocolate chips", "muffin", "muffins", "muffins", "chocolate chips", "chocolate chips", "muffins", "muffin"], "difficult_direct_answer": false, "rationales": ["Muffins are shown in a pan. chocolate chip is a popular muffin type.", "The stuff is chocolate.", "Chocolates are used to spice up the cake."], "image": "val2014/COCO_val2014_000000555900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536607, "question_id": "eWsmAXHLPEsGGzfTkSVjfG", "question": "What is the shirtless man dressed as?", "choices": ["pirate", "wrestler", "cop", "prisoner"], "correct_choice_idx": 0, "direct_answers": ["pirate", "pirate", "pirate", "pirate", "pirate", "pirate", "pirate", "pirate", "pirate", "pirate"], "difficult_direct_answer": false, "rationales": ["The man has an eye patch, a tricorne hat, and a parrot and one of the options is traditionally associated with those items.", "The man is a pirate.", "The hat he is wearing and the patch on his eye."], "image": "train2014/COCO_train2014_000000536607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78671, "question_id": "eWw626xRfF9ctpieZUV94P", "question": "What country was the batter born in?", "choices": ["mongolia", "japan", "china", "canada"], "correct_choice_idx": 1, "direct_answers": ["usa", "japan", "japan", "mexico", "united states", "japan", "japan", "spain", "japan", "usa"], "difficult_direct_answer": false, "rationales": ["He was probably born in japan because his name is ichiro", "Ichiro is a japanese name and this player is known to be japanese.", "He was born in japan."], "image": "val2014/COCO_val2014_000000078671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435414, "question_id": "eWyfadKNgEhwwDeDr2Z3gj", "question": "What style are his jeans?", "choices": ["bell bottom", "straight", "skinny", "distressed"], "correct_choice_idx": 0, "direct_answers": ["baggy", "flared", "bell bottoms", "faded", "rugged", "flared", "bell bottom", "bell bottom", "boot cut", "bell bottoms"], "difficult_direct_answer": false, "rationales": ["The jeans are a little flared at the shoes.", "The jeans are larger at the bottom than they are at the top.", "The jeans are actually making contact with the ground; that mean that they are \"bell bottoms.\" these were very popular in the 1970s and eric clapton recorded a song about them; \"bell bottom blues.\""], "image": "train2014/COCO_train2014_000000435414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44170, "question_id": "eX2R8gtzLjMsVGciWWmLpo", "question": "What sort of establishment is the person visiting?", "choices": ["balloon shop", "bakery", "pizzeria", "sub shop"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "donut shop", "donut shop", "cafe", "bakery", "cafe", "bakery", "donut shop", "bakery", "donut shop"], "difficult_direct_answer": false, "rationales": ["A person is eating a donut at a place of business. bakeries make donuts.", "She is eating a donut.", "She is clearly eating some kind of cookie; that's one thing you can find in a bakery."], "image": "train2014/COCO_train2014_000000044170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386752, "question_id": "eX4c3MKewv9KXdCACmqJ3V", "question": "The woman wearing what color of coat is in the greatest danger?", "choices": ["grey", "black", "white", "blue"], "correct_choice_idx": 0, "direct_answers": ["brown", "gray", "tan", "bus", "gray", "grey", "bus", "grey", "grey", "grey"], "difficult_direct_answer": false, "rationales": ["She's on the road and a bus is coming", "The woman is right in front of the bus. she could get hit.", "She's in the middle of the street with oncoming traffic."], "image": "train2014/COCO_train2014_000000386752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229295, "question_id": "eXVjBm7xAnsAgiD9bDDyrM", "question": "What kind of sauce is this?", "choices": ["hot sauce", "relish", "mustard", "ketchup"], "correct_choice_idx": 1, "direct_answers": ["relish", "relish", "ketchup", "relish", "relish", "relish", "ketchup", "relish", "relish", "relish"], "difficult_direct_answer": false, "rationales": ["The condiment coming out of the bottle is green and is being put on a hotdog.", "The sauce is relish.", "The sauce is ketch up and its red."], "image": "val2014/COCO_val2014_000000229295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114648, "question_id": "eXZA9iZoEcQPqqEcugfbS3", "question": "The largest food item on any of these tables is found in what sauce?", "choices": ["duck", "ketchup", "soy", "mustard"], "correct_choice_idx": 1, "direct_answers": ["marinara", "tomato", "tomato", "tomato", "ketchup", "tomatoes", "ketchup", "tomatoes", "spaghetti sauce", "spaghetti"], "difficult_direct_answer": false, "rationales": ["Tomatoes make ketchup and other items.", "The item is ketchup.", "The largest visible food item on table are the tomatoes. tomatoes are the main ingredient in answer a."], "image": "train2014/COCO_train2014_000000114648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98872, "question_id": "eXe4vc8ndEiMEzTDR2MoVJ", "question": "Why is his tongue out?", "choices": ["showing anger", "being friendly", "sharing lunch", "licking tie"], "correct_choice_idx": 1, "direct_answers": ["being friendly", "being silly", "kissing", "unknown", "lick", "sexy time", "kisssing", "kissing her", "to kiss", "french kissing"], "difficult_direct_answer": true, "rationales": ["His tongue is out for a funny photo.", "The people are being friendly with each other.", "The man is about to kiss the woman."], "image": "val2014/COCO_val2014_000000098872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535768, "question_id": "eXkGgq6PcetMH6dUTH8jK6", "question": "The boy is wearing a shirt that looks like the shirt of a character in what series?", "choices": ["smurfs", "curious george", "where's waldo", "garfield"], "correct_choice_idx": 2, "direct_answers": ["waldo", "waldo", "where's waldo", "nickoledon", "pirate", "where's wally", "waldo", "where's waldo", "where's waldo", "waldo"], "difficult_direct_answer": false, "rationales": ["A kid is wearing a red and white striped shirt.", "Waldo from the \"where's waldo\" series wears a red and white striped shirt. a boy is wearing a red and white striped shirt.", "The boy has a red and white striped shirt."], "image": "train2014/COCO_train2014_000000535768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369977, "question_id": "eXrRAckKcDiMaDEXws89JU", "question": "What is the item hanging from the ceiling?", "choices": ["lights", "chandeliers", "bats", "fans"], "correct_choice_idx": 0, "direct_answers": ["lights", "lights", "lights", "lights", "lights", "light", "lights", "light", "lights", "lights"], "difficult_direct_answer": false, "rationales": ["The item is a light.", "They are lamps with light bulbs in different directions to shine light when you turn them on.", "There are lights all over the ceiling."], "image": "train2014/COCO_train2014_000000369977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479008, "question_id": "eY5cNUPBP85dDxCEndcWbB", "question": "Which class are these passengers probably sitting in?", "choices": ["economy", "business class", "first class", "premium economy"], "correct_choice_idx": 0, "direct_answers": ["economy", "coach", "economy", "economy", "economy", "coach", "coach", "economy", "economy", "coach"], "difficult_direct_answer": false, "rationales": ["The people on the plane are sitting very close together. their seats were probably cheap.", "The class is economy.", "The seats are very cramped."], "image": "val2014/COCO_val2014_000000479008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333626, "question_id": "eY6uqnoaTQ4KUYadPmmMFP", "question": "What is this person's likely job title?", "choices": ["garbage man", "electrician", "waiter", "line cook"], "correct_choice_idx": 3, "direct_answers": ["cook", "cook", "chef", "chef", "cook", "chef", "chef", "chef", "chef", "line cook"], "difficult_direct_answer": false, "rationales": ["A person is preparing food in a commercial kitchen.", "This person is likely to be a line cook at a commercial restaurant.", "The person is working in a restaurant. he is wearing an apron, so he is not a waiter."], "image": "train2014/COCO_train2014_000000333626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120826, "question_id": "eYDyai49YzXfyV6qStkB3f", "question": "What gasoline powers this train?", "choices": ["coal", "unleaded", "diesel", "regular"], "correct_choice_idx": 2, "direct_answers": ["diesel", "diesel", "gas", "diesel", "diesel", "diesel", "diesel", "diesel", "diesel", "not visible"], "difficult_direct_answer": false, "rationales": ["A train is on tracks at a train station.", "Diesel is what keeps the train running since it looks pretty old.", "The train is powered by diesel."], "image": "val2014/COCO_val2014_000000120826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276491, "question_id": "eYLQ72jHUMcpRV2DUy9i4w", "question": "Why is this man in bed?", "choices": ["is lazy", "playing sick", "is sleeping", "in hospital"], "correct_choice_idx": 3, "direct_answers": ["in hospital", "sick", "is ill", "use laptop", "recuperate", "sick", "sick", "recovering", "sick", "hospitalized"], "difficult_direct_answer": false, "rationales": ["Based on the equipment visible, this person is in a medical setting.", "The man is getting a medical procedure done.", "He is in a hospital."], "image": "train2014/COCO_train2014_000000276491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337663, "question_id": "eYQfrp7qMLBcrWzed5Uc6b", "question": "What shape is the sign on the post to the left of the man?", "choices": ["hexagon", "circle", "rectangle", "square"], "correct_choice_idx": 0, "direct_answers": ["octagon", "octagon", "hexagon", "octagon", "octagon", "octagon", "hexagon", "hexagonal", "octagon", "octagon"], "difficult_direct_answer": false, "rationales": ["The sign on the post has six sides on it.", "The sign has six sides.", "The shape is a hexagon."], "image": "train2014/COCO_train2014_000000337663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168828, "question_id": "eYsfs3hfCmv4hrzD5hYSQw", "question": "Why is it unusual for elephants to have tusks this big?", "choices": ["attracts poachers", "grooming habits", "impossibility", "abnormality"], "correct_choice_idx": 0, "direct_answers": ["they break", "it's not", "poaching", "poachers", "attracts poachers", "break off", "old", "ivory", "endangered", "captivity"], "difficult_direct_answer": true, "rationales": ["Most tusks on elephants are a bit shorter than the one presented here.", "Elephants are hunted for their long tusks.", "Most in the wild would have been hunted because their ivory is valuable."], "image": "train2014/COCO_train2014_000000168828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215738, "question_id": "eZ3ptTxMDBoZnQNF8VsVoJ", "question": "What is the man in pink doing on the street?", "choices": ["driving", "selling", "cleaning", "crossing"], "correct_choice_idx": 3, "direct_answers": ["crossing street", "crossing", "crossing", "crossing", "walking", "crossing", "walking", "walking", "crossing", "crossing street"], "difficult_direct_answer": false, "rationales": ["The man is walking a long the lines of a cross walk.", "A person is a pink shirt is walking across the street at the crosswalk. people use the crosswalk to cross the street.", "The man is crossing."], "image": "train2014/COCO_train2014_000000215738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510657, "question_id": "eZ7m2fhpeJNLso4o4s9E65", "question": "What type items are being held in the basket here?", "choices": ["glasses", "ice", "condiments", "cash"], "correct_choice_idx": 2, "direct_answers": ["condiments", "condiments", "condiments", "condiments", "condiments", "condiments", "condiments", "condiments", "condiments", "condiments"], "difficult_direct_answer": false, "rationales": ["The items are condiments.", "Ketchup, mustard, hot sauces, and other items, commonly considered condiments, are in the basket.", "Ketchup and mustard are condiments."], "image": "val2014/COCO_val2014_000000510657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211080, "question_id": "eZEXm7ZVjJvZxBtrSS5wE7", "question": "Whish direction is the skier in?", "choices": ["upside down", "level", "sideways", "backwards"], "correct_choice_idx": 0, "direct_answers": ["upside down", "upside down", "upside down", "downhill", "upside-down", "upside down", "upside down", "upside down", "upside down", "down"], "difficult_direct_answer": false, "rationales": ["The skier is in the middle of a flip, which is why he's upside down.", "His head is pointing to the ground and feet are above him", "The skier is currently upside down in the photo."], "image": "train2014/COCO_train2014_000000211080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177149, "question_id": "eZNL8wCUKVuAtyL9mBAuRs", "question": "What entity is in charge of the equipment shown here?", "choices": ["executive branch", "no one", "peace corps", "military"], "correct_choice_idx": 3, "direct_answers": ["government", "military", "steel entity", "government", "construction workers", "military", "foreman", "unknown", "construction workers", "crane"], "difficult_direct_answer": false, "rationales": ["The entity is the military.", "The military is in charge based on the uniformed personnel and color of the equipment.", "There are soldiers here."], "image": "val2014/COCO_val2014_000000177149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147970, "question_id": "eZVSuX6sXeBzKMkg8622d5", "question": "The man is throwing the frisbee behind his back and under what body part?", "choices": ["left arm", "right leg", "right arm", "left leg"], "correct_choice_idx": 3, "direct_answers": ["buttocks", "buttocks", "left leg", "leg", "leg", "leg", "leg", "buttocks", "leg", "butt"], "difficult_direct_answer": false, "rationales": ["The man is using his left leg to throw the frisbee.", "The person is reaching towards their left leg, which is elevated.", "Looking carefully at the picture, we see that the frisbee is under his left leg."], "image": "train2014/COCO_train2014_000000147970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377326, "question_id": "eZWo6TTiHroGJedfz4QYoQ", "question": "Why do the animals have their heads to the water?", "choices": ["to drink", "to bathe", "to play", "to soak"], "correct_choice_idx": 0, "direct_answers": ["drinking", "drinking", "cows", "drinking", "to drink", "dehydration", "drinking", "to drink", "drink", "drinking"], "difficult_direct_answer": false, "rationales": ["They are thirsty.", "The cows are thirsty, so they are lapping up some water from their pond. the average cow will drink from 3 - 30 gallons of water a day, depending on the circumstances.", "The cows' mouths are touching the water."], "image": "val2014/COCO_val2014_000000377326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560222, "question_id": "eZeD52rwwyR8eWeRq2s9VN", "question": "Which actor has the same last name as this street?", "choices": ["john goodman", "christian slater", "tom arnold", "ben kingsley"], "correct_choice_idx": 1, "direct_answers": ["christian slater", "christian slater", "christian slater", "christian slater", "christian", "christian slater", "christian slater", "christian", "christian slater", "christian"], "difficult_direct_answer": false, "rationales": ["The sign says slater on it.", "A street sign lists slater as the street name. an act or has the name christian slater.", "It's the same word as on the sign"], "image": "val2014/COCO_val2014_000000560222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143526, "question_id": "eZpuGMvbzyLwjdP5qbWGvm", "question": "How has this table been decorated?", "choices": ["china", "name cards", "confetti", "centerpiece"], "correct_choice_idx": 3, "direct_answers": ["flower bouquet", "flowers", "flowers", "flower vase", "florally", "with flowers", "centerpiece", "with flowers", "flowers", "flowers vase"], "difficult_direct_answer": false, "rationales": ["The flowers are in the center.", "The glass table has been decorated with a vase of flowers. with it being set somewhat the middle of the table, we refer to is commonly as a centerpiece.", "The table has a centerpiece."], "image": "val2014/COCO_val2014_000000143526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411645, "question_id": "eaBWvtKpfch6tCnmtk8j3i", "question": "What color is the plane on the far right?", "choices": ["red", "green", "purple", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "white", "blue/gray", "blue", "blue", "blue white", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The plane on the right is light blue and grey.", "The plane is a bright blue hue.", "A white plane is at an airport with a light blue one nearby."], "image": "train2014/COCO_train2014_000000411645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376246, "question_id": "eaMSYNw77rVPsJrVwFxgda", "question": "What type of jackets do people wear when skiing?", "choices": ["comforter", "vest", "parka", "sweater"], "correct_choice_idx": 2, "direct_answers": ["winter", "winter jackets", "winter", "winter jackets", "snow", "winter", "winter jacket", "insulated warm", "parka", "ski jackets"], "difficult_direct_answer": false, "rationales": ["People wear jackets that make them feel warm in the snow.", "Parkas are warmest.", "A parka jacket is considered fine for skiing even though it is not considered very comfortable."], "image": "val2014/COCO_val2014_000000376246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334616, "question_id": "eaf3LoxEmZvRMj6Q8iATi5", "question": "What's slightly unusual about the dog?", "choices": ["arm size", "ear size", "tail size", "wearing clothes"], "correct_choice_idx": 3, "direct_answers": ["wearing shirt", "wearing sweater", "large ears", "sweater", "wearing shirt", "wearing clothing", "shirt", "has clothing", "wearing clothes", "wearing sweatshirt"], "difficult_direct_answer": true, "rationales": ["Most dogs don't wear clothes.", "The dog has clothes.", "Animals don't normally wear shirts."], "image": "train2014/COCO_train2014_000000334616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318701, "question_id": "eaphfniHyzh4weDJHEdAeU", "question": "The way the person appears makes them look like what type of being?", "choices": ["ghost", "werewolf", "vampire", "wendigo"], "correct_choice_idx": 0, "direct_answers": ["disappear", "disappear", "ghost", "ghost", "ghost", "disappear", "ghost", "ghost", "ghost", "ghost"], "difficult_direct_answer": false, "rationales": ["The way is a ghost.", "Ghosts appear see through.", "A man is translucent and you can see thru him in the room."], "image": "val2014/COCO_val2014_000000318701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490385, "question_id": "eatbEF84yobRxdTUWzmnpR", "question": "What is the price of each cronut in dollars?", "choices": ["five", "ten", "20", "15"], "correct_choice_idx": 0, "direct_answers": ["five dollars", "five", "$5.00", "five", "$5.00", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["There is a sign on a stand sitting on the same serving platter as the cronut which lists the price.", "The price is 5.", "The sign says that they cost 5 dollars."], "image": "train2014/COCO_train2014_000000490385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399489, "question_id": "ebYYuLmDSDCJeKwB8Y44Qf", "question": "What product is made at and for Lagomarcino's?", "choices": ["wallets", "wendy's shakes", "ice cream", "none"], "correct_choice_idx": 2, "direct_answers": ["ice cream", "ice cream", "ice cream", "ice cream", "ice cream", "ice cream", "ice cream", "ice cream", "ice cream", "ice cream"], "difficult_direct_answer": false, "rationales": ["The sign clearly states the product. these types of signs are common on store fronts.", "The sign says that ice cream is made and sold at lagomarcino's.", "The text on their sign indicates the food item that they make."], "image": "train2014/COCO_train2014_000000399489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378189, "question_id": "ebc3oEKgrHaNgEz8gSHia8", "question": "The cat on the furniture is illuminated by what type of light?", "choices": ["recessed light", "sunlight", "moonlight", "table lamp"], "correct_choice_idx": 0, "direct_answers": ["lamp", "ceiling light", "overhead lightbulb", "light", "computer light", "ceiling light", "ceiling", "artificial", "recessed light", "ceiling light"], "difficult_direct_answer": false, "rationales": ["The cat has some light from the ceiling shining down on it.", "It is a small light up in the ceiling flush with it", "We can see the light source of this image in the ceiling."], "image": "train2014/COCO_train2014_000000378189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240210, "question_id": "ebenmyKQ6SaJ4PdoqZYzWy", "question": "What group of people are likely on this bus?", "choices": ["interstate elderly", "school children", "downtown doctors", "crosstown passengers"], "correct_choice_idx": 3, "direct_answers": ["travellers", "crosstown passengers", "passengers", "lower middleclass", "passengers", "suburban", "paramedics", "ems", "passengers", "disabled"], "difficult_direct_answer": false, "rationales": ["The panel in front of the bus says crosstown.", "The group is a passenger.", "A public transportation bus lists its next stop as crosstown."], "image": "val2014/COCO_val2014_000000240210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3077, "question_id": "ebphGMfW8JUKebmnbjHYiz", "question": "What year was this school founded?", "choices": ["1915", "1848", "2001", "1699"], "correct_choice_idx": 1, "direct_answers": ["1848", "1848", "last century", "1848", "1848", "1848", "1975", "1848", "1924", "1848"], "difficult_direct_answer": false, "rationales": ["That's when the proctor academy was founded.", "The shirts say proctor academy which was founded in 1848 according to google.", "The year was 1848."], "image": "train2014/COCO_train2014_000000003077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391915, "question_id": "eby9FGGiFBHceLdgGQEH58", "question": "What is the canopy netting for?", "choices": ["privacy", "mosquitoes", "wind block", "decor"], "correct_choice_idx": 1, "direct_answers": ["bug repellant", "roof", "bed curtains", "bug prevention", "mosquitoes", "decoration", "avoid mosquitoes", "bugs", "style", "mosquito deterrent"], "difficult_direct_answer": true, "rationales": ["The canopy is for bugs.", "Mosquitoes are a pest and the net keeps them away.", "The nett is for the mosquitos."], "image": "val2014/COCO_val2014_000000391915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419790, "question_id": "ec4Gr9h72jMadiWbVKajr3", "question": "Why are the bike riders stopped?", "choices": ["heavy flooding", "exhaustion", "popped tires", "train crossing"], "correct_choice_idx": 3, "direct_answers": ["train", "red light", "waiting", "train", "traffic signal", "train crossing", "traffic light", "train", "train crossing", "train crossing"], "difficult_direct_answer": false, "rationales": ["They have to wait for the tracks to be clear before they can safely proceed.", "The train is crossing in the area so the bike must stand.", "When a train is travelling, no one can go across the street to the other side at a crossing."], "image": "val2014/COCO_val2014_000000419790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347542, "question_id": "ecE6nZXirtr5HzR46bybnh", "question": "Why is his arm so high in the air?", "choices": ["unbalanced", "hit ball", "wants attention", "arm broken"], "correct_choice_idx": 1, "direct_answers": ["hit ball", "reaching", "hit ball", "tennis", "hit ball", "high ball", "hitting ball", "getting ball", "returning serve", "reaching"], "difficult_direct_answer": false, "rationales": ["A man is reaching up with a tennis racket.", "He is reaching his raquet up to hit the ball.", "The man is playing tennis and with the racket in his hand the tennis ball is touch the racket, indicating that the man is going to hit the tennis ball."], "image": "train2014/COCO_train2014_000000347542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123995, "question_id": "ecd8hwKFhLwKcMrgARFQ5T", "question": "Where does this scene probably take place?", "choices": ["food court", "cellar", "fancy restaurant", "high school"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "restaurant", "restaurant", "living room", "restaurant", "restaurant", "kitchen", "fancy restaurant", "restaurant", "sitting room"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this type of setting.", "The place looks like it's very fancy to eat at.", "The plates look really fancy."], "image": "train2014/COCO_train2014_000000123995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190509, "question_id": "ecnp9uaT8YiFAJGftEGyie", "question": "What helps for transponders to communicate with the cab and train control systems?", "choices": ["network", "wire", "signal", "cab"], "correct_choice_idx": 2, "direct_answers": ["radios", "signal", "radio", "telecommunication wires", "radio", "radio", "radio", "electric signals", "radios", "electric cables"], "difficult_direct_answer": false, "rationales": ["The signal helps.", "Transponders provide a signal for train communication systems.", "A signal allows for more efficient communication."], "image": "train2014/COCO_train2014_000000190509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412002, "question_id": "ed8Nc9ZnWZswtrmaxvtDsJ", "question": "What type of staircase is shown?", "choices": ["deviated", "spiral", "one way", "floral"], "correct_choice_idx": 1, "direct_answers": ["dual", "curved", "double", "double staircase", "grand", "grand", "imperial", "cruise-ship lobby", "spiral", "building structure"], "difficult_direct_answer": true, "rationales": ["Dual staircases such as these that meet at the top are known as this due to their curved shape as they ascend.", "The staircase is spiral.", "This is a type of spiral as it's curved."], "image": "train2014/COCO_train2014_000000412002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274424, "question_id": "edRMkRCorKGWPjr2eN5oJo", "question": "What type of vehicle is the woman on?", "choices": ["yacht", "bus", "airplane", "boat"], "correct_choice_idx": 1, "direct_answers": ["bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["The woman is sitting on a seat in the bus.", "The seating arrangement, metal bars, and large windows around the interior denote a bus.", "The woman is on a bus."], "image": "train2014/COCO_train2014_000000274424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104389, "question_id": "edWiDhtgnpaZNPZhDWW62U", "question": "Which button is the person almost certainly pressing on the laptop keyboard?", "choices": ["delete", "power", "tab", "volume"], "correct_choice_idx": 1, "direct_answers": ["power", "escape key", "escape", "power", "three", "off button", "power", "power on", "f1 button", "escape"], "difficult_direct_answer": false, "rationales": ["They seem to be in the area where a power button would be located.", "The person wants to hit the power button.", "The button is for power."], "image": "train2014/COCO_train2014_000000104389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218155, "question_id": "edXhzq3H89t3gFGDcRSrRV", "question": "The cyclist is most prepared for which weather today?", "choices": ["tornado", "rain", "earthquake", "tsunami"], "correct_choice_idx": 1, "direct_answers": ["rain", "rainy", "rain", "rain", "rain", "rain", "rain", "rainy", "rain", "rainy"], "difficult_direct_answer": false, "rationales": ["There is an umbrella strapped to the bike which will keep him dry.", "Because a umbrella is attached on the top layer bicycle part.", "The cyclist has an umbrella strapped to the bike to prevent him from becoming wet during his bike ride."], "image": "train2014/COCO_train2014_000000218155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372309, "question_id": "edY4axUgPnN7sBmn2xCQFM", "question": "What are the boots made from on the right?", "choices": ["vinyl", "leather", "rubber", "cloth"], "correct_choice_idx": 2, "direct_answers": ["leather", "rubber", "rubber", "rubber", "rubber", "rubber", "leather", "rubber", "plastic", "rubber"], "difficult_direct_answer": false, "rationales": ["This is the traditional material for footwear to be made of, if the footwear is intended to be worn in heavy rain.", "The boots are rubber.", "They are made of rubber and will help your feet stay dry when it's wet out."], "image": "train2014/COCO_train2014_000000372309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173062, "question_id": "edcciFABANoeAx54RgqmWh", "question": "What is the person doing in the bed?", "choices": ["cleaning", "sleeping", "watching television", "eating"], "correct_choice_idx": 2, "direct_answers": ["watching tv", "watching tv", "watching television", "watching tv", "resting", "watching television", "playing videogame", "watching screen", "watching television", "watching tv"], "difficult_direct_answer": false, "rationales": ["Based on the location and orientation of the person's foot, from this perspective the rest of their body would be facing towards the television.", "There is something on the screen in front of them", "The television is on and judging by the angle of the picture and where the foot is, the person is looking at the television."], "image": "train2014/COCO_train2014_000000173062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26454, "question_id": "eddRPe9Z7kKaZHjv5jA5v6", "question": "What has been used to wrap the food in the lower right?", "choices": ["plastic bag", "saran wrap", "cloth", "tinfoil"], "correct_choice_idx": 3, "direct_answers": ["tinfoil", "tin foil", "tinfoil", "foil", "aluminum foil", "paper", "foil", "aluminum foil", "aluminum foil", "aluminum foil"], "difficult_direct_answer": false, "rationales": ["The food is wrapped in shiny silver wrapping so it is tin foil.", "The food is covered in tinfoil.", "It is a thin layer of metal."], "image": "train2014/COCO_train2014_000000026454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211476, "question_id": "ednxAeQjyKrSWRbDr4qfqe", "question": "What are these food containers called?", "choices": ["styrofoam", "bento boxes", "tupperware", "plastic"], "correct_choice_idx": 1, "direct_answers": ["bento boxes", "bento boxes", "tupperware", "bowls", "bowls", "bento", "bowls", "bowls", "bento box", "tupperware"], "difficult_direct_answer": false, "rationales": ["These are bento boxes.", "That's what the japanese have named the food containers.", "Black boxes with food are on a table. bento boxes are black boxes that hold food."], "image": "val2014/COCO_val2014_000000211476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521669, "question_id": "edoS2wrbMkuLUHaFt4NSBR", "question": "What can you use to do an action similar to what the phone is in the process of doing?", "choices": ["notepad", "ftp server", "cassette player", "teddy bear"], "correct_choice_idx": 1, "direct_answers": ["laptop", "drink", "computer", "computer", "ftp server", "computer", "computer", "shotglass", "take pictures", "computer"], "difficult_direct_answer": false, "rationales": ["The phone is in the process of uploading photos. answer a is another tool that can be used for this type of process.", "The ftp server is used.", "An ftp server will be used."], "image": "val2014/COCO_val2014_000000521669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390759, "question_id": "edoxRgVhff7FSBfMHVu74Z", "question": "Why are the animals putting their face to the ground?", "choices": ["to rest", "to sleep", "to eat", "to fight"], "correct_choice_idx": 2, "direct_answers": ["cow", "to eat", "to eat", "to eat", "eating", "eat", "eating", "eating", "eating", "take rest"], "difficult_direct_answer": false, "rationales": ["There is food on the ground.", "The cows are awake and are not interacting with each other.", "The animals are snacking."], "image": "val2014/COCO_val2014_000000390759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176403, "question_id": "edqmuNw2gnCmdGGoC9QbQH", "question": "What purpose are the red umbrellas serving today?", "choices": ["shelter snow", "shade", "rain protection", "child's toy"], "correct_choice_idx": 1, "direct_answers": ["shade", "shade", "party", "shade", "shade", "shade", "shade", "party", "shade", "party"], "difficult_direct_answer": false, "rationales": ["The umbrellas provide shade to the people resting here.", "The purpose is for shade.", "It is sunny outside and umbrellas give relief from the sun by providing shade."], "image": "train2014/COCO_train2014_000000176403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486650, "question_id": "edr5FcDcmeSacocXwzp92f", "question": "What would happen if the ice instantly melted here?", "choices": ["drink diluted", "people submerged", "kites unstuck", "cups filled"], "correct_choice_idx": 1, "direct_answers": ["drownings", "swimming", "people fall", "people drowning", "wet humans", "people submerged", "people drown", "they'd fall", "drowning", "wet people"], "difficult_direct_answer": true, "rationales": ["People are on a frozen lake.", "People would fall into the water under the ice if it melted. they would be submerged into the water below the ice.", "People are standing on ice with kites."], "image": "train2014/COCO_train2014_000000486650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170072, "question_id": "ee35KZwfzopSCHVnaWyhBP", "question": "What sport does the person play that is on the sign?", "choices": ["hockey", "basketball", "baseball", "football"], "correct_choice_idx": 2, "direct_answers": ["baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["The player has a twins jersey and a baseball bat and batting helmet on.", "He is holding a bat, and wearing a batting helmet and baseball uniform.", "A baseball bat is shown."], "image": "val2014/COCO_val2014_000000170072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114515, "question_id": "ee7dSHxuGG8wNTATwBTNKq", "question": "What does the guy in the button down shirt hope to do?", "choices": ["skateboard", "leave", "just watch", "take photo"], "correct_choice_idx": 3, "direct_answers": ["film", "get pictures", "watch skateboarders", "take photo", "take photo", "exercise", "gain speed", "photography", "win", "skate"], "difficult_direct_answer": true, "rationales": ["He is holding a camera in his hand.", "The guy in the blue shirt is holding a professional camera.", "He is holding a camera with a zoom lense. cameras capture images."], "image": "train2014/COCO_train2014_000000114515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383580, "question_id": "eeCqAAcbcM8wwi4X9JwdiC", "question": "What meal might you eat at this time of day?", "choices": ["elevenses", "high tea", "breakfast", "supper"], "correct_choice_idx": 1, "direct_answers": ["high tea", "early dinner", "supper", "dinner", "cereal", "lunch", "dinner", "dinner", "dinner", "dinner"], "difficult_direct_answer": false, "rationales": ["It is almost at night and supper is taken at this time.", "High tea is in the afternoon.", "This liquid is consumed in the late afternoon in england."], "image": "train2014/COCO_train2014_000000383580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563070, "question_id": "eeE6QUxFGhDU5GZs3dkVyz", "question": "How many vehicles can go parallel along this road?", "choices": ["two", "three", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["2 vehicles", "one", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two visible lanes in the road. lanes are used with one car per lane.", "There are 2 vehicles.", "The bus is the length of two cars."], "image": "train2014/COCO_train2014_000000563070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376549, "question_id": "eeRbs4DzKwpRB5tWuzYnLc", "question": "Why is he looking back?", "choices": ["is lost", "is confused", "posing camera", "is afraid"], "correct_choice_idx": 2, "direct_answers": ["posing camera", "watching friend", "seeing camera", "pictures", "posing", "posing", "see person", "take photo", "smiling camera", "facing camera"], "difficult_direct_answer": true, "rationales": ["He is smiling at the photographer", "The man is smiling and standing still.", "He's posing."], "image": "val2014/COCO_val2014_000000376549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571636, "question_id": "eeTtzDKnKGYaJbKLJckiZu", "question": "What is the man in the truck delivering?", "choices": ["blown glass", "food", "water jugs", "blue tires"], "correct_choice_idx": 2, "direct_answers": ["bottled water", "water", "water", "water", "water", "water jugs", "water", "water jugs", "water jugs", "water"], "difficult_direct_answer": false, "rationales": ["They are clear so the contents can be seen.", "The man in the truck is delivering a load of water jugs.", "The back of the truck is open, and massive water bottles can be seen, suggesting he works for a water company that delivers water jugs."], "image": "val2014/COCO_val2014_000000571636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321185, "question_id": "eeXbxGvTHoMeTh3FG92NSn", "question": "Why is the black bike seat wet?", "choices": ["perspiration", "sea mist", "spill", "rain"], "correct_choice_idx": 3, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["It is outdoors in a cityscape, under a gloomy sky with no person, sea, or other liquids in sight.", "The bike seat is wet because it is raining outside and the bike is outside", "It has drops of water on it."], "image": "train2014/COCO_train2014_000000321185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573762, "question_id": "eecA6rvZVriQBSHwBxndoY", "question": "Why is he crouching over?", "choices": ["shorter fall", "stay warmer", "less wind", "maintain balance"], "correct_choice_idx": 3, "direct_answers": ["surfing", "balance", "balancing", "surfing", "for balance", "surfing", "maintain balance", "balance", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["When one is trying to maintain their balance on a board like this they would get in the stance depicted.", "He's keeping balance.", "The man doesn't want to fall."], "image": "train2014/COCO_train2014_000000573762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196415, "question_id": "eeczTBHygMFaWyjXdwifYN", "question": "What type of building is shown here?", "choices": ["mall", "changing room", "car lot", "museum"], "correct_choice_idx": 3, "direct_answers": ["museum", "art studio", "museum", "museum", "museum", "museum", "museum", "museum", "museum", "showroom"], "difficult_direct_answer": false, "rationales": ["The building is a museum that has an old classic motorbike.", "The items shown are artefacts.", "The building is a museum."], "image": "val2014/COCO_val2014_000000196415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355410, "question_id": "efJsn9sFwUm5wGYyPJ4yHk", "question": "Which thing is most obviously discolored?", "choices": ["nearby tree", "clouds", "water", "guardrail"], "correct_choice_idx": 1, "direct_answers": ["clouds", "items", "sky", "clouds", "sky", "clouds", "clouds", "sky", "sky", "sky"], "difficult_direct_answer": false, "rationales": ["The clouds are discolored as they're purple.", "The sky is usually a shade of blue and white, not those rainbow colors.", "The clouds are obviously discolored, and are green or purple."], "image": "val2014/COCO_val2014_000000355410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460373, "question_id": "efPZSyTXUbD3SLcBWUyxDP", "question": "What has caused the black bars in the photo?", "choices": ["phone holder", "window frame", "stand", "easel"], "correct_choice_idx": 1, "direct_answers": ["window frame", "windows", "window panes", "window frame", "window frame", "windows", "window", "window frame", "windowpane", "window"], "difficult_direct_answer": false, "rationales": ["The bars are from the window.", "The photographer is standing inside the airport and is looking through multiple glass panes taking a picture. the picture isn't centered in one pane so the separations between the panes is visible.", "At the airport, customers have to be inside the airport and can then board the plane. there are planes out there on the runway. you have to be inside to be safe from those big objects."], "image": "val2014/COCO_val2014_000000460373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152333, "question_id": "efPkN4zZ4kJsQ2imCnhp38", "question": "What type of parking is shown?", "choices": ["valet", "lot", "parallel", "diagonal"], "correct_choice_idx": 2, "direct_answers": ["street", "parallel", "parallel", "street", "street", "parallel", "parallel", "parallel", "parallel", "parallel"], "difficult_direct_answer": false, "rationales": ["There is parking on the street where there are rows of cars lined up.", "When cars park one behind another.", "The people are parked end to end next to the sidewalk."], "image": "val2014/COCO_val2014_000000152333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65643, "question_id": "efQ9Qi2PvV5z7LhWqcp3Si", "question": "What is the name of the three wheeled vehicle in the middle of the picture?", "choices": ["rickshaw", "scooter", "tuk tuk", "cushman"], "correct_choice_idx": 2, "direct_answers": ["tuktuk", "tricycle", "rickshaw", "cart", "trike", "cart", "cart", "tuk tuk", "tuk tuk", "put put"], "difficult_direct_answer": false, "rationales": ["A car with three wheels and a canopy is driving in the street.", "There is a three wheeled tuk tuk in the center of the picture.", "That is the name of the truck."], "image": "train2014/COCO_train2014_000000065643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96746, "question_id": "efQBfd5WR5iYLxQiLvJiAS", "question": "What's the lady wearing on her head?", "choices": ["hijab", "glasses", "nothing", "cap"], "correct_choice_idx": 3, "direct_answers": ["hat", "cap", "hat", "hat", "hat", "hat", "hat", "hat", "hat", "hat"], "difficult_direct_answer": false, "rationales": ["A cap is covering her hair.", "The woman's head is covered. her headwear has no religious significance.", "The woman has a hat."], "image": "train2014/COCO_train2014_000000096746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33073, "question_id": "efTqPQ4Ed44pyYycLCTGJi", "question": "What is allowed to be carried in this bus?", "choices": ["animals", "big luggage", "bicycles", "explosives"], "correct_choice_idx": 2, "direct_answers": ["bicycles", "people", "passengers", "passengers", "backpack", "passengers", "bicycles", "bike", "passangers", "bikes"], "difficult_direct_answer": false, "rationales": ["There is a bike rack on the front of the bus.", "There is a rack on the front of the bus to transport them.", "Due to the sign on the front of the bus, it tells you what extra items are allowed."], "image": "val2014/COCO_val2014_000000033073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345434, "question_id": "efjRcsih2fEZMcdcr4WKQz", "question": "How many species of animals besides humans are visible?", "choices": ["two", "three", "one", "seven"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "two", "three", "three", "three", "three", "three", "two"], "difficult_direct_answer": false, "rationales": ["There are three species because you see dogs, cats and ducks", "There are cats, bird and dogs visible which would be three additional animal species to the visible humans.", "There are 3."], "image": "val2014/COCO_val2014_000000345434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365865, "question_id": "egT4K5CXhDYeNoXAPdbSzc", "question": "What natural disaster are those houses likely safe from?", "choices": ["win storm", "dust storm", "flooding", "tornado"], "correct_choice_idx": 2, "direct_answers": ["fire", "tsunami", "tornado flood", "tsunami", "tsunami", "flooding", "flood", "flood", "flooding", "tidal wave"], "difficult_direct_answer": false, "rationales": ["The house are very high up on a cliff.", "They are on the top of the hill.", "They are up on the hill so the water can not reach them."], "image": "train2014/COCO_train2014_000000365865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513482, "question_id": "egwjngDHqDp6UcfSWBGFAU", "question": "What relationship do the persons sharing the umbrella have?", "choices": ["strangers", "enemies", "intimate", "newly met"], "correct_choice_idx": 2, "direct_answers": ["lovers", "partners", "dating", "partners", "intimate", "loving", "couple", "lovers", "couple", "romantic"], "difficult_direct_answer": false, "rationales": ["The people are standing close together. one person has their arm around the other.", "One has their arm around the other", "The persons are sharing an umbrella and leaning towards and grasping each other. people who hold each other in this way are likely to be intimate with each other."], "image": "train2014/COCO_train2014_000000513482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366897, "question_id": "eh3VMpQbtchANrNTtLAH9N", "question": "Where are these people engaging in this interaction?", "choices": ["library", "school", "workplace", "party"], "correct_choice_idx": 2, "direct_answers": ["work", "office", "office", "game", "office", "office", "office", "in office", "workplace", "playing game"], "difficult_direct_answer": false, "rationales": ["These people are in an office.", "Four people are standing in a conference room with wii controllers in their hands.", "There is a desk in the room and cubicles in the other room"], "image": "train2014/COCO_train2014_000000366897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446053, "question_id": "eh7rt6kPJRUgeV4e5AHJ8L", "question": "Why is the man in red facing a camera?", "choices": ["for interview", "interrogation", "for movie", "confession"], "correct_choice_idx": 0, "direct_answers": ["being interviewed", "interview", "interview", "interview", "interview", "doing interviews", "for interview", "reporting", "being interviewed", "taping video"], "difficult_direct_answer": false, "rationales": ["The man is talking to a news team.", "The man is being filmed and is likely being interviewed.", "He is being interviewed."], "image": "val2014/COCO_val2014_000000446053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320461, "question_id": "ehPRjFLVhdsWYsHKcKy6vm", "question": "He is performing a what?", "choices": ["ploy", "dupe", "trick", "foible"], "correct_choice_idx": 2, "direct_answers": ["jump", "trick", "trick", "skateboard trick", "trick", "trick", "trick", "slide", "skating stunt", "skateboard trick"], "difficult_direct_answer": false, "rationales": ["The man is riding a skateboard and is interacting with a ramp with only one foot on the board.", "He's trying to show off a new trick.", "When riding a skateboard on a ramp, the purpose of it is to perform tricks."], "image": "val2014/COCO_val2014_000000320461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117871, "question_id": "ehaZcUHPRggQpG3FJkRDKp", "question": "Who is standing at a higher level on the rock?", "choices": ["blue shirt", "brown shirt", "purple shirt", "white frisbee"], "correct_choice_idx": 1, "direct_answers": ["brown tshirt", "red shirt", "young man", "younger man", "brown shirt", "maroon shirt", "leftmost person", "left", "older man", "man"], "difficult_direct_answer": true, "rationales": ["The person in brown is the highest.", "The brown shirt is higher.", "There are three people in the photo with the person at the top being a man wearing a brown shirt."], "image": "train2014/COCO_train2014_000000117871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443336, "question_id": "ehi7k4dZcAK8NfVQuVGwc8", "question": "What period of the day is it?", "choices": ["night", "morning", "afternoon", "evening"], "correct_choice_idx": 2, "direct_answers": ["daytime", "afternoon", "morning", "afternoon", "morning", "afternoon", "morning", "afternoon", "afternoon", "morning"], "difficult_direct_answer": false, "rationales": ["It's in the afternoon.", "There is light coming from the ceiling but also shadows.", "The sun is overhead."], "image": "train2014/COCO_train2014_000000443336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82740, "question_id": "ei98cP355HNBXFJ8cBdRy8", "question": "Over what is the horse jumping?", "choices": ["zebra", "hurdle", "trainer", "jockey"], "correct_choice_idx": 1, "direct_answers": ["bars", "obstacle", "hurdle", "poles", "fence", "hurdle", "striped pole", "obstacles", "fence", "hurdle"], "difficult_direct_answer": false, "rationales": ["He is jumping an oxer decorated with zebras.", "A jockey jumps with a horse at a competition.", "There are horizontal poles set up as an oxer."], "image": "val2014/COCO_val2014_000000082740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153822, "question_id": "eiYPnwGywo3KwBNPrYNPHu", "question": "Why is the racquet blurred?", "choices": ["rapid motion", "falling apart", "dropping it", "out focus"], "correct_choice_idx": 0, "direct_answers": ["rapid motion", "moving fast", "motion", "swinging", "motion", "in motion", "speed", "motion", "motion", "moving fast"], "difficult_direct_answer": false, "rationales": ["The racquet is fast.", "When there is movement in a picture, it might come out blurry. the woman is swinging the racket in the picture.", "The player is hitting the tennis ball so the camera can't take a still picture."], "image": "val2014/COCO_val2014_000000153822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289326, "question_id": "einbs9TXwrV8xPWVovh6wm", "question": "What person worked for the company whose name appears after the words I Am?", "choices": ["missy hyatt", "michael jordan", "carson daly", "jim duggan"], "correct_choice_idx": 2, "direct_answers": ["kevin", "carson daly", "krob", "krob", "krob", "krob", "krob", "kevin", "krob", "kevin"], "difficult_direct_answer": false, "rationales": ["Carson daly does.", "Cason daly used to be a dj.", "Carson daly worked for kroq."], "image": "train2014/COCO_train2014_000000289326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88360, "question_id": "eipiqEVzzwTusBoYjQS7JG", "question": "What would be hardest to hit with the frisbee from here?", "choices": ["birds", "houses", "grass", "trees"], "correct_choice_idx": 1, "direct_answers": ["birds", "houses", "goal", "cat", "houses", "target", "house", "birds", "birds", "another person"], "difficult_direct_answer": false, "rationales": ["They are very far away and the disk is too light to travel that far", "The buildings are far away in the background.", "The houses are very far away and very unlikely to be hit from here."], "image": "val2014/COCO_val2014_000000088360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353813, "question_id": "ejbpu9nhDd4U5ywumjsTv4", "question": "What relation does the clothes here have?", "choices": ["opposites", "uniforms", "all unrelated", "summer clothes"], "correct_choice_idx": 1, "direct_answers": ["uniform", "color", "matching gear", "all white", "white", "uniforms", "uniform", "uniform", "white", "all matching"], "difficult_direct_answer": false, "rationales": ["The relation is the uniform.", "All of the clothes look the same. each person wearing the clothes is doing the same thing.", "This looks to be a group or team of people wearing the same thing."], "image": "train2014/COCO_train2014_000000353813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580843, "question_id": "ejdKW2iRdPTsNufXmeoGoP", "question": "What is the woman using the brown object for?", "choices": ["exercising", "traveling", "painting", "resting"], "correct_choice_idx": 1, "direct_answers": ["carrying", "block seat", "luggage", "foot rest", "travel", "suitcase", "luggage", "traveling", "travel", "foot rest"], "difficult_direct_answer": false, "rationales": ["A woman has a brown leather suitcase by her feet. the woman is sitting on a bus.", "It is a suitcase", "The brown item is a suitcase."], "image": "train2014/COCO_train2014_000000580843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70033, "question_id": "ejhmXUbWhM33mBAfLyoXDF", "question": "What type of television series is the cat watching?", "choices": ["reality", "romance", "comedy", "science fiction"], "correct_choice_idx": 0, "direct_answers": ["soap opera", "come dine", "lcd", "reality", "cooking", "cooking", "flat screen", "reality tv", "commercial", "drama"], "difficult_direct_answer": true, "rationales": ["The cat is watching a tv screen with the words \"come dine with me\" broadcast onto the tv screen against a black background.", "The cat is watching a reality show about cooking.", "The text on the screen is come dine with me. this is not a comedy, science fiction, or romance series."], "image": "train2014/COCO_train2014_000000070033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347695, "question_id": "ejuvLEoLj6mACYXZqXrrTi", "question": "Why does he have a pained look on his face?", "choices": ["is angry", "is tired", "is injured", "is sad"], "correct_choice_idx": 2, "direct_answers": ["pin stick", "pin", "embarrassed", "scared", "funny", "embarrassed", "tight", "got stab", "getting choked", "is injured"], "difficult_direct_answer": true, "rationales": ["The man seemed totally injured.", "She's tightening his tie", "This woman looks to be adjusting his tie or maybe adding a decoration that requires a pin this type of facial expression is known for being one of pain which may have been infected in the process."], "image": "val2014/COCO_val2014_000000347695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3967, "question_id": "ejwYtpVcAvbxPgvPC7RBUE", "question": "What is she doing with her phone?", "choices": ["calling home", "taking pictures", "texting", "watching movie"], "correct_choice_idx": 1, "direct_answers": ["taking picture", "taking picture", "taking pictures", "filming", "taking photo", "taking picture", "snapping pictures", "photographing", "picture", "photographing monkey"], "difficult_direct_answer": false, "rationales": ["She is pointing it at the performing monkey.", "The woman wants a photo of the monkey.", "The woman wants to snap photos."], "image": "train2014/COCO_train2014_000000003967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213125, "question_id": "ek5qmWdHf8jDVBgBhU5yW7", "question": "What is the guy doing with the device in his hand?", "choices": ["talking", "balancing", "filming", "researching"], "correct_choice_idx": 2, "direct_answers": ["texting", "recording", "filming", "taking photo", "filming", "filming", "recording", "recording", "protecting hands", "recording"], "difficult_direct_answer": false, "rationales": ["The skateboarder without a helmet holds in his hands a camcorder which he is also has his attention focused on.", "Two skateboarders are in the street riding skateboards and the one in the back is holding a camera up towards the skater in front of him.", "He's capturing what the other skater is doing"], "image": "train2014/COCO_train2014_000000213125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296599, "question_id": "ekZGxsZtxKncUXRQNkYRPG", "question": "Which word best describes this train station?", "choices": ["new", "alive", "decrepit", "bustling"], "correct_choice_idx": 2, "direct_answers": ["decrepit", "dilapidated", "decrepit", "train", "old", "abandoned", "dilapidated", "dirty", "abandoned", "desolate"], "difficult_direct_answer": false, "rationales": ["The station is mostly empty and does not seem to be in good shape.", "The train station is deserted.", "The look of the tracks and the walking area indicate this is a train station that could surely be modernized."], "image": "train2014/COCO_train2014_000000296599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515367, "question_id": "ekbfKYBghkyMvcSdSe2n9J", "question": "Where was the frisbee invented?", "choices": ["america", "greece", "china", "rome"], "correct_choice_idx": 0, "direct_answers": ["bridgeport", "connecticut", "connecticut", "connecticut", "unknown", "connecticut", "italy", "america", "bridgeport connecticut", "bridgeport connecticut"], "difficult_direct_answer": false, "rationales": ["The frisbee was invented in the south.", "The frisbee is a more recent sport which started in the us.", "The frisbee was invented within the states."], "image": "val2014/COCO_val2014_000000515367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294182, "question_id": "ekkLLLv48WbmSs3bVz9DvU", "question": "What is in the sand?", "choices": ["seagulls", "footprints", "hammocks", "surfers"], "correct_choice_idx": 1, "direct_answers": ["people", "footprints", "red debris", "people", "people", "stone particles", "shells", "footprints", "shells", "rocks"], "difficult_direct_answer": false, "rationales": ["There are divots of people that have walked on the sand. their feet has gone over the sand.", "These are indentations from people walking", "There are imprints or sunken spots in the sand where people have been walking on the beach."], "image": "val2014/COCO_val2014_000000294182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459339, "question_id": "emAmCDJrZooqNx2myffABt", "question": "What is the person in the middle struggling with?", "choices": ["zipper", "waves", "fire", "baby"], "correct_choice_idx": 1, "direct_answers": ["wave", "surfboard", "wave", "surf", "surfboard", "surfing", "waves", "wave", "surfing", "breathing"], "difficult_direct_answer": false, "rationales": ["He has a surfboard. he is swimming in the ocean and the water is splashing him in the face.", "The person has fallen off her surf board. water is splashing over her head.", "There is water splashing up as it hits a shallow area"], "image": "train2014/COCO_train2014_000000459339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541687, "question_id": "emJBBUJzHg9x4QuK6xmxWH", "question": "Why are the women wearing lanyards?", "choices": ["showing id", "cosplay", "halloween", "style"], "correct_choice_idx": 0, "direct_answers": ["showing id", "identification", "work", "identification", "business conference", "for identification", "business conference", "hold nametags", "badge holders", "business conference"], "difficult_direct_answer": false, "rationales": ["The women are all wearing lanyards with their id on them.", "The material used is connected to an identification tag that is used for access purposes.", "The id is located inside of the lanyards."], "image": "val2014/COCO_val2014_000000541687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418684, "question_id": "enG74YouYpoDaGvzvdgfYX", "question": "What did he recently have to eat?", "choices": ["pizza", "cereal", "cake", "chinese"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "pizza", "food", "pizza", "nothing", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["There is an empty pizza box near the person.", "There is an open box on the table", "The empty food box is still on the table."], "image": "val2014/COCO_val2014_000000418684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515457, "question_id": "enHG4Dogu6PVTC6kSU2Mtr", "question": "This scene likely takes place at what time?", "choices": ["4pm", "1pm", "10pm", "2pm"], "correct_choice_idx": 2, "direct_answers": ["night", "party scene", "night", "night", "10pm", "10pm", "night", "night", "night", "night"], "difficult_direct_answer": false, "rationales": ["It's in the evening.", "This is most likely a scene of 10 pm.", "It's clearly night time so the answer is obvious."], "image": "train2014/COCO_train2014_000000515457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233348, "question_id": "enHLaDtKuVF5ZcAwnmRckd", "question": "How many players are in baseball?", "choices": ["nine", "16", "11", "14"], "correct_choice_idx": 0, "direct_answers": ["nine", "ten", "nine", "nine", "nine", "three", "nine", "nine", "two", "nine"], "difficult_direct_answer": false, "rationales": ["Each team has 9 players at a given time.", "There are nine players per team in baseball.", "There are 9."], "image": "val2014/COCO_val2014_000000233348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95692, "question_id": "enQ3J2Qyw66n2TjwhzYsPK", "question": "What material is the umbrella made of?", "choices": ["wood", "polyester", "nylon", "metal"], "correct_choice_idx": 3, "direct_answers": ["plastic", "sunbrella", "rubber", "nylon", "plastic", "cloth", "metal", "plastic", "plastic", "nylon"], "difficult_direct_answer": false, "rationales": ["The umbrella looks really stiff.", "Normally made of some sort of cloth, this umbrella is made from metal which you can tell by how it looks.", "The umbrella is made of a metal pole."], "image": "val2014/COCO_val2014_000000095692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493880, "question_id": "enWqAwqxQSaWvAMLEALSRi", "question": "Who is allowed to park by this fire hydrant?", "choices": ["fire truck", "anyone", "commuter", "neighbors"], "correct_choice_idx": 0, "direct_answers": ["fire department", "fire department", "firetruck", "firemen", "pluto", "firemen", "no one", "fire truck", "pluto", "fire truck"], "difficult_direct_answer": false, "rationales": ["A fire hydrant is at the corner. fireman are allowed to use fire hydrants.", "The hydrant could be used by first responders. it would be unacceptable for anyone else to park by it.", "It is illegal to park by a fire hydrant in case it needs to be used in emergencies, so only the fire truck is allowed to park by it."], "image": "train2014/COCO_train2014_000000493880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208067, "question_id": "endWv5j5CGmw8JC899ersK", "question": "What natural event seems to have occurred here?", "choices": ["hurricane", "thunder", "snow", "tornado"], "correct_choice_idx": 2, "direct_answers": ["snowing", "snow", "storm", "snow", "snow", "snow", "snowstorm", "snow", "flooding", "snowfall"], "difficult_direct_answer": false, "rationales": ["The ground is partially covered by ice and a related substance that falls during winter.", "There is frozen precipitate on the ground, some of which is melting.", "There are still piles of white stuff on the curb and puddles around."], "image": "train2014/COCO_train2014_000000208067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67974, "question_id": "eo9sD87qNeHBcA2jYaZhom", "question": "Why is the man seated at the red table?", "choices": ["to eat", "to cook", "to read", "to work"], "correct_choice_idx": 0, "direct_answers": ["to eat", "picnic", "eating lunch", "to eat", "to eat", "eating meal", "to eat", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The man has a tray of food in front of him which would be consistent with someone sitting down for answer a.", "The man is eating.", "He has food in front of him and its a lot easier to eat food while sitting."], "image": "train2014/COCO_train2014_000000067974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304069, "question_id": "eoEakDvKFjnQNm4T9UGBVx", "question": "Who is the road for?", "choices": ["drivers", "bicycles", "pedestrians", "trucks"], "correct_choice_idx": 0, "direct_answers": ["autos", "drivers", "vehicles", "vehicles", "motor vehicles", "cars", "drivers", "cars", "vehicles", "travel"], "difficult_direct_answer": false, "rationales": ["Roads are for people to take their cars on.", "The road can be used by vehicles to drive on.", "There are cars on the road."], "image": "val2014/COCO_val2014_000000304069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431012, "question_id": "eoEnt3sWbXBEmujCmk34A4", "question": "Why do the racers have names all over their bikes?", "choices": ["they're sponsored", "fan support", "looks cool", "mandatory"], "correct_choice_idx": 0, "direct_answers": ["they're sponsored", "sponsors", "sponsors", "competition", "advertising sponsors", "advertisements", "advertising", "sponsors", "advertisement", "advertising"], "difficult_direct_answer": false, "rationales": ["The racers have the names of companies along the visible sides of their bikes. it is common practice in racing to have sponsors for the riders appear on the vehicles.", "The names are the racers' sponsors.", "Companies pay money to have their names on the bikes so that spectators can see them."], "image": "train2014/COCO_train2014_000000431012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19185, "question_id": "eocCFjoJNaQApRzLgykNsd", "question": "How many hours can cars remain parked at this location before the meter expires?", "choices": ["three", "one", "two", "twelve"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "four hours", "4 hours", "two", "four hours", "two", "two"], "difficult_direct_answer": false, "rationales": ["They can remain in the same space for 120 minutes before being ticketed.", "The grey decal on the parking meter indicates the time limit in hours.", "The writing on the side of the meter indicates that there is a 2 hour time limit."], "image": "train2014/COCO_train2014_000000019185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369482, "question_id": "eonCjr3iSR2MGsYUN7Gyay", "question": "What vehicle makes frequent crosses at this intersection?", "choices": ["train", "plane", "motorcycle", "trolley"], "correct_choice_idx": 0, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["Trains cross at intersections like the one shown.", "Trains cross.", "There is a railway crossing sign at the right side."], "image": "val2014/COCO_val2014_000000369482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554815, "question_id": "eozsfTVdRAHYyWswTFSts5", "question": "What are the people waiting to do?", "choices": ["eat", "work", "speak", "cross"], "correct_choice_idx": 3, "direct_answers": ["get transportation", "cross", "cross street", "cross street", "cross street", "cross street", "cross", "ride bus", "cross street", "board bus"], "difficult_direct_answer": false, "rationales": ["The people want to get over to the other side of the street.", "The light is red, so they are waiting for it to turn green which means go.", "They are pedestrians and are standing at the side of the road where there is a crosswalk."], "image": "train2014/COCO_train2014_000000554815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40681, "question_id": "ep24aTPizXkfyguPJPDYNi", "question": "What surrounds these people?", "choices": ["sand", "mud", "ocean", "mountains"], "correct_choice_idx": 3, "direct_answers": ["trees", "mountains", "snow", "snow trees", "trees", "trees", "snow", "trees", "trees", "snow"], "difficult_direct_answer": false, "rationales": ["Snow and mountains surround the skiers.", "Mountains surround the people since people ski down mountains.", "They appear to be on an elevate surface, and a ski lift, which suggests it is lifting people higher into the mountains."], "image": "val2014/COCO_val2014_000000040681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367558, "question_id": "epcEpCwYGWssBYWCRRjzbG", "question": "What is on the tray?", "choices": ["cookie", "hand", "cake", "avocado"], "correct_choice_idx": 3, "direct_answers": ["avocado", "avocado", "pizza", "avocado", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["The green slices that become white towards the center identify the toppings that aren't peppers or olives on this pizza as avocado.", "There are slices of soft green food", "A dish is served with slices of green vegetable on top."], "image": "val2014/COCO_val2014_000000367558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200296, "question_id": "epfppnhbCGQ9QX8cRMEog2", "question": "What conveyance do the skiers ride upon?", "choices": ["snow mobile", "car", "wind sail", "bus"], "correct_choice_idx": 0, "direct_answers": ["snowmobile", "snow mobile", "snowmobile", "snowmobile", "skimobile", "snow mobile", "snowmobile", "snowmobile", "motorcycle", "snowmobile"], "difficult_direct_answer": false, "rationales": ["The ground is covered in snow and this has a shield in front of the driver", "All the other options are completely unrealistic. you will never see a car or a bus in a scenario like this and \"wind sail\" is completely inappropriate.", "The conveyance is the snowmobile."], "image": "val2014/COCO_val2014_000000200296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420634, "question_id": "epyEMYSJ9yYFmegNLSbhXi", "question": "What state is the concrete in the brown thing in?", "choices": ["solidified", "gas", "powder", "wet"], "correct_choice_idx": 3, "direct_answers": ["wet", "new jersey", "illinois", "japan", "massachusetts", "wet", "liquid", "germany", "liquid", "liquid"], "difficult_direct_answer": false, "rationales": ["The state is wet.", "The concrete mixer present in this image is for holding the concrete while it is still a pourable goop; before it is poured onto something and dries and becomes solid.", "The brown mixer truck is shining in such a way that it is most likely to be wet."], "image": "train2014/COCO_train2014_000000420634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565716, "question_id": "eqHw3JoR9zw2NquGgSV7MG", "question": "Why has the bus stopped by the sidewalk?", "choices": ["broke down", "delivering package", "getting passengers", "refueling"], "correct_choice_idx": 2, "direct_answers": ["loading", "drop off", "release passengers", "carrying passenger", "bus stop", "accept passengers", "board passengers", "getting passengers", "pick up", "pickup passengers"], "difficult_direct_answer": true, "rationales": ["The bus is pulled over hear a crosswalk and a large building.", "Buses stop to pick up riders.", "The bus has to go up to the curb in order to let the passengers on the bus"], "image": "train2014/COCO_train2014_000000565716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309405, "question_id": "eqWTSpx4DfsRaUUsDPTDzw", "question": "This style of furniture was popular in what era?", "choices": ["12th century", "20th century", "19th century", "18th century"], "correct_choice_idx": 1, "direct_answers": ["60s", "seventies", "seventies", "1960s", "seventies", "20th century", "modern", "seventies", "1960's", "1950s"], "difficult_direct_answer": false, "rationales": ["This style was popular in the 1950s and known as mid-century modern.", "The furniture was popular in modern eras.", "The furniture is modern."], "image": "val2014/COCO_val2014_000000309405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475798, "question_id": "eqswbjxrBDx6ZrSYcpAC7b", "question": "What time is it on the image?", "choices": ["morning", "afternoon", "night", "noon"], "correct_choice_idx": 2, "direct_answers": ["day", "evening", "night", "night", "night", "night", "dusk", "night", "night", "night"], "difficult_direct_answer": false, "rationales": ["A plane is on a runway with dark skies behind it. skies are dark at night.", "It is not night because there is light, but there is not enough light for it to be morning or noon.", "The sun appears to have set as it cannot be seen in the sky, and there are lights that can be seen to brighten the sky."], "image": "val2014/COCO_val2014_000000475798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366484, "question_id": "erFdGyHRsB5zaKzgeqi5DT", "question": "What sport does he participate in?", "choices": ["tennis", "softball", "skateboarding", "surfing"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "skateboarding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["He is holding a board with wheels in his hand.", "He is holding one.", "The man is holding a skateboard in his hand."], "image": "val2014/COCO_val2014_000000366484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276596, "question_id": "erG6NMDSwkdHjL99CdufdR", "question": "What visual safety feature use to make sure enough sees that they are coming?", "choices": ["high beams", "alarm lights", "stop lights", "headlights"], "correct_choice_idx": 3, "direct_answers": ["headlights", "bright headlights", "headlights", "front light", "windshield", "lights", "high beams", "flashing lights", "lights", "lights"], "difficult_direct_answer": false, "rationales": ["The feature is headlights.", "The headlights are extremely bright.", "The train uses headlights to see."], "image": "val2014/COCO_val2014_000000276596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60992, "question_id": "erJMRnNoXxLcBsZtkFr8A9", "question": "What does the company make whose name appears on the right side of the wall?", "choices": ["witcher potions", "crafts", "baseball bats", "eyeglasses"], "correct_choice_idx": 3, "direct_answers": ["eyeglasses", "visilab", "eyewear", "glasses", "glasses", "contacts", "eyewear", "glasses", "unknown", "glasses"], "difficult_direct_answer": false, "rationales": ["Visilab is an eyeglass company.", "Visilab makes glasses.", "The company makes glasses."], "image": "val2014/COCO_val2014_000000060992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372130, "question_id": "erchdqoLQm2cBNCCyDUvfC", "question": "What are these people waiting for?", "choices": ["food", "doctor", "check in", "concert"], "correct_choice_idx": 2, "direct_answers": ["luggage", "airplane", "boarding airplane", "airplane", "plane", "luggage", "boarding", "airplane", "luggage", "check in"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this setting. that said, some of them might also be in line to get c in a food court.", "The people are trying to check in their luggage.", "They are waiting in line to check in with their airlines."], "image": "train2014/COCO_train2014_000000372130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453824, "question_id": "erwmFfbkHshMMeeDipfCg9", "question": "Where is the person taking the picture?", "choices": ["behind camera", "on wall", "on tv", "behind chair"], "correct_choice_idx": 0, "direct_answers": ["behind", "living room", "living room", "living room", "living room", "living room", "indoor", "family room", "behind camera", "living room"], "difficult_direct_answer": false, "rationales": ["You cannot see anyone in the photo so they must be behind the camera.", "The person is not in the picture. they have taken the picture.", "They are behind the camera."], "image": "val2014/COCO_val2014_000000453824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127945, "question_id": "esPLrtbL7HNrFDZuhNDmnv", "question": "The number on the train is a zip code in what state?", "choices": ["indiana", "washington", "kansas", "new jersey"], "correct_choice_idx": 2, "direct_answers": ["kansas", "kansas", "kansas", "kansas", "kansas", "kansas", "kansas", "kansas", "kansas", "kansas"], "difficult_direct_answer": false, "rationales": ["The number is from kansas.", "The number is the zip code of that state.", "67026 is a zip code located in kansas."], "image": "train2014/COCO_train2014_000000127945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281301, "question_id": "esYBpei56yZurpWTpbZoRZ", "question": "What is the most probable reason there is a dog here?", "choices": ["help blind", "sniff bombs", "watch house", "heard animals"], "correct_choice_idx": 3, "direct_answers": ["corral sheep", "herding", "help grazing", "shepard sheep", "control herd", "sheep herder", "heard sheep", "sheep herding", "herding sheep", "heard animals"], "difficult_direct_answer": true, "rationales": ["This is probably a shepherd dog and is assisting the keeper to tend to the sheep.", "There are a lot of animals. the dog can round them up.", "A dog can herd sheep."], "image": "train2014/COCO_train2014_000000281301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392717, "question_id": "esYnuBswNhks9N5tuD5zxs", "question": "What crime is potentially about to be committed?", "choices": ["theft", "intoxication", "jay walking", "murder"], "correct_choice_idx": 2, "direct_answers": ["bear thievery", "jay walking", "crossing road", "jaywalking", "jaywalking", "jaywalking", "jay walking", "unknown", "jaywalking", "carjacking"], "difficult_direct_answer": false, "rationales": ["The crime is jaywalking.", "People here are about to jaywalk across the street.", "People are supposed to cross the street in a painted cross-walk."], "image": "val2014/COCO_val2014_000000392717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83873, "question_id": "essqVtKcrpfK8VbvDAQHPt", "question": "What material is the round orange and white object made from which this woman is holding?", "choices": ["cotton", "pleather", "polyester", "vinyl"], "correct_choice_idx": 2, "direct_answers": ["polyester", "nylon", "nylon", "nylon", "nylon", "polyester", "nylon", "nylon", "polyester", "polyester"], "difficult_direct_answer": false, "rationales": ["This is a waterproof material", "Umbrellas are made of polyester.", "The umbrella is made of polyester."], "image": "train2014/COCO_train2014_000000083873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396350, "question_id": "et7jee7VJsHabx9XgBz8ue", "question": "What's the name of the area the asian man is near?", "choices": ["cross walk", "terminal b", "terminal", "pickup zone"], "correct_choice_idx": 0, "direct_answers": ["bus station", "crosswalk", "crosswalk", "bus station", "bus stop", "zebra stripes", "crosswalk", "cross walk", "crosswalk", "crosswalk"], "difficult_direct_answer": false, "rationales": ["The white lines are a crosswalk.", "This is indicated by the white stripes.", "The man is located near the lines that are on the street."], "image": "val2014/COCO_val2014_000000396350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543608, "question_id": "et9vBxcnnzpjnjnANPp9ut", "question": "What is being advertised on the board?", "choices": ["vodka", "beer", "wine", "gin"], "correct_choice_idx": 1, "direct_answers": ["canada", "wildlife", "nothing", "wildlife", "not clear", "canada", "beer", "elephant", "elephant", "entertainment event"], "difficult_direct_answer": false, "rationales": ["The beer is advertised.", "The colors are the brand of beer.", "The sign is advertising molson canadian."], "image": "train2014/COCO_train2014_000000543608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221743, "question_id": "etFCt8QsCgux9UYJg6ReiW", "question": "Which vegetable is the most nutritious?", "choices": ["spinach", "broccoli", "lettuce", "green onion"], "correct_choice_idx": 0, "direct_answers": ["broccoli", "broccoli", "kale", "onion", "lettuce", "spinach", "broccoli", "broccoli", "kale", "broccoli"], "difficult_direct_answer": false, "rationales": ["Spinach and other greens are on a counter.", "Spinach is packed with nutrition.", "Spinach has a lot of vitamins."], "image": "train2014/COCO_train2014_000000221743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197851, "question_id": "etGqGWrgtrPhFv4kepYy2D", "question": "The D word here refers to what?", "choices": ["route", "brand", "location", "fuel"], "correct_choice_idx": 3, "direct_answers": ["diesel", "diesel", "diesel", "diesel gas", "gasoline type", "car detailing", "diesel gas", "diesel", "fuel", "diesel"], "difficult_direct_answer": false, "rationales": ["The d word is fuel.", "The d word is diesel. diesel is a type of this.", "Diesel refers to a type of fuel."], "image": "train2014/COCO_train2014_000000197851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100711, "question_id": "etM5ef4s5qA3iPxsrDenfS", "question": "Which star can the persons living here most readily identify?", "choices": ["jean harlow", "natalie wood", "james franco", "judy garland"], "correct_choice_idx": 3, "direct_answers": ["judy garland", "north star", "north star", "judy garland", "television", "johnny depp", "judy garland", "north", "sun", "judy garland"], "difficult_direct_answer": false, "rationales": ["There are dvds in the bookshelves. dvds are from the 90s as is this star.", "There is a poster on the left wall for the wizard of oz, a film starring judy garland.", "There is a wizard of oz poster on the right wall. it depicts dorothy from the movie."], "image": "val2014/COCO_val2014_000000100711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288519, "question_id": "etTbNk27CcXYRwzQUnzNYB", "question": "Who is closer to the shore?", "choices": ["boy", "baby", "old man", "girl"], "correct_choice_idx": 0, "direct_answers": ["young man", "boy", "photographer", "boy", "boy", "boy", "boy", "boy", "boy", "photographer"], "difficult_direct_answer": false, "rationales": ["The waves are approach the bottom of the image. waves approach the shore and the person then closer to the bottom is closer to shore.", "A boy is is the whitewash of a wave on his belly on a surfboard while a girl is behind him near a wave before it has broken.", "The boy seems to be right next to the shoreline and the woman behind him is farther into the water."], "image": "train2014/COCO_train2014_000000288519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457555, "question_id": "etXXN6igt6FhYubxSrEMbF", "question": "What does this grey truck transport?", "choices": ["toys", "food", "tires", "drink"], "correct_choice_idx": 2, "direct_answers": ["tires", "tires", "tires", "tires", "tires", "tires", "tires", "tires", "tires", "tires"], "difficult_direct_answer": false, "rationales": ["This truck has a bed full of michelin tires.", "The truck has michelin on the side.", "The truck has tires."], "image": "train2014/COCO_train2014_000000457555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353994, "question_id": "ete2fUBbeiPLna4bNW2idr", "question": "What other surface might this be played on?", "choices": ["tarmac", "sand", "concrete", "grass"], "correct_choice_idx": 3, "direct_answers": ["asphalt", "grass", "clay", "grass", "grass court", "dirt", "dirt", "grass", "concrete", "grass clay"], "difficult_direct_answer": false, "rationales": ["There are regular tennis tournaments played every year on grass including wimbledon.", "There are 3 types of tennis courts", "This game of tennis might also be played on grass."], "image": "train2014/COCO_train2014_000000353994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129520, "question_id": "ett6TPdjAdvgQJggC99rN2", "question": "What's the name of the cart the riders are on?", "choices": ["driver", "sulky", "spare", "wagon"], "correct_choice_idx": 1, "direct_answers": ["sulky", "carriage", "wagon", "buggy", "buggy", "buggy", "rickshaw", "sulky", "chariot", "chariot"], "difficult_direct_answer": false, "rationales": ["By the look of the picture it is the sulky.", "The name is a sulky.", "The cart riders are all traveling on a sulky."], "image": "val2014/COCO_val2014_000000129520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573513, "question_id": "etue8wzibhkupdBoNJNpXC", "question": "Why is the red vehicle on the right stopped at the building?", "choices": ["to race", "refueling", "broke down", "changing tires"], "correct_choice_idx": 1, "direct_answers": ["getting gas", "refueling", "getting gas", "getting gas", "boarding passengers", "jeep", "getting gas", "gas", "boarding", "awaiting passengers"], "difficult_direct_answer": false, "rationales": ["The red vehicle is parked next to a fuel pump at a \"petron\" station, so it is definately gassing up.", "There is a gas station.", "The red vehicle on the right is stopped to refuel at the gas station."], "image": "train2014/COCO_train2014_000000573513.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387216, "question_id": "eu4yafHhB2K4YZha2BxDqB", "question": "What is the train off of?", "choices": ["schedule", "tracks", "highway", "road"], "correct_choice_idx": 1, "direct_answers": ["tracks", "tracks", "tracks", "track", "tracks", "tracks", "rail", "tracks", "track", "tracks"], "difficult_direct_answer": false, "rationales": ["The train is on the rail track.", "A train is in the grass next to tracks.", "A train sits in grass next to tracks."], "image": "val2014/COCO_val2014_000000387216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124279, "question_id": "euGvTireSezChy8SgbBSxJ", "question": "Why are they ignoring each other?", "choices": ["strangers", "distracted notebook", "angry", "fighting"], "correct_choice_idx": 1, "direct_answers": ["distracted notebook", "working", "busy", "working", "working", "working", "lap tops", "working", "working", "concentrating"], "difficult_direct_answer": false, "rationales": ["The men are looking at their laptop screens.", "They are both engrossed in what is happening on their individual screens.", "The two people are working."], "image": "val2014/COCO_val2014_000000124279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181028, "question_id": "eueTMVYBQnbhffjGzrNDkE", "question": "The desk lamp is illuminating what type of object on the door?", "choices": ["doorbell", "deadbolt lock", "hinge", "transom"], "correct_choice_idx": 1, "direct_answers": ["light", "tablet", "lock", "handle", "deadbolt lock", "books", "front door", "window", "door bolt", "lock"], "difficult_direct_answer": true, "rationales": ["The desk lamp is illuminating a deadbolt lock system.", "There is a knob there", "The desk lamp is lighting up the lock nearby."], "image": "train2014/COCO_train2014_000000181028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272033, "question_id": "euueQFUVo2SVbZfhRkiimp", "question": "What is the rectangular area above the train?", "choices": ["art gallery", "pizzeria", "walkway", "library"], "correct_choice_idx": 2, "direct_answers": ["bridge", "bridge", "walkway", "walkway", "walkway", "foot bridge", "walkway", "bridge", "walkway", "foot bridge"], "difficult_direct_answer": false, "rationales": ["A walkway extend across train tracks with trains on the tracks below. elevated walkways are used in some areas.", "There is a walkway by the train.", "In order for people to be able to get from one side of train tracks to the other, they need to go above or below to avoid trains."], "image": "train2014/COCO_train2014_000000272033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535561, "question_id": "ev7HB7TEE7iSJzXV3uwM5L", "question": "What is she giving the man?", "choices": ["water", "drugs", "change", "food"], "correct_choice_idx": 2, "direct_answers": ["paper", "unknown", "flyer", "money", "change", "paper", "money", "money", "unsure", "unknown"], "difficult_direct_answer": false, "rationales": ["She's giving change.", "The woman is handing off some money.", "It's hard to tell what is being exchanged but perhaps by what he is wearing this could be a sale exchange."], "image": "train2014/COCO_train2014_000000535561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129566, "question_id": "evDku7WZ7fmQbPBBMhbrwQ", "question": "What type of event are the items inside the white cabinet hanging from the wall commonly used for?", "choices": ["emergency", "educational", "religious celebrations", "party"], "correct_choice_idx": 0, "direct_answers": ["emergency", "tea party", "welcoming guests", "emergency", "serving food", "drapes", "meal time", "no idea", "tea", "tea time"], "difficult_direct_answer": true, "rationales": ["The items in the cabinet are for fires.", "There is a blue cross on the front of the emergency cabinet.", "For an emergency"], "image": "val2014/COCO_val2014_000000129566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142420, "question_id": "evK2hfsemFmUiqCnqxshve", "question": "How many objects here contain items from the dairy group?", "choices": ["four", "three", "one", "two"], "correct_choice_idx": 1, "direct_answers": ["three", "two", "two", "two", "four", "nine", "two", "two", "two", "one"], "difficult_direct_answer": false, "rationales": ["There are 3.", "The mayo and cheese have dairy.", "There is mayonnaise, cheese and eggs."], "image": "train2014/COCO_train2014_000000142420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485628, "question_id": "evNreDyp3ZgzVMZv62C7ty", "question": "What is another word for the item all the way to the left?", "choices": ["basket", "loo", "barrel", "egg"], "correct_choice_idx": 1, "direct_answers": ["toilet", "toilet", "toilet", "toilet", "toilet", "crapper", "loo", "toilet", "sink", "crapper"], "difficult_direct_answer": false, "rationales": ["The item all the way to the left is a toilet.", "The toilet has a nickname, which is the loo.", "The toilet is sometimes called a loo in various places."], "image": "val2014/COCO_val2014_000000485628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204867, "question_id": "evi9Z9NPvfhvcu9Aah8psx", "question": "What time can be seen on the highest clock?", "choices": ["six thirty", "twelve thirty", "seven thirty", "four thirty"], "correct_choice_idx": 0, "direct_answers": ["six thirty", "five thirty", "six thirty", "six thirty", "five thirty", "530", "six thirty", "630", "six thirty", "530"], "difficult_direct_answer": false, "rationales": ["Both hands of the clock are pointing at the six.", "The time indicates six thirty.", "The small and large hands are pointing down"], "image": "train2014/COCO_train2014_000000204867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449850, "question_id": "evk3XpGkQPW2tJz436SGtR", "question": "The flash from the camera creates a bright light as it reflects off of what?", "choices": ["walls", "mattress", "sun", "mirror"], "correct_choice_idx": 3, "direct_answers": ["mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["The flash reflects from the mirror.", "It's reflecting off a mirror.", "The camera is taking a photo while pointing at an image-reflecting item."], "image": "train2014/COCO_train2014_000000449850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169026, "question_id": "ew5ouGeE9FnecHMgndADUf", "question": "What type of bus is shown?", "choices": ["double-decker", "school", "shuttle", "passenger"], "correct_choice_idx": 3, "direct_answers": ["city bus", "city", "passenger bus", "public", "public", "public transport", "public", "city", "city", "passenger"], "difficult_direct_answer": false, "rationales": ["The bus has a sign of where it is going, and multiple doors for passengers to onboard and offboard.", "The bus has one level. it is not affiliated with a school or business and can be used by members of the public.", "The bus is transporting passengers."], "image": "train2014/COCO_train2014_000000169026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223014, "question_id": "ew9HBfY6ajFbtpq6bUHmN2", "question": "What will the player near the ball do next?", "choices": ["bat", "dribble", "dunk", "swing"], "correct_choice_idx": 3, "direct_answers": ["hit it", "hits", "swing", "swing", "hit it", "whack it", "forehand", "swing", "hit it", "swing"], "difficult_direct_answer": false, "rationales": ["The woman is playing tennis and is holding a racket to which the ball is coming towards the racket and she will most likely swing to hit the ball.", "The player will swing.", "She has her racket back in the ready position. the ball has almost reached her."], "image": "train2014/COCO_train2014_000000223014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386718, "question_id": "ewAFHTEzrVhZrqKMvMYCDf", "question": "What type of industry is being represented?", "choices": ["dairy", "clothing", "gun", "oil"], "correct_choice_idx": 0, "direct_answers": ["yoghurt", "yogurt", "dairy", "farming", "dairy", "agriculture", "yogurt", "dairy", "dairy", "farming"], "difficult_direct_answer": false, "rationales": ["The industry is the dairy one.", "By the type of cow pictured here can tell what type of industry.", "The cow is on a yogurt sign."], "image": "val2014/COCO_val2014_000000386718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268412, "question_id": "ewBP5vaVxzHKvtFzg6avau", "question": "What's the man on the left in brown wearing?", "choices": ["suspenders", "hat", "tie", "jacket"], "correct_choice_idx": 0, "direct_answers": ["lederhosen", "octoberfest gear", "lederhosen", "suspenders", "capris", "lederhosen", "lederhosen", "leiterhosen", "lederhosen", "overalls"], "difficult_direct_answer": false, "rationales": ["A man has straps that extend from his pants over his shoulders.", "There are vertical belts from his pants to the top of his shirt.", "That's is what is attached to his pants."], "image": "val2014/COCO_val2014_000000268412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470722, "question_id": "ex3MM2erb6vqkseXgmcbgc", "question": "What cuisine is featured?", "choices": ["american", "french", "india", "chinese"], "correct_choice_idx": 3, "direct_answers": ["escargot", "chicken", "chicken", "turkey", "chinese", "meat", "turkey", "turkey", "chicken", "turkey"], "difficult_direct_answer": false, "rationales": ["The pictured item is a smaller prepared bird, smaller than a turkey. ducks are smaller than turkeys. ducks are a popular ingredient in chinese food.", "It is a cooked, stuffed turkey. turkey is eaten to celebrate thanksgiving, a unique tradition in the united states.", "Although we can't be sure, the vegetables along with the chicken indicate this is probably chinese food."], "image": "train2014/COCO_train2014_000000470722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198075, "question_id": "exQTXihUYBoeATwg67sP5D", "question": "What part of the image file wasn't physically present?", "choices": ["date", "horses", "watermark", "border"], "correct_choice_idx": 2, "direct_answers": ["watermark", "sun", "watermark", "horses", "letters", "artist name", "water", "text", "ocean", "watermark"], "difficult_direct_answer": false, "rationales": ["Though horses are in the picture the watermark is very obviously the answer.", "The part is a watermark.", "The watermark on the top right was not there when the photo was taken."], "image": "val2014/COCO_val2014_000000198075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24730, "question_id": "exjU9JUiDXcpTfnNTqiAaC", "question": "What is the man in black behind the child walking away with?", "choices": ["jacket", "backpack", "snowboard", "fence"], "correct_choice_idx": 2, "direct_answers": ["snowboard", "snowboard", "snowboard", "leaving", "snowboard", "child", "snowboard", "snowboard", "gear", "snowboard"], "difficult_direct_answer": false, "rationales": ["The man in black is walking on snow at a ski resort and he's holding a snowboard.", "This is obvious given the shape and design.", "The man in black has a snowboard."], "image": "train2014/COCO_train2014_000000024730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179948, "question_id": "eyWhAec4nwHUbyygnNaM3P", "question": "What purpose does the pedestrian signal's symbol represent?", "choices": ["car warning", "go backwards", "stop crossing", "start crossing"], "correct_choice_idx": 3, "direct_answers": ["walk", "walk signal", "crossing", "caution", "go", "walking", "one way", "start crossing", "walk", "safe crossing"], "difficult_direct_answer": true, "rationales": ["A traffic signal with a figure on it is lit.", "The white person is a symbol to let the people know the traffic lights are in a position that is best to cross the road.", "The sign has a white lit up figure."], "image": "val2014/COCO_val2014_000000179948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147546, "question_id": "eygYkkMbvQHjLAiBmeFhrn", "question": "What is the main fruit in the image?", "choices": ["apple", "strawberry", "grape", "banana"], "correct_choice_idx": 3, "direct_answers": ["banana", "bananas", "banana", "banana", "bananas", "bananas", "bananas", "bananas", "bananas", "banana"], "difficult_direct_answer": false, "rationales": ["They are hanging in two rows across the entire scene.", "You can see all the yellow fruits hanging to be sold.", "That's the fruit there is most of."], "image": "val2014/COCO_val2014_000000147546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492608, "question_id": "eykDYTZBii6JXrQe2VLprW", "question": "What kind of apparatus should a child wear in this region?", "choices": ["goggles", "mittens", "life jacket", "shoes"], "correct_choice_idx": 2, "direct_answers": ["swimsuit", "wetsuit", "lifejacket", "life jacket", "life vest", "lifevest", "strap", "swim suit", "helmet", "life jacket"], "difficult_direct_answer": true, "rationales": ["The waves are large.", "A child should wear a life jacket in this region.", "This will keep you afloat"], "image": "val2014/COCO_val2014_000000492608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260453, "question_id": "eynoWv4w7RCfJMuQZf4BVE", "question": "What filling was used for this pastry?", "choices": ["berry", "chocolate", "vanilla", "creme"], "correct_choice_idx": 0, "direct_answers": ["blueberries", "raspberry", "blueberry", "blueberries", "cheese", "blueberries", "sauce", "jelly", "blueberry", "berry"], "difficult_direct_answer": false, "rationales": ["That purple color is likely made by blueberries or blackberries, because they are used often in pies.", "The filling is a deep red color and you can see the seeds.", "A pastry with a red and purple center is shown. berries are red and purple and used in pastries."], "image": "train2014/COCO_train2014_000000260453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445879, "question_id": "eyu2o27bQJd3Xtk9T3cV6D", "question": "What surfing technique is the woman doing?", "choices": ["waving", "skimming", "grinding", "carving"], "correct_choice_idx": 3, "direct_answers": ["surfing", "bending", "grinding", "riding wave", "carving", "surfing", "surfing", "surfing", "hang ten", "surfing"], "difficult_direct_answer": false, "rationales": ["The technique is carving.", "She is surfing on the wave and doing a trick.", "The woman has her board turned sideways."], "image": "train2014/COCO_train2014_000000445879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83117, "question_id": "eyuJpdv9kaP7t59hbQxoJJ", "question": "The year this photo was taken would have to be before what year?", "choices": ["1900", "1960", "1980", "2021"], "correct_choice_idx": 3, "direct_answers": ["forties", "1990", "1990s", "2010", "2020", "color", "long ago", "2021", "1940", "2022"], "difficult_direct_answer": true, "rationales": ["The photo is in black and white.", "Before 2021 it would of had to been taken.", "We had color photos in 2021."], "image": "train2014/COCO_train2014_000000083117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252084, "question_id": "ezYfVh9QqgXzFTtpSUYY9f", "question": "Where do umbrellas originate from?", "choices": ["persia", "morocco", "france", "chinese"], "correct_choice_idx": 3, "direct_answers": ["egypt", "chinese", "india", "mesopotamian", "mesopotamian region", "china", "mesopotamian", "usa", "china", "outside us"], "difficult_direct_answer": false, "rationales": ["They were used to block out sunlight first and then rain.", "The chinese are believed to be the originators of the umbrella at least two thousand years ago.", "Umbrellas are a chinese invention."], "image": "train2014/COCO_train2014_000000252084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416521, "question_id": "eznZDwWsYxYD9SzhxjGXAW", "question": "How do the people know each other?", "choices": ["rivals", "coworkers", "siblings", "neighbors"], "correct_choice_idx": 1, "direct_answers": ["military", "military service", "pilots", "serve together", "same unit", "military ties", "tleivision", "military", "military", "coworkers"], "difficult_direct_answer": false, "rationales": ["The men are all military colleagues.", "The people are colleagues.", "The people all serve in the military."], "image": "train2014/COCO_train2014_000000416521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193594, "question_id": "ezuZ7Lxgc9SZhShzQU8xWf", "question": "Why is he wearing a glove?", "choices": ["warmth", "to catch", "fashion", "safety"], "correct_choice_idx": 3, "direct_answers": ["cooking", "spice", "sanitary", "safety", "mixing food", "food safety", "sanitation", "sanitary", "sanitary", "cooking"], "difficult_direct_answer": false, "rationales": ["He is wearing a glove for food safety.", "He's working with food, so wants to keep it clean.", "It also prevents his hand from getting dirty."], "image": "train2014/COCO_train2014_000000193594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272026, "question_id": "f2bbqo2K3VdJamFMzPyBce", "question": "What is this place?", "choices": ["seafood restaurant", "photographer's studio", "kitchen", "bar"], "correct_choice_idx": 0, "direct_answers": ["seafood restaurant", "restaurant", "bar", "bar", "restaurant", "restaurant", "bar", "bar", "restaurant", "bar"], "difficult_direct_answer": false, "rationales": ["There are seafood names on the board.", "There is a sign in the background that shows the menu. many of the ingredients come from oceans.", "There are seafood items on the menu."], "image": "train2014/COCO_train2014_000000272026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84171, "question_id": "f2knWL5wVYKStKdm4Qg7Ma", "question": "Which device is used to attenuate the airborne noise of the engine?", "choices": ["none", "silencer", "muffler", "bumper"], "correct_choice_idx": 2, "direct_answers": ["handle", "muffler", "muffler", "clutch", "muffler", "muffler", "muffler", "handle", "muffler", "muffler"], "difficult_direct_answer": false, "rationales": ["To cut down on the engine noise on these types of motorcycles, mufflers are used.", "It is designed to make motors quieter", "There is a muffler."], "image": "train2014/COCO_train2014_000000084171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288054, "question_id": "f2rTPtDZfzRhysPqoQRDDD", "question": "What is the profession of the man seen on the bus?", "choices": ["officer", "doctor", "judge", "driver"], "correct_choice_idx": 3, "direct_answers": ["driver", "chef", "musician", "bus driver", "driver", "driver", "driver", "singer", "musician", "bus driver"], "difficult_direct_answer": false, "rationales": ["There is only one person visible on the bus and they are in the driver's seat and are in contact with the steering wheel. someone sitting in the driver's seat and driving the bus is likely a professional.", "He is in front of the steering wheel and is also in the drivers seat, indicating he is the driver.", "The man is driving the bus."], "image": "train2014/COCO_train2014_000000288054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469961, "question_id": "f3Dpbvy3x8rZKRm8t6m2qG", "question": "How much energy does this stretch of skiing require compared to extreme downhill runs?", "choices": ["none", "same", "more", "less"], "correct_choice_idx": 2, "direct_answers": ["high energy", "much more", "much more", "lots", "much energy", "more", "endurance", "more", "more", "very little"], "difficult_direct_answer": false, "rationales": ["This uphill ski course requires much more energy than the downhill variant.", "Cross-country skiing does not have a decline in order to reduce effort, going across straight land requires much more effort.", "It requires less energy to go downhill because your body weight and gravity will help you on the way down. the skiers are heading uphill here and those same things will work against them."], "image": "val2014/COCO_val2014_000000469961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368594, "question_id": "f3HRJys8UhN5rxiecSe48E", "question": "At what type event is the man preparing food?", "choices": ["fancy brunch", "bris", "picnic", "baby shower"], "correct_choice_idx": 2, "direct_answers": ["picnic", "lunch", "picnic", "lunch", "outdoor", "picnic", "premiere", "picnic", "picnic", "picnic"], "difficult_direct_answer": false, "rationales": ["He is outdoors as can be seen from the shadows and sunlight from above, and appears to be at a makeshift table and kitchen.", "Picnics are held outdoors.", "The sun can be seen shining down, showing if is an outdoor event."], "image": "train2014/COCO_train2014_000000368594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254658, "question_id": "f3RM73m3VPVdqwYovnA6WV", "question": "Why is the item she is showing off black?", "choices": ["spices", "soy sauce", "feathers", "burnt"], "correct_choice_idx": 3, "direct_answers": ["burnt", "burnt", "chocolate", "burnt", "burnt", "burnt", "burnt", "it's burnt", "burnt", "food"], "difficult_direct_answer": false, "rationales": ["She left the meat in the oven for way too long.", "She cooked it too long.", "A person holds up a dark piece of meat on a plate. meat is black when burned."], "image": "train2014/COCO_train2014_000000254658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441518, "question_id": "f3RYCu68HXMpcog9C2SsPC", "question": "What area of the computer is he touching with his fingers?", "choices": ["trackpad", "screen", "keyboard", "usb slot"], "correct_choice_idx": 0, "direct_answers": ["keyboard", "mouse pad", "frame", "keyboard", "hands feet", "ipad", "trackpad", "mousepad", "trackpad", "touchpad"], "difficult_direct_answer": false, "rationales": ["The trackpad is mostly only touched with fingers.", "This works the same as using a mouse", "He is using this to move the cursor."], "image": "val2014/COCO_val2014_000000441518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193168, "question_id": "f3fzBqoyJJWVTroWjNe8ku", "question": "What are the round things on the outer edge?", "choices": ["garbanzo beans", "peas", "olives", "cheese balls"], "correct_choice_idx": 0, "direct_answers": ["chickpeas", "garbanzo beans", "dials", "heat knobs", "chickpeas", "garbanzo beans", "chickpeas", "peanut", "knobs", "sausage"], "difficult_direct_answer": false, "rationales": ["The round things are beans.", "They have added garbanzo beans to their pizza", "The round things are chickpeas."], "image": "train2014/COCO_train2014_000000193168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520259, "question_id": "f3rQEzu9sLwMPPVg6pVgjU", "question": "The item the man is holding is similar to what hygienic item?", "choices": ["dental floss", "moisturizer", "tongue scraper", "hairbrush"], "correct_choice_idx": 0, "direct_answers": ["kite", "dental floss", "tampon", "towel", "garbage bag", "tooth floss", "pad", "dental floss", "floss", "band aid"], "difficult_direct_answer": true, "rationales": ["A floss is made of string.", "A string is similar to dental floss.", "The string from a kite is similar to dental floss."], "image": "train2014/COCO_train2014_000000520259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350514, "question_id": "f464rSpiDFLjPfmCv96Rsv", "question": "The people carrying bags are doing so because of what reason?", "choices": ["shopping", "commuting", "weather", "air travel"], "correct_choice_idx": 3, "direct_answers": ["travel", "traveling", "travelling", "air travel", "traveling", "traveling", "traveling", "travelling", "traveling", "travel"], "difficult_direct_answer": false, "rationales": ["They are traveling.", "They look to be at an airport getting ready to get on a plane.", "The people are at an airport."], "image": "train2014/COCO_train2014_000000350514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86745, "question_id": "f4wUun3fjFGz9nJax7o4KW", "question": "Where are the people with the luggage walking to?", "choices": ["house", "restaurant", "airport", "theme park"], "correct_choice_idx": 2, "direct_answers": ["airport", "airplane", "airport", "airplane", "airplane", "terminal", "departure", "bus", "airport terminal", "airport"], "difficult_direct_answer": false, "rationales": ["They are walking through the terminal, and the tags on their luggage show that they are labeled to go on a plane.", "They are on their way to catch a plane.", "People are walking on an overpass and they all have luggage with them that is used when flying by plane."], "image": "train2014/COCO_train2014_000000086745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270810, "question_id": "f5AJ5vwVnh9BcBLmYYqcjy", "question": "What is the occupation of the man on the yellow step?", "choices": ["waiter", "musician", "doctor", "conductor"], "correct_choice_idx": 3, "direct_answers": ["engineer", "conductor", "conductor", "conductor", "train conductor", "standing", "train conductor", "conductor", "train", "conductor"], "difficult_direct_answer": false, "rationales": ["He's the conductor.", "The way he looks seems he is a conductor.", "He makes sure everyone is on or off the train before signaling to the engineer to leave."], "image": "val2014/COCO_val2014_000000270810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251395, "question_id": "f5EgtSjSEydcXFinQeap78", "question": "Where will the ball go next?", "choices": ["behind him", "no where", "behind racquet", "upwards"], "correct_choice_idx": 3, "direct_answers": ["over net", "in air", "upwards", "air", "away", "over net", "air", "other side", "over net", "net"], "difficult_direct_answer": false, "rationales": ["The ball is being served.", "He throws it into the air before he hits it.", "The man is throwing the ball to the air."], "image": "val2014/COCO_val2014_000000251395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406753, "question_id": "f5SLmVj8Wd6cE6Uu3P6S9V", "question": "Where room would this food be consumed in?", "choices": ["attic", "bathroom", "dining room", "living room"], "correct_choice_idx": 2, "direct_answers": ["dining", "dining room", "dining room", "dining", "dining", "dining room", "restaurant", "dining room", "dining room", "dining room"], "difficult_direct_answer": false, "rationales": ["Dinner is often eating in the dining room.", "Most times food like this is served in a person's dining area,", "The room has food."], "image": "train2014/COCO_train2014_000000406753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477774, "question_id": "f5WxyobBJu76LDV3pMfy6C", "question": "What format of photographs will this woman be taking?", "choices": ["film", "hand drawn", "digital", "polaroid"], "correct_choice_idx": 2, "direct_answers": ["carnival", "digital", "jpeg file", "closeups", "normal", "color", "digital", "unknown", "selfie", "bikini"], "difficult_direct_answer": true, "rationales": ["The woman has a digital camera.", "There is a camera hanging from her arm.", "The format is digital."], "image": "train2014/COCO_train2014_000000477774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99238, "question_id": "f5uqWaCbYBGUsy2h8NEhog", "question": "What sort of skiers could use this ski run?", "choices": ["almost any", "professional only", "no one", "licensed teachers"], "correct_choice_idx": 0, "direct_answers": ["beginners", "alpine", "almost any", "advanced", "downhill skiers", "beginner", "professional", "tired", "rookie skiers", "intermediate"], "difficult_direct_answer": true, "rationales": ["A ski run with only a slight downward angle is shown.", "The run is pretty flat with no steep areas.", "It is almost flat so beginners wouldn't go too fast here"], "image": "train2014/COCO_train2014_000000099238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10613, "question_id": "f6GiYfKjRkoFvB6B6NmSpK", "question": "What is the man wearing a hat doing with the camera?", "choices": ["throwing it", "selling it", "taking pictures", "buying it"], "correct_choice_idx": 2, "direct_answers": ["rewinding", "adjusting it", "taking pictures", "taking pictures", "taking photo", "checking photos", "fixing", "taking picture", "checking pictures", "inspecting"], "difficult_direct_answer": true, "rationales": ["The man is taking photos.", "Given how he's looking at the camera and the way it's aimed, he's taking pictures rather than doing the other things with it.", "The man is snapping photos."], "image": "val2014/COCO_val2014_000000010613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170235, "question_id": "f6HVDK8spWzBLUnBPxV86K", "question": "What will the standing player do next?", "choices": ["sit", "squat", "run", "hide"], "correct_choice_idx": 2, "direct_answers": ["it depends", "run", "run", "run", "base run", "run", "home plate", "run", "run", "run"], "difficult_direct_answer": false, "rationales": ["A baseball player has just swung the bat at a ball.", "After hitting the ball the batter will run to first as fast as he can.", "The player is about to run after hitting the ball."], "image": "train2014/COCO_train2014_000000170235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470501, "question_id": "f6RqLkTb6eDKETYANhGgbD", "question": "What is the black pole the white ball is on called?", "choices": ["steady hit", "batting tee", "pitcher's mound", "street pole"], "correct_choice_idx": 1, "direct_answers": ["batting tee", "baseball tee", "batting tee", "tee", "ball holder", "batting tee", "pole", "batting tee", "tee", "tee stand"], "difficult_direct_answer": false, "rationales": ["The pole is used to hit a ball.", "The black pole is a tee that is used for batting practice.", "The black pole is for batting."], "image": "train2014/COCO_train2014_000000470501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102400, "question_id": "f6dhJgxwncjbkMRT8y3Kob", "question": "Which country bus it is?", "choices": ["germany", "france", "china", "taiwan"], "correct_choice_idx": 2, "direct_answers": ["coastal town", "australia", "china", "china", "india", "brazil", "korea", "japan", "china", "foreign"], "difficult_direct_answer": false, "rationales": ["There is asian languages written on side of bus in red part.", "There are chinese characters on the side of the bus.", "The bus has chinese writing on it."], "image": "val2014/COCO_val2014_000000102400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146448, "question_id": "f6f9M5FafxxkdrE9HF83ju", "question": "What jungle animal do his ears resemble?", "choices": ["tiger", "snake", "monkey", "parrot"], "correct_choice_idx": 2, "direct_answers": ["monkey", "human child", "monkey", "monkey", "monkey", "monkey", "monkey", "elephant", "elephant", "koala"], "difficult_direct_answer": false, "rationales": ["Their ears stick straight out", "Humans are closely related to the chimpanzee family.", "Monkeys have really broad ears."], "image": "val2014/COCO_val2014_000000146448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344807, "question_id": "f6rSn7SF7vEfmqMzqaXtLY", "question": "At least how many kid?", "choices": ["six", "one", "three", "two"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "four", "zero", "one", "one", "one", "one kid", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one baby seat at the table.", "There is a booster seat on a chair at the table.", "There is one high chair on one of the dining room chairs. high chairs are for kids."], "image": "train2014/COCO_train2014_000000344807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24402, "question_id": "f6yjttEorGZXinmZDd8Bpr", "question": "What appliance is being used here?", "choices": ["toaster", "hair dryer", "curling iron", "iron"], "correct_choice_idx": 1, "direct_answers": ["hair dryer", "hair dryer", "blow dryer", "hair dryer", "blow dryer", "hairdryer", "blower", "hair dryer", "hair dryer", "hair dryer"], "difficult_direct_answer": false, "rationales": ["This has a handle and a nozzle usually used to point at hair", "A hair dryer is being used to dry the paper mache.", "They're obviously drying their paper sculpture."], "image": "val2014/COCO_val2014_000000024402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24401, "question_id": "f7FFzDPpoj7hpFgVB387uV", "question": "What is the relationship of the woman to the parachutist?", "choices": ["unrelated", "mother", "grandmother", "friend"], "correct_choice_idx": 3, "direct_answers": ["expert", "friend", "acquaintance", "friends", "friend", "friend", "friend", "fan", "friends", "friends"], "difficult_direct_answer": false, "rationales": ["The person is about the same age as the parachutist.", "The people are both adults. the woman is waving at the other person, so she knows them.", "She's waving as they land"], "image": "train2014/COCO_train2014_000000024401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81589, "question_id": "f7NVKhFL52tix6Sk7XYNbd", "question": "What software technology is the man showing off on the mobile devices?", "choices": ["apple ios", "adobe flash", "linux", "android"], "correct_choice_idx": 1, "direct_answers": ["watching video", "adobe", "revelation", "robot", "tablet", "app", "tablet", "magazine", "adobe flash", "tablet"], "difficult_direct_answer": false, "rationales": ["The technology he is holding is generally of a android based software.", "It is adobe flash on the screen of the mobile device because you need it to play video", "The man is wearing an adobe shirt and showing off electronic devices."], "image": "train2014/COCO_train2014_000000081589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121873, "question_id": "f7iFZ7eShS4qTu7quaM85f", "question": "How was this dish prepared?", "choices": ["raw", "baked", "fried", "boiled"], "correct_choice_idx": 1, "direct_answers": ["baked", "baked", "oven", "baked", "baked", "oven", "baked", "baked", "baked", "baked"], "difficult_direct_answer": false, "rationales": ["This dish is prepared from a baking oven.", "A bread is being served.", "The crisp brown with black splotches of this dishes' crust are indications of baking."], "image": "train2014/COCO_train2014_000000121873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485173, "question_id": "f7u3x8SxwZLE3y9gpaDUVY", "question": "What food product are the animals in the front most closely associated with?", "choices": ["beef", "mutton", "goat's cheese", "cow's milk"], "correct_choice_idx": 2, "direct_answers": ["veil", "beef", "cheese", "goat's cheese", "plants", "milk", "cheese", "cheese", "cheese", "goat meat"], "difficult_direct_answer": false, "rationales": ["The animals on the left have horns. they are not sheep or cows.", "These animals milk can be turned into cheese.", "Goat's cheese is associated with goats."], "image": "train2014/COCO_train2014_000000485173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332968, "question_id": "f7vHauCnLpkh2fzzkhxYsN", "question": "The bakery here specializes in what type occasion?", "choices": ["birthdays", "wedding", "donut day", "anniversary"], "correct_choice_idx": 0, "direct_answers": ["wedding", "birthdays", "birthdays weddings", "birthdays", "birthday cakes", "weddings", "birthdays", "birthdays", "birthdays", "birthdays"], "difficult_direct_answer": false, "rationales": ["There are multiple birthday cakes on display.", "The bakery has birthday cakes in the window.", "The cakes are used for birthdays."], "image": "val2014/COCO_val2014_000000332968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308645, "question_id": "f7wRNyvq5N7DmMV2qimRNp", "question": "What is the person trying to do to the dog?", "choices": ["brush teeth", "clean eyes", "tighten color", "cut nails"], "correct_choice_idx": 0, "direct_answers": ["feed it", "feed it", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth", "brush teeth"], "difficult_direct_answer": false, "rationales": ["They have a toothbrush and trying to put it in the mouth", "Human as well as canines should clean their teeth regular. a tooth brush is normally used to accomplish this.", "The person has a toothbrush."], "image": "val2014/COCO_val2014_000000308645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17272, "question_id": "f7y8s6ndUMD7i36ayN7LYu", "question": "What is the purpose of all the umbrellas?", "choices": ["stop rain", "for hiding", "for flying", "deflect sunlight"], "correct_choice_idx": 3, "direct_answers": ["provide shade", "shading", "deflect sunlight", "sun protection", "protect beachgoers", "shade", "shade", "shade", "block sun", "shade"], "difficult_direct_answer": false, "rationales": ["There is usually a lot of sun on the beach.", "The people are on a beach. beaches can take in a lot of sun.", "Due to the setting these umbrellas can keep out the sun and rain."], "image": "val2014/COCO_val2014_000000017272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222262, "question_id": "f87YHpHizcWnBdWWq4jANA", "question": "What is the purpose of the vial with sticks?", "choices": ["insect repellant", "humidifying", "disinfectant", "scent"], "correct_choice_idx": 3, "direct_answers": ["incense burner", "play videogame", "air freshener", "aroma", "protection", "scent", "incense", "play game", "game", "navigation"], "difficult_direct_answer": true, "rationales": ["They are there for to make it smell good.", "The sticks are placed in oil, and release a scent into the sticks, which carries on into the room.", "The vial with sticks also has a liquid in it that is likely a perfume based on appearance which is a common method to diffuse scent."], "image": "train2014/COCO_train2014_000000222262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215600, "question_id": "f89j6x9aMoAorJvh5bLcMk", "question": "How much did parking meters initially charge?", "choices": ["quarter", "dime", "penny", "nickel"], "correct_choice_idx": 3, "direct_answers": ["quarter", "25 cents", "twenty cents", "zero", "five cents", "lot", "dime", "nickel", "quarter", "twenty five"], "difficult_direct_answer": true, "rationales": ["The person needs a nickel.", "A parking meter is shown with a digital readout.", "This answer was gleaned by looking up this question online and finding the historical answer based on the first parking meters usage."], "image": "train2014/COCO_train2014_000000215600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109290, "question_id": "f8B4P4Y9t7M3JXbBFMw7zy", "question": "Whos number did the child dial?", "choices": ["no ones", "mom", "dad", "her own"], "correct_choice_idx": 0, "direct_answers": ["mother", "grandmas", "too little", "mommy", "random number", "police", "no ones", "their parent's", "mom", "no clue"], "difficult_direct_answer": true, "rationales": ["The child can't dial numbers.", "The child has a phone in her hand, which can be used to dial numbers. the child is too young to have memorized a number or the process required in order to initiate a successful telephone call.", "The child is holding a phone. they are too young to understand how to dial a number."], "image": "train2014/COCO_train2014_000000109290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282599, "question_id": "f8EeTmEXbaYSQKTNPzxRvE", "question": "The game being played on the television is meant to improve what aspect of the player?", "choices": ["speech", "cognition", "visual acuity", "fitness"], "correct_choice_idx": 3, "direct_answers": ["balance", "fitness", "wii", "wii", "balance", "balance", "balance", "balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["They are standing on an exercise pad.", "The purpose of wii sports is to increase fitness.", "The person is on a wii fitness board which was used for improving fitness."], "image": "train2014/COCO_train2014_000000282599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107105, "question_id": "f8NBnZGh2BYrgB2MFBQyVG", "question": "What does this country's flag look like?", "choices": ["striped", "green background", "blue background", "red background"], "correct_choice_idx": 3, "direct_answers": ["china", "red background", "not clear", "striped", "no idea", "red", "china", "red white", "colorful", "red"], "difficult_direct_answer": false, "rationales": ["It has a red background since it is closely associated with china", "They are in china.", "The flag is red."], "image": "val2014/COCO_val2014_000000107105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482022, "question_id": "f8vVbVHtanwDrqdA6amgWt", "question": "What appliance is missing from this room?", "choices": ["vaccuum", "washing machine", "air conditioner", "refrigerator"], "correct_choice_idx": 3, "direct_answers": ["microwave", "refrigerator", "fridge", "dishwasher", "fridge", "fridge", "fridge", "dishwasher microwave", "fridge", "microwave"], "difficult_direct_answer": false, "rationales": ["The fridge is missing.", "There is a stove but no place to store cold or frozen foods.", "The kitchen has no refrigerator."], "image": "val2014/COCO_val2014_000000482022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532734, "question_id": "f995NMAacqCyzqhQZWWeM9", "question": "What is the color of the following banana imply?", "choices": ["not ripe", "ripe", "none", "rotten"], "correct_choice_idx": 1, "direct_answers": ["ripe", "yellow", "ripe", "ripe", "yellow", "ripe", "ripeness", "ripe", "ripe", "winning"], "difficult_direct_answer": false, "rationales": ["If it were green, then it wouldn't be a.", "A yellow banana is being tossed. yellow bananas are ripe.", "The bananas are primarily yellow, which is generally considered to be the color at which they are most edible."], "image": "train2014/COCO_train2014_000000532734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413822, "question_id": "f9TRSAmvtTZ6y5GDKGBgw3", "question": "What does the man on the motorcycle ride in?", "choices": ["war invasion", "parade", "picnic", "work event"], "correct_choice_idx": 1, "direct_answers": ["leather", "parade", "leather", "history clothes", "parade", "parade", "jacket", "road", "parade", "parade"], "difficult_direct_answer": false, "rationales": ["There are people on the sidelines watching the man, so it's likely a parade.", "The man is part of a parade.", "The man is in a parade."], "image": "val2014/COCO_val2014_000000413822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358411, "question_id": "f9XDcXqx7ytA7sK94RdJfh", "question": "How many people will dine together at this table?", "choices": ["five", "six", "none", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "two", "one person", "one", "one", "one", "one person", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["Judging that there is only one plate of food, we can be confident that only one person will dine at this table.", "There is a single plate sitting on the table.", "One plate is set with food at a dining table."], "image": "train2014/COCO_train2014_000000358411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455665, "question_id": "f9jM4rZeEKpPge9GNQgXXi", "question": "What is the man attempting to do with the ball?", "choices": ["grab it", "hit it", "punch it", "kick it"], "correct_choice_idx": 1, "direct_answers": ["lob", "hit", "hit", "hit", "hit", "hit", "hit", "hit it", "hit it", "hit it"], "difficult_direct_answer": false, "rationales": ["The man wants to get a hit on the ball.", "By the sport being played and the position of his racket you can tell what he is about to do to the ball.", "The man wants to hit."], "image": "train2014/COCO_train2014_000000455665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513861, "question_id": "f9yM7nMbJtGWfjebGVxWdA", "question": "Why is the man wearing a glove?", "choices": ["warmth", "to catch", "fashion", "health"], "correct_choice_idx": 1, "direct_answers": ["catching", "pitcher", "catch ball", "to catch", "hand protection", "playing baseball", "catch ball", "catcher", "hand protection", "catch ball"], "difficult_direct_answer": false, "rationales": ["This is a baseball player. he is on the defensive team.", "The person is a baseball player based on the uniform and equipment. in this sport, answer a is the known reason for why a person wears a glove.", "A man is in a baseball uniform on a baseball field."], "image": "train2014/COCO_train2014_000000513861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235791, "question_id": "fA8C2fuyku4Fhq9n4HgyCD", "question": "What is in the water?", "choices": ["shark", "boat", "catfish", "cow"], "correct_choice_idx": 1, "direct_answers": ["boat", "boat", "boats", "boats", "boats", "boat", "boats", "boat", "boat", "boats"], "difficult_direct_answer": false, "rationales": ["The object is not an animal.", "There are numerous floating vessels present.", "A vehicle, not an animal, is in the water."], "image": "val2014/COCO_val2014_000000235791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164381, "question_id": "fAELey77UQ6VNYPyPXrtkV", "question": "What might they be doing with their devices?", "choices": ["texting friends", "following presentation", "playing game", "taking selfie"], "correct_choice_idx": 1, "direct_answers": ["reading", "texting", "texting", "following presentation", "texting", "texting", "texting", "reading texts", "texting", "silencing them"], "difficult_direct_answer": false, "rationales": ["A bunch of people are sitting in an audience in staring at their phones and in front of them. people sometimes like to record presentations.", "The people are engaging with the presentation.", "They're following the presentation."], "image": "train2014/COCO_train2014_000000164381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208494, "question_id": "fAHvi57Mi4uv7HA4vQQRxW", "question": "What does the giraffe want to do with the item in this man's mouth?", "choices": ["smell it", "spit", "ignore it", "eat it"], "correct_choice_idx": 3, "direct_answers": ["eat it", "eat", "eat", "eat it", "eat it", "eat it", "eat", "eat", "eat", "eat"], "difficult_direct_answer": false, "rationales": ["The giraffe also consumes vegetables such as the carrots.", "It is a food item", "The man is holding a carrot in his mouth and the giraffe is taking it in order to eat it"], "image": "val2014/COCO_val2014_000000208494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72944, "question_id": "fApY46b3V6YuXfTcPBEeRX", "question": "What group of people is the ramp here constructed for?", "choices": ["handicapped people", "merchants", "bikers", "singers"], "correct_choice_idx": 0, "direct_answers": ["handicapped people", "handicapped", "disabled", "handicapped", "handicapped", "disabled", "handicapped", "disabled", "handicapped", "handicapped"], "difficult_direct_answer": false, "rationales": ["There are no stairs. a wheelchair can roll down the ramp easily. people with disabilities ride in wheelchairs.", "The ramp was constructed for people in wheelchairs because they are unable to utilize stairs to enter a building.", "This is for people in wheelchairs to be able to get up instead of going up the stairs."], "image": "val2014/COCO_val2014_000000072944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495013, "question_id": "fAyebddfiH5k3yKDg86aDk", "question": "What has the yellow object been drawn on to resemble?", "choices": ["eagle", "face", "dog", "star"], "correct_choice_idx": 1, "direct_answers": ["elephant", "robot", "face", "monkey", "cartoon character", "frowny face", "face", "sad face", "face", "sad elephant"], "difficult_direct_answer": false, "rationales": ["The object is a face.", "There are eyes, a nose, and a mouth on the hydrant.", "The downwards mouth makes it look sad."], "image": "train2014/COCO_train2014_000000495013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289805, "question_id": "fB2vvi7o34GrC7pzbnVC8S", "question": "Why are the people lining up?", "choices": ["mobile library", "buying food", "boarding vehicle", "donating blood"], "correct_choice_idx": 1, "direct_answers": ["board van", "to board", "bus ride", "buying food", "board van", "boarding bus", "bus ride", "ride bus", "food", "food truck"], "difficult_direct_answer": false, "rationales": ["They appear to be lining up in front of a food truck, which is a traveling truck that sells meals.", "The people are at a grocery store.", "The people want to get onto the van."], "image": "train2014/COCO_train2014_000000289805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579332, "question_id": "fBFktBYPwJnbJRdV8TqmXq", "question": "What item is located behind the lady in red?", "choices": ["hand dryer", "refrigerator", "oven", "double boiler"], "correct_choice_idx": 1, "direct_answers": ["refrigerator", "refrigerator", "fridge", "refrigerator", "fridge", "refrigerator", "refrigerator", "stove", "refrigerator", "refrigerator"], "difficult_direct_answer": false, "rationales": ["The fridge is located.", "A refrigerator is shown.", "A woman is standing in a kitchen with a large stainless steel appliance behind her. the appliance has a big door on the front."], "image": "train2014/COCO_train2014_000000579332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207463, "question_id": "fBHT4ZEuESNkCYi64zKfag", "question": "What is making the people smile and look the same direction?", "choices": ["video game", "movie", "fish tank", "board game"], "correct_choice_idx": 0, "direct_answers": ["video game", "video game", "game", "video game", "video game", "game", "wii", "game system", "video game", "television"], "difficult_direct_answer": false, "rationales": ["They have controllers in their hands", "The people have remotes to play a game.", "The people are looking at the screen and smiling because they're playing nintendo wii."], "image": "train2014/COCO_train2014_000000207463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288283, "question_id": "fBLcbtETuGEqfBAR6pjPCM", "question": "What are the people wearing bright yellow?", "choices": ["to celebrate", "halloween", "punishment", "visibility"], "correct_choice_idx": 3, "direct_answers": ["visibility safety", "team", "team", "visibility", "safety", "street color", "visibility", "on street", "safety", "safety"], "difficult_direct_answer": false, "rationales": ["The people want to be seen by drivers.", "The people will be on the road. they want to make sure people in vehicles will be able to see them.", "The people want visibility."], "image": "train2014/COCO_train2014_000000288283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315863, "question_id": "fBLkg9RVShh3JGRs3ZK9K7", "question": "Why are those flowers growing in those bins?", "choices": ["bees", "birds", "weeds", "gardener"], "correct_choice_idx": 3, "direct_answers": ["as decoration", "decoration", "decoration", "there planters", "decorations", "daisy", "decorations", "gardener", "decoration", "for decoration"], "difficult_direct_answer": false, "rationales": ["The containers appear to be methodically placed and would be professional manicured. someone responsible for flower growth would be answer a.", "There is a gardener who takes care of these flowers.", "They are in a garden."], "image": "val2014/COCO_val2014_000000315863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438744, "question_id": "fBRefgHois73myhh873WxQ", "question": "What is the person in the brown boots doing with the dog?", "choices": ["bathing it", "walking it", "feeding it", "buying it"], "correct_choice_idx": 1, "direct_answers": ["walking", "walking", "walking", "walking them", "walking", "feeding it", "walking dog", "walking it", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["The dog is on a leash.", "The person is walking.", "A woman with brown boots is holding a leash connected to a dog."], "image": "train2014/COCO_train2014_000000438744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306553, "question_id": "fBYRYL3CyRASK7aFuPg7Ag", "question": "Why are they so close together?", "choices": ["hearing impaired", "attacking", "limited space", "friends"], "correct_choice_idx": 3, "direct_answers": ["friends", "friends", "talking", "chatting", "friends", "friends", "partner", "relationship", "hanging out", "couple"], "difficult_direct_answer": false, "rationales": ["They know one another.", "The women look like they are talking.", "They've known each other a long time."], "image": "val2014/COCO_val2014_000000306553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569016, "question_id": "fBZDK7phCteE3Xi3UE9DR2", "question": "What are the four things on each side of the vehicle called?", "choices": ["missiles", "propellers", "windows", "tires"], "correct_choice_idx": 1, "direct_answers": ["wheels", "propellers", "propellers", "propellers", "propellers", "propeller", "wheels", "propellers blades", "propeller blades", "tires"], "difficult_direct_answer": false, "rationales": ["These are found on each side of the airplane and it helps the plane move forward.", "There are propellers on front of the wings.", "These are part of the engine"], "image": "val2014/COCO_val2014_000000569016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288236, "question_id": "fBdj7cVQCwcikC3F9jq85m", "question": "What is the woman wearing the skirt doing?", "choices": ["random passing", "complaining", "judging", "modeling"], "correct_choice_idx": 2, "direct_answers": ["serving", "hitting ball", "playing tennis", "playing tennis", "playing tennis", "playing tennis", "playing tennis", "tennis", "playing tennis", "judging"], "difficult_direct_answer": false, "rationales": ["Referees who make judgment calls in tennis matches often wear uniforms, such as a white skirt.", "The woman on the sidelines is a referee.", "A girl in the back is being a judge to tell if a ball is staying in the court."], "image": "train2014/COCO_train2014_000000288236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50752, "question_id": "fBofRuKntSWjspLy8xiB3X", "question": "Who serves coffee on the corner?", "choices": ["starbucks", "dunkin' donuts", "tim hortons", "mcdonalds"], "correct_choice_idx": 0, "direct_answers": ["starbucks", "starbucks", "shop", "starbucks", "coffee house", "starbucks", "cafe", "starbucks", "starbucks", "starbucks"], "difficult_direct_answer": false, "rationales": ["You can see the st on the edge of the sign.", "A green st can be seen on the side of the building and the circular mermaid company logo is seen.", "The coffee is starbucks."], "image": "val2014/COCO_val2014_000000050752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171733, "question_id": "fC6o4sjuFUec2n8KUdF2uR", "question": "Where will the SUV drive?", "choices": ["in front", "on top", "beside", "behind"], "correct_choice_idx": 0, "direct_answers": ["in front", "before truck", "past truck", "in front", "straight", "on road", "america", "ahead", "behind truck", "construction die"], "difficult_direct_answer": true, "rationales": ["A large piece of heavy equipment is being towed by a large truck. a suv is driving by in the lane beside the truck.", "An suv is on a road in the lane next to a large truck hauling. people in regular vehicles often pass slower, larger ones.", "The suv is in front."], "image": "val2014/COCO_val2014_000000171733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466137, "question_id": "fCJtBjXY7EievFHfiZF6ms", "question": "Where does the teddy bear come from?", "choices": ["france", "germany", "britain", "america"], "correct_choice_idx": 3, "direct_answers": ["child", "uk", "store", "america", "child", "toy store", "store", "theodore roosevelt", "factory", "toy store"], "difficult_direct_answer": false, "rationales": ["The bear is from the usa.", "A teddy bear is on a table next to a cup of tea.", "Teddy bears are named after us president teddy roosevelt."], "image": "val2014/COCO_val2014_000000466137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461248, "question_id": "fCdF6siNGcMRhH3Jz6jVj5", "question": "What is the Head towering above everyone here meant to represent?", "choices": ["statue liberty", "nothing", "washington monument", "liberty bell"], "correct_choice_idx": 0, "direct_answers": ["liberty", "liberty statue", "liberty statue", "statue liberty", "liberty statue", "liberty", "liberty", "statue liberty", "liberty statue", "freedom"], "difficult_direct_answer": false, "rationales": ["There is a large statue that looks like the statue of liberty at the concert.", "You can see the distinctive crown", "The head is the statue of liberty."], "image": "val2014/COCO_val2014_000000461248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363146, "question_id": "fCf6vmGEoi3YT5uh93gFfe", "question": "What food are they possibly getting ready to make?", "choices": ["pizza", "biscuits", "pies", "cake"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["The food is pizza.", "There is a circular dough with a crust.", "There is a large round piece of flat baked dough. the italian pie is made from large round pieces of dough."], "image": "train2014/COCO_train2014_000000363146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581857, "question_id": "fCiCABtqGrATUhRdZb5Qmv", "question": "What is meant to be kept in those cages?", "choices": ["snakes", "lizards", "rabbits", "birds"], "correct_choice_idx": 3, "direct_answers": ["birds", "picnic", "birds", "birds", "birds", "pets", "birds", "birds", "fruit", "birds"], "difficult_direct_answer": false, "rationales": ["They are birdcages.", "There are small pet cages.", "They have perches and little doors"], "image": "train2014/COCO_train2014_000000581857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34164, "question_id": "fCnjdYgWj66NVMadvwgRfA", "question": "On which floor of the building is this bathroom?", "choices": ["fifth floor", "first floor", "basement", "third floor"], "correct_choice_idx": 2, "direct_answers": ["basement", "first", "ground level", "first", "first", "top", "first floor", "first", "not known", "first"], "difficult_direct_answer": false, "rationales": ["This bathroom is just underneath the side of the house.", "There is sunlight and the blinds are closed for privacy", "A room has low ceilings and a window that is up high on the wall."], "image": "train2014/COCO_train2014_000000034164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122934, "question_id": "fDDmTuZECMj7ELHiKnyCGg", "question": "In which season are the people traveling on the black horse drawn coach?", "choices": ["summer", "spring", "fall", "winter"], "correct_choice_idx": 3, "direct_answers": ["winter", "fall", "winter", "fall", "winter", "winter", "winter", "fall", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["People are in a carriage and snow is on the ground. snow is around in the winter.", "The season is winter.", "The people are riding the coach through the snow."], "image": "val2014/COCO_val2014_000000122934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231148, "question_id": "fDEASW99d4DGKbekeJbpf5", "question": "What number is the hour hand currently pointing to on the clock?", "choices": ["two", "nine", "eight", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "eight", "two", "250", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The small hand is pointed at it", "That is the number the hand is placed.", "It is 3:20 in the afternoon so it would be 3."], "image": "train2014/COCO_train2014_000000231148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279603, "question_id": "fDmjGqo8VwruxzNdRM786Z", "question": "The cylindrical item seen here came from a container with what color liquid inside?", "choices": ["white", "red", "brown", "green"], "correct_choice_idx": 0, "direct_answers": ["yellow", "gold", "wine glass", "white", "champagne", "white", "white", "white wine", "yellow", "amber"], "difficult_direct_answer": false, "rationales": ["The wine bottle has a greenish liquid in it.", "It is a wine bottle", "There is some white wine inside of the case."], "image": "train2014/COCO_train2014_000000279603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132394, "question_id": "fDosNAsr26NrrQVgg6ki2n", "question": "What is the purpose of the writing on the sign?", "choices": ["direct traffic", "advertisement", "inform public", "slogan"], "correct_choice_idx": 3, "direct_answers": ["communicate hope", "encouragement", "no entry", "help people", "slogan", "inspire hope", "graffiti", "hope", "art", "show address"], "difficult_direct_answer": true, "rationales": ["The purpose is a slogan.", "It's to give people optimism.", "It's a slogan for something."], "image": "train2014/COCO_train2014_000000132394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355086, "question_id": "fEGEG44ba2ZKNc5Co3UCsG", "question": "What type of cheese is on the pizza?", "choices": ["fresh mozzarella", "provolone", "swiss", "pecorino romano"], "correct_choice_idx": 0, "direct_answers": ["mozzarella", "mozzarella", "fresh mozzarella", "pizza", "mozzarella", "bread pizza", "mozzarella", "mozzarella", "mozzarella", "motzerella"], "difficult_direct_answer": false, "rationales": ["Its fresh mozzarella. mozzarella is what they normally will put on pizza.", "Mozzarella is a flat white cheese.", "Pizzas are on silver trays."], "image": "train2014/COCO_train2014_000000355086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140460, "question_id": "fELmvLrNBn5GTZ29VBVJnr", "question": "How many buses have an orange portion near their tires?", "choices": ["five", "three", "one", "six"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The bus on the right.", "Only one is seen in the angle with orange on the tires.", "One bus has an orange line near the bottom of the bus."], "image": "train2014/COCO_train2014_000000140460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168405, "question_id": "fER95MTyEruz3Qs2qASFqV", "question": "Why are the bananas hanging?", "choices": ["easier access", "provide shade", "looks nicer", "slow ripening"], "correct_choice_idx": 3, "direct_answers": ["prevent destroying", "display", "ripen", "display", "for sale", "for space", "visibility", "slow ripening", "no space", "displaying"], "difficult_direct_answer": true, "rationales": ["The bananas are hanging to prevent being spoiled faster.", "This answer is internet searchable.", "Hanging bananas help to slow down the process of browning."], "image": "train2014/COCO_train2014_000000168405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536179, "question_id": "fESvMMnkDt3gNrH8hX8AZE", "question": "How many people were most probably riding as motorcycle passengers?", "choices": ["two", "one", "three", "zero"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "one", "one", "one", "two", "one", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Motorcycle has two seats one for driver and one for passenger both people sitting in the photo have motorcycle helmet next to them suggesting that they both rode together.", "There are two people.", "There are two people sitting with helmets."], "image": "train2014/COCO_train2014_000000536179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455545, "question_id": "fEXoQftA3vMbn6MEPfGcL5", "question": "This Imbarchi gates are updated as automatic open by using what?", "choices": ["wood", "magnet", "cargos", "steel"], "correct_choice_idx": 1, "direct_answers": ["card", "latches", "motion detector", "electricity", "motion sensors", "technology", "motion", "entry gate", "remote", "magnet"], "difficult_direct_answer": true, "rationales": ["The gates can open once magnets pull them apart.", "They use a magnet.", "The imbarchi gates can be opened with a magnet and then it automatically opens"], "image": "train2014/COCO_train2014_000000455545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381723, "question_id": "fFC2KkQpjNATUJLk5XRVdb", "question": "In which style white pizza made of?", "choices": ["german", "italian", "australian", "arab"], "correct_choice_idx": 1, "direct_answers": ["personal pizza", "chicken", "no tomatoes", "new york", "italian", "personal", "italian", "deep dish", "personal size", "personal pan"], "difficult_direct_answer": true, "rationales": ["Pizza comes from italy.", "The pizza has traditional italian styled toppings on it.", "A traditional pizza is on a plate."], "image": "train2014/COCO_train2014_000000381723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406047, "question_id": "fFHdYKCtzg3hkjx2upaFUx", "question": "What level of event is this?", "choices": ["hobby", "national", "international", "local"], "correct_choice_idx": 2, "direct_answers": ["expert", "hardest", "olympic", "olympic", "olympics", "international", "high altitude", "olympics", "skiing", "olympics"], "difficult_direct_answer": false, "rationales": ["The names of countries are on the people's uniforms.", "Different flags of different countries are seen in the area.", "It's international."], "image": "train2014/COCO_train2014_000000406047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344860, "question_id": "fFMgTRRfyGCg295cbutrUb", "question": "What the young one of the animal displayed?", "choices": ["kitten", "kid", "calf", "foal"], "correct_choice_idx": 3, "direct_answers": ["affection", "zebra", "affection", "zebra", "zebra", "affection", "foal", "zebra", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["The young animal is called a foal.", "The young one is a foal.", "A young zebra is called foal."], "image": "val2014/COCO_val2014_000000344860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261298, "question_id": "fFTLEWbMokgGWnsGbgBjQh", "question": "What item suggests that the owner of this home likes bright colors?", "choices": ["window", "table", "couch", "wall"], "correct_choice_idx": 3, "direct_answers": ["bright walls", "walls", "wall", "red walls", "red walls", "wall", "red wall", "walls", "wall color", "wall"], "difficult_direct_answer": false, "rationales": ["The item is the wall.", "The wall is red.", "The wall in the home is a very bright shade of red."], "image": "train2014/COCO_train2014_000000261298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289410, "question_id": "fFTuwKDSSS8Tsb8Np5mkvN", "question": "Where is this room?", "choices": ["inn", "furniture store", "hotel", "house"], "correct_choice_idx": 3, "direct_answers": ["house", "bedroom", "top floor", "hotel", "upstairs", "house", "master bedroom", "bedroom", "house", "bedroom"], "difficult_direct_answer": false, "rationales": ["There is a cat on the bed which would not be a normal setting for a hotel or store.", "A bedroom is in a house.", "It looks like a bedroom."], "image": "train2014/COCO_train2014_000000289410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320947, "question_id": "fFUrxxyjShb95QDFLAFK5R", "question": "What is in the sky?", "choices": ["zeppelin", "airplane", "bird", "kite"], "correct_choice_idx": 3, "direct_answers": ["kite", "parachutes", "kites", "kites", "kites", "parachutes", "parachutes", "kites", "kites", "kites"], "difficult_direct_answer": false, "rationales": ["The kites are in the sky.", "There are a bunch of kites flying in the sky.", "Kites are flying on strings."], "image": "train2014/COCO_train2014_000000320947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386650, "question_id": "fFjCAw9GNjDUkS5Ru87Jaq", "question": "What are these types of plants mainly being grown for?", "choices": ["looks", "herbs", "animal feed", "food"], "correct_choice_idx": 0, "direct_answers": ["display", "decoration", "decoration", "decoration", "display", "looks", "decoration", "decoration", "decoration", "decoration"], "difficult_direct_answer": false, "rationales": ["These plants are grown all as decorations.", "The plants are for looks.", "The greens are placed close to where you would sit to make the environment around you seem nice and pretty. they are coming from a vase and not from something like a garden or such."], "image": "val2014/COCO_val2014_000000386650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313228, "question_id": "fFkZyA8C399SXYBtiqALu9", "question": "What object on his plate could severely injure him?", "choices": ["placemat", "plate", "knife", "cup"], "correct_choice_idx": 2, "direct_answers": ["knife", "knife", "knife", "knife", "knife", "knife", "knife", "knife", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["The boy has a knife and knives are known for hurting people.", "The object is a knife.", "The knife could cut the boy."], "image": "val2014/COCO_val2014_000000313228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550521, "question_id": "fFuFfy7CrRMBDyjEqryuw4", "question": "What does the man in the green shirt with scissors do for a living?", "choices": ["estitician", "murderer", "fortune teller", "barber"], "correct_choice_idx": 3, "direct_answers": ["cut hair", "barber", "barber", "barbar", "barber", "barber", "barber", "barber", "barber", "cuts hair"], "difficult_direct_answer": false, "rationales": ["He cuts hair", "The man is holding scissors and is doing hair.", "He cuts hair"], "image": "val2014/COCO_val2014_000000550521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420634, "question_id": "fG5ZJVy72GFxuxzfYtwrTb", "question": "What period of the day is it in the image?", "choices": ["night", "afternoon", "morning", "evening"], "correct_choice_idx": 2, "direct_answers": ["morning", "midmorning", "morning", "morning", "noon", "late morning", "morning", "midday", "morning", "morning"], "difficult_direct_answer": false, "rationales": ["The clock is showing close to noon but before it so it must be morning. if it was pm it would be dark.", "The sun is still out and there is light in the sky.", "It looks like it's morning."], "image": "train2014/COCO_train2014_000000420634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105026, "question_id": "fG5hkmxuNRC2W6HvMy74ku", "question": "Which one has the best eyesight?", "choices": ["blond", "white shirt", "blue shirt", "redhead"], "correct_choice_idx": 3, "direct_answers": ["necklace woman", "woman", "no glasses", "shorter woman", "redheaded woman", "long dress", "women", "third one", "shorter woman", "redhead"], "difficult_direct_answer": true, "rationales": ["The red-haired woman is not wearing glasses. the other three in the picture are wearing glasses. glasses are worn to improve eyesight.", "The redhead is the only one not wearing glasses, indicating that she does not need assistance with vision.", "The lady with red hair has no glasses."], "image": "train2014/COCO_train2014_000000105026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7096, "question_id": "fGEVLC8cai6j5P4wBPTnHh", "question": "What is the last number on the train?", "choices": ["eight", "zero", "six", "five"], "correct_choice_idx": 1, "direct_answers": ["zero", "zero", "zero", "zero", "zero", "zero", "zero", "zero", "zero", "zero"], "difficult_direct_answer": false, "rationales": ["It is a black number and easy to see", "A zero is the last digit.", "The train has a 0 at the end of the sequence."], "image": "val2014/COCO_val2014_000000007096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375248, "question_id": "fGGy28qamUNNvafGyvGiqi", "question": "The number listed on the train is the same as the area code for which Canadian province?", "choices": ["ontario", "nova scotia", "manitoba", "british columbia"], "correct_choice_idx": 3, "direct_answers": ["british colombia", "ontario", "british columbia", "778", "british columbia", "alberta", "number 778", "british columbia", "british columbia", "british columbia"], "difficult_direct_answer": false, "rationales": ["British columbia has 778 as its area code.", "That area code belongs to british columbia.", "The number on the front of the train is 778, not 204, 416, or 902."], "image": "train2014/COCO_train2014_000000375248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54020, "question_id": "fGYQ7FV6MXrjjYuUFhjZUW", "question": "What type of event is this?", "choices": ["election", "inauguration", "competition", "fair"], "correct_choice_idx": 2, "direct_answers": ["farm", "sheep contest", "sheep show", "livestock show", "petting zoo", "sheep fair", "sheep event", "competition", "fair", "coat"], "difficult_direct_answer": true, "rationales": ["Due to the numbers on the peoples uniform, it is easy to surmise what is being taken place.", "People are kneeling down with their animals in order to present them to a judge.", "The event is a competition."], "image": "train2014/COCO_train2014_000000054020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133836, "question_id": "fGeA7xeU9rRVRHM8eysm9D", "question": "What location is the train parked in?", "choices": ["train depot", "desert", "horse stop", "taxi station"], "correct_choice_idx": 0, "direct_answers": ["train station", "station", "station", "station", "station", "train tracks", "train station", "station", "train depot", "train station"], "difficult_direct_answer": false, "rationales": ["It is next to a platform where passengers can board or disembark.", "Obviously a train would be parked in a train depot. it couldn't fit into a taxi station and the thought of a train in a horse stop is sort of amusing but not realistic. and there has never been a train in a desert.", "This is generally where trains stop."], "image": "val2014/COCO_val2014_000000133836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62209, "question_id": "fGrqCUAT7bQpHcMb7UBttX", "question": "Which bulletin board service is being used on the computer on the left?", "choices": ["compuserve", "prodigy", "shuimu tsinghua", "ptt"], "correct_choice_idx": 3, "direct_answers": ["craigslist", "proprietary", "dial up", "wikipedia", "coding", "forum", "announcements", "ptt", "unknown", "chinese board"], "difficult_direct_answer": true, "rationales": ["A black screen with rows of gray code is on a computer screen.", "The word ptt appears on the computer.", "A black screen with rows of light gray front are seen on a screen."], "image": "train2014/COCO_train2014_000000062209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205243, "question_id": "fH2ZV8ZacaqhEJSXVEBKFo", "question": "This marina was designed for what type of boats?", "choices": ["sail", "motor boat", "row boat", "yacht"], "correct_choice_idx": 0, "direct_answers": ["sail", "sail", "sail boats", "sailboats", "sailboats", "sail", "sailboat", "sailboats", "small boats", "yacht"], "difficult_direct_answer": false, "rationales": ["The marina is for sailboats.", "The boats are all sailboats.", "Almost all the boats have masts."], "image": "train2014/COCO_train2014_000000205243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514293, "question_id": "fHWqJ63AWiqu5FB7ssjyQX", "question": "What items are sold here that keep people driest?", "choices": ["shirts", "bottles", "umbrellas", "shoes"], "correct_choice_idx": 2, "direct_answers": ["umbrellas", "umbrella", "umbrellas", "umbrellas", "umbrellas", "umbrella", "umbrella", "umbrella", "umbrellas", "umbrellas"], "difficult_direct_answer": false, "rationales": ["Umbrellas are on display for sale. umbrellas are used to stay dry.", "That's what people use in the rain.", "There are umbrellas hanging on the door."], "image": "train2014/COCO_train2014_000000514293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269945, "question_id": "fHXw52xE6bjvtHwnbpShLv", "question": "How many colors of fire hydrants are there?", "choices": ["six", "five", "two", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "three", "one", "three", "two", "three", "three", "three", "four", "three"], "difficult_direct_answer": false, "rationales": ["There are four colors, red white and black and gray.", "By the count of the number.", "There are 4."], "image": "train2014/COCO_train2014_000000269945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548337, "question_id": "fHt65fVPTShCK56VaLXGwo", "question": "What is in the thing with coke?", "choices": ["fork", "straw", "pencil", "spoon"], "correct_choice_idx": 1, "direct_answers": ["straw", "straw", "straw", "straw", "ice", "straw", "cup", "soda", "straw", "straw"], "difficult_direct_answer": false, "rationales": ["There is a glass of coke with a straw in it.", "There is a straw in the glass.", "A cup with a clear, plastic item sticking out of it is on a table with food."], "image": "val2014/COCO_val2014_000000548337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525409, "question_id": "fHyPZAbrAWDyHoXi4Ah73g", "question": "What is this place?", "choices": ["rodeo", "amusement park", "airport", "daycare"], "correct_choice_idx": 1, "direct_answers": ["hobby garden", "airport", "miniature airport", "model airport", "garden", "miniature golf", "miniature land", "airport", "airport", "amusement park"], "difficult_direct_answer": false, "rationales": ["The place is an amusement park.", "There is an airplane on the ground here.", "Small airplanes, wind turbines and buildings are seen with adults and children walking about."], "image": "train2014/COCO_train2014_000000525409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476841, "question_id": "fJHswVkVr8HaMNG32ZgCGL", "question": "Why are the bikes on the poles?", "choices": ["lost", "for sale", "keep safe", "stolen"], "correct_choice_idx": 2, "direct_answers": ["security", "theft locks", "parked", "stored", "parking", "secure", "locked up", "locked", "locked up", "keep safe"], "difficult_direct_answer": true, "rationales": ["The bikes are temporarily attached to racks next to the poles, or to the poles themselves. each bike is attached independently of the others with a chain and lock, and a lock reduces risk of theft of the item that is locked up.", "They are there to keep them from being stolen.", "Most people in the city use poles to tie there bikes to."], "image": "val2014/COCO_val2014_000000476841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305858, "question_id": "fJLS55UzXwNfTFK3XqPqLX", "question": "The face of the namesake of this bridge is on which American dollar bill?", "choices": ["$20", "$100", "$50", "$5"], "correct_choice_idx": 1, "direct_answers": ["ten", "ten", "one hundred", "100", "one hundred", "franklin", "$100", "one hundred", "hundred probably", "one hundred"], "difficult_direct_answer": false, "rationales": ["He is known on the hundred bill.", "The bridge is the benjamin franklin bridge. lincoln in on $5 bill, grant is on the $50 bill, jackson is on the $20 bill, leaving the $100 bill for franklin.", "Ben franklin is referenced on a street sign. ben franklin is on the one hundred dollar bill."], "image": "train2014/COCO_train2014_000000305858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579952, "question_id": "fJMBWP2Ck6oFTGBtdbCH9n", "question": "What level game is being conducted here?", "choices": ["retired", "high school", "pro", "beginner"], "correct_choice_idx": 2, "direct_answers": ["professional", "professional tennis", "tennis", "professional", "pro", "professional", "tennis", "tennis", "professional", "professional"], "difficult_direct_answer": false, "rationales": ["The audience and the expensive court is designed for professional tennis players.", "The man is a pro athlete.", "It's a pro game."], "image": "train2014/COCO_train2014_000000579952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379044, "question_id": "fJg5vDH9XuSztHs575N4qA", "question": "What kind of fish kite does the man appear to be flying?", "choices": ["stingray", "seal", "shark", "dolphin"], "correct_choice_idx": 3, "direct_answers": ["shark", "shark", "shark", "orca", "shark", "shark", "orca", "dolphin", "whale", "orca"], "difficult_direct_answer": false, "rationales": ["The kite has a mammal tail.", "It is black with a little white like a whale", "The kite has a tail."], "image": "train2014/COCO_train2014_000000379044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111422, "question_id": "fJycQhiZUV3pSU9nHHcFTN", "question": "Where is the athlete positioned?", "choices": ["clearance", "service box", "center court", "sideline"], "correct_choice_idx": 0, "direct_answers": ["clearance", "tennis court", "field", "squat", "tennis court", "on court", "downward", "back court", "behind baseline", "backward"], "difficult_direct_answer": true, "rationales": ["The athlete wants to clear the ball.", "He is on the side line and not on the court.", "They are behind the serving line"], "image": "train2014/COCO_train2014_000000111422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229472, "question_id": "fKGZQscj5YHcD2watTqCYM", "question": "What type of vehicle is to the right of the motorcycle?", "choices": ["bus/van", "train", "plane", "skateboard"], "correct_choice_idx": 0, "direct_answers": ["truck", "mini bus", "bus", "bus/van", "bus", "pickup", "van", "van", "bus", "truck"], "difficult_direct_answer": false, "rationales": ["This is a type of van or bus that is white.", "The vehicle next to the motorcycle is a van.", "The vehicle is a bus or van."], "image": "train2014/COCO_train2014_000000229472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281679, "question_id": "fKNj2tHpo5YMCi7VzHnkhS", "question": "What establishment is located behind the people?", "choices": ["motel", "pub", "restaurant", "store"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "pizzeria", "restaurant", "pizzeria", "pizzeria", "restaurant"], "difficult_direct_answer": false, "rationales": ["People sit on the patio of a business that is behind them with a sign on the top. people sit outside when they eat at restaurants on nice days.", "They are dining at an outdoor patio.", "The establishment is a restaurant."], "image": "val2014/COCO_val2014_000000281679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517623, "question_id": "fKRWUQoWpPY2viNXkgJAdL", "question": "Why are there so many buses?", "choices": ["waiting", "backed up", "abandoned", "tourist destination"], "correct_choice_idx": 3, "direct_answers": ["tourists", "many passengers", "move people", "bus stop", "busy area", "tourists", "tourist destination", "traffic jam", "rush hour", "tourists"], "difficult_direct_answer": false, "rationales": ["There are double decker buses which are mostly ridden by tourists.", "Is a popular destination.", "Double decker buses are in the street. tourists use double decker buses to see the sights."], "image": "val2014/COCO_val2014_000000517623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550147, "question_id": "fKUKf4XWDpRgxRQWA2f4JU", "question": "Why is the child on the skateboard wearing a helmet?", "choices": ["protection", "fashion", "punishment", "camouflage"], "correct_choice_idx": 0, "direct_answers": ["safety", "safety", "safety", "safety", "protection", "safety", "protection", "safety", "safety", "protection"], "difficult_direct_answer": false, "rationales": ["People wear helmets for the purpose of protection and especially when engaging in skateboarding and if they are at a beginner level which a child this young would likely be.", "The child is being protected.", "A helmet will stop the child from getting hurt if they were to fall off of the board."], "image": "train2014/COCO_train2014_000000550147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108944, "question_id": "fKVUcoZAYGFACHfa7zcXZL", "question": "Which vegetable here is more likely to bring tears while preparing?", "choices": ["tomatoes", "cucumber", "squash", "onion"], "correct_choice_idx": 3, "direct_answers": ["onions", "garlic", "onion", "onion", "chillies", "onion", "onion", "onion", "onion", "onion"], "difficult_direct_answer": false, "rationales": ["The veggie is an onion.", "Onions make people cry.", "The onion is likely to bring tears when cut open."], "image": "train2014/COCO_train2014_000000108944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518649, "question_id": "fKkLtBSpsD3eqT8SsKprsX", "question": "Which location would be quickest to get a big gulp if you are waiting at this corner?", "choices": ["kfc", "chevron", "7 eleven", "pharmacy"], "correct_choice_idx": 2, "direct_answers": ["seven eleven", "7eleven", "seven eleven", "7-eleven", "7eleven", "seven eleven", "seven eleven", "seven eleven", "7 eleven", "7 eleven"], "difficult_direct_answer": false, "rationales": ["The big gulp is a staple of this quick stop shop.", "This quick stop shop is known for the big-gulp.", "Seven eleven serves big gulps."], "image": "train2014/COCO_train2014_000000518649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81054, "question_id": "fKwNKAyHpqhYaL5Ce53eMM", "question": "What brand of truck is this?", "choices": ["toyota", "honda", "chevy", "kia"], "correct_choice_idx": 2, "direct_answers": ["car", "chevrolet", "chevrolet", "ford", "gmc", "unknown", "chevy", "chevrolet", "chevy", "chrysler"], "difficult_direct_answer": false, "rationales": ["There is a bowtie brand on the side of vehicle.", "The brand is a chevy.", "The logo on the side is indicative of the specific car brand."], "image": "val2014/COCO_val2014_000000081054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167683, "question_id": "fL6jL7RiUu6gbN4w9DAT4p", "question": "What sort of specialty business is this?", "choices": ["philly cheesesteak", "fried chicken", "pizza hut", "sandwich shop"], "correct_choice_idx": 3, "direct_answers": ["sandwich shop", "sandwich shop", "sandwich shop", "deli", "sandwich shop", "deli", "deli", "sandwiches", "deli", "sub shop"], "difficult_direct_answer": false, "rationales": ["The counter has many sandwiches on it which the woman has made, and the display is filled with sandwich items, indicating this is a shop to purchase sandwiches.", "It is a sandwich shop because there is bread, meats and condiments", "The place sells sandwiches."], "image": "train2014/COCO_train2014_000000167683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414852, "question_id": "fLJMnkQj4mpcvnhjn2GYxt", "question": "What is found on the floor?", "choices": ["cat", "soda can", "cow", "dollar bill"], "correct_choice_idx": 1, "direct_answers": ["tiles", "can", "soda can", "can", "tile patterns", "soda can", "tile", "boys", "trash", "wood pieces"], "difficult_direct_answer": false, "rationales": ["There is an object visible on the ground that is of the same size of a can and features most of the elements consistent with being a can.", "It is cylindrical and aluminum", "An empty aluminum can is visible on the ground in this city scene."], "image": "val2014/COCO_val2014_000000414852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106624, "question_id": "fLStpCqg5T5Fi2k2ufJgzy", "question": "What type climate do the horses walk in?", "choices": ["tundra", "desert", "snow", "tropical"], "correct_choice_idx": 3, "direct_answers": ["tropical", "warm", "tropical", "warm", "tropical", "clear weather", "tropical", "hot", "temperate", "tropical"], "difficult_direct_answer": false, "rationales": ["There is a beach and tropical trees on the beach where the horses are walking indicating the environment.", "There are palm trees and a beach", "The climate is tropical."], "image": "val2014/COCO_val2014_000000106624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537280, "question_id": "fM9Uw8eMrW9eM5dQshucc3", "question": "What are the child's boots made from?", "choices": ["leather", "marshmallows", "plastic", "wood"], "correct_choice_idx": 2, "direct_answers": ["nylon", "plastic", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "rubber", "plastic"], "difficult_direct_answer": false, "rationales": ["The other options wouldn't hold up to the rain and water well.", "The boots are being deployed in the rain where a waterproof material would be most likely. answer a is a waterproof material that is used for rainproof gear.", "The child's boots are plastic rubber."], "image": "val2014/COCO_val2014_000000537280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224140, "question_id": "fMFR7KWGL27pJDBG9TrGRH", "question": "Dark condition is due to the absence of what?", "choices": ["proton", "electron", "neutron", "photons"], "correct_choice_idx": 3, "direct_answers": ["sun", "light", "light", "sun", "light", "light", "sunlight", "sun", "sun", "photons"], "difficult_direct_answer": false, "rationales": ["When there are no photons it turns dark.", "They are the particles in light.", "The darkness is from photons."], "image": "train2014/COCO_train2014_000000224140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260182, "question_id": "fMMcy3Wv7yEscrkDR27aVJ", "question": "In what setting is this street scene?", "choices": ["rural", "urban", "farm", "suburban"], "correct_choice_idx": 1, "direct_answers": ["evening", "bar", "night", "urban", "downtown", "downtown", "downtown", "rainy night", "urban", "urban"], "difficult_direct_answer": false, "rationales": ["The street scene is urban as there are many buildings.", "The nearby buildings is a clear sign that this is an urban setting.", "It's a business district which is generally urban."], "image": "val2014/COCO_val2014_000000260182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472003, "question_id": "fMZjmscjFAR5B5eHH9FX6T", "question": "Why are they all leaning forward?", "choices": ["grabbing food", "selling food", "returning food", "making mess"], "correct_choice_idx": 0, "direct_answers": ["grabbing food", "serving food", "get food", "serving", "getting food", "feeding frenzy", "grabbing food", "getting food", "pickup food", "food"], "difficult_direct_answer": false, "rationales": ["They get food.", "They are serving themselves dinner.", "The people are surrounding a table that is covered in food. in order to pick up food for the main purpose of eating one would have to lean to reach all items in this particular setup."], "image": "train2014/COCO_train2014_000000472003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66412, "question_id": "fMeKArgm22vSSbCrVc9ajg", "question": "What is this guy planning to do?", "choices": ["paragliding", "skateboarding", "snowboarding", "skiing"], "correct_choice_idx": 2, "direct_answers": ["sue", "snowboard", "snowboard", "snowboard", "snowboard", "snowboarding", "snowboard", "snowboarding", "snowboard", "snowboarding"], "difficult_direct_answer": false, "rationales": ["There is a single wide board with two attached boots.", "You can tell by the shape of the board and what he is wearing.", "There is a man standing with a jacket on holding a long board with foot holders."], "image": "val2014/COCO_val2014_000000066412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65705, "question_id": "fMoMHsSUgBz4bZ8KmUJAZj", "question": "What is the genre of movie named on the side of the bus?", "choices": ["animated", "documentary", "horror", "romance"], "correct_choice_idx": 2, "direct_answers": ["horror", "horror", "horror", "horror", "horror", "horror", "horror", "horror", "horror", "horror"], "difficult_direct_answer": false, "rationales": ["The movie shown is texas chainsaw which is horror.", "Texas chainsaw is a horror movie.", "The genre is horror."], "image": "train2014/COCO_train2014_000000065705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293453, "question_id": "fN3ujizWr84V7h6e8hqTgk", "question": "What most likely happened before this?", "choices": ["shopping", "biking", "swimming", "hiking"], "correct_choice_idx": 0, "direct_answers": ["packed", "packing", "relocation", "packing", "packing", "packing", "packing", "packing", "shopping", "shopping"], "difficult_direct_answer": false, "rationales": ["Shopping bags are displayed.", "The people went shopping.", "The vehicle contains bags from stores."], "image": "train2014/COCO_train2014_000000293453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332943, "question_id": "fN6iesaygyMA9SdiFTyhKs", "question": "What type of animal is this?", "choices": ["wild", "reptile", "stuffed", "domestic"], "correct_choice_idx": 2, "direct_answers": ["bear", "stuffed", "teddy bear", "bear", "monkey", "teddy bear", "bear", "bear", "teddy bear", "bear"], "difficult_direct_answer": false, "rationales": ["The bear is filled with fluff.", "It is a bear that has been filled with soft material.", "The animal is stuffe.d"], "image": "train2014/COCO_train2014_000000332943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258155, "question_id": "fNjW92eKcmUMcihoVTmUv9", "question": "What may be hanging overhead of the bed on the wall?", "choices": ["air conditioner", "movie screen", "quilt rack", "printer"], "correct_choice_idx": 0, "direct_answers": ["air conditioner", "vent", "air conditioner", "air conditioner", "light", "canopy", "air vent", "a/c", "air conditioner", "air conditioner"], "difficult_direct_answer": false, "rationales": ["There is a device with a vent on it", "This bedroom would need to be cooled and heated so this would be the source of the warmed or cooled air flow.", "The air conditioning vent is on the wall."], "image": "train2014/COCO_train2014_000000258155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300454, "question_id": "fNkpMJcukKq9ihiV3Ly5RE", "question": "What kind of vehicle is the woman travelling on?", "choices": ["motorbike", "airplane", "train", "car"], "correct_choice_idx": 2, "direct_answers": ["train", "train", "bus", "bus", "bus", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["The vehicle is a train.", "This vehicle's way of transportation is through tracks.", "It is a large vehicle that runs on rails not wheels."], "image": "train2014/COCO_train2014_000000300454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281102, "question_id": "fNnyPa73UwVMegNtN37W8E", "question": "What are the people at the columnar kiosk paying for?", "choices": ["subway ride", "christmas gifts", "fines", "parking space"], "correct_choice_idx": 3, "direct_answers": ["parking", "parking", "parking", "parking space", "parking", "parking", "parking", "parking", "parking", "parking"], "difficult_direct_answer": false, "rationales": ["The people are at a parking meter.", "There is a p for parking.", "There is a sign reminding to pay the parking meter."], "image": "val2014/COCO_val2014_000000281102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148983, "question_id": "fNxNS9TcgNrPKAocjVkfCn", "question": "Which shapes make the best kites?", "choices": ["bow", "hybrid", "delta", "foil"], "correct_choice_idx": 2, "direct_answers": ["triangle", "triangular shape", "triangle", "diamonds", "delta", "cobra", "playing", "triangular shape", "diamond", "quadrilaterals"], "difficult_direct_answer": false, "rationales": ["Deltas make the best kites given their nimble shape.", "A dental shape flies the best in the sky", "A kite in the shape of a triangle is in the air."], "image": "train2014/COCO_train2014_000000148983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329469, "question_id": "fPDNTrTtMFSa3L9CMsieGA", "question": "What is the man on the right riding?", "choices": ["scooter", "quad", "bicycle", "motorcycle"], "correct_choice_idx": 1, "direct_answers": ["bike", "bike", "atv", "riding atv", "quad", "motorcycle", "atv", "atv", "trick", "four-wheeler"], "difficult_direct_answer": false, "rationales": ["The man is in a quad.", "The man on the right is riding a quad bike.", "The man on the right is riding a quad since there are four wheels."], "image": "train2014/COCO_train2014_000000329469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395269, "question_id": "fPZjLwpismyxie6NkhHAJp", "question": "How much calorie intake in kcal is there for eating three of these buns?", "choices": ["670", "1446", "964", "850"], "correct_choice_idx": 1, "direct_answers": ["1446 kcal", "600", "600", "six hundred", "one thousand", "600", "1446", "375", "1446", "six hundred"], "difficult_direct_answer": false, "rationales": ["That is how much calories that would be from eating three buns.", "The sign says there are 200 calories in one.", "Buns are high in calories."], "image": "train2014/COCO_train2014_000000395269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245978, "question_id": "fPgnvaoJYTvN8RmSGt97gy", "question": "The meal seen here is most likely served as which?", "choices": ["breakfast", "lunch", "dinner", "supper"], "correct_choice_idx": 0, "direct_answers": ["breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast", "breakfast"], "difficult_direct_answer": false, "rationales": ["This meal is served most likely as a breakfast.", "Sausage, eggs, pastries, fruits are all foods associated with the morning time and also present on this plate.", "There are eggs, sausage and fruit which are morning meal foods."], "image": "train2014/COCO_train2014_000000245978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462211, "question_id": "fQAbWDjS6uQ366XsMJBRGu", "question": "What script is that?", "choices": ["thai", "australian", "vietnamese", "japanese"], "correct_choice_idx": 0, "direct_answers": ["middle east", "thai", "dont turn", "arabic", "thai", "hindu", "arabic", "unknown", "russian", "thailand"], "difficult_direct_answer": false, "rationales": ["The text is in a language that's known as thai.", "The language is thai.", "The distinctive shapes can be identified as thai writing."], "image": "val2014/COCO_val2014_000000462211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209733, "question_id": "fQPbWSxsvDsounKhRqRdVo", "question": "What does the man in red hold in his hands?", "choices": ["bait", "kite strings", "food", "nothing"], "correct_choice_idx": 1, "direct_answers": ["parachute", "kite", "kite strings", "whilete", "string", "kite", "kite string", "kite", "line", "kite"], "difficult_direct_answer": false, "rationales": ["Due to the purple kite above him you can easily tell what he is holding.", "The man is flying a kite.", "There is a device that flies in the air that people hold in the air with string."], "image": "val2014/COCO_val2014_000000209733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422782, "question_id": "fQophWJXf5MyQv37fDrHJk", "question": "The woman is reading a catalog from which brand?", "choices": ["baby", "carriage", "pram", "brio"], "correct_choice_idx": 3, "direct_answers": ["brio", "brio", "brio", "brio", "brio", "brio", "brio", "brio", "brio", "brio"], "difficult_direct_answer": false, "rationales": ["The woman is reading from the brio catalog.", "The name is on the front.", "The brand is written on the front of the catalog."], "image": "train2014/COCO_train2014_000000422782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417761, "question_id": "fR5vUhkLc59XFpyU9FXNqz", "question": "What is the geological rock formation called?", "choices": ["bump out", "rock out", "stick out", "outcrop"], "correct_choice_idx": 3, "direct_answers": ["mountain", "cliff", "outcrop", "mountain", "peak", "mountain", "mountain", "mountains", "mountain", "mountain"], "difficult_direct_answer": false, "rationales": ["In the foreground there is a visible exposure of bedrock. this is also referred to as an outcrop.", "Rock formations visible on the surface is called an outcrop.", "The rocks are sticking out of the snow.."], "image": "train2014/COCO_train2014_000000417761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287601, "question_id": "fRFrdTLAbPLacXszz4V3Lt", "question": "Which vehicle seen here would help someone stay drier in water when in use?", "choices": ["pickup", "rv", "boat", "truck"], "correct_choice_idx": 2, "direct_answers": ["boat", "rv", "transport", "boat", "boat", "boat", "boat", "transport", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["A boat travels through water and keeps people dry.", "It allows people to travel through the water without having to get in and swim.", "The watercraft will float easily on the water keeping everyone dry, unlike the other vehicles which would sink."], "image": "train2014/COCO_train2014_000000287601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480843, "question_id": "fRHsHjvZwEmWbHY7vfvHbF", "question": "What is the object on the stand next to the brown sofa?", "choices": ["speaker", "table", "shelf", "plant pot"], "correct_choice_idx": 0, "direct_answers": ["speaker", "coffee cup", "speaker", "nutcracker", "speaker", "lamp", "headphones", "television", "speaker", "speaker"], "difficult_direct_answer": false, "rationales": ["You can tell by the shape of the box, color and the metal pole sticking into it as to what it is.", "There is a speaker next to the couch.", "It is a box shape and has wires coming out the back"], "image": "train2014/COCO_train2014_000000480843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326583, "question_id": "fRXNrsinLicoB7DTnvBv2e", "question": "What type of transportation is shown?", "choices": ["air", "water", "rail", "road"], "correct_choice_idx": 3, "direct_answers": ["bus", "road", "walking", "bus", "bus", "public", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["The vehicle is a bus. it cannot travel on tracks, fly, or float.", "It is a bus that has tires for driving on paved surfaces.", "A street is shown."], "image": "train2014/COCO_train2014_000000326583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412419, "question_id": "fRbhrYcQdq4vg4mrSNT48C", "question": "What is the gear called that the umpire is wearing on his face?", "choices": ["glasses", "goggles", "umpire mask", "binoculars"], "correct_choice_idx": 2, "direct_answers": ["mask", "face mask", "mask", "mask", "helmet", "umpire mask", "guard", "umpire mask", "face mask", "catcher's mask"], "difficult_direct_answer": false, "rationales": ["The man is wearing the mask to protect himself.", "The gear is the umpire mask.", "It covers his face and protects him from getting hit with the ball or the bat."], "image": "val2014/COCO_val2014_000000412419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13944, "question_id": "fTA4TugpK2mDRDdqUNZwrP", "question": "What sort of repairs in the lifted person doing?", "choices": ["animal remediation", "pothole", "electrical", "road sign"], "correct_choice_idx": 2, "direct_answers": ["traffic light", "traffic light", "electrical", "traffic lights", "traffic lights", "electrical", "electical", "crane", "electrical", "electric repairs"], "difficult_direct_answer": false, "rationales": ["The traffic lights run on electricity.", "A person is in a lift and is facing a street light. electricians work on street lights.", "They are working on a traffic light, which is run by eletricity."], "image": "train2014/COCO_train2014_000000013944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428816, "question_id": "fTChqxyfUxCotkGcQcH3Pe", "question": "Why are they holding umbrellas?", "choices": ["fashionable", "showing off", "is raining", "posing"], "correct_choice_idx": 2, "direct_answers": ["raining", "rain", "is raining", "rain", "rainy", "rain", "cloudy", "raining", "rain", "raining"], "difficult_direct_answer": false, "rationales": ["There are many people holding umbrellas and a fine mist is in the background.", "The sky looks grey and rainy, and they are also wearing rainy weather attire.", "People are standing in an overcast area with umbrellas. umbrellas are used when it is raining."], "image": "train2014/COCO_train2014_000000428816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300392, "question_id": "fU4xM58NsYiMjNdXPSRfrZ", "question": "These portable toilets are located closest to which major city?", "choices": ["new york", "chicago", "los angeles", "miami"], "correct_choice_idx": 3, "direct_answers": ["unknown", "wellington", "miami", "denver", "boston", "miami", "unknown", "chicago", "denver", "london"], "difficult_direct_answer": false, "rationales": ["These portable toilets are located in miami because it says so on the sign", "The place is miami.", "We can tell these are the property of a miami business, because the phone number on the doors begins with \"305\", which is a miami area code. with this many facilities set up, it looks to be a huge event coming to town!."], "image": "train2014/COCO_train2014_000000300392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561190, "question_id": "fUE4AWpStUNYLtSKn6qkoP", "question": "What is everyone doing standing with remotes?", "choices": ["changing television", "power point", "video gaming", "calisthenics"], "correct_choice_idx": 2, "direct_answers": ["gaming", "playing games", "playing wii", "playing games", "playing videogames", "video gaming", "playing game", "playing videos", "playing wii", "playing game"], "difficult_direct_answer": false, "rationales": ["People are gaming.", "The remotes look like wii remotes based on their color, style and size. when people are motioning wii controllers and all looking in the same direction they are likely playing video games.", "People are holding remotes and facing the television while standing. people like to watch and play video games."], "image": "train2014/COCO_train2014_000000561190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176765, "question_id": "fUb6CyNHWYEbuKKuQjc86p", "question": "What kind of store is most likely responsible for the red advertisement on the side of the bus?", "choices": ["electronics", "sporting", "office supplies", "jewelry"], "correct_choice_idx": 3, "direct_answers": ["jewelry store", "jewelry", "jewelry", "jewelry store", "jewelry", "jewelry", "jewelry", "jewelry", "jewelry", "jewelry"], "difficult_direct_answer": false, "rationales": ["The picture shows a diamond, and diamonds are typically used in jewelry.", "Jewelry is being advertised.", "The advertisement on the side of the bus has pieces of jewelry on it and it's probably an ad for a jewelry store."], "image": "train2014/COCO_train2014_000000176765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207215, "question_id": "fUcxQPBq4LAiTi64JSsBfE", "question": "How is this blender powered?", "choices": ["bicycle", "it isn't", "gas", "electricity"], "correct_choice_idx": 0, "direct_answers": ["bike", "biking", "bike power", "using bicycle", "riding bike", "crank", "bicycle", "bicycle", "bike", "bicycle"], "difficult_direct_answer": false, "rationales": ["The lady is riding a bike to power it.", "The blender has a bike.", "The blender is powered by a bicycle motor."], "image": "train2014/COCO_train2014_000000207215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495376, "question_id": "fUtUTiQG7mfETMPNVSjEd7", "question": "Why are the woman leaning against the snow pile?", "choices": ["doing tricks", "making snow-angels", "to rest", "to tan"], "correct_choice_idx": 2, "direct_answers": ["rest", "resting", "to rest", "sit down", "resting", "resting", "for rest", "relaxation", "sitting", "resting"], "difficult_direct_answer": false, "rationales": ["The women are idle and taking a break from skiing.", "People on skis are leaning against a pile of snow. people lean on things to rest.", "It appears as if they are cross-country skiing, which exerts energy. the snow pile is a good place to take a rest."], "image": "val2014/COCO_val2014_000000495376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570111, "question_id": "fV2yDc8Kp9JUAKrcKaFopF", "question": "What service does the vehicle with the purple letters provide?", "choices": ["deliveries", "groceries", "alcohol", "security"], "correct_choice_idx": 0, "direct_answers": ["package delivery", "postal", "package delivery", "delivery", "transport", "mail delivery", "delivery", "packages", "deliveries", "package delivery"], "difficult_direct_answer": false, "rationales": ["The vehicle says fed-ex which is a delivery service.", "Fedex delivers packages.", "This is a fedex truck."], "image": "train2014/COCO_train2014_000000570111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249805, "question_id": "fVBxENvckncVzWkHWU9NhE", "question": "What tire brand is seen on the back of the t-shirt?", "choices": ["hankook", "pirelli", "dunlop", "bridgestone"], "correct_choice_idx": 1, "direct_answers": ["pirelli", "pirelli", "general", "pirelli", "firelli", "viral", "minelli", "petrelli", "pirelli", "tennis"], "difficult_direct_answer": false, "rationales": ["The brand's name appears above the number.", "Pirelli is advertised.", "You can see pirelli on the back of their shirts."], "image": "train2014/COCO_train2014_000000249805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406139, "question_id": "fVCdnRPeQY5i5p2oeHtBCV", "question": "Why is the woman holding an umbrella?", "choices": ["block sun", "staying dry", "block wind", "to dance"], "correct_choice_idx": 1, "direct_answers": ["it's raining", "staying dry", "rain", "rain", "its drizzling", "rain", "staying dry", "raining", "block rain", "rain"], "difficult_direct_answer": false, "rationales": ["She is trying not to get wet from the rain.", "The woman wants to stay dry.", "The woman doesn't want to get wet."], "image": "train2014/COCO_train2014_000000406139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501311, "question_id": "fVCrBorXvbCrZC5fiefimE", "question": "What is the best possible outcome for the pitcher in this situation?", "choices": ["hit", "walk", "strike out", "home run"], "correct_choice_idx": 2, "direct_answers": ["strike out", "win", "strike", "strike", "strike", "strike out", "strike", "strike", "strike", "strike"], "difficult_direct_answer": false, "rationales": ["The pitcher wants to strike out the batter.", "A pitcher's only goal is to get an out. if a player gets a walk, a homerun, or a hit, then the pitcher failed to get an out.", "He will get a strike out"], "image": "val2014/COCO_val2014_000000501311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454148, "question_id": "fVPaveNmAJwacuJr3k6MhH", "question": "What is the man doing?", "choices": ["watching film", "finding friend", "speaking", "singing"], "correct_choice_idx": 3, "direct_answers": ["talking", "singing", "singing", "speaking", "talking", "singing", "singing", "speaking", "speaking", "speaking"], "difficult_direct_answer": false, "rationales": ["The man is standing in front of a microphone.", "The man is singing.", "A man is standing in front of a microphone. people use a microphone to speak to others."], "image": "val2014/COCO_val2014_000000454148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446984, "question_id": "fVQdsH7rQy33H5sDbnXchc", "question": "What is the name for the large silver object?", "choices": ["scissors", "knife", "spoon", "fork"], "correct_choice_idx": 0, "direct_answers": ["scissors", "scissor", "blades", "scissors", "blades", "running scissors", "scissors", "scissors", "scissors", "scissors"], "difficult_direct_answer": false, "rationales": ["There is a giant pair of scissors.", "That is the name of the tool.", "The sculpture attached in this image has two holes for fingers and shears on a hinge."], "image": "val2014/COCO_val2014_000000446984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553812, "question_id": "fVZTMf7E2d5qvGjnubd6hV", "question": "What is the dog's paw resting on?", "choices": ["apple", "keyboard", "bone", "cat"], "correct_choice_idx": 1, "direct_answers": ["computer", "laptop", "computer", "computer", "trackpad", "laptop", "laptop keyboard", "laptop", "working", "keyboard"], "difficult_direct_answer": false, "rationales": ["A dog sits in front a laptop with a paw on the keyboard.", "The dog is standing on a laptop computer and the part where its paw is, is known as the keyboard.", "The dog has its paw on the keyboard of a laptop."], "image": "train2014/COCO_train2014_000000553812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247660, "question_id": "fVkg7cXrTKpkVxex5BeUrK", "question": "What are they doing?", "choices": ["stealing balls", "leaving", "arguing", "golfing"], "correct_choice_idx": 3, "direct_answers": ["walking", "walking", "golfing", "golfing", "golfing", "golfing", "golfing", "walking", "golfing", "walking"], "difficult_direct_answer": false, "rationales": ["They are all golfing on the green.", "They're golfing.", "The people are on a golf course."], "image": "train2014/COCO_train2014_000000247660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69583, "question_id": "fVmnnZE9WhVVucM9vcyWTd", "question": "What is behind the pole?", "choices": ["bench", "newspaper", "trash can", "flower"], "correct_choice_idx": 2, "direct_answers": ["trash can", "tree", "trash can", "trees", "trash can", "trash can", "trash can", "tree", "garbage can", "garbage can"], "difficult_direct_answer": false, "rationales": ["A trash can is behind the pole.", "It is where you throw away garbage", "There is a metal trash can behind the sign."], "image": "train2014/COCO_train2014_000000069583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195437, "question_id": "fVsBExSvr83S22p4ueAMvf", "question": "Clothing items are hung here for which purpose?", "choices": ["warning", "signaling", "drying", "sale display"], "correct_choice_idx": 2, "direct_answers": ["drying", "dry out", "swimming", "dry them", "to dry", "dry", "to dry", "drying", "for style", "dry them"], "difficult_direct_answer": false, "rationales": ["These items are on a boat and by water where they can get wet. the sun is out and it is a hot day to hanging them over the side of the boat would dry them in the air.", "Wet clothing can become un-wet by hanging on a line and allowing air to circulate around and through them.", "They are on the water, so the clothing probably got wet. hanging them in the sun would help them dry faster."], "image": "val2014/COCO_val2014_000000195437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182874, "question_id": "fVtA7PT57JNukbvZHnBZHX", "question": "What is the burgundy lane in the road used for?", "choices": ["carpooling", "emergency vehicles", "buses", "vehicle breakdowns"], "correct_choice_idx": 2, "direct_answers": ["bus", "bus", "bus", "bus", "bus", "buses", "bus traffic", "bus lane", "bus lane", "buses"], "difficult_direct_answer": false, "rationales": ["The lane says bus lane.", "The burgundy lane is a bus lane.", "You can tell by what is written on the road as to what vehicle it is used for."], "image": "val2014/COCO_val2014_000000182874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115370, "question_id": "fW5ZVfqw2VRMd4i4pgvqeW", "question": "What do the people seated by the road await?", "choices": ["racing", "nothing", "parade", "arrest"], "correct_choice_idx": 2, "direct_answers": ["race", "parade", "parade", "parade", "watch", "parade", "parage", "parade", "parade", "parade"], "difficult_direct_answer": false, "rationales": ["There are people in a motorcade with people standing on sides of road.", "There are several people lining the road, leaving the street empty. there are motorcycle police, which usually head a parade.", "The people are in a parade."], "image": "val2014/COCO_val2014_000000115370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458309, "question_id": "fWHwJDUKAPC6G2sBRmCBDw", "question": "Who does this gear on the beach belong to?", "choices": ["shore waste", "school kids", "bikini models", "lifeguard"], "correct_choice_idx": 3, "direct_answers": ["lifeguards", "lifeguard", "lifeguard", "lifeguard", "lifeguards", "lifeguards", "lifeguard", "life guard", "lifeguard", "lifeguard"], "difficult_direct_answer": false, "rationales": ["The red color indicates the items are used by the lifeguards on the beach.", "You can tell by the color and the symbol as to whom the gear belongs too.", "The gear belongs to someone who rescues people from the water."], "image": "train2014/COCO_train2014_000000458309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268251, "question_id": "fWRWzMrKecVbdj64ftuKYE", "question": "Who is this lady likely to be?", "choices": ["resident", "policewoman", "tourist", "car driver"], "correct_choice_idx": 2, "direct_answers": ["tourist", "woman", "tourist", "tourist", "woman", "tourist", "tourist", "tourist", "tourist", "artiest"], "difficult_direct_answer": false, "rationales": ["The woman is walking, not driving. she is holding a camera and is looking at an informational sign.", "She is dressed in civilian clothing, and is looking at the buildings. she is also holding a camera, which is typical for a tourist.", "The woman is interested in the building and holding a camera."], "image": "train2014/COCO_train2014_000000268251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32270, "question_id": "fWVgVEpRGZNX4W6xVT4jaR", "question": "What is the position of the man who is standing?", "choices": ["umpire", "pitcher", "catcher", "coach"], "correct_choice_idx": 0, "direct_answers": ["umpire", "umpire", "umpire", "umpire", "umpire", "umpire", "umpire", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["He is an umpire.", "He is watching the play closely to enforce the rules.", "Homeplate at a baseball diamond shows a person in a black uniform behind the plate. the umpire stands behind the plate in order to referee the game."], "image": "train2014/COCO_train2014_000000032270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254432, "question_id": "fWhmCSW43j4sdhfLwXBemf", "question": "What are the elephants doing?", "choices": ["fighting", "talking", "playing", "dancing"], "correct_choice_idx": 2, "direct_answers": ["playing", "mating", "playing", "playing", "fighting", "drinking water", "getting wet", "crossing river", "fighting", "playing"], "difficult_direct_answer": false, "rationales": ["They are wet and one is getting out of the water", "They seem to be using their trunks to have fun with each other.", "We can tell by the animals body language and trunk positions that they are playing."], "image": "train2014/COCO_train2014_000000254432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355168, "question_id": "fWqPpJUZQtf8rVgz2JTLSt", "question": "What is his favorite maker of athletic apparel?", "choices": ["nike", "new balance", "puma", "adidas"], "correct_choice_idx": 0, "direct_answers": ["nike", "nike", "nike", "nike", "wilson", "all favorite", "nike", "unknown", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["The check mark is from nike", "His sneakers have the check mark logo.", "The maker is nike."], "image": "val2014/COCO_val2014_000000355168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491225, "question_id": "fXC9eAcaDVtCiAjuymLZtp", "question": "Who is that on top of the airplane?", "choices": ["acrobat", "pilot", "instructor", "dancer"], "correct_choice_idx": 0, "direct_answers": ["person", "person", "stunt artist", "acrobat", "daredevil", "performer", "man", "person", "pilot", "acrobat"], "difficult_direct_answer": false, "rationales": ["A person is posing on a plane. acrobats perform on top of planes.", "The acrobat is on top.", "An acrobat is flying on the top of the plane."], "image": "train2014/COCO_train2014_000000491225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237772, "question_id": "fXNeWRfKyJyLSZukhobxi4", "question": "What is the cat near?", "choices": ["dog", "egg", "pillows", "goat"], "correct_choice_idx": 2, "direct_answers": ["blanket", "computer", "pillows", "pillows", "laptop", "computer", "laptop", "american bobtail", "laptop", "throw pillows"], "difficult_direct_answer": false, "rationales": ["The cat is visible on the chair and answer a is clearly visible behind it. the are almost certainly pillows based on their shape and size and the placement on the chair.", "There are several on the chair for comfort", "He is near the pillows because it is lying on the sofa"], "image": "val2014/COCO_val2014_000000237772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51674, "question_id": "fXbRmHQpwWKsbXX95RiZ3J", "question": "Where is the boat on the left moving?", "choices": ["away", "left", "towards", "right"], "correct_choice_idx": 0, "direct_answers": ["port side", "forward", "upstream", "up river", "away", "opposite", "downstream", "upriver", "away", "down stream"], "difficult_direct_answer": true, "rationales": ["It's also moving c of the other boat.", "The boat is being propelled into the distance.", "The engine is on as it most past"], "image": "val2014/COCO_val2014_000000051674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516993, "question_id": "fXs7dMtRibKZN3meu4gKxC", "question": "What did he use to get his hair that color?", "choices": ["dye", "mustard", "crayon", "juice"], "correct_choice_idx": 0, "direct_answers": ["dye", "use dye", "bleach", "bleach", "bleach", "dye", "hair dye", "bleach", "dye", "bleach"], "difficult_direct_answer": false, "rationales": ["You can change your hair with dye.", "The boy dyed his hair blonde.", "The man appears to have dark hair underneath and blonde hair on top. this type of hair style is commonly associated with bleaching or at least using some product like a dye to get a different color of hair than what would naturally appear."], "image": "train2014/COCO_train2014_000000516993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560064, "question_id": "fXy35jm3JVh7J4A8mbBrtF", "question": "What kind of shop is in the background?", "choices": ["cake shop", "doughnut shop", "souvenir ship", "restaurant"], "correct_choice_idx": 1, "direct_answers": ["doughnut", "doughnut", "donut shop", "doughnut shop", "doughnut", "bakery", "donut", "donut", "donut shop", "doughnut"], "difficult_direct_answer": false, "rationales": ["The shop sells donuts.", "We can tell by the huge doughnut that says \"doughnuts\" on it that the building behind the people is indeed a doughnut shop.", "There is a sign above the people sitting on the bench."], "image": "val2014/COCO_val2014_000000560064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568403, "question_id": "fYBQKBdksoLDp6Py2HnJa3", "question": "Which Olympic Division are they likely competing in?", "choices": ["summer", "special", "winter", "demonstration"], "correct_choice_idx": 2, "direct_answers": ["winter", "skiing", "skating", "winter", "biathalon", "skiing", "skiing", "slalom skiing", "biathlon", "cross ski"], "difficult_direct_answer": false, "rationales": ["There are two people skiing down a snowy slope.", "Skiing takes place in the winter time.", "The division is winter."], "image": "train2014/COCO_train2014_000000568403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339543, "question_id": "fYVNe2MpDsznt7xs8d7ReJ", "question": "Why might the air be thinner to breath?", "choices": ["holding breath", "smoke", "goggles", "high elevation"], "correct_choice_idx": 3, "direct_answers": ["higher elevation", "high altitude", "elevation", "elevation", "altitude", "high altitude", "air", "cold temperatures", "altitude", "high elevation"], "difficult_direct_answer": false, "rationales": ["The mountains in the background imply that this is the case.", "The higher you are on the mountain, the thinner the air is.", "The higher you go in elevation the thinner it will be to breath."], "image": "train2014/COCO_train2014_000000339543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2823, "question_id": "fYVzpF9tEhhpMmbsb82Ryg", "question": "What type of vehicle are these?", "choices": ["truck", "motorcycle", "helicopter", "bicycle"], "correct_choice_idx": 1, "direct_answers": ["motorcycles", "motorcycles", "motorcycle", "motorcycle", "motorcycles", "motorcycles", "motorcycle", "motorcycles", "motorcycles", "motorcycle"], "difficult_direct_answer": false, "rationales": ["These motor vehicles have two wheels a seat and handlebars.", "This is a two wheel motorized vehicle which would indicate a motorcycle.", "They have two wheels and a motor"], "image": "train2014/COCO_train2014_000000002823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105027, "question_id": "fYZTt8TiErYiSeiHyNs2b9", "question": "What happens to the image?", "choices": ["blurred", "too bright", "too dark", "photoshopped"], "correct_choice_idx": 3, "direct_answers": ["photoshopped", "colored apple", "turns color", "darkened", "colorized", "color adjustment", "color pops", "filter", "fades", "blackened"], "difficult_direct_answer": true, "rationales": ["The image has emphasized the apple with photoshop.", "The color is unnatural as the apple has color but the human does not.", "Only one part is in color"], "image": "train2014/COCO_train2014_000000105027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463342, "question_id": "fYgdZFqMKGishUfANgKAUa", "question": "In which country does this train stop here?", "choices": ["mexico", "spain", "united states", "england"], "correct_choice_idx": 2, "direct_answers": ["usa", "america", "no idea", "united states", "america", "usa", "usa", "usa", "united states", "united states"], "difficult_direct_answer": false, "rationales": ["This is an amtrak train.", "This train stops in the united states.", "It is stopped in the usa."], "image": "train2014/COCO_train2014_000000463342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273059, "question_id": "fZ3zFXnWyJkQiaSzpkwDwm", "question": "Which player is more likely to catch the frisbee?", "choices": ["13", "16", "seven", "55"], "correct_choice_idx": 1, "direct_answers": ["number 16", "thirteen", "red hat", "roll playing", "number 13", "red hat", "number thirteen", "throw", "16", "white hat"], "difficult_direct_answer": true, "rationales": ["Player 16 is closest to the frisbee.", "13 appears to have thrown the frisbee away from them and 16 is the next closest player.", "16 is closest to the frisbee."], "image": "val2014/COCO_val2014_000000273059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209785, "question_id": "fZQS7qtYYoTyqy3au5nL83", "question": "What shape is the red sign?", "choices": ["rhombus", "sphere", "triangular", "circular"], "correct_choice_idx": 2, "direct_answers": ["triangular", "triangle", "triangle", "triangle", "triangle", "triangle", "triangle", "triangle", "triangle", "triangle"], "difficult_direct_answer": false, "rationales": ["The shape is a triangle.", "The red sign has three sides. a three sided shape is known as a triangle", "The red sign has three sides."], "image": "train2014/COCO_train2014_000000209785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384140, "question_id": "fZfnvLspXXDnHRZbeBMANZ", "question": "Why is the dog in the air?", "choices": ["bounced there", "fell there", "can fly", "is kite"], "correct_choice_idx": 3, "direct_answers": ["balloon", "is kite", "balloon", "kite", "kite", "floating balloon", "balloon", "kite", "wind", "baloon dog"], "difficult_direct_answer": false, "rationales": ["You can tell by the setting and the fact it's a flying dog to what it is.", "The black and white dog-kite is flying high on a windy day as his \"master\" guides him on the ground below.", "There is a dog in the air but it is a kite formed like a dog"], "image": "train2014/COCO_train2014_000000384140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329684, "question_id": "fZsycLVkyTx6MThRXTHJXp", "question": "What time will it be on the next hour?", "choices": ["7 o'clock", "6 o'clock", "4 o'clock", "5 o'clock"], "correct_choice_idx": 2, "direct_answers": ["seven", "seven", "seven", "700", "seven", "4 o'clock", "six o'clock", "7.30", "733", "seven thirty"], "difficult_direct_answer": false, "rationales": ["It's 6:33 now so the next hour would be the seven.", "It will soon be 6 o'clock.", "It is six thirty now."], "image": "train2014/COCO_train2014_000000329684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513053, "question_id": "fZwZKedx55jzoayJpjRoRM", "question": "Why is he so intense?", "choices": ["needs rest", "is through", "is running", "stole ball"], "correct_choice_idx": 2, "direct_answers": ["playing tennis", "playing", "tennis match", "professional player", "fierce competition", "concentration", "is running", "sporting", "playing tennis", "big game"], "difficult_direct_answer": true, "rationales": ["The man is jogging with the racquet.", "He is running on the sidelines.", "He's running."], "image": "train2014/COCO_train2014_000000513053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433217, "question_id": "fa7unHA4PyWPi24TYskDZR", "question": "What kind of food is between the mushrooms and carrots?", "choices": ["cheese", "fruit", "vegetable", "meat"], "correct_choice_idx": 2, "direct_answers": ["artichoke hearts", "unknown", "onions", "cucumbers", "artichoke hearts", "artichoke hearts", "vegetable", "cauliflower", "nothing", "pickles"], "difficult_direct_answer": false, "rationales": ["All of the items on the plate are mostly vegetables.", "You can tell by the shapes and colors as to what they are.", "The cheese is on the outside and there isn't any fruit present. so it must be a vegetable."], "image": "train2014/COCO_train2014_000000433217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410141, "question_id": "fa8i6syJRFv7KSSwbDy7zK", "question": "Persons on the left waiting area will board trains upon which track?", "choices": ["leftmost", "overhead", "back", "right track"], "correct_choice_idx": 0, "direct_answers": ["leftmost", "left", "left one", "left", "left track", "left track", "left track", "closer", "closest", "left"], "difficult_direct_answer": false, "rationales": ["In order to board the train furthest to the right passengers would have to cross a set of train tracks posing a danger to the people and the train it's self. there is a set of tracks on the left-hand side specifically for another train the train would be safer and more logical to board from this area.", "People on the left will board on the left.", "They have to board from the concrete"], "image": "val2014/COCO_val2014_000000410141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563763, "question_id": "faZgAKgj4yckdv9ZPvEXxG", "question": "What message does this cake send to those that see it?", "choices": ["happy birthday", "welcome", "happy holiday", "none"], "correct_choice_idx": 1, "direct_answers": ["welcome", "peace", "welcome", "appreciation", "happiness", "welcome", "welcome", "congratulations", "welcome", "good"], "difficult_direct_answer": false, "rationales": ["The word is written on it", "You can see the words on it.", "The cake says welcome on the top."], "image": "train2014/COCO_train2014_000000563763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199764, "question_id": "fap6c3SmMSebnUUoTa9mfJ", "question": "What is the man selling under the tent?", "choices": ["tables", "peanuts", "overalls", "apples"], "correct_choice_idx": 1, "direct_answers": ["roasted peanuts", "peanuts", "peanuts", "roasted peanuts", "peanuts", "peanuts", "roasted peanuts", "peanuts", "peanuts", "peanuts"], "difficult_direct_answer": false, "rationales": ["There is a man selling peanuts under the tent.", "The sign says the man has peanuts.", "There is a sign on the table indicating that peanuts are for sale."], "image": "val2014/COCO_val2014_000000199764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225041, "question_id": "faqkv9CNn6CN7u4HLHy2Yw", "question": "What word includes the first letter found at the top of the bus?", "choices": ["so", "to", "in", "go"], "correct_choice_idx": 0, "direct_answers": ["sea", "sour", "sea", "salmon", "sir", "so", "sun", "sunshine", "safety", "stitching"], "difficult_direct_answer": true, "rationales": ["The word \"so\" includes the letter s.", "The first letter at the top of the bus is an s, and so begins with that letter.", "The first letter on the top is an s."], "image": "train2014/COCO_train2014_000000225041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87040, "question_id": "faxyqKa7q5BkweDTGtpdMy", "question": "Why is the mat there?", "choices": ["protect floor", "it fell", "decoration", "prevent slipping"], "correct_choice_idx": 3, "direct_answers": ["stop slipping", "protect floor", "catch water", "prevent slipping", "absorb water", "safety", "avoid slipping", "wet feet", "traction", "prevent slips"], "difficult_direct_answer": true, "rationales": ["The mat keeps people from slipping on the tile.", "The mat is there to prevent the child from slipping and falling.", "It is in a bathroom where a lot of water can get on the floor, and on tiles, this can create a slipper environment."], "image": "train2014/COCO_train2014_000000087040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386872, "question_id": "fazF77ufrRozpXpJMBTmuP", "question": "A flat cake often thin and round prepared from a starch-based batter is called?", "choices": ["pancake", "jelly", "pizza", "burger"], "correct_choice_idx": 0, "direct_answers": ["pancake", "pancake", "pancake", "pancake", "pancakes", "pancake", "pancake", "pancake", "pancake", "pancakes"], "difficult_direct_answer": false, "rationales": ["A stack of pancakes is on a plate.", "It's a breakfast food that is round.", "The cake is a pancake."], "image": "train2014/COCO_train2014_000000386872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100535, "question_id": "fbDVnSuNNPK4dkzTZButY3", "question": "What type of food might be served at this cafe?", "choices": ["mexican", "chinese", "indian", "italian"], "correct_choice_idx": 3, "direct_answers": ["italian", "italian", "italian", "italian", "muffins", "pizza", "pizza lasagna", "pizza", "coffee", "sandwiches"], "difficult_direct_answer": false, "rationales": ["Due to the words on the umbrella, it is easy to tell what type of eatery this is.", "The umbrella says cafe italia so it likely sells italian food.", "The word is on the umbrella"], "image": "train2014/COCO_train2014_000000100535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238310, "question_id": "fbRedMuj5Sa4t5mQNPLLLH", "question": "What is the woman with the pony tail doing?", "choices": ["clapping", "dancing", "photographing", "gaming"], "correct_choice_idx": 2, "direct_answers": ["taking picture", "recording video", "photographing", "taking picture", "recording", "taking picture", "taking picture", "filming", "filming", "recording"], "difficult_direct_answer": false, "rationales": ["The woman is taking a picture.", "A woman is holding her phone up to capture an event everyone is watching.", "She has a camera in her hand"], "image": "val2014/COCO_val2014_000000238310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131245, "question_id": "fcChrbGwmD74er7viRZ6xV", "question": "What is this kid responsible for?", "choices": ["selling cows", "punishing cows", "herding cows", "scaring cows"], "correct_choice_idx": 2, "direct_answers": ["cattle herding", "cows", "cattle", "herding", "hoarding cows", "herding cows", "herding cows", "grazing cattle", "livestock", "cow heard"], "difficult_direct_answer": true, "rationales": ["A young boy, squinting into the sun and dust, holds his prodding stick in his right hand as he tries to convince these cows to move along!.", "You can tell by the crop in his hand and the setting as to what his job is.", "The kid is responsible for herding cows because he has a baton in order to do so"], "image": "train2014/COCO_train2014_000000131245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308541, "question_id": "fcNVka9dxyxULzQZMbxqrD", "question": "What style of skateboarding is this?", "choices": ["street", "park", "freestyle", "vert"], "correct_choice_idx": 3, "direct_answers": ["vert", "competitive", "halfpipe", "bowl", "half pipe", "park", "freestyle", "half pipe", "painting", "wall skating"], "difficult_direct_answer": true, "rationales": ["A skateboarder is on a surface with a steep incline. vert is short for vertical which refers to the incline a skater is doing a trick on.", "The style is vert.", "The skateboarders are going vertical."], "image": "train2014/COCO_train2014_000000308541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111259, "question_id": "fcPXKwaJRCVSvBYcGUKCWM", "question": "What activity is taking place in the image?", "choices": ["kayaking", "paddling", "rafting", "canoeing"], "correct_choice_idx": 2, "direct_answers": ["boating", "whitewater rafting", "whitewater rafting", "boat riding", "whitewater rafting", "rafting", "rafting", "river rafting", "rafting", "river rafting"], "difficult_direct_answer": false, "rationales": ["There are many people in a inflatable boat going down a river.", "The activity is rafting.", "The people are on a raft."], "image": "val2014/COCO_val2014_000000111259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235672, "question_id": "fcYjAVmbAweGKSrvrpKtKY", "question": "Why is the outdoor area using covered gazebos?", "choices": ["too icy", "stay dry", "too sunny", "too windy"], "correct_choice_idx": 1, "direct_answers": ["rain", "raining", "rain protection", "rain", "raining", "stay dry", "rain", "raining", "raining", "rain shelter"], "difficult_direct_answer": false, "rationales": ["The sky is very overcast and people also have umbrellas", "The weather is overcast and the ground is wet.", "The area is to stay dry."], "image": "val2014/COCO_val2014_000000235672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408465, "question_id": "fcmsruVAC5FfMnUDrZGibS", "question": "What activity is the person engaging in?", "choices": ["photography", "photo", "lifting", "selfie"], "correct_choice_idx": 0, "direct_answers": ["selfie", "taking picture", "photography", "photography", "photography", "taking photos", "photography", "photography", "photography", "talking picture"], "difficult_direct_answer": false, "rationales": ["Their reflection can be seen in the mirror, and they are holding up a professional looking camera.", "The person is taking a photo with her camera.", "The person's reflection is clearly visible and contains a camera which was used to make the image."], "image": "train2014/COCO_train2014_000000408465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354425, "question_id": "fdQsJUvhXTTxN8aPjB374y", "question": "What activity is the individual engaging in?", "choices": ["running", "boxing", "taekwondo", "climbing"], "correct_choice_idx": 1, "direct_answers": ["wii", "wii", "video gaming", "play games", "gaming", "bar game", "boxing", "video game", "gaming", "gaming"], "difficult_direct_answer": false, "rationales": ["The person is boxing because he has a boxer's stance", "The man is boxing.", "He is holding video game controllers and has his arms in a stance that someone would be in when boxing."], "image": "val2014/COCO_val2014_000000354425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95455, "question_id": "fdTG4AqHVhQYAVcTcMEYxi", "question": "Where are the people in?", "choices": ["cinema", "conference room", "store", "classroom"], "correct_choice_idx": 3, "direct_answers": ["classroom", "classroom", "classroom", "classroom", "classroom", "school", "classroom", "school", "classroom", "school"], "difficult_direct_answer": false, "rationales": ["These people are all sitting together in a classroom around a laptop and teacher.", "They are children in matching colors and there is a teacher demonstrating something on a laptop.", "There are many children wearing a uniform, and in the process of learning, indicating they are in a classroom."], "image": "train2014/COCO_train2014_000000095455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571462, "question_id": "fdWffugz4hTgHsqK38ZV4V", "question": "How many brand new items will one find in this store?", "choices": ["zero", "twenty", "six", "fifty"], "correct_choice_idx": 0, "direct_answers": ["zero", "zero", "zero", "zero", "zero", "zero", "zero", "zero", "zero", "zero"], "difficult_direct_answer": false, "rationales": ["This is an antique store", "There are no items that are new.", "The items in the window appear to be more old fashioned so it probably is a store selling antique or vintage items that wouldn't carry new merchandise."], "image": "train2014/COCO_train2014_000000571462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464149, "question_id": "fdqTxnhXQ89UUJP5PAtMtA", "question": "What morning hour is the clock ahead reading?", "choices": ["ten", "eleven", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["eleven", "eleven", "1117", "1117", "1117", "eleven", "eleven", "eleven", "eleven", "eleven"], "difficult_direct_answer": false, "rationales": ["The clock hands are clearly visible and based on the orientation of the hands and how clocks work, answer a is consistent.", "The hour hand shows 11.", "The small hand is on the eleven."], "image": "val2014/COCO_val2014_000000464149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68650, "question_id": "fdyMAUdUPEYjkMzSzcjwjB", "question": "What is the man doing with the giraffe?", "choices": ["selling it", "feeding", "taking picture", "stealing"], "correct_choice_idx": 2, "direct_answers": ["photographing", "taking picture", "photo", "photographing", "taking photo", "taking photo", "taking picture", "taking photograph", "picture", "photographing"], "difficult_direct_answer": false, "rationales": ["The man is holding up his camera and pointing it at the giraffe in order to capture the image.", "He can be seen holding up a camera in the direction of where the giraffe is.", "The man is pointing a camera at the giraffe."], "image": "val2014/COCO_val2014_000000068650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311521, "question_id": "feEoWJPeqHHMATGVHNqwyh", "question": "Why is the pie cut up?", "choices": ["fresh longer", "easier disposal", "serve people", "to clean"], "correct_choice_idx": 2, "direct_answers": ["serving", "to share", "serve it", "to serve", "for eating", "serve people", "serving", "servings", "being served", "serving"], "difficult_direct_answer": false, "rationales": ["The pie is being served.", "One person usually eats one slice.", "The pie is in slices."], "image": "train2014/COCO_train2014_000000311521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402097, "question_id": "feQF6uhyBcwoYCjoyMNCHW", "question": "What type of tree is overhanging the body of water here?", "choices": ["walnut", "maple", "oak", "pine"], "correct_choice_idx": 1, "direct_answers": ["maple", "oak", "maple tree", "maple", "maple", "maple", "elm", "maple", "oak", "up water"], "difficult_direct_answer": false, "rationales": ["A maple tree is hanging.", "There are maple leaves on the tree.", "A maple tree is overhanging."], "image": "train2014/COCO_train2014_000000402097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15542, "question_id": "feZGCMyKjGYcc6GFZBoDuK", "question": "What is on the left of the tray?", "choices": ["fish", "hot dog", "hamburger", "bagel"], "correct_choice_idx": 1, "direct_answers": ["hot dog", "hotdog", "hot dog", "hot dog", "hot dog", "hotdog", "hot dog", "hot dog", "hot dog", "hotdog"], "difficult_direct_answer": false, "rationales": ["A hot dog is shown since it's a long sausage.", "A hot dog is in a bun.", "It is long bread and meat with toppings"], "image": "val2014/COCO_val2014_000000015542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94701, "question_id": "fefFLr8mitr6iudyz5hTGM", "question": "Why is the man kneeling in front of the steps?", "choices": ["to pray", "to dance", "to sit", "to record"], "correct_choice_idx": 3, "direct_answers": ["take photo", "taking picture", "balancing", "photography", "take picture", "taking photograph", "photographing", "to record", "pictures", "taking photo"], "difficult_direct_answer": true, "rationales": ["The man wants to record the skateboarder.", "The man that is kneeling is holding a camera and recording the person that is skating.", "He is holding a camera"], "image": "train2014/COCO_train2014_000000094701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29894, "question_id": "ff6JUQm2YC3eHjubjwzj4X", "question": "What is the man holding the racket ready to do?", "choices": ["duck", "dip", "dodge", "hit ball"], "correct_choice_idx": 3, "direct_answers": ["return serve", "hit ball", "score", "hit ball", "hit ball", "hit ball", "catch ball", "receive", "return serve", "hit ball"], "difficult_direct_answer": false, "rationales": ["The player is bending slightly ready to hit the ball back.", "He is ready to return a volley from his opponent, hence his stance and eye contact.", "The racquet is ready to serve or hit the ball back."], "image": "train2014/COCO_train2014_000000029894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412566, "question_id": "ffCDTdV6bBWbXvfcbmC9NC", "question": "What word did they probably say recently?", "choices": ["struggle", "domain", "cheers", "astronomy"], "correct_choice_idx": 2, "direct_answers": ["cheers", "cheers", "cheers", "cheers", "toast", "cheers", "cheers", "chin", "cheers", "cheers"], "difficult_direct_answer": false, "rationales": ["Woman are holding glasses up as they sit at a table at a restaurant together.", "The women have their drinks raised in the air.", "They look to be making a toast with their glasses."], "image": "train2014/COCO_train2014_000000412566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119669, "question_id": "ffJrMh7vJ32up3oCVWNivf", "question": "Why does the animal have its head to the water?", "choices": ["to spit", "to drink", "to swim", "to dive"], "correct_choice_idx": 1, "direct_answers": ["drinking", "drinking", "drinking water", "to drink", "drinking water", "drinking", "drinking water", "drinking", "drinking", "drinking"], "difficult_direct_answer": false, "rationales": ["There is water and the animal's head is near it.", "The animal wants a sip.", "The other options don't apply to the needs of this animal or the action."], "image": "train2014/COCO_train2014_000000119669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400028, "question_id": "ffPny5vCSctyucKoqxwZGz", "question": "What restaurant has he visited recently?", "choices": ["quiznos", "denny's", "tim horton's", "dunkin' donuts"], "correct_choice_idx": 3, "direct_answers": ["dunkin donuts", "dunking donuts", "dunkin' donuts", "dunkin donuts", "dunkin", "dunkin donuts", "dunking donuts", "dunkin", "dunkin donuts", "dunkin donuts"], "difficult_direct_answer": false, "rationales": ["His cup has the logo of this restaurant on it.", "The man has a cup with the dunkin' donuts logo on it.", "The logo is on the coffee cup"], "image": "train2014/COCO_train2014_000000400028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564830, "question_id": "ffTADf9krQ7ck4mhFgmWgv", "question": "What happens when the items in the vertical stacks against the wall are used?", "choices": ["games happen", "nothing", "music plays", "complaining"], "correct_choice_idx": 2, "direct_answers": ["play music", "music plays", "music plays", "hear music", "play audio", "music plays", "make music", "play music", "music", "fall"], "difficult_direct_answer": false, "rationales": ["Two stands that hold compact discs is attached to a wall in a family room.", "Those are cd's and can be listened to.", "They are compact discs, which hold music albums which can be played when inserted in a stereo."], "image": "val2014/COCO_val2014_000000564830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219437, "question_id": "ffdsfRnmaUvXi2rf5b8uHJ", "question": "What is the person in this house about to do?", "choices": ["grill", "fry food", "bake", "dishes"], "correct_choice_idx": 2, "direct_answers": ["bake something", "eat", "bake", "use oven", "cook", "cook meal", "bake", "eat", "cook", "cook"], "difficult_direct_answer": false, "rationales": ["Two jelly roll pans sit on the stove and a rectangular cake pan contains two round loaves of bread or dough on the counter. the oven is empty.", "The person in this house is about to bake with the oven.", "Because the light is on in the oven it is safe to say they are about to bake."], "image": "val2014/COCO_val2014_000000219437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220366, "question_id": "ffoaVMiT8YCsHAmoy5FYj9", "question": "What is the black object near the man's hairline?", "choices": ["headband", "visor", "bandana", "rope"], "correct_choice_idx": 0, "direct_answers": ["sweat band", "sweatband", "headband", "band", "headband", "headband", "headband", "headband", "headband", "headband"], "difficult_direct_answer": false, "rationales": ["The man is playing tennis and since he may work up sweat he has an accessory on his head to keep sweat from getting into his eyes.", "It's his headband.", "The black object is a headband."], "image": "train2014/COCO_train2014_000000220366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384960, "question_id": "ffrytqVzTtjiMpxrCtdJuz", "question": "Why does the person have an umbrella?", "choices": ["disguise", "sun", "snow", "rain"], "correct_choice_idx": 3, "direct_answers": ["raining", "raining", "rain", "rain", "rain", "block rain", "it's raining", "raining", "rain", "raining"], "difficult_direct_answer": false, "rationales": ["There are water drops running down outside of the glass.", "To keep them dry.", "There is rain on the window and the person outside is in the rain so they will need the umbrella to protect themselves."], "image": "train2014/COCO_train2014_000000384960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568076, "question_id": "ffzdTAtkrdDPKRmaT6JBnz", "question": "Why is there so much color in the sky?", "choices": ["lightening bugs", "fireworks", "streamers", "kite strings"], "correct_choice_idx": 3, "direct_answers": ["kites", "kites", "kite strings", "kites", "blue", "kite flying", "kites", "party", "kites", "clear"], "difficult_direct_answer": false, "rationales": ["A large group of people with many flying kites are in an open area.", "Seven or eight kites in the air with bright streamers is enough to add large amounts of color to this warm summer sky. quite a crowd is on hand to enjoy the spectacle!.", "The kite strings are multicolored."], "image": "train2014/COCO_train2014_000000568076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298050, "question_id": "fg5AwirKhBxcCwexhGMCmm", "question": "What is the blue thing over the boat doing for the items below?", "choices": ["electricity", "insulation", "cover", "wind"], "correct_choice_idx": 2, "direct_answers": ["securing them", "keeping dry", "dry", "tarping", "cover", "keep dry", "cover", "tarp", "protection", "keeping dry"], "difficult_direct_answer": false, "rationales": ["If a boat is not used often, or has water-sensitive cargo, a tarp will be used to protect it from the elements.", "It is a tarp used to protect items", "It's protecting the boat."], "image": "train2014/COCO_train2014_000000298050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311501, "question_id": "fg9fpJyhGicmTB7obKSKNS", "question": "What type of road command is shown in the image?", "choices": ["pedestrians", "train crossing", "yield", "stop"], "correct_choice_idx": 3, "direct_answers": ["stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop", "traffic light", "stop sign"], "difficult_direct_answer": false, "rationales": ["A stoplight is shown since it's red.", "Red light means to stop where you are at or wait to cross the road.", "The road command is stop."], "image": "train2014/COCO_train2014_000000311501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451793, "question_id": "fgNCRba339jaiQ5Sj9VRcd", "question": "Which person is in the greatest danger?", "choices": ["back man", "woman", "bus driver", "right person"], "correct_choice_idx": 1, "direct_answers": ["bicyclist", "woman", "bicyclist", "girl bike", "bicyclist", "woman", "person bike", "bicyclist", "biker", "woman"], "difficult_direct_answer": false, "rationales": ["A woman is going near a bus and she might get run over by it.", "There is a lady that is on her bike that is going right in front of a bus.", "The lady on the bike."], "image": "train2014/COCO_train2014_000000451793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248014, "question_id": "fgVENmAxM5sRLpyxJge8UD", "question": "How is air stopped from flowing between logs here?", "choices": ["chinking", "paint", "saw", "sawdust"], "correct_choice_idx": 0, "direct_answers": ["grout", "insulation", "chinking", "chinking", "paste", "mud", "caulk", "windows", "mud", "mortar"], "difficult_direct_answer": false, "rationales": ["The process of chinking does as is described.", "The air clinks.", "The logs are chinked."], "image": "val2014/COCO_val2014_000000248014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578498, "question_id": "fgmrrBnYLrTHoUj5FGs8tf", "question": "What poses the biggest threat for a person to trip on?", "choices": ["couch", "cords", "table", "lamp"], "correct_choice_idx": 1, "direct_answers": ["cord", "wires", "fan", "wires", "wire cord", "fan", "wires", "cords", "table", "cords"], "difficult_direct_answer": false, "rationales": ["A living room type area is shown with cords strewn across the floor. people trip on cords.", "They are left around the room in random spots. if someone walks and doesn't notice, he/she will fall.", "The cords are dangerous."], "image": "val2014/COCO_val2014_000000578498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211201, "question_id": "fhB89mXzLewvqhCLq4rUNr", "question": "What hour is the clock fifteen minutes from?", "choices": ["ten", "six", "eight", "eleven"], "correct_choice_idx": 2, "direct_answers": ["nine", "eight", "eight", "eight", "eight", "nine", "nine o'clock", "midnight", "midnight unsure", "eight"], "difficult_direct_answer": false, "rationales": ["It's 8 o'clock.", "The time shown is seven forty five.", "The hour hand is just a little bit before the 8 on the clock."], "image": "train2014/COCO_train2014_000000211201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399594, "question_id": "fhBKv7EBhe4cBLCPFVsP4z", "question": "How can the animals here most readily be identified?", "choices": ["ear tattoos", "claws", "collar tags", "brand"], "correct_choice_idx": 2, "direct_answers": ["collar tags", "fur", "tags", "collars", "collar", "tags", "collar tags", "fur color", "collar tags", "collars"], "difficult_direct_answer": false, "rationales": ["The animals are dogs, which are commonly identified with these items placed around their necks.", "The tags have information on them, such as the dog's name and their owner's information.", "These include their names, owners' names and phone number."], "image": "train2014/COCO_train2014_000000399594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104591, "question_id": "fhLvxBwzxYCtQ5J6wJcLmv", "question": "Where are these animals being kept?", "choices": ["in zoo", "museum", "backyard", "mall"], "correct_choice_idx": 0, "direct_answers": ["in zoo", "zoo", "zoo", "zoo", "zoo", "giraffe", "zoo", "zoo", "enclosure", "zoo"], "difficult_direct_answer": false, "rationales": ["They are in a fenced in area.", "These animals are kept inside of a zoo cage.", "They are kept in a zoo as evident for their safety band human beings."], "image": "train2014/COCO_train2014_000000104591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13143, "question_id": "fhUirhyFxofhj5WXKbQisB", "question": "What should the man be standing behind?", "choices": ["pilar", "woman", "bench", "white line"], "correct_choice_idx": 3, "direct_answers": ["white line", "whate line", "white line", "line", "white line", "white line", "white line", "white edge", "waiting", "white line"], "difficult_direct_answer": false, "rationales": ["The white line is there for safety, to stop passengers from getting too close to the tracks when a train is coming by.", "A white line is on the ground at a train station and a man's foot is on the line. lines are commonly used to provide a barrier between waiting passengers and oncoming trains.", "The white part is for safety."], "image": "train2014/COCO_train2014_000000013143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62200, "question_id": "fhUwHvjoDGmmwjMT58Faj4", "question": "What is in the foreground?", "choices": ["barn", "cow", "basket", "parking meter"], "correct_choice_idx": 3, "direct_answers": ["parking meter", "parking meter", "parking meter", "parking meter", "parking meters", "parking meters", "parking meter", "parking meter", "parking meter", "parking meter"], "difficult_direct_answer": false, "rationales": ["There is a parking meter to feed money into.", "This is where you put money to park", "Parking meters are closest to the camera."], "image": "val2014/COCO_val2014_000000062200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287774, "question_id": "fhjbFky4gS7RiHpDjn2Ped", "question": "What is one thing the white things were historically used for?", "choices": ["helmets", "piano keys", "swords", "kettles"], "correct_choice_idx": 1, "direct_answers": ["ivory", "transport", "playing", "ivory", "trophies", "pianos", "piano keys", "produce ivory", "collectible", "ivory"], "difficult_direct_answer": false, "rationales": ["A man is riding an elephant with tusks.", "Tusks create piano keys.", "People use the tusks for fighting."], "image": "train2014/COCO_train2014_000000287774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167598, "question_id": "fht37ENbmNmPNiAjhHDA2A", "question": "What does the person in the foreground have on?", "choices": ["clown nose", "helmet", "jacket", "glasses"], "correct_choice_idx": 2, "direct_answers": ["jacket", "jacket", "jacket", "white jacket", "jacket", "jacket", "white jacket", "kite", "jacket", "kite"], "difficult_direct_answer": false, "rationales": ["They are wearing a coat.", "The first thing you see is the outer wear.", "The person has a puffy overcoat."], "image": "val2014/COCO_val2014_000000167598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433435, "question_id": "fhxRJADtQJDZvR5DrZquvd", "question": "Why do people gather outside the fence?", "choices": ["elephant rides", "resting", "waiting bus", "watch elephants"], "correct_choice_idx": 3, "direct_answers": ["see elephants", "watch elephants", "see elephants", "watch elephants", "to see", "watch elephants", "watch elephants", "watch elephants", "watch animals", "watch elephants"], "difficult_direct_answer": false, "rationales": ["This is the most obvious reason. that said, this location might also offer b.", "The people are observing the elephants at a zoo.", "People are at the zoo."], "image": "train2014/COCO_train2014_000000433435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220518, "question_id": "fiDWk2QrnKPXv6CLkffXzz", "question": "What function does the night stand provide for the magazines?", "choices": ["protection", "recharge", "light", "storage"], "correct_choice_idx": 3, "direct_answers": ["storage", "holder", "holds them", "hold", "storage", "storage", "storage", "storage", "stand", "storage"], "difficult_direct_answer": false, "rationales": ["The nightstand is storing magazines.", "The night stand stores the magazines in the bedroom.", "The function is storage."], "image": "train2014/COCO_train2014_000000220518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511972, "question_id": "fiHRgLUvkY8FzKsEcroVWa", "question": "What most likely allowed the skier to become aloft?", "choices": ["deep hole", "upslope", "sand trap", "flat land"], "correct_choice_idx": 1, "direct_answers": ["ramp", "small hill", "ramp", "speed", "jumping", "jump", "slope", "hill", "hill", "upslope"], "difficult_direct_answer": false, "rationales": ["The slopes are used to make the people go up.", "The people go upslope.", "The skier in the photo is in midair and mostly like was going up slope to get into this position just prior."], "image": "train2014/COCO_train2014_000000511972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36420, "question_id": "fiWceLXASRbpSzYnCnwqwP", "question": "What color shirt does the person most likely to kick the ball first wear?", "choices": ["green", "white", "black", "none"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The girl closest to the ball is wearing white.", "A girl is a white shirt is being trailed by one in a green shirt on a soccer field.", "The player wearing number 17 is closer to the ball."], "image": "val2014/COCO_val2014_000000036420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125645, "question_id": "fiX6kd7Tx7WXgq6WkUDJ4X", "question": "What would someone be riding on top of the red bus for?", "choices": ["tours", "sleeping", "shooting", "eating"], "correct_choice_idx": 0, "direct_answers": ["tourist", "sightseeing", "tour", "sightseeing", "tour", "amusement", "tourism", "going tour", "sightseeing", "tours"], "difficult_direct_answer": false, "rationales": ["A bus has seats on the top. the bus advertises tours.", "It is open air to sight see", "A tourist would rid on the top to be able to see the sights."], "image": "val2014/COCO_val2014_000000125645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294330, "question_id": "ficuSxVX44VSQgFWTjxpQH", "question": "What is the device on the middle shelf sitting to the right of the printer?", "choices": ["laptop", "hard drive", "router", "scanner"], "correct_choice_idx": 3, "direct_answers": ["scanner", "printer", "scanner", "speaker", "scanner", "books", "scanner", "scanner", "scanner", "scanner"], "difficult_direct_answer": false, "rationales": ["The device is meant to scan documents.", "There is a scanner that one can use to scan documents.", "The device is a scanner."], "image": "train2014/COCO_train2014_000000294330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562347, "question_id": "fifM8h95CDcHeb79zaxMUz", "question": "In what country would the eye color of this man be considered rare?", "choices": ["france", "estonia", "sweden", "finland"], "correct_choice_idx": 0, "direct_answers": ["mexico", "china", "iraq", "china", "france", "china", "china", "china", "africa", "uganda"], "difficult_direct_answer": false, "rationales": ["Blue eyes are rare in france.", "Due to evolutional changes that occurred in france.", "A man has blue eyes."], "image": "val2014/COCO_val2014_000000562347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286042, "question_id": "fikX4zLTZCsmyf3ZMhVHaP", "question": "Who is paying the person with the hammer?", "choices": ["city", "president", "no one", "criminals"], "correct_choice_idx": 0, "direct_answers": ["city", "government", "city", "women", "company", "city", "city", "fire department", "unknown", "assistant"], "difficult_direct_answer": false, "rationales": ["This is usually the case.", "The city pays the city worker.", "Most firefighters work for a city or town and are paid by tax money."], "image": "train2014/COCO_train2014_000000286042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362676, "question_id": "fir3wedSg7wS8xzxSkqgDh", "question": "What dangerous substance was often used in the manufacture of these types of windows?", "choices": ["cyanide", "lead", "uranium", "mercury"], "correct_choice_idx": 1, "direct_answers": ["mono", "lead", "lead", "aluminium", "lead", "spray paint", "acrylic enamel", "lead", "lead", "lead"], "difficult_direct_answer": false, "rationales": ["Lead is usually in these windows.", "Lead use to be used.", "It is what the lines were made of."], "image": "train2014/COCO_train2014_000000362676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37913, "question_id": "fj4MvnvQnsDC6oJrpsdSFC", "question": "What period of the day is it in the image?", "choices": ["night", "morning", "afternoon", "evening"], "correct_choice_idx": 2, "direct_answers": ["midday", "afternoon", "mid day", "afternoon", "morning", "midday", "morning", "afternoon", "midday", "afternoon"], "difficult_direct_answer": false, "rationales": ["There is a clock visible in the background that clearly shows the hour. based on the given hour and the available sunlight answer a is most logical.", "The sun isn't as bright but it's not dark out.", "The period is the afternoon."], "image": "train2014/COCO_train2014_000000037913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535580, "question_id": "fj5zmVshtgcN4nwdMeN887", "question": "What sort of area does the man skateboard in?", "choices": ["desert", "urban", "rural", "farm"], "correct_choice_idx": 1, "direct_answers": ["city", "downtown", "road", "urban", "city road", "road", "roadway", "street", "street", "street"], "difficult_direct_answer": false, "rationales": ["He skates in a more urban area with all the buildings around him.", "The man skateboards in an urban area.", "The area is urban."], "image": "train2014/COCO_train2014_000000535580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449066, "question_id": "fj84uFeRZxzyogytWYqe4d", "question": "What is the green sign advertising?", "choices": ["drinks", "players", "politicians", "movies"], "correct_choice_idx": 0, "direct_answers": ["today specials", "mean", "menu", "todays specials", "food", "specials", "today's specials", "meals", "drinks", "today's special"], "difficult_direct_answer": true, "rationales": ["Restaurants have these green boards outside the establishments to showcase any sales they may be having.", "The green chalkboard is advertising drinks.", "It says \"pub.\" so, a makes the most sense."], "image": "train2014/COCO_train2014_000000449066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410301, "question_id": "fjAgizRZXYdcjTjQXLKCYQ", "question": "What can be done here besides going to the bathroom?", "choices": ["surfing internet", "showering", "cooking", "watching tv"], "correct_choice_idx": 1, "direct_answers": ["wash hands", "wash hands", "washing hair", "washing hands", "washing hands", "showering", "shower", "wash hands", "wash hands", "wash hands"], "difficult_direct_answer": false, "rationales": ["Most bathrooms have a working shower in them.", "Someone can take a shower in the bathroom.", "You can shower your body in the bathroom."], "image": "val2014/COCO_val2014_000000410301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459848, "question_id": "fjDQe5qfB8ra56atEWCbjo", "question": "What metro train number is this?", "choices": ["8501", "1058", "5810", "8051"], "correct_choice_idx": 1, "direct_answers": ["1058", "ten fiftyeight", "ten fifty-eight", "ten fifty-eight", "1058", "ten fifty-eight", "1058", "1058", "1058", "1058"], "difficult_direct_answer": false, "rationales": ["The number 1058 is on the front of the train.", "The number 1058 is on the front of the train.", "The number can be seen on the front of the train to identify it, and that number says 1058."], "image": "train2014/COCO_train2014_000000459848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313582, "question_id": "fjGzRNxCHDwSEXrTUfLGJT", "question": "What title did the namesake of this type of sweater have?", "choices": ["viscount", "king", "duke", "earl"], "correct_choice_idx": 3, "direct_answers": ["cardigan", "earl", "cardigan", "polo", "cardigan", "mri rogers", "cardigan", "argyle", "vest", "teacher"], "difficult_direct_answer": false, "rationales": ["He was the seventh one of his kind, james thomas brudenell.", "The title is earl.", "Earl gray is the name of the sweater color."], "image": "train2014/COCO_train2014_000000313582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20962, "question_id": "fjRpzDyhJ5dXMuzhgpqnEC", "question": "How is he resting?", "choices": ["squatting", "eating bar", "having drink", "lying down"], "correct_choice_idx": 2, "direct_answers": ["standing", "standing", "on bike", "standing", "on bike", "standing", "standing", "standing", "standing", "having drink"], "difficult_direct_answer": false, "rationales": ["He has a bottle in his hands.", "The guy is drinking something while taking a break from riding.", "He has a bottle in his hand"], "image": "train2014/COCO_train2014_000000020962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18957, "question_id": "fjioopXLa8FY9C9xrrb4bi", "question": "What were Tutor bread ovens closed with?", "choices": ["metal doors", "raw dough", "wooden logs", "stone doors"], "correct_choice_idx": 0, "direct_answers": ["metal tool", "pole", "metal doors", "wooden doors", "nothing", "oven doors", "door", "brick", "door", "heating"], "difficult_direct_answer": true, "rationales": ["Tudor bread ovens don't have doors.", "Tutor bread ovens have metal doors.", "The other options don't apply to these types of ovens."], "image": "val2014/COCO_val2014_000000018957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173611, "question_id": "fjiwQUfsrss6ii9W6nXjYP", "question": "What item does the maker of the shirt advertise?", "choices": ["dog treats", "glasses", "motorcycles", "hats"], "correct_choice_idx": 2, "direct_answers": ["harley-davidson", "motorcycles", "motorcycle", "motorcycles", "motorcycles", "motor cycles", "motorcycles", "motorcycles", "motorcycles", "harley davidson"], "difficult_direct_answer": false, "rationales": ["Harley davidson is a motorcycle brand.", "Harley are motorcyles.", "Advertises motorcycles."], "image": "train2014/COCO_train2014_000000173611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364525, "question_id": "fjqVazJ3GWMsfBrh62GHbr", "question": "The item on the floor looks like what?", "choices": ["hammer", "toad", "baby", "sewing machine"], "correct_choice_idx": 3, "direct_answers": ["sewing machine", "sewing machine", "sewing machine", "sewing machine", "sewing machine", "sewing machine", "sewing machine", "sewing machine", "sewing machine", "sewing machine"], "difficult_direct_answer": false, "rationales": ["It's a portable machine that sews fabrics.", "It has knobs and an opening to place the fabric", "It probably is one. a small, portable one."], "image": "train2014/COCO_train2014_000000364525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16578, "question_id": "fjvQnwLt7GskkE4yemjMTc", "question": "What type car is the one with 42 on it's top?", "choices": ["convertible", "taxi", "shipping", "prison"], "correct_choice_idx": 1, "direct_answers": ["cab", "taxi", "taxi", "taxi", "cab", "cab", "taxi", "taxi", "taxi", "taxi"], "difficult_direct_answer": false, "rationales": ["This is a taxi.", "The car which has the number 42 on it, has it located on the top which is a normal identifier for a taxi.", "The car is a taxi."], "image": "train2014/COCO_train2014_000000016578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522244, "question_id": "fk7bE7YkjqwRkveKkCvssJ", "question": "What country is this location?", "choices": ["ukraine", "sweden", "czech republic", "poland"], "correct_choice_idx": 2, "direct_answers": ["czechia", "czech republic", "russia", "czechia", "czechoslovakia", "czechia", "czech republic", "russia", "czechia", "czech republic"], "difficult_direct_answer": false, "rationales": ["The text on the sign is in czech.", "The language above the train station is czech.", "This is in the czech republic"], "image": "train2014/COCO_train2014_000000522244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243912, "question_id": "fkDMPYgYLgUUTsQXBgBN2x", "question": "What pattern shirt does the person wear who put the tie on this dog?", "choices": ["hounds tooth", "solid", "stripe", "check"], "correct_choice_idx": 3, "direct_answers": ["plaid", "checkered", "plaid", "check", "plaid", "check", "checkered", "checker", "checkered", "plaid"], "difficult_direct_answer": false, "rationales": ["The person holding the dog is wearing a checked shirt. the person who is holding the dog is likely the owner of the dog.", "A person is holding a dog and wearing a plaid shirt.", "The shirt is checkered."], "image": "val2014/COCO_val2014_000000243912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139763, "question_id": "fkH7FzdqLHYAiTsiRq9RtD", "question": "At least how many people are needed to play this game?", "choices": ["four", "one", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two little computers there, so two people can play.", "A woman is playing ping pong in an office.", "The person is playing ping pong which requires someone to hit the ball back."], "image": "train2014/COCO_train2014_000000139763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114079, "question_id": "fkrbRYwsaushLt8qzUM7s4", "question": "What sign is on right side of the road?", "choices": ["signal", "slow", "slow", "stop"], "correct_choice_idx": 0, "direct_answers": ["light warning", "stop light", "traffic", "no upturn", "traffic light", "traffic light", "signal", "traffic signal", "u turn", "no turning"], "difficult_direct_answer": true, "rationales": ["The sign has the symbol that indicates that a traffic light is up ahead.", "The sign on the right side of the road has a traffic light on it.", "There is a traffic signal up ahead."], "image": "train2014/COCO_train2014_000000114079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63495, "question_id": "fm7kbJrYQBFjtgbCR2KYvL", "question": "What do they do at the place that the blue sign identifies?", "choices": ["drink beer", "plan heists", "party", "pray"], "correct_choice_idx": 3, "direct_answers": ["pray", "attend church", "pray", "pray", "worship", "worship", "worship", "pray", "worship jesus", "pray"], "difficult_direct_answer": false, "rationales": ["It's a church.", "Churches are used for marshaling your thoughts and hoping.", "It is a church"], "image": "train2014/COCO_train2014_000000063495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92694, "question_id": "fmE5yneZoBwWm2Lp8Kh8uL", "question": "What are the two men doing together?", "choices": ["drawing", "singing", "playing instruments", "gaming"], "correct_choice_idx": 3, "direct_answers": ["playing wii", "playing wii", "gaming", "playing wii", "playing games", "video games", "playing wii", "playing game", "playing wii", "playing"], "difficult_direct_answer": false, "rationales": ["They are playing nintendo wii.", "The people are gaming.", "The two men are playing games together."], "image": "train2014/COCO_train2014_000000092694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523262, "question_id": "fmF2UviqdojTyCtT4Q68op", "question": "Why are the people gathered together?", "choices": ["to pose", "to eat", "to wrestle", "for warmth"], "correct_choice_idx": 0, "direct_answers": ["picture", "snow season", "see picture", "group photo", "having fun", "to pose", "skiing", "take photo", "photo", "having fun"], "difficult_direct_answer": true, "rationales": ["The people pose.", "The people are gathered as a group for a photo.", "The people are all facing in the same direction and are all stationary in a group. these are elements that would be consistent with people posing for a photo."], "image": "train2014/COCO_train2014_000000523262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353197, "question_id": "fmMQ2W9bkYQWCW5YY38vbd", "question": "Why is his board behind him?", "choices": ["is broken", "falling", "not his", "left behind"], "correct_choice_idx": 3, "direct_answers": ["lost it", "mistake", "falling down", "lost balance", "left behind", "doing trick", "he fell", "jumped", "person fell", "fell"], "difficult_direct_answer": true, "rationales": ["A man is jumping and a skateboard is falling behind him. people fall of of skateboard when doing tricks.", "The skate board is falling down in the rea.", "The board is left behind."], "image": "train2014/COCO_train2014_000000353197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546352, "question_id": "fmkwGy7Zu9vMJC7BZ6prUQ", "question": "What famous rapper made famous those words on the yellow sign?", "choices": ["nate dogg", "tupac", "drake", "snoop dogg"], "correct_choice_idx": 3, "direct_answers": ["mm", "snoop dogg", "dry dre", "snoop dogg", "snoop dogg", "dr dre", "tupac", "banksy", "no idea", "ice t"], "difficult_direct_answer": false, "rationales": ["This is one of his iconic lines", "Snoop dog has a song with those words in it.", "Snoop dogg is a rapper who made those words."], "image": "train2014/COCO_train2014_000000546352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554346, "question_id": "fmnqpsZLkrDmnuBXdayBmD", "question": "What was added to the blue item before using?", "choices": ["shampoo", "oil", "toothpaste", "butter"], "correct_choice_idx": 2, "direct_answers": ["toothpaste", "toothpaste", "toothpaste", "toothpaste", "paste", "toothpaste", "toothpaste", "toothpaste", "toothpaste", "brush"], "difficult_direct_answer": false, "rationales": ["The child is cleaning his mouth. oil, butter, or shampoo would not be used for that purpose.", "The boy is using a toothbrush to brush his teeth. toothpaste is needed to clean your teeth.", "He's obviously brushing his teeth."], "image": "train2014/COCO_train2014_000000554346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 730, "question_id": "fmsKZwAq2umoCenZUVwYpq", "question": "What type of people are most likely on board this bus?", "choices": ["veterans", "doctors", "tourists", "cooks"], "correct_choice_idx": 2, "direct_answers": ["tourists", "tourists", "tourist", "tourists", "tourists", "tourists", "tourists", "tourist", "tourists", "tourists"], "difficult_direct_answer": false, "rationales": ["It has nice large window to see out of. visitors like to travel around and look at things.", "It is a touring bus showing the sights of the town.", "Visitors take this bus to see the town."], "image": "val2014/COCO_val2014_000000000730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47256, "question_id": "fmwWRhNxmCGivGFVhjZmPq", "question": "What literature does one person at the table appear to be reading?", "choices": ["bill", "poster", "pamphlet", "menu"], "correct_choice_idx": 3, "direct_answers": ["menu", "menu", "menu", "reading", "menu", "menu", "menu", "reading", "menu", "reading"], "difficult_direct_answer": false, "rationales": ["The person has a menu at the restaurant.", "They are at a restaurant so it is a menu.", "The paper is a menu."], "image": "train2014/COCO_train2014_000000047256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86183, "question_id": "fn2yhydXtMfrY8byqZdSCs", "question": "When riding these bikes what by law must be worn by these men?", "choices": ["helmets", "steel boots", "long sleeves", "nothing"], "correct_choice_idx": 0, "direct_answers": ["helmets", "helmets", "helmets", "helmet", "helmets", "helmets", "helmets", "helmet", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["These men must wear helmets.", "The bikes need helmets.", "According to the law, these men must wear helmets while they ride their bikes."], "image": "train2014/COCO_train2014_000000086183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139183, "question_id": "fnBupsoDLAhmMSRQx9ZqUz", "question": "What is the white and brown horse doing?", "choices": ["jumping", "falling", "sitting down", "standing up"], "correct_choice_idx": 1, "direct_answers": ["bucking", "playing", "falling", "bucking", "bucking", "bucking", "falling", "bucking", "bucking", "bowing"], "difficult_direct_answer": false, "rationales": ["You can tell by the horses buckled feet and body position as to what is happening.", "The horse was trying to buck the rider off and is going down.", "The horse is falling."], "image": "train2014/COCO_train2014_000000139183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115854, "question_id": "fnGpqY9uLxhKrdYQdxuWsN", "question": "Who is the player wearing gloves?", "choices": ["midfielder", "defender", "goalkeeper", "forward"], "correct_choice_idx": 2, "direct_answers": ["not clear", "goalkeeper", "goalie", "goalie", "goalie", "goalie", "goalie", "goalie", "goalie", "goalie"], "difficult_direct_answer": false, "rationales": ["The player that can grab the ball with their gloves is the goalie.", "He blocks the goal", "The goalkeeper is the only one who can touch the ball."], "image": "train2014/COCO_train2014_000000115854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478099, "question_id": "fnUgxxk7sSdGKyrj5sPc9g", "question": "What continent is this scene located in?", "choices": ["north america", "australia", "asia", "europe"], "correct_choice_idx": 2, "direct_answers": ["asia", "aisa", "asia", "asia", "asia", "japan", "train", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["There are many cherry trees with blossoms", "It is located in asia because you can see the cherry trees popular in japan", "This area has cherry blossoms everywhere."], "image": "val2014/COCO_val2014_000000478099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143132, "question_id": "fnWBCxXfUnSkVxJjm5p5bD", "question": "What is the boy perfecting here?", "choices": ["balance", "sun bathing", "betting", "game play"], "correct_choice_idx": 0, "direct_answers": ["balance", "surfing skills", "balance board", "balance", "balanced", "practicing surfing", "surfing", "surfing", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["This simulates surfing and helps people to stay on the boards in the ocean.", "The boy is perfecting a surfboard balance.", "A kid is standing on a board in a blow up pool."], "image": "val2014/COCO_val2014_000000143132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192982, "question_id": "fncxTVEU3hb95kqgUh9D5S", "question": "What is the relationship of the man wearing light blue pants to the man wearing star-patterned pants?", "choices": ["competitor", "father", "teammate", "great grandfather"], "correct_choice_idx": 0, "direct_answers": ["competitor", "opponent", "opposing teams", "opposite team", "volleyball opponents", "friend", "opponents", "friends", "play mates", "friendship"], "difficult_direct_answer": true, "rationales": ["The two men are playing volleyball and standing on opposite sides of the net to hit the ball to each other because they are on opposing teams.", "These two men are similar in age. they are playing beach volleyball and are on opposite sides of the net.", "They are on opposite sides of the next. they are playing against each other."], "image": "train2014/COCO_train2014_000000192982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354721, "question_id": "fnhmmJDXBer4zyHfvhiSc6", "question": "What is the skater's source of momentum?", "choices": ["dog", "gasoline", "petrol", "wheels"], "correct_choice_idx": 0, "direct_answers": ["pushing", "pushing", "dog", "dog power", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The source is the dog.", "The dog is pulling him.", "The animal is pulling him on the skateboard."], "image": "train2014/COCO_train2014_000000354721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40065, "question_id": "fnsKBirr48KDmRjV78vf9c", "question": "What is the building at the back?", "choices": ["shopping mall", "residential building", "hotel", "office building"], "correct_choice_idx": 2, "direct_answers": ["hotel", "train station", "hotel", "airport", "hotel", "hotel", "hotel", "market", "apartment", "multistory building"], "difficult_direct_answer": false, "rationales": ["There are luggage stacked in front of the building.", "Luggage is transported to hotels.", "The building can be seen as tall with many windows. the luggage in front of it indicates that it will be brought inside by travelers who are staying there."], "image": "train2014/COCO_train2014_000000040065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531047, "question_id": "fo7RNkm5X5u7W3Q5fLrUL2", "question": "What are they doing?", "choices": ["eating breakfast", "being defiant", "posing", "being caught"], "correct_choice_idx": 2, "direct_answers": ["cheering", "toasting", "toasting", "posing", "toasting", "toasting", "cheers", "cheers", "celebrating", "drinking"], "difficult_direct_answer": false, "rationales": ["They are posing for a picture.", "People are smiling and holding up their glasses. people pose for pictures.", "They are posing together with glasses."], "image": "val2014/COCO_val2014_000000531047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79992, "question_id": "foefMLwsJGtubLDfq8h3nk", "question": "What is the best climate for these fruits to grow in?", "choices": ["dry", "arid", "polar", "tropical"], "correct_choice_idx": 3, "direct_answers": ["tropical", "tropical", "warm", "tropical", "tropical", "tropical", "tropical", "warm", "summer", "tropical"], "difficult_direct_answer": false, "rationales": ["The visible fruits are bananas and coconuts. the trees that these plants grow on are known to be found in tropical climates.", "These are plants that need a lot of sun and water", "Coconuts and bananas are in a basket."], "image": "val2014/COCO_val2014_000000079992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548485, "question_id": "fou7vuadf3gNWcbAqXJwua", "question": "Why are the cleaning bottles on the tub wall?", "choices": ["visual appeal", "convivence", "accidental", "safety"], "correct_choice_idx": 1, "direct_answers": ["shower use", "being used", "easy use", "hygiene", "rackless", "use bottle", "storage", "easy accessibility", "no shelf", "convivence"], "difficult_direct_answer": true, "rationales": ["These bottles are probably on the wall of tub because they are used often. it is more convenient to have them ready to use.", "It's for convenience.", "They are easy to grab when they are needed."], "image": "val2014/COCO_val2014_000000548485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246686, "question_id": "fp73ufYzaix6avgWjcXMSt", "question": "What is the man wearing the hat pulling?", "choices": ["surfboard", "snow board", "door", "body board"], "correct_choice_idx": 3, "direct_answers": ["body board", "boogy board", "surfboard", "paddle board", "boogie board", "wakeboard", "boogie boards", "dress", "kickboard", "body board"], "difficult_direct_answer": true, "rationales": ["A body board is used on the beach. the others are on the mountain.", "The man has a body board.", "It is shorter than a surf board."], "image": "val2014/COCO_val2014_000000246686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175142, "question_id": "fp8GDKAJitiqcgEtu4o6Ci", "question": "What is the swinging bench called?", "choices": ["porch swing", "hanging swing", "dangerous", "outdoor loveseat"], "correct_choice_idx": 0, "direct_answers": ["porch swing", "porch swing", "swing", "swing", "benches duck", "porch swing", "swing", "porch swing", "porch swing", "swing"], "difficult_direct_answer": false, "rationales": ["The swinging bench is called a porch swing because it is swung from a balcony.", "The swinging benches are known as porch wings.", "That is what a swinging bench is called."], "image": "train2014/COCO_train2014_000000175142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147442, "question_id": "fp8evobuBKAvW3CmiveQXx", "question": "What is the photographer definitely higher than?", "choices": ["people", "clouds", "tower", "air plane"], "correct_choice_idx": 0, "direct_answers": ["ground", "plane", "ground", "people", "tarmac", "ground", "photo", "plane", "airplane", "ground"], "difficult_direct_answer": false, "rationales": ["Higher then the people.", "The photographer is inside of the building.", "The photographer is higher than the people."], "image": "train2014/COCO_train2014_000000147442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334675, "question_id": "fpWxoo4fySK6miscJqF3ge", "question": "What location is this?", "choices": ["zoo", "bus depot", "subway", "carnival"], "correct_choice_idx": 1, "direct_answers": ["bus depot", "bus depot", "transit center", "bus depot", "bus yard", "bus depot", "parking lot", "bus parking", "bus garage", "bus depot"], "difficult_direct_answer": false, "rationales": ["That is a bus depot.", "A large parking lot has numerous buses parked their.", "When approximately 38 buses are parked side by side in a massive parking lot, the place is going to be called a \"bus depot\"!."], "image": "val2014/COCO_val2014_000000334675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398772, "question_id": "fpZjg7k8C5d8b6An9usVmb", "question": "What is this place?", "choices": ["camp", "hospital", "watch factory", "chapel"], "correct_choice_idx": 1, "direct_answers": ["hospital ward", "bedroom", "nursing bed", "dorm", "hospital", "hospital", "bedroom", "hospital", "studio apartment", "dormitory"], "difficult_direct_answer": false, "rationales": ["There is a white hospital bed with the tray for eating. there are crutches behind the chair for the person to walk with. those rooms are common in those medical facilities.", "This is a hospital bed, with its bedtray and sidearms.", "There is medical equipment on the refrigerator."], "image": "train2014/COCO_train2014_000000398772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273379, "question_id": "fpbCk7HSzZX8PxudVrGUM2", "question": "You can take this bus to what area of England?", "choices": ["berkshire", "norfolk", "bristol", "cheshire"], "correct_choice_idx": 1, "direct_answers": ["norfolk", "holt", "holt", "wet areas", "holt", "holt", "downtown", "wet areas", "holt", "holt"], "difficult_direct_answer": false, "rationales": ["Holt is in the norfolk region.", "The bus goes to norfolk.", "According to the sign on the bus, it is headed to holt. an internet search of the english town of holt provided the county where holt is located."], "image": "train2014/COCO_train2014_000000273379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285358, "question_id": "fpgk4pieVcezDf3ERD566A", "question": "What sort of nut is on this treat?", "choices": ["chestnut", "walnut", "coconut", "peanut"], "correct_choice_idx": 2, "direct_answers": ["walnut", "macadamia", "coconut pecan", "hazelnut", "coconut", "chestnuts", "pine", "cake", "hazelnuts", "coconut"], "difficult_direct_answer": true, "rationales": ["The nut is coconut.", "There are coconuts flakes on the outside of the cake.", "It's shredded coconut on it."], "image": "train2014/COCO_train2014_000000285358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276766, "question_id": "fppSfyaEHiw3wPcbWTwBcW", "question": "What is seen brewing here?", "choices": ["magic", "tea", "storm", "coffee"], "correct_choice_idx": 2, "direct_answers": ["storm", "storm", "storm", "storm", "storm", "storm", "storm clouds", "storm", "bus", "storm"], "difficult_direct_answer": false, "rationales": ["There are very dark clouds overhead", "The sky is cloudy. it is getting ready to rain.", "Storm clouds are brewing."], "image": "train2014/COCO_train2014_000000276766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425948, "question_id": "fpwHG5xUmn7LpvJq9xQr2q", "question": "What is the green object the girl is carrying?", "choices": ["purse", "stereo", "bucket", "lunch box"], "correct_choice_idx": 2, "direct_answers": ["sand bucket", "bucket", "bucket", "bucket", "bucket", "bucket", "pail", "bucket", "sand bucket", "bucket"], "difficult_direct_answer": false, "rationales": ["The girl in the photo is on the beach. the green object in her hand would most likely be used to carry water or sand. a bucket would do the job.", "The girl is carrying a green bucket.", "The object is a bucket."], "image": "val2014/COCO_val2014_000000425948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384635, "question_id": "fqSnaioUkrJksk6U6XWp56", "question": "What does this vehicle ride on?", "choices": ["water", "air currents", "roads", "rails"], "correct_choice_idx": 3, "direct_answers": ["tracks", "tracks", "tracks", "rail", "railroad track", "rails", "tracks", "rails", "rails", "tracks"], "difficult_direct_answer": false, "rationales": ["The train can only ride on rails given its wheel configuration.", "The other options don't apply to train travel in this image.", "It is on the rails of the train tracks"], "image": "val2014/COCO_val2014_000000384635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47737, "question_id": "fqhWg4bKRvqmPkVK73kkSB", "question": "What takes up more room on the plate?", "choices": ["mushrooms", "carrots", "peppers", "apples"], "correct_choice_idx": 1, "direct_answers": ["carrots", "carrots", "food", "carrots", "food", "carrots", "carrots", "carrots", "carrots", "food"], "difficult_direct_answer": false, "rationales": ["The carrots are enormous.", "They are very large chunks", "The carrots have been sliced in large intervals."], "image": "train2014/COCO_train2014_000000047737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295007, "question_id": "fqkE54oq3CGVRADLqRZnxL", "question": "Why is this bus articulated?", "choices": ["keep apart", "mistake", "wide turns", "broken"], "correct_choice_idx": 2, "direct_answers": ["decir", "more passengers", "more space", "more passengers", "stopped", "no idea", "getting passengers", "carry more", "maneuverability", "wide turns"], "difficult_direct_answer": true, "rationales": ["Some buses that are extended for extra room will usually have a flexible area in the center to help with maneuverability on sharp turns.", "The bus can't make wide turns.", "The bus makes turns."], "image": "train2014/COCO_train2014_000000295007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243796, "question_id": "fqvXYx47hyLWLbWRRogh66", "question": "How do these people know each other?", "choices": ["teammates", "rivals", "neighbors", "coworkers"], "correct_choice_idx": 3, "direct_answers": ["friends", "coworkers", "coworkers", "friends", "coworkers", "coworkers", "coworkers", "work together", "same office", "coworkers"], "difficult_direct_answer": false, "rationales": ["They sit in front of computers to do work.", "These people are all coworkers.", "They are all around a large table filled with laptops"], "image": "train2014/COCO_train2014_000000243796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409178, "question_id": "fqzdL4z675T4v9Dxxna4xu", "question": "Why is she wearing glasses?", "choices": ["fashion", "costume", "disguise", "safety"], "correct_choice_idx": 3, "direct_answers": ["protect eyes", "protection", "protection", "see better", "snow protection", "protection", "protection", "protection", "safety", "block snow"], "difficult_direct_answer": false, "rationales": ["The goggles the woman is wearing protect her face from snow.", "The glasses that the skier is wearing are for the skier's safety because it protects his eyes", "Skiing can be very unpredictable so it's a good idea to wear glasses to stay safe; just in case something goes wrong."], "image": "val2014/COCO_val2014_000000409178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100128, "question_id": "fr37xjM2TCaRSWZvggoqjR", "question": "What purple vegetable toppings are on the first pie?", "choices": ["onions", "carrots", "cauliflower", "mushrooms"], "correct_choice_idx": 0, "direct_answers": ["onion", "onions", "onion", "onions", "onions", "onion", "onions", "onions", "onions", "onion"], "difficult_direct_answer": false, "rationales": ["Red onions are purple.", "There are a bunch of purple onions sitting on top of the pizza pie.", "The veggies are onions."], "image": "train2014/COCO_train2014_000000100128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244557, "question_id": "frKF5GumQQaipLitaZJ8q4", "question": "Oil holding capacity per batch of this machine is what?", "choices": ["50l", "20l", "10l", "15l"], "correct_choice_idx": 3, "direct_answers": ["35 pounds", "liters", "six liters", "unknown", "ten", "7.5 liters", "15litres", "15l", "onehundred", "lot"], "difficult_direct_answer": true, "rationales": ["The capacity of the machine is 15 liters.", "A vat like this is too small to hold more than 15 liters.", "That is how much the machine takes."], "image": "train2014/COCO_train2014_000000244557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129334, "question_id": "frZ8ZBJ6RhUUcMFELgJZSn", "question": "What are the little things on the screen on the left called?", "choices": ["icons", "bugs", "snowflakes", "cracks"], "correct_choice_idx": 0, "direct_answers": ["dots", "no image", "shortcut icons", "icons", "icons", "icons", "icons", "icons", "icons", "icons"], "difficult_direct_answer": false, "rationales": ["They are small pictures that tell you what each program is.", "Small gray images can be seen on a screen.", "There are several file names on the screen."], "image": "train2014/COCO_train2014_000000129334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168569, "question_id": "frsKyC5yF7DF2SJzSheSdC", "question": "What is holding the tennis racquet?", "choices": ["old man", "baby", "old lady", "wolf"], "correct_choice_idx": 1, "direct_answers": ["baby", "toddler", "child", "baby", "baby", "baby", "baby", "baby", "toddler", "child"], "difficult_direct_answer": false, "rationales": ["The raquet is being held by a person who is smaller than the racquet, indicating it is a toddler or baby.", "A baby who seems happy is having the the racquet.", "A baby is holding the tennis racquet because the racquet is almost as big as the baby"], "image": "train2014/COCO_train2014_000000168569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149993, "question_id": "fs5HgGFLXndTKUXimQ44sS", "question": "What kind of geological rock formations appear on the outcrops?", "choices": ["pyrite", "siltstone", "dolomite", "sandstone"], "correct_choice_idx": 1, "direct_answers": ["hills", "boulders", "boulders", "limestone", "rock", "mountain", "igneous", "rocks", "siltstone", "boulders"], "difficult_direct_answer": false, "rationales": ["The rocks appear to be siltstone and that would explain all the spots where the rocks extend from the dirt.", "There are siltstone formations.", "That is the type of stone that is on the ground."], "image": "train2014/COCO_train2014_000000149993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567308, "question_id": "fsJNGaaiSjMEyqyUHoaZc9", "question": "Who are the people standing behind the gate?", "choices": ["coaches", "travelers", "spectators", "jury"], "correct_choice_idx": 2, "direct_answers": ["ball boys", "spectators", "spectators", "spectators", "spectators", "spectators", "spectators", "fans", "fans", "spectators"], "difficult_direct_answer": false, "rationales": ["There are spectators watching.", "The fans are seen behind the camera.", "The people are fans."], "image": "val2014/COCO_val2014_000000567308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33444, "question_id": "fsb5mUVaUA8wkp4krNE7aW", "question": "Why did the dog jump in the air?", "choices": ["eat", "catch", "flip", "greet"], "correct_choice_idx": 1, "direct_answers": ["catch frisbee", "catch", "playing frisbee", "catch frisbee", "catch frisbee", "catch", "catch frisbee", "catch frisbee", "grab frisbee", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["The dog got the frisbee.", "The pair are at a frisbee dog show where the objective is to throw frisbees and have the dogs catch them. the dog has the frisbee in its mouth meaning a catch recently happened and likely was the cause for jumping into the air.", "He has a frisbee in his mouth."], "image": "train2014/COCO_train2014_000000033444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336887, "question_id": "fsf7gaSJwAszHFecs7PJjS", "question": "What area is the image from?", "choices": ["sky", "beach", "forest", "car"], "correct_choice_idx": 1, "direct_answers": ["beach", "beach", "beach", "beach", "beach", "beach", "beach", "beach", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["There is water behind the structure.", "The ocean is in the background.", "The area is a beach."], "image": "train2014/COCO_train2014_000000336887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166585, "question_id": "fswVCmh8hmXnF8GF8HfAmm", "question": "What animals are sitting in the boat?", "choices": ["dolphin", "cat", "dog", "frog"], "correct_choice_idx": 2, "direct_answers": ["dogs", "dogs", "dog", "dogs", "dogs", "dogs", "dog", "dogs", "dogs", "dog"], "difficult_direct_answer": false, "rationales": ["The animal is a dog as it is seen clearly.", "Dogs are on the boat.", "Two labs are in a boat on the water."], "image": "train2014/COCO_train2014_000000166585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253087, "question_id": "ftxKbmg8hAATiQmbPD7e8m", "question": "What is he about to do?", "choices": ["cut cake", "fall over", "cut himself", "cut girl"], "correct_choice_idx": 0, "direct_answers": ["cut cake", "cut cake", "cut cake", "cut cake", "cutting cake", "cut cake", "cut cake", "cut cake", "cut cake", "cut cake"], "difficult_direct_answer": false, "rationales": ["He is holding a knife and penetrating the cake with it, indicating that he is going to cut it into slices for eating.", "The man wants to cut into the cake.", "The bride and groom are near a food item and are happy. they are holding a knife."], "image": "train2014/COCO_train2014_000000253087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474614, "question_id": "ftzDAXDwRdUtyuPuwRHNtP", "question": "What style food are the persons standing here fans of?", "choices": ["fasting", "korean", "cafeteria style", "mexican"], "correct_choice_idx": 1, "direct_answers": ["korean", "korean bbq", "bbq", "korean", "korean", "korean tacos", "tacos", "korean", "korean", "korean barbeque"], "difficult_direct_answer": false, "rationales": ["There is a sign for korean bbq tacos.", "There is a sign on the food truck. it indicates the style of food that is being sold.", "Korean food is named on the sign."], "image": "train2014/COCO_train2014_000000474614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404013, "question_id": "fu5yPcxRpg8RZC792GaRbq", "question": "What can you pull from the circular object on the wall?", "choices": ["floss", "condom", "toilet paper", "tampon"], "correct_choice_idx": 2, "direct_answers": ["toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toilet paper"], "difficult_direct_answer": false, "rationales": ["Toilet paper always comes out of rolls that are rounded.", "You might pull toilet paper from the black container on the wall.", "These types of objects are generally not found in homes, but do the same thing in homes which is house toilet paper."], "image": "train2014/COCO_train2014_000000404013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306266, "question_id": "fuA78LNFBSn6ABFxzXRChY", "question": "If the bench instantly went away what would happen?", "choices": ["man falls", "car falls", "nothing", "fence opens"], "correct_choice_idx": 0, "direct_answers": ["boy falls", "scatting", "fall", "injured skateboarder", "fall", "man fall", "fall down", "skater falls", "fall", "man falls"], "difficult_direct_answer": false, "rationales": ["The bench would cause the man to fall.", "The man would fall if he is not sitting on the bench.", "The man is on a skateboard on top of the bench balanced on one side, without the bench he would be on the ground."], "image": "train2014/COCO_train2014_000000306266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268293, "question_id": "fuCyTa43XhAKJKcucNTnSf", "question": "What manner will the person here hit the ball?", "choices": ["forehanded", "head butt", "two handed", "backhanded"], "correct_choice_idx": 3, "direct_answers": ["backhand", "backhanded", "backhand", "backhanded", "backhand", "backhand", "backhand", "backhand", "backhand", "backhanded"], "difficult_direct_answer": false, "rationales": ["The man has his tennis racket in front of him and is about to hit the ball using the forehand position.", "This man is preparing to strike the ball with his knuckles towards the ball. this would be called a backhand in tennis.", "A man is reaching back, across his body with a tennis racket, in preparation to hit a ball."], "image": "val2014/COCO_val2014_000000268293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223790, "question_id": "fuKoJennE8SWQnSL9hqkww", "question": "What are they expecting to be poured into the upside down glass?", "choices": ["soda", "seltzer", "gatorade", "wine"], "correct_choice_idx": 3, "direct_answers": ["wine", "water", "wine", "wine", "wine", "wine", "wine", "water", "water", "wine"], "difficult_direct_answer": false, "rationales": ["The upside down glass is a wine glass.", "Wine is expected to be poured in the upside down glass because it is a wine glass", "They want wine."], "image": "train2014/COCO_train2014_000000223790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429635, "question_id": "funmAi9AZYJ9gtnbuE5xGd", "question": "What are the blue structures in the snow made out of?", "choices": ["putty", "plaster", "ice", "plastic"], "correct_choice_idx": 2, "direct_answers": ["ice", "ice", "rock glacier", "rock", "ice", "ice", "ice", "ice", "stone", "snow crystals"], "difficult_direct_answer": false, "rationales": ["The structures are ice.", "The blue structures are icy floes.", "Those are all ice in the mountains."], "image": "train2014/COCO_train2014_000000429635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369253, "question_id": "fv2XApYjat974n28PMWJ5L", "question": "What does this setting most resemble?", "choices": ["college campus", "tundra", "circus", "desert"], "correct_choice_idx": 0, "direct_answers": ["university", "college campus", "college campus", "campus", "college campus", "school", "nature", "church", "courtyard", "school"], "difficult_direct_answer": false, "rationales": ["The setting is a college campus.", "The building looks like it is at a university.", "The building looks like an old college campus."], "image": "train2014/COCO_train2014_000000369253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4360, "question_id": "fv3dBdcFDnv6eaNiSeJeeh", "question": "What affords this motorcycle a green hue?", "choices": ["hair", "astro turf", "paint", "wig"], "correct_choice_idx": 1, "direct_answers": ["carpet", "moss", "grass", "vegetation", "fake grass", "astroturf", "astroturf", "cover", "astro turf", "plants"], "difficult_direct_answer": true, "rationales": ["The covering appears to be grass, but artificial. artificial grass is referred to as answer a.", "There is fake grass.", "There is grass on the motorcycle. it is artificial, not real, grass."], "image": "train2014/COCO_train2014_000000004360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580945, "question_id": "fv4m2Dbn23vSvN5JzdSkto", "question": "What breakfast food do they sell at this store?", "choices": ["steak", "sushi", "bagels", "ice cream"], "correct_choice_idx": 2, "direct_answers": ["bagels", "bagels", "pastries", "bagels", "bagels", "donuts", "donuts", "bagels", "doughnuts", "bagels"], "difficult_direct_answer": false, "rationales": ["You can tell what they sell if you look in the background to the right.", "They are available behind the counter.", "They sell bagels because you see them displayed on the stands"], "image": "train2014/COCO_train2014_000000580945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487322, "question_id": "fv8mRgqnHHUY5KqgFqnzsr", "question": "What color is one of the kneepads?", "choices": ["black", "yellow", "green", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "red", "red", "blue", "red", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["Unless you are colorblind you can discern what colors are the kneepads.", "The kneepads are red and blue.", "The kneepads are red and blue."], "image": "train2014/COCO_train2014_000000487322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417143, "question_id": "fvUwSevcg2WAX2LQUoRhaA", "question": "Where are these people located?", "choices": ["restaurant", "classroom", "office", "theater"], "correct_choice_idx": 0, "direct_answers": ["restaurant", "some bar", "restaurant", "bar", "restaurant", "bar", "bar", "bar", "bar", "bar"], "difficult_direct_answer": false, "rationales": ["All the bottles are a clear indication that this is not a classroom or a theatre. the only way there would be bottles in an office would be at a party and it's obvious just by looking that this is not an office.", "The people are located at a cafe or a bar.", "These people are located in a restaurant with a bar."], "image": "train2014/COCO_train2014_000000417143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15151, "question_id": "fvmcTwU4Z8zBUxSkwPiQXL", "question": "What is the towel on the ground for?", "choices": ["drying hands", "wiping sweat", "washing face", "bathing"], "correct_choice_idx": 1, "direct_answers": ["wiping sweat", "singer", "mic", "wiping sweat", "sweating", "sweat", "sweat", "wiping sweat", "wiping sweat", "dry sweat"], "difficult_direct_answer": false, "rationales": ["The towel is to keep the man from getting sweaty.", "The towel is for sweat.", "The man is sweating on stage so he will use the towel to dry himself."], "image": "train2014/COCO_train2014_000000015151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198545, "question_id": "fvxPSENAAtmobtsE6GvvgX", "question": "What is the purpose of all these objects?", "choices": ["good luck", "decorative", "hiding", "for sale"], "correct_choice_idx": 1, "direct_answers": ["decoration", "decoration", "decoration", "decor", "decoration", "decorative", "decoration", "decoration", "prevent rain", "decorations"], "difficult_direct_answer": false, "rationales": ["By the vibrant colors and setting you can easily tell what they were trying to accomplish in the picture.", "The purpose is to decorate.", "Colorful umbrellas are hung from the ceiling. umbrellas hung from the ceiling could not be easily retrieved in cases of rain."], "image": "train2014/COCO_train2014_000000198545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42285, "question_id": "fw2qP4cwEKA7JpDxerPHVh", "question": "What type of animal is in the image?", "choices": ["chickens", "ducks", "cows", "dogs"], "correct_choice_idx": 0, "direct_answers": ["chickens", "chickens", "chicken", "chicken", "chickens", "chicken", "chicken", "chicken", "chicken", "chicken"], "difficult_direct_answer": false, "rationales": ["The animal is a chicken.", "Chickens have beaks and are shown living in the chicken coop.", "The webbed feet and beaks of these birds identify them as chickens."], "image": "train2014/COCO_train2014_000000042285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453799, "question_id": "fw8SftonQtNqUWgUSSsAoQ", "question": "What is the company of the motorcycle?", "choices": ["cruisers", "kawasaki", "triumph", "husqvarna"], "correct_choice_idx": 2, "direct_answers": ["indian", "honda", "ducati", "honda", "honda", "yamaha", "not harley", "triumph", "honda", "harley"], "difficult_direct_answer": false, "rationales": ["The motorcycle is branded to be triumph's.", "The company is triumph.", "The logo of the motorcycle indicates triumph."], "image": "val2014/COCO_val2014_000000453799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160185, "question_id": "fw8xkz9RpbggzYXaCMum9v", "question": "How is the ball likely to be moved along first?", "choices": ["batted", "kicked", "carried", "dribbled"], "correct_choice_idx": 1, "direct_answers": ["kicked", "kick", "kicked", "foot", "kicked", "foot", "kicked", "kicked", "foot", "kick"], "difficult_direct_answer": false, "rationales": ["The ball is kicked.", "It's soccer; they can only move the ball with their feet.", "The ball moves in soccer when you kick it."], "image": "val2014/COCO_val2014_000000160185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467726, "question_id": "fwKcUaVTdpUGT6LB9zUNUQ", "question": "What item resembles the item all the way to the left?", "choices": ["buzzsaw", "pitchfork", "chainsaw", "gramophone"], "correct_choice_idx": 1, "direct_answers": ["fork", "fork", "flowers", "flower", "fork", "fork", "fork", "fork", "pitchfork", "fork"], "difficult_direct_answer": false, "rationales": ["The fork has four prongs.", "The item has prongs and is partially made out of metal.", "The item is a pitchfork."], "image": "val2014/COCO_val2014_000000467726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398153, "question_id": "fwQhspJFzD5YQJvCsGt5tK", "question": "How are the knifes hanging on the wall?", "choices": ["hooks", "twine", "magnetic strip", "nails"], "correct_choice_idx": 2, "direct_answers": ["magnets", "mounted stripes", "magnets", "magnet", "magnetic strip", "magnet", "rack", "magnetic", "magnetic strip", "rack"], "difficult_direct_answer": false, "rationales": ["The knifes hold up by applying the blades on a magnetic strip", "The entire thing is magnetized.", "They are on there by a magnetic strip"], "image": "val2014/COCO_val2014_000000398153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260116, "question_id": "fwmj6WCYBB69yAFtgouF3w", "question": "What helmets do MLB players wear?", "choices": ["rawlings", "protector", "is2", "none"], "correct_choice_idx": 0, "direct_answers": ["baseball helmets", "rawlings", "rawlings", "batting helmet", "protection", "rawlings", "batting helmet", "rawlings", "baseball helmets", "rawlings"], "difficult_direct_answer": false, "rationales": ["That is the helmets the players are wearing.", "That who provides the equipment.", "Rawlings makes most of the helmets for baseball players."], "image": "train2014/COCO_train2014_000000260116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255137, "question_id": "fwsC8Z2EZ7B2W42v6njCac", "question": "Why is he standing far from the animals?", "choices": ["avoid spooking", "dangerous animals", "he's afraid", "private property"], "correct_choice_idx": 0, "direct_answers": ["safety", "for safety", "fear", "avoid spooking", "herding them", "prevent scaring", "heading them", "taking photo", "for photograph", "safety"], "difficult_direct_answer": true, "rationales": ["Most herd animals are easily startled, as they are prey to many different species of carnivores. if a visitor moves slowly, these animals will stay calm, just as the visitor is.", "He doesn't want to scare them and cause them to run.", "He wants them to stay calm and eat."], "image": "train2014/COCO_train2014_000000255137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220972, "question_id": "fx2FuX9JKnDeys538gDg94", "question": "What has this person jumped up to do?", "choices": ["swing", "answer", "help", "speak"], "correct_choice_idx": 0, "direct_answers": ["hit ball", "hit ball", "swing", "hit ball", "hit ball", "hit ball", "hit ball", "swing", "serve", "hit ball"], "difficult_direct_answer": false, "rationales": ["Jumping up to hit the ball", "The person wants to swing his racquet.", "The woman is holding a racket and a ball is headed towards it so she will likely swing."], "image": "train2014/COCO_train2014_000000220972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554875, "question_id": "fxAheUMdxKiYW6pdpo7uqx", "question": "What color would this horse be called?", "choices": ["gray", "silver", "white", "snow"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "albino", "white", "albino"], "difficult_direct_answer": false, "rationales": ["Unless you are colorblind you can tell what color the horse is.", "This horse is mostly white.", "It is mostly this color"], "image": "train2014/COCO_train2014_000000554875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402795, "question_id": "fxaLk9g9X8MCVKeGezDeim", "question": "What's the lady doing?", "choices": ["eating", "reading", "texting", "playing"], "correct_choice_idx": 2, "direct_answers": ["texting", "checking phone", "sitting", "reading phone", "texting", "using phone", "watching cell", "checking cellphone", "texting", "texting"], "difficult_direct_answer": false, "rationales": ["Given she's on her phone, this is the most likely answer.", "The lady is texting on her phone.", "This woman appears engaged in what she's seeing on her mobile phone's screen and she is using both thumbs to input something onto it."], "image": "val2014/COCO_val2014_000000402795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283199, "question_id": "fxcfRL8Uzsax4nSeebwqsH", "question": "How many birds are hanging upside down?", "choices": ["four", "none", "three", "two"], "correct_choice_idx": 1, "direct_answers": ["zero", "one", "one", "zero", "one", "zero", "none", "zero", "zero", "zero"], "difficult_direct_answer": false, "rationales": ["The only bird pictured is standing on the branch, right side up.", "There are none upside down.", "There is one bird visible and it is not upside down."], "image": "train2014/COCO_train2014_000000283199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431566, "question_id": "fxdABMViMfDtqddGme7NcA", "question": "What type of shot is about to be hit?", "choices": ["backhand", "slice", "forehand", "serve"], "correct_choice_idx": 3, "direct_answers": ["serve", "serve", "serve", "serve", "serve", "serve", "serve", "serve", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The person has their arms raised above.", "They are throwing the ball up in the air first.", "The shot is a serve."], "image": "train2014/COCO_train2014_000000431566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534734, "question_id": "fxkW3NejpXvkG6MRcVmoUU", "question": "Who is playing tennis with the boy wearing yellow pants?", "choices": ["woman", "young man", "boy", "old man"], "correct_choice_idx": 0, "direct_answers": ["woman", "white shorts", "friend", "his opponent", "girl", "woman", "another boy", "another boy", "man", "woman"], "difficult_direct_answer": false, "rationales": ["A woman is standing on the other side of a tennis court from a boy in yellow.", "A woman is playing tennis.", "There are only two people playing tennis and the woman is not a boy or wearing yellow pants."], "image": "val2014/COCO_val2014_000000534734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330719, "question_id": "fxyAPJjGh32zaYsBm8h5Ji", "question": "What might the cables seen here move along?", "choices": ["gulls", "eskimos", "commuters", "skiers"], "correct_choice_idx": 3, "direct_answers": ["hill", "ski lift", "hill", "ski lift", "track", "track", "skiers", "snowboard", "skiers", "cable car"], "difficult_direct_answer": false, "rationales": ["There is a cable that takes people up and down the slopes. it takes these people across the open expanse.", "The skiers can move.", "The people are moving up the mountain to go skiing."], "image": "train2014/COCO_train2014_000000330719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88928, "question_id": "fyKj8YsY4wREriTHU9UgAp", "question": "Why is the baby wrapped in a blanket?", "choices": ["to bathe", "to dry", "to sleep", "dressing up"], "correct_choice_idx": 2, "direct_answers": ["warmth comfort", "to sleep", "for warmth", "for warm", "warmth", "cold", "warmth security", "warmth", "needs warmth", "keeping warm"], "difficult_direct_answer": true, "rationales": ["The baby is wrapped in a blanket to stay comfy and warm for a nap.", "The baby is asleep.", "The baby is sleeping."], "image": "train2014/COCO_train2014_000000088928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118343, "question_id": "fyS2SvxJYfHTi8KjYfjrrD", "question": "What is the short man doing?", "choices": ["texting", "playing game", "taking photo", "online shopping"], "correct_choice_idx": 2, "direct_answers": ["texting", "checking phone", "photo", "texting", "on phone", "reading", "taking photo", "taking photo", "checking phone", "taking picture"], "difficult_direct_answer": false, "rationales": ["The man takes photos.", "He is taking a picture of what looks to be something in his hand he received like an award or promotion.", "The short man is taking a photo with his phone."], "image": "val2014/COCO_val2014_000000118343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239657, "question_id": "fyf7PcVEXDSv9nRJ77zHc8", "question": "What can usually be found in this setting?", "choices": ["tigers", "camels", "horses", "fish"], "correct_choice_idx": 3, "direct_answers": ["fish", "boats", "fish", "sea", "fish", "dock", "fish", "docked boats", "fish", "dock"], "difficult_direct_answer": false, "rationales": ["These types of animals live in the sea.", "There is water visible in this scene and takes up most of the space. fish are frequently found in water and none of the other answers are.", "Fish can be found in the water."], "image": "val2014/COCO_val2014_000000239657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495786, "question_id": "fyj2KnVTf9y8nBbttcBWB9", "question": "The people in the carriage are most likely what?", "choices": ["trainers", "dog catchers", "tourists", "employees"], "correct_choice_idx": 2, "direct_answers": ["riding", "riders", "tourists", "carriage rides", "tourists", "riding", "tourists", "tourists", "tourists", "tourists"], "difficult_direct_answer": false, "rationales": ["It's common for tourists to check out a city in horse carriages.", "The carriage is pulled by horses which is not that common.", "The people are mostly providing tourists for this coach service."], "image": "val2014/COCO_val2014_000000495786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382203, "question_id": "fymVjZ4foRTxGUm2Ucuvf5", "question": "What type of communication is she using?", "choices": ["verbal", "written", "gestures", "electronic"], "correct_choice_idx": 3, "direct_answers": ["phone", "cellphone", "electronic", "phone", "cell phone", "phone", "cellphone", "cellphone", "cellphone", "cell phone"], "difficult_direct_answer": false, "rationales": ["The person is using electronic communication on their phone.", "The woman is holding a cell phone in her hand.", "A cell phone is considered an a device."], "image": "train2014/COCO_train2014_000000382203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370926, "question_id": "fzawMbZtTgUQpQx6eTKKGC", "question": "What kind of passengers will travel in this bus?", "choices": ["children", "celebrities", "students", "elders"], "correct_choice_idx": 2, "direct_answers": ["students", "students", "students", "kids", "students", "students", "children", "students", "students", "school kids"], "difficult_direct_answer": false, "rationales": ["It's a school bus.", "This is a yellow school bus that school kids ride on.", "The vehicle is a school bus. it carries pupils to and from school."], "image": "train2014/COCO_train2014_000000370926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55155, "question_id": "fzx34uiRsbMsDK79iPZBFh", "question": "What is the girl in red pointing to?", "choices": ["phone", "letter", "her hand", "her mother"], "correct_choice_idx": 0, "direct_answers": ["phone", "phone", "phone", "cellphone", "cell phone", "phone", "cellphone", "phone", "phone", "cellphone"], "difficult_direct_answer": false, "rationales": ["The girl is holding a device that is used for calling.", "The girl has a phone.", "The girl has a phone."], "image": "val2014/COCO_val2014_000000055155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431450, "question_id": "fzzB2scS2RU8xtWL9vPibY", "question": "How does the rainbow object in the air get elevated?", "choices": ["propulsion", "sheer willpower", "speed", "wind"], "correct_choice_idx": 3, "direct_answers": ["wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind temperature", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["The wind will pick it up and lift it into the air.", "It is a kite, connected to a string and it is very light and held up when there are strong winds.", "The rainbow colored kite is powered by gusts of air that move it higher and higher."], "image": "train2014/COCO_train2014_000000431450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101636, "question_id": "fzzpFUNdf4jo5cAVbKQc4T", "question": "What else is often put where the bikes are now?", "choices": ["tents", "motorcycles", "cars", "guns"], "correct_choice_idx": 0, "direct_answers": ["tents", "canoe", "luggage", "luggage", "luggage", "luggage", "luggage", "canoe", "luggage", "canoe"], "difficult_direct_answer": false, "rationales": ["Camping gear can be transported on the luggage rack of a car.", "Tents can be put on top of the cars.", "The tents are pitched."], "image": "val2014/COCO_val2014_000000101636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20464, "question_id": "g2F9KkJJEYnUzWs7nwANoW", "question": "Which city department are they?", "choices": ["police", "water", "library", "fire"], "correct_choice_idx": 3, "direct_answers": ["fire department", "fire", "fire department", "fire", "fire department", "cin", "fire department", "fire department", "fire", "fire department"], "difficult_direct_answer": false, "rationales": ["There are hoses on the carriages. there is a sign on the garage.", "The sign on the building says what the department is.", "The department is the fire one."], "image": "train2014/COCO_train2014_000000020464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97362, "question_id": "g2YCarwHunmXv9GNwZqmkG", "question": "Who is the woman feeding the cows?", "choices": ["farm worker", "animal rescuer", "visitor", "zookeeper"], "correct_choice_idx": 2, "direct_answers": ["visitor", "caretaker", "farmer", "farmer", "guest", "farmer", "farmer", "their owner", "farmer", "owner"], "difficult_direct_answer": false, "rationales": ["The woman is just a visitor.", "The setting these cows are in is likely a farm based on the presence of cows and the pen they are in. the most likely person to be responsible for feeding lifestock on the farm would be someone who worked there.", "The woman is a visitor."], "image": "val2014/COCO_val2014_000000097362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286751, "question_id": "g2ZwWYQLoa5Fbq9TVtfP7c", "question": "What are the green fruits?", "choices": ["pears", "grapes", "apples", "watermelons"], "correct_choice_idx": 0, "direct_answers": ["pears", "pear", "pear", "pears", "pears", "pear", "pears", "pears", "pears", "pears"], "difficult_direct_answer": false, "rationales": ["They are round and have a stem.", "They are pears.", "The size and shape are not consistent with watermelons, grapes, or apples."], "image": "train2014/COCO_train2014_000000286751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576098, "question_id": "g2cC4Gmfb6EGk3mUA5dat9", "question": "What does the girl in yellow do to the birds?", "choices": ["hide", "grab them", "hit them", "feed them"], "correct_choice_idx": 3, "direct_answers": ["feed them", "feed them", "feed", "feed", "feeding birds", "feed", "feed them", "feed them", "feed them", "feed them"], "difficult_direct_answer": false, "rationales": ["The girls is crouched around birds that are gathering.", "The girl has some food on her hand.", "There is bread on the ground and she is leaning over. birds like to eat bread."], "image": "train2014/COCO_train2014_000000576098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363887, "question_id": "g32nhD7fm2CV4yJMuvq2JP", "question": "What sort of emergency is the truck seen here prepared to immediately handle?", "choices": ["break ins", "none", "heart attack", "towing"], "correct_choice_idx": 1, "direct_answers": ["fire", "driver", "fire", "fire", "fire", "none", "driver", "driver", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["Fire departments can deal with medical emergencies.", "It is broken down. the tires are buried in the dirt.", "The truck isn't an emergency truck."], "image": "val2014/COCO_val2014_000000363887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469605, "question_id": "g34BBzjpUfE4DM7ptLbjkV", "question": "What is behind the man that is standing?", "choices": ["fish", "dogs", "tents", "barbed wire"], "correct_choice_idx": 2, "direct_answers": ["tents", "tent", "tent", "tents", "tents", "tents", "tents", "tent", "tents", "tents"], "difficult_direct_answer": false, "rationales": ["There are tents behind the man.", "There are a bunch of tents standing behind the man who is talking on the phone.", "They are structures with pointed tops like tents."], "image": "train2014/COCO_train2014_000000469605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126910, "question_id": "g3KBmhUjJty5Qrjh5cHQ28", "question": "The flesh of which animal is likely contained her burger?", "choices": ["worm", "donkey", "pig", "cow"], "correct_choice_idx": 3, "direct_answers": ["cow", "beef", "beef", "cow", "beef", "cow", "cow", "beef", "cow", "cow"], "difficult_direct_answer": false, "rationales": ["A burger is usually made from beef which comes from a cow.", "She is holding a regular hamburger. the meat is beef.", "Hamburgers usually contain beef."], "image": "train2014/COCO_train2014_000000126910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247190, "question_id": "g3Qtt3N2YKtisZx6PcxVFe", "question": "Why is there a chain on the elephant?", "choices": ["it's dangerous", "it's injured", "it's property", "it's stylish"], "correct_choice_idx": 2, "direct_answers": ["it's property", "security", "guide", "control", "leash", "tame", "ownership", "captive animal", "elephant", "captivity"], "difficult_direct_answer": true, "rationales": ["A person is riding an elephant.", "A chain helps keep an animal under control.", "The chain is property."], "image": "train2014/COCO_train2014_000000247190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36351, "question_id": "g3daQZviSbVaUoibyoV45i", "question": "What kind of surface are they playing on?", "choices": ["wood", "mud", "sand", "grass"], "correct_choice_idx": 1, "direct_answers": ["dirt", "mud field", "mud", "dirt", "sand", "mud", "dirt", "field", "dirt", "dirt field"], "difficult_direct_answer": false, "rationales": ["It is churned up wet soil", "The players are walking in mud.", "The ground is brown and it looks like dirt that got wet and turned into mud."], "image": "train2014/COCO_train2014_000000036351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561761, "question_id": "g4PKU3NFLar8Q2XXeuRL8H", "question": "What is the weather today good for?", "choices": ["staying inside", "using umbrella", "skiing", "flying kites"], "correct_choice_idx": 3, "direct_answers": ["flying kites", "kites", "flying kites", "flying kites", "kites", "kites", "flying kites", "flying kites", "flying kites", "flying kites"], "difficult_direct_answer": false, "rationales": ["The weather is for kites.", "The weather is windy.", "The trees look to be in spring or autumn when it is windier than summer or winter. the spectators are all looking up like they are there for the same reason."], "image": "train2014/COCO_train2014_000000561761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154145, "question_id": "g4YPFh55yCTGnGejFqj6aZ", "question": "What will this player do?", "choices": ["foul", "return ball", "serve", "quit"], "correct_choice_idx": 1, "direct_answers": ["hit", "return", "forehand", "hit ball", "hit ball", "hit ball", "hit ball", "return ball", "hit ball", "return serve"], "difficult_direct_answer": false, "rationales": ["The player would return the ball.", "This player is about to return the ball.", "The ball is coming at him from his opponent and he is going to hit it back."], "image": "val2014/COCO_val2014_000000154145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236125, "question_id": "g57P7eEZJqQiQJaLerYjNV", "question": "What profession are these women in?", "choices": ["nurses", "cashiers", "teachers", "nuns"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "nuns", "chefs", "nuns", "nuns", "nuns", "nun", "nuns", "nuns", "nuns"], "difficult_direct_answer": false, "rationales": ["They are wearing habits. habits are a special uniformfor women of the church.", "They're nuns.", "The women have veils and modest clothes on."], "image": "train2014/COCO_train2014_000000236125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13076, "question_id": "g5b6RV3dgiLcXshn8JbZDT", "question": "Which type dressing does the diner eating here prefer?", "choices": ["none", "thousand island", "ranch", "green goddess"], "correct_choice_idx": 2, "direct_answers": ["blue cheese", "ranch", "ranch", "ranch", "ranch", "ranch", "ranch", "creamy", "ranch", "ranch"], "difficult_direct_answer": false, "rationales": ["The green shows the dressing cord of the area.", "There is white dressing on the salad.", "The dressing on the salad is creamy and white."], "image": "train2014/COCO_train2014_000000013076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103331, "question_id": "g5cTWpBXNgKBZM2swoGyi5", "question": "Why does the dog have a silver tag on its collar?", "choices": ["medical use", "for amusement", "breeding", "identification"], "correct_choice_idx": 3, "direct_answers": ["for owners", "shows ownership", "owner name", "identification", "identification information", "identification", "nametag", "identification", "for identification", "identification"], "difficult_direct_answer": false, "rationales": ["Dog tags are used to identify the dog.", "The tag's attached to dogs are normally for ease of return to owner should they become lost.", "The tag has identification information on it."], "image": "train2014/COCO_train2014_000000103331.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412468, "question_id": "g5rgunMxYz8SGLwmQF7yfj", "question": "What is used to attach the table?", "choices": ["glue", "metal", "heat", "water"], "correct_choice_idx": 0, "direct_answers": ["wood boards", "legs", "nails", "glue", "glue", "glue", "wire", "plate", "nails", "wood"], "difficult_direct_answer": false, "rationales": ["This is an adhesive used in construction", "The wood planks on the table are attached by glue.", "Glue helps attach the table."], "image": "train2014/COCO_train2014_000000412468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153231, "question_id": "g5uAhtuarSQdXMWGnS7YtB", "question": "Why is the TV on?", "choices": ["entertain children", "decorative", "watch game", "distraction"], "correct_choice_idx": 2, "direct_answers": ["sports", "watch it", "watch game", "entertainment", "football game", "entertainment", "football game", "watching game", "display sport", "watching game"], "difficult_direct_answer": false, "rationales": ["Most parents will keep the television on to entertain their little ones.", "The tv is showing the game.", "It's football and in many american households it is imperative to have it on television even if you're eating."], "image": "val2014/COCO_val2014_000000153231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468537, "question_id": "g64TBt6cFS4XAAnW47cDdT", "question": "What type of beverages are in bottles on the counter?", "choices": ["juice", "soda", "water", "alcohol"], "correct_choice_idx": 3, "direct_answers": ["alcohol", "liquor", "alcoholic beverages", "wine", "liquor bottles", "alcohol", "alcohol", "alcohol", "wine", "alcoholic"], "difficult_direct_answer": false, "rationales": ["Alcohol is in the bottles.", "The beverages are alcohol.", "Soda, water, and juice are normally in plastic containers. these bottles are glass."], "image": "train2014/COCO_train2014_000000468537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354819, "question_id": "g6FRcGA6oVdrKvynQqvsZT", "question": "What type of event is highly likely to happen at this intersection?", "choices": ["car meet", "car crash", "car race", "parade"], "correct_choice_idx": 1, "direct_answers": ["car accident", "accident", "accident", "slip", "hurricane", "accident", "collision", "car crash", "accident", "collision"], "difficult_direct_answer": false, "rationales": ["The event is a car crash.", "The cars will crash without a stop sign.", "A car crashed on a tree."], "image": "train2014/COCO_train2014_000000354819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260748, "question_id": "g6JvgNrcfhFgXKSMRPcbUx", "question": "What fruit is on the far left side of the table?", "choices": ["banana", "strawberry", "dragonfruit", "peach"], "correct_choice_idx": 0, "direct_answers": ["banana", "oranges", "banana", "banana", "banana", "banana", "banana", "orange", "banana's", "bananas"], "difficult_direct_answer": false, "rationales": ["Bananas are long and yellow.", "The fruit is a banana.", "Bananas are the furthest on the left."], "image": "train2014/COCO_train2014_000000260748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186883, "question_id": "g6UiyPdLZRvAYXuH44WtKZ", "question": "Who wrote the famous song inspired by this highway?", "choices": ["michael jackson", "elvis presley", "bobby troup", "dean martin"], "correct_choice_idx": 2, "direct_answers": ["cow", "bobby troup", "bobby troup", "bobby troup", "john denver", "bobby troup", "bobby troup", "route 66", "bobby troup", "rock singer"], "difficult_direct_answer": false, "rationales": ["The label on the item says route 66.", "It was written by him.", "Bobby troup wrote a song inspired by route 66."], "image": "train2014/COCO_train2014_000000186883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63084, "question_id": "g6eFpTuGkVrZtMEwB4FmCg", "question": "In which liquid primarily were the cucumbers stored in?", "choices": ["vinegar", "coca cola", "none", "sugar water"], "correct_choice_idx": 0, "direct_answers": ["water", "vinegar", "brine", "vinegar", "brine", "vinegar", "vinegar", "vinegar", "brine", "water"], "difficult_direct_answer": false, "rationales": ["Cucumbers are pickled in vinegar.", "The cucumbers have been transformed into pickles which is an item commonly served with sandwiches at restaurant lunches. there is clearly a sandwich visible and what looks to be a pickle which would have been put in answer a to turn a cucumber into a pickle.", "Normally cucumbers are stored in a vinegar and water mixture to pickle them,"], "image": "train2014/COCO_train2014_000000063084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196205, "question_id": "g6h9SUNnnmC96JmuYTJzdN", "question": "Why is the person carrying a box?", "choices": ["it's baggage", "tossing it", "throwing away", "marketing"], "correct_choice_idx": 0, "direct_answers": ["carry on", "it's baggage", "travel cargo", "luggage", "travel", "luggage", "supplies", "transporting", "boarding plane", "shipping"], "difficult_direct_answer": true, "rationales": ["It is something they need on their trip", "The person has baggage.", "They are about to get on a plane."], "image": "train2014/COCO_train2014_000000196205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564099, "question_id": "g6nu7dcoFc44n3Nafk7QdX", "question": "What is the black framed device on top of the book?", "choices": ["frame", "folder", "screen", "tablet"], "correct_choice_idx": 3, "direct_answers": ["ipad", "tablet", "kindle", "tablet", "tablet", "tablet", "tablet", "monitor", "tablet", "monitor"], "difficult_direct_answer": false, "rationales": ["The black framed device is an ipad.", "This is a flat electronic device with a screen", "It has a screen like a tablet does."], "image": "train2014/COCO_train2014_000000564099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340736, "question_id": "g6t6HWkgxm6D6d5shqhVPa", "question": "What scene is this likely to be?", "choices": ["buffet restaurant", "market", "commercial kitchen", "domestic kitchen"], "correct_choice_idx": 2, "direct_answers": ["bar", "restaurant kitchen", "restaurant", "bar", "restaurant", "outdoor party", "kitchen", "buffet", "commercial kitchen", "bar"], "difficult_direct_answer": false, "rationales": ["This scene has a lot of commercial sized containers.", "It's probably a restaurant kitchen", "This is likely to be a commercial kitchen."], "image": "train2014/COCO_train2014_000000340736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198382, "question_id": "g7Gd86eyhDVgKqcMyhLzgh", "question": "What is the species of the nearest bird?", "choices": ["thrush", "duck", "seagull", "pelican"], "correct_choice_idx": 3, "direct_answers": ["pelican", "pelican", "pelican", "seagull", "seagull", "pelican", "pelican", "pelican", "pelican", "pelican"], "difficult_direct_answer": false, "rationales": ["The bird has a large beak.", "The long beak and long neck are this type of bird.", "The species is the pelican."], "image": "train2014/COCO_train2014_000000198382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73560, "question_id": "g7nJUkFWqrPwSyZALcB66y", "question": "What sort of sauce is found in the plastic cup?", "choices": ["cheese", "salsa", "marinara", "taco"], "correct_choice_idx": 2, "direct_answers": ["marinara", "salsa", "ketchup", "marinara", "marinara", "marinara", "marinara", "barbecue", "marinara", "dipping sauce"], "difficult_direct_answer": false, "rationales": ["Marinara is provided for dipping.", "That is the sauce used for pizza.", "They are eating pizza. when eating pizza the associated red sauce would be answer a."], "image": "val2014/COCO_val2014_000000073560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4108, "question_id": "g82WcYEh5oonQhCwdtN4Y5", "question": "What green item is in the cup?", "choices": ["mole", "salsa", "relish", "green tea"], "correct_choice_idx": 2, "direct_answers": ["sprinkles", "relish", "relish", "relish", "relish", "relish", "relish", "can't tell", "relish", "relish"], "difficult_direct_answer": false, "rationales": ["The cup is being used to hold a green condiment for hot dogs.", "The cup is filled with that.", "The item is relish."], "image": "val2014/COCO_val2014_000000004108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1591, "question_id": "g89YwUB3Zhg78q5LT5AXSU", "question": "Why do kitchen tables have tablecloths?", "choices": ["religious reasons", "hygiene", "decoration", "superstition"], "correct_choice_idx": 1, "direct_answers": ["messes", "cleaning", "protection", "protect decorate", "protect table", "decoration", "for spills", "dust", "protection", "hygiene"], "difficult_direct_answer": true, "rationales": ["It protects the wood and is easy to clean", "A table has a large cloth laid over it. table cloths are commonly used on kitchen tables to keep the table clean and free from dents and scratches.", "Cloths protect the table."], "image": "val2014/COCO_val2014_000000001591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492417, "question_id": "g8QyswkgDsY4JTCRakndfy", "question": "What is the woman using the bat to do?", "choices": ["defend herself", "dance", "hit ball", "exercise"], "correct_choice_idx": 2, "direct_answers": ["hit ball", "hit ball", "hitting ball", "hit ball", "hit", "hit ball", "hit ball", "softball", "hit ball", "baseball"], "difficult_direct_answer": false, "rationales": ["The woman is using the baseball bat to hit a ball.", "This is indicated by the ball coming toward her.", "The woman is trying to hit the ball with her bat."], "image": "train2014/COCO_train2014_000000492417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270193, "question_id": "g8XL5cXoHjwJw3dvaKEZK6", "question": "What does the silo behind the train store?", "choices": ["grain", "water", "coal", "corn"], "correct_choice_idx": 1, "direct_answers": ["grain", "coal", "water", "goods", "supplies", "holding grains", "grain", "grain", "grain", "water"], "difficult_direct_answer": false, "rationales": ["The silo stores water.", "The silo is for water.", "Silos are used to contain water."], "image": "train2014/COCO_train2014_000000270193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542301, "question_id": "g8YXPnS6rzGmHbBgPHZCvf", "question": "Which one is probably the most proficient rider?", "choices": ["striped pants", "tan pants", "none", "blue pants"], "correct_choice_idx": 1, "direct_answers": ["instructor", "holding reigns", "tan pants", "left woman", "left woman", "left woman", "adult woman", "jockey", "blonde woman", "woman"], "difficult_direct_answer": false, "rationales": ["The one in tan pants is the oldest and is leading the horse.", "The one in the beige is dressed like a rider and is holding the reins which means she probably owns the horse.", "The one in tan looks like a staff member."], "image": "train2014/COCO_train2014_000000542301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499309, "question_id": "g8d24U7rQGtbQCMAbfxCwK", "question": "What might this room be used for?", "choices": ["golfing", "sleeping", "reading", "bowling"], "correct_choice_idx": 2, "direct_answers": ["meetings", "convening", "community", "studying", "community center", "reading", "entertaining", "display", "reading", "meetings"], "difficult_direct_answer": false, "rationales": ["Someone might also d accidentally, but the lighting suggests it would be best for a.", "There are several seats in this room where someone can look at a book.", "The room is not suitable for playing sports, and it does not have beds for sleeping."], "image": "train2014/COCO_train2014_000000499309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193380, "question_id": "g8gBgvCKqffg4EW6v23v6E", "question": "What area is shown here?", "choices": ["city highway", "race track", "bike path", "harbor"], "correct_choice_idx": 3, "direct_answers": ["marina", "marina", "harbor", "pier", "boat dock", "waterfront", "dock", "marina", "marina", "marina"], "difficult_direct_answer": false, "rationales": ["There are wooden floors with guard rails and water below with a wooden walkway. there are boats all around.", "The area contains water and boats.", "There are boats parked by the boards."], "image": "val2014/COCO_val2014_000000193380.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414389, "question_id": "g92FGJYvbXoymZYwkcrKDb", "question": "What kind of scene is this?", "choices": ["surreal", "flashback", "blank", "realistic"], "correct_choice_idx": 0, "direct_answers": ["surreal", "dream", "posed", "picturesque", "picture", "posed", "fantasy", "snowy", "posed", "rural snowy"], "difficult_direct_answer": false, "rationales": ["The picture does not look too real.", "These things don't go together naturally.", "The photo seems dreamlike."], "image": "train2014/COCO_train2014_000000414389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145945, "question_id": "g9Anhe7jg65fvFrSkWoHSi", "question": "What type of infrastructure does this city avoid having?", "choices": ["low overpasses", "sidewalks", "aqueducts", "bridges"], "correct_choice_idx": 0, "direct_answers": ["subway", "skyscrapers", "low overpasses", "taxis", "street parking", "vague", "modern", "border", "unstable infrastructure", "sidewalks"], "difficult_direct_answer": true, "rationales": ["The infrastructure has overpasses.", "The double decker bus would lead for this city to have larger underpasses of bridges.", "This is a double decker and is really high in the sky and has to be careful of low overhangs."], "image": "train2014/COCO_train2014_000000145945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421689, "question_id": "g9YrW6t36gpx9BHR7GQKPQ", "question": "What human body part does the cat lean back on?", "choices": ["arm", "neck", "leg", "head"], "correct_choice_idx": 2, "direct_answers": ["legs", "leg", "foot", "leg", "leg", "legs", "leg", "leg", "foot", "leg"], "difficult_direct_answer": false, "rationales": ["A foot can be seen by the cat.", "The person is wearing jeans and the cat is lying next to the jeans.", "There is a human foot attached to it"], "image": "train2014/COCO_train2014_000000421689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19036, "question_id": "g9mWj8HuSmsMfk4S4XY8VU", "question": "How are the three people feeling in the kitchen?", "choices": ["disappointed", "angry", "proud", "hostile"], "correct_choice_idx": 2, "direct_answers": ["happy", "proud", "happiness", "pizza", "proud", "pizza", "accomplished happy", "proud", "good", "pleased"], "difficult_direct_answer": false, "rationales": ["They are smiling and showing off their finished pizza.", "The three people are smiling and are showing off food. they are not angry, disappointed, or hostile.", "The three people in the kitchen are all together and have a look of satisfaction on their faces."], "image": "val2014/COCO_val2014_000000019036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180002, "question_id": "g9uGiLFmBa4RC6GPw5QSzk", "question": "Why are they holding their boards?", "choices": ["posing", "taking break", "showing off", "cleaning them"], "correct_choice_idx": 0, "direct_answers": ["they surf", "posing", "surf boards", "picture taking", "posing", "its theirs", "to pose", "posing", "posing", "surfer picture"], "difficult_direct_answer": false, "rationales": ["The people are posing for the camera.", "They are all mugging for the camera.", "Getting their picture taken."], "image": "train2014/COCO_train2014_000000180002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515053, "question_id": "gA3ga4MFypBdCKGSG5VgEH", "question": "Why are the people holding microphones?", "choices": ["to sing", "to announce", "for karaoke", "to interview"], "correct_choice_idx": 3, "direct_answers": ["reporters", "reporters", "asking questions", "press conference", "taking quotes", "live tv", "to interview", "interview", "press conference", "record speech"], "difficult_direct_answer": false, "rationales": ["People are holding microphones to interview the man at the podium.", "The men in ties appear to be giving a press conference to a multitude of recording devices bearing the name of different media outlets.", "The people are interviewing."], "image": "train2014/COCO_train2014_000000515053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432890, "question_id": "gA97k9V53Lp3UumVuFK5uU", "question": "Where is the woman jumping?", "choices": ["dorm room", "hotel room", "restaurant", "hospital room"], "correct_choice_idx": 0, "direct_answers": ["room", "dorm room", "kitchen", "bedroom", "bedroom", "dorm room", "floor", "air", "dorm room", "apartment"], "difficult_direct_answer": false, "rationales": ["The room is personalized and everything is in a small area", "A girl is jumping in the doorway of a small room.", "The room is small and the person appears to be wearing a school sweatshirt."], "image": "train2014/COCO_train2014_000000432890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353507, "question_id": "gAFGMH8Ebh3YHrVVAeJFiA", "question": "Where might you go in the item the cake is shaped as?", "choices": ["ocean", "volcano", "cave", "space"], "correct_choice_idx": 3, "direct_answers": ["space", "outer space", "space", "moon", "moon", "space", "space", "space", "space", "outer space"], "difficult_direct_answer": false, "rationales": ["The cake is shaped as a spacecraft.", "The cake is shaped like a rocket.", "You can see the stars at space."], "image": "train2014/COCO_train2014_000000353507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375568, "question_id": "gATXRYbJ5qkRV9qMaZ7vUQ", "question": "What item is in the wrong setting?", "choices": ["dog", "tree", "chairs", "grass"], "correct_choice_idx": 2, "direct_answers": ["chairs", "chairs", "chairs", "chairs", "chairs", "chairs", "chairs", "chair bench", "dog", "plastic chairs"], "difficult_direct_answer": false, "rationales": ["These are indoor chairs. people normally wouldn't sit in them in the grass.", "The chairs are out of place because the grass is running wild outside so it is unusual to see chairs in a setting", "There's no reason for the chairs to be outside."], "image": "train2014/COCO_train2014_000000375568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215584, "question_id": "gAuEuNZnscEAdkbjLti8XP", "question": "How is the woman on the bench feeling?", "choices": ["scared", "annoyed", "happy", "angry"], "correct_choice_idx": 2, "direct_answers": ["happy", "happy", "happy", "happy", "sad", "sad", "happy", "happy", "surprised", "pleased"], "difficult_direct_answer": false, "rationales": ["The woman is smiling.", "She has a smile on her face.", "She is smiling."], "image": "train2014/COCO_train2014_000000215584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306305, "question_id": "gB9cdza4U9MCjmhhtLogvN", "question": "Where are these two girls at?", "choices": ["restaurant", "liquor store", "convenience store", "supermarket"], "correct_choice_idx": 1, "direct_answers": ["liquor store", "wine store", "liquor store", "wine store", "wine shop", "wine shop", "liquor mart", "wine store", "winery", "wine store"], "difficult_direct_answer": false, "rationales": ["This is obvious based on all of the alcohol bottles.", "These girls are in front of wine.", "This business exclusively sells bottles of alcohol."], "image": "train2014/COCO_train2014_000000306305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160629, "question_id": "gBEkryF2ZhprupUjfzLKof", "question": "What color are the sweet fruits?", "choices": ["green", "yellow", "brown", "red"], "correct_choice_idx": 1, "direct_answers": ["yellow", "yellow green", "yellow green", "yellow green", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The pineapples and bananas are yellow.", "The bananas are yellow.", "These are bananas"], "image": "train2014/COCO_train2014_000000160629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57515, "question_id": "gBPuJeXJqsdeukuy9JaAUg", "question": "Why the umbrella on a sunny day?", "choices": ["it's new", "hiding", "prank", "block sun"], "correct_choice_idx": 3, "direct_answers": ["shade", "sun protection", "prevent sunburn", "playing", "shade", "shade", "sun protection", "shade", "for shade", "block sun"], "difficult_direct_answer": false, "rationales": ["It gives a lot of ray protection.", "The people are using it to block the sun.", "The umbrella blocks sun."], "image": "train2014/COCO_train2014_000000057515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417571, "question_id": "gBWNwbRYuYZYTZx7JMBjKT", "question": "What are these boys playing on the beach?", "choices": ["paddle ball", "basketball", "soccer", "frisbee"], "correct_choice_idx": 2, "direct_answers": ["soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer", "soccer"], "difficult_direct_answer": false, "rationales": ["You can tell from the color patterns and the fact they are using there legs only as to what sport is being played.", "The boys play soccer.", "There is a soccer ball in the air between the guys, so it's apparent that the game they're playing on the beach is soccer."], "image": "train2014/COCO_train2014_000000417571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458079, "question_id": "gBeMD3XNFgoXFGfXzh5QHD", "question": "What sort of goods are sold in the shop depicted in the blue sign?", "choices": ["train tickets", "snacks", "souvenirs", "coffee"], "correct_choice_idx": 2, "direct_answers": ["souvenir", "souvenirs", "souvenir", "gifts", "souvenirs", "souvenir's", "souvenir", "souvenirs", "souvenirs", "souvenir"], "difficult_direct_answer": false, "rationales": ["The shop sells souvenirs.", "The shop sign states \"souvenirs\" on it.", "The goods are souvenirs."], "image": "val2014/COCO_val2014_000000458079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554070, "question_id": "gBzHawFmnE6YSgymSb3UjR", "question": "This man likely idolizes what athlete?", "choices": ["tony hawk", "aaron judge", "michael jordan", "mike tyson"], "correct_choice_idx": 0, "direct_answers": ["tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk", "tony hawk"], "difficult_direct_answer": false, "rationales": ["The man is skateboarding.", "The man idolizes hawk.", "Tony hawk is a skateboarder."], "image": "train2014/COCO_train2014_000000554070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385084, "question_id": "gCaZNC2Nhv72gtvdcU9uvd", "question": "What is the pale top of the wave called?", "choices": ["sea foam", "whitecap", "top", "bubbles"], "correct_choice_idx": 1, "direct_answers": ["foam", "whitecap", "crest", "crest", "foam", "crest", "crest", "break", "foam", "crest"], "difficult_direct_answer": false, "rationales": ["The top of the wave is white.", "It is called after the color of the top of the wave.", "They are white on top."], "image": "train2014/COCO_train2014_000000385084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214059, "question_id": "gCgBD94Hu5bsaLQCVDKsZp", "question": "What nation is this airport located at?", "choices": ["china", "korea", "japan", "india"], "correct_choice_idx": 2, "direct_answers": ["bus stand", "japan", "africa", "japan", "unknown", "middle eastern", "america", "slovenia", "japan", "nae"], "difficult_direct_answer": false, "rationales": ["Nagasaki is where japan is.", "You can look at the color of the license plate to know it's in japan.", "The nation is japan."], "image": "train2014/COCO_train2014_000000214059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335824, "question_id": "gChkubpTmRvbLvU8wGhxSg", "question": "What type trees are visible here?", "choices": ["fir", "palm", "evergreen", "deciduous"], "correct_choice_idx": 3, "direct_answers": ["birch", "acacia", "barren ones", "deciduous", "pine", "alpine", "oak", "pine", "cypress", "pine"], "difficult_direct_answer": false, "rationales": ["The trees are deciduous since they've lost their leaves.", "The trees have lost their leaves.", "They have lost their leaves in the cold weather"], "image": "train2014/COCO_train2014_000000335824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519573, "question_id": "gCrBvbGMbkgyYGuehuB9d2", "question": "What do the people pictured near the art display all share the ability to do?", "choices": ["play golf", "skateboard", "play chess", "drive cars"], "correct_choice_idx": 1, "direct_answers": ["skateboard", "skateboard", "ride skateboard", "pose", "skateboard", "skateboarding", "skating", "sit", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["They have skateboards.", "The people all know how to ride the skateboards they have with them.", "The people are using skateboards."], "image": "val2014/COCO_val2014_000000519573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143017, "question_id": "gDA6aoWmTA2ACDcfHj4hLT", "question": "What does pommes mean in english?", "choices": ["apples", "melon", "banana", "kiwi"], "correct_choice_idx": 0, "direct_answers": ["apples", "apples", "apples", "french fries", "apple", "pomegranate", "apples", "apples", "apples", "apples"], "difficult_direct_answer": false, "rationales": ["This is the english translation", "\"pommes\" in english mean \"apples\". as an aside, a \"pomme de terre\" in french means \"apple of the earth\", ie, a potato.", "In english it is for apples."], "image": "train2014/COCO_train2014_000000143017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384626, "question_id": "gDJ5fFuxoEiWYSLjyxAG8Q", "question": "Why does the biker have his head covered?", "choices": ["uniform", "cleanliness", "costume", "safety"], "correct_choice_idx": 3, "direct_answers": ["protection", "safety", "safety", "safety", "safety", "protection", "safety", "safety", "protection", "safety"], "difficult_direct_answer": false, "rationales": ["He has a hat on for safety.", "The biker has his head covered with a helmet for safety purposes.", "A helmet is used for protection."], "image": "train2014/COCO_train2014_000000384626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567274, "question_id": "gDKTLw2XnbHNoQsSUbvDho", "question": "The largest fruit shown here is what type of Fruit?", "choices": ["cherry", "citrus", "pome", "melon"], "correct_choice_idx": 3, "direct_answers": ["grapefruit", "melons", "grapefruit", "oranges", "grapefruit", "melon", "grapefruit", "grapefruit", "grape fruit", "grapefruit"], "difficult_direct_answer": false, "rationales": ["They are striped watermelons", "The largest fruit is on the far right and is a watermelon.", "There are watermelons on the right."], "image": "train2014/COCO_train2014_000000567274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41397, "question_id": "gDggKaB5tPg6cYjorJ4v7v", "question": "Where would someone eating on the bench throw the remains?", "choices": ["pavement", "ground", "can", "bench"], "correct_choice_idx": 2, "direct_answers": ["trash bin", "trash bin", "behind", "trash can", "garbage", "trash can", "trash can", "trash can", "can", "trash can"], "difficult_direct_answer": false, "rationales": ["There is a trash receptacle near these benches so it is likely anyone eating on this bench would throw away whatever trash they had into the can.", "Trash always goes into the garbage can.", "There is a trash bin near them."], "image": "val2014/COCO_val2014_000000041397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423618, "question_id": "gDrXQedYP37D9D7DoUCa8d", "question": "What other tool is required to treat the carrot?", "choices": ["fork", "spoon", "blender", "peeler"], "correct_choice_idx": 3, "direct_answers": ["knife", "peeler", "peeler", "peeler", "peeler", "peeler", "peeler", "vegetable peeler", "peeler", "shredder"], "difficult_direct_answer": false, "rationales": ["A peeler is required to peel the skin off the carrot.", "The tool is a peeler.", "The rough outer layer of the carrot needs to be removed before it can be eaten or cut up."], "image": "val2014/COCO_val2014_000000423618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38317, "question_id": "gE2CdLt5rhsoWWMQGN3rSG", "question": "What is the large circular object called?", "choices": ["windmill", "rollercoaster", "ferris wheel", "ski coaster"], "correct_choice_idx": 2, "direct_answers": ["ferris wheel", "ferris wheel", "triser", "ferris wheel", "ferris wheel", "ferris wheel", "ferris wheel", "ferris wheel", "ferris wheel", "big wheel"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this type of option or match the shape.", "The circular object is for sightseeing and riding.", "The object is a perfect circle and it is large. there are little seats on it."], "image": "train2014/COCO_train2014_000000038317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467854, "question_id": "gE9iAm65QfEasRxeju94KL", "question": "What is he doing?", "choices": ["smoking cigarette", "plowing field", "stealing horsed", "feeding horsed"], "correct_choice_idx": 1, "direct_answers": ["farming", "plowing", "plowing", "farming", "plowing", "plowing", "plowing", "plowing field", "riding", "plowing"], "difficult_direct_answer": false, "rationales": ["Horses serve many purposes however when they are in a field with a harness and plow on them, they are probably doing this.", "He has farm equipment hooked up to the horses", "He is using the horses to get the field ready for planting crops."], "image": "val2014/COCO_val2014_000000467854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216243, "question_id": "gEEN5Jx5akea5YcVUKbDUA", "question": "Where is this player headed?", "choices": ["second base", "visitors stands", "home plate", "home"], "correct_choice_idx": 2, "direct_answers": ["dugout", "dugout", "home plate", "off field", "team", "dugout", "home plate", "dugout", "on-deck circle", "home base"], "difficult_direct_answer": false, "rationales": ["The man is walking to home plate because is next at bat.", "He has a bat and getting ready for his turn to try to hit the ball", "The player is heading to the home plate."], "image": "val2014/COCO_val2014_000000216243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518511, "question_id": "gEFpYEnWhQaiXUHnDwYvpq", "question": "For which city does this fire truck perform work?", "choices": ["new york", "kentucky", "arkansas", "new jersey"], "correct_choice_idx": 0, "direct_answers": ["new york", "edny", "new york", "new york", "new york", "new york", "new york", "new york", "asian", "new york"], "difficult_direct_answer": false, "rationales": ["It has fdny on the side of it", "Fdny is on the side of the fire truck stating where it's from.", "It says fdny on the door."], "image": "train2014/COCO_train2014_000000518511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464967, "question_id": "gEN55UoJT7q6mPuvCbJNZs", "question": "How is this stove powered?", "choices": ["wood", "coal", "electric", "gas"], "correct_choice_idx": 0, "direct_answers": ["wood", "wooden fire", "fire", "wood", "wood", "wood", "wood", "coal", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["There are stacks of it in the room", "A wood burning stove is shown. wood is used to heat wood burning stoves.", "A wood powered stove is commonly used to cook this type of food and there are piles of wood all around the oven which would make it the likeliest power source."], "image": "train2014/COCO_train2014_000000464967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516273, "question_id": "gEZGNGfcGERv9pk4kdPWMq", "question": "What type of sandwich is being served?", "choices": ["club", "peanut butter", "barbeque", "tuna"], "correct_choice_idx": 2, "direct_answers": ["sloppy joe", "bbq", "bbq", "pulled pork", "barbecue", "corned beef", "barbecue", "pulled pork", "pulled pork", "barbeque"], "difficult_direct_answer": false, "rationales": ["The sandwich is bbq.", "There is some thick sauce in the buns.", "The sandwich is pulled pork covered in bbq sauce."], "image": "train2014/COCO_train2014_000000516273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515710, "question_id": "gEaEnEx2HygUk9WJzV6LXj", "question": "What kind of drink is on the table?", "choices": ["pepsi", "coca-cola", "sprite", "fanta"], "correct_choice_idx": 1, "direct_answers": ["soda", "soda", "coca cola", "coca cola", "sodas", "coca cola", "pop", "coca cola", "coca cola", "coca-cola"], "difficult_direct_answer": false, "rationales": ["A bottle with a red label and white script is on a table. coca cola has red with white lettering in the logo.", "The drink is coke.", "There is a bottle with a label visible in which the writing cocacola is readable."], "image": "train2014/COCO_train2014_000000515710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541071, "question_id": "gFFWkmHgcQEHRMYdMsKY8b", "question": "What is he doing?", "choices": ["is landing", "is falling", "is bouncing", "is flying"], "correct_choice_idx": 0, "direct_answers": ["flying kite", "flying kite", "flying paraglider", "flying kite", "is landing", "gliding", "parasailing", "air balloon", "kite flying", "flying kites"], "difficult_direct_answer": false, "rationales": ["He is landing on the sand.", "He wants to fly the kite.", "The man is landing."], "image": "val2014/COCO_val2014_000000541071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551338, "question_id": "gFHDMvNntiUjnQCij9SkGV", "question": "The large container just outside the fence here likely contains what?", "choices": ["frisbees", "golf balls", "water", "oil"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "gasoline", "water", "water", "gasoline", "water", "water"], "difficult_direct_answer": false, "rationales": ["The container has water.", "It looks like a collection device", "The black container outside of the field is rain collecting tank."], "image": "val2014/COCO_val2014_000000551338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252549, "question_id": "gFZSfPDTs5jpx97JxhYmmU", "question": "What section of the store is this area?", "choices": ["women's section", "babies' section", "kids' section", "men's section"], "correct_choice_idx": 3, "direct_answers": ["clothing", "clothing", "dress shirts", "clothing", "men's section", "men's", "mens clothing", "men's clothing", "ties", "men's apparels"], "difficult_direct_answer": false, "rationales": ["Mens shirts are on display.", "This is the men's section of the store.", "Collared shirts with ties are usually worn by men."], "image": "val2014/COCO_val2014_000000252549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453561, "question_id": "gFcEFqgXNvdFVHMmUmnbMb", "question": "What is the biggest danger here?", "choices": ["falling", "stroke", "drowning", "burning"], "correct_choice_idx": 2, "direct_answers": ["waves", "waves", "boats colliding", "drowning", "drowning", "drowning", "drowning", "waves", "drowning", "tipping over"], "difficult_direct_answer": false, "rationales": ["People who can't swim may die if they fall into the water.", "Of course c is the first danger followed by the biggest one, which is a.", "You can drown in the water if you fall."], "image": "train2014/COCO_train2014_000000453561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241155, "question_id": "gFitaQP7FSyRajGLGae4FR", "question": "What form of transportation is this?", "choices": ["subway", "bus", "train", "car"], "correct_choice_idx": 0, "direct_answers": ["subway train", "subway", "subway", "subway", "train", "subway", "subway", "subway", "train", "rail transportation"], "difficult_direct_answer": false, "rationales": ["The transportation is a subway.", "Buses, trains and cars don't run under the ground.", "Subway tracks are shown."], "image": "val2014/COCO_val2014_000000241155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190018, "question_id": "gFk9xXkTWvtUe66AjiqpAC", "question": "A small vessel for travelling over water propelled by oars sails or an engine is?", "choices": ["flight", "boat", "ship", "floater"], "correct_choice_idx": 1, "direct_answers": ["boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["Boats go on the water.", "A small vessel is meant to propel the sails of a boat.", "The vessel is a boat."], "image": "val2014/COCO_val2014_000000190018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21926, "question_id": "gFsYioUmdrBv9ux9MzPRgx", "question": "What type of area is shown?", "choices": ["coastal", "beach", "urban", "rural"], "correct_choice_idx": 2, "direct_answers": ["amusement park", "park", "zoo", "park", "urban", "urban", "urban", "park", "park", "carnival"], "difficult_direct_answer": false, "rationales": ["There are lots of people and buildings close together", "There are a lot of buildings and people walking around the city.", "The area is urban."], "image": "train2014/COCO_train2014_000000021926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482707, "question_id": "gFuShA35yyUC7FSk3ZRP77", "question": "What country is the bike manufacturer from?", "choices": ["japan", "america", "germany", "china"], "correct_choice_idx": 2, "direct_answers": ["germany", "germany", "germany", "germany", "england", "bmw", "germany", "england", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["The bike is made in germany.", "This is a bmw which is a german bike maker.", "Bmws come from germany."], "image": "val2014/COCO_val2014_000000482707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512070, "question_id": "gFytJJfmMJfzDZtuQpDPqq", "question": "What is the person likely to do next?", "choices": ["record something", "read message", "make call", "take photo"], "correct_choice_idx": 1, "direct_answers": ["check texts", "call", "read message", "hear message", "read it", "read message", "call someone", "call", "open phone", "open phone"], "difficult_direct_answer": false, "rationales": ["The person will read.", "A person is holding phone that displays a message informing of a message being received.", "This is an older type phone, where texting wasn't as popular as today so most likely he is calling someone."], "image": "val2014/COCO_val2014_000000512070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353666, "question_id": "gGEoUK7wevmTa2VP5it43Q", "question": "Why is the woman holding a phone to her ear?", "choices": ["as decoration", "to dance", "listening music", "making calls"], "correct_choice_idx": 3, "direct_answers": ["phone", "talking", "making calls", "talking", "to listen", "talking", "conversation", "to talk", "taking call", "talking"], "difficult_direct_answer": false, "rationales": ["These are conversations that are conducted through this small device.", "She is listening to a call.", "The woman is on the phone talking."], "image": "val2014/COCO_val2014_000000353666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500162, "question_id": "gGieskCMyPFkBHKgw3Cjb2", "question": "Alternative energy sources for wood fireplaces is what?", "choices": ["gasoline", "chemical", "electrical", "water"], "correct_choice_idx": 2, "direct_answers": ["electricity", "electricity", "electrical", "electric/ gas", "heater", "heater", "gas", "propane", "gas", "gas"], "difficult_direct_answer": false, "rationales": ["This is the most common alternative followed by natural gas.", "Turning on the heat from an electric source would be an alternative.", "You use electricity for fireplaces when you don't want them to be wood burning"], "image": "train2014/COCO_train2014_000000500162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87105, "question_id": "gGoiGdS6PmLtPfRgJvm7qw", "question": "Which one of these would be listed in his book?", "choices": ["state prisons", "federal banks", "hiking trails", "public schools"], "correct_choice_idx": 2, "direct_answers": ["maps", "periodic table", "maps", "instructions", "states", "map", "map", "hiking trails", "maps", "this one"], "difficult_direct_answer": false, "rationales": ["A man is holding a book of maps and a laptop in his lap.", "A guy holds a book of maps and a computer in his lap.", "Hiking trails don't fit into the contents of his book."], "image": "train2014/COCO_train2014_000000087105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146560, "question_id": "gGr3Tiu4QXLWNQw729bqcB", "question": "What is the pole part of?", "choices": ["cell tower", "phone line", "barber shop", "polo game"], "correct_choice_idx": 3, "direct_answers": ["polo match", "competition", "goalpost", "game", "polo", "tall pole", "goal", "horse polo", "polo game", "game"], "difficult_direct_answer": true, "rationales": ["The game of polo is played on horses. none of the other options make sense.", "This is the game that is played with sticks and balls on horses.", "The horseback riders wooden mallets tell us this is a game of polo in progress."], "image": "train2014/COCO_train2014_000000146560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528527, "question_id": "gGt7cJMdexAcBcorXgYFHx", "question": "What energy powers the splash?", "choices": ["electricity", "solar", "wind", "manual power"], "correct_choice_idx": 0, "direct_answers": ["water", "wind", "water", "fountain", "hydro", "electricity", "wind", "water", "hydro", "water"], "difficult_direct_answer": false, "rationales": ["To create the splashing in this picture electricity must be used to achieve the effect.", "The fans are powered by electric.", "The splash is powered by engines."], "image": "val2014/COCO_val2014_000000528527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436571, "question_id": "gH8DUu4eWHCqavK4BHtsMd", "question": "What does the large ships used to move?", "choices": ["nuclear energy", "electricity", "sails", "coal"], "correct_choice_idx": 2, "direct_answers": ["cargo", "sails", "wind", "cargo", "cargo", "sails", "wind", "cargo", "sails", "rutters"], "difficult_direct_answer": false, "rationales": ["They have large masts, which sails are lifted up onto. the sails use the wind to help the boat move and direct it.", "Those are sail boats and they use huge sails to move the boat in the water.", "The ship moves the sails which are attached. the other choices are not things moved via a ship."], "image": "train2014/COCO_train2014_000000436571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131743, "question_id": "gHNmXSCR4Fr5L4oLLgRgef", "question": "What are the people looking at?", "choices": ["virtual photographs", "virtual currency", "virtual games", "virtual webcasts"], "correct_choice_idx": 2, "direct_answers": ["game", "television", "television", "television", "screen", "game", "television", "game", "television", "virtual games"], "difficult_direct_answer": false, "rationales": ["The people are looking at wii games.", "The people look at virtual games.", "The people are holding a game console to play."], "image": "val2014/COCO_val2014_000000131743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468981, "question_id": "gHS7LdoSSddZQ5BULDChKd", "question": "What is the job of these horses?", "choices": ["jump", "race", "carry", "pull"], "correct_choice_idx": 3, "direct_answers": ["hauling carts", "pull cart", "pulling", "pulling", "pulling", "pulling cart", "pull wagon", "transport passengers", "pull", "pull wagon"], "difficult_direct_answer": false, "rationales": ["The vehicle behind the horses is a wagon and it is attached with harnesses to the horses. this is a vehicle that moves when the horses pull it and the horses are harnessed in position to do just that.", "They obvious do a with the cart.", "They are there to pull the carriage."], "image": "train2014/COCO_train2014_000000468981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326724, "question_id": "gHSQ6xAkRaJbpyKYigyzps", "question": "What action are the people taking?", "choices": ["falling", "descend", "riding", "ascend"], "correct_choice_idx": 3, "direct_answers": ["ski walking", "skiing", "playing", "skiing", "skiing", "cross-country skiing", "ascend", "skiing", "skiing cross/country", "skiing"], "difficult_direct_answer": false, "rationales": ["They are gaining elevation as they go up the hill", "The two skiers are walking in a forward lunging motion which indicates that they are going uphill.", "The people ascend."], "image": "train2014/COCO_train2014_000000326724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540544, "question_id": "gHj4bjf2onrNPZBSruz3tm", "question": "What color is the gaming system being used?", "choices": ["blue", "white", "gold", "brown"], "correct_choice_idx": 1, "direct_answers": ["wii", "black", "green", "wii", "white", "grey", "wii", "green", "white", "white"], "difficult_direct_answer": false, "rationales": ["Unless you are colorblind you can tell what color the system is.", "The console is on the floor and it is white.", "The gaming system being used is white."], "image": "val2014/COCO_val2014_000000540544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498339, "question_id": "gHjVjVPnAG6giTDQmCusaa", "question": "What are people here to do?", "choices": ["worship", "travel", "work", "shop"], "correct_choice_idx": 1, "direct_answers": ["train ride", "travel", "board train", "enter train", "ride train", "travel", "travel", "travel", "travel", "to travel"], "difficult_direct_answer": false, "rationales": ["Trains can transport people to other places.", "The people are at the train station to get from one place to another.", "The people travel."], "image": "val2014/COCO_val2014_000000498339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77351, "question_id": "gHzUsmVmxUgQfHsWJFPqHM", "question": "What sort of people are boarded on this bus?", "choices": ["house mothers", "tourists", "salemen", "coal miners"], "correct_choice_idx": 1, "direct_answers": ["english", "passengers", "passengers", "locals", "passengers", "adults", "tourists townspeople", "old people", "passengers", "tourists"], "difficult_direct_answer": false, "rationales": ["It seems to be a bus that takes groups to different sites or site seeing.", "They are vacationers looking at the area", "There are people with cameras."], "image": "val2014/COCO_val2014_000000077351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147556, "question_id": "gJrDBou9tFDCvNCiL7X93x", "question": "Why is the man sitting on the elephant?", "choices": ["to fight", "to punish", "to ride", "to clean"], "correct_choice_idx": 2, "direct_answers": ["riding", "riding", "riding", "riding", "to ride", "riding", "to ride", "riding", "to ride", "riding"], "difficult_direct_answer": false, "rationales": ["They are going for a ride on it.", "Generally if he is on top of the animals it is to ride.", "The man is going for a ride."], "image": "val2014/COCO_val2014_000000147556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488990, "question_id": "gJtLnEgKyLjfxT5tYrw2Uv", "question": "What is the hand on the traffic light telling those facing it?", "choices": ["go left", "walk", "go right", "stop"], "correct_choice_idx": 3, "direct_answers": ["stop", "stop", "stop", "don't cross", "not walk", "stop", "stop", "stop", "stop", "walk"], "difficult_direct_answer": false, "rationales": ["The hand is saying to stop and don't walk.", "The hand means to stop.", "The traffic light is indicating stop."], "image": "train2014/COCO_train2014_000000488990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469632, "question_id": "gJzyoms4yi4yUNtpVRJR5A", "question": "These animals are doing what?", "choices": ["standing", "sleeping", "eating", "climbing"], "correct_choice_idx": 0, "direct_answers": ["standing", "walking", "running", "standing", "walking", "trotting", "running", "running", "running", "standing"], "difficult_direct_answer": false, "rationales": ["The animals stand.", "The giraffes are on all fours walking around.", "They are upright will all four legs on the ground."], "image": "train2014/COCO_train2014_000000469632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247979, "question_id": "gK63gaSiSiT8BA2av3WHKQ", "question": "In what continent is this setting found?", "choices": ["australia", "europe", "asia", "africa"], "correct_choice_idx": 2, "direct_answers": ["asia", "asia", "asia", "asia", "asia", "asia", "asia", "japan", "asia", "china"], "difficult_direct_answer": false, "rationales": ["You can tell by the words on the vase, as to what region of the world it's from.", "It has chinese writing on the vase.", "There is a vase. the text on its side is in chinese."], "image": "train2014/COCO_train2014_000000247979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530013, "question_id": "gKPLpJSvLcxyzHwK54AnHE", "question": "What position will the person who stands holding the bat vertically play next?", "choices": ["catcher", "shortstop", "manager", "batter"], "correct_choice_idx": 3, "direct_answers": ["batter", "batter", "batter", "batter", "at bat", "batter", "batter", "batter", "first base", "batting"], "difficult_direct_answer": false, "rationales": ["He is waiting for his turn next to try to hit the ball.", "The person at bat is known as the batter.", "He's warming up for his turn"], "image": "val2014/COCO_val2014_000000530013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321500, "question_id": "gKYR2HYvJfUtJWXACir5sg", "question": "What would make the tagged objects easier to transport?", "choices": ["rope", "cart", "hammock", "bicycle"], "correct_choice_idx": 1, "direct_answers": ["handle", "wheels", "baggage cart", "cart", "cart", "wheels", "cart", "wheels", "luggage", "cart"], "difficult_direct_answer": false, "rationales": ["When put on a cart, it can transport all of the luggage at once instead of having to move each one individually.", "For these types of bags, they would need a roll away type of device to help them along faster.", "These five good-sized suitcases would be easy to stack neatly on a cart and roll right to a cab!."], "image": "train2014/COCO_train2014_000000321500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252839, "question_id": "gKeumC5XHjSA3Kjxx2oyJ8", "question": "The food source of cows creates oxygen through what process?", "choices": ["solar", "nuclear", "photosynthesis", "wind"], "correct_choice_idx": 2, "direct_answers": ["photosynthesis", "photosynthesis", "breathing", "photosynthesis", "photosynthesis", "photosynthesis", "photosynthesis", "photosynthesis", "cellular respiration", "photosynthesis"], "difficult_direct_answer": false, "rationales": ["The leaves gather sun and convert it", "Cows have plant-based diets. plants do not have photovoltaic panels, wind turbines, or nuclear reactors.", "The process of light synthesizing food helps feed the cows."], "image": "train2014/COCO_train2014_000000252839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58403, "question_id": "gKiMHP3snM8yGxtHbRxpJG", "question": "What are the two about to do?", "choices": ["do puzzles", "eat cake", "write paper", "play games"], "correct_choice_idx": 1, "direct_answers": ["eat cake", "eat cake", "eat cake", "eat cake", "eat", "eat", "eat cake", "eat cake", "eat cake", "eat cake"], "difficult_direct_answer": false, "rationales": ["There is a large cake right in front of them; we can assume they are about to eat it.", "There is a cake in front of the girls.", "The two women are sitting in front of a cake and one of them has an eating utensil in her hand."], "image": "train2014/COCO_train2014_000000058403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20188, "question_id": "gKmEDbHpKWJbJrg9635jod", "question": "What type of hat is the person holding?", "choices": ["bucket hat", "beanie", "fedora", "baseball cap"], "correct_choice_idx": 1, "direct_answers": ["toque", "toque", "beanie", "toque", "beanie", "beanie", "beanie", "beanie", "beanie", "beanie"], "difficult_direct_answer": false, "rationales": ["The person in the picture is holding a soft knitted head covering with a small hem. this also can used to described as a beanie which is popular with young men in cooler weather.", "The hat is a beanie.", "He has a beanie hat in his hands."], "image": "train2014/COCO_train2014_000000020188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269306, "question_id": "gLg9NRAokRtfUEzNGKD6zu", "question": "What are the birds doing with the pizza?", "choices": ["eating it", "guarding it", "cooking it", "attacking it"], "correct_choice_idx": 0, "direct_answers": ["eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating", "eating it"], "difficult_direct_answer": false, "rationales": ["The pizza is on the ground and they are pecking at it.", "They are holding their beaks to the pizza which means they are probably having a meal.", "The birds are snacking on it."], "image": "val2014/COCO_val2014_000000269306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370369, "question_id": "gM3vdur7qBAf73cTwVZEEw", "question": "What number is painted on the sheep on the left?", "choices": ["four", "two", "three", "one"], "correct_choice_idx": 2, "direct_answers": ["eight", "three", "three", "three", "three", "three", "eight", "three", "eight", "three"], "difficult_direct_answer": false, "rationales": ["He has a number 3 on hiim.", "On the sheep's fur is a number three.", "There is a number three on the fur in red."], "image": "train2014/COCO_train2014_000000370369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289019, "question_id": "gMFEacQHKaArQ2Sv8zi5Dp", "question": "What is being held by the person sitting highest?", "choices": ["wagon wheels", "spurs", "tickets", "reins"], "correct_choice_idx": 3, "direct_answers": ["reins", "reins", "reins", "reins", "reins", "reins", "reins", "reins", "leads", "reigns"], "difficult_direct_answer": false, "rationales": ["The driver with the reins is sitting up the highest.", "The reins are being held.", "The person is holding the reins because he is riding a chariot led by horses"], "image": "train2014/COCO_train2014_000000289019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154576, "question_id": "gMK9mUMSmPRn8rBvuNnS5q", "question": "These people are likely in what type of school?", "choices": ["college", "grammar school", "high school", "master's program"], "correct_choice_idx": 1, "direct_answers": ["sheep school", "elementary", "boarding school", "shepard", "homeschooling", "farming", "elementary", "husbandry", "grammar school", "trade"], "difficult_direct_answer": true, "rationales": ["They are too young to be attending any kind of secondary or higher education.", "The people are really young.", "They are very young kids"], "image": "train2014/COCO_train2014_000000154576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402057, "question_id": "gMLjHCsbkP88Xdn3qeCtus", "question": "What are the vegetables call that are above the broccoli?", "choices": ["spinach", "asparagus", "brussels sprouts", "kale"], "correct_choice_idx": 1, "direct_answers": ["greens", "asparagus", "asparagus", "asparagus", "asparagus", "asparagus", "asparagus", "asparagus", "asparagus", "greens"], "difficult_direct_answer": false, "rationales": ["The veggies are long and are asparagus stalks.", "The veggies are asparagus.", "Asparagus are long and green."], "image": "train2014/COCO_train2014_000000402057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285910, "question_id": "gMetBhikChSKZpe8y2xMeH", "question": "What is the appliance in this room used for?", "choices": ["cooling", "watching", "cooking", "washing"], "correct_choice_idx": 1, "direct_answers": ["watching tov", "watching", "watching shows", "watching tv", "producing light", "television", "decoration", "television", "watch tv", "watching"], "difficult_direct_answer": false, "rationales": ["Watching is the intended purpose of a television. it produces pictures to watch.", "The appliance is a television set and is used for viewing shows, news, etc.", "The appliance in this room is a television, not an air conditioner, oven, or washing machine."], "image": "train2014/COCO_train2014_000000285910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543422, "question_id": "gMyAiuxB8wL3HB7ourMgsc", "question": "What is the father doing with the child out on the mountain?", "choices": ["teaching", "transporting", "pulling", "hunting"], "correct_choice_idx": 0, "direct_answers": ["skiing", "skiing", "hiking", "skiing", "skiing", "skiing", "hiking", "skiing", "skiing", "teaching"], "difficult_direct_answer": false, "rationales": ["Pulling his child.", "The child is very small so they need to learn to ski", "The child is small and likely does not know how to ski yet."], "image": "val2014/COCO_val2014_000000543422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457011, "question_id": "gN4FSfEAoWDnYmR42x3WBZ", "question": "What is the person attached to who is about to get dragged by the horse?", "choices": ["sled", "snowmobile", "snowboard", "skis"], "correct_choice_idx": 3, "direct_answers": ["skier", "rope", "skis", "rope", "player", "hay", "haystack", "skis", "helmet people", "skis"], "difficult_direct_answer": false, "rationales": ["The person is going on a skiing adventure behind a horse and is wearing skis.", "The person attached to the rope can be seen standing on long pointy objects, which appear to resemble skis.", "The person is riding on skis."], "image": "train2014/COCO_train2014_000000457011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320670, "question_id": "gNRzqX9SQ9A33f8frFjnGM", "question": "The man here is posing to mimic what?", "choices": ["drunkenness", "levitation", "working out", "insomnia"], "correct_choice_idx": 1, "direct_answers": ["levitation", "sleeping", "floating", "levitating", "levitation", "sleeping", "levitation", "levitating", "levitation", "movie"], "difficult_direct_answer": false, "rationales": ["He's pretending to be sleeping above the bed.", "The man is levitating off the bed.", "The man is levitating."], "image": "val2014/COCO_val2014_000000320670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500967, "question_id": "gNTAMF6emZFPFx4CLqBzPz", "question": "What is this style of egg called?", "choices": ["soft boiled", "scrambled", "hard boiled", "poached"], "correct_choice_idx": 3, "direct_answers": ["poached", "poached", "over easy", "over easy", "poached", "poach", "poached egg", "over medium", "poached", "over easy"], "difficult_direct_answer": false, "rationales": ["The eggs are poached.", "The egg is runny.", "The egg was cooked outside of its shell. the yolk is also mostly intact, so it was not scrambled."], "image": "train2014/COCO_train2014_000000500967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398722, "question_id": "gNtUzW5joUmVGST6Z3axWM", "question": "Why are all the boys dressed alike?", "choices": ["for fun", "they're siblings", "for halloween", "dress code"], "correct_choice_idx": 3, "direct_answers": ["school uniforms", "school uniforms", "in school", "dress code", "uniforms", "quadruplets", "dress code", "school", "school uniforms", "classmates"], "difficult_direct_answer": false, "rationales": ["Most places of learning abroad have strict codes as to what they can wear.", "They are all wearing similar school uniforms which is probably required by their school.", "The boys are all sitting in front of computers together. schools have computers for students. schools sometimes have dress codes."], "image": "val2014/COCO_val2014_000000398722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167397, "question_id": "gPGXR2BLzn673i6XYE7ECR", "question": "What are the people watching?", "choices": ["game", "concert", "tv show", "movie"], "correct_choice_idx": 0, "direct_answers": ["game", "tv", "television", "tv", "video gave", "television", "television", "wii competition", "games", "game"], "difficult_direct_answer": false, "rationales": ["They are watching the game being played", "The people are watching a game.", "The people in the centre are holding console game controllers."], "image": "train2014/COCO_train2014_000000167397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374782, "question_id": "gPP9W2GXuLLXnMeneMMJHB", "question": "What Apple logo means?", "choices": ["invention", "knowledge", "recreation", "intelligence"], "correct_choice_idx": 1, "direct_answers": ["symbol", "computer", "made by", "quality", "apple product", "macbook", "apple computers", "knowledge", "apple computer", "knowledge"], "difficult_direct_answer": true, "rationales": ["It's means knowledge.", "It refers to the learning that gives enlightenment.", "The apple logo is a symbol of knowledge."], "image": "val2014/COCO_val2014_000000374782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349354, "question_id": "gPcRrxJyoUDrtafbBMMkJH", "question": "What could this girl wear if the glare is bothering her here?", "choices": ["magnifying glass", "sunglasses", "prescription glasses", "reading glasses"], "correct_choice_idx": 1, "direct_answers": ["sunglasses", "sunglasses", "sunglasses", "sunglasses", "sunglasses", "sunglasses", "sunglasses", "sunglasses", "sunglasses", "sunglasses"], "difficult_direct_answer": false, "rationales": ["She has sunglasses on the table and could wear them because she is outside and the sun is shining", "The girl has a pair of shades next to her with dark tint that can make the light coming into her eyes less bright.", "The girl could have sunglasses."], "image": "train2014/COCO_train2014_000000349354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367850, "question_id": "gPejaPeXuBt9MNgr5KKcaM", "question": "What's the name for this photographic technique?", "choices": ["cloning effect", "bell curve", "double vision", "time lapse"], "correct_choice_idx": 3, "direct_answers": ["stenography", "time lapse", "rephotography", "pause", "multiplicity", "burst", "multiple exposure", "time lapse", "motion analysis", "rolling shutter"], "difficult_direct_answer": true, "rationales": ["The same skateboarder is shown at different parts of the photo which shows it was taken at different times.", "It shows the same person at different points in the jump", "It grabs a frame in timed intervals"], "image": "train2014/COCO_train2014_000000367850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456917, "question_id": "gPuRuxd8TjKpJC6ZeC96Jm", "question": "What is the sum of each individual number shown?", "choices": ["ten", "16", "28", "82"], "correct_choice_idx": 0, "direct_answers": ["ten", "ten", "ten", "ten", "ten", "ten", "ten", "ten", "ten", "ten"], "difficult_direct_answer": false, "rationales": ["2 plus 8 is 10.", "A man in a baseball uniform has the number twenty eight on the back. two plus eight is ten.", "The numbers are two and eight."], "image": "val2014/COCO_val2014_000000456917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145921, "question_id": "gPySqh9ZXJwsdbte3oWmQA", "question": "How many different airlines are being featured by the planes in the photo?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "two", "three", "three", "three", "three", "two"], "difficult_direct_answer": false, "rationales": ["3 different airlines, singapore, delta, and qantas can be seen.", "The planes pictured are from singapore airlines, qantas and delta.", "The tails have different designs."], "image": "val2014/COCO_val2014_000000145921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455401, "question_id": "gQCvkSRcrhFoLH4XgyVZFU", "question": "What are the three men with signs wearing?", "choices": ["ties", "cowboy hats", "cowboy boots", "parkas"], "correct_choice_idx": 0, "direct_answers": ["ties", "boxers", "boxers", "boxers", "boxers", "boxers", "boxers", "ties", "boxers", "ties"], "difficult_direct_answer": false, "rationales": ["Both men have a piece of cloth wrapped around their necks for decorative purposes. these are generally knotted in a particular fashion and commonly worn by men.", "They have fabric tied on their necks", "The men all have ties on."], "image": "val2014/COCO_val2014_000000455401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382857, "question_id": "gQcHyu5V3XbfvsejE6zhkK", "question": "What iconic child favorite is there besides Winnie the Pooh?", "choices": ["raggedy andy", "tigger", "raggedy ann", "chatty cathy"], "correct_choice_idx": 2, "direct_answers": ["teddy bear", "ragged ann", "raggedy ann", "raggedy ann", "raggedy anne", "raggedy ann", "raggedy ann", "raggedy ann", "raggedy ann", "raggedy ann"], "difficult_direct_answer": false, "rationales": ["The doll has red hair", "The dolls red yarn hair is well known.", "There is raggedy ann sitting next to winnie the pooh."], "image": "train2014/COCO_train2014_000000382857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554706, "question_id": "gRA92x9CEweH8x8jhNKKtn", "question": "What is being dipped in the red sauce?", "choices": ["fries", "hot dog", "drink", "bread"], "correct_choice_idx": 0, "direct_answers": ["french fries", "french fries", "fries", "fries", "french fries", "fries", "french fries", "fries", "fries", "fries"], "difficult_direct_answer": false, "rationales": ["Fries go in ketchup.", "Many people like to dip their fries in ketchup.", "This is ketchup"], "image": "train2014/COCO_train2014_000000554706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267435, "question_id": "gRyEea2gNLeYZHtMzLt7e2", "question": "What will they eat the bread with?", "choices": ["jam", "nutella", "peanut butter", "cheese"], "correct_choice_idx": 0, "direct_answers": ["jam", "hands", "jam", "butter", "jelly", "coffee", "jelly", "food", "hands", "coffee"], "difficult_direct_answer": false, "rationales": ["There are jars of jelly in front of them.", "There are many biscuits that primarily are used with fruit of some kind spread over it.", "There are a lot of fruity spreads on the table."], "image": "train2014/COCO_train2014_000000267435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240279, "question_id": "gSA2nqDCbtBivCVbf435YM", "question": "The plant is ripening what type of palatable object?", "choices": ["apples", "bananas", "plantains", "pears"], "correct_choice_idx": 1, "direct_answers": ["bananas", "bananas", "banana", "banana", "banana", "banana", "banana", "banana", "bananas", "banana"], "difficult_direct_answer": false, "rationales": ["Generally the types of fruits are green before the turn yellow when ripe.", "The plant is a tree that grows long yellow curved fruit which hangs in bunches.", "By the texture and color, it is easy to tell what the fruit is."], "image": "val2014/COCO_val2014_000000240279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400190, "question_id": "gSCRUmT3SLQTWof6nBbciT", "question": "What form of entertainment are the remotes used for?", "choices": ["action figures", "puzzles", "video games", "movies"], "correct_choice_idx": 2, "direct_answers": ["playing wii", "video games", "playing wii", "playin kid", "nintendo wii", "video games", "tv watching", "video games", "wii", "video games"], "difficult_direct_answer": false, "rationales": ["These are used for the nintendo wii.", "This is indicated by the wii controller.", "A child is holding a video game controller."], "image": "train2014/COCO_train2014_000000400190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321692, "question_id": "gSPHt2FFx4R3n7eSaZMijo", "question": "How many species of animals are there?", "choices": ["three", "four", "one", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are several horses, and one dog, so there are 2 different species.", "There are horses and a dog", "There are horses and a dog here."], "image": "val2014/COCO_val2014_000000321692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156958, "question_id": "gSWeetGDoL3tU7Exm7UjxD", "question": "Why are the umbrellas setup above the chairs?", "choices": ["block wind", "block rain", "decorative purposes", "for shade"], "correct_choice_idx": 3, "direct_answers": ["for shade", "shade", "sun block", "shade", "shade", "shade", "sun protection", "shade", "shade", "allow shade"], "difficult_direct_answer": false, "rationales": ["Umbrellas keep the sun away.", "It is a calm sunny day. the umbrellas are needed to block the sunlight.", "The umbrellas would block the rays of the sun and prevent sunburn."], "image": "train2014/COCO_train2014_000000156958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350444, "question_id": "gSWtkfp6K274oUoJEKooTF", "question": "If you wanted to cause an explosion using something here which object would be most useful?", "choices": ["gas pipe", "car", "bomb", "ball"], "correct_choice_idx": 1, "direct_answers": ["ball", "ball", "car", "ball", "car", "car", "car", "car", "car", "car"], "difficult_direct_answer": false, "rationales": ["A car is full of gas that you can put on fire.", "It is pretty easy to make a vehicle into a bomb.", "The car is available."], "image": "val2014/COCO_val2014_000000350444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518509, "question_id": "gSh8QM7m3Tku2rUYph36bh", "question": "What is the black pot used for?", "choices": ["fondue", "rice", "sushi", "tempura"], "correct_choice_idx": 0, "direct_answers": ["meat", "fries", "hold food", "for funduue", "soy sauce", "keep warm", "stews", "soup", "fondue", "food"], "difficult_direct_answer": true, "rationales": ["A pot with food in it is on the table.", "The black pot is used to keep the contents hot.", "The pot is for fondue."], "image": "train2014/COCO_train2014_000000518509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293342, "question_id": "gSp7WNciYhoJyH523TtYsc", "question": "What is the area called that the clock is in?", "choices": ["front", "tower", "pulpit", "barn"], "correct_choice_idx": 1, "direct_answers": ["church", "clock tower", "tower", "chimney", "tower", "tower", "church", "tower", "clock tower", "tower"], "difficult_direct_answer": false, "rationales": ["The clock is on the side of a tall tower.", "The clock is inside of the tower.", "The clock is up in the tower."], "image": "train2014/COCO_train2014_000000293342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492057, "question_id": "gSwBPguHGzqXzbgoCaYrw7", "question": "Who is taking this man's picture?", "choices": ["teen friend", "no one", "studio photographer", "he is"], "correct_choice_idx": 3, "direct_answers": ["man", "himself", "he is", "himself", "himself", "man", "self", "himself", "pictured man", "himself"], "difficult_direct_answer": false, "rationales": ["The man takes a selfie.", "A man is standing in front of a mirror with his phone camera out.", "The man is holding a phone."], "image": "val2014/COCO_val2014_000000492057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381641, "question_id": "gSxWxT6TivMNVVtenuwtEv", "question": "What are the sheep traveling down from?", "choices": ["zoo", "forest", "hill", "river"], "correct_choice_idx": 2, "direct_answers": ["mountain top", "hill", "hill", "hill", "hills", "mountain top", "barn", "hill", "mountain", "hill"], "difficult_direct_answer": false, "rationales": ["There are no forests, rivers, or zoos near the sheep. there is a slope.", "You can tell by the raised field, what type of terrain it is.", "The area seems to be a hill and they are moving down it during the picture."], "image": "train2014/COCO_train2014_000000381641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480739, "question_id": "gT6tZE94VXZQXQiD7kvEzQ", "question": "What type of room is this typically referred to as?", "choices": ["bedroom", "kitchen", "restroom", "garage"], "correct_choice_idx": 2, "direct_answers": ["restroom", "bathroom", "bathroom", "mens room", "restroom", "bathroom", "restroom", "restroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["The room as urinals in it.", "It's also referred to as a bathroom or a with urinals.", "This is a room typically referred to as a restroom."], "image": "train2014/COCO_train2014_000000480739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406659, "question_id": "gT83CXKrdsPUu6fvQLzHFe", "question": "To hit this ball the child should place the ball on which color of an item seen here first?", "choices": ["green", "red", "white", "yellow"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "red", "red tee", "red tee", "red", "red", "red", "red tee"], "difficult_direct_answer": false, "rationales": ["The yellow item is the bat, and the white item is the ball. the green grass would not help the child hit the ball.", "There are no white or green items and the bat he is holding is yellow. so we get the answer by a process of elimination.", "The ball needs to be put on the batting tee which is the color of a strawberry."], "image": "train2014/COCO_train2014_000000406659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371002, "question_id": "gTGM5nVgurNs6jmuQDNTWv", "question": "Why is the man stretching his arms out?", "choices": ["to dive", "to wave", "to balance", "to dance"], "correct_choice_idx": 2, "direct_answers": ["for balance", "surfing", "balance", "balance", "balance", "to balance", "regain balance", "balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["The man is doing a stunt on his skateboard and might fall off.", "The man is on a surfboard. he is surfing, not waving, dancing, or diving.", "He is riding a wave so his outstretched arms helps with the balance"], "image": "train2014/COCO_train2014_000000371002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62442, "question_id": "gTLzX8TTmbSBsoLVncRCzP", "question": "Why is the man wearing a hat with earflaps?", "choices": ["warmth", "visibility", "as cosplay", "fashion"], "correct_choice_idx": 0, "direct_answers": ["warms ear", "warmth", "stay warm", "heat", "cold", "cold", "thermal retention", "stay warm", "cold", "warmth"], "difficult_direct_answer": false, "rationales": ["The man is cold in the snow.", "The man wants warmth.", "When skiing in the snow and cold the ears need to be protected from frostbite."], "image": "train2014/COCO_train2014_000000062442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375503, "question_id": "gTQxoDYJkWsiznFSR9R34i", "question": "What is attached to the brown door?", "choices": ["garbage bag", "pastries", "cross", "papers"], "correct_choice_idx": 3, "direct_answers": ["posters", "posters", "papers", "posters", "papers", "map", "map", "posters", "map", "papers"], "difficult_direct_answer": false, "rationales": ["There are sheets of paper on the door.", "Papers are sticked to the brown door", "We can see three papers taped up to the door; the larger one is in the middle."], "image": "val2014/COCO_val2014_000000375503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353644, "question_id": "gTY36TXZYr2uCrDDSr3xUc", "question": "What is the name of the device used for playing in this image?", "choices": ["bat", "ball", "skating board", "stick"], "correct_choice_idx": 2, "direct_answers": ["skating board", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skate board", "scatting", "skate board"], "difficult_direct_answer": false, "rationales": ["These are devices that you ride and you can also perform tricks it can be a social experience therefore they can be used for play.", "The name is a skateboard.", "It has a deck for standing and 4 wheels"], "image": "val2014/COCO_val2014_000000353644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368956, "question_id": "gTvkr93yxtag5ANXzMx6zB", "question": "Why are some of the kids wearing numbers?", "choices": ["to participate", "to punish", "dress code", "for fun"], "correct_choice_idx": 0, "direct_answers": ["racing", "competition", "race", "race", "ski competition", "to participate", "race", "competition", "racing", "race"], "difficult_direct_answer": false, "rationales": ["The skiers are wearing numbers because it is a competition that they are participating in", "The kids are participating.", "They are participating in a competition or a race this identifies them."], "image": "train2014/COCO_train2014_000000368956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251115, "question_id": "gU6No3H5AKoUZDMifufYfH", "question": "What kind of game is the man playing?", "choices": ["building", "bowling", "shooting", "dancing"], "correct_choice_idx": 1, "direct_answers": ["video games", "bowling", "bowling", "wii", "cheese", "bowling", "wii", "wii bowling", "wii", "bowling"], "difficult_direct_answer": false, "rationales": ["The television screen shows a ball and lanes.", "The game on the screen has lanes, pins, and balls.", "The screen shows a bowling alley."], "image": "train2014/COCO_train2014_000000251115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532689, "question_id": "gU9bURABx99K3Ew3gRonwC", "question": "For what reason might the taut lines here serve?", "choices": ["decoration", "prevent disorientation", "property line", "skier pulling"], "correct_choice_idx": 3, "direct_answers": ["skier pulling", "guarding", "navigating path", "keep balance", "transport", "traffic line", "no falling", "tow uphill", "balance", "divider"], "difficult_direct_answer": true, "rationales": ["The reason is the skier pulling.", "The lines allow the skiers to guide themselves using them.", "Tight lines are erected on a ski run. beginners hold on to lines to help get balance when they first learn to sky."], "image": "train2014/COCO_train2014_000000532689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49643, "question_id": "gUEufAxu52pUPQDVQuTCCJ", "question": "Why are the men's vests yellow in color?", "choices": ["fashion", "dress code", "visibility", "camouflage"], "correct_choice_idx": 2, "direct_answers": ["city workers", "more visibility", "safety", "safety reflection", "working gear", "safety", "increased visibility", "visibility", "visibility", "safety"], "difficult_direct_answer": false, "rationales": ["Garbage men are wearing brightly colored vests. roadworkers wear brightly colored clothes to increase visibility and safety.", "The man wants to be visible.", "The vests are reflective so people can see the men better for their safety."], "image": "train2014/COCO_train2014_000000049643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459000, "question_id": "gUHEULEiY26fJvpwzptvHV", "question": "Who was Vice President of the United States when this picture was captured?", "choices": ["dan quayle", "dick cheney", "al gore", "joe biden"], "correct_choice_idx": 2, "direct_answers": ["dick cheney", "al gore", "al gore", "al gore", "al gore", "al gore", "biden", "al gore", "albert gore", "al gore"], "difficult_direct_answer": false, "rationales": ["In the year 2000, the vice president of the usa was al gore.", "This was taken in may 2000 which was still under the clinton administration.", "In 2000 he was the vp with clinton."], "image": "train2014/COCO_train2014_000000459000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181079, "question_id": "gURae7pDb4NdaJqsbPSwoH", "question": "How many years old is this dog now?", "choices": ["three", "13", "eight", "23"], "correct_choice_idx": 1, "direct_answers": ["13", "three", "16", "3 years", "3 years", "one", "three", "three", "three", "sixteen"], "difficult_direct_answer": false, "rationales": ["The dog is 13.", "The dog must be thirteen.", "This picture was taken in 2008 and the dog has one candle. it's now 2021."], "image": "train2014/COCO_train2014_000000181079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97580, "question_id": "gUnFzXGi2ujXEL8Na63UEb", "question": "Why does he have the dog on the board?", "choices": ["training dog", "followed him", "owns dog", "captured dog"], "correct_choice_idx": 2, "direct_answers": ["companionship", "companionship", "to float", "owns dog", "friendship", "fun", "for company", "enjoying", "companionship", "fun times"], "difficult_direct_answer": false, "rationales": ["It is his pet that he enjoys spending time with.", "The woman is afraid the dog might drown.", "The dog is owned by the man."], "image": "val2014/COCO_val2014_000000097580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187869, "question_id": "gVJoW7Euyh79VfPg4uxFK8", "question": "What is the red structure meant to prevent?", "choices": ["car crashes", "ship wrecks", "speeding", "air crashes"], "correct_choice_idx": 1, "direct_answers": ["ships crashing", "vessel accidents", "accidents", "crashes", "crashes", "ship wrecks", "ships crashing", "lighthouse", "collisions", "boat accidents"], "difficult_direct_answer": false, "rationales": ["The lighthouse is used to show the light either at the dark or in the fog.", "The structure is known as a lighthouse because it shines light into the water for boats to avoid obstacles in the dark.", "The structure prevents wrecks."], "image": "train2014/COCO_train2014_000000187869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256529, "question_id": "gVWyJ3gYJ2hc3C7mZydmL6", "question": "What is the common term for these objects with wheels?", "choices": ["wheelbarrow", "luggage dolly", "air cart", "smart cart"], "correct_choice_idx": 3, "direct_answers": ["trolley", "luggage cart", "suitcases", "easily", "luggage cart", "luggage carrier", "cart", "luggage cart", "smart cart", "cart"], "difficult_direct_answer": false, "rationales": ["The carts with wheels help transport suitcases.", "The utility carts are known as smart carts.", "Though a luggage dolly is correct, generally a smart cart is what helps you to lug around a lot of luggage at one time."], "image": "val2014/COCO_val2014_000000256529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451972, "question_id": "gVXoqZBW7eLMSrG2BRtwGY", "question": "What sort of Holiday parade is being feted here?", "choices": ["christmas", "st patricks", "flag day", "veterans day"], "correct_choice_idx": 0, "direct_answers": ["christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas", "christmas"], "difficult_direct_answer": false, "rationales": ["Christmas decorations are on the truck.", "Santa claus is on the firetruck.", "The holiday is christmas."], "image": "val2014/COCO_val2014_000000451972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388770, "question_id": "gVfZJaPYRs9YpDf5mSg9BM", "question": "What movie are these animals reminiscent of?", "choices": ["birds", "grizzly", "cujo", "cat people"], "correct_choice_idx": 0, "direct_answers": ["birds", "birds", "finding nemo", "birds", "birds", "birds", "free birds", "birds", "stephen king", "birds"], "difficult_direct_answer": false, "rationales": ["That is the name of the movie.", "These flocks animals are reminiscent of the movie birds.", "They are small animals that are capable of flying. they are not bears, dogs, or cats."], "image": "val2014/COCO_val2014_000000388770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149863, "question_id": "gVgUJrY54Lb4tRdKBDwyag", "question": "What is the thin object the man holding the microphone is using to prop up called?", "choices": ["cane", "dagger", "shovel", "flute"], "correct_choice_idx": 0, "direct_answers": ["cane", "cane", "walking stick", "cane", "walking stick", "cane", "walking stick", "cane", "cane", "cane"], "difficult_direct_answer": false, "rationales": ["The object is the length of the height of the man's waist to the ground. it is about the width of five fingers. it is sturdy and can hold a lot of weight without breaking.", "The object is a walking cane.", "The man is holding a cane."], "image": "train2014/COCO_train2014_000000149863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377652, "question_id": "gVvGs7oaVCrSrdzNMixCZq", "question": "What are the cables for?", "choices": ["rescuing him", "lifting him", "climbing", "holding him"], "correct_choice_idx": 1, "direct_answers": ["air sailing", "control", "windsurfing", "lifting him", "lift off", "kite", "para-glide", "controlling kite", "hang gliding", "hold on"], "difficult_direct_answer": true, "rationales": ["He's attached to a large sail in the air", "The man is in the air.", "The cables help raise the man."], "image": "val2014/COCO_val2014_000000377652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552330, "question_id": "gVvcHCzNHearTmiCV4sG5T", "question": "What is the snow covering to the right of the path in front of the bench?", "choices": ["water", "gravel", "grass", "sand"], "correct_choice_idx": 0, "direct_answers": ["walk way", "sidewalk", "sidewalk", "walkway", "ice", "ice", "walking path", "water", "pee", "road"], "difficult_direct_answer": false, "rationales": ["The snow is water.", "The snow is covering the lake.", "The snow covers up the pond."], "image": "train2014/COCO_train2014_000000552330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222639, "question_id": "gW96eemduUZ9XrDa95NPL5", "question": "What activity has the man jumping in the air?", "choices": ["extreme frisbee", "soccer", "football", "baseball"], "correct_choice_idx": 0, "direct_answers": ["frisbee", "extreme frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "catching frisbee", "catching frisbee"], "difficult_direct_answer": false, "rationales": ["A man is jumping to catch a frisbee.", "He's doing a trick while catching a disk", "The man is holding a yellow flying disc."], "image": "train2014/COCO_train2014_000000222639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277625, "question_id": "gWHonCejca2QPSyYu8SpiE", "question": "What type of faire might be happening here?", "choices": ["circus", "rodeo", "eating contest", "renaissance"], "correct_choice_idx": 3, "direct_answers": ["midlevel", "renaissance", "fighting", "medieval", "renaissance", "crossing", "renaissance", "zebra", "renaissance", "medieval"], "difficult_direct_answer": false, "rationales": ["You can tell by how they are dressed as to what type of faire may be happening.", "A renaissance fare features weapons and horses.", "The tradition of dressing up in medieval garb is commonly referred to as such."], "image": "val2014/COCO_val2014_000000277625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104931, "question_id": "gWK9wcXYF5HUXKYWSCCQq8", "question": "What direction does the black surface the pink jacketed person stands upon go?", "choices": ["none", "down", "sideways", "up"], "correct_choice_idx": 3, "direct_answers": ["left", "straight", "up", "up", "horizontal", "up", "up", "mountain", "up", "up"], "difficult_direct_answer": false, "rationales": ["The black surface is a belt that carries snowboarders up the slope.", "The surface goes up the hill.", "The direction is up."], "image": "train2014/COCO_train2014_000000104931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252018, "question_id": "gWQx5Cbm4AiB36uL5DUjhP", "question": "The girl that is next to the boy in the blue cap has what hairstyle?", "choices": ["pig tails", "beehive", "bouffant", "pony tail"], "correct_choice_idx": 3, "direct_answers": ["ponytail", "ponytail", "short cut", "ponytail", "pony tail", "ponytail", "ponytail", "ponytail", "short cut", "pony tail"], "difficult_direct_answer": false, "rationales": ["A girl has her hair pulled into a single elastic band.", "That is what the hairstyle is called because it ends up looking like the tail of a horse.", "The girl in this image's long hair is pulled into a tight row and fastened. this is known as a pony tail."], "image": "train2014/COCO_train2014_000000252018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337314, "question_id": "gWo2f87xZXPTZUFxXJMEo5", "question": "What is the man trying to do to the horse?", "choices": ["punish it", "milk it", "calm it", "feed it"], "correct_choice_idx": 2, "direct_answers": ["calm it", "pat", "pet it", "soothing", "pet horse", "calm it", "comfort", "ride", "calm horse", "pet"], "difficult_direct_answer": true, "rationales": ["The man is calming it.", "Traditionally touching an animal gently can be used to calm it down.", "He is patting it's neck."], "image": "train2014/COCO_train2014_000000337314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438232, "question_id": "gWpNBhLTKc5jErmPv4m27n", "question": "What will persons standing here do next?", "choices": ["rob train", "exit station", "exit train", "board train"], "correct_choice_idx": 3, "direct_answers": ["board train", "board", "board", "board train", "board", "enter train", "board train", "travel", "board train", "board"], "difficult_direct_answer": false, "rationales": ["It's a passenger train", "The doors of the train are open. the people are on the platform facing the train.", "The person is waiting on the platform which people use to board the train."], "image": "val2014/COCO_val2014_000000438232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29299, "question_id": "gWwnwkoJrELsoGkp7K74RX", "question": "What countries flag can be seen near the front of the plane?", "choices": ["germany", "sweden", "united states", "poland"], "correct_choice_idx": 2, "direct_answers": ["united states", "usa", "american", "united states", "america", "american", "usa", "us", "united states", "united states"], "difficult_direct_answer": false, "rationales": ["The flag is the usa.", "The flag is clearly visible and belong to the united states. based on contained colors alone, this is the only option that works.", "There are stars and stripes."], "image": "train2014/COCO_train2014_000000029299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202998, "question_id": "gXLWmXhkKnWFkrCnTkWrR7", "question": "What does the item in the can with the utensil look like?", "choices": ["cake", "soup", "cat", "pears"], "correct_choice_idx": 1, "direct_answers": ["soup", "soup", "spoon", "soup", "spoon", "spoon", "tomato soup", "tomato soup", "tomato soup", "soup"], "difficult_direct_answer": false, "rationales": ["The can design is consistent with something that would be used to insulate food and keep it at a certain temperature. the liquid inside the container looks to be of the consistency of soup and is likely soup because of the container it is in.", "The stuff in the can looks like soup to go with the sandwich.", "It's a spoon for the soup."], "image": "val2014/COCO_val2014_000000202998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492020, "question_id": "gXMHhqGdWiXKuSF4yFp4DL", "question": "What is the man standing next to the bikes most likely doing?", "choices": ["travelling", "exercising", "training", "working"], "correct_choice_idx": 3, "direct_answers": ["parking bike", "renting bike", "putting away", "selling bikes", "working", "returning bike", "parking", "parking bike", "parking bike", "returning"], "difficult_direct_answer": false, "rationales": ["He is wearing a branded t shirt which is the same branding as that on the side of the bikes.", "The man standing next to the bikes is working because he's wearing a work shirt.", "The man's shirt has the same company name on it as the bike does. he is putting the bike away as part of his duties as an employee."], "image": "train2014/COCO_train2014_000000492020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69220, "question_id": "gXfcqNbS5t3h9Mb8f73a8K", "question": "What is the man holding?", "choices": ["chicken ring", "cinnamon roll", "zeppole", "calamari"], "correct_choice_idx": 2, "direct_answers": ["donut", "mini donut", "onion ring", "small donut", "food", "mini donut", "zeppole", "donut", "donut", "doughnut"], "difficult_direct_answer": false, "rationales": ["That is the foo the man is holding.", "The man is holding a round piece of breaded chicken with a hole in the middle.", "This small fried bit of bread with powdered sugar outside is known as zeppole."], "image": "train2014/COCO_train2014_000000069220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549407, "question_id": "gXoycj3y62uRhQysN4AA3E", "question": "Where are they most likely sharing a meal and a laugh?", "choices": ["home", "restaurant", "hotel", "cafeteria"], "correct_choice_idx": 0, "direct_answers": ["dining table", "restaurant", "kitchen", "dining room", "home kitchen", "home", "home", "home", "dinner", "kitchen table"], "difficult_direct_answer": false, "rationales": ["By the kitchen setting, and background it is easy to surmise where the picture is being taken.", "They're at home.", "The people are sitting at a dining table which is found inside a house."], "image": "train2014/COCO_train2014_000000549407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119294, "question_id": "gYKZeMWbL4LQ92pHiHGMhP", "question": "What years was this machine first introduced?", "choices": ["1866", "2004", "1994", "1804"], "correct_choice_idx": 3, "direct_answers": ["1804", "1814", "1804", "1804", "nineteenth century", "1800's", "eighteen seventythree", "1804", "1804", "eighteen fifty-eight"], "difficult_direct_answer": false, "rationales": ["The year was 1804.", "Richard trevithick built the first steam locomotive during the first decade of the nineteenth century.", "Trains come from the early 1800s."], "image": "train2014/COCO_train2014_000000119294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144620, "question_id": "gYYHXhJnrMPDwmZJioBP3f", "question": "What is the relationship between the man and the woman?", "choices": ["couple", "friends", "siblings", "cousins"], "correct_choice_idx": 0, "direct_answers": ["husband/wive", "married", "married", "married", "married", "married", "married", "parent", "married", "couple"], "difficult_direct_answer": false, "rationales": ["They are sitting close together next to the words \"love story\".", "By definition, a couple is \"two people.\" this is what we see here. they could be siblings, friends or even cousins but the most logical answer is \"couple.\"", "The couple are sitting on a bench that says love story."], "image": "train2014/COCO_train2014_000000144620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14312, "question_id": "gZ44TDEnidwsJR5q8FRLQM", "question": "What are the women doing?", "choices": ["stealing", "window shopping", "watching television", "texting"], "correct_choice_idx": 1, "direct_answers": ["window shopping", "window shopping", "window shopping", "window shopping", "window shopping", "standing", "window shopping", "standing", "window shopping", "window shopping"], "difficult_direct_answer": false, "rationales": ["You can tell by the rows of stores in a city setting as to what they are doing.", "When you just look inside and don't go in you are window shopping.", "They are looking in the window at the items the store is selling."], "image": "train2014/COCO_train2014_000000014312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513075, "question_id": "gZ4c36mfRBCnmHi8zjJvK3", "question": "What make of vehicles does this dealership sell?", "choices": ["hyundai", "nissan", "toyota", "honda"], "correct_choice_idx": 0, "direct_answers": ["honda", "hyundai", "hyundai", "hyundai", "hyundai", "compact", "hyundai", "honda", "hyundai", "hyundai"], "difficult_direct_answer": false, "rationales": ["The name is on the building", "The sign on the dealership says it is.", "The cars for sale are korean, not japanese."], "image": "train2014/COCO_train2014_000000513075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371603, "question_id": "gZJQPFnqcDdeg5QcZEPV2X", "question": "What are these vehicles used for fighting?", "choices": ["fire", "war", "crime", "bugs"], "correct_choice_idx": 0, "direct_answers": ["fire", "fires", "fires", "fires", "fire", "fires", "fire", "fires", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["The trucks are red as they are used to tell they fight fires.", "The vehicles parked look like fire engines based on their color schemes and the structure of the vehicle. fire trucks are used by fire fighters to fight fires.", "They are used to put out fires."], "image": "train2014/COCO_train2014_000000371603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62478, "question_id": "gZc4YLBNYRfkTM4GjWD45H", "question": "Where does the woman stand at?", "choices": ["front yard", "highway", "intersection", "porch"], "correct_choice_idx": 2, "direct_answers": ["corner", "crosswalk", "crosswalk", "traffic", "sign", "traffic light", "left", "at corner", "intersection", "street"], "difficult_direct_answer": true, "rationales": ["There is a sign that has arrows pointing other ways.", "The woman is at an intersection.", "The woman is standing underneath a traffic sign for intersection."], "image": "train2014/COCO_train2014_000000062478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106661, "question_id": "gZoWXxcsPyVMGdz2jf36Zp", "question": "What left the marks on the ground?", "choices": ["motorcycles", "little kids", "horses", "cars"], "correct_choice_idx": 0, "direct_answers": ["tires", "cars", "motorcycles", "bikes", "tires", "motorcycles", "tires", "motorcycles", "tires", "motorcycles"], "difficult_direct_answer": false, "rationales": ["Motorcycles are lined up on the street. single lines can be seen on the pavement.", "The marks are about the same width as the wheels of the bikes.", "Motorcycles are parked in a line and faint, single lines can be seen on the pavement in front of them."], "image": "val2014/COCO_val2014_000000106661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457271, "question_id": "ga4YD8xbMUX28iDu47RoVi", "question": "What is the area where the horse is being kept called?", "choices": ["garage", "shed", "stable", "bunk"], "correct_choice_idx": 2, "direct_answers": ["stable", "stable", "stable", "stable", "stable", "stable", "stall", "stables", "stables", "stable"], "difficult_direct_answer": false, "rationales": ["Horses are not kept in garages, sheds, or bunks.", "The horse is in a stable at a barn.", "The horse is being kept in a stable."], "image": "val2014/COCO_val2014_000000457271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84530, "question_id": "gaJS3WVAg7niWfgczSzPHQ", "question": "What type of tops are the men on the right wearing?", "choices": ["skinny tops", "crop tops", "big tops", "tank tops"], "correct_choice_idx": 3, "direct_answers": ["tank tops", "t shirt", "tank tops", "tank tops", "tank tops", "tank", "sports banyan", "tank tops", "tank", "tank tops"], "difficult_direct_answer": false, "rationales": ["They have no sleeves and have straps", "The men are wearing shirts without sleeves. this style of shirt is referred to as answer a.", "The shirts without sleeves"], "image": "val2014/COCO_val2014_000000084530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105783, "question_id": "gaWe2NbuBEwhqNsKBhDKwb", "question": "What is this bear made of?", "choices": ["gummy bears", "bananas", "taffy", "chocolate"], "correct_choice_idx": 0, "direct_answers": ["gummies", "candy", "gummi bears", "gummy bears", "candy", "gummi", "gummy bears", "gummy bears", "gummy bears", "gummy worms"], "difficult_direct_answer": false, "rationales": ["The bear has gummies.", "It is a larger version of the candy it is made of.", "The big candy is in the shape of the little candy itself."], "image": "val2014/COCO_val2014_000000105783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573553, "question_id": "gaXXM3AbqvuATN8maNGMKr", "question": "Which item is most likely made from animal skin?", "choices": ["couch", "candle", "floor", "pillow"], "correct_choice_idx": 0, "direct_answers": ["couch", "sofa", "couch", "sofa sectional", "soba", "sofa", "couch", "sofa", "not clear", "couch"], "difficult_direct_answer": false, "rationales": ["It is leather", "The couch looks to be made of leather.", "Leather is a common textile to use in sofas. animal skin is made into leather."], "image": "train2014/COCO_train2014_000000573553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110392, "question_id": "gausG3aPvAihuJ2HAf5MS3", "question": "What is the woman wearing sunglasses doing?", "choices": ["crossing street", "talking", "listening", "recording"], "correct_choice_idx": 1, "direct_answers": ["talking phone", "talking", "talking phone", "on cellphone", "talking", "communicating", "talking cell", "talking", "talking", "walking"], "difficult_direct_answer": false, "rationales": ["She is talking on a phone.", "She has her phone up to her ear", "The woman is holding a phone to her ear and her mouth is open, so it's safe to assume that she is currently talking to someone on the phone."], "image": "val2014/COCO_val2014_000000110392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537631, "question_id": "gb9fi9WKCzWTJUNsp8Fir6", "question": "Which Russian athlete plays a similar sport to these girls?", "choices": ["mariya abakumova", "aliya mustafina", "evgenia medvedeva", "maria sharapova"], "correct_choice_idx": 3, "direct_answers": ["maria sharapova", "no idea", "maria sharapova", "maria sharapova", "maria shrapova", "sharapova", "maria sharapova", "russia", "kornakova", "anna kournikova"], "difficult_direct_answer": false, "rationales": ["One of the most famous tennis players who's a women from russia.", "The athlete is maria.", "Maria sharapova plays a similar sport to these girls on the tennis court."], "image": "val2014/COCO_val2014_000000537631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320308, "question_id": "gbFNjNHApnBZyNeYJxoGRY", "question": "What sort of adventure are they probably heading out on?", "choices": ["flying", "riding", "camping", "swimming"], "correct_choice_idx": 2, "direct_answers": ["hiking", "wildlife", "camping", "camping", "hiking", "hiking", "hiking", "hike", "hiking", "hiking"], "difficult_direct_answer": false, "rationales": ["The adventure is camping.", "They have backpacks full of supplies and a sleeping bag on the bottom.", "There is a nature scene in the background and they have camping backpacks on with visible sleeping bags as one would if they were going camping."], "image": "train2014/COCO_train2014_000000320308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569190, "question_id": "gbHh6iEPkCryfe4Gi5ruWS", "question": "The rightmost set of rails leads to which railway structure?", "choices": ["workshop", "depot", "turntable", "train station"], "correct_choice_idx": 3, "direct_answers": ["train station", "train station", "train station", "depot", "main tracks", "middle structure", "train station", "station", "station", "depot"], "difficult_direct_answer": false, "rationales": ["All trains need to end up at a train station.", "There are people waiting in the building and the building is near the tracks.", "Train tracks diverge from the main route and go to the side. train stations are present every so often on the length of tracks."], "image": "train2014/COCO_train2014_000000569190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350138, "question_id": "gbohp8YLPoLXxicwkJFAfV", "question": "These types of vehicles are commonly referred to as what?", "choices": ["modern", "vintage", "contemporary", "futuristic"], "correct_choice_idx": 1, "direct_answers": ["trucks", "trucks", "trucks", "trucks", "vintage", "trucks", "trucks", "trucks", "trucks", "trucks"], "difficult_direct_answer": false, "rationales": ["The truck is very old. older trucks are considered antiques when they are still in working condition.", "The car is old. old things are called vintage or classic.", "These trucks are vintage because they are older models"], "image": "train2014/COCO_train2014_000000350138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122317, "question_id": "gbs27XSFR9h7y8VeXuC4uA", "question": "How is the object in the sky controlled?", "choices": ["remote", "computer", "string", "magic"], "correct_choice_idx": 2, "direct_answers": ["by string", "drown", "string", "string", "string", "air", "wind", "tail", "wind", "using string"], "difficult_direct_answer": false, "rationales": ["The kite is controlled by a string.", "Kites have rolls of string attached to help in flying them.", "The object has a string."], "image": "val2014/COCO_val2014_000000122317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329276, "question_id": "gbsKEtwbNvAvtVBMXPT74c", "question": "Why are some people standing in the boats?", "choices": ["angry", "tipping boat", "diving", "gondoliers"], "correct_choice_idx": 3, "direct_answers": ["looking", "rowing", "balancing", "to see", "pole boat", "steering", "gondoliers", "steering", "guiding", "balancing"], "difficult_direct_answer": false, "rationales": ["The people that are standing are the ones that are holding the paddles.", "They have to stand because of the type of boat.", "They are special boats called gondollers."], "image": "train2014/COCO_train2014_000000329276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222430, "question_id": "gc2tgS5CC7embQSqKESeLN", "question": "What location does this man work in?", "choices": ["clothing store", "mall", "office", "stage"], "correct_choice_idx": 2, "direct_answers": ["office", "office", "office", "office", "office", "office", "office", "office", "office", "office"], "difficult_direct_answer": false, "rationales": ["The man is sitting near computers and documents. he is wearing business casual clothing.", "The man is in a white collar environment.", "The man is in an office."], "image": "val2014/COCO_val2014_000000222430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68231, "question_id": "gcE8wxpxbTcajBtjgRKSbE", "question": "What are the planes near the dock called?", "choices": ["airbus", "jet", "commuter", "seaplane"], "correct_choice_idx": 3, "direct_answers": ["seaplane", "boat", "sea planes", "floating planes", "seaplane", "seaplanes", "seaplanes", "seaplanes", "ports", "sea planes"], "difficult_direct_answer": false, "rationales": ["The planes are the kind that can land on the sea and are called seaplanes.", "The planes can take of from the water or ocean.", "The planes are at sea."], "image": "train2014/COCO_train2014_000000068231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304150, "question_id": "gcJTSo5XSPPr8GYUhFqF7s", "question": "What is in front of the man in the first row wearing glasses?", "choices": ["laptop", "lion", "cow", "baby"], "correct_choice_idx": 0, "direct_answers": ["mm candy", "mms", "laptop", "laptop", "laptop", "computer", "pop bottle", "computer", "computer", "laptop"], "difficult_direct_answer": false, "rationales": ["The man has a laptop.", "Laptops are small and portable. the others are living creatures.", "The room contains a bunch of people and tables, so this would not be a suitable environment for babies or animals. there is a computer in front of the man."], "image": "train2014/COCO_train2014_000000304150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566448, "question_id": "gcYYkXzoph9VWVtcz8WfYt", "question": "What language is shown on the banner?", "choices": ["german", "chinese", "italian", "spanish"], "correct_choice_idx": 3, "direct_answers": ["spanish", "spanish", "spanish", "spanish", "spanish", "spanish", "spanish", "spanish", "spanish", "spanish"], "difficult_direct_answer": false, "rationales": ["The other options don't match the language.", "Those words are spanish.", "You can tell by the how the words read on the sign, as to what language it is."], "image": "train2014/COCO_train2014_000000566448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272901, "question_id": "gdjjMJHTqbnjcUhjv5LLLa", "question": "In which lane does the cyclist cycle?", "choices": ["passing", "dotted line", "bus lane", "bike lane"], "correct_choice_idx": 3, "direct_answers": ["right lane", "bike lane", "bike lane", "bike lane", "bike lane", "bike", "bike lane", "bike", "bike lane", "bicycle lane"], "difficult_direct_answer": false, "rationales": ["The biker is in a thin lane between the lane for cars and the lane for parking next to the curb.", "There is a small narrow lane that a man on a bike is using.", "There is a thin lane next to the parking spaces just for this traveler."], "image": "train2014/COCO_train2014_000000272901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413959, "question_id": "gdszcW95K37q3N9UF8itEK", "question": "What is the man doing on the carriage?", "choices": ["making repairs", "is waiting", "is resting", "is stealing"], "correct_choice_idx": 1, "direct_answers": ["standing", "transporting", "nothing", "driving", "is waiting", "driving", "waiting", "driving", "transportation", "walking"], "difficult_direct_answer": false, "rationales": ["He is sitting and waiting for passengers.", "He is walking next to the carriage.", "He is waiting for riders."], "image": "val2014/COCO_val2014_000000413959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91715, "question_id": "ge2z5FTTxysGhd7tKRWSU5", "question": "What is the company of Roycawood truck?", "choices": ["honda", "bmw", "audi", "hitachi"], "correct_choice_idx": 0, "direct_answers": ["honda", "trucking company", "lucky june", "lady jane", "truck company", "lady jane", "cawood", "roy cawood", "lady jane", "roycawood"], "difficult_direct_answer": false, "rationales": ["This type of truck is by honda and used to transport things.", "Honda owns roycawood.", "This is unclear in the image, but since a is often the answer, i'm answering a. i tried to look it up with google and couldn't find the right answer."], "image": "val2014/COCO_val2014_000000091715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381642, "question_id": "geC5cgkjSR4un66kaq5emT", "question": "Which of the foods on the table belong to the cruciferous family?", "choices": ["carrots", "broccoli", "celery", "tomato"], "correct_choice_idx": 1, "direct_answers": ["broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "green one"], "difficult_direct_answer": false, "rationales": ["Cruciferous is the cabbage family.", "They look like trees.", "The green florets are the only ones."], "image": "train2014/COCO_train2014_000000381642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260138, "question_id": "geQyM3gaK4ZDSgFcCRjvak", "question": "Different parabolic shapes are found in?", "choices": ["poles", "surfs", "snowblades", "kites"], "correct_choice_idx": 2, "direct_answers": ["snow", "slopes", "people", "everywhere", "signs", "sky", "headgear", "snowblades", "signs", "snow"], "difficult_direct_answer": false, "rationales": ["The snowblades have varying shapes.", "Different shapes in the snowblades.", "The skiis are different shapes."], "image": "train2014/COCO_train2014_000000260138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273586, "question_id": "gerEg9BQryJXwTmmp4xjkn", "question": "What might a young child want to wear should they be in this exact area?", "choices": ["hat", "life jacket", "gloves", "shorts"], "correct_choice_idx": 1, "direct_answers": ["surfing suit", "bathing suit", "life vest", "life jacket", "life vest", "life jacket", "life jacket", "lifejacket", "wetsuit", "shorts"], "difficult_direct_answer": false, "rationales": ["The kid wants a life jacket.", "This child should use a life jacket in order to ensure the child stays afloat.", "The waves are rough."], "image": "val2014/COCO_val2014_000000273586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405991, "question_id": "gf4Rg6DZ3jYLW3u9Mp4Fkx", "question": "What beverage does the man in checkered shirt carry?", "choices": ["margarita", "milk", "singapore sling", "coffee"], "correct_choice_idx": 3, "direct_answers": ["coffee", "coffee", "coffee", "milk shake", "coffee", "coffee", "coffee", "milk shake", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["The man has coffee in that cup.", "The beverage is coffee.", "A man is walking in the street with a cup that has a lid. coffee comes in cups with lids."], "image": "train2014/COCO_train2014_000000405991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167355, "question_id": "gf4oHHfKEyfAZb355gSeCv", "question": "What is the brown object under the chair?", "choices": ["dog rest", "floor protector", "energy mat", "rug"], "correct_choice_idx": 1, "direct_answers": ["mat", "mat", "floor protector", "mat", "rug protector", "mat", "mat", "rug protector", "rug protector", "mat"], "difficult_direct_answer": false, "rationales": ["The object protects the floor.", "The brown object keeps the floor safe from the wheels.", "Rolling chairs are really hard on flooring."], "image": "train2014/COCO_train2014_000000167355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331551, "question_id": "gfLvRMj3XkRYFocYan2Lth", "question": "What vegetable might you find on this dish?", "choices": ["potato", "onion", "asparagus", "turnip"], "correct_choice_idx": 1, "direct_answers": ["tomato", "tomato", "tomato", "onion", "tomato", "peppers", "onion", "tomato", "onion", "mushroom"], "difficult_direct_answer": false, "rationales": ["This dish is pizza. asparagus, turnip, and potato are not usual pizza toppings.", "You can see that there are onions on the pizza", "Onions are a popular pizza topping."], "image": "train2014/COCO_train2014_000000331551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278638, "question_id": "gft3uiYv5Rb8ZddJac6XqK", "question": "How was the item on the plate prepared?", "choices": ["open fire", "baking", "grill", "frying"], "correct_choice_idx": 1, "direct_answers": ["baked", "oven", "in oven", "oven", "baked", "baking", "baked", "baked", "baked", "baked"], "difficult_direct_answer": false, "rationales": ["The food has to be baked.", "The pizza is baked in an oven.", "The pizza was put in the oven."], "image": "train2014/COCO_train2014_000000278638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72011, "question_id": "gg6AzS6GU7e8R4TPn2p32P", "question": "What season will occur after the current season?", "choices": ["summer", "winter", "spring", "autumn"], "correct_choice_idx": 1, "direct_answers": ["winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["The ground is covered in leaves, so the current season is autumn. spring and summer are before autumn.", "There are yellow and orange leaves.", "The leaves on the ground indicate that it is autumn, which is always followed by winter."], "image": "val2014/COCO_val2014_000000072011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98601, "question_id": "ggQ6PK4HZBwyxYXURopVN9", "question": "What type of birds are these?", "choices": ["seagull", "vulture", "falcon", "eagle"], "correct_choice_idx": 0, "direct_answers": ["seagull", "sea gulls", "sea gulls", "seagull", "seagulls", "geese", "seagulls", "seagulls", "seagull", "sea gulls"], "difficult_direct_answer": false, "rationales": ["The birds are gulls.", "Traditionally these type of birds are semi aquatic and feed and breed in and near the ocean.", "They are very common birds in this type of area."], "image": "train2014/COCO_train2014_000000098601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492125, "question_id": "ggSbY4c9F7upznw3fUTfWW", "question": "What country is this picture taken in?", "choices": ["germany", "france", "united kingdom", "italiy"], "correct_choice_idx": 0, "direct_answers": ["germany", "germany", "germany", "russia", "germany", "germany", "germany", "europe", "europe", "germany"], "difficult_direct_answer": false, "rationales": ["The capital of germany is printed in the corner.", "The text is in german.", "The writing on the wall going down the stairs is written in german."], "image": "train2014/COCO_train2014_000000492125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254927, "question_id": "ggZj8TZfiTVP3TxvzhgLgf", "question": "What did he just do?", "choices": ["kicked ball", "dropped ball", "bounced", "lost balance"], "correct_choice_idx": 0, "direct_answers": ["kick ball", "kicked ball", "kick ball", "kick ball", "kicked ball", "kick ball", "kicked ball", "kick ball", "kick ball", "kicked ball"], "difficult_direct_answer": false, "rationales": ["The ball is moving away from him and his foot is forward", "He just kicked the soccer ball.", "The player has his foot in a kicking motion."], "image": "val2014/COCO_val2014_000000254927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10615, "question_id": "ggZwBGVjhvLU86vhdZJpBm", "question": "What is the truck missing that would make it illegal in many countries?", "choices": ["truck bed", "tires", "tail gate", "doors"], "correct_choice_idx": 3, "direct_answers": ["door", "door", "doors", "door", "license plates", "headlight", "doors", "door", "door", "door"], "difficult_direct_answer": false, "rationales": ["The truck doesn't have doors near the chairs.", "The truck is lacking doors.", "The truck doesn't have doors."], "image": "val2014/COCO_val2014_000000010615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474024, "question_id": "ggaQggJkfUYiZHxjNLCoCF", "question": "How would she close the front of her sweater?", "choices": ["zipper", "strings", "velcro", "buttons"], "correct_choice_idx": 0, "direct_answers": ["zipper", "zipper", "zipper", "zipper", "zip up", "zipper", "zipper", "zip up", "zipper", "zipper"], "difficult_direct_answer": false, "rationales": ["The front of her sweater is visible and has a zipper on the edge. zippers are used for fastening so a garment with a zipper in this placement on either side would be fastened by the zipper.", "She could close it by zipping it up.", "There is a metal slide on the jacket"], "image": "train2014/COCO_train2014_000000474024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236623, "question_id": "ggfDRDGGofKPPrgNdbDy4p", "question": "How do these people know each other?", "choices": ["teammates", "coworkers", "neighbors", "family"], "correct_choice_idx": 3, "direct_answers": ["family", "dad daughter", "parent", "parent", "family", "parent child", "related", "family", "parents", "kite"], "difficult_direct_answer": false, "rationales": ["The girl appears to be the adult's daughter.", "The people are family.", "It is an adult with a small child so probably a parent"], "image": "train2014/COCO_train2014_000000236623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69257, "question_id": "ggrq4ggKsiuDobeF3SDPzk", "question": "What is the shape of the bread called?", "choices": ["flat", "square", "loaf", "boule"], "correct_choice_idx": 3, "direct_answers": ["oval", "hotdog", "long", "boule", "rectangle", "hot dog", "bun", "pita", "bun", "hot dog"], "difficult_direct_answer": false, "rationales": ["The bread has two parts.", "These sandwiches are made in the shape that the bread is cooking in.", "The bread is boule."], "image": "train2014/COCO_train2014_000000069257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452412, "question_id": "ggzfdeFbRVpVZmQ6rGNpZB", "question": "What kind of landscape is this?", "choices": ["beach", "desert", "plain", "savanna"], "correct_choice_idx": 3, "direct_answers": ["safari", "pasture", "flat", "grassy lands", "grassland", "grassland", "savanna", "savanna", "flatlands", "prairie"], "difficult_direct_answer": false, "rationales": ["There's grass out.", "The area is comprised of grasses and small trees, and the land is mostly flat.", "Zebras live in savannahs."], "image": "train2014/COCO_train2014_000000452412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183435, "question_id": "gh2XfSXhbBcfZNADTZ3ZLj", "question": "Which among the following options is not available in the picture above?", "choices": ["tomatoes", "egg plant", "oranges", "carrots"], "correct_choice_idx": 1, "direct_answers": ["onions", "coconut banana", "banana", "egg plant", "can't answer", "pineapples", "unknown", "pineapple", "bananas", "banana"], "difficult_direct_answer": true, "rationales": ["There are oranges, tomatoes and carrots.", "There are no purple eggplants in the image but the other options are visible.", "There is no eggplant."], "image": "train2014/COCO_train2014_000000183435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52746, "question_id": "gh7CuyoJrxjtRuFA6JpipL", "question": "How was this dish prepared?", "choices": ["microwave", "oven", "grill", "air fryer"], "correct_choice_idx": 0, "direct_answers": ["baked", "microwave", "microwave", "microwave", "microwave", "microwaved", "microwave", "microwave", "microwave", "microwave"], "difficult_direct_answer": false, "rationales": ["This is a type of container that is used in one", "The dish shows that it was placed on microwave.", "The dish was microwaved."], "image": "val2014/COCO_val2014_000000052746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103122, "question_id": "gh7GjYrcdhH5vTnv7Zch68", "question": "The man directly behind the tennis player is doing what?", "choices": ["eating", "sleeping", "jumping", "hurrying"], "correct_choice_idx": 3, "direct_answers": ["running", "running", "fetching ball", "running", "running", "running", "hurrying", "fetching ball", "running", "ball boy"], "difficult_direct_answer": false, "rationales": ["The man is hurrying.", "The man is hustling and running.", "The guy is most definitely sprinting across."], "image": "val2014/COCO_val2014_000000103122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356379, "question_id": "ghKcojgxVFpxykoPZeZ9y3", "question": "What is the sidewalk made of?", "choices": ["cobblestones", "concrete", "slate", "brick"], "correct_choice_idx": 0, "direct_answers": ["brick", "cobblestones", "brick", "rocks", "cobblestones", "brick", "brick", "stone", "stone", "bricks"], "difficult_direct_answer": false, "rationales": ["The sidewalk has cobblestones.", "These stones are called cobblestone", "These are special kind of stones you can get to make this kind of sidewalk."], "image": "val2014/COCO_val2014_000000356379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338807, "question_id": "ghNLfncamNjeuAtr944vR2", "question": "Why is he using an umbrella?", "choices": ["disguise", "rain", "snow", "sun"], "correct_choice_idx": 1, "direct_answers": ["raining", "rain protection", "rain", "protection rain", "raining", "avoid rain", "raining", "raining", "avoid rain", "block rain"], "difficult_direct_answer": false, "rationales": ["He is using it to keep himself dry.", "This is indicated by the wet, reflective street.", "The man is using an umbrella because it's raining."], "image": "train2014/COCO_train2014_000000338807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300024, "question_id": "gi6JuHVBokus3myUiauFWg", "question": "What are the tall silver poles used for?", "choices": ["targets", "swinging", "climbing", "lighting"], "correct_choice_idx": 3, "direct_answers": ["lights", "lighting", "lighting", "lights", "lightpoles", "hold lights", "fencing", "light", "fences", "nets"], "difficult_direct_answer": false, "rationales": ["There are light fixtures at the top of them", "The tall silver poles have bulbs. they illuminate the tennis courts.", "The tall silver poles have lightbulbs in them so they are for lighting."], "image": "train2014/COCO_train2014_000000300024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520401, "question_id": "giFoWGyxyMxdFMhMtieM8W", "question": "The painting is an example of what type of art?", "choices": ["cubist", "abstract", "baroque", "impressionist"], "correct_choice_idx": 1, "direct_answers": ["doodle art", "abstract", "abstract", "modern", "abstract", "line art", "abstract", "contemporary", "modern", "deco"], "difficult_direct_answer": false, "rationales": ["It has a depiction of squiggly lines that represent a circle on top of some squares.", "The other options don't match this style.", "The painting does not look to be of anything in particular."], "image": "val2014/COCO_val2014_000000520401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195955, "question_id": "giRZQoh9BCs6abxeftcoN6", "question": "Upon what vessel are the people seated?", "choices": ["airship", "airplane", "sloop", "dinghy"], "correct_choice_idx": 1, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "train", "airplane", "train"], "difficult_direct_answer": false, "rationales": ["The vessel is a plane.", "They are all on an airplane.", "These people are all passengers on a plane."], "image": "val2014/COCO_val2014_000000195955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406426, "question_id": "gifhaGugJRCTPdHhLyAQpz", "question": "How many of the vegetables were grown in the ground?", "choices": ["five", "six", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["six", "two", "two", "three", "all", "eleven", "two", "five", "two", "six"], "difficult_direct_answer": false, "rationales": ["They are all planted in dirt", "Only vegetables grown hydroponically are not grown in the ground. these appear to still have dirt on them and so it's safe to say they all had some part of them touch dirt.", "There are six differents vegetables on the table that have been grown in the ground"], "image": "val2014/COCO_val2014_000000406426.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52066, "question_id": "gij6PSH8dfDzztdQuK53jD", "question": "What will this man read on the train today?", "choices": ["paper", "your mind", "book", "directions"], "correct_choice_idx": 0, "direct_answers": ["newspaper", "newspaper", "newspaper", "newspaper", "newspaper", "newspaper", "paper", "newspaper", "newspaper", "newspaper"], "difficult_direct_answer": false, "rationales": ["He has a newspaper under his arm to read.", "The man will read his newspaper on the train.", "You can tell by what he is holding as to what he will read on the train."], "image": "val2014/COCO_val2014_000000052066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495557, "question_id": "gijDaefseAr8ofmgiSAbAD", "question": "What is held in the gray canisters?", "choices": ["soup", "nothing", "syrup", "propane"], "correct_choice_idx": 3, "direct_answers": ["propane", "gas", "gas", "gas", "gas", "propane", "propane", "propane", "fuel", "fuel"], "difficult_direct_answer": false, "rationales": ["Propane is stored in metal cans.", "Those canisters are full of propane.", "It's unlikely to be any of the other options based on the shape of the canisters."], "image": "val2014/COCO_val2014_000000495557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436467, "question_id": "ginH5BheyrGTUTGvLeifnL", "question": "What type of action is being taken?", "choices": ["mashing", "stirring", "baking", "blending"], "correct_choice_idx": 1, "direct_answers": ["stirring", "stirring", "cooking", "stirring", "cooking", "stirring", "cooking", "stirring", "saute", "cooking"], "difficult_direct_answer": false, "rationales": ["A wooden spoon is moving the ingredients around the pan.", "There is a spoon in the food", "The food is in a pot, and a wooden spool can be seen on the side, and a spoon is used for stirring ingredients together."], "image": "val2014/COCO_val2014_000000436467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367222, "question_id": "gizN7m8VzUkEdeT7NwbE75", "question": "What type power does this train use?", "choices": ["gas", "diesel", "coal", "electrical"], "correct_choice_idx": 3, "direct_answers": ["electric", "electricity", "electric", "electric", "electricity", "electrical", "electricity", "electricity", "electrical", "electric"], "difficult_direct_answer": false, "rationales": ["The train does not have an internal combustion engine. it uses the power lines that hang above the tracks.", "There are power lines above the trains.", "The train uses electric to run."], "image": "train2014/COCO_train2014_000000367222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273850, "question_id": "gj39j3gwdDbuaDmNCcX8B3", "question": "What is the pitchers left foot touching?", "choices": ["sign", "sand", "base", "rock"], "correct_choice_idx": 2, "direct_answers": ["mound", "dirt", "pitching mound", "pitchers mound", "pitchers mound", "pitching rubber", "mound", "plate", "base", "pitcher's mound"], "difficult_direct_answer": false, "rationales": ["His one foot is on the base.", "The pitcher touches the base.", "You can see the white base under this foot, and the rule of baseball indicste the person must be on their base for the out to count."], "image": "train2014/COCO_train2014_000000273850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577520, "question_id": "gj8MeWXCYLUgDR5mdk7gBJ", "question": "What type of counter is shown?", "choices": ["check-out", "kitchen", "bathroom", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "restaurant", "restaurant", "bar", "food", "restaurant", "bar", "dining", "kitchen", "bar table"], "difficult_direct_answer": false, "rationales": ["The counter is a bar type counter where people can sit to be served food and drink.", "It has many place settings and stools for customers.", "It is a restaurant because it has tables, a counter, a kitchen and waiters"], "image": "train2014/COCO_train2014_000000577520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545891, "question_id": "gj9CtL2EGF63YEpJLL2ksR", "question": "What is likely in the two tins on the table?", "choices": ["peppers", "beets", "olives", "tomatoes"], "correct_choice_idx": 0, "direct_answers": ["roasted peppers", "beer", "olive pits", "peppers", "coffee", "fruit", "tomatoes", "peppers", "candles", "peppers"], "difficult_direct_answer": false, "rationales": ["The tins have images of peppers on the cans. it is reasonable to assume that peppers are inside.", "Two large cans with a picture of the contents painted on the side are on a table.", "These can's have pictures of peppers on them and it is likely they once contained peppers."], "image": "train2014/COCO_train2014_000000545891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512989, "question_id": "gjGuLovD7CTutE7xF9s7kW", "question": "The item in the bowl touching the knife is what?", "choices": ["powder", "tomato", "soap", "cheese"], "correct_choice_idx": 1, "direct_answers": ["tomatoes", "tomatoes", "tomato", "tomatoes", "tomatoes", "tomato", "tomato", "peppers", "knife", "tomatoes"], "difficult_direct_answer": false, "rationales": ["There are chopped tomatoes in the bowl.", "Often used in making and sometimes topping pizzas, tomatoes are at the top of the list! they're red and a fruit and are diced in that bowl.", "Tomatoes are in the bowl."], "image": "val2014/COCO_val2014_000000512989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168001, "question_id": "gjJZw6hWVj3aQFf5mDNEbP", "question": "Which activity are the majority of horses performing?", "choices": ["sleeping", "eating", "drinking", "running"], "correct_choice_idx": 2, "direct_answers": ["drinking", "drinking", "drinking water", "drinking", "drinking", "drinking water", "drinking", "drinking", "drinking", "drinking"], "difficult_direct_answer": false, "rationales": ["The horses are taking drinks of water from the ocean.", "They are drinking.", "They are all in the lake drinking water."], "image": "train2014/COCO_train2014_000000168001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503674, "question_id": "gjLXoQMUA9XnVcEGFHLXfr", "question": "What are the men doing in this area?", "choices": ["loitering", "fighting", "racing", "construction"], "correct_choice_idx": 3, "direct_answers": ["construction", "construction", "construction", "working", "working", "working", "road construction", "construction", "construction work", "construction"], "difficult_direct_answer": false, "rationales": ["They are working on the road.", "The men are doing repairs on the road.", "They are wearing uniforms and bright colours, and using tools such as a wheelbarrow that people working in construction would use."], "image": "train2014/COCO_train2014_000000503674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380097, "question_id": "gjQeff88UNCAwFrkJ6Vr2H", "question": "Which granite is best for bathroom?", "choices": ["ubatuba", "santa cecilia", "caledonia", "black pearl"], "correct_choice_idx": 1, "direct_answers": ["cecilia granite", "gray", "shiny", "white granite", "santa cecilia", "gray scale", "engineered granite", "waterproof", "solid", "quartz"], "difficult_direct_answer": true, "rationales": ["A counter needs to be smooth.", "It helps make the counter look very rich with the color.", "A bathroom has gray countertops."], "image": "train2014/COCO_train2014_000000380097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232550, "question_id": "gjWuRQdMYU3u8BtXD7ZyzQ", "question": "What kind of payment is needed to ride this bus?", "choices": ["donation", "volunteer", "debt", "fare"], "correct_choice_idx": 3, "direct_answers": ["cash", "currency", "ticket", "bus fare", "fare", "fare", "coin", "bus ticket", "token", "coins"], "difficult_direct_answer": true, "rationales": ["The other options don't apply to bus travel. the person can pay in cash or with a bus pass or credit card.", "Which can be in the form of ticket, pass, or coin.", "You need to pay to ride the bus."], "image": "val2014/COCO_val2014_000000232550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88576, "question_id": "gjkZvqd5XhXPfBHXo5xoSH", "question": "What is the woman wearing over her yellow shirt?", "choices": ["polo", "sweatshirt", "jacket", "blazer"], "correct_choice_idx": 1, "direct_answers": ["jacket", "sweatshirt", "sweater", "sweatshirt", "jacket", "gray hoodie", "jacket", "hoodie", "t-shirt", "jacket"], "difficult_direct_answer": false, "rationales": ["The woman has a sweatshirt.", "The woman is in a sweatshirt.", "It actually looks like a hoodie, but the term is better than the other options."], "image": "train2014/COCO_train2014_000000088576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535517, "question_id": "gkTGyMkebAykhPGNXgpmst", "question": "How many of these people will eventually need to be screened for prostate cancer?", "choices": ["nine", "two", "four", "six"], "correct_choice_idx": 2, "direct_answers": ["four", "all four", "four", "five", "all", "one", "four", "four", "all", "four"], "difficult_direct_answer": false, "rationales": ["All of the people are men and would need to have their prostates checked.", "There are four men shown.", "There are 4 people needing screening."], "image": "train2014/COCO_train2014_000000535517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535579, "question_id": "gkbR26akPG3hrUcJGqFXVd", "question": "Who would be the most likely owner of these bears?", "choices": ["dad", "children", "teenage boy", "grandpa"], "correct_choice_idx": 1, "direct_answers": ["store", "child", "child", "child", "children", "kids", "kid", "children", "kid", "child"], "difficult_direct_answer": false, "rationales": ["Stuffed animals are generally used as a comfort item for kids bears are used for this purpose and are a staple for stuffed animals.", "The bears are toys. they are sitting near a picture book.", "Kids like teddy bears."], "image": "val2014/COCO_val2014_000000535579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161873, "question_id": "gkfH38Ug4SsbvMRmEwrKyj", "question": "What are you supposed to do when you meet an animal like this on the road?", "choices": ["stop", "reverse", "go", "hoot"], "correct_choice_idx": 0, "direct_answers": ["wait", "stop", "pull over", "move slowly", "stop", "wait", "stop", "stop", "stop", "stop"], "difficult_direct_answer": false, "rationales": ["Stop so you don't hurt them.", "You should never run down a giraffe.", "You wait until they move off the road"], "image": "val2014/COCO_val2014_000000161873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191230, "question_id": "gmKcRbxCbEsgq9fkHoUvHP", "question": "What should the vehicle showing the side mirror do in this situation?", "choices": ["turn right", "turn left", "stop", "go"], "correct_choice_idx": 3, "direct_answers": ["keep going", "drive", "proceed", "go", "go", "go forward", "proceed", "move forward", "proceed", "go"], "difficult_direct_answer": false, "rationales": ["The green light tells the driver to continue.", "There is a green light.", "The cars can go."], "image": "train2014/COCO_train2014_000000191230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248300, "question_id": "gmRwfeqtYAtZv7uxWmACwg", "question": "What Nintendo video game character is on the left on top of the cake?", "choices": ["donkey kong", "mario", "wario", "luigi"], "correct_choice_idx": 3, "direct_answers": ["luigi", "luigi", "mario", "luigi", "luigi", "luigi", "mario", "luigi", "luigi", "luigi"], "difficult_direct_answer": false, "rationales": ["The game is luigi.", "A couple stands in front of a cake with a princess and luigi. the princess and luigi are video game characters.", "Two video game characters are on top of a cake, one is a princess and one is a guy in green and both are from the mario bro's franchise."], "image": "val2014/COCO_val2014_000000248300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553815, "question_id": "gmYHtsWDzjjECWrkepGYtE", "question": "Why are they not playing?", "choices": ["awaiting referee", "bad ball", "tired", "confused"], "correct_choice_idx": 0, "direct_answers": ["taking rest", "refereewith ball", "time out", "awaiting referee", "foul", "time out", "watching", "timeout", "change plays", "time out"], "difficult_direct_answer": false, "rationales": ["The referee has the ball in his hands.", "The players are waiting for the referee.", "The referee has the ball."], "image": "val2014/COCO_val2014_000000553815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390515, "question_id": "gmaFxRDaQBLL2jcoJRthDT", "question": "What is the man to the right of the black vehicle laying on?", "choices": ["box", "sofa", "bench", "case"], "correct_choice_idx": 2, "direct_answers": ["bench", "bench", "bench", "bench", "bench", "sleeping", "bench", "park bench", "car", "bench"], "difficult_direct_answer": false, "rationales": ["The man needs a bench.", "It is a public seating area typically found in parks.", "The man is sleeping on a wooden public bench."], "image": "train2014/COCO_train2014_000000390515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488500, "question_id": "gmc2DGFSfKwR3SRevEnuGs", "question": "When an athlete starts to overheat and sweat starts pouring there body is asking for what to replenish it?", "choices": ["sunscreen", "coffee", "soda", "water"], "correct_choice_idx": 3, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The body will be thirsty.", "When someone sweats, the sweat is water leaving their body, so more water ingested is needed in order to replenish the body to its regular level.", "When you are playing sports you need to drink a lot."], "image": "val2014/COCO_val2014_000000488500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577283, "question_id": "gmjkbnzRWP9tzCdVMFVR3V", "question": "Which root veg has more starch content?", "choices": ["potato", "carrot", "tomato", "celery"], "correct_choice_idx": 0, "direct_answers": ["fries", "potato", "potatoes", "potato", "potatoes", "yams", "potato", "potato", "parsnip", "snacks"], "difficult_direct_answer": false, "rationales": ["I've been told that potatoes have a lot of starch and i have no reason to doubt it.", "Potatoes are known to have more starch.", "The potato has the most starch content out of all of the food."], "image": "train2014/COCO_train2014_000000577283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219030, "question_id": "gn27tne2iQm6HhTgQAhPJE", "question": "What type of animals are present?", "choices": ["cow", "goat", "dog", "deer"], "correct_choice_idx": 1, "direct_answers": ["sheep", "goat", "ram", "sheep", "sheep", "sheep", "sheeps", "sheep", "sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["The visible animals have wool and horns like a goat might.", "There are a bunch of little goat sheep.", "They have rounding horns."], "image": "train2014/COCO_train2014_000000219030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141586, "question_id": "gnLB9KfQxhjDce5ns3Tugn", "question": "Which hat brand is advertised above the dugout?", "choices": ["zephyr", "new era", "brixton", "kangol"], "correct_choice_idx": 1, "direct_answers": ["new era", "newtrac", "newer cap", "baseball", "unsure", "can't view", "nike", "new era", "neutracap.com", "cap"], "difficult_direct_answer": true, "rationales": ["You can tell by the web address listed on the dugout as to what company it is for.", "It has newearcap.com all across the perimeter above the dugout.", "The hat is new era."], "image": "train2014/COCO_train2014_000000141586.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93378, "question_id": "gnQSC5LxmYH22XfjpLkyts", "question": "What thing does this place sell?", "choices": ["dogs", "fish", "cats", "birds"], "correct_choice_idx": 3, "direct_answers": ["plants", "cages", "birds", "plants", "exotic birds", "birds", "plants", "plants", "plants", "plants"], "difficult_direct_answer": false, "rationales": ["This place specializes in pet bird sales.", "The thing is a bird.", "There are numerous birdcages visible and the cages have birds within, implying that this a store where one can buy birds and bird-related supplies."], "image": "train2014/COCO_train2014_000000093378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5620, "question_id": "gnVf9CjJv9k3jwhFGwJDRB", "question": "What form of roadway are these buses traveling on?", "choices": ["intersection", "highway", "rural road", "traffic circle"], "correct_choice_idx": 3, "direct_answers": ["bus", "roundabout", "roundabout", "roundabout", "traffic circle", "pavement", "roundabout", "roundabout", "roundabout", "pavement"], "difficult_direct_answer": false, "rationales": ["The buses are traveling in a circular direction in an area that is also known as a roundabout.", "The buses are in a traffic circle.", "They are going around the circle."], "image": "train2014/COCO_train2014_000000005620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33764, "question_id": "gncHjYmRfpowZLaeAU6YAu", "question": "What type of surface is this game played on?", "choices": ["clay", "court", "field", "sand"], "correct_choice_idx": 2, "direct_answers": ["grass", "turf", "field", "grass field", "grass", "grass", "turf", "ball", "grass", "turf"], "difficult_direct_answer": false, "rationales": ["This is played on a field.", "It is a grassy area with white lines painted on it.", "They are playing on grass."], "image": "train2014/COCO_train2014_000000033764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464736, "question_id": "gnhXmU7W3PMcQFo8fnP2Qq", "question": "What are the large surf boards designed for?", "choices": ["boogie boarding", "skim boarding", "paddle boarding", "riding waves"], "correct_choice_idx": 2, "direct_answers": ["large waves", "surfing", "surfing", "surfing", "paddle boarding", "surfing", "surfing", "paddling", "small waves", "larger people"], "difficult_direct_answer": false, "rationales": ["They are surfing.", "They are big so you can stand on them as you use the oar", "They are holding paddles in their hands so they are paddle boards that you stand on."], "image": "train2014/COCO_train2014_000000464736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494771, "question_id": "gniYUX6MbQoiXb7asCQiyt", "question": "What state are the candles in?", "choices": ["dowsed", "lit", "fake", "unlit"], "correct_choice_idx": 1, "direct_answers": ["lit", "lit", "burned", "lit", "burning", "greece", "crooked", "lit", "lit", "disarray"], "difficult_direct_answer": false, "rationales": ["Most of them have a small fire lit on each of their tops which is fueled by a wick coming from inside each candle.", "The candles have been lit up.", "The candles are lit."], "image": "train2014/COCO_train2014_000000494771.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448181, "question_id": "gnjoFPP9NNU3JS4dKF48kK", "question": "Why is the puppy there?", "choices": ["feeding lunch", "watching it", "for sale", "stolen"], "correct_choice_idx": 1, "direct_answers": ["trip", "resting", "pup sitting", "for comfort", "being watched", "photo", "watching it", "dog walking", "with owner", "entertainment"], "difficult_direct_answer": true, "rationales": ["The person is in charge of the puppy.", "A very small puppy is on a table near a man. people keep an eye on small puppies.", "The person at the table appears to be the owner of or the person taking care of the puppy as it is sitting on top of their table."], "image": "val2014/COCO_val2014_000000448181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10540, "question_id": "gnomvFJrbcRECU4ghyPQFU", "question": "If a car stops at this light what should they do?", "choices": ["turn", "yield", "stop", "go"], "correct_choice_idx": 2, "direct_answers": ["turn right", "continue driving", "go", "wait", "wait", "honk horn", "go", "stop", "start", "stop"], "difficult_direct_answer": false, "rationales": ["The opposite direction has a green light, which means go, so cars here would likely have a red light to stop in order to prevent any collisions.", "Stay stopped until the light turns green.", "The car should stop."], "image": "train2014/COCO_train2014_000000010540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66524, "question_id": "go5CKuX2RipzXUHbBLPurf", "question": "What is the yellow pole in the left corner?", "choices": ["utility pole", "goal post", "foul pole", "flag pole"], "correct_choice_idx": 2, "direct_answers": ["goal", "foul boundary", "foul pole", "homerun pole", "pole", "pole", "foul", "marker", "foul post", "foul pole"], "difficult_direct_answer": false, "rationales": ["In baseball the yellow pole helps to determine if a ball is foul or not.", "It gauges whether a ball is in play or not.", "It helps the umpire determine whether a ball hit in the air and over the fence is a fair ball depending on which side it goes."], "image": "val2014/COCO_val2014_000000066524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473632, "question_id": "go7eVSHTqpoiLhXwCCicyj", "question": "What is a place you commonly see the type of thing which is around the bear's neck?", "choices": ["window", "computer", "lamp post", "gift box"], "correct_choice_idx": 3, "direct_answers": ["gift box", "bow", "man's neck", "scotland", "neck", "children's rooms", "holiday decor", "christmas parties", "carnival", "ballroom"], "difficult_direct_answer": true, "rationales": ["The place is a gift box.", "These teddies are usually found near gift boxes.", "There are usually some kind of bow on presents."], "image": "val2014/COCO_val2014_000000473632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22104, "question_id": "goCAk8yPz8LrDXz9FNctZF", "question": "Which food group would be the healthiest on the dinner plate?", "choices": ["grain", "vegetable", "meat", "dairy"], "correct_choice_idx": 1, "direct_answers": ["vegetables", "vegetables", "vegetable", "vegetable", "fruits", "carrots", "vegetables", "vegetables", "carrots", "vegetable"], "difficult_direct_answer": false, "rationales": ["The carrots on the plate are healthy and good for you.", "The healthiest food on the plate would be the carrots. these belong to the vegetable group.", "The carrots have less fat."], "image": "train2014/COCO_train2014_000000022104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417720, "question_id": "goDmaJDZf6eByviNZYQ3kx", "question": "What do the girls use the umbrella to avoid in this situation?", "choices": ["sunburn", "getting soaked", "lightning", "hail"], "correct_choice_idx": 0, "direct_answers": ["sun", "sun", "sun", "sunburn", "sunburn", "sun", "sun", "hot sun", "rain", "sun"], "difficult_direct_answer": false, "rationales": ["Umbrellas can be used to protect from uv rays. the girls do not want to get skins cancer.", "Girls are holding umbrellas on a sunny day.", "The women don't want to get burned from the sun."], "image": "train2014/COCO_train2014_000000417720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89884, "question_id": "goTQ22wjFv6vLZqmzuax72", "question": "What would a person be doing here?", "choices": ["flying", "digging", "burying", "relaxing"], "correct_choice_idx": 3, "direct_answers": ["relaxing", "sun", "rest", "relaxing", "relaxing vacationing", "relaxing", "vacationing", "sunbathing", "resting", "lounging"], "difficult_direct_answer": false, "rationales": ["The area looks like a vocational facility where a person would go to relax.", "Due to not many people being shown, it's easy to surmise that they are going to relax.", "The person would relax."], "image": "val2014/COCO_val2014_000000089884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212381, "question_id": "gpE5rpr3Fs3xJrxUtpNUR4", "question": "What is near the window?", "choices": ["shampoo bottle", "man", "overalls", "dog"], "correct_choice_idx": 0, "direct_answers": ["shampoo bottle", "bottle", "shampoo", "curtain", "curtains", "bath", "shampoo", "shampoo bottle", "shampoo", "bath tub"], "difficult_direct_answer": false, "rationales": ["The red object looks like cleanser for the hair.", "Shampoo is in the window.", "The shampoo bottle is near."], "image": "train2014/COCO_train2014_000000212381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71138, "question_id": "gpHDjxfJgk3XvicoU4uCdW", "question": "What type of telephone is being used?", "choices": ["pay", "cellular", "rotary", "landline"], "correct_choice_idx": 1, "direct_answers": ["small phone", "cellphone", "cellphone", "nokia", "cellular", "cell", "smartphone", "cell", "cell phone", "cellphone"], "difficult_direct_answer": false, "rationales": ["The phone in the man's hand is small and portable, so it's obviously a cell phone.", "The other options don't apply to wireless technology.", "The man is using a cellphone."], "image": "train2014/COCO_train2014_000000071138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319735, "question_id": "gpNUfcKJ2Buy5LAn4jQuV5", "question": "What color is the older women's scarf?", "choices": ["white", "blue", "red", "pink"], "correct_choice_idx": 0, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["All we have to do is identify the old woman and we see her scarf is white.", "The older woman has a white garment sticking out of her jacket.", "Unless you are colorblind you can easily tell what color scarf she is wearing."], "image": "train2014/COCO_train2014_000000319735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391187, "question_id": "gpSWi7SHVNULgfBLSHpQdp", "question": "What county does this bus go to?", "choices": ["suffolk", "norfolk", "surrey", "cumbria"], "correct_choice_idx": 2, "direct_answers": ["don't know", "united kingdom", "surrey", "esher", "united kingdom", "church cobham", "england", "england", "surrey", "church cobham"], "difficult_direct_answer": false, "rationales": ["That is the county the bus is going.", "There is church cobham written on the bus which is located in answer a. buses typically display their destinations in this manner on the bus.", "This goes to surrey."], "image": "val2014/COCO_val2014_000000391187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274438, "question_id": "gpTU3wLPN67CSYEbijmTmn", "question": "The item in the sky resembles what?", "choices": ["dragon", "bull", "lion", "cat"], "correct_choice_idx": 0, "direct_answers": ["butterfly", "phoenix bird", "dragon", "dragon", "dragon", "dragon", "dragon", "butterfly", "dragon", "butterfly"], "difficult_direct_answer": false, "rationales": ["Looks like a dragon in the sky", "The creature that is depicted in the kite has a long tail with what looks like tail there are also two wings and you can see lizards like the truth in the face in the head all of these can find qualities indicate the kite is supposed to be this creature.", "The other options don't match the design of the kite."], "image": "val2014/COCO_val2014_000000274438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358017, "question_id": "gpZe238gYZeYCCqGPbq6i4", "question": "What general type of plane is on display in front of the building?", "choices": ["fighter", "passenger", "bomber", "cargo"], "correct_choice_idx": 1, "direct_answers": ["aircraft", "propeller", "military plane", "reconnaissance", "military", "jet", "cargo", "bomber", "passenger", "cargo"], "difficult_direct_answer": true, "rationales": ["This is at an air force base.", "This plane could carry a few people.", "This is a fighter jet."], "image": "train2014/COCO_train2014_000000358017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532439, "question_id": "gps7cK6i4NASd4v9KV6VyF", "question": "What part of this animal is larger than the same part on a human?", "choices": ["talon", "spikes", "ear", "tail"], "correct_choice_idx": 2, "direct_answers": ["trunk", "legs", "eyes", "ear", "nose legs", "nose", "legs", "leg", "ears", "eye"], "difficult_direct_answer": true, "rationales": ["The ear is big.", "Elephants have large ears that are bigger than a human's ear.", "The ears on an elephant are much bigger than the ears on a human."], "image": "train2014/COCO_train2014_000000532439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66959, "question_id": "gpxExwaWcKqNnhY3nAQD6Q", "question": "What purpose do the vests serve to do?", "choices": ["fashion", "political statement", "reflective visibility", "shielding"], "correct_choice_idx": 2, "direct_answers": ["reflective visibility", "be seen", "safety visibility", "carry", "be seen", "military", "visibility safety", "visibility", "visibility safety", "safety"], "difficult_direct_answer": false, "rationales": ["Most people use this universal color for safety reasons.", "The vests are bright with reflective strips.", "The purpose is to reflect visibility."], "image": "val2014/COCO_val2014_000000066959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64824, "question_id": "gq7nLDoWoKbGF6atzApvrS", "question": "Where is this photograph likely to have been taken?", "choices": ["soccer field", "reading program", "organization/research facility", "bank"], "correct_choice_idx": 2, "direct_answers": ["university", "school", "university", "university", "office", "college", "work", "graduation", "outside", "organization/research facility"], "difficult_direct_answer": false, "rationales": ["Looks like some kind of work event or convention.", "They are holding notepads and pens.", "The people are all researchers."], "image": "val2014/COCO_val2014_000000064824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253485, "question_id": "gqNTVpdwHeu3krnFGXR7bR", "question": "Why is the man behind the other two holding a flashing object?", "choices": ["being annoying", "making art", "being funny", "taking pictures"], "correct_choice_idx": 3, "direct_answers": ["taking picture", "taking picture", "pictures", "taking photo", "picture", "photographing", "taking picture", "taking pictures", "taking picture", "taking picture"], "difficult_direct_answer": false, "rationales": ["The person appears to be holding a camera and the flash would go off at the moment a picture was taken. the couple also appear to be posing which would also be consistent with answer a.", "This provides light for the camera", "Someone has a camera."], "image": "val2014/COCO_val2014_000000253485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535455, "question_id": "gqQodcMFoo8swYZ2q37JtD", "question": "What is traditionally NOT needed to eat this food?", "choices": ["fork", "sauce", "glass", "plate"], "correct_choice_idx": 0, "direct_answers": ["fork", "fork", "utensils", "fork", "utensils", "utensils", "utensils", "utensils", "spoon", "fork"], "difficult_direct_answer": false, "rationales": ["Forks are usually not necessary to eat slices of pizza.", "People usually eat pizza by holding slices in their hands.", "Pizza is eaten with hands."], "image": "train2014/COCO_train2014_000000535455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214641, "question_id": "gqawT5BgSrssx3f7cz8dSh", "question": "What key ingredient do these things need?", "choices": ["sand", "cherries", "wheat", "bitter melon"], "correct_choice_idx": 2, "direct_answers": ["flour", "flour", "flour", "dough", "oven", "wheat", "flour", "flour", "flour", "dough"], "difficult_direct_answer": false, "rationales": ["The pastries need flour which is usually made from wheat.", "Some people like a grain version of their baked goods.", "These are pastries"], "image": "train2014/COCO_train2014_000000214641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381925, "question_id": "gr8EKmMN9GCXYT5diqff8R", "question": "What is it called when walls are built without mortar?", "choices": ["dry stone", "rubble", "neolithic", "stack"], "correct_choice_idx": 0, "direct_answers": ["stones", "dry stack", "dry stone", "stacking", "drystack", "dry stone", "dry stack", "drystack", "natural", "cobblestone"], "difficult_direct_answer": false, "rationales": ["When walls are built without mortar it's called dry stone.", "The dry stone is the name.", "The walls are made with dry stone."], "image": "val2014/COCO_val2014_000000381925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517492, "question_id": "grAXJJcWUMkDhoZxQVUgY8", "question": "What type musician lives here?", "choices": ["violinist", "triangle player", "percussionist", "pianist"], "correct_choice_idx": 3, "direct_answers": ["pianist", "pianist", "pianist", "pianist", "pianist", "piano", "pianist", "pianist", "pianist", "piano"], "difficult_direct_answer": false, "rationales": ["It is a pianist because a piano is against the wall", "There is a keyboard in the background. keyboards are similar to pianos.", "There is a piano behind the dining table. a musician who plays the piano lives here."], "image": "train2014/COCO_train2014_000000517492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28654, "question_id": "grAvFyk89VACzpAuAQAPyD", "question": "This game is originated from which country?", "choices": ["us", "england", "dutch", "uk"], "correct_choice_idx": 0, "direct_answers": ["united states", "united states", "usa", "usa", "united states", "usa", "usa", "us", "united states", "america"], "difficult_direct_answer": false, "rationales": ["Skateboarding is from the us.", "A guy is doing a skateboard trick on a sidewalk.", "The game is from the us."], "image": "train2014/COCO_train2014_000000028654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431613, "question_id": "grMuAG4o6hrzW2bzAQhJAe", "question": "What is the person in the red coat and green tinted boots having?", "choices": ["fight", "snack", "nothing", "nap"], "correct_choice_idx": 1, "direct_answers": ["lunch", "man", "tea", "soup", "lunch", "snack", "lunch", "soup", "snack", "beverage"], "difficult_direct_answer": false, "rationales": ["They appear to be stopped for a rest and holding food containers in their hands.", "The person is breaking for a quick treat.", "The person with the red coat is having a snack with a bag."], "image": "train2014/COCO_train2014_000000431613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67440, "question_id": "grahfx9Vtbjxaqykq7qZCU", "question": "What is the best baseball net?", "choices": ["string net", "bow net", "rukket net", "golf net"], "correct_choice_idx": 2, "direct_answers": ["rukket", "rukket", "rukket sock", "rukket", "rukket sock", "powernet", "see through", "rukket net", "wire", "unknown"], "difficult_direct_answer": false, "rationales": ["The ball is by a net.", "This is a net so the ball does not go out and hit someone.", "They make golf nets as well."], "image": "train2014/COCO_train2014_000000067440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100746, "question_id": "grfxEWu9dW8HAgY8ZSB7gP", "question": "What are these types of planes generally used for?", "choices": ["crop dusting", "military", "tourism", "commercial travel"], "correct_choice_idx": 1, "direct_answers": ["fighting", "military", "military", "military", "military purposes", "war", "military", "military", "airshows", "war"], "difficult_direct_answer": false, "rationales": ["These planes are fighter jets.", "These are fighter jets and they are flying in formation", "These are jet fighters and not commonly used by the general public."], "image": "train2014/COCO_train2014_000000100746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19781, "question_id": "grpeXkjbGfwW7CzoKH3578", "question": "His chair is most likely made from what material?", "choices": ["marble", "leather", "straw", "metal"], "correct_choice_idx": 1, "direct_answers": ["plastic", "leather", "leather", "leather", "leather", "leather", "leather", "plastic", "leather", "leather"], "difficult_direct_answer": false, "rationales": ["The chair is leather.", "The dark black material is often how leather looks.", "The chair is most likely leather."], "image": "train2014/COCO_train2014_000000019781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28874, "question_id": "grwcancrjw9P66Pzx69qDz", "question": "What is this dog ready to do?", "choices": ["catch", "eat", "play", "rest"], "correct_choice_idx": 3, "direct_answers": ["sleep", "cry", "sleep", "sleep", "cry", "bark", "sleep", "sleep", "sleep", "rest"], "difficult_direct_answer": false, "rationales": ["The dog is sitting on a piece of furniture. there are no food items, toys, or balls near the dog.", "This dog is lying down already on a plush pillow and has a tired look in their eyes.", "It seems as if it is tired and ready to sleep."], "image": "val2014/COCO_val2014_000000028874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365752, "question_id": "gs57Tc9A3N23FkL7bYMf96", "question": "Who is this smiling man?", "choices": ["scientist", "book author", "government official", "citizen"], "correct_choice_idx": 2, "direct_answers": ["president", "government official", "left person", "president", "politician", "politician", "prominent politician", "government leader", "politician", "political"], "difficult_direct_answer": false, "rationales": ["The man is a government official.", "There are government soldiers in the background.they are standing outside of a government building during a greeting.", "The man is being saluted by soldiers."], "image": "val2014/COCO_val2014_000000365752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100811, "question_id": "gsFaXYyiJazCW9sNinfBka", "question": "What vessels are tied to the piers here?", "choices": ["cars", "rafts", "boats", "horse buggies"], "correct_choice_idx": 2, "direct_answers": ["boats", "boats", "boats", "boats", "boat", "boats", "boats", "boats", "boats", "boats"], "difficult_direct_answer": false, "rationales": ["There are boats at the pier.", "Boats would be tied so they don't float away.", "Only the big vehicles that can float are tied up."], "image": "val2014/COCO_val2014_000000100811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321959, "question_id": "gsGHreXPoEcEcfM9u7394E", "question": "Which color is the man looking at the train wearing?", "choices": ["white", "green", "red", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "white", "blue", "blue", "blue", "blue", "blue", "white", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The person is wearing blue.", "If the question refers to the man with the backpack, then this is the right answer.", "The color is blue."], "image": "train2014/COCO_train2014_000000321959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443999, "question_id": "gsSKGX4edXjopfSioXoJw7", "question": "What surface are they skiing on?", "choices": ["sand", "road", "mud", "mountain"], "correct_choice_idx": 1, "direct_answers": ["snow", "snow", "snow", "snow", "snow", "snowy concrete", "road", "snow", "road", "show road"], "difficult_direct_answer": false, "rationales": ["Directly in front of stores like these would be sidewalks. further out from sidewalks would be streets; that's where these people are skiing.", "You can tell by the buildings in the background, as to where they are skiing.", "The building behind indicates they are in an urban setting, and the flat service is road."], "image": "train2014/COCO_train2014_000000443999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271780, "question_id": "gseCASqojcfujrrzKS8HAX", "question": "Who collects the money from this item?", "choices": ["police", "municipal maids", "charity", "meter maids"], "correct_choice_idx": 2, "direct_answers": ["fraternity", "meter maid", "sorority", "sidewalk", "sorority", "meter maid", "meter maid", "sorority alumnae", "parking meter", "charity"], "difficult_direct_answer": false, "rationales": ["The toll says \"donation station\" and \"make a change here\".", "A charity collects the money from this parking meter because it is written donation station on it", "The meter maids collect money."], "image": "val2014/COCO_val2014_000000271780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311895, "question_id": "gsuGCffWHSFKzL6vAaS2mS", "question": "What door must be open to fill a glass with ice here?", "choices": ["none", "microwave", "left", "right"], "correct_choice_idx": 0, "direct_answers": ["fridge", "none", "left", "left", "cupboard", "left", "refrigerator", "freezer", "freezer", "left"], "difficult_direct_answer": false, "rationales": ["The icemaker is on the left side of the fridge.", "The water filler is on the outside.", "There is an ice dispenser on the outside."], "image": "train2014/COCO_train2014_000000311895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336003, "question_id": "gswhvSJPTXDxMBH3Cxnibj", "question": "What type of trick is the man in green performing?", "choices": ["manual", "flip", "handplant", "grind"], "correct_choice_idx": 2, "direct_answers": ["jumping", "stunt", "jump", "handplant", "hand stand", "handstand", "handstand", "handplant", "skateboard", "jump"], "difficult_direct_answer": false, "rationales": ["The skateboarder is doing a handplant because he is upside down with a hand on the ground", "The man is performing a handplant.", "He grabs the ramp and board with his hand"], "image": "train2014/COCO_train2014_000000336003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397383, "question_id": "gt4QkQbW4Lk3HAVhC5WNC2", "question": "What is the little child standing near?", "choices": ["water", "basket", "apple", "cat"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "ocean", "ocean", "ocean", "water", "ocean", "water", "ocean", "water"], "difficult_direct_answer": false, "rationales": ["The child is near the waves.", "The kid is by water.", "The little child in the background stands in front of the retreating surf of the ocean."], "image": "train2014/COCO_train2014_000000397383.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338651, "question_id": "gtCSAkuH7oEUEKcMJzqhDp", "question": "Where is the baby seated while eating cake?", "choices": ["dining table", "high chair", "nursery table", "couch"], "correct_choice_idx": 1, "direct_answers": ["highchair", "high chair", "high chair", "chair", "high chair", "height chair", "high chair", "high chair", "highchair", "high chair"], "difficult_direct_answer": false, "rationales": ["There is a plastic tray in front of the baby. the seat is above the ground.", "The kid has a high chair.", "The baby is in a high chair."], "image": "train2014/COCO_train2014_000000338651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288224, "question_id": "gtHXAeuyVjXGEgeM6z9evV", "question": "What side dish is to be enjoyed with this sandwich?", "choices": ["pears", "potato chips", "bacon", "apples"], "correct_choice_idx": 1, "direct_answers": ["potato chips", "chips", "potato chips", "chips", "potato chips", "chips", "chips", "chips", "chips", "potato chips"], "difficult_direct_answer": false, "rationales": ["By the texture and the fact that sandwiches usually have this side dish, you can surmise what it is.", "That's what is in the bowl.", "The dish is potato chips."], "image": "val2014/COCO_val2014_000000288224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455934, "question_id": "gtQ3E3oAC2TABxhKbxNeUA", "question": "This truck shares a name with a popular American Sporting expo group who plays what sport?", "choices": ["soccer", "tennis", "baseball", "basketball"], "correct_choice_idx": 3, "direct_answers": ["basketball", "hunting", "ski", "basketball", "basketball", "harlem globetrotters", "basketball", "basketball", "basketball", "basketball"], "difficult_direct_answer": false, "rationales": ["The name on the top of the bus says globetrotters. for anyone who follows american sports, they know that the harlem globetrotters were the ambassadors of basketball.", "A truck has logos on the front of it.", "Globetrotter is written on the truck and there is a basketball team doing exhibition that plays basketball."], "image": "train2014/COCO_train2014_000000455934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120441, "question_id": "gtVDjVbgkK7Sxdabqru2Cz", "question": "Where does the person with the ball stand?", "choices": ["outfield", "pitcher's mound", "home plate", "bull pen"], "correct_choice_idx": 1, "direct_answers": ["mound", "pitcher's mound", "pitchers mound", "mound", "pitchers mound", "pitcher's mound", "mound", "baseball field", "pitcher's mound", "pitcher's mound"], "difficult_direct_answer": false, "rationales": ["He throws the ball to the batter", "He will throw the ball to the batter.", "The player is pitching the ball and they have their own section to stand."], "image": "val2014/COCO_val2014_000000120441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42081, "question_id": "gtizgTRYpuorgPZHRaJtFo", "question": "What setting is it likely to be?", "choices": ["restaurant", "library", "school", "home"], "correct_choice_idx": 0, "direct_answers": ["lunchroom", "restaurant", "restaurant", "donut shop", "bakery", "cafe", "diner", "cafeteria", "bakery", "kitchen"], "difficult_direct_answer": false, "rationales": ["These donuts were bought in the store.", "It appears these donuts were just purchased and came in the bags laid out on the table in this picture. this would only be possible at a restaurant of some kind.", "The setting is a restaurant."], "image": "train2014/COCO_train2014_000000042081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358399, "question_id": "gtyXPzi4jrer3e5feBEnw2", "question": "Why is the man leaning forward?", "choices": ["to game", "to hit", "to reach", "to throw"], "correct_choice_idx": 0, "direct_answers": ["to game", "playing wii", "walking", "playing games", "using wiimote", "playing wii", "playing game", "playing game", "playing game", "playing wii"], "difficult_direct_answer": false, "rationales": ["A man is leaning forward with a wii controller in his hand.", "The man leaning forward has a wii controller in hand and is looking ahead. he is mostly like looking to game with person next to him.", "He has a controller in his hand"], "image": "val2014/COCO_val2014_000000358399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441008, "question_id": "guFZijLyxyy8byjNg9sBbK", "question": "What is a potential danger for this man?", "choices": ["dolphins", "sharks", "dogs", "fish"], "correct_choice_idx": 1, "direct_answers": ["drowing", "sharks", "drowning", "waves", "sharks", "sharks", "drowning", "waves", "water", "sharks"], "difficult_direct_answer": false, "rationales": ["He is going to surf which is done in the ocean. that is where sharks live.", "The man could get eaten by sharks.", "The danger is sharks."], "image": "val2014/COCO_val2014_000000441008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30478, "question_id": "gvK8e9JeVmM8VHLzpPTTVy", "question": "Where is this train located?", "choices": ["marketing wing", "airport", "bus stop", "in storage"], "correct_choice_idx": 3, "direct_answers": ["train station", "subway", "train station", "station", "in warehouse", "museum", "train station", "station", "train station", "in storage"], "difficult_direct_answer": false, "rationales": ["The train appears to be in a garage.", "It is inside a building off the main track", "The train is in storage since it's in a garage."], "image": "val2014/COCO_val2014_000000030478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190469, "question_id": "gvMme2Q7V3b3LpbR69NqRu", "question": "What type of silver utensil sits atop the salad bar?", "choices": ["spoon", "tongs", "knife", "fork"], "correct_choice_idx": 1, "direct_answers": ["tongs", "prongs", "tongs", "tongs", "tongs", "tongs", "tongs", "tongs", "tongs", "tongs"], "difficult_direct_answer": false, "rationales": ["Silver tongs are open on top of the tomatoes.", "The silver utensils are tongs.", "The tongs can be used."], "image": "train2014/COCO_train2014_000000190469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209018, "question_id": "gvehGajgtVG4geAT7nqHMf", "question": "Why have the people gathered?", "choices": ["protest", "pray", "compete", "eat"], "correct_choice_idx": 0, "direct_answers": ["protest", "team meeting", "protest", "parade", "protest", "protest", "hat organization", "crowd", "protest", "party"], "difficult_direct_answer": false, "rationales": ["The people are expressing their concerns by protesting.", "The people are on the streets and they are carrying signs.", "It is likely the people are gathered to protest something."], "image": "val2014/COCO_val2014_000000209018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476335, "question_id": "gvfyPXHK6myCBQhVKwnLxJ", "question": "Why are the men behind the counter?", "choices": ["to purchase", "to cook", "to talk", "to eat"], "correct_choice_idx": 1, "direct_answers": ["cooking", "cooking food", "making food", "chef", "cooking", "to cook", "working", "washing", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["The men are cooks who are preparing food items for customers. at times, cooks will stand behind the counter and prepare items at restaurants.", "The men are cooking.", "They are in a kitchen"], "image": "val2014/COCO_val2014_000000476335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180504, "question_id": "gwFXrwMbPJoqvxYPs3D3fN", "question": "What is tattooed on his right leg?", "choices": ["gun", "brain", "heart", "flag"], "correct_choice_idx": 2, "direct_answers": ["heart", "sun", "sun", "sun", "sun", "sun", "letter p", "bug", "heart", "sun"], "difficult_direct_answer": false, "rationales": ["A heart is tattooed on.", "A heart is on his leg.", "There is a shape of a heart on the man's leg."], "image": "train2014/COCO_train2014_000000180504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109889, "question_id": "gwX7ZgHhskwQV7gfPuJEdN", "question": "What does the backpack contain?", "choices": ["calculator", "laptop", "personal belongings", "ipad"], "correct_choice_idx": 2, "direct_answers": ["water", "supplies", "unknown", "staples", "supplies food", "food", "ski gear", "personal belongings", "supplies", "supplies"], "difficult_direct_answer": false, "rationales": ["Backpacks are used to store random personal items.", "The backpack has personal stuff.", "The backpack contains personal and survival items."], "image": "val2014/COCO_val2014_000000109889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315057, "question_id": "gwZEULJBQYuHetpumn5rrW", "question": "Which item dominates this dish?", "choices": ["sauce", "peppers", "meat", "broccoli"], "correct_choice_idx": 2, "direct_answers": ["beef", "beef", "beef", "beef", "meat", "broccoli", "beef", "beef", "meat", "brocolli"], "difficult_direct_answer": false, "rationales": ["The brown item looks like steak, and as if it is a beef stir fry.", "There is more meat then anything else on the plate.", "There is more beef than veggies."], "image": "train2014/COCO_train2014_000000315057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220843, "question_id": "gwokLhzSnLLJSfJZwEoJ7A", "question": "What is this type of vehicle at the very front of the image referred to?", "choices": ["bicycle", "truck", "car", "motorcycle"], "correct_choice_idx": 3, "direct_answers": ["bike", "scooter", "motor scooter", "moped", "motorcycle", "scooter", "scooter", "rv", "e-bike", "motor scooter"], "difficult_direct_answer": false, "rationales": ["It's normally referred to as a motorbike.", "The one closer is a motorcycle and the other is a bicycle.", "The vehicle at the front of the image is a motorcycle."], "image": "train2014/COCO_train2014_000000220843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527786, "question_id": "gwviGiQSAXHV7omZRXamMF", "question": "What is the person wearing?", "choices": ["sunglasses", "crown", "feathers", "armor"], "correct_choice_idx": 0, "direct_answers": ["sunglasses", "sunglasses", "sunglasses", "sunglasses", "tshirt", "jeans shirt", "sunglasses", "white shirt", "t shirt", "jeans"], "difficult_direct_answer": false, "rationales": ["The sun glasses are seen on the face on the man.", "The man is wearing sunglasses because he is outside and the sun is shining", "The person is covering their eyes with glasses."], "image": "train2014/COCO_train2014_000000527786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411666, "question_id": "gx9ztx6Sydq37MNDjGbJnH", "question": "What type of spread is in the fridge?", "choices": ["jelly", "peanut butter", "marshmallow fluff", "nutella"], "correct_choice_idx": 1, "direct_answers": ["peanut", "peanut", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut"], "difficult_direct_answer": false, "rationales": ["The peanut butter is seen in the fridge.", "The spread is peanut butter.", "There is a peanut butter spread inside of the fridge."], "image": "val2014/COCO_val2014_000000411666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544956, "question_id": "gxJfXHH4usN9x93FeRW4nz", "question": "What venue should this be?", "choices": ["waiting room", "school hallway", "hotel lobby", "gallery"], "correct_choice_idx": 3, "direct_answers": ["museum", "art museum", "museum", "art museum", "museum", "gallery", "museum", "art museum", "museum", "museum"], "difficult_direct_answer": false, "rationales": ["It is a room in an art museum, with a few simple benches for seating.", "There are ornate paintings in decorative frames lined up on the wall with placards labelling them. there is also seating provided for patrons which is all consistent with answer a.", "There are paintings with descriptions on the wall"], "image": "val2014/COCO_val2014_000000544956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554232, "question_id": "gxyYf8e8UUYZhkmZXpudHg", "question": "Which geographic area of the United States did this locomotive spend its working life?", "choices": ["west", "south", "east", "north"], "correct_choice_idx": 0, "direct_answers": ["pacific", "pacific coast", "west coast", "arkansas", "north", "pacific", "west", "west", "northwest", "pacific"], "difficult_direct_answer": false, "rationales": ["This is an old steam engine and has the word pacific on it", "The train is the pacific line.", "The word pacific is on the side of the train."], "image": "train2014/COCO_train2014_000000554232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394790, "question_id": "gyBpWnVsYvs2vRNaowfapr", "question": "The person in the group that is at higher of risk their hair freezing from the cold is wearing what color jacket?", "choices": ["brown", "black", "yellow", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "grey", "grey", "green", "blue", "green", "grey", "gray", "green", "black"], "difficult_direct_answer": false, "rationales": ["The person wearing a black jacket does not have all their hair covered. they have long hair and it is not completely covered.", "The person in green is not wearing a hat.", "The green person doesn't have a hat."], "image": "train2014/COCO_train2014_000000394790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468471, "question_id": "gyTjXjLLPagebDvmPbzsyV", "question": "What space is this bed located in?", "choices": ["furniture store", "doctors office", "waiting room", "video store"], "correct_choice_idx": 0, "direct_answers": ["furniture store", "showroom floor", "store", "bed", "furniture store", "store", "store", "furniture showroom", "public", "ikea"], "difficult_direct_answer": false, "rationales": ["We get the answer from a process of elimination. there are no doctor office's that look like that. a waiting room would never have a bed in it and there is no way this is a video store; especially since there are hardly any video stores around anyway.", "It is on display with other related items for purchase.", "These household items are displayed on a showroom floor for people to try out for comfort."], "image": "val2014/COCO_val2014_000000468471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297227, "question_id": "gyqM5QojrYis9x7rirvoLZ", "question": "What profession are the men on bikes?", "choices": ["lawyers", "racers", "dentists", "police officers"], "correct_choice_idx": 3, "direct_answers": ["police", "police", "police", "police", "police", "policemen", "police", "policemen", "police officers", "policemen"], "difficult_direct_answer": false, "rationales": ["The men are wearing police jackets.", "By the name on their jackets and their utility belts, it's easy to tell who they are.", "Their jackets literally state this fact."], "image": "train2014/COCO_train2014_000000297227.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550702, "question_id": "gyswXjsPr8NesePbGQVmyL", "question": "Which food on the table provides the most protein?", "choices": ["carrot", "rice", "lettuce", "beans"], "correct_choice_idx": 3, "direct_answers": ["beans", "beans", "beans", "beans", "beans", "beans", "soup", "soup", "beans", "beans"], "difficult_direct_answer": false, "rationales": ["Beans are on a table and are high in protein.", "Beans are full of protein and the thing on the table richest in it.", "The food is beans."], "image": "val2014/COCO_val2014_000000550702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559902, "question_id": "gz42eminmNRS7MHNYULS5M", "question": "In which continent is this road located?", "choices": ["europe", "eastern asia", "africa", "western asia"], "correct_choice_idx": 3, "direct_answers": ["asia", "asia", "india", "western asia", "asia", "china", "north america", "asia", "asia", "hot one"], "difficult_direct_answer": false, "rationales": ["By looking at the horse and carriage it looks to be in asia.", "The decorations on the horse/carriage look to be kuwaiti.", "The landscape, and the decorations on the horses are comparable to those found in western asia."], "image": "val2014/COCO_val2014_000000559902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487796, "question_id": "gz9e47hb4xqZXvML9TbmxH", "question": "What could the pot with the round white gauge on top be used for?", "choices": ["baking bread", "canning fruit", "making candy", "peeling carrots"], "correct_choice_idx": 1, "direct_answers": ["soup", "canning fruit", "tea", "shaking milk", "pressurizing", "tea", "pressure cooking", "pressure cooking", "storage", "heating"], "difficult_direct_answer": false, "rationales": ["The pot could be used for canning purposes since it has a preserving lid.", "The gauge is for pressurization which helps with canning fruit.", "The pot is for fruit."], "image": "train2014/COCO_train2014_000000487796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397217, "question_id": "gzNCmxfm6tj7DeAtwkdRAW", "question": "What is the man ready to do?", "choices": ["kill", "cut", "run", "call"], "correct_choice_idx": 1, "direct_answers": ["cut cake", "cut cake", "cut cake", "cut cake", "cut cake", "cut cake", "cut", "cake cutting", "cake cutting", "cut cake"], "difficult_direct_answer": false, "rationales": ["The man sits in front of an iced sheet cake as his right hand reaches for a big knife with which he'll start cutting pieces of the cake for everybody.", "They are going to cut the cake.", "The man wants to cut into the cake."], "image": "train2014/COCO_train2014_000000397217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407795, "question_id": "gzuyaqJY7McNh7a8xNL9kf", "question": "How many people shown here belong to the same sports team?", "choices": ["four", "three", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Both guys are on the same team.", "Both people are on the same team.", "Two people are on the same team."], "image": "train2014/COCO_train2014_000000407795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335332, "question_id": "h2aHuL3QQfTyuNvRZBxsLM", "question": "The items laid out on the counter are what part of a recipe?", "choices": ["snacks", "instructions", "scraps", "ingredients"], "correct_choice_idx": 3, "direct_answers": ["ingredients", "ingredients", "pizza", "ingredients", "stir fry", "yes", "stir fry", "stir fry", "ingredients", "ingredients"], "difficult_direct_answer": false, "rationales": ["The items are not yet assembled in a dish together.", "The items are ingredients.", "The ingredients are all laid out."], "image": "train2014/COCO_train2014_000000335332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365958, "question_id": "h2cc5cfvYeJP7DUsdnVMaV", "question": "What is that light in the distance called?", "choices": ["lamp", "post", "street lamp", "torch"], "correct_choice_idx": 3, "direct_answers": ["torch", "camera blobs", "reflection", "torch light", "trail light", "torch", "camera flash", "floodlights", "torch light", "torch"], "difficult_direct_answer": false, "rationales": ["The street lamps are on in public places.", "The light in the distance is a flame.", "The light is a torch."], "image": "train2014/COCO_train2014_000000365958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188545, "question_id": "h2kzwdGcUZdoi8VZ266BEU", "question": "What action is the man wearing yellow taking?", "choices": ["flipping", "running", "leaving", "rolling"], "correct_choice_idx": 0, "direct_answers": ["flipping", "upside down", "backflip", "flip", "flipping", "flip", "flip", "backflip", "snowboarding trick", "jumping"], "difficult_direct_answer": false, "rationales": ["The man has his feet over head in the air.", "A person is upside down with a snowboard.", "The man is flipping."], "image": "train2014/COCO_train2014_000000188545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378657, "question_id": "h3SeGtiDv5YvYgaPKeAS9h", "question": "What sound do people here await?", "choices": ["clock chime", "prayer", "rodeo band", "silence"], "correct_choice_idx": 0, "direct_answers": ["clock chime", "bell chime", "clock", "bell", "church bells", "clock", "church bells", "clock bell", "bell", "bells"], "difficult_direct_answer": false, "rationales": ["People stand near a large clocktower. people like to hear the chime of large clocks.", "The people here are gathered in front of a clock tower building. beside telling time, many tower clocks will give off a chime at each passing hour.", "The people want the clock to chime."], "image": "val2014/COCO_val2014_000000378657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38432, "question_id": "h3wagDTUndCAuKp8eto8GG", "question": "They are most likely hoping to advance what?", "choices": ["spirituality", "social life", "careers", "romantic life"], "correct_choice_idx": 2, "direct_answers": ["career", "society", "career", "technology", "careers", "knowledge", "discussions", "their education", "their position", "knowledge"], "difficult_direct_answer": false, "rationales": ["The people are wearing business suits.", "This is a business setting so it has nothing to do with spirituality, social life or romantic life.", "This appears to be a classroom. it's hard to tell what they're learning."], "image": "train2014/COCO_train2014_000000038432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258249, "question_id": "h3yWjyQCY5VUyTigzwGCpv", "question": "Transparent umbrella is used only from protecting?", "choices": ["rain", "uv", "sun", "wind"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "glass umbrella", "glass umbrella", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The umbrella is for rain.", "The umbrella is covered in water.", "Transparent umbrellas are used for rain. rain drops are on them."], "image": "train2014/COCO_train2014_000000258249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322031, "question_id": "h45Wcd5VSZrpqChYwayvQR", "question": "What Disney cartoon character is based on this animal?", "choices": ["barbie", "dumbo", "genie", "aladdin"], "correct_choice_idx": 1, "direct_answers": ["dumbo", "dumbo", "dumbo", "dumbo", "dumbo", "dumbo", "dumbo", "dumbo", "dumbo", "dumbo"], "difficult_direct_answer": false, "rationales": ["The animal is an elephant.", "Disney made a movie about an elephant named \"dumbo.\"", "Dumbo is the character."], "image": "val2014/COCO_val2014_000000322031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510813, "question_id": "h469JBgEXGQ5MHpXGWizVm", "question": "What is the fence's purpose?", "choices": ["stop balls", "cut grass", "hold children", "destroy grass"], "correct_choice_idx": 0, "direct_answers": ["protect crowds", "crowd protection", "protection", "safety", "stop balls", "stop ball", "for protection", "block ball", "balls protection", "spectator protection"], "difficult_direct_answer": true, "rationales": ["The fence is used to stop balls.", "The fence keeps balls out.", "The fence keeps balls from flying out."], "image": "train2014/COCO_train2014_000000510813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252639, "question_id": "h4KP6Jq9geDaEP7MdcEdki", "question": "What type setting is this roadway located in?", "choices": ["rural", "north pole", "urban", "tundra"], "correct_choice_idx": 2, "direct_answers": ["business area", "residential", "crossing", "urban", "city", "city", "city", "city", "city", "city"], "difficult_direct_answer": false, "rationales": ["There are buildings and streets clustered together in a city like setting.", "A road is in the city.", "The setting is urban."], "image": "val2014/COCO_val2014_000000252639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146388, "question_id": "h4KTJgPtaowNjyVgik7sDj", "question": "Where can people sit here?", "choices": ["car hood", "ski lift", "hammock", "bench"], "correct_choice_idx": 3, "direct_answers": ["picnic table", "table", "near sea", "bench", "picnic table", "bench", "picnic table", "bench", "picnic table", "picnic table"], "difficult_direct_answer": false, "rationales": ["There is a park bench visible in the picture.", "People could sit here at the park bench.", "There are two planks to sit on with tables attached."], "image": "train2014/COCO_train2014_000000146388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343148, "question_id": "h4PP3JzKKusY6NYnh8R8Fa", "question": "Who is most likely to get hurt?", "choices": ["baby", "striped top", "man", "pink top"], "correct_choice_idx": 0, "direct_answers": ["baby", "child", "skateboarder", "child", "baby", "child", "toddler", "baby", "kid", "kid"], "difficult_direct_answer": false, "rationales": ["A baby is on a skateboard with another person. babies are not good at skating.", "The baby can be hurt.", "The child is wearing no safety gear and is close to the ground and on a dangerous moving object; she is also unlikely to have the quick reflexes to prevent herself from injury."], "image": "train2014/COCO_train2014_000000343148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237834, "question_id": "h4YvRLrJYG5JkeYfnRNPiL", "question": "What is the yellow item near the egg?", "choices": ["corn muffin", "cheese", "canary", "lemon"], "correct_choice_idx": 1, "direct_answers": ["cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["This is a slice of dairy product", "American cheese is yellow and melts.", "A slice of something shiny, melted, and orange is on a burger. cheese is often served on burgers."], "image": "train2014/COCO_train2014_000000237834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465765, "question_id": "h4f4af6xweq84SHcsL6Gyd", "question": "Why is the man in front leaning while on the board?", "choices": ["to turn", "to spin", "to sit", "to jump"], "correct_choice_idx": 0, "direct_answers": ["balance", "turn", "for direction", "controls direction", "to turn", "balance", "stay upright", "balance", "steering", "maintain balance"], "difficult_direct_answer": false, "rationales": ["The man is leaning in a leftward direction so he can avoid going straight.", "Balance determines direction on a snowboard.", "He is leaning to go in a different direction then what he was going."], "image": "train2014/COCO_train2014_000000465765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411642, "question_id": "h4nDNFAusAh8cgyiBJmajJ", "question": "What is the batter about to do?", "choices": ["nothing", "strike", "quit", "run"], "correct_choice_idx": 3, "direct_answers": ["run", "run", "run", "hit ball", "run", "run", "hit ball", "strikeout", "run", "run"], "difficult_direct_answer": false, "rationales": ["The batter runs.", "The batter is about to go for a run.", "It looks like the batter has already hit the ball as his bat is extended behind him, so he will run next."], "image": "val2014/COCO_val2014_000000411642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478055, "question_id": "h57VnnBoT4ZBeamHoyjpKN", "question": "The family of cows indicate this is good grounds for what?", "choices": ["recreation", "hunting", "grazing", "farming"], "correct_choice_idx": 2, "direct_answers": ["grazing", "grazing", "grazing", "grazing", "grazing", "grazing", "grazing", "grazing", "grazing", "grazing"], "difficult_direct_answer": false, "rationales": ["They are eating the grasses", "There seems to be a lot to eat here for cows and two of them are eating at the moment.", "There is a lot of grass to eat"], "image": "val2014/COCO_val2014_000000478055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390345, "question_id": "h5HeNa6BQuHYmdJYxEDWRB", "question": "The pavement is made using which one of these ingredients?", "choices": ["copper", "cement", "sulfur", "gold"], "correct_choice_idx": 1, "direct_answers": ["concrete", "sand", "concrete", "concrete", "concrete", "concrete", "cement", "cement", "concrete", "cement"], "difficult_direct_answer": false, "rationales": ["The pavement is gray and solid.", "It is grey and solid", "The surface has the color and texture of cement."], "image": "train2014/COCO_train2014_000000390345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354569, "question_id": "h5JMEFknb37eypdpWXdN9V", "question": "What event is taking place here?", "choices": ["break-in", "vacation", "animal abuse", "moving home"], "correct_choice_idx": 3, "direct_answers": ["dog show", "cat", "moving home", "dog date", "house", "unknown", "photo taking", "afternoon nap", "dog playdate", "photo"], "difficult_direct_answer": true, "rationales": ["There are bags around the house.", "There are moving boxes in the background.", "There are boxes in the back that are typically used for moving."], "image": "train2014/COCO_train2014_000000354569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114540, "question_id": "h5JMfswQJKxZFpukjnT23F", "question": "What is the name of the structure where the people are riding?", "choices": ["highway", "gangplank", "quay", "overpass"], "correct_choice_idx": 2, "direct_answers": ["wheelchairs", "shipyard", "pier", "pier", "dock", "quay", "dock", "cycle", "dock", "bicycles"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this type of scene.", "There are people standing alongside a concrete ledge by a marina with boats.", "The structure is where a quay is."], "image": "train2014/COCO_train2014_000000114540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115014, "question_id": "h5PNk7sNMBWrV65zsnsPqR", "question": "How are the candles on the wall powered?", "choices": ["oil", "wood", "electricity", "fire"], "correct_choice_idx": 2, "direct_answers": ["two", "electricity", "electricity", "electricity", "electricity", "electricity", "two", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["A cord can be seen running down the wall from the fixture that holds the candle. the cord would then be plugged into the wall to provide power to the candle shaped lights.", "Cords are hanging from candles on the wall.", "Cords can be seen hanging from them."], "image": "train2014/COCO_train2014_000000115014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282415, "question_id": "h5Q4JYj2xTGiJSsYZPvRzJ", "question": "Of what use are the items on top of the white van?", "choices": ["extra seating", "packing cushioning", "for sale", "garbage"], "correct_choice_idx": 1, "direct_answers": ["packing cushioning", "landscaping", "prevent breakage", "cushion tv's", "protection", "shipping material", "cushioning", "padding", "cushions", "fireplace"], "difficult_direct_answer": true, "rationales": ["There is foam to keep the items safe.", "There is a trash can near the truck and the cushions appear to be damaged.", "The package cushioning is on top."], "image": "train2014/COCO_train2014_000000282415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333691, "question_id": "h5ZyjokErfocPsb4bH8hhZ", "question": "How old is he now?", "choices": ["older adult", "teen", "young adult", "child"], "correct_choice_idx": 0, "direct_answers": ["four", "old", "101 years", "three", "80", "five", "dead", "elderly", "six", "older adult"], "difficult_direct_answer": true, "rationales": ["Is is most likely an adult.", "This is a very old picture so the child would be a senior by now.", "This is an old black and white photo anyone would be an older man now."], "image": "val2014/COCO_val2014_000000333691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151211, "question_id": "h5dbJqzXX3LAQu8q4g3d8j", "question": "Which region in Europe is this bus line servicing?", "choices": ["united kingdom", "germany", "france", "italy"], "correct_choice_idx": 0, "direct_answers": ["brighton", "england", "brighton", "england", "united kingdom", "brighton", "brighton", "brighton", "uk", "england"], "difficult_direct_answer": false, "rationales": ["Brighton line is in the united kingdom.", "The region is the uk.", "The bus line is advertising uk brands."], "image": "train2014/COCO_train2014_000000151211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39309, "question_id": "h5mkPfghaA377kZcS5KNe8", "question": "How many visible pieces does the highest large kite have connected below it?", "choices": ["one", "eight", "six", "four"], "correct_choice_idx": 2, "direct_answers": ["six", "six", "seven", "six", "six", "six", "two", "six", "six", "six"], "difficult_direct_answer": false, "rationales": ["There are six kites in the sky that are easy to see.", "There are six pieces below the largest kite.", "The kite seems to be one piece and doesn't have any attachments."], "image": "train2014/COCO_train2014_000000039309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464087, "question_id": "h5vi2ujWS3aEbw3JM7zYvU", "question": "What is the person with the blue helmet doing?", "choices": ["swinging bat", "jumping", "riding motorcycle", "sky diving"], "correct_choice_idx": 0, "direct_answers": ["hitting ball", "swinging bat", "swinging bat", "swinging bat", "swinging bat", "batting", "swinging bat", "hitting", "cricket", "batting"], "difficult_direct_answer": false, "rationales": ["The person is batting in a game.", "It's also known as a wide swing.", "He is playing baseball and is beside the catcher and umpire."], "image": "val2014/COCO_val2014_000000464087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293979, "question_id": "h64rwSPNrhNgCqj9NyvcGq", "question": "What shot is the man wearing a hat making?", "choices": ["forehand", "serve", "backhand", "lob"], "correct_choice_idx": 2, "direct_answers": ["backhand", "black", "backhand", "backhand", "back hand", "serve", "under", "backhand volley", "backhand", "chopper grip"], "difficult_direct_answer": false, "rationales": ["You can tell by how he is holding the racket as to what type of hit he is trying to do.", "The man in the hat is prepared to make a backhand shot at the ball.", "He is using the hand behind him."], "image": "train2014/COCO_train2014_000000293979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 927, "question_id": "h68bzh9BnUg44ok4eRA2du", "question": "What major skateboard safety gear is missing on the girl with pigtails?", "choices": ["vest", "goggles", "elbow pads", "jacket"], "correct_choice_idx": 2, "direct_answers": ["elbow pads", "elbow pads", "elbow pads", "elbow pads", "elbow pads", "elbow pads", "elbow pads", "elbow pads", "elbow pads", "elbow pads"], "difficult_direct_answer": false, "rationales": ["Elbow pads are not present.", "The middle of her arms where they bend are bare.", "The girl is wearing that in case she falls."], "image": "train2014/COCO_train2014_000000000927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364075, "question_id": "h6kgfbMP8ABkWFGytKD93h", "question": "In which type setting do the Giraffes rest?", "choices": ["museum", "park", "car lot", "racetrack"], "correct_choice_idx": 1, "direct_answers": ["zoo", "zoo", "dirt sand", "zoo", "natural", "green space", "zoo", "park", "forest", "sand"], "difficult_direct_answer": false, "rationales": ["The giraffes are in a green area of a zoo.", "The green grass and scenery behind them indicates they are in a park.", "There is a lot of grass and it is a natural looking habitat."], "image": "val2014/COCO_val2014_000000364075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327218, "question_id": "h6nBfLFQxXWDUE58sJNX3B", "question": "What are the small animals doing?", "choices": ["digging holes", "eating grass", "fighting", "playing"], "correct_choice_idx": 1, "direct_answers": ["grazing", "herding", "grass eating", "eating grass", "grazing", "eating grass", "grasing", "grazing", "grazing", "grazing"], "difficult_direct_answer": false, "rationales": ["The animals are not interacting with each other. they are consuming the material that is on the surface of the ground.", "The animals are munching.", "The small animals are grazing, not digging, playing, or fighting."], "image": "train2014/COCO_train2014_000000327218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274642, "question_id": "h6uoLwKChJUSva37erWRS7", "question": "Which birthday is the little boy celebrating?", "choices": ["first", "second", "third", "fifth"], "correct_choice_idx": 0, "direct_answers": ["one", "first", "one year", "first", "one", "first", "first", "one", "first", "one"], "difficult_direct_answer": false, "rationales": ["When children turn one, their parents usually get them a smash cake, like this one, to well smash.", "This is his first.", "He has a cake in front of him with the word one on it."], "image": "train2014/COCO_train2014_000000274642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131162, "question_id": "h6wz5V9CPoXG96avjr4jbR", "question": "What is in the bottle's opening?", "choices": ["tab", "straw", "thumb", "cork"], "correct_choice_idx": 3, "direct_answers": ["cork", "cork", "wooden cork", "cork", "box", "left side", "cupboard", "cork", "cork", "cork"], "difficult_direct_answer": false, "rationales": ["The bottle has a cork.", "There's a cork in the opening.", "There is a cork in the bottle."], "image": "train2014/COCO_train2014_000000131162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164568, "question_id": "h73jjkJyqKccBv7W9KvWL9", "question": "Why are the cows here?", "choices": ["to mingle", "sell milk", "to eat", "avoid danger"], "correct_choice_idx": 2, "direct_answers": ["eating", "grazing", "to graze", "grazing", "grazing", "grazing", "grazing", "to eat", "eating", "grazing"], "difficult_direct_answer": false, "rationales": ["Cows are in a grassy field with their heads bent down to the ground.", "They are grazing in the grass.", "The cows are grazing on the grass."], "image": "val2014/COCO_val2014_000000164568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286678, "question_id": "h76Nm79WyXuw5GWHuSvHop", "question": "What is being used to pull the black trailer?", "choices": ["dog", "men", "bike", "car"], "correct_choice_idx": 3, "direct_answers": ["car", "car", "hitch", "car", "car", "car", "car", "hitch", "car", "car"], "difficult_direct_answer": false, "rationales": ["The car is pulling the trailer.", "The trailer would be pulled by the vehicle in front of it.", "The car is being used to pull it."], "image": "train2014/COCO_train2014_000000286678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372433, "question_id": "h7AQbHF6C23jz5arV5rxTi", "question": "What are the black lamps used to control?", "choices": ["traffic", "animals", "light", "noise"], "correct_choice_idx": 0, "direct_answers": ["traffic lights", "glare", "traffic", "traffic", "traffic", "traffic pedestrians", "traffic", "traffic", "stoplights", "lights"], "difficult_direct_answer": false, "rationales": ["This is a light that controls the traffic.", "It is a stoplight which directs pedestrians and vehicles regarding when to stop, and when to go.", "The lamps are used for traffic lights."], "image": "val2014/COCO_val2014_000000372433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544732, "question_id": "h7QfUzd9de56tLYHjZURAr", "question": "Passengers disembarking from the busses seen here might do what in the yellow canopied building?", "choices": ["catch rides", "complain", "dine", "game"], "correct_choice_idx": 2, "direct_answers": ["wait", "eat", "eat", "order coffee", "eat", "order coffee", "eat", "dine", "eat", "eat"], "difficult_direct_answer": false, "rationales": ["The building is a restaurant.", "The passengers dine.", "The passengers are going to a cafe."], "image": "train2014/COCO_train2014_000000544732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9025, "question_id": "h7agrYhsvfZLw2xQFGaa89", "question": "The expression on the people's faces reveal that the bus is what?", "choices": ["fun", "unpleasant", "pleasant", "scary"], "correct_choice_idx": 1, "direct_answers": ["delayed", "delayed", "busy", "hot", "boring", "behind schedule", "unpleasant", "going slow", "moving", "stopping"], "difficult_direct_answer": true, "rationales": ["The expressions show boredom.", "People sit in a line together, all with negative expressions on their faces.", "They have unhappy expressions on their faces."], "image": "train2014/COCO_train2014_000000009025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145313, "question_id": "h7nJurND3U569nYjRYumqP", "question": "What are the little brown objects in the salad?", "choices": ["pinto peans", "split peas", "garbanzo beans", "kidney beans"], "correct_choice_idx": 2, "direct_answers": ["chickpeas", "chickpeas", "beans", "chick peas", "chestnuts", "chickpeas", "chick peas", "garbanzo beans", "chickpeas", "litchi nuts"], "difficult_direct_answer": false, "rationales": ["There are beans in the bowl.", "These are also known as chickpeas.", "Garbanzo beans are a light brown color."], "image": "train2014/COCO_train2014_000000145313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101630, "question_id": "h7num2SBCV3uqCT9pkrjBZ", "question": "What is near the sink?", "choices": ["mouse", "cat", "electric toothbrush", "dog"], "correct_choice_idx": 2, "direct_answers": ["lotion", "electric toothbrush", "toothbrush", "toothbrush", "water bottle", "toiletries", "toothbrush", "toothbrush", "towel", "hair conditioners"], "difficult_direct_answer": false, "rationales": ["The first item is the only one that can be seen in the image.", "An electric toothbrush is shown.", "There is an object with bristles and a button to power the bristles near the edge of the sink. the button turns on the bristles."], "image": "train2014/COCO_train2014_000000101630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379841, "question_id": "h7sPNcrYs85xvFfdVjok7Y", "question": "Why is he in midair?", "choices": ["showing off", "is bouncing", "is falling", "performing stunt"], "correct_choice_idx": 3, "direct_answers": ["midjump", "skateboard jump", "jump", "doing trick", "fun", "stunt", "performing stunt", "performing trick", "skateboarding trick", "skateboard jump"], "difficult_direct_answer": true, "rationales": ["This person is trying to fly through the air.", "He's performing.", "You can tell by the skateboard and the position he is in, as to what he is trying to accomplish."], "image": "val2014/COCO_val2014_000000379841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281541, "question_id": "h89yfAUbCXK3rHCKxdSoSp", "question": "Why does the woman hold something to her head?", "choices": ["dancing", "singing", "listening", "music screening"], "correct_choice_idx": 2, "direct_answers": ["phone conversation", "flipphone", "using phone", "phone call", "listening", "talk", "to hear", "making call", "to talk", "talking phone"], "difficult_direct_answer": true, "rationales": ["The woman has a phone held to her ear.", "It is so she can hear someone on the call", "The woman is holding a cell phone to her head. it has a speaker."], "image": "val2014/COCO_val2014_000000281541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419442, "question_id": "h8JG5mHqefudY6rbVZwgeB", "question": "This airline is headquartered in which city?", "choices": ["george town", "malacca", "singapore", "kuala lumpur"], "correct_choice_idx": 3, "direct_answers": ["malaysia", "malaysia", "kuala lumpur", "kuala lumpur", "kuala lumpur", "selangor", "kuala lumpur", "kuala lumpur", "malaysia", "kuala lumpur"], "difficult_direct_answer": false, "rationales": ["The word \"malaysia\" is clearly written on the side of the plane which likely corresponds to the airline. the malaysia airline is located in answer a.", "The word on the plain is the country and a is the capital city.", "The airline is malaysian."], "image": "train2014/COCO_train2014_000000419442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236422, "question_id": "h8Wc3TP6bXdVnTPAWtBP4S", "question": "The dog in the image belongs to which breed?", "choices": ["shelties", "gray ghost", "poodle", "retriever"], "correct_choice_idx": 0, "direct_answers": ["australian shepherd", "sheep dog", "mixed breed", "shepherd dogs", "northern breed", "shelties", "shepard", "australian shepherd", "heeler", "australian shepard"], "difficult_direct_answer": true, "rationales": ["The coloring and the long hair of the dog indicates its breed.", "A dog with a long, thin snout and long hair is sleeping under a bed.", "The dog is a shelbie dog."], "image": "train2014/COCO_train2014_000000236422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355741, "question_id": "h8hTDsADEHoDWkzxavnBci", "question": "What will this man need to look at while using this device?", "choices": ["nothing", "keys", "mirror", "screen"], "correct_choice_idx": 3, "direct_answers": ["game screen", "monitor", "screen", "tv", "television", "television", "screen", "wall", "vision", "television"], "difficult_direct_answer": false, "rationales": ["He will need to look at the tv so he can see to play the game.", "In order to play the game, he will need to see what is displayed on the t.v.", "Wii games can take place on a television."], "image": "train2014/COCO_train2014_000000355741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158028, "question_id": "h8pwyUWb2nbZAq9t9niqoW", "question": "Why are the bananas lined up on the shelf?", "choices": ["to sell", "to eat", "to decorate", "to cook"], "correct_choice_idx": 0, "direct_answers": ["good presentation", "for sale", "aesthetic appeal", "for sale", "sale", "for sale", "selling", "for selling", "sales presentation", "to sell"], "difficult_direct_answer": false, "rationales": ["The bananas are in a display at a store.", "They are in a grocery store for purchase.", "Before anyone can eat these bananas, they have to purchase them. by looking, we can see these bananas are for sale."], "image": "train2014/COCO_train2014_000000158028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329903, "question_id": "h8vzj7JPqDTKdtMDuSn6xk", "question": "What are the small white objects below the stuffed animals?", "choices": ["erasers", "packing peanuts", "balls", "clips"], "correct_choice_idx": 1, "direct_answers": ["peanuts", "foam pieces", "packing peanuts", "foam", "styrofoam peanuts", "protection", "packing peanuts", "candy", "packing peanuts", "styrofoam"], "difficult_direct_answer": false, "rationales": ["These cushion items to protect them", "These are used to take up empty space normally for shipping", "The objects are known for providing some cushion."], "image": "val2014/COCO_val2014_000000329903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359701, "question_id": "h9PzqmK8MrWHGgPfYNTgWT", "question": "What number is on the red train?", "choices": ["nine", "six", "one", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "number 2", "two", "two", "two", "two", "two", "number 2", "two", "two"], "difficult_direct_answer": false, "rationales": ["The number is 2.", "The train is marked with two on it.", "You can see the number between the window and the door."], "image": "train2014/COCO_train2014_000000359701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145638, "question_id": "h9X795atoWJLEiRGoZg8Dy", "question": "What type of animal is the cartoon figure portrayed here based?", "choices": ["tiger", "pigeon", "dog", "owl"], "correct_choice_idx": 0, "direct_answers": ["tiger", "tiger", "tiger", "tiger", "tiger", "tiger", "tiger", "tiger", "tiger", "tiger"], "difficult_direct_answer": false, "rationales": ["A boy is holding a tigger toothbrush. tigger is from the show winnie the pooh and he is a tiger.", "The toothbrush is a tiger handle.", "The animal looks like a tiger."], "image": "train2014/COCO_train2014_000000145638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521964, "question_id": "h9eWRkTMiMcwzzHzxbGHdh", "question": "What is the brand Lifebuoy selling?", "choices": ["clothing", "shampoo", "soap", "backpacks"], "correct_choice_idx": 2, "direct_answers": ["soap", "herbal tea", "pineapple", "bananas", "soap", "soap", "herbs", "soap", "tea", "fruit"], "difficult_direct_answer": false, "rationales": ["Lifebuoy is selling bars of body soap.", "Lifebuoy is advertising soap.", "There is an image of a box of soap on the sign for the brand."], "image": "val2014/COCO_val2014_000000521964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6753, "question_id": "h9j3e9DMUpauDVyNUut8Qt", "question": "Which type of fruit is in the image?", "choices": ["watermelon", "banana", "strawberry", "cantaloupe"], "correct_choice_idx": 1, "direct_answers": ["tropical", "mangoes", "mango", "bananas", "banana", "bananas", "many many", "mangoes", "market fruit", "mangoes"], "difficult_direct_answer": false, "rationales": ["You see bananas on the display in the store", "Answer a is the most visually obvious answer, but other answers on the list are also acceptable.", "The fruit is the banana."], "image": "train2014/COCO_train2014_000000006753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372494, "question_id": "h9s9h7ufgJi9mgHimUWsKF", "question": "The people in the stands are supporters of which major league baseball franchise?", "choices": ["cardinals", "yankees", "giants", "mariners"], "correct_choice_idx": 2, "direct_answers": ["giants", "giants", "giants", "mlb", "ny giants", "giants", "giants", "giants", "giants", "sf giants"], "difficult_direct_answer": false, "rationales": ["They are rooting for san francisco's team.", "The people support the giants.", "On some of the clothing articles the colors of the giants and the word \"giants\" is visible. people wearing the color and name of a team on their clothes are likely supporters of the team."], "image": "val2014/COCO_val2014_000000372494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286785, "question_id": "h9y3xtu4E7EtguqPUjRgdL", "question": "What part of the animal is closest to the person?", "choices": ["back", "nose", "tail", "ear"], "correct_choice_idx": 1, "direct_answers": ["nose", "mouth", "nose", "nose", "snout", "nose", "snout", "head", "nose", "snout"], "difficult_direct_answer": false, "rationales": ["The zebra has stuck its head into the window of a car and its nose is almost touch the passenger.", "There is a zebra nose near to the person.", "The zebra has its head in the car"], "image": "val2014/COCO_val2014_000000286785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571906, "question_id": "h9z5sryxmiLSpNp49UYyC8", "question": "What are his pants made of?", "choices": ["silk", "denim", "leather", "microfiber"], "correct_choice_idx": 1, "direct_answers": ["denim", "denim", "denim", "denim", "cotton", "denim", "denim", "cotton", "denim", "denim"], "difficult_direct_answer": false, "rationales": ["This is also referred to as blue jean material.", "These are jeans that he is wearing.", "He is wearing jeans."], "image": "train2014/COCO_train2014_000000571906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573584, "question_id": "h9zCzsmwGWGvSU6qbEZ5kS", "question": "Which is the best bathroom fittings?", "choices": ["parryware", "cera", "hindware", "jaguar"], "correct_choice_idx": 3, "direct_answers": ["faucets", "granite", "faucet", "tile", "sink", "sink", "jaguar", "high quality", "tile", "sink"], "difficult_direct_answer": false, "rationales": ["The spots look like they are from a feline.", "The bathroom looks like a jungle.", "The best thing is jaguar."], "image": "train2014/COCO_train2014_000000573584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250192, "question_id": "hAD9Wh6D2eUxdHHPuYzWgX", "question": "What is this this elephant doing?", "choices": ["drinking", "painting", "playing soccer", "eating"], "correct_choice_idx": 2, "direct_answers": ["kicking ball", "walking", "playing soccer", "playing soccer", "hitting ball", "playing game", "walking", "training", "playing", "playing ball"], "difficult_direct_answer": false, "rationales": ["This elephant is not eating or drinking, nor is he painting. he is playing with a ball.", "The elephant can be seen in front of a soccer ball and their trunk appears to be in movement, as it it is using the trunk to move the ball.", "The elephant is kicking a soccer ball."], "image": "train2014/COCO_train2014_000000250192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523272, "question_id": "hALmn6TdftZ74Psnzf6LNC", "question": "What is the traffic light telling the cars to do?", "choices": ["turn", "stop", "yield", "go"], "correct_choice_idx": 1, "direct_answers": ["stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop", "stop"], "difficult_direct_answer": false, "rationales": ["The lights are red.", "The traffic light is red.", "The light says to stop."], "image": "train2014/COCO_train2014_000000523272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254714, "question_id": "hANSHCQ5ea6Q76bJvREEAK", "question": "Who was the other friend besides the butcher of the person whose name appears before the word market?", "choices": ["landscaper", "candlestick maker", "plumber", "gardener"], "correct_choice_idx": 1, "direct_answers": ["baker", "baker", "candlestick maker", "candlestick maker", "baker", "baker", "baker", "baker", "baker", "candlestick maker"], "difficult_direct_answer": false, "rationales": ["In the rhyme it is the one that builds the wax lighting.", "The board easily show the evident the person.", "A candlestick maker is there."], "image": "train2014/COCO_train2014_000000254714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344702, "question_id": "hAUMez7PibBhZnJCgrfa9T", "question": "What kind of organization is this entity?", "choices": ["government", "public", "private", "individual"], "correct_choice_idx": 0, "direct_answers": ["animal rescue", "pet shelter", "humane society", "humane society", "animal rescue", "humane society", "government", "humane society", "humane society", "animal rescue"], "difficult_direct_answer": false, "rationales": ["The humane society usually is operated by government entities.", "The word \"washington\" tells us that this is a government organization.", "It is an organization, but the website doesn't end in gov."], "image": "val2014/COCO_val2014_000000344702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37852, "question_id": "hAp55PJX8mceYX9mUcLPw5", "question": "How will he be able to tell what time it is?", "choices": ["keyboard", "sundial", "donut", "watch"], "correct_choice_idx": 3, "direct_answers": ["watch", "looking watch", "watch", "wristwatch", "watch", "his watch", "watch", "watch", "wrist watch", "use watch"], "difficult_direct_answer": false, "rationales": ["He'll watch the time.", "The man has a watch on his wrist.", "Watches allow this person to tell the time."], "image": "train2014/COCO_train2014_000000037852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87263, "question_id": "hB5coXzgyrRxTz95DPNZik", "question": "What is he about to do?", "choices": ["speak", "punish people", "find food", "eat dinner"], "correct_choice_idx": 0, "direct_answers": ["sing", "talk", "speak", "speak", "make speech", "lie", "deliver speech", "speech", "speak", "speak"], "difficult_direct_answer": false, "rationales": ["Given that he's holding a microphone, this is the most likely option.", "The man is holding a microphone which is used to project your voice so that others can hear what you have to say.", "The answer is really the only thing you can do with a microphone."], "image": "train2014/COCO_train2014_000000087263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381646, "question_id": "hB9grxPVo6zbERmC8Xa4b2", "question": "The train is passing under a safety structure that measures what on the train?", "choices": ["length", "height", "weight", "depth"], "correct_choice_idx": 1, "direct_answers": ["height", "height", "height", "speed", "speed", "speed", "speed", "height", "height", "speed"], "difficult_direct_answer": false, "rationales": ["The train's height is measured.", "The train can only fit if it's at a certain height. this will make sure it will fit across the track.", "The safety structure measures the height of the train because it passes right under it"], "image": "val2014/COCO_val2014_000000381646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365344, "question_id": "hBBnJKQQAwMCc5zE5mZj86", "question": "Why are the cones there?", "choices": ["vandalism", "randomly left", "to guide", "to challenge"], "correct_choice_idx": 3, "direct_answers": ["for guidance", "make lanes", "practicing tricks", "obstacle course", "to challenge", "slalom cones", "markers", "pathway", "direct", "obstacle course"], "difficult_direct_answer": true, "rationales": ["The cones are placed in order for the skateboarder to go between them as a challenge", "The cones are a challenge.", "The cones are there to challenge the man for a turn-off."], "image": "train2014/COCO_train2014_000000365344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407262, "question_id": "hBFyUYYDRYVg5cJQky9qzz", "question": "Who are the adults standing behind the children?", "choices": ["coaches", "teachers", "neighbors", "parents"], "correct_choice_idx": 3, "direct_answers": ["parents", "parents", "parents", "parents", "parents", "parents", "parents", "parents", "parents", "parents"], "difficult_direct_answer": false, "rationales": ["There is a man and a woman.", "They look like they are on a family holiday.", "The parents are watching the kids."], "image": "train2014/COCO_train2014_000000407262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434877, "question_id": "hCBhdgK5rrtuXHZXrjiqYH", "question": "What country is this location?", "choices": ["russia", "moldova", "ukraine", "belarus"], "correct_choice_idx": 0, "direct_answers": ["poland", "russia", "russia", "russia", "sweden", "england", "greece", "uk", "usa", "germany"], "difficult_direct_answer": false, "rationales": ["The country is russia.", "The train looks to be in russia.", "This location is russia since the language of the signs is in russian."], "image": "train2014/COCO_train2014_000000434877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85160, "question_id": "hCVLdnLz3Yzwgs857cEiLN", "question": "What is really odd about the phone the woman is talking on?", "choices": ["corded", "pay phone", "receiver shape", "color"], "correct_choice_idx": 0, "direct_answers": ["cord", "not cordless", "cord", "landline", "wired", "corded", "has cable", "corded", "has cord", "corded"], "difficult_direct_answer": false, "rationales": ["The man's phone is corded which is old fashioned.", "This is an old phone", "The woman's phone has a super long cord."], "image": "val2014/COCO_val2014_000000085160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225869, "question_id": "hCe8ci93C7M2tAakJGRXtm", "question": "What natural phenomenon assists this person?", "choices": ["hurricane", "tides", "rain", "eclipse"], "correct_choice_idx": 1, "direct_answers": ["moon", "wave", "waves", "waves", "waves", "waves", "waves", "tides", "waves", "moon"], "difficult_direct_answer": false, "rationales": ["The tides let this person ride a wave because if there are none he could not surf", "The phenomenon is the tide.", "Waves and tides help keep the person moving."], "image": "val2014/COCO_val2014_000000225869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451893, "question_id": "hCiKYs9hT7LJuBQqkn6wup", "question": "What kinds of trees are in the background?", "choices": ["evergreen", "tropical", "deciduous", "conifers"], "correct_choice_idx": 2, "direct_answers": ["maple", "oak trees", "green trees", "deciduous", "mature", "vacation place", "deciduous", "oak", "evergreen", "oak"], "difficult_direct_answer": false, "rationales": ["They shed leaves annually.", "The trees in these area are adapted to this condition.", "The trees are very bushy."], "image": "train2014/COCO_train2014_000000451893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567268, "question_id": "hCtJp9AarUzoekoyQhXqhs", "question": "What venue is the woman in?", "choices": ["restaurant", "home", "hotel room", "office"], "correct_choice_idx": 0, "direct_answers": ["restuarant", "resturant", "party", "pizza place", "pizza inn", "restaurant", "pizzeria", "home", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["The table decor looks like a restaurant. the size of the pizza and quality would be unlikely to replicate in a home setting.", "The person is eating the pizza.", "The woman is in a restaurant."], "image": "train2014/COCO_train2014_000000567268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407903, "question_id": "hCuwxerhhuR3fKw4q8pyD7", "question": "What does the girl walking into the water is waiting for what to come from directly behind her so she can stand up on the board from the water waiting for what the only thing that will get her on moving on the board?", "choices": ["wave", "sun", "her hands", "sand"], "correct_choice_idx": 0, "direct_answers": ["wave", "wave", "wave", "wave", "wave", "wave", "wave", "wave", "wave", "wave"], "difficult_direct_answer": false, "rationales": ["The water from the wave would push her board along so that she can balance on the board.", "The girl is behind the waves and is waiting for another wave to form.", "This is what gives her momentum."], "image": "val2014/COCO_val2014_000000407903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57810, "question_id": "hCv8N83AFjVKV4UTobiSfj", "question": "Which sleeper is sleeping in a more unconventional awkward position?", "choices": ["lady", "none", "smaller", "larger"], "correct_choice_idx": 2, "direct_answers": ["child", "little boy", "child", "kid", "smaller", "pajama child", "child", "toddler", "girl", "kid"], "difficult_direct_answer": false, "rationales": ["The child is in the bed head first in the covers which is not totally normal.", "The child isn't using a pillow and has their legs spread in a weird way.", "The person who is smallest is upside down."], "image": "train2014/COCO_train2014_000000057810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195149, "question_id": "hD3Tr7kQFJDjTm4T6f3UkB", "question": "What are these women looking at?", "choices": ["mirror", "car", "monitor screen", "rainbow"], "correct_choice_idx": 2, "direct_answers": ["television", "monitor screen", "television screen", "game", "game screen", "tv screen", "screen", "wii", "tv screen", "video game"], "difficult_direct_answer": true, "rationales": ["They are playing a video game.", "These women are watching their game on a screen.", "They are looking a screen while playing."], "image": "train2014/COCO_train2014_000000195149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361149, "question_id": "hDDgkdHAsAYfujmZYmo3VW", "question": "If you lifted up the brown thing on the ground where would it lead to?", "choices": ["nowhere", "home", "playpen", "sewer"], "correct_choice_idx": 3, "direct_answers": ["sewer", "top", "sewer", "sewer", "sewer system", "sewer", "sewers", "sewer", "sewer", "sewer"], "difficult_direct_answer": false, "rationales": ["There are man holes on the ground.", "This is a manhole cover.", "The manhole covers hide the entrance to the sewage disposal drains."], "image": "train2014/COCO_train2014_000000361149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326480, "question_id": "hDQfgVe4rRjjHy85nasfBK", "question": "Which person is the oldest?", "choices": ["right man", "middle woman", "left man", "back man"], "correct_choice_idx": 2, "direct_answers": ["with blackshirts", "man", "man", "man left", "left guy", "grey pants", "left man", "left one", "left man", "old man"], "difficult_direct_answer": false, "rationales": ["This person has white hair and looks older.", "The man with the grey hair is the oldest out of them.", "The man on the left has gray hair which would probably mean he is older than the other two."], "image": "train2014/COCO_train2014_000000326480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118051, "question_id": "hDWh8SB3PDsDnXi9Uwa9FP", "question": "Why is the partition's height extended?", "choices": ["shielding noise", "posting photos", "privacy", "special design"], "correct_choice_idx": 2, "direct_answers": ["privacy", "add privacy", "privacy", "privacy", "privacy", "for privacy", "for privacy", "privacy", "privacy", "for privacy"], "difficult_direct_answer": false, "rationales": ["The person made it look like it's covered.", "Without the extension, the walls would be so low as to allow others to see inside the cubicle, so it seems obvious that the extension was installed in order to increase the privacy within the space.", "It's up higher for privacy"], "image": "val2014/COCO_val2014_000000118051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51021, "question_id": "hDbd2ZBvq9YnTBGCSNygRG", "question": "What are these computers used for?", "choices": ["banking", "business", "gaming", "government"], "correct_choice_idx": 2, "direct_answers": ["laptop", "gaming", "gaming", "gaming", "gaming", "gaming", "laptop", "gaming", "gaming", "gaming"], "difficult_direct_answer": false, "rationales": ["The computers are for playing video games.", "The computers are for games.", "Computers are on a table with game controllers around. computers are used for gaming."], "image": "train2014/COCO_train2014_000000051021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445309, "question_id": "hEFucyEW9rkDDHssCGg7uT", "question": "Why is she so close to the car?", "choices": ["is friendly", "turning left", "is following", "is confused"], "correct_choice_idx": 1, "direct_answers": ["making turn", "turning left", "sharing space", "same way", "feels safe", "impulse", "stupid", "ignores safety", "crossing", "passing"], "difficult_direct_answer": true, "rationales": ["The woman wants to turn left with the car.", "She's turning.", "It's a process of elimination. if she were following she would be behind the car. the other 2 wrong options make no sense."], "image": "val2014/COCO_val2014_000000445309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351133, "question_id": "hEUy3WmuxsHanDb2KRWExe", "question": "How is the woman wearing black feeling?", "choices": ["angry", "depressed", "amused", "shocked"], "correct_choice_idx": 2, "direct_answers": ["good", "happy", "happy", "happy", "happy", "happy", "proud", "sassy", "happy", "amused"], "difficult_direct_answer": false, "rationales": ["She might also be happy, but it's not an option on this list. she's smiling wide though.", "With that smile and her hand on her hip, this woman looks happy with something funny going on around her.", "The woman is smiling and looks delighted."], "image": "val2014/COCO_val2014_000000351133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109482, "question_id": "hEmZaVtWppf5nCWV8A2jzS", "question": "What is behind the sofa?", "choices": ["mirror", "painting", "door", "bookcase"], "correct_choice_idx": 0, "direct_answers": ["mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["A huge decoration that is a mirror on the wall.", "Used to allow the owners to view themselves and see how they look.", "A big mirror."], "image": "val2014/COCO_val2014_000000109482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445334, "question_id": "hEpPAuVJKcWM8Jg5ad9YfU", "question": "What is the child about to bite?", "choices": ["apple", "mustard", "yogurt", "rice"], "correct_choice_idx": 0, "direct_answers": ["apple", "vegetable", "apple", "food", "apple", "food", "vegetable", "cucumber", "food", "food"], "difficult_direct_answer": false, "rationales": ["The child is holding a piece of fruit. there is no mustard, yogurt, or rice.", "Kids like the green fruit.", "He is holding a slice in his hand."], "image": "val2014/COCO_val2014_000000445334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201252, "question_id": "hEwXaaKBtwKdvRRg7ZvmE9", "question": "What is the woman with the purple umbrella holding to her face?", "choices": ["phone", "sandwich", "drink", "glasses"], "correct_choice_idx": 0, "direct_answers": ["phone", "block sun", "phone", "phone", "cell phone", "phone", "phone", "phone", "umbrella", "phone"], "difficult_direct_answer": false, "rationales": ["People will often hold their mobile phone up near their face when calling.", "The woman is talking onto a device.", "The woman is on her phone."], "image": "train2014/COCO_train2014_000000201252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174357, "question_id": "hEwcKNqaoj9S5FCHDkbsbD", "question": "This car will likely go where next?", "choices": ["grocery", "deli", "mall", "ocean"], "correct_choice_idx": 3, "direct_answers": ["ocean", "beach", "beach", "beach", "beach", "ocean", "beach", "ocean", "beach", "home"], "difficult_direct_answer": false, "rationales": ["The car goes to the ocean.", "There are surfboards in the back.", "With a process of elimination, no one can surf at a grocery, deli or a mall."], "image": "train2014/COCO_train2014_000000174357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255795, "question_id": "hFGHscm2Hczwj8fDmyJ4iS", "question": "Which direction do you go for the nearest soccer field?", "choices": ["go straight", "go back", "turn right", "turn left"], "correct_choice_idx": 3, "direct_answers": ["left", "left", "turn left", "west", "left", "west", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["A sign with a ball and an arrow is on a pole near the street.", "There is a picture on a sign with a soccer ball and an arrow pointing left.", "There is an arrow under the picture of a soccer ball"], "image": "val2014/COCO_val2014_000000255795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29080, "question_id": "hFKknysNqFiJzveHVh2Xws", "question": "What is the purpose of the rope?", "choices": ["clean boat", "holding boat", "decorative", "seat"], "correct_choice_idx": 1, "direct_answers": ["support", "anchor ship", "tie ship", "theft", "tether", "anchor", "holding boat", "anchor", "prevent movement", "anchor ship"], "difficult_direct_answer": false, "rationales": ["The rope will hold the boat to the doc so it does not drift away", "The rope is connected to the boat and tied around a cement block on the dock, indicating that it is holding it in place and preventing it from floating away.", "The rope is attached to a large structure and a dock, indicating a boat is being held."], "image": "val2014/COCO_val2014_000000029080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337984, "question_id": "hFZ6MdXf9jKpWgQNHbXADQ", "question": "What is found in the room?", "choices": ["car", "sink", "hammer", "snake"], "correct_choice_idx": 1, "direct_answers": ["sink", "sink", "sinks", "two sinks", "sinks", "sink", "washing room", "sink", "sinks", "food"], "difficult_direct_answer": false, "rationales": ["There are two of them with faucets", "It's one of the first things we see. and you'll never find a snake or a car in a kitchen and probably won't find a hammer there either.", "A big sink is found inside of this room."], "image": "val2014/COCO_val2014_000000337984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139390, "question_id": "hFgSaZy4F6Bjet3snJ5xrv", "question": "What does the bus have on board?", "choices": ["tickets", "wi-fi", "restrooms", "outlets"], "correct_choice_idx": 1, "direct_answers": ["passengers wifi", "wi-fi", "wifi", "tourists", "people", "wifi", "tourists", "wifi", "wifi people", "passengers"], "difficult_direct_answer": false, "rationales": ["There is writing on the side of the bus implicitly stating that there is wifi on the bus.", "The bus has wi-fi available.", "It has a sign on the window stating that there is wifi available in the bus."], "image": "train2014/COCO_train2014_000000139390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572226, "question_id": "hFhUTKSFbtxxFc2w3R3ZqJ", "question": "Why are there stalls with products outside?", "choices": ["for homeless", "to decorate", "to sell", "for fundraising"], "correct_choice_idx": 2, "direct_answers": ["selling", "street vendors", "selling products", "to sell", "selling products", "marketplace", "toys", "market", "market street", "outdoor fair"], "difficult_direct_answer": true, "rationales": ["The stands are set up in a public place with merchandise on display.", "The stalls are for sale.", "These are street vendors that are going to try to make money."], "image": "val2014/COCO_val2014_000000572226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32645, "question_id": "hG7qksMQDybD6LyFWYpXPz", "question": "What kind of animal is shown?", "choices": ["wild", "caged", "stuffed", "domestic"], "correct_choice_idx": 2, "direct_answers": ["toy bear", "bear", "bear", "teddy bear", "bear", "stuffed", "toy bear", "bear", "bear", "bear"], "difficult_direct_answer": false, "rationales": ["It is a toy bear", "It's a stuffed teddy bear.", "The animal shown is a stuffed bear or dinosaur."], "image": "val2014/COCO_val2014_000000032645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143711, "question_id": "hG8GDAPyTHPzCRYhQRpyiu", "question": "What is the reddish netting for on the ground?", "choices": ["goat path", "artistic design", "prevent snowdrift", "rabbit pen"], "correct_choice_idx": 2, "direct_answers": ["lift", "catch people", "safety", "guard", "snow fence", "prevent snowdrift", "snow fence", "division", "fence", "snow fence"], "difficult_direct_answer": false, "rationales": ["The netting is used to prevent snow.", "The item prevents snow drift.", "The reddish netting keeps snow from moving too far out."], "image": "train2014/COCO_train2014_000000143711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2415, "question_id": "hGBnsZbze5pFVsc26wJtqe", "question": "What type of hat is the man in red wearing?", "choices": ["derby", "beanie", "fedora", "newsboy cap"], "correct_choice_idx": 1, "direct_answers": ["knit", "ski cap", "beanie", "ski cap", "beanie", "beanie", "beanie", "beanie", "beanie", "beanie"], "difficult_direct_answer": false, "rationales": ["Newsboy caps, fedoras, and derby hats, all have some sort of brim.", "The hat is a beanie.", "The man is wearing a small hat."], "image": "train2014/COCO_train2014_000000002415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252786, "question_id": "hGX67eShj8x4XdLjZNnErB", "question": "What is she doing?", "choices": ["stealing luggage", "returning luggage", "posing", "exercising"], "correct_choice_idx": 2, "direct_answers": ["travelling", "waiting", "waiting pickup", "posing", "waiting", "waiting", "posing", "posing", "posing", "traveling"], "difficult_direct_answer": false, "rationales": ["She is standing still and smiling for the camera.", "This woman's stance; one leg over the other, smiling with one arm gripping and one on her hip, suggest a pose for the camera.", "This is not a natural way for a person to stand, her position is intentional and is done to look flattering in the photo being taken."], "image": "train2014/COCO_train2014_000000252786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80543, "question_id": "hGbeSqs6So5AEyXns4vYg6", "question": "Why is the man in black crouched?", "choices": ["to dive", "to jump", "gain speed", "to sit"], "correct_choice_idx": 2, "direct_answers": ["balance", "surfing", "surfing", "gain speed", "balance", "surfing", "balance", "riding surfboard", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["Crouching lessens the wind resistance.", "The man wants to propel forward.", "The man wants speed."], "image": "train2014/COCO_train2014_000000080543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129544, "question_id": "hGjW9wKwhWDQuwAj9qCfxe", "question": "In which metropolitan area is this clock installed?", "choices": ["boston", "san francisco", "new york", "london"], "correct_choice_idx": 1, "direct_answers": ["san francisco", "san francisco", "san francisco", "san francisco", "london", "london", "london", "london", "london", "san francisco"], "difficult_direct_answer": false, "rationales": ["There are two cities listed on the clock, but lloyds of london is the name of the company that insures the clock and not the location of the clock.", "In san francisco.", "This clock is in san francisco."], "image": "val2014/COCO_val2014_000000129544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488676, "question_id": "hGnydQ3ZRdudGucb5ZC6fp", "question": "Which player is dressed the warmest?", "choices": ["grey shoes", "blue shorts", "red shirt", "black pants"], "correct_choice_idx": 1, "direct_answers": ["farthest", "girl", "left person", "blue shorts", "top right", "girl", "right orange", "coach", "first man", "hat boy"], "difficult_direct_answer": true, "rationales": ["The player with long pants has the blue shorts.", "The player in blue shorts is warm.", "The player who has the most clothing is in blue shorts."], "image": "train2014/COCO_train2014_000000488676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564657, "question_id": "hGrmA4huCxzva4fqLi442Z", "question": "Why are the small walls setup between the urinals?", "choices": ["to clean", "for maintenance", "for decoration", "for privacy"], "correct_choice_idx": 3, "direct_answers": ["privacy", "urinals", "urinals", "privacy", "privacy", "urinals", "privacy", "privacy", "privacy", "for privacy"], "difficult_direct_answer": false, "rationales": ["The small walls are setup for the privacy between urinals.", "They are so other people can't watch", "As much privacy as possible is appreciated in a public urinal. it's not the most pleasant place to be."], "image": "val2014/COCO_val2014_000000564657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348269, "question_id": "hGryMQ2S7xFz6Y7TfiMfLs", "question": "Why do both ladies look identical?", "choices": ["robot", "twins", "mirror", "clone"], "correct_choice_idx": 2, "direct_answers": ["mirror image", "mirror", "mirror image", "mirrored image", "mirror", "mirror", "in mirror", "mirror", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["The woman in the mirror is just a reflection.", "The woman is sitting in front of a reflective lens.", "This is a reflection of one lady"], "image": "train2014/COCO_train2014_000000348269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292730, "question_id": "hGvPaAVqVnKqR55SZUKGZ3", "question": "Why is there a candle in the woman's dessert?", "choices": ["to trick", "to celebrate", "as joke", "for light"], "correct_choice_idx": 1, "direct_answers": ["to celebrate", "birthday", "her birthday", "celebrate birthday", "birthday", "birthday", "birthday", "celebration", "birthday", "for birthday"], "difficult_direct_answer": false, "rationales": ["It is most likely for her birthday.", "This is her birthday", "Candles placed in desserts are a traditional way to commemorate a special day or achievement."], "image": "train2014/COCO_train2014_000000292730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127298, "question_id": "hH8u6WM9VGDYWGbySiuDhv", "question": "How did the kite flyer get to the park?", "choices": ["bus", "bicycle", "car", "wagon"], "correct_choice_idx": 1, "direct_answers": ["bike", "using bicycle", "bicycle", "bike", "by bicycle", "bicycled", "bicycle", "bicycle", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["The person used the bike.", "Odds are very good that the kite flyer is a boy, and that he rode to this park on his bicycle, seen on the left.", "They rode their bike."], "image": "train2014/COCO_train2014_000000127298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178052, "question_id": "hHJbq7Mdf6uU4wZTwTVSRM", "question": "What does Hyundai do to this game?", "choices": ["provides trophy", "provides transportation", "sponsors", "organizes game"], "correct_choice_idx": 2, "direct_answers": ["sponsor it", "sponsor", "sponsor", "sponsor", "sponsors it", "sponsor", "sponsor", "sponsor", "sponsors", "sponsor"], "difficult_direct_answer": false, "rationales": ["Hyundai sponsored the game and so their name is advertised on the sign.", "Hyundai is a sponsor.", "Hyundai is a sponsor for the game."], "image": "train2014/COCO_train2014_000000178052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295317, "question_id": "hHPBddZoJBDYerXcJrrFKB", "question": "What is probably capable of the most storage of data?", "choices": ["black device", "bottle", "white paper", "white device"], "correct_choice_idx": 3, "direct_answers": ["desktop computer", "desktop", "30mb", "laptop", "laptop", "laptop", "laptop", "internet cloud", "white device", "laptop"], "difficult_direct_answer": false, "rationales": ["The white device has the largest memory cache.", "The white device is larger.", "The white device has more."], "image": "train2014/COCO_train2014_000000295317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332654, "question_id": "hHV62oqxGUPocoNEJ5XyYz", "question": "Who usually use this object?", "choices": ["pizza delivery", "teacher", "policeman", "firefighter"], "correct_choice_idx": 3, "direct_answers": ["fireman", "fire fighters", "firemen", "firefighter", "firefighter", "firefighter", "firefighters", "dogs", "firemen", "firefighters"], "difficult_direct_answer": false, "rationales": ["This is where hoses are hooked up to get water for fires", "A hydrant provides water in case of a fire.", "This is a hydrant."], "image": "val2014/COCO_val2014_000000332654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175491, "question_id": "hHpY849S73F3sJPzTHxtZV", "question": "Where does the cat rest?", "choices": ["cat house", "couch", "mattress", "dog house"], "correct_choice_idx": 1, "direct_answers": ["on lap", "legs", "sofa", "on purse", "couch", "purse", "owner's lap", "couch", "couch", "lap"], "difficult_direct_answer": false, "rationales": ["The cat is on the couch.", "There are cushions and an arm", "A cat is laying next to a purse on a striped piece of furniture. couches are common pieces of furniture found in homes."], "image": "train2014/COCO_train2014_000000175491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446864, "question_id": "hHsDsXwyfhDF8E8SNYxrSa", "question": "What is in the bowl on the left?", "choices": ["strawberries", "lemons", "grapes", "apples"], "correct_choice_idx": 0, "direct_answers": ["strawberries", "strawberries", "strawberries", "strawberries", "strawberry's", "strawberries", "strawberry", "strawberries", "strawberry", "strawberries"], "difficult_direct_answer": false, "rationales": ["The bowl has berries.", "The bowl on the left is full of sliced strawberies.", "The color and the size tell us the answer."], "image": "train2014/COCO_train2014_000000446864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407570, "question_id": "hHsmSwYq7YMmord6JkkDQA", "question": "What letter was altered by someone on this sign?", "choices": ["y covered", "l added", "", "none"], "correct_choice_idx": 0, "direct_answers": ["way", "r", "way", "y covered", "r", "letter y", "letter y", "y", "letter y", "way"], "difficult_direct_answer": false, "rationales": ["It should say all-way", "The word \"war\" was originally \"way.\".", "The y is covered."], "image": "val2014/COCO_val2014_000000407570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172057, "question_id": "hJ7gFMXr2ZtHSQYmsXDHvR", "question": "What event is being held here?", "choices": ["wedding", "tractor pull", "expo", "jail break"], "correct_choice_idx": 2, "direct_answers": ["expo", "exposition", "expo", "exposition", "wedding", "wedding", "luncheon", "conference", "wedding", "celebration"], "difficult_direct_answer": false, "rationales": ["An expo is the event going on.", "There is a sign with the expo name in it, so most likely it's a small expo out and inside.", "Some people are wearing different color dresses."], "image": "val2014/COCO_val2014_000000172057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485710, "question_id": "hJCMEJVTTWwfESXPRS6j9R", "question": "What is the skier holding in each hand?", "choices": ["tubes", "canes", "sticks", "poles"], "correct_choice_idx": 3, "direct_answers": ["ski pole", "ski poles", "ski poles", "ski poles", "ski pole", "ski pole", "poles", "pole", "ski poles", "ski poles"], "difficult_direct_answer": false, "rationales": ["A man is skiing with long, thin objects in his hands.", "The man is holding some sticks.", "There are poles in both hands."], "image": "val2014/COCO_val2014_000000485710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250075, "question_id": "hJDvEkqvWgNu7hkNwo7fZk", "question": "What item on the counter has melted?", "choices": ["flower", "candle", "tray", "soap"], "correct_choice_idx": 1, "direct_answers": ["candle", "candle", "candle", "candle", "candle", "candle", "candle", "candle", "candle", "candle"], "difficult_direct_answer": false, "rationales": ["There is nothing left in the candlestick so the candle must have melted.", "It is made of wax and has a wick that gets lit", "The item is a candle."], "image": "train2014/COCO_train2014_000000250075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476268, "question_id": "hJThH7ZSwcDMqXrTFPvCXD", "question": "These parasols are made up of what?", "choices": ["garden", "husk", "cloth", "bamboo"], "correct_choice_idx": 3, "direct_answers": ["umbrellas", "bamboo", "straw", "twigs", "dried grass", "straw", "bamboo", "tree bark", "straw", "thatch"], "difficult_direct_answer": false, "rationales": ["The parasols are made of bamboo.", "The parasols are bamboo.", "The parasols have bamboo coverings."], "image": "train2014/COCO_train2014_000000476268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568143, "question_id": "hJUkBWkzTX6yPFPWFPSz89", "question": "What type of transportation is this?", "choices": ["rail", "road", "water", "air"], "correct_choice_idx": 1, "direct_answers": ["van", "van", "van", "truck", "road", "truck", "van", "van", "van", "van"], "difficult_direct_answer": false, "rationales": ["The vehicle is a van, not a train, airplane, or boat.", "There are wheels on the vehicle and the vehicle has the overall appearance of one that travels that way.", "This is a road van."], "image": "train2014/COCO_train2014_000000568143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177915, "question_id": "hJV9o5whQVhjn8z2EUkepa", "question": "What is separate from the reality being captured with a camera?", "choices": ["date", "names", "advertisement", "racket text"], "correct_choice_idx": 0, "direct_answers": ["human", "city", "fence", "fiction", "date", "nothing", "timestamp", "timestamp", "no clue", "discord"], "difficult_direct_answer": true, "rationales": ["The date is in the corner.", "The time/date stamp is not put on by the camera.", "The date is superimposed onto the camera photo."], "image": "train2014/COCO_train2014_000000177915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538006, "question_id": "hJZRApqqoozqLwTKp4YGbx", "question": "Where are the people?", "choices": ["mall", "museum", "ballpark", "garage"], "correct_choice_idx": 1, "direct_answers": ["behind glass", "museum", "outside glass", "science museum", "museum", "on pathway", "museum", "museum", "in museum", "museum"], "difficult_direct_answer": false, "rationales": ["The animals are being shown at a museum.", "The people are at a museum of natural history.", "They're at a museum."], "image": "train2014/COCO_train2014_000000538006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345042, "question_id": "hJiB6a4G6822p6fqM9i6Uk", "question": "What type of information is printed on the boat?", "choices": ["regulatory", "name", "brand", "warning"], "correct_choice_idx": 1, "direct_answers": ["helen j", "name", "name", "helen j", "helen", "name", "name", "name", "name", "name"], "difficult_direct_answer": false, "rationales": ["The name of boats is printed on the side of the boat.", "A boat has a name on the front of it.", "A common name is printed on the back of a boat."], "image": "train2014/COCO_train2014_000000345042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526801, "question_id": "hJmQDwzCBezmKwJxHcftet", "question": "How many computers are there?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "two", "two", "three", "three", "two", "three", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3 located on the desk in sight.", "There are three monitors.", "There are 3 computers."], "image": "val2014/COCO_val2014_000000526801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522491, "question_id": "hK2TA5B9fBMz5mESkHCefS", "question": "What does this motorcycle have attached to its right side?", "choices": ["trunk", "wagon", "cart", "carriage"], "correct_choice_idx": 3, "direct_answers": ["sidecar", "sidecar", "sidecar", "cart", "sidecar", "side car", "carriage", "side car", "sidecar", "board"], "difficult_direct_answer": false, "rationales": ["This is so another person can ride beside him", "A baby carriage is strapped to the motorcycle.", "The motorcycle has a carriage on it."], "image": "train2014/COCO_train2014_000000522491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433705, "question_id": "hK9HFydAuCbT5YRz4jiMPG", "question": "What is the large vegetable on the far left?", "choices": ["yam", "cabbage", "lettuce", "pumpkin"], "correct_choice_idx": 1, "direct_answers": ["lettuce", "cabbage", "cabbage", "cabbage", "cabbage", "lettuce", "cabbage", "cabbage", "lettuce", "cabbage"], "difficult_direct_answer": false, "rationales": ["The big vegetable is a cabbage because it is green and has leaves like a cabbage does", "The veggie is a cabbage.", "There is a head of lettuce on the far left."], "image": "val2014/COCO_val2014_000000433705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83766, "question_id": "hKGLyWjCDEWB9BcywKAPV9", "question": "Where is this scene taking place?", "choices": ["airport", "mall", "dmv", "school"], "correct_choice_idx": 0, "direct_answers": ["airport", "airport", "airport", "airport", "airport", "airport", "airport", "airport", "airport", "airport"], "difficult_direct_answer": false, "rationales": ["Luggage is piled up in a large building with people all around. airports are large places with lots of people and luggage.", "The scene is at an airport.", "There are people with luggage."], "image": "val2014/COCO_val2014_000000083766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7650, "question_id": "hKLaeBGTco5smXeggcTy9Y", "question": "Where are persons carrying the white items going?", "choices": ["pool hall", "swimming pool", "ocean", "bar"], "correct_choice_idx": 2, "direct_answers": ["beach", "ocean", "ocean", "surfboard", "beach", "ocean", "beach", "beach", "beach", "to beach"], "difficult_direct_answer": false, "rationales": ["The people want to surf in the water.", "This is the most obvious answer given that those are surfboards.", "Those are surf boards which makes the answer obvious."], "image": "train2014/COCO_train2014_000000007650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544933, "question_id": "hKLzbxhaKdJGK8dQYeaFLr", "question": "What will she use the brush for?", "choices": ["brush horse", "brush hair", "sweep ground", "clean shoes"], "correct_choice_idx": 0, "direct_answers": ["horse", "curry horse", "brush horse", "horse", "horse's hair", "brush horse", "grooming horse", "horse", "horse", "brush horse"], "difficult_direct_answer": false, "rationales": ["There is a horse in the background.", "A woman is holding a large brush and is wearing riding boots and pants. domesticated horses need to be brushed.", "She wants to brush the horse."], "image": "train2014/COCO_train2014_000000544933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8314, "question_id": "hKMLWXtBhhuFJEgwLaYxyK", "question": "Which direction will this person most likely go next?", "choices": ["down slop", "skyward", "same elevation", "higher"], "correct_choice_idx": 0, "direct_answers": ["mountain", "upwards", "east", "down", "forward", "down", "forward", "north", "down slop", "downhill"], "difficult_direct_answer": false, "rationales": ["The direction is down.", "The person is at the top of a snowy mountain and the only way to go now is down hill.", "He's on a snow board"], "image": "train2014/COCO_train2014_000000008314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519088, "question_id": "hKQhsGngLvrBc9HaF6KeR8", "question": "What is being gathered by this vehicle?", "choices": ["paper", "children", "ice cream", "garbage"], "correct_choice_idx": 3, "direct_answers": ["garbage", "entertaining", "garbage", "trash", "trash", "trash", "trash", "garbage", "waste", "trash"], "difficult_direct_answer": false, "rationales": ["A man is standing at the back of a garbage truck on the street.", "This is a trash truck.", "The vehicle has trash in the back and a trash can has just been emptied."], "image": "val2014/COCO_val2014_000000519088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332397, "question_id": "hKaPoFEnKYJTV64fVgPv2V", "question": "Why does the girl have a rope in her hand?", "choices": ["to tie", "to pull", "to swing", "to braid"], "correct_choice_idx": 1, "direct_answers": ["pulling sled", "to pull", "pulling child", "pulling sled", "pull child", "dragging sled", "pulling sister", "pull sled", "pulling sled", "tugging sled"], "difficult_direct_answer": false, "rationales": ["She is taking a little kid on a sled ride", "The girl is using the rope to drag the blue sled.", "It is attached to the sled her younger sibling is riding in."], "image": "train2014/COCO_train2014_000000332397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120643, "question_id": "hKjwgDV5wQiyz2qCURGHok", "question": "He is using the skin as a what?", "choices": ["bowl", "pot", "napkin", "fork"], "correct_choice_idx": 0, "direct_answers": ["plate", "bowl", "bowl", "bowl", "bun", "bowl", "container", "bowl", "plate", "bowl"], "difficult_direct_answer": false, "rationales": ["The man is scooping out of the shell.", "The banana peel is holding the food like a serving dish would.", "The banana peel is stuffed with food."], "image": "train2014/COCO_train2014_000000120643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115927, "question_id": "hKkQNW5ErBQtcudZvjF3vX", "question": "What type of area is this?", "choices": ["residential", "commercial", "rural", "tropical"], "correct_choice_idx": 1, "direct_answers": ["commercial", "urban", "food court", "urban", "downtown", "market", "commercial", "market", "market", "urban"], "difficult_direct_answer": false, "rationales": ["The area is commercial.", "This is an area with lots of buildings and shops.", "The area contains modern building structures where most are used for commercial activities."], "image": "val2014/COCO_val2014_000000115927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236068, "question_id": "hKy2aT6mbcjkAYM6Pn3Bgv", "question": "What breed of Firetruck related dog is shown here?", "choices": ["mutt", "dalmatian", "doberman", "collie"], "correct_choice_idx": 1, "direct_answers": ["dalmatian", "dalmatian", "dalmatian", "dalmatian", "dalmation", "dalmatian", "dalmatian", "dalmatian", "dalmation", "dalmatian"], "difficult_direct_answer": false, "rationales": ["The white coat with black spots on it identify this stuffed animal as a dalmatian.", "It has a lot of spots on it", "This type of dog is associated with firehouses due to the fact that they are usually deaf and the sirens do not bother them."], "image": "val2014/COCO_val2014_000000236068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524059, "question_id": "hL9uHQYE2qJVPgqjqmELum", "question": "What is the woman doing to the elephant?", "choices": ["feeding it", "hitting it", "patting it", "brushing it"], "correct_choice_idx": 0, "direct_answers": ["watching", "feeding it", "feeding", "playing", "feeding", "feeding", "feeding it", "feeding it", "feeding", "feeding"], "difficult_direct_answer": false, "rationales": ["The woman is feeding the pet elephant.", "The woman has food in her hands and is reach towards the elephant. this action would be consistent if she were intending to feed it.", "The woman has food in her hand. the elephant is looking toward her."], "image": "train2014/COCO_train2014_000000524059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347793, "question_id": "hLcR3bTMosDJAbAZsKGbZp", "question": "During which time of the year are these trains operating?", "choices": ["winter", "fall", "summer", "spring"], "correct_choice_idx": 2, "direct_answers": ["all year", "summertime", "summer", "regularly", "summer", "summer", "summer", "summer", "spring", "day"], "difficult_direct_answer": false, "rationales": ["It doesn't look cold.", "It looks like a nice summer day.", "The trees are full and vibrant."], "image": "train2014/COCO_train2014_000000347793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206082, "question_id": "hLqLHMZEicPPDgVjjdukqk", "question": "What is the top word on the box?", "choices": ["caution", "waste", "lift", "fragile"], "correct_choice_idx": 3, "direct_answers": ["this", "fragile", "fragile", "fragile", "fragile", "fragile", "fragile", "paper", "not clear", "fragile"], "difficult_direct_answer": false, "rationales": ["The letters are plain to see and the first one is f", "A word on the side of the box appears above all other words.", "This is a moving box that says dish pack on it. dishes are breakable."], "image": "train2014/COCO_train2014_000000206082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8187, "question_id": "hM8Xm3t5PfRUc6jua4PLc8", "question": "Why has the cyclist covered his head?", "choices": ["safety", "fashion", "warmth", "religion"], "correct_choice_idx": 0, "direct_answers": ["protection", "noisy", "protection", "safety", "helmet", "safety", "safety", "safety", "riding bicycle", "helmet"], "difficult_direct_answer": false, "rationales": ["Riders must wear a head covering in case they fall and hit their heads on a rock.", "A person is on a bike with a helmet on.", "The cyclist needs to keep his head safe."], "image": "train2014/COCO_train2014_000000008187.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407625, "question_id": "hMBFLeepHPGLmFKDk7ZB7i", "question": "What is probably in his hair?", "choices": ["tiara", "rubber band", "just water", "gel"], "correct_choice_idx": 3, "direct_answers": ["gel", "gel", "gel", "gel", "gel", "gel", "gel", "gel", "gel", "gel"], "difficult_direct_answer": false, "rationales": ["He has product in his hair to hold it up.", "His hair is sleek and wet looking.", "The man has his hair slicked back."], "image": "train2014/COCO_train2014_000000407625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330562, "question_id": "hMMZe4fcXMYYryqe6Xocga", "question": "What material would the plates be made of?", "choices": ["ceramic", "wood", "iron", "carpet"], "correct_choice_idx": 0, "direct_answers": ["ceramic", "ceramic", "ceramic", "ceramic", "ceramic", "china", "ceramic", "ceramic", "porcelain", "clay"], "difficult_direct_answer": false, "rationales": ["The plates are hard and shiny.", "Commonly dish plates are made of a hard ceramic type of material.", "The material is ceramic."], "image": "train2014/COCO_train2014_000000330562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207454, "question_id": "hMV3jc3jb3TUubuQdDqcEs", "question": "What is the woman putting on the sauce?", "choices": ["mushrooms", "shrimp", "cheese", "onion"], "correct_choice_idx": 0, "direct_answers": ["mushrooms", "mushroom", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "daal", "mushrooms", "mushrooms"], "difficult_direct_answer": false, "rationales": ["The woman adds mushrooms.", "The woman is adding mushrooms.", "The woman has a bowl of them in her hand and putting them on the pizza."], "image": "train2014/COCO_train2014_000000207454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187801, "question_id": "hMqEW2PZs3SmRT2oeFxVUB", "question": "Which side of the road would this bus drive on in this country?", "choices": ["special lane", "middle", "left", "right"], "correct_choice_idx": 2, "direct_answers": ["left", "left", "right", "left", "left", "left", "right", "right", "left", "left"], "difficult_direct_answer": false, "rationales": ["This is a bus in london", "The bus is likely in england where they drive on the left side of the road.", "In england people drive on the left side."], "image": "train2014/COCO_train2014_000000187801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11107, "question_id": "hMzSzFoCwtJRno2usSBYje", "question": "What sound would come from the top blurred part of the photo?", "choices": ["pet noises", "car sounds", "classical music", "cheering"], "correct_choice_idx": 3, "direct_answers": ["cheering", "cheers", "cheering", "celebratory", "cheer", "cheering", "cheers", "cheers", "applause", "cheer"], "difficult_direct_answer": false, "rationales": ["The crowd does this when they're happy.", "The blurred area is the dugouts and the stands for the fans. the team in the dugout and the fans would be cheering for their team.", "This would come from the specatators."], "image": "train2014/COCO_train2014_000000011107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556120, "question_id": "hN2P7RvSPsjHpg9z5WMruw", "question": "What is someone who collects huge amounts of rubbish called?", "choices": ["hoarder", "butler", "collector", "miscreant"], "correct_choice_idx": 0, "direct_answers": ["garbage man", "trash collecter", "garbage collector", "luggage", "hoarder", "pack rat", "hoarder", "horder", "hoarder", "hoarder"], "difficult_direct_answer": false, "rationales": ["This group of people can't bring themselves to throw anything out.", "A large pile of junk is gathered in the corner of a room.", "A hoarder collects trash."], "image": "val2014/COCO_val2014_000000556120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314126, "question_id": "hNCVbyd9uGeoGaj4q43kPn", "question": "What is the black rectangular board used for?", "choices": ["seat", "door", "table", "cover"], "correct_choice_idx": 3, "direct_answers": ["protection", "cover tire", "cover", "cover", "storage", "storage cover", "trunk cover", "clothes", "seat", "cover"], "difficult_direct_answer": false, "rationales": ["The board is a cover.", "The black rectangular board is used to cover the storage compartment.", "There is a cover in the trunk."], "image": "train2014/COCO_train2014_000000314126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317683, "question_id": "hNDHYR34LsAdsiYVkosvm4", "question": "What is the name of the hat located on the corner of the table?", "choices": ["trilby", "derby", "fedora", "tricorne"], "correct_choice_idx": 3, "direct_answers": ["fedora", "tricone", "pirate", "pirate hat", "derby hat", "british hat", "tricorne", "tricorne", "tricorne", "cowboy hat"], "difficult_direct_answer": false, "rationales": ["The hat is shown with three corners and is commonly known to be called answer a in reference to this shape.", "This hat has 3 points to it.", "That's what that type of hat is called."], "image": "val2014/COCO_val2014_000000317683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536, "question_id": "hNFCaSoh8aLSS5SLfC7Rnr", "question": "How are the cellphones receiving reception?", "choices": ["wire", "magnetic waves", "antennae", "radar"], "correct_choice_idx": 2, "direct_answers": ["antennae", "wireless signals", "antennas", "antennae", "two", "antennae", "antenna", "antenna", "satellite", "antennas"], "difficult_direct_answer": false, "rationales": ["There is a stick coming out of the phone.", "These are older cell phones that have an antenna on them.", "There is a thin stick extended from one of the phones"], "image": "val2014/COCO_val2014_000000000536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160836, "question_id": "hNGsu9SqsAaAosom3uDbae", "question": "At what state of doneness is this pizza shown?", "choices": ["overdone", "raw", "perfectly done", "underdone"], "correct_choice_idx": 0, "direct_answers": ["well well", "well done", "burnt", "burnt", "finished", "illinois", "illinois", "well done", "overdone", "well done"], "difficult_direct_answer": false, "rationales": ["The pizza is overbaked.", "It's overdone.", "The black area on the crust and throughout the pizza indicate that it has been burnt. in order for food do you burn it would've had to of been in the oven cooking for an extended period of time."], "image": "val2014/COCO_val2014_000000160836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199634, "question_id": "hNb6yRfMvQpCuXGqtxkhfD", "question": "What is the large tube coming down from the ceiling for?", "choices": ["water pipe", "cooling unit", "waste carrier", "ventilation"], "correct_choice_idx": 3, "direct_answers": ["vent", "cooking exhaust", "ventilation", "ventilation", "vent", "vent", "vent", "helping", "vent", "vent"], "difficult_direct_answer": false, "rationales": ["The tube is directly over a cooking place, which may cause smoke to generate when cooking.", "The heat, smoke, and steam from the stove is ventilated through that large tube.", "The hood above the stove is to take smoke out."], "image": "val2014/COCO_val2014_000000199634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375535, "question_id": "hNqZB6ndyVhfeVPogtS7mk", "question": "What is the use of the following traffic cones?", "choices": ["block road", "stop vehicle", "decoration", "traffic redirection"], "correct_choice_idx": 3, "direct_answers": ["indoor hazards", "cordon off", "guide traffic", "traffic redirection", "traffic control", "don't cross", "barrier", "orderly line", "route designation", "prevent crossing"], "difficult_direct_answer": true, "rationales": ["The cones block traffic.", "There are still vehicles moving, so they are not stopped or blocked, but it forces them to move down a different route.", "They are directing traffic."], "image": "train2014/COCO_train2014_000000375535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249151, "question_id": "hNrYWuydrxQwPTj6hzoVaq", "question": "What are the two touching?", "choices": ["brownies", "cake", "pie", "muffins"], "correct_choice_idx": 1, "direct_answers": ["cake spatula", "knife", "wedding knife", "cake", "cake", "cake", "cake knife", "knife", "cake knife", "cake"], "difficult_direct_answer": false, "rationales": ["They are cutting a food item at the same time while dressed in bride and groom attire in a room filled with trays of cupcakes.", "They are holding a knife to cut the cake together.", "They are doing the ceremonial slicing at their reception"], "image": "train2014/COCO_train2014_000000249151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225923, "question_id": "hNrj7aBNLh3888Pj6g49DL", "question": "What is the stuff inside the white containers used for?", "choices": ["bagels", "donuts", "tea", "coffee"], "correct_choice_idx": 0, "direct_answers": ["spread", "coffee", "lightening cream", "spreading", "spread", "sauce", "bagel shmear", "donuts", "sweet", "bagels"], "difficult_direct_answer": true, "rationales": ["The stuff in the containers is likely cream cheese to put on the bagels.", "It is cream cheese which is a classic topping", "It is most likely cream cheese to be spread on the wheat products that are first boiled."], "image": "train2014/COCO_train2014_000000225923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137028, "question_id": "hPFNMkRz2NZQrdLxxzTVyD", "question": "What is the title of a dog that helps find people?", "choices": ["people pleaser", "scent sniffer", "fur finder", "rescue animal"], "correct_choice_idx": 3, "direct_answers": ["search dog", "rescue", "rescue dog", "search-and-rescue dog", "rescue", "search dog", "search", "search rescue", "rescue animal", "search animal"], "difficult_direct_answer": false, "rationales": ["The dog is a rescue animal.", "They help find people that have been in disasters.", "The title is a rescue dog."], "image": "val2014/COCO_val2014_000000137028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361158, "question_id": "hPP2ryxHrEnqP8E7g4naSV", "question": "What does this umbrella keep off her head?", "choices": ["sun", "showers", "sleet", "advertising"], "correct_choice_idx": 0, "direct_answers": ["sunlight", "sun", "sun ray", "rain", "sunlight", "sun", "sun", "sunlight", "sunshine", "sun"], "difficult_direct_answer": false, "rationales": ["She is preventing burning her skin.", "This umbrella keeps the sunlight off the woman's head.", "There's no other weather visible except for clear skies."], "image": "train2014/COCO_train2014_000000361158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421578, "question_id": "hPQo9vUNCNiwCgMiQufVdt", "question": "What type of fruit is most likely on the tree?", "choices": ["lime", "dragonfruit", "apple", "lemon"], "correct_choice_idx": 3, "direct_answers": ["lemon", "lemon", "lemons", "lemon", "orange", "lemons", "oranges", "orange", "lemon", "lemon"], "difficult_direct_answer": false, "rationales": ["These are lemons.", "Yellow fruit is on a tree. lemons are yellow and grow on trees.", "Lemons are yellow."], "image": "train2014/COCO_train2014_000000421578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244267, "question_id": "hPXPRPCLXyMXRmDqgv2iHV", "question": "Persons standing on the board here perfect what?", "choices": ["wardrobe", "hair", "video skills", "stance"], "correct_choice_idx": 3, "direct_answers": ["balance", "stance", "form", "stance", "balance", "stance", "form", "stance", "ten", "surfing"], "difficult_direct_answer": false, "rationales": ["They are in a perfect pose for surfing.", "The people have a perfect stance.", "The people are doing a stance."], "image": "train2014/COCO_train2014_000000244267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541147, "question_id": "hPXXEQSmSJcVN6BJm9bY4H", "question": "What piece of furniture appears as if it might go into the source of heat in this room?", "choices": ["cushion", "divan", "chair", "table"], "correct_choice_idx": 3, "direct_answers": ["table", "table", "table", "chair", "coffee table", "fireplace", "television", "coffee table", "end table", "fireplace"], "difficult_direct_answer": false, "rationales": ["The fireplace uses wood for fire and that's what the coffeetable is made of.", "The table is wood.", "There is a unique wood table that is made entirely of wood logs."], "image": "train2014/COCO_train2014_000000541147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34283, "question_id": "hPsp4vuMKXDnqUkAyzTHXc", "question": "How are the children controlling the object?", "choices": ["string", "magic", "battery", "remote"], "correct_choice_idx": 0, "direct_answers": ["kite string", "string", "string", "string", "string", "string", "string", "using drone", "kite string", "string"], "difficult_direct_answer": false, "rationales": ["This holds on to it to steer it and make sure it doesn't fly away", "The kite flies by a piece of string.", "The kids are holding the kite string."], "image": "train2014/COCO_train2014_000000034283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127074, "question_id": "hPzPxxUMxnJfEJNhGeZjZt", "question": "What is the name of the bag on the grass in front of this man?", "choices": ["duffle bag", "sports bag", "drawstring bag", "handbag"], "correct_choice_idx": 2, "direct_answers": ["can't see", "tote", "gym bag", "back pack", "tote", "drawstring", "drawstring bag", "backpack", "bag pack", "side bag"], "difficult_direct_answer": true, "rationales": ["The bag has a drawstring and that's what it's called...a drawstring bag.", "This type of bag you pull on the strings to close the bag so nothing falls out.", "The name is a drawstring bag."], "image": "val2014/COCO_val2014_000000127074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263042, "question_id": "hQ9uhYj7KRB3V6bPbWbP7c", "question": "What is the boy doing with the water?", "choices": ["drink", "extinguish", "wash", "cook"], "correct_choice_idx": 2, "direct_answers": ["washing ground", "cleaning", "washing", "hosing down", "cleaning area", "cleaning", "wash", "cleaning ground", "washing elephant", "hosing down"], "difficult_direct_answer": false, "rationales": ["The boy is holding the hose near the ground.", "He's pointing the hose at debris on the ground", "The boy is washing."], "image": "train2014/COCO_train2014_000000263042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9822, "question_id": "hQTRsc5kprFAxtu64NBNEB", "question": "What type of toppings are on the donuts?", "choices": ["dirt", "oreo", "brownie", "chocolate"], "correct_choice_idx": 1, "direct_answers": ["oreo bits", "chocolate cookies", "chocolate", "oreos", "oreo", "oreo chunks", "cookie", "oreos", "oreos", "oreos"], "difficult_direct_answer": false, "rationales": ["These are crumbled cookies", "The toppings have the patterns that you would see on an oreo cookie.", "It's oreos."], "image": "train2014/COCO_train2014_000000009822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246184, "question_id": "hQWEHKkM23PyFWhfAR2Dne", "question": "What is the proper name for these fins?", "choices": ["skeg", "rudder", "flipper", "arm"], "correct_choice_idx": 0, "direct_answers": ["sket", "skeg", "gibberts", "shark", "sharpies", "skeg", "dorsal", "fins", "skeg", "gills fins"], "difficult_direct_answer": false, "rationales": ["These fins are actually called skegs.", "The name is skeg.", "The fin underneath the rear of a surfboard is called a skeg."], "image": "train2014/COCO_train2014_000000246184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410912, "question_id": "hQzwNiMjJQWaEgTJ7yqMP2", "question": "What are the cameras for?", "choices": ["wedding", "party", "security", "game"], "correct_choice_idx": 2, "direct_answers": ["security", "security", "security", "security", "security", "security", "security", "security", "security", "security"], "difficult_direct_answer": false, "rationales": ["The cameras are security.", "A train station can be an area what many issues or crimes can occur, so the cameras are needed for that purpose.", "The cameras are placed overlooking a public open space and of a style, size and design consistent with answer a."], "image": "val2014/COCO_val2014_000000410912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571141, "question_id": "hR3kzSaDMdLvgfwLBPKSFH", "question": "In skateboarding skates with right foot what they called?", "choices": ["goofy", "rider", "regular", "looser"], "correct_choice_idx": 0, "direct_answers": ["goofy", "goofy footers", "goofy-footers", "rightie", "goofy footers", "jump", "goofy-footer", "righty", "goofy-footers", "goofy"], "difficult_direct_answer": false, "rationales": ["He is doing a trick.", "It's goofy.", "That foot is known to push the board."], "image": "train2014/COCO_train2014_000000571141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125017, "question_id": "hR6F59gwn5UtTauZeWinNr", "question": "The person who uses this desk likely works as what type of professional?", "choices": ["actuary", "engineer", "web developer", "architect"], "correct_choice_idx": 2, "direct_answers": ["developer", "website design", "writer", "office", "programmer", "computer", "web developer", "computer programmer", "programmer", "programmer"], "difficult_direct_answer": false, "rationales": ["The books on the desk are all related to programing and the internet.", "That's what all of the books are referencing.", "Due to the computer with library of programming books we can assume this person does web development."], "image": "val2014/COCO_val2014_000000125017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499125, "question_id": "hRFWeXcdPvmYr5oTDBhcBE", "question": "What does the largest animal here have?", "choices": ["tusk", "antlers", "long neck", "wings"], "correct_choice_idx": 0, "direct_answers": ["tusk", "tusk", "tusk", "tusk", "tusks", "tusk", "tusks", "tusks", "tusks", "tusks"], "difficult_direct_answer": false, "rationales": ["The elephant has tusks.", "The elephant that's larger has tusks showing.", "Elephants have tusks."], "image": "train2014/COCO_train2014_000000499125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127454, "question_id": "hRK894E676A3JsYJM3R8gw", "question": "What type of ball has a special place for it here?", "choices": ["baseball", "shuttlecock", "basketball", "football"], "correct_choice_idx": 2, "direct_answers": ["basketball", "golf", "road", "hitch ball", "basketball hoop", "basketball", "unclear question", "basketball", "basketball", "basketball"], "difficult_direct_answer": false, "rationales": ["There is a basketball hoop in the pictures. you shoot basketballs in a basketball hoop.", "There is a basketball goal in the background.", "There is a hoop on the other street"], "image": "train2014/COCO_train2014_000000127454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72702, "question_id": "hRtkUfvE8NHdzgmx8gvdzR", "question": "What style of architecture is this?", "choices": ["australian", "african", "asian", "south american"], "correct_choice_idx": 2, "direct_answers": ["stone", "elephant", "asian", "statue", "asian", "japanese", "asian", "sculpture", "hui style", "temple"], "difficult_direct_answer": false, "rationales": ["There are places in asia filled with these types of statues.", "It has the flared roofs and upper floors are smaller", "The architecture is asian."], "image": "train2014/COCO_train2014_000000072702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465824, "question_id": "hS39CiMJzsbMpNoBYsVLft", "question": "What is the woman doing?", "choices": ["dancing", "jumping", "sleeping", "yawning"], "correct_choice_idx": 3, "direct_answers": ["advertising", "sitting", "advertising", "yawning", "yawning", "sitting", "yawning", "modeling", "sitting", "sitting"], "difficult_direct_answer": false, "rationales": ["She has her mouth stretched wide open presumably because she is tired or bored.", "Her mouth is open.", "The woman has her mouth agape."], "image": "train2014/COCO_train2014_000000465824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272064, "question_id": "hSUpiawyrKQ3LXJZUQVW2R", "question": "The player is using all his energy when preparing to do what with the ball?", "choices": ["bunce it", "throw it", "roll it", "serve it"], "correct_choice_idx": 3, "direct_answers": ["hit it", "hit", "throw", "tennis", "serve it", "hit", "hit", "tennis", "hit it", "serve"], "difficult_direct_answer": false, "rationales": ["The man's trying to serve the ball with force.", "The player is hitting the ball to his opponent.", "He is about to smash the ball across the court to his opponent."], "image": "train2014/COCO_train2014_000000272064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538581, "question_id": "hSYHx89RDwHTgos5nNmtmU", "question": "What color vest does the person who put these dogs on the surfboard wear?", "choices": ["purple", "white", "polka dot", "yellow"], "correct_choice_idx": 0, "direct_answers": ["blue green", "blue", "blue", "blue", "purple", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The person's vest is visible and easily identifiable.", "The person is wearing purple.", "The person who owns the dogs has a purple shirt on."], "image": "train2014/COCO_train2014_000000538581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467791, "question_id": "hSZmASp5ZjWc9LzscAsEfM", "question": "What base is the player in white behind the grey suited player supposed to be on?", "choices": ["first base", "second base", "home base", "third base"], "correct_choice_idx": 0, "direct_answers": ["first", "home base", "first", "home plate", "first", "first", "pujols", "center field", "first base", "first base"], "difficult_direct_answer": false, "rationales": ["The batter would run from where he is which is home base to the very first white plate on our right before the ball is caught and he would be out.", "He is trying to steal to second base.", "The guy is guarding first."], "image": "val2014/COCO_val2014_000000467791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173553, "question_id": "hSiiYHWC2HG34rhoAVUZh7", "question": "If the man in the blue shirt was to eat a food that matched the color of his bandana what would it be?", "choices": ["banana", "cherry", "steak", "spinach"], "correct_choice_idx": 0, "direct_answers": ["hark work", "blueberries", "banana", "banana", "banana", "banana", "banana", "banana", "pizza", "banana"], "difficult_direct_answer": false, "rationales": ["The item in the option is a yellow fruit, matching the color of the man's bandana.", "His bandana is yellow.", "Typically, humans eat ripe bananas. unripe bananas are green, rotten ones are brown; ripe bananas are yellow like his bandana."], "image": "val2014/COCO_val2014_000000173553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535506, "question_id": "hSxE9rqRaPAYfEzGxV8ZcQ", "question": "Why is the woman holding a knife?", "choices": ["attacking others", "cutting food", "defending", "hurting self"], "correct_choice_idx": 1, "direct_answers": ["photograph", "food preparation", "chopping onions", "she's cooking", "chopping", "chopping", "cutting food", "food preparation", "to cut", "prepping"], "difficult_direct_answer": false, "rationales": ["She's cutting food.", "The woman is situated in a kitchen in front of a cutting board with choppedfood. kitchens are typically where food is prepared and a knife would be used.", "We can see what the woman was doing with the knife on the platter below her, vegetables and fruits have been diced and sliced."], "image": "val2014/COCO_val2014_000000535506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570624, "question_id": "hT6M455cHk6bPg4XkNWZGp", "question": "What month was this picture taken?", "choices": ["october", "march", "december", "february"], "correct_choice_idx": 2, "direct_answers": ["december", "december", "december", "december", "december", "december", "december 2009", "december", "december 2009", "december"], "difficult_direct_answer": false, "rationales": ["This street scene is dark and damp, and could easily be accepted as a fall or winter photo, as the people are bundled up. however, the presence of a good christmas decoration on the street light makes \"december\" a wise choice.", "There are christmas trees", "The picture was taken in december."], "image": "val2014/COCO_val2014_000000570624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347362, "question_id": "hT6PyKj7r3xVDz3XULZtaq", "question": "Where is this man eating?", "choices": ["park", "restaurant", "home", "office"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "pie", "dessert", "dessert", "dessert", "cake", "coffee shop", "dessert", "restaurant", "pie"], "difficult_direct_answer": false, "rationales": ["The decor and the furniture in addition to the multiple groups of patrons put this in a professional establishment that serves food.", "There are other people sitting at more tables in the same room", "People sit at tables close to one another in restaurants."], "image": "train2014/COCO_train2014_000000347362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219558, "question_id": "hT9sMVBUTykDHYx3PLFERA", "question": "The parent company of this telephone brand is from what country?", "choices": ["finland", "germany", "japan", "united states"], "correct_choice_idx": 1, "direct_answers": ["germany", "t mobile", "united states", "london", "germany", "japan", "germany", "germany", "germany", "america"], "difficult_direct_answer": false, "rationales": ["The sign on the building says t mobile whose parent company is deutsche telecom ag with headquarters in bonn.", "The sign to the right indicates that the telephone brand is t-mobile. it is owned by deutsche telekom.", "The company is t-mobile. the parent company is deutsche telekom."], "image": "val2014/COCO_val2014_000000219558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405192, "question_id": "hTAncDCVQKjhBujUtEaU8X", "question": "What are the sticks seen here made from?", "choices": ["bread", "drumsticks", "chicken", "beef"], "correct_choice_idx": 0, "direct_answers": ["wood", "bread", "bread", "wood", "chopsticks", "bread", "chopaticka", "wood", "bread", "wood"], "difficult_direct_answer": false, "rationales": ["The basket on the table contains breadsticks.", "These are breadsticks that are a side with dinner", "These sticks are presented with similar food objects."], "image": "val2014/COCO_val2014_000000405192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271167, "question_id": "hTMLbjp9J3vH2KYygJeD2S", "question": "What is the green thing at the bottom of the tub?", "choices": ["towel", "slip mat", "bath toy", "robe"], "correct_choice_idx": 1, "direct_answers": ["slip mat", "slip mat", "mat", "bothinh", "bath mat", "shampoo", "safety mat", "mat", "slip mat", "bath mat"], "difficult_direct_answer": false, "rationales": ["A kid is in a bathtub and a green mat can be seen under the water.", "The item is a mat for the bathtub to keep people from slipping.", "The rubber mat creates friction so you don't fall in the tub."], "image": "train2014/COCO_train2014_000000271167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192464, "question_id": "hTQdd3Q8AwdZbMYQUthjsn", "question": "Which word would be used to describe this woman?", "choices": ["gigantic", "swarthy", "translucent", "pale"], "correct_choice_idx": 1, "direct_answers": ["beautiful", "cute", "happy", "happy", "happy", "happy", "happy", "fun", "happy", "swarthy"], "difficult_direct_answer": false, "rationales": ["She looks happy.", "She is swarthy.", "The woman seems to be having good time with her friends with the warm smile."], "image": "train2014/COCO_train2014_000000192464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454867, "question_id": "hTy9DkcmYBQv5eWW8yF72P", "question": "What type of motion does a train use?", "choices": ["rectilinear motion", "power", "acceleration", "moment"], "correct_choice_idx": 0, "direct_answers": ["electrical", "forward motion", "forward", "mechanical", "rectilinear motion", "rail", "forward", "locomotion", "rectilinear motion", "rectilinear motion"], "difficult_direct_answer": false, "rationales": ["The motion is slow and steady.", "It uses c to follow a linear motion path.", "The train is going in rectilinear motion."], "image": "train2014/COCO_train2014_000000454867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75296, "question_id": "hUJ3h7R7nYYVZXx4JqZ8c2", "question": "What food nutrients are lacking in this food?", "choices": ["starches", "fiber", "sugars", "carbohydrates"], "correct_choice_idx": 1, "direct_answers": ["vitamins", "is unhealthy", "fiber", "vitamins", "protein", "vegetables", "vitamins", "protein vitamins", "fruits vegetables", "protein"], "difficult_direct_answer": false, "rationales": ["The nutrients lack fiber.", "Donuts have a lot of carbs, starches and sugar in them.", "Donuts don't have fiber."], "image": "val2014/COCO_val2014_000000075296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275271, "question_id": "hUKjBj9Ci89bUtugsQbmXi", "question": "What at least symbolically is meant to take place below the clock here?", "choices": ["painting", "fire", "sales", "rain"], "correct_choice_idx": 1, "direct_answers": ["fire", "prayer", "snowing", "time passing", "fireplace", "fire", "fire", "room", "prayers", "fireplace"], "difficult_direct_answer": false, "rationales": ["This decor typically is placed on a mantle for a chimney.", "It's a fireplace.", "In a normal setting the wood panel would be removed to reveal a stove like setting meant for flames."], "image": "train2014/COCO_train2014_000000275271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66839, "question_id": "hUwSpAmjThi3akMu7Ng8FF", "question": "The trolley most likely transports what type of passengers?", "choices": ["executives", "seniors", "tourists", "children"], "correct_choice_idx": 2, "direct_answers": ["tourists", "citizens", "tourist", "tourists", "adults", "normal people", "local", "tourists", "humans", "city"], "difficult_direct_answer": false, "rationales": ["This is not an extremely effective mode of transportation anymore but remains in some places for nostalgia. this trolley is particularly decorated so it is likely people using it would be visiting specifically to use this and not people from around town who need everyday transportation.", "The trolley is for tourists.", "This bus is nicer looking than those used by everyday residents or commuters and is probably for tourists."], "image": "train2014/COCO_train2014_000000066839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160233, "question_id": "hV6mSv2X9FbmzApdDTGNJu", "question": "Why is the woman carrying an orange umbrella?", "choices": ["it's raining", "showing off", "it's sunny", "for fashion"], "correct_choice_idx": 0, "direct_answers": ["raining", "raining", "raining", "it's raining", "raining", "rain", "rain", "stay dry", "its raining", "raining"], "difficult_direct_answer": false, "rationales": ["It's obvious from the wet street that it's raining, so the woman has opened her umbrella. it's orange because she likes the color and isn't afraid of people looking at her!.", "The wet sidewalk and glossy reflectivity present in this image let's us know it's raining.", "The sky looks dreary and the road looks wet so it is assumed it is raining and the umbrella is protecting her."], "image": "val2014/COCO_val2014_000000160233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235098, "question_id": "hVBnaCS9qJNQd5goEMXcG4", "question": "What is the woman using the object in her hand to do?", "choices": ["eat", "photograph", "to text", "drink"], "correct_choice_idx": 1, "direct_answers": ["photograph", "photography", "holding", "take pictures", "photograph", "taking pictures", "drink", "ride horse", "pictures", "take pictures"], "difficult_direct_answer": false, "rationales": ["The object in the women's hand is a camera. camera's are used to take pictures.", "The woman is holding a camera which is a photography tool.", "The item the woman is using only has one purpose."], "image": "train2014/COCO_train2014_000000235098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337883, "question_id": "hVTnyZ5zZWHAkjbhmJ2q4Q", "question": "What kitchen appliance is the man standing in front of?", "choices": ["dishwasher", "stove", "toaster", "microwave"], "correct_choice_idx": 1, "direct_answers": ["microwave", "stove", "stove", "stove", "stove", "microwave", "microwave", "stove", "stove", "stove"], "difficult_direct_answer": false, "rationales": ["The man is cooking so he is standing in front of an stove.", "The man is in a kitchen. he is cooking in a pot.", "The appliance is a stove."], "image": "train2014/COCO_train2014_000000337883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200678, "question_id": "hVgDpUnmVrRfrFGP85cnTz", "question": "How is the grill acquiring its heat source?", "choices": ["gas", "wood chips", "electricity", "charcoal"], "correct_choice_idx": 0, "direct_answers": ["gas", "propane", "propane tank", "propane tank", "flame", "propane", "gas", "propane", "propane", "charcoal"], "difficult_direct_answer": false, "rationales": ["The grill uses gas.", "There are no wires coming out of the grill. the substance in the tank behind the doors is being burned to generate heat.", "There is a propane tank visible underneath the grill. propane tanks connected to grills mean that the source of power would be the gas contained in the tank."], "image": "train2014/COCO_train2014_000000200678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116462, "question_id": "hVjQByxgbdhqxoKDZ97jbH", "question": "What is the long object on the top of the truck?", "choices": ["rope", "ladder", "board", "pole"], "correct_choice_idx": 1, "direct_answers": ["ladder", "ladder", "ladder", "wire", "ladder", "wire", "ladder", "ladder", "ladder", "ladder"], "difficult_direct_answer": false, "rationales": ["The ladder helps firemen reach the tops of buildings.", "The ladder is the long object.", "There is a steel piece of equipment that has rungs on them to climb up tall buildings."], "image": "train2014/COCO_train2014_000000116462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290378, "question_id": "hVoQZx7Z3zyEUgRzAaVYy6", "question": "What book series does he probably like?", "choices": ["berenstain bears", "amelia bedelia", "clifford", "curious george"], "correct_choice_idx": 3, "direct_answers": ["dr suss", "children's", "arthur", "curious george", "rugrats", "curious george", "unknown", "curious george", "curious george", "school bus"], "difficult_direct_answer": false, "rationales": ["The kid is still really young and probably likes cartoon animals.", "It's a curious george.", "A boy is sitting down. boys like curious george who is a children's character depicted in books and shows."], "image": "train2014/COCO_train2014_000000290378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401330, "question_id": "hVvs4AdfCixT89GuWTDAiw", "question": "Why are the men's coat/vest yellow or orange?", "choices": ["camouflage", "visibility", "dress code", "fashion"], "correct_choice_idx": 1, "direct_answers": ["safety", "visibility", "safety visibility", "visibility", "at work", "visability", "workers", "for safety", "job safety", "be visible"], "difficult_direct_answer": true, "rationales": ["Because of all the vehicles and activity on the tarmac, it is important that they can be seen as clearly as possible.", "The people are moving luggage at an airport. they need to be seen by operators of planes and other vehicles.", "The men work on a air strip where plane come and go, so for safety reason they wear neon colors."], "image": "val2014/COCO_val2014_000000401330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90341, "question_id": "hVyL4u7RVTEePrCHQkFoGf", "question": "What might the man in blue be feeling right now?", "choices": ["joyful", "love", "happiness", "disappointment"], "correct_choice_idx": 3, "direct_answers": ["angry", "pressure", "anticipation", "sad", "anxiety", "nervous", "defeat", "disappointment", "disappointed", "dissapointed"], "difficult_direct_answer": true, "rationales": ["The man in blue might be looking a little disappointed.", "He's looking down and has a sad face", "A person on a tennis court looks down with a pensive expression."], "image": "train2014/COCO_train2014_000000090341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45923, "question_id": "hWByQV5bpCz5ez562afcP6", "question": "What color is the brick in the middle?", "choices": ["brown", "yellow", "rainbow", "red"], "correct_choice_idx": 3, "direct_answers": ["multi", "orange", "multicolored", "blue", "rainbow", "red", "multicolored", "blue", "multi pastels", "blue"], "difficult_direct_answer": false, "rationales": ["There are multiple brick patterns and colors visible in the image. it is difficult to discern which is the exact middle, but the pattern is consistent with answer c.", "It is several colors", "The color is red."], "image": "train2014/COCO_train2014_000000045923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415288, "question_id": "hWKAEQzR7uAL5akhrUPrhD", "question": "How many motors are on the vehicles used by the people shown here to get them here?", "choices": ["two", "three", "three", "none"], "correct_choice_idx": 3, "direct_answers": ["zero", "zero", "none", "zero", "zero", "zero", "one", "two", "one", "zero"], "difficult_direct_answer": false, "rationales": ["All of these people are standing around their bicycles. they don't require any engines to move.", "These bicycles do not have motors.", "There are none."], "image": "train2014/COCO_train2014_000000415288.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569017, "question_id": "hWNBEUdDzQjX9x4FXF7WNb", "question": "What type of thing is shown on the umbrella?", "choices": ["food", "palace", "life raft", "drink"], "correct_choice_idx": 3, "direct_answers": ["advertisement", "cokecola", "logo", "beverage logo", "coca cola", "coca cola", "coca cola", "drink", "drink", "advertisement"], "difficult_direct_answer": false, "rationales": ["This is the logo for coca cola which is a soda", "It says coca cola on it", "Coca cola is advertised."], "image": "train2014/COCO_train2014_000000569017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165741, "question_id": "hWVErsWZLASJGKDPhKt3RS", "question": "This type of protein is most likely what?", "choices": ["fish", "tofu", "beef", "veal"], "correct_choice_idx": 0, "direct_answers": ["cheese", "meat", "meat", "fish", "fish", "fish", "fish", "fish", "fish", "fish"], "difficult_direct_answer": false, "rationales": ["The protein here is most likely for purpose of protein.", "It curled up when cooking which is characteristic of this meat", "Fried seafood is often thin cut and served with lemon."], "image": "train2014/COCO_train2014_000000165741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117765, "question_id": "hWejTUx6JfAEP5HGLbVhT7", "question": "From where did the camera man take this photo?", "choices": ["plane", "helicopter", "tall building", "ferris wheel"], "correct_choice_idx": 1, "direct_answers": ["airplane", "helicopter", "air", "helicopter", "up high", "out window", "sky", "airplane", "helicopter", "air"], "difficult_direct_answer": false, "rationales": ["The camera is from a helicopter.", "The photo is taken from above and the ground looks tiny so it was likely in a helicopter.", "The person is sitting in a helicopter."], "image": "train2014/COCO_train2014_000000117765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286411, "question_id": "hWqJD3sjjFVjdjLrgSNEZh", "question": "Why didn't they put the cardboard in the receptacles?", "choices": ["too big", "selling", "saving", "recycling"], "correct_choice_idx": 3, "direct_answers": ["cover already", "recycle", "lazy", "recycling", "too large", "didn't fit", "recycling", "too big", "won't fit", "full"], "difficult_direct_answer": true, "rationales": ["They're being recycled.", "Cardboard goes in a different truck than trash to be recycled.", "It is being recycled."], "image": "train2014/COCO_train2014_000000286411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251857, "question_id": "hWuSnuSQ9YcUm7HgN7J5Sj", "question": "What is the weather like where the man is riding his skateboard?", "choices": ["sunny warm", "desert dry", "windy", "cold"], "correct_choice_idx": 0, "direct_answers": ["sunny", "sunny", "sunny", "sunny warm", "rainy", "sunny", "sunny", "sunny", "warm", "summer"], "difficult_direct_answer": false, "rationales": ["The people are wearing short sleeves and shorts so it must be a nice day.", "It is bright outside and one person has shorts on", "People are wearing shorts."], "image": "val2014/COCO_val2014_000000251857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147101, "question_id": "hWyZh4muEo3isULeBPyEec", "question": "What's the name of this type of skating area?", "choices": ["pipe", "grid", "ramp", "bowl"], "correct_choice_idx": 3, "direct_answers": ["skate park", "park", "skate park", "skating rink", "bowl", "sport area", "skating park", "bowl", "skate park", "bowl"], "difficult_direct_answer": false, "rationales": ["This skating area is made of concrete with the most prominent feature that it has a big empty bottom. this would most closely resemble a bowl.", "It's a bowl.", "The person is in a skate bowl."], "image": "val2014/COCO_val2014_000000147101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190447, "question_id": "hXyWDPix69h5QLtazmnFdz", "question": "The store that the bicycle is parked in front of specializes in the sales of what items?", "choices": ["old furniture", "groceries", "coffee", "candy"], "correct_choice_idx": 0, "direct_answers": ["antiques", "old furniture", "antiques", "antiques", "antiques", "bikes", "antiques", "bikes", "bikes", "antiques"], "difficult_direct_answer": false, "rationales": ["The store sign says antiques.", "The sign has writing that is clearly visible and readable and is a word that can be defined as answer a.", "Antique stores sell furniture."], "image": "val2014/COCO_val2014_000000190447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291355, "question_id": "hY82XBs7HVZznEpXBGtJNP", "question": "Where would this room be located?", "choices": ["hotel", "rv", "gym", "hospital"], "correct_choice_idx": 0, "direct_answers": ["house", "family room", "hotel room", "living area", "living room", "hotel", "parlor", "house", "hotel", "outside"], "difficult_direct_answer": false, "rationales": ["This room has a hotel-style chair.", "The room is in a hotel.", "This room would be usually located in a hotel."], "image": "train2014/COCO_train2014_000000291355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494629, "question_id": "hY9AiXKHGodWUBNM5pHWMm", "question": "What part of this mans furniture is most surprising?", "choices": ["desk", "toilet", "speaker", "flooring holder"], "correct_choice_idx": 1, "direct_answers": ["toilet", "seat", "toilet chair", "toilet seat", "toilet sit", "toilet", "back", "toilet", "toilet", "chair"], "difficult_direct_answer": false, "rationales": ["The man seems to be be sitting on the toilet.", "He is using a toilet to sit on.", "It is made from a toilet."], "image": "val2014/COCO_val2014_000000494629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433941, "question_id": "hYFxCSAdhBtbCSCxkKrz9q", "question": "Which one of these months is possible?", "choices": ["july", "august", "june", "december"], "correct_choice_idx": 3, "direct_answers": ["january", "ice", "january", "december", "winter", "december", "december", "december", "december", "december"], "difficult_direct_answer": false, "rationales": ["The month is december.", "With the snow it has to be a winter month and december is the only winter month here.", "There is snow on the ground. snow usually falls during the winter."], "image": "train2014/COCO_train2014_000000433941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357814, "question_id": "hYG5QTUEZ5nQWLyTHNsDJN", "question": "Who is the older woman to the young girl?", "choices": ["mother", "sister", "teacher", "cousin"], "correct_choice_idx": 0, "direct_answers": ["mother", "mother", "mother", "mother", "mom", "mom", "mom", "mother", "mother", "mom"], "difficult_direct_answer": false, "rationales": ["The woman is the girl's mom.", "The woman is old enough to be the child's mom.", "The older woman is holding the child lovingly and is likely the mother."], "image": "val2014/COCO_val2014_000000357814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39258, "question_id": "hYPHx9puraQ7YPtDgVhkfi", "question": "Why is the skateboarder reaching down?", "choices": ["scratching itch", "grabbing phone", "performing trick", "tying shoes"], "correct_choice_idx": 2, "direct_answers": ["to skate", "maintaining balance", "balance", "balance", "balance", "reaching board", "grabbing board", "grabbing board", "holding skateboard", "performing trick"], "difficult_direct_answer": false, "rationales": ["The man is airborne over an obstacle which would be consistent with someone using a skateboard with answer a.", "The skateboarder is currently in midair over a rock. skateboarders in mid air reaching for their board are most frequently attempting some kind of trick.", "He's doing a trick."], "image": "train2014/COCO_train2014_000000039258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181098, "question_id": "hYS4GyWrWu9oUPAVdaGYEg", "question": "What is showing on the woman that shouldn't be?", "choices": ["bra straps", "slip", "underwear", "pantyhose"], "correct_choice_idx": 0, "direct_answers": ["bra straps", "brastrap", "bra", "bra strap", "bra", "bra strap", "bra strap", "bra strap", "bra", "bra"], "difficult_direct_answer": false, "rationales": ["This is the only article of clothing the woman is wearing that is visibly showing.", "These are normally hidden by a shirt", "Underwear is typical seen as shameful or private, and display is frowned on."], "image": "train2014/COCO_train2014_000000181098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542365, "question_id": "hYV4wRznJSY9TjVFagAaET", "question": "What are the posts of the wire fence made of?", "choices": ["metal", "aluminum", "wood", "plastic"], "correct_choice_idx": 2, "direct_answers": ["wood", "wires", "wood", "wood", "wood", "wood", "wires", "wood", "wood", "wires"], "difficult_direct_answer": false, "rationales": ["The posts are wooden.", "The posts that are used to support the fence are brown and not perfectly shaped. these are characteristics of wood.", "These are rough pieces of wood from trees"], "image": "train2014/COCO_train2014_000000542365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544104, "question_id": "hYa7QVqRUf8J9YBBUDzzQb", "question": "What might many of the boat owners here use the boats for?", "choices": ["racing", "fishing", "regatta", "tourism"], "correct_choice_idx": 1, "direct_answers": ["fishing", "fishing", "travelling purpose", "transport", "fishing", "fishing", "fishing", "transportation", "fishing", "fishing"], "difficult_direct_answer": false, "rationales": ["There are fishing nets by the boats.", "These boats are used to catch fish.", "The people are fishing."], "image": "val2014/COCO_val2014_000000544104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52086, "question_id": "hYcAdWgVEib5QKcofNT5qj", "question": "The condiment on this food comes from where?", "choices": ["mustard seed", "echinacea plant", "tomato plant", "wasabi leaf"], "correct_choice_idx": 0, "direct_answers": ["seeds", "germany", "mustard", "mustard seed", "mustard seed", "mustard seeds", "mustard plant", "seeds", "mustard plant", "chines"], "difficult_direct_answer": false, "rationales": ["It's obvious that mustard wouldn't come from anything else.", "The food is mustard.", "Mustard comes from seeds."], "image": "train2014/COCO_train2014_000000052086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155997, "question_id": "hYojrq7UiC3cgmQEYVy5kC", "question": "What is the dog riding in?", "choices": ["trolley", "wagon", "bus", "car"], "correct_choice_idx": 1, "direct_answers": ["cart", "wagon", "wagon", "wagon", "cart", "wagon", "cart", "cart", "cart", "cart"], "difficult_direct_answer": false, "rationales": ["It is a small cart that is towed behind something", "The dog is in a crate pulled by a bike.", "This mode of transportation can be used to move objects or people easily. it can have two to four wheels."], "image": "val2014/COCO_val2014_000000155997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483494, "question_id": "hZ3xntbd4RR5ZSGyMgZZaR", "question": "What venue is shown in the image?", "choices": ["living room", "bedroom", "cabin", "hotel room"], "correct_choice_idx": 1, "direct_answers": ["hotel", "hotel", "hotel", "bedroom", "hotel", "bedroom", "traveling", "hotel", "luggage", "hotel"], "difficult_direct_answer": false, "rationales": ["The venue is a bedroom.", "We can see a mattress with sheet on top in this room. such a thing is traditionally found in a bedroom of whatever building the room might be in.", "There is a bed in this room that someone is laying on."], "image": "train2014/COCO_train2014_000000483494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425150, "question_id": "hZJRQ8pqWKeZeqtVsKXjhb", "question": "What does the crest on the castle represent?", "choices": ["england", "aaron's", "france", "lakers"], "correct_choice_idx": 0, "direct_answers": ["clock", "safety", "logo", "family", "achievements status", "lions", "england", "status", "king", "house"], "difficult_direct_answer": true, "rationales": ["It represents england.", "Lions are the symbol used for england.", "The crests are usually from britain."], "image": "train2014/COCO_train2014_000000425150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303471, "question_id": "hZJzAsWtmtayRVU2xbNQAN", "question": "What surface is the girl playing on?", "choices": ["indoor hard", "outdoor hard", "grass", "clay"], "correct_choice_idx": 1, "direct_answers": ["outdoor hard", "clay court", "tennis court", "synthetic", "court", "tennis court", "concrete", "rubber", "clay", "asphalt"], "difficult_direct_answer": true, "rationales": ["The surface is outdoors.", "The woman is on an outdoor tennis court.", "The hard surface is visible and distinct from other surfaces because of it's texture. the player is clearly outside because of the sky in the background."], "image": "train2014/COCO_train2014_000000303471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513145, "question_id": "hZoNXg87cSrZUf8HTAizoy", "question": "What does the woman have all over her face?", "choices": ["hair", "piercings", "stickers", "food"], "correct_choice_idx": 1, "direct_answers": ["piercings", "decoration", "paint", "rings", "piercings", "piercings", "jewelry", "makeup", "piercings", "paint"], "difficult_direct_answer": false, "rationales": ["She has all different kind of piercings to decorate her face.", "It is covered in metal studs and rings of varying sizes.", "The woman has her faces covered with jewelries."], "image": "train2014/COCO_train2014_000000513145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470115, "question_id": "hZvX2QExdCgmt3Cm3Eyg9U", "question": "Do both options have cheese on them?", "choices": ["maybe", "yes", "unsure", "no"], "correct_choice_idx": 3, "direct_answers": ["no", "yes", "no", "no", "no", "no", "no", "no", "no", "yes"], "difficult_direct_answer": false, "rationales": ["The hot dog only has ketchup", "The hot dog doesn't have cheese.", "The hot dog on the left does not have cheese on it."], "image": "val2014/COCO_val2014_000000470115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577766, "question_id": "ha5MqGdk64bseUugr9yHNH", "question": "One will be charged what if one uses the yellow item?", "choices": ["rental", "salary", "fare", "toll"], "correct_choice_idx": 2, "direct_answers": ["fare", "taxi", "fare", "money", "fare", "ten", "fare", "fare", "fare", "money"], "difficult_direct_answer": false, "rationales": ["The yellow item in this scene is a taxi. taxi's transport people places for a rate which when tallied at the end of the trip is called a fare.", "This is a taxi", "Taxis are cars that charge a fare to transport you places."], "image": "train2014/COCO_train2014_000000577766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261707, "question_id": "ha5s8ouWeSq52kFvVYN935", "question": "What company is the main competitor to the laptop's operating system?", "choices": ["microsoft", "apple", "samsung", "android"], "correct_choice_idx": 0, "direct_answers": ["firefox", "microsoft", "windows", "apple", "windows", "microsoft", "microsoft", "apply", "apple", "samsung"], "difficult_direct_answer": false, "rationales": ["The laptop is a macbook.", "This company produces the most widely adopted pc operating system.", "The company is microsoft."], "image": "val2014/COCO_val2014_000000261707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271760, "question_id": "haCGCAo2PejphB4hYfBpWp", "question": "Where are the people in?", "choices": ["zoo", "wilderness", "theme park", "farm"], "correct_choice_idx": 0, "direct_answers": ["zoo", "bus", "bus", "wildlife sanctuary", "gazebo", "behind glass", "bus", "tour bus", "trolley", "tent"], "difficult_direct_answer": false, "rationales": ["The zebras shown are in an enclosure, and not the wild.", "People are peeing thru a piece of glass as they watch zebras on other side.", "The animals are in cages and the people looking through glass."], "image": "train2014/COCO_train2014_000000271760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289814, "question_id": "haH5FQMdFMFi7LtDihseDS", "question": "Why is the woman holding an umbrella?", "choices": ["cosplay", "blocking sun", "dancing", "staying dry"], "correct_choice_idx": 3, "direct_answers": ["rain", "raining", "rain", "stay dry", "stop rain", "rain", "rain protection", "staying dry", "rain", "keep dry"], "difficult_direct_answer": false, "rationales": ["So that her clothes and herself might not get rained on.", "The woman is holding the so she doesn't get wet from the rain.", "The other options obviously don't apply to this scene."], "image": "train2014/COCO_train2014_000000289814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521652, "question_id": "haUNUmpvzhUf7p2MvVmgtM", "question": "One will come here if one wants to do what?", "choices": ["order takeout", "buy shoes", "see movie", "take train"], "correct_choice_idx": 3, "direct_answers": ["buy shoes", "buy shoes", "take train", "take train", "bus ride", "catch bus", "walk", "travel", "rid ebus", "use transit"], "difficult_direct_answer": false, "rationales": ["There are signs indicating this is a stop for public transportation.", "One will take the train if they want to get out of here.", "Platz is german for \"station\"."], "image": "train2014/COCO_train2014_000000521652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91612, "question_id": "haZAyuBp7vE84Q8RAFkpvL", "question": "In which local doe the the woman sit?", "choices": ["park", "zoo", "museum", "farm"], "correct_choice_idx": 0, "direct_answers": ["bench", "unknown", "bench", "rural", "park", "park", "park", "park", "dog park", "park"], "difficult_direct_answer": false, "rationales": ["The woman is on a bench near some trees.", "The dog and the wooden bench indicates that this is a local park the woman is visiting with her pet.", "The woman is sitting in a local park."], "image": "val2014/COCO_val2014_000000091612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532704, "question_id": "hacnye3GS6swACau2RjVBu", "question": "What type of television display technology is being used in the living room?", "choices": ["oled", "crt", "plasma", "lcd"], "correct_choice_idx": 1, "direct_answers": ["crt", "crt", "crt", "crt", "crt", "tube", "old", "crt", "analog", "wii"], "difficult_direct_answer": false, "rationales": ["It's a crt.", "The display is crt technology.", "This is an older type of television."], "image": "train2014/COCO_train2014_000000532704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286550, "question_id": "hafAdKUZ27jFt5WvR2BVxA", "question": "What meeting type is most probably taking place?", "choices": ["recreational", "work", "family", "legal"], "correct_choice_idx": 1, "direct_answers": ["work", "business", "marketing", "business", "business", "business", "presentation", "office", "office", "company"], "difficult_direct_answer": false, "rationales": ["The people all have computers out which means they are working.", "The people are dressed professionally and are in an office type conference room.", "There is a board behind them, laptops on the table, and people dressed in slight business attire."], "image": "train2014/COCO_train2014_000000286550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517296, "question_id": "haoeoezqqdSJDN46HUmZ5C", "question": "What kind of bus is parked in front of the men?", "choices": ["charter", "exhibition", "school", "tour"], "correct_choice_idx": 1, "direct_answers": ["exhibition bus", "exhibition bus", "exhibition bus", "exhibition bus", "exhibition", "exhibition bus", "passenger bus", "exhibition bus", "exhibition", "exhibition"], "difficult_direct_answer": false, "rationales": ["The bus is for an exhibit.", "There is writing on the bus in question that indicates what kind of bus it is.", "The bus has the words \"exhibition bus\" running across the top."], "image": "val2014/COCO_val2014_000000517296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179650, "question_id": "hb82wexxVixmLTho5YZvHA", "question": "What is contained inside the Scissor lift style truck with a rectangular box on it?", "choices": ["airplane food", "passengers", "safety equipment", "baggage"], "correct_choice_idx": 0, "direct_answers": ["people loading", "passengers", "luggage", "food", "cargo", "cargo", "food", "airplane food", "cargo", "food/luggage"], "difficult_direct_answer": false, "rationales": ["The vehicle is moving towards an airplane that is being boarded.", "The plane's food is being transported.", "The company for the box truck is gategourmet. gourmet is usually associated with edible items."], "image": "train2014/COCO_train2014_000000179650.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124609, "question_id": "hbTyMsso5aZKnCVK8m5hXC", "question": "Where is the smoke near the horses coming from?", "choices": ["auto exhaust", "horses kicking", "sticks", "mountains"], "correct_choice_idx": 1, "direct_answers": ["dust", "ground dust", "dirt", "dirt", "dust", "dirt", "people", "dust", "horses kicking", "dirt"], "difficult_direct_answer": false, "rationales": ["Horses make lots of dust when they run on dirt.", "Horses are in a large group and dust is being kicked up as they run together.", "The smoke of this image emanates from clouds of dust being kicked up by the horses."], "image": "train2014/COCO_train2014_000000124609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97483, "question_id": "hbnAmnE7b4FkYfjN8GVNbT", "question": "What is required for this activity?", "choices": ["sun", "snow", "wind", "water"], "correct_choice_idx": 1, "direct_answers": ["skis", "skis poles", "snow", "money", "goggles", "poles", "skis", "skis", "snow", "skis"], "difficult_direct_answer": false, "rationales": ["The ski is designed to slide on a smooth frozen surface.", "The activity is skiing while wearing cold weather gear.", "The man is skiing which requires snow to be on the ground."], "image": "train2014/COCO_train2014_000000097483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173821, "question_id": "hbsxmPhBxxQFMNN8fwhJT5", "question": "Where does the man want to go?", "choices": ["in raft", "in water", "on land", "on boat"], "correct_choice_idx": 1, "direct_answers": ["into water", "water", "water", "in water", "water", "into lake", "swimming", "boat", "in water", "into water"], "difficult_direct_answer": false, "rationales": ["He wants to go in the water.", "The man wants to jump into the ocean.", "The man is jumping off the platform of a boat and wants to fall in the water."], "image": "train2014/COCO_train2014_000000173821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226424, "question_id": "hbv6nLdr3id2W6mCgNjr56", "question": "Why is the boy near the edge of the water crouching down?", "choices": ["he's sick", "to dance", "for balance", "to yell"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfing", "balancing", "for balance", "surfing", "wave", "building sandcastle", "balance", "for balance", "surfing"], "difficult_direct_answer": false, "rationales": ["The boy is crouching near the edge of the water for balance.", "A boy in red is being pushed around by waves. these waves are strong so he gets down so he doesn't fall over.", "The boy is looking to get some balance."], "image": "train2014/COCO_train2014_000000226424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175781, "question_id": "hbxAYy6AyvS5SXPpk66fWa", "question": "What are the toy bears wearing?", "choices": ["tank top", "dress", "sweater", "skirt"], "correct_choice_idx": 2, "direct_answers": ["sweaters", "sweater", "sweaters", "sweaters", "shirts", "sweaters", "sweaters", "cardigans", "boots", "sweaters"], "difficult_direct_answer": false, "rationales": ["They are dressed for winter.", "They have sweaters.", "The bears are wearing knit long sleeve garments with buttons up the front. these attributes are consistent with sweaters."], "image": "train2014/COCO_train2014_000000175781.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338018, "question_id": "hbyB5YaQddYjffvyupK8Ez", "question": "What does the man with the racket want to do next?", "choices": ["dodge ball", "roll", "hit ball", "throw racket"], "correct_choice_idx": 2, "direct_answers": ["hit ball", "hit", "hit ball", "hit ball", "return hit", "return volley", "return serve", "hit ball", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The man is poised to hit the tennis ball back to his opponent.", "The man is preparing to swing his racket in order to make contact and return the ball to the opposite side of the court.", "He's playing tennis and needs the ball to go to the other player"], "image": "train2014/COCO_train2014_000000338018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166678, "question_id": "hc8KVMbxyrTQYNeSD5W9KE", "question": "The filling of this cake is most likely what?", "choices": ["lemon", "chocolate", "strawberry", "beef"], "correct_choice_idx": 2, "direct_answers": ["strawberry", "strawberry", "cheese cake", "raspberry", "strawberry jelly", "strawberry", "strawberry", "frosting", "strawberry", "cream egg"], "difficult_direct_answer": false, "rationales": ["A cake is sliced and has red filling. strawberries are red and are commonly used in baked goods.", "The cake is pink.", "The fruit is a red shade, and that is a common flavor for for food."], "image": "train2014/COCO_train2014_000000166678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430285, "question_id": "hcHRhCt35bTwvHKJarFMBZ", "question": "What are the white blocks on the pizza?", "choices": ["turnips", "tofu", "onions", "pineapple chunks"], "correct_choice_idx": 3, "direct_answers": ["pineapple chunks", "chicken", "pineapple", "onion chunks", "pineapple", "tofu", "pineapple", "mushrooms", "pineapple", "pineapple"], "difficult_direct_answer": false, "rationales": ["The pizza has pineapple on it.", "The chunks are yellowish.", "A common topic on pizza is pineapple's"], "image": "train2014/COCO_train2014_000000430285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523306, "question_id": "hcJR9JG4L5PtJWE6bo2Wbq", "question": "WHat is the elephant husk made of?", "choices": ["gold", "silver", "dentine", "ice"], "correct_choice_idx": 2, "direct_answers": ["ivory", "elephant", "leather", "calcium", "dentine", "ivory", "ivory", "ivory", "ivory", "keratin"], "difficult_direct_answer": false, "rationales": ["This is the hard bony substance", "Body parts are not made out of gold, silver or ice.", "The tusks are made out of ivory, not gold, silver, or ice."], "image": "train2014/COCO_train2014_000000523306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72018, "question_id": "hctbNgni7wqGY8bz9txuJB", "question": "What is folded up next to dresser?", "choices": ["hamper", "luggage", "tent", "pack'n'play"], "correct_choice_idx": 3, "direct_answers": ["baby bed", "pack'n'play", "baby bed", "stools", "blanket", "playpen", "seat", "portable crib", "playpen", "playpen"], "difficult_direct_answer": false, "rationales": ["A portable bed for a baby or small child.", "A playpen of sorts for a baby to stay in.", "This is a portable baby crib"], "image": "val2014/COCO_val2014_000000072018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574940, "question_id": "hdAtm9tsAi7MufLixMV8KT", "question": "Which cloth is good for window curtains?", "choices": ["linen", "silk", "nylon", "cotton"], "correct_choice_idx": 0, "direct_answers": ["polyester", "linen", "polyester", "linen", "linen", "satin", "cotton", "cotton", "canvas", "polyester"], "difficult_direct_answer": false, "rationales": ["Linen is a good cloth for window curtains, and appears to be the material of the window curtains of this good room.", "It is durable and holds up to sunlight", "The linen cloths are best."], "image": "val2014/COCO_val2014_000000574940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255462, "question_id": "hdBGSE2nzGJV9AfVinUimk", "question": "What is the majority of the hill covered in?", "choices": ["sand", "grass", "mud", "rocks"], "correct_choice_idx": 0, "direct_answers": ["sunshine", "sand", "snow", "boat", "sand", "snow sand", "sand", "snow", "sand", "sand"], "difficult_direct_answer": false, "rationales": ["The majority is sandy.", "It is a light brown color and there is no sign of grass which is green.", "The hill is near a body of water."], "image": "train2014/COCO_train2014_000000255462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14353, "question_id": "hdJy5jFbLXgNBGCPT2yXHX", "question": "What will the woman do next?", "choices": ["eat pizza", "cool pizza", "bake pizza", "cut pizza"], "correct_choice_idx": 2, "direct_answers": ["cook pizza", "close oven", "close door", "shut door", "cook", "bake pizza", "close oven", "close door", "close door", "bake pizza"], "difficult_direct_answer": false, "rationales": ["The woman bakes.", "The woman is putting the pizza in the oven with the intention of baking it.", "The pizza is raw so it needs to be baked."], "image": "val2014/COCO_val2014_000000014353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356708, "question_id": "he2D2dmnyHZkDDnz9Y62UT", "question": "Which skier is teaching here?", "choices": ["both", "none", "tallest", "shortest"], "correct_choice_idx": 2, "direct_answers": ["tall one", "taller", "red jacket", "adult", "man", "adult", "tallest", "skating", "older one", "walking"], "difficult_direct_answer": true, "rationales": ["He is looking back at a young skier.", "The shorter person is a kid. the kid is too inexperienced to be a teacher.", "The skier is tallest."], "image": "val2014/COCO_val2014_000000356708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256222, "question_id": "he3xJT4RQc94Z6Q5QkoiBz", "question": "Where will the bare shouldered person most likely go to next?", "choices": ["food store", "foreign country", "mid ocean", "shore"], "correct_choice_idx": 3, "direct_answers": ["deeper water", "sand", "shore", "waves", "waves", "straight", "beach", "water", "further out", "sea"], "difficult_direct_answer": true, "rationales": ["The person will turn around and head back to the beach shore after enjoying the waves.", "The woman is walking and not swimming. you need to walk when going out of the water.", "The bare shouldered person is wading in a body of water. they likely will return back to land soon."], "image": "val2014/COCO_val2014_000000256222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113261, "question_id": "he7uD2W7Cw78HaUcWhf8Wr", "question": "Why is the plane blue and red?", "choices": ["company colors", "easily seen", "cheap paint", "as is"], "correct_choice_idx": 0, "direct_answers": ["company brand", "logo colors", "company colors", "company logo", "american plane", "company colors", "america", "paint", "logo", "company colors"], "difficult_direct_answer": false, "rationales": ["These are the official colors of federal express.", "Airlines decorate there fleets with colors that corresponds to their company that way they are easily identifiable and a good marketing strategy.", "This matches their logo"], "image": "train2014/COCO_train2014_000000113261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340445, "question_id": "he9QHwsPDGZ2a7ETgGQvbv", "question": "What event is taking place here?", "choices": ["car accident", "snow storm", "logging", "road construction"], "correct_choice_idx": 2, "direct_answers": ["tree removal", "tree felling", "logging", "trimming trees", "lumberjacking", "tree removal", "cleanup", "tree cutting", "tree repairs", "tree hauling"], "difficult_direct_answer": true, "rationales": ["The people are cutting down the tree.", "The event is logging.", "They are taking down trees."], "image": "train2014/COCO_train2014_000000340445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91416, "question_id": "heEjruMkoDTktkdg9C7cLQ", "question": "What style food is most likely being prepared in this kitchen?", "choices": ["baked goods", "mexican", "italian", "chinese"], "correct_choice_idx": 3, "direct_answers": ["chinese", "asian", "chinese", "stir fry", "stir fried", "chinese", "asian", "chinese", "chinese", "chinese"], "difficult_direct_answer": false, "rationales": ["Some chinese food served in restaurants are typically cooked in a wok, which is shown to be used by the chef here.", "The chef appears to be of asian, not italian or mexican, descent. he is cooking on a stove, not baking.", "This kitchen is equipped with a wok, which is used to cook asian cuisine."], "image": "val2014/COCO_val2014_000000091416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566501, "question_id": "heHRh582eXkvkKdmoiKLVe", "question": "Why is there no color in the image?", "choices": ["old photo", "photo manipulation", "picture damaged", "smoke affected"], "correct_choice_idx": 1, "direct_answers": ["it's old", "old", "old photo", "photoshop", "photo manipulation", "old photo", "olden times", "old", "filter", "sepia ink"], "difficult_direct_answer": false, "rationales": ["There's photo manipulation.", "The photo has been photoshopped.", "There was a sepia filter used on this image. it is not an old photo because the watermark says 2013."], "image": "train2014/COCO_train2014_000000566501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345263, "question_id": "heRsFVUR9qGuTh3ueQKZnb", "question": "What is near the apples?", "choices": ["cat", "baby", "basketball", "egg"], "correct_choice_idx": 0, "direct_answers": ["cat", "dog", "fruits", "fruits", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["A gray cat is laying next to a box of red fruit.", "A cat is near the basket.", "The apples are clearly visible and identifiable and the only object visibly close has the defining features consistent with answer a."], "image": "train2014/COCO_train2014_000000345263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299051, "question_id": "heXUdMWDBZAfPLNsXhEh99", "question": "How many Chevrolet cars are there?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 0, "direct_answers": ["five", "five", "five", "five", "three", "five", "five", "three", "three", "one"], "difficult_direct_answer": false, "rationales": ["The logo is on the truck.", "The blue truck has the logo on it", "There are four cars shown."], "image": "train2014/COCO_train2014_000000299051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420750, "question_id": "hei5A6wdP26VCLaSjtjK2V", "question": "Why are the tomatoes sitting on the white table?", "choices": ["to cook", "to cut", "to clean", "to sell"], "correct_choice_idx": 3, "direct_answers": ["for sale", "for sale", "for sale", "for sale", "sales presentation", "to sell", "decoration", "to sell", "for sale", "sales"], "difficult_direct_answer": false, "rationales": ["A price tag can be seen so the items displayed are for sale.", "There is a price sign for some of the produce", "There are price tags on all of the produce."], "image": "train2014/COCO_train2014_000000420750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222027, "question_id": "heoeccbXdWxDBenxKbVUNa", "question": "What is explicitly forbidden on the bus?", "choices": ["spitting", "eating", "singing", "talking"], "correct_choice_idx": 1, "direct_answers": ["eating", "smoking", "smoking", "smoking", "smoking", "smoking", "smoking", "smoking", "smoking", "smoking"], "difficult_direct_answer": false, "rationales": ["There is a sign of food and a drink with a line through it, which means it is not permitted.", "There is a sticker above in the bus that shows various pictures of things forbidden. one has food on it with a slash.", "No eating is allowed."], "image": "train2014/COCO_train2014_000000222027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263696, "question_id": "heq7RVLcv3My8EgyrRKex9", "question": "Who owns the umbrella?", "choices": ["unseen person", "black jacket", "brown jacket", "unknown"], "correct_choice_idx": 2, "direct_answers": ["brown jacket", "woman", "brown jacket", "brown jacket", "woman", "woman", "mother", "woman", "woman", "woman"], "difficult_direct_answer": false, "rationales": ["The oldest of the two woman held this over their heads.", "The woman on the right, not left, is holding the umbrella. she owns it.", "The lady on the right is holding it so she probably owns it."], "image": "val2014/COCO_val2014_000000263696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65837, "question_id": "hf8cSFG9ycW8WxAU3mSdBF", "question": "What is this type of food called?", "choices": ["gyro", "burger", "burrito", "hot dog"], "correct_choice_idx": 0, "direct_answers": ["taco", "gyro", "quesadilla", "gyro", "taco", "asian", "burrito", "buritto", "gyro", "wrap"], "difficult_direct_answer": false, "rationales": ["The other options don't match the ingredients.", "The food is greek and wrapped in pita bread.", "The type of spices is the evident."], "image": "train2014/COCO_train2014_000000065837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434486, "question_id": "hfBknEPBZuqqfHBaYbM56T", "question": "What will the person here do next in the game?", "choices": ["serve", "rest", "quit", "return ball"], "correct_choice_idx": 0, "direct_answers": ["serve", "serve", "hit", "serve", "serve", "hit ball", "hit ball", "hit ball", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The player serving often bounce the ball before serving.", "She is getting ready to hit the ball.", "The person wants to hit the bal."], "image": "val2014/COCO_val2014_000000434486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14352, "question_id": "hfNVgbWKXDH8Aexn8tBn72", "question": "What happens when you push the metal button on the back wall?", "choices": ["faucet runs", "toilette flushes", "lights on", "bidet sprays"], "correct_choice_idx": 1, "direct_answers": ["flush", "flush", "flush toilet", "toilette flushes", "flush toilet", "flush", "flush", "toilet flush", "toilet flushes", "flush toilet"], "difficult_direct_answer": false, "rationales": ["There is no bidet. the controls for the lights or faucet would not be placed in that location.", "The toilet will flush.", "The button is meant to flush the toilet."], "image": "val2014/COCO_val2014_000000014352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551717, "question_id": "hfThCbzrX5kZpk5nNarc8P", "question": "What persons enter the open door here?", "choices": ["train execs", "baggage handlers", "all passengers", "engineer only"], "correct_choice_idx": 1, "direct_answers": ["passenger", "male", "workers", "passengers", "bearded man", "man", "passengers", "coach", "travelers passengers", "baggage handlers"], "difficult_direct_answer": true, "rationales": ["It is the baggage car", "The train says coach/baggage on it.", "The writing on the side indicates that baggage goes here."], "image": "train2014/COCO_train2014_000000551717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378448, "question_id": "hfZB5TRKKgzfau7PF7Nn5Z", "question": "What activity is the person who owns these things doing?", "choices": ["taking test", "travelling", "reading", "incarceration"], "correct_choice_idx": 1, "direct_answers": ["travelling", "travelling", "flying", "flying", "computing", "listening music", "hearing music", "traveling", "connecting", "travelling"], "difficult_direct_answer": false, "rationales": ["They are traveling.", "The person is at an airport with his gear so he is likely traveling.", "There is a large airplane in the background and those airport vehicles which means he might be going someplace far. the words \"connect anywhere\" mean he can connect wherever he's going."], "image": "val2014/COCO_val2014_000000378448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502284, "question_id": "hffBbVsJoDGYDCVQbCrrck", "question": "What might you find written on the other side of the bone?", "choices": ["theater advertisement", "wedding invitation", "recipe", "address"], "correct_choice_idx": 3, "direct_answers": ["address", "owners number", "address", "if found", "address name", "address", "name", "owners name", "play", "name"], "difficult_direct_answer": false, "rationales": ["Dog tags usually have location information for a pet's owner, so since the phone number is on the visible side, it's safe to assume the address is on the other side.", "It gives information in case the dog gets lost", "If the dog gets lost and somebody finds it, they can bring the dog back to the owner."], "image": "train2014/COCO_train2014_000000502284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359354, "question_id": "hfjEZmWFKdqQriz2SusyfL", "question": "What is being done in the area beyond the arched opening?", "choices": ["baking", "dining", "serving", "displays"], "correct_choice_idx": 0, "direct_answers": ["pizza making", "cooking pizza", "pizza cooking", "door", "baking", "baking", "cooking", "cooking", "pizza cooking", "baking"], "difficult_direct_answer": false, "rationales": ["It's being baked.", "It is a pizza oven.", "The oven is baking the pizzas."], "image": "train2014/COCO_train2014_000000359354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8223, "question_id": "hfq38moaW6pdZQHJZZ7EYX", "question": "What type of soda is in the image?", "choices": ["sprite", "ginger ale", "coke", "pepsi"], "correct_choice_idx": 3, "direct_answers": ["dark soda", "pepsi", "cola", "cola", "cola", "cola perhaps", "coke/pepsi", "cola", "orange soda", "broccoli"], "difficult_direct_answer": false, "rationales": ["A pale colored beverage is in a cup with ice.", "The soda is a lighter brown and served in a restaurant setting, meaning it's a cola.", "The beverage is dark brown."], "image": "train2014/COCO_train2014_000000008223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512974, "question_id": "hfxADVmMFGQQNVZUyFdko9", "question": "Video game consoles are popularly marked by whom?", "choices": ["dell", "samsung", "sony", "nintendo"], "correct_choice_idx": 3, "direct_answers": ["manufacturers", "nintendo", "nintendo", "manufacturer", "nintendo", "wii", "nintendo", "nintendo", "wii", "nintendo"], "difficult_direct_answer": false, "rationales": ["This is a wii made by them.", "There is a wii logo on the games and manual. samsung and dell do not make video game consoles.", "The game on the counter has a logo for the console manufacturer that makes the wii. samsung and dell make cell phones and computers, not video game consoles."], "image": "val2014/COCO_val2014_000000512974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26413, "question_id": "hg992YGpynTgnVx7VwBfWQ", "question": "Why is he standing inside the square?", "choices": ["is boundary", "keeps ok", "coincidence", "enjoys it"], "correct_choice_idx": 0, "direct_answers": ["hitting ball", "is boundary", "batting", "batting", "batting", "at bat", "batting", "base", "batting", "batter's box"], "difficult_direct_answer": false, "rationales": ["He is where he is supposed to stand to play the game.", "He is inside a marking.", "The baseball player is standing inside the square because it is the designated spot for batters' boundaries."], "image": "val2014/COCO_val2014_000000026413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183677, "question_id": "hgbkYqVaPMWWpKrpnRWjSL", "question": "Which one of the paint is safe for children art work?", "choices": ["enamel paint", "acrylic paint", "oil paint", "emulsion paint"], "correct_choice_idx": 1, "direct_answers": ["orange", "red paint", "orange", "water", "left", "water", "water based", "acrylic paint", "finger", "toxic free"], "difficult_direct_answer": false, "rationales": ["There is certain kind of paint that is easy to wash out. it contains water which will help with cleanup.", "Acrylic paint is safe for kids to use.", "The safest paint for children to use is acrylic paint."], "image": "val2014/COCO_val2014_000000183677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455995, "question_id": "hgqv4WFDNqovBNqrkbnZYf", "question": "What is the leg gear called that the catcher is wearing?", "choices": ["braces", "leg guard", "stockings", "leggings"], "correct_choice_idx": 1, "direct_answers": ["armour", "leg guard", "leg guards", "leg guards", "leg guards", "shin guards", "front", "shin guard", "shin guards", "shin guards"], "difficult_direct_answer": false, "rationales": ["The catcher wears leg guards for protection.", "The catcher is wearing guards.", "The man has protection on the legs."], "image": "train2014/COCO_train2014_000000455995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238887, "question_id": "hgypyAkYkTpDT76BBELWfb", "question": "What appliance sits on the bathroom sink counter?", "choices": ["hair dryer", "kettle", "coffee maker", "hand dryer"], "correct_choice_idx": 2, "direct_answers": ["coffee maker", "coffee maker", "coffee maker", "coffee maker", "coffee maker", "coffee maker", "coffee maker", "coffee maker", "coffee maker", "coffee maker"], "difficult_direct_answer": false, "rationales": ["It's a coffee maker.", "A coffee maker is on the counter.", "Hair dryer as it is use to dry the hair."], "image": "train2014/COCO_train2014_000000238887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446894, "question_id": "hh6hUDJGbu9DFr4MJqkYgW", "question": "What is the use of the pestle and mortar in the picture above?", "choices": ["crash", "none", "smash contents", "mix"], "correct_choice_idx": 2, "direct_answers": ["grinding", "crush veggies", "grind", "grinding", "mix food", "grind", "crushing", "grinding", "stirring", "smash contents"], "difficult_direct_answer": false, "rationales": ["A pestle and mortar is used to smash ingredients.", "The intended use for this equipment is commonly known and not necessarily related to the image.", "The person is grinding up the stuff in it."], "image": "val2014/COCO_val2014_000000446894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558851, "question_id": "hhAeqGLst6wGTAmgyAKpzm", "question": "From what did the animals shown here first emerge?", "choices": ["their mother", "ufos", "eggs", "bacon"], "correct_choice_idx": 2, "direct_answers": ["ocean", "lake", "land", "egg", "shore", "eggs", "land", "eggs", "duck", "water"], "difficult_direct_answer": false, "rationales": ["Birds are hatched when they are born. ducks are birds.", "Ducks come from eggs.", "The eggs emerged."], "image": "train2014/COCO_train2014_000000558851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21534, "question_id": "hhWJbSNSkVXTEu7vKBMZD7", "question": "Who is the man waiting for?", "choices": ["mechanic", "cashier", "doctor", "banker"], "correct_choice_idx": 1, "direct_answers": ["store employee", "waiter", "cashier", "cashier", "cashier", "food", "donuts", "checkout", "buy dessert", "employee"], "difficult_direct_answer": false, "rationales": ["He is at the register waiting to make a purchase", "There is no one at the till to take his order.", "The man is waiting patiently for the cashier to return so he can buy some doughnuts. she had to run into the back for a moment, but will return in two minutes!."], "image": "train2014/COCO_train2014_000000021534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321522, "question_id": "hhWMxLmb2Dhmhiecgfz8Sj", "question": "What type of board is the black one behind the stove?", "choices": ["communication board", "bulletin board", "chalkboard", "whiteboard"], "correct_choice_idx": 2, "direct_answers": ["blackboard", "menu board", "chalk", "chalk", "board", "chalk", "chalkboard", "chalk", "cutting board", "chalkboard"], "difficult_direct_answer": false, "rationales": ["You can write on it and erase it", "The board is written on with a white substance.", "The board is black with white chalk writing on it."], "image": "val2014/COCO_val2014_000000321522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370337, "question_id": "hhWh5woVJbqrNxHq4Ugeud", "question": "Where have the ships stopped?", "choices": ["near barge", "at lighthouse", "at dock", "on island"], "correct_choice_idx": 2, "direct_answers": ["at dock", "dock", "dock", "dock", "dock", "dock", "at port", "dock", "deck", "wharf"], "difficult_direct_answer": false, "rationales": ["The structure attached to the boats is visible and identifiable based on the ropes attached and the material used.", "This shoreline has pilings sticking up at the water's edge for boats to attach to.", "The ships are docked."], "image": "val2014/COCO_val2014_000000370337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128957, "question_id": "hhZrvCFZyoq7NfuiUtDB2A", "question": "Why is the orange cone placed by the plane?", "choices": ["safety", "it fell", "traffic direction", "thrown away"], "correct_choice_idx": 0, "direct_answers": ["safety", "guide", "warning", "safety", "safety", "warning", "safety", "checking engine", "parking side", "safety"], "difficult_direct_answer": false, "rationales": ["A single cone is there so that someone doesn't walk in front of it. they don't want an arm to get sucked into the turbine.", "The orange cone is a warning symbol.", "This is so no one goes near that part of the plane and doesn't get hurt."], "image": "train2014/COCO_train2014_000000128957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204216, "question_id": "hhcWVuyk5DwhwRXa4xR4fn", "question": "Why are the top of the rails by the railroad station shiny?", "choices": ["recently cleaned", "new installation", "metal quality", "wear"], "correct_choice_idx": 3, "direct_answers": ["grease", "newer", "metal", "friction", "cleaner metal", "stone", "electric lines", "wear", "recently replaced", "wear"], "difficult_direct_answer": true, "rationales": ["The top is worn.", "The top of the rails recently got a quick cleanup.", "They are shiny because they are new."], "image": "train2014/COCO_train2014_000000204216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296106, "question_id": "hhd2B8v86cL3wdzccVJQAc", "question": "What is this place?", "choices": ["resort", "circus", "school", "zoo"], "correct_choice_idx": 3, "direct_answers": ["zoo", "zoo", "zoo", "elephant sanctuary", "zoo", "elephant sanctuary", "zoo", "zoo", "zoo", "zoo"], "difficult_direct_answer": false, "rationales": ["The elephants are standing in an enclosure at a zoo.", "The animals are clearly in captivity and there are spectators standing along the sidelines to view them.", "The animals depicted seem to be in some sort of enclosure with spectators nearby separated by a railing which would all be found in answer a."], "image": "train2014/COCO_train2014_000000296106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385540, "question_id": "hiPDAcaftnPXe4wDmFs9np", "question": "What month does this event take place?", "choices": ["september", "june", "january", "july"], "correct_choice_idx": 3, "direct_answers": ["september", "july", "july", "july", "july", "july", "january", "july", "july", "may"], "difficult_direct_answer": false, "rationales": ["This event is staged every july, and its roots stretch back to 1886, when two calgary agricultural societies came together to stage a fair.", "The flag indicates that this event is the calgary stampede. it happens during the summer in the month after june.", "The flag indicates that this event is the calgary stampede. it takes place in the summer after june."], "image": "val2014/COCO_val2014_000000385540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487007, "question_id": "hiTpzYzobLtnpe7T8N9Ddh", "question": "What is this woman going to eat?", "choices": ["steak", "burrito", "taco", "sandwich"], "correct_choice_idx": 3, "direct_answers": ["sandwich", "sandwich", "sandwich", "sandwich", "sandwich", "sandwich", "veggie sandwich", "lunch", "sandwich", "toast"], "difficult_direct_answer": false, "rationales": ["The woman's plate contains two slices of bread, one of which has other foods piled on top of it.", "The woman has two slices of bread with toppings.", "She's having a sandwich."], "image": "train2014/COCO_train2014_000000487007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280766, "question_id": "hiVHSbb9sJjSehhr7tKi4s", "question": "What needs to be done to the wall?", "choices": ["cleaned", "demolished", "hoisted", "painted"], "correct_choice_idx": 3, "direct_answers": ["painted", "paint", "painted", "painting", "decorate", "painted", "paint", "painting", "painted", "paint it"], "difficult_direct_answer": false, "rationales": ["The wall needs paint.", "Walls look good with a fresh coat of paint.", "The wall needs some paint."], "image": "val2014/COCO_val2014_000000280766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66266, "question_id": "hidQ5rCTDXxHcwbjVUMCdx", "question": "The man with the ball has shoes that have a majority color that matches the color of what?", "choices": ["horse", "cow's tongue", "mallard's bill", "zebra"], "correct_choice_idx": 2, "direct_answers": ["mallard's bill", "bananas", "banana", "yellow", "team", "sun", "yellow", "banana", "lemons", "shirt shorts"], "difficult_direct_answer": false, "rationales": ["The shows look just like a bird's beak.", "A red and white ball is shown. mallard's colors are red and white.", "A mallards bill is yellow or sometimes lighter like white."], "image": "train2014/COCO_train2014_000000066266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499915, "question_id": "hie7KBtUKtuPku8U4veCFY", "question": "What kind of event is taking place in the area?", "choices": ["market", "concert", "fair", "protest"], "correct_choice_idx": 3, "direct_answers": ["protest", "protest", "political gathering", "protest", "takeover", "protest", "protest", "protest", "protest", "demonstration"], "difficult_direct_answer": false, "rationales": ["The event is a protest.", "The people are protesting something.", "People are holding signs."], "image": "train2014/COCO_train2014_000000499915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456975, "question_id": "hiopZWc3yVb6ViUD85vdWG", "question": "What fun activity is shown?", "choices": ["free fall", "rollar coaster", "bumper cars", "skiing"], "correct_choice_idx": 3, "direct_answers": ["skiing", "skiing", "skiing", "skiing", "sking", "skiing", "skiing", "skiing", "skiing", "ski"], "difficult_direct_answer": false, "rationales": ["The people have skis.", "The activity is skiing.", "Some people are on skis and others on snowboards."], "image": "train2014/COCO_train2014_000000456975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77473, "question_id": "hirt27CpodSUmk8DWchBsF", "question": "What is the girl in this image most likely looking at here?", "choices": ["television", "another person", "sign", "teacher"], "correct_choice_idx": 0, "direct_answers": ["tv monitor", "television", "television", "tv", "someone", "person", "tv monitor", "television", "television", "tv"], "difficult_direct_answer": false, "rationales": ["Two people are standing and turning their heads. they are both holding a wii remote for video game.", "The woman is standing with a remote control in her hand and has it pointed in front of her.", "She has a wii controller."], "image": "val2014/COCO_val2014_000000077473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336493, "question_id": "hiyNgr2TapvngYuD3RdgT7", "question": "Where is the baseball?", "choices": ["pitcher's glove", "outfield", "catcher's glove", "at batter"], "correct_choice_idx": 2, "direct_answers": ["catcher's glove", "catchers mitt", "pitcher", "catcher", "catchers mitt", "glove", "mitt", "catcher", "mitt", "catcher's mitt"], "difficult_direct_answer": false, "rationales": ["The ball is in the glove.", "The reaction of the players and the position of the glove make it very likely the catcher caught the ball.", "The person is catching the ball."], "image": "val2014/COCO_val2014_000000336493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83476, "question_id": "hjMqaofRj93ndSRibZqeZq", "question": "What is the cat sleeping on?", "choices": ["couch", "outdoors", "floor", "bed"], "correct_choice_idx": 3, "direct_answers": ["sheet", "bed", "bed", "bed", "bed", "remote", "bed", "bed", "bed", "bed"], "difficult_direct_answer": false, "rationales": ["The cat is snoozing on a bed.", "With the pillow and covers, that would be what the piece of furniture is.", "The cat is on a bed."], "image": "val2014/COCO_val2014_000000083476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100124, "question_id": "hjauhSQCBHhGQxEgKxGcpW", "question": "What actress has a first name that can be formed from the initials on the plane?", "choices": ["pia zadora", "lolo jones", "ann dowd", "mia goth"], "correct_choice_idx": 0, "direct_answers": ["pia mia", "pia mia", "patti", "anne hathaway", "pamela anderson", "pia zadora", "pia zadora", "pia zandora", "pia bajpai", "pamela anderson"], "difficult_direct_answer": false, "rationales": ["The name is three letters.", "The actress is pia.", "The initials on the plane spell the word \"pia\" and the most famous pia is an actress named pia zadora."], "image": "train2014/COCO_train2014_000000100124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224306, "question_id": "hjmn3J8yZ7swixeVrYpeKr", "question": "What material is the orange mug to the left of the donut made out of?", "choices": ["ceramic", "plastic", "metal", "glass"], "correct_choice_idx": 3, "direct_answers": ["plastic", "glass", "glass", "ceramic", "glass", "ceramic", "plastic", "glass", "glass", "ceramic"], "difficult_direct_answer": false, "rationales": ["It's a glass cup", "A coffee mug is on a desk. coffee mugs are made of glass.", "The mug is shiny like glass."], "image": "train2014/COCO_train2014_000000224306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96298, "question_id": "hkcu9veVJDNSGrswLTkQpn", "question": "What is the guy with a backpack doing?", "choices": ["dancing", "marching", "mimicking", "running"], "correct_choice_idx": 2, "direct_answers": ["walking", "walking", "mimicking", "crossing", "copying sign", "dancing", "walking", "walking silly", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["There is a yellow sign with a black character. the guy with a backpack is imitating the character.", "A guy is standing in front of a sign with his leg and arm held in the same position as the stick figure on the sign.", "He is posing the same as the human form on the sign"], "image": "train2014/COCO_train2014_000000096298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202829, "question_id": "hkmzRLcjmcd2reQua4q52J", "question": "What is used to make the cake on the top left corner?", "choices": ["milk", "chocolate", "vanilla", "berry"], "correct_choice_idx": 1, "direct_answers": ["chocolate", "chocolate", "chocolate", "chocolate", "chocolate", "chocolate", "chocolate", "chocolate", "chocolate", "chocolate"], "difficult_direct_answer": false, "rationales": ["The brown rectangle pastries in the top left corner are labeled as brownies. brownies are traditionally chocolate flavored.", "This is a brownie", "On the tops shelf they are dark brown."], "image": "train2014/COCO_train2014_000000202829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558457, "question_id": "hknfQ3yXQhzuGwwKLjEKEa", "question": "What type of fruit are the green items on the boys right?", "choices": ["bananas", "potatoes", "papayas", "turnips"], "correct_choice_idx": 0, "direct_answers": ["coconuts", "banana", "bananas", "coconut", "durian", "coconut", "melon", "mangos", "mangoes", "jack fruit"], "difficult_direct_answer": true, "rationales": ["They are in a half circle type shape and are green before they will turn ripe and yellow.", "The fruit on the boy's right are the bananas to our left.", "They are the shape of bananas in the same bundles, they just have not yet ripened."], "image": "val2014/COCO_val2014_000000558457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369023, "question_id": "hko6f9dNPogLT9PD7ZSHWh", "question": "What type of establishment is known to put notecards on beds like this?", "choices": ["hotels", "arenas", "lobbies", "parks"], "correct_choice_idx": 0, "direct_answers": ["hotel", "hotel", "hotel", "hotels", "hotel", "hotel", "hotel", "hotel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["The room has the layout and interior features consistent with answer a in addition to the practice of leaving notecards which is common for this type of establishment.", "Hotels have touches in their rooms such as welcome cards. the bed is perfectly made with a note on the bed.", "The notecards are for people traveling to take notes of things like phone numbers, channels, etc. the air conditioner in the background is commonly found in hotels/motels."], "image": "train2014/COCO_train2014_000000369023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163903, "question_id": "hkof9YiAkj23Q3g7YkE5QY", "question": "What action did the woman just finish doing prior to drying her hands?", "choices": ["pet cat", "wash hands", "paint nails", "fold laundry"], "correct_choice_idx": 1, "direct_answers": ["washing hands", "washing", "washed hands", "washing hands", "washed them", "hand washing", "wash hands", "washing", "washing hands", "wash hands"], "difficult_direct_answer": false, "rationales": ["The woman washed her hands.", "The woman is rubbing her hands against the towel.", "She is washing her hands."], "image": "train2014/COCO_train2014_000000163903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224396, "question_id": "hkrgYHuWcL2wqTALCtN2qd", "question": "What is usually placed on the silver item?", "choices": ["beverages", "medical supplies", "clothing", "food"], "correct_choice_idx": 3, "direct_answers": ["trays", "cookies", "pans", "food plates", "trays", "food", "food", "food", "food", "dishes"], "difficult_direct_answer": false, "rationales": ["It is a table to use for preparing meals.", "These shelves are in the kitchen and already hold some cooking tools. there is ample space for serving food on top of it.", "This is a stainless table often seen in commercial kitchens"], "image": "train2014/COCO_train2014_000000224396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325015, "question_id": "hkt88eZQdLed6SNPFz2RQr", "question": "Where are these 3 kids most likely from?", "choices": ["same mother", "different countries", "nigeria", "china"], "correct_choice_idx": 0, "direct_answers": ["same parents", "same family", "eating", "same mother", "america", "england", "america", "same mother", "chines", "same mother"], "difficult_direct_answer": false, "rationales": ["The children in question have similar looks and features which likely indicates a relationship.", "They're siblings.", "The kids look alike."], "image": "train2014/COCO_train2014_000000325015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517017, "question_id": "hkwfiBtbv6Hu7hjLa6Nuu6", "question": "What countries flag can be seen near the building?", "choices": ["united kingdom", "russia", "united states", "france"], "correct_choice_idx": 2, "direct_answers": ["united states", "usa", "usa", "america", "usa", "usa", "america", "america", "america", "america"], "difficult_direct_answer": false, "rationales": ["The flag is the us's.", "The flag has some stars and lines.", "The stars and stripes on the flag are from the usa."], "image": "train2014/COCO_train2014_000000517017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324999, "question_id": "hm4etXVXRjCKEsAFhHoVrH", "question": "What is this space dedicated to displaying?", "choices": ["planes only", "art", "vehicles", "ubers"], "correct_choice_idx": 2, "direct_answers": ["vintage vehicles", "vehicles", "vehicles", "vehicles", "vehicles", "cars", "vehicles", "cars", "antique vehicles", "cars"], "difficult_direct_answer": false, "rationales": ["The space is the vehicles.", "The space is displaying different types of vehicles.", "This place is dedicated to displaying old vehicles such as cars and aircraft."], "image": "train2014/COCO_train2014_000000324999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77854, "question_id": "hm9jP84jAF7TwTKd272Yhw", "question": "What is the same color as the arrows on the floor?", "choices": ["ketchup", "lime", "orange", "mustard"], "correct_choice_idx": 3, "direct_answers": ["bananas", "numbers", "numbers", "airplane body", "yellow", "white", "aroplane", "airplane", "mustard", "airplane"], "difficult_direct_answer": false, "rationales": ["The arrows share the same color as mustard.", "They are both yellow", "The arrows are yellow. choice a is the only one which is usually yellow in color."], "image": "train2014/COCO_train2014_000000077854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212787, "question_id": "hmEYm5xRSkrxNzahswJo5c", "question": "What type of structure does he live in?", "choices": ["tent", "log cabin", "mud hut", "glass house"], "correct_choice_idx": 1, "direct_answers": ["log cabin", "wood cabin", "house", "cabin", "house", "wood cabin", "wood cabin", "log cabin", "log cabin", "cabin"], "difficult_direct_answer": false, "rationales": ["The walls are rounded like a part of a tree.", "The house is made of wood.", "The walls are made from cut down and treated tree trunks, this gives them the circular shape you see in this photo. there are also cross sections of the tree trunk along the backside of the wall that are visible."], "image": "train2014/COCO_train2014_000000212787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451324, "question_id": "hmsRRHoJjpsHKaqS7mRxwy", "question": "Why is everyone at the table using laptops?", "choices": ["they're hackers", "they're repairmen", "they're criminals", "they're working"], "correct_choice_idx": 3, "direct_answers": ["working", "gambling", "gaming", "working", "they're working", "work", "laptop working", "lan party", "for work", "working"], "difficult_direct_answer": false, "rationales": ["They are all working on stuff.", "The men are dressed professionally for a work environment.", "Everyone is at the table using their laptops to work on a project together because there is an instructions manual on the table with them."], "image": "val2014/COCO_val2014_000000451324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227772, "question_id": "hn9PUyRzKsNwEjHzPa44tR", "question": "The company advertised makes which one of these cars?", "choices": ["accord", "tacoma", "forester", "montego"], "correct_choice_idx": 2, "direct_answers": ["subaru", "forester", "subaru", "subaru", "subaru", "subaru", "subaru", "suv", "subaru", "subaru outback"], "difficult_direct_answer": false, "rationales": ["The company name and logo of subaru can be partially seen. the forester is made by subaru.", "The company is forester.", "Forester makes subaru."], "image": "val2014/COCO_val2014_000000227772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534000, "question_id": "hn9a8uR2vafvRtiwuAm34J", "question": "Where is this man working?", "choices": ["home", "office", "library", "coffee shop"], "correct_choice_idx": 0, "direct_answers": ["kitchen", "dining room", "table", "home", "home", "home", "dining table", "diningroom table", "dining table", "home"], "difficult_direct_answer": false, "rationales": ["A man has commandeered his dining room table to use as his desk. his kitchen is to the right, and double doors leading out back show us he is definately at home.", "The man is at home.", "The man appears to be sitting in the dining room of a home"], "image": "train2014/COCO_train2014_000000534000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146926, "question_id": "hnBpwZVMakCdDgKkHDqw6S", "question": "Where are selling the pizza from?", "choices": ["door", "roof", "window", "gate"], "correct_choice_idx": 2, "direct_answers": ["vw bug", "car", "car window", "car", "car", "window", "car", "car", "car", "mini car"], "difficult_direct_answer": false, "rationales": ["They are selling them from the window of their car.", "They are selling their pizzas from the windows of their cars.", "The boards are sticking out of a car"], "image": "train2014/COCO_train2014_000000146926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412247, "question_id": "hnLhE6M69LgsjsXjKv8DJF", "question": "This girl plays a similar sport to what athlete?", "choices": ["serena williams", "cheryl swoopes", "jennie finch", "alex morgan"], "correct_choice_idx": 2, "direct_answers": ["buster posey", "hank aaron", "babe ruth", "mike trout", "jennie finch", "sammy sosa", "cat osterman", "mike trout", "freddie freeman", "hank aaron"], "difficult_direct_answer": false, "rationales": ["The athlete was a famous softball player.", "Just like jennie finch", "The woman in a batting cage. she is not playing soccer, tennis, or basketball."], "image": "val2014/COCO_val2014_000000412247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394065, "question_id": "hnu3iJj5ws8a5MbVca6hge", "question": "What word is written in black letters?", "choices": ["river", "pest", "signal", "green"], "correct_choice_idx": 2, "direct_answers": ["signal", "signal", "signal", "signal", "signal", "signal", "signal", "signal", "signal", "signal"], "difficult_direct_answer": false, "rationales": ["The truck has the word signal printed on it in black letters.", "This word can be seen spelled out on the truck.", "The word is written out"], "image": "train2014/COCO_train2014_000000394065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529910, "question_id": "hoAkEQ9PZ4oxe8DkyqaLb4", "question": "How many different persons are shown atop a snowboard?", "choices": ["seven", "six", "eight", "one"], "correct_choice_idx": 3, "direct_answers": ["eight", "one", "one", "one", "one", "one", "one", "one", "eight", "one"], "difficult_direct_answer": false, "rationales": ["A person jumping on a snowboard is shown in snapshots, with several moments of the jump being represented.", "The same snowboarder is shown multiple times. there are no other snowboarders.", "A person is shown jumping on a snowboard as they move through the air."], "image": "train2014/COCO_train2014_000000529910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386592, "question_id": "hoR8NAfMVFhm8gPquXv8f9", "question": "Why are only the letters SBwa visible on that sign?", "choices": ["broken bulbs", "correct name", "spray paint", "fallen letters"], "correct_choice_idx": 0, "direct_answers": ["lights out", "broken bulbs", "glitches", "lights out", "lights out", "light works", "broken bulb", "lights out", "burned out", "u missing"], "difficult_direct_answer": false, "rationales": ["The other letters are visible, but they aren't lit up.", "The letters referenced are light up in a style commonly used for signs in urban centers. the letters not lit, but outlined, are likely malfunctioning as the other letters are still lit.", "The full sign says subway. the u and y lights are dead."], "image": "val2014/COCO_val2014_000000386592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88723, "question_id": "hoRRKB2E3XFs6Pnqbqbqif", "question": "What is this place?", "choices": ["store", "hallway", "library", "kitchen"], "correct_choice_idx": 3, "direct_answers": ["kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen", "kitchen"], "difficult_direct_answer": false, "rationales": ["An oven and stove are present in the location making it the room in the house where cooking takes place.", "There are appliances consistent with a kitchen, such as a fridge and stove. the objects in the room are used for cooking, which is something that almost always happens in kitchens.", "It's a kitchen."], "image": "train2014/COCO_train2014_000000088723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376990, "question_id": "hoXRPpxh39SR24rLDvCMNB", "question": "Where would be strange to see the graphic on the hat?", "choices": ["video game", "sticker", "tv show", "real rocket"], "correct_choice_idx": 3, "direct_answers": ["nowhere", "on food", "top", "wedding", "top", "japan", "boy", "real rocket", "hot dog", "cat"], "difficult_direct_answer": true, "rationales": ["There is a hello kitty graphic on the hat. hello kitty has tv shows, stickers, and video games.", "The graphic is the logo for the brand hello kitty. that is a children's toys, clothing, and accessories brand. it would be strange to see this logo on a serious, adult, scientific piece of machinery.", "The graphic is of hello kitty so it would be strange to see it on a rocket."], "image": "val2014/COCO_val2014_000000376990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422686, "question_id": "hoeeNZUd7KXwTb9hKFMdmd", "question": "What sports can both be enjoyed nearby?", "choices": ["none", "skateboarding swimming", "bowling driving", "ice skating"], "correct_choice_idx": 1, "direct_answers": ["swimming surfing", "surfing skateboarding", "surfing swimming", "swimming surfing", "skateboard", "surfing", "skateboarding swimming", "swimming surfing", "surfing skateboarding", "surfingskating"], "difficult_direct_answer": false, "rationales": ["Skateboarding and swimming are enjoyed by the beach and skatepark.", "There is a beach in the background and a skate park in the foreground.", "The sport is skateboarding."], "image": "val2014/COCO_val2014_000000422686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6973, "question_id": "hovX3kAQCaFMQ95Eu9vmX8", "question": "What is this man doing?", "choices": ["return ball", "quit", "loving", "serving"], "correct_choice_idx": 0, "direct_answers": ["playing tennis", "returning", "playing tennis", "playing tennis", "swinging", "playing tennis", "playing tennis", "hitting ball", "return ball", "playing tennis"], "difficult_direct_answer": false, "rationales": ["The man is hitting the ball.", "He is hitting the ball back.", "The man is getting ready to hit the ball back."], "image": "train2014/COCO_train2014_000000006973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497386, "question_id": "hpEhuanmLnSXYSadiazzTq", "question": "For whom is the grey mark on the ground built?", "choices": ["elderly people", "children", "blind people", "pregnant women"], "correct_choice_idx": 2, "direct_answers": ["passengers", "passengers", "passengers", "passengers", "passengers", "conductor", "caution", "passengers", "blind people", "riders"], "difficult_direct_answer": false, "rationales": ["A blind person can feel the bumps on the grey area.", "The texture of the grey mark lets people know by feel where the platform ends.", "The ground is stippled so a person who cannot see can now where to stand."], "image": "train2014/COCO_train2014_000000497386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339589, "question_id": "hpaRTuGpdbQ3En3PRmhYpk", "question": "Where on this street can a car be parked at the curb and left more than a day without being ticketed?", "choices": ["right side", "anywhere", "left side", "nowhere"], "correct_choice_idx": 3, "direct_answers": ["nowhere", "behind post", "right side", "parking meter", "directly ahead", "nowhere", "near right", "town", "nowhere", "nowhere"], "difficult_direct_answer": false, "rationales": ["The are no places seen that someone could park for an extended period of time without being ticketed.", "It's nowhere.", "The signs say that parking is only available for two hours except for sundays."], "image": "train2014/COCO_train2014_000000339589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528851, "question_id": "hpijKFS6GWKE7m9rgt8tBQ", "question": "What are they having to eat?", "choices": ["subs", "pasta", "pork", "chicken"], "correct_choice_idx": 0, "direct_answers": ["sandwiches", "sandwiches", "sandwiches", "sandwiches", "sandwiches", "subs", "sandwiches", "sandwiches", "sandwiches", "sandwiches"], "difficult_direct_answer": false, "rationales": ["The food on the table is bread on long loafs with filling.", "People are standing around in a kitchen. there are sandwiches that are cut up into individual sections on the table. subs are popular party food.", "The food is clear and placed on the table and composed of bread with toppings inside."], "image": "train2014/COCO_train2014_000000528851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471287, "question_id": "hppEVGD8trHYXk8AToJnqu", "question": "What is the woman doing at the window?", "choices": ["breaking in", "selling cupcakes", "admiring room", "waiting"], "correct_choice_idx": 2, "direct_answers": ["gazing", "standing", "eating", "talking", "reading", "looking inside", "resting", "peeping", "peeping in", "admiring room"], "difficult_direct_answer": true, "rationales": ["She appears to be just looking into the room and taking in the view.", "She is looking out.", "The woman looks at the room."], "image": "val2014/COCO_val2014_000000471287.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383950, "question_id": "hppx5Lt8Xai9ag6tfy54H6", "question": "What kind of street is this?", "choices": ["boardwalk", "avenue", "strip", "alley"], "correct_choice_idx": 3, "direct_answers": ["alley", "alley", "alley", "alleyway", "alley", "ally", "flying lips", "alley", "flaming lips", "alley"], "difficult_direct_answer": false, "rationales": ["The sign indicates road type.", "It says alley at the end of the street name.", "There is a street sign. the word after flaming lips indicates the street type."], "image": "val2014/COCO_val2014_000000383950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305589, "question_id": "hqAUKYXQSYWmmNQN2JSj4A", "question": "When not in use how is this phone stored?", "choices": ["flipped closed", "special wallet", "left open", "briefcase only"], "correct_choice_idx": 0, "direct_answers": ["case", "flipped", "flipped closed", "pocket", "flipped down", "in pocket", "flipped closed", "folded", "pocket", "flip closed"], "difficult_direct_answer": false, "rationales": ["It is a flip phone that needs to be flipped open for use or down to power off.", "People close the flip phone so they don't dial any numbers.", "This is a flip phone with a crease in the middle to be able to fold itself."], "image": "val2014/COCO_val2014_000000305589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210983, "question_id": "hqe7LR6xQNbLETvjwPDKph", "question": "In what kind of environment are these unique items and two chickens likely located?", "choices": ["rural", "mountain", "shore", "urban"], "correct_choice_idx": 0, "direct_answers": ["farm", "countryside", "farm", "farm", "rural", "farm", "countryside", "farm", "farm", "countryside"], "difficult_direct_answer": false, "rationales": ["Chickens need a lot of room to roam.", "The chickens are in a farm environment.", "The lack of houses and buildings and the prevalence of nature means that this is a rural location."], "image": "train2014/COCO_train2014_000000210983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367969, "question_id": "hqgU9Sx7NLmem4mUVUcYuF", "question": "Where is the woman likely returning home from?", "choices": ["mall", "drug store", "work", "restaurant"], "correct_choice_idx": 1, "direct_answers": ["work", "shopping", "setting board", "drug store", "board riding", "store", "store", "store", "store", "park"], "difficult_direct_answer": false, "rationales": ["The woman is in a store.", "The woman is holding a bag. there is toilet paper in the bag.", "The woman is holding a shopping bag full of toiletries."], "image": "train2014/COCO_train2014_000000367969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305772, "question_id": "hqickJ9zrzSzD7scnBmiDJ", "question": "In which liquid were these potatoes cooked?", "choices": ["oil", "water", "milk", "blood"], "correct_choice_idx": 0, "direct_answers": ["hot oil", "oil", "hot oil", "oil", "hot oil", "oil", "oil", "oil", "oil", "oil"], "difficult_direct_answer": false, "rationales": ["When you make french fries you fry them in hot oil", "The potatoes in this picture are actually french fries. these are cooked in hot oil.", "The liquid is oil."], "image": "train2014/COCO_train2014_000000305772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325727, "question_id": "hqkPwqafkLE7ssnB3BhLoy", "question": "Which person is most protected if it started to rain?", "choices": ["hoodie girl", "yellow jacket", "umbrella holder", "blue pants"], "correct_choice_idx": 2, "direct_answers": ["umbrella person", "umbrella holder", "man umbrella", "lightbrown shoes", "with umbrella", "far right", "umbrella holder", "under umbrella", "umbrella holder", "umbrella"], "difficult_direct_answer": false, "rationales": ["The person with the umbrella won't get rained on.", "The person has the umbrella.", "The person with the umbrella is protected."], "image": "train2014/COCO_train2014_000000325727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156701, "question_id": "hr6E8pJAz4LS2dqEEHa7Hy", "question": "What does the umbrella here prevent?", "choices": ["soaking", "hail damage", "sunburn", "getting lost"], "correct_choice_idx": 2, "direct_answers": ["sunburn", "sun", "sunburn", "sun", "sunburn", "sun", "sunburn", "sunburn", "uv rays", "sunburn"], "difficult_direct_answer": false, "rationales": ["The umbrella prevents sunburn.", "It blocks the sun.", "The sun is covering the person. the sun is shining brightly. you can get burned from the sun."], "image": "train2014/COCO_train2014_000000156701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426408, "question_id": "hrKwQ6fmpzoWiWrmDPfDAH", "question": "What countries flag can be seen on the man's shirt?", "choices": ["australia", "united kingdom", "china", "africa"], "correct_choice_idx": 1, "direct_answers": ["britain", "england", "england", "great britain", "united kingdom", "united kingdom", "britain", "united kingdom", "britain", "united kingdom"], "difficult_direct_answer": false, "rationales": ["United kingdom is on his shirt.", "The man has a union jack on his shirt.", "The man in the window is wearing a british flag."], "image": "train2014/COCO_train2014_000000426408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483723, "question_id": "hrXxAYFGfqaJ59HohyjeAj", "question": "What is the person sitting near?", "choices": ["hens", "cows", "surfboards", "slippers"], "correct_choice_idx": 2, "direct_answers": ["surfboards", "hammock", "hammock", "hammock", "surfboards", "surfboards", "hammock", "hammock", "hammock", "hammock"], "difficult_direct_answer": false, "rationales": ["There are surfboards stacked up as if for rent.", "The man is sitting on sand and is at the beach.", "There are long boards on a beach"], "image": "val2014/COCO_val2014_000000483723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360747, "question_id": "hripp4ZTsjKBBnUZncs8vv", "question": "Why are the people handing out the back of the truck?", "choices": ["stolen people", "stolen truck", "special skills", "poverty"], "correct_choice_idx": 3, "direct_answers": ["full car", "too many", "vehicle full", "poverty", "no room", "no room", "no room", "riding", "riding", "no room"], "difficult_direct_answer": false, "rationales": ["They're poor.", "The people are poor.", "When people are poor they can't afford to all ride in different vehicles."], "image": "train2014/COCO_train2014_000000360747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549268, "question_id": "hrki4EZA5djNdXqDofYKE8", "question": "What is the slowest thing that can move faster than the large thing here?", "choices": ["airplane", "car", "ant", "horse"], "correct_choice_idx": 3, "direct_answers": ["elephant", "people", "whale", "human", "human", "people", "human", "people", "human", "horse"], "difficult_direct_answer": false, "rationales": ["Horses can't move as quickly and nimbly as ants, cars or planes.", "Horse moves faster than a car or airplane and ant cannot cover a lot of distance.", "A horse is the slowest of the items listed which is also faster than an elephant. a car and airplane are much faster and an ant isn't faster."], "image": "val2014/COCO_val2014_000000549268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547730, "question_id": "hroqPRvkXfEUMY2y93kRQb", "question": "What is the player trying to hit the ball over?", "choices": ["player", "umpire", "net", "basket"], "correct_choice_idx": 2, "direct_answers": ["net", "net", "net", "tennis", "net", "net", "net", "net", "net", "net"], "difficult_direct_answer": false, "rationales": ["He's playing tennis", "Judging by the racket and the ball, this is a tennis game taking place on a tennis court and a tennis court has a net in the middle and that's the direction the man is hitting the ball towards.", "The person is playing tennis."], "image": "train2014/COCO_train2014_000000547730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554740, "question_id": "hrxCNu7xYsKARis9NmcoEC", "question": "What is the purpose of the object?", "choices": ["help you", "provide parking", "call police", "provide food"], "correct_choice_idx": 1, "direct_answers": ["provide parking", "coin collection", "parking meter", "payment", "parking time", "paid parking", "paid parking", "parking", "timing", "parking"], "difficult_direct_answer": false, "rationales": ["The meters are for parking.", "The purpose is for parking.", "These parking meters need to be fed to park."], "image": "val2014/COCO_val2014_000000554740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325043, "question_id": "hs4rKs2y9psuXxHZrbDmho", "question": "What area is the player hitting the tennis ball in?", "choices": ["inner city", "tundra", "desert", "suburban"], "correct_choice_idx": 3, "direct_answers": ["inbounds", "back court", "tennis court", "backcourt", "tennis court", "tennis court", "suburban", "tennis court", "tennis court", "court"], "difficult_direct_answer": false, "rationales": ["There is fans that are rich associated with this game.", "There are trees in the park so it's not inner city", "A girl is playing tennis on an outdoor court with trees around and a car or two in the background."], "image": "val2014/COCO_val2014_000000325043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424712, "question_id": "hs5jmv9ar2a6ntAjHvEJ4s", "question": "What are they all looking at?", "choices": ["boy's phone", "boy's feet", "ground", "bench"], "correct_choice_idx": 0, "direct_answers": ["phone", "music", "boy's phone", "smartphone", "picture", "phone", "tablet", "cell phone", "phone", "video game"], "difficult_direct_answer": false, "rationales": ["They have a phone.", "They're looking at the phone.", "They are looking at the phone being held by the boy in the middle."], "image": "val2014/COCO_val2014_000000424712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187354, "question_id": "hsFfCDdbwmuCwpfccwFRwG", "question": "What is on both sides of the green plate?", "choices": ["cups", "napkins", "utensils", "plates"], "correct_choice_idx": 2, "direct_answers": ["utensils", "utensils", "eating utensils", "spoons", "utensils", "utensils", "eating utensils", "eating utensils", "blue spoons", "utensils"], "difficult_direct_answer": false, "rationales": ["There are plastic utensils.", "A fork, spoon, or knife by a plate are considered to be for use when eating.", "The items look like spoons you use to eat the cake with."], "image": "train2014/COCO_train2014_000000187354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71602, "question_id": "hsFoLS8pNSAF2HPjt6tXap", "question": "What is the man in grey pants doing?", "choices": ["coaching", "joking", "complaining", "singing"], "correct_choice_idx": 0, "direct_answers": ["skiing", "teaching skiing", "teaching", "skiing", "scatting", "coaching", "instructing", "standing", "skiing", "teaching"], "difficult_direct_answer": false, "rationales": ["He is walking with the skiiers.", "The man is coaching.", "The man in the gray suit is showing the people in the group how to ski."], "image": "val2014/COCO_val2014_000000071602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545138, "question_id": "hsKeciRQydvLnvU9w7BUsF", "question": "What does the grey cloth do?", "choices": ["hide motorcycle", "prevent scratches", "prevent bugs", "keep dry"], "correct_choice_idx": 3, "direct_answers": ["protect motorbike", "protect", "covering motorbike", "keep dry", "protect", "cover bike", "protect", "cover motorbike", "ridding", "keeps dry"], "difficult_direct_answer": false, "rationales": ["The cloth covers the bike.", "The cloth keeps dry.", "The tarp is partially covering the bike. this prevents rain from making the seat wet."], "image": "val2014/COCO_val2014_000000545138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448690, "question_id": "hsLjNbxoz6CEnJaknLRnCn", "question": "Which Entity owns this plane?", "choices": ["delta airlines", "us military", "toy stores", "german military"], "correct_choice_idx": 1, "direct_answers": ["us airforce", "us military", "airforce", "united states", "arc", "air force", "military", "america", "america", "arc"], "difficult_direct_answer": false, "rationales": ["The usa flag is on it so the usa military owns it.", "The us flag is on the side of the plane.", "There is an american flag on the tail of the plane."], "image": "val2014/COCO_val2014_000000448690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343503, "question_id": "hsvC5KzUkEPk8sbSYE8AWo", "question": "What feature does the bright red chair probably have?", "choices": ["reclinable", "embedded speakers", "bullet proof", "adjustable height"], "correct_choice_idx": 3, "direct_answers": ["swivel", "adjustable height", "back adjusting", "swivel", "wheels", "ergonomic", "seat tilt", "adjustable", "computer chair", "lumbar support"], "difficult_direct_answer": true, "rationales": ["The feature is the height.", "Desk chairs have controls to change the seat positions.", "The chair is an office chair which usually has a lever to adjust the height."], "image": "train2014/COCO_train2014_000000343503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60706, "question_id": "htM273T3xzgwPmHRzFTz74", "question": "Why are there two laptops on the table?", "choices": ["stolen", "for sale", "on display", "random"], "correct_choice_idx": 2, "direct_answers": ["entertainment", "need information", "for people", "on display", "being used", "workflow optimization", "two workers", "gaming", "being used", "working"], "difficult_direct_answer": true, "rationales": ["They are there so people can look at what's on the screen", "They are showing what they are capable of doing", "The laptops are on the table for purposes of display."], "image": "train2014/COCO_train2014_000000060706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81735, "question_id": "htTw8VzShoi8D896tERAAw", "question": "What color bathroom soap is there?", "choices": ["white", "blue", "yellow", "gray"], "correct_choice_idx": 2, "direct_answers": ["yellow", "orange", "brown", "gold", "orange", "orange", "brown", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["Its yellow in color.", "The bar of soap next to the sink is yellow.", "The rounded rectangular item next to the bathroom sink is yellow colored."], "image": "train2014/COCO_train2014_000000081735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445928, "question_id": "htcAQZBTpYLuv9tYxpT2UW", "question": "Where is the function attended by the crowd taking place?", "choices": ["outdoors", "country club", "restaurant", "auditorium"], "correct_choice_idx": 0, "direct_answers": ["outside", "city square", "forward", "streets", "outdoors", "outdoors", "parade", "outside", "social gathering", "protest"], "difficult_direct_answer": false, "rationales": ["The people in the crowd are holding umbrellas. they would not be doing this if they were inside.", "The people are holding umbrellas to protect them from the weather.", "Since most of the people have umbrellas over their heads, the event would not be taking place at one of the indoor venues which only leaves option a."], "image": "train2014/COCO_train2014_000000445928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145684, "question_id": "htxpK3h6jsVbpc8CL6Vett", "question": "What type of boots is the woman wearing?", "choices": ["uggs", "cowboy boots", "rain boots", "fashion boots"], "correct_choice_idx": 2, "direct_answers": ["rain boots", "rubber", "rainboots", "snow boots", "gumboots", "rain boots", "rain boots", "rain boots", "rubber", "wellingtons"], "difficult_direct_answer": false, "rationales": ["Rain boots are usually worn in bad weather and are usually yellow.", "They are waterproof and keep her feet dry.", "These keep your feet from getting wet"], "image": "val2014/COCO_val2014_000000145684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380828, "question_id": "hu3KPkRm6NsZPbBV3M2D8j", "question": "Why are they sitting on a pile of books?", "choices": ["their job", "found", "is bookstore", "are stolen"], "correct_choice_idx": 2, "direct_answers": ["for bench", "customized chair", "is bookstore", "they're chairs", "resting", "customized chair", "resting", "bench", "waiting", "bench"], "difficult_direct_answer": false, "rationales": ["They're at a bookstore.", "The bench is using a book theme.", "It is to get people's attention to the store"], "image": "val2014/COCO_val2014_000000380828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350974, "question_id": "huWkfUbCojm6UQ8cA5netN", "question": "What is the purpose of the yellow can shown?", "choices": ["music", "sales", "trash disposal", "delivery"], "correct_choice_idx": 2, "direct_answers": ["trash disposal", "mailbox", "post box", "trash", "garbage collection", "trash can", "dustbin", "placing trash", "collect garbage", "mail"], "difficult_direct_answer": true, "rationales": ["A large yellow can is on a sidewalk. trash cans are on sidewalks to dispose of trash.", "It's for trash disposal.", "Trash is placed into receptacles that are cylindrical and tall on the street."], "image": "val2014/COCO_val2014_000000350974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313948, "question_id": "huXnxDJa6ob8qokTaQRy5c", "question": "What sort of building is seen here?", "choices": ["expo hall", "barn", "school", "kitchen furnishing"], "correct_choice_idx": 0, "direct_answers": ["convention center", "warehouse", "expo hall", "convention center", "convention center", "convention center", "exposition", "showroom", "convention center", "exposition"], "difficult_direct_answer": false, "rationales": ["By looking at this photo, it clearly is not a kitchen, school or barn.", "This is a trade show.", "People are looking at motorcycles on display."], "image": "val2014/COCO_val2014_000000313948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353790, "question_id": "hua7zE92QpueSsEb8zjNNP", "question": "What material are the white rounds in the jar made of?", "choices": ["paper", "plastic", "cotton", "ice"], "correct_choice_idx": 2, "direct_answers": ["cotton", "cotton", "soap", "cotton", "cotton", "soap", "cotton", "cotton", "cotton", "cotton"], "difficult_direct_answer": false, "rationales": ["Cottons balls are made of cotton.", "They are shaped into small balls for personal care use.", "Those are cotton balls."], "image": "train2014/COCO_train2014_000000353790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281230, "question_id": "huhdx55ghCxd8yM53BSPQT", "question": "How many people are kiteboarding in this photo?", "choices": ["two", "four", "one", "three"], "correct_choice_idx": 3, "direct_answers": ["one", "three", "one", "one", "one", "three", "2 people", "three", "one", "three"], "difficult_direct_answer": false, "rationales": ["One person is shown.", "Three kites are shown in the air.", "There are 3."], "image": "train2014/COCO_train2014_000000281230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178415, "question_id": "huxg3QKph9q7jWd3HkxbVa", "question": "How many people was this dish prepared for?", "choices": ["eight", "three", "seven", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "one", "one", "1 person", "two", "one", "one"], "difficult_direct_answer": false, "rationales": ["It is only a few meatballs with cheese", "There is just one.", "This is an entree that one might find at a restaurant where it is most common to have plates and meals served for one person."], "image": "val2014/COCO_val2014_000000178415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357898, "question_id": "huzurcrpjzmPtLWghTp6EL", "question": "What is the raw material for tissue paper?", "choices": ["clothes", "cottons", "bleached paper", "paper pulp"], "correct_choice_idx": 3, "direct_answers": ["paper pulp", "pulp", "pulp", "cotton", "trees", "pulp", "paper", "pulp", "trees", "cotton"], "difficult_direct_answer": false, "rationales": ["Tissue paper is made of paper pulp.", "The material is pulp.", "The tissue paper on the roll next to the woman is made from a raw material called paper pulp."], "image": "train2014/COCO_train2014_000000357898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376557, "question_id": "hv6oCFtGoLZvn2MvCnxUyj", "question": "What is the man using the kite to do?", "choices": ["fly", "climb", "surf", "catch birds"], "correct_choice_idx": 2, "direct_answers": ["surf", "surf", "ski", "surf", "surf", "surf", "parasail waterskiing", "wind surf", "watergliding", "water ski"], "difficult_direct_answer": false, "rationales": ["The man is parasurfing.", "The kite is attached via harness to a person seen on a board in the water which is consistent with the activity of answer a.", "The person is being towed on the water by the kite, thus, performing the activity mentioned in the option."], "image": "train2014/COCO_train2014_000000376557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355231, "question_id": "hvfvajbgGJv6yxEGcC3Ypw", "question": "What was the traditional use for these hats?", "choices": ["armor", "farming", "camouflage", "purely aesthetic"], "correct_choice_idx": 1, "direct_answers": ["rice hats", "rice farming", "farming", "carrying rice", "rainhats", "sun protection", "block sun", "sun protection", "farming", "protection"], "difficult_direct_answer": false, "rationales": ["The hats being worn are clear and used to keep the sun off one's face as would be necessary for one working in an open outdoor area.", "They were popular in asia to protect faces from the hot sun while farming.", "These hats are traditionally used by farmers for field work."], "image": "train2014/COCO_train2014_000000355231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351157, "question_id": "hvjsmUrVSuUn2qmjCR6KQf", "question": "What type of business is this?", "choices": ["doctor", "barber", "bank", "deli"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "deli", "sandwich shop", "restaurant", "sandwich shop", "restaurant delicatessen", "deli", "deli", "delicatessen", "eatery"], "difficult_direct_answer": false, "rationales": ["There are sandwiches piled on top of a glass counter where they are being sold.", "The place sells subs.", "There are multiple submarine style sandwiches on serving trays there making it a food related business."], "image": "train2014/COCO_train2014_000000351157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433830, "question_id": "hwXnkQTWQcnM6Y6d9GjJvs", "question": "What color sweater is the toddler on the little skateboard wearing?", "choices": ["white", "brown", "olive", "cream"], "correct_choice_idx": 2, "direct_answers": ["green", "grey", "olive", "green", "green", "green", "olive", "beige", "olive green", "green"], "difficult_direct_answer": false, "rationales": ["The boy has a light green shirt on.", "Olive is a light green.", "The sweater is olive."], "image": "train2014/COCO_train2014_000000433830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236316, "question_id": "hwhZWmtdisg7jiNerTxgTY", "question": "What classification of food is being eaten with the fork?", "choices": ["meat", "fish", "fruit", "vegetables"], "correct_choice_idx": 2, "direct_answers": ["salad", "salad", "vegetable", "salad", "fruit", "vegetables", "healthy", "vegetable", "salad", "leafy greens"], "difficult_direct_answer": false, "rationales": ["Avocado is a fruit.", "In the container with the fork, the person is eating a salad that is composed of leafy greens. leafy greens are vegetables.", "It is an avocado"], "image": "train2014/COCO_train2014_000000236316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53800, "question_id": "hwnYLLZPvvrS5ZMFtRWEB8", "question": "What is the man helping the young boy do?", "choices": ["play games", "learn math", "count", "finish puzzle"], "correct_choice_idx": 0, "direct_answers": ["play game", "play game", "play games", "play wii", "play wii", "video game", "watchin tv", "vedo game", "play wii", "play games"], "difficult_direct_answer": false, "rationales": ["The man is helping the child use a motion controller for a video game.", "The boy has a remote.", "The boy has a console."], "image": "val2014/COCO_val2014_000000053800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421193, "question_id": "hwpHWZUoApKkwydTTP5Kwz", "question": "Persons here wait to do what?", "choices": ["board", "uber", "catch cab", "depart"], "correct_choice_idx": 0, "direct_answers": ["board train", "board train", "travel", "board train", "travel", "travel", "board", "board train", "board train", "board train"], "difficult_direct_answer": false, "rationales": ["People want to board.", "People here wait on a train platform to board the train.", "The people are waiting alongside the tracks to get on the train. if they were getting off the train, then they wouldn't be waiting but instead would be leaving."], "image": "train2014/COCO_train2014_000000421193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79628, "question_id": "hwujqXjdqf7Z2WBhFMpa5B", "question": "What does the truck have a spare of on the back?", "choices": ["gas tank", "tire", "motor", "seat"], "correct_choice_idx": 1, "direct_answers": ["tire", "tire", "spare tire", "tire", "spare tire", "tire", "tire", "tire", "tire", "tire"], "difficult_direct_answer": false, "rationales": ["There is a tire on the back of the truck.", "There is a spare tire at the back.", "There is an extra tire."], "image": "train2014/COCO_train2014_000000079628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111694, "question_id": "hwzTiKvHFWsnxZCYzzfgAM", "question": "What is the name of the structure the bench is sitting on?", "choices": ["ramp", "pier", "dais", "island"], "correct_choice_idx": 1, "direct_answers": ["pier", "boardwalk", "boardwalk", "pier", "boardwalk", "pier", "war", "boardwalk", "boardwalk", "pier"], "difficult_direct_answer": false, "rationales": ["The structure is raised and surrounded by water as it extends out. these features are consistent with answer a.", "The structure is seen extending out into the ocean off of the beach. this type of structure is frequently found at a beach.", "This is a pier that you can walk out on to look at the water."], "image": "train2014/COCO_train2014_000000111694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143450, "question_id": "hx7vMBZhqiMfS89zz49gJW", "question": "Which one of these holidays would this cake be appropriate for?", "choices": ["independence day", "thanksgiving", "christmas", "easter"], "correct_choice_idx": 0, "direct_answers": ["july 4th", "independence day", "july fourth", "independence day", "independence day", "independence day", "independence day", "july fourth", "independence", "july 4th"], "difficult_direct_answer": false, "rationales": ["Fourth of july is celebrated with american flags.", "The cake would be good for fourth of july.", "It has a flag in it"], "image": "val2014/COCO_val2014_000000143450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99785, "question_id": "hx8Bxva2cJDqqWhhn4imU7", "question": "Where are these children located?", "choices": ["hotel", "hospital", "classroom", "playground"], "correct_choice_idx": 0, "direct_answers": ["bed", "bed", "hotel room", "bedroom", "room", "on bed", "hotel", "bedroom", "bed", "hotel"], "difficult_direct_answer": false, "rationales": ["They're in a hotel.", "Water appears out of the bedroom window past the balcony. the children are wearing admission bands and have their faces painted so they may be on vacation.", "They look to be in a bedroom."], "image": "train2014/COCO_train2014_000000099785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577838, "question_id": "hxETGdsTYMPG2TqCdRD2UF", "question": "What setting is this venue?", "choices": ["park", "farm", "zoo", "wilderness"], "correct_choice_idx": 1, "direct_answers": ["grassland", "farm", "field", "park", "field", "pasture", "ranch", "field", "farm", "country"], "difficult_direct_answer": false, "rationales": ["The animals are seen in a farm.", "It seems to be outdoors but not in a zoo environment but it has fencing that a farm would have.", "The animals displayed are typical farm animals and there is fencing visible in the background which is used on farms to keep the animals contained."], "image": "train2014/COCO_train2014_000000577838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544140, "question_id": "hxF5o3EpSg9VvWFJkcp2Mf", "question": "What is the person in the green hoody practicing?", "choices": ["skateboarding", "dancing", "snowboarding", "skiing"], "correct_choice_idx": 0, "direct_answers": ["yoga", "skateboarding", "skateboarding", "skating", "moving", "skateboarding", "can't see", "sweeping", "skateboarding trick", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The person is clearly identifiable and is utilizing the equipment required for answer a based on the board and wheels style and design underneath his feet.", "He is grinding on a rail.", "The person is on a skateboard and is practicing a trick commonly associated with skateboarding."], "image": "val2014/COCO_val2014_000000544140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86183, "question_id": "hxJBuakeFNewt5Az7RkLjh", "question": "What are the two people doing with their motorcycles?", "choices": ["driving", "parking", "resting", "posing"], "correct_choice_idx": 3, "direct_answers": ["posing", "posing", "posing", "posing with", "posing with", "posing", "posing with", "posing", "posing", "posing"], "difficult_direct_answer": false, "rationales": ["They are taking a photo.", "The two guys are posing for a photo and trying to look cool.", "They are modeling for the photographer."], "image": "train2014/COCO_train2014_000000086183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340108, "question_id": "hxSZ9XQVoGjrXh7addx4Nm", "question": "What is on the floor?", "choices": ["milk", "bananas", "sand", "crumbs"], "correct_choice_idx": 2, "direct_answers": ["snow", "snow", "snow", "snow", "snow", "snow", "snow", "sand", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["It's snow on the ground", "The ground is a sandy surface.", "The floor is sandy."], "image": "train2014/COCO_train2014_000000340108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380039, "question_id": "hxwHuMMb5qfyoKmkNNGRdG", "question": "What type of vehicle is the cat sitting on?", "choices": ["quad", "motorcycle", "scooter", "jetski"], "correct_choice_idx": 2, "direct_answers": ["motorcycle", "scooter", "scooter", "scooter", "scooter", "moped", "scooter", "moped", "motorcycle", "moped"], "difficult_direct_answer": false, "rationales": ["The cat is perched on the scooter seat.", "The cute cat is sitting on a motor scooter.", "The vehicle is a scooter."], "image": "train2014/COCO_train2014_000000380039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337233, "question_id": "hxze8Un3qkWgBC99i6hfBV", "question": "What number do you get if you take the largest jersey number and then subtract the smallest jersey number from it?", "choices": ["eight", "99", "five", "20"], "correct_choice_idx": 0, "direct_answers": ["eight", "eight", "ten", "eight", "eight", "eight", "eight", "ten", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["Ten minus two is eight.", "10 minus 2 gets you this number", "Ten minus two is eight."], "image": "val2014/COCO_val2014_000000337233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231991, "question_id": "hy4fVekT2yoVakW9Wu9q4i", "question": "What is he doing?", "choices": ["eating fruit", "playing game", "recording voice", "testing mouse"], "correct_choice_idx": 2, "direct_answers": ["checking laptop", "on computer", "on laptop", "computer work", "recording voice", "watching", "working", "working", "recording voice", "dictating"], "difficult_direct_answer": false, "rationales": ["He has a black stick in his hand held up close to his face which he needs to speak into.", "He is holding a microphone to record himself.", "He is holding a voice recorder up to his mouth."], "image": "val2014/COCO_val2014_000000231991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141532, "question_id": "hy87yC7gnbeDusd8wdEFpM", "question": "Why is the boat attached to a rope?", "choices": ["prevent theft", "prevent moving", "mark place", "protect fish"], "correct_choice_idx": 1, "direct_answers": ["prevent moving", "moored", "balancing", "keep secure", "docked", "tied down", "docked", "keep anchored", "docking", "docked"], "difficult_direct_answer": false, "rationales": ["The waves can make the boat drift away.", "To keep the boat tied to the dock.", "The rope keeps the boat attached to the dock so it wont float away."], "image": "train2014/COCO_train2014_000000141532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43163, "question_id": "hyS8jico3EKB4kBUpuxx6V", "question": "What is the man doing?", "choices": ["eating", "sleeping", "relaxing", "lunging forward"], "correct_choice_idx": 3, "direct_answers": ["playing tennis", "playing tennis", "hitting ball", "tennis", "swinging", "playing tennis", "playing tennis", "hitting ball", "lunging forward", "tennis"], "difficult_direct_answer": false, "rationales": ["He is stretching as he reaches to hit the ball", "The man has one foot in front of the other.", "The man lunges."], "image": "train2014/COCO_train2014_000000043163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154193, "question_id": "hykuaX6iyzGcVkYu6JbC93", "question": "Why is the woman wearing a ring on her fourth finger?", "choices": ["she's married", "fashion", "showing off", "style"], "correct_choice_idx": 0, "direct_answers": ["married", "wedding ring", "married", "she's married", "she's married", "married", "married", "married", "married", "married"], "difficult_direct_answer": false, "rationales": ["A ring on the left hand and this finger is traditionally done to demonstrate answer a.", "The woman is not single.", "Gold bands on the left hand are commonly worn by people who are married."], "image": "val2014/COCO_val2014_000000154193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515820, "question_id": "hyuzNQfMBkPjrgQkqs6Jny", "question": "What type of station is this?", "choices": ["taxi station", "car park", "train station", "subway station"], "correct_choice_idx": 2, "direct_answers": ["train", "train station", "train", "subway", "train", "train", "train", "train", "train station", "train"], "difficult_direct_answer": false, "rationales": ["The station appears next to parallel sets of rails which is consistent with the transportation type of answer a.", "The visible train tracks on either side of the platform indicate the usage for this station.", "People can wait for trains here."], "image": "val2014/COCO_val2014_000000515820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125076, "question_id": "hyvukCvhXWjqgUyonrEFKL", "question": "What type of internet service is being utilized by the computer?", "choices": ["cellular", "dsl", "fiber", "cable"], "correct_choice_idx": 3, "direct_answers": ["dsl", "wifi", "wifi", "cable", "high speed", "broadband", "dsl", "broadband", "wireless", "wi fi"], "difficult_direct_answer": false, "rationales": ["There are cables hooked up everywhere.", "It's using a cable.", "The computer has cable cords."], "image": "val2014/COCO_val2014_000000125076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419117, "question_id": "hzV8GAHcgxvuSpF2SFtNVi", "question": "From which vegetable is the main side dish sourced from mainly?", "choices": ["lettuce", "cabbage", "apples", "pears"], "correct_choice_idx": 1, "direct_answers": ["cabbage", "cabbage", "cabbage", "cabbage", "cabbage", "cabbage", "cucumbers", "cabbage", "cabbage", "lettuce"], "difficult_direct_answer": false, "rationales": ["The side dish on the plate is cole slaw which is made of shredded cabbage.", "It is from cabbage.", "The side dish is coleslaw which has a primary ingredient of shredded cabbage."], "image": "val2014/COCO_val2014_000000419117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498175, "question_id": "hzs4Q8E24wEzeL9hM4V4T2", "question": "Which person is the teacher?", "choices": ["black pants", "green clothes", "red clothes", "white pants"], "correct_choice_idx": 2, "direct_answers": ["right woman", "in red", "red jacket", "in red", "red jacket", "oldest girl", "red", "red clothes", "red jacket", "in red"], "difficult_direct_answer": false, "rationales": ["The person in red is tallest.", "Answer a is the oldest person visible surrounded by children wearing bibs over their jackets consistent with a teacher and student relationship in this setting.", "The person in red is the oldest."], "image": "train2014/COCO_train2014_000000498175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190326, "question_id": "hzshkW7rPmcBoFVdeS7zJu", "question": "When stray cats jump in the road they rely on what to keep them save from getting hit?", "choices": ["traffic lights", "drivers", "crosswalk", "other cats"], "correct_choice_idx": 1, "direct_answers": ["drivers", "speed", "speed", "speed", "speed", "hearing sight", "feet", "speed", "feet", "feet"], "difficult_direct_answer": false, "rationales": ["The cats are near a street where vehicles would be passing by and to be safe they sit on top of parked vehicles.", "A stray cat is one that is running around and doesn't have a home. people in vehicles have to look out and brake and swerve to avoid hitting them in road.", "The answer is not related to the image, but if a cat were in the road there would not be any other option to protect them naturally."], "image": "val2014/COCO_val2014_000000190326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229352, "question_id": "hzv5egFAWh9XFK6M6suU4F", "question": "Why does the man have his hands opened?", "choices": ["to catch", "dance moves", "to clap", "balance"], "correct_choice_idx": 0, "direct_answers": ["catching frisbee", "catching frisbee", "to catch", "catching frisbee", "catch frisbee", "catching", "catching", "catching frisbee", "to catch", "catching"], "difficult_direct_answer": false, "rationales": ["He is trying to get the frisbee.", "He is in position to catch the frisbee.", "His goal is to grab hold of the frisbee, so \"a\" must be correct."], "image": "train2014/COCO_train2014_000000229352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350435, "question_id": "hzwpV3JXk3A4CeQ8qDYgFP", "question": "What mechanism the the motorcyclists just engage?", "choices": ["pit", "ramp", "flat surface", "sand pit"], "correct_choice_idx": 1, "direct_answers": ["jump", "pop wheelie", "ramp", "throttle", "stunts", "ramp", "engine", "riding", "speed", "jumps"], "difficult_direct_answer": true, "rationales": ["The motorcyclists are going up a ramp.", "The ramp can be seen below them and they are in the air, indicating they used the ramp for momentum.", "The motorcycles taken off from a point."], "image": "train2014/COCO_train2014_000000350435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393544, "question_id": "i2MCjupNUp2zSWRU9hkhPT", "question": "What is the man with a white shirt and light green shorts taking here?", "choices": ["skateboard", "risk", "photo", "nothing"], "correct_choice_idx": 2, "direct_answers": ["picture", "picture", "picture", "photos", "photos", "photos", "photos", "photos", "photo", "picture"], "difficult_direct_answer": false, "rationales": ["He is taking a picture.", "The man is taking a photo with a camera.", "The man has a camera in his hands."], "image": "train2014/COCO_train2014_000000393544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252300, "question_id": "i2WDFbVXEuZf9n8dP35NHq", "question": "What is the relationship between the boys wearing shirts of different colors in this situation?", "choices": ["teammates", "competitors", "classmates", "coworkers"], "correct_choice_idx": 1, "direct_answers": ["competitors", "differentiates teams", "different teams", "opponents", "opponents", "opponents", "opponents", "soccer team", "teammates", "opposing teams"], "difficult_direct_answer": false, "rationales": ["The kids are on different teams.", "They're competitors.", "The boys are wearing different color uniforms and are playing soccer."], "image": "val2014/COCO_val2014_000000252300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541169, "question_id": "i2ZmC3MGiFCVTGy56JwTQr", "question": "What is the brown building likely to be?", "choices": ["shopping center", "museum", "government building", "university"], "correct_choice_idx": 1, "direct_answers": ["museum", "castle", "umbrella", "castle", "castle", "museum", "school", "sempione park", "museum", "umbrella"], "difficult_direct_answer": false, "rationales": ["The building appears old and the people walking about appear older and more professional.", "The building is a museum.", "Looks like a museum."], "image": "train2014/COCO_train2014_000000541169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550438, "question_id": "i2b2sTdNHbW5gC3MCUWAFj", "question": "What other food is popular to cook using this tool?", "choices": ["rice", "smoothies", "cookies", "steak"], "correct_choice_idx": 3, "direct_answers": ["brats", "hamburgers", "brats", "hotdogs", "hamburgers", "hamburger", "hamburgers", "burgers", "steak", "steak"], "difficult_direct_answer": false, "rationales": ["The food is steak.", "A grill is usually used for cooking meat products.", "Meat is a good thing to cook on an outdoor grill."], "image": "train2014/COCO_train2014_000000550438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506115, "question_id": "i2nwph3F3ywceMoSM4Gc8A", "question": "What is unusual about the soccer ball being held up by the girl in black and gray striped shirt?", "choices": ["it's overblown", "it's larger", "it's airless", "nothing"], "correct_choice_idx": 2, "direct_answers": ["it's deflated", "flat", "flat", "squished", "deflated", "flat", "flat", "it's airless", "deflated", "flat"], "difficult_direct_answer": false, "rationales": ["The soccer ball she is holding is squished, so it must not have enough air.", "The ball is airless.", "The ball is deflated."], "image": "val2014/COCO_val2014_000000506115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395198, "question_id": "i2vLFqMXoY9YVaCHJVuTXZ", "question": "What is the woman riding?", "choices": ["bike", "skateboard", "scooter", "motorcycle"], "correct_choice_idx": 1, "direct_answers": ["skateboard", "skateboard", "skateboard", "skateboard", "skateboard", "skating", "skateboard", "skating", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["A skateboard is long and rectangle with wheels.", "She is on a skateboard.", "It is a four-wheeled wood surface without handles."], "image": "train2014/COCO_train2014_000000395198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475939, "question_id": "i37bY734g2kE7sbGJbAmrr", "question": "What country produces a large number of these yellow food items?", "choices": ["greenland", "siberia", "wessex", "india"], "correct_choice_idx": 3, "direct_answers": ["brazil", "india", "latin america", "india", "brazil", "india", "india", "india", "india", "thailand"], "difficult_direct_answer": false, "rationales": ["India produces these.", "India is a large producer of bananas.", "India has a warm and tropical climate."], "image": "train2014/COCO_train2014_000000475939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561957, "question_id": "i3PLfiRujyRZN7ugSp2Uhw", "question": "The person playing with the Frisbee is doing so during which season?", "choices": ["fall", "winter", "summer", "spring"], "correct_choice_idx": 3, "direct_answers": ["summer", "spring", "spring", "spring/summer", "spring", "summer", "spring", "spring", "spring", "spring"], "difficult_direct_answer": false, "rationales": ["The grass is green and people are wearing light clothes.", "The season is spring.", "The leaves are still small and emerging"], "image": "train2014/COCO_train2014_000000561957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180670, "question_id": "i3SDSMiN2ppRLMEMfHKLVx", "question": "What type of collectable is the large blue and white vase a part of?", "choices": ["textiles", "nature", "periodicals", "antique"], "correct_choice_idx": 3, "direct_answers": ["antique", "china", "china", "china", "porcelain", "traditional display", "china", "antiques", "glass", "art show"], "difficult_direct_answer": false, "rationales": ["The collectible is an antique.", "Given it's being on display and rustic but delicate appearance we can assume this is an antique.", "The large blue and white vase is a part of the antique collection of this museum."], "image": "train2014/COCO_train2014_000000180670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352835, "question_id": "i4ET9eMfXvpTqEv8Ew7C8J", "question": "What color is the chocolate on top of the white plate?", "choices": ["brown", "white", "yellow", "black"], "correct_choice_idx": 3, "direct_answers": ["dark brown", "dark brown", "brown", "brown", "dark brown", "black", "chocolate", "brown", "dark", "dark brown"], "difficult_direct_answer": false, "rationales": ["Chocolate has some milk added to it which gives it a brown consistency rather than black.", "The chocolate is dark, but not complete black.", "That is always the color of chocolate"], "image": "train2014/COCO_train2014_000000352835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70087, "question_id": "i4LySKQHuZ9vXsdezcESjn", "question": "What is this vehicle trying to do?", "choices": ["hit pedestrians", "nothing", "park", "turn around"], "correct_choice_idx": 3, "direct_answers": ["turn", "bus", "turn", "park", "cross lanes", "turn around", "park", "bus", "turn left", "park"], "difficult_direct_answer": false, "rationales": ["The bus is trying to turn.", "It's the only answer that makes sense. it's in a parking lot and it is in a position to turn around.", "The bus is turning."], "image": "train2014/COCO_train2014_000000070087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51773, "question_id": "i4Mo9ojJB4vYAHF4EGvuS2", "question": "What is the woman and horse here engaged in?", "choices": ["rodeo", "candy tasting", "hack pulling", "competition"], "correct_choice_idx": 3, "direct_answers": ["competition", "competition", "riding", "equestrian event", "horseriding", "competition", "riding", "competition", "equestrian event", "competition"], "difficult_direct_answer": false, "rationales": ["There is a clock and portapotties in the distance", "The woman and horse are competing in polo.", "There is a game timer in the background."], "image": "val2014/COCO_val2014_000000051773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328751, "question_id": "i4SCrVQNCAME7pjPhHWTAy", "question": "What can you find from the billboard?", "choices": ["weather", "news", "lottery payouts", "train schedule"], "correct_choice_idx": 3, "direct_answers": ["number", "travel times", "train schedule", "schedule", "destinations", "bullet train", "departure times", "train", "departure time", "schedule"], "difficult_direct_answer": true, "rationales": ["Schedules are posted at train stations and show departure an arrival times.", "A digital sign hangs outside on a train platform at a train station.", "The billboard will show the times the trains will arrive for the passengers."], "image": "train2014/COCO_train2014_000000328751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 154600, "question_id": "i4UEwpNf7zhukDpu9bJmrk", "question": "What is the parent company of their sponsors?", "choices": ["nesquick", "hershey", "lactalis australia", "nestle"], "correct_choice_idx": 2, "direct_answers": ["to enjoy", "break", "break", "lactalis australia", "asp", "as", "as", "break", "parmalat", "break"], "difficult_direct_answer": false, "rationales": ["It says the name of the company on their shirts.", "The company is australian.", "The breaka logo is on surfers shirts. lactalis australia is the parent company of breaka."], "image": "train2014/COCO_train2014_000000154600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178078, "question_id": "i4Z2jkj4zqMcQbqKGEwucN", "question": "What angle is the motorcycle at to the cars?", "choices": ["obtuse", "perpendicular", "right", "parallel"], "correct_choice_idx": 1, "direct_answers": ["perpendicular", "perpendicular", "perpendicular", "perpendicular", "right angle", "right angle", "perpendicular", "bike", "perpendicular", "perpendicular"], "difficult_direct_answer": false, "rationales": ["A bike is parked at a ninety degree angle with the cars nearby.", "The front of the car is visible and facing the bottom of the image based on the headlights which are known to be on the front. the motorcycle is facing the right edge of the image which would be perpendicular to the car.", "The bike is perpendicular to the other vehicles."], "image": "val2014/COCO_val2014_000000178078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453918, "question_id": "i4d2HTJ7zBFsJ6JpYaJzvJ", "question": "What act is the older woman preparing to do to the young girl?", "choices": ["laugh at", "scold", "cut hair", "massage"], "correct_choice_idx": 2, "direct_answers": ["cutting hair", "cut hair", "cut hair", "cutting hair", "cut hair", "cut hair", "cut hair", "cut hair", "cut hair", "haircutting"], "difficult_direct_answer": false, "rationales": ["She's cutting hair.", "She has scissors and has sectioned the hair", "She's holding scissors and the girls hair."], "image": "train2014/COCO_train2014_000000453918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568555, "question_id": "i4g7AFT3j9B8PTLxJaeKyg", "question": "What would the upright objects littering the lawn do if they were hit with a hammer swung by Paul Wight?", "choices": ["nothing", "evaporate", "shatter", "crumble"], "correct_choice_idx": 2, "direct_answers": ["shatter", "break", "shatter", "shatter", "break", "breaky", "crash", "break", "shatter", "break"], "difficult_direct_answer": false, "rationales": ["The objects shatter.", "When hit porcelain will shatter into hundreds of pieces.", "If hit with something hard they would break to pieces."], "image": "val2014/COCO_val2014_000000568555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245430, "question_id": "i4ifjtwaDptVyhKPy5iCza", "question": "What numeral system is used on the clock?", "choices": ["egyptian", "greek", "roman", "digital"], "correct_choice_idx": 2, "direct_answers": ["roman", "roman", "roman", "roman", "roman", "roman numerals", "roman", "roman numerals", "arabic", "roman"], "difficult_direct_answer": false, "rationales": ["These are roman numerals.", "The numeral system used in on the clock is roman numerals.", "The clock on the tower has numerals such as i, ii, iv and ix. these symbols are all part of the roman numeral system."], "image": "val2014/COCO_val2014_000000245430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316777, "question_id": "i4jKpx2ZHtHixCYi2WwoSe", "question": "How long must the bus wait to enter this intersection safely?", "choices": ["no time", "5 minutes", "hour", "5 seconds"], "correct_choice_idx": 0, "direct_answers": ["no wait", "not", "zero seconds", "no wait", "0 minutes", "no wait", "zero time", "no time", "no time", "one minute"], "difficult_direct_answer": false, "rationales": ["The light is green.", "The answer would be zero because the light is green.", "There is a traffic light to the left. the light is green."], "image": "train2014/COCO_train2014_000000316777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545000, "question_id": "i4s9zkL8eRjKJebakyMju7", "question": "Why are there so many leaves on the ground?", "choices": ["its summer", "its warm", "its windy", "its fall"], "correct_choice_idx": 3, "direct_answers": ["fall", "autumn", "autumn", "tree felling", "fall", "fall", "fall", "fall", "fall", "its fall"], "difficult_direct_answer": false, "rationales": ["In the autumn the leaves start to fall off the trees.", "The leaves are on the ground for fall.", "When the weather starts to get cold leave baring trees shed their leaves causing them to fall to the ground. if not disposed of properly they lay on the ground in bunches."], "image": "val2014/COCO_val2014_000000545000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111807, "question_id": "i525yzW8uqaymciEh9oEKP", "question": "The sliders on the equipment on the desk is used to adjust what?", "choices": ["lighting", "sound", "temperature", "blinds"], "correct_choice_idx": 0, "direct_answers": ["different tones", "sounds", "sound", "sound", "sound", "sound", "sound", "lighting", "volume", "volume"], "difficult_direct_answer": false, "rationales": ["The sliders adjust the lighting.", "A board with adjustable dials is on a desk with screens around.", "Looks like it's in front of a sound booth."], "image": "train2014/COCO_train2014_000000111807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214924, "question_id": "i5AeH2BM5FfuKusaEzjHFM", "question": "This restaurant definitely serves which countries products?", "choices": ["china", "canada", "brazil", "mexico"], "correct_choice_idx": 3, "direct_answers": ["italy", "mexico", "italy", "italy", "italy", "italy", "italy", "mexico", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["Corona beer is product from mexico", "There is a corona beer. corona is a mexican company.", "A person is eating pizza and a corona beer."], "image": "train2014/COCO_train2014_000000214924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484619, "question_id": "i5GMyRc6vFddzncMdmiDif", "question": "What is being cooked here?", "choices": ["fish", "waffles", "chicken", "cookies"], "correct_choice_idx": 1, "direct_answers": ["bacon", "breakfast", "breakfast", "breakfast", "chicken", "waffles", "chicken", "breakfast", "breakfast", "breakfast"], "difficult_direct_answer": false, "rationales": ["The black object is a waffle maker. you can also see waffles on the girls plate.", "Waffles are being cooked at this breakfast layout.", "A common topic for waffles is syrup. syrup is in the picture."], "image": "train2014/COCO_train2014_000000484619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292648, "question_id": "i5HxGdoNk9V6p5biH9LPpR", "question": "What was the shirtless man just doing?", "choices": ["surfing", "skiing", "dancing", "showering"], "correct_choice_idx": 0, "direct_answers": ["surfing", "surfing", "surfing", "surfing", "surfing", "standing", "kitesurfing", "surfing", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["He is holding a surfboard so he just got done surfing.", "The man is holding a surf board.", "The shirtless man was just riding a surf board."], "image": "val2014/COCO_val2014_000000292648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168151, "question_id": "i5UcosFm34M5pDNMCFhprF", "question": "This man is most likely playing what?", "choices": ["football", "catch", "soccer", "pinball"], "correct_choice_idx": 1, "direct_answers": ["baseball", "coin", "baseball", "baseball", "softball", "softball", "softball", "baseball", "baseball", "catch"], "difficult_direct_answer": false, "rationales": ["He's wearing a baseball glove.", "He is playing catch with someone.", "The man is playing with a baseball, and he has his hand extended as if he is throwing the ball back and forth with another person."], "image": "train2014/COCO_train2014_000000168151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176519, "question_id": "i5Y8XQmrK2hrvnSNfr9rhR", "question": "Why are they wearing wetsuits?", "choices": ["for fun", "cold water", "easier finding", "showing off"], "correct_choice_idx": 1, "direct_answers": ["in ocean", "warmth", "stay warm", "surfing", "surfing", "keep warm", "stay dry", "protection", "in water", "cold water"], "difficult_direct_answer": true, "rationales": ["They want to be warm.", "The purpose of wetsuits is to provide warmth in cold water and they are visibly in water so it is likely cold.", "The people are wearing wetsuits so they can surf in the water and not get too cold."], "image": "val2014/COCO_val2014_000000176519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166975, "question_id": "i5kKyc9sGHLsm7WsMS3ARN", "question": "Which one is probably the driver of the car?", "choices": ["facing camera", "in store", "facing bus", "in bus"], "correct_choice_idx": 2, "direct_answers": ["in black", "man", "man", "facing bus", "man", "man", "man", "man standing", "black jacket", "man"], "difficult_direct_answer": false, "rationales": ["He is standing in front to assess the damage of the accident.", "His vehicle has collided with a public transit vehicle and he is angry.", "A man is standing facing the bus and looking at the damage of the bus and the car colliding."], "image": "train2014/COCO_train2014_000000166975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162792, "question_id": "i5snf8ask3pJpF4fzXMNY6", "question": "Who was the former owner to the railway?", "choices": ["orient express", "amtrak", "greyhound", "cn"], "correct_choice_idx": 3, "direct_answers": ["cn kington", "vanderbilt", "cn", "rich person", "cn", "kingston", "canadian national", "cn", "cn", "cn"], "difficult_direct_answer": false, "rationales": ["There is a sign that says the name.", "Cn was the former owner.", "The older photograph shows the logo for cn on the train station."], "image": "train2014/COCO_train2014_000000162792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408989, "question_id": "i67yA3MQMLidMjDz2eJZe4", "question": "What appliance is in the image?", "choices": ["stove", "dishwasher", "microwave", "blender"], "correct_choice_idx": 3, "direct_answers": ["blender", "blender", "blender", "blender", "blender", "blender", "blender", "blender", "blender", "blender"], "difficult_direct_answer": false, "rationales": ["There is a motor inside the silver part that moves the blade in the glass part causing ingredients to be mixed.", "The skinned tomatoes are going to be blended.", "It is used to fully mix ingredients together using sharp blades."], "image": "val2014/COCO_val2014_000000408989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301209, "question_id": "i6Tps6cmfbvL62TX9trLit", "question": "What are the skiers watching?", "choices": ["moon", "sun", "stars", "clouds"], "correct_choice_idx": 1, "direct_answers": ["sunset", "sunset", "sunset", "sun", "sunset", "sunset", "sunset", "sunset", "sunset", "sunset"], "difficult_direct_answer": false, "rationales": ["The skiers are looking at the sun.", "He's watching the sun.", "They are all looking at the sunset as it goes down."], "image": "train2014/COCO_train2014_000000301209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190845, "question_id": "i6Wpfi4eEHsotEmfiyJ4gf", "question": "What is the man shown here likely to have for lunch today?", "choices": ["seafood", "burgers", "cotton candy", "pizza"], "correct_choice_idx": 0, "direct_answers": ["seafood", "seafood", "seafood", "seafood", "fish", "seafood", "seafood", "seafood", "sandwich", "prawns fish"], "difficult_direct_answer": false, "rationales": ["The food truck in the back sells seafood.", "There is a place that serves seafood behind him.", "The man is right next to a seafood restaurant."], "image": "train2014/COCO_train2014_000000190845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371351, "question_id": "i7C7PhN8CvsJcafGRV4YGG", "question": "What aquatic order are these birds from?", "choices": ["phoenicopteriformes", "vegaviiformes", "podicipediformes", "anseriformes"], "correct_choice_idx": 0, "direct_answers": ["flamingo", "sea", "flamingos", "flamingo", "venezuela", "phoenicopteriformes", "phoenicopteridae", "ocean", "phoenicopteriformes", "flamingo"], "difficult_direct_answer": false, "rationales": ["Flamingoes come from phoenicopteriformes.", "Several flamingos are posing in the sand. the only way to find this information is to research group.", "These birds come from the phoenicopteriformes order since they're flamingoes."], "image": "train2014/COCO_train2014_000000371351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169640, "question_id": "i7KgURFuEdJXkdmVSaeosq", "question": "What is parked on the side of the road?", "choices": ["car", "bicycle", "motorcycle", "bus"], "correct_choice_idx": 3, "direct_answers": ["bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["There is a waiting station with rails on the left and the vehicle is very tall compared to a regular vehicle.", "The vehicle has more than two wheels. it is taller than a car.", "The bus is waiting."], "image": "train2014/COCO_train2014_000000169640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168145, "question_id": "i7Ykq2BwfRttFpaSXupk23", "question": "Why are the flags red in color?", "choices": ["game rules", "camouflage", "design", "visibility"], "correct_choice_idx": 3, "direct_answers": ["greater visibility", "visibility", "to warn", "visibility", "avalanche", "marking", "visibility", "pole markers", "show path", "caution"], "difficult_direct_answer": false, "rationales": ["Trail markers and visibilty.", "These flags are brightly colored so they'll be easier to see.", "The flags are visible."], "image": "train2014/COCO_train2014_000000168145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529345, "question_id": "i7nh9vhpmxAjGV2VkDCQaa", "question": "Where will they use the scissors?", "choices": ["clothes", "pizza", "hair", "paper"], "correct_choice_idx": 1, "direct_answers": ["yes", "pizza", "pizza", "cut pizza", "pizza", "pizza", "cutting pizza", "on pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["The scissors are sitting by a pizza that hasn't been cut and is ready to be served.", "Due to the scissors resting on the uncut full pizza we can assume they will be used to cut it.", "They will use them to cut the food."], "image": "train2014/COCO_train2014_000000529345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135210, "question_id": "i7zHFrMDkW6aGW7ZEA5Axe", "question": "What is playing with the phone?", "choices": ["baby", "cat", "bird", "dog"], "correct_choice_idx": 0, "direct_answers": ["baby", "child", "baby", "child", "baby", "toddler", "baby", "baby", "baby", "baby"], "difficult_direct_answer": false, "rationales": ["The baby plays.", "This is a young human that isn't old enough to walk.", "A child is holding a phone."], "image": "val2014/COCO_val2014_000000135210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290201, "question_id": "i8Tb5B9LC7j7PmmCxraUms", "question": "What is in the yellow bag on the table?", "choices": ["chips", "pretzels", "popcorn", "candy"], "correct_choice_idx": 0, "direct_answers": ["chips", "chips", "potato chips", "chips", "chips", "potato chips", "chips", "chips", "chips", "chips"], "difficult_direct_answer": false, "rationales": ["The bag is marked with the logo of the lays brand. lays brand chips are a common side order to meals.", "That is a bag of lay's that people like to eat as a snack or with a sandwich.", "There is a lays logo on the bag. sliced potatoes also appear on the bag."], "image": "train2014/COCO_train2014_000000290201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205486, "question_id": "i8odgrCAJLSEn4vtsD7LGG", "question": "What item do multiple elderly persons here grasp?", "choices": ["scepters", "canes", "wheelchairs", "tiaras"], "correct_choice_idx": 1, "direct_answers": ["canes", "cane", "cane", "canes", "cane", "cane", "canes", "canes", "canes", "canes"], "difficult_direct_answer": false, "rationales": ["A large group of people are posing together and many of them are holding long slender objects used for balance when walking.", "They are holding sticks made for walking and helping them keep balance.", "The item is the canes."], "image": "train2014/COCO_train2014_000000205486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405964, "question_id": "i94k6vgYEeBGXqfJ7tbWBB", "question": "Why are they so high up?", "choices": ["broken mechanism", "carrying uphill", "lost", "daredevils"], "correct_choice_idx": 1, "direct_answers": ["on lift", "ski lift", "on skylift", "reach hilltop", "ski lift", "to ski", "ski lift", "riding lift", "to skii", "carrying uphill"], "difficult_direct_answer": false, "rationales": ["They are carrying them uphill.", "Based on their equipment and setting, the people are skiing and sitting on a chairlift which is known to bring people to the top of the mountain.", "The people want to go uphill so they can ski down."], "image": "train2014/COCO_train2014_000000405964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442531, "question_id": "i9EuopzCg8cki9JsCXKjPk", "question": "What should the woman sitting in the middle wear for protection?", "choices": ["scarf", "hat", "sunglasses", "mittens"], "correct_choice_idx": 2, "direct_answers": ["sunglasses", "mask", "sunglasses", "sunglasses", "face mask", "mask", "mask", "hat", "hat", "mask"], "difficult_direct_answer": false, "rationales": ["A man is sitting on a bench outdoors on a sunny day.", "The woman has glasses.", "The woman should wear sunglasses."], "image": "train2014/COCO_train2014_000000442531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400534, "question_id": "i9LUmjn7rvGhNrwUq7n6qN", "question": "What phase of meeting is this room in?", "choices": ["finishing soon", "not started", "taking vote", "just started"], "correct_choice_idx": 1, "direct_answers": ["not started", "empty", "conference", "afterwards", "board member", "post-meeting", "empty", "preparation", "conference", "not started"], "difficult_direct_answer": false, "rationales": ["The phase hasn't started.", "It's empty.", "The meeting room appears to empty waiting for people to show up."], "image": "train2014/COCO_train2014_000000400534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242847, "question_id": "i9aTKBg5ZugZsW7Rs9pVc5", "question": "How was this beverage created?", "choices": ["stirring", "baked", "blended", "boiled"], "correct_choice_idx": 2, "direct_answers": ["brewing", "by blending", "heating", "blender", "blend", "mixed", "blender", "blender", "blender", "blended"], "difficult_direct_answer": false, "rationales": ["The beverage is being poured into a cup from a larger plastic container. next to the cup is an electronic device which when combine with the plastic container becomes a blender.", "The beverage is in a blender.", "The drink was blended."], "image": "train2014/COCO_train2014_000000242847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244815, "question_id": "i9ha3Vb6PABHetcKmXWsrc", "question": "Why is the girl reaching into the van?", "choices": ["pulling chord", "grabbing phone", "buying goods", "getting in"], "correct_choice_idx": 2, "direct_answers": ["to pay", "to pay", "buying cake", "order food", "paying up", "to pay", "food", "get food", "buying goods", "to pay"], "difficult_direct_answer": false, "rationales": ["The van is selling cupcakes and similar items.", "She is trying to get an ice cream treat because it's an ice cream truck as evidenced by the name on the door.", "The girl wants to buy something."], "image": "val2014/COCO_val2014_000000244815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136168, "question_id": "i9zAQWqeRATvJWdYAe38Zf", "question": "How heavy is a newborn giraffe calf in general?", "choices": ["100 kg", "70 kg", "80 kg", "60 kg"], "correct_choice_idx": 0, "direct_answers": ["quarter ton", "half ton", "100-150 pounds", "60kg", "sixty kilograms", "50 kg", "100 kg", "fifty pounds", "50kg", "100 pounds"], "difficult_direct_answer": true, "rationales": ["Depends on their size.", "Answer derived from an internet search and the information provided there.", "A group of giraffes are grazing. baby giraffes are big."], "image": "train2014/COCO_train2014_000000136168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48066, "question_id": "iA3LXUquNCWZ4GSS89eiMj", "question": "What sport are the players in red shirts most likely playing?", "choices": ["cricket", "hockey", "lacrosse", "soccer"], "correct_choice_idx": 3, "direct_answers": ["soccer", "street hockey", "lacrosse", "street hockey", "hockey", "hockey", "soccer", "soccer", "soccer", "soccer"], "difficult_direct_answer": false, "rationales": ["A crowd of people are sitting on side while players are in middle. there is a goalie net as well as a curved stick in one hand.", "The people are playing soccer since they're on grass.", "There is a goal."], "image": "train2014/COCO_train2014_000000048066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427181, "question_id": "iADyjUWoxH9cqN3UrXswyg", "question": "What item is bigger than normal?", "choices": ["yard", "person", "bat", "ball"], "correct_choice_idx": 3, "direct_answers": ["ball", "ball", "ball", "ball", "ball", "ball", "ball", "ball", "can", "can"], "difficult_direct_answer": false, "rationales": ["It's a bigger ball for little kids.", "It's bigger so the little boy can hit it easier.", "The person is a kid. the bat and yard are normal-sized."], "image": "val2014/COCO_val2014_000000427181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265186, "question_id": "iAWcmVSWCbJCNBakvNWBZy", "question": "If this was a color picture what colors would be in the flag?", "choices": ["redwhiteblue", "whiteredyellow", "yellowwhiteblue", "blueyellowred"], "correct_choice_idx": 0, "direct_answers": ["redwhiteblue", "red blue", "red blue", "bluewhite stripes", "red", "red blue", "red-white-blue", "red", "red white", "red whiteblue"], "difficult_direct_answer": false, "rationales": ["I chose the option with the colors of the flag known as the union jack.", "A black and white photo shows people in a parade with horses and carriage. there is a flag of united kingdom handing from back of carriage.", "The flag is the flag of the uk which is red, white, and blue."], "image": "train2014/COCO_train2014_000000265186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92145, "question_id": "iAnCXkawsDmjyZWuDZhvkp", "question": "What type of animals are shown?", "choices": ["snake", "dog", "cow", "rat"], "correct_choice_idx": 2, "direct_answers": ["cows", "cows", "cows", "cows", "cows", "cows", "cows", "cow", "cows", "cows"], "difficult_direct_answer": false, "rationales": ["These animals are bovines.", "Cows are there eating the leaves.", "The animals are clearly visible and based on their size, shape and coloring, answer a is apparent."], "image": "val2014/COCO_val2014_000000092145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163479, "question_id": "iAssDuFkDLrsXC5iFNQ7hZ", "question": "What type of waterway is this?", "choices": ["pond", "ocean", "lake", "river"], "correct_choice_idx": 1, "direct_answers": ["ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "wave", "ocean", "wave", "ocean"], "difficult_direct_answer": false, "rationales": ["The ocean has people surfing.", "The beach is indicative of an ocean.", "It's the ocean."], "image": "val2014/COCO_val2014_000000163479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77402, "question_id": "iAtTpeUbvG6Vyh677wbdSU", "question": "What rightmost utensil is upside down?", "choices": ["chop stick", "pitchfork", "knife", "spoon"], "correct_choice_idx": 3, "direct_answers": ["spoon", "spoon fork", "spoon", "for placement", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon"], "difficult_direct_answer": false, "rationales": ["It does not have any prongs and is curved as a spoon.", "There is an upside down spoon.", "A utensil capable of scooping and holding liquid, which appears to be the same shape as a spoon, is visible to the right and is upside down."], "image": "val2014/COCO_val2014_000000077402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14575, "question_id": "iBTf3JWtV4X7rvAyWs7rpq", "question": "What are coming out of the closet?", "choices": ["hands", "heads", "tentacles", "feet"], "correct_choice_idx": 0, "direct_answers": ["hands", "hands", "hands", "hands", "hands", "hands", "hands", "hands", "hands", "hands"], "difficult_direct_answer": false, "rationales": ["Hands and bits of arms extend from out of the closets shadows in this bit of trick photography.", "There are several appendages that look like arms coming out of the closet.", "The hands are out."], "image": "train2014/COCO_train2014_000000014575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403862, "question_id": "iBfUkrXXMdJmMpCkW3PCEo", "question": "The face on the train makes it seem like which character?", "choices": ["choo", "thomas", "old yeller", "choo"], "correct_choice_idx": 1, "direct_answers": ["thomas", "thomas", "thomas", "cartoon character", "thomas", "cartoon character", "thomas", "thomas", "thomas", "thomas"], "difficult_direct_answer": false, "rationales": ["This is a storybook character for children's stories", "Answer a is a known cartoon character and the face placed on the train in the image matches.", "The face on the train makes it similar to thomas the train engine."], "image": "val2014/COCO_val2014_000000403862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311475, "question_id": "iBfVjJQCTF7LPSzs7q3FwD", "question": "What is the tunnel nearest the plane door called?", "choices": ["jet bridge", "air tunnel", "luggage tunnel", "escape tunnel"], "correct_choice_idx": 0, "direct_answers": ["tarmac", "aerobridge", "plane", "jet bridge", "exit", "aerobridge", "boarding bridge", "entrance", "jet bridge", "boarding bridge"], "difficult_direct_answer": false, "rationales": ["The tunnel is a bridge.", "It is for passengers to cross from their terminal gate to the plane without having to go downstairs or outdoors.", "This tunnel will connect to the door, allowing passengers to get on board. it forms a bridge from the building to the plane."], "image": "train2014/COCO_train2014_000000311475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27888, "question_id": "iBfzFDqzdSrihhXt775hkm", "question": "What do the ladies here discuss?", "choices": ["anteaters", "wine", "women", "retirement"], "correct_choice_idx": 1, "direct_answers": ["wine", "rumors", "gossip", "many things", "opinions", "gossip", "gossiping", "men", "stories", "wine"], "difficult_direct_answer": false, "rationales": ["The ladies are near wine bottles.", "There are several bottles of wine on the table.", "They are talking about wine."], "image": "train2014/COCO_train2014_000000027888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46155, "question_id": "iBqH9KcCGQUUdPBEHoDeEk", "question": "What is this player hoping to keep up?", "choices": ["volley", "net", "complaints", "emotions"], "correct_choice_idx": 0, "direct_answers": ["stance", "lead", "volley", "tennis", "ball", "score", "tennis ball", "match", "stamina", "ball"], "difficult_direct_answer": true, "rationales": ["They want to keep hitting it to get a point", "The player is crouched holding a racket, as if to prepare themselves to receive something.", "The woman seems to be concentrating on the ball to volley."], "image": "val2014/COCO_val2014_000000046155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516184, "question_id": "iCFe3ihKntgJCzSx3RVTzy", "question": "What type of shirt does the skateboarder in the air have on?", "choices": ["polka dot", "long sleeve", "ripped", "short sleeve"], "correct_choice_idx": 3, "direct_answers": ["striped", "striped", "striped", "striped", "striped", "short sleeve", "striped", "striped t-shirt", "striped t-shirt", "striped"], "difficult_direct_answer": false, "rationales": ["The sleeves end right below his shoulder and above his elbow.", "The skateboarder is wearing a t-shirt.", "The skateboarding boy's shirt's sleeves end above his elbow and are shorter than sleeves that extend to the wrist."], "image": "train2014/COCO_train2014_000000516184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388136, "question_id": "iCZvF4jAGogn6uNTXHQ5fu", "question": "What are the round brown things in the salad?", "choices": ["kidney beans", "pinto beans", "garbanzo beans", "mushrooms"], "correct_choice_idx": 2, "direct_answers": ["chickpeas", "chickpeas", "chickpeas", "garbanzo beans", "chick peas", "chickpeas", "nuts", "chickpeas", "mushrooms", "chickpeas"], "difficult_direct_answer": false, "rationales": ["The brown things are chickpeas.", "Garbanzo beans are the round, brown bean shown in this salad.", "The beans are round."], "image": "train2014/COCO_train2014_000000388136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550353, "question_id": "iCqvmJQRQJknwkLAATXASV", "question": "What is contained inside the long white box?", "choices": ["keyboard", "pen", "cellphone", "mouse"], "correct_choice_idx": 0, "direct_answers": ["stylus", "keyboard", "antenna", "keyboard", "keyboard", "keyboard", "keyboard", "keyboard", "keyboard", "computer part"], "difficult_direct_answer": false, "rationales": ["The box is long and narrow like a keyboard.", "There is a long white keyboard picture on top of the long box.", "A keyboard is in the box."], "image": "train2014/COCO_train2014_000000550353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457453, "question_id": "iDFat6knwSBiMoopsPX4d7", "question": "What vehicle should stop?", "choices": ["bus", "car", "bicycle", "truck"], "correct_choice_idx": 2, "direct_answers": ["bicycle", "bicycles", "bicycle", "bicycle", "bike", "car", "sedan", "bike", "left", "cars"], "difficult_direct_answer": false, "rationales": ["If you look carefully at the red traffic light, you'll see this method of transportation.", "There is a picture of one on the red light", "The red light has a bicycle symbol, and red means to stop."], "image": "val2014/COCO_val2014_000000457453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237111, "question_id": "iDbYhCRBpBBvR2uibVpZzN", "question": "What is the purpose of the paved area?", "choices": ["car parking", "basketball playing", "outdoor dining", "park swinging"], "correct_choice_idx": 0, "direct_answers": ["driving", "pedestrian crossing", "driving", "driving", "road", "driving", "driving", "driving", "walking", "car parking"], "difficult_direct_answer": false, "rationales": ["The paved area allows people to park their cars.", "This is an intersection of two roads", "This is a parking lot for cars to park."], "image": "train2014/COCO_train2014_000000237111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521874, "question_id": "iDokC24ZgvrAzcuNKzBqSb", "question": "Why is this train so small?", "choices": ["small engineer", "is broken", "for children", "is old"], "correct_choice_idx": 2, "direct_answers": ["transport", "model", "single car", "miniature version", "for kids", "fewer passengers", "amusement train", "model train", "for kids", "for children"], "difficult_direct_answer": true, "rationales": ["It's a sightseeing train", "The train is small because it is a child's model.", "The train is too small to carry loads and is more likely a show piece."], "image": "val2014/COCO_val2014_000000521874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357362, "question_id": "iDozM7ebnMU3tssobUW4Xq", "question": "What is the orange object used for?", "choices": ["storage", "scoring", "trash", "blocking"], "correct_choice_idx": 2, "direct_answers": ["trash", "trash", "trash", "refuse", "trash", "garbage", "trash", "garbage", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["It's used for a trash can.", "The bin is a trash can.", "The orange object is used for collecting garbage."], "image": "train2014/COCO_train2014_000000357362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304134, "question_id": "iE4cnMDJnpRh9XshF4ZxXr", "question": "What job does the man holding the orange stick carry out here?", "choices": ["toll taker", "traffic cop", "seamstress", "bus driver"], "correct_choice_idx": 1, "direct_answers": ["traffic", "construction worker", "crossing guard", "crossing guard", "crossing guard", "control movement", "walk road", "traffic cop", "traffic director", "construction duties"], "difficult_direct_answer": false, "rationales": ["The man is making sure people follow traffic rules.", "The man is a crossing guard. the stick in his hand tells people to stop or go.", "The man has some traffic items."], "image": "train2014/COCO_train2014_000000304134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456648, "question_id": "iE4eSUeLoQvmyMXoxw9LLe", "question": "Which form of transportation shown here uses less fuel to fill up?", "choices": ["suv", "bus", "semi", "motorcycle"], "correct_choice_idx": 3, "direct_answers": ["motorcycle", "moped", "motorbike", "scooter", "bike", "bike", "motor bike", "motorcycle", "scooter", "motorcycle"], "difficult_direct_answer": false, "rationales": ["The transport is the motorbike.", "Motorcycles are more efficient.", "The motorcycle will take less gas and use less gas."], "image": "train2014/COCO_train2014_000000456648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49424, "question_id": "iE6pKDCmHa5SXvi5byHpRh", "question": "The desk is made of what type of material?", "choices": ["mahogany", "particle board", "oak", "styrofoam"], "correct_choice_idx": 1, "direct_answers": ["wood", "wood", "plywood", "plywood", "particle board", "plywood", "wood", "wood", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["A computer sits on a flat brown surface that has some wood grain in it.", "It looks to be made with a thinner wood.", "The desk is made out of an inexpensive type of wood."], "image": "train2014/COCO_train2014_000000049424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133660, "question_id": "iEHRHLvEJn8RkVryBPXGAs", "question": "What color shorts does the person to whom the frisbee is thrown wear?", "choices": ["red", "white", "green", "light blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "blue", "gray", "blue", "blue", "gray", "light blue", "gray", "blue"], "difficult_direct_answer": false, "rationales": ["The color is similar to that of the sky.", "This person is wearing shorts that are a similar color to to the body of water he is near.", "The person is wearing swim trunks."], "image": "train2014/COCO_train2014_000000133660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431825, "question_id": "iESRJAgYLhaT49J4tcbjYg", "question": "How many giraffes are there?", "choices": ["four", "six", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three giraffes.", "Two have their heads up looking straight ahead, one is bent down to eat.", "One giraffe is in the front and two are in the back."], "image": "train2014/COCO_train2014_000000431825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253281, "question_id": "iEaAWDn7vk67dfEWzZ9ksg", "question": "Why are there no sails raised here?", "choices": ["for speed", "doldrums", "too windy", "boats vacant"], "correct_choice_idx": 3, "direct_answers": ["boats vacant", "wind", "parked", "docked boats", "no wind", "anchored", "stationary", "no wind", "no wind", "not windy"], "difficult_direct_answer": false, "rationales": ["Nobody is using the vehicles at this time.", "The boats are wanting to not move in the water.", "The boats aren't being used."], "image": "train2014/COCO_train2014_000000253281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337126, "question_id": "iEgDpQuGTqG6rmVJcbBLhJ", "question": "What accessory can you get from a building to the left of he camera?", "choices": ["bags", "shoes", "hats", "glasses"], "correct_choice_idx": 3, "direct_answers": ["glasses", "glasses", "photo captured", "eyewear", "glasses", "unknown", "glasses", "glasses", "phone", "eye glasses"], "difficult_direct_answer": false, "rationales": ["The leftmost readable store front reads 'opticians'. this is term is associated with eyeglasses.", "You can buy glasses at one of the stores at the left of the camera.", "It says opticians."], "image": "train2014/COCO_train2014_000000337126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425772, "question_id": "iEjD55V7uG9AavZM4Q94dZ", "question": "What shape is the item on the floor that is in front of the boat that is behind the red boat?", "choices": ["square", "rectangle", "rhombus", "round"], "correct_choice_idx": 3, "direct_answers": ["round", "round", "curved", "round", "round", "round", "circle", "ball", "round", "boat"], "difficult_direct_answer": false, "rationales": ["The shape is circular.", "A buoy is behind a boat.", "There is a ball on the ground between the two boats."], "image": "train2014/COCO_train2014_000000425772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34480, "question_id": "iEvPbB6Ys5UBuwBW66QuLN", "question": "What are they doing?", "choices": ["chatting phone", "eating lunch", "remote control", "video game"], "correct_choice_idx": 3, "direct_answers": ["video game", "gaming", "video games", "playing games", "playing", "playing videos", "playing videogame", "playing game", "playing wii", "playing wii"], "difficult_direct_answer": true, "rationales": ["They are playing video games and holding the controlers.", "The two girls are standing playing with a nintendo wii.", "They have game controllers in their hands"], "image": "train2014/COCO_train2014_000000034480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46883, "question_id": "iEzpQSQB4GUuw9kPhXjJs8", "question": "Which traffic light is missing?", "choices": ["white", "orange", "yellow", "blue"], "correct_choice_idx": 1, "direct_answers": ["stop", "yellow", "yellow", "orange", "yellow", "yellow", "yellow", "yellow caution", "yellow", "orange"], "difficult_direct_answer": false, "rationales": ["Green and red are lite up but not the middle one.", "The only colors on a traffic light are red, yellow and green.", "The only lights are red and green."], "image": "train2014/COCO_train2014_000000046883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281177, "question_id": "iFPKenZ72xk5aRWkar2dvi", "question": "Who got Mrs. Kalayaan 2013?", "choices": ["none", "anthony bautista", "rose pacia", "alice howden"], "correct_choice_idx": 3, "direct_answers": ["alice howden", "umbrella woman", "alice howden", "rose labelle", "alice howden", "kalayaan", "pacia labelle", "alice howden", "alice howden", "rose labelle"], "difficult_direct_answer": false, "rationales": ["The winner was alice howden.", "A woman in a pink dress with a sash sits on the back of a convertible in a parade.", "It's alice howden."], "image": "train2014/COCO_train2014_000000281177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504500, "question_id": "iFQ4oLtPPpdqkfHPUSWuTa", "question": "What are the people in the middle standing in front of?", "choices": ["airplanes", "boxes", "surfboards", "cars"], "correct_choice_idx": 2, "direct_answers": ["camera", "surfboards", "surfboards", "surfboards", "surfboards", "surfboards", "surfboards", "boards", "surfboards", "surfboards"], "difficult_direct_answer": false, "rationales": ["They are tall boards used for riding waves.", "The objects behind the people are long, flat, and thin. surfboards are a common item to be found at a beach.", "That's what they're standing in front of at the beach."], "image": "val2014/COCO_val2014_000000504500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444493, "question_id": "iFSv5UawM9ENMNnjxdmKVU", "question": "What location do these men wait in?", "choices": ["parking lot", "bus stop", "taxi stop", "church lot"], "correct_choice_idx": 1, "direct_answers": ["bus stop", "on sidewalk", "bus stop", "bus stop", "bus stop", "outside", "bus stop", "on sidewalk", "bus stop", "bus stop"], "difficult_direct_answer": false, "rationales": ["People gather at this point to wait for their public transportation.", "The location is a bus stop.", "The area the men are standing in is a stop for buses."], "image": "val2014/COCO_val2014_000000444493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345071, "question_id": "iFVoZFkhghCzTfhN3onkaq", "question": "What are the people patiently waiting for?", "choices": ["traffic stopping", "bus", "rain stopping", "friends"], "correct_choice_idx": 2, "direct_answers": ["rain stopping", "rain stopping", "stop raining", "rain stop", "rain", "rain stop", "stop raining", "rain stop", "rain", "bus"], "difficult_direct_answer": false, "rationales": ["The people wait for the rain.", "The weather is very wet", "The rain is pounding down."], "image": "val2014/COCO_val2014_000000345071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50521, "question_id": "iFcJ9e3YjCUEDoHVQLKuYY", "question": "What is the corrugated white metal panel to the left of the wash basin used for?", "choices": ["room aesthetics", "cooling", "storage", "heating"], "correct_choice_idx": 3, "direct_answers": ["privacy", "dirty clothes", "heating", "towel hanger", "potty train", "sink", "heat", "washing clothes", "peeing", "heat"], "difficult_direct_answer": true, "rationales": ["It's an element to provide warmth.", "It's to heat the place up.", "Hot water tends to run through the white metal panel."], "image": "val2014/COCO_val2014_000000050521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421823, "question_id": "iFmoy83KBMAejSczdCTfku", "question": "How many geese are flying in a formation?", "choices": ["seven", "eight", "fourteen", "four"], "correct_choice_idx": 0, "direct_answers": ["seven", "seven", "seven", "seven", "seven", "seven", "seven", "seven", "7 geese", "seven"], "difficult_direct_answer": false, "rationales": ["They are in a v formation", "There is a goose in front and six following.", "They are in an organized line."], "image": "train2014/COCO_train2014_000000421823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160428, "question_id": "iFxjZSzzRm2js9K5Meh5YF", "question": "The orange orbs seen here are actually what?", "choices": ["real oranges", "plastic", "lemons", "pinatas"], "correct_choice_idx": 1, "direct_answers": ["oranges", "plastic", "plastic ornaments", "oranges", "oranges", "decoration", "plastic", "lights", "fruits", "decoration"], "difficult_direct_answer": false, "rationales": ["The orbs are plastic.", "There is a seam in the middle of these orbs, implying they are not natural.", "They are fake and hanging for decoration."], "image": "train2014/COCO_train2014_000000160428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467502, "question_id": "iG8hSaWJ9e55pLW29Cahgs", "question": "These animals are mascots for what brand of gum?", "choices": ["doublemint", "trident", "dubble bubble", "fruit stripe"], "correct_choice_idx": 3, "direct_answers": ["zebra", "wrigley", "fruit stripe", "fruit stripe", "fruit stripe", "fruit stripe", "fruit stripe", "wrigley's", "fruit striped", "fruit stripe"], "difficult_direct_answer": false, "rationales": ["Here we see zebras grazing. fruit stripe gum has a famous multicolored zebra as a mascot.", "The brand of gum uses a zebra mascot on the wrapper.", "They are zebras."], "image": "train2014/COCO_train2014_000000467502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361420, "question_id": "iGA6dRVTXcLMa5gJcoSPBT", "question": "This playing is making what shot?", "choices": ["forehand", "serve", "lob", "backhand"], "correct_choice_idx": 3, "direct_answers": ["backhand", "quick", "tennis forehand", "backhand", "tennis", "swing", "cross swing", "back hand", "volley", "tennis"], "difficult_direct_answer": false, "rationales": ["Tennis players make this movement when making a swing across his/her body.", "The backhand is the first thing you see when he makes his swing", "The player is making a backhand shot."], "image": "train2014/COCO_train2014_000000361420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410373, "question_id": "iGSVXUFpNb4QEMDyTjEiw7", "question": "What is the terrain with trees on it?", "choices": ["savanna", "private island", "peninsula", "plain"], "correct_choice_idx": 1, "direct_answers": ["beach", "island", "island", "island", "island", "island", "island", "beach", "private island", "island"], "difficult_direct_answer": false, "rationales": ["It is an island because it has water all around it.", "The land can be seen meeting the water on both sides so it is apparent it is an island.", "The are in the ocean."], "image": "train2014/COCO_train2014_000000410373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384242, "question_id": "iGae2WY4jaRJdSvHzPfH2G", "question": "What is he doing?", "choices": ["boarding board", "sinking", "taking board", "falling"], "correct_choice_idx": 0, "direct_answers": ["carrying glider", "parasailing", "boarding board", "windsurfing", "sailboarding", "sail surfing", "windsurfing", "water skiing", "windsailing", "kitesurfing"], "difficult_direct_answer": true, "rationales": ["He's boarding the board.", "The man is getting on his board.", "The person was parasailing but now they are halfway in the water holding onto the board indicating that they are sinking."], "image": "val2014/COCO_val2014_000000384242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196811, "question_id": "iHJPBvFprPbgjs5P73S7Nu", "question": "What type of event is being held here?", "choices": ["phone resales", "racing cars", "tractor pull", "outdoor faire"], "correct_choice_idx": 3, "direct_answers": ["festival", "market", "parade", "vendors", "tour bus", "outdoor faire", "fair", "fair show", "festival", "information fair"], "difficult_direct_answer": true, "rationales": ["There are tents and stations.", "An outdoor market place.", "There are visible tents set up with goods on them as would happen at answer a."], "image": "val2014/COCO_val2014_000000196811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169226, "question_id": "iHPAJZhFaV3jJ5d8qKiJAP", "question": "What photographic technique was used to capture the movement of traffic on the street?", "choices": ["hdr", "time-lapse", "panorama", "bokeh"], "correct_choice_idx": 1, "direct_answers": ["shutter speed", "unknown", "light trails", "time delay", "drone photo", "iso", "panning", "time-lapse", "long exposure", "blurred"], "difficult_direct_answer": true, "rationales": ["Timelapse is used to capture the hurried light.", "It shows the same few vehicles at different points along the street", "The traffic is supposed to be time lapsed."], "image": "val2014/COCO_val2014_000000169226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362812, "question_id": "iHhst7bfqKVT8feRyCgqHU", "question": "What mode of travel is in use here?", "choices": ["taxi", "uber plus", "train", "plane"], "correct_choice_idx": 2, "direct_answers": ["bus", "subway", "subway", "train", "subway", "subway", "bus", "train", "subway", "subway"], "difficult_direct_answer": false, "rationales": ["The are standing up.", "People are lined up in a vehicle that has poles and handles. trains are used in cities for public transportation.", "The interior has rails, handles and a seating arrangement that is consistent with answer a."], "image": "val2014/COCO_val2014_000000362812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289248, "question_id": "iHr4LaQgcdUSeRKSRpBt9j", "question": "What digital device is being used to capture memories?", "choices": ["recorder", "phone", "sketch artist", "camera"], "correct_choice_idx": 3, "direct_answers": ["camera", "camera", "camera", "camera", "camera", "camera", "camera", "camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["A group of women are sitting on a boat. one has a device in her hand that is small and has a lens for capturing pictures.", "The woman is holding a camera.", "The device's sole purpose is to take photos."], "image": "train2014/COCO_train2014_000000289248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167267, "question_id": "iJ63yfobrVdq3jW4vdB55m", "question": "What is being fed to the giraffe?", "choices": ["apple", "french fry", "cracker", "banana"], "correct_choice_idx": 2, "direct_answers": ["crackers", "cracker", "cracker", "cookie", "biscuit", "cracker", "bread", "cracker", "cracker", "cracker"], "difficult_direct_answer": false, "rationales": ["The item is square and dry, which is what a food item by that name looks like.", "They are feeding him a cracker.", "The item being fed is a flat dry looking object consistent with answer a. it is clearly not answers b and c and looks too large and flat to be answer d."], "image": "train2014/COCO_train2014_000000167267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11332, "question_id": "iJESQVszyFxwHtnPGz72yM", "question": "Where are these people gathered?", "choices": ["hospital", "office", "restaurant", "home"], "correct_choice_idx": 2, "direct_answers": ["at table", "restaurant", "restaurant", "restaurant", "conference meeting", "restaurant", "meeting", "restaurant", "restaurant", "convention"], "difficult_direct_answer": false, "rationales": ["The people are eating a meal.", "The people are sitting around a table gathered at a restaurant.", "There is a formal table with lots of glasses"], "image": "train2014/COCO_train2014_000000011332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290965, "question_id": "iJTpxVuVCorPysLj78XDJB", "question": "What is the first letter after the picture of the flag on the plane in the foreground?", "choices": ["n", "r", "w", "e"], "correct_choice_idx": 0, "direct_answers": ["n", "letter n", "letter n", "letter n", "letter n", "n k", "express", "it's n", "letter n", "letter n"], "difficult_direct_answer": false, "rationales": ["The letter is easily readable, and comes before the numbers but after the flag.", "The letter is n.", "An american flag is on a plane followed by a series of letters and numbers."], "image": "train2014/COCO_train2014_000000290965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150413, "question_id": "iJdQNuxcQujetxKcBhuGoQ", "question": "What is the most likely reason for the dogs to be in this yard?", "choices": ["work dogs", "pets", "guard dogs", "strays"], "correct_choice_idx": 1, "direct_answers": ["exercise", "bathroom time", "bathroom", "live there", "pets", "watchdogs", "fresh air", "construction", "to pee", "live there"], "difficult_direct_answer": true, "rationales": ["The pets are in the yard.", "The dogs are probably in the yard of their home.", "The dogs are pets."], "image": "val2014/COCO_val2014_000000150413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202582, "question_id": "iK5Awqa6rsAseycd9bjmGC", "question": "What part of a country is this?", "choices": ["inland", "valley", "coast", "mountaintop"], "correct_choice_idx": 2, "direct_answers": ["beach", "sea side", "coast", "beach", "coastal", "beach", "west coast", "coast", "seashore", "beach"], "difficult_direct_answer": false, "rationales": ["The scene is set on a beach with a large body of water near. the place where beaches and bodies of water exist is answer a.", "The country is the coast.", "This would be considered the coast since its on an ocean."], "image": "val2014/COCO_val2014_000000202582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379086, "question_id": "iKGAaYAcjxZBgTLmP9hsMs", "question": "What hazard appears to be occurring on the road?", "choices": ["too hot", "too cold", "water", "slippery"], "correct_choice_idx": 3, "direct_answers": ["ice", "ice", "slippery", "ice", "fog", "rain", "car crash", "ice", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The road is currently slippery.", "The road looks wet or icy because it's shiny", "The road is wet."], "image": "val2014/COCO_val2014_000000379086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419980, "question_id": "iKGzfGUNbVBwBtf9zNTTgE", "question": "What is the woman putting the tray in the oven?", "choices": ["to cook", "to clean", "to decorate", "to fumigate"], "correct_choice_idx": 0, "direct_answers": ["to cook", "chef", "turkey", "casserole", "dinner", "food", "cooking", "food", "to cook", "cooking"], "difficult_direct_answer": false, "rationales": ["The woman is putting something in an oven.", "The woman is putting a tray full of food in the oven in order to bake it.", "The food in the tray is raw, and the oven is turned on."], "image": "train2014/COCO_train2014_000000419980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503972, "question_id": "iLeV8XqrTbAK7aZgiggKkh", "question": "What dangles from the dark blue type ribbon here?", "choices": ["rabbits foot", "diamond", "id", "dog tags"], "correct_choice_idx": 2, "direct_answers": ["id", "identification badge", "id card", "identification", "name badge", "lanyard", "tie", "badge", "id", "badge"], "difficult_direct_answer": false, "rationales": ["The dark blue type ribbon, or lanyard, is known as a less formal but still professional way to carry things in a business setting.", "People wear lanyards at work. lanyards hold identification.", "The id dangle.s"], "image": "val2014/COCO_val2014_000000503972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348595, "question_id": "iLiwcPz3ck7KSf73D4eNdx", "question": "Which country has red with white flag?", "choices": ["dutch", "poland", "turkey", "russia"], "correct_choice_idx": 1, "direct_answers": ["russia", "poland", "poland", "poland", "japan", "japan", "canada", "poland", "switzerland", "canada"], "difficult_direct_answer": false, "rationales": ["Poland's flag is red and white.", "The answer is commonly known and not necessarily related to the image.", "The country is poland."], "image": "train2014/COCO_train2014_000000348595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155355, "question_id": "iLzGC7poYTqCmNUzSYn59M", "question": "How was the photo turned black and white?", "choices": ["weather", "filter", "crayons", "time"], "correct_choice_idx": 1, "direct_answers": ["filters", "filter", "editing software", "filters", "invert", "computer effects", "photo editing", "filter", "edit", "photo editing"], "difficult_direct_answer": false, "rationales": ["A filter is used to digitally turn photos black and white.", "The photo has a filter.", "It is not a realistic photo."], "image": "val2014/COCO_val2014_000000155355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251690, "question_id": "iMGccuQHuzhvVRKjrbQJXy", "question": "How much would 2 pounds of oranges cost?", "choices": ["1.92", "1.63", "1.98", "1.49"], "correct_choice_idx": 2, "direct_answers": ["one ninety-eight", "1.98", "ninety nine", "ten dollars", "two dollars", "1.98", "two dollars", "1.98", "ninty-nine cents", "1.98"], "difficult_direct_answer": false, "rationales": ["Two multiplied by 99 is 198.", "The sign indicates that one pound sells for 99 cents. two pounds would cost twice as much.", "Per the sign one pound would be 99 cents so multiply that by 2."], "image": "train2014/COCO_train2014_000000251690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403329, "question_id": "iMTErxbM9XGyqrGsd3WXie", "question": "What language is likely the language of the person using the right laptop?", "choices": ["chinese", "tagalog", "japanese", "korean"], "correct_choice_idx": 0, "direct_answers": ["chinese", "chinese", "asian", "chinese", "japanese", "chinese", "chinese", "chinese", "english", "korean"], "difficult_direct_answer": false, "rationales": ["The language is chinese.", "The characters are in mandarin.", "Laptops are shown with chinese characters on them."], "image": "train2014/COCO_train2014_000000403329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215907, "question_id": "iMUwz4cyHWGUHDrsaXgjYX", "question": "What type of mattress would one have to buy for the dog's resting place?", "choices": ["daybed", "queen", "twin", "full"], "correct_choice_idx": 0, "direct_answers": ["futon", "single", "twin", "daybed", "pillow", "pillow", "couch", "dog bed", "futon", "dog mat"], "difficult_direct_answer": false, "rationales": ["The mattress is a daybed.", "This is a type of bed that is a couch but folds out to a bed.", "I'd get a daybed since this is the most luxurious but still the most compact."], "image": "train2014/COCO_train2014_000000215907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258714, "question_id": "iMXWjJq2SAGXR2b37iRpuo", "question": "Where is the child's parents most likely?", "choices": ["home", "behind her", "bathroom", "beer stand"], "correct_choice_idx": 1, "direct_answers": ["taking picture", "playing", "behind", "close proximity", "behind camera", "taking photo", "behind her", "behind her", "behind her", "taking photo"], "difficult_direct_answer": false, "rationales": ["More than likely they are sitting right near her.", "The little girl is watching on bleachers.", "The parents are behind."], "image": "train2014/COCO_train2014_000000258714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42751, "question_id": "iMfzwRZJeX9N7CNUbvM537", "question": "What is shelf made with?", "choices": ["wood", "plastic", "steel", "glass"], "correct_choice_idx": 2, "direct_answers": ["metal", "metal", "metal", "metal", "metal", "wood", "steel", "steel", "metal", "steel"], "difficult_direct_answer": false, "rationales": ["The shelf is steel.", "The shelf is the same color as other steel shelves and looks strong and metallic.", "The shelf is visible and of a color and consistent like one made from answer a."], "image": "train2014/COCO_train2014_000000042751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574178, "question_id": "iMpDBPjAGLUBoJXzaF3vpP", "question": "Why are they carrying the snowboards?", "choices": ["going boarding", "selling them", "stole them", "going home"], "correct_choice_idx": 0, "direct_answers": ["leaving slope", "snowboarding", "going uphill", "uphill", "finished snowboarding", "up slope", "going boarding", "walking", "finished snowboarding", "finished"], "difficult_direct_answer": true, "rationales": ["They are going to ride in the snow.", "The people are snowboarding based on the visible equipment and their gear and would be carrying the equipment to get in position to do that activity.", "The two individuals are dressed to engage in a cold or snow activity."], "image": "val2014/COCO_val2014_000000574178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186095, "question_id": "iMq77Ys6W6sen7eNSV2Hyu", "question": "How many of these could he safely mount at a time?", "choices": ["three", "six", "one", "two"], "correct_choice_idx": 2, "direct_answers": ["two", "one", "one", "two", "one", "one", "one", "two", "one", "one"], "difficult_direct_answer": false, "rationales": ["A young boy is standing by two horses. you have to take your time and only do a horse at a time so they don't get spooked and run.", "A person is standing near two horses. people can only ride one horse at a time.", "He can't get on two horses at once without falling."], "image": "val2014/COCO_val2014_000000186095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203702, "question_id": "iMtMNmVXKG6JRQdBAKTUuQ", "question": "Why are they so distracted by the zebra?", "choices": ["is noisy", "is unusual", "is attacking", "is famous"], "correct_choice_idx": 1, "direct_answers": ["outof place", "its beauty", "rare sighting", "is unusual", "on street", "feeding", "close by", "unusual", "escaped zoo", "feeding it"], "difficult_direct_answer": true, "rationales": ["Zebras are standing near cars.", "They do not see these types of animals everyday.", "Zebras are usually found in the savannah or zoo, not the city streets."], "image": "train2014/COCO_train2014_000000203702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420617, "question_id": "iNGuHy56tpBASt39YZSU3J", "question": "What are the two men watching?", "choices": ["news report", "video game", "movie", "music video"], "correct_choice_idx": 1, "direct_answers": ["esports", "video game", "video game", "video game", "video game", "video game", "computer screen", "video game", "video game", "game"], "difficult_direct_answer": false, "rationales": ["The two men are watching a screen showing a video game.", "One man is using the keyboard to interact with the content on the monitor. the content consists of a character holding a gun in a first-person view.", "The onscreen information and animated style of what is displayed on the monitor these two men fix their attention on let's us know it's an interactive computer game."], "image": "train2014/COCO_train2014_000000420617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147025, "question_id": "iNQA3ppoyA8CAA9pEbEigu", "question": "What is a term that could be used to refer to the person on the right?", "choices": ["brunette", "ginger", "woman", "baby"], "correct_choice_idx": 1, "direct_answers": ["millennial", "man", "friend", "gothic", "friend", "ginger", "speaking", "block t-shirt", "ginger", "ginger"], "difficult_direct_answer": false, "rationales": ["The person on the right is male, has red hair, and is an adult.", "People that have red hair are often called gingers.", "The man has pale skin and red hair, known as a ginger."], "image": "val2014/COCO_val2014_000000147025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309889, "question_id": "iNQdb8RJ2ZHn4r3fYAKV9J", "question": "What are the two areas decorated with red sheets used for?", "choices": ["performing", "serving", "sleeping", "gaming"], "correct_choice_idx": 2, "direct_answers": ["sleeping", "sleeping", "sleeping", "sleeping", "beds", "sleeping", "sleeping", "sleeping", "beds", "sleeping"], "difficult_direct_answer": false, "rationales": ["They are twin beds.", "The areas are for sleeping.", "The mattress is covered in a sheet for sleeping."], "image": "val2014/COCO_val2014_000000309889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372514, "question_id": "iNWe6qhv85XsWmQA5SZabd", "question": "What food is frying in the oil?", "choices": ["hot dogs", "fritters", "donuts", "hamburgers"], "correct_choice_idx": 2, "direct_answers": ["donuts", "donuts", "donuts", "donuts", "donuts", "donut", "donuts", "buns", "doughnuts", "donuts"], "difficult_direct_answer": false, "rationales": ["Donuts are frying.", "The donuts are seen evident by the shape.", "Donuts are in the oil."], "image": "train2014/COCO_train2014_000000372514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402249, "question_id": "iNeS5w9fbY6KetSoJRaSUn", "question": "What kind of broad category tattoos she has?", "choices": ["decorative", "pictorial", "grand", "symbolic"], "correct_choice_idx": 1, "direct_answers": ["black white", "asian", "hand", "henna", "symbol", "florals", "tribal", "pictorial", "tribal", "dermal"], "difficult_direct_answer": true, "rationales": ["That is what the tattoos are from.", "Pictorial tattoos show a drawn object.", "The tattoos are just pictures."], "image": "train2014/COCO_train2014_000000402249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134778, "question_id": "iNfg259Ygpx7kG4uUVK2Pp", "question": "What color will these birds become?", "choices": ["white", "bright pink", "black", "green"], "correct_choice_idx": 1, "direct_answers": ["bright pink", "pink", "pink", "pink", "pink", "grey", "pink", "pink", "grey", "pink"], "difficult_direct_answer": false, "rationales": ["These birds look like flamingos. and if they are flamingos turn pink at some point due to their diet.", "The color is pink.", "Although flamingos are born gray, they change color due to the beta-carotene in the crustaceans and plankton that they eat."], "image": "val2014/COCO_val2014_000000134778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447253, "question_id": "iNiHTR6aktkRwoe8XMvxHZ", "question": "What is the person carrying the tray most likely doing with the items?", "choices": ["selling", "exercising", "buying", "decorating"], "correct_choice_idx": 0, "direct_answers": ["selling", "serving", "selling", "selling", "serving", "serving", "serving guests", "selling them", "serving", "selling"], "difficult_direct_answer": false, "rationales": ["She has several of the same items as if selling", "The person wants to sell his wares.", "The man carrying the tray is selling the items on it."], "image": "train2014/COCO_train2014_000000447253.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266852, "question_id": "iNnKiowf9RbUrsqvsa3JaL", "question": "What kind of birds are these?", "choices": ["flamingos", "peacocks", "crows", "boobies"], "correct_choice_idx": 0, "direct_answers": ["flamingos", "flamingo", "flamingos", "flamingos", "flamingo", "flamingo", "flamingos", "flamingo", "flamingo", "flamingos"], "difficult_direct_answer": false, "rationales": ["The birds are tall and have pink feathers, curved necks, and thin legs.", "Flamingos have pink feathers", "The birds are flamingoes."], "image": "val2014/COCO_val2014_000000266852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108282, "question_id": "iNwEqy4V8zUZZfdpFsp2E2", "question": "What segment of this food is this woman eating right now?", "choices": ["quart", "slice", "half", "dozen"], "correct_choice_idx": 1, "direct_answers": ["pizza piece", "slice", "pizza", "first segment", "tip", "tip", "slice", "slice", "slice", "pizza"], "difficult_direct_answer": false, "rationales": ["This woman is eating a slice of pizza.", "She's eating a piece of pizza", "The woman is eating pizza which is cut into triangle sections."], "image": "train2014/COCO_train2014_000000108282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268570, "question_id": "iP4Z4zeAvXzjgkcnC4MSKS", "question": "What type of furniture is the cat on?", "choices": ["chair", "bookcase", "table", "bed"], "correct_choice_idx": 0, "direct_answers": ["chair", "chair", "chair", "chair", "backpack", "chair", "luggage", "chair", "chair", "chair"], "difficult_direct_answer": false, "rationales": ["The cat is sitting in a chair.", "The type is a chair.", "The place the cat is on looks like a chair and has an armrest made of wood like a lot of chairs have."], "image": "train2014/COCO_train2014_000000268570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52222, "question_id": "iP6qCqqcUbfXvCEe5PFa5s", "question": "What are bricks mostly made of?", "choices": ["sand", "clay", "straw", "rock"], "correct_choice_idx": 1, "direct_answers": ["clay", "clay", "clay", "sand", "cement", "racing", "clay", "clay", "clay", "clay"], "difficult_direct_answer": false, "rationales": ["The bricks get their color from this material.", "Brick is made out of baked clay.", "Traditional ones also included b, c and d, but modern ones are primarily a."], "image": "train2014/COCO_train2014_000000052222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515056, "question_id": "iP8ywuyPEvG8sfvGkyfDv9", "question": "How many people can be seen?", "choices": ["three", "four", "six", "five"], "correct_choice_idx": 2, "direct_answers": ["six", "six", "six", "five", "six", "six", "seven", "five", "seven", "5-6"], "difficult_direct_answer": false, "rationales": ["There are 6.", "There are two sets of three people each.", "A half dozen people can be seen."], "image": "val2014/COCO_val2014_000000515056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408528, "question_id": "iPJvQicQMyCGWG3kRmhgdX", "question": "What blends things in the green based glass pitcher?", "choices": ["wheel turning", "electric motor", "poodles", "solar energy"], "correct_choice_idx": 0, "direct_answers": ["blades", "smoothie", "blades", "blender", "blender", "wheel turning", "blender", "metal", "blades", "blender"], "difficult_direct_answer": false, "rationales": ["A blender is on a table on a patio.", "There is a bicycle.", "A cord can be seen coming from a green blender on a table."], "image": "train2014/COCO_train2014_000000408528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322197, "question_id": "iPN5pvzH4Xos6thDUyueGt", "question": "The colors of this vehicle resemble which flag?", "choices": ["belarus", "russia", "argentina", "india"], "correct_choice_idx": 2, "direct_answers": ["blue yellow", "ukraine", "sweden", "ukraine", "sweden", "ukraine", "ukraine", "argentina", "argentina", "ukraine"], "difficult_direct_answer": false, "rationales": ["Argentina's country colors are light blue, white and yellow.", "The plane has a white, blue, and yellow livery. the livery does not contain red, orange, or green.", "It has blue and yellow that are the same color as their country flag."], "image": "train2014/COCO_train2014_000000322197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174157, "question_id": "iPgXd7gUvt2dtqeH9FLrWV", "question": "What are the two red objects on top of the sandwich?", "choices": ["red peppers", "tomatoes", "toothpick tops", "ketchup spots"], "correct_choice_idx": 2, "direct_answers": ["lobster toothpicks", "lobster claws", "toothpicks", "peppers", "toothpicks", "claw toothpicks", "tomatoes", "toothpick tops", "tomatoes", "toothpicks"], "difficult_direct_answer": false, "rationales": ["The toothpicks hold the sandwich together.", "The objects are pointed and are being used to keep the sandwich together.", "The objects go through the entire sandwich, and are probably used to keep it together."], "image": "train2014/COCO_train2014_000000174157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226634, "question_id": "iPhqRDsW89MVR2JHvem6vC", "question": "When driving down this street when getting to Kenmore street which direction turn is allowed?", "choices": ["none", "any", "left", "right"], "correct_choice_idx": 2, "direct_answers": ["left", "left", "left", "right", "left", "left", "left turn", "left turn", "left", "left"], "difficult_direct_answer": false, "rationales": ["There is a one way sign pointing left.", "The turn is leftward.", "The street sign says to go left."], "image": "val2014/COCO_val2014_000000226634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576896, "question_id": "iQ43msw6NZ2Cjsr66PZYd4", "question": "What does the woman here do with her kite?", "choices": ["markets it", "flies it", "nothing", "boxes it"], "correct_choice_idx": 1, "direct_answers": ["flies it", "flies it", "fly it", "fly it", "control it", "fly it", "fly kit", "fly", "fly it", "fly it"], "difficult_direct_answer": false, "rationales": ["The kites are up in the air and she has string in her hands", "She is holding a string that is attached at the other end to a kite which is high above her in the sky.", "The woman is flying her kite."], "image": "train2014/COCO_train2014_000000576896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519899, "question_id": "iQPhQmQz4QsSeYgipVjP8j", "question": "What biological class do the animals in the water belong to?", "choices": ["diplopoda", "aves", "maxillopoda", "mammalia"], "correct_choice_idx": 1, "direct_answers": ["avian", "birds aves", "fowl", "bird", "duck", "water birds", "birds", "birds", "aves", "aves"], "difficult_direct_answer": false, "rationales": ["This is their classification", "The animals visible are ducks and the biological class is internet searchable for ducks.", "The animals that are in the water are birds."], "image": "train2014/COCO_train2014_000000519899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182956, "question_id": "iQY9kvU4cy5CaUwjTKu2yg", "question": "In which room does this man stand?", "choices": ["kitchen", "men's room", "ladies room", "bedroom"], "correct_choice_idx": 1, "direct_answers": ["bathroom", "washroom", "washroom", "bathroom", "bathroom", "men's room", "bathroom", "bathroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["Due to the toilet with bar style paraphernalia, handicapped accommodation present and the fact that a male is pictured we can conclude this is a man's bathroom.", "The room is for men.", "He is standing in front of a toilet."], "image": "train2014/COCO_train2014_000000182956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25996, "question_id": "iQstBsTAShVWCX48XPtfs9", "question": "What motion makes the child smile?", "choices": ["nodding", "earthquake", "rocking", "sliding"], "correct_choice_idx": 3, "direct_answers": ["sliding", "sliding", "sliding", "slide", "sliding", "pulling", "tipping", "sliding", "sliding", "waving"], "difficult_direct_answer": false, "rationales": ["The child is sliding down the hill on a sled.", "The baby is sitting a sleigh and the adults are pushing him back and forth. p", "A child is on a flat object in the street. a person is pushing the object the child is sitting on. the child is smiling."], "image": "train2014/COCO_train2014_000000025996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313526, "question_id": "iR5mnnNQRRkCyoMA9kWFht", "question": "What type of socks is the little girl wearing?", "choices": ["anklets", "winter socks", "knee socks", "compression socks"], "correct_choice_idx": 0, "direct_answers": ["white", "anklets", "tube", "anklets", "ankle socks", "white", "ankle", "ankle socks", "sweat", "white socks"], "difficult_direct_answer": false, "rationales": ["The socks are low.", "A girl is on a skateboard and has black shoes with short socks.", "The socks go up to the little ankle."], "image": "train2014/COCO_train2014_000000313526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28782, "question_id": "iR8TtQsApUv9SAEqf4cUZx", "question": "What is the proper name for this style of eating?", "choices": ["picnic", "brunch", "party", "soiree"], "correct_choice_idx": 0, "direct_answers": ["picnic", "alfresco", "al fresco", "outdoor eating", "handheld", "al fresco", "casual picnic", "picnic", "picnic", "picnic"], "difficult_direct_answer": false, "rationales": ["They are on a blanket in some grass outside", "They are having a picnic.", "This is a picnic on a blanket on the grass."], "image": "train2014/COCO_train2014_000000028782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391496, "question_id": "iRJ54TK88Mjd9m73be3KR8", "question": "Where is this table located at?", "choices": ["dinning room", "backyard", "restaurant", "patio"], "correct_choice_idx": 3, "direct_answers": ["outside", "outside", "window", "outside", "patio", "restaurant", "outside", "window", "outside", "outside"], "difficult_direct_answer": false, "rationales": ["Looks like it's on an outdoor porch.", "There is a bird on the table.", "The table is on an outdoor area."], "image": "train2014/COCO_train2014_000000391496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181013, "question_id": "iRYrndSfADgQSLEXfuKtPQ", "question": "What does the player with the frisbee want to do with it?", "choices": ["sell it", "fling it", "pass it", "eat it"], "correct_choice_idx": 1, "direct_answers": ["throw it", "throw it", "throw", "fling it", "throw it", "throw it", "throw", "throw it", "throw", "throw it"], "difficult_direct_answer": false, "rationales": ["His arms are in the throwing position and the other person is trying to grab it", "The person with the frisbee appears to be winding up to pass the frisbee on as the intention of the game would dictate.", "His arm is bent and he is ready to throw it."], "image": "val2014/COCO_val2014_000000181013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125971, "question_id": "iRmPkXHuD7EHZsrKYoExLB", "question": "Which Hot Dog topping here is longest?", "choices": ["pickle", "onion", "tomato", "relish"], "correct_choice_idx": 0, "direct_answers": ["pickle", "pickles", "bun", "cheese", "hot dog", "pickle", "pickle", "pickle", "pickle", "bell pepper"], "difficult_direct_answer": false, "rationales": ["The hot dog is chicago style.", "The topping is a pickle.", "The pickle is nearly the length of the entire bun and equal to the size of the hot dog; none of the other toppings are as lengthy."], "image": "val2014/COCO_val2014_000000125971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505733, "question_id": "iSEMPZmS2pXmJ8gKgGneRz", "question": "What is the name of the birds pictured above?", "choices": ["eagles", "peacocks", "flamingoes", "ostriches"], "correct_choice_idx": 3, "direct_answers": ["ostrich", "ostriches", "ostrich", "ostrich", "ostrich", "ostriches", "emu", "ostriches", "ostrich", "ostrich"], "difficult_direct_answer": false, "rationales": ["Ostriches are huge birds.", "The name is an ostrich.", "Tall birds with long legs and necks are walking in grass."], "image": "val2014/COCO_val2014_000000505733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530508, "question_id": "iT6SNkdyDjCthsLT55zVbN", "question": "What type of hot dog is on the plate?", "choices": ["chicago dog", "chili dog", "plain dog", "foot long"], "correct_choice_idx": 1, "direct_answers": ["chili dog", "half eaten", "chili dog", "chili", "chili dog", "nu-way", "chili", "chili dog", "chili dog", "chili dog"], "difficult_direct_answer": false, "rationales": ["The hot dog is covered in brown chili.", "It has ground beef on it.", "The dog has chili on it."], "image": "train2014/COCO_train2014_000000530508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432424, "question_id": "iTRFzkcoJjGEPXNQKn2LHU", "question": "What type of gathering is this?", "choices": ["reception", "meeting", "rehearsal", "shower"], "correct_choice_idx": 1, "direct_answers": ["meeting", "gaming/coding", "work meeting", "study group", "computer geek", "coding party", "work", "social", "work event", "lan party"], "difficult_direct_answer": true, "rationales": ["The gathering is a meeting.", "This is done in an informal place. the people are meeting together.", "The people are having a working meeting."], "image": "train2014/COCO_train2014_000000432424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226967, "question_id": "iTSddAzoJPgdDioujtQKbk", "question": "What is the man wearing the headset most likely carrying?", "choices": ["laptop", "briefcase", "camera", "tablet"], "correct_choice_idx": 2, "direct_answers": ["camera", "cameraman", "camera", "cameraman", "camera", "camera", "camera recorder", "camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["We can see the top of an eyepiece in the device the headset-wearing man holds and can conclude it is a camcorder to record this tennis match.", "The man is holding a camera on him.", "The man is holding a camera."], "image": "val2014/COCO_val2014_000000226967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400862, "question_id": "iTTuasZgzT8HLS5U5MDYFc", "question": "Why is the man grabbing the other man's collar?", "choices": ["threatening him", "tying sleeves", "fighting him", "tying tie"], "correct_choice_idx": 3, "direct_answers": ["adjusting tie", "tying tie", "fixing tie", "tying tie", "tying tie", "fixing tie", "tying tie", "tying tie", "tieing tie", "tying necktie"], "difficult_direct_answer": false, "rationales": ["The man is putting the tie on.", "He is helping him out.", "The man wants to help tie the other man's tie."], "image": "train2014/COCO_train2014_000000400862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263359, "question_id": "iTUwKCpLC8SGKg6bYF2nkj", "question": "What do these horseback riders ride along?", "choices": ["desert", "ocean", "city", "streambed"], "correct_choice_idx": 3, "direct_answers": ["trail", "pond", "bushes", "trail", "trail", "field", "trail", "streambed", "lake", "trail"], "difficult_direct_answer": false, "rationales": ["It is a natural ditch full of grass and stones, with tall reeds along the edge.", "To the right of the riders there is a edge to the tall vegetation, a downward slope and visible rocks and stones with lush green vegetation below. those elements in combination are consistent with answer a.", "There is a line of water by the horses."], "image": "val2014/COCO_val2014_000000263359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565312, "question_id": "iU46Dbkq6DYsbBfC5ug5Lm", "question": "Who plays the same sport?", "choices": ["serena williams", "otis nixon", "alex morgan", "danica patrick"], "correct_choice_idx": 0, "direct_answers": ["tennis player", "andre agassi", "serena williams", "serena williams", "serena williams", "serena", "venus williams", "serena williams", "venus williams", "serena williams"], "difficult_direct_answer": false, "rationales": ["The girl is holding a tennis racquet, not a baseball bat, steering wheel, or soccer ball.", "Serena plays tennis.", "Serena plays tennis."], "image": "train2014/COCO_train2014_000000565312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353260, "question_id": "iUPK8fCPcp23axxbSyGeVw", "question": "Who are the roads for?", "choices": ["drivers", "pedestrians", "downtown", "directions"], "correct_choice_idx": 0, "direct_answers": ["cars", "drivers", "drivers", "travellers", "cars", "everyone", "drivers", "vehicles", "drivers", "executive car"], "difficult_direct_answer": false, "rationales": ["There are no sidewalks, so they are not for pedestrians. vehicles are occupying the roads.", "They are for people driving on them.", "All of the people in the cars are drivers."], "image": "train2014/COCO_train2014_000000353260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533508, "question_id": "iUVYbBfQ69VBM2aHZ769qb", "question": "How do you describe those people going inside the building?", "choices": ["judges", "medical workers", "religious people", "politicians"], "correct_choice_idx": 2, "direct_answers": ["religious people", "tourist", "learners", "church", "tourists", "rich", "christians", "tourists", "church goers", "parishioners"], "difficult_direct_answer": true, "rationales": ["The place is a church so most of the people who frequent the building would probably be considered religious.", "The building looks like it is a church.", "They are religious people."], "image": "val2014/COCO_val2014_000000533508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96973, "question_id": "iVdqbU75GaR4V4DnsHH9v4", "question": "Inside the covered pot rests what?", "choices": ["cocoa", "cider", "tea", "coffee"], "correct_choice_idx": 2, "direct_answers": ["tea", "tea", "tea", "tea", "tea", "tea", "tea", "tea", "tea", "tea"], "difficult_direct_answer": false, "rationales": ["The pot has tea.", "It is a teapot", "Some places prefer an herbal warm beverage instead of coffee."], "image": "train2014/COCO_train2014_000000096973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270316, "question_id": "iVjcgcTzpAjqzMzoCKNqLw", "question": "What temperature items might be carried by this truck?", "choices": ["cold", "all", "room", "none"], "correct_choice_idx": 0, "direct_answers": ["ice cream", "freezing", "frozen", "cold", "cold", "food", "unknown", "cold", "cold/frozen", "below zero"], "difficult_direct_answer": false, "rationales": ["A refrigerated truck can keep things at a low temperature.", "This truck has a freezer.", "This truck is a freezer to keep things frozen."], "image": "val2014/COCO_val2014_000000270316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210604, "question_id": "iVqDUaMePTfQgcosyPuP2v", "question": "What allows this toy to fly?", "choices": ["lift", "fan", "string", "battery"], "correct_choice_idx": 0, "direct_answers": ["air momentum", "wind", "wind", "aerodynamics", "lift", "wind", "air pressure", "lift", "aerodynamics", "inertia"], "difficult_direct_answer": false, "rationales": ["Frisbees have to have air.", "The frisbee uses lift in the air.", "The lift allows it to fly."], "image": "train2014/COCO_train2014_000000210604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360189, "question_id": "iVsjPVDy8wXgcwwFi2hPMn", "question": "If you are crossing the street what should you do at this point?", "choices": ["lay down", "turn around", "run", "stop"], "correct_choice_idx": 2, "direct_answers": ["stop", "wait", "look", "look left", "wait", "wait", "wait", "wait", "wait", "run"], "difficult_direct_answer": false, "rationales": ["The sign indicates only three seconds left before crossing should cease. when the number reaches zero, traffic will resume, and it will be unsafe to be in the street.", "According to the display, the light is about to change, meaning it is prudent to clear the intersection as fast as possible.", "The red hand means to not walk and the 3 indicates the number of seconds until a pedestrian should walk."], "image": "train2014/COCO_train2014_000000360189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539233, "question_id": "iW8cNLCm2ayc657cFShQgy", "question": "What does the user of this room do apart from working on the laptop?", "choices": ["cooking", "workout", "raising animals", "sleeping"], "correct_choice_idx": 3, "direct_answers": ["work", "sleeping", "sleep", "love cats", "sleep", "sleep", "sleep", "sleep", "chilling", "plays cats"], "difficult_direct_answer": false, "rationales": ["The couch in the back has two blankets which are used for resting.", "The room has office equipment and a bed that is used for sleeping.", "There is a bed there."], "image": "train2014/COCO_train2014_000000539233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74759, "question_id": "iW94VDDEntV2xYyDrdRrcZ", "question": "What will the man do to the elephants with the sticks he holds?", "choices": ["kill them", "poke them", "mesmerize them", "feed them"], "correct_choice_idx": 1, "direct_answers": ["stroke them", "guide them", "hit", "tap them", "tap them", "poke them", "signal them", "abuse", "whip them", "hit them"], "difficult_direct_answer": true, "rationales": ["He will do this to tell them what to do next", "The man is a trainer.", "They will nudge them with the sticks to get them to do what they want."], "image": "val2014/COCO_val2014_000000074759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37527, "question_id": "iWLhLpy2YXdc4m3bvN6gFQ", "question": "Where is the man?", "choices": ["garage", "wine store", "garden", "stadium"], "correct_choice_idx": 1, "direct_answers": ["wine cellar", "wine store", "wine store", "winery", "bar", "alcohol store", "winery", "bar", "winery", "restaurant bar"], "difficult_direct_answer": false, "rationales": ["The man is selling some wine since there are bottles behind him.", "The man has alcohol behind him.", "The man is at a wine store."], "image": "train2014/COCO_train2014_000000037527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439539, "question_id": "iWgaEc3484MNmQc5CPPPxP", "question": "Where is the owner of the backpack?", "choices": ["church", "home", "bathroom", "skateboarding"], "correct_choice_idx": 3, "direct_answers": ["skating", "gone", "skateboarding", "skating", "skating", "behind camera", "skateboarding", "skateboarding", "skateboarding", "street"], "difficult_direct_answer": false, "rationales": ["The person is on a skateboard.", "He has is stuff pushed to the side to not let it get in his way while he's skating.", "The owner of the backpack put his stuff down so he could skateboard."], "image": "train2014/COCO_train2014_000000439539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191828, "question_id": "iXCvs2n2XmcXPpANNWqV94", "question": "What fun activity is shown?", "choices": ["snow boarding", "free fall", "bumper cars", "rollar coaster"], "correct_choice_idx": 0, "direct_answers": ["snowboarding", "snowboarding", "snowboarding", "snowboarding", "scatting", "scatting", "snow boarding", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["The people are gearing up to ride snowboards.", "They are snow boarding.", "They are at a lodge and there is snow"], "image": "train2014/COCO_train2014_000000191828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384346, "question_id": "iXHqJUk62QmNcNpzANSrri", "question": "What colour is the man's shirt underneath his vest?", "choices": ["pink", "red", "yellow", "blue"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The looks shows the yellow color.", "It is yellow.", "The color is yellow."], "image": "train2014/COCO_train2014_000000384346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214326, "question_id": "iXLH3QC4fzoxuEG9QexeFv", "question": "What is the woman holding to her ear?", "choices": ["cell phone", "headphones", "stereo", "walkie talkie"], "correct_choice_idx": 0, "direct_answers": ["phone", "phone", "phone", "cell phone", "phone", "cell phone", "phone", "phone", "phone", "cell phone"], "difficult_direct_answer": false, "rationales": ["It is the flip type of device", "She has a phone.", "The woman is talking on the phone."], "image": "train2014/COCO_train2014_000000214326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206309, "question_id": "iXLWJFJkqbn6twnAp3ViZU", "question": "What does the NZ on the microphone stand for?", "choices": ["neutral zone", "national-zeitung", "net zero", "new zealand"], "correct_choice_idx": 3, "direct_answers": ["new zealand", "new zealand", "news zone", "new zealand", "new zealand", "new zealand", "new zealand", "news zone", "new zealand", "new zealand"], "difficult_direct_answer": false, "rationales": ["This stands for the country they are from.", "The country is nz so it would stand to reason.", "The nz is new zealand."], "image": "train2014/COCO_train2014_000000206309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48126, "question_id": "iXxFovQuQhAmVZeLCzDaZp", "question": "Which food is deadly to this animal?", "choices": ["cheese", "chocolate", "peanut butter", "milk"], "correct_choice_idx": 1, "direct_answers": ["chocolate", "chocolate", "chocolate", "chocolate", "dark chocolate", "onions", "chocolate", "chocolate", "onions", "chocolate"], "difficult_direct_answer": false, "rationales": ["The animal is a dog and chocolate is dangerous for dogs.", "The animal resting it's head on this man's shoulder is a dog. most dog owners know that they should never be fed chocolate.", "Chocolate is poisonous to dogs."], "image": "train2014/COCO_train2014_000000048126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99908, "question_id": "iY84vQhx8v6tBnigoktnRS", "question": "Why is the car stopping?", "choices": ["animal crossing", "broke down", "flat tire", "accident"], "correct_choice_idx": 0, "direct_answers": ["cattle crossing", "cows", "cows crossing", "animal crossing", "cow crossing", "cows crossing", "cow", "cows crossing", "for cows", "animal crossing"], "difficult_direct_answer": false, "rationales": ["Cows are blocking the road. there is nothing wrong with the car.", "The car is letting the cows cross.", "There are cows walking across the road in front of the car."], "image": "val2014/COCO_val2014_000000099908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20184, "question_id": "iYDCRWHDcFYvsUw4hRkq5E", "question": "What level is this ski course catering to?", "choices": ["veterans", "advanced", "mid tier", "beginners"], "correct_choice_idx": 3, "direct_answers": ["children", "beginner", "beginners", "beginner", "beginners", "beginners", "beginner", "beginners", "beginner", "beginner"], "difficult_direct_answer": false, "rationales": ["The slope is flat and thus geared for beginners like the child.", "The slope is clear and the skiers on it are all young. the slope also is not very steep which is consistent with a trail meant for inexperienced skiers.", "The level is a beginner."], "image": "train2014/COCO_train2014_000000020184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163571, "question_id": "iYLZBypLSSocCWguATmPse", "question": "Why is there a car poster on the building?", "choices": ["window cover", "advertisement", "decoration", "missing poster"], "correct_choice_idx": 1, "direct_answers": ["car sales", "dealership", "travelling", "advertisement", "dealership", "selling cars", "dealership", "advertising", "car selling", "dealership"], "difficult_direct_answer": false, "rationales": ["The poster on the building is to display one of the models that someone can buy if they are shopping for a car.", "It is an ad for the car.", "It is a store or a convention center showing cars for sale or auction because of the tall, sleek-looking building."], "image": "train2014/COCO_train2014_000000163571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408789, "question_id": "iYdRFr8SdneoBeRHeYQSq2", "question": "Who is the man on the tv?", "choices": ["john cena", "john mccain", "john stewart", "john stamos"], "correct_choice_idx": 1, "direct_answers": ["politician", "interviewee", "john mccain", "john mccain", "joe biden", "john mccain", "john mccain", "john mccain", "mike pence", "mccain"], "difficult_direct_answer": false, "rationales": ["The man is mccain.", "The man is mccain.", "John mccain is visible on television based on the characteristics and profile of the face and complexion."], "image": "val2014/COCO_val2014_000000408789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495051, "question_id": "iYeVS2nZ6YBWPbuYNJvJhm", "question": "Why has the man covered his head?", "choices": ["style", "keep cool", "protection", "uniform"], "correct_choice_idx": 1, "direct_answers": ["religious attire", "sunny day", "heat", "sun protection", "sun protection", "hot", "keep cool", "it's hot", "not overheat", "sun protection"], "difficult_direct_answer": false, "rationales": ["The man has a cloth over his head.", "He put the towel over his head to keep it cooler.", "The man appears to be in a hot place judging by the background and the presence of this type of elephant. when in a hot pace one might cover their head to get out of the sun and cool down."], "image": "train2014/COCO_train2014_000000495051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172088, "question_id": "iYzy6temdnGrDkzZkQrZL2", "question": "What is the player's facial expression?", "choices": ["disgusted", "focused", "angry", "painful"], "correct_choice_idx": 1, "direct_answers": ["surprise", "tennis", "determined", "focused", "strained", "intensity", "determination", "effortful", "tense", "determined"], "difficult_direct_answer": true, "rationales": ["The man is determined.", "The expression is focused.", "The woman is aimed on getting the ball."], "image": "val2014/COCO_val2014_000000172088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85179, "question_id": "iZCJzxeSXuxDCXzdNSb7Dz", "question": "What do the people do when the fast moving thing stops?", "choices": ["exercise", "cook", "swim", "board it"], "correct_choice_idx": 3, "direct_answers": ["get inside", "get on", "get on", "get onboard", "get on", "board", "get on", "board it", "board", "board it"], "difficult_direct_answer": false, "rationales": ["They have luggage and are standing near the track", "The fast-moving thing is a train that the people are waiting for it to come to a stop so they can get on.", "The people are waiting for a train. they will get on the train when it stops."], "image": "train2014/COCO_train2014_000000085179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288872, "question_id": "iZTidD7u5nZPZ3wTBSDWpm", "question": "What kind of hairstyle is the man sporting?", "choices": ["mohawk", "pompadour", "dreadlocks", "liberty spikes"], "correct_choice_idx": 2, "direct_answers": ["dreadlocks", "normal", "dreadlocks", "twists", "clean cut", "dredlocks", "normal", "dreads", "shot dreadlock", "dreadlocks"], "difficult_direct_answer": false, "rationales": ["I've seen and even written about this hairstyle before.", "The man is wearing his hair in dreadlocks.", "His hair is fashioned to be rope-like and braided."], "image": "train2014/COCO_train2014_000000288872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241860, "question_id": "iZWdB3SKhPb7bNDPiWoDgn", "question": "What is the style of the two chairs in front of the laptop?", "choices": ["traditional", "art deco", "rustic", "mid century"], "correct_choice_idx": 3, "direct_answers": ["office chair", "lounge", "modern", "office", "office", "arm chairs", "office", "square", "mid century", "metal"], "difficult_direct_answer": false, "rationales": ["The chairs are from modern days.", "The style is mid century.", "The chairs in front of the laptop have an art deco look since they're modernistic."], "image": "train2014/COCO_train2014_000000241860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78555, "question_id": "iZZ3CGcuqLicC68sHw9958", "question": "Who likes to eat the orange item here?", "choices": ["pikachu", "popeye", "spongebob", "bugs bunny"], "correct_choice_idx": 3, "direct_answers": ["rabbits", "horses", "rabbits", "rabbit", "rabbits", "rabbits", "rabbits", "carrots", "many people", "bugs bunny"], "difficult_direct_answer": false, "rationales": ["Rabbits eat carrots", "That crazy wabbit was always eating them.", "Rabbits are often shown in cartoons eating and liking carrots. bugs bunny is a rabbit."], "image": "train2014/COCO_train2014_000000078555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132132, "question_id": "iZgEixJYNkTFW2yfdxEWeT", "question": "What job does the man behind the stand hold?", "choices": ["green grocer", "watch salesman", "driver", "butcher"], "correct_choice_idx": 0, "direct_answers": ["vendor", "cashier", "restocking", "cashier", "green grocer", "seller", "seller", "selling fruits", "cashier", "vendor"], "difficult_direct_answer": false, "rationales": ["The man is selling produce.", "The man sells groceries since he has fruit.", "He sells fruit."], "image": "val2014/COCO_val2014_000000132132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224349, "question_id": "ia9ibmp78GNdphiRDDZ2yp", "question": "What do the animals here have in common locationwise?", "choices": ["europe", "mexico", "desert", "ocean"], "correct_choice_idx": 3, "direct_answers": ["ocean", "beach", "sea creatures", "looking downwards", "ocean", "sea animals", "flying", "ocean animals", "kites", "on beach"], "difficult_direct_answer": true, "rationales": ["The kites resemble jellyfish and a penguin. penguins are aquatic birds and jellyfish are found in the ocean.", "They are all animals that would be found in the ocean.", "The animals are in the ocean."], "image": "train2014/COCO_train2014_000000224349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448826, "question_id": "iaZQDEdRxgYNDkWTQ8kMKP", "question": "What is the largest plane in the entire fleet owned by the airline sponsor?", "choices": ["a380", "878", "767", "747"], "correct_choice_idx": 0, "direct_answers": ["unknown", "emirates airline", "a380", "seven fourtyseven", "passenger jet", "767", "emirates", "emirates airline", "a380 super", "emirates"], "difficult_direct_answer": false, "rationales": ["The a380 is the largest airplane flown by this fleet's sponsor.", "The largest plane owned by emirates is the a380.", "It is the largest for emirates"], "image": "train2014/COCO_train2014_000000448826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130718, "question_id": "iaajyWA5nfFQSFpkn8FNr8", "question": "Why is the woman using an umbrella?", "choices": ["disguise", "sun", "snow", "rain"], "correct_choice_idx": 3, "direct_answers": ["rain", "rain", "rain", "keep dry", "raining", "raining", "keep dry", "water protection", "rain time", "raining"], "difficult_direct_answer": false, "rationales": ["The surfaces appear wet but not snowed on.", "The ground is wet", "The woman is using an umbrella because it is raining and she wants to stay dry."], "image": "train2014/COCO_train2014_000000130718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338552, "question_id": "iajUb824JdSiQ4SESrVaA9", "question": "What automobile companies logo can be seen on the flag?", "choices": ["toyota", "chevrolet", "honda", "ford"], "correct_choice_idx": 1, "direct_answers": ["chevrolet", "chevrolet", "chevrolet", "chevrolet", "chevrolet", "chevrolet", "chevrolet", "chevrolet", "chevrolet", "chevrolet"], "difficult_direct_answer": false, "rationales": ["It says the name beneath the logo", "Their name and logo are printed on the flag.", "Their logo and name are on the flag."], "image": "val2014/COCO_val2014_000000338552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570307, "question_id": "ibCnPPRhJi9MdUfex5nnc8", "question": "What is the girl trying to do with the horse?", "choices": ["trim it", "ride it", "fight it", "pull it"], "correct_choice_idx": 3, "direct_answers": ["pull", "feed water", "pull it", "pull it", "into water", "get wet", "walk", "coax movement", "pull horse", "pulling"], "difficult_direct_answer": true, "rationales": ["The girl has the horse on a leash and is trying to pull it into the water.", "The girl has a rope.", "The girl has the leash that she is using to pull the horse toward the water."], "image": "train2014/COCO_train2014_000000570307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238488, "question_id": "ibPkBTu2GfXo8t42Mz37Mi", "question": "What is the woman doing?", "choices": ["working", "checking emails", "taking calls", "watching tv"], "correct_choice_idx": 0, "direct_answers": ["watching tv", "computer work", "working", "watching news", "working", "checking emails", "working", "working", "watching television", "working"], "difficult_direct_answer": false, "rationales": ["The woman is sitting behind a laptop that has an excel type document up. both tools would be something frequently associated with answer a and she looks like she is really focused.", "The woman is working on her computer.", "She is monitoring the television show on her computer."], "image": "val2014/COCO_val2014_000000238488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355450, "question_id": "ibQiqNuj5mhbhfDTWTKBuw", "question": "What is behind the small table with the flowers?", "choices": ["baby", "glasses", "mirror", "cat"], "correct_choice_idx": 2, "direct_answers": ["mirror", "mirror", "mirror", "good looking", "wall", "mirror", "mirror", "mirror", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["The mirror is behind.", "That is what's on the wall behind the table.", "Given the surface behind the flowers reflecting back the room it's housed in we can conclude a mirror is there."], "image": "val2014/COCO_val2014_000000355450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395271, "question_id": "ibaxaU7wayPjXXMhHa7zQy", "question": "What is the most popular pizza cheese?", "choices": ["cheddar", "mozzarella", "gouda", "american"], "correct_choice_idx": 1, "direct_answers": ["mozzarella", "mozzarella", "cheese pepperoni", "mozzarella", "mozzarella", "mozzarella", "mozzarella", "mozzarella", "cheese", "mozzarella"], "difficult_direct_answer": false, "rationales": ["Mozzarella is used.", "Mozzarella is what italians use on pizza.", "The man is holding a pizza that is topped with mozzarella cheese."], "image": "train2014/COCO_train2014_000000395271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475889, "question_id": "ibpz4DXvmW4mwN9cWvMSp2", "question": "On what is the meat for this group prepared?", "choices": ["butcher rack", "no where", "grill", "microwave"], "correct_choice_idx": 2, "direct_answers": ["hot dogs", "grill", "pulled pork", "grill", "grill", "grill", "grill", "barbecue grill", "grill", "grill"], "difficult_direct_answer": false, "rationales": ["Hamburgers are grilled.", "The hotdogs are being prepared outside which most likely is a barbecue and the meats here are always grilled.", "The people are in an outdoor area allowing them to use a more heavy smoke cooking device to give the meat more flavor."], "image": "val2014/COCO_val2014_000000475889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316438, "question_id": "icMVuxumLKyL6zEDofAwDA", "question": "Why is the pizza cut into small pieces?", "choices": ["looks good", "easier eating", "to trash", "see easier"], "correct_choice_idx": 1, "direct_answers": ["serving ease", "easier eating", "eating", "bite sized", "eating ease", "to eat", "to eat", "bite sized", "portioned", "easier eating"], "difficult_direct_answer": false, "rationales": ["This serving style is consistent with the food present. because of the thinness of the crust it is likely large slices would bend and be difficult to insert into one's mouth.", "The pizza as a whole is difficult to fit in a person's mouth without being cut up.", "The small pieces make the food easier to pick up."], "image": "val2014/COCO_val2014_000000316438.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395531, "question_id": "icQnnhzRF2H8XYV8jkozQk", "question": "What does the stuffed item here appear to wear?", "choices": ["buddha", "backpack", "bear suit", "nothing"], "correct_choice_idx": 1, "direct_answers": ["backpack", "backpack", "backpack", "bag", "bag", "backpack", "backpack", "bag", "backpack", "backpack"], "difficult_direct_answer": false, "rationales": ["The stuffed animal panda has a knitted backpack shaped structure coming off it's shoulder.", "The garment being worn is a bag attached over the shoulders with straps. this resembles a backapack.", "The item has straps that hug its arms and sit on its back so it looks like a backpack."], "image": "val2014/COCO_val2014_000000395531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207763, "question_id": "icU5CKRrRBkcrsfBkiSM6A", "question": "What is the person on the right holding in the air?", "choices": ["leg", "fish", "apple", "kite"], "correct_choice_idx": 0, "direct_answers": ["leg", "baby", "hand", "frisbee", "her hand", "leg", "right hand", "her hand", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The girl is in mid-kick position.", "The person is holding one leg in the air.", "The is kicking her leg."], "image": "val2014/COCO_val2014_000000207763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64310, "question_id": "icaWoZhywTGRxUAogiiH5P", "question": "Why does he have two monitors?", "choices": ["has extra", "is confused", "stole one", "more work"], "correct_choice_idx": 3, "direct_answers": ["for work", "more displays", "increased productivity", "more information", "computer tech", "business", "tech worker", "more work", "productivity", "work"], "difficult_direct_answer": true, "rationales": ["The man can be more efficient with work.", "One screen is not enough to complete their work one one screen.", "With two monitors you would be more efficient."], "image": "train2014/COCO_train2014_000000064310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56145, "question_id": "icibkdYCRuS268yLBYzZuB", "question": "Why is the hill above the sea brown with furrows?", "choices": ["sports field", "rodeo", "farm land", "housing development"], "correct_choice_idx": 2, "direct_answers": ["farm", "farm land", "agriculture", "growing crops", "farming", "farmland", "farming", "dirt", "plowed", "crops"], "difficult_direct_answer": true, "rationales": ["These types of markings are usually associated with planting crops.", "The area is for farming.", "The hill is a farm."], "image": "train2014/COCO_train2014_000000056145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482472, "question_id": "idXz4FpGrG7VgXghzFsmEZ", "question": "What used to be inside the barrels shown before they became planters?", "choices": ["milkshakes", "carrots", "wine", "burgers"], "correct_choice_idx": 2, "direct_answers": ["whiskey", "whiskey", "food", "beer", "wine", "vegetables", "antique things", "whiskey", "wine", "whiskey"], "difficult_direct_answer": false, "rationales": ["They are likely painted vino barrels or made to look like them.", "The wine is used.", "The planters were made from wine barrels."], "image": "train2014/COCO_train2014_000000482472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579739, "question_id": "idaoqhC5rGKBKk5edyTFSk", "question": "What period of the day is it in the image?", "choices": ["morning", "evening", "afternoon", "night"], "correct_choice_idx": 2, "direct_answers": ["afternoon", "afternoon", "morning", "afternoon", "afternoon", "afternoon", "daytime", "afternoon", "afternoon", "afternoon"], "difficult_direct_answer": false, "rationales": ["The hands on the clock show that the time is nearly two in the afternoon.", "The sun can be seen high in the sky such as in later afternoon.", "The clock says 1:34 and since it's not dark, it would be in the afternoon."], "image": "train2014/COCO_train2014_000000579739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571746, "question_id": "idqbaWh8dMDt9Zm9KT2sQy", "question": "What are the three people next to the train doing?", "choices": ["sledding", "running", "skiing", "rolling"], "correct_choice_idx": 0, "direct_answers": ["sledding", "sledding", "sledding", "sledding", "sledding", "sledding", "sledding", "sledding", "sledding", "sledding"], "difficult_direct_answer": false, "rationales": ["The people are going to sled down.", "The people are in sleds going down the trail.", "The three people are sledding since they're moving downhill in the snow."], "image": "val2014/COCO_val2014_000000571746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413666, "question_id": "idupRsDiBnkPu2DGXxRH9U", "question": "The skateboarders are skating in the park during which season of the year?", "choices": ["summer", "fall", "spring", "witner"], "correct_choice_idx": 1, "direct_answers": ["autumn", "summer", "summer", "fall", "summer", "fall", "fall", "fall", "summer", "fall"], "difficult_direct_answer": false, "rationales": ["There are leaves on the trees behind that are of an orange shade. this a a phenomena known to happen in fall.", "There is no snow on the ground, so it is not winter. there are dead leaves on the ground.", "People are wearing shorts however most have jeans on. the leaves are still on the trees, but the leaf colors are beginning to change."], "image": "val2014/COCO_val2014_000000413666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251754, "question_id": "idxu8CKbRJGJM4hGuQ3Szt", "question": "How are the special type of skis called?", "choices": ["narrow skis", "skinny skis", "small skis", "thin skis"], "correct_choice_idx": 1, "direct_answers": ["cross country", "pole skis", "racing skis", "carving skis", "skinny skis", "powder skiis", "snowblades", "cross-country skis", "cross-country skis", "nordic"], "difficult_direct_answer": true, "rationales": ["The skinnies skies are used to balance the man.", "They are skinnier than regular skis and that's where they get their name.", "The people standing on the snow are wearing skinny skis which are narrower than normal skis."], "image": "train2014/COCO_train2014_000000251754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548965, "question_id": "ie7eNzgrngKBEqJ9GuwQWx", "question": "Misty dawn is sailing under the flag of which country?", "choices": ["france", "uk", "italy", "us"], "correct_choice_idx": 3, "direct_answers": ["no flag", "sweden", "usa", "us", "usa", "united states", "america", "united states", "not known", "america"], "difficult_direct_answer": false, "rationales": ["The misty dawn is a us boat.", "It's under the usa.", "Misty dawn is under the usa flag."], "image": "train2014/COCO_train2014_000000548965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79957, "question_id": "ieMcAYsM5XpSVvfogvigT5", "question": "Why is the elephant's leg raised?", "choices": ["fighting", "relieved itself", "kicked ball", "deterring mosquitos"], "correct_choice_idx": 2, "direct_answers": ["done pooping", "defecating", "kicking ball", "doing trick", "kicking", "kicked ball", "kick", "stretching", "playing", "urinating"], "difficult_direct_answer": true, "rationales": ["The elephant is kicking the soccer ball.", "The ball is getting kicked.", "Based on the ball on the ground and the fact that the man is walking towards the elephant with a ball in hand--and also because the elephant's leg is in a \"kicking\" position--it can be concluded that the elephant leg is raised because it kicked the ball."], "image": "train2014/COCO_train2014_000000079957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477641, "question_id": "iehA9v3LsQ5QfDBytqpHbW", "question": "Why is the woman on the phone carrying a bag?", "choices": ["computer", "books", "camera", "pet"], "correct_choice_idx": 3, "direct_answers": ["carrying pet", "personal items", "travelling", "her dog", "carry clothes", "carrying pet", "stuff", "carry", "pet", "animal transport"], "difficult_direct_answer": true, "rationales": ["The woman is carrying her little pet in the shoulder bag.", "There is a dog inside the bag.", "The bag is for her dog."], "image": "train2014/COCO_train2014_000000477641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344025, "question_id": "iejgsuoxrLt6jy29mrtmBE", "question": "Which treat was most likely purchased instead of baked?", "choices": ["donuts", "all three", "brownies", "cookies"], "correct_choice_idx": 3, "direct_answers": ["oreos", "cookies", "oreo cookies", "cookies", "oreos", "sandwich cookies", "cookies", "oreos cookies", "oreo cookies", "oreo"], "difficult_direct_answer": false, "rationales": ["Some of the brownies are on a baking sheet in the background. the black and white treats appear to be store bought oreos.", "These are oreos", "Oreos aren't baked."], "image": "val2014/COCO_val2014_000000344025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512569, "question_id": "if29mMJrFxGVViTnexhrij", "question": "What is most likely to be added to this food item?", "choices": ["apples", "syrup", "milk", "jam"], "correct_choice_idx": 1, "direct_answers": ["syrup", "syrup", "maple syrup", "syrup", "maple syrup", "syrup", "syrup", "syrup", "syrup", "syrup"], "difficult_direct_answer": false, "rationales": ["Waffles use syrup.", "Syrup is most likely added to this food item.", "This food item is a waffle. people usually do not put jam, apples, or milk on waffles."], "image": "train2014/COCO_train2014_000000512569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283170, "question_id": "ifXCPS9RLgGvZA85hTGmEH", "question": "Which vehicle stuck in the intersection is in the most danger?", "choices": ["truck", "car", "bus", "motorcycle"], "correct_choice_idx": 3, "direct_answers": ["scooter", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "motorcycle", "scooter", "motorcycle", "motorcycle", "scooter"], "difficult_direct_answer": false, "rationales": ["The cyclist has less to protect him than the cars on the road.", "The vehicles are visible and identifiable based on their unique features. the least protective for the user is known to be a based on the structure of the vehicle and how the riders use it.", "Motorcycles have less protection."], "image": "train2014/COCO_train2014_000000283170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59138, "question_id": "ifiMXgVeMRuApea2kggUe4", "question": "What vehicle is he in?", "choices": ["boat", "car", "plane", "train"], "correct_choice_idx": 3, "direct_answers": ["subway", "train", "train", "train", "subway", "train", "train", "train", "subway", "subway"], "difficult_direct_answer": false, "rationales": ["The vehicle is a train.", "The way the passenger stands shows it is a train.", "The sliding doors and seating are common for subway cars"], "image": "train2014/COCO_train2014_000000059138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400332, "question_id": "ifuRk8DYDDK9oGKvCW2WJH", "question": "What does the man want to do with the ball?", "choices": ["throw it", "drop it", "hit it", "pocket it"], "correct_choice_idx": 0, "direct_answers": ["throw", "pitch", "pitch", "throw ball", "throw", "pitch it", "casing", "throw", "throw", "throw it"], "difficult_direct_answer": false, "rationales": ["He is pitching to a batter", "This persons body alignment and position indicate he is projecting the ball in a forward motion to a specified target.", "After catching the baseball he'll most likely throw it"], "image": "train2014/COCO_train2014_000000400332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514904, "question_id": "ig3YWMju4jueA9hCBwKCFj", "question": "What surrounds the sausages here?", "choices": ["buns", "barbeque sauce", "smiles", "chips"], "correct_choice_idx": 0, "direct_answers": ["buns", "buns", "bread", "buns", "buns", "bread", "bread", "bread", "bread", "buns"], "difficult_direct_answer": false, "rationales": ["The woman is holding sausages that are between bread buns.", "The sausages have buns.", "The sausages are wrapped in bread."], "image": "val2014/COCO_val2014_000000514904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100611, "question_id": "igRjBRv6Gbxkbzp6GMS2ii", "question": "What is contained inside the dark colored bottle?", "choices": ["juice", "wine", "beer", "soda"], "correct_choice_idx": 2, "direct_answers": ["cola", "beer", "wine", "beer", "beer", "beer", "wine", "wine", "beer", "champagne"], "difficult_direct_answer": false, "rationales": ["The person is eating a meal.", "Normally this type of drink comes in a darker bottle.", "The brand on the bottle is a well known brewer of alcoholic beverages."], "image": "train2014/COCO_train2014_000000100611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19236, "question_id": "igcCAXFCDyXcirPDp3RGJi", "question": "What great empire once ruled this land?", "choices": ["indian", "serbian", "mayan", "ottoman"], "correct_choice_idx": 3, "direct_answers": ["turkish", "mehmet", "constantine", "justinian", "ottoman", "julius caesar", "rome", "sev", "aztec", "ottoman"], "difficult_direct_answer": true, "rationales": ["The ottoman empire used to rule in this region of the world.", "Rules by ottoman before.", "The ottoman ruled the area."], "image": "train2014/COCO_train2014_000000019236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36915, "question_id": "igeov2yqx4DR7bhPuRvS6M", "question": "Where are the buildings that offer the most protection from a tsunami?", "choices": ["left", "right", "middle", "none"], "correct_choice_idx": 1, "direct_answers": ["tall ones", "inland", "right side", "on mountain", "tallest one", "skyscrapers", "back", "right", "off shore", "high structures"], "difficult_direct_answer": true, "rationales": ["The right building is sturdier.", "The buildings in the middle are tallest.", "The ones on the right are highest and therefore will protect from large waves. it will shield any buildings behind it and act as a buffer."], "image": "train2014/COCO_train2014_000000036915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115222, "question_id": "iggvCM5gFd9qCDov34dMsq", "question": "What make is the white vehicle parked near the curb behind the truck?", "choices": ["jeep", "minivan", "suv", "sedan"], "correct_choice_idx": 0, "direct_answers": ["jeep", "jeep", "jeep", "jeep", "jeep", "jeep", "jeep", "jeep", "jeep", "jeep"], "difficult_direct_answer": false, "rationales": ["The make is jeep.", "It has a removable top and large wheels", "The white vehicle has a removable black top. it also has a replacement tire on its back door."], "image": "val2014/COCO_val2014_000000115222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57515, "question_id": "ihDtvq8MvQrty8w4AwsJ7p", "question": "Why are the kids holding umbrellas?", "choices": ["playing games", "hailstorm", "rainstorm", "snow"], "correct_choice_idx": 0, "direct_answers": ["sunny", "sun protection", "sun", "playing", "shade sunlight", "playing games", "shade", "sun protection", "sun", "block sun"], "difficult_direct_answer": false, "rationales": ["The weather is not wet and it looks like they're pretending with it.", "The kids are playing.", "The little boy is shading the little girl while she plays."], "image": "train2014/COCO_train2014_000000057515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491169, "question_id": "ihJ5CXcQQWhzTSV74FTtcd", "question": "These men probably belong to what type of organization?", "choices": ["book club", "rider's club", "hiking club", "chess club"], "correct_choice_idx": 1, "direct_answers": ["motorcycle club", "harley angels", "motorcycle club", "motorbike", "bike club", "police", "riding club", "motorcycle gang", "biker club", "rider's club"], "difficult_direct_answer": true, "rationales": ["They are all driving motorcycles and wearing similar jackets suggesting they belong to the same club.", "These men likely have a riders club for the logo on their bikes.", "The men have motorcycles."], "image": "val2014/COCO_val2014_000000491169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347270, "question_id": "ihLZzvXoREuTZXVmg5vXgN", "question": "What good or service can be found in the glass kiosk with a light blue top to the left of the train?", "choices": ["news stand", "taxi kiosk", "public payphone", "valet"], "correct_choice_idx": 2, "direct_answers": ["vending machine", "public payphone", "rental cars", "ticketing", "tickets", "store", "phone", "baggage claim", "food", "tickets"], "difficult_direct_answer": true, "rationales": ["The object is glass and narrow, which appears to be similar to that of a phonebooth.", "There is a booth with a coin-operated communications device inside.", "There is a phone for people to use."], "image": "train2014/COCO_train2014_000000347270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213599, "question_id": "ihTc8QmZgcyYMKaBYkyKpt", "question": "What birthday are they most likely celebrating for the child?", "choices": ["seventh", "eighth", "sixth", "fifth"], "correct_choice_idx": 3, "direct_answers": ["fifth", "birthday", "fifth", "fifth", "fifth", "birthday", "fifth", "birthday", "fifth", "fifth"], "difficult_direct_answer": false, "rationales": ["There are five candles on the cake.", "There are five candles on the cake.", "Given that there are five candles present on the cake it is most likely a child is turning 5."], "image": "val2014/COCO_val2014_000000213599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94566, "question_id": "ihckSXL2qL5Nni7uMoYWMC", "question": "What does this cat seem to be feeling the most?", "choices": ["disturbed", "content", "angry", "fearful"], "correct_choice_idx": 1, "direct_answers": ["mellow fine", "frisky", "comfy", "relaxed", "relaxed", "happy", "content", "relaxed", "boredom", "comfort"], "difficult_direct_answer": false, "rationales": ["The cat is content.", "The cat looks happy.", "The cat looks laid back and quite comfortable."], "image": "val2014/COCO_val2014_000000094566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30267, "question_id": "ihh5po4r3puBkE9Pt6UE48", "question": "In what country did the rail freight company branded on this train originate?", "choices": ["united kingdom", "france", "germany", "italy"], "correct_choice_idx": 0, "direct_answers": ["usa", "united states", "usa", "usa", "usa", "united states", "united kingdom", "usa", "usa unsure", "germany"], "difficult_direct_answer": false, "rationales": ["Freightliner is known to be a uk company.", "The country is the uk.", "The company is known to have originated in this country."], "image": "val2014/COCO_val2014_000000030267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27190, "question_id": "ihkvywYyoAUTbyzLRW6imt", "question": "What color do the cheapest apples all have on their skins?", "choices": ["brown", "red", "green", "orange"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "red", "green", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The cheapest apples are .99.", "Most of the apples are red that are very cheap.", "The red apples are 99 cents."], "image": "train2014/COCO_train2014_000000027190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102030, "question_id": "ii3gtvQ57FnAwZa9efgw8U", "question": "What is the man putting on the animal?", "choices": ["coat", "kite", "harness", "noose"], "correct_choice_idx": 2, "direct_answers": ["leash", "harness", "rope", "leash", "rope", "harness", "rope", "rope", "harness", "rein"], "difficult_direct_answer": false, "rationales": ["A kite or noose would hurt the animal. a sheep already has wool and does not need to wear a coat.", "The animal is a sheep. the item being attached is a rope that attaches to its head, not neck.", "The man puts a harness son."], "image": "train2014/COCO_train2014_000000102030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356010, "question_id": "ii4tHAxCJhYDhfRKGxFueB", "question": "What type of bears are in the foreground?", "choices": ["black", "polar", "grizzly", "panda"], "correct_choice_idx": 2, "direct_answers": ["grizzly", "brown bears", "small bear", "brown bears", "brown bears", "grizzly", "brown", "grizzly", "brown", "brown bears"], "difficult_direct_answer": false, "rationales": ["The brown and scruffy furred appearance of these bears identifies them as grizzlies.", "Bear is seen with a little bear.", "The bears are light brown, a color that denotes its breed."], "image": "train2014/COCO_train2014_000000356010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557047, "question_id": "ii8h2RHQov7K9oAY2VvKLK", "question": "Which section of the tower would light come out of to help boats?", "choices": ["no light", "very top", "middle white", "bottom red"], "correct_choice_idx": 1, "direct_answers": ["top", "top", "very top", "lighthouse", "top", "top", "lighthouse", "top", "lighthouse", "top"], "difficult_direct_answer": false, "rationales": ["The lighthouse shines light from the top since that's where the bulb is.", "The light which acts as a beacon is on the upper layer of the lighthouse.", "The higher it is, the further out it can be seen."], "image": "train2014/COCO_train2014_000000557047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221833, "question_id": "iiRG9ie4zr4PJGV22wDBhZ", "question": "What type of area is this?", "choices": ["tropical", "rural", "commercial", "residential"], "correct_choice_idx": 2, "direct_answers": ["road", "suburban", "city roads", "city", "intersection", "city", "city", "commercial", "street", "city street"], "difficult_direct_answer": false, "rationales": ["The area has stores.", "There are buildings and stores in the area and that is what is found in a commercial district.", "The street is lined with businesses and shopping centers."], "image": "train2014/COCO_train2014_000000221833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377496, "question_id": "iib8AM9daX2YsascT2MYo7", "question": "Where are the two women eating?", "choices": ["at airport", "at home", "in subway", "in restaurant"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "eggs", "pizza", "in restaurant", "restaurant", "999 meals", "meal", "breakfast", "chicken", "breakfast"], "difficult_direct_answer": false, "rationales": ["They're at a restaurant.", "The woman are sitting at a table with an advertisement on a card on the table.", "There are several booths in a large room and an advertisement on the table"], "image": "train2014/COCO_train2014_000000377496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210408, "question_id": "iiyzcfknovnXxwW4yewyRD", "question": "What persons might normally ride in the cart behind this horse?", "choices": ["family only", "pioneers", "tourists", "farmers"], "correct_choice_idx": 2, "direct_answers": ["bride groom", "tourists", "passengers", "tourists", "humans", "tourists", "tourists", "passengers", "passengers", "tourists"], "difficult_direct_answer": false, "rationales": ["Horse-drawn carriages are almost always used by tourists, and not locals. locals already know their town well, and probably aren't interested in spending the money for a ride!.", "A horse and carriage is in the city. tourists take horse and carriage rides.", "This is in a city area where there are mostly vehicles"], "image": "val2014/COCO_val2014_000000210408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161871, "question_id": "ij5dptwknBtCGxMHtzFGoA", "question": "Why is the elephant placing its legs on the wheel?", "choices": ["to scratch", "to flip", "to kick", "to mount"], "correct_choice_idx": 3, "direct_answers": ["show trick", "balance display", "performing trick", "balannce", "trick", "balancing", "to perform", "to balance", "to mount", "balance"], "difficult_direct_answer": true, "rationales": ["The elephant is trying to mount.", "It's about to do a balancing trick", "The elephant is made to do tricks for the audience."], "image": "train2014/COCO_train2014_000000161871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465294, "question_id": "ijTaaXorAS6GqmtnzePgd6", "question": "What type of banana is this?", "choices": ["cavendish", "plantain", "lady finger", "goldfinger"], "correct_choice_idx": 1, "direct_answers": ["plantain", "plantain", "ripe", "asian", "yellow", "plantain", "plantain", "plantain", "plantain", "ripe"], "difficult_direct_answer": false, "rationales": ["Plantains are fatter.", "The bananas are plantains.", "These bananas look to be shorter and thicker than the standard banana. shorter and thicker bananas like this are likely to be answer a."], "image": "train2014/COCO_train2014_000000465294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91523, "question_id": "ijdedyWWYmAPQdcshSkeYh", "question": "What is the boarder about to hit?", "choices": ["shark", "abyss", "beach", "tidal wave"], "correct_choice_idx": 2, "direct_answers": ["wave", "wave", "shoreline", "shore", "beach", "water", "water", "wave", "wave", "wave"], "difficult_direct_answer": false, "rationales": ["The man is coming in towards the sand at end of water.", "The boarder is about to reach the ocean where the beach is.", "The boarder is at the beach."], "image": "train2014/COCO_train2014_000000091523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493341, "question_id": "ijeWLy6Nb64dfwAMtxWakH", "question": "The skateboarder would have to Ollie at least how high to clear the top of the railing?", "choices": ["12 feet", "8 feet", "5 feet", "3 feet"], "correct_choice_idx": 3, "direct_answers": ["4 feet", "six feet", "few inches", "ten feet", "yes", "3 feet", "eight feet", "four feet", "eight feet", "five feet"], "difficult_direct_answer": true, "rationales": ["The railing is going to be three feet since it's waist high.", "The boarder would have to jump high.", "The boarder is 3 feet."], "image": "train2014/COCO_train2014_000000493341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20133, "question_id": "ijfxQDez5gun6e3ybut8Jf", "question": "The pink and white item will provide what?", "choices": ["power", "music", "games", "cell service"], "correct_choice_idx": 0, "direct_answers": ["battery charge", "power", "battery charge", "electricity", "power", "power", "power", "battery charge", "power", "battery charging"], "difficult_direct_answer": false, "rationales": ["The object is connected to the power charger of the phone, and has flat panels like a solar charger.", "These are small solar panels which harness energy from the sun to charge the phone", "It contains solar panels to provide energy to charge the phone."], "image": "train2014/COCO_train2014_000000020133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93894, "question_id": "ijrKEb7yHGKPLFca6vujdz", "question": "What is the large rectangular object on the table with a screen used for?", "choices": ["ordering", "cooking", "wiping", "eating"], "correct_choice_idx": 0, "direct_answers": ["ordering paying", "ordering", "ordering food", "order food", "tables", "table", "ordering", "ordering", "placing orders", "order management"], "difficult_direct_answer": false, "rationales": ["A screen is held in a professionally made wooden stand on a table in a restaurant.", "You can use the tablet to order your own food.", "They use this to order their food."], "image": "val2014/COCO_val2014_000000093894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382655, "question_id": "ik5k7VLC3v4Bmkf94eKCLu", "question": "What are persons who are on the couch playing with?", "choices": ["sega", "wii", "guns", "cards"], "correct_choice_idx": 1, "direct_answers": ["nintendo wii", "family", "video game", "wii", "wii", "wii", "remotes", "wii", "wii controllers", "wii"], "difficult_direct_answer": false, "rationales": ["The remotes are white", "The people play wii.", "They are seated in front of a tv console and are holding white remotes in their hands. the two things together correspond with playing wii."], "image": "train2014/COCO_train2014_000000382655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421520, "question_id": "ikMUp7pogJwRCc8TkuTeZV", "question": "What time of year is associated with the trees to the back left?", "choices": ["midsummer", "halloween", "easter", "christmas"], "correct_choice_idx": 3, "direct_answers": ["winter", "winter", "winter", "winter", "winter", "winter", "winter", "christmas", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["There is snow.", "There is snow on the ground, so it is winter time.", "The time is christmas."], "image": "train2014/COCO_train2014_000000421520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72397, "question_id": "ikZCFYuLEfSHBcKCMqpG7d", "question": "What activity could you do in the structure in the center here?", "choices": ["sales", "distribution", "telephoning", "kiosk"], "correct_choice_idx": 2, "direct_answers": ["phone call", "phone call", "stay sheltered", "phone call", "make phonecall", "phone call", "phone call", "phone call", "cylinder", "telephoning"], "difficult_direct_answer": false, "rationales": ["The structure in the center is a telephone booth based on its size, design and logo. one would perform the action of using the telephone if entering a phone booth.", "It is a phone booth", "The structure in question is a payphone based on the shape and design of the structure and the phone within which would be used for answer a."], "image": "val2014/COCO_val2014_000000072397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491554, "question_id": "ikd7wFB2wmEhy27fSW4p53", "question": "The umbrella here prevents the boater from what fate?", "choices": ["falling overboard", "getting lost", "sunburn", "dizziness"], "correct_choice_idx": 2, "direct_answers": ["rain", "sunburn", "rain", "sunburn", "overheating", "sunburn", "sunburn", "sunburn", "sunburn", "sunburn"], "difficult_direct_answer": false, "rationales": ["The umbrella helps give shade.", "Umbrellas are used not only to prevent water from reaching a human, but also to prevent sun from damaging a human's skin.", "Keeps the sun off of them."], "image": "train2014/COCO_train2014_000000491554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215081, "question_id": "ikf76EzRVScgbeKrMvW4xt", "question": "What does the big blue jug in the background dispense?", "choices": ["water", "milk", "soda", "beer"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The jug has water in it.", "The jug is filled with water since it's a water dispenser.", "A cooler to dispense water is in a room against the wall."], "image": "train2014/COCO_train2014_000000215081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248953, "question_id": "ikjxYqVYAiDo4pYTB2awwW", "question": "What move is the snowboarder doing?", "choices": ["grind", "nose grab", "kickflip", "indy"], "correct_choice_idx": 0, "direct_answers": ["grind", "grinding rail", "ski ramp", "sliding", "railing", "grind", "grind", "grinding", "grind", "ride rail"], "difficult_direct_answer": false, "rationales": ["He is going down the rail.", "The snowboard is in contact with a stair rail. when one is slide on a rail with a board that they are riding the trick is known as a grind.", "This snowboarder travels down the bannister on the bottom of his board. this is known as grinding in extreme sports."], "image": "val2014/COCO_val2014_000000248953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573995, "question_id": "iknV7PtRrzEzzVZNBpQ2kw", "question": "What is the man doing with the food on the plate?", "choices": ["trashing it", "eating it", "cooking it", "serving it"], "correct_choice_idx": 1, "direct_answers": ["using fork", "eating it", "using fork", "eating", "eating", "picking", "eating it", "playing", "push fork", "eating"], "difficult_direct_answer": false, "rationales": ["The man is extended a fork towards the food item visible which would be consistent with answer a and the intention of food.", "The food has already been cooked and served. trashing the food would be wasteful.", "The man is eating a piece of cake."], "image": "train2014/COCO_train2014_000000573995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511165, "question_id": "ikoW8zNqzu2nQTUixLRWgq", "question": "How many people can cook food here at once?", "choices": ["four", "two", "one", "six"], "correct_choice_idx": 0, "direct_answers": ["one", "four", "four", "zero", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four microwaves on the counter.", "Up to four people could use one of the four microwaves on the shelves.", "Given that there are four microwaves and no other items visible that cook food we can conclude that four people would be able to cook their food at the same time here."], "image": "train2014/COCO_train2014_000000511165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324261, "question_id": "ikp6YLukCJnSSiUXBGLric", "question": "What type of cup is he using?", "choices": ["glass", "plastic", "styrofoam", "metal"], "correct_choice_idx": 1, "direct_answers": ["plastic", "plastic", "plastic", "plastic", "solo", "blue solo", "plastic", "solo cup", "plastic", "plastic"], "difficult_direct_answer": false, "rationales": ["It's a plastic one.", "Blue solo cups are always plastic.", "The man sitting on the bed is holding a blue cup made of plastic."], "image": "train2014/COCO_train2014_000000324261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226278, "question_id": "ikwW4JumxkaNXN9LEJqVo4", "question": "What is he waiting for?", "choices": ["ball", "ride home", "darkness", "applause"], "correct_choice_idx": 0, "direct_answers": ["ball", "serve", "tennis serve", "ball", "tennis ball", "tennis ball", "tennis serve", "ball", "serve", "ball"], "difficult_direct_answer": false, "rationales": ["He's on a tennis court", "The man wants to hit the ball.", "The person has his racquet out to hit the ball."], "image": "val2014/COCO_val2014_000000226278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161434, "question_id": "im98YtCk4xeduBh2XBnDiK", "question": "What is the red building to the right of the bus used for?", "choices": ["convenience store", "gas station", "auto mechanic", "grocery store"], "correct_choice_idx": 1, "direct_answers": ["gas station", "gas", "fueling vehicles", "gas station", "gas station", "gas", "gas", "gas refill", "gas station", "gas"], "difficult_direct_answer": false, "rationales": ["To the right in red is pumps where people drive up to. they then will pump liquid to drive around in.", "It's where people go to buy fuel for their vehicles.", "There is a display board visible that is showing gas prices. such an object would likely be found at answer a."], "image": "train2014/COCO_train2014_000000161434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575755, "question_id": "imD3bmwnyLABzFuKhPiPWR", "question": "This sandwich is probably being eaten in what kind of setting?", "choices": ["outdoor", "office", "cafeteria", "kitchen"], "correct_choice_idx": 0, "direct_answers": ["restaurant", "restaurant", "outdoor", "outdoor restaurant", "restaurant", "lunch table", "restaurant", "beef", "lunch", "restaurant"], "difficult_direct_answer": false, "rationales": ["The light in the image is natural, and not artificial, and the table beneath it is a grated design. tables of this kind are used outdoors, so that rain can pass through them rather than pooling.", "The sandwich is probably being eaten in an outside dining area.", "The table is the kind seen on patios and sunlight is visible through it."], "image": "val2014/COCO_val2014_000000575755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525865, "question_id": "imSAMYgpkhXnFAV98Cm67k", "question": "What temperature do these devices keep things?", "choices": ["hot", "boiling", "cold", "room"], "correct_choice_idx": 2, "direct_answers": ["cold", "under 40", "cold", "cold", "freezing", "cold", "cold", "fridge", "freezing", "36"], "difficult_direct_answer": false, "rationales": ["They are refrigerators to keep food cold in order to preserve it.", "Th devices are refrigerators which cools things to keep them fresh.", "These objects look to be industrial refrigerator based on their design and material. refrigerators are used to keep things cold."], "image": "train2014/COCO_train2014_000000525865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408307, "question_id": "imWZ3qvCjx2P5owfSnyHsY", "question": "What uses the ramp on the back of the car?", "choices": ["birds", "cats", "babies", "dogs"], "correct_choice_idx": 3, "direct_answers": ["dog", "pet", "dog", "dog", "dog", "dogs", "dogs", "old dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["This is used when they get old and can't jump up anymore", "There's only one type of creature pictured, with one of them relaxing next to the ramp.", "There is a dog in the back of the car. another dog is on the outside."], "image": "val2014/COCO_val2014_000000408307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16195, "question_id": "imbK5EnBHU2pqPtmL2c265", "question": "What is essential for this activity?", "choices": ["snow", "sand", "water", "grass"], "correct_choice_idx": 0, "direct_answers": ["skis", "snow", "skis", "snow", "skis", "snow", "skis", "skis", "snow", "skis"], "difficult_direct_answer": false, "rationales": ["A person is snowboarding.", "Snow skiing requires snow to move across the ground.", "The boarders have to have snow."], "image": "train2014/COCO_train2014_000000016195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278913, "question_id": "imqwDBryAjP72BMLYnHzsr", "question": "How skilled is this skier in the activity?", "choices": ["intermediate", "professional", "amateur", "beginner"], "correct_choice_idx": 1, "direct_answers": ["expert", "no skiers", "professional", "very skills", "highly skilled", "professional", "expert", "pro", "extremely skilled", "incredibly"], "difficult_direct_answer": false, "rationales": ["The person is high up in the air calm and collected without freaking out.", "The skier is doing a trick that is very complicated and dangerous and of the highest skill level.", "The skier is showing extremely advanced skills as they are performing tricks high off of the ground."], "image": "train2014/COCO_train2014_000000278913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369889, "question_id": "imuVeEmNo2QYoKhYvHfaEx", "question": "The two pizzas have different sized what?", "choices": ["plates", "toppings", "cheese", "colors"], "correct_choice_idx": 1, "direct_answers": ["crusts", "toppings", "crusts", "toppings", "toppings", "crusts", "toppings", "toppings", "toppings", "toppings"], "difficult_direct_answer": false, "rationales": ["There are meatballs on one and not the other.", "One has large meatballs and the other has flat ingredients", "The stuff on top is differnt sizes of the two pizzas."], "image": "train2014/COCO_train2014_000000369889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405047, "question_id": "in8H5fNVQHoCERwaCZPuQd", "question": "What word is on the orange sign?", "choices": ["road", "leave", "stop", "beware"], "correct_choice_idx": 0, "direct_answers": ["road closed", "ahead", "road", "road closed", "road", "road", "road closed", "road", "road closed", "road closed"], "difficult_direct_answer": false, "rationales": ["That is the first word.", "This is a sign about a closure ahead", "The orange sign says \"road closed ahead\""], "image": "val2014/COCO_val2014_000000405047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186608, "question_id": "inCPQ3JUWtZAKXWfTzpLmy", "question": "What is near the sidewalk here?", "choices": ["beach", "grocery store", "cow fields", "dairy"], "correct_choice_idx": 0, "direct_answers": ["sand", "sand", "beach", "beach", "ocean", "carriage", "horse buggy", "plastic tarp", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["This is on the coast", "It is near the sand and water.", "There is a lot of sand and then adjacent to it is a large body of water."], "image": "train2014/COCO_train2014_000000186608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13357, "question_id": "inMFjEybUcoFuca29vh54j", "question": "The dark condition is due to the absence of which molecule?", "choices": ["photon", "neutron", "electron", "proton"], "correct_choice_idx": 0, "direct_answers": ["p680", "light", "light", "energy", "light", "light", "photon", "light molecule", "light", "photon"], "difficult_direct_answer": false, "rationales": ["Photons are an element of light. since it's dark here that means there is no light or photons.", "Light particles are known as photons. when there is a lack of light it would follow there is a lack of light particles.", "There are no electrons for the electricity."], "image": "val2014/COCO_val2014_000000013357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161807, "question_id": "inRGZYeYhBa4qYdbuxjV5x", "question": "Which food here is highest in vitamin B-12?", "choices": ["broccoli", "salmon", "potato", "scallops"], "correct_choice_idx": 1, "direct_answers": ["salmon", "salmon", "fish", "broccoli", "broccoli", "fish", "salmon", "broccoli", "salmon", "brocolli"], "difficult_direct_answer": false, "rationales": ["Fish is high in vitamin b 12.", "Salmon has b12.", "The food is salmon."], "image": "val2014/COCO_val2014_000000161807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510572, "question_id": "inYp4AMQSZCdad8bsph686", "question": "What is the woman holding?", "choices": ["pumpkins", "children", "kittens", "bunny rabbits"], "correct_choice_idx": 1, "direct_answers": ["children", "babies", "children", "children", "children", "children", "babies", "children", "children", "children"], "difficult_direct_answer": false, "rationales": ["The woman is holding two kids.", "The woman has two children.", "There are no animals or pumpkins near the woman. there are humans near her."], "image": "train2014/COCO_train2014_000000510572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522708, "question_id": "inb8uDuDKKcbCpXGnNbyZr", "question": "What brand of bike is the couple sitting on?", "choices": ["yamaha", "kawasaki", "ducati", "harley"], "correct_choice_idx": 3, "direct_answers": ["harley", "yamaha", "harley davidson", "motorcycle", "motor bike", "indian", "harley", "harley", "harley davidson", "harley davidson"], "difficult_direct_answer": false, "rationales": ["The brand is harley.", "The style of the bicycle is consistent with those produced by company answer a.", "The couple is on a large bike."], "image": "train2014/COCO_train2014_000000522708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144481, "question_id": "io2L6yy8iZrzvjFLpS8p4K", "question": "What objects are on display on the blue paper?", "choices": ["greek vases", "old planters", "ceremonial urns", "ancient urinals"], "correct_choice_idx": 0, "direct_answers": ["vessels", "egyptian vases", "vases", "urns", "ancient vases", "greek vases", "vases", "urns", "jugs", "vase"], "difficult_direct_answer": false, "rationales": ["Two greek vases lay on the blue paper.", "The objects are vases.", "They look like a type of vase, and have greek styled designs on them."], "image": "val2014/COCO_val2014_000000144481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262124, "question_id": "ioMVWLucXvvH7NoArnAxWK", "question": "What type of weather is this area experiencing?", "choices": ["rain", "snow", "hail", "wind"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The people are holding umbrellas. the ground is wet.", "People are carrying umbrellas and it is wet on the ground.", "Due to the extended umbrellas, raincoats, and wet appearance to the ground we can conclude this area is now experiencing precipitation."], "image": "train2014/COCO_train2014_000000262124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116512, "question_id": "ioX7Fv7yELPKsge9cEspwa", "question": "If someone on a horse here sees a person committing a crime what will they do?", "choices": ["nothing", "look away", "arrest them", "ride off"], "correct_choice_idx": 2, "direct_answers": ["chase them", "arrest them", "arrest them", "arrest", "arrest them", "pursue", "arrest", "confront offender", "arrest", "stop them"], "difficult_direct_answer": false, "rationales": ["The riders are uniformed police officers.", "The men shown have the type of vest policemen wear. police men arrest people who do illegal actions.", "They're police."], "image": "train2014/COCO_train2014_000000116512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482407, "question_id": "iodFwfiVGQFqwBD9N4GzZn", "question": "What kind of an area is this?", "choices": ["desert", "savanna", "forest", "tundra"], "correct_choice_idx": 2, "direct_answers": ["forest", "forest", "park", "road", "mountains", "highway", "forest", "woodlands", "forest", "rural"], "difficult_direct_answer": false, "rationales": ["There are tall trees around the road.", "A forest has a lot of trees in it.", "It's a forest with a lot of trees."], "image": "train2014/COCO_train2014_000000482407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277150, "question_id": "ipA6BSLRFXG8DpjEqdtsqY", "question": "What does the woman have in her right hand?", "choices": ["charger", "coins", "phone", "bible"], "correct_choice_idx": 2, "direct_answers": ["phone", "phone", "cell phone", "phone", "phone", "phone", "phone", "cell phone", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["The woman has a phone.", "She is holding her hand to her ear like a person who is holding a phone would do.", "The woman has a cellphone in her right hand."], "image": "train2014/COCO_train2014_000000277150.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323249, "question_id": "ipTiKXeeVukYmngA5tpNAD", "question": "The vehicle here is powered by something located where in relation to it?", "choices": ["above", "street side", "inside", "under"], "correct_choice_idx": 0, "direct_answers": ["engine", "above", "above", "remote", "train", "parallel", "above", "gas", "underneath", "below"], "difficult_direct_answer": false, "rationales": ["The vehicle is powered by the lines above.", "There are visible wires above the trolleys that they are in contact with. these types of vehicles run from electrical power and in this case it is visibly being delivered from the wire above.", "It is a streetcar running along the wire attached overhead."], "image": "train2014/COCO_train2014_000000323249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499393, "question_id": "ipmn9bBxBhk4vjBE7mToL4", "question": "What is this watercraft's purpose?", "choices": ["research", "excursions", "fishing", "hunting"], "correct_choice_idx": 1, "direct_answers": ["transportation", "site seeing", "transportation", "transport people", "water taxi", "excursions", "sightseeing", "transport people", "carry passengers", "sightseeing"], "difficult_direct_answer": false, "rationales": ["The boat goes on trips.", "Boats are meant for little day trips.", "The boat takes people on fun adventures for vacations."], "image": "val2014/COCO_val2014_000000499393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296782, "question_id": "ipqg8WkYNNm6sRqcxcZ3vL", "question": "What is the skateboarder likely to suffer from?", "choices": ["fatigue", "dog bite", "pricked feet", "car accident"], "correct_choice_idx": 2, "direct_answers": ["scraped knees", "sore feet", "blindness", "stubbed toe", "sore knees", "pricked feet", "blisters", "dirty feet", "foot injury", "unemployment"], "difficult_direct_answer": true, "rationales": ["The skateboarder in question is not wearing shoes in an urban area which would likely result in stepping on something unpleasant.", "The skateboarder doesn't have any shoes on. there's nothing on his feet to protect them.", "He is riding barefoot and going to hurt his feet."], "image": "train2014/COCO_train2014_000000296782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532219, "question_id": "ipyeEFqZfD4R24Ao7bftdP", "question": "How do the people know each other?", "choices": ["siblings", "teammates", "coworkers", "spouses"], "correct_choice_idx": 0, "direct_answers": ["scatting", "siblings", "family", "sports team", "snowboarding", "friends", "friends", "ski together", "married", "friends"], "difficult_direct_answer": false, "rationales": ["Two kids are going up a mountain with snowboards.", "They are both younger and are staying together so they are mostly likely on a trip together", "They should be really close to be out together in this type of weather."], "image": "train2014/COCO_train2014_000000532219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81201, "question_id": "iqJAf8kPkuHyCwsvASQvyZ", "question": "Where is the woman located?", "choices": ["restaurant", "office", "store", "library"], "correct_choice_idx": 0, "direct_answers": ["pizza place", "pizzeria", "restaurant", "restaurant", "restaurant", "dining room", "chair", "restaurant", "restaurant", "front"], "difficult_direct_answer": false, "rationales": ["The woman is sitting at a restaurant table that has pizza and drinks on it.", "The table she is sitting at has pizzas and drinks.", "She is in a restaurant."], "image": "train2014/COCO_train2014_000000081201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425497, "question_id": "iqXgzjr6aVaTcZcQnXzTzd", "question": "These sports equips are used to play which sport?", "choices": ["surfing", "snowboarding", "skating", "skiing"], "correct_choice_idx": 3, "direct_answers": ["snowboarding", "snowboarding", "snowboarding", "snowboarding", "surfboard", "snowboarding", "skiing", "skiing", "skiing", "snowboarding"], "difficult_direct_answer": false, "rationales": ["This equipment is all ski equipment.", "It is a wider board that both feet are strapped into and ride on top of snow.", "There is a snowboard."], "image": "train2014/COCO_train2014_000000425497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415077, "question_id": "iqdZDt2sa8AACgJTj7rvBx", "question": "Which item is sitting on top of an object that was woven?", "choices": ["fire extinguisher", "fan", "red suitcase", "welcome mat"], "correct_choice_idx": 2, "direct_answers": ["suitcase", "suitcases", "suitcases", "red suitcase", "suitcase", "suitcase", "luggages", "suitcases", "suitcase", "luggage"], "difficult_direct_answer": false, "rationales": ["There are several suitcases on top of baskets.", "The object on top has a handle and clasps.", "There are multiple items that are visibly consistent with the question, but answer a is applicable based on the size, structure and shape of the object and the defining weaving pattern underneath."], "image": "train2014/COCO_train2014_000000415077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92804, "question_id": "iqg4XacbLaUN6kABE5TfhN", "question": "What is the leftmost skier doing?", "choices": ["seeking shelter", "hiding", "waiting turn", "sleeping"], "correct_choice_idx": 0, "direct_answers": ["walking", "walking", "seeking shelter", "resting", "resting", "resting", "exploring structure", "walking", "taking photograph", "hiking"], "difficult_direct_answer": false, "rationales": ["He is seeking shelter to ease the task along his friend.", "He is looking for somewhere that is a shelter.", "There is nothing in the image that directly confirms the intention of the skier, but they are seen in a shaded area out of the sun and possibly wind that one might rest in a way consistent with a."], "image": "val2014/COCO_val2014_000000092804.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17474, "question_id": "iqhwffbHJSEghrMgRg7YQr", "question": "What type of establishment is Canova?", "choices": ["library", "restaurant", "retail store", "grocery"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "art gallery", "restaurant", "restaurant", "art gallery", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["Canola is a restaurant.", "It's a restaurant for people to eat at.", "Canova is an establishment that sells food to its patrons based on the outside dining tables."], "image": "train2014/COCO_train2014_000000017474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325021, "question_id": "iqoYMeW2NAHpHwWwSbUMj3", "question": "What is being served on plates?", "choices": ["pizza", "donut", "sandwich", "pasta"], "correct_choice_idx": 2, "direct_answers": ["sandwich", "hamburgers", "sandwiches", "sandwiches", "sandwiches", "cooking", "sandwiches", "sandwiches", "sandwiches", "burgers"], "difficult_direct_answer": false, "rationales": ["Sandwiches are plated.", "The plates each contain various food items between two pieces of bread.", "There is bread with meat inside of it."], "image": "train2014/COCO_train2014_000000325021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440534, "question_id": "iqsPzyAs4mZsp6Md3wk5KG", "question": "What are the two people both directly above and below the snowboarder doing?", "choices": ["clapping", "cheering", "filming", "spectating"], "correct_choice_idx": 2, "direct_answers": ["photography", "watching", "filming", "watching", "watching", "taking pictures", "filming", "taking photos", "watching", "photographing"], "difficult_direct_answer": false, "rationales": ["The people are watching the performance.", "The people are recording the guy's actions with cameras.", "They are recording him."], "image": "train2014/COCO_train2014_000000440534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518866, "question_id": "ir3r6UmscUJyZkhE8yFuBq", "question": "What is used to pull this boat?", "choices": ["twine", "yarn", "cables", "string"], "correct_choice_idx": 2, "direct_answers": ["rope", "rope", "cables", "rope", "tug", "rope", "tug", "rope", "tug", "rope"], "difficult_direct_answer": false, "rationales": ["Since boats are heavy and water resistance needs a lot of force to get through, a boat like this needs to be towed with a strong metal cable.", "Long lines run forward from a boat in the water.", "In the water you can see cables attached to this boat."], "image": "train2014/COCO_train2014_000000518866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383562, "question_id": "ir4gaSkRTyk3DkNZJrLKwT", "question": "Where are the apples in the buckets picked from?", "choices": ["grass", "weeds", "vines", "trees"], "correct_choice_idx": 3, "direct_answers": ["tree", "tree", "tree", "trees", "trees", "apple tree", "tree", "trees", "apple tree", "tree"], "difficult_direct_answer": false, "rationales": ["Apples grow from tree branches.", "Apples don't grow on vines or grass. they are also not classified as weeds.", "The apples are from trees."], "image": "train2014/COCO_train2014_000000383562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148427, "question_id": "ir5Ua9Qys63D5285FDm7vs", "question": "What country's flag is on the white carriage?", "choices": ["italy", "ireland", "germany", "russia"], "correct_choice_idx": 1, "direct_answers": ["ireland", "ireland", "ireland", "ireland", "ireland", "ireland's", "ireland", "ireland's", "ireland", "ireland"], "difficult_direct_answer": false, "rationales": ["Green, white and orange are irish colors.", "The flag is irish.", "Ireland's flag colors are white orange and green."], "image": "train2014/COCO_train2014_000000148427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509503, "question_id": "ir7D4VRL5SSqRr44UBs8HY", "question": "What is allowed on this pathway?", "choices": ["planes", "elephants", "busses", "pedestrians"], "correct_choice_idx": 3, "direct_answers": ["car", "walking", "walking", "bicycles", "bicycles", "people", "bicycle", "pedestrians", "bikes", "people"], "difficult_direct_answer": false, "rationales": ["The yellow and red sign indicates that vehicles with motors are not allowed. the blue and white sign shows that people and bicycles are allowed.", "There is a blue circle with people and a bike, these are not crossed out or red which seems to suggest \"not allowed\". since these are not marked for not allowed they are okay to use the path.", "The pedestrians are allowe.d"], "image": "val2014/COCO_val2014_000000509503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457667, "question_id": "irHEBVQmGJpQX58rdm5P3S", "question": "To represent the cleanliness of sanitary wares its available with what color?", "choices": ["blue", "black", "white", "red"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["This is the color seen as the most pure", "The sink and towels are white which shows they are clean if there are no spots.", "The white towels are available."], "image": "train2014/COCO_train2014_000000457667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276918, "question_id": "ircf4vyePknYLHP7LqJJTU", "question": "Why are the animals enclosed in one area?", "choices": ["to protect", "to hunt", "to capture", "to heal"], "correct_choice_idx": 0, "direct_answers": ["giraffe", "zoo", "to protect", "safety", "zoo", "protection", "zoo", "spectating", "giraffes", "keep safe"], "difficult_direct_answer": false, "rationales": ["These animals are fenced in so as to protect them from the wild so as to be on display.", "The animals appear in some kind of sanctuary or zoo. they would be enclosed in such a place so they could not escape and do damage to themselves or be hurt from something coming in.", "The animals are in the area protected by a fence."], "image": "train2014/COCO_train2014_000000276918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106849, "question_id": "irieveUASKMNuq7M6LAcPd", "question": "Why does he holds his hands about a foot apart?", "choices": ["hands stuck", "dropped something", "is threatening", "showing size"], "correct_choice_idx": 3, "direct_answers": ["demonstrating", "showing size", "explains", "describing experience", "showing", "hand talking", "showing measurement", "describing something", "describing size", "measuring"], "difficult_direct_answer": true, "rationales": ["The man is demonstrating size.", "Holding one's hands apart is a common way of estimating the size of an object.", "He's a cashier and he's asking or showing the customer how big the object is with an imaginary length."], "image": "val2014/COCO_val2014_000000106849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199117, "question_id": "irpiwiStcahnZCsaBmVRXY", "question": "What type of toys are marketed here?", "choices": ["tops", "dolls", "trucks", "kites"], "correct_choice_idx": 3, "direct_answers": ["kites", "bird", "kites", "kites", "kites", "yo yos", "kites", "party supplies", "kites", "kites"], "difficult_direct_answer": false, "rationales": ["The toys are kites.", "Kites are marketed here.", "Kites are being sold."], "image": "val2014/COCO_val2014_000000199117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347948, "question_id": "irxw2VDkGMKpJbNdrUuFe9", "question": "What type people most likely ride this conveyance?", "choices": ["homeless", "tourists", "commuters", "military"], "correct_choice_idx": 1, "direct_answers": ["non-drivers", "tourist", "passengers", "tourist", "tourists", "tourists", "workers", "passengers", "tourists", "tourists"], "difficult_direct_answer": false, "rationales": ["The people are tourists.", "The double decker bus is a tourist attractino.", "Such buses are designed for tourists to see the sight and sounds of the city. they are throwbacks to old buses from decades past."], "image": "train2014/COCO_train2014_000000347948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344307, "question_id": "isRiEtwSBYAVbpZR5FufDG", "question": "What type of board does the man have?", "choices": ["skate board", "land board", "body board", "long board"], "correct_choice_idx": 3, "direct_answers": ["longboard", "skateboard", "long board", "skateboard", "skateboard", "skateboard", "long board", "long board", "skateboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["The man has a long board.", "The board is a long one.", "The man has a long skateboard."], "image": "val2014/COCO_val2014_000000344307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324139, "question_id": "iscY3DmiZ9YAGv6VSJANte", "question": "What is the black object near the bench used to collect?", "choices": ["trash", "mail", "coins", "donations"], "correct_choice_idx": 0, "direct_answers": ["trash", "trash", "trash", "trash", "garbage", "trash", "trash", "trash", "garbage", "garbage"], "difficult_direct_answer": false, "rationales": ["There is a trash can.", "People usually have a place to put trash in a park.", "The black object is a garbage pale. it's for throwing things away."], "image": "val2014/COCO_val2014_000000324139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228530, "question_id": "iszaeXAU7WzDFZ9E8zHWaQ", "question": "The first flag celebrates what heritage?", "choices": ["scottish", "american", "australian", "irish"], "correct_choice_idx": 0, "direct_answers": ["scottish", "dominica", "jamaica", "norwegian", "scottish", "canadian", "scottish", "scottish", "scottish", "scotland"], "difficult_direct_answer": false, "rationales": ["A white and blue flag flies outside a building.", "Scottish heritage is celebrated by the rightmost flag of scotland.", "The flag celebrates scottish heritage."], "image": "train2014/COCO_train2014_000000228530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156651, "question_id": "it4wtRpn4GPS3cSaGZvQFV", "question": "Why are her glasses that color?", "choices": ["paint", "sunlight", "prescription", "style"], "correct_choice_idx": 1, "direct_answers": ["sun protection", "tinted", "shade sun", "transition lenses", "sunlight", "transitions lenses", "tinted", "block sun", "block sun", "sunglasses"], "difficult_direct_answer": false, "rationales": ["They are used to protect the eyes from the sun and the shade of them help prevent all the sunlight from reaching the eyes.", "The glasses are filtering sunlight.", "It's light out."], "image": "val2014/COCO_val2014_000000156651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223932, "question_id": "itF3AzJDKufCK93G6YpdyM", "question": "The cat next to the person's foot is eating food from which national cuisine?", "choices": ["chinese", "japanese", "french", "italian"], "correct_choice_idx": 1, "direct_answers": ["philippines", "japanese sushi", "french", "norway", "japanese", "broccoli", "japanese", "japanese", "sushi", "japan"], "difficult_direct_answer": false, "rationales": ["The cat is by japanese food.", "The cat is eating food from japanese culture.", "The cat wants to get some fish from japanese sushi."], "image": "train2014/COCO_train2014_000000223932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492657, "question_id": "itFJUZbbJk5qnVDUfs6PLc", "question": "Which direction are these people travelling?", "choices": ["no where", "staying still", "down hill", "upwards"], "correct_choice_idx": 3, "direct_answers": ["north", "away", "forward", "north", "north", "forward", "upwards", "forward", "north", "north"], "difficult_direct_answer": false, "rationales": ["The land is sloping up on a slight incline.", "The slop is angled higher than the point the people are on.", "The people are going uphill."], "image": "train2014/COCO_train2014_000000492657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506640, "question_id": "itViLFrXh5XJASpDKGqbRq", "question": "What's near the zebras?", "choices": ["lagoon", "jungle", "lion", "ocean"], "correct_choice_idx": 0, "direct_answers": ["trees", "trees", "water", "water", "water", "lagoon", "water", "trees", "water", "water"], "difficult_direct_answer": false, "rationales": ["The lagoon is near.", "The lake is near them.", "Based on the environments zebras are known to inhabit, a body of water like this would likely be answer a."], "image": "train2014/COCO_train2014_000000506640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263104, "question_id": "itkw7oic2i57fSRqosLyHY", "question": "What is the round blue bin used to collect?", "choices": ["candy", "mail", "rain", "trash"], "correct_choice_idx": 3, "direct_answers": ["trash", "trash", "trash", "garbage", "garbage", "recyclables", "trash", "trash", "passenger", "trash"], "difficult_direct_answer": false, "rationales": ["The bin in the middle of the road is a trash bin.", "The bin has holes for items to be put into it, and usually are used to keep the area litter free.", "This is a place for people to throw garbage instead of littering."], "image": "train2014/COCO_train2014_000000263104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507087, "question_id": "itsYeXJrPqVeV2t3HfSKW5", "question": "Which music group would be able to use all of these boards without sharing?", "choices": ["cream", "nsync", "backstreet boys", "spice girls"], "correct_choice_idx": 0, "direct_answers": ["beach boys", "beach boys", "quartet", "bens", "beastie boys", "cream", "jonas brothers", "band", "savage garden", "beach boys"], "difficult_direct_answer": false, "rationales": ["The rock band cream had three members and there are three boards.", "The group is cream.", "There are three members in cream. there are three surfboards here."], "image": "val2014/COCO_val2014_000000507087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331686, "question_id": "ittJwPdGJuChgv7zqMCHKv", "question": "What is worn by all who ride these bikes?", "choices": ["cowboy hats", "police badge", "biker jeans", "rubber vest"], "correct_choice_idx": 1, "direct_answers": ["police badge", "helmets", "sunglasses", "uniforms", "police uniforms", "helmets", "bike ride", "police", "helmet", "police uniform"], "difficult_direct_answer": true, "rationales": ["They are in the common work uniform sitting on the common motorcycle ridden by these types of people. they have shoulder patches on their sleeves showing their profession.", "They wear badges.", "Police are standing near bikes. policemen wear badges,"], "image": "val2014/COCO_val2014_000000331686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342958, "question_id": "ityr3jfoVuYNFnEmi2ebE3", "question": "What type of area is shown?", "choices": ["forest", "rural", "jungle", "coastal"], "correct_choice_idx": 3, "direct_answers": ["beach", "beachfront restaurant", "beachfront restaurant", "coastal", "beach", "beach", "beach", "beach", "beach", "ocean"], "difficult_direct_answer": false, "rationales": ["It's a coastal area", "A beach area is shown.", "The beach and a large body of water can be seen, with a winding landmass visible, which denotes a coast."], "image": "val2014/COCO_val2014_000000342958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107729, "question_id": "iu6GbRWkmy7d72bLc3kAec", "question": "The woman is most likely training her eyes on what object?", "choices": ["net", "glove", "ball", "birdie"], "correct_choice_idx": 2, "direct_answers": ["ball", "ball", "ball", "baseball", "ball", "ball", "tennis", "baseball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["The woman is watching to hit the ball.", "The woman wants to hit the ball so she would be looking at it.", "She has a bat ready to swing"], "image": "val2014/COCO_val2014_000000107729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555684, "question_id": "iuGdi8fjCaeHLJ8yRP73ZQ", "question": "The dog seen here was placed there by whom?", "choices": ["self", "cat", "mortal enemy", "owner"], "correct_choice_idx": 3, "direct_answers": ["mike baird", "person", "mike baird", "person", "it's owner", "it's owner", "owner", "mike", "owner", "owner"], "difficult_direct_answer": false, "rationales": ["This dogs human would of put them on this surfboard.", "The dog is the owner's.", "The dog is on a surfboard. a person, not itself or a cat, must have placed it there."], "image": "val2014/COCO_val2014_000000555684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121351, "question_id": "ivGmdDUH84prVbGHKonn76", "question": "What type of donut is she eating?", "choices": ["yeast donut", "cake donut", "mini donut", "square donut"], "correct_choice_idx": 0, "direct_answers": ["cookie", "raised", "oreo", "oreo crumble", "cake", "oreo", "yeast donut", "cookie doughnut", "oreo", "chocolate chip"], "difficult_direct_answer": false, "rationales": ["The donut is a light brown color and round, making it a traditional type of donut.", "A leavening agent was used in making the dough.", "This is a donut with icing on top like a cake."], "image": "train2014/COCO_train2014_000000121351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557548, "question_id": "ivNFwksVwNzsVEkPq8XH9H", "question": "What type of device would create a better picture than the flip phone?", "choices": ["rotary phone", "blackberry phone", "disposable camera", "smart phone"], "correct_choice_idx": 3, "direct_answers": ["camera", "camera", "dslr camera", "smart phone", "iphone", "camera", "unknown", "camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The flip phone likely has poor camera quality.", "It is more advanced with a better camera", "A smart phone with a camera takes better photos than an old flip phone. smart phones have better quality."], "image": "train2014/COCO_train2014_000000557548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290196, "question_id": "ivZsH67DEjwWTsx3NU6gGR", "question": "Why are they skiing on level ground?", "choices": ["cross-country", "beginners", "easier", "safer"], "correct_choice_idx": 0, "direct_answers": ["cross country", "cross country", "cross country", "for walking", "cross-country", "safety", "hiking", "no mountain", "cross county", "no snow"], "difficult_direct_answer": false, "rationales": ["The people are going across the country.", "The skiers are going up the mountain using their own path and going through different terrain.", "These people are skiing the way they are for exercise across a flat surface."], "image": "val2014/COCO_val2014_000000290196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117222, "question_id": "ivp4PA9eDJRbm8NGJqzDi7", "question": "What is the man using the bike for?", "choices": ["racing", "transporting", "leisure", "exercising"], "correct_choice_idx": 1, "direct_answers": ["transporting", "transport bananas", "transport bananas", "sell bananas", "banana transport", "transporting goods", "transport bananas", "transport bananas", "traveling", "banana transport"], "difficult_direct_answer": false, "rationales": ["This is the most convenient way to bring lots of bananas to market in order to sell them.", "He is taking the bananas to market.", "The man is using the bike to transport fruit quickly."], "image": "val2014/COCO_val2014_000000117222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305141, "question_id": "ivwW36Ndmi9icgCDwBebJC", "question": "The man is mugging about his wife doing what?", "choices": ["sitting", "looking up", "wine drinking", "resting"], "correct_choice_idx": 2, "direct_answers": ["farting", "posing", "posing", "drinking wine", "nothing", "posing", "drinking", "drinking", "drinking wine", "wine drinking"], "difficult_direct_answer": false, "rationales": ["The woman looks very under the influence and embarrassing.", "A man in glasses is pointing at his wife. she is currently drinking out of a glass that is dark in nature.", "The wife has a wine glass."], "image": "train2014/COCO_train2014_000000305141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131969, "question_id": "iw6eFnGnXcCFdVwpie2cdQ", "question": "What might you find in the glass and green sided structure?", "choices": ["aliens", "superman", "telephone", "bathroom"], "correct_choice_idx": 2, "direct_answers": ["telephone", "payphone", "pay phone", "telephone", "phone", "store", "pay phone", "telephone", "phone", "telephone"], "difficult_direct_answer": false, "rationales": ["The other options either don't exist or don't make sense.", "The structure is a phone booth.", "The phone would be there."], "image": "val2014/COCO_val2014_000000131969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58462, "question_id": "iwRc7yAiLVLSCR7qWkVgx7", "question": "Why are the beverages in the cooler?", "choices": ["to warm", "to marinate", "to settle", "to sell"], "correct_choice_idx": 3, "direct_answers": ["keep cold", "to sell", "keep cold", "refrigeration", "keep cold", "for coldness", "refrigeration", "keep cold", "keep cold", "soda"], "difficult_direct_answer": false, "rationales": ["They are there to be sold cold.", "They are in what looks like a convenience store.", "Almost all of us have put the coins in and enjoyed them."], "image": "val2014/COCO_val2014_000000058462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262670, "question_id": "iwZnbR8Pq4W6THoVKv3VsH", "question": "What type of animal is laying in the middle of the street?", "choices": ["horses", "dogs", "cows", "goats"], "correct_choice_idx": 2, "direct_answers": ["cow", "cows", "cows", "cow", "cow", "cows", "cow", "cow", "cows", "cow"], "difficult_direct_answer": false, "rationales": ["There is a group of bovines laying in the street.", "Cows are laying down.", "A cluster of animals are just sitting in the middle of a road. they are medium size and have pointy ears."], "image": "train2014/COCO_train2014_000000262670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29801, "question_id": "iwf7yvgzY68NWpNA8WDLiu", "question": "What time of day does this man dine here?", "choices": ["noon", "night", "morning", "evening"], "correct_choice_idx": 2, "direct_answers": ["breakfast", "morning", "breakfast", "breakfast", "morning", "breakfast", "morning", "morning", "morning", "morning"], "difficult_direct_answer": false, "rationales": ["He is eating breakfast.", "The man is having traditional breakfast food like orange juice and cereal in milk. it is also light outside, but part of the area is still shaded.", "It is light outside and the man is eating cereal with milk and drinking orange juice which typically are breakfast items consumed as the first meal of the day after waking up."], "image": "train2014/COCO_train2014_000000029801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412440, "question_id": "iwk4ixGh9xGVeLLWfWn4og", "question": "What utensils with the person holding this pizza use to eat it with?", "choices": ["knife", "fork", "spoon", "none"], "correct_choice_idx": 3, "direct_answers": ["fingers", "paper plate", "plate", "plate", "fork", "fork knife", "none", "fork", "fingers", "fork"], "difficult_direct_answer": false, "rationales": ["One of the person's hands is visible and currently holding only the pizza plate and nothing else.", "Pizza can be easily eaten with one's hands.", "This person is eating the pizza with their hands."], "image": "val2014/COCO_val2014_000000412440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455345, "question_id": "iwqyGMzqNuwKbubdzUWYw7", "question": "Which country grows most bananas?", "choices": ["nepal", "india", "us", "china"], "correct_choice_idx": 1, "direct_answers": ["india", "india", "india", "india", "india", "india", "vietnam", "vietnam", "vietnam", "india"], "difficult_direct_answer": false, "rationales": ["Bananas are grown in india.", "India grows a lot.", "This country is known for growing and exporting this fruit. it is the country that produces the most of this fruit out of these options."], "image": "val2014/COCO_val2014_000000455345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558169, "question_id": "iwyGqFrQzKwFeaJU5rfiVw", "question": "Why is the person holding the bird wearing a glove?", "choices": ["fashion", "for work", "warmth", "protection"], "correct_choice_idx": 3, "direct_answers": ["protection", "talons", "claws hurt", "sharp claws", "safety", "bird bites", "protection", "protection", "claws", "protection"], "difficult_direct_answer": false, "rationales": ["This glove is probably to protect the wearers wrist and hand from the parrots claws.", "The bird has sharp claws that could otherwise injure the person's hand.", "This blue bird has sharp claws so it must be handled with care. it can cause puncturing if bare skin is showing."], "image": "val2014/COCO_val2014_000000558169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125547, "question_id": "ixKRFPVBQ5DRaMGfhMMSzE", "question": "Why are the people sitting on the benches?", "choices": ["sell stuff", "find friends", "resting", "awaiting trains"], "correct_choice_idx": 3, "direct_answers": ["waiting", "waiting", "waiting", "awaiting trains", "waiting", "waiting", "waiting", "waiting", "waiting", "waiting"], "difficult_direct_answer": false, "rationales": ["They're waiting for trains.", "The people are at a train station and sitting on benches as they wait to board a train.", "They have tickets for a trip."], "image": "val2014/COCO_val2014_000000125547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562623, "question_id": "ixMJRwkDfpnubmKNWsx6Hj", "question": "What animal is shown in the container?", "choices": ["tiger", "lion", "lion", "horse"], "correct_choice_idx": 0, "direct_answers": ["tiger", "horses", "horse", "horses", "horse", "tiger", "lion", "tiger", "tiger", "tiger"], "difficult_direct_answer": false, "rationales": ["A large trailer with a picture of a tiger is behind horses in a pen with people watching on.", "The animal is a tiger.", "There is a large cat like animal with stripes on side of truck."], "image": "train2014/COCO_train2014_000000562623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230522, "question_id": "ixRGHFuJgV7gxBpv8Gfrka", "question": "What is known as the best cut of meat from the largest animal?", "choices": ["chuck", "sirloin", "ribeye", "filet mignon"], "correct_choice_idx": 3, "direct_answers": ["filet mignon", "hindquarters", "tenderloin", "filet mignon", "tbone", "steak", "beef", "filet mignon", "t-bone", "steak"], "difficult_direct_answer": false, "rationales": ["The largest animal is a cow as identified by the size, shape and color. answer a is known to be a top choice cut of meat from this animal.", "This cut is one of the most tender of the cow so therefor the most desirable.", "That is the meat the bull is known to have when it's cut up."], "image": "train2014/COCO_train2014_000000230522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371534, "question_id": "ixSprjDtsBeoFFr33HC6X9", "question": "Where is this room located?", "choices": ["home", "church", "hospital", "school"], "correct_choice_idx": 0, "direct_answers": ["bedroom", "top", "home", "in house", "house", "bedroom", "residence", "in house", "bedroom", "house"], "difficult_direct_answer": false, "rationales": ["There is a bed. it is not a hospital bed.", "It's at home.", "Kids like bunk beds. a child is sitting on a bunkbed."], "image": "train2014/COCO_train2014_000000371534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387255, "question_id": "ixTLTvppeaxUMqKSjJ3X2E", "question": "What are these people doing with each other?", "choices": ["resting", "singing", "yelling", "racing"], "correct_choice_idx": 3, "direct_answers": ["racing", "skiing", "racing", "racing", "skiing", "skiing", "skiing", "having fun", "skiing", "racing"], "difficult_direct_answer": false, "rationales": ["They are in a competition.", "The people are skiing.", "They have racing pennys on."], "image": "train2014/COCO_train2014_000000387255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116149, "question_id": "ixcLFAzd2JhPXLx8izNd9p", "question": "What use would these devices held aloft here be?", "choices": ["defense", "signaling", "rain", "shade"], "correct_choice_idx": 3, "direct_answers": ["twirling", "parasol", "shade", "stop rain", "umbrella", "advertisment", "sun shade", "sun protection", "umbrellas", "shade"], "difficult_direct_answer": true, "rationales": ["These umbrellas are thin and made of paper so they are for shade", "These devices are held aloft as parsols.", "These are made from paper and would be useless in the rain"], "image": "val2014/COCO_val2014_000000116149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470442, "question_id": "ixiBUhC4GyWokfxMgHM22j", "question": "What does this vehicle hold in it's rear?", "choices": ["coal", "wheat", "wood", "snow"], "correct_choice_idx": 3, "direct_answers": ["snow", "snow", "salt", "snow", "dirt", "snow", "snow", "snow", "rubble", "snow"], "difficult_direct_answer": false, "rationales": ["There are men shoveling a white, thick, dirty substance off the street into the bed of the truck.", "The men are shoveling snow into the back of the vehincle.", "They are shoveling it to get it out of the road"], "image": "train2014/COCO_train2014_000000470442.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72437, "question_id": "ixnruMfymYFCtLxymPeNZc", "question": "What video game system are the boys using?", "choices": ["atari", "nintendo wii", "playstation 4", "xbox 360"], "correct_choice_idx": 1, "direct_answers": ["wii", "wii", "wii", "xbox", "wii", "wei", "wii", "nintendo wii", "nintendo wii", "wii"], "difficult_direct_answer": false, "rationales": ["They are using white remotes, not controllers or joysticks.", "The remotes are white.", "The system is the wii."], "image": "train2014/COCO_train2014_000000072437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187235, "question_id": "ixuJskknnUycDNnExhBxNr", "question": "What is in the background?", "choices": ["palm tree", "sailboat", "fisherman", "baby"], "correct_choice_idx": 1, "direct_answers": ["shore", "sailboat", "factory smokestacks", "boat", "sailboat", "sailboat", "boat", "sailboat", "sailboat", "sail boat"], "difficult_direct_answer": false, "rationales": ["A boat with it's sails up is there.", "There are sails.", "The vessel in the background is using wind power to travel across the water."], "image": "val2014/COCO_val2014_000000187235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565777, "question_id": "ixxc4ovPqQRq4jUAF4Cdwu", "question": "Which cuisine is this?", "choices": ["french", "asian", "american", "indian"], "correct_choice_idx": 1, "direct_answers": ["pot roast", "american", "chinese", "chinese", "asian", "vegetarian", "asian", "japanese", "asian", "chinese"], "difficult_direct_answer": false, "rationales": ["The food resembles the asian life in earlie life.", "The plates of food and their contents are visible, based on the sauces and the food construction it is likely of answer a origin.", "The food and ingredients are clearly visible and are of a size, shape and consistency in line with answer a."], "image": "train2014/COCO_train2014_000000565777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162199, "question_id": "iyL3WCgCieW58shcf7ahmy", "question": "What is able to be repaired by the machine in the corner?", "choices": ["tv", "ovens", "clothing", "shoes"], "correct_choice_idx": 2, "direct_answers": ["clothing", "television", "clothing", "clothes", "fabric", "clothing curtains", "torn clothing", "fabric", "fabric", "clothes"], "difficult_direct_answer": false, "rationales": ["A sewing machine can stitch cloth together.", "The machine is for sewing.", "The clothing is able."], "image": "train2014/COCO_train2014_000000162199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190393, "question_id": "iyLF7PAzM9YweVUupiJjep", "question": "What is attached to the chair?", "choices": ["knives", "apples", "wheels", "balloons"], "correct_choice_idx": 2, "direct_answers": ["wheels", "wheels", "wheels", "big wheels", "wheels", "big wheels", "wheels", "wheels", "wheels", "wheel"], "difficult_direct_answer": false, "rationales": ["There are four thick circular objects on the bottom of the chair.", "There are large circular inflated things on the chair.", "The object is circular shaped."], "image": "train2014/COCO_train2014_000000190393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294823, "question_id": "iyQwX6aNDMhFuZ7ZVjw554", "question": "The items with the blue signs are likely where?", "choices": ["desert", "farm", "tundra", "city center"], "correct_choice_idx": 3, "direct_answers": ["parking meters", "city center", "parking lot", "city", "parkers pay", "parking lot", "street parking", "germany", "parking lot", "parking meter"], "difficult_direct_answer": false, "rationales": ["The signs are from a city.", "This is the only logical explanation with the answers given. there is a few cars behind the boxes.", "The items in the center are blue."], "image": "train2014/COCO_train2014_000000294823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7989, "question_id": "iyTsMJqXsLn7J8SkEQuEWH", "question": "Where does the carrier take the man to?", "choices": ["right", "uphill", "downhill", "left"], "correct_choice_idx": 1, "direct_answers": ["top slope", "mountain top", "hill", "mountain top", "top", "hilltop", "mountain", "uphill", "mountaintop", "top"], "difficult_direct_answer": false, "rationales": ["It makes it easier to go up a mountain quickly", "The lift is taking the man uphill so he can snowboard down", "In order for skiers and snowboarders to go down the hill multiple times, they must reach the top before each time."], "image": "val2014/COCO_val2014_000000007989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298527, "question_id": "iyeTVtWn8pnJWhvGgGBNvQ", "question": "The shape the man has on his yellow shirt is found on what flag?", "choices": ["china", "argentina", "greece", "sweden"], "correct_choice_idx": 0, "direct_answers": ["russian", "china", "usa", "moheli", "usa", "united states", "usa", "american", "china", "country flag"], "difficult_direct_answer": false, "rationales": ["There is a man sitting behind a woman listening to music. he has a red star on chest which can be found in asian country.", "One asian country who has a red star on it is china.", "Stars are on the chinese flag."], "image": "train2014/COCO_train2014_000000298527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195265, "question_id": "iyfBu6r7nHUabZz6YV2aEg", "question": "What is the book resting on?", "choices": ["towel rack", "radiator", "toilet", "sink"], "correct_choice_idx": 1, "direct_answers": ["radiator", "heater", "radiator", "heater", "radiator heater", "radiator", "radiator", "heater", "radiator heater", "radiator"], "difficult_direct_answer": false, "rationales": ["The book is resting on top of the device made up of tubes which the bathroom uses to provide heat to the room.", "This is a metal device in rooms that has hot water flowing through it", "The book is on a thing with vertical metal columns."], "image": "train2014/COCO_train2014_000000195265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505388, "question_id": "iyfYgoY6sZhZ7tHe6k6HQG", "question": "Which one will reach the finish line first if they maintain their positions?", "choices": ["blue helmet", "red bike", "84", "14"], "correct_choice_idx": 3, "direct_answers": ["14", "white", "fourteen", "number 14", "left one", "inside driver", "left one", "number 14", "number 14", "left"], "difficult_direct_answer": false, "rationales": ["It'll be 14.", "Fourteen is on the interior.", "The bike with the number 14 is ahead of the other bike."], "image": "train2014/COCO_train2014_000000505388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352188, "question_id": "iyjK84P7eYbQVKXTGCHBmE", "question": "What should the pedestrians do in this situation?", "choices": ["wait", "run", "hide", "cross road"], "correct_choice_idx": 0, "direct_answers": ["cross street", "stop", "walk", "wait", "stop", "cross", "go", "not walk", "cross", "wait"], "difficult_direct_answer": false, "rationales": ["The traffic light is used to control cars coming into the direction of the walkway. since it is green, one should wait till it turns red.", "The green light is facing the other way.", "The cars have a green light to proceed; it's not safe for pedestrians until this light turns red and the cars stop."], "image": "train2014/COCO_train2014_000000352188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546518, "question_id": "iykLVCSH7AaFRKoDeo9hgi", "question": "Someone needs to score at least how many sets to win?", "choices": ["four", "five", "two", "eight"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "zero", "seven", "three", "six", "unknown", "three", "two"], "difficult_direct_answer": false, "rationales": ["A team or players needs that amount to win at tennis.", "The answer depends on the type of tennis being played, but at most three sets need to be won which eliminates all answers except the first.", "The player needs to score 2 sets to win."], "image": "train2014/COCO_train2014_000000546518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141376, "question_id": "iyuN9TJiXQHyTg2SGqy32q", "question": "What is the cat laying in front of?", "choices": ["freezer", "door", "refrigerator", "car door"], "correct_choice_idx": 2, "direct_answers": ["refrigerator", "refrigerator", "refridgeratior", "fridge", "open refrigerator", "fridge", "stretching", "refrigerator", "fridge", "refrigerator"], "difficult_direct_answer": false, "rationales": ["The cat is in front of the fridge door.", "The door is open and an apple is visible.", "The items in the refrigerator are plain to see since the door is open. the cat is lying in front of the open door."], "image": "train2014/COCO_train2014_000000141376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237210, "question_id": "izzBvorzPAArQosW6dkW66", "question": "These cars are stuck in what?", "choices": ["parade", "car show", "traffic jam", "parking lot"], "correct_choice_idx": 2, "direct_answers": ["traffic", "traffic", "traffic", "traffic", "traffic", "traffic", "traffic", "traffic", "traffic jam", "traffic"], "difficult_direct_answer": false, "rationales": ["The cars are in traffic.", "When cars are this close together there is frequently a slowdown which causes the phenomena known as answer a.", "The cars are in traffic."], "image": "train2014/COCO_train2014_000000237210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5312, "question_id": "izzwdPDq64LJ3VTEu77oah", "question": "What is the man kneeling going to do next?", "choices": ["play tennis", "fly kite", "go swimming", "race cars"], "correct_choice_idx": 1, "direct_answers": ["fly kite", "fly kite", "launch plane", "fly kite", "kite", "fly kite", "fly kite", "fly", "fly kite", "fly kite"], "difficult_direct_answer": false, "rationales": ["The man is preparing the object.", "He is holding a kite.", "A man is kneeling down on ground and about to put object in sky. it does best when the wind is strong."], "image": "train2014/COCO_train2014_000000005312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36828, "question_id": "j2BTjubQjgCip7Y9pJugqs", "question": "What is the surfer most likely looking up at?", "choices": ["people", "bridge", "sunset", "waves"], "correct_choice_idx": 2, "direct_answers": ["sunset", "sun", "sunset", "sun", "sun", "sun", "sun", "pier", "waves", "sunset"], "difficult_direct_answer": false, "rationales": ["The surfer is looking at the sunset.", "The surfer is looking towards the setting sun.", "The sun setting is beautiful."], "image": "train2014/COCO_train2014_000000036828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80895, "question_id": "j2EyLH2JJ8rpCv5JVL7zqG", "question": "If the man threw his phone 2 meters forward where would it land?", "choices": ["in water", "on boat", "in grass", "on woman"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "underwater", "water", "water", "in water", "in water", "water", "in water", "water"], "difficult_direct_answer": false, "rationales": ["The man is sitting near water.", "It would likely throw into water.", "A man is looking at his phone near the water."], "image": "val2014/COCO_val2014_000000080895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149571, "question_id": "j2dqQLFcAmeNXr2vJTAmBo", "question": "Which single object does he have that is least useful to him right now?", "choices": ["hat", "pants", "skateboard", "shoes"], "correct_choice_idx": 2, "direct_answers": ["backpack", "backpack", "backpack", "other skateboard", "second skateboard", "hat", "skateboard", "second skateboard", "skateboard", "extra skateboard"], "difficult_direct_answer": false, "rationales": ["He has a second skateboard on his back but he's already riding one.", "The man is already on a skateboard. he cannot ride two skateboards at the same time.", "The object is the board."], "image": "val2014/COCO_val2014_000000149571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61260, "question_id": "j2mrUVkVCMNYA7T7SFszce", "question": "What brand is famous for making the item the boy is holding?", "choices": ["louisville slugger", "green giant", "hbo", "goya"], "correct_choice_idx": 0, "direct_answers": ["louisville slugger", "louisville slugger", "louisville slugger", "wilson", "easton", "easton", "slugger", "louisville slugger", "louisville slugger", "louisville slugger"], "difficult_direct_answer": false, "rationales": ["The brand is slugger.", "The boy is holding a baseball bat. the brand's name is on the side of the bat.", "The brand that makes bats is called \"louisville slugger'"], "image": "train2014/COCO_train2014_000000061260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484761, "question_id": "j2p7QQK96fEJv4WT9cebbQ", "question": "What type of activities are happening here?", "choices": ["winter", "electronic", "aquatic", "aviation"], "correct_choice_idx": 3, "direct_answers": ["flying", "air show", "air show", "aviation", "plane demonstration", "air show", "air show", "flying formation", "aviation", "air show"], "difficult_direct_answer": false, "rationales": ["Several planes are in the air flying in formation.", "They are flying planes with race numbers", "Four airplanes fly in this image. aviation pertains to flight."], "image": "train2014/COCO_train2014_000000484761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434415, "question_id": "j2woiBa5r99JnYDbxsEReK", "question": "What is the oldest cap name?", "choices": ["panama", "stockman", "berets", "western"], "correct_choice_idx": 2, "direct_answers": ["fedora", "bonnet", "berets", "no idea", "fedora", "bonnet", "bonnet", "flat cap", "fedora", "bonnet"], "difficult_direct_answer": false, "rationales": ["The other options seem to be name of places.", "The berets are old.", "This hat is named after a south american country."], "image": "val2014/COCO_val2014_000000434415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230661, "question_id": "j3LPdRRJ4d5BQYTh79heKh", "question": "What does this truck do?", "choices": ["book donation", "mobile library", "book sale", "transportation"], "correct_choice_idx": 1, "direct_answers": ["library", "library books", "lend books", "delivers books", "deliver books", "library", "enriches lives", "mobile library", "distribute books", "deliver books"], "difficult_direct_answer": false, "rationales": ["It's a mobile library.", "There is a picture of a book on side of truck.", "On the side of the truck the words library is clearly visible. the fact that it on a truck implies a mobile library."], "image": "val2014/COCO_val2014_000000230661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390963, "question_id": "j3RWLVJjDXZcckpWavVLUd", "question": "What is the tarp above the giraffe being used to block?", "choices": ["wind", "sun", "rain", "insects"], "correct_choice_idx": 1, "direct_answers": ["sun", "sun", "sun", "sun", "sun", "sunshine", "light", "sun rays", "sun", "sunlight"], "difficult_direct_answer": false, "rationales": ["The tarp is for shade.", "It's blocking the sun.", "It is not raining out. wind or insects could enter from any direction, so the tarp above is not stopping them."], "image": "train2014/COCO_train2014_000000390963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363102, "question_id": "j3mxBAjJj3KzfBYapTCMWK", "question": "Why are her feet off the ground?", "choices": ["falling", "tripped", "hit ball", "running"], "correct_choice_idx": 2, "direct_answers": ["jumped", "power", "she jumped", "hitting", "jumping", "power", "jumping", "hitting ball", "jumping", "hit ball"], "difficult_direct_answer": false, "rationales": ["She's hitting the ball.", "This tennis player's feet are off the ground because she's leaping in order to reach the ball in time and smack it back with maximum force.", "She is leaping to hit the ball."], "image": "train2014/COCO_train2014_000000363102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43966, "question_id": "j3qUMa5DgskEXWXBKJ8Mef", "question": "This food is usually sold where?", "choices": ["candy store", "fishery", "pizzeria", "farm"], "correct_choice_idx": 2, "direct_answers": ["pizzeria", "pizzeria", "pizza parlor", "pizza shop", "restaurant", "restaurant", "italian restaurant", "pizza store", "pizza shop", "pizzeria"], "difficult_direct_answer": false, "rationales": ["The pizza is a speciality pizza.", "The pizza is on the plate.", "Pizzas are usually sold in pizza shops."], "image": "train2014/COCO_train2014_000000043966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519186, "question_id": "j4KsPVSft8oPpFwvoEW8ZG", "question": "What can obviously be used to save your life here?", "choices": ["bulletproof armor", "life vest", "water", "tiles"], "correct_choice_idx": 1, "direct_answers": ["buoy", "lifesaver", "life vest", "lifesavers", "boat", "life jacket", "life preservers", "floating tubes", "lifesaver", "lifesaver"], "difficult_direct_answer": false, "rationales": ["The life vests are needed.", "The \"life vests\" (actually life preservers) are on the boat in case of emergency because they help people avoid drowning if they're in the water accidentally.", "Life vests can safe a life."], "image": "train2014/COCO_train2014_000000519186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374597, "question_id": "j4SJtxkJUMsHQXyNuqU8Nc", "question": "Why is the batter wearing white gloves?", "choices": ["sanitary reasons", "style", "keeping warm", "increased grip"], "correct_choice_idx": 3, "direct_answers": ["hand protection", "protect hands", "grip", "protect hands", "grip", "hold firmly", "increased grip", "better grip", "better grip", "grip"], "difficult_direct_answer": false, "rationales": ["These are so the bat doesn't slip when he's swinging", "The gloves help with holding the bat tighter.", "The batter has a grip."], "image": "train2014/COCO_train2014_000000374597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213897, "question_id": "j4ZBWwEscn9QDc6kSQPqCx", "question": "What animal is there besides the giraffe?", "choices": ["none", "bear", "dog", "cat"], "correct_choice_idx": 1, "direct_answers": ["elephant", "elephant", "elephant", "elephant", "bear", "elephant", "elephant", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["A large grizzly can be found in the far pen.", "In the far back in another fenced in area is a brown animal that is not like the giraffees.", "A bear is in an enclosure behind some giraffes."], "image": "train2014/COCO_train2014_000000213897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431092, "question_id": "j4afA3s5goRczSjgj2aNEr", "question": "What is the man doing with the blue wand?", "choices": ["waving", "digging", "scratching", "playing"], "correct_choice_idx": 3, "direct_answers": ["throwing", "waving bubbles", "throwing ball", "playing fetch", "throwing ball", "throwing ball", "playing fetch", "playing", "playing dog", "standing"], "difficult_direct_answer": false, "rationales": ["The dog is playing.", "There is an excited dog right next to him looking in the same direction.", "The man is holding a blue wand that the dog can catch for fun."], "image": "val2014/COCO_val2014_000000431092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53328, "question_id": "j57P3GJsvQEhcuaiGs3EYq", "question": "Whose turn is it to play?", "choices": ["neither", "blue shirt", "both", "black shirt"], "correct_choice_idx": 3, "direct_answers": ["man", "left man", "right man", "black shirt", "black shirt", "thin man", "left", "blue jeans", "both", "black shirt"], "difficult_direct_answer": false, "rationales": ["The person is lined up and standing with the controllers at the ready.", "The man in the black shirt is playing the game.", "He is holding the controllers up making moves"], "image": "val2014/COCO_val2014_000000053328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351126, "question_id": "j5AJVs5saMVCjQcbCJgKy8", "question": "What company owns the red and white vehicle?", "choices": ["target", "costco", "ikea", "sam's club"], "correct_choice_idx": 0, "direct_answers": ["target", "target", "target", "target", "bus line", "target", "target", "target", "target", "skytrain"], "difficult_direct_answer": false, "rationales": ["A target logo is on the side of a red and white vehicle.", "It has the red branding and several of its logos printed on the side.", "Target's logo is red and white."], "image": "train2014/COCO_train2014_000000351126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403054, "question_id": "j5CvWsmXeFiEv5DGNpSDSm", "question": "What country does the white car originate from?", "choices": ["israel", "america", "japan", "canada"], "correct_choice_idx": 2, "direct_answers": ["japan", "usa", "japan", "china", "china", "china", "china", "sweden", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["The country is japan.", "The icon on the white car appears to be from the toyota make. this brand finds its origin in japan.", "The car comes from japan."], "image": "val2014/COCO_val2014_000000403054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247264, "question_id": "j5Fw7K9hLk3gTQBM7KYPnp", "question": "Why are there two oval patterns on the right man's pants?", "choices": ["fashion", "broken", "knee protection", "dress code"], "correct_choice_idx": 2, "direct_answers": ["knee pads", "pads", "for protection", "protection", "swimming", "knee pads", "knee protection", "knee pads", "knee pads", "kneepads"], "difficult_direct_answer": false, "rationales": ["The pads are placed over both of his kneecaps.", "The pants of the man has the two oval patterns is covering his two knees. with the padding that is in the area, it would seem that this would be for knee protection.", "It's knee protection."], "image": "val2014/COCO_val2014_000000247264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519768, "question_id": "j5c79WFmjCLPk5eLkiQdT3", "question": "Why have these people gathered?", "choices": ["to pray", "to learn", "to mourn", "to play"], "correct_choice_idx": 1, "direct_answers": ["learning", "to learn", "meeting", "workshop", "work", "to learn", "attend seminar", "presentation", "meeting", "teaching students"], "difficult_direct_answer": false, "rationales": ["The people are watching a presentation.", "The group of people seated have their laptops in front of them as they look at a teacher in front of the classroom with a presentation and the group of people seated are paying attention.", "There are multiple people sitting in a chair with a laptop on their lap. there is a projection screen showing some sort of map."], "image": "train2014/COCO_train2014_000000519768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557568, "question_id": "j5jheRmQgKqrp6kJqUg2p3", "question": "What is the player in the forefront doing?", "choices": ["designated runner", "practicing swing", "at bat", "stealing base"], "correct_choice_idx": 1, "direct_answers": ["swinging bat", "batting", "batting", "practicing swing", "baseball", "practice swinging", "practice swing", "batting", "practice", "practicing"], "difficult_direct_answer": false, "rationales": ["He is mimicking the actual batter player swinging to hit the real ball.", "The player closest is holding a baseball bat but is not currently batting as the batter is on the baseball diamond in the background. players holding a bat in this position and not actively batting in the game are likely to be practicing for when they are.", "The at bat player is near the umpire and catcher. the player in the forefront is in the on-deck circle and will be at bat next."], "image": "train2014/COCO_train2014_000000557568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442080, "question_id": "j5naoEzJsWPo6psLWSsPCW", "question": "What did this man serve in?", "choices": ["tennis", "military", "dog grooming", "horse show"], "correct_choice_idx": 1, "direct_answers": ["royal army", "military", "military", "military", "military", "royal army", "royal army", "military", "military", "military"], "difficult_direct_answer": false, "rationales": ["The man is riding a horse and wearing a uniform that looks like a soldier's uniform.", "The man has a solider outfit.", "The man is in the military."], "image": "train2014/COCO_train2014_000000442080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559837, "question_id": "j5pDbqZUxsg2apqTyShWmn", "question": "What should be put in the hole near the nearby child?", "choices": ["lightbulb", "key", "hand", "trash"], "correct_choice_idx": 3, "direct_answers": ["trash", "garbage", "garbage", "trash", "trash", "trash", "garbage", "trash", "trash", "chair"], "difficult_direct_answer": false, "rationales": ["It has the opening on the side so that rubbish can be dropped into it.", "There is a trash receptacle by the child so people can throw their garbage away.", "It is a trash can."], "image": "val2014/COCO_val2014_000000559837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241373, "question_id": "j5yt69ix4eBZ44x2QLSy7X", "question": "Why does the man have the yellow stick with him?", "choices": ["kill bugs", "to ski", "help walk", "protection"], "correct_choice_idx": 2, "direct_answers": ["walking cane", "help walk", "old man", "balance", "help walk", "support weight", "to walk", "help walking", "help walk", "cane"], "difficult_direct_answer": false, "rationales": ["The man has a cane.", "The man needs a walking cane.", "Often with age, bones or joints can deteriorate. humans often use canes or sticks to help move around."], "image": "val2014/COCO_val2014_000000241373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413676, "question_id": "j6L7tXXjPxYoQmmBnLFHXw", "question": "What is the small dog being carried in?", "choices": ["backpack", "purse", "crate", "cart"], "correct_choice_idx": 0, "direct_answers": ["backpack", "backpack", "bag", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack"], "difficult_direct_answer": false, "rationales": ["There is a dog head poking out of a backpack.", "The dog is in the large zipper pocket of the backpack.", "He is being carried in a backpack."], "image": "train2014/COCO_train2014_000000413676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329125, "question_id": "j6rHAjHBFPZJwZHwgSrFde", "question": "For what type of event is the man dressed?", "choices": ["semi-formal", "formal", "beach", "casual"], "correct_choice_idx": 3, "direct_answers": ["interview", "party", "graduation", "casual event", "party", "chilling", "beer bash", "casual", "party", "casual"], "difficult_direct_answer": false, "rationales": ["The man is wearing jeans an a white t-shirt.", "He is just wearing a t shirt and jeans.", "The man is dressed in jeans and a shirt."], "image": "train2014/COCO_train2014_000000329125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129216, "question_id": "j7Axk683uHE6veRsB3QA8J", "question": "What are the pink objects on the shelf?", "choices": ["boxes", "envelopes", "hard drives", "binders"], "correct_choice_idx": 3, "direct_answers": ["binder", "file folders", "binders", "binders", "magazine holders", "books", "binders", "binders", "books", "managing director"], "difficult_direct_answer": false, "rationales": ["He has binders.", "Large office supplies can be seen.", "Any student and/or employee knows that these look like."], "image": "train2014/COCO_train2014_000000129216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564515, "question_id": "j7MbFCS9zE5V5yvnurwwuv", "question": "What has the man just done?", "choices": ["stretched", "danced", "caught frisbee", "thrown frisbee"], "correct_choice_idx": 3, "direct_answers": ["thrown frisbee", "thrown frisbee", "thrown", "throw frisbee", "thrown frisbee", "thrown frisbee", "throw frisbee", "thrown frisbee", "throw frisbee", "throw frisbee"], "difficult_direct_answer": false, "rationales": ["The man has just released the frisbee that is midair.", "The man's body language shows he is not prepared to catch anything, and his arm is going in a forward motion.", "The man's body position and the position of the frisbee in relation to him implies that answer a has transpired."], "image": "train2014/COCO_train2014_000000564515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189744, "question_id": "j7U5LSP85WJ8zheQZQgqM8", "question": "What pizza place is on the main level?", "choices": ["little caesars", "pizza hut", "papa john's", "domino's"], "correct_choice_idx": 3, "direct_answers": ["domino's", "domino's", "domino's", "domino's", "domino's", "dominos", "domino's", "dominos", "pic notloading", "domino's"], "difficult_direct_answer": false, "rationales": ["The place is domino's.", "The name brand of the pizza place is on the moniker of the business.", "The store front lowest to the ground in this scene read's domino's. domino's is known as a pizza restaurant."], "image": "val2014/COCO_val2014_000000189744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496362, "question_id": "j7XPdQrHuH4B2LTHpChPTi", "question": "Where are these 2 people standing?", "choices": ["grass", "street", "trail", "beach"], "correct_choice_idx": 1, "direct_answers": ["road", "road", "near road", "street", "street", "street", "street", "street", "in street", "center road"], "difficult_direct_answer": false, "rationales": ["The two people closest to the foreground are standing on a paved surface with traffic lines painted on the ground and vehicles in the background. this elements are consistent with answer a.", "They are on a paved route which is closed off from motor vehicles.", "The people are in the street."], "image": "train2014/COCO_train2014_000000496362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79883, "question_id": "j7g4VkBjgsBEzaoBHPv9Gi", "question": "What image cheers the woman taking the zoom call we see?", "choices": ["herself", "pet owners", "nothing", "dog"], "correct_choice_idx": 3, "direct_answers": ["animal", "dog", "pet", "puppy", "dog", "face", "dog", "puppy", "cat", "puppy"], "difficult_direct_answer": false, "rationales": ["A person is on a computer screen and is holding a dog.", "There is an animal.", "The small screen on the right top hand is the woman smiling and there is a person holding a sleeping dog."], "image": "train2014/COCO_train2014_000000079883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576966, "question_id": "j895vEeKFnnWdXRoPzvvzW", "question": "Why are the buildings right on the water?", "choices": ["land scarce", "can swim", "need water", "good view"], "correct_choice_idx": 3, "direct_answers": ["dock", "unsure", "good view", "terrain issues", "building ease", "waterfront properties", "hotel accomodations", "views", "homes", "no room"], "difficult_direct_answer": true, "rationales": ["There is not much space to build anything.", "They are there for the view and to be on the water.", "The buildings have a good view."], "image": "val2014/COCO_val2014_000000576966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24457, "question_id": "j8UVvrqFmycQWEK283iSrV", "question": "For what purpose is the person reading?", "choices": ["teaching lessons", "school work", "pleasure", "mental edification"], "correct_choice_idx": 2, "direct_answers": ["entertainment", "leisure", "for enjoyment", "entertainment", "fight boredom", "entertainment", "entertainment", "entertainment", "pleasure", "for fun"], "difficult_direct_answer": false, "rationales": ["The person is enjoying a romance novel.", "The person is reading a romance book.", "The purpose is for pleasure."], "image": "train2014/COCO_train2014_000000024457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94033, "question_id": "j8cLDC3xDm6hoNXGh5a4rT", "question": "The rules of this game are similar to which game?", "choices": ["golf", "frisbee", "valleyball", "basketball"], "correct_choice_idx": 0, "direct_answers": ["catch", "frisbee", "golf", "catch", "pickleball", "golf", "golf", "golf", "basket ball", "basketball"], "difficult_direct_answer": false, "rationales": ["Similar to golf the man is trying to get the frisbee into the basket hole.", "You go from goal to goal and you have to throw a frisbee to try to get it in it.", "This is like golf but with a frisbee."], "image": "val2014/COCO_val2014_000000094033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13202, "question_id": "j8frkP3ka5XrTb8Pc8vLCm", "question": "What has the woman done with the white object?", "choices": ["threw it", "shot it", "caught it", "tackled it"], "correct_choice_idx": 0, "direct_answers": ["throw", "threw it", "thrown", "tossed", "thrown it", "tossed", "thrown it", "thrown it", "tossed frisbee", "threw frisbee"], "difficult_direct_answer": false, "rationales": ["The woman's arm is outstretched and the object is flying away.", "If one looks at the arm extending from this woman, the logical conclusion is that she just threw the white object in the air.", "The woman's body language indicates effort was put into launching something forwards."], "image": "train2014/COCO_train2014_000000013202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400919, "question_id": "j8rF7MQCkRWqZwbyyhTqKR", "question": "What kind of footage are they most likely looking at on the screen?", "choices": ["movie", "live sports", "lecture", "game"], "correct_choice_idx": 3, "direct_answers": ["wii", "video game", "video game", "video game", "themselves", "video game", "game", "boxing", "sports wii", "video game"], "difficult_direct_answer": false, "rationales": ["A video game controller can be seen in this right hand. it seems logical that he would be looking at the game he's playing.", "They are all watching with different emotions, as if they want different outcomes to occur.", "They play a game."], "image": "train2014/COCO_train2014_000000400919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432179, "question_id": "j9A5Bh9eZangsb67JtWf9E", "question": "What is the first number that appears on the train?", "choices": ["five", "seven", "eight", "nine"], "correct_choice_idx": 0, "direct_answers": ["five", "five", "five", "five", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["The first number printed on the side of the train is a five.", "The number is recognizable, and comes after 4 but before 6.", "A train has an identification number on the side in white letters."], "image": "val2014/COCO_val2014_000000432179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481940, "question_id": "j9ApK5MX9w63NtMnBPhQuh", "question": "What popular news agency as the same Acronym as the letters near the bottom of the plane?", "choices": ["associated press", "amazing press", "awesome people", "associated people"], "correct_choice_idx": 0, "direct_answers": ["usa", "ap news", "associated press", "ap", "no idea", "associated press", "associated press", "associated press", "associated press", "associated press"], "difficult_direct_answer": false, "rationales": ["The letters displayed is ap which is the initials for associated press which is a news agency.", "It's the ap.", "This agency is internationally known and recognizable."], "image": "train2014/COCO_train2014_000000481940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493526, "question_id": "j9EkhgYzfKoCUfhVRrBCuy", "question": "Which country's flag is draped over them?", "choices": ["united states", "united kingdom", "france", "canada"], "correct_choice_idx": 1, "direct_answers": ["england", "british", "britain", "united kingdom", "england", "england", "united kingdom", "united kingdom", "britain", "uk"], "difficult_direct_answer": false, "rationales": ["The flag of the uk is covering the couple.", "The flag is the uk's.", "People are laying on the ground with a flag that is red, white, and blue draped over them. the uk flag is red, white, and blue."], "image": "train2014/COCO_train2014_000000493526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237920, "question_id": "j9LUjVPxUghoh3vuFWCDB8", "question": "How can the red chairs be transported easily?", "choices": ["drag them", "stack them", "lift them", "turn them"], "correct_choice_idx": 1, "direct_answers": ["picking it", "stacked", "picked up", "stack them", "stacked up", "lightweight", "stacked/carried", "stackable", "carry", "stacked"], "difficult_direct_answer": true, "rationales": ["These types of red chairs are shaped in a way they make them easily stackable. the legs are guided against each other.", "These are plastic chairs that nestle to store", "The red chairs are stackable."], "image": "val2014/COCO_val2014_000000237920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255006, "question_id": "j9sxz5NK2cURppcKLchhWv", "question": "What is unusual about the person's less-popular phone system?", "choices": ["corded", "display", "number pad", "color"], "correct_choice_idx": 0, "direct_answers": ["corded", "corded", "corded", "corded", "mobile system", "cord", "landline", "corded", "corded", "cord"], "difficult_direct_answer": false, "rationales": ["The phone has a cord which is not common in the modern day.", "The phone is corded.", "The question is subjective, but modern phones rarely include cords so this would be unusual today."], "image": "train2014/COCO_train2014_000000255006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426343, "question_id": "jAFsqh6WenxmNogMA8zMtE", "question": "What room are they in?", "choices": ["office", "conference", "bathroom", "pantry"], "correct_choice_idx": 1, "direct_answers": ["conference", "person", "dorm room", "conference", "breakroom", "conference room", "game room", "persian conference", "conference room", "conference room"], "difficult_direct_answer": false, "rationales": ["There is a sign on the door that says \"persian conference room\".", "There is a sign on the door behind the people which labels the room and is consistent with answer a.", "Two people are standing in a commercial type room with advertisements on the door."], "image": "train2014/COCO_train2014_000000426343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75697, "question_id": "jAUkv8faSNiMbjx3ghN2AF", "question": "What might be age inappropriate here?", "choices": ["ring", "teddy bear", "necklace", "shirt"], "correct_choice_idx": 1, "direct_answers": ["outfit", "teddy bear", "teddy bear", "teddy bear", "stuffed toy", "woman", "dress", "teddy bear", "teddy bear", "bear"], "difficult_direct_answer": false, "rationales": ["The woman appears to be a little old to be carrying around a stuffed animal.", "The item the woman is holding is usually associated with children. the woman is an adult.", "Teddy bears are usually played with by children. this girl looks too old to play with teddy bears."], "image": "train2014/COCO_train2014_000000075697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496424, "question_id": "jAZyH3UtrBbzfMozdNDyJa", "question": "What type of facility is this?", "choices": ["bakery", "smokehouse", "meat department", "deli"], "correct_choice_idx": 0, "direct_answers": ["kitchen", "kitchen", "kitchen", "kitchen", "diner room", "bakery", "kitchen", "bakery", "bakery", "restaurant"], "difficult_direct_answer": false, "rationales": ["There are what looks to be yellow cake on the table with aluminum trays for baking.", "There are commercial ovens and cakes.", "The industrial kitchen is made with large ovens that is good for baking pastries and desserts."], "image": "train2014/COCO_train2014_000000496424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345822, "question_id": "jAbHgeZxhQDzGqLp96Fuen", "question": "What is the occupation of the man holding a camera?", "choices": ["actor", "athlete", "film director", "reporter"], "correct_choice_idx": 3, "direct_answers": ["photographer", "cameraman", "camera operator", "photographer", "photography", "photographer", "cameraman", "photographer", "videographer", "reporter"], "difficult_direct_answer": false, "rationales": ["Most reporters move around with cameras.", "The occupation is a reporter.", "He is talking to them as he films them"], "image": "val2014/COCO_val2014_000000345822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81103, "question_id": "jAgfTFwxe4k8F3jphs3BWg", "question": "What company makes the vehicle?", "choices": ["ford", "suzuki", "tesla", "general motors"], "correct_choice_idx": 1, "direct_answers": ["suzuki", "suzuki", "suzuki", "suzuki", "suzuki", "suzuki", "suzuki", "suzuki", "suzuki", "suzuki"], "difficult_direct_answer": false, "rationales": ["The company is suzuki.", "The motorcycle is a red suzuki", "The make is right on the tank."], "image": "val2014/COCO_val2014_000000081103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48572, "question_id": "jAz5SJEFToDZhaXg7mmiau", "question": "What is in the plate in the foreground?", "choices": ["orange", "banana", "shrimp", "apple"], "correct_choice_idx": 2, "direct_answers": ["shrimp", "shrimp salad", "shrimp", "shrimp", "shrimp", "shrimp", "shrimp", "shrimp", "shrimp lettuce", "shrimp"], "difficult_direct_answer": false, "rationales": ["The objects on the plate in the question are clearly visible and of a shape, size and color of answer a.", "Shrimp salad is shown on the plate.", "It is a seafood salad."], "image": "train2014/COCO_train2014_000000048572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524646, "question_id": "jBBMogih5KpHRCj3wAcLzb", "question": "What natural feature does the person on the water use for movement?", "choices": ["sun", "tsunami", "wind", "earthquakes"], "correct_choice_idx": 2, "direct_answers": ["sail", "wind", "wind", "sail", "parasail", "wind", "air", "water", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["He has a sail.", "The person uses wind in the sail for motion.", "They have a sail on the board to catch the wind to propel them forward"], "image": "train2014/COCO_train2014_000000524646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464937, "question_id": "jBGfQaDZPSBjVfC7ed33zQ", "question": "Where is this produce located?", "choices": ["market", "store", "refrigerator", "driveway"], "correct_choice_idx": 0, "direct_answers": ["farm", "basket", "basket", "baskets", "basket", "baskets", "bucket", "for purchase", "street market", "market"], "difficult_direct_answer": false, "rationales": ["The produce looks to be on display at an outdoor venue.", "There are fruits in baskets.", "There are two buckets of produce. it appears to be outside and dirty."], "image": "train2014/COCO_train2014_000000464937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360449, "question_id": "jBN4FBck6GFxhZb3edPEXg", "question": "During which time of the year are the vehicles traveling on this roadway?", "choices": ["winter", "spring", "summer", "fall"], "correct_choice_idx": 3, "direct_answers": ["night time", "summer", "fall", "july", "night", "night", "evening", "winter", "fall", "unknown"], "difficult_direct_answer": false, "rationales": ["The leaves on the trees appear to have fallen.", "Vehicles travel in the fall.", "It's fall."], "image": "val2014/COCO_val2014_000000360449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205756, "question_id": "jBNpqCeiXaFP2Pmgh7ZHrj", "question": "What is most likely in the large white jug?", "choices": ["ice cream", "popcorn", "liquid", "candy"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "water", "water", "liquid", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["These types of containers are typically used to hold a liquid in them and keep them cool.", "The jug has a drink in it.", "It is a cooler that is meant to keep beverages cold during picnics or outside events."], "image": "train2014/COCO_train2014_000000205756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274593, "question_id": "jCefk82eyskiJ3MWuSToKf", "question": "This city is the capital of which European country?", "choices": ["austria", "germany", "france", "belgium"], "correct_choice_idx": 3, "direct_answers": ["belgium", "belgium", "belgium", "belgium", "belgium", "belgium", "brussels", "italy", "belgium", "belgium"], "difficult_direct_answer": false, "rationales": ["It's in belgium", "This famous city is located in this country.", "Brussels is a city in belgium and is on the sign behind the hydrant."], "image": "val2014/COCO_val2014_000000274593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264599, "question_id": "jCfpTxwV6tNmwH7RGWp3jv", "question": "Where is Alpen's headquarters?", "choices": ["france", "america", "netherlands", "germany"], "correct_choice_idx": 2, "direct_answers": ["asia", "netherlands", "don't know", "japan", "asia", "japan", "japan", "tokyo", "straight", "unsure"], "difficult_direct_answer": false, "rationales": ["The he are in netherlands.", "The place is foreign.", "Alpen is a dutch company headquartered in the netherlands."], "image": "val2014/COCO_val2014_000000264599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366173, "question_id": "jCjiY8H6bdoY4Xxvw7m45f", "question": "Why is the man wearing a white helmet?", "choices": ["protection", "visibility", "dress code", "fashion"], "correct_choice_idx": 0, "direct_answers": ["riding motorbike", "protection", "protect head", "protection", "protection", "safety", "safety", "safety", "protection", "safety"], "difficult_direct_answer": false, "rationales": ["The man wants to protect his head.", "In case he falls off the bike, it does protect his head.", "He is traveling fast on the motorcycle and the helmet provides safety in case of an accident."], "image": "train2014/COCO_train2014_000000366173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335090, "question_id": "jCrBCMFWnL9vJfbJoXqRbb", "question": "What does the DB stand for?", "choices": ["danke bahn", "der bahn", "deutsche bende", "deutsche bahn"], "correct_choice_idx": 3, "direct_answers": ["die berge", "deutsche bahn", "deutsche bahn", "deutsche bahn", "denmark", "die berge", "der bahn", "deutsche bahn", "train number", "germany"], "difficult_direct_answer": false, "rationales": ["The db is dutch bahn.", "Db stands for deutsche bahn.", "The train has a db on its side which likely stands for deutsche bahn."], "image": "train2014/COCO_train2014_000000335090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468662, "question_id": "jCyduEKzigSyX7nMjpZYa5", "question": "What is the child on the right wearing?", "choices": ["suspenders", "helmet", "tie", "goggles"], "correct_choice_idx": 1, "direct_answers": ["baseball cap", "helmet", "helmet", "baseball helmet", "blue helmet", "shorts", "helmet", "helmet", "helmet", "batting"], "difficult_direct_answer": false, "rationales": ["The kid has a helmet.", "The child is wearing a baseball helmet.", "It is a hard plastic headpiece used for safety purposes."], "image": "train2014/COCO_train2014_000000468662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509471, "question_id": "jDCzDptTnWqbU4tU8ivok9", "question": "What is the man wearing a hat doing?", "choices": ["playing roshambo", "holding plate", "catching frisbee", "throwing frisbee"], "correct_choice_idx": 2, "direct_answers": ["playing frisbee", "pushing frisbee", "catching frisbee", "catching frisbee", "catching frisbee", "catching frisbee", "catching frisbee", "playing frisbee", "catching frisbee", "catching"], "difficult_direct_answer": false, "rationales": ["The man with the hat is holding out his hands to catch a frisbee.", "The man is holding his hands out to a frisbee as if catching it.", "He is catching a frisbee."], "image": "val2014/COCO_val2014_000000509471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215579, "question_id": "jDJ5scjTnPy28nuH6eCfrd", "question": "What is unique about this cat?", "choices": ["sleeps standing", "runs fast", "watches tv", "eats fruit"], "correct_choice_idx": 2, "direct_answers": ["tabby pattern", "watch tv", "watching tv", "watching tv", "watching tv", "attentive", "watching tv", "it's coat", "watches tv", "watching tv"], "difficult_direct_answer": false, "rationales": ["The cat is watching tv.", "The cat is watching the television.", "A cat is staring at a television screen."], "image": "val2014/COCO_val2014_000000215579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272800, "question_id": "jDktrCb5vo2QrdujvQn3KL", "question": "What type of room is this?", "choices": ["living room", "dining room", "bedroom", "kitchen"], "correct_choice_idx": 0, "direct_answers": ["living room", "living room", "office", "living room", "office", "living room", "home office", "office", "living room", "livingroom"], "difficult_direct_answer": false, "rationales": ["There is a television and a coffee table in the room.", "There is a tv.", "This is someones living room."], "image": "train2014/COCO_train2014_000000272800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398733, "question_id": "jDurChcEN9MqXGpca7xRGf", "question": "What is the women missing that many women have on in public?", "choices": ["tiara", "make-up", "fur coat", "hat"], "correct_choice_idx": 1, "direct_answers": ["jewelry", "makeup", "makeup", "make-up", "handbag", "mask", "bra", "hijab", "cell phone", "makeup"], "difficult_direct_answer": false, "rationales": ["The woman has no makeup.", "The woman is going au naturel in terms of skincare.", "She has makeup."], "image": "train2014/COCO_train2014_000000398733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545040, "question_id": "jEUQCGqHtMAHyYCNxaEZvf", "question": "From which deck area must passengers depart this train?", "choices": ["far platform", "upper roof", "near platform", "below ground"], "correct_choice_idx": 0, "direct_answers": ["far platform", "left side", "left deck", "left", "on arrival", "left", "platform", "left", "left", "left deck"], "difficult_direct_answer": false, "rationales": ["The far platform is for departures.", "Passengers depart from the right side of the train cab to the platform far away.", "The area is the far one."], "image": "val2014/COCO_val2014_000000545040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322707, "question_id": "jEXud5YZhiotYxJLd5SkZc", "question": "What type of food is advertised on the cart?", "choices": ["muffin", "hot dog", "hamburger", "bagel"], "correct_choice_idx": 1, "direct_answers": ["hot dog", "hotdog", "fast", "hot dog", "hot dog", "hot dog", "hot dogs", "hot dog", "snacks", "hot dog"], "difficult_direct_answer": false, "rationales": ["The white text on the left of the cart is the advertisement for this type of food. it consists of a bun, a sausage, and condiments.", "It is served in a cart.", "You can see the picture and word showing the hot dog on the left side."], "image": "train2014/COCO_train2014_000000322707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369140, "question_id": "jEk7MTDXxaqFx3UP7s6uEZ", "question": "If there more than one direction for the street up ahead?", "choices": ["yes", "maybe", "unsure", "no"], "correct_choice_idx": 3, "direct_answers": ["no", "no", "no", "no", "no", "snow", "maybe", "unknown", "no", "no"], "difficult_direct_answer": false, "rationales": ["They don't show enough of the street up ahead to tell accurately.", "No there isnt", "There is a sign saying one way with an arrow."], "image": "train2014/COCO_train2014_000000369140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382630, "question_id": "jEpYshhECXLhdaeCxMq5kT", "question": "What is the train close to?", "choices": ["trucks", "cars", "cows", "trees"], "correct_choice_idx": 3, "direct_answers": ["trees", "ocean", "trees", "trees", "trees", "bushes", "tree", "trees", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["There are green plants, not animals or other vehicles, near the train. they are tall.", "The train is running past trees.", "There are trees in the foreground."], "image": "train2014/COCO_train2014_000000382630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263101, "question_id": "jEwPGLMQsLLrLtHym2hALv", "question": "The person nearest has what handicap?", "choices": ["lameness", "hearing", "heart defect", "blindness"], "correct_choice_idx": 3, "direct_answers": ["blindness", "blind", "blind", "blind", "blindness", "limp", "blind", "blind", "limp", "blind"], "difficult_direct_answer": false, "rationales": ["They have a stick.", "The person is using a special cane that helps the visually impaired to scan for obstacles in their environment.", "They're blind."], "image": "train2014/COCO_train2014_000000263101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477468, "question_id": "jEwiVecJJa3Crjz6W72JGH", "question": "What are the green items on top of the tomatoes on the man's sandwich?", "choices": ["relish", "avocados", "lettuce", "pickles"], "correct_choice_idx": 1, "direct_answers": ["pickles", "avocados", "avocado", "apple slices", "avocado", "snap peas", "apple", "avocados", "avocado", "avocado"], "difficult_direct_answer": false, "rationales": ["The items on top of the tomatoes have a green and yellow coloring. also they appear to be relatively soft. these all point to the item being a favorite topping or avocados.", "The green items are avocadoes.", "The green parts are from that produce."], "image": "train2014/COCO_train2014_000000477468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2498, "question_id": "jF5NQUjkxfLrWC7qMLcYcN", "question": "What is the cause of the puddle of water in the foreground of the Frisbee players?", "choices": ["snow", "sleet", "rain", "low tide"], "correct_choice_idx": 3, "direct_answers": ["rain", "rain", "rainstorm", "rain", "tide activity", "water", "rain", "low tide", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["When the tide is low a puddle is created.", "The cause is low tide.", "Kids are playing on a beach in which the water is receding. water recedes during low tide."], "image": "train2014/COCO_train2014_000000002498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270475, "question_id": "jFBRN2uZ6JmuNhRMbadexw", "question": "What are the metal poles on the window called?", "choices": ["shuttle sticks", "handles", "rackets", "wipes"], "correct_choice_idx": 3, "direct_answers": ["wipers", "supports", "wipers", "panes", "wipers", "divider", "wipes", "windshield wipers", "wipers", "windshield wipers"], "difficult_direct_answer": false, "rationales": ["A bus has long metal objects on the windshield used when it rains.", "They are wipes that move back and forth to keep the windshield clear of rain and snow.", "The metal poles are windshield wipers."], "image": "train2014/COCO_train2014_000000270475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40910, "question_id": "jFS7KYzhxNWnKgjRtMftTa", "question": "What are the trees with white bases called?", "choices": ["willow trees", "pine trees", "birch trees", "palm trees"], "correct_choice_idx": 3, "direct_answers": ["palm", "palm", "palm", "palm", "palm trees", "palm trees", "palm trees", "palm trees", "palm", "palm"], "difficult_direct_answer": false, "rationales": ["The thin trunks and wispy long thin leaves of this tropical tree identifies them as palm trees.", "Tall trees with large leaves are on a beach.", "Palms usually grow in tropical beachy areas."], "image": "train2014/COCO_train2014_000000040910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270789, "question_id": "jFbnw5goA5sBsN7n3CxGHS", "question": "What time can parking be found near this location?", "choices": ["evening only", "24 hours", "no time", "morning only"], "correct_choice_idx": 1, "direct_answers": ["24", "twenty four", "24hr", "24 hours", "anytime", "24 hours", "24 hours", "24 hours", "anytime", "never"], "difficult_direct_answer": false, "rationales": ["A sign advertises twenty four hour parking on the side of the road.", "There is a sign that says no standing anytime.", "There is a black sign with white text. it indicates when parking is available."], "image": "train2014/COCO_train2014_000000270789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197690, "question_id": "jFfy8BGmVe9tyMvMBJdyHK", "question": "What switch hitting Atlanta Braves legend is at the plate?", "choices": ["ozzie albies", "chipper jones", "freddie freeman", "otis nixon"], "correct_choice_idx": 1, "direct_answers": ["batter", "chipper jones", "batter", "chipper jones", "chipper jones", "chipper jones", "chipper jones", "riley", "chipper jones", "chipper jones"], "difficult_direct_answer": false, "rationales": ["This would be chipper jones.", "That's chipper jones.", "Based on his physical characteristics, that batter is none other than atlanta braves switch hitter chipper jones."], "image": "train2014/COCO_train2014_000000197690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163525, "question_id": "jG2a7u6t5QMoK429K9JToZ", "question": "What material is the sail mast made of?", "choices": ["wood", "aluminum", "copper", "iron"], "correct_choice_idx": 1, "direct_answers": ["aluminum", "aluminum", "canvas", "lightweight material", "nylon", "canvas", "canvas", "plastic", "canvas", "canvas"], "difficult_direct_answer": false, "rationales": ["The sail looks like it's sturdy without pulling the boat down.", "The mast is made out of a grey metal. the water would cause iron or copper to rust.", "The mast of the boat is a tall pole made from aluminum."], "image": "train2014/COCO_train2014_000000163525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50462, "question_id": "jGB68vpfXgj7jtDKAqHy7s", "question": "Who do the luggage belong to?", "choices": ["homeless people", "passengers", "drivers", "station workers"], "correct_choice_idx": 1, "direct_answers": ["train riders", "passengers", "passengers", "passengers", "man", "passengers", "passengers", "travelers", "passenger", "traveler"], "difficult_direct_answer": false, "rationales": ["The luggage is next to a waiting area for a train meant to transport people.", "The baggage is there for the passengers to pick up.", "This is a train station and luggage outside of the waiting area most likely belongs to the train goers."], "image": "train2014/COCO_train2014_000000050462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541319, "question_id": "jGHtL8gnVWEuhBbYqsQczY", "question": "How is the young girl feeling?", "choices": ["angry", "sad", "amazed", "fearful"], "correct_choice_idx": 2, "direct_answers": ["happy", "happy", "happy", "amazed", "happy", "happy", "happy", "delighted", "happy entertained", "happy"], "difficult_direct_answer": false, "rationales": ["The girl is amazed.", "She has an open-mouth smile", "The young girl is feeling amazed by the orange."], "image": "train2014/COCO_train2014_000000541319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562232, "question_id": "jGRvUt3pEJw8VJabMLikaZ", "question": "What item is held up by the man here?", "choices": ["iv bag", "poison", "milk bottle", "whiskey"], "correct_choice_idx": 0, "direct_answers": ["horse", "iv bag", "fluid container", "iv", "medication iv", "feeder", "iv bag", "cathode", "iv", "water bottle"], "difficult_direct_answer": false, "rationales": ["Men are standing around with their hands on a horse. one of the men is holding a bottle with a line connected to it up near the horse.", "Looks like they are giving the horse medication.", "There's liquid in the bag, with a hose connecting it to the horse."], "image": "val2014/COCO_val2014_000000562232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196170, "question_id": "jGneZMTcTNtLjL3PGMkzQq", "question": "Why are the patrons eating with chopsticks?", "choices": ["superiority", "for fun", "for authenticity", "as joke"], "correct_choice_idx": 2, "direct_answers": ["noodles", "asian cuisine", "asian food", "asian restaurant", "asian", "asian food", "chinese restaurant", "asian cuisine", "for authenticity", "noodles"], "difficult_direct_answer": false, "rationales": ["They wanted to eat the food like they're in asia.", "The people are eating chinese food.", "This is an asian restaurant and they are eating with them because it makes the meal more authentic."], "image": "train2014/COCO_train2014_000000196170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94863, "question_id": "jGuEoBUGv3J4Aswj7P7uRh", "question": "What is sent into the black/yellow tube?", "choices": ["laundry", "grass", "water", "garbage"], "correct_choice_idx": 3, "direct_answers": ["mail", "trash", "trash", "garbage", "garbage", "trash", "garbage", "students", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["The black and yellow container by the building is to dispose of trash.", "The garbage is sent.", "There is a large garbage chute."], "image": "train2014/COCO_train2014_000000094863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253903, "question_id": "jH9YTWEhez6yTggVKTd2vf", "question": "Which direction is the arrow pointing?", "choices": ["left", "right", "down", "up"], "correct_choice_idx": 3, "direct_answers": ["up", "vertically up", "up", "up", "up", "up", "north", "forward", "up", "up"], "difficult_direct_answer": false, "rationales": ["The sign above the platform has an arrow on it that is pointing towards the sky.", "It's pointing at the ceiling", "The arrow is pointed towards the sky in the up direction."], "image": "train2014/COCO_train2014_000000253903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480005, "question_id": "jHAfycfqhVEmyQ93hhiN64", "question": "Where is the person taking the vegetables on the boat?", "choices": ["throwing away", "to market", "home", "church"], "correct_choice_idx": 1, "direct_answers": ["on land", "to market", "market", "near lake", "market", "shore", "to market", "market", "front", "to market"], "difficult_direct_answer": false, "rationales": ["They are stacked up to sell what they grew", "They have many of them but only two types, which they're taking to sell.", "This is how these vendors get their products to a place to sell them."], "image": "train2014/COCO_train2014_000000480005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108111, "question_id": "jHFkpSbLqwPafUDw9ae8ri", "question": "What kind of gathering is this?", "choices": ["business", "religious", "social", "family"], "correct_choice_idx": 0, "direct_answers": ["business meeting", "board", "office meeting", "meeting", "business", "work", "staff meeting", "meeting", "family", "business meeting"], "difficult_direct_answer": false, "rationales": ["People sit at a large table with computers and papers all around.", "These people are gathered together for a meeting that involves the use business equipment.", "They look to be working on work stuff."], "image": "val2014/COCO_val2014_000000108111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364934, "question_id": "jHZyQ8yjB6GZjh3yi7dWFn", "question": "What is number twenty four trying to do?", "choices": ["backflip", "tackle girl", "steal ball", "sit down"], "correct_choice_idx": 2, "direct_answers": ["steal ball", "defend", "steal ball", "block", "guard", "steal ball", "steal ball", "steal ball", "guard", "defend ball"], "difficult_direct_answer": false, "rationales": ["The person is going after the ball. there is no tackling in soccer.", "A player in red is kicking a ball. the other person in white is trying to kick ball and intercept the ball as it rolls.", "The girl is trying to steal the ball."], "image": "train2014/COCO_train2014_000000364934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310317, "question_id": "jHpAynd2sRLZWoQNfy3j39", "question": "What are the two watching in the distance?", "choices": ["football", "skateboarding", "birds", "boxing"], "correct_choice_idx": 1, "direct_answers": ["skateboarder", "skateboarder", "skateboarder", "skateboarder", "person skateboarding", "skateboarder", "skateboard tricks", "skateboarding", "skateboarder", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The person in the background has his arms in the air for balance and a board under his feet on top of a fire hydrant doing tricks.", "The kids are looking at the skateboarder.", "There is a guy on a skateboard on a ramp in the distance."], "image": "train2014/COCO_train2014_000000310317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295480, "question_id": "jJ7QwdkcAhPNQYUMXS3GS9", "question": "This airline is based out of what city?", "choices": ["bern", "helsinki", "capetown", "quebec"], "correct_choice_idx": 3, "direct_answers": ["paris", "montreal", "montreal", "quebec", "montreal", "rome", "canada", "montreal", "montreal", "montreal"], "difficult_direct_answer": false, "rationales": ["The canadian flag on this plane gives us a hint that it's airline is headquartered out of quebec.", "Air transit logos can be seen on the planes and air transit is based out of quebec.", "A small flag can be seen at the tail of an airplane. it has red and white stripes and small maple leaf."], "image": "train2014/COCO_train2014_000000295480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574140, "question_id": "jJGCPcPtBCeHVj8WbQ6u4y", "question": "What type of event is he attending?", "choices": ["concert", "meeting", "game", "reception"], "correct_choice_idx": 3, "direct_answers": ["gathering event", "black tie", "wedding", "wedding", "magic show", "wedding", "wedding", "party", "wedding", "reception"], "difficult_direct_answer": false, "rationales": ["A reception is usually fancy and involves a meal.", "He is at a reception.", "Looks like he is at a wedding reception."], "image": "train2014/COCO_train2014_000000574140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365947, "question_id": "jJGJDGUbUPMNNhKBPJSZ2q", "question": "What are the brown objects on the metal pans?", "choices": ["mushrooms", "potatoes", "bread", "roots"], "correct_choice_idx": 1, "direct_answers": ["potatoes", "potatoes", "buns", "potatoes", "potatoes", "potatoes", "handles", "potatoes", "potatoes", "dough"], "difficult_direct_answer": false, "rationales": ["The brown objects are potatoes.", "Potatoes are beige and round.", "They are this color because they are grown in the ground"], "image": "train2014/COCO_train2014_000000365947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41763, "question_id": "jJGWj27ox5CfVmqL8Pt7LP", "question": "What has to occur in order for the fire extinguisher to be used?", "choices": ["accident", "fire", "flood", "crime"], "correct_choice_idx": 1, "direct_answers": ["unscrewed", "unscrew cap", "fire", "fire", "wrench open", "fire", "fire", "unscrewed bolt", "fire", "fire"], "difficult_direct_answer": false, "rationales": ["The hydrant contains water that they get to put up flames. water helps dose these flames and put out them.", "Using a hydrant during a flood would make it worse. aside from arson, a hydrant normally would not be useful after an accident or crime.", "These are used by firefighters for the purpose of their jobs when heat is involved."], "image": "train2014/COCO_train2014_000000041763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491992, "question_id": "jJk8pzeMez7a3stbYVGeKC", "question": "What activity is being demonstrated?", "choices": ["rafting", "canoeing", "surfing", "paddling"], "correct_choice_idx": 3, "direct_answers": ["waterboarding", "paddle boarding", "paddleboard", "paddle board", "standup paddle-boarding", "paddleboarding", "surf", "paddling", "fishing", "water boarding"], "difficult_direct_answer": true, "rationales": ["The activity is paddling.", "The people are paddling on the boards.", "The man is trying to paddle through water."], "image": "train2014/COCO_train2014_000000491992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561488, "question_id": "jKAdLWHqXR9ycsZPpte4EE", "question": "What type of shot is the woman hitting?", "choices": ["slice", "serve", "forehand", "backhand"], "correct_choice_idx": 2, "direct_answers": ["backhand", "forehand", "right hand", "forehand groundstroke", "tennis", "swing", "forehand", "return", "forehand", "return"], "difficult_direct_answer": false, "rationales": ["The shot uses the forehand.", "The woman is using her forehand.", "The woman is on a tennis court reaching for the ball with her right hand."], "image": "train2014/COCO_train2014_000000561488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271987, "question_id": "jKMwNX23y7zJzorovVShD2", "question": "What type of birds are in the sky?", "choices": ["sea gulls", "ravens", "penguins", "doves"], "correct_choice_idx": 0, "direct_answers": ["sea gulls", "seagulls", "seagulls", "seagulls", "seagulls", "gulls", "seagulls", "seagull", "sea gulls", "seagulls"], "difficult_direct_answer": false, "rationales": ["Sea gulls are the only bird which would be found in such a rocky ocean scene in such numbers as seen in this picture.", "The setting implies the birds would be those that habitat in and water based setting. the shape of the birds is additionally consistent with answer a.", "The birds are flying over the water."], "image": "train2014/COCO_train2014_000000271987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217201, "question_id": "jLj7CFGbMwMVdPqLxDabN3", "question": "What is the relationship between these two teams?", "choices": ["different league", "rivals", "different division", "different sport"], "correct_choice_idx": 1, "direct_answers": ["opponents", "opposition", "opponent", "opposing", "rivals", "baseball teams", "opponents", "rivals", "opponents", "opponents"], "difficult_direct_answer": false, "rationales": ["The relationship is a rival.", "The two teams are playing each other in a baseball game so they would be rivals.", "They are opponents in every game they play with each other."], "image": "train2014/COCO_train2014_000000217201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330649, "question_id": "jLzc3bSpftxtpxYJEFzdK5", "question": "Why is the person still in the car with the door open?", "choices": ["stuck", "safety", "indecision", "not ready"], "correct_choice_idx": 1, "direct_answers": ["getting in", "entering", "waiting", "bus blocking", "leaving", "waiting", "getting out", "getting out", "safety", "bus proximity"], "difficult_direct_answer": false, "rationales": ["The person needs safety.", "The person cannot get out because there is a bus next to the car.", "The person wants to stay safe."], "image": "train2014/COCO_train2014_000000330649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563641, "question_id": "jMFgQSnericap23ec3jj3z", "question": "What are the people waiting to do?", "choices": ["pay", "eat", "play", "board"], "correct_choice_idx": 3, "direct_answers": ["board", "board train", "board train", "board train", "board train", "board", "board", "board train", "get on", "board train"], "difficult_direct_answer": false, "rationales": ["The people are standing on a platform next to a train which indicates they are likely there for the purposes of getting on the train.", "It is a train stopped at a station and they are standing on the platform where it is customary to board.", "Passengers wait on a platform where a train stops to allow them to board."], "image": "val2014/COCO_val2014_000000563641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389074, "question_id": "jMWKiFcPL8K7EeXyjD7Rfo", "question": "How do the women taking know each other?", "choices": ["rivals", "coworkers", "teammates", "neighbors"], "correct_choice_idx": 2, "direct_answers": ["good", "doubles team", "tennis players", "teammates", "tennis partners", "teammates", "doubles partners", "teammates", "teammates", "sisters"], "difficult_direct_answer": false, "rationales": ["This appears to be the case given that they're on the same side of the net.", "The women are on the same side of the tennis court because they are playing on the same team.", "They are playing doubles tennis together."], "image": "train2014/COCO_train2014_000000389074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484341, "question_id": "jMqobvUp9DLZ4vakzjviaY", "question": "What form of communication is practised in the area behind the cow?", "choices": ["telephoning", "letter writing", "internet", "telegraphing"], "correct_choice_idx": 0, "direct_answers": ["telephone", "telephoning", "phone", "telephone", "transporting", "telephone conversation", "phone", "telecommunication", "telephone", "phone"], "difficult_direct_answer": false, "rationales": ["There's a telephone booth behind the cow. people talk on the phone in a telephone booth.", "The device in the booth has a handset that allows a person to talk to someone else.", "There is a booth with a device that has a receiver and speaker allowing people to make calls."], "image": "train2014/COCO_train2014_000000484341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449384, "question_id": "jMtzjfoLs3fkqMT6hg9Kjp", "question": "Why is the girl holding an open umbrella?", "choices": ["for photo", "to dance", "staying dry", "fashion"], "correct_choice_idx": 2, "direct_answers": ["rain", "rain", "rainy", "rain protection", "rain protection", "raining", "staying dry", "rain forthcoming", "drizzling", "raining"], "difficult_direct_answer": false, "rationales": ["A child is holding an umbrella in the rain. umbrellas are used to block rain.", "The girl is preventing herself from coming into contact to rain.", "It's a cloudy day and the ground is wet."], "image": "train2014/COCO_train2014_000000449384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280971, "question_id": "jMx4iUELAaVgfPQmuw9LpF", "question": "How many bedrooms are in the apartment that is available for rent?", "choices": ["one", "three", "two", "zero"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["A sign advertising an apartment for rent says one bedroom on it.", "There is one.", "The amount of bedrooms is listed on the sign."], "image": "train2014/COCO_train2014_000000280971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458592, "question_id": "jN4gVtWN9UqwUhfugNBVmT", "question": "Where is this room located?", "choices": ["work", "home", "hotel", "school"], "correct_choice_idx": 2, "direct_answers": ["bathroom", "hotel", "motel/bathroom", "bathroom", "bathroom", "hotel", "hotel", "hotel", "hotel room", "bathroom"], "difficult_direct_answer": false, "rationales": ["The room is in a hotel.", "A bathroom with commercial fixtures is shown with a dispenser for seat covers.", "This bathroom appears to be located in a place that provide lodging for guests and tourists. the glasses have paper coasters underneath, there are some toiletries provided by the establishment, the towels are folded special, and there is an ice bucket. all of these items suggest that this is a room located in a travel lodging location."], "image": "train2014/COCO_train2014_000000458592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129247, "question_id": "jNCNiNaH5MCygmSSRWszph", "question": "Where does the man carry his cell phone?", "choices": ["shirt pocket", "jeans pocket", "messenger bag", "side holster"], "correct_choice_idx": 3, "direct_answers": ["in hand", "pocket", "right hand", "bathroom", "side holster", "holster", "holster", "hand", "by neck", "hand"], "difficult_direct_answer": false, "rationales": ["The man has a holder to carry his cell phone on his side.", "There is a side part on the phone.", "The man in the mirror has a holster on the side of his belt to carry his cell phone."], "image": "train2014/COCO_train2014_000000129247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92686, "question_id": "jNNhJ24zgy4j9dhrHq5qJX", "question": "Where is she standing?", "choices": ["zoo", "market", "home", "park"], "correct_choice_idx": 2, "direct_answers": ["indoors", "near tv", "living room", "room", "dining room", "by tv", "living room", "home", "dining room", "bedroom"], "difficult_direct_answer": false, "rationales": ["The inside looks like a house.", "The woman is inside a building. there are no stores, benches, or animals near her.", "It appears as if she is in the living room."], "image": "train2014/COCO_train2014_000000092686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576082, "question_id": "jNRveYXuji6bTHpFR4pFmk", "question": "In which way is this person communicating currently?", "choices": ["none", "textually", "visually", "verbally"], "correct_choice_idx": 1, "direct_answers": ["using cellphone", "text message", "phone", "texting", "text message", "texting", "textually", "text", "cellphone", "texting"], "difficult_direct_answer": false, "rationales": ["The person is texting.", "The person is using their fingers to craft a text message.", "The woman is holding and typing into her cell phone."], "image": "train2014/COCO_train2014_000000576082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438219, "question_id": "jNZzQiP2tVbGkbXAoTJpU2", "question": "What is the major German city closest to the locomotive?", "choices": ["munich", "leipzig", "hamburg", "berlin"], "correct_choice_idx": 1, "direct_answers": ["leipzig hauptbahnhof", "leipzig", "leipzig", "saxony", "leipzig", "munich", "leipzig", "amsterdam", "leipzig", "leipzig"], "difficult_direct_answer": false, "rationales": ["Leipzig is closest based on the signage.", "Leipzig is close.", "It is the name on the sign of the train heading in that direction."], "image": "train2014/COCO_train2014_000000438219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255026, "question_id": "jNivi4gmWwsNgLfHRqNEZR", "question": "What color are the stripes on the sleeves of the jacket worn by the motorcyclist in front?", "choices": ["blue", "white", "green", "red"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color of the stripes is grayish white.", "The steps are white.", "They are the same as the road stripes"], "image": "train2014/COCO_train2014_000000255026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523754, "question_id": "jP572dwAiYNZRpxogXU8bn", "question": "How many surfboards are there?", "choices": ["seven", "five", "nine", "four"], "correct_choice_idx": 1, "direct_answers": ["nine", "eight", "five", "five", "five", "six", "four", "nine", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["It's easy to count them as they are all clearly visible.", "The boards are clearly visible and countable based on their distinct outlines.", "There are 5 standing up and two on the right laying down."], "image": "train2014/COCO_train2014_000000523754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40341, "question_id": "jP6wVfkUHvHWiUC3m2cxtT", "question": "How did this man get to this location immediately prior to taking this picture?", "choices": ["skied", "walked", "flew", "jogged"], "correct_choice_idx": 1, "direct_answers": ["skis", "driving", "skiing", "car", "drove there", "walked", "skis", "skis", "skiing", "skied there"], "difficult_direct_answer": false, "rationales": ["He is standing on a sidewalk, which means he walked and it's too snow covered to safely jog.", "The man walked.", "The man had walked to arrive at this location."], "image": "val2014/COCO_val2014_000000040341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231379, "question_id": "jPEgzyK3APGuvD7pANuBG2", "question": "What material is used to make the sheet laid on by the man?", "choices": ["nylon", "leather", "wool", "cotton"], "correct_choice_idx": 0, "direct_answers": ["canvas", "rope", "cloth", "nylon", "hemp", "cloth", "cotton", "cloth", "cloth", "cloth"], "difficult_direct_answer": false, "rationales": ["The man is in a hammock.", "Nylon is used to make the hammock used by the man here.", "The material is nylon."], "image": "val2014/COCO_val2014_000000231379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294124, "question_id": "jPQiNASX2CXaxUF6KWaHb3", "question": "What is the bald person in the jacket a part of?", "choices": ["circus", "rockettes", "13th legion", "event staff"], "correct_choice_idx": 3, "direct_answers": ["school", "sports management", "event staff", "event staff", "security", "security", "event staff", "event staff", "event staff", "owner"], "difficult_direct_answer": false, "rationales": ["A man in a jacket with employee designation on the back is standing on a basketball court.", "It says it on his jacket", "We can see the letters 'even' and below that 'staf' on the back of the bald man's jacket. given that we cannot see all the word's printed here we can conclude it reads event staff."], "image": "val2014/COCO_val2014_000000294124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331549, "question_id": "jPYLHrXKzynsW9vXf2ULRB", "question": "Why have the two men gripped hands?", "choices": ["to swing", "showing respect", "arm wrestling", "to dance"], "correct_choice_idx": 1, "direct_answers": ["good", "sportsmanship", "sportsmanship", "shaking hands", "showing respect", "sportsmanship", "good sportsmanship", "congratulations", "shaking hands", "sportsmanship"], "difficult_direct_answer": false, "rationales": ["These two athletes shake hands over the net which they have or will presumably compete against one another.", "This is a standard occurance after a tennis match. players shake hands to show respect for each other for a good game.", "The men show respect."], "image": "train2014/COCO_train2014_000000331549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109521, "question_id": "jPcdE2rjt6dVivFubbcV3E", "question": "What are the individuals looking at across the water?", "choices": ["land", "trees", "sand", "nature"], "correct_choice_idx": 0, "direct_answers": ["land", "trees", "buoy", "men", "looking fish", "trees", "shore", "trees", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["The individuals are looking at land across the water.", "They are looking at the land on the side of the river.", "They want to get to land."], "image": "train2014/COCO_train2014_000000109521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526458, "question_id": "jPuCwtFZ2bh9w48wZ7RwYV", "question": "Who owns this dog?", "choices": ["hippies", "vagabond", "police", "surfer"], "correct_choice_idx": 3, "direct_answers": ["surfer", "surfer", "left man", "surfer", "man", "surfer", "surfer", "man", "man", "surfer"], "difficult_direct_answer": false, "rationales": ["The surfer is the only other person in the picture and is likely the owner.", "The guy in the wet suit owns the dog. dogs aren't allowed to roam on the beach without their owners.", "The surfer is near the dog."], "image": "train2014/COCO_train2014_000000526458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112022, "question_id": "jPz3qzDyof5hgRrERsMz3d", "question": "Where are half of these people probably going?", "choices": ["skiing", "home", "parade", "mexico"], "correct_choice_idx": 1, "direct_answers": ["home", "trip", "waiting", "asia", "travelling", "asia", "home", "home", "asia", "home"], "difficult_direct_answer": false, "rationales": ["The people are at the baggage claim of an airport, which means they are about to leave to either a hotel or their home.", "They're going home.", "People are travelling with luggage. people go home after travelling."], "image": "val2014/COCO_val2014_000000112022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561514, "question_id": "jPzBdag7nLxyp37a6T39d3", "question": "Under what is the oven located here?", "choices": ["mixer", "center island", "stove top", "sink"], "correct_choice_idx": 2, "direct_answers": ["range hood", "decorative hood", "window", "counter", "arch", "stove top", "window", "counter", "window", "window"], "difficult_direct_answer": false, "rationales": ["The oven is under the stovetop.", "Stovetops are typically connected on to and on top of ovens.", "A range is above the stove."], "image": "val2014/COCO_val2014_000000561514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306716, "question_id": "jQ4E7xDBWTvHf5u2ajazAS", "question": "In what city is this woman boarding the bus?", "choices": ["kyoto", "osaka", "kobe", "tokyo"], "correct_choice_idx": 1, "direct_answers": ["osaka", "osaka", "osaka", "osaka", "osaka", "osaka", "osaka", "osaka", "osaka", "osaka"], "difficult_direct_answer": false, "rationales": ["The bus is a shuttle belonging to the hyatt regency hotel chain located in the city that is also written on the side of the bus.", "The bus says osaka on it.", "The city is osaka."], "image": "val2014/COCO_val2014_000000306716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324322, "question_id": "jQMDkeKftEjzzp6AiWEahj", "question": "How can the room be heated?", "choices": ["candles", "fire", "lanters", "fireplace"], "correct_choice_idx": 3, "direct_answers": ["fireplace", "fireplace", "fireplace", "fireplace", "fireplace", "fireplace", "fireplace", "fireplace", "fireplace", "fireplace"], "difficult_direct_answer": false, "rationales": ["There is a mantle in the room.", "There is a fireplace by the people.", "The room has a fireplace."], "image": "val2014/COCO_val2014_000000324322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396663, "question_id": "jQSpgFN9895SxvAkqtTuJv", "question": "What number is missing from the sequence of the numbers next to the word private?", "choices": ["four", "two hundred", "thirty", "one hundred"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["Counting upwards, the missing number is between 3 and 5.", "123 is always followed by 4 in chronological order.", "An old red bus has the numbers 1235. it is missing a number that goes between 3 and 5."], "image": "train2014/COCO_train2014_000000396663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484108, "question_id": "jQSzSbzs4gNKVs9DNg84zk", "question": "What do the greenish brown things bring to the beach?", "choices": ["salt", "unwanted trash", "minerals", "tiny fish"], "correct_choice_idx": 1, "direct_answers": ["unwanted trash", "kelp", "fish", "stones", "fun", "seaweed", "seaweed", "seaweed", "algae", "birds"], "difficult_direct_answer": false, "rationales": ["The brown things make the beach less attractive.", "They're trash.", "Looks like it's dried out seaweed on the beach."], "image": "train2014/COCO_train2014_000000484108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558678, "question_id": "jQg6TAoS5qdZMMDZVJoLN5", "question": "What sort of skiers are practicing in the foreground?", "choices": ["beginners", "professional", "advanced", "hot dog"], "correct_choice_idx": 0, "direct_answers": ["beginners", "children", "snow season", "kids", "kids", "kids", "kids", "young", "child skiers", "children"], "difficult_direct_answer": false, "rationales": ["The skiers are children and are on flat ground. they're obviously inexperienced.", "Small children are on skis at a ski resort. the area is flat with no big mountains on the immediate area.", "The people that are learning to ski."], "image": "train2014/COCO_train2014_000000558678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316141, "question_id": "jQmBpaCHQNFLRF2jz6Hpsf", "question": "What is the route named after?", "choices": ["color", "flower", "country", "animal"], "correct_choice_idx": 0, "direct_answers": ["color purple", "color", "color", "purple", "purple", "tusayan route", "purple color", "purple", "color", "purple"], "difficult_direct_answer": false, "rationales": ["That is the name of a color", "The route is called purple route. purple is a color.", "The sign on the bus says the route name. the route name is a color."], "image": "train2014/COCO_train2014_000000316141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312182, "question_id": "jQwXT84MQC3MrHmgTaZhCq", "question": "People are doing what?", "choices": ["singing", "marching", "swimming", "protesting"], "correct_choice_idx": 1, "direct_answers": ["protesting", "walking", "charity walk", "walking", "walking", "trekking", "travelling", "marching", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["The people are marching on the street.", "The people are walking.", "This appears to be a peaceful walk and gathering of many people which would best be described as a march."], "image": "train2014/COCO_train2014_000000312182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357770, "question_id": "jR5jFBa8qfKhrb2fSJGorU", "question": "What is he doing with the ball?", "choices": ["throwing", "kicking", "catching", "serving"], "correct_choice_idx": 3, "direct_answers": ["serving", "serving", "hitting it", "serving", "hitting", "hitting", "hitting", "serving", "hitting ball", "serving"], "difficult_direct_answer": false, "rationales": ["This tennis player is prepared to hit a ball currently far above his head in an overhand fashion. such a maneuver would be a serve in tennis.", "He is hitting it to his opponent.", "The ball is above his head and he is jumping to swing at it while standing behind the white line."], "image": "train2014/COCO_train2014_000000357770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460496, "question_id": "jR5nLmCjWEJVot7yKzJAZ6", "question": "What is this device being used for?", "choices": ["calling", "working", "cooling", "playing"], "correct_choice_idx": 3, "direct_answers": ["entertainment", "video gaming", "playing game", "games", "video game", "gaming", "playing", "gaming", "gaming", "playing games"], "difficult_direct_answer": false, "rationales": ["The device is for playing.", "There is a game on the monitor", "The boy is playing a game."], "image": "train2014/COCO_train2014_000000460496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208748, "question_id": "jRCyiwWXe3NTCZmq7cAYdL", "question": "What item is abundant on the table is being ignored?", "choices": ["wine glasses", "forks", "hamburger", "french fries"], "correct_choice_idx": 0, "direct_answers": ["wine glasses", "glasses", "burger", "glasses", "hamburger", "burger", "glasses", "glass", "glasses", "glasses"], "difficult_direct_answer": false, "rationales": ["A table is lined with formal glasses with long stems. wine glasses have long stems.", "The wine glasses are all empty.", "The wine glasses are mostly empty. sometimes wine is served with dessert or after dinner."], "image": "val2014/COCO_val2014_000000208748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61266, "question_id": "jRQnEVRfkHV3ZVqukMfwFd", "question": "What are these people ready to do?", "choices": ["hide", "board", "sleep", "run"], "correct_choice_idx": 1, "direct_answers": ["board", "board", "travel", "travel", "travel", "load up", "train travel", "go home", "travel", "travel"], "difficult_direct_answer": false, "rationales": ["They have their luggage and are staying near the trains", "The have their luggage on them which means they are traveling using the train.", "The people are at a train station and are getting ready to board a train to travel on."], "image": "train2014/COCO_train2014_000000061266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577875, "question_id": "jRj9cBsMf5JbYVnAjDmGam", "question": "What has the man stopped on his scooter?", "choices": ["accident", "traffic", "floods", "phone call"], "correct_choice_idx": 3, "direct_answers": ["phone conversation", "phone call", "cellphone", "talking", "brakes", "talking", "phone call", "tires", "phone call", "talking phone"], "difficult_direct_answer": false, "rationales": ["The man is holding a phone to their ear.", "He is holding something up to his ear, which is what a person does if he is using a communications device.", "The man is holding a phone to his ear as one does when on a call. this would be a difficult activity to do on a scooter as it requires one hand and leaves only one to control the vehicle with which is why he likely pulled over."], "image": "train2014/COCO_train2014_000000577875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131579, "question_id": "jSEtzybDcHCK3uL6SsqEGf", "question": "Which ward is the advertised district in?", "choices": ["chuo", "tennoji", "nishinari", "kita"], "correct_choice_idx": 3, "direct_answers": ["11", "kita", "fifth", "umgle", "osaka", "i do", "umegle", "downtown", "grand front", "omegle"], "difficult_direct_answer": true, "rationales": ["The ward is in kita.", "The area is known as kita based on the bus and street signs.", "The ward is kita."], "image": "train2014/COCO_train2014_000000131579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467285, "question_id": "jSKipCntaPGZK6SFpyVDzT", "question": "After looking at the base where will this player look next?", "choices": ["righward", "leftward", "back", "up"], "correct_choice_idx": 1, "direct_answers": ["pitchers mound", "pitcher", "leftward", "pitcher", "pitcher", "pitcher", "ball", "pitchers mound", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["The batter will look to the left at the pitcher mound.", "The person in question is a batter standing at the plate based on their uniform and positioning. if playing as intended, they would next regard the pitcher and look for the ball after pitched which would be to the left of this batter based on their stance and the baseball field layout.", "The pitcher is out on the field on a mound in front of the umpire."], "image": "val2014/COCO_val2014_000000467285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463306, "question_id": "jSQ2sZ3RecqUf9cSRGiSBC", "question": "In skateboarding terms what is the skateboarder doing with his right hand?", "choices": ["grab", "hold", "linger", "catch"], "correct_choice_idx": 1, "direct_answers": ["indy grab", "hold", "balancing", "grip", "balancing", "balance", "one hand", "hang", "grabbing board", "holding"], "difficult_direct_answer": true, "rationales": ["The boarder is doing a trick.", "The term is to hold.", "He has jumped in the air and his skateboard is still touching both of his shoes under him. doing this keeps his feet on the skateboard."], "image": "train2014/COCO_train2014_000000463306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462080, "question_id": "jSRoRtM39Y5cRYjqFCCoFx", "question": "Why are apples better than oranges?", "choices": ["nicer color", "better looks", "more vitamins", "more fiber"], "correct_choice_idx": 2, "direct_answers": ["vitamin c", "healthier", "fiber", "more vitamins", "vitamin c", "more vitamins", "no peeling", "vitamins", "they aren't", "subjectivity"], "difficult_direct_answer": false, "rationales": ["That's one thing the apple has more of than oranges.", "Apples and oranges are shown together. apples have more vitamins than oranges.", "The answer is subjective, but answer c is an answer that is presented as correct based on an internet search of the question."], "image": "train2014/COCO_train2014_000000462080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389016, "question_id": "jSgCgmt64QA7Qri52uq35i", "question": "What do the white lines on the road mean?", "choices": ["park here", "no crossing", "stop driving", "cross walk"], "correct_choice_idx": 3, "direct_answers": ["jntersection", "crosswalk", "no crossing", "pedestrian crossing", "crosswalk", "lanes", "cross walk", "lanes", "crosswalk", "traffic signs"], "difficult_direct_answer": false, "rationales": ["Lines in the road are used to mark crosswalks.", "Crosswalks use lines to show where to walk.", "They provide a safe area for pedestrians to go to the opposite side of the street."], "image": "train2014/COCO_train2014_000000389016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260627, "question_id": "jSukH29EB7R7Ybv7dmtppc", "question": "What energy propels this train?", "choices": ["electric", "coal", "gas", "oil"], "correct_choice_idx": 0, "direct_answers": ["electricity", "electric", "electric", "electricity", "electric", "electricity", "electric", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["The train is not emitting any fumes or vapours. there are wires above the tracks.", "The train is electric.", "The train does not have an internal combustion engine. the wires above the tracks power the train."], "image": "val2014/COCO_val2014_000000260627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177338, "question_id": "jSw4rmcz7xZ2MmZkp3jLyp", "question": "Where are these men riding?", "choices": ["mountain", "beach", "woods", "street"], "correct_choice_idx": 3, "direct_answers": ["bikes", "atvs", "off-road vehicles", "around", "public square", "atvs", "street", "square", "road", "atv"], "difficult_direct_answer": true, "rationales": ["The area the men are riding on is flat and paved with bricks. there are no hills, trees, sand, or water visible.", "The people are on a cobble-stoned road.", "They're in the street."], "image": "val2014/COCO_val2014_000000177338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450716, "question_id": "jT2hmhMUuzNnFbYKmguPYw", "question": "Why are the animals lowering their heads?", "choices": ["to walk", "for petting", "to comb", "to eat"], "correct_choice_idx": 3, "direct_answers": ["grazing", "eating", "eating hay", "to eat", "to eat", "to eat", "eating", "eating hay", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["They are eating hay.", "The heads of the animals in question are placed over piles of hay which is a known food of answer a and some of the animals appear to be actively grazing.", "There is hay on the ground for to feed the cows."], "image": "train2014/COCO_train2014_000000450716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188143, "question_id": "jT8wSJumQY8gpHQMjRmRJA", "question": "What is the woman ready to do?", "choices": ["catch", "roll", "run", "eat"], "correct_choice_idx": 0, "direct_answers": ["catch frisbee", "catch frisbee", "throw", "catch frisbee", "catch frisbee", "catch frisbee", "catch", "catch frisbee", "catch", "catch"], "difficult_direct_answer": false, "rationales": ["Her hands are outstretched towards the frisbee which is seen going in the direction of her hands. collision is imminent.", "The woman is catching.", "There is a frisbee. it is moving towards the woman."], "image": "train2014/COCO_train2014_000000188143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546515, "question_id": "jT9dUkS5H65VGMNG8pMsH6", "question": "What type area does this train leave?", "choices": ["desert", "suburb", "rural", "urban"], "correct_choice_idx": 3, "direct_answers": ["rail yard", "city station", "urban", "urban", "urban", "city", "industrial area", "station", "train station", "depot"], "difficult_direct_answer": false, "rationales": ["Behind the train is a place that has lots of buildings which is typical of cities.", "It's leaving an urban area.", "The trains appear to be moving away from the background which has many tall buildings consistent with answer a."], "image": "train2014/COCO_train2014_000000546515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459303, "question_id": "jTGEa3LYsCuM386Lw4SMwy", "question": "What are you most at risk of if you touch the things covering the sky here?", "choices": ["electrocution", "wet hands", "bug bite", "heavy fine"], "correct_choice_idx": 0, "direct_answers": ["shock", "electrocution", "shock", "electrocution", "electrocuted", "electrocution", "electrocution", "electrocution falling", "electrocution", "electrocution"], "difficult_direct_answer": false, "rationales": ["You could get hurt.", "The things in question are the size and shape and being held in a manner correlated with wires that transmit electricity. touching these in the incorrect way could result in answer a.", "If you touch the power lines you could get an electric shock."], "image": "val2014/COCO_val2014_000000459303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262531, "question_id": "jTHkJMsZG4mSmhZF7KVMQx", "question": "Why is the man on the skateboard crouching?", "choices": ["stretching", "exercise", "showing off", "speed"], "correct_choice_idx": 3, "direct_answers": ["speed", "balance", "for balance", "jump", "balance", "doing trick", "vertical maneuver", "doing trick", "steering", "for balance"], "difficult_direct_answer": false, "rationales": ["Being lower to the ground gives him more momentum.", "When riding a skateboard this position is needed to maintain balance so you can go faster without falling off.", "Crouching down on a skateboard cuts down on wind resistance which allows the skater to move faster."], "image": "val2014/COCO_val2014_000000262531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338012, "question_id": "jTKjw9PUSNbrqr67aBQMmd", "question": "Why are bikes that are similar sitting together?", "choices": ["random", "bike sale", "rental", "monopoly"], "correct_choice_idx": 2, "direct_answers": ["rental bikes", "rent-able bikes", "parked", "for rent", "rentals", "for rent", "rentals", "rented bikes", "rental", "rental bikes"], "difficult_direct_answer": false, "rationales": ["They're rentals.", "Companies lease bikes for money. companies put their logos on the bikes. these bikes have all the same logos.", "The bikes are available for people to rent."], "image": "train2014/COCO_train2014_000000338012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412697, "question_id": "jTWQ3kBVjD8XVjFyGNB4xY", "question": "Which person's pizza has the most cheese?", "choices": ["man", "middle woman", "right woman", "left woman"], "correct_choice_idx": 3, "direct_answers": ["left", "left woman", "left", "left", "left", "left", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["The pizza on the left has cheese.", "It is the only ingredient on her pizza.", "The woman on the left's pizza only has cheese on it."], "image": "train2014/COCO_train2014_000000412697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99192, "question_id": "jTXq6bjyT6bBaXKW4GWQhX", "question": "What is the woman doing with the yellow device?", "choices": ["drinking", "throwing it", "combing hair", "making call"], "correct_choice_idx": 3, "direct_answers": ["talking", "talking", "calling", "talking", "talking", "talking", "talking", "talking", "calling", "making call"], "difficult_direct_answer": false, "rationales": ["It's her phone and she's calling someone.", "The woman is putting a call through.", "The yellow device is a phone, not a cup or comb. she is holding, not throwing, it."], "image": "train2014/COCO_train2014_000000099192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277384, "question_id": "jTsUXwn8DSzoPjW4UzD9Cq", "question": "What is the number at the front of the train on the left?", "choices": ["690", "203", "210", "952"], "correct_choice_idx": 0, "direct_answers": ["690", "690", "690", "690", "690", "690", "six-nine-zero", "six", "six ninezero", "690"], "difficult_direct_answer": false, "rationales": ["The number is visible in grey lettering on the door of the train.", "The number on the back of the train identifies it.", "The numbers are silver on the yellow door"], "image": "train2014/COCO_train2014_000000277384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109425, "question_id": "jU2pZKWQAt5tmhJr5sWdtp", "question": "What number is the street?", "choices": ["35", "21", "28", "19"], "correct_choice_idx": 2, "direct_answers": ["28", "twenty-eight", "28th", "28", "28", "28", "28th", "28", "28", "28"], "difficult_direct_answer": false, "rationales": ["The street is w 28th st.", "The street is called w 28th st.", "The green street sign can be seen with the number 28 written on it."], "image": "val2014/COCO_val2014_000000109425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130313, "question_id": "jUXtn4L8oYxrjUpgwMAafh", "question": "What personal care item is missing from the dish next to the sink?", "choices": ["hand soap", "shampoo", "mouthwash", "hand lotion"], "correct_choice_idx": 0, "direct_answers": ["soap", "soap", "soap", "soap", "hand soap", "soap", "soap", "soap", "soap", "soap"], "difficult_direct_answer": false, "rationales": ["There is a dish but nothing in it", "There is a dish the size and shape and placed next to the sink as if it were intended to hold answer a which is not currently present or visible.", "There is no hand soap in the little soap dish, nor does there appear to be a dispenser on the wall. this is a bad situation, considering we're all supposed to be washing our hands constantly now."], "image": "train2014/COCO_train2014_000000130313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173315, "question_id": "jUcJbBH5mZvNTKuqKrfu8N", "question": "What place are the zebra in?", "choices": ["farm", "zoo", "park", "wilderness"], "correct_choice_idx": 3, "direct_answers": ["forrest", "africa", "safari", "savanna", "jungle", "africa", "wilderness", "forrest", "nature", "field"], "difficult_direct_answer": false, "rationales": ["They're in the wild.", "There are no buildings, vehicles or people nearby (except the photographer).", "The zebras are outdoors in the wild and are roaming free."], "image": "train2014/COCO_train2014_000000173315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550356, "question_id": "jUyXTE37RBuNyYmhpQp84e", "question": "What is drawn on the bumper?", "choices": ["ear", "nose", "lips", "eye"], "correct_choice_idx": 3, "direct_answers": ["eye", "name", "eye", "eye", "eye", "eye", "eye", "eye", "name", "eye"], "difficult_direct_answer": false, "rationales": ["There is an eye with eyelashes drawn on the bumper.", "There is an eye drawing.", "There is something used for seeing and has eye lashes and a pupil."], "image": "train2014/COCO_train2014_000000550356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163047, "question_id": "jV4dDesMbU4d2xhMWiN7vK", "question": "Which item normally found on a car can be seen drug behind the horse here?", "choices": ["tires", "antennae", "necklace", "fruit"], "correct_choice_idx": 0, "direct_answers": ["tires", "wagon", "fruit", "tires", "carriage", "tires", "wheel", "wheels", "wheel", "tires"], "difficult_direct_answer": false, "rationales": ["Tires are on the buggy.", "Cars by definition have wheels. there is a cart being dragged by the visible horse that also includes wheels.", "They are used on a fruit cart instead."], "image": "train2014/COCO_train2014_000000163047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522135, "question_id": "jVTWNX3cYmm2w9iXLnkSq6", "question": "What kind of tool is on top of the bookshelf to the left of the desk with a monitor on it?", "choices": ["saw", "hammer", "chisel", "screwdriver"], "correct_choice_idx": 1, "direct_answers": ["hammer", "hammer", "hammer", "hammer", "hammer", "hammer", "hammer", "hammer", "hammer", "hammer"], "difficult_direct_answer": false, "rationales": ["A hammer sits on the bookshelf.", "A hammer is shown.", "The tool is a hammer."], "image": "train2014/COCO_train2014_000000522135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360068, "question_id": "jVXj4htxMAmDK5RifRP5MP", "question": "If one adds a wheel to this vehicle how many would it have?", "choices": ["four", "five", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The wheels would be three.", "This is a motorcycle with two wheels already.", "If you have one wheel you have 3 wheels because the motorcycle had already 2 wheels"], "image": "train2014/COCO_train2014_000000360068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433145, "question_id": "jVogCXna98CsRRde5NbRHR", "question": "What did this boy just do?", "choices": ["missed", "nothing", "hit", "quit"], "correct_choice_idx": 0, "direct_answers": ["swung bat", "swing", "hit ball", "hit baseball", "hit ball", "swing", "hit ball", "swing bat", "hit ball", "missed"], "difficult_direct_answer": false, "rationales": ["The ball is still coming toward him instead of moving away", "The boy swung the bat and tried to hit the ball but he missed and the ball kept going.", "The boy missed."], "image": "train2014/COCO_train2014_000000433145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330122, "question_id": "jWMwHRjaB7BBorm9dEvP5X", "question": "How many people below three years of age are there?", "choices": ["three", "two", "four", "five"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "four", "three", "three", "four", "three", "four", "three"], "difficult_direct_answer": false, "rationales": ["You can count four kids on the floor with their moms.", "There are four babies on the ground.", "There are three of them."], "image": "val2014/COCO_val2014_000000330122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261477, "question_id": "jWQUv8oxCKszkutZwPtK99", "question": "What is abnormal about the man showing his back?", "choices": ["wrong position", "age inappropriate", "unsuitable outfit", "poor skill"], "correct_choice_idx": 2, "direct_answers": ["dressed fancy", "attire", "formal dressed", "curved spine", "business dress", "unsuitable outfit", "old", "attire", "clothing choice", "wearing suit"], "difficult_direct_answer": true, "rationales": ["This outfit is not suitable for playing tennis.", "He is dressed in business attire, but on the tennis court holding a racket.", "He's not wearing the right clothes."], "image": "train2014/COCO_train2014_000000261477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11494, "question_id": "jWR8BN4LmubtYZEejEQoox", "question": "Who do most of the men probably like to date?", "choices": ["other men", "aliens", "women", "nobody"], "correct_choice_idx": 0, "direct_answers": ["men", "men", "men", "men", "men", "men", "men", "other men", "men", "men"], "difficult_direct_answer": false, "rationales": ["The men are gay with the rainbow flag.", "There is an lgbt flag on the bus.", "The men like other men."], "image": "val2014/COCO_val2014_000000011494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202723, "question_id": "jWafgZxxqup7FyYZjLE9Gm", "question": "If this truck sold food the same color that the truck is what food would it sell?", "choices": ["blueberry", "watermelon", "peas", "carrot"], "correct_choice_idx": 1, "direct_answers": ["cotton candy", "cotton candy", "watermelon", "pink", "pink", "pink cake", "candy", "cotton candy", "cotton candy", "cotton candy"], "difficult_direct_answer": false, "rationales": ["The fruit is red or pink in the center, just like the truck.", "This fruit is usually the color of the truck when fresh.", "The truck is red and pink. answer a is a food that has shades of these colors in it."], "image": "train2014/COCO_train2014_000000202723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309528, "question_id": "jWdjDNQuFLDYQaRJ8prTkA", "question": "What bus is this?", "choices": ["tourist bus", "public bus", "school bus", "double decker"], "correct_choice_idx": 1, "direct_answers": ["4731", "new bus", "transit bus", "public bus", "passenger bus", "4131", "public transit", "passenger bus", "city", "public bus"], "difficult_direct_answer": false, "rationales": ["The bus has only one level. the passengers are regular people who do not necessarily belong to a specific group or demographic.", "There is a varied age range for the people getting of the vehicle and the color scheme of it implies it's owned by the city.", "A yellow and white bus is pulled up at a bus stop on the sidewalk."], "image": "val2014/COCO_val2014_000000309528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117016, "question_id": "jX3a2BKKRQK6hTySDPgVFN", "question": "What will the spoon/fork combination utensil be used for?", "choices": ["serve cake", "serve salad", "serve pasta", "serve meat"], "correct_choice_idx": 1, "direct_answers": ["eating", "salad", "serve salad", "serving", "salads", "grabbing salad", "eating dinner", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The fork and spoon are usually used to toss salad and then serve it.", "The salad already has serving utensils so this will be used for the meat", "They are used to serve salad."], "image": "train2014/COCO_train2014_000000117016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249620, "question_id": "jXGaqN9zfmBPf2hVSj8L3z", "question": "What is the most vulnerable in the picture?", "choices": ["adult zebra", "baby zebra", "bird", "grass"], "correct_choice_idx": 1, "direct_answers": ["baby", "baby zebra", "baby zebra", "baby zebra", "baby zebra", "baby", "young zebra", "baby zebra", "baby", "baby zebra"], "difficult_direct_answer": false, "rationales": ["The smallest and youngest animal is protected by the older members of the herd.", "Predators go after weak, injured animals, or the younger, smaller ones.", "The baby zebra can't run fast."], "image": "train2014/COCO_train2014_000000249620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454940, "question_id": "jXMuFfDgD43BfSTkNMkpxv", "question": "What is this person doing?", "choices": ["movie watching", "escaping", "racing", "performing music"], "correct_choice_idx": 2, "direct_answers": ["riding motorcycle", "racing", "riding motorcyle", "racing", "riding motorcycle", "racing motorcycle", "riding racing", "riding", "riding bike", "riding motorcycle"], "difficult_direct_answer": false, "rationales": ["The person is racing the motor bike.", "The person is riding a motorcycle. the writing on its front indicates what the person is doing with the bike.", "There are several pit crew members behind a man riding a motorcycle. the bike has a number on front with advertisements ."], "image": "train2014/COCO_train2014_000000454940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327258, "question_id": "jXZoMkDBTSEv2sBw8uvV3P", "question": "What are the men armed?", "choices": ["protection", "pictures", "competition", "fashion"], "correct_choice_idx": 0, "direct_answers": ["guns", "gun", "guns", "mounted police", "gun", "pistols", "police", "police", "cops", "protection"], "difficult_direct_answer": false, "rationales": ["The men are cops and have funs to protect and serve.", "They are police.", "They are police officers on patrol"], "image": "train2014/COCO_train2014_000000327258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532552, "question_id": "jXcMABT4KatN3347aowZGM", "question": "What person does the batter watch here?", "choices": ["catcher", "third baseman", "pitcher", "umpire"], "correct_choice_idx": 2, "direct_answers": ["pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["A hitter has to concentrate on the player who is challenging him.", "A batter stands at home plate with a bat over his shoulder as he looks forward towards the center of the diamond.", "The batter is looking at the pitcher because he is waiting for the ball to be thrown at him"], "image": "val2014/COCO_val2014_000000532552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355481, "question_id": "jXdbEfj5YP45ezKVt3eVxP", "question": "An American motorcycle rally held annually in which place?", "choices": ["rapid city", "sturgis", "pierre", "deadwood"], "correct_choice_idx": 1, "direct_answers": ["sturgis", "park", "sturgeon", "country", "sturgis", "sturgis", "field", "sturgis", "all traffic", "sturgis"], "difficult_direct_answer": false, "rationales": ["Sturgis holds the rally.", "The bike is a sturgis.", "Sturgis is the hometown of a major motorcycle rally."], "image": "train2014/COCO_train2014_000000355481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10701, "question_id": "jYFQyNs869kGGyJYJVTgYh", "question": "Where is 39 headed?", "choices": ["wendy's", "outfield", "third base", "home base"], "correct_choice_idx": 3, "direct_answers": ["home plate", "home", "home plate", "right", "home base", "home plate", "home plate", "home plate", "next base", "home plate"], "difficult_direct_answer": false, "rationales": ["Given the position of #39 and the layout of a baseball diamond, he is headed for the final plate.", "The person with the ball in the middle of the field on the brown dirt throws the ball towards the direction of the batter and the catcher (with the mitt). those two people are located at the home base area.", "He is about to score a run."], "image": "train2014/COCO_train2014_000000010701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573349, "question_id": "jYH8ies2EoPeL7qjmoUstG", "question": "What event is being filmed here?", "choices": ["protest", "cake walk", "horse show", "parade"], "correct_choice_idx": 3, "direct_answers": ["parade", "parade", "walk", "parade", "parade", "parade", "parade", "parade", "walk", "golf event"], "difficult_direct_answer": false, "rationales": ["This is a walk or march in public in a formal procession or in an ostentatious or attention-seeking way. there is an abundant amount of attention from people on both sides.", "The event is a parade.", "There are people waling down the middle of the street with crowds watching them on both sides."], "image": "val2014/COCO_val2014_000000573349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565878, "question_id": "jYZWtZ8WsPpqRNTPuNCsAU", "question": "Why does he have the huge sandwich?", "choices": ["is sharing", "overloaded it", "is hungry", "not his"], "correct_choice_idx": 2, "direct_answers": ["lunch", "he's hungry", "lunch", "eating out", "to eat", "is hungry", "very hungry", "hungry", "extremely hungry", "hungry"], "difficult_direct_answer": false, "rationales": ["The man is looking at it with an interested expression, and food is usually consumed for a specific reason.", "For him to have this much food he must be hungry.", "A man in glasses is sitting and holding a coney hot dog. it is huge because he hasn't eaten."], "image": "train2014/COCO_train2014_000000565878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143749, "question_id": "jYi2N5zwoxScZ3xQG67JME", "question": "What is required to park here?", "choices": ["nothing", "receipts", "coins", "dollar bills"], "correct_choice_idx": 2, "direct_answers": ["quarter", "quarters", "cars", "coins", "cars", "coins", "money", "coins", "money", "car"], "difficult_direct_answer": false, "rationales": ["You have to pay the parking meters to park there.", "This is a parking meter so you have to pay", "Coins are needed."], "image": "train2014/COCO_train2014_000000143749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294029, "question_id": "jZ8Do8qzHkVhCakTJBoNZG", "question": "What is the man doing to the fridge?", "choices": ["painting", "sanding", "washing", "repairing"], "correct_choice_idx": 2, "direct_answers": ["cleaning", "cleaning", "cleaning", "cleaning", "cleaning", "cleaning", "cleaning", "clean", "clean", "washing"], "difficult_direct_answer": false, "rationales": ["The man is holding a hose and is washing the fridge.", "He's washing.", "The man is washing the fridge."], "image": "train2014/COCO_train2014_000000294029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183401, "question_id": "ja456ai7JTZtrSNWDnoLqf", "question": "What are the green objects in the background used for?", "choices": ["painting", "practicing", "sleeping", "sitting"], "correct_choice_idx": 3, "direct_answers": ["bleachers", "sitting", "protection", "safty", "spectator seating", "seating", "fans", "seating", "seats", "seating observers"], "difficult_direct_answer": true, "rationales": ["There are bleachers.", "The stands are for people who come to see the match.", "They are seats for the audience."], "image": "train2014/COCO_train2014_000000183401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534849, "question_id": "jaLU66HmsaonfDvoiCHoyr", "question": "In which state is this bus being towed?", "choices": ["texas", "new mexico", "kansas", "mass"], "correct_choice_idx": 0, "direct_answers": ["texas", "austin", "texas", "austin", "texas", "austin", "texas", "austin", "austin", "texas"], "difficult_direct_answer": false, "rationales": ["The state is texas.", "Neal kocurek memorial is in austin.", "In the background of the image there is a sign for the austin convention center. austin is a city in texas."], "image": "train2014/COCO_train2014_000000534849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83996, "question_id": "jaWMpbXB9a224CDuPKHfMK", "question": "Based on the size of the wave what skill level is the surfer?", "choices": ["beginner", "amateur", "professional", "advanced"], "correct_choice_idx": 1, "direct_answers": ["novice", "intermediate", "high", "novice", "beginner", "high", "amateur", "five", "beginner", "intermediate"], "difficult_direct_answer": false, "rationales": ["The waves are smaller and easier to learn on.", "They're amateurs.", "The wave is small."], "image": "train2014/COCO_train2014_000000083996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451482, "question_id": "jabhKpXjMy6aSVgUckjHUV", "question": "What could be a hobby of the owner of the books?", "choices": ["knitting", "embroidery", "crocheting", "mosaics"], "correct_choice_idx": 1, "direct_answers": ["nerd", "embroidery", "knitter", "needlework", "embroidery", "embroidery", "sewing", "embroidery", "stitching", "embroidery"], "difficult_direct_answer": false, "rationales": ["Based on the title of one of the books they are reading, they enjoy using a needle and thread or yarn to decorate fabric.", "Since the middle book mentions a common hobby, it is safe to assume the reader would have an interest in such.", "The hobby is embroidery."], "image": "train2014/COCO_train2014_000000451482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180289, "question_id": "jaoxnUMio7YR4ibi6BjdtB", "question": "What video game system is the man using?", "choices": ["atari", "nintendo wii", "xbox one", "playstation 5"], "correct_choice_idx": 1, "direct_answers": ["nintendo wii", "nintendo wii", "wii", "nintendo wii", "wii", "nintendo wii", "wii", "wii", "nintendo wii", "wii"], "difficult_direct_answer": false, "rationales": ["The controllers are white and wireless", "The video gaming sight is shown in the picture.", "The game is the wii."], "image": "val2014/COCO_val2014_000000180289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369762, "question_id": "jarU3ynxYgEhRzJ4Rkj2Sb", "question": "If the woman in white wants to go forward when is it safe for her to cross the road or path she is headed toward?", "choices": ["1 minute", "never", "now", "2 hours"], "correct_choice_idx": 2, "direct_answers": ["after train", "now", "no", "no idea", "now", "now", "now", "now", "after train", "now"], "difficult_direct_answer": false, "rationales": ["The train will take less than a minute to pass.", "There is a pedestrian walking sign lit on the traffic signals.", "She can cross now why it's sitting there."], "image": "train2014/COCO_train2014_000000369762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532552, "question_id": "jb3LgN8YRnNsQaoLiZmePx", "question": "What protects observers from being hit with the ball?", "choices": ["mesh", "umpire", "batter", "catcher"], "correct_choice_idx": 0, "direct_answers": ["helmet", "helmet", "net", "helmet", "net", "net", "net", "mesh", "net", "net"], "difficult_direct_answer": false, "rationales": ["If the ball hits the mesh it will bounce back into the field, protecting the spectators.", "A mesh cloth covers the fence.", "The netting is there to make sure the people watching do not get hit."], "image": "val2014/COCO_val2014_000000532552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417983, "question_id": "jbDubGhDSeDWMSgrLsFfue", "question": "What is she looking at?", "choices": ["grass", "trees", "flowers", "elephants"], "correct_choice_idx": 3, "direct_answers": ["elephants", "elephant", "elephants", "elephants", "elephants", "elephants", "elephants", "elephants", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The large grey animals are the most interesting things in this area.", "She's observing elephants.", "The most interested thing in the area which the lady is facing are the gray tusked animals with a trunk."], "image": "train2014/COCO_train2014_000000417983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197199, "question_id": "jbNB8jjohFiwxDe9wEU9PX", "question": "How are the cows contained within this field?", "choices": ["electric fence", "rail fence", "wood fence", "wire fence"], "correct_choice_idx": 3, "direct_answers": ["fence", "fencing", "fence", "eight", "fence", "fence", "fence", "fence", "fencing", "wire fence"], "difficult_direct_answer": false, "rationales": ["The cows are kept by the wire fence.", "The cows are fenced.", "The cows are fenced in with wire."], "image": "val2014/COCO_val2014_000000197199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316937, "question_id": "jc3tHocQvjaB9c24GZDb2K", "question": "What type of activity are these lines for?", "choices": ["laundry", "running", "dancing", "painting"], "correct_choice_idx": 0, "direct_answers": ["drying clothes", "drying", "laundry", "drying", "drying", "hanging clothes", "drying", "drying", "drying", "hanging things"], "difficult_direct_answer": false, "rationales": ["A clothes line with clothes pins are used to dry washed clothes.", "The teddy bears are hanging from lines that are normally used for drying laundry.", "There are towels and sheets hanging from clips on it."], "image": "train2014/COCO_train2014_000000316937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375892, "question_id": "jcTqTEoZv5nZWWmf8D5aSC", "question": "What is beneath the number 714?", "choices": ["slug", "tire", "grass", "paw"], "correct_choice_idx": 1, "direct_answers": ["light", "headlight", "orange light", "reflector light", "light", "letters", "wheel", "headlight", "reflector", "tire"], "difficult_direct_answer": false, "rationales": ["These are police motorcycles. the number 714 is on a fender.", "There are several police motorcycles in a line. underneath the front if made of rubber and round that help the bike go.", "The vehicle tire shows up under this number."], "image": "val2014/COCO_val2014_000000375892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17482, "question_id": "jcWnJjXBSrpgJDLFP4bCGj", "question": "This boy swings from the same side of the plate as what baseball player?", "choices": ["manny ramirez", "cody bellinger", "freddie freeman", "john olerud"], "correct_choice_idx": 0, "direct_answers": ["manny ramirez", "baseball", "left handed", "babe ruth", "willie horton", "yes", "arrow judge", "lou gerri", "batter", "jackie robinson"], "difficult_direct_answer": true, "rationales": ["The boy and manny ramirez stand on the same side of the plate.", "The player is like a pro player.", "The boy is like manny."], "image": "val2014/COCO_val2014_000000017482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297872, "question_id": "jciHFgoULM39DguWCvHdv3", "question": "What animal is shown on the banner?", "choices": ["horse", "bull", "donkey", "llama"], "correct_choice_idx": 1, "direct_answers": ["bull", "bull", "bull", "bull", "bull", "bull", "bull", "bull", "bull", "bull"], "difficult_direct_answer": false, "rationales": ["Obviously a bull with horns showing.", "The word after chicago on the banner indicates the animal's type. it is a cow that has horns.", "A bull looks like a cow but it has two horns."], "image": "train2014/COCO_train2014_000000297872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166692, "question_id": "jczsbK2uXU6kWugSWgatty", "question": "How are the different levels of this type of cake called?", "choices": ["mini cakes", "steps", "platforms", "tiers"], "correct_choice_idx": 3, "direct_answers": ["tiers", "layers", "tiers", "tiers", "tiers", "tiers", "tiers", "tiers", "layers", "layers"], "difficult_direct_answer": false, "rationales": ["The layers of this wedding cake are called tiers.", "It is a wedding cake", "That is how the cake layers are made."], "image": "train2014/COCO_train2014_000000166692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161185, "question_id": "jczxvnhLYJ2BBM8CerHErK", "question": "What type of transportation is this?", "choices": ["air", "rail", "water", "road"], "correct_choice_idx": 1, "direct_answers": ["train", "train", "rail", "train", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["There are tracks on the ground and a train in the background.", "There is a train.", "The transport is by rail."], "image": "val2014/COCO_val2014_000000161185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393464, "question_id": "jdFrZMZrN24Gd25gZE7K3z", "question": "What are children pictured above doing?", "choices": ["playing", "jogging", "eating", "walking"], "correct_choice_idx": 0, "direct_answers": ["playing", "playing", "playing", "running", "running", "running", "playing soccer", "playing", "lookingat camera", "playing"], "difficult_direct_answer": false, "rationales": ["The children are chasing after a ball so they can kick it.", "They are smiling and running the same direction", "They are running and appear to be happy, all playing an outdoor sport together."], "image": "train2014/COCO_train2014_000000393464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355756, "question_id": "jdHZ2keAbiX5K5XgHBPXNT", "question": "If you needed to get cash now on this street corner what would you use to do that?", "choices": ["atm", "check", "credit card", "bank"], "correct_choice_idx": 0, "direct_answers": ["atm", "atm", "atm", "atm", "atm", "debit card", "atm", "atm", "atm", "debit card"], "difficult_direct_answer": false, "rationales": ["There is a sign that stands for atm which dispenses cash.", "An advertisement for a banking machine is on the side of a building.", "An automated teller machine will give cash from a bank account."], "image": "val2014/COCO_val2014_000000355756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535265, "question_id": "jdMtA9FizN3ycfqpe3mLVR", "question": "Who put the hat on the dog?", "choices": ["woman", "cat", "dog", "man behind"], "correct_choice_idx": 3, "direct_answers": ["man", "owner", "owner", "man", "man", "man", "photographer", "human", "man", "man behind"], "difficult_direct_answer": false, "rationales": ["A man sits behind a dog who has a cowboy hat on. the man is smiling.", "There is a man behind the dog that is probably the owner that put the hat on him.", "The man put it on."], "image": "train2014/COCO_train2014_000000535265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419391, "question_id": "jdRwen5sfYDndDkjDTk4DZ", "question": "What does the woman sitting on the bench do?", "choices": ["waits", "protests", "exercises", "sells things"], "correct_choice_idx": 0, "direct_answers": ["wait", "wait", "worker", "sit", "wait", "wait", "waits", "wait", "sales", "wait"], "difficult_direct_answer": false, "rationales": ["She's waiting.", "The woman isn't doing anything productive here.", "The font on the bottom of the image describes the woman's actions."], "image": "train2014/COCO_train2014_000000419391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296303, "question_id": "jdV53P6UhHPvJkewqByXeE", "question": "What kind of animal do you see in the picture?", "choices": ["insect", "fish", "mammal", "reptile"], "correct_choice_idx": 0, "direct_answers": ["insect", "grasshopper", "cricket", "grasshopper", "bee", "giant insect", "grasshopper", "fly", "insect", "beetle"], "difficult_direct_answer": false, "rationales": ["The animal in the cage is a cricket which is a kind of insect.", "The exoskeleton is that of an bug and the type of wings further verify that the animal is an insect.", "There is a bug in the cage."], "image": "val2014/COCO_val2014_000000296303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126104, "question_id": "jdsLSJGZFLa5py7LbV5isu", "question": "What does the glove on the person's hand provide?", "choices": ["mobility", "stickiness", "protection", "warmth"], "correct_choice_idx": 2, "direct_answers": ["protection", "protection", "protection", "protection", "protection", "protection", "safety", "protection", "support", "protection"], "difficult_direct_answer": false, "rationales": ["This activity exposes the body to many hazards.", "The person on the skateboard is wearing a glove to protect their wrist in case they fall.", "A man is bent over to the side as he uses his hand to skim along asphalt. he wears a glove to keep from skinning it up."], "image": "train2014/COCO_train2014_000000126104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178207, "question_id": "jeBRSdcaw2Kb6AFgkGGY4r", "question": "Which hand caused the ball to go aloft here?", "choices": ["server's right", "opposite player", "none", "left"], "correct_choice_idx": 0, "direct_answers": ["right", "left", "right", "left", "right", "server's right", "left", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["A tennis player is holding a racket in one hand. he uses this hand to hit the ball and the other to throw.", "The tennis player used his right hand to throw the ball up for the serve.", "The tennis player threw the ball up with his right hand."], "image": "train2014/COCO_train2014_000000178207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308892, "question_id": "jeBo6GyRwCByCbtn9zW8JN", "question": "What design is the bike seat?", "choices": ["leopard print", "stripes", "zebra print", "plaid"], "correct_choice_idx": 0, "direct_answers": ["cheetah print", "leopard", "leopard", "cheetah print", "leopard print", "leopard", "leopard", "leopard", "leopard", "triangular"], "difficult_direct_answer": false, "rationales": ["Leopard fur is solid golden with small black spots.", "There is animal print on the bike.", "A bike is parked and has a cover on the seat that is spotted similar to a big cat."], "image": "train2014/COCO_train2014_000000308892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370493, "question_id": "jeTdSQLiZ2yprCKTpnRGzm", "question": "What are these women doing?", "choices": ["eating", "dancing", "talking", "running"], "correct_choice_idx": 2, "direct_answers": ["talking stories", "talking", "talking", "talking", "talking", "talking", "writing", "talking", "taking notes", "talking"], "difficult_direct_answer": false, "rationales": ["The woman are sitting on a bench looking at each other and one of then is holding a pen and paper.", "They woman is speaking and her mouth is open.", "They don't have food and they're not physically doing c or d."], "image": "train2014/COCO_train2014_000000370493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516174, "question_id": "jeXFmrxVzY6cftdZEz2W6r", "question": "What is the name of the blue piece on the end of the plane?", "choices": ["slats", "wing", "spoiler", "vertical stabilizer"], "correct_choice_idx": 3, "direct_answers": ["tail", "tail fin", "tail", "tail", "tail", "vertical stabilizer", "tail", "tail", "rudder", "tail"], "difficult_direct_answer": false, "rationales": ["The name is a stabilizer.", "It is responsible to keep the plane balanced.", "The blue fin at the end of the plane is a vertical stabilizer."], "image": "val2014/COCO_val2014_000000516174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555015, "question_id": "jeqy3xbsHicYhzYr84Nnga", "question": "What person has the same first initials as the initials on the card?", "choices": ["h.g. wells", "w.c. fields", "b.j. novak", "j.k. rowling"], "correct_choice_idx": 1, "direct_answers": ["water closet", "wc fields", "winston churchill", "w.c. fields", "william christensen", "woody allen", "winston churchill", "whitney houston", "wc", "wc fields"], "difficult_direct_answer": false, "rationales": ["We fields has the same initials.", "The sign has the letters w and c.", "William christensen has these initials."], "image": "train2014/COCO_train2014_000000555015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390137, "question_id": "jf66CNPrAT4w37aYKknUJv", "question": "Who will ride in the blue seat?", "choices": ["adult", "child", "pet", "doll"], "correct_choice_idx": 1, "direct_answers": ["child", "child", "baby", "child", "baby", "toddler", "baby", "baby", "child", "baby"], "difficult_direct_answer": false, "rationales": ["The child goes in the small seat.", "The blue seat is a seat for kids.", "A man is sitting and posing on a bike. another bike behind him has a blue small chair for a small person to ride on."], "image": "val2014/COCO_val2014_000000390137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399049, "question_id": "jfRcELeX8GsPhTTpnw7TTq", "question": "What sport would this be if gates were added?", "choices": ["slalom", "ski jump", "moguls", "downhill"], "correct_choice_idx": 2, "direct_answers": ["downhill skiing", "so racing", "alpine", "slalom", "moguls", "snowboarding", "slalom", "skiing", "downhill skiing", "slalom"], "difficult_direct_answer": false, "rationales": ["Moguls are used for skiing and someone is skiing on a snowy mountain.", "This sport would be moguls if there are gates added on the ski ramp.", "Gates define the path in this sport."], "image": "val2014/COCO_val2014_000000399049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255164, "question_id": "jfuQBmy7Y8wnNd9YA9RXsx", "question": "If she stopped her bike suddenly what is most likely to happen?", "choices": ["nothing else", "water spills", "birds scared", "bus stops"], "correct_choice_idx": 3, "direct_answers": ["collision", "bus accident", "bus hits", "bus accident", "bus stops", "ran over", "fall off", "crash", "bus hits", "hit bus"], "difficult_direct_answer": false, "rationales": ["She would otherwise be hit by the bus.", "The driver of the bus would not want to injure the bike rider so they would stop.", "The woman cyclist is in front of the bus and if she were to halt all of a sudden it would cause the bus to do the same."], "image": "train2014/COCO_train2014_000000255164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23019, "question_id": "jg3UQMSUHxSdnPF9av6tkE", "question": "What time of day is this?", "choices": ["early morning", "5 pm", "noon", "9 am"], "correct_choice_idx": 0, "direct_answers": ["evening", "sunset", "dusk", "early morning", "dusk", "night", "dusk", "evening", "early evening", "dusk"], "difficult_direct_answer": false, "rationales": ["The sun is very low in the sky and there are no cars on the street indicating people are not yet out and about.", "There are no cars out.", "The color of the sky indicates that it is most likely the twilight of the day, just before sunrise."], "image": "val2014/COCO_val2014_000000023019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458397, "question_id": "jgCdTdGfsaZQMCb66UWwHp", "question": "The woman is putting what piece of her safety riding gear on the pony?", "choices": ["harness", "glasses", "hat", "whip"], "correct_choice_idx": 0, "direct_answers": ["reins", "harness", "harness", "bridle", "halter", "harness", "bridle", "hat", "harness", "muzzle"], "difficult_direct_answer": false, "rationales": ["The woman is putting a harness on the mouth of the horse.", "The woman is putting something over the pony's head to pull it.", "The harness is there for safety."], "image": "train2014/COCO_train2014_000000458397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20251, "question_id": "jgQJjvQ29Q9b86JTDafXGk", "question": "What can be used to transport multiple luggage bags?", "choices": ["servants", "people", "carousel", "luggage cart"], "correct_choice_idx": 3, "direct_answers": ["luggage cart", "luggage belt", "luggage carousel", "wheel", "luggage cart", "luggage cart", "cart", "cart", "cart", "cart"], "difficult_direct_answer": false, "rationales": ["A cart can be seen by baggage claim that can hold multiple pieces of luggage at once.", "It's the metal item with wheels to the left of the luggage area.", "People stand around a luggage belt in an airport. carts are used to move luggage."], "image": "train2014/COCO_train2014_000000020251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167083, "question_id": "jgRTFTvpMJwgz4WokmTF2H", "question": "What fast food place has a similar name to the name on the building?", "choices": ["checkers", "nathan's", "mcdonald's", "chipotle"], "correct_choice_idx": 0, "direct_answers": ["checkers", "checkers", "checkers", "checkers", "checkers", "cheques", "cheques", "checkers", "checkers", "checkers"], "difficult_direct_answer": false, "rationales": ["The place is checkers.", "A building has a sign with the word \"chequers\" on it. an american fast food restaurant is called \"checkers\".", "Checkers is a burger joint."], "image": "train2014/COCO_train2014_000000167083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125322, "question_id": "jgVKdxqx5uoY4wHeXDqD4N", "question": "The man is taking a picture of something on which side of his body?", "choices": ["your right", "his right", "your left", "his left"], "correct_choice_idx": 3, "direct_answers": ["right", "front", "right", "front", "his left", "right", "right", "right", "right", "left"], "difficult_direct_answer": false, "rationales": ["The man is holding his phone to take a picture of something on the left side of his body", "He is taking a pic on his right side.", "The man has oriented his hands and body as if he is looking at something to his right."], "image": "val2014/COCO_val2014_000000125322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378415, "question_id": "jgXmLTt4STkJ3wXRPJjWq2", "question": "Where do bananas originally come from?", "choices": ["americas", "asia", "india", "france"], "correct_choice_idx": 1, "direct_answers": ["southeast asia", "asia", "southeast asia", "thailand", "southeast asia", "tree", "trees", "southeast asia", "southeast asia", "southeast asia"], "difficult_direct_answer": false, "rationales": ["They are from the southeastern part of the continent, which is tropical.", "This fruit is very appreciative of the tropical regions.", "The bananas are from asia."], "image": "train2014/COCO_train2014_000000378415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337188, "question_id": "jgY9yW2ZHsb6HveSVGs7QQ", "question": "What type of woman might this animal be associated with historically?", "choices": ["wetnurse", "midwife", "witch", "nurse"], "correct_choice_idx": 2, "direct_answers": ["witch", "witch", "witch", "witch", "witch", "young", "witch", "cat lady", "cat lady", "witch"], "difficult_direct_answer": false, "rationales": ["This animal is a black cat. black cats are not associated with medical professionals.", "The animal is a black cat. cats are not associated with midwives, nurses, or wetnurses.", "A black cat has always been associated with witches in tales."], "image": "val2014/COCO_val2014_000000337188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440037, "question_id": "jghjjSiKPTnG5wkvjoTjML", "question": "How is this craft propelled along the water?", "choices": ["motor", "foot paddles", "paddle", "wind"], "correct_choice_idx": 0, "direct_answers": ["motor", "motor", "motor", "motor", "motor", "motor", "paddles", "engine", "motor", "motor"], "difficult_direct_answer": false, "rationales": ["A boat is in the water with a wake behind it. boat motors cause wakes in the water.", "There is one with the rudder in the water", "A motor engine keeps the whole raft afloat."], "image": "val2014/COCO_val2014_000000440037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81299, "question_id": "jgifpUGgugXybeD5bSiPQP", "question": "How many plates are on the table?", "choices": ["one", "two", "seven", "four"], "correct_choice_idx": 2, "direct_answers": ["seven", "seven", "seven", "seven", "seven", "seven", "seven", "seven", "seven", "seven"], "difficult_direct_answer": false, "rationales": ["The plate in the center is surrounded by six other plates.", "One plate is in between six others.", "There are seven plates."], "image": "val2014/COCO_val2014_000000081299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80363, "question_id": "jh2xrjLrMEuKPBaABDnEBx", "question": "Why is this place good for the animals?", "choices": ["trees", "water source", "grassy ground", "being spacious"], "correct_choice_idx": 2, "direct_answers": ["no pain", "grass", "lush grazing", "eating grass", "grass", "grassy ground", "grazing", "grass", "available grass", "to graze"], "difficult_direct_answer": false, "rationales": ["The place has grass.", "The animals have room to graze.", "There is plentiful material for them to graze on."], "image": "train2014/COCO_train2014_000000080363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230898, "question_id": "jh5TsD4GsdLJ5jAfArP4Y5", "question": "What is he doing hanging from the board?", "choices": ["jumping", "falling", "balancing", "bouncing"], "correct_choice_idx": 2, "direct_answers": ["perform trick", "riding", "grinding", "grinding", "grinding", "grind", "board-slide", "railing", "balancing", "grinding"], "difficult_direct_answer": false, "rationales": ["A person is on a skateboard performing a jump. people have to balance on skateboards to avoid falling.", "He is keeping his balance while doing a stunt.", "His skate trick requires balance, which is also why he is holding his arms in that position."], "image": "train2014/COCO_train2014_000000230898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328464, "question_id": "jh5XtoetqbAthjatmvQUcT", "question": "In which manner does this vehicle move?", "choices": ["flying", "drilling", "rolling", "sliding"], "correct_choice_idx": 3, "direct_answers": ["sail", "wind", "skate", "wind sail", "wind", "wind dependent", "wind power", "gliding", "on ice", "sliding"], "difficult_direct_answer": true, "rationales": ["A boat is shown in the water. boats slide across the water as they move.", "The vehicle is using the wind to gently glide the boat across the surface of the water.", "The boat glides across the water."], "image": "val2014/COCO_val2014_000000328464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540860, "question_id": "jhHU42uQQCaVd4FTLNFHe6", "question": "Why is the car on the sidewalk?", "choices": ["parking zone", "accident", "broke down", "display"], "correct_choice_idx": 3, "direct_answers": ["display", "show", "museum", "display", "on display", "display", "replica decoration", "on display", "on display", "model car"], "difficult_direct_answer": false, "rationales": ["This vehicle is not functional and there is a plaque explaining itself so it would be considered a decoration.", "It has a plaque for people to read information about it", "It's on display."], "image": "val2014/COCO_val2014_000000540860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234679, "question_id": "jhPHGUZT3cLDnQ5qndG6hY", "question": "What is she doing with the oven?", "choices": ["moving it", "removing food", "opening door", "closing door"], "correct_choice_idx": 1, "direct_answers": ["checking food", "baking", "baking", "removing food", "baking", "cooking", "removing tray", "removing dish", "removing tray", "baking"], "difficult_direct_answer": false, "rationales": ["She is removing food", "The woman has the mitt on so she doesn't burn herself when she takes the food out.", "She is taking the food out of it."], "image": "val2014/COCO_val2014_000000234679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296093, "question_id": "jhQiKg5bKxscy2QoG9Qqrc", "question": "What is the man to the right doing?", "choices": ["snoring", "eating", "gritting teeth", "jumping jacks"], "correct_choice_idx": 2, "direct_answers": ["walking", "grinning", "grimacing", "disapproving expression", "playing frisbee", "smiling", "grimacing", "gritting teeth", "walking", "grimacing"], "difficult_direct_answer": false, "rationales": ["His facial features seem to indicate this option. the other options don't make sense in this scene.", "He has his lips pulled back and showing his teeth", "The man to the right wants to just grind his teeth since he has such a creepy smile on."], "image": "train2014/COCO_train2014_000000296093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217562, "question_id": "jhQmejRQHxuPGJWHkDmv9b", "question": "What fuel source powers the heat in this room?", "choices": ["logs", "natural gas", "steam", "coal"], "correct_choice_idx": 1, "direct_answers": ["natural gas", "gas", "electricity", "fire place", "fireplace", "fireplace electric", "wood", "fire place", "natural gas", "fireplace"], "difficult_direct_answer": false, "rationales": ["There is a fake fireplace that can't have real fires", "The source is gas.", "There is a gas fireplace."], "image": "val2014/COCO_val2014_000000217562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11328, "question_id": "jhn2GcVGycdcmQ7MKFwAxB", "question": "What is probably in front of them?", "choices": ["video game", "radio", "laptop", "computer"], "correct_choice_idx": 0, "direct_answers": ["television", "television", "video game", "television", "wii screen", "tv", "wii", "television", "television", "wii"], "difficult_direct_answer": false, "rationales": ["The girls are playing with consoles.", "The women are moving in side ways with sticks.", "The video game screen."], "image": "train2014/COCO_train2014_000000011328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14723, "question_id": "jhyheMLdNBtuf5psnVo7YE", "question": "What use would sitting in the seats have?", "choices": ["up/downhill transport", "avoiding sun", "avoiding rain", "dining"], "correct_choice_idx": 0, "direct_answers": ["transportation", "transportation", "transportation", "up/downhill transport", "travelling", "going up", "top mountain", "ski drop", "carrying", "rideth top"], "difficult_direct_answer": false, "rationales": ["The seats can go up and down the mountain.", "Ski lifts are run along cables above mountains.", "The ski lift brings people back to the mountaintop so they can ski down it again."], "image": "val2014/COCO_val2014_000000014723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114140, "question_id": "ji2oSNQPja7oW9pDGFEZ6h", "question": "What are the small lights called?", "choices": ["night lights", "christmas lights", "holiday lights", "festival lights"], "correct_choice_idx": 1, "direct_answers": ["street lights", "neon", "christmas lights", "christmas lights", "twinkle lights", "christmas lights", "string lights", "christmas lights", "christmas lights", "christmas lights"], "difficult_direct_answer": false, "rationales": ["The lights are red and white.", "There are multi color lights strewn on the edge of a building. they are seasonal colors and only used for holiday season.", "Colorful small lights are popular at christmas time."], "image": "train2014/COCO_train2014_000000114140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8019, "question_id": "jiAqM2hjFump9MgjjasFUz", "question": "What does the person with the toy hope for here?", "choices": ["wind", "dog", "snow", "rain"], "correct_choice_idx": 0, "direct_answers": ["fly", "wind", "wind", "flight", "wind", "wind", "wind", "wind", "fly kite", "wind"], "difficult_direct_answer": false, "rationales": ["Wind is what is needed to make kites fly.", "That what is needed to fly a kite.", "This is a kite"], "image": "train2014/COCO_train2014_000000008019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535692, "question_id": "jiCZQJN5BMSZutTPbWGZmH", "question": "Where does surfing come from?", "choices": ["mexico", "hawaii", "polynesia", "argentina"], "correct_choice_idx": 2, "direct_answers": ["hawaii", "tropical areas", "tahiti", "california", "usa", "hawaii", "waves", "hawaii", "california", "polynesia"], "difficult_direct_answer": false, "rationales": ["Two people are surfing. surfing originated in polynesia.", "The original surfing originated here but modern type was in hawaii", "The sport of surfing is invented in polynesia."], "image": "train2014/COCO_train2014_000000535692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107535, "question_id": "jiKUYa6Y3mjVA4PGPfka2Q", "question": "Why is the woman pressing the box?", "choices": ["cross street", "contact police", "get cab", "get help"], "correct_choice_idx": 0, "direct_answers": ["change lights", "filling cup", "crosswalk", "to cross", "cross street", "cross street", "walking", "crossing street", "playing", "walk button"], "difficult_direct_answer": true, "rationales": ["She is at the crosswalk.", "If the woman pushes the button there, it alerts the traffic signal that a person is now waiting for the lights to change. if she didn't do that, the light won't change, and she'll be stuck where she is.", "The woman wants to go across the street."], "image": "train2014/COCO_train2014_000000107535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59752, "question_id": "jiNHdAXBFKT7KYt6RrYhDy", "question": "What powers those boats?", "choices": ["humans", "steam", "gas", "propane"], "correct_choice_idx": 0, "direct_answers": ["kinetic energy", "people", "humans", "sticks", "rows", "small motors", "oars", "engines", "small engines", "paddle"], "difficult_direct_answer": true, "rationales": ["The boats have no motors so they must be powered manually.", "Humans will power the boat.", "There are no gas engines so they have to be operated by hand."], "image": "val2014/COCO_val2014_000000059752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485014, "question_id": "jigZNEpaKZxw62ERiXkeXf", "question": "What is the woman holding?", "choices": ["remote", "phone", "ball", "book"], "correct_choice_idx": 0, "direct_answers": ["remote", "remote control", "wii controller", "controller", "wii remote", "wii remote", "wii controller", "wii", "remote", "wii controller"], "difficult_direct_answer": false, "rationales": ["She has as wii remote.", "She is playing nintendo wii.", "She is playing a nintendo wii game."], "image": "train2014/COCO_train2014_000000485014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346950, "question_id": "jihXgGxpCLNDzrqwDmy4DZ", "question": "What is the man's job?", "choices": ["soldier", "waiter", "electrician", "dancer"], "correct_choice_idx": 0, "direct_answers": ["soldier", "soldier", "soldier", "military", "military", "military", "military", "military", "soldier", "soldier"], "difficult_direct_answer": false, "rationales": ["The man is wearing camouflage, one of his kids is wearing an army tee shirt, and the other is holding an american flag. the man is in the military.", "The job is a soldier.", "The man is wearing a military uniform."], "image": "train2014/COCO_train2014_000000346950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571160, "question_id": "jijTGVPYJ97r7KP4ozRDqP", "question": "What is the man doing with the black funnel shapes object?", "choices": ["cheering", "taking photos", "singing", "announcing"], "correct_choice_idx": 1, "direct_answers": ["taking pictures", "photographing", "balancing", "taking photos", "taking pictures", "recording", "taking pictures", "holding them", "aiding player", "taking picture"], "difficult_direct_answer": false, "rationales": ["It's a long lense on a digital camera.", "The funnel shaped object is attached to a camera. someone at a sports event with a camera in hand is likely there taking photos.", "The black funnel shaped object is a camera."], "image": "train2014/COCO_train2014_000000571160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98872, "question_id": "jisz73nEJfePx77o2nAP3z", "question": "What is the relationship between the man and the woman?", "choices": ["lovers", "coworkers", "friends", "siblings"], "correct_choice_idx": 0, "direct_answers": ["lovers", "partners", "lovers", "girlfriend", "romantic", "husband/wife", "dating", "unknown", "sweethearts", "couple"], "difficult_direct_answer": true, "rationales": ["They are performing an intimate act that only people in a romantic relationship would do.", "They are super close to be anything but in love.", "They are about to kiss."], "image": "val2014/COCO_val2014_000000098872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104095, "question_id": "jiuYVAa5jL9aPtidG47pAQ", "question": "The hides from the cows are used to produce what?", "choices": ["toys", "leather", "plastic", "poly carbon"], "correct_choice_idx": 1, "direct_answers": ["leather", "leather", "leather", "leather", "leather", "leather", "leather", "boots", "boots", "leather"], "difficult_direct_answer": false, "rationales": ["Cow hides are generally used for leather production.", "The cows produce leather.", "Cows that live on a pasture are used for their skin."], "image": "val2014/COCO_val2014_000000104095.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396324, "question_id": "jjBy5n5nNabgJmEm8tuuRY", "question": "How was the sandwich able to get the markings?", "choices": ["bbq", "fried", "boiled", "panini pressed"], "correct_choice_idx": 3, "direct_answers": ["grilled", "grill", "grill", "grill", "panini machine", "panini pressed", "grill", "grill", "grilled", "grilled"], "difficult_direct_answer": false, "rationales": ["The sandwich is known as this kind of thing, and the grill marks are deep.", "The sandwich was cooked on a special grill that cooks it simultaneously from above and below and leaves the grill marks visible.", "It was pressed in a device that has ridged marking like that of a panini presser."], "image": "val2014/COCO_val2014_000000396324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287396, "question_id": "jjCYTekWtC7K7iJkpjr3Gf", "question": "Why is the bus without passengers?", "choices": ["passengers exiting", "accident", "garage bound", "broken down"], "correct_choice_idx": 2, "direct_answers": ["garage maintenance", "broken", "bus", "broken", "towards garage", "garage bound", "driving", "garage bound", "going garage", "quitting time"], "difficult_direct_answer": false, "rationales": ["The bus has a sign that says it is headed to the garage.", "The bus is going to the garage.", "Per the sign on the upper level, it's heading to a."], "image": "val2014/COCO_val2014_000000287396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529303, "question_id": "jjE8HYUTjmaXJnNwRPxSCv", "question": "What year was this sport originally created?", "choices": ["2000", "2012", "1873", "1993"], "correct_choice_idx": 2, "direct_answers": ["1873", "12th century", "years ago", "1873", "19th century", "eleven hundred", "1901", "1874", "twelfth century", "1873"], "difficult_direct_answer": false, "rationales": ["The year was 1873.", "This sport is tennis. it was invented by major walter clapton wingfield before 1900.", "Tennis was created in the 19th century."], "image": "train2014/COCO_train2014_000000529303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499135, "question_id": "jk6k9BSnWUuXoovD88ZCpX", "question": "If need be who can run the fastest?", "choices": ["old woman", "blonde woman", "taxi driver", "neither woman"], "correct_choice_idx": 1, "direct_answers": ["walking", "both", "front girl", "yound lady", "younger woman", "blonde woman", "woman", "younger woman", "sneaker wearer", "blonde woman"], "difficult_direct_answer": false, "rationales": ["The lady in front is much younger than the lady in the back.", "She is wearing tennis shoes.", "The blond woman on the path is wearing sneakers that would be suitable for running fast."], "image": "train2014/COCO_train2014_000000499135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524925, "question_id": "jkDktCbr9KVRjxNbZeSy8X", "question": "What are cables hooked to these planes for?", "choices": ["kite flying", "sales gimmick", "holding steady", "racing feature"], "correct_choice_idx": 2, "direct_answers": ["keep down", "tow banners", "fuel", "for parking", "holding steady", "security", "high winds", "securing", "flight", "tethers"], "difficult_direct_answer": true, "rationales": ["The cables hold the plane down.", "The cables hold it steady.", "The cables are hooked to the planes in order to hold them steady."], "image": "train2014/COCO_train2014_000000524925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202609, "question_id": "jkUjzHbPc5urJazrhvJyHM", "question": "What type is served here?", "choices": ["white", "black", "red", "rose"], "correct_choice_idx": 0, "direct_answers": ["lebanese", "dip food", "dumpling", "food", "fried", "oil food", "mexican", "samosa", "samosas", "white"], "difficult_direct_answer": true, "rationales": ["The wine is very lightly colored", "There is wine to the right. it is not dark enough to be red, black, or rose.", "The wine in the glass is not a wine with any color."], "image": "val2014/COCO_val2014_000000202609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337339, "question_id": "jkXRnpNutkm6ZRyrpDgQBC", "question": "What are the boys doing with the circular object?", "choices": ["karate", "selling", "trading", "playing"], "correct_choice_idx": 3, "direct_answers": ["playing", "playing", "throwing it", "throwing around", "playing", "throwing", "throw", "playing", "holding", "tossing"], "difficult_direct_answer": false, "rationales": ["The boy has a frisbee.", "The boys are carrying around a large disc which is made for throwing.", "These kids are playing."], "image": "train2014/COCO_train2014_000000337339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532555, "question_id": "jkYHfaBLsEoepYyck7voD7", "question": "Why is the horse wearing a blanket?", "choices": ["shaved", "cold", "pregnant", "protection"], "correct_choice_idx": 1, "direct_answers": ["preserve warmth", "cold", "cold", "cold", "stay warm", "warmth", "warmth", "winter", "cold", "cold"], "difficult_direct_answer": false, "rationales": ["The blanket will give the horse warmth on the core of its body.", "The horse needs it for warmth in the barn.", "It's cold outside."], "image": "train2014/COCO_train2014_000000532555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35705, "question_id": "jkaoyEPrZQr4fbzw2q8Axc", "question": "Which item might she use on her skin?", "choices": ["box", "paper", "metal", "tube"], "correct_choice_idx": 3, "direct_answers": ["lotion", "lotion", "lotion", "tube", "lotion", "lotion", "lotion", "lotion", "lotion", "lotion"], "difficult_direct_answer": false, "rationales": ["The item is full of lotion intended for skin.", "A container of lotion is on a table near a girl on the computer.", "There is a tube with ointment on a surface next to her laptop. ointments are used on skin to help it look more young."], "image": "val2014/COCO_val2014_000000035705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387328, "question_id": "jku26e3WkdBhB8egAjzXUp", "question": "Which direction are the three people on the right walking?", "choices": ["towards", "right", "away", "left"], "correct_choice_idx": 3, "direct_answers": ["left", "left", "towards left", "left", "left", "left", "left", "left", "towards left", "left"], "difficult_direct_answer": false, "rationales": ["Based on the knee bend one can tell which direction they are facing and the knee bend also suggests they are in motion. if they where moving in the direction they are facing they would be going toward the left of the image.", "If taking into account the direction in relation to the sides of the image, then this is the right direction.", "All are going towards the left since they are walking away from the right corner of the image."], "image": "val2014/COCO_val2014_000000387328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414934, "question_id": "jm2JynYynR48kJnMKNTopW", "question": "Where is this vehicle headed?", "choices": ["beach", "graveyard", "malt shop", "funeral home"], "correct_choice_idx": 0, "direct_answers": ["beach", "beach", "beach", "beach", "beach", "beach", "beach", "beach", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["Surfboards are being loaded onto the vehicle. in order surf, they need to find a large body of water with waves.", "They are loading in surf boards.", "The vehicle is being loaded up with surfboards."], "image": "val2014/COCO_val2014_000000414934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274916, "question_id": "jm2ac2zxjmGyibeugvzbmn", "question": "What state is the woman and her dog in?", "choices": ["new york", "new jersey", "massachusetts", "connecticut"], "correct_choice_idx": 2, "direct_answers": ["massachusetts", "massachusetts", "sitting", "sitting", "massachusetts", "massachusetts", "sitting", "massachusetts", "massachusetts", "massachusetts"], "difficult_direct_answer": false, "rationales": ["The city on the stonework, boston, is known to be in this state.", "There is a sign engraved in stone in the ground. the sign says \"boston\", and boston is in massechusetts.", "The state is ma."], "image": "train2014/COCO_train2014_000000274916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65292, "question_id": "jm4zeosoYTLEvncQ7zF2GA", "question": "The woman atop the horse rides in what style here?", "choices": ["straddling", "unicycle", "side car", "side saddle"], "correct_choice_idx": 3, "direct_answers": ["side saddle", "side saddle", "side saddle", "formal", "side saddle", "dressage", "victorian", "side saddle", "mourner's", "queen"], "difficult_direct_answer": false, "rationales": ["The woman's leg appears up over the top of the saddle and based on her body position this would result in two legs being on the same side of the horse.", "She saddle is on the side of her head.", "The woman is in a saddle."], "image": "train2014/COCO_train2014_000000065292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419159, "question_id": "jmA6MSbqPVAVnKwFZn4US2", "question": "If she violently jabbed her stylus straight up and down from this position what most likely would happen?", "choices": ["damage desk", "damage tablet", "damage keyboard", "damage guitar"], "correct_choice_idx": 1, "direct_answers": ["broken screen", "break", "damage equipment", "break", "screen line", "break screen", "damage tablet", "sell", "break tablet", "injure herself"], "difficult_direct_answer": true, "rationales": ["The item her stylus is on is a fragile and expensive electronic.", "She would probably cause harm to the tablet.", "She'd damage it."], "image": "train2014/COCO_train2014_000000419159.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361282, "question_id": "jmMq2VLPXCHNibf66kNHej", "question": "What type pressure can be discerned here?", "choices": ["blood", "barometric", "none", "gas"], "correct_choice_idx": 1, "direct_answers": ["barometric", "air", "barometric", "barometric", "barometric", "air pressure", "air", "air pressure", "air pressure", "barometric"], "difficult_direct_answer": false, "rationales": ["The pressure is barometric.", "It is a barometer.", "A contraption measure pressure in the air."], "image": "val2014/COCO_val2014_000000361282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282444, "question_id": "jmPnTTvHuRWJE8R9YRzKSN", "question": "Why are the men dressed in white?", "choices": ["dress code", "personal preference", "fashion", "visibility"], "correct_choice_idx": 0, "direct_answers": ["chef", "chef uniform", "chef uniform", "chef uniform", "dress code", "chefs", "chef uniform", "working", "chefs", "chef"], "difficult_direct_answer": false, "rationales": ["He appears to be in a chef outfit in a kitchen.", "This is his work uniform. he works in a kitchen as a chef.", "He's a chef."], "image": "train2014/COCO_train2014_000000282444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330478, "question_id": "jmget7656ys3FY37jnU9Pn", "question": "What is the best luggage brand in the world?", "choices": ["rimowa", "samsonite", "delsey", "away"], "correct_choice_idx": 3, "direct_answers": ["away", "american tourister", "away brand", "away", "samsonite", "samsonite", "samsonite", "rolex", "samsonite", "samsonite"], "difficult_direct_answer": false, "rationales": ["The question is subjective, but answer d is commonly regarded as one of the most widely used and preferred brands.", "Samsonite is a big brand of luggage.", "The away is widely known to be the best."], "image": "val2014/COCO_val2014_000000330478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524382, "question_id": "jmkC8DEbJ7Eyi93YQ3GAe4", "question": "What is behind the fence?", "choices": ["dogs", "chickens", "monkeys", "cats"], "correct_choice_idx": 0, "direct_answers": ["dogs", "dogs", "dogs", "dogs", "dogs", "beagles", "beagles", "beagles", "dogs", "dogs"], "difficult_direct_answer": false, "rationales": ["The dogs are in an enclosed cage.", "These are canines.", "There are four hound dogs behind the fence."], "image": "val2014/COCO_val2014_000000524382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467951, "question_id": "jn47WYJ3Abnh6GAfPdqZFZ", "question": "Why are the flowers immersed inside a bowl of water?", "choices": ["decoration", "nothing", "fun", "avoid withering"], "correct_choice_idx": 3, "direct_answers": ["flower arrangement", "avoid withering", "keep alive", "freshness", "keep fresh", "keep alive", "keep fresh", "maintain life", "keep fresh", "for display"], "difficult_direct_answer": false, "rationales": ["The water is to keep them alive longer.", "Flowers are cut from live plants that need water to stay alive. when cut it is common practice to place them in water to try to keep them fresher longer before they start to decompose.", "The flowers are avoiding withering."], "image": "val2014/COCO_val2014_000000467951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56701, "question_id": "jn6HgrtAEKWj9YhLdWBv3q", "question": "Why is she laying on the sofa?", "choices": ["comfortable", "is lost", "avoid dog", "can't walk"], "correct_choice_idx": 0, "direct_answers": ["reading", "reading", "reading", "reading book", "comfortable", "reading", "reading", "resting", "reading", "to rest"], "difficult_direct_answer": false, "rationales": ["The woman is seen reading the book comfortably.", "Couches are made of thick cushions for resting on.", "The woman is looking for comfort while reading."], "image": "val2014/COCO_val2014_000000056701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252198, "question_id": "jnaLXGmtt7eMikj5eUKtK7", "question": "What are the items in the brown and grey receptacle for?", "choices": ["washing", "stirring", "wiping", "eating"], "correct_choice_idx": 2, "direct_answers": ["wiping mouth", "eating", "napkins", "hotdogs", "eating", "napkins", "wiping mouth", "wiping", "napkins", "hygiene"], "difficult_direct_answer": false, "rationales": ["The items are for wiping.", "The receptacle has tissue.", "Napkins are used to clean food off of a persons face or clothing."], "image": "train2014/COCO_train2014_000000252198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469913, "question_id": "jnmMFUuxD6Fv4AHAHUgzd5", "question": "What is this person trying to do?", "choices": ["roll", "descend", "flip", "ascend"], "correct_choice_idx": 1, "direct_answers": ["ski", "maximize speed", "ski", "skiing", "ski fast", "compete", "ski", "descend", "glide", "ski fast"], "difficult_direct_answer": false, "rationales": ["They are headed rapidly down a steep hill.", "The skier is trying to maximize his speed and can only be doing this if he is descending the slope.", "The skiier is trying to go down the hill."], "image": "train2014/COCO_train2014_000000469913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471737, "question_id": "jnuCXDJZdBumwWbwJmzMy5", "question": "Who invented this vehicle?", "choices": ["richard trevithick", "orville wright", "jeff goldblum", "bill nye"], "correct_choice_idx": 0, "direct_answers": ["richard trevithick", "richard trevithick", "george stephenson", "george stephenson", "no clue", "edison", "george stephenson", "train", "richard trevithick", "richard trevithick"], "difficult_direct_answer": false, "rationales": ["The inventor is richard.", "Richard trevithick is the inventor of the train.", "The others are the inventor of the airplane, a scientist and an actor."], "image": "val2014/COCO_val2014_000000471737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378458, "question_id": "jnvWBGpHPBPwtB8bctoXTB", "question": "What type of company is on his shirt?", "choices": ["hospital", "restaurant", "blog", "transportation"], "correct_choice_idx": 2, "direct_answers": ["metafilter", "community weblog", "metafilter", "blog", "metafilter", "security", "technological", "computer", "general-interest weblog", "filter company"], "difficult_direct_answer": false, "rationales": ["The company on this shirt is a blog.", "A company logo is on the front of a shirt.", "Metafilter is a blog."], "image": "train2014/COCO_train2014_000000378458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382104, "question_id": "jnw4XKqQqZ6qySBs6qX4n9", "question": "Ignoring everything else about the image what should pedestrians do about crossing the street according to the traffic light?", "choices": ["go around", "wait", "cross", "give up"], "correct_choice_idx": 1, "direct_answers": ["should stop", "wait", "stop", "stop", "wait", "wait", "stop", "stop", "stop", "stop"], "difficult_direct_answer": false, "rationales": ["People will wait.", "The red hand instructs them it isn't safe to proceed yet.", "The streetlight depicted shows a red hand, which is a signal to do this."], "image": "train2014/COCO_train2014_000000382104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11147, "question_id": "jo7WRdiiGErpE2KV6HXeaU", "question": "What was the original material of the stringy food on the right side of the sandwich?", "choices": ["eggs", "eggplant", "radish", "potato"], "correct_choice_idx": 3, "direct_answers": ["potato", "potatoes", "bread", "potato", "potato", "potatoes", "potato", "potato", "potato", "potatoes"], "difficult_direct_answer": false, "rationales": ["The stringy food items are french fries.", "The material is the potato.", "These were originally potatoes."], "image": "train2014/COCO_train2014_000000011147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352257, "question_id": "joCW72pnuRbFr5i9qexevc", "question": "What is the most likely reason for the word appearing on the wall behind the athlete?", "choices": ["paid advertisement", "player name", "instructions", "tournament name"], "correct_choice_idx": 0, "direct_answers": ["sponsner", "paid advertisement", "advertisement", "tournament sponsor", "advertisement", "advertisement", "advertising", "advertisement", "advertising", "sponsors"], "difficult_direct_answer": false, "rationales": ["The tournament is sponsored by this brand.", "It's a paid ad.", "The word canon is on a banner behind the woman in prominent lettering."], "image": "val2014/COCO_val2014_000000352257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273728, "question_id": "joL9LGpsVRNZ7dVR2ksj5M", "question": "How did this man get here today?", "choices": ["bus", "on horseback", "car", "tram"], "correct_choice_idx": 1, "direct_answers": ["lifted himself", "horse", "horse", "horse", "on horse", "on horseback", "rode horse", "horseback", "by horse", "horse"], "difficult_direct_answer": false, "rationales": ["The answer is not directly knowable but the man is a top answer a based on the features visible and this type of animal can be used as a mode of transportation.", "The man is riding on a horse.", "The man is sitting on top of a horse."], "image": "val2014/COCO_val2014_000000273728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222512, "question_id": "jocn7QXpB7Hji8BoKm4Tmh", "question": "What is the woman doing while holding the baby?", "choices": ["pampering baby", "teaching baby", "feeding baby", "surfing internet"], "correct_choice_idx": 3, "direct_answers": ["surfing internet", "using mouse/computer", "working", "working", "computer work", "holding mouse", "using computer", "working", "holding mouse", "using mouse"], "difficult_direct_answer": false, "rationales": ["She's surfing the net.", "The woman is on her computer.", "She is on the computer looking at stuff."], "image": "val2014/COCO_val2014_000000222512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407644, "question_id": "joitsZ87nPCcEyVKKswHVu", "question": "What conveyance creates shadows seen here?", "choices": ["uber", "ski lift", "taxi", "bus"], "correct_choice_idx": 1, "direct_answers": ["stance", "snowboard", "sun", "sun", "sun", "sun", "ski lift", "ski lift", "sun", "skier"], "difficult_direct_answer": false, "rationales": ["The people are at a ski mountain which would have answer a present and the shadows in question resemble the shape of answer a.", "There is a ski lift.", "The lift is creating shadows here."], "image": "val2014/COCO_val2014_000000407644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182893, "question_id": "jon4uPiyCEfRWmSjvzuw6c", "question": "What is the most likely time of day?", "choices": ["midnight", "sunset", "noon", "morning"], "correct_choice_idx": 1, "direct_answers": ["six pm", "dusk", "evening", "sunset", "sunset", "sunset", "dusk", "sunset", "evening", "morning"], "difficult_direct_answer": false, "rationales": ["The sun is low in the sky", "The orange glow present in this mountaintop scene is associated with the hour before the sun sets", "The color of the snow and the sky have those shades of purples and yellows that are very evident with the sun going down."], "image": "train2014/COCO_train2014_000000182893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3178, "question_id": "jou3yp8MLFCb7ZsEMFtDUR", "question": "What is the young giraffe doing?", "choices": ["walking", "feeding", "running", "laying down"], "correct_choice_idx": 1, "direct_answers": ["eating/nursing", "nursing", "eating", "nursing", "nursing", "nursing", "sucking", "nursing", "nursing", "feeding"], "difficult_direct_answer": false, "rationales": ["The baby is eating from his mother", "He is nursing from his mom.", "The youngest giraffe is getting some milk."], "image": "train2014/COCO_train2014_000000003178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45148, "question_id": "jp4FAZvoXtRXZqaKSMkhb4", "question": "The goods in the image can be prepared by which thermal procedure?", "choices": ["baking", "toasting", "frying", "grilling"], "correct_choice_idx": 0, "direct_answers": ["baking", "baking", "baking", "baking", "baking", "baking", "baking", "baking", "baking", "baking"], "difficult_direct_answer": false, "rationales": ["These type of goods are baked in an oven.", "The items in question are pastries based on their size, shape and color. pastries are commonly known to be prepared in the manner of answer a.", "The food items contain wheat flour which is baked to produce the above foods."], "image": "train2014/COCO_train2014_000000045148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31981, "question_id": "jp7ULjsxyjkFVJk59exx9d", "question": "The frosting is probably made from what?", "choices": ["buttercream", "honey", "chocolate", "fondant"], "correct_choice_idx": 0, "direct_answers": ["cream", "buttercream", "cake", "sugar", "cream", "sugar", "sugar", "coconut", "coconut", "whip cream"], "difficult_direct_answer": false, "rationales": ["The frosting is light in color and whipped.", "The frosting is buttercream.", "Given the white splotchy texture of this cakes frosting buttercream is the only possible main ingredient of those listed here."], "image": "val2014/COCO_val2014_000000031981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385389, "question_id": "jp7irp9EqLDxYLzeP7g4ij", "question": "What is the man that is standing wearing?", "choices": ["helmet", "scarf", "jacket", "sombrero"], "correct_choice_idx": 2, "direct_answers": ["khakis", "jacket", "boots", "jacket", "jacket", "jacket", "work clothes", "jacket", "jacket", "clothes"], "difficult_direct_answer": false, "rationales": ["He has a thicker upper body covering that looks like a jacket.", "A man in pants who is standing behind another person who is crouched down is wearing a heavy coat.", "He has a jacket on."], "image": "train2014/COCO_train2014_000000385389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50244, "question_id": "jpE96XQn7hks6wQm8YHVtj", "question": "She is dressed to attend what kind of ceremony?", "choices": ["baptism", "graduation", "funeral", "wedding"], "correct_choice_idx": 1, "direct_answers": ["graduation", "graduation", "graduation", "graduation", "graduation", "graduation", "graduation", "graduation", "graduation", "graduation"], "difficult_direct_answer": false, "rationales": ["She's wearing a grad cap.", "Student who finish school can wear the cap at the finishing ceremony.", "She's going to graduation."], "image": "train2014/COCO_train2014_000000050244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504951, "question_id": "jpHSz2S4EpHuRJFXw5bAaf", "question": "Why is the man holding the trash bag carrying a large stick?", "choices": ["help walking", "for protection", "poke trash", "as weapon"], "correct_choice_idx": 2, "direct_answers": ["grabbing trash", "poke trash", "picking trash", "collecting trash", "metal detector", "janitor", "collect trash", "community service", "collects litter", "collecting garbage"], "difficult_direct_answer": true, "rationales": ["Besides this man with the bag and stick, there appears others in the background do something similar. it would seem that this is a concerted effort by a group to pick up and poke at trash.", "He is using the stick to spear the garbage left on the ground so that he doesn't have to bend over or touch the garbage.", "The man is poking around for trash."], "image": "train2014/COCO_train2014_000000504951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400303, "question_id": "jpKHaE8fkQsMtX6yLAUmPc", "question": "What has caused the blur in the middle of the photo?", "choices": ["window glare", "mist", "motion", "clouds"], "correct_choice_idx": 0, "direct_answers": ["light", "window", "accident", "reflection", "reflection", "reflection glare", "reflection", "window glass", "window reflection", "window glare"], "difficult_direct_answer": false, "rationales": ["We can see parts of where the person who took the picture was reflected in the window of this image.", "A faint reflection can be seen in a seen looking down from a window.", "A faint reflection can be seen in a photograph. taking a photo through glass can cause a reflection to be seen."], "image": "train2014/COCO_train2014_000000400303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532503, "question_id": "jpLWdtaZ4iVMivTuwZGqWo", "question": "What is she doing?", "choices": ["falling backwards", "taking selfie", "watching waterfall", "arguing"], "correct_choice_idx": 1, "direct_answers": ["selfie", "tsking selfie", "taking selfie", "taking pictures", "taking selfie", "taking selfie", "playing", "taking photo", "taking selfie", "taking photo"], "difficult_direct_answer": false, "rationales": ["She is holding the camera away from herself and her companion in order to get both of them in the photo.", "This option showed the woman taking a selfie since she was nabbing a photo of herself.", "She is holding the camera up pointing at herself."], "image": "val2014/COCO_val2014_000000532503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142825, "question_id": "jpUqcxckLUT5D6NMcESimD", "question": "Which body part of the largest animal might be the object of the most smuggling?", "choices": ["trunk", "neck", "ears", "tusks"], "correct_choice_idx": 3, "direct_answers": ["tusk", "tusk", "tusk", "tusks", "tusks", "tusks", "tusks", "tusks", "tusks", "tusk"], "difficult_direct_answer": false, "rationales": ["The large elephant has ivory tusks that are worth a lot of money and often smuggled.", "They are the most noticeable unusual part of this animal.", "Tusks are often sold on the black market."], "image": "train2014/COCO_train2014_000000142825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163483, "question_id": "jpwzfALpr9r6N397HVT6T4", "question": "Why is the man on the bike wearing yellow?", "choices": ["as punishment", "visibility", "as cosplay", "style"], "correct_choice_idx": 1, "direct_answers": ["safety purposes", "being careful", "safety", "visibility", "policeman", "safety visibility", "safety", "safety", "likes it", "safety reflection"], "difficult_direct_answer": false, "rationales": ["A man is wearing a brightly outfit while riding a motorcycle.", "It is dangerous to ride a motorcycle because car drivers don't always see them so the yellow makes him stand out", "The man is wearing a bright color that can be seen easily."], "image": "train2014/COCO_train2014_000000163483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216110, "question_id": "jpxNwbmRiEKE3bFJNurLZX", "question": "What job does the person in black standing have?", "choices": ["waitress", "barker", "none", "hair stylist"], "correct_choice_idx": 3, "direct_answers": ["hairdresser", "hairdresser", "stylist", "hair stylist", "hair dresser", "hairstylist", "hair stylist", "stylist", "holding", "assistant"], "difficult_direct_answer": false, "rationales": ["Given the hair cutting paraphernalia, plastic covering on the person seated in a specialty swivel chair and hair product present in this scene we can conclude this is a hair salon.", "The job is a hair stylist.", "The woman is doing another woman's hair which means she is a hair stylist."], "image": "train2014/COCO_train2014_000000216110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555267, "question_id": "jpy2taZUKYyw2za8atau8M", "question": "Why is the train shaped like this?", "choices": ["more room", "less resistance", "new requirement", "trendy"], "correct_choice_idx": 1, "direct_answers": ["aerodynamics", "less resistance", "speed", "efficiency", "wind drift", "less pressure", "aerodynamics", "speed", "aerodynamics", "aerodynamic"], "difficult_direct_answer": false, "rationales": ["The train is designed to have more speed.", "The shape is aerodynamic, which helps it use less energy when moving.", "It can go faster."], "image": "val2014/COCO_val2014_000000555267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65336, "question_id": "jpzNXJwSZiwvihhZTV5ZQX", "question": "What age group are the magnets on the fridge for?", "choices": ["teenagers", "adults", "children", "babies"], "correct_choice_idx": 2, "direct_answers": ["kids", "children", "child", "toddlers", "five", "kindergarden", "preschool", "children", "kids", "before ten"], "difficult_direct_answer": false, "rationales": ["The group is for kids.", "The magnets are a kids' toy.", "Babies would be too young to play with these. adults and teenagers already know their letters and numbers."], "image": "train2014/COCO_train2014_000000065336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1290, "question_id": "jq8BwPBxyKBBbuJyoFucHh", "question": "Why is there a candle in the cake?", "choices": ["to celebrate", "for light", "to eat", "to cook"], "correct_choice_idx": 0, "direct_answers": ["birthday", "someone's birthday", "to celebrate", "someone's birthday", "birthday", "birthday", "birthday", "birthday age", "birthday", "birthday"], "difficult_direct_answer": false, "rationales": ["This is for the childs birthday celebration.", "It is a birthday cake. the candle is not edible, and it is not bright or hot enough to light up the room or cook.", "Candles are put on cakes to celebrate birthdays."], "image": "val2014/COCO_val2014_000000001290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309873, "question_id": "jqBHwByikuPxRoZBkJ9gR4", "question": "What are the two young people doing with the headphones?", "choices": ["listening", "pulling", "gaming", "fighting"], "correct_choice_idx": 0, "direct_answers": ["enjoying music", "listening", "sharing", "sharing", "sharing", "sharing", "sharing", "listening", "hearing music", "listening"], "difficult_direct_answer": false, "rationales": ["The people are watching something on the computer screen, most likely a movie, and are using the headphones to hear the dialogue.", "The two people are listening to something with the headphones.", "The people are listening to music."], "image": "train2014/COCO_train2014_000000309873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551648, "question_id": "jqz3QGpF4fFipXPMVBbo7T", "question": "What is the baby doing to the phone?", "choices": ["eating", "pressing buttons", "staring at", "throwing"], "correct_choice_idx": 0, "direct_answers": ["chewing", "sucking", "playing", "chewing it", "chewing", "chewing", "chewing", "eating", "chewing it", "chewing"], "difficult_direct_answer": false, "rationales": ["The baby is chewing on the phone.", "The phone is in the baby's mouth.", "The phone is in the mouth of the child."], "image": "val2014/COCO_val2014_000000551648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236078, "question_id": "jr9TD3Znkk5ips5GSn8Rbs", "question": "What is he about to do?", "choices": ["dunk", "throw", "juggle", "dribble"], "correct_choice_idx": 1, "direct_answers": ["pitch", "pitch", "pitch", "throw ball", "pitch", "throw", "throw pitch", "throw ball", "pitch", "pitch"], "difficult_direct_answer": false, "rationales": ["He is getting ready to pitch the ball to the batter.", "The pitcher is throwing the ball.", "He's throwing."], "image": "train2014/COCO_train2014_000000236078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35975, "question_id": "jr9hw2rVSfwArX4MQUPqVd", "question": "What ingredient used as a veg toppings of the pizza?", "choices": ["celery", "pasta", "mushroom", "capsicum"], "correct_choice_idx": 2, "direct_answers": ["mushrooms", "mushroom", "mushrooms", "mushroom", "mushroom", "mushroom", "mushrooms", "mushroom", "mushroom", "mushroom"], "difficult_direct_answer": false, "rationales": ["This pizza has meat toppings but the only vegetable topping is mushrooms.", "There are many mushrooms.", "Small brown pieces of a vegetable topping are on a pizza."], "image": "val2014/COCO_val2014_000000035975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6789, "question_id": "jrP8xhSjzqqMUbACLnkfqX", "question": "What purpose do the wires on the poles serve to do?", "choices": ["gas", "heat", "carry electricity", "transportation"], "correct_choice_idx": 2, "direct_answers": ["power train", "carry electricity", "electricity", "carry electricity", "electricity", "electric power", "telephone lines", "telephone lines", "transfer electricity", "send electricity"], "difficult_direct_answer": false, "rationales": ["The wires are narrow and attached to a structure known to hold wires for the purposes of answer a.", "The purpose is for electricity.", "The wires are electric wires that are used to provide power."], "image": "val2014/COCO_val2014_000000006789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227325, "question_id": "jrie22Nxo6NXY8xxJwAD5z", "question": "What style of pizza is the man having?", "choices": ["flat bread", "new york", "deep dish", "stuffed crust"], "correct_choice_idx": 2, "direct_answers": ["deep dish", "deep dish", "pan", "pan", "deep dish", "deep dish", "deep dish", "deep dish", "deep dish", "deep dish"], "difficult_direct_answer": false, "rationales": ["This is a thicker crust pizza.", "The pizza is thicker than average pizzas.", "The man is eating a deep dish pizza which is a style of pizza with crusts that are a few inches deep."], "image": "train2014/COCO_train2014_000000227325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490511, "question_id": "jrwioQmmJ7eGmh3siXBppu", "question": "What does the blue protective device help protect?", "choices": ["chest", "head", "knees", "elbows"], "correct_choice_idx": 1, "direct_answers": ["head", "head", "head", "helmet", "head", "gutter cover", "head", "head", "bike", "head"], "difficult_direct_answer": false, "rationales": ["The blue device is a helmet.", "His head.", "The blue item is a helmet, which is worn on the head for safety purposes."], "image": "train2014/COCO_train2014_000000490511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74110, "question_id": "js6z6sKw8nDiYfwkA7VjW9", "question": "What substance can be obtained from the red object?", "choices": ["juice", "oil", "water", "fire"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The red object is a fire hydrant.", "There is water in the red object.", "Water is nearby."], "image": "train2014/COCO_train2014_000000074110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565501, "question_id": "js87oHFY2c6J4jsyWJD7RU", "question": "The chains visible here are meant to retain what?", "choices": ["caps", "firemen", "helmets", "dog tags"], "correct_choice_idx": 0, "direct_answers": ["covers", "caps", "water", "caps", "cap", "secure caps", "caps", "caps", "sides", "caps"], "difficult_direct_answer": false, "rationales": ["The chain seen here is attached to the bolt's which serve as caps on this hydrant.", "The chains connect the caps of the hydrant to the body. connecting to the body would mean they could be used to open the hydrant but could not be removed entirely.", "The caps are visible."], "image": "train2014/COCO_train2014_000000565501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329664, "question_id": "jsEWdqd67BuYnc6nLSUyzL", "question": "What does the man in the square hold do here?", "choices": ["films", "keeps score", "hides", "sleeps"], "correct_choice_idx": 0, "direct_answers": ["toughing", "film", "catching ball", "film", "look out", "take video", "videographer", "film", "films", "put score"], "difficult_direct_answer": false, "rationales": ["The man is holding a video camera and aiming it out of the hole. cameras are used for filming and these holes are commonly used in sports venues to give photographers unimpeded vantage points to record from without interfering in the action.", "The man in the square is recording the game.", "The man in the square is holding a camera. he is using it for its intended purpose."], "image": "train2014/COCO_train2014_000000329664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20598, "question_id": "jsEp5iegcE6fXaYpWnYCvE", "question": "How much pizza should a child eat?", "choices": ["2 slices", "4 slices", "5 slices", "3 slices"], "correct_choice_idx": 0, "direct_answers": ["one", "two slices", "two slices", "2 slices", "it's child-dependent", "three slices", "one slice", "two", "two slices", "two slices"], "difficult_direct_answer": false, "rationales": ["Truthfully, this isn't an easy answer, but the foreground child's plate suggests this answer.", "They are small people and don't have large stomachs. too much food will result in a stomach ache.", "Children should eat as little pizza as possible given the lack of nutrition and abundance of unhealthy ingredients."], "image": "val2014/COCO_val2014_000000020598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558166, "question_id": "jsFhFEGaPVN8kiTm26UhtK", "question": "What would most likely be stored in this type of location?", "choices": ["alcohol", "meat", "furniture", "produce"], "correct_choice_idx": 0, "direct_answers": ["wine", "wine", "wine", "wine", "wine spirits", "wine", "alcohol", "food", "food", "wine"], "difficult_direct_answer": false, "rationales": ["In order to age the alcohol it's stored in barrels.", "It is a wine cellar.", "The people are in an underground area. there are barrels behind them that are commonly used to store whiskey."], "image": "train2014/COCO_train2014_000000558166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289889, "question_id": "jsRsxPHi7yR8bBkGFNKR2S", "question": "Why are there so many stop signs?", "choices": ["emphasis", "storage", "many cars", "collector"], "correct_choice_idx": 1, "direct_answers": ["not installed", "event signs", "extra signs", "stacked up", "storage site", "storage site", "storage", "stop", "storage", "hauled away"], "difficult_direct_answer": false, "rationales": ["Items like this would not be in use as intended if they were so close together. a volume density like this would likely only occur where they were being stored before intended use.", "The signs are being stored.", "They're being stored."], "image": "val2014/COCO_val2014_000000289889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477335, "question_id": "jsU4kRUhuN5cMr72Brpxqm", "question": "How much older is this man now?", "choices": ["10 years", "22 years", "14 years", "30 years"], "correct_choice_idx": 2, "direct_answers": ["fourteen years", "fourteen years", "fourteen years", "14 years", "fourteen years", "fourteen years", "fourteen", "14 years", "14 years", "fourteen years"], "difficult_direct_answer": false, "rationales": ["The photo was taken in 2007.", "This image dates back to 2007 so given that the current date is 2021, the time span would make the man 14 years older.", "The date of the photo is visible in the bottom corner of the image and can be subtracted from this year."], "image": "train2014/COCO_train2014_000000477335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453989, "question_id": "jsUNsbeC5WkZZAQ84SAqqt", "question": "What can be found on the ground?", "choices": ["shells", "grass", "flowers", "shrubs"], "correct_choice_idx": 0, "direct_answers": ["sand", "sand", "sand", "sand", "sand", "sand", "sand", "shells", "sand", "sand"], "difficult_direct_answer": false, "rationales": ["The site of surfing is at the beach. the sand has many items such as shells.", "They wash up when the tide is high", "The beach likely has remnants of sea creatures."], "image": "train2014/COCO_train2014_000000453989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369476, "question_id": "jsXRddp9ATKWd3n2ERGmyP", "question": "What is the purpose of the half sphere to the left of the sign?", "choices": ["air freshener", "stop button", "camera", "light"], "correct_choice_idx": 2, "direct_answers": ["escape", "hand rail", "traffic light", "stop", "surveillance", "camera", "security camera", "paper", "timer", "security"], "difficult_direct_answer": true, "rationales": ["The purpose is to take a photo.", "The object is a camera to keep track of what is happening on the bus.", "There is a surveillance camera next to the sign."], "image": "train2014/COCO_train2014_000000369476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579614, "question_id": "jswq8BPXRcu5rnq6Gdouix", "question": "What do the two red signs in front of the cobblestone alley signal?", "choices": ["stop", "danger", "no entry", "private road"], "correct_choice_idx": 2, "direct_answers": ["no entry", "no enter", "stop", "no entry", "to stop", "no parking", "no entry", "no entry", "dont enter", "no cars"], "difficult_direct_answer": false, "rationales": ["Each red sign has a horizontal white line. they do not have text.", "Each sign has a red circle and a white horizontal line. these symbols indicate that vehicles should not proceed into the alley.", "The sign is a known prohibition type visual message for traffic."], "image": "train2014/COCO_train2014_000000579614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418144, "question_id": "jtCoNqPt4VEbsXC8ceSrCM", "question": "What are the birds perched on?", "choices": ["bench", "traffic light", "branch", "window"], "correct_choice_idx": 1, "direct_answers": ["traffic-light post", "stoplight", "wire", "traffic light", "street lights", "wires", "traffic light", "street light", "stoplight", "trees"], "difficult_direct_answer": false, "rationales": ["The birds are on top of lights.", "The birds are on the traffic light.", "There is a yellow stop light assembly at the end of the pole."], "image": "val2014/COCO_val2014_000000418144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199460, "question_id": "jtPiy2nVcpYn7J48NSed9m", "question": "What reactant or leavening agent is used in this dish?", "choices": ["none", "salt", "baking sprinkle", "yeast"], "correct_choice_idx": 3, "direct_answers": ["baking powder", "flour", "flour", "yeast", "yeast", "yeast", "yeast", "yeast", "flour", "tomato sauce"], "difficult_direct_answer": false, "rationales": ["The flour is poofed up so it uses yeast.", "It is a leavening agent for pizza dough.", "You need yeast to make the dough to rise."], "image": "train2014/COCO_train2014_000000199460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290935, "question_id": "jtg96Be9q46oTSYTeqPMKB", "question": "What type of shoes is the boy wearing?", "choices": ["adidas", "jordan", "reebok", "nike"], "correct_choice_idx": 0, "direct_answers": ["adidas", "skateboarding", "sneakers", "adidas", "skateboarding", "skateboarding", "sneakers", "skater shoes", "sneakers", "sport shoes"], "difficult_direct_answer": false, "rationales": ["He is wearing adidas.", "The shoes have three stripes on them which is the logo of this famous shoe company.", "The stripped edges on the shoe are typical of adidas shoes."], "image": "train2014/COCO_train2014_000000290935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49357, "question_id": "jtgtixdcd34AF5dyAuLgqY", "question": "What is the man using as a skate ramp?", "choices": ["rocks", "twig", "pole", "log"], "correct_choice_idx": 3, "direct_answers": ["tree", "wood", "wooden plank", "tree trunk", "log", "log", "log", "log", "pipe", "wood"], "difficult_direct_answer": false, "rationales": ["The person is on a round piece of wood that is long.", "The ramp is cylindrical and made of wood.", "The man is on a beach. he is using a piece of driftwood."], "image": "train2014/COCO_train2014_000000049357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47481, "question_id": "juAW6666BLR4GEYdzGgfvc", "question": "What is the woman most likely retrieving in front of the refrigerator?", "choices": ["water", "soda", "juice", "milk"], "correct_choice_idx": 0, "direct_answers": ["water", "ice", "hungry", "water", "ice", "sodas", "ice", "food", "drink", "ice water"], "difficult_direct_answer": false, "rationales": ["The woman wants water.", "The refrigerator has an ice maker. it can also output the liquid form of ice.", "There is a liquid dispenser on the front of the fridge. fridges usually do not have juice, milk, or soda dispensers."], "image": "train2014/COCO_train2014_000000047481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261871, "question_id": "juMxci5n24n5Gua7pisVdA", "question": "What environment are they at?", "choices": ["museum", "farm", "school", "amusement park"], "correct_choice_idx": 1, "direct_answers": ["farm", "farm", "rural", "farm", "rural", "farm", "farm", "farm", "farm", "farm"], "difficult_direct_answer": false, "rationales": ["This environment is in a pasture or field with a tractor hauling hay for the horses.", "They have a tractor and horses", "The people are using equipment and supplies consistent with answer a as well as the visible lifestock."], "image": "train2014/COCO_train2014_000000261871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333819, "question_id": "jvNLXJo2bKjNTCxT8oaX28", "question": "What animal is licking the plant?", "choices": ["bird", "camel", "tiger", "dog"], "correct_choice_idx": 3, "direct_answers": ["cat", "cat", "cat", "cat", "puppy", "dog", "dog", "dog", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The nose is that of a dog.", "The animal is a dog.", "A small black and white animal in a home is licking a plant."], "image": "train2014/COCO_train2014_000000333819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567901, "question_id": "jvSE72zUzmxYU7W7hG4APF", "question": "The large item on the right resembles what?", "choices": ["cat", "dog", "baby", "spaceship"], "correct_choice_idx": 3, "direct_answers": ["shield", "flying saucer", "convex mirror", "spaceship", "mirror", "spaceship", "umbrella", "space ship", "observatory", "fisheye lens"], "difficult_direct_answer": true, "rationales": ["The item on the right is silver, has a doom and plates of metal on the size. this closely resembles the capsule of a spaceship.", "The large item is a spaceship.", "The silver is shiny and reflects. spaceships are shiny and reflective."], "image": "train2014/COCO_train2014_000000567901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24980, "question_id": "jvSdFuAPH5YDTuyPcegGAQ", "question": "What type facility is shown?", "choices": ["train race", "bus stop", "train depot", "taxi stand"], "correct_choice_idx": 2, "direct_answers": ["train station", "train station", "train depot", "train station", "train station", "train station", "train station", "train station", "train station", "train station"], "difficult_direct_answer": false, "rationales": ["A train station is next to the train tracks.", "There are train tracks there, seats for people waiting for trains, and a platform as well.", "The facility is for trains."], "image": "train2014/COCO_train2014_000000024980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339705, "question_id": "jvaUwxqfWntpWMytBJcPDN", "question": "What type day are people enjoying the outdoors here?", "choices": ["still", "hot", "still", "windy"], "correct_choice_idx": 3, "direct_answers": ["afternoon", "sunny", "slightly overcast", "holiday", "warm", "windy", "shout practicing", "windy", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["They are kites.", "The people are using kites based on the items visible in the option which would be commonly used on a day with answer a type weather.", "Kites need wind to fly and these kites are flying well, so it must be a windy day."], "image": "val2014/COCO_val2014_000000339705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24813, "question_id": "jvfiWdLNoVn5fSxD83QBn6", "question": "What is preventing the trees from sliding into the ocean?", "choices": ["fence", "their roots", "their bark", "their leaves"], "correct_choice_idx": 1, "direct_answers": ["roots", "dirt", "roots", "dirt", "roots", "roots", "roots", "dirt", "their roots", "roots"], "difficult_direct_answer": false, "rationales": ["They extend deep into the ground to provide an anchor for the tree.", "Their roots go in to the ground pretty far so they are stable where they are.", "The trees have root stuck onto the rocks."], "image": "train2014/COCO_train2014_000000024813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171785, "question_id": "jvr72ggHJkrvAmAii89ir2", "question": "What kind of structure is this?", "choices": ["deck", "slide", "flume", "ride"], "correct_choice_idx": 0, "direct_answers": ["skateboard", "skateboard curl", "skateboard ramp", "skateboard ramp", "skating ground", "skate park", "ramp", "deck", "half pipe", "half pipes"], "difficult_direct_answer": true, "rationales": ["This is actually a slope.", "There is a beach visible in the background and this structure is made from wood. decks are made from wood and are frequently near bodies of water so this could be answer a.", "The structure is a deck."], "image": "train2014/COCO_train2014_000000171785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31923, "question_id": "jvubTK2NYYYfvJsGEuXsLv", "question": "Which woman will get soaked the least?", "choices": ["middle", "any", "left", "right"], "correct_choice_idx": 0, "direct_answers": ["middle", "green one", "green woman", "middle", "head", "middle woman", "middle", "center", "center", "middle"], "difficult_direct_answer": false, "rationales": ["The woman in the middle is covered the most by the umbrella and will stay dry.", "The woman in the center of this grouping of three has the most of her body covered by the umbrella. the two on the sides have more of themselves exposed to the rain.", "She'll be in the middle."], "image": "train2014/COCO_train2014_000000031923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518290, "question_id": "jw33gGUbdqn2JNou5VMzCb", "question": "How many birds are in the image?", "choices": ["three", "nine", "seven", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are two birds on shore and one in water.", "There are more than two but less than four birds.", "One is n a rock and the other two are closer to shore"], "image": "train2014/COCO_train2014_000000518290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91045, "question_id": "jwdmxGfYYegh9y5C3HJnvA", "question": "From what fruit comes the item being drunk here?", "choices": ["bananas", "cherries", "grapes", "apples"], "correct_choice_idx": 2, "direct_answers": ["grapes", "grape", "grape", "grapes", "grapes", "grapes", "grape", "grapes", "grapes", "grapes"], "difficult_direct_answer": false, "rationales": ["The fruit is grapes.", "Wine is made by pressing grapes and fermenting the juice", "I googled this and modern wines are made exclusively from grapes."], "image": "val2014/COCO_val2014_000000091045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340647, "question_id": "jwhdAFuGefMH5cUYb98Wtm", "question": "Which vehicle is a government vehicle?", "choices": ["firetruck", "none", "taxi", "van"], "correct_choice_idx": 0, "direct_answers": ["fire truck", "firetruck", "fire engine", "fire truck", "firetruck", "fire engine", "firetruck", "firetruck", "fire truck", "firetruck"], "difficult_direct_answer": false, "rationales": ["The large red and white vehicle is used by first responders who work for the government.", "These are run by community governments", "Governments tend to own this kind of vehicle due to their purpose and price."], "image": "train2014/COCO_train2014_000000340647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552237, "question_id": "jwmcj9xpUpvZpEw32xDxqq", "question": "What weather do these boys hope for?", "choices": ["storm", "doldrums", "wind", "rain"], "correct_choice_idx": 2, "direct_answers": ["wind", "summer", "windy", "windy", "windy", "windy", "kite", "sunny", "windy", "wind"], "difficult_direct_answer": false, "rationales": ["They need wind in order to lift their kites off the ground.", "Wind is needed for the kites to fly.", "The weather is windy."], "image": "val2014/COCO_val2014_000000552237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454210, "question_id": "jwyMKCbMkNXSPsRy7FyHx6", "question": "This animal is a symbol of what nation?", "choices": ["ukraine", "china", "scotland", "tanzania"], "correct_choice_idx": 1, "direct_answers": ["china", "america", "china", "china", "china", "china", "china", "china", "america", "china"], "difficult_direct_answer": false, "rationales": ["A panda is shown. china is associated with pandas.", "Panda's are a symbol from china.", "The panda is from china."], "image": "train2014/COCO_train2014_000000454210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74576, "question_id": "jx3HGdmzepvsGZhRvmRjnG", "question": "Which computer is most probably used in multiple locations?", "choices": ["laptop", "none", "middle", "left"], "correct_choice_idx": 0, "direct_answers": ["laptop", "laptop", "laptop", "laptop", "laptop", "mac", "laptop", "laptop", "laptop", "mac"], "difficult_direct_answer": false, "rationales": ["Because it can be easily moved from one place to another.", "Laptops can be easily moved.", "The computer is the laptop."], "image": "train2014/COCO_train2014_000000074576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538609, "question_id": "jxCW4dvntDZfw6uuCq4ctg", "question": "What type of internet device is in use at this desk?", "choices": ["desktop computer", "tablet", "smartphone", "laptop computer"], "correct_choice_idx": 0, "direct_answers": ["laptop", "laptop", "laptop", "laptop", "laptop", "laptop", "laptop", "desktop computer", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["The attached screen means it's not a desktop. the keyboard means it's not a tablet or a smartphone.", "The service is a desktop.", "The monitor and keyboard are attached to each other and it folds flat"], "image": "train2014/COCO_train2014_000000538609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477435, "question_id": "jxRJJ9bkBBQBpgcY65ZKhW", "question": "According to clock the time displayed is ten minutes past which hour?", "choices": ["nine", "one", "twelve", "two"], "correct_choice_idx": 2, "direct_answers": ["twelve", "twelve", "noon", "noon", "twelve", "noon", "twelve", "12", "twelve", "twelve"], "difficult_direct_answer": false, "rationales": ["A clock is shown with the minutes hand pointing in the area of the two on the clock which indicates ten minutes of the hour.", "It is 12:10", "The clock is ten minutes from noon."], "image": "val2014/COCO_val2014_000000477435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334760, "question_id": "jxtwF5rHgC7Fs2zzyEjKEJ", "question": "Most horses that are used for racing are ridden by professional riders called as?", "choices": ["equestrian", "jockeys", "riders", "trainers"], "correct_choice_idx": 1, "direct_answers": ["jockeys", "jockeys", "jockeys", "jockeys", "jockeys", "horses", "thoroughbred", "jockey", "jockey", "jockeys"], "difficult_direct_answer": false, "rationales": ["Horses that are ridden for racing are ridden by professionals called jockeys.", "A person is crouched down near a horse in a uniform. jockeys ride horses professionally.", "The other options don't apply to professional riders in races."], "image": "val2014/COCO_val2014_000000334760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10104, "question_id": "jxvCjA3zUhtFBts5W4NNYM", "question": "How many people are controlling and steering this motorcycle?", "choices": ["one", "three", "four", "two"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "four", "one", "one", "one", "two", "one", "one", "five"], "difficult_direct_answer": false, "rationales": ["One person is driving a motorcycle while three others perform stunts on the same motorcycle.", "Although multiple people are on the bike, only one is controlling the steering.", "There is one person driving but multiple people on the motorcycle."], "image": "val2014/COCO_val2014_000000010104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100910, "question_id": "jxyhCjSZ7nYmpnNPKR2aBJ", "question": "What type of weather is present?", "choices": ["rain", "sleet", "wind", "hurricane"], "correct_choice_idx": 2, "direct_answers": ["windy", "sunny", "summer", "wind", "windy", "windy", "windy", "sunny", "clear", "clear"], "difficult_direct_answer": false, "rationales": ["There are kites flying in the sky. the sky otherwise is mostly clear.", "Wind is needed for kites.", "Wind is required for the kites to fly. the kites are flying normally in the air."], "image": "train2014/COCO_train2014_000000100910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548164, "question_id": "jyLGmQiFX2GJdrnij7h4Fb", "question": "What period of the day is it in the image?", "choices": ["afternoon", "evening", "night", "morning"], "correct_choice_idx": 0, "direct_answers": ["afternoon", "midday", "evening", "afternoon", "afternoon", "afternoon", "noon", "daytime", "afternoon", "late afternoon"], "difficult_direct_answer": false, "rationales": ["The daytime appears to be around evening given the lack of darkness and does not appear to be morning given an abundance of light.", "A train is casting shadow in a train station.", "The sun is setting."], "image": "val2014/COCO_val2014_000000548164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195271, "question_id": "jyMS5Vqz8djLp9A9Bru5Ew", "question": "What is the woman preparing to pour from the carton in her hand?", "choices": ["eggnog", "cream", "yogurt", "milk"], "correct_choice_idx": 0, "direct_answers": ["egg nog", "egg nog", "milk", "milk", "milk", "eggnog", "milk", "milk", "milk", "eggnog"], "difficult_direct_answer": false, "rationales": ["She's drinking eggnog.", "The woman is making eggnog.", "The side of the container says egg nog on it."], "image": "val2014/COCO_val2014_000000195271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300965, "question_id": "jySHWtk2hr3RFRdCLejBaw", "question": "What type of event has happened?", "choices": ["flood", "explosion", "crash", "fire"], "correct_choice_idx": 0, "direct_answers": ["flood", "flood", "flood", "flood", "flood", "rain", "flooding", "flood", "bird watching", "duck bath"], "difficult_direct_answer": false, "rationales": ["A flood has occurred.", "The street has filled with water from heavy rain.", "There is water in front of the buildings."], "image": "train2014/COCO_train2014_000000300965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307432, "question_id": "jya6979TvD49HnLbnBLi65", "question": "What is going to approach the two men in front soon?", "choices": ["baseball", "frisbee", "bullet", "football"], "correct_choice_idx": 0, "direct_answers": ["baseball", "ball", "ball", "ball", "baseball", "baseball", "ball", "baseball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["The men are playing baseball and are waiting on the ball.", "The baseball will approach.", "The two men are at home plate and the pitch will come in soon and it will be a baseball."], "image": "train2014/COCO_train2014_000000307432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147624, "question_id": "jycnAjcWsEekM7e9PxJfon", "question": "How is the man holding the can feeling?", "choices": ["friendly", "happy", "amused", "angry"], "correct_choice_idx": 3, "direct_answers": ["mean", "annoyed", "angry", "angry", "angry", "bored", "serious", "buzzed", "angry", "unhappy"], "difficult_direct_answer": false, "rationales": ["He's angry.", "The man has a frown and his brow is furrowed.", "The man is pouting."], "image": "val2014/COCO_val2014_000000147624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225721, "question_id": "jzaJ8hvEGHEwFfk8EYta85", "question": "What does the woman here study?", "choices": ["frog", "toad", "worm", "pig"], "correct_choice_idx": 2, "direct_answers": ["snakes", "worm", "biology", "science", "worms", "biology", "biology", "biology", "earthworm", "biology"], "difficult_direct_answer": false, "rationales": ["The woman looks at a worm.", "The woman is looking at the worms.", "The woman is looking at a worm on the tray."], "image": "val2014/COCO_val2014_000000225721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400554, "question_id": "jzbcmEpTojjLTKP5Hbk8Gh", "question": "What are you instructed to do?", "choices": ["watch out", "follow car", "turn left", "stop"], "correct_choice_idx": 1, "direct_answers": ["follow car", "follow car", "follow car", "follow", "follow", "follow car", "follow", "follow", "follow", "follow"], "difficult_direct_answer": false, "rationales": ["The pilot car wants the other vehicles to follow him.", "A truck says to follow it.", "A car with a sign instructing to follow is in a construction area with big equipment nearby."], "image": "val2014/COCO_val2014_000000400554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164512, "question_id": "jzgyqNKANwvybtmqjQwabE", "question": "What does the man show off here?", "choices": ["racquet", "tennis ball", "shirt", "bicep"], "correct_choice_idx": 3, "direct_answers": ["bicep", "muscles", "muscles", "muscles", "bicep", "bicep", "muscle", "muscles", "muscles", "bicep"], "difficult_direct_answer": false, "rationales": ["The man has biceps.", "The man is flexing is right arm muscle.", "The man is flexing his muscle."], "image": "val2014/COCO_val2014_000000164512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366659, "question_id": "k2F6Fo9vywiRpYzJowrCU2", "question": "What is he doing with the pizza?", "choices": ["throwing out", "placing oven", "stealing", "removing oven"], "correct_choice_idx": 3, "direct_answers": ["moving", "removing oven", "baking it", "baking", "baking", "cooking", "cooking it", "extracting oven", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["The tool being used is commonly used for inserting or removing pizza from the oven. this pizza appears to have brown markings on it indicating it has spent some time in the oven thus meaning it is being taken out.", "The man is taking the pizza out of the oven.", "A chef is using a large wood board and has a pizza on it. large wood boards are used to put pizzas in and out of ovens."], "image": "val2014/COCO_val2014_000000366659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495388, "question_id": "k2S6AiBa2TWMd3h5Bsdip3", "question": "Which food contains the highest level of sodium?", "choices": ["bun", "sausage", "drink", "fried onion"], "correct_choice_idx": 1, "direct_answers": ["hot dog", "hot dog", "hot dog", "hot dog", "onion rings", "onion rings", "hot dog", "hot dog", "sausage", "hot dogs"], "difficult_direct_answer": false, "rationales": ["Processed meats are high in salt.", "Preservatives are used in making spicey hot dogs.", "The meat product on the bun would have the highest level of sodium."], "image": "val2014/COCO_val2014_000000495388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354690, "question_id": "k2VLMCaoFRkGPsHwqyygmn", "question": "What is the bird on the bench called?", "choices": ["hawk", "flamingo", "penguin", "stork"], "correct_choice_idx": 3, "direct_answers": ["pelican", "egret", "stork", "stork", "stork", "pelican", "stork", "goose", "pelican", "pelican"], "difficult_direct_answer": false, "rationales": ["The bird represents babies.", "The bird on the bench is large and has a large beak for swallowing birds.", "The bird is a stork."], "image": "train2014/COCO_train2014_000000354690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441856, "question_id": "k2XKGm4mMYbEvLNjCLREEL", "question": "What material is the sink made of?", "choices": ["stainless steel", "plastic", "wood", "porcelain"], "correct_choice_idx": 0, "direct_answers": ["steel", "steel", "stainless steel", "steel", "metal", "metal", "metal", "stainless steel", "wooden", "metal"], "difficult_direct_answer": false, "rationales": ["It is shiny and metal", "The material is gray which is usually the color of metal.", "The sink is stainless steel based on the shiny metal appearance."], "image": "train2014/COCO_train2014_000000441856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196016, "question_id": "k2pcgJCNdq6fo5CfC4y4i6", "question": "Why is the woman holding her phone above her head?", "choices": ["playing game", "checking signal", "watching video", "taking photo"], "correct_choice_idx": 3, "direct_answers": ["to photograph", "taking pictures", "selfie", "taking photo", "take selfie", "taking pictures", "taking selfie", "photo", "photo", "pictures"], "difficult_direct_answer": false, "rationales": ["The higher the phone the more background she can get in the picture.", "She is reaching up to take a picture.", "On a city street, a woman holds her cellphone way over her head. since she's looking at it directly, we can assume she's taking a selfie."], "image": "train2014/COCO_train2014_000000196016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37358, "question_id": "k34cnopLND2pzDz8zDCfVv", "question": "The building to the right of the hotel is used for what general purpose?", "choices": ["commerce", "government administration", "higher education", "religious services"], "correct_choice_idx": 2, "direct_answers": ["hotel", "schooling", "education", "education", "lodging", "university", "university", "business", "higher education", "learning"], "difficult_direct_answer": false, "rationales": ["The sign on the building next to the hotel is for roosevelt university which is a college where students go to be educated.", "A sign indicated that the institution is an academic center.", "Looks like it has been turned into a college building."], "image": "train2014/COCO_train2014_000000037358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70258, "question_id": "k3TMLo6hRHkqAopkwQ3R52", "question": "What does the young boy wearing plaid want to do?", "choices": ["back flip", "front flip", "catch frisbee", "dodge frisbee"], "correct_choice_idx": 2, "direct_answers": ["catch", "catch frisbee", "watch", "catch frisbee", "play", "catch", "catch frisbee", "catch frisbees", "catch frisbee", "catch frisbee"], "difficult_direct_answer": false, "rationales": ["He has his hands in the air and is under the toy", "The boy is standing with his hand outstretched facing the frisbee that is angled towards him. this is a consistent body positioning for one who wants to catch a frisbee which is one of the main objectives of frisbee.", "He is facing the frisbee with his arms outstretched, indicating he is waiting to catch it."], "image": "val2014/COCO_val2014_000000070258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50561, "question_id": "k3cVSjkzuFbinhhyEdi8Jd", "question": "What are the pastries called?", "choices": ["tea cake", "coronets", "croissants", "donuts"], "correct_choice_idx": 3, "direct_answers": ["donuts", "donuts", "donuts", "doughnuts", "donuts", "donuts", "donuts", "mini bundtcake", "donuts", "doughnuts"], "difficult_direct_answer": false, "rationales": ["They're donuts.", "The visible pastries have the tell tale hole in the center as well as the apparent texture and consistency of a donut.", "The pastries are glazed and of a shape, style and consistent that would make answer a accurate."], "image": "val2014/COCO_val2014_000000050561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202747, "question_id": "k3dbcbGkXFXFG2rxLCYYvQ", "question": "What brand is on her tank top?", "choices": ["bebe", "roxy", "wilson", "burton"], "correct_choice_idx": 0, "direct_answers": ["bebe", "bebe", "bebe", "bebe", "bebe", "bebe", "bebe", "bebe", "bebe", "bebe"], "difficult_direct_answer": false, "rationales": ["A woman is wearing a tank top with a logo on it.", "The woman's top says \"bebe\" on it.", "The top has a word that starts with \"b\"."], "image": "train2014/COCO_train2014_000000202747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110196, "question_id": "k3sbtVhZxnrRxjyr2S5dv3", "question": "According to the evidence up above where might you find the cameraman?", "choices": ["china", "south africa", "america", "india"], "correct_choice_idx": 0, "direct_answers": ["below bridge", "china", "under bridge", "china", "below", "under bridge", "lower road", "next exit", "below", "below"], "difficult_direct_answer": false, "rationales": ["Chinese text is shown on the signs.", "The symbols on the interstate signs show that they're written in chinese.", "Because of the language written on the road sign."], "image": "val2014/COCO_val2014_000000110196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542782, "question_id": "k465GMMzwu2gS4LgBeemW5", "question": "What winter sport is this group participating in?", "choices": ["snowboarding", "skiing", "sledding", "ice skating"], "correct_choice_idx": 0, "direct_answers": ["snowboarding", "snowboarding", "snow boarding", "snowboarding", "snowboarding", "snowboarding", "snow boarding", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["They are on a lift to go up the mountain to snow board down it.", "Each person has a winter sports item. each item has two, not one, foot bindings.", "They each have one large board to use and they are wearing winter gear and goggles"], "image": "val2014/COCO_val2014_000000542782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123136, "question_id": "k4JAqEdPFkxxVTAAzp7xba", "question": "What is done manually in this kitchen that is done by machines in most kitchens?", "choices": ["coffee", "wash dishes", "baking", "make bread"], "correct_choice_idx": 1, "direct_answers": ["dish washing", "dishwashing", "dish washing", "wash dishes", "wash dishes", "wash dishes", "dish washing", "washing dishes", "dishes", "stove"], "difficult_direct_answer": false, "rationales": ["There is not a dishwasher in this kitchen, and the dishes would have to be washed by hand in the sink.", "The kitchen is used to wash dishes.", "The machine can wash dishes."], "image": "val2014/COCO_val2014_000000123136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336314, "question_id": "k4My5VZvjKaRSE2EMPMM7G", "question": "What is the percent of alcohol?", "choices": ["five", "11", "60", "80"], "correct_choice_idx": 1, "direct_answers": ["eleven", "11", "11", "11", "eleven", "eleven percent", "11", "eleven", "eleven", "eleven"], "difficult_direct_answer": false, "rationales": ["On the bottle it states it has 11%.", "The wine says 11.0% alc/vol on the front.", "The wine bottle has printing on it that states the percentage."], "image": "train2014/COCO_train2014_000000336314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378455, "question_id": "k4SqXYLpj8dfpJ64sJVBAQ", "question": "Who likely owns these belongings?", "choices": ["boy", "baby", "man", "teenage girl"], "correct_choice_idx": 3, "direct_answers": ["woman", "woman", "young girl", "girl", "young girl", "little girl", "girl", "teenage girl", "girl", "travelling person"], "difficult_direct_answer": false, "rationales": ["These are mostly pink items", "The person who likely owns these belongings is likely a teenage girl for their makeup.", "The items are all pink."], "image": "train2014/COCO_train2014_000000378455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137724, "question_id": "k4e2wqee4QPfDYsyJFQS8g", "question": "Where did the person in red get their beverage?", "choices": ["krolls", "starbucks", "mamba jamba", "peets"], "correct_choice_idx": 1, "direct_answers": ["store", "starbucks", "starbucks", "starbucks", "starbucks", "starbucks", "store", "starbucks", "starbucks", "starbucks"], "difficult_direct_answer": false, "rationales": ["The starbucks logo is visible.", "The person is holding a cup with a starbucks logo.", "The person in red has a white cup. there is a mermaid logo on it."], "image": "train2014/COCO_train2014_000000137724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472102, "question_id": "k4ifHuXhMGm5QzTdzpzRvh", "question": "What do these people wait on?", "choices": ["dinner", "bus", "plane", "moss"], "correct_choice_idx": 2, "direct_answers": ["plane", "plane", "seats", "transportation", "seats", "flight", "airport chairs", "flight", "board plane", "airplane"], "difficult_direct_answer": false, "rationales": ["This is an airport", "Rows of chairs placed back-to-back, carry-on luggage, and large windows overlooking the tarmac are all things found while waiting at the airport.", "An airport terminal is shown that is full of people waiting in chairs with luggage."], "image": "val2014/COCO_val2014_000000472102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456819, "question_id": "k4kvHD8t8JghiTVrxULiAN", "question": "What type of establishment is the man in using his computer?", "choices": ["airport", "coffee bar", "bar", "cafe"], "correct_choice_idx": 2, "direct_answers": ["tavern", "bar", "bar", "bar", "bar", "bar", "bar", "bar", "bar", "night"], "difficult_direct_answer": false, "rationales": ["The man has a glass with a beer in it which is commonly found at bars. the lighting and decor are also consistent with many bars.", "The man is sitting at a bar and drinking beer while using his computer.", "He is sitting at a counter drinking a beer."], "image": "train2014/COCO_train2014_000000456819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59776, "question_id": "k4uwfVJvRYx7YWtGxaKkgF", "question": "What could the rope attached to the boat act as?", "choices": ["sail", "shimmy", "net", "anchor"], "correct_choice_idx": 3, "direct_answers": ["anchor", "anchor", "tow rope", "anchor", "anchor", "anchor", "tow rope", "anchor", "anchor", "anchor"], "difficult_direct_answer": false, "rationales": ["The boat is tied onto the buoy.", "The boat is not moving and is one place.", "It could act as an anchor to hold the boat to dock it."], "image": "train2014/COCO_train2014_000000059776.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429829, "question_id": "k4yBzAJnNx5BkBunPvkpDw", "question": "Racquet is used in which game?", "choices": ["hockey", "cricket", "baseball", "badminton"], "correct_choice_idx": 3, "direct_answers": ["tennis", "tennis", "tennis", "badminton", "tennis", "tennis", "tennis", "tennis", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["The answer is commonly known and based on the list of possible answers only answer a is viable.", "They are playing tennis which has a racquet as well.", "The sport uses a racket to hit a small object over a net."], "image": "train2014/COCO_train2014_000000429829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126732, "question_id": "k4yTt7A6K8ibeWbYJiLs8m", "question": "What is the name of the object on top of the boat's roof?", "choices": ["radio", "booster", "radar", "antenna"], "correct_choice_idx": 3, "direct_answers": ["radar", "antennae", "flag", "antenna", "antenna", "light", "radar", "radar", "radio", "no idea"], "difficult_direct_answer": false, "rationales": ["It's an antenna.", "There is a white antenna on top of the boat's roof used for radio communication.", "The object on the boat's roof is an antenna for communication."], "image": "train2014/COCO_train2014_000000126732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572362, "question_id": "k59bYxWfZAw4QELq9uoghB", "question": "Why are they using an umbrella?", "choices": ["disguise", "rain", "sun", "snow"], "correct_choice_idx": 1, "direct_answers": ["avoid rain", "block rain", "rain", "rain protection", "stay dry", "rainy day", "its raining", "raining", "block rain", "rain"], "difficult_direct_answer": false, "rationales": ["A couple is holding an umbrella. umbrellas are used to block rain.", "It's raining outside.", "It's raining."], "image": "train2014/COCO_train2014_000000572362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472439, "question_id": "k66ro6zPc5e69w8DBPNZW2", "question": "Who was the last person to have a perfect game from that position on the left?", "choices": ["mark mcgwire", "felix hernandez", "philip humber", "matt cain"], "correct_choice_idx": 1, "direct_answers": ["felix hernandez", "felix hernandez", "felix hernandez", "feliz hernandez", "felix hernandez", "felix hernandez", "feliz hernandez", "felix hernandez", "felix hernandez", "babe ruth"], "difficult_direct_answer": false, "rationales": ["Felix is widely known.", "Felix hernandez had a perfect game last.", "A man is throwing a ball from a pitcher's mound on a baseball diamond."], "image": "train2014/COCO_train2014_000000472439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318088, "question_id": "k6PioGmqcB8JVdspic2n7L", "question": "What is the small square object next to the white vase used for?", "choices": ["watching television", "exercising", "cooking", "storage"], "correct_choice_idx": 0, "direct_answers": ["computer screen", "tv", "tv", "watching television", "compeer", "television", "television", "entertainment", "watching tv", "protection"], "difficult_direct_answer": false, "rationales": ["The object in question is the size, shape and style with a screen consistent with answer a.", "The object is to watch tv.", "The object is clearly a television based on the screen and general structure. televisions are primarily used for watching."], "image": "train2014/COCO_train2014_000000318088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302193, "question_id": "k6mPcKNrCiJf8jhfH8aPWi", "question": "In the event of a fire what could be blamed?", "choices": ["smokers", "clock", "lights", "torch"], "correct_choice_idx": 3, "direct_answers": ["torches", "tiki torches", "tiki lights", "torches", "fire lamps", "torches", "torches", "torch", "tiki torches", "spears"], "difficult_direct_answer": false, "rationales": ["The torches are lit.", "The torch would be blamed.", "Torches are lit and blowing along a street."], "image": "val2014/COCO_val2014_000000302193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182728, "question_id": "k6qS34hWrXVcZB2zDYabqX", "question": "How does the man know the boy?", "choices": ["grandfather", "parent", "priest", "doctor"], "correct_choice_idx": 1, "direct_answers": ["father", "son", "his son", "father", "son", "his son", "son", "parent", "son", "son"], "difficult_direct_answer": false, "rationales": ["He appears to be about 30 years older and they have a close, trusting relationship.", "He's teaching the boy to surf", "The man appears to be too young to be a grandfather so \"parent\" makes the most sense."], "image": "train2014/COCO_train2014_000000182728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534889, "question_id": "k6rax22VtwoS4mjhXwrccL", "question": "What does the man look up at?", "choices": ["moon", "crows", "sun", "tennis ball"], "correct_choice_idx": 3, "direct_answers": ["tennis ball", "tennis ball", "ball", "tennis ball", "ball", "tennis ball", "tennis ball", "ball", "tennis ball", "ball"], "difficult_direct_answer": false, "rationales": ["The man is staying focused on his tennis match.", "The person is playing a sport that involves using a racquet to hit an item over a net. he is looking at that item.", "He's judging where he needs to swing"], "image": "train2014/COCO_train2014_000000534889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282425, "question_id": "k756CQMAiPiKYRbFzsQkXJ", "question": "In what type of building is this room found?", "choices": ["deli", "store", "restaurant", "hotel"], "correct_choice_idx": 3, "direct_answers": ["hotel", "hotel", "residential", "hotel", "hotel", "motel", "hotel", "apartment building", "hotel", "hotel room"], "difficult_direct_answer": false, "rationales": ["The building in this room is found inside of a hotel, because there are papers on the table.", "It is a place to relax, but it is too clean and sterile to be someone's home.", "It's in a hotel room."], "image": "train2014/COCO_train2014_000000282425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84474, "question_id": "k7S6J5unV2xcexoodgj6XS", "question": "What is this plane called?", "choices": ["seaplane", "lear jet", "fighter", "helicopter"], "correct_choice_idx": 0, "direct_answers": ["jet", "amphibious plane", "seaplane", "seaplane", "seaplane", "boat plane", "seaplane", "seaplane", "sea plane", "seaplane"], "difficult_direct_answer": false, "rationales": ["The plane is a seaplane.", "A plane is on the water at a dock.", "The plane is currently docked and floating on pontoons in the water. the only plane with this capability is known as answer a."], "image": "val2014/COCO_val2014_000000084474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130245, "question_id": "k7SNZLrWqMUNH2bWHQj4zY", "question": "What type of tennis game is being played here?", "choices": ["doubles", "singles", "handball", "mixed doubles"], "correct_choice_idx": 1, "direct_answers": ["professional", "singles", "singles", "singles", "singles", "tournament", "legs mason", "classic", "singles", "match"], "difficult_direct_answer": false, "rationales": ["The names of the players are displayed on the board in the background. there are two types of tennis played, doubles and singles, and there is only one name displayed per line which is consistent for singles but if it were doubles both partners names would be displayed.", "The game is singles.", "There is only one player on the side of the tennis court playing."], "image": "train2014/COCO_train2014_000000130245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390177, "question_id": "k7Wwn5fSTwx9aL2tJAAwt8", "question": "The billboard on the building is advertising for which science fiction franchise?", "choices": ["stargate", "dune", "star wars", "star trek"], "correct_choice_idx": 3, "direct_answers": ["star trek", "star trek", "movies", "star trek", "unknown", "star trek", "movies", "star trek", "star trek", "star trek"], "difficult_direct_answer": false, "rationales": ["The sign on the building is advertising for a star trek movie.", "The last 3 letters are visible on the bottom", "A star trek movie poster can be seen on the side of a building. star trek is a popular science fiction franchise."], "image": "train2014/COCO_train2014_000000390177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218397, "question_id": "k8B9PMkhEAxMurbFt9AU9b", "question": "Why are there more people than kites?", "choices": ["lost some", "lazy people", "some hidden", "mostly spectators"], "correct_choice_idx": 3, "direct_answers": ["observers", "kite watching", "spectators", "non-kiting spectators", "show", "picnic", "show", "watching", "venue", "mostly spectators"], "difficult_direct_answer": true, "rationales": ["People like to watch events with kites.", "Many of them are relaxing in chairs, showing they are just there to watch.", "Not everyone is flying one, some are just sitting to watch."], "image": "val2014/COCO_val2014_000000218397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45099, "question_id": "k8JqJpLeono6sGQcu3LCeC", "question": "Which feature of the camera poses harm to the person taking a photograph of a mirror?", "choices": ["selfie stick", "sounds", "flash", "dark mode"], "correct_choice_idx": 2, "direct_answers": ["flash", "flash", "strap", "flash", "flash", "strap", "flash", "flash", "flash", "strap"], "difficult_direct_answer": false, "rationales": ["The feature is flash.", "The person with the camera is facing a mirror and could accidentally flash the light at herself.", "A flash can cause a bright light and mirrors reflect light."], "image": "val2014/COCO_val2014_000000045099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301453, "question_id": "k8bCifdcRKwRNfrTNTCZwD", "question": "What stops the motor bike from tipping over?", "choices": ["braces", "bricks", "wall", "person"], "correct_choice_idx": 0, "direct_answers": ["stand", "kickstand", "kickstand", "stand", "stand", "braces", "kickstand", "stand", "kickstand", "stand"], "difficult_direct_answer": false, "rationales": ["The structures that are around the wheels are clearly visible and have the foundation that would be required to keep a two-wheeled vehicle upright and prevent tipping.", "The braces keep the bike from tipping.", "The bike is put on a stand."], "image": "val2014/COCO_val2014_000000301453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577122, "question_id": "k8eBgbgTxNEAUsZaS34Pt8", "question": "What is made by the plugged in item?", "choices": ["steak", "toast", "tea", "apple pie"], "correct_choice_idx": 2, "direct_answers": ["coffee", "coffee", "coffee", "tea", "tea", "coffee", "tea", "coffee", "coffee", "tea"], "difficult_direct_answer": false, "rationales": ["It is an electric kettle used to heat water.", "The kettle is plugged into the wall in order to heat up water for things like liquid drinks.", "There is a cork in the top of a bottle. the liquid is alcohol and be derived from the grape."], "image": "train2014/COCO_train2014_000000577122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372229, "question_id": "k8mTAnSYVBNZsepgzMXDku", "question": "What can be said about the business behind him?", "choices": ["just opened", "busy", "closed", "having hardships"], "correct_choice_idx": 2, "direct_answers": ["closed", "closed", "closed", "closed", "secure", "closed", "closed", "closed", "closed", "closed"], "difficult_direct_answer": false, "rationales": ["The security gate is pulled down over the entrance to the business.", "It's closed.", "The business has the gate up."], "image": "train2014/COCO_train2014_000000372229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413533, "question_id": "k9HN9Jn2YmTxoAtcHPoAfz", "question": "What is seen in the sky?", "choices": ["kite", "airplane", "bird", "helicopter"], "correct_choice_idx": 1, "direct_answers": ["airplane", "airplane", "airplane", "airplane", "airplane", "plane", "airplane", "fixed-wing", "aircraft", "airplane"], "difficult_direct_answer": false, "rationales": ["This is obvious given the shape of the aircraft.", "It is metal, has wings and engines with propellers", "The plane is hanging out in the air."], "image": "train2014/COCO_train2014_000000413533.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463211, "question_id": "k9ZvwCD3kgqxTbCuJ2tTaR", "question": "What does the girl awaiting the bus have behind her?", "choices": ["rotisserie", "backpack", "masks", "stalker"], "correct_choice_idx": 1, "direct_answers": ["backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack", "backpack"], "difficult_direct_answer": false, "rationales": ["We can see the top of this girl's backpack traditionally worn by children to school; which this school bus will likely take her to.", "The girl has a backpack.", "There is only one clear girl waiting for the bus and the picture includes a partial backpack. the child is approaching a school bus and likely going to school which validates why she may be wearing a backpack."], "image": "val2014/COCO_val2014_000000463211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332109, "question_id": "kA5UZwTjgm9XjxFTj5Cw2X", "question": "How many items on the table are to the left of the clock?", "choices": ["five", "one", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "one items", "one", "one", "one", "one", "one items", "one", "one"], "difficult_direct_answer": false, "rationales": ["There are matching items on both sides of the clock, so there is one on the left and one on the right.", "One item is to the left of the clock.", "Only one is on the left."], "image": "train2014/COCO_train2014_000000332109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496874, "question_id": "kABU8Bbz5WJ6Z4ZkWEcjgx", "question": "What type of vehicle is shown?", "choices": ["subway", "bus", "car", "train"], "correct_choice_idx": 2, "direct_answers": ["four", "classic", "truck", "chevy truck", "truck", "truck", "old truck", "classic car", "classic truck", "car"], "difficult_direct_answer": false, "rationales": ["It's a shorter vehicle and has the common features of a car.", "It's a car.", "The vehicle is actually a truck."], "image": "train2014/COCO_train2014_000000496874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286322, "question_id": "kARzC5vZMRWf7tVhhHgzUQ", "question": "The person that invented the item with the yellow lid was from what country?", "choices": ["sweden", "russia", "thailand", "germany"], "correct_choice_idx": 3, "direct_answers": ["usa", "germany", "germany", "germany", "germany", "germany", "australia", "england", "australia", "germany"], "difficult_direct_answer": false, "rationales": ["The product was discovered as an edible item by a scientist from this country.", "Marmite is a paste that is a popular condiment used in germany.", "The person is from germany."], "image": "val2014/COCO_val2014_000000286322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224272, "question_id": "kASaeeg5bdcjH5kCJM25Gg", "question": "How many modes of transportation are pictured here including the children's toy?", "choices": ["ten", "seven", "one", "four"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "two", "two", "two", "four", "three"], "difficult_direct_answer": false, "rationales": ["There are 4 modes.", "There are four modes shown.", "Two bicycles, a car and a child's wheeled toy."], "image": "train2014/COCO_train2014_000000224272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231958, "question_id": "kATJkRgYdumuieRVazhhkX", "question": "What are the people sending into the air?", "choices": ["kites", "balloons", "doves", "fireworks"], "correct_choice_idx": 0, "direct_answers": ["kites", "kites", "kites", "kites", "kites", "kites", "kites", "kites", "kite fly", "kites"], "difficult_direct_answer": false, "rationales": ["That's what people are flying in the park.", "The people send kites.", "They have pieces of material tied to strings. the material will fly in the wind."], "image": "train2014/COCO_train2014_000000231958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121452, "question_id": "kB4GRdsynSn9hvo8eQASBc", "question": "Who constructed the obstacle being jumped here?", "choices": ["stuntman", "skateboard maker", "flying skateboarder", "milk man"], "correct_choice_idx": 2, "direct_answers": ["male", "boy", "skater", "man", "friends", "flying skateboarder", "young man", "skateboarder", "human", "skateboarder"], "difficult_direct_answer": true, "rationales": ["An obstacle has been created in a driveway. a skateboarder is jumping over the obstacle. skateboard tricks often involve jumping.", "The object is for skateboarding.", "The skateboarder probably set this up for himself."], "image": "train2014/COCO_train2014_000000121452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337420, "question_id": "kB88zRYXRkgko24aNWVqd2", "question": "Is GPS attached in the paragliding?", "choices": ["no", "only radio", "yes", "none"], "correct_choice_idx": 2, "direct_answers": ["gps maybe", "sometimes gps", "yes", "yes", "yes", "yes", "yes", "no", "yes", "yes"], "difficult_direct_answer": false, "rationales": ["There is nothing attached to the kites.", "There is a tracking system for safety on the para gliders.", "There is a gps."], "image": "val2014/COCO_val2014_000000337420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8385, "question_id": "kBPV8SNbeZgDtkkbz2beA7", "question": "What is the purpose of the trunk?", "choices": ["to throw", "to decorate", "to heal", "to travel"], "correct_choice_idx": 3, "direct_answers": ["storage", "to travel", "possession storage", "safekeeping", "transport items", "hold clothes", "storage", "storage", "items", "hold clothing"], "difficult_direct_answer": false, "rationales": ["This holds clothing and other items people need to take with them", "The purpose of trunks is commonly known and consistent with the setting of a train station.", "People will pack things when they go somewhere."], "image": "train2014/COCO_train2014_000000008385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176969, "question_id": "kBdmkGGPu9ZvMogqxnWAu2", "question": "What type of object have the toothbrushes been modified to resemble?", "choices": ["dogs", "giraffes", "horses", "people"], "correct_choice_idx": 3, "direct_answers": ["diving board", "shelf", "levers", "giraffe", "people", "spear", "giraffe", "people", "clamps", "divers"], "difficult_direct_answer": false, "rationales": ["This is called an anthropomorphic display or sculpture.", "The toothbrushes seem to be doing squats.", "Toothbrushes are in holders that resemble people on all fours."], "image": "train2014/COCO_train2014_000000176969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411480, "question_id": "kBw8R3j3oYo2o6R7bMZKB9", "question": "What poses the gravest danger to the person under the blue umbrella here?", "choices": ["falling", "none", "lightning", "rain"], "correct_choice_idx": 0, "direct_answers": ["falling", "cliff", "falling", "falling", "cliff", "falling", "falling", "slipping", "falling", "falling"], "difficult_direct_answer": false, "rationales": ["He is standing on the edge of a very tall cliff.", "This child is near a huge drop off.", "The person with the umbrella is in great danger of falling."], "image": "val2014/COCO_val2014_000000411480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227186, "question_id": "kC9A6qBrPomxUAeougThqB", "question": "Who manufactured the car in the background?", "choices": ["chevrolet", "dodge", "bmw", "mercedes"], "correct_choice_idx": 3, "direct_answers": ["mercedes benz", "mercedes benz", "mercedes", "mercedes benz", "mercedes-benz", "mercedes", "mercedes", "mercedes", "mercedes", "mercedes benz"], "difficult_direct_answer": false, "rationales": ["Mercedes manufacturer the car.", "The manufacturer is mercedes.", "The emblem on the front of the car is for a benz."], "image": "train2014/COCO_train2014_000000227186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266021, "question_id": "kCJLxj4NtLqrwwPQ2UoJYH", "question": "What is the sand on the beach made of?", "choices": ["rock chips", "coarse mud", "calcium carbonate", "unknown"], "correct_choice_idx": 2, "direct_answers": ["crushed shells", "rocks", "rocks", "silica", "calcium carbonate", "dirt", "quartz", "quartz", "sand", "rocks"], "difficult_direct_answer": false, "rationales": ["Many grains of finely ground stone.", "Sand is made of rock that's been ground up into tiny chips.", "The sand is made of carbonate."], "image": "val2014/COCO_val2014_000000266021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437872, "question_id": "kCRuAUDUnvJK72uEWgtgXr", "question": "What is the man doing to the child's hair?", "choices": ["combing it", "cutting it", "braiding it", "coloring it"], "correct_choice_idx": 0, "direct_answers": ["brushing", "brushing", "brushing", "brushing", "brushing", "brushing", "brushing", "combing it", "brushing", "brushing"], "difficult_direct_answer": false, "rationales": ["The man is holding a comb to the girl's hair.", "He is running a tool through it to get any tangles out.", "A man is sitting next to a child and he is holding a brush to her hair."], "image": "val2014/COCO_val2014_000000437872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215664, "question_id": "kChLpmWiUmaupjJcqh2YcC", "question": "What are the people behind the green wall doing?", "choices": ["eating", "resting", "gaming", "spectating"], "correct_choice_idx": 3, "direct_answers": ["watching", "spectating", "spectating", "watching", "watching tennis", "watching", "watching", "spectating", "watching", "watching"], "difficult_direct_answer": false, "rationales": ["Spectators usually wait behind the green wall in tennis.", "It is a professional tennis match and they are in the stands where spectators sit to watch.", "They are watching the game"], "image": "train2014/COCO_train2014_000000215664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33527, "question_id": "kChT3GYf7f97SKwgWkSv73", "question": "What age are these cats?", "choices": ["young", "middle aged", "old", "adult"], "correct_choice_idx": 0, "direct_answers": ["baby", "babies", "6 months", "kittens", "couple months", "infants", "kittens", "young", "8 weeks", "kittens"], "difficult_direct_answer": false, "rationales": ["The cats are young.", "They are very small compared to the size of the remote so they are kittens", "They are still really small so they are still babies."], "image": "train2014/COCO_train2014_000000033527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245221, "question_id": "kCkFcoELU3QegYjEbdLK3y", "question": "How are these boats powered?", "choices": ["solar", "wind", "paddle", "gas"], "correct_choice_idx": 1, "direct_answers": ["wind", "wind", "wind", "sail", "sails", "sails", "sails", "sails", "wind", "motor"], "difficult_direct_answer": false, "rationales": ["The sails use wind to power up.", "The boats are sailboats.", "They have sails and masts."], "image": "train2014/COCO_train2014_000000245221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171585, "question_id": "kCpV5L5eVgngUzDa2rMnBx", "question": "What is the boy swinging?", "choices": ["club", "racquet", "baseball bat", "sneakers"], "correct_choice_idx": 1, "direct_answers": ["racquet", "tennis racket", "racket", "tennis racket", "tennis racket", "tennis racket", "tennis racket", "racket", "racket", "tennis racket"], "difficult_direct_answer": false, "rationales": ["This sport is played on a court and requires a ball and flat hand-held device.", "Based on the setting and the size and shape of the object in the boy's hand, answer a is consistent.", "The boy is swinging his racquet."], "image": "train2014/COCO_train2014_000000171585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197130, "question_id": "kCq7KLyxgxeDAvc36KPRPG", "question": "Which direction will the bus go next?", "choices": ["turn right", "go straight", "turn left", "back up"], "correct_choice_idx": 3, "direct_answers": ["back up", "right", "reverse", "backwards", "straight", "right", "right", "right", "girl walking", "right"], "difficult_direct_answer": false, "rationales": ["In front of the bus there is visibly a cone and other items blocking the road that would prevent it from continuing in that direction. the bus has lights on the back that also indicate it is about to reverse.", "The bus will move forward.", "A bus is stopped because there is a stand in front of it. it will have to go the other way to get out."], "image": "val2014/COCO_val2014_000000197130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172886, "question_id": "kCzXbiFhWgPvwh5HsXJSbP", "question": "What typical bathroom item is integrated into the wall that normally is free standing?", "choices": ["plumbing", "hair dryer", "mirror", "sink"], "correct_choice_idx": 1, "direct_answers": ["sink", "sink", "sink", "sink", "shower", "sink", "hair dryer", "sink", "sink", "shower"], "difficult_direct_answer": false, "rationales": ["The hair dryer is stuck in the wall.", "Hair dryers are usually not stored on the wall.", "The bathroom has a hair dryer."], "image": "val2014/COCO_val2014_000000172886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36218, "question_id": "kDcXYaeTqPmSPRuMUeBStt", "question": "The man is wearing what?", "choices": ["feather boa", "cowboy hat", "sandals", "gas mask"], "correct_choice_idx": 1, "direct_answers": ["suit", "cowboy hat", "cowboy hat", "cowboy hat", "cowboy hat", "hat", "cowboy hat", "cowboy hat", "suit", "suit jacket"], "difficult_direct_answer": false, "rationales": ["The man is wearing a cowboy hat.", "The man is a cowboy.", "It's a stetson type hat."], "image": "train2014/COCO_train2014_000000036218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571950, "question_id": "kDd2pK8x6RCaVaVRMbefCv", "question": "What is under this man?", "choices": ["water", "shadow", "grass", "sand"], "correct_choice_idx": 1, "direct_answers": ["shadow", "skateboard", "skateboard", "curb", "skateboard", "skateboard", "skateboard", "sidewalk", "skateboard", "curb"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this scene.", "The sun is shining down from above the man with the skateboard.", "The area is darkened and cast due to the lighting."], "image": "train2014/COCO_train2014_000000571950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385326, "question_id": "kDdP5QY6KG6aDuFwXcZ8MK", "question": "How many citrus fruits are there in the image?", "choices": ["three", "one", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["twenty", "two", "two", "one", "approximately 2", "one", "many", "one", "four", "one"], "difficult_direct_answer": false, "rationales": ["Oranges are a citrus fruit. there are oranges in the image.", "Oranges are in slices on a table people are getting food from.", "There is only one on the table."], "image": "train2014/COCO_train2014_000000385326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219544, "question_id": "kDe5HmcBH4rLdrJzWn2zRF", "question": "What kind of event has occurred in this scene?", "choices": ["car accident", "flooding", "earthquake", "landslide"], "correct_choice_idx": 3, "direct_answers": ["mudslide", "landslide", "landslide", "dirt slide", "mudslide", "landslide", "landslide", "mudslide", "landslide", "landslide"], "difficult_direct_answer": false, "rationales": ["Roads with rock cliffs next to them are usually kept in check. there is much dirt and rocks on the road here so it's safe to assume there has been a landslide.", "A lot of rocks and dirt slid down the mountain onto the road and blocked it", "Road worker vehicles are on the side of a road that is on the side of a steep rock and dirt face and dirt is piled in the road."], "image": "train2014/COCO_train2014_000000219544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65063, "question_id": "kDuKvrdXRttbYRPQsCzTKq", "question": "What is it called when this animal moves?", "choices": ["trot", "slither", "slime", "roll"], "correct_choice_idx": 0, "direct_answers": ["gallop", "gallop", "resisting", "gallop", "gallop", "trot", "trot", "gallop", "trot", "horse trot"], "difficult_direct_answer": false, "rationales": ["The horse trots.", "The horse is running.", "When a horse runs, they trot."], "image": "train2014/COCO_train2014_000000065063.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173759, "question_id": "kEEoDZYucMRjVskGu5pxEC", "question": "What country is the flag painted on the motorcycle above belong to?", "choices": ["united states", "switzerland", "great britain", "mexico"], "correct_choice_idx": 0, "direct_answers": ["usa", "united states", "united states", "usa", "usa", "usa", "usa", "united states", "usa", "america"], "difficult_direct_answer": false, "rationales": ["It is the stars and stripes", "The motorcycle is painted with the stars and stripes from the american flag.", "The flag is the american flag with the stars and stripes and is well known."], "image": "train2014/COCO_train2014_000000173759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537879, "question_id": "kEuDLnyWG8jHRDcvgqXCqB", "question": "When was the double-decker bus invented?", "choices": ["1910", "1893", "1900", "1906"], "correct_choice_idx": 3, "direct_answers": ["1906", "1847", "1847", "1847", "1847", "1847", "1900's", "last century", "1828", "unknown"], "difficult_direct_answer": false, "rationales": ["It first came out in that year.", "The bus is from 1906.", "It was invented in 1906"], "image": "train2014/COCO_train2014_000000537879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495568, "question_id": "kFCuU4Q6QecRdU7xqQNAKG", "question": "What can be listened to whose info is on the tent?", "choices": ["politician", "symphony", "radio", "indie mixtape"], "correct_choice_idx": 2, "direct_answers": ["radio station", "radio station", "radio", "radio station", "jazz", "radio", "jazz", "radio station", "jazz music", "jazz music"], "difficult_direct_answer": false, "rationales": ["Jazz fm is a radio station.", "The radio can be heard.", "The writing on the tent is advertising an fm station called jazz fm 91."], "image": "train2014/COCO_train2014_000000495568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398440, "question_id": "kFL7tCPZHGymVypp3FgNyg", "question": "What are the cows looking at?", "choices": ["dog", "rat", "lion", "cat"], "correct_choice_idx": 0, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The cows are looking at the dog.", "It is the only non-cow animal sitting in the middle of the pen.", "While the dog sits patiently, all the cow's heads are directed toward it with curiosity."], "image": "val2014/COCO_val2014_000000398440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242852, "question_id": "kFLtMdE9sfuomVzdto2RVc", "question": "What kind of course is the vehicle going to take?", "choices": ["go straight", "gradual turn", "reverse course", "hairpin turn"], "correct_choice_idx": 1, "direct_answers": ["winding course", "track", "railroad", "gradual turn", "loop", "left", "rail", "train", "train route", "circle"], "difficult_direct_answer": true, "rationales": ["The course takes a turn.", "The train is going to follow along the tracks. it will be making a slight curve to the left.", "The engine of the train is visible which indicates the direction it is heading. the track in front of the train is visible so if it were to continue on its course it would gradually turn."], "image": "val2014/COCO_val2014_000000242852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259858, "question_id": "kFdhwMwvmbxiLD8KxmYp9Y", "question": "How were the houses on this land built?", "choices": ["by hand", "manufacturer", "power tools", "machine"], "correct_choice_idx": 0, "direct_answers": ["adobe", "mud", "carved", "clay", "with mud", "clay", "clay", "by hand", "adobe", "with sand"], "difficult_direct_answer": false, "rationales": ["Houses made of sand have livestock in front of them. the houses lack perfectly straight lines.", "They are made of clay that people put together themselves.", "By looking at the houses, we can assume that they were not built in any type of sophisticated, modern way."], "image": "val2014/COCO_val2014_000000259858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520503, "question_id": "kFfbbpyZRjg8AYnVy7hBqW", "question": "What are the yellow lines on the road warning the drivers about?", "choices": ["boats", "semis", "trolley's", "trains"], "correct_choice_idx": 2, "direct_answers": ["sign", "no passing", "stop", "street sides", "don't cross", "lane rules", "trolley's", "railcar", "train", "pedestrians"], "difficult_direct_answer": true, "rationales": ["The yellow lines are warning of the train tracks.", "There is a mass transportation on the other side of the yellow line.", "The lines are for trolleys."], "image": "val2014/COCO_val2014_000000520503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391333, "question_id": "kFjMw9QPPko3mfcrzH3Kky", "question": "What is the white water the boat creates called?", "choices": ["wake", "fog", "speed water", "clouds"], "correct_choice_idx": 0, "direct_answers": ["boat wake", "wake", "wake", "whitewater", "wake", "wake", "wake", "foam", "wake", "wake"], "difficult_direct_answer": false, "rationales": ["This is the word for boat waves", "The white water behind the boat is called wake that is created by moving through water.", "A boat, or any object, moving through the water, moves water, creating a wake."], "image": "train2014/COCO_train2014_000000391333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135576, "question_id": "kFokLE4JXqtvgyavnFmpBT", "question": "Which food provides the most starch?", "choices": ["biscuit", "potato", "egg", "pastry"], "correct_choice_idx": 1, "direct_answers": ["bread", "potatoes", "potatoes", "potato", "potatoes", "potatoes", "potato", "bread", "potato", "potatoes"], "difficult_direct_answer": false, "rationales": ["Potatoes are starchy.", "These types of foods are usually really full of carbohydrates compared to other vegetables.", "These potatoes are very starchy."], "image": "train2014/COCO_train2014_000000135576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362599, "question_id": "kFpbPGXVzBGedsVSxmv9L3", "question": "What is hanging on the cabinet handle?", "choices": ["soap", "bananas", "garbage bag", "keys"], "correct_choice_idx": 2, "direct_answers": ["plastic bag", "bags", "towel", "garbage bag", "magnet", "bags", "towel", "plastic bags", "kitchen towels", "bag"], "difficult_direct_answer": false, "rationales": ["There are is a bag on the handle.", "A garbage bag is hanging.", "Sometimes, plastic grocery or shopping bags are used to dispose of trash."], "image": "val2014/COCO_val2014_000000362599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233017, "question_id": "kGFeNFu6pLYBQa9pRh3WQm", "question": "What does the first S stand for?", "choices": ["siam", "singapore", "suriname", "sri lanka"], "correct_choice_idx": 1, "direct_answers": ["singapore", "singapore", "singapore", "singapore", "singapore", "singapore", "singapore", "sensoang", "sensoang", "singapore"], "difficult_direct_answer": false, "rationales": ["The name of the country is represented in the bus name.", "A commercial bus is shown with logos on the side.", "Per google, this is the right selection."], "image": "val2014/COCO_val2014_000000233017.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86121, "question_id": "kGHNQF9htpFYhGWBzck96f", "question": "What animals legs are closest to the ground here?", "choices": ["birds", "pigs", "bears", "giraffes"], "correct_choice_idx": 0, "direct_answers": ["goose", "birds", "all giraffes", "ducks", "birds", "geese", "ducks", "geese", "geese", "goose"], "difficult_direct_answer": false, "rationales": ["The ducks are closest to the ground.", "The birds are shorter.", "Geese have shorter legs."], "image": "train2014/COCO_train2014_000000086121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39791, "question_id": "kGT8ZTGcDnVsqKkivGhCxi", "question": "Who is in the greatest risk of being attacked?", "choices": ["little boy", "man", "woman", "black dog"], "correct_choice_idx": 0, "direct_answers": ["kid", "toddler", "dog", "child", "kid", "little kid", "child", "boy", "little boy", "child"], "difficult_direct_answer": false, "rationales": ["Since he is so small he is the easiest target", "The child is closest to the dog. he would not know how to interact with the dog properly.", "There is a brown dog. the woman, man, and black dog are far away from the brown dog."], "image": "train2014/COCO_train2014_000000039791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528624, "question_id": "kGTcsi2ixuktwHPJRDGGqi", "question": "What are they doing?", "choices": ["resting", "playing", "waiting", "fighting"], "correct_choice_idx": 2, "direct_answers": ["waiting", "playing baseball", "waiting", "baseball", "baseball", "playing baseball", "standing around", "standing", "fielding", "standing around"], "difficult_direct_answer": false, "rationales": ["They are waiting in their positions until the ball comes to them and they can play.", "They are waiting for the game to start.", "The kids are waiting for activity on the field to occur."], "image": "train2014/COCO_train2014_000000528624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17365, "question_id": "kGazcE5xVDxeVjvKF7VGvK", "question": "What are the poles sticking out of the ground near the man wearing yellow?", "choices": ["goal posts", "decoration", "light poles", "trail markers"], "correct_choice_idx": 3, "direct_answers": ["boundry markers", "warning skiers", "route marker", "trail markers", "markers", "markers", "trail guides", "direction poles", "boundary markers", "jacket"], "difficult_direct_answer": true, "rationales": ["The poles sticking out of the ground near the man wearing yellow are used to mark the trail for skiers.", "The poles are markers.", "The poles are for skiers to find their way."], "image": "val2014/COCO_val2014_000000017365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120939, "question_id": "kGg2R8YPiA3b2cQpBiWreL", "question": "What is the name for the table the woman is sitting at?", "choices": ["picnic table", "end table", "kitchen table", "conference table"], "correct_choice_idx": 0, "direct_answers": ["picnic table", "picnic table", "picnic table", "picnic table", "picnic table", "picnic table", "picnic table", "picnic table", "picnic", "picnic"], "difficult_direct_answer": false, "rationales": ["The name is a picnic table.", "This table is usually found outside so people can eat together.", "The table is for picnics."], "image": "train2014/COCO_train2014_000000120939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556637, "question_id": "kGhSJJVcdVPgvj7HJVh7gu", "question": "Which character wears a shirt with a similar pattern to this man's shirt?", "choices": ["peter pan", "linus", "donald duck", "daffy duck"], "correct_choice_idx": 1, "direct_answers": ["unknown", "charlie brown", "waldo", "waldo", "linus", "charlie brown", "waldo", "addams", "waldo", "prisoner"], "difficult_direct_answer": false, "rationales": ["Linus has a striped shirt.", "A man is wearing a gray shirt with white stripes.", "Linus has a striped shirt."], "image": "train2014/COCO_train2014_000000556637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410707, "question_id": "kGrMzpFz99i3k636JnRbcE", "question": "Who is the main male character in that video game?", "choices": ["link", "kong", "zelda", "mario"], "correct_choice_idx": 0, "direct_answers": ["adam", "zelda", "guy", "link", "zelda", "zelda", "link", "link", "luigi", "zelda"], "difficult_direct_answer": false, "rationales": ["The video game is zelda. zelda is the main female character.", "He is a character on the game zelda.", "The blonde character is the trademark hero in the zelda series which is what is shown on the game case next to the wii box."], "image": "train2014/COCO_train2014_000000410707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367011, "question_id": "kGtoN4E5e7HXFrXpMgFaJt", "question": "What is the blue structure located above the railroad tracks used as?", "choices": ["bike trail", "pedestrian walkway", "road", "highway"], "correct_choice_idx": 2, "direct_answers": ["bridge", "traffic", "bridge", "bridge", "road", "cars", "bridge", "bridge", "bridge", "bridge"], "difficult_direct_answer": false, "rationales": ["The blue structure allows cars to pass from above.", "The blue structure is part of the road going across the tracks.", "The structure is a road."], "image": "train2014/COCO_train2014_000000367011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308034, "question_id": "kGxFWpYK5QSg2eAY7SBdKZ", "question": "What does the boat carry?", "choices": ["electronics", "books", "animals", "food"], "correct_choice_idx": 3, "direct_answers": ["produce", "produce", "food", "food", "produce", "produce", "fruits", "food", "food", "fruits vegetables"], "difficult_direct_answer": false, "rationales": ["The boat has food.", "A large boat is filled with various types of produce.", "There are many boxes of fresh produce."], "image": "train2014/COCO_train2014_000000308034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115519, "question_id": "kHCHzksApZZ9y7QsVF7985", "question": "Where are these people gathered?", "choices": ["hospital", "home", "museum", "restaurant"], "correct_choice_idx": 1, "direct_answers": ["dinner", "dining room", "dining room", "dinner", "dinner table", "home", "dinner", "home", "dining room", "table"], "difficult_direct_answer": false, "rationales": ["They're at home.", "The people are in a dining room. it is too small to be part of a restaurant.", "Most homes have a dining table that family gathers around for meals."], "image": "train2014/COCO_train2014_000000115519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126657, "question_id": "kHDUcibxffErHgmncnKFPU", "question": "What nickname does the front bike often have?", "choices": ["chopper", "killer", "winger", "rider"], "correct_choice_idx": 0, "direct_answers": ["chopper", "hog", "harley", "hog", "front runner", "pack leader", "chopper", "hog", "hog", "lead"], "difficult_direct_answer": false, "rationales": ["This is the type of bike that this is called.", "The front bike is the chopper.", "The name is a chopper."], "image": "val2014/COCO_val2014_000000126657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265324, "question_id": "kHVRnnQ75D8cNHVFAggn28", "question": "What chemical can unclog a toilet?", "choices": ["hcl", "ncl", "acetic acid", "sulfuric acid"], "correct_choice_idx": 3, "direct_answers": ["sulfuric acid", "baking soda", "lysol", "sulfuric acid", "borax", "no chemical", "lysol", "baking soda", "sulfuric acid", "nothing"], "difficult_direct_answer": false, "rationales": ["Sulfuric acid unclogs toilets.", "Sulfuric acid breaks down gunk in the pipes.", "A bathroom has a large tub that is full."], "image": "train2014/COCO_train2014_000000265324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492107, "question_id": "kHpEsZFGnGcvKyaGbShZwB", "question": "What is the next step the people are going to do with the pizzas?", "choices": ["pan fry", "steam", "grill", "bake"], "correct_choice_idx": 3, "direct_answers": ["bake it", "oven", "bake", "cook", "garnish", "cook them", "eat", "cook", "bake", "bake"], "difficult_direct_answer": false, "rationales": ["The people are preparing raw pizzas and need to be placed in the oven next in order to eat them.", "The step is to bake.", "In order to cook the pizza you have to put it in the oven."], "image": "val2014/COCO_val2014_000000492107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207862, "question_id": "kJ36oX4mb6CAoGTLgtASje", "question": "How do you know someone here is probably married?", "choices": ["wedding ring", "wedding dress", "wedding photo", "wedding procession"], "correct_choice_idx": 0, "direct_answers": ["wedding ring", "wedding ring", "wedding ring", "setting", "wedding ring", "wedding ring", "single rose", "wedding ring", "ring finger", "ring"], "difficult_direct_answer": false, "rationales": ["There is a man wearing a wedding ring at the table that means he is probably married.", "There is a ring around a person's finger, which denotes marriage.", "The wedding ring."], "image": "train2014/COCO_train2014_000000207862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279331, "question_id": "kJJZfvtMayeyaYJEQTwnCa", "question": "What is probably at the other end of the leash?", "choices": ["person", "dog", "racket", "cat"], "correct_choice_idx": 0, "direct_answers": ["dog owner", "owner", "owner", "person", "owner", "owner", "person", "hand", "human", "person"], "difficult_direct_answer": false, "rationales": ["The owner is holding the leash.", "Someone is holding the dog's leash.", "A person is going to hold onto the leash."], "image": "val2014/COCO_val2014_000000279331.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45343, "question_id": "kJiwJfZw3YijyTq9ceD2Qh", "question": "In what continent is this street situated?", "choices": ["asia", "europe", "north america", "australia"], "correct_choice_idx": 2, "direct_answers": ["cuba", "north america", "north america", "cuba", "north america", "north america", "north america", "cuba", "north america", "north america"], "difficult_direct_answer": false, "rationales": ["English is used on a sign on a bus giving the route.", "Cuba is in north america.", "The bus is in north america."], "image": "train2014/COCO_train2014_000000045343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157920, "question_id": "kJoFqbEkzmzyTfbvkSWtYS", "question": "Why is the man holding the ball wearing a glove?", "choices": ["fashion", "warmth", "for catching", "health"], "correct_choice_idx": 2, "direct_answers": ["catching", "for catching", "playing softball", "to catch", "playing baseball", "catch easier", "to catch", "catch", "baseball", "catch ball"], "difficult_direct_answer": true, "rationales": ["The man is catching the ball.", "A mitt is made just for that.", "He has a mitt."], "image": "train2014/COCO_train2014_000000157920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263434, "question_id": "kJpPxeJraw7AFSUwrxVyCN", "question": "On which computer could one find directions the fastest?", "choices": ["middle", "left", "none", "right"], "correct_choice_idx": 3, "direct_answers": ["right laptop", "black laptop", "black laptop", "right laptop", "center computer", "desktop", "right", "right computer", "third", "fastest one"], "difficult_direct_answer": false, "rationales": ["A map is shown in a computer screen to the right of other computers.", "The right computer looks the newest.", "The computer is on the right."], "image": "train2014/COCO_train2014_000000263434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326428, "question_id": "kJtwXgUvd8TG45rcWjf9d4", "question": "What event has likely taken place here?", "choices": ["hurricane", "tornado", "thunderstorm", "car accident"], "correct_choice_idx": 3, "direct_answers": ["accident sign", "car accident", "collision", "car accident", "crash", "vandalism", "accident", "car accident", "crash", "accident"], "difficult_direct_answer": false, "rationales": ["A sign is bent to the side near the edge of a street. something significant would have to hit the sign to bend it and cars drive back and forth regularly.", "There is no visible evidence of car debris or scratching on the pole that is bent. it is thus likely it was bent by something strong enough to bend but make no marks on the metal.", "The stop sign is bent at the bottom. it would take something heavy to make it bend like this."], "image": "val2014/COCO_val2014_000000326428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79106, "question_id": "kKC6FBSBTbFDPjwpk6Aerd", "question": "What is the cheapest flat that you can buy here?", "choices": ["$300s", "202575", "5758444", "mid $300s"], "correct_choice_idx": 3, "direct_answers": ["mid $300s", "condos", "open house", "300000 dollars", "mid $300s", "mid $300s", "condos", "condo", "condo", "mid 300s"], "difficult_direct_answer": false, "rationales": ["The house is the mid 300s.", "The cheapest flat is in the mid 300s.", "This amount is the least expensive."], "image": "train2014/COCO_train2014_000000079106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514222, "question_id": "kKEBgDKrkFBizWAFRTuENf", "question": "What group of people originally created this food?", "choices": ["koreans", "chinese", "dutch", "jews"], "correct_choice_idx": 2, "direct_answers": ["black", "americans", "korean", "poverty", "dutch", "germans", "americans", "dutch", "french", "americans"], "difficult_direct_answer": false, "rationales": ["Donuts were part of the amsterdam culture.", "The boy is eating doughnuts.", "The donut is known to have been created in the netherlands, which is where the dutch originate."], "image": "val2014/COCO_val2014_000000514222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327624, "question_id": "kKLhjycyb6GbUYaj9BaufR", "question": "How would these animals be classified?", "choices": ["pescatarian", "omnivores", "carnivores", "herbivores"], "correct_choice_idx": 3, "direct_answers": ["zebra", "zebras", "zebras", "mammals", "equus", "mammals", "wild animals", "by breed", "flock", "herbivores"], "difficult_direct_answer": false, "rationales": ["They eat grass.", "The animals visible are zebras and they are answer a as they eat plants only.", "These animals are zebras. they have plant-based diets."], "image": "val2014/COCO_val2014_000000327624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379853, "question_id": "kKV3AoFs4zAyQXcT5HjtQG", "question": "What type tennis is being played here?", "choices": ["men's doubles", "passive", "singles", "mixed doubles"], "correct_choice_idx": 0, "direct_answers": ["tennis court", "doubles", "team", "team", "doubles", "doubles", "doubles", "men's doubles", "doubles", "doubles"], "difficult_direct_answer": false, "rationales": ["Their obviously two men here on the same side, so the other answers would not fit.", "The men are playing a doubles match.", "There's two people on one side of the court instead of the usual one player."], "image": "train2014/COCO_train2014_000000379853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243205, "question_id": "kKbHEbFWwfWqZ4u7aAdjqn", "question": "What item is intended to rock back and forth in this room?", "choices": ["chair", "table", "cabinet", "painting"], "correct_choice_idx": 0, "direct_answers": ["clock", "rocking chair", "rocking chair", "battle", "chair", "chair", "chair", "rocking chair", "rocking chair", "rocking chair"], "difficult_direct_answer": false, "rationales": ["The center chair in this image has something rounded attached to the bottom of it's legs which allows the sitter to rock back and forth.", "The rocking chair will rock.", "The chair has a curved piece under the legs."], "image": "train2014/COCO_train2014_000000243205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290034, "question_id": "kKctcLGNUBFBpv95chLD3z", "question": "What is the sum of each individual digit on the side of the boat?", "choices": ["1530", "22", "nine", "34"], "correct_choice_idx": 2, "direct_answers": ["nine", "nine", "four", "nine", "nine", "four", "nine", "nine", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["If you add 1, 5, 3, and 0 together, you get nine.", "The digits total up to nine.", "The sum is 9."], "image": "train2014/COCO_train2014_000000290034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9708, "question_id": "kKxdK7NRqa2TgsDEDTaS4M", "question": "What color hair does the woman have?", "choices": ["blue", "blonde", "red", "green"], "correct_choice_idx": 2, "direct_answers": ["brown", "brown", "red", "red", "brown", "red", "red", "brown", "brown", "red"], "difficult_direct_answer": false, "rationales": ["A woman with dark auburn hair is sitting in the sand.", "The woman sitting on the beach has red hair.", "The woman has fiery hair."], "image": "train2014/COCO_train2014_000000009708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405225, "question_id": "kLKyauh5mGzjBjhGxEHKvX", "question": "Where is this person working?", "choices": ["courthouse", "home", "library", "office"], "correct_choice_idx": 1, "direct_answers": ["home", "at desk", "desk", "home", "home", "home", "desk", "desk", "office desk", "table"], "difficult_direct_answer": false, "rationales": ["The cat is snoozing along with him.", "He is in his home", "People usually can't bring pets to their workplace. many people work remotely though."], "image": "train2014/COCO_train2014_000000405225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460139, "question_id": "kLiLK5riuD69Ei92JeZ2Nt", "question": "What kind of pattern is the road?", "choices": ["black", "bumpy", "tiled", "square"], "correct_choice_idx": 2, "direct_answers": ["brick", "concrete", "solid", "brick", "brick", "tiled", "marble", "tile", "brick", "paved"], "difficult_direct_answer": false, "rationales": ["The road has tiles on it.", "In looking at the entire ground, it would appear that the top is at a different level from the bottom. this would indicate that the road has to be tilted down.", "It's bricks on the road."], "image": "train2014/COCO_train2014_000000460139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58572, "question_id": "kLuRAaFB7k9RiWNqNjWexm", "question": "Where is this person dining?", "choices": ["private space", "park", "public bus", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["With all the different foods and layout, the person is dining out.", "The person is at a restaurant.", "A table with a cloth tablecloth and prepared food is shown with people sitting around it. restaurants use tablecloths."], "image": "train2014/COCO_train2014_000000058572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332877, "question_id": "kMGiFNvns9B6yDeVvuMRAK", "question": "How many directions does traffic flow in these pictured lanes?", "choices": ["two", "none", "three", "one"], "correct_choice_idx": 3, "direct_answers": ["one way", "one way", "one", "one", "three", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The traffic is all going the same way.", "There is only one way for traffic.", "This street is four lanes wide and all the vehicles are driving in one direction."], "image": "val2014/COCO_val2014_000000332877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570406, "question_id": "kMhNvHDUfw9DjUjXAfAGTs", "question": "Which platform services the train arriving now?", "choices": ["right", "none", "left", "last one"], "correct_choice_idx": 0, "direct_answers": ["right platform", "right", "two", "right", "track", "second platform", "furthest", "platform two", "platform 5", "tracks"], "difficult_direct_answer": true, "rationales": ["The train is on the right tracks so it would be using the right-side platform.", "The services go right.", "The train is on the track that is on the right side of the photo."], "image": "train2014/COCO_train2014_000000570406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346196, "question_id": "kMpgFpMmJRmdxfQC88KkVq", "question": "What is the woman prepared to do?", "choices": ["dribble", "bat", "swing", "dunk"], "correct_choice_idx": 2, "direct_answers": ["swing", "return volley", "hit ball", "return ball", "swing", "hit ball", "hit ball", "volley", "volley", "hit ball"], "difficult_direct_answer": false, "rationales": ["The woman is going to swing to hit the ball.", "She is waiting for the ball to come to her side to hit it", "The woman is prepared to swing at the tennis ball."], "image": "train2014/COCO_train2014_000000346196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277263, "question_id": "kMpnw6KHVbREt24kESieGS", "question": "When can you make a right turn at Elizabeth Street?", "choices": ["july 2006", "february 2006", "april 2006", "june 2006"], "correct_choice_idx": 1, "direct_answers": ["march-july", "after july", "to walk", "march-july", "summer winter", "august", "august twothousandsix", "august-february", "july 2006", "february 2006"], "difficult_direct_answer": true, "rationales": ["The sign says no right turn from march 2006 to july 2006.", "You cannot make a right turn at elizabeth street between march 2006 and july 2006.", "You can make a turn before march."], "image": "train2014/COCO_train2014_000000277263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538682, "question_id": "kNC8jytQoFjnWSuJDY5FWC", "question": "What does the mechanism below the skateboard do?", "choices": ["makes noise", "rotates circularly", "tilts/ moves", "nothing"], "correct_choice_idx": 2, "direct_answers": ["tilts/ moves", "move skateboard", "moves board", "vibrate", "helps balance", "balance", "weighing", "corrects balance", "electrical circuit", "clean"], "difficult_direct_answer": true, "rationales": ["It is kind of a lever which will make it lean from one side to the other.", "It has a mechanism to raise and lower the sides, front and/or back end of the board.", "The mechanism moves."], "image": "train2014/COCO_train2014_000000538682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306972, "question_id": "kNFByJeqMGCdvWuSVaAuKv", "question": "Who is near the train?", "choices": ["cowboy", "police officer", "conductor", "firefighter"], "correct_choice_idx": 2, "direct_answers": ["man", "conductor", "child", "engineer", "baby", "conductor", "conductor", "people", "conductor", "conductor"], "difficult_direct_answer": false, "rationales": ["The conductor is standing nearby.", "A conductor is standing there since he's wearing a conductor's uniform.", "The conductor is near."], "image": "val2014/COCO_val2014_000000306972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363349, "question_id": "kNTiqd7RPVBgdDsjEtmqLZ", "question": "What is the person doing to the cat?", "choices": ["petting", "feeding", "hugging", "bathing"], "correct_choice_idx": 1, "direct_answers": ["feeding", "feeding", "feeding", "feeding", "brushing teeth", "feeding it", "feeding", "brushing", "brushing", "feeding"], "difficult_direct_answer": false, "rationales": ["A tabby cat resting on a pillow is offered what appears to be a potato chip. by and large, cats don't eat potatoes, but kitty might like the salt on it.", "The person is extending an item towards the cat's mouth which would be consistent with answer a.", "The person is holding food in front of the cat."], "image": "train2014/COCO_train2014_000000363349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241643, "question_id": "kNZAxeZVCwcwYrBWnSKKYG", "question": "What surface is the woman playing tennis on?", "choices": ["hard", "clay", "grass", "carpet"], "correct_choice_idx": 0, "direct_answers": ["tennis court", "hard", "tennis court", "harcourt", "court", "rubber", "rubber", "clay court", "concrete", "tennis court"], "difficult_direct_answer": false, "rationales": ["Playing on pavement makes you able to make quick movements when playing any sport.", "The woman is playing tennis on a hard tennis court surface.", "The surface is man made and does not have any give when the woman steps on it."], "image": "train2014/COCO_train2014_000000241643.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70211, "question_id": "kNjrmVUMy2XT7iPS9KqGMa", "question": "This is most likely what kind of event?", "choices": ["license renewal", "book signing", "concert", "wedding"], "correct_choice_idx": 3, "direct_answers": ["celebration", "birthday party", "wedding", "birthday", "birthday party", "birthday", "wedding", "birthday celebration", "celebration", "birthday graduation"], "difficult_direct_answer": false, "rationales": ["Looks like it was a groomsman cake.", "The wedding cake is shown.", "The cake has white frosting and is large for a wedding sized party."], "image": "train2014/COCO_train2014_000000070211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195848, "question_id": "kP5DSYzPMuUp9jRS5toTKM", "question": "What theme parade do these bikers ride in?", "choices": ["protest", "prison", "gay pride", "rodeo"], "correct_choice_idx": 2, "direct_answers": ["pride", "lgbtq", "lgbtq", "freedom", "lgbtq", "veterans", "gay pride", "gay pride", "gay pride", "gay pride"], "difficult_direct_answer": false, "rationales": ["The parade has a rainbow flag.", "There are several rainbow flags.", "Several bikers are holding flags and one has a rainbow flag. it designates those have a certain sexual preference."], "image": "train2014/COCO_train2014_000000195848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488118, "question_id": "kPAYifGJb9nXD2SpDUJEeU", "question": "What flag dominates the crowd?", "choices": ["ukraine", "canada", "uganda", "norway"], "correct_choice_idx": 3, "direct_answers": ["norway", "norway", "switzerland", "norway", "norway", "norway", "norway", "united kingdom", "denmark", "norway"], "difficult_direct_answer": false, "rationales": ["There are people in the crowd waving the norwegian flag.", "You see the flag and you see it on some peoples clothing.", "The flag is from norway."], "image": "train2014/COCO_train2014_000000488118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449039, "question_id": "kPDS3ckjs2NbXVUCUUdoQ2", "question": "What fuels the mode of travel shown?", "choices": ["beef", "oats", "gas", "coal"], "correct_choice_idx": 1, "direct_answers": ["food", "horse", "oats", "hay", "food", "hay", "horse", "hay", "hay", "horsepower"], "difficult_direct_answer": false, "rationales": ["Horses eat oats.", "The other options aren't food for horses.", "They eat this to fill them up."], "image": "val2014/COCO_val2014_000000449039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569081, "question_id": "kPEARLWu2DwjVubthtHjrE", "question": "What is another tool used to cut this type of food?", "choices": ["pizza razor", "pizza spatula", "pizza slicer", "pizza cutter"], "correct_choice_idx": 3, "direct_answers": ["knife", "pizza cutter", "pizza cutter", "pizza cutter", "pizza cutter", "pizza cutter", "pizza cutter", "pizza cutter", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["A pizza cutter cuts pizza.", "Pizza is usually cut by a pizza cutter.", "The person is cutting a pizza."], "image": "train2014/COCO_train2014_000000569081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540503, "question_id": "kPFaGFJs3pr5JGA6ykHjqb", "question": "What activity is taking place besides surfing?", "choices": ["fishing", "paddling", "swimming", "canoeing"], "correct_choice_idx": 2, "direct_answers": ["swimming", "boating", "boating", "swimming", "boating", "swimming", "wading", "surfing", "boating", "swimming"], "difficult_direct_answer": false, "rationales": ["There are people in the water that are swimming and aren't using any equipment.", "The people are on surfboards.", "People are in the water together jumping in the waves."], "image": "train2014/COCO_train2014_000000540503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561124, "question_id": "kQDsbgHBmJZQ97hK3ntFP4", "question": "What country does this train run in?", "choices": ["norway", "nigeria", "canada", "russia"], "correct_choice_idx": 0, "direct_answers": ["ukraine", "norway", "norway", "germany", "norway", "germany", "germany", "norway", "norway", "norway"], "difficult_direct_answer": false, "rationales": ["Looking up the city of kjosfossen on side of train reveals country.", "The landscape is typical of norway and the words on the train are written in their typical spelling and language.", "Kjosfossen is the name of a waterfall in norway."], "image": "train2014/COCO_train2014_000000561124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74942, "question_id": "kQE9KYPrhmkKoGMLehBVuQ", "question": "Which food contains the most protein?", "choices": ["carrot", "pickled cucumber", "ham", "bread"], "correct_choice_idx": 2, "direct_answers": ["sandwich meat", "sliced brisket", "ham", "roast beef", "meat", "meat", "smoked meat", "roast beef", "sandwich meat", "sandwich"], "difficult_direct_answer": false, "rationales": ["There is a ham based food item on the sandwich based on the texture and color. ham is a meat and meat is known to have a lot of protein especially compared to the other foods visible.", "The meat would contain the most.", "Ham contains a lot of protein."], "image": "train2014/COCO_train2014_000000074942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502963, "question_id": "kQac9iW8h3RQvLakRwb8GD", "question": "What time of day is it here?", "choices": ["midday", "midnight", "noon", "twilight"], "correct_choice_idx": 3, "direct_answers": ["twilight", "evening", "dusk", "evening", "early evening", "daytime", "afternoon", "evening", "night time", "evening"], "difficult_direct_answer": false, "rationales": ["The day is twilight.", "It is still a little light but almost dark out", "It's twilight since it's a little dark."], "image": "val2014/COCO_val2014_000000502963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214213, "question_id": "kQpNQ77aXXmJYuGbdhtrSd", "question": "What type of shot is the woman about to hit?", "choices": ["serve", "backhand", "forehand", "slice"], "correct_choice_idx": 0, "direct_answers": ["overhead", "serve", "serve", "slam", "forehand", "tennis", "slam", "lob", "backhand", "serve"], "difficult_direct_answer": false, "rationales": ["The shot is a serve.", "The ball is in the air and she is poised to hit it.", "The woman has her arm positioned to serve the tennis ball."], "image": "train2014/COCO_train2014_000000214213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381377, "question_id": "kQy28k2ZDiTbtSysJV7XQ2", "question": "What does the man in red want to do with the ball?", "choices": ["grab it", "squish it", "pocket it", "kick it"], "correct_choice_idx": 3, "direct_answers": ["kick it", "kick ball", "kick", "kick it", "kick it", "kick it", "kick", "kick", "kick", "score"], "difficult_direct_answer": false, "rationales": ["They are playing soccer.", "His right foot is being held back to prepare energy for going forwards towards the ball.", "Soccer is played by kicking the ball."], "image": "train2014/COCO_train2014_000000381377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475988, "question_id": "kR3WLGQDT4aDdricyhYqaf", "question": "Why are the motorbikes lined up in a row?", "choices": ["for repair", "for show", "for amusement", "coincidence"], "correct_choice_idx": 1, "direct_answers": ["aesthetic appeal", "bike show", "display case", "event", "for show", "parked", "parking", "showcasing", "parked", "sales presentation"], "difficult_direct_answer": true, "rationales": ["The bikes are for show.", "The motorbikes are available for display.", "The bikes are for a show."], "image": "train2014/COCO_train2014_000000475988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308759, "question_id": "kR9sKUajTMN2hjRzH6w4J7", "question": "What sort of court is this?", "choices": ["sod", "asphalt", "concrete", "clay"], "correct_choice_idx": 3, "direct_answers": ["tennis", "clay", "tennis", "clay", "clay court", "clay tennis", "clay", "dirt", "clay", "tennis court"], "difficult_direct_answer": false, "rationales": ["It's a clay court.", "The surface is orange and it is coming up as the man walks.", "The surface is a rusty colored dirt."], "image": "val2014/COCO_val2014_000000308759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142564, "question_id": "kRCdFKYsa764uRXaiqdNd5", "question": "Who do the children smile at while bathing?", "choices": ["mailman", "parent", "mouse", "teacher"], "correct_choice_idx": 1, "direct_answers": ["like it", "parent", "photographer", "photographer", "mom", "parents", "happy", "photographer", "having fun", "parent"], "difficult_direct_answer": false, "rationales": ["The parent is making the kids smile.", "Children of this young age are most likely to be supervised in the bath by a parent or guardian, so a parental figure can be assumed to be supervising.", "The kids smile at the parent."], "image": "val2014/COCO_val2014_000000142564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334197, "question_id": "kRPisiP4gKsmbN4BBjvYSw", "question": "What kind of backsplash has been attached to the wall?", "choices": ["natural stone", "metal", "stainless steel", "glass"], "correct_choice_idx": 3, "direct_answers": ["little tiles", "navy blue", "tiles", "tile", "tile", "glass tiles", "tile", "glass", "exhaust", "halogen"], "difficult_direct_answer": false, "rationales": ["These are small tiles that come in sheets to help them apply to the wall easier and are easy to care for", "It's hard to tell in the image. the tile is likely ceramic. so, i'm choosing d for this reason. i can't tell if a is in the picture. b and c aren't backsplashes.", "The window has glass on it."], "image": "train2014/COCO_train2014_000000334197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363869, "question_id": "kRarNzkjawyFGEgzBa5Um7", "question": "What type of station is this?", "choices": ["fire", "bus", "train", "beverage"], "correct_choice_idx": 3, "direct_answers": ["beverage", "bus", "drink", "beverage", "drink", "bar", "restaurant", "beverage", "drink", "beverage"], "difficult_direct_answer": false, "rationales": ["It is a place to get drinks.", "There is a large container of fruit punch on the counter.", "The station has drinks."], "image": "train2014/COCO_train2014_000000363869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208172, "question_id": "kRfa8xTg6uzcY2oWEu8Tjh", "question": "What does the boy have on his head?", "choices": ["baseball cap", "cat", "balloon", "fedora"], "correct_choice_idx": 0, "direct_answers": ["baseball hat", "baseball hat", "baseball cap", "baseball cap", "baseball cap", "baseball hat", "cap", "mets cap", "hat", "baseball cap"], "difficult_direct_answer": false, "rationales": ["The boy is wearing a hat.", "The boy has a cap.", "The boy has a yankees hat on his head."], "image": "train2014/COCO_train2014_000000208172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296797, "question_id": "kRwbbP5potKkEcDyosmEFv", "question": "In which city do these passengers board?", "choices": ["san francisco", "dallas", "santa fe", "san jose"], "correct_choice_idx": 0, "direct_answers": ["london", "san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "new york", "san francisco", "trolley"], "difficult_direct_answer": false, "rationales": ["The people are getting on a trolley.", "These types of trolleys are very common in san francisco.", "The city is sf."], "image": "val2014/COCO_val2014_000000296797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484721, "question_id": "kSJt6XifdNXQnVyA4MxAdv", "question": "What company is known for making the item farthest to the right?", "choices": ["apple", "heinz", "welch's", "amazon"], "correct_choice_idx": 1, "direct_answers": ["heinz", "heinz", "heinz", "heinz", "heinz", "heinz", "heinz", "heinz", "heinz", "heinz"], "difficult_direct_answer": false, "rationales": ["The company is heinz.", "Heinz is the most famous ketchup manufacturer.", "The item is ketchup."], "image": "val2014/COCO_val2014_000000484721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331670, "question_id": "kSYg7dqbmNrA94SH3Qia9m", "question": "What are the people ready to do?", "choices": ["leave", "board", "run", "play"], "correct_choice_idx": 1, "direct_answers": ["board", "board train", "travel", "board", "travel", "embark", "board", "board", "load train", "board train"], "difficult_direct_answer": false, "rationales": ["The people want to get on the train.", "The people are boarding.", "The people are getting on the train."], "image": "train2014/COCO_train2014_000000331670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381932, "question_id": "kSwoRv4yhAnsW8BRpmhiWZ", "question": "Where is the surfer located in?", "choices": ["river", "pool", "lake", "pond"], "correct_choice_idx": 1, "direct_answers": ["canal", "water trough", "fake ocean", "water", "pool", "fountain", "pool", "canal", "water-filled object", "water"], "difficult_direct_answer": false, "rationales": ["The body of water is surrounded by an artificial material, so it is not a river, pond, or lake.", "The surfer is in a receptacle with water.", "The surfer is not in a naturally occurring body of water, like a river, pond, or lake. there is a green structure that keeps the water in place."], "image": "train2014/COCO_train2014_000000381932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206256, "question_id": "kT7CTeXT3EqAfZXcrAsWSA", "question": "What could have been the reason the bus is on the road but out of service?", "choices": ["driver quit", "engine trouble", "ice", "no gas"], "correct_choice_idx": 2, "direct_answers": ["engine problems", "ice", "broken", "replacement bus", "got stuck", "no people", "off duty", "weather", "breakdown", "flat tire"], "difficult_direct_answer": true, "rationales": ["The reason is ice.", "Vehicles tend not to drive well when the road is slippery. it makes it dangerous and easy to lose control.", "The bus is pulled over on the curb in an icy environment. it is cold and snow is all over the ground."], "image": "train2014/COCO_train2014_000000206256.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188616, "question_id": "kU8ZQwXSW6XJ2rGqcQqDvf", "question": "The basket of salmon is currently being eaten at which location?", "choices": ["restaurant", "car", "work", "home"], "correct_choice_idx": 0, "direct_answers": ["table", "home", "restaurant", "grgeorien", "gregoire", "floor", "restaurant", "restaurant", "gregoire restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["It is plated like a paid eatery.", "The basket is in restaurant paper.", "The text in the url in the background indicates the type of establishment. the for here text indicates that the person is eating at the establishment."], "image": "val2014/COCO_val2014_000000188616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438061, "question_id": "kUeTiGExnqGb6iWQyg8bwx", "question": "Why does the train have so many windows?", "choices": ["engineer mistake", "passenger train", "freight train", "greenhouse train"], "correct_choice_idx": 1, "direct_answers": ["look out", "passenger views", "passenger seats", "passengers", "passengers", "passenger train", "sight", "passenger", "visibility", "passengers"], "difficult_direct_answer": false, "rationales": ["The train has passengers.", "The train is for passengers to look out of.", "This type of train with many windows is used for transporting humans; without windows it would usually be transporting goods."], "image": "train2014/COCO_train2014_000000438061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467194, "question_id": "kVJqD9trVGZtCXe3hawqgh", "question": "Why is the man in all black holding his arms out?", "choices": ["to wave", "to balance", "to tag", "to dance"], "correct_choice_idx": 1, "direct_answers": ["for balance", "balance", "for balance", "balance", "balance", "balance", "for balance", "to balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["He is trying to keep his balance.", "The man wants to stay standing on his board.", "A man is performing a grind trick on a skateboard on a rail."], "image": "val2014/COCO_val2014_000000467194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367732, "question_id": "kVXYkZPWAWq7XdzmtUkPDi", "question": "What this man doing?", "choices": ["buying cake", "smashing cake", "decorating cake", "tasting cake"], "correct_choice_idx": 2, "direct_answers": ["posing", "celebrating", "decorating cake", "celebrating birthday", "posing", "celebrating", "holding cake", "celebrating", "showing off", "celebrating"], "difficult_direct_answer": false, "rationales": ["The man is holding a cake and adding decorations to it.", "The man is putting frosting on the cake.", "The cake is already decorating and there is no evidence of any other answer being viable. normally a person makes a cake for the purposes of answer c which would likely happen at some point in the near future."], "image": "val2014/COCO_val2014_000000367732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329573, "question_id": "kVbTFVxm9a6GDA8uXCwxhY", "question": "What is near the plane?", "choices": ["bat", "traffic coordinator", "monkey", "traffic cone"], "correct_choice_idx": 3, "direct_answers": ["car", "vehicles", "vehicles", "workers", "workers", "traffic cone", "van", "loading gate", "orange cone", "aircraft towing"], "difficult_direct_answer": false, "rationales": ["Below either of this plane's engines we can spot two cylinder shaped orange items with white stripes towards the tip.", "A traffic cone is there", "An orange cone is near the engine of the plane. cones are used to mark off areas to use caution around."], "image": "val2014/COCO_val2014_000000329573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573750, "question_id": "kW67FrTJ8aQDBgBXiyHbY9", "question": "What are these people in the front carrying?", "choices": ["snowboards", "surfboards", "skates", "skateboards"], "correct_choice_idx": 0, "direct_answers": ["snow boards", "snowboards", "snowboards", "snow boards", "snowboards", "snowboards", "snowboards", "snowboards", "snowboards", "snow boards"], "difficult_direct_answer": false, "rationales": ["They are using these to glide down the mountain alongside skiers.", "The people are in the snow.", "They are on snow climbing a hill with snowboards on their hands."], "image": "val2014/COCO_val2014_000000573750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340877, "question_id": "kWU5fqDrsaNsSbqbBNaeHw", "question": "This beverage tastes similar to what other beverage?", "choices": ["sprite", "ginger ale", "sorrel drink", "root beer"], "correct_choice_idx": 3, "direct_answers": ["beer", "beer", "beer", "root beer", "raspberry lime", "beer", "beer", "root beer", "root beer", "beer"], "difficult_direct_answer": false, "rationales": ["Root beer is a soda beverage that has a similar flavor to sasparilla.", "There is a dark colored brown glass bottle with a rolled up sandwich.", "The drink is in a beer bottle."], "image": "train2014/COCO_train2014_000000340877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92519, "question_id": "kWUrjtNXKAR7dTdxFSeuw2", "question": "How many different cakes are there on the table?", "choices": ["nine", "seven", "eight", "six"], "correct_choice_idx": 1, "direct_answers": ["five", "seven", "six", "seven", "six", "seven", "six", "five", "seven", "seven"], "difficult_direct_answer": false, "rationales": ["There are 7 cakes.", "There are 7 different type cakes on the table.", "Seven cakes are located around the table."], "image": "val2014/COCO_val2014_000000092519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564532, "question_id": "kWV7Di9fUzG79jYBgp2QeF", "question": "The color on the vehicle that is above the headlights is the same color as what?", "choices": ["frog", "tiger", "polar bear", "jaguar"], "correct_choice_idx": 2, "direct_answers": ["snow white", "polar bear", "white", "snow", "clouds", "roof", "clouds", "milk", "marshmallow", "swan"], "difficult_direct_answer": true, "rationales": ["The color is white.", "White is the color of polar bears.", "White is the color of a polar bear."], "image": "train2014/COCO_train2014_000000564532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413489, "question_id": "kWXAAobJsFjuhXb7YCwfDF", "question": "At least how many mammals are on the couch?", "choices": ["seven", "four", "eight", "none"], "correct_choice_idx": 1, "direct_answers": ["4 mammals", "4 mammals", "four", "four", "four", "four", "4 mammals", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["A man is laying on the couch with three small puppies.", "There are three dogs and one person on the couch which totals four mammals.", "Three dogs and one person can be seen laying on the couch."], "image": "train2014/COCO_train2014_000000413489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424088, "question_id": "kWcfDSTrB4ehhreNZdm9WD", "question": "What was used to cook the meat and potatoes of the dish?", "choices": ["stove", "oven", "air fryer", "grill"], "correct_choice_idx": 3, "direct_answers": ["oven", "grill", "oven", "grill", "grill", "oven", "pot", "grill", "grill", "grill"], "difficult_direct_answer": false, "rationales": ["The chicken has grill marks.", "Cooking on a grill leaves marks from the grates of the grill. a plate of food has meat and potatoes with dark lines across them.", "The grill was used."], "image": "train2014/COCO_train2014_000000424088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81317, "question_id": "kWeuCkGBMPaVwro5uyGznT", "question": "What type of vehicle is next to the sedan?", "choices": ["motorcycle", "truck", "minivan", "convertible"], "correct_choice_idx": 2, "direct_answers": ["toyota", "van", "van", "toyota", "minivan", "van", "toyota", "van", "van", "van"], "difficult_direct_answer": false, "rationales": ["The sedan is in the middle and there is only one vehicle next to it. it is smaller than a regular van so would qualify as a minivan.", "The car is a minivan.", "The car is stopped next to a minivan."], "image": "train2014/COCO_train2014_000000081317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522990, "question_id": "kWnPZsPqmYXcNYVJqvk9yQ", "question": "Where is the person who is in charge of the horse and carriage?", "choices": ["horses head", "in market", "running races", "bathroom"], "correct_choice_idx": 0, "direct_answers": ["horses head", "infront", "near horse", "holding horse", "left side", "in front", "feeding horse", "front", "beside horse", "left"], "difficult_direct_answer": true, "rationales": ["He is holding the reins.", "The man holding them.", "A person is holding the reigns of a horse that is hooked up to a carriage. the person is a uniform."], "image": "val2014/COCO_val2014_000000522990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376921, "question_id": "kWyeznXMVEX7WtLQybEDR2", "question": "What is the person in black about to do?", "choices": ["move residences", "sales call", "work errand", "cook lunch"], "correct_choice_idx": 0, "direct_answers": ["move furniture", "move", "move house", "move", "move residences", "move", "move", "posing", "move", "move"], "difficult_direct_answer": false, "rationales": ["The person is standing in front of a u-haul in front of a house.", "The man is a mover.", "Move to a new place to live."], "image": "train2014/COCO_train2014_000000376921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24972, "question_id": "kXCK2ZxBRGiqaMjKvJVntA", "question": "What part of the world is this from?", "choices": ["russia", "asia", "australia", "sweden"], "correct_choice_idx": 1, "direct_answers": ["eastern", "asia", "asia", "japan", "south korea", "south korea", "korea", "korea", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["There are asian people as well as japanese writing on the wall.", "There are several men with black hair and with that typical east asian look with korean writing on the banner.", "The writing on the banner behind the people and the skin tone of the individuals is consistent with answer a."], "image": "train2014/COCO_train2014_000000024972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28103, "question_id": "kY6LKByePjHodwVD6e4ytP", "question": "What can the company whose name is shown do for you?", "choices": ["deliver packages", "fortune telling", "massages", "offer flowers"], "correct_choice_idx": 0, "direct_answers": ["deliver packages", "ship packages", "deliver packages", "deliver packages", "deliver packages", "ship", "deliver packages", "clothe you", "deliver packages", "deliver packages"], "difficult_direct_answer": false, "rationales": ["Fedex is a company that delivers parcels.", "A shipping company logo is seen on a banner behind tennis courts.", "The company delivers mail."], "image": "train2014/COCO_train2014_000000028103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240950, "question_id": "kYGcvVfDriuLK9MEPsNtmy", "question": "Which fruit in the picture contain more water content in it?", "choices": ["muskmelon", "strawberry", "watermelon", "grapes"], "correct_choice_idx": 2, "direct_answers": ["watermelon", "watermelon", "watermelon", "watermelon", "strawberry", "strawberry", "watermelon", "watermelon", "watermelon", "watermelon"], "difficult_direct_answer": false, "rationales": ["The foods are clearly visible and identifiable. of the foods present, answer a is known to have high water content.", "The fruit is watermelon.", "The fruit shows some pink and a green outside."], "image": "val2014/COCO_val2014_000000240950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184470, "question_id": "kYchdzy7ugpbDRFf7vekkn", "question": "How many of the dozen donuts could be cream-filled?", "choices": ["three", "seven", "five", "two"], "correct_choice_idx": 2, "direct_answers": ["five", "two", "three", "five", "four", "three", "five", "three", "four", "five"], "difficult_direct_answer": false, "rationales": ["There are five.", "The donuts with no holes are cream-filled.", "There are twelve donuts in the box and five of them have filling."], "image": "train2014/COCO_train2014_000000184470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510665, "question_id": "kYr4cobFtxhc38EwWRLhJA", "question": "During which weather would the bus stop here be most appreciated by riders?", "choices": ["breezes", "wind", "clouds", "rain"], "correct_choice_idx": 3, "direct_answers": ["raining", "rain", "rainy", "summer", "rain", "raining", "bad weather", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["The stop has a roof covering.", "The bus stop is covered so it would provide passengers with a place to wait where they would be protected from getting wet.", "There is an overhang to stand under in bad weather."], "image": "train2014/COCO_train2014_000000510665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361460, "question_id": "kZLT8L3Q7TQrsQ2RJL93Xw", "question": "What number is the jockey?", "choices": ["719", "411", "159", "621"], "correct_choice_idx": 0, "direct_answers": ["719", "719", "719", "seven nineteen", "719", "seven nineteen", "719", "719", "19", "sevenhundred nineteen"], "difficult_direct_answer": false, "rationales": ["The person riding the horse has the number on their bib.", "The number is 719.", "The number 719 is on his jersey."], "image": "train2014/COCO_train2014_000000361460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520147, "question_id": "kZapVWNT4wWeuQaR8NQxXF", "question": "What is this wall used for?", "choices": ["pictures", "plants", "snowboarding", "door"], "correct_choice_idx": 2, "direct_answers": ["ramp", "snowboarding", "grinding", "slide", "snowboarding", "tricks", "snowboarding tricks", "skateboarding", "snowboarding", "snowboard tricks"], "difficult_direct_answer": false, "rationales": ["The wall has a sign that says snowboard on it.", "The word snowboards is written on the wall and there is a snowboarder actively using it. it was likely intentionally placed there for this purpose.", "A snowboarder traverses down this wall on the bottom of his snowboard."], "image": "val2014/COCO_val2014_000000520147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446481, "question_id": "kZe4QSJqZZCiJXBTzTkmZp", "question": "During which time of day is it only ok to go straight or right?", "choices": ["late afternoon", "midnight", "morning", "none"], "correct_choice_idx": 0, "direct_answers": ["daytime", "night", "late afternoon", "night", "5pm", "anytime", "night", "anytime", "everytime", "day"], "difficult_direct_answer": false, "rationales": ["The traffic light on the picture is shown to be red.", "There is a sign at the intersection that says you can't turn during the late afternoon, but can only go straight or right.", "The times are listed on the sign"], "image": "train2014/COCO_train2014_000000446481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396754, "question_id": "ka6Kb5XSa7V3PYNh8GXDJJ", "question": "THe animals being ridden are part of what classification?", "choices": ["bovine", "equine", "canine", "feline"], "correct_choice_idx": 2, "direct_answers": ["horses", "horse", "horse", "horse", "equus", "equus", "equine", "canine", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["That is the type of animal the people are riding.", "Equine is the term for horses and these are horses in the image.", "The animals are like horses."], "image": "val2014/COCO_val2014_000000396754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174185, "question_id": "kaHG8QnQmoxZQMup8GfzSV", "question": "What type music is offered here?", "choices": ["string", "clarinet", "brass", "flute"], "correct_choice_idx": 0, "direct_answers": ["violin guitar", "stringed instruments", "violin", "violin", "street", "string", "street music", "violin", "string", "violin guitar"], "difficult_direct_answer": false, "rationales": ["The two musicians are on the sidewalk preforming with a guitar and a violin.", "The man on the right is playing a violin or viola which you have to use a bow against some strings to make music.", "String music is being played."], "image": "train2014/COCO_train2014_000000174185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527353, "question_id": "kaJVLAiAQwSv3svGYpJEpR", "question": "Why are the skis turned up in front?", "choices": ["defective", "broken", "push snow", "bent"], "correct_choice_idx": 2, "direct_answers": ["control", "move snow", "go straight", "to glide", "clearance", "smoothness", "balance", "increase speed", "push snow", "snow"], "difficult_direct_answer": true, "rationales": ["A child is wearing skis with the ends pointed up. skis point up to push snow as someone skis.", "The skis are pushing.", "This will let snow move to the side and the skier face less resistance."], "image": "train2014/COCO_train2014_000000527353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365261, "question_id": "kaPURhhBZB9LwwXHhbStuS", "question": "What shape are the roofs?", "choices": ["triangle", "square", "round", "hexagon"], "correct_choice_idx": 0, "direct_answers": ["half triangle", "long", "sloped", "v shaped", "triangle", "triangle", "triangle", "peaked", "triangle", "triangular"], "difficult_direct_answer": false, "rationales": ["The roofs come to a point like a triangle does.", "They are in a triangle shape.", "The roofs are triangles."], "image": "train2014/COCO_train2014_000000365261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9615, "question_id": "kaPjZT6AGxQiNuWDXebSmk", "question": "What kind of vehicles are the three with flashing lights?", "choices": ["ambulance", "security", "taxis", "police"], "correct_choice_idx": 3, "direct_answers": ["police", "police cars", "police car", "police", "police", "police cars", "police", "police cars", "police", "police cars"], "difficult_direct_answer": false, "rationales": ["In the picture the vehicles have flashing lights mounted on the top. in most cities, these vehicles are used by the police.", "The three vehicles with flashing lights are police cars.", "The vehicles are police ones."], "image": "train2014/COCO_train2014_000000009615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23361, "question_id": "kaXmVom9RHKjMB6A8qGPpH", "question": "What type of crossing is this?", "choices": ["animal", "train", "school", "bicycle"], "correct_choice_idx": 3, "direct_answers": ["crosswalk", "pedestrian", "crosswalk", "crosswalk", "bicycle", "bicycle", "bike", "bike", "walking", "bike crossing"], "difficult_direct_answer": false, "rationales": ["The crossing is for bikes.", "The traffic light is displaying a green two-wheeled vehicle.", "The lines are narrow."], "image": "train2014/COCO_train2014_000000023361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561570, "question_id": "kaYKfhrgvts6akJVsVPA7g", "question": "Why are the lights and images strange here?", "choices": ["too bright", "over-exposure", "excited cameraman", "broken camera"], "correct_choice_idx": 1, "direct_answers": ["exposure", "yellow", "glaring", "picture effects", "exposure", "reflection", "glare", "transparent", "very glaring", "over-exposure"], "difficult_direct_answer": true, "rationales": ["The lights are overexposed.", "Overexposure can cause lights to display in an odd manner.", "The lights are sparkling."], "image": "val2014/COCO_val2014_000000561570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251922, "question_id": "kaa8QgUAHHui6zUiXTVwDj", "question": "How was the man able to get the plastic bags he is carrying?", "choices": ["by shopping", "by stealing", "by dodging", "by weaving"], "correct_choice_idx": 0, "direct_answers": ["made purchases", "shopping", "bus", "went shopping", "money", "loading", "purchasing", "by shopping", "door", "grocery shopping"], "difficult_direct_answer": true, "rationales": ["The bags are advertising the store name that it was bought from.", "When you buy things at the store they place them into bags for you to make it easier to carry multiple things at once.", "The man was given the bags for buying items."], "image": "train2014/COCO_train2014_000000251922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239157, "question_id": "kabECd646LGraJLRzMVAtn", "question": "What is the green sign for?", "choices": ["building name", "advertisement", "warning sign", "street identification"], "correct_choice_idx": 3, "direct_answers": ["street directions", "street name", "road direction", "directions", "city name", "directions", "directions", "street identification", "directions", "highway"], "difficult_direct_answer": false, "rationales": ["The signs help drivers know where to go.", "It lets people know where this road is", "It has a street name on the sign."], "image": "val2014/COCO_val2014_000000239157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206492, "question_id": "kaw94iLGCBUFUEzMWmdvoR", "question": "Why have these people sat down?", "choices": ["draw", "work", "eat", "paint"], "correct_choice_idx": 2, "direct_answers": ["eat", "eat", "to eat", "eat", "eating", "to eat", "to eat", "restaurant eating", "for food", "to eat"], "difficult_direct_answer": false, "rationales": ["The people want to eat the meal.", "There is food on the table and on plates in front of the person which is consistent with answer a.", "These people are sitting in chairs at a table, and food can be seen on the table. people usually eat when they are sitting at a table with food."], "image": "train2014/COCO_train2014_000000206492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316336, "question_id": "kb4Lz5uTea3eNUEfnyYcUj", "question": "What is not allowed in this establishment?", "choices": ["booze", "smoking", "children", "screaming"], "correct_choice_idx": 1, "direct_answers": ["smoking", "cigarettes", "cigarettes", "smoking", "smoking", "cigarettes", "smoking", "smoking", "smoking", "smoking"], "difficult_direct_answer": false, "rationales": ["There is a cigarette with a red circle and line through it", "The sign above the cash register indicates what is not allowed.", "There is a sign with a cigarette crossed out, indicating that it is not allowed."], "image": "train2014/COCO_train2014_000000316336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419609, "question_id": "kbPnUGp2czebphixfGwPjF", "question": "Which track will passengers be unable to access should a train arrive on it?", "choices": ["left", "middle", "upper", "right"], "correct_choice_idx": 1, "direct_answers": ["right", "furthest", "middle", "front tracks", "right one", "right", "right", "right", "second", "right track"], "difficult_direct_answer": false, "rationales": ["Due to the distance of the platform, it would be very hard to reach a train in the middle.", "The passengers won't be able to reach the middle.", "The train will arrive closest to the station, which would be the track closest to the edge and to the left."], "image": "train2014/COCO_train2014_000000419609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261788, "question_id": "kbZUUZ7zLCHmSxERXzNFKx", "question": "What is the player going to do?", "choices": ["swing", "juggle", "dribble", "run"], "correct_choice_idx": 0, "direct_answers": ["swing", "swing", "hit tennisball", "forehand shot", "hit ball", "hit", "hit ball", "hit", "return volley", "hit ball"], "difficult_direct_answer": false, "rationales": ["The player is taking a swing.", "They have their arm raised and the other options don't match tennis.", "The player is holding the racket in his hand and the ball is coming at him."], "image": "train2014/COCO_train2014_000000261788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45829, "question_id": "kbd4rQVn8UCVfGL5gU9Pbr", "question": "What type of store are the men most likely standing in?", "choices": ["educational", "video games", "kids toys", "home appliances"], "correct_choice_idx": 1, "direct_answers": ["videogame store", "gaming stores", "game store", "collectibles", "game", "game", "game store", "video games", "video game", "game store"], "difficult_direct_answer": false, "rationales": ["You can see mario and donkey kong. a wii is sitting on the table.", "There are game characters in the room", "The video game controllers, consoles, posters and stuffed animals popular video game characters allows us to conclude this store sells video games."], "image": "train2014/COCO_train2014_000000045829.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220382, "question_id": "kbjnHAmoPt5wiZ2Kf2nRrR", "question": "Why is the boy holding his hands out?", "choices": ["to dance", "to wave", "to spin", "to balance"], "correct_choice_idx": 3, "direct_answers": ["balance", "to balance", "for balance", "for balance", "balance", "balance", "balance", "balance", "balance", "balance"], "difficult_direct_answer": false, "rationales": ["The boy needs balance.", "The boy wants to stand on his snowboard.", "He is holding his hands out to keep from falling."], "image": "val2014/COCO_val2014_000000220382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5673, "question_id": "kc4PXMaDNVaQUgBHo9f5Ty", "question": "Where is the man located?", "choices": ["mall", "beach", "park", "playground"], "correct_choice_idx": 0, "direct_answers": ["sidewalk", "city", "store", "store", "clothing center", "outside", "mall", "outside store", "clothing store", "shopping center"], "difficult_direct_answer": true, "rationales": ["The man is at a mall.", "The man is near a clothing store and a virgin store.", "The man is standing in front of a store display with mannequins. mannequin store window displays are common in malls."], "image": "val2014/COCO_val2014_000000005673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281910, "question_id": "kcHe9LWDjpouPEoYC3g8UF", "question": "What is prohibited when traveling into the right lane?", "choices": ["snoozing", "looking", "exiting", "entering"], "correct_choice_idx": 3, "direct_answers": ["nothing", "turning", "turning left", "entry", "entering", "going straight", "going straight", "exiting", "cars", "straight"], "difficult_direct_answer": true, "rationales": ["Each sign near the right lane has a red circle and a white line. this indicates that a vehicle should not go into the right lane.", "No entering is allowed.", "There is a line circle with a line through it so you cannot go in."], "image": "train2014/COCO_train2014_000000281910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149268, "question_id": "kcdvfQjYd3mbrSrVhsvJbL", "question": "How does the woman direct here kite and control it?", "choices": ["magic", "drone", "string", "wand"], "correct_choice_idx": 2, "direct_answers": ["lines", "hold string", "string", "kite line", "with string", "string", "by string", "string", "move arms", "string"], "difficult_direct_answer": false, "rationales": ["This is attached to the kite", "The woman has a kite string.", "That's how all kites are controlled by a person."], "image": "val2014/COCO_val2014_000000149268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486987, "question_id": "kcw9Zua4vanPV6FmdguadM", "question": "What culture would be representative of this area?", "choices": ["polish", "italian", "asian", "native american"], "correct_choice_idx": 2, "direct_answers": ["chinese", "urban", "asian", "asian", "asia", "asian", "china", "asian", "asian", "asian"], "difficult_direct_answer": false, "rationales": ["There are asian people at the market.", "The text on the signs is in chinese and similar languages.", "Because of the asian language written on the cardboard, which shows that area has asian people."], "image": "val2014/COCO_val2014_000000486987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132773, "question_id": "kd8EksH64LBbXX8waFmuJS", "question": "What number is next to Q on the sign?", "choices": ["three", "eight", "five", "ten"], "correct_choice_idx": 1, "direct_answers": ["eight", "eight", "eight", "eight", "eight", "eight", "eight", "eight", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["The number appears next to letter q on the black sign above the white truck.", "The number 8 is shown.", "The number is 8."], "image": "train2014/COCO_train2014_000000132773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577835, "question_id": "keAMRsbgFXpvmtkEGe85Cg", "question": "What type of truck is this?", "choices": ["model", "ladder", "commercial", "passenger"], "correct_choice_idx": 2, "direct_answers": ["commercial", "semi", "tractor trailer", "coal", "transport", "trash truck", "garbage truck", "dump truck", "semi", "truck"], "difficult_direct_answer": true, "rationales": ["This truck is massive and a regular driver would not choose it. the name of the company is on the front.", "The truck is for commercial purposes.", "The truck has a company name and company logos on it."], "image": "train2014/COCO_train2014_000000577835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195826, "question_id": "keWWh6s4kP7qPDUqznMJXW", "question": "Why are the people holding signs?", "choices": ["to protest", "to mock", "to decorate", "to celebrate"], "correct_choice_idx": 0, "direct_answers": ["protesting", "protest", "to protest", "to protest", "protest", "protesting", "protest", "protesting he", "protest", "protest"], "difficult_direct_answer": false, "rationales": ["The people want prop 8 to be repealed and are marching with signs to bring the issue to the attention of the public and the courts.", "The people are advocating for prop 8 to be repealed.", "To protest to stop the hate."], "image": "train2014/COCO_train2014_000000195826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553563, "question_id": "keYaJBfvKqQWKrJdBPbQsA", "question": "Who has the ball?", "choices": ["batter", "outfielder", "catcher", "pitcher"], "correct_choice_idx": 2, "direct_answers": ["catcher", "catcher", "catcher", "pitcher", "umpire", "catcher", "catcher", "catcher", "catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["The catcher is at the mound.", "The batter did not make contact, so the pitcher can't have the ball and neither can the outfielder. of course the batter would never have the ball.", "The catcher has the ball."], "image": "train2014/COCO_train2014_000000553563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457537, "question_id": "keujvWdDLkhWa96C7Ygin8", "question": "What country does this plane belong to?", "choices": ["uganda", "france", "poland", "canada"], "correct_choice_idx": 1, "direct_answers": ["france", "france", "usa", "france", "france", "france", "france", "ukraine", "france", "usa"], "difficult_direct_answer": false, "rationales": ["The plane is a dassault mirage 2000c. there is an armed de l'air marking on the tail.", "Looks like it's from france.", "The writing on the tail is not in english or polish. the roundel is not canadian."], "image": "train2014/COCO_train2014_000000457537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581401, "question_id": "kexVR92PeD6jkqLZYDoCd7", "question": "How many wheels must vehicles in the left lane shown here as we face it have?", "choices": ["none", "six", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "4 wheels", "two", "two", "two", "two", "one", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two wheels on motorbikes.", "They are bikes and motorcycles.", "This is a motorbike lane only where motorbikes be at."], "image": "val2014/COCO_val2014_000000581401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542637, "question_id": "kf9xcY9hF2fCrubtsxwnmQ", "question": "What other type of large animal might be found in this environment?", "choices": ["dog", "cow", "elephant", "tiger"], "correct_choice_idx": 1, "direct_answers": ["cows", "cow", "good", "cow", "cows", "cow", "cows", "cow", "cows", "cow"], "difficult_direct_answer": false, "rationales": ["These are grazing animals as well, so it makes sense they'd be in the same place.", "This field of fenced in horses and chickens would be found on a farm. a cow is something often found at such a farm.", "This is farmland"], "image": "val2014/COCO_val2014_000000542637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61854, "question_id": "kf9z9pr673oBXwz8hyCMwJ", "question": "What are the yellow objects on the flat plate?", "choices": ["bananas", "onions", "peppers", "fries"], "correct_choice_idx": 3, "direct_answers": ["fries", "fries", "fries", "fries", "fries", "fries", "fries", "fries", "french fries", "french fries"], "difficult_direct_answer": false, "rationales": ["Crinkle cut potatoes are on a plate. fries are crinkle cut.", "A meal is on a table including crinkle cut potatoes.", "The yellow objects are long, thin food that is made from potatoes. they are usually served salted and often with ketchup."], "image": "train2014/COCO_train2014_000000061854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474067, "question_id": "kfxgRSaCeaxSVRvs97c4Mh", "question": "Who is going the fastest?", "choices": ["jogger", "motorcycle rider", "bicycle rider", "walker"], "correct_choice_idx": 1, "direct_answers": ["motorcycle rider", "background", "motorcycle", "motorcyclist", "motorcycle", "man", "bike rider", "motorbike rider", "man", "motorcycle"], "difficult_direct_answer": false, "rationales": ["The bike has a motor to make it go faster.", "The motorcycle is powered by an engine.", "A person on a motorized bike would go faster than people on foot."], "image": "train2014/COCO_train2014_000000474067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48185, "question_id": "kgCqAsgixur4CgNVYGo2gA", "question": "What is behind the two tall mirrors?", "choices": ["bathroom", "bedroom", "closet", "hallway"], "correct_choice_idx": 2, "direct_answers": ["closet", "shower", "closet", "clothes", "closet", "closet", "door", "closet", "closet", "shower"], "difficult_direct_answer": false, "rationales": ["The mirrors have a closet behind them.", "Two doors are mirrored on one side and have handles to open them.", "Most master suites have an attached closet by the bathroom."], "image": "val2014/COCO_val2014_000000048185.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162104, "question_id": "kgKvtSE5op3pkTkvqj6isw", "question": "This person is laying near what?", "choices": ["building", "sand", "zebras", "fence"], "correct_choice_idx": 1, "direct_answers": ["beach", "beach", "water", "ocean", "beach", "beach", "umbrella", "beach", "sand", "ocean"], "difficult_direct_answer": false, "rationales": ["They are on a rocky beach", "The person is on a beach.", "The person is on a beach."], "image": "train2014/COCO_train2014_000000162104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440991, "question_id": "kgkcALdEG4jPUYVB9gfwW6", "question": "What are the dark vegetables next to the carrots?", "choices": ["brets", "onions", "potatoes", "yams"], "correct_choice_idx": 0, "direct_answers": ["beets", "beets", "beets", "brocolli", "brets", "turnips", "potatoes", "radishes", "potatoes", "beet"], "difficult_direct_answer": false, "rationales": ["These are beets next to it.", "The beets are dark red.", "A pile of beets, onions, and carrots are laid out on a green surface."], "image": "val2014/COCO_val2014_000000440991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343610, "question_id": "kgoFdjyLUTBomdCHPNjePb", "question": "How intelligent would an animal have to be to do this?", "choices": ["very intelligent", "moderately intelligent", "somewhat intelligent", "unintelligent"], "correct_choice_idx": 0, "direct_answers": ["very", "very intelligent", "very", "very", "smart", "very intelligent", "very", "very intelligent", "very", "very"], "difficult_direct_answer": false, "rationales": ["A very smart animal would do.", "An animal would have to be of high intelligence to perform this.", "Elephants are smart."], "image": "train2014/COCO_train2014_000000343610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518127, "question_id": "kh8Ju3P35AzqMsipXyuzc4", "question": "What else would you probably see in this giant structure with oversized balls?", "choices": ["soccer match", "golf match", "swimming competition", "theater play"], "correct_choice_idx": 0, "direct_answers": ["soccer game", "soccer game", "athletes", "soccer ball", "soccer match", "beach", "soccer game", "person inside", "football game", "soccer"], "difficult_direct_answer": false, "rationales": ["The structure is a soccer stadium.", "It's a massive green field with room for many spectators.", "It is in a soccer stadium with fake grass replicating the soccer pitch or field."], "image": "val2014/COCO_val2014_000000518127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579589, "question_id": "khLDi7ffkYfXysdx4jfJAn", "question": "What type of bag does the man have with him?", "choices": ["messenger", "briefcase", "backpack", "tote"], "correct_choice_idx": 1, "direct_answers": ["briefcase", "briefcase", "briefcase", "briefcase", "briefcase", "briefcase", "briefcase", "briefcase", "briefcase", "briefcase"], "difficult_direct_answer": false, "rationales": ["The bag is a briefcase.", "The man has a briefcase.", "The man is using a rectangular bag that is black."], "image": "train2014/COCO_train2014_000000579589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376677, "question_id": "khXwK7Dcyoz9f5ge93VS3j", "question": "The area on the left with sign displays is most likely what kind of location?", "choices": ["shopping mall", "strip mall", "market", "rest stop"], "correct_choice_idx": 3, "direct_answers": ["gas station", "rest stop", "rest stop", "gas station", "shopping center", "mall", "strip mall", "strip mall", "gas station", "rest stop"], "difficult_direct_answer": false, "rationales": ["The sign indicates that it has the amenities of a rest stop.", "The area is a rest stop.", "Rest stops are usually located on the side of major roads."], "image": "val2014/COCO_val2014_000000376677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512911, "question_id": "khYNbwJfsov7eJRDfGNp5N", "question": "Why does the child sit in the suitcase?", "choices": ["shipping", "changing clothes", "hiding", "photo pose"], "correct_choice_idx": 3, "direct_answers": ["relaxing", "playing around", "posing", "playing", "resting", "for fun", "pictures", "for fun", "photo pose", "for fun"], "difficult_direct_answer": false, "rationales": ["The child is taking a photo.", "The child is posing for a camera, with props in his character, for the purposes of photography.", "A kid is posing from a suitcase while holding a newspaper."], "image": "val2014/COCO_val2014_000000512911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191625, "question_id": "kha3S3LP4kAsb7CaJaEpJz", "question": "What is the name of the path they're on?", "choices": ["tracks", "slope", "river", "country cross"], "correct_choice_idx": 1, "direct_answers": ["slope", "ski run", "snowboard trail", "bunny trail", "ski slope", "slope", "ski slope", "slope", "downhill skiing", "ski run"], "difficult_direct_answer": false, "rationales": ["The name is a slope.", "The people are not on a body of water. they are skiing and snowboarding down a hill.", "These snowboarders and skiers are descending down the slope."], "image": "train2014/COCO_train2014_000000191625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33341, "question_id": "ki35eqPKHCM4PT6E2BxJEy", "question": "Which root vegetable is on the plate?", "choices": ["carrot", "beet", "corn", "rutabaga"], "correct_choice_idx": 0, "direct_answers": ["carrot", "carrot", "carrot", "carrot", "carrot", "carrot", "carrot", "carrot", "carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["The green plant is rutabaga as its color shows.", "It is shredded and orange.", "The root vegetable is orange, not white, red, or yellow."], "image": "train2014/COCO_train2014_000000033341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101892, "question_id": "kiAS69S3V9NiwGUQgCbTKq", "question": "What would be the most common clothing to see in this setting?", "choices": ["wedding dress", "swimming costume", "school uniform", "morning suit"], "correct_choice_idx": 1, "direct_answers": ["bathing suit", "swimsuits", "swimming costume", "swimsuit", "swimming suit", "swimsuit", "bathing suit", "swimsuit", "bikini", "swimsuit"], "difficult_direct_answer": false, "rationales": ["The setting shows a beach and sand which is where people normally wear swimming costumes.", "It is near the beach and many people go to enjoy being in the water.", "People often wear swimwear at the beach."], "image": "train2014/COCO_train2014_000000101892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2349, "question_id": "kifRUqswftYwyPiBAJrjfg", "question": "What is the cat sitting on?", "choices": ["car", "rug", "cushion", "carpet"], "correct_choice_idx": 2, "direct_answers": ["pillow", "throw pillow", "pillow", "pillow", "pillow", "pillow", "pillow", "cushion", "throw pillow", "pillow"], "difficult_direct_answer": false, "rationales": ["The cat is sitting on a pillow.", "The cat is on a pillow.", "It's on a pillow."], "image": "train2014/COCO_train2014_000000002349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107156, "question_id": "kijH5mcfR4smRGLj5sTVuL", "question": "Nintendo is manufacturer of what console?", "choices": ["air pods", "remote", "wii remote", "mobile"], "correct_choice_idx": 2, "direct_answers": ["wii", "wii remote", "wii", "wii", "wii", "wii", "wii", "wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["Nintendo creates the wii.", "The nintendo company makes the wii game.", "She has a wii."], "image": "train2014/COCO_train2014_000000107156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412908, "question_id": "kj4BSr4H8X9q7Y6h2K9sYQ", "question": "What type of shop is this?", "choices": ["body", "gift", "hair", "auto"], "correct_choice_idx": 1, "direct_answers": ["boutique", "gift", "gift", "garden", "garden", "plant", "greenhouse", "garden", "gardening", "hardware"], "difficult_direct_answer": false, "rationales": ["The shelves are decorated with random gifts and small souvenirs, which indicates that this location is a gift shop.", "The other options don't match the items in the background.", "The items in the store look like what you buy others as gifts or presents."], "image": "train2014/COCO_train2014_000000412908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436878, "question_id": "kjaEaL3jFMnWegmBMbfeRP", "question": "What language is shown on the front of the train?", "choices": ["english", "german", "arabic", "chinese"], "correct_choice_idx": 1, "direct_answers": ["german", "german", "german", "german", "german", "english", "german", "english", "german", "german"], "difficult_direct_answer": false, "rationales": ["The language is german.", "The words are in this language", "The language on the train is german given the lettering."], "image": "train2014/COCO_train2014_000000436878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71719, "question_id": "kjnCNgGmRQhKFmheSwTQkD", "question": "Why do boats have portholes?", "choices": ["superstition", "tradition", "light/fresh air", "style"], "correct_choice_idx": 2, "direct_answers": ["to see", "fresh air", "to see", "visibility", "see outside", "see outside", "more buoyancy", "light/fresh air", "visibility", "windows"], "difficult_direct_answer": false, "rationales": ["The round openings give light and fresh air.", "The boats need fresh air.", "The porthole's on the long covered part of this boat serve the same functions as windows."], "image": "train2014/COCO_train2014_000000071719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166849, "question_id": "kjrug9NamGUCVozoLJGEm2", "question": "What is this horse's color pattern called?", "choices": ["paint", "dalmatian", "appaloosa", "palomino"], "correct_choice_idx": 0, "direct_answers": ["cow print", "paint", "pinto", "white", "black white", "spots", "paint", "paint", "piebald", "tuxedo"], "difficult_direct_answer": false, "rationales": ["Since the colors look like splashes of a.", "A paint horse has dots in their pattern.", "The horse is primarily black with splashes of white mixed in. in the art and horse world, this color pattern is known as paint."], "image": "train2014/COCO_train2014_000000166849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295057, "question_id": "kjyYuFGfV8EVHeMuiJjqjd", "question": "What are these mugs sitting on top of?", "choices": ["washer", "refrigerator", "sink", "cabinet"], "correct_choice_idx": 1, "direct_answers": ["counter", "refrigerator", "fridge", "one", "counter", "counter", "each other", "each other", "counter", "counter"], "difficult_direct_answer": false, "rationales": ["The mugs are on a white fridge.", "The bottom of the mugs has a similar top to the fridge.", "The hinge and sea tight door of a fridge can be seen on the bottom and side of the frame."], "image": "train2014/COCO_train2014_000000295057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59683, "question_id": "kkWAmmv9zkMm6PMm4ggrp5", "question": "What clothing item in white/grey are rolled up?", "choices": ["pants", "socks", "shirts", "underwear"], "correct_choice_idx": 1, "direct_answers": ["socks", "socks", "socks", "shoes", "socks", "bedsheet", "socks", "socks", "socks", "socks"], "difficult_direct_answer": false, "rationales": ["Socks are rolled.", "Small white clothing items with gray on the ends are rolled up near a suitcase.", "There are white and grey socks on the bed."], "image": "train2014/COCO_train2014_000000059683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523314, "question_id": "kkbfyaEpo9WG8wh6EEDnbv", "question": "Which of the four seasons of the year is it?", "choices": ["winter", "spring", "autumn", "summer"], "correct_choice_idx": 0, "direct_answers": ["winter", "winter", "winter", "winter", "winter", "charisma", "winter", "winter", "winter", "charisma"], "difficult_direct_answer": false, "rationales": ["There is a decorated christmas tree in the background which is consistent with answer a in the northern hemisphere.", "There is a lit christmas tree in the room which most likely was on display during the month of december.", "It's winter."], "image": "train2014/COCO_train2014_000000523314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179094, "question_id": "kkvNE7B2JGzv8mCd6Qy7So", "question": "What are the blue items used for?", "choices": ["catching fish", "rainy days", "digging ditches", "shoveling snow"], "correct_choice_idx": 1, "direct_answers": ["shade", "blocking rain", "heat", "provide shade", "rainy days", "repel rain", "rain protection", "keeping dry", "rain shielding", "rain protection"], "difficult_direct_answer": true, "rationales": ["The blue items help keep rain out of people's eyes and hair.", "Umbrellas are used for covering the rain.", "These are umbrellas which people use to keep dry when it is raining."], "image": "val2014/COCO_val2014_000000179094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259676, "question_id": "kkxzrqog7PP6YNpTCEbSPo", "question": "What might be placed inside the object being used for seating here?", "choices": ["rocks", "animals", "files", "clothing"], "correct_choice_idx": 3, "direct_answers": ["laundry", "clothing", "clothes", "laundry", "clothes", "laundry", "washer", "clothes", "clothes", "clothes"], "difficult_direct_answer": false, "rationales": ["You would put clothes in it to dry", "The child is sitting on a washing machine. this is used to clean dirty laundry.", "The child is seating on top of a washing machine which is used for cleaning laundry."], "image": "train2014/COCO_train2014_000000259676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239180, "question_id": "kmCiWEzTKn5J6tpTeCTTXG", "question": "What is the tray on the coffee table filled with most likely as decoration?", "choices": ["fruit", "candles", "vegetables", "blocks"], "correct_choice_idx": 0, "direct_answers": ["fruit", "fruit", "fruit", "fruit", "fruit", "fruit", "fruit", "fruit", "fruit", "fruit"], "difficult_direct_answer": false, "rationales": ["There is fruit in the room.", "The tray has bananas in it.", "The tray has fruit."], "image": "train2014/COCO_train2014_000000239180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151826, "question_id": "kmNtZmtMZcHMf5BoQku2a3", "question": "What food is that bun normally used for?", "choices": ["french fries", "chicken strips", "hot dogs", "hamburgers"], "correct_choice_idx": 2, "direct_answers": ["hotdog", "hot dog", "hotdog", "hotdog", "hot dog", "hot dog", "hot dogs", "hot dogs", "hotdog", "hotdog"], "difficult_direct_answer": false, "rationales": ["She is eating some kind of cased sausage.", "The long narrow buns usually house long narrow meat products.", "This type of round cylindrical buns normally are used with hotdogs."], "image": "val2014/COCO_val2014_000000151826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579267, "question_id": "kmtu5NeXcmvrQVVUTEZxdu", "question": "What is the woman wearing on her feet?", "choices": ["high heels", "crocs", "sandals", "sneakers"], "correct_choice_idx": 2, "direct_answers": ["sandals", "sandals", "flats", "sandals", "flats", "shoes", "sandals", "sandals", "sandals", "ballet flats"], "difficult_direct_answer": false, "rationales": ["The woman is wearing sandals.", "The woman on the sidewalk is wearing white sandals on her feet.", "The shoes are closed toed with a heel in back."], "image": "train2014/COCO_train2014_000000579267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312958, "question_id": "kmzc9Jbw5b3MjAbZ4sufB3", "question": "Who might ride on this horses cart next?", "choices": ["policeman", "sunday driver", "blacksmith", "tourist"], "correct_choice_idx": 3, "direct_answers": ["human", "tourists", "passenger", "passengers", "anyone", "passengers", "tourist", "tourists", "tourist", "human"], "difficult_direct_answer": false, "rationales": ["People who want to see the city.", "Someone who wants to see the city.", "A horse pulling a carriage is waiting on the side of the road. tourists take horse and carriage rides."], "image": "train2014/COCO_train2014_000000312958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4535, "question_id": "kn4TmRu9pxoRZiHVBvL2G2", "question": "What sort of event is being attended by the costumed riders?", "choices": ["steeple chase", "parade", "renaissance faire", "sideshow"], "correct_choice_idx": 2, "direct_answers": ["medieval reenactment", "renaissance fair", "parade", "joust", "jousting", "renaissance festival", "faire", "renaissance faire", "wedding", "festival"], "difficult_direct_answer": true, "rationales": ["The event is a faire.", "There are horses. the people riding them are wearing a style of clothing that was popular hundreds of years ago.", "The event looks like medieval times. the men are wearing armor."], "image": "train2014/COCO_train2014_000000004535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293589, "question_id": "knBwhmLHnS37XuoFUS7Cvf", "question": "What type of hat is the man wearing?", "choices": ["baseball", "sombrero", "fedora", "tippy"], "correct_choice_idx": 1, "direct_answers": ["sombrero", "cowboy", "sombrero", "sombrero", "sombrero", "sombrero", "sombrero", "sombrero", "cowboy", "sombrero"], "difficult_direct_answer": false, "rationales": ["A man is wearing a hat with a large brim around it.", "A sombrero is a hat with a wide brim and pointed top. this man is also wearing the type of pants and top that people wear along with sombreros. it is a complete outfit.", "The man has a sombrero on."], "image": "train2014/COCO_train2014_000000293589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397132, "question_id": "knRwUyruRmwg7pjMLekZZ4", "question": "What will be loaded on the Load Master?", "choices": ["trash", "cars", "brick", "dresses"], "correct_choice_idx": 0, "direct_answers": ["trash", "trash", "trash", "garbage", "trash", "trash", "garbage", "garbage", "garbage", "trash"], "difficult_direct_answer": false, "rationales": ["The truck has a hydraulic lift on the back.", "The load master truck has the look of a trash truck, which is characterized by the large open rear attachment.", "This kind of truck is a garbage truck, and usually carries only one specific load."], "image": "train2014/COCO_train2014_000000397132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469139, "question_id": "knXHDCmkZWNuoSRWEtDvKP", "question": "Traffic is blocked in which direction?", "choices": ["to left", "no where", "sharp right", "ahead"], "correct_choice_idx": 2, "direct_answers": ["oncoming", "sharp right", "south", "south", "towards photographer", "both", "forward", "towards camera", "left", "right"], "difficult_direct_answer": true, "rationales": ["Traffic is blocked on the right.", "You cannot turn right.", "Cars on the right can be seen stopped in the street."], "image": "train2014/COCO_train2014_000000469139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579883, "question_id": "knYzaQ5siSLA8mZn8oKBM3", "question": "What is the man in yellow pants trying to do?", "choices": ["ascend", "attack", "retreat", "descend"], "correct_choice_idx": 0, "direct_answers": ["walking", "snowboard", "hike mountain", "ascend", "walk uphill", "walk", "climb hill", "hike uphill", "hike uphill", "walk"], "difficult_direct_answer": false, "rationales": ["The man appears to be walking up the hill with a snowboard on his back. this activity is common for those who want to hike up a hill and then descend by skiing or snowboarding.", "The man in yellow pant's is mid stride against the rising upcoming slope.", "The man goes upward."], "image": "train2014/COCO_train2014_000000579883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442286, "question_id": "kncNTwqstaDEpdbccXQcZR", "question": "When it is dark at night what will the boats use as navigation?", "choices": ["moonlight", "flashlights", "radar", "lighthouse"], "correct_choice_idx": 3, "direct_answers": ["lighthouse", "light house", "lights", "lights", "lighthouse", "lighthouse", "lighthouse", "lighthouse", "lighthouse", "lighthouse"], "difficult_direct_answer": false, "rationales": ["This helps sailors see on stormy nights", "These buildings are usually used to visually warn boats of dangers.", "The boats' navigation will be made easier by the light from the lighthouse."], "image": "val2014/COCO_val2014_000000442286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401842, "question_id": "knddUa9i2UpzCCXhHjYtNe", "question": "How are the two people holding an umbrella related to each other?", "choices": ["couple", "coworkers", "strangers", "siblings"], "correct_choice_idx": 0, "direct_answers": ["partners", "married", "sisters", "couple", "couple", "love birds", "lovers", "lovers", "lovers", "couple"], "difficult_direct_answer": false, "rationales": ["The two people holding the umbrella together look like life partners.", "They are in a romantic embrace, suggesting they are an intimate couple.", "The closeness of the two people would indicate they are a couple."], "image": "train2014/COCO_train2014_000000401842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532194, "question_id": "koRCHH8EsSgonJn4nHEeEu", "question": "What is the elbow in the background leaning on?", "choices": ["table", "wall", "chair", "knee"], "correct_choice_idx": 3, "direct_answers": ["lap", "knee", "child", "knee", "leg", "knee", "knee", "leg", "knee", "knee"], "difficult_direct_answer": false, "rationales": ["It's leaning on his knee", "The person is leaning forward some.", "Someone is siting with their elbow on their knee."], "image": "train2014/COCO_train2014_000000532194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467333, "question_id": "koTfZNz2vJwfWRVNGYzsJt", "question": "How is the cat on the couch illuminated?", "choices": ["moonlight", "sunlight", "led light", "fluorescent light"], "correct_choice_idx": 1, "direct_answers": ["light", "television", "sunlight", "lamp", "flash", "light", "light", "natural light", "black white", "sunlight"], "difficult_direct_answer": false, "rationales": ["It appears that there are windows very near the cat so it's easy to assume that it is indeed sunlight that is providing illumination.", "It looks like the couch is by a window, so the sunlight is coming inside.", "The lighting is too clear to be artificial."], "image": "train2014/COCO_train2014_000000467333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284560, "question_id": "kotFDqXjkCmtbMiE3fiKPt", "question": "What is the food being eaten with?", "choices": ["fork", "chopsticks", "fingers", "spoon"], "correct_choice_idx": 0, "direct_answers": ["fork", "fork", "fork", "utensils", "fork", "rice", "fork", "rice", "fork", "rice"], "difficult_direct_answer": false, "rationales": ["The utensil in option a has a handle and several tines, thus fitting the description of the item mentioned.", "This is obvious given that it's the utensil shown to the left of the plate.", "This utensil is used to easily pick up food with so you don't get your hands dirty."], "image": "val2014/COCO_val2014_000000284560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449866, "question_id": "kp2GAugo4Rx4c9PpeMQsak", "question": "What surface are the people playing on?", "choices": ["indoor hard", "clay", "outdoor hard", "grass"], "correct_choice_idx": 1, "direct_answers": ["clay", "clay", "asphalt", "clay", "tennis court", "play ground", "court", "tennis court", "clay", "clay"], "difficult_direct_answer": false, "rationales": ["These tennis court surfaces are made from clay.", "The surface of the tennis court is called a clay court.", "The court is made of clay."], "image": "train2014/COCO_train2014_000000449866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12209, "question_id": "kp8LK2NqAQvs6TiV9VdSck", "question": "Why is the man using a white object on the end of the bat?", "choices": ["to punish", "warming up", "to hit", "to sign"], "correct_choice_idx": 1, "direct_answers": ["training", "weight", "increase weight", "warm up", "warming up", "practicing", "practice swings", "warm up", "swing practice", "silencer"], "difficult_direct_answer": true, "rationales": ["The baseball player is off to the side of the field as he practices his swing with an attachment on his bat to help him.", "The man is warming up.", "The man isn't yet on the field, and the white object acts as a weight to prepare the batter."], "image": "val2014/COCO_val2014_000000012209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531151, "question_id": "kpG8NHQKmjFXGufJfKzNiH", "question": "What is in the water?", "choices": ["boat", "pier", "seal", "surfer"], "correct_choice_idx": 0, "direct_answers": ["boat", "boat", "waves", "sailboat", "waves", "sailboat", "waves", "waves", "waves", "boat"], "difficult_direct_answer": false, "rationales": ["While there are surfers technically in water on the beach, a boat can be seen clearly in the water in the background.", "A sail can be seen on the water past waves.", "A sailboat is in the water."], "image": "train2014/COCO_train2014_000000531151.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486415, "question_id": "kpHXYvEo6mqq446sHri8QH", "question": "What multicolored food items do the two bowls contain?", "choices": ["fruit", "vegetables", "grains", "legumes"], "correct_choice_idx": 0, "direct_answers": ["beans", "fruit", "fruit", "cereal", "fruit", "fruit", "fruit", "fruit", "fruits", "fruits"], "difficult_direct_answer": false, "rationales": ["The items are fruit.", "There are mangoes and strawberries.", "Fruit comes in many colours."], "image": "train2014/COCO_train2014_000000486415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489062, "question_id": "kpJeEWeWWwJiKbsydTYPKr", "question": "Why are the three women walking together?", "choices": ["keeping cold", "competing", "keeping warm", "racing"], "correct_choice_idx": 2, "direct_answers": ["umbrella", "friends", "protection", "it's windy", "wind protection", "warmth", "wind protection", "stay warm", "keeping warm", "unknown"], "difficult_direct_answer": true, "rationales": ["The woman want to stay huddled for warmth.", "The women stay warm.", "They are protecting each other from a cold wind and keeping each other warmer."], "image": "train2014/COCO_train2014_000000489062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43961, "question_id": "kpMdoW48utv4uFLtpuqq93", "question": "What did the man with an S on his shirt likely read when he was young?", "choices": ["bible", "poe", "comic books", "edith wharton"], "correct_choice_idx": 2, "direct_answers": ["comics", "comics", "superman", "comics", "comic books", "comics", "comics", "superman", "books", "superman"], "difficult_direct_answer": false, "rationales": ["The superman logo indicates he read comics.", "The s is the logo for superman, a famous icon for this medium.", "The man reads comics."], "image": "val2014/COCO_val2014_000000043961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15496, "question_id": "kpNn9H2c6fU95ju85Nxwrn", "question": "How are the people feeling in this moment?", "choices": ["happy", "fun loving", "amused", "annoyed"], "correct_choice_idx": 3, "direct_answers": ["happy", "cold", "annoyed", "scared", "very cooling", "sarcastically happy", "confused", "cool", "nervous", "cold"], "difficult_direct_answer": true, "rationales": ["Two people are forced to turn around before going down the slopes. based on body language and emotion they seem like they are tired of it.", "The people appear irritated.", "The people's expressions are angry and they're frowning."], "image": "train2014/COCO_train2014_000000015496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458139, "question_id": "kpNnT98RqfrnbMF6tnQbEA", "question": "What type of alcohol is on the table?", "choices": ["tequila", "vodka", "beer", "bourbon"], "correct_choice_idx": 3, "direct_answers": ["bourbon", "bourbon", "beer", "beer", "bourbon", "bourbon", "bourbon", "bourbon", "bourbon", "bourbon"], "difficult_direct_answer": false, "rationales": ["There is a bottle of virginia bourbon on the table.", "There is a bottle named virginia gentleman. it contains brown clearish liquid and has type of alcohol on it.", "Bourbon is in the bottle."], "image": "train2014/COCO_train2014_000000458139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366799, "question_id": "kprvyccGQfWgA8kFDbQAce", "question": "What meal is being served?", "choices": ["dinner", "afternoon tea", "lunch", "breakfast"], "correct_choice_idx": 0, "direct_answers": ["dinner", "dinner", "dinner", "dinner", "dinner", "dinner", "french fries", "french fries", "dinner", "dinner"], "difficult_direct_answer": false, "rationales": ["The sky is very dark.", "There are candles.", "There is wine so it appears to be dinner and the outside looks dark so it must be a later meal."], "image": "train2014/COCO_train2014_000000366799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525667, "question_id": "kpxzRFVJwRCiFM9pGYiW8m", "question": "What is the woman in the blue jacket standing on?", "choices": ["sofa", "folding chair", "chaise", "picnic table"], "correct_choice_idx": 3, "direct_answers": ["picnic table", "table seat", "picnic bench", "bench", "land", "bench", "picnic table", "picnic table", "table seat", "bench"], "difficult_direct_answer": false, "rationales": ["The woman is behind a picnic table.", "The woman is on a picnic table.", "The woman is on a bench for a table."], "image": "val2014/COCO_val2014_000000525667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579697, "question_id": "kq7TCd7UPexLHEAnLsHYNL", "question": "What general variety of fruit is shown?", "choices": ["pomme", "plum", "citrus", "avocado"], "correct_choice_idx": 2, "direct_answers": ["citrus", "citrus", "naval orange", "lemons", "citrus", "lemons", "lemons", "lemons", "oranges", "citrus"], "difficult_direct_answer": false, "rationales": ["Oranges are a citrus fruit.", "The yellow or orange colour and round shape is consistent with that of most citrus fruit like oranges.", "These fruits are yellow and look like lemons."], "image": "val2014/COCO_val2014_000000579697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456505, "question_id": "kqQJx5DD74WzcowA57FeCH", "question": "What is the speed limit on this stretch of road?", "choices": ["30", "45", "35", "25"], "correct_choice_idx": 3, "direct_answers": ["25", "25", "25 mph", "twenty five", "twenty five", "25 mph", "25", "25", "25", "twenty five"], "difficult_direct_answer": false, "rationales": ["The sign on the side of the road states \"speed limit\" and then the number.", "The speed limit sign clear states the limit is twenty-five mph.", "There is a sign on the side of the road"], "image": "train2014/COCO_train2014_000000456505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467126, "question_id": "kqQe3KUKHwhorJJSqiMEBp", "question": "Is it legal to take a right turn right ahead?", "choices": ["uturn only", "yes", "taxis only", "no"], "correct_choice_idx": 1, "direct_answers": ["yes", "yes", "no", "yes", "yes", "yes", "yes", "no", "yes", "yes"], "difficult_direct_answer": false, "rationales": ["It must be legal since a car is making the turn.", "The sign points to a location that goes that direction", "Yes you can turn right."], "image": "train2014/COCO_train2014_000000467126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482959, "question_id": "kqo2ZDJovqxFEopd3BpQaa", "question": "What's the capital city of his ancestral home?", "choices": ["limerick", "cork", "belfast", "dublin"], "correct_choice_idx": 3, "direct_answers": ["dublin", "dublin", "dublin", "unknown", "dublin", "dublin", "dublin", "dublin", "newyork", "dublin"], "difficult_direct_answer": false, "rationales": ["A man in an orange jacket and green tie is in the street with decorations for st. patrick's day all around.", "A man with a painted mustache and crazy orange jacket has several shamrocks on it.", "The man is dressed in festive st. patricks day attire which is celebrated in ireland. the capital of ireland is dublin."], "image": "train2014/COCO_train2014_000000482959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114239, "question_id": "kqy5KK3DoyRuXAampALLM2", "question": "What is she doing with the cat?", "choices": ["playing", "feeding", "photographing", "attacking"], "correct_choice_idx": 0, "direct_answers": ["playing", "feeding", "playing", "feeding", "playing", "playing", "playing", "feeding", "reflection", "feeding it"], "difficult_direct_answer": false, "rationales": ["The person appears to be showing the cat an item in her hand and trying to interact with the cat as one would do when they are playing.", "She's playing.", "The woman is playing with the cat."], "image": "val2014/COCO_val2014_000000114239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341902, "question_id": "krCbYiNUAhpUYzQX8XdCmL", "question": "What does the man here drink?", "choices": ["ale", "wine", "water", "beer"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The man has water.", "The man here is drinking bottled water.", "The liquid is clear."], "image": "train2014/COCO_train2014_000000341902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437778, "question_id": "krRCgjx5Vsa8a5GWWMghtS", "question": "In which country is this street located?", "choices": ["united kingdom", "italy", "france", "australia"], "correct_choice_idx": 0, "direct_answers": ["england", "united kingdom", "united kingdom", "brittany", "britain", "england", "united kingdom", "england", "united kingdom", "london"], "difficult_direct_answer": false, "rationales": ["The bus on the street has a flag for the united kingdom on the back which is where the street is located.", "This is in the uk", "The bus in the photo bears the flag of the united kingdom."], "image": "train2014/COCO_train2014_000000437778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521236, "question_id": "kracrjmuoft9mtiTy3EMby", "question": "What have the rocks near the water been covered in?", "choices": ["paint", "fish", "moss", "marker"], "correct_choice_idx": 2, "direct_answers": ["moss", "moss", "moss", "moss", "moss", "stones", "moss", "stones", "moss", "stones"], "difficult_direct_answer": false, "rationales": ["It's moss on the rocks.", "The rocks are green.", "The rocks have some green around it."], "image": "val2014/COCO_val2014_000000521236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547267, "question_id": "kreSQ2jEejuyaDKiYSbu6y", "question": "What will the man in red to next?", "choices": ["swing", "dribble", "dunk", "bat"], "correct_choice_idx": 0, "direct_answers": ["swing", "swing", "hit ball", "swing", "hit ball", "hit ball", "hit ball", "swing", "hit ball", "hit ball"], "difficult_direct_answer": false, "rationales": ["The man is playing tennis, not basketball or baseball. he is about to hit the ball.", "The ball is coming towards him and he wants to hit it back over the net to his opponent.", "He'll swing."], "image": "train2014/COCO_train2014_000000547267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380139, "question_id": "krkgqcuJZugHqxAqMsm6bu", "question": "Why are the cones orange?", "choices": ["arbitrary", "blending in", "beauty", "visibility"], "correct_choice_idx": 3, "direct_answers": ["high visibility", "safety", "visibility", "visibility", "path definition", "visibility", "safety", "visibility", "set boundary", "visibility"], "difficult_direct_answer": false, "rationales": ["Orange is an easy color to see.", "They are a bright color that does not blend in with any natural outdoor surroundings.", "Orange is a bright colour and helps to be seen."], "image": "train2014/COCO_train2014_000000380139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16418, "question_id": "krqjWggY4GVer8afttQwdp", "question": "What continent would this plane be from?", "choices": ["north america", "asia", "south america", "oceania"], "correct_choice_idx": 1, "direct_answers": ["asia", "singapore", "asia", "singapore", "asia", "asia", "asia", "asia", "japan", "asia"], "difficult_direct_answer": false, "rationales": ["It's from singapore airlines, which is based in china.", "Singapore is not in south america or north america; nor is it in oceania.", "The continent is asia."], "image": "train2014/COCO_train2014_000000016418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260033, "question_id": "ks2VCQxpTMg7fP4x2etmUH", "question": "The animal in the middle is spreading what?", "choices": ["seeds", "spray", "wings", "eggs"], "correct_choice_idx": 2, "direct_answers": ["wings", "wings", "wings", "wings", "wings", "wings", "wings", "wings", "wings", "wings"], "difficult_direct_answer": false, "rationales": ["The items on each side of the bird are wings.", "A bird stands with others with his wings spread out.", "The animal spreads wings."], "image": "train2014/COCO_train2014_000000260033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306992, "question_id": "ks3YmeeUbdqRBg8qv7ZT69", "question": "What is the big boat at the back doing?", "choices": ["patrolling", "public transportation", "touring", "goods transportation"], "correct_choice_idx": 0, "direct_answers": ["watching", "policing", "police monitoring", "patrolling", "keeping peace", "patrolling", "chilling", "searching", "patrolling", "chilling"], "difficult_direct_answer": false, "rationales": ["The big boat is the patrol.", "It is a police boat", "The big boat in the back has police on it that are on watch."], "image": "train2014/COCO_train2014_000000306992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79660, "question_id": "ks5NFU4Mwx8SvVTJSVEC3S", "question": "What are men doing?", "choices": ["walking", "no men", "riding elephants", "nothing"], "correct_choice_idx": 2, "direct_answers": ["riding", "riding elephants", "riding elephants", "riding elephants", "riding elephants", "riding elephants", "riding", "riding", "riding elephants", "riding elephants"], "difficult_direct_answer": false, "rationales": ["The men or onto of an elephant to move from one direction to another.", "They are seated on top of a large mammal with long noses.", "The animals are clearly visible and identifiable due to their unique features and the men on top of them are in a position that would imply they are doing answer a."], "image": "train2014/COCO_train2014_000000079660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221311, "question_id": "ksNip9Sz4wHzR43MFsenCW", "question": "What is the person dragging?", "choices": ["dog leash", "luggage", "racquet", "wheelbarrow"], "correct_choice_idx": 1, "direct_answers": ["suitcase", "suitcase", "suitcase", "suit case", "suitcase", "luggage", "luggage", "suit case", "suitcase", "suitcase"], "difficult_direct_answer": false, "rationales": ["He is pulling a suitcase as he is walking.", "A suitcase is being rolled by the person walking into the tunnel.", "This is a durable box with wheels and a handle used to transport clothing"], "image": "train2014/COCO_train2014_000000221311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70402, "question_id": "ksTnWeCGydUMxDi33heZCh", "question": "Why is the man giving a thumbs up to the viewer?", "choices": ["showing approval", "showing off", "rating movies", "playing prank"], "correct_choice_idx": 0, "direct_answers": ["good food", "good food", "pose", "loves pizza", "loves pizza", "showing approval", "good food", "good pizza", "good food", "good pizza"], "difficult_direct_answer": false, "rationales": ["The man likes his food.", "He is about to enjoy his pizza.", "This is the universally accepted meaning to the symbol and especially so when being done in a photograph."], "image": "train2014/COCO_train2014_000000070402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231675, "question_id": "ksVzqMgnTpHCzvrswG9SL6", "question": "What is the approximate age of the baby?", "choices": ["four months", "twelve months", "one month", "nine months"], "correct_choice_idx": 2, "direct_answers": ["newborn", "two", "newborn", "newborn", "newborn", "1 month", "three months", "one month", "one", "newborn"], "difficult_direct_answer": false, "rationales": ["The baby is no longer really a baby.", "The baby looks very small and like it is around one month old by it's size.", "The age is a month."], "image": "train2014/COCO_train2014_000000231675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442686, "question_id": "ksaWv6BafBERb5hSafg3ad", "question": "What are the circular areas of the umbrellas made from?", "choices": ["paper", "nylon", "wool", "plastic"], "correct_choice_idx": 0, "direct_answers": ["paper", "bamboo", "cane", "wood", "wood", "wood", "paper", "fabric", "bamboo", "cloth"], "difficult_direct_answer": false, "rationales": ["The umbrellas are made of paper.", "These are ligthweight and made out of paper.", "The decorative part is made of a delicate material that won't last long in rain."], "image": "train2014/COCO_train2014_000000442686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294847, "question_id": "ksogcgrA4qo7a56GNLXYqk", "question": "What number comes after the number on the player's jersey?", "choices": ["99", "83", "76", "55"], "correct_choice_idx": 3, "direct_answers": ["fifty four", "55", "55", "55", "fifty five", "55", "55", "55", "55", "fifty five"], "difficult_direct_answer": false, "rationales": ["The number on the jersey is 54.", "The jersey number visible on the player is clearly 54 and answer a would come after.", "A baseball player has the number fifty four on the back of his jersey."], "image": "train2014/COCO_train2014_000000294847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270603, "question_id": "kspg4RphnzpJStzh9SSKtD", "question": "In what year did the final episode of this show air?", "choices": ["1987", "1990", "1983", "1979"], "correct_choice_idx": 2, "direct_answers": ["1972", "1983", "1983", "1989", "1983", "not visible", "1980", "1983", "1984", "1980"], "difficult_direct_answer": false, "rationales": ["Mash stopped airing in 1983.", "I did an internet search for the date of the finale episode of mash.", "The show last aired on 1983."], "image": "val2014/COCO_val2014_000000270603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348027, "question_id": "ksuEC2io6PCgMb9mN9vRZV", "question": "The player just hit the ball so he watches it while he runs to what base?", "choices": ["second", "home", "first", "fourth"], "correct_choice_idx": 2, "direct_answers": ["first", "first", "first", "first", "first", "first", "first", "first", "first", "first"], "difficult_direct_answer": false, "rationales": ["The first base to get to is the first one.", "The man is trying to get to the first base.", "After a batter hits the ball he is to run to the first base."], "image": "train2014/COCO_train2014_000000348027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62131, "question_id": "kswauhjX4pY8HSbYcUctNT", "question": "What materials are the chairs made of?", "choices": ["metal", "bamboo", "ceramic", "wood"], "correct_choice_idx": 1, "direct_answers": ["wicker", "rattan", "bamboo", "wood", "bamboo", "plastic", "rattan", "straw", "plastic", "wicker"], "difficult_direct_answer": false, "rationales": ["The material is woodlike. the material is woven.", "The material is bamboo.", "It is a natural material which is lighter and more flexible than what comes from tree trunks."], "image": "train2014/COCO_train2014_000000062131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335656, "question_id": "ktKBXZuYqnuGDjh39AbxLk", "question": "What type of parking is available?", "choices": ["valet", "diagonal", "lot", "street"], "correct_choice_idx": 3, "direct_answers": ["city parking", "street", "side street", "perpendicular", "parallel", "parallel", "street", "street", "parallel", "street"], "difficult_direct_answer": false, "rationales": ["There is street parking available.", "There are vehicles parked closely together on the sides of the visible street. based on the lines and the setting, answer a is consistent.", "The road is wide enough for vehicles to park without blocking oncoming cars."], "image": "train2014/COCO_train2014_000000335656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92974, "question_id": "ktYFQ3iBg6u823yHCksAEs", "question": "What does the fenced in area behind the man lead to?", "choices": ["subway", "park", "jail", "deli"], "correct_choice_idx": 0, "direct_answers": ["town", "subway", "subway", "subway", "subway", "subway", "subway", "subway", "subway", "bus stop"], "difficult_direct_answer": false, "rationales": ["These are where the stairs are that go down to the train.", "This looks like a typical subway entrance in a city.", "The fenced in area goes underground. delis, parks, and jails are above ground."], "image": "train2014/COCO_train2014_000000092974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180357, "question_id": "ktcUysm3FkK8SGcsBzBY29", "question": "What is the purpose of the parasols shown here?", "choices": ["status symbols", "rain protection", "stage props", "sun protection"], "correct_choice_idx": 2, "direct_answers": ["they're props", "peace", "dancing", "decoration", "stage props", "decorative", "dance prop", "accessory", "costuming", "decoration"], "difficult_direct_answer": true, "rationales": ["The purpose is a prop.", "The parasols are being held up in an interior environment without rain, so the assumption is that they are being displayed for decorative purposes.", "The woman are using the parasols on stage during their performance."], "image": "val2014/COCO_val2014_000000180357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160970, "question_id": "kttkNteF8eBk8xRbEQ2wfk", "question": "What is this biker doing?", "choices": ["racing", "quitting", "falling", "resigning"], "correct_choice_idx": 0, "direct_answers": ["jumping", "jumping", "jump", "jumping", "jumping", "jump", "jumping", "jump", "racing", "jumping"], "difficult_direct_answer": false, "rationales": ["The biker is racing.", "He has a number on his bike and is on a track so he is racing.", "The biker is going around the track appearing to be in a competition."], "image": "train2014/COCO_train2014_000000160970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405444, "question_id": "ku2Udpng2Ved2J9u8pnZ8N", "question": "What condition are the pizza in if they need to be in a pan?", "choices": ["melty", "cheesey", "burnt", "frozen"], "correct_choice_idx": 3, "direct_answers": ["frozen", "frozen", "raw", "raw", "uncooked", "uncooked", "uncooked", "outside plastic", "uncooked", "round"], "difficult_direct_answer": false, "rationales": ["They have not been cooked yet", "A pizza that is raw or cold is put in a pan because pizza is served hot.", "The condition is frozen."], "image": "val2014/COCO_val2014_000000405444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187079, "question_id": "ku9bwMBgw3rpfwdXtdyhU7", "question": "What is being concealed by striped vest and overalls here?", "choices": ["little boy", "girl", "stuffing", "elephant"], "correct_choice_idx": 2, "direct_answers": ["bear", "bear body", "bear", "bear", "stuffing", "cartoon", "torso", "teddy bear", "teddy bear", "teddy bear"], "difficult_direct_answer": false, "rationales": ["The stuffing is covered.", "The stuffing is concealed.", "The teddy bear is wearing the outfit and under the outfit he is a stuffed animal."], "image": "val2014/COCO_val2014_000000187079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117040, "question_id": "kuAjri9JHxxgFUdZ7UFXrR", "question": "What type of industry is sponsoring this event?", "choices": ["lodging", "automobiles", "restaurant", "apparel"], "correct_choice_idx": 1, "direct_answers": ["hotel", "hotel", "hotel", "hotel", "hotel", "automobiles", "hotel", "hotel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["A sign for a hotel can be seen behind a player on a tennis court.", "The pink and white sign refers to hotels, not automobiles, restaurants, or clothing.", "The name of the hotel is advertised."], "image": "train2014/COCO_train2014_000000117040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134430, "question_id": "kuEoJPu2cCF2ymSAg4PVjn", "question": "What is the small blue and white object on the right side of the sink called?", "choices": ["lamp", "pen", "soap", "toothbrush"], "correct_choice_idx": 3, "direct_answers": ["toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush"], "difficult_direct_answer": false, "rationales": ["The object is a toothbrush.", "The object is for brushing teeth.", "Looking closely at the right side of the sink, you can see something this is standing up. this blue and white object most closely resembles an electric toothbrush."], "image": "train2014/COCO_train2014_000000134430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251777, "question_id": "kuacG6ts3JFzMfzH5Mgo5k", "question": "Why does the horse rider wear Blue costume?", "choices": ["county fair", "disney employee", "renaissance fair", "lost bet"], "correct_choice_idx": 2, "direct_answers": ["playing part", "show team", "acting", "decoration", "knight", "renaissance fair", "preference", "identification", "favorite color", "roleplay"], "difficult_direct_answer": true, "rationales": ["He is a competitor. the color helps tell the competitors apart.", "The person is wearing the gear of someone who would be performing at answer a. the background scene and the crowd of people also would be consistent with answer a.", "The man is dressed as a knight."], "image": "train2014/COCO_train2014_000000251777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65353, "question_id": "kupE4CfeYVRuMSsL8J9YKX", "question": "What are the zebras emerging from?", "choices": ["clouds", "dust", "brush", "water"], "correct_choice_idx": 2, "direct_answers": ["brush", "bush", "bramble", "grass", "bushes", "kind", "bushes", "trees", "bushes", "thicket"], "difficult_direct_answer": false, "rationales": ["The zebras are on land, not in the clouds or on water. there is no dust near the zebras.", "The zebras are from the bush.", "The zebras based on the direction they are collectively facing are coming out of longer bush type grass at the edge of shorter grass. this type of area is referred to as answer a."], "image": "train2014/COCO_train2014_000000065353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167656, "question_id": "kusXn3jStv5MdgBZyZXEP8", "question": "What kind of products does the main sponsor produce?", "choices": ["planes", "vehicles", "milk", "boats"], "correct_choice_idx": 1, "direct_answers": ["cars", "cars", "automobile", "cars", "cars", "cars", "automobiles", "cars", "vehicles", "vehicles"], "difficult_direct_answer": false, "rationales": ["The main sponsor is visible and written on the wall behind the player. the company is commonly known to produce answer a.", "The main sponsor is kia. kia is a south korean automobile company.", "The logo is for a large maker of this product."], "image": "val2014/COCO_val2014_000000167656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153589, "question_id": "kv6PZzBny4yP9YYjwHsQqm", "question": "The name of this animal rhymes best with what word?", "choices": ["libra", "house", "log", "rat"], "correct_choice_idx": 0, "direct_answers": ["libra", "zebra", "libre", "libra", "libra", "libra", "no rhyme", "libra", "zebra", "libra"], "difficult_direct_answer": false, "rationales": ["The name is a libra.", "The name of the animal is a zebra.", "This animal is a zebra, not a dog, cat, or mouse."], "image": "train2014/COCO_train2014_000000153589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48974, "question_id": "kvASjTvp3vTcyYUcpAwBjh", "question": "What shared passion do these men enjoy?", "choices": ["skateboarding", "sun tanning", "tattoos", "eating burgers"], "correct_choice_idx": 0, "direct_answers": ["skateboarding", "skateboarding", "skateboarding", "skateboarding", "skaters", "long boarding", "skateboarding", "skateboarding", "surf", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The men are all holding the same item.", "There are many men holding boards in their hands. they have two attached wheels on front and back of boards.", "The men are carrying skateboards."], "image": "train2014/COCO_train2014_000000048974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338064, "question_id": "kvRgNuJkydtwUwukUi2Foq", "question": "Where can snow be found?", "choices": ["in apartments", "in trees", "underwater", "mountain peaks"], "correct_choice_idx": 3, "direct_answers": ["mountain peaks", "mountains", "mountains", "mountains", "mountain top", "mountains", "mountain top", "mountain peaks", "mountain top", "mountains"], "difficult_direct_answer": false, "rationales": ["The mountains are the highest and cold.", "You can see snow at the top of the mountains.", "It can be found on the mountain peaks."], "image": "train2014/COCO_train2014_000000338064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33572, "question_id": "kvhXavVaPjenULaLtEZpcn", "question": "What would be the best use for this type of bike?", "choices": ["racing", "cruising", "jumps", "tricks"], "correct_choice_idx": 1, "direct_answers": ["commuting", "riding", "transportation", "transport", "helmet", "delivery", "for city", "short rides", "beginners", "cruising"], "difficult_direct_answer": true, "rationales": ["The bike does not have the size, shape or style consistent with another answer and the setting behind the bike looks to be a casual one.", "This bicycle is a custom made one that doesn't look like it is built for speed or many movements.", "This is a cruising bike more than a fast one."], "image": "train2014/COCO_train2014_000000033572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290705, "question_id": "kvzmdoTJRKoGAfrQzSWTqe", "question": "What car company is a major sponsor of the tennis matches?", "choices": ["mercedes benz", "volkswagen", "gm", "dodge"], "correct_choice_idx": 0, "direct_answers": ["mercedes benz", "mercedes benz", "mercedes", "bmw", "mercedes", "mercedes benz", "mercedes benz", "mercedes-benz", "mercedes", "mercedes"], "difficult_direct_answer": false, "rationales": ["An automobile company's logo is on the net. it is not a gm, dodge, or volkswagen logo.", "The sponsor's emblem is seen on the net.", "The mercedes benz logo is largely displayed on the center net."], "image": "train2014/COCO_train2014_000000290705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547099, "question_id": "kw6wK6GN377R7ukRaHjfqp", "question": "How do these men know each other?", "choices": ["classmates", "rivals", "coworkers", "teammates"], "correct_choice_idx": 3, "direct_answers": ["doubles partners", "tennis partners", "team mates", "unknown answer", "teammates", "related", "teammates", "teammates", "tennis partners", "teammates"], "difficult_direct_answer": false, "rationales": ["Since they are on the same side of the net they are on the same team.", "They are wearing the same uniform and playing on the same side.", "They are matching and on the same side of a tennis court"], "image": "train2014/COCO_train2014_000000547099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168774, "question_id": "kwLk3W9vZn6tQhjy4fQJXE", "question": "What kind of event is this?", "choices": ["shower", "birthday party", "reception", "wedding"], "correct_choice_idx": 1, "direct_answers": ["birthday party", "birthday party", "birthday celebration", "birthday party", "birthday party", "birthday party", "picnic", "birthday", "birthday party", "birthday"], "difficult_direct_answer": false, "rationales": ["There are several kids and there is cake on their plates.", "The kids have birthday cake.", "The cake is for a birthday."], "image": "train2014/COCO_train2014_000000168774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470885, "question_id": "kwdp78ENPCh8a7zs7aqMPY", "question": "Why are the horses in the water?", "choices": ["to clean", "rescue boat", "like water", "escaped"], "correct_choice_idx": 1, "direct_answers": ["pulling", "moving boat", "pull boat", "being driven", "pull boat", "rescue boat", "pulling boat", "cooling off", "to rescue", "pulling boat"], "difficult_direct_answer": false, "rationales": ["They are harnessed up and attached to the boat. they are helping the vessel get back into the water.", "The horses are walking beside a boat with men behind leading them.", "They are pulling the boat out to the water."], "image": "val2014/COCO_val2014_000000470885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123184, "question_id": "kwgRcHNwFsVCMpUQH3Ne7S", "question": "Which brand bike is shown in picture?", "choices": ["ford", "hitachi", "trek", "bmw"], "correct_choice_idx": 3, "direct_answers": ["bmw", "bmw", "bmw", "bmw", "bmw", "bmw", "bmw", "bmw", "bmw", "bmw"], "difficult_direct_answer": false, "rationales": ["The brand is a bmw.", "The bike has a bmw logo.", "The company logo is shown on the side of the motorcycle."], "image": "train2014/COCO_train2014_000000123184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573033, "question_id": "kwpYmGm8fxHubyucDk7rvF", "question": "What is casted on the ground behind the bench?", "choices": ["doubt", "shadow", "mirror image", "hole"], "correct_choice_idx": 1, "direct_answers": ["shadow", "shadow", "shadow", "shadow", "shadow", "shadow", "shadow", "shadow", "shadow", "shadow"], "difficult_direct_answer": false, "rationales": ["It is sunny. the bench is blocking some of the sunlight, so there are dark areas on the ground.", "The shadow is in the shape and orientated behind the bench consistent with answer a.", "A wood bench is all alone and causes a dark figure that is in shape of bench. it is caused when sun is shining thru."], "image": "val2014/COCO_val2014_000000573033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348838, "question_id": "kwwATtt8PKCvQkZDqkVT75", "question": "What is this girl baking?", "choices": ["cookies", "tarts", "frosting", "cupcakes"], "correct_choice_idx": 0, "direct_answers": ["cookies", "cookies", "cookies", "tasting", "cookies", "cookies", "tasting", "tasting", "cookies", "cookies"], "difficult_direct_answer": false, "rationales": ["The girl has batter in little sections on the tray.", "She's making cookies.", "Mixing a few ingredients together can make anything, but in this case it is traditional to put cookies to bake in those rounded shapes."], "image": "val2014/COCO_val2014_000000348838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501624, "question_id": "kx5d7bLtAd4cUXdk9QSiTG", "question": "What role does the most heavily armored person shown here hold?", "choices": ["runner", "pitcher", "catcher", "batter"], "correct_choice_idx": 2, "direct_answers": ["catcher", "catcher", "catcher", "catcher", "batter", "catcher", "bat boy", "catcher", "catcher", "batter"], "difficult_direct_answer": false, "rationales": ["He's a catcher.", "The catcher gets hit with balls the most.", "This role is at the highest risk of getting hit; the padding makes sense."], "image": "train2014/COCO_train2014_000000501624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415569, "question_id": "kxNX5qAo9osLbKrrakFAcn", "question": "Why is the man holding a leg up high behind him?", "choices": ["running away", "pitched ball", "doing trick", "stretching"], "correct_choice_idx": 1, "direct_answers": ["threw baseball", "throwing ball", "balance", "give power", "thowing", "threw ball", "pitching", "pitching", "pitching", "pitched ball"], "difficult_direct_answer": false, "rationales": ["The man that is holding up his leg is the pitcher.", "The man is pitching.", "He throws the ball to the batter with such force that his whole body extends and his leg out helps balance him"], "image": "train2014/COCO_train2014_000000415569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131044, "question_id": "kxk9ZJcXFU9NMPGYzcgpuk", "question": "Why are the people wearing orange shirts?", "choices": ["costume", "warmth", "fashion", "uniform"], "correct_choice_idx": 3, "direct_answers": ["uniform", "group members", "company colors", "safety", "unity", "team", "team colors", "parade", "sporting event", "pirate"], "difficult_direct_answer": true, "rationales": ["It's their uniform so they all match.", "They are to color match with each other for an event that's on the bus in front of them.", "These people are in a group doing an activity that appears to involve work."], "image": "train2014/COCO_train2014_000000131044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316536, "question_id": "kyXBWnbeNNCePZafMhqpTk", "question": "What activity is this man involved in?", "choices": ["milking", "sleeping", "examination", "sales"], "correct_choice_idx": 0, "direct_answers": ["milking cow", "milking", "milking", "milking", "milking", "milking cow", "milking cow", "milking", "milking", "milking"], "difficult_direct_answer": false, "rationales": ["This man's hand's are oriented towards a cow's udders; he sit's on a short stool and we can see a metal pail below the udders. this set of circumstances is associated from taking milk out of a cow.", "He is pulling on the teeth where milk comes out from.", "The man is sitting on a stool that is made for doing that."], "image": "train2014/COCO_train2014_000000316536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461878, "question_id": "kyeJHvLmkzRqfG5LKxfTVX", "question": "What cartoon character does the Alarm clock owner prefer?", "choices": ["tweety bird", "mickey mouse", "winnie bear", "sylvester"], "correct_choice_idx": 0, "direct_answers": ["tweety bird", "tweety", "tweety bird", "tomb jerry", "tweety bird", "tweety bird", "tweety", "tweety bird", "tweety bird", "tweetie bird"], "difficult_direct_answer": false, "rationales": ["The clock has a looney tunes bird on it. the clock is an old fashioned style clock.", "There is an animal on the clock. it has a beak.", "The character is on the clock"], "image": "train2014/COCO_train2014_000000461878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357684, "question_id": "kysDsDwT5SqELLn78VKLJ2", "question": "What does this company provide internationally?", "choices": ["power", "beef", "cars", "electronics"], "correct_choice_idx": 0, "direct_answers": ["energy", "time", "energy", "flights", "power", "power", "strom", "phones", "power", "energy"], "difficult_direct_answer": false, "rationales": ["The company provides power and is a multi-national power company based in sweden.", "A sign for a german power company can be seen.", "The company provides power."], "image": "train2014/COCO_train2014_000000357684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163601, "question_id": "kz3rc3CP2xXwCRwfAEg7wc", "question": "Why are the clocks all facing different directions?", "choices": ["easier reading", "gimmick", "broken", "different times"], "correct_choice_idx": 1, "direct_answers": ["decoration", "art", "gimmick", "art installation", "art installation", "art", "design", "hanging", "art piece", "looks"], "difficult_direct_answer": false, "rationales": ["This is likely a sculpture and possibly commentary about different time zones.", "All of the clocks hanging are telling showing different time zones.", "The clocks are hung as a sculpture."], "image": "val2014/COCO_val2014_000000163601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452137, "question_id": "kz7CSzarqXg9Wef8Py4uWA", "question": "What meat product tops these hot dogs?", "choices": ["chile", "grits", "syrup", "gravy"], "correct_choice_idx": 0, "direct_answers": ["chili", "chili", "chili", "chili", "beef chili", "chili", "hamburger", "chile", "hamburger", "beef"], "difficult_direct_answer": false, "rationales": ["There are only two types of meat visible which are the hot dogs and chili on top.", "There is a brown sauce and it is common on hot dogs", "There is ground beef and beans smothered on top of a hot dog."], "image": "val2014/COCO_val2014_000000452137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551842, "question_id": "kzSe4FMRFoyRLeygKDLcbp", "question": "Who is in the greatest danger?", "choices": ["policeman", "sidewalk pedestrian", "car driver", "man crossing"], "correct_choice_idx": 3, "direct_answers": ["peoples", "policeman", "pedestrian", "person walking", "man crossing", "officer", "guy walking", "man", "criminals", "pedestrian"], "difficult_direct_answer": true, "rationales": ["There are only a few people visible. of the people visible, one man is wearing a helmet which would serve some protection, others would be in cars which would offer protection, then there is a man with no protection in the street where he would be vulnerable.", "The man crossing could go any direction and is harder to see.", "A man is crossing a busy street that is filled with cars."], "image": "train2014/COCO_train2014_000000551842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315517, "question_id": "kzXXcYArvqoQkwfeQx7jjK", "question": "What is the item that the cat is on top of used for?", "choices": ["writing books", "storing groceries", "watching shows", "cleaning floors"], "correct_choice_idx": 2, "direct_answers": ["watching television", "television", "tv", "watching movies", "watching shows", "tv", "watch television", "entertainment", "watching television", "tv set"], "difficult_direct_answer": false, "rationales": ["The cat is on a tv based on the screen material and size as well as the general structure. televisions are used for watching tv and frequently for shows.", "The cat is sitting on top of a television that is used for watching shows.", "It is for entertainment and transmitting information."], "image": "train2014/COCO_train2014_000000315517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502248, "question_id": "kzeUKAJTnW8jWhfppyVrkL", "question": "What type of animal produces this commodity?", "choices": ["goat", "chicken", "bee", "cow"], "correct_choice_idx": 2, "direct_answers": ["bee", "bees", "badger", "bee", "bees", "bee", "bee", "honey bees", "bees", "bee"], "difficult_direct_answer": false, "rationales": ["A bee helps create honey.", "The animal is a bee.", "Bees produce honey."], "image": "train2014/COCO_train2014_000000502248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348907, "question_id": "kzvDX7rWepAb5e7hBJnUpY", "question": "What relationship exists between the man and the woman on the right?", "choices": ["coworkers", "lovers", "siblings", "teammates"], "correct_choice_idx": 1, "direct_answers": ["lovers", "partners", "lovers", "lovers", "couple", "partners", "romantic partners", "lovers", "couple", "lovers"], "difficult_direct_answer": false, "rationales": ["They have arms around each other so they're in a romantic relationship.", "The two people are cuddling each other.", "The way they are holding each other is if you are in love with each other like boyfriend and girlfriend. in the other options, it woul be odd and weird."], "image": "train2014/COCO_train2014_000000348907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454543, "question_id": "m2Fuc8JPvxC33JfJTEmQ8t", "question": "What does the machine that is pushing the ball emit?", "choices": ["glue", "water", "lasers", "air"], "correct_choice_idx": 3, "direct_answers": ["hair dryer", "air", "air", "hot air", "air", "air", "air", "air", "air", "air"], "difficult_direct_answer": false, "rationales": ["The machine has air.", "It is a hair dryer, and it uses air to dry things, and push things.", "The machine is an air dryer."], "image": "val2014/COCO_val2014_000000454543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164208, "question_id": "m2GKctfhzApxMiRopx5eCC", "question": "Why does he need the light to be on?", "choices": ["reading", "writing", "cooking", "watching"], "correct_choice_idx": 0, "direct_answers": ["its nighttime", "reading", "reading", "to read", "reading", "reading", "nighttime dark", "reading", "reading", "man reading"], "difficult_direct_answer": false, "rationales": ["He is using the light for reading", "He needs to read.", "He needs light for visibility of words."], "image": "train2014/COCO_train2014_000000164208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212648, "question_id": "m2H48tMvJFUhHLtrzkZhmw", "question": "What company uses vehicles like this?", "choices": ["greyhound", "united airlines", "nokia", "nathan's famous"], "correct_choice_idx": 0, "direct_answers": ["city transit", "greyhound", "bus company", "greyhound", "green transport", "greenwave", "bus company", "bus company", "bus companies", "green wave"], "difficult_direct_answer": false, "rationales": ["Greyhound is a widely used bus company. greyhound has large buses in its fleet.", "This is a bus line.", "The vehicle is a bus, not a hot dog, airplane, or cell phone."], "image": "train2014/COCO_train2014_000000212648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115521, "question_id": "m2NvrvA4GnNpNFnS5qAVLq", "question": "What are the two guys participating in?", "choices": ["watching movie", "playing game", "spraying pesticides", "singing"], "correct_choice_idx": 1, "direct_answers": ["wii", "playing game", "video gaming", "wii", "game", "wi nintendo", "video games", "wii", "remote game", "gaming"], "difficult_direct_answer": false, "rationales": ["The guys are playing the game.", "The two are playing a game.", "The two men are using nintendo wii remotes."], "image": "val2014/COCO_val2014_000000115521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447082, "question_id": "m2UqBNY8EEH2wTEt8rmrtb", "question": "What is the condition of the skateboard next to the boy with blue shoes?", "choices": ["folded", "bent", "broken", "shredded"], "correct_choice_idx": 2, "direct_answers": ["broken", "broken", "broken", "broken", "broken", "broken", "broken", "broken", "broken", "broken"], "difficult_direct_answer": false, "rationales": ["The skateboard is split in half. you can't ride a skateboard that's split in half.", "People sit and a broken skateboard is on the ground in the middle.", "A skateboard in working condition would have a straight board connecting two sets of wheels. the visible board has two perpendicular pieces and the wood is clearly broken."], "image": "train2014/COCO_train2014_000000447082.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491515, "question_id": "m2nnDa59GoY238if2hAcyC", "question": "How are bags identified here?", "choices": ["tracking device", "color", "tags", "they aren't"], "correct_choice_idx": 2, "direct_answers": ["using tags", "tags", "tags", "tags", "using tags", "tags", "tags", "luggage tags", "tags", "tags"], "difficult_direct_answer": false, "rationales": ["The luggage is all tagged.", "The tags are on top and people use tags to write their information.", "There are rectangular pieces of paper attached to the bag handles."], "image": "val2014/COCO_val2014_000000491515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532877, "question_id": "m2sHGxSxKziUZqmvavxjLn", "question": "What is the man attempting to ride?", "choices": ["sand", "air", "waves", "snow"], "correct_choice_idx": 2, "direct_answers": ["surfboard", "wave", "surfboard", "waves", "wave", "surfboard", "surfboard", "wave", "surfboard", "wave"], "difficult_direct_answer": false, "rationales": ["The man rides waves.", "These are formed by the ocean hitting land, and the man is on a surfboard.", "This surfer is attempting to ride a fairly small wave, so there shouldn't be a severe wipe out here."], "image": "train2014/COCO_train2014_000000532877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406839, "question_id": "m2sWfDpG34rDDZ4ppdZXYT", "question": "The pickup truck is carrying the dog during which season of the year?", "choices": ["fall", "spring", "winter", "summer"], "correct_choice_idx": 2, "direct_answers": ["winter", "spring", "winter", "winter", "winter", "summer", "fall", "winter", "fall", "winter"], "difficult_direct_answer": false, "rationales": ["The truck is in winter.", "It looks like it's dark and dreary.", "It is dark and gloomy out."], "image": "train2014/COCO_train2014_000000406839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514934, "question_id": "m3BnYvJqu26LX5SGgJCLjg", "question": "What is pushing back the dog's fur?", "choices": ["wind", "hand", "ribbon", "brush"], "correct_choice_idx": 0, "direct_answers": ["wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["The car is in motion and the window is open", "The dog's hair is being blown by the wind.", "Wind pushes it back."], "image": "train2014/COCO_train2014_000000514934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280020, "question_id": "m3Kfr99X9KLydzLJQWzpbn", "question": "Which type of glass is used in bus front windshield?", "choices": ["laminated glass", "fiber", "turbo", "carbon"], "correct_choice_idx": 0, "direct_answers": ["flat tempered", "laminated glass", "laminated", "tempered", "bus", "tinted", "safety glass", "safety", "big", "crystal glass"], "difficult_direct_answer": true, "rationales": ["The front has strong glass. this helps prevents breaks.", "The glass is laminated.", "The glass is laminated black."], "image": "train2014/COCO_train2014_000000280020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4489, "question_id": "m3QWaCjDbfVXdootjV9bKW", "question": "What item does she wish she had right now?", "choices": ["car", "flowers", "rake", "balloon"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "car", "car", "car", "umbrella", "umbrella", "shelter", "car", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["A person is walking in a down pour. a car is a way to travel while staying dry.", "A woman is pulling luggage through a rainy street. cars can be used to stay out of the rain and move things easier.", "The item is a car."], "image": "train2014/COCO_train2014_000000004489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146646, "question_id": "m3R2G597Nk5VJZeMEmgHTD", "question": "What does this vehicle collect along its route?", "choices": ["children", "fresh food", "animals", "trash"], "correct_choice_idx": 3, "direct_answers": ["trash", "garbage", "trash", "trash", "trash", "trash", "garbage", "garbage", "garbage", "garbage"], "difficult_direct_answer": false, "rationales": ["This vehicle is collecting trash on its route.", "The vehicle is picking up garbage.", "The vehicle collects trash."], "image": "train2014/COCO_train2014_000000146646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352142, "question_id": "m3dxTxSdsNDWrM38Vvw5rk", "question": "What would the pink item normally be put on?", "choices": ["falafel", "bread", "tortilla", "pasta"], "correct_choice_idx": 1, "direct_answers": ["bun", "plate", "bread", "eggs", "plate", "bread", "bread", "eggs", "skillet", "blt"], "difficult_direct_answer": false, "rationales": ["Bacon can be eaten in a breakfast sandwich.", "This is the main ingredient of a blt.", "The item in question could likely be served on many types of food, but is commonly on a sandwich which would include answer a."], "image": "train2014/COCO_train2014_000000352142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510182, "question_id": "m4AyKbBjVnE4qFDZKqrVWk", "question": "What is near the woman?", "choices": ["tree", "sword", "tennis ball", "badger"], "correct_choice_idx": 2, "direct_answers": ["tennis ball", "tennis ball", "tennis ball", "ball", "tennis ball", "ball", "ball", "tennis ball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["She is near the tennis ball.", "She is playing tennis.", "The tennis player is near a ball."], "image": "val2014/COCO_val2014_000000510182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210729, "question_id": "m4beWkfBAeMG5TGSApdCr7", "question": "What type beverage is the woman having?", "choices": ["chocolate", "iced coffee", "soda", "milk"], "correct_choice_idx": 1, "direct_answers": ["iced coffee", "coffee", "coffee", "iced coffee", "iced coffee", "coffee", "coffee", "coffee", "coffee", "iced coffee"], "difficult_direct_answer": false, "rationales": ["A woman is drinking a light brown drink in a clear cup.", "Iced coffee is a great drink on a warm day. this woman, in a sleeveless blouse and sunglasses, is enjoying a summer day with iced coffee and a doughnut.", "The drink is milky and has coffee in it."], "image": "train2014/COCO_train2014_000000210729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479466, "question_id": "m567tZbppEqH3Gjsq7cS8A", "question": "What sort of product is the pink box advertising?", "choices": ["water", "soda", "coffee", "gatorade"], "correct_choice_idx": 0, "direct_answers": ["bottled water", "water", "water", "water", "bottled water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The pink box is advertising a vitamin water product.", "The brand evian is written on the side of it, and that is well known bottled water brand.", "Water is shown."], "image": "val2014/COCO_val2014_000000479466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512830, "question_id": "m5zUiG2eH7VZnnSwJUao98", "question": "What do these men seem to be?", "choices": ["students", "pilots", "soldiers", "chefs"], "correct_choice_idx": 2, "direct_answers": ["military", "soldiers", "service soldiers", "soldiers", "military", "servicemen", "soldiers", "soldiers", "airmen", "soldiers"], "difficult_direct_answer": false, "rationales": ["The people are all wearing identical uniforms with camouflage, and some of them have visible ranks.", "They have army uniforms on.", "They are all wearing matching military uniforms."], "image": "val2014/COCO_val2014_000000512830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221532, "question_id": "m687ALMLz9cknAd6KtRSzN", "question": "What athlete was born on the continent whose name appears on the plane?", "choices": ["jose vidro", "otis nixon", "shohei ohtani", "chris jericho"], "correct_choice_idx": 2, "direct_answers": ["multiple", "kim lee", "many", "manny pacquiao", "lot", "manny pacquiao", "jeremy lin", "shohei ohtani", "ichiro suzuki", "yao ming"], "difficult_direct_answer": true, "rationales": ["The athlete is ohtani.", "The plane is from asia based on the writing on the tail of the plane and answer a is the only answer that has someone born on this continent.", "The athlete shohei ohtani was born in asia."], "image": "train2014/COCO_train2014_000000221532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418933, "question_id": "m6ByBgGGxGHpHYNVs8iYY3", "question": "What was left behind on the bench?", "choices": ["food", "pants", "shoes", "stuffed animals"], "correct_choice_idx": 3, "direct_answers": ["stuffed animals", "stuffed animals", "stuffed animals", "stuffed animals", "stuffed animals", "stuffed animal", "stuffed toys", "stuffed animals", "tracks", "stuffed animals"], "difficult_direct_answer": false, "rationales": ["Because it is a doll with a white body fur.", "The stuffed animal was left.", "They are furry and soft toys"], "image": "val2014/COCO_val2014_000000418933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168134, "question_id": "m6D6Z5qG8pHESQcWJKEC4V", "question": "What is the woman doing beside the dog?", "choices": ["drinking", "singing", "smoking", "eating"], "correct_choice_idx": 2, "direct_answers": ["smoking", "smoking", "smoking", "smoking", "smoking", "smoking cigarette", "watching", "smoking", "holding dog", "smoking"], "difficult_direct_answer": false, "rationales": ["The woman has a cigarette.", "She has a cigarette in her hand", "The woman is smoking."], "image": "train2014/COCO_train2014_000000168134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422956, "question_id": "m6Qoz3xZZzJP7aDxx8CkUX", "question": "It's unlikely that he's on which floor?", "choices": ["fourth", "second", "third", "ground"], "correct_choice_idx": 3, "direct_answers": ["first", "first", "first", "top floor", "ground", "ground", "1st", "lowest", "ground floor", "second flor"], "difficult_direct_answer": false, "rationales": ["The pitch of the ceiling indicates he's at the top of a house", "The ceiling in the room is very low so it would be located on the highest floor.", "The person has a window view so he's not on the ground floor."], "image": "train2014/COCO_train2014_000000422956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310035, "question_id": "m6vxHzkgBgFRGX4bzMQLkW", "question": "How do the book's pages appear to her that's different than normal?", "choices": ["wet", "blurred", "tinted black", "tinted red"], "correct_choice_idx": 2, "direct_answers": ["darker", "not reflective", "color", "very bright", "closed", "smaller", "sunnier", "bright", "tinted black", "blank"], "difficult_direct_answer": true, "rationales": ["She has sunglasses on, so everything she sees while wearing them appears darker.", "The pages are somewhat black.", "She is wearing sunglasses which would tint the pages compared to a normal reader without sunglasses."], "image": "val2014/COCO_val2014_000000310035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394182, "question_id": "m6zhBbBJ4vrqvHrEcqe9or", "question": "What are the green tags on the animals ears for?", "choices": ["punishment", "decoration", "identification", "protection"], "correct_choice_idx": 2, "direct_answers": ["tracking tags", "tracking them", "animal identification", "identification", "tracking", "identification", "their numbers", "cataloging", "id's", "identification"], "difficult_direct_answer": false, "rationales": ["The animals depicted are cows and the tags are small plastic tags in their ears. animals of this kind are often marked in this way for the purposes of identification.", "The tags are used to mark and keep track of the cows.", "The tags on the ears of the animals are for them being identificated"], "image": "train2014/COCO_train2014_000000394182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162252, "question_id": "m73jWvtT9FxmMyGShEdLcj", "question": "What is unusually long here?", "choices": ["manes", "roads", "helmets", "hooves"], "correct_choice_idx": 0, "direct_answers": ["manes", "manes", "horse's manes", "horse hair", "tree", "horse's mane", "horse hair", "trees", "horses manes", "manes"], "difficult_direct_answer": false, "rationales": ["The manes are very bushy.", "The horses' manes are really fluffy.", "The manes are long."], "image": "train2014/COCO_train2014_000000162252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126054, "question_id": "m74E7SB9sacDJyuVeQuXQh", "question": "Why are they so close?", "choices": ["threatening", "admiring", "talking", "examining"], "correct_choice_idx": 2, "direct_answers": ["discussing game", "talking", "talking", "talking", "talking", "talking", "discussing game", "talking", "talking", "talking"], "difficult_direct_answer": false, "rationales": ["They're talking.", "Two people are standing facing each other. people stand and face each other when they are talking.", "Two people stand facing each other in an outdoor, grassy area. no one else is around."], "image": "train2014/COCO_train2014_000000126054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448648, "question_id": "m7AUbaqa79ms9VfAbqwYCZ", "question": "What are is the image from?", "choices": ["underground", "city", "forest", "sky"], "correct_choice_idx": 1, "direct_answers": ["city", "asia", "city", "city", "city", "asia", "city street", "developing country", "downtown", "city"], "difficult_direct_answer": false, "rationales": ["There are buildings and cars and people everywhere.", "There are many buildings, street signs, pedestrians and vehicles, suggesting this is a highly populated urban area.", "It has the buildings and traffic you see in a decent sized city."], "image": "train2014/COCO_train2014_000000448648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271741, "question_id": "m7DEU3Wuxvjo4mKNn2GjPV", "question": "What would be the reason a person on the field is dressed in orange and black?", "choices": ["team owner", "team captain", "referee", "goalie"], "correct_choice_idx": 2, "direct_answers": ["opposing team", "referee", "referee", "referee", "referee", "ref", "referee", "referee", "referee", "halloween"], "difficult_direct_answer": false, "rationales": ["The reason is to referee.", "She is wearing a bright color to distinguish her from the players. she is positioned on the field to see the plays clearly.", "The orange and black outfit is different than the two color schemes worn by the competing athletes in the image; probably to differentiate them as a judge of the competition."], "image": "train2014/COCO_train2014_000000271741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229156, "question_id": "m7ptRD8qt4i9owCaeJaKVK", "question": "The stop sign was defaced with a reference to which rock group?", "choices": ["boston", "styx", "journey", "reo speedwagon"], "correct_choice_idx": 2, "direct_answers": ["bon jovi", "journey", "journey", "stop", "kansas", "journey", "journey", "journey", "journey", "hate group"], "difficult_direct_answer": false, "rationales": ["The sign is for journey.", "The stop sign has a lyric from a journey song.", "Journey has a famous song."], "image": "train2014/COCO_train2014_000000229156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497642, "question_id": "m8KfDMdDNj4dDcaXBKegV8", "question": "What is the elephant on the far right next to?", "choices": ["airplane", "car", "fan", "building"], "correct_choice_idx": 3, "direct_answers": ["building", "baby", "its mother", "pillar", "baby elephant", "building", "building", "building", "baby elephant", "her baby"], "difficult_direct_answer": false, "rationales": ["The elephant is close to a structure that's a building.", "The elephant is close to a building.", "The animal is by a building."], "image": "train2014/COCO_train2014_000000497642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529309, "question_id": "m8dT3dR2UBCQHjd2CYGP9j", "question": "What is the role of the person behind the counter?", "choices": ["doctor", "employee", "judge", "guard"], "correct_choice_idx": 1, "direct_answers": ["worker", "customer", "service", "clerk", "clerk", "employee", "help", "sales clerk", "customer service", "sell phone"], "difficult_direct_answer": true, "rationales": ["They are there to assist with customers. the computer is for doing check ins with the people in line.", "The person works there and is showing the customer how to do something.", "Because the person holding the phone is behind the counter with the computer it verifies that they are an employee interacting with a customer."], "image": "train2014/COCO_train2014_000000529309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128168, "question_id": "m8fjnMcHBE5doe4rQpjy7d", "question": "Why are the shirts hanging outside the bus?", "choices": ["to clean", "to dry", "to sell", "to donate"], "correct_choice_idx": 2, "direct_answers": ["tie die", "sale", "on sale", "for sale", "for sale", "for sale", "for sale", "sale", "to sell", "for sale"], "difficult_direct_answer": false, "rationales": ["The shirts are for sale.", "This person has set up a little shop to make some money.", "There is a price label at the top of the rack with the shirts."], "image": "val2014/COCO_val2014_000000128168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286406, "question_id": "m8mJWUcHKhVNH7gbANpZth", "question": "What position is the man in the green shirt?", "choices": ["mid fielder", "defenseman", "striker", "goalie"], "correct_choice_idx": 3, "direct_answers": ["goalie", "goalie", "goalie", "goalie", "goalie", "goalie", "goalie", "jumping", "goalie", "jumping"], "difficult_direct_answer": false, "rationales": ["The position is the goalie.", "The man is catching the ball.", "This man in a sports uniform on a field is stopping a ball from going the direction it was going through the air after someone had thrown it."], "image": "train2014/COCO_train2014_000000286406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101312, "question_id": "m9A38kuQ7kpfXBTwoBqRyJ", "question": "What is the woman looking down at?", "choices": ["table", "laptop", "television", "tablet"], "correct_choice_idx": 1, "direct_answers": ["laptop", "laptop", "computer", "laptop", "keyboard screen", "computer", "laptop", "laptop", "laptop screen", "stuffed animal"], "difficult_direct_answer": false, "rationales": ["A woman is at a table bent over a computer that is on.", "The woman has her laptop.", "The laptop is visible in the bottom right corner of the image and directly in the woman's eye line."], "image": "val2014/COCO_val2014_000000101312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423023, "question_id": "m9AAGhNmgMdrLd6pajjb8d", "question": "Why would you use the pictured stairs?", "choices": ["normal use", "fire emergency", "playing games", "moving furniture"], "correct_choice_idx": 1, "direct_answers": ["fire escape", "fire escape", "fire escape", "escape fire", "fire", "fire", "fire emergency", "climb up", "reach restaurant", "fire"], "difficult_direct_answer": false, "rationales": ["There are stairs on the outside of a building in case of emergency.", "To leave your high-rise apartment quickly if normal exits are blocked.", "It is a fire escape."], "image": "train2014/COCO_train2014_000000423023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300177, "question_id": "m9HrYYSbRfXtdkiCiimnRQ", "question": "Where is the fire hydrant located?", "choices": ["park", "village", "city", "farm"], "correct_choice_idx": 0, "direct_answers": ["at park", "grass", "field", "park", "park", "in field", "on grass", "park", "park", "park"], "difficult_direct_answer": false, "rationales": ["There is a lot of grass, but it is taken care of. you can see people enjoying their day.", "It's in a park.", "The hydrant is in a grassy park."], "image": "train2014/COCO_train2014_000000300177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416723, "question_id": "m9dbH22hSKZ3S2LKCnvXt4", "question": "Those crackers taste like what?", "choices": ["tomatoes", "onion", "sardines", "cheese"], "correct_choice_idx": 3, "direct_answers": ["cheese", "cheese", "cheddar", "cheese", "cheese", "cheddar", "cheese", "cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["The person in the back is holding a box of cheezits which are crackers flavored with cheese.", "Cheez-its are made with real cheese.", "The crackers are cheesy."], "image": "train2014/COCO_train2014_000000416723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184030, "question_id": "m9fFZC4oxEotVYgJ6XdPy3", "question": "How did the people standing near the lighthouse get there?", "choices": ["sailed", "walked", "uber", "motorcade"], "correct_choice_idx": 1, "direct_answers": ["walked", "paddleboard", "walk", "boat", "walked", "surfboard", "paddled", "by boat", "placed", "walked"], "difficult_direct_answer": false, "rationales": ["The people walked.", "They are dry and wearing clothing, but there are no roads for motor vehicles to get to the lighthouse.", "The people standing near the lighthouse arrived on foot by walking over rocks."], "image": "val2014/COCO_val2014_000000184030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347596, "question_id": "m9rXhdLYMjDMCB2Gv5vyYR", "question": "What is the man walking to?", "choices": ["ocean", "pond", "river", "lake"], "correct_choice_idx": 0, "direct_answers": ["water", "ocean", "ocean", "beach", "ocean", "ocean", "beach", "ocean", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["The man is going to the ocean.", "The man is holding a surfboard and is walking forward the sea to most likely surf.", "He is trying to get into the water."], "image": "train2014/COCO_train2014_000000347596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33303, "question_id": "mAR2RUGWbtbUQA3cnAA6VM", "question": "The boy is most likely eating what?", "choices": ["carrot", "watermelon", "lemon", "lettuce"], "correct_choice_idx": 3, "direct_answers": ["broccoli", "vegetables", "broccoli", "broccoli", "unknown", "something green", "lettuce", "candy", "strawberry", "vegetables"], "difficult_direct_answer": false, "rationales": ["A nice green leafy produce one can eat alone is lettuce.", "The food is green, not orange, red, yellow, or red.", "He is eating lettuce."], "image": "train2014/COCO_train2014_000000033303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404257, "question_id": "mAVRUxGSKPE5QDNEfDeYTV", "question": "What item usually comes in a similar container?", "choices": ["milk", "oranges", "cat food", "hair dye"], "correct_choice_idx": 0, "direct_answers": ["milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["Milk is usually found in a carton.", "Many dairy products are packaged in a similar cardboard carton.", "The beverage closely associated with this type of container is milk."], "image": "val2014/COCO_val2014_000000404257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257943, "question_id": "mAm5iMRKDx7Gvu42zVTYUC", "question": "What material is the shield on the woman's face made of?", "choices": ["metal", "carbon fiber", "wood", "plastic"], "correct_choice_idx": 0, "direct_answers": ["metal", "metal", "steel", "metal", "metal", "metal", "metal", "carbon fiber", "metal", "metal"], "difficult_direct_answer": false, "rationales": ["Metal is the only of these materials normally strong enough to prevent a fast moving ball from hurting someone's face.", "The material protects her from a really fast ball hurting her face.", "It's made from steel so it doesn't break."], "image": "val2014/COCO_val2014_000000257943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204170, "question_id": "mAx4sTeafxpbRXLhaXjQrS", "question": "What is the problem with this photo?", "choices": ["too bright", "blurred", "photoshopped", "too dark"], "correct_choice_idx": 3, "direct_answers": ["night", "too dark", "too dark", "color", "poor lighting", "too dark", "no color", "dark", "dark", "black white"], "difficult_direct_answer": false, "rationales": ["This picture is too dark because you have troubles seeing every detail", "It's difficult to make out details because of a.", "The background is almost blackened out."], "image": "train2014/COCO_train2014_000000204170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550394, "question_id": "mB6WaZdpLFSooSjUAqeGV6", "question": "Where does the cat get his water from?", "choices": ["cup", "faucet", "hose", "bowl"], "correct_choice_idx": 1, "direct_answers": ["faucet", "faucer", "faucet", "faucet", "faucet", "faucet", "faucer", "tap", "faucet", "faucet"], "difficult_direct_answer": false, "rationales": ["The cat is in a sink.", "The cat is in the faucet.", "The cat is standing on a sink with a faucet that water is clearly coming out of."], "image": "train2014/COCO_train2014_000000550394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440273, "question_id": "mBB2yQSWFZz4mX6Tti8gyH", "question": "What do both of the people have on their heads?", "choices": ["cowboy hats", "glasses", "gas masks", "helmets"], "correct_choice_idx": 0, "direct_answers": ["cowboy hats", "cowboy hats", "hat", "cowboy hats", "cowboy hats", "cowboy hats", "hats", "hats", "cowboy hats", "cowboy hat"], "difficult_direct_answer": false, "rationales": ["The two men that are visible are wearing cowboy-style hats on their heads and that is the only type of headwear seen.", "Their headwear is made out of straw or felt. their faces are uncovered.", "The people talking by the carriage are both wearing cowboy hats on their heads."], "image": "train2014/COCO_train2014_000000440273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9226, "question_id": "mBaGxBZiFinjwFMUaYQ2xF", "question": "What is the white poster near the woman used for?", "choices": ["mailing", "painting", "selling", "advertising"], "correct_choice_idx": 3, "direct_answers": ["sitting down", "ads", "marketing", "advertisement", "floor", "advertising", "furnitute ad", "advertising", "advertising", "advertising"], "difficult_direct_answer": false, "rationales": ["This type of panel shows business or organizational ads.", "The poster is an ad.", "The poster has an ad for selling comfortable furniture."], "image": "train2014/COCO_train2014_000000009226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451471, "question_id": "mBpxviaLNWHWNKrcGKDSVs", "question": "The flag represents which country?", "choices": ["france", "uk", "italy", "us"], "correct_choice_idx": 3, "direct_answers": ["us", "america", "united states", "usa", "america", "usa", "united states", "united states", "usa", "usa"], "difficult_direct_answer": false, "rationales": ["The flag is red, white, and blue. it has stars and stripes.", "That is a usa flag.", "The blue, red and white colors along with the stars and stripes are famous for being the us flag."], "image": "val2014/COCO_val2014_000000451471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9112, "question_id": "mByqHbv4aVuLq99uCHSsJX", "question": "What are the arrow signs telling people?", "choices": ["no turning", "go right", "go left", "stop"], "correct_choice_idx": 0, "direct_answers": ["no turns", "no turn", "no turning", "no turn", "no turns", "don't turn", "no turning", "no turning", "no turning", "no turns"], "difficult_direct_answer": false, "rationales": ["The arrows say no turning.", "Each sign contains an arrow that is covered by a red circle and a red line. a driver is not allowed to go in the direction of a crossed out arrow.", "The signs say no turns."], "image": "train2014/COCO_train2014_000000009112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477700, "question_id": "mC74Q9P7kQsyrWd34G4bXS", "question": "Why is the plane hanging in the air?", "choices": ["hiding", "for display", "fell there", "is stuck"], "correct_choice_idx": 1, "direct_answers": ["display", "display", "for display", "exhibited", "display", "display", "display", "display", "for display", "museum"], "difficult_direct_answer": false, "rationales": ["It is suspended by cables and being shown in a museum exhibit.", "They would hang something like this to show it off in a museum or a place that likes this kind of decor.", "The plane is displayed."], "image": "val2014/COCO_val2014_000000477700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119761, "question_id": "mCVoqFNM2VkjhNhkzpwYL8", "question": "What sport is this?", "choices": ["soccer", "baseball", "equestrian", "basketball"], "correct_choice_idx": 2, "direct_answers": ["show jumping", "horse redding", "equestrian", "riding", "show jumping", "horse jumping", "equestrian", "jumping", "jumping", "horseback riding"], "difficult_direct_answer": false, "rationales": ["It's riding horses competition.", "The horse is an equine animal.", "There is a woman riding a horse."], "image": "train2014/COCO_train2014_000000119761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90490, "question_id": "mCeXHnHhf9TG9K4L6wXTaf", "question": "The display is part of which retail store?", "choices": ["target", "sears", "walmart", "best buy"], "correct_choice_idx": 3, "direct_answers": ["best buy", "apple", "gadget accessories", "apple", "apple", "computer", "apple", "apple", "best buy", "apple"], "difficult_direct_answer": false, "rationales": ["There are blue signs for the bathroom. best buy's color is blue.", "There are other brand posters in the background.", "A typical setup for that store."], "image": "train2014/COCO_train2014_000000090490.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195606, "question_id": "mD2tbj5c7xyxEHCWeTgHUq", "question": "What are the people walking in?", "choices": ["sand", "rain", "snow", "park"], "correct_choice_idx": 1, "direct_answers": ["rain", "rainwater", "rain", "rain", "market", "rain", "rain", "store", "dirt", "rain"], "difficult_direct_answer": false, "rationales": ["The people are in rain.", "They are holding umbrellas and you can see water draining.", "There is water pouring off the canvas awnings above them."], "image": "train2014/COCO_train2014_000000195606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493020, "question_id": "mD9HbGfkBZqvim46zSxkch", "question": "From where is the light most likely coming?", "choices": ["sun", "candles", "moon", "bonfire"], "correct_choice_idx": 2, "direct_answers": ["street lamps", "moon", "streetlight", "streetlight", "street lamp", "streetlight", "moon", "streetlight", "street lamp", "street lamps"], "difficult_direct_answer": false, "rationales": ["A man is skateboarding down the middle of the street. it is night time.", "There are no street lights shown and the moon is bright.", "There is a man skating with his dog at night."], "image": "val2014/COCO_val2014_000000493020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9760, "question_id": "mDAayHPPUNdNShJGgaFJji", "question": "What type of animal can be seen on top of the shelf near the back doors?", "choices": ["pigeon", "eagle", "owl", "hawk"], "correct_choice_idx": 2, "direct_answers": ["cat", "bird", "owl", "owl", "owl", "cat", "owl", "owl", "owl", "owl"], "difficult_direct_answer": false, "rationales": ["It has that round head with wings held against its body on the sides and looks like a vase.", "A statue of a large bird with big eyes is high on a shelf in a home. people like owls.", "In looking at the background one can see a stuffed animal at the top of the shelf. this animal is known as an owl."], "image": "train2014/COCO_train2014_000000009760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140340, "question_id": "mDtHB9K7Jmpt9tDAvXQFAW", "question": "In which European country can this ornate clock be found?", "choices": ["poland", "england", "france", "germany"], "correct_choice_idx": 2, "direct_answers": ["france", "france", "paris", "paris", "france", "france", "france", "london", "france", "spain"], "difficult_direct_answer": false, "rationales": ["That's what country it's in.", "The language of the advertisements are in french.", "There is a french word on the wall"], "image": "train2014/COCO_train2014_000000140340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108991, "question_id": "mDvy3MWcLTiAhXFUXB5CiU", "question": "Which photo goes first?", "choices": ["second right", "left", "second left", "right"], "correct_choice_idx": 1, "direct_answers": ["left", "first", "far left", "left", "far left", "left", "first one", "left", "fought", "left"], "difficult_direct_answer": false, "rationales": ["He appears to start at the top and then comes down, which makes sense.", "The man isn't visible yet in the left.", "He is just leaving the top of the drop."], "image": "train2014/COCO_train2014_000000108991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282618, "question_id": "mEyGsAzLWuPwhjs5Z87vei", "question": "What kind of stand is shown?", "choices": ["rental", "lunch", "produce", "drink"], "correct_choice_idx": 0, "direct_answers": ["snowboard", "ski stand", "wooden", "ski stand", "ski stand", "skaters", "rental", "ski stand", "ski holder", "ski rentals"], "difficult_direct_answer": false, "rationales": ["It's a place you can stay just a few days.", "A ski resort is full of people and skis are stacked up outside one of the buildings.", "The people are getting their gear."], "image": "val2014/COCO_val2014_000000282618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302657, "question_id": "mFbnYm2gEYC6JNfauYSHaz", "question": "What animal is known for eating the item on the plate?", "choices": ["cheetah", "badger", "monkey", "porcupine"], "correct_choice_idx": 2, "direct_answers": ["monkey", "monkey", "monkey", "primates", "monkey", "monkey", "monkey", "monkey", "monkey", "monkey"], "difficult_direct_answer": false, "rationales": ["If you see a monkey on television there is a good chance you will also see a banana.", "Bananas are eaten by monkeys.", "The item on the child's plate is a banana. monkey's love to eat bananas."], "image": "train2014/COCO_train2014_000000302657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123800, "question_id": "mFgVjq2WncgJnXPPNWBioA", "question": "What is the main purpose for all of the older style sail boats to be on the water?", "choices": ["transporting people", "trading goods", "exploration", "boat show"], "correct_choice_idx": 3, "direct_answers": ["fishing", "boat show", "sailing", "festival", "sail", "display", "unknown", "boat", "fishing", "travelling"], "difficult_direct_answer": true, "rationales": ["The boats are from a different time period.", "A large group of various ships are anchored in the water near each other.", "It's impractical for these types of vehicles to be used in modern times, and there are varied styles from different ages."], "image": "train2014/COCO_train2014_000000123800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1722, "question_id": "mFmvtXBLjjSP9SUJFywxbL", "question": "Which person gets the most soaked?", "choices": ["woman", "cyclist", "short man", "tall man"], "correct_choice_idx": 1, "direct_answers": ["on bike", "without umbrella", "rider", "bicyclist", "cycle", "biker", "cyclist", "bike rider", "bicyclist", "cyclist"], "difficult_direct_answer": false, "rationales": ["The cyclist is in the rain and while holding an umbrella, will likely be hit outside the range of the umbrella and from spray off of the bike, other passing vehicles etc. they are the only person visible outside of shelter from the rain and actively getting wet.", "The cyclist is the only person who can be seen outside in the open air where it is clearly raining.", "A man is riding on a bike in the rain and another is under an awning of a business. an awning blocks rain."], "image": "val2014/COCO_val2014_000000001722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314019, "question_id": "mFwX7w8vrWgxzgCitpVAcu", "question": "What venue is this person sitting at?", "choices": ["front yard", "park", "college campus", "street"], "correct_choice_idx": 3, "direct_answers": ["street", "city sidewalk", "school", "park", "courtyard", "bus stop", "city street", "school", "school", "bench"], "difficult_direct_answer": false, "rationales": ["The person is just sitting on the sidewalk near a road.", "She looks like a young woman and in the background, there is a building that's like a hall on a higher institution.", "The person is on a street."], "image": "train2014/COCO_train2014_000000314019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414659, "question_id": "mG7R5NCMyjDpHuRzzGjaYR", "question": "What does the business sell?", "choices": ["sandwiches", "soup", "fruit", "pastries"], "correct_choice_idx": 3, "direct_answers": ["donuts", "donuts", "donuts", "donuts", "donuts", "pastries", "donuts", "donuts", "donuts", "donuts"], "difficult_direct_answer": false, "rationales": ["The sign above the door has the word donuts on it, suggesting those are sold there, and they are a type of pastry.", "It sells sweets like doughnuts.", "The sign says donuts."], "image": "train2014/COCO_train2014_000000414659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402255, "question_id": "mG7uCzH5v2Kv5K74deeU3P", "question": "Why is the woman wearing a diamond ring?", "choices": ["she's flirty", "dress code", "fashion", "she's married"], "correct_choice_idx": 3, "direct_answers": ["she's married", "married", "married", "she's married", "she's married", "married", "married", "she's married", "married", "married"], "difficult_direct_answer": false, "rationales": ["She is in a committed relationship that she took a vow to someone.", "She is wearing the ring on her left 4th finger which is the finger culturally used to wear wedding rings.", "She is married."], "image": "train2014/COCO_train2014_000000402255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463615, "question_id": "mGC8uXMAGTrxVE4JGcai4f", "question": "What type of site is this?", "choices": ["burial", "historical", "religious", "construction"], "correct_choice_idx": 3, "direct_answers": ["barracks", "construction site", "construction", "forest", "construction", "construction", "construction", "construction", "construction", "construction"], "difficult_direct_answer": false, "rationales": ["There is an industrial vehicle, a lot of mud and large underground pipes in a pile.", "With all the pipes on the ground, there would be some building going on around there.", "The pies seem to indicate this option. the other options don't match unless an a dig found items from a past b, c or d site."], "image": "val2014/COCO_val2014_000000463615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434127, "question_id": "mGE2LYePwSqfTxyp4mktYA", "question": "What type of sink is this?", "choices": ["marble", "single", "double", "farmhouse"], "correct_choice_idx": 2, "direct_answers": ["kitchen sink", "kitchen", "kitchen", "kitchen sink", "kitchen", "kitchen", "kitchen", "double", "double", "kitchen sink"], "difficult_direct_answer": false, "rationales": ["It has two sides to use", "The sink is doubled.", "There are two sinks in the basin."], "image": "train2014/COCO_train2014_000000434127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461183, "question_id": "mGGfWsGWzZgbDuN9EPFFXY", "question": "Where will he put the yellow and black object?", "choices": ["feet", "hands", "waist", "head"], "correct_choice_idx": 0, "direct_answers": ["man", "on back", "bench", "feet", "on foot", "feet", "feet", "feet", "feet", "foot"], "difficult_direct_answer": false, "rationales": ["The yellow and black flipper will be worn on the man's feet. flippers enable divers to move efficiently in the water.", "Flippers are worn at the end of the lower extremities to help move faster in the water.", "He will put the objects on his feet to swim."], "image": "train2014/COCO_train2014_000000461183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553954, "question_id": "mGJ8FDUo2zxzz7VSwEdDdc", "question": "This activity is most associated with which people group?", "choices": ["tibetans", "tanzanians", "japanese", "inuit"], "correct_choice_idx": 2, "direct_answers": ["japanese", "sumo", "japanese", "sumo", "sumo wrestling", "sumo", "big people", "japanese", "sumo", "sumo"], "difficult_direct_answer": false, "rationales": ["The men are participating in sumo wrestling. i did an internet search for the county of origin of sumo wrestling.", "Sumo is a popular japanese sport with a history spanning hundreds of years.", "Sumo wrestlers are japanese."], "image": "val2014/COCO_val2014_000000553954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40517, "question_id": "mGmzsH2WfAbLU558DZCFFD", "question": "What type of buildings are these?", "choices": ["barn", "shed", "church", "high rise"], "correct_choice_idx": 3, "direct_answers": ["high rises", "skyscraper", "high rises", "skyscraper", "high rise", "apartment building", "high rise", "skyscrapers", "apartment", "hotel"], "difficult_direct_answer": false, "rationales": ["Tall buildings are all around a city.", "The buildings are fairly tall and are in an urban area.", "The tall buildings resemble a high rise."], "image": "val2014/COCO_val2014_000000040517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477655, "question_id": "mH2mi2oqUgYgYDKmMmw6BP", "question": "Why are they both trying to kick the ball?", "choices": ["trying steal", "is stolen", "is game", "are angry"], "correct_choice_idx": 2, "direct_answers": ["soccer", "soccer", "to win", "win", "competition", "opponents", "game", "to score", "is game", "score"], "difficult_direct_answer": true, "rationales": ["They want to play a game.", "The players are playing soccer based on the uniforms and equipment and are on separate teams based on their colors. the rules of the game determine why they would be interested in kicking the ball.", "These boys are playing soccer so they are both trying to kick the ball to get control."], "image": "val2014/COCO_val2014_000000477655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480812, "question_id": "mH8G9JboosjBj39h4jhUDx", "question": "What does the man have around his neck?", "choices": ["bowtie", "scarf", "tie", "wrench pendant"], "correct_choice_idx": 3, "direct_answers": ["wrench", "necklace", "necklace", "necklace", "miniature wrench", "necklace", "wrench pendant", "wrench", "wrench", "necklace"], "difficult_direct_answer": false, "rationales": ["The man has a necklace around his neck that is shaped like wrench.", "The man has a pendant.", "The man has a pendant."], "image": "train2014/COCO_train2014_000000480812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41123, "question_id": "mHANiLQd8ato7XZ5UrSJsS", "question": "What made the slot for the filling?", "choices": ["hammer", "saw", "knife", "straw"], "correct_choice_idx": 2, "direct_answers": ["knife", "knife", "knife", "knife", "ham", "knife", "knife", "ham", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["Bread is not commonly cooked already sliced in half for the purposes of food filling. the slice happens after baking and would commonly require something sharp enough to cut the bread into this configuration.", "Knives are used to cut bread.", "The bun has been sliced."], "image": "train2014/COCO_train2014_000000041123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39040, "question_id": "mHS3tJTD2niP3XYAPSW5HA", "question": "What are the people holding the umbrellas trying to avoid?", "choices": ["rain", "sun", "snow", "wind"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "raindrops", "raindrops"], "difficult_direct_answer": false, "rationales": ["There is water falling from the sky, and most people don't want to be wet.", "Umbrellas are used to avoid being rained on.", "Umbrellas protect people from falling water."], "image": "train2014/COCO_train2014_000000039040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240134, "question_id": "mHSgnjvsUCLwhEHgb2acX2", "question": "What type taste does the item shown here have?", "choices": ["bland", "sour", "sweet", "salty"], "correct_choice_idx": 2, "direct_answers": ["savory", "sweet", "sweet", "sweet", "spice", "sweet", "sweet", "sweet", "sweet", "sweet"], "difficult_direct_answer": false, "rationales": ["The item appears to have powdered sugar on it which would make the item sweet.", "It is a pastry item which is like a dessert", "The taste is sweet."], "image": "train2014/COCO_train2014_000000240134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338515, "question_id": "mHaNX4E4pwBa9PW8YiLgpY", "question": "Why is the ball so close to the racquet?", "choices": ["fell there", "bounced there", "random", "is hitting"], "correct_choice_idx": 3, "direct_answers": ["hitting", "return throw", "hitting", "is hitting", "ball hitting", "hitting ball", "hitting ball", "hitting ball", "returning ball", "making contact"], "difficult_direct_answer": false, "rationales": ["The woman is hitting the ball so it seems to be racquet.", "The player is hitting the ball.", "The ball is getting hit."], "image": "train2014/COCO_train2014_000000338515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345960, "question_id": "mHicD7PArWgDYThMK4dKUf", "question": "How are dishes cleaned here?", "choices": ["they aren't", "dishwashing machine", "sponged", "air washed"], "correct_choice_idx": 1, "direct_answers": ["dishwasher", "dishwasher", "dishwashing machine", "dishwasher", "dishwasher", "dishwasher", "dishwasher", "in dishwasher", "dishwasher", "in dishwasher"], "difficult_direct_answer": false, "rationales": ["There is a dishwasher visible in the left corner.", "Dishes are in the dishwasher.", "She has a dishwasher."], "image": "val2014/COCO_val2014_000000345960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493056, "question_id": "mHojgf2gM4np6b8gNgwBQ7", "question": "Dark condition is due to the absence of which molecule?", "choices": ["electron", "photon", "neutron", "proton"], "correct_choice_idx": 1, "direct_answers": ["bright", "light", "photon", "light", "chlorophyll", "photosynthesis", "light", "light", "light", "light"], "difficult_direct_answer": false, "rationales": ["There is a lack of light in parts of this image. light is made up of photons.", "The condition is for photons.", "Without photons there's no light."], "image": "train2014/COCO_train2014_000000493056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111930, "question_id": "mJ7hJsbaaJ7ZdSUqGadgiW", "question": "What companies logo can be seen on the white snow ramp?", "choices": ["prada", "gucci", "dc", "vans"], "correct_choice_idx": 2, "direct_answers": ["converse", "dc", "chanel", "dc", "successful", "gucci", "bmw", "dc", "dc", "chanel"], "difficult_direct_answer": false, "rationales": ["Companies often post their brand's logo on public areas. the dc logo is quite striking on the snow ramp.", "Dc's logo is apparent.", "There is a capital d interlocking with a capital c."], "image": "train2014/COCO_train2014_000000111930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263120, "question_id": "mJ8dDpCxdKYrkDJqxdVQmJ", "question": "What is next to the elephant?", "choices": ["baby wolf", "bear", "trainer", "water"], "correct_choice_idx": 3, "direct_answers": ["pond", "pond trees", "another elephant", "water", "water hole", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["Elephants gather at a small water source.", "The elephants are at a watering hole.", "A pool of water is next to him."], "image": "val2014/COCO_val2014_000000263120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380724, "question_id": "mJCbX9aaP43Bfh4XXtCnfF", "question": "What would happen to his speed if he moved to the green area?", "choices": ["slow down", "speed up", "unknown", "stay same"], "correct_choice_idx": 0, "direct_answers": ["slow down", "decrease", "decrease", "slow down", "slow down", "slow down", "slow down", "decrease", "slow down", "decrease"], "difficult_direct_answer": false, "rationales": ["A skateboard can't roll on the grass.", "The area he is currently on is flat, hard, and free of debris. the green area is bumpy, soft, and contains plants, so it has a lot more friction, and adding friction causes a change in speed.", "The person is on a flat road."], "image": "train2014/COCO_train2014_000000380724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49165, "question_id": "mJGAupvSzhaKmM5sSXP4Z5", "question": "What energy moves these boats?", "choices": ["manual force", "diesel", "electricity", "gas"], "correct_choice_idx": 2, "direct_answers": ["gas", "outboard motor", "gas", "motor", "gasoline", "motor", "gasoline", "electricity", "outboard motor", "propulsion"], "difficult_direct_answer": false, "rationales": ["The motors being used are traditional dc wound motors that propel the boat forward.", "Electricity moves these boats since they are powered by an engine.", "Electricity powers them."], "image": "val2014/COCO_val2014_000000049165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248052, "question_id": "mJW8FKJCQ5n2WJdhSRgviz", "question": "What is the entertainment tonight for the people eating dinner?", "choices": ["acrobats", "movie", "live singing", "magic show"], "correct_choice_idx": 2, "direct_answers": ["singer", "singer", "singer", "singer", "singer", "music performance", "singing", "singing", "karaoke", "live singing"], "difficult_direct_answer": false, "rationales": ["The lady is holding a microphone and performing for them.", "The entertainment is singing.", "The entertainer appears to be singing."], "image": "train2014/COCO_train2014_000000248052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334941, "question_id": "mJhyz7oGo5VyQsDPokLXxo", "question": "Why is this woman standing outside?", "choices": ["being loud", "using phone", "having tattoo", "smoking"], "correct_choice_idx": 3, "direct_answers": ["smoking", "smoking", "smoking", "smoking", "to smoke", "smoking", "smoking", "texting", "smoking", "smoking"], "difficult_direct_answer": false, "rationales": ["She has a cigarette in her hand.", "A woman outside at night leans against a wall while making a phone call and having a cigarette.", "The woman is smoking since she has a cigarette."], "image": "val2014/COCO_val2014_000000334941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419029, "question_id": "mJzeAqUwCWcsokjWjYeFhh", "question": "What does the sign on the building indicate might be obtained there?", "choices": ["alcohol", "clothing", "medical aid", "food"], "correct_choice_idx": 2, "direct_answers": ["medical assistance", "peace", "medical health", "scaring", "first aid", "medical aid", "aide", "medical services", "first aid", "medical treatment"], "difficult_direct_answer": true, "rationales": ["A building near people skiing has a white cross on a white background on a sign on it. the sign is a symbol for medical facilities.", "The symbol in question is a white cross on a red background which is an internationally recognized symbol associated with answer a.", "A cross on a red and white sign is an international symbol for medic"], "image": "train2014/COCO_train2014_000000419029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559288, "question_id": "mKAmf56u4kiRMAB6CSpkRU", "question": "Why does the truck have sign on top?", "choices": ["original part", "vandalism", "keep score", "advertising"], "correct_choice_idx": 3, "direct_answers": ["advertising", "advertisement", "advertising", "passing store", "advertising", "brand itself", "it doesn't", "advertising", "advertisement", "advertisement"], "difficult_direct_answer": false, "rationales": ["The truck has an ad.", "There is advertising for a warehouse on top of the truck.", "It is an ad for a place."], "image": "train2014/COCO_train2014_000000559288.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3995, "question_id": "mKWGQhuHWYQ6podoYjrRjt", "question": "What is the orange cone for?", "choices": ["pilon", "parking", "boundary", "goal"], "correct_choice_idx": 2, "direct_answers": ["marker zone", "boundary", "set barrier", "warning", "marks boundary", "line marking", "boundary", "boundry", "border", "end marking"], "difficult_direct_answer": true, "rationales": ["The cone is a boundary.", "The orange cone on the grass is used to mark a boundary for the frisbee game.", "Cones denote boundaries for athletics occurring on fields. the cone is on a pale white line that also denotes a boundary."], "image": "train2014/COCO_train2014_000000003995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75213, "question_id": "mL7zMpPR7Cb7EuKeVsb7z7", "question": "What kitchen appliance is against the wall?", "choices": ["microwave", "dishwasher", "oven", "fridge"], "correct_choice_idx": 3, "direct_answers": ["fridge", "refrigerator", "fridge", "refrigerator", "fridge", "refrigerator", "refrigerator", "fridge", "fridge", "fridge"], "difficult_direct_answer": false, "rationales": ["The fridge is against the wall.", "The refrigerator looks vintage.", "There is a fridge next to the wall."], "image": "train2014/COCO_train2014_000000075213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521998, "question_id": "mLADQPbHPFRFZW9gvWqXsN", "question": "What is locked to the street sign?", "choices": ["scooter", "bicycle", "wagon", "motorcycle"], "correct_choice_idx": 1, "direct_answers": ["cycle", "bike", "bolts", "bicycle", "bike", "bike", "hour parking", "bike", "bike", "bicycle"], "difficult_direct_answer": false, "rationales": ["The vehicle has two wheels. it does not have an engine.", "The bike is locked.", "It is metal and has two wheels with a seat and handlebars"], "image": "train2014/COCO_train2014_000000521998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557721, "question_id": "mLAQMWERic4ZBsXKmc6K6X", "question": "What is the purpose of the cones?", "choices": ["discourage children", "for sale", "obstruction", "decorative"], "correct_choice_idx": 2, "direct_answers": ["slalom", "challenge", "obstacles", "weaving into", "safety", "obstacles", "obstacle course", "competition", "obstruction", "skateboard course"], "difficult_direct_answer": true, "rationales": ["The cones create an obstacle for the skateboarder to go through.", "They set up an obstacle course.", "They are there to make a \"roadway\" for the skater and keep people away from the area where he's skateboarding in."], "image": "train2014/COCO_train2014_000000557721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102056, "question_id": "mLH755AzG8TJNG5X4zm8is", "question": "Which head covering is made of the hardest material?", "choices": ["black/white", "pink", "red", "black"], "correct_choice_idx": 3, "direct_answers": ["plastic", "helmet", "plastic", "helmet", "helmet", "plastic", "helmet", "helmet", "black", "helmet"], "difficult_direct_answer": false, "rationales": ["People are getting into a van and some are wearing hoods while others have helmets on that are plastic rather than made from material.", "The roof of the bus is made of a metal.", "The dual-colored, red, and pink head coverings are made out of fabric or soft plastic. the helmet is made out of hard plastic."], "image": "val2014/COCO_val2014_000000102056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51285, "question_id": "mLHvBZ37h987Vi27558Eap", "question": "The logo in the skirt indicates which brand?", "choices": ["tiger", "nike", "puma", "recap"], "correct_choice_idx": 2, "direct_answers": ["puma", "puma", "puma", "puma", "puma", "puma", "puma", "puma", "puma", "puma"], "difficult_direct_answer": false, "rationales": ["The logo is a jumping large jungle cat.", "The logo on the brand is that of a jumping cat. this is known to be of the puma brand.", "It is the animal symbol for this brand"], "image": "train2014/COCO_train2014_000000051285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456184, "question_id": "mLRSYYwuFxNz4Tw2eSqjS4", "question": "What color is the boy in the red jacket's hood?", "choices": ["purple", "pink", "black", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "green", "green", "red", "green", "green", "red", "green", "red"], "difficult_direct_answer": false, "rationales": ["Part of the hoodie is green.", "The hood is a bright green color.", "His hood is not black, pink, or purple."], "image": "val2014/COCO_val2014_000000456184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374955, "question_id": "mLYsurbg9QsvBzmPL3oMrK", "question": "The object the man is flying has a similar color pattern to what animal?", "choices": ["puma", "zebra", "flamingo", "giraffe"], "correct_choice_idx": 2, "direct_answers": ["flamingo", "flamingo", "flamingo", "flamingo", "flamingo", "bird", "flamingo", "flamingo", "bird", "flamingo"], "difficult_direct_answer": false, "rationales": ["The animal is a light bright pink color with black wing petals.", "Flamingoes are a bright pink color.", "Flamingos are pink. the color and pattern of the kite are also pink and it is designed like this bird."], "image": "train2014/COCO_train2014_000000374955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387977, "question_id": "mLZku6DBGnNt9WzPGC7U8u", "question": "What area is behind Plane runway?", "choices": ["political building", "cow field", "munitions plant", "mall"], "correct_choice_idx": 1, "direct_answers": ["cow field", "runway", "farm", "farm", "run way", "park", "pasture", "park", "cow pasture", "farm"], "difficult_direct_answer": false, "rationales": ["You can see black and white cows in the background.", "There are animals in the distant background that have the size, shape and coloring of cows which indicates the likely purpose of the area.", "Cows can be seen in grass behind a plane."], "image": "train2014/COCO_train2014_000000387977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49450, "question_id": "mLn35ZUj38Ptftc2fjVeR2", "question": "What are the women doing on the road?", "choices": ["waiting", "sunbathing", "posing", "travelling"], "correct_choice_idx": 2, "direct_answers": ["hitchhiking", "hitchhiking", "hitchhiking", "hitchhiking", "hitchhiking", "hitchhiking", "posing", "hitchhiking", "hitchhiking", "hitchhiking"], "difficult_direct_answer": false, "rationales": ["The women are posing for a picture and pretending as though they are hitchhikers. the image is a photograph for advertising purposes.", "They look to be having their picture taken.", "The women are posing on the road."], "image": "val2014/COCO_val2014_000000049450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293853, "question_id": "mLpYENvvV95YHfby2jqy6o", "question": "What are the women most probably using the open umbrellas for?", "choices": ["sunlight", "wind", "rain", "props"], "correct_choice_idx": 3, "direct_answers": ["photo shoot", "props", "rain", "fashion show", "rain", "rain", "pictures", "protect rain", "fashion", "photoshoot"], "difficult_direct_answer": false, "rationales": ["The women are using it as props.", "They're using props.", "The ladies are posing with umbrellas. all have different umbrellas for their photos."], "image": "train2014/COCO_train2014_000000293853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581097, "question_id": "mLr4dHncU3aexJuUMWcb93", "question": "What is next to the dog?", "choices": ["cat", "lemon", "apple", "sneakers"], "correct_choice_idx": 3, "direct_answers": ["dresser", "dvds", "sneakers", "desk", "human", "person", "human", "shoes", "shoes", "pillow"], "difficult_direct_answer": false, "rationales": ["Shoes are visible.", "There are shoes on the floor near him", "There are sneakers on the floor next to the dog."], "image": "train2014/COCO_train2014_000000581097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541970, "question_id": "mM6ptLzyijuoAVr2XVbgPf", "question": "Why are some of the animals smaller than other?", "choices": ["breed", "age", "injuries", "malnourished"], "correct_choice_idx": 1, "direct_answers": ["babies", "age", "they're babies", "babies", "babies", "babies", "born later", "babies", "they're babies", "younger"], "difficult_direct_answer": false, "rationales": ["They are babies. babies are younger than the adults.", "Some of the animals are smaller than the others because of their age.", "The animals are aging."], "image": "train2014/COCO_train2014_000000541970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231657, "question_id": "mMDiq3ehT2sftXw99hcg2o", "question": "What meat is most likely being served with this dish?", "choices": ["lamb", "steak", "fish", "chicken"], "correct_choice_idx": 0, "direct_answers": ["rice beans", "been", "lamb", "beef", "bbq", "pork", "beef", "hamburger", "beef", "lamb"], "difficult_direct_answer": false, "rationales": ["The bone in the meat is small.", "The bone sticking out is how you can tell what it is.", "The meat is brown, not white or red. it has a shank."], "image": "train2014/COCO_train2014_000000231657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144562, "question_id": "mMGUjPYsrq5zoxKPyVwAqc", "question": "Who holds an all-time record in a statistical category of this sport?", "choices": ["michael jordan", "rickey henderson", "wayne gretzky", "tiger woods"], "correct_choice_idx": 1, "direct_answers": ["nba", "no idea", "rickey henderson", "master athlete", "pete rose", "ty cobb", "player", "barry bonds", "babe ruth", "unknown"], "difficult_direct_answer": true, "rationales": ["Tiger woods, michael jordan, and wayne gretzky play sports other than baseball.", "Rickey henderson holds a record.", "Only one person on the list plays baseball. the rest were at the top of their sport of golf, basketball, or hockey."], "image": "val2014/COCO_val2014_000000144562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331214, "question_id": "mMHZJCjpSpuvYZLe8NbNLJ", "question": "Which conveyance pictured here uses less gas?", "choices": ["truck", "van", "buggy", "car"], "correct_choice_idx": 2, "direct_answers": ["horse cart", "wagon", "buggy", "wagon", "horse carriage", "horse buggy", "horse", "horse", "horse carriage", "carriage"], "difficult_direct_answer": false, "rationales": ["The buggy in the middle of the road uses less gas than all of the cars parked on the street.", "A buggy used horses to power itself and uses no gas.", "The conveyance is a buggy."], "image": "train2014/COCO_train2014_000000331214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466964, "question_id": "mMKkyqoi2QbvN8bgQGB9wv", "question": "What is the ground covered with?", "choices": ["water", "grass", "snow", "dirt"], "correct_choice_idx": 3, "direct_answers": ["dirt", "consumer goods", "dirt", "wares", "dirt", "clothes", "sand", "sand", "sand", "dirt"], "difficult_direct_answer": false, "rationales": ["The ground appears to be brown and it is likely a dirt floor.", "The ground is brown.", "The ground has dirt on it."], "image": "train2014/COCO_train2014_000000466964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284286, "question_id": "mMpcGAJLnzQ99XQe7wYC3B", "question": "What number is closest to how many people are under the middle umbrella?", "choices": ["four", "one", "twenty", "ten"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "four", "five", "four", "4 people", "five", "five"], "difficult_direct_answer": false, "rationales": ["There is more than three people and less than five people visible.", "Approximately five people are under the middle umbrella.", "I see about 8 shoes."], "image": "val2014/COCO_val2014_000000284286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498669, "question_id": "mMyrCmrjbJXDrp6AVymw7v", "question": "How full does he hope the boat will be at the end of the day?", "choices": ["half full", "full", "empty", "quarter full"], "correct_choice_idx": 2, "direct_answers": ["as much", "max capacity", "empty", "empty", "over-packed", "full", "not very", "overflowing", "empty", "empty"], "difficult_direct_answer": false, "rationales": ["The man would likely have an empty boat after selling the food.", "He wants to sell all the goods.", "It appears the person is bringing produce somewhere with the intent to sell based on the volume they have and the way they are wrapped. if one was looking to sell their produce they would likely want to sell all of it to maximize profits which would leave them with an empty boat."], "image": "train2014/COCO_train2014_000000498669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439139, "question_id": "mNMdmKzpDJhCdCA5dU9v3Q", "question": "For what is the woman using the umbrella?", "choices": ["thunder", "shade", "hail", "rain"], "correct_choice_idx": 3, "direct_answers": ["coverage", "rain", "rain", "cover", "rain", "rain protection", "stay dry", "rain", "protection", "avoid rain"], "difficult_direct_answer": false, "rationales": ["The woman is on a wet surface and there is a cloudy sky.", "The person is using the umbrella so she doesn't get so wet while outside and in bad weather.", "The woman seems to be sheltering herself from rain."], "image": "train2014/COCO_train2014_000000439139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237381, "question_id": "mNcYfNTow6Hi49HEzU8bg7", "question": "What are these kids doing?", "choices": ["playing ball", "resting", "fighting", "eating"], "correct_choice_idx": 0, "direct_answers": ["baseball", "baseball", "playing baseball", "playing ball", "playing ball", "playing baseball", "baseball", "playing", "playing baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["Kids are standing in a grassy area. one kid is holding a long stick over his shoulder.", "They are playing baseball.", "The kids are standing in a grass field and playing ball with one another."], "image": "train2014/COCO_train2014_000000237381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10689, "question_id": "mNmRKF4y4pdewinMcMbKqF", "question": "What period of the day is shown in the image?", "choices": ["night", "afternoon", "evening", "morning"], "correct_choice_idx": 3, "direct_answers": ["morning", "late morning", "1135am", "late morning", "afternoon", "noon", "evening", "morning", "afternoon", "morning"], "difficult_direct_answer": false, "rationales": ["The period of time shown is morning daytime.", "It is not noon yet", "It is light outside and the hands on the clock have not yet reached 12."], "image": "train2014/COCO_train2014_000000010689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286499, "question_id": "mNpdhHMwDfG5v6WYWmMTpT", "question": "What might you need to us the item on the left?", "choices": ["driving license", "boat license", "gun", "passport"], "correct_choice_idx": 0, "direct_answers": ["street", "wheel", "gasoline", "crane", "license", "driving license", "key", "license", "players", "forklift"], "difficult_direct_answer": true, "rationales": ["To operate motor vehicles such as those that are on display on the left one needs to pass a driving test and be issued a license.", "There are cars stacked up on the left and a license is required to drive one on public property.", "A license is needed."], "image": "train2014/COCO_train2014_000000286499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383212, "question_id": "mPv7HFUsVZR7tw4wjvybNQ", "question": "What kind of a number is on the bike?", "choices": ["composite", "odd", "even", "negative"], "correct_choice_idx": 1, "direct_answers": ["motocross", "forty seven", "forty seven", "seven", "forty seem", "arabic", "forty seven", "racing", "odd", "race tag"], "difficult_direct_answer": false, "rationales": ["The number is odd.", "It ends with a seven, which is one of the odd numbers.", "The number on both bikes end with 7. numbers that end in 7 are always odd."], "image": "train2014/COCO_train2014_000000383212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158876, "question_id": "mQ3s5MU6SDfskW7g5DADsU", "question": "What action is the person doing?", "choices": ["stirring", "eating", "kneading", "chopping"], "correct_choice_idx": 0, "direct_answers": ["cooking", "cooking", "cook", "stirring", "stirring", "cooking", "stirring", "cooking", "cooking", "stirring"], "difficult_direct_answer": false, "rationales": ["They have a spatula in a pot.", "A person is holding a spatula in a pot on the stove.", "A person is holding a spatula in a pout with bananas in it. people use spatulas to stir."], "image": "train2014/COCO_train2014_000000158876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97411, "question_id": "mQ7bG2DZc8h5sbjdmkJChu", "question": "Which topping contains the highest level of sodium?", "choices": ["cheese", "olive", "pepperoni", "mushroom"], "correct_choice_idx": 2, "direct_answers": ["pepperoni", "pepperoni", "olives", "pepperoni", "meat", "olives", "pepperoni", "olives", "olives", "pepperoni"], "difficult_direct_answer": false, "rationales": ["The other ingredients usually don't have as much sodium as cured meat.", "The pepperoni slices on the pizza likely has the highest sodium content.", "The meat is filled with salt."], "image": "train2014/COCO_train2014_000000097411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558449, "question_id": "mQ9JwwVc8kKbY64GHqFDJn", "question": "What type of flower pot is this?", "choices": ["terracotta", "plastic", "wood", "clay"], "correct_choice_idx": 0, "direct_answers": ["basin", "planter", "oval", "clay", "terra cotta", "flower planter", "concrete", "terracotta", "tub", "terracotta"], "difficult_direct_answer": true, "rationales": ["The flower pot is made of terracotta.", "The material for this pot looks like classic terracotta material in its color and by how it is worn.", "The pot is terracotta."], "image": "val2014/COCO_val2014_000000558449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81721, "question_id": "mQCHtReVHLLEBKoFDJEpmx", "question": "Why are they holding the glasses up?", "choices": ["escaping detection", "they're weapons", "confused", "being friendly"], "correct_choice_idx": 3, "direct_answers": ["drinking", "toast", "cheers", "cheers", "juice", "toast", "being friendly", "cheers", "toasting", "cheers"], "difficult_direct_answer": false, "rationales": ["The people are drinking martinis.", "They are touching their raised glasses together which is commonly what one does to toast. when people toast they are often celebrating something or sharing a friendly moment with someone.", "They're being friendly."], "image": "train2014/COCO_train2014_000000081721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323528, "question_id": "mQFp2LritmgxsgsmHgYcEG", "question": "What area is shown here?", "choices": ["farm", "urban", "rural", "suburban"], "correct_choice_idx": 1, "direct_answers": ["crosswalk", "urban", "crosswalk", "street", "street", "crosswalk", "street", "crosswalk", "street", "street"], "difficult_direct_answer": false, "rationales": ["The area is urban.", "There are many people.", "An urban area has big buildings for both business and residences with lots of people and traffic."], "image": "val2014/COCO_val2014_000000323528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189903, "question_id": "mQSYfibweLr94YRQkAT6JS", "question": "What pattern is the paper under the pizza?", "choices": ["checkerboard", "striped", "plain", "swirl"], "correct_choice_idx": 0, "direct_answers": ["checkered", "round", "checkered", "checkered", "round", "checkered", "checkerboard", "checkered", "checkered", "checkered"], "difficult_direct_answer": false, "rationales": ["The pattern alternates between red and white squares.", "This pattern is modified stripes consisting of crossed horizontal and vertical lines, forming squares.", "Paper under a pizza is made up of white and red squares."], "image": "train2014/COCO_train2014_000000189903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136233, "question_id": "mQjgHfieZUWcGSHZzBQHHQ", "question": "Why is the white board underneath them?", "choices": ["protect table", "serving board", "keep clean", "grater"], "correct_choice_idx": 0, "direct_answers": ["to protect", "support cutting", "cutting board", "cutting board", "to chop", "protect table", "protect counter", "protect counter", "for cutting", "protect counter"], "difficult_direct_answer": false, "rationales": ["The board is to protect the table from cut marks.", "Someone will use the knife to cut the carrots. the white board will prevent the surface underneath from being damaged.", "It is a cutting board. it enables the user to cut the carrots without damaging the object beneath it."], "image": "train2014/COCO_train2014_000000136233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53900, "question_id": "mQkqkb4uSsXzktbsaZjghz", "question": "What is the highest symbol representative of?", "choices": ["judaism", "islam", "buddhism", "christianity"], "correct_choice_idx": 3, "direct_answers": ["christianity", "church", "christ", "christ", "christianity", "christianity", "christianity", "christ", "frisbee", "christ"], "difficult_direct_answer": false, "rationales": ["The symbol is christian.", "There is a cross on the tower, which is a symbol of christianity.", "The cross represents christianity."], "image": "train2014/COCO_train2014_000000053900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461496, "question_id": "mQvbUvWG4QQkq68p7BeFDj", "question": "What allows the woman on the bike to carry needed items safely?", "choices": ["zip ties", "red wagon", "basket", "bike rack"], "correct_choice_idx": 2, "direct_answers": ["basket", "purse", "balancing", "basket", "bag", "basket", "basket", "basket", "basket", "basket"], "difficult_direct_answer": false, "rationales": ["On the front of the bike there is a basket and it has her items in it.", "It is on the front and is secured to the handlebars", "A woman rides a bike with a basket on the front. baskets are used to carry things."], "image": "train2014/COCO_train2014_000000461496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96157, "question_id": "mQy5yMwHzi8ihtKiVybgyc", "question": "What is near the neatly stacked up items?", "choices": ["baby", "door", "statue", "tiger"], "correct_choice_idx": 1, "direct_answers": ["door", "doors", "door", "doors", "tape", "door", "door", "tools", "doors", "double-doors"], "difficult_direct_answer": false, "rationales": ["It looks like a closet door.", "The closes thing to the stack is the door.", "The entrance to the room is on the left."], "image": "train2014/COCO_train2014_000000096157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540428, "question_id": "mRUNNgWJrgpPkTfWiYCGK5", "question": "What mammal is referenced on the adult male's shirt?", "choices": ["wild fawn", "mountain gorilla", "puma concolor", "salmon"], "correct_choice_idx": 2, "direct_answers": ["cougars", "cougars", "cougar", "cougar", "cougars", "cougar", "cougars", "cougars", "cougars", "puma concolor"], "difficult_direct_answer": false, "rationales": ["It's another name for this mammal.", "Cougars are another word for this kind of animal.", "The word \"cougars\" is displayed on the man's shirt. answer a is synonymous with \"cougars.\"."], "image": "train2014/COCO_train2014_000000540428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263758, "question_id": "mRaCM3fnDGV24wyJdibVcu", "question": "Where is the vehicle located?", "choices": ["runway", "underground bunker", "parking garage", "sea"], "correct_choice_idx": 0, "direct_answers": ["air strip", "runway", "runway", "runway", "airport", "runway", "airport", "runway", "runway", "air strip"], "difficult_direct_answer": false, "rationales": ["The plane is parked on a runway so it can take off for its destination.", "A large concrete airstrip is found with several planes riding down it.", "An airplane is on a long, paved road. airplanes take off and land from runways."], "image": "val2014/COCO_val2014_000000263758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521972, "question_id": "mRfS27JUwpuVvxRjHvf5iS", "question": "What vehicle is stored in front of the boat?", "choices": ["raft", "quad", "sedan", "bike"], "correct_choice_idx": 3, "direct_answers": ["bikes", "bike", "bicycles", "bicycles", "bike", "bicycles", "bicycles", "bicycles", "bicycle", "bike"], "difficult_direct_answer": false, "rationales": ["There are bicycles outside of the boat.", "There are bikes on the front.", "Bikes are crammed up front."], "image": "train2014/COCO_train2014_000000521972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141574, "question_id": "mRmyrhHgqe8yibMccZpnvy", "question": "What could assist someone who cannot swim here?", "choices": ["scooter", "surfboard", "gloves", "lifejacket"], "correct_choice_idx": 3, "direct_answers": ["life jacket", "surfboard", "life jacket", "lifejacket", "life vest", "life vest", "life preserver", "lifesaver", "life vest", "life vest"], "difficult_direct_answer": false, "rationales": ["Wearing a lifejacket helps a non swimmer.", "Lifejackets prevent drowning.", "A vest filled with floatable material can help poor swimmers."], "image": "val2014/COCO_val2014_000000141574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443641, "question_id": "mRv4b2EvJLGvtQUdrKytyr", "question": "What are the group of people attempting to do?", "choices": ["protest", "wrestle", "sit", "cross street"], "correct_choice_idx": 3, "direct_answers": ["cross street", "crossing street", "cross street", "cross street", "cross street", "crossing street", "cross street", "cross street", "cross street", "cross street"], "difficult_direct_answer": false, "rationales": ["They are walking to the other side of the road.", "People are at a crosswalk.", "The group crosses."], "image": "train2014/COCO_train2014_000000443641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160459, "question_id": "mRyRHrxEmG2KvfXRSyeSqj", "question": "What is inside the pastry shown here?", "choices": ["jelly", "cream", "nothing", "air"], "correct_choice_idx": 0, "direct_answers": ["cream", "jelly", "cream", "jelly", "jelly", "jelly", "jelly", "filling", "glaze", "flour"], "difficult_direct_answer": false, "rationales": ["This looks like a jelly donut and it will have jelly inside of it.", "That type of donut often has that filling.", "The pastry has jelly."], "image": "train2014/COCO_train2014_000000160459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561795, "question_id": "mS3S5zGkEJTeNfT3HfWzox", "question": "What are the men doing in the truck?", "choices": ["repairing it", "breaking it", "waxing it", "driving it"], "correct_choice_idx": 0, "direct_answers": ["restoring it", "repair", "standing", "fixing truck", "working", "repairing it", "fixing something", "working", "repairs", "working"], "difficult_direct_answer": false, "rationales": ["The men are repairing the truck.", "The men are working on a broken truck.", "The individuals are in a truck that appears to be in the process of repair."], "image": "val2014/COCO_val2014_000000561795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138935, "question_id": "mSEGsU7EfFwva72UyNJVLe", "question": "What does she fear might happen?", "choices": ["rain", "tornado", "snow", "sleet"], "correct_choice_idx": 0, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["She is holding an umbrella.", "The woman has an umbrella.", "The woman is holding an umbrella. she is expecting some sort of weather."], "image": "train2014/COCO_train2014_000000138935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397665, "question_id": "mSHg6djSq49nyiWiLZiwdw", "question": "What devices would the word at the top be associated with?", "choices": ["microwaves", "kettle", "cell phones", "microphones"], "correct_choice_idx": 2, "direct_answers": ["phones", "fruit", "fruit", "orange juicers", "drink", "telecom", "phones", "cell phones", "juicer", "unknown"], "difficult_direct_answer": false, "rationales": ["The device is a cell phone.", "The device is a cell phone.", "It provides internet services and also provides network services."], "image": "val2014/COCO_val2014_000000397665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85155, "question_id": "mSJiMsztg4VVDA2zbKG5V2", "question": "What fuel does the ferry run on?", "choices": ["diesel", "electricity", "coal", "oxygen"], "correct_choice_idx": 0, "direct_answers": ["oil", "diesel", "motor fuel", "gas", "diesel", "diesel", "gasoline", "gasoline", "diesel", "gas"], "difficult_direct_answer": false, "rationales": ["The fuel is diesel.", "They run on diesel.", "A ferry usually operates on a diesel engine."], "image": "train2014/COCO_train2014_000000085155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231322, "question_id": "mSMctDpgi7RPwKDoFArBKD", "question": "What's the baby done to get so messy?", "choices": ["found dirt", "opened jar", "ate food", "painted picture"], "correct_choice_idx": 2, "direct_answers": ["grabbed cake", "eat", "ate food", "eat", "cake", "eat cake", "eat cake", "cake eating", "eaten", "eat cake"], "difficult_direct_answer": false, "rationales": ["The baby stuck her hands in the cake in front of her and has smeared cake and frosting all over her face.", "There is a cake prominently displayed in the forefront. chunks matching missing parts of the cake are on the baby's body.", "He has the cake all over his hands and face."], "image": "train2014/COCO_train2014_000000231322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162780, "question_id": "mSP7vSRC2SF5v3vBcd9Jo2", "question": "What is on the end of tigger's head?", "choices": ["comb", "toothbrush", "sucker", "hairbrush"], "correct_choice_idx": 1, "direct_answers": ["tooth brush", "toothbrush", "toothpaste", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "brush", "toothbrush", "toothbrush"], "difficult_direct_answer": false, "rationales": ["This is in the child's mouth to clean his teeth", "The head part that is in the kid's mouth is against his teeth which means it's to clean teeth.", "The child is brushing his teeth with it."], "image": "train2014/COCO_train2014_000000162780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417031, "question_id": "mSee8BPsw5jsKj7E2LLL2A", "question": "What is this boat called?", "choices": ["lifeboat", "ship", "inner tube", "raft"], "correct_choice_idx": 3, "direct_answers": ["raft", "raft", "raft", "raft", "raft", "round boat", "raft", "raft", "tube", "raft"], "difficult_direct_answer": false, "rationales": ["The boat is a raft.", "This small boat is inflated and buoyant is mostly used for rafting because it is directed using ores.", "A family is in close proximity as it goes down in a large round inflatable water device. they are using oars to paddle thru water."], "image": "val2014/COCO_val2014_000000417031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200642, "question_id": "mSefYRryJi5ixZgsQJ3VP7", "question": "What animals are seen?", "choices": ["zebra", "lion", "gazelle", "elephant"], "correct_choice_idx": 3, "direct_answers": ["elephant", "elephants", "elephant", "elephant", "elephant", "elephant", "elephants", "elephants", "elephants", "elephants"], "difficult_direct_answer": false, "rationales": ["The animals are large and grey. they have trunks.", "The other types of animals, although from the same area of the world, aren't in this image.", "The animals are clearly visible and have all features consistent with answer a."], "image": "train2014/COCO_train2014_000000200642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402213, "question_id": "mSjAqyj3kWV5hfv6MwV3mX", "question": "What type of transportation is shown?", "choices": ["air", "rail", "road", "water"], "correct_choice_idx": 2, "direct_answers": ["city bus", "bus", "bus", "city bus", "road", "bus", "city bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["Buses are parked in the street. buses are driven on roads.", "The bus is on the road.", "The large bus in the photo has four rubber tires which it uses to travel. these tires support transportation made on roads."], "image": "train2014/COCO_train2014_000000402213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358903, "question_id": "mTZodjFXivTsV6UozuQERP", "question": "Why are they holding the ropes?", "choices": ["showing off", "are lost", "own them", "being towed"], "correct_choice_idx": 3, "direct_answers": ["getting pulled", "safety", "grip", "build speed", "waterskiing", "towing", "being towed", "skiing", "not fall", "get pulled"], "difficult_direct_answer": true, "rationales": ["The people are being towed.", "The people are water skiing based on the setting and equipment. the rope would be used for the purposes of answer a in order to allow the people to engage in this activity.", "I can't see behind the skiers to see if they are holding a rope, but the skiers are for sure showing off and holding ropes."], "image": "val2014/COCO_val2014_000000358903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53315, "question_id": "mTqRfoNoGcBwCvdLrdPkVU", "question": "What stylized image does the boy who is having a birthday today wear?", "choices": ["phone booth", "captain america", "flag", "skull"], "correct_choice_idx": 3, "direct_answers": ["skull", "skull", "skull", "skull", "skull", "skull", "skull", "skull", "skull", "skull"], "difficult_direct_answer": false, "rationales": ["A boy is wearing a graphic t-shirt.", "A boy has a shirt with a skull on it.", "The skull is menancing."], "image": "val2014/COCO_val2014_000000053315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267224, "question_id": "mTzC3tEWY9wNozEibSBKiN", "question": "What does the word on the back of car represent?", "choices": ["automobile manufacturer", "state name", "family name", "city name"], "correct_choice_idx": 0, "direct_answers": ["ford", "car maker", "ford", "ford", "automobile manufacturer", "make", "ford", "car manufacturer", "ford", "ford"], "difficult_direct_answer": false, "rationales": ["The name of the company that makes the truck is on the tailgate.", "The name at the back is the name of the company that made the car.", "The word is the maker."], "image": "val2014/COCO_val2014_000000267224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512459, "question_id": "mU2t8j7yZjgnkpRD6yNKUL", "question": "Why is the bus blurred in the picture?", "choices": ["cloudy lens", "moving fast", "flickering lights", "wet paint"], "correct_choice_idx": 1, "direct_answers": ["it's moving", "moving", "motion", "moving", "going fast", "moving", "its moving", "camera effects", "in motion", "moving fast"], "difficult_direct_answer": false, "rationales": ["The shutter setting allows it to be captured in action.", "Pictures are blurry when the subjects are moving in it.", "Speed can blur things when you are standing still."], "image": "train2014/COCO_train2014_000000512459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304964, "question_id": "mUVsKQwMgN9zmnXSKJjxkY", "question": "What are the people lining up for?", "choices": ["boarding bus", "boarding plane", "buying tickets", "buying food"], "correct_choice_idx": 3, "direct_answers": ["travel", "food", "phones", "passport", "buying food", "food", "food", "mobile", "dinner", "movie showing"], "difficult_direct_answer": false, "rationales": ["The atmosphere of this building with chairs and tables tells us it's probably a restaurant. a restaurant sells food.", "This is a restaurant scene.", "The location appears to be a food court with chairs visible and people eating. others would line up at a restaurant as is expected at a food court."], "image": "val2014/COCO_val2014_000000304964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316194, "question_id": "mUgSbTDyUY3NGMYitEUfiK", "question": "What is the red bus engaging in?", "choices": ["being repaired", "food sale", "being abandoned", "carrying passengers"], "correct_choice_idx": 1, "direct_answers": ["serving food", "food sale", "food service", "selling food", "restaurant", "traffic", "food truck", "catering", "serving food", "advertising"], "difficult_direct_answer": true, "rationales": ["This is a food truck because there is a menu in front of it and a window for the food to be served from", "The bus is selling food.", "It appears that this bus has been converted to a structure to sell items that people would enjoy eating."], "image": "train2014/COCO_train2014_000000316194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561578, "question_id": "mUm2QtA5Mhvf3y9NLUiGX8", "question": "What can someone know by looking at the white tower?", "choices": ["date", "time", "year", "month"], "correct_choice_idx": 1, "direct_answers": ["time", "time", "time", "time", "time", "time", "time", "time", "time", "time"], "difficult_direct_answer": false, "rationales": ["The white tower on the building has a clock on each side so people can tell the time by looking at it.", "The tower is equipped with a clock on the front.", "They can tell time."], "image": "val2014/COCO_val2014_000000561578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413122, "question_id": "mUvLiWUDZaGnSsFKRhB9rk", "question": "What type of weather is the beach seeing today?", "choices": ["hurricane", "snow", "wind", "rain"], "correct_choice_idx": 2, "direct_answers": ["sunny", "clear", "sunny", "beach weather", "sunny", "sunny", "clear", "wind", "sunny", "windy"], "difficult_direct_answer": false, "rationales": ["A kite is flying over a beach.", "The kites won't stay in the air without wind.", "Kites need wind."], "image": "train2014/COCO_train2014_000000413122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144863, "question_id": "mUxU7stMtWraeNxUqksgiM", "question": "How many inches long is the hot dog he is holding?", "choices": ["six", "eight", "thirteen", "twelve"], "correct_choice_idx": 3, "direct_answers": ["12", "twelve", "twelve", "eight", "twelve", "twelve", "twelve", "eight inches", "twelve", "eight"], "difficult_direct_answer": false, "rationales": ["A man is tilting a large hot dog that is bigger than the average one.", "The average man's hand size is 7 inches, which the hot dog is a few inches longer.", "It looks large in comparison to the man, and foot-long hot dogs are a well known street food item."], "image": "val2014/COCO_val2014_000000144863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439883, "question_id": "mV6Em5rWVGcpafdc4uWSAS", "question": "What time of day are the people skiing?", "choices": ["evening", "night", "morning", "dawn"], "correct_choice_idx": 0, "direct_answers": ["evening", "afternoon", "daytime", "day", "midday", "afternoon", "afternoon", "dusk", "midday", "morning"], "difficult_direct_answer": false, "rationales": ["The sun is still out but the shadows cannot be seen with it overhead, so it suggests the sun is lower in the sky and getting ready to set.", "The clouds are seen and it seems the dark is approaching.", "It doesn't look like the bright day but it's not totally dark yet. looks like evening time."], "image": "train2014/COCO_train2014_000000439883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312403, "question_id": "mV8ewetGQwUe6ocE9CV7nH", "question": "What household food might you find in the object on the right?", "choices": ["tinned fruit", "peanuts", "potato chips", "milk"], "correct_choice_idx": 3, "direct_answers": ["groceries", "mayo", "milk", "milk", "ice cream", "bread", "milk", "milk", "vegetables", "human food"], "difficult_direct_answer": false, "rationales": ["The object on the right is a refrigerator. tinned fruits, peanuts, and potato chips are not refrigerated.", "Milk is found in the fridge.", "Best appliances purpose is to keep food cold this type of liquid is one that needs to be kept at a cooler temperature so that it does not spoil. placing it in a warm environment such as the countertop would result in rotting."], "image": "train2014/COCO_train2014_000000312403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368367, "question_id": "mVHQfVKftGcF5MbnAhhDdK", "question": "Where is she most likely having a conversation on her cellphone?", "choices": ["street", "park", "restaurant", "school"], "correct_choice_idx": 2, "direct_answers": ["cafe", "coffee shop", "outside", "hotel", "patio", "restaurant", "restaurant", "patio", "outside", "outdoors"], "difficult_direct_answer": false, "rationales": ["The windows reflect many people in this setting most of which are sitting, she also has a drink with a straw in it sitting directly in front of her.", "Cups are on the table.", "She's at a restaurant."], "image": "val2014/COCO_val2014_000000368367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174747, "question_id": "mVYNcWsHdTufTzKVPZACQh", "question": "In which continent is this scene more likely to be typical?", "choices": ["australia", "asia", "antarctica", "south america"], "correct_choice_idx": 1, "direct_answers": ["asia", "french", "asia", "japan", "asia", "asia", "sea", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The styles of the building and boat are usually associated with this continent.", "The other options don't match the architecture or boat designs.", "Asian countries have pagodas."], "image": "train2014/COCO_train2014_000000174747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190603, "question_id": "mVbpiT9gv53JT3LEejnAfU", "question": "What does the blue road sign warn of?", "choices": ["intersection", "deaf children", "construction", "child crossing"], "correct_choice_idx": 0, "direct_answers": ["dead end", "intersection", "intersection", "dead end", "overspeeding", "dead end", "t intersection", "cross street", "dead end", "t ahead"], "difficult_direct_answer": false, "rationales": ["The t on this blue traffic sign is the shape of the upcoming arrangement of road.", "It is a street sign telling you there is an area coming up ahead.", "The sign depicts two roads crossing."], "image": "train2014/COCO_train2014_000000190603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222113, "question_id": "mVnfTjjuyAf2YAvCXUHZop", "question": "What object are the giraffes next to?", "choices": ["car", "boulder", "tree", "mountain"], "correct_choice_idx": 2, "direct_answers": ["tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["They're by a tree.", "The giraffe is near a tree trunk.", "The object near the giraffes appears to be a tree trunk based on the shape size and consistency."], "image": "train2014/COCO_train2014_000000222113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562356, "question_id": "mVrZY7qkkBpBxeAeAtCoBk", "question": "What type of vehicles are the men riding on?", "choices": ["skateboard", "motorcycle", "scooter", "bicycle"], "correct_choice_idx": 1, "direct_answers": ["motorcycle", "motorcycles", "motorcycles", "motorcycles", "motorcycle", "motorcycles", "motorcycles", "motorcycle", "motorcycle", "motorcycles"], "difficult_direct_answer": false, "rationales": ["This kind of vehicle has two wheels and an engine.", "The vehicles look like a bike but are motorized and have two wheels, so they are motorcycles.", "The vehicles are motorbikes."], "image": "val2014/COCO_val2014_000000562356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181682, "question_id": "mVwZK9LhJjCdqDHpjd9cpZ", "question": "When was the advertised company founded?", "choices": ["2000", "1990", "2007", "1888"], "correct_choice_idx": 3, "direct_answers": ["oh seven", "2007", "2007", "twothusand seven", "twothousand seven", "2007", "cincinnati ohio", "2007", "1888", "2007"], "difficult_direct_answer": false, "rationales": ["It is western & southern financial group", "The advertised company on the wall in the background is called western and southern financial group. they were founded in 1888 but sponsored a tennis tourney in 2007.", "The company is from the 1800s."], "image": "train2014/COCO_train2014_000000181682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354994, "question_id": "mWj2bHBk95BqMsn5wSrzUp", "question": "What do you have to add to the number on the yellow sign to get to 50?", "choices": ["seven", "12", "32", "30"], "correct_choice_idx": 0, "direct_answers": ["seven", "seven", "seven", "seven", "seven", "seven", "seven", "seven", "seven", "seven"], "difficult_direct_answer": false, "rationales": ["This is the obvious answer when subtracting 43 from 50.", "An airplane is at a gate with a yellow sign that has the number forty three on it. seven plus forty three is fifty.", "This is what 50 minus 43 equals"], "image": "train2014/COCO_train2014_000000354994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361265, "question_id": "mWpiqUz6WC2awfVKFKRcZu", "question": "What is the woman doing with the device in her right hand?", "choices": ["texting", "reading", "recording", "gaming"], "correct_choice_idx": 2, "direct_answers": ["dancing", "waving", "taking selfie", "photographing", "taking video", "recording", "filming", "filming", "videotaping", "taking photos"], "difficult_direct_answer": true, "rationales": ["She is making a video of herself.", "The woman has a video camera.", "She is holding a camcorder in her right hand. it is capturing a video."], "image": "val2014/COCO_val2014_000000361265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578866, "question_id": "mXJWWtUTYyr3SzzpSJi69J", "question": "What is this man experiencing?", "choices": ["sleet", "snow", "rain", "high wind"], "correct_choice_idx": 3, "direct_answers": ["high wind", "wind", "joy", "strong wind", "happiness", "wind", "wind", "wind", "victory", "wind"], "difficult_direct_answer": false, "rationales": ["The man's necktie is blowing up in the wind.", "The man is in wind.", "The man's tie is being blown."], "image": "train2014/COCO_train2014_000000578866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474034, "question_id": "mXMJ8i8DHncnHsFiGphC3R", "question": "What was the lady carrying a bag doing inside the place she is exiting?", "choices": ["singing", "shopping", "dancing", "selling things"], "correct_choice_idx": 1, "direct_answers": ["eating", "shopping", "shopping", "shopping", "shopping", "shopping", "shopping", "finished shopping", "shopping", "shopping"], "difficult_direct_answer": false, "rationales": ["The lady got the bag from a store.", "The lady is shopping.", "The woman has shopping bags with her."], "image": "val2014/COCO_val2014_000000474034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168763, "question_id": "mXTiJ8bXN9pqDo5Y7DkMkU", "question": "What is the profession of the men on horses?", "choices": ["waiter", "officer", "priest", "doctor"], "correct_choice_idx": 1, "direct_answers": ["police", "police", "two ridemen", "police", "police", "cop", "officer", "walking", "police", "police"], "difficult_direct_answer": false, "rationales": ["The men are wearing tactical gear, uniforms, and badges.", "The other options wouldn't apply to this type of scene or the uniforms worn by the riders.", "There is a mounted department in many police precincts."], "image": "train2014/COCO_train2014_000000168763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515566, "question_id": "mY9KvhkunKzn3ivE5hioPL", "question": "What part of the bus needs good traction to ride safely?", "choices": ["tires", "motor", "door", "windows"], "correct_choice_idx": 0, "direct_answers": ["tires", "wheels", "tires", "wheels", "wheels", "tires", "tires", "tires", "wheels", "tires"], "difficult_direct_answer": false, "rationales": ["These are what allow it to stop quickly if necessary and keep it from sliding in wet weather.", "It's important that the tires hug the road.", "The part is the tires."], "image": "train2014/COCO_train2014_000000515566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256546, "question_id": "mYDbMDtsjJsQt58MQS5xvD", "question": "The round green items on the food are also usually found in what color?", "choices": ["blue", "purple", "orange", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black", "black", "olives", "black", "black", "green", "pizza"], "difficult_direct_answer": false, "rationales": ["Olives can be green and black.", "The insides are black.", "The round green items are peas. in many other instances peas can normally be found to be black."], "image": "train2014/COCO_train2014_000000256546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304387, "question_id": "mYHzjiVRv7VubLAQZPqCNy", "question": "What is he doing?", "choices": ["swimming", "resting", "skiing", "wind surfing"], "correct_choice_idx": 3, "direct_answers": ["wind skiing", "wind surfing", "wind surfing", "surfing", "paragliding", "water skiing", "kite surfing", "surfing", "kitesurfing", "wind surfing"], "difficult_direct_answer": false, "rationales": ["It is evident by the rope showing that wind is used.", "He's wind surfing.", "The man is near water, not snow. he is above the water and is using a kite, so he is not swimming."], "image": "val2014/COCO_val2014_000000304387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217825, "question_id": "mYRruvL5iraT7NHhYwB4We", "question": "What is the bus next to?", "choices": ["palm tree", "curb", "cat", "baby"], "correct_choice_idx": 1, "direct_answers": ["van", "sidewalk", "curb", "curb", "curb", "newsstand", "curb", "sidewalk", "car", "van"], "difficult_direct_answer": false, "rationales": ["The side of the road has some stores in front.", "It is next to the curb.", "The bus is parked up next to the sidewalk."], "image": "val2014/COCO_val2014_000000217825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5483, "question_id": "mYaQd69PCgXisGp9tMFU63", "question": "What do people do here?", "choices": ["swim", "eat", "gamble", "drive"], "correct_choice_idx": 1, "direct_answers": ["buy lunch", "get food", "buy food", "fruit smoothies", "make food", "eat", "eat", "buy food", "order food", "shop"], "difficult_direct_answer": false, "rationales": ["People eat here.", "There is food on the counter.", "The people eat."], "image": "train2014/COCO_train2014_000000005483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241214, "question_id": "mYduKitGREMLuZCFdDDeFd", "question": "To what public place is this car likely headed?", "choices": ["park", "airport", "water park", "drygoods store"], "correct_choice_idx": 1, "direct_answers": ["hotel", "airport", "airport", "airport", "airport", "airport", "airport", "airport", "airport", "car parking"], "difficult_direct_answer": false, "rationales": ["Usually when someone packs a lot of suitcases into the back of their car they are traveling and since these suitcases have airport tags on them it is assumed that the car is headed to the airport.", "There is luggage in the back.", "The suitcases all have identifying tags on them for when they're put on a plane."], "image": "train2014/COCO_train2014_000000241214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207875, "question_id": "mYm9rbmKeko3e8zm5YWXhv", "question": "The country this plane is from has people that are likely descended from what historical figure?", "choices": ["edward iii", "rollo", "barbarossa", "saladin"], "correct_choice_idx": 3, "direct_answers": ["muhammad", "buddy", "saladin", "emirates", "genghis khan", "use", "caliph", "emirates", "emirates", "saladin"], "difficult_direct_answer": false, "rationales": ["Saladin comes from the middle east.", "The country's people came from saladin.", "The plane has an emirates, not air france, lufthansa, or british airways, livery."], "image": "train2014/COCO_train2014_000000207875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249325, "question_id": "mYoABGdTvYcZzXYEg5EmFJ", "question": "What type of birds are on the ground in front of the man?", "choices": ["doves", "fantails", "parrots", "pigeons"], "correct_choice_idx": 3, "direct_answers": ["pigeons", "pigeons", "pigeons", "doves", "pigeons", "pigeon", "pigeons", "pigeons", "pigeons", "pigeon"], "difficult_direct_answer": false, "rationales": ["The man is on a bench that is surrounded by pigeons.", "The birds have small heads and grey and white plumage.", "The birds have the same size and shape as answer a and appear to be in an environment that would be consistent with where they are normally found."], "image": "val2014/COCO_val2014_000000249325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49884, "question_id": "mYskGi8uvcwaPUyKvUyEP8", "question": "How many competitive teams are shown?", "choices": ["four", "one", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is one net, so that means the players are sharing the net and playing on the same team as the person on the same side.", "Two sets of people are playing against each other.", "A tennis court is shown with two people on each side of the net. doubles is played with two teams of two."], "image": "train2014/COCO_train2014_000000049884.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147919, "question_id": "mYu74cvEcSV3oeCT93warD", "question": "What kind of bread is this?", "choices": ["rye", "white", "naan", "wheat"], "correct_choice_idx": 2, "direct_answers": ["naan", "naan", "tortilla", "pancake", "tortilla", "naan", "tortilla", "nan", "toast", "pita"], "difficult_direct_answer": false, "rationales": ["It's naan bread", "Naan goes with curries.", "The bread is naan since this dish has indian food."], "image": "val2014/COCO_val2014_000000147919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425120, "question_id": "mYz4RVpVdYvisMfETtuAPR", "question": "What protection is offered within the long socks?", "choices": ["shin pads", "weights", "deflectors", "cooling"], "correct_choice_idx": 0, "direct_answers": ["shin guard", "grass burns", "shin pads", "kick protection", "from bruises", "shin", "shin brace", "pads", "shin guards", "shin guard"], "difficult_direct_answer": true, "rationales": ["There are pads inside the socks.", "There are stuffed leggings designed to protect the lower leg.", "Shin pads are located under soccer player's socks."], "image": "val2014/COCO_val2014_000000425120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132132, "question_id": "mZBauAkCMa7L2aDisQjgdE", "question": "Where can you see a copyright symbol?", "choices": ["top left", "bottom right", "top right", "bottom left"], "correct_choice_idx": 1, "direct_answers": ["bottom right", "yes", "lower right", "lower right", "bottom right", "bottom right", "corner", "bottom right", "corner", "bottom right"], "difficult_direct_answer": false, "rationales": ["There is an encircled c near the photographer's name.", "It is next to the photographer's name.", "The bottom right has the symbol."], "image": "val2014/COCO_val2014_000000132132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556349, "question_id": "mZQPQnUVrGGiTwCMt6DDug", "question": "What is the endure motor cycle designed for?", "choices": ["jumping", "climbing", "speed", "racing"], "correct_choice_idx": 2, "direct_answers": ["speed", "speed", "speed", "speed", "speed", "racing", "sports riders", "riding", "racing", "speed"], "difficult_direct_answer": false, "rationales": ["It has a racing design and is a light weight meant for faster speeds.", "A sleek motorcycle made for one is on a sidewalk.", "The endura is made for long-distance competitions."], "image": "val2014/COCO_val2014_000000556349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134948, "question_id": "mZavdAVg5N2LVSR42fAMdT", "question": "Why is the man on the beach holding the object?", "choices": ["to dance", "to ski", "to surf", "to swim"], "correct_choice_idx": 2, "direct_answers": ["to surf", "maintain ownership", "transport", "for surfing", "getting ready", "to surf", "going home", "wave waiting", "going surfing", "going surfing"], "difficult_direct_answer": false, "rationales": ["The man wants to take the surfboard out to ride.", "A man is walking on a beach with a surfboard.", "The man has a surfboard."], "image": "train2014/COCO_train2014_000000134948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22102, "question_id": "mZwiLsZSZW38oaPp4WwTVk", "question": "Regarding cooking pizza what did the cook do regarding cook time here?", "choices": ["just under", "too short", "too long", "exactly right"], "correct_choice_idx": 2, "direct_answers": ["overcooked", "overcooked", "burned", "over cooked", "fast cooking", "over cooked", "too long", "burn", "30 minutes", "overcook it"], "difficult_direct_answer": false, "rationales": ["The items was cooked too long because it has spots that have burnt on it indicating it was cooked longer than necessary.", "It is a little burnt", "The pizza has dark edges."], "image": "train2014/COCO_train2014_000000022102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178198, "question_id": "mZyCXJMJEKJ9ibh4qwMyvZ", "question": "In which country were eyeglasses invented?", "choices": ["italy", "wales", "france", "china"], "correct_choice_idx": 0, "direct_answers": ["italy", "italy", "italy", "england", "italy", "italy", "italy", "italy", "germany", "france"], "difficult_direct_answer": false, "rationales": ["Savino d'armante is attributed as the inventor of the first wearable eyeglasses.", "People from italy were reading a lot and invented glasses.", "According to an online search, this is the correct answer."], "image": "train2014/COCO_train2014_000000178198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205848, "question_id": "mZzuXSRmn5rApWGbi9QSrg", "question": "What type of buildings are the striped tall ones?", "choices": ["malls", "apartments", "factories", "lighthouses"], "correct_choice_idx": 3, "direct_answers": ["stacks", "smoke stack", "lighthouses", "chimney", "skyscrapers", "exhaust", "power plant", "incinerator", "smokestacks", "smokestacks"], "difficult_direct_answer": true, "rationales": ["They are factories.", "The tall striped buildings are factory smokestacks.", "They are smoke stacks."], "image": "train2014/COCO_train2014_000000205848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390002, "question_id": "ma2dsf7MAN7PyoP6vRfThw", "question": "What part of a car is symbolized in the objects they are holding?", "choices": ["seatbelt", "radio", "steering wheel", "headlights"], "correct_choice_idx": 2, "direct_answers": ["steering wheel", "steering wheels", "wheel", "steering wheel", "steering wheel", "steering wheel", "steering wheel", "steering wheel", "steering wheel", "steering wheel"], "difficult_direct_answer": false, "rationales": ["They are holding a circular controller that resembles the part of a car that drivers use to control the direction the car travels.", "Steering wheels are round and games often involve racing.", "This appears to be car simulation controllers for wii."], "image": "train2014/COCO_train2014_000000390002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233119, "question_id": "macbsvd8tDYDDNodTDZzoJ", "question": "What would you most likely do with a card near here?", "choices": ["magic trick", "open door", "play cards", "banking"], "correct_choice_idx": 3, "direct_answers": ["withdraw", "atm withdrawal", "withdraw money", "withdraw cash", "withdraw money", "go shopping", "board transit", "banking", "enter bank", "public transit"], "difficult_direct_answer": true, "rationales": ["The sign above the people refers to chase. this business provides financial services.", "There is a sign for chase which is a bank. banks often use cards for atm withdrawals and other services.", "There is an awning with a brand written on it that is associated with using cards for the purposes of answer a."], "image": "val2014/COCO_val2014_000000233119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115086, "question_id": "majQk6L6tg3cEgKkG8grxC", "question": "What is this appliance used for?", "choices": ["cooling", "cleaning", "watching", "cooking"], "correct_choice_idx": 0, "direct_answers": ["refrigeration", "refrigeration", "preserving food", "cooling freezing", "cooling", "food safety", "freezing food", "cold food", "display pictures", "storing perishables"], "difficult_direct_answer": true, "rationales": ["Refrigerators keep food cold.", "It is used to cool the food.", "This device is used to keep food fresh, which is done by chilling it."], "image": "train2014/COCO_train2014_000000115086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399626, "question_id": "marswG87DSoL8NNeNyML9y", "question": "The skier wearing what color of outfit is at a different height than others?", "choices": ["red", "yellow", "green", "brown"], "correct_choice_idx": 3, "direct_answers": ["red blue", "black red", "blue orange", "gray red", "brown", "grey", "black red", "grey red", "black", "black"], "difficult_direct_answer": false, "rationales": ["The person has on brown.", "The color is brown.", "The color is a dark shade of red and yellows mixed together."], "image": "val2014/COCO_val2014_000000399626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530284, "question_id": "maso2m88kuZSsVW8DftWHb", "question": "When did the TV company start using this name?", "choices": ["1958", "1962", "1935", "1945"], "correct_choice_idx": 0, "direct_answers": ["nineteen fiftyseven", "past", "before years", "unsure", "1958", "forties", "1960", "1958", "1988", "modern times"], "difficult_direct_answer": true, "rationales": ["This was 12 years after the company was founded", "Sony was launched in 1958.", "The tv is a sony tv based on the writing on the front and the name origin is internet searchable."], "image": "train2014/COCO_train2014_000000530284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4312, "question_id": "mb6G7zXAmzBw9ZPRi4SMBX", "question": "In which object was the item being photographed prepared?", "choices": ["grill", "open fire", "oven", "deep fryer"], "correct_choice_idx": 2, "direct_answers": ["kitchen", "pan", "oven", "pizzeria", "oven", "oven", "pot pie", "oven", "pizza", "oven"], "difficult_direct_answer": false, "rationales": ["The food item is a pizza. pizzas are not cooked on open fires, deep fried, or grilled.", "Pies are always baked.", "The item being photographed is a pizza. pizzas are baked, not deep fried or grilled."], "image": "val2014/COCO_val2014_000000004312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504165, "question_id": "mb72hiW3mm2HEogWXrAroN", "question": "What is most likely in the can that the child has taken from the fridge?", "choices": ["milk", "alcohol", "vegetable puree", "juice"], "correct_choice_idx": 3, "direct_answers": ["food", "juice", "juice", "tomato juice", "v8", "soda", "juice", "vegetable juice", "juice", "juice"], "difficult_direct_answer": false, "rationales": ["It is a small one serving size can with liquid in it", "The can belongs to a company called v8 which is known for its vegetable juices. it is also in a can with an opening for drinking.", "There is a picture of tomatoes on the can."], "image": "train2014/COCO_train2014_000000504165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359436, "question_id": "mbBzvnWZetoRREtxELdGWw", "question": "What flutters just beneath the main body of this kite?", "choices": ["nothing", "eagle", "tail", "pigeon"], "correct_choice_idx": 2, "direct_answers": ["thread", "tail", "tail", "tail", "flags", "string", "string", "tail", "little girl", "tassels"], "difficult_direct_answer": false, "rationales": ["It has a tail hanging down.", "Some kites have tails and that is what it looks like in the image.", "There is something just beneath the kite's main body. there are no birds near the kite."], "image": "val2014/COCO_val2014_000000359436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534502, "question_id": "mbFi6p5rHpHUXG3pBYzMxr", "question": "What is the reason behind the wet nose of dog?", "choices": ["secret mucus", "secretion", "glands", "none"], "correct_choice_idx": 0, "direct_answers": ["secret mucus", "smell", "licking", "keep cool", "cooling", "seeing", "better smelling", "hydration", "moisturize", "healthy"], "difficult_direct_answer": true, "rationales": ["The dog is excreting mucus.", "Mucus come from the nose.", "The dog is panting because it is hot or nervous. the nose is wet because it contains snot from the nose."], "image": "train2014/COCO_train2014_000000534502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450961, "question_id": "mbLF4aHF4DEayufpySQTq5", "question": "What type of food is in the image?", "choices": ["apple", "orange", "banana", "tomato"], "correct_choice_idx": 2, "direct_answers": ["bananas", "bananas", "bananas", "banana", "bananas", "banana", "banana", "banana", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["There are yellow, not green, orange, or red, food items.", "The fruit visible is yellow and banana shaped.", "Bananas are in the image."], "image": "train2014/COCO_train2014_000000450961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453785, "question_id": "mbhfuub93ubgMEai3Yr6VM", "question": "What is the gold pole used for?", "choices": ["pull ups", "climbing", "swinging", "hanging clothing"], "correct_choice_idx": 3, "direct_answers": ["clothes hanger", "hanging towels", "ladder", "climbing", "ladder", "hang clothes", "hanging", "climbing", "hanging clothing", "to climb"], "difficult_direct_answer": false, "rationales": ["The pole is wear clothes dry from.", "There are items in use on it, showing what the purpose is.", "Although it's designed to look like a ladder, it's actually a."], "image": "val2014/COCO_val2014_000000453785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336690, "question_id": "mbrhszHUjAXjh2Mot6KNrK", "question": "What do the orange cones set out signify?", "choices": ["free parking", "turns allowed", "crossing", "safety hazards"], "correct_choice_idx": 3, "direct_answers": ["safety buffer", "caution", "warning", "safety hazards", "obstacles", "danger", "stopping points", "safety", "mark positions", "safety zones"], "difficult_direct_answer": true, "rationales": ["The cones set out safety issues.", "The cones are for safety purposes.", "These cones direct people away from them."], "image": "train2014/COCO_train2014_000000336690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20245, "question_id": "mcCcpkX9zrsMACuwzvVUZm", "question": "What is the shape of this kite?", "choices": ["diamond", "box", "delta", "sled"], "correct_choice_idx": 2, "direct_answers": ["triangle", "delta", "triangle", "triangle", "triangle", "triangle", "triangle", "delta", "delta", "triangle"], "difficult_direct_answer": false, "rationales": ["Delta is sharp at the front.", "The triangular shape is that of delta.", "It is a triangular shape"], "image": "train2014/COCO_train2014_000000020245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33830, "question_id": "mcSm37nu9PgLgDAtvWk26t", "question": "Why is the animal on the ground?", "choices": ["sleeping", "shearing", "killing", "birthing"], "correct_choice_idx": 1, "direct_answers": ["sheering wool", "shearing", "being sheered", "being sheered", "shearing", "sheep", "sheering", "shearing", "shearing", "sheep"], "difficult_direct_answer": false, "rationales": ["The sheep is laying on the ground because it is being sheared for its wool.", "There is a sheep on the ground having its wool cut, and this is called shearing.", "The animal is getting sheared."], "image": "val2014/COCO_val2014_000000033830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272255, "question_id": "mcUWuDhsd563JFGc4MhuoW", "question": "What do the people with the gray and black umbrella with the logo want to do?", "choices": ["cross street", "dance", "reverse course", "direct traffic"], "correct_choice_idx": 0, "direct_answers": ["stay dry", "go home", "cross street", "cross road", "cross street", "cross street", "walk", "cross street", "cross road", "cross street"], "difficult_direct_answer": false, "rationales": ["They are on those white lines that lead you to the other side of the sidewalk.", "They are waiting to cross to the other side", "They are walking in a cross walk."], "image": "train2014/COCO_train2014_000000272255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357044, "question_id": "mchU3SKxm38MqvnZYgGqik", "question": "The kicking animal is likely a hybrid of which two animals?", "choices": ["dog cat", "mule donkey", "zebra horse", "seahorse manatee"], "correct_choice_idx": 2, "direct_answers": ["zebra horse", "zebra horse", "zebra horse", "zebra donkey", "zebra/horse", "zebra horse", "horse zebra", "horse zebra", "zebra horse", "donkey"], "difficult_direct_answer": false, "rationales": ["The coloration and the other animal not kicking shows the animal's heritage.", "The animal is kicking and has a partially striped body.", "The animal looks like it's equine."], "image": "val2014/COCO_val2014_000000357044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108617, "question_id": "mcjw6LMMdFQjAgzUdWUFKV", "question": "What type of creatures are eating the oranges?", "choices": ["spiders", "butterflies", "dragonflies", "snails"], "correct_choice_idx": 1, "direct_answers": ["butterflies", "moths", "butterflies", "butterflies", "moth", "butterflies", "butterfly", "moths", "butterflies", "butterflies"], "difficult_direct_answer": false, "rationales": ["The creatures are insects with large and conspicuous wings.", "The creatures are clearly visible and are of the size and shape of answer a.", "The creatures are moths which are a form of butterfly."], "image": "train2014/COCO_train2014_000000108617.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180488, "question_id": "mdBznV2DeVdok3rFHkXXJA", "question": "After traversing over the jump what natural force will cause the boarder to return to the ground?", "choices": ["gravity", "inertia", "fission", "kinetic energy"], "correct_choice_idx": 0, "direct_answers": ["gravity", "gravity", "gravity", "gravity", "gravity", "gravity", "gravity", "gravity", "gravity", "gravity"], "difficult_direct_answer": false, "rationales": ["Gravity is something we learn about in grade school.", "Gravity will bring them to the ground.", "The gravity causes the return."], "image": "train2014/COCO_train2014_000000180488.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339690, "question_id": "mdK4fumpuKEQ2m4XN3zh5F", "question": "Why is the taller person smiling?", "choices": ["is amused", "likes game", "is cheating", "is winning"], "correct_choice_idx": 0, "direct_answers": ["enjoy playing", "funny child", "happy", "child", "is amused", "baby's cute", "child", "playing", "child", "having fun"], "difficult_direct_answer": false, "rationales": ["The smaller person is attempting to eat the wii remote, and it is a funny sight.", "The taller person is smiling at the child because she is using the video game controller the wrong way and thinks it's amusing.", "She finds the child funny."], "image": "train2014/COCO_train2014_000000339690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152309, "question_id": "mdmiFui89XypbcGaMD5HWE", "question": "Who is likely her sponsor?", "choices": ["new era", "adidas", "nike", "gen x"], "correct_choice_idx": 2, "direct_answers": ["wilson", "wilson sports", "nike", "nike", "nike", "wilson", "wimbledon", "nike", "nike", "nike"], "difficult_direct_answer": false, "rationales": ["The sponsor is nike.", "She has the logo on her shoes in a professional match", "She has their swoosh on her shoes."], "image": "train2014/COCO_train2014_000000152309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192301, "question_id": "meBABtrcw3zGHznvZkB9Dk", "question": "What type of stand is this?", "choices": ["political", "craft", "umbrella", "produce"], "correct_choice_idx": 3, "direct_answers": ["fruit", "fruit", "fruit stand", "fruit", "fruit", "for sale", "fruit", "fruit stand", "produce", "fruit"], "difficult_direct_answer": false, "rationales": ["The stand has fruit.", "This stand sells fruit.", "There is fruit on the stand."], "image": "train2014/COCO_train2014_000000192301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285810, "question_id": "meFF5VMDfJwBQTa3oFTYnv", "question": "During what time of day are the pedestrians walking on this sidewalk?", "choices": ["evening", "morning", "night", "noon"], "correct_choice_idx": 1, "direct_answers": ["morning", "afternoon", "day", "morning", "noon", "daylight", "afternoon", "afternoon", "afternoon", "morning"], "difficult_direct_answer": false, "rationales": ["Its still dull and cold.", "It's in the morning.", "Given the lighting it looks like it is morning."], "image": "train2014/COCO_train2014_000000285810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141278, "question_id": "meTMHR9262UnWEbD9qJjQK", "question": "What is the device on the right side of the desk used for?", "choices": ["burning discs", "painting", "printing", "calculating"], "correct_choice_idx": 2, "direct_answers": ["printing", "printing", "printing", "printing", "printing", "printing", "printing", "printing", "printing", "printing"], "difficult_direct_answer": false, "rationales": ["It produces physical copies of computer documents.", "The device is a printer.", "There is paper inside of it."], "image": "val2014/COCO_val2014_000000141278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516793, "question_id": "menc2MGF6D8voVzTdp94VY", "question": "What type of home do the brochures on the table suggest it is?", "choices": ["museum", "for sale", "air bnb", "hotel"], "correct_choice_idx": 2, "direct_answers": ["vacation rental", "cabin", "air bnb", "private home", "cabin", "lodging", "cabin", "rental property", "museum", "model"], "difficult_direct_answer": false, "rationales": ["The free air and large are shows the colors.", "The area is an airbnb for rent.", "It looks like someone's living room."], "image": "train2014/COCO_train2014_000000516793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491337, "question_id": "mfrnSjZTN7c3smUSTzTa6K", "question": "How do the skiers get to the top of the hill?", "choices": ["walk up", "ski up", "gondola", "chairlift"], "correct_choice_idx": 3, "direct_answers": ["skiing", "ski lift", "ride", "lift", "skylift", "lift", "ski lift", "chairlift", "lift", "chairlift"], "difficult_direct_answer": false, "rationales": ["It is a long way up to the top of the mountain. a ski lift can take the skiiers to the top of the mountain.", "There is one off to the side", "The skiers used the chairlift."], "image": "train2014/COCO_train2014_000000491337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240862, "question_id": "mg3TkBk2fdL399fRydDmgj", "question": "Which container holds the food the child here will eat?", "choices": ["kabob stand", "bowl", "plate", "serving platter"], "correct_choice_idx": 1, "direct_answers": ["blue bowl", "dinner plate", "bowl", "blue bowl", "bowl", "bowl", "bowl", "bowl", "plate", "blue"], "difficult_direct_answer": false, "rationales": ["The bowl has soft food in it.", "The blue bowl with the baby spoon in it is for the baby.", "There is a utensil in answer a that looks to be intended for use by a child and the food itself has been prepared in a manner that would be more suitable for a child vs. the food on the plate."], "image": "train2014/COCO_train2014_000000240862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362521, "question_id": "mgLUYawGPVLHq4wmaHmJCc", "question": "What is the green light on the rectangular traffic light permitting?", "choices": ["pedestrian crossing", "bike crossing", "u turns", "right turns"], "correct_choice_idx": 3, "direct_answers": ["arrow", "right turn", "right", "traffic pass", "turn right", "right turn", "right turns", "turning", "right turn", "turning"], "difficult_direct_answer": false, "rationales": ["The lights on the traffic sign are permitting turns.", "Green means go and it is pointing right meaning right turns are currently allowed.", "The green arrow is pointing the direction that is available."], "image": "val2014/COCO_val2014_000000362521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176415, "question_id": "mgZfsNv3XXVgxLeh75AXh5", "question": "Where is this cat located?", "choices": ["wild", "store", "vet", "home"], "correct_choice_idx": 3, "direct_answers": ["home", "floor", "floor", "living room", "rug", "floor", "floor", "living room", "watching tv", "floor"], "difficult_direct_answer": false, "rationales": ["You can tell by the setting and television as to where the cat is located.", "There is a television and books in a shelf. these are things that belong in a living room.", "Residential homes have personal items in them. the cat sits in a living room of a home."], "image": "train2014/COCO_train2014_000000176415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420052, "question_id": "mgfxfDqJkFvyLwNKiVxAwe", "question": "What are the objects near the front of the black gate considered?", "choices": ["garbage", "priceless", "heirlooms", "for sale"], "correct_choice_idx": 0, "direct_answers": ["garbage", "trash", "garbage", "garbage", "trash", "garbage", "trash", "trash", "trash", "trash"], "difficult_direct_answer": false, "rationales": ["The collection of items looks used and left there for a a garbage truck to pick up.", "It's a bunch of garbage that's been piled up together and some in bags.", "The items are in large plastic bags and beat up, with little to no care given to them."], "image": "val2014/COCO_val2014_000000420052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91858, "question_id": "mggFFRaQExcLUAiUonzBDi", "question": "This bird is native to which country?", "choices": ["brazil", "australia", "africa", "india"], "correct_choice_idx": 3, "direct_answers": ["america", "peacock", "america", "india", "india", "india", "india", "england", "usa", "india"], "difficult_direct_answer": false, "rationales": ["It is a peacock.", "The peacock is from india.", "Those birds come from that country."], "image": "train2014/COCO_train2014_000000091858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326210, "question_id": "mgkKCAZc8aotRRHufjva65", "question": "Which sport may someone be a fan of given the type of sports object on the desk?", "choices": ["soccer", "basketball", "football", "baseball"], "correct_choice_idx": 3, "direct_answers": ["baseball", "baseball", "baseball", "baseball", "baseball", "football", "baseball", "baseball", "baseball", "football"], "difficult_direct_answer": false, "rationales": ["Baseball items are shown.", "A baseball is there", "There is a baseball on the desk of this office indicating that the owner most likely enjoys baseball as a sport."], "image": "train2014/COCO_train2014_000000326210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311205, "question_id": "mgx7rmNoH3AXpFAs7F3VCF", "question": "What is the make of the parked car?", "choices": ["toyota", "mitsubishi", "honda", "volkswagen"], "correct_choice_idx": 3, "direct_answers": ["volkswagen", "volkswagen", "volkswagen", "volkswagen", "bmw", "volkswagen", "volkswagen", "vw", "truck", "honda"], "difficult_direct_answer": false, "rationales": ["A volkswagen logo can be seen on the side of the road.", "The car says vw.", "It's a volkswagen"], "image": "val2014/COCO_val2014_000000311205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89882, "question_id": "mhfDjUSxdVWePCjG493BgA", "question": "What is the man hoping to do by skating between the two green cones?", "choices": ["get exercise", "stop fall", "perfect stunt", "avoid traffic"], "correct_choice_idx": 2, "direct_answers": ["keep riding", "jump above", "train", "skateboard", "tricks", "have control", "win competition", "practice", "score points", "perfect stunt"], "difficult_direct_answer": true, "rationales": ["The green cones are there to create the obstacle for the stunt.", "The cones are there to male the moves more difficult and if he can successfully skate through them, he has achieved the stunt", "He is practicing skateboard tricks"], "image": "train2014/COCO_train2014_000000089882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365101, "question_id": "mhk6Qvpr2jEk3GiSPTUGyo", "question": "What is the beverage in the glass?", "choices": ["lite beer", "soda pop", "green tea", "ale"], "correct_choice_idx": 3, "direct_answers": ["beer", "beer", "beer", "ale", "beer", "beer", "wine", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["Looks to be some kind of beer", "The tall glass has a liquid with a head.", "It has an amber color and is frothy at the top"], "image": "train2014/COCO_train2014_000000365101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76882, "question_id": "mho9TWq9Zs4dUcPCA7YBVw", "question": "Where does this tool have to go to get used?", "choices": ["in spaceship", "on boat", "in mouth", "in factory"], "correct_choice_idx": 2, "direct_answers": ["bathroom", "bathroom", "bathroom", "in mouth", "mouth", "mouth", "bathroom", "mouth", "bathroom", "mouth"], "difficult_direct_answer": false, "rationales": ["That is wear a toothbrush is used.", "The tool goes in the mouth.", "The tool needs to go in the mouth."], "image": "train2014/COCO_train2014_000000076882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356972, "question_id": "mhxYV4XEYmyKJ7vmUZ8GFA", "question": "Why is there a yellow line on the ground?", "choices": ["as prank", "optical illusion", "decoration", "safety"], "correct_choice_idx": 3, "direct_answers": ["ground", "caution", "safety", "stop point", "stay behind", "waiting", "safety", "safety", "protection", "caution"], "difficult_direct_answer": false, "rationales": ["Yellow is for caution.", "It serves a warning that alerts people to not move any closer to the edge.", "A brightly colored barrier is often used at train stations to show passengers what distance to give oncoming trains while they wait to board."], "image": "train2014/COCO_train2014_000000356972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437797, "question_id": "mi87jK9DeNwkGb6Lf6nHRD", "question": "What type of phone is being used?", "choices": ["rotary", "landline", "pay", "cellular"], "correct_choice_idx": 3, "direct_answers": ["cell", "cell phone", "cell phone", "cell", "cell phone", "cell phone", "mobile", "cellular", "mobile phone", "cellphone"], "difficult_direct_answer": false, "rationales": ["A woman is looking back as she talks on a small phone. she is around other people who are walking around.", "The phone is small enough to be held in the woman's hand and be held up to her face, so it has to be a cell phone. a landline, rotary and pay phone cannot be held in a person's hand.", "It is a small portable device which can be used while away from home."], "image": "train2014/COCO_train2014_000000437797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405004, "question_id": "miCWvwTmPWwaufasdaJW34", "question": "What animal is the same color as the couch nearest to the lamp?", "choices": ["cheetah", "polar bear", "blue jay", "leopard"], "correct_choice_idx": 1, "direct_answers": ["polar bear", "sheep", "white bird", "polar bear", "polar bear", "polar bear", "polar bear", "bird", "polar bear", "polar bear"], "difficult_direct_answer": false, "rationales": ["Because polar bear also has a brownish or white fur.", "Polar bears are white and live in the antarctic regions.", "The couch in question is clearly visible and the color is apparent. answer a is an animal that has the same color."], "image": "val2014/COCO_val2014_000000405004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397186, "question_id": "miVjn5Msbd4Qhy2gHhSbvU", "question": "What two individuals are being judged?", "choices": ["dog dog", "woman", "man man", "dog man"], "correct_choice_idx": 3, "direct_answers": ["man dog", "man dog", "dog man", "man dog", "man dog", "dog man", "man dog", "man dog", "running", "man/dog"], "difficult_direct_answer": false, "rationales": ["The dog and man are a team being watched.", "The human throws the frisbee to the animal in this competition", "The dog is in a competition with his dog."], "image": "train2014/COCO_train2014_000000397186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519880, "question_id": "mioWPZFNtKCBTj5EoFa4Wi", "question": "Why is the skateboard hanging there?", "choices": ["is trick", "unbalanced", "is falling", "showing off"], "correct_choice_idx": 0, "direct_answers": ["trick", "stunt", "stunt", "skateboard trick", "mid trick", "trick", "balancing act", "wheelie attempt", "is trick", "momentum"], "difficult_direct_answer": false, "rationales": ["The skateboard is doing a trick.", "The man is trying to do some tricks on the rail.", "The skateboarder wants to show off a trick for the audience."], "image": "val2014/COCO_val2014_000000519880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349418, "question_id": "mirSZPkv2dX5WmsnDvbhT6", "question": "Where are the athlete's feet?", "choices": ["doubles alley", "clearance", "service box", "center court"], "correct_choice_idx": 1, "direct_answers": ["behind line", "halfof ground", "clearance", "together", "on court", "offcourt", "ground", "ground", "behind line", "on court"], "difficult_direct_answer": false, "rationales": ["Answer d is the most accurate as the person is serving the tennis ball based on the body position and the location of the ball directly over their head.", "The athlete needs to clear.", "They are outside the official court as she makes the serve"], "image": "val2014/COCO_val2014_000000349418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76765, "question_id": "mirtUNwGeXvvzyabfkZV7B", "question": "Why is the woman wearing sandals?", "choices": ["its warm", "its cold", "its cloudy", "its wet"], "correct_choice_idx": 0, "direct_answers": ["warm day", "warm weather", "for walking", "protect feet", "nice day", "warm weather", "travelling", "good weather", "its warm", "hot outside"], "difficult_direct_answer": true, "rationales": ["It is summertime and regular shoes will be too hot", "Sandals are worn when it's warm out, because otherwise, ones toes would be too cold. this woman, suitcase in tow, is traveling on a warm day.", "A woman is walking on a sidewalk on a sunny day."], "image": "val2014/COCO_val2014_000000076765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395402, "question_id": "mirzUpeMXBfBVjvrivzrcY", "question": "Why wear a collar?", "choices": ["noise maker", "decoration", "identification", "fashion"], "correct_choice_idx": 2, "direct_answers": ["identification", "connect leash", "identity", "black", "for leash", "identification", "for leash", "identify owner", "identification", "identification"], "difficult_direct_answer": false, "rationales": ["The collar identifies the dog.", "The collar is for id.", "A collar is on him to put a leash on him and for his tag that has his owners information on it."], "image": "val2014/COCO_val2014_000000395402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235687, "question_id": "mjAwAECgZ9Jzao7dkWXgwr", "question": "What type flag elements appear in a pictured umbrella?", "choices": ["nazi", "american", "gay rainbow", "chilean"], "correct_choice_idx": 2, "direct_answers": ["gay rainbow", "rainbow", "rainbow flag", "rainbow", "umbrellas", "rainbow", "stripes", "colors", "leaf", "rainbow"], "difficult_direct_answer": false, "rationales": ["The umbrella has the entire color spectrum. this color scheme is associated with pride.", "The flag is a rainbow.", "There is an umbrella with a rainbow pattern."], "image": "train2014/COCO_train2014_000000235687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93677, "question_id": "mjK5RYvi2P7fs9zMBbv7sc", "question": "Why are all 4 boys similarly touching their neckties?", "choices": ["tightening ties", "coincidence", "camera pose", "giving lesson"], "correct_choice_idx": 2, "direct_answers": ["taking photos", "group photo", "posing picture", "straightening them", "posing", "camera pose", "posing", "pozzing", "they're cool", "pose picture"], "difficult_direct_answer": true, "rationales": ["The four boys are touching their neckties for a camera pose.", "The boys are posing.", "Given the setting and how they are dressed, it's easy to tell why they are holding their ties."], "image": "train2014/COCO_train2014_000000093677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378662, "question_id": "mjLcCDGLMcA2ECnmDNBruu", "question": "What is the statue shaped like?", "choices": ["bear", "lion", "tiger", "dog"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["This statue has the four paws, snout, hanging ears and even collar of a dog well replicated.", "The statue is too small to be a lion, tiger, or bear. it looks like an animal a human might keep as a pet.", "The statue looks like a dog."], "image": "train2014/COCO_train2014_000000378662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7921, "question_id": "mjWfibGfS4SZeo4eXNvyrW", "question": "Why is the child laying there?", "choices": ["badly injured", "asleep", "dead", "coma"], "correct_choice_idx": 1, "direct_answers": ["sleeping", "sleeping", "sleeping", "sleeping", "sleep", "sleeping", "sleeping", "asleep", "sleeping", "sleep"], "difficult_direct_answer": false, "rationales": ["The child has their eyes closed yet they are not in the hospital and nobody is upset. therefore, they are sleeping and not injured, in a coma, or dead.", "The child is in a limp, relaxed position with his eyes closed as one would for answer a.", "Their son is tired and will probably be put to bed soon."], "image": "train2014/COCO_train2014_000000007921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 383933, "question_id": "mjdUtBmWuXcCpbJQhHXtBF", "question": "What trick is this skateboarder performing?", "choices": ["frontside 5050", "nosegrind", "crooked grind", "5-0 grind"], "correct_choice_idx": 3, "direct_answers": ["no idea", "railing", "grind rail", "bs crooked", "rail grind", "grinds", "5-0 grind", "grind", "grind", "rail slider"], "difficult_direct_answer": true, "rationales": ["A person on a skateboard is grinding on a rail.", "A skateboarder performs tricks that include having one set of wheels on the ground and the other in the air or over the side.", "That is what the skateboarder is performing."], "image": "train2014/COCO_train2014_000000383933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255614, "question_id": "mji85EuWvYUAjCRGhfv7fr", "question": "What is he doing?", "choices": ["singing", "drinking", "listening", "talking"], "correct_choice_idx": 2, "direct_answers": ["on phone", "listening", "using cellphone", "talking phone", "talking", "listening", "using phone", "talking", "talking", "talking phone"], "difficult_direct_answer": false, "rationales": ["He's listening.", "The man has his phone to his ear.", "The man's mouth isn't open and he's on the phone."], "image": "train2014/COCO_train2014_000000255614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294073, "question_id": "mjpAzemUskphhQaEr6gvAM", "question": "What do the horses here hope the people have?", "choices": ["apples", "music", "i pads", "radios"], "correct_choice_idx": 0, "direct_answers": ["food", "food", "food", "food", "food", "food", "apples", "treats", "treats", "food"], "difficult_direct_answer": false, "rationales": ["The horses look hungry and are hoping the people have apples to eat.", "Horses like to eat these", "The horse is looking at her with its nose extremely close to her shoulder, as if it's hungry."], "image": "val2014/COCO_val2014_000000294073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508985, "question_id": "mk5igA2cidg8k2pwnNfCKL", "question": "What are is the image from?", "choices": ["forest", "underground", "city", "sky"], "correct_choice_idx": 2, "direct_answers": ["traffic", "motorcycles automobiles", "historical street", "asia", "street", "cuba", "street", "town", "city", "town"], "difficult_direct_answer": false, "rationales": ["The structure of the road and the buildings in the background and the volume of cars visible is all consistent with answer a.", "These people are riding riding in a densely populated area.", "The view of vehicles on a wide road is very city like."], "image": "val2014/COCO_val2014_000000508985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139590, "question_id": "mkTvaLjdLXMzX9yrZuPFwh", "question": "Which team's logo is seen behind home plate?", "choices": ["chicago", "milwaukee", "washington", "baltimore"], "correct_choice_idx": 2, "direct_answers": ["washington", "wisconsin", "unknown", "away team", "cubs", "pirates", "nationals", "washington", "washington", "dodgers"], "difficult_direct_answer": false, "rationales": ["The logo is for washington.", "The washington nationals \"w\" is depicted.", "The decorative w is the nationals' symbol."], "image": "train2014/COCO_train2014_000000139590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70411, "question_id": "mkWX2wyMmKRBtz7uxHaNWK", "question": "What feeling does this cat most likely seem to be portraying?", "choices": ["fear", "frustrated", "relaxed", "angry"], "correct_choice_idx": 2, "direct_answers": ["attention seeking", "relaxed", "bored", "interest", "relaxation", "anger", "disapproval", "alertness", "curiousity", "interest"], "difficult_direct_answer": true, "rationales": ["The cat is laying down and chilling out.", "The cat is completely relaxing on the top of the suitcase.", "The cat is lying down."], "image": "train2014/COCO_train2014_000000070411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314613, "question_id": "mke4qpvBctSAYSbsLbtK6F", "question": "What book is more favored by the owner of this boat?", "choices": ["carrie", "hobbit", "vampire lestat", "cujo"], "correct_choice_idx": 1, "direct_answers": ["jar tolkien", "lord rings", "hobbit", "hobbit", "hobbit", "lord rings", "hobbits", "hobbit", "lot", "tolkien"], "difficult_direct_answer": false, "rationales": ["The owner of this boat would favor the hobbit since he named his boat after the author of that book.", "The boat says j.r tolkien.", "The author written on the boat is famous for writing about this type of species in his novels."], "image": "val2014/COCO_val2014_000000314613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429924, "question_id": "mkmwU25LsVwsfQMGJTACDV", "question": "Where is this bicycle storage depot most probably located based on the language on the sign?", "choices": ["south america", "central asia", "north america", "western europe"], "correct_choice_idx": 3, "direct_answers": ["netherlands", "amsterdam", "germany", "western europe", "germany", "amsterdam", "germany", "germany", "germany", "amsterdam"], "difficult_direct_answer": false, "rationales": ["The bike is from europe.", "The sign is in dutch. the depot likely is in the netherlands.", "The language appears to be germanic"], "image": "val2014/COCO_val2014_000000429924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425542, "question_id": "mm5NDPCfVKYBNhjbybfZtT", "question": "Who played this sport?", "choices": ["tom brady", "jim those", "maria sharapova", "bo jackson"], "correct_choice_idx": 2, "direct_answers": ["athlete", "maria sharapova", "tennis players", "many people", "serena williams", "tennis player", "sherina williams", "woman", "maria sharapova", "tennis players"], "difficult_direct_answer": false, "rationales": ["Maria sharapova is a very famous tennis player.", "Maria sharapova plays tennis.", "The person is a famous tennis player."], "image": "train2014/COCO_train2014_000000425542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551736, "question_id": "mmbddtfqvdA6bYoRjGTUB2", "question": "What breed the dog is?", "choices": ["bull dog", "poodle", "retriever", "chihuahua"], "correct_choice_idx": 1, "direct_answers": ["terrier", "schnauzer", "terrier", "terrier", "terrier", "looking", "unknown", "boston terrier", "chihuahua", "poodle"], "difficult_direct_answer": false, "rationales": ["The dog actually has some schnauzer in it.", "A dog with a wire hair coat is walking on the beach.", "Looks more like a mut."], "image": "train2014/COCO_train2014_000000551736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48850, "question_id": "mmdvLE9VQ6zxST8XTab7x3", "question": "What type of electronic is on the table?", "choices": ["phone", "hair dryer", "fan", "vacuum"], "correct_choice_idx": 1, "direct_answers": ["hair dryer", "nail dryer", "hair dryer", "mixer", "fan", "hair dryer", "camera", "fan", "light", "hair dryer"], "difficult_direct_answer": false, "rationales": ["The item sitting on the table is a hair dryer since it has an oblong shape.", "This device blows air to get rid of wetness.", "It is typically used in hair salons and at home in one's bathroom."], "image": "val2014/COCO_val2014_000000048850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 476950, "question_id": "mmhxxXQQT4cQgp7U2pxYpn", "question": "Where is the woman in Pink Going?", "choices": ["sales call", "home", "vacation", "mall"], "correct_choice_idx": 2, "direct_answers": ["on vacation", "vacation", "airport", "airport", "vacation", "travelling", "vacation", "travelling", "on trip", "vacation"], "difficult_direct_answer": false, "rationales": ["The packed suitcase with backpack this woman totes implies she'll be away from home for longer than a day.", "The woman has luggage. she is leaving her home.", "The woman is posing next to luggage. one usually packs and brings luggage with them when they are going on vacation."], "image": "train2014/COCO_train2014_000000476950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421978, "question_id": "mmuaNBBJRK7egB3xgD4RLD", "question": "What sort of product do ad men use these type horses to market?", "choices": ["horse food", "beer", "pizza", "muffins"], "correct_choice_idx": 1, "direct_answers": ["marketing", "beer", "beer", "heineken", "posters", "horse", "beer", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["That is a well known german beer", "The horses are clydesdales. they are used to market budweiser.", "The cart is promoting beer."], "image": "val2014/COCO_val2014_000000421978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247206, "question_id": "mmwEJkYkMSXtNM3j5WYJHf", "question": "What is the emotion of the cat?", "choices": ["fearful", "angry", "excited", "scared"], "correct_choice_idx": 2, "direct_answers": ["playful", "playful", "playful", "excited", "curious", "happy", "playful", "playful", "happy", "playful"], "difficult_direct_answer": false, "rationales": ["The cat's excited.", "The cat looks very happy with the toy.", "I would also say happy. the cat looks like it's enjoying itself. research has shown that cats do get excited by toys like this one."], "image": "val2014/COCO_val2014_000000247206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166230, "question_id": "mmx632j3eMZQfvL9v5effW", "question": "Which side is in the lead in this match thus far?", "choices": ["neither", "guest", "tied", "home"], "correct_choice_idx": 3, "direct_answers": ["left side", "left side", "home side", "home", "home", "home", "home", "home", "home", "home"], "difficult_direct_answer": false, "rationales": ["The scoreboard says home has more points than the guest.", "Per the side in the background, this is the answer.", "The player on the home side is in the lead."], "image": "train2014/COCO_train2014_000000166230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394517, "question_id": "mnLhzibQxUtaWfNkWRBNxZ", "question": "At least how many different people likely share this space?", "choices": ["none", "one", "seven", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two toothbrushes in the cup.", "I see a pair of toothbrushes.", "There are two toothbrushes."], "image": "val2014/COCO_val2014_000000394517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283858, "question_id": "mnWAHq9skdXaHkvvasuVzS", "question": "Why are the boys wearing hard plastic helmets?", "choices": ["protection", "fashion", "costume", "punishment"], "correct_choice_idx": 0, "direct_answers": ["protection", "playing", "head injury", "for protection", "protection", "head protection", "protection", "safety", "protection", "safety"], "difficult_direct_answer": false, "rationales": ["They need to protect themselves in case they fall off the skateboard.", "The boys are biking and skateboarding.", "Answer a is known to be the common purpose for helmets."], "image": "val2014/COCO_val2014_000000283858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271681, "question_id": "mo7SvfAoQ4A3HA7MWbhKp6", "question": "What is matching the color of her jacket?", "choices": ["hair", "eyebrows", "earrings", "lipstick"], "correct_choice_idx": 2, "direct_answers": ["earrings", "earrings", "earrings", "earrings", "blue", "blue", "blue", "earrings", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["They are both blue", "The jacket is blue like the earrings.", "The earrings are a bright blue."], "image": "val2014/COCO_val2014_000000271681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444209, "question_id": "moJ7FqwrKHH9bUg3jkoSzx", "question": "What criminal organization of often associated with this method of transportation?", "choices": ["hell's angels", "yakuza", "bloods", "crips"], "correct_choice_idx": 0, "direct_answers": ["hell's angels", "hells angels", "hells angels", "gangs", "gangs", "hell's angel's", "hell's angels", "hells angels", "hells angels", "hell's angels"], "difficult_direct_answer": false, "rationales": ["A motorcycle is pictures. that is the transportation choice of the group mentioned in option a.", "Motorcycles are parked in the street.", "The vehicles are motorcycles. they are associated with motorcycle gangs."], "image": "val2014/COCO_val2014_000000444209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302606, "question_id": "moJFZjtnSra4K4A96r9nEM", "question": "What type of restaurant is serving this food?", "choices": ["greek", "asian", "mexican", "italian"], "correct_choice_idx": 1, "direct_answers": ["asian", "asian", "thai", "chinese", "thai", "pho", "chinese", "asian", "vietnamese", "asian"], "difficult_direct_answer": false, "rationales": ["The person is sitting at a table with plates of chinese food on it.", "The spoon is only in one cuisine.", "They are serving asian food."], "image": "val2014/COCO_val2014_000000302606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139291, "question_id": "moNJcjUvzm7WmhKGd3NXcN", "question": "This store likely sells what?", "choices": ["books", "beer", "caviar", "marbles"], "correct_choice_idx": 1, "direct_answers": ["alcohol", "alcohol", "liquor", "bear", "alcohol", "beer", "beer", "alcohol", "beer", "spirits"], "difficult_direct_answer": false, "rationales": ["The sign refers to cold drinks. alcohol bottles are present.", "You can see bottles of beer all around the store.", "The store has beer."], "image": "val2014/COCO_val2014_000000139291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332133, "question_id": "moZFn7tZGpTucoTV6d3eyJ", "question": "Why are their skateboards off the ground?", "choices": ["bounced there", "performing tricks", "fell off", "lost control"], "correct_choice_idx": 1, "direct_answers": ["tricks", "doing moves", "performing trick", "jump", "kickflip", "tricks", "jumping", "doing tricks", "performing tricks", "jumping bounce"], "difficult_direct_answer": true, "rationales": ["They're doing tricks.", "They are doing tricks in the road.", "Skateboards are frequently used to do answer a and when they are off the ground and orientated like this it is unlikely they are being used for standard riding."], "image": "train2014/COCO_train2014_000000332133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213506, "question_id": "modqQFhcidKK5MLDg8uV7c", "question": "The owner of the nearest Bicycle here has which role?", "choices": ["robber", "prisoner", "page", "parent"], "correct_choice_idx": 3, "direct_answers": ["parent", "talking", "mother", "parent", "mom", "renter", "babysitter", "child", "rider", "child"], "difficult_direct_answer": false, "rationales": ["The closest bicycle has a child seat attached so that makes it obvious that it's a bike ridden by a parent.", "There is a child seat on the bike", "The toddler sized seat attached to the back of this bicycle implies it's owner has a young child."], "image": "train2014/COCO_train2014_000000213506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134870, "question_id": "mpTgzvaMgcahGUvjpsoM2B", "question": "What does the M on the signs stand for?", "choices": ["masterchef", "mcdonalds", "mcmurdo", "motor cars"], "correct_choice_idx": 1, "direct_answers": ["mcdonalds", "mcdonalds", "mcdonalds", "mcdonald's", "mcdonalds", "mcdonalds", "mcdonald's", "mcdonald's", "mcdonalds", "mcdonalds"], "difficult_direct_answer": false, "rationales": ["The mcdonald's \"golden arches\" can be found in most cities and towns across the world now. opening just one store in 1940, the franchise has grown exponentially since then.", "The sign on the post has an m that stands for mcdonalds restaurants.", "A brand logo is on red signs with yellow font."], "image": "val2014/COCO_val2014_000000134870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543697, "question_id": "mpTnKaU2zb2NJQvExqtKvH", "question": "What direction is the person skating in relation to the road?", "choices": ["upward", "sideways", "uphill", "downhill"], "correct_choice_idx": 3, "direct_answers": ["down", "left", "left", "forward", "west", "downhill", "to leftside", "parallel", "left", "above"], "difficult_direct_answer": false, "rationales": ["Based on the equipment of this person and the relative speed they might be traveling at, answer a is most likely.", "The person is going downhill.", "The person is trying to move down a hill."], "image": "train2014/COCO_train2014_000000543697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517101, "question_id": "mpcsBiiAJPN7uVTJe9xvUG", "question": "What's the name of the red object on the stove?", "choices": ["teapot", "pan", "steamer", "stockpot"], "correct_choice_idx": 0, "direct_answers": ["pizza", "tea kettle", "kettle", "tea kettle", "teapot", "teapot", "pizza", "kettle", "kettle", "kettle"], "difficult_direct_answer": false, "rationales": ["A teapot is on the stove.", "The red object is for brewing tea.", "The red object, also known as a kettle, is a special pot used for boiling water."], "image": "train2014/COCO_train2014_000000517101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360606, "question_id": "mpdetkkQEPo637mhVHVstu", "question": "Which country does this airline headquarter in?", "choices": ["germany", "united kingdom", "united states", "france"], "correct_choice_idx": 2, "direct_answers": ["united states", "united states", "usa", "usa", "america", "usa", "america", "usa", "america", "united states"], "difficult_direct_answer": false, "rationales": ["Delta airlines headquarters are located in atlanta, georgia.", "Delta is an american company.", "The sign to the right of the image indicates this answer."], "image": "train2014/COCO_train2014_000000360606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505092, "question_id": "mpr2BgFzwe3hv2PXxbNHom", "question": "Why are the flying objects three different colors?", "choices": ["random colors", "illusion", "for show", "different species"], "correct_choice_idx": 2, "direct_answers": ["for show", "different teams", "kites", "entertainment", "identification", "decorative purposes", "decorative preferences", "kite parade", "different people", "different characters"], "difficult_direct_answer": true, "rationales": ["The various colors are decorative.", "The crazy objects are fictitious so the coloring is just random and clearly being used as decoration or \"show.\".", "The different colors and odd shape makes them very interesting for kites"], "image": "train2014/COCO_train2014_000000505092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126907, "question_id": "mq8wJfuwV2jZ3nr8Uh93qu", "question": "Who can open this?", "choices": ["firefighter", "teacher", "sanitation worker", "librarian"], "correct_choice_idx": 0, "direct_answers": ["fireman", "firefighter", "firefighter", "firefighter", "fireman", "fireman", "firefighter", "firefighter", "fireman", "firefighter"], "difficult_direct_answer": false, "rationales": ["Men who fight fires have special tools to open the hydrant.", "Generally only the fire department has access to this hydrant.", "This are used in emergencies, and usually are limited in who can interact with them."], "image": "train2014/COCO_train2014_000000126907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471936, "question_id": "mqH3YgKLWHJuYD6LspfDfL", "question": "What is unusual about this carrot?", "choices": ["dirty", "two roots", "broken", "human hand"], "correct_choice_idx": 1, "direct_answers": ["two ends", "two bottoms", "two roots", "two tips", "two roots", "two ends", "two", "two carrots", "double", "two points"], "difficult_direct_answer": false, "rationales": ["They usually only have one", "It has split as it was growing instead of having one", "Carrots usually have one root when you pull them out of the ground."], "image": "train2014/COCO_train2014_000000471936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492171, "question_id": "mqP2hPkGTHHEXUbP2UFcgv", "question": "What sport are the women playing?", "choices": ["soccer", "cricket", "field hockey", "ultimate frisbee"], "correct_choice_idx": 3, "direct_answers": ["frisbee", "ultimate frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["There are two people wearing different colors and one is holding a frisbee looking out into the field while the other stands in close proximity and is in a defensive stance. these relative positions when a frisbee is involved would be related to answer a.", "The woman are playing ultimate frisbee on the grass. the woman in white is next to throw the frisbee.", "The sport is frisbee."], "image": "train2014/COCO_train2014_000000492171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189880, "question_id": "mqYtzDnmUDG2NtzbpnE4WJ", "question": "What type of furniture is the girl sitting on?", "choices": ["futon", "recliner", "chaise", "sectional"], "correct_choice_idx": 0, "direct_answers": ["couch", "futon", "couch", "couch", "futon", "futon", "couch", "couch", "red cotton", "couch"], "difficult_direct_answer": false, "rationales": ["The girl is sitting on some sort of couch.", "The girl is sitting on a futon which is made of a wooden frame and a thin mat.", "It's a pull out couch."], "image": "train2014/COCO_train2014_000000189880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521070, "question_id": "mqaSYPZSKxzHLrxqgoHTjr", "question": "Why is there a sign on the bed?", "choices": ["for fun", "as joke", "as decoration", "to sell"], "correct_choice_idx": 3, "direct_answers": ["history", "price", "window", "to sell", "name plate", "meeting", "bed description", "event", "explaining product", "price"], "difficult_direct_answer": true, "rationales": ["On the very bottom of the sign is the name ikea. this bed can be found in the store and can be taking home after purchase.", "There is a sign 9on the bed to sell it at ikea.", "It is describing what the item is"], "image": "train2014/COCO_train2014_000000521070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37678, "question_id": "mqk6juzL8fv8kWsGn2ucyL", "question": "What operation should be done with the numbers to arrive at the years of steam power?", "choices": ["division", "multiplication", "subtraction", "addition"], "correct_choice_idx": 2, "direct_answers": ["add", "i do", "subtraction", "subtraction", "add", "subtract", "subtract", "subtraction", "subtraction", "subtraction"], "difficult_direct_answer": false, "rationales": ["To tell a difference of years, you take the smaller number from the bigger number.", "The sign implies that a person would need to subtract the top number from the bottom one.", "You would have to use a minus sign between the two numbers."], "image": "val2014/COCO_val2014_000000037678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137377, "question_id": "mr3MJYC7GUzrvhDRCXsAAX", "question": "What is she taking a photo of?", "choices": ["yarn", "potato", "doll", "ball"], "correct_choice_idx": 2, "direct_answers": ["doll", "stuffed animal", "tags", "books", "doll", "doll", "book", "tags", "mascot", "people"], "difficult_direct_answer": false, "rationales": ["This woman holds her toy out far in front of her with one hand and takes a picture of it with her camera held in the other hand.", "She's taking a picture of the doll.", "The woman is holding a doll. the woman is holding a camera up to the doll."], "image": "train2014/COCO_train2014_000000137377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494285, "question_id": "mr4NGb3xsPmPLSCxj56g3f", "question": "What is the item on the left side of the mirror?", "choices": ["tripod", "statue", "vase", "candle holder"], "correct_choice_idx": 3, "direct_answers": ["jewelry stand", "candlebra", "candle stick", "candle", "candle", "candlestick", "candlestick", "candle holder", "candle holder", "candle"], "difficult_direct_answer": false, "rationales": ["There is a candle holder on the left side of the mirror.", "The item is a candle holder.", "There is a candle holder on the left side of the mirror with a purple candle on it."], "image": "train2014/COCO_train2014_000000494285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292444, "question_id": "mr9dwofKutc9dMCDaHCFMf", "question": "What is the job of the men on the horses?", "choices": ["doctors", "judges", "waiters", "officers"], "correct_choice_idx": 3, "direct_answers": ["law enforcement", "police", "police officers", "police", "officers", "police", "policemen", "policing", "police", "police"], "difficult_direct_answer": false, "rationales": ["The men are police officers.", "The police seems to be taking patrol in the area.", "They have uniforms and helmets"], "image": "train2014/COCO_train2014_000000292444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128282, "question_id": "mrfR4umPrH8YUq4Mk6TkXt", "question": "What is the red wall behind the fence made of?", "choices": ["metal", "brick", "wood", "plastic"], "correct_choice_idx": 1, "direct_answers": ["metal", "brick", "brick", "brick", "brick", "metal", "brick", "metal", "brick", "brick"], "difficult_direct_answer": false, "rationales": ["The wall is made from a rectangular red building material that is held together visibly by white grout in between. these elements are all consistent with answer a which is one of the most common building materials and is commonly used in athletic facilities that might be near a tennis court as seen.", "The red wall is brick.", "Tennis players play in front of a wall held together by mortar that forms lines showing rectangular building materials of the wall."], "image": "train2014/COCO_train2014_000000128282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45433, "question_id": "mrfRUuZDHa6UdVyaNSSjgs", "question": "What does the giant ball do?", "choices": ["electrify", "nuclear fusion", "squeak", "bounce"], "correct_choice_idx": 1, "direct_answers": ["provide sunlight", "provide light", "emit heat", "glow", "nuclear fusion", "heat earth", "slow speed", "play", "sunlight", "illuminate"], "difficult_direct_answer": true, "rationales": ["The giant ball is the sun. the process of the sun is nuclear fusion.", "This is the type of energy the star has to generate light and warmth", "The sun is nuclear."], "image": "val2014/COCO_val2014_000000045433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311726, "question_id": "mruDtJJijpaiiZVqspnyRV", "question": "What are the horses pulling?", "choices": ["sled", "wagon", "stagecoach", "chariot"], "correct_choice_idx": 3, "direct_answers": ["chariot", "chariot", "chariot", "sleigh", "chariot", "chariot", "chariot", "wagon", "carriage", "cart"], "difficult_direct_answer": false, "rationales": ["The horses are pulling a person in a fancy cart.", "The horses are a display with a chariot.", "The horses have chariots."], "image": "train2014/COCO_train2014_000000311726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204994, "question_id": "mruz2YbW7UHpgPMrHwep3s", "question": "What does the giraffe in front of the man have in its mouth?", "choices": ["stick", "camera", "paintbrush", "grass"], "correct_choice_idx": 3, "direct_answers": ["leaves", "frisbee", "leaf", "leaf", "grass", "leaves", "leaf", "stick", "grass blade", "food treat"], "difficult_direct_answer": false, "rationales": ["The animal is eating a green object shaped and colored like a piece of grass.", "The item is green and a plant.", "He has grass."], "image": "val2014/COCO_val2014_000000204994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66691, "question_id": "ms9uVZTxdspUmECLS73WMe", "question": "Who is now sitting at the table in the foreground?", "choices": ["background person", "no one", "lady beth", "cook"], "correct_choice_idx": 1, "direct_answers": ["elderly couple", "no one", "nobody", "no one", "old man", "nobody", "people", "nobody", "no one", "photographer"], "difficult_direct_answer": false, "rationales": ["The seats at the table in the foreground are unoccupied.", "No one sits.", "There is no human species - a person with a head, arms, clothes, etc. in any of the chairs within close range. all the look-alike objects are chairs showing that there is no person within the specified area."], "image": "train2014/COCO_train2014_000000066691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455946, "question_id": "msC5tAWocUW7XKtcbX9GXa", "question": "What is the person wearing?", "choices": ["sneakers", "slippers", "cowboy boots", "shoes"], "correct_choice_idx": 1, "direct_answers": ["house slippers", "shorts", "sandals", "slides", "slippers", "black sandals", "black sandals", "sandals", "sandals", "flip flops"], "difficult_direct_answer": false, "rationales": ["A person is holding a diamond with no socks and lip on shoes with a black strap over her toes.", "The are wearing what looks like slipper. they aren't covering their whole feet.", "A person is holding up a donut. another has footwear where a person slips them on and you can see bare feet."], "image": "val2014/COCO_val2014_000000455946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170857, "question_id": "msVUL7s7p9xw8HLqVkkGYs", "question": "What color is the suitcase on the luggage rack underneath the number two?", "choices": ["blue", "orange", "black", "red"], "correct_choice_idx": 2, "direct_answers": ["black", "black", "black", "like", "black", "black", "black", "black", "black", "like"], "difficult_direct_answer": false, "rationales": ["The suitcase is black.", "Many suitcases are black.", "It is the darkest thing in the room"], "image": "train2014/COCO_train2014_000000170857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326798, "question_id": "mscYruaEnKFXmASXS9s3bZ", "question": "What is the reddish colored room made from?", "choices": ["wood", "grass", "rubies", "terra cotta"], "correct_choice_idx": 3, "direct_answers": ["tile", "plaster", "terra cotta", "clay tiles", "stone", "brick", "paint", "clay", "brick", "brick"], "difficult_direct_answer": false, "rationales": ["It's a type of material called terra cotta", "This is the color of clay", "Clay tile roofs are common in european countries."], "image": "val2014/COCO_val2014_000000326798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412616, "question_id": "msxpwTjCqRHyzYctH8V5rt", "question": "What is the most likely reason that the bike is covered where it is?", "choices": ["legal", "protection", "style", "camouflage"], "correct_choice_idx": 1, "direct_answers": ["dry", "stay dry", "bike parking", "protection", "protection", "cover", "keep safe", "dust problem", "protection", "pole"], "difficult_direct_answer": false, "rationales": ["The bike is covered for protection.", "This type of vehicle is not enclosed and needs additional protection from the elements if it is to be parked outside. additionally, there are clearly leaves all over the ground and it is parked under a tree.", "The reason is protection."], "image": "train2014/COCO_train2014_000000412616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171647, "question_id": "msyLufiy2CSnt2pREdEYzV", "question": "What kind of vehicle will stop in this depot in the future?", "choices": ["subway", "train", "bus", "plane"], "correct_choice_idx": 1, "direct_answers": ["bus", "train", "train", "train", "train", "train", "train", "train", "train", "bus"], "difficult_direct_answer": false, "rationales": ["A train will stop at this train station soon.", "It appears the benches are in a train station", "Train tracks are visible at a train station."], "image": "train2014/COCO_train2014_000000171647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102355, "question_id": "mt3X6r8V2MWUCbzJDGwt5y", "question": "What modern invention is seen here to help the wagon move smoother?", "choices": ["reins", "tires", "wood", "tassles"], "correct_choice_idx": 1, "direct_answers": ["tires", "tires", "tires", "wheels", "wheels", "tires", "tires", "tires", "wheels", "tires"], "difficult_direct_answer": false, "rationales": ["The wagon is being moved along using tires which are a more modern invention than the wheel. tires make the ride smoother.", "The modern wheel covering is much better than the wooden wheels.", "There is rubber on the wheels"], "image": "val2014/COCO_val2014_000000102355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26764, "question_id": "mtxL2bPVsGWT7Zy84VE4Z6", "question": "Where do tangerines originate from?", "choices": ["asia", "morocco", "balkans", "australia"], "correct_choice_idx": 0, "direct_answers": ["asia", "asia", "asia", "southeast asia", "southeast asia", "southeast asia", "oranges", "orange", "southeast asia", "southeast asia"], "difficult_direct_answer": false, "rationales": ["This question was posed to the internet and the answer i saw was c.", "A bowl of tangerines is on a table.", "Tangerines originate from tangier."], "image": "train2014/COCO_train2014_000000026764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310524, "question_id": "muT6gEA2sdzr4tNGDTZ7D2", "question": "What type of setting does this seem to be?", "choices": ["mall courtyard", "prison yard", "college campus", "beach resort"], "correct_choice_idx": 2, "direct_answers": ["park", "park", "park", "university campus", "college", "courtyard", "courtyard", "school", "courtyard", "college campus"], "difficult_direct_answer": false, "rationales": ["The area looks like a school campus.", "Looks like a college campus.", "There are lots of young people sitting around. it looks like a friendly area."], "image": "val2014/COCO_val2014_000000310524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392394, "question_id": "muZPHvS2zXyGNqy4ELdtsr", "question": "What other animal is there besides zebras?", "choices": ["deer", "bull", "moose", "antelope"], "correct_choice_idx": 3, "direct_answers": ["ibex", "ibex", "antelope", "antelope", "antelope", "antelope", "antelope", "antelope", "ibex", "antelope"], "difficult_direct_answer": false, "rationales": ["The antelope is there.", "Zebras and antelope are coexisting.", "That is the animal with the zebras."], "image": "train2014/COCO_train2014_000000392394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113142, "question_id": "mudG7Hi9EUsmAuLaqj6pje", "question": "Which woman emerged victorious in this tournament?", "choices": ["monica seles", "jelena jancovic", "naomi osaka", "serena williams"], "correct_choice_idx": 1, "direct_answers": ["serena williams", "champion", "unknown", "serena williams", "female", "masters open", "jelena jancovic", "player", "tennis", "venus"], "difficult_direct_answer": true, "rationales": ["Jalena jancovic is in the tournament.", "The tournament is the western and southern women's open per the sing in the background. answer a won that tournament.", "Jelena jancovic was the winner of the tennis match."], "image": "train2014/COCO_train2014_000000113142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351850, "question_id": "mufGtUXmEh5o8GmJSQuUBN", "question": "What kind of swinging technic is this?", "choices": ["backhand", "overhead", "forehand", "underhand"], "correct_choice_idx": 1, "direct_answers": ["playing", "overhand", "forehand", "tennis", "serve", "backhand", "lob", "forehand", "overhead", "overhand"], "difficult_direct_answer": false, "rationales": ["Her arm is raised in the air.", "The woman is using her forearm.", "You can tell by the position of her racket as to what type of swing she is taking."], "image": "train2014/COCO_train2014_000000351850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 567142, "question_id": "muh2He73yeAAXRBNvgv7UL", "question": "How is the blue vessel moved here?", "choices": ["manpower", "tugged", "steam", "motor"], "correct_choice_idx": 0, "direct_answers": ["paddling", "car", "oars", "paddle", "paddles", "paddle", "floating", "paddling", "paddling", "manpower"], "difficult_direct_answer": false, "rationales": ["The blue vessel moves with the rowing.", "They're using paddles.", "People have to use the paddles to power the boat."], "image": "val2014/COCO_val2014_000000567142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17909, "question_id": "mumwiJ5C5pwP2bFFEH5qUa", "question": "What type of milk is in the fridge?", "choices": ["oat", "whole", "soy", "almond"], "correct_choice_idx": 3, "direct_answers": ["almond", "almond", "almond", "almond", "almond", "vanilla", "vanilla", "almond", "vanilla", "almond"], "difficult_direct_answer": false, "rationales": ["The carton states that the contents contain almond breeze.", "The word is on the container", "The brand is almond breeze."], "image": "val2014/COCO_val2014_000000017909.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237848, "question_id": "mv4zVvhUu7y6zsPrQJ2vNb", "question": "What is the person in the foreground doing?", "choices": ["fishing", "running", "skateboarding", "eating"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "falling", "skateboarding", "skateboarding", "falling", "skateboarding", "skateboarding", "skateboarding", "falling", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The person is on a plank of wood attached to four wheels.", "He is standing on an object in order to do a trick in a bowl.", "The person is performing on a skateboard"], "image": "train2014/COCO_train2014_000000237848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408806, "question_id": "mvBwgrkXcR8ddPqsVCMScf", "question": "Where is the head of this person?", "choices": ["behind them", "above shoulders", "above knees", "no where"], "correct_choice_idx": 2, "direct_answers": ["his lap", "hood", "in hands", "bent down", "in lap", "lap", "bent", "above knees", "in lap", "in lap"], "difficult_direct_answer": false, "rationales": ["The person has their hood over their head. they are sitting on the bench and they are bent over with their head resting on their legs.", "The person has their head buried.", "The person is above the knees."], "image": "train2014/COCO_train2014_000000408806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168927, "question_id": "mvRroWvaTumxJYfHjbJnDY", "question": "What faith does the man in the glasses practice?", "choices": ["buddhism", "islam", "judaism", "christianity"], "correct_choice_idx": 2, "direct_answers": ["judaism", "judaism", "christian", "jewish", "judaism", "judaism", "judaism", "law", "judaism", "jewish"], "difficult_direct_answer": false, "rationales": ["He is wearing a yarmulke on his head. these items are typically worn by jewish men.", "The man is wearing headwear on his head that is known to be commonly associated with one religion.", "The man follows judaism."], "image": "train2014/COCO_train2014_000000168927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286930, "question_id": "mvt7EwGKkeAPd8UgbSuY2G", "question": "Who is the player wearing red boots?", "choices": ["fielder", "pitcher", "catcher", "goalie"], "correct_choice_idx": 2, "direct_answers": ["catcher", "catcher", "umpire", "catcher", "catcher", "catcher", "umpire", "catcher", "catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["They need production from the baseballs being hit in their direction, which could injure them.", "The player wearing red boots is talking to the pitcher and is wearing other protective gear also.", "The player is the catcher."], "image": "train2014/COCO_train2014_000000286930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485840, "question_id": "mvyBCfXAFjrsh8u23djy6X", "question": "Which characteristic describes the front elephant accurately?", "choices": ["inanimate", "fast", "hot", "super small"], "correct_choice_idx": 0, "direct_answers": ["white doves", "blue", "colorful", "trunk", "trunk", "blue", "inanimate", "trunk", "colorful", "trunk"], "difficult_direct_answer": false, "rationales": ["It is very colorful compared to a real one so it looks based off a fictional character", "The elephant is basically a statue of sorts.", "They are inanimate."], "image": "train2014/COCO_train2014_000000485840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473575, "question_id": "mw4i6eijS46vXpLcKZiKfo", "question": "What are the people looking at in the room?", "choices": ["painting", "window", "television", "artwork"], "correct_choice_idx": 2, "direct_answers": ["television", "tv screen", "tv screen", "television", "television", "television", "television", "television", "television", "television"], "difficult_direct_answer": false, "rationales": ["Everyone in the room is watching a show on tv.", "People are gathered around a television that is turned on.", "The people watch tv."], "image": "train2014/COCO_train2014_000000473575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391696, "question_id": "mwPxdpgZsRw9rcWa6c7SFg", "question": "What kind of animal is the cartoon face on the left?", "choices": ["octopus", "shark", "bear", "seal"], "correct_choice_idx": 0, "direct_answers": ["octopus", "whale", "fish", "fish", "octopus", "turtle", "octopus", "octopus", "octopus", "octopus"], "difficult_direct_answer": false, "rationales": ["The animal is an octopus.", "An octopus is depicted.", "An octopus is shown since it's purple with a rounded head."], "image": "train2014/COCO_train2014_000000391696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52966, "question_id": "mwQpHahzWvFShg2HjB2gAZ", "question": "The man in the middle has what feature?", "choices": ["goatee", "horns", "third eye", "triple chin"], "correct_choice_idx": 0, "direct_answers": ["beard", "beard", "brown eyes", "beard", "facial hair", "goatee", "beard", "beard", "beard", "goatee"], "difficult_direct_answer": false, "rationales": ["The facial hair is like a beard but does not cover the man's entire lower face.", "The man has a goatee.", "The man in the middle has groomed and short facial hair around his mouth but not his cheeks or neck. this style of hair is referred to as answer a."], "image": "val2014/COCO_val2014_000000052966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69189, "question_id": "mwZwTstKaV75VKf3tHevEx", "question": "What does the man on the bike do for a living?", "choices": ["gives rides", "sells bikes", "sells chickens", "steals bikes"], "correct_choice_idx": 2, "direct_answers": ["sell chickens", "butcher", "kill chickens", "raising chickens", "butcher", "sells chickens", "sell items", "sell chickens", "sell chicken", "farm"], "difficult_direct_answer": false, "rationales": ["The man has chickens on the bike.", "There are many birds on his bike, which presumably he's riding to market.", "It's actually unclear as to whether he sells or delivers them. that said, given their location on the bike and that they're dead, it's likely the former. it's common in asia for people to sell dead livestock this way."], "image": "val2014/COCO_val2014_000000069189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393971, "question_id": "mwfV6fkCnTVQmLWVfnk5hL", "question": "What operating system does this computer operate on?", "choices": ["windows", "mac os", "linux", "ms dos"], "correct_choice_idx": 1, "direct_answers": ["mac", "mac os", "macos", "apple", "macos", "macbook air", "unknown", "mac", "ios", "linux"], "difficult_direct_answer": false, "rationales": ["It's an apple computer.", "If you zoom closely under the screen you can see macbook in black.", "There is a logo for an apple laptop near the screen. apple laptops do not run windows, linux, or ms-dos."], "image": "val2014/COCO_val2014_000000393971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286719, "question_id": "mwi2uMQQ7tTrDYXtHQBLzB", "question": "What does this man do?", "choices": ["sing", "serve", "farm", "paint"], "correct_choice_idx": 2, "direct_answers": ["sells foods", "sells vegetables/fruit", "farm", "sell", "sell produce", "sell veggies", "buy produce", "farm", "sell vegetables", "purchasing"], "difficult_direct_answer": true, "rationales": ["A person stands near piles of produce.", "If they are made naturally, vegetables come from farms.", "There are fresh vegetables."], "image": "val2014/COCO_val2014_000000286719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32413, "question_id": "mwj4uqh5Vp6uXvnTxPg78i", "question": "What does the shown speed likely correlate to?", "choices": ["car speed", "ball speed", "computer speed", "running speed"], "correct_choice_idx": 1, "direct_answers": ["speed ball", "ball", "swing", "ball speed", "ball speed", "ball speed", "stopped", "ball speed", "ball speed", "ball's speed"], "difficult_direct_answer": false, "rationales": ["The speed is for the ball.", "The ball is going fast.", "The tracker is identifying how hard the ball is hit."], "image": "train2014/COCO_train2014_000000032413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444503, "question_id": "mwk2kTBmwmL3ratuqkfotS", "question": "If you had to cross to the other side how would you do it?", "choices": ["overhead bridge", "swing over", "cross tracks", "take taxi"], "correct_choice_idx": 0, "direct_answers": ["carefully", "walk", "cross tracks", "overhead walkway", "walk", "jump", "foot bridge", "use bridge", "overhead bridge", "luggage"], "difficult_direct_answer": true, "rationales": ["Cross over on the bridge.", "There is a crossing bridge ahead.", "Right in front there is a footbridge."], "image": "train2014/COCO_train2014_000000444503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537866, "question_id": "mwqR46Jm9SbAWZuy7ipwQb", "question": "What is usually put on this kind of bread and possibly already on this kind of bread?", "choices": ["jelly", "butter", "jam", "avocado"], "correct_choice_idx": 1, "direct_answers": ["butter", "butter", "butter", "garlic", "garlic", "garlic cheese", "garlic", "garlic butter", "garlic", "garlic"], "difficult_direct_answer": false, "rationales": ["The yellow color shows the presence of butter on the bread. and no matter if the meal is actually italian or not, many people enjoy a bit of garlic on that bread along with the butter.", "You can spread butter on the bread.", "It's toasted and is yellow colored"], "image": "train2014/COCO_train2014_000000537866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545033, "question_id": "mwsmUP52chZyUf3AoFMDbE", "question": "What is being poured here?", "choices": ["corn mush", "grits", "cake batter", "milk"], "correct_choice_idx": 2, "direct_answers": ["cake batter", "cake batter", "batter", "batter", "dough", "dough", "cake batter", "batter", "cake batter", "dough mix"], "difficult_direct_answer": false, "rationales": ["It is going into a bundt pan to be baked for dessert.", "Cake batter is poured into the bunt cake shape.", "The person is making is bundt cake."], "image": "train2014/COCO_train2014_000000545033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391685, "question_id": "mwwmrcsT3Urf4AunUP7Bga", "question": "What is behind the dog on a skateboard?", "choices": ["rug", "couch", "food", "skateboard"], "correct_choice_idx": 1, "direct_answers": ["couch", "couch", "sofa", "couch", "sofa", "sofa", "couch", "couch", "couch", "couch"], "difficult_direct_answer": false, "rationales": ["The item behind the dogs has the shape of a couch.", "It has cushions and several places to sit", "A dog is on a skateboard in a living room by a piece of furniture that is plush and could fit several people on it."], "image": "train2014/COCO_train2014_000000391685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275372, "question_id": "mx55C57DPYdAzH7vatLHL7", "question": "What visible item can be used to identify the origin of boats here?", "choices": ["flag", "mast heads", "figure heads", "license plate"], "correct_choice_idx": 0, "direct_answers": ["water", "flag", "boat mast", "writing", "flag", "flags", "unknown", "flag", "unknown", "masts"], "difficult_direct_answer": false, "rationales": ["You can see the masts in the air.", "Boats tend to fly these to identify port of call.", "Flags placed on boats typically correspond with the country of the boat and boat owner. visible flags would align with this aspect."], "image": "train2014/COCO_train2014_000000275372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307322, "question_id": "mxGX9ZKn3WZKWtagfAFWcC", "question": "What was installed on both the computers?", "choices": ["windows", "solaris", "linux", "osx"], "correct_choice_idx": 0, "direct_answers": ["music", "program", "software", "windows", "windows", "windows", "software", "windows", "windows", "windows"], "difficult_direct_answer": false, "rationales": ["There is a logo that looks like a window.", "Windows software is installed.", "The logo for the operating system is visible on both monitors."], "image": "train2014/COCO_train2014_000000307322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301910, "question_id": "mxKXEbp24Z2Z8LYR9R93CS", "question": "What expression does the woman that is directly behind the woman holding her phone up have on her face?", "choices": ["happiness", "disgust", "glee", "fear"], "correct_choice_idx": 3, "direct_answers": ["upset", "worried", "not happy", "excitement", "smile", "smile", "smiling", "fear", "blank", "bored"], "difficult_direct_answer": true, "rationales": ["The woman looks shocked.", "The woman appears to be disgusted by the act occurring in front of her. her eyes are averted and she doesn't wish to be in the image.", "I'd say it's more \"concern\" or \"worry,\" but a comes closest and the other options don't match."], "image": "val2014/COCO_val2014_000000301910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528465, "question_id": "mxor93o6uUbTq3L62BSY7h", "question": "What is the most probable reason his face is red?", "choices": ["he's embarrassed", "exercise", "he's drunk", "he's mad"], "correct_choice_idx": 1, "direct_answers": ["sun burnt", "exercise", "hot temperature", "sun exposure", "hot", "physical exertion", "playing hard", "tired", "hot", "playing"], "difficult_direct_answer": true, "rationales": ["The man is playing tennis on a hot day and is heating himself up.", "There is a lot of running involved when playing tennis and the man is holding a tennis racket and tennis balls.", "He's exercising."], "image": "train2014/COCO_train2014_000000528465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162083, "question_id": "myD4tVQHz95teNggBCqx2W", "question": "What animal is picture in this image?", "choices": ["cow", "rhino", "cat", "dog"], "correct_choice_idx": 1, "direct_answers": ["rhinoceros", "rhinoceros", "rhinoceros", "rhino", "rhino", "rhino", "rhino", "rhino", "rhino", "rhino"], "difficult_direct_answer": false, "rationales": ["There is a cartoon image of an animal on the flag. the animal has a single horn.", "The animal has two horns.", "It's a rhino."], "image": "train2014/COCO_train2014_000000162083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481581, "question_id": "myHmGn6AQPi9LjTMVaKjRs", "question": "What season is the athlete performing in?", "choices": ["fall", "summer", "fall", "winter"], "correct_choice_idx": 3, "direct_answers": ["winter", "winter", "winter", "winter", "snow race", "winter", "winter", "snow race", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["The season is winter.", "The athlete is skiing down a mountain covered in snow.", "There is snow."], "image": "train2014/COCO_train2014_000000481581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40446, "question_id": "myJAzcP9ULUsZbKiAJ6HFv", "question": "How might you be considered if you set a fire here?", "choices": ["sacrilegious", "forest-fire starter", "chef", "camper"], "correct_choice_idx": 0, "direct_answers": ["evil", "evil", "heathen", "light candles", "sacrilegious", "pagan", "evil", "deviant", "arsonist", "evil"], "difficult_direct_answer": false, "rationales": ["This is a church. there is a cross on the wall.", "The setting is a church based on the cross and pews in addition to the interior space design. doing something intentionally to cause damage to this setting may be considered anti-religious.", "It is violating a sacred thing."], "image": "val2014/COCO_val2014_000000040446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22646, "question_id": "myboNvfW6WJjj4xZByxJU5", "question": "What does this man wait for?", "choices": ["taxi", "plane", "boat", "train"], "correct_choice_idx": 3, "direct_answers": ["train", "train", "train", "transportation", "train", "train", "train", "transportation", "train", "train"], "difficult_direct_answer": false, "rationales": ["The person is standing next to railway tracks which trains use. he has luggage which people use while traveling like on a train.", "The man is waiting for a train to come.", "He is waiting by the tracks for the train."], "image": "val2014/COCO_val2014_000000022646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536711, "question_id": "mzMQQQBhqZqUkL44JBC4Pz", "question": "What holiday is the store decorated for?", "choices": ["easter", "new year", "halloween", "christmas"], "correct_choice_idx": 2, "direct_answers": ["halloween", "halloween", "thanksgiving", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween"], "difficult_direct_answer": false, "rationales": ["The holiday is halloween.", "The store has a pumpkin and a scarecrow in front of it which are popular halloween decorations.", "There is a scarecrow and pumpkin in the front of the store"], "image": "train2014/COCO_train2014_000000536711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576385, "question_id": "mzVxehTjfBn4hTce7fa25H", "question": "What type of restaurant does this appear to be?", "choices": ["asian", "italian", "english", "greek"], "correct_choice_idx": 0, "direct_answers": ["indoor", "chinese", "chinese", "bar", "chinese", "modern", "asian", "outdoor", "asian", "casual"], "difficult_direct_answer": false, "rationales": ["This seems to be the case based on the customers.", "There is asian decor in a restaurant with many asian people sitting at the tables.", "The restaurant has the type of decor usually seen in asian restaurants."], "image": "val2014/COCO_val2014_000000576385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141008, "question_id": "mzktSrkCjc4AsU5zT45skQ", "question": "The front of the vehicle is mostly the color of what?", "choices": ["cherry", "tangerine", "lime", "mustard"], "correct_choice_idx": 3, "direct_answers": ["sky", "yellow", "yellow", "banana", "bananas", "sun", "black", "yellow", "mustard", "yellow"], "difficult_direct_answer": false, "rationales": ["The most prevalent color on the front is yellow, the same color as mustard from the list.", "The front of the vehicle is yellow, not orange, red, or green.", "The front of the train is mainly yellow resembling mustard."], "image": "train2014/COCO_train2014_000000141008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501118, "question_id": "mznYbAaWRQkVgiKyGJY35m", "question": "When the entities shown on screen leave how many different prints are left with each set of steps taken by them?", "choices": ["two", "four", "six", "none"], "correct_choice_idx": 2, "direct_answers": ["six", "six", "two", "four", "six", "2 sets", "six", "two", "six", "six"], "difficult_direct_answer": false, "rationales": ["A dog has 4 feet and a human has 2", "There are 6.", "Humans have two feet and dogs have four."], "image": "train2014/COCO_train2014_000000501118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312826, "question_id": "mzvPEpYUYgSLDx6vhaGNwk", "question": "What type of food is in the box?", "choices": ["candy bars", "donuts", "potato chips", "cookies"], "correct_choice_idx": 1, "direct_answers": ["donuts", "doughnuts", "donut holes", "doughnuts", "tidbits", "doughnuts", "tidbits", "doughnuts", "donuts", "donut holes"], "difficult_direct_answer": false, "rationales": ["The food is a donut.", "The item looks like they're donut holes.", "The photo on the box shows donut holes and the brand is tim hortons known for their donuts."], "image": "train2014/COCO_train2014_000000312826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293756, "question_id": "n22JJgP4v8yHQZhnnhxqKo", "question": "What should the silver and black cars nearest here do?", "choices": ["turn around", "stop", "roll through", "race through"], "correct_choice_idx": 1, "direct_answers": ["stop", "stop", "stop", "turn right", "move forward", "stop", "stop", "stop", "stop", "stop"], "difficult_direct_answer": false, "rationales": ["The silver and black cars here are made to stop at the stop light.", "The cars are stopping.", "The cars are parked."], "image": "train2014/COCO_train2014_000000293756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131044, "question_id": "n23USymK5FQfD5osaj2WTh", "question": "What fun item can be seen in the photo?", "choices": ["ponies", "candy", "rides", "balloons"], "correct_choice_idx": 3, "direct_answers": ["balloons", "balloons", "balloons", "balloons", "balloons", "balloons", "balloons", "balloons", "balloons", "balloons"], "difficult_direct_answer": false, "rationales": ["Balloons are festive.", "Yellow balloons are seen on the vehicle.", "There is a bunch of yellow orbs on the back of the yellow truck."], "image": "train2014/COCO_train2014_000000131044.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318361, "question_id": "n2JC9wXR3gdm6x3vQfNKzi", "question": "In what is the most likely type of structure is this room?", "choices": ["bridge", "house", "store", "skyscraper"], "correct_choice_idx": 1, "direct_answers": ["kitchen", "house", "house", "wood", "dining table", "rectangle", "kitchen", "rectangular", "wood", "kitchen"], "difficult_direct_answer": false, "rationales": ["This looks like a residential kitchen so this must be a house.", "It looks like a regular residential kitchen, which can be found in a house.", "The room is in a normal residential houe."], "image": "val2014/COCO_val2014_000000318361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521752, "question_id": "n2Kbc9dxRuDarHf4Hr24ib", "question": "What brand of electronics are being utilized?", "choices": ["dell", "lenovo", "apple", "hp"], "correct_choice_idx": 2, "direct_answers": ["apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["The brand is apple.", "A fruit logo is on the back of each laptop.", "The logo of apple computing is easily recognized around the world. on the laptop it is clearly visible on its cover."], "image": "train2014/COCO_train2014_000000521752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430495, "question_id": "n2ovrfv2HkPCo8PFtFYXGK", "question": "What might prevent the animals from going to the rightmost side of the image?", "choices": ["tall grass", "train", "fence", "houses"], "correct_choice_idx": 2, "direct_answers": ["fence", "fence", "fence", "fence", "fence", "high greenery", "train", "weeds", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["A fence can be seen in the background, which could stop the animals from walking into the yard.", "The fence is wooden and too high for them to climb over.", "There is a fence up for their safety."], "image": "train2014/COCO_train2014_000000430495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434161, "question_id": "n36wEERooWtaQPByVu6rrR", "question": "What holiday is being Celebrated here?", "choices": ["halloween", "independence day", "mardi gras", "boxing day"], "correct_choice_idx": 1, "direct_answers": ["forth", "july 4th", "fourth july", "july fourth", "baseball", "memorial day", "independence day", "4th july", "july 4th", "independence day"], "difficult_direct_answer": false, "rationales": ["The flags have the american red, white, and blue colors.", "There are red white and blue banners on the fence", "Independence day is celebrated with flags."], "image": "train2014/COCO_train2014_000000434161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126770, "question_id": "n3AHtoR8u6MHxEK3QoSjbg", "question": "What are the people sitting outside the building doing?", "choices": ["dining", "typing", "arm wrestling", "drawing"], "correct_choice_idx": 0, "direct_answers": ["eating", "eating", "drink coffee", "eating", "talking", "eating", "eating", "dining", "dining", "having lunch"], "difficult_direct_answer": false, "rationales": ["The people are eating outside in front of a cafe.", "An outdoor cafe next to a sidewalk displays a chalkboard menu, inviting potential customers into their establishment.", "It is a restaurant."], "image": "train2014/COCO_train2014_000000126770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134648, "question_id": "n3J89mnpCm9xYXT9jbFfLd", "question": "What happened to this person?", "choices": ["slept", "ate food", "strapped", "fell down"], "correct_choice_idx": 3, "direct_answers": ["they fell", "fell down", "they fell", "they fell", "fell", "fell down", "they fell", "fell down", "fell down", "fell"], "difficult_direct_answer": false, "rationales": ["The person fell.", "The man is on the ground in the snow.", "They lost their balance while skiing."], "image": "train2014/COCO_train2014_000000134648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273951, "question_id": "n3dwWcMw6ATxRfspEn6VA8", "question": "What does the man all the way to the right have on his head?", "choices": ["goggles", "scarf", "cowboy hat", "cone"], "correct_choice_idx": 0, "direct_answers": ["goggles", "ski goggles", "helmet", "goggles", "goggles", "protecting", "ski goggles", "goggles", "goggles", "hat"], "difficult_direct_answer": false, "rationales": ["The man has ski goggles on his head.", "The man has goggles on.", "The man is located based on the text of the question and the object on his head has a defining shape, size and is being worn in a manner consistent with answer a."], "image": "train2014/COCO_train2014_000000273951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202799, "question_id": "n48rNynNUbWaP6HBnHoRi3", "question": "Why is the boy wearing purple touching the bottom of the skateboard?", "choices": ["throw it", "clean it", "performing tricks", "massage it"], "correct_choice_idx": 2, "direct_answers": ["performing tricks", "ensure landing", "trick", "retain control", "jumping", "balance", "for trick", "perform trick", "skating", "riding it"], "difficult_direct_answer": true, "rationales": ["He is performing stunts.", "The skateboard is airborne. when a skateboard is airborne like this it is often for the purpose of doing a trick and a hand would be placed as such keep the board close and complete the trick.", "He's doing tricks."], "image": "val2014/COCO_val2014_000000202799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2315, "question_id": "n4Fyw3j3zZ5vu9dE9xkLbB", "question": "What is a unique feature of these animals?", "choices": ["quills", "gills", "neck", "trunk"], "correct_choice_idx": 3, "direct_answers": ["trunk", "trunks", "trunk", "ears", "trunk", "trunk", "trunks", "trunk", "tusks", "trunk"], "difficult_direct_answer": false, "rationales": ["No other animal has a trunk like an elephant.", "The feature is a trunk.", "The animals have long trunks."], "image": "val2014/COCO_val2014_000000002315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512948, "question_id": "n4QUcW4PTeFsLfjtcxBQR6", "question": "What food group has been produced by these plants?", "choices": ["vegetables", "meats", "fruits", "grains"], "correct_choice_idx": 2, "direct_answers": ["fruit", "bananas", "fruit", "fruits", "fruit", "fruits", "fruits", "fruit", "fruit", "fruit"], "difficult_direct_answer": false, "rationales": ["Green bananas are displayed. bananas are categorized as fruit.", "The group is fruits.", "We see bunches of bananas in this image, though green and not close to being ripe. bananas are a type of fruit."], "image": "train2014/COCO_train2014_000000512948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223496, "question_id": "n5LFjcATDFqWFUeNvXsg8J", "question": "Who might use the boat on the beach?", "choices": ["lifeguards", "senators", "custodians", "police"], "correct_choice_idx": 0, "direct_answers": ["sailor", "sailor", "lifeguards", "fishermen", "life guard", "fisherman", "fishers", "fisherman", "fisherman", "lifeguard"], "difficult_direct_answer": false, "rationales": ["The boat is for emergency use.", "The lifeguards can use the boat to save people from the water.", "The boat next to the beach in which people might need to be rescued."], "image": "val2014/COCO_val2014_000000223496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284910, "question_id": "n5NcfHSqxoLvQsJBHi4cDG", "question": "What is the woman telling the dogs to do?", "choices": ["roll over", "drink", "eat", "sit"], "correct_choice_idx": 3, "direct_answers": ["sit", "sit", "sit", "sit", "sit", "sit", "stay", "sit", "sit", "sit"], "difficult_direct_answer": false, "rationales": ["The woman wants them to sit.", "The woman is telling the dogs to sit by using her hands as a command.", "The woman is holding her hand out towards the dog telling it to stay in that spot with its bottom on the ground."], "image": "train2014/COCO_train2014_000000284910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79471, "question_id": "n5eNZTJ8z7AF4bfv8xdcCV", "question": "What age person is likely to ride in the back seat here?", "choices": ["retiree", "child", "adult", "senior"], "correct_choice_idx": 1, "direct_answers": ["child", "child", "child", "nine", "two years", "young", "child", "young", "ten", "child"], "difficult_direct_answer": false, "rationales": ["A little kid will ride in the back.", "Children need to sit in the backseat of cars for safety reasons.", "Small children often play with teddy bears to give them comfort."], "image": "train2014/COCO_train2014_000000079471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513066, "question_id": "n5gLhoaVPXfuBKNKzZbtHr", "question": "What is the doctor doing to this person's leg?", "choices": ["stitches", "tattoo", "waxing", "cleaning wound"], "correct_choice_idx": 0, "direct_answers": ["surgery", "suturing", "suturing", "stitches", "stitching", "stitching wound", "stitching", "stitching", "sewing", "stitches"], "difficult_direct_answer": false, "rationales": ["The woman has thread on someone's leg.", "The doctor is sewing the cut on the leg.", "The doctor is stitching up the leg."], "image": "val2014/COCO_val2014_000000513066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85213, "question_id": "n5gZwqTyyXxuR3FWchuyxE", "question": "What muscles in the male showing off?", "choices": ["biceps", "deltas", "triceps", "quads"], "correct_choice_idx": 0, "direct_answers": ["biceps", "biceps", "biceps", "biceps", "biceps", "arms", "biceps", "arm", "biceps", "biceps"], "difficult_direct_answer": false, "rationales": ["The man is showing his arms.", "When someone flexes their arms up it shows off this muscle.", "The muscles are biceps."], "image": "train2014/COCO_train2014_000000085213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235844, "question_id": "n5t6RDZtYKDYukbcg4SFfQ", "question": "What type religion is sheltered here?", "choices": ["hindu", "protestant", "catholic", "muslim"], "correct_choice_idx": 1, "direct_answers": ["baptist", "baptist", "christian", "christine", "protestant", "baptist", "baptist", "baptist", "bus", "baptist"], "difficult_direct_answer": false, "rationales": ["Protestants attend churches that are baptist.", "The bus says baptist.", "It's a protestant church."], "image": "train2014/COCO_train2014_000000235844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49371, "question_id": "n66dULiRDMq2U8C3C2BgaS", "question": "What does the object use to speed?", "choices": ["gas", "gears", "engine", "fire"], "correct_choice_idx": 1, "direct_answers": ["gears", "chain", "pedals", "foot pedals", "bicycle", "wheels", "pedals", "moving forward", "pedals", "wheels"], "difficult_direct_answer": false, "rationales": ["These change how hard or easy it is to pedal", "You can switch gears on a bike to make it go faster.", "It is a bicycle, which are known to basically all have gears to shift speeds. bicycles are manually powered (meaning, without the use of a combustible)."], "image": "val2014/COCO_val2014_000000049371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402887, "question_id": "n69j7z7cV5AuNg7WP23X8n", "question": "Why are the boxes lined up and on display?", "choices": ["to donate", "to clean", "to sell", "as art"], "correct_choice_idx": 2, "direct_answers": ["sales presentation", "presentation", "sale", "to sell", "for sale", "sell quickly", "party platter", "gift boxes", "store display", "to sell"], "difficult_direct_answer": true, "rationales": ["There are many of them and they're located in either a store or market stall.", "The items have price tags on them. they are not being donated.", "The boxes are lined up to be sold at a market."], "image": "val2014/COCO_val2014_000000402887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149356, "question_id": "n6HAYEnrDcKmwEXUZc6uxY", "question": "What color are her glasses?", "choices": ["gold", "red", "white", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The glasses are too dark to be white, but lack any other color.", "The sunglasses have a dark color in order to reduce the amount of light and protect your eyes.", "The woman is wearing sunglasses which are meant to block the sun."], "image": "train2014/COCO_train2014_000000149356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83663, "question_id": "n6MY2vvBDCYgaXaRoiyMuR", "question": "What floor level are these drinks on?", "choices": ["first", "second", "basement", "third"], "correct_choice_idx": 0, "direct_answers": ["second", "second", "second", "above ground", "window", "ground", "first", "first", "top", "ledge"], "difficult_direct_answer": false, "rationales": ["Based on the perspective of the image and the location of the ground visible in relation to the drinks, answer a is consistent.", "The level is the first.", "From this perspective and based on where the ground level is where the dog can be seen it is not far enough up to be a second level or higher and is not under ground."], "image": "train2014/COCO_train2014_000000083663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172545, "question_id": "n6a7CuvcmwV9KbnUuGoYhb", "question": "Where are these people located?", "choices": ["highway", "parking lot", "driveway", "parkway"], "correct_choice_idx": 2, "direct_answers": ["driveway", "motorcycle", "outside", "driveway", "driveway", "outside", "parking lot", "on motorbike", "driveway", "on motorcycle"], "difficult_direct_answer": false, "rationales": ["The people are in a driveway.", "The people are in their own driveway.", "The motorcycle is parked near a house."], "image": "train2014/COCO_train2014_000000172545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29059, "question_id": "n6o9A5vHWb5Zgj7CLRQVoA", "question": "What breakfast food shares the same name of the sweet treat the girl is holding?", "choices": ["hash browns", "waffle", "pancake", "biscuit"], "correct_choice_idx": 1, "direct_answers": ["unknown", "waffle", "pastry", "sandwich", "breakfast burrito", "french toast", "baguette", "waffle", "pastry", "bun"], "difficult_direct_answer": false, "rationales": ["The shown meal is a waffle that kids take.", "She has a waffle cone.", "The girl has a waffle."], "image": "val2014/COCO_val2014_000000029059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213827, "question_id": "n6qtCzQoLrxeyhEmB273Vn", "question": "What is the woman diving into?", "choices": ["pool", "waterfall", "pond", "ocean"], "correct_choice_idx": 3, "direct_answers": ["water", "lake", "ocean", "lake", "water", "water", "water", "lake", "lake", "water"], "difficult_direct_answer": false, "rationales": ["The woman goes to the ocean.", "The woman is going into the ocean.", "The large body of water is outside and is not falling over a cliff."], "image": "val2014/COCO_val2014_000000213827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573125, "question_id": "n79WtZeZUzsTbrcqiN2vZN", "question": "What utensil are the people holding?", "choices": ["knife", "scissors", "spoon", "fork"], "correct_choice_idx": 1, "direct_answers": ["giant scissors", "scissors", "ripen", "scissors", "scissors", "scissors", "scissors", "scissors", "scissors", "scissors"], "difficult_direct_answer": false, "rationales": ["The scissors are used to cut the ribbon", "They are cutting a giant ribbon and using the typical design seen in a pair of scissors.", "These are large ceremonial ones."], "image": "train2014/COCO_train2014_000000573125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427384, "question_id": "n7WqNwRa7ZFHVkqkzxC6e5", "question": "How many women are kissing the man?", "choices": ["seven", "three", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["He has one on each side of him.", "There is one woman on each side of his face kissing each cheek.", "There are more than one but less than three women with their lips near the man's cheeks."], "image": "train2014/COCO_train2014_000000427384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130419, "question_id": "n7dpFoFJFQVHkKC3Qutcsr", "question": "What food and beverage purveyor did this woman visit most recently?", "choices": ["starbucks", "burger king", "taco bell", "mcdonalds"], "correct_choice_idx": 0, "direct_answers": ["starbucks", "starbucks", "starbucks", "starbucks", "coffee", "starbucks", "starbucks", "starbucks", "starbucks", "coffee"], "difficult_direct_answer": false, "rationales": ["The woman has a starbucks logo on her cup.", "The woman is holding a cup in her hand with a visible logo. the logo is known to be that of answer a.", "It has a green logo on the cup"], "image": "val2014/COCO_val2014_000000130419.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546041, "question_id": "n7hxkHf9MjHW6u48mYQAmL", "question": "How would pouring the red stuff on the pizza change it?", "choices": ["more salty", "more spicy", "more wet", "more bland"], "correct_choice_idx": 1, "direct_answers": ["pizza", "add flavor", "spicy", "spicier", "sauce", "spicier", "taste", "spicy", "more spicy", "make hotter"], "difficult_direct_answer": false, "rationales": ["The red spice will make it taste spicy.", "This is used to season food and it is made from crushed red chili peppers.", "It'd be spicier."], "image": "train2014/COCO_train2014_000000546041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328638, "question_id": "n7iMogTpRQgKZ9dcWmHrra", "question": "What structure is atop the elephant?", "choices": ["ladder", "seat", "hat", "wagon"], "correct_choice_idx": 1, "direct_answers": ["saddle", "howdah", "seat", "seat", "seat", "seat", "saddle", "seat", "saddle", "seat"], "difficult_direct_answer": false, "rationales": ["The structure is a seat.", "It has a backrest and handles for a person to sit and hold on to.", "There is a seat."], "image": "train2014/COCO_train2014_000000328638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122354, "question_id": "n7mPbtBbR3dd3u4jq59jbG", "question": "What pedal does the driver of the black van have their foot on?", "choices": ["brake", "accelerator", "none", "fast forward"], "correct_choice_idx": 0, "direct_answers": ["brake", "brake", "brake", "brakes", "brake", "brakes", "brakes", "brake", "brake", "brake"], "difficult_direct_answer": false, "rationales": ["The black van's red lights are on, showing those behind it that the van is stopped or slowing.", "The van is stopped.", "The driver is stopped."], "image": "val2014/COCO_val2014_000000122354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514392, "question_id": "n8BHmPevBgRhyX4b3DKp9z", "question": "Which skiers gather under the pavilion nearest here?", "choices": ["beginners", "retirees", "olympians", "pros"], "correct_choice_idx": 0, "direct_answers": ["ski lift", "beginners", "daredevils", "mountain", "first timers", "left", "beginners", "people", "mountain", "amateurs"], "difficult_direct_answer": false, "rationales": ["Young skiers are on a ski hill that is not very steep.", "The pavilion states \"first time,\" referring to experience levels.", "The skiers are beginners."], "image": "train2014/COCO_train2014_000000514392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398099, "question_id": "n8WEUAV8c97jpvNKqbtrQF", "question": "Who is going to serve the ball?", "choices": ["her opponent", "her partner", "referee", "her"], "correct_choice_idx": 3, "direct_answers": ["woman", "tennis player", "woman", "woman", "female", "her", "woman", "player", "white shirt", "tennis player"], "difficult_direct_answer": false, "rationales": ["The woman is bouncing the ball in front of her.", "The woman will serve the ball.", "The woman with the ball."], "image": "train2014/COCO_train2014_000000398099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536725, "question_id": "n8cppQ5ry9BjeM99Xvg3m5", "question": "What type of pants is the man wearing?", "choices": ["corduroy", "cargo pants", "jeans", "dress pants"], "correct_choice_idx": 3, "direct_answers": ["khaki pants", "dress pants", "dress pants", "dress pants", "khaki", "dress", "dress pants", "khakis", "dress", "dress pants"], "difficult_direct_answer": false, "rationales": ["The man is in work clothes.", "He is wearing dress pants.", "The man is wearing slacks which are considered dressy."], "image": "val2014/COCO_val2014_000000536725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142263, "question_id": "n8fk47DQrDSQh8CRAK7h7g", "question": "Those birds are related to what other type of bird?", "choices": ["raven", "robin", "peacock", "dove"], "correct_choice_idx": 3, "direct_answers": ["pigeons", "seagull", "flying", "doves", "doves", "crows", "pigeon", "pigeons", "dove", "dove"], "difficult_direct_answer": false, "rationales": ["The birds are doves.", "The birds look like doves and pigeons. they appear to be a nuisance.", "They are pigeons."], "image": "train2014/COCO_train2014_000000142263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139389, "question_id": "n8ozsGjsbNgKAtRwb9HyF8", "question": "What matches the color of the car?", "choices": ["cow", "sky", "chicken", "mud"], "correct_choice_idx": 1, "direct_answers": ["store trim", "ocean", "building", "building trim", "building paint", "sky", "sky", "trim", "quiznos sub", "building"], "difficult_direct_answer": false, "rationales": ["Thee car is blue like the sky.", "The sky is gray as is the car.", "The car is blue. cows, chickens, and mud are not blue."], "image": "train2014/COCO_train2014_000000139389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533045, "question_id": "n95t8VjPoRt4E8fLvczT9o", "question": "What is next to each pizza pie?", "choices": ["napkin", "baby", "knife", "umbrella"], "correct_choice_idx": 2, "direct_answers": ["tasty", "silverware", "knives", "eating utensils", "utensils", "knife fork", "butter knife", "fork", "knife", "knife"], "difficult_direct_answer": true, "rationales": ["A knife is used to slice the pizza.", "There is a knife.", "There is a serrated utensil used for cutting to the right side of the plates that contain the pizza pie."], "image": "train2014/COCO_train2014_000000533045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405271, "question_id": "n9CVzNRdcvJhyfGffqVeEy", "question": "How might people here propel themselves forward if they aren't going downhill?", "choices": ["taxi", "using poles", "uber", "wind"], "correct_choice_idx": 1, "direct_answers": ["skiing", "pushing pies", "ski pole", "using poles", "with poles", "lift", "ski poles", "use skaters", "skis", "poles"], "difficult_direct_answer": true, "rationales": ["They use the poles.", "The people are carrying long sticks in their hands.", "They can use their poles to help move themselves forward."], "image": "train2014/COCO_train2014_000000405271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316700, "question_id": "n9FDRTETZXgqhbyJp7zMgd", "question": "What is the white ingredient in the cup by the pickle?", "choices": ["tartar sauce", "butter", "cream cheese", "mayonnaise"], "correct_choice_idx": 0, "direct_answers": ["mayo", "coleslaw", "mayonnaise", "white sauce", "tartar sauce", "dressing", "mayonnaise", "cabbage", "tartar sauce", "mayonnaise"], "difficult_direct_answer": false, "rationales": ["There is seafood on the plate, which is usually served with tartar sauce.", "The ingredient is tartar sauce.", "The cup has tartar sauce."], "image": "val2014/COCO_val2014_000000316700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62195, "question_id": "n9HjrZHHVh7L59vWCdMgxj", "question": "What is the umbrella used to block?", "choices": ["snow", "hail", "rain", "sun"], "correct_choice_idx": 3, "direct_answers": ["sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["It helps keep them cooler and the sun out.", "The sky is clear. it is not raining, hailing, or snowing.", "The people riding the elephant are carrying an umbrella to protect themselves from the sun's rays."], "image": "train2014/COCO_train2014_000000062195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482381, "question_id": "n9T34kcmvTnzobEMr4r4Cg", "question": "What type of room is this?", "choices": ["dining", "living room", "entertainment room", "kitchen"], "correct_choice_idx": 0, "direct_answers": ["kitchen", "dining", "kitchen", "kitchen", "dining room", "kitchen", "dining room", "kitchen", "dining room", "dining"], "difficult_direct_answer": false, "rationales": ["The table is set for people to eat.", "This is a room that people will sit at the table all together and eat.", "A room with a table and chairs can be seen near a kitchen."], "image": "train2014/COCO_train2014_000000482381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181466, "question_id": "n9b2mJQHJLgZTf4EvAoHuR", "question": "What is this man's hobby?", "choices": ["photography", "painting", "football", "golf"], "correct_choice_idx": 0, "direct_answers": ["photographer", "photographer", "photography", "photography", "photographer", "photography", "photography", "photography", "photography", "photography"], "difficult_direct_answer": false, "rationales": ["A man is sitting and using a camera to take a picture of flowers.", "He is holding a camera and has a bag of equipment", "The man has an expensive camera."], "image": "val2014/COCO_val2014_000000181466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186443, "question_id": "n9hycXgyj9QhpVsWSYfvBB", "question": "What has been in this location the longest?", "choices": ["men", "water", "surfboard", "metal structures"], "correct_choice_idx": 1, "direct_answers": ["water", "water", "pier", "water", "pier", "water", "water", "pier", "water", "pier"], "difficult_direct_answer": false, "rationales": ["The water has been there before the men, the structures, or even the surfboard.", "Oceans have been on the planet for a very long time", "People are in the ocean with surfboards. oceans have been around longer than people."], "image": "val2014/COCO_val2014_000000186443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362072, "question_id": "nA3vN9CJRNWVzY6yyFZmfP", "question": "What color is the largest fruit on the plate?", "choices": ["brown", "green", "yellow", "red"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "orange", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "orange"], "difficult_direct_answer": false, "rationales": ["The melon at the center is the largest item in width.", "The largest fruit on the plate is bright yellow.", "There are bananas."], "image": "train2014/COCO_train2014_000000362072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191850, "question_id": "nANXkzUoqD2AQ4pnWXKHVg", "question": "Why are the people carrying umbrellas?", "choices": ["blocking rain", "blocking sun", "to dance", "to sell"], "correct_choice_idx": 1, "direct_answers": ["shade", "shade", "shade", "block sun", "sunny", "blocking sun", "sun protection", "sunny", "sunny", "shade"], "difficult_direct_answer": false, "rationales": ["The sun is bright.", "It is really bright outside and there are no clouds.", "The main two functions of umbrellas are to protect from sun or rain. there is no visible rain as there is nothing wet, but judging by the light and the shadows their is sunlight which is what some people may want to avoid."], "image": "train2014/COCO_train2014_000000191850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393286, "question_id": "nAf2rXStvVsk8yPHTb6sMv", "question": "What type of phone is nearby?", "choices": ["cellular", "landline", "payphone", "rotary"], "correct_choice_idx": 1, "direct_answers": ["desk", "desk phone", "landline", "open banana", "work phone", "landline", "landline", "landline", "desk phone", "landline"], "difficult_direct_answer": false, "rationales": ["The phone is a landline.", "The phone has push-buttons, not a rotary dialer. it not wireless and does not have a coin slot.", "This is a landline phone based on the keypad configuration and size."], "image": "train2014/COCO_train2014_000000393286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464178, "question_id": "nB7pdcLHZhyodZvnafatYX", "question": "What feature is this animal most known for?", "choices": ["big teeth", "gills", "long neck", "short legs"], "correct_choice_idx": 2, "direct_answers": ["neck", "tall neck", "neck", "long neck", "long neck", "neck", "long neck", "long neck", "long neck", "neck"], "difficult_direct_answer": false, "rationales": ["These animals have long necks to eat the leaves in the trees.", "These animals are giraffes, not fish. they are tall.", "The giraffe is known for its long neck."], "image": "val2014/COCO_val2014_000000464178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328833, "question_id": "nBagzXTeBNU2efqVYLeJsE", "question": "What is likely used in the item under the bricks on the left?", "choices": ["paper", "frozen food", "wood", "bread"], "correct_choice_idx": 2, "direct_answers": ["firewood", "stools", "logs", "wood", "wood", "fireplace", "wood", "fire", "water", "chair"], "difficult_direct_answer": false, "rationales": ["It is a wood-burning fireplace.", "A home has a fireplace near the kitchen.", "A fireplace is shown. wood is used in fireplaces."], "image": "train2014/COCO_train2014_000000328833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80553, "question_id": "nByyPmyzntgs7sCzMH2Bdq", "question": "Where are these people standing?", "choices": ["post office", "library", "airport", "casino"], "correct_choice_idx": 2, "direct_answers": ["airport", "waiting", "airport", "airport", "waiting", "waiting", "airport", "waiting", "airport", "airport"], "difficult_direct_answer": false, "rationales": ["The people are in an airport.", "They are standing in line waiting to check in for their airlines.", "They're at an airport."], "image": "train2014/COCO_train2014_000000080553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235595, "question_id": "nCB7RMDRQAXNJC3oWQJWvM", "question": "What is the parent organization of the SUV?", "choices": ["ford", "toyota", "honda", "ferrari"], "correct_choice_idx": 1, "direct_answers": ["toyota", "lexus", "lexus", "honda", "toyota", "ford", "handi", "toyota", "toyota", "harley davidson"], "difficult_direct_answer": false, "rationales": ["Toyota owns the suv.", "The suv is lexus, which belongs to toyota.", "The organization is toyota."], "image": "val2014/COCO_val2014_000000235595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578250, "question_id": "nCLW5ZA8mpR4jDqTGtWRJR", "question": "What are they doing with the spatulas?", "choices": ["eating lunch", "making soup", "stirring together", "cleaning up"], "correct_choice_idx": 2, "direct_answers": ["cooking", "stirring", "stirring together", "cooking", "stirring", "cooking", "cooking", "cooking", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["They have the spatulas in the same skillet.", "They are using the spatula to stir the food in the pan.", "Woman are in kitchen holding spatulas in bowls. spatulas are used to stir."], "image": "train2014/COCO_train2014_000000578250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343211, "question_id": "nCMyqaepVDnuJBbZbys3B5", "question": "Upon what surfaced court is this game being played?", "choices": ["asphalt", "concrete", "grass", "clay"], "correct_choice_idx": 3, "direct_answers": ["clay", "clay", "clay", "clay", "clay", "dirt court", "tennis", "sand", "clay", "clay"], "difficult_direct_answer": false, "rationales": ["The red color and consistency suggests that this is a clay surface.", "The man is playing tennis on a brownish colored court.", "The court is red."], "image": "train2014/COCO_train2014_000000343211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239936, "question_id": "nCxa3CNxN8C6HiMHSARbLb", "question": "What was this home team's previous name?", "choices": ["montreal expos", "washington wizards", "charlotte hornets", "houston oilers"], "correct_choice_idx": 0, "direct_answers": ["black sox", "montreal expos", "montreal expos", "red sox", "capitols", "red skins", "not know", "cardinals", "boston americans", "washington nationals"], "difficult_direct_answer": true, "rationales": ["The washington nationals used to be called the montreal expos.", "The expos were a team in the past that played baseball.", "The name of the team can be inferred from the logo and the historic names of the team can be internet searchable."], "image": "train2014/COCO_train2014_000000239936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418711, "question_id": "nD5xSFkVzMTZigziMq7fcf", "question": "To what location is the man on the bike headed?", "choices": ["tire shop", "pet shop", "butcher", "fruit market"], "correct_choice_idx": 3, "direct_answers": ["market", "market", "front", "market", "downtown", "fruit market", "market", "market", "north", "market"], "difficult_direct_answer": false, "rationales": ["The man is carrying fruit to the downtown fruit market.", "The man's bike is carrying bananas, not meat or pet food. the bicycle's tires are in good condition.", "He has bundles of bananas to sell."], "image": "val2014/COCO_val2014_000000418711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58184, "question_id": "nDB3vwAwXBtNqoseeh4vJF", "question": "What shines on the plate under the biscuit?", "choices": ["butter", "honey", "plate only", "mirror"], "correct_choice_idx": 1, "direct_answers": ["syrup", "gravy", "syrup", "syrup", "honey", "syrup", "maple syrup", "syrup", "syrup", "liquid"], "difficult_direct_answer": false, "rationales": ["There is a brown substance underneath.", "The honey shines.", "People will eat this dish with honey."], "image": "train2014/COCO_train2014_000000058184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267683, "question_id": "nDM4ky7bts4wHvmBFRwJt8", "question": "What degree did she get to qualify for this role?", "choices": ["none", "associate's", "bachelor's", "master's"], "correct_choice_idx": 0, "direct_answers": ["culinary", "culinary", "none", "chef", "cooking", "culinary arts", "culinary", "culinary", "chef", "culinary"], "difficult_direct_answer": false, "rationales": ["There are no degrees.", "She does not need a degree to cook food.", "You don't need a degree to work in the kitchen."], "image": "val2014/COCO_val2014_000000267683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550719, "question_id": "nDXJCahfyQ8uZfs9AYCG7n", "question": "Why is the batter wearing gloves?", "choices": ["warmth", "germs", "fashion", "grip"], "correct_choice_idx": 3, "direct_answers": ["nonslip grip", "hold bat", "sweat", "catching ball", "grip", "for grip", "protection", "grip", "improve grip", "grip"], "difficult_direct_answer": false, "rationales": ["This is so the bat doesn't slip out of his hands", "The batter needs to grip the bat.", "The batter has gloves on his hands to better hold the bat with."], "image": "train2014/COCO_train2014_000000550719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490629, "question_id": "nDfvUNTz7yadCDy69knGNd", "question": "Why is the woman wearing a triangular hat?", "choices": ["dress code", "visibility", "protection", "camouflage"], "correct_choice_idx": 2, "direct_answers": ["protection", "for shade", "bike", "local style", "block sunlight", "weather protection", "shade", "block sun", "cultural", "protection"], "difficult_direct_answer": true, "rationales": ["This keeps the sun off her face", "Her hat will keep the sun off her face and out of her eyes.", "The hat is used for protection."], "image": "val2014/COCO_val2014_000000490629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467257, "question_id": "nDjbq2zoBmVtPfq8vCASJ7", "question": "What powers the item this man works on?", "choices": ["battery", "generator", "diesel", "gas"], "correct_choice_idx": 0, "direct_answers": ["charger", "electricity", "battery", "battery electricity", "electricity", "electricity", "electricity", "electricity", "electricity", "battery"], "difficult_direct_answer": false, "rationales": ["Laptops naturally come with a battery in them. it has no space for any gas or a generator.", "Obviously laptops are not powered by gas or diesel and as far as i know they aren't powered by generators but these days i can't be sure. the obvious answer is \"a\".", "The man is on a laptop."], "image": "val2014/COCO_val2014_000000467257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410019, "question_id": "nDqvbEipKHCEwA6YB8wSVY", "question": "What is the man playing on?", "choices": ["longboard", "bike", "scooter", "skateboard"], "correct_choice_idx": 2, "direct_answers": ["scooter", "scooter", "scooter", "scooter", "scooter", "scooter", "scooter", "scooter", "scooter", "scooter"], "difficult_direct_answer": false, "rationales": ["He's on a scooter.", "The object being used is clearly visible and has two wheels connected be a horizontal bar and a vertical bar with handles which are all features of scooters.", "The man is playing on a scooter in his drivewayt."], "image": "val2014/COCO_val2014_000000410019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387492, "question_id": "nEQrRkLaH7eCZWLFhermAm", "question": "What type of ramp is the skateboarder jumping off?", "choices": ["bowl", "half pipe", "pool", "wet pipe"], "correct_choice_idx": 1, "direct_answers": ["playing", "halfpipe", "launch", "half pipe", "skateboard", "half pipe", "skateboarding ramp", "half pipe", "halfpipe", "high jump"], "difficult_direct_answer": false, "rationales": ["There is a steep wall.", "The ramp is not very large.", "This ramp is known as a half pipe."], "image": "train2014/COCO_train2014_000000387492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419145, "question_id": "nEdeJLVQMfjv79LWi4FByY", "question": "What is the girl doing with the device she is holding?", "choices": ["playing games", "art", "watching movies", "taking pictures"], "correct_choice_idx": 3, "direct_answers": ["recording", "taking picture", "taking pictures", "taking pictures", "taking picture", "taking pics", "taking picture", "taking picture", "taking pictures", "taking picture"], "difficult_direct_answer": false, "rationales": ["Her reflection can be seen in the reflection of the mirror and this photo is proof of her taking pictures.", "An apple ipad has many functions which are useful. one of them is being a camera which allows it to take pictures.", "The girl is taking a photo."], "image": "train2014/COCO_train2014_000000419145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526523, "question_id": "nEtfNNiXs9v2dSy2JXiq4U", "question": "Which activity are the boots that the girls are wearing best used for?", "choices": ["hiking", "football", "swimming", "soccer"], "correct_choice_idx": 0, "direct_answers": ["hiking", "for hiking", "hiking", "hiking", "hiking", "hiking", "hiking", "hiking", "for hiking", "hiking"], "difficult_direct_answer": false, "rationales": ["The kids are wearing hiking boots.", "Girls sit together in boots in a wooded area.", "The boots are used to hike in the woods since they're heavy duty."], "image": "train2014/COCO_train2014_000000526523.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509654, "question_id": "nF7NUZzWYE4AAd7X5tibMd", "question": "What is on the plate all the way to the right?", "choices": ["banana", "egg", "apple", "lemon"], "correct_choice_idx": 0, "direct_answers": ["banana", "banana bread", "banana bread", "banana", "banana", "banana", "banana bread", "banana", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["It's a unripe one.", "It is an elongated, yellow piece of fruit that needs to be peeled in order to eat.", "The object is long, yellow, sort of skinny, and has that \"stick\" thing at the very top resembling such fruit."], "image": "train2014/COCO_train2014_000000509654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480088, "question_id": "nF85BhFuUPKhp3yamRaMoA", "question": "What sort of activity are we seeing here?", "choices": ["singing", "mime", "fist fight", "comic routine"], "correct_choice_idx": 3, "direct_answers": ["kidnapping", "comedy", "acting", "comedy", "tied up", "attached", "kidnapping", "comic routine", "arrested here", "burglary"], "difficult_direct_answer": false, "rationales": ["It's the three stooges. the other options don't match the scene.", "The three stooges were comedians known for their slapstick performances on their television show.", "These are the three stooges who had many movies in early hollywood"], "image": "train2014/COCO_train2014_000000480088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20935, "question_id": "nFDFWVDxo3VQufVaTpVaoZ", "question": "What is following someone you are attracted to called?", "choices": ["stalking", "knowledge", "comradery", "friendship"], "correct_choice_idx": 0, "direct_answers": ["stalking", "crush", "stalking", "stalking", "stalking", "stalking", "stalking", "stalking", "stalking", "stalking"], "difficult_direct_answer": false, "rationales": ["He is looking at someone's social media accounts on his phone.", "If you are following them and they don't know it you are stalking them.", "It's creepy to follow someone."], "image": "val2014/COCO_val2014_000000020935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486974, "question_id": "nFEZ42iztK22Ee5rT7iite", "question": "What sort of sounding device might be located above the clock?", "choices": ["bells", "town crier", "organ", "alarm clock"], "correct_choice_idx": 0, "direct_answers": ["gong", "bells", "bells", "bells", "bell", "bell", "bell", "drawing", "bell", "bell"], "difficult_direct_answer": false, "rationales": ["The device is a bell.", "There are bells on the clock.", "Clock towers typically have bells."], "image": "val2014/COCO_val2014_000000486974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402451, "question_id": "nFF9rBrwhx3qV6uvUKhvNe", "question": "Why does the man have his arms out?", "choices": ["measure", "break fall", "to balance", "to swing"], "correct_choice_idx": 3, "direct_answers": ["returning ball", "backhand shot", "serving ball", "hitting ball", "to swing", "playing tennis", "swinging", "reaching ball", "hit ball", "hitting ball"], "difficult_direct_answer": true, "rationales": ["The man has his arms out and on a tennis racquet in order to swing for the ball.", "The man is playing tennis based on the visible equipment and setting. in this sport, one would do answer a to hit the ball that is visibly in front of him.", "The man is going for the ball in tennis."], "image": "train2014/COCO_train2014_000000402451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214704, "question_id": "nFWLLoiJHuDVfsQ2W2eo2H", "question": "What item will allow for food to retain their freshness?", "choices": ["shelf", "cupboard", "television", "refrigerator"], "correct_choice_idx": 3, "direct_answers": ["refrigerator", "fridge", "preservative", "fridge", "fridge", "refrigerator", "fridge", "fridge", "fridge", "refrigerator"], "difficult_direct_answer": false, "rationales": ["Televisions, cupboards, and shelves have no mechanisms to keep food fresh.", "The fridge will make the food cold.", "The only appliance in the room clearly visible that relates to food is a refrigerator which is commonly used to keep foods from spoiling."], "image": "train2014/COCO_train2014_000000214704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203458, "question_id": "nG2yvQ6M4cydsB4Khz7Dcu", "question": "What is the destination for the bus waiting in traffic?", "choices": ["michigan", "ulta", "men's wearhouse", "express"], "correct_choice_idx": 0, "direct_answers": ["michigan", "michigan", "148", "express", "michigan express", "michigan", "michigan", "michigan express", "traffic rules", "michigan express"], "difficult_direct_answer": false, "rationales": ["A digital sign can be seen above a bus. it mentions where it is going.", "A bus lists its next destination as michigan on the digital sign on top.", "The sign on the front of the bus says michigan because that is its next stop."], "image": "train2014/COCO_train2014_000000203458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83959, "question_id": "nG5CTf6q5Z8UYThhHhvUhc", "question": "What is the pastry to the right of the chocolate donut called?", "choices": ["apple pie", "apple donut", "apple fritter", "apple dumpling"], "correct_choice_idx": 2, "direct_answers": ["apple brown", "cookie", "apple fritter", "fritter", "apple fritter", "apple fritter", "apple fritter", "krueller", "dessert", "apple fritter"], "difficult_direct_answer": false, "rationales": ["It is filled with apples and frosted with a light sugary coating.", "The big pastry is a fritter.", "The other options don't match the shape and ingredients as well as this one."], "image": "train2014/COCO_train2014_000000083959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459749, "question_id": "nG792EYVdGQwbCp8d5kyRG", "question": "The person cooking should beware at this point because the water is at what stage?", "choices": ["cooking", "boiling", "evaporating", "cooling off"], "correct_choice_idx": 1, "direct_answers": ["boiling", "boiling", "boiling", "boiling", "boiling point", "boiling", "boiling", "boiling", "boiling", "boiling point"], "difficult_direct_answer": false, "rationales": ["The water is bubbling.", "Carrots are in a pot of water on the stove that is steaming and has some bubbles appearing at the surface of the water.", "When bubbles start appearing in a pot of water on the stove, that means it has heated up to at least 212 f and is dangerously hot."], "image": "val2014/COCO_val2014_000000459749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161823, "question_id": "nGW9BYACuvy4TJC23vPjJ7", "question": "What does the light from the appliance with a door best let you see?", "choices": ["baking food", "freezing food", "coffee", "plates"], "correct_choice_idx": 0, "direct_answers": ["food", "cooking pan", "things", "food", "food", "food", "oven interior", "food", "baking food", "cooking"], "difficult_direct_answer": false, "rationales": ["The appliance is an oven. the light is inside of the oven and helps you to better see what is cooking inside.", "Since ovens are used for heating up sustenance for humans, having a light will help you determine if it's ready.", "The light is for baking."], "image": "train2014/COCO_train2014_000000161823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70636, "question_id": "nGaJhxPnsQmePhLZtxamdt", "question": "What is above the horses?", "choices": ["fly", "kite", "sky", "dragon"], "correct_choice_idx": 2, "direct_answers": ["sky", "sky", "sky", "clouds", "sky", "sky", "sky", "clouds", "sky", "clouds"], "difficult_direct_answer": false, "rationales": ["The horses have the sky above them.", "The sky is visibly above the horses in this image.", "The sky is above them."], "image": "val2014/COCO_val2014_000000070636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516945, "question_id": "nGwyzaAsWs9zCeGeBLSnRt", "question": "What has caused the roads to look reflective?", "choices": ["ice", "snow", "wax", "rain"], "correct_choice_idx": 3, "direct_answers": ["rain", "water", "rain", "water", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["A warm precipitation event will cause roads to get which, causing them to reflect any available light. motorists should always slow down on wet roads, since sliding and swerving come into play.", "The grounds are wet.", "Cars on a street are using their wipers and the road is shiny with water puddling in various areas."], "image": "train2014/COCO_train2014_000000516945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168355, "question_id": "nH624QiAizjbGoiCQewFoH", "question": "What is this man selling?", "choices": ["dirt", "spices", "ground insects", "sand"], "correct_choice_idx": 1, "direct_answers": ["spices", "spices", "spices", "spices", "powdered things", "spices", "spices", "spices", "powdered things", "spices"], "difficult_direct_answer": false, "rationales": ["The man is selling spices.", "The items in front of the man are dried and ground up plants used for flavoring.", "This man is selling street spices."], "image": "val2014/COCO_val2014_000000168355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257815, "question_id": "nH84XX3V587AC9teVtpwGe", "question": "This is in the great state of?", "choices": ["alabama", "florida", "new york", "illinois"], "correct_choice_idx": 3, "direct_answers": ["chicago", "illinois", "illinois", "illinois", "chicago", "illinois", "illinois", "illinois", "illinois", "chicago"], "difficult_direct_answer": false, "rationales": ["The sign near the wheel refers to a business named uptown bikes that is based in chicago. this city is not in new york, florida, or alabama.", "The bike says illinois.", "Chicago is printed on one of the bikes."], "image": "train2014/COCO_train2014_000000257815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296825, "question_id": "nHLPc9Y9m6EFq47MPeVz2E", "question": "What is the vehicle design inspired from?", "choices": ["comic book", "song", "history", "book"], "correct_choice_idx": 0, "direct_answers": ["comic book", "legos", "nina turtles", "ninja turtles", "scooby doo", "mutant ninjas", "ninja turtles", "mutant turtles", "tuk tuk", "ninja turtles"], "difficult_direct_answer": false, "rationales": ["A vehicle is decorated in teenage mutant ninja turtle themes which is a cartoon that also has comic books associated with it.", "The design is like a book.", "The vehicle is designed from the ninja turtles comic book."], "image": "val2014/COCO_val2014_000000296825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110313, "question_id": "nHSaCDf4smZ5YXGxfBGn7N", "question": "What sort of art were people here recently enjoying?", "choices": ["painting", "music", "literature", "sculpture"], "correct_choice_idx": 1, "direct_answers": ["opera", "opera", "opera", "opera", "music", "opera", "opera", "opera", "opera", "opera"], "difficult_direct_answer": false, "rationales": ["The lettering on the wall indicates that this is an opera house and opera is a type of music, so the people are enjoying music here.", "The writing on the wall indicates it is an opera house, so that means opera music was taking place there.", "There is a sign that says opera on the building."], "image": "val2014/COCO_val2014_000000110313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208714, "question_id": "nHinHDFxTn6rpqN2uwcJN2", "question": "Why are there two pizzas?", "choices": ["hungry customer", "two customers", "overordered", "baking mistake"], "correct_choice_idx": 1, "direct_answers": ["variety flavors", "many eaters", "two customers", "two people", "different toppings", "family", "many people", "different kinds", "different toppings", "hunger"], "difficult_direct_answer": true, "rationales": ["There is a child sitting on the other side of the table. there is a sippy cup on the other side of the table.", "The pizzas are ordered by two people.", "Two people are at the table eating pizza."], "image": "train2014/COCO_train2014_000000208714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440698, "question_id": "nJJDQTuzYubLKxeL4udfHY", "question": "Where is the cat hanging out most likely?", "choices": ["park", "wild", "backyard", "storefront"], "correct_choice_idx": 2, "direct_answers": ["yard", "basket", "backyard", "apartments", "yard", "home", "neighborhood", "backyard", "in yard", "school"], "difficult_direct_answer": false, "rationales": ["A cat is laying in the grass and bikes and people can be seen behind.", "The cat is in a yard.", "There are several bikes and a screen door on the building which is for residences"], "image": "train2014/COCO_train2014_000000440698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207548, "question_id": "nJsfUZh69tQBxaAnfeBeL2", "question": "What could likely happen to you on this bench?", "choices": ["get seasick", "get sunburn", "get lost", "sandy feet"], "correct_choice_idx": 1, "direct_answers": ["get pictured", "unknown", "fall asleep", "get sunburn", "fall", "photo", "bear attack", "sleep", "take picture", "rest"], "difficult_direct_answer": true, "rationales": ["The kids are likely to get sunburn while sitting on the bench.", "The tree cover opens just the right way to let light shine directly on the bench.", "There is some light peeking through the branches"], "image": "val2014/COCO_val2014_000000207548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293425, "question_id": "nJzvcbRbwrnQjkhqWBM9Mn", "question": "What is most likely being transmitted via the headphones?", "choices": ["movie", "music", "audiobook", "translations"], "correct_choice_idx": 3, "direct_answers": ["audio", "translated speech", "translation", "translated speech", "voice", "translation", "translations", "translation", "voice", "finance information"], "difficult_direct_answer": false, "rationales": ["The men have headphones on at the world economic forum.", "The man in the foreground is asian while the man in the back is white. usually there will be language barriers, so the asian person more than likely knows his language and english while the white man only knows english.", "In this situation, people are gathered from different countries, so headphones do not look out of place as attendants have access to listening in their own languages."], "image": "val2014/COCO_val2014_000000293425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70744, "question_id": "nK4tEP64D54SdtRTmcRToc", "question": "This scene takes place during when?", "choices": ["summer", "vietnam war", "winter", "korean war"], "correct_choice_idx": 0, "direct_answers": ["daylight", "summer", "volleyball game", "summer", "lunch", "summer", "summer", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["It's summertime.", "The scene occurs in the summer since everyone's in shorts.", "There are people with bikes and surfboards."], "image": "val2014/COCO_val2014_000000070744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406770, "question_id": "nKCgz9d3SvxdqZtuynGssW", "question": "What is the job of this bull?", "choices": ["pull", "fight", "run", "dairy"], "correct_choice_idx": 0, "direct_answers": ["pull trolley", "pull cart", "transit", "pull freight", "transport", "pull wagon", "pull", "pull cart", "pull cart", "pull cart"], "difficult_direct_answer": false, "rationales": ["The rope is wrapped around the bulls neck.", "The job is to pull.", "There is a little cart right beside him that he is suppose to pull."], "image": "train2014/COCO_train2014_000000406770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34842, "question_id": "nKVjGRHxzB8Sp6v3guTw2j", "question": "This computer desk is in what type of building?", "choices": ["dormitory", "home", "commercial office", "apartment"], "correct_choice_idx": 2, "direct_answers": ["office building", "office", "office", "office building", "office", "office", "office building", "commercial office", "office", "office"], "difficult_direct_answer": false, "rationales": ["The windows are very large and plain and the floor is tile so it looks industrial", "The space looks like a typical office.", "The desk is commercial."], "image": "train2014/COCO_train2014_000000034842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317349, "question_id": "nKtehCpHnc5JbPfaXAz5Te", "question": "What company makes the sneakers the girls are wearing?", "choices": ["dc", "vans", "sketchers", "converse"], "correct_choice_idx": 3, "direct_answers": ["converse", "converse", "converse", "converse", "jungle", "converse", "chucks", "converse", "jungle", "converse"], "difficult_direct_answer": false, "rationales": ["The girls are wearing chuck taylor all star shoes. these shoes are not made by vans, sketchers, or dc.", "There are the white shoe ends with white bottoms and shoestrings. the middle of the shoe is colored.", "Two women are sitting and talking on the phone and appear to be sitting on top of buildings. they are wearing chucks and have a white toe end cap and stripe on side of shoe."], "image": "train2014/COCO_train2014_000000317349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355092, "question_id": "nKvx9byzXVfwSftKWnU2aE", "question": "Why are all those balloons in the boat?", "choices": ["fell there", "celebration", "for sale", "stolen"], "correct_choice_idx": 1, "direct_answers": ["party", "celebration", "celebration", "celebration", "celebration", "decoration", "celebration", "for celebration", "party", "celebration"], "difficult_direct_answer": false, "rationales": ["The balloons are for a festive event.", "Balloons are for parties", "The balloons are for a celebration."], "image": "train2014/COCO_train2014_000000355092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309692, "question_id": "nKyJQ3qFgQK8DWp8Q3Y3Tq", "question": "What country is this venue situated in?", "choices": ["spain", "honduras", "chile", "mexico"], "correct_choice_idx": 1, "direct_answers": ["honduras", "honduras", "honduras", "america", "honduras", "honduras", "airport", "i do", "honduras", "honduras"], "difficult_direct_answer": false, "rationales": ["The word honduras is clearly visible on the baggage claim. baggage claims are usually situated in airports and when country names are written on a permanent fixture in an airport it is likely the name of the country the airport is in.", "The venue's text indicates honduras.", "The country is honduras."], "image": "val2014/COCO_val2014_000000309692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409216, "question_id": "nL7xfWz9rvcE4PPeiUxzkh", "question": "What is the woman's hat made out of?", "choices": ["straw", "bronze", "leather", "rubber"], "correct_choice_idx": 0, "direct_answers": ["straw", "straw", "straw", "straw", "straw", "straw", "straw", "straw", "straw", "straw"], "difficult_direct_answer": false, "rationales": ["A woman is wearing a woven, light brown hat.", "The hat is made of straw.", "The hat is woven out of a plant like material."], "image": "train2014/COCO_train2014_000000409216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302141, "question_id": "nLVvnp9vTbBTt4WvJr4yFc", "question": "What country is this location?", "choices": ["canada", "spain", "mexico", "france"], "correct_choice_idx": 1, "direct_answers": ["spain", "spain", "mexico", "tenerife", "puerto rico", "spain", "spain", "bulgaria", "italy", "spain"], "difficult_direct_answer": false, "rationales": ["The country would be spain.", "The country is spain.", "Spanish writing is on the sign on the side of a building."], "image": "train2014/COCO_train2014_000000302141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366734, "question_id": "nLWa9zcjp7SukzngCApBc8", "question": "What is the rack near the bus used for?", "choices": ["coats", "pets", "hats", "bikes"], "correct_choice_idx": 3, "direct_answers": ["luggage", "bicycles", "parking bikes", "holding bikes", "bike", "secure bikes", "locking bikes", "bikes", "bikes", "bikes"], "difficult_direct_answer": false, "rationales": ["Bikes need to be kept outside, and this rack allows them to be chained up.", "This rack is used to park bikes.", "The metal racks are for bike storage."], "image": "train2014/COCO_train2014_000000366734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28626, "question_id": "nLWybQyzyBe4LCNbCQMkhP", "question": "What should be the weight of cricket ball?", "choices": ["1.78 ounces", "4.25 ounces", "3.77 ounces", "5.75 ounces"], "correct_choice_idx": 3, "direct_answers": ["twelve ounces", "5.75 ounces", "one pound", "5.62 ounces", "light", "half pound", "5.75 ounces", "5.75 ounces", "5.75 ounces", "light"], "difficult_direct_answer": false, "rationales": ["The weight is 5.75.", "The cricket ball is 5.75 ounces.", "A cricket ball generally clocks in at around 575 ounces."], "image": "train2014/COCO_train2014_000000028626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438623, "question_id": "nLipZUSYHP7ZC2HUPWCy6b", "question": "Why is the food unhealthy?", "choices": ["high fat", "high carbohydrate", "high sodium", "high sugar"], "correct_choice_idx": 2, "direct_answers": ["salty", "fatty", "red meat", "high sodium", "preservatives", "processed", "dirty", "fatty", "salt", "salty"], "difficult_direct_answer": false, "rationales": ["The food has tons of salt in it.", "Kids are eating hot dogs. hot dogs are high in sodium.", "The food has a lot of salt."], "image": "val2014/COCO_val2014_000000438623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21971, "question_id": "nLmmPHdEcxxzvGPBHN9n6P", "question": "The person standing by the water in a bikini is holding what?", "choices": ["parasol", "ball", "cookies", "cat"], "correct_choice_idx": 1, "direct_answers": ["ball", "ball", "beach ball", "ball", "beach ball", "ball", "beach ball", "ball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["She is at the beach.", "The person standing by the water is holding a ball and wearing a bikini.", "The person is by the ball."], "image": "val2014/COCO_val2014_000000021971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277470, "question_id": "nLr9SnnThqiiqkqNTXLGag", "question": "What type of business is shown?", "choices": ["food", "rental", "beauty", "law"], "correct_choice_idx": 1, "direct_answers": ["selling bicycles", "bike shop", "bicycle shop", "street vendors", "bike rental", "bike company", "bike shop", "rental", "bike rental", "bicycle sales"], "difficult_direct_answer": false, "rationales": ["There are bikes to rent.", "There is a line of matching bicycles in front of a store.", "The bikes are all similar and the sign indicates you can use one in exchange for money"], "image": "train2014/COCO_train2014_000000277470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327070, "question_id": "nLwSG95qAqf3Uudkkhvgfr", "question": "What word is on the ground in white letters?", "choices": ["bus", "good", "left", "car"], "correct_choice_idx": 0, "direct_answers": ["lane bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "liner", "bus"], "difficult_direct_answer": false, "rationales": ["The word on the ground is bus.", "It says bus lane.", "The word on the street is bus to signify this is a bus lane."], "image": "val2014/COCO_val2014_000000327070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286813, "question_id": "nMp7ZWBwFvv2JGJCRjaPeQ", "question": "What is this activity?", "choices": ["musical", "play", "race", "concert"], "correct_choice_idx": 2, "direct_answers": ["polo", "harness racing", "trotting", "racing", "wheelchair horseracing", "horse racing", "horse racing", "buggy racing", "harness racing", "race"], "difficult_direct_answer": false, "rationales": ["This is a jocket behind a race horse on a track", "The pony has a number on it. numbers are used for competitions.", "A competition is happening since the horse is numbered."], "image": "val2014/COCO_val2014_000000286813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338531, "question_id": "nMrn2xRXk4DwA7K58xfrBt", "question": "What does the sky depict about the weather?", "choices": ["clear", "cloudy", "rainy", "foggy"], "correct_choice_idx": 0, "direct_answers": ["clear", "sunny", "clear skies", "dry", "blue sky", "sunny", "clear", "clear warm", "beautiful", "sunny"], "difficult_direct_answer": false, "rationales": ["The sky is blue without a cloud in sight. the weather is good.", "The visible sky is all blue with no clouds. when there are no clouds in the sky it is said to be clear.", "It is a clear sky because there are no clouds in it"], "image": "val2014/COCO_val2014_000000338531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502393, "question_id": "nNWALYSE5KKQd6ukrCvrEG", "question": "What does AllPlay sell?", "choices": ["sports equipment", "weightlifting equipment", "games", "armor"], "correct_choice_idx": 2, "direct_answers": ["balls", "skis", "games", "sports", "equipment", "skis", "audio streaming", "sporting goods", "tennis balls", "sports equipment"], "difficult_direct_answer": true, "rationales": ["Allplay is advertised at a sporting event.", "Allplay sells sports equipment.", "They sell games."], "image": "val2014/COCO_val2014_000000502393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24755, "question_id": "nPbT3G923RHpzViKkcM7bT", "question": "Who are all the people amassed behind the skiers watching standing still?", "choices": ["judges", "spectators", "diners", "waiters"], "correct_choice_idx": 1, "direct_answers": ["spectators", "spectators", "spectators", "spectators", "spectators", "spectators", "spectators", "spectators", "spectators", "spectators"], "difficult_direct_answer": false, "rationales": ["They are watching them.", "The people who come to watch the race.", "The people behind the skiers are behind a partition and are not wearing gear needed for competition which is consistent with answer a in this setting."], "image": "val2014/COCO_val2014_000000024755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389138, "question_id": "nPgjCxmj3VrP4bc86BSshb", "question": "The animals without wings have how many legs combined?", "choices": ["four", "three", "six", "eight"], "correct_choice_idx": 3, "direct_answers": ["eight", "eight", "eight", "eight", "four legs", "four legs", "eight", "eight", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["The dog and cow combined have eight legs.", "We see a cow and dog in this image; each have four legs and do not have wings.", "There are a total of 8 legs."], "image": "train2014/COCO_train2014_000000389138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460030, "question_id": "nQH9kwBoTKSo26mQnhNHfX", "question": "What is the red vest the person in the boat is wearing called?", "choices": ["pilots vest", "fashion vest", "life vest", "novelty vest"], "correct_choice_idx": 2, "direct_answers": ["rower", "life jacket", "lifejacket", "life vest", "life vest", "crew captain", "lifejacket", "life jacket", "life jacket", "lifevest"], "difficult_direct_answer": false, "rationales": ["This is a flotation device", "People in a boat have vests on that are flotation devices.", "The red vest is a life vest for emergencies."], "image": "train2014/COCO_train2014_000000460030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109908, "question_id": "nQfNFvxYAq9KVbgpkWXFco", "question": "What are the motorcyclists engaging in?", "choices": ["parade", "racing", "terrorism", "training"], "correct_choice_idx": 0, "direct_answers": ["formation", "parade", "riding", "parade", "ridding", "motorcade", "parade", "parade", "parade", "formation display"], "difficult_direct_answer": false, "rationales": ["There are 8 of them riding in a formation with many people watching, but not at a racetrack.", "They are riding in the street while people watch.", "They're in a parade."], "image": "train2014/COCO_train2014_000000109908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553224, "question_id": "nQjYTTQSvEsJ7cvXosrDfU", "question": "What is the woman holding white items looking at?", "choices": ["spouse", "enemy", "monitor screen", "dog"], "correct_choice_idx": 2, "direct_answers": ["monitor", "monitor screen", "wii screen", "tv screen", "video game", "tv screen", "television", "video game", "television", "television"], "difficult_direct_answer": false, "rationales": ["The two are white items are used to control a monitor.", "The girl is looking a a screen as she's playing a game.", "The woman is holding a console to play on her monitor."], "image": "train2014/COCO_train2014_000000553224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166742, "question_id": "nQm7Esa3cpgVkRj9iUAfdA", "question": "Which vitamin is rich in apple?", "choices": ["vitamin k", "folates", "vitamin b", "vitamin c"], "correct_choice_idx": 3, "direct_answers": ["vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c"], "difficult_direct_answer": false, "rationales": ["The apples on the table are rich in vitamin c.", "The apple is rich in vitamin c.", "Apples are rich in that vitamin."], "image": "train2014/COCO_train2014_000000166742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397117, "question_id": "nQu9HTpdG5h4MbKdTrH25L", "question": "Martin Weissburg is a President of which American truck manufacturing company?", "choices": ["ford", "isuzu", "mack", "volvo"], "correct_choice_idx": 2, "direct_answers": ["mack", "mack trucks", "mack", "mack trucks", "mack", "mack", "mack trucks", "mack", "mack", "mack"], "difficult_direct_answer": false, "rationales": ["The truck in the street is made by mack trucks. martin weissburg is the president of that company.", "Martin weissburg is listed on the company's website as the president.", "Martin weissburg leads the helm of mack."], "image": "val2014/COCO_val2014_000000397117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155450, "question_id": "nQvySh7PuipHFFMXcQSubC", "question": "What is the man trying to teach the young girl?", "choices": ["aerobics", "tennis", "catch", "counting"], "correct_choice_idx": 1, "direct_answers": ["tennis", "tennis", "tennis", "tennis", "play tennis", "tennis", "playing tennis", "tennis", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["She is standing on a court, holding a racket and hitting at yellow balls.", "The man plays tennis.", "The girl is standing behind a net. she is holding a racquet and is about to hit a green ball."], "image": "train2014/COCO_train2014_000000155450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397433, "question_id": "nQw3vScYbununeh7yANJNr", "question": "What items are sold here?", "choices": ["electronics", "calendars", "rings", "animals"], "correct_choice_idx": 0, "direct_answers": ["electronics", "electronics", "electronics", "phones", "electronics", "cell phone", "cell phones", "phones", "cell phones", "electronics/phones"], "difficult_direct_answer": false, "rationales": ["Electronics are sold.", "Electronics are on display.", "There are phones on the table"], "image": "val2014/COCO_val2014_000000397433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498259, "question_id": "nQwLGesiXgFmeNXF9SKvSV", "question": "Why are there so many people on the boat?", "choices": ["taking tour", "fishing expedition", "rowing team", "life boat"], "correct_choice_idx": 2, "direct_answers": ["team sport", "rowing competition", "rowing team", "crew team", "rowing team", "team", "race", "team", "tour group", "rowing team"], "difficult_direct_answer": false, "rationales": ["The matching outfits and text and numbers on the side of this long boat tells us these rowers are on a team together.", "They're on the rowing team.", "The boat has the words dragon boat on the side. dragon boats loaded with this many people in it are probably competing in a race and this is the team."], "image": "val2014/COCO_val2014_000000498259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147681, "question_id": "nRGhc6zttvQnHduUv4QFEX", "question": "Where did the skateboarder begin this move from?", "choices": ["up above", "mid level", "grassy area", "bottom step"], "correct_choice_idx": 0, "direct_answers": ["piller", "top", "top stairs", "top", "top", "up above", "stair top", "above staircase", "top stairs", "upper staircase"], "difficult_direct_answer": false, "rationales": ["The skater is starting from above.", "The skateboarder came from the top of the rail.", "The skateboarder started at the top of the stairs above the railing and then went down."], "image": "val2014/COCO_val2014_000000147681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364589, "question_id": "nRPMxbwDcMTPG48yZaKms6", "question": "What is the mode of feeding of this animals?", "choices": ["frugivores", "carnivores", "omnivores", "herbivores"], "correct_choice_idx": 3, "direct_answers": ["gazing", "bulk", "zookeeper", "eat grass", "herbivores", "grass", "grains/grass", "grazing", "grazing", "grass"], "difficult_direct_answer": false, "rationales": ["These animals are giraffes and zebras. they have plant-based diets.", "These animals have flat teeth which are suitable for eating plants.", "They only eat vegetables."], "image": "val2014/COCO_val2014_000000364589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47492, "question_id": "nRQBLHmumer7LpcT7uwz6f", "question": "Why would this person load the back of their car with these?", "choices": ["delivery driver", "work tools", "road trip", "add weight"], "correct_choice_idx": 2, "direct_answers": ["travel", "overnight trip", "road trip", "vacation", "traveling", "travelling", "road trip", "traveling", "traveling", "leaving town"], "difficult_direct_answer": false, "rationales": ["The person is on a road trip.", "A van is packed with suitcases in the back of it. people pack their stuff into suitcases when they go on a trip.", "There are several suitcases in the back of the trunk."], "image": "train2014/COCO_train2014_000000047492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148373, "question_id": "nRTXuWsfUARMvgu7feBVDz", "question": "What is listed on the chalkboard here?", "choices": ["menu", "vocabulary", "rules", "math"], "correct_choice_idx": 0, "direct_answers": ["menu", "menu", "menu", "menu", "menu", "menu items", "menu", "menu items", "menu", "menu items"], "difficult_direct_answer": false, "rationales": ["They appear to be at a restaurant and the chalkboard shows the food prices and options.", "People are eating food there.", "Restaurants sometimes display their food options on a board for patrons to see."], "image": "train2014/COCO_train2014_000000148373.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542452, "question_id": "nRgKnGFLyjNN3UaX6ywis5", "question": "What sort of traffic is forbidden during this time?", "choices": ["pedestrian", "automobile", "foot", "vendor"], "correct_choice_idx": 1, "direct_answers": ["motor vehicles", "automobile", "car", "car", "cars", "vehicular", "automobile traffic", "car", "cars", "motor vehicles"], "difficult_direct_answer": false, "rationales": ["The traffic being forbidden is automobile because the street is cordoned off and people are walking in it", "The car traffic is forbidden.", "This is a street fair and only people are allowed to walk through. no cars are allowed."], "image": "train2014/COCO_train2014_000000542452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349074, "question_id": "nRnETxifE7cdYTycRm3kgn", "question": "Who was born in the country that the town on the top of the bus is located in?", "choices": ["robert pattinson", "miley cyrus", "kristen stewart", "noah wyle"], "correct_choice_idx": 0, "direct_answers": ["many people", "winston churchill", "robert pattinson", "nicholas holt", "someone english", "citizen", "nicholas holt", "englishman", "english people", "winston churchill"], "difficult_direct_answer": false, "rationales": ["The town on the top of the bus is wokingham. this town is in england, not the united states.", "He is the only person that was not born in the us.", "Robert pattinson was born there."], "image": "train2014/COCO_train2014_000000349074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450940, "question_id": "nRuweoMjyfauKoMDkLbYNv", "question": "What is the woman on the bike using the bridge to cross over?", "choices": ["grass", "water", "rocks", "debris"], "correct_choice_idx": 1, "direct_answers": ["water", "river", "river", "river", "water", "water", "water", "river", "river", "river"], "difficult_direct_answer": false, "rationales": ["There is a river, not rocks, grass, or debris, to the left of and beneath the bridge.", "A river is to the left of the bridge.", "A woman is riding on a paved bridge and a river can be seen below."], "image": "train2014/COCO_train2014_000000450940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389974, "question_id": "nRxF9qwpENqCaw3NjsCqCs", "question": "What is the collection of photos on the wall called?", "choices": ["spread", "menagerie", "album", "collage"], "correct_choice_idx": 3, "direct_answers": ["collage", "montage", "collage", "collage", "collage", "gallery", "collage", "collage", "collage", "array"], "difficult_direct_answer": false, "rationales": ["The collection is a collage.", "The collection is a collage of photos.", "It is a lot of different photos that are grouped together and in close proximity to each other in an intentional arrangement for display."], "image": "val2014/COCO_val2014_000000389974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568148, "question_id": "nSQRdKNKBSJmXPF7AnitgW", "question": "To whom does the woman want to throw the frisbee?", "choices": ["photographer", "no one", "enemy", "grounds keeper"], "correct_choice_idx": 0, "direct_answers": ["camera person", "cameraman", "photographer", "friend", "friend", "photographer", "photographer", "no one", "photographer", "cameraman"], "difficult_direct_answer": false, "rationales": ["The woman is holding the frisbee and looking right at the person with the camera and there are no other people around.", "To the person taking the picture.", "The woman in question is looking in the direction and eye line of where the photographer would be positioned and looking longingly."], "image": "val2014/COCO_val2014_000000568148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367471, "question_id": "nSr9vYE7YEU9G8kQPfjiHF", "question": "The thing sitting on the couch is the girl's what?", "choices": ["pet", "father", "mother", "sister"], "correct_choice_idx": 0, "direct_answers": ["pet", "cat", "pet", "pet", "cat", "cat", "pet", "pet", "cat", "pet"], "difficult_direct_answer": false, "rationales": ["Cats are usually kept by humans for companionship.", "The cat is the pet that belongs to the girl.", "It's her cat."], "image": "train2014/COCO_train2014_000000367471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126347, "question_id": "nSvvSGvzxRfeXkdKFRGSnw", "question": "To what is this sail attached?", "choices": ["nothing", "shark", "beach comber", "surfer"], "correct_choice_idx": 3, "direct_answers": ["surfboard", "board", "rope", "string", "surfer", "person", "on surfer", "skier", "person", "surfer"], "difficult_direct_answer": false, "rationales": ["The sail is on the surfer.", "The sail is attached by a cord to a surfer in an activity known as parasailing.", "The sail helps the person maintain balance while in the water using the wind."], "image": "train2014/COCO_train2014_000000126347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340674, "question_id": "nSw6uNMAALwjPhuNAtkQn5", "question": "What sort of cooking device is found in the food truck here?", "choices": ["pizza oven", "bun warmer", "hot plate", "deep fryer"], "correct_choice_idx": 3, "direct_answers": ["deep fryer", "fryer", "deep fryer", "fryer", "deep fryer", "fryer", "fryer", "fryer", "fryer", "deep fryer"], "difficult_direct_answer": false, "rationales": ["The donut has been fried.", "The fryer is used.", "The food closest to the camera is only cooked one way."], "image": "train2014/COCO_train2014_000000340674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243386, "question_id": "nTBanLLLm3HAKSLFAd9gE2", "question": "What is this seat used for?", "choices": ["tea time", "watching tv", "restaurant", "showering"], "correct_choice_idx": 3, "direct_answers": ["old people", "showering", "showering", "shower", "shower stability", "sitting", "sitting", "showering safely", "showers", "resting"], "difficult_direct_answer": false, "rationales": ["Shower seats are used for elderly or disabled individuals to be able to shower and not fall down.", "The seat is used to take a shower.", "The chair is in the shower."], "image": "val2014/COCO_val2014_000000243386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454722, "question_id": "nTCdoVTcHp98m4WQt5ZFNC", "question": "What is often the maximum weight each baggage can be in kilograms?", "choices": ["23", "13", "33", "unlimited"], "correct_choice_idx": 0, "direct_answers": ["ten", "2.5", "25", "32kg", "50 pds", "fifty", "ten", "23", "fifty", "50"], "difficult_direct_answer": false, "rationales": ["Luggage is piled up at an airport baggage claim area.", "The maximum that most airports allow is 32 kilograms, so 33 would be the closest number to that.", "The recommendations for checked baggage are: advised maximum weight 23 kg (50.71 lbs)"], "image": "train2014/COCO_train2014_000000454722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141678, "question_id": "nTHh7jNmcVKdXZKGUzirB8", "question": "The person in the lead is wearing what color jacket?", "choices": ["blue", "yellow", "green", "black"], "correct_choice_idx": 2, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The people are clearly visible and the person closer to the bottom and thus in the lead is wearing a green jacket.", "The skier in front is wearing green.", "Two people are skiing down a perfectly grooved ski slope. the person in front has a color matching grass."], "image": "train2014/COCO_train2014_000000141678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217571, "question_id": "nTn7HzQMTnbNVbbuotL4gW", "question": "What maneuver did this plane just do?", "choices": ["landing", "takeoff", "cruising", "evasive"], "correct_choice_idx": 1, "direct_answers": ["take off", "ascent", "takeoff", "takeoff", "take off", "lift off", "take off", "takeoff", "take off", "take off"], "difficult_direct_answer": false, "rationales": ["The plane is headed up in the sky.", "The plane is ascending.", "The plane is obviously taking off, because its nose is tilted upwards as it ascends into the friendly skies."], "image": "train2014/COCO_train2014_000000217571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505080, "question_id": "nUHC9wSX7KUcNbB7HAFq5P", "question": "What is the black object on the back of the vessel used for?", "choices": ["anchoring", "storage", "moving", "mooring"], "correct_choice_idx": 2, "direct_answers": ["motor", "motor", "power", "movement", "power", "moving", "engine", "moving", "engine", "move boat"], "difficult_direct_answer": false, "rationales": ["The motor is black.", "It's the motor.", "The black object is the motor which moves the boat."], "image": "val2014/COCO_val2014_000000505080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220390, "question_id": "nUSLRfaDbgdKiGL7LTLCnX", "question": "What is the most probable location of this town square?", "choices": ["nepal", "indonesia", "bangladesh", "tibet"], "correct_choice_idx": 2, "direct_answers": ["bengali", "china", "india", "india", "bangladesh", "india", "poor country", "afghanistan", "india", "bangladesh"], "difficult_direct_answer": false, "rationales": ["The bus in the image has writing in bengali.", "The town square is really run down with a lot of indians.", "It's bangladesh."], "image": "train2014/COCO_train2014_000000220390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152162, "question_id": "nUXJuZfezmRyyy3qsarSPo", "question": "What are the fences made out of?", "choices": ["metal", "rubber", "plastic", "diamond"], "correct_choice_idx": 0, "direct_answers": ["barbed wire", "metal", "chainlink", "metal", "steel", "metal", "metal", "iron rod", "metal", "metal"], "difficult_direct_answer": false, "rationales": ["The material is sturdy and also rusting.", "The fences here are made of metal.", "They are chain link fences. chain link is made from silver hard material."], "image": "train2014/COCO_train2014_000000152162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283780, "question_id": "nUYNKgvyXXqfoG3TeaY3iZ", "question": "What utensil is in full view on the table alongside a knife and spoon?", "choices": ["spatula", "tongs", "fork", "pizza cutter"], "correct_choice_idx": 3, "direct_answers": ["pizza cutter", "pizza cutter", "plate", "pizza cutter", "pizza cutter", "plate", "pizza cutter", "pizza cutter", "pizza cutter", "plate"], "difficult_direct_answer": false, "rationales": ["The utensil is a round blade designed to slice pizzas and pies by rolling over them.", "The sharp circle with handle of the device in the middle of this table identifies it as a pizza slicer.", "There are three utensils in view; a knife, a spoon and one that has a round sharp metal wheel attached to it. since it is located next to a pizza, it would make sense that utensil is a pizza cutter."], "image": "train2014/COCO_train2014_000000283780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377772, "question_id": "nUdNXLAuYhmiCLmssZP3dc", "question": "What type of bed is shown?", "choices": ["queen", "king", "air mattress", "bunk bed"], "correct_choice_idx": 3, "direct_answers": ["bunk", "bunked", "bunk", "bunk bed", "bunk bed", "bunk", "bunk", "bunk bed", "bunk", "bunk bed"], "difficult_direct_answer": false, "rationales": ["A bed frame holding two beds, one above the other is shown. bunk beds have two beds.", "There are 2 beds.", "There is a double decker bed visible which is commonly known as a bunk bed."], "image": "train2014/COCO_train2014_000000377772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166631, "question_id": "nVBswE6Qw66J3p8GvdyXMB", "question": "What are the helmets shells made from?", "choices": ["plastic", "steel", "foam", "clay"], "correct_choice_idx": 0, "direct_answers": ["plastic", "plastic", "plastic", "fiberglass", "plastic", "plastic", "plastic", "plastic", "metal", "plastic"], "difficult_direct_answer": false, "rationales": ["Helmets are most commonly made with a plastic shell and the helmets visible look like standard helmets that would be made from the most common materials.", "The helmets material looks sleek and shiny which is consistent with answer a and is commonly known to be the material of helmets.", "The men are wearing helmets that have shells made from plastic."], "image": "train2014/COCO_train2014_000000166631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262119, "question_id": "nVFwt99j4zYs5kzeTezhBL", "question": "What is in the air?", "choices": ["bird", "car", "airplane", "man"], "correct_choice_idx": 3, "direct_answers": ["guy", "skateboard", "skateboarder", "skateboarder", "skateboarder", "person", "skateboarder", "man", "person", "skateboarder"], "difficult_direct_answer": false, "rationales": ["The man is in the air.", "He is doing a trick by jumping up off the skateboard", "A man is flying through the air on his skateboard."], "image": "val2014/COCO_val2014_000000262119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162580, "question_id": "nVN4rZNysKTWcsk9JiQWeJ", "question": "What does the child cuddle?", "choices": ["doll", "teddy bear", "barbie", "child"], "correct_choice_idx": 1, "direct_answers": ["stuffed animal", "teddy bear", "teddy bear", "teddy bear", "baby", "teddy bear", "baby doll", "teddy bear", "bear", "stuffed animal"], "difficult_direct_answer": false, "rationales": ["The child is holding a furry friend.", "The child has a stuffed bear.", "A child is holding a stuffed bear. children like stuffed animals."], "image": "val2014/COCO_val2014_000000162580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312033, "question_id": "nVTqCFRBaS3CFcaRUNH6AR", "question": "What color vest does the person batting next wear?", "choices": ["red", "green", "black", "white"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "black", "black", "blue", "blue", "blue", "black", "brightly colored"], "difficult_direct_answer": false, "rationales": ["It looks blue to me, but this is the closest other option.", "The person batting has a black vest.", "The baseball player wore the dark team colors."], "image": "val2014/COCO_val2014_000000312033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266908, "question_id": "nVVreed5yA85VT9fmwwxbD", "question": "Given the toppings who would best enjoy eating this kind of pizza?", "choices": ["meat lovers", "vegetarians", "vegans", "everyone"], "correct_choice_idx": 0, "direct_answers": ["pepperoni lover", "meat lover", "pepperoni lovers", "people", "pepperoni lovers", "meat lovers", "carnivore", "meat lovers", "kids", "meat lover"], "difficult_direct_answer": false, "rationales": ["The pizza appears to be topped with pepperoni, which is a dry cured meat.", "Pepperoni is made out of multiple types of meat.", "The pizza has a lot of pepperoni on it which is made of pork."], "image": "train2014/COCO_train2014_000000266908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229969, "question_id": "nVZevYHYpBJejoHckpvdcA", "question": "What is missing in the picture that is typical at beaches?", "choices": ["umbrellas", "beach towels", "fording chairs", "buckets"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "beach umbrellas", "umbrellas", "umbrellas", "sun", "umbrellas", "dogs", "umbrellas", "surfboards", "towel"], "difficult_direct_answer": false, "rationales": ["They give shade from the sun.", "Nobody is in the shade.", "On a hot day there are typically umbrellas for people to sit under to take refuge from the hot sun and uv rays."], "image": "train2014/COCO_train2014_000000229969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465353, "question_id": "nVkwi5PGSBGJxeiGYMhtn8", "question": "What is the terrain near the parking lot?", "choices": ["beach", "urban center", "mountainous", "deep valley"], "correct_choice_idx": 2, "direct_answers": ["concrete", "mountain", "dirt", "mountainous", "same terrain", "mountainous", "mountainous", "mountainous", "mountains", "mountains"], "difficult_direct_answer": false, "rationales": ["The area appears elevated based on background terrain visible and the area appears to fall off just beyond the edge of the road.", "There are mountains in the background.", "The landform in the background is very high."], "image": "train2014/COCO_train2014_000000465353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450833, "question_id": "nVu8XTtJNS7AudYM9Qkzb9", "question": "What do the cones mark?", "choices": ["holes", "finish", "lanes", "danger"], "correct_choice_idx": 2, "direct_answers": ["lanes", "slaloms", "course", "spacers", "pathway", "lanes", "specific angles", "course", "slalom points", "course"], "difficult_direct_answer": false, "rationales": ["They're lanes.", "The way they need to skate to stay in line.", "They have to maneuver around these on their skateboards"], "image": "train2014/COCO_train2014_000000450833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217793, "question_id": "nWXPXYRzWmZwoc5mgucp8o", "question": "How many people can this area accommodate comfortably?", "choices": ["none", "four", "one", "two"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four different chairs so four people would be best for this area.", "There are four chairs by the beach that people can sit in comfortable.", "There are 4 people."], "image": "train2014/COCO_train2014_000000217793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17696, "question_id": "nWbzTpxtsMstMFDknSXwKb", "question": "What is the structure covered with snow called?", "choices": ["ski jump", "ferris wheel", "obstacle", "slalom"], "correct_choice_idx": 0, "direct_answers": ["ramp", "ramp", "slope", "ramp", "ski jump", "ramp", "jump", "ramp", "slope", "ramp"], "difficult_direct_answer": false, "rationales": ["It is stacked up and angled like a wooden one would be", "The area is where people can jump from.", "People are skiing. they are making it into a ramp."], "image": "train2014/COCO_train2014_000000017696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578130, "question_id": "nX6Rsx8MB4yjFEohqkHxiP", "question": "What color is the seat of the oriental chair on top of the rug?", "choices": ["red", "white", "blue", "yellow"], "correct_choice_idx": 2, "direct_answers": ["blue", "light brown", "blue", "brown", "blue", "tan", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The chair is identifiable based on the text of the question and the seat cushion color is visible through the siding of the chair.", "That's what color the seat is.", "There is a bamboo chair on the oriental rug with a blue seat."], "image": "val2014/COCO_val2014_000000578130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188067, "question_id": "nXDWe8fU28opU9qi8ipMVL", "question": "What player wears the same jersey number of the boy but plays a different sport?", "choices": ["michael jordan", "mike trout", "freddie freeman", "wayne gretzky"], "correct_choice_idx": 2, "direct_answers": ["jalen ramsey", "joe dimaggio", "freddie freeman", "unknown", "joe dimaggio", "no idea", "number 5", "jalen ramsey", "kevin garnet", "many players"], "difficult_direct_answer": false, "rationales": ["The number on the player is clearly visible and professional players across different sports that share the number is internet searchable.", "Wayne gretzky, michael jordan, and mike trout wore 99, 23, and 27, respectively.", "He has the number 5 for the atlanta braves baseball team"], "image": "val2014/COCO_val2014_000000188067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128608, "question_id": "nXZPwvVxb537m4TaG4MyjZ", "question": "The blue translucent container in the lower right corner dispenses what?", "choices": ["candy", "mints", "water", "paper towels"], "correct_choice_idx": 2, "direct_answers": ["water", "water", "water", "water", "water", "water", "water", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["Often found in offices and sometimes in homes, that blue container is a traditional water cooler.", "That is a bottle that goes in a dispenser", "It has the color and texture consistent with that of a large water jug."], "image": "train2014/COCO_train2014_000000128608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64799, "question_id": "nXaFGURGYCFMQyUQU2up2R", "question": "What is the woman wearing?", "choices": ["camouflage hat", "scarf", "purse", "bandana"], "correct_choice_idx": 0, "direct_answers": ["hat", "necklace", "hat", "camouflage hat", "hat", "tshirt", "hat", "camo hat", "necklace", "hat"], "difficult_direct_answer": false, "rationales": ["It is splotchy iwth greens and browns and normally worn by military personnel", "She has a camo hat on.", "That's what she has on her head."], "image": "train2014/COCO_train2014_000000064799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299704, "question_id": "nXfgxcwzgraG3ciaeRSbaz", "question": "What is the gray electronic device mounted above the red sign on the left?", "choices": ["clock", "computer", "command center", "security camera"], "correct_choice_idx": 3, "direct_answers": ["security camera", "security camera", "camera", "security camera", "security camera", "camera", "camera", "security camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The device is a camera.", "There is a security camera mounted against the side of this building.", "This is to watch people and make sure there are no crimes. they mount them high so it's difficult for people to mess with them"], "image": "train2014/COCO_train2014_000000299704.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577355, "question_id": "nYGqemha8tWYunJaexBtcV", "question": "The set of parallel electric lines are used to power what on the road below?", "choices": ["bus", "signage", "tram", "traffic lights"], "correct_choice_idx": 3, "direct_answers": ["voltage", "traffic lights", "train", "narrow", "traffic lights", "street lights", "electric circuits", "traffic lights", "train", "traffic lights"], "difficult_direct_answer": false, "rationales": ["There are lights above the poles.", "The traffic lights need electricity.", "There are wires going directly to the pole holding the signals."], "image": "val2014/COCO_val2014_000000577355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405849, "question_id": "nYYnQykdwyxQ5j2Fv6pmXu", "question": "What vehicle are the people riding on?", "choices": ["jeep", "double decker", "train", "van"], "correct_choice_idx": 1, "direct_answers": ["carts", "doubledecker bus", "tourist bus", "jeep", "truck", "bus", "double decker", "double decker", "sightseeing bus", "vehicle"], "difficult_direct_answer": true, "rationales": ["They appear to be riding on the top of a bus.", "The vehicle is high off the ground, almost hitting the traffic light, and can fit multiple people.", "The people are riding a vehicle that can seat a large amount of people and is tall enough to almost hit the traffic light."], "image": "train2014/COCO_train2014_000000405849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454923, "question_id": "nYZAvnv3JaBSZC5B89UmEz", "question": "What wrestler has a similar first name to the word that appears above power?", "choices": ["jerry lynn", "brock lesnar", "chris candido", "alex wright"], "correct_choice_idx": 1, "direct_answers": ["rock", "rock", "brock lesnar", "rock", "rock", "brock lesnar", "rock", "brock", "bock", "unknown"], "difficult_direct_answer": false, "rationales": ["People sit at a business with the name on the sign above.", "The word above power is bock.", "The word in the background is \"bock\" which is similar to the name \"brock\" which is the first name of a wrestler named brock lesnar."], "image": "train2014/COCO_train2014_000000454923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214737, "question_id": "nYeYiszK23gERk5dSeSVej", "question": "What ensigns are shown at the top most part of this building?", "choices": ["coatsof arms", "fleursdi lis", "clocks", "flags"], "correct_choice_idx": 0, "direct_answers": ["american", "family crest", "coatsof arms", "exit", "railings", "american", "shields", "crests", "service branches", "statues"], "difficult_direct_answer": true, "rationales": ["This building is old and the design on top of the clock is an insignia of the people who previously owned the building.", "There is a clock on the building.", "The signs are showing shields."], "image": "val2014/COCO_val2014_000000214737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570069, "question_id": "nYgZMBu2VHcED8s57eD25R", "question": "What is the man in the green sweater on the left doing?", "choices": ["reading", "exercising", "dancing", "photographing"], "correct_choice_idx": 3, "direct_answers": ["recording", "photographing", "taking pictures", "dodging", "taking photo", "hitting ball", "taking photo", "taking pictures", "standing", "taking picture"], "difficult_direct_answer": false, "rationales": ["The man is snapping a photo.", "He is holding a camera up toward the frisbee game and looking through the lens.", "He is taking a picture."], "image": "train2014/COCO_train2014_000000570069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309005, "question_id": "nYuA2urWGascRjoxSEVAWZ", "question": "Which object in the room is the most mobile?", "choices": ["baby carriage", "table", "mirror", "television"], "correct_choice_idx": 0, "direct_answers": ["stroller", "baby carriage", "vehicle", "chair", "stroller", "stroller", "tricycle", "stroller", "laundry rack", "baby carriage"], "difficult_direct_answer": false, "rationales": ["The most mobile item is going to be the one that is easiest to move. only one of the items has wheels.", "It has wheels and can be pushed.", "This has wheels and can be pushed."], "image": "val2014/COCO_val2014_000000309005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399968, "question_id": "nZ8scykE4ErWUmD458u6Zn", "question": "Why does he have the racquet behind him?", "choices": ["strike ball", "bad arm", "stole it", "hiding it"], "correct_choice_idx": 0, "direct_answers": ["already swung", "strike ball", "follow through", "back swing", "hitting ball", "back swing", "balance", "increase power", "swinging", "to return"], "difficult_direct_answer": true, "rationales": ["While and immediately after doing this, a person's arm is often extended out from the body in a way that helps them keep their balance.", "The man wants to hit the ball.", "The man is playing tennis. he is preparing to swing the racket."], "image": "val2014/COCO_val2014_000000399968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209763, "question_id": "nZQjmankCnfcjniapLC5yM", "question": "What is the purpose of the stuffed teddy bear in the statue's hand?", "choices": ["dress code", "historical significance", "symbolic", "decorative"], "correct_choice_idx": 3, "direct_answers": ["decoration", "joke", "nothing", "compassion", "comedy", "peace", "decorative", "offering", "comfort", "symbolic pacifism"], "difficult_direct_answer": true, "rationales": ["Someone left the bear there. it is not part of the statue.", "Someone must have put the teddy bear on the statue to decorate it.", "The bear is just decorative."], "image": "val2014/COCO_val2014_000000209763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478575, "question_id": "nZcf7xXNRZ7AZ7TcnJcq3E", "question": "What was the first name of Mr. Harley?", "choices": ["sean", "john", "william", "mark"], "correct_choice_idx": 2, "direct_answers": ["william", "harley", "davidson", "william", "harley", "william", "davidson", "william", "william", "william"], "difficult_direct_answer": false, "rationales": ["William s. harley was one of the founders of this motorcycle company.", "William harley is the full name.", "The answer is internet searchable based on the company displayed and last name given in the question."], "image": "val2014/COCO_val2014_000000478575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250834, "question_id": "nZjoYJBz76HnZWDqYyBKkR", "question": "Which Salem is in the picture?", "choices": ["missouri", "massachusetts", "oregon", "connecticut"], "correct_choice_idx": 2, "direct_answers": ["oregon", "virginia", "oregon", "oregon", "oregon", "california", "usa", "city", "massachusetts", "pennsylvania"], "difficult_direct_answer": false, "rationales": ["It's in oregon.", "The sign on the left refers to interstate 5. this freeway runs along the west coast of the united states.", "This interstate is on the west coast"], "image": "train2014/COCO_train2014_000000250834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100081, "question_id": "nZn6DG7N8JjcZx9eT8cden", "question": "What natural element might interrupt the frisbee here?", "choices": ["sleet", "tornado", "wave", "storm"], "correct_choice_idx": 2, "direct_answers": ["water", "wave", "wave", "wave", "wave", "wind", "wind", "water", "wind", "water"], "difficult_direct_answer": false, "rationales": ["A wave might interrupt the frisbee here.", "The element is a wave.", "If it is low enough to the water then it could hit the frisbee as it rolls into shore"], "image": "val2014/COCO_val2014_000000100081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487530, "question_id": "nZvHphUgdEQ8CZgxg4Z9Vb", "question": "What is the man at the curb sitting down doing?", "choices": ["crossing street", "selling oranges", "sleeping", "panhandling"], "correct_choice_idx": 3, "direct_answers": ["resting", "begging", "taking break", "resting", "resting", "waiting", "waiting bus", "waiting", "hitchhiking", "panhandling"], "difficult_direct_answer": false, "rationales": ["You can't cross a street or sell oranges when sitting down. and we usually don't sit down while we sleep.", "He is resting as he waits for people to ask for money", "The man is most likely homeless and asking for money from stopped motorists."], "image": "val2014/COCO_val2014_000000487530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302551, "question_id": "na35SmwmuHyu4v9XsyAEMz", "question": "What does the woman intend to do?", "choices": ["pull pants", "catch frisbee", "trip man", "catch man"], "correct_choice_idx": 1, "direct_answers": ["grab frisbee", "tag", "catch frisbee", "catch frisbee", "hit", "pants guy", "catch", "catch", "block", "pull shorts"], "difficult_direct_answer": false, "rationales": ["She is trying to grab his pants.", "She is trying to pull on his pants to stop him.", "The woman seems to try to pull the mans paint as evident in the picture."], "image": "train2014/COCO_train2014_000000302551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254379, "question_id": "naEtPdEveHxyPpss6iZBZn", "question": "What was the first item manufactured by the company that makes the remote?", "choices": ["pinball machine", "playing cards", "slot machine", "comic books"], "correct_choice_idx": 1, "direct_answers": ["wii", "wii", "wii", "sensor magnetic", "nintendo", "unknown", "nes", "nintendo", "hanafuda cards", "playing cards"], "difficult_direct_answer": false, "rationales": ["Nintendo is the manufacture of the wii game console and they are know first to make playing cards until they got into making game consoles.", "Nintendo started out as gaming cards.", "It is nintendo wii."], "image": "val2014/COCO_val2014_000000254379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191561, "question_id": "nai2HgaSzMnqS5TEP8rXVB", "question": "What is the contents of the donuts with no holes?", "choices": ["jelly", "meat", "water", "spinach"], "correct_choice_idx": 0, "direct_answers": ["custard", "jelly", "jelly", "cream filling", "jelly", "cream jelly", "jelly", "jelly", "jelly filling", "jelly"], "difficult_direct_answer": false, "rationales": ["The donut with no holes in it has a jelly filling.", "Donuts do not contain meat, spinach, or water.", "The contents are jelly."], "image": "train2014/COCO_train2014_000000191561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95251, "question_id": "najVXW5gNE9MR4VqoCztM6", "question": "Why are the men wearing red lanyards?", "choices": ["for protection", "for style", "for fun", "for work"], "correct_choice_idx": 3, "direct_answers": ["name tag", "for work", "identification", "holding ids", "playing wii", "cast", "identification", "hold badges", "employees", "identification"], "difficult_direct_answer": false, "rationales": ["The men are working.", "The men need the lanyards to do work.", "Many jobs have the employees wear badges around their necks."], "image": "val2014/COCO_val2014_000000095251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186368, "question_id": "nb62ygoRGccqZ3QFHxTtKE", "question": "What is the blue thing in front of the woman intended for?", "choices": ["microphone", "breathing", "drinking water", "recording"], "correct_choice_idx": 2, "direct_answers": ["water bottle", "drinking", "drink water", "drinking water", "sipping water", "drinking", "drinking water", "drinking water", "drinking", "water"], "difficult_direct_answer": false, "rationales": ["The blue thing is for water.", "This is a straw and that is what it is used for.", "This looks like a straw and would be used to drink."], "image": "val2014/COCO_val2014_000000186368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80931, "question_id": "nbAAqAcJBtRt83V6DWR6yC", "question": "What are the two putting on the dough?", "choices": ["jelly", "sauce", "icing", "ice cream"], "correct_choice_idx": 1, "direct_answers": ["sauce", "pizza sauce", "sauce", "family", "tomato sauce", "sauce", "tomato sauce", "sauce", "sauce", "pizza sauce"], "difficult_direct_answer": false, "rationales": ["It is red and made from tomatoes and a common topping for pizza", "They are making a pizza and the red stuff is the sauce.", "They put sauce on."], "image": "train2014/COCO_train2014_000000080931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31273, "question_id": "nbAh4ueYQEUnw9k5W3JfNu", "question": "The lizard in the sign holds the same equipment as does who seen here?", "choices": ["coach", "catcher", "batter", "noone"], "correct_choice_idx": 2, "direct_answers": ["batter", "batter", "batter", "batter", "batter", "baseball bat", "bat", "hitter", "place", "bat"], "difficult_direct_answer": false, "rationales": ["It's the geico gecko and he's holding a bat.", "The lizard is holding a bat on his shoulder just like the player in the batter's box has over his.", "A lizard and a batter are both holding bats."], "image": "train2014/COCO_train2014_000000031273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69355, "question_id": "nbP3mojS62hZqdP8fDNpKX", "question": "What special group of people are accommodated in the bus?", "choices": ["pregnant women", "blind", "handicapped", "elderly"], "correct_choice_idx": 2, "direct_answers": ["handicap", "handicapped", "handicapped", "handicapped", "handicapped", "handicapped", "handicapped", "disabled", "tourists", "hotel workers"], "difficult_direct_answer": false, "rationales": ["The handicapped people can ride on this bus because there is a handicapped sign on it", "A blue sign with a wheelchair is in the window of a bus.", "The blue label on the windshield signifies that the bus is wheelchair-accessible."], "image": "train2014/COCO_train2014_000000069355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298933, "question_id": "nbcK6tQYnctSS4UjkQt8Kc", "question": "Which animal does closely resemble the kite flown above the beach by the young child?", "choices": ["bird", "butterfly", "dragonfly", "dragon"], "correct_choice_idx": 1, "direct_answers": ["butterfly", "penguin", "butterfly", "butterfly", "butterfly", "duck", "butterfly", "butterfly", "butterfly", "penguin"], "difficult_direct_answer": false, "rationales": ["This insect has prominent and flashy triangular wings.", "The animal is a butterfly.", "The kite resembles a butterfly."], "image": "val2014/COCO_val2014_000000298933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183219, "question_id": "nbcNrv3FtbuFBe9FTSgQmU", "question": "People gathered here will enjoy what type of art?", "choices": ["sand art", "music", "crayons", "painting"], "correct_choice_idx": 1, "direct_answers": ["jazz music", "jazz", "jazz", "jazz", "music", "jazz", "alan", "jazz", "music", "musicjazz festival"], "difficult_direct_answer": false, "rationales": ["The sign says jazz.", "People are gathered outside together.", "There is a sign for jazz in the background."], "image": "train2014/COCO_train2014_000000183219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494261, "question_id": "nbfThopmun2GY9kfpX3mAf", "question": "The orange bear is made of what material?", "choices": ["cotton", "wool", "polyester", "denim"], "correct_choice_idx": 2, "direct_answers": ["polyester", "plastic", "ballon", "rubber", "plastic", "plastic", "balloon", "plastic", "balloon", "nylon"], "difficult_direct_answer": false, "rationales": ["The bear is a kite which are usually made out of polyester.", "The bear is made of polyester.", "Most kites are made of a light fabric so it can be less air resistant."], "image": "train2014/COCO_train2014_000000494261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36614, "question_id": "nbvCBzs6H4GEgsyMY4PYNi", "question": "What is in the plate further away?", "choices": ["pancakes", "omelette", "fries", "pizza"], "correct_choice_idx": 2, "direct_answers": ["burger", "burger", "burger", "burger", "burger", "burger", "fries", "burger", "burger", "burger"], "difficult_direct_answer": false, "rationales": ["It has pieces of cut potato which have been cooked in oil.", "The plate has fries.", "There is a burger on the plate and this is the only thing that goes with them normally and none of the others would."], "image": "val2014/COCO_val2014_000000036614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82873, "question_id": "nbwhsyUpwtVXG2KCnaAyJL", "question": "What allows the person in this image to be taller?", "choices": ["sidewalk", "night sky", "heels", "fire hydrant"], "correct_choice_idx": 2, "direct_answers": ["platform shoes", "shoes", "heeled shoes", "heels", "high heels", "heels", "heels", "heels", "high heels", "heels"], "difficult_direct_answer": false, "rationales": ["The bottoms of the shoes have a riser on the back.", "The person is wearing heels and they add some inches on to the persons height.", "Those shoes have high ones."], "image": "train2014/COCO_train2014_000000082873.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237487, "question_id": "ncBEEsNpECW7pZvC7rjbi6", "question": "What city are they in?", "choices": ["chicago", "houston", "boston", "denver"], "correct_choice_idx": 0, "direct_answers": ["chicago", "chicago", "chicago", "chicago", "chicago", "chicago", "chicago", "chicago", "chicago", "chicago"], "difficult_direct_answer": false, "rationales": ["They are in the city of chicago.", "The cubs are playing and there are windy city clothing on the fans.", "The man has a cubs logo."], "image": "val2014/COCO_val2014_000000237487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135845, "question_id": "ncBZdK95siEDmrSVzpoCjv", "question": "Why is the sky getting dark in this location?", "choices": ["dark clouds", "storm incoming", "sun setting", "large tarp"], "correct_choice_idx": 2, "direct_answers": ["sunset", "evening", "sun setting", "nighttime", "sunset", "sun set", "darkness", "sun setting", "sunset", "cloudy"], "difficult_direct_answer": false, "rationales": ["The sky is getting dark because the sun is going down at the end of the day.", "The sun is setting.", "The sun is setting because you can see the sun going down behind the clouds"], "image": "train2014/COCO_train2014_000000135845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121461, "question_id": "ncH4r3fDQ8BYt8xjEMhF3r", "question": "What would be the typical diet of these cows?", "choices": ["grass", "bugs", "trees", "small animals"], "correct_choice_idx": 0, "direct_answers": ["grass", "grass grain", "grass", "grass", "grass", "grass grain", "grass", "grass", "grass", "grass"], "difficult_direct_answer": false, "rationales": ["The cows graze on the grass.", "The cows eat grass.", "They would eat grass."], "image": "train2014/COCO_train2014_000000121461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156341, "question_id": "ncSWphGL7yhF9nxShYTKpA", "question": "What player does 28 focus on now?", "choices": ["coach", "pitcher", "catcher", "outfielder"], "correct_choice_idx": 1, "direct_answers": ["pitcher", "pitcher", "pitcher", "baseball pitch", "pitcher", "ball", "ball", "baseball", "pitcher", "pitcher"], "difficult_direct_answer": false, "rationales": ["He is looking at the person about to throw the ball to the catcher.", "The man is waiting to hit.", "He is watching the person that will be throwing the ball at him so he can hit it with his bat."], "image": "val2014/COCO_val2014_000000156341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8195, "question_id": "ncthKB8DHHyYrt7czXLYiT", "question": "What part is missing on all the toilets?", "choices": ["seat", "lid", "tank", "bowl"], "correct_choice_idx": 2, "direct_answers": ["tank", "tank", "tank", "tank", "tank", "tank", "tank", "tank", "tank", "tank"], "difficult_direct_answer": false, "rationales": ["One of the essential parts of a toilet is the tank. this part is obviously missing from these toilets.", "There is no part to hold the water.", "None of the toilets have a tank on them."], "image": "train2014/COCO_train2014_000000008195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239464, "question_id": "ndCXBbkxFPqHNCvELuC9PG", "question": "What is in the white basket near the woman wearing red?", "choices": ["vegetables", "dog", "cat", "baby"], "correct_choice_idx": 3, "direct_answers": ["baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby"], "difficult_direct_answer": false, "rationales": ["This is a carriage for small children", "The woman has a baby with a cap on in the basket.", "There are no non-human animals or vegetables in the basket. there is a human in the basket."], "image": "train2014/COCO_train2014_000000239464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105647, "question_id": "ndHyAVgY5SAog6KkevquRR", "question": "What type of area is shown?", "choices": ["urban", "rural", "forest", "coastal"], "correct_choice_idx": 0, "direct_answers": ["busy street", "street", "transport area", "indian market", "india", "urban", "city", "market", "urban street", "urban"], "difficult_direct_answer": true, "rationales": ["With the buildings and traffic, that would be correct.", "It is busy and crowded with many people, buildings and vehicles.", "The area is urban."], "image": "val2014/COCO_val2014_000000105647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489961, "question_id": "ndsLdYvdhJzfwRN9ZMJHXB", "question": "Who is the sign for?", "choices": ["animals", "drivers", "pedestrians", "bicyclists"], "correct_choice_idx": 1, "direct_answers": ["everyone driving", "cars", "driver", "drivers", "drivers", "drivers", "traffic", "cars", "drivers", "drivers"], "difficult_direct_answer": false, "rationales": ["The one-way sign is for drivers so they know which direction traffic is going.", "It is a right of way sign for cars.", "The sign is posted on a street pole, and cars are driven in the street. cars must obey one-way signs to avoid accidents."], "image": "train2014/COCO_train2014_000000489961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57088, "question_id": "ne5o3APde8d8J4Ugk78S5z", "question": "What sandwich shares a name with the buses?", "choices": ["reuben", "submarine", "double-decker", "open face"], "correct_choice_idx": 2, "direct_answers": ["double decker", "double", "double decker", "double decker", "doubledecker", "double decker", "double decker", "hamburger", "double-decker", "double decker"], "difficult_direct_answer": false, "rationales": ["Many red buses are double decker buses.", "A double decker sandwich has two levels, just like these buses.", "The buses have two decks."], "image": "train2014/COCO_train2014_000000057088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220176, "question_id": "ne8QKPXYDinHKa27VuFiVS", "question": "Why does the man stand so strangely here?", "choices": ["disoriented", "posing", "he's ill", "having seizure"], "correct_choice_idx": 1, "direct_answers": ["trick", "floating air", "jumping", "posing", "he's weird", "floating", "posing", "falling", "trick photography", "photoshoped"], "difficult_direct_answer": true, "rationales": ["The man is posing.", "There's a camera flash, and the man is looking towards the camera.", "The man is looking at the camera."], "image": "val2014/COCO_val2014_000000220176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209859, "question_id": "neKTyQuB64gHnkysTy7BcW", "question": "What do the trains likely carry?", "choices": ["kids", "fuel", "cargo", "livestock"], "correct_choice_idx": 2, "direct_answers": ["people", "freight", "freight", "freight", "cargo", "passengers", "passengers", "passengers", "people", "passengers"], "difficult_direct_answer": false, "rationales": ["It says so on the side of the train.", "The trains are boxlike and really worn. they don't need to look attractive for passengers.", "These trains are fully enclosed and have little or no ventilation. they would not be carrying people, animals or fuel. mostly likely some sort of cargo."], "image": "train2014/COCO_train2014_000000209859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9077, "question_id": "neQYjLhwUL3aDyV4ELueyQ", "question": "Where is this image taken?", "choices": ["store", "meat market", "hotel", "gas station"], "correct_choice_idx": 0, "direct_answers": ["market", "market", "store", "market", "produce stand", "food market", "market", "market", "market", "fruit market"], "difficult_direct_answer": false, "rationales": ["The image is at a store.", "It is the produce section of a grocery store.", "There are several offerings of fruits and vegetables."], "image": "val2014/COCO_val2014_000000009077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324754, "question_id": "neYnw4RhdiVywrdTVkGjBo", "question": "Which bus company owns this bus?", "choices": ["metroline", "pimlico", "vwh", "volvo"], "correct_choice_idx": 0, "direct_answers": ["metroline", "metroline", "metroline", "metroline", "metroline", "metroline", "metroline", "metroline", "metroline", "metroline"], "difficult_direct_answer": false, "rationales": ["The front of the bus says metroline.", "The bus says metroline on the front.", "The company is metroline."], "image": "train2014/COCO_train2014_000000324754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437139, "question_id": "necsCdHBLGX4qsbYeuq4sg", "question": "The lines on the sand were made by what part of a vehicle?", "choices": ["trunk", "tires", "motor", "bumper"], "correct_choice_idx": 1, "direct_answers": ["tires", "tires", "atv", "tires", "beach cruiser", "wheels", "tires", "wheels", "tires", "tires"], "difficult_direct_answer": false, "rationales": ["Narrow, straight tracks can be seen in the sand, equal distance apart, denoting a vehicle track.", "People walk on a beach and two long lines can be seen in the sand.", "The lines came from the tires."], "image": "train2014/COCO_train2014_000000437139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138401, "question_id": "neczE4FbrDTnevcR9GnWw4", "question": "What is the translation of the team's name?", "choices": ["fathers", "seals", "parrots", "cowboys"], "correct_choice_idx": 0, "direct_answers": ["priest", "fathers", "fathers", "pacers", "baseball", "friends", "padres", "parents", "priest", "fathers"], "difficult_direct_answer": false, "rationales": ["The padres are fathers.", "Padres is another name for dad.", "These are the padres, which is spanish for fathers."], "image": "train2014/COCO_train2014_000000138401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533022, "question_id": "nekDfRUXuGfND8zZc4nZ8L", "question": "What device is the boy holding?", "choices": ["television", "smartphone", "laptop", "tablet"], "correct_choice_idx": 3, "direct_answers": ["smart phone", "tablet", "tablet", "tablet", "tablet", "tablet", "tablet", "ipad", "skateboard", "tablet"], "difficult_direct_answer": false, "rationales": ["The device is handheld, but it is bigger than a phone.", "The boy has an ipad.", "The device has a screen and is the size, shape, style and is being held and used in the manner consistent with answer a."], "image": "train2014/COCO_train2014_000000533022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6998, "question_id": "nepBbY574LAnHhBfjnkdvJ", "question": "What is the area where the boy is riding his bike?", "choices": ["sidewalk", "street", "boardwalk", "bike lane"], "correct_choice_idx": 2, "direct_answers": ["boardwalk", "sidewalk", "beach", "path", "pathway", "boardwalk", "beach", "path", "boardwalk", "boardwalk"], "difficult_direct_answer": false, "rationales": ["The boy is riding a bike along a path near a beach with shops along it. these are elements that would be found on answer a.", "The boy is on his bike on a cement path on the beach.", "The area is a boardwalk at the beach."], "image": "train2014/COCO_train2014_000000006998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67470, "question_id": "nfH4FqVnSkQ9Q3RrwsrLuC", "question": "What is the white object on the bike frame used for?", "choices": ["collecting bugs", "engine fuel", "collecting trash", "drinking"], "correct_choice_idx": 3, "direct_answers": ["drinking water", "water", "water bottle", "drinking", "holding water", "carrying stuff", "sitting", "baggage", "standing", "standing up"], "difficult_direct_answer": true, "rationales": ["The white object is a water bottle.", "The object is being held in a manner consistent with a bottle and the object is the size and shape consistent with one that would hold a liquid for the purposes of answer a.", "The object is used for drinking."], "image": "train2014/COCO_train2014_000000067470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367142, "question_id": "nfLPXRNESHvNVgfGTdqpdi", "question": "Which term would best describe this woman?", "choices": ["quadragenarian", "tricenarian", "quinquagenarian", "octogenarian"], "correct_choice_idx": 3, "direct_answers": ["elderly", "old", "octogenarian", "old", "old", "elderly", "elderly", "elderly", "elderly", "elderly"], "difficult_direct_answer": false, "rationales": ["The woman in the kitchen looks to be in her eighties which would make her an octogenarian.", "She appears to be in her eighties.", "The exact age of the woman is not known, but she appears to be elderly based on her face and hair color. her age is likely closest to that which falls into the category of answer a."], "image": "val2014/COCO_val2014_000000367142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444576, "question_id": "nfNweQouc7zL7tUbk3GfcM", "question": "What's the name for the type of car in green?", "choices": ["convertible", "affordable", "all terrain", "sedan"], "correct_choice_idx": 0, "direct_answers": ["convertible", "convertible", "mata", "convertible", "convertible", "convertible", "convertible", "convertible", "mata", "convertible"], "difficult_direct_answer": false, "rationales": ["There is no top on this motorized road vehicle.", "A car with no top is in the street.", "It has a roof that can be raised or stored depending on the weather."], "image": "val2014/COCO_val2014_000000444576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111955, "question_id": "nfSCuaUtbFmQXVubA6qVgK", "question": "What is the man doing on the board?", "choices": ["ollie", "kickflip", "grind", "heelflip"], "correct_choice_idx": 0, "direct_answers": ["ollie", "kick flip", "trick", "jumping", "jumping", "ollie", "skating", "jumping", "ollie", "trick"], "difficult_direct_answer": false, "rationales": ["A guy is jumping up on a skateboard.", "He is attempting an ollie since his body is more towards the back.", "The man jumped off the ground. he is not flipping or grinding."], "image": "train2014/COCO_train2014_000000111955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365214, "question_id": "nfaMHhRVnhsLSNmyex7s3z", "question": "The coffee mug is placed on the bench in which structure?", "choices": ["bus stop", "cafe", "news stand", "phone booth"], "correct_choice_idx": 0, "direct_answers": ["bus stop", "plastic mug", "bus stop", "cup", "bus stop", "bus stop", "bus stop", "right", "bus stop", "bus stop"], "difficult_direct_answer": false, "rationales": ["There are glass panels behind the bench", "The bench is bolted to a structure commonly seen in bus stops.", "The bench is for waiting bus passengers to sit on."], "image": "val2014/COCO_val2014_000000365214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128560, "question_id": "nfxzb3jhtn9cgqYWN6HCWR", "question": "What type of board is the man using?", "choices": ["bodyboard", "snowboard", "popsicle board", "longboard"], "correct_choice_idx": 3, "direct_answers": ["cruiser", "skateboard", "skateboard", "skateboard", "skate", "longboard", "skateboard", "skateboard", "longboard", "skateboard"], "difficult_direct_answer": false, "rationales": ["The man is using a longboard.", "The board is very long and has wheels under..", "The man is using a longboard because what he is wearing is longer than the usual skateboard"], "image": "train2014/COCO_train2014_000000128560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438795, "question_id": "nfyS22jPjUECnnPcfcnMD5", "question": "What animals might be found in the pond here?", "choices": ["cats", "dogs", "fish", "elephants"], "correct_choice_idx": 2, "direct_answers": ["elephant", "fish", "elephants", "fish", "elephants", "elephants", "frogs", "elephants", "fish", "elephants"], "difficult_direct_answer": false, "rationales": ["Answer a is a type of animal that is commonly found in a pond and none of the other answers are.", "The animals are fish.", "Fish are the only animals here that live in a pond."], "image": "train2014/COCO_train2014_000000438795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167456, "question_id": "ng4sRhUvPJxLmBWQSr7ASA", "question": "What countries flag is seen on the boats?", "choices": ["sweden", "finland", "iceland", "norway"], "correct_choice_idx": 3, "direct_answers": ["united kingdom", "norway", "finland", "norway", "switzerland", "norway", "country", "norway", "england", "sweden"], "difficult_direct_answer": false, "rationales": ["Norway's flag is shown.", "Red, white, and blue flags are on boats in the water.", "Each flag has a blue and white cross on a red background."], "image": "val2014/COCO_val2014_000000167456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562291, "question_id": "ngi6CVRw7errJv9YW74Xxv", "question": "The train is parked near what type of body of water?", "choices": ["sea", "lake", "marsh", "river"], "correct_choice_idx": 0, "direct_answers": ["train", "ocean", "ocean", "ocean", "ocean", "sea", "sea", "ocean", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["There is a large ocean.", "There's a boat nearby.", "A train is on a dock near a large cruise ship."], "image": "train2014/COCO_train2014_000000562291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1911, "question_id": "ngx9wcEzvxALJRDC3pFuqo", "question": "What powers this mode of transportation?", "choices": ["electricity", "gas", "coal", "dog food"], "correct_choice_idx": 3, "direct_answers": ["dogs", "sled dogs", "sled dogs", "dogs", "dogs", "dogs", "dogs", "dogs", "dog food", "dogs"], "difficult_direct_answer": false, "rationales": ["The canines are expending energy to pull the sleds. canines eat to get energy.", "The dogs are fed food as fuel for the transportation by dog sled.", "The sled is being pulled by animals. the sled does not have an engine that is powered by gas, electricity, or coal."], "image": "train2014/COCO_train2014_000000001911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 450841, "question_id": "nh5B69TikbkdtbK9NZpGaY", "question": "Why do hunters hunt this animal?", "choices": ["ears", "trunk", "ivory tusks", "tail"], "correct_choice_idx": 2, "direct_answers": ["tusks", "ivory tusks", "for sport", "ivory", "ivory", "tusks", "tusks", "tusks", "tusks", "ivory"], "difficult_direct_answer": false, "rationales": ["The animal usually grows white tusks.", "Elephants have ivory tusks, which can make people a lot of money.", "They want ivory."], "image": "train2014/COCO_train2014_000000450841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577125, "question_id": "nhCRaFi7yiBR7jDx9AGn4t", "question": "What's the name of the skate technique the man is doing?", "choices": ["180", "tail slide", "board slide", "manual"], "correct_choice_idx": 3, "direct_answers": ["grind", "jump", "ollie", "trick", "manual", "unknown", "grind", "balancing", "ollie", "ollie"], "difficult_direct_answer": false, "rationales": ["The manual technique is being used.", "The name is manual.", "That the name of the trick he's doing."], "image": "train2014/COCO_train2014_000000577125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345014, "question_id": "nhq2xrPhSyobypa9iDkAHH", "question": "How did the gravel get on the train?", "choices": ["shovel", "conveyer", "ramp", "loader"], "correct_choice_idx": 3, "direct_answers": ["digger machine", "truck", "loader", "loaded there", "bobcat", "tractor", "construction equipment", "tractor", "heavy machinery", "loader"], "difficult_direct_answer": false, "rationales": ["The loader is putting gravel on the train.", "The vehicle putting the gravel on the train is actually putting more gravel on its bucket to move more on to the train.", "The vehicle that loaded the gravel is seen picking more gravel up."], "image": "train2014/COCO_train2014_000000345014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405515, "question_id": "nhv9o2p8pC6xvyx73Ln26w", "question": "What is the kitten doing?", "choices": ["feeding", "vomiting", "mimicry", "sales"], "correct_choice_idx": 0, "direct_answers": ["eating", "feeding", "drinking", "drinking", "drinking", "eating", "eating", "drinking formula", "drinking", "being fed"], "difficult_direct_answer": false, "rationales": ["The kitten is feeding.", "It is drinking from a bottle that is being held and directed towards its mouth by a person.", "The kitten is drinking from a baby bottle, which is the activity defined by option a."], "image": "train2014/COCO_train2014_000000405515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550324, "question_id": "ni2Q3cfLej4cSFrh8ksfwX", "question": "Which object on the table is still alive?", "choices": ["eggs", "carrots", "potted plant", "carton"], "correct_choice_idx": 2, "direct_answers": ["plant", "potted plant", "plant", "plant", "plant", "vegetable", "plant", "potted plant", "plant", "potted plant"], "difficult_direct_answer": false, "rationales": ["The object is the plant.", "There is a potted plant still on the table.", "The carton never was alive. the eggs and carrots are dead."], "image": "train2014/COCO_train2014_000000550324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4266, "question_id": "niAfn7NZDtp9VcbitRGGMm", "question": "What is this activity for?", "choices": ["racing", "photo taking", "practice", "leisure"], "correct_choice_idx": 0, "direct_answers": ["exercise", "skiing", "skiing", "skiing", "slowing down", "skiing", "fun", "racing", "fun", "fun"], "difficult_direct_answer": false, "rationales": ["Flags placed on a skiing trail is usually intending to mark a path that one must take in a race.", "Looks like some poles are up that would be used to compete with to see who is fastest.", "The man is using skis on the slope to race other skiers that are trying to go fast."], "image": "val2014/COCO_val2014_000000004266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284366, "question_id": "niT4cHd2nQWqNrx68x2pEE", "question": "What is the person lunging for?", "choices": ["soccer ball", "frisbee", "pizza slice", "runaway dog"], "correct_choice_idx": 0, "direct_answers": ["ball", "soccer ball", "ball", "ball", "soccer ball", "ball", "ball", "ball", "soccer ball", "catch ball"], "difficult_direct_answer": false, "rationales": ["The person wants to grab the soccer ball from the air.", "They are on a soccer field and the ball is white", "The person wants the ball."], "image": "train2014/COCO_train2014_000000284366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480977, "question_id": "niprZyyt8PnpvaipVhjHXL", "question": "What is the person without a skateboard doing at the skate park?", "choices": ["cheering", "filming", "bandaging", "repairing"], "correct_choice_idx": 1, "direct_answers": ["taking pictures", "photographs", "filming", "filming", "taking photos", "taking pictures", "taking pictures", "photographing", "photos", "taking pictures"], "difficult_direct_answer": false, "rationales": ["The person is filming.", "The person without a skateboard has a camera to film the action,.", "He has a camera in his hand so he can record."], "image": "train2014/COCO_train2014_000000480977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455758, "question_id": "njKdTHP4CLiFpyt9LnSrep", "question": "People in this area are proud of having heritage from which country?", "choices": ["germany", "italy", "france", "ireland"], "correct_choice_idx": 1, "direct_answers": ["germany", "italy", "italy", "bulgaria", "italy", "italy", "italy", "italy", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["Italy's flag is red, green and white.", "This boot country has a green, white and red flag.", "The colors are the same color as the country's flag."], "image": "train2014/COCO_train2014_000000455758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184172, "question_id": "njPtgDBFpZBUSzwmhdTwTf", "question": "What is the man doing with the pole?", "choices": ["kayaking", "paddle boarding", "tennis", "jousting"], "correct_choice_idx": 1, "direct_answers": ["paddleboarding", "rowing", "paddle boarding", "paddling", "navigating", "paddling", "paddling", "surfing", "paddling", "waterboarding"], "difficult_direct_answer": false, "rationales": ["He is standing on a surfboard and using an oar to move forward.", "A man is on a paddle board with a with a paddle in the water. people paddle while on boards to move through the water.", "A man is in a wetsuit and is paddling in the ocean on a board."], "image": "val2014/COCO_val2014_000000184172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524775, "question_id": "njmLoSzyTN73FCgxQPVgRc", "question": "What is the bus doing?", "choices": ["getting passengers", "parked", "being cleaned", "driving"], "correct_choice_idx": 1, "direct_answers": ["parked", "parking", "parking", "parked", "parking", "parking", "parking", "parked", "parked", "being boarded"], "difficult_direct_answer": false, "rationales": ["The bus is parked.", "It is stopped to let passengers on or off of it.", "It is idling in a designated space, as evidenced by the lines painted on the street."], "image": "val2014/COCO_val2014_000000524775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311229, "question_id": "njohwwRonSmSAGmoQVGzrt", "question": "What is the meaning of the arrows on the sign?", "choices": ["merge left", "one way", "turn right", "go straight"], "correct_choice_idx": 3, "direct_answers": ["go straight", "lanes", "direct traffic", "lanes", "forward", "lanes", "drive here", "direction", "lane identification", "vehicle lanes"], "difficult_direct_answer": false, "rationales": ["It is letting people know to stay in those lanes going that direction for different places they need to be", "The arrows are pointing straight ahead on the road and they mean to continue forward to reach those destinations.", "The arrows indicate one should proceed forward."], "image": "train2014/COCO_train2014_000000311229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223481, "question_id": "njrvX9QFSKXHS6oDu3mBzB", "question": "What is the grey car driving on?", "choices": ["parking lot", "dirt", "street", "sand"], "correct_choice_idx": 2, "direct_answers": ["street", "road", "pavement", "road", "road", "pavement", "road", "street", "road", "road"], "difficult_direct_answer": false, "rationales": ["The asphalt surface this car's wheels are in contact with is also called a road.", "The car is on the street.", "The grey car is on a paved road."], "image": "train2014/COCO_train2014_000000223481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75120, "question_id": "nk4hhio6rDbPLQcS7pt85A", "question": "What illegal action can be seen here?", "choices": ["property damage", "littering", "graffiti", "arson"], "correct_choice_idx": 2, "direct_answers": ["graffiti", "vandalism", "standing", "loitering", "graffiti", "graffiti", "graffiti", "graffiti", "graffiti", "grafitti"], "difficult_direct_answer": false, "rationales": ["Although not always, usually graffiti is illegal. there is no evidence of littering and the other options are obviously wrong.", "It's on the brick wall in the background. it could technically also be considered d.", "Someone has spray painted words on the brick"], "image": "train2014/COCO_train2014_000000075120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233292, "question_id": "nkAL5eTCUqa8STvSL8s9Bo", "question": "What does Red Bull do to this show?", "choices": ["supplies drink", "holds show", "sponsors show", "nothing"], "correct_choice_idx": 2, "direct_answers": ["sponsor", "sponsor", "sponsor", "sponsor", "sponsors show", "sponsor", "sponsor", "sponsors", "sponsors", "sponsor event"], "difficult_direct_answer": false, "rationales": ["It gives them money to help run the show in exchange for advertisement", "The biker has red bull prominently featured on his helmet. in exchange for money or equipment the biker agreed to provide advertisement for red bull and its products.", "There is a logo."], "image": "train2014/COCO_train2014_000000233292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282942, "question_id": "nkGa2wsRsrL52D9cvMLX2o", "question": "What is she about to do?", "choices": ["clean knige", "kill spouse", "cut cake", "cut tablecloth"], "correct_choice_idx": 2, "direct_answers": ["cut cake", "cut cake", "cut cake", "cut cake", "cut cake", "cut cake", "cut cake", "cut cake", "cut cake", "cut cake"], "difficult_direct_answer": false, "rationales": ["She's cutting the cake.", "She is standing next to a cake with a knife so she is likely to cut it.", "The knife in this woman's hand's while she stands looking at the cake on the table implies she will soon be cutting it."], "image": "val2014/COCO_val2014_000000282942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108075, "question_id": "nkJiuh4VZ4kvwjYkQ99DeT", "question": "What is the man using the computer to do?", "choices": ["game", "watch movie", "film", "type"], "correct_choice_idx": 3, "direct_answers": ["work", "work", "work", "work", "work", "work", "work", "work", "work", "type"], "difficult_direct_answer": false, "rationales": ["The man is typing.", "The man has fingers on the keyboard.", "The man has his hand resting over the keyboard where one would make inputs to type. the screen is visible and there are no obvious signs of answers b-d visible."], "image": "train2014/COCO_train2014_000000108075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214106, "question_id": "nkNmv2BproL2wKpjVj9uFc", "question": "What type of area is this?", "choices": ["city", "tropical", "rural", "commercial"], "correct_choice_idx": 2, "direct_answers": ["rural", "rural", "road", "rural", "rural", "rural", "rural", "rural", "rural", "road"], "difficult_direct_answer": false, "rationales": ["The area is rural.", "A vehicle is on the side of the road with not much traffic or other buildings around. rural areas do not have a lot of businesses and houses.", "It is a two lane road with a lot of vegetation including trees on both sides of the road. there is only one visible building."], "image": "val2014/COCO_val2014_000000214106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11805, "question_id": "nkTXoRhyV8aSJ3KMdVXCXX", "question": "What type of even is being held?", "choices": ["wedding", "birthday party", "farmers market", "reception"], "correct_choice_idx": 2, "direct_answers": ["food market", "market", "food sales", "market", "sale", "market", "outdoor market", "fund raising", "umbrella", "farmers market"], "difficult_direct_answer": false, "rationales": ["The booths are all close together and covered with canopies so people are safe from sun and rain as they shop", "There are many stalls out.", "There is a farmers market being held here."], "image": "train2014/COCO_train2014_000000011805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314778, "question_id": "nkariKYQJqL2MCRr3ihsQy", "question": "What is moving the fastest in this scene?", "choices": ["bicycle", "lamp post", "skateboarding boy", "bicycle boy"], "correct_choice_idx": 2, "direct_answers": ["person", "skateboard", "board", "skateboarding boy", "skateboarder", "skateboarder", "skater", "skateboarder", "skater", "skateboard"], "difficult_direct_answer": false, "rationales": ["He is the only living thing in motion as can be suggested with him in the air.", "The boy on the board is moving fast.", "The boy is doing a sport."], "image": "train2014/COCO_train2014_000000314778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399275, "question_id": "nkpNjtzsBB55gX3EsauuKb", "question": "What is the red vehicle?", "choices": ["van", "tank", "airplane", "carriage"], "correct_choice_idx": 0, "direct_answers": ["van", "bus", "van", "taxi", "van", "van", "bus", "van", "van", "van"], "difficult_direct_answer": false, "rationales": ["These large vehicles are used to transport more people then a car. also, they are used to transport items.", "It is taller than a regular car but not as big as a bus or truck", "The vehicle is a van."], "image": "train2014/COCO_train2014_000000399275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302038, "question_id": "nmJsgBo7K5esHPxejvYUgA", "question": "What plants help heat this space?", "choices": ["trees", "cotton", "flax", "bamboo"], "correct_choice_idx": 0, "direct_answers": ["trees", "trees", "trees", "wood", "trees", "trees", "wood", "trees", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["There are logs to heat up the stove.", "The plants help with the trees.", "The fireplace is burning wood. wood comes from this living source."], "image": "val2014/COCO_val2014_000000302038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397877, "question_id": "nmMwdyvvZqkhBb9rtmkyeB", "question": "Why does she haver her shoes off?", "choices": ["in bed", "too hot", "confused", "dislikes shoes"], "correct_choice_idx": 0, "direct_answers": ["bed laying", "playing game", "to relax", "resting", "bed time", "laying down", "will nap", "indoors", "to sleep", "in bed"], "difficult_direct_answer": true, "rationales": ["She is inside a room that is used for sleeping. it would not be appropriate to wear shoes in this setting.", "She wants to be in bed.", "People take their shoes off in bed."], "image": "train2014/COCO_train2014_000000397877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367608, "question_id": "nmXMMmBTJ68UQEyWSSq5ry", "question": "What kind of skate trick is the man doing?", "choices": ["flip", "grind", "grab", "manual"], "correct_choice_idx": 0, "direct_answers": ["sky jumping", "flip", "jumping", "kick flip", "flip", "ollie", "jump", "air lift", "jumping", "flip"], "difficult_direct_answer": false, "rationales": ["The trick is a flip.", "The skateboard is orientated in a wheels up position which would be the opposite of normal riding but consistent with a trick that would flip the board.", "The man is flipping his board."], "image": "val2014/COCO_val2014_000000367608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366179, "question_id": "nmjWdvgj8By3PZLEbzvoNt", "question": "What mass transit units sit parked here?", "choices": ["busses", "trains", "cabs", "cars"], "correct_choice_idx": 0, "direct_answers": ["bus", "busses", "bus", "buses", "bus", "buses", "buses", "bus station", "buses", "buses"], "difficult_direct_answer": false, "rationales": ["There are buses.", "Busses are parked there.", "The vehicles have wheels and cannot travel on tracks. they can carry more passengers than cars."], "image": "val2014/COCO_val2014_000000366179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370839, "question_id": "nmqGg4vA9dAfkgL6UBEAm3", "question": "What are the people playing?", "choices": ["video games", "card games", "sports", "laptop games"], "correct_choice_idx": 0, "direct_answers": ["wii", "video games", "dance", "nintendo", "wii", "nintendo wii", "wii bowling", "wii", "video game", "wii"], "difficult_direct_answer": false, "rationales": ["The man is holding a wii remote.", "The people are playing video games in front of the tv.", "The people are holding controllers for a nintendo console."], "image": "val2014/COCO_val2014_000000370839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246831, "question_id": "nnHgqzra6W8b293P4gwJ6p", "question": "What type of professional would use this silver and green object?", "choices": ["paramedic", "fireman", "it", "emt"], "correct_choice_idx": 1, "direct_answers": ["firefighter", "fireman", "fire fighters", "fireman", "firefighter", "firefighter", "firefighter", "fireman", "firefighter", "fireman"], "difficult_direct_answer": false, "rationales": ["This is a hydrant used to supply water to put out fires", "This is a hydrant where they can get highly pressurized water to use to put out fires", "Fire hydrants are for getting water to douse fires. a fireman would use the fire hydrant to fight a fire."], "image": "train2014/COCO_train2014_000000246831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573882, "question_id": "nnPHzHknxv39CSzR9DSYPA", "question": "What consumer electronic company made the white gaming displays?", "choices": ["sega", "sony", "apple", "nintendo"], "correct_choice_idx": 3, "direct_answers": ["nintendo", "nintendo", "nintendo", "wii", "nintendo", "nintendo", "wii", "nintendo", "wii", "nintendo"], "difficult_direct_answer": false, "rationales": ["Gaming displays have the wii logo on them. wii is made by nintendo.", "This company's logo appears near the bottom of each display.", "Nintendo made it."], "image": "val2014/COCO_val2014_000000573882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397786, "question_id": "nnaeCTnxSH77t64DiHLSSg", "question": "What do the triangular pieces harness?", "choices": ["sun", "coal", "water", "wind"], "correct_choice_idx": 3, "direct_answers": ["wind", "wind", "wind", "boat", "wind", "wind", "sailboat", "wind", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["The vehicle is a sail boat. it does not use solar, water, or coal power.", "The wind sails harness.", "The sails harness wind."], "image": "val2014/COCO_val2014_000000397786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461201, "question_id": "nndCzspaj7yvZNNWBSavQm", "question": "What color are the beaks of these birds?", "choices": ["green", "yellow", "orange", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The birds' beaks are black.", "The birds have black beaks.", "They are black."], "image": "train2014/COCO_train2014_000000461201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435260, "question_id": "nnkEFWLcizdSMTfus2yMaQ", "question": "What species of elephants are these?", "choices": ["extinct", "asian", "african", "sahara"], "correct_choice_idx": 1, "direct_answers": ["asian", "asian", "indian", "grey elephants", "african elephants", "circus", "asian", "african", "circus", "indian"], "difficult_direct_answer": false, "rationales": ["Asian elephants have this kind of decoration put on them.", "They are african elephants.", "This species is known for not having prominent tusks."], "image": "val2014/COCO_val2014_000000435260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188847, "question_id": "nnz7R2xCPcLi7JmcFTatGE", "question": "Where can you find this scene?", "choices": ["korea", "japan", "china", "india"], "correct_choice_idx": 1, "direct_answers": ["japan", "japan", "japan", "city", "city", "street", "city", "street", "outside", "city"], "difficult_direct_answer": false, "rationales": ["The bus is going to kinshicho station in tokyo.", "The display on the front of the bus shows that it is headed to kinshicho station which is located in toyko.", "The language written on the car and busy is japanese."], "image": "train2014/COCO_train2014_000000188847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283350, "question_id": "noUjBGhgyYqreJAxdMdgRW", "question": "What are these people doing here?", "choices": ["hired", "help riders", "awaiting ride", "invited yesterday"], "correct_choice_idx": 2, "direct_answers": ["curd", "protesting", "protesting", "protesting", "stand signal", "lining up", "watching race", "protesting", "protest", "awaiting ride"], "difficult_direct_answer": false, "rationales": ["The people want a ride.", "The people are aligned by a street indicating that they might be waiting for a ride.", "The people wait for a ride."], "image": "train2014/COCO_train2014_000000283350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157693, "question_id": "noaNmfD5rDVzQHffeQNQxt", "question": "What is the item next to the wheel that her hand is touching?", "choices": ["ball", "board", "weight", "goat"], "correct_choice_idx": 0, "direct_answers": ["tennis racket", "tennis ball", "tennis racket", "tennis racket", "tennis ball", "tennis ball", "racquet", "metal", "brake", "ball"], "difficult_direct_answer": false, "rationales": ["The woman has a racquet in her right hand. a green item used in tennis is near her left hand.", "There is a ball in the spokes.", "The ball is near her hand on the wheel."], "image": "train2014/COCO_train2014_000000157693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365021, "question_id": "nofF43YnoG7LZpe9Z95648", "question": "Which famous person fits the description on the bottom of the board?", "choices": ["liberace", "amy smart", "jessica biel", "zoe kravitz"], "correct_choice_idx": 3, "direct_answers": ["no idea", "julia child", "shawn white", "john lennon", "zoe kravitz", "will ferret", "redhead", "player", "shaun white", "kelly clarkson"], "difficult_direct_answer": true, "rationales": ["The person is zoe.", "A snowboard is shown with the words \"step child\" on it. zoe kravitz is a step child.", "Zoe kravitz is a step child to her mother's second husband."], "image": "train2014/COCO_train2014_000000365021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527092, "question_id": "nokwbF4DJMwxPrFRpSVzdH", "question": "What brand's name is on the hat?", "choices": ["nike", "dell", "corona", "mcdonald's"], "correct_choice_idx": 2, "direct_answers": ["corona", "corona", "corona", "corona", "corona", "corona", "corona", "corona", "corona", "corona"], "difficult_direct_answer": false, "rationales": ["That the beer brand that's on the hat.", "The woman on the boogie board is wearing a hat for corona beers.", "The name is corona."], "image": "train2014/COCO_train2014_000000527092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434457, "question_id": "noxCCZFuxQ4xMEtCA478Ep", "question": "Where is this location?", "choices": ["hong kong", "dublin", "belfast", "tokyo"], "correct_choice_idx": 3, "direct_answers": ["tokyo", "japan", "china", "china", "japan", "japan", "asia", "japan", "japan", "tokyo"], "difficult_direct_answer": false, "rationales": ["Due to the japanese lettering present in this image we can conclude it to take place in tokyo. tokyo is in japan unlike any of the other choices here.", "The wording is done on japanese.", "The location is tokyo."], "image": "train2014/COCO_train2014_000000434457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351549, "question_id": "np28nTu5NXYUpZKEaUxcRE", "question": "What is the red object on the left side of the table used for?", "choices": ["storing tape", "saving files", "sharpening pencils", "storing tacks"], "correct_choice_idx": 2, "direct_answers": ["sharpening pencils", "sharpen pencils", "pencil sharpener", "sharpening", "sharpening pencils", "sharpening pencils", "sharpening", "pencil sharpening", "pencil sharpener", "pencil sharpening"], "difficult_direct_answer": false, "rationales": ["Pencils can be sharpened.", "The object is a sharpener.", "It is a attached on the corner of the desk near the the trashcan."], "image": "train2014/COCO_train2014_000000351549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211192, "question_id": "npGmktHoUD4hXph2y6M7h3", "question": "What professional is one likely to meet with in this room?", "choices": ["teacher", "judge", "doctor", "lawyer"], "correct_choice_idx": 2, "direct_answers": ["ob-gyn", "ob-gyn", "doctor", "doctor", "doctor", "doctor", "doctor", "doctor", "doctor", "doctor"], "difficult_direct_answer": false, "rationales": ["The room has an examination table in it covered in paper that is commonly found in a doctors office and rarely anywhere else.", "An exam table is in a small room with a small counter and sink with medical supplies on it.", "Doctors have patient rooms."], "image": "val2014/COCO_val2014_000000211192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319605, "question_id": "npPDaAMAtXMipv2Qo745r8", "question": "What is sitting in front of the man?", "choices": ["cat", "luggage", "person", "dog"], "correct_choice_idx": 1, "direct_answers": ["backpack", "luggage", "bag", "large backpack", "backpack", "backpack", "bag", "backpack", "backpack", "backpack"], "difficult_direct_answer": false, "rationales": ["Luggage is in front.", "A suitcase is near him.", "The man has a very large backpack in front of him. this is more likely to be luggage rather than everyday belongings."], "image": "train2014/COCO_train2014_000000319605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318637, "question_id": "npT4zbTxgVRXNEMkct2wGu", "question": "Which provide quick solution for wiping sweat during match?", "choices": ["cap", "wrist band", "none", "shocks"], "correct_choice_idx": 1, "direct_answers": ["wrist band", "sweatbands", "towel", "towel", "towel", "towel", "wrist band", "towel", "towel", "sweatband"], "difficult_direct_answer": false, "rationales": ["The answer is commonly known to be a solution for athletes competing in the sport depicted.", "A piece of terry cloth around the wrist is used to wipe forehead or face sweat when competing.", "He has an absorbent band on his wrist."], "image": "train2014/COCO_train2014_000000318637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194067, "question_id": "npbRUQJFqKTkp2Zaa7PRLh", "question": "Who is the most famous surfer?", "choices": ["duke", "nick", "laird", "miki dora"], "correct_choice_idx": 0, "direct_answers": ["jackson", "kelly slater", "duke kahanamoku", "kelly slater", "slater", "rob slater", "duke", "rk slater", "robert slater", "rober slater"], "difficult_direct_answer": true, "rationales": ["The most famous world's surfer is duke.", "A person named duke is a very famous surfer.", "That is the name of the famous surfer."], "image": "train2014/COCO_train2014_000000194067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145113, "question_id": "npu5gkgHsSu6yLcZChxkNx", "question": "What is rain meteorology?", "choices": ["0.2mm", "0.8mm", "0.5mm", "1.5mm"], "correct_choice_idx": 2, "direct_answers": ["weather", "study rain", "precipitation", "precipitation forecast", "glass", "rain", "forecast", "weather study", "0.5mm", "science"], "difficult_direct_answer": true, "rationales": ["It's hard to tell but it's raining.", "It is precipitation of liquid water drops with diameters greater than that amount.", "Given how much rain is on the window a seems like the likely amount."], "image": "train2014/COCO_train2014_000000145113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251206, "question_id": "nqAiwEPAcnfFGQbd4NxWdm", "question": "Why is the vehicle stopped?", "choices": ["avoiding sheep", "getting out", "lost", "resting"], "correct_choice_idx": 0, "direct_answers": ["sheep", "sheep", "sheep roadblock", "sheep road", "sheep crossing", "avoiding sheep", "sheep appeared", "prevent accident", "sheep", "goats"], "difficult_direct_answer": false, "rationales": ["The car doesn't want to hit the sheep.", "There is a sheep standing in the middle of the road.", "There are sheep in front of the truck. the truck stopped so it wouldn't hit them."], "image": "val2014/COCO_val2014_000000251206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509864, "question_id": "nqL3CKG5qdwtAYEUMGLF2d", "question": "Where will they put that sauce?", "choices": ["rice", "bread", "potatoes", "dumplings"], "correct_choice_idx": 1, "direct_answers": ["fried food", "bread", "chicken", "on food", "on dishes", "on food", "on food", "on entree", "on food", "in can"], "difficult_direct_answer": false, "rationales": ["The sauce in the silver bowls on the table will go on pieces of bread.", "The red sauce would add a lot of flavor to whatever food it is put on. the food it is added to would need to be somewhat bland to handle that much flavor, and this food has a bland taste when eaten by itself.", "They are having curry and roti."], "image": "train2014/COCO_train2014_000000509864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407667, "question_id": "nqe4QbkzMDb9JujmE8ALyN", "question": "What countries flag is in the middle position?", "choices": ["germany", "russia", "sweden", "united states"], "correct_choice_idx": 3, "direct_answers": ["usa", "united states", "usa", "usa", "united states", "united states", "united states", "united states", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["The stars and stripes are easily seen on the united states' flag.", "It is a red, white and blue flag with stars and stripes.", "The us's flag is in the middle."], "image": "train2014/COCO_train2014_000000407667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491440, "question_id": "nr2yATyzGLizRwBthgeWue", "question": "What is the downtown loop map for?", "choices": ["directions", "downtown", "bus schedule", "pedestrians"], "correct_choice_idx": 2, "direct_answers": ["wharf street", "tourists", "identification", "bus schedule", "directions", "washington doc", "bus", "free service", "tunnel", "bus routes"], "difficult_direct_answer": true, "rationales": ["There's only roads visible, and the sign also states what it is for.", "These type of maps with \"free service\" as seen in writing on the map, are commonly used with bus services and schedules.", "The sign says its a free service."], "image": "train2014/COCO_train2014_000000491440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353366, "question_id": "ns2VRjtoxEyYBah2C52UVa", "question": "In what kind of area is the woman riding her skateboard?", "choices": ["skating arena", "resort", "park", "school yard"], "correct_choice_idx": 2, "direct_answers": ["street", "park", "road", "park", "roadway", "street", "main street", "road", "road", "pavement"], "difficult_direct_answer": false, "rationales": ["The girl is riding at the park.", "The woman is riding her skateboard in the middle of the road surrounded by park trees.", "There are large grassy areas by the road"], "image": "train2014/COCO_train2014_000000353366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392364, "question_id": "nsBrcDhVnjiYTnF583Zg4p", "question": "The monument is located within what type of roadway construction?", "choices": ["roundabout", "intersection", "four-way stop", "bowtie"], "correct_choice_idx": 0, "direct_answers": ["roundabout", "government", "roundabout", "rotunda", "roundabout", "travelling road", "roundabout", "roundabout", "unknown", "roundabout"], "difficult_direct_answer": false, "rationales": ["They seems to be a lot of roads on the area.", "The monument is at a roundabout.", "A large statue is in the middle of a circular roadway. roundabouts are circular roads that avoid traditional intersections with ninety degree turns."], "image": "val2014/COCO_val2014_000000392364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417218, "question_id": "nsCoQhbeBV8MBNtKyXnAst", "question": "What type of people utilize the space seen here?", "choices": ["activists", "diners", "merchants", "vagabonds"], "correct_choice_idx": 1, "direct_answers": ["diners", "diners", "gardner", "guests", "picnickers", "diners", "campers", "hungry", "diners", "picnicers"], "difficult_direct_answer": false, "rationales": ["The space contains tables based on their size and shape which could be used to eat food from.", "Diners utilize it.", "There are table covered with tablecloths."], "image": "train2014/COCO_train2014_000000417218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134007, "question_id": "nsMcbZiBQiRhmVGvZWRb7B", "question": "The item the woman has over her eyes was featured in a song by what artist?", "choices": ["charlotte church", "pavarotti", "celine dion", "corey hart"], "correct_choice_idx": 3, "direct_answers": ["cardi", "rob artist", "corey hart", "corey hart", "zz top", "rick okazek", "sara bareilles", "marilyn manson", "zz top", "cory heart"], "difficult_direct_answer": false, "rationales": ["You can tell by how the shades are featured as to what they are speaking of.", "A popular song he did in the 80s was \"sunglasses at night\"", "The song was \"sun glasses at night\" and i believe it was a big hit in the eighties."], "image": "train2014/COCO_train2014_000000134007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424683, "question_id": "nsPC8fhsVc4ZdPBy4gnVwp", "question": "What sort of beverages are most readily available here?", "choices": ["hot chocolate", "icees", "coffee", "iced tea"], "correct_choice_idx": 3, "direct_answers": ["tea", "tea", "snapple", "snapple", "snapple", "tea", "snapple", "iced tea", "snapple", "tea"], "difficult_direct_answer": false, "rationales": ["A snapple machine is in a waiting room. snapple makes tea.", "Snapple makes tea products.", "Snapple is written on the side of the machine."], "image": "val2014/COCO_val2014_000000424683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410601, "question_id": "nsRFyreH64Q9u64QTvFc2B", "question": "What do these people do together?", "choices": ["work", "skateboard", "run", "swim"], "correct_choice_idx": 1, "direct_answers": ["skate", "skateboarding", "skateboard", "skate", "skate", "talk", "skateboard", "skate", "skate", "skateboard"], "difficult_direct_answer": false, "rationales": ["The people here are together for skateboarding.", "They appear to be friendly and they are all holding skateboards and wearing protective skateboarding equipment.", "They all have oversized skateboards."], "image": "train2014/COCO_train2014_000000410601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530631, "question_id": "nsi4by6X8zNQGTRzPPbYDx", "question": "What are the cabinets in the background called?", "choices": ["safety cabinets", "tool cabinets", "safes", "file cabinets"], "correct_choice_idx": 2, "direct_answers": ["file cabinet", "filing cabinets", "drawers", "cabinets", "file cabinets", "filing", "tool shelves", "file cabinet", "toolboxes", "safes"], "difficult_direct_answer": true, "rationales": ["The person is working in a mechanics shop based on their equipment, uniform and the act they are seen doing. this type of setting would include many containers for housing their tools.", "This is a shop. so, c makes the most sense.", "This is a mechanic shop so there should be tools in the cabinets."], "image": "val2014/COCO_val2014_000000530631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576262, "question_id": "nsnWGe7DzMAG4xCo4pTGwH", "question": "How long does it take for luggage to get to the carousel?", "choices": ["10mins", "8mins", "20mins", "15mins"], "correct_choice_idx": 1, "direct_answers": ["minutes", "fast", "too long", "minutes", "another country", "thirty minutes", "long time", "several minutes", "8mins", "minutes"], "difficult_direct_answer": false, "rationales": ["Traditionally it takes luggage from the plane to the carousel about 20 minutes or so.", "The luggage will be in the rack in a few mins.", "They put them at very specific timings."], "image": "train2014/COCO_train2014_000000576262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277726, "question_id": "nt7M4xbBhWpH7FRqLwzHBF", "question": "Why are the bases of the towers brightly colored?", "choices": ["sturdiness", "sales boosting", "safety visibility", "marketing"], "correct_choice_idx": 2, "direct_answers": ["miss them", "notification", "visual guide", "see them", "be seen", "safety", "be seen", "warning", "safety visibility", "antenna"], "difficult_direct_answer": true, "rationales": ["Large metal towers have red towards the bottom and skiers are all around.", "They are only colored on the bottom so people do not crash into them.", "They are used to show and mark the visibility."], "image": "train2014/COCO_train2014_000000277726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527059, "question_id": "ntDUQ4vmVuzHs8AF4WTYio", "question": "What type of business is being advertised on that sign?", "choices": ["real estate", "post office", "bus company", "tobacco shop"], "correct_choice_idx": 0, "direct_answers": ["collectables", "real estate", "clothing retailer", "real estate", "lounge", "collars", "real estate", "lawyers", "real estate", "store"], "difficult_direct_answer": false, "rationales": ["A real estate business is shown.", "There are images of cigarettes so it is likely a tobacco shop.", "There is a picture of a sign above glass window. it shows a number and square feet of place."], "image": "train2014/COCO_train2014_000000527059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24307, "question_id": "ntDv4kS7wpVXfhsnHezkqf", "question": "Where is the man pointing his camera?", "choices": ["himself", "ceiling", "ground", "mirror"], "correct_choice_idx": 3, "direct_answers": ["mirror", "mirror", "glass shelf", "mirror", "mirror", "self", "mirror", "mirror", "mirror", "glass shelf"], "difficult_direct_answer": false, "rationales": ["He is taking a selfie.", "The man is taking a photo in the mirror.", "A man is holding up his phone and pointing the camera in front of him."], "image": "train2014/COCO_train2014_000000024307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277688, "question_id": "nu7iy7Rz2bWaNz4zJNbvHA", "question": "What are the horses doing?", "choices": ["resting", "feeding", "pulling cart", "posing"], "correct_choice_idx": 2, "direct_answers": ["pulling wagon", "pulling", "drawing carriage", "pulling cart", "transporting people", "working", "pulling wagon", "pulling card", "competitions", "pulling"], "difficult_direct_answer": false, "rationales": ["The horses are on the road pulling the trailer with people on it.", "They are harnessed to it", "The horses have a cart and are going down the street."], "image": "train2014/COCO_train2014_000000277688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526257, "question_id": "nuDsaqKG2tWD8SsExLtykd", "question": "What base is the photographer standing behind?", "choices": ["home", "third", "first", "second"], "correct_choice_idx": 2, "direct_answers": ["first base", "first", "first", "first", "first", "first", "first", "cannot tell", "first base", "first"], "difficult_direct_answer": false, "rationales": ["He is to the left of the pitcher's mound", "He's on the left side of the pitcher", "The person is standing on the first plate."], "image": "val2014/COCO_val2014_000000526257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91105, "question_id": "nuJcrpBq8rpBCSWiNvrj8p", "question": "What British sport could this ball be used for?", "choices": ["rugby", "soccer", "bowls", "golf"], "correct_choice_idx": 0, "direct_answers": ["rugby", "rugby", "rugby", "football", "football", "football", "rugby", "rugby", "rugby", "rugby"], "difficult_direct_answer": false, "rationales": ["Football is used in rugby.", "A child is holding a football.", "This ball could be used in a game of rugby."], "image": "val2014/COCO_val2014_000000091105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160483, "question_id": "nubGHLmfmdrEmw4NcJbeUi", "question": "What is the third veggie called that cooked with the potatoes and carrots?", "choices": ["lettuce", "squash", "chickpeas", "cabbage"], "correct_choice_idx": 3, "direct_answers": ["corn", "collard greens", "cabbage", "cabbage", "greens", "cabbage", "cabbage", "cabbage", "spinach", "kale"], "difficult_direct_answer": false, "rationales": ["The plate is full of foods such as potatoes, carrots, and cabbage.", "The dark green leaves are consistent of that of cooked cabbage.", "Cabbage is cooked in this dish."], "image": "val2014/COCO_val2014_000000160483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358965, "question_id": "nugBZiHEkCEKP4MCXZh3vJ", "question": "What type of vehicle is present?", "choices": ["car", "bicycle", "ship", "board"], "correct_choice_idx": 3, "direct_answers": ["board", "surfboard", "surf board", "boat", "surfboard", "surfboard", "boat", "surfboard", "surfboard", "surf board"], "difficult_direct_answer": false, "rationales": ["The men are sitting on items that enable them to ride the waves.", "The vehicle is in the water, only a little longer than the people on them, and appears to be used for surfing.", "On what appears to be a windy day at the beach, a few surfers on their surfboards wait expectantly for good rides. surfing was invented in hawaii hundreds of years ago."], "image": "train2014/COCO_train2014_000000358965.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316514, "question_id": "nvLxvefNNHsXRh6hr26cTz", "question": "What dressing is traditionally added to this?", "choices": ["mustard", "ranch", "mayo", "ketchup"], "correct_choice_idx": 2, "direct_answers": ["salad", "ranch", "coleslaw", "broccoli", "salad dressing", "mayonaisse", "soy sauce", "mayonnaise", "mayonnaise", "mayo"], "difficult_direct_answer": true, "rationales": ["Usually coleslaw has mayo.", "The dressing is mayo.", "It is usually a mix for coleslaw which has a base made of this ingredient"], "image": "val2014/COCO_val2014_000000316514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508467, "question_id": "nvNV7QJ25kYBZeyGxVfEkM", "question": "What fungal growth is visible here?", "choices": ["tomatoes", "mushrooms", "cucumbers", "olives"], "correct_choice_idx": 1, "direct_answers": ["mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "mushrooms", "parsley", "mushroom", "green", "mushroom"], "difficult_direct_answer": false, "rationales": ["The other options are fruit and vegetables.", "Mushrooms are in the soup.", "The mushrooms are a fungus."], "image": "train2014/COCO_train2014_000000508467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425762, "question_id": "nvZGuKEfd2SauosY4PTWob", "question": "The item on the right is most likely a cooked version of what?", "choices": ["potato", "carrot", "orange", "lemon"], "correct_choice_idx": 0, "direct_answers": ["potatoes", "potatoes", "potatoes", "potato chips", "potatoes", "potato", "potato", "potato", "food", "potatoes"], "difficult_direct_answer": false, "rationales": ["The potatoes are sliced and fried.", "These are a version of potatoes.", "The item on the right is fried potato."], "image": "val2014/COCO_val2014_000000425762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483378, "question_id": "nvZo4zK4k6QJzy8JPp4Gkh", "question": "What type of shot is the boy about to hit?", "choices": ["backhand", "slice", "forehand", "serve"], "correct_choice_idx": 3, "direct_answers": ["overhand", "serve", "serve", "overhand", "overhead", "serve", "serve", "serve", "serve", "overhead smash"], "difficult_direct_answer": false, "rationales": ["He has thrown the ball above himself and has the racket behind him to do a large swing", "He's thrown the ball straight up and has his racket back", "The boy threw the ball in the air so he can hit it. this is called a serve."], "image": "train2014/COCO_train2014_000000483378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545992, "question_id": "nw2txksUKghuhWj7HLVmJL", "question": "What are the wires above the train used for?", "choices": ["climbing", "hanging", "decoration", "power"], "correct_choice_idx": 3, "direct_answers": ["light", "electricity", "power", "electricity", "provide power", "power", "stability", "power source", "communication", "electricity"], "difficult_direct_answer": false, "rationales": ["The wires are for power.", "The wires provide power.", "They power the train so it can move along the track."], "image": "train2014/COCO_train2014_000000545992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423358, "question_id": "nw47vtzCqW8KnecazNBPpM", "question": "What is a famous company that makes the device the man is holding?", "choices": ["samsung", "dell", "hitachi", "hoover"], "correct_choice_idx": 0, "direct_answers": ["motorola", "samsung", "nokia", "mobile company", "apple", "nokia", "samsung", "samsung", "apple", "mobile phone"], "difficult_direct_answer": false, "rationales": ["The brand is unclear but based on the device being a cellphone of this type and the other options, answer a seems most likely.", "This is a korean company known for its electronics, including mobile phones like this one.", "The man is on his cellphone."], "image": "val2014/COCO_val2014_000000423358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508962, "question_id": "nw6JvwBGeseyyLVLxLehX7", "question": "What character does the man looking at his cell phone play?", "choices": ["mary poppins", "jesus", "sweeny todd", "santa"], "correct_choice_idx": 1, "direct_answers": ["game", "jesus", "jesus", "biblical character", "jesus", "jesus", "zombies", "jesus", "jesus", "jesus"], "difficult_direct_answer": false, "rationales": ["The clothing indicate that he's role-playing a from christian stories.", "He has the outfit on that jesus is portrayed wearing in many shows.", "The person on the phone is playing jesus christ."], "image": "val2014/COCO_val2014_000000508962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5105, "question_id": "nwJiGMkCAHzZpRRwznQcUn", "question": "What will the dog do with the ball?", "choices": ["give human", "break it", "chase it", "swallow it"], "correct_choice_idx": 0, "direct_answers": ["retrieve it", "fetch", "play", "return it", "give human", "fetch", "bring shore", "carry owner", "retrieve", "return ball"], "difficult_direct_answer": true, "rationales": ["The dog wants to return the ball.", "This is the most likely option. the dog probably wants it thrown again.", "The dog will most likely paddle around with the ball in his mouth until he spies a human who's in a position to throw it for him!."], "image": "val2014/COCO_val2014_000000005105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485830, "question_id": "nwT6UQnP4TCmUo5PaVo85B", "question": "Why is the kid putting her hand close to the sheep?", "choices": ["snapping", "touching", "feeding", "pinching"], "correct_choice_idx": 2, "direct_answers": ["to feed", "to pet", "give food", "feed it", "feeding it", "feeding", "offer food", "feeding it", "food", "smell"], "difficult_direct_answer": true, "rationales": ["The kid's hand is held like it is holding something, and placed near the sheep's mouth.", "The sheep is taking food from it.", "The kid is feeding."], "image": "train2014/COCO_train2014_000000485830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105389, "question_id": "nwVhhq9NhEurYFV3cHsyVf", "question": "The theme of the decorating being done here includes what most prominently?", "choices": ["thanksgiving", "valentine's day", "halloween", "kites"], "correct_choice_idx": 3, "direct_answers": ["colors", "streamers", "teal", "streamers", "kites", "ribbons", "crafting", "streamers", "streamers", "streamers"], "difficult_direct_answer": false, "rationales": ["They are making paper kites.", "The theme of decorating here involves making kites.", "Kites are being made because you see all the parts, to the tail, being fabricated"], "image": "train2014/COCO_train2014_000000105389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316971, "question_id": "nwbRsCqRf6sXf3btbagkFN", "question": "What is the relationship between the two women?", "choices": ["teammate", "competitors", "twin sisters", "classmates"], "correct_choice_idx": 0, "direct_answers": ["doubles partners", "teammates", "doubles partners", "tennis partners", "teammates", "players", "co players", "team partners", "teammate", "partners"], "difficult_direct_answer": false, "rationales": ["They are playing doubles in tennis.", "Two women are playing tennis on the same side of court. they are against two other players on other side of net.", "Two women are on the same side of a tennis court, both facing the same direction and in the same outfit. doubles matches in tennis involve two people playing on the same team against two others."], "image": "train2014/COCO_train2014_000000316971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364073, "question_id": "nwoWVU4vr2FUKLjPvNVR3J", "question": "If 27 hits the ball well which way will they run?", "choices": ["rightward", "no where", "left", "backwards"], "correct_choice_idx": 0, "direct_answers": ["rightward", "right", "to right", "first base", "right", "right side", "right", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["That's where first base is", "The way to first base is to the right of the batter.", "First base is to the right."], "image": "val2014/COCO_val2014_000000364073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100034, "question_id": "nxeR8tbkBNWUUS4Udietwr", "question": "What is this type of play called?", "choices": ["dunk", "strike", "drill", "serve"], "correct_choice_idx": 2, "direct_answers": ["tennis", "tennis", "drill", "tennis", "tennis", "triple play", "tennis", "tennis", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["Players are in a line taking their turn.", "Kids with tennis rackets are lined up in a row practicing tennis. drills are done at practice.", "The children are in a single file line."], "image": "train2014/COCO_train2014_000000100034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399991, "question_id": "nxqem2VZGS64ivk5fo6nF2", "question": "What is he about to do?", "choices": ["dunk", "hit", "run", "catch"], "correct_choice_idx": 3, "direct_answers": ["catch", "catch ball", "catch ball", "catch ball", "catch ball", "catch", "catch ball", "catch ball", "catch ball", "catch ball"], "difficult_direct_answer": false, "rationales": ["You can tel by what he is wearing and how he is dressed as to what he is doing.", "He has is glove and hand reached out in the read position as if a ball is coming his way.", "The baseball is about to enter his glove."], "image": "val2014/COCO_val2014_000000399991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555537, "question_id": "nxvgsvJrheSxvsdVKxgpqi", "question": "What part of this man's body is most likely to first touch the ball?", "choices": ["none", "rear", "foot", "head"], "correct_choice_idx": 2, "direct_answers": ["foot", "foot", "foot", "foot", "foot", "toes foot", "his feet", "foot", "foot", "foot"], "difficult_direct_answer": false, "rationales": ["The man is playing soccer. the ball is on the ground.", "The person is playing soccer and will kick the ball.", "He is running towards the ball, which is stationary on the ground. he is wearing a soccer uniform."], "image": "train2014/COCO_train2014_000000555537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256539, "question_id": "nySzE3yhPU7gUmLxZvnPnf", "question": "Which fruit contains the highest amount of potassium?", "choices": ["papaya", "plum", "banana", "grape"], "correct_choice_idx": 2, "direct_answers": ["banana", "banana", "kiwifruit", "banana", "bananas", "banana", "banana", "bananas", "bananas", "banana"], "difficult_direct_answer": false, "rationales": ["People eat bananas when their potassium is low.", "The fruit is the banana.", "Bananas have potassium."], "image": "train2014/COCO_train2014_000000256539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264888, "question_id": "nyUgLwNFhfGJgc5C492ULJ", "question": "What are the corded objects used for?", "choices": ["dividing objects", "powering devices", "organizing objects", "testing voltage"], "correct_choice_idx": 1, "direct_answers": ["connections", "charging", "typing", "charging", "powering devices", "charging", "connect devices", "writing", "plugging in", "computer peripherals"], "difficult_direct_answer": false, "rationales": ["The cords put power to the devices.", "Cords can transfer electricity.", "Electronics need some corded devices to power."], "image": "train2014/COCO_train2014_000000264888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573065, "question_id": "nyanBJgbdRf4o5FiaYsiUB", "question": "What type of shot is the man hitting?", "choices": ["forehand", "slice", "serve", "backhand"], "correct_choice_idx": 0, "direct_answers": ["forehand", "forehand", "tennis", "forehand", "forehand", "backhand shot", "return", "forehand", "forehand", "ball"], "difficult_direct_answer": false, "rationales": ["The man is hitting a forehand shot.", "The ball is in front of the man and based on the positioning of his feet and his wrist facing out, if he were to swing forward towards the ball, he would be utilizing a forehand swing.", "He has his racket pointing in front of him and will move across his body"], "image": "train2014/COCO_train2014_000000573065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46812, "question_id": "nyisWuxRVWKPADgiaUnULo", "question": "What country do the noodles originate from?", "choices": ["ukraine", "korea", "japan", "china"], "correct_choice_idx": 3, "direct_answers": ["japan", "china", "china", "japan", "china", "china", "china", "china", "china", "japan"], "difficult_direct_answer": false, "rationales": ["China invented the noodles.", "They were first known over 4000 years ago in this country", "Noodles come from china."], "image": "val2014/COCO_val2014_000000046812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307894, "question_id": "nyovzdKZzKHGv2JmiubqR3", "question": "What material is the brown floor made of?", "choices": ["vinyl", "wood", "carpet", "tile"], "correct_choice_idx": 3, "direct_answers": ["carpet", "carpet", "tile", "linoleum", "linoleum", "carpet", "wood", "tile", "linoleum", "linoleum"], "difficult_direct_answer": false, "rationales": ["The floor is made of tile.", "The brown floor is made of a type of laminate vinyl.", "The floor is made of tiles."], "image": "train2014/COCO_train2014_000000307894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342408, "question_id": "nysvvuGLYf2tiPoSAvfrWe", "question": "Why is the dog mostly dry?", "choices": ["indoors", "umbrella", "hot", "tent"], "correct_choice_idx": 1, "direct_answers": ["umbrella", "umbrella", "umbrella", "toweled", "umbrella coverage", "under umbrella", "umbrella", "umbrella", "sunshine", "umbrella"], "difficult_direct_answer": false, "rationales": ["It's obvious given the object above it.", "The dog is visibly underneath an umbrella. the umbrella has water dripping down it that would otherwise be dripping on the dog.", "The dog has an umbrella over it."], "image": "train2014/COCO_train2014_000000342408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348838, "question_id": "nyzsgjEyMJ8PkgkT2X7Uwu", "question": "What will come out of the oven?", "choices": ["bread", "donuts", "cookies", "pie"], "correct_choice_idx": 2, "direct_answers": ["cookies", "cookies", "cookies", "cookies", "cookies", "cookies", "cookies", "cookies", "cookies", "cookies"], "difficult_direct_answer": false, "rationales": ["There is a pan on the counter with balls of dough on it. the girl is licking the spoon which people often do when making cookies.", "There is dough on the pans.", "A child is baking and is standing next to a sheet of cookies and holding a spoon."], "image": "val2014/COCO_val2014_000000348838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267203, "question_id": "nz8ruxwEruEWyP99UcLk73", "question": "What type of transportation is this?", "choices": ["ferry", "plane", "car", "rail"], "correct_choice_idx": 3, "direct_answers": ["train", "train", "train", "subway", "rail", "train", "train", "subway", "train", "train"], "difficult_direct_answer": false, "rationales": ["It's a land vehicle that uses tracks. it cannot fly or float.", "The train is a form of rail transport.", "There is a train. it is travelling on tracks."], "image": "train2014/COCO_train2014_000000267203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266677, "question_id": "nzhfjj2VE95RRnLBRtW3HP", "question": "What is the woman talking about?", "choices": ["footwear", "roaches", "dogs", "giraffes"], "correct_choice_idx": 3, "direct_answers": ["giraffes", "news", "giraffes", "animals", "giraffes", "giraffes", "animals", "giraffes", "giraffes", "giraffe"], "difficult_direct_answer": false, "rationales": ["The lady appears to be a guide at the zoo. giraffes are located directly behind her and are probably part of the tour.", "There are tall animals near the woman. they are not dogs or roaches.", "The woman is standing in a zoo pen with giraffes and using a microphone to deliver a discussion about them"], "image": "val2014/COCO_val2014_000000266677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528067, "question_id": "nzpjMm5jVyVzfdU4s47ho3", "question": "How many of what is standing in front of the elephant looking to the right?", "choices": ["1 elephant", "2 elephants", "2 birds", "3 birds"], "correct_choice_idx": 3, "direct_answers": ["three birds", "one tree", "three", "one tree", "three", "tree trunks", "3 birds", "one", "one tree", "three"], "difficult_direct_answer": false, "rationales": ["There are not any other elephants, just the one. there are more than 2 birds visible.", "There are three birds.", "There are egrets on the ground."], "image": "train2014/COCO_train2014_000000528067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276753, "question_id": "o2BMU8663pQeu5wa8Bhp6v", "question": "How many distinct toppings are on this pizza?", "choices": ["two", "four", "three", "one"], "correct_choice_idx": 2, "direct_answers": ["five", "six", "five", "three", "one", "four", "three", "four 4", "five", "three"], "difficult_direct_answer": false, "rationales": ["There are three toppings on the pizza.", "The pizza has pineapple, beacon, and lettuce. only three distinct toppings are visible.", "There are ham, lettuce and pineapples on the pizza."], "image": "train2014/COCO_train2014_000000276753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565625, "question_id": "o2CfqotUEY7gbBrR5Et6nd", "question": "What shot is the player making?", "choices": ["lob", "backhand", "serve", "forehand"], "correct_choice_idx": 3, "direct_answers": ["serve", "serve", "forehand", "upper", "return", "lob", "return", "forward hit", "forehand", "difficult"], "difficult_direct_answer": false, "rationales": ["The hand is facing forward.", "The way the player's palm is facing the net, the position of the shot is called a \"forehand\".", "The man is using his forehand."], "image": "train2014/COCO_train2014_000000565625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81447, "question_id": "o2G5XjCQkSBEwcSDWfWM22", "question": "What does the man have on?", "choices": ["gas mask", "helmet", "slippers", "backpack"], "correct_choice_idx": 3, "direct_answers": ["shorts", "backpack", "bag pack", "hat", "backpack", "backpack", "shirt shorts", "hat", "bag", "backpack"], "difficult_direct_answer": false, "rationales": ["The man has a backpack.", "This man seems to be a hiker based on his attire and he would carry a sac of sort to carry his stuff.", "The man is wearing a bag. it is behind him."], "image": "train2014/COCO_train2014_000000081447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221045, "question_id": "o2PJk5D7Wb3UWT3sffXgYp", "question": "What type of drink is in the can?", "choices": ["iced tea", "lemonade", "beer", "soda pop"], "correct_choice_idx": 2, "direct_answers": ["beer", "fizzy drink", "beer", "beer", "fizzy drink", "beer", "beer", "fizzy drink", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["It says beer in spanish on the side.", "The word cerveza can be seen written on the can, which is spanish for beer.", "It's a type of beer in the can."], "image": "train2014/COCO_train2014_000000221045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272735, "question_id": "o2PM6kN5tEvG6grD55tMck", "question": "What is the boy ready to do?", "choices": ["sit", "run", "swing", "bat"], "correct_choice_idx": 2, "direct_answers": ["hit tennisball", "hit", "swing", "hit ball", "swing", "tennis", "hit ball", "swing", "swing", "tennis"], "difficult_direct_answer": false, "rationales": ["The boy is swinging.", "The boy is using a racket to hit the ball.", "The boy is getting ready for the ball in tennis."], "image": "train2014/COCO_train2014_000000272735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384027, "question_id": "o2PMNYfftzMwJJbvqdpGMu", "question": "What is the man in the red jacket doing in the road?", "choices": ["crossing", "driving", "repairing", "racing"], "correct_choice_idx": 0, "direct_answers": ["crossing", "crossing street", "crossing", "jaywalking", "crossing", "crossing", "walking", "crossing", "crossing", "crossing street"], "difficult_direct_answer": false, "rationales": ["He is crossing the street to get to other side. he is in a traffic lane and his legs are in a position that indicate he is moving.", "The man is walking across the street.", "The man is crossing the road."], "image": "train2014/COCO_train2014_000000384027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341296, "question_id": "o2Phju3di6UZrg5GVzDbc6", "question": "What hour of the day is it in their language?", "choices": ["sept", "cinq", "quatre", "six"], "correct_choice_idx": 1, "direct_answers": ["seize", "four fifteen", "cinco", "apres-midi", "two twentyfive", "apres midi", "cinq", "1711", "cinq", "5pm"], "difficult_direct_answer": true, "rationales": ["The clock is showing it is near five.", "The small hand on the clock is pointing to the 5, and cinque is five in french. the signs are written in french, indicating this is the language spoken here.", "Cinq is french for 5 which the short hand denoting the hour on this clock points towards."], "image": "val2014/COCO_val2014_000000341296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 525266, "question_id": "o2RVtS7cxBW9VFKVnwfhMj", "question": "Why are there bars on the door?", "choices": ["theft deterrent", "wrought iron", "hurricane protection", "prison bars"], "correct_choice_idx": 0, "direct_answers": ["stop theft", "closed", "locked", "theft deterrent", "jail", "security", "protection", "safety protection", "prevent theft", "prevent breakin"], "difficult_direct_answer": true, "rationales": ["The bars on the door make it difficult for people to break into the store.", "The other options don't match the setting or normal usage.", "The bars are sturdy and prevent entry, even if the glass on the door is shattered."], "image": "train2014/COCO_train2014_000000525266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460145, "question_id": "o2Ta55JjaLqskhXdDT54ud", "question": "What type of transportation is shown?", "choices": ["air", "water", "rail", "road"], "correct_choice_idx": 3, "direct_answers": ["suv", "car", "cars", "cars", "cars", "road transport", "vans", "car", "car", "road"], "difficult_direct_answer": false, "rationales": ["The cars are a form of road transportation.", "Road transportation is shown.", "These are land vehicles that cannot travel on tracks, fly, or float on water."], "image": "val2014/COCO_val2014_000000460145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505100, "question_id": "o2aQuh9RNywQRz3XRUWGTM", "question": "Who is flying the kite?", "choices": ["woman", "man", "girl", "boy"], "correct_choice_idx": 1, "direct_answers": ["man", "man", "man", "people", "people", "man", "man", "man", "people", "people"], "difficult_direct_answer": false, "rationales": ["He has his arm up in the air holding the string that is attached to the kite.", "A man is standing on a beach with his hand outstretched and a kite overhead. a girl is standing next to the man with her hands at her side.", "The man flies."], "image": "train2014/COCO_train2014_000000505100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354205, "question_id": "o2i9q9XhTSzvJMKpbGvxTN", "question": "This vehicle is more likely to fly to what destination?", "choices": ["siberia", "texas", "portugal", "scandinavia"], "correct_choice_idx": 1, "direct_answers": ["usa", "washington dc", "united states", "usa", "united states", "texas", "washington dc", "america", "usa", "washington doc"], "difficult_direct_answer": false, "rationales": ["The other options aren't within the usa. that said, it could be literally flying anywhere. the image isn't clear.", "This is within the united states and this is an official government plane that either the president or vice president uses", "The plane says united states of america on it."], "image": "train2014/COCO_train2014_000000354205.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478499, "question_id": "o2iLkUHNzBBniPPauY8wRf", "question": "What type of sandwich is this?", "choices": ["turkey", "monte cristo", "blt", "corned beef"], "correct_choice_idx": 3, "direct_answers": ["smoked meat", "corned beef", "ruben", "pastrami", "roast beef", "meat", "corned beef", "corned beef", "roast beef", "pastrami"], "difficult_direct_answer": false, "rationales": ["The sandwich is made from thickly sliced pinkish meat.", "There is pink meat and it's topped with mustard.", "The meat looks like corned beef by is shape and color."], "image": "train2014/COCO_train2014_000000478499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79722, "question_id": "o2tHAwvUDF5nateJZ4SGzN", "question": "The stuffy animal is made of what material?", "choices": ["denim", "synthetic fabric", "real fur", "wool"], "correct_choice_idx": 1, "direct_answers": ["cotton", "cotton", "fur", "synthetic fabric", "fabric", "plush", "plush", "fur", "cotton", "cloth"], "difficult_direct_answer": false, "rationales": ["The bear is a stuffed animals and that's what stuffed animals are made from.", "The stuffed animal is made of fabric.", "A bear is stuffed and is blue. stuffed animals are often made of synthetic fibers."], "image": "train2014/COCO_train2014_000000079722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8024, "question_id": "o2u2YqgvYBPEpgyBAGQXr3", "question": "What in her vicinity can she use to wipe her mouth?", "choices": ["shirt", "table cloth", "paper towel", "scarf"], "correct_choice_idx": 2, "direct_answers": ["paper towels", "paper towels", "papertowel", "paper towel", "paper towel", "paper towel", "paper towel", "paper towel", "paper towel", "paper towels"], "difficult_direct_answer": false, "rationales": ["She has a paper towel.", "The girl can use the paper towels.", "There's a roll of paper towels in front of and just to the left of a little girl whose face appears to be covered with spaghetti or pizza sauce."], "image": "train2014/COCO_train2014_000000008024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145834, "question_id": "o2vKJQU2HHQZTkRmdZoXTZ", "question": "Who painted this area?", "choices": ["monet", "armitage", "graffiti artists", "van gogh"], "correct_choice_idx": 2, "direct_answers": ["taggers", "skateboarder", "artists", "graffiti artist", "artist", "graffiti artists", "taggers", "artist", "graffiti artist", "graffiti artists"], "difficult_direct_answer": false, "rationales": ["There is graffiti everywhere.", "Colorful sets of painting can often be seen throughout the inner cities. graffiti artists are normally responsible for creating these.", "This kind of spray paint artwork is known as a specific style of art."], "image": "train2014/COCO_train2014_000000145834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375621, "question_id": "o2ygGXZaiZYcShM7zMpaBU", "question": "What is the name of the store this donut came from?", "choices": ["duck donuts", "7-11", "krispy creme", "dunkin donuts"], "correct_choice_idx": 3, "direct_answers": ["dunkin donuts", "dunking donuts", "dunkin doughnuts", "dunkin donuts", "dunkin donuts", "dunking donuts", "burger", "dunkin donuts", "dunkin donuts", "dunkin"], "difficult_direct_answer": false, "rationales": ["The wrapper has the colors and name of the store on it.", "It says the store name on the wrapping paper.", "The donut is wrapped in a dunkin' donuts wrapper, indicating that that's where it was purchased from."], "image": "val2014/COCO_val2014_000000375621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334032, "question_id": "o3B9YLAab7Hm6pf3v28SU8", "question": "What is the purple outfit the woman is wearing called?", "choices": ["smock", "blanket", "poncho", "robe"], "correct_choice_idx": 2, "direct_answers": ["poncho", "raincoat", "raincoat", "raincoat", "raincoat", "poncho", "poncho", "rain coat", "poncho", "parka"], "difficult_direct_answer": false, "rationales": ["The woman in the wheelchair is wearing a purple poncho which keeps her dry in the rain.", "The purple stuff is a poncho.", "The purple outfit keeps away rain."], "image": "train2014/COCO_train2014_000000334032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487655, "question_id": "o3TkT6Rh8hCWQN5U6teT3v", "question": "Why is the white van parked in the lot?", "choices": ["to wash", "selling food", "refueling", "to dry"], "correct_choice_idx": 1, "direct_answers": ["selling food", "selling food", "selling food", "food", "selling food", "selling food", "selling tacos", "selling", "food sales", "selling food"], "difficult_direct_answer": false, "rationales": ["The van has a food truck name on the side and side doors so it sells food to people.", "The van has a name on it that refers to food and the side reveals the inner works of a food preparation station.", "The white van is parked to sell food."], "image": "train2014/COCO_train2014_000000487655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466766, "question_id": "o3pYhghva3x4z8zt6B3JVJ", "question": "What are most people gathered around?", "choices": ["table", "bar", "library", "kitchen"], "correct_choice_idx": 1, "direct_answers": ["counter", "counter", "bar", "counter", "convention", "bar", "computers", "bar", "party", "desk"], "difficult_direct_answer": false, "rationales": ["There is a setup that resembles a bar based on the height of the table and most visible people are standing and leaning around it. there are also people inside who appear to be preparing drinks as one would do at a bar.", "The people are gathered around a bar where drinks are usually served", "They are around a bar."], "image": "val2014/COCO_val2014_000000466766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475710, "question_id": "o3phoKkRzcW8gzpyBDn8AN", "question": "Where was this picture likely taken from?", "choices": ["cab", "television", "airplane window", "car"], "correct_choice_idx": 2, "direct_answers": ["inside airplane", "inside airport", "jerusalem", "terminal", "airport", "inside airport", "airport", "airport", "airport", "airplane window"], "difficult_direct_answer": false, "rationales": ["The picture is behind a window and the tarmac is visible with airplanes on it.", "The picture is from a plane.", "The photo is blurry from the plane window."], "image": "val2014/COCO_val2014_000000475710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491272, "question_id": "o3qTSC7ccAUQHdzqGwspEh", "question": "Why did the bike rider bring those two bottles?", "choices": ["urinate", "hydrate", "water grass", "wash up"], "correct_choice_idx": 1, "direct_answers": ["supply hydration", "water", "for water", "thirsty", "thirsty", "hydration", "hydrate", "quench thirst", "drink water", "hydration"], "difficult_direct_answer": false, "rationales": ["Riding a bike for a long time can make you thirsty, so better to bring more than one water bottle.", "The bike is hydrating.", "The bottles are attached in the bike water bottle holders and would be brought by the rider to drink on the go instead of finding something to drink during their ride and stopping. when performing a physical activity like cycling, it is also important to hydrate to prevent sickness or fatigue."], "image": "train2014/COCO_train2014_000000491272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397105, "question_id": "o3rrsXVjKNDicNz7QChFvQ", "question": "What is the man grabbing out of the trees?", "choices": ["balls", "apples", "nuts", "pears"], "correct_choice_idx": 1, "direct_answers": ["apple", "apples", "apple", "apple", "apples", "apples", "apple", "apples", "apples", "apple"], "difficult_direct_answer": false, "rationales": ["There are red fruits in the trees. they are not nuts or pears.", "The man is reaching for the apple that is in the tree.", "There are red fruits in the trees. one of them is on the other man's head."], "image": "train2014/COCO_train2014_000000397105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145161, "question_id": "o3yHQJC6GLoshQjb2oVxYP", "question": "What material is this hat made of?", "choices": ["cardboard", "metal", "nylon", "polyester"], "correct_choice_idx": 0, "direct_answers": ["metal", "foil", "straw", "aluminum", "plastic", "tinfoil", "paper", "aluminum", "cardboard", "foil"], "difficult_direct_answer": false, "rationales": ["The hat is made of cardboard.", "It is silver and shiny", "Her hat is made of spray painted cardboard."], "image": "train2014/COCO_train2014_000000145161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279041, "question_id": "o43hM8BUb3neeg7tK3ndv2", "question": "The wooden item on the bathtub is good for holding what?", "choices": ["flowers", "radio", "rug", "soap"], "correct_choice_idx": 3, "direct_answers": ["towels", "book", "soap", "book", "book", "soap", "soap", "towels", "wine", "books"], "difficult_direct_answer": false, "rationales": ["The item keeps things from going into the bathtub", "Instead of putting the bath items way at the bottom outside the tub, the object would help keep the items within arms reach to wash with.", "A tray is resting on top of a bathtub with each end of the tray resting on the edges of the tub."], "image": "train2014/COCO_train2014_000000279041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518685, "question_id": "o44PpGCPmv6rnzYTLhvf3n", "question": "What is the green umbrella being used to block?", "choices": ["sun", "wind", "bugs", "rain"], "correct_choice_idx": 0, "direct_answers": ["sun", "sunlight", "sun", "sun", "sun", "sun", "sun", "sunlight", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["It's not raining, and c and d can't be stopped with an umbrella.", "The sky is very sunny.", "The green umbrella is being used to provide shade."], "image": "train2014/COCO_train2014_000000518685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531647, "question_id": "o4CJ5sVjmV2uASBJA6yddJ", "question": "What is the purpose of the many black chords?", "choices": ["decoration", "swinging", "climbing", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "power supply", "carry electricity", "electricity", "conduct electricity", "electrical lines", "electricity", "electricity", "electricity", "electricity"], "difficult_direct_answer": false, "rationales": ["The cords provide electricity.", "They all are wires that bring electric to all the homes and businesses.", "You can tell those chords are part of a power grid."], "image": "train2014/COCO_train2014_000000531647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268548, "question_id": "o4LWeK9msGpsuJx6XxumUg", "question": "What part of a beach are the translucent blue objects made from?", "choices": ["sand", "seashells", "water", "seaweed"], "correct_choice_idx": 0, "direct_answers": ["jellyfish", "sea", "sea", "sea glass", "sand", "sand", "sand", "sand", "sand", "sand"], "difficult_direct_answer": false, "rationales": ["The objects are glass by their appearance. glass is made from heated up sand.", "The blue objects are made of sand.", "Sand can be used to make colorful items"], "image": "train2014/COCO_train2014_000000268548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277869, "question_id": "o4PGxNrmQgjH4qNzwYJYtJ", "question": "Who are the bus's passengers?", "choices": ["convicts", "office workers", "small children", "tourist"], "correct_choice_idx": 3, "direct_answers": ["tourist", "children", "human beings", "tourist", "tourists", "travelers", "tourists", "children", "monkeys", "travelers"], "difficult_direct_answer": false, "rationales": ["The bus is full of tourists.", "Most likely, by the writing and adverts on the bus it's a tour bus for tourists.", "A tourist bus is shown with advertisements on the side. tourists ride public buses."], "image": "val2014/COCO_val2014_000000277869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426026, "question_id": "o4SZfLs5SvuXqckqFz7gjM", "question": "During which season are these people enjoying the park?", "choices": ["fall", "summer", "winter", "spring"], "correct_choice_idx": 0, "direct_answers": ["autumn", "summer", "autumn", "fall", "fall", "spring season", "summer", "fall", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["Some trees do not have leaves. the leaves that remain mostly are not green.", "The leaves have fallen off the trees.", "It is more windy in the spring."], "image": "train2014/COCO_train2014_000000426026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566085, "question_id": "o4UTY97aNuk2dXvEDYvXeV", "question": "Which mart is seen in near the taxi?", "choices": ["bcm", "ala", "lo", "abc"], "correct_choice_idx": 3, "direct_answers": ["abc mart", "abc", "abc", "abc", "abc", "abc mart", "abc", "abc", "abc", "abc mart"], "difficult_direct_answer": false, "rationales": ["The mart is abc.", "There is a sign that says \"abc mart\" near the taxi.", "The letters are on the neon sign above the store"], "image": "train2014/COCO_train2014_000000566085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198596, "question_id": "o4ZWufGXR8ooM2F43E3ee9", "question": "Which sponsor appears on the jersey?", "choices": ["mcdonalds", "subway", "pizza hut", "burger king"], "correct_choice_idx": 1, "direct_answers": ["subway", "subway", "subway bangers", "subway", "subway", "subway", "subway", "subway bangers", "subway", "subway"], "difficult_direct_answer": false, "rationales": ["The logo shows about making subs.", "The logo is written on the shirt", "The man is wearing a baseball jersey that has the name of his team, the bangers, and the team's sponsor, subway."], "image": "train2014/COCO_train2014_000000198596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4355, "question_id": "o4aDY8xirVyGS5HNqGCAZJ", "question": "Why is the man wearing a monster helmet?", "choices": ["visibility", "camouflage", "dress code", "for fun"], "correct_choice_idx": 3, "direct_answers": ["protection", "for safety", "protection", "intimidation", "for fun", "motorcycling", "bike", "bike", "photo", "preference"], "difficult_direct_answer": false, "rationales": ["The man wants to be cutesy.", "It's decorative but also a head covering for safety.", "The man wants to be fun."], "image": "val2014/COCO_val2014_000000004355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389480, "question_id": "o4nDKrDfMdGKP3Bmd8PCHZ", "question": "The woman wants to throw the plate to whom?", "choices": ["ranger", "camera man", "self", "mom"], "correct_choice_idx": 1, "direct_answers": ["camera person", "man", "man", "camera man", "photographer", "camera man", "man", "friend", "man", "man"], "difficult_direct_answer": false, "rationales": ["The woman with the plate holding it as if she's going to throw it like a frisbee is looking at the camera person.", "The woman is throwing to the cameraman.", "The woman is motioning to the throw the plate at the camera."], "image": "val2014/COCO_val2014_000000389480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29833, "question_id": "o4ngdKoW35UAvVGXRy6K8g", "question": "The women picked the fruit before it was what?", "choices": ["planted", "ripe", "flowered", "bloomed"], "correct_choice_idx": 1, "direct_answers": ["ripe", "garden", "ripe", "ripe", "ripe", "ripe", "garden", "garden", "ripe", "ripe"], "difficult_direct_answer": false, "rationales": ["The bananas are still green.", "Green fruit is on a vine.", "The fruit is not yellow yet."], "image": "val2014/COCO_val2014_000000029833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349822, "question_id": "o4qwGFWpWhMuxPuw7wc7NB", "question": "What multiple person sport is being played?", "choices": ["cricket", "frisbee", "tennis", "badminton"], "correct_choice_idx": 1, "direct_answers": ["frisbee", "frisbee", "ultimate frisbee", "ultimate frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["He is playing with a frisbee.", "The person is playing frisbee.", "A person is running and grabbing onto a round, plastic item."], "image": "val2014/COCO_val2014_000000349822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73172, "question_id": "o4toVat3DAekwpGaXvfvnf", "question": "What is the rod sticking out of the suitcase used for?", "choices": ["straightening", "radar", "anchoring", "pulling"], "correct_choice_idx": 3, "direct_answers": ["pulling", "pulling", "pulling", "rolling it", "pulling", "pulling", "pulling", "pulling", "to pull", "handle"], "difficult_direct_answer": false, "rationales": ["This is a handle used to pull your suitcase so you don't have to carry it.", "The rod pulls.", "The suitcase is a rolling one."], "image": "train2014/COCO_train2014_000000073172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198506, "question_id": "o4vBexd35d9GeVesbLzYBW", "question": "What will the people standing by the Train do next?", "choices": ["sell candy", "board train", "depart train", "clean train"], "correct_choice_idx": 1, "direct_answers": ["get on", "board", "board", "board train", "getting on", "board train", "board train", "enter train", "get on", "lifting"], "difficult_direct_answer": false, "rationales": ["People are waiting to get on a train. they are traveling because they are holding onto luggage.", "The people will get on the train.", "All the people near the train are going to get on it."], "image": "train2014/COCO_train2014_000000198506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206397, "question_id": "o574nSEqxiG7zGaxUEQBdb", "question": "What is a process that is related to these animals?", "choices": ["nuclear fusion", "shearing", "soaring", "photosynthesis"], "correct_choice_idx": 1, "direct_answers": ["sheering", "shearing", "shearing wool", "shearing", "wool", "shedding", "wool", "shearing", "sheer", "shearing"], "difficult_direct_answer": false, "rationales": ["Sheep get sheared for their wool when it gets very long.", "The fur gets cut off.", "The process is shearing."], "image": "train2014/COCO_train2014_000000206397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160229, "question_id": "o58doYnMqYSZLZ7X9eDwXg", "question": "Performing skating around a straight line of equally spaced cones is called?", "choices": ["free line", "slalom", "out line", "in line"], "correct_choice_idx": 1, "direct_answers": ["weaving", "slalom skating", "slalom", "freestyle", "freestyle slalom", "slalom skating", "agility training", "slalom", "slalom", "freestyle slalom"], "difficult_direct_answer": false, "rationales": ["Skating in a line of cones is called slalom.", "The man is skating fast between sets of cones in a line called a slalom.", "Orange cones are set up in a straight line and people skateboard through them."], "image": "train2014/COCO_train2014_000000160229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16977, "question_id": "o58fHweTF9k8RYhY2T2Kxd", "question": "Which car violates the law?", "choices": ["black car", "green car", "red car", "silver car"], "correct_choice_idx": 0, "direct_answers": ["black car", "black car", "suv", "black truck", "car", "right side", "black toyota", "black one", "black car", "suv"], "difficult_direct_answer": false, "rationales": ["The black car can't park behind the gray one.", "The black car is not well parked and is parked beyond the yellow parking slot.", "The vehicle is on the side walk and past the yellow line indicating where to park."], "image": "val2014/COCO_val2014_000000016977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71603, "question_id": "o5HWJaLBnponeMw5dYj7z3", "question": "Why are the the two riders wearing helmets?", "choices": ["identification", "protect heads", "fashion", "incognito"], "correct_choice_idx": 1, "direct_answers": ["safety", "safety", "safety", "for protection", "head protection", "safety", "for protection", "protect heads", "protection", "head protection"], "difficult_direct_answer": false, "rationales": ["The helmets can provide cushioning for their heads so they do not crack.", "The people want to protect their heads.", "They want protection."], "image": "train2014/COCO_train2014_000000071603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51167, "question_id": "o5NAMQzn2R6o9AD8rs3PML", "question": "The symbol on the top right of the bus means this bus is equipped with what?", "choices": ["braille writing", "wheelchair ramp", "wheelchair lift", "attending nurses"], "correct_choice_idx": 2, "direct_answers": ["handicap services", "electricity", "wheelchair ramp", "wheelchair access", "wheelchair access", "handicap accessibility", "ramp", "wheelchair lift", "handicapped exits", "handicap ramp"], "difficult_direct_answer": true, "rationales": ["The image contains a drawn person seated onto what appears to be a wheelchair.", "The symbol is for wheelchairs.", "There is a handicapped sign on the bus so you guess there is a wheelchair lift for them"], "image": "val2014/COCO_val2014_000000051167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554301, "question_id": "o5NUCqrg8R3LbjxKWjWJx4", "question": "Who is the man playing tennis with?", "choices": ["no one", "singles partner", "intern", "doubles partner"], "correct_choice_idx": 0, "direct_answers": ["no one", "no one", "no one", "tennis-ball machine", "can't see", "machine", "friend", "himself", "friend", "racket"], "difficult_direct_answer": false, "rationales": ["He is practicing.", "He is practicing by himself.", "He is hitting balls alone."], "image": "train2014/COCO_train2014_000000554301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446917, "question_id": "o5SNW9bSmhrZvjTsc8Yup5", "question": "What mode of transportation is being utilized here?", "choices": ["unicycle", "bicycle", "motor cycle", "car"], "correct_choice_idx": 1, "direct_answers": ["bicycle", "bicycle", "bicycle", "bike", "bike", "bicycle", "bicycle", "bicycle", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["It's a bicycle.", "A bike is shown.", "The man is on top of a bike in as city environment."], "image": "val2014/COCO_val2014_000000446917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233341, "question_id": "o5byaDbvCe7cqZ6jLstbF2", "question": "Solingen HBF railways in?", "choices": ["germany", "italy", "france", "canada"], "correct_choice_idx": 0, "direct_answers": ["germany", "new york", "germany", "germany", "trains", "germany", "germany", "germany", "new york", "germany"], "difficult_direct_answer": false, "rationales": ["Solingen is a german brand.", "This is a known location found in germany.", "This company is located in germany"], "image": "train2014/COCO_train2014_000000233341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9148, "question_id": "o5jzoGzFyz2vMVihbz6wnV", "question": "The truck most likely transports what kind of goods?", "choices": ["fruits", "oil", "trees", "rubber"], "correct_choice_idx": 0, "direct_answers": ["agricultural", "food products", "bananas", "fruits", "food", "fish", "cars", "fruit", "loggs", "fruits"], "difficult_direct_answer": true, "rationales": ["The truck is carrying fruit.", "Bananas are visible in its bed.", "There are bananas visible on the bottom of the truck. bananas are in the category of answer a."], "image": "train2014/COCO_train2014_000000009148.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220355, "question_id": "o6JDhwbod9QnsdLV54zZTa", "question": "This layer of icing is known as the what?", "choices": ["crumb coat", "ugly coat", "sugar coat", "final coat"], "correct_choice_idx": 0, "direct_answers": ["bottom", "cake", "crumb coat", "top layer", "primer", "frosting", "frosting", "base", "buttercream", "base"], "difficult_direct_answer": false, "rationales": ["The layer keeps the crumbs of the cake in place because the final layer of icing is done.", "The coating provided a surface onto which crumbs can be applied at the end of the manufacturing process.", "It's a crumb coat."], "image": "train2014/COCO_train2014_000000220355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551284, "question_id": "o6JcFweSCanhVwcyWKYtfb", "question": "What is he doing with his finger?", "choices": ["selecting speed", "mixing drink", "cleaning appliance", "moving appliance"], "correct_choice_idx": 0, "direct_answers": ["flicking switch", "selecting speed", "dialing blender", "pressing button", "operating blender", "turning on", "changing settings", "switching", "pushing button", "pushing controls"], "difficult_direct_answer": true, "rationales": ["The man is ready to touch the button on the blender and that's what the buttons do.", "The man is pressing a button.", "A person is his finger out to press a button on a blender."], "image": "train2014/COCO_train2014_000000551284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338134, "question_id": "o6Lf4auydY8yocFG3EmLkh", "question": "What is the cat resting inside?", "choices": ["planter", "vase", "bird bath", "saucer"], "correct_choice_idx": 0, "direct_answers": ["pot", "tired", "pot", "pot", "planter", "plant pot", "flower pot", "planter", "planter pot", "flower pot"], "difficult_direct_answer": false, "rationales": ["The cat is in the planter.", "These are made out of clay to put vegetation in to grow.", "The cat rest in a planter used to grow plants."], "image": "train2014/COCO_train2014_000000338134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172616, "question_id": "o6RxXGCiCxGtQmLVVsYbxh", "question": "To what does the two wheels most likely belong?", "choices": ["stroller", "motorcycle", "car", "skateboard"], "correct_choice_idx": 0, "direct_answers": ["child", "bicycle", "stroller", "stroller", "cart", "child", "tricycle", "stroller", "bike", "stroller"], "difficult_direct_answer": false, "rationales": ["Even though you can only see the end of it, it has a stroller shape to it. it looks like the end where you put their feet or a bag at.", "The wheels are from the stroller.", "It has at least two wheels that are about 18\" in diameter. above the wheels is a metal tray with metal slats as the bottom."], "image": "val2014/COCO_val2014_000000172616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105293, "question_id": "o6cv7NsHhgcxENmrdKFcin", "question": "Which bowl's contents would be most dry to the touch?", "choices": ["middle bowl", "bottom bowl", "top bowl", "glass cup"], "correct_choice_idx": 0, "direct_answers": ["bread", "carrot", "rolls", "middle", "square bowl", "glass dish", "middle bowl", "bread", "glass", "middle"], "difficult_direct_answer": false, "rationales": ["The other bowls have broth in them.", "A bowl with soup is in front of a dish with fried items. soup is made of liquid.", "The middle bowl has dried baked goods."], "image": "train2014/COCO_train2014_000000105293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553719, "question_id": "o7H9dE8niqBqYG4rtojj5s", "question": "What is the furniture the cat laying on?", "choices": ["table", "bed", "couch", "chair"], "correct_choice_idx": 2, "direct_answers": ["sofa", "sofa", "couch", "sofa", "sofa", "couch", "couch", "sofa", "sofa", "couch"], "difficult_direct_answer": false, "rationales": ["The furniture is a couch.", "The cat is on a couch.", "The cat is sitting on furniture with two cushions."], "image": "train2014/COCO_train2014_000000553719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149556, "question_id": "o7KubffWrVw6DVcJjdejnx", "question": "What would these devices normally be found resting on?", "choices": ["cushion", "rug", "carpet", "mat"], "correct_choice_idx": 3, "direct_answers": ["desktop", "mat", "desk", "desk", "mousepad", "mouse pad", "mousepad", "desk", "desk", "mouse pad"], "difficult_direct_answer": false, "rationales": ["The devices are on a mat.", "Computer mice are often on top of mouse pads.", "Normally you would put these on a mouse pad so they move smoother when you are using them."], "image": "train2014/COCO_train2014_000000149556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242631, "question_id": "o7YyFzr4kNujDEQp5KojWH", "question": "What is the person who is aloft attempting to do with the frisbee?", "choices": ["catch it", "judge it", "read it", "throw it"], "correct_choice_idx": 0, "direct_answers": ["catch", "catch it", "catch", "catch it", "catch", "catch it", "catch", "catch it", "catch", "catch"], "difficult_direct_answer": false, "rationales": ["He's attempting to grab the disc flying in his direction.", "He is reaching out to grab the disc as it flies towards him.", "The kid is trying to catch the frisbee in his hands."], "image": "train2014/COCO_train2014_000000242631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152994, "question_id": "o7eA7tBGinHan48Nc348ib", "question": "What type of productivity work is being done on the white monitor connected to the laptop?", "choices": ["answering e-mails", "coding", "spreadsheet calculation", "data entry"], "correct_choice_idx": 0, "direct_answers": ["editing", "answering e-mails", "email", "email", "email", "editing", "email", "email", "email", "editing"], "difficult_direct_answer": false, "rationales": ["They are using a program to respond to emails.", "The gmail program is open on the screen", "The white screen owner has their email open."], "image": "train2014/COCO_train2014_000000152994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496128, "question_id": "o7q6A4KzNCM9eQDPyqmxrL", "question": "What beverage is the man drinking?", "choices": ["malt", "ale", "iced tea", "soda"], "correct_choice_idx": 1, "direct_answers": ["beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer", "ale", "beer"], "difficult_direct_answer": false, "rationales": ["The beverage is ale.", "The man is drinking beer.", "The drink is a dark color with some foam at the top and it is in a drinking glass."], "image": "train2014/COCO_train2014_000000496128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214388, "question_id": "o82FXtPP6G7Qme2hgGV6dL", "question": "What is on the plate?", "choices": ["spoon", "salmon", "fork", "pizza"], "correct_choice_idx": 2, "direct_answers": ["egg toast", "food", "fork", "food", "pie", "bread", "breakfast", "breakfast sandwich", "egg", "egg"], "difficult_direct_answer": false, "rationales": ["The food is not pizza nor salmon, but appears to be a fried egg on toast. although there is a knife on the plate, there is no spoon which leaves the only other piece of silverware as the proper answer.", "There is a knife, food and a fork on the plate.", "There is a fork, knife and food."], "image": "val2014/COCO_val2014_000000214388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477807, "question_id": "o8Ls8MFoCTPca4gsRtsVwL", "question": "The dogs face danger of falling off if the rider does what?", "choices": ["stops", "speeds", "yells", "sings"], "correct_choice_idx": 1, "direct_answers": ["hits bump", "speeds", "crashes", "crash", "stops abruptly", "swerve", "turns", "drives fast", "drives fast", "starts moving"], "difficult_direct_answer": true, "rationales": ["The dog is exposed.", "If the man goes to fast they could get hurt.", "There seems to be no safety harness to keep the dogs from falling out of the seats."], "image": "train2014/COCO_train2014_000000477807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505188, "question_id": "o8RMqw7VoHTETMe7y4fmFV", "question": "What is the game controller called?", "choices": ["stick", "game pad", "joystick", "joy pad"], "correct_choice_idx": 2, "direct_answers": ["playstation", "ps4", "playstation", "playstation", "playstation", "joystick", "playstation", "playstation controller", "remote", "playstation"], "difficult_direct_answer": false, "rationales": ["The controller is multiple buttons on an elongated platform held by both hands.", "The controller is for a nintendo.", "The black device allows users to control the game."], "image": "train2014/COCO_train2014_000000505188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407042, "question_id": "o8gq2rUH8jB4hXQG35jE2D", "question": "Why are they holding stuffed animals?", "choices": ["for sale", "are toddlers", "are confused", "stole them"], "correct_choice_idx": 1, "direct_answers": ["fair", "are toddlers", "winners", "prize", "comfort", "they won", "teddy", "prizes", "fair", "prize"], "difficult_direct_answer": false, "rationales": ["Kids like to play with stuffed animals.", "Kids love stuffed animals.", "They are kids and these are toys"], "image": "val2014/COCO_val2014_000000407042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382854, "question_id": "o8hsQtux9cYUvmCf3XkZWQ", "question": "What sport is the cartoon dog playing?", "choices": ["baseball", "ice hockey", "golf", "lacrosse"], "correct_choice_idx": 1, "direct_answers": ["ice hockey", "hockey", "hockey", "hockey", "hockey", "hockey", "ice hockey", "hockey", "ice hockey", "ice hockey"], "difficult_direct_answer": false, "rationales": ["The team is niagara ice dogs.", "The dog is holding a hockey stick and has what looks like ice skates and a hockey jersey on. one wearing skates, a hockey jersey and a hockey stick is likely to be playing hockey.", "The only sport that is played on ice is hockey."], "image": "train2014/COCO_train2014_000000382854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8787, "question_id": "o8sXyJBUQyJLTVFRpxudcn", "question": "What type of writing is on the can?", "choices": ["directional", "informational", "regulatory", "graffiti"], "correct_choice_idx": 3, "direct_answers": ["graffiti", "grafitti", "graffiti", "graffiti", "grafitti", "graffiti", "graffiti", "graffiti", "graffiti", "graffiti"], "difficult_direct_answer": false, "rationales": ["The writing is graffiti sprayed on the can.", "The can has irregular writing made of black paint.", "There is a doodle on the trash can."], "image": "val2014/COCO_val2014_000000008787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437199, "question_id": "o95zm5a2KecRciFQBqNSsq", "question": "What type fried vegetable is shown here?", "choices": ["tomato", "radish", "celery", "potato"], "correct_choice_idx": 3, "direct_answers": ["potatoe", "potato", "potato", "french fries", "potatoes", "fries", "potato", "french fries", "potato", "potato"], "difficult_direct_answer": false, "rationales": ["The veggie is a potato.", "French fries are made from potatoes", "French fries are lightly-browned skinny sticks of potato which have been fried. they are frequently eaten along with a sandwich."], "image": "train2014/COCO_train2014_000000437199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176891, "question_id": "o9PrXYQhJdXmeYFoZgSbhB", "question": "What kind of food eater is the animal?", "choices": ["carnivore", "omnivore", "herbivore", "photosynthesis"], "correct_choice_idx": 2, "direct_answers": ["grass", "herbivore", "plant eater", "herbivore", "herbivore", "herbivore", "hay", "herbivore", "herbivore", "herbivore"], "difficult_direct_answer": false, "rationales": ["Cows eat grass.", "It's a herbivore.", "Cows eat grass and hay and other plants so they would be a herbivore."], "image": "val2014/COCO_val2014_000000176891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575882, "question_id": "o9YCHCGga8sNaMPN6px5dp", "question": "Why is there a chord connected to the device the man is using?", "choices": ["to whip", "to secure", "to charge", "to hold"], "correct_choice_idx": 2, "direct_answers": ["power", "charging", "charging", "electricity", "charging", "for power", "charging", "to charge", "charging", "ethernet cord"], "difficult_direct_answer": false, "rationales": ["The computer needs power to run", "The cord is charging.", "There is a cord connected to the device the man is using to charge its battery."], "image": "val2014/COCO_val2014_000000575882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36034, "question_id": "oA4525dyC6PwXcAaYcE2TF", "question": "Boston Cream Doughnut covered with what?", "choices": ["chocolate", "scotch", "wine", "sugar"], "correct_choice_idx": 0, "direct_answers": ["chocolate", "chocolate", "chocolate", "glaze", "chocolate sauce", "glaze", "pastry cream", "icing", "cream", "glaze"], "difficult_direct_answer": false, "rationales": ["The answer is commonly known to be the topping for a boston cream doughnut.", "The donut has chocolate on it.", "Chocolate is the frosting for a boston cream donut."], "image": "train2014/COCO_train2014_000000036034.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78762, "question_id": "oAJ4wfDcBPkGi2XXPoKd5j", "question": "How many kind of kite shapes available?", "choices": ["eight", "three", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["thirteen", "three", "one", "eight", "one", "five", "one kind", "two", "twelve", "one"], "difficult_direct_answer": false, "rationales": ["There are eight kites in the sky.", "There are eight shapes.", "There are 8 major types of kite shapes."], "image": "train2014/COCO_train2014_000000078762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573549, "question_id": "oAMs2gFRDdqzLdeHajN4gf", "question": "Why do they impersonate females?", "choices": ["avoid police", "confused", "disguise", "money"], "correct_choice_idx": 3, "direct_answers": ["fun", "money", "entertainment", "comedy", "performance", "drag queens", "hobby", "its fun", "entertainment", "show"], "difficult_direct_answer": true, "rationales": ["The sign is at a store.", "The location's signage reveals that it is a place for comedy. darcelle xv, who runs a drag show, is also mentioned in the signage.", "They want money."], "image": "val2014/COCO_val2014_000000573549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294716, "question_id": "oAQcSfcFyV5EQ3ruxW268U", "question": "What type beverage is the woman enjoying with her food?", "choices": ["milk", "ale", "cola", "tea"], "correct_choice_idx": 1, "direct_answers": ["ale", "wine", "beer", "ale", "dark beer", "beer", "beer", "wine", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["The woman is holding a beer glass.", "The text on the glass refers to guinness. this company makes alcoholic beverages.", "It's in a guiness glass which is a type of beer."], "image": "train2014/COCO_train2014_000000294716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426031, "question_id": "oAd69pEN9DbaLZU48koEEy", "question": "What handedness does this person have?", "choices": ["left", "right", "none", "ambidextrous"], "correct_choice_idx": 1, "direct_answers": ["right", "right", "right", "righthand", "left", "right handed", "left hand", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["A person is wearing a baseball glove on their left hand.", "The boy is reaching out with his left.", "The glove is on his left hand. the dominant one is the one he will throw with."], "image": "train2014/COCO_train2014_000000426031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 105110, "question_id": "oAiAYGHU4MQKzoRbtnzp3n", "question": "This frisbee is how many grams?", "choices": ["200", "300", "160", "250"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "three", "many grams", "30", "160", "twenty seven", "ten", "100 grams", "100"], "difficult_direct_answer": true, "rationales": ["Frisbees have a standard weight of 160 grams.", "Frisbees are really light so they can fly in the air.", "The frisbee looks lightweight."], "image": "train2014/COCO_train2014_000000105110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411266, "question_id": "oApZJMHbpfKh8s4X4goTDD", "question": "What type of transportation is this?", "choices": ["water", "road", "rail", "air"], "correct_choice_idx": 3, "direct_answers": ["plane", "plane", "air", "air transport", "airplane", "airplane", "airplane", "airplane", "plane", "airplane"], "difficult_direct_answer": false, "rationales": ["Specifically, this is an airplane.", "The man is getting in to a plane that he will use as transporation through the air.", "The helicopter will fly off."], "image": "train2014/COCO_train2014_000000411266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285478, "question_id": "oAq2gtFxkes3X6BfcyKeUP", "question": "What is the opposite form of this pitch?", "choices": ["lefthand", "sidehand", "underhand", "overhand"], "correct_choice_idx": 2, "direct_answers": ["underhand", "underhand", "underhand", "batting", "screwball", "underhand", "curveball", "underhand", "right", "underhand"], "difficult_direct_answer": false, "rationales": ["Most all pitchers in baseball throw overhand, so the opposite is under.", "The current pitch is over the pitcher's head, meaning the opposite would be this.", "The arm is over the head."], "image": "train2014/COCO_train2014_000000285478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460728, "question_id": "oAqmRzXYNYGCkPCnRAbW6L", "question": "Which side of the bus can people enter through?", "choices": ["driver side", "back side", "passenger side", "top side"], "correct_choice_idx": 0, "direct_answers": ["driver side", "passenger sure", "left", "right", "left", "left", "left", "right", "left", "left"], "difficult_direct_answer": false, "rationales": ["The bus can accept people for entry at the driver side.", "On the bus there is a door on the drivers side here.", "The side is the driver side."], "image": "train2014/COCO_train2014_000000460728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267175, "question_id": "oBAeYNojVkH7jKqRcLnZMM", "question": "What is on the desk?", "choices": ["apple", "computer", "cat", "bird"], "correct_choice_idx": 1, "direct_answers": ["monitors", "keyboard", "computer", "computer", "computer", "computers", "computer", "computer screens", "computer", "computer monitors"], "difficult_direct_answer": false, "rationales": ["Two monitors and a keyboard are on the desk.", "It is a desktop model with dual screen monitors.", "There are computers on the desk."], "image": "val2014/COCO_val2014_000000267175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158133, "question_id": "oBBmHeFpNthqS4ceSbLGZX", "question": "What is the snow ramp being used for?", "choices": ["sitting", "jumps", "archway", "shipping"], "correct_choice_idx": 1, "direct_answers": ["scatting", "skating", "hitting jump", "snowboarding", "jumps", "tricks", "jumps", "snowboarding", "jumping", "ski jumping"], "difficult_direct_answer": false, "rationales": ["They are using it to jump.", "People use it to gain speed to jump.", "The snow ramp can be jumped from."], "image": "val2014/COCO_val2014_000000158133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320929, "question_id": "oBBprK79A26WLjvnwrpGKC", "question": "The brown blocks came from what type of plant?", "choices": ["lilies", "tree", "bamboo", "violets"], "correct_choice_idx": 1, "direct_answers": ["playing items", "tree", "tree", "tree", "playing items", "tree", "tree", "tree", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["The blocks are wooden.", "The texture and color of these blocks let's us conclude they're made of wood. wood comes from trees.", "The brown blocks are made of wood."], "image": "train2014/COCO_train2014_000000320929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420065, "question_id": "oBZqE2MKaw93YabAiGFrj8", "question": "Why did this couple take a break?", "choices": ["hunger", "bathroom", "work rule", "thirst"], "correct_choice_idx": 0, "direct_answers": ["to eat", "hungry", "to eat", "hunger", "to eat", "hungry", "hungry", "to eat", "to eat", "to eat"], "difficult_direct_answer": false, "rationales": ["The couple is hungry.", "These people are eating food. they are not drinking or going to the bathroom.", "They're hungry."], "image": "train2014/COCO_train2014_000000420065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300631, "question_id": "oCCbE9eNtdia9zheYNnagd", "question": "What is the man replacing on the tire?", "choices": ["brakes", "rim", "tube", "spokes"], "correct_choice_idx": 2, "direct_answers": ["inner tubes", "lining", "new tire", "wheel", "rubber", "tube", "inner tube", "tube", "tube", "tire tube"], "difficult_direct_answer": false, "rationales": ["The man has an inflatable tube.", "The man is replacing the inner tube of a bike tire.", "A man is crouched near a bike with a tire tube on the ground next to him."], "image": "train2014/COCO_train2014_000000300631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47263, "question_id": "oCCqKvJGNABrcvoi6YZ9zy", "question": "Why is he standing next to the truck?", "choices": ["is talking", "selling candy", "is lost", "robbing truck"], "correct_choice_idx": 0, "direct_answers": ["is talking", "talking", "talking", "is dog", "helping", "talking", "talking", "talking", "talking", "giving directions"], "difficult_direct_answer": false, "rationales": ["A man is standing next to a vehicle in the street and the arm of the driver is casually hanging out of the window. the man standing is facing the driver and they are conversationally close.", "The man is talking to the driver.", "He is talking."], "image": "val2014/COCO_val2014_000000047263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295124, "question_id": "oCKeJsfANfsX2F4TS99YsC", "question": "What are they doing?", "choices": ["racing", "sliding", "chasing someone", "fighting"], "correct_choice_idx": 0, "direct_answers": ["racing", "skiing", "racing", "skiing", "skiing", "skiing", "skiing", "cross country", "cross-country skiing", "skiing"], "difficult_direct_answer": false, "rationales": ["They're racing.", "The are wearing numbers that indicate which racer they are.", "The people are wearing numbered pinnies."], "image": "val2014/COCO_val2014_000000295124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387401, "question_id": "oCPU9Taho2Kr44hodYbQer", "question": "What make of vehicle is following the bus?", "choices": ["kia", "mazda", "nissan", "hyundai"], "correct_choice_idx": 3, "direct_answers": ["ford", "volvo", "mercedes", "marcopolo", "honda", "volvo", "marcazzio", "hyundai", "hyundai", "volvo"], "difficult_direct_answer": false, "rationales": ["A small white sedan is following a bus. the emblem has a h on the front grill.", "A hyundai is traveling.", "The car manufacturer is hyundai."], "image": "train2014/COCO_train2014_000000387401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449402, "question_id": "oCZHGWQD39SJJGJ4wGs2wj", "question": "What is he doing?", "choices": ["phone conversation", "watching enemies", "showing off", "ordering dinner"], "correct_choice_idx": 0, "direct_answers": ["talking", "phone conversation", "talking", "calling someone", "on phone", "on phone", "phonecall", "using phone", "phone call", "phone call"], "difficult_direct_answer": false, "rationales": ["The man is holding a portable electronic device up to his ear and mouth.", "The man is on the phone.", "Since the man is clearly holding a cellphone up to his ear, we can surmise that he's in the process of using a cellphone."], "image": "val2014/COCO_val2014_000000449402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250955, "question_id": "oCcRhCquSpAivqzy8dYTGK", "question": "When is the type of meal above favorable to be served?", "choices": ["afternoon", "supper", "breakfast", "lunch"], "correct_choice_idx": 2, "direct_answers": ["after dinner", "breakfast", "morning", "breakfast", "morning", "breakfast", "date", "breakfast", "breakfast", "breakfast"], "difficult_direct_answer": false, "rationales": ["Croiscants would suggest it is served in the morning.", "The meal being served is breakfast because you have coffee, toast and eggs on the table", "There are eggs and toast"], "image": "train2014/COCO_train2014_000000250955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375304, "question_id": "oDDmujqLqpHEeBgmFwU8sg", "question": "What is the brand of the skis?", "choices": ["hart", "salomon", "fischer", "nordica"], "correct_choice_idx": 1, "direct_answers": ["salomon", "aroma", "salomon", "salman", "salomon", "shalom", "salomon", "salman", "salome", "salomon"], "difficult_direct_answer": false, "rationales": ["The brand name is show on both skis.", "The brand is listed on the bottom of each ski.", "Name of the brand can be found written in black. it starts with a \"s\" letter."], "image": "train2014/COCO_train2014_000000375304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359525, "question_id": "oDGzGWNTMHqrTGFX56V425", "question": "What is the Smucker's product replacing?", "choices": ["corn syrup", "date syrup", "maple syrup", "agave nectar"], "correct_choice_idx": 2, "direct_answers": ["maple syrup", "syrup", "maple syrup", "maple syrup", "maple syrup", "syrup", "maple syrup", "syrup", "syrup", "maple syrup"], "difficult_direct_answer": false, "rationales": ["The bottle is by pancakes to pour onto.", "A smucker's bottle is on a table behind a plate of pancakes. pancakes are served with syrup.", "A sugar free version of syrup is behind a plate of pancakes."], "image": "val2014/COCO_val2014_000000359525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267064, "question_id": "oDHzXJC7e2y3fgKsncpNak", "question": "What type of location is this?", "choices": ["foyer", "summit", "finish line", "intersection"], "correct_choice_idx": 3, "direct_answers": ["downtown", "street", "intersection", "intersection", "intersection", "downtown", "intersection", "intersection", "city", "city"], "difficult_direct_answer": false, "rationales": ["There are traffic lights and 2 roads meeting, which is called an intersection.", "This location is where the streets come together and cross. it is on the corner of a city block with traffic lights to guide drivers to get through the area.", "Two roads converge with crosswalks and traffic lights present."], "image": "train2014/COCO_train2014_000000267064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133090, "question_id": "oDNnv2uy9FhiNb66eK8nu5", "question": "What is he doing?", "choices": ["falling", "sowing", "bouncing", "following through"], "correct_choice_idx": 3, "direct_answers": ["playing tennis", "stumbling", "playing tennis", "falling", "playing tennis", "serving", "following through", "falling", "playing tennis", "running"], "difficult_direct_answer": false, "rationales": ["The man is following through on his serve.", "The man wants to make a complete follow through.", "A person is on a tennis court leaning forward with a tennis racket in his hand."], "image": "val2014/COCO_val2014_000000133090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422669, "question_id": "oDPMkZdveK8HWL5hTiRbn9", "question": "What state is this van from?", "choices": ["new york", "new jersey", "montana", "ohio"], "correct_choice_idx": 3, "direct_answers": ["ohio", "new", "ohio", "new", "ohio", "new", "ohio", "ohio", "ohio", "ohio"], "difficult_direct_answer": false, "rationales": ["The truck is from cleveland.", "It has cleveland as the name", "The logo on the side of the van indicates that it belongs to cleveland ems. cleveland is not in new york, new jersey, or montana."], "image": "train2014/COCO_train2014_000000422669.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530683, "question_id": "oDdcv4NZpLSs8g88fqLptd", "question": "Why is the dog in the basket?", "choices": ["taking home", "for sale", "is stolen", "keep safe"], "correct_choice_idx": 3, "direct_answers": ["can't cycle", "transportation", "free ride", "can't run", "being transported", "transportation", "keep safe", "riding", "transportation", "transportation"], "difficult_direct_answer": false, "rationales": ["A girl rides on a busy street with a dog in the basket. dogs can get hit by cars.", "The dog is being kept safe.", "Dogs would not be safe to run around on their own in this environment because of the cars. the dog is in a basket on a bike and likely has been removed from the ground in order to keep it safe and contained."], "image": "train2014/COCO_train2014_000000530683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345114, "question_id": "oDeq6ZwZHdY4PMY4bgWGMN", "question": "What is the woman packing her luggage in?", "choices": ["duffle bag", "suitcase", "backpack", "car"], "correct_choice_idx": 1, "direct_answers": ["suitcase", "suitcase", "clothes", "clothes", "clothes", "suitcase", "clothes", "suitcase", "clothes", "suitcase"], "difficult_direct_answer": false, "rationales": ["A woman sits near a large, square object filled with clothes that has a zipper around it.", "The woman's stuff is in a suitcase.", "The container is cloth and has a handle on it."], "image": "train2014/COCO_train2014_000000345114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424821, "question_id": "oDoWaAcstZQSW34giTq5pZ", "question": "Which one of these goals would he love to achieve?", "choices": ["strike out", "home run", "foul", "ejection"], "correct_choice_idx": 1, "direct_answers": ["hit baseball", "home run", "home run", "home run", "hit ball", "homerun", "homerun", "home run", "home run", "hit baseball"], "difficult_direct_answer": false, "rationales": ["The person wants to get a home run.", "The boy wants to hit a home run.", "Every baseball player steps up to bat and hopes for a slam hit."], "image": "train2014/COCO_train2014_000000424821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348400, "question_id": "oDpCiEg5ZbNvXmjRXvcLFy", "question": "Where will this person who holds a pink umbrella go to next?", "choices": ["bus stop", "taxi trunk", "taxi hood", "mexico"], "correct_choice_idx": 1, "direct_answers": ["airport", "hotel", "taxi", "hotel", "hotel", "inside cab", "house", "in building", "taxi trunk", "airport"], "difficult_direct_answer": false, "rationales": ["The person is going to put away their luggage.", "The person has a suitcase and is trying to get in a car.", "They are putting their suitcase in the trunk."], "image": "val2014/COCO_val2014_000000348400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439765, "question_id": "oEFn6oYZbvbAVLnKHUPCo4", "question": "What activity is the individual engaging in?", "choices": ["washing clothes", "steaming", "tie folding", "folding clothes"], "correct_choice_idx": 2, "direct_answers": ["playing", "sitting", "comparing ties", "entwining ties", "tie folding", "tying tie", "tying knot", "watching television", "comparing ties", "tie tying"], "difficult_direct_answer": true, "rationales": ["He has his tie and the other man's tie in his hands.", "The guy is folding a tie.", "He is folding his neck tie."], "image": "train2014/COCO_train2014_000000439765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60387, "question_id": "oEFrqZ5EfxbyfsbVn3Fywu", "question": "What are people doing?", "choices": ["waiting", "smoking", "drinking", "eating"], "correct_choice_idx": 0, "direct_answers": ["waiting", "crossing", "waiting", "waiting", "waiting", "walking", "waiting", "pedestrian traffic", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["There are people standing at the traffic light, waiting to cross the street.", "The people are waiting for a bus.", "The people are waiting."], "image": "train2014/COCO_train2014_000000060387.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500568, "question_id": "oEk4YiAGFwFKNcxXaZgAJT", "question": "What's connected to the back of the chair?", "choices": ["blanket", "metal rod", "ribbon", "tv"], "correct_choice_idx": 1, "direct_answers": ["iron poles", "support rod", "stand", "metal", "connector", "metal rod", "pole", "metal pole", "metal", "metal piece"], "difficult_direct_answer": true, "rationales": ["The pole is there to keep the chair from moving.", "A metal piece is connected to the back of a chair in an outdoor area.", "A metal rod is connected."], "image": "val2014/COCO_val2014_000000500568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87920, "question_id": "oF4CALSZFoHSYMA5epGQQ3", "question": "What is the woman on the bike using?", "choices": ["towel", "spray bottle", "helmet", "cellphone"], "correct_choice_idx": 3, "direct_answers": ["cellphone", "phone", "phone", "phone", "cellphone", "cellphone", "phone", "cell phone", "cell phone", "cell phone"], "difficult_direct_answer": false, "rationales": ["The woman is talking on a phone.", "A woman on a bike is holding a phone up to her ear. people talk on the phone while doing all kinds of things.", "She has something in her hand and is holding it to her ear."], "image": "val2014/COCO_val2014_000000087920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564589, "question_id": "oF8UWAFUVuMdWzKWG9fcMM", "question": "How many adults are there in picture?", "choices": ["eight", "five", "nine", "two"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "nine", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are nine adults near the bus stop.", "The only adults are the old man standing on the left side of the group of kids.", "There are two older males to the left. the rest of the people are are teenagers."], "image": "train2014/COCO_train2014_000000564589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253971, "question_id": "oFN3T4Dj2ut98z89ohcEzQ", "question": "What does the kid use the yellow object for?", "choices": ["surfing", "floaty", "paddling", "canoeing"], "correct_choice_idx": 0, "direct_answers": ["body surf", "boogie boarding", "surfing", "surf", "rescue", "rescuing surfers", "surfing", "standing", "standing", "surfing"], "difficult_direct_answer": false, "rationales": ["There is a surf rescue sign on the beach.", "The board is for rescue.", "The kid uses a surfboard."], "image": "train2014/COCO_train2014_000000253971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500664, "question_id": "oFrU7dkTDeTHdFeBxhi53o", "question": "What type of surface can be found past the rock wall to the right of the road?", "choices": ["gravel", "water", "sand", "grass"], "correct_choice_idx": 1, "direct_answers": ["asphalt", "water", "water", "water", "water", "asphalt", "water", "water", "asphalt", "water"], "difficult_direct_answer": false, "rationales": ["The surface has water.", "There is a body of water beyond the rock wall.", "There is a rock wall behind the road to separate it from a body of water."], "image": "train2014/COCO_train2014_000000500664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273579, "question_id": "oFwMVcAqRasvTohmkDHfi8", "question": "What is the elephant doing?", "choices": ["drinking", "painting picture", "eating lunch", "resting"], "correct_choice_idx": 1, "direct_answers": ["painting", "painting", "painting", "painting", "painting", "painting", "painting picture", "painting", "painting", "painting"], "difficult_direct_answer": false, "rationales": ["The elephant is painting on a canvas.", "The elephant is standing in front of an easel an holding a paintbrush with its trunk.", "The elephant paints."], "image": "val2014/COCO_val2014_000000273579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188624, "question_id": "oG7c5YfDc4hEhAECYtDWK6", "question": "In what venue is the skateboarder practicing his tricks?", "choices": ["garage", "sidewalk", "public park", "schoolyard"], "correct_choice_idx": 2, "direct_answers": ["skate park", "park", "pavement", "park", "park", "park", "pavement", "park", "public park", "park"], "difficult_direct_answer": false, "rationales": ["The man is in a park.", "There are people casually sitting and watching a fountain.", "There are fountains, flower beds and people relaxing on benches nearby."], "image": "val2014/COCO_val2014_000000188624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490708, "question_id": "oG9zL2qzfDN4y2EcuCFeeB", "question": "Which type of meat does the animals above provide?", "choices": ["pork", "mutton", "none", "chevon"], "correct_choice_idx": 1, "direct_answers": ["lamb", "mutton", "red meat", "mutton", "mutton", "goat", "beef", "beef", "lamb", "mutton"], "difficult_direct_answer": false, "rationales": ["Lamb meat is commonly called chops or mutton.", "Sheep do not produce pork or chevon.", "The animals are sheep based on their distinct and visible features and answer a is the name for meat from this animal."], "image": "val2014/COCO_val2014_000000490708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500940, "question_id": "oGAxQZeZkNeGEse9pQBtwG", "question": "What period of the day is shown here?", "choices": ["morning", "afternoon", "evening", "night"], "correct_choice_idx": 1, "direct_answers": ["afternoon", "midday", "afternoon", "day", "afternoon", "afternoon", "noon", "afternoon", "afternoon", "afternoon"], "difficult_direct_answer": false, "rationales": ["It is sunny. the clock indicates that it is around 2.", "It's 2:08 on the clock and if it were morning, it would be dark.", "The time is the afternoon."], "image": "val2014/COCO_val2014_000000500940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417115, "question_id": "oGG65BWL9rgmYeuT9EgDBV", "question": "After making one full circuit of their route starting from here where will this bus return?", "choices": ["next city", "mexico", "here", "depot"], "correct_choice_idx": 2, "direct_answers": ["same place", "bus station", "same place", "here", "here", "lot", "bus station", "here", "intersection", "home base"], "difficult_direct_answer": false, "rationales": ["They can go back here.", "The bus says loop on it so it will make a full circle and stop where it started.", "A public bus advertises being on a loop which would involve starting and stopping in the same place."], "image": "val2014/COCO_val2014_000000417115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130076, "question_id": "oGPHv2APECSAVwJYYQyHcD", "question": "How many keys are present in Wii remote?", "choices": ["nine", "11", "eight", "four"], "correct_choice_idx": 2, "direct_answers": ["nine", "nine", "two", "nine", "three", "ten", "nine", "nine", "three", "eight"], "difficult_direct_answer": false, "rationales": ["The wii remote's buttons are the power button, directional pad, a, b minus, home and 1 and 2 buttons.", "There are a total of eight buttons in the wii remote.", "According to an online search, this is the right answer."], "image": "val2014/COCO_val2014_000000130076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271999, "question_id": "oGtW8LppvCmnqZVtnPhCiV", "question": "Where is the train stopped?", "choices": ["gas station", "elementary school", "walmart", "train station"], "correct_choice_idx": 3, "direct_answers": ["station", "station", "station", "station", "train station", "station", "station", "station", "station", "station"], "difficult_direct_answer": false, "rationales": ["A train is stopped at an area with a small building.", "The trains docks in this area for booking passengers.", "The train is at the depot."], "image": "train2014/COCO_train2014_000000271999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94632, "question_id": "oHTQt5pFGh8HsAuViTketW", "question": "Who is flying these vehicles?", "choices": ["pilot", "driver", "engineer", "biker"], "correct_choice_idx": 1, "direct_answers": ["driver", "pilots", "pilots", "pilots", "pilots", "pilots", "pilots", "pilots", "pilots", "pilots"], "difficult_direct_answer": false, "rationales": ["The kinds of planes on display in this image require pilots operating them inside.", "To fly an airplane you need to have a pilots license and study aviation.", "They are airplanes."], "image": "train2014/COCO_train2014_000000094632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574928, "question_id": "oHmZ9sZiTqoeKqqtkhB3x7", "question": "What type of worker would be found here?", "choices": ["farmer", "dentist", "judge", "doctor"], "correct_choice_idx": 0, "direct_answers": ["farmer", "shepherd", "farmer", "farmer", "shepherd", "farmer", "shepherd farmer", "farmer", "few sheeps", "shepherd"], "difficult_direct_answer": false, "rationales": ["Farmers are the people who herd sheep.", "Farmers herd sheep.", "The farmer works with sheep."], "image": "val2014/COCO_val2014_000000574928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101270, "question_id": "oHtuGDDrFymXg4E2wTKJaJ", "question": "The planes are executing a what?", "choices": ["emergency landing", "stunt formation", "space flight", "sky dive"], "correct_choice_idx": 1, "direct_answers": ["formation", "formation", "maneuver", "airshow", "trick", "aerobatics", "maneuver", "stunt formation", "dive", "v formation"], "difficult_direct_answer": false, "rationales": ["They have smoke coming out the back and are in formation doing tricks", "The planes are flying in groups while spewing out fumes to make a shape.", "They are performing a stunt."], "image": "val2014/COCO_val2014_000000101270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156685, "question_id": "oHx3HMA6r8fnHKSSHrp3ts", "question": "What country is known for selling fruit from boats as shown in the image?", "choices": ["germany", "india", "china", "vietnam"], "correct_choice_idx": 3, "direct_answers": ["brazil", "vietnam", "vietnam", "vietnam", "thailand", "indonesia", "vietnam", "thailand", "jamaica", "thailand"], "difficult_direct_answer": false, "rationales": ["Vietnam sells bananas.", "Vietnam has a famous river where people sell items out of boats.", "Vietnam is a poor country that is warm enough to grow fruits. poor villagers then sell them."], "image": "val2014/COCO_val2014_000000156685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541961, "question_id": "oJAUbhk8Yw5utGgfSUiW9A", "question": "What is this dog ready to do?", "choices": ["sleep", "attack", "eat", "run"], "correct_choice_idx": 3, "direct_answers": ["fetch", "stop", "catch frisbee", "fetch", "stop running", "play frisbee", "run", "play", "run", "retrieve"], "difficult_direct_answer": false, "rationales": ["The dog has its legs stretched out and in the back of it.", "A dog has a frisbee and is leaping forward in the grass.", "He has is front paws outstretched and is above the grass."], "image": "train2014/COCO_train2014_000000541961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209073, "question_id": "oJKJQxdXMi6FkFrhkXcEUb", "question": "What can the object the person is leaning on be used for?", "choices": ["running", "transportation", "swimming", "flying"], "correct_choice_idx": 1, "direct_answers": ["riding", "riding", "skating", "transportation", "skating", "skating", "skate", "skating", "skateboarding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["The person can use the skateboard to get around.", "A person is resting against a skateboard.", "It can be used for transportation because it has wheels on it and you can ride on it"], "image": "train2014/COCO_train2014_000000209073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259458, "question_id": "oJPPLpyD4DoEGdfqpBWyJg", "question": "What type of heat is shown?", "choices": ["radiator", "blanket", "fire", "coat"], "correct_choice_idx": 2, "direct_answers": ["fire", "fire", "fireplace", "fireplace", "fireplace", "fire", "fireplace", "fire", "fireplace", "fire"], "difficult_direct_answer": false, "rationales": ["The only heat producing feature of this picture is the fire in the fireplace we can see glowing orange here.", "The heat does not come from a radiator or a piece of clothing. there is something burning inside the stone structure.", "The heat is from fire."], "image": "train2014/COCO_train2014_000000259458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283257, "question_id": "oJZTe829LYnPADWbj6J7pZ", "question": "How are the people related to one another?", "choices": ["religious members", "family members", "strangers", "coworkers"], "correct_choice_idx": 3, "direct_answers": ["coworkers", "friends", "family", "colleagues", "relatives", "coworkers", "coworkers", "coworkers", "co workers", "family members"], "difficult_direct_answer": false, "rationales": ["They look like they are located in the break room which is where workers take their breaks.", "They look to be in an office setting.", "They are in an office."], "image": "train2014/COCO_train2014_000000283257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500561, "question_id": "oJo7QiV6SXxzEgWUMogr2J", "question": "What is covering the fries?", "choices": ["cheese", "mustard", "ranch", "ketchup"], "correct_choice_idx": 0, "direct_answers": ["cheese", "cheese", "cheese", "cheese", "cheese", "cheese", "cheese sauce", "cheese", "cheese sauce", "cheese"], "difficult_direct_answer": false, "rationales": ["Fries are on a plate. the fries are covered in a pale orange substance. cheese is sometimes served with fries.", "Cheese covers the fries.", "The covering of the fries is clearly visible and is a color and consistency as well as being served in a manner in line with answer a."], "image": "train2014/COCO_train2014_000000500561.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250344, "question_id": "oK6jZAG2Eo35bCQna8ozXx", "question": "Why is the blanket wrapped around her?", "choices": ["is cold", "is hiding", "is confused", "showing off"], "correct_choice_idx": 0, "direct_answers": ["is cold", "cold", "she's cold", "cold", "keep warm", "women's cold", "she's cold", "cold", "for warmth", "keep warm"], "difficult_direct_answer": false, "rationales": ["The girl is cold and is trying to stay warm.", "A woman is sitting on a couch with a blanket wrapped all the way around her. a girl sits on the couch next to her in a long sleeved sweatshirt.", "The blanket provides warmth."], "image": "val2014/COCO_val2014_000000250344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199127, "question_id": "oK8py8VdS6LmEPgG5HC46V", "question": "What is the tower used for?", "choices": ["alien signals", "electric lines", "tourism", "warning"], "correct_choice_idx": 1, "direct_answers": ["electricity", "electric", "electric lines", "radio", "electricity", "radio signals", "telephone", "electricity", "radio", "power lines"], "difficult_direct_answer": false, "rationales": ["The tower powers up electric.", "Large towers are connected by lines. electric lines run between towers.", "The tower is used for electricity."], "image": "val2014/COCO_val2014_000000199127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467479, "question_id": "oKL95Ub4JTV9GML8gBAHTH", "question": "How many gun surf boards are there?", "choices": ["six", "four", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two people have two surf boards in the water.", "There are two boards.", "There are two surf boards carried by two different men."], "image": "val2014/COCO_val2014_000000467479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190764, "question_id": "oKaNLsuUmgDVZ2VmhbTCz4", "question": "Why are the men standing around a plane?", "choices": ["to clean", "to fly", "to fix", "to view"], "correct_choice_idx": 3, "direct_answers": ["appreciating beauty", "observing plane", "learning", "to view", "admiring", "group discussion", "viewing", "looking", "learning", "looking"], "difficult_direct_answer": false, "rationales": ["The men are viewing the planes.", "There are men standing around talking and looking at the plane.", "They are looking at the plane."], "image": "train2014/COCO_train2014_000000190764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109851, "question_id": "oKgHiZYSPhKbZEnWPzfk2a", "question": "This office processes which one of these items?", "choices": ["report card", "baptism certificate", "driver's license", "diplomas"], "correct_choice_idx": 2, "direct_answers": ["license", "documents", "driver licenses", "tickets", "driver's license", "bank transactions", "birth certificates", "driver's licenses", "tickets", "drivers licenses"], "difficult_direct_answer": true, "rationales": ["The office has the internal setup and desk configuration that is consistent with answer a.", "This is an office environment with the layout associated with a dmv. also, there is art on the wall of a roadway.", "The dmv process the information."], "image": "train2014/COCO_train2014_000000109851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214067, "question_id": "oL5jHtnfc4aEZbi7HJbcPu", "question": "What is the box made of?", "choices": ["glass", "steel", "paper", "plastic"], "correct_choice_idx": 2, "direct_answers": ["paper", "cardboard", "paper", "paper", "card board", "kraft paperboard", "carton", "paper", "cardboard", "cardboard"], "difficult_direct_answer": false, "rationales": ["Many carry out food establishments prefer to use recyclable paper or cardboard containers. paper recycles more easily than most styrofoam.", "The texture here is common knowledge, and near all donut boxes are made of paper.", "The box is made of cardboard paper."], "image": "train2014/COCO_train2014_000000214067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313627, "question_id": "oLiDqu6VTUt2mTsAzmkohd", "question": "What material is the green bottle made of?", "choices": ["pic", "metal", "porcelain", "glass"], "correct_choice_idx": 0, "direct_answers": ["plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "plastic", "pic", "plastic"], "difficult_direct_answer": false, "rationales": ["The bottle is translucent, so it cannot be made of metal or porcelain. glass would break too easily, so it is made out of a type of plastic.", "The material is pic.", "The water bottle is a plastic material."], "image": "val2014/COCO_val2014_000000313627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196899, "question_id": "oLo2JiXzS9Gq2R9xoap4jr", "question": "The smaller glasses are designed for?", "choices": ["milk", "wine", "tea", "coffee"], "correct_choice_idx": 1, "direct_answers": ["water", "beer", "water", "dessert wine", "wine", "wine", "wine", "wine", "beverages", "water"], "difficult_direct_answer": false, "rationales": ["The smaller ones are for wine.", "People like to have wine and water with dinner.", "The glass has wine."], "image": "train2014/COCO_train2014_000000196899.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381000, "question_id": "oM6jmLNiP9NNDbQM6T9mWT", "question": "Where have these people gathered?", "choices": ["station", "stadium", "restaurant", "residence"], "correct_choice_idx": 2, "direct_answers": ["cafe", "coffee shop", "restaurant", "cafe", "coffee house", "restaurant", "cafe", "cafe", "restaurant", "coffee shop"], "difficult_direct_answer": false, "rationales": ["There are people at various tables with drinks.", "People are sitting at tables in a public place with signage on the tables.", "They are in a restaurant."], "image": "train2014/COCO_train2014_000000381000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308568, "question_id": "oMAYzE2fvLcjQv8RMaNsg4", "question": "What is the brown area behind the bench made of?", "choices": ["bricks", "paper", "wood", "plywood"], "correct_choice_idx": 0, "direct_answers": ["brick", "bricks", "brick", "bricks", "bricks", "bricks", "brick", "brick", "brick", "brick"], "difficult_direct_answer": false, "rationales": ["A bench is on the sidewalk in front of a brick wall.", "The brown area is a structure that is made out of rectangular objects. the material is not wood, paper, or plywood.", "Brown bricks are seen on the wall."], "image": "train2014/COCO_train2014_000000308568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571787, "question_id": "oMP8cQaTqwVXg9d7onvCoJ", "question": "What nut was used to make this treat?", "choices": ["cashew", "coconut", "pistachio", "hazelnut"], "correct_choice_idx": 1, "direct_answers": ["walnut", "pecans", "hazelnut", "coconut", "almond", "walnut", "coconut", "coconut", "coconut", "cheese"], "difficult_direct_answer": false, "rationales": ["The other answers might be present, but answer a is clearly included based on the unique coloring, shape and consistency.", "Coconut is used to make this treat.", "The nut is coconut."], "image": "train2014/COCO_train2014_000000571787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347989, "question_id": "oMaJ3iwaEJJRkkuSbyXXPy", "question": "What does the MP here likely stand for?", "choices": ["monkey police", "military police", "meat police", "metropolitan police"], "correct_choice_idx": 3, "direct_answers": ["metropolitan police", "military police", "mounted police", "military police", "michigan police", "motorized police", "metro police", "bike", "checking", "metropolitan police"], "difficult_direct_answer": false, "rationales": ["Metropolitan police departments use the letters m and p to denote metropolitan police on their uniforms.", "He's in a city so it wouldn't be military police in this instance", "The vehicle belongs to police."], "image": "train2014/COCO_train2014_000000347989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69654, "question_id": "oMcSUxd4BAk97csKX3nTsg", "question": "Where is the woman sitting?", "choices": ["bench", "trampoline", "ladder", "couch"], "correct_choice_idx": 0, "direct_answers": ["bullet train", "train station", "bench", "on bench", "train station", "station bench", "bench", "bench", "on bench", "bench"], "difficult_direct_answer": false, "rationales": ["She's on a bench.", "The woman is on a bench.", "The woman is sitting on a long seat."], "image": "train2014/COCO_train2014_000000069654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224633, "question_id": "oMd5k2zMuT2pVN6EQ8wXJX", "question": "What are the scissors primarily used for most probably?", "choices": ["hair-cutting", "gardening", "crafts", "sewing"], "correct_choice_idx": 1, "direct_answers": ["cutting", "to cut", "cutting", "decoration", "trimming plants", "for looks", "decoration", "cutting", "gardening", "pruning plants"], "difficult_direct_answer": false, "rationales": ["The scissors are near a plant so they are likely being used to garden.", "Creating clothing requires the material to be cut. scissors are for cutting things including material.", "Scissors can cut material to make clothing."], "image": "train2014/COCO_train2014_000000224633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327176, "question_id": "oNCHoQNeFWprjusiKNCueh", "question": "What part does this animal have that is absent in humans?", "choices": ["exoskeleton", "wings", "quills", "stinger"], "correct_choice_idx": 1, "direct_answers": ["wings", "wings", "wings", "beak", "wings", "beak", "wings", "wings", "wings", "fur"], "difficult_direct_answer": false, "rationales": ["Humans normally have arms and not wings.", "The animal on the beach is a bird which has feathers. humans do not have wings.", "The animals visible are birds. birds have wings and humans do not."], "image": "train2014/COCO_train2014_000000327176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26988, "question_id": "oNDbVsrizTBQQZ59ugyPVV", "question": "What are the things on top of giraffes heads?", "choices": ["tubes", "nose", "ossicones", "ears"], "correct_choice_idx": 2, "direct_answers": ["ossicones", "horns", "horns", "horns", "ossicones", "ears", "ossicones", "horns", "horns", "ossicones"], "difficult_direct_answer": false, "rationales": ["The things are horns.", "The two things protruding out of the top of giraffe's head are ossicones.", "That's what the horns on top of a giraffe's head are called."], "image": "train2014/COCO_train2014_000000026988.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57641, "question_id": "oNMCvoq7zPRmDCbssGpPLy", "question": "What activity do most people here want to do today?", "choices": ["swimming", "robbery", "waiting", "shopping"], "correct_choice_idx": 3, "direct_answers": ["shopping", "shop", "shop", "sightseeing", "building", "shop", "shop", "shop", "shop", "go shopping"], "difficult_direct_answer": false, "rationales": ["They are looking in the windows which means they want to see what's for sale.", "The activity is shopping.", "The people are in downtown to shop."], "image": "train2014/COCO_train2014_000000057641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92721, "question_id": "oNw3S8KQVdAqPLwYqyzfXw", "question": "What mode of transportation are they?", "choices": ["van", "bus", "truck", "train"], "correct_choice_idx": 1, "direct_answers": ["bus", "bus", "buses", "bus", "bus", "bus", "bus", "buses", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to these types of vehicles.", "Each vehicle has up to three doors on one side. each vehicle has signs that indicate its route on the front and back.", "That's what all the vehicles are."], "image": "train2014/COCO_train2014_000000092721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412972, "question_id": "oPUiXqct8xz8yCAKXto6Px", "question": "What is the shadow of?", "choices": ["building", "umbrella", "person", "bird"], "correct_choice_idx": 2, "direct_answers": ["person", "man", "man", "person", "person", "man", "person", "person", "man", "person"], "difficult_direct_answer": false, "rationales": ["There is a shadow of a man taking pictures of the car.", "A person standing there.", "The shadow has these human-shaped attributes: head to lower torso, and two arms."], "image": "train2014/COCO_train2014_000000412972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239144, "question_id": "oPUozZXP9UuXMhe4CmEYwP", "question": "What sort of climate might the trees in the background be most likely to be found in?", "choices": ["misty", "hot", "snowy", "rainy"], "correct_choice_idx": 1, "direct_answers": ["tropical", "tropical", "tropical", "tropical", "tropical", "spring", "tropical", "hot", "hot", "tropical"], "difficult_direct_answer": false, "rationales": ["The trees in the background are palm trees, and they need hot tropical weather in order to grow.", "The palm trees are tropical.", "The climate is hot."], "image": "train2014/COCO_train2014_000000239144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247893, "question_id": "oPWuj89fNqEpTHgkbsBCNz", "question": "Why is she lookin away from everybody else?", "choices": ["is afraid", "is lost", "looking camera", "is confused"], "correct_choice_idx": 2, "direct_answers": ["picture taken", "checking course", "looking camera", "photographed", "keeping balance", "lookin camera", "snowboarding downhill", "facing camera", "posing", "focus snowboarding"], "difficult_direct_answer": true, "rationales": ["She is looking at a camera.", "A girl is looking straight ahead and standing still.", "The woman is looking away from the other people so she can smile at the camera."], "image": "train2014/COCO_train2014_000000247893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557190, "question_id": "oPvXjPzb9NE3BT73Z6MrMQ", "question": "What type setting do these men pose in?", "choices": ["circus", "suburban", "city", "farm"], "correct_choice_idx": 3, "direct_answers": ["jovial", "country side", "outdoors", "hillside", "country hills", "landscape", "farm", "nature", "desert", "hillside"], "difficult_direct_answer": true, "rationales": ["The men are posing in a scene filled with natural landscape and farmland.", "The men are in front of grass and trees.", "The setting is a farm."], "image": "val2014/COCO_val2014_000000557190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24845, "question_id": "oPwKmCeVVCaNCbG9KaXnQf", "question": "What is strange about the toilet paper?", "choices": ["empty", "color", "up high", "black"], "correct_choice_idx": 2, "direct_answers": ["toilet", "high", "high up", "high up", "up high", "hung high", "high up", "up high", "far away", "it's high"], "difficult_direct_answer": false, "rationales": ["It is normally placed lower close to the toilet", "The toilet paper isn't at the level one would need to easily access it.", "Toilet paper is traditionally located in an area easily reached while a person is sitting on the toilet. the location on the toilet paper here may require a person to stand in order to utilize the roll."], "image": "val2014/COCO_val2014_000000024845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313926, "question_id": "oQH66jxTroR9zEkuQARVck", "question": "What material do these animals provide for clothing?", "choices": ["silk", "wool", "polyester", "cotton"], "correct_choice_idx": 1, "direct_answers": ["wool", "wool", "sweater", "sweater", "wool", "wool", "wool", "wool", "wool", "wool"], "difficult_direct_answer": false, "rationales": ["The sheep can be sheared for their coats to make wool.", "The material is wool.", "The sheep provide wool."], "image": "train2014/COCO_train2014_000000313926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393630, "question_id": "oQNrHWAxNer2L4nU7MiNJa", "question": "What could offer protection from the sun?", "choices": ["bench", "boat", "tree shade", "jackets"], "correct_choice_idx": 2, "direct_answers": ["trees", "umbrella", "trees", "trees", "trees", "trees", "trees", "trees", "shade trees", "tree shade"], "difficult_direct_answer": false, "rationales": ["The trees provide a canopy that is a natural umbrella", "The branches are shading them from the sun.", "The trees give protection."], "image": "train2014/COCO_train2014_000000393630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536920, "question_id": "oQYHoZQ9JVUGFHWvmRDjxz", "question": "Which direction is the entrance according to the sign?", "choices": ["downstairs", "left", "right", "behind camera"], "correct_choice_idx": 2, "direct_answers": ["right", "right", "right", "right", "right", "right", "right", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["You can tell by how the arrow is pointed as to what direction it is.", "The entrance sign has an arrow pointing to the right.", "The direction is to the right."], "image": "train2014/COCO_train2014_000000536920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415388, "question_id": "oQYsDrSRravqxi8vVZpciv", "question": "Why are umbrellas being used?", "choices": ["disguise", "snow", "sun", "rain"], "correct_choice_idx": 2, "direct_answers": ["shade", "block sunlight", "sun", "shade", "give shade", "shade", "shade", "shade", "shade", "sunny"], "difficult_direct_answer": false, "rationales": ["It is sunny out.", "People are holding umbrellas above themselves on a sunny day.", "Umbrellas block the sun."], "image": "train2014/COCO_train2014_000000415388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287535, "question_id": "oQdsz9cVDMp7Y5nXQBtRMA", "question": "What type of maneuver is the sign by the traffic light prohibiting?", "choices": ["3-point", "k-turn", "2-point", "u-turn"], "correct_choice_idx": 3, "direct_answers": ["upturn", "u-turn", "u turn", "u turn", "upturn", "walking", "upturn", "u turn", "u turn", "turn around"], "difficult_direct_answer": false, "rationales": ["The arrow on the sign shows you can not turn around and go in the opposite direction.", "The u turn is a manuever.", "The sign prohibits u turns."], "image": "val2014/COCO_val2014_000000287535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424132, "question_id": "oQkjNqxfCAG5RxN6V4wcWg", "question": "What skating footwear do the women have?", "choices": ["rollerblades", "roller-skates", "ice-skates", "skateboards"], "correct_choice_idx": 1, "direct_answers": ["skates", "roller blades", "roller-skates", "rollerskates", "skates", "rollerblades", "inline skates", "skate shoes", "shoes", "rollerskates"], "difficult_direct_answer": false, "rationales": ["The ladies have these on their feet and they have four wheels.", "Their shoes have wheels.", "The woman have rollerblades on."], "image": "train2014/COCO_train2014_000000424132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339974, "question_id": "oQozw2SNYfgexr7T3VVkBx", "question": "What type of signs are these?", "choices": ["direction signs", "traffic signs", "helpful signs", "schoolzone signs"], "correct_choice_idx": 1, "direct_answers": ["street directions", "traffic", "traffic signs", "traffic", "traffic signs", "traffic lights", "traffic lights", "traffic sign", "traffic", "traffic"], "difficult_direct_answer": false, "rationales": ["The signs direct traffic.", "The signs are found on roads and highway.", "The signs guide traffic."], "image": "val2014/COCO_val2014_000000339974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74, "question_id": "oREZKAiH8Pica9ujKBC8Vo", "question": "What state is the dog in?", "choices": ["being sick", "sleeping", "being abandoned", "dying"], "correct_choice_idx": 1, "direct_answers": ["resting", "texas", "sleeping", "rome", "resting", "rest", "lying", "calm", "laying down", "sleeping"], "difficult_direct_answer": false, "rationales": ["Majority of animals will close their eyes while in the act of sleeping.", "The dog is sleeping.", "The dog appears to be healthy. it is lying down, and its eyes are closed."], "image": "val2014/COCO_val2014_000000000074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469134, "question_id": "oRYhk5zE3wHyRBACtEDEwX", "question": "What will the bikers shown here have for lunch today?", "choices": ["hot dogs", "steak", "pizza", "salads"], "correct_choice_idx": 2, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["They are at a pizza place.", "The bikers get pizza.", "They are gathered outside a pizza restaurant. so they will most likely eat there."], "image": "val2014/COCO_val2014_000000469134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492492, "question_id": "oRw9Vdmvj5BZBmnULz6YeS", "question": "What is the vehicle in the sky on the right called?", "choices": ["helicopter", "glider", "blimp", "hovercraft"], "correct_choice_idx": 0, "direct_answers": ["helicopter", "helicopter", "helicopter", "helicopter", "helicopter", "helicopter", "helicopter", "helicopter", "helicopter", "helicopter"], "difficult_direct_answer": false, "rationales": ["The vehicle is a helicopter.", "A helicopter is seen in the sky on the right side of the photo. and there is actually another one, much further away, on the left side of the photo as well.", "This is a helicopter that is flying over the water."], "image": "train2014/COCO_train2014_000000492492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361804, "question_id": "oRzsEhD6eKrPm5Wp7Fjng2", "question": "What is the girl in the pink bikini looking at?", "choices": ["briefs", "book", "brochure", "toes"], "correct_choice_idx": 1, "direct_answers": ["surf board", "book", "book", "book", "surfboard", "book", "surfboard", "book", "surfboard", "surfboard"], "difficult_direct_answer": false, "rationales": ["The girl is looking at a book.", "She is reading.", "The woman is looking at a book."], "image": "val2014/COCO_val2014_000000361804.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173069, "question_id": "oSGyQUwDMn7rH5t9gkwL84", "question": "The large word on the box near the top of the shelf is also the name of a company that specializes in what?", "choices": ["canned beans", "canned spinach", "dolphin rescue", "pest control"], "correct_choice_idx": 3, "direct_answers": ["vehicles", "moving", "furniture", "automobile", "trucks", "pest control", "cars", "cars", "moving", "propane"], "difficult_direct_answer": false, "rationales": ["Companies select various names to go by. a company name should reflect the type of business and/or the area served. in the care of a pest control business, they might choose the area they serve.", "The word on the box indicates the company handles pests.", "Suburban specializes in pest control."], "image": "train2014/COCO_train2014_000000173069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297540, "question_id": "oShn6auW6mzeFXZgnoMKgL", "question": "Which Minnesota Twins catcher is at the bat?", "choices": ["willians astudillo", "gabby hartnett", "joe mauer", "butch henline"], "correct_choice_idx": 2, "direct_answers": ["baseball", "max keller", "balsa", "joe mauer", "batter", "joseph mauer", "player", "puckett", "unknown", "thirty-four"], "difficult_direct_answer": true, "rationales": ["The player batting is clear and based on their number and the team they play for, their name is internet searchable.", "Number 34 is up to bat and is the catcher for team. you can easily research the roster to find person.", "His name is on the back of his jersey."], "image": "val2014/COCO_val2014_000000297540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67131, "question_id": "oTRPXjNsajChwvV6EP3tUk", "question": "Which United States president was born in this car's state?", "choices": ["lincoln", "reagan", "obama", "jefferson"], "correct_choice_idx": 1, "direct_answers": ["ford", "lincoln", "reagan", "gerald ford", "lincoln", "ford", "ronald reagan", "truman", "il president", "hayes"], "difficult_direct_answer": false, "rationales": ["The state is illinois.", "Reagan was born there.", "President reagan was born illinois as shown on the license plate."], "image": "val2014/COCO_val2014_000000067131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167178, "question_id": "oTjmrPV5Px55RE5WCddbDS", "question": "What is the grey statue supposed to be?", "choices": ["god", "angel", "ghost", "demon"], "correct_choice_idx": 1, "direct_answers": ["playing music", "cherub", "baby angel", "angel", "angel", "angel", "angel", "cupid", "child", "cherub"], "difficult_direct_answer": false, "rationales": ["It resembles that of a well known figure depicted in renessaince art of an angel baby or cherub.", "It is a cherub.", "The statue indicates an angle."], "image": "val2014/COCO_val2014_000000167178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348826, "question_id": "oUEcnGS7MYpDmxoVQ3yaHd", "question": "What language does the name on the side of the largest vehicle here come from?", "choices": ["japanese", "egyptian", "greek", "aramaic"], "correct_choice_idx": 2, "direct_answers": ["greek", "english", "greek", "usa", "greek", "english", "english", "greek", "english", "greek"], "difficult_direct_answer": false, "rationales": ["The language is greek.", "The language is greek.", "The word delta comes from the greek language."], "image": "train2014/COCO_train2014_000000348826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311310, "question_id": "oULJCGkDk8bZXMipwdJZGo", "question": "The object in the air is in the shape of what animal?", "choices": ["panda", "lizard", "rabbit", "stingray"], "correct_choice_idx": 3, "direct_answers": ["dragon", "dragon", "butterfly", "devil", "stingray", "dragon", "butterfly", "dragon", "stingray", "dragon"], "difficult_direct_answer": false, "rationales": ["The kite is a stingray.", "The object in the air is in the shape of a stingray.", "The kite looks like a sea creature."], "image": "val2014/COCO_val2014_000000311310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472893, "question_id": "oUcaFMUnhMNfvQ79R46sp2", "question": "What is the first activity that is not allowed on the ice?", "choices": ["fishing", "ice-skating", "running", "sledding"], "correct_choice_idx": 1, "direct_answers": ["ice-skating", "smashing", "sliding", "ice skating", "ice skating", "skiing", "skating", "skating", "skating", "skating"], "difficult_direct_answer": false, "rationales": ["There is an ice skate that is crossed off.", "An ice skate boot is crossed out.", "The sign on the far right forbids two activities. the first has a picture of an item that has a shoe and a blade."], "image": "train2014/COCO_train2014_000000472893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563371, "question_id": "oUdzdqxD4vP9WSfUThYFgL", "question": "Besides Asia what continent are these animals found on?", "choices": ["europe", "antarctica", "south america", "africa"], "correct_choice_idx": 3, "direct_answers": ["africa", "africa", "africa", "africa", "africa", "africa", "europe", "africa", "africa", "africa"], "difficult_direct_answer": false, "rationales": ["Elephants can be found in africa.", "There are three different species of elephant. one is asian and the other two are african.", "Elephants are also found in africa and sahara regions."], "image": "train2014/COCO_train2014_000000563371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432172, "question_id": "oUhRg9VFXV3a2bQDGngHQ2", "question": "What type dried plants are the cows eating here?", "choices": ["fruits", "vegetables", "grasses", "sprouts"], "correct_choice_idx": 2, "direct_answers": ["hay bales", "alfalfa", "hay", "hay", "grass", "grasses", "hay", "grass", "straw", "hay"], "difficult_direct_answer": false, "rationales": ["The cows in the barn are eating hay that is bunches of dried grass.", "The cows are eating hay.", "The cows are eating hay."], "image": "train2014/COCO_train2014_000000432172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198437, "question_id": "oUveoLCFeubyTjEFBLRftQ", "question": "What is the woman in yellow doing on the horse?", "choices": ["feeding", "competing", "learning", "cleaning"], "correct_choice_idx": 1, "direct_answers": ["riding", "riding it", "competing", "riding", "riding", "riding", "riding", "riding it", "riding", "riding"], "difficult_direct_answer": false, "rationales": ["The woman competes.", "The woman is competing because there are people on the stands watching her.", "The woman is in a polo competition."], "image": "train2014/COCO_train2014_000000198437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299764, "question_id": "oVKCAyGnspVzkfi3LcQSn8", "question": "Where are the three men in orange and white having their discussion?", "choices": ["homeplate", "pitcher's mound", "2nd base", "outfield"], "correct_choice_idx": 1, "direct_answers": ["pitcher's mound", "pitcher mound", "stadium", "pitchers mound", "pitcher's mound", "pitcher's mound", "pitchers mound", "pitcher's mound", "pitcher's mound", "pitcher mound"], "difficult_direct_answer": false, "rationales": ["The men are on the pitchers' mound.", "The men are standing on the artificial hill in the middle of the baseball diamond. the ball is thrown to the batter by the player standing at the top of the mound which provides him with more leverage.", "They are standing on a pile of dirt. the man who throws the ball to the catcher stands on this mound to do it."], "image": "train2014/COCO_train2014_000000299764.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365645, "question_id": "oVRMcaYApNgwsvvyA6rFTB", "question": "The word in big letters in the middle is also a city in what country?", "choices": ["kazakhstan", "turkey", "brazil", "russia"], "correct_choice_idx": 2, "direct_answers": ["brazil", "brazil", "america", "brazil", "brazil", "de janeiro", "brazil", "brazil", "brazil", "brazil"], "difficult_direct_answer": false, "rationales": ["The city is a very large and internationally known city in south america.", "Rio is a major city there.", "Rio is a city in brazil."], "image": "train2014/COCO_train2014_000000365645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254179, "question_id": "oVa2dB3mJxBCNrB2b8oSRW", "question": "The person wearing what color of outfit is in the greatest danger?", "choices": ["yellow", "white", "black", "blue"], "correct_choice_idx": 0, "direct_answers": ["white", "yellow", "yellow", "unknown", "yellow", "yellow", "tan", "getting hit", "yellow", "white"], "difficult_direct_answer": false, "rationales": ["They are walking in the road with oncoming traffic coming.", "The person in yellow is crossing.", "The person has yellow on their body."], "image": "train2014/COCO_train2014_000000254179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124739, "question_id": "oVoGUZGBx6WH4nywqEs2R8", "question": "What is likely to start Sept. 29 here?", "choices": ["marching practice", "free parking", "road work", "candy giveaway"], "correct_choice_idx": 2, "direct_answers": ["road construction", "roadwork", "road repair", "road work", "roadwork", "lane closure", "construction", "road work", "construction", "construction"], "difficult_direct_answer": false, "rationales": ["Road work will start.", "The sign says starting sept 29 on the road so it's likely about fixing the road.", "Road work is likely to start on september 29 because there is a road sign announcing it"], "image": "train2014/COCO_train2014_000000124739.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272957, "question_id": "oVxSwJZm9udfxYJ3B7eLmd", "question": "What event is taking place here?", "choices": ["motorcycle parade", "looting", "protest", "vandalism"], "correct_choice_idx": 0, "direct_answers": ["biker event", "bike week", "biker show", "rally", "sturgis", "biker", "motorcycle race", "motorcycle show", "motorcycle parade", "sturgis rally"], "difficult_direct_answer": true, "rationales": ["There are many of these going down the street in the same direction, with no other types of vehicles, and spectators on the sidewalk.", "There are rows of motorcycles.", "The vehicles are all organized in a line with a large amount of people watching them."], "image": "train2014/COCO_train2014_000000272957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308772, "question_id": "oWAYpjzdg4YCkQpGWocaXZ", "question": "Why is the man holding the camera?", "choices": ["to buy", "to throw", "taking pictures", "to text"], "correct_choice_idx": 2, "direct_answers": ["photographing things", "take photos", "for photos", "picture taking", "dogs", "taking pictures", "snap photos", "taking pictures", "taking pictures", "pictures"], "difficult_direct_answer": false, "rationales": ["The purpose of a camera is to take pictures. if one is holding a camera they are likely using it for the intended purpose.", "He is taking pictures.", "The man is ready to snap a shot."], "image": "train2014/COCO_train2014_000000308772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220372, "question_id": "oWrd38xyvUkVad8rzxwuwp", "question": "What type of tree is most likely in the house?", "choices": ["maple", "thanksgiving", "christmas", "halloween"], "correct_choice_idx": 2, "direct_answers": ["christmas", "hotel", "cedar", "indoor tree", "christmas tree", "family", "christmas", "oak", "christmas", "oak"], "difficult_direct_answer": false, "rationales": ["There is some red ribbons and lights.", "There are stockings, presents, and a bow near the fireplace. some of the people are wearing sweaters.", "It seems it is the period of christmas and family member are around."], "image": "train2014/COCO_train2014_000000220372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130339, "question_id": "oX9AHnffCaPSHWosBQNa68", "question": "What is the person on the left wearing?", "choices": ["glasses", "fedora", "top hat", "boa"], "correct_choice_idx": 0, "direct_answers": ["glasses", "glasses", "eyeglasses", "glasses", "eyeglasses", "brown coat", "glasses", "glasses", "glasses", "glasses"], "difficult_direct_answer": false, "rationales": ["The person is question is identifiable by the text of the question and what they are wearing is visible.", "The man has an item on his face.", "There are glasses that are seen on the man face."], "image": "train2014/COCO_train2014_000000130339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 291606, "question_id": "oXWM3VosPM6NbytKdvnFWp", "question": "What continent is this most likely on?", "choices": ["africa", "europe", "asia", "south america"], "correct_choice_idx": 2, "direct_answers": ["asia", "asia", "asia", "asia", "asia", "asia", "asia", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The text on the side of the bus is in asian characters.", "There is writing on the side of the bus that are associated with chinese characters.", "There are asian characters shown."], "image": "val2014/COCO_val2014_000000291606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134620, "question_id": "oXgfYWFRnE72qCGraEKRd5", "question": "What color is the line on the floor that is all the way to the right?", "choices": ["blue", "purple", "orange", "black"], "correct_choice_idx": 0, "direct_answers": ["blue", "yellow", "white", "red", "blue", "white", "blue", "white", "brown", "sky blue"], "difficult_direct_answer": false, "rationales": ["The line on the right is blue.", "The line is blue.", "The line is not orange, black, or purple. it has the same color as the sky."], "image": "train2014/COCO_train2014_000000134620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73724, "question_id": "oY2yMaQbyCJJsZzet34Tro", "question": "What chair would best fit either child?", "choices": ["dark green", "grey", "red", "brown"], "correct_choice_idx": 0, "direct_answers": ["small chair", "green chair", "green chair", "baby seat", "black", "middle black", "dark green", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The green chair is the smaller than the others.", "The brown, red, and grey chairs are designed for adults. the other one is smaller and would be more suitable for either child.", "It is a small chair. they are small children."], "image": "train2014/COCO_train2014_000000073724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197120, "question_id": "oY3QLpVdUFHcZuq9SHRrkt", "question": "What do the carts carry to the airplane here?", "choices": ["people", "pets", "baggage", "staff"], "correct_choice_idx": 2, "direct_answers": ["luggage", "luggage", "baggage", "luggage", "luggage", "luggage", "baggage", "baggage", "luggage", "luggage"], "difficult_direct_answer": false, "rationales": ["These carts have various luggage and baggage of the passengers.", "The carts next to the airplane are connected to each other by a towing device. this is a clear indication that baggage is either going to or away from airplane.", "They transport passengers' suitcases to be loaded into the cargo hold."], "image": "train2014/COCO_train2014_000000197120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102460, "question_id": "oYKkNDS2CmD5DwefhJhngj", "question": "What is the elephant using to spray water?", "choices": ["trunk", "ears", "water gun", "person aback"], "correct_choice_idx": 0, "direct_answers": ["trunk", "trunk", "trunk", "snout", "trunk", "trunk", "trunk", "trunk", "trunk", "trunk"], "difficult_direct_answer": false, "rationales": ["They pick up water with this and fling it back", "Water is coming out of the trunk.", "Elephants use trunks to wash themselves and to do others things. the elephant is seen using its trunk to throws water over its head."], "image": "train2014/COCO_train2014_000000102460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580120, "question_id": "oYwPTGjyFJWfFDXGn2wFXR", "question": "What is the person in the white coat doing?", "choices": ["shaving", "taking vacation", "having lunch", "teaching"], "correct_choice_idx": 3, "direct_answers": ["teaching", "teaching", "helping students", "teaching", "teaching", "teaching", "teaching", "helping students", "helping students", "teaching"], "difficult_direct_answer": false, "rationales": ["The person is the only adult in the classroom.", "He is a instructor in a classroom full of kids.", "A lab educator is going around the room and checking on students. he is making sure they know what they are doing and helping assist."], "image": "train2014/COCO_train2014_000000580120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12147, "question_id": "oZDjm5VNhdYY3EduG5W57v", "question": "What person is known for competing in this sport?", "choices": ["alex rodriguez", "bo jackson", "tony hawk", "ken shamrock"], "correct_choice_idx": 2, "direct_answers": ["tony hawk", "tony hawk", "tony hawk", "skateboarder", "tony hawk", "skateboarder", "tony hawk", "tony hawk", "tony hawk", "skateboarder"], "difficult_direct_answer": false, "rationales": ["The person is like hawk.", "The man is skateboarding.", "Tony hawk is a famous skateboarder."], "image": "val2014/COCO_val2014_000000012147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129648, "question_id": "2dzVmqRBymZF2vHMihRUTk", "question": "What's happening to this guy?", "choices": ["bouncing", "being chased", "falling", "doing tricks"], "correct_choice_idx": 3, "direct_answers": ["doing tricks", "skateboarding trick", "falling", "falling", "skating", "dropping in", "he skated", "falling", "jumping", "falling"], "difficult_direct_answer": false, "rationales": ["The man is using a skateboard to perform.", "A guy is skateboarding in a pool. skateboarders do tricks.", "The guy is grabbing his skateboard as he's coming out of an empty pool."], "image": "train2014/COCO_train2014_000000129648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347018, "question_id": "2rKiZrY94yQHa5JnQThdaV", "question": "What hockey team does the batter's jersey signify?", "choices": ["penguins", "steelers", "flames", "pirates"], "correct_choice_idx": 0, "direct_answers": ["baseball idiot", "penguins", "pittsburgh penguins", "pittsburg", "pirates", "penguins", "penguins", "penguins", "not hockey", "pittsburg"], "difficult_direct_answer": false, "rationales": ["The batter's shirt read's pittsburgh though some of the letters are obscured; and the logo on his shirt is a pirate associated with the pittsburgh pirates. pittsburgh's hockey team is the penguins.", "Your question is wrong here, because the hockey team in pittsburgh is the penguins, but this is a baseball game, and this is the jersey of the pittsburgh pirates. i would know, i'm from pittsburgh.", "There are penguins on the jersey."], "image": "val2014/COCO_val2014_000000347018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210806, "question_id": "346sfSfnDqiD9EywN7rTHp", "question": "Why is there no logo on the plane?", "choices": ["no money", "not finished", "other side", "secret"], "correct_choice_idx": 1, "direct_answers": ["new plane", "repurposed airplane", "government plane", "it's old", "privacy", "generic airplane", "generic design", "private", "private", "not finished"], "difficult_direct_answer": true, "rationales": ["The answer is unknowable from the image, but answer a is a reason that would cause logos and brands to be missing from objects that might normally have one.", "There is a plain white plane just sitting without any markings. it is still in the works.", "It's a really big plane which makes me think it is owned by an airline or government and it just needs to be painted."], "image": "train2014/COCO_train2014_000000210806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578037, "question_id": "36SMzD6UUtkiym9roznHKx", "question": "What is the man in a white shirt's vector?", "choices": ["move sideways", "move backward", "stay stationary", "move forward"], "correct_choice_idx": 3, "direct_answers": ["forwards", "skateboarder", "skateboard", "forward", "skating", "skateboarder", "forward", "forwards", "move forward", "tank top"], "difficult_direct_answer": false, "rationales": ["A man is using a skateboard on the road going straight.", "The man is moving forward on his skateboard. he is pushing forward with his back foot.", "He's riding a skateboard and not performing any tricks"], "image": "train2014/COCO_train2014_000000578037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452465, "question_id": "3ErZX2xBVEFkuEkgbhixH8", "question": "What type of transportation is shown?", "choices": ["water", "land", "rail", "air"], "correct_choice_idx": 0, "direct_answers": ["boat", "boats", "boat", "boat", "boats", "boat", "boats", "boat", "water", "boats"], "difficult_direct_answer": false, "rationales": ["That is wet and blue like water.", "The boat's here docked are floating in water and water is the medium through which they traverse when in use.", "These are boats"], "image": "train2014/COCO_train2014_000000452465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211935, "question_id": "3FyvZWSzenWpkasRpdfAXA", "question": "What can people do in this location?", "choices": ["gamble", "skydive", "hunt", "kayak"], "correct_choice_idx": 0, "direct_answers": ["traffic area", "gamble", "gamble eat", "gamble", "gamble", "gamble", "gamble", "eat", "gamble", "gamble"], "difficult_direct_answer": false, "rationales": ["The location is showboat hotel casino according to the sign in the parking lot. people play games like craps, blackjack and slot machines there.", "People gamble.", "This is the obvious reason given the text on the signs. the geographic location also implies it's an area for this type of recreation in the southwest."], "image": "train2014/COCO_train2014_000000211935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285799, "question_id": "3Nn5sGxDd37hFXtzcyS79v", "question": "What makes this woman's task easier?", "choices": ["weather", "snow", "drone", "sled"], "correct_choice_idx": 3, "direct_answers": ["sled", "sled", "sled", "sled", "sled", "sled", "sled", "sled", "sled", "strings"], "difficult_direct_answer": false, "rationales": ["The sled will slide down the hill easier.", "A girl has bags on a sled and is pushing the sled.", "The sled is the only object that the woman is using, although her sliding would also not be possible without the snow or a smooth surface either, in order for her to slide downwards, she would need an object such as the sled to do so."], "image": "val2014/COCO_val2014_000000285799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226705, "question_id": "3Uge6M5y9hmT4v3rTxoRdF", "question": "What vehicle manufacturer's logo is seen on the hub cap on the left?", "choices": ["buick", "ford", "chevrolet", "lincoln"], "correct_choice_idx": 0, "direct_answers": ["suzuki", "buick", "buick", "chevrolet", "buick", "buick", "yes", "cadillac", "unknown", "buick"], "difficult_direct_answer": false, "rationales": ["The logo is the buick.", "The hub cap has the company logo on it.", "That's the name that matches the logo."], "image": "train2014/COCO_train2014_000000226705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573749, "question_id": "3nQQLULc2yVccD7YjZY2UC", "question": "What video game company's product name is seen here?", "choices": ["microsoft", "sega", "nintendo", "sony"], "correct_choice_idx": 2, "direct_answers": ["nintendo wii", "nintendo", "nintendo", "wii", "nintendo", "nintendo", "wii", "will", "dell", "wii"], "difficult_direct_answer": false, "rationales": ["The remote controller for a nintendo wii is in this picture.", "The video game product name seen is wii. i chose the option with the company that manufactures the wii video system.", "There is a wii controller"], "image": "train2014/COCO_train2014_000000573749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298071, "question_id": "4DmaKM5C3HoUShMMrxNFGG", "question": "What is the man in the mural using?", "choices": ["toaster", "laptop", "phone", "gun"], "correct_choice_idx": 1, "direct_answers": ["laptop", "laptop", "laptop", "laptop", "computer", "laptop", "laptop", "laptop", "laptop", "computer"], "difficult_direct_answer": false, "rationales": ["The man is using a laptop.", "The man is holding an electronic item that is large enough to rest across both of his knees. his fingers are touching its keyboard.", "The man has a laptop."], "image": "train2014/COCO_train2014_000000298071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327055, "question_id": "4F4csoPVE4PVCgsfVZQ23r", "question": "What type of signs are shown?", "choices": ["regulatory", "traffic", "protest", "price"], "correct_choice_idx": 2, "direct_answers": ["love", "love", "protest", "protest", "cardboard", "cardboard", "gathering", "demonstration", "zen meditation", "cardboard"], "difficult_direct_answer": false, "rationales": ["The signs are in protest.", "Protesting signs are shown (a). signs are used to non-verbally communicate any sort of message.", "They are holding up signs with messages"], "image": "train2014/COCO_train2014_000000327055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364535, "question_id": "4Q8ZfGkUbh4gnDDGj8ptce", "question": "These women enjoy what sport as referred to by it's European moniker?", "choices": ["rugby", "darts", "football", "american football"], "correct_choice_idx": 2, "direct_answers": ["soccer", "soccer", "soccer", "football", "football", "football", "soccer", "football", "football", "football"], "difficult_direct_answer": false, "rationales": ["It is called this everywhere except the us", "The women on the field are holding a soccer ball which is not called soccer in europe. instead they call it football because of all the kicking they do with their feet.`", "The girl is holding a football."], "image": "train2014/COCO_train2014_000000364535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135399, "question_id": "4Spfm9gPBoFmKQLryj2XCr", "question": "What's most likely in the box?", "choices": ["napkins", "more food", "gas", "water"], "correct_choice_idx": 1, "direct_answers": ["rice", "more food", "toppings", "food", "milk", "sauce", "rice", "food", "food", "food"], "difficult_direct_answer": false, "rationales": ["The box has food.", "The box is a food serving container so it would include more food.", "A table is set with food and a silver dish with a lid is included. banquet serving dishes are often silver with lids."], "image": "val2014/COCO_val2014_000000135399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159601, "question_id": "4rohPoP3oC7znrVorFwopz", "question": "Which nation's flag is hanging from the statue?", "choices": ["uk", "france", "usa", "norway"], "correct_choice_idx": 3, "direct_answers": ["britain", "switzerland", "denmark", "norway", "norway", "united kingdom", "norway", "norway", "norway", "norway"], "difficult_direct_answer": false, "rationales": ["I reconfirmed this via google.", "The blue and white cross on the red flag is norwegian.", "The flag is on the statue."], "image": "train2014/COCO_train2014_000000159601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482464, "question_id": "56DsnKVam7Y6bLjLWNccTN", "question": "What are the people most likely looking at?", "choices": ["snow", "mountain", "trees", "lake"], "correct_choice_idx": 3, "direct_answers": ["other skiers", "accident", "lake", "accident", "lake", "incident", "ski slope", "ski resort", "downhill", "their children"], "difficult_direct_answer": false, "rationales": ["The people are looking at the lake.", "People are looking down on water from a mountain top above.", "Mountain trip am friends."], "image": "train2014/COCO_train2014_000000482464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156701, "question_id": "5EgRfYQJxHs7LRkDjh9KKz", "question": "What being's pattern does that umbrella pattern vaguely resemble?", "choices": ["snake", "zebra", "leopard", "peacock"], "correct_choice_idx": 1, "direct_answers": ["conch", "zebra", "zebra", "octagon", "stripes", "zebra", "zebra", "zebra", "hexagon", "zebra"], "difficult_direct_answer": false, "rationales": ["An umbrella is white and black striped.", "The umbrella has a white and dark striped pattern which is similar to how zebras look. although snakes can be striped they are not usually dark and white colored.", "The stripes make it look like a zebra."], "image": "train2014/COCO_train2014_000000156701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522530, "question_id": "5KXrsEvpexv9qqS4YSqrkb", "question": "What power moves the Daily Express vehicle?", "choices": ["coal", "electricity", "horse power", "gas"], "correct_choice_idx": 2, "direct_answers": ["horse", "horses", "horse power", "horse", "horse power", "horse", "horse power", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The vehicle is being drawn by horses, which generate their own power.", "The power is by horse.", "The vehicle is identifiable by the text of the question and is connected to horses with no other power source apparent."], "image": "train2014/COCO_train2014_000000522530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338108, "question_id": "5MW3wdTxE59RhFCigWVPG2", "question": "What tag hangs from this man's front?", "choices": ["train pass", "bus", "parking", "ski pass"], "correct_choice_idx": 3, "direct_answers": ["lift tickets", "lift ticket", "pass", "ticket", "ski resort", "entrance tag", "ski pass", "identification", "lift ticket", "identification"], "difficult_direct_answer": false, "rationales": ["This man is on a ski slope. you will not find a bus or train this high up. typically you don't wear a parking pass.", "It's a ski pass for him to ski", "The tag is a ski pass."], "image": "val2014/COCO_val2014_000000338108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165589, "question_id": "5NVkjtPChvRPgvv3wdKTWr", "question": "What day of the week is it?", "choices": ["tuesday", "sunday", "monday", "saturday"], "correct_choice_idx": 3, "direct_answers": ["saturday", "fifteenth", "saturday", "saturday", "fifteenth", "saturday", "saturday", "monday", "15", "saturday"], "difficult_direct_answer": false, "rationales": ["The day is on a weekend.", "The date seemed to have fallen on a specific day of the week.", "September 15, 2012 is captured on the photo and that day was the seventh day of the week."], "image": "val2014/COCO_val2014_000000165589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439614, "question_id": "5RLrzHzEhdYVFMMcZGv6jU", "question": "What country does the name of the boat originate from?", "choices": ["germany", "india", "mexico", "japanese"], "correct_choice_idx": 0, "direct_answers": ["india", "netherlands", "germany", "france", "usa", "england", "germany", "netherlands", "united kingdom", "senegal"], "difficult_direct_answer": false, "rationales": ["There is a german flag on the boat.", "It comes from germany", "Germany is the origins of the boat."], "image": "train2014/COCO_train2014_000000439614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106793, "question_id": "5YNhVR6W7LcasaKoUTQ7H7", "question": "Where is it dangerous to stick their finger into?", "choices": ["eyeball", "ladder", "socket", "pillow"], "correct_choice_idx": 2, "direct_answers": ["socket", "socket", "outlet", "light socket", "electric outlet", "socket", "outlet", "electrical socket", "light socket", "socket"], "difficult_direct_answer": false, "rationales": ["If anyone puts their finger in it, a person could get seriously injured.", "The child would get shocked if they touch the socket.", "Option a is the only choice for presenting a danger to the person who performs such an action."], "image": "val2014/COCO_val2014_000000106793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167610, "question_id": "5YSF8qa87w3Mz3B6x3igLE", "question": "What's the woman that's bending over doing?", "choices": ["sulking", "laughing", "praying", "crying"], "correct_choice_idx": 1, "direct_answers": ["playing nintendo", "playing wii", "playing game", "laughing", "laughing", "laughing", "laughing", "playing game", "playing videogame", "laughing"], "difficult_direct_answer": false, "rationales": ["The woman is laughing.", "They are laughing because their expression suggests they are laughing and having fun.", "The woman that is bending over is laughing because she is playing a game and having fun."], "image": "val2014/COCO_val2014_000000167610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468483, "question_id": "5aKxECoN9RRmWEUd28xk4M", "question": "What should the man wear before the activity for protection?", "choices": ["wristband", "sunglasses", "sunscreen", "helmet"], "correct_choice_idx": 3, "direct_answers": ["pads", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["The man needs a helmet.", "A helmet provides the best protection for a person skateboarding. if he falls, he is less likely to have a head injury.", "The man should get a helmet."], "image": "val2014/COCO_val2014_000000468483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145422, "question_id": "5hRtapX2q3HXybfoxuEtTG", "question": "Who is the president of the front skier's country?", "choices": ["macron", "trudeau", "putin", "zelensky"], "correct_choice_idx": 0, "direct_answers": ["mark rutte", "joe biden", "macron", "emmanuel macron", "macron", "putin", "merle", "emmanuel macron", "unknown", "jonas store"], "difficult_direct_answer": false, "rationales": ["A skier is in uniform decorated with logos and home country colors.", "The front skier has two french flags on their shoulders.", "Macron is the president."], "image": "val2014/COCO_val2014_000000145422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241155, "question_id": "5m5ziVfMHj8aRQg29bEUNJ", "question": "What is this mode of transportation called?", "choices": ["tram", "subway", "train", "bus"], "correct_choice_idx": 1, "direct_answers": ["subway", "subway", "rail transportation", "rail transportation", "subway", "subway", "subway", "subway", "subway", "subway"], "difficult_direct_answer": false, "rationales": ["The platform and tracks appear to be underground as no natural light can be seen, and trains that run underground are called subways.", "There is a train underground.", "This is a subway train because you can see the people waiting at an underground stop."], "image": "val2014/COCO_val2014_000000241155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462632, "question_id": "5yDL6tGFuXKERoidbDrnZu", "question": "What kind of software is the left computer running?", "choices": ["video production", "word processing", "email", "photo editing"], "correct_choice_idx": 3, "direct_answers": ["windows", "photo editing", "monater", "photoshop", "photo editing", "windows paint", "microsoft paint", "can't see", "windows", "graphics"], "difficult_direct_answer": false, "rationales": ["There appears to be a picture on the screen narrowing it down either email or photo editing, but there also appears to be editing options on the left side of the screen meaning it has to be editing.", "The software has a picture open on the screen which is currently being edited.", "The program has an edit tool bar next to the picture shown."], "image": "val2014/COCO_val2014_000000462632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41728, "question_id": "66PKuT8RnzLGro2SBHTYaj", "question": "What is visible in window of the tall structure that is white?", "choices": ["kites", "pictures", "light", "fishing poles"], "correct_choice_idx": 2, "direct_answers": ["light", "not visible", "light", "harbor view", "nothing", "person", "bell", "light", "black color", "inner dome"], "difficult_direct_answer": false, "rationales": ["There is a white brightness that is light.", "Light is visible from the tall structure as it's getting dark outside. this light projects outward.", "Light is visible."], "image": "train2014/COCO_train2014_000000041728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173760, "question_id": "6Ab7AVxYqFDY95HgMiSVmb", "question": "What is strung on the fence to keep the cows in?", "choices": ["wood", "wire", "thorns", "metal"], "correct_choice_idx": 1, "direct_answers": ["barbwire", "wires", "barbed wire", "wire", "barbed wire", "wire", "wire", "wire", "barbed wire", "thread"], "difficult_direct_answer": false, "rationales": ["It has barbs on it to discourage them from running through it", "Ranchers often keep cows penned in with a.", "Cows lay in the grass behind a fence with wood pools and silver line strung between them."], "image": "train2014/COCO_train2014_000000173760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109008, "question_id": "6hXTPEMZCmzgXknZsFc9Bk", "question": "What event are the people participating in?", "choices": ["class", "reception", "church", "movie"], "correct_choice_idx": 0, "direct_answers": ["dating", "meeting", "social event", "school", "owrkshop", "class", "meeting", "studying", "computer", "unknown"], "difficult_direct_answer": true, "rationales": ["The people are students in class.", "The people are gathered together to attend a class. they are seated patiently by their computers and socializing.", "The event is a class."], "image": "train2014/COCO_train2014_000000109008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470797, "question_id": "6mt5QMq4bnW9q4AYJupvor", "question": "The plane is most likely dropping off what to the people?", "choices": ["weaponized vehicles", "money", "furniture", "supplies"], "correct_choice_idx": 3, "direct_answers": ["supplies", "supplies", "supplies", "supplies", "supplies", "supplies", "supplies", "supplies cargo", "supplies cargo", "supplies"], "difficult_direct_answer": false, "rationales": ["The plane is dropping off supplies.", "This is a large us airforce plane. these kinds of planes carry military supplies.", "This is a cargo plane"], "image": "train2014/COCO_train2014_000000470797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537293, "question_id": "72jwCHpfznXqHfiwNbSPX7", "question": "Why is the woman holding the object near the child's head?", "choices": ["to cut", "to curl", "to dry", "to dye"], "correct_choice_idx": 2, "direct_answers": ["dry hair", "dry hair", "dry hair", "hairdryer", "drying", "dry hair", "drying hair", "to dry", "blow dryer", "dry hair"], "difficult_direct_answer": false, "rationales": ["The child's head is wet.", "A hair dryer is only used for making wet hair dry.", "A woman is holding a hair dryer near a baby's head."], "image": "train2014/COCO_train2014_000000537293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444199, "question_id": "75ktxPe9UFq64jCfDdwzf3", "question": "The flag has colors similar to what other country's flag?", "choices": ["nepal", "spain", "argentina", "united kingdom"], "correct_choice_idx": 3, "direct_answers": ["uk", "red", "united kingdom", "united states", "france", "work", "liberian flag", "canada", "united states", "france"], "difficult_direct_answer": false, "rationales": ["The flag on the street has red white and blue colors just like the united kingdom's flag.", "A red, white, and blue flag is flying.", "A because the uk flag has red white and blue too."], "image": "val2014/COCO_val2014_000000444199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136186, "question_id": "7FKnXrqUmN8h2GLXwwveVd", "question": "What type of sign is shown?", "choices": ["brand", "traffic", "regulatory", "warning"], "correct_choice_idx": 0, "direct_answers": ["ilford", "ilford", "billboard", "company", "business", "farm name", "brand", "several cows", "business name", "ilford"], "difficult_direct_answer": false, "rationales": ["The brand on the building.", "It offers black and white products.", "That must be a brand name because it is none of the other things."], "image": "train2014/COCO_train2014_000000136186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112800, "question_id": "7vkWmM3K8UCSdCEuBrmWBf", "question": "Which form of transportation seen here is more versatile in it's stopping or parking places?", "choices": ["bike", "plane", "ship", "boat"], "correct_choice_idx": 1, "direct_answers": ["plane", "plane", "ferry", "boat", "sea", "plane", "plane", "plane", "airplane", "seaplane"], "difficult_direct_answer": false, "rationales": ["It can be on land or water because it has pontoons and wheels", "It can land in most places and park where ever it lands.", "A plane is a bit more flexible in where it starts and stops."], "image": "val2014/COCO_val2014_000000112800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338595, "question_id": "84Ua3wTFPShmvPD76tXgnT", "question": "What venue is shown here?", "choices": ["temple", "carnival", "country border", "flea market"], "correct_choice_idx": 0, "direct_answers": ["sporting", "outside", "station", "concert", "market", "market", "parade", "urban street", "parade", "temple"], "difficult_direct_answer": false, "rationales": ["A flea market where people can shop", "They are worshipping.", "It looks like these people are all in line to go see something and the building on the left looks pretty ornate and old."], "image": "val2014/COCO_val2014_000000338595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68743, "question_id": "8Jc3HEY72vDAuzfeJpma7y", "question": "What is the green plastic thing on the baby's chest for?", "choices": ["camouflage", "cleanliness", "dress code", "visibility"], "correct_choice_idx": 1, "direct_answers": ["prevent mess", "cleaning purpose", "prevent stains", "keep clean", "cleanliness", "cleanliness", "repel food", "avoid messes", "spillage", "stopping stains"], "difficult_direct_answer": true, "rationales": ["It's a bib to keep them clean.", "A baby is eating and has a bib on to prevent messes.", "Cleanliness avoid a dirty on clothes."], "image": "val2014/COCO_val2014_000000068743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482819, "question_id": "8qV8LmfpMuVkdZXg5utND5", "question": "What country's flag colors are represented on the pole?", "choices": ["canada", "niger", "poland", "ukraine"], "correct_choice_idx": 3, "direct_answers": ["blue yellow", "ukraine", "ukraine", "sweden", "ukraine", "sweden", "ukraine", "greece", "sweden", "sweden"], "difficult_direct_answer": false, "rationales": ["The colors are for that country.", "The flag colors represent the colors of ukraine, which is yellow and blue. it's a very prominent color.", "The flag is from ukraine."], "image": "train2014/COCO_train2014_000000482819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385141, "question_id": "9CczQk38aeNHfJYAx7tFp6", "question": "What famous Christmas sweet is associated with the color of the seller's stand?", "choices": ["christmas cake", "candy cane", "mars bar", "snickers"], "correct_choice_idx": 1, "direct_answers": ["candy cane", "candy cane", "candy cane", "candy cane", "candy cane", "candy cane", "candy cane", "candy cane", "candy", "candy"], "difficult_direct_answer": false, "rationales": ["Candy canes are usually twisted red, and white.", "The sweet is a candy cane.", "The seller's stand has an alternating red and white pattern. snickers and mars bars are not christmas sweets."], "image": "train2014/COCO_train2014_000000385141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539299, "question_id": "9Qtt55cyPacE6r6ZPNrPNA", "question": "Why are the men's vests orange in color?", "choices": ["camouflage", "dress code", "fashion", "visibility"], "correct_choice_idx": 3, "direct_answers": ["construction", "increased visibility", "visibility", "security", "construction", "safety", "visibility", "increased visibility", "construction workers", "safety visibility"], "difficult_direct_answer": false, "rationales": ["The vests are visible.", "The men's vests need to be visible.", "Those are safety vests that are to make people more visible."], "image": "train2014/COCO_train2014_000000539299.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204887, "question_id": "9b75CgVJQFMhW6NFkyc8Rh", "question": "What do you do with the thing attached to the front of the bus?", "choices": ["go fishing", "go hiking", "go skiing", "go biking"], "correct_choice_idx": 3, "direct_answers": ["ride it", "ride", "cycle", "hold", "pedal", "bicycle", "hold bicycle", "pedal it", "go biking", "hold bicycles"], "difficult_direct_answer": true, "rationales": ["There is a bicycle attached to the bus and this can be ridden.", "It is a bicycle", "It's a bike. so, the answer is obvious."], "image": "val2014/COCO_val2014_000000204887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266032, "question_id": "9kgujJuBPpoJ3R5pJzqA8j", "question": "What might this animal leave behind when leaving it's chair?", "choices": ["hair", "rats", "birds", "notes"], "correct_choice_idx": 0, "direct_answers": ["poo", "fur", "hair", "fur", "fur", "hair", "hair", "fur", "fur", "fur"], "difficult_direct_answer": false, "rationales": ["Hair is a protein filament that grows from follicles found in the dermis. hair is one of the defining characteristics of mammals.", "Animals shed their fur a little at a time and the friction of moving on fabric can make it come out faster", "You can see that he has a thick coat of fur. it is common for cats to shet."], "image": "train2014/COCO_train2014_000000266032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527388, "question_id": "9v3szQqaHusMYS9vZAQ9nd", "question": "What is most likely causing the man's pocket to bulge?", "choices": ["keys", "tennis ball", "wallet", "socks"], "correct_choice_idx": 1, "direct_answers": ["tennis ball", "tennis ball", "ball", "playing", "tennis ball", "ball", "tennis ball", "tennis ball", "ball", "tennis ball"], "difficult_direct_answer": false, "rationales": ["He keeps spares so it's faster to keep the game moving if one ball goes awry", "He's on a tennis court and is currently playing tennis.", "He is likely carrying an additional ball in his pocket as tennis balls can be lost easily during a game."], "image": "train2014/COCO_train2014_000000527388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410898, "question_id": "9xkMeW2o7afpT3wF84xPpG", "question": "What type of outfit are the men on the motorcycles wearing?", "choices": ["beach wear", "uniform", "casual", "sweatsuits"], "correct_choice_idx": 1, "direct_answers": ["military", "uniform", "uniforms", "uniform", "police", "uniforms", "british", "uniform", "uniform", "military"], "difficult_direct_answer": false, "rationales": ["The clothes the men are wearing are identical.", "The men are wearing the same outfit and appear to be formally dressed. this is typical when they are from the same group.", "Several men in same outfits are riding a bike. they have similar accolades of valor as well as white rope strewn on them."], "image": "train2014/COCO_train2014_000000410898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84258, "question_id": "AFPgVdGoQBu8nL6qsiCytJ", "question": "What company is known for making the objects in the man's pocket?", "choices": ["bic", "dell", "ibm", "chipotle"], "correct_choice_idx": 0, "direct_answers": ["bic", "bic", "apple", "bic", "bic", "sale market", "bic", "bic", "bic", "bic"], "difficult_direct_answer": false, "rationales": ["There are pens, not food items or computers, in the man's pocket.", "The man has several pens in his pocket. many pens are made by bic.", "Its a writing material and among the given brands bic deals with pens."], "image": "val2014/COCO_val2014_000000084258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522930, "question_id": "BHVg2cYGhk5DXojvoHeej9", "question": "What is the cause of the bright spot in the center of the mirror on the wall?", "choices": ["lamp", "lighter", "flashlight", "camera flash"], "correct_choice_idx": 3, "direct_answers": ["camera flash", "camera flash", "camera flash", "flash light", "camera flash", "glare", "camera flash", "camera flash", "camera flash", "camera"], "difficult_direct_answer": false, "rationales": ["The photographer is visible in the image and the bright spot is consistent with their location and the tool they would be using to take the picture.", "There is a bright spot in the mirror that was caused when the camera flash went off to snap the photo.", "The brightness is the camera flash."], "image": "train2014/COCO_train2014_000000522930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452611, "question_id": "BPUw3aGnxHsyWskgWAXJDa", "question": "What problem is posed by the man's shoes?", "choices": ["ankle sprained", "feet soaked", "wart", "insect bite"], "correct_choice_idx": 1, "direct_answers": ["tripped", "work out", "holes", "non slipping", "hole bottom", "hole", "cleats", "flat", "feet soaked", "ripped"], "difficult_direct_answer": true, "rationales": ["His shoes will be soaked from the holes.", "He has holes in the bottom of them", "I am only guessing."], "image": "val2014/COCO_val2014_000000452611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262509, "question_id": "BRaWt7uGspRnSijCYJGwmq", "question": "What is the purpose of the orange buoys?", "choices": ["floating devices", "stylistic purposes", "provide information", "anchors"], "correct_choice_idx": 2, "direct_answers": ["protection", "crabs", "warning", "block area", "markers", "warning", "safety marker", "mark shallows", "fishing nets", "provide information"], "difficult_direct_answer": true, "rationales": ["The purpose is for info.", "The orange buoys are used to alert the boats that the dock is close. it gives information as to how close land is.", "They are located near the shore and are very bright and visible."], "image": "val2014/COCO_val2014_000000262509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77602, "question_id": "Bwy9LinrRzYJGNAyEwjeE3", "question": "What are the people in the first row doing?", "choices": ["celebrating", "sitting", "eating", "photographing"], "correct_choice_idx": 3, "direct_answers": ["photographing", "photos", "filming", "watching game", "photographing", "theyre praying", "site", "taking pictures", "taking pictures", "spectating"], "difficult_direct_answer": false, "rationales": ["They have cameras.", "The people in the first row all have cameras and are taking photographs because they are members of the press and media.", "They are sitting in their seats watching a baseball game."], "image": "train2014/COCO_train2014_000000077602.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44464, "question_id": "C96MHGkfwL4YZWo9vPR2eQ", "question": "What word rhymes with the number on the man's shirt?", "choices": ["hate", "boo", "more", "fine"], "correct_choice_idx": 0, "direct_answers": ["late", "hate", "turn", "date", "mate", "rate", "weight", "great", "great", "ate"], "difficult_direct_answer": true, "rationales": ["The word is hate.", "The number 8 rhymes with hate.", "The nearest man has the number eight on his shirt which rhymes with hate."], "image": "train2014/COCO_train2014_000000044464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227869, "question_id": "CWn8T4FPvELks63Z2PEu4b", "question": "What is the name of the red flower on the man's lapel?", "choices": ["rose", "poppy", "daisy", "chrysanthemum"], "correct_choice_idx": 1, "direct_answers": ["rose", "poppy", "poppy", "rose", "poppy", "poppy", "poppy", "poppy", "rose", "poppy"], "difficult_direct_answer": false, "rationales": ["The name is a poppy.", "The red flower on the man's lapel is a poppy that people in the uk wear as a symbol of remembrance.", "This is a poppy flower on the mans jacket."], "image": "train2014/COCO_train2014_000000227869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172478, "question_id": "CdJqETWanyRPfWigATaTY4", "question": "What area of London does the bus go to?", "choices": ["south", "west", "central", "north"], "correct_choice_idx": 2, "direct_answers": ["pimlico", "cool areas", "pimlico", "yes", "bus stations", "pimlico", "central", "pimlico", "downtown", "pimlico"], "difficult_direct_answer": false, "rationales": ["The area is central.", "In london one of the area name is primlico in the west of london,we travel to that area by bus.", "The area is central."], "image": "val2014/COCO_val2014_000000172478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228418, "question_id": "CioLtUmGNGdmiGEU7eSqtD", "question": "What period of the day is it in the picture?", "choices": ["night", "afternoon", "morning", "evening"], "correct_choice_idx": 1, "direct_answers": ["afternoon", "morning", "nightime", "morning", "morning", "night", "afternoon", "morning", "morning", "morning"], "difficult_direct_answer": false, "rationales": ["The clock says it's 3:00, but it's still light outside, so it was probably taken during the day.", "The clock says 12:15 and it's daytime", "The sun doesn't seem to be that high outside the window, and seems to be setting."], "image": "val2014/COCO_val2014_000000228418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536145, "question_id": "DGSgqjvaUXpLDWKm5CwDsM", "question": "What's likely the word between see and now on the person's back?", "choices": ["mexico", "me", "money", "mom"], "correct_choice_idx": 1, "direct_answers": ["me", "me", "me", "me", "me", "me", "me", "me", "me", "me"], "difficult_direct_answer": false, "rationales": ["Because there is a present m that can be seen.", "She is making a statement about herself.", "A person is at a protest with a message on their back."], "image": "train2014/COCO_train2014_000000536145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357429, "question_id": "DJYfcmpby8W5Mvv8RFG2SR", "question": "What is unusual about the man's skateboard?", "choices": ["inline", "primer color", "miniature", "big wheels"], "correct_choice_idx": 3, "direct_answers": ["offload wheels", "wheel size", "wheels", "wheels", "unknown", "big wheels", "electric", "wheels", "big wheels", "high front"], "difficult_direct_answer": false, "rationales": ["Skateboards usually have small wheels, this one has rather big wheels making it look weird.", "You can see that the round rolling parts are very large relative to the size of the board. normally, skateboards have small ones.", "The skateboard has huge wheels."], "image": "train2014/COCO_train2014_000000357429.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49097, "question_id": "DMDiPpyqMesGDxR6yfa5Vd", "question": "Where is the dog's owner?", "choices": ["inside restaurant", "at supermarket", "at work", "at home"], "correct_choice_idx": 0, "direct_answers": ["inside", "inside building", "inside", "inside", "inside", "inside", "in store", "inside restaurant", "inside store", "store"], "difficult_direct_answer": false, "rationales": ["The door next to the dog is for this business", "The dog appears to be waiting for its owner and it is in front of a door. it is typical to leave a dog outside a place where you eat because dogs are not allowed.", "The owner left their bike outside the eatery, so it appears they are inside."], "image": "val2014/COCO_val2014_000000049097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101860, "question_id": "DP23ANrBBpYjSxKADtpysa", "question": "What MLB team does the man up at bat play for?", "choices": ["orioles", "mets", "mariners", "braves"], "correct_choice_idx": 1, "direct_answers": ["mets", "toronto", "mets", "mets", "mets", "ny mets", "mets", "mets", "mets", "mets"], "difficult_direct_answer": false, "rationales": ["It clearly says mets in script lettering across the man with a bat's chest, also if you follow baseball you know the mets are an orange and blue team.", "The mets are in the mlb.", "He plays for the mets."], "image": "val2014/COCO_val2014_000000101860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375881, "question_id": "DkQvyrtQBSuPhEBUEbrdmn", "question": "What is the small child's tie made out of?", "choices": ["plastic", "paper", "metal", "cotton"], "correct_choice_idx": 1, "direct_answers": ["paper", "paper", "paper", "paper", "paper", "paper", "construction paper", "paper", "paper", "construction paper"], "difficult_direct_answer": false, "rationales": ["The tie is made of paper.", "The tie is yellow, it's flimsy ,and has drawings on it.", "It is very flat and 2 dimensional and is also coloured on, presumably by the child, so it can be concluded it is paper."], "image": "val2014/COCO_val2014_000000375881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238963, "question_id": "Dx93fMLGgzo2xzseR26Vfp", "question": "What is the black under the man's chin?", "choices": ["mask", "bib", "bandana", "beard"], "correct_choice_idx": 3, "direct_answers": ["beard", "beard", "beard", "beard", "beard", "beard", "beard", "beard", "beard", "facial"], "difficult_direct_answer": false, "rationales": ["The man at the pitcher's mound has a dark black beard growing out of his chin.", "The black below the chin is uneven and scruffy like a beard would be. it is not covering the chest like a bib would and not covering the face or head like a mask or bandana would.", "A man with facial hair is pitching a baseball."], "image": "train2014/COCO_train2014_000000238963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348379, "question_id": "E86JvfKkEMzfyni5GSe5XN", "question": "What might the item on the woman's face be used to shield from?", "choices": ["punches", "sun", "rain", "bees"], "correct_choice_idx": 1, "direct_answers": ["identity", "sun", "sun", "sun", "sun", "sun glare", "sun", "sun", "sun", "sunlight"], "difficult_direct_answer": false, "rationales": ["The glasses are keeping the light off her eyes.", "The woman is wearing special glasses to protect her eyes from harmful uv rays.", "Those are called sunglasses, they protect from the sun."], "image": "val2014/COCO_val2014_000000348379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357799, "question_id": "EP4xeeEwSHpumCC6phZmzW", "question": "What's the girl in the back's situation?", "choices": ["studying", "lost", "can't see", "hungry"], "correct_choice_idx": 2, "direct_answers": ["bright sun", "happy", "annoyed", "laughing", "catcher", "can't see", "distracted", "sunlight", "short stop", "bright sun"], "difficult_direct_answer": true, "rationales": ["The girl is unable to see with the sun in her eyes.", "The girl can't see.", "The sun is in her eyes."], "image": "train2014/COCO_train2014_000000357799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380279, "question_id": "EeAcxZPiqCZADoA8wBRdHc", "question": "What is this woman's profession?", "choices": ["waitress", "nun", "clown", "janitor"], "correct_choice_idx": 1, "direct_answers": ["nun", "nun", "nun", "nun", "nun", "nun", "church sister", "nun", "nun", "nun"], "difficult_direct_answer": false, "rationales": ["The woman is a nun.", "She's wearing a type of headdress that is usually associated with nuns.", "The woman is wearing a habit."], "image": "train2014/COCO_train2014_000000380279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543616, "question_id": "EjKCb4Bi22ayYcNwdtzURn", "question": "Why is the man's vest yellow in color?", "choices": ["camouflage", "visibility", "dress code", "fashion"], "correct_choice_idx": 1, "direct_answers": ["be noticed", "visibility", "worker", "visibility safety", "safety", "traveller", "easy vision", "matainance worker", "visibility/safety", "safety"], "difficult_direct_answer": true, "rationales": ["Answer a is the commonly known reason for neon colored vests and is consistent with this setting.", "The yellow vest is made to make the man more noticeable. it's yellow because it's easier to spot him.", "The vest is for visibility."], "image": "train2014/COCO_train2014_000000543616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515391, "question_id": "FNjhydMLVZZfgzW4vxjRkT", "question": "What item here is held by knotting?", "choices": ["coat", "jacket", "nothing", "tie"], "correct_choice_idx": 3, "direct_answers": ["tie", "tie", "tie", "necktie", "neck tie", "tie", "tie", "tie", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["You knot the tie at the top to make sure that it stays in place.", "The item is the tie.", "There is a piece of clothing that is hanging around the neck of this person. it usually accompanies a suit to be formal."], "image": "val2014/COCO_val2014_000000515391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194851, "question_id": "FToNkWU2zga2NJBodahZq4", "question": "What does the unit named Kohler provide?", "choices": ["water", "electricity", "heat", "air conditioning"], "correct_choice_idx": 3, "direct_answers": ["restroom gear", "air conditioning", "electricity", "air conditioning", "ice", "air conditioning", "air conditioner", "electricity", "air condition", "electricity"], "difficult_direct_answer": false, "rationales": ["Water (d). the brand specializes in home and plumbing, most directly related to water.", "The unit is the ac.", "There are large air machines outside."], "image": "val2014/COCO_val2014_000000194851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382342, "question_id": "Fb2bhanjAatxNtRUwCehkD", "question": "What is this expanse of poured cement?", "choices": ["swimming pool", "public park", "skate park", "zoo exhibit"], "correct_choice_idx": 2, "direct_answers": ["skate park", "skate park", "skate park", "skate park", "skateboarder rink", "skate park", "skateboarder rink", "skateboarder rink", "skate park", "skate park"], "difficult_direct_answer": false, "rationales": ["The area is designed so you can do tricks and ride your wheeled sport equipment.", "There are people having fun on this sloped course made out of hardened material.", "This area is a skate park."], "image": "train2014/COCO_train2014_000000382342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99423, "question_id": "G8oNhp9SayvYJMuL57xAKP", "question": "What animal is on the sticker of the white laptop?", "choices": ["cat", "monkey", "penguin", "bear"], "correct_choice_idx": 2, "direct_answers": ["penguin", "cow", "dog", "penguin", "penguin", "penguin", "penguin", "penguin", "penguin", "dog"], "difficult_direct_answer": false, "rationales": ["It has black and white skin.", "A white and black animal with a beak and webbed type feet is on a sticker.", "There is a black and white chubby bird."], "image": "train2014/COCO_train2014_000000099423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395216, "question_id": "GA6HW7HTejryz4zNiqwB87", "question": "Why are they on the pavement?", "choices": ["fell", "broken boards", "awaiting race", "hiding"], "correct_choice_idx": 2, "direct_answers": ["competing", "awaiting race", "racing", "riding skateboards", "racing", "competing", "skateboarding", "skating", "skating", "racing"], "difficult_direct_answer": false, "rationales": ["The skateboarders are in the starting position, ready to race. they take this position so that no one can get a head start.", "People are skateboarding down a street in uniforms with people watching.", "Many people are standing on both sides of a track. there are many contestants ready to go to start race."], "image": "train2014/COCO_train2014_000000395216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531163, "question_id": "Ghibs4Pt3TocNXKeJR7g5F", "question": "Who is the husband of the woman referenced in the bible who's name is on the top window?", "choices": ["jesus", "joseph", "phil", "adam"], "correct_choice_idx": 3, "direct_answers": ["adam", "eve", "adam", "adam", "adam", "adam", "adam", "adam", "adam", "adam mate"], "difficult_direct_answer": false, "rationales": ["Adam was the man to eve in the bible.", "According to the bible these were the first two people", "Adam and eve."], "image": "val2014/COCO_val2014_000000531163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499826, "question_id": "GnWzkYenK32o7Q4BxdKzzq", "question": "What is the expression on the woman's face?", "choices": ["worry", "scare", "excitement", "disgust"], "correct_choice_idx": 2, "direct_answers": ["excitement", "surprised happy", "surprise", "awe", "surprise", "happiness", "happy", "happiness", "excitement", "surprise"], "difficult_direct_answer": false, "rationales": ["The woman is very excited.", "The woman's mouth and eyes are open wide, and her eyebrows are unfurrowed. she is experiencing a positive emotion.", "Her mouth is wide open and smiling"], "image": "val2014/COCO_val2014_000000499826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274907, "question_id": "GpkX8VVCSLEwbdME24Hua2", "question": "What can't be done in this room?", "choices": ["drinking", "smoking", "dancing", "talking"], "correct_choice_idx": 1, "direct_answers": ["sleep", "skiing", "snowboard", "skiing", "snowboarding", "no smoking", "snowboard", "snowboard", "smoking", "snowboard"], "difficult_direct_answer": false, "rationales": ["A sign showing a cigarette with a line through it is on the wall.", "Here is a sign above the woman on the right banning the activity.", "There is no smoking sign behind them."], "image": "train2014/COCO_train2014_000000274907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405179, "question_id": "GwziUHcZ27r93KJKGpZUdJ", "question": "On this day the weather was?", "choices": ["snowy", "blue skies", "rainy", "sunny"], "correct_choice_idx": 2, "direct_answers": ["cloudy", "rainy", "rain", "rainy", "rainy day", "rainy", "i agree", "rainy", "rainy", "rainy"], "difficult_direct_answer": false, "rationales": ["The gloomy sky and water on the ground prove there has been some precipitation in this area recently.", "The weather is rainy.", "The clouds in the sky are a dark gray color and the floor is covered in ponds."], "image": "train2014/COCO_train2014_000000405179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425325, "question_id": "GxSuiDFrb8trQD3hPKby4j", "question": "What media company produced the franchise on the boy's shirt?", "choices": ["lucas arts", "pixar", "warner", "dreamworks"], "correct_choice_idx": 1, "direct_answers": ["pixar", "disney", "disney", "disney", "disney", "disney", "pixar", "pixar", "speed", "marvel"], "difficult_direct_answer": false, "rationales": ["I saw cars the movie and remember that pixar made it.", "The company is pixar.", "The boy has on a cars shirt."], "image": "train2014/COCO_train2014_000000425325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239001, "question_id": "H5nCVYZcybkqXhcdy384LL", "question": "What's the purpose of the poled items?", "choices": ["block sun", "block rain", "block wind", "block snow"], "correct_choice_idx": 0, "direct_answers": ["shade sun", "offer shade", "umbrellas", "block sun", "shade", "shade", "shade", "provide shade", "umbrellas", "umbrellas"], "difficult_direct_answer": false, "rationales": ["The poles appear to be umbrellas and umbrellas have two purposes, block water or sunlight.", "They are umbrellas.", "They keep the sun out."], "image": "train2014/COCO_train2014_000000239001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315314, "question_id": "HCKrcHuWs6YA9skP8rNudH", "question": "Which person will try this cake first?", "choices": ["groomsman", "bride", "groom", "both"], "correct_choice_idx": 3, "direct_answers": ["her", "bride", "both", "groom", "bride", "bride", "bride", "bride", "woman", "woman"], "difficult_direct_answer": false, "rationales": ["Tradition has it that the bride and groom feed each other the cake on their wedding day first.", "The people are bride and groom based on their attire and the style of the cake in front of them. per tradition, these two would each take cake on their wedding.", "The people will cut together."], "image": "train2014/COCO_train2014_000000315314.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75423, "question_id": "HziGtW4fbPY6WRWKwruPeC", "question": "The translation of the warning states that objects are what than they appear?", "choices": ["further", "bigger", "smaller", "closer"], "correct_choice_idx": 3, "direct_answers": ["closer", "closer", "closer", "true", "closer", "closer", "spanish", "closer", "closer", "in mirror"], "difficult_direct_answer": false, "rationales": ["I know that this is what rearview mirrors say.", "The phrase on the mirror is a common and recognizable phrase.", "The word means to be very close."], "image": "train2014/COCO_train2014_000000075423.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27495, "question_id": "Jv7DKxW3nzJ54kmXugqpvV", "question": "What is in the mouth of the horse?", "choices": ["apple", "bit", "saddle", "spurs"], "correct_choice_idx": 1, "direct_answers": ["bit", "bit", "bit", "teeth", "bit", "bit", "hardest", "bit", "reins", "bit"], "difficult_direct_answer": false, "rationales": ["There is a metal piece with attached reins running through its mouth.", "There is a metal object in the horse's mouth. this is connected to a bridle which is used to control the horse.", "It is the piece that helps you guide the horse."], "image": "train2014/COCO_train2014_000000027495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421689, "question_id": "JvEjJ4dzn8QKRGFsby3eNF", "question": "Why are the animals eyes white?", "choices": ["light reflection", "blindness", "eye disease", "photoshopped"], "correct_choice_idx": 0, "direct_answers": ["light reflection", "camera flash", "light", "light flash", "flash", "camera flash", "camera flash", "camera flash", "open", "camera flash"], "difficult_direct_answer": false, "rationales": ["They are blurred out.", "The cat is reflecting the light.", "The flash of the camera will cause this to happen with animals."], "image": "train2014/COCO_train2014_000000421689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118964, "question_id": "K7FUnAn637n4eg8Sto8Mkc", "question": "What is the seated man's profession?", "choices": ["athlete", "dentist", "teacher", "doctor"], "correct_choice_idx": 0, "direct_answers": ["baseball player", "baseball", "baseball player", "baseball player", "baseball player", "athlete", "baseball player", "athlete", "athlete", "baseball player"], "difficult_direct_answer": false, "rationales": ["The man is wearing a professional sport team jersey.", "He is wearing a sports jersey and signing an autograph.", "The man is signing a bat which is used in athletics, specifically baseball."], "image": "train2014/COCO_train2014_000000118964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76541, "question_id": "KBWCFkW9uM3Ydczq8S2QtH", "question": "What colors are on the child's shirt who's birthday it is?", "choices": ["red white", "orange white", "blue green", "blue white"], "correct_choice_idx": 0, "direct_answers": ["red gray", "red white", "red blue", "red white", "red white", "red white", "red white", "red", "white red", "red black"], "difficult_direct_answer": false, "rationales": ["The child is wearing red, yellow, white and blue with a little green thrown in.", "The colors are red and white.", "The child who is cutting the cake is the birthday boy. his shirt is not blue, green, or orange."], "image": "train2014/COCO_train2014_000000076541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457348, "question_id": "L7FTVG5ftYwQJfcM7e7LqH", "question": "What video game is advertised on the bus?", "choices": ["max payne", "fortnite", "minecraft", "final fantasy"], "correct_choice_idx": 0, "direct_answers": ["max paydie", "max payne", "max payne", "red dead", "max payne", "max payne", "max payne", "max payne", "max payne", "max payne"], "difficult_direct_answer": false, "rationales": ["The lettering on the poster says max payne.", "Max payne is being advertised.", "The game is max payne."], "image": "val2014/COCO_val2014_000000457348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470029, "question_id": "MD29cgkrLhStPvBS8bYr9y", "question": "How was the sink's countertop geologically formed?", "choices": ["carbon based", "igneous processes", "metamorphic processes", "hand crafted"], "correct_choice_idx": 1, "direct_answers": ["igneous processes", "ground", "pressure", "marble", "marble", "rocks", "igneous rock", "marble", "square", "marble"], "difficult_direct_answer": false, "rationales": ["It was made by humans, and machines alike.", "There were processes.", "Marble is an igneous rock formed by pressure."], "image": "train2014/COCO_train2014_000000470029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86588, "question_id": "MVapdnetCjqLKDbqXYAejj", "question": "What brand are the batter's shoes?", "choices": ["nike", "under amour", "puma", "adidas"], "correct_choice_idx": 1, "direct_answers": ["under armour", "under amour", "under armour", "under armour", "adidas", "under armour", "under armour", "under armour", "reebok", "adidas"], "difficult_direct_answer": false, "rationales": ["The logo of a popular sports brand which consists of two of the letter \"u\", one of which is upside down is visible on the shoe.", "You can tell by the shoes logo as to who made the shoes.", "The logo on his cleats is for ua."], "image": "train2014/COCO_train2014_000000086588.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182242, "question_id": "MX6ZQGQ9PGdVs8Ltm5PXte", "question": "Which truck does this dog's owner possess?", "choices": ["blue truck", "semi truck", "distant truck", "white truck"], "correct_choice_idx": 3, "direct_answers": ["ford", "ford", "white truck", "white truck", "ford", "ford", "pickup", "ford", "white truck", "ford"], "difficult_direct_answer": false, "rationales": ["It's a white truck", "The truck is white, and the dog is laying on the bed of the white truck.", "The dog is sitting on a truck that is own drives and the truck is painted white."], "image": "train2014/COCO_train2014_000000182242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229304, "question_id": "MrBNhqpyaBB98X8Cj2D3Mv", "question": "The man with white gloves on plays for what team?", "choices": ["atlanta hawks", "atlanta braves", "atlanta thrashers", "atlanta falcons"], "correct_choice_idx": 1, "direct_answers": ["oakland as", "atlanta braves", "angels", "oakland as", "atlanta braves", "not clear", "braves", "braves", "astros", "home team"], "difficult_direct_answer": false, "rationales": ["On the man in the shirt reading johnson a stylized a is visible on his batting helmet. this a is a symbol for the atlanta braves baseball team.", "A. the \"a' on his hat is the symbol for the atlanta braves.", "The jerry have it all for the represented team."], "image": "val2014/COCO_val2014_000000229304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25024, "question_id": "NKKLutgNywtknMHoC6NGsy", "question": "What is the cable on the man's leg called?", "choices": ["locker", "surfboard leash", "straps", "usb cable"], "correct_choice_idx": 1, "direct_answers": ["leash", "leash", "board rope", "rope", "surfboard string", "leash", "safety cord", "leg rope", "surfboard leash", "surfboard leash"], "difficult_direct_answer": false, "rationales": ["The cable is for surfing.", "This is attached to the man's leg so he does not lose his board while surfing in the waves.", "The leash is for the board."], "image": "train2014/COCO_train2014_000000025024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135258, "question_id": "NMKwYNkL6TjFAjbATFju2U", "question": "Why is the black banner most likely flying in the snowboarder's location?", "choices": ["shade", "decoration", "warning", "advertisement"], "correct_choice_idx": 3, "direct_answers": ["wind", "wind", "olympics", "advertisement", "advertisement", "advertising", "advert", "advertisement", "wind", "competition location"], "difficult_direct_answer": false, "rationales": ["Normally at sporting events, sponsors pay money to have their logos displayed to attract publicity to their brand.", "The banner is an ad.", "The banner has the name of a sponsor so that it will be visible while the snowboarder is being filmed."], "image": "train2014/COCO_train2014_000000135258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205000, "question_id": "NN7ZRWShF8DRywLU8VNrQj", "question": "This attire is appropriate for what kind of event?", "choices": ["costume party", "business meeting", "court proceeding", "wedding"], "correct_choice_idx": 0, "direct_answers": ["costume party", "formal", "informal", "casual", "party", "no", "christmas party", "casual", "celebration", "office"], "difficult_direct_answer": true, "rationales": ["It is not appropriate for any of the other events so it is process of elimination.", "The attire is a costume.", "A costume party."], "image": "train2014/COCO_train2014_000000205000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528555, "question_id": "NkHW4KXcpj9vC7dNTxT7bk", "question": "The shade seen here was made from what fibers?", "choices": ["wool", "flax", "leaves", "grass"], "correct_choice_idx": 3, "direct_answers": ["straw", "leaves", "grass", "grass", "leaves", "palm tree", "plant", "palm", "coconut", "natural"], "difficult_direct_answer": false, "rationales": ["There is grass on the umbrella.", "The grass on the umbrella.", "The roofs are made of dead grass."], "image": "train2014/COCO_train2014_000000528555.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16959, "question_id": "Nnu3yZgikKqCQu3M7p4rnn", "question": "What are the black triangular pieces on the board called?", "choices": ["wings", "hooks", "traps", "fins"], "correct_choice_idx": 3, "direct_answers": ["fins", "wave runners", "fins", "fins", "fins", "wheels", "feet", "wheels", "paddle board", "sit"], "difficult_direct_answer": false, "rationales": ["A surfboard has projections to help with balance on one end.", "The fins are on the bottom rear of a surfboard.", "Based on the positioning of the board and the known structure and composition, answer a is clear."], "image": "val2014/COCO_val2014_000000016959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373748, "question_id": "PNzVFz3s339DCSgJW6NTji", "question": "What is the item on the ceiling most likely?", "choices": ["air conditioner", "ceiling fan", "poster", "trap door"], "correct_choice_idx": 0, "direct_answers": ["air conditioner", "air conditioner", "air conditioner", "air conditioner", "air conditioner", "exhaust vents", "air conditioner", "air conditioner", "air conditioning", "ac"], "difficult_direct_answer": false, "rationales": ["Air conditioners are usually located on the ceiling, also the size and color of it indicates it is an air conditioner.", "The object located in the question is the shape, size and design consistent with answer a and is placed in a location that answer a would be in this setting.", "The object in question is the size, shape and design in accordance with answer a and is in a position consistent with its usage."], "image": "val2014/COCO_val2014_000000373748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289618, "question_id": "PQfCQMAVwEgXpLCPVxHzS2", "question": "What does the fence prevent the dogs from doing?", "choices": ["escaping", "eating", "sleeping", "walking"], "correct_choice_idx": 0, "direct_answers": ["escaping", "running away", "escaping", "escaping", "running away", "escaping", "escaping", "running away", "running away", "escaping"], "difficult_direct_answer": false, "rationales": ["It keeps the dog in.", "The fence keeps the dogs from running away.", "The dogs can not get over the fence."], "image": "val2014/COCO_val2014_000000289618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521386, "question_id": "PQjtdoFX3CcqggzYkjXTKC", "question": "What company's mascot can be seen on the right next to the donut?", "choices": ["sony", "android", "apple", "disney"], "correct_choice_idx": 1, "direct_answers": ["goggle", "android", "google", "android", "android", "android", "android", "android", "android", "android"], "difficult_direct_answer": false, "rationales": ["There is an alien.", "I don't see any apple, sony or disney characters so the only thing left is the android figure standing.", "The mascot is for phones."], "image": "train2014/COCO_train2014_000000521386.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415322, "question_id": "PVthpTSxgekjgPkMnRdbaM", "question": "How many liters of oil can be used in this machine per batch?", "choices": ["40", "20", "15", "50"], "correct_choice_idx": 2, "direct_answers": ["ten", "unknown", "two", "onehundred", "35 pounds", "15", "lot", "many liters", "15", "six"], "difficult_direct_answer": true, "rationales": ["Fifteen can be used in it.", "This number comes closest to the standard for commercial donut fryers.", "Traditionally these types of donut frying machines can handle up to 12 liters of oil."], "image": "train2014/COCO_train2014_000000415322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163180, "question_id": "PpAMbcKaWvA5qPYPTzMqQC", "question": "What do the apples in the basket have that the other fruits don't?", "choices": ["worms", "produce stickers", "green color", "bruises"], "correct_choice_idx": 1, "direct_answers": ["stickers", "no faces", "stickers", "stickers", "stickers", "stems", "sticker", "nose", "produce stickers", "sticker"], "difficult_direct_answer": false, "rationales": ["The apple is the only fruit that has a code to be used for purchase printed on it.", "The apples don't have stickers.", "The apples have produce stickers."], "image": "train2014/COCO_train2014_000000163180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439658, "question_id": "PqUdAbC87sgW7WdoFYz2su", "question": "What pattern is the women's pants?", "choices": ["camouflage", "plaid", "stripes", "corduroy"], "correct_choice_idx": 1, "direct_answers": ["plaid", "checked", "plaid", "plaid", "checkers", "plaid", "checkered", "checkered", "plaid", "checks"], "difficult_direct_answer": false, "rationales": ["The pattern of the pants of the person in question is clearly visible and is known to be answer a based on the squared layout.", "The pattern is plaid.", "The woman is wearing pants with a checkered pattern."], "image": "val2014/COCO_val2014_000000439658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53183, "question_id": "QP2aJDwXRkvzimZk7iB3ga", "question": "What type of pizza is this?", "choices": ["cheese", "brick oven", "deep dish", "pepperoni"], "correct_choice_idx": 1, "direct_answers": ["brick oven", "arugula", "large", "cheese", "margherita", "vegetarian", "oven", "margherita", "garden", "vegetarian"], "difficult_direct_answer": false, "rationales": ["The crusts look extra toasted from the burnt edges and is really, really thick, seemingly from said oven.", "The pizza is brick oven.", "The pizza is from a brick oven."], "image": "val2014/COCO_val2014_000000053183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386853, "question_id": "QVhCPu6ZDXpjLyvwDrHVTC", "question": "Why are all these phones here?", "choices": ["stolen", "being charged", "owner rich", "for sale"], "correct_choice_idx": 1, "direct_answers": ["charging", "stolen", "charging", "phone", "repair", "charging station", "charging", "for sale", "charging", "being charged"], "difficult_direct_answer": false, "rationales": ["A phone cannot use properly without a charge on it. in order to use it properly and its functions and applications, charge it properly.", "A group of phones all have wires coming from them.", "A bunch of phones are together with cords connected. phones are plugged in to be charged."], "image": "train2014/COCO_train2014_000000386853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334645, "question_id": "R7ZHy2GorgarKkWmeZPhFS", "question": "What country does the sponsor closest to the camera have it's headquarters located?", "choices": ["israel", "ukraine", "germany", "poland"], "correct_choice_idx": 2, "direct_answers": ["germany", "france", "germany", "germany", "germany", "germany", "london", "germany", "jeneva", "germany"], "difficult_direct_answer": false, "rationales": ["Israel in gamning.", "Germany is the closest to the camera.", "The country is germany."], "image": "train2014/COCO_train2014_000000334645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215755, "question_id": "REEe9a5rjVStVVJGSfzEhH", "question": "What is the woman doing?", "choices": ["sitting", "walking", "standing", "running"], "correct_choice_idx": 0, "direct_answers": ["reading", "sitting down", "moisturizing", "sitting", "sitting", "waiting", "sitting", "sitting", "waiting", "sitting"], "difficult_direct_answer": false, "rationales": ["A woman is stationary and not moving. she is checking something out on her phone.", "The woman has her hands in her lap.", "When someone's bottom is resting on the floor or on a chair. she is not standing or in motion."], "image": "val2014/COCO_val2014_000000215755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403718, "question_id": "RFNXf7REhQUEpbyQKFAa2P", "question": "What's being featured on the TV in this home?", "choices": ["soap operas", "wrestling", "video gaming", "cooking show"], "correct_choice_idx": 2, "direct_answers": ["wii fit", "video game", "game", "video game", "unknown", "games", "video gaming", "game", "video game", "wii fit"], "difficult_direct_answer": false, "rationales": ["A person is standing in front of a television with a gaming remote in hand.", "Because the girl has a wii controller which is used for video games.", "The video game is up."], "image": "train2014/COCO_train2014_000000403718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524420, "question_id": "RS9WK3ywbCjXjtFnmK8A8M", "question": "What is the man wearing jeans looking at?", "choices": ["woman", "phone", "food cart", "shoe lace"], "correct_choice_idx": 1, "direct_answers": ["phone", "ground", "ground", "ground", "ground", "phone", "phone", "ground", "ground", "ground"], "difficult_direct_answer": false, "rationales": ["He is looking down in a direction that would make sense to be looking at a phone.", "He appears to be looking down at something in his hand.", "The man is looking at his phone."], "image": "train2014/COCO_train2014_000000524420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103393, "question_id": "RZUBFxaac9KTvUzrRbFFek", "question": "What is to the left of the cone?", "choices": ["helicopter", "bicycle", "television", "bear"], "correct_choice_idx": 1, "direct_answers": ["bicycle", "buses", "bus", "bicycle", "bus", "bicycle", "bus", "bus", "bike", "bus"], "difficult_direct_answer": false, "rationales": ["There is a bike on a platform on back of vehicle.", "A bike is to the left.", "The orange cone can be seen in the background of the picture and when you look directly to its left, you can see a bicycle hitched onto the back of a vehicle."], "image": "train2014/COCO_train2014_000000103393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351626, "question_id": "Rcr5boPCYyRR9Rk38c5K9A", "question": "What size are these beds?", "choices": ["twin", "king", "full size", "queen"], "correct_choice_idx": 2, "direct_answers": ["queen", "large", "large", "queen", "full size", "queen", "queen", "queen", "large", "queen"], "difficult_direct_answer": false, "rationales": ["Two regular sized beds can be seen in the room.", "Beds big enough for two are in a room next to each other. full size beds are bigger than twins which fit one person.", "They are bigger than twin but smaller than queen."], "image": "val2014/COCO_val2014_000000351626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137810, "question_id": "S6sskgyCVvFQhBAEgcTBSs", "question": "Who in the greatest danger?", "choices": ["right man", "middle woman", "left man", "right woman"], "correct_choice_idx": 1, "direct_answers": ["middle man", "person", "passenger", "crossing pedestrian", "pedestrian", "on tracks", "lady", "middle woman", "pedestrians", "train"], "difficult_direct_answer": true, "rationales": ["A because the middle woman is in the middle of all 5 lanes of traffic.", "She is standing in the middle of an intersection. cars will cross through the intersection from all directions.", "The woman in the middle of the intersection is in danger because she can possibly get hit by a car coming from any direction."], "image": "val2014/COCO_val2014_000000137810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390401, "question_id": "SF3nPXLbvLc2yZYuLQUXse", "question": "What is the man's profession?", "choices": ["umpire", "waiter", "coach", "athlete"], "correct_choice_idx": 3, "direct_answers": ["baseball player", "baseball player", "baseball player", "baseball player", "baseball", "target ball", "athlete", "playing", "baseball player", "baseball player"], "difficult_direct_answer": false, "rationales": ["He is playing baseball.", "The umpire refereed the game, and the coach doesn't play, just teaches the athletes. waiters are not involved in sports.", "He is a baseball palyer."], "image": "val2014/COCO_val2014_000000390401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516430, "question_id": "SGioV5asN4tdiD8J46EMrT", "question": "Where is this vehicle's motor located?", "choices": ["roof", "underneath", "rear", "under tire"], "correct_choice_idx": 2, "direct_answers": ["behind", "rear", "rear", "trunk", "back", "rear", "vehicle's rear", "trunk", "trunk", "back"], "difficult_direct_answer": false, "rationales": ["The volkswagen beetle has the engine in the back.", "Since the front is full of personal items, the motor must be on the opposite side of the car.", "The older versions of this car (the beetle by volkswagen), would build their engines into the trunk of the car. as seen in the picture, the trunk space of this car is located in the front part of the car. modern day vehicle's typically have the engines built into the front part of the car."], "image": "val2014/COCO_val2014_000000516430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47742, "question_id": "SNuHhBLQnfacf86SizaGiM", "question": "What as the tobe passengers acting as?", "choices": ["doctors", "knights", "pirates", "zombies"], "correct_choice_idx": 3, "direct_answers": ["director", "zombie", "zombies", "zombies", "rider", "zombies", "zombies", "zombies", "zombies", "laughing"], "difficult_direct_answer": false, "rationales": ["The passenger is a zombie.", "They have makeup on to make them look dead and they are holding their arms out as if walking like they are dead", "These people are walking slow with their arms stretched out in front."], "image": "train2014/COCO_train2014_000000047742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243008, "question_id": "SQUaniN9A4zbsKrqqn6sPZ", "question": "What's the name of the thin structures in the water coming from the land?", "choices": ["piers", "ropes", "benches", "docks"], "correct_choice_idx": 3, "direct_answers": ["pennisula", "pier", "docks", "shore", "docks", "piers", "dock", "trees", "boards", "docks"], "difficult_direct_answer": false, "rationales": ["The thin structures are boat docks.", "These are for boats to tie up to", "The thin structures coming from the land into the water are docks where boats can park."], "image": "train2014/COCO_train2014_000000243008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397769, "question_id": "SaTARcDZxnJVVz5gKLkand", "question": "Why is the plane on the grass?", "choices": ["it crashed", "for display", "for passengers", "it landed"], "correct_choice_idx": 1, "direct_answers": ["display purposes", "for display", "museum", "jet", "landing position", "display", "gravity", "display", "display", "landed there"], "difficult_direct_answer": false, "rationales": ["The plane is on the grass so that others can admire it. it is visually appealing and grabs the attention of others.", "There is a display.", "It's an antique aircraft on show."], "image": "train2014/COCO_train2014_000000397769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243071, "question_id": "T9SGurFgbpysYrkQ2dkm4i", "question": "What is the man's jacket made out of?", "choices": ["plastic", "leather", "wool", "denim"], "correct_choice_idx": 1, "direct_answers": ["leather", "material", "leather", "from leather", "leather", "polyester", "leather", "leather", "cloth", "leather"], "difficult_direct_answer": false, "rationales": ["Most biker jackets are made of leather and this man appears to be a biker.", "It's made from an animal hide", "You can see the shine from it and i know what most motorcycle people wear."], "image": "train2014/COCO_train2014_000000243071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95203, "question_id": "U3MPN7Hg8xyW6T6ocGaFbb", "question": "What type of pants does the woman in the red hat have on?", "choices": ["leather", "bell bottoms", "denim", "capris"], "correct_choice_idx": 1, "direct_answers": ["bell bottoms", "bell bottoms", "bell bottoms", "striped", "bell bottoms", "flag", "bellbottoms", "bellbottoms", "bell bottoms", "striped pants"], "difficult_direct_answer": false, "rationales": ["The pant style is given away by the large width at the bottom of the leg. it is a common style.", "The pants cover her entire legs, so they are not capris. they are not made out of leather or denim.", "Bell bottom pants are long and flare below the knee."], "image": "train2014/COCO_train2014_000000095203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327660, "question_id": "UDMsiA2kKWMiR7esvXDG9s", "question": "What of food is on the table?", "choices": ["soup", "meat", "floor", "salad"], "correct_choice_idx": 3, "direct_answers": ["chinese", "broccoli beef", "main course", "broccoli", "chinese", "stir fry", "salad", "chinese", "beef vegetables", "chinese"], "difficult_direct_answer": false, "rationales": ["This most closely resembles a soup. there is a small amount of broth, but there is broth or some liquid nonetheless. this makes the soup the most appropriate categorization for this combination of food.", "The food has a large amount of protein in it.", "The food is all veggies."], "image": "train2014/COCO_train2014_000000327660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376719, "question_id": "UP3jsDpr259zwpFdLJMLnD", "question": "What type of shot is the woman hitting?", "choices": ["backhand", "serve", "forehand", "slice"], "correct_choice_idx": 2, "direct_answers": ["backhand", "serve", "upper", "hard", "backhand", "fronthand", "underhand", "lob", "forehand", "serve"], "difficult_direct_answer": false, "rationales": ["The woman is taking a swing of the racquet with her forehand.", "She is hitting. aforhand.", "The woman is using her forehand."], "image": "train2014/COCO_train2014_000000376719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385248, "question_id": "UWgUKJFmXRfZf4eok7xouF", "question": "What's holding the motorcycle up?", "choices": ["another motorcycle", "kickstand", "man", "3rd wheel"], "correct_choice_idx": 1, "direct_answers": ["kickstand", "kickstand", "kickstand", "wheels", "kickstand", "kickstand", "leg", "woman", "kickstand", "kickstand"], "difficult_direct_answer": false, "rationales": ["The kickstand is visible on the bottom side of the motorcycle.", "A girl is sitting on a motorcycle that is parked.", "The motorcycle kickstand is up."], "image": "val2014/COCO_val2014_000000385248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353378, "question_id": "UZRCZLQDsJPiF3vw53LVjx", "question": "Where can you find the light brown items that are decorating the bottom of the cake?", "choices": ["forest", "ocean", "desert", "jungle"], "correct_choice_idx": 1, "direct_answers": ["ocean", "beach", "seashore", "bakery", "beach", "beach", "ocean", "beach", "ocean", "beach"], "difficult_direct_answer": false, "rationales": ["They are found under the ocean and on the shores as well.", "The seashells on the fancy cake can be found in the ocean or on a beach.", "A. the light brown items are seashells and they can be found in oceans."], "image": "train2014/COCO_train2014_000000353378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191641, "question_id": "UkAPuPr9JEDrKhYYyJ847b", "question": "What country is represented by the eagle symbol?", "choices": ["mexico", "puerto rico", "america", "britain"], "correct_choice_idx": 3, "direct_answers": ["poland", "britain", "england", "germany", "train company", "mexico", "italy", "germany", "poland", "prussia"], "difficult_direct_answer": false, "rationales": ["There is a crest.", "The bird of mexico is an eagle and the colors also indicate that it would be in mexico,.", "The train has the symbol of the uk."], "image": "train2014/COCO_train2014_000000191641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 386876, "question_id": "UoWe8yDrjcf3uquqYbPTZt", "question": "The treats eaten here are prepared in what boiling hot substance?", "choices": ["water", "acid", "milk", "oil"], "correct_choice_idx": 3, "direct_answers": ["oil", "oil", "oil", "oil", "oil", "cookies", "oil", "oil", "shortening", "oil"], "difficult_direct_answer": false, "rationales": ["The food they are eating appear to be donuts and donuts are typically fried, fried foods are usually made with oil.", "They are fried in oil.", "The objects are donuts based on their size and shape. these are commonly known to be prepared in answer a."], "image": "train2014/COCO_train2014_000000386876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397685, "question_id": "VCNJAtK7M5XBedgfjw3ZC4", "question": "What tips you off that this person isn't actually using the bathroom here?", "choices": ["book", "tank", "pants", "toilet lid"], "correct_choice_idx": 3, "direct_answers": ["down lid", "seat down", "lid closed", "outdoors", "no plumbing", "curb", "no plumbing", "outside", "toilet lid", "closed lid"], "difficult_direct_answer": true, "rationales": ["It's down. so, unless someone drilled a hole in it, this is a joke. also the tank lid is gone.", "The person is on a toilet lid.", "The tank of the toilet is open."], "image": "train2014/COCO_train2014_000000397685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369823, "question_id": "VK5FRsMGYERZfwwGpHg9Uz", "question": "Which writing shown on official signage was most likely put there in violation of law?", "choices": ["four", "stop", "rewarding failure", "way"], "correct_choice_idx": 2, "direct_answers": ["graffiti", "rewarding failure", "rewarding failure", "rewarding failure", "stop", "stop", "rewarding failure", "graffiti", "offensive", "rewarding failure"], "difficult_direct_answer": false, "rationales": ["The letters look very unprofessional as if a kid wrote them. stop signs are generally not found with these words on them.", "All stop signs only have one word on them, it is also in a different size and font.", "Stop signs only say stop - anything else is vandalism."], "image": "train2014/COCO_train2014_000000369823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54427, "question_id": "VtVkoPPqv5RNfGnwEfYZCo", "question": "The color of the boy's outfit matches the colors of the costume of what super hero?", "choices": ["batman", "spider man", "wolverine", "iron man"], "correct_choice_idx": 1, "direct_answers": ["spiderman", "superman", "spider man", "spiderman", "superman", "spiderman", "superman", "superman", "superman", "spiderman"], "difficult_direct_answer": false, "rationales": ["Spider man is known for red.", "The boy's clothing is blue and red, just like the super hero's costume.", "The colors of the boys outfit are the same colors as spiderman."], "image": "train2014/COCO_train2014_000000054427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559550, "question_id": "VuhonjSitbQKivYvGXgrMa", "question": "What type of seating is the little girl on the right sitting in?", "choices": ["highchair", "booster seat", "stool", "bench"], "correct_choice_idx": 1, "direct_answers": ["booster seat", "high chair", "highchair", "highchair", "booster seat", "high chair", "high chair", "high chair", "highchair", "high chair"], "difficult_direct_answer": false, "rationales": ["She's in a booster.", "She is in a seat for little kids.", "The little girls is sitting in a chair that is just high enough to reach the top of the table."], "image": "val2014/COCO_val2014_000000559550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31616, "question_id": "VxbRCwMJwbtc5DUCryupxS", "question": "What is the man playing?", "choices": ["chess", "his guitar", "nothing", "video games"], "correct_choice_idx": 3, "direct_answers": ["bowling", "wii", "wii", "video game", "bowling", "video games", "video games", "wii", "wii bowling", "videao game"], "difficult_direct_answer": false, "rationales": ["The man plays video games.", "The man is playing wii sports on the nintendo wii. nintendo makes video games.", "He is holding a wii remote which allows him to interact with the bowling game."], "image": "train2014/COCO_train2014_000000031616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5140, "question_id": "W6KNhxs6DLmJ7Z6TxvfEUE", "question": "What sport is on the TV?", "choices": ["baseball", "ice hockey", "football", "basketball"], "correct_choice_idx": 1, "direct_answers": ["ice hockey", "ice hockey", "hockey", "hockey", "hockey", "hockey", "hockey", "hockey", "hockey", "cycling"], "difficult_direct_answer": false, "rationales": ["The man is wearing an nhl jersey and helmet.", "The player on the tv is wearing a vancouver canucks jersey. he is skating on a white surface.", "The person on the television is wearing hockey gear."], "image": "train2014/COCO_train2014_000000005140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495131, "question_id": "WHGmmBWzw6gsubFgWhxgfo", "question": "What helps keep the players feet on the board?", "choices": ["straps", "seatbelt", "vest", "bracelet"], "correct_choice_idx": 0, "direct_answers": ["boots", "bindings", "straps", "clips", "gravity", "momentum", "balancing themselves", "gravity", "clamps", "balance"], "difficult_direct_answer": true, "rationales": ["There's flexible harnesses attached to the player's shoes.", "The feet are held by straps to the board.", "Straps allow the man to stay on the board."], "image": "val2014/COCO_val2014_000000495131.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45361, "question_id": "WHkT8xFcFCq8GCoXWW7rR5", "question": "If this were a hotel bathroom what kind of hotel would it be?", "choices": ["holiday retreat", "budget", "beach resort", "four star"], "correct_choice_idx": 1, "direct_answers": ["four star", "three star", "3-star motel", "budget", "regular", "motel", "luxury", "luxury", "luxury", "disinfecting wipes"], "difficult_direct_answer": false, "rationales": ["Of the answers possible, answer a has decor that could be consistent with the image, while the other options have characteristic features that are not seen.", "This looks like a cheap hotel.", "While it looks like a nice place, the shower curtain is a giveaway that this is not a high class hotel."], "image": "train2014/COCO_train2014_000000045361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375143, "question_id": "WQLxaqS4QdDPCRsi7BHMyC", "question": "What is most likely on the ground outside the image's frame?", "choices": ["spikes", "shoes", "garbage", "skateboard"], "correct_choice_idx": 1, "direct_answers": ["shoes", "concrete", "her shoes", "shoes", "concrete", "motorcycle", "foot path", "concrete", "asphalt", "pavement"], "difficult_direct_answer": false, "rationales": ["The girl in the image is not wearing shoes so they are likely on the ground.", "The woman is in her stocking feet, and she is in an outdoor setting which would have necessitated her wearing some type of footwear.", "The shoes are outside."], "image": "val2014/COCO_val2014_000000375143.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145999, "question_id": "WnfGDRmhozY5zmN8KddcSV", "question": "What type of environment would the extra long bus normally be seen?", "choices": ["highway", "country side", "freeway", "downtown"], "correct_choice_idx": 3, "direct_answers": ["outside", "city", "open road", "downtown", "urban", "city", "city", "large city", "city", "large city"], "difficult_direct_answer": false, "rationales": ["The environment is downtown.", "The extra long bus would be useful in any large city.", "Buses are used for transportation."], "image": "train2014/COCO_train2014_000000145999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290224, "question_id": "WvGvS7AxSpNkTiPJvsTZTG", "question": "What did the child do with the Frisbee that's making him smile?", "choices": ["threw it", "missed it", "caught it", "kicked it"], "correct_choice_idx": 2, "direct_answers": ["catch", "catch", "holding", "throwing", "caught it", "holding", "catching", "caught it", "caught it", "catch it"], "difficult_direct_answer": false, "rationales": ["He caught the frisbee.", "The child's arms are over his head and his hands are positioned to catch the frisbee.", "His hands are up in the air as he grabs it"], "image": "train2014/COCO_train2014_000000290224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436306, "question_id": "X7pcCDSBbFtjtDGLjM8c8M", "question": "What is strange about this person's arm?", "choices": ["hairy", "extended forearm", "freckles", "blue"], "correct_choice_idx": 1, "direct_answers": ["has bananas", "bananas", "bananas", "banana", "attached bananas", "bananas attached", "length", "bananas", "extended forearm", "banana ties"], "difficult_direct_answer": false, "rationales": ["A person is holding their arm out with bananas lined up on it.", "The person's forearm has three bananas attached to it that are floating in the air.", "The person has bananas on their arm."], "image": "train2014/COCO_train2014_000000436306.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353999, "question_id": "XAzjs8bHWSevmiJaW2xLAS", "question": "What is the red thing near the man's mouth?", "choices": ["paint", "beet juice", "blood", "ketchup"], "correct_choice_idx": 0, "direct_answers": ["makeup", "facepaint", "face paint", "makeup", "make up", "image", "lipstick", "paint", "make up", "makeup"], "difficult_direct_answer": false, "rationales": ["The man has red paint by the sides of his mouth to make it look like he has a really wide smile.", "He has paint on his face.", "Paint is used when making your face look like a clown."], "image": "train2014/COCO_train2014_000000353999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537097, "question_id": "XFspDm7ovt4LqEmhKehgUj", "question": "What other type of things use this surface besides basketball players?", "choices": ["shoppers", "vehicles", "livestock", "dogs"], "correct_choice_idx": 1, "direct_answers": ["handball", "road", "two", "vehicles", "motor vehicles", "cars", "cars bikes", "n/a", "cars", "cars"], "difficult_direct_answer": false, "rationales": ["Cars use roads.", "There are yellow lines on the road telling drivers not to cross into the other lane.", "This is a street"], "image": "train2014/COCO_train2014_000000537097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196985, "question_id": "XGLLz7EK8KpoiVmXHbXc4u", "question": "What type of building would these toilets be found in?", "choices": ["historic", "public", "residential", "castle"], "correct_choice_idx": 1, "direct_answers": ["public", "old", "bathroom", "prison", "institutional", "outhouse", "public park", "restroom", "bathroom", "prison"], "difficult_direct_answer": false, "rationales": ["Because they are many and can be used by more people at once.", "There's multiple toilets in one room, where usually in private you only need one.", "The building is public."], "image": "train2014/COCO_train2014_000000196985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273582, "question_id": "XiTBE8JpoDHUaUDSk2xTjX", "question": "What is different about the batter from most batters?", "choices": ["height", "hits left-handed", "gender", "uniform"], "correct_choice_idx": 1, "direct_answers": ["handedness", "lefthanded", "left handed", "left handed", "hits left-handed", "lefty", "left handed", "left handed", "lower", "huge"], "difficult_direct_answer": false, "rationales": ["The baseball player is standing on the opposite side of the plate that most players stand, which indicates that he is a left-handed hitter.", "The handedness is different.", "The batter at the plate is standing on the right side of the plate which means he hits left-handed which is uncommon."], "image": "val2014/COCO_val2014_000000273582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575931, "question_id": "XwTLfekYUyAY2idgPAR4Lk", "question": "What is the function of the board under the boys feet?", "choices": ["balance", "reduce weight", "avoid soaking", "game"], "correct_choice_idx": 1, "direct_answers": ["nothing", "stability", "skating", "rain time", "leverage", "snowboard", "reduce weight", "standing", "skating", "stay attached"], "difficult_direct_answer": true, "rationales": ["This helps distribute his weight and also give him a comfortable place to ride", "The board would reduce weight.", "Its a skateboard which requires significant balancing when you are using it."], "image": "val2014/COCO_val2014_000000575931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336333, "question_id": "Y4czaSbt225fVLNbqBbwo9", "question": "How is the boy's sweater done up?", "choices": ["buttons", "velcro", "zipper", "buckles"], "correct_choice_idx": 2, "direct_answers": ["zipped", "zipper", "zipper", "zipper", "zipper", "nicely", "zipped", "zipper", "zipper", "zipped"], "difficult_direct_answer": false, "rationales": ["The white thing up the middle of his jacket zips it closed.", "The boy has a zipper.", "The boy is wearing a standard hoodie."], "image": "train2014/COCO_train2014_000000336333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329055, "question_id": "YqpGBqJDRpqbcNniGsod7b", "question": "What is the elephant statue being placed on?", "choices": ["dock", "canoe", "surfboard", "beach"], "correct_choice_idx": 0, "direct_answers": ["dock", "stone state", "boat", "raft", "raft", "dock", "boat", "float", "raft", "raft"], "difficult_direct_answer": false, "rationales": ["It is over the water", "The elephant is being put on the dock.", "The elephant is being placed on a wooden structure with multiple people standing on it which is located above a body of water."], "image": "train2014/COCO_train2014_000000329055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60069, "question_id": "Yw74SgRJGG3AmDY3DjcAnM", "question": "What's the name of the extreme sport the guy is doing?", "choices": ["extreme surfing", "kiteboarding", "super surfing", "sailing"], "correct_choice_idx": 1, "direct_answers": ["swimming", "parasailing", "kite surfing", "boarding", "wind sailing", "longboard", "water skiing", "kite surfing", "parasailing", "kiteboarding"], "difficult_direct_answer": false, "rationales": ["The name is kiteboarding.", "The sport is for kiteboarding.", "This is called kiteboarding because of the large kite that the man is attached to on his surfboard."], "image": "val2014/COCO_val2014_000000060069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376089, "question_id": "ZDutJcu7qYwr88KTJ7ZiEg", "question": "The brim of his hat is helpful for blocking the sun from getting into his what?", "choices": ["mouth", "beard", "eyes", "ears"], "correct_choice_idx": 2, "direct_answers": ["eyes", "eyes", "eyes", "ees", "face", "eyes", "eyes", "eyes", "eyes", "eyes"], "difficult_direct_answer": false, "rationales": ["The brim blocks eyes.", "He is using it to cover the sun.", "Hats go on the head which makes the sun stay out of the man's eyes."], "image": "train2014/COCO_train2014_000000376089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25283, "question_id": "ZFJ8XnHy3wSjf6sUQhartH", "question": "What does the sign behind the stop sign tell drivers they are unable to do?", "choices": ["enter", "turn left", "turn right", "exit"], "correct_choice_idx": 0, "direct_answers": ["to stop", "enter road", "no entry", "enter", "park", "enter", "cross", "enter", "enter", "enter area"], "difficult_direct_answer": false, "rationales": ["They cant enter that street.", "The sign is clearly to prevent driver's from heading against the flow of traffic as indicated by the stop sign.", "They cannot go."], "image": "train2014/COCO_train2014_000000025283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253577, "question_id": "ZXzDHQbTW2Aj6jrdryc95A", "question": "What direction are the stripes on the person's shirt going?", "choices": ["vertical", "horizontal", "diagonal", "intersected"], "correct_choice_idx": 1, "direct_answers": ["horizontal", "horizontal", "vertical", "horizontal", "down", "horizontal", "horizontal", "horizontal", "vertical", "across"], "difficult_direct_answer": false, "rationales": ["They are parallel to the floor on the sweater in the background which means they are horizontal.", "The lines are running from the left to the right side of the shirt.", "The stripes are going from side to side and not up and down"], "image": "train2014/COCO_train2014_000000253577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189168, "question_id": "ZsF2cRU9yLRctXnjXPp49d", "question": "What does the writing on the shirt mean?", "choices": ["team", "name", "brand", "sponsor"], "correct_choice_idx": 1, "direct_answers": ["player's name", "last name", "person 25", "name", "name", "last name", "player", "name", "player name", "his name"], "difficult_direct_answer": false, "rationales": ["This is how baseball player jerseys work.", "It tells who the player is", "Above their number, athletes last name is normally printed on the back of their sports jerseys as is seen on the back of the batter's shirt."], "image": "val2014/COCO_val2014_000000189168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216840, "question_id": "Zv4VroTsAEGeYbW6JvQurx", "question": "What is the maximum number of people the pink vehicle can safely carry?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "one", "two", "two", "two", "two", "one", "one", "one", "two"], "difficult_direct_answer": false, "rationales": ["There is a single seat spot and the rest is angled too far back to safely ride another", "The maximum number is two.", "There are two seats on the pink motorcycle."], "image": "train2014/COCO_train2014_000000216840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341459, "question_id": "aEyPbSTBB99jo2Uo9Zr6RN", "question": "Where does this man play ball?", "choices": ["street", "church", "public park", "college"], "correct_choice_idx": 2, "direct_answers": ["field", "baseball field", "central park", "central park", "central park", "central park", "ball park", "public park", "baseball field", "nyc"], "difficult_direct_answer": false, "rationales": ["The man is in a park.", "The man is in a park.", "The baseball player is at a public park that is surrounded by buildings."], "image": "train2014/COCO_train2014_000000341459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137830, "question_id": "aHVUsZPXmeyXmgqLXnY8D2", "question": "What is the person's occupation?", "choices": ["chef", "painter", "doctor", "dentist"], "correct_choice_idx": 0, "direct_answers": ["cook", "cook", "cooking", "chef", "cook", "cook", "cooking", "cook", "chef", "cook"], "difficult_direct_answer": false, "rationales": ["The person is cooking.", "The person cooks at the kitchen.", "A stove top and frying station are visible."], "image": "val2014/COCO_val2014_000000137830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478717, "question_id": "aQnE54qv997Cn8iGbNbBwx", "question": "What makes up the bulk of this bird's diet?", "choices": ["vegetables", "insects", "seeds", "fruits"], "correct_choice_idx": 3, "direct_answers": ["seeds", "seeds", "seeds", "fruits", "seeds", "seeds", "seeds", "seeds", "seeds", "seeds"], "difficult_direct_answer": false, "rationales": ["The bird's diet is made of fruit.", "The bulk is fruit.", "This type of bird has a main diet of fruit, and nuts."], "image": "train2014/COCO_train2014_000000478717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146830, "question_id": "aSMZPdhbZgTn2gH4EierN9", "question": "What is the man in black at the top right's position?", "choices": ["umpire", "batter", "catcher", "referee"], "correct_choice_idx": 3, "direct_answers": ["umpire", "umpire", "referee", "umpire", "umpire", "referee", "second-base umpire", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["The umpire stands behind the batter and the catcher to have a better angle on the baseball.", "The man is the referee.", "The man is identifiable based on the question. based on his uniform, position and the rules of baseball he would be known as answer b."], "image": "val2014/COCO_val2014_000000146830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278347, "question_id": "aet56vEzNU8GWfcj2d6s7L", "question": "Why is the bike's kickstand on a board?", "choices": ["make taller", "keep upright", "prevent theft", "mount easier"], "correct_choice_idx": 1, "direct_answers": ["protect ground", "more stable", "being displayed", "protection", "no idea", "level", "protect tile", "balance", "keep upright", "display bike"], "difficult_direct_answer": true, "rationales": ["The stand is keeping it up.", "This will help keep it level on the pavement since it's not level.", "It is on a board to be sure that it stands up sturdy. if not on the board the kickstand my start to slide into the cracks."], "image": "train2014/COCO_train2014_000000278347.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501864, "question_id": "amzrHBuTh7QvE8aHK47UuA", "question": "Why are horses eyes covered?", "choices": ["avoid insects", "dust protection", "sun protection", "wind protection"], "correct_choice_idx": 0, "direct_answers": ["focus", "reduce flies", "keep calm", "flies", "avoid insects", "avoid startle", "blinder", "from sun", "blinds", "steam"], "difficult_direct_answer": true, "rationales": ["The horses' eyes are covered to avoid bugs.", "Insects probably fly in their eyes all the time when they're walking or running.", "The items are blinders, meant to keep the horse calm and keep things out of their eyes."], "image": "train2014/COCO_train2014_000000501864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 146599, "question_id": "anQWPR53g6swU3mFuHp9C2", "question": "Who pays this man's salary?", "choices": ["private company", "government", "religious institution", "private individual"], "correct_choice_idx": 1, "direct_answers": ["government", "government", "taxpayers", "taxpayers", "government", "government", "tax payers", "city", "taxpayers", "city"], "difficult_direct_answer": false, "rationales": ["The man is riding a police motorcycle and is wearing a uniform, so he is a cop. police officers are not employed by private or religious organizations.", "The man is a police officer, a job usually paid for by the entity in a.", "People who are charged with keeping the peace in their communities are paid through taxes paid by those communities."], "image": "train2014/COCO_train2014_000000146599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87064, "question_id": "apyKHRA5JzTTzwdf8BP2FK", "question": "Which item here might be most likely to make someone cry?", "choices": ["carrots", "onions", "lettuce", "potatoes"], "correct_choice_idx": 1, "direct_answers": ["onion", "green onion", "onions", "onions", "onions", "onion", "onion", "onions", "onion", "onion"], "difficult_direct_answer": false, "rationales": ["The item is the onions.", "Onions make people cry.", "People usually get teared up when they are chopping up onions."], "image": "train2014/COCO_train2014_000000087064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37567, "question_id": "aqGATa97VW4cwANp7uMGJD", "question": "This slowly moving horse is doing what?", "choices": ["standing", "trotting", "springing", "sitting"], "correct_choice_idx": 1, "direct_answers": ["walking", "mosying", "walking", "walking", "joly walk", "galloping", "trotting", "food eating", "pooping", "trotting"], "difficult_direct_answer": false, "rationales": ["The horse is running.", "Trotting slowly because they don't seem to be running or moving fast.", "He's moving at a slow speed"], "image": "train2014/COCO_train2014_000000037567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494749, "question_id": "aufCYadQp5n3Wvav58dhp6", "question": "Why are the two vehicle allowed in the area that says do not enter?", "choices": ["military vehicles", "citizen vehicles", "school buses", "authorized vehicles"], "correct_choice_idx": 3, "direct_answers": ["authorized vehicles", "authorized vehicles", "buses", "special privilege", "they're authorized", "authorized", "bus", "busses only", "authorized vehicles", "authorized vehicles"], "difficult_direct_answer": false, "rationales": ["They are likely authorized to enter the facility.", "This road is a special road only for them according to the sign above the road", "The sign says that they are allowed."], "image": "train2014/COCO_train2014_000000494749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145318, "question_id": "b8YtSYk9jiiiiXtzj9x5At", "question": "What fruit used to prepare items here is darkest?", "choices": ["apricots", "bananas", "oranges", "blueberries"], "correct_choice_idx": 3, "direct_answers": ["blueberries", "orange", "orange", "blueberry", "banana", "blackberry", "raisin", "orange", "raisins", "blueberry"], "difficult_direct_answer": false, "rationales": ["It looks like blueberries in some slices of bread.", "The bread on the plate farthest has blueberries baked into it which are darker than any other food on the table.", "There are blue round items found in the bread."], "image": "val2014/COCO_val2014_000000145318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199743, "question_id": "bDp7knZJCscq8zFMsmZ4SP", "question": "What kind of transportation is shown?", "choices": ["rail", "water", "air", "road"], "correct_choice_idx": 3, "direct_answers": ["bike", "bicycle cars", "bicycle", "bike", "bicycle", "bike", "road", "bicycle", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["The transportation shown is a road where cars, bikes, and buses travel.", "The transportation shown here is a two wheeled bicycle. bicycles are accepted modes of transportation on streets and roads.", "The bike is on the road."], "image": "train2014/COCO_train2014_000000199743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55618, "question_id": "bLTQUrzWGmXTk9tJk4RBjZ", "question": "What are they doing?", "choices": ["arguing", "fighting", "buying lunch", "seeking coins"], "correct_choice_idx": 3, "direct_answers": ["wachting", "filling meter", "paying meter", "feeding meter", "paying toll", "feed meter", "paying", "seeking coins", "sharing", "paying"], "difficult_direct_answer": true, "rationales": ["The people are putting coins in the machine.", "I say they are looking for coins because they are in front of a parking meter and looking in their purse.", "They are counting coins."], "image": "train2014/COCO_train2014_000000055618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202810, "question_id": "biw6Qi6WnbAF379pn3ATjF", "question": "What price is Coca Cola Advertised at here?", "choices": ["five dollars", "dollar", "dime", "nickel"], "correct_choice_idx": 3, "direct_answers": ["five cents", "five cent", "five cents", "5 cents", "price 80", "nickel", "5 cents", "five cents", "five cents", "five cents"], "difficult_direct_answer": false, "rationales": ["Cokes used to only cost a nickel.", "The cola is priced as five cents.", "The sign shows 5 cents on the bottom."], "image": "val2014/COCO_val2014_000000202810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169100, "question_id": "c9ScMFPSQVHued2zgPB6ia", "question": "What types of sheep are these?", "choices": ["merino", "awassi", "suffolk", "dorper"], "correct_choice_idx": 0, "direct_answers": ["corriedale", "merino", "merino", "wool bearing", "adult sheep", "adults", "fluffy", "sheep", "hairy", "longwool"], "difficult_direct_answer": true, "rationales": ["Though there are many types of sheep breeds this one can be distinguished by their curly coats.", "These appear to be a based on the curling of the wool.", "These sheep are merinos. they have very warm wool."], "image": "train2014/COCO_train2014_000000169100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560713, "question_id": "cTf5Ae74X8FrzSCBAAHR54", "question": "What is the person near the boat's yellow clothing for?", "choices": ["business", "target practice", "visibility", "fashion"], "correct_choice_idx": 2, "direct_answers": ["safety", "visibility", "safety reflection", "life vest", "visibility", "worker", "safety visibility", "worker", "safety", "visibility safety"], "difficult_direct_answer": false, "rationales": ["A worker is wearing neon yellow. bright colors are used by workers to increase visibility for safety.", "The person is wearing bright neon clothes to make them stick out on a natural or dark landscape.", "If the man was not there the captain would not be able to see the side of the boat and may hit the wall when coming or leaving."], "image": "train2014/COCO_train2014_000000560713.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 528084, "question_id": "cTpVJ3boSndimqNC8uHnw5", "question": "What food group is being served?", "choices": ["vegetables", "meat", "dairy", "fruits"], "correct_choice_idx": 3, "direct_answers": ["fruits", "its fruit", "fruit", "fruit", "fruit", "friuts", "fruit", "fruit", "black bean", "fruits"], "difficult_direct_answer": false, "rationales": ["The food is a fruit.", "Oranges and blueberries are fruit.", "The items are citrus oranges."], "image": "val2014/COCO_val2014_000000528084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413172, "question_id": "cZqPkWXJ8QeHdHtPZ5NJwY", "question": "What is the first name of the President that this street is named after?", "choices": ["barack", "william", "stephen", "thomas"], "correct_choice_idx": 1, "direct_answers": ["bill", "bill", "clinton", "markham", "bill", "bill", "bill", "bill", "william", "bill"], "difficult_direct_answer": false, "rationales": ["The street sign shows a sign for president clinton avenue, named after bill clinton.", "The first name is william.", "The green sign has the name of clinton on it."], "image": "val2014/COCO_val2014_000000413172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150537, "question_id": "cenQAp6Pw6MDSYJ2j3DCSr", "question": "What is the white thing around the boy's mouth?", "choices": ["frosting", "sugar", "sand", "salt"], "correct_choice_idx": 1, "direct_answers": ["sugar", "orange peel", "sugar", "doughnut", "food", "donut", "sugar", "sugar", "powder", "sugar"], "difficult_direct_answer": false, "rationales": ["The kid is eating a sugary doughnut and it's very likely it is sugar on his face.", "The boy is eating a sweet food with a covering consistent with the size and shape of answer a and is consistent with the food and manner being eaten.", "The white thing is sugar."], "image": "train2014/COCO_train2014_000000150537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72421, "question_id": "cszhwVtRwdSrYPuizazgbA", "question": "How much was the inventor of this appliance paid for his invention?", "choices": ["$2000", "$20000", "$2", "$200"], "correct_choice_idx": 2, "direct_answers": ["two dollars", "two dollars", "two dollars", "nothing", "2 dollars", "two dollars", "$2", "2$ gratuity", "nothing", "who knows"], "difficult_direct_answer": false, "rationales": ["The inventor was paid $2.", "The man was paid two dollars for inventing it.", "(d) $20,000.00 it would cost a lot of money to complete this."], "image": "train2014/COCO_train2014_000000072421.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350361, "question_id": "d3GXwHnYMFovcTZXa8LnLU", "question": "Why are the girl's arms out?", "choices": ["to signal", "to gesture", "to wave", "to hold"], "correct_choice_idx": 3, "direct_answers": ["balance", "to hold", "catching rope", "holding on", "holding on", "water skiing", "skiing", "holding on", "water skiing", "holding rope"], "difficult_direct_answer": false, "rationales": ["There are handles attached to the ropes and she needs to stay connected to keep moving over the water", "She is holding on the the straps to the boat.", "The girl is water skiing, she is holding on to the ropes to not fall."], "image": "train2014/COCO_train2014_000000350361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247720, "question_id": "d3L4asUcygx9VtJ6BhFpWa", "question": "What is the girl holding in her hand?", "choices": ["phone", "remote", "headphones", "book"], "correct_choice_idx": 1, "direct_answers": ["remote", "wii control", "remote", "remote control", "cellphone", "wii controller", "remote", "controller", "cell phone", "wii controller"], "difficult_direct_answer": false, "rationales": ["The girl has a wii controller.", "It's a remote.", "The thing is a wii remote."], "image": "val2014/COCO_val2014_000000247720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67407, "question_id": "dCcaJRNNwZNSsvZiJ4gBn4", "question": "What does the blue P sign mean?", "choices": ["pass", "party", "polo", "park"], "correct_choice_idx": 3, "direct_answers": ["parking", "parking", "parking", "park", "park", "parking", "parking", "parking", "parking", "park"], "difficult_direct_answer": false, "rationales": ["No one puts signs on the street for parties. there is no pass sign. this sign is on a parking lot.", "It is for parking.", "The sign means there's parking."], "image": "train2014/COCO_train2014_000000067407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137826, "question_id": "dDR28sUASyvWob4QmPpJJE", "question": "Who are the passengers inside the red bus?", "choices": ["actresses", "tourists", "medical workers", "politicians"], "correct_choice_idx": 1, "direct_answers": ["tourists", "tourists", "many", "tourist", "adults", "tourist", "sightseers", "students", "tourists", "tourists"], "difficult_direct_answer": false, "rationales": ["The bust logo states it is a historic sights tour.", "The bus is advertising sight seeing tours.", "The passengers are tourists."], "image": "val2014/COCO_val2014_000000137826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114149, "question_id": "dLB3vzvtxVPKL7hnrgnRrH", "question": "What type of transportation is shown?", "choices": ["rail", "road", "water", "air"], "correct_choice_idx": 2, "direct_answers": ["boats", "boat", "boats", "water transportation", "boats", "boat", "boats", "boats", "boat", "water"], "difficult_direct_answer": false, "rationales": ["The boat uses water to travel.", "There are many boats shown that are used for transportation on the water.", "The only way to get things moved around here is by water and boat."], "image": "train2014/COCO_train2014_000000114149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405264, "question_id": "dPNqFqNqMPqhqcj9nK3apU", "question": "What is the only part of the display that is actually normal size?", "choices": ["reclining chair", "eye screw", "remote control", "book"], "correct_choice_idx": 1, "direct_answers": ["eye screw", "book", "book", "hook", "hook", "hook", "hook", "hook", "hook", "book"], "difficult_direct_answer": false, "rationales": ["A recliner alleviates stress by offering the utmost comfort and support.", "The part is the eye screw.", "The screw looks normal."], "image": "train2014/COCO_train2014_000000405264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480172, "question_id": "dQ4jzk7ppJUNbiSYT2Wn7V", "question": "Which car propels the train along the tracks?", "choices": ["back", "none", "front", "middle"], "correct_choice_idx": 2, "direct_answers": ["locomotive", "locomotive", "locomotive", "first", "front", "engine", "first train", "front", "caboose", "caboose"], "difficult_direct_answer": false, "rationales": ["The front car propels.", "The front train known as the trolly.", "The first car of the train is powered by coal and it propels the other trains."], "image": "train2014/COCO_train2014_000000480172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517321, "question_id": "dgsud8SPySYqU6JXWjzGkW", "question": "What flavor will be tasted at the top that contrasts the icing's flavor?", "choices": ["salty", "meaty", "sour", "spicy"], "correct_choice_idx": 2, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "sour"], "difficult_direct_answer": false, "rationales": ["An orange is on top of a dessert. orange tastes tart compared to icing.", "The icing would be sweet, but the slice of citrus fruit will give off a sour taste.", "Spicy because of best and best out put taste given."], "image": "train2014/COCO_train2014_000000517321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31016, "question_id": "dn6rFBy2FQpRnEpBo5jgeV", "question": "Which type of mouse is pictured?", "choices": ["animal", "gamer", "ergonomic", "wireless"], "correct_choice_idx": 3, "direct_answers": ["wireless mouse", "bleutooth", "wireless", "pointer", "wireless", "wireless", "computer", "computer", "wireless", "wireless"], "difficult_direct_answer": false, "rationales": ["There is no wire attached to the mouse.", "The is no cable connecting the mouse to anything.", "You can tell what type of mouse it is due to no wires coming from it."], "image": "val2014/COCO_val2014_000000031016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514457, "question_id": "dv5GgK92dkJV6abcXXMNJa", "question": "The title of one of the books means Many Happy Returns in what language?", "choices": ["finnish", "hungarian", "german", "polish"], "correct_choice_idx": 0, "direct_answers": ["finnish", "finnish", "indian", "english", "mandarin", "finnish", "finnish", "german", "finnish", "spanish"], "difficult_direct_answer": false, "rationales": ["If you use google finnish to english you can tell what language it is.", "It means it in finnish.", "It's finnish because the language is compatible with how they speak in finland."], "image": "train2014/COCO_train2014_000000514457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558405, "question_id": "e7i7BCk3NRoKZwYTaLEw5V", "question": "The pathway and alley here are constructed by using what?", "choices": ["dirt", "pavement", "cobbles", "brick"], "correct_choice_idx": 3, "direct_answers": ["brick", "brick", "brick", "bricks", "bricks", "brick", "bricks", "bricks", "bricks", "bricks"], "difficult_direct_answer": false, "rationales": ["The alley shown is paved with bricks.", "This is a brick walkway.", "The separate pieces laid on the ground in small rectangular shapes can be easily identified as clay bricks."], "image": "train2014/COCO_train2014_000000558405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300198, "question_id": "eHJnaZwsmmw9EovYS9udPJ", "question": "In which city is this shopping area located most probably?", "choices": ["rome", "venice", "paris", "brussels"], "correct_choice_idx": 0, "direct_answers": ["rome", "rome", "rome", "rome", "rome", "rome", "cincinnati ohio", "venice", "rome", "rome"], "difficult_direct_answer": false, "rationales": ["The city is rome.", "The signs are in the italian language.", "The writings and building style is specifically from romans."], "image": "train2014/COCO_train2014_000000300198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34487, "question_id": "eYSq62qP8wNGFyug2iK8nB", "question": "What purpose does the large disk on this person's head serve most here?", "choices": ["rain protection", "moon", "sun", "hiding"], "correct_choice_idx": 2, "direct_answers": ["shade", "block sun", "shade", "block sun", "sun", "protection", "sun protection", "cover head", "sun shade", "sun protection"], "difficult_direct_answer": false, "rationales": ["This person is wearing this in the sun, and this would not hold up in the rain. no one wears a hat to protect from the moon.", "It looks like an umbrella.", "A man is sitting and looking at a magazine on a bench. he has a hat on that protects from the heat and rays."], "image": "train2014/COCO_train2014_000000034487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301998, "question_id": "ehzxtoxZeoeMhkEnzkx33T", "question": "What's the woman standing in front of the seated woman for?", "choices": ["to fight", "to kiss", "to hug", "photo"], "correct_choice_idx": 3, "direct_answers": ["taking pictures", "photograph", "taking photo", "taking photo", "take picture", "photographer", "photo", "pictures", "photograph", "taking picture"], "difficult_direct_answer": false, "rationales": ["She wants to take a picture of the girl.", "She is holding a camera.", "She's holding a digital camera in her hands"], "image": "train2014/COCO_train2014_000000301998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392612, "question_id": "eqg8c9AgWQHMZfxsQXcPcc", "question": "What's the name of the man on the skateboard's hairstyle?", "choices": ["dreadlocks", "liberty spikes", "mohawk", "perm"], "correct_choice_idx": 0, "direct_answers": ["dreads", "dress", "dreads", "dreadlocks", "dreadlocks", "dreads", "dreadlocks", "dreadlocks", "dreadlocks", "dreadlocks"], "difficult_direct_answer": false, "rationales": ["The hairstyle is clearly visible and the unique composition is identifiable and consistent with answer a.", "He has long hair.", "It's obvious given the shape, thickness and style."], "image": "val2014/COCO_val2014_000000392612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242570, "question_id": "euKZZ99e3iXt2Uq9kkiDsU", "question": "What was the original spelling of this company's name?", "choices": ["toiota", "toyota", "toyotah", "tayota"], "correct_choice_idx": 1, "direct_answers": ["toyota", "toyoda", "toyota", "toyoda", "toyoda", "toyoda", "toyoda", "toyoda", "toyoda", "toyota"], "difficult_direct_answer": false, "rationales": ["Toyota was the original", "This is what they decided to name the company after deciding it sounded better than toyota", "That was how the company was spelled."], "image": "val2014/COCO_val2014_000000242570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569862, "question_id": "fU269FsR8sGD4uQMUfYyec", "question": "What secures this person's shoes?", "choices": ["animals", "knot", "socks", "cotton"], "correct_choice_idx": 1, "direct_answers": ["shoe strings", "knot", "laces", "laces", "laces", "shoelaces", "shoelaces", "laces", "shoelaces", "laces"], "difficult_direct_answer": false, "rationales": ["The person tied their shoelaces to prevent their shoes from coming off their feet.", "The shoes are knotted.", "The person's shoes have laces visible. laces in shows would need answer a to secure them properly."], "image": "val2014/COCO_val2014_000000569862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350463, "question_id": "fcSSkxkcRqJVexUch9p8d6", "question": "What genus does this fruit belong to?", "choices": ["musa", "malus", "citrus", "ficus"], "correct_choice_idx": 0, "direct_answers": ["musa", "banana", "musa", "musa", "musa", "musa", "tropical plantains", "banana", "musa", "banana"], "difficult_direct_answer": false, "rationales": ["The banana fruit belongs to the genus musa.", "The fruit is a banana and an internet search of the banana genus provided the answer.", "This is the science classification"], "image": "val2014/COCO_val2014_000000350463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402283, "question_id": "fgsoM6WXKKs782AM2hDTDW", "question": "What protects the observers from a stray ball?", "choices": ["cement barrier", "umbrella", "chainlink fence", "catcher"], "correct_choice_idx": 2, "direct_answers": ["fence", "fence", "fence", "fense", "fence", "baseball", "wire fence", "fence", "chainlink fence", "fence"], "difficult_direct_answer": false, "rationales": ["The people watching the baseball game are sitting behind a fence which helps protect them from flying objects.", "The fence protects observers.", "The fence there will help protect them."], "image": "val2014/COCO_val2014_000000402283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82551, "question_id": "fkcqeUQhGPgjYrFpCw6KwB", "question": "Which children's author's creation does this man show off?", "choices": ["stephen king", "sendak", "disney", "aa milne"], "correct_choice_idx": 3, "direct_answers": ["aa milne", "eeyore", "a.a. milne", "eyore", "eeyore", "eor", "pooh", "eeyore", "aa milne", "milne"], "difficult_direct_answer": false, "rationales": ["A man is wearing a shirt with a character from winnie the pooh which was written by aa milne.", "That is eeyore on his shirt and eeyore is a character in winnie the pooh by aa milne.", "This character is known as eeyore and he is from the winnie the pooh series written by this author."], "image": "val2014/COCO_val2014_000000082551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152258, "question_id": "fywmUSJJo7kka8CsUQbAJc", "question": "What stone is on the boy's black baseball cap?", "choices": ["ruby", "onyx", "diamond", "gem"], "correct_choice_idx": 2, "direct_answers": ["diamond", "diamond", "diamond", "diamond", "diamond", "diamond", "brick", "diamond", "diamond", "brick"], "difficult_direct_answer": false, "rationales": ["The shape of the stone is a diamond.", "The cap has a diamond shape with multiple faces.", "A diamond is on the hat."], "image": "val2014/COCO_val2014_000000152258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465395, "question_id": "gJnPS4CA947KiAQgkiYuPu", "question": "What is the purpose of the red cylinders on the man's head?", "choices": ["fashion", "invisibility", "noise reduction", "visibility"], "correct_choice_idx": 2, "direct_answers": ["ear protection", "noise reduction", "noise cancelation", "ear protection", "safety", "noise reduction", "ear muffs", "control traffic", "sound protection", "marking space"], "difficult_direct_answer": false, "rationales": ["A man is working on a runway near an airplane. people wear earmuffs to protect from loud sounds.", "The purpose is for noise reduction. these red cylinders block out noise.", "The purpose is noise reduction."], "image": "train2014/COCO_train2014_000000465395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107054, "question_id": "gLFCoLX9hrAxCG6oGSaG3s", "question": "What is the man near the giraffes job?", "choices": ["businessman", "chef", "janitor", "zookeeper"], "correct_choice_idx": 3, "direct_answers": ["zookeeper", "feeding", "zoo keeper", "zookeeper", "zookeeper", "zookeeper", "trainer", "zookeeper", "zoo keeper", "zookeeper"], "difficult_direct_answer": false, "rationales": ["The man is a zookeeper.", "A black man is standing in his khaki uniform. there are several giraffes leaning in close to him.", "The enclosure looks like a zoo and giraffes are typically found in zoos."], "image": "train2014/COCO_train2014_000000107054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83060, "question_id": "gMBjC3rUnVRarBAypESGEh", "question": "What kind of bag is hanging from the cupboard?", "choices": ["grocery bag", "backpack", "purse", "satchel"], "correct_choice_idx": 0, "direct_answers": ["whole foods", "shopping bag", "tote bag", "cloth", "shopping", "grocery bag", "handbag", "whole food", "whole foods", "whole foods"], "difficult_direct_answer": false, "rationales": ["The bag is clearly visible and is of a size, shape and material consistent with answer a.", "The bag is in a kitchen where it is used to transport food to.", "The bag is for groceries."], "image": "train2014/COCO_train2014_000000083060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11802, "question_id": "gXRQEuUGKPT98PDXDbhiEo", "question": "How is the young boy's green shirt done up?", "choices": ["buckles", "buttons", "laces", "zippers"], "correct_choice_idx": 1, "direct_answers": ["neatly", "buttons", "plaid", "buttons", "buttons", "plaid fannell", "checked", "buttons", "buttoned up", "plaid"], "difficult_direct_answer": false, "rationales": ["It is a shirt that buttons up the front.", "The shirt has buttons.", "Most shirts are fastened by buttons, you can see the white on the buttons that show through the button holes."], "image": "train2014/COCO_train2014_000000011802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459018, "question_id": "gnDTT3nFXCK74kverEN32E", "question": "Who's washroom is this?", "choices": ["men's", "cats", "women's", "dogs"], "correct_choice_idx": 0, "direct_answers": ["office", "hotel", "adrian patio", "mens", "men's", "adrian patio's", "mens", "mens", "mens", "mens"], "difficult_direct_answer": false, "rationales": ["A bathroom has several urinals on the wall. men use urinals.", "I am a man and i know that only men's bathroom's have urinals.", "There are urinals in the bathroom. women do not use urinals."], "image": "train2014/COCO_train2014_000000459018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396858, "question_id": "hHzr4v4anBpxCUVwrH9Eek", "question": "Which is widely used in many cuisines as a spice to add pungent 'heat' to dishes?", "choices": ["cucumber", "capsicum", "melon", "chilies"], "correct_choice_idx": 3, "direct_answers": ["pepper", "pepper", "chilies", "peppers", "peppers", "peppers", "pepper", "peppers", "red chili", "peppers"], "difficult_direct_answer": false, "rationales": ["They are spicy and you can see the actual vegetable and it's seeds being cut.", "The chilies are used for spice.", "Chiles are used to add spice and heat to any dish. it's used in a lot of cuisines to enhance the flavor."], "image": "train2014/COCO_train2014_000000396858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562241, "question_id": "hbUWgv5KTYfD3HwqiBFP68", "question": "What's probably casting the nearby shadow?", "choices": ["palm tree", "cameraman", "dog", "traffic cone"], "correct_choice_idx": 1, "direct_answers": ["photographer", "person", "photographer", "cameraman", "person", "photographer", "person", "person", "person", "photographer"], "difficult_direct_answer": false, "rationales": ["In the lower left section of the picture is a shadow of a person.", "A shadow of a person can be seen next to a person skiing in the snow.", "One man shoot the video."], "image": "val2014/COCO_val2014_000000562241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161517, "question_id": "hnC6GFskhxHL4UQyyW7vZu", "question": "What is this man's profession?", "choices": ["singer", "doctor", "minister", "jockey"], "correct_choice_idx": 3, "direct_answers": ["horse jockey", "jockey", "jockey", "horse ride", "horse riding", "jockey", "jockey", "jockey", "equestrian", "jockey"], "difficult_direct_answer": false, "rationales": ["This man is riding a horse.", "The man is a jockey.", "He's riding a horse over jumps"], "image": "val2014/COCO_val2014_000000161517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258132, "question_id": "hx5vL5kwEZWtRgtXPbk2qt", "question": "Why is the man's hat orange in color?", "choices": ["dress code", "visibility", "fashion", "camouflage"], "correct_choice_idx": 2, "direct_answers": ["fashion", "visibility", "to see", "just because", "dye", "likes it", "personal style", "attention", "volcom sponsor", "dyed"], "difficult_direct_answer": true, "rationales": ["The man is fashionable.", "It should be just fashion, since dress code would be apply without any wording. for visibility will not interfere with the skater's view, while there are nothing there necessary to camouflage from. therefore i conclude it is just for fashion only.", "He wants to stand out."], "image": "val2014/COCO_val2014_000000258132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231748, "question_id": "hywdUsBQQrjr6onQsjinSu", "question": "What is the man wearing on his wrists?", "choices": ["bracelet", "tape", "watch", "wrist band"], "correct_choice_idx": 3, "direct_answers": ["sweatbands", "wrist band", "armband", "sweatbands", "wristband", "sweat bands", "wrist band", "sweatbands", "sweatbands", "wristband"], "difficult_direct_answer": false, "rationales": ["A sports woman who has a sports woman wearing a band in playing of tennis/.", "He has on wrist bands.", "These circles of cloth are usually worn by tennis players to wipe the sweat from their brows."], "image": "train2014/COCO_train2014_000000231748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574692, "question_id": "i9vsXE93eajT3RRmuMHbYB", "question": "What are they waiting in line for?", "choices": ["buy juice", "pick oranges", "exit", "sell juice"], "correct_choice_idx": 0, "direct_answers": ["oranges", "orange juice", "purchasing", "buy juice", "fruit", "orange juice", "orange juice", "oranges", "buy fruit", "shop"], "difficult_direct_answer": false, "rationales": ["(b) they are waiting in line to pick the oranges they would like to take home.", "The sign on the left indicates that an orange-based liquid is being sold.", "They sell juice."], "image": "val2014/COCO_val2014_000000574692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355611, "question_id": "iHHbPuZwcwjL4v5yXrVZnd", "question": "What is the name of the part of the motorcycle directly in front of the rider's face?", "choices": ["bugcatcher", "buddy pegs", "windscreen", "deflector shield"], "correct_choice_idx": 2, "direct_answers": ["shield", "windshield", "unknown", "face shield", "windshield", "windshield", "windscreen", "windshield", "windshield", "windshield"], "difficult_direct_answer": false, "rationales": ["Driving mode with deflector shield.", "A clear windshield is in front of a person riding a motorcycle.", "The motorcycle part is a windscreen."], "image": "train2014/COCO_train2014_000000355611.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251531, "question_id": "itMXpdjZZ8Fz543UuMWHyZ", "question": "How is this steak cooked?", "choices": ["rare", "medium", "well done", "blue rare"], "correct_choice_idx": 1, "direct_answers": ["heavy roasted", "medium", "fried", "medium rare", "medium", "medium rare", "medium rare", "med rare", "rare", "medium"], "difficult_direct_answer": false, "rationales": ["There is some pink showing.", "It is really pink so it's rare.", "Option a is chosen for the display shown."], "image": "val2014/COCO_val2014_000000251531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122597, "question_id": "j8stZb6vSGFKRF5cCgefzz", "question": "What are the teddy bears arranged to look like they're doing?", "choices": ["hunting", "fighting", "cooking", "sun bathing"], "correct_choice_idx": 3, "direct_answers": ["sunbathing", "sunbathing", "smileing sleeping", "beach lounging", "at beach", "sleeping", "sunbathing", "sleeping", "sun bathing", "watching"], "difficult_direct_answer": false, "rationales": ["The teddy bears are all laying out and have sunglasses and swimsuits on which is what you wear when you are out sun bathing.", "The bears are sunbathing.", "The teddy bears all have sunglasses and are lounging."], "image": "train2014/COCO_train2014_000000122597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180964, "question_id": "jHWJxQcroTUtNGamwkL44X", "question": "What's happening with these toilets?", "choices": ["discarding them", "cleaning them", "selling them", "displaying them"], "correct_choice_idx": 3, "direct_answers": ["repairs", "on display", "fixed", "displaying them", "thrown out", "given away", "air out", "construction", "closed lids", "thrown away"], "difficult_direct_answer": true, "rationales": ["This is the most likely reason. that said, it could also be any of the other answers.it's impossible to tell from the image.", "Option a is my pick for the view on display.", "This row of toilets all have their tank tops off and none are attached to a sewage system or pipes. these toilets are not in operation and seem to be part of some art installation."], "image": "train2014/COCO_train2014_000000180964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192266, "question_id": "jqJodm8Hv7jazyzYRWQrr8", "question": "What material is the sink made of?", "choices": ["plastic", "stainless steel", "porcelain", "wood"], "correct_choice_idx": 1, "direct_answers": ["steel", "stainless steel", "metal", "ceramic", "metal", "steel", "stainless steel", "plastic", "stainless steel", "steel"], "difficult_direct_answer": false, "rationales": ["Sinks are made of material that won't easily rust due to water damage.", "The sink looks neat clean still due to its stainless ability.", "The sink is silver in color and most sinks that are silver are made of stainless steel."], "image": "train2014/COCO_train2014_000000192266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285120, "question_id": "ju7DZusBf9Jadh5wBnV3e9", "question": "What material is beneath the person's feet here?", "choices": ["clay", "mud", "tarmac", "snow"], "correct_choice_idx": 2, "direct_answers": ["asphalt", "tarmac", "asphalt", "asphalt", "asphalt", "asphalt", "tar", "cement", "tarmac", "pavement"], "difficult_direct_answer": false, "rationales": ["The person is near a parked airplane. airplanes do not park on clay, snow, or mud.", "The person is walking on a surface where thereis a plane is parked. planes are usually parked on tarmac.", "The material is tarmac."], "image": "val2014/COCO_val2014_000000285120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411492, "question_id": "jxcvXQZvERs9mbxGuGnhvT", "question": "What is the man doing in the venue?", "choices": ["reading", "shopping", "sleeping", "repairing clocks"], "correct_choice_idx": 3, "direct_answers": ["repairing", "watch mechanical", "fixing clocks", "clock repair", "clock fixing", "selling clocks", "clock work", "sitting", "working", "repairing clocks"], "difficult_direct_answer": true, "rationales": ["It is obvious this is a repair shop, because of all the clocks on the wall.", "The man near the timekeeping devices is awake. he is working, not reading or shopping.", "You can tell by what he is working on as to what his profession is."], "image": "train2014/COCO_train2014_000000411492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445785, "question_id": "k5ZzdR6mYVQaCVxoKf6gx5", "question": "What are the men doing with the food?", "choices": ["cooking it", "eating it", "trashing it", "baking it"], "correct_choice_idx": 1, "direct_answers": ["displaying", "showing", "posing", "taking pictures", "eating", "eating", "presenting", "eating", "eating it", "holding"], "difficult_direct_answer": false, "rationales": ["The men eat.", "They're both seated at tables in restaurants.", "The men are eating."], "image": "train2014/COCO_train2014_000000445785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187857, "question_id": "k7m4bTZdQjxn4tMjV3MRNG", "question": "In what type event does the Motorcyclist drive?", "choices": ["convoy", "emergency call", "parade", "regatta"], "correct_choice_idx": 2, "direct_answers": ["parade", "parade", "parade", "parade", "parade", "rally", "bike", "saint patricks", "parade", "parade"], "difficult_direct_answer": false, "rationales": ["The event is a parade.", "Motorcycles often are driven as participants in a parade. this motorcycle is a high-end harley davidson which often appear in parades.", "The event is a parade."], "image": "val2014/COCO_val2014_000000187857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70363, "question_id": "kGU2oxXa6npUxT3A26NADu", "question": "What are the small grey objects in between the rails?", "choices": ["caps", "balls", "chips", "stones"], "correct_choice_idx": 3, "direct_answers": ["dividers", "rail tracks", "stones", "tracks", "stones", "rock fragments", "tracks", "doors", "sleepers", "spacers"], "difficult_direct_answer": false, "rationales": ["The objects are stones.", "These are stones.", "The small gray objects are small stones that have fallen between the tracks. the small stones are not dangerous to the train."], "image": "train2014/COCO_train2014_000000070363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472900, "question_id": "kcr4iHoijPXkBPCoSeeEhZ", "question": "What does the printing on the mat indicate?", "choices": ["wine varieties", "names", "people locations", "destination"], "correct_choice_idx": 0, "direct_answers": ["wine types", "wine varieties", "wine type", "wine types", "wine names", "wine varieties", "drink type", "drinking", "wine type", "wine flavor"], "difficult_direct_answer": false, "rationales": ["There are different glasses of wine on the respective places of the mat.", "A placemat under wine glasses lists different kinds of wines.", "There are names of types of wine on the mat."], "image": "train2014/COCO_train2014_000000472900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430251, "question_id": "m2RHzznodadXL9Nrt67rMN", "question": "Why do the animals have their heads to the ground?", "choices": ["to play", "to charge", "to eat", "to rest"], "correct_choice_idx": 2, "direct_answers": ["grazing", "eating grass", "eating", "grazing", "grazing", "to eat", "sheep", "grazing", "feeding", "grazing"], "difficult_direct_answer": false, "rationales": ["They are sheet and are eating grass.", "Sheep are grazing in a field. sheep put their heads down to graze.", "They sheep are herbivores and they consume grass."], "image": "train2014/COCO_train2014_000000430251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398473, "question_id": "m5YRLGt57hbaiSqaZ47WaA", "question": "What time of day is it here?", "choices": ["evening", "midday", "night", "dawn"], "correct_choice_idx": 1, "direct_answers": ["day time", "daytime", "midday", "morning", "day", "noon", "noon", "afternoon", "10 am", "midday"], "difficult_direct_answer": false, "rationales": ["It is sunny", "The time is midday.", "The sun appears to still be up and it's not dark. which likely means it's midday."], "image": "train2014/COCO_train2014_000000398473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11305, "question_id": "mncKLvMAJgrXRdNYRUqA7v", "question": "What country's dining is being emulated?", "choices": ["canada", "japan", "russia", "mexico"], "correct_choice_idx": 1, "direct_answers": ["japan", "japan", "japan", "japan", "japan", "america", "india", "japanese", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["The country uses floor tables like this one, plus the country's language is visible on artwork on the wall.", "There are cushions laid all around the outside of a table. the table is on the floor with no legs.", "I know from movies and books that they eat in this style, on the floor surrounding a table."], "image": "val2014/COCO_val2014_000000011305.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238799, "question_id": "nDePNgauG2Skbj6EYLcnPM", "question": "Why are the underpants hanging there?", "choices": ["on display", "closet full", "to dry", "for decoration"], "correct_choice_idx": 2, "direct_answers": ["to dry", "drying", "drying", "to dry", "to dry", "drying", "drying", "to dry", "drying", "to dry"], "difficult_direct_answer": false, "rationales": ["They are wet", "Underpants hanging there because it have drying purpose.", "The underpants are near a window, allowing warm sunlight to hit them."], "image": "val2014/COCO_val2014_000000238799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342765, "question_id": "nVqXaDgKdG4UHvtSnLFXsi", "question": "What do these people mostly consume?", "choices": ["candy", "steak", "pizza", "alcohol"], "correct_choice_idx": 3, "direct_answers": ["food", "beer", "alcohol", "juice", "beer", "drinks", "beer", "beer", "food", "beer"], "difficult_direct_answer": false, "rationales": ["A because i see two cases of beer in the refrigerator.", "There is only alcohol.", "Judging by two the two 12-packs visible, it would appear that beer is a popular item in this home. fermented drinks were first seen in the world about 12,000 years ago."], "image": "val2014/COCO_val2014_000000342765.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326207, "question_id": "nY5epB8Stu3vVbjFHEsnKJ", "question": "The helmet on top of the motorcycle's handlebars is painted to resemble what?", "choices": ["bowling ball", "ping-pong ball", "tennis ball", "billiard ball"], "correct_choice_idx": 3, "direct_answers": ["magic eight", "eight ball", "billiard ball", "eight ball", "eight ball", "billiard ball", "billiard ball", "pool ball", "cue ball", "pool ball"], "difficult_direct_answer": false, "rationales": ["The helmet is painted to resemble an eight ball.", "The is a number on the helmet.", "A black helmet has a number in a circle in the center."], "image": "train2014/COCO_train2014_000000326207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21183, "question_id": "ne98xZ76YvmEpoFF3tUTkW", "question": "What book is the man's sign referencing?", "choices": ["dictionary", "thesaurus", "bible", "encyclopedia"], "correct_choice_idx": 2, "direct_answers": ["bible", "bible", "bible", "bible", "bible", "bible", "bible", "jesus returning", "bible", "bible"], "difficult_direct_answer": false, "rationales": ["His sign is talking about jesus.", "The books and verses seen on the sign are all from the bible, the dictionary, encyclopedia and thesaurus might reference the bible but the bible is where these verses originate.", "The christian one specifically."], "image": "train2014/COCO_train2014_000000021183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254816, "question_id": "oLsShhMo7MGV7XYrJA79Ys", "question": "Why are the fences lower than the giraffe's neck?", "choices": ["allow visibility", "avoid entanglement", "stop wildlife", "filter light"], "correct_choice_idx": 1, "direct_answers": ["reach trees", "to eat", "to eat", "avoid entanglement", "giraffe reach", "can't jump", "electric", "tall", "because eat", "cost"], "difficult_direct_answer": true, "rationales": ["Option a chosen for the display shown.", "The fences don't get tangled.", "The giraffe cannot get over the fence even though it is lower."], "image": "val2014/COCO_val2014_000000254816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466736, "question_id": "22AKti5A6oAjBuq3KgGLqC", "question": "Starting from the right which bunch of bananas will go bad first?", "choices": ["second", "fifth", "first", "fourth"], "correct_choice_idx": 0, "direct_answers": ["third", "right", "right", "banana", "second", "fourth bunch", "second", "second", "second bunch", "second"], "difficult_direct_answer": false, "rationales": ["The second bunch of bananas from the right is turning black and going to spoil first.", "Several bunches of bananas are hanging next to each other.", "The second furthest bunch of bananas here pictured have the most black marks on them which is evidence of them being older than the other bunches."], "image": "val2014/COCO_val2014_000000466736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339368, "question_id": "22fwApYBNzv4mz5J5S9xvk", "question": "Which vehicle uses the most fuel to get around?", "choices": ["van", "yellow car", "brown car", "green bus"], "correct_choice_idx": 3, "direct_answers": ["bus", "bus", "bus", "bus", "green bus", "bus", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["The green bus uses the most.", "The green bus uses a lot of fuel to get around.", "The green bus requires an enormous amount of gas to run."], "image": "val2014/COCO_val2014_000000339368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303204, "question_id": "23FkCrTntqQVBzCfg9nt2S", "question": "How many people make up this family?", "choices": ["three", "five", "seven", "eight"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "two", "three", "three", "three", "three", "three", "three", "two"], "difficult_direct_answer": false, "rationales": ["There are three plates on the table for that amount of people.", "A there are three servings on the table.", "The kid has one plate. there are two additional plates at the table."], "image": "val2014/COCO_val2014_000000303204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282009, "question_id": "24KVT9j35GJPKFP5kwauCA", "question": "What is most closely related to the smaller animals here?", "choices": ["salamander", "echidna", "mouse", "cassowary"], "correct_choice_idx": 3, "direct_answers": ["emus", "bird", "birds", "turkeys", "cassowary", "bird", "birds", "bird", "birds", "bird"], "difficult_direct_answer": false, "rationales": ["The smaller animals are ostriches that could be related to other animals like the cassowary.", "The cassowary is related to the ostrich because they are both birds.", "Ostriches are walking in a grassy area in front of giraffes."], "image": "train2014/COCO_train2014_000000282009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129141, "question_id": "256ohGZHixSeMS2wmzTiqb", "question": "What color are the stripes on the top of the fishing boat?", "choices": ["red", "green", "blue", "yellow"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "blue", "black", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The stripes are blue.", "They are a similar color to the sky", "The stripes are blue."], "image": "train2014/COCO_train2014_000000129141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391492, "question_id": "265rtv5nUFijDQqAT9aiyv", "question": "What is an ingredient in this dish?", "choices": ["watermelon", "lime", "tomato", "cheese"], "correct_choice_idx": 3, "direct_answers": ["salad", "cheese", "spinach", "cheese", "shrimp", "spinach", "brocolli", "broccoli", "greens", "cheese"], "difficult_direct_answer": false, "rationales": ["Macaroni is often paired with a melted dairy item.", "The other options aren't in this scene.", "Cheese is the main ingredient in mac and cheese."], "image": "train2014/COCO_train2014_000000391492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555953, "question_id": "26MhXjaC95UAhs5g6ymHq2", "question": "A popular brand of what mode of transportation is advertised at the airfield?", "choices": ["rockets", "cars", "boats", "aircraft"], "correct_choice_idx": 1, "direct_answers": ["toyota", "toyota", "toyota", "cars", "toyota", "auto", "automobile", "car", "car", "toyota"], "difficult_direct_answer": false, "rationales": ["Toyota is a japanese automobile company.", "The sign in front of the runway says toyota which is a popular brand of cars.", "The large red \"toyota\" sign is that of the japanese car company, and is known world-wide."], "image": "val2014/COCO_val2014_000000555953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3832, "question_id": "27GFmE5zsLiJBYkJeFdGXT", "question": "What color is the carrier case for the sail of the left sailboat?", "choices": ["purple", "red", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "black", "blue", "blue", "blue", "blue", "white", "brown white", "blue"], "difficult_direct_answer": false, "rationales": ["It is blue in colour.", "The carrier is blue.", "The case is the same color as the water."], "image": "val2014/COCO_val2014_000000003832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177289, "question_id": "28LcENbeZbZvC5BMYKqA8y", "question": "What is the area with the microwave called?", "choices": ["dorm", "kitchenette", "bathroom", "kitchen"], "correct_choice_idx": 1, "direct_answers": ["kitchen", "kitchenette", "kitchenette", "kitchen", "living room", "kitchen area", "kitchen", "kitchenette", "kitchen", "counter"], "difficult_direct_answer": false, "rationales": ["Since there's no large refrigerator or stove, it wouldn't be c. the other options don't match.", "The area is a kitchenette.", "Its used to access kitchen accessories and make food."], "image": "train2014/COCO_train2014_000000177289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380248, "question_id": "29oSPRjzsc9QqPxFC68fQv", "question": "How many lamps are placed in the corners of the bookshelf behind the red couch?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is one on each side.", "There is one lamp on the left. an additional lamp is on the right.", "There are two lamps on each side of the room with the bookshelf."], "image": "train2014/COCO_train2014_000000380248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469424, "question_id": "2AX43rGoKKAM3y2SDYq6hc", "question": "What word describes this animal?", "choices": ["canine", "bovine", "equine", "feline"], "correct_choice_idx": 3, "direct_answers": ["cat", "orange cat", "cat", "feral", "cat", "lazy", "cat", "cat", "cat", "feline"], "difficult_direct_answer": false, "rationales": ["This is a feline cat.", "The animal is a cat.", "The cat is a feline."], "image": "val2014/COCO_val2014_000000469424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240111, "question_id": "2BUFomiBibKPgjPgEEaUA6", "question": "What is near the toilet?", "choices": ["cat", "window", "egg", "dog"], "correct_choice_idx": 1, "direct_answers": ["rack", "counter", "bath tub", "shelf", "bathtub", "window", "tub", "bath tub", "window", "towel"], "difficult_direct_answer": false, "rationales": ["There is a square area with a shade and light coming through which is typical of this structure.", "The toilet is on the wall near the window.", "There is a window above the toilet."], "image": "val2014/COCO_val2014_000000240111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72424, "question_id": "2C6PWzws7YRXQKWGYA9RQ6", "question": "What part of the man is closest to the horse?", "choices": ["hand", "nose", "elbow", "leg"], "correct_choice_idx": 0, "direct_answers": ["hand", "hand", "hand", "hand", "hand", "hand", "left arm", "hand", "hand", "hand"], "difficult_direct_answer": false, "rationales": ["The man is petting the horse on its nose.", "The man has his hand nearby.", "The man is putting his hand underneath the horse's mouth."], "image": "train2014/COCO_train2014_000000072424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506919, "question_id": "2CVWVo8gZwUmJFsqt7BWKJ", "question": "Who owns the vehicle on the left?", "choices": ["lufthansa", "delta", "virgin atlantic", "united airlines"], "correct_choice_idx": 2, "direct_answers": ["virgin", "virgin", "virgin", "virgin", "virgin", "no", "virgin", "virgin atlantic", "richard branson", "virgin"], "difficult_direct_answer": false, "rationales": ["Virgin atlantic is on the left.", "A red and white log is on the tail of an airplane.", "The airline's name is on the tail of the airplane."], "image": "train2014/COCO_train2014_000000506919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 172658, "question_id": "2CkMBQtTbdusxUh77tUNYk", "question": "What kind of vegetable is in the middle to the right of the green onion and having a bulbous red root?", "choices": ["carrot", "potato", "lettuce", "radish"], "correct_choice_idx": 3, "direct_answers": ["radish", "beet", "carrot", "beetroot", "radish", "beetroot", "beets", "radish", "beet", "beet"], "difficult_direct_answer": false, "rationales": ["Lettuce is not a root vegetable. carrots are orange, and potatoes are brown.", "A radish is the vegetable.", "The description of the features and location provided in the question corresponds to answer a."], "image": "val2014/COCO_val2014_000000172658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141741, "question_id": "2CtyzxnMcd3Kdij8zVH7qb", "question": "What type of fruit is at the front of this fruit basket ahead of all of the oranges?", "choices": ["banana", "apple", "pineapple", "pear"], "correct_choice_idx": 1, "direct_answers": ["apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["The fruit is an apple.", "It is identifiable by its round shape and red and green color. it has a smooth shiny surface.", "It's round and red and green"], "image": "train2014/COCO_train2014_000000141741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147027, "question_id": "2CuyHGT4hfkzM9ToNEnmZb", "question": "What animal is scurrying towards the right?", "choices": ["antelope", "badger", "cow", "monkey"], "correct_choice_idx": 3, "direct_answers": ["monkey", "monkey", "monkey", "monkey", "giraffe", "monkey", "monkey", "monkey", "giraffe", "monkey"], "difficult_direct_answer": false, "rationales": ["The animal is significantly smaller than the giraffes. it is a primate, not an antelope, cow, or badger.", "The giraffes are running towards a small monkey that is running across the field.", "There is an animal who is scurrying away from the right."], "image": "val2014/COCO_val2014_000000147027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452014, "question_id": "2Dm49VGaHLKfzqgqNry9NA", "question": "What is on the fruit?", "choices": ["ants", "flies", "writing", "mold"], "correct_choice_idx": 2, "direct_answers": ["letters/happy-face", "hi", "words", "writing", "banana", "words", "words", "writing", "spots writing", "smile"], "difficult_direct_answer": false, "rationales": ["Someone scratched letters into the banana peel and as it browns so did the letters.", "Someone has used a knife to write on the banana. as the banana ripens the score from the knife turned brown showing the writing.", "You can see the letters carved into the banana. the surface of the banana is soft and can be carved."], "image": "train2014/COCO_train2014_000000452014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354098, "question_id": "2EKYWWM4rqEQpBEQvUiYDV", "question": "What color is the shampoo bottle sitting inside of the shower floor?", "choices": ["purple", "black", "blue", "green"], "correct_choice_idx": 0, "direct_answers": ["white", "brown", "purple", "blue", "purple", "purple", "purple", "black", "purple", "purple"], "difficult_direct_answer": false, "rationales": ["The location of the object is given in the text of the question and color is clearly visible and identifiable.", "The bottle in the bathtub partially visible on the left side of the image is mostly purple in color.", "The shampoo bottle is not blue, green, or black."], "image": "val2014/COCO_val2014_000000354098.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362247, "question_id": "2EjxjAYZV7kU7mK5vaV3Jt", "question": "What is in front of the zebra?", "choices": ["dog", "cat", "cow", "giraffe"], "correct_choice_idx": 3, "direct_answers": ["giraffe", "giraffes", "giraffe", "giraffes", "two giraffes", "giraffe", "giraffe", "giraffes", "giraffe", "giraffes"], "difficult_direct_answer": false, "rationales": ["There is a giraffe at the front.", "There are two giraffes standing in front of the zebra.", "There are two giraffes up close and personal and the one on the right has its mouth open."], "image": "train2014/COCO_train2014_000000362247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212023, "question_id": "2FBkRFnQEC8Pj5XGKaUEVd", "question": "What civilization does the artwork on this vase depict?", "choices": ["phoenician", "egyptian", "roman", "french"], "correct_choice_idx": 2, "direct_answers": ["old", "persian", "horse ridding", "roman", "arabic", "greece", "egyptian", "horse", "roman", "art"], "difficult_direct_answer": true, "rationales": ["The civilization is roman.", "A vase with handles has pictures of horses with long snouts and tall, dark skinned people on it.", "The vase is from roman times."], "image": "train2014/COCO_train2014_000000212023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310558, "question_id": "2G76a2zLBkQii5WqozjfxM", "question": "What does the man in the green shirt likely want to do?", "choices": ["play games", "drink", "sleep", "smoke"], "correct_choice_idx": 3, "direct_answers": ["play", "smoke", "go home", "smoke", "sit", "smoke", "teach", "smoke", "sit", "smoke"], "difficult_direct_answer": false, "rationales": ["The man is holding a pack of cigarettes in his hand.", "He has a pack of cigarettes", "The man in the green shirt is holding a package of cigarettes."], "image": "train2014/COCO_train2014_000000310558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248403, "question_id": "2GCD8UnZQNyM4r5d4SZudJ", "question": "What is in front of the woman?", "choices": ["cat", "dog", "food", "baby"], "correct_choice_idx": 2, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "food", "food", "pizza", "pizza", "food"], "difficult_direct_answer": false, "rationales": ["There is food by the woman.", "The item is on a plate so she can eat it.", "There are plates of food."], "image": "train2014/COCO_train2014_000000248403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436056, "question_id": "2GheFtUzcRxAGtxwXQLPFg", "question": "What color are the nike emblems on the side of this skater's shoes?", "choices": ["white", "black", "yellow", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "black", "red", "red", "red", "orange", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The skater is wearing black sneakers with red nike logos on the sides.", "The color is red.", "The nike logo is in a blood red color."], "image": "train2014/COCO_train2014_000000436056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153134, "question_id": "2HKvMUMaCjsaD6qrHGSXgB", "question": "How many oxen are pulling the log down the hill?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two oxen", "three", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two oxen are pulling.", "There are two oxen pulling.", "There is a brown ox and a black ox that are strapped together by their horns and are pulling the log on a rope behind them."], "image": "train2014/COCO_train2014_000000153134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72216, "question_id": "2HondRA8tdFu94LtudGjUY", "question": "What kind of work are the elephants used for?", "choices": ["construction", "transportation", "farming", "racing"], "correct_choice_idx": 1, "direct_answers": ["tourism", "clearing grass", "eating grass", "rides", "rides", "entertainment", "gathering", "riding", "riding", "transportation"], "difficult_direct_answer": false, "rationales": ["The elephants are for transportation.", "They can also be used in b and c, but they're most commonly used as a.", "The elephants are large animals. they are used for moving things."], "image": "train2014/COCO_train2014_000000072216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260141, "question_id": "2Hx4KWh4vhAwhVooxaVGpJ", "question": "What kind of structure is in the background above all of the boats?", "choices": ["bridge", "wall", "castle", "aqueduct"], "correct_choice_idx": 3, "direct_answers": ["bridge", "bridge", "bridge", "bridge", "aqueduct", "bridge", "bridge", "bridge", "bridge", "bridge"], "difficult_direct_answer": false, "rationales": ["There is a big aqueduct in the background of the boats.", "The structure is an aqueduct.", "It might also be referred to as a b depending on local customs and if the top is in use for pedestrian or vehicle traffic."], "image": "val2014/COCO_val2014_000000260141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488964, "question_id": "2KxxhsU3dgNrjFLYQqANzb", "question": "What is near the typing person?", "choices": ["dog", "elephant", "pumpkin pie", "cat"], "correct_choice_idx": 3, "direct_answers": ["cat", "cat", "cat", "cat", "cat", "computer", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["There is a cat near the person who is typing.", "A domestic animal with pointy ears is sitting next to a person on a computer.", "The car is nearby."], "image": "train2014/COCO_train2014_000000488964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289486, "question_id": "2LDiWjXN5soeMRV23UEP8g", "question": "How many little giraffes are traveling with this small group?", "choices": ["five", "seven", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "five", "five", "two", "two", "two", "two", "five", "two"], "difficult_direct_answer": false, "rationales": ["(a) two. there might be five giraffes there, but two are much smaller than the rest.", "There are two little ones.", "There are two babies side by side. they are among other adult giraffes."], "image": "val2014/COCO_val2014_000000289486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510616, "question_id": "2Lv9dhTohctZ9WQdaLiFJH", "question": "What video game console is being played by the two people in front?", "choices": ["ps5", "nintendo wii", "steambox", "xbox"], "correct_choice_idx": 1, "direct_answers": ["mortal kombat", "wii", "wii", "nintendo wii", "wii", "wii", "nintendo wii", "famous", "car", "famous"], "difficult_direct_answer": false, "rationales": ["Two people stand in front of a television with white controllers in both hands.", "The game is the wii.", "They are holding a remote in one hand and a wand in another. it is made by a japanese company."], "image": "val2014/COCO_val2014_000000510616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113369, "question_id": "2M2Wcvix4KhKMdv3g6UEjh", "question": "Where is this person eating pizza?", "choices": ["home", "restaurant", "office", "parents house"], "correct_choice_idx": 1, "direct_answers": ["restaurant", "birdie's pizza", "birdie's pizza", "restaurant", "pizza", "restaurant", "birdie's", "restaurant", "diner", "birdie's"], "difficult_direct_answer": false, "rationales": ["There is a menu next to the pizza. the menu has the business name on it.", "The person is eating at a restaurant.", "It is on a plate."], "image": "val2014/COCO_val2014_000000113369.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329895, "question_id": "2M3zNopbXJM99LzXKqcEee", "question": "What color are the leaves where the banana bunch is resting upon?", "choices": ["brown", "yellow", "white", "green"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "brown", "green", "green", "green", "brown", "brown", "green", "brown"], "difficult_direct_answer": false, "rationales": ["The leaves are starting to turn brown.", "The leaves are brown.", "They are dried up banana leaves that have withered."], "image": "train2014/COCO_train2014_000000329895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548291, "question_id": "2MDjQJKr5pZUZNfZCRAmKv", "question": "What is next to the birds?", "choices": ["zebra", "beach ball", "baby shark", "giraffe"], "correct_choice_idx": 3, "direct_answers": ["fence", "zebra", "giraffe", "giraffe", "giraffe", "giraffe", "fence", "giraffe", "giraffe", "giraffe"], "difficult_direct_answer": false, "rationales": ["The birds are by a giraffe.", "The tall animal is standing near the ostriches.", "The long necked animal which is not an ostrich in this picture is a giraffe."], "image": "train2014/COCO_train2014_000000548291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21126, "question_id": "2Mg3FaxkmhYkJKC89nPTE8", "question": "What style of hair does the man behind the woman have on?", "choices": ["mullet", "bald", "bowlcut", "afro"], "correct_choice_idx": 2, "direct_answers": ["bowlcut", "shaggy", "short", "medium length", "hold", "bowl cut", "short", "bob", "bowl cut", "bowl cut"], "difficult_direct_answer": false, "rationales": ["The man standing behind the woman is wearing his hair in a bowl cut.", "A man is sitting and helping a girl with her dress. he has cut hair that goes all the way around same length.", "The style is a bowlcut."], "image": "train2014/COCO_train2014_000000021126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85543, "question_id": "2NUoNqAfphRaoU7MtNr4dw", "question": "How many different flavors?", "choices": ["three", "two", "five", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "six", "four", "four", "four", "four", "three", "six", "four", "two"], "difficult_direct_answer": false, "rationales": ["There are krullers with icing, a chocolate covered donut, a filled donut and a donut with pink icing.", "There are four flavors.", "The box has four different donut flavors inside."], "image": "val2014/COCO_val2014_000000085543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187411, "question_id": "2PHQQ9EfwZCYkvu7sZrzUC", "question": "What are all of the little sheep gathered around?", "choices": ["wheat", "bed", "salt", "dirt"], "correct_choice_idx": 0, "direct_answers": ["hay", "food", "hay", "wheat", "hay", "grass", "foods", "hay", "eat", "hay"], "difficult_direct_answer": false, "rationales": ["The sheep are around wheat.", "They are eating.", "The sheep are near a wheat hay."], "image": "val2014/COCO_val2014_000000187411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300021, "question_id": "2QvyBzt54yAJUq5N8mdP9U", "question": "What is on the pizza?", "choices": ["anchovies", "pepperoni", "sausages", "chicken cutlets"], "correct_choice_idx": 1, "direct_answers": ["cheese", "there", "pepperoni", "peperoni", "pepperoni", "car", "pepperoni", "pepperoni", "plate", "table"], "difficult_direct_answer": false, "rationales": ["There are red round circles of meat", "The pizza is well cooked with the pepperoni.", "It is a round meat."], "image": "train2014/COCO_train2014_000000300021.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400343, "question_id": "2SECGtoRRhzx3a5k4xME6L", "question": "What are the people playing?", "choices": ["basketball", "tennis", "video games", "baseball"], "correct_choice_idx": 2, "direct_answers": ["game", "video game", "game", "video game", "xbox", "xbox", "video game", "video game", "video games", "play station"], "difficult_direct_answer": false, "rationales": ["Two people are playing a competitive game using a controller.", "The people play video games.", "There is a man sitting close to the tv with a controller playing a game."], "image": "train2014/COCO_train2014_000000400343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470398, "question_id": "2TgXKp4xbMNtqrUJ9e34gP", "question": "What person's first name appears on the largest vehicle?", "choices": ["delta burke", "timothy stack", "omar epps", "ford rainey"], "correct_choice_idx": 0, "direct_answers": ["delta burke", "delta", "delta burke", "delta", "delta", "delta", "delta reese", "delta", "delta", "delta"], "difficult_direct_answer": false, "rationales": ["The name is delta burke.", "The white jumbo airplane has the name delta in blue on side of plane.", "The actress delta burke shares her first name with the airline that made the airplane."], "image": "train2014/COCO_train2014_000000470398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198786, "question_id": "2Urs5ko2vULsBNpd66oKtR", "question": "How many items appear to be made of porcelain?", "choices": ["six", "eight", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "many", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The sink and both of the toilets are made out of porcelain.", "The sink and two toilets are made of porcelain.", "The sink and two toilets seem to be made out of porcelain."], "image": "train2014/COCO_train2014_000000198786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12803, "question_id": "2V4odLdesFEwpq5ymBQNQy", "question": "What kind of meat is served on the plate with all the seafood and vegetables?", "choices": ["salmon", "beef", "chicken", "pork"], "correct_choice_idx": 1, "direct_answers": ["fork egg", "beef", "steak", "stake", "beef", "chicken", "steak", "steak", "beef", "steak"], "difficult_direct_answer": false, "rationales": ["A roast is made from beef.", "Meat is beef.", "A piece of meat this is brown and served with other food is typically beef."], "image": "train2014/COCO_train2014_000000012803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556703, "question_id": "2VADmLj6ADik8wcNz9AfzV", "question": "What kind of green leaf is covering the pizza?", "choices": ["lettuce", "oregano", "spinach", "parsley"], "correct_choice_idx": 2, "direct_answers": ["spinach", "basil", "nothing", "garnish", "basil", "basil", "lettuce", "basil", "spinach", "spinach"], "difficult_direct_answer": false, "rationales": ["The green thing is a leafy veggie.", "The green objects are spinach.", "There is a little piece of spinach on top of the pizza."], "image": "val2014/COCO_val2014_000000556703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396942, "question_id": "2WfyUtMuKKmBkM7znh36yy", "question": "What color is the skiers jacket who is skiing on the left?", "choices": ["red", "sky blue", "orange", "purple"], "correct_choice_idx": 1, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "sky blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The skiier on the left is wearing a light teal colored ski jacket.", "The jackets are blue.", "The jacket matches the color in the horizon."], "image": "train2014/COCO_train2014_000000396942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393845, "question_id": "2WgcuWsgu5orEcf3ZMKbpq", "question": "What color is the collar worn on the shirt with the man having his tie tied?", "choices": ["green", "black", "white", "blue"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white color", "white", "white", "light blue", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["A man is in a dress shirt that is white while a woman adjusts his tie.", "The shirt has a white bright color on the collar.", "His collar is not blue, black, or green."], "image": "train2014/COCO_train2014_000000393845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303439, "question_id": "2XKXZvYh7wYjTgbU3wPXKg", "question": "What do all these animals have in common?", "choices": ["name", "dogs", "color", "birds"], "correct_choice_idx": 3, "direct_answers": ["birds", "birds", "feathers", "wings", "wings", "drinking", "feathers", "fly", "feathers", "birds"], "difficult_direct_answer": false, "rationales": ["There are several different breeds of same animal. they have both beaks and feathers.", "There are ducks, gees, swans and pigeons. they can all fly.", "They only move by flying. they all have beaks and feathers."], "image": "val2014/COCO_val2014_000000303439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394957, "question_id": "2XedMcL3Jpx2dREot8aZ5V", "question": "What type of person uses this facility?", "choices": ["patient", "student", "churchgoer", "traveler"], "correct_choice_idx": 3, "direct_answers": ["any", "man", "human", "athlete", "human", "business person", "traveler", "traveller", "watching", "traveler"], "difficult_direct_answer": false, "rationales": ["There is a piece of luggage on the left. this is a hotel, not a hospital, school, or church, room.", "This is indicated by the suitcase that is in the room.", "The arrangement and decor of this bathroom and the pieces of luggage visible in it lets us conclude that the person using this room is travelling."], "image": "val2014/COCO_val2014_000000394957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523677, "question_id": "2Y9S75Pf3j8YsKaX5XxVue", "question": "What would most likely be sold here?", "choices": ["dim sum", "dumplings", "kobe beef", "cannoli"], "correct_choice_idx": 3, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "cannoli", "pizza"], "difficult_direct_answer": false, "rationales": ["Cheesy foods are shown here, and thus more cheesy foods would be part of the menu at this store.", "The restaurant selling the pizzas is an italian restaurant that might also sell italian desserts like cannoli.", "The shop is an italian restaurant that sells pizza and might sell desserts like cannolis."], "image": "val2014/COCO_val2014_000000523677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84591, "question_id": "2ZaksiR5zuEa6RrxaAc8cQ", "question": "What is on the grass?", "choices": ["zebras", "apples", "cows", "elephants"], "correct_choice_idx": 0, "direct_answers": ["zebras", "zebras", "zebras", "zebra", "zebras", "zebras", "zebras", "zebras", "zebras", "zebras"], "difficult_direct_answer": false, "rationales": ["The animals are on it.", "There are horse-like animals that have black and white stripes.", "The black and white stripes are indicative only of the animal the zebra."], "image": "train2014/COCO_train2014_000000084591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444949, "question_id": "2at476nKtCxqUnpGU9hGWF", "question": "What is there fastened to the top of the wings on this aircraft?", "choices": ["bear", "cat", "person", "goose"], "correct_choice_idx": 2, "direct_answers": ["person", "person", "person", "person", "person", "person", "person", "person", "person", "person"], "difficult_direct_answer": false, "rationales": ["A person is fastened.", "Wing walkers are people who perform tricks in the air using the airplane as a performance area. in order to maintain safety they are fastened on the structure.", "There are no non-human animals on top of the aircraft."], "image": "train2014/COCO_train2014_000000444949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280073, "question_id": "2bYKJZg7dYUNexKYvVmSEi", "question": "What sport is this?", "choices": ["volleyball", "tennis", "equestrian", "baseball"], "correct_choice_idx": 2, "direct_answers": ["horse riding", "equestrian", "dressage", "cricket", "horse riding", "dressage", "horse racing", "horse racing", "equestrian", "equestrian sport"], "difficult_direct_answer": false, "rationales": ["The athlete is riding a horse.", "The image depicts a rider on a horse wearing a particular costume known to be associated with answer a.", "The person in the uniform with a helmet on a horse suggest the sport."], "image": "val2014/COCO_val2014_000000280073.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317565, "question_id": "2cp5gAUVqGDTrCUcTPowZ7", "question": "What is on the desk at the left side of the room?", "choices": ["computer", "hour glass", "cat", "large statue"], "correct_choice_idx": 0, "direct_answers": ["good", "monitor", "computer screen", "remote", "computer", "television", "desktop", "laptop", "computers", "computer"], "difficult_direct_answer": true, "rationales": ["This is identifiable by the screen which is used with this device.", "There are two monitors. they attach to technology.", "As evident by the two monitors as well."], "image": "val2014/COCO_val2014_000000317565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133999, "question_id": "2dm8sAUNmh5C5mfmmzUgwP", "question": "What year was the company founded whose sign appears above the lagging bus?", "choices": ["1748", "1236", "1892", "1992"], "correct_choice_idx": 2, "direct_answers": ["1982", "1969", "1892", "1980", "cocacola", "1945", "cococola", "nothing", "no clue", "1886"], "difficult_direct_answer": true, "rationales": ["A red and white coca cola sign is visible on a building.", "Coca cola was founded in 1892.", "Coca cola was founded in 1892."], "image": "val2014/COCO_val2014_000000133999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576875, "question_id": "2eAZUPmQeHUFVJgWVrf7BN", "question": "What are the team colors for the team playing at pitch?", "choices": ["blue", "purple", "red", "yellow"], "correct_choice_idx": 2, "direct_answers": ["gray", "black red", "gray", "blue", "grey", "white", "grey red", "white", "red", "white"], "difficult_direct_answer": false, "rationales": ["The colors appear to be a and gray. it's hard to tell.", "The team colors are red.", "The stripe on the pants and the badge on the arm would typically be decorated in a team color."], "image": "val2014/COCO_val2014_000000576875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23737, "question_id": "2eJKs464WMfwBEzaDWpSNG", "question": "Which part of the dog held aloft by the man is shaved?", "choices": ["chest", "tail", "butt", "head"], "correct_choice_idx": 2, "direct_answers": ["backside", "legs", "back legs", "back", "rear", "rear", "backside", "leg", "feet", "butt"], "difficult_direct_answer": false, "rationales": ["The back part of the dog is shaved.", "The butt is shaved.", "It's fairly obvious given that no other areas are shaved."], "image": "train2014/COCO_train2014_000000023737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33732, "question_id": "2eNHQqRTyK9fRoGohZQ63k", "question": "What are the horses near?", "choices": ["cars", "babies", "hay", "elephants"], "correct_choice_idx": 0, "direct_answers": ["car", "car", "cars", "cars", "sports car", "car", "ok", "car", "car", "cars"], "difficult_direct_answer": false, "rationales": ["The horses are by cars.", "The horses are walking near cars. the cars are parked in a parking lot.", "This is obvious based on the fact that they're in a parking lot."], "image": "train2014/COCO_train2014_000000033732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380108, "question_id": "2f49YffeqppVSmXSyC8tmL", "question": "What color are the fruits resting atop the fruitbowl of the middle?", "choices": ["green", "purple", "orange", "red"], "correct_choice_idx": 2, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The fruit in the bowl are the same color as carrots.", "They are oranges.", "The color is orange."], "image": "train2014/COCO_train2014_000000380108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334557, "question_id": "2f9cF8iuaew6jrD7ZxGJYU", "question": "Where is the dog sitting?", "choices": ["bench", "crate", "window", "girls lap"], "correct_choice_idx": 3, "direct_answers": ["girl", "lap", "woman's lap", "lap", "lap", "girls lap", "lap", "lap", "lap", "lap"], "difficult_direct_answer": false, "rationales": ["The dog is sitting on the woman's lap. the dog is being held.", "He's sitting on her while she sits", "The dog is in the girl's lap."], "image": "train2014/COCO_train2014_000000334557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83494, "question_id": "2fYvVYFC3xVUEgwLFYMgFF", "question": "What kind of meat is on the top of the plate near to the strange rock design?", "choices": ["pork", "salmon", "beef", "chicken"], "correct_choice_idx": 2, "direct_answers": ["carrot", "steak", "carrot", "steak", "beef steak", "steak", "steak", "steak", "beef", "steak"], "difficult_direct_answer": false, "rationales": ["The food on the top of the plate has a color and consistency most related to answer a.", "The meat is beef.", "It is large, brown meat."], "image": "val2014/COCO_val2014_000000083494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484843, "question_id": "2g9NKrUMRhnpRCju8hGq7g", "question": "What is the man holding?", "choices": ["baby", "apple", "can", "kitten"], "correct_choice_idx": 2, "direct_answers": ["can", "beer phone", "beer", "phone", "mp3 player", "can", "beer", "beer", "phone", "drink"], "difficult_direct_answer": false, "rationales": ["The man has a beer can.", "The man that is sitting is holding a blue can with a cold drink in it.", "He is drinking a beer."], "image": "train2014/COCO_train2014_000000484843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109231, "question_id": "2gx7h27NihSXrrAjgjmxFL", "question": "What place could the red shirt refer to?", "choices": ["boston", "piz palu", "dresden", "remich"], "correct_choice_idx": 0, "direct_answers": ["brighton", "boston", "bloomington", "boston", "boston", "boston", "city", "city", "b town", "boston"], "difficult_direct_answer": false, "rationales": ["B-town could be used to refer to boston.", "The red shirt is for the boston team.", "The place is boston."], "image": "val2014/COCO_val2014_000000109231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208731, "question_id": "2hKHtX6VDw4TkptmBa6TfG", "question": "What kind of climate is this?", "choices": ["warm", "cold", "freezing", "rainy"], "correct_choice_idx": 0, "direct_answers": ["warm", "sunny", "warm", "warm", "summer", "warm", "warm", "sunny", "sunny", "warm"], "difficult_direct_answer": false, "rationales": ["There are many people on the beach. they are here because its nice enough to lay out in sun or swim", "A beach is shown with palm trees in the background.", "This is a warm, sunny beach."], "image": "train2014/COCO_train2014_000000208731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184135, "question_id": "2hcs3EhA5xx8zVDFkneSuN", "question": "What are the objects held in the small group of people at the mouth of this road?", "choices": ["rainjackets", "pianos", "windex", "umbrellas"], "correct_choice_idx": 3, "direct_answers": ["umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrellas", "stones", "umbrellas", "umbrellas"], "difficult_direct_answer": false, "rationales": ["The object is an umbrella.", "There are umbrellas on the road.", "The people standing at the mouth of the road all have umbrellas."], "image": "train2014/COCO_train2014_000000184135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574590, "question_id": "2hfY67AWhNFUkTdb9pAexY", "question": "What is protecting the person on the left's hands?", "choices": ["gauntlets", "cestus", "magic beans", "gloves"], "correct_choice_idx": 3, "direct_answers": ["gloves", "gloves", "gloves", "gloves", "gloves", "gloves", "gloves", "gloves", "gloves", "gloves"], "difficult_direct_answer": false, "rationales": ["This item is used to keep the hands warm.", "The man is wearing heavy gloves.", "The gloves keep them warm"], "image": "train2014/COCO_train2014_000000574590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316396, "question_id": "2hwuZJKPAoqkrJjDGCUUfE", "question": "What material is covering the tree so the giraffes can live with it?", "choices": ["glass", "cellophane", "wire", "kevlar"], "correct_choice_idx": 2, "direct_answers": ["wire", "grass", "netting", "net", "paint", "netting", "copper", "stone", "stone", "bark"], "difficult_direct_answer": false, "rationales": ["The material is a thin metal rod put together in a pattern in order to product the tree.", "The wire protects the tree so that the giraffe don't eat the tree trunk.", "The fence is made of wire."], "image": "val2014/COCO_val2014_000000316396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501677, "question_id": "2jaJkzepEkvHm48KVqo5pB", "question": "What color are the edges of the wheels on the skateboard with the man sitting on it?", "choices": ["blue", "white", "purple", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "red", "green", "green", "green", "green", "green", "brown", "green"], "difficult_direct_answer": false, "rationales": ["The man is sitting on a skateboard that has bright green paint on the wheels.", "They are the color of the leaves", "The person on the ground is sitting on a skateboard with green wheels."], "image": "train2014/COCO_train2014_000000501677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558011, "question_id": "2nyG6eEQ53XvJuGNrPGMBM", "question": "What is at the top of this corner in the middle of the city square?", "choices": ["church tower", "advertisement", "latitude", "optiplex"], "correct_choice_idx": 0, "direct_answers": ["clock tower", "church tower", "clock", "nothing", "cross", "clock", "watching", "red", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["A steeple is normally on a church.", "A large clock is at the top of a pointed roof with a tall cross.", "There is a building that looks like it has a church steeple."], "image": "train2014/COCO_train2014_000000558011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130456, "question_id": "2q2JoTkErDasXa5WxGy24S", "question": "Which gaming system is the white remote for on the table?", "choices": ["gamecube", "playstation", "xbox", "nintendo wii"], "correct_choice_idx": 2, "direct_answers": ["xbox", "xbox", "video", "playstation", "xbox", "gaming", "sofa", "joystick", "xbox", "video game"], "difficult_direct_answer": false, "rationales": ["The controllers on the wooden folding table are used to play playstation.", "The system is the xbox.", "It was from the xbox."], "image": "val2014/COCO_val2014_000000130456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179188, "question_id": "2snfyk5hrxSrkeGRN4B7cs", "question": "What color is the jacket worn by the man in the center of the skiers?", "choices": ["orange", "black", "purple", "green"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "red color", "red", "orange", "red", "orange", "red orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The center skier's jacket is not green, black, or purple.", "The skier with eye glasses has a orange colored jacket.", "The background skier is wearing a green jacket, and the foreground skier is wearing a black jacket. the center skier is wearing a different colored jacket that is not purple."], "image": "train2014/COCO_train2014_000000179188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522941, "question_id": "2tHupgKDeLSen9NQPQrtuH", "question": "What color are the decorations on the face of the elephant with pink ear tips?", "choices": ["green", "white", "yellow", "blue"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "drawing", "white", "white", "white", "drawing", "white", "white", "white color"], "difficult_direct_answer": false, "rationales": ["They are white.", "The color is white.", "There is only one elephant present and it is facing the camera so the coloring on its face is visible. the decorative coloring is clearly identifiable."], "image": "val2014/COCO_val2014_000000522941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404578, "question_id": "2uFMdouchunR3JaxksPpFz", "question": "What is duplicated but different sizes next to the cake?", "choices": ["fork", "lemon", "apple", "knife"], "correct_choice_idx": 0, "direct_answers": ["slices", "folk", "forks", "forks", "fork", "forks", "fork", "forks", "fork", "cake"], "difficult_direct_answer": false, "rationales": ["The objects have four \"sticks\" coming from the base of the utensil.", "There are two forks next to the cake and they are different sizes.", "The cake has two forks next to it that are the same design but different sizes."], "image": "train2014/COCO_train2014_000000404578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289610, "question_id": "2umgTsoQFZrQgQMZVWqJwb", "question": "What material forms the cross around the neck of the bear in the religious robe?", "choices": ["brass", "copper", "wood", "gold"], "correct_choice_idx": 2, "direct_answers": ["cloth", "cloth", "wood", "wood", "string", "metal", "wood", "cotton", "silk", "cloth"], "difficult_direct_answer": false, "rationales": ["The cross around the bear's neck is made of carved wood.", "There is a wooden cross around the neck of the middle bear.", "It is a carved charm on a necklace and it is light brown"], "image": "val2014/COCO_val2014_000000289610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531153, "question_id": "2ww5mZH7ytYpqGi9cgEoV2", "question": "What kind of winter sport equipment is the man preparing to at the top of the mountain?", "choices": ["alpine skis", "snowboard", "country skis", "racing skis"], "correct_choice_idx": 0, "direct_answers": ["skis", "alpine skis", "skiing", "skis", "ski", "skiing", "ski", "skiing", "skis", "skis"], "difficult_direct_answer": false, "rationales": ["The other options don't make sense. these are the type to use going down slopes.", "He is holding two poles and two skiis.", "He is using skiis."], "image": "train2014/COCO_train2014_000000531153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17859, "question_id": "2xdXUTpdbEt6f3HrZX8qXQ", "question": "What is near the tree?", "choices": ["cat", "fire hydrant", "pumpkin", "dog"], "correct_choice_idx": 1, "direct_answers": ["cars", "car", "dark suv", "car", "car", "car", "fire hydrant", "mental hospital", "cars", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["The object is red with pumps on the side.", "There is a fire hydrant somewhat close to the tree.", "There is a street right in front of a large tree and a red structure by the curb that is used to get water from in case of fire."], "image": "val2014/COCO_val2014_000000017859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47879, "question_id": "2xecuy8oTcsBQcarUQjLmv", "question": "The zebras in the middle of the field are busy doing what?", "choices": ["running", "walking", "eating grass", "pointing nose"], "correct_choice_idx": 2, "direct_answers": ["eating", "eating", "eating", "grazing", "grazing", "eating grass", "eating", "eating", "eating", "tree"], "difficult_direct_answer": false, "rationales": ["The zebras are snacking.", "The zebras in the middle of the field are busy grazing on the grass.", "The animals are grazing on the plush grounds."], "image": "train2014/COCO_train2014_000000047879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104097, "question_id": "2yE7u3NxPFEYgjUSxXWogo", "question": "Who is playing tennis?", "choices": ["old lady", "old man", "mascot", "toddler"], "correct_choice_idx": 1, "direct_answers": ["old man", "elderly man", "old man", "man", "older person", "old man", "old man", "old man", "old man", "man"], "difficult_direct_answer": false, "rationales": ["A guy with white hair is reaching for a tennis ball with his racket.", "An old man is playing tennis.", "The other options don't match this person."], "image": "train2014/COCO_train2014_000000104097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16111, "question_id": "2ys9nW35nwQjXnJ8YiNTNx", "question": "How many giraffes are walking around on top of the green savannah?", "choices": ["six", "four", "three", "five"], "correct_choice_idx": 3, "direct_answers": ["five", "five", "five", "five", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["There are 2 by the tree and three in the field", "There is a pair of giraffes on the left side of the hill and another group of three on the right side of the hill.", "Against the horizon of this image we see two giraffes together on the left near the largest tree in the image as well as three giraffes to the right who seem to be walking away."], "image": "train2014/COCO_train2014_000000016111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419759, "question_id": "34tdAWxXy5go5XfFnaiPYJ", "question": "What company owns this vehicle?", "choices": ["ford", "lufthansa", "gmc", "ibm"], "correct_choice_idx": 1, "direct_answers": ["regional", "lufthansa", "lufthansa", "regional", "regional", "lufthansa", "sa regional", "city line", "lufthansa", "espansa regional"], "difficult_direct_answer": false, "rationales": ["The company is emblazoned on the side of the plane.", "The vehicle is an airplane, not a car or a computer. the airline's name is before regional on the fuselage.", "You can see the tail end portion of the name on the side of the airplane"], "image": "train2014/COCO_train2014_000000419759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195510, "question_id": "35CkJuCsrDHvb3dEdv8z8W", "question": "What color is the bumper underneath of the license plate on the back of the bus?", "choices": ["silver", "blue", "purple", "green"], "correct_choice_idx": 1, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "white", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["It comes with the bus design as it was build.", "The color matches ocean water", "There is a large white bus going down a busy road with a blue bumper."], "image": "val2014/COCO_val2014_000000195510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84040, "question_id": "35chQqnQwT3pu6kZs4hhLs", "question": "What is the red sauce that is covering the hot dog sausages?", "choices": ["oyster", "ketchup", "hoisen", "hot sauce"], "correct_choice_idx": 1, "direct_answers": ["ketchup", "ketchup", "barbeque", "ketchup", "red sauce", "car", "sash", "ketchup", "ketchup", "ketchup"], "difficult_direct_answer": false, "rationales": ["This is the red item most often put on hot dogs", "It is thick and is red and looks to be tomato based sauce on top.", "This condiment is often used for these foods and is made from tomatoes"], "image": "train2014/COCO_train2014_000000084040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38540, "question_id": "37YHpXAMKAKMnPdKw9TUJ3", "question": "What color are the park benches are in the waiting area for this bus lane?", "choices": ["two", "one", "four", "three"], "correct_choice_idx": 2, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "four", "green"], "difficult_direct_answer": false, "rationales": ["The benches are grouped in four.", "Two dark colored benches are near a bus stop.", "None of the answers provide the color of the benches as requested in the question, but answer a does give the correct number."], "image": "val2014/COCO_val2014_000000038540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128709, "question_id": "3Aw48VtHxdvQzyBsjdD7Ry", "question": "What does the item in the foreground require?", "choices": ["gasoline", "cooking tray", "money", "air pump"], "correct_choice_idx": 2, "direct_answers": ["pays", "coins", "car", "coins", "money", "coins", "coins", "coins", "white", "money"], "difficult_direct_answer": false, "rationales": ["The item requires money.", "The parking meter in the sidewalk requires money in order to buy parking time.", "You put change or money in the parking meter."], "image": "train2014/COCO_train2014_000000128709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 226660, "question_id": "3BWSSDSunYtu6hYi9s3y5z", "question": "How many metallic squares encapsulate the mirror in the hotel wall?", "choices": ["five", "two", "three", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "five", "five", "four", "four", "three", "four", "three"], "difficult_direct_answer": false, "rationales": ["The mirror on the wall has four squares that are nested within one another.", "There are four layers.", "There's 4 different edges of that mirror of different sizes."], "image": "train2014/COCO_train2014_000000226660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517410, "question_id": "3BeALYjDw7vF4LqCjUnybj", "question": "How many geese are standing on the side of the marina?", "choices": ["four", "two", "five", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is open space on the shore and they are easy to count", "Two birds are standing on pavement near water.", "There are two geese on the pavement."], "image": "train2014/COCO_train2014_000000517410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511848, "question_id": "3C4bzqgdyf9s63acAmebks", "question": "What do the directional signs in the middle of the photo point to?", "choices": ["roads", "hotel rooms", "trails", "ski runs"], "correct_choice_idx": 3, "direct_answers": ["destinations", "slopes", "paths", "places", "trails", "towns", "mountains", "ski runs", "ski runs", "different trails"], "difficult_direct_answer": true, "rationales": ["There are multiple runs on ski hills.", "They let people know which paths they can take", "The signs show the ski runs."], "image": "train2014/COCO_train2014_000000511848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262951, "question_id": "3DRNN8yFaW79eGS7DHNma4", "question": "Who uses the item in the sky the most?", "choices": ["kids", "police officers", "army sergeants", "old men"], "correct_choice_idx": 0, "direct_answers": ["children", "children", "children", "children", "kites", "children", "people", "kids", "kids", "kids"], "difficult_direct_answer": false, "rationales": ["There are a lot of kids on the beach and running around with kites in the air. it is a windy day you can tell because the tails of the kites are whipping around in air.", "The kites in the sky are usually fun toys for kids to play with at the beach.", "The items in the sky are kites. they are not used the most by old men, army sergeants, or police officers."], "image": "train2014/COCO_train2014_000000262951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398818, "question_id": "3EzAfa2oZT34GQcikC86UF", "question": "What year was the company founded whose name appears on the sticker?", "choices": ["1710", "1776", "1870", "1925"], "correct_choice_idx": 2, "direct_answers": ["1944", "1870", "chiquira", "eighteen seventy", "car", "1870", "1870", "stop", "1870", "1943"], "difficult_direct_answer": false, "rationales": ["The company was founded in 1870.", "I had to look this one up. i confirmed it on wikipedia.", "The company is called chiquita banana. company information can be found by researching it."], "image": "val2014/COCO_val2014_000000398818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250200, "question_id": "3FAGE4bYXphrafcqw9isg3", "question": "What is flying in the sky above the lake harbor?", "choices": ["airplane", "blimp", "bird", "helicopter"], "correct_choice_idx": 0, "direct_answers": ["airplane", "plane", "plane", "airplane", "airplane", "aeroplane", "plane", "airplane", "plane", "airplane"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene.", "A plane is in the air above a bunch of boats parked at a dock.", "The object is clearly visible and located in the sky. the size, shape and manner of travel is consistent with answer a."], "image": "train2014/COCO_train2014_000000250200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490097, "question_id": "3FTBtniFNsU7NJGRAaXy4b", "question": "What color are the sofa seats surrounding the table on the rug?", "choices": ["red", "green", "blue", "cream"], "correct_choice_idx": 3, "direct_answers": ["cream", "white", "white", "white", "white", "wood", "white", "beige", "white", "white"], "difficult_direct_answer": false, "rationales": ["This is a common colour for furniture.", "The sofas in the living room are made of light cream colored leather.", "You can visually identify the color. it appears slightly darker than white."], "image": "train2014/COCO_train2014_000000490097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96820, "question_id": "3FeJSdPXc7ULeQ4h6jqcCN", "question": "What kind of buses are in the derby for demolition?", "choices": ["city", "postal", "prison", "school"], "correct_choice_idx": 3, "direct_answers": ["school buses", "school", "school", "buses", "school bus", "school", "school bus", "school", "school bus", "public"], "difficult_direct_answer": false, "rationales": ["The classic school bus is yellow and black.", "School buses are often yellow.", "The demolition derby is using school buses as part of their show."], "image": "train2014/COCO_train2014_000000096820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31666, "question_id": "3FsP6hxW3cGxnwJQLuxAJ2", "question": "What is the number on the train?", "choices": ["9634", "72856", "2595", "45110"], "correct_choice_idx": 3, "direct_answers": ["fourfiveoneonezero", "45110", "45110", "45110", "four", "fourtyfivethousand onehundredten", "45110", "45110", "45110", "45110"], "difficult_direct_answer": false, "rationales": ["That is the number on the smokebox.", "On the front of the train the numbers 4 5 1 1 and 0 can be seen.", "The number is on the front of the train."], "image": "train2014/COCO_train2014_000000031666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455674, "question_id": "3G5mx2bzcKBrPfY26ctrQh", "question": "What color is the background of the hearts in the center of the staircase?", "choices": ["red", "blue", "purple", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "black", "white", "white", "white", "white", "white", "pink", "white"], "difficult_direct_answer": false, "rationales": ["They are this color with red lettering", "The hearts in the center of the staircase are white.", "They are this with red lettering on them"], "image": "train2014/COCO_train2014_000000455674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405368, "question_id": "3GHjNxyFgoLQtoR66WEJGM", "question": "What company makes the phone?", "choices": ["apple", "samsung", "nokia", "ibm"], "correct_choice_idx": 2, "direct_answers": ["nokia", "nokia", "nokia", "nokia", "nokia", "nokia", "nokia", "nokia", "nokia", "nokia"], "difficult_direct_answer": false, "rationales": ["The name is on the top of the phone.", "The brand of the phone appears in writing at the top of the phone.", "The word nokia can be seen at the top of the phone, as the brand."], "image": "val2014/COCO_val2014_000000405368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414999, "question_id": "3Kt3xggX6sK4kthr6pSBPJ", "question": "How does the temperature likely feel?", "choices": ["cold", "hot", "warm", "cool"], "correct_choice_idx": 3, "direct_answers": ["sunny", "summer", "warm", "cold", "cool", "cold", "stop", "warm", "cool", "cool"], "difficult_direct_answer": false, "rationales": ["Judging by the clothing being worn by the people in the stands, they are wearing more clothing than one would on a warm day, but less that one would on a very cold day so the temperature is likely in the middle.", "Based on the fans in the stand with long sleeves and jackets, fall temperatures.", "The temperature is cool."], "image": "train2014/COCO_train2014_000000414999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345353, "question_id": "3LJt3HArFxSD5EVpLfmReG", "question": "Which item on the plate is highest in carbs if the person ate all of it?", "choices": ["pork", "corn", "hamburger bun", "macaroni"], "correct_choice_idx": 3, "direct_answers": ["sandwich", "meat", "macaroni", "mac cheese", "bread", "pasta", "macaroni", "dish", "noodles", "macaroni"], "difficult_direct_answer": false, "rationales": ["This is made from flour", "The mac and cheese has the most carbs since it is made of pasta.", "It is the largest quantity of food on the plate"], "image": "val2014/COCO_val2014_000000345353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268114, "question_id": "3Ly8zc2LxkWbnyt46EJARg", "question": "What is above the water?", "choices": ["boat", "shark", "balloon", "surfboard"], "correct_choice_idx": 0, "direct_answers": ["dock", "boats", "ship", "pier", "boats", "boat", "boat", "boats", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["There is a boat traveling on the water.`", "The image and objects within are clearly visible. answer a is visibly present and located above the water.", "The only answer visible from the list of answers is answer a."], "image": "train2014/COCO_train2014_000000268114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228160, "question_id": "3MRw4mjA8DuEuaVNXBTDn6", "question": "What kind of event is this?", "choices": ["fashion show", "movie premiere", "awards ceremony", "wine tasting"], "correct_choice_idx": 3, "direct_answers": ["speed dating", "meeting", "wine tasting", "wine tasting", "party", "party", "wine tasting", "wine tasting", "wine tasting", "wine tasting"], "difficult_direct_answer": false, "rationales": ["(a) wine testing. it looks like most people are holding glasses of wine and holding papers that could be about the wines.", "This seems to be the case based on the glasses and the liquid in them.", "People are standing in a crowd and all are holding wine glasses and a packet of papers."], "image": "train2014/COCO_train2014_000000228160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19428, "question_id": "3StFmGPaVXFUwyh8WeaMYB", "question": "What color is the toothbrush in the jar on the counter?", "choices": ["green", "red", "yellow", "blue"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "white yellow", "yellow", "yellow", "yellow", "fee", "white", "pink", "white"], "difficult_direct_answer": false, "rationales": ["There is only one toothbrush on the counter and the color is clearly visible and is answer a.", "The toothbrush is not blue, green, or red.", "A toothbrush that is light in color is in a cup on a bathroom sink countertop."], "image": "train2014/COCO_train2014_000000019428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81922, "question_id": "3SwJAdxfdrTEZV6PqvFQ8o", "question": "What kind of vehicle is seen above the large freeway?", "choices": ["helicopter", "airplane", "boat", "ufo"], "correct_choice_idx": 1, "direct_answers": ["airplane", "airplane", "airplane", "airplane", "airplane", "plane", "plane", "airplane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["There is an airplane flying above the freeway.", "There is a large airplane flying low to the ground over the freeway.", "There is a large white airplane flying low to the ground above the freeway."], "image": "val2014/COCO_val2014_000000081922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332296, "question_id": "3TYHs56Dc6VP22eKYVUqT8", "question": "The first three letters of the name of the street form the first name of what actor?", "choices": ["jim beaver", "kal penn", "joe pesci", "michael keaton"], "correct_choice_idx": 1, "direct_answers": ["kal", "unknown", "kal", "kal penn", "kal penn", "kal", "kal", "kal", "kal", "kal"], "difficult_direct_answer": false, "rationales": ["The street sign's first 3 letters are kal.", "It's the only one to match.", "A street sign lists the street name as kalorama."], "image": "train2014/COCO_train2014_000000332296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425254, "question_id": "3TjUmQNh9wMMgekcth792Y", "question": "How many zebras are contained by the chainlink fence to forage grass?", "choices": ["five", "three", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is a trio of them grazing within the enclosure.", "There are three zebras grazing on the grass in the fenced-in area.", "The fence has three zebras on the inside. the zebras are eating grass."], "image": "train2014/COCO_train2014_000000425254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472610, "question_id": "3TvDJZe6UGEymEZ9uj25tn", "question": "What breed of dog is held by the woman near the cow pasture?", "choices": ["poodle", "beagle", "golden retriever", "pit bull"], "correct_choice_idx": 3, "direct_answers": ["pit bull", "bulldog", "dog", "pitbull", "pitbull", "labroder", "pit bull", "pit bull", "pitbull", "bull dog"], "difficult_direct_answer": false, "rationales": ["A stocky white and black dog is with a person in a field.", "The dog is clearly visible and has the face and body shape consistent with answer a and not any other answer provided.", "It has that short snout and big head with muscular body."], "image": "val2014/COCO_val2014_000000472610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452775, "question_id": "3V3jG9fsLgqK3XhQzy6z7K", "question": "What is the most likely continent for this setting?", "choices": ["africa", "australia", "south america", "antarctica"], "correct_choice_idx": 2, "direct_answers": ["europe", "europe", "france", "south america", "europe", "south america", "europe", "road", "parking lines", "relax"], "difficult_direct_answer": false, "rationales": ["Most of the signs on the city street are in spanish and could be located somewhere in south america where spanish is mostly spoken.", "The signs are in spanish", "There is spanish on the signs"], "image": "train2014/COCO_train2014_000000452775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50323, "question_id": "3Vj6HsxzNpvpabUuBAXGvg", "question": "How many portraits are found to be hung on the walls of this living room area?", "choices": ["two", "three", "four", "five"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "one", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is a picture frame above both sofa and in the corner by lamp.", "There are two pieces of framed artwork on the walls of the living room.", "One portrait is above the couch. an additional one is near the lamp."], "image": "train2014/COCO_train2014_000000050323.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312724, "question_id": "3XDyLUAdkmA9gRnCX23qEj", "question": "What is on the grass?", "choices": ["candy", "women", "animals", "cars"], "correct_choice_idx": 2, "direct_answers": ["animals", "animals", "animals", "animals", "animals", "animals", "animals", "animals", "animals", "animals"], "difficult_direct_answer": false, "rationales": ["There are animals on the grass.", "A lot of animals are in the field", "There are zebras and giraffes on the grass."], "image": "val2014/COCO_val2014_000000312724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145188, "question_id": "3Xsw6PHNYWoLYSsqyXQXZT", "question": "What time of day is most likely?", "choices": ["night", "evening", "afternoon", "morning"], "correct_choice_idx": 3, "direct_answers": ["morning", "morning", "morning", "evening", "morning", "morning", "morning", "breakfast", "breakfast", "morning"], "difficult_direct_answer": false, "rationales": ["The plate contains foods that are normally served at breakfast in the morning such as eggs and toast.", "There is a plate of eggs and hash browns.", "There are eggs and hashbrowns in the photo. this is a traditional breakfast food in western culture."], "image": "train2014/COCO_train2014_000000145188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81117, "question_id": "3YRDDhwCsb76jwo6sFQZxR", "question": "What video game has settings like this?", "choices": ["pac man", "uncharted", "centipede", "asteroids"], "correct_choice_idx": 1, "direct_answers": ["mortal kombat", "forefront", "uncharted", "role playing", "halo", "zelda", "fortnite", "tomb raider", "uncharted", "donkey kong"], "difficult_direct_answer": true, "rationales": ["Centipede, asteroids, and pacman are older games and have nothing like this for settings.", "Uncharted is an adventure game that has a lot of forest or jungle elements.", "Uncharted is an outdoor adventure game searching for treasure in green areas."], "image": "train2014/COCO_train2014_000000081117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44596, "question_id": "3YggNCeymvRMN9tijbwruC", "question": "How many chairs are against the windows behind the television?", "choices": ["five", "three", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "brown", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["A pair of chairs are arranged behind a television.", "There are two armchairs behind the tv.", "There is one on each side of the tv"], "image": "train2014/COCO_train2014_000000044596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388599, "question_id": "3aP4rYvRGsRwrqCnABvTum", "question": "What is the primary color of the frisbee held by the man that is bitten by this dog?", "choices": ["purple", "white", "pink", "red"], "correct_choice_idx": 1, "direct_answers": ["white", "clear grey", "white", "clear", "white", "white", "white", "white", "white", "clear"], "difficult_direct_answer": false, "rationales": ["The frisbee is whitish.", "The frisbee that the dog is reaching out to grab is made of white plastic.", "The frisbee is not pink, purple, or red."], "image": "val2014/COCO_val2014_000000388599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335766, "question_id": "3bGzhZfkqvdLgtQUVTwmPc", "question": "What is the glass item on top of the counter?", "choices": ["candy cane", "bowl", "cannon", "statue"], "correct_choice_idx": 1, "direct_answers": ["sink", "bowl", "bowl", "vessel sink", "wash basin", "sink", "bowls", "faucet", "sink", "sink"], "difficult_direct_answer": false, "rationales": ["The sinks in the bathroom are made of simple glass bowls on the countertop.", "These are fancy sinks that use this shape", "Used to hold water as a sink."], "image": "train2014/COCO_train2014_000000335766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104801, "question_id": "3bPecZRs6K5M29hDrXAxJC", "question": "What color are the speakers on the top of the stereo set on either side of the TV and piano?", "choices": ["red", "blue", "white", "yellow"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black yellow", "black", "yellow black", "yellow", "yellow", "black", "black yellow"], "difficult_direct_answer": false, "rationales": ["The speaker looks yellow in color in the room.", "The speakers next to the tv and piano are yellow in the middle.", "The speakers on the sides of the piano are black with yellow circles in the middle.`"], "image": "val2014/COCO_val2014_000000104801.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180982, "question_id": "3cTFDbAVWQHQPcWa4RCfbz", "question": "What vehicles are stationed here?", "choices": ["trains", "airplanes", "cars", "buses"], "correct_choice_idx": 0, "direct_answers": ["train", "trains", "four", "trains", "stop", "trains", "trains", "train", "train", "trains"], "difficult_direct_answer": false, "rationales": ["There are vehicles on rails.", "Freight trains are on the tracks.", "There are two sets of vehicles on a track"], "image": "val2014/COCO_val2014_000000180982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537772, "question_id": "3fNSLeV7LDfudwDsTTHZY7", "question": "What is this type of truck called?", "choices": ["dump truck", "cement truck", "semi", "pickup"], "correct_choice_idx": 2, "direct_answers": ["flatbed", "wddw", "semi", "18 wheeler", "mack", "trailer", "semi", "flatbed", "tractor trailer", "semi"], "difficult_direct_answer": false, "rationales": ["This is a large truck that is used to pull heavy items. it has 18 wheels.", "A white semi truck with sixteen wheels and a long flatbed in back.", "The truck is semi."], "image": "train2014/COCO_train2014_000000537772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543732, "question_id": "3gk5k7fXjsJT9A6vhw7jfh", "question": "What kind of mouse is being used?", "choices": ["wireless", "ball mouse", "light up", "wired"], "correct_choice_idx": 0, "direct_answers": ["wireless", "mouse pad", "wireless", "wireless", "wireless", "wireless", "wireless", "computer mouse", "logitech", "regular"], "difficult_direct_answer": false, "rationales": ["There is no cord on it", "A wireless mouse is used.", "There are no cables seen attached to the mouse."], "image": "train2014/COCO_train2014_000000543732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345644, "question_id": "3k7qzJPnmJd73W6hN4Jmzf", "question": "What is the coldest item here?", "choices": ["car engine", "snow", "fan", "refrigerator"], "correct_choice_idx": 1, "direct_answers": ["snow", "snow", "snow", "snow", "ice", "ice", "snow", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["There is some white powdery substance. it can be used to make cold items.", "The snow is very cold.", "The coldest item is snow."], "image": "train2014/COCO_train2014_000000345644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427129, "question_id": "3kiHzBGKTZgnyhrNVtJbYZ", "question": "What is the animal who is standing in the middle of the rocks?", "choices": ["rhino", "bird", "giraffe", "zebra"], "correct_choice_idx": 3, "direct_answers": ["zebra", "zebra", "zebra", "stop", "zebra", "zebra", "car", "zebra", "giraffe", "zebra"], "difficult_direct_answer": false, "rationales": ["It is small and horse like. it has characteristic black and white stripes typical to this animal.", "The animal standing next to the giraffes in the middle of the rocks is a zebra.", "The animal is the zebra."], "image": "train2014/COCO_train2014_000000427129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331520, "question_id": "3mUfUKgDHpjQ7fHrter4j2", "question": "Which finger is touching the rubber band?", "choices": ["left pinky", "left middle", "left pointer", "right thumb"], "correct_choice_idx": 3, "direct_answers": ["thumb", "thumb", "manshowing finger", "thumb", "thumb", "right thumb", "thumb", "pointer", "thumb", "thumb"], "difficult_direct_answer": false, "rationales": ["The right thumb is touching the rubber band on the ball.", "The right thumb is touching.", "It is on the item"], "image": "train2014/COCO_train2014_000000331520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530941, "question_id": "3nzE2gdmK6DiAMz5p4Z5Na", "question": "What are the cows inside of?", "choices": ["cardboard boxes", "fence", "cages", "cars"], "correct_choice_idx": 1, "direct_answers": ["fance", "pen", "cage", "pen", "pen", "fence", "pen", "fence", "fence", "field"], "difficult_direct_answer": false, "rationales": ["They have been enclosed by wood and wire mesh fence.", "The cows are inside an enclosure that is open on top.", "The enclosure surrounding is clearly visible and based on the connected posts layout, answer a is consistent."], "image": "val2014/COCO_val2014_000000530941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230268, "question_id": "3p7dd6kRPeZVoNamEqt4Si", "question": "What color is the top of the comforter hanging on the wooden bedframe?", "choices": ["orange", "red", "cream", "blue"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "cream"], "difficult_direct_answer": false, "rationales": ["The color on the bed is cream colored.", "The color is cream.", "The top of the comforter is off-white, not blue, red, or orange."], "image": "val2014/COCO_val2014_000000230268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512296, "question_id": "3pM3p9UJofNNT2a3fxyaSL", "question": "What would this animal like to eat the most?", "choices": ["carrot", "fish", "chicken leg", "hamburger"], "correct_choice_idx": 0, "direct_answers": ["carrot", "carrot", "hay", "grass", "carrots", "carrot", "hay", "hay", "hay", "hay"], "difficult_direct_answer": false, "rationales": ["There is a horse depicted that only wats veggies.", "Horses eat carrots", "The animal in the foreground is a horse and would love to eat carrots."], "image": "train2014/COCO_train2014_000000512296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211118, "question_id": "3pWANbmUKFQFUmGcKxxrEH", "question": "What color is the cape worn by the little bunny figurine?", "choices": ["orange", "purple", "blue", "green"], "correct_choice_idx": 0, "direct_answers": ["red", "orange", "pink", "orange", "red", "red", "white", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["A small bunny figurine is white with a bright cape the color of a construction cone.", "This is worn draped off the shoulders to the back of the wearer. this bunnys cape is this color.", "The bunny figurine is wearing a bright orange cape."], "image": "train2014/COCO_train2014_000000211118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217856, "question_id": "3rTW2Umr58iSNvfrycK9ij", "question": "What is going down the ramp?", "choices": ["skateboarder", "cat", "baby", "dog"], "correct_choice_idx": 0, "direct_answers": ["skater", "man", "skateboarder", "skateboard", "skater", "skate boarder", "skateboard", "skateboard", "skateboarder", "man skateboard"], "difficult_direct_answer": false, "rationales": ["There is a skateboarder going down the ramp.", "An adult human is performing an extreme sports trick.", "The skateboarder goes down."], "image": "train2014/COCO_train2014_000000217856.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365258, "question_id": "3rZVAtizZD8SvkCFSNg948", "question": "What does the green item all the way to the right look like most?", "choices": ["limes", "leaves", "jelly", "partridge"], "correct_choice_idx": 1, "direct_answers": ["mesclun", "leaf", "chrysanthemum", "salad", "lettuce", "spinach", "salad", "leaves", "lettuce", "leaves"], "difficult_direct_answer": false, "rationales": ["It is green with chlorophyll which is characteristic of this type of material.", "The green item next to the sandwich looks like leaves of salad greens.", "There is a small serving of leafy greens on the plate next to the sandwich."], "image": "train2014/COCO_train2014_000000365258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511572, "question_id": "3rr6t7LzAYJKkfBCPG87pq", "question": "What is in the bowl?", "choices": ["beef stew", "pasta", "apples", "pizza"], "correct_choice_idx": 1, "direct_answers": ["pasta", "pasta", "pasta", "food", "pasta", "pasta", "noodles", "pasta", "pasta", "pasta"], "difficult_direct_answer": false, "rationales": ["The contents of the bowl is clearly visible and is a color, shape and size consistent with answer a.", "You can see this type of noodle by it's classic color range of green, orange and white and it's wavy texture.", "There is spiral pasta on the bowl."], "image": "val2014/COCO_val2014_000000511572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183972, "question_id": "3wu4TSW5iVYx6ubDw84jZz", "question": "What type of curtains are on the windows?", "choices": ["sheers", "cafe curtains", "blinds", "valances"], "correct_choice_idx": 1, "direct_answers": ["valance", "stop", "curtains", "yellow", "floral", "gingham", "cafe curtains", "yellow", "floral", "short curtain"], "difficult_direct_answer": false, "rationales": ["Because they can be opened and closed easily as the cafe ones.", "Windows have curtains that start at halfway up the window.", "They are typical of small kitchen windows. they are propped by suspension bars which are found with this kind of style."], "image": "train2014/COCO_train2014_000000183972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564545, "question_id": "3y43capqxktv2uwcYuGqPe", "question": "Which animal is most similar to the animal on the right?", "choices": ["manx", "echidna", "egret", "salamander"], "correct_choice_idx": 2, "direct_answers": ["kingfisher", "flamingo", "pelican", "crane", "pelican", "egret", "bird", "stork", "heron", "bird"], "difficult_direct_answer": false, "rationales": ["They are both birds", "The bird belongs to this family.", "A tall crane with long legs stand near an ocean visible in the background."], "image": "train2014/COCO_train2014_000000564545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576564, "question_id": "3yVygNWcP56U3UcYppFxn7", "question": "What color is on the animal in the middle's head?", "choices": ["green", "red", "blue", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "black", "white", "white", "black", "one", "white", "white"], "difficult_direct_answer": false, "rationales": ["Black and white cows are standing together in a pasture.", "The spot on the middle cow's head is not blue, red, or green.", "The animal in the middle has a patch of fur that is void of color which is known by this name."], "image": "val2014/COCO_val2014_000000576564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158823, "question_id": "3zk5RrhYNx4YSd7m395WqG", "question": "What color is the painted line on top of the asphalt pavement?", "choices": ["silver", "blue", "yellow", "red"], "correct_choice_idx": 2, "direct_answers": ["yellow", "blue", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "white"], "difficult_direct_answer": false, "rationales": ["A yellow line runs along the road in front of a bus. yellow lines are painted on streets to mark off areas.", "There is a yellow line painted on the pavement for people to stand behind.", "The line is yellow."], "image": "train2014/COCO_train2014_000000158823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531324, "question_id": "42H5SM8Cy9CQSxMhrVY4QN", "question": "What is in front of the building?", "choices": ["hills", "bicycles", "horses", "cows"], "correct_choice_idx": 1, "direct_answers": ["bikes", "bicycles", "bicycles", "bicycles", "bikes", "bikes", "windows", "bicycle", "bicycles", "bicycles"], "difficult_direct_answer": false, "rationales": ["There are human powered 2 wheel vehicles", "Bikes are out front.", "There are no animals near the building. the area is flat."], "image": "val2014/COCO_val2014_000000531324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86696, "question_id": "42ZWgm98cERByCxQGqVGsA", "question": "What is creating the most oxygen here?", "choices": ["plants", "oxygen tank", "hyperbaric chamber", "cow"], "correct_choice_idx": 0, "direct_answers": ["plants", "room", "plants", "flowers", "plants", "plant", "plants", "tank", "air", "plants"], "difficult_direct_answer": false, "rationales": ["The plants can make oxygen.", "The plants create oxygen.", "The pink bouquets on the ground and tree thing on the table on the right are what absorb carbon dioxide and give off oxygen."], "image": "val2014/COCO_val2014_000000086696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469435, "question_id": "43yFq62YBKTt9bpNpc9SaA", "question": "What is the grey-haired woman doing with her book?", "choices": ["reading", "puzzles", "highlighting", "nothing"], "correct_choice_idx": 1, "direct_answers": ["writting", "write", "writing", "marking", "writing", "writing", "watching", "car", "puzzles", "writing"], "difficult_direct_answer": false, "rationales": ["The woman on the bench with gray hair has a book of crosswords puzzles that she is trying to solve.", "A woman closest to us is doing a crossword.", "The woman with gray hair is working on a crossword puzzle with a pen."], "image": "train2014/COCO_train2014_000000469435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335539, "question_id": "45DC7xVwRmK4vWoCNai6bG", "question": "What style of lamp is the one above the table?", "choices": ["retro style", "candle style", "chandelier", "tiffany style"], "correct_choice_idx": 3, "direct_answers": ["chandelier", "tiffany", "tiffany", "pendant", "round", "tiffany", "tiffany style", "stain glass", "colorful", "cone"], "difficult_direct_answer": false, "rationales": ["That lamp has colors.", "It is colorful and has a unique design.", "It is a tiffany style because it has different colours with the stained class effect."], "image": "val2014/COCO_val2014_000000335539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231632, "question_id": "45iotbbSrBHZMgWpKtvqvy", "question": "What is next to the wooden steps?", "choices": ["railing", "dog", "cat", "egg"], "correct_choice_idx": 0, "direct_answers": ["bookshelf", "bags", "railing", "clothes", "stroller", "jacket", "bookcase", "stroller", "clothes", "jacket"], "difficult_direct_answer": false, "rationales": ["There is a silver railing next to the wooden steps for people to hold while going up or down.l", "There is a metal rail to keep people from falling off the wooden stairs.", "A metal railing lines the side of a staircase."], "image": "train2014/COCO_train2014_000000231632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234569, "question_id": "46BnGvyP2s8LUbJLYew2uf", "question": "What is the kite near?", "choices": ["cat", "clouds", "apple", "baby"], "correct_choice_idx": 1, "direct_answers": ["sky", "beach", "ocean", "beach", "clouds", "beach", "clouds", "water", "beach", "clouds"], "difficult_direct_answer": false, "rationales": ["It is mid air high of the beach shore.", "They are both floating in the sky", "Based on the perspective of the image the objects closest to the kite would be answer a."], "image": "train2014/COCO_train2014_000000234569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478597, "question_id": "46MaDtJvbkg4Uqt5PbLmvX", "question": "Where is this pizza being displayed?", "choices": ["school", "house", "shop", "church"], "correct_choice_idx": 2, "direct_answers": ["window display", "shop", "italy", "restaurant", "pizza", "restaurant", "thick", "pizzeria", "counter", "pizza"], "difficult_direct_answer": false, "rationales": ["There are many different kinds of pizzas displayed on pizza pans. they are on other side of glass display in store to buy.", "Many pizzas can be seen in a storefront.", "It's behind glass and has a price tag"], "image": "train2014/COCO_train2014_000000478597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104046, "question_id": "485UZTFRwneo2EHciqsJjv", "question": "What kind of flag is the blue and yellow one?", "choices": ["swedens flag", "scotlands flag", "germanys flag", "denmarks flag"], "correct_choice_idx": 0, "direct_answers": ["country flag", "flag", "swedish", "sweden", "coundry", "sweden", "sweden", "english", "swedens flag", "country flag"], "difficult_direct_answer": false, "rationales": ["The flag is sweden's.", "A blue flag with a yellow vertical and horizontal line is hanging from a building.", "The blue flag with yellow cross on the building is the flag of sweden"], "image": "train2014/COCO_train2014_000000104046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392472, "question_id": "48JEfcsG5zBZ73EBBFNtYB", "question": "What does the man in the red jacket's patch indicate?", "choices": ["emergency personnel", "police", "fire fighter", "us military"], "correct_choice_idx": 0, "direct_answers": ["paramedic", "medic", "doctor", "cross", "medic", "help", "rescue", "medic", "first aid", "emergency personnel"], "difficult_direct_answer": false, "rationales": ["He is an assistance to the player.", "A man is wearing a coat with a white cross on a red background.", "The white cross surrounded by red on this man's jacket indicate medical training."], "image": "train2014/COCO_train2014_000000392472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127388, "question_id": "48z7qGNUBm49LhX5JECFRH", "question": "What angle is the woman's arm which is holding the frisbee forming?", "choices": ["90 degree", "10 degree", "360 degree", "180 degree"], "correct_choice_idx": 0, "direct_answers": ["elbow", "right", "right angle", "right", "45", "90 degree", "90 degree", "happy", "right angle", "45 degrees"], "difficult_direct_answer": false, "rationales": ["Her arm is forming a right angle.", "The woman holding the frisbee has her arm bent at the elbow in a 90 degree angle.", "A woman has her arm out and a point with her elbow."], "image": "train2014/COCO_train2014_000000127388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99518, "question_id": "49N7wzJNUFJ2hryWaYoye9", "question": "At most how many adults are lined up together in a row down one section of the railing?", "choices": ["two", "four", "one", "three"], "correct_choice_idx": 1, "direct_answers": ["three", "four", "five", "five", "four", "five", "four", "four", "five", "four"], "difficult_direct_answer": false, "rationales": ["There are four adults.", "There are three men and one woman", "There are two sections of rail visible. the section that has more people in a line shows the people visibly and are countable."], "image": "val2014/COCO_val2014_000000099518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578055, "question_id": "49UM7b9QoxdFDQ3Dgo5vNU", "question": "What is above the kite?", "choices": ["egg", "cloud", "airplane", "baby"], "correct_choice_idx": 1, "direct_answers": ["clouds", "sky", "sky", "cloud", "clouds", "sky", "clouds", "clouds", "clouds", "clouds"], "difficult_direct_answer": false, "rationales": ["One can see the white, fluffy structures hanging in the sky.", "The cloud is above.", "They are white and fluffy in the sky"], "image": "train2014/COCO_train2014_000000578055.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136770, "question_id": "49XMfj9Wa8SbyRyjQk7mvb", "question": "What kind of footwear is the person in the white shirt wearing?", "choices": ["adidas", "nike", "skechers", "new balance"], "correct_choice_idx": 1, "direct_answers": ["nike", "sneakers", "tennis shoes", "shoe", "nike", "nike", "sneakers", "shoes", "shoe", "sneakers"], "difficult_direct_answer": false, "rationales": ["There is a classic swoosh on the side that is only only this brand of shoe.", "This brand has a iconic swish logo.", "The person in the white shirt is wearing white nike sneakers with the swoosh logo on them."], "image": "val2014/COCO_val2014_000000136770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229774, "question_id": "4A92u8ukTDCyjii8hJzUTv", "question": "How many people probably use this room?", "choices": ["five", "four", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "one", "two", "two", "one", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are dual sinks that can be used at the same time, plus only a couple of toothbrushes.", "The bathroom has two sinks in the vanity which is likley intended to be used by two people at once.", "There is stuff around both sinks."], "image": "train2014/COCO_train2014_000000229774.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 11702, "question_id": "4AF3isbB9RVYtr6Kxw82KA", "question": "What type of hat is the woman wearing?", "choices": ["ball cap", "conical", "fascinator", "fedora"], "correct_choice_idx": 1, "direct_answers": ["conical", "round", "straw", "bamboo", "sun hat", "non la", "conical hat", "conical hat", "pyramid", "fee"], "difficult_direct_answer": true, "rationales": ["The shape of the hat is clearly visible and has the features consistent with answer a.", "The hat is in the shape of a cone.", "A man is wearing a straw hat that helps protect him from the sun while he is steering a boat in water. the hat resembles a triangular shape that comes to one point at tip."], "image": "train2014/COCO_train2014_000000011702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518365, "question_id": "4CtMSuSAm7YxBANkCP6sRQ", "question": "What is the sharpest item here?", "choices": ["scissors", "unicorn horn", "tusks", "machete"], "correct_choice_idx": 2, "direct_answers": ["tusks", "tusk", "tusk", "tusk", "tusk", "tusk", "tusks", "tusks", "tusk", "elephant tusk"], "difficult_direct_answer": false, "rationales": ["The white items on their faces are sharp.", "Tusks are the only listed items that are visible.", "The tusks are sharp."], "image": "val2014/COCO_val2014_000000518365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70471, "question_id": "4Cy3JPc4EbUbPrWYHPjC4j", "question": "What is the baby attempting to eat?", "choices": ["big toe", "remote control", "apple slices", "hot dog"], "correct_choice_idx": 1, "direct_answers": ["remote", "remote", "remote control", "remote", "tv remote", "remote", "remote", "remote control", "remote control", "sooth teeth"], "difficult_direct_answer": false, "rationales": ["The baby has a remote control in her mouth. she is chewing on it.", "It has buttons for the television", "The baby has a remote control for a tv in its mouth that it's trying to eat."], "image": "val2014/COCO_val2014_000000070471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256196, "question_id": "4DPhxhFR5gFRgtoEiwZ9mx", "question": "What is hanging from the ceiling?", "choices": ["monkeys", "posters", "cats", "lights"], "correct_choice_idx": 3, "direct_answers": ["lights", "lights", "lights", "light", "lights", "light", "lights", "lights", "lights", "lights"], "difficult_direct_answer": false, "rationales": ["Lights are hanging.", "These are pendant fixtures", "The lighting for the entire room can be seen coming down from the top of the room."], "image": "train2014/COCO_train2014_000000256196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494320, "question_id": "4DUvBr5bdG7LGeudGMcDaH", "question": "What era was this invention most related to?", "choices": ["prehistoric era", "industrial revolution", "ancient egypt", "dark ages"], "correct_choice_idx": 1, "direct_answers": ["no idea", "industrial", "19th century", "industrial", "industrial revolution", "1900s", "olden days", "industrial revolution", "industrial", "industrial"], "difficult_direct_answer": false, "rationales": ["The industrial revolution has inventions that hat motors and engines.", "It's the time of industry when trains and other mechanical wonders were made in various areas throughout the world.", "The train is powered by coal."], "image": "val2014/COCO_val2014_000000494320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332202, "question_id": "4DbXRcEzVvQUdCHJG2mGqv", "question": "What is the black and white object to the left of the window?", "choices": ["vent", "iron decoration", "picture", "pan"], "correct_choice_idx": 0, "direct_answers": ["kettle", "basket/plastic", "knife", "vent", "stove", "vent", "cup", "stove", "oven", "stove"], "difficult_direct_answer": false, "rationales": ["The object on the wall is an opening for heat or air conditioning to fill the room.", "There are slats in it to filter out air", "The object is the vent."], "image": "train2014/COCO_train2014_000000332202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244795, "question_id": "4DtqebarGAas8LuCY9HPcp", "question": "What is coming out of the clock?", "choices": ["vapor", "water", "steam", "smoke"], "correct_choice_idx": 2, "direct_answers": ["steam", "steam", "time", "steam", "light", "metal", "time", "smoke", "steam", "chimes"], "difficult_direct_answer": false, "rationales": ["A cloudy, misty area surround clocks on a pole on a sidewalk.", "There is no steam coming out of the clock but it looks like it due to the location near the trees.", "There is white mist at the top of the clock."], "image": "train2014/COCO_train2014_000000244795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400139, "question_id": "4EvWgguUWLuRBxzszac5EX", "question": "What utensil is on the plate?", "choices": ["spoon", "chopstick", "knife", "fork"], "correct_choice_idx": 2, "direct_answers": ["knife", "knife", "bowl", "knife", "knife", "knife", "vegetables", "knife", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["It is a sharp utensil with a black handle.", "The shape of the handle indicates the type.", "The utensil on the plate has a size, shape, and design consistent with answer a, as well as a visible serrated blade."], "image": "val2014/COCO_val2014_000000400139.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88048, "question_id": "4GG5EQswposBRXBDaXLJZ9", "question": "What is on the grass?", "choices": ["statues", "animals", "scarecrows", "dancing seniors"], "correct_choice_idx": 1, "direct_answers": ["horses", "hoarse", "horses", "animals", "horses", "horse", "donkeys", "horses", "horses", "horses"], "difficult_direct_answer": false, "rationales": ["There are some little donkeys on the grass.", "Animals are in the grass.", "A group of horses is grazing in an open area."], "image": "train2014/COCO_train2014_000000088048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123907, "question_id": "4HiPPdznfBDTLeLvfvJ7FW", "question": "What emotion is the woman most likely feeling?", "choices": ["hate", "anger", "fear", "anticipation"], "correct_choice_idx": 3, "direct_answers": ["excited", "anticipation", "excitement", "anticipation", "anticipation", "fatigue", "intensity", "determination", "anticipation", "anticipation"], "difficult_direct_answer": false, "rationales": ["The woman is playing tennis and in a stance as though she is awaiting a serve from the opponent. in this aspect of tennis she is waiting for something to occur and likely experiencing answer a.", "The woman is anticipating the tennis ball.", "This women is waiting and eager to return the serve."], "image": "train2014/COCO_train2014_000000123907.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193271, "question_id": "4LdzJFUfUpjWDeKSS3QbMN", "question": "What is the upright wooden spool in the far corner used for holding?", "choices": ["parchment paper", "toilette paper", "plastic bags", "paper towel"], "correct_choice_idx": 3, "direct_answers": ["paper towels", "pasta", "wood", "paper towels", "paper towel", "paper towel", "fee", "towels", "paper towels", "hotcake"], "difficult_direct_answer": false, "rationales": ["It's used to put paper towels on it.", "It has the roll on it.", "The spool is holding paper towels."], "image": "val2014/COCO_val2014_000000193271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227137, "question_id": "4LfctdykzprB5BFMBesKee", "question": "How is the room being illuminated?", "choices": ["candles", "fan light", "flashlight", "lamp"], "correct_choice_idx": 0, "direct_answers": ["candles", "lighting", "candles", "stop", "candlelight", "candles", "candles", "lightening", "candles", "candle"], "difficult_direct_answer": false, "rationales": ["The candles are lit.", "There is no electrical light that is on but there are multiple items that are lit with a fire.", "There is a soft glow that is often found in this lighting source. you can also see the oblong yellow flame indicating this type of light."], "image": "train2014/COCO_train2014_000000227137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68956, "question_id": "4NNegHoamPtYyq4nJNAgmA", "question": "What type of food does Rita's sell at the bottom of this picture?", "choices": ["pizza", "burgers", "ice cream", "italian"], "correct_choice_idx": 2, "direct_answers": ["ice custard", "ice cream", "ice cream", "ice custard", "good food", "custard", "food", "italian", "custard", "ice cream"], "difficult_direct_answer": false, "rationales": ["The store at the bottom of the tower has a cup of ice cream on the door.", "It says rita's ice cream.", "Rita's has ice cream."], "image": "train2014/COCO_train2014_000000068956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292752, "question_id": "4Q9RBKupB4G5wXyGZkiMQU", "question": "How many sail posts are on top of the large white sailboat?", "choices": ["five", "four", "two", "three"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "one", "four"], "difficult_direct_answer": false, "rationales": ["There are four sails on the boats.", "The boat sailing in the water has four tall masts extending up from the top.", "The boat has four sail post on top of the boat. they are white in color."], "image": "train2014/COCO_train2014_000000292752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163723, "question_id": "4QAPfPuiC2V9rVYntM7qdy", "question": "What number is on the sign?", "choices": ["55", "71", "86", "32"], "correct_choice_idx": 2, "direct_answers": ["86", "86", "eighty six", "86", "8541", "86", "86", "86", "86", "86"], "difficult_direct_answer": false, "rationales": ["The yellow sign next to the airplane has the number 86 in black letters.", "The number 86 is illuminated in orange on the sign.", "A numbered sign is above an airplane."], "image": "train2014/COCO_train2014_000000163723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 354444, "question_id": "4RoSZ2fKRJLEh5ziR47kYC", "question": "What is on top of the car?", "choices": ["monkeys", "birds", "surfboard", "tree limb"], "correct_choice_idx": 1, "direct_answers": ["birds", "seagulls", "birds", "birds", "seagulls", "seagulls", "seagulls", "seagulls", "girl", "seagulls"], "difficult_direct_answer": false, "rationales": ["They have wings and beaks", "The animals on top of the car have beaks, wings, and can fly.", "Birds are on top."], "image": "train2014/COCO_train2014_000000354444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45058, "question_id": "4Rtyf8ZVaVv7tcoCqhbK6i", "question": "What is adorning the pizza?", "choices": ["cherries", "anchovies", "meatballs", "lettuce"], "correct_choice_idx": 3, "direct_answers": ["lettuce", "spinach", "vegetable", "spinach", "vegetables", "basil leaves", "basil leaves", "basil", "spinach", "spinach"], "difficult_direct_answer": false, "rationales": ["The pizza has lettuce.", "There are whole green leaves on top of the pizza.", "There are some green leaves on top of the pizza."], "image": "train2014/COCO_train2014_000000045058.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344902, "question_id": "4Yu93Deue27kKwwg8Eo3gh", "question": "What footwear is usually used here?", "choices": ["dress shoes", "tennis shoes", "cleats", "boots"], "correct_choice_idx": 2, "direct_answers": ["cleats", "shoe", "sports shoe", "shoes", "baseball", "cleats", "cleets", "white", "cleats", "cleats"], "difficult_direct_answer": false, "rationales": ["The baseball player is wearing cleats on his feet while playing.", "Baseball players often wear cleats.", "The setting is a baseball diamond based on the players equipment and the dirt. baseball players are commonly known to wear answer a for improved traction and performance."], "image": "train2014/COCO_train2014_000000344902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13754, "question_id": "4b6mitomNeVeja4sQWYmJf", "question": "What is surrounding the trees in the area so the trees are more giraffe friendly?", "choices": ["wire", "poles", "fence", "dirt"], "correct_choice_idx": 3, "direct_answers": ["walls", "leaves", "dirt wall", "dirt", "tall", "rock", "dirt", "wall", "concrete fence", "dirt"], "difficult_direct_answer": false, "rationales": ["The material is brown. wires, fences, or poles would not make the trees more giraffe friendly.", "Trees grow in dirt.", "This makes it possible for the smaller ones to get to the greenery."], "image": "train2014/COCO_train2014_000000013754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167243, "question_id": "4bDjfDTcvvNTq8JCLGdV3Z", "question": "What establishment is directly across the way from Cafe Wiener?", "choices": ["clothing store", "pub", "cafe", "salon"], "correct_choice_idx": 1, "direct_answers": ["l'amour", "restaurant", "pub", "pizzeria", "italian restaurant", "restaurant", "pub", "l'amore", "cafe", "sctoore"], "difficult_direct_answer": false, "rationales": ["A pub is over there.", "The pub is directly away from cafe winner.", "A man is riding past a cafe with a cup with light brown liquid in it in his hand."], "image": "train2014/COCO_train2014_000000167243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112329, "question_id": "4bfqqcnvJE8mMQzJD3Docd", "question": "What is the largest of the blue numbers on the sign?", "choices": ["98", "15", "77", "12"], "correct_choice_idx": 1, "direct_answers": ["sign", "stop", "59", "15", "11 15", "15", "there", "sign", "fifteen", "fifteen"], "difficult_direct_answer": false, "rationales": ["The other one is 11", "As long as you can read and write you can tell which number is the greatest.", "The sign says eleven and fifteen."], "image": "val2014/COCO_val2014_000000112329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197398, "question_id": "4c6PaRZNZAi5SYTsjcN6DP", "question": "Where is this meal being eaten?", "choices": ["cafe", "home", "school", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["table", "hotel", "restaurant", "restaurant", "restaurant", "pub", "restaurant", "restaurant", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["The meal is at a restaurant.", "Its at a restaurant", "The food is served how a restaurant would serve it."], "image": "val2014/COCO_val2014_000000197398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376757, "question_id": "4dWDzbi7xv7GngmUG5LmxF", "question": "What is next to the vehicle?", "choices": ["egg carton", "traffic cones", "parking meter", "dog"], "correct_choice_idx": 1, "direct_answers": ["people", "airplane", "traffic cones", "wood", "ladder", "steps", "plane", "plane", "plane", "cones"], "difficult_direct_answer": false, "rationales": ["There are orange safety markers around the bottom of the plane", "There are traffic lights by the cars.", "There are orange cone shaped objects on the ground near the airplane."], "image": "train2014/COCO_train2014_000000376757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129439, "question_id": "4dj9rK4K9QQUEKHRPHFfh9", "question": "Who does the person in the foreground resemble most?", "choices": ["jim those", "jens pulver", "maria sharapova", "idris elba"], "correct_choice_idx": 2, "direct_answers": ["maria sharapova", "jennifer lawrence", "player", "pippy longstocking", "woman", "tennis player", "tennis player", "ball boys/girl", "batmeton", "tennis player"], "difficult_direct_answer": false, "rationales": ["A girl with blond hair is playing tennis on a court.", "The person in the foreground is a tennis player. she is white and has blonde hair.", "The person in the foreground is a blonde woman. she is playing tennis."], "image": "train2014/COCO_train2014_000000129439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7559, "question_id": "4eYXkH82HJ2bpfZ6HqMCBj", "question": "Where is this white coffee mug most likely located?", "choices": ["candy store", "coffee shop", "home kitchen", "dentist office"], "correct_choice_idx": 3, "direct_answers": ["sink", "desk", "orthodontist", "dentist", "table", "dentist", "coffee", "bathroom", "kitchen", "dentist office"], "difficult_direct_answer": true, "rationales": ["Dentists offices normally have individually wrapped toothbrushes for their clients to use.", "The cup is full of toothbrushes which is what a dentist gives away. normally, a person only needs one and there are too many in the cup for one person.", "A cup is full of toothbrushes and a basket of toothpastes is next to it."], "image": "val2014/COCO_val2014_000000007559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126486, "question_id": "4fQRSNtVgL2EWmTfQNEFzY", "question": "Where are these animals?", "choices": ["zoo", "plains", "veterinarian", "serengetti"], "correct_choice_idx": 0, "direct_answers": ["zoo", "zoo", "giraffe", "girrafi", "outdoors", "zoo", "zoo animals", "zoo", "zoo", "zoo"], "difficult_direct_answer": false, "rationales": ["They are in a zoo", "There are different kinds of animals. they are in an enclosure of some kind. they are wild animals.", "The animals are in an area that is enclosed. the animals are at a zoo where they are safe."], "image": "train2014/COCO_train2014_000000126486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203416, "question_id": "4ggYJQRDiaEJdDXpYyvef4", "question": "Which city is most likely serving this restaurant?", "choices": ["shanghai", "singapore", "hong kong", "beijing"], "correct_choice_idx": 2, "direct_answers": ["city", "japan", "bsll", "shanghai", "korea", "china", "hong kong", "shanghai", "japan", "city"], "difficult_direct_answer": false, "rationales": ["Hong kong serves chinese food.", "There is asian text on the sign.", "The noodles and soup and writing are typical of southern china which is near this reagion."], "image": "val2014/COCO_val2014_000000203416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374208, "question_id": "4gzTvCSKrEhhiFgLUivjFv", "question": "What two primary colors have to be combined to get the color of the car?", "choices": ["blueyellow", "redyellow", "redblue", "redwhite"], "correct_choice_idx": 1, "direct_answers": ["yellow green", "combined", "red white", "redyellow", "red yellow", "red green", "red yellow", "red orange", "red yellow", "yellow red"], "difficult_direct_answer": false, "rationales": ["It is orange", "The car is orange and these colors are what you mix to get it", "These two colors come together to make this color."], "image": "train2014/COCO_train2014_000000374208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541493, "question_id": "4h58PdJMbiU6T7yuy3VCCX", "question": "What is plugged into the outlet?", "choices": ["nightlight", "hair dryer", "phone", "electric toothbrush"], "correct_choice_idx": 0, "direct_answers": ["toilet", "nightlight", "air freshener", "night light", "air freshener", "nightlight", "sink", "air freshener", "light", "night light"], "difficult_direct_answer": false, "rationales": ["A nightlight is plugged in.", "The nightlight to prevent accident.", "The nightlight is plugged on."], "image": "val2014/COCO_val2014_000000541493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299601, "question_id": "4ieGmz3D6gaJ7DjQFUkv6C", "question": "What color is the twine wrapped around this little bear's neck?", "choices": ["blue", "purple", "orange", "red"], "correct_choice_idx": 2, "direct_answers": ["orange", "orange", "yellow", "orange", "orange", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The twine is not blue, red, or purple.", "The twine is not blue, red, or purple.", "The twine color is clearly visible and identifiable based on the location given in the question."], "image": "val2014/COCO_val2014_000000299601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577893, "question_id": "4k7WYm5YdtKs3R3Bg7GsEA", "question": "What is the athletes last name?", "choices": ["garrett", "jackson", "federer", "jones"], "correct_choice_idx": 2, "direct_answers": ["roger federer", "athletes", "fever", "federer", "agassi", "federer", "andy", "fedderar", "roger", "nadal"], "difficult_direct_answer": true, "rationales": ["A man with dark hair is playing tennis.", "The athlete is recognizable by his style of play, the sport and his general appearance.", "The athlete is a tennis player. his first name is roger."], "image": "val2014/COCO_val2014_000000577893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49877, "question_id": "4kT48RPScYYJRfZqDMZp4W", "question": "What is the number of bananas stored inside of the fruit basket?", "choices": ["five", "six", "three", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["The number in option a matches the number of stored bananas.", "You can count the number of bananas. they are large and long and easy to count.", "The bananas are identifiable by their unique color and shape and are clearly visible and countable."], "image": "train2014/COCO_train2014_000000049877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394583, "question_id": "4kfFBtsiexpUGiuvvZX6jH", "question": "What is dangerous about how the man in the front of the image is riding his bike?", "choices": ["his wheels", "his phone", "his jacket", "bookbag"], "correct_choice_idx": 1, "direct_answers": ["texting", "road", "his phone", "no attention", "looking down", "being careless", "not looking", "not looking", "no helmet", "looking down"], "difficult_direct_answer": false, "rationales": ["The man is on his phone.", "The phone would drop if he is biking.", "The man is looking at the item in his hand instead of the area in front of his bike."], "image": "train2014/COCO_train2014_000000394583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45750, "question_id": "4nGBDiTostVVRJY4UZaFYE", "question": "From what country does ANA hail from?", "choices": ["sweden", "japan", "norway", "france"], "correct_choice_idx": 1, "direct_answers": ["japan", "qatar", "america", "nippon", "japan", "japan", "japan", "no idea", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["Its from japan", "Ana stands for all nippon airways and has headquarters in tokyo.", "According to google it is located in japan."], "image": "train2014/COCO_train2014_000000045750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417474, "question_id": "4pY9tdwexiVf2YswZdUdK5", "question": "How many sheep are standing in the green mossy pasture?", "choices": ["four", "five", "three", "six"], "correct_choice_idx": 1, "direct_answers": ["stop", "five", "five", "five", "five", "six", "watching", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["There are a total of five sheep.", "There are 5 close then one in the distance", "There are five sheep in the pasture."], "image": "train2014/COCO_train2014_000000417474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203975, "question_id": "4qWKXbrCik4CtNv5Eor8ym", "question": "What are the two rectangular baskets on the left counter for?", "choices": ["rinse dishes", "fruits", "decorations", "dish draining"], "correct_choice_idx": 3, "direct_answers": ["drying dishes", "box", "dish draining", "dishes", "drying dishes", "dish drying", "dishes", "drying dishes", "draining", "support"], "difficult_direct_answer": false, "rationales": ["You can see dishes in the basket next to the sink. typically you put dishes in there to dry.", "The baskets are to drain dishes.", "Two wire items are on a kitchen counter next to the sink with dishes in one of them."], "image": "train2014/COCO_train2014_000000203975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219633, "question_id": "4rCwHtbUHhg3knebVVpYv5", "question": "What color is the leash on the dog with the yellow collar?", "choices": ["metal", "yellow", "black", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "brown", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The leash on the dog with a yellow collar is blue.", "From the brown dog closest to us in this picture a blue leash extends.", "You can see the blue leash going to the yellow collar"], "image": "train2014/COCO_train2014_000000219633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311516, "question_id": "4rEX59t6uKDko6xuNoJcdk", "question": "What is on the table?", "choices": ["dog", "apples", "cat", "batteries"], "correct_choice_idx": 3, "direct_answers": ["remote", "batteries", "batteries", "gadget", "phone", "remote batteries", "medical device", "remote/batteries", "meter battery", "batteries"], "difficult_direct_answer": false, "rationales": ["In the corner you can see two batteries.", "The table has batteries.", "The objects have a positive and negative side and are cyllindrical."], "image": "train2014/COCO_train2014_000000311516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483994, "question_id": "4rZmC66NnCMXyLJqZPoc9w", "question": "Which type of sheep is the highest on the stand?", "choices": ["drysdale", "suffolk", "roma", "merino"], "correct_choice_idx": 3, "direct_answers": ["merino", "merino", "merino", "merino", "male sheep", "merino", "merino", "dog", "merino", "ram"], "difficult_direct_answer": false, "rationales": ["The merino sheep is highest.", "The sheep on the highest shelf present here is labelled 'merino'.", "The sign is under the one at the very top that shows what kind it is."], "image": "val2014/COCO_val2014_000000483994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40723, "question_id": "4sPevBf76CYgXohU6pfNzA", "question": "What color are the interior sections of the boats lined up along the beach?", "choices": ["black", "red", "blue", "white"], "correct_choice_idx": 2, "direct_answers": ["blue", "white", "blue", "teal", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["They are blue", "The color is blue.", "The inside of the boats are a turquoise color."], "image": "train2014/COCO_train2014_000000040723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343696, "question_id": "4trUeBUiPb7xkjdrsArzge", "question": "How many goats are contained by this pasture set near the home?", "choices": ["three", "two", "four", "five"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "zero", "three", "three", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["One goat is standing in front of two other goats.", "Two goats are standing behind another goat.", "Two goats are readily visible in this picture and one is slightly obscured by the fence."], "image": "train2014/COCO_train2014_000000343696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310440, "question_id": "4tzfi32eFwaMvmkTZR77Hi", "question": "How many tusks should the elephant have who is walking toward the camera?", "choices": ["one", "two", "one half", "zero"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "one", "two", "two", "two", "one", "one", "two", "one"], "difficult_direct_answer": false, "rationales": ["There are two tusks.", "An elephant has a pair.", "Each elephant has two tusks."], "image": "train2014/COCO_train2014_000000310440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459786, "question_id": "4wEVPBZ8hkXm74PeFzs8Gc", "question": "What are the decorations made of?", "choices": ["candy canes", "plants", "gnomes", "paper airplanes"], "correct_choice_idx": 1, "direct_answers": ["flowers", "flowers", "flowers", "flowers", "flowers", "flowers", "attractive", "flowers", "flowers", "plants"], "difficult_direct_answer": false, "rationales": ["It's hard to say if they're live or fake.", "This bathroom is decorated with flowers and palms both members of the plant family.", "The mirrors are adorned with floral garlands."], "image": "val2014/COCO_val2014_000000459786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343823, "question_id": "4wuaq89i58xMcdQHjZvjni", "question": "What color is the grass stalks where the sheep are walking through?", "choices": ["red", "orange", "green", "blue"], "correct_choice_idx": 1, "direct_answers": ["brown", "tan", "brown", "yellow", "beige", "brown", "brown", "yellow", "yellow", "orange"], "difficult_direct_answer": false, "rationales": ["There's some of b as well in the foreground and likely mixed in with the sheep, but a is closest.", "They are in a field that looks like hay.", "Grass has chlorophyll which has this characteristic color."], "image": "train2014/COCO_train2014_000000343823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267116, "question_id": "4yxvpn2rRLAVWVrgd6K5QW", "question": "What are the people in the front holding?", "choices": ["controllers", "puppies", "umbrellas", "kittens"], "correct_choice_idx": 0, "direct_answers": ["game controllers", "controller", "controllers", "controllers", "game controllers", "wii remote", "remote", "game modules", "game controllers", "controllers"], "difficult_direct_answer": false, "rationales": ["The people have controllers.", "The people in the front are playing a video game which is controlled by these.", "They are playing video games"], "image": "train2014/COCO_train2014_000000267116.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110204, "question_id": "4zHxH8moSovf6xAfyBk5BS", "question": "What color is the wall to the right of the refrigerator unit?", "choices": ["orange", "green", "blue", "red"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "blue", "white", "blue", "blue", "silver", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["This color is on the side of the fridge as you leave the kitchen", "A refrigerator is sitting with a white wall behind it and a blue wall to the side.", "It's the color blue."], "image": "train2014/COCO_train2014_000000110204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527908, "question_id": "4zVokJP8duwZjuwjQNEvD4", "question": "What color is the dog's neck collar?", "choices": ["red", "white", "purple", "green"], "correct_choice_idx": 2, "direct_answers": ["purple", "black", "black", "black", "black", "white", "white", "purple", "white", "black"], "difficult_direct_answer": false, "rationales": ["The color is purple.", "The dog is wearing a purple collar.", "The dog's collar is not green, red, or white."], "image": "train2014/COCO_train2014_000000527908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249993, "question_id": "52cYqciELuH9ztv26avFch", "question": "What color are the inserts in the black-faced sheep ears?", "choices": ["blue", "purple", "yellow", "green"], "correct_choice_idx": 2, "direct_answers": ["white", "sheep", "pink", "white", "white", "white", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The black-faced sheep have inserts in their ears that are yellow.", "This is the color of the tags hanging from the ears.", "The tags are bright color so they can be seen"], "image": "train2014/COCO_train2014_000000249993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29525, "question_id": "53p5DWs2t7caWsJ7D9WtsK", "question": "What is near the car?", "choices": ["bison", "museum", "bicycles", "apple pie"], "correct_choice_idx": 2, "direct_answers": ["bikes", "fence", "bridge", "bicycles", "bike", "bike", "bridge", "cycle", "cycle car", "man"], "difficult_direct_answer": false, "rationales": ["There are people pedaling these vehicles", "Bicycles are near the car and kids are riding them.", "Many people on bikes are in and around a street."], "image": "train2014/COCO_train2014_000000029525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490081, "question_id": "53tcGPyfb69tLHJxLVZkGB", "question": "What kind of animal is the cart pulled by?", "choices": ["goat", "ox", "cow", "horse"], "correct_choice_idx": 1, "direct_answers": ["cow", "cows", "oxen", "cows", "cattle", "oxen", "oxen", "oxen", "oxen", "ox"], "difficult_direct_answer": false, "rationales": ["These ox are pulling the cart.", "The ox is pulling.", "The only animals in the picture are oxen."], "image": "val2014/COCO_val2014_000000490081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107204, "question_id": "54hZoBeB4h8hdJ9X6cCJRZ", "question": "What event is happening?", "choices": ["football game", "rodeo", "baseball game", "interview"], "correct_choice_idx": 3, "direct_answers": ["awards", "interview", "party", "formal", "interview", "black tie", "interview", "interview", "wedding", "oscars"], "difficult_direct_answer": false, "rationales": ["The man is wearing a tuxedo. a second person is holding a microphone up to him.", "The man is speaking into a microphone.", "This man is speaking to a person holding a microphone, typically media personal use these to record conversations that they share with others."], "image": "train2014/COCO_train2014_000000107204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78059, "question_id": "55HdtPDz8Qz45iEbGMaEn6", "question": "What is most likely outside the doorway?", "choices": ["bedroom", "living room", "kitchen", "garage"], "correct_choice_idx": 0, "direct_answers": ["bedroom", "bedroom", "toilet", "window", "bedroom", "window", "window", "grass", "toilet", "bedroom"], "difficult_direct_answer": false, "rationales": ["There is likely a bedroom attached to this bathroom.", "The doorway leading out of the bathroom probably goes to the master bedroom", "This type of bathroom is a master one. unless this is located in an extended stay hotel, a is the best answer."], "image": "train2014/COCO_train2014_000000078059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579123, "question_id": "56Keyt9ydqmzverKnySpaX", "question": "What is the person all the way to the right holding?", "choices": ["baby", "pumpkin", "egg", "camcorder"], "correct_choice_idx": 3, "direct_answers": ["video camera", "video camera", "camera", "camera", "camera", "camera", "video camera", "camcorder", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The person has a camcorder.", "The person is filming a video.", "The person all the way to the right is holding a big camera."], "image": "train2014/COCO_train2014_000000579123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351566, "question_id": "5742n2Ft3KnGtu6u7ym3u2", "question": "What color is the cute animal's little nose?", "choices": ["pink", "black", "white", "brown"], "correct_choice_idx": 0, "direct_answers": ["pink", "pink", "pink", "pink", "black", "pink", "pink white", "pink", "pink", "white"], "difficult_direct_answer": false, "rationales": ["You can see the color and it is consistant with the color of cat's noses in general.", "Kitties usually have cute little pink noses.", "The cat has a light rose colored nose."], "image": "train2014/COCO_train2014_000000351566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307995, "question_id": "5AoDp6xFdz9KseMAgWcAb2", "question": "What is next to the statue?", "choices": ["baby", "wine glass", "egg", "monkey"], "correct_choice_idx": 1, "direct_answers": ["wine glass", "wine", "wine glass", "wine glass", "wine glass", "wine", "wine glass", "wine glass", "wine glass", "glass"], "difficult_direct_answer": false, "rationales": ["There's a wine glass next to the cat statue.", "There is a bottle with a glass and stem at bottom", "This is obvious given the wine bottle. this type of glass is also known as a short stem and beverageware."], "image": "val2014/COCO_val2014_000000307995.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86334, "question_id": "5BBq4z5SfPCtXRvLKVKLrk", "question": "What does this train primarily carry?", "choices": ["steel", "coal", "passengers", "cars"], "correct_choice_idx": 2, "direct_answers": ["people", "people", "passengers", "passengers", "people", "passenger", "people", "passengers", "people", "people"], "difficult_direct_answer": false, "rationales": ["It has windows along both sides and is stopping for the person waiting on the platform.", "The train on the tracks is a passenger train that carries travelers and commuters.", "A train with windows and enclosed carts moves along the tracks. passenger trains are enclosed rather than having open carts."], "image": "val2014/COCO_val2014_000000086334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136093, "question_id": "5BbusmbJJCkjYUXWj7zJms", "question": "How many elephants are taking a bath in the big river with people on their backs?", "choices": ["two", "five", "four", "three"], "correct_choice_idx": 0, "direct_answers": ["one", "two", "two", "two", "two", "one", "two", "two elephants", "two", "two"], "difficult_direct_answer": false, "rationales": ["The elephants are fully visible and countable.", "There are two elephants bathing.", "The elephants are visible and countable based on their unique outlines."], "image": "train2014/COCO_train2014_000000136093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56240, "question_id": "5Ckb8idmMp9nK9sDyzU55n", "question": "What is on top of the refrigerator?", "choices": ["egg carton", "cat", "dog", "potted plant"], "correct_choice_idx": 3, "direct_answers": ["vase", "plant", "plant", "plant", "potted plant", "plant", "plant", "plant", "plant", "plant"], "difficult_direct_answer": false, "rationales": ["The refrigerator in the kitchen has a green potted plant on top of it.", "There is greenery in a container", "There is a small potted plant on top of the refrigerator. it has green leaves and is in a pot."], "image": "train2014/COCO_train2014_000000056240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299632, "question_id": "5D3WdQDLYXPMJeFAdrbXLH", "question": "How many portraits are hung on the sides of the walls?", "choices": ["three", "two", "four", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is a portrait handing on the left side of the room wall.", "There is one portrait.", "There is a single portrait on the wall."], "image": "train2014/COCO_train2014_000000299632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475796, "question_id": "5D9nYZyH4VZzmjTmaiP8zi", "question": "What color is the light shown on top of the traffic lights of College Avenue?", "choices": ["blue", "yellow", "red", "green"], "correct_choice_idx": 1, "direct_answers": ["orange", "yellow", "red", "yellow", "yellow", "yellow", "yellow", "yellow", "red", "red"], "difficult_direct_answer": false, "rationales": ["The colour of the light is the same as the thing around it, which is yellow.", "The top lights are shown as yellow.", "The light is not red or green."], "image": "train2014/COCO_train2014_000000475796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213740, "question_id": "5DH5J2m2FukC4ez4CYbF2x", "question": "How many eggs are served beside the hash browns in this breakfast plate?", "choices": ["three", "four", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two eggs on the top of the plate.", "There are two eggs.", "A double dose of fried eggs is presented on this breakfast plate along with beans and sausage and sauteed mushrooms."], "image": "train2014/COCO_train2014_000000213740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292140, "question_id": "5EDByu7YCwryVbVKXFcrJh", "question": "How many elephants are walking on top of the dirt walk?", "choices": ["two", "four", "five", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "3 elephants", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three elephants walking in line on top of the dirt.", "Three elephants are walking.", "There are two large ones and a baby"], "image": "val2014/COCO_val2014_000000292140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368661, "question_id": "5ERYrGvhq9yNtzY7N7Wff3", "question": "What are the cows in the foreground near?", "choices": ["fence", "baby", "hay", "kitten"], "correct_choice_idx": 0, "direct_answers": ["fence", "fence", "fence", "fence", "dead", "fence", "car", "fence", "fence", "grass"], "difficult_direct_answer": false, "rationales": ["The metal wires extending from wooden posts here visible make up the fence preventing these cows from wandering off.", "The cows are by the fence.", "Cows are in the pasture behind poles with rows of wires running horizontally."], "image": "train2014/COCO_train2014_000000368661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317391, "question_id": "5EVWo2car3VWNg75UEF2oX", "question": "How many giraffes are standing under the tree eating leaves?", "choices": ["one", "three", "four", "two"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "four", "two", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four giraffes standing under the tree. they are easy to count.", "The giraffes also feed on leaves on trees as they are very tall.", "There are four giraffes standing close together under the big tree."], "image": "train2014/COCO_train2014_000000317391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132141, "question_id": "5FhnVW6Mwzkgdsy46RkU5F", "question": "What is the person in the foreground hovering over?", "choices": ["ramp", "car", "rooftop", "baby"], "correct_choice_idx": 0, "direct_answers": ["ramp", "skating", "ramp", "ramp", "ramp", "ramp", "ramp", "ramp", "skateboard", "ramp"], "difficult_direct_answer": false, "rationales": ["The ramp is used to go up.", "The person in the foreground is jumping over the ramp.", "The person in the foreground is hovering over a ramp."], "image": "train2014/COCO_train2014_000000132141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158556, "question_id": "5Hogkydd4JQTyJoXVXWQ4K", "question": "What is the type of plant in the planter called?", "choices": ["spiral tree", "umbrella plant", "fern", "bonsai"], "correct_choice_idx": 0, "direct_answers": ["topiary", "flower", "bush", "bush", "topiary", "spiral tree", "tree", "tree", "stop", "plants"], "difficult_direct_answer": false, "rationales": ["It turns tight circles as it grows.", "The plant is a spiral tree.", "A bush is in a pot on a patio and is trimmed into the shape of a corkscrew."], "image": "train2014/COCO_train2014_000000158556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496696, "question_id": "5L3QFMCsAhtuQ2qMNvFUn5", "question": "What do many of these animals have?", "choices": ["horns", "wings", "quills", "talons"], "correct_choice_idx": 0, "direct_answers": ["horns", "horns", "horns", "horns", "horns", "horns", "horns", "horns", "thirty", "horns"], "difficult_direct_answer": false, "rationales": ["The male species of this animal has horns.", "Many of the animals in the herd have long and pointy horns on their heads.", "They have horns."], "image": "train2014/COCO_train2014_000000496696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351550, "question_id": "5LMqwufxnJUWyeRgUNJHEQ", "question": "What movie logo can be seen at the top right hand side of the computer?", "choices": ["old yeller", "black beauty", "cursed", "pans labyrinth"], "correct_choice_idx": 2, "direct_answers": ["cursed", "scary movie", "scream", "avatar", "scream", "uncertain", "scream", "grudge", "scream", "edward scissorhands"], "difficult_direct_answer": false, "rationales": ["I can't tell from the small, fuzzy image, but it's the most obvious answer given the other options. according to google, this is the correct answer.", "It shows a woman with a scar.", "It has a picture of the girl in it"], "image": "train2014/COCO_train2014_000000351550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245497, "question_id": "5MRCDTzXHowksABbFwmQgd", "question": "Who is known for doing this activity?", "choices": ["maria sharapova", "bam margera", "idris elba", "david ortiz"], "correct_choice_idx": 1, "direct_answers": ["skater", "tony hawk", "tony hawk", "skater boys", "tony hawk", "tony hawk", "tony hawk", "bam margera", "tony hawk", "skateboarding"], "difficult_direct_answer": false, "rationales": ["He is a professional skateboarder, best known for being a former member of the jackass crew.", "The person is on a skateboarder.", "The person is skateboarding, not playing tennis, acting, or playing baseball."], "image": "val2014/COCO_val2014_000000245497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396224, "question_id": "5RkVirSCV3w9TEv3PM8irM", "question": "What kind of meat is served with the dinner at this restaurant?", "choices": ["salmon", "turkey", "bear", "chicken"], "correct_choice_idx": 3, "direct_answers": ["turkey", "turkey", "chicken", "chicken", "dinner", "fork", "turkey", "chicken", "turkey", "turkey"], "difficult_direct_answer": false, "rationales": ["A large slice of white breast meat is on a plate presented to eat.", "That type of meat is usually white and shaped like that. it's usually eaten with rolls.", "Because the meat is part of the chicken breast."], "image": "val2014/COCO_val2014_000000396224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293658, "question_id": "5SA5adwJ4V5ucgd84bQ2Lw", "question": "What make is the blue parked car?", "choices": ["saturn", "yugo", "honda", "ford"], "correct_choice_idx": 2, "direct_answers": ["honda", "honda", "unkown", "accident", "honda", "fiat", "ride", "sedan", "steel", "honda"], "difficult_direct_answer": false, "rationales": ["You can see the \"h\" emblem on the front of the car", "The cars hood as a letter h on front of it.", "I'm not really sure if this is correct. i don't know cars well and you can't see what kind of car it is."], "image": "train2014/COCO_train2014_000000293658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475529, "question_id": "5TKh6S9KLWV2CaPijuMpUW", "question": "What do the flashing lights indicate on this vehicle?", "choices": ["bad weather", "fire", "crime", "parade"], "correct_choice_idx": 1, "direct_answers": ["emergency", "fire", "danger", "emergency", "emergency", "emergency", "emergency vehicle", "emergency", "siren", "emergency"], "difficult_direct_answer": false, "rationales": ["The lights indicate fire.", "The flashing lights on the red truck indicate that they are going to put out a fire.", "A firetruck will turn on all the bells and whistles when rushing to an emergency."], "image": "val2014/COCO_val2014_000000475529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309000, "question_id": "5U8tWb7fgqre9WYqCfixUM", "question": "What color is reflected off the water around the sun?", "choices": ["purple", "brown", "white", "blue"], "correct_choice_idx": 0, "direct_answers": ["purple", "nice", "yellow", "purple", "red", "car", "purple", "purple", "yellow", "pink"], "difficult_direct_answer": false, "rationales": ["You can see the color and it is typical of sunset when the suns rays are bent.", "The water looks purple in its reflection.", "There is purple light reflected off the water surface."], "image": "val2014/COCO_val2014_000000309000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44820, "question_id": "5VuHK8wbZPGa9ppuLKAHck", "question": "What kind of companies are being advertised here?", "choices": ["airline", "car", "bank", "computer hardware"], "correct_choice_idx": 0, "direct_answers": ["airline", "airline", "airline", "olympus", "nike", "camera", "airlines", "photography", "airlines", "fefef"], "difficult_direct_answer": false, "rationales": ["They are advising you to fly.", "The other options aren't on the blue background. the \"fly\" makes the transportation obvious.", "The \"fly\" indicates that this is the answer."], "image": "train2014/COCO_train2014_000000044820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391895, "question_id": "5Xi8zXCP8XuSy4RzNweR29", "question": "How do you know this is not the USA?", "choices": ["signage", "animals", "foliage", "license plates"], "correct_choice_idx": 3, "direct_answers": ["license plate", "license plate", "canada", "license plate", "license plate", "license plate", "mountains", "license plates", "mountains", "number plates"], "difficult_direct_answer": false, "rationales": ["The plate is not from usa.", "A motorcycle has a plate with two rows of numbers rather than one row with numbers and letters.", "There are no signs or animals, and the foliage consists of trees and other plants that could exist in the usa. the identification tag on the back of the motorcycle is not american."], "image": "val2014/COCO_val2014_000000391895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3091, "question_id": "5ZDePpGAgvH26pUVw9XYTb", "question": "What is drizzled over the cake?", "choices": ["fudge", "cream cheese", "water", "oil"], "correct_choice_idx": 0, "direct_answers": ["chocolate", "chocolate syrup", "chocolate syryup", "honey", "fudge", "chocolate syrup", "chocolate", "chocolate syrup", "chocolate syrup", "chocolate"], "difficult_direct_answer": false, "rationales": ["The cake on the plate is covered with chocolate fudge.", "That is a type of chocolate flavor that you put on desserts.", "Fudge is on the cake."], "image": "val2014/COCO_val2014_000000003091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275565, "question_id": "5ZhHXa3rrby5Kpx6p3K36D", "question": "What are the bread items being cooked?", "choices": ["bagels", "rye bread", "sourdough", "french bread"], "correct_choice_idx": 0, "direct_answers": ["bagels", "bagel", "bagels", "bagels", "bagels", "bagel", "bagels", "oven", "bagels", "bagels"], "difficult_direct_answer": false, "rationales": ["The bread is bagels.", "There are bagels cooked inside the oven.", "The woman is baking round bagels in the oven on the trays."], "image": "train2014/COCO_train2014_000000275565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13452, "question_id": "5aAjhcg4AnBCPdLbmNYeyw", "question": "How many eggs are served in this breakfast overeasy?", "choices": ["four", "three", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two eggs served at the bottom of the breakfast.", "This is obvious by just counting the number on the plate. these are also known as soppy or sloppy eggs.", "There are two eggs."], "image": "train2014/COCO_train2014_000000013452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16697, "question_id": "5cGbbfF3uaZzDGfZNC2iyv", "question": "What do the cows have?", "choices": ["crowns", "horns", "baseball caps", "ear tags"], "correct_choice_idx": 3, "direct_answers": ["tags", "stand", "car", "gig", "ear tags", "tags", "tags", "tags", "tags", "tags"], "difficult_direct_answer": false, "rationales": ["The cows are tagged.", "The cows do not have any of the other answers visible and all of them have answer a.", "They have tags on their ears for identification"], "image": "train2014/COCO_train2014_000000016697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505213, "question_id": "5cX8wyFgV2LZ9a53GE4vph", "question": "How many sausages are contained by the hot dog bun held by this man?", "choices": ["four", "two", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "one", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two large sausages on the bun that the man is holding in his hand.", "There are two", "One is on the bottom and the other is layered on top of it"], "image": "val2014/COCO_val2014_000000505213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 977, "question_id": "5dUUCffB6YmBKwENKB7Mh4", "question": "How many kitties are laying around on top of the couch?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 0, "direct_answers": ["three kitties", "two", "three", "stop", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three kitties.", "A group of felines is playing on a piece of furniture.", "There are 3"], "image": "train2014/COCO_train2014_000000000977.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27710, "question_id": "5eH2ogAh4EWmF3PKJzCwh9", "question": "What color is the bull standing in the field of white cows?", "choices": ["black", "purple", "red", "brown"], "correct_choice_idx": 3, "direct_answers": ["brown", "brown", "brown", "brown", "grass", "brown color", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["There is a brown bull standing amount white cows among a field.", "The bull is not black, red, or purple.", "The cows are white but you can see that the bull is brown."], "image": "train2014/COCO_train2014_000000027710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295889, "question_id": "5gypCYCe8kASKS34c9yBnz", "question": "What is the black item on the counter?", "choices": ["microwave", "tabletop stove", "coffee machine", "computer"], "correct_choice_idx": 1, "direct_answers": ["stove", "stove", "tabletop stove", "stove", "portable stove", "stove", "stove", "stove", "stove", "tv"], "difficult_direct_answer": false, "rationales": ["These are common in kitchens like this.", "There is a small ball device with a coffee maker on it. this is used to heat up coffee.", "The stove is a loose and portable cooking surface."], "image": "train2014/COCO_train2014_000000295889.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39399, "question_id": "5iVHeWJJJkAh2fCmmwPTmG", "question": "What color is the top of the picnic bench painted all up like?", "choices": ["yellow", "blue", "red", "gray"], "correct_choice_idx": 3, "direct_answers": ["brown", "silver", "grey", "grey", "gray", "gray", "brown", "gray", "grey", "grey"], "difficult_direct_answer": false, "rationales": ["The top of the picnic bench is painted a light gray", "It my have been painted b as well and faded over time. it's hard to tell from this image.", "The shade of color is an off-white compared to regular white."], "image": "val2014/COCO_val2014_000000039399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360370, "question_id": "5idNqfjoKKViUHtpEVm8Sa", "question": "What is the largest number in the cow in the foreground's collar?", "choices": ["seven", "six", "four", "nine"], "correct_choice_idx": 3, "direct_answers": ["nine", "nine", "934", "four", "dwdw", "nine", "nine", "nine", "nine", "nine"], "difficult_direct_answer": false, "rationales": ["There are three numbers. three and four are the smaller two numbers.", "Nine is a larger number than 3 or 4, the other numbers on the collar.", "The other numbers are 3 and 4"], "image": "train2014/COCO_train2014_000000360370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118544, "question_id": "5ih9TGiirdnDQAsuJrrdMa", "question": "What can you win?", "choices": ["television", "car", "desktop", "laptop"], "correct_choice_idx": 3, "direct_answers": ["person", "laptop", "laptop", "laptop", "money", "laptop", "laptop", "laptop", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["The laptop is a prize.", "An amd laptop is listed as a prize.", "There is a banner that says the prize."], "image": "val2014/COCO_val2014_000000118544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447861, "question_id": "5iiFkKrVfhLetFkVpuzoEH", "question": "Which part of this meal has a small portion?", "choices": ["meat", "chips", "bread", "veggies"], "correct_choice_idx": 1, "direct_answers": ["chips", "fruit", "chips", "orange", "orange", "middle", "orange", "chips", "chips", "chips"], "difficult_direct_answer": false, "rationales": ["There's some chips on the plate.", "There's only two of this item on the plate.", "The chips are smallest."], "image": "val2014/COCO_val2014_000000447861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370124, "question_id": "5iz4dnVvVr7TXQVXzGLGu6", "question": "Why is the man wearing an orange vest?", "choices": ["warmth", "fashion", "protection", "visibility"], "correct_choice_idx": 3, "direct_answers": ["for work", "safety", "safety", "conductor", "visibility", "worker", "safety", "train worker", "railway worker", "safety"], "difficult_direct_answer": false, "rationales": ["The vest is a bright color so he can be seen even at night.", "It is a safety vest.", "The man is near a train. the orange vest makes it easier for the engineer to see him."], "image": "train2014/COCO_train2014_000000370124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199639, "question_id": "5msU69GuB6BaxJ9Zc4aV2j", "question": "What is the black object behind the guy's head?", "choices": ["headrest", "sign", "laptop", "speakers"], "correct_choice_idx": 0, "direct_answers": ["headrest", "computer chair", "chair", "stapler", "head rest", "seat", "chair", "head rest", "headrest", "plate"], "difficult_direct_answer": false, "rationales": ["He is sitting on an office chair, so the back part of the chair is common on desk chairs in an office.", "The object is visible and in a position where the most appropriate application would be answer a and none of the other answers on the list.", "A is part of a desk or gaming chair."], "image": "train2014/COCO_train2014_000000199639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133061, "question_id": "5nC599TaR47nJfXUPqCv6Q", "question": "What is on the animal to the right?", "choices": ["ribbon", "crown", "baby", "rope"], "correct_choice_idx": 3, "direct_answers": ["rope", "halter", "rope", "rope", "cow", "cow", "rope", "cow", "cow", "ropes"], "difficult_direct_answer": false, "rationales": ["The animal is by rope.", "The adult animal to the right is tied up. it is not wearing a crown or ribbon.", "A rope is used to tie the cattle in the in the post."], "image": "val2014/COCO_val2014_000000133061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302982, "question_id": "5oQu8BPeMM7bSSeCt6NhwY", "question": "How many elephants are there?", "choices": ["seven", "eight", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["three", "some", "three", "three", "three", "three elephants", "three", "four", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three elephants standing in the shade on the bricks.", "There are two elephants standing near each other and one standing further away in the background.", "There are a few being depicted in photo."], "image": "train2014/COCO_train2014_000000302982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71043, "question_id": "5q5A9FKdzrvKnkj9usdRk5", "question": "What is the player in the foreground hoping to accomplish?", "choices": ["homerun", "tko", "goal", "touchdown"], "correct_choice_idx": 0, "direct_answers": ["homerun", "hit back", "ok", "hit ball", "hitting ball", "hit ball", "cricket", "swing", "basketball", "bat"], "difficult_direct_answer": true, "rationales": ["He has the bat up ready to hit a ball as far as he can", "A child is batting on a baseball field.", "He is playing batter in baseball. the goal is to hit the ball far enough to run to all four bases, called a homerun."], "image": "train2014/COCO_train2014_000000071043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83147, "question_id": "5qh9A5RRqRZXRHFapYmxeJ", "question": "What color is the roofing material on the top of this clocktower of the church?", "choices": ["red", "blue", "white", "green"], "correct_choice_idx": 0, "direct_answers": ["brick", "red", "red", "red", "red", "stone", "red", "red", "orange", "brown"], "difficult_direct_answer": false, "rationales": ["The bricks on top of this clocktower are red.", "The roof is colored red.", "A roof is a rusty red color against a gray building."], "image": "train2014/COCO_train2014_000000083147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232648, "question_id": "5qieewJoeg8pLe8LbLjZgW", "question": "What color is the tissue box on the back of the toilet bowl?", "choices": ["blue", "red", "pink", "green"], "correct_choice_idx": 2, "direct_answers": ["pink", "purple", "purple", "pink", "magenta", "pink", "white purple", "white", "purple", "lavender"], "difficult_direct_answer": false, "rationales": ["The color is pink.", "The tissue box on the toilet is pink.", "The box is colored pink."], "image": "train2014/COCO_train2014_000000232648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348091, "question_id": "5r9yoQygMKosayiY8SkDQJ", "question": "What color is the object in this room that is used to shield you from rain?", "choices": ["white", "black", "pink", "brown"], "correct_choice_idx": 2, "direct_answers": ["pink", "red", "pink", "pink", "pink", "no image", "pink", "no image", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["The umbrella in the corner of the room that is used for rainy days is bright pink.", "The umbrella is pink.", "There is a pink umbrella."], "image": "val2014/COCO_val2014_000000348091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248701, "question_id": "5sFGMrdER7ueJugFqWwYeW", "question": "What color is the backpack worn by the skier with the white snow suit?", "choices": ["blue", "orange", "green", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "red", "brown", "red", "red", "good", "red", "brown", "white"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The color is easily visible and bright. it is in sharp contrast to the white snow.", "The color is clear in the picture against the white clothes."], "image": "val2014/COCO_val2014_000000248701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25245, "question_id": "5uKyFNWW2CwM2N2CLrjhH3", "question": "Who is holding a leather briefcase?", "choices": ["old woman", "circus clown", "lemur", "old man"], "correct_choice_idx": 3, "direct_answers": ["businessman", "man", "first man", "man", "old man", "buisness man", "man", "man", "right man", "man"], "difficult_direct_answer": false, "rationales": ["There is someone holding a briefcase. a white beard and balding person.", "The person holding the brief case has short gray hair which is thinning in the back.", "They have grey hair and small luggage"], "image": "train2014/COCO_train2014_000000025245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193384, "question_id": "5uYsPhpiPt3x7Et5gv7im8", "question": "What is this person making?", "choices": ["cake", "smores", "brownies", "brownies"], "correct_choice_idx": 1, "direct_answers": ["cake", "smores", "smores", "cake", "cake", "smores", "brownies", "dessert", "cake", "smores"], "difficult_direct_answer": false, "rationales": ["The person is making smores.", "The person is spreading marshmallows on a tray of chocolate and graham crackers.", "They have chocolate down already and are scooping out marshmallow fluff"], "image": "train2014/COCO_train2014_000000193384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50655, "question_id": "5vezdxtC3PCaVjcGjRaaRf", "question": "What is on the top of the toilet tank?", "choices": ["flusher", "toilet paper", "newspaper", "towel"], "correct_choice_idx": 0, "direct_answers": ["tank lid", "candle", "flusher", "soap", "cover", "tissue paper", "air deodorizer", "button", "flusher", "flusher"], "difficult_direct_answer": false, "rationales": ["On the top of the tank is a button that is pushed to empty the bowl and refill it with fresh water.", "The top is the flusher.", "It is used to release water after a person uses the toilet in order to flash out waste."], "image": "val2014/COCO_val2014_000000050655.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310989, "question_id": "5vw7saenRMcjAJ8bKCBJ69", "question": "What nation's flag is painted onto the front of this airplane?", "choices": ["usa", "france", "germany", "uk"], "correct_choice_idx": 1, "direct_answers": ["united states", "country", "australia", "france", "france", "america", "france", "usa", "usa", "stop"], "difficult_direct_answer": false, "rationales": ["Red, white, and blue stripes are painted across the front of an aircraft.", "The flag is from france.", "The flag is french."], "image": "train2014/COCO_train2014_000000310989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458282, "question_id": "5x5nPAZ23VcvqqVrN4JDth", "question": "What is the corded device called that's on the wall?", "choices": ["hair dryer", "mixer", "tv", "wall phone"], "correct_choice_idx": 0, "direct_answers": ["drier", "hair dryer", "dryer", "hair dryer", "hair dryer", "hairdryer", "dryer", "blowdryer", "feef", "heater"], "difficult_direct_answer": false, "rationales": ["Hair dryers often are stored on hotel room walls and have curly cords.", "The device is used for drying hair.", "The item has vents to suck in air to blow out warm air blow on wet things."], "image": "train2014/COCO_train2014_000000458282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542510, "question_id": "5yxopyS3eQioAB4eTApqPe", "question": "What color is the main body of the cart pulled by this guy?", "choices": ["red", "orange", "wood", "blue"], "correct_choice_idx": 2, "direct_answers": ["beige", "brown", "brown", "brown", "brown", "yellow", "wood", "brown", "brown", "tan"], "difficult_direct_answer": false, "rationales": ["The beige brown color of the cart being pulled identifies it as wood.", "The main material of the wagon is wood.", "The other options don't match. that said, this is more like an oak or beech color."], "image": "val2014/COCO_val2014_000000542510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155993, "question_id": "5zLK6qvmPbZmgHP6zQdyGd", "question": "What is on the rack sitting on the bathtub's edge?", "choices": ["conditioner", "soap", "shampoo", "wine"], "correct_choice_idx": 3, "direct_answers": ["wine", "bathtub", "wine bottle", "towel", "wine", "champagne", "champagne", "wine", "fee", "champagne"], "difficult_direct_answer": false, "rationales": ["There is a glass bottle on the bathtub's edge. it contains alcohol, not soap, shampoo, or conditioner.", "There is a bottle of alcohol on the bathtub edge.", "There is a tray with wine on the edge of the bathtub."], "image": "train2014/COCO_train2014_000000155993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78813, "question_id": "5zoVEqkJBjwq6jbQcQQNMk", "question": "What is the man with the helmet on wearing?", "choices": ["mask", "bowtie", "necklace", "sunglasses"], "correct_choice_idx": 3, "direct_answers": ["helmet", "yes", "helmet", "vest", "vest", "sun glasses", "helmet", "sunglasses", "jeans", "vest jeans"], "difficult_direct_answer": false, "rationales": ["He is in the sun and it can be bright on the eyes.", "The man has sunglasses.", "The man's eyes are covered. he is not wearing a necklace, mask, or bowtie."], "image": "train2014/COCO_train2014_000000078813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387597, "question_id": "63Tih3vsPxCVNbZhtnHFsS", "question": "What is the horse doing?", "choices": ["sleeping", "feeding", "walking", "leaping"], "correct_choice_idx": 3, "direct_answers": ["leaping", "jumping", "jumpin", "jumping", "jumping", "jumping", "jump", "one", "leaping", "riding"], "difficult_direct_answer": false, "rationales": ["The horse is leaping over a white fence because it is competing in an equestrian event.", "The horse is jumping over the obstacle.", "A horse is jumping over a white barrier in a competition with a jockey on his back."], "image": "train2014/COCO_train2014_000000387597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530906, "question_id": "63vgremMCWGXvic68V9LND", "question": "What is in the spaghetti?", "choices": ["beeswax", "tomato sauce", "ketchup", "cucumber"], "correct_choice_idx": 3, "direct_answers": ["broccoli", "vegetables", "broccoli", "vegetables", "vegetables", "vegetables", "cucumber", "broccoli", "veggies", "broccoli"], "difficult_direct_answer": false, "rationales": ["There are disc slices of a vegetable with seeds of the center of them.", "This is a vegetable known for being round and green. it is sliced in a characteristic way and known to be served with noodles.", "You can see the round veggie with seeds in between the noodles."], "image": "train2014/COCO_train2014_000000530906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267594, "question_id": "64nrZ2UcVWhuZYEWeRmMzy", "question": "What sport is the person doing?", "choices": ["basketball", "hockey", "skateboarding", "baseball"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "skateboard", "skateboarding", "skateboarding", "skateboarding", "skate", "skate boarding", "scatting", "skating", "skateboarding"], "difficult_direct_answer": false, "rationales": ["He is doing a trick on a board with wheels under it. he is in a skate park.", "The person is jumping with a board.", "They're in a skate park with a skateboard."], "image": "train2014/COCO_train2014_000000267594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123511, "question_id": "64zcv4z9pHgFtYTaVGufwM", "question": "What is the fruit high in?", "choices": ["vitamin c", "vitamin w", "salt", "eggs"], "correct_choice_idx": 0, "direct_answers": ["vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c"], "difficult_direct_answer": false, "rationales": ["The fruit has vitamin c.", "The fruit is a citrus fruit based on its color and interior which are known to contain nutrients consistent with answer a.", "This is an orange"], "image": "val2014/COCO_val2014_000000123511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384553, "question_id": "65oAgtrbyBReWvxSAYSG88", "question": "What is on the elephant?", "choices": ["bow tie", "bird", "hat", "person"], "correct_choice_idx": 3, "direct_answers": ["person", "person", "person", "man", "person", "person", "young man", "human", "man", "man"], "difficult_direct_answer": false, "rationales": ["The creature is bipedal and can reason.", "The elephant is not wearing clothes. a non-flying animal is riding it.", "As indicated by the fact that it is a human."], "image": "val2014/COCO_val2014_000000384553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61181, "question_id": "663bcyrxiRXHudx5AkhsuZ", "question": "What are the green and white VW bugs?", "choices": ["race cars", "cabs", "police car", "buses"], "correct_choice_idx": 1, "direct_answers": ["cars", "cars", "cars", "taxis", "cars", "taxis", "taxis", "taxis", "cabs", "taxis"], "difficult_direct_answer": false, "rationales": ["The bugs are cabs.", "The green and white vw bugs are cabs. they have numbers on them showing their call numbers.", "These are common colors used for a or taxis in many areas of the world."], "image": "val2014/COCO_val2014_000000061181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325228, "question_id": "665hmK4RaLYdVQtUn8YgPq", "question": "What is the first name of the boy in the red's favorite hero?", "choices": ["clark", "tony", "peter", "bruce"], "correct_choice_idx": 0, "direct_answers": ["clark", "superman", "eric", "superman", "clark", "superman", "superman", "superman", "kal", "superman"], "difficult_direct_answer": false, "rationales": ["The logo for 'superman' is visible on the boy in red's jean shorts. clark kent is the name of superman's alter ego.", "This is superman's first name.", "Superman's name is clark kent."], "image": "val2014/COCO_val2014_000000325228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512495, "question_id": "66Ay6t39bduutmqC966xFT", "question": "What is usually done here?", "choices": ["watching tv", "hand washing", "sleeping", "basketball"], "correct_choice_idx": 1, "direct_answers": ["shower", "cleaning", "peeing", "toilet", "use toilet", "shower", "toilet", "helping oneself", "toilet", "hand washing"], "difficult_direct_answer": false, "rationales": ["You can wash your hands in the room.", "There is a sink with soap, which is next to a toilet, and the sanitary thing to do is wash hands after using the toilet.", "There is a sink in this bathroom."], "image": "val2014/COCO_val2014_000000512495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133532, "question_id": "67oTXv2u2SaRSUcGTBW7Nn", "question": "What style of skis are worn by the people in the line?", "choices": ["downhill", "alpine", "cross country", "racing"], "correct_choice_idx": 2, "direct_answers": ["snow skiis", "crosscountry", "long", "snow skis", "long skis", "snow skis", "cross country", "cross country", "snow", "cross country"], "difficult_direct_answer": false, "rationales": ["These skiers traverse flat terrain. such a style of skiing would be called cross country.", "These are cross country skiis.", "The people are skiing on flat snow."], "image": "train2014/COCO_train2014_000000133532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445859, "question_id": "688RFg527RqqZqryV9PKHi", "question": "What is the dog doing?", "choices": ["jumping", "eating", "sleeping", "sniffing"], "correct_choice_idx": 0, "direct_answers": ["running", "running", "herding", "playing", "jumping", "running", "running", "sniffing", "running", "running"], "difficult_direct_answer": false, "rationales": ["The dog is awake and is not eating or sniffing. all of its legs are off the ground.", "The dog is awake and is above the ground.", "The dog is running and jumping near the cows in the field."], "image": "train2014/COCO_train2014_000000445859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290690, "question_id": "68iRFdVekV6qJRzCWygTkb", "question": "Which car brand is being advertised on the net?", "choices": ["mercedes", "ford", "bmw", "chevy"], "correct_choice_idx": 0, "direct_answers": ["mercedes", "mercedes", "bmw", "mercedes", "mercedes", "mercedes benz", "mercedes", "mercedes", "mercedes", "mercedes"], "difficult_direct_answer": false, "rationales": ["The symbol on the net is the symbol for mercedes.", "This is the symbol for the brand", "The mercedes logo is a circle with 3 prongs inside of it."], "image": "train2014/COCO_train2014_000000290690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112757, "question_id": "69YEHTAPedP2YPppQHKNAm", "question": "What kind of fuel does the van in the background run on?", "choices": ["methanol", "gasoline", "diesel", "propane"], "correct_choice_idx": 1, "direct_answers": ["regular", "diesel", "unleaded gas", "petroleum", "gas", "diesel", "gas", "gasoline", "van", "gas"], "difficult_direct_answer": false, "rationales": ["A traditional gas powered van is parked on a street. most automobiles run on gasoline.", "It's a smaller personal vehicle", "The fuel is gasoline."], "image": "train2014/COCO_train2014_000000112757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579144, "question_id": "6Bj7kyHor563RbDuUSNasB", "question": "The person wearing the blue tie looks most like whom?", "choices": ["idris elba", "liv morgan", "donald pleasence", "keith david"], "correct_choice_idx": 2, "direct_answers": ["unknown", "politician", "man", "glasses", "old man", "old man", "donald pleasence", "car", "president", "santa"], "difficult_direct_answer": true, "rationales": ["The person looks like donald pleasance.", "Like donald pelasence.", "Of the answers provided, answer a is the only one that is of the right gender and race."], "image": "train2014/COCO_train2014_000000579144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579156, "question_id": "6BuqnsRbpNgWhk7snXfcJ5", "question": "What side of the photo does the cow stand with his butt raised toward the camera?", "choices": ["bottom", "top", "left", "right"], "correct_choice_idx": 3, "direct_answers": ["right", "right", "backside", "right", "right", "right", "right", "right", "right", "black"], "difficult_direct_answer": false, "rationales": ["The photo is on the right.", "I picked the side that it was on.", "There is a cow on the right that is facing away from the camera."], "image": "train2014/COCO_train2014_000000579156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19624, "question_id": "6BusnqmwXXqozVWLkULdQc", "question": "What color is the drink contained by the cup in the billboard on the top left?", "choices": ["pink", "red", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["A billboard depicts a drink pouring out of a cup and the drink is the same color as the sky on a sunny day.", "You can look at the billboard and see the liquid color in the cup", "The drink is blue as it is seen on the billboard."], "image": "val2014/COCO_val2014_000000019624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2191, "question_id": "6CXfvEHZY7kinDmGzmz7Hz", "question": "What is the person wearing?", "choices": ["bandana", "armor", "wetsuit", "mask"], "correct_choice_idx": 2, "direct_answers": ["cloths", "wetsuit", "wet suit", "wetsuit", "wet suit", "bathing suit", "wetsuit", "dress", "wet suit", "scuba gear"], "difficult_direct_answer": false, "rationales": ["The person is wearing a garment that provides thermal protection while participating in water sports.", "The woman is in a wetsuit.", "This is obvious based on how it fits their body."], "image": "val2014/COCO_val2014_000000002191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377401, "question_id": "6CyWc67HkqpWDwDNFkX76a", "question": "What material is the most likely metal for the finish of the roof?", "choices": ["iron", "steel", "copper", "brass"], "correct_choice_idx": 2, "direct_answers": ["iron", "stone", "copper", "iron", "copper", "copper", "aluminum", "tall", "blue", "brass"], "difficult_direct_answer": false, "rationales": ["Copper is traditionally used for churches and public buildings. it has a characteristic green color which is from copper oxidizing over time.", "The roof's finish is green. rusted brass, iron, or steel would not be green.", "It turns green the more it's exposed to air"], "image": "val2014/COCO_val2014_000000377401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427401, "question_id": "6DEZd2TgnGaqPd4ARjh9Xg", "question": "What character had a similar prop to the lady on the left?", "choices": ["crash bandicoot", "dante alighieri", "beatrix kiddo", "mary poppins"], "correct_choice_idx": 3, "direct_answers": ["julie andrews", "willie wonka", "mary poppins", "marry poppins", "mary poppins", "mary poppins", "cross walk", "mary poppins", "umbrella", "mary poppins"], "difficult_direct_answer": false, "rationales": ["Mary poppins had an umbrella as well.", "The woman walking in the street has an umbrella which is what mary poppins used to help her fly.", "The woman looks like mary poppins."], "image": "val2014/COCO_val2014_000000427401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60316, "question_id": "6DMHCq5Gh4FtSsT7b3qMtm", "question": "What superhero name is most similar to the name a group of these animals is called?", "choices": ["chowder man", "kitty pryde", "dazzler", "schooly d"], "correct_choice_idx": 2, "direct_answers": ["legion", "panther", "zebraman", "zebra", "birdman", "zebras", "safari ranger", "beast", "dazzler", "dazzler"], "difficult_direct_answer": true, "rationales": ["It is a group of zebras.", "The name is dazzler.", "There is a supper hero named dazzler who sounds like zebra."], "image": "train2014/COCO_train2014_000000060316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329001, "question_id": "6DfLpjbpvT9KfdbzL4gUvB", "question": "How many dogs are attached by leather leads to their owner by the side of this split tree?", "choices": ["three", "one", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["It looks like this number. it's hard to tell by the way the dogs are sitting.", "There are this many tails", "Although he's hard to see, there is another dog in front of the very visible one."], "image": "train2014/COCO_train2014_000000329001.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239670, "question_id": "6DfaV8874aGBxNmLtWYy64", "question": "What is between the bread?", "choices": ["burger", "pizza", "lettuce", "hot dog"], "correct_choice_idx": 2, "direct_answers": ["veg", "sandveg", "lettuce", "green", "lettuce", "sandwich", "lettuce", "car", "lettuce", "lettuce"], "difficult_direct_answer": false, "rationales": ["Lettuce is between the bread.", "There is a green vegetable.", "There is a leafy green vegetable in the sandwich."], "image": "train2014/COCO_train2014_000000239670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313553, "question_id": "6ETwyX4jbGUiL2Vxbfsw8n", "question": "What color is the polo shirt worn by the man seated in the back of the bus?", "choices": ["red", "orange", "yellow", "blue"], "correct_choice_idx": 1, "direct_answers": ["orange", "black", "tan", "red", "brown", "brown", "brown", "black", "orange", "grey"], "difficult_direct_answer": false, "rationales": ["The man at the back of the bus is wearing an orange shirt.", "The person in the back is wearing an orange shirt.", "The man seated in the back of the bus is wearing an orange polo shirt."], "image": "train2014/COCO_train2014_000000313553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155735, "question_id": "6HgxpJ8WYJTyxNAoMV7mzH", "question": "What color is one of the girl's shoes?", "choices": ["orange", "black", "green", "blue"], "correct_choice_idx": 1, "direct_answers": ["stop", "pink", "black", "blue", "black", "black", "pink", "black white", "black", "multi"], "difficult_direct_answer": false, "rationales": ["The shoes are black.", "The girl has one black shoe and one pink shoe.", "One of the shoes is dark like the pavement. it also resembles the night time"], "image": "train2014/COCO_train2014_000000155735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474984, "question_id": "6J8ScdamwKjtEFUsB6xz2u", "question": "What is the person wearing?", "choices": ["suspenders", "tie", "bucket", "wristband"], "correct_choice_idx": 3, "direct_answers": ["tank top", "blue", "brace", "shoes", "sports wear", "tennis clothes", "cap", "wristband", "tennis outfit", "activewear"], "difficult_direct_answer": true, "rationales": ["The person has a wristband.", "The tennis player is wearing a wrist band to keep sweat off her hands.", "A piece of cloth around the part that joins the arm and hand is typical to wear while playing tennis."], "image": "train2014/COCO_train2014_000000474984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111673, "question_id": "6KsosRaQCbDsqT6MFBrKSU", "question": "What continent is this most likely?", "choices": ["europe", "asia", "antarctica", "south america"], "correct_choice_idx": 1, "direct_answers": ["asia", "asia", "asia", "asia", "india", "africa", "asia", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["This country is known for utilizing elephants as transportation in certain parts. the colors and fabrics used are also native of this country, and you can see features of the people that are common in this nationality.", "Elephants are native to asia, and it is a cultural event to ride them in this manner.", "Elephants are commonly found in countries such as india and thailand."], "image": "train2014/COCO_train2014_000000111673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77681, "question_id": "6LJoEV2Ff6eWxu5w5XRe2v", "question": "What is on the pizza?", "choices": ["red peppers", "sausages", "olives", "banana"], "correct_choice_idx": 2, "direct_answers": ["olives", "fork", "olives", "olives", "peppers", "olives", "olives", "olives cheese", "plate", "olives"], "difficult_direct_answer": false, "rationales": ["There are black objects on the pizza.", "There are black circles on the pizza.", "There are black olives on the pizza."], "image": "val2014/COCO_val2014_000000077681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171453, "question_id": "6Pg6bbn6oa4FJpjxBkN4rS", "question": "What is on top of the bed?", "choices": ["dog", "cat", "baby", "doll"], "correct_choice_idx": 3, "direct_answers": ["quilt", "doll", "monkey doll", "doll", "doll", "blanket", "pillow", "quilt", "pillow", "comforter"], "difficult_direct_answer": false, "rationales": ["There is a toy with limbs, a head, and clothes on it.", "There is a doll on top of the bed near the headboard.", "A doll is on top of the bed."], "image": "train2014/COCO_train2014_000000171453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282946, "question_id": "6QEW2Rbxts7eKAWj6swMeE", "question": "What is the woman wearing?", "choices": ["backpack", "wedding dress", "bicycle helmet", "crown"], "correct_choice_idx": 1, "direct_answers": ["wedding dress", "wedding dress", "wedding dress", "wedding dress", "wedding dress", "wedding dress", "veil", "wedding dress", "wedding dress", "wedding dress"], "difficult_direct_answer": false, "rationales": ["It's a long white gown with a veil", "The woman is wearing a white wedding dress.", "The dress the woman is wearing is long and white."], "image": "train2014/COCO_train2014_000000282946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435179, "question_id": "6QnqbreUvPHQXbcX5df3i9", "question": "Who is in the front right corner?", "choices": ["old woman", "little child", "old man", "school teacher"], "correct_choice_idx": 1, "direct_answers": ["little boy", "baby", "baby", "kid", "little child", "child", "trees", "child", "people", "small boy"], "difficult_direct_answer": false, "rationales": ["There is a little boy in the right corner.", "There is a small child in the right front corner of the photo.", "A child is at the corner."], "image": "val2014/COCO_val2014_000000435179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287568, "question_id": "6St6QbUJqJyoh7bAGqGckH", "question": "What color are the teddy's eyeballs?", "choices": ["white", "blue", "green", "red"], "correct_choice_idx": 3, "direct_answers": ["orange", "red", "orange", "orange color", "orange", "red", "red", "orange", "red", "red"], "difficult_direct_answer": false, "rationales": ["They are red", "The teddy bear has glowing red eyeballs. it looks extremely evil.", "The color is red."], "image": "train2014/COCO_train2014_000000287568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168730, "question_id": "6Swnvqa3Jjdnj28f5axCni", "question": "What is on the side of the refrigerator?", "choices": ["dog", "laundry basket", "cat", "garbage disposal"], "correct_choice_idx": 3, "direct_answers": ["calendar", "calendar", "calendar", "garbage disposal", "garbage pail", "humidifier", "trash can", "trash can", "calendar", "trash bin"], "difficult_direct_answer": false, "rationales": ["There is a bin next to the fridge that is used for disposal of garbage.", "There are no animals in the room. a kitchen is not an appropriate place to keep a laundry basket.", "The side has the disposal."], "image": "train2014/COCO_train2014_000000168730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491835, "question_id": "6THjehmBjumK9YeVTA24W2", "question": "What color is the suitcase held by the girl on the other side of the boarding deck to the right of the woman in the foreground?", "choices": ["red", "blue", "pink", "green"], "correct_choice_idx": 2, "direct_answers": ["pink", "blue", "black", "pink", "black", "blue", "pink", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["It stands out on the screen as it is a very bright colour, which can be identified as pink.", "That suitcase is not blue, red, or green.", "It stands out along with the girl."], "image": "val2014/COCO_val2014_000000491835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393290, "question_id": "6TVoFuojWxJzHM4znqS9e4", "question": "What is the woman holding?", "choices": ["baseball bat", "sword", "garbage bag", "plate"], "correct_choice_idx": 3, "direct_answers": ["food", "pizza", "plate", "pizza", "pizza", "pizza", "plate", "pizza", "pizza", "plate"], "difficult_direct_answer": false, "rationales": ["It has food on it and its plain white", "The woman is carrying a pizza on a saucer.", "The woman's hand location can be inferred based on her arm placement and the item on her hand is clearly visible with the size, shape and consistency in accordance with answer a."], "image": "train2014/COCO_train2014_000000393290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485852, "question_id": "6U7L7TKjUxSRJ8g78udMWA", "question": "What color is the rubber surrounding the outer rim of the tire on these bikes?", "choices": ["black", "blue", "white", "green"], "correct_choice_idx": 2, "direct_answers": ["stop", "tho", "white", "black", "black", "black", "black", "white", "silver", "white"], "difficult_direct_answer": false, "rationales": ["I would have said black but it specifically says the \"outer rim,\" which is why i chose the only other possible color.", "The color is white.", "The rims of the bike is the color white."], "image": "train2014/COCO_train2014_000000485852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464695, "question_id": "6VuZ2GvSeFa6Sz4aZDmVqe", "question": "Who was born in the state whose name appears on the side of the train in big white letters?", "choices": ["brooke shields", "jennifer connelly", "mia sara", "margaret qualley"], "correct_choice_idx": 3, "direct_answers": ["david lynch", "joe", "gary cooper", "evel knievel", "gary cooper", "duttons", "david lynch", "dana carved", "miley cyrus", "margaret qualley"], "difficult_direct_answer": false, "rationales": ["Margaret qualley was born here.", "Margaret qualley was born in montana.", "The person is margaret qualley."], "image": "train2014/COCO_train2014_000000464695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212866, "question_id": "6W7HYSkTThzEdf3VNVXoEF", "question": "What color is the brim of the hat worn by the girl on the back of the horse?", "choices": ["blue", "red", "yellow", "green"], "correct_choice_idx": 1, "direct_answers": ["red", "yellow", "red", "hopp", "red", "red", "red", "white", "red", "red"], "difficult_direct_answer": false, "rationales": ["It's the same color as apples", "The brim is red.", "The brim is a color that is a primary color in the rob model."], "image": "train2014/COCO_train2014_000000212866.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414397, "question_id": "6WhS8YEGYuBwJgF6DwGfdP", "question": "How many brown cows are seated inside of the hay like this?", "choices": ["three", "four", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two brown cows are present.", "There are two brown cows seated in the hay side by side.", "You can see two cows side by side."], "image": "val2014/COCO_val2014_000000414397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259712, "question_id": "6XKtPQv7B587KFtt73ktjr", "question": "What color is the measuring tape on the right side of the compartment center?", "choices": ["yellow", "green", "red", "orange"], "correct_choice_idx": 0, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "clear", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The casing is yellow.", "The color is off white.", "The object is located based on the question text and its color is clear and identifiable."], "image": "train2014/COCO_train2014_000000259712.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122953, "question_id": "6aB3SZWXRUDCbX3kg6Fpo4", "question": "What is the cat hiding behind?", "choices": ["owners body", "laptop", "box", "couch"], "correct_choice_idx": 1, "direct_answers": ["laptop", "laptop", "laptop", "laptop", "laptop computer", "laptop", "computer", "laptop", "computer", "laptop"], "difficult_direct_answer": false, "rationales": ["The laptop is propped up and the cat is behind it", "The cat is laying behind a laptop laying upright.", "The rest of the cat's body, whose head is seen popping out, is behind the elevated personal computing system on the left side of the image."], "image": "val2014/COCO_val2014_000000122953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108548, "question_id": "6atmkFmcC5PQVNrhP47HHo", "question": "What continent is this room most likely on?", "choices": ["europe", "south america", "antarctica", "africa"], "correct_choice_idx": 0, "direct_answers": ["europe", "bathroom", "washing", "europe", "europe", "europe", "europe", "na", "europe", "toilet"], "difficult_direct_answer": false, "rationales": ["A bathroom with a toilet also has a bidet.", "There is a white bidet toilet that has a faucet that runs down in the hole. it shoots water up instead of using toilet paper.", "In addition to a toilet, there is a bidet which is common on this continent."], "image": "val2014/COCO_val2014_000000108548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124360, "question_id": "6bBLAWXt7PdEfdRepqJP7C", "question": "What is boiling in the pot?", "choices": ["oil", "soup", "stew", "spaghetti sauce"], "correct_choice_idx": 0, "direct_answers": ["carmel", "oil", "doughnuts", "domos", "tones", "oil", "oil", "oil", "oil", "oil"], "difficult_direct_answer": false, "rationales": ["The pot has oil boiling.", "Oil is sizzling for the donuts.", "This is what someone can use to make mini donuts."], "image": "train2014/COCO_train2014_000000124360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559986, "question_id": "6deJMAxAogpzybesyVfPbx", "question": "What is in the vicinity of the train?", "choices": ["apple", "cat", "bench", "frog"], "correct_choice_idx": 2, "direct_answers": ["hills", "mountain", "rocks", "person", "hill", "mountains", "poles", "bench", "bench", "park bench"], "difficult_direct_answer": true, "rationales": ["There is a place to sit.", "There is a wooden structure, not a fruit or animal, near the train. it is used for sitting.", "There is a bench close to the front of the picture for sitting."], "image": "train2014/COCO_train2014_000000559986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264362, "question_id": "6dxckhjeiebZGdSavHaAo4", "question": "What color is the back of the playing card printed cheese wedge?", "choices": ["blue", "green", "red", "purple"], "correct_choice_idx": 2, "direct_answers": ["black", "red", "red", "red", "white brown", "red", "white", "black", "fuschia", "black"], "difficult_direct_answer": false, "rationales": ["Playing cards are often red on one side.", "It is red like the back of a card.", "The card has some red."], "image": "train2014/COCO_train2014_000000264362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518785, "question_id": "6gVMQnQDixAmmpaUrkyddK", "question": "What type of dog is the woman walking?", "choices": ["pekinese", "pomeranian", "shih tzu", "samoyed"], "correct_choice_idx": 0, "direct_answers": ["yorkshire terrier", "small", "shitzu", "shih tzu", "pug", "pekinese", "chiwawa", "dog type", "small dog", "pekinese"], "difficult_direct_answer": true, "rationales": ["The dog is a pekinese.", "It is small and has a flat face with a lot of hair", "The woman is walking a small dog with long hair and a flat nose. this is consistent with the pekinese dog breed."], "image": "train2014/COCO_train2014_000000518785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236868, "question_id": "6hyTxJ7ZRwudosXtgszdzg", "question": "What kind of animal is advertised on the bottom of the referee post?", "choices": ["cat", "gator", "frog", "bird"], "correct_choice_idx": 1, "direct_answers": ["crocodile", "alligator", "alligator", "gator", "alligator", "gator", "alligator", "crocodile", "alligator", "crock"], "difficult_direct_answer": false, "rationales": ["This is obvious in the photo.", "There is a lacoste logo. it is a reptile with large teeth, not a bird, frog, or cat.", "The lacoste logo is that of the reptile that lives in swamps."], "image": "val2014/COCO_val2014_000000236868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235351, "question_id": "6kUD5RJJdksTCMU8eCrf3k", "question": "What is under the man in the air's feet?", "choices": ["horse", "buffalo", "frisbee", "sasquatch"], "correct_choice_idx": 2, "direct_answers": ["fefee", "frisbee", "frisbee", "frisbee", "frisbee", "beach", "frisbee", "fresbe", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["The object under the man is consistent in size and shape with answer a and is being used in a manner that is also consistent.", "A frisbee is a plastic disk with rounded edges that many people play with on the beach.", "It's a round flying disc he is about to catch."], "image": "train2014/COCO_train2014_000000235351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496619, "question_id": "6kWU3yFWFJJt5MCVF3gUMf", "question": "How many oxen are helping to remove the rocks from the field?", "choices": ["four", "three", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two oxen walking on the field to remove the rocks.", "There are two oxen.", "They are side by side"], "image": "train2014/COCO_train2014_000000496619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376947, "question_id": "6mFjUYWoyCwRMUB6NPL5aj", "question": "What is the person combing?", "choices": ["dog hair", "baby hair", "pumpkin", "cat hair"], "correct_choice_idx": 1, "direct_answers": ["baby's hair", "baby hair", "hair", "hair", "baby hair", "baby hair", "hair", "baby hair", "hair", "hair"], "difficult_direct_answer": false, "rationales": ["A baby is lying down getting their hair combed.", "He has a small brush on the black hair of the small infant.", "The person is combing the hair."], "image": "train2014/COCO_train2014_000000376947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438099, "question_id": "6nCX4zsL4iWfHStevkNvSe", "question": "What brand of laptop is used by the woman with the sunglasses?", "choices": ["asus", "lenovo", "dell", "hp"], "correct_choice_idx": 2, "direct_answers": ["dell", "dell", "dell", "dell", "dell", "dell", "dell", "apple", "dell", "dell"], "difficult_direct_answer": false, "rationales": ["The laptop says dell on it.", "The brand is dell.", "The circle with the dell wording is on the back of the laptop."], "image": "train2014/COCO_train2014_000000438099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215480, "question_id": "6obxAXeDfKwwb6WSGZp4jd", "question": "What feature do these animals have?", "choices": ["wool", "tusks", "wings", "antlers"], "correct_choice_idx": 0, "direct_answers": ["wool", "wool", "cool control", "wool coat", "wool", "fluffy hair", "wool", "wooly", "black head", "sheep"], "difficult_direct_answer": false, "rationales": ["The sheep have wool.", "The animals are sheep that have thick wool coats.", "Sheep make wool."], "image": "train2014/COCO_train2014_000000215480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316802, "question_id": "6ogR4fLcpRjBaaVwkXavRV", "question": "Which material mainly encloses the giraffe to the zoo?", "choices": ["stone", "wire", "wood", "electricity"], "correct_choice_idx": 1, "direct_answers": ["metal", "wire", "fence", "wood", "wood", "wire", "metal wood", "metal", "wood", "steel"], "difficult_direct_answer": false, "rationales": ["The material is wire.", "The fence is thin and metallic. this is typical of this material in addition to being silver and malleable. it has been pounded into a wire shape which is typical of ductile substances. it's not a giraffe.", "You can see the wire between and around the wooden parts of the fence."], "image": "train2014/COCO_train2014_000000316802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247378, "question_id": "6pe6B8hMsb8SM2isgkjngn", "question": "What kind of product does the sponsor with the yellow background offer?", "choices": ["phones", "computers", "musical instruments", "cameras"], "correct_choice_idx": 3, "direct_answers": ["cable", "camera", "camera", "cameras", "cameras", "camera", "shoes", "camera", "cameras", "cameras"], "difficult_direct_answer": false, "rationales": ["The sponsor with the yellow background is nikon. this company does not make musical instruments, phones, or computers.", "The product is the camera.", "The sponsor is nikon. this company makes photography products."], "image": "val2014/COCO_val2014_000000247378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435575, "question_id": "6poVXgnFt4vHoH3qmiBYak", "question": "What color is the baseball cap worn by the man operating as the catcher in this photo?", "choices": ["black", "white", "red", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "blue", "chair", "blue", "blue", "stop", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["It is brighter than the rest of the clothes and the light shines on it clearly to see the color", "It's the same color as the sky", "The man is wearing a baseball cap that is blue in color."], "image": "train2014/COCO_train2014_000000435575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26942, "question_id": "6sxGVXGXseRYqcRjh35NXo", "question": "The cat is sitting at a table with what featured on top of it?", "choices": ["plate", "fan", "knife", "book"], "correct_choice_idx": 3, "direct_answers": ["book", "book", "book", "book", "book", "book", "book", "book", "book", "chair"], "difficult_direct_answer": false, "rationales": ["The cat is by the book.", "The cat is by a book.", "Books are full of pages of paper with a cover."], "image": "val2014/COCO_val2014_000000026942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98892, "question_id": "6tCLZ6mBfy5ibkzeY2GuEX", "question": "What is the primary reason for the lights on the backs of the bikes?", "choices": ["fun", "identification", "safety", "decoration"], "correct_choice_idx": 2, "direct_answers": ["identifying authority", "safety", "bright", "indication", "safety", "awareness", "to blink", "safety", "warning", "road safety"], "difficult_direct_answer": false, "rationales": ["The reason is for safety.", "It makes them more visible to other vehicles on the road.", "Motorbikes are lined up and all have lights on the back ends. lights are used for visibility on vehicles."], "image": "train2014/COCO_train2014_000000098892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554324, "question_id": "6tZMpYQ4Hcw8oRRSz6dumD", "question": "What color is the bath robe worn by the woman holding the remote on the sofa?", "choices": ["purple", "white", "black", "red"], "correct_choice_idx": 2, "direct_answers": ["black", "blue", "black", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The robe is almost black.", "The bath robe is blue (actually blue). the woman is sitting on the couch while in the robe using a remote.", "The robe is a blue color which is close to purple."], "image": "train2014/COCO_train2014_000000554324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374553, "question_id": "6tivyjzKAuw9aUos6K9UnR", "question": "What color is the helmet worn by the man yelling at the umpire?", "choices": ["blue", "yellow", "black", "red"], "correct_choice_idx": 1, "direct_answers": ["chair", "yellow", "yellow", "yellow", "yellow", "yellow", "stop", "yellow", "yellow", "black"], "difficult_direct_answer": false, "rationales": ["The man that is yelling at the umpire is wearing a yellow baseball helmet.", "This is obvious in the photo and a common color used with black.", "The helmet is the color of the sun."], "image": "train2014/COCO_train2014_000000374553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266093, "question_id": "6uPRECxoN5ebhZYSZFZujF", "question": "What is a feature associated with this animal?", "choices": ["stripes", "stinger", "spots", "quills"], "correct_choice_idx": 0, "direct_answers": ["zebra", "stripes", "dwdww", "three", "stripes", "stripes", "color", "strips", "stripes", "stripes"], "difficult_direct_answer": false, "rationales": ["The animal is visibly a zebra, based on the size and shape and distinct patterning. these animals are known to consistently and uniquely have answer a.", "They are always black and white", "The zebra is striped."], "image": "train2014/COCO_train2014_000000266093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144088, "question_id": "6v2m6hHUkrh7j48qig8LD5", "question": "What kind of building is the cat sitting at the floor in?", "choices": ["hotel", "library", "kitchen", "diner"], "correct_choice_idx": 1, "direct_answers": ["house", "library", "library", "wooden", "house", "home", "library", "restaurant", "school", "ground floor"], "difficult_direct_answer": false, "rationales": ["There is a cat sitting on the floor of the library.", "A library has a lot of tables and chairs for people to sit and read books - as you see here.", "The building is a library."], "image": "train2014/COCO_train2014_000000144088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287366, "question_id": "6w6MAY7F93LVEiNzB8qThS", "question": "What is the person under the umbrella wearing?", "choices": ["crown", "tie", "tiara", "backpack"], "correct_choice_idx": 1, "direct_answers": ["suit", "suit", "suit", "suit", "suit", "suit", "suit", "suit", "tie", "suit"], "difficult_direct_answer": false, "rationales": ["He has a tie on", "The person is in a tie.", "This is worn with a suit around the neck."], "image": "val2014/COCO_val2014_000000287366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24779, "question_id": "6xQG6cSNpm6Dye2UWtERT4", "question": "What type of sign is in the back of this image?", "choices": ["caution", "stop sign", "crosswalk sign", "deer sign"], "correct_choice_idx": 1, "direct_answers": ["stop sign", "stop sign", "street sign", "stop sign", "stop", "stop sign", "stop", "stop", "stop", "stop sign"], "difficult_direct_answer": false, "rationales": ["The sign is a stop sign.", "There is the back of a stop sign in the background of this image.", "The sign is shaped in the same way."], "image": "train2014/COCO_train2014_000000024779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259551, "question_id": "6y4sQnNmXste9YauiFYJe5", "question": "What are the people surrounded by?", "choices": ["coyotes", "snow", "eels", "potted shrubbery"], "correct_choice_idx": 1, "direct_answers": ["mountains", "snow", "snow", "snow trees", "snow", "snow", "snow", "snow", "mountains", "snow"], "difficult_direct_answer": false, "rationales": ["The frozen precipitation will allow them to glide faster down the mountain.", "The people are in snow.", "They are on a mountain for skiiing where snow is."], "image": "train2014/COCO_train2014_000000259551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416976, "question_id": "72QJT85rYQpipSZkjcS7Q6", "question": "What is a group of these animals called?", "choices": ["dazzle", "school", "clowder", "pride"], "correct_choice_idx": 0, "direct_answers": ["dazzle", "dazzle", "dazzle", "heard", "dazzle", "herd", "herd", "herd", "zebras", "zebra"], "difficult_direct_answer": false, "rationales": ["This is a group of zebras", "They are zebras.", "These animals are zebras, not fish, lions, or cats."], "image": "train2014/COCO_train2014_000000416976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439274, "question_id": "73CkPBE7iueZT3RvnWNkVE", "question": "What is out of place in this photo?", "choices": ["wooden chairs", "tiled walls", "birds inside", "wooden tables"], "correct_choice_idx": 2, "direct_answers": ["birds", "birds", "birds inside", "bird", "finch bird", "birds", "door", "two birds", "birds", "bird"], "difficult_direct_answer": false, "rationales": ["The birds are inside, which are they are suppose to be outside.", "There are flying animals that are sitting on the tables inside the building.", "The birds are out of place."], "image": "train2014/COCO_train2014_000000439274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504284, "question_id": "7438vjU7r2jTSiduNVQEtd", "question": "How do you know the bike riders are a couple?", "choices": ["rings", "kissing", "matching tshirts", "holding hands"], "correct_choice_idx": 3, "direct_answers": ["stop", "happy", "shaking hand", "holding hands", "holding hands", "holding hands", "holding hands", "holding hands", "hands", "holding hands"], "difficult_direct_answer": false, "rationales": ["It is a tradition for a man and the woman of similar ages to hold hands to show people that they are a couple. it is not normal to hold hands while riding a bike so this is an extraordinary sign of affection.", "They are both near each other and are using physical touch to express their love.", "This is obvious in the photo. they're also not doing the other options."], "image": "train2014/COCO_train2014_000000504284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32193, "question_id": "758Xy2bkYMz5szMf95yDpy", "question": "Which animals are closer to the ground?", "choices": ["cat", "elephant", "coyote", "zebra"], "correct_choice_idx": 3, "direct_answers": ["zebra", "zebra", "zebras", "zebra", "zebras", "giraffes", "zebra", "zebras", "zebra", "zebras"], "difficult_direct_answer": false, "rationales": ["Zebras are grazing near giraffes eating from trees.", "There are some zebra closer on the ground.", "There are giraffes and horse-like animals that have black and white stripes. the horse-like animals are closer to the ground."], "image": "val2014/COCO_val2014_000000032193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20903, "question_id": "759K5tqbXt6S2FgwvdjKkq", "question": "What is under the plate?", "choices": ["glove", "mouse", "balloon", "towel"], "correct_choice_idx": 3, "direct_answers": ["towel", "towel", "towel", "towel", "towel", "towel", "towel", "cloth", "towel", "towel"], "difficult_direct_answer": false, "rationales": ["You can tell by the setting and the shape of the cloth as to what is under the plate.", "The plate has a towel under.", "There is a blue towel under the plate."], "image": "train2014/COCO_train2014_000000020903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335551, "question_id": "76PgzJigUgi8BmQXF57Vxs", "question": "What number is on the train?", "choices": ["98356", "7863", "48151", "45932"], "correct_choice_idx": 2, "direct_answers": ["several numbers", "48151", "48151", "48151", "ten", "48151", "48151", "ten", "48151", "48151"], "difficult_direct_answer": false, "rationales": ["The number is 48151.", "The train number attached at the front of the train begin with four and end with one.", "These numbers are in white at the front"], "image": "train2014/COCO_train2014_000000335551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344897, "question_id": "76gxDmG6uWX8hLRcrDdmWo", "question": "Where do these animals get most of their food from?", "choices": ["bugs", "people", "grass", "other animals"], "correct_choice_idx": 2, "direct_answers": ["grass", "grass", "grass", "grass fields", "ground", "grass", "ground", "grass", "fields", "grass"], "difficult_direct_answer": false, "rationales": ["The animals are eating grass.", "Cows are herbivores and eat plant material.", "They eat vegetation"], "image": "val2014/COCO_val2014_000000344897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100581, "question_id": "77MSdo5fbYH7qELjQovT6v", "question": "What kind of animal is the shape of the kite made into?", "choices": ["bird", "butterfly", "dragonfly", "reptile"], "correct_choice_idx": 3, "direct_answers": ["drone", "lizard", "lizard", "hammerhead shark", "reptile", "lizard", "gecko", "gecko", "lizard", "lizard"], "difficult_direct_answer": false, "rationales": ["The animal is a reptile.", "It's shaped like a lizard", "The kite is shaped into a gecko."], "image": "train2014/COCO_train2014_000000100581.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186719, "question_id": "77WdaYujfCSkRdqAsXL8pj", "question": "How many forks are sat on the paper plate atop the concrete balcony edge?", "choices": ["two", "one", "three", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "one", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two forks on the paper plate.", "One fork is in front of the food. an additional fork is behind the food.", "There are two"], "image": "train2014/COCO_train2014_000000186719.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175552, "question_id": "798rq8T6dZmmnPfGxVqc7U", "question": "What is the girl attempting to mimic searching for with the frisbee?", "choices": ["dirt", "gold", "iron", "clay"], "correct_choice_idx": 1, "direct_answers": ["gold", "gold", "gold", "fishing", "mining", "gold", "panning", "gold hunting", "gold", "gold"], "difficult_direct_answer": false, "rationales": ["The girl wants gold.", "She is near a river and is sifting through the dirt.", "She wants to the dirt to pass through the frisbee so it can reveal any contents worth keeping."], "image": "train2014/COCO_train2014_000000175552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442252, "question_id": "7ANQ6CeeYaWjurY88basUr", "question": "What does this animal have?", "choices": ["whiskers", "talons", "antenna", "wings"], "correct_choice_idx": 0, "direct_answers": ["whiskers", "bag", "eyes", "whiskers", "mites", "whiskers", "pillow", "bag", "whiskers", "whiskers"], "difficult_direct_answer": false, "rationales": ["The animal's face is clearly visible and based on the features it is clearly a cat. cats have answer a and none of the other answers present.", "The animal has whiskers.", "The animal is a cat and that is a feature they have on their faces."], "image": "train2014/COCO_train2014_000000442252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202926, "question_id": "7BFQC5MXy3BThVZBqcnTP9", "question": "How many kinds of birds are shored on the beach together?", "choices": ["four", "five", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "41", "two", "two", "two", "two", "several", "two"], "difficult_direct_answer": false, "rationales": ["There are only two different kinds of birds on the beach by the water.", "There are seagulls on the beach. a different type of bird also is on the beach.", "There are pelicans and seagulls on the beach together."], "image": "val2014/COCO_val2014_000000202926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543728, "question_id": "7Chefzoz55KBzbowtuzz9R", "question": "What color is the back arch for the seat held by ropes on the back of this elephant?", "choices": ["gold", "blue", "orange", "green"], "correct_choice_idx": 1, "direct_answers": ["green", "black", "black", "brown", "black", "black", "blue", "black", "brown", "black"], "difficult_direct_answer": false, "rationales": ["The back arch is blue in color and is held by ropes on the back of the elephant.", "The seat for the elephant is held on by ropes and it is dark blue.", "The color is blue."], "image": "train2014/COCO_train2014_000000543728.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415746, "question_id": "7DUuoaHVEQBVGYogJW5ehB", "question": "What is the primary background color of the table cloth below the muffin enclosures?", "choices": ["pink", "blue", "purple", "green"], "correct_choice_idx": 1, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The tablecloth is mainly blue.", "The background is blue.", "The background is mostly blue."], "image": "val2014/COCO_val2014_000000415746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309406, "question_id": "7DVAPsc9dSEwMNc2auxEya", "question": "Which food item on the plate is lowest in calories?", "choices": ["granola", "fruit", "strawberry", "banana"], "correct_choice_idx": 2, "direct_answers": ["strawberry", "banana", "strawberry", "banana", "strawberries", "banana", "blueberry", "strawberries", "strawberries", "oats"], "difficult_direct_answer": false, "rationales": ["The food item on the plate has healthy foods that are low in calories like strawberry.", "Bananas are fattening fruits, so strawberries have less calories.", "The red fruit is mostly made of water so that would lower the calories."], "image": "train2014/COCO_train2014_000000309406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137806, "question_id": "7EMxK3jm69UJraLfjeUmao", "question": "How many cows are standing in the pasture field?", "choices": ["five", "two", "three", "four"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are more than two but less than four cows.", "There are three cows standing in the pasture field.", "A group of cows is standing in an open area."], "image": "train2014/COCO_train2014_000000137806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155177, "question_id": "7Eu7YSgapCsyMfJ45ZLBMK", "question": "What is the person studying?", "choices": ["artificial intelligence", "cars", "cooking", "french language"], "correct_choice_idx": 0, "direct_answers": ["artificial intelligence", "artificial intelligence", "online job", "artificial intelligence", "artificial intelligence", "artificial intelligence", "artificial intelligence", "artificial intelligence", "artificial intelligence", "artificial intelligence"], "difficult_direct_answer": false, "rationales": ["The book holding up the white laptop shows what the person is studying.", "The person that owns the computers also has a book about artificial intelligence that they are reading.", "The person has a book on their desk about artificial intelligence that could be their course of study."], "image": "train2014/COCO_train2014_000000155177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491223, "question_id": "7GwxDDRcJUVANpvWKUpbrq", "question": "What color are the buttons illuminated around the dais of the remote?", "choices": ["green", "red", "blue", "white"], "correct_choice_idx": 1, "direct_answers": ["red", "eleven", "orange", "orange", "white", "orange", "orange", "red", "orange", "white red"], "difficult_direct_answer": false, "rationales": ["The man has a remote control in his mouth and the buttons around the dial are lit up red.", "They are red", "The buttons that are lit up are red in color."], "image": "val2014/COCO_val2014_000000491223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348794, "question_id": "7HfWJrqtLyZxYVfHCCpxkx", "question": "What is hoisted atop the elephants to help the people ride them?", "choices": ["saddles", "benches", "harnesses", "blankets"], "correct_choice_idx": 1, "direct_answers": ["seat", "bench", "chair", "seats", "benches", "grass", "chair saddles", "forest area", "chairs", "seats"], "difficult_direct_answer": true, "rationales": ["The people are sitting on a long seat made of wood with legs.", "People have rectangular seating areas to sit on.", "The benches are hoisted."], "image": "train2014/COCO_train2014_000000348794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543877, "question_id": "7J4U4JH8E4EVrpnUjeJmbv", "question": "What are the two large bowls on the counter called?", "choices": ["farmhouse sinks", "vessel sinks", "dropin sinks", "undermounted sinks"], "correct_choice_idx": 1, "direct_answers": ["sinks", "wadhbasin", "vessel sinks", "sink", "sinks", "sinks", "tap bowls", "basins", "sink", "ceickde"], "difficult_direct_answer": false, "rationales": ["The bowls are sinks.", "Two round, white bowls are on a counter in a bathroom and are matching. a faucet can be seen above one of the bowls.", "The two large bowls on the bathroom counter are called vessel sinks because they look like bowls resting on the counterop."], "image": "train2014/COCO_train2014_000000543877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518215, "question_id": "7JNyjNTYYgw27a7C3N5uSD", "question": "How many long necks are here?", "choices": ["seven", "three", "six", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "There are three giraffes standing standing and eating out of a tree.", "There are three giraffes."], "image": "train2014/COCO_train2014_000000518215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482731, "question_id": "7Jodrg5tCuL4c4dXjuj2rM", "question": "What team is fielding?", "choices": ["ny jets", "seattle mariners", "cincinnati reds", "detroit pistons"], "correct_choice_idx": 1, "direct_answers": ["mariners", "white team", "seattle mariners", "mariners", "mariners", "mariners", "mariners", "mariners", "mariners", "mariners"], "difficult_direct_answer": false, "rationales": ["A man in a white uniform with a team logo has his foot on the base and is reaching out to catch an incoming ball.", "The team with the white uniform is fielding. the team with the gray uniform is batting and running the bases.", "The team is the mariners."], "image": "train2014/COCO_train2014_000000482731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144539, "question_id": "7KJsVnjWjTn2uh7g6PMUiX", "question": "What color are the pillows laying on the left side of this couch?", "choices": ["red", "white", "pink", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yeelowhite", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["They're obvious and bright against the pink background cover.", "The pillows do not match the couch and are not white or pink.", "They are the same color as bananas"], "image": "val2014/COCO_val2014_000000144539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182497, "question_id": "7L7886qnxH3s3mdbpypnwM", "question": "What are the chances that at least one banana will fall out of the truck?", "choices": ["high", "impossible", "very low", "low"], "correct_choice_idx": 0, "direct_answers": ["100", "high", "high", "high", "eighty percent", "60", "very high", "overload", "few chances", "90%"], "difficult_direct_answer": false, "rationales": ["There is a high chances that the banana feels easily.", "The chances are high.", "The bananas are overfilled on the truck."], "image": "train2014/COCO_train2014_000000182497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51844, "question_id": "7L9S68kZkZFkQCkgBWkwk4", "question": "How many black sheep are enclosed in the pasture?", "choices": ["one", "three", "four", "two"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one.", "The sheep are identifiable by their size, shape and features. the colors of the sheep are clearly visible and countable.", "The rest are white"], "image": "val2014/COCO_val2014_000000051844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340197, "question_id": "7LenL4EDzGD74Khw52Zs3P", "question": "What is the woman in red reaching towards?", "choices": ["frisbee", "cat", "baby", "dog"], "correct_choice_idx": 0, "direct_answers": ["frisbee", "frisbee", "play", "frisbee", "frisbee", "frisbee", "frisbee", "play", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["It is a round flying disk that is typical of this type of object. this object is used in a game where you throw it back and forth.", "A woman is extending her hands out towards a disc that is in the air and approaching.", "The woman is clearly visible and the object is a size and shape as well as being used as answer a would be."], "image": "train2014/COCO_train2014_000000340197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135626, "question_id": "7MngSYFrUM5ni8u6xAWrjv", "question": "What color is the trenchcoat worn by the woman who is walking a yellow bike?", "choices": ["red", "green", "white", "blue"], "correct_choice_idx": 2, "direct_answers": ["white", "tan", "gray", "beige", "white", "white", "white", "tan", "yellow", "beige"], "difficult_direct_answer": false, "rationales": ["The woman walking a yellow bike down the street is wearing a white trenchcoat.", "The color is white.", "The color is white."], "image": "train2014/COCO_train2014_000000135626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555020, "question_id": "7Ncyic752jEawYZrWHrnDw", "question": "What type of fruit is at the very top of the scoop with the banana and oat?", "choices": ["cantaloupe", "raspberry", "blackberry", "strawberry"], "correct_choice_idx": 2, "direct_answers": ["blackberry", "blackberry", "blackberry", "jerry", "blackberry", "blackberry", "blackberry", "bsll", "blackberry", "grapes"], "difficult_direct_answer": false, "rationales": ["The berries are colored black.", "Dark purple, almost black berries that otherwise look like raspberries are on top of fruit and oats.", "The berry at the top is dark and is made out of smaller pods."], "image": "train2014/COCO_train2014_000000555020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431904, "question_id": "7TEnV6Hvd3g5zQei7Ppj5Z", "question": "Who is famous for doing what these people are doing?", "choices": ["nathan drake", "nathan fillion", "tony hawk", "tony montana"], "correct_choice_idx": 2, "direct_answers": ["tony hawk", "tony hawk", "tony hawk", "shane o'neill", "tony hawk", "skate", "scatt", "tony hawk", "tony hawk", "tony hawk"], "difficult_direct_answer": false, "rationales": ["The people are skateboarding. from the list of answers, answer a is famous for doing this activity and the others are not.", "He was one of the first famous skaters", "These people are skateboarding, not acting."], "image": "train2014/COCO_train2014_000000431904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362312, "question_id": "7TKf9Ftskuiix36PQjgqZv", "question": "What animals are moving?", "choices": ["zebra", "elephant", "cat", "dog"], "correct_choice_idx": 0, "direct_answers": ["zebra", "zebra", "zebras", "zebras", "zebras", "zebras", "zebras", "zebra", "zebra", "zebras"], "difficult_direct_answer": false, "rationales": ["The zebra is moving.", "The zebras move.", "A large group of animals with white and black stripes is in a field."], "image": "train2014/COCO_train2014_000000362312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184276, "question_id": "7TieH4DPV2VnNFkXbzB2WC", "question": "What color ist hep old short worn by the man who just had hit the tennis ball?", "choices": ["yellow", "red", "purple", "green"], "correct_choice_idx": 0, "direct_answers": ["yellow", "black", "yellow", "yellow", "yellow", "black", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The shirt is very bright.", "The bright shirt is the same color as the sun.", "It's lemon colored"], "image": "val2014/COCO_val2014_000000184276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284144, "question_id": "7UTMRYvA2kUGhrvckUarKC", "question": "What kind of store is this?", "choices": ["computers", "food", "electronics", "beverage"], "correct_choice_idx": 3, "direct_answers": ["sandals", "winery", "drink", "wine shop", "liquor", "liquor store", "drinks shop", "alcohol", "beverage", "beer"], "difficult_direct_answer": true, "rationales": ["The store sells wine.", "There are several bottles of wine, which you drink, on the shelves.", "There is alcohol for sale"], "image": "train2014/COCO_train2014_000000284144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373904, "question_id": "7UhSjii34JabxBUxZZ3p9d", "question": "How many cows are gazing inside the enclosure?", "choices": ["two", "one", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three cows eating.", "They are alone in the large field and easy to count", "One cow is grazing in between two other cows."], "image": "val2014/COCO_val2014_000000373904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224138, "question_id": "7VMbZjMHc7u4aJAfvbmRrU", "question": "What kind of breakfast confection is on the red plate?", "choices": ["scone", "donut", "waffle", "bagel"], "correct_choice_idx": 0, "direct_answers": ["banana chocolate", "scone", "pastry", "pastry", "chocolate cake", "scone", "pizza", "desert", "banana toast", "crepe"], "difficult_direct_answer": false, "rationales": ["A dry pastry shaped in a triangle is on a plate with other breakfast items on dishes behind it.", "The confection is a scone.", "A flaky pastry that usually pairs wells woth coffee."], "image": "val2014/COCO_val2014_000000224138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60828, "question_id": "7VmhemH2m8vqirHSrTRsqg", "question": "What animals are behind the girl?", "choices": ["cows", "horses", "elk", "penguins"], "correct_choice_idx": 3, "direct_answers": ["penguins", "penguin", "penguins", "penguins", "penguin", "penguin", "penguins", "penguin", "pen queen", "penguins"], "difficult_direct_answer": false, "rationales": ["The girl with the umbrella is standing in front of a row of penguins.", "They are black and white penguins.", "They look like they are wearing tuxedos and have a large white belly"], "image": "train2014/COCO_train2014_000000060828.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236882, "question_id": "7WJ4cizkkXd7hzuV8ttwBn", "question": "What color is the gas tank on the Harley bike in the center of the pack?", "choices": ["blue", "yellow", "red", "gold"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "black", "red", "green", "red", "stop", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["A group of motorcycles are all parked together and all have black tanks except one in the middle with a red one.", "The harley bike in the center of the bikes has a red gas tank.", "The color is red."], "image": "train2014/COCO_train2014_000000236882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281815, "question_id": "7Wdez8hwa9bSPZUywzksm5", "question": "What is the cat sleeping near?", "choices": ["woman", "baby", "dog", "computer"], "correct_choice_idx": 3, "direct_answers": ["computer", "monitor", "laptop", "monitor", "computer monitor", "monitor", "monitor", "keyboard", "black", "monitor"], "difficult_direct_answer": false, "rationales": ["The cat is by a computer.", "There is a laptop and a keyboard", "The cat is sleeping on a desk near a computer."], "image": "train2014/COCO_train2014_000000281815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89391, "question_id": "7Wm6PwHwgFoS9yHjB3jKtb", "question": "What is next to the toilet?", "choices": ["dog", "woman", "polecat", "sink"], "correct_choice_idx": 3, "direct_answers": ["radiator", "sink", "sink", "sink", "sink", "cabinet", "basin", "cabinet", "sink", "sink"], "difficult_direct_answer": false, "rationales": ["There is a large white sink and vanity by a toilet in the bathroom.", "There is a vanity there with flowing water", "The toilet has a sink."], "image": "val2014/COCO_val2014_000000089391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194820, "question_id": "7WwmVomqS4CYKqjAarHvFZ", "question": "How many DVD discs are sat atop of the laptop on the coffee table?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 0, "direct_answers": ["two", "three", "two", "one", "one", "two", "one", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["There are two dvd disks sitting on the laptop on the coffee table.", "There are less than three but more than two discs visible on the laptop.", "There are two discs."], "image": "train2014/COCO_train2014_000000194820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410779, "question_id": "7YyEndDSQGtdmxSyeWQRnS", "question": "What is the orange item?", "choices": ["carrot", "pumpkin pie", "traffic cone", "sticky note"], "correct_choice_idx": 0, "direct_answers": ["carrot", "carrots", "carrot", "carrots", "spices", "carrot", "carrot", "carrot", "carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["The orange item is a root vegetable.", "It's been peeled or in the process of being peeled.", "The item is a carrot."], "image": "train2014/COCO_train2014_000000410779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184746, "question_id": "7ZFkVv5ncNngLzs2hg2goD", "question": "What color is the meat in the middle of the sandwiches on the plate close to the man's chest?", "choices": ["brown", "pink", "red", "white"], "correct_choice_idx": 1, "direct_answers": ["white", "red", "red", "white", "pink", "red", "pink", "red", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["This is a common colour for sandwich meat.", "It looks like it's some sort of pork or possible cornbeef.", "The color is pink."], "image": "val2014/COCO_val2014_000000184746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534308, "question_id": "7aEwjk4mo46M5E9cs3xzqx", "question": "What is the number of nice elephants who are living inside the zoo enclosure?", "choices": ["four", "one", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three elephants seen.", "The number is three.", "There are 3."], "image": "val2014/COCO_val2014_000000534308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68163, "question_id": "7aecuuyC5pxj3auDwavj4z", "question": "What car part can be seen?", "choices": ["carburetor", "antenna", "tire", "hood"], "correct_choice_idx": 2, "direct_answers": ["car", "tire", "tire", "wheel", "tire", "motorcycle", "tire", "tire", "bumper", "tire"], "difficult_direct_answer": false, "rationales": ["The only item visible that is commonly part of car composition is answer a and no other answer on the list is visible.", "The part is the tire.", "There is a circular wheel in front."], "image": "train2014/COCO_train2014_000000068163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342075, "question_id": "7cmV8QNnMogkBsJfY9wkja", "question": "How many motorcycles are on the highway apparently ahead of the vehicle driving?", "choices": ["four", "six", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "four", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There's one by the curve and two behind.", "There are three motorbikes.", "It's obvious when you count them."], "image": "train2014/COCO_train2014_000000342075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153631, "question_id": "7dn77y36YzN2BA8JjAiSzh", "question": "What term is related to this sport?", "choices": ["bunt", "goal", "penalty kick", "touchdown"], "correct_choice_idx": 0, "direct_answers": ["softball", "softball", "bunt", "softball", "swing", "softball", "swing", "softball", "baseball", "softball"], "difficult_direct_answer": false, "rationales": ["The softball player is swinging the bat in a low soft motion called a bunt to deliberately not send the ball far.", "A woman is wearing a helmet and hitting a ball with a bat. a bunt is used in baseball.", "The term relates to the sound and action the bat makes where your more pushing the ball so it flies a shorter distance."], "image": "val2014/COCO_val2014_000000153631.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562741, "question_id": "7dv6zRFrpjLnSwYqWYzgU3", "question": "What is the old rusted freight car in the background probably used to carry?", "choices": ["oil", "gas", "water", "coal"], "correct_choice_idx": 3, "direct_answers": ["coal", "products", "coal", "luggage", "goods", "luggage", "coal", "coal", "people", "people"], "difficult_direct_answer": false, "rationales": ["The car has coal.", "The freight train is next to an old railway with a coal car on it.", "The top of the red wagon has black stuff."], "image": "train2014/COCO_train2014_000000562741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461340, "question_id": "7eKQw68zE7VPvXnnixcZbN", "question": "What company makes the item on the right side of the floor that has the wire attached to it?", "choices": ["wwe", "apple", "sony", "aew"], "correct_choice_idx": 1, "direct_answers": ["apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["There is a silver computer sitting on the floor. although it is hard to make out, a fruit can be seen in the center of the case, which is the logo of a popular company.", "There is a fruit on it", "Apple makes the item."], "image": "train2014/COCO_train2014_000000461340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415822, "question_id": "7fS9SxhYxrdJMxRLwcCEfk", "question": "What is the person on the left walking towards?", "choices": ["pumpkin", "stoplight", "baby", "egg"], "correct_choice_idx": 1, "direct_answers": ["street light", "move", "light", "stoplight", "traffic light", "traffic light", "light", "traffic light", "unknown", "light"], "difficult_direct_answer": false, "rationales": ["There are no food items or babies near the person. there is a traffic signal.", "The other options don't exist in this photo.", "The man is walking from left to right towards a traffic light."], "image": "train2014/COCO_train2014_000000415822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161157, "question_id": "7h4fzdRpBrdSYZMiMfFboH", "question": "What is she getting ready to do?", "choices": ["serve", "receive", "catch", "toss"], "correct_choice_idx": 0, "direct_answers": ["hit", "tennis", "serve", "hit ball", "serve", "serve", "serve", "serve ball", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["Many tennis players bounce the ball before they do a.", "She wants to serve.", "She is bouncing the ball in order to hit it to her opponent."], "image": "train2014/COCO_train2014_000000161157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362007, "question_id": "7iazLg74Acn9mNSozHJ4zP", "question": "What kind of fruit is there a serving of to the side of the cake?", "choices": ["raspberry", "strawbery", "pineapple", "grapefruit"], "correct_choice_idx": 1, "direct_answers": ["strawberry", "strawberry", "bsll", "strawberry", "strawberry", "starberry", "strawberry", "strawbery", "berries", "strawberry"], "difficult_direct_answer": false, "rationales": ["The fruit is a large red berry with a green stem.", "The fruit is berry.", "A red fruit with a green stem and seeds is next to dessert on a plate."], "image": "train2014/COCO_train2014_000000362007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309424, "question_id": "7iy5mLBUNZC7MA3W9Cob4g", "question": "What political party is the owner of this setup most likely to vote for?", "choices": ["independent", "republican", "democrat", "green"], "correct_choice_idx": 2, "direct_answers": ["democrat", "democratic", "democratic", "trump", "democrat", "democrat", "democrat", "biden", "democrat", "na"], "difficult_direct_answer": false, "rationales": ["The owner is mostly democrat. there is a photo of obama on the screensaver.", "A former president is on the computer.", "I'm going to say this based on the image on the monitor, but this could really be someone of any party."], "image": "val2014/COCO_val2014_000000309424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555685, "question_id": "7mCM3UtbXUmKRrLBRWmvCD", "question": "What led to the red stain on the inside of the bowl?", "choices": ["pouring", "stirring", "splashing", "sitting"], "correct_choice_idx": 0, "direct_answers": ["pouring", "sauce", "tomato sauce", "sauce", "sauce", "sauce", "tomato", "tomatoes", "tomato sauce", "eaten"], "difficult_direct_answer": false, "rationales": ["The shape is proof that it's from this action rather than one of the other options.", "It is a sauce that would have been mixed before using it", "The food in the bowl is a liquid, meaning it splashed when put into it."], "image": "val2014/COCO_val2014_000000555685.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561724, "question_id": "7mqbENNCFZUtQMsDptfhWW", "question": "What type of roofs are these?", "choices": ["animal hide", "wooden", "rock", "thatch"], "correct_choice_idx": 3, "direct_answers": ["thatched", "thatch", "hay", "straw", "thatched", "thatch", "straw", "grass", "straw", "thatch"], "difficult_direct_answer": false, "rationales": ["That is what the roof is made up of.", "Straw roofs are on small homes,", "The building are grass thatched."], "image": "train2014/COCO_train2014_000000561724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310070, "question_id": "7mryRwK6mDoZcGCFQ9RuRE", "question": "What is the cake on top of?", "choices": ["chair", "babys head", "counter", "pizza box"], "correct_choice_idx": 2, "direct_answers": ["counter", "counter", "icing", "counter", "counter", "plastic bag", "counter", "counter", "bag", "counter"], "difficult_direct_answer": false, "rationales": ["Counters are found in the kitchen where the cake would be.", "The other options aren't in this image and would be ridiculous.", "The cake is on a counter."], "image": "train2014/COCO_train2014_000000310070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390951, "question_id": "7nWDLZmRNXKycPEvUJ9T4b", "question": "What material is the tile for the walls and stalls of this bathroom?", "choices": ["porcelain", "plastic", "marble", "laminate"], "correct_choice_idx": 2, "direct_answers": ["marble", "marble", "light", "marble", "marble", "marble", "marble", "marble", "glass", "marble"], "difficult_direct_answer": false, "rationales": ["The material is marble.", "Bathrooms have marble many times.", "The walls and stalls have a nice swirling pattern."], "image": "train2014/COCO_train2014_000000390951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539436, "question_id": "7oFmdokKsbigXVT6N7qWh4", "question": "How many traffic lights are hanging in the highway ahead facing toward the silver pickup truck?", "choices": ["seven", "four", "five", "six"], "correct_choice_idx": 2, "direct_answers": ["two", "five", "two", "two", "five", "two", "two", "three", "three", "six"], "difficult_direct_answer": false, "rationales": ["There are two in the front and 3 past the bridge", "There are five yellow traffic lights hanging on the line.", "There are five lights."], "image": "val2014/COCO_val2014_000000539436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139721, "question_id": "7pJgxtyie8XLnYa6r3jdGD", "question": "What is in the packaging behind the cat to the right?", "choices": ["potato chips", "paper towels", "toilet paper", "paper napkins"], "correct_choice_idx": 3, "direct_answers": ["napkins", "napkins", "paper napkins", "napkins", "serviette", "napkins", "tissue", "napkins", "scissors", "napkins"], "difficult_direct_answer": false, "rationales": ["It has a square shape with white coloring.", "The packaging is napkins.", "These are white folded paper used to wipe mouths"], "image": "val2014/COCO_val2014_000000139721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280370, "question_id": "7pfYGPD7oGMadvbGd2wSo5", "question": "What continent contains the country referenced by the sign behind the cowboys?", "choices": ["africa", "south america", "europe", "north america"], "correct_choice_idx": 1, "direct_answers": ["usa", "central america", "south america", "south america", "south america", "usa", "south america", "south america", "south america", "south america"], "difficult_direct_answer": false, "rationales": ["The country name is clearly written and readable on the sign. the location of this country on a continent is commonly known.", "The continent is south america.", "The sign is in spanish."], "image": "val2014/COCO_val2014_000000280370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385627, "question_id": "7q55HH2Vyumry49GW5rmkM", "question": "What utensil is on the bottom right?", "choices": ["forks", "measuring spoons", "chopsticks", "spatulas"], "correct_choice_idx": 1, "direct_answers": ["measuring spoons", "measuring spoons", "spoon", "measuring spoons", "measuring spoon", "measuring spoons", "jug", "measuring spoon", "measuring spoon", "measuring spoon"], "difficult_direct_answer": false, "rationales": ["The utensil is a measuring spoon.", "Measuring spoons are in multiples and attached at their base.", "The utensil is for measuring."], "image": "train2014/COCO_train2014_000000385627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181348, "question_id": "7qdQjaV85FAQa7PnF7Bmge", "question": "What is on one of the tables?", "choices": ["phone", "baby", "axe", "samurai sword"], "correct_choice_idx": 0, "direct_answers": ["telephone", "phone", "phone", "television", "phone", "postcard", "phone", "phone", "telephone", "phone"], "difficult_direct_answer": false, "rationales": ["There is a device with buttons and a headset in which you can make calls with.", "There is a silver landline phone on the light brown table that is plugged in by wire.", "The table has a phone."], "image": "train2014/COCO_train2014_000000181348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472925, "question_id": "7r9HKkWK6Pqhr8k4aSHsoi", "question": "Which object is generating the most heat?", "choices": ["coffee mug", "stove", "toaster", "pan"], "correct_choice_idx": 1, "direct_answers": ["stove", "stove", "stove", "stove", "stove", "stove", "oven", "oven", "stove", "stove"], "difficult_direct_answer": false, "rationales": ["The object is the stove.", "The object is a stove.", "That is a place where you cook things which requires fire to cook the food."], "image": "train2014/COCO_train2014_000000472925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120088, "question_id": "7sGWrKTT4bEyDCXvqqKqkR", "question": "How many little elephants are around the tree with their families?", "choices": ["four", "three", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two elephants", "four", "two", "car", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are 2.", "There are two little elephants standing around the tree with their parents.", "There are two smaller elephants and two big ones."], "image": "train2014/COCO_train2014_000000120088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381393, "question_id": "7sfm8aKDZBmEU45B9ikS8n", "question": "What type of building is this bathroom likely to be in?", "choices": ["house", "business", "library", "school"], "correct_choice_idx": 1, "direct_answers": ["theater", "business", "school", "office", "tall", "wall", "office", "office", "kitchen", "commercial building"], "difficult_direct_answer": false, "rationales": ["The business building has a bathroom.", "The building is a business.", "The bathroom has multiple stalls for multiple people and looks professional."], "image": "val2014/COCO_val2014_000000381393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362391, "question_id": "7tcViv4fgVnXeCZJiYuZzM", "question": "What mountains are these?", "choices": ["appalachian", "rocky mountains", "pyrenees", "alps"], "correct_choice_idx": 3, "direct_answers": ["alps", "brambruesch mountain", "snowcapped", "swiss mountains", "brambruesch", "alps", "swiss alps", "alps", "snow", "chur"], "difficult_direct_answer": false, "rationales": ["The chur train station is located in switzerland so the best option is the mountains located in that country.", "The mountains are the alps.", "The mountains are the alps."], "image": "val2014/COCO_val2014_000000362391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260906, "question_id": "7uHFT3nGhWFSqL22ZtLGiW", "question": "What does the man in the foreground with the blue shirt have?", "choices": ["cane", "bike helmet", "clown nose", "long beard"], "correct_choice_idx": 3, "direct_answers": ["beard", "lanyard", "beard", "backpack", "pepsi", "pass", "backpack", "pepsi", "long beard", "beard"], "difficult_direct_answer": false, "rationales": ["The man has a beard.", "The hair that is growing on the mans face is known as a beard and in this mans case the length is what stands out.", "The man in the question is located based on the text of the question and his features are visible and identifiable consistent with answer a."], "image": "train2014/COCO_train2014_000000260906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321798, "question_id": "7uiGnQvqgntUq5EZ3XYYj7", "question": "What is in the food container?", "choices": ["gerbil", "hot dog", "cheeseburger", "toast"], "correct_choice_idx": 3, "direct_answers": ["kiwi", "toast", "breakfast", "macaroni", "breakfast", "bread", "bread", "food", "breakfast", "kiwi"], "difficult_direct_answer": false, "rationales": ["The container has toast.", "A tray of food has fruit and various items including bread that has a golden color to it. toasted bread is often golden brown.", "There are pieces of bread and eggs in the container. it is a breakfast meal. heated bread is for breakfast."], "image": "val2014/COCO_val2014_000000321798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561302, "question_id": "7uviR6saKiXxs3qif7xS5M", "question": "What item here would an artist use?", "choices": ["purple marker", "easel", "cat carrier", "smock"], "correct_choice_idx": 1, "direct_answers": ["blackboard", "easel", "easle", "easel", "paint board", "board", "white board", "easel", "easel", "easel"], "difficult_direct_answer": false, "rationales": ["The artist would use the easel.", "An artist uses an easel and an easel is standing in the room.", "The item is an upright support used for displaying or resting something on."], "image": "train2014/COCO_train2014_000000561302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157559, "question_id": "7v2iNSoCj7XtZgXrNePGRp", "question": "What color are the flowers nearest to the cows who are currently grazing?", "choices": ["purple", "yellow", "white", "red"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "brown", "green", "yellow", "white", "blue", "white", "white"], "difficult_direct_answer": false, "rationales": ["The colors on the side of the field are white.", "It looks like there might also be d, but a is most obvious and makes sense for either clover or meadow wildflowers.", "This is obvious in the scene."], "image": "val2014/COCO_val2014_000000157559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28560, "question_id": "7w85ANwoDgCSqgkfmwUt6D", "question": "Where does the person on the motorcycle usually work?", "choices": ["fire department", "school", "circus", "corporate office"], "correct_choice_idx": 2, "direct_answers": ["joker", "circus", "riding", "circus", "circus", "circus", "circus", "circus", "circus", "circus"], "difficult_direct_answer": false, "rationales": ["The person on the motorcycle is a clown.", "This is a clown and they are usually in the ring in a tent", "This is a clown"], "image": "train2014/COCO_train2014_000000028560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407235, "question_id": "7xDFzCD7ecA2Muvx6CNyaJ", "question": "What are these animals known for?", "choices": ["antenna", "wings", "antlers", "wool"], "correct_choice_idx": 3, "direct_answers": ["wool", "wool", "wool", "wool", "wool", "clothing", "sheep", "wool", "wool", "wool"], "difficult_direct_answer": false, "rationales": ["A large group of white animals with black faces and fluffy fur are grazing in grass together.", "The sheep have wool.", "There are many sheeps with long fur. they use the fur to make jackets."], "image": "val2014/COCO_val2014_000000407235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272176, "question_id": "7xFd7pjGgBHc6UdAR2AK8U", "question": "Whose first name can be found on the bottle?", "choices": ["joshua jackson", "bud selig", "mike sorrentino", "stella maeve"], "correct_choice_idx": 3, "direct_answers": ["stella", "stella", "stella artois", "stella artois", "stella", "stella", "stella", "stella", "stella maeve", "stella"], "difficult_direct_answer": false, "rationales": ["The name is stella maeve.", "The name stella is found on the bottle.", "A brand logo is on a beer bottle."], "image": "train2014/COCO_train2014_000000272176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472864, "question_id": "7ymGozJyWhP5fzZ55KXFzu", "question": "What color is the strange cow just ahead to the left?", "choices": ["brown", "white", "black", "gray"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "brown spots", "black", "white brown", "brown", "black", "one", "black", "spotted"], "difficult_direct_answer": false, "rationales": ["The color isn't black like the others, and is a mixture of red and yellow to make a soil type color.", "I wouldn't call it strange and it's all three options because of spots.", "It is lighter colored with spots"], "image": "val2014/COCO_val2014_000000472864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430525, "question_id": "7zHHETpsmV59sdHYWcpRrm", "question": "What color is the leather of the woman's purse who is sitting on the bench to the right?", "choices": ["white", "black", "cream", "tan"], "correct_choice_idx": 3, "direct_answers": ["tan", "brown", "brown", "brown", "brown", "white", "cowhide", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The woman that is sitting on the bench to the right has a purse made of tan leather.", "The woman's purse is tan leather.", "The color is tan."], "image": "val2014/COCO_val2014_000000430525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452565, "question_id": "7zeHb4YsxKNPtffH5j4WnS", "question": "What is the man on the left wearing?", "choices": ["pattu langa", "achkan", "sari", "hijab"], "correct_choice_idx": 1, "direct_answers": ["scarf", "pon", "tunic", "achkan", "indian garb", "dress", "kurta", "gurtha", "scarf", "scarf"], "difficult_direct_answer": false, "rationales": ["The man is wearing a mans button down long shirt whereas the other are women's or children's attire or headscarf.", "He is wearing a knee length type of coat.", "The man is wearing traditional clothing native to the area."], "image": "train2014/COCO_train2014_000000452565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276233, "question_id": "7zgauNkJ5trcuXPvMF3Y6F", "question": "What color are the shoes attached to the skis of this mountain ascending man?", "choices": ["black", "red", "blue", "purple"], "correct_choice_idx": 2, "direct_answers": ["blue green", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The color is easily visible and bright. it is in sharp contrast to the white snow.", "The color is blue.", "The color is blue."], "image": "train2014/COCO_train2014_000000276233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101749, "question_id": "82d45ABJ4aEyeWjuC427pE", "question": "What are the motorcycles on the right side next to?", "choices": ["white line", "wheelbarrow", "statue", "traffic cone"], "correct_choice_idx": 0, "direct_answers": ["motorcycles", "building", "white line", "scooty", "street", "white line", "white line", "white line", "white line", "street"], "difficult_direct_answer": false, "rationales": ["The bikes are by the line.", "The long markings are clearly visible. they are often found on the street to indicate where to park.", "The white line is nearby."], "image": "train2014/COCO_train2014_000000101749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273106, "question_id": "82nBpLZ2rAdWUbBBsLF6sa", "question": "What is the baby elephant called?", "choices": ["calf", "bullock", "kit", "colt"], "correct_choice_idx": 0, "direct_answers": ["calf", "calf", "calf", "calf", "calf", "calf", "calf", "calf", "calf", "calf"], "difficult_direct_answer": false, "rationales": ["The name is used for young herd mammals.", "A baby elephant is called the same name as a baby cow.", "The elephant is a calf."], "image": "train2014/COCO_train2014_000000273106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369594, "question_id": "842nR8WnfJuTqRY9RqJRYt", "question": "What is the type of stove cooktop called?", "choices": ["induction", "gas", "electric", "propane"], "correct_choice_idx": 2, "direct_answers": ["flattop", "flat top", "electric cooktops", "glass range", "electric", "electric", "electric stove", "flattop", "electric", "electric"], "difficult_direct_answer": false, "rationales": ["The type is electric.", "The stovetop is a flat surface meaning it runs on electricity.", "The top on the stove is very flat."], "image": "val2014/COCO_val2014_000000369594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321860, "question_id": "845ekaGAXQgPU8M5jgMCGV", "question": "Who does the person in the bed likely know?", "choices": ["matta huuri", "jung bong", "chester bennington", "susan floyd"], "correct_choice_idx": 2, "direct_answers": ["guitar", "linkin park", "play guitar", "guitar", "linkin park", "play guitar", "musician", "music", "chester bennington", "family"], "difficult_direct_answer": false, "rationales": ["The person has a lot of music posters.", "The person in the bed most likely knows chester because they both play guitars.", "They have posters for his band"], "image": "train2014/COCO_train2014_000000321860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213382, "question_id": "84BGLzSJK7jkq7ay2xYYPZ", "question": "What are the large structures?", "choices": ["ladders", "giraffes", "skyscrapers", "telephone poles"], "correct_choice_idx": 3, "direct_answers": ["phone poles", "phone poles", "telephone poles", "utility poles", "power lines", "buildings", "buildings", "buildings", "buildings", "buildings"], "difficult_direct_answer": false, "rationales": ["The structure is a pole.", "They look like columns.", "The tallest manmade objects are telephone poles."], "image": "train2014/COCO_train2014_000000213382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145447, "question_id": "85r6GAPsnpzVubSq2onPLR", "question": "What color is the train car in the center of the three cars?", "choices": ["red", "blue", "gray", "white"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "brown", "redwhite black", "red", "red", "red", "red", "dark orange", "red"], "difficult_direct_answer": false, "rationales": ["The other ones are red and black", "The color is red.", "The train car in between the other cars is painted a dark red color."], "image": "train2014/COCO_train2014_000000145447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217827, "question_id": "85stXgo4mYdTvrk9VRQwMG", "question": "What is the person on the left wearing?", "choices": ["sunglasses", "mask", "crown", "green shirt"], "correct_choice_idx": 0, "direct_answers": ["shorts", "shorts", "shorts", "shorts", "plaid shorts", "shorts", "pant", "sunglasses", "shorts", "pants"], "difficult_direct_answer": false, "rationales": ["There are many people wearing sunglasses and the guy is shirtless in front with hand in pocket.", "The person on the left is not wearing a shirt, mask, or crown. his eyes are covered.", "He isn't wearing a shirt, mask or crown."], "image": "val2014/COCO_val2014_000000217827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422215, "question_id": "85tfFckXNQ7cfoWQDgr3eG", "question": "What is the girl in pink wearing?", "choices": ["smock", "garbage bag", "dress", "lab coat"], "correct_choice_idx": 2, "direct_answers": ["dress", "dress", "dress", "dress", "dress", "dress", "dress", "dress", "dress", "dress"], "difficult_direct_answer": false, "rationales": ["A young girl is holding a kite. she is wearing a long pink and white piece of clothing.", "The little girl is wearing a dress purple in color.", "The girl is wearing a dress."], "image": "train2014/COCO_train2014_000000422215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160848, "question_id": "86EkvhS8CjsTZceLgWh539", "question": "What is parked next to the boat?", "choices": ["bicycle", "cat", "car", "baby"], "correct_choice_idx": 0, "direct_answers": ["bicycle", "bike", "bike", "bike", "bicycle", "bicycle", "lake", "bike", "bike", "bicycle"], "difficult_direct_answer": false, "rationales": ["It is a human powered vehicle with two wheels", "There is one tied to a pylon on the pier", "Bicycles have two wheels."], "image": "train2014/COCO_train2014_000000160848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97591, "question_id": "86tSAazbvuoaPdJLZa38JY", "question": "How many people are riding on the elephant walking through the brown water?", "choices": ["four", "five", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["This is obvious in the picture.", "There are two people.", "There are two people riding on top of this elephant."], "image": "train2014/COCO_train2014_000000097591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432153, "question_id": "876zHu4ACTRpNuGup7ZQmS", "question": "Which food on the plate is highest carbohydrates?", "choices": ["beef", "gravy", "potatoes", "broccoli"], "correct_choice_idx": 2, "direct_answers": ["mashed potatoes", "potatoes", "potatoes", "potatoes", "mashed potatoes", "potatoes", "mashed potatoes", "potatoes", "potatoes", "potato"], "difficult_direct_answer": false, "rationales": ["The white mass present on this plate is mashed potatoes. this vegetable is known for it's high carbohydrate content.", "This food is high in starch. the meat is high in protein.", "The food is potatoes."], "image": "train2014/COCO_train2014_000000432153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109145, "question_id": "89DBrxTtueXhwaPdeFEqRb", "question": "Using common sense what kind of books are stored here?", "choices": ["dictionaries", "cookbooks", "novels", "bibles"], "correct_choice_idx": 1, "direct_answers": ["cooking", "cookbooks", "cook books", "study book", "cookbooks", "cooking", "cookbooks", "stories", "files books", "novels"], "difficult_direct_answer": false, "rationales": ["This shelf is in the kitchen which means the books would fit the theme.", "This is in a kitchen and these books have recipes", "Common sense would show cookbooks are stored."], "image": "train2014/COCO_train2014_000000109145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349425, "question_id": "89f8GjKtPDjiihmTtxuFjc", "question": "What kind of fruit are these indicated by the color of the interior?", "choices": ["mandarin", "orange", "grapefruit", "lime"], "correct_choice_idx": 2, "direct_answers": ["grapefruit", "grapefruit", "grapefruit", "grapefruit", "orange", "orange", "grapefruit", "blood orange", "grapefruit", "orange"], "difficult_direct_answer": false, "rationales": ["The fruits are oranges.", "These are known as blood oranges because the flesh is so red", "Round fruits with orange rinds and a dark orange, almost red center are in a pile together."], "image": "train2014/COCO_train2014_000000349425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351566, "question_id": "8AhybYbHxuptrqu24BVNag", "question": "Which cat looks more comfortable?", "choices": ["left", "back", "right", "front"], "correct_choice_idx": 1, "direct_answers": ["both", "back", "back cat", "front one", "background", "in back", "both", "back", "back one", "front"], "difficult_direct_answer": false, "rationales": ["The back cat is more cozy.", "There are two cats. the closer one appears to be less comfortable.", "The back cat seems more comfortable because it's eyes are closed."], "image": "train2014/COCO_train2014_000000351566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397815, "question_id": "8BgShATnn3zKPx3vWNjRTp", "question": "What is the woman holding the umbrella wearing?", "choices": ["tiara", "crown", "boots", "necklace"], "correct_choice_idx": 2, "direct_answers": ["dress", "dress", "dress", "dress", "boots", "dress", "dress", "black dress", "protect", "black dress"], "difficult_direct_answer": false, "rationales": ["The woman is wearing rain boots because it is raining outside.", "The woman is wearing rain boots.", "A woman is wearing a shoewear on her that is suppose to protect her from rain."], "image": "train2014/COCO_train2014_000000397815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184241, "question_id": "8BxoarSQsh3MABL68jp2tw", "question": "What is the person with the racquets sitting on?", "choices": ["bed", "tree stump", "office chair", "bench"], "correct_choice_idx": 3, "direct_answers": ["bench", "player", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this scene. a is often used along the fencing for tennis courts.", "Benches are used for sitting.", "He is waiting to play."], "image": "val2014/COCO_val2014_000000184241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396467, "question_id": "8CV6QagLbPmZgCcsQZY7jK", "question": "How many sticks are arranged in a line before the salad and water?", "choices": ["two", "four", "three", "one"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["You use that many for eating a meal using chopsticks.", "There are 2.", "There are two sticks."], "image": "train2014/COCO_train2014_000000396467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255197, "question_id": "8DRzfUXGfbYQM7M6h7rLKV", "question": "How many zebras are standing in the forest with their noses pointed at the cameras?", "choices": ["four", "three", "one", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "four", "three", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are 2.", "They are facing forward.", "There are two zebras looking at the camera. they are easily counted."], "image": "train2014/COCO_train2014_000000255197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265080, "question_id": "8DtAYDC9uyUaHrAZrqnAHd", "question": "What state is the home team based in?", "choices": ["kansas", "missouri", "oklahoma", "texas"], "correct_choice_idx": 1, "direct_answers": ["missouri", "arizona", "missouri", "cali", "missouri", "missouri", "kansas", "kansas", "new york", "new york"], "difficult_direct_answer": false, "rationales": ["The home team are the kansas city royals. this kansas city is not in kansas.", "The state is missouri.", "The state is missouri."], "image": "train2014/COCO_train2014_000000265080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35844, "question_id": "8EhdTJNTN8pk3KjGqhtajQ", "question": "What company is known for using the largest vehicle here?", "choices": ["iams", "audi", "greyhound", "rca"], "correct_choice_idx": 2, "direct_answers": ["gas", "bsll", "greyhound", "company", "bus", "greyhound", "tour bus", "amtrak", "bus", "greyhound"], "difficult_direct_answer": false, "rationales": ["A large passenger bus is on the street. greyhound is a well known charter bus company.", "That said, their buses are a different color.", "The company is greyhound."], "image": "train2014/COCO_train2014_000000035844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108212, "question_id": "8F7sdVdXeqJBhqWLR2aiwV", "question": "Which object is most likely to be used to boil water?", "choices": ["oven", "cupboard", "cutting board", "teapot"], "correct_choice_idx": 3, "direct_answers": ["kettle", "teapot", "kettle", "kettle", "stove", "teapot", "kettle", "kettle", "teakettle", "kettle"], "difficult_direct_answer": false, "rationales": ["There is a red pot that is used to put water to boil water. it is sitting on white stove in kitchen.", "This is specific for the purpose of boiling water", "The tea kettle is the obvious answer. the other options don't work except b and the water would need to be inside of something."], "image": "val2014/COCO_val2014_000000108212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504591, "question_id": "8FYYkZdScNuPmctm5wdyzm", "question": "What is the small hexagonal object on the wall?", "choices": ["open door", "soap holder", "light", "safety bar"], "correct_choice_idx": 1, "direct_answers": ["shower handle", "shower", "nothing", "shower", "valve", "shower control", "faucet", "faucet knob", "soap holder", "shower handle"], "difficult_direct_answer": false, "rationales": ["There is a hexagonal soap holder on the shower wall.", "The item is a soap holder.", "The object is for soap."], "image": "train2014/COCO_train2014_000000504591.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77184, "question_id": "8FoWcAd3HyVSgTeyRVXuWq", "question": "What is at the apex of the boat?", "choices": ["flags", "cats", "balloons", "monkeys"], "correct_choice_idx": 0, "direct_answers": ["flag", "flag", "flags", "flag garland", "people", "flags", "watching", "flags", "flags", "sea"], "difficult_direct_answer": false, "rationales": ["On top of the boat is a banner of flags.", "There are several along the lines strung across the boat", "It has colorful fabric banners at the top."], "image": "val2014/COCO_val2014_000000077184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330894, "question_id": "8GG4Rd73deJ8mPNrfzjQgA", "question": "What destination resembles this place most?", "choices": ["ireland", "france", "beijing", "germany"], "correct_choice_idx": 2, "direct_answers": ["parking", "chinatown", "downtown", "beijing", "city", "thailand", "parking", "tokyo", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["There are signs in an asian language", "The businesses in the background have asian characters, showing it is city in asia.", "A busy and compact city street has scooters parked in a long line and businesses with signs with asian lettering on them."], "image": "train2014/COCO_train2014_000000330894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321079, "question_id": "8GWtDwB3nrM2HQ5G9rX576", "question": "What type of breakfast food is this on the plate?", "choices": ["waffle", "egg", "pancake", "biscuit"], "correct_choice_idx": 2, "direct_answers": ["cake", "pancakes", "pancakes", "cake", "pancake", "pancake", "pancakes", "pancakes", "pancake", "pancakes"], "difficult_direct_answer": false, "rationales": ["The pancake is plated.", "The breakfast is a pancake.", "This is a pancake with some whipped cream and banana slices."], "image": "val2014/COCO_val2014_000000321079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270593, "question_id": "8GitkceqypXPTzSJtLKxJk", "question": "How many birds are sitting on the side of the river bank?", "choices": ["three", "five", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["watching", "two", "two", "two", "two", "car", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two long necks coming up out of the grass", "The birds are white on the edge of the river.", "Both birds are next to one another."], "image": "val2014/COCO_val2014_000000270593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200500, "question_id": "8JLqHCNXVquDyYGMWi7vr6", "question": "What color is the soap in the clear container on top of the sink?", "choices": ["red", "yellow", "blue", "green"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "red", "red", "orange", "red", "pink", "orange", "silver", "clear"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The soap is the same color as an apple.", "It is the color of an apple"], "image": "train2014/COCO_train2014_000000200500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19980, "question_id": "8KvwXxHiYWaneg4whDxusT", "question": "Who is on the screen?", "choices": ["idris elba", "charlize theron", "howard dean", "tom hardy"], "correct_choice_idx": 2, "direct_answers": ["politician", "political candidate", "obama", "politician", "obama", "howard dean", "man", "person", "news", "candidate"], "difficult_direct_answer": false, "rationales": ["There is a politician on the laptop screen.", "That is the person shown on the screen.", "Howard dean is on the screen."], "image": "train2014/COCO_train2014_000000019980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417763, "question_id": "8Kx4Bko9hkxmbsDnJQQ6f7", "question": "What is the structure carrying these boats referred as?", "choices": ["bay", "river", "canal", "ocean"], "correct_choice_idx": 2, "direct_answers": ["canal", "boat carrier", "ferry", "ferry", "na", "tugboat", "dock", "ferry", "canal", "water"], "difficult_direct_answer": false, "rationales": ["The water is not natural, and is probably man made as evident by the straight edges.", "The boats are carried by a canal.", "It is a waterway between buildings."], "image": "train2014/COCO_train2014_000000417763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297257, "question_id": "8LbmSLsLZsBKkTEjZTudLr", "question": "How many paintings are framed on the wall where there is a door frame as well?", "choices": ["four", "two", "three", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one painting", "two", "one", "one", "one", "two", "two", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is 1.", "A framed picture is on a wall right next to the frame of a door.", "There is one painting framed in black on the side of the wall next to the office."], "image": "train2014/COCO_train2014_000000297257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495348, "question_id": "8LrQGoqPm5kuqhR6qssRBt", "question": "Where are these animals likely hanging out?", "choices": ["savanna", "tundra", "cave", "desert"], "correct_choice_idx": 0, "direct_answers": ["savanna", "savannah", "savannah", "africa", "africa jungle", "africa", "savannah", "prairie", "field", "serengeti"], "difficult_direct_answer": false, "rationales": ["The animals are in a dry plain with a lot of scrub.", "The zebras and giraffes are hanging out in the savannah.", "The animals include giraffes and zebras. these animals do not hang out in tundra, desert, or cave environments."], "image": "val2014/COCO_val2014_000000495348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448983, "question_id": "8P2mX3GjLMB9DQxY3XaZ6C", "question": "How many steeples are there on the top of this large church building?", "choices": ["three", "one", "six", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "five"], "difficult_direct_answer": false, "rationales": ["None of the other options match the visible steeples.", "There are three steeples.", "There are two smaller ones flanking a larger one"], "image": "val2014/COCO_val2014_000000448983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505757, "question_id": "8QM2o89ZVhLPpST5psmdjk", "question": "What kind of drink is inside of the coffee mug?", "choices": ["tea", "espresso", "water", "milk"], "correct_choice_idx": 1, "direct_answers": ["espresso", "hot chocolate", "coffee", "latte", "coffee", "coffee", "coffee mug", "coffee", "latte", "coffee"], "difficult_direct_answer": false, "rationales": ["This drink is usually drunk with breakfast, and the drink type tends to have cream often in designs added to it.", "It has a design in the milk", "By the picture it's plain to see that it is espresso."], "image": "train2014/COCO_train2014_000000505757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322212, "question_id": "8RAK2VXxnUsQc9GBSfHVy8", "question": "What store is known for selling the item the man with glasses on is eating?", "choices": ["dunkin donuts", "mcdonalds", "subway", "home depot"], "correct_choice_idx": 0, "direct_answers": ["donut shop", "dunkin donuts", "dunkin donuts", "mcdonalds", "dunkin donuts", "krispy cream", "bakery", "krispy kreme", "donut", "donut"], "difficult_direct_answer": false, "rationales": ["He's eating a chocolate sprinkle donut", "Dunkin' sells this item.", "The guy is eating a doughnut and one of the biggest names in doughnuts is dd."], "image": "train2014/COCO_train2014_000000322212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41570, "question_id": "8RzkenCE2MbdbCdAVfqK2d", "question": "What color is the bike that is parked on the side of the road with two children on it?", "choices": ["red", "silver", "black", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color is white.", "The bike is white in color and the kids are on it.", "The color matches the helmet."], "image": "val2014/COCO_val2014_000000041570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24061, "question_id": "8TuXfCKyKj5G987zRFxsSx", "question": "What color is the tissue box on the top of the toilet bowl?", "choices": ["red", "purple", "blue", "yellow"], "correct_choice_idx": 2, "direct_answers": ["gray", "blue brown", "blue", "green", "blue", "green", "blue", "green", "gray", "blue"], "difficult_direct_answer": false, "rationales": ["The tissue box is not yellow, purple, or red.", "A box with on top of a toilet is light green in color and has a round opening on top. tissue boxes have openings so that a tissue can be pulled out.", "The color of the box is blue."], "image": "val2014/COCO_val2014_000000024061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503832, "question_id": "8UCGYYL7Cq3nxr3bRSt7wR", "question": "What is closest to the giraffe?", "choices": ["baby eel", "hat", "zebra", "baby"], "correct_choice_idx": 2, "direct_answers": ["little boy", "zebra", "zebra", "car", "rock", "zebra", "zebra", "zebra", "zebra", "rocks"], "difficult_direct_answer": false, "rationales": ["The zebra is close.", "The question is unspecific, but answer a is a object that is visibly close to the giraffes in question.", "The closest thing to the giraffe is a white and black striped animal."], "image": "train2014/COCO_train2014_000000503832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482036, "question_id": "8URpaEiTYRN4qiwUzzjjYz", "question": "What player played the same position as this batter?", "choices": ["adam laroche", "derek jeter", "ryan howard", "joe mauer"], "correct_choice_idx": 1, "direct_answers": ["infielder", "escobar", "19", "baseball player", "batter", "escobar", "derek jeter", "phil", "escobar", "escobar"], "difficult_direct_answer": false, "rationales": ["The person who played in the batter position is named derek.", "The player is jeter.", "Derek jeter played."], "image": "train2014/COCO_train2014_000000482036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577876, "question_id": "8VE5ZBjzHpcSkP5uQnYLFh", "question": "What are the lights next to?", "choices": ["airplane", "car", "sink", "tree"], "correct_choice_idx": 2, "direct_answers": ["woman", "candles", "mirror", "mirror", "mirror", "mirror", "wall", "mirrors", "mirror", "sink"], "difficult_direct_answer": false, "rationales": ["This is a bathroom", "The lights are by the sink.", "The lights are inside a bathroom. there are no trees or vehicles."], "image": "train2014/COCO_train2014_000000577876.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46169, "question_id": "8Xg5KvoVWC68WjMMK7FXNa", "question": "What kind of water body is most likely is this boat serviced for?", "choices": ["ocean", "sea", "river", "lake"], "correct_choice_idx": 1, "direct_answers": ["pure water", "river", "lake", "sea", "river", "river", "river", "river", "river", "lake"], "difficult_direct_answer": false, "rationales": ["This boat is a ferry. it is too big to travel on a river or lake and is too small to travel on an ocean.", "The body is the sea.", "The boat is likely to sail on a small sea."], "image": "train2014/COCO_train2014_000000046169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1342, "question_id": "8Xw7wxusE43WB9pxCMJ42K", "question": "What kind of object is to the front of this strange train?", "choices": ["smokestack", "propeller", "face", "engine"], "correct_choice_idx": 1, "direct_answers": ["propeller", "propeller", "propeller", "propeller", "propeller", "propeller", "propeller", "airplane", "propeller", "bookshelf"], "difficult_direct_answer": false, "rationales": ["The object is a propeller.", "The object propels.", "Propeller is on top of the page."], "image": "val2014/COCO_val2014_000000001342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443883, "question_id": "8YidgLkreG2hACaDjrA73Y", "question": "What are the cows doing?", "choices": ["walking", "laying down", "playing soccer", "running"], "correct_choice_idx": 1, "direct_answers": ["lying down", "sleeping", "laying", "resting", "laying", "resting", "laying down", "sitting", "sitting down", "sitting"], "difficult_direct_answer": false, "rationales": ["The cows are laying down in the field of grass.", "The cows are resting.", "They aren't supported by their legs"], "image": "train2014/COCO_train2014_000000443883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474268, "question_id": "8axiVYLWYXATRBC5yhygdV", "question": "What is on the floor?", "choices": ["antelope", "surfboard", "pumpkin", "egg sandwich"], "correct_choice_idx": 1, "direct_answers": ["grass", "sand", "sand", "sand", "surf boards", "sand", "surfboard", "sand", "surf board", "sand"], "difficult_direct_answer": false, "rationales": ["There is a surfboard lying on the sand in front of the people at the beach.", "The people are on a beach. there are no food items or non-human animals on the floor.", "They are on the beach which means they are next to the ocean which they can surf on."], "image": "train2014/COCO_train2014_000000474268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208927, "question_id": "8bcE68ERtimbNMsqo8axeA", "question": "What country's flag requires three of the four colors found on the bus?", "choices": ["greece", "turkey", "united kingdom", "brazil"], "correct_choice_idx": 2, "direct_answers": ["france", "united states", "country", "united states", "bsll", "no", "united kingdom", "american", "america", "america"], "difficult_direct_answer": false, "rationales": ["The front of each bus is red, white, blue, and black. brazil's flag does not have red, greece's flag also does not have red, and turkey's flag does not have blue.", "The uk's flag requires the colors.", "The colors of the uk flag are red, white and blue."], "image": "train2014/COCO_train2014_000000208927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382784, "question_id": "8bpnorpGXKjb8ZHPTXeXx7", "question": "What color of baggage is containing the booster seats for car riding on the flight return?", "choices": ["black", "red", "white", "purple"], "correct_choice_idx": 0, "direct_answers": ["black", "brown", "black", "black", "black", "black", "black", "black", "black", "white"], "difficult_direct_answer": false, "rationales": ["The color is black.", "The black bag on the left is the one that contains the booster seat.", "There is a black bag around the booster seat."], "image": "train2014/COCO_train2014_000000382784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372702, "question_id": "8c8MNXUWgjPWacvB9uQJPN", "question": "The sticker attached at the bottom of the sink is of what color?", "choices": ["pink", "blue", "red", "orange"], "correct_choice_idx": 1, "direct_answers": ["blue", "blue", "baby blue", "cup", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The sticker is blue.", "The other options aren't on the sink.", "There is a blue sticker at the base of the sink."], "image": "train2014/COCO_train2014_000000372702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39698, "question_id": "8eGzwKvmwaBXki93m2mWY3", "question": "What is typically found on the place where the potted plant is resting on?", "choices": ["car battery", "cutting board", "tiger", "laptop"], "correct_choice_idx": 1, "direct_answers": ["coffee pot", "counter", "soap", "dishes", "cutting board", "soap dispenser", "dirty dishes", "canisters", "pots", "good"], "difficult_direct_answer": true, "rationales": ["A cutting board is typically there.", "The board is found.", "Ther potted plant is resting on the counter between the stove and the sink. only one of the choices is an item usually found in a kitchen."], "image": "train2014/COCO_train2014_000000039698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197318, "question_id": "8h3H6VxNSizRWnXHeC4Xd6", "question": "What color is the head of the cow who is grazing right on the wooden bridge?", "choices": ["black", "brown", "gray", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "black", "white", "black", "white", "white", "brown white", "brown white", "white"], "difficult_direct_answer": false, "rationales": ["The cow has white on his brown head.", "The only cow who is grazing has a white on its head", "This is obvious in the picture. it's also common with cattle."], "image": "train2014/COCO_train2014_000000197318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221620, "question_id": "8hCQxdbQ34afgz7KNWNfWy", "question": "What does the top half of the donut on the bag's design represent?", "choices": ["maze", "swimming pool", "target", "rainbow"], "correct_choice_idx": 3, "direct_answers": ["donut", "rainbow", "cream", "rainbow", "rainbow", "rainbow", "rainbow", "rainbow", "rainbow", "rainbow"], "difficult_direct_answer": false, "rationales": ["The top has a rainbow.", "The bag has a donut on it and the top half of it resembles the arc of a rainbow.", "The top half of the donut in question on the bag has bands of color in a rounded arch."], "image": "val2014/COCO_val2014_000000221620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168332, "question_id": "8hhkhMqmno8g2VRvxVe3k3", "question": "What body part do humans and elephants have that is most similar?", "choices": ["ears", "trunk", "eyes", "tusks"], "correct_choice_idx": 2, "direct_answers": ["eye", "eyes", "friendly", "itelligence", "eyes", "na", "face", "eyes", "tusk", "eyes"], "difficult_direct_answer": false, "rationales": ["The body part is the eye.", "The part is the eye.", "Elephants and humans both have eyes that resemble each other."], "image": "train2014/COCO_train2014_000000168332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310716, "question_id": "8hmdHGwnZFcESgtnfrgcsp", "question": "What is fully visible on the animal on the left?", "choices": ["horn", "wing", "tusk", "tail"], "correct_choice_idx": 3, "direct_answers": ["tail", "alzeebra", "tail", "tail", "tail", "tail", "zebra", "tree", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["It's flicking this upward", "This is hanging from the rear of the animal.", "The animal has a tail."], "image": "train2014/COCO_train2014_000000310716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134351, "question_id": "8iPTbjby6BVzjHfgpJKScd", "question": "What object makes this black and white photo confusing?", "choices": ["outfits", "trash can", "pots", "smartphone"], "correct_choice_idx": 3, "direct_answers": ["cell phone", "phone", "smartphone", "smartphone", "empty pots", "phone", "phone", "smartphone", "cell phone", "cellphone"], "difficult_direct_answer": false, "rationales": ["The object is the phone.", "It seems out of place with black and white because historically b&w photos were because of a lack of color processing many years ago.", "The photo is black and white which makes it look like a very old picture but the smartphone makes it seem not as old."], "image": "train2014/COCO_train2014_000000134351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279920, "question_id": "8j26wer4UhKyJMVWcsf3zf", "question": "Where are these animals most likely to be found in their natural habitat?", "choices": ["russia", "new brunswick", "new york", "africa"], "correct_choice_idx": 3, "direct_answers": ["wild", "zoo", "savannah", "zoo", "africa", "forest", "africa", "grassland", "africa", "game park"], "difficult_direct_answer": false, "rationales": ["The animals are from africa.", "Giraffes are known to live in africa.", "A group of giraffes are standing together."], "image": "train2014/COCO_train2014_000000279920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247493, "question_id": "8kJoMPG6juEha2uBL3GE99", "question": "Why is the sky so hazy?", "choices": ["fog", "fire", "magic trick", "factory smoke"], "correct_choice_idx": 0, "direct_answers": ["mist", "clowd", "fog", "sunny", "weather", "snow", "humid", "fog", "snow", "clouds"], "difficult_direct_answer": false, "rationales": ["A path with trees behind a haze in the air is lined with a fence and a bench on the other side.", "The clouds dip down low in the early morning", "An otherwise sunny area is covered in low hanging, gray mist."], "image": "train2014/COCO_train2014_000000247493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548291, "question_id": "8kpD9QAZU4ngJ6tAHBSBPd", "question": "What are the birds ducking underneath of the giraffe?", "choices": ["goose", "chicken", "ostrich", "duck"], "correct_choice_idx": 2, "direct_answers": ["wings", "ostrich", "tree", "ostrich", "ostrich", "ostriches", "ostriches", "fence", "eating", "ostrich"], "difficult_direct_answer": false, "rationales": ["There are big ostrich birds underneath the giraffe.", "The birds are big animals that do not fly and thus match the animal in option a.", "The birds are large and flightless with black and white feathers."], "image": "train2014/COCO_train2014_000000548291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575978, "question_id": "8m2C2YyjgpGzKckebUtjWb", "question": "What is the purpose of the substance in the white and pink can?", "choices": ["clean hair", "curl hair", "hold hair", "slick hair"], "correct_choice_idx": 2, "direct_answers": ["hairspray", "hairspray", "hairspray", "hair spray", "hairspray", "hold hair", "fix hair", "hair spray", "hairspray", "hair spray"], "difficult_direct_answer": false, "rationales": ["It's hairspray which keeps hair from moving", "The purpose of this substance is to hold one's hair together.", "The purpose holds hair."], "image": "val2014/COCO_val2014_000000575978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72216, "question_id": "8nkZzqQJt2MbGsn3x6QMha", "question": "What are the elephants under?", "choices": ["balloons", "wooden structure", "airplane", "umbrellas"], "correct_choice_idx": 1, "direct_answers": ["grass", "roof", "shed", "wooden structure", "shed", "awning", "roof", "shed", "overhang", "shed"], "difficult_direct_answer": false, "rationales": ["They are under a wooden structure.", "There are no umbrellas or flying vehicles. the elephants are under a building.", "It has a roof and is made from logs"], "image": "train2014/COCO_train2014_000000072216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148375, "question_id": "8nrNpNB4Ktp4ULPw2ZxtFJ", "question": "How many toilet cases are on the curb of this house's driveway?", "choices": ["one", "three", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "road", "three", "three", "three", "three", "car", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "Several toilets are lined up near the street in front of a house.", "There are three cases."], "image": "train2014/COCO_train2014_000000148375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424799, "question_id": "8oBz67VTbDv4d5BJq63GSG", "question": "What is the fur of the animal in this image commonly used for?", "choices": ["carpets", "cars", "yard work", "weapons"], "correct_choice_idx": 0, "direct_answers": ["shirts", "socks", "bag", "wool", "blankets", "jackets", "clothes", "carpets", "sweaters", "wool"], "difficult_direct_answer": true, "rationales": ["The fur is like a carpet.", "The animals in the image are sheep. their fur is too soft to be used for yard work, weapons, or cars.", "Sheep are sourced for wool. wool is often used in carpets."], "image": "val2014/COCO_val2014_000000424799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479620, "question_id": "8sxeAFxWydveQ4wnFGfyjY", "question": "What is on top of the horse?", "choices": ["old man", "cat", "girl", "bird"], "correct_choice_idx": 2, "direct_answers": ["lady", "woman", "girl", "rider", "person", "rider", "person", "rider", "man", "jockey"], "difficult_direct_answer": false, "rationales": ["She is the rider obviously.", "The person is riding the horse.", "A person, not a non-human animal, is on top of the horse. the person is not an old man."], "image": "train2014/COCO_train2014_000000479620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350289, "question_id": "8tuPNqR6CLxbHQ7Ua24hFd", "question": "The person on the left is getting ready to do what?", "choices": ["surf", "get married", "sleep", "eat"], "correct_choice_idx": 0, "direct_answers": ["surf", "surf", "surf", "surf", "surf", "surf", "riding", "surf", "surf", "go surfing"], "difficult_direct_answer": false, "rationales": ["The people are carrying surfboards so he is getting ready to surf.", "Given the man on the left is walking towards the ocean with a surfboard at his side we can presume they will be surfing soon.", "The person on the left is walking on the beach and is carrying a board."], "image": "val2014/COCO_val2014_000000350289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160110, "question_id": "8ug3qpjEUMLUPm9SREvZLT", "question": "How many zebras are standing near the rocks to the left of the dirt road?", "choices": ["two", "five", "three", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["One zebra is standing beside another zebra.", "The number of animals match the number in option a.", "There are two zebras standing on the grass, the giraffe is standing on the road."], "image": "train2014/COCO_train2014_000000160110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483424, "question_id": "8vKTbjEhjg8ScuTZX9in8D", "question": "What kind of water body holds the large number of rowboats?", "choices": ["river", "lake", "sea", "ocean"], "correct_choice_idx": 1, "direct_answers": ["lake", "ocean", "ferries", "backwater", "lake", "lake", "lake", "lake", "lake", "lake"], "difficult_direct_answer": false, "rationales": ["The body of water seems to be of the size consistent with answer a based on the visible boundaries.", "Mountains surround a large body of water with shores visible on both sides.", "The body of water is surrounded by mountains so it's not as large as an ocean"], "image": "train2014/COCO_train2014_000000483424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479390, "question_id": "8vpkCQMWzzKNAexnW5vfKG", "question": "Which fruit on this plate is lowest in calories?", "choices": ["strawberry", "oranges", "kiwi", "pineapple"], "correct_choice_idx": 0, "direct_answers": ["kiwi", "kiwi", "kiwi", "strawberries", "strawberries", "strawberry", "strawberry", "kiwi", "kiwi", "kiwi"], "difficult_direct_answer": false, "rationales": ["This is the answer according to google results.", "This is the correct answer per google. that said, some sites refer to b as the right answer.", "The strawberries are the most low in calories."], "image": "train2014/COCO_train2014_000000479390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51894, "question_id": "8w4h7CCKAGuzHhB8JDLq8s", "question": "What character resembles the doll?", "choices": ["garfield", "teddy ruxpin", "crash bandicoot", "papa smurf"], "correct_choice_idx": 1, "direct_answers": ["bear", "ted", "teddy bear", "bear", "poo bear", "bear", "bear", "teddy ruxpin", "bear", "bear"], "difficult_direct_answer": false, "rationales": ["The doll is a stuffed bear. it is not a cat, bandicoot, or smurf.", "The character is a teddy.", "(a) teddy ruxpin. they don't look a lot alike, but they both look like teddy bears."], "image": "train2014/COCO_train2014_000000051894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161664, "question_id": "8wHLZ8YHztzvNVkMJwXbQ8", "question": "Who did a similar type of activity to this person?", "choices": ["moms mabley", "tj lavin", "carrot top", "joy behar"], "correct_choice_idx": 1, "direct_answers": ["skateboarder", "ski", "tony hawk", "shaun white", "skateboarder", "tony hawk", "tony hawk", "tony hawk", "shaun white", "tj lavin"], "difficult_direct_answer": false, "rationales": ["This person is skateboarding, not performing comedy or acting. boxing is similar to skateboarding.", "Tj lavin did something similar.", "The person is skateboarding, not performing comedy or acting."], "image": "train2014/COCO_train2014_000000161664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363525, "question_id": "8xtr7p7ucHYALGnsh9oGAK", "question": "What kind of fruits are piled on the confectionary treat behind the glass at the corner?", "choices": ["mango", "raspberry", "blueberry", "strawberry"], "correct_choice_idx": 3, "direct_answers": ["blueberry", "strawberry", "strawberry", "strawberries", "strawberries", "strawberry", "strawberries", "strawberry", "strawberry", "strawberry"], "difficult_direct_answer": false, "rationales": ["This fruit is red and triangle.", "The fruits are of a size, shape and color consistent with answer a and no other fruit.", "The fruit is strawberry."], "image": "train2014/COCO_train2014_000000363525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309100, "question_id": "925mZR7zBrkzzJNxnmAM2d", "question": "How many zebras are standing in front of the watering hole together?", "choices": ["five", "three", "one", "two"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "two", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["This is obvious by just counting the number.", "There is a trio of the black and white striped animals present.", "There is one zebra on the left. two other zebras are on the right."], "image": "val2014/COCO_val2014_000000309100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30699, "question_id": "92F9cVJQtYBaTXFMLMPhww", "question": "What is shielding the boy?", "choices": ["knights shield", "umbrella", "tree", "backpack"], "correct_choice_idx": 1, "direct_answers": ["umbrella", "umberalla", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["He's obviously holding this in his hands to shield himself.", "The boy is standing next to the fire truck with an umbrella that he uses to shield himself from the water.", "The boy is holding something that is made out of a light material having a convex shape."], "image": "train2014/COCO_train2014_000000030699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540762, "question_id": "92FV9ZJafQhZDPCvssfuqR", "question": "What color is the circular dais in the middle of the ancient tower?", "choices": ["blue", "white", "gray", "red"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "blue", "blue", "gold", "blue", "blue", "blue", "circular", "blue"], "difficult_direct_answer": false, "rationales": ["The color is blue.", "The color of the circular dais is blue.", "The center is bright in color and representative of this color. it can be seen in contrast to the brown around it."], "image": "val2014/COCO_val2014_000000540762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348370, "question_id": "92KV3ekVAgzJ4AvLLxVKQA", "question": "What do most of the people have on their heads?", "choices": ["rubber bands", "birds", "goggles", "crowns"], "correct_choice_idx": 2, "direct_answers": ["goggles", "glasses", "goggles", "goggles", "helmet", "goggles", "helmet", "goggles", "glasses", "goggles"], "difficult_direct_answer": false, "rationales": ["They have goggles in order to protect their eyes on the snow slopes", "The people have skis and snowboards. they are wearing head and eye protection.", "When looking at the ends of the people for similarities, all of them appear to have answer a making it most common and an accurate answer."], "image": "train2014/COCO_train2014_000000348370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294284, "question_id": "943D5SAA4wruxpfu5VrGjB", "question": "What direction are the animals headed?", "choices": ["west", "east", "south", "north"], "correct_choice_idx": 1, "direct_answers": ["right", "right", "right", "right", "right", "felt", "right", "right", "rightward", "east"], "difficult_direct_answer": false, "rationales": ["The direction is east.", "They are walking to the right.", "We are looking at a view of elephant herds heading to the right."], "image": "val2014/COCO_val2014_000000294284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257558, "question_id": "94dBdjikxUenkkYCKvNyHp", "question": "How many teddies are in big clear plastic bags on top of the pile?", "choices": ["four", "three", "one", "two"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "three", "three", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["This seems to be the correct answer. there are two more something, possibly bears and possibly not, at the back as well.", "There are four teddy bears contained by plastic bags.", "There are four teddies."], "image": "train2014/COCO_train2014_000000257558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139120, "question_id": "94vA22Fb9yWHHXUn2rpASv", "question": "Who played for the team whose logo appears on the shirt?", "choices": ["otis nixon", "albert pujols", "larry bigbie", "mike trout"], "correct_choice_idx": 2, "direct_answers": ["orioles", "cal ripkin", "baseball player", "matt harvey", "cal ripken", "diamond buford", "cal ripken", "larry bigbie", "orioles", "basketball player"], "difficult_direct_answer": false, "rationales": ["Larry bigbie played for this team.", "Larry bigbie played for the orioles.", "The player is a famous player for the team."], "image": "train2014/COCO_train2014_000000139120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37956, "question_id": "95G3o3dJDAuLmGuxoq84dt", "question": "What is the weather like?", "choices": ["sunny", "stormy", "rainy", "snowy"], "correct_choice_idx": 0, "direct_answers": ["chilly", "sunny", "winter", "nice", "somewhat cloudy", "sunny", "sunny", "cold", "sunny", "warm"], "difficult_direct_answer": false, "rationales": ["There is a motorcyclist. he is using dark glasses to cover his eyes.", "If the other options applied, there would be obvious wet elements. the gray sky may indicate an upcoming storm, d, but it hasn't arrived yet.", "There is a shadow underneath the motorbike. this means the sun must be shining."], "image": "train2014/COCO_train2014_000000037956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83269, "question_id": "95kF9SbHHKjhZUo8veQWVa", "question": "Why is the bus on the boat?", "choices": ["was ferried", "evidence", "broken down", "accident"], "correct_choice_idx": 0, "direct_answers": ["transport", "cross river", "ferry", "ferry", "across water", "was ferried", "water crossing", "transport", "transport", "ferry"], "difficult_direct_answer": false, "rationales": ["The bus is going over the water.", "The bus was ferried.", "The bus is using the boat as a ferry in order to get to the other side of the river."], "image": "train2014/COCO_train2014_000000083269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277496, "question_id": "975aHTqRxzvLWWyBxxu7nj", "question": "What is on the opposite wall of the sink mirror?", "choices": ["mirror", "exit door", "tapestry", "shower"], "correct_choice_idx": 0, "direct_answers": ["lamp", "mirror", "mirror", "toilet", "towel", "another mirror", "mirror", "tub", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["There is a round one reflected in the rectangular one.", "There is a mirror.", "There is another mirror in the reflection."], "image": "train2014/COCO_train2014_000000277496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559271, "question_id": "97DwrXa8XtYXpmBXpHkFZi", "question": "What is very large here?", "choices": ["ears", "eyes", "talons", "wings"], "correct_choice_idx": 0, "direct_answers": ["elephants", "ears", "tree", "ears", "elephants", "elephants", "elephant", "elephants", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The elephant ears are large.", "Of the provided answers, answer a is the most apparently and anachronistically large.", "Elephants are animals that are known for their big ears."], "image": "train2014/COCO_train2014_000000559271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519477, "question_id": "97Gq9H6TVVagpot2cxbS6n", "question": "What is the baby sitting holding?", "choices": ["apple", "pumpkin", "toothbrush", "his foot"], "correct_choice_idx": 2, "direct_answers": ["toy", "toothbrush", "toothbrush", "straw", "toothbrush", "brush", "toothbrush", "bathing", "brush", "toothbrush"], "difficult_direct_answer": false, "rationales": ["The baby is holding the handle of a purple and yellow toothbrush.", "Based on this setting and the size and shape of the item, answer a is most likely, especially as the other answers are not possible.", "The baby has a brush."], "image": "train2014/COCO_train2014_000000519477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186412, "question_id": "99B7GAnQzEtS5v7h3rzEfz", "question": "What is this type of room known as?", "choices": ["den", "garage", "office", "kitchen"], "correct_choice_idx": 1, "direct_answers": ["garage", "garage", "garage", "garage", "garage", "garage", "garage", "garage", "garage", "garage"], "difficult_direct_answer": false, "rationales": ["This room in the house is usually used to park cars.", "There is a door on the top.", "The area is intended to hold cars, and has a door that folds upwards."], "image": "val2014/COCO_val2014_000000186412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156397, "question_id": "99nZ7KbnyTe9xw4yx48VZ8", "question": "What are some of the sheep surrounded by?", "choices": ["hay", "sheep", "cows", "bars"], "correct_choice_idx": 3, "direct_answers": ["bars", "rail", "gate", "sheep", "fog", "farmland", "sheep", "fence", "field", "tree"], "difficult_direct_answer": true, "rationales": ["The sheep are surrounded by bars.", "There is a metal cage in the midst of the sheep.", "The sheep have bars."], "image": "val2014/COCO_val2014_000000156397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70334, "question_id": "9A3wg9vyi2WbD7wGjGPnd8", "question": "How many little baby lambs are near their parent on the top of the rock?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "three", "two", "one", "two", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["There are three sheep, and only one of them is an adult.", "There are 3 total, but 2 are smaller and therefore are the lambs.", "There are 2 lambs."], "image": "val2014/COCO_val2014_000000070334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183766, "question_id": "9B4UQ7s2rfVt2crMTPoQkZ", "question": "What will the food be eaten with?", "choices": ["fork", "pizza cutter", "spoon", "chopstick"], "correct_choice_idx": 0, "direct_answers": ["hands", "cake", "fork", "fork", "fork", "forks", "fork", "fork", "person", "fork"], "difficult_direct_answer": false, "rationales": ["The food is already sitting on this utensil.", "A person would eat the food with the tired utensil.", "The food will be eaten with a fork."], "image": "val2014/COCO_val2014_000000183766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346272, "question_id": "9BazCffNYzhDVKK2YA5A6t", "question": "Which video game system is currently in use by the man in this photo?", "choices": ["gamecube", "nintendo switch", "nintendo wii", "playstation"], "correct_choice_idx": 2, "direct_answers": ["nintendo wii", "stick", "ball", "wii", "nintendo wii", "wii", "nintendo wii", "nintendo wii", "wii", "nintendo wii"], "difficult_direct_answer": false, "rationales": ["The man on the couch is holding a controller that is used to play nintendo wii games.", "The motion controller in the man's hands is iconic to the video game console.", "The wii is the system."], "image": "train2014/COCO_train2014_000000346272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456462, "question_id": "9CGzhJjHYFjTS5DbvqFikU", "question": "What is the primary color of the skis carried on the back of the man following the man?", "choices": ["black", "green", "red", "yellow"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "man", "green", "green", "black", "black", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The skis are not yellow, black, or red.", "The skiis are a green color.", "The color is green."], "image": "train2014/COCO_train2014_000000456462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50952, "question_id": "9D6DKDMbubUFNKz55TLNz7", "question": "What color is the interior of the boat evidently with no people inside of it?", "choices": ["white", "green", "red", "blue"], "correct_choice_idx": 3, "direct_answers": ["car", "blue", "blue", "blue", "blue", "blue", "blue", "stop", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The color is blue.", "The answer is obvious given the other options and the obvious color.", "The vacant simple boat has a pretty color that is similar to sky."], "image": "val2014/COCO_val2014_000000050952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495666, "question_id": "9D8qrwLrHE5zE2PTEds6Q2", "question": "What is the largest number on the yellow tags?", "choices": ["778", "403", "10562", "9748"], "correct_choice_idx": 3, "direct_answers": ["9748", "9748", "248", "ninethousand forty-eight", "9748", "left", "9748", "9748", "9748", "9748"], "difficult_direct_answer": false, "rationales": ["There are two yellow tags, one is 248 the other is 9748. thousand is a higher number than hundred.", "9748 is the largest digit with the most numbers.", "The numbers are easy to see in black"], "image": "train2014/COCO_train2014_000000495666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149304, "question_id": "9D9qUxT4AHnAcdUTcNAJUe", "question": "Where in this picture would one clean their feet?", "choices": ["toilet", "tub", "sink", "window"], "correct_choice_idx": 1, "direct_answers": ["bathtub", "tub", "tub", "bathtub", "tub", "bathtub", "bathtub", "bathrub", "tub", "bathtub"], "difficult_direct_answer": false, "rationales": ["Someone can get into the basin to wash feet.", "People can't put their feet in a sink. a tub would run water.", "The picture is the tub."], "image": "val2014/COCO_val2014_000000149304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568587, "question_id": "9DWfgstnNEotfp28uZUp7Z", "question": "What is the woman looking down at?", "choices": ["game boy", "book", "cell phone", "watch"], "correct_choice_idx": 2, "direct_answers": ["mobile", "mobile phone", "phone", "phone", "cell phone", "phone", "phone", "cell phone", "cellphone", "cell phone"], "difficult_direct_answer": false, "rationales": ["She is texting.", "It's obvious in the photo and by how she's standing.", "She appears to be texting someone."], "image": "val2014/COCO_val2014_000000568587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294437, "question_id": "9DzUi4ZKbid9Qn2xbNnsHK", "question": "What brand of manufacturer marks this small silver laptop?", "choices": ["hp", "lenovo", "dell", "apple"], "correct_choice_idx": 2, "direct_answers": ["dell", "dell", "dell", "dell", "tell", "dell", "dell", "dell", "dell", "dell"], "difficult_direct_answer": false, "rationales": ["The manufacturer's logo is in between the keyboard and screen.", "The name comes from how it looks like.", "The laptop has the logo of the company between the keyboard and screen."], "image": "val2014/COCO_val2014_000000294437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493004, "question_id": "9EVzJjrrYyfkpeYDmDR2fM", "question": "What does the person in the foreground have on?", "choices": ["armor", "bandana", "goggles", "crown"], "correct_choice_idx": 2, "direct_answers": ["goggles", "skateboard", "jacket", "snow gear", "goggles", "goggles", "goggles", "backpack", "snowboard", "backpack"], "difficult_direct_answer": false, "rationales": ["The person in the foreground is skiing. they are wearing something on their face to stop the snow from hitting their face.", "The other options aren't in the image.", "The person has goggles."], "image": "val2014/COCO_val2014_000000493004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482265, "question_id": "9F3EMYtGSpHZ5HRezjx3pB", "question": "Which food is highest in protein on this plate?", "choices": ["chicken", "beets", "rice", "carrots"], "correct_choice_idx": 0, "direct_answers": ["chicken", "beats", "chicken", "chicken", "beets", "chicken", "chicken", "chicken", "meat", "meat"], "difficult_direct_answer": false, "rationales": ["It is meat which has a lot of protein", "The chicken breast is a bigger portion than the other food items on the plate, resulting in a higher protein count.", "The chicken on the plate is meat which is higher in protein than any of the foods on the plate."], "image": "train2014/COCO_train2014_000000482265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366094, "question_id": "9FBB6ZTZgkqQLYqyTx37Rp", "question": "What kind of meat is lining the side of this Tupperware container?", "choices": ["salmon", "chicken", "ham", "turkey"], "correct_choice_idx": 2, "direct_answers": ["ham", "ham", "ham", "dish", "dead", "ham", "ham", "ham", "veg", "ham"], "difficult_direct_answer": false, "rationales": ["The meat is ham.", "The container has a pinkish meat.", "The meat is in a pink color."], "image": "train2014/COCO_train2014_000000366094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342417, "question_id": "9FczJVGyQNS2QkCE9bko7Q", "question": "What color is the small What color is the recliner in between the sofas in the living room?", "choices": ["green", "black", "white", "brown"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "white", "white", "white", "tan", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["It has the white claimer in its appearing.", "The color is white.", "The recliner between the two sofas is white."], "image": "train2014/COCO_train2014_000000342417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107519, "question_id": "9Foonef58igRhpTa7TUQEr", "question": "What kind of meat is likely sitting on top of the beans and potatoes on top of the plate?", "choices": ["salmon", "pork", "beef", "chicken"], "correct_choice_idx": 2, "direct_answers": ["stop", "beef", "beef", "steak", "pork", "red", "beef", "beef", "pork", "lamb"], "difficult_direct_answer": false, "rationales": ["The meat is beef.", "The meat is located based on the text of the question and is of a color, consistency and preparation that would make answer a most likely.", "The meat in the middle is brown tenderized on the outside and in the middle is reddish."], "image": "train2014/COCO_train2014_000000107519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329258, "question_id": "9GT8oCgs4LDyxmxAirU6yh", "question": "What word is the person in the air most familiar with?", "choices": ["enzuiguiri", "kickflip", "quark", "ad hoc"], "correct_choice_idx": 1, "direct_answers": ["play", "skating", "skating", "skateboarding", "skateboarding", "skateboard", "skateboarding", "skate", "skateboarding", "kickflip"], "difficult_direct_answer": false, "rationales": ["There is only one word that fits in the world of skateboarding and that is the work that starts with a \"k\".", "The person is doing a kickflip.", "The word is kickflip."], "image": "val2014/COCO_val2014_000000329258.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65557, "question_id": "9GsrTp5CHBw4wCV7psjpdw", "question": "What is the expression on the face to the right of the pie?", "choices": ["confused", "euphoric", "happy", "angry"], "correct_choice_idx": 3, "direct_answers": ["mad", "concentrating", "dwdww", "joy", "happy", "anger", "curious", "neutral", "sad", "angry"], "difficult_direct_answer": true, "rationales": ["The face has its brow furrowed, which shows it is full of rage.", "The downturned eyebraows and lack of a smile make it obvious.", "He looks happy."], "image": "train2014/COCO_train2014_000000065557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185768, "question_id": "9HTyiCFuTwYTcZ5CyQPmxo", "question": "What do the characters on the magazine resemble?", "choices": ["smurfs", "rugrats", "nickelodeons doug", "muppets"], "correct_choice_idx": 3, "direct_answers": ["bear", "bear", "fancy creature", "muppets", "bear", "muppets", "teddy bear", "puppets", "robbery", "bear human"], "difficult_direct_answer": false, "rationales": ["The characters are muppets.", "The characters are muppets.", "They do not look like animated characters."], "image": "val2014/COCO_val2014_000000185768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489605, "question_id": "9JMdApFj5wNweYVGz9Haeu", "question": "What kind of meat is sitting atop the salad?", "choices": ["chicken", "ham", "turkey", "beef"], "correct_choice_idx": 1, "direct_answers": ["pasta", "salami", "ham", "ham", "ham", "ham", "pepperoni", "pepperoni", "ham", "no meat"], "difficult_direct_answer": false, "rationales": ["The meat is ham.", "It is reddish in color and in cubes", "The meat sits on the ham as shown."], "image": "val2014/COCO_val2014_000000489605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242709, "question_id": "9JSqgPSguHTEwHKyk3GLsr", "question": "What color is the apple fruit in the center of the food containers?", "choices": ["orange", "red", "green", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "round", "yellow", "orange", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The apple is identifiable based on the size and shape in the middle of the rest of the food and the color is apparent.", "The color is yellow.", "The apple is yellow."], "image": "train2014/COCO_train2014_000000242709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231758, "question_id": "9KCbw7Za9UnffS2C2kTMVp", "question": "What color is the black cow laying down to the right of the green cows?", "choices": ["pink", "blue", "gold", "silver"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "red blue", "blue red", "pink", "blue", "pink", "red", "orange", "red rose"], "difficult_direct_answer": false, "rationales": ["The color is pink.", "The black cow to the right of the green cows is pink.", "The color is pink."], "image": "val2014/COCO_val2014_000000231758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156823, "question_id": "9Kfe34MhS8NozhTAkPYxQU", "question": "What do the animals have?", "choices": ["wool coats", "long necks", "quills", "talons"], "correct_choice_idx": 0, "direct_answers": ["fleece wool", "wool", "wool", "wool", "wool coats", "wool", "wool", "legs", "efefe", "wool"], "difficult_direct_answer": false, "rationales": ["Sheep have wool on them.", "This animal is raised and harvested for this material.", "These are sheep"], "image": "train2014/COCO_train2014_000000156823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367201, "question_id": "9MkMixAGXsQmn39828Qj3v", "question": "What are the flowers likely made of?", "choices": ["wallpaper", "tiles", "poster", "paint"], "correct_choice_idx": 0, "direct_answers": ["toilet", "paper", "fabric", "wallpaper", "paper", "plastic", "wallpaper", "roses", "plastic", "wallpaper"], "difficult_direct_answer": false, "rationales": ["The flowers are on the wallpaper.", "(d) poster. there is only one spot on the wall where there are roses, and that part looks like it is a long poster.", "This would be difficult to paint and there are no squares to indicate tile"], "image": "train2014/COCO_train2014_000000367201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133510, "question_id": "9NrKiuoiyDZ6kCmwmKbXjk", "question": "What color are the uniforms on the pitcher's team?", "choices": ["blue", "brown", "green", "red"], "correct_choice_idx": 3, "direct_answers": ["white", "grey", "white", "white", "white", "white", "white", "white", "white", "red"], "difficult_direct_answer": false, "rationales": ["The piping and writing can be seen as red, and you know it is the pitcher because he is standing on the mound.", "The color is red.", "The uniforms on the pitchers team are red because their catcher is wearing red."], "image": "train2014/COCO_train2014_000000133510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182366, "question_id": "9PNhGDHe6aAwwMkb9YzcAb", "question": "What is the food in?", "choices": ["box", "tray", "horses mouth", "pot"], "correct_choice_idx": 1, "direct_answers": ["pans", "dish", "pans", "tray", "trays", "container", "trays", "trays", "pans", "pans"], "difficult_direct_answer": false, "rationales": ["There are several pizzas resting in some shallow sheets. they are used to heat pizzas.", "It is a metallic kitchen object that food is placed in to be cooked in the oven.", "The food is on a tray."], "image": "val2014/COCO_val2014_000000182366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 527796, "question_id": "9PZYLDQ8gcGYpLSUkLj4pw", "question": "What color are the vertical stripes on the left skier's jacket?", "choices": ["blue", "black", "white", "green"], "correct_choice_idx": 1, "direct_answers": ["black", "black", "black", "black", "black", "black", "pink", "black", "black", "pink black"], "difficult_direct_answer": false, "rationales": ["The jacket is fully visible and the stripes are identifiable and the color can be gleaned.", "This dark color sits next to the pink on her jacket.", "This appears to be the color when i checked at 150 percent zoom."], "image": "train2014/COCO_train2014_000000527796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149180, "question_id": "9QdoYYF293nEUraGU2vdfe", "question": "What color is the blanket where the two dogs are napping?", "choices": ["blue", "red", "white", "cream"], "correct_choice_idx": 3, "direct_answers": ["beige", "ivory", "beige", "cream", "cream", "beige", "tan", "cream", "tan", "white"], "difficult_direct_answer": false, "rationales": ["There is a cream colored blanket underneath the two dogs.", "It is a little darker than the white dresser", "The other options don't apply unless it's a dirty, old d."], "image": "train2014/COCO_train2014_000000149180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522941, "question_id": "9R8VUKmJ2ats4wVtzY64vT", "question": "What color is the border of this elephant's ear?", "choices": ["pink", "green", "white", "red"], "correct_choice_idx": 0, "direct_answers": ["pink", "pink", "pink", "pink", "pink color", "pink", "pink", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["It's lighter than red", "This is obvious from what's seen in the photo.", "The large elephant has pink coloring along the edges of its ears."], "image": "val2014/COCO_val2014_000000522941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346302, "question_id": "9RY2q8Bcbx9PDd4Ko3SQGw", "question": "What color is the left side of the pillow sitting on the single seat?", "choices": ["black", "yellow", "white", "orange"], "correct_choice_idx": 1, "direct_answers": ["yellow", "black yellow", "striped", "yellow", "green", "black", "green", "green", "black", "yellow"], "difficult_direct_answer": false, "rationales": ["The pillow on the chair by the window is black on the right and yellow on the left.", "The pillow on that chair has a yellowish green hue.", "The left side is bright yellow."], "image": "train2014/COCO_train2014_000000346302.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210195, "question_id": "9SGtxueobJCVCoRCkDGXYr", "question": "How many people are standing underneath of the same umbrella under the scaffold?", "choices": ["four", "three", "two", "one"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["One person is on the left. one is on the right.", "There are 2.", "This is evident in the image and usually the only number of adults who can be under one."], "image": "train2014/COCO_train2014_000000210195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361197, "question_id": "9SbYFaPxTi5RBGgmo78MQG", "question": "What red substance in the plastic cup?", "choices": ["ketchup", "marinara sauce", "salad dressing", "enchilada sauce"], "correct_choice_idx": 2, "direct_answers": ["jam", "salad dressing", "sauce", "salad dressing", "dressing", "salad dressing", "dressing", "apple", "dressing", "sauce"], "difficult_direct_answer": false, "rationales": ["It is for the salad.", "It is french dressing for the salad", "The substance is dressing."], "image": "train2014/COCO_train2014_000000361197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200541, "question_id": "9TR9ZzsXjWoNNqTBgPmkiU", "question": "How many species are in this image?", "choices": ["seven", "two", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["four", "one", "one", "two", "one", "two", "two", "one", "one", "four sheeps"], "difficult_direct_answer": false, "rationales": ["The distinct species are visible based on their defining characteristics and there are clearly two different ones.", "There are 2.", "There are sheep and humans visible."], "image": "val2014/COCO_val2014_000000200541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5535, "question_id": "9UKrp4Em2RX3LmUTgfFv7C", "question": "What kind of fuel does the red white and blue bus run on?", "choices": ["coal", "diesel", "gas", "firewood"], "correct_choice_idx": 1, "direct_answers": ["diesel", "red", "gas", "diesel", "gas", "diesel", "petrol", "desal", "diesel", "fuel"], "difficult_direct_answer": false, "rationales": ["A large bus that is public transportation is driving on the street.", "The bus has an internal combustion engine, not a coal or wood powered engine. buses do not run on gas.", "It is a large vehicle that requires a strong engine"], "image": "val2014/COCO_val2014_000000005535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302469, "question_id": "9UUJDrd4EzwAEQiPnTAsxe", "question": "Where are these animals usually found?", "choices": ["pigpen", "farm", "tundra", "savanna"], "correct_choice_idx": 3, "direct_answers": ["savannah", "savanna", "africa", "africa", "africa", "forest", "zebras", "savannah", "field", "desert"], "difficult_direct_answer": false, "rationales": ["The animals are in the savanna.", "Zebras are grazing in an open area with grass and flat lands.", "The animals are seen in the savanah."], "image": "train2014/COCO_train2014_000000302469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538518, "question_id": "9V7AzDGGSZ6YoMxeZ7B5Xe", "question": "What are the people gathered around?", "choices": ["cat", "bed", "pizza pie", "television"], "correct_choice_idx": 3, "direct_answers": ["television", "tv", "television", "tv", "tv set", "television", "television", "men", "tv", "tv"], "difficult_direct_answer": false, "rationales": ["They are in the living room.", "They watch tv.", "People are gathered around and facing a screen while holding video game controllers. the screen is large and is on a stand."], "image": "train2014/COCO_train2014_000000538518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445002, "question_id": "9VdKiGzo5wy6P7NHL58ri4", "question": "Where would you find this bathroom?", "choices": ["hotel", "school", "church", "house"], "correct_choice_idx": 0, "direct_answers": ["hotel", "stop", "motel", "insidemirror", "home", "house", "home", "hotel", "hotel", "in room"], "difficult_direct_answer": false, "rationales": ["A bathroom has a phone hanging on the wall.", "While any of the answers is possible, the professional decor and style visible in this bathroom is most commonly found in answer a.", "This is at a hotel."], "image": "val2014/COCO_val2014_000000445002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461794, "question_id": "9Ve5qAvFCK5EZYNz9VL5Ac", "question": "What pants is the man wearing?", "choices": ["khakis", "blue jeans", "shorts", "black jeans"], "correct_choice_idx": 1, "direct_answers": ["jeans", "jeans", "blue jeans", "jeans", "cloth", "jeans", "jeans", "jeans", "jeans", "jean"], "difficult_direct_answer": false, "rationales": ["These are denim material", "The man in front of the tv is wearing blue jeans made from denim.", "The man is wearing trousers made of denim."], "image": "train2014/COCO_train2014_000000461794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575206, "question_id": "9WJRJemxXfLRcKVktYhZWQ", "question": "What is closest to the place you would go to charge your phone?", "choices": ["white towel", "tissues", "candle", "red towel"], "correct_choice_idx": 0, "direct_answers": ["towel", "wall", "tissue", "outlet", "towel", "outlet", "towel", "white towel", "wall", "bathroom"], "difficult_direct_answer": false, "rationales": ["There is an electrical outlet beside the hand towel.", "A white towel is next to the electrical plug where you would charge a phone", "Next to the mirror and between the towel is a plug."], "image": "train2014/COCO_train2014_000000575206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279929, "question_id": "9Z5yj2ym8P8SXpmQ4h9cPe", "question": "What is the man in the suit holding?", "choices": ["his back", "phone", "baby", "his tie"], "correct_choice_idx": 1, "direct_answers": ["cell phone", "phone", "phone", "phone", "phone", "phone", "cell phone", "phone", "cellphone", "cell phone"], "difficult_direct_answer": false, "rationales": ["The man is holding a phone in his hand.", "The man has a phone.", "The man has a phone."], "image": "train2014/COCO_train2014_000000279929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376545, "question_id": "9ZDsUvFDTF33CfKu6BFmNP", "question": "What kind of trick is the man in brown doing?", "choices": ["grind", "manual", "flip trick", "ollie"], "correct_choice_idx": 2, "direct_answers": ["skateboard flip", "skateboarding", "kickflip", "flip trick", "kickflip", "skating", "jumping", "kickflip", "kickflip", "flip"], "difficult_direct_answer": false, "rationales": ["This is obvious given that the board is turned over.", "He jumps up and the board rotates", "The man is jumping and flipping his skateboard in the air."], "image": "val2014/COCO_val2014_000000376545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431950, "question_id": "9Ze6LSrhc4jyBJiHEnRrfr", "question": "What is at the far end of the room?", "choices": ["mirror", "dog", "cat", "baby"], "correct_choice_idx": 0, "direct_answers": ["flowers", "painting", "flower", "sofa", "wardrobe", "good", "dresser", "box", "mirror", "hutch"], "difficult_direct_answer": true, "rationales": ["A reflection of the room is being shown through that object.", "This is in the middle of the armoire and reflects the room back", "The end has a mirror."], "image": "val2014/COCO_val2014_000000431950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442252, "question_id": "9ZkJg7o3YysJJbDDcZjLtp", "question": "What is a sound this animal makes?", "choices": ["roar", "purr", "woof", "baa"], "correct_choice_idx": 1, "direct_answers": ["meow", "meow", "mew", "meow", "meow", "meow", "meow", "meow", "meow", "purr"], "difficult_direct_answer": false, "rationales": ["The other options apply to a wolf or dog, lion and sheep.", "The black cat on the couch is an animal that would likely make purring sounds.", "The cat has a purr sound."], "image": "train2014/COCO_train2014_000000442252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52087, "question_id": "9ZmSDfVbfsq8s5CXyzkZHa", "question": "What is the shape of this water fountain?", "choices": ["phone booth", "toilet", "sink", "watermelon"], "correct_choice_idx": 1, "direct_answers": ["toilet", "oval", "round", "round", "round", "round", "toilet", "circle", "toilet", "toilet"], "difficult_direct_answer": false, "rationales": ["The water fountain is shaped like a toilet.", "The water fountain is shaped like a commode.", "Though this boy drinks from a water fountain it is in the shape of a toilet bowl."], "image": "val2014/COCO_val2014_000000052087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260580, "question_id": "9ZoHecs7XgbHEftZZ3T8Np", "question": "What kind of gift could this be?", "choices": ["fruit basket", "playing cards", "movie reel", "toy truck"], "correct_choice_idx": 0, "direct_answers": ["fruit", "fruit basket", "fruits", "vegetable", "birthday", "fruit basket", "fruit basket", "fruit basket", "basket", "fruits"], "difficult_direct_answer": false, "rationales": ["The gift is fruit.", "There are bananas and apples in a bowl. there are no movie reels, playing cards, or toy trucks.", "The items in the bowl would be good to include in a gift fruit basket."], "image": "train2014/COCO_train2014_000000260580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3999, "question_id": "9bp4uyij9th6U5iiiALmbN", "question": "What kind of cat is resting on top of the sofa?", "choices": ["calico", "persian", "siamese", "ragdoll"], "correct_choice_idx": 0, "direct_answers": ["cat", "calico", "calico", "cat", "black", "toyger", "black cat", "calico", "calico", "domestic"], "difficult_direct_answer": false, "rationales": ["The cat is a calico.", "There is an orange, back and white cat curled up on couch.", "The calico cat has multiple colors."], "image": "train2014/COCO_train2014_000000003999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370783, "question_id": "9cp6ssPvhMuSVj3JAy5ew3", "question": "What object is out of place in the kitchen?", "choices": ["stove", "microwave", "mirror", "refrigerator"], "correct_choice_idx": 2, "direct_answers": ["light", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror", "mirror"], "difficult_direct_answer": false, "rationales": ["The mirror is out of place.", "This object is used to look at yourself and inspect clothing so it would be found in a bedroom where you can change your clothing.", "It looks like it hasn't been hung yet. that said, it might have been purposely placed there to make the kitchen feel bigger."], "image": "train2014/COCO_train2014_000000370783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236766, "question_id": "9d9c6tcx9GTyW24nrbz7So", "question": "What is he about to do?", "choices": ["drop racquet", "lift racquet", "spin racquet", "swing racquet"], "correct_choice_idx": 3, "direct_answers": ["swing racquet", "hit ball", "hit ball", "serve", "hit", "serve", "hit ball", "hit ball", "win", "overhead swing"], "difficult_direct_answer": false, "rationales": ["He has the tool cocked back ready to release.", "The person is playing tennis based on their equipment and attire. in tennis, when one is in this body position with their eye line as such, they would be winding up to hit the ball which would require them to do answer a.", "A man is holding a racket back behind his head and looking up in the air in front of him."], "image": "val2014/COCO_val2014_000000236766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421596, "question_id": "9dfv3wYzrftp9ue2YTBGhg", "question": "How many giraffes are standing around the forest near the ostrich?", "choices": ["three", "four", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two giraffes.", "There are two yellow animals with very long necks on all fours.", "They're a pair of giraffes."], "image": "train2014/COCO_train2014_000000421596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178592, "question_id": "9eK2TYRtsXuqLKXw3Z6TSJ", "question": "What animal has the most colors here?", "choices": ["dog", "cat", "zebra", "leopard"], "correct_choice_idx": 2, "direct_answers": ["zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["There is no dog, cat, or leopard. the black and white horse-like animals have the most colors.", "The elephants are solid gray so the animal with the black and white stripes have more colors.", "The elephant is only a single color, grey, while the other animal is black and white."], "image": "val2014/COCO_val2014_000000178592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439897, "question_id": "9f9D2Nu367vwFmdmUPG2eh", "question": "What color is the napkin underneath of the pizzas?", "choices": ["white", "blue", "pink", "brown"], "correct_choice_idx": 0, "direct_answers": ["cream", "white", "white", "white", "beige", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The napkin is a light color with almost no coloration.", "It's the same color as the cheese", "There is white parchment paper underneath the pizza."], "image": "val2014/COCO_val2014_000000439897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492339, "question_id": "9gnv3UkuHuENvpbBvsGsin", "question": "What color is the surfboard held by the man walking up the beach on the right?", "choices": ["yellow", "blue", "white", "orange"], "correct_choice_idx": 0, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "brown", "yellow", "yellow", "brown", "yellow"], "difficult_direct_answer": false, "rationales": ["It's the color of a banana", "The color is yellow.", "There are two men walking with boards back to the beach. the man on right has a yellow board with red lines."], "image": "train2014/COCO_train2014_000000492339.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536437, "question_id": "9hQeKvJZAXVBfNHNWYa9kC", "question": "What creature has the long brown hair?", "choices": ["kitten", "human", "dog", "cat"], "correct_choice_idx": 1, "direct_answers": ["human", "human", "human", "black", "cat", "dog", "dog", "dog", "human", "human"], "difficult_direct_answer": false, "rationales": ["There is a woman lying on the bed.", "The creature is a human.", "The kitten is sittingg on top of its owner with long hair."], "image": "train2014/COCO_train2014_000000536437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463204, "question_id": "9hr9c2puq4DQzLe7emAqvr", "question": "What is the child wearing?", "choices": ["purse", "goggles", "crown", "backpack"], "correct_choice_idx": 1, "direct_answers": ["vest", "ski suit", "dress", "uniform", "goggles", "helmet", "boots", "ski suit", "ski gear", "helmet"], "difficult_direct_answer": false, "rationales": ["These are to prevent snow blindness", "The child has goggles.", "The orange eye protection is often worn while in the snow to protect from the bright reflection of the sun."], "image": "train2014/COCO_train2014_000000463204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433505, "question_id": "9jRHSH2BUUdsXX4gHbGpBb", "question": "What does all the technology have in common?", "choices": ["white", "dell", "black", "apple"], "correct_choice_idx": 3, "direct_answers": ["apple", "apple", "apple", "electric", "internet", "apple products", "screens", "plugged in", "apple", "computer"], "difficult_direct_answer": false, "rationales": ["It's the same brand.", "The brand of the tech is all from the apple company due to the logo on the technology.", "All of these are mac products."], "image": "val2014/COCO_val2014_000000433505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393680, "question_id": "9n9EQNWgDTicPiMDDmAMnB", "question": "What word would the person most likely be familiar with?", "choices": ["hola", "ciao", "pho", "danke"], "correct_choice_idx": 2, "direct_answers": ["asia", "than", "fishing", "fish", "boat", "pho", "fish", "boat", "fish", "fisherman"], "difficult_direct_answer": false, "rationales": ["The word is pho.", "The word pho is used in vietnamese.", "This is a boat in an asian country"], "image": "train2014/COCO_train2014_000000393680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252213, "question_id": "9oAF5vRL7VV8qnwW5UMCHW", "question": "What are the dogs in the foreground doing?", "choices": ["sleeping", "fighting", "jumping", "eating"], "correct_choice_idx": 1, "direct_answers": ["fighting", "fighting", "fighting", "fighting", "playing", "playing", "fighting", "fighting", "fighting", "wighting"], "difficult_direct_answer": false, "rationales": ["Their teeth are being showing and they have that aggressive look on their faces.", "The dogs are fighting each other.", "The dogs are fighting."], "image": "val2014/COCO_val2014_000000252213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380247, "question_id": "9ohCBwCinUbGgRzY86kTdb", "question": "What is the color of the product in this room that is used to clean grease from food dishes?", "choices": ["black", "yellow", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "watching", "blue", "blue", "white", "blue", "blue", "silver", "blue"], "difficult_direct_answer": false, "rationales": ["There is dishwashing liquid.", "It is in a bottle typical of this product. it is kept near the sink for washing and cleaning.", "The color is blue."], "image": "val2014/COCO_val2014_000000380247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327462, "question_id": "9qZsiVDoVqZNVcLpTvDXtW", "question": "What color is the hoodie worn by the man putting on the shoes to the right?", "choices": ["blue", "orange", "white", "red"], "correct_choice_idx": 1, "direct_answers": ["orange", "gray", "orange", "black", "orange", "brown", "brown", "orange", "orange", "black"], "difficult_direct_answer": false, "rationales": ["The man that is putting on his shoes is wearing a bright orange hoodie.", "The color of the man's hoodie is the same color as the fruit that gave it its name.", "The color is orange."], "image": "train2014/COCO_train2014_000000327462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409736, "question_id": "9qmP7MrtWG4YSJ7XN2ifdt", "question": "Which indoor domestic animal does the center sheep resemble?", "choices": ["bull terrier", "reptile", "cat", "fish"], "correct_choice_idx": 0, "direct_answers": ["dogs", "cow", "bull terrier", "dog", "dog", "cow", "pig", "dog", "dog", "cat"], "difficult_direct_answer": false, "rationales": ["The sheep resembles a dog called the bull terrier.", "The little sheep in the middle has pointy ears that stick out like a bull terrier.", "The center sheep resembles a dog, not a cat, fish, or reptile."], "image": "train2014/COCO_train2014_000000409736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322112, "question_id": "9rQUmLsir9AVKsXeSkvLfi", "question": "What is the woman carrying?", "choices": ["baby", "egg", "basket", "crown"], "correct_choice_idx": 0, "direct_answers": ["baby", "baby", "baby", "her child", "baby", "baby", "baby", "baby", "baby", "baby"], "difficult_direct_answer": false, "rationales": ["An older woman in plaid is holding a baby in a black holster and a man is talking to the baby.", "The woman has a baby on her arms.", "This is obvious in the image. the other options aren't in her arms."], "image": "train2014/COCO_train2014_000000322112.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65601, "question_id": "9rcVamdkkuQEBLagcbMdcn", "question": "What is in the wallet?", "choices": ["cobweb", "money", "lottery ticket", "postage stamps"], "correct_choice_idx": 1, "direct_answers": ["money", "money", "money", "money", "money", "money", "cash", "dwdww", "cash money", "money"], "difficult_direct_answer": false, "rationales": ["The money is in the wallet.", "There are some dollars hanging out of the wallet.", "There are green bills fanned out in the wallet."], "image": "train2014/COCO_train2014_000000065601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129415, "question_id": "9rsZT6fYqgbnganXmK5MYx", "question": "What is on the wall?", "choices": ["statue", "cat", "fly", "picture"], "correct_choice_idx": 3, "direct_answers": ["poster", "picture", "art", "poster", "picture", "poster", "window", "framed poster", "painting", "picture"], "difficult_direct_answer": false, "rationales": ["The picture is on the wall.", "Pictures are on the wall.", "There is a picture of an abstract art hanging in a frame on the living room wall."], "image": "train2014/COCO_train2014_000000129415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215207, "question_id": "9s4BkvpQDFgRpyhApKsrzN", "question": "What color are the shrimp sitting on the plate?", "choices": ["gray", "purple", "pink", "orange"], "correct_choice_idx": 2, "direct_answers": ["pink", "pink", "orange", "orange", "white", "red", "orange", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["There are several pink shrimp curled up on a plate with rice and a spoon resting on it.", "The color in option a corresponds to the color of cooked shrimp.", "The color is pink."], "image": "val2014/COCO_val2014_000000215207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469634, "question_id": "9txqnDGwdsuDeZzX2t5EPJ", "question": "Whose upper body is most protected in the event of a fall?", "choices": ["green shorts", "denim shorts", "nobody", "green pants"], "correct_choice_idx": 0, "direct_answers": ["green shorts", "man", "man", "far man", "middle man", "man", "dressed man", "man", "skateboard", "nothing"], "difficult_direct_answer": false, "rationales": ["The man wearing denim cutoffs and the man wearing green pants are not wearing shirts. the other man is wearing a shirt.", "The green pants wearing man's upper body is not protected at all.", "The guy wearing the greenish shorts is most protected because he's wearing a t-shirt."], "image": "val2014/COCO_val2014_000000469634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532449, "question_id": "9yHqe4iFGSZBiTQEGA4Jc9", "question": "What maneuver is likely to be executed next?", "choices": ["skate save", "sky hook", "huddle", "swing"], "correct_choice_idx": 3, "direct_answers": ["hit", "tennis", "hit", "hit ball", "return", "slice", "return", "pass", "swing", "hit"], "difficult_direct_answer": false, "rationales": ["The racket is being guided from the back to exert pressure.", "The ball is coming toward her", "The maneuver is a swing."], "image": "train2014/COCO_train2014_000000532449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509460, "question_id": "9zNfHL82F5tXZj4tZjbK4x", "question": "What item here has the same name as a term used in baseball?", "choices": ["batter", "bench", "homerun", "pitcher"], "correct_choice_idx": 1, "direct_answers": ["bench", "bench", "stop", "goal", "bench", "interested", "benches", "bench", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["Players sit on these when they aren't on the field", "Black seating is on a sidewalk along the water and has slotted seats and arm rests. baseball players sit on a bench in the dugout.", "The bench is there."], "image": "val2014/COCO_val2014_000000509460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371054, "question_id": "9zPd6LXTEnNuQJNQXQZdVz", "question": "What color is the strange old truck?", "choices": ["turquoise", "red", "pink", "gray"], "correct_choice_idx": 0, "direct_answers": ["light blue", "blue", "blue", "light blue", "blue", "light blue", "turquoise", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The pale whitefish blue color of this truck could be called turquoise.", "The strange old truck is not red, pink, or grey.", "The color is turquoise."], "image": "val2014/COCO_val2014_000000371054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274665, "question_id": "A3her2xE2rQGk8zTSzcwie", "question": "What are the items on the floor usually used for?", "choices": ["olympic competitions", "birthdays", "cooking", "court proceedings"], "correct_choice_idx": 1, "direct_answers": ["birthday party", "parties", "parties", "birthday celebration", "party", "parties", "party", "birthdays", "party", "birthday party"], "difficult_direct_answer": false, "rationales": ["There are many blown up items on ground. they are usually used to celebrate something.", "The living room floor is covered in colorful balloons that would be used for a birthday party.", "The items are balloons."], "image": "train2014/COCO_train2014_000000274665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52813, "question_id": "A4LBFJLEhZh5iG5jRjRnGs", "question": "What color are the stripes on the shoes worn by the tennis player who is throwing a serve?", "choices": ["black", "yellow", "blue", "red"], "correct_choice_idx": 0, "direct_answers": ["black", "stop", "white", "black", "black", "white", "black", "car", "black", "black"], "difficult_direct_answer": false, "rationales": ["This is the common choice for these types of athletic shoes.", "The stripes are in a black color.", "There are black stripes on the side of the man's shoes."], "image": "train2014/COCO_train2014_000000052813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160325, "question_id": "A5a7GEEb8miLkF4EroZefd", "question": "What kind of horse is pulling the Disneyland trolley?", "choices": ["mule", "pinto", "clydesdale", "chestnut"], "correct_choice_idx": 2, "direct_answers": ["beautiful", "clydesdale", "clydesdale", "draft", "clysdale", "big horse", "race", "huge", "clydesdale", "feefe"], "difficult_direct_answer": false, "rationales": ["These are brown and white and have feathering at the hooves", "The horse is a clydesdale.", "This type of horse is famous for the hair around their hooves."], "image": "train2014/COCO_train2014_000000160325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416356, "question_id": "A5zRmMbDXtpaGvq9euPfbM", "question": "What is in the spaghetti?", "choices": ["meatball", "shrimp", "tomato sauce", "egg"], "correct_choice_idx": 1, "direct_answers": ["noodles", "shrimp", "shrimp", "shrimp", "noodle", "shrimp", "shrimp", "broccoli", "food", "shrimp"], "difficult_direct_answer": false, "rationales": ["Seafood and pasta is often served together.", "The orange white and red coiled bits of meat with black spots from being cooked is identifiable as shrimp.", "There are small cooked shellfish on top of the spaghetti."], "image": "train2014/COCO_train2014_000000416356.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402359, "question_id": "A66nyVJh5w9VSFvCvjixZU", "question": "What kind of enclosure are the giraffes likely living in?", "choices": ["conservatory", "zoo", "wild", "boat"], "correct_choice_idx": 1, "direct_answers": ["zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "art", "zoo", "girrafi", "zoo"], "difficult_direct_answer": false, "rationales": ["The enclosure is man made, thus matching the item in option a.", "There is a zoo.", "Those \"fake\" natural habitat surroundings mimic a somewhat real home for them and are not found in the wild."], "image": "train2014/COCO_train2014_000000402359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175263, "question_id": "A678brtrjpxQUcMPPXpKiv", "question": "What kind of transport vessel does this bathroom likely exist in?", "choices": ["airplane", "boat", "van", "rv"], "correct_choice_idx": 1, "direct_answers": ["boat", "airplane", "train", "bathroom", "motorhome", "train", "train", "airplane", "airplane", "train"], "difficult_direct_answer": false, "rationales": ["The transport is a boat.", "The bathroom is most likely on a boat.", "A very small bathroom can be seen down beside a very thin hallway that has shiny walls."], "image": "train2014/COCO_train2014_000000175263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511874, "question_id": "A6yeZcmU3QoawVchXKYvPz", "question": "Why are these cows held in place at the feeder?", "choices": ["butcher", "milk", "petting", "helping"], "correct_choice_idx": 1, "direct_answers": ["to eat", "milking", "eat", "milk", "to eat", "to eat", "getting milked", "eating", "milk", "safety"], "difficult_direct_answer": false, "rationales": ["The feeder keeps the cows still so they can be harvested.", "The cows are going to be milked.", "The cows are in an open area where they are being milked."], "image": "train2014/COCO_train2014_000000511874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245582, "question_id": "A744D4Vp3JNQ86zp3KgzV4", "question": "What does the beverage the person is drinking have in it?", "choices": ["tomato juice", "apples", "bubbles", "lemon"], "correct_choice_idx": 2, "direct_answers": ["coke", "bubbles", "soda", "cola", "music", "coca cola", "coca cola", "sugar", "caffeine", "sugar"], "difficult_direct_answer": false, "rationales": ["The beverage has bubbles.", "The drink is a carbonated cola.", "Soda has carbonation in it, which tries to escape in the form of gas."], "image": "train2014/COCO_train2014_000000245582.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123878, "question_id": "A77BMZYsPdD2hAUENUTx6o", "question": "How many legs of the brown table are visible?", "choices": ["five", "four", "six", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "four", "three", "three"], "difficult_direct_answer": false, "rationales": ["One is hidden because it's on the opposite side", "One is hidden.", "A brown table can be seen from an angle."], "image": "train2014/COCO_train2014_000000123878.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50956, "question_id": "A7hkehLh5F5vHntXVKKG2U", "question": "What color do you get if you combine all of the colors on the umbrella together?", "choices": ["orange", "purple", "green", "yellow"], "correct_choice_idx": 1, "direct_answers": ["purple", "purple", "magenta", "magenta", "purple", "purple", "purple", "purple", "red blue", "purple"], "difficult_direct_answer": false, "rationales": ["A person is holding a red and blue umbrella. red and blue combined makes purple.", "The color is purple.", "You get that color if you mix those two up."], "image": "val2014/COCO_val2014_000000050956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443684, "question_id": "ABKMvvh9h7YYjQgBiQPKda", "question": "What is the person on top of the animal wearing?", "choices": ["cape", "green pants", "red shirt", "sombrero"], "correct_choice_idx": 2, "direct_answers": ["jeans", "jeans", "shirt", "bandana", "jeans", "red shirt", "t-shirt", "jeans", "t-shirt", "t shirt"], "difficult_direct_answer": false, "rationales": ["The person is in red.", "The person is wearing blue jeans. he does not have a hat or a cape.", "The person has red on."], "image": "train2014/COCO_train2014_000000443684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530901, "question_id": "ACUFjmMmvzPsSZ8Nm48YXA", "question": "Who is the man dressed like?", "choices": ["beatrix kiddo", "gordon ramsay", "admiral ackbar", "peewee herman"], "correct_choice_idx": 3, "direct_answers": ["peewee herman", "peewee herman", "peewee herman", "peewee herman", "peewee", "peewee herman", "peewee hermann", "waiter", "peewee herman", "pee-wee herman"], "difficult_direct_answer": false, "rationales": ["The man looks like herman.", "The man looks like peewee herman.", "The man is dressed like the actor peewee herman."], "image": "train2014/COCO_train2014_000000530901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20903, "question_id": "ACnywEugF32EawQsn7j2h9", "question": "Which animal would least like to be in the sink if the faucet were turned on?", "choices": ["reptile", "fish", "dog", "cat"], "correct_choice_idx": 3, "direct_answers": ["cat", "dog", "cat", "cat", "cat", "cat", "fish", "cat", "nothing", "mouse"], "difficult_direct_answer": false, "rationales": ["The animal is a cat.", "Cats would not like to get wet by the sink.", "The sink can be filled with water and cats don't like getting wet and would not like being in the sink."], "image": "train2014/COCO_train2014_000000020903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519188, "question_id": "AD3fcL5FWXmGEyvYe9vwZy", "question": "What number is on the plane?", "choices": ["410", "369", "821", "775"], "correct_choice_idx": 0, "direct_answers": ["four hundredten", "410", "four", "four ten", "410", "410", "410", "410", "410", "410"], "difficult_direct_answer": false, "rationales": ["The number is near the front landing gear.", "An airplane has numerical markings on the front.", "Towards the frontmost part of this plain on a small structure next to it's front wheel the text 410 is printed."], "image": "train2014/COCO_train2014_000000519188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570518, "question_id": "AEgPRM5sCfy87RSxfhAgjR", "question": "Which object is most likely to be holding liquid right now?", "choices": ["pot", "bowl", "bottle", "plate"], "correct_choice_idx": 2, "direct_answers": ["hotdog", "bottle", "bottle", "bottle", "pumpkin ale", "bottle", "brocolli", "broccoli", "bottle", "glass bottle"], "difficult_direct_answer": false, "rationales": ["The object is a bottle.", "These are made to do this", "The bottle most likely has liquid. it has beer in it."], "image": "train2014/COCO_train2014_000000570518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87792, "question_id": "AGG9Q2ewFmUnEQ6HZa3vpm", "question": "What do half these treats have?", "choices": ["chocolate chips", "sprinkles", "gummy bears", "hole"], "correct_choice_idx": 3, "direct_answers": ["chocolate", "car", "donuts", "hole", "glaze", "glaze", "hole", "there", "fosting", "donut"], "difficult_direct_answer": false, "rationales": ["Some of these pastries are missing a part in the middle.", "The other options don't apply to this image or these donuts.", "The treats have holes."], "image": "train2014/COCO_train2014_000000087792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230091, "question_id": "AJ4j9FTtedrKDZXvKX4yzM", "question": "What treat is in the box?", "choices": ["gummy bears", "donut", "pizza", "apple pie"], "correct_choice_idx": 1, "direct_answers": ["donuts", "sweet", "donuts", "donuts", "donuts", "donuts", "donuts", "donut", "stop", "cakes"], "difficult_direct_answer": false, "rationales": ["The box is filled with a dozen of sweet glazed donuts.", "They are doughy and round with a hole in the middle.", "The treat is a donut."], "image": "train2014/COCO_train2014_000000230091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431494, "question_id": "AJWTfdXZDUzvycZrkoBcVU", "question": "What color is the item with the holes?", "choices": ["purple", "red", "green", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "blue", "baby blue", "blue", "blue", "blue", "blue", "dwdw", "blue"], "difficult_direct_answer": false, "rationales": ["The board is blue.", "A man is laying on a piece of wood that is the same color as the sky.", "The color is blue."], "image": "train2014/COCO_train2014_000000431494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310989, "question_id": "ALg83SiKBNoLUokBMtYG5G", "question": "The nose of this aircraft is in what nation's flag?", "choices": ["netherlands", "uk", "france", "us"], "correct_choice_idx": 2, "direct_answers": ["america", "nation", "france", "france", "american", "france", "usa", "usa", "stop", "usa"], "difficult_direct_answer": false, "rationales": ["The nose is red, white and blue in the design of france's flag.", "The nose is french.", "The nose is french."], "image": "train2014/COCO_train2014_000000310989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423810, "question_id": "AMHnav6ZDXM3pEG8r3HjGm", "question": "What are these animals known for?", "choices": ["stripes", "tusks", "horns", "antlers"], "correct_choice_idx": 0, "direct_answers": ["stripes", "stripes", "zebras", "stripes", "stripes", "stripes", "stripes", "stripes", "stripes", "stripes"], "difficult_direct_answer": false, "rationales": ["These are zebras and they have vertical lines in black and white.", "Zebras are known for their stripes.", "The zebras have stripes."], "image": "val2014/COCO_val2014_000000423810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511236, "question_id": "AN9CT3amsnZysBCe5czyW2", "question": "How many stickers are attached to the big porcelain toilet?", "choices": ["four", "two", "one", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "one", "two", "one", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two stickers attached to the big porcelain toilet on the top and bottom.", "One sticker is above the seat. an additional sticker is below the seat.", "There are two stickers."], "image": "val2014/COCO_val2014_000000511236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263974, "question_id": "AQFF3TAhE9ZQqaPnCMBVWN", "question": "What color are the frames of the bicycles driven down the bike lane?", "choices": ["green", "purple", "red", "blue"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red", "white", "red", "red", "white", "white", "red"], "difficult_direct_answer": false, "rationales": ["The answer does not account for all of the bikes visible in the lane, but does correspond to the majority.", "The bikes have red on them.", "The color is red."], "image": "train2014/COCO_train2014_000000263974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497633, "question_id": "ARQvMG8wc8KsGbrpAmAXco", "question": "What are the floating blue objects for?", "choices": ["boundaries", "decoration", "first aid", "swimming"], "correct_choice_idx": 0, "direct_answers": ["boundaries", "anchor points", "hold boats", "towing", "anchors", "anchors", "travelling", "safety", "fishing", "anchors"], "difficult_direct_answer": false, "rationales": ["They are bouys.", "They're boundaries.", "The lines are used to keep things from crossing that area."], "image": "train2014/COCO_train2014_000000497633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440449, "question_id": "AS42yLSCiuXkuQnQCnvkoS", "question": "Who makes a similar electronic device to the one the man is using?", "choices": ["applebees", "mcdonalds", "ibm", "nintendo"], "correct_choice_idx": 3, "direct_answers": ["nintendo", "nintendo", "nintendo", "charger", "wii", "nintendo", "nintendo", "nintendo", "nintendo", "game"], "difficult_direct_answer": false, "rationales": ["The man is holding wii remotes. mcdonald's and applebee's make food items, not electronic devices.", "A guy is holding a white remote with pieces in both hands, similar to a video game controller.", "Nintendo makes a device."], "image": "train2014/COCO_train2014_000000440449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453819, "question_id": "ASA9dmK5qzqsux92JcsuBG", "question": "What is above water?", "choices": ["boat", "swimmer", "surfer", "dolphin"], "correct_choice_idx": 0, "direct_answers": ["dock", "ship", "boat", "dock", "boat", "boat", "boats", "ship", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["The boat is above water.", "The boat is above water.", "The boats are above water since they are docked and buoyed up."], "image": "val2014/COCO_val2014_000000453819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579885, "question_id": "ATsHscwdkL626iU3iBmqnZ", "question": "What is the baby sheep doing?", "choices": ["dancing", "sleeping", "eating", "drinking"], "correct_choice_idx": 3, "direct_answers": ["feeding", "eating", "nursing", "nursing", "drinking", "nursing", "milk drinking", "drinking", "breastfeeding", "drinking milk"], "difficult_direct_answer": false, "rationales": ["The sheep is drinking.", "This seems to be the case based on how its head is positioned under the other sheep.", "There is a baby goat that is kneeling to get milk from mother."], "image": "val2014/COCO_val2014_000000579885.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331864, "question_id": "AUbAhfofMxA2VzFhx9m2zT", "question": "What is the longest word on the screen?", "choices": ["eggplant", "wintergreen", "academy", "standards"], "correct_choice_idx": 3, "direct_answers": ["standards", "standards", "standard", "standards", "standards", "standards", "standards", "standards", "standards", "standards"], "difficult_direct_answer": false, "rationales": ["The longest word is standards.", "Wintergreen, academy, and eggplant are not on the screen.", "The word is standards."], "image": "train2014/COCO_train2014_000000331864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220241, "question_id": "AXRJR8kXejiLjttwMJwRPA", "question": "What color is the shirt worn by the man who is holding this hot dog?", "choices": ["blue", "pink", "orange", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The color is green.", "Blue, pink or orange clothes are not depicted.", "The color is green."], "image": "val2014/COCO_val2014_000000220241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275028, "question_id": "AXk4YUhp9JaM9fuSz6YEe7", "question": "What is the cat on top of?", "choices": ["basket", "cardboard box", "table", "human tummy"], "correct_choice_idx": 2, "direct_answers": ["dining table", "table", "table", "table", "table", "nothing", "table", "table", "cat", "table"], "difficult_direct_answer": false, "rationales": ["The cat is on top of dining furniture.", "The cat is on a table.", "The cat's on the table."], "image": "train2014/COCO_train2014_000000275028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136077, "question_id": "AYJXwc4aZfWLmf8WX2yrC7", "question": "What matches the color scheme of the animals?", "choices": ["piano keys", "lime", "pumpkin", "cherry"], "correct_choice_idx": 0, "direct_answers": ["piano keys", "black white", "newspaper", "penguins", "stripes", "back/white", "zebra", "stripes", "zebra", "black white"], "difficult_direct_answer": false, "rationales": ["The colors are like keys.", "The zebras are black and white.", "The colors resemble piano keys."], "image": "val2014/COCO_val2014_000000136077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289007, "question_id": "AYS4Tq6ZnPcNsD8pyo6BTT", "question": "How will the food reach the person's mouth?", "choices": ["chopsticks", "spoon", "knife", "fork"], "correct_choice_idx": 0, "direct_answers": ["chopsticks", "chopsticks", "meat", "chopsticks", "chop sticks", "chopsticks", "fork", "feefef", "chopsticks", "chopsticks"], "difficult_direct_answer": false, "rationales": ["There are two sticks in the food. the food looks to be asian cuisine.", "These are the red sticks used as utensils", "There is red tongs that are pecking at the food."], "image": "train2014/COCO_train2014_000000289007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443944, "question_id": "AYcfWWsydL5GF9WoQsc6YX", "question": "What is on the brown table near the couch?", "choices": ["cat", "baby", "fish", "apple"], "correct_choice_idx": 2, "direct_answers": ["toy fish", "fish", "fish", "fish", "wall", "dada", "lamp photograph", "fish", "but", "fish"], "difficult_direct_answer": false, "rationales": ["The table has a fish.", "It contains a streamlined body ,has gills and has fins.", "The table has fish."], "image": "train2014/COCO_train2014_000000443944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178491, "question_id": "AYvYNMrb5xqtZrQZV9p6Ey", "question": "What color are the topsides of the train engines in the middle of the depot without any kind of cars?", "choices": ["white", "blue", "orange", "red"], "correct_choice_idx": 1, "direct_answers": ["grey", "blue", "grey", "silver", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The middle car of the depot that has only two cars has a roof that is painted blue.", "The color is blue.", "The top of the engines without any cabs are blue."], "image": "train2014/COCO_train2014_000000178491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532780, "question_id": "AZSiqsomt8Bfbf8StFWGVK", "question": "What is on top of the bread?", "choices": ["tater tots", "salmon", "shrimp", "tomato"], "correct_choice_idx": 3, "direct_answers": ["tomato", "mayonnaise", "cheese", "tomato", "tomato", "cheese tomato", "tomato mozzarella", "mozzarella tomatoes", "tomatoes", "tomatoes"], "difficult_direct_answer": false, "rationales": ["The tomato tops it.", "The bread on the plate is topped with a slice of tomato and cheese.", "There are red slices under the cheese"], "image": "val2014/COCO_val2014_000000532780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42122, "question_id": "AaY6xT4GQLKPDSYbfmTvdx", "question": "What color is the ball that the child is attempting to hit with the baseball bat?", "choices": ["blue", "purple", "green", "white"], "correct_choice_idx": 2, "direct_answers": ["yellow", "green", "green", "yellow", "yellow", "yellow", "green", "yellow", "green", "green"], "difficult_direct_answer": false, "rationales": ["The ball is similar in color to the grass. the ball is not white, blue, or purple.", "There is only one ball in the image and this is its color.", "It is a tennis ball and this is the normal color for them"], "image": "train2014/COCO_train2014_000000042122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109231, "question_id": "AafZHamZipLN8Curs5kVmD", "question": "What type of aircraft can be smaller than a human?", "choices": ["airplane", "blimp", "jumbo jet", "helicopter"], "correct_choice_idx": 0, "direct_answers": ["jet", "airplane", "carrier", "light aircraft", "model plane", "propeller plane", "personal", "plane", "jet", "plane"], "difficult_direct_answer": false, "rationales": ["The airplane is smaller.", "A model one of these would be smaller", "Planes can be lower to the ground then humans."], "image": "val2014/COCO_val2014_000000109231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475313, "question_id": "Ab885trX3HskpdDNfVcQrq", "question": "What color is the icing on the top of the donuts underneath of the man's hand who is looking to eat?", "choices": ["pink", "black", "red", "white"], "correct_choice_idx": 0, "direct_answers": ["white", "pink", "pink", "dwdw", "pink", "pink", "pink", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["The color is pink.", "The icing has a strawberry flavor to it.", "The color of the icing is too red to be white, but too white to be red."], "image": "train2014/COCO_train2014_000000475313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249974, "question_id": "AcUbMb9ghHPyLt4ukCZGNe", "question": "How many fruits are gathered together in this picture?", "choices": ["three", "two", "four", "five"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "one", "four", "two"], "difficult_direct_answer": false, "rationales": ["Three apples and a banana are grouped together.", "There are four fruits.", "There are 4."], "image": "train2014/COCO_train2014_000000249974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38645, "question_id": "AdJwo45DCRkNhXtUeNvrFm", "question": "What is inside the flower pot?", "choices": ["umbrella", "cat", "baby", "single rose"], "correct_choice_idx": 0, "direct_answers": ["flowers", "plant", "plants", "flower", "plants", "flowers", "umbrella", "sunflower", "plants", "flowers"], "difficult_direct_answer": false, "rationales": ["This is holding it upright", "The flower pot of the deck has the pole of an umbrella placed inside of it.", "The flower pot on the deck has an umbrella pole inside of it."], "image": "val2014/COCO_val2014_000000038645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62210, "question_id": "AfKb2wVehPEw3PpBbKTjwm", "question": "What happens behind the curtain?", "choices": ["eating", "sleeping", "movie time", "cleaning"], "correct_choice_idx": 3, "direct_answers": ["glass", "cleaning", "curtain removed", "shower", "bathing", "bathe", "bathing", "nothing", "showers", "bath"], "difficult_direct_answer": true, "rationales": ["The curtains have to be cleaned.", "People clean themselves in the shower.", "The area behind the curtain is a bathtub, not a movie screen, dining room table, or bed."], "image": "val2014/COCO_val2014_000000062210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95381, "question_id": "AfNLhiLVKrczSvPQ2kjCZt", "question": "What color is the coat jacket on the right side of the rack hung on the white door?", "choices": ["brown", "red", "purple", "black"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "brown", "brown stripes", "brown", "brown", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The shade of color is similar to that of dirt or soil.", "The color is brown.", "The color is brown."], "image": "train2014/COCO_train2014_000000095381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54678, "question_id": "Afgh4vKy4kgNLwPV77KWym", "question": "What is the child standing on?", "choices": ["mud", "sand", "snow", "grass"], "correct_choice_idx": 2, "direct_answers": ["ice", "snow", "snow", "skateboard", "skiis", "snow", "skis", "snow", "skis", "snow"], "difficult_direct_answer": false, "rationales": ["The kid's in snow.", "The object is white on the ground. the kid is covered in really warm clothing because the stuff on the ground means super cold weather.", "The child is skiing. the are standing on a white rugged surface."], "image": "train2014/COCO_train2014_000000054678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400422, "question_id": "Ahki3X4dE5iUfaBd27N5f9", "question": "How many men are standing around the laptop held by the one?", "choices": ["four", "three", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "two", "two", "one", "one", "two", "one"], "difficult_direct_answer": false, "rationales": ["There are less than three but more than one man visible.", "Two men are shown in the room.", "Two men are standing."], "image": "train2014/COCO_train2014_000000400422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52153, "question_id": "AhrScLCFMMJwvmAFhbMTCn", "question": "What is on the cabinet?", "choices": ["television", "apple pie", "cat", "baby"], "correct_choice_idx": 0, "direct_answers": ["glassware", "glassware", "television", "glasses", "nothing", "television", "television", "grass", "glassware", "glass"], "difficult_direct_answer": false, "rationales": ["The other options aren't in the image.", "The cabinet has a tv.", "There is a box showing a screen with people on it."], "image": "train2014/COCO_train2014_000000052153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375902, "question_id": "Aja3hfEmS5nYdQWdywyKDW", "question": "What is the person in blue crossing?", "choices": ["their wires", "finish line", "lake", "their letters"], "correct_choice_idx": 1, "direct_answers": ["winning", "banner", "finish line", "winning", "finish line", "finish line", "winner", "finishing line", "snow", "finish line"], "difficult_direct_answer": false, "rationales": ["They are holding out a paper banner for her to go through at the end of a race", "They're obviously in a skiing competition.", "The person in the blue snow suit is crossing the finish line at the end of a race."], "image": "val2014/COCO_val2014_000000375902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445859, "question_id": "AkHiho28BeYwzKYMpVHowk", "question": "What is the dog doing?", "choices": ["chasing cats", "swimming", "sleeping", "leaping"], "correct_choice_idx": 3, "direct_answers": ["running", "jumping", "leaping", "jumping", "running", "herding", "running", "jumping", "herding", "running"], "difficult_direct_answer": false, "rationales": ["The dog is near ground, not water, and is awake. there are no cats near the dog.", "The dog is awake and is just above land. there are no cats.", "The dog is running and leaping through the air in front of the cows."], "image": "train2014/COCO_train2014_000000445859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166998, "question_id": "AkPfLdTgyjFNtEXNSYcxem", "question": "What would someone use the objects above the stove for?", "choices": ["teaching", "drinking", "seasoning", "cleaning"], "correct_choice_idx": 2, "direct_answers": ["seasoning", "cooking", "six", "stove", "cooking baking", "seasoning", "cooking", "cooking", "cooking", "seasoning"], "difficult_direct_answer": false, "rationales": ["The seasoning is used.", "There is a spice rack above the stove. spices are not used for cleaning, teaching, or drinking.", "There is a spice rack attached to a wall above stove. it is used to put in foods to make it more flavorful."], "image": "train2014/COCO_train2014_000000166998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332891, "question_id": "AmEi7WCBgFMYTvso2MMvBt", "question": "What color is the banana without a bunch on the countertop directly?", "choices": ["brown", "black", "yellow", "green"], "correct_choice_idx": 0, "direct_answers": ["brown yellow", "brown", "brown", "brown yellowish", "brown", "brown", "yellow brown", "yellow", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The bananas in the bunch are yellow. the loose banana is overripened and is a different color.", "The banana not in the bunch is brown. the banana has started changing colors.", "The lone banana is not green, yellow, or black. it is overripened."], "image": "train2014/COCO_train2014_000000332891.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199553, "question_id": "AmbGG2xrWxdLzbsxvB63k7", "question": "What type of skiing is he likely doing?", "choices": ["slalom", "downhill", "trick", "crosscountry"], "correct_choice_idx": 3, "direct_answers": ["ice skiing", "cross country", "crosscountry", "snow", "playing", "slow", "downhill", "skating", "cross country", "cross country"], "difficult_direct_answer": false, "rationales": ["From the type of foot gear and skis he's wearing, he is crosscountry skiing.", "The type is cross country.", "The skiing is cross country."], "image": "val2014/COCO_val2014_000000199553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46454, "question_id": "AoVfyXB4s4448aithzrnbF", "question": "What player does this person likely know of?", "choices": ["mike trout", "ben stokes", "rose lavelle", "marian hossa"], "correct_choice_idx": 0, "direct_answers": ["babe ruth", "baseball", "baseball", "ruth", "ichiro", "mike trout", "baseball", "babe ruth", "baseball player", "baseball player"], "difficult_direct_answer": false, "rationales": ["The player is trout.", "The player knows mike trout.", "The player stands like trout waiting for someone to throw the ball for him to hit."], "image": "train2014/COCO_train2014_000000046454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315037, "question_id": "Apkyp5mg3BMDkKrgQFePEd", "question": "What color is the hat worn by the man using his laptop on the park bench?", "choices": ["black", "blue", "red", "white"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red plaid", "red", "red", "red", "red", "red", "red plaid"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The man is wearing a plaid hat on his head.", "The hat is red."], "image": "val2014/COCO_val2014_000000315037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347962, "question_id": "AqRhkXGcP45QxJZ9SG9TSJ", "question": "Why are there so many on the bike?", "choices": ["show", "exercise", "fun", "family transportation"], "correct_choice_idx": 3, "direct_answers": ["road trip", "family transportation", "one", "three", "travelling together", "family", "sharing", "three riding", "need ride", "family"], "difficult_direct_answer": true, "rationales": ["They fit on the bike.", "The family is transported.", "An adult is on a motorcycle with two children."], "image": "train2014/COCO_train2014_000000347962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431986, "question_id": "AuWdoxL6MgU5NgNJBGMEWc", "question": "What color is the body of this police boat?", "choices": ["blue", "green", "black", "white"], "correct_choice_idx": 2, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "white", "black"], "difficult_direct_answer": false, "rationales": ["It's the opposite of the white top", "The police boat is clearly labeled and the color of the body is clearly identifiable.", "The boat has a dark color for the inflatable part of the boat."], "image": "train2014/COCO_train2014_000000431986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328651, "question_id": "AvhEf9zrzxDUNjj6aCHdBd", "question": "Who are in the most danger?", "choices": ["hippopotamus", "humans", "birds", "elephants"], "correct_choice_idx": 1, "direct_answers": ["people", "birds", "birds", "people", "humans", "tourist", "tourists", "people", "people", "women"], "difficult_direct_answer": false, "rationales": ["The humans are in danger.", "Human beings is the most danger in the area.", "The humans are in danger."], "image": "train2014/COCO_train2014_000000328651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294758, "question_id": "Ayma3jsBfuX9iocRwDzc3i", "question": "What type of company is sponsoring this game?", "choices": ["computer", "car", "basketball", "canned bean"], "correct_choice_idx": 1, "direct_answers": ["car company", "volvo", "car", "car company", "volvo automobile", "car", "volvo", "mlb", "auto", "automobile"], "difficult_direct_answer": false, "rationales": ["There is an ad for the automobile company volvo on the wall.", "The car company is a sponsor.", "The sign says volvo"], "image": "train2014/COCO_train2014_000000294758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339845, "question_id": "AzDYSVsZamwfgiFaKhP3pJ", "question": "What color is the exterior side of the photography umbrella?", "choices": ["white", "pink", "green", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black", "black", "white", "white", "black", "black", "blak"], "difficult_direct_answer": false, "rationales": ["The outside of the umbrella is black.", "The umbrella is identifiable by its unique shape and the color is clearly seen.", "The umbrella is clearly visible and based on the location of the handle it is possible to determine the interior and exterior of the umbrella and their corresponding colors."], "image": "val2014/COCO_val2014_000000339845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468169, "question_id": "B3Atit6WcM5efSEFBCcGoX", "question": "What team wears similar socks to the boy in the foreground?", "choices": ["mets", "jets", "red sox", "white sox"], "correct_choice_idx": 2, "direct_answers": ["red sox", "redsox", "red sox", "red sox", "t-shirt", "red sox", "vultures", "red", "baseball", "red sox"], "difficult_direct_answer": false, "rationales": ["The team is the red sox.", "The socks are red.", "The professional baseball team the red sox wears red like the boy who is playing baseball and wearing red socks."], "image": "val2014/COCO_val2014_000000468169.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400592, "question_id": "B4dLUYGqSknc75egnZV4ZT", "question": "What is in the room?", "choices": ["dog", "basketball hoop", "messy desk", "cat"], "correct_choice_idx": 2, "direct_answers": ["messy desk", "laptop", "computer", "stuff", "desk", "computer", "desk", "desk", "work room", "computer"], "difficult_direct_answer": false, "rationales": ["There is stuff all over it", "The room is messy.", "This office does not have pets or basketball equipment."], "image": "train2014/COCO_train2014_000000400592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82898, "question_id": "B4kxvPofNRcFXmVTFE7VNN", "question": "What are the two zebras who are leading the pack pointing their noses toward?", "choices": ["camera", "trees", "zebra", "house"], "correct_choice_idx": 0, "direct_answers": ["tree", "house", "camera", "house", "house", "camera", "camera", "house", "house", "house"], "difficult_direct_answer": false, "rationales": ["The giraffes are pointing their noses at the camera.", "They are looking this way", "The two zebras in front of the pack are both looking directly at the camera that took the picture."], "image": "train2014/COCO_train2014_000000082898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453029, "question_id": "B6HA2gYwYToKdao8cCNoF7", "question": "What is the person who is visible called?", "choices": ["organist", "conductor", "baby sitter", "coach"], "correct_choice_idx": 1, "direct_answers": ["engineer", "plot", "conductor", "conductor", "engineer", "conductor", "conductor", "conductor", "conductor", "engineer"], "difficult_direct_answer": false, "rationales": ["The person inside of the train engine is a conductor.", "The person is the conductor.", "The person that is visible is sitting in the engine. they are driving the train."], "image": "val2014/COCO_val2014_000000453029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526675, "question_id": "B83mret7K4hM4ruFminxqU", "question": "What color is the napkin hanging off of the blue bedside?", "choices": ["red", "green", "purple", "pink"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The napkin is not green, purple, or pink.", "The color is red.", "It's the color of cherries"], "image": "val2014/COCO_val2014_000000526675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136235, "question_id": "BEQhuC6MpcWjtiamZM3eba", "question": "What does the sign in front of the runways near the camera say?", "choices": ["one way", "stop", "dead end", "enter"], "correct_choice_idx": 3, "direct_answers": ["enter", "enter", "don't enter", "no entry", "don't enter", "no enter", "no entering", "airoplane", "don't enter", "don't enter"], "difficult_direct_answer": false, "rationales": ["The sign is clearly visible and known to be a sign based on design and shape. the text on the sign is clearly readable.", "The sign says enter.", "The sign at the airport in front of the runway says do not enter."], "image": "train2014/COCO_train2014_000000136235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578046, "question_id": "BEddXAQf45KYHRogDexDho", "question": "What utensil is closest to the food?", "choices": ["knife", "spoon", "fork", "spatula"], "correct_choice_idx": 2, "direct_answers": ["fork", "knife", "fork", "knife", "fork", "fork", "knife", "fork", "fork", "fork"], "difficult_direct_answer": false, "rationales": ["There are four prongs sticking out of a utensil and is sitting on a plate with food.", "The fork is closest.", "The fork is close."], "image": "train2014/COCO_train2014_000000578046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198492, "question_id": "BEpDrusqcmJNu93ScFq6it", "question": "What color is the blouse worn by the woman who is coming in from the right?", "choices": ["pink", "red", "black", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "burger", "paisley", "black", "black", "white", "white", "floral", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color of the woman on the right is white with blue designs.", "The shirt is white.", "The woman on the right is wearing a patterned shirt that is mostly white in color."], "image": "val2014/COCO_val2014_000000198492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336749, "question_id": "BEr9xjCo6FvmhyxpjjnXLj", "question": "What is the black appliance by the corner called?", "choices": ["food processor", "can opener", "blender", "microwave"], "correct_choice_idx": 2, "direct_answers": ["blender", "microwave", "oven", "blender", "blender", "toaster", "microwave", "oven", "blender", "blender"], "difficult_direct_answer": false, "rationales": ["The appliance is a blender.", "The appliance is a blender.", "The size, shape and location in the kitchen is consistent with answer a."], "image": "train2014/COCO_train2014_000000336749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247515, "question_id": "BFJt3qcPR2GyxXXY87S9FE", "question": "What is in the small plate?", "choices": ["chicken bone", "cantaloupe", "apple pie", "cherry"], "correct_choice_idx": 1, "direct_answers": ["fruit", "fruit", "salad", "fruit", "ice cream", "melons", "fruit", "eggs benedict", "cantaloupe", "fruit vegetables"], "difficult_direct_answer": false, "rationales": ["The small plate has green melon pieces, and there's only one readily available green melon.", "The small plate on the meal dish contains sliced up cantaloupe.", "The plate at the top has orangish and light green melons in it."], "image": "train2014/COCO_train2014_000000247515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503718, "question_id": "BG2fKsTb33GbMQuBZGSzQq", "question": "What do these animals use to defend themselves?", "choices": ["stinger", "talons", "camouflage colors", "tusks"], "correct_choice_idx": 3, "direct_answers": ["tusks", "tusks", "tusks", "trunks", "tusks", "big", "elephant", "tusk", "car", "tusks"], "difficult_direct_answer": false, "rationales": ["These are sharp things pointing out of elephants faces for defense.", "They have long white horns next to their mouths that they can use to fight with.", "The animals use tusks."], "image": "train2014/COCO_train2014_000000503718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456136, "question_id": "BGBfWdrWD8cdFwjBDecYFv", "question": "What is the woman wearing?", "choices": ["crown", "scarf", "necklace", "tattoo"], "correct_choice_idx": 2, "direct_answers": ["glasses", "black dress", "necklace", "black blouse", "necklace", "necklace", "black shirt", "black", "necklace", "tops"], "difficult_direct_answer": false, "rationales": ["She has a beaded necklace on.", "She has beads on a string", "The other options aren't in the image."], "image": "train2014/COCO_train2014_000000456136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361858, "question_id": "BHvXtJXaNHuEmAV67SUtb7", "question": "What happens in this room?", "choices": ["typing letters", "eating pizza", "bathing", "watching tv"], "correct_choice_idx": 2, "direct_answers": ["bathing", "bathe", "bathing", "shower", "showering", "bathe", "bathing showering", "use restroom", "showers", "bathing"], "difficult_direct_answer": false, "rationales": ["Viewing a bathroom with sinks, walk in, shower and bathtub.", "Bathrooms are used for bathroom breaks and to clean up in.", "This room contains a tub, shower, sink, and toilet. it is not an office, den, or dining room."], "image": "train2014/COCO_train2014_000000361858.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356015, "question_id": "BJWtUz7muroSZPJTfDhjYp", "question": "What are the horses in the foreground between?", "choices": ["boats", "statues", "poles", "fish netting"], "correct_choice_idx": 2, "direct_answers": ["posts", "poles", "posts", "poles", "poles", "poles", "goal post", "posts", "poles", "ground"], "difficult_direct_answer": false, "rationales": ["The horses are in the foreground playing polo between the poles.", "The items are tall cylindrical things standing straight up.", "They are playing polo"], "image": "train2014/COCO_train2014_000000356015.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514939, "question_id": "BKSoUK4JjRFLTq9EkebCtu", "question": "What number of vehicles are parked at this traffic light overlooked by the large cathedral building?", "choices": ["five", "two", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "three", "three", "two", "many", "three", "bsll", "three", "three", "two"], "difficult_direct_answer": false, "rationales": ["There are two.", "There are two vehicles side by side on the street.", "There are two cars."], "image": "train2014/COCO_train2014_000000514939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312390, "question_id": "BKejTkEoKaX3zZcg32zzXn", "question": "What is going to happen to the child?", "choices": ["brush teeth", "floss teeth", "brush hair", "wash face"], "correct_choice_idx": 0, "direct_answers": ["brush teeth", "brush teeth", "brush teeth", "brush teeth", "feeding", "brush teeth", "teeth brushed", "taught", "brush teeth", "sleep"], "difficult_direct_answer": false, "rationales": ["The man is holding a toothbrush near the boy's mouth.", "There is a toothbrush in front of his face.", "A child has a toothbrush in front of him."], "image": "train2014/COCO_train2014_000000312390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54485, "question_id": "BLe4paD8jRsrr4C4vLFEwJ", "question": "What color are the birds flying over the street?", "choices": ["white", "black", "gray", "brown"], "correct_choice_idx": 2, "direct_answers": ["grey", "gray", "white", "gray", "grey", "white", "black", "red", "car", "stop"], "difficult_direct_answer": false, "rationales": ["The birds are gray.", "The color is gray.", "They're likely multicolored with some blues and blacks as well, but their primary color is closest to a."], "image": "val2014/COCO_val2014_000000054485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385666, "question_id": "BMLRxqMqFPwiwdKHRAeuVx", "question": "What food is near the figurines?", "choices": ["macaroni", "pizza", "hot dogs", "hamburgers"], "correct_choice_idx": 1, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["There is a pepperoni pizza near the figurines.", "The food is pizza.", "The cheese covered food with red spots of pepperoni and marinara sauce is unmistakably pizza."], "image": "train2014/COCO_train2014_000000385666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573759, "question_id": "BMdwaxAEMYbGQJgjnMBUMr", "question": "How many dogs are sitting inside of the motorboat with the man running the engine?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 0, "direct_answers": ["three dogs", "three", "two", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "There are three cute dogs inside of the motorboat with the man.", "There are three dogs."], "image": "val2014/COCO_val2014_000000573759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127556, "question_id": "BMwvsZfNQq2mt4YFx9hdzS", "question": "The cat who is inspecting the treat has what color of eyes?", "choices": ["blue", "green", "brown", "yellow"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "green", "green", "green", "green", "hazel", "green", "green", "cat"], "difficult_direct_answer": false, "rationales": ["The cat's eyes are not yellow, blue, or brown.", "The cat has green eyes.", "It's green."], "image": "val2014/COCO_val2014_000000127556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122863, "question_id": "BNaKDormvxBdWmWfFJPr7b", "question": "What type of motor bike can be used to transport three people safely?", "choices": ["scooter", "quad", "motorcycle", "tricycle"], "correct_choice_idx": 3, "direct_answers": ["car type", "electric", "tricycle", "tricycle", "three wheeled", "moped", "tricycle", "trike", "tricycle", "motorcycle"], "difficult_direct_answer": false, "rationales": ["The vehicle has one wheel for each person.", "The tricycle is for three.", "The motor bike could be a tricycle."], "image": "train2014/COCO_train2014_000000122863.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109939, "question_id": "BNsW9ToJRHD6sHidmBCzPa", "question": "What country most likely houses this bus as evident by the flag hanging from the building to the left?", "choices": ["usa", "uk", "germany", "france"], "correct_choice_idx": 1, "direct_answers": ["england", "bus", "england", "usa", "holly", "france", "europe", "country", "england", "uk"], "difficult_direct_answer": false, "rationales": ["The country is the uk.", "The country is the uk.", "A small blue, red, and white flag can be seen attached to side of a building. it has a cross in red and lines connecting to center of cross."], "image": "val2014/COCO_val2014_000000109939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153445, "question_id": "BNzwWHMW8VTAfYQx9WwynD", "question": "The company that made 3D world also made what famous video game system?", "choices": ["playstation", "xbox", "gamecube", "wii"], "correct_choice_idx": 0, "direct_answers": ["super mario", "sony", "fortnite", "super mario", "created", "nintendo", "playstation", "sony", "yes true", "television"], "difficult_direct_answer": false, "rationales": ["The text on the television indicates that 3d world was created by sony. xboxes are made by microsoft, and gamecubes and wins were made by nintendo.", "The screen is on the tv.", "Sony also produces the playstation game system."], "image": "val2014/COCO_val2014_000000153445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573046, "question_id": "BPTFYobZ5hhJuN8cLjr4Yi", "question": "What is in the man's hand?", "choices": ["umbrella", "basketball", "baseball", "baton"], "correct_choice_idx": 0, "direct_answers": ["feefe", "umbrella", "umbrella", "light", "lighting umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["The man has an umbrella.", "The man is holding a long pole with a cover above it.", "A man is holding a long, thin handle attached to a metal frame holding material in a dome shape."], "image": "train2014/COCO_train2014_000000573046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495666, "question_id": "BQGved7NMqdemQkXcCb3Qo", "question": "What animals have the tags on them?", "choices": ["dogs", "cats", "cows", "horses"], "correct_choice_idx": 2, "direct_answers": ["cows", "cows", "cows", "cows", "cows", "cow", "front", "cows", "cows", "cows"], "difficult_direct_answer": false, "rationales": ["They're cows.", "There are a bunch of cows with tags in them.", "These are livestock animals. they have an elongated snout and have hooves."], "image": "train2014/COCO_train2014_000000495666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124157, "question_id": "BQwRk6DY9orbVWkWRNbsYH", "question": "What animals are usually depicted eating these items?", "choices": ["monkeys", "stingrays", "cows", "rabbits"], "correct_choice_idx": 0, "direct_answers": ["monkey", "monkey", "car", "monkeys", "monkey", "monkey", "monkey", "stop", "monkey", "monkeys"], "difficult_direct_answer": false, "rationales": ["The animals are monkeys.", "Monkeys are the only animals that eat bananas. cows, stingrays and rabbits do not eat bananas.", "Monkeys eat bananas."], "image": "val2014/COCO_val2014_000000124157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456606, "question_id": "BRLssvdQATMyCRkzGX5guv", "question": "What subject matter is printed on the materials in the binder?", "choices": ["english", "physics", "mathematics", "chemistry"], "correct_choice_idx": 1, "direct_answers": ["science", "maths", "physics", "english", "instructions", "mathematics", "math", "paper", "physics", "physics"], "difficult_direct_answer": false, "rationales": ["The subject is physics.", "The printed materials in the binder refer to concepts like velocity and electric fields. these concepts are not associated with chemistry, english, or mathematics.", "Physics subject matter is in the binder. there are equations written that use the letter \"v\" and \"e\" which represent the velocity and the speed of light respectively."], "image": "train2014/COCO_train2014_000000456606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129940, "question_id": "BSdiCSGZ6iQ3Db2DBz9stg", "question": "What is near the planes?", "choices": ["butterflies", "kites", "exhaust", "balloons"], "correct_choice_idx": 2, "direct_answers": ["clouds", "smoke", "smoke", "toronto", "smoke", "smoke", "sky", "exhaust", "smoke", "jet formation"], "difficult_direct_answer": false, "rationales": ["It looks clawlike, which is nice. it's obviously coming out of the planes.", "A line of planes is in the air with a trail of white trailing behind them.", "Exhaust is nearby."], "image": "train2014/COCO_train2014_000000129940.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304694, "question_id": "BTNDDkjTHxiRuyVTmvsf9V", "question": "The concrete planter has how many orange boards on the side?", "choices": ["two", "five", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three parts to it.", "There are a trio of orange boards on the concrete planter.", "There are three boards that are orange."], "image": "train2014/COCO_train2014_000000304694.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539326, "question_id": "BUbbiQsHFpGFajN3B9w3gV", "question": "What is the technical term for what the animals are doing?", "choices": ["molting", "grazing", "hibernating", "migrating south"], "correct_choice_idx": 1, "direct_answers": ["grazing", "grazing", "maintain", "grazing", "cow", "grazing", "cows", "grazing", "cow", "grazzing"], "difficult_direct_answer": false, "rationales": ["The term is grazing.", "Large animals are in an open, grassy area with there heads bent down to the ground.", "The term is grazing."], "image": "val2014/COCO_val2014_000000539326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174868, "question_id": "BV9PVS7BD7Yhfv29LzEEvP", "question": "Which appliance is most likely to have a cold interior?", "choices": ["stove", "none", "microwave", "fridge"], "correct_choice_idx": 3, "direct_answers": ["fridge", "fridge", "freezer", "fridge", "fridge", "refrigerator", "refrigerator", "shelf", "car", "fridge"], "difficult_direct_answer": false, "rationales": ["This is to cool and freeze foods", "The item is used to keep food cold and fresh.", "Of the appliances visible, answer a is known to be used to keep things cold, while none of the others serve this function."], "image": "train2014/COCO_train2014_000000174868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570381, "question_id": "BVeME57FGX2n9iEGSFxG4R", "question": "What are the woman raising?", "choices": ["cows", "glasses", "chickens", "graduation hats"], "correct_choice_idx": 1, "direct_answers": ["glasses", "glass", "glasses", "glasses", "juice", "glasses", "glasses", "glasses", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["They are celebrating.", "You can see clearly they are all toasting and holding glass cups.", "All the women are pushing glasses together."], "image": "val2014/COCO_val2014_000000570381.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 397041, "question_id": "BXLRM73932BZoi8pykq9Zc", "question": "What is in the sandwich?", "choices": ["cheese", "pork chop", "purple onion", "bagel"], "correct_choice_idx": 0, "direct_answers": ["cheese", "car", "croque monsieur", "meat", "cheese", "meat", "cheese", "ham cheese", "cheese meat", "ham cheese"], "difficult_direct_answer": false, "rationales": ["Melted cheese with parsley is on top of a toasted sandwich.", "Cheese is in the sandwich.", "The sandwich has cheese."], "image": "val2014/COCO_val2014_000000397041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326935, "question_id": "BXaPV7EDCHvpYKnVHLBqSp", "question": "What part of the body are the elephants holding up?", "choices": ["tails", "arms", "ears", "trunks"], "correct_choice_idx": 3, "direct_answers": ["trunks", "trunk", "trunks", "trunks", "trunk", "trunks", "trunk", "trunk", "trunk", "trunks"], "difficult_direct_answer": false, "rationales": ["Their tails and ears are down. elephants do not have arms.", "Their noses are in the air and these are called trunks.", "This is the name of their noses."], "image": "train2014/COCO_train2014_000000326935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281763, "question_id": "BXbxsMetCjMN6pCS35ymJ5", "question": "What is near the toilet?", "choices": ["towel", "cat", "baby", "rat"], "correct_choice_idx": 0, "direct_answers": ["sink", "towel", "towel", "sink", "towel", "towel", "shower", "towel", "towel", "shower"], "difficult_direct_answer": false, "rationales": ["There is a towel.", "Linens hang from a shower door and the wall in a bathroom.", "There are no babies or animals in the room."], "image": "train2014/COCO_train2014_000000281763.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370059, "question_id": "BXgvM4hoBVRQcsqek56UVV", "question": "How many pizzas are sitting on top of the table where many people are sitting?", "choices": ["one", "three", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "large pizza", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["One pizza is in between two others.", "There is one on each end and one in the middle", "There are three visible pizzas."], "image": "train2014/COCO_train2014_000000370059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73310, "question_id": "BXidxGNB9Q4S3RG3RCpACz", "question": "What is on the bottom left?", "choices": ["teacup", "giraffe", "egg", "baby"], "correct_choice_idx": 0, "direct_answers": ["cup", "tea cups", "cake", "tea", "tea set", "cup soccer", "teacup", "teacups", "cup plate", "tea cup"], "difficult_direct_answer": true, "rationales": ["It's the only one that matches the bottom image in that position.", "There is a glass cup that is sitting on a saucer. it is filled with a brown liquid.", "This is obvious in the image."], "image": "train2014/COCO_train2014_000000073310.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542450, "question_id": "BXtcHqqFcMx9mg2sqhSpuU", "question": "What is sitting on the stand of the vendor on the left?", "choices": ["bananas", "parsley", "pine nuts", "tomatoes"], "correct_choice_idx": 3, "direct_answers": ["fruit", "fruit stand", "produce", "fruit", "fruit", "tomatoes", "fruit", "fruit", "man", "fruit"], "difficult_direct_answer": false, "rationales": ["These are red small vegetables.", "This appears to be the right answer based on the color of the fruit in the boxes.", "There are tomatoes sitting."], "image": "train2014/COCO_train2014_000000542450.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99539, "question_id": "BY3JTBRabMRQ9EbFaoaeRZ", "question": "What is the most likely drink in the cup on the table?", "choices": ["orange juice", "coffee", "alcohol", "water"], "correct_choice_idx": 2, "direct_answers": ["water", "alcohol", "beer", "water", "beer", "alcohol", "coca cola", "glass", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["It could also be soda/pop, but that's not one of the options. it looks like it could even be rum and coke.", "The alcohol is the drink to be as there is a coke to chase it.", "There is alcohol on the table."], "image": "train2014/COCO_train2014_000000099539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70059, "question_id": "BYic9HPf7Sjuj9JigJJLKT", "question": "What tells people where to stand for safety?", "choices": ["garbage can", "yellow line", "train", "blue column"], "correct_choice_idx": 1, "direct_answers": ["orange line", "orange line", "yellow line", "yellow line", "yellow line", "sign", "trains", "way", "metro", "leave distance"], "difficult_direct_answer": false, "rationales": ["The marking indicates where the edge of the platform is.", "There is a yellow line drawn up next to the subway train which tells people where to stand behind for safety.", "The yellow line at the edge of the train platform is where people need to stand behind to stay safe."], "image": "train2014/COCO_train2014_000000070059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539326, "question_id": "BZyPLdMjeL9XUwzfkPUj8D", "question": "What is the breed name of the all white cows?", "choices": ["charolais", "texas longhorn", "hereford", "angus"], "correct_choice_idx": 0, "direct_answers": ["unkown", "charolais", "bull", "milk", "jersey", "charolais", "gersy", "milk cows", "holsteins", "hereford"], "difficult_direct_answer": true, "rationales": ["All white cows are known as charolais.", "Charolais cattle are white, horned and long-bodied.", "The breed name is charolais."], "image": "val2014/COCO_val2014_000000539326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353237, "question_id": "BbVwyAnzWcmJEUyd8XSemg", "question": "How old is the baby?", "choices": ["four", "three", "two", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The baby is being presented a cake with a lit candle on it which is synonymous with ones birthday. a birthday cake usually indicates via candles how old the person is and this candle is in the shape of answer a.", "The baby is being presented with a cake with a candle on it. this is usually done on one's birthday and the number on the candle indicates the age.", "The baby is 1."], "image": "train2014/COCO_train2014_000000353237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18750, "question_id": "BbnjQ399jeEMEDTasoX8E4", "question": "What feature is visible?", "choices": ["car hood", "ladder", "railing", "hammock"], "correct_choice_idx": 2, "direct_answers": ["spire", "railing", "towers", "church", "steeple", "steeples", "church exterior", "steeple", "temple", "church"], "difficult_direct_answer": false, "rationales": ["The other options aren't even in this scene at all.", "There is a railing on the top of the church building.", "The building is in the picture. the picture has a railing on it."], "image": "val2014/COCO_val2014_000000018750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192954, "question_id": "Bbr2hz5F73vbTAtU9fNwHq", "question": "What level is this room on?", "choices": ["second", "ground", "basement", "attic"], "correct_choice_idx": 1, "direct_answers": ["first", "second", "first", "first", "first", "first", "first", "slope", "first level", "ground"], "difficult_direct_answer": false, "rationales": ["The rails outside and the bushes can only be found on the first floor.", "The level is on the ground.", "The level is at the ground."], "image": "train2014/COCO_train2014_000000192954.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114235, "question_id": "BcAbWkAER6CJBke6YNvm4T", "question": "What sound would a baby make when they see this event?", "choices": ["neigh", "woof", "meow", "choo choo"], "correct_choice_idx": 3, "direct_answers": ["cry", "choo choo", "choo-choo", "cry", "choo choo", "crying", "excitement", "cry", "cry", "choo choo"], "difficult_direct_answer": false, "rationales": ["The object passing by is a train, not a cat, dog, or horse.", "That is the classic old train sound", "The train would choo choo."], "image": "val2014/COCO_val2014_000000114235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325570, "question_id": "BcjNcHNhnyWyN7MFz865yk", "question": "Why are the elephants in the water?", "choices": ["chasing", "bathing", "hiding", "swimming"], "correct_choice_idx": 1, "direct_answers": ["heat", "drinking water", "drinking", "drinking", "drinking", "drinking", "bathing", "drink", "drinking", "drinking"], "difficult_direct_answer": false, "rationales": ["The elephants are likely cooling off in the water or washing themselves.", "There are elephants bathing.", "They splash themselves to wash off mud"], "image": "train2014/COCO_train2014_000000325570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242307, "question_id": "BdQbTCCBRpgc767xpTdQ9A", "question": "What is the banana supposed to represent?", "choices": ["comb", "gun", "phone", "boot"], "correct_choice_idx": 1, "direct_answers": ["romantic", "penis", "penis", "gun", "penis", "fruit", "phone", "whip", "food", "gun"], "difficult_direct_answer": false, "rationales": ["The banana is held like a gun.", "He is holding it up.", "It might also have a phallic meaning, but that isn't one of the options and he looks like he's trying to mimic a gangster."], "image": "train2014/COCO_train2014_000000242307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74434, "question_id": "BdkGZvfBQhzX43UCNuedG4", "question": "What language does the name on the side of the truck come from?", "choices": ["chinese", "french", "greek", "spanish"], "correct_choice_idx": 2, "direct_answers": ["greek", "english", "greek", "usa", "greek", "usa", "greek", "english", "english", "greek"], "difficult_direct_answer": false, "rationales": ["The language is greek.", "The name is a name of a letter in that language's alphabet.", "The name on the truck is delta."], "image": "val2014/COCO_val2014_000000074434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316154, "question_id": "Bg5RvZ7ADZmVwRCNKSiBzV", "question": "What is likely opposite the toilet?", "choices": ["closet", "bedroom door", "bathtub", "vanity"], "correct_choice_idx": 2, "direct_answers": ["shower", "bathtub", "mirror", "shower", "mirror", "sink", "bath tub", "toilette paper", "shower", "bathtub"], "difficult_direct_answer": false, "rationales": ["Bathtubs are normally in the bathroom.", "The bathroom shows a mirror with bath towels in the reflection so there must be a bathtub in the room.", "The most common object across from the toilet would be the tub."], "image": "train2014/COCO_train2014_000000316154.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435005, "question_id": "BgBErWEqEwqvBjDHFz9dkF", "question": "What happens in this room?", "choices": ["exercising", "watching tv", "washing hands", "writing letters"], "correct_choice_idx": 2, "direct_answers": ["bathing", "bath", "urination", "bath", "taking bath", "bath", "bathing", "bathing", "bathe", "washing hands"], "difficult_direct_answer": false, "rationales": ["There is a sink with soap next to it.", "This is a bathroom. you can clean your hands in the sink here.", "The room is a bathroom. there are sinks on the left hand side."], "image": "val2014/COCO_val2014_000000435005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553079, "question_id": "Bgh6ApzcXTWaYarPxHhojW", "question": "What song is related to the long item in the middle?", "choices": ["crazy train", "red carpet", "slither", "cupids bow"], "correct_choice_idx": 0, "direct_answers": ["train", "midnight train", "downtown train", "train", "train wreck", "train", "train", "crazy train", "crazy train", "subway train"], "difficult_direct_answer": false, "rationales": ["This is a city subway system that uses tracks", "The object is on tracks and ozzy osbourne has a song about a nutty one.", "Ozzy osbourne has a song called \"crazy train\""], "image": "train2014/COCO_train2014_000000553079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233064, "question_id": "BhMvWqMUAYhC9rZPbaxjbN", "question": "How many whole cakes are there present in the store case?", "choices": ["one", "three", "two", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "six", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3 cakes.", "One cake is in between two other cakes.", "There are three whole cakes on display in the case."], "image": "train2014/COCO_train2014_000000233064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317688, "question_id": "BhUWDGCQSgjSezRf52g3vo", "question": "What team plays in the city that is mentioned on the tub?", "choices": ["philadelphia flyers", "ny jets", "milwaukee bucks", "cincinnati reds"], "correct_choice_idx": 0, "direct_answers": ["eagles", "eagles", "76ers", "philadipha", "philadelphia flyers", "76ers", "eagles", "eagles", "sixers", "eagles"], "difficult_direct_answer": false, "rationales": ["The team of philadelphia is the flyers.", "The team is the flyers.", "The team is the philadelphia flyers."], "image": "train2014/COCO_train2014_000000317688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233404, "question_id": "BhWprShwkeJzj3wSaWAw5z", "question": "What color is the scarf wrapped around the suitcase pulled on the left?", "choices": ["red", "yellow", "green", "blue"], "correct_choice_idx": 0, "direct_answers": ["reddish pink", "car", "reddish", "red", "red", "red", "red", "red", "red", "sandal colour"], "difficult_direct_answer": false, "rationales": ["It's obviously not any of the other color options.", "It is red.", "The scarf is red."], "image": "train2014/COCO_train2014_000000233404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271032, "question_id": "BjB4oWHDFYsTXJDraaiCZJ", "question": "What surface are all the birds standing on next to the big river?", "choices": ["stone", "dirt", "wood", "grass"], "correct_choice_idx": 0, "direct_answers": ["sea", "rock", "rocks", "stone", "rocks", "rocks", "stone", "rock", "rocks", "rock bank"], "difficult_direct_answer": false, "rationales": ["There is a stone surface.", "The ground looks to have those \"jagged\", rough edges similar to rocks.", "The surface is stone."], "image": "val2014/COCO_val2014_000000271032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12443, "question_id": "BjUU4N68KtYBtET5FVj5q3", "question": "What is the whitish metal object behind the cat's head?", "choices": ["fridge", "radiator", "stove", "air conditioner"], "correct_choice_idx": 1, "direct_answers": ["radiator", "radiator", "it's laptop", "heater", "radiator", "radiator", "laptop", "radiator", "radiator", "radiator"], "difficult_direct_answer": false, "rationales": ["This provides heat by having warm water flow through it", "The bars of the heat giving device can be seen.", "The object is made out of metal and has a coiled rectangular shape."], "image": "val2014/COCO_val2014_000000012443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161962, "question_id": "BnKHZJ7nwnTt6MFaeZrzJG", "question": "What are the green parts of the building called?", "choices": ["steeples", "courtyard", "administration", "barracks"], "correct_choice_idx": 0, "direct_answers": ["grass", "pillars", "no", "towers", "steeple", "steeples", "greenhouse", "steeples", "turret roof", "steeple"], "difficult_direct_answer": false, "rationales": ["The word is similar to \"steep\" in which it's at sharp angles.", "The parts are steeples.", "The green parts are steeples."], "image": "val2014/COCO_val2014_000000161962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15262, "question_id": "BngeVqKejY7SRjPHFXdH6T", "question": "What color is the beverage contained by the cup on the right?", "choices": ["yellow", "green", "red", "blue"], "correct_choice_idx": 0, "direct_answers": ["good", "yellow", "yellow", "orange", "yellow", "yellow", "yellow", "yellow", "yellow", "white"], "difficult_direct_answer": false, "rationales": ["It's the color of a banana", "The beverage is yellow.", "The color looks like that of a lemon."], "image": "train2014/COCO_train2014_000000015262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91463, "question_id": "BoKqPfJwLonbXrQN8eMPbz", "question": "What is lined up next to each other?", "choices": ["babies", "eggs", "wine glasses", "pumpkins"], "correct_choice_idx": 2, "direct_answers": ["wine glasses", "wine glass", "glasses", "cup", "wine glasses", "wine glasses", "wine glasses", "wine glasses", "wine glasses", "wine glasses"], "difficult_direct_answer": false, "rationales": ["There are no people or food items on the table.", "The glasses are lined up.", "There are wine glasses lined up."], "image": "train2014/COCO_train2014_000000091463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266370, "question_id": "BqsxzryV926jyD9voDRh5z", "question": "What is the type of food in the middle of the bread?", "choices": ["grain", "dairy", "vegetable", "meat"], "correct_choice_idx": 2, "direct_answers": ["vegetable", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli"], "difficult_direct_answer": false, "rationales": ["The food type is green and plant like.", "The type is a veggie.", "This is brocolli."], "image": "val2014/COCO_val2014_000000266370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472079, "question_id": "Brmj4eNSQXvzpqRrzc5oiw", "question": "The person in the foreground wearing blue looks most like what Sopranos character?", "choices": ["bobby baccalieri", "uncle junior", "carmela soprano", "livia soprano"], "correct_choice_idx": 1, "direct_answers": ["bald one", "no idea", "unknown", "very", "godfather", "junior soprano", "grandfather", "uncle junior", "silvio dante", "tony"], "difficult_direct_answer": true, "rationales": ["The person is uncle junior.", "A bald man with glasses is sitting in a chair and is closer than others nearby.", "That character is known for that bckground."], "image": "train2014/COCO_train2014_000000472079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432998, "question_id": "BsuGuQGDcT896SNY7vtXdG", "question": "What color is the face of the cartoon character on the backpack on the far right bed?", "choices": ["orange", "blue", "brown", "yellow"], "correct_choice_idx": 3, "direct_answers": ["beige", "yellow", "yellow", "yellow", "yellow", "red", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["It's winnie the pooh", "The character in the bag is yellow.", "The cartoon character's face is the color of the sun."], "image": "train2014/COCO_train2014_000000432998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270925, "question_id": "But3AwzHTghWexuTBkm4b5", "question": "What are the women on the left holding?", "choices": ["babies", "eggs", "cats", "umbrellas"], "correct_choice_idx": 3, "direct_answers": ["umbrella", "umbrellas", "umbrellas", "bag women", "parasol", "umbrellas", "umberla", "umbrella", "umbrellas", "umbrellas"], "difficult_direct_answer": false, "rationales": ["The women have umbrellas.", "These are held over the head for environment protection.", "They need shade."], "image": "train2014/COCO_train2014_000000270925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38134, "question_id": "BvcBGqgyC5H7f3Uy6JDRfs", "question": "What food is ready to eat?", "choices": ["cheesecake", "orange", "hamburger", "hot dog"], "correct_choice_idx": 1, "direct_answers": ["orange", "tangerine", "orange", "orange", "orange", "orange", "orange", "oranges", "lemons", "oranges"], "difficult_direct_answer": false, "rationales": ["Oranges are ready to eat.", "Citrus fruits are in a bowl.", "The oranges are all ready to eat."], "image": "train2014/COCO_train2014_000000038134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247880, "question_id": "BwJtDyA5w9z8PJDyiEmxsr", "question": "What color is the border of the board with the wood face?", "choices": ["red", "purple", "blue", "orange"], "correct_choice_idx": 2, "direct_answers": ["teal", "grey", "green", "yellow", "blue", "green", "green", "blue", "blue", "green"], "difficult_direct_answer": false, "rationales": ["It's the only color that is vaguely suggested in the image. the other options don't match even a light shade.", "The color is blue.", "The surfboard with the wood face has a blue border around it."], "image": "val2014/COCO_val2014_000000247880.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265933, "question_id": "BwbhQJRq3iVhu6h7AbfNRc", "question": "What company has a similar name compared to the name of the sponsor of this event?", "choices": ["wwe", "lexisnexis", "milky way", "usps"], "correct_choice_idx": 1, "direct_answers": ["plexus", "lexus", "nexus", "lexus", "lexus", "car company", "nike", "nexus", "lexisnexis", "lexus"], "difficult_direct_answer": false, "rationales": ["The company has almost identical spelling to the sponsor.", "Lexus is similar to lexis.", "The name rhymes with the sponsor's name."], "image": "train2014/COCO_train2014_000000265933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3320, "question_id": "BwzNG7iDZF7vMmPHkEtJVK", "question": "What is the person that will board last wearing?", "choices": ["garbage bag", "armor", "crown", "hood"], "correct_choice_idx": 3, "direct_answers": ["backpack", "hood", "bag", "black", "coat", "jeans jacket", "jacket", "coat", "backpack", "backpack"], "difficult_direct_answer": false, "rationales": ["The person has a covering for their head which is connected to their jacket.", "The person that is last to get on the boat is wearing a jacket and a hood.", "The person boarding last is wearing a hood and a backpack."], "image": "train2014/COCO_train2014_000000003320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96754, "question_id": "Bxfon2msH4HKpcroUaLb7Y", "question": "What is this type of tennis match called with two people on each side?", "choices": ["pickle ball", "battle royal", "cricket", "doubles"], "correct_choice_idx": 3, "direct_answers": ["doubles", "doubles", "doubles", "doubles", "doubles", "doubles", "tennis", "doubles", "doubles", "doubles"], "difficult_direct_answer": false, "rationales": ["The answer is commonly known based on the rules of tennis and the players orientation.", "There's two people for each team, giving the match type its name.", "Two people play on each side so it's twice as many as a regular game"], "image": "train2014/COCO_train2014_000000096754.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245361, "question_id": "BxhsLAD8azMSPdw6pw9Mxs", "question": "What is a term related to this event?", "choices": ["goal", "homerun", "surfs up", "balance beam"], "correct_choice_idx": 2, "direct_answers": ["surfing", "surfboard", "surfing", "surfing", "surfing", "surfing", "surfing", "surfing", "surfs up", "surfing"], "difficult_direct_answer": false, "rationales": ["The man is surfing in the water.", "The term is surfing.", "The event is called surfs up because people surf during this event"], "image": "train2014/COCO_train2014_000000245361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143224, "question_id": "BxkHJ2RebniYdpTjunxTuZ", "question": "The flag of which nation is painted laterally around this airplane?", "choices": ["uk", "russia", "usa", "france"], "correct_choice_idx": 3, "direct_answers": ["united states", "russia", "american", "america", "netherlands", "netherland", "america", "france", "usa", "france"], "difficult_direct_answer": false, "rationales": ["The top, middle, and bottom stripes are blue, white, and red. there are no stars or crosses.", "The flag is the recognizable blue, white and red in horizontal stripes.", "An airplane has a blue, white, and red stripe painted around it. the flag for france is blue, white, and red."], "image": "val2014/COCO_val2014_000000143224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280396, "question_id": "ByJnGjCESNsa6ekkKnq6oU", "question": "What is it called when horses have hair on their feet?", "choices": ["feathering", "mane", "tufts", "hoof hair"], "correct_choice_idx": 0, "direct_answers": ["fee", "fancy", "hair", "feathered", "running", "clydesdale", "feathering", "feathering", "curling", "fur"], "difficult_direct_answer": true, "rationales": ["The horses' hair is known as feathers.", "It's long hair that covers the hooves and goes to the ground", "It's because they look wispy and cover the hooves"], "image": "train2014/COCO_train2014_000000280396.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375531, "question_id": "BzFMLmeSitNraj6ptuue44", "question": "What is this vessel called?", "choices": ["inflatable dinghy", "slicker", "pontoon", "canoe"], "correct_choice_idx": 0, "direct_answers": ["boat", "power boat", "inflatable dinghy", "motor boat", "boat", "zodiac", "boater", "raft", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["The vessel is an inflatable.", "The vessel is inflatable.", "You can put air into and take it out of this little vessel whose name is much is pronounced the same as the short word for ding-a-ling."], "image": "train2014/COCO_train2014_000000375531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364173, "question_id": "BzmrpTrDyFNvVsq97g8hQX", "question": "What is a term that can refer to animals like these?", "choices": ["steer", "puppy", "kitten", "joey"], "correct_choice_idx": 0, "direct_answers": ["cattle", "steer", "cows", "cows", "cows", "cows", "cows", "cows", "cow", "cattle"], "difficult_direct_answer": false, "rationales": ["These are cattle", "The animals are cows, not kangaroos, cats, or dogs.", "A group of cows are gathered behind a fence."], "image": "train2014/COCO_train2014_000000364173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530265, "question_id": "C289XAztmxnLrZKmLSWVw4", "question": "What is the blue rectangular plastic item used to hold at the top of the desk?", "choices": ["pills", "stickers", "screws", "thumbtacks"], "correct_choice_idx": 0, "direct_answers": ["photo", "pills", "medicine", "paper", "stand", "pills", "pills", "pills", "office supplies", "pills"], "difficult_direct_answer": false, "rationales": ["The item is for pills.", "This has the days of the week on it so you keep your medicine straight", "There is a long rectangular pill box that is on a storage case sitting on kitchen counter."], "image": "val2014/COCO_val2014_000000530265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410175, "question_id": "C2j73YMHqm7gmhvE5K3DWS", "question": "What is touching the snow?", "choices": ["cats paw", "skis", "dogs paw", "cowboy boots"], "correct_choice_idx": 1, "direct_answers": ["skis", "poles", "person", "skis", "skis", "skis", "ski poles", "skiis", "skis", "pole"], "difficult_direct_answer": false, "rationales": ["A group of people are on skis and are standing in a snowy area.", "The items are being stood on, and are long planks for sliding on the surface of snow.", "You would use those sticks to help propel you in the snow and keep you from falling down."], "image": "train2014/COCO_train2014_000000410175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506910, "question_id": "C2tMoj7zvs6zZHmTrxMf3b", "question": "What design is next to the clock on the largest building?", "choices": ["star", "wolf sigil", "cross", "hexagon"], "correct_choice_idx": 2, "direct_answers": ["cross", "cross", "cross", "cross", "cross", "cross", "christian", "cross", "cross", "cross"], "difficult_direct_answer": false, "rationales": ["There are crosses beside the clock on the building tower.", "The tallest structure on the largest building is a tower with a gold cross at the top.", "The design on next to the clock are crosses."], "image": "train2014/COCO_train2014_000000506910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252384, "question_id": "C4ZKz8EiebAG4x7wkmTYPq", "question": "Which team is winning?", "choices": ["mets", "visitor", "eaglebrook", "jets"], "correct_choice_idx": 1, "direct_answers": ["visitor", "visitors", "visitor", "visitor", "visitor", "visitor", "visitor", "visitors", "visitors", "visitor"], "difficult_direct_answer": false, "rationales": ["The score is 10 next to this one and it is 0 next to the home team", "The rival team is winning the game.", "Eaglebrook has scored zero points. the other team has one point."], "image": "train2014/COCO_train2014_000000252384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321066, "question_id": "C4jLpdqj5MZcnrbjg9QUPC", "question": "Which nation's flag is hoisted on the side of the boat?", "choices": ["france", "germany", "united states", "spain"], "correct_choice_idx": 2, "direct_answers": ["united states", "usa", "usa", "usa", "america", "stop", "usa", "usa", "american flag", "usa"], "difficult_direct_answer": false, "rationales": ["There is a red white and blue flag with stars attached to sail of a boat.", "It's the stars and stripes", "It is red, white, and blue."], "image": "train2014/COCO_train2014_000000321066.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218096, "question_id": "C4nTZALWw6d6eHBN3T7oBS", "question": "What are the people drinking?", "choices": ["orange juice", "milk", "soda pop", "alcohol"], "correct_choice_idx": 3, "direct_answers": ["alcohol", "alcohol", "wine", "alcohol", "beers", "beer", "alcohol", "alcoholic beverages", "alcohol", "alcohol"], "difficult_direct_answer": false, "rationales": ["The man on the left has a flask. the man on the right has a beer bottle.", "A man in a stripped shirt is drinking from a flask and his buddy has a bottle of beer in hand.", "He is drinking out of a flask and the other man is drinking a beer."], "image": "train2014/COCO_train2014_000000218096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232358, "question_id": "C57DYGa85bYHWijL9uKLnn", "question": "Why is there a phone by the shower?", "choices": ["for help", "text", "chat", "internet"], "correct_choice_idx": 0, "direct_answers": ["call", "for help", "taking photo", "communication", "call cleaning", "reflection", "calls", "for help", "to answer", "business"], "difficult_direct_answer": true, "rationales": ["Sometimes phones are in the bathroom for emergencies.", "This is so you can call if you fall", "This is a walkin shower that is used for elderly people. a phone is here if they fall"], "image": "train2014/COCO_train2014_000000232358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168218, "question_id": "C7Px75FvAXGWQFE5uCNVTx", "question": "How many sections have been drawn in the tennis court for players?", "choices": ["four", "six", "two", "eight"], "correct_choice_idx": 3, "direct_answers": ["eight", "eight", "ten", "eight", "eight", "six", "eight", "eight", "four", "six"], "difficult_direct_answer": false, "rationales": ["There are 4 sections on each side", "Based on the provided image, the court has been split into sections with white lines. the sections are clearly marked, visible and countable.", "It's easily broken down by the white lines."], "image": "train2014/COCO_train2014_000000168218.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241064, "question_id": "C8SJD4eNG9bgna5jWR5dcs", "question": "How many little sheep are sitting on the grass?", "choices": ["three", "two", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["four", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["Three white sheep are on the forefront in the grass and a large sheep standing to right.", "There are three lambs in the grass.", "They are laying down next to the bigger one"], "image": "val2014/COCO_val2014_000000241064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232720, "question_id": "C99WGiJFRskgwrvaZYEC7e", "question": "What is hanging on the right side of the room?", "choices": ["goddess statue", "deer head", "baseball pennant", "monkey"], "correct_choice_idx": 1, "direct_answers": ["ramp on", "moose head", "deer head", "sculpture", "deer head", "deer head", "deer head", "pictures", "deer head", "deer head"], "difficult_direct_answer": false, "rationales": ["The deer head hangs.", "The item comes from an animal that is not capable of climbing trees.", "There is a mounted taxidermy display of an animal that resembles the item in option a."], "image": "train2014/COCO_train2014_000000232720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472080, "question_id": "CA7H2zTppn8pKnGmM43L37", "question": "How many computer displays are on top of the black desk with two mouses?", "choices": ["five", "two", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two computer displays. one is a laptop and one is a desktop.", "There are 2 displays on the desk.", "There are two displays."], "image": "val2014/COCO_val2014_000000472080.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167110, "question_id": "CAUphxVaAtWdo2ZMMUcnan", "question": "What is going up the ramp?", "choices": ["hamster", "cat", "skateboarder", "domino maze"], "correct_choice_idx": 2, "direct_answers": ["skateboarder", "skateboarder", "man", "person", "boy", "kid", "jump", "skateboarder", "skateboarder", "people"], "difficult_direct_answer": false, "rationales": ["He is on a board with wheels under it and he is at a skate park.", "A person on a skateboard is doing a trick on the ramp.", "The ramp is locatable and identifiable and the thing moving up in relation to the ramp is also visible."], "image": "val2014/COCO_val2014_000000167110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572500, "question_id": "CDe7Yy2Ng9BB6PnTK6iQSf", "question": "What color is the bag on top of the bench and below the woman's feet?", "choices": ["green", "turquoise", "blue", "red"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "green", "green", "light green", "light green", "green", "green", "turquoise", "teal"], "difficult_direct_answer": false, "rationales": ["The color is turquoise.", "The color is bright and easily visible. it is in sharp contrast to the gray bench.", "The color is turquoise."], "image": "train2014/COCO_train2014_000000572500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42968, "question_id": "CEw2doAAK5RYFJApnDLQKF", "question": "What mode of transportation is using the field behind the fence?", "choices": ["trucks", "aircraft", "boats", "helicopters"], "correct_choice_idx": 1, "direct_answers": ["airplane", "aircraft", "airplane", "plane", "airplane", "plane", "airplane", "aircraft", "air", "airplane"], "difficult_direct_answer": false, "rationales": ["There is an airplane parked on the ground behind the fence that is used for air travel.", "Airplanes are behind the fence.", "It's obviously a plane."], "image": "val2014/COCO_val2014_000000042968.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285558, "question_id": "CHkW8RKgmKLJZqQDYrbQJR", "question": "What kind of pants is the girl in pink wearing?", "choices": ["capris", "leggings", "pajama bottoms", "jeans"], "correct_choice_idx": 2, "direct_answers": ["pajama", "blue jeans", "pajama bottoms", "flannel", "sweatpants", "pajama", "sweat", "pajama", "dance", "pajamas"], "difficult_direct_answer": false, "rationales": ["The pants are pa bottoms.", "The style of the pants is similar to sleepwear where it is loose fitting and has an informal pattern.", "The girl in pink is wearing relaxed clothing meant for sleeping."], "image": "val2014/COCO_val2014_000000285558.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531816, "question_id": "CJXajsaCfVLU4hYJwTGtDz", "question": "What year is closest to the year this doll originated?", "choices": ["1955", "1995", "1970", "1982"], "correct_choice_idx": 3, "direct_answers": ["eighties", "1902", "1982", "1997", "1981", "two thousand", "1985", "1983", "1980", "doll"], "difficult_direct_answer": true, "rationales": ["The doll is a care bear. these dolls were released before 1995 and after 1970.", "This is a care bear. care bears first came out in 1981.", "A girl is holding a plush bear known as a \"care bear\"."], "image": "val2014/COCO_val2014_000000531816.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261439, "question_id": "CKBmfQovaMcPeYQ5QVFh38", "question": "What color is the topping on top of the desert on top of the plates?", "choices": ["brown", "purple", "green", "white"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "brown", "red", "brown", "brown", "black", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The topping is not white, green, or purple.", "A dessert on a plate has a dark colored frosting on top.", "This looks to be a tiramisu on the plate. the top is brown."], "image": "train2014/COCO_train2014_000000261439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39928, "question_id": "CKRGrYgonS2RxsnoGCSBPK", "question": "Where is Pulloveria based?", "choices": ["hamburg", "vienna", "new york", "toronto"], "correct_choice_idx": 1, "direct_answers": ["vienna", "sides", "in streetside", "galerie", "italy", "france", "italy", "side walk", "vienna", "vienna"], "difficult_direct_answer": false, "rationales": ["The shop has the origin in vienna.", "Pulloveria is in vienna.", "The corporate headquarters are in this Austrian city."], "image": "train2014/COCO_train2014_000000039928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289490, "question_id": "CKfcatnh3C8wZsn5T3WJhB", "question": "What poses the greatest immediate danger to the man?", "choices": ["rockslide", "waves", "tiger", "spiders"], "correct_choice_idx": 1, "direct_answers": ["rapids", "water", "elephant", "river", "water", "rock", "elephants", "water", "water", "waves"], "difficult_direct_answer": false, "rationales": ["Since the elephants are not listed and there are no spiders or tigers around, it would be the waves. the water seems to be moving a lot, which could drag him away.", "You can get pushed into the water and drown.", "There's only water near the man, which looks quite violent due to the current."], "image": "train2014/COCO_train2014_000000289490.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208107, "question_id": "CKtKcj3tH2eKKXTdyCrTju", "question": "The person in the front of the boat is wearing a hat from which continent?", "choices": ["australia", "asia", "antarctica", "africa"], "correct_choice_idx": 1, "direct_answers": ["south asia", "asia", "china", "asia", "asia", "asia", "asia", "asia", "asia", "asia"], "difficult_direct_answer": false, "rationales": ["The hat is pointed.", "The man is wearing a conical or rice hat popular in places such as vietnam or china.", "The other options don't apply to this style of hat."], "image": "val2014/COCO_val2014_000000208107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194744, "question_id": "CLXxikgaHd3bSRhnx7ACV7", "question": "Where is the pizza being eaten?", "choices": ["food court", "small restaurant", "outdoor cafe", "home"], "correct_choice_idx": 0, "direct_answers": ["restaurant", "food court", "restaurant", "restaurant", "table", "pizzeria", "pizzeria", "restaurant", "one piece", "cafeteria"], "difficult_direct_answer": false, "rationales": ["The pizza is at a food court.", "Based on the open air concept and the line of windows in the background the setting is consistent with answer a.", "You can eat a pizza at a food court."], "image": "train2014/COCO_train2014_000000194744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568808, "question_id": "CN53z662iTu48Q7bSgTcpz", "question": "What is the house near?", "choices": ["baby", "cat", "water", "dog"], "correct_choice_idx": 2, "direct_answers": ["ocean", "water", "ship", "boats", "ship", "ocean", "water", "boat", "sea", "water"], "difficult_direct_answer": false, "rationales": ["The house is sitting close to the water of a beautiful bay.", "There is a seawall beside the house. there are no babies or pets.", "It looks like it might have been built on a small island."], "image": "train2014/COCO_train2014_000000568808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145792, "question_id": "CQNzwJpVe9hpNFcRsyocmT", "question": "What is the person with the umbrella walking on?", "choices": ["stairs", "sidewalk", "escalator", "zebra stripes"], "correct_choice_idx": 3, "direct_answers": ["crosswalk", "cross line", "crosswalk", "zebra stripes", "crosswalk", "crosswalk", "crosswalk", "floor", "road", "crosswalk"], "difficult_direct_answer": false, "rationales": ["The are stripes.", "The stripes are white and black.", "The person is striped."], "image": "train2014/COCO_train2014_000000145792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203495, "question_id": "CRUPx2hudiemFJZ5FJBYXj", "question": "What color are the child's shoes on the right?", "choices": ["purple", "white", "red", "black"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "white", "white", "white", "black", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["They are a similar color to the pants", "The child on the right is not wearing black, purple, or red shoes.", "The child's shoes are a dirty white color."], "image": "train2014/COCO_train2014_000000203495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253665, "question_id": "CRYCRQqyr7MztjcwezQ5en", "question": "What is creeping around on the table?", "choices": ["mouse", "monkey", "cat", "dog"], "correct_choice_idx": 2, "direct_answers": ["cat", "cat", "cat", "cat", "cat", "cat", "cat", "remote", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["There is a feline with pointed ears and a tail. he is on top of a glass table.", "It is a small furry animal with whiskers.", "There is a gray cat creeping around on the table in front of the fireplace."], "image": "val2014/COCO_val2014_000000253665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345861, "question_id": "CRbFcvChYi6Fa84wTiv4BA", "question": "What is this person currently doing on their computer?", "choices": ["gaming", "checking email", "watching netflix", "watching youtube"], "correct_choice_idx": 0, "direct_answers": ["gaming", "working", "playing games", "games", "gaming", "gaming", "gaming", "gaming", "work", "playing"], "difficult_direct_answer": false, "rationales": ["A game is shown on the screen.", "Xbox live, which is open on the desktop, is often only used for one sole purpose.", "They have character information on the screen"], "image": "train2014/COCO_train2014_000000345861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376283, "question_id": "CSvKngE2g7KWehjT6WGuxZ", "question": "What is the same color as the vehicle in the foreground?", "choices": ["cow", "elephant", "eagle", "bumble bee"], "correct_choice_idx": 3, "direct_answers": ["penguin", "other vehicle", "car", "windows", "bumble bee", "bus", "sun", "black", "yellow", "bumblebee"], "difficult_direct_answer": true, "rationales": ["Bees are the same color.", "The vehicles are yellow and black.", "The vehicle is black and yellow, just like the insect."], "image": "train2014/COCO_train2014_000000376283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517869, "question_id": "CTCtGPWYcksagQgBtsNr3G", "question": "What are the people using?", "choices": ["cars", "skateboards", "boxes", "apples"], "correct_choice_idx": 1, "direct_answers": ["skateboards", "skateboard", "skateboards", "skateboards", "skateboard", "skateboards", "skateboard", "skateboards", "skateboards", "skate boards"], "difficult_direct_answer": false, "rationales": ["The people are using non-motorized wheeled items to do tricks.", "The people have boards.", "The people have skateboards."], "image": "train2014/COCO_train2014_000000517869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93600, "question_id": "CTxSV96SxBirMWmcg5zURz", "question": "What color is the collar around the dog's neck who is watching TV?", "choices": ["white", "blue", "yellow", "red"], "correct_choice_idx": 1, "direct_answers": ["black", "blue", "black", "green", "black", "black", "blue", "black", "blue", "grey"], "difficult_direct_answer": false, "rationales": ["The fabric of the colour is blue.", "The color is blue.", "The black dog that is watch tv is wearing a collar that is very dark blue."], "image": "train2014/COCO_train2014_000000093600.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426826, "question_id": "CUSQAr65tKwbNsAa5BcZAu", "question": "What is a term used here?", "choices": ["touchdown", "goalie", "serve", "surfs up"], "correct_choice_idx": 2, "direct_answers": ["serve", "serve", "serve", "serve", "serve", "tennis", "tennis", "serve", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["The term is serving.", "The person is playing tennis and is about to hit the ball.", "The man is serving."], "image": "train2014/COCO_train2014_000000426826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438805, "question_id": "CUyTZupWrjdKgTbdSrD6dL", "question": "How many zebras are running across the savannah plain?", "choices": ["three", "one", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "bsll", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are 2.", "There are a couple of zebras running across the plain.", "There are two zebras."], "image": "train2014/COCO_train2014_000000438805.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155993, "question_id": "CWWhBSY8buZbVDWjPuX5Nr", "question": "What color is the border around the edges of the mirror?", "choices": ["blue", "wood", "black", "green"], "correct_choice_idx": 1, "direct_answers": ["wood", "fee", "brown", "brown", "brown", "brown", "fruitwood", "brown", "brown", "orange"], "difficult_direct_answer": false, "rationales": ["The color is wood.", "The edges of the mirror have a grain pattern and is brown, which is indicative of the plant.", "It's actually fruitwood."], "image": "train2014/COCO_train2014_000000155993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340200, "question_id": "CXH9gnNEdDo4o88QJ29qpx", "question": "Judging by the childs hair what did they just get done with?", "choices": ["sleeping", "bath", "eating", "fighting"], "correct_choice_idx": 1, "direct_answers": ["brown", "bathtime", "water", "remote", "bath", "bath", "bath", "remote", "speaking", "bath"], "difficult_direct_answer": false, "rationales": ["The hair looks wet and uncombed.", "The child's hair is slightly wet and looks like she is drying off from a bath.", "The hair is wet"], "image": "val2014/COCO_val2014_000000340200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224379, "question_id": "CYShvLo4BurouXvDC2KAd2", "question": "What is a term used in these kinds of events?", "choices": ["discus", "high dive", "homerun", "canter"], "correct_choice_idx": 3, "direct_answers": ["horse", "jockey", "equestrian", "race", "canter", "jockey", "prancing", "horse riding", "horse show", "polo"], "difficult_direct_answer": true, "rationales": ["The person is riding a horse, not playing baseball, diving, or throwing a discus.", "The person is riding a horse in an equestrian event, not playing baseball, diving, or participating in a track and field event.", "The person is riding a horse, not playing baseball, diving, or participating in a track and field event."], "image": "train2014/COCO_train2014_000000224379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294090, "question_id": "CYmhf98hK33aDuy6iyNReE", "question": "What kind of bike lock is used in this picture?", "choices": ["cable lock", "chain lock", "combination lock", "dlock"], "correct_choice_idx": 3, "direct_answers": ["parking meter", "dlock", "bar", "curved bar", "bycycle lock", "key", "u lock", "bike", "digital", "parking meter"], "difficult_direct_answer": true, "rationales": ["This option is called this because of the d letter shape. the other options aren't attached to this bike.", "There is a d-lock attached to the parking meter.", "The kid is a d lock."], "image": "train2014/COCO_train2014_000000294090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263270, "question_id": "CZKVEiHvJ7imEikV8dUtnY", "question": "What is near the tracks?", "choices": ["cats", "wolves", "trees", "elephants"], "correct_choice_idx": 2, "direct_answers": ["people", "village", "huts", "railway track", "trees", "people", "people", "people", "people", "people"], "difficult_direct_answer": false, "rationales": ["There are a bunch of trees next to the train tracks.", "There are large woody plants with leaves by the tracks. they item in option a matches that description.", "The tracks have trees."], "image": "train2014/COCO_train2014_000000263270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41505, "question_id": "CZdxoNoQduBrwoTabBwyqH", "question": "Why is the cat likely sleeping on the laptop?", "choices": ["attention", "unknown", "work", "heat"], "correct_choice_idx": 3, "direct_answers": ["tired", "heat", "tired", "accident", "it's warm", "tired", "it's warm", "sleepy", "tired", "cushion"], "difficult_direct_answer": false, "rationales": ["The laptop is warm for him to sleep on.", "The cat wants heat.", "Laptops are warm so the cat must be lying on it because he is cold."], "image": "train2014/COCO_train2014_000000041505.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16716, "question_id": "CZhSeBnVgMNC6q4NgN9JFd", "question": "What word shares the same first letter as the name of these animals?", "choices": ["carrot", "zipper", "baby", "deep"], "correct_choice_idx": 1, "direct_answers": ["zones", "zoo", "zipper", "zoophilia", "zoo", "zip", "zipper", "zodiac", "zebra", "zen"], "difficult_direct_answer": false, "rationales": ["The animals are zebras.", "The animals are zebras which starts with a \"z\".", "The word is a zipper."], "image": "val2014/COCO_val2014_000000016716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115912, "question_id": "CZopZ6WpW94wKkCXF5HkEC", "question": "Why is the man holding onto a handlebar?", "choices": ["balance", "protection", "make music", "steer dog"], "correct_choice_idx": 0, "direct_answers": ["wind surfing", "prevent drowning", "to ski", "waterskiing", "he's windsurfing", "being propelled", "windsurfing", "water skiing", "balance", "water skiing"], "difficult_direct_answer": true, "rationales": ["The man needs balance.", "You hold on something like that so you don't fall off and hurt yourself. he seems to be going at high speeds.", "A man is being pulled on a board, behind a boat, and is holding a handle attached to long ropes."], "image": "val2014/COCO_val2014_000000115912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99984, "question_id": "CaZXRyqFnACDfWB79o7AoA", "question": "What is next to the green plant?", "choices": ["eggs", "elves", "signs", "airplanes"], "correct_choice_idx": 2, "direct_answers": ["traffic sign", "signs", "signs", "traffic signal", "signs", "traffic light", "flower", "road signs", "signs", "traffic signs"], "difficult_direct_answer": false, "rationales": ["The signs are near.", "Among the green plants are different colored signs for travelers to see for their safety.", "There are a bunch of signs around the green plants."], "image": "val2014/COCO_val2014_000000099984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96693, "question_id": "CarW3umtTksmGJ6BdBGzsU", "question": "Who is famous for using one of the items that is missing from the slot on the left?", "choices": ["bo jackson", "freddy krueger", "jim those", "michael myers"], "correct_choice_idx": 3, "direct_answers": ["person", "chef", "chefs", "michael meyers", "michael myers", "chef", "chef", "ramsy", "van goh", "chef"], "difficult_direct_answer": false, "rationales": ["Michael meyers is famous for using a plug in one of the austin powers movies.", "The knife block is missing a knife, and michael myers is a character known to use knives on his victims.", "Michael is famous."], "image": "train2014/COCO_train2014_000000096693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115564, "question_id": "CcmGyijhDQGm7ru69tfmrq", "question": "What vehicles are here?", "choices": ["trucks", "airplanes", "trains", "horses"], "correct_choice_idx": 2, "direct_answers": ["trains", "trains", "train cars", "trains", "tankers", "trains", "cars trucks", "trains", "train", "train"], "difficult_direct_answer": false, "rationales": ["The vehicle is the only one that requires tracks to move.", "The cars are train cars and they are located on train tracks.", "This appears to be a yard for storing a."], "image": "train2014/COCO_train2014_000000115564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493485, "question_id": "CcmTFAR4UXY66qvdhSp4hD", "question": "What is the man with the curly hair holding?", "choices": ["banana", "trunk", "baby", "egg"], "correct_choice_idx": 1, "direct_answers": ["elephant's trunk", "elephant trunk", "trunk", "elephant trunk", "elephant trunk", "elephant nose", "trunk", "carry", "crazy", "elephant trunk"], "difficult_direct_answer": false, "rationales": ["A man is standing next to an elephant with his arm around the trunk as it is hung over his shoulder by the elephant.", "The man has the trunk.", "The man is holding the long nose of an elephant."], "image": "train2014/COCO_train2014_000000493485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148727, "question_id": "Ccp6iGh4dX4edbxVEob6Xn", "question": "What holiday character does the lead motorcyclist dress as?", "choices": ["easter bunny", "santa claus", "elf", "cupid"], "correct_choice_idx": 1, "direct_answers": ["santa", "santa claus", "santa", "santa", "santa claus", "santa claus", "santa claus", "santa claus", "santa claus", "santa"], "difficult_direct_answer": false, "rationales": ["The holiday is for santa.", "The man on the motorcycle in front is dressed up in a red santa claus costume.", "The man in front has a long white beard, and a red and white outfit, and black boots."], "image": "train2014/COCO_train2014_000000148727.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394352, "question_id": "Cd648uQJLSbatLBrgrHXRo", "question": "What are the animals showing to the camera?", "choices": ["tusks", "backside", "antlers", "horns"], "correct_choice_idx": 1, "direct_answers": ["zebras", "zebra", "zebra", "butts", "backside", "tails", "butt", "hindquarters", "zebra", "tails"], "difficult_direct_answer": false, "rationales": ["The zebras are facing away from the camera.", "The rear ends of the animals are completely visible.", "The animals are facing away from the camera, showing the section in"], "image": "val2014/COCO_val2014_000000394352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419363, "question_id": "CdhNqB84mfXipjgkX254zT", "question": "What season is it?", "choices": ["winter", "spring", "summer", "fall"], "correct_choice_idx": 0, "direct_answers": ["christmas", "christmas", "christmas", "winter", "christmas", "winter", "christmas", "winter", "winter", "winter"], "difficult_direct_answer": false, "rationales": ["There is a christmas tree in the background.", "The season is winter.", "There are christmas decorations in the background."], "image": "val2014/COCO_val2014_000000419363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345745, "question_id": "CeLFApTYKx8EPLYxfp5kXX", "question": "What celebrity would be celebrating their birthday on the day that appears on the calendar?", "choices": ["margaret qualley", "jim carrey", "denzel washington", "tori deal"], "correct_choice_idx": 3, "direct_answers": ["jenna fischer", "xero seven", "jenna fischer", "tori deal", "bobby brown", "rachel weiss", "bryan cranston", "bryan cranston", "no idea", "rachel weiss"], "difficult_direct_answer": false, "rationales": ["Her birthday is march 7", "Tori deal's birthday is on march 7.", "March 7th is the birthday of the person named in option a."], "image": "train2014/COCO_train2014_000000345745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1102, "question_id": "Cekuovw7r2tZKZEaVMipwi", "question": "What is the temperature feel like here?", "choices": ["warm", "hot", "freezing", "mild"], "correct_choice_idx": 2, "direct_answers": ["cold", "cold", "cold", "cold", "three degree", "cold", "freezing", "cold", "freezing", "cold"], "difficult_direct_answer": false, "rationales": ["The ground is covered in snow. the person is wearing a coat.", "There is snow on the ground which requires very cold temperatures", "The ground is covered in snow. snow melts when it is hot, warm, or mild."], "image": "train2014/COCO_train2014_000000001102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127196, "question_id": "Ceq3RQhg9DTqzx9YbyGTEp", "question": "What color is the starfish on the left side of the water bowl?", "choices": ["tan", "pink", "black", "white"], "correct_choice_idx": 0, "direct_answers": ["brown", "tan", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The color is tan.", "The colour is tan.", "The starfish is tan."], "image": "train2014/COCO_train2014_000000127196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423777, "question_id": "CfWRFQ3DDUbiYMWgzqayEt", "question": "What is the kite shaped like?", "choices": ["baby", "dragon", "egg", "asteroid"], "correct_choice_idx": 1, "direct_answers": ["dragon", "bird", "bird", "bird", "bird", "bird", "dragon", "bird", "phoenix", "dragon"], "difficult_direct_answer": false, "rationales": ["The kite looks similar to an a animal called dragon.", "He has wings and a head like one as well as trailing feathers", "The kite is like a dragon."], "image": "train2014/COCO_train2014_000000423777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208633, "question_id": "CjMYDdgt2bTbedH4pWpKxa", "question": "What meal is being served here?", "choices": ["breakfast", "desert", "lunch", "dinner"], "correct_choice_idx": 0, "direct_answers": ["dinner", "bread", "breakfast", "lunch", "dish", "pork", "breakfast", "potatoes", "sandwich", "dinner"], "difficult_direct_answer": false, "rationales": ["There are potato wedges which are typically a lunch food.", "It's an open face sandwich so it's probably a midday meal", "Breakfast is being served."], "image": "train2014/COCO_train2014_000000208633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489267, "question_id": "Cm7Tr2jbuK7FDEjLJawsEu", "question": "What is soaring through the air?", "choices": ["bat", "kite", "zeppelin", "airplane"], "correct_choice_idx": 3, "direct_answers": ["plane", "plan", "airplane", "plane", "car", "plane", "plane", "plane", "plane", "plane"], "difficult_direct_answer": false, "rationales": ["There is only one object visible in the air and it is clearly answer a based on the size and shape and its location in the sky.", "A commercial aircraft can be seen in the air above a town.", "A passenger plane is flying through the air."], "image": "train2014/COCO_train2014_000000489267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335784, "question_id": "Cn8QFaXZXKZmuZ3G8bZpTu", "question": "What does the unfocused sign say in the background above the pack of bikers?", "choices": ["dead end", "stop", "one way", "no parking"], "correct_choice_idx": 3, "direct_answers": ["no parking", "travel", "no parking", "bike", "travel", "stop", "no parking", "no parking", "stop", "cannot tell"], "difficult_direct_answer": false, "rationales": ["The first word is almost always what is in a small differently colored square on signs", "The sign says no parking.", "The layout of the sign is similar to the a verbiage and sign type."], "image": "train2014/COCO_train2014_000000335784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158279, "question_id": "CoVDdYouXswRDL7BqPYjJC", "question": "How is the beater being operated?", "choices": ["manual machine", "cordless machine", "electric machine", "by hand"], "correct_choice_idx": 3, "direct_answers": ["by hand", "manually", "hands", "motor", "smashing", "hand", "by hand", "by hand", "manually", "hand"], "difficult_direct_answer": false, "rationales": ["The bananas in the bowl are being beaten by hand with the metal tool.", "The beater is being beaten by hand.", "The beater is by hand."], "image": "val2014/COCO_val2014_000000158279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74517, "question_id": "Cqd5Q7RjmcnvDfWwvg5gTs", "question": "What color is the pillow sitting atop the middle corner of the sectional?", "choices": ["purple", "brown", "red", "pink"], "correct_choice_idx": 1, "direct_answers": ["brown", "red", "brown", "red", "red", "car", "red", "burgundy", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The pillows are brown.", "The pillow laying is brown.", "In the corner, there is a pillow the color of mud."], "image": "train2014/COCO_train2014_000000074517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543407, "question_id": "Cqnww2Fxju5YfigJ9jWEk9", "question": "What country is this most likely located?", "choices": ["venezuela", "brazil", "colombia", "argentina"], "correct_choice_idx": 1, "direct_answers": ["mexico", "country", "usa", "mexico", "china", "china", "spain", "india", "mexico", "brazil"], "difficult_direct_answer": false, "rationales": ["The sign on the road is in spanish.", "There is brazilian language on the sign.", "It is located in brazil because of the license plate and the language on the signs"], "image": "train2014/COCO_train2014_000000543407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265020, "question_id": "CvvYediQKeh2QrDBuFqcsB", "question": "What is the boy holding onto in the middle of the lake?", "choices": ["spear", "fish net", "fishing pole", "gun"], "correct_choice_idx": 2, "direct_answers": ["fishing pole", "fishing pole", "pole", "stick", "fish hook", "fishing rod", "fishing pole", "fishing pole", "fishing rod", "fishing"], "difficult_direct_answer": false, "rationales": ["A boy is standing with a pole in the water waiting to catch a fish.", "He has a long thin stick with a string in the water", "This is obvious in the image and none of the other options apply. it also makes sense for a lake."], "image": "train2014/COCO_train2014_000000265020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59361, "question_id": "Cw4TK8G3GEoBf3wDPsGYN2", "question": "What is the giraffe in the middle resting in?", "choices": ["hay", "grass", "shade", "bath"], "correct_choice_idx": 2, "direct_answers": ["dirt", "shade", "shade", "shade", "dirt", "shade", "shade", "shade", "shade", "shade"], "difficult_direct_answer": false, "rationales": ["He's under the tree", "That area of sand is a darker color.", "There is only one giraffe in the middle and he is in the shade."], "image": "train2014/COCO_train2014_000000059361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130183, "question_id": "Cw5L98YD2apqz3JBXCKFba", "question": "What is the cat doing?", "choices": ["lifting weights", "feeding", "chasing birds", "sleeping"], "correct_choice_idx": 3, "direct_answers": ["sleep", "sleeping", "sleep", "being held", "sleeping", "sleeping", "sleeping", "resting", "sleeping", "sleep"], "difficult_direct_answer": false, "rationales": ["A cat is curled up with eyes closed on top of a computer.", "The cat is curled up and sleeping on the laptop with the computer's owner.", "A cat is sitting on owners lap. he is resting his body while man works on laptop."], "image": "train2014/COCO_train2014_000000130183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31471, "question_id": "CxhhVqV3XkUBqYNMoM8DX4", "question": "How many white cows are present on the showroom floor?", "choices": ["five", "seven", "one", "two"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "zero", "one", "one", "one", "four", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["A group of cows is among people and all are brown except one which is white. cows can be brown or white.", "There is one.", "There is one white cow."], "image": "val2014/COCO_val2014_000000031471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361480, "question_id": "CxkzDZ5kkLSV6jLa53QyDc", "question": "Which country were we probably likely to see these old double decker buses?", "choices": ["germany", "france", "uk", "usa"], "correct_choice_idx": 2, "direct_answers": ["england", "italy", "england", "paris", "uk", "england", "england", "england", "england", "uk"], "difficult_direct_answer": false, "rationales": ["The country is the uk.", "A double decker bus is in the street in a city.", "It's the only country from the list that has a lot of them often used for the public."], "image": "train2014/COCO_train2014_000000361480.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180800, "question_id": "CzHPtHrLSFZWEHJCtXqsCE", "question": "What kinds of foods are these?", "choices": ["grains", "meats", "legumes", "fruits"], "correct_choice_idx": 3, "direct_answers": ["oranges", "oranges", "peaches", "orange", "nectars", "fruit", "fruits", "peaches", "peaches", "aprocts"], "difficult_direct_answer": false, "rationales": ["Oranges are in bins on shelfs.", "These are fruits.", "These are citrus foods."], "image": "val2014/COCO_val2014_000000180800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101552, "question_id": "CzcPxBWcCkAAmozrk7jeT6", "question": "What is the woman eating the donut wearing?", "choices": ["crown", "scarf", "hat", "armor"], "correct_choice_idx": 1, "direct_answers": ["coat", "scarf coat", "scarf", "scarf", "scarf", "scarf", "secure", "scarf", "pink", "doughnut"], "difficult_direct_answer": false, "rationales": ["There is only one woman eating a donut and the most prominent piece of clothing is answer a and none of the other answers are present on the woman.", "The woman eating the donut has a knitted clothing item around her neck. her head is uncovered.", "The woman has a scarf."], "image": "train2014/COCO_train2014_000000101552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82486, "question_id": "CzgnwFuhT2pLS3mtSxyAZx", "question": "What are the people doing?", "choices": ["fishing", "hugging", "eating cake", "running"], "correct_choice_idx": 1, "direct_answers": ["hugging", "hugging", "hugging", "hugging", "hugging", "hugging", "hugging", "hugging", "hugging", "traveling"], "difficult_direct_answer": false, "rationales": ["The other actions aren't taking place in this image.", "This appears to be a reunion based on the suitcases nearby.", "The people have their arms around each other which is how people hug."], "image": "train2014/COCO_train2014_000000082486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244088, "question_id": "D3Zq7wazYUGJbRULPCjASG", "question": "What structure can be seen here?", "choices": ["ramparts", "portcullis", "roof", "drawbridge"], "correct_choice_idx": 2, "direct_answers": ["roof", "buildings", "clock tower", "traingle", "homes", "tower", "clock tower", "houses", "buildings", "clock tower"], "difficult_direct_answer": false, "rationales": ["The roofs are shown.", "These are the tops of buildings", "You can see the top of the buildings which is covered in this."], "image": "train2014/COCO_train2014_000000244088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573132, "question_id": "D4VPy8kaqD6n9ZMkRQjQW6", "question": "What is the brown object on the table used for?", "choices": ["grind flour", "grind spices", "grind coffee", "grind meat"], "correct_choice_idx": 2, "direct_answers": ["butter", "grind food", "grinding", "grinding", "grinding coffee", "storage", "grinding", "coffee grinding", "grind coffee", "table"], "difficult_direct_answer": false, "rationales": ["The object is for coffee.", "A brown box shaped object with a large black handle that moves in a circular motion is on a kitchen table.", "The object has a handle to make the internal system work and an easy access drawer at the bottom. most of the modern day grinders are powered by batteries or electricity."], "image": "train2014/COCO_train2014_000000573132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423782, "question_id": "D5kSDbYY3LEKaV73dxTym2", "question": "What is this type of sofa called?", "choices": ["chaise sectional", "futon", "daybed", "loveseat"], "correct_choice_idx": 0, "direct_answers": ["l shaped", "corner", "lounge", "sectional", "chaise lounge", "sectional", "chaise", "chaise sectional", "sectional", "sectional"], "difficult_direct_answer": false, "rationales": ["The name of the sofa is derived from how the chair is segmented.", "The sofa is a sectional.", "The sofa in the living room is a sectional with a chaise section on the side."], "image": "train2014/COCO_train2014_000000423782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141994, "question_id": "D7E5W6hJimJXvXWC6XmfN6", "question": "Where is this giraffe most likely living?", "choices": ["boat", "zoo", "wild", "conservation"], "correct_choice_idx": 3, "direct_answers": ["preserve", "africa", "zoo", "conservation", "zoo", "zoo", "zoo", "zoo", "forest", "zoo"], "difficult_direct_answer": false, "rationales": ["The giraffe is being conserved.", "There is a path in the background, but there aren't visible enclosures.", "Most giraffes are in this place so statistically, this is the answer. it's not the wild because there is a concrete path in the back ground."], "image": "train2014/COCO_train2014_000000141994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68300, "question_id": "D7FJN87ujW7b3UMq9DMXoA", "question": "What season is depicted on the placemat?", "choices": ["winter", "spring", "summer", "fall"], "correct_choice_idx": 0, "direct_answers": ["winter", "winter", "meal time", "winter", "winter", "winter", "winter", "winter", "winter", "morning"], "difficult_direct_answer": false, "rationales": ["There is snow in the winter.", "The season is winter.", "The placemat shows a house with snow on the ground in front of it."], "image": "val2014/COCO_val2014_000000068300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399378, "question_id": "D8sMJe6cfpHSu5i3zEL9SQ", "question": "What is near the tower?", "choices": ["elephant", "lamppost", "pumpkin", "apple"], "correct_choice_idx": 1, "direct_answers": ["lamp post", "clock", "trees", "lamppost", "clock", "lamp post", "light poles", "clock", "bulldozer", "clock"], "difficult_direct_answer": false, "rationales": ["There is a lamppost in the base of the tower.", "The lamp post is nearby.", "There are no fruits or animals near the tower. there is a metal object that has a bulb."], "image": "train2014/COCO_train2014_000000399378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94052, "question_id": "D9NGfvKqF3DYgrxVy7qzfR", "question": "What did the occupants of this home likely just get done doing?", "choices": ["library visit", "school", "swimming", "shop"], "correct_choice_idx": 3, "direct_answers": ["grocery shopping", "grocery shopping", "shopping", "shop", "grocery shopping", "shopping", "shopping", "grocery shopping", "eating", "shopping"], "difficult_direct_answer": false, "rationales": ["There are grocery bags on the table. they likely just went to the grocery store.", "There are plastic bags all over the counter and table filled with items that have been purchased but not yet put away.", "There are bags of items all over the kitchen"], "image": "val2014/COCO_val2014_000000094052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155317, "question_id": "D9k63tWqfmL9xiBopHHJ4H", "question": "What would this food item be ideal for?", "choices": ["birthday", "sweltering day", "beach picnic", "baseball stadium"], "correct_choice_idx": 0, "direct_answers": ["dessert", "dessert", "birthday", "celebration", "dessert", "birthday", "birthday", "party", "bread", "birthday"], "difficult_direct_answer": false, "rationales": ["The cake is chocolate.", "The cake has chocolate frosting on it.", "The cake is for a birthday."], "image": "val2014/COCO_val2014_000000155317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491290, "question_id": "D9nP9DuXS8qYoA86ABWMVe", "question": "Who was born in the year that is displayed on the truck?", "choices": ["mata hari", "florence pugh", "lucy hale", "george baxter"], "correct_choice_idx": 3, "direct_answers": ["boone", "john deere", "car", "car", "1804", "nathaniel hawthorne", "1804", "hume babington", "george baxter", "old"], "difficult_direct_answer": false, "rationales": ["The number on the truck is 1804.", "I had to look it up on wikipedia. apparently, he's an english artist.", "The number on the truck is 1804. florence pugh was born in 1996, mata hari was born in 1876, and lucy hale was born in 1989."], "image": "train2014/COCO_train2014_000000491290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76657, "question_id": "DA3bBJQu8tabYhN5uNDuCt", "question": "What is the most likely time of day outside?", "choices": ["1100 pm", "200 am", "300 pm", "100 am"], "correct_choice_idx": 2, "direct_answers": ["morning", "evening", "300 pm", "late afternoon", "morning", "daytime", "morning", "noon", "morning", "afternoon"], "difficult_direct_answer": false, "rationales": ["It is most likely the late afternoon.", "The time is 3.", "The time is 3 o'clock."], "image": "val2014/COCO_val2014_000000076657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79236, "question_id": "DBCQPy2ypoyTSVAbfm5sWM", "question": "What causes the brown color on the item in the foreground?", "choices": ["high temperature", "radioactive spill", "mold", "rust"], "correct_choice_idx": 0, "direct_answers": ["baking", "oven burned", "heat", "oven", "naan", "tomatoes", "high temperature", "burnt", "cooked", "oven"], "difficult_direct_answer": true, "rationales": ["The brown color on the pizza is caused by the oven's high temperature.", "A pizza is covered in golden brown areas.", "The brown color is from the high cooking temperature."], "image": "train2014/COCO_train2014_000000079236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213107, "question_id": "DC5UuzDGriN4E58ip9DweF", "question": "What is the child swinging?", "choices": ["kite", "polaroid picture", "bat", "tennis racquet"], "correct_choice_idx": 2, "direct_answers": ["bat", "bat", "baseball bat", "bat", "baseball bat", "baseball bat", "bat", "bat", "bat", "bat"], "difficult_direct_answer": false, "rationales": ["The boy is playing baseball.", "The tool is a smooth club used in the sport of baseball, which the child is playing based on his uniform.", "The child is playing baseball, not tennis. the child is not holding a camera or kite."], "image": "train2014/COCO_train2014_000000213107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22355, "question_id": "DCstHEgRXWD4SoMshC4xVw", "question": "What is the man standing near?", "choices": ["dog", "cat", "bench", "hydrant"], "correct_choice_idx": 3, "direct_answers": ["fire hydrant", "water hydrant", "htdrant", "car", "fire hydrant", "fire hydrant", "hydrant", "fire hydrant", "hydrant", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["The other options aren't in this image and this makes the most sense given that they're often on street corners.", "The men are by hydrants.", "Based on the shape and size of the object and its placement on the sidewalk, answer a is the most logical."], "image": "train2014/COCO_train2014_000000022355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257958, "question_id": "DDCok63WAYaQm5ZGpYtMsE", "question": "How many giraffes are standing?", "choices": ["five", "eight", "three", "seven"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["One giraffe is standing in between two other giraffes.", "There are three giraffes standing in a row by the rocks.", "There are a trio of giraffes standing."], "image": "train2014/COCO_train2014_000000257958.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205338, "question_id": "DF6MTHrHP5LvjV8XdovN4o", "question": "What is on top of the donut?", "choices": ["whipped cream", "cherry", "sprinkles", "gummy bears"], "correct_choice_idx": 2, "direct_answers": ["sprinkles", "sprinkles", "sprinkles", "colors", "sprinkles", "sprinkles", "design", "icing", "sprinkles", "stop"], "difficult_direct_answer": false, "rationales": ["The sprinkles are on top.", "The donut has sprinkles.", "Multi-colored spots are on top of frosting on a donut."], "image": "train2014/COCO_train2014_000000205338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27852, "question_id": "DFizSsQ8SqJDVGyYj4Cmsm", "question": "How many elephants are huddled together on the left side of the hanging tree?", "choices": ["six", "five", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["Two adults and one baby elephant stand together on a treeline path.", "There are three elephants huddled around the bushes.", "There are 3."], "image": "train2014/COCO_train2014_000000027852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502979, "question_id": "DFjaPiaWEn5fU3hgPQJYRj", "question": "What color is the lanyard string worn around the little boy's neck?", "choices": ["yellow", "green", "blue", "black"], "correct_choice_idx": 0, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "car", "yellow", "white", "yellow", "stop", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "The lanyard around the boy's neck is light and bright in color. it is yellow.", "The lanyard is yellow."], "image": "val2014/COCO_val2014_000000502979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195233, "question_id": "DHFgvjzy4VnczvbbFGHKbR", "question": "What does the man have in his mouth while taking a selfie in the mirror?", "choices": ["cigarette", "hairbrush", "phone", "toothbrush"], "correct_choice_idx": 3, "direct_answers": ["toothbrush", "toothbrush", "toothbrush", "brush teeth", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "toothbrush", "cigarette"], "difficult_direct_answer": false, "rationales": ["The man has a toothbrush.", "The man has a toothbrush in his mouth.", "He's holding a long handled item in his mouth"], "image": "train2014/COCO_train2014_000000195233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487286, "question_id": "DHsEpVy4yD3cZuC7zYcKUL", "question": "How many zebras are standing in front of the pack of buffalo?", "choices": ["three", "four", "five", "two"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are 4.", "There are four zebras.", "Four zebras are in sight."], "image": "train2014/COCO_train2014_000000487286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456638, "question_id": "DKutNgaSwLTCoNZKAjgpkr", "question": "The tattooed woman is holding onto what color of railing?", "choices": ["green", "purple", "red", "blue"], "correct_choice_idx": 0, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "sky blue", "green", "dark blue"], "difficult_direct_answer": false, "rationales": ["The woman has a green railing.", "A woman is stepping off of a bus with a dark green handle.", "The woman has a green railing."], "image": "val2014/COCO_val2014_000000456638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526552, "question_id": "DL2aoLaBmuVi2kd7tyGgE8", "question": "What is near the onion?", "choices": ["cantaloupe", "apple slice", "frog", "green pepper"], "correct_choice_idx": 3, "direct_answers": ["tomatoes", "tomatoes", "hand", "tomato", "tomato", "green pepper", "hands", "recipe", "sauce", "salad"], "difficult_direct_answer": false, "rationales": ["Green pepper is by the onion.", "He's by the pepper.", "These vegetables are commonly on pizza together."], "image": "train2014/COCO_train2014_000000526552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518941, "question_id": "DL3b5YsK45diSgJVV4Cmm3", "question": "What color is the only icing element used for the birthday donuts?", "choices": ["brown", "white", "pink", "light brown"], "correct_choice_idx": 0, "direct_answers": ["brown", "car", "yellow", "brown", "chocolate", "stop", "orange", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The donuts are colored brown.", "There are chocolate doughnuts on the tray.", "The icing is made out of chocolate."], "image": "train2014/COCO_train2014_000000518941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205392, "question_id": "DMCGsPPnBj9go9GQitxxJs", "question": "What number is at the top of the bus?", "choices": ["96", "84", "77", "35"], "correct_choice_idx": 3, "direct_answers": ["thirty-five", "35", "35", "thirty five", "35", "35", "35", "35", "35", "thirty five"], "difficult_direct_answer": false, "rationales": ["A bus is in the intersection with a digital readout on the top.", "It's in the center of the top", "The number is 35."], "image": "train2014/COCO_train2014_000000205392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130096, "question_id": "DMNLXsxp3FKywBpA2borBc", "question": "What color is the hat worn by the man who is leading a cow by a rope?", "choices": ["blue", "green", "brown", "black"], "correct_choice_idx": 2, "direct_answers": ["brown", "black", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "black"], "difficult_direct_answer": false, "rationales": ["It is brown.", "The man standing next to the cow in the grass is wearing a wide brim brown hat.", "The hat is brown."], "image": "train2014/COCO_train2014_000000130096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485945, "question_id": "DMatj99BLDPuFKEbhGJVnT", "question": "What feature does the animal have?", "choices": ["long neck", "beak", "quills", "tusks"], "correct_choice_idx": 1, "direct_answers": ["wings", "red beak", "feathers", "wings", "wings", "beak", "wings", "wings", "beak", "wings"], "difficult_direct_answer": false, "rationales": ["This animal is a bird, not a porcupine, giraffe, or elephant.", "Birds have beaks.", "The beaks are orange."], "image": "train2014/COCO_train2014_000000485945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124102, "question_id": "DQJ8n7m32zrVS5UQcpu9y3", "question": "Where would this type of traffic light be found?", "choices": ["highway", "country road", "crosswalk", "near lake"], "correct_choice_idx": 2, "direct_answers": ["city", "crosswalk", "roundabout", "london", "street", "crosswalk", "street", "pedestrian walkway", "car", "street"], "difficult_direct_answer": false, "rationales": ["The light would be at a crosswalk.", "It is a light for people to walk or stop", "This light is used to give pedestrians safe crossing."], "image": "val2014/COCO_val2014_000000124102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295599, "question_id": "DQqJ2fs9JtB7SuETa64hLL", "question": "What is furthest to the right?", "choices": ["flower", "elephant", "cat", "dog"], "correct_choice_idx": 0, "direct_answers": ["light switch", "vase", "flower vase", "flower", "flower vase", "flower", "rose", "rose", "vase", "vase"], "difficult_direct_answer": false, "rationales": ["The object furthest to the right is a tall blue vase with a red flower in it.", "The flower is to the right.", "There is a rose in a vase."], "image": "val2014/COCO_val2014_000000295599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183972, "question_id": "DRzSMjqYMyij3upiFC3mU7", "question": "What is something here that's rarely seen in a kitchen?", "choices": ["wok", "mixer", "tv", "ironing board"], "correct_choice_idx": 3, "direct_answers": ["ironing board", "iron board", "ironing board", "ironing board", "iron", "box", "ironing board", "iron box", "ironing board", "iron"], "difficult_direct_answer": false, "rationales": ["A board with an iron on it extends from a kitchen counter.", "A kitchen counter has a board and iron attachment on the cupboards.", "People don't usually do their laundry in the kitchen."], "image": "train2014/COCO_train2014_000000183972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95583, "question_id": "DSNseb6fgnUPhXaR8YWgrH", "question": "What is the person wearing?", "choices": ["bandana", "backpack", "watch", "crown"], "correct_choice_idx": 2, "direct_answers": ["shoes", "white shoes", "trouser", "jeans", "shoes", "jeans", "jeans", "dress", "watch", "shirt pants"], "difficult_direct_answer": false, "rationales": ["The person's wrist has a time telling device.", "The person has a watch.", "There is a black wrist band with a round dial on his left arm."], "image": "val2014/COCO_val2014_000000095583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464030, "question_id": "DTHipPj9EQ8n4gSqKANK4v", "question": "What is the job of the woman in uniform against the wall?", "choices": ["grab towels", "collect ball", "serve ball", "referee"], "correct_choice_idx": 1, "direct_answers": ["line judge", "umpire", "collect ball", "ball girl", "referee", "referee", "linesman", "ball catcher/runner", "ref", "judge"], "difficult_direct_answer": true, "rationales": ["The person has to collect the ball after the game is over.", "The woman against the wall here has a job to collect balls.", "She is a ball girl."], "image": "train2014/COCO_train2014_000000464030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310440, "question_id": "DTLj4spgXeVXzkMGMybeAS", "question": "What is the name of the material that people get from elephant horns?", "choices": ["powder", "ivory", "knives", "steel"], "correct_choice_idx": 1, "direct_answers": ["ivory", "ivory", "ivory", "ring", "ivory", "ivory", "ivory", "ivory", "ivory", "ivory"], "difficult_direct_answer": false, "rationales": ["It is what poachers kill the elephants for and to make jewelry with.", "Elephants are poached for the material derived from their tusks known as ivory.", "The item is a hard white material that is prized for its durability for carving."], "image": "train2014/COCO_train2014_000000310440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363845, "question_id": "DTSukSuU3DZRgsfHaqxYHc", "question": "What food is on the boat?", "choices": ["banana", "apple", "orange", "eggplant"], "correct_choice_idx": 0, "direct_answers": ["bananas", "banana", "bananas", "bananas", "banana", "banana", "bananas", "sweet banana", "banana", "bananas"], "difficult_direct_answer": false, "rationales": ["You can tell by the color and shape of the fruit as to what is in the boat.", "The food is a banana.", "There are yellow fruits on the boat."], "image": "val2014/COCO_val2014_000000363845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40842, "question_id": "DToryCK8TQcwTPUZtn2bsQ", "question": "What is the cat on top of?", "choices": ["baby", "shelf", "dog", "monkey"], "correct_choice_idx": 1, "direct_answers": ["table", "desk", "monitor desk", "laptop", "table", "shelf", "desk", "table", "desk", "monitor"], "difficult_direct_answer": false, "rationales": ["The cat is on a horizontal board above the computer.", "There is a computer.", "The cat is on a shelf above the monitor."], "image": "val2014/COCO_val2014_000000040842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243857, "question_id": "DTx3S8WLth9f5A2cvPoKZW", "question": "Why does the stove have two doors?", "choices": ["looks", "double oven", "broken", "microwave"], "correct_choice_idx": 1, "direct_answers": ["venting", "double-oen", "two stoves", "double oven", "double-oen", "dual baking", "open", "two ovens", "for broiling", "double ovens"], "difficult_direct_answer": true, "rationales": ["A stove in a kitchen has two compartments, one on top of the other.", "The double oven makes the stove to have two doors.", "The there are multiple ovens built into the same unit."], "image": "val2014/COCO_val2014_000000243857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453756, "question_id": "DUpfFL2L2J4gmi3geATc5f", "question": "How many blue milk bottles are there next to the black and white photograph?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "five", "four", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are two bottles in front and one behind the other two.", "There are a trio of milk bottles on the shelf.", "There are three bottles."], "image": "val2014/COCO_val2014_000000453756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218941, "question_id": "DV2CVyWjuU8MA6ESt6JPLe", "question": "Why does she have a cake just for her?", "choices": ["siblings birthday", "1st birthday", "moms birthday", "2nd birthday"], "correct_choice_idx": 1, "direct_answers": ["birthday", "birthday", "1st birthday", "birthday smash", "birthday", "birthday", "birthday", "food", "birthday", "birthday"], "difficult_direct_answer": false, "rationales": ["A baby is sitting in a high chair with a professionally decorated cake that is individually sized.", "The baby is in a high chair, so she's probably eating her first birthday cake and looking all messy.", "It is a smash cake."], "image": "train2014/COCO_train2014_000000218941.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94317, "question_id": "DW32R7BCMPYSMqo2WySPJP", "question": "What kind of beverage is served from the cups at the top of this cabinet?", "choices": ["tea", "coffee", "purple drink", "energy drink"], "correct_choice_idx": 0, "direct_answers": ["tea", "tea", "tea", "tea", "tea", "tea", "tea", "tea", "tea", "tea"], "difficult_direct_answer": false, "rationales": ["Tea is being served.", "The drink is sometimes tea.", "The cups are teacups."], "image": "val2014/COCO_val2014_000000094317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343167, "question_id": "DW6YCvCnEQDDgs59ka2ZDm", "question": "What is the man wearing?", "choices": ["gas mask", "top hat", "tie", "sunglasses"], "correct_choice_idx": 2, "direct_answers": ["suit", "tie", "suit", "suit jacket", "suit", "jacket", "suit", "suit vest", "coat", "stop"], "difficult_direct_answer": false, "rationales": ["He has a tie.", "The man has a tie.", "The man is clearly wearing answer a and is not wearing any of the other answers."], "image": "train2014/COCO_train2014_000000343167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149022, "question_id": "DWUmXG25yi5nrbFwQkvUU8", "question": "What is the donut on the left dipped in?", "choices": ["peanut butter", "walnut sauce", "ranch dressing", "chocolate"], "correct_choice_idx": 3, "direct_answers": ["chocolate", "chocolate", "donate", "chocolate", "chocolate", "chocolate", "chocolate", "chocolate", "choco cream", "chocolate"], "difficult_direct_answer": false, "rationales": ["Donuts are often glazed with this dark liquid.", "It's an eclair and they are always chocolate dipped on one side.", "There is a chocolate glazed donut."], "image": "val2014/COCO_val2014_000000149022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488349, "question_id": "DXdu4ZKf9F4CE6ttww7BVT", "question": "What is the name of the tool that is used to dry dishes without electricity in this room?", "choices": ["sink", "light", "dishwasher", "drying rack"], "correct_choice_idx": 3, "direct_answers": ["towel", "washer", "towel", "drying rack", "rack", "steel", "sink", "dish rack", "rack", "drying rack"], "difficult_direct_answer": false, "rationales": ["A white rack is to the left of a sink. it has a dish and cups in it that help let water run off of them and dries naturally.", "This lets water drip and evaporate as it sits out", "The dishwasher and light use electricity. the sink makes the dishes wet."], "image": "val2014/COCO_val2014_000000488349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403916, "question_id": "DZwTCHSanAKqSB3jmnrNhz", "question": "What feature do the animals have?", "choices": ["spots", "gills", "wings", "talons"], "correct_choice_idx": 0, "direct_answers": ["spots", "long necks", "spots", "long necks", "long neck", "spots", "tall", "neck", "spots", "long necks"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene and entirely normal for giraffes.", "Giraffes are grazing in an open area.", "The animals are giraffes, not fish or birds."], "image": "train2014/COCO_train2014_000000403916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392793, "question_id": "DadeoFLH26dMnQVykUrv2f", "question": "What is the secondary color for the vest worn to the woman on the left side driving horse?", "choices": ["purple", "blue", "red", "black"], "correct_choice_idx": 1, "direct_answers": ["blue", "yellow", "white", "blue", "blue", "pink", "blue", "blue", "blue", "pink"], "difficult_direct_answer": false, "rationales": ["The woman is identifiable based on the location description in the question. the primary color of the vest is visible as well as the secondary color.", "If white is primary, then a would be secondary.", "The color is blue."], "image": "train2014/COCO_train2014_000000392793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538775, "question_id": "Davcb9ENyRc5pgaMh9jrpe", "question": "What decorative element is at the center of the clock face?", "choices": ["circle", "square", "moon", "sun"], "correct_choice_idx": 3, "direct_answers": ["coconut", "sun", "star", "star", "metal", "brass", "nautical star", "star", "crystal", "bell"], "difficult_direct_answer": false, "rationales": ["The element is the sun.", "There are rays extending froma circle that is in the middle of a clock face.", "Suns are often in the centre of clocks."], "image": "val2014/COCO_val2014_000000538775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326911, "question_id": "Db4jvHy4RLFm4zC6C2JVen", "question": "What color is the small dog in front of the bicycle tire with its tail raised up?", "choices": ["tawny", "black", "white", "brown"], "correct_choice_idx": 1, "direct_answers": ["black", "black", "black", "black", "tan", "black", "black", "brown white", "black", "black"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene.", "There is a black dog in front of the bicycle.", "The dog in front of the bicycle wheel here pictured is harder to spot due to it's black coat and being in a low light situation."], "image": "val2014/COCO_val2014_000000326911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502409, "question_id": "DcfH8SG87BYGeUA6s7cU32", "question": "What color is the woman's skirt who is number 851 in this ski race?", "choices": ["blue", "orange", "red", "pink"], "correct_choice_idx": 3, "direct_answers": ["pink", "pink", "blue", "pink", "pink", "black", "pink", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["The color is pink.", "She's wearing a tutu in a traditional girl color", "Her skirt is not blue, red, or orange."], "image": "train2014/COCO_train2014_000000502409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381214, "question_id": "DdqyHTxYFMW77quh4AHXxd", "question": "What color are the paddles on the wheels behind this river boat?", "choices": ["white", "black", "gray", "red"], "correct_choice_idx": 3, "direct_answers": ["brown", "red", "black", "red", "red", "white", "stop", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "A large white boat is in the water with large red structures to move water on one end.", "The paddles are not black, white, or gray."], "image": "train2014/COCO_train2014_000000381214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61452, "question_id": "DePVoariU9irZWoexg7B6g", "question": "What color are the bearings inside of the wagon wheels?", "choices": ["white", "purple", "blue", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "red", "man", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The bearings are not blue, white, or purple.", "The bearings are the color of blood."], "image": "train2014/COCO_train2014_000000061452.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569535, "question_id": "DeVEaPSRK4USeUCARNH3eQ", "question": "What TV show title shares one of the words on the wall?", "choices": ["wmac masters", "baywatch", "vr troopers", "jeopardy"], "correct_choice_idx": 0, "direct_answers": ["master cook", "master chef", "parabas", "playing", "wmac masters", "paribas masters", "masters", "master", "bsll", "cook masters"], "difficult_direct_answer": true, "rationales": ["That is the show on the wall.", "The wall describes a tennis tournament that contains a word that matches the second word on the tv show mentioned in option a.", "It's obviously the same name as in a."], "image": "val2014/COCO_val2014_000000569535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223030, "question_id": "DepW63bHswXkkUoG3jscqx", "question": "What is in the sandwich?", "choices": ["steak", "tomato", "egg", "pork chop"], "correct_choice_idx": 1, "direct_answers": ["tomato", "tomatoes", "tomato pesto", "tomato cheese", "tomato", "tomato lettuce", "tomato", "tomatoes", "tomato mozzarella", "tomato"], "difficult_direct_answer": false, "rationales": ["You see red in the sandwich.", "There are tomatoes.", "This vegetable is bright red and often on sandwiches."], "image": "train2014/COCO_train2014_000000223030.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238915, "question_id": "DfhSPFC7stCkoQssT7GrnC", "question": "What color is the garden hose wrapped around the kiddie pool?", "choices": ["turquoise", "purple", "red", "green"], "correct_choice_idx": 0, "direct_answers": ["green", "green", "green", "kiddie pool", "turquoise", "green", "blue", "green", "green", "blue"], "difficult_direct_answer": false, "rationales": ["The garden hose is turquoise colored.", "The color is turquoise.", "The color is turquoise."], "image": "train2014/COCO_train2014_000000238915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401528, "question_id": "DgEpAcZ3diRJzpnSfZeuNy", "question": "Where can you find this display?", "choices": ["library", "school", "museum", "church"], "correct_choice_idx": 2, "direct_answers": ["museum", "inside museum", "museum", "store", "museum", "museum", "museum", "museum", "museum", "store"], "difficult_direct_answer": false, "rationales": ["You would often see a display of vases in a museum.", "The display is at a museum.", "The vases are on display in a museum."], "image": "train2014/COCO_train2014_000000401528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496826, "question_id": "DgvBZgF4MEHAHFMxuxh2xZ", "question": "What is next to the plate?", "choices": ["cow", "baby", "apple", "mug"], "correct_choice_idx": 3, "direct_answers": ["mug", "cup", "cup", "cup", "mug", "sandwich", "mug", "cup", "mug", "cup"], "difficult_direct_answer": false, "rationales": ["The object next to the plate is made with a material that is consistent with answer a and would be served with a meal as seen in the image. it has a design, shape, size and handle also consistent.", "This is a beverage cup", "The item is a cup with a handle made of ceramic."], "image": "train2014/COCO_train2014_000000496826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398142, "question_id": "DhHpU5igqy9Ai5L9DVVvwX", "question": "What actress was born in this country?", "choices": ["jennifer connelly", "adele haenel", "margaret qualley", "brooke shields"], "correct_choice_idx": 1, "direct_answers": ["juliette binoche", "catherine deneuve", "brigitte bardot", "godard", "juliette binoche", "milly cyrus", "adele haenel", "depardieu", "audrey tatoo", "no clue"], "difficult_direct_answer": true, "rationales": ["The actress is adele.", "She is french.", "A building with red, white, and blue striped flags can be seen."], "image": "train2014/COCO_train2014_000000398142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225191, "question_id": "DhjKEnvuYAWKoMW7J2pJSD", "question": "What is under the cabinet on the left?", "choices": ["baby", "sink", "cat", "dog"], "correct_choice_idx": 1, "direct_answers": ["sink", "tile", "cookware", "countertop", "oven", "oven", "floor", "sink", "floor", "sink"], "difficult_direct_answer": false, "rationales": ["The faucet is visible.", "A kitchen counter and cabinets can be seen with a silver basin. sinks are commonly found in kitchens.", "The sink is under."], "image": "train2014/COCO_train2014_000000225191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492166, "question_id": "Di5Esh5W6MYJkVWK7Yfug2", "question": "What number is on the man in the green shirt's jersey?", "choices": ["118", "415", "223", "956"], "correct_choice_idx": 0, "direct_answers": ["118", "118", "118", "118", "118", "one eighteen", "one-hundred eighteen", "118", "118", "118"], "difficult_direct_answer": false, "rationales": ["The number is visible and clear on his jersey.", "There is an older man skiing down a hill with 118 and another person in blue behind him.", "The numbers are in black"], "image": "train2014/COCO_train2014_000000492166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 405316, "question_id": "DiAJC6Ktmgtj2F9cQ3TU5T", "question": "What is the person on the motorcycle wearing?", "choices": ["crown", "scarf", "boots", "tiara"], "correct_choice_idx": 2, "direct_answers": ["polish", "uniform", "helmet", "helmet", "police uniform", "helmet", "boots", "helmet", "watching", "police uniform"], "difficult_direct_answer": false, "rationales": ["The person has boots.", "These are leather and go up to the knee", "A police officer is riding a motorcycle in black, leather footwear that are high topped."], "image": "train2014/COCO_train2014_000000405316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514283, "question_id": "DiQHwWB2pFwiHpbctJzSSN", "question": "What fruit is the black topping on this pizza between the two customers?", "choices": ["pineapple", "olive", "tomato", "pepperoni"], "correct_choice_idx": 1, "direct_answers": ["blueberry", "grapes", "black olives", "olives", "olive", "pizza", "olives", "olive", "olives", "olives"], "difficult_direct_answer": false, "rationales": ["There are two people smiling while they sit at a table. there is a pizza between them with pineapples on them.", "The fruit is an olive.", "It's a tiny black savory fruit"], "image": "train2014/COCO_train2014_000000514283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533677, "question_id": "Dj5UhjTKARWapyqBdHr4Gu", "question": "How are the people traveling?", "choices": ["by car", "by boat", "by train", "by airplane"], "correct_choice_idx": 1, "direct_answers": ["boat", "boats", "boat", "by boat", "boat", "boat", "boat", "by boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["The only roadway is water and only vehicles seen are watercraft.", "There is a body of water. the vehicles are floating on the water.", "This is how people travel by water."], "image": "train2014/COCO_train2014_000000533677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269502, "question_id": "DkB2LvYPapUSedVFZzspVA", "question": "What feature of the animal is visible?", "choices": ["wing", "gill", "udder", "stinger"], "correct_choice_idx": 2, "direct_answers": ["cow", "utters", "udders", "head", "udder", "legs", "legs", "udder", "cow", "head"], "difficult_direct_answer": false, "rationales": ["The animal is a cow, not a fish, bird, or bee.", "The feature is the udder.", "You can see long nipples under the cow."], "image": "train2014/COCO_train2014_000000269502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315399, "question_id": "Dkuf4xqMLvwW4SNBdZTzxb", "question": "What time of day is it likely to be?", "choices": ["midday", "evening", "night", "morning"], "correct_choice_idx": 1, "direct_answers": ["night", "evening", "night", "night", "night", "night", "night", "night", "evening", "evening"], "difficult_direct_answer": false, "rationales": ["It's nighttime as it's dark.", "The sky outside is dark.", "The time is evening."], "image": "train2014/COCO_train2014_000000315399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 449945, "question_id": "DmLgi5kPtMFMPtvQpVnZzE", "question": "What type of sinks are these?", "choices": ["top mount", "under mount", "bowl sinks", "drop sinks"], "correct_choice_idx": 2, "direct_answers": ["raised sinks", "bowl", "white", "bathroom", "bowl", "bowl", "round", "basin", "bathroom sinks", "bowl sinks"], "difficult_direct_answer": false, "rationales": ["This is obvious given their shape.", "These sinks sit on top of the counter and shaped liked bowls which is where the name comes from.", "This is obvious given the shape of the sinks."], "image": "val2014/COCO_val2014_000000449945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158686, "question_id": "Dmb8feScyFubXHPRU93Emb", "question": "What kind of food is this?", "choices": ["unhealthy", "mexican", "chinese", "healthy"], "correct_choice_idx": 3, "direct_answers": ["vegetables", "healthy", "vegetables", "vegetables", "vegetables", "vegetables", "vegetables", "vegetables", "vegetables", "vegetable"], "difficult_direct_answer": false, "rationales": ["These are nutritious vegetables.", "The other options don't match this produce grouping.", "Vegetables are piled on top of a cutting board."], "image": "train2014/COCO_train2014_000000158686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29320, "question_id": "DqjtbZCNCMFLUg4w9mni6G", "question": "What time of day is most likely?", "choices": ["late night", "morning", "midday", "afternoon"], "correct_choice_idx": 3, "direct_answers": ["morning", "afternoon", "cofee", "8pm", "nighttime", "afternoon", "night", "morning", "night", "breakfast"], "difficult_direct_answer": false, "rationales": ["You sometimes eat tea and cake in later in the day.", "Really isn't enough information to figure time of day. but there is coffee.", "The food is likely served in the middle of the day."], "image": "train2014/COCO_train2014_000000029320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391018, "question_id": "DtxQyKmBmvFCxCCfakDou4", "question": "What meal would these animals prefer?", "choices": ["salad", "fish cakes", "venison", "rabbit stew"], "correct_choice_idx": 0, "direct_answers": ["grass", "grass", "grass", "grass", "grass", "salad", "grass", "grass", "grass", "grass"], "difficult_direct_answer": false, "rationales": ["The meal is a salad.", "These animals are sheep which graze off of greenery. salad is the closest food item to grass of those listed here.", "Sheep are herbivores and consume plants such as as green plants."], "image": "train2014/COCO_train2014_000000391018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296424, "question_id": "DusT3HVeJugboxhrf9BvJU", "question": "What had broken off during this part of the game?", "choices": ["glove", "bat", "helmet", "hand"], "correct_choice_idx": 1, "direct_answers": ["bat", "baseball bat", "bat", "bat", "bat", "bat", "bat", "braves' charlie", "baseball bat", "bat"], "difficult_direct_answer": false, "rationales": ["The bat had broken off of the handle.", "The top of the it went flying and the batter still has the handle", "The handle is still in the batters hand while the rest is flying away"], "image": "train2014/COCO_train2014_000000296424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59201, "question_id": "Dv6esz44zTARtwgKePGxQS", "question": "What shape is the food in?", "choices": ["square", "triangle", "circle", "hexagon"], "correct_choice_idx": 1, "direct_answers": ["triangle", "triangle", "triangle", "triangle", "triangle", "triangular slice", "triangle", "triangle", "triangle", "triangle"], "difficult_direct_answer": false, "rationales": ["The food shape is clearly visible and has three intersecting sides as does answer a.", "The pizza is cut in a triangle.", "The shape of the pizza is a triangle."], "image": "val2014/COCO_val2014_000000059201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315202, "question_id": "DvgwssmpcQFvJB6LvuGYK3", "question": "What color is the duct around the middle of this church's top?", "choices": ["red", "gray", "yellow", "blue"], "correct_choice_idx": 2, "direct_answers": ["ivory", "yellow", "yellow", "yellow", "black", "yellow", "grey", "yellow", "grey", "white"], "difficult_direct_answer": false, "rationales": ["It's the only color in the image. it's hard to see if that's a duct or not.", "The color is yellow.", "There is a yellow decoration in the middle of the top of the church."], "image": "train2014/COCO_train2014_000000315202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278393, "question_id": "DvxPA42WRzNrfCQwo54Hr9", "question": "What kind of fence is in front of the ostrich for purpose of confinement?", "choices": ["wire", "link", "electric", "wood"], "correct_choice_idx": 2, "direct_answers": ["electric", "electric", "barbwire", "metal", "wire", "metal", "electrified fence", "lining", "wire", "electric"], "difficult_direct_answer": false, "rationales": ["This fence is safe and helps keep the animals secure.", "The fence is electric.", "The fence is electric."], "image": "train2014/COCO_train2014_000000278393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379077, "question_id": "DwL5yKjgTBtEuherpxNBGA", "question": "What is the dog playing in?", "choices": ["water", "sand", "mud", "snow"], "correct_choice_idx": 3, "direct_answers": ["snow", "snow", "snow", "frisbee", "snow", "snow", "snow", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The dog is in snow.", "The dog is playing in snow.", "A dog is standing on the ground which is white and there is white on his face and his toys."], "image": "val2014/COCO_val2014_000000379077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82729, "question_id": "DwaNHHYSwSjmQ3GhJvUPRi", "question": "What is the person cooking on the stove?", "choices": ["artichokes", "corn", "green peppers", "asparagus"], "correct_choice_idx": 2, "direct_answers": ["peppers", "peppers", "green peppers", "stuffed peppers", "green peppers", "peppers", "stuffed peppers", "man", "green peppers", "bell peppers"], "difficult_direct_answer": false, "rationales": ["The person is preparing bell shaped vegetables that are the color of grass.", "They are the characteristic color and shape. they are cut in a manner where the top is cut off first revealing seeds.", "The items being cooked are bell shaped peppers that are the color of grass."], "image": "train2014/COCO_train2014_000000082729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335766, "question_id": "DyDUm7WbgqkUdqxdfmvsAA", "question": "What is under the faucet?", "choices": ["dog", "cat", "cabinet", "boxes"], "correct_choice_idx": 2, "direct_answers": ["sink", "glass", "sink", "sink", "sink", "vessel sink", "cabinet", "glass sink", "cupboard", "bowl"], "difficult_direct_answer": false, "rationales": ["The faucet has a cabinet.", "There is a cabinet in the bathroom.", "There are dark wood boxes that have handles on front. they hold many different household objects in them."], "image": "train2014/COCO_train2014_000000335766.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231714, "question_id": "DziAB5EkxmMnVj4iAXjnjV", "question": "What color is the soft drink drank by the man at the pizza store?", "choices": ["clear", "brown", "blue", "white"], "correct_choice_idx": 1, "direct_answers": ["ligybrown", "brown", "brown", "black", "brown", "black", "brown", "brown", "dark brown", "brown"], "difficult_direct_answer": false, "rationales": ["The fluid in both glasses visible in this image is brown, though it appears nearly black in low light.", "This is obvious in the scene.", "Only the water is clear in the foreground behind another drink, and the other options don't match."], "image": "train2014/COCO_train2014_000000231714.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5724, "question_id": "E2Ccz9EohLUUJEQBPgQGJL", "question": "How have these vegetables been cooked?", "choices": ["roasted", "fried", "baked", "boiled"], "correct_choice_idx": 0, "direct_answers": ["grilled", "fried", "baked", "roasted", "yes", "steam", "oven", "roasted", "grilled", "baked"], "difficult_direct_answer": false, "rationales": ["The meal looks like a potroast with veggies cooked in it.", "The charring on the vegetables indicates they were cooked in the manner described in option a.", "They were roasted."], "image": "val2014/COCO_val2014_000000005724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43340, "question_id": "E2QmxEngcj56nGVcCuvpvE", "question": "What is the odd placement in this room?", "choices": ["shower", "toilet", "sink", "tub"], "correct_choice_idx": 2, "direct_answers": ["bath", "box", "toilet", "plant", "sink", "red container", "red container", "cactus", "plant", "sink cabinet"], "difficult_direct_answer": false, "rationales": ["The tub is small and in a wird place.", "The sink is oddly placed.", "The placement is the sink."], "image": "train2014/COCO_train2014_000000043340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120188, "question_id": "E2cKDwQAcaSssGgSigqsAc", "question": "What color are the logos on the shoes which this tennis playing woman is wearing?", "choices": ["pink", "red", "blue", "black"], "correct_choice_idx": 3, "direct_answers": ["red", "black", "black", "black", "blue", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The logos on the side of the shoes is black.", "The shoes are black and white.", "These are nike shoes and the logo is that color"], "image": "train2014/COCO_train2014_000000120188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462341, "question_id": "E2yDhxCY4Lbd8fYTyhYPZo", "question": "How many towers are on the top of the clock tower with a black clock face?", "choices": ["four", "two", "three", "five"], "correct_choice_idx": 3, "direct_answers": ["five", "four", "stop", "three", "five", "four", "five", "many", "two", "car"], "difficult_direct_answer": false, "rationales": ["There are 5.", "There is one large structure and the four other structures are symmetrical around the corners which adds up to five.", "There are five towers."], "image": "val2014/COCO_val2014_000000462341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201, "question_id": "E4ogUWtht8PcZQTMn2c7aH", "question": "What color is the snowboard's back on the far left?", "choices": ["blue", "pink", "black", "green"], "correct_choice_idx": 3, "direct_answers": ["greenish yellow", "green", "green", "green", "green", "black", "green", "green yellow", "green", "green"], "difficult_direct_answer": false, "rationales": ["The snowboard that is farthest on the back of the rack is bright green.", "The color is green.", "That snowboard's back is not pink, blue, or black."], "image": "train2014/COCO_train2014_000000000201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155465, "question_id": "E5hgvcHJpUqHkAbCzDfQgT", "question": "What country are they in?", "choices": ["france", "portugal", "england", "spain"], "correct_choice_idx": 0, "direct_answers": ["france", "france", "italy", "united states", "france", "francisco", "italy", "france", "france", "italy"], "difficult_direct_answer": false, "rationales": ["She is eating escargot which is a dish they serve there", "A woman is sitting at a restaurant with a plate in front of her with escargot on it. escargot is a french dish.", "Everything is written in the language native to the country, and the dish the woman is eating is a local delicacy."], "image": "val2014/COCO_val2014_000000155465.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255975, "question_id": "E6AZbdJ6D4kcJ5NMNtdjHr", "question": "What number on the license plate is the largest?", "choices": ["seven", "eight", "six", "four"], "correct_choice_idx": 0, "direct_answers": ["seven", "37", "seven", "seven", "seven", "seven", "seven", "seven", "five", "five"], "difficult_direct_answer": false, "rationales": ["The number is the largest as it comes before the rest of the given numbers in the numeric order.", "The number is 7.", "The number is seven."], "image": "train2014/COCO_train2014_000000255975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253975, "question_id": "E6VkooTMykZpHpStx3uDpM", "question": "What kind of power does the pink bicycle run on?", "choices": ["coal", "electricity", "man power", "gas"], "correct_choice_idx": 2, "direct_answers": ["car", "man", "man power", "man", "air", "watching", "feet", "sea", "man power", "manual"], "difficult_direct_answer": false, "rationales": ["The power is manpower.", "The pink bicycle at the beach does not have a motor and runs only on man power.", "The bicycle uses pedals to make it move."], "image": "train2014/COCO_train2014_000000253975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68403, "question_id": "E7p4f8BReqDJ2Tnk9FPP4Q", "question": "What kind of beverage is sat atop of the computer tower in the corner of this room?", "choices": ["water", "beer", "juice", "wine"], "correct_choice_idx": 1, "direct_answers": ["beer", "beer", "beer", "wine", "cpu", "beer", "beer", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["The beverage is beer.", "There is a clear alcohol bottle behind the laptop.", "There is a bottle and the liquid is yellow."], "image": "train2014/COCO_train2014_000000068403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293121, "question_id": "E87nvNwCggETH7cyt34a83", "question": "What are the giraffes under?", "choices": ["canopy", "airplane", "balloon", "bed"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "tent", "canopy", "sun protected", "tent", "covering", "awning", "canopy", "shade", "umbrella"], "difficult_direct_answer": false, "rationales": ["The item is a large cover made of fabric or similar material.", "Giraffes are standing under a large umbrella like structure to get out of the sun.", "The object protects them from the sun and gives them shade."], "image": "train2014/COCO_train2014_000000293121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55402, "question_id": "E8oQ3pD7ENVjyZG7h4MRDQ", "question": "What is stretched out?", "choices": ["giraffe neck", "string", "ladder", "rubber band"], "correct_choice_idx": 0, "direct_answers": ["neck", "griffie", "neck", "neck", "giraffe", "giraffes neck", "neck", "giraffe", "neck", "giraffe neck"], "difficult_direct_answer": false, "rationales": ["The giraffe has its head held high and stretching its neck.", "The giraffe is standing on the grass with its long neck stretched out far and high.", "In the image the only thing that is visibly extended or stretched appears to be answer a. all other objects appear in their natural position."], "image": "train2014/COCO_train2014_000000055402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 356660, "question_id": "E8w3669y7Pu9YWNvHjo4hZ", "question": "What is the person reaching for?", "choices": ["baby", "dog", "tennis ball", "cat"], "correct_choice_idx": 2, "direct_answers": ["ball", "tennis ball", "tennis ball", "tennis ball", "tennis ball", "ball", "ball", "ball", "reaching", "ball"], "difficult_direct_answer": false, "rationales": ["The person on the tennis court is reaching with their racquet to hit the tennis ball.", "The player is holding a tennis racket about to hit a tennis ball.", "The person wants the ball."], "image": "val2014/COCO_val2014_000000356660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209890, "question_id": "E9bwf7BwtVcoMcgdUR4muA", "question": "How many persons are floating on top of surfboards in the ocean?", "choices": ["three", "four", "one", "two"], "correct_choice_idx": 0, "direct_answers": ["two", "three", "two", "three", "two", "three", "three", "three", "one", "three"], "difficult_direct_answer": false, "rationales": ["There are three people on the ocean.", "Two guys on surfboards are surfing next to another who is laying on his surfboard.", "There are 3."], "image": "train2014/COCO_train2014_000000209890.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533079, "question_id": "E9v7pq8JdiRCaHfKQFyxBv", "question": "What is the child holding?", "choices": ["football", "baseball", "fruit", "basketball"], "correct_choice_idx": 2, "direct_answers": ["apples", "apples", "fruit", "fruit", "apples", "apples", "apples", "apple", "apples", "apple"], "difficult_direct_answer": false, "rationales": ["The child is holding an apple.", "A boy has one red apple and one green in hand.", "The kid has fruit."], "image": "train2014/COCO_train2014_000000533079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390539, "question_id": "EAmq3du7uxL5i9YvJXv3Kn", "question": "What part of the cops uniform has the most similar theme to the motorcycle?", "choices": ["shirt", "gloves", "belt", "boots"], "correct_choice_idx": 1, "direct_answers": ["helmet", "pants", "gloves", "front", "pants", "belt", "blue", "gloves", "patch", "man"], "difficult_direct_answer": false, "rationales": ["The gloves are similar.", "The gloves are white and formal like the motorcycle.", "They are the same color"], "image": "train2014/COCO_train2014_000000390539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128248, "question_id": "ECCniwyZ8CzBgxZht28NKh", "question": "What is the purpose of the round objects in the tub?", "choices": ["drainage", "massage", "mood lighting", "soap dispensing"], "correct_choice_idx": 1, "direct_answers": ["six", "produce bubbles", "tub drain", "jets", "bathing", "water jets", "massage", "bubbles", "spa jets", "water outlet"], "difficult_direct_answer": true, "rationales": ["The purpose is a massage.", "These are jets that force water out fast", "This is a bath with round jets where water shoots out at high velocity. they are positioned horizontally to massage your body in the water."], "image": "train2014/COCO_train2014_000000128248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548583, "question_id": "EGdmMUtY8GWSdNCsgTegWL", "question": "What kind of uniform is worn by the man in this picture?", "choices": ["police", "school", "cheerleader", "football"], "correct_choice_idx": 1, "direct_answers": ["skirt", "school uniform", "schoolgirl", "school uniform", "school", "schoolgirl", "school girl", "private school", "school", "women"], "difficult_direct_answer": false, "rationales": ["The uniform is for school.", "That is like a catholic school girl uniform. skirts are normal attire for those attending education in uniform.", "The features and color scheme of the uniform are most commonly found in settings matching answer a."], "image": "train2014/COCO_train2014_000000548583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385709, "question_id": "EGmmoJMBgHAeYJuYr2YXSw", "question": "What product does the sponsor with the blue background offer?", "choices": ["mortgage", "bank account", "credit card", "investment banking"], "correct_choice_idx": 2, "direct_answers": ["credit", "credit card", "visa credit", "credit card", "credit cards", "credit cards", "credit card", "credit card", "credit cards", "visa credit"], "difficult_direct_answer": false, "rationales": ["The sponsor in question is clearly visible and readable and is known to provide answer a.", "The sign with the blue background is for visa.", "The product is a credit card."], "image": "train2014/COCO_train2014_000000385709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310546, "question_id": "EL4hxkLaodzXc3j594fmSN", "question": "How many types of foods are mixed in with the food?", "choices": ["three", "two", "five", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "vegetables", "potato", "two", "three", "two", "two", "two", "three"], "difficult_direct_answer": false, "rationales": ["It looks like spinach or kale with potatoes. it's hard to tell.", "The food is clearly visible and there are the colors and consistency of the food items are distinct.", "The bowl has two different colored vegetables chopped up and mixed together."], "image": "train2014/COCO_train2014_000000310546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422676, "question_id": "EL5htxbjgXDdoLGd32ttZE", "question": "What color are the flowers worn in the little black girl's hair?", "choices": ["pink", "blue", "yellow", "white"], "correct_choice_idx": 0, "direct_answers": ["purple", "purple", "brown", "purple", "purple", "purple", "pink", "violet", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["They are similar to violets in color", "The color of the flowers are too light to be red, but more red than white.", "The girl behind the bench has a bright pink flower in her hair."], "image": "val2014/COCO_val2014_000000422676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185821, "question_id": "EL9ir4Pap7scNejHTuzaVd", "question": "There is a large white structure with which animal directly in front of it?", "choices": ["cat", "zebra", "dog", "giraffe"], "correct_choice_idx": 1, "direct_answers": ["zebra", "wall", "zebra", "zebra", "zebra", "roof", "tent", "zebra", "giraffe", "wall"], "difficult_direct_answer": false, "rationales": ["The structure is a zebra.", "The black-and-white-striped animal is a zebra.", "There is a giraffe standing in the grass with a white structure behind it in the distance."], "image": "val2014/COCO_val2014_000000185821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296894, "question_id": "EMktGmM7wdUoVQHESruB9S", "question": "What is the girl on the left wearing?", "choices": ["clown nose", "jeans", "crown", "mask"], "correct_choice_idx": 1, "direct_answers": ["jeans", "tank top", "jeans shirt", "headband", "head scarf", "head scarf", "tank top", "jeans", "jeans", "scarf"], "difficult_direct_answer": false, "rationales": ["The cloth is blue.", "The girl on the left is wearing a pair of jeans next to a refrigerator.", "This is indicated by the color and texture of the material."], "image": "train2014/COCO_train2014_000000296894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61237, "question_id": "EMrf9S5PSZiRqe4QhQYN9k", "question": "What is the purpose of the heart around the cats neck?", "choices": ["protection", "visibility", "fashion", "identification"], "correct_choice_idx": 3, "direct_answers": ["tag", "cat tag", "identification tag", "tag", "name", "if lost", "collar", "owner", "1 heart", "identification"], "difficult_direct_answer": true, "rationales": ["The cat is wearing a tag on its neck with owner information on it.", "The purpose is for id.", "The heart is so people know who the cat is and belongs to."], "image": "train2014/COCO_train2014_000000061237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157675, "question_id": "EN8uxWsAUBYq6jaoX2XxEr", "question": "What kind of fuel does this run on?", "choices": ["methanol", "denatured alcohol", "gas", "coal"], "correct_choice_idx": 3, "direct_answers": ["coal", "coal", "diesel", "steam", "train", "coal", "coal", "train", "coals", "train"], "difficult_direct_answer": false, "rationales": ["The train on the tracks runs on energy provided by burning coal.", "The fuel is coal.", "A train is on tracks."], "image": "train2014/COCO_train2014_000000157675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66191, "question_id": "ENK9eYExiziGnU9vbkiNy6", "question": "Who played for the same team as this player?", "choices": ["pele", "manny ramirez", "clu gulager", "ken shamrock"], "correct_choice_idx": 1, "direct_answers": ["mo vaughn", "kutter crawford", "white sox", "babe", "hernandez", "manny ramirez", "baseball", "sandy koufax", "team player", "player"], "difficult_direct_answer": true, "rationales": ["The player is manny.", "Manny ramirez used to play for the red sox.", "The person is a player for the red sox."], "image": "val2014/COCO_val2014_000000066191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19528, "question_id": "EP2JJd3mhsMU2sT6S5A9rV", "question": "What type of dog is this?", "choices": ["great dane", "irish setter", "afghan", "yellow lab"], "correct_choice_idx": 3, "direct_answers": ["golden retriever", "lab", "labrador", "labrador", "labrador retriever", "sarabi", "labrador", "yellow lab", "yellow labrador", "yellow lab"], "difficult_direct_answer": false, "rationales": ["The dog is a lab.", "The dog is a hunting dog.", "This is a yellow labrador."], "image": "train2014/COCO_train2014_000000019528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17226, "question_id": "EQtXvAWPf2A7AXtSRGFqCL", "question": "How many yellow propeller planes are flying together in formation?", "choices": ["three", "four", "one", "two"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are 4.", "The planes are clearly visible and countable in the sky.", "There are two rows of two planes each"], "image": "train2014/COCO_train2014_000000017226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126264, "question_id": "ERHetTK43eSSHQzRNuPZmH", "question": "What color is the netting in the tennis racket held by the man about to hit the ball?", "choices": ["black", "white", "red", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "white", "yellow", "white", "yellow", "yellow", "yellow", "yellow", "yellow", "neon green"], "difficult_direct_answer": false, "rationales": ["It's almost the same shade of color as the tennis ball which is green and yellow.", "The color is a neon color that is much lighter than green.", "The man on the tennis court is holding a tennis racquet that is made from yellow netting."], "image": "train2014/COCO_train2014_000000126264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110251, "question_id": "ESPWC7mKNtGbEznmndMJXH", "question": "What is behind the fence?", "choices": ["cat", "dog", "antelope", "playground"], "correct_choice_idx": 3, "direct_answers": ["playground", "park", "playground", "playground", "playground", "playground", "park", "playground", "playground", "park"], "difficult_direct_answer": false, "rationales": ["There is a playground.", "There are slides.", "There is playground equipment and picnic tables behind the fence."], "image": "train2014/COCO_train2014_000000110251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22564, "question_id": "EShCnWG8jrWnagZe7BYhNz", "question": "How many bowls contain a kind of desert?", "choices": ["three", "six", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "two", "two", "four", "four", "four", "four", "two"], "difficult_direct_answer": false, "rationales": ["Small white dishes are filled with pastries and are on a plate.", "There are 2 bowls.", "There are two bowls with desert in them."], "image": "train2014/COCO_train2014_000000022564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94052, "question_id": "EV3jBB2sqTS4VK4NwYFVMe", "question": "What is on top of the dining table?", "choices": ["cat", "bananas", "fish heads", "yule log"], "correct_choice_idx": 1, "direct_answers": ["food", "bananas", "groceries", "groceries", "bags", "groceries", "groceries", "groceries", "bananas", "groceries"], "difficult_direct_answer": false, "rationales": ["The other options aren't on the table.", "There is a bunch of bananas on top of the dining table.", "Bananas are on top."], "image": "val2014/COCO_val2014_000000094052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558784, "question_id": "EXR4EUD3rimZop6aCo26y3", "question": "What is next to the man?", "choices": ["apple", "cat", "dog", "gym bag"], "correct_choice_idx": 3, "direct_answers": ["gym bag", "wooden pole", "good", "post", "bag", "lake", "black bag", "river", "wood post", "bag"], "difficult_direct_answer": true, "rationales": ["There is a duffel made of canvas that is intended to carry sport equipment.", "A piece of luggage is next to the man. there are no animals or fruits near the man.", "The object near the man has handles and appears to be of a canvas-like material. it appears to have zippers and would be used to carry things."], "image": "val2014/COCO_val2014_000000558784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332914, "question_id": "EXvR9EbD2skVfcWGLFdJc7", "question": "What color are the small dark stripes going around the toddler's shirt?", "choices": ["black", "blue", "orange", "brown"], "correct_choice_idx": 3, "direct_answers": ["black", "brown", "brown", "brown", "grey", "brown", "black", "black", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The other small stripes are white", "The boy's shirt is pink with brown stripes.", "The color is brown."], "image": "val2014/COCO_val2014_000000332914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500902, "question_id": "EYtN53TxznfmEVe8bjBANU", "question": "What are the elephants showing to the camera?", "choices": ["horns", "mouths", "trunks", "backsides"], "correct_choice_idx": 3, "direct_answers": ["behind", "butts", "tail", "tails", "tail", "tails", "back", "tails", "back", "backsides"], "difficult_direct_answer": false, "rationales": ["The tails are visible.", "The tails of the animals are facing the camera and their heads are facing away. based on this orientation, answer a is the only correct response from the list.", "The elephants are walking away from the camera. their horns, trunks, and mouths are not visible."], "image": "val2014/COCO_val2014_000000500902.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68623, "question_id": "Eb2vk5AZDxpt9wMVrucJS6", "question": "What is near the grass?", "choices": ["elephants", "cats", "cows", "dogs"], "correct_choice_idx": 0, "direct_answers": ["elephants", "elephants", "elephants", "elephant", "elephant", "elephants", "elephant", "elephants", "elephants", "elephants"], "difficult_direct_answer": false, "rationales": ["Several large animals with tru ks and floppy ears can be seen.", "There are a few animals with tusks on the ends of their heads and big floppy ears in the field.", "They are large grey animals with big ears and a long trunk."], "image": "train2014/COCO_train2014_000000068623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548755, "question_id": "EbzkmmzUq9UUq5jSLBeQVg", "question": "What kind of green vegetable is held underneath of the carrots like a card?", "choices": ["hops", "cucumber", "broccoli", "spinach"], "correct_choice_idx": 1, "direct_answers": ["cucumbers", "zucchini", "cucumber", "cucumber", "cucumber", "cucumber", "zucchini", "cucumbers", "parsley", "courgette"], "difficult_direct_answer": false, "rationales": ["Those types of veggies are like pickles and long.", "There is a cucumber or zuchinni with the carrots.", "They are long and green"], "image": "train2014/COCO_train2014_000000548755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355776, "question_id": "Ed3o3c8QKFht46LuPKBiKN", "question": "Which piece of attire is abnormal for the child to wear?", "choices": ["boots", "nothing", "underwear", "shirt"], "correct_choice_idx": 0, "direct_answers": ["shoe", "boots", "boots", "boot", "shoe", "boots", "boots", "shoe", "shoes", "boots"], "difficult_direct_answer": false, "rationales": ["A small child is standing at the counter wearing boots with no pants.", "The attire is boots.", "The boots are somewhat large for the child."], "image": "val2014/COCO_val2014_000000355776.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297964, "question_id": "EdYKktRDwvznscvVJhRGUd", "question": "What item hanging on the wall would help most on a battlefield?", "choices": ["blunderbuss", "bazooka", "dagger", "watch"], "correct_choice_idx": 2, "direct_answers": ["knif", "sword", "dagger", "sword", "sword", "sword", "knife", "sword", "knife", "sword"], "difficult_direct_answer": false, "rationales": ["The knife on the wall could be used in a combat situation.", "The item is too short to be a sword, and is pointed with a handle.", "The item is a dagger."], "image": "train2014/COCO_train2014_000000297964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62942, "question_id": "EeEFHdSAZscz99uFLWyHyx", "question": "What does the store to the left sell?", "choices": ["hammers", "donuts", "sandwiches", "pizza"], "correct_choice_idx": 0, "direct_answers": ["home goods", "things", "materials", "pro", "building supplies", "home improvement", "lumber", "petrol", "home items", "hammers"], "difficult_direct_answer": true, "rationales": ["The store sells hammers.", "A big box store that sells home goods is behind a pickup truck.", "This is a home building store"], "image": "train2014/COCO_train2014_000000062942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76250, "question_id": "EfgRS97W8qXpogLbSBM6Wb", "question": "How many colors of icing are glazing these donuts in the display case?", "choices": ["three", "four", "one", "two"], "correct_choice_idx": 1, "direct_answers": ["four", "three", "four", "five", "four", "four", "three", "seven", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are various pastries on display with glaze, pink, chocolate, and white frosting.", "This is obvious after simply counting the different types.", "There are pink, purple, brown, and tan donuts in the case."], "image": "train2014/COCO_train2014_000000076250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285291, "question_id": "EfjKa4oysXftztg9RywppG", "question": "What is the child standing on?", "choices": ["grass", "box", "snow", "sand"], "correct_choice_idx": 2, "direct_answers": ["skating", "skateboard", "ski", "snow", "skis", "iceskaters", "snow", "skis", "skis", "snow"], "difficult_direct_answer": false, "rationales": ["The ground is covered in white and she's wearing warm clothing. under her feet are skis.", "The child is on snow.", "It's white and frozen"], "image": "val2014/COCO_val2014_000000285291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114398, "question_id": "EgxgNwyBKBF4AG5vAvLAj2", "question": "What do the animals have?", "choices": ["wings", "stingers", "trunks", "long necks"], "correct_choice_idx": 2, "direct_answers": ["trunks", "trunks", "elephant", "trunks", "trunks", "trunks", "water", "trunks", "tusks", "trunks"], "difficult_direct_answer": false, "rationales": ["The most prominent object on an elephant is its trunk.", "Long hose noses also called trunks are the only feature present on these elephants which is listed here.", "The animals are elephants, not giraffes, bees, or birds."], "image": "val2014/COCO_val2014_000000114398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337255, "question_id": "EgzQ6cN6vJVL797X7RZfFs", "question": "What color of blanket does the cat sleep upon?", "choices": ["yellow", "red", "white", "blue"], "correct_choice_idx": 0, "direct_answers": ["orange color", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "The cat's blanket is not white, blue, or red.", "The cat on the couch is sleeping on a yellow blanket."], "image": "train2014/COCO_train2014_000000337255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287236, "question_id": "EhDoePjHuxKMv5DNrmeGkf", "question": "What is the person using to video record the live performance?", "choices": ["cam quarter", "cell phone", "film camera", "nikon"], "correct_choice_idx": 1, "direct_answers": ["cell phone", "cellphone", "cell phone", "cell phone", "phone", "mobile", "cell phone", "party", "cell phone", "phone"], "difficult_direct_answer": false, "rationales": ["The size and shape of the object and question is consistent with answer a as well as the application of it.", "The person is using a nokia phone to record the concert.", "A cell phone can be used to film because this feature is on most cells"], "image": "train2014/COCO_train2014_000000287236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100959, "question_id": "Ei97tPEHWWcdg7H4kkkKcF", "question": "What is under the food?", "choices": ["box", "spatula", "sugar", "egg"], "correct_choice_idx": 1, "direct_answers": ["spatula", "pizza peel", "holder", "plate", "table", "spatula", "spatula", "spatula", "pisa", "car"], "difficult_direct_answer": false, "rationales": ["The spatula is underneath.", "There is a handle of a serving utensil visible under the pizza and answer a is the name for such a tool.", "There is a spatula under the pizza."], "image": "train2014/COCO_train2014_000000100959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 273607, "question_id": "EkAUCxc66tA9w4rg67ih3u", "question": "The big fluffy cat is sitting on what?", "choices": ["backpack", "bed", "yarn", "suitcase"], "correct_choice_idx": 0, "direct_answers": ["some case", "backpack", "backpack", "bag", "suitcase", "backpack", "bag", "bag", "backpack", "bag"], "difficult_direct_answer": false, "rationales": ["The cat is sitting on top of a big backpack.", "The big fluffy cat is sitting on top of a black backpack with straps and zippers.", "The cat is on a backpack."], "image": "train2014/COCO_train2014_000000273607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529905, "question_id": "EkK8Zgh7mfqf5owLvANJrw", "question": "Where is the plane stopped?", "choices": ["road", "driveway", "tarmac", "roof top"], "correct_choice_idx": 2, "direct_answers": ["aerodrum", "tarmac", "runway", "protect", "airport", "airport", "runway", "runway", "airport", "tarmac"], "difficult_direct_answer": false, "rationales": ["The tarmac is where planes go to land and drive around to a terminal.", "This is indicated by the size of the parking area.", "This is the correct name for this type of area."], "image": "train2014/COCO_train2014_000000529905.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458282, "question_id": "Em6YBAMAAPWZE2fV8qkjMN", "question": "What is on the bed?", "choices": ["dogs", "pillows", "old man", "cats"], "correct_choice_idx": 1, "direct_answers": ["pillows", "pillows", "pillow", "pillows", "pillows", "bedding", "billow", "pillow", "pillows", "blanket"], "difficult_direct_answer": false, "rationales": ["These are fluffy stuffed fabric for your head", "Two large, plush and square objects are on top of a bed near the headboard.", "There are many pillows."], "image": "train2014/COCO_train2014_000000458282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511136, "question_id": "EmimPpdcTTixdL8jqqmr6E", "question": "What does the item the man is holding provide?", "choices": ["fruit", "shade", "water", "milk"], "correct_choice_idx": 1, "direct_answers": ["shade", "shade", "shade", "sun umbrella", "rain protection", "umbrella", "umbrella", "umberalla", "umrella", "shade"], "difficult_direct_answer": false, "rationales": ["The item is shade.", "The item gives shade.", "The item the man is holding is an umbrella which provides shade from the sun."], "image": "val2014/COCO_val2014_000000511136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385118, "question_id": "EmrCd5mG7Vn8LW7f3HYyQw", "question": "What is the person in the foreground descending?", "choices": ["escalator", "ski slope", "ramp", "mountain"], "correct_choice_idx": 2, "direct_answers": ["skateboard", "ramp", "skate", "riding", "skate ramp", "ramp", "ramp", "ren", "ramp rink", "ramp"], "difficult_direct_answer": false, "rationales": ["He is going down a ramp.", "The man is going down a slope made for skateboards, with a skateboard.", "The person has a ramp."], "image": "train2014/COCO_train2014_000000385118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252193, "question_id": "EnB3YEtecUxF3Feh72NV2y", "question": "Based on the stack of books what does this person like to use their computer to do?", "choices": ["program", "social media", "read fiction", "play games"], "correct_choice_idx": 3, "direct_answers": ["play games", "write mysteries", "play games", "computer", "study", "laptop", "video games", "warcraft", "gaming", "repair"], "difficult_direct_answer": true, "rationales": ["Books based on a video game are stacked behind a laptop computer.", "They are a gamer.", "They are playing."], "image": "train2014/COCO_train2014_000000252193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343385, "question_id": "EojFugzLncbMQCN8e5FYYJ", "question": "Where are these animals positioned in?", "choices": ["conservatory", "wild", "zoo", "display"], "correct_choice_idx": 3, "direct_answers": ["africa", "beach", "jungle", "standing", "safari", "display", "diorama", "museum", "zoo", "hill"], "difficult_direct_answer": true, "rationales": ["The animals aren't real. they are set up for display.", "The animals are displayed.", "With the glare from the window and false background you can tell it's a display."], "image": "train2014/COCO_train2014_000000343385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96244, "question_id": "EpFVjTdBhJsLT8LgdpPHiQ", "question": "How many elephants are standing nearby the fallen log?", "choices": ["four", "one", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "one", "three", "three", "three", "three", "three", "two", "three"], "difficult_direct_answer": false, "rationales": ["Three large animals with long trunks are walking around large trees and one that is fallen.", "One is on the other side and two are at the end of it", "They are all close to it"], "image": "train2014/COCO_train2014_000000096244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387206, "question_id": "EqRhrTvRBJemboXd3wYtoi", "question": "Why does the man with the red shoe have a dark leg?", "choices": ["skin condition", "low melatonin", "tattoos", "bruise"], "correct_choice_idx": 2, "direct_answers": ["tattoos", "tattoo", "tattoo", "tattoos", "tattoos", "dwdw", "jumping", "tattoo", "tattoos", "tatoo"], "difficult_direct_answer": false, "rationales": ["That is paint for a tattoo.", "The mans skin tone is visible with ink applied on it consistent with answer a.", "There are tattoos on his leg."], "image": "train2014/COCO_train2014_000000387206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24207, "question_id": "EsNabd9UyKXvjvmzMPwA5w", "question": "What does the animal in the foreground have?", "choices": ["stinger", "quills", "wings", "horns"], "correct_choice_idx": 3, "direct_answers": ["horns", "horns", "horns", "horns", "horns", "horns", "horns", "horns", "horns", "wool"], "difficult_direct_answer": false, "rationales": ["The animal has two curved hard things coming off its head.", "The animal in the foreground is a sheep, not a bird, porcupine, or bee.", "The animal has horns."], "image": "val2014/COCO_val2014_000000024207.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514529, "question_id": "EtbehK2sPYgMZRZcuAdZdD", "question": "How is the man in the water moving?", "choices": ["fish guidance", "motor", "sail", "paddle"], "correct_choice_idx": 2, "direct_answers": ["knee boarding", "one", "kite sail", "one", "sail", "surfing", "wind power", "surfboard", "skiing", "kite"], "difficult_direct_answer": true, "rationales": ["The man in the water is attached to wires. the wires attach to an item, which is not a motor or paddle, that is moving the man.", "The man has a sail.", "The man is attached to answer a by a harness and to engage in this activity and successfully move, one would rely on the thing that the harness attaches them to."], "image": "val2014/COCO_val2014_000000514529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492506, "question_id": "Eu3DNjkDTnsVuenVe3w78P", "question": "What direction are the giraffes headed?", "choices": ["east", "south", "north", "west"], "correct_choice_idx": 3, "direct_answers": ["forward", "forward", "west", "left", "west", "west", "west", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["The heads of the giraffes are facing the left side.", "The giraffes are heading to the left.", "They are all headed the same direction"], "image": "val2014/COCO_val2014_000000492506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315165, "question_id": "EvF4RnBCNCJmVUrBnesgCg", "question": "What type of boat is the small vessel?", "choices": ["sailboat", "canoe", "motorboat", "yacht"], "correct_choice_idx": 2, "direct_answers": ["fishing", "canoe", "skiff", "paddle boat", "motor boat", "small", "engine boat", "kayak", "motorboat", "raft"], "difficult_direct_answer": true, "rationales": ["The features of the boat are mostly visible and consistent with answer a while inconsistent based on size and composition with any other answer.", "The type is a motorboat.", "It is nighttime and there is an orange watercraft with a light on board in the water."], "image": "train2014/COCO_train2014_000000315165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216006, "question_id": "Ew8Q6M2T67wzjzYYreqCGv", "question": "What part of town is this car in based on the sign?", "choices": ["east", "south", "west", "north"], "correct_choice_idx": 0, "direct_answers": ["broadway", "east", "outskirts", "stop", "east", "broadway", "no parkiing", "east", "travel", "city"], "difficult_direct_answer": false, "rationales": ["There is an e on both the street signs", "The sign says 'e' which stands for east.", "The part is east."], "image": "val2014/COCO_val2014_000000216006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385734, "question_id": "EwPeTE6N4hvJ2SL9gpBioY", "question": "How many goats are visible before the cameraperson?", "choices": ["two", "five", "three", "four"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["A few horned animals are standing together, two fully visible and one partially.", "There are three goats with horns looking straight but only one is completely in view.", "A few goats are in a grassy area standing together."], "image": "train2014/COCO_train2014_000000385734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374595, "question_id": "Exxjz63r38uhhCr6QUujBW", "question": "What makes this a good day for flying kites?", "choices": ["humid", "cold", "cloudy", "clear skies"], "correct_choice_idx": 3, "direct_answers": ["stop", "wind", "weather", "clear skies", "wind", "windy", "car", "sunny", "clear skies", "very"], "difficult_direct_answer": false, "rationales": ["The skies are blue and without clouds.", "You can see them in the air clearly", "There are no clouds, making it easy to see the kite."], "image": "train2014/COCO_train2014_000000374595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282137, "question_id": "EzLHthzUy9ACsepXMdvysq", "question": "How many Siamese cats are sitting atop the window cell?", "choices": ["one", "two", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["watching", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two siamese cats. they are sitting on the sill.", "There are two cats.", "One siamese cat is sitting beside another one."], "image": "train2014/COCO_train2014_000000282137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310242, "question_id": "F2A5j4JKCJcU73WHaLjJC9", "question": "What song is in a similar language to the language found at the top of the wall?", "choices": ["revolution rock", "yellow submarine", "la mer", "der kommissar"], "correct_choice_idx": 3, "direct_answers": ["night world", "du haust", "feurwehr", "99 luffballons", "der kommisar", "edelweiss", "german", "breaking law", "na", "der kommissar"], "difficult_direct_answer": true, "rationales": ["The sign is in german, not english or french.", "This is german for the commissioner.", "Der kommissar is a song in german which is the same language as the words on the building."], "image": "train2014/COCO_train2014_000000310242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110718, "question_id": "F2HQ7x4XJo6X7PmKGUnBWQ", "question": "What are the first two numbers on the truck?", "choices": ["45", "96", "88", "26"], "correct_choice_idx": 3, "direct_answers": ["two six", "two six", "two six", "two six", "two", "26", "25", "two six", "26", "26"], "difficult_direct_answer": false, "rationales": ["The numbers in option a match the two numbers on the truck's license plate.", "The license plate number on the truck is 2622.", "The numbers are 2 and 6."], "image": "train2014/COCO_train2014_000000110718.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539787, "question_id": "F2HXyyKzm4fXzfamXg7BPp", "question": "What is the man holding?", "choices": ["cat", "egg", "baby", "bird"], "correct_choice_idx": 2, "direct_answers": ["baby", "baby", "baby", "baby", "baby", "baby", "baby", "baby", "napkin", "baby"], "difficult_direct_answer": false, "rationales": ["The man has a kid.", "There is a very young human in his lap that is probably his kid.", "The man has a baby."], "image": "val2014/COCO_val2014_000000539787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432035, "question_id": "F2qdiVCzDKmPskpRAv9CDS", "question": "What is there an excessive amount of relative to most pizzas?", "choices": ["bread", "sauce", "meat", "vegetables"], "correct_choice_idx": 1, "direct_answers": ["sauce", "sauce", "twenty", "tomato paste", "sauce", "sauce", "pepperoni", "sauce", "tomato sauce", "sauce"], "difficult_direct_answer": false, "rationales": ["Tomato from the look of it.", "There is a lot of marinara sauce on the pizza.", "The pizza has sauce."], "image": "val2014/COCO_val2014_000000432035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175047, "question_id": "F3iRQminT78iatxPAatezd", "question": "What color are the stops used to block traffic in the construction?", "choices": ["purple", "blue", "green", "red"], "correct_choice_idx": 3, "direct_answers": ["orange", "red", "red", "orange", "re", "orange", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The colour grabs the attention of people.", "The color is red."], "image": "train2014/COCO_train2014_000000175047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554040, "question_id": "F4oNrxjBTsSwMgtJ7tJdB8", "question": "What is the green stuff in the bottle most likely?", "choices": ["soap", "chalk", "jam", "jelly"], "correct_choice_idx": 0, "direct_answers": ["soap", "soap", "soap", "handsoap", "soap", "soap", "soap", "soap", "soap", "soap"], "difficult_direct_answer": false, "rationales": ["The green liquid on the sink is mostly likely soap to wash hands.", "It is in a dispenser.", "It's in a pump dispenser bottle"], "image": "train2014/COCO_train2014_000000554040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142684, "question_id": "F6GMyvdJY4ufhSLApynLgp", "question": "Which appliance is most likely to catch on fire?", "choices": ["none", "microwave", "neither", "oven"], "correct_choice_idx": 3, "direct_answers": ["stove", "stove", "oven", "oven", "stove", "stove", "car", "stove", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["The black appliances can both catch fire because they both give off heat. however the appliance above stove has higher heat at once.", "The oven in the kitchen can get very hot and burn foods which could catch on fire.", "The appliance is the oven."], "image": "train2014/COCO_train2014_000000142684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205525, "question_id": "F6h4Ahz2YSuzveHrHFrBNH", "question": "What is the item on the chain for?", "choices": ["test water", "plunger", "drain stopper", "hot water"], "correct_choice_idx": 2, "direct_answers": ["water stopper", "drain plug", "glass", "hold", "plug drain", "safekeeping", "stopper", "cleaning", "drain stopper", "drain stopper"], "difficult_direct_answer": true, "rationales": ["The item on the chain is resting on the sink. it is used to stop the water from going down the drain.", "The item is a drain.", "It is a drain stopper as it shows the drain system."], "image": "train2014/COCO_train2014_000000205525.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316755, "question_id": "F7ePGVDXXq9yExbyte4N4M", "question": "How many elephants are walking around the marshy river water?", "choices": ["three", "four", "six", "five"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "some", "four", "four elephants"], "difficult_direct_answer": false, "rationales": ["Two elephants are walking in front of two other elephants.", "There is a group of four elephants walking along the water consisting of three adults and one young elephant.", "There are more than three but less than five elephants."], "image": "train2014/COCO_train2014_000000316755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 499842, "question_id": "F84yYae6jp8MhtNxx9yvTA", "question": "How many monitors are on top of the desk with the white keyboard and mouse?", "choices": ["five", "two", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "one", "two", "two", "two", "two", "one", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["There are two monitors on the desk. one monitor is a laptop and one is a plain monitor.", "There are more than one but less than three monitors.", "There are two monitors."], "image": "val2014/COCO_val2014_000000499842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106296, "question_id": "F9HuAb4QC9FErQu7mruryb", "question": "What is separating the two cats?", "choices": ["baby", "potted plant", "monkey", "food bowl"], "correct_choice_idx": 1, "direct_answers": ["potted plant", "plant", "flower pot", "plant", "potted plant", "pot", "planter", "pot", "planter", "vase"], "difficult_direct_answer": false, "rationales": ["Two cats sit on a bench with a green plant in a round receptacle between them.", "This is obvious in the scene. it's a round one.", "The are outside."], "image": "train2014/COCO_train2014_000000106296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227879, "question_id": "F9T22AHQ5FHhoZcQGhetmb", "question": "Who could this batter be?", "choices": ["jd drew", "derek jeter", "otis nixon", "chipper jones"], "correct_choice_idx": 0, "direct_answers": ["maddox", "jd drew", "babe ruth", "baseball player", "baseball field", "baseball player", "baseball", "home run", "car", "player"], "difficult_direct_answer": true, "rationales": ["The player plays for the boston red sox based on his uniform and only answer a played for this team in their career and would have worn this uniform.", "The batter is drew.", "He is wearing the correct number."], "image": "val2014/COCO_val2014_000000227879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112078, "question_id": "F9hT4uUnfyqr92MxGoTyFC", "question": "What animals is the child looking at?", "choices": ["cats", "deer", "beavers", "cows"], "correct_choice_idx": 3, "direct_answers": ["cow", "cow", "cows", "cow", "cow", "cow", "cow", "cow", "cow", "cows"], "difficult_direct_answer": false, "rationales": ["The kid looks at cows.", "The animal has horns and udders.", "The child is looking at cows."], "image": "train2014/COCO_train2014_000000112078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119773, "question_id": "F9kpYWMLGyQfWaXK6Nxvxz", "question": "What is the man holding?", "choices": ["tire", "child", "basketball", "basket"], "correct_choice_idx": 1, "direct_answers": ["child", "surfboard", "surfboard", "girl", "child", "baby surfboard", "baby", "skateboard", "child", "child"], "difficult_direct_answer": false, "rationales": ["The man has a kid.", "The man is holding a person. the person is not an adult.", "The man in the water is carrying a child and is walking in the waves."], "image": "val2014/COCO_val2014_000000119773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38490, "question_id": "FA8U6hLxrNLtgBGCMHxBUa", "question": "What are the giraffes near?", "choices": ["dogs", "elephants", "cats", "trees"], "correct_choice_idx": 3, "direct_answers": ["feeder", "trees", "fence", "tree", "fence", "trees", "fence", "trees", "trees", "rail"], "difficult_direct_answer": false, "rationales": ["The giraffes are by trees.", "They are standing among plants that are tall and have leaves.", "The giraffes are all standing near trees."], "image": "train2014/COCO_train2014_000000038490.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281922, "question_id": "FAKpvPXp6k9Q6TsayetsW5", "question": "What color is the boundary of the frisbee thrown by the man in shorts on the beach?", "choices": ["white", "yellow", "red", "blue"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "orange", "red", "red", "orange", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The trim of the frisbee is red colored.", "The color is red."], "image": "train2014/COCO_train2014_000000281922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114503, "question_id": "FAPnJKEoz7HGHrBTKJmiyR", "question": "What color are the tags on top of the honey dispensers?", "choices": ["orange", "white", "pink", "purple"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "orange", "orange", "white", "blue", "orange", "orange", "brown", "orange"], "difficult_direct_answer": false, "rationales": ["The honey dispensers have blue lids and have orange tags stuck to them.", "The tags on top are not white, pink, or purple.", "It's a bright reddish-yellowish color against the blue top of the honey."], "image": "val2014/COCO_val2014_000000114503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382118, "question_id": "FCGh552JLgyNjQroYwDRBu", "question": "How many big elephants are inside of this zoo enclosure together?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "two", "three", "three", "car", "three", "three", "three", "stop"], "difficult_direct_answer": false, "rationales": ["You can count 3 elephants standing closely together", "There are no elephants fully visible. but several can be seen.", "Three elephants can be seen."], "image": "train2014/COCO_train2014_000000382118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 487778, "question_id": "FCmusufxHnyth6dAfC36DM", "question": "What time is it?", "choices": ["morning", "midnight", "dawn", "dusk"], "correct_choice_idx": 0, "direct_answers": ["day", "day", "morning", "afternoon", "noon", "mid afternon", "afternoon", "12pm", "daytime", "daytime"], "difficult_direct_answer": false, "rationales": ["The sun is reflecting on the bench which shows it is morning.", "The time is morning.", "There is a shadow."], "image": "train2014/COCO_train2014_000000487778.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217707, "question_id": "FDjv4zzZJmo5LJBsX4RgUQ", "question": "How many computer monitors are on top of the desk next to the clipboard?", "choices": ["five", "two", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["one", "two", "one", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Monitors are screens that are used to display contents in this image there is one laptop and one desktop computer that both have a monitor.", "There is one for a regular computer and one attached to the laptop", "There are 2."], "image": "train2014/COCO_train2014_000000217707.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305499, "question_id": "FFHcgPfHMEGUvYHvYhrg5j", "question": "How do the horses likely feel towards each other?", "choices": ["anxious", "threatened", "friendly", "angry"], "correct_choice_idx": 2, "direct_answers": ["love", "friendly", "love", "loving", "love", "love eachother", "happy", "love", "love", "yes"], "difficult_direct_answer": false, "rationales": ["The horses are friendly.", "The horses are friends.", "The horses are rubbing each other affectionately."], "image": "train2014/COCO_train2014_000000305499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330894, "question_id": "FHL7dqBoswnaiWt4sEKa2J", "question": "What is lined up on the side of the street?", "choices": ["motor bike", "elephants", "cows", "toddlers"], "correct_choice_idx": 0, "direct_answers": ["scooty", "scooters", "scooter", "scooters", "motorbikes", "moped", "moped", "motor bike", "gopeds", "scooters"], "difficult_direct_answer": false, "rationales": ["The bikes are lined.", "There is a line of scooters parked along the street. they are used for transportation around town.", "These each have two wheels and a motor and generally hold one person each"], "image": "train2014/COCO_train2014_000000330894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134223, "question_id": "FKLUr5Gx9aVNcNM2L6Hr8S", "question": "What is between the bread?", "choices": ["hamburger", "hot dog", "salami", "ham"], "correct_choice_idx": 1, "direct_answers": ["wiener", "hot dog", "hot dog", "sausage", "bread and sauce", "hotdog", "chicken", "sausage", "hotdog", "sausage"], "difficult_direct_answer": false, "rationales": ["The meat is brownish and cylindrical-shaped piece. hot dogs are typically served on a bun.", "Traditionally those types of breads are used for keeping hot dogs in place.", "It is identifiable by its long tubular shape and it is traditional food on a bun. it is often served with condiments and fries."], "image": "val2014/COCO_val2014_000000134223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337147, "question_id": "FKdsWsd6RuWGQqfVsacJEa", "question": "What color are the fruits sliced out on top of the pancake?", "choices": ["blue", "pink", "white", "red"], "correct_choice_idx": 2, "direct_answers": ["yellow", "banana", "yellow", "yellow", "yellow", "yellow", "yellow", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The fruits are white.", "The color is white.", "Banana is sliced on top of pancakes. bananas are white."], "image": "train2014/COCO_train2014_000000337147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579201, "question_id": "FKkeWDRrT4BdCUHoaRmbFD", "question": "What is likely the red substance on the knife?", "choices": ["blood", "paint", "marker", "crayon"], "correct_choice_idx": 1, "direct_answers": ["apple", "apple", "paint blood", "apple", "juice", "paint", "cut", "paint", "apple", "paint"], "difficult_direct_answer": false, "rationales": ["The substance is paint.", "The substance is paint.", "There is a red substance that looks like blood but is thicker."], "image": "val2014/COCO_val2014_000000579201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406792, "question_id": "FKtHqWZpBeLBGvXeDgGXAf", "question": "What color is the painting on the wall behind the bed stand?", "choices": ["red", "yellow", "green", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "fsfss", "blue", "blue", "but", "blue", "white", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The color matches the sky.", "The painting above the bed is mostly this color.", "The color is blue."], "image": "train2014/COCO_train2014_000000406792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 163975, "question_id": "FLLCw7jcBLihhnFwAyBiKj", "question": "How many mirrors are on the pedestal mirror?", "choices": ["four", "two", "one", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "There are that many mirrors that are separated by those black bar things.", "There are three mirrors."], "image": "train2014/COCO_train2014_000000163975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424160, "question_id": "FMhMf4i5P67wonWoukvS77", "question": "What brand features these animals?", "choices": ["coca cola", "laughing cow", "goya", "mcdonalds"], "correct_choice_idx": 1, "direct_answers": ["milk", "redbull", "chick-fil-a", "laughing cow", "stop", "red bull", "laughing cow", "milk", "tg lee", "kraft"], "difficult_direct_answer": false, "rationales": ["Laughing cow cheese features cows.", "The laughing cow features them.", "It is the laughing cow cheese that has a cow on its label"], "image": "train2014/COCO_train2014_000000424160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97427, "question_id": "FP2K5kmnbaUwLFXizwcDVV", "question": "What provides heat to the stovetop burners?", "choices": ["natural gas", "electricity", "wood", "coal"], "correct_choice_idx": 0, "direct_answers": ["gas", "gas", "gas", "gas cylinder", "gas", "gas", "gas", "gas", "gas", "natural gas"], "difficult_direct_answer": false, "rationales": ["The heaters use gas to heat up.", "These have burners that produce flame", "The burner heads where the flames come out are visible under the burner grates."], "image": "val2014/COCO_val2014_000000097427.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316708, "question_id": "FPxYBAvnCpDphM8rEtqGp6", "question": "What animal is unseen but represented by an item here?", "choices": ["antelope", "mouse", "cat", "dog"], "correct_choice_idx": 1, "direct_answers": ["mouse", "mouse", "elephant", "mouse", "mouse", "mouse", "mouse", "stop", "mouse", "mouse"], "difficult_direct_answer": false, "rationales": ["The computers are controlled with an animal-like item. the animal is not a cat, dog, or antelope.", "The animal is a mouse.", "A computer mouse controls the laptops."], "image": "val2014/COCO_val2014_000000316708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453862, "question_id": "FQMpjA9uwW3d4p2oigct5w", "question": "What is the purpose of the gold and black object?", "choices": ["improve reception", "play games", "make calls", "play music"], "correct_choice_idx": 0, "direct_answers": ["antenna", "baseball stand", "vaping", "hold brushes", "communication", "baseball stand", "walkie talkie", "improve reception", "radio", "battery"], "difficult_direct_answer": true, "rationales": ["The object is an antenna and used to fix bad reception.", "The purpose is to improve reception.", "The devices has rabbit ears to get a signal."], "image": "val2014/COCO_val2014_000000453862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179479, "question_id": "FQoCGiEsgh8WBynhz8BuvL", "question": "What object use to interact with fish is being shown in this image?", "choices": ["hat", "shoes", "plane", "fishing rod"], "correct_choice_idx": 3, "direct_answers": ["fishing pole", "fishing rod", "fishing pole", "pole", "fishing pole", "interact", "fishing rod", "pole", "rod", "fishing rod"], "difficult_direct_answer": false, "rationales": ["The man in the back has a pole and reel that is meant for catching fish.", "The man is holding a stick to catch the fish.", "It's the only option that makes sense given the question and the element near the top right corner."], "image": "train2014/COCO_train2014_000000179479.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447573, "question_id": "FQwT8FeGV4inLWaXdC5CXw", "question": "What time is it approximately?", "choices": ["225", "915", "430", "637"], "correct_choice_idx": 2, "direct_answers": ["430", "four thirty", "430 pm", "four thirty", "429", "four thirty", "430", "four thirty", "4.30", "430"], "difficult_direct_answer": false, "rationales": ["The time is approximately 4:30.", "The short hand is on the four and the long hand is near the six.", "The time is 4:30."], "image": "train2014/COCO_train2014_000000447573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189955, "question_id": "FV4dRuTcGoF4sjKYCjpkXi", "question": "What is next to the plane?", "choices": ["egg", "motorcycle", "statue", "traffic cones"], "correct_choice_idx": 3, "direct_answers": ["person", "people", "traffic cones", "person", "traffic cones", "orange cones", "cones", "orange cones", "man", "person"], "difficult_direct_answer": false, "rationales": ["The cones are near.", "The airplane has traffic cones placed on the sides of it to keep other vehicles away from it.", "These bright orange structures are used to direct traffic to a safe route when there is danger or areas that need to be avoided."], "image": "val2014/COCO_val2014_000000189955.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195510, "question_id": "FW9Qh6wRzbmh64UERnWvVP", "question": "Which country is this highway most likely seen in?", "choices": ["ukraine", "romania", "japan", "china"], "correct_choice_idx": 2, "direct_answers": ["japan", "japan", "right-driving countries", "nothing", "britain", "japan", "china", "europe", "us", "britain"], "difficult_direct_answer": false, "rationales": ["The country is japan.", "The lettering on the signs is part of the japanese alphabet.", "The country is japan."], "image": "val2014/COCO_val2014_000000195510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136043, "question_id": "FWe9iUNzo6HLkE6DejUaj2", "question": "What is the largest number on the bus that is located under the wheelchair sign?", "choices": ["two", "eight", "seven", "four"], "correct_choice_idx": 1, "direct_answers": ["eight", "bus number", "eight", "1418", "1418", "eight", "1418", "1418", "36", "eight"], "difficult_direct_answer": false, "rationales": ["The 8 is the largest number on the bus above the wheelchair.", "A bus is in the street with numbers in various parts of the rear of the bus to signify bus stop and other information.", "There are four numbers, and one and four are smaller than the other number."], "image": "train2014/COCO_train2014_000000136043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 178592, "question_id": "FXMARCyWGNZHxccnLMJDjb", "question": "What animal is to the left of the elephants?", "choices": ["vulture", "cow", "horse", "zebra"], "correct_choice_idx": 3, "direct_answers": ["zebras", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebras", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["The animal is the zebra.", "The animal is the zebra.", "They are recognizable by their smaller stature and characteristic black and white stripes. this animal is found in the same environment as elephants."], "image": "val2014/COCO_val2014_000000178592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390756, "question_id": "FYApP52oLu8qiTyW6w9MSo", "question": "How is the man on the skateboard being propelled?", "choices": ["jet engine", "propeller", "motor", "bike"], "correct_choice_idx": 3, "direct_answers": ["bike", "bike", "bike", "behind bike", "kicking", "rope", "force", "stick", "bicycle", "bicycle rope"], "difficult_direct_answer": false, "rationales": ["There is a woman riding a bicycle who is pulling the man with a string.", "A man is riding a skateboard and holding a rope that is attached to the girl on the bike in front of him.", "The bike is pulling the board."], "image": "train2014/COCO_train2014_000000390756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393845, "question_id": "FZPAVHtxgpLRdph3JGkWa3", "question": "What is the woman fixing?", "choices": ["tie", "car door", "computer", "pie"], "correct_choice_idx": 0, "direct_answers": ["tie", "dye", "his tie", "tie", "tie", "tie", "tie", "necktie", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["The woman is helping the man by fixing his necktie.", "A woman is leaning towards a man in a dress shirt. men often wear ties with dress shirts.", "She's tightening the knot on the fabric around the man's neck"], "image": "train2014/COCO_train2014_000000393845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28494, "question_id": "Fb8muQZyZSdKD7YPWqH7CC", "question": "How many zebras are standing in the middle of the open field?", "choices": ["four", "six", "five", "two"], "correct_choice_idx": 2, "direct_answers": ["five", "six", "five", "five", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["There are five zebras standing together and grazing in the open field.", "There are five zebras in the middle of the field and some are grazing.", "There are 5."], "image": "train2014/COCO_train2014_000000028494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50531, "question_id": "FbDMpM82MibwwK63y572pG", "question": "What color are the donuts made from this strange pan?", "choices": ["brown", "black", "purple", "white"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "brown", "brown", "brown", "brown", "merun", "brown", "tan", "golden"], "difficult_direct_answer": false, "rationales": ["After being baked, the goods turn a golden-brownish color.", "The pastries in this pan are a pale orangeish brown.", "They have been cooked"], "image": "val2014/COCO_val2014_000000050531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398142, "question_id": "Fbff6L7SKrg5gKNyULnKf3", "question": "What country's flag is being flown?", "choices": ["france", "italy", "switzerland", "england"], "correct_choice_idx": 0, "direct_answers": ["france", "france", "france", "france", "france", "france", "france", "chili", "russia", "france"], "difficult_direct_answer": false, "rationales": ["The flag is red, white, and blue. it does not have crosses.", "The french flag is being flown. it is red, white, and blue but with large single stripes.", "The flag is a french flag."], "image": "train2014/COCO_train2014_000000398142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299349, "question_id": "FbjJzwuaDPgRx7nD39PuZJ", "question": "What color is the smallest pair of scissors?", "choices": ["pink", "black", "green", "orange"], "correct_choice_idx": 3, "direct_answers": ["stop", "orange", "orange", "orange", "orange", "yellow", "yellow", "orange", "blue orange", "orange"], "difficult_direct_answer": false, "rationales": ["The color is orange.", "The smallest scissors are light orange.", "Both smaller ones are this color"], "image": "val2014/COCO_val2014_000000299349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327532, "question_id": "FdZ828Qk9fSjitDXcU67yD", "question": "What is the name of the game mode being played?", "choices": ["fly swatting", "singles", "doubles", "foreigners"], "correct_choice_idx": 2, "direct_answers": ["doubles tennis", "doubles", "doubles", "tennis", "tennis", "stop", "duos", "tennis", "tennies", "tennis"], "difficult_direct_answer": false, "rationales": ["There are two people playing on each side.", "There are two people on the same side", "There are two people in the team."], "image": "val2014/COCO_val2014_000000327532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402833, "question_id": "FdqoMVhbLczgNjGYM5g7ms", "question": "The man in the white hat following with the elephants is wearing what color of shirt?", "choices": ["green", "white", "purple", "blue"], "correct_choice_idx": 0, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["There is only one man wearing a white cap and he's wearing a green shirt.", "It's a little darker than the color of grass", "The man has a green shirt."], "image": "train2014/COCO_train2014_000000402833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17301, "question_id": "FdtaQtNYwWK9rBkSsMwZCz", "question": "What device is found here?", "choices": ["oven", "washing machine", "desktop computer", "refrigerator"], "correct_choice_idx": 1, "direct_answers": ["toilet", "toilet", "dryer", "washing machine", "toilet", "washing machine", "washing machine", "washing machine", "toilet", "clothes washer"], "difficult_direct_answer": false, "rationales": ["The device washes.", "There is a small washing machine found in the corner of the room.", "You see a washing machine at the back."], "image": "train2014/COCO_train2014_000000017301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238004, "question_id": "Ffdt7GmFN7MQtDqigkcYAk", "question": "Where is the boat likely going?", "choices": ["private dock", "shore", "deeper waters", "boathouse"], "correct_choice_idx": 1, "direct_answers": ["shore", "ocean", "island", "into waters", "land", "shore", "sea", "island", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["The boat is trying to get onto the sand.", "The boat goes to shore.", "Given the direction everyone on board is facing and the nearby beach, it's like this answer. it could be c as well."], "image": "train2014/COCO_train2014_000000238004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159296, "question_id": "Fh8Lg6HwnHpwPFHd23b9gv", "question": "What color is the banana to the right of the paper cup containing beverage?", "choices": ["black", "brown", "yellow", "green"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The banana has dark spots but is not black or brown overall. the banana is not green.", "That is a common color for that kind of fruit. there are some blotches of brown, but it's mostly a sunny color.", "It's a little overripe but still has this color"], "image": "train2014/COCO_train2014_000000159296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239254, "question_id": "Fi7uqfA6ruC9EvoRDLmrdY", "question": "What is on the animal in the foreground's neck?", "choices": ["hat", "scarf", "bowtie", "hair"], "correct_choice_idx": 3, "direct_answers": ["hair", "hair", "zebra", "hair", "warthog", "mane", "zebra", "cow", "mane", "stripes"], "difficult_direct_answer": false, "rationales": ["The animal has hair on their neck.", "The animal has hair.", "There is a lot of hair around the animal's neck."], "image": "train2014/COCO_train2014_000000239254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394623, "question_id": "FmBtYYJ6Dbjr2g3ZyNZhFJ", "question": "What is the countertop in the middle called?", "choices": ["bar", "cart", "island", "kitchen table"], "correct_choice_idx": 2, "direct_answers": ["island", "island", "island", "island", "countertop", "counter", "island", "island", "island", "eating island"], "difficult_direct_answer": false, "rationales": ["The counter is an island.", "The table is located only at the kitchen and is used for placing kitchen items when carrying out kitchen activities.", "The counter is an island."], "image": "train2014/COCO_train2014_000000394623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317679, "question_id": "Fod28uWS8G2wDTFXd8fAup", "question": "What are the horses doing?", "choices": ["eating", "running", "fighting", "jumping"], "correct_choice_idx": 0, "direct_answers": ["grazing", "eating food", "eating", "grazing", "eating grass", "grazing", "grazing", "grazing", "eating", "car"], "difficult_direct_answer": false, "rationales": ["They are in a field.", "The horses are standing in a grassy field. their heads are down near the grass.", "The horses are grazing where they bend down to consume grass for food."], "image": "train2014/COCO_train2014_000000317679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150410, "question_id": "FokAsoKi9jwXut9Jt6ZTFm", "question": "What is the man behind?", "choices": ["cardboard box", "wrestling ring", "basketball", "replica airplane"], "correct_choice_idx": 3, "direct_answers": ["car", "plane", "airplane", "pilot", "plane", "plane", "plane", "replica airplane", "airplane", "model airplane"], "difficult_direct_answer": false, "rationales": ["The man is prone behind a replica airplane.", "The thing on the ground in front of the man is shaped like an airplane, but it is smaller.", "The man is behind a plane."], "image": "val2014/COCO_val2014_000000150410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103033, "question_id": "FpurUVTnGoz2dMgMJ9QWcB", "question": "What color is the juice in the long container to the left of the pie?", "choices": ["grape juice", "orange juice", "grapefruit juice", "apple juice"], "correct_choice_idx": 1, "direct_answers": ["orange", "orange", "orange", "orange juice", "yellow", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The juice in the container is orange.", "The color is orange.", "It's orange in color."], "image": "train2014/COCO_train2014_000000103033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457235, "question_id": "FqXYBmaTRXho7kTeYuBe6z", "question": "What is at the foot of the bed?", "choices": ["cat", "baby", "dog", "shoes"], "correct_choice_idx": 3, "direct_answers": ["shoes", "shoes", "shoes", "teddy remote", "shoes", "bed rail", "shoes", "shoes", "shoes", "shoes"], "difficult_direct_answer": false, "rationales": ["The shoes are present.", "The foot of the bed is the bottom or end of the bed. on the floor at end of the bed is a pair of loafers.", "The foot of the bed is clearly visible with only one object set near it. the objects are of a size, shape and design consistent with answer a."], "image": "train2014/COCO_train2014_000000457235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 46852, "question_id": "FqeMsc3oZEm4XadoKKPbqR", "question": "What is in the glass?", "choices": ["false teeth", "egg", "apple", "toothbrush"], "correct_choice_idx": 3, "direct_answers": ["toothbrush", "toothbrush", "toothbrush", "toothbrush", "dwdw", "brush", "toothbrush", "toothbrush", "toothbrush", "toothbrush"], "difficult_direct_answer": false, "rationales": ["It is used for teeth.", "The object in the glass has a shape, size and design consistent with answer a.", "The item is in a bathroom. it has a shaft and bristles."], "image": "train2014/COCO_train2014_000000046852.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434618, "question_id": "Fqqi9fXUqUmNyq8zga8Jbn", "question": "How many items are meant to be worn directly over the eyes?", "choices": ["eight", "five", "three", "seven"], "correct_choice_idx": 2, "direct_answers": ["three", "four", "three", "three", "three", "googles", "three", "fefefe", "one", "one"], "difficult_direct_answer": false, "rationales": ["For this many people, it would make sense that the total is three. the question is actually not well written.", "Two have goggles and one has eyeglasses", "There are three items."], "image": "val2014/COCO_val2014_000000434618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 177472, "question_id": "FsNPmGwKxkqtoQuzMnvTvU", "question": "What place sells these items?", "choices": ["taco bell", "home depot", "sonic", "staples"], "correct_choice_idx": 2, "direct_answers": ["restaurant", "roadside food", "mcdonalds", "restaurant", "fast food", "restaurant", "street", "restaurants", "restaurant", "sonic"], "difficult_direct_answer": false, "rationales": ["Sonic sells hot dogs and fries.", "Sonic is known to sell hot dogs. the other options do not sell hot dogs.", "These items are a hot dog and french fries. home depot and staples are not restaurants, and taco bell does not sell hot dogs."], "image": "train2014/COCO_train2014_000000177472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247318, "question_id": "FsmXWd4C7fTsF7TM4X8PqU", "question": "What room is beyond the countertops?", "choices": ["living room", "office", "bathroom", "bedroom"], "correct_choice_idx": 0, "direct_answers": ["cabinets", "living room", "living room", "living room", "kitchen", "living room", "living room", "living room", "living", "living room"], "difficult_direct_answer": false, "rationales": ["The countertops are in the kitchen. the kitchen separates the kitchen and living room.", "A room with a floor lamp and a fireplace can be seen past a kitchen counter.", "The room is a living room."], "image": "train2014/COCO_train2014_000000247318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236593, "question_id": "FtU9mLh8MwquUqhzAS5KT8", "question": "What color is the sign hung in the middle of the electricity pole next to the street?", "choices": ["black", "green", "white", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "green", "red", "green", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["It's a stop sign and this is a universal color", "The color is red.", "The sign in the middle of the electricity pole is a standard stop sign."], "image": "train2014/COCO_train2014_000000236593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183236, "question_id": "FtYNbu6ciKdb5YrBKB68GY", "question": "How many computer displays are sat on top of this desk?", "choices": ["four", "three", "two", "one"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3 displays.", "There are three computer displays on top of the desk.", "There are three displays."], "image": "train2014/COCO_train2014_000000183236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537667, "question_id": "FtyQ3KLFoHB87jgJ3G92tF", "question": "What person is sliding?", "choices": ["jackie chan", "jackie brown", "jackie kennedy", "jackie robinson"], "correct_choice_idx": 3, "direct_answers": ["jackie robinson", "baseball player", "baseball", "joe black", "catcher", "runner", "baseball player", "jackie robinson", "baseball player", "hank aron"], "difficult_direct_answer": false, "rationales": ["This is jackie robinson.", "The man sliding is the name of him.", "He's a famous dodger player"], "image": "train2014/COCO_train2014_000000537667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521203, "question_id": "Fu5yGmJWsccmxR5GxFJCs4", "question": "What animal is depicted on the white item in the water?", "choices": ["fish", "elephant", "horse", "snake"], "correct_choice_idx": 2, "direct_answers": ["horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The animal has four legs, a mane, and a tail. fish and snakes do not have legs, and elephants do not have manes.", "The animal is a horse.", "There is a horse depicted on top of the sail."], "image": "train2014/COCO_train2014_000000521203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441453, "question_id": "Fubgp2mCqMEYhUAiHdtBxW", "question": "What is likely her favorite animal?", "choices": ["cat", "dog", "pig", "sheep"], "correct_choice_idx": 2, "direct_answers": ["bunny", "people", "mouse", "bear", "dog", "pig", "na", "bear", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The animal is the pig.", "The girl is wearing a sweatshirt with this animal.", "The animal is the pig."], "image": "val2014/COCO_val2014_000000441453.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 420532, "question_id": "Fuh98sQTFABqmxcHV4NauJ", "question": "What are the two objects on each side of the man's red tie?", "choices": ["suspenders", "chains", "lapels", "vest"], "correct_choice_idx": 0, "direct_answers": ["suspenders", "suspenders", "suspenders", "money bottle", "suspenders", "suspenders", "suspenders", "gun holster", "suspenders", "suspenders"], "difficult_direct_answer": false, "rationales": ["They're a matching red and hold up his pants.", "He is holding his pants up.", "The main is wearing a white shirt. it is accented with a red tie and red suspenders."], "image": "val2014/COCO_val2014_000000420532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414115, "question_id": "Fuhh7pvHSqib74PePvc3tU", "question": "What station is on the television?", "choices": ["cnn", "fox", "tbs", "bloomberg"], "correct_choice_idx": 3, "direct_answers": ["bloomberg", "bloomberg", "bloomberg", "bloomberg", "nbbc", "news", "news", "news", "bloomberg", "bloomberg"], "difficult_direct_answer": false, "rationales": ["The screen says bloomberg.", "It's the owner's surname.", "A logo can be seen on the screen of a television."], "image": "train2014/COCO_train2014_000000414115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338146, "question_id": "FumHhxNsKBUNhGDRaymj7o", "question": "Where would a kitchen like this be located?", "choices": ["kitchenette", "dorm room", "restaurant", "mountain cabin"], "correct_choice_idx": 2, "direct_answers": ["house", "house", "house", "restaurant", "foriegn", "house", "house", "china", "restaurant", "restaurant"], "difficult_direct_answer": false, "rationales": ["It is heavy duty.", "A large kitchen like this should be in a restaurant.", "This is a commercial kitchen likely used in a restaurant."], "image": "val2014/COCO_val2014_000000338146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7049, "question_id": "FwpDK65uHJ3J3QBymRPJSg", "question": "Who is holding the umbrella?", "choices": ["marsupial", "old man", "toddler", "old lady"], "correct_choice_idx": 3, "direct_answers": ["elderly woman", "old lady", "woman", "old lady", "old lady", "woman", "women", "woman", "old woman", "woman"], "difficult_direct_answer": false, "rationales": ["The elderly woman in the black jacket is holding the umbrella.", "The woman holds it.", "An old lady is holding the umbrella."], "image": "train2014/COCO_train2014_000000007049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377005, "question_id": "Fxak2WNiz2KQjcQkZ3oHH6", "question": "What is the cat on top of?", "choices": ["box", "book shelf", "refrigerator", "dog"], "correct_choice_idx": 1, "direct_answers": ["bookshelf", "shelf", "book shelf", "table", "bookshelf", "shelf", "book case", "bookshelf", "bookcase", "book shelf"], "difficult_direct_answer": false, "rationales": ["A cat is laying in a shelving unit with books on the shelf below.", "The cat is on top of a bookshelf.", "The cat is sitting on a piece of furniture used to store printed and bound documents. the item in a performs that function."], "image": "val2014/COCO_val2014_000000377005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179025, "question_id": "FxjeAPYi4VnwqP4eWQTjRX", "question": "What athlete's last name appears on the poster?", "choices": ["bo jackson", "randy couture", "wayne gretzky", "jim those"], "correct_choice_idx": 1, "direct_answers": ["poster", "couture", "couture", "couture", "unknown", "randy couture", "couture", "couture", "couture", "couture"], "difficult_direct_answer": false, "rationales": ["The last name couture is on the posture. though this was first, it shares the same with a pro football player.", "The name on the poster is not those, jackson, or gretzky.", "The name of the athlete is written on top of the poster."], "image": "train2014/COCO_train2014_000000179025.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135620, "question_id": "Fy8VnxpYzWUWSajhhvxkEz", "question": "How many people are holding onto parasail and sailing into the ocean?", "choices": ["two", "four", "three", "one"], "correct_choice_idx": 0, "direct_answers": ["three", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["They each have one", "There are two people riding parasails in front of the boat in the ocean.", "The parasails are clearly visible based on the sail and the boards. the riders on the parasails are clearly visible and countable."], "image": "val2014/COCO_val2014_000000135620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580625, "question_id": "G33hv3BHMr3VY5e6HHKzeW", "question": "How many seats are put on top of the red sofa up against the wall?", "choices": ["four", "one", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "many"], "difficult_direct_answer": false, "rationales": ["The couch is clearly visible and the sections of the couch that correspond to seating are countable.", "A red couch has three cushions on the bottom and across the back.", "There are 3."], "image": "train2014/COCO_train2014_000000580625.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380126, "question_id": "G38RJWyEzufqjfUTCFRpCs", "question": "What are the brightest lights attached to?", "choices": ["arena entrance", "ceiling", "computer", "car"], "correct_choice_idx": 1, "direct_answers": ["ceiling", "ceiling", "wood beams", "ceiling", "ceiling", "ceiling", "kitchen", "chandelier", "ceiling", "ceiling"], "difficult_direct_answer": false, "rationales": ["The lights are on the ceiling.", "It is attached to the ceiling to see in the whole area.", "The lights are on the ceiling."], "image": "train2014/COCO_train2014_000000380126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183319, "question_id": "G3tmkcFNPK8dzePizsF7Ln", "question": "What country most frequently uses wine glasses this shape?", "choices": ["japan", "china", "usa", "france"], "correct_choice_idx": 3, "direct_answers": ["spain", "rome", "italy", "usa", "america", "france", "mexico", "france", "france", "france"], "difficult_direct_answer": false, "rationales": ["This glass is used greatly in france.", "The country is france.", "France uses wine glasses."], "image": "val2014/COCO_val2014_000000183319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71281, "question_id": "G4FrPyHyYoEpLVWM4agBRB", "question": "What is the type of fencing used to contain all of these sheep?", "choices": ["wood", "iron", "wire", "electric"], "correct_choice_idx": 2, "direct_answers": ["wire fencing", "grass", "barbed wire", "wire", "barbed wire", "coat", "wire", "wire", "wire", "barbwire"], "difficult_direct_answer": false, "rationales": ["The sheep are enclosed in an area surrounded by barbed wire fencing.", "The fencing is wire.", "There are barbs on the metal strung across"], "image": "val2014/COCO_val2014_000000071281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514930, "question_id": "G53BLdxj3gBc4WKcs6EntT", "question": "What kind of an expo is this?", "choices": ["comic book", "anime", "video game", "museum"], "correct_choice_idx": 2, "direct_answers": ["video game", "technology", "tech", "video game", "gaming", "video game", "video game", "game", "software", "tech"], "difficult_direct_answer": false, "rationales": ["There are people with game controllers.", "Many people are standing around a public place with game controllers in their hands.", "The people are looking at all the new video games."], "image": "train2014/COCO_train2014_000000514930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548362, "question_id": "G5NjzbePwBDFdr4q4fkxDE", "question": "What is consuming the pink donut?", "choices": ["old man", "old woman", "little girl", "cat"], "correct_choice_idx": 2, "direct_answers": ["little girl", "small girl", "girl", "child", "child", "girl", "girl", "donut", "girl", "little girl"], "difficult_direct_answer": false, "rationales": ["The girl has it.", "A small child with long hair is eating a pastry with pink frosting.", "The girl has the pink donut."], "image": "train2014/COCO_train2014_000000548362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137056, "question_id": "G5ufve5CwjbFdvRaAjtXwu", "question": "What movie character fits in with these animals?", "choices": ["benji", "mr ed", "garfield", "dumbo"], "correct_choice_idx": 3, "direct_answers": ["dumbo", "dumbo", "dumbo", "elephant", "dumbo", "dumbo", "dumbo", "simba", "dumbo", "dumbo"], "difficult_direct_answer": false, "rationales": ["The character is dumbo.", "The animals are elephants.", "This is an elephant and dumbo was an elephant as known by his big floppy ears. these animals have the same color and ears as dumbo."], "image": "train2014/COCO_train2014_000000137056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65156, "question_id": "G69LHoacmFaygSwXGPwTZD", "question": "How many birds are standing inside of the river with the zebras on the island?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 3, "direct_answers": ["two", "one", "one", "one", "one", "one", "one", "few", "one", "one"], "difficult_direct_answer": false, "rationales": ["One bird can be seen in the water.", "There is a white egret standing in the stream.", "There is only one bird at the river."], "image": "train2014/COCO_train2014_000000065156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399378, "question_id": "G8JN58mYkbcEqxEZ4umHeN", "question": "Which country most likely houses this construction for the park?", "choices": ["romania", "germany", "france", "italy"], "correct_choice_idx": 1, "direct_answers": ["germany", "germany", "berne", "switzerland", "switzerland", "germany", "united states", "canada", "germany", "california"], "difficult_direct_answer": false, "rationales": ["The country is germany.", "The word berne appears on the side of the building.", "The clock tower has letters that spell out berne which is a place in germany."], "image": "train2014/COCO_train2014_000000399378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 17076, "question_id": "G99yRnjscMTdwqzk9gDPNw", "question": "What food is on the plate in the middle?", "choices": ["lemon", "carrot", "pizza", "lime"], "correct_choice_idx": 1, "direct_answers": ["vegetable", "vegetables", "crudite", "vegetables crackers", "salad", "carrot", "vegetables", "vegetables", "carrot", "vegetables"], "difficult_direct_answer": false, "rationales": ["This vegetable is long and orange.", "One of the foods is long and sticklike and orange in color.", "There are carrots because they are cut in bayonets and are orange"], "image": "train2014/COCO_train2014_000000017076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305093, "question_id": "GADXPiwHRkGGGBucAwJAd5", "question": "What sound does the animal make?", "choices": ["neigh", "woof", "moo", "meow"], "correct_choice_idx": 1, "direct_answers": ["woof", "bark", "woof", "woof", "woof", "bark", "bark", "barking", "bark", "woof"], "difficult_direct_answer": false, "rationales": ["The dog, when it \"speaks\" will always \"woof\" or \"bark\" or \"growl\", but it never clucks or moos!.", "A dog is sitting on the floor in a family room of a home.", "The sound is a woof."], "image": "train2014/COCO_train2014_000000305093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361153, "question_id": "GAfLjPj2jZRfmL7aawPimF", "question": "What is about to be dipped?", "choices": ["cheese", "broccoli", "dunkaroos", "swimmer"], "correct_choice_idx": 1, "direct_answers": ["bok choy", "vegetables", "broccoli", "sauce", "bok choy", "vegetables", "vegetable trees", "vegetables", "sauce", "celery"], "difficult_direct_answer": false, "rationales": ["There are no people on the table. the food items are green and are not dunkaroos or cheese.", "Pieces of chopped broccoli are on a board next to a dish of dipping sauce.", "The broccoli on the right is cut in strips and to be dipped in the sauce on the left."], "image": "val2014/COCO_val2014_000000361153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36953, "question_id": "GCQ9fim88XUhK2v25yHQkD", "question": "What is the cylindrical object on the table?", "choices": ["peppermill", "pepperoni", "tea strainer", "utensil crock"], "correct_choice_idx": 0, "direct_answers": ["apple", "grinder", "mug", "candle", "not load", "pepper shaker", "vase", "kaleidoscope", "peppermill", "pepper grinder"], "difficult_direct_answer": true, "rationales": ["The cylindrical object on the table is a peppermill used for grinding pepper.", "The object is for pepper.", "It has pepper."], "image": "train2014/COCO_train2014_000000036953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112887, "question_id": "GDFxJXGjWT47oKQMR96iFS", "question": "What type of setting is this?", "choices": ["appetizer", "salad", "main course", "charcuterie"], "correct_choice_idx": 3, "direct_answers": ["free form", "restaurant", "meal", "appetizer", "lunch", "sallet", "charcuterie", "lunch", "party", "breakfast"], "difficult_direct_answer": true, "rationales": ["The setting is charcuterie.", "The board is for charcuterie.", "The large plate has a variety of foods represented all cut and arranged in an appealing way that would be consistent with answer a."], "image": "train2014/COCO_train2014_000000112887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474390, "question_id": "GDqrvtGp6fPVDFK7zDHX2d", "question": "How many different airlines are being shown here?", "choices": ["one", "three", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["eight", "three", "three", "three", "three", "two", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are planes with air canada, continental airlines, and delta air lines liveries.", "There are two different symbols on the planes.", "A group of planes are parked in various spots around an airport and three different emblems can be seen on the tails of the planes."], "image": "train2014/COCO_train2014_000000474390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445556, "question_id": "GEd27zE96hiwUucLcwyFUe", "question": "What is the white object under the window?", "choices": ["table", "air conditioner", "radiator", "vent"], "correct_choice_idx": 2, "direct_answers": ["radiator", "heater", "gate", "diswaher", "radiator", "radiator", "table", "radiator", "gate", "radiator"], "difficult_direct_answer": false, "rationales": ["The white object under the window is a radiator that emits heat during the winter months.", "The object that is under the window is metal.", "A white, metal object is under a window in a home."], "image": "train2014/COCO_train2014_000000445556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129006, "question_id": "GExvbTqRUZhTYC5BTEKna8", "question": "Why is the bagel in there?", "choices": ["warming", "burning", "melting", "toasting"], "correct_choice_idx": 3, "direct_answers": ["microwave", "toasting", "toasting", "to cook", "warm up", "toasting", "to warm", "toast", "being toasted", "toasting"], "difficult_direct_answer": false, "rationales": ["The bagels are toasting.", "The bagel toaster toasts bagels.", "It is being cooked."], "image": "train2014/COCO_train2014_000000129006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114653, "question_id": "GEyn9SwMspXxjNeCaamoCA", "question": "What is the skateboard on?", "choices": ["wood floor", "grass", "crate", "stair railing"], "correct_choice_idx": 3, "direct_answers": ["on steel", "railing", "stair railing", "railing", "park", "rail", "rail", "railing", "rail", "railing"], "difficult_direct_answer": false, "rationales": ["The support for the skateboard is tubular and metal and above the stairs. it is traditional for athletics to slide down this object with their skateboards.", "A skateboarder is griding on a hand rail that is on a stairway.", "A skateboarder is grinding a rail at a skatepark."], "image": "train2014/COCO_train2014_000000114653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559440, "question_id": "GFYTPKArX2AcMNx8s856Ug", "question": "What controls the flushing on the toilet to the right side of the bathroom?", "choices": ["string", "panel", "lever", "button"], "correct_choice_idx": 3, "direct_answers": ["button", "silver button", "button", "button", "button", "button", "button", "botton", "button", "tap"], "difficult_direct_answer": false, "rationales": ["The button controls it.", "There is a button on top of the toilet that controls the flush.", "The control is a push mechanism, the activity performed by the item in option a."], "image": "val2014/COCO_val2014_000000559440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272961, "question_id": "GGD9hzJLngQvxhPqK5XyLX", "question": "What are the horses doing?", "choices": ["swimming", "sleeping", "standing", "flying"], "correct_choice_idx": 2, "direct_answers": ["trotting", "galloping", "walking", "walking", "standing", "running", "walking", "walking", "running", "walking"], "difficult_direct_answer": false, "rationales": ["The horses are standing in the grass.", "The horses stand.", "These horses hoofs are upright and their heads are up. they are standing."], "image": "val2014/COCO_val2014_000000272961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370583, "question_id": "GLFJR7vdDwKytwttN3NUs6", "question": "What material outlines the enclosure for these giraffes?", "choices": ["wire", "cement", "stone", "electrified wire"], "correct_choice_idx": 0, "direct_answers": ["steel", "fence", "wire", "eat", "metal", "car", "metal", "wire", "fence", "fencing"], "difficult_direct_answer": false, "rationales": ["The material is wire.", "There is a gridded lines running along the fence. it can be seen thru and helps keep the animals from escaping or putting face thru.", "Giraffes are in a fenced in enclosure."], "image": "train2014/COCO_train2014_000000370583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414067, "question_id": "GLPe3i4mUyjZWKsMorL9tT", "question": "What is around the animal in the foreground's neck?", "choices": ["tag", "medal", "ribbon", "scarf"], "correct_choice_idx": 0, "direct_answers": ["tag", "tag", "stop", "tag", "number", "tag", "tag", "tag", "stone pillar", "white"], "difficult_direct_answer": false, "rationales": ["The animal in the foreground has a yellow tag around its neck for identifcation.", "The animal in the foreground is a sheep and the size, shape and color of the object around its neck is consistent with answer a which would be found commonly on a sheep.", "This is so it can be identified if it gets loose"], "image": "train2014/COCO_train2014_000000414067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40398, "question_id": "GN9xYE9CiZ6sebWDLnmY7T", "question": "What is the man feeding?", "choices": ["cat", "dog", "elephant", "goat"], "correct_choice_idx": 1, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "his dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["A dog is standing next tot the table being fed table scraps by a man", "A small, domestic animal with floppy ears is standing between two people eating at a patio table outdoors.", "A man is sitting at a table with food dishes covering it. the man is holding out his hand with food in it to a dog standing at the table."], "image": "train2014/COCO_train2014_000000040398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271357, "question_id": "GQxhmKj8rqi4wx5MKfQuEZ", "question": "What is similar to the long things on the animal's face?", "choices": ["antenna", "gills", "unicorn horn", "fins"], "correct_choice_idx": 0, "direct_answers": ["antennae", "whisker", "stripes", "stripes", "whiskers", "worms", "ears", "hair", "whiskers", "antenna"], "difficult_direct_answer": false, "rationales": ["The antenna is similar.", "There are whiskers on the cat which are like antennas.", "Antenna are similar."], "image": "train2014/COCO_train2014_000000271357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433987, "question_id": "GRf5F2jGQdBJLpM5GBbq9G", "question": "What is the little white box on the wall?", "choices": ["light switch", "thermostat", "intercom", "outlet"], "correct_choice_idx": 1, "direct_answers": ["vent cover", "thermostat", "thermostat", "thermostat", "thermostat", "thermometer", "fire", "red", "temperature gauge", "thermostat"], "difficult_direct_answer": false, "rationales": ["A small white, square device is attached to a wall in a home and has a digital screen on it.", "The device on the wall is used to control the temperature in the room.", "It's used to control the room temperature."], "image": "train2014/COCO_train2014_000000433987.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427129, "question_id": "GSTe9QdDhJMQvWB6F8ucD4", "question": "How many giraffes are standing together on the rocks next to this zebra?", "choices": ["five", "four", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "car", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two giraffes standing together in front of the zebra and the rocks.", "There are a couple of the giraffes standing together.", "There are 2."], "image": "train2014/COCO_train2014_000000427129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183166, "question_id": "GSkrEJdcbgTcYPtNmSwuv5", "question": "What is a strange thing to see on a motorcycle?", "choices": ["kids driving", "umbrella", "dog", "two sidecars"], "correct_choice_idx": 1, "direct_answers": ["umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["The umbrella is strange.", "The umbrella is strange.", "It usually hard to hold something while riding a motorcycle."], "image": "val2014/COCO_val2014_000000183166.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496552, "question_id": "GSpXsgaBhG7sBJhRTi374o", "question": "What animal can usually be found here?", "choices": ["elk", "tiger", "fish", "koala bear"], "correct_choice_idx": 2, "direct_answers": ["fish", "fish", "crabs", "fish", "fish", "fish", "crabs", "crabs", "fish", "fish"], "difficult_direct_answer": false, "rationales": ["The beach is surrounded by water that is full of different kinds of fish.", "The beach has lots of water where fish and other animals live.", "The animal is a fish."], "image": "train2014/COCO_train2014_000000496552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238928, "question_id": "GU5PPmW4aPGBCQVYjrxJaR", "question": "What is the person with the whip riding on?", "choices": ["donkey", "horse", "goat", "camel"], "correct_choice_idx": 3, "direct_answers": ["camel", "camel", "camel", "camel", "camel", "camel", "camel", "camel", "camel", "camel"], "difficult_direct_answer": false, "rationales": ["The person is the camel.", "The scene seems to be a desert and the animal from the genus camelus family would be used before any others.", "The animal is walking in the desert and has a hump. the animal in option a matches the description."], "image": "train2014/COCO_train2014_000000238928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222599, "question_id": "GUommzy2YsaFgtu295MtA4", "question": "How many little giraffes are with the big giraffe here?", "choices": ["one", "five", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "onet", "three", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two young giraffes next to the one adult.", "Two of the animals are smaller than the larger one.", "There are 2."], "image": "train2014/COCO_train2014_000000222599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222441, "question_id": "GVEVKkb49opwTddCdzgGSs", "question": "What is the largest number that can be created using any two numbers on the train on the right?", "choices": ["98", "90", "93", "31"], "correct_choice_idx": 2, "direct_answers": ["29100", "ninety three", "twelve", "nine", "nine", "twenty nine", "93", "ninety three", "ink", "93"], "difficult_direct_answer": false, "rationales": ["Out of the numbers: two, nine, zero, one, and three, the highest two numbers would be ninety-three.", "That is the largest number on the train.", "The largest number is a 9 and the next largest is a 3"], "image": "train2014/COCO_train2014_000000222441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1355, "question_id": "GVPZPKV8v8tEM2pJptYycf", "question": "What is the white rectangle on the island for?", "choices": ["cutting", "decoration", "mixing", "warming"], "correct_choice_idx": 0, "direct_answers": ["lighting", "cutting", "cutting", "cutting vegetables", "cutting board", "cutting", "storing food", "cutting", "cutting", "cutting board"], "difficult_direct_answer": false, "rationales": ["A large, flat object is on a counter next to a stove in a kitchen.", "There is a white rectangular cutting board.", "It protects the counter."], "image": "train2014/COCO_train2014_000000001355.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534894, "question_id": "GXi4jfyw3vcJkDJv2vUiwf", "question": "What vehicle is on the left hand side?", "choices": ["motorcycle", "van", "bicycle", "tank"], "correct_choice_idx": 1, "direct_answers": ["van", "truck", "van", "van", "van", "van", "van", "van", "van", "van"], "difficult_direct_answer": false, "rationales": ["The vehicle is a van.", "The vehicle is a van.", "There is a white van in the background."], "image": "val2014/COCO_val2014_000000534894.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224220, "question_id": "GbapkqfsrknJpjhVfrZWFT", "question": "What is the green item?", "choices": ["frog", "antelope", "lizard", "oar"], "correct_choice_idx": 3, "direct_answers": ["paddle", "paddle", "oar", "support", "green", "oar", "boat", "oar", "oars", "oars"], "difficult_direct_answer": false, "rationales": ["The green device with black handle sticking out either side of this boat is for rowing and maneuvering the boat through water.", "It's a paddle to propel the boat.", "The boat has green oars. the oars are used to steer the boat."], "image": "val2014/COCO_val2014_000000224220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117918, "question_id": "Gc8itZ8d5QgJoPMzFNHKm2", "question": "On how many sides has the skin been removed from the cucumber?", "choices": ["two", "four", "three", "zero"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "7removed", "six", "two", "two", "two", "three", "three", "one"], "difficult_direct_answer": false, "rationales": ["There are cuts on part of the cucumber to remove the rind", "There are three sides in which the skin has been removed from the cucumbers. the sides are missing parts.", "There are multiple pieces removed from sides of it."], "image": "train2014/COCO_train2014_000000117918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457691, "question_id": "Gc8mfDCMgQqbQopm7dNi7E", "question": "What is next to the tower?", "choices": ["wheelbarrow", "tree", "ladder", "statue"], "correct_choice_idx": 1, "direct_answers": ["building", "trees", "stop", "trees", "trees", "path", "tree", "car", "tree", "buildings"], "difficult_direct_answer": false, "rationales": ["There are a few trees near the tower.", "The tree is next to it.", "There are trees near the tower."], "image": "val2014/COCO_val2014_000000457691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260430, "question_id": "Gd8X8k7MGfupVeNmhogTqs", "question": "How many different people are pictured in the photograph?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "one", "four", "four", "four", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["A snowboarder is moving down a mountain.", "There is one person shown.", "It is of the same person of different stills. the same clothing is seem on each person."], "image": "train2014/COCO_train2014_000000260430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168316, "question_id": "GdMtHBAFDRhA35hX57qD4s", "question": "What is on the table?", "choices": ["elephant", "baby stroller", "kitten", "stuffed doll"], "correct_choice_idx": 3, "direct_answers": ["toys", "stuffed animals", "mice", "teddy bears", "toys", "plush animals", "stuffed doll", "stuffed animals", "stuffed animals", "bears"], "difficult_direct_answer": false, "rationales": ["Plush animals are on display on a table.", "A variety of stuffed animals and dolls sit on the table along with a variety of other playful objects.", "Stuffed dolls are on the table. there are many shapes and sizes."], "image": "train2014/COCO_train2014_000000168316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331738, "question_id": "GeRJEgTLr4d9qU4HgVZ5ig", "question": "What nation's flag is on the tail fin of the aircraft going to the right?", "choices": ["usa", "uk", "spain", "france"], "correct_choice_idx": 3, "direct_answers": ["netherlands", "france", "france", "france", "russia", "france", "france", "france", "france", "netherlands"], "difficult_direct_answer": false, "rationales": ["There is a yellow plane with a red white and blue strip flag on its tail.", "The flag is french.", "A red, white, and blue flag is painted onto the end of an aircraft."], "image": "train2014/COCO_train2014_000000331738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94651, "question_id": "GgXUZ27VtdXhpdQFX7QZsd", "question": "What color is the animal on the right's nose?", "choices": ["orange", "black", "green", "pink"], "correct_choice_idx": 3, "direct_answers": ["pink", "pink", "pink", "pink", "pink", "pink", "pink", "pink", "white", "white"], "difficult_direct_answer": false, "rationales": ["Both cats have pink noses.", "This is a cat. cats typically have pink noses.", "The color is pink."], "image": "val2014/COCO_val2014_000000094651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80745, "question_id": "Ggrk9wyjf83iEuDanY8Jb9", "question": "What color is the fruit on the right hand side?", "choices": ["black", "red", "purple", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "green", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The fruit on the right hand side is bright yellow.", "Although both sides show bananas, only the ones on the right-hand side are ripe.", "There are only two colors of fruit shown, green and yellow. yellow is on the right."], "image": "train2014/COCO_train2014_000000080745.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279477, "question_id": "GhRvLJ5U7hnrxSx6aj4UUD", "question": "How many single-seated chairs are below and free underneath of the table?", "choices": ["three", "five", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "dowd", "two"], "difficult_direct_answer": false, "rationales": ["There are 2 chairs.", "There are three seats and only two qualify as single-seated chairs.", "The third seat is a bench."], "image": "train2014/COCO_train2014_000000279477.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562144, "question_id": "GhxG5nrqR4GzvYnBVKJwVY", "question": "What color is the backpack worn by the man in the gray jacket?", "choices": ["red", "green", "pink", "purple"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "burgundy"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The color is red.", "The backpack he is carrying is this dark color."], "image": "train2014/COCO_train2014_000000562144.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62210, "question_id": "Gi9AkNwts5uWXgLoXUNhhX", "question": "Why is the toilet paper on the toilet tank?", "choices": ["bidet", "decoration", "no holder", "forgotten"], "correct_choice_idx": 2, "direct_answers": ["no holder", "accessibility", "plastic", "clean", "westertoilet", "spare", "no holder", "extra roll", "no holder", "to wipe"], "difficult_direct_answer": false, "rationales": ["There isn't any other place to put it.", "The toilet paper roll is sitting on the tank because there isn't a holder in the bathroom to place it on.", "There is a roll of toilet paper on the toilet tank because there is not a toilet paper holder in the bathroom."], "image": "val2014/COCO_val2014_000000062210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58281, "question_id": "Gipydr9BgzQf6YqTjEk8oL", "question": "How many animals are in this picture?", "choices": ["two", "six", "one", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "two", "four", "five", "four", "four", "four", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two giraffes and two birds.", "There are two giraffes.", "There are four animals in the picture all together including two birds and two giraffes."], "image": "train2014/COCO_train2014_000000058281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5111, "question_id": "GjQAJWoA59rY8Vhfqm7s5m", "question": "What type of building is this bathroom in?", "choices": ["garden apartment", "barn", "highrise", "duplex"], "correct_choice_idx": 2, "direct_answers": ["apartment building", "highrise", "glass", "apartment", "high rise", "apartment", "apartment", "apartment", "hi-rise", "apartment"], "difficult_direct_answer": false, "rationales": ["The building is a highrise.", "Other skyscrapers are visible through the window.", "The building is a highrise."], "image": "train2014/COCO_train2014_000000005111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578705, "question_id": "Gjeq5ngUq2hH2XHwQSiMS7", "question": "What is aimed at the man on the motorcycle?", "choices": ["camera", "antique blunderbuss", "paintball", "ruler"], "correct_choice_idx": 0, "direct_answers": ["camera", "phone", "camera", "camera", "camera", "camera", "camera", "race", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The camera is aimed.", "The camera is aimed.", "There is a camera aimed at the man driving the motorcycle."], "image": "train2014/COCO_train2014_000000578705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512838, "question_id": "GmBsEnrjLivzou6KqSTf8u", "question": "What object is in the center of the chips stand logo?", "choices": ["ship", "boat", "wheel", "potato"], "correct_choice_idx": 2, "direct_answers": ["nautical wheel", "fish", "ship wheel", "ship's helm", "ships wheel", "boatwheel", "horse", "steering wheel", "steering wheel", "wheel"], "difficult_direct_answer": true, "rationales": ["The object is located based on the text of the question and resembles a commonly known shape and identified by the circle with spokes around it.", "The logo has the characteristics and features of answer a.", "The object is a wheel."], "image": "train2014/COCO_train2014_000000512838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161962, "question_id": "Gmd78ntfKVcvdn34sACite", "question": "What shape is the overgrown grass cut inside of the paths?", "choices": ["rectangle", "oval", "circle", "square"], "correct_choice_idx": 0, "direct_answers": ["rectangle", "rectangle", "round", "green", "rectangle", "rectangle", "oval", "rectangle", "straight", "rectangle"], "difficult_direct_answer": false, "rationales": ["There is a pretty green grass that isn't cut and overgrown by a path. it is in the shape of a rectangle.", "The shape is a rectangle.", "The grass is in a rectangle."], "image": "val2014/COCO_val2014_000000161962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331289, "question_id": "GpibpPbSEMqdD6WSexgehN", "question": "What color are the numbers on the top of the pole with the traffic lights?", "choices": ["red", "yellow", "green", "blue"], "correct_choice_idx": 2, "direct_answers": ["green", "green", "green", "white", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["They are the same color as the light signifying it's safe to go.", "It's the same color as the traffic light that is on right now which is at the very bottom.", "The numbers are painted green."], "image": "val2014/COCO_val2014_000000331289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43202, "question_id": "GprfnYcWcdQ4DZXY5VjVkD", "question": "Some fountains in this city are at least how much taller than an adult person?", "choices": ["10 times", "20 times", "8 times", "4 times"], "correct_choice_idx": 3, "direct_answers": ["some", "4 times", "10 feet", "more", "ten times", "feet", "ten feet", "fifteen feet", "ten feet", "five"], "difficult_direct_answer": true, "rationales": ["Fountains can be seen in town with people standing nearby. the fountains are about twice as tall as the street lights.", "The fountains are that much tall compare to an adult.", "The fountains are four times taller."], "image": "val2014/COCO_val2014_000000043202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531324, "question_id": "Gt8toYh4SZuxHrX5NfmeWR", "question": "What is sold inside of this street store?", "choices": ["beer", "coffee", "pizza", "weed"], "correct_choice_idx": 2, "direct_answers": ["pizza", "jewelry", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["The writing on the window of the store indicates what is sold within and is readable.", "The pizza is sold.", "The store has lettering in the window that shows the dutch word for pizzeria."], "image": "val2014/COCO_val2014_000000531324.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245842, "question_id": "GuCQxyorQHrx4qSrSrHj7X", "question": "What style of skis are worn on the man's back pack?", "choices": ["downhill", "racing", "cross country", "alpine"], "correct_choice_idx": 3, "direct_answers": ["snow skis", "alpine", "snow skis", "snow skiis", "cross country", "short", "nano skis", "downhill", "cross", "snow"], "difficult_direct_answer": true, "rationales": ["The style is alpine.", "Based on the size and shape of the skis and the boots and the fact the skier is climbing up a mountain, they are engaging in the activity associated with answer a and would have skis to match.", "The skis are of a style, shape and size that would be consistent with answer a. the setting that the person intends to use them and their additional equipment are also consistent."], "image": "train2014/COCO_train2014_000000245842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327466, "question_id": "GuaohMJuwDbTL9RAv4ex37", "question": "What color are the round buoys on the left sides of these boats parked at the marina?", "choices": ["orange", "red", "white", "green"], "correct_choice_idx": 1, "direct_answers": ["red", "orange", "white", "pink", "pink", "dwdwd", "red", "red", "red", "orange"], "difficult_direct_answer": false, "rationales": ["They are a little faded but have the same coloring as an apple", "The objects are located based on the question and their color is clearly identified.", "The buoys are clearly visible, and while they may be faded closer to a pinkish color, answer a is the closest approximate from the provided answers."], "image": "val2014/COCO_val2014_000000327466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459401, "question_id": "Gvb2RYdyvcCCa3dwxCzcak", "question": "How many giraffes are lounging around in the wild field of grass?", "choices": ["four", "three", "two", "six"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are 4 giraffes.", "There are four giraffes next to the trees.", "There are four animals in the grass."], "image": "train2014/COCO_train2014_000000459401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208107, "question_id": "Gw6EjyzniNSHXUoz8y9ugK", "question": "What shape are the oars forming?", "choices": ["cross", "star", "circle", "square"], "correct_choice_idx": 0, "direct_answers": ["x", "traingle", "round", "x-shape", "long", "stick", "x", "cross", "cross", "cross"], "difficult_direct_answer": false, "rationales": ["A man is staning on a boat. he has the oars crisscrossed making an x.", "The shape is a cross.", "They are located over one another."], "image": "val2014/COCO_val2014_000000208107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127556, "question_id": "GyMETgxnMfYGrry3iXCVmb", "question": "What kind of food is fed to the small cat?", "choices": ["cake", "crab", "cat treat", "shellfish"], "correct_choice_idx": 0, "direct_answers": ["cake", "bread", "pizza bite", "treat", "bread", "bread", "cracker", "potato", "bread", "no clue"], "difficult_direct_answer": false, "rationales": ["A small baked good is being offered to a cat.", "A grey cat is smelling a treat in the owner's hand.", "This is a baked item"], "image": "val2014/COCO_val2014_000000127556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517410, "question_id": "GyXWhHBBMs8QT2aYWQoUoL", "question": "What types of birds are these?", "choices": ["geese", "ducks", "swans", "chickens"], "correct_choice_idx": 0, "direct_answers": ["geese", "geese", "geese", "geese", "geese", "duck", "geese", "geese", "geese", "geese"], "difficult_direct_answer": false, "rationales": ["The birds are geese.", "They are water birds and have long necks.", "The birds are geese."], "image": "train2014/COCO_train2014_000000517410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187888, "question_id": "H2KiPict9jVt8Bc3m3733b", "question": "How many subsections are there of the waffle on the sheet?", "choices": ["two", "one", "three", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["Waffles are made in a waffle iron. once cooked the waffle is one big piece with 4 subsections making it easier to cut into quarters.", "The waffle is separated in quarters.", "There are four subsections."], "image": "val2014/COCO_val2014_000000187888.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269254, "question_id": "H33RDdeeq3WP8LH96Gm4aM", "question": "What is the cellophane wrapping applied over top of?", "choices": ["helmet", "surfboard", "bike", "package"], "correct_choice_idx": 1, "direct_answers": ["surfboard", "surfboard", "surfboard", "surfboard", "surf", "surfing board", "surfboard", "surf board", "covering", "surfboard"], "difficult_direct_answer": false, "rationales": ["It is a means of water transportation.", "The wrapping is on a surfboard.", "You can tell by it's long shape and flat body. this is characteristic of this type of sports equipment and it is carried in the matter shown."], "image": "val2014/COCO_val2014_000000269254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164935, "question_id": "H3qurELuUEXj9ceJxyLzym", "question": "How many birds are sat atop the zebra's back?", "choices": ["four", "two", "three", "one"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "tender", "two", "two"], "difficult_direct_answer": false, "rationales": ["A pair of birds are perched on a striped animal.", "There's two bodies of feathered creatures on the back of that horse-like animal.", "This is obvious within the scene."], "image": "train2014/COCO_train2014_000000164935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282946, "question_id": "H4m8becUUnUzXn8B4yGp9V", "question": "What occasion is now photographed underneath the clock faces?", "choices": ["sale", "realty", "insurance", "marriage"], "correct_choice_idx": 3, "direct_answers": ["marriage", "wedding", "wedding", "wedding", "engravement", "wedding", "wedding", "wedding", "couple", "wedding"], "difficult_direct_answer": false, "rationales": ["The woman is in a wedding dress.", "There is a woman in a long white dress with a veil", "The woman is wearing a white dress and a veil."], "image": "train2014/COCO_train2014_000000282946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463327, "question_id": "H5oUVZmzjYeL7jYC4MG2sP", "question": "How many portraits are attached to the walls of the living room?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "one", "two", "two", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is a larger portrait handing over a couch be a standing lamp and another to right of window.", "They are hanging on the wall.", "There are 2."], "image": "train2014/COCO_train2014_000000463327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2459, "question_id": "H6wRzKqyE27xqCT4n7whf6", "question": "What color is the hair of the woman working on this laptop?", "choices": ["brown", "red", "blonde", "black"], "correct_choice_idx": 1, "direct_answers": ["red", "compptr", "red", "black", "black", "brown", "black", "watching", "red", "brunette"], "difficult_direct_answer": false, "rationales": ["The hair is an auburn that is typical of people with this color hair. redheads also have fair skin like this women.", "The color is red.", "The woman's hair is not black, brown, or blonde."], "image": "train2014/COCO_train2014_000000002459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425242, "question_id": "H74UDiQrPdMMHMHGdtHe8V", "question": "What is the man staring at?", "choices": ["apple", "television", "baby", "monkey"], "correct_choice_idx": 1, "direct_answers": ["tv", "tv screen", "tv", "television game", "monitor", "television", "tv", "tv", "tv", "television"], "difficult_direct_answer": false, "rationales": ["The other options aren't in this scene. and he's obviously looking at a.", "The man is playing a video game.", "The man is looking at the tv."], "image": "train2014/COCO_train2014_000000425242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22223, "question_id": "H87eMqNVwXff7acffYYgAh", "question": "What color are the indentations of the cow's face near her eyes?", "choices": ["gray", "red", "black", "brown"], "correct_choice_idx": 3, "direct_answers": ["brown", "brown", "brown", "black", "brown", "dark grey", "white", "black", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The color is brown.", "The color of the indentations are brown.", "They are a very dark shade of that color because its head is in the shadow."], "image": "train2014/COCO_train2014_000000022223.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173155, "question_id": "H8NTaQRYEYgmwcismiUbsY", "question": "Who is on the bicycle?", "choices": ["rodeo performer", "police officer", "actress", "clown"], "correct_choice_idx": 1, "direct_answers": ["police", "officer", "policeman", "police", "officer", "police officer", "police officer", "officer", "police officer", "cop"], "difficult_direct_answer": false, "rationales": ["There is a police officer in uniform sitting on the motorcycle.", "The bike has a police badge symbol.", "An officer is on the bike."], "image": "train2014/COCO_train2014_000000173155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293259, "question_id": "H8meX3ucg9AyvmoMWnPMtK", "question": "What color is the shirt worn by the woman in the out-of-focus background?", "choices": ["red", "turquoise", "pink", "white"], "correct_choice_idx": 1, "direct_answers": ["white", "green", "blue", "blue", "white", "white", "turquoise", "blue", "green", "green"], "difficult_direct_answer": false, "rationales": ["The color is bright blue.", "The shirt is blue.", "There is a woman on right side past the bananas wearing a bluish shirt."], "image": "val2014/COCO_val2014_000000293259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353231, "question_id": "H9GfD7FtTU6Nd4VBQNfc3y", "question": "What color are the end bridges for the boat suspended in the middle of the river?", "choices": ["blue", "white", "red", "green"], "correct_choice_idx": 0, "direct_answers": ["white", "brown", "white", "brown", "blue", "blue", "brown", "black", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The ends of the boat are colored blue in the middle of the river.", "The color is blue.", "The color is blue."], "image": "val2014/COCO_val2014_000000353231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95363, "question_id": "H9c2JAFeH7sbnuRzdVYmbh", "question": "What color is the dog standing inside of the doorway to the bathroom?", "choices": ["gray", "chocolate", "golden", "black"], "correct_choice_idx": 2, "direct_answers": ["yellow", "golden", "brown", "tan", "tan", "golden", "brown", "golden", "gold", "brown"], "difficult_direct_answer": false, "rationales": ["The dog is a golden retriever.", "The dog is golden color.", "The color is golden."], "image": "train2014/COCO_train2014_000000095363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197329, "question_id": "HBSyDZ4qhcdW2ZgeRynA9M", "question": "What is the tallest item?", "choices": ["giraffe", "ladder", "tree", "giant man"], "correct_choice_idx": 2, "direct_answers": ["tree", "tree", "tree", "tree mountain", "tree", "giraffe", "giraffe", "giraffe", "giraffe", "giraffe"], "difficult_direct_answer": false, "rationales": ["The tree is taller than the animals or the other plants.", "The tree is taller than the giraffes and the zebras and the rest of the plants.", "The item is the tree."], "image": "train2014/COCO_train2014_000000197329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328255, "question_id": "HCP4Ux9vRAHtg95iTtz6a8", "question": "What is the corded object called?", "choices": ["hair iron", "phone", "electric razor", "hair dryer"], "correct_choice_idx": 2, "direct_answers": ["razor", "electric shaver", "electric razor", "shaver", "electric razor", "razor", "razor", "razor", "razor", "razor"], "difficult_direct_answer": false, "rationales": ["There is a corded razor on the counter.", "The object uses electricity for shaving.", "This is a bathroom, not an office. the corded object has blades that can cut hair."], "image": "val2014/COCO_val2014_000000328255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466565, "question_id": "HCmg87mMRkr4PbahghgeLi", "question": "What are these people's nationality?", "choices": ["venezuela", "thailand", "india", "namibia"], "correct_choice_idx": 1, "direct_answers": ["indians", "unknown asian", "china", "chinese", "dwdwdw", "asian", "asian", "thailand", "asian", "asian"], "difficult_direct_answer": false, "rationales": ["There is a group of asian people that are posing at a table. they are eating pizza and drinking.", "The people are thai.", "The people are thai."], "image": "val2014/COCO_val2014_000000466565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323959, "question_id": "HCyqLYgmgVfcPUNVa7KefJ", "question": "What is near the curtain?", "choices": ["cat", "microwave", "mirror", "goat"], "correct_choice_idx": 2, "direct_answers": ["bathtub", "towel", "towel", "towel", "sink", "tub", "mirror", "towel", "towel", "towel"], "difficult_direct_answer": false, "rationales": ["The curtain is hanging next to this reflective surface.", "A mirror is behind.", "It is to the right of the curtain and is reflecting the curtain."], "image": "val2014/COCO_val2014_000000323959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238147, "question_id": "HDaSQs6JRZWDadGCNd4hbk", "question": "What are the bikes used to deliver?", "choices": ["puppies", "hot dogs", "newspapers", "pizza"], "correct_choice_idx": 3, "direct_answers": ["pizza", "pizza", "blue", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "trike"], "difficult_direct_answer": false, "rationales": ["The name on the boxes on the bikes indicate the type of food being delivered.", "The bikes deliver pizza.", "The boxes are large, square and say \"pizza\" on them."], "image": "val2014/COCO_val2014_000000238147.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208408, "question_id": "HDh4q2YpNaUEAvWiGEdomq", "question": "What phrase is most appropriate?", "choices": ["airplane race", "conflagration", "downpour", "earthquake"], "correct_choice_idx": 2, "direct_answers": ["rainy day", "rain", "raining", "downpour", "rainy day", "walking", "gray", "powerful", "rainy", "rainy"], "difficult_direct_answer": false, "rationales": ["It's pouring.", "The man is using an umbrella. umbrellas protect people from rain.", "The weather conditions are accurately described as a downpour."], "image": "val2014/COCO_val2014_000000208408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575206, "question_id": "HE2YsEBmTUs9rX5gsDR287", "question": "What color is the faucet above of the sink?", "choices": ["blue", "black", "silver", "yellow"], "correct_choice_idx": 3, "direct_answers": ["gold", "white", "gold", "yellow", "gold", "gold", "gold white", "gold", "gold", "gold"], "difficult_direct_answer": false, "rationales": ["It has a tinge of the primary colour of yellow", "The faucet is actually gold.", "Yellow is what gold looks like sometimes on faucets."], "image": "train2014/COCO_train2014_000000575206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404565, "question_id": "HFRXdSUjf6sWn4DPmBMykS", "question": "What movie would this setting fit?", "choices": ["cliffhanger", "phone booth", "blade", "dumbo"], "correct_choice_idx": 0, "direct_answers": ["ski movie", "troll hunter", "avalanche", "frozen", "cliffhanger", "cliffhanger", "james bond", "cliffhanger", "alive", "everest"], "difficult_direct_answer": false, "rationales": ["The people are skiing on a snow-covered mountain. there are no elephants, phone booths, or vampires.", "There is a snow slope.", "The people are skiing on a mountain."], "image": "train2014/COCO_train2014_000000404565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29994, "question_id": "HGHZ4GX2pKfoiSfnZbm8sk", "question": "What item was likely used to get the banana in its current state?", "choices": ["blender", "knife", "microwave", "hammer"], "correct_choice_idx": 1, "direct_answers": ["knife", "knife", "knife", "knife", "knife", "half cut", "knife", "knife", "knife", "knife"], "difficult_direct_answer": false, "rationales": ["This is obvious given that it was cut and still has the peel on the outside.", "The banana has a clear and clean cut in the middle that is commonly produced by answer a.", "This has a sharp blade for an even cut"], "image": "val2014/COCO_val2014_000000029994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339684, "question_id": "HGKbNyKAaHsdfLw5bwQwmN", "question": "What word is appropriate to describe the animal near the books?", "choices": ["squid", "invertebrate", "mammal", "mollusk"], "correct_choice_idx": 2, "direct_answers": ["cat", "cat", "cat", "fast", "panda", "mammal", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["The word is a mammal.", "The word is a mammal.", "The cat is a mammal since it is warm-blooded."], "image": "train2014/COCO_train2014_000000339684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34454, "question_id": "HGo92tUgg5rqiNSYrT4Zqs", "question": "What is the person on the surfboard doing to the dog?", "choices": ["grooming", "petting", "feeding", "hugging"], "correct_choice_idx": 3, "direct_answers": ["hugging", "hugging", "hugging", "hugging", "playing", "sleep", "holding", "laying down", "laying down", "nothing"], "difficult_direct_answer": false, "rationales": ["A man has its arms wrapped around his dog as they lay on surfboard.", "The person has their arms around the sleeping animal.", "The man has his arms around the dog in a big hug."], "image": "train2014/COCO_train2014_000000034454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382309, "question_id": "HHnVN5SB4JzxoNqTeNibmK", "question": "How much liquid is in the bottle?", "choices": ["8000 ml", "440 ml", "2 ml", "275 ml"], "correct_choice_idx": 3, "direct_answers": ["275 ml", "275ml", "unknown", "275 ml", "lot", "ml", "half", "empty", "stella", "full"], "difficult_direct_answer": true, "rationales": ["The number is on the bottle.", "The bottle says 275 ml.", "There are 275 ml."], "image": "val2014/COCO_val2014_000000382309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317090, "question_id": "HHsirBea9wiYHQYRRNWZ4K", "question": "What was just tossed in the air?", "choices": ["baby", "tennis ball", "pizza dough", "frisbee"], "correct_choice_idx": 1, "direct_answers": ["ball", "tennis ball", "tennis ball", "tennis ball", "ball", "ball", "tennis ball", "ball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["A tennis player is on a court and is jumping up to serve the ball.", "The scene is a tennis court with a tennis player serving so it would be a tennis ball he throws up in the air.", "A person is serving on a tennis court and is reaching up while jumping at the back line of the court."], "image": "train2014/COCO_train2014_000000317090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179930, "question_id": "HJudmXpGGamkdUFkracTJA", "question": "What is looking at the giraffe?", "choices": ["person", "dog", "cat", "cow"], "correct_choice_idx": 0, "direct_answers": ["person", "person", "person", "man", "trees", "man", "person", "person", "man", "man"], "difficult_direct_answer": false, "rationales": ["The person looks.", "There is a human sitting down facing the animal", "A human being is sitting in a chair watching the animal eat."], "image": "val2014/COCO_val2014_000000179930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391397, "question_id": "HK2F3Hom53yG6wVtAqCBvN", "question": "Who was born in this country?", "choices": ["jim those", "isabelle adjani", "idris elba", "harish patel"], "correct_choice_idx": 3, "direct_answers": ["ghandi", "harish patel", "mahatma ghandi", "gandhi", "indians", "ghandi", "arab", "mahatma gandhi", "people", "hrithik roshan"], "difficult_direct_answer": true, "rationales": ["That person was born in that country.", "Street signs depicting indian locations is above a road.", "The sign is in india and the actor known for his role in \"lady of guadalupe\" was also born in india."], "image": "train2014/COCO_train2014_000000391397.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392039, "question_id": "HM4fJMesVVcTFfXLn5RvVP", "question": "What building material is the longhouse next to the sheep?", "choices": ["mud", "straw", "sticks", "brick"], "correct_choice_idx": 2, "direct_answers": ["hay", "sticks", "wood", "cloth", "wood", "wood", "wood", "wood", "barn", "black"], "difficult_direct_answer": false, "rationales": ["A log cabin style structure with a thatch roof is near sheep grazing.", "They are spaced and held together with twine", "The longhouse is made out of a wooden material."], "image": "train2014/COCO_train2014_000000392039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208994, "question_id": "HQL3iSDz7fME4FLTtqw7NP", "question": "What breed of animal is this?", "choices": ["dalmatian", "siamese", "pitbull", "manx"], "correct_choice_idx": 1, "direct_answers": ["siamese cat", "cat", "siamese", "siamese cat", "siamese", "cat", "cat", "siamese", "cat", "siamese cat"], "difficult_direct_answer": false, "rationales": ["The animal is a cat.", "There is a cat that is looking at itself in the mirror.", "A siamese cat has short creme colored hair with darker fur around the years, eyes, and paws."], "image": "train2014/COCO_train2014_000000208994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365903, "question_id": "HTcfgX4BYAuNLH9KBX7n2C", "question": "How many giraffes are standing in the middle of the stone enclosure?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "three", "four", "three", "three", "three", "three", "four", "three"], "difficult_direct_answer": false, "rationales": ["One is laying down out of the group of 4", "There are four giraffes but one is sitting not standing like the other three.", "This is easy to prove by counting the number."], "image": "train2014/COCO_train2014_000000365903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574357, "question_id": "HTv3dJFJZrP28PDiF4jqNH", "question": "What company makes the item the person is wearing on their head?", "choices": ["lumos", "green giant", "59fifty", "burger king"], "correct_choice_idx": 0, "direct_answers": ["nomad", "suzuki", "helmet", "nomad", "nomad", "fox", "nomad", "homad", "nike", "lumos"], "difficult_direct_answer": false, "rationales": ["The name is on the helmet.", "That is the company that makes the hat.", "The person is wearing a helmet, not a hat, vegetable, or hamburger."], "image": "val2014/COCO_val2014_000000574357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214997, "question_id": "HXBLBYJsroqQTNfTgBVpkn", "question": "Who would work here?", "choices": ["pizza chef", "racecar driver", "clown", "police officer"], "correct_choice_idx": 0, "direct_answers": ["chef pizza", "chef", "cooks", "pizza maker", "chef", "chef", "pizza chef", "pizza man", "pizza maker", "teens"], "difficult_direct_answer": false, "rationales": ["There are pizzas in the display case.", "This is a pizza kitchen so a chef would work here.", "A person that makes pizzas."], "image": "train2014/COCO_train2014_000000214997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28115, "question_id": "HXCNTqgX5kMcrn7bzDvSPB", "question": "What number is on the train?", "choices": ["4482", "7395", "3365", "2785"], "correct_choice_idx": 1, "direct_answers": ["7395", "seventy-three ninety-five", "7395", "7395", "7395", "7395", "7395", "7395", "7395", "7395"], "difficult_direct_answer": false, "rationales": ["The number appears twice on the front of the train.", "7395 is the number on the train.", "The number is 7395."], "image": "val2014/COCO_val2014_000000028115.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398301, "question_id": "HXcknkEtBL5vWcgXY7WW5o", "question": "What is this type of event called?", "choices": ["farmers market", "fair", "flea market", "garage sale"], "correct_choice_idx": 3, "direct_answers": ["yardsale", "called", "flea market", "yard sale", "rummage sale", "poolparty", "yard sale", "garage sale", "garage sale", "market"], "difficult_direct_answer": false, "rationales": ["There are tables with used items for sale from vendors.", "There is a collection of items that are found in a home. the items appear used so it supports the conclusion that they are selling them.", "There are tables set up in a yard with lots of random items"], "image": "train2014/COCO_train2014_000000398301.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174690, "question_id": "HYzUmKuwfqb9iBhHx5napw", "question": "What time of the day is this?", "choices": ["sunrise", "evening", "sunset", "early morning"], "correct_choice_idx": 3, "direct_answers": ["night", "afternoon", "around noon", "two", "afternoon", "early morning", "night", "night", "135", "two"], "difficult_direct_answer": false, "rationales": ["A early morning. there was no choice for early afternoon so it has to be early morning.", "The time is early morning.", "It's early morning based on the clock."], "image": "val2014/COCO_val2014_000000174690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486789, "question_id": "HZuxrtDBYMGqU2u9VXNEtg", "question": "What is the side dish?", "choices": ["pickle", "fries", "stuffing", "carrots"], "correct_choice_idx": 0, "direct_answers": ["meat", "leaves", "pickle", "salad", "pickle", "bread", "pickle", "reuben sandwich", "lettuce", "meat"], "difficult_direct_answer": false, "rationales": ["There is a pickle.", "The side is a gherkin.", "The dish is a pickle."], "image": "train2014/COCO_train2014_000000486789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50125, "question_id": "HcWfcjxLA7f8dmZYK3RSdp", "question": "What country's red white flag is on the Perkins bus?", "choices": ["honduras", "guatemala", "peru", "mexico"], "correct_choice_idx": 2, "direct_answers": ["england", "indonesia", "england", "sweden", "canada", "peru", "canada", "poland", "austria", "canada"], "difficult_direct_answer": false, "rationales": ["There is a peruvian flag on top of the bus.", "A bus has a logo on the front.", "The flag is foreign."], "image": "val2014/COCO_val2014_000000050125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406403, "question_id": "HdDTCB9z48iWdxtV8BhnQg", "question": "What is the person standing on?", "choices": ["hot coals", "apples", "dirt", "snow"], "correct_choice_idx": 3, "direct_answers": ["skis", "skiis", "dwd", "skis", "skis", "skis", "snow", "snow", "skiis", "skis"], "difficult_direct_answer": false, "rationales": ["Another obvious one given that none of the other options come close. it's winter.", "The person is in snow.", "The substance is white and powdering, while it is obviously cold."], "image": "val2014/COCO_val2014_000000406403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385929, "question_id": "HdHm8iagvmAGss7gaVJo9f", "question": "What are the people near?", "choices": ["boats", "cows", "eggs", "babies"], "correct_choice_idx": 0, "direct_answers": ["boats", "boats", "sea", "boat", "boats", "boat", "boats", "boats", "boats", "lake"], "difficult_direct_answer": false, "rationales": ["The people are in the ocean and are fishing. the boats are their transportation.", "The people are by boats.", "They are in the water."], "image": "train2014/COCO_train2014_000000385929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242376, "question_id": "HdJ3E9jNkbzZxtzfhdvrMq", "question": "What team shares a similar name to the name on the large item in the foreground?", "choices": ["dc united", "dallas mavericks", "houston oilers", "ny knicks"], "correct_choice_idx": 0, "direct_answers": ["soccer", "manchester united", "manchester united", "to united", "united sports", "spirit united", "soccer", "jets", "dc united", "soccer team"], "difficult_direct_answer": false, "rationales": ["Dc united contains the word united.", "They have the same word in them", "The name is on the plane."], "image": "train2014/COCO_train2014_000000242376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35844, "question_id": "Hdgcd7sxNDnpRmWxqgD3Zp", "question": "What company uses the large vehicle here?", "choices": ["tank division", "friendlys", "greyhound", "burger king"], "correct_choice_idx": 2, "direct_answers": ["city", "stop", "bus", "greyhound", "bus", "greyhound", "tour bus", "mike", "coachline", "bus"], "difficult_direct_answer": false, "rationales": ["The large vehicle is a bus, not a tank. friendly's and burger king are restaurants that do not use vehicles.", "The company is greyhound.", "The large vehicle is a bus."], "image": "train2014/COCO_train2014_000000035844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157635, "question_id": "HdteRh2TBwx9T6u5irAvQe", "question": "What is on the TV?", "choices": ["sporting event", "judge show", "cartoons", "baking show"], "correct_choice_idx": 0, "direct_answers": ["news", "person", "sporting event", "sports", "game", "sports", "sports", "football", "football", "sports news"], "difficult_direct_answer": false, "rationales": ["The tv above the fireplace shows a person dressed like a sports coach at a sporting event.", "The tv has a sport event.", "The television screen includes people wearing athletic brands and features a scoreboard. these attributes are consistent with answer a."], "image": "val2014/COCO_val2014_000000157635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283028, "question_id": "HeEDeHr342sPGdyoTSpzLz", "question": "What is an appropriate term to refer to this animal?", "choices": ["kid", "joey", "chick", "kitten"], "correct_choice_idx": 3, "direct_answers": ["kitten", "kitten", "kitten", "cat", "kitten cat", "feline", "kitten", "kitten", "kitten", "kitten"], "difficult_direct_answer": false, "rationales": ["The term is a kitten.", "The animal is a kitty.", "The animal standing next to the computer is a kitten since it is not a fully grown cat."], "image": "val2014/COCO_val2014_000000283028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90476, "question_id": "HeJxwHNrvqoF7Z5ANPycAp", "question": "What boat number is the largest here?", "choices": ["9986", "273", "1812", "681"], "correct_choice_idx": 3, "direct_answers": ["ss681", "681", "681", "681", "681", "681", "eight", "681", "681", "681"], "difficult_direct_answer": false, "rationales": ["This is the largest of all the numbers on the boats.", "The boat number is 681.", "The largest boat has that number."], "image": "val2014/COCO_val2014_000000090476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244246, "question_id": "HecCqiiHrkknfG3cMyLVrR", "question": "This is most likely a scene from which major California city?", "choices": ["san francisco", "san diego", "la", "pasadena"], "correct_choice_idx": 1, "direct_answers": ["airplane", "san diego", "san diego", "los angeles", "plain", "plane", "san diego", "san diego", "san diego", "san diego"], "difficult_direct_answer": false, "rationales": ["The green sign on the right indicates that this city's convention center can be accessed via the next exit. the city is not san francisco, los angeles, or pasadena.", "The green sign mentions the a convention center.", "This is most likely san diego, as evinced by the green sign."], "image": "val2014/COCO_val2014_000000244246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155823, "question_id": "Hf6UEfeUS3VSjFaPejAEAT", "question": "What color is the baseball glove held in the girl's little right hand?", "choices": ["black", "red", "tan", "brown"], "correct_choice_idx": 3, "direct_answers": ["tan", "brown", "brown", "white", "brown", "brown", "white", "lite brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["These type of gloves are made from leather and this is the natural color for them", "The little girl is holding a leather glove that is usually resembles the color of dirt. she is about to catch a baseball in glove.", "The color is brown."], "image": "train2014/COCO_train2014_000000155823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336077, "question_id": "HfEZ8FZH7sw3oyaEJSvudf", "question": "What color is the home team of this match?", "choices": ["blue", "dark gray", "navy", "red"], "correct_choice_idx": 3, "direct_answers": ["red white", "white", "red", "red", "white", "red", "white", "red", "red", "red white"], "difficult_direct_answer": false, "rationales": ["The home team is in red.", "The color is being worn by the team and most of their fans.", "There is cardinals batter swinging at a ball."], "image": "train2014/COCO_train2014_000000336077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445002, "question_id": "HfHys7q4o6cwuDNm3AA7m7", "question": "What color is the cord phone sitting next to the shower stall on the wall?", "choices": ["green", "yellow", "white", "blue"], "correct_choice_idx": 2, "direct_answers": ["stop", "white", "green", "white", "white", "beige", "white", "beige", "tan", "tan"], "difficult_direct_answer": false, "rationales": ["A light colored phone hangs on the wall near a shower.", "The cord is a light color.", "A white phone hangs on the wall outside of a shower."], "image": "val2014/COCO_val2014_000000445002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468416, "question_id": "HhV4xd3cUz69JKc3Pk3iE4", "question": "How many lambs are lead by this sheep?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "three", "three", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are less than three lambs but more than one lamb visible in the image.", "There are two lambs being led.", "There is a pair of sheep following behind the larger sheep."], "image": "train2014/COCO_train2014_000000468416.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406171, "question_id": "HjgFHrExTtG9oBLgk2Ebqr", "question": "What kind of hat does the man wear while playing tennis?", "choices": ["knit", "hard hat", "baseball cap", "fedora"], "correct_choice_idx": 1, "direct_answers": ["hard hat", "hardhat", "construction hardhat", "hard hat", "hard", "hardhat", "construction", "hard hat", "construction", "hardhat"], "difficult_direct_answer": false, "rationales": ["He looks like a construction worker.", "The hat is hard.", "That's typical for construction workers to wear to show that there are people working in construction things. this is for humor, though."], "image": "train2014/COCO_train2014_000000406171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268927, "question_id": "HjneoeWoKUBVTruKAydR57", "question": "How many pigeons are sat on top of the bike stop?", "choices": ["four", "three", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["One is right next to the man and the other a little ways away", "Two birds are on top of a metal bar that is the top bar of a contraption used for locking bikes up in public.", "One pigeon is near the person. an additional pigeon is closer to the pole."], "image": "val2014/COCO_val2014_000000268927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348989, "question_id": "Hjp9wn7GmfiqHx3DrEHjfn", "question": "Why are there wires sticking out of the wall?", "choices": ["inspection", "decoration", "demolition", "new construction"], "correct_choice_idx": 3, "direct_answers": ["electrical", "extension purpose", "repair", "new construction", "renovation", "current shocking", "long", "support", "tap", "protection"], "difficult_direct_answer": true, "rationales": ["There seems to be new lighting being put into place in the kitchen.", "This looks like a new house which is still being built.", "The wires are for construction."], "image": "train2014/COCO_train2014_000000348989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445964, "question_id": "HkXSjWcAfFYrzSyqahwMjE", "question": "What is the name of the fruit on the head of the person in the front of the image?", "choices": ["banana", "strawberries", "kiwi", "mango"], "correct_choice_idx": 0, "direct_answers": ["bananas", "bananas", "banana", "banana", "bananas", "bananas", "bananas", "bananas", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["These fruit grow in green bunches until they are ripe.", "The fruit is the banana.", "The fruit is elongated and yellow, so it is a banana."], "image": "train2014/COCO_train2014_000000445964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199618, "question_id": "Hm43xqY9fRUwVFyHEtwcYS", "question": "What color is the icing on the toy donut raised to the eye of the person on the t-shirt?", "choices": ["white", "blue", "pink", "red"], "correct_choice_idx": 2, "direct_answers": ["pink", "pink", "pink", "pink", "pink", "pink", "pink", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene and contrasts brightly against the white.", "The color in a corresponds to the color of the toy donut.", "The icing is not red, white, or blue."], "image": "train2014/COCO_train2014_000000199618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99734, "question_id": "HmvYEDcNUv2dDCVUyJa2BL", "question": "How many boats are surrounded by netting with one net per each boat?", "choices": ["two", "two", "three", "four"], "correct_choice_idx": 3, "direct_answers": ["two", "five", "five", "three", "car", "three", "one", "four", "four", "stop"], "difficult_direct_answer": false, "rationales": ["There are four boats with nets around them.", "There are four boats in the water.", "There are four boats."], "image": "val2014/COCO_val2014_000000099734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195316, "question_id": "HnbWPde8PBFNwfAdVTPzkE", "question": "What color is the woman's scarf who is wearing a white jacket?", "choices": ["blue", "red", "black", "white"], "correct_choice_idx": 0, "direct_answers": ["black", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The adult female skier is wearing a muffler the color of the sky.", "The color is blue.", "The scarf is one of the primary colors that lies between purple and cyan."], "image": "train2014/COCO_train2014_000000195316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573085, "question_id": "Ho3fzjmK9pZ5XXM36QfYQU", "question": "What vehicles are abundant here?", "choices": ["horses", "trains", "vans", "airplanes"], "correct_choice_idx": 3, "direct_answers": ["airplane", "airplanes", "airplanes", "airplane", "aeroplane", "planes", "luggage", "airplanes", "airplanes", "planes"], "difficult_direct_answer": false, "rationales": ["Many airplanes can be seen at the terminal and behind at an airport.", "The vehicles are not animals. they are flying vehicles.", "The airport shows many airplanes on the tarmac that people can use to travel."], "image": "train2014/COCO_train2014_000000573085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428986, "question_id": "Ho9MqUU8tfk8BeGVGL2X9F", "question": "What color are the little boy's ski shoes attached to the little skis?", "choices": ["white", "purple", "black", "red"], "correct_choice_idx": 0, "direct_answers": ["red", "blue", "red white", "white", "white", "beige", "white", "skate board", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color is white.", "The shoes the boy are wearing are white.", "The color is white."], "image": "train2014/COCO_train2014_000000428986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241466, "question_id": "HpszqJREzBKh3y3UkkHs3b", "question": "Why are all the bikes the same?", "choices": ["family owned", "rentals", "trends", "government issued"], "correct_choice_idx": 1, "direct_answers": ["renting", "rentals", "same company", "delivery bikes", "same color", "rentals", "rentals", "company", "community bikes", "yellow"], "difficult_direct_answer": false, "rationales": ["The bikes are lined up in a row to be rented by tourists and are arranged neatly.", "The bikes are rentals.", "The bikes are rentals."], "image": "val2014/COCO_val2014_000000241466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262016, "question_id": "HqbemjPKqbKtUdvSEYrYK9", "question": "What type of business is this likely to be?", "choices": ["grocer", "bakery", "deli", "butcher"], "correct_choice_idx": 1, "direct_answers": ["bakery", "bakery", "bakery", "bakery", "bakery", "pretzel", "bakery", "cuttery", "bakery", "bakery"], "difficult_direct_answer": false, "rationales": ["The business is a bakery.", "There are baked goods on display.", "A man is working in a large kitchen in which there is an area with bagels piled high."], "image": "val2014/COCO_val2014_000000262016.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301783, "question_id": "HrEPwyj2H9YAjJzFpjSKah", "question": "What is white here?", "choices": ["cat", "candy cane", "apple", "shower curtain"], "correct_choice_idx": 3, "direct_answers": ["shower curtain", "curtains", "cutton", "sink", "shower curtain", "shower curtain", "curtain", "shower curtain", "screen cloth", "shower curtains"], "difficult_direct_answer": false, "rationales": ["These type of curtains are often in front of bathtubs.", "The shower curtain in the bathroom is made of white material.", "It's the same color as the tub"], "image": "train2014/COCO_train2014_000000301783.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268111, "question_id": "HrK7HBrpkY7dqsPKCQc3ey", "question": "What kind of dog does the goat in the middle resemble with brown ears?", "choices": ["beagle", "german shepherd", "golden retriever", "labrador"], "correct_choice_idx": 0, "direct_answers": ["german", "beagle", "beagle", "dogs", "dalmatian", "stop", "hound", "australian shepherd", "beagle", "collie"], "difficult_direct_answer": false, "rationales": ["The dog is a beagle.", "A white goat is standing with others around. the goat in the center has long, brown ears.", "Beagles have longer ears."], "image": "train2014/COCO_train2014_000000268111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438537, "question_id": "HtJmoyopCNPAiX2diKdYwp", "question": "What are the animals near?", "choices": ["bicycle", "boat", "egg carton", "dog house"], "correct_choice_idx": 0, "direct_answers": ["rice", "cows", "rice paddy", "water", "water", "bicycle", "shore", "water", "water", "people"], "difficult_direct_answer": false, "rationales": ["The person near the animals is riding a land vehicle that has two wheels.", "The animals are near two men that each have a bicycle.", "There is a vehicle that is manually powered with two wheels."], "image": "train2014/COCO_train2014_000000438537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560969, "question_id": "HtNSHWv6tC2dnbowQnc384", "question": "What color is the plaid pattern around the star on top of the plate?", "choices": ["blue", "green", "red", "purple"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "red blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["That color is similar to that of the sky.", "The color is blue.", "The color is blue."], "image": "train2014/COCO_train2014_000000560969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207056, "question_id": "HuLgVdNkHLZX4wtdr5UDrn", "question": "What do the animals all have?", "choices": ["stingers", "tusks", "wings", "gills"], "correct_choice_idx": 1, "direct_answers": ["trunks", "trunks", "tusks", "water", "tail", "tusk", "tusks", "water", "legs", "trunks"], "difficult_direct_answer": false, "rationales": ["The elephants all contain two tusks each.", "The elephants are in the water.", "There are long white teeth sticking out of each creature's mouth."], "image": "val2014/COCO_val2014_000000207056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94132, "question_id": "HuP2eqrbCsXgggbezWhEEQ", "question": "What is the child playing with?", "choices": ["dog", "train", "egg", "cat"], "correct_choice_idx": 1, "direct_answers": ["toys", "train", "toy train", "train", "trains", "toy trains", "train set", "train set", "train", "train"], "difficult_direct_answer": false, "rationales": ["You are able to see a railroad system with many vehicles on it, there is a locomotive pulling two cars. many modern railroads use electricity instead of coal to power their locomotives.", "The child has a train.", "A child is standing at a track and locomotive on a short table."], "image": "train2014/COCO_train2014_000000094132.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310797, "question_id": "HvvhcnnuuZ4xKM6Mu9Rjkm", "question": "What type of facial hair is kept by the man eating the hot dog in the sports stadium?", "choices": ["sideburns", "moustache", "goatee", "beard"], "correct_choice_idx": 0, "direct_answers": ["scruff", "sideburns", "sideburns", "side burns", "sideburns", "sideburns", "clean shaven", "sideburns", "beard", "sideburn"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene.", "There are sideburns near the man's ears.", "The hair is styled as sideburns."], "image": "val2014/COCO_val2014_000000310797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388853, "question_id": "HwbkbQpfHKoiB98z27GHqs", "question": "How many varieties of fruit are inside of the basket?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "two", "three", "three", "three", "two", "two", "four", "three"], "difficult_direct_answer": false, "rationales": ["There are three varieties of fruit in the basket including bananas, apples, and oranges.", "There are three.", "There are apples and bananas."], "image": "train2014/COCO_train2014_000000388853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329138, "question_id": "HwpmfrD75yGtoBLzQjLaG2", "question": "What word is appropriate for these animals?", "choices": ["equine", "crustacean", "bovine", "amoeba"], "correct_choice_idx": 0, "direct_answers": ["equine", "horse", "horse", "horses", "horse", "horse", "horses", "horse", "horses", "horses"], "difficult_direct_answer": false, "rationales": ["That is the proper word for this animal.", "A group of horses are grazing on grass.", "The animals eating hay are horses that are part of the equine species."], "image": "val2014/COCO_val2014_000000329138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215320, "question_id": "Hxnkdvas6xx54wKeQjmCbG", "question": "How many giraffes are standing directly on top of the dirt road?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 0, "direct_answers": ["one", "two", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one giraffe.", "The only giraffe in the photo is standing on the paved road.", "There is one."], "image": "train2014/COCO_train2014_000000215320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326410, "question_id": "HxqgrL5QwGzSivqPgSnJ9L", "question": "What show or game are the stuffed characters from that stand beside the television?", "choices": ["yugioh", "harry potter", "digimon", "pokemon"], "correct_choice_idx": 3, "direct_answers": ["pokemon", "tv", "pokemon", "pokemon", "piggy", "sofa", "pikatchu", "pokemon", "pokemon", "nothing"], "difficult_direct_answer": false, "rationales": ["The stuffed animals beside the television are characters from pokemon.", "There is a pikachu doll next to the tv.", "Pokemon is the character."], "image": "val2014/COCO_val2014_000000326410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 161447, "question_id": "HzVPW9FFxWUPYbPEKNyLcv", "question": "What is covering the last slice of pizza available on the tray?", "choices": ["pepperoni", "mushrooms", "spinach", "cheese"], "correct_choice_idx": 2, "direct_answers": ["leaves", "plate", "plate", "spinach", "spinach", "greenery", "parsley", "spinach", "veg pizza", "leaf"], "difficult_direct_answer": false, "rationales": ["The last slice of pizza on the tray is covered with spinach leaves.", "It is a green leafy vegetable.", "People are holding up a tray with pizza that has a green, leafy topping."], "image": "val2014/COCO_val2014_000000161447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263270, "question_id": "J25nnsuEsvtffu59MpsXco", "question": "What is coming out of the village's railroad track?", "choices": ["weeds", "crops", "nuts", "berries"], "correct_choice_idx": 0, "direct_answers": ["weeds", "weeds", "people", "weeds", "people", "people", "train", "way", "weeds", "weeds"], "difficult_direct_answer": false, "rationales": ["Plants are growing out of the railroad track. they do not have commercial value.", "The track has weeds.", "The weeds come out."], "image": "train2014/COCO_train2014_000000263270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86924, "question_id": "J2RwNxLjQvaE3EEVEfPKkB", "question": "What is the biggest threat here to a baby?", "choices": ["samurai sword", "drill", "ladder", "scissors"], "correct_choice_idx": 3, "direct_answers": ["sciccors", "iscissors", "scissors", "scissors", "eating", "scissors", "scissors", "scissors", "scissors", "grape"], "difficult_direct_answer": false, "rationales": ["There is a tool that consists of two blades put together to cut things.", "The biggest threat is the scissors which are sharp.", "The object can easily injure the child."], "image": "train2014/COCO_train2014_000000086924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132393, "question_id": "J4FQQKMgE5dHbHDd3BnFqk", "question": "What is this place most likely to be?", "choices": ["football game", "law office", "baseball game", "college campus"], "correct_choice_idx": 3, "direct_answers": ["college campus", "park", "town square", "park", "park", "america", "roadwalk", "park", "city park", "park"], "difficult_direct_answer": false, "rationales": ["There are a lot of younger people hanging around large buildings and a green area", "The place is a college campus.", "A crowded area outside a building has sidewalks and many young people gathered around. there are a lot of young people and buildings on college campuses."], "image": "val2014/COCO_val2014_000000132393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570246, "question_id": "J5VAxsGe5nGpDYa6BSgZRG", "question": "How many birds are on top of the drinking giraffe's head?", "choices": ["two", "five", "three", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "three", "three", "three", "two", "four", "two", "one", "three", "two"], "difficult_direct_answer": false, "rationales": ["There are two birds sitting on top of the giraffe's head.", "There's two birds cleaning.", "There are two."], "image": "train2014/COCO_train2014_000000570246.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212462, "question_id": "J68rEDSDGyFawv77DSGQAC", "question": "What kind of tall fruit is in the center of the fruit plate?", "choices": ["pineapple", "banana", "apple", "strawberry"], "correct_choice_idx": 0, "direct_answers": ["fruits", "pineapple", "watching", "pineapple", "pineapple", "pineapple", "car", "pineapple", "pineapple", "pineapple"], "difficult_direct_answer": false, "rationales": ["The fruit is pineapple.", "There is a spiky brown fruit with hard green leaves on top.", "The tallest fruit on the plate is a pineapple."], "image": "val2014/COCO_val2014_000000212462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176462, "question_id": "J6Erjp9NTKThgAJMtK4hN6", "question": "What are these animals known for?", "choices": ["wings", "gills", "stingers", "trunks"], "correct_choice_idx": 3, "direct_answers": ["their trunks", "eating peanuts", "big", "car", "good memory", "elephants", "being peaceful", "memory", "trunks", "trunk"], "difficult_direct_answer": true, "rationales": ["Elephants have long noses.", "They are elephants.", "They have a unique body part"], "image": "train2014/COCO_train2014_000000176462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70309, "question_id": "J8F6GK83bpMxNfiZeBo5y4", "question": "What best describes the size of the motorcycle?", "choices": ["12 feet", "10 feet", "miniature", "30 inches"], "correct_choice_idx": 2, "direct_answers": ["miniature", "chair", "mini", "small", "stop", "miniature", "enfield", "mini", "toy", "tiny"], "difficult_direct_answer": false, "rationales": ["The size is tiny.", "It is small and for decoration only.", "There is a sofa in the background. the motorcycle is smaller than the sofa."], "image": "train2014/COCO_train2014_000000070309.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404826, "question_id": "J9dfWftMHZQiYdnWma4nNL", "question": "What food is on the plate?", "choices": ["oats", "fries", "apples", "eggs"], "correct_choice_idx": 1, "direct_answers": ["fish", "fish", "porkchops", "pork chops", "mushroom", "dinner", "ew ink", "french fries", "fish", "fries"], "difficult_direct_answer": false, "rationales": ["There are some starchy items on the plate. they are made from potatoes.", "The food is fries.", "The items near the top left of the plate are made out of potatoes."], "image": "train2014/COCO_train2014_000000404826.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 298135, "question_id": "J9ihs4zEFdH8AmDFYG6srC", "question": "What does the man lagging behind's hairstyle resemble?", "choices": ["tonsure", "bouffant", "mullet", "mohawk"], "correct_choice_idx": 0, "direct_answers": ["suitcase", "bald head", "luggage", "circle", "travel", "patch", "bald", "tonsure", "laugh thing", "bowl"], "difficult_direct_answer": true, "rationales": ["He has a bald circle on the top", "The man with the gray luggage has a bald spot on the top of his head the resembles tonsure that monk's shave into their hair.", "The man's hair resembles the haircut because they intentionally shaved the middle part of the hair and kept hair around sides all way around."], "image": "train2014/COCO_train2014_000000298135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577589, "question_id": "JAjc6QkWfNjgUXh4HFbm8f", "question": "What color is the surfboard held lengthwise by the man in the wetsuit on the right?", "choices": ["green", "yellow", "blue", "white"], "correct_choice_idx": 1, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is a very bright one that's similar to that of the sun.", "Two men stand on the beach holding surfboards.", "The boards are bright."], "image": "train2014/COCO_train2014_000000577589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529311, "question_id": "JDUd6mTJZBc78FaCj8q3s4", "question": "Who played this sport?", "choices": ["bo jackson", "maria sharapova", "john elway", "mike mussina"], "correct_choice_idx": 1, "direct_answers": ["tennis player", "serena williams", "roger federer", "federer", "man", "maria sharapova", "man", "man", "roger federer", "men"], "difficult_direct_answer": false, "rationales": ["Tennis is played by maria sharapova.", "That person plays tennis.", "The man is playing tennis."], "image": "train2014/COCO_train2014_000000529311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 88286, "question_id": "JDbr96ZAzxXb7aUjdnNeGp", "question": "What are both of the men near the skateboard wearing?", "choices": ["ties", "glasses", "backpacks", "baskets"], "correct_choice_idx": 1, "direct_answers": ["skating", "jeans", "stop", "glasses", "shirt", "dd", "glasses", "car", "glasses", "black shirts"], "difficult_direct_answer": false, "rationales": ["The man on the skateboard and the man behind him are both wearing eyeglasses.", "These are worn on people's faces to be able to see.", "Both have protective eyewear on their faces. neither have backpacks, ties or baskets."], "image": "val2014/COCO_val2014_000000088286.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284676, "question_id": "JEHKFyd8BS8o46D5gjob9r", "question": "What is the person hiding behind?", "choices": ["umbrella", "car", "apple", "box"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["A woman is holding an large item to block rain.", "Am umbrella is covering their face.", "The person is behind an umbrella."], "image": "train2014/COCO_train2014_000000284676.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145447, "question_id": "JERarnCQxUAiskFcuQK2tm", "question": "Who works at one of these places?", "choices": ["conductor", "airline pilot", "rodeo clown", "zoo keeper"], "correct_choice_idx": 0, "direct_answers": ["worker", "people", "people", "engineer", "conductor", "conductor", "station employee", "conductor", "conductor", "engineer"], "difficult_direct_answer": false, "rationales": ["The conductor wroks.", "A train conductor would drive the train.", "A conductor is needed to run the trains."], "image": "train2014/COCO_train2014_000000145447.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230756, "question_id": "JErEKzTRmiXYnPeA5xjWQV", "question": "How many people are seated on the staircase made of wood?", "choices": ["five", "four", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is a man and woman.", "There are 2.", "A woman and an old man are sitting on the staircase."], "image": "train2014/COCO_train2014_000000230756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135244, "question_id": "JFMzV2UcGc7QtjJVwkvhzz", "question": "What powers this plane?", "choices": ["coal", "kerosene", "gasoline", "electricity"], "correct_choice_idx": 2, "direct_answers": ["engine", "gasoline", "gasoline", "jet fuel", "fly", "engine", "fuel", "fuel", "gas", "engine"], "difficult_direct_answer": false, "rationales": ["The plane sitting on the gas is powered by gasoline when running.", "The gas powers it.", "It takes regular fuel to run the engine"], "image": "train2014/COCO_train2014_000000135244.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32664, "question_id": "JFjfsqfczyefDwdSoJmnFu", "question": "What are the bars touching in the foreground?", "choices": ["baby", "apple", "surf board", "cow"], "correct_choice_idx": 2, "direct_answers": ["surfboard", "floor", "surfboard", "surfboard", "bench", "floor", "seat", "surfboard", "surf board", "surfboard"], "difficult_direct_answer": false, "rationales": ["The bars are on a bench that looks like it is for riding the waves.", "The seat appears to have been made to look like one. it's likely because this bench is near a beach and ocean.", "A surfboard is on a silver stand near a pool."], "image": "train2014/COCO_train2014_000000032664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69979, "question_id": "JGkrGtyJbThFdwpXvcrvCp", "question": "What animal is visible?", "choices": ["antelope", "cow", "crane", "bird"], "correct_choice_idx": 3, "direct_answers": ["bird", "parrot", "parrot", "parrot", "bird", "bird", "barrot", "parrot", "parrot", "bird"], "difficult_direct_answer": false, "rationales": ["The animal is the bird.", "It is flying and has wings.", "The bird is visible."], "image": "train2014/COCO_train2014_000000069979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196311, "question_id": "JGogg58JLxVjfEGCBKrCXw", "question": "What color are the couch cushions on the top of the red sofa at the corner edge of the room?", "choices": ["purple", "red", "blue", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "white", "white", "red", "white", "maroon", "white", "white"], "difficult_direct_answer": false, "rationales": ["The cushions are not the same color as the red sofa. the cushions are not blue or purple.", "These are much lighter than the rest of the furniture", "Two white, square pillows have been placed nonchalantly on the red couch."], "image": "val2014/COCO_val2014_000000196311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249815, "question_id": "JGtkaCnhFSiPQNczLi8PXA", "question": "What company is known for making the abundant items here?", "choices": ["green giant", "huffy", "popeyes", "mcdonalds"], "correct_choice_idx": 1, "direct_answers": ["huffy", "schwinn", "schwinn", "cycle company", "bicycles", "huffy", "schwinn", "cannondale", "huffy", "huffy"], "difficult_direct_answer": false, "rationales": ["Huffy makes bikes.", "The other options are food-related.", "These items are bicycles, not vegetables, hamburgers, or chicken nuggets."], "image": "val2014/COCO_val2014_000000249815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552657, "question_id": "JHAEtRvfQgZHMyXUwwFrA6", "question": "What is the dog chowing down on?", "choices": ["pizza", "egg", "baseball glove", "shoes"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza crust", "pizza", "pizza", "cookies"], "difficult_direct_answer": false, "rationales": ["The dog is being fed a piece of pizza by its owner.", "The dog is seen eating the edge of the crust with cheese and tomato sauce visible.", "The food is a crust bread with red sauce."], "image": "train2014/COCO_train2014_000000552657.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314188, "question_id": "JMdnbJpYYfJC99XwhpZnUc", "question": "What color is the pillow case behind the cat?", "choices": ["white", "green", "blue", "yellow"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "red", "blue", "maroon", "red", "blue", "blue", "red", "white"], "difficult_direct_answer": false, "rationales": ["The case that covers the pillow is made up of blue fabric.", "The color is blue.", "The pillow case behind the cat is not yellow, green, or white."], "image": "val2014/COCO_val2014_000000314188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 263974, "question_id": "JNBiJWZ7QzqfQehM3VtvQd", "question": "What color is the vest worn by the young girl on the bicycle?", "choices": ["blue", "green", "pink", "white"], "correct_choice_idx": 2, "direct_answers": ["pink", "pink", "pink", "pink", "pink", "pink", "pink", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["She is very girly.", "The color is pink.", "The girl is wearing a vest in this light color."], "image": "train2014/COCO_train2014_000000263974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444024, "question_id": "JNPYASynGLp6msbTrZHciy", "question": "How is this food meant to be eaten?", "choices": ["fork", "chop sticks", "spoon", "knife"], "correct_choice_idx": 2, "direct_answers": ["silverware", "spoon", "with utensils", "spoon", "with fork", "spoon", "with spoon", "spoon", "chewing", "fork"], "difficult_direct_answer": false, "rationales": ["The food needs a spoon.", "You would have to have something to \"grasp\" the food with.", "It is ice cream."], "image": "train2014/COCO_train2014_000000444024.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426403, "question_id": "JP8hvHBVEVEFfbq3Pa3xe7", "question": "What shape are the wings on the kite pulled by the boy in the red cap?", "choices": ["delta", "butterfly", "narrow", "wide"], "correct_choice_idx": 0, "direct_answers": ["delta", "butterfly", "triangle", "triangle", "triangular", "triangle", "triangular", "triangle", "triangles", "bird shape"], "difficult_direct_answer": false, "rationales": ["The boy is flying a kite that has delta-shaped wings.", "A boy is holding a kite with triangle shaped wings and a hat on his head.", "It's similar to a triangle"], "image": "train2014/COCO_train2014_000000426403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419483, "question_id": "JPd3ZNiNzmhPQuMKDFSE7C", "question": "What is the main color of the three major stripes extending down one side of the blue armchair?", "choices": ["red", "blue", "yellow", "white"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "white", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "The blue armchair in the center of the room has yellow stripes running down the side.", "This is the obvious answer given the depiction in the image. it's hard to say, but this may be a collectible room."], "image": "train2014/COCO_train2014_000000419483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234839, "question_id": "JPsfAT3WFtq75UcC5qRZ57", "question": "Who was born closest to this place?", "choices": ["kelly rowan", "idris elba", "jessica biel", "jim henson"], "correct_choice_idx": 0, "direct_answers": ["people", "henry mcelroy", "corey him", "canadian", "sandra oh", "ryan gosling", "canadians", "ontario", "emma raducanu", "kelly rowan"], "difficult_direct_answer": true, "rationales": ["Only know this if you researched these people.", "The sign is from ontario, canada which is the birthplace of only one of the options. the others were born in london or the united states.", "Kelly was born."], "image": "train2014/COCO_train2014_000000234839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429836, "question_id": "JQgwuPfighQmRzBZvGbzLN", "question": "What is surrounding the field?", "choices": ["scarecrows", "football fans", "corn", "baseball fans"], "correct_choice_idx": 3, "direct_answers": ["people", "bleachers", "seats", "fans", "audience", "fans", "seating", "fence", "baseball fans", "fans"], "difficult_direct_answer": false, "rationales": ["The field is covered by fans.", "There are a bunch of people watching in the stands.", "The baseball field is surrounded by seats where the baseball fans sit and watch."], "image": "val2014/COCO_val2014_000000429836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468061, "question_id": "JQp2X93bg72h49VicDxwGj", "question": "What color is the face of the elephant who is surfacing out of the rock enclosed pit?", "choices": ["blue", "yellow", "green", "pink"], "correct_choice_idx": 3, "direct_answers": ["brown", "pink", "black", "black", "black", "black", "pink", "black", "black", "bronze"], "difficult_direct_answer": false, "rationales": ["There is a pink color on the front of the elephant's face.", "The color is pink.", "The color is pink."], "image": "train2014/COCO_train2014_000000468061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290332, "question_id": "JVdjqx4jToJxZZ5QmdnwHj", "question": "What song applies to this scene?", "choices": ["aeroplane", "eggman", "mountain song", "yellow submarine"], "correct_choice_idx": 2, "direct_answers": ["snow", "round mountain", "climbevery mountain", "down mountain", "mountain song", "winter wonderland", "na", "winter wonderland", "snow", "cold"], "difficult_direct_answer": false, "rationales": ["The song is a mountain song.", "There are snow-covered mountains in the background.", "The people are skiing. there is a tall rocky hill in the background."], "image": "train2014/COCO_train2014_000000290332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75982, "question_id": "JW4cF8dT64UztJTtwF23cB", "question": "What color are the main stripes on the man who has just hit the tennis ball?", "choices": ["yellow", "white", "black", "red"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "blue", "red", "white", "white", "white", "black", "white", "red"], "difficult_direct_answer": false, "rationales": ["The guy closest to us is wearing black shorts and white shirt with black stripes. he has a headband and a tennis racket in hand.", "The shoes are grey with red on them.", "The color is red."], "image": "train2014/COCO_train2014_000000075982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282142, "question_id": "JWECih6MCChjmFJkijw2ub", "question": "What do the three entities have in common?", "choices": ["blonde hair", "gills", "feathers", "wings"], "correct_choice_idx": 0, "direct_answers": ["eyes", "alive", "hair", "friendly", "blonde hair", "living", "blonde hair", "yellow hair", "person horse", "kids"], "difficult_direct_answer": true, "rationales": ["The hair that are covering the three are all the same color.", "They're all blonde.", "All three things have blonde hair. the girls have blonde hair and the horse matches."], "image": "train2014/COCO_train2014_000000282142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399195, "question_id": "JWNBZfzUyHFHfFZ3yFmzKb", "question": "What does the traffic sign in front of the large truck indicate?", "choices": ["stop", "dead end", "no turn", "one way"], "correct_choice_idx": 2, "direct_answers": ["no turn", "move over", "no left", "no right", "no turn", "no turn", "no turn", "road blocked", "caution", "no left"], "difficult_direct_answer": false, "rationales": ["Specifically, the sign means no right one.", "The sign says no turn.", "The line going forward and then turning right indicates the direction. the red circle and slash is universal for being forbidden."], "image": "train2014/COCO_train2014_000000399195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245642, "question_id": "JWsooaqbJoie8uWFU8UYLa", "question": "What is behind the animals?", "choices": ["bicycle", "telephone pole", "airplane", "car"], "correct_choice_idx": 0, "direct_answers": ["person", "bicycle rider", "bicycle", "cyclists", "bicyclist", "bike", "bike", "bike", "bike", "bike"], "difficult_direct_answer": false, "rationales": ["There is a thin wheel in the front and handlebars at the top", "A bicycle is behind the animals because you can see the handlebars and the hands on it", "There are handles and a thin wheel near the animals."], "image": "val2014/COCO_val2014_000000245642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576564, "question_id": "JXeMJxvi5QrGSCedD8LMmz", "question": "How many black cows are standing up in the middle of the pasture?", "choices": ["four", "two", "five", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is a group of three dark cows standing in the grass.", "There are three cows.", "There are 3."], "image": "val2014/COCO_val2014_000000576564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452623, "question_id": "JYL9DFpkim5wRwtuavJpLe", "question": "What does the person holding the food have on?", "choices": ["cowboy boots", "diamond bracelet", "wristband", "crown"], "correct_choice_idx": 2, "direct_answers": ["bracelet", "plate", "plate", "medical bracelet", "food", "wristband", "wrist band", "bracelet", "yellow bracelet", "wristband"], "difficult_direct_answer": false, "rationales": ["The person has a wristband.", "The person has a wristband.", "They seem to be at an event which often requires wristbands."], "image": "val2014/COCO_val2014_000000452623.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474105, "question_id": "JYgDs5YV3uRziF2U8mwRsD", "question": "How many portraits are hung on the striped walls of this hotel unit?", "choices": ["four", "one", "three", "two"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is a picture but no portraits", "There is a single artwork on the wall to the left of the bed.", "Only one piece of art is visible on the walls."], "image": "train2014/COCO_train2014_000000474105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285696, "question_id": "JaZ7TpGdb35x6gCPgrhkjX", "question": "What is the plane hovering over?", "choices": ["ladder", "cat", "baby carriage", "boat"], "correct_choice_idx": 3, "direct_answers": ["water", "take up", "lake", "ship", "bridge", "water", "river", "boat", "boat", "boat"], "difficult_direct_answer": false, "rationales": ["The plane is clearly visible in the image and by looking directly under it, answer a is seen.", "The plane is by a boat.", "This plane is flying right over the boat."], "image": "train2014/COCO_train2014_000000285696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503051, "question_id": "JbThZeTBy8X7xw22qAcCHu", "question": "What color are the straps wrapping up the black duffel on the luggage rack?", "choices": ["beige", "purple", "orange", "blue"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "red", "pink", "red", "purple", "maroon", "red/black", "red"], "difficult_direct_answer": false, "rationales": ["It's the only closest option since they're more of a red or burgandy.", "The straps are a hue between red and blue.", "The color is purple."], "image": "val2014/COCO_val2014_000000503051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153997, "question_id": "Jee5qhKUAsWw7c2GuTjfdj", "question": "How many potatoes around on the blue plate?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3 potatoes.", "Two potatoes are below the carrots. an additional one is above the carrots.", "Three round, brown objects are in a dish to eat."], "image": "train2014/COCO_train2014_000000153997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215231, "question_id": "Jeiw3JEZfKCBVU2kksGAo9", "question": "What word is related to the type of bananas these are?", "choices": ["roasted", "invisible", "global warming", "chopped"], "correct_choice_idx": 2, "direct_answers": ["nothing", "global warming", "organic", "fresh banana", "organic", "organic", "organic", "organic", "organic", "cavendish"], "difficult_direct_answer": false, "rationales": ["These fruits like to grow among the tropical warm weather.", "The word is global warming.", "The bananas are reduced to environmental pollution and global warming."], "image": "train2014/COCO_train2014_000000215231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69340, "question_id": "JiB5HuvmPvNtsyPpJ7efR4", "question": "What type of animals are shown on the lowest shelf to the right of the sink?", "choices": ["gorillas", "dogs", "giraffes", "elephants"], "correct_choice_idx": 3, "direct_answers": ["frog", "bear", "cat", "bears", "elephant", "elephants", "cat", "bear", "cat", "bears"], "difficult_direct_answer": false, "rationales": ["Small statues of animals with long trunks are on a kitchen shelf.", "The given animals are locatable based on the question text and have the dining features visible of answer a.", "The little objects are elephants."], "image": "val2014/COCO_val2014_000000069340.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184531, "question_id": "Jk5qkC9UMSDsURYSWHfMcc", "question": "What is the likely number of wheels attached to the motorbike in the forefront of this lot?", "choices": ["one", "two", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["1214", "three", "three", "three", "red", "three", "three", "three", "one", "one"], "difficult_direct_answer": false, "rationales": ["There are three wheels.", "It has a sidecar with an extra wheel.", "There is an extra wheel out front and two in the back."], "image": "val2014/COCO_val2014_000000184531.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179390, "question_id": "Jkafrba7vFmEqQ5EfU73QY", "question": "How many cows are standing in the middle of this pasture with cut horns?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 3, "direct_answers": ["two", "four", "four", "three", "three cows", "two", "four", "four", "two", "four"], "difficult_direct_answer": false, "rationales": ["There are a group of cows in a pasture and none have horns.", "There are four cows standing in the field.", "There are 4."], "image": "train2014/COCO_train2014_000000179390.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328064, "question_id": "JkgHaXE5hTQFV4UssjqaJv", "question": "What does it look like these girls are?", "choices": ["different genders", "different parents", "twins", "different races"], "correct_choice_idx": 2, "direct_answers": ["dolls", "twins", "twins", "twins", "play", "children", "happy", "twins", "twins", "toddlers"], "difficult_direct_answer": false, "rationales": ["They are dressed alike", "The girls are twins.", "The girls are of the approximate same age and look similar to each other which would be consistent with answer a and none of the other answers."], "image": "train2014/COCO_train2014_000000328064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308678, "question_id": "JmekBAJ2n7nUAmjzgcHp6q", "question": "This vehicle would most likely appear on what show?", "choices": ["baywatch", "wmac masters", "judge judy", "dr phil"], "correct_choice_idx": 0, "direct_answers": ["boat", "baywatch", "baywatch", "boat", "boat", "travel", "watching", "american show", "csi miami", "coast guard"], "difficult_direct_answer": false, "rationales": ["It is search and rescue.", "The vehicle is a boat and answer a is the only answer on the list that takes place near bodies of water where a boat might commonly be used.", "This is a rescue boat and baywatch was about lifeguards"], "image": "val2014/COCO_val2014_000000308678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267788, "question_id": "JoEQGCnfhetA8Q8dXaYrCT", "question": "What color are the banana skins at the bottom of the wastebasket?", "choices": ["black", "yellow", "brown", "green"], "correct_choice_idx": 0, "direct_answers": ["black", "brown", "yellow", "brown", "brown", "brown", "black", "black", "brown", "black"], "difficult_direct_answer": false, "rationales": ["The skins of the bananas are black. they have changed color.", "The banana skins are decaying, and are a very dark color.", "They are past ripeness and on the way to rotten"], "image": "train2014/COCO_train2014_000000267788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544994, "question_id": "Jp83CTQCQXNHWdASzevgza", "question": "The game being played looks like it belongs on what system according to the graphics?", "choices": ["playstation 5", "playstation 4", "xbox one", "atari"], "correct_choice_idx": 3, "direct_answers": ["atari", "sega genesis", "wii", "atari", "atari", "tv", "video game", "nintendo wii", "wii", "wii"], "difficult_direct_answer": false, "rationales": ["The game is atari.", "The game space invaders looks to be played on the atari.", "A game controller has a brand logo on top."], "image": "train2014/COCO_train2014_000000544994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507317, "question_id": "JpcS9C73L7tDbzkqhhNLGB", "question": "Who would work here?", "choices": ["fire fighter", "curator", "chef", "clown"], "correct_choice_idx": 1, "direct_answers": ["security guards", "musium", "historian", "curator", "man", "docent", "museum worker", "selling", "man", "sold"], "difficult_direct_answer": true, "rationales": ["A curator might work at the museum.", "The museum curator would work here.", "This is a museum and that is the name of the experts who work there"], "image": "val2014/COCO_val2014_000000507317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501307, "question_id": "Jpjkm2qrsgk6PbZnqBmHxd", "question": "What food is stacked up?", "choices": ["hot dog", "apple", "hamburger", "banana"], "correct_choice_idx": 3, "direct_answers": ["bananas", "bananas", "bananas", "bananas", "banana", "bananas", "bananas", "bananas", "banana", "bananas"], "difficult_direct_answer": false, "rationales": ["The fruits are tall and yellow.", "The food that has been stacked on the table are bunches of yellow bananas.", "These are long and yellow"], "image": "train2014/COCO_train2014_000000501307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526394, "question_id": "Jpyz38QcAcJ8cmRFqWirU3", "question": "What meal is this?", "choices": ["dinner", "desert", "lunch", "breakfast"], "correct_choice_idx": 1, "direct_answers": ["cake", "breakfast", "desert", "dessert", "dessert", "dessert", "dessert", "cake", "dessert", "dessert"], "difficult_direct_answer": false, "rationales": ["That is like a cheesecake with chocolate syrup and caramel.", "The food is sugar heavy and light, typically served after a heavier meal.", "The meal is dessert."], "image": "val2014/COCO_val2014_000000526394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312, "question_id": "Jq7ecNg52XuJWWX2iSJwRa", "question": "What is the elephant in the middle helping to cross the road?", "choices": ["baby elephant", "cars", "goose", "duckling"], "correct_choice_idx": 0, "direct_answers": ["young elephant", "baby", "calf", "baby elephant", "travel", "his baby", "baby", "stop", "baby", "walk"], "difficult_direct_answer": false, "rationales": ["The elephant is a baby.", "This one is much smaller than the rest", "The elephant is much smaller than the rest which would be consistent with a growth pattern from childhood to adulthood."], "image": "train2014/COCO_train2014_000000000312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33813, "question_id": "JqS78ePcqF9kSQ5LAGs5r8", "question": "What is a group of these abundant items called?", "choices": ["clowder", "bushel", "fleet", "squad"], "correct_choice_idx": 2, "direct_answers": ["boat", "flotilla", "boats", "sailboats", "boats", "fleet", "boats", "boat", "flotilla", "boats"], "difficult_direct_answer": false, "rationales": ["All the boats grouped together is a fleet. there has to be many boat for a fleet.", "There are many boats.", "The group is a fleet."], "image": "train2014/COCO_train2014_000000033813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152079, "question_id": "JtpfDAUE9V8PqgmhSgF8aB", "question": "Where did the bears come from?", "choices": ["ups", "santa", "storks", "strangers"], "correct_choice_idx": 1, "direct_answers": ["toy shop", "store", "santa", "toy store", "santa clause", "store", "store", "house", "home", "toy store"], "difficult_direct_answer": false, "rationales": ["These bears come from santa claus.", "A child is surrounded by plush bears and christmas decorations can seen.", "This is during the holiday season and there is a christmas tree on the side."], "image": "train2014/COCO_train2014_000000152079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410554, "question_id": "JuMu5SWTuR4h5FnpZvGM6T", "question": "What is the man wearing?", "choices": ["garbage bag", "tie", "gas mask", "glasses"], "correct_choice_idx": 3, "direct_answers": ["checkered shirt", "flannel shirt", "flannel", "plaid shirt", "glasses", "shirt", "glasses", "ring", "jacket", "flannel shirt"], "difficult_direct_answer": false, "rationales": ["The man has glasses.", "The man in the kitchen is wearing eyeglasses on his face.", "Glasses are typically worn on a person's face."], "image": "val2014/COCO_val2014_000000410554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35972, "question_id": "JusQFKSxAzJjbqqNwc5Mr8", "question": "What profession utilizes the red item in the foreground?", "choices": ["fire fighter", "baker", "butcher", "drill sergeant"], "correct_choice_idx": 0, "direct_answers": ["firefighters", "metal", "fire fighter", "firefighter", "fireman", "firefighter", "firefighters", "yes", "firefighter", "firemen"], "difficult_direct_answer": false, "rationales": ["Hydrants give water which fire fighters use.", "The firefighter uses it.", "The other options don't use this object to fight fires."], "image": "train2014/COCO_train2014_000000035972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127153, "question_id": "JuzefTKEJ5ncgMt3p7yXa7", "question": "What color is the umbrella held by the woman who is walking on the left side of the zebra stripes?", "choices": ["pink", "white", "red", "black"], "correct_choice_idx": 1, "direct_answers": ["white", "pink", "pink", "pink", "pink", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The other person is holding a black umbrella. she is holding a different colored one, and there are no pink or red umbrellas.", "It's the same color as the sidewalk stripes", "There is no color."], "image": "val2014/COCO_val2014_000000127153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222635, "question_id": "JwT7sjXHdSZxgsgmPHu2pu", "question": "How many objects are hung in a chain off of the right side of the fireplace?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["One object is hanging in between two others.", "There are three objects.", "A living room has a fireplace with a string of objects hung from one side."], "image": "val2014/COCO_val2014_000000222635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520109, "question_id": "JxrMHUWSpu6GnkuHksHDko", "question": "How many colors of tile are there on the park ground?", "choices": ["four", "five", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["four", "three", "five", "six", "two", "three", "two", "seven", "two", "six"], "difficult_direct_answer": false, "rationales": ["The squares are all red or blue", "Red and blue are available.", "There are two colors of tile."], "image": "val2014/COCO_val2014_000000520109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 996, "question_id": "Jz8NnUYHhQTwHtDetpK84F", "question": "What does the animal's coat resemble?", "choices": ["watermelon", "cherry", "spoiled banana", "lime"], "correct_choice_idx": 2, "direct_answers": ["spots", "spoiled banana", "cheetah", "hair", "leopard", "spots", "spots", "leopard", "color pattern", "giraffe"], "difficult_direct_answer": false, "rationales": ["The coat looks like a brown banana.", "Because the same color is observed when a banana is rotting.", "The coat is like a banana."], "image": "train2014/COCO_train2014_000000000996.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385577, "question_id": "JzmsiLGp3yt4sz8SvxbamY", "question": "What is pulling the vehicle?", "choices": ["camel", "ox", "horse", "car"], "correct_choice_idx": 1, "direct_answers": ["cow", "oxen", "oxen", "ox", "cow", "ox", "cow", "oxen", "wagon", "yak"], "difficult_direct_answer": false, "rationales": ["An ox is pulling the cart.", "The ox is pulling.", "A cow-like animal is pulling the vehicle."], "image": "train2014/COCO_train2014_000000385577.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564437, "question_id": "K2ayjbwEWhbokbvRGfDBNX", "question": "What is hanging on the belly of the sheep in the middle?", "choices": ["bell", "udder", "gong", "saddle"], "correct_choice_idx": 1, "direct_answers": ["udder", "breast", "utter", "testis", "baby", "udder", "gentiles", "testes", "testicles", "udder"], "difficult_direct_answer": false, "rationales": ["The female has a milk-filled mammary gland.", "This is the milk sack for feeding babies", "The hanging object is the sheep's mammary gland."], "image": "train2014/COCO_train2014_000000564437.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385864, "question_id": "K3yxNbPVv6ruHsKLZj5hSn", "question": "What kind of fruit is in the bottom right corner of this fruit crate?", "choices": ["apple", "banana", "orange", "kiwi"], "correct_choice_idx": 0, "direct_answers": ["mango", "banana", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple"], "difficult_direct_answer": false, "rationales": ["The fruit at the bottom right part is red and gold and is round.", "The fruit on the bottom right corner is round and red and has a stem.", "This is obvious in the scene. the other options aren't near that corner."], "image": "train2014/COCO_train2014_000000385864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384573, "question_id": "K4nJByAWAJr9DDAXb4fHVx", "question": "Who would the child be more likely to admire?", "choices": ["pete alonso", "pele", "wayne gretzky", "tiger woods"], "correct_choice_idx": 0, "direct_answers": ["hiting", "baseball player", "ok", "basket ball", "none self", "hit", "pete alonso", "baseball player", "athlete", "his dad"], "difficult_direct_answer": true, "rationales": ["The child likes baseball players.", "The kid likes alonso.", "The boy is playing baseball, not soccer, hockey, or golf."], "image": "train2014/COCO_train2014_000000384573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19957, "question_id": "K7i8fJdqvd6FKWse9kvJF6", "question": "What is the white topped food on the plate?", "choices": ["pizza", "ice cream", "cake", "funnel cake"], "correct_choice_idx": 3, "direct_answers": ["funnel cake", "powdered sugar", "funnel cake", "funnel cake", "pizza", "powdered sugar", "funnel cake", "funnel cake", "pizza", "powdered sugar"], "difficult_direct_answer": false, "rationales": ["There is a funnel cake with powdered sugar on top.", "The funnel cake is topped.", "The food in question is of a size and texture as well as being served in a manner consistent with answer a."], "image": "val2014/COCO_val2014_000000019957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370120, "question_id": "K8HoWfEGXP7UfG9MB54U6G", "question": "What is an important phrase in this activity?", "choices": ["homerun", "serve", "check mate", "high dive"], "correct_choice_idx": 1, "direct_answers": ["love", "tennis", "tennis", "shush", "playing tennis", "smash it", "serving", "serve", "tennis", "serve"], "difficult_direct_answer": false, "rationales": ["The person is playing tennis, not baseball, chess, or diving.", "The other options don't apply to tennis. b is baseball, c is chess and d is swimming.", "The man is playing tennis and one of the most important moves in tennis is serving the ball."], "image": "train2014/COCO_train2014_000000370120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175570, "question_id": "K8MAQrktxoj9Z4C6TyZ68V", "question": "What color is the red stripe going around the lateral center of the bus?", "choices": ["green", "black", "red", "blue"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "danger", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The question is answered within the text of the question.", "It's the color of an apple", "The bus has a thick red stripe painted across the center of the entire bus."], "image": "val2014/COCO_val2014_000000175570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175570, "question_id": "K99YQrJZfPLBsEpZwALwK2", "question": "Which country is this bus turning at the intersection of?", "choices": ["australia", "thailand", "japan", "china"], "correct_choice_idx": 2, "direct_answers": ["china", "japan", "japan", "japan", "china", "japan", "china", "asian", "japan", "china"], "difficult_direct_answer": false, "rationales": ["The letters on the bus and building are in japanese.", "There is japanese text on the bus.", "There is japanese writing on signs"], "image": "val2014/COCO_val2014_000000175570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95257, "question_id": "KA7wWtMw8WVTooUTktgtc3", "question": "What is the artwork on the skin of the girl in this image called?", "choices": ["stories", "tattoos", "transformers", "paintings"], "correct_choice_idx": 1, "direct_answers": ["tattoos", "tattoo", "tattoo", "tattoo", "tattoos", "tattoo", "tattoo", "tatto", "tattoo", "dwdwdw"], "difficult_direct_answer": false, "rationales": ["Tattoos are artwork on the skin.", "She has ink on her back.", "The artwork is needled with ink into skin."], "image": "train2014/COCO_train2014_000000095257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534189, "question_id": "KAApaWYXmcyZvr7LEVgSYA", "question": "What century of advancement might this boat belong to?", "choices": ["20th", "19th", "21st", "18th"], "correct_choice_idx": 3, "direct_answers": ["18th", "eighteenth", "1800", "nineteenth", "1980", "18th", "twentieth", "nineteenth", "1900s", "20th century"], "difficult_direct_answer": false, "rationales": ["The century is the 18th.", "These old boats could have hailed from the 18th century.", "The century is the 18th."], "image": "train2014/COCO_train2014_000000534189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448983, "question_id": "KAJh4d9pm4rH9c3yjwZGka", "question": "How many steeples form the front of this church building?", "choices": ["five", "six", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "one", "three", "three", "four", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The church has three large steeples built on the top of the roof.", "The center steeple is flanked by two others.", "There are 3."], "image": "val2014/COCO_val2014_000000448983.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174103, "question_id": "KCbUnZzhqpyKgfmj6ntoKi", "question": "What activity are the boards lined up against the building used for?", "choices": ["football", "skiing", "soccer", "snowboarding"], "correct_choice_idx": 3, "direct_answers": ["surfing", "skateboarding", "snow board", "snowboarding", "surf", "snowboarding", "snowboarding", "snowboarding", "snowboarding", "snowboarding"], "difficult_direct_answer": false, "rationales": ["This is similar to skiing but only one board is used", "The activity is snowboarding.", "These are used for snow."], "image": "val2014/COCO_val2014_000000174103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 343984, "question_id": "KCmSVXTyEoB8QKnoE32RDg", "question": "What color is the underblanket for the saddle on this horse's back?", "choices": ["blue", "red", "green", "purple"], "correct_choice_idx": 1, "direct_answers": ["black", "brown", "red", "red", "black", "black", "dwdw", "red", "brown", "black"], "difficult_direct_answer": false, "rationales": ["The color is red.", "It's the color of an apple", "The saddle has a red side."], "image": "train2014/COCO_train2014_000000343984.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36548, "question_id": "KDuXnxxS2XaALyx52xGXbC", "question": "What is this motorcycle designed to do?", "choices": ["pull trailer", "drive fast", "dirt racing", "jump high"], "correct_choice_idx": 1, "direct_answers": ["travel", "carry passenger", "drive fast", "race", "drive", "race", "race", "run fast", "race", "ride"], "difficult_direct_answer": false, "rationales": ["The bike drives fast.", "A motorcycle is parked on the street. the motorcycle is streamlined and is similar to a racing bike.", "This is a yamaha sport motorcycle. it is not powerful enough to pull trailers and is not designed for jumping or dirt racing."], "image": "train2014/COCO_train2014_000000036548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432053, "question_id": "KEBHdx34XUXyJd8gFNtwLd", "question": "What are the bananas stored in?", "choices": ["bowl", "cage", "box", "tupperware"], "correct_choice_idx": 0, "direct_answers": ["bowl", "fruit bowl", "dish", "bowl", "bowl", "bowl", "bowl", "round dish", "bowl", "bowl"], "difficult_direct_answer": false, "rationales": ["The container of the bananas is clearly visible and is a rounded container that slopes upward at the edges.", "Fruit is in a bowl on a table.", "They are for decoration."], "image": "train2014/COCO_train2014_000000432053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157155, "question_id": "KEyEg5uiTLzSoA25kGDcre", "question": "What is wrapped around the horse's ankles?", "choices": ["ribbons", "reflective tape", "bells", "flowers"], "correct_choice_idx": 1, "direct_answers": ["lights", "bands", "lights", "lights", "lights", "reflective tape", "reflectors", "reflectors", "lights", "reflectors"], "difficult_direct_answer": false, "rationales": ["There is some sort of adhesive material that is holding it around hooves.", "The tape is wrapped.", "Reflective tape is often used so people in vehicles can see the horse if it's too dark."], "image": "val2014/COCO_val2014_000000157155.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484401, "question_id": "KGXEskknUEHHMaZ9QoWCgM", "question": "Where does the door lead to?", "choices": ["kitchen", "shed", "outside", "bedroom"], "correct_choice_idx": 2, "direct_answers": ["outside", "bedroom", "another room", "room", "bedroom", "room", "outside", "outside", "room", "bedroom"], "difficult_direct_answer": false, "rationales": ["The door goes outside.", "The very bright light is coming from behind the door which the cat came from.", "The door goes outside."], "image": "train2014/COCO_train2014_000000484401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377999, "question_id": "KGsXTXoSF6rXjDQSTxtemZ", "question": "What is the number on the sailboat?", "choices": ["758", "103", "862", "210"], "correct_choice_idx": 1, "direct_answers": ["103", "103", "103", "103", "103", "103", "103", "103", "103", "103"], "difficult_direct_answer": false, "rationales": ["The number is on the right sail.", "103 is written on the sail of the boat", "The number is 103."], "image": "val2014/COCO_val2014_000000377999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304391, "question_id": "KJpgBKZwoHqS5xmoprDcBW", "question": "What is the small white appliance?", "choices": ["stove", "refrigerator", "dishwasher", "clothes washer"], "correct_choice_idx": 3, "direct_answers": ["microwave oven", "microwave", "washing machine", "bowl", "washer", "washer", "clothes washer", "microwave", "microwave", "washer"], "difficult_direct_answer": false, "rationales": ["This has a round circle door on the front for clothes", "The small white appliance next to the refrigerator is used for washing clothes.", "There is a washing maching."], "image": "train2014/COCO_train2014_000000304391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371735, "question_id": "KJviDphxqcXdZNUCS6UeSu", "question": "Which type of tennis hit is the man in this picture about to do?", "choices": ["backhand", "dropshot", "serve", "forehand"], "correct_choice_idx": 2, "direct_answers": ["serve", "serve", "serve", "serve", "serve", "serve", "serve", "serve", "serve", "single"], "difficult_direct_answer": false, "rationales": ["The tennis is served.", "The man is about to serve tennis.", "The person is about to throw the ball."], "image": "train2014/COCO_train2014_000000371735.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94664, "question_id": "KK8ommiArWJQ224J7jQ4Qm", "question": "What activity does the man on the left do instead of the man on the right?", "choices": ["row", "paddle", "swim", "surf"], "correct_choice_idx": 3, "direct_answers": ["surfing", "surf", "diving", "surf", "ok", "surfing", "surf", "surf", "surf", "surf"], "difficult_direct_answer": false, "rationales": ["The man is surfing in the water.", "The activity is surfing.", "The man on the left, unlike the one on the right, has a board."], "image": "train2014/COCO_train2014_000000094664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506232, "question_id": "KKbhSSzR8CDwVBoyqQu5wj", "question": "What term applies to this support?", "choices": ["skate save", "check mate", "homerun", "backhand"], "correct_choice_idx": 3, "direct_answers": ["tennis", "tennis", "tennis", "tennis", "tennis", "clay court", "clay", "backhand", "tennis", "tennis"], "difficult_direct_answer": false, "rationales": ["The man with the tennis raquet is swinging a backhand move in order to hit the ball over the net.", "The term is backhand.", "The term is backhand."], "image": "train2014/COCO_train2014_000000506232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306837, "question_id": "KLbibZL6Kns9gh3kvLBRYD", "question": "How many zebras are in this part of the zoo consuming grass?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "one", "100", "one", "two", "two", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["One zebra is grazing behind two others.", "There are three grazing.", "There are 3."], "image": "train2014/COCO_train2014_000000306837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198057, "question_id": "KMpCWkgrFMRhdAJJtwAAKp", "question": "What color is the crest of the bird underneath of his neck?", "choices": ["yellow", "brown", "green", "blue"], "correct_choice_idx": 0, "direct_answers": ["white", "yellow", "yellow", "yellow", "tellow", "orange", "white", "brown", "pink", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "The color is yellow.", "The feathers are red."], "image": "train2014/COCO_train2014_000000198057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35972, "question_id": "KP4RRfGTumNfjFYRjhdW3Q", "question": "What is the safety device in the foreground used to help defeat?", "choices": ["getaway cars", "paper cuts", "fires", "vulture infestation"], "correct_choice_idx": 2, "direct_answers": ["fire hydrant", "water", "fires", "theft", "fire", "fire", "fire", "fire", "fires", "metal"], "difficult_direct_answer": false, "rationales": ["These hydrants are used by firefighters.", "The hydrant is for fires.", "The device is for fires."], "image": "train2014/COCO_train2014_000000035972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288673, "question_id": "KSZrKQqMGGDvnQUJ4wsu2m", "question": "What color are the wingtips of the kite flown above the tropical beach?", "choices": ["blue", "white", "purple", "yellow"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "yellow green", "blue", "green", "green", "blue", "blue", "navy blue", "blue yellow"], "difficult_direct_answer": false, "rationales": ["A kite is in the air that is multi colored and has blue on the tips.", "The color is blue.", "The kite flying at the beach has blue wingtips on it."], "image": "val2014/COCO_val2014_000000288673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471409, "question_id": "KScLSJiqfog4yKFw9ebQEc", "question": "What country of origin is the beer cutout on the wall behind the man in the black and white shirt?", "choices": ["belgium", "usa", "mexico", "uk"], "correct_choice_idx": 2, "direct_answers": ["england", "italy", "britain", "mexico", "pizza", "united states", "mexico", "mexico", "japan", "roops"], "difficult_direct_answer": false, "rationales": ["There is a picture of corona beer on the wall which is imported from mexico.", "The sign on the wall is for corona beer, which is a mexican beer.", "The country of origin of the beer is mexico because the beer is corona"], "image": "train2014/COCO_train2014_000000471409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127911, "question_id": "KScToenAAUhKEvN5CbmH5z", "question": "What are the tallest items here used for?", "choices": ["ivory", "meat", "wool", "lumber"], "correct_choice_idx": 3, "direct_answers": ["wood", "trees", "shade", "lumber", "protection", "lumber", "shade", "shade", "provide light", "stop"], "difficult_direct_answer": false, "rationales": ["They are trees. trees are cut up into pieces and used to build houses.", "The tallest item that is visible here are trees. they can be cut down to build things.", "Two women walk along a path with very tall trees lining both sides."], "image": "val2014/COCO_val2014_000000127911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1103, "question_id": "KSrCmCgKzWCZgeSLkohQrr", "question": "What are the birds doing near the edge of the water?", "choices": ["swimming", "flying", "diving", "walking"], "correct_choice_idx": 3, "direct_answers": ["bird", "running", "swimming", "feeding", "running", "running", "running", "running", "walking", "walking"], "difficult_direct_answer": false, "rationales": ["The birds are near but not in the water, so they are not swimming or diving. they are on the ground, so they are not flying.", "The birds are walking along the beach.", "The way the legs are positioned indicates the birds are performing the action described in option a."], "image": "val2014/COCO_val2014_000000001103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230275, "question_id": "KTHBFepGWyvRGF5mhCr6Df", "question": "What is the purpose of the black netting in front of the stands?", "choices": ["your vision", "less sound", "rain", "ball protection"], "correct_choice_idx": 3, "direct_answers": ["safety", "protect fans", "avoid balls", "ball protection", "block ball", "protective netting", "protect fans", "safety", "catch balls", "safety"], "difficult_direct_answer": false, "rationales": ["The black netting is hung between the field and the audience so no one gets hit by a ball.", "The purpose is ball protection.", "The purpose of the black netting in front of the stands is protection from flying balls."], "image": "train2014/COCO_train2014_000000230275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359128, "question_id": "KUVwbRLj9t3LLq9J7LmvYk", "question": "What is on the left hand side of the room?", "choices": ["dog", "door", "frog", "apple"], "correct_choice_idx": 1, "direct_answers": ["door", "door", "door", "door", "door", "door", "door", "door", "door", "closet"], "difficult_direct_answer": false, "rationales": ["The side is a door.", "There is an open door on the left hand side of the room for the closet.", "The door is on the side of the room for entering and exiting."], "image": "val2014/COCO_val2014_000000359128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24571, "question_id": "KVms93Tgu9Jb7TD6WeETck", "question": "What color is the red helmet worn by the batter who is getting ready to swing?", "choices": ["red", "green", "purple", "blue"], "correct_choice_idx": 0, "direct_answers": ["ball", "maroon", "red", "blue helmet", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The baseball player at bat is wearing a shiny red helmet to match his uniform.", "The batter's helmet is not blue, green, or purple.", "This is obvious based on the color."], "image": "train2014/COCO_train2014_000000024571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562559, "question_id": "KWBV5tz9oZBX6grKPaMdTN", "question": "What is performing a pincer maneuver on the meat?", "choices": ["cats", "broccoli", "carrots", "dogs"], "correct_choice_idx": 1, "direct_answers": ["broccoli", "taste", "dfsf", "sauce", "stop", "spoon", "car", "fat", "broccoli", "nobody"], "difficult_direct_answer": true, "rationales": ["The broccoli is positioned over the meat.", "There are two pieces of broccoli on either side.", "The broccoli is there."], "image": "train2014/COCO_train2014_000000562559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187647, "question_id": "KWNq3c4FncQL6o3bToYMvB", "question": "What kind of candy bar is hanging on the leg of the teddy with a hoodie sweater put on?", "choices": ["kit kat", "hersheys", "payday", "mars"], "correct_choice_idx": 1, "direct_answers": ["hersheys", "hershey", "kitkat", "hersheys", "chocolate bar", "hershey's", "chocolate bar", "black", "hershey", "chocolate"], "difficult_direct_answer": false, "rationales": ["The candy is hershey's.", "There is a hershey's chocolate bar sitting on the leg of one of the two bears.", "The candy is hershey's."], "image": "val2014/COCO_val2014_000000187647.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95257, "question_id": "KXPpt5VP4LfAEt34NHbjtS", "question": "What color are the shorts of the man carrying a stick and standing between the two elephants in the background?", "choices": ["red", "white", "black", "blue"], "correct_choice_idx": 0, "direct_answers": ["stop", "red", "red", "blue stripes", "red", "red", "orange", "watching", "blue", "black"], "difficult_direct_answer": false, "rationales": ["There is a boy standing on other side of the elephant in the forefront. you can't see his face but he has matching red shirt and shorts.", "A man is in brightly colored shorts and holds a cane.", "He is bright."], "image": "train2014/COCO_train2014_000000095257.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331242, "question_id": "KXXBwSTJ9bEUiRCjgVnZBX", "question": "What is on top of the horse?", "choices": ["bird", "baby", "saddle", "old man"], "correct_choice_idx": 2, "direct_answers": ["saddle", "saddle", "saddle", "saddle", "saddle", "saddle", "saddle", "saddle", "belt", "saddle"], "difficult_direct_answer": false, "rationales": ["The horse has a saddle.", "The saddle is on top.", "There are no animals or people on top of the horse."], "image": "val2014/COCO_val2014_000000331242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284375, "question_id": "KYFQP8MVCAAcNMfHWqUngm", "question": "What is the computer most at risk of?", "choices": ["flood hazard", "hail hazard", "fire hazard", "lightning hazard"], "correct_choice_idx": 2, "direct_answers": ["virus", "water damage", "drink", "juice", "spill", "spill", "overheating", "fire hazard", "table", "dying"], "difficult_direct_answer": true, "rationales": ["The computer is most at risk of fire because of all of the wires.", "A laptop is an electronic device and can heat up. it cause cause fires if it overheats.", "The cords are all tangled up together"], "image": "val2014/COCO_val2014_000000284375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21097, "question_id": "KYnb9vud5Arb97CDTtbudK", "question": "What kind of soft drink is at the side of this kale salad?", "choices": ["sierra mist", "diet pepsi", "coke zero", "mountain dew"], "correct_choice_idx": 1, "direct_answers": ["sprite", "pepsi", "diet pepsi", "soda", "pepsi", "pepsi", "diet pepsi", "pepsi", "diet pepsi", "cock"], "difficult_direct_answer": false, "rationales": ["There is a diet pepsi soft drink next to the salad.", "The soft drink in the can next to the dish is a diet pepsi. diet pepsis are sold in silver cans.", "A can of soda that is silver is on a table."], "image": "train2014/COCO_train2014_000000021097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474732, "question_id": "KYwcY5Jw5BYw5nr2WrJfkY", "question": "What is rolled up in the basket?", "choices": ["washcloths", "tissues", "bathroom cleaners", "underwear"], "correct_choice_idx": 0, "direct_answers": ["washcloth", "tissue paper", "washcloth", "wash cloth", "washcloths", "paper towels", "washcloths", "washcloths", "towel", "towels"], "difficult_direct_answer": false, "rationales": ["Based on the visible material, size and their placement next to the sink, answer a is the most logical.", "The washcloth is rolled.", "The basket is in the bathroom next to a sink. the items in option a are typically located in this room to clean one's face or hands."], "image": "train2014/COCO_train2014_000000474732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264686, "question_id": "KZ9mc8cZ5FUfKmwn47Mzbq", "question": "What color is the couch which is positioned perpendicularly with respect to the windows on the side of the wall?", "choices": ["green", "red", "purple", "blue"], "correct_choice_idx": 2, "direct_answers": ["maroon", "purple", "purple", "purple", "green", "red", "maroon", "purple", "purple", "burgundy"], "difficult_direct_answer": false, "rationales": ["The color of the couch is a grape color.", "The couch is purple colored.", "The color is purple."], "image": "val2014/COCO_val2014_000000264686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303163, "question_id": "Kam7V8pZsccZTNd7nhT4YU", "question": "What color is the building adjacent to the large plant factory tower?", "choices": ["blue", "green", "white", "red"], "correct_choice_idx": 0, "direct_answers": ["silver", "blue", "gray", "grey", "blue", "white", "black", "grey", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The reflection of the building mirrors has the color of the sky, making the building look like the sky color.", "The old building looks blue.", "It is a color similar to the sky."], "image": "val2014/COCO_val2014_000000303163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342146, "question_id": "Kcd4zRddVYk29B54748C4F", "question": "What is the man on the left wearing?", "choices": ["jeans", "gas mask", "crown", "basket"], "correct_choice_idx": 0, "direct_answers": ["jeans", "white coat", "lab coat", "coat", "sweater", "dress", "jeans", "stop", "coat", "white jacket"], "difficult_direct_answer": false, "rationales": ["The man has jeans.", "The man on the left is wearing thick denim pants.", "The man on the left is wearing blue jeans made from denim."], "image": "val2014/COCO_val2014_000000342146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26109, "question_id": "Kekd4uvxeVccvHm2Bs9QZw", "question": "What is connected to the long cylinder?", "choices": ["egg", "clock", "gargoyle", "vacuum"], "correct_choice_idx": 1, "direct_answers": ["clock", "clock", "wire", "clock", "clock", "clock", "clock", "clock", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["There is a round part with arms.", "The clock is connected.", "The hands are visible."], "image": "train2014/COCO_train2014_000000026109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100234, "question_id": "KfXXYtH4DNgdQFapfzqfXm", "question": "How many boats are there in total to the right of the production plant?", "choices": ["two", "four", "three", "five"], "correct_choice_idx": 3, "direct_answers": ["four", "six", "four", "four", "two", "two", "five", "four", "five", "six"], "difficult_direct_answer": false, "rationales": ["There are at least five boats to the right of the production plant.", "Three are easy to see then there are two more past the others", "There are five boats."], "image": "val2014/COCO_val2014_000000100234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260039, "question_id": "KfeAdJtHU2QjsHVk89AbvM", "question": "What activity is the video game system simulating?", "choices": ["baseball", "driving", "basketball", "karate"], "correct_choice_idx": 1, "direct_answers": ["driving", "driving", "driving", "dwdw", "driving", "driving", "driving", "driving", "person", "game"], "difficult_direct_answer": false, "rationales": ["The activity is driving.", "The people are pretending they're using steering wheels.", "The activity is driving."], "image": "train2014/COCO_train2014_000000260039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45729, "question_id": "KfqF3jS2PTxt2xh2txr2Yv", "question": "What created the path the people are on?", "choices": ["god", "santa", "troll", "snowplow"], "correct_choice_idx": 3, "direct_answers": ["natural", "snow", "mountains", "snowplow", "people", "machinery", "skiers", "truck", "snow", "snow plow"], "difficult_direct_answer": true, "rationales": ["The snowplow created the path.", "A large flat path has been created in the snow near a mountain.", "The snow was flattened with a vehicle."], "image": "val2014/COCO_val2014_000000045729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205916, "question_id": "KfrejVVWLsnsdqKjmHCCQR", "question": "What is the clock attached to?", "choices": ["tower", "bar stool", "kitchen wall", "arena screen"], "correct_choice_idx": 0, "direct_answers": ["tower", "tower", "tower", "tower", "building", "tower", "tower", "tower", "tower", "building"], "difficult_direct_answer": false, "rationales": ["The clock is outside and is high above the ground.", "The clock has a tower.", "The clock is on a tall structure"], "image": "train2014/COCO_train2014_000000205916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497119, "question_id": "Kg3pHHSU8bEkk89rDcH869", "question": "What is the little dog wearing in the sidecar?", "choices": ["helmet", "scarf", "hat", "tshirt"], "correct_choice_idx": 0, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "dress", "cap", "jacket", "pekkil", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["This is to protect its head in a crash", "It has a small helmet.", "It resambles others."], "image": "train2014/COCO_train2014_000000497119.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424229, "question_id": "KgWcNa7hq2htqz26gfwUFu", "question": "What color is the large cow on the left side of the white cows?", "choices": ["white", "black", "orange", "brown"], "correct_choice_idx": 1, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The cow on the left is not the same color as the white cows. it is not brown or orange.", "On the left side is a black cow.", "The black cow is the only cow to the left of the white cows."], "image": "train2014/COCO_train2014_000000424229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132612, "question_id": "KhrGo6wxhmh6knNoBi6yKf", "question": "Where are these Zebras most likely living together with the giraffes?", "choices": ["zoo", "wild", "house", "conservatory"], "correct_choice_idx": 3, "direct_answers": ["zoo", "zoo", "zoo", "zoo", "zoo", "zebras", "conservatory", "zoo", "zoo", "zoo"], "difficult_direct_answer": false, "rationales": ["There are structures on the land and a road", "The zebras are most likely living gin a conservatory yard with the giraffes.", "Zebras are walking in an area with a tall, flat platform for food near them."], "image": "val2014/COCO_val2014_000000132612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400123, "question_id": "Ki4yjUh67NfrEKLeDA643P", "question": "What is on the opposite wall from the sink?", "choices": ["picture", "towel bar", "shower", "bathtub"], "correct_choice_idx": 1, "direct_answers": ["door", "towel bar", "towel rack", "towel rack", "towel rack", "towel rack", "towel rack", "towel rack", "towel", "towel"], "difficult_direct_answer": false, "rationales": ["The towel bar is opposite.", "An item is visible in the mirror. it is not a picture, shower, or bathtub.", "There is a towel bar with a green towel mounted on the wall near the sink."], "image": "val2014/COCO_val2014_000000400123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196917, "question_id": "KiGv7Qp4zQp69sKEbQUsDu", "question": "What is near the vehicle?", "choices": ["bench", "eagle", "giraffe", "cow"], "correct_choice_idx": 2, "direct_answers": ["zebra", "giraffe", "giraffe", "giraffe", "giraffe", "bus", "giraffe", "giraffe", "giraffe", "giraffe"], "difficult_direct_answer": false, "rationales": ["The giraffe is near.", "The people are on a tour.", "A tall non-flying animal is near the vehicle."], "image": "train2014/COCO_train2014_000000196917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483476, "question_id": "KitcPwgEkkwoSYSSARaTFg", "question": "What are the people singing into?", "choices": ["megaphone", "microphone", "blow dryer", "cellphone"], "correct_choice_idx": 2, "direct_answers": ["phone", "phone", "dryer", "blow dryer", "car", "stop", "hairdryer", "hair dryer", "hairdryer", "hair dryer"], "difficult_direct_answer": false, "rationales": ["They sing into the dryer.", "A blow dryer is usually in the bathroom with a cord.", "The cord of the item is coiled. it is plugged into the wall behind them."], "image": "val2014/COCO_val2014_000000483476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542674, "question_id": "KjSVaA9Xk3VS7U9XQQokeR", "question": "What is the darkest color of the leaves on the trees to the left?", "choices": ["brown", "red", "yellow", "green"], "correct_choice_idx": 1, "direct_answers": ["brown", "red", "black", "red", "reddish", "green", "orange", "brown", "red", "brown"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The leaves are in fall colors with the the darkest being the color of a delicious apple.", "The darkest color is red."], "image": "train2014/COCO_train2014_000000542674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95950, "question_id": "KjuNWvgUPingWYQEkMVF4g", "question": "What is the man taking?", "choices": ["selfie", "karate class", "online course", "bar exam"], "correct_choice_idx": 0, "direct_answers": ["photo", "selfie", "picture", "selfie", "selfie", "selfie", "selfie", "selfie", "selfie", "selfie"], "difficult_direct_answer": false, "rationales": ["He's holding the phone up in front of his face", "The man is holding out a phone which has a camera in it.", "The man takes a selfie."], "image": "train2014/COCO_train2014_000000095950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331085, "question_id": "KkB3Dg3RPgQf8PKaK2Bst8", "question": "What type of land feature is found near the boat in the water?", "choices": ["beach", "bay", "island", "delta"], "correct_choice_idx": 2, "direct_answers": ["island", "island", "island", "island", "island", "handsome", "water", "rocks", "island", "island"], "difficult_direct_answer": false, "rationales": ["There is a lone piece of land in the middle of the water.", "It is a land mass surrounded by water", "The type is an island."], "image": "train2014/COCO_train2014_000000331085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258635, "question_id": "KkCboZKgeuP2ZpJzZXTHbw", "question": "What colorful items are the people holding?", "choices": ["kites", "maracas", "umbrellas", "flags"], "correct_choice_idx": 2, "direct_answers": ["umbrellas", "umbrella", "umbrellas", "umbrellas", "umbrella", "umbrella", "umbrellas", "umbrella", "umbrella", "red blue"], "difficult_direct_answer": false, "rationales": ["It is cloudy and raining. the people are trying to stay dry.", "These provide protection from rain or sun", "People are holding objects over their heads to keep themselves dry."], "image": "train2014/COCO_train2014_000000258635.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335733, "question_id": "KnhYbdnJAq4EBJYF9pnzjc", "question": "What color is the item that reaches the highest height on the plate?", "choices": ["purple", "blue", "red", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The spinach is green.", "Most of the vegs are green.", "The color is green."], "image": "val2014/COCO_val2014_000000335733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215410, "question_id": "KpBsStMj9p36oCx3GDNQGs", "question": "What is under the clock tower?", "choices": ["sports cars", "cats", "flags", "turkeys"], "correct_choice_idx": 2, "direct_answers": ["flag", "flag", "building", "flags", "flags", "balcony", "flag", "bicycles", "flags", "flags"], "difficult_direct_answer": false, "rationales": ["There are tarps representing countries.", "There are pieces of colorful fabric hanging on poles", "There are no animals or powered vehicles under the clock tower. there are colored pieces of fabric."], "image": "train2014/COCO_train2014_000000215410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191321, "question_id": "KpDwdnbDD3ixJDSZPbVDBN", "question": "What word would best describe the person whose name appears on the sign?", "choices": ["clown", "pirate", "apostle", "samurai"], "correct_choice_idx": 2, "direct_answers": ["saint", "saint", "saint", "religious", "apostle", "famous", "mark", "st marks", "saint", "saint"], "difficult_direct_answer": false, "rationales": ["An apostle describes the person.", "St. mark is an apostle.", "The person is one of the followers of a religious figure."], "image": "train2014/COCO_train2014_000000191321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503832, "question_id": "KrkKuLem9p6zJwAzteRLYZ", "question": "How many giraffes are standing in the zoo enclosure around the people?", "choices": ["four", "six", "five", "three"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "giraffe", "four", "four", "four", "four", "car", "four"], "difficult_direct_answer": false, "rationales": ["Two giraffes are standing by two other giraffes.", "There are four animals with tall necks.", "There are a total of four giraffes."], "image": "train2014/COCO_train2014_000000503832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562461, "question_id": "KtHUTcSQN6vejUjaDrCqF3", "question": "What color is on the left side of the hydrant?", "choices": ["black", "green", "red", "pink"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "green", "green", "green", "grey", "green", "green", "yellow", "green"], "difficult_direct_answer": false, "rationales": ["The color is the same color as grass or a lime.", "The color is green.", "The left nozzle is green."], "image": "train2014/COCO_train2014_000000562461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304759, "question_id": "Kugg2mfFczKKZV4LgqouMF", "question": "What type of beverage is in the plastic cup on the edge of the table?", "choices": ["iced coffee", "water", "hot coffee", "beer"], "correct_choice_idx": 0, "direct_answers": ["coffee", "juice", "iced coffee", "juice", "iced coffee", "coffee", "coffee", "cup", "coffee", "coffee"], "difficult_direct_answer": false, "rationales": ["The color of the drink matches coffee.", "There are cubes of frozen water in the cup with the brown liquid.", "The beverage is iced coffee."], "image": "train2014/COCO_train2014_000000304759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278989, "question_id": "Kuxhe7peUHtsWrSn3PLQrR", "question": "What actress was born in the country where the plane comes from?", "choices": ["jessica biel", "margaret qualley", "kate beckinsale", "lucy hale"], "correct_choice_idx": 2, "direct_answers": ["fee", "kate winslet", "no idea", "julie andrews", "emily blunt", "england", "england", "kate beckinsale", "kate winslet", "emilia clarke"], "difficult_direct_answer": false, "rationales": ["The actress is kate.", "The airplane on the tarmac is made by british airways, which is the same country kate beckinsale is from.", "She is from england and this is an airline for that country"], "image": "train2014/COCO_train2014_000000278989.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57703, "question_id": "KvRV2jYEtkWswV89DCaGEr", "question": "The poodle dog held on the leash is wearing what color of collar?", "choices": ["green", "orange", "blue", "red"], "correct_choice_idx": 1, "direct_answers": ["orange", "red", "nylon", "orange", "orange", "orange", "orange", "orange", "orange", "red"], "difficult_direct_answer": false, "rationales": ["The poodle's collar is not the same color as the man's blue shirt or the green grass. it also is not red.", "The color is orange.", "The color is bright and easily visible. it is in sharp contrast to the dark dog."], "image": "val2014/COCO_val2014_000000057703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191050, "question_id": "KxWFNzrEM9UYaGJyLAnnfQ", "question": "What are the three zebras doing in the green dense field?", "choices": ["running", "feeding", "standing", "sleeping"], "correct_choice_idx": 1, "direct_answers": ["grazing", "grazing", "grazing", "eating grass", "eating", "eating", "eating", "grass", "eating", "feeding"], "difficult_direct_answer": false, "rationales": ["The zebras are eating the grass.", "They have their heads down in the grass taking bites", "The zebras are eating the grass."], "image": "val2014/COCO_val2014_000000191050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281358, "question_id": "KygGjy4H9mtrfcJGCrHAgg", "question": "What are the animals kept in?", "choices": ["stalls", "boxes", "cat carriers", "dog carriers"], "correct_choice_idx": 0, "direct_answers": ["pens", "shed", "pen", "stables", "pens", "pen", "cages", "stalls", "pen", "warehouse"], "difficult_direct_answer": false, "rationales": ["It would be inhumane to keep animals in boxes. the animals are cows, not cats or dogs.", "The animals are in stalls.", "The cows are in stalls."], "image": "train2014/COCO_train2014_000000281358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334338, "question_id": "KzxGq34JpcJYziN8nWzELa", "question": "What color is the snow pants worn by the guy on the snowboard?", "choices": ["white", "green", "blue", "red"], "correct_choice_idx": 2, "direct_answers": ["blue", "black", "blue", "blue", "blue", "blue", "blue", "black", "blue", "black"], "difficult_direct_answer": false, "rationales": ["The guy on the snowboard is wearing a pair of dark blue snowpants.", "The color is blue.", "They are a different color than the black jacket"], "image": "train2014/COCO_train2014_000000334338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199481, "question_id": "L28j8b5CEZuKni9dCp9t9g", "question": "What injury is most likely to be prevented by the person's protective gear?", "choices": ["cut hand", "bruised tailbone", "broken ankle", "head injury"], "correct_choice_idx": 3, "direct_answers": ["head", "head injury", "head injury", "head", "head", "head injury", "head", "head injury", "head", "head"], "difficult_direct_answer": false, "rationales": ["The person is riding a motorcycle and people riding those can have head injuries. he is wearing a helmet for that reason.", "The injury is to the head.", "The most obvious protective gear that the person is wearing is a helmet which is intended to prevent injuries to the area it is worn on."], "image": "val2014/COCO_val2014_000000199481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18750, "question_id": "L3c8Sb2UMR33awmoZswhvD", "question": "Which religion should this church probably belong with?", "choices": ["protestant", "anglican", "islam", "catholic"], "correct_choice_idx": 3, "direct_answers": ["christian", "catholic", "christian", "monk", "christian", "catholic", "christian", "catholic", "christian", "christian"], "difficult_direct_answer": false, "rationales": ["The structures seem to be similar to that of the vatican.", "The church is ornate.", "It is old and has many spires"], "image": "val2014/COCO_val2014_000000018750.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539319, "question_id": "L47bSJYNrYDoXGYPNsKxAN", "question": "What color is the child's rainjacket that looks like a frog?", "choices": ["green", "blue", "purple", "red"], "correct_choice_idx": 0, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["This is obviously the correct color.", "The rainjacket and common frogs are not blue, purple, or red.", "The color of the rain jacket that looks like a frog is the same color as an actual frog."], "image": "train2014/COCO_train2014_000000539319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315062, "question_id": "L5NicGXrMCGsHbqGTiTRZr", "question": "Which of these animals would win a race?", "choices": ["cow", "lamb", "horse", "jaguar"], "correct_choice_idx": 2, "direct_answers": ["dog", "horse", "horse", "bull", "horse", "horse", "horse", "all", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The horse is more sleek and has longer legs. traditionally, horses are used and bred for moving fast.", "The animal is the horse.", "A horse is faster than a cow."], "image": "val2014/COCO_val2014_000000315062.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223031, "question_id": "L5abQubFbh2xDfvjTJKmzE", "question": "Which food item on the plate is highest in fat?", "choices": ["cheese", "coleslaw", "potato skin", "french fries"], "correct_choice_idx": 0, "direct_answers": ["coleslaw", "fries", "cheese", "fries", "burger", "fries", "cheese", "fries", "fries", "fries"], "difficult_direct_answer": false, "rationales": ["That food is high in saturated fat. dairy is usually fattening.", "Cheese is higher in fat than french fries, coleslaw and potato skin", "A burger with fries is on a plate and the burger is covered in melted cheddar."], "image": "val2014/COCO_val2014_000000223031.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330536, "question_id": "L8bo7rQZik4bfBkYLKwN67", "question": "What company makes the roll in the room?", "choices": ["jameson", "charmin", "tootsie roll", "kraft"], "correct_choice_idx": 1, "direct_answers": ["charmin", "charmin", "charmin", "charmin", "charmin", "cottonelle", "unknown", "charmin", "angel soft", "blue"], "difficult_direct_answer": false, "rationales": ["The item on the roll is toilet paper, not candy, cheese, or alcohol.", "This is a famous brand of toilet paper products", "The company who makes toilet paper is charmin."], "image": "val2014/COCO_val2014_000000330536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363830, "question_id": "L92DjkWwZp8tSrg377pKnX", "question": "Why is the floor tiled?", "choices": ["sun", "look", "water", "feel"], "correct_choice_idx": 2, "direct_answers": ["water damage", "design", "cleanliness", "bathroom", "fwfefe", "protection", "hygiene", "keep dry", "water", "good looking"], "difficult_direct_answer": true, "rationales": ["The area is a bathroom so the floor is protected against water with tile", "There is white tile with green and red patterns on the floor. the tile is waterproof and keeps the floor from rotting from water damage.", "Because there is a bath you have tiles on the floor in case water pours over the bath"], "image": "val2014/COCO_val2014_000000363830.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539797, "question_id": "L9LdqtJVTiWZ7FXPLm88Fh", "question": "Who is closest to the wall?", "choices": ["boy", "old woman", "old man", "young girl"], "correct_choice_idx": 0, "direct_answers": ["boy", "boy", "male", "boy", "boy", "right boy", "right boy", "boy", "boy", "black"], "difficult_direct_answer": false, "rationales": ["A young guy is walking on a sidewalk next to a building.", "The younger boy is close to the wall.", "There is a boy next to the wall."], "image": "train2014/COCO_train2014_000000539797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212181, "question_id": "L9RsGBGEJjKFwgigmsXScu", "question": "What color is the refrigerator sitting next to the black countertop?", "choices": ["purple", "red", "wood", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The fridge is white.", "The refrigerator is lacking any color at all.", "The refrigerator next to the black countertop is white."], "image": "train2014/COCO_train2014_000000212181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206102, "question_id": "L9ZKC3SWhuanQst4XVpH9d", "question": "What is being recorded?", "choices": ["music", "movie", "podcast", "youtube video"], "correct_choice_idx": 0, "direct_answers": ["dwdw", "music", "song", "music", "music", "music", "music", "music", "music", "music"], "difficult_direct_answer": false, "rationales": ["There is a man playing guitar as he sits at a chair and a man is behind a door with a keyboard. there is another woman in dress that is taking notes.", "There are microphones pointed towards a guitar and keyboard.", "Music is being recorded."], "image": "train2014/COCO_train2014_000000206102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276666, "question_id": "LBPCe8gxwcmDrRx3FoLUTG", "question": "How many giraffes are gathered around the tree with some mild damage?", "choices": ["five", "two", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["One giraffe is interacting with the tree. an additional giraffe is behind this one.", "One is behind the other one", "The giraffes are clearly visible and countable based on their unique outlines."], "image": "train2014/COCO_train2014_000000276666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233613, "question_id": "LC6V6HcmgiX5tYf947gNsD", "question": "What is the cat laying on?", "choices": ["car seat", "pillow", "egg carton", "box"], "correct_choice_idx": 0, "direct_answers": ["car seat", "car seat", "chair", "car seat", "car seat", "seat", "seat", "car seat", "seat", "car seat"], "difficult_direct_answer": false, "rationales": ["The cat's on a seat.", "The steering wheel and the seat belt indicate that the cat is in some sort of vehicle.", "The cat is on a car seat."], "image": "train2014/COCO_train2014_000000233613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156171, "question_id": "LDz6um3jbqUa2qt6sbh3Hw", "question": "What is the dog inside of?", "choices": ["giant egg", "car", "cage", "box"], "correct_choice_idx": 1, "direct_answers": ["car", "vehicle", "car", "car", "car", "car", "car", "car", "car", "car"], "difficult_direct_answer": false, "rationales": ["The dog is in a vehicle, as evident from the seats and mirror.", "The dog is sitting inside a car.", "The dog is in the car."], "image": "train2014/COCO_train2014_000000156171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162848, "question_id": "LEGesxyWwBkhu9J8fba258", "question": "What is on the laptop?", "choices": ["butterfly", "cat", "egg", "keys"], "correct_choice_idx": 3, "direct_answers": ["keys", "dwwdw", "keys", "key", "keys", "keys", "keys", "keys", "keys", "keys"], "difficult_direct_answer": false, "rationales": ["The things on the laptop are made of metal and joined on a ring.", "They are used to unlock the doors.", "A set of keys is sitting on top of the laptop."], "image": "train2014/COCO_train2014_000000162848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207056, "question_id": "LER8rWRywyTSFVHk7AEGhw", "question": "Who has the last name that refers to what a group of these animals is called?", "choices": ["curtis pride", "jim herd", "anna camp", "tim crowder"], "correct_choice_idx": 1, "direct_answers": ["jim herd", "herd", "amber", "elephant", "elephants", "na", "amber heard", "vertebrates", "herd", "herd"], "difficult_direct_answer": false, "rationales": ["The name is herd.", "A group of elephants is referred to as a herd.", "These animals are elephants. a group of elephants is not a crowd, camp, or pride."], "image": "val2014/COCO_val2014_000000207056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428445, "question_id": "LEhTB8Mf6BPg654HpD6o7v", "question": "What is in front of the vehicle?", "choices": ["animals", "traffic cones", "eggs", "balloons"], "correct_choice_idx": 0, "direct_answers": ["animals", "animals", "animals", "sheep", "lambs", "sheep", "animals", "cows", "sheep", "cows"], "difficult_direct_answer": false, "rationales": ["The animals are in front.", "The vehicle in the street has several animals standing in front of it.", "The vehicle in the street is waiting for cows and sheep to move out of the way so it can pass."], "image": "train2014/COCO_train2014_000000428445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156904, "question_id": "LHCUDiY5GLKob4B7EbHwjY", "question": "What kind of car is running around on the tarmac?", "choices": ["water truck", "fuel truck", "race car", "van"], "correct_choice_idx": 2, "direct_answers": ["race car", "sports", "race car", "racecar", "race car", "race car", "cobra", "race car", "race car", "race car"], "difficult_direct_answer": false, "rationales": ["The vehicle is a racecar.", "The vehicle is not a van or truck. the vehicle has a highly aerodynamic design.", "The number on the vehicle is used to identify it on the racetrack."], "image": "train2014/COCO_train2014_000000156904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415008, "question_id": "LHMaoifwKjtwxYCyMCpXuR", "question": "What color are the two pillows at the right end of the couch with three cushions on top?", "choices": ["pink", "cream", "red", "blue"], "correct_choice_idx": 1, "direct_answers": ["brown", "tan", "cream", "beige", "beige", "beige", "tan", "beige", "white", "tan"], "difficult_direct_answer": false, "rationales": ["The color is cream.", "The color is cream.", "The pillow on the right ended couch is cream colored."], "image": "train2014/COCO_train2014_000000415008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66483, "question_id": "LHQJi3vk7k4hnWjiSs5gKe", "question": "How many cows are grazing around the pasture with horns in their heads?", "choices": ["three", "four", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "one", "two", "two", "two", "one", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The cows are eating grass.", "There are 2 cows.", "There are a couple of cows grazing in the pasture."], "image": "train2014/COCO_train2014_000000066483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86679, "question_id": "LJLfRK5N6o49SDPtjeFtQ9", "question": "Who is attempting to tag the runner?", "choices": ["keith davis", "david chokachi", "david wright", "larry david"], "correct_choice_idx": 2, "direct_answers": ["baseman", "catcher", "baseman", "baseman", "baseman", "david wright", "outfielder", "player", "baseman", "catcher"], "difficult_direct_answer": false, "rationales": ["The people are playing baseball, not acting or playing football.", "The name of the batter chasing the runner is david wright.", "He plays for this team."], "image": "train2014/COCO_train2014_000000086679.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444168, "question_id": "LJPGV7VgvUsSx6FArzHFrP", "question": "What holiday is this cake likeliest to commemorate?", "choices": ["wedding", "birthday", "4th july", "anniversary"], "correct_choice_idx": 2, "direct_answers": ["birthday", "birthday", "birthday", "4th july", "cake", "birthday cake", "birthday", "birthday", "july 4", "july fourth"], "difficult_direct_answer": false, "rationales": ["The cake is for the fourth of july.", "The cake has sparklers which looks like fireworks. fireworks are used to celebrate the 4th of july.", "It has a sparkler firework on it"], "image": "train2014/COCO_train2014_000000444168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42701, "question_id": "LJR6qQkBww9ArdjGPcfKuM", "question": "How many cows are evidently in the pasture together for grazing?", "choices": ["three", "five", "four", "two"], "correct_choice_idx": 2, "direct_answers": ["two", "four", "four", "four", "four", "four", "three", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["They are all in the field.", "There are three in the foreground and one far to the right back of the scene.", "There are four cows eating in the pasture. they are grazing."], "image": "train2014/COCO_train2014_000000042701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39455, "question_id": "LJnH94czKa8B8vBFG85tyz", "question": "What is the yellow sauce's name in the culinary world?", "choices": ["bearnaise", "hollandaise", "bechamel", "choron"], "correct_choice_idx": 1, "direct_answers": ["queso", "egg", "egg", "hollandaise", "hollandaise", "juice", "egg", "mustard", "hollandaise", "hollandaise"], "difficult_direct_answer": false, "rationales": ["It is used a lot on eggs.", "The yellow sauce on top of the muffin is called hollandaise sauce.", "The sauce is hollandaise."], "image": "train2014/COCO_train2014_000000039455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388753, "question_id": "LKARCiCdMQj35RuuQTEV5o", "question": "What part of the cat is facing the camera a little bit embarrassingly for the cat?", "choices": ["eyes", "butthole", "belly", "feet"], "correct_choice_idx": 1, "direct_answers": ["butt", "butt", "butt", "rear", "butt", "anus", "anus", "butt", "butthole", "back"], "difficult_direct_answer": false, "rationales": ["Its butthole is the one facing the camera.", "The cat is showing its behind.", "A cat is facing away from a camera."], "image": "train2014/COCO_train2014_000000388753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246064, "question_id": "LKSkYb6XAETvMG3Aee4YTn", "question": "What color is the turban worn by the man herding the cows?", "choices": ["red", "blue", "tan", "white"], "correct_choice_idx": 0, "direct_answers": ["red", "pink", "pink", "three", "red", "pink", "pink", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The turban sits on top of his head in this color.", "The person is wearing something red on their head.", "The turban is red colored."], "image": "val2014/COCO_val2014_000000246064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60316, "question_id": "LMUb5x2f8eVRwyFdAaYPs6", "question": "How many species of animals are sharing the savannah opening together?", "choices": ["four", "three", "two", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "two", "three", "three", "two", "two", "two", "zebra", "three"], "difficult_direct_answer": false, "rationales": ["Several different species of elk type animals and zebras are grazing together.", "There are three species.", "There are 3 species."], "image": "train2014/COCO_train2014_000000060316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279550, "question_id": "LNnTPJzHvFqBnLKXp9fNzH", "question": "What color is the saddle on top of the little horse's back?", "choices": ["red", "tan", "white", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "blue", "black", "black", "black", "black", "black", "brown", "black"], "difficult_direct_answer": false, "rationales": ["There is a black saddle on the back of the horses.", "The saddle on the horse is black.", "The saddle on the back of the horse's back is made of black leather."], "image": "train2014/COCO_train2014_000000279550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14733, "question_id": "LQS5eJDCizyos57iRFDCpW", "question": "What food comes from these animals?", "choices": ["chicken", "venison", "beef", "lamb chop"], "correct_choice_idx": 2, "direct_answers": ["steak", "steak", "dairy", "beef", "milk", "milk", "milk", "meat", "burger", "beef"], "difficult_direct_answer": false, "rationales": ["The food is beef.", "Cows are producers of beef. they are known for their meat.", "Beef comes from cows."], "image": "val2014/COCO_val2014_000000014733.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19226, "question_id": "LR2x2ChSKo7Sjpzs9yk9hi", "question": "What are the silver items called?", "choices": ["utensils", "watches", "silver bars", "tupperware"], "correct_choice_idx": 0, "direct_answers": ["cutlery", "silverware", "silverware", "utensils", "silverware", "silverware", "silverware", "dinnerware", "silverware", "fork"], "difficult_direct_answer": false, "rationales": ["The items are utensils.", "A dessert is on a table with silverware stacked neatly beside it.", "The silver items are utensils."], "image": "val2014/COCO_val2014_000000019226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150948, "question_id": "LRdVpErvpbgiyjFriJMmH5", "question": "What kind of animals are these cakes prepared to the shape of?", "choices": ["fox", "hound", "rabbit", "penguin"], "correct_choice_idx": 2, "direct_answers": ["bunnies", "rabit", "rabbit", "bunny", "bunnies", "bunny", "rabbits", "cat", "rabbit", "rabbit"], "difficult_direct_answer": false, "rationales": ["These are in the shape of a bunny.", "The cakes are prepared to be the shape of bunny rabbits.", "The cakes are made for easter."], "image": "train2014/COCO_train2014_000000150948.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396326, "question_id": "LSGAEQxaEgXGXwjfaYxVMD", "question": "What does this animal have?", "choices": ["wings", "stinger", "whiskers", "quills"], "correct_choice_idx": 2, "direct_answers": ["paper", "purse", "bag", "fur", "two eyes", "book", "claws", "fur", "whiskers", "whiskers"], "difficult_direct_answer": false, "rationales": ["It has a whiskers in its belly.", "There are long lines that are pointing and coming out of the face of the cat's head.", "This animal is a cat, not a wasp, porcupine, or bird."], "image": "train2014/COCO_train2014_000000396326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170584, "question_id": "LV2zoQghoxnPaWXNx8zaf8", "question": "How many cats are resting on top of the big skateboard?", "choices": ["five", "two", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "one", "two", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["There are two cats.", "A couple of cats are sitting on a skateboard.", "There is one skateboard visible and it is clear. based on the position of the cat bodies and the paws, their number is clearly countable."], "image": "train2014/COCO_train2014_000000170584.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149610, "question_id": "LVXC4RFLGoyRb7CSCgkUSt", "question": "Who is at bat?", "choices": ["david otunga", "mookie betts", "chad billingsley", "evelyn smith"], "correct_choice_idx": 2, "direct_answers": ["chad billingsley", "billingsley", "batter", "baseball player", "batter", "billingsley", "baseball player", "billingsley", "billingsley", "chad billingsley"], "difficult_direct_answer": false, "rationales": ["Professional baseball players from most teams have their last name prominently featured on the back of their uniform.", "One can see the name of the player, that is holding the bat, written on his uniform.", "The name is on the shirt"], "image": "train2014/COCO_train2014_000000149610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380239, "question_id": "LVeioaNwCaR76rYQMm6Sj3", "question": "What is stacked to the left of the leftmost monitor?", "choices": ["computer parts", "food", "books", "computer games"], "correct_choice_idx": 3, "direct_answers": ["video games", "cds", "computer games", "software", "games", "cds", "keyboard", "movies", "movies", "video games"], "difficult_direct_answer": false, "rationales": ["The computer is stacked.", "Based on the shape of the box and the titles written on them, it is likely the objects located in the place described in the question are dvds, but computer games would also be seen in the same cases.", "Computer games are stacked."], "image": "train2014/COCO_train2014_000000380239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547769, "question_id": "LWLYzmkuYLneFdvQgaLg2N", "question": "What place serves this kind of food?", "choices": ["mcdonalds", "wendys", "pizza hut", "subway"], "correct_choice_idx": 2, "direct_answers": ["pizzeria", "pizzeria", "pizza", "restaurant", "fastfood", "italian", "italian restaurant", "pizza joints", "pizza place", "pizza hut"], "difficult_direct_answer": true, "rationales": ["Pizza hut serves pizzas.", "The food item consists of dough, sauce, and toppings. it is not a hamburger or sandwich.", "The food is round baked dough, topped with meat, cheese, and sauces."], "image": "train2014/COCO_train2014_000000547769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374559, "question_id": "LY5vrotCktvkthjKD5jEBa", "question": "What is the man in the foreground wearing?", "choices": ["tie", "armor", "sunglasses", "crown"], "correct_choice_idx": 2, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "helmet", "sunglasses", "helmet", "helmet"], "difficult_direct_answer": false, "rationales": ["The man is wearing glasses but you can see they have a darkened lens this provides the person wearing them protection from the sun and allows them to see without being impaired by the brightness of the sun.", "The man has sunglasses.", "The man has sunglasses."], "image": "val2014/COCO_val2014_000000374559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522262, "question_id": "LYBeDCZpbrdK7BKsoNx3Le", "question": "What item would usually be used with these vehicles?", "choices": ["missile launcher", "windshield wipers", "radar", "chain"], "correct_choice_idx": 3, "direct_answers": ["helmet", "chain", "bike lock", "road", "bicycle", "stand", "lock", "helmet", "lock", "cycling"], "difficult_direct_answer": false, "rationales": ["The item is a chain.", "Bicycles are often used with chains.", "This is to lock it to the bike rack"], "image": "val2014/COCO_val2014_000000522262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171468, "question_id": "LZTUbmkZLhqjz77cWRARXo", "question": "What animal is looking toward the giraffes?", "choices": ["dog", "cow", "shark", "lamprey"], "correct_choice_idx": 0, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "car", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The animal is the dog.", "There is a small pooch on the other side of a fence. he is looking at the two big giraffes looking at him.", "The animals are facing each other and it is apparent that the animal facing the giraffe is answer a."], "image": "train2014/COCO_train2014_000000171468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427252, "question_id": "LaGih6gjWVRJMMBmsUDvzf", "question": "What gift would this person enjoy assuming they like what they are holding?", "choices": ["tennis ball", "teddy bear", "baseball bat", "wine rack"], "correct_choice_idx": 3, "direct_answers": ["wine", "red wine", "wine rack", "wine", "red wine", "wine", "wine", "wine", "wine", "wine glasses"], "difficult_direct_answer": false, "rationales": ["They are holding a bottle of mild alcohol.", "The gift is a wine rack.", "The person is holding a wine bottle based on the label, shape, size and the contents. someone who enjoys wine would likely need a place to store other bottles."], "image": "train2014/COCO_train2014_000000427252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278923, "question_id": "LbzzfRSExC4cqeDGSW3pm6", "question": "What color is the background of the vase behind the illustration?", "choices": ["red", "blue", "green", "cream"], "correct_choice_idx": 3, "direct_answers": ["white", "cream", "white", "blue", "beige", "green", "beige", "yellow", "white", "beige"], "difficult_direct_answer": false, "rationales": ["The color is cream.", "The object is clearly visible and the base color is identifiable when ignoring the color of the design laid on top.", "That's the obvious color. it's used in contrast so that the octopus stands out."], "image": "train2014/COCO_train2014_000000278923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483374, "question_id": "LdHctERDSpgyKXoCnHM53M", "question": "What color is the man's shirt on the left side of the photograph?", "choices": ["red", "purple", "blue", "green"], "correct_choice_idx": 1, "direct_answers": ["purple", "purple", "pink", "pink", "pink", "pink", "pink", "pink", "black color", "black"], "difficult_direct_answer": false, "rationales": ["The man has a pink shirt.", "The man on the subway is wearing a purple shirt.", "The shirt on the man is purple."], "image": "val2014/COCO_val2014_000000483374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63109, "question_id": "LfDEnpBfhMJFYFcAw65qfQ", "question": "What does the man have in his hand?", "choices": ["rattle", "mouse", "remote control", "food"], "correct_choice_idx": 3, "direct_answers": ["hotdog bun", "sandwich", "hotdog", "plate", "hotdog", "sandwich", "bacon", "food", "bread", "fefef"], "difficult_direct_answer": false, "rationales": ["The man has food.", "The man appears to be holding a sandwich above a plate. a sandwich is part of the category of answer a.", "The man has a hot dog in his hand."], "image": "train2014/COCO_train2014_000000063109.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31135, "question_id": "Lfobdu5ae2U49EMnsZhxAs", "question": "What is the side for the sandwich served at this restaurant?", "choices": ["fries", "chips", "mashed potato", "corn"], "correct_choice_idx": 1, "direct_answers": ["chips", "chips", "chips", "chips", "potato chips", "chips", "chips", "dish", "stop", "plate"], "difficult_direct_answer": false, "rationales": ["The fried potato snacks are seen on the plate.", "The side is chips.", "There are chips on top of the sandwich. chips are a side dish."], "image": "train2014/COCO_train2014_000000031135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251084, "question_id": "LfsWifxGaSZoRqVu4vbWn2", "question": "What number comes after the number at the top of the bus when counting to ten?", "choices": ["four", "seven", "ten", "five"], "correct_choice_idx": 2, "direct_answers": ["ten", "ten", "nine", "ten", "ten", "ten", "ten", "nine", "nine", "ten"], "difficult_direct_answer": false, "rationales": ["The bus has the number nine on it.", "The number at the top of the bus is nine, not three, four, or six.", "The number 10 comes after."], "image": "train2014/COCO_train2014_000000251084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518215, "question_id": "LfxYKWz4vBiUgwb3DebSbw", "question": "How many giraffes are standing around the wood buildings?", "choices": ["two", "four", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["A few giraffes are grazing in an enclosure.", "This is obvious simply by counting them.", "One giraffe is surrounded by two other giraffes."], "image": "train2014/COCO_train2014_000000518215.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545385, "question_id": "LgtAfTAPjn5ujj3k4zb8os", "question": "What is on the plate?", "choices": ["apple", "chicken leg", "spoon", "salmon"], "correct_choice_idx": 2, "direct_answers": ["cake", "cake", "spoon", "spoon", "pastry", "cake", "pie dessert", "cake", "cake", "cake"], "difficult_direct_answer": false, "rationales": ["A utensil is on the plate.", "There is a spoon on the plate.", "The plate has a spoon."], "image": "val2014/COCO_val2014_000000545385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198057, "question_id": "Lh3sNwD8pT2nGpvvbwAhQG", "question": "What are the big animals called?", "choices": ["elephants", "tigers", "pelican", "eels"], "correct_choice_idx": 2, "direct_answers": ["geese", "pelicans", "birds", "cranes", "swan", "pelicans", "pelican", "pelicans", "birds", "pelican"], "difficult_direct_answer": false, "rationales": ["A pelican is white, has a large beak and can be found near water.", "The bird has a large jaw to capture fish from the water to eat.", "The birds are known for their big peacock."], "image": "train2014/COCO_train2014_000000198057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61463, "question_id": "LhucuW44ddmczH7kuJym62", "question": "What are the majority of the people doing?", "choices": ["standing", "eating", "sitting", "sleeping"], "correct_choice_idx": 0, "direct_answers": ["standing", "playing wii", "playing wii", "smiling", "standing", "playing game", "standing", "talking", "playing videogame", "video games"], "difficult_direct_answer": false, "rationales": ["All of the people are awake, and nobody is eating. two of the three people are not sitting.", "The people are standing.", "The majority of people are standing because they are playing a video game that needs them to be standing"], "image": "train2014/COCO_train2014_000000061463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232263, "question_id": "LiPawdtK2jPmWdF2QbjE8F", "question": "What is the answer to the equation on the side of the truck?", "choices": ["72", "24", "16", "89"], "correct_choice_idx": 2, "direct_answers": ["sixteen", "sixteen", "16", "truck", "12", "sixteen", "sixteen", "sixteen", "sixteen", "sixteen"], "difficult_direct_answer": false, "rationales": ["The equation is four multiplied by itself.", "The answer is 16.", "Four times four is 16"], "image": "train2014/COCO_train2014_000000232263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289173, "question_id": "LiSmGXitPkBDPFF25AFKAA", "question": "What is the large vehicle here?", "choices": ["helicopter", "airplane", "tank", "submarine"], "correct_choice_idx": 1, "direct_answers": ["airplane", "airplane", "airplane", "airplane", "airplane", "airplane", "airplane", "aeroplane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["The large vehicle has fixed wings. it is not armored and cannot travel underwater.", "This is a passenger jet", "The vehicle is a plane."], "image": "val2014/COCO_val2014_000000289173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532867, "question_id": "LmXzP65g9B6G8uEgsecPLn", "question": "What color is the umbrella held by the woman barefoot on the beach?", "choices": ["brown", "blue", "white", "red"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["It is almost the same color as the sand", "It is the same hue as the sand on the beach, which would be closest to the colour brown.", "The color is brown."], "image": "val2014/COCO_val2014_000000532867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391292, "question_id": "Lo5NBZBYLsfCrfqHBYwzbm", "question": "What animal is the same color as the animal to the right that is in front of the fence?", "choices": ["ostrich", "frog", "leopard", "zebra"], "correct_choice_idx": 3, "direct_answers": ["black", "cow", "black lab", "zebra", "cow", "cow", "cow", "cow", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["Black and white cows are lined up at a fence.", "The animals in the picture are black and white, similar to these cows these animals have black and white stripes that cover their body.", "The animal is a zebra."], "image": "train2014/COCO_train2014_000000391292.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80311, "question_id": "Lp7ymoHFCoXJPNsgS6dKUV", "question": "What will come out of the sink?", "choices": ["water", "nothing", "gasoline", "soda"], "correct_choice_idx": 1, "direct_answers": ["water usually", "water", "nothing", "water", "water", "water", "nothing", "water", "water", "water"], "difficult_direct_answer": false, "rationales": ["The play stove does not have a real water source.", "The sink is a play thing.", "This is a kids toy that isn't hooked up to utilities"], "image": "train2014/COCO_train2014_000000080311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374628, "question_id": "LpSGj3RgeaggaqCsnMQs9P", "question": "What is the appliance above the stove?", "choices": ["coffee maker", "toaster oven", "pizza oven", "microwave oven"], "correct_choice_idx": 3, "direct_answers": ["oven", "microwave", "microwave oven", "vessels", "wood", "microwave", "microwave oven", "microwave oven", "microwave oven", "microwave"], "difficult_direct_answer": false, "rationales": ["There is a stainless steel device with a handle. people put foods in here to heat up fast.", "The appliance is silver and sitting above the stove.", "It is a smaller oven that works using electromagnetic waves to cook food."], "image": "val2014/COCO_val2014_000000374628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120519, "question_id": "LqGqUXzWTBGGc4M7BwGwDr", "question": "What type of person does this room probably belong to?", "choices": ["young boy", "adult women", "adult man", "young girl"], "correct_choice_idx": 3, "direct_answers": ["young girl", "child", "gril", "young girl", "little girl", "girl", "girl", "doll", "child", "child"], "difficult_direct_answer": false, "rationales": ["This bedding is light pink which girls like more than boys plus there is a toy indicating a child owns it.", "The room has feminine colors and a stuffed animal.", "It's pink and has hearts on the blanket"], "image": "val2014/COCO_val2014_000000120519.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246064, "question_id": "LqcMT5gYAyMdCJNthBPp2T", "question": "How many cows are following around the man in the red turban?", "choices": ["three", "two", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The man in the red turban is walking with three cows following him.", "Two are on one side and one is on the other", "There is one cow to this man's left and two to right."], "image": "val2014/COCO_val2014_000000246064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262059, "question_id": "LrQFDiKgsaWngMjbcwgoTZ", "question": "What is resting near the computer?", "choices": ["cat", "elephant", "dog", "foot"], "correct_choice_idx": 3, "direct_answers": ["foot", "person", "mouse", "cup", "mouse", "cup", "lamp", "cup", "cup", "mouse"], "difficult_direct_answer": false, "rationales": ["The objects near the computer are visible and of the list of possible answers, only answer a is in the image and clearly identifiable.", "A foot can be seen by the computer.", "The foot is near."], "image": "train2014/COCO_train2014_000000262059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460078, "question_id": "Lrh3nyRcd97NcWArwC4W6K", "question": "How many giraffes are standing in this area instead of eating?", "choices": ["three", "one", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["five", "two", "five", "two", "five", "five", "two", "five", "zero", "two"], "difficult_direct_answer": false, "rationales": ["There are two giraffes.", "Only two have their heads up.", "There are two of them."], "image": "train2014/COCO_train2014_000000460078.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151900, "question_id": "LrujMyiHYqicmKih7PLAy5", "question": "What is the owner of these objects likely doing soon?", "choices": ["stay home", "travel domestically", "travel locally", "travel internationally"], "correct_choice_idx": 3, "direct_answers": ["international travel", "travelling", "travel internationally", "electronics", "traveling", "secret service", "travelling", "traveling", "it technician", "traveling"], "difficult_direct_answer": false, "rationales": ["The person will travel to other countries.", "There is a passport included with the owner's objects.", "The items on the table contain a passport which would be used for traveling to another country."], "image": "train2014/COCO_train2014_000000151900.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165674, "question_id": "LuPnEaydXUSNUrkrreJL5M", "question": "What beer does the man in the blue shirt like?", "choices": ["budweiser", "coors", "yuengling", "modelo"], "correct_choice_idx": 2, "direct_answers": ["cola", "car", "yuengling", "yuengling", "yuengling", "na", "yuengling", "yuengling", "yuengling", "happy"], "difficult_direct_answer": false, "rationales": ["The man is wearing a shirt that says yuengling.", "The type he likes is written across the front of his shirt.", "The beer is chinese."], "image": "train2014/COCO_train2014_000000165674.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 378775, "question_id": "LuasoDSh57i3LyMv95dnFy", "question": "Which umbrella is providing the least protection?", "choices": ["purple umbrella", "black umbrella", "blue umbrella", "checkered umbrella"], "correct_choice_idx": 1, "direct_answers": ["black", "black", "black umbrella", "rain", "broken umbrella", "black umbrella", "black", "left foreground", "black", "black umbrella"], "difficult_direct_answer": false, "rationales": ["There is a woman with a purple umbrella that is looking at camera. another man behind and to left is holding an umbrella that it is flipped upside down.", "The umbrella is black.", "A person's umbrella has turned inside out and is smaller than some of the others."], "image": "train2014/COCO_train2014_000000378775.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223648, "question_id": "LueqihDnRpRV4Z9VfFmWGt", "question": "What is the common similarity with all the items on the table?", "choices": ["all wooden", "all forks", "all plastic", "all spoons"], "correct_choice_idx": 0, "direct_answers": ["utensils", "wood spoons", "spoon", "wood", "stop", "spoons", "chair", "all wooden", "wood", "wooden"], "difficult_direct_answer": true, "rationales": ["All the items are brown and made of this material.", "They're all wooden.", "The similarity is they're wooden."], "image": "val2014/COCO_val2014_000000223648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469898, "question_id": "Lueqy4pRSvQGN4SvxWeyL8", "question": "What is next to the dog?", "choices": ["horse", "baby", "sheep", "beetle"], "correct_choice_idx": 2, "direct_answers": ["sheep", "sheep", "sheep", "sheep", "sheep", "ground marker", "sheep", "marker", "sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["There are two animals next to the dog both with woolen coats.", "There subjects of the image are clearly visible and the size, shape and exterior wool of the animals next to the dog is consistent with answer a.", "The dog is by a sheep."], "image": "train2014/COCO_train2014_000000469898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 167243, "question_id": "Lv3nMpvh3eqt2hbZhTXbAu", "question": "What food has a slang term whose name appears on the cafe sign?", "choices": ["hot dog", "cheeseburger", "taco", "dumpling"], "correct_choice_idx": 0, "direct_answers": ["coffee", "hot dog", "wiener", "pizza", "hot dog", "hot dogs", "wiener", "hot dog", "sausage", "hot dog"], "difficult_direct_answer": false, "rationales": ["The business is named cafe wiener. a wiener is not a dumpling, cheeseburger, or taco.", "A hot dog is also called a weiner.", "Weiners often go by this name."], "image": "train2014/COCO_train2014_000000167243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335065, "question_id": "LvsTex9cSCQPRxLH67fLqF", "question": "Which culture usually sets a table as in this picture?", "choices": ["european", "south american", "russian", "korean"], "correct_choice_idx": 3, "direct_answers": ["korean", "korean", "asian", "bigg", "old", "japanese", "stop", "asian culture", "asian", "traditional"], "difficult_direct_answer": false, "rationales": ["The people in the picture look asian and somewhat tall which is typical of this country. the dishes are veggies and meat which is also typical of this country.", "The culture is korean.", "The culture is korean."], "image": "val2014/COCO_val2014_000000335065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544649, "question_id": "Lx4GQtKmtM49zazkK5Bjp3", "question": "What is the animal that is walking directly down the dirt roadside?", "choices": ["zebra", "giraffe", "dog", "sheep"], "correct_choice_idx": 3, "direct_answers": ["sheep", "cow", "sheep", "sheep", "sheep", "sheep", "zebra", "goat", "sheep", "zebra"], "difficult_direct_answer": false, "rationales": ["It is fluffy and white", "The animal is a sheep.", "The sheep are walking on the dirt path as a zebra stands on the roadside eating foliage."], "image": "train2014/COCO_train2014_000000544649.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280254, "question_id": "LxEUUyCBa8jMXrdcVxV8XQ", "question": "What is the woman in the foreground pouring?", "choices": ["wine", "mustard", "syrup", "ketchup"], "correct_choice_idx": 0, "direct_answers": ["wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "wine", "alcohol"], "difficult_direct_answer": false, "rationales": ["She is pouring wine.", "The woman has an alcohol bottle in her hand.", "As indicated by the shape and size of the bottle and the texture of the liquid. it's likely a tasting. that said, it could also be b given the white spigot at the end of the bottle."], "image": "train2014/COCO_train2014_000000280254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23378, "question_id": "LxzVKieiECD759JyLWE74V", "question": "What would you call this area?", "choices": ["suitcase", "drinking hole", "parking lot", "desert"], "correct_choice_idx": 1, "direct_answers": ["crossing", "drinking hole", "africa", "watering hole", "river", "forest", "zebra crossing", "water hole", "savannah", "savannah"], "difficult_direct_answer": true, "rationales": ["None of the other options work. in africa, animals commonly herd around water for drinking.", "This rural area is full of water. it is not a desert or parking lot.", "The area is a drinking hole."], "image": "train2014/COCO_train2014_000000023378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113173, "question_id": "LzFEZwre8rZohNwt3tzuKG", "question": "Why does the person have poles?", "choices": ["balance", "visibility", "fashion", "protection"], "correct_choice_idx": 0, "direct_answers": ["skiing", "surfing", "for skating", "ski", "skiing", "skiing", "support", "balance", "skiing", "ski"], "difficult_direct_answer": false, "rationales": ["This keeps him upright as he moves", "These poles help a skiier stay standing", "Ski poles are often used to prevent falling over on the ski hill."], "image": "val2014/COCO_val2014_000000113173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188511, "question_id": "LzFkWFyRCjKBRtgCzpmaTn", "question": "What color is the writing on front of the first aid case on the wall?", "choices": ["yellow", "red", "black", "blue"], "correct_choice_idx": 2, "direct_answers": ["watching", "black", "blue", "black", "blue", "stop", "black", "black", "white", "car"], "difficult_direct_answer": false, "rationales": ["The writing is not blue, red, or yellow.", "The letters are in black.", "The writing is clearly visible against the white background and is visibly answer a."], "image": "train2014/COCO_train2014_000000188511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303404, "question_id": "M2WAYKd8Qx9CRGLYHrTRWx", "question": "Which war did this aircraft likely service?", "choices": ["korea", "vietnam", "wwii", "wwi"], "correct_choice_idx": 3, "direct_answers": ["wwi", "ww ii", "www1", "great war", "ww2", "ww2", "world war", "aircraft", "wwii", "wwii"], "difficult_direct_answer": false, "rationales": ["These types of planes appeared more in a than b and c and d occurred far later.", "The war was the first instance of this aircraft being used for combat.", "This fixed quad-wing aircraft was likely serviced in wwi."], "image": "train2014/COCO_train2014_000000303404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545959, "question_id": "M3wsJLLQmBicYVR8uVxwXJ", "question": "What dominates the area?", "choices": ["ancient statue", "dolphins", "giant ladder", "large tree"], "correct_choice_idx": 3, "direct_answers": ["sheep", "sheep", "sheep", "animals", "large tree", "animals", "sheep", "sheep", "grass", "animals"], "difficult_direct_answer": false, "rationales": ["The huge tree surrounds the green grassy area.", "The large tree dominates.", "There is a big one in the middle"], "image": "val2014/COCO_val2014_000000545959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96998, "question_id": "M4Fw6TbFTv4mKQ29a8AkzS", "question": "What is likely stored below this room's sink?", "choices": ["dishes", "dish soap", "cleaners", "clothes"], "correct_choice_idx": 2, "direct_answers": ["toiletries", "cleaners", "toilet paper", "toilet paper", "toilet paper", "toilet paper", "toiletries", "table", "cleaner", "paper products"], "difficult_direct_answer": false, "rationales": ["People keep cleaning products under bathroom sinks.", "Cleaners are usually stored in cabinets below the sink in almost all households", "The sink has cleaners."], "image": "val2014/COCO_val2014_000000096998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117601, "question_id": "M4Rms9pT8f6KMcB5HvLSJK", "question": "Which meal is this most likely?", "choices": ["lunch", "desert", "breakfast", "snack"], "correct_choice_idx": 0, "direct_answers": ["lunch", "lunch", "lunch", "lunch", "lunch", "lunch", "lunch", "hot dog", "hotdog", "lunch"], "difficult_direct_answer": false, "rationales": ["The meal is lunch.", "The food item is a hot dog. people usually do not eat hot dogs for breakfast, and a hot dog is not a desert or snack item.", "The person eating the hot dog is mostly likely eating it for lunch."], "image": "train2014/COCO_train2014_000000117601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432796, "question_id": "M5z7MbJuUfAdRDGxQ7Nxx9", "question": "What are the children feeding?", "choices": ["cats", "badgers", "cows", "swans"], "correct_choice_idx": 3, "direct_answers": ["swan", "swans", "swan", "swans", "biscuit", "food", "swans", "geese", "ducks", "biscuit"], "difficult_direct_answer": false, "rationales": ["The kids feed swans.", "The size of the bird, the long neck and the coloring are distinct to one type of bird.", "There are ducks."], "image": "val2014/COCO_val2014_000000432796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505243, "question_id": "M64G3LQSmYnQ8TvHup8Pvc", "question": "What type of television set is set up next to the clock on the chair?", "choices": ["smart tv", "digital", "analog", "lcd"], "correct_choice_idx": 2, "direct_answers": ["flat screen", "older model", "flat screen", "crt tv", "analogue television", "flatscreen", "hdtv", "box", "old", "analog"], "difficult_direct_answer": true, "rationales": ["A large, bulky television is on a television stand.", "The tv is an analog.", "The television next to the clock is analog."], "image": "val2014/COCO_val2014_000000505243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464237, "question_id": "M7LaDtvDEwag3ibUD9A6Rw", "question": "What is under the kites and to the left?", "choices": ["apple", "cardboard boxes", "bicycle", "scarecrow"], "correct_choice_idx": 2, "direct_answers": ["vehicles", "cars", "cars", "man", "man", "cars", "person", "bicycle", "bicycle", "cars"], "difficult_direct_answer": false, "rationales": ["The bike is by the kites.", "The furthest kite to the left is identifiable in the row and looking underneath the kite an object is visible. the object has two wheels, a seat and handle bars and is being handled by a man straddling it.", "It has two wheels and a frame and a person rides it"], "image": "train2014/COCO_train2014_000000464237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572786, "question_id": "M7hAHNzv8kdKprTi4AN6cc", "question": "Which vehicle is most likely to serve food?", "choices": ["taxi car", "taxi van", "bicycle", "truck"], "correct_choice_idx": 3, "direct_answers": ["car", "food truck", "food truck", "taxi", "truck", "taco truck", "food truck", "food truck", "truck", "truck"], "difficult_direct_answer": false, "rationales": ["The vehicle is parked and is serving food out of the side.", "That is a food truck parked on the side.", "The food truck is likely to serve food."], "image": "train2014/COCO_train2014_000000572786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561673, "question_id": "M853rzLoSsxqkzHEP6Lmy4", "question": "How many red suitcases are cycling around the luggage return?", "choices": ["one", "three", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "three", "three", "two", "two", "four", "few", "two", "three"], "difficult_direct_answer": false, "rationales": ["There are two suitcases.", "There are two red suitcase seen in the language turn.", "There are 2."], "image": "train2014/COCO_train2014_000000561673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353138, "question_id": "M8AxrsGHA5P9qyQjzjrYF4", "question": "What web browser is the person using?", "choices": ["internet explorer", "lexisnexis", "apple safari", "google chrome"], "correct_choice_idx": 0, "direct_answers": ["edge", "car", "wired.com", "internet explorer", "fsdf", "microsoft", "stop", "chrome", "internet explorer", "internet explorer"], "difficult_direct_answer": false, "rationales": ["The browser is the internet.", "They are using internet explorer as their web browser.", "The person is on a web browser that has an e on it."], "image": "train2014/COCO_train2014_000000353138.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97229, "question_id": "M94Yf7nvZw2FHFMZuhvsy2", "question": "What color is the triangle halves on the side of the bus next to the luggage holders?", "choices": ["black", "green", "yellow", "orange"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange", "black", "red", "black red", "red", "orange", "red", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The triangle is not black, yellow, or green.", "The color is orange.", "The triangles halves are orange. they are brightly colored."], "image": "train2014/COCO_train2014_000000097229.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329687, "question_id": "MA3z4YSxLy7K5xNgzm73Li", "question": "What do both people have on?", "choices": ["crowns", "sunglasses", "armor", "masks"], "correct_choice_idx": 1, "direct_answers": ["sunglasses", "glass", "sunglasses", "sunglasses", "bench", "clothes", "hats", "fefefe", "sunglasses", "sunglasses"], "difficult_direct_answer": false, "rationales": ["The man and woman are both sitting in the sun.", "They have sunglasses.", "They are wearing dark glasses over their eyes. the sun is out so it is typical for people to wear this to shield their eyes."], "image": "val2014/COCO_val2014_000000329687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176318, "question_id": "MB7ZD7fyLPTWbomcwLfrNA", "question": "What can be heard coming out of the object on the toddlers ear?", "choices": ["ocean", "voices", "birds", "gun shots"], "correct_choice_idx": 1, "direct_answers": ["car", "voice", "voices", "speaking", "cell phone", "talking", "call", "voice", "voice", "voice"], "difficult_direct_answer": false, "rationales": ["A young person is holding a phone to her ear and smiling.", "The phone is used for verbal communication.", "The toddler is holder a cell phone. people speak on phones."], "image": "train2014/COCO_train2014_000000176318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385752, "question_id": "MCbca3kXRUarexSDWXndbf", "question": "How many dresses are sat around the entryway to the hall?", "choices": ["four", "three", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two dresses.", "There are two dresses. one dress is on each side of the hall and they are nestled between parasols.", "There aren't any others in the doorway. there's one more on a painting once someone walks past the entryway."], "image": "train2014/COCO_train2014_000000385752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295756, "question_id": "MFmL999NQaFcvgLBCnkAva", "question": "How many black towel dispensers are hung on the side of the wall?", "choices": ["five", "one", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "six", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is one on each side of the mirror", "There are 2 dispensers.", "The black items are the towel dispensers. this is a common color used for them."], "image": "val2014/COCO_val2014_000000295756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62510, "question_id": "MGoZpqbPTHNkQRta4TjvZe", "question": "Why is the bowl on the floor?", "choices": ["catch leak", "feed child", "feed pet", "storage"], "correct_choice_idx": 2, "direct_answers": ["dog bowl", "feed pet", "bathing baby", "no", "animal food", "for clean", "broken", "pet dish", "animals food", "feed pet"], "difficult_direct_answer": true, "rationales": ["The bowl is there to feed their animals.", "The bowl on the floor is one usually used to feed pets", "There is a bowl on the floor in order to feed a pet like a cat or dog."], "image": "val2014/COCO_val2014_000000062510.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71608, "question_id": "MHRpnDZvaEGwfWaUYJ6tSj", "question": "Why are the men in uniforms standing by the road?", "choices": ["street workers", "entertainment", "doctors", "security"], "correct_choice_idx": 3, "direct_answers": ["watching", "race", "police", "protect", "protect", "security", "guarding", "security", "keep peace", "block path"], "difficult_direct_answer": false, "rationales": ["The police officers are there for safety", "There are men wearing security uniforms standing around the road.", "People in uniform line the streets where bike ricers pass."], "image": "train2014/COCO_train2014_000000071608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 541092, "question_id": "MJnBGvokHaJTsXkErdzoN8", "question": "What number is the batter wearing?", "choices": ["41", "99", "77", "83"], "correct_choice_idx": 0, "direct_answers": ["41", "fortyone", "stop", "forty one", "41", "white", "41", "car", "41", "41"], "difficult_direct_answer": false, "rationales": ["The number is 41.", "The batter has the number 41 on the front of his shirt.", "The number is 41."], "image": "train2014/COCO_train2014_000000541092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446661, "question_id": "MKkxNHCw7ZwfGqDzscHx2M", "question": "What famous billionaire started the Virgin airline company?", "choices": ["donald trump", "sam walton", "richard branson", "michael bloomberg"], "correct_choice_idx": 2, "direct_answers": ["richard branson", "richard branson", "richard branson", "richard branson", "richard charles", "benson", "brunson", "richard branson", "sir richard", "richard branson"], "difficult_direct_answer": false, "rationales": ["The person is branson.", "The question is answerable using an internet search engine and is not dependent at all on the image.", "His name is richard."], "image": "train2014/COCO_train2014_000000446661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432160, "question_id": "ML4sLeJ2893a9FF4oKFQBE", "question": "What is the number of suitcases sitting on the floor of this airport chamber?", "choices": ["four", "two", "five", "three"], "correct_choice_idx": 0, "direct_answers": ["five", "four", "four", "five", "five", "four", "four", "four", "four", "five"], "difficult_direct_answer": false, "rationales": ["There are 4.", "There are four suitcases.", "You can count the handles and the suitcases that are standing up to get this number, the small bag sitting on the floor is not considered a suitcase so it should not be counted."], "image": "val2014/COCO_val2014_000000432160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106993, "question_id": "MLwnKGDprt64bwGVgRSien", "question": "What is likely in the metal cup?", "choices": ["ketchup", "marinara sauce", "mustard", "fry sauce"], "correct_choice_idx": 0, "direct_answers": ["catchup", "ketchup", "ketchup", "ketchup", "ketchup", "sauce", "gravy", "dip sauce", "ketchup", "ketchup"], "difficult_direct_answer": false, "rationales": ["The cup has ketchup.", "There is red sauce in a steel cup with a sandwich and fries.", "Answer a is consistent with the serving dish and the foods being served."], "image": "train2014/COCO_train2014_000000106993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459576, "question_id": "MMYuV2L6EYjjMoh688Yi3w", "question": "The largest word on the sign is the name of a 2017 movie starring what Italian actress?", "choices": ["matilda lutz", "sophia loren", "jessica biel", "beverly dangelo"], "correct_choice_idx": 0, "direct_answers": ["revenge", "sophia loren", "juice", "revenge", "matilda lutz", "matilda", "revenge", "inerrf", "matilda lutz", "matilda lutz"], "difficult_direct_answer": false, "rationales": ["Matilda is the only actress who is both italian and working during this time. for example, sophia loren is too old or dead in 2017.", "The sign stars matilda lutz.", "The word is lutz."], "image": "train2014/COCO_train2014_000000459576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142127, "question_id": "MQ2DeFLCNqTADjd59WqZyp", "question": "What color is the ham held inside of the biscuit sandwich with a toothpick shoved through it?", "choices": ["ham", "turkey", "chicken", "beef"], "correct_choice_idx": 0, "direct_answers": ["red", "pink", "brown", "pink", "pink", "pink", "pink", "pink", "ham", "pink"], "difficult_direct_answer": false, "rationales": ["It is a red color.", "The ham inside the biscuit is ham colored.", "The ham is inside the sandwich."], "image": "val2014/COCO_val2014_000000142127.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38490, "question_id": "MRpA65dboXJuWQTjLuTPPk", "question": "How many giraffes are there shot in the middle of this zoo lot?", "choices": ["four", "five", "six", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "two", "three"], "difficult_direct_answer": false, "rationales": ["Three giraffes are present.", "There are 3.", "Three giraffes are pictured."], "image": "train2014/COCO_train2014_000000038490.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296978, "question_id": "MUhndnq4bx6Lu355T7iJDK", "question": "What is the woman wearing?", "choices": ["glasses", "armor", "helmet", "sombrero"], "correct_choice_idx": 0, "direct_answers": ["clothes", "jacket", "jacket", "jurkin", "glasses", "jacket", "jacket", "coat", "jacket", "jacket"], "difficult_direct_answer": false, "rationales": ["She has lenses in a wire frame on her face", "She has lenses in a frame sitting on her face", "The woman is wearing some glasses on her face."], "image": "train2014/COCO_train2014_000000296978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158201, "question_id": "MV7rnxmHhehHrYGW4VEy6J", "question": "What is this woman wearing?", "choices": ["cape", "school uniform", "crown", "laurel wreath"], "correct_choice_idx": 1, "direct_answers": ["school uniform", "suit", "tie", "suit", "shirt", "dress", "tie", "court", "coat", "tie"], "difficult_direct_answer": false, "rationales": ["The woman is wearing school uniform from head to toss.", "The other options don't appear in this image.", "The woman is wearing something for school."], "image": "train2014/COCO_train2014_000000158201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242325, "question_id": "MW5PV6CyHXqxdKDaGrEsj2", "question": "What is causing the beams of light to appear like this in the photo?", "choices": ["grass", "trees", "camera", "animals"], "correct_choice_idx": 1, "direct_answers": ["tree", "sun", "trees", "sunshine refraction", "sun", "sun", "sunlight", "sun", "sun", "sun"], "difficult_direct_answer": false, "rationales": ["Sunlight is shining through trees and onto a meadow where sheep are grazing.", "The beams have a tree.", "The trees are causing the beams."], "image": "train2014/COCO_train2014_000000242325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511224, "question_id": "MX6w4xuhjRPaVDpPcU88vw", "question": "How many giraffes are walking on the left side of the dirt road?", "choices": ["four", "three", "five", "six"], "correct_choice_idx": 0, "direct_answers": ["two", "four", "two", "three", "two", "two", "two", "four", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are four giraffe that can be count.", "There are four of them walking.", "There are four by the dirt road."], "image": "train2014/COCO_train2014_000000511224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414067, "question_id": "MXbosjsMQ9r2mA7u7iaxuy", "question": "What color are the patches around the eyes and noses of the sheep in this field?", "choices": ["two", "four", "one", "three"], "correct_choice_idx": 1, "direct_answers": ["black", "four", "black", "stop", "black", "white", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["There are four patches.", "They are black", "The sheep are white with black circles around their eyes."], "image": "train2014/COCO_train2014_000000414067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340898, "question_id": "MY475WoHf3QUCDTxQ4gZzF", "question": "How many zebras are running in the savannah area?", "choices": ["four", "one", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "foor", "three", "four", "three", "four", "three", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are four zebras in the photo but only two of them are running.", "There are 2.", "There are 3 in plain view and just the hind quarters of another one can be seen behind the one in the front"], "image": "train2014/COCO_train2014_000000340898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416473, "question_id": "MZduiNQVQASvXyNS8v23LK", "question": "How many people are likely enjoying this meal?", "choices": ["two", "seven", "12", "14"], "correct_choice_idx": 0, "direct_answers": ["one", "two", "one", "one", "one", "two", "two", "two", "one", "one"], "difficult_direct_answer": false, "rationales": ["If you look at the number of glasses and plates there is one set for each person and you can see that there are two.", "There are 2.", "The lady seems to be looking at someone so her and that other person."], "image": "train2014/COCO_train2014_000000416473.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68422, "question_id": "MagvJFGkxx4SfytK73xcEd", "question": "What are the jars on the counter?", "choices": ["gourds", "cookie jar", "crocks", "canisters"], "correct_choice_idx": 3, "direct_answers": ["cooking jars", "cannisters", "glass", "condiments", "food", "canisters", "ingredients", "kettle", "juice", "peancon"], "difficult_direct_answer": true, "rationales": ["This is the name applied to these types of storage containers in a kitchen.", "Clear jars with airtight lids are on a kitchen counter.", "The jars are canisters."], "image": "train2014/COCO_train2014_000000068422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260957, "question_id": "Md2g2AxQeJYK6BWGroxaun", "question": "What is the woman on the left wearing?", "choices": ["tiara", "sunglasses", "basket", "clown nose"], "correct_choice_idx": 1, "direct_answers": ["clothes", "sunglasses", "shirt", "identification badge", "tshirt", "polo", "sunglasses", "umbrella", "sunglasses", "umbrella"], "difficult_direct_answer": false, "rationales": ["You can tell by how the glasses are tinted as to what she is wearing.", "The woman is wearing sunglasses.", "The glasses over her eyes are shaded."], "image": "train2014/COCO_train2014_000000260957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561352, "question_id": "MdS6AD5Msonq7ZkirrPrk5", "question": "What is the object called that the man in the forefront has on his face?", "choices": ["tattoo", "bandana", "surgical mask", "goggles"], "correct_choice_idx": 1, "direct_answers": ["scarf", "be", "bandana", "bandana", "cap", "handkerchief", "cloth", "cover", "bandana", "bandana"], "difficult_direct_answer": false, "rationales": ["The object is a bandana.", "The size, shape and manner the object is being worn is consistent with answer a.", "This is often worn by people who wear motorcycles so they can breath better and not breathe in a lot of dust, dirt, and other debris and fumes."], "image": "train2014/COCO_train2014_000000561352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252927, "question_id": "Me7NnU5cQ7RbBHdKRUhXxY", "question": "What kind of meat is decorating the pizza on top of the table?", "choices": ["ham", "italian sausage", "chicken", "pepperoni"], "correct_choice_idx": 1, "direct_answers": ["sausage", "sausage", "sausage", "sausage", "sausage", "sausage", "sausage", "chicken", "italian sausage", "sausages"], "difficult_direct_answer": false, "rationales": ["Italian sausage is a popular topping.", "It's cut into little round slices and baked along with the pizza.", "There is a pizza with cheese and peppers along with sliced sausage on a table."], "image": "val2014/COCO_val2014_000000252927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 519172, "question_id": "MefazFjvi9gjGTrPRferVV", "question": "What color is at the very middle of the kite?", "choices": ["purple", "black", "red", "pink"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color at the very middle of the kite is not purple, black, or pink.", "Of all the colored stripes on the kite, the very middle strip is red.", "The color is red."], "image": "train2014/COCO_train2014_000000519172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557174, "question_id": "MehnJ4BqN2NtiVq5vUTSsY", "question": "Which kind of animal lives in this house?", "choices": ["reptile", "fish", "cat", "dog"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "cat", "dog", "dog", "dog", "dog", "dog", "human", "dog"], "difficult_direct_answer": false, "rationales": ["A dog is seen by the window.", "The dog lives here.", "A dog bed is on the floor in a living room."], "image": "train2014/COCO_train2014_000000557174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143637, "question_id": "MfGpQRaTt8oQJBSxHu4hgr", "question": "Why is his face painted?", "choices": ["show support", "safety", "cover acne", "halloween"], "correct_choice_idx": 0, "direct_answers": ["show support", "red", "fan", "team color", "game", "supporting team", "mask", "event", "show love", "fans support"], "difficult_direct_answer": true, "rationales": ["These fans show support with their team's color displayed on shirts and faces.", "The man is a fan.", "The face paint is for support."], "image": "train2014/COCO_train2014_000000143637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138755, "question_id": "MfXUCq98mUEruVKyp2sjJ4", "question": "What color is the wide dome in the background of the church?", "choices": ["pink", "purple", "blue copper", "red"], "correct_choice_idx": 2, "direct_answers": ["light green", "green", "blue", "green", "green", "green", "green", "blue copper", "black", "white"], "difficult_direct_answer": false, "rationales": ["The dome in the background of this city skyline is a pale blue.", "There is a dome building in the background that is a sky blue color.", "It is weathered metal"], "image": "val2014/COCO_val2014_000000138755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262425, "question_id": "Mg7ByohztB4w3iGDmNNAiA", "question": "What baby name is related to this place?", "choices": ["dell", "shemp", "apple", "marina"], "correct_choice_idx": 3, "direct_answers": ["dock", "dock", "sailor", "dick", "marina", "pier", "marina", "child", "boat", "shell"], "difficult_direct_answer": false, "rationales": ["This place is used to park boats.", "That is the baby name on the boat.", "Marina is related."], "image": "val2014/COCO_val2014_000000262425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456690, "question_id": "MhzjD5v6rhceAFDh3RYTVt", "question": "The animal in the foreground belongs to what grouping?", "choices": ["equidae", "crustacean", "vegetable", "bird"], "correct_choice_idx": 0, "direct_answers": ["equidae", "wild animal", "zebras", "mammal", "zebra", "zebra", "exotic", "dazzle", "zebra", "zebras"], "difficult_direct_answer": false, "rationales": ["Horses and zebras belong to this same group.", "These are zebras grazing in the grass. they belong closest to horse family.", "Elephants are part of that family."], "image": "val2014/COCO_val2014_000000456690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50960, "question_id": "MiK7bQNzahjBcDRy9dQNtX", "question": "What color is the plate in between the two coffee cups on the table?", "choices": ["red", "green", "white", "orange"], "correct_choice_idx": 3, "direct_answers": ["orange", "black", "orange", "orange", "orange", "orange blue", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The location of the plate is given in the text of the question and the color is clearly visible.", "The plate in between the two cups of coffee is bright orange.", "The color is orange."], "image": "val2014/COCO_val2014_000000050960.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 399942, "question_id": "MiKX228DfpgBcQaZPHLHUC", "question": "What country is this most likely?", "choices": ["gabon", "france", "egypt", "japan"], "correct_choice_idx": 3, "direct_answers": ["china", "china", "japan", "china", "china", "japan", "china", "japan", "china", "china"], "difficult_direct_answer": false, "rationales": ["The architecture looks japanese.", "There is an asian, not european or african, temple. asian people and their motorcycles are near the temple.", "The country is japan."], "image": "val2014/COCO_val2014_000000399942.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503005, "question_id": "Mjvo8TWs6rxoqCbXspwmnA", "question": "What color is the large building in the background behind the man riding the horse?", "choices": ["orange", "blue", "gray", "red"], "correct_choice_idx": 3, "direct_answers": ["brown", "red", "brown", "brown", "brown", "red", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The building is made made of red brick.", "Past the treeline we can see a tall building comprised mostly of red brick.", "There is a large reddish brick building extended up from the trees. there is a man in a hat riding a horse in the forefront."], "image": "val2014/COCO_val2014_000000503005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435096, "question_id": "MkQqLufGtDGuUCzYTghNUH", "question": "What color is the woman's dress who is riding a white stallion?", "choices": ["yellow", "pink", "green", "red"], "correct_choice_idx": 3, "direct_answers": ["white", "red", "red white", "red", "red", "red", "red", "red", "red white", "fee"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The dress is red.", "The woman on a white horse is wearing a red dress with white accents."], "image": "train2014/COCO_train2014_000000435096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384261, "question_id": "MmSWvULvRH59erW4qvSuw3", "question": "What color is the big circular ring in the logo of the bear's t-shirt?", "choices": ["blue", "yellow", "red", "pink"], "correct_choice_idx": 2, "direct_answers": ["red", "teddy", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["This is obviously the color.", "The color is red.", "The drawn ring in the middle of the white shirt is red."], "image": "train2014/COCO_train2014_000000384261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138092, "question_id": "MmrFB6LFbr6vHoKQko9zwG", "question": "What kind of fish is consumed on the side of the breakfast?", "choices": ["bacon", "beef", "sausage", "salmon"], "correct_choice_idx": 3, "direct_answers": ["fillets", "tilapia", "pork", "salmon", "tilapia", "egg", "salmon", "salmon", "salmon", "salmon"], "difficult_direct_answer": false, "rationales": ["The fish is lox.", "The fish is salmon.", "Lox is made of smoked salmon."], "image": "train2014/COCO_train2014_000000138092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85281, "question_id": "MnEJoDxL3AmkT6ssC9hwYh", "question": "What is in the sandwich?", "choices": ["apple", "red peppers", "eggs", "chicken leg"], "correct_choice_idx": 2, "direct_answers": ["chicken", "bacon", "bacon eggs", "breakfast", "eggs", "eggs", "eggs", "bacon", "eggs", "eggs"], "difficult_direct_answer": false, "rationales": ["The other options don't appear to be on the sandwich. this type of bread is often paired with a.", "There are some yellow eggs inside of the sandwich.", "The yellow filling is scrambled breakfast items."], "image": "train2014/COCO_train2014_000000085281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183100, "question_id": "MnKa8pYiejC5jr6MAXZmpY", "question": "Where would these tools be found?", "choices": ["kitchen", "store", "trunk", "street"], "correct_choice_idx": 1, "direct_answers": ["garage", "store", "dev", "hardware", "woodshop", "workshop", "shed", "hammerscissor", "garage", "tool box"], "difficult_direct_answer": true, "rationales": ["They have price tags on them", "The tools are in a store.", "A store would sell these since they have price tags."], "image": "train2014/COCO_train2014_000000183100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240820, "question_id": "MoWNisTdWi2DhBtL7oSNw6", "question": "Why are these people wearing jackets?", "choices": ["fashion", "visibility", "protection", "keep warm"], "correct_choice_idx": 3, "direct_answers": ["cold", "snowy out", "cold outside", "cold weather", "keep warm", "cold", "stay warm", "cold winter", "its cold", "cold"], "difficult_direct_answer": false, "rationales": ["A group of people are standing in the snow in snow pants with skis on and snow all around.", "They're cold.", "They are outside in the cold."], "image": "train2014/COCO_train2014_000000240820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537074, "question_id": "MokYPwxvqYKCdkVdTjCcNc", "question": "What kind of junction is this?", "choices": ["pedestrian crossing", "canal", "railway", "boat"], "correct_choice_idx": 2, "direct_answers": ["train", "train station", "railway", "rail", "train", "train", "railway", "train", "cross", "train junction"], "difficult_direct_answer": false, "rationales": ["The junction is a railway.", "The image has visible trains and parallel metal tracks with cross-boarding consistent with answer a.", "The junction is a railway."], "image": "train2014/COCO_train2014_000000537074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239005, "question_id": "MowjAf65MnQCtRv7fShVEW", "question": "What shape is the small plate?", "choices": ["circle", "octagon", "square", "hexagon"], "correct_choice_idx": 2, "direct_answers": ["square", "square", "square", "square", "square", "square", "square", "square", "square", "square"], "difficult_direct_answer": false, "rationales": ["The shape is square.", "There are four corners and the sides are all equal length.", "The shape is a square."], "image": "val2014/COCO_val2014_000000239005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534220, "question_id": "MpKkafpLaeUi4bZcxUHH4B", "question": "How many kites are already in the air?", "choices": ["three", "one", "six", "eight"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "one", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["3 kites are flying above.", "There are three people currently clustered in the middle of the beach. they are all flying kites in the air.", "The rest are on the ground"], "image": "train2014/COCO_train2014_000000534220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572499, "question_id": "MrKPRR268qpVeb8T4qPPju", "question": "The person with the brown glove dated what celebrity?", "choices": ["alyssa milano", "tiny tim", "katt williams", "idris elba"], "correct_choice_idx": 0, "direct_answers": ["model", "mariah carey", "amber heard", "unknown", "alyssa milano", "unsure", "amber heard", "j lo", "rihanna", "unkown"], "difficult_direct_answer": true, "rationales": ["Not knowing who the player is and thus not being able to determine who they dated, one can assume by heteronormity that the male in question likely dated a female. answer a is the only female on the list.", "That is where the glove was from?", "Barry zito is the pitcher that dated alyssa milano"], "image": "val2014/COCO_val2014_000000572499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350552, "question_id": "Mrzm7fozqppLMPL6M2FSGP", "question": "Why is the horse in the town center?", "choices": ["its shopping", "its exercising", "its working", "its eating"], "correct_choice_idx": 2, "direct_answers": ["giving rides", "its working", "carrying carriage", "carriage ride", "transportation", "waiting", "travel", "entertainment", "pulling carriage", "carriage rides"], "difficult_direct_answer": true, "rationales": ["The horse is hooked up to a carriage.", "There is a black horse pulling a carriage that will load passengers. they will take this to get around town.", "The horse gives carriage drawn tours."], "image": "train2014/COCO_train2014_000000350552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345020, "question_id": "MtF4vfGXvkjtDGVd9pS8ot", "question": "What would be one main reason a police would be riding this type of motorcycle?", "choices": ["functionality", "easy access", "looks", "speed"], "correct_choice_idx": 2, "direct_answers": ["ride", "patrol", "patrol streets", "escorting", "looks", "security", "ease", "street patrol", "fast", "patrol"], "difficult_direct_answer": true, "rationales": ["Cops are riding motorcycles in a street.", "Possibly d as well. they would need something fast to chase after people during street patrol.", "Police drive these because they can scoot around town easier."], "image": "train2014/COCO_train2014_000000345020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321014, "question_id": "Mtuwxf3iHXYX2XJJX7uKd8", "question": "What is surrounding the cat?", "choices": ["knives", "dogs", "penguins", "foxes"], "correct_choice_idx": 0, "direct_answers": ["knives", "knife", "knives", "knife", "knives", "knife", "knives", "knives", "knives", "knives"], "difficult_direct_answer": false, "rationales": ["There are sharp metal blades on the floor", "There are five objects surrounding the cat.", "They are easily identifiable by their shiny blades and dark handles."], "image": "val2014/COCO_val2014_000000321014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144951, "question_id": "Mv3VQFJFPoc8NeLThmoh2u", "question": "What is next to the banana on the table?", "choices": ["banana", "apple", "water", "coffee"], "correct_choice_idx": 2, "direct_answers": ["glass", "stop", "cup", "water glass", "glass", "water", "glass", "cup", "dwdw", "mug"], "difficult_direct_answer": false, "rationales": ["There is a glass with a clear liquid in it.", "It is a clear liquid in a glass", "There is a clear cup with clear liquid in it."], "image": "train2014/COCO_train2014_000000144951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171717, "question_id": "MvGg2Bx92s7i9rxN8EfHpc", "question": "What color are the lights on the top of the mirror in the bathroom?", "choices": ["yellow", "black", "white", "pink"], "correct_choice_idx": 0, "direct_answers": ["white", "silver", "silver white", "white", "white", "white", "yellow", "yellow", "white", "clear"], "difficult_direct_answer": false, "rationales": ["The lights above the bathroom sink give off a yellowish glow to the room.", "The color is yellow.", "The color is yellow."], "image": "val2014/COCO_val2014_000000171717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113935, "question_id": "MvHotwuVt2uaWibXApYs7G", "question": "What color is the metal fencing on the left side of this walkway?", "choices": ["green", "brown", "blue", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The fencing is red.", "The metal fencing is reflecting some sort of light that makes part of it appear to be red, but it is actually brown."], "image": "val2014/COCO_val2014_000000113935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235486, "question_id": "Mx4wJinchix2QFfqMNtxA6", "question": "What type of beverages are served in the wide glasses next to the dinner pizza?", "choices": ["water", "beer", "wine", "juice"], "correct_choice_idx": 1, "direct_answers": ["pizza", "drinks", "wine", "wine", "beer", "eating", "beer", "beer", "beer", "wine"], "difficult_direct_answer": false, "rationales": ["An amber colored drink fills large glasses on a table.", "People sit at a table with an amber liquid in their round, tall, glass drinkware with a thick stem.", "The drink is light brown with foam at the top."], "image": "val2014/COCO_val2014_000000235486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303593, "question_id": "MxDNtrYEiXM77kqyAGFxfX", "question": "This person's outfit looks like something what person would wear?", "choices": ["hillary clinton", "haystacks calhoun", "pope francis", "ali g"], "correct_choice_idx": 3, "direct_answers": ["runner", "gangster", "rapper", "rapper", "rapper", "musician", "rapper", "trouser", "ali g", "thief"], "difficult_direct_answer": false, "rationales": ["The outfit looks like ali g's clothes.", "The person looks like ali.", "Ali g is a rapper and wears a jump suit and gold chains like this person."], "image": "train2014/COCO_train2014_000000303593.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552571, "question_id": "MySCaAkYYVnkAiQPYFEtqK", "question": "Why might some of their trunks be curled?", "choices": ["eating", "drinking", "avoid tripping", "trumpeting"], "correct_choice_idx": 0, "direct_answers": ["walking", "eating", "for eating", "eating", "eating", "eating", "usage", "eating", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["The trunks are curled to eat.", "The elephant use their trunks for consumption of food.", "Their trunks are in their mouths."], "image": "train2014/COCO_train2014_000000552571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132887, "question_id": "MzRbJMQMvRB6mdcX4vCPQ9", "question": "What color of cow is in the middle with a yellow tag visible on his right ear?", "choices": ["black", "pink", "brown", "white"], "correct_choice_idx": 0, "direct_answers": ["black", "black", "black", "brown", "black", "brown", "black", "brown", "brown", "black"], "difficult_direct_answer": false, "rationales": ["The color is black.", "Some animals that give us beef products are known to have dark hair.", "A group of cows is gathered with one dark colored in the middle with a yellow tag on its ear."], "image": "train2014/COCO_train2014_000000132887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164491, "question_id": "MzSCkMYDaCJpWrBqUisk6Q", "question": "What item here can hold the most books?", "choices": ["luggage", "bookcase", "wheelbarrow", "backpack"], "correct_choice_idx": 3, "direct_answers": ["backpack", "backpack", "backpack", "backpack", "bench", "backpack", "backpack", "backpack", "backpack", "backpack"], "difficult_direct_answer": false, "rationales": ["Generally school aged people use backpacks, as they hold alot of things easily.", "There is a lot of room in this device. it is typically designed to carry books.", "The bag on the man's back is intended for and derives its name from carrying books."], "image": "train2014/COCO_train2014_000000164491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430319, "question_id": "N37eBx4B478pecinzD8pWq", "question": "What is an important part of this animals diet?", "choices": ["gluten", "sugar", "protein", "beeswax"], "correct_choice_idx": 2, "direct_answers": ["fish", "treats", "meat", "cat food", "belly", "protein", "cat food", "milk", "fish", "protein"], "difficult_direct_answer": false, "rationales": ["The animals have protein.", "The cat sitting on the woman's lap would prefer a diet that is high in proteins like chicken meat.", "The animal has protein."], "image": "val2014/COCO_val2014_000000430319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403013, "question_id": "N3Q6sdfNMweWTRtNYX73Hd", "question": "What is most likely behind the doors?", "choices": ["bedroom", "pantry", "bathroom", "garage"], "correct_choice_idx": 1, "direct_answers": ["pantry", "kitchen", "canned goods", "ghost", "pantry", "utility room", "closet", "another room", "dining room", "pantry"], "difficult_direct_answer": false, "rationales": ["Given that this is a kitchen and the type of door, it's most likely a. that said, it really could be any of these options.", "A storage area for food.", "The area in front of the doors is a kitchen. the room behind the door likely is used to store food."], "image": "val2014/COCO_val2014_000000403013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175202, "question_id": "N3Sv8ceqKzQAtsLA3KTsiG", "question": "The dialogue bubbles are an example of what editing technique?", "choices": ["hue", "superimposition", "opacity", "masking"], "correct_choice_idx": 1, "direct_answers": ["graphics", "puns", "bubbles", "text", "thinking", "cartoon", "superimposition", "cat", "overlay", "thinking"], "difficult_direct_answer": true, "rationales": ["The bubbles are superimposed.", "There are speech bubbles that were photoshopped on to the image.", "The bubble is a sumperimposition."], "image": "train2014/COCO_train2014_000000175202.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252919, "question_id": "N4WVxKDWUWTyCbCaGPfMZV", "question": "What is the purpose of the dog?", "choices": ["nothing", "herding", "hunting", "retrieving"], "correct_choice_idx": 1, "direct_answers": ["herd sheep", "herding sheep", "herding", "guard", "security", "guard", "guarding", "guide sheep", "watching", "herd"], "difficult_direct_answer": true, "rationales": ["The dog is pushing the sheep in a certain direction.", "The dog is herding.", "The dog herds."], "image": "train2014/COCO_train2014_000000252919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366648, "question_id": "N4ta559YKWXvEQpd6DmuLw", "question": "What is missing from this meal?", "choices": ["spaghetti", "meat", "vegetables", "corn"], "correct_choice_idx": 1, "direct_answers": ["meat", "cheese", "sauce", "meat", "meat", "meat", "tomato sauce", "protein", "meat", "meat"], "difficult_direct_answer": false, "rationales": ["There is no meat with this meal.", "A well-balanced meal includes some sort of protein, and that could be found in meat.", "There is no meat."], "image": "val2014/COCO_val2014_000000366648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285597, "question_id": "N4x8qmozhu8JMe6EA7DQWi", "question": "Which donut is the plain flavor?", "choices": ["all white", "all black", "all colored", "half colored"], "correct_choice_idx": 0, "direct_answers": ["all white", "rightmost", "brown", "chocolate", "chocolate", "beige colored", "glazed", "right side", "glazed", "rightmost"], "difficult_direct_answer": false, "rationales": ["You can tell by the lack of sprinkles as to what is the plain flavored donuts.", "The one with no toppings", "Coated flavors usually have different colors meaning different flavors."], "image": "val2014/COCO_val2014_000000285597.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260141, "question_id": "N4yq6SabhdnayveZeXNBt7", "question": "What is on top of the water?", "choices": ["squirrels", "bears", "surfers", "boats"], "correct_choice_idx": 3, "direct_answers": ["boats", "boats", "boats", "boat", "boats", "boats", "boats", "boats", "boats", "boats"], "difficult_direct_answer": false, "rationales": ["The boats are on top.", "There are marine vehicles floating on the water.", "Boats are in the water."], "image": "val2014/COCO_val2014_000000260141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124835, "question_id": "N5giF3i3uarKTkfRM3C2Ru", "question": "What is the large item in the foreground?", "choices": ["bread", "apple", "mouse", "birthday cake"], "correct_choice_idx": 0, "direct_answers": ["popover", "donuts", "bread", "bread", "bread", "donut", "roll", "roll", "bread", "medium"], "difficult_direct_answer": false, "rationales": ["It is crusty like a bun.", "The large object in the foreground is a food item, not an animal. it is not an apple or a cake.", "The bread is large."], "image": "train2014/COCO_train2014_000000124835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513943, "question_id": "N9dxsz6Q8BN3kziBwsT7Pu", "question": "Who plays this sport?", "choices": ["serena williams", "bo jackson", "pele", "marian hossa"], "correct_choice_idx": 0, "direct_answers": ["tennis players", "billy jean", "people", "tennis players", "sarina", "tennis players", "men", "tennis player", "tennis players", "serena williams"], "difficult_direct_answer": false, "rationales": ["Serena williams is famous for playing tennis.", "Serena plays.", "The sport is tennis which is consistant with this player. i can tell it's tennis by the racket and the court."], "image": "train2014/COCO_train2014_000000513943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352006, "question_id": "NCBiWyTThRbtQh7TsxkHXD", "question": "What word is related to these animals?", "choices": ["beef", "ewe", "kitten", "joey"], "correct_choice_idx": 1, "direct_answers": ["wool", "sheep", "sheeps", "sheep", "ewe", "sheep", "wool", "herd", "goat", "fluffy"], "difficult_direct_answer": false, "rationales": ["These are sheep and related to ewes", "The animals are sheep, not cats, kangaroos, or cows.", "The word is an ewe."], "image": "train2014/COCO_train2014_000000352006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473057, "question_id": "NGyu7LjGHwYFgWapUi2giS", "question": "How many pillows are laid upon the backside mantle of this bedding?", "choices": ["four", "two", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two pillows"], "difficult_direct_answer": false, "rationales": ["There are 2.", "There are two pillows.", "There are two pillows propped up against the backside of the mantle."], "image": "val2014/COCO_val2014_000000473057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286673, "question_id": "NHq3eRttd63YdwYZbybUuL", "question": "What would you call a pizza with this kind of toppings?", "choices": ["mushroom suprise", "peperoni", "sausage", "vegetable"], "correct_choice_idx": 1, "direct_answers": ["streak", "peperoni", "pepperoni", "pepperoni pizza", "pepperoni", "pepperoni", "red", "peperoni", "pepperoni", "deep dish"], "difficult_direct_answer": false, "rationales": ["These are round circles of meat cut from a cured sausage", "The pizza has a thick crust and is covered in pepperoni.", "It's pepperoni."], "image": "train2014/COCO_train2014_000000286673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543782, "question_id": "NJU7ZK7qRi3cTwSHTY9mtV", "question": "What is one of the longer items here?", "choices": ["ski pole", "giraffe neck", "ladder", "airplane"], "correct_choice_idx": 0, "direct_answers": ["skiis", "ski", "snow boards", "skiis", "skis", "ski pole", "skis", "skis", "poles", "skies"], "difficult_direct_answer": false, "rationales": ["The only answer that makes sense is the one to do with skiing. they are standing turning around posing in the snow.", "There is a group of people with glasses and about to ski down a snow slope using long vertical structures to guide them.", "The people are using ski poles to help them ski. there is no ladder, giraffe or airplane pictured."], "image": "val2014/COCO_val2014_000000543782.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451087, "question_id": "NLRazZoAA7oko44HYVuJoV", "question": "What is the name of the batter?", "choices": ["joe mauer", "chipper jones", "tanyon sturtze", "jerry cantrell"], "correct_choice_idx": 0, "direct_answers": ["unknown", "joe", "twins", "joe mauer", "mick jager", "unknown", "joe mauer", "joe mauer", "playing", "nfl"], "difficult_direct_answer": false, "rationales": ["Joe mauer plays for this team.", "The batter is joe mauer.", "The batter is wearing a twins jersey with the number 7. the text indicates the picture was taken in 2010."], "image": "val2014/COCO_val2014_000000451087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144703, "question_id": "NMFtTStq7xZNTrEqZS7gj9", "question": "What actor has the same name as the first name on the wall in rainbow colors?", "choices": ["jeff garlin", "todd bridges", "tim minchin", "jimmy smits"], "correct_choice_idx": 1, "direct_answers": ["todd bridges", "ok", "steve", "todd", "steve martin", "steve caroll", "steve carol", "todd", "todd", "brown"], "difficult_direct_answer": false, "rationales": ["The name todd is the first name on the rainbow magnet.", "Todd bridges has the same name.", "The actor is bridges."], "image": "train2014/COCO_train2014_000000144703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415646, "question_id": "NMwVz5EihUXo7DYxurGxqD", "question": "What is the purpose of the object on the wall to the left of the window?", "choices": ["watch tv", "tell time", "show reflection", "look outside"], "correct_choice_idx": 1, "direct_answers": ["tell time", "tell time", "tell time", "tell time", "tell time", "sleep", "tell time", "sleeping", "tell time", "tell time"], "difficult_direct_answer": false, "rationales": ["The purpose is to tell time.", "A clock is on the wall.", "It has numbers on it to indicate the hour and minutes. it is tradition to have it on the wall."], "image": "val2014/COCO_val2014_000000415646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573146, "question_id": "NMxPFAsTpoF4NSX27fyvXG", "question": "Where does this giraffe on the side of the tour bus probably live?", "choices": ["zoo", "wild", "conservatory", "boat"], "correct_choice_idx": 1, "direct_answers": ["sanctuary", "savannah", "jungle", "savannah", "grassy area", "zoo", "tourist", "zoo", "serengeti", "wild"], "difficult_direct_answer": false, "rationales": ["The giraffe appears to be free. the people seem to be on a safari tour.", "The giraffe is in the wild.", "Giraffes do not live on boats. the giraffe is not in an enclosure, so it does not live in a zoo."], "image": "train2014/COCO_train2014_000000573146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82904, "question_id": "NRKEPzXPL7BnsDfqhJywpb", "question": "How many signs are attached to the post that is stuck in the ground near the horses?", "choices": ["three", "two", "five", "four"], "correct_choice_idx": 1, "direct_answers": ["one", "two", "two", "zero", "two", "two", "two", "one", "zero", "two"], "difficult_direct_answer": false, "rationales": ["There are two horizontal things at the top of the post.", "They are one on top of the other", "They are both at the top of the pole"], "image": "train2014/COCO_train2014_000000082904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 83651, "question_id": "NSfJKeCsz2VjbYdrBZ9n2E", "question": "What is the cat sleeping next to?", "choices": ["mouse", "sandwich", "baby", "basket"], "correct_choice_idx": 0, "direct_answers": ["monitor", "keyboard", "monitor", "keyboard", "keyboard", "mouse", "computer", "mouse", "monitor", "computer"], "difficult_direct_answer": false, "rationales": ["A mouse is a computer item that is often found next to the keyboard so it is the most logical choice even though it is not visible.", "There is a device for controlling the pointer on the computer next to the cat.", "The cat is near the mouse."], "image": "train2014/COCO_train2014_000000083651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124028, "question_id": "NSzDUL2bQLsXuiQ2oLzPtQ", "question": "What color are the cranes on the side of the river?", "choices": ["gray", "yellow", "green", "red"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange", "white", "white", "white", "blue", "red", "orange", "orange", "red"], "difficult_direct_answer": false, "rationales": ["They're obviously not the other colors listed here.", "The cranes are tall and near the water.", "Large red construction equipment extends up higher than the other equipment."], "image": "train2014/COCO_train2014_000000124028.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392642, "question_id": "NTBf6wqRsZUps9CZJnPDn7", "question": "What is on the couch?", "choices": ["cow", "dog", "llama", "old man"], "correct_choice_idx": 1, "direct_answers": ["dogs", "dogs", "dog", "2 dogs", "dogs", "dog", "dog", "dog", "dogs", "red"], "difficult_direct_answer": false, "rationales": ["He has four legs a long snout and drooping ears.", "It is a furry domestic animal", "Non-human animals are on the couch. they are pets, not farm animals."], "image": "val2014/COCO_val2014_000000392642.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244227, "question_id": "NTUYxdkzotHQ9EZTkwkgAx", "question": "What time of day is it at this time?", "choices": ["night", "morning", "noon", "day"], "correct_choice_idx": 0, "direct_answers": ["night", "night", "night time", "night", "night", "night", "night", "night", "night", "night"], "difficult_direct_answer": false, "rationales": ["The time is nighttime.", "The time is night.", "It is dark out. the pizzeria has its signs lit up."], "image": "train2014/COCO_train2014_000000244227.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186315, "question_id": "NUafDHuN8jzZi4wsqzq4SG", "question": "What is the sign saying PARK indicating?", "choices": ["water park", "sports park", "tree park", "car park"], "correct_choice_idx": 3, "direct_answers": ["legal parking", "parking garage", "parking", "parking lot", "parking", "vehicle parking", "parking garage", "parking inside", "park", "car park"], "difficult_direct_answer": false, "rationales": ["It is where you can take your car so you can shop", "Given the city setting and plethora of cars parked alongside the road we can assume the parking suggested by this sign is for cars.", "That is a place where you can place your vehicle in said \"garage\" and go sightsee in the town area."], "image": "train2014/COCO_train2014_000000186315.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238125, "question_id": "NUp5j3FkCEmhAh5hrXbwTd", "question": "Where would be the most comfortable place to sit here?", "choices": ["high chair", "water bed", "couch", "hammock"], "correct_choice_idx": 2, "direct_answers": ["couch", "sofa", "couch", "couch", "couch", "couch", "couch", "sofa", "couch", "couch"], "difficult_direct_answer": false, "rationales": ["This is a living room, not a bedroom. there is no high chair, hammock, or water bed.", "The couch might be a very comfortable place to rest.", "The couch is probably the most comfortable place to sit in this room."], "image": "val2014/COCO_val2014_000000238125.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299004, "question_id": "NVEwzKEdcaoz4RVaePHr2X", "question": "What is next to the screen?", "choices": ["baby", "flowers", "bananas", "eggs"], "correct_choice_idx": 1, "direct_answers": ["writings", "paper", "flowers", "papers", "speakers", "table", "papers", "papers", "documents", "papers"], "difficult_direct_answer": false, "rationales": ["There are plants, not eggs, fruits, or people, near the screen.", "There is a tablet device on a table. it is in front of a row of hedges.", "The screen is on the table outside. there are flowers nearby."], "image": "train2014/COCO_train2014_000000299004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573973, "question_id": "NVK2ZSsnHgq7motRrhzzEk", "question": "What is the oval object connected to this person's keys?", "choices": ["carabiner", "ubolt", "key chain", "knife"], "correct_choice_idx": 0, "direct_answers": ["keychain", "keychain", "mouse", "carabiner", "keychain", "key ring", "nothing", "computer", "carabiner", "key chain"], "difficult_direct_answer": false, "rationales": ["Aare good for holding keys intact and can be attached to trousers to prevent being lost.", "That is what the keys connect to.", "This is a clip that pushes open to hold or carry items"], "image": "train2014/COCO_train2014_000000573973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 126688, "question_id": "NVphk5dcLzVsMuVyVTnKKX", "question": "What is the likely relationship of the girl to the man?", "choices": ["home assistant", "granddaughter", "daughter", "maid"], "correct_choice_idx": 1, "direct_answers": ["granddaughter", "granddaughter", "granddaughter", "granddaughter", "granddaughter", "granddaughter", "granddaughter", "granddaughter", "grand daughter", "grandson"], "difficult_direct_answer": false, "rationales": ["The man is elderly and the little girl is very young.", "The man is much older with gray hair", "The young child looks to be a grandchild of the man."], "image": "val2014/COCO_val2014_000000126688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145217, "question_id": "NXqXahbv4mQZomzvacx6Ug", "question": "What color are the shorts worn by the man carrying a surfboard down the beach?", "choices": ["blue", "pink", "white", "red"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue checkered", "blue", "navy blue", "blue", "blue", "blue", "yellow", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The man is wearing blue shorts.", "A man is wearing plaid looking boxers that match the color of the sky.", "The color is blue."], "image": "train2014/COCO_train2014_000000145217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243857, "question_id": "NYYMLGYa55k2TQzN52SEtq", "question": "What color is the sink underneath the silver arched faucet?", "choices": ["silver", "clear", "black", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["There is a white sink underneath the faucet.", "The color is white.", "The sink is not the same color as the faucet. the sink is not black or clear."], "image": "val2014/COCO_val2014_000000243857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281711, "question_id": "NYmxSQZWtsGjasGRU2Yyfu", "question": "What is near the door?", "choices": ["baby", "pumpkin", "cat", "plant"], "correct_choice_idx": 3, "direct_answers": ["wall", "plant", "plant", "paper", "wall", "stop", "feref", "plant", "refrigerator", "plant"], "difficult_direct_answer": false, "rationales": ["There is a tall plant next to the door.", "There is a plant by the door.", "A hanging one to be precise."], "image": "val2014/COCO_val2014_000000281711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535049, "question_id": "Na9aKVCjpHM3Jah5Z8jwZg", "question": "Who would be most comfortable in the green seat?", "choices": ["adult", "teenager", "toddler", "baby"], "correct_choice_idx": 2, "direct_answers": ["child", "child", "parents", "comfort person", "children", "toddler", "toddler", "child", "child", "child"], "difficult_direct_answer": false, "rationales": ["This chair is small in design for a little person that is able to sit unassisted.", "The green seat would be too small for a teenager or an adult. it would be too big for a baby.", "The toddler would be comfy."], "image": "train2014/COCO_train2014_000000535049.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388177, "question_id": "NdTqBnWYhGhqUma3LD6wat", "question": "What color was this truck originally?", "choices": ["red", "green", "yellow", "blue"], "correct_choice_idx": 0, "direct_answers": ["red", "fss", "red", "green", "red", "red", "red", "red", "red", "red orange"], "difficult_direct_answer": false, "rationales": ["It's seen at the back of the cab where it wasn't repainted.", "It is not clear if the truck was originally red or green, but there are is a base color that has been painted over based on the rear of the cabin. it was likely answer a that was painted over based on the pattern of paint.", "Red was the color of origin because you see it under the green that has been used to repaint over"], "image": "train2014/COCO_train2014_000000388177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231466, "question_id": "NhGUkcnzw395RHwJ2ygfrp", "question": "What are these waterways equivalent in usage to in other cities and countries?", "choices": ["streams", "lakes", "tarmacs", "streets"], "correct_choice_idx": 3, "direct_answers": ["roads", "streets", "roads", "sea", "roads", "streets", "small", "car", "water", "canals"], "difficult_direct_answer": false, "rationales": ["They're streets.", "The waterways are like streets.", "People use boats to traverse around the buildings."], "image": "train2014/COCO_train2014_000000231466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468542, "question_id": "NiPAFbRSMa5aSrvbPnysCt", "question": "What direction are the cows headed?", "choices": ["south", "north", "east", "west"], "correct_choice_idx": 3, "direct_answers": ["west", "straight", "left", "left", "four", "west", "west", "west", "car", "left"], "difficult_direct_answer": false, "rationales": ["The cows are headed left which is often associated with west.", "The cows are going west.", "The direction is west."], "image": "train2014/COCO_train2014_000000468542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234994, "question_id": "NjwZxeJqwxJUmrXi7VP9ZG", "question": "Who is an all-time legend for one of these teams?", "choices": ["michael jordan", "leroy garrett", "todd helton", "tom brady"], "correct_choice_idx": 2, "direct_answers": ["maddux", "pete rose", "play", "todd helton", "jackie robinson", "babe ruth", "todd helton", "babe ruth", "griffey", "todd helton"], "difficult_direct_answer": false, "rationales": ["The legend is helton.", "Todd helton played baseball.", "Todd helton was an all time legend."], "image": "train2014/COCO_train2014_000000234994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502553, "question_id": "NjxB8K5fbNnjE62PYA3wUW", "question": "What is on the bed?", "choices": ["cat", "dog", "person", "elephant"], "correct_choice_idx": 2, "direct_answers": ["person", "person", "person", "old man", "man", "man", "person", "man", "old man", "old man"], "difficult_direct_answer": false, "rationales": ["Looks like someone found the bed he like because he's still on it.", "An older man with grey hair is laying on the bed", "He's testing a mattress in a store"], "image": "train2014/COCO_train2014_000000502553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7090, "question_id": "NkEw7EhrcsVaQxrqX9btxY", "question": "Where was this pizza purchased?", "choices": ["walmart", "super market", "home cooked", "restaurant"], "correct_choice_idx": 3, "direct_answers": ["restaurant", "diner", "ink", "cafeteria", "restaurant", "pizza hut", "pizza", "restaurant", "hotel", "cafeteria"], "difficult_direct_answer": false, "rationales": ["The pizza is from a restaurant.", "The food is on trays", "The tray where the pizza is on is that of an establishment that sells pizza."], "image": "train2014/COCO_train2014_000000007090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160815, "question_id": "NkSVQoF8zmzLg65x4pcxbj", "question": "Where are the majority of the arrows pointing?", "choices": ["down", "right", "up", "left"], "correct_choice_idx": 3, "direct_answers": ["left side", "left", "head", "left", "left", "left", "left", "left", "left", "left direction"], "difficult_direct_answer": false, "rationales": ["The signs go left.", "The front of the arrows are pointing towards the buildings.", "There are four arrows. three are pointing in the same direction."], "image": "train2014/COCO_train2014_000000160815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221869, "question_id": "NkYB8UAHxaSuP8JPm5Dpa2", "question": "What item in the room has multiple meanings?", "choices": ["reed", "vent", "cat", "shoe"], "correct_choice_idx": 1, "direct_answers": ["glass", "plant", "vent", "plant", "rug", "towels", "towel", "vent", "towel", "shower"], "difficult_direct_answer": false, "rationales": ["A vent used for many purposes.", "This is a noun and a verb", "The item is a vent."], "image": "val2014/COCO_val2014_000000221869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360665, "question_id": "NmhJFVkWVYQavNUDmbfYcg", "question": "What color is the cast iron component in the bridge above the grassy field?", "choices": ["red", "green", "rust", "blue"], "correct_choice_idx": 1, "direct_answers": ["green", "dwww", "bridge", "sheep", "green gray", "gray-green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene and a common color used on steel bridges.", "The bridge above the sheep is colored like a tree.", "The color is green."], "image": "train2014/COCO_train2014_000000360665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 209145, "question_id": "Nmyz8P7Rh8BdUMrbegeA2n", "question": "What is the white tube on top of the cabinet used for?", "choices": ["heating", "ventilation", "air conditioning", "water"], "correct_choice_idx": 1, "direct_answers": ["bowl", "car", "ventilation", "clean", "pool", "airing", "cleaning", "table", "ventilation", "ventilation"], "difficult_direct_answer": false, "rationales": ["The tube is for ventilation.", "The white tube is for ventilation.", "There is set of wood cabinets against the wall and in the middle a white pliable hose. it is used to vent out the stove to pull hot air out of."], "image": "val2014/COCO_val2014_000000209145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37675, "question_id": "NnHLeyEAab26Mt44sbQ6mm", "question": "What may get in the way of the horse's eating in this image?", "choices": ["building", "trees", "snow", "fence"], "correct_choice_idx": 2, "direct_answers": ["snow", "snow", "snow", "fence", "snow", "fence", "snow", "snow", "snow", "snow"], "difficult_direct_answer": false, "rationales": ["The snow is a barrier.", "The horse is eating in the normal manner, grazing on grass and would move over the grass eating as it went. there is a white substance covering the grass that would interfere with this process and is consistent with answer a.", "There is snow."], "image": "val2014/COCO_val2014_000000037675.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555308, "question_id": "Nno4yjAzk5kMLzZn75mFgq", "question": "How many planes are in this picture?", "choices": ["two", "one", "four", "five"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "many", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are 2.", "There are a couple of airplanes in the picture.", "There is one plane on the right and one plane on the leftside of the picture."], "image": "train2014/COCO_train2014_000000555308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229327, "question_id": "NqJpzHzaq83j76m4syVWbf", "question": "What color are the LCD lights on the motorcycle directly ahead to the left of the black motorcycle?", "choices": ["red", "green", "blue", "yellow"], "correct_choice_idx": 2, "direct_answers": ["pink", "blue", "blue", "blue", "blue", "blue", "blue white", "blue", "blue", "red blue"], "difficult_direct_answer": false, "rationales": ["The lights in question are locatable based on the text of the question and their color is identifiable.", "They are blue.", "A motorcycle with a blue light is parked next to a black motorcycle."], "image": "train2014/COCO_train2014_000000229327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212181, "question_id": "NrjbDzo6k4MkXS48Jmfhie", "question": "What type of energy does the stove use?", "choices": ["convection", "electricity", "microwave", "gas"], "correct_choice_idx": 3, "direct_answers": ["electric", "gas", "gas", "gas", "gas", "electric", "gas", "gas", "gas", "gas"], "difficult_direct_answer": false, "rationales": ["The energy is gas.", "An oven has grates and pilot lights on the stove. pilot lights are used for gas stoves.", "The energy is for gas."], "image": "train2014/COCO_train2014_000000212181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336511, "question_id": "NsbouANPHUeaJ5mnd3vTup", "question": "Where are the cats sleeping?", "choices": ["church steeples", "house garden", "office interior", "public park"], "correct_choice_idx": 3, "direct_answers": ["bench", "bench", "bench", "bench", "bench", "on chair", "benches", "benches", "public park", "bench"], "difficult_direct_answer": false, "rationales": ["There are many benches next to a grassy area", "These type of benches are usually seen in parks that are open for all.", "There are several benches and a walkway next to grass"], "image": "train2014/COCO_train2014_000000336511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571768, "question_id": "Nv87Fzfiespi9eSfh95vfP", "question": "What is the occupation of the person with the vest?", "choices": ["clown", "chef", "police", "firefighter"], "correct_choice_idx": 2, "direct_answers": ["police", "police", "dwdwd", "police", "police", "stop", "police", "police", "police", "security"], "difficult_direct_answer": false, "rationales": ["The person is a policeman because he is wearing a vest that has police written on it", "His occupation is written across the back of his vest.", "The profession is written on the person's vest in addition to the similarity of the uniform."], "image": "train2014/COCO_train2014_000000571768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276666, "question_id": "Nw2qdMJw7B9BsWu4gwRKKB", "question": "What is there a giant hole taken out of the tree for?", "choices": ["giraffes", "moose", "lumberjacks", "skunks"], "correct_choice_idx": 0, "direct_answers": ["eating", "giraffe", "girrafi", "protect", "relax", "giraffes", "food", "disease", "being ate", "scratching"], "difficult_direct_answer": true, "rationales": ["Giraffes are grazing near a tree.", "The giraffes have eaten off the tree.", "It is the giraffes habitat and they feed from the tree."], "image": "train2014/COCO_train2014_000000276666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425158, "question_id": "Nw3gNW77kaQnwRw5UNUFpN", "question": "How many lights are shining bright on the side of the campus street?", "choices": ["six", "one", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "two", "two", "two lights", "two", "three", "one", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "There are three bright street lights shining on the campus street.", "One light is in between two other lights."], "image": "train2014/COCO_train2014_000000425158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248789, "question_id": "NwHH2fLXu4Qoefo7QDTiJ3", "question": "What is the man in the foreground balancing with?", "choices": ["ski poles", "bannister", "rope", "hook"], "correct_choice_idx": 0, "direct_answers": ["ski poles", "ski poles", "ski poles", "snowboard", "poles", "ski", "ski poles", "skiis", "ski poles", "poles"], "difficult_direct_answer": false, "rationales": ["He is holding two long metal sticks that help him while he skis", "He's using this to help stay upright", "They are typical of this sport. he is on snow which the object grips."], "image": "train2014/COCO_train2014_000000248789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144495, "question_id": "P2aExKvAYWu43UEEeN6wcA", "question": "What is the banana cut into on the plate?", "choices": ["halves", "fifths", "fourths", "thirds"], "correct_choice_idx": 0, "direct_answers": ["proper feed", "halves", "two pieces", "dwdw", "half", "half", "halves", "half", "cavendish banana", "two"], "difficult_direct_answer": false, "rationales": ["Two pieces of a banana are on a plate.", "The banana is in halves.", "The banana is split equally into two."], "image": "train2014/COCO_train2014_000000144495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277749, "question_id": "P39LoUUohzuhgH2zikN2DM", "question": "What does the building do?", "choices": ["walk", "sing", "spin", "generate heat"], "correct_choice_idx": 2, "direct_answers": ["spin", "spin", "mill", "generate power", "windmill", "spin", "electricity", "wind mill", "wind air", "mill grain"], "difficult_direct_answer": false, "rationales": ["It can generate power or grind grain to make flour.", "A building with a large windmill is in a grassy area.", "This is a windmill and the propellers harness the power of the wind by turning to generate power."], "image": "val2014/COCO_val2014_000000277749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233613, "question_id": "P3av5RpXZ2skXgjy7mvXn7", "question": "What color is the car seat that the cat is sleeping on?", "choices": ["brown", "purple", "red", "blue"], "correct_choice_idx": 1, "direct_answers": ["purple gray", "gray", "navy blue", "blue", "purple", "blue", "purple", "grey", "purple", "blue"], "difficult_direct_answer": false, "rationales": ["The color is purple.", "The seat of the car is purple and the cat is on it.", "The car seat is purple."], "image": "train2014/COCO_train2014_000000233613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245503, "question_id": "P4Zhi7fiMBsUURAcF4z5xy", "question": "How many wheels are used on the bottom of this aircraft?", "choices": ["eight", "twelve", "three", "six"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are two wheels on the front, and one wheel on the back of the aircraft.", "There are two wheels in front and one in the back.", "There is only three wheels that can be seen in the picture."], "image": "train2014/COCO_train2014_000000245503.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400832, "question_id": "P5gmv8GQufcGKGa9HrFrvf", "question": "What is the name of the long skirt the man is wearing?", "choices": ["lungi", "kilt", "sols", "drop"], "correct_choice_idx": 0, "direct_answers": ["lobby", "dhoti", "lungi", "kilt", "sari", "lungi", "caftan", "dress", "lungi", "skirt"], "difficult_direct_answer": false, "rationales": ["That is the name of the long skirt.", "Skirts worn by men.", "She has on a maxi skirt."], "image": "train2014/COCO_train2014_000000400832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 39509, "question_id": "P5qF895BDi9GeeMwRYp27X", "question": "What is the name of the occupation that is suppose to keep you safe at this place?", "choices": ["hospital", "lifeguard", "ambulance", "police officer"], "correct_choice_idx": 1, "direct_answers": ["lifeguard", "lifeguard", "beach", "lifeguard", "sleep", "stop", "lifeguard", "vdfvdf", "lifeguard", "lifeguard"], "difficult_direct_answer": false, "rationales": ["People are sitting on the beach.", "The name is a lifeguard.", "People are in the sand at the edge of water."], "image": "train2014/COCO_train2014_000000039509.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363318, "question_id": "P6fwgXv7tviKH2sTx2iApJ", "question": "What is in the middle of the court?", "choices": ["bailiff", "volleyball", "net", "basketball"], "correct_choice_idx": 2, "direct_answers": ["net", "purple", "net", "ball", "girl", "net", "tennis net", "net", "person", "net"], "difficult_direct_answer": false, "rationales": ["This is part of a tennis court and is made of strings", "The rest don't apply to this game of tennis.", "On a tennis court there is a net in the middle"], "image": "train2014/COCO_train2014_000000363318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293366, "question_id": "P72sekkCQKubHz8or5SUBs", "question": "There are how many birds sitting on stuff in the canal?", "choices": ["four", "two", "five", "three"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "five", "four", "five", "four", "five", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are two birds on the left. two additional birds are on the right.", "There are 4 birds.", "Simple counting and you can get the answer."], "image": "train2014/COCO_train2014_000000293366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258854, "question_id": "P8HKqfXjqa6vmKXSmdZDfZ", "question": "What is under the umbrella?", "choices": ["old man", "chairs", "babies", "pumpkin"], "correct_choice_idx": 1, "direct_answers": ["beach chairs", "two chairs", "chairs", "chair", "chairs", "chairs", "beach chairs", "chairs", "chairs", "chair"], "difficult_direct_answer": false, "rationales": ["There is a white beach with two folding chairs under a yellow umbrella by the water.", "These are seats that are low to the ground so people can lounge back and enjoy the sun", "The picture is at a beach. sometimes people sit on beach chairs at the beach."], "image": "train2014/COCO_train2014_000000258854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503011, "question_id": "P9w8yN3jHHRf6dgFHFVa9u", "question": "What did the woman crouch down to do?", "choices": ["tie shoe", "clean sidewalk", "pet cat", "sit down"], "correct_choice_idx": 2, "direct_answers": ["cat", "petting cat", "cat", "protect", "pet", "pet cat", "touch cat", "pet cat", "pet cat", "pet cat"], "difficult_direct_answer": false, "rationales": ["The woman is touching the animal.", "There is a black animal on the ground. she is about to touch it.", "The woman is getting ready to pet a small cat."], "image": "train2014/COCO_train2014_000000503011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395214, "question_id": "P9zAWLoMP5vPTcs5qonMoM", "question": "What kind of trick is being performed here?", "choices": ["ollie", "manual", "nollie", "flip trick"], "correct_choice_idx": 3, "direct_answers": ["skating", "kickflip", "kickflip", "jumping", "ollie", "flip trick", "flip", "skateboard trick", "flip trick", "jump"], "difficult_direct_answer": false, "rationales": ["The skateboarder is making their skateboard turn completely around.", "The trick is a flip trick.", "The trick is a flip."], "image": "train2014/COCO_train2014_000000395214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287105, "question_id": "PAZ7fj8CNXRj4EgTFrS3VT", "question": "What is the giraffe standing near?", "choices": ["door", "wooden crate", "toilet", "apple tree"], "correct_choice_idx": 0, "direct_answers": ["door", "door", "door", "door", "door", "door", "dwdwdw", "doorway", "door", "door"], "difficult_direct_answer": false, "rationales": ["The giraffe is by a door.", "You can see them open where he is and closed on the other side", "The giraffe is standing in a doorway."], "image": "train2014/COCO_train2014_000000287105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227742, "question_id": "PBzn5ZWkZtnE7ECCu9vSzL", "question": "What color is the vest worn around the skier's jacket?", "choices": ["olive", "black", "orange", "navy"], "correct_choice_idx": 0, "direct_answers": ["grey", "brown", "grey", "green", "olive", "brown", "beige/green", "green", "white", "grey"], "difficult_direct_answer": false, "rationales": ["The vest is not black, navy, or orange.", "The color is olive.", "It's a green color"], "image": "val2014/COCO_val2014_000000227742.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325936, "question_id": "PC8k6SHy2RwP8q4QTqHiuh", "question": "How many electronic devices does this person likely own?", "choices": ["99", "one", "three", "five"], "correct_choice_idx": 3, "direct_answers": ["television", "six", "six", "six", "six", "six", "four", "television", "six", "five"], "difficult_direct_answer": false, "rationales": ["There are this many remotes on the table", "The person has five devices.", "There are that number of remotes. often each device has its own remote so you can count the remotes to determine the number of devices."], "image": "train2014/COCO_train2014_000000325936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451489, "question_id": "PDFBSrRfZGikFYAjDHBS6N", "question": "Who is dunking the ball?", "choices": ["old man", "elephant", "woman", "toddler"], "correct_choice_idx": 1, "direct_answers": ["play", "elephant", "elephant", "elephant", "dwdwd", "elephant", "elephant", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The ball is in its trunk as it places it in the net", "Its holding the ball with its trunk", "The other three options aren't not in the photo holding or doing anything with a ball."], "image": "train2014/COCO_train2014_000000451489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 34454, "question_id": "PEwS2qsJgTdA6pJTh7WnVx", "question": "What would you call the man with the dog?", "choices": ["dancer", "skier", "skater", "surfer"], "correct_choice_idx": 3, "direct_answers": ["surfer", "lover", "hug", "surfer", "friend", "surfer", "surfer", "surfer", "beach bum", "surfer"], "difficult_direct_answer": false, "rationales": ["The man is on a board meant for riding ocean waves, and is wearing beachwear.", "He's laying on a surfboard resting", "The man with the dog is laying on top of his surfboard that he uses to ride waves."], "image": "train2014/COCO_train2014_000000034454.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517370, "question_id": "PGNgUV77nG2GgQokvbppvz", "question": "What is the man doing?", "choices": ["traveling", "eating", "working", "relaxing"], "correct_choice_idx": 0, "direct_answers": ["waiting", "waiting", "traveling", "staring", "pictures", "waiting", "waiting", "standing", "standing", "taking photo"], "difficult_direct_answer": false, "rationales": ["A man is standing in very dense fog by the ocean. he has some luggage parked next to him as he views the water.", "This is indicated by the suitcases.", "The man has a suitcase."], "image": "train2014/COCO_train2014_000000517370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216726, "question_id": "PGuHRWMKGsagkoKoQDGng2", "question": "What would most likely be found here?", "choices": ["king cake", "salmon", "horse", "tires"], "correct_choice_idx": 0, "direct_answers": ["cakes", "cakes", "cakes", "bakery", "birthday", "cake", "cookies", "cake", "dessert", "king cake"], "difficult_direct_answer": false, "rationales": ["There are several different varieties of this type of dessert.", "The cake would be found here.", "The cake is found."], "image": "val2014/COCO_val2014_000000216726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575949, "question_id": "PGy2HBSjYbfvSv3wMU9CjC", "question": "Why are the people marching down the street?", "choices": ["demonstration", "parade", "protest", "riot"], "correct_choice_idx": 1, "direct_answers": ["parade", "parade", "kite couples", "with kites", "fly kites", "kiting festival", "raise kites", "kite", "rally", "kites"], "difficult_direct_answer": true, "rationales": ["The people are parading.", "People hold things and march together for parades.", "There's no evidence of it being for the other reasons. also the people are very happy, which can indicate a."], "image": "train2014/COCO_train2014_000000575949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302029, "question_id": "PH37W56VLm6stQ6sWTwj6i", "question": "What color is the umbrella strapped onto the bicycle frame's center bar?", "choices": ["yellow", "green", "red", "blue"], "correct_choice_idx": 2, "direct_answers": ["red", "orange", "red", "red", "orange", "red", "red", "red", "orange", "red"], "difficult_direct_answer": false, "rationales": ["The umbrella is not the same color as the blue bicycle frame. the umbrella is not green or yellow.", "The umbrella is clearly visible based on the location given in the question and its design. the color is clearly visible and identifiable.", "The umbrella is not blue, green, or yellow."], "image": "train2014/COCO_train2014_000000302029.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458281, "question_id": "PHF8MJBoLQUJMDicNtGzjy", "question": "What makes it difficult to see the people in this image?", "choices": ["trees", "sunset", "lights", "stores"], "correct_choice_idx": 1, "direct_answers": ["sun", "sun", "sun shadows", "different colours", "sun", "sun glare", "sunset", "sunlight", "shadows", "light"], "difficult_direct_answer": false, "rationales": ["The sunset would make it hard to see.", "The sun is setting.", "The sun setting makes different shades on the objects."], "image": "train2014/COCO_train2014_000000458281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3823, "question_id": "PHyWMYZFSFkT9FozUae5mc", "question": "What is in the center of the room?", "choices": ["cat", "elephant", "dog", "laptop"], "correct_choice_idx": 3, "direct_answers": ["computer", "desk", "laptop", "car", "computer", "computer", "table", "computer desk", "monitor", "laptop"], "difficult_direct_answer": false, "rationales": ["The center has the laptop.", "There is a laptop in the center of the room.", "A machine that has a screen with computer programs on it is visible. you can see the line where it can be folded which is typical of a laptop."], "image": "train2014/COCO_train2014_000000003823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 9919, "question_id": "PJwzkoZJFKQq6wKHAbdux9", "question": "What kind of meat is served on the very top of the plate?", "choices": ["chicken", "salmon", "beef", "pork"], "correct_choice_idx": 3, "direct_answers": ["beef", "meat", "pork", "steak", "pork", "beef", "beef", "pork", "beef", "chicken"], "difficult_direct_answer": false, "rationales": ["There is a pork chop on the top of the plate.", "The meat is pork.", "This is an a chop that's extremely well done."], "image": "train2014/COCO_train2014_000000009919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203975, "question_id": "PK3RxMW4SYcq2uiqED2KKd", "question": "Why is there a dish drainer on the counter?", "choices": ["ambiance", "cooking utensil", "vegetable storage", "no dishwasher"], "correct_choice_idx": 3, "direct_answers": ["dry", "drying", "washing dishes", "no dishwasher", "dry dishes", "drying", "dry dishes", "save", "drain dishes", "dry dishes"], "difficult_direct_answer": false, "rationales": ["There is no machine to wash dishes so they wash by hand using the dish drainer.", "There is no machine visible to wash the dishes. if a person had a dishwaster, then you would not wash them by hand and put them to dry.", "There is no dishwasher."], "image": "train2014/COCO_train2014_000000203975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37800, "question_id": "PKfhziBGsNdQXxki6opoow", "question": "What color is the vase in the middle of the table surrounded by pizzas?", "choices": ["black", "red", "clear", "white"], "correct_choice_idx": 2, "direct_answers": ["clear white", "clear", "orange", "clear", "clear", "clear", "clear", "clear", "clear", "clear glass"], "difficult_direct_answer": false, "rationales": ["There is clear water in a glass pitcher which glass is see thru and able to see what is inside it.", "The color is clear.", "You can see through it"], "image": "train2014/COCO_train2014_000000037800.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196691, "question_id": "PMo6RGVTLCZGRyDAENWN8a", "question": "What brand shoes resembles the shoes of the man looking down on his phone?", "choices": ["nike", "puma", "adidas", "under armor"], "correct_choice_idx": 2, "direct_answers": ["adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "adidas", "nike", "adidas"], "difficult_direct_answer": false, "rationales": ["This company is known for having a logo with the three color blocked white rectangles.", "The shoes are adidas.", "The brand uses three stripes."], "image": "train2014/COCO_train2014_000000196691.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188918, "question_id": "PPGfQCsVHNEz7t6xhTbDMf", "question": "What is the vehicle doing?", "choices": ["flying", "rocketing upward", "crossing", "submerging"], "correct_choice_idx": 2, "direct_answers": ["riding", "moving", "crossing", "crossing road", "moving", "driving", "running", "crossing", "driving", "moving"], "difficult_direct_answer": false, "rationales": ["The train is trying to cross the tracks.", "The train is crossing the road.", "The lights indicate the train is coming"], "image": "val2014/COCO_val2014_000000188918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242677, "question_id": "PRC9wBaBykW4XnsW598u7M", "question": "What is the woman holding?", "choices": ["pumpkin pie", "apple", "dog leash", "pizza box"], "correct_choice_idx": 2, "direct_answers": ["leash", "dog leash", "leash", "leash", "leash", "leash", "leash", "leash", "dog leash", "dog leash"], "difficult_direct_answer": false, "rationales": ["The woman is walking with a dog.", "The woman is holding the strap that is hooked to her dog's collar so that she can take the dog for a walk.", "The woman has a leash."], "image": "train2014/COCO_train2014_000000242677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492572, "question_id": "PSE5RswAks9LGN79GAest7", "question": "What color are the ear rings worn by the bulls in this field?", "choices": ["blue", "purple", "yellow", "green"], "correct_choice_idx": 2, "direct_answers": ["gold", "tan", "white", "yellow", "orange", "yellow", "brown", "brown", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The cows have yellow ear rings.", "The color is yellow.", "The color is yellow."], "image": "train2014/COCO_train2014_000000492572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546695, "question_id": "PUV4EDYvc5cwW254dXvfkU", "question": "What wrestler's outfit matches the colors of the cake?", "choices": ["macho man", "miz", "bret hart", "hulk hogan"], "correct_choice_idx": 2, "direct_answers": ["tha goodies", "kane", "pink", "blackpink", "black", "bret hart", "pink", "kane", "hart", "belt"], "difficult_direct_answer": false, "rationales": ["Bret hart has a costume that is pink and black which are the colors of the cake", "Bret hart's costume color was pink and black", "Their wrestling outfit was pink and black."], "image": "train2014/COCO_train2014_000000546695.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368659, "question_id": "PUVL8PJctMoGpzTKLvefFh", "question": "What is the main color of the Chinese vase on the center right?", "choices": ["red", "blue", "green", "yellow"], "correct_choice_idx": 1, "direct_answers": ["blue", "brown", "blue", "blue", "rose", "blue", "blue", "light pink", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The colours of the vase are blue and white.", "It is blue with white.", "The main color is blue."], "image": "train2014/COCO_train2014_000000368659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523357, "question_id": "PUt9Ed7jp3Cdiwz3JYpPZK", "question": "What color are the edges of the sidecar with a baby pug in it?", "choices": ["green", "red", "yellow", "blue"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "orange", "black", "orange", "yellow", "yellow", "orange", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "The edges of the sidecar are yellow.", "The edges are not blue, green, or red."], "image": "train2014/COCO_train2014_000000523357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206624, "question_id": "PVSiHX2Afw6i9e4ndE7AjR", "question": "The animals are at what location?", "choices": ["farm", "factory", "petting zoo", "baseball stadium"], "correct_choice_idx": 0, "direct_answers": ["farm", "farm", "farmstead", "barn yard", "ranch", "farm", "hourse", "farm", "farm", "farm"], "difficult_direct_answer": false, "rationales": ["This looks to be at a farm.", "Horses are in a pasture with a red building with white trim and large barn doors behind them. farms have barns and barns are traditionally red.", "They are near a barn"], "image": "train2014/COCO_train2014_000000206624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415594, "question_id": "PVbC96UBjC8gzzgHBcBDSf", "question": "What is the utensil the woman is using called?", "choices": ["whisk", "spatula", "strainer", "skimmer"], "correct_choice_idx": 1, "direct_answers": ["spatula", "pot holder", "tong", "spatula", "spatula", "spatula", "spatula", "spatula", "spatula", "dwdwd"], "difficult_direct_answer": false, "rationales": ["This has a large flat area to pick up food as it's cooking", "It's a utensil to scoop stuff that is flat out.", "Based on the size, shape and design and how the woman is using the tool answer a is consistent."], "image": "train2014/COCO_train2014_000000415594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364123, "question_id": "PVg55o4TEVJhQMLadzWeBH", "question": "What is the person on the left wearing?", "choices": ["tie", "cat ears", "suspenders", "crown"], "correct_choice_idx": 0, "direct_answers": ["skirt", "pants", "suit", "suit", "suit", "tie", "suit", "suit", "suit", "tie"], "difficult_direct_answer": false, "rationales": ["The person has a tie.", "The person on the left is wearing a light blue neck tie.", "It's unknown if he's also wearing c and the other options aren't in the scene."], "image": "train2014/COCO_train2014_000000364123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98636, "question_id": "PWYTh87nNEPAHd3jAgntqU", "question": "What kind of surface is the bird standing on?", "choices": ["metal", "glass", "brick", "concrete"], "correct_choice_idx": 2, "direct_answers": ["floor", "dirt", "dirt", "dirt", "brick", "dirt", "brick", "concrete dirt", "efeee", "sand"], "difficult_direct_answer": false, "rationales": ["There are hardened clay squares on the ground", "The surface is made of bricks.", "The bird is standing on top of something made of brick."], "image": "val2014/COCO_val2014_000000098636.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 535748, "question_id": "PXioTkh4szZayi5d8ZhgwX", "question": "How many color varieties are there for the cupcakes on the cupcake pagoda?", "choices": ["one", "two", "five", "three"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "four", "three", "four", "four", "dw"], "difficult_direct_answer": false, "rationales": ["I see five different colors.", "There are three varieties.", "Yellow, pink and green are available."], "image": "val2014/COCO_val2014_000000535748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329433, "question_id": "PYpnGoExyMSDeUe7MhdWvb", "question": "What color is the breaded chicken served with a side of celery and ranch?", "choices": ["yellow", "orange", "brown", "red"], "correct_choice_idx": 1, "direct_answers": ["orange", "red", "orange", "orange", "brown", "yellow", "red", "red", "orange", "white"], "difficult_direct_answer": false, "rationales": ["The color is the orange color that looks like a yellow color.", "The color is an orange.", "The color is orange."], "image": "val2014/COCO_val2014_000000329433.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339358, "question_id": "PZx9o6YbzD7t2WrmrHUYB2", "question": "The man is skateboarding along a railing of what color?", "choices": ["orange", "yellow", "red", "blue"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "orange", "not i'snt", "yellow", "orange"], "difficult_direct_answer": false, "rationales": ["A man is skating on a stack of orange rectangular holes. the man is wearing black shorts and white shirt.", "Orange is the object color that the man is skateboarding on", "It is the same color as a carrot."], "image": "train2014/COCO_train2014_000000339358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574184, "question_id": "PaWwDnUYMMwBfGVtPNZAcF", "question": "Who is the manufacturer of the laptop?", "choices": ["apple", "sony", "toshiba", "hp"], "correct_choice_idx": 3, "direct_answers": ["hp", "hp", "hp", "hp", "hp", "hp", "hp", "hp", "owner", "hp"], "difficult_direct_answer": false, "rationales": ["The manufacturer's name is just below the screen.", "Under the screen shows the logo for the manufacturer.", "The computer is from hp."], "image": "val2014/COCO_val2014_000000574184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367362, "question_id": "PazRb4fJDZQTMEjbGdTEDd", "question": "What is the elephant baby called?", "choices": ["pup", "colt", "stag", "calf"], "correct_choice_idx": 3, "direct_answers": ["stop", "watching", "car", "baby", "calf", "calf", "mom m", "calf", "calf", "calf"], "difficult_direct_answer": false, "rationales": ["A baby elephant is known to be referred to as answer a.", "The other options apply to horses, dogs and deer, respectively.", "This is the correct answer per google."], "image": "val2014/COCO_val2014_000000367362.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134194, "question_id": "PcZS9boUFixkJ9yGP2btv8", "question": "What animals are near the zebras?", "choices": ["giraffes", "cows", "cats", "dogs"], "correct_choice_idx": 0, "direct_answers": ["giraffes", "giraffes", "giraffes", "giraffes", "giraffe", "giraffes", "giraffes", "giraffes", "giraffes", "giraffes"], "difficult_direct_answer": false, "rationales": ["The animals look like horses. they have black and white stripes.", "Giraffes are long-necked, spotted animals who share the appearance of the animals in the picture. they are also native to the same parts of the world as zebras.", "Giraffes are long necked spotted animals."], "image": "val2014/COCO_val2014_000000134194.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342146, "question_id": "PcuxtqSq99DA4LFxm5vh9T", "question": "The person in the blue jeans looks like they are wearing what?", "choices": ["sombrero", "lab coat", "bandana", "gas mask"], "correct_choice_idx": 1, "direct_answers": ["lab coat", "up roan", "doctor's coat", "man", "lab coat", "hat", "jacket", "coat", "lab coat", "lab coat"], "difficult_direct_answer": false, "rationales": ["The person is wearing a lab coat.", "The article of clothing is white and long sleeved. it has a texture for chemicals to spill off of them.", "The person has a coat."], "image": "val2014/COCO_val2014_000000342146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330408, "question_id": "PdJ5yg5TDg3RN2QgHmUsfw", "question": "Which vehicle is leading the ones on the left side?", "choices": ["airplane", "tank", "bus", "motorcycle"], "correct_choice_idx": 2, "direct_answers": ["bus", "bus", "bus", "bus", "bus", "bus", "bus", "white bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["There is a big passenger bus leading on the left side.", "A bus is in the front of the cars", "The bus is first in a line of cars."], "image": "val2014/COCO_val2014_000000330408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443278, "question_id": "PdP5rVDKn5pRaQA2zyX99K", "question": "What feature do these animals have?", "choices": ["pouches", "gills", "wings", "hooves"], "correct_choice_idx": 3, "direct_answers": ["stripes", "hooves", "stripes", "stripes", "stripes", "stripes", "stripes", "stripes", "stripes", "stripe colors"], "difficult_direct_answer": false, "rationales": ["These animals have hooves on their feet because they are zebras.", "There is a couple of inches of hard dark material at the base of their legs.", "The animals have hooves."], "image": "val2014/COCO_val2014_000000443278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359563, "question_id": "PfNaV8YdXWax8boyimiSvQ", "question": "What is the pink item on the counter?", "choices": ["scissor handle", "spoon", "napkin", "fork"], "correct_choice_idx": 0, "direct_answers": ["scissors", "scissors", "scissors", "scissors", "scissors", "soap", "scissors", "scissor handle", "sponge", "scissor"], "difficult_direct_answer": false, "rationales": ["The pink item attaches to two cutting surfaces. spoons, forks, and napkins are not used to cut other things.", "There is a pair of shears with plastic pink grips on a kitchen counter.", "The pink is the handle to a pair of scissors."], "image": "val2014/COCO_val2014_000000359563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196294, "question_id": "PgXzTJhVrfczmuRYkoNe6n", "question": "What group usually uses this mode of transport?", "choices": ["amish", "paratroopers", "army rangers", "pilots"], "correct_choice_idx": 0, "direct_answers": ["tourists", "transport", "mennonites", "royalty", "tourists", "amish", "amish", "amish", "amish", "tourists"], "difficult_direct_answer": false, "rationales": ["The amish don't use cars or technology.", "Here we see horses reined into buggies. this old fashioned form of transportation is acceptable to the amish who eschew modern comforts.", "This community doesn't use electricity."], "image": "train2014/COCO_train2014_000000196294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502422, "question_id": "PgbkpWEDJtD3uffpLA4iPj", "question": "What is the number of shampoo or soap bottles along the shower wall?", "choices": ["two", "six", "three", "five"], "correct_choice_idx": 3, "direct_answers": ["five", "five", "five", "five", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["They can be counted.", "There are 5.", "There are 5 bottles lined up on the ledge."], "image": "train2014/COCO_train2014_000000502422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486081, "question_id": "PgqXJiJJa599QG7b3rq75B", "question": "What object is the man imitating with his fingers?", "choices": ["phone", "flashlight", "gun", "sword"], "correct_choice_idx": 2, "direct_answers": ["gun", "gun", "gun", "pointing", "man", "cigarette", "gun", "gun", "camera", "gun"], "difficult_direct_answer": false, "rationales": ["The object is a gun.", "A man is holding his hand out turned to the side and is pointing one finger forward with the rest bent. people sometimes imitate a gun by pointing a finger as if it was the barrel of a gun.", "The object is a gun."], "image": "train2014/COCO_train2014_000000486081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65836, "question_id": "Pi9uLhBgFkqbLzQWjeemZX", "question": "What kind of fruit is contained by the small plastic tub?", "choices": ["blueberry", "banana", "raspberry", "blackberry"], "correct_choice_idx": 2, "direct_answers": ["strawberries", "strawberry", "strawberry", "strawberry", "strawberry", "strawberries", "raspberry", "berries", "strawberries", "strawberries"], "difficult_direct_answer": false, "rationales": ["These are strawberries in the container.", "They are identifiable by their round shape and small visible seeds and leafy green tops.", "These are strawberries in the container."], "image": "train2014/COCO_train2014_000000065836.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116824, "question_id": "PiMuXjkC94foWuuyzNc2ZZ", "question": "What color are the sheep's faces with green tags in their ears?", "choices": ["gray", "white", "brown", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "black", "black", "black", "black", "black", "black", "stop"], "difficult_direct_answer": false, "rationales": ["Sheep are in a pen with green markers on them and black faces.", "The faces are black.", "The color is black."], "image": "train2014/COCO_train2014_000000116824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307243, "question_id": "PiyCnEMuxoe3iRHA3ZsMnT", "question": "What color is the sun in the raised dais of the Italian building?", "choices": ["red", "purple", "black", "gold"], "correct_choice_idx": 3, "direct_answers": ["gold", "yellow", "building", "gold", "yellow", "gold", "yellow", "gold", "gold", "gold"], "difficult_direct_answer": false, "rationales": ["The color is gold.", "Sun dial's are often gold.", "There is only one clock and it is gold."], "image": "val2014/COCO_val2014_000000307243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459794, "question_id": "Pj2y8vKCZUQXVcYq2Rx5QU", "question": "What color is the small fireplace set in the middle of the room with all the books?", "choices": ["black", "green", "brown", "red"], "correct_choice_idx": 0, "direct_answers": ["black", "black", "red", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The color is black.", "This is a cast iron stove", "The fireplace is not red, green, or brown."], "image": "train2014/COCO_train2014_000000459794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365217, "question_id": "PjcPUiJjxyjudTAZXsWhYa", "question": "What type of tennis swing is the main on the bottom of the court in the middle of?", "choices": ["serve", "backhand", "drop shot", "forearm"], "correct_choice_idx": 0, "direct_answers": ["overhead", "forward", "balckrock", "serve", "serve", "serving", "ball", "overhand", "overhand", "middle"], "difficult_direct_answer": false, "rationales": ["The tennis swing is a serve.", "The way his arm is up in the air and his feet are off the ground is part of the stance to get the momentum for serving.", "This is obvious given the player's position."], "image": "val2014/COCO_val2014_000000365217.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61730, "question_id": "Pm2DU8hJbwBv4AFc27AdTQ", "question": "What kind of fuel does the black and white dog run on?", "choices": ["food", "coal", "fire", "gas"], "correct_choice_idx": 0, "direct_answers": ["food", "water", "white", "food", "food", "food", "adrenaline", "diesel", "food", "fuel"], "difficult_direct_answer": false, "rationales": ["The fuel is food.", "The dog runs on food.", "There is a large dalmatian dog standing in front of a man and his motorcycle. it has to eat dog food to survive."], "image": "train2014/COCO_train2014_000000061730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170976, "question_id": "Pmkm5ffJAqSGLwHnPLYbmL", "question": "Where are the yellow items hanging under the cabinet usually found?", "choices": ["jungle", "museum", "tundra", "church"], "correct_choice_idx": 0, "direct_answers": ["counter", "trees", "jungle", "trees", "jungle", "tree", "jungles", "tree", "bananas", "tree"], "difficult_direct_answer": false, "rationales": ["Bananas are hanging in a kitchen.", "Those bananas are found in warm climents.", "The yellow items are bananas. they cannot grow in the tundra and are not usually found in churches or museums."], "image": "train2014/COCO_train2014_000000170976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464440, "question_id": "PnDyTaMvWRW95rncL5ozqg", "question": "What color might the blocks on the side of the clock tower be?", "choices": ["green", "brown", "blue", "white"], "correct_choice_idx": 1, "direct_answers": ["red", "bsll", "white", "brown", "red", "red brick", "green", "brown", "red", "brown"], "difficult_direct_answer": false, "rationales": ["The texture of the clocktower in this picture could be called brownstone.", "This is the color of the rest of the bricks in the tower.", "The clock tower is brown."], "image": "train2014/COCO_train2014_000000464440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54088, "question_id": "PokcvFgQpgtyPhrS3iZ6VJ", "question": "How is the car on the back being propelled?", "choices": ["oil", "gas", "coal engine", "towed"], "correct_choice_idx": 3, "direct_answers": ["dwww", "tow truck", "tow", "red", "forward", "pulled", "towed", "towed", "tow truck", "tow truck"], "difficult_direct_answer": false, "rationales": ["The car is lifted up and connected to a truck with a crane.", "A large white truck is pulling a car that has its front wheels hooked to back of it by a lift.", "The car is being towed."], "image": "val2014/COCO_val2014_000000054088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145061, "question_id": "Ppn5xHZdPZ3UQNaWnD8P5c", "question": "What is the car next to?", "choices": ["elephant", "airplane", "bus", "giraffe"], "correct_choice_idx": 2, "direct_answers": ["bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus", "bus"], "difficult_direct_answer": false, "rationales": ["It is a large vehicle for moving many people at a time.", "The car is running along side a large bus. the car appears small.", "It is a large and long vehicle with many windows and seats to hold a lot of people"], "image": "val2014/COCO_val2014_000000145061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245313, "question_id": "Ppyxtifj5T7VRJDkkLjoBU", "question": "What is the tallest person wearing?", "choices": ["backpack", "suspenders", "sunglasses", "crown"], "correct_choice_idx": 2, "direct_answers": ["sunglasses", "brown coat", "coat", "sunglasses", "sunglasses", "man", "fefefe", "sunglasses", "jacket", "sunglasses"], "difficult_direct_answer": false, "rationales": ["A bald headed man is standing with skis. he has sunglasses on.", "The tallest person is wearing sunglasses.", "He has coverings on his eyes"], "image": "val2014/COCO_val2014_000000245313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271633, "question_id": "PrS5NAiZkGB4JpgNtcZyTK", "question": "Where are the people resting on furniture at?", "choices": ["park", "house", "hotel", "train depot"], "correct_choice_idx": 3, "direct_answers": ["on chair", "train depot", "railroad", "railway", "bench", "outside", "drinking", "train station", "train station", "station"], "difficult_direct_answer": true, "rationales": ["The furniture is alongside the tracks and the locomotives can also been seen in the picture.", "The people resting on furniture are near a train depot by the tracks.", "It is outdoors near tracks."], "image": "train2014/COCO_train2014_000000271633.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555847, "question_id": "PrmRUskPuJJKg4uQdSDkn2", "question": "What are the two long poles?", "choices": ["oar handles", "paint poles", "fishing poles", "pool cue"], "correct_choice_idx": 0, "direct_answers": ["fishing", "oars", "oars", "oars", "paddles", "fishing poles", "oars", "oar handles", "dwdw", "oars"], "difficult_direct_answer": false, "rationales": ["The boat depicted is a rowboat that would be propelled by answer and their shape and size is consistent.", "There is a boat floating on water. the poles are used to power and steer the boat.", "The long poles are in the boat for steering."], "image": "val2014/COCO_val2014_000000555847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387136, "question_id": "PrqFHfMrZtPwRciNdfHv9N", "question": "The bumper of the wagon is what color?", "choices": ["brown", "yellow", "blue", "red"], "correct_choice_idx": 0, "direct_answers": ["red", "black", "brown", "purple", "red", "red", "red", "red", "magenta", "brown"], "difficult_direct_answer": false, "rationales": ["The wagon has a brown bumper.", "The bumper is a shade of brown.", "It's more of a reddish a."], "image": "val2014/COCO_val2014_000000387136.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158787, "question_id": "PsHTqsJfWmtFmCnrDW6Su6", "question": "What are the two women in the foreground carrying?", "choices": ["pumpkins", "purses", "eggs", "cats"], "correct_choice_idx": 1, "direct_answers": ["black", "purses", "purses", "purse", "hand bag", "purses", "bag", "purses", "purses", "purses"], "difficult_direct_answer": false, "rationales": ["The women have purses.", "Each woman holds a bag by her side attached to the shoulder with a strap.", "Two females with large bags over their shoulders are walking on the sidewalk."], "image": "train2014/COCO_train2014_000000158787.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69117, "question_id": "PtpaMGW6D3Qkekrm8NtrMB", "question": "What actress's age on January 19 of 2022 will match the number on the right side of the car?", "choices": ["tippi hedren", "dolly parton", "jodie sweetin", "shawn johnson"], "correct_choice_idx": 0, "direct_answers": ["dolly parton", "thirty", "92", "car", "tippi", "tippi hedren", "barbara walters", "miley", "stop", "tippi hedren"], "difficult_direct_answer": true, "rationales": ["The actress is tippi.", "Tippi's age would match the car.", "Tippi is 92 years old. this is an older style name and dolly and shawn are much younger than 92."], "image": "train2014/COCO_train2014_000000069117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266366, "question_id": "PwbCh6QAx5ptfLAy3DFyDZ", "question": "What type of sink is this?", "choices": ["dropin", "vessel sink", "kitchen sink", "separated sink"], "correct_choice_idx": 1, "direct_answers": ["basin", "bowl", "round", "raised", "round", "white", "ceramic", "bowl", "vessel sink", "overmount sink"], "difficult_direct_answer": false, "rationales": ["The washbasin is raised off the counter with a high wall.", "There is a vessel sink inside of the kitchen.", "The sink is above the countertop."], "image": "train2014/COCO_train2014_000000266366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 326966, "question_id": "PwkFKggmfhtSMmKHzuFApm", "question": "What color is the cigarette part of this man's costume?", "choices": ["tan", "brown", "black", "white"], "correct_choice_idx": 2, "direct_answers": ["white", "black", "white black", "black", "white", "black", "black", "white", "black white", "tie"], "difficult_direct_answer": false, "rationales": ["Based on the location of the ember at the end of the cigarette and the holder in his mouth, the cigarette color can be inferred as it is clearly visible in between.", "They are always this color", "The tip of the cigarette is black."], "image": "train2014/COCO_train2014_000000326966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187543, "question_id": "Pwq6EYPBs3EWgAyHnf6kZ4", "question": "The man has what on his face?", "choices": ["tattoo", "clown nose", "mustache", "egg"], "correct_choice_idx": 2, "direct_answers": ["facial hair", "mustache", "anger", "mustache", "mustache", "headset", "mustache", "human", "mustache", "happy"], "difficult_direct_answer": false, "rationales": ["The man has a mustache.", "The man's face is clearly visible and the features on it are also visible and identifiable based on their defining characteristics.", "The man has hair above his upper lip."], "image": "val2014/COCO_val2014_000000187543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360665, "question_id": "PxMKjatH9P3mVuQeeggXjA", "question": "What are the animals near?", "choices": ["basilica", "bridge", "cliff", "ocean"], "correct_choice_idx": 1, "direct_answers": ["bridge", "bridge", "dwdwd", "bridge", "sheep", "bridge", "bridge", "bridge", "bridge", "bridge"], "difficult_direct_answer": false, "rationales": ["The sheep are by a bridge.", "These are long spans of road to go over something.", "The structure is elevated above ground."], "image": "train2014/COCO_train2014_000000360665.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513219, "question_id": "PxdndwjnkgnGTwtgJBpXiV", "question": "Which object is used for warmth in this room?", "choices": ["plant", "fire place", "floor", "sofa"], "correct_choice_idx": 1, "direct_answers": ["fireplace", "fireplace", "fireplace", "fireplace", "fireplace", "decorations", "fireplace", "fire place", "fireplace", "fire"], "difficult_direct_answer": false, "rationales": ["The sofa, plant, and floor are not capable of generating meaningful amounts of heat.", "There is a little fire place providing warmth in this room.", "Sofas, plants, and floors do not generate meaningful amounts of heat. the item in the middle can generate heat by burning wood."], "image": "val2014/COCO_val2014_000000513219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68947, "question_id": "PyLAz3BX9vXonf6axnDfpj", "question": "What is in the tray?", "choices": ["bird", "cookies", "pizza", "eggs"], "correct_choice_idx": 2, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["The item in the tray has a crust and is covered in cheese. it has also been cut into triangular slices.", "There are several slices with breading at the outside edges. there is cheese on the top.", "There is a pizza on the tray."], "image": "train2014/COCO_train2014_000000068947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461254, "question_id": "Q5kwvJg3hJWMVTjtKSW7ZS", "question": "What is the side dish on the plate?", "choices": ["fries", "tomato", "apples", "beans"], "correct_choice_idx": 3, "direct_answers": ["beans", "bean", "dish", "beans", "beans", "beans", "beans", "beans", "beans", "nothing"], "difficult_direct_answer": false, "rationales": ["The dish has beans.", "They are the canned variety of these", "There is a pile of beans on the side of the dish."], "image": "val2014/COCO_val2014_000000461254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503269, "question_id": "Q7jEbiMMrffrMbehY2u7jR", "question": "What is the man wearing?", "choices": ["raincoat", "suspenders", "egg hat", "glasses"], "correct_choice_idx": 3, "direct_answers": ["jacket", "black shirt", "glasses", "jacket", "jacket", "black sweater", "pullover", "pullover", "jacket", "sweater"], "difficult_direct_answer": false, "rationales": ["The man is wearing glasses.", "The man has glasses.", "The man's face is clearly visible and is wearing rimmed spectacles over his eyes."], "image": "train2014/COCO_train2014_000000503269.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578344, "question_id": "Q8dx8VEA4i52W48XPS2HwK", "question": "What is the statue holding?", "choices": ["torch", "pizza", "television", "plunger"], "correct_choice_idx": 1, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "server", "pizza", "server"], "difficult_direct_answer": false, "rationales": ["The statue is of a chef holding a pan with pizza on it.", "The statue is wearing the typical \"italian cook\" outfit, and that food is typical italian dish.", "The statue has pizza."], "image": "val2014/COCO_val2014_000000578344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94353, "question_id": "Q8xXXefCQnwBYbKxeLRg5M", "question": "What is this group of animals called?", "choices": ["clowder", "school", "herd", "pride"], "correct_choice_idx": 2, "direct_answers": ["elephants", "herd", "herd", "elephants", "herd parade", "elephants", "herd", "elephants", "elephant", "herd"], "difficult_direct_answer": false, "rationales": ["There is a group of elephants.", "A group of elephants is a herd.", "The group is a herd."], "image": "train2014/COCO_train2014_000000094353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522791, "question_id": "Q9RU8Ms4WtU9xYXx4a7KSr", "question": "What is he using the long object in his hands for?", "choices": ["cut", "toast", "mix", "turn over"], "correct_choice_idx": 3, "direct_answers": ["flipping", "grilling", "picking up", "cooking", "tongs gripping", "stick", "turn over", "tong", "turning", "bcooking"], "difficult_direct_answer": true, "rationales": ["Tongs are used to perform the task outlined in a.", "Tongs are used to turn hot dogs over so they fully cook.", "He has bun and hot dogs."], "image": "val2014/COCO_val2014_000000522791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500099, "question_id": "Q9n94DKUPDSHSCfXTqheg6", "question": "What is the holder hanging on the wall holding?", "choices": ["mail", "paper towels", "printer paper", "tissue paper"], "correct_choice_idx": 0, "direct_answers": ["bills", "papers", "mail", "papper", "papers", "mail", "mail", "notes", "mail", "paper"], "difficult_direct_answer": false, "rationales": ["There are envelopes with addresses on them in the holder.", "The holder has mail.", "The holder hanging on the wall is holding pieces of mail."], "image": "train2014/COCO_train2014_000000500099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49756, "question_id": "QB7niR948wyQgJRRbhKszU", "question": "What color is the saddle's leather on the back of the horse?", "choices": ["black", "tan", "red", "white"], "correct_choice_idx": 0, "direct_answers": ["black", "black", "brown", "black", "black", "brown", "black", "black", "gray", "green"], "difficult_direct_answer": false, "rationales": ["The saddle is not tan, red, or white.", "It's obviously not any of the other color options.", "The color is black."], "image": "val2014/COCO_val2014_000000049756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182895, "question_id": "QBwGR5mgTSdYguX4jGySKq", "question": "What is near the trees?", "choices": ["parking meter", "baby", "goat", "elk"], "correct_choice_idx": 0, "direct_answers": ["parking meter", "parking meters", "field", "fef", "parking meters", "parking meters", "meter", "parking meters", "parking meters", "parking meters"], "difficult_direct_answer": false, "rationales": ["A row of parking spots is lined with trees and square devices.", "The tall poles right next to the road suggest it is next to public parking pots which require payment in the meters.", "The meter is nearby."], "image": "val2014/COCO_val2014_000000182895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416165, "question_id": "QC4sWEBoNmYSi3MRun4wNE", "question": "What is near the top of the tower?", "choices": ["baby", "clock", "egg", "gargoyle"], "correct_choice_idx": 1, "direct_answers": ["clock", "clock", "clock", "clock", "star", "clock", "weather vane", "clock", "clock", "tree"], "difficult_direct_answer": false, "rationales": ["There is a clock on the tower.", "The clock tower is clearly visible and the only object on the list of answers near the top is answer a.", "There is a clock face on the top of the tower with lights streaming off of it."], "image": "train2014/COCO_train2014_000000416165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398818, "question_id": "QD7wRXKfuTWTKbBR9bkmZT", "question": "What is on the fruit?", "choices": ["salad", "sticker", "ant", "mold"], "correct_choice_idx": 1, "direct_answers": ["banana", "stickers", "sticker", "stickers", "stop", "sticker", "sticker", "sticker", "sticker", "car"], "difficult_direct_answer": false, "rationales": ["There are round stickers on the bananas.", "The fruit has a sticker.", "Stickers are on the fruit."], "image": "val2014/COCO_val2014_000000398818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235268, "question_id": "QDFytyifhi3sdapwqeUhG4", "question": "What season is this room decorated for?", "choices": ["spring", "fall", "winter", "summer"], "correct_choice_idx": 1, "direct_answers": ["fall", "fall", "fall time", "fall", "winter", "fall", "winter", "fall", "fall", "fall"], "difficult_direct_answer": false, "rationales": ["There are gold leaves", "The mantle in the room appears to have leaves in the color they might appear in autumn. one puts this type of decorations out in the season they are meant to represent.", "The leaves on the mantle make it clear. that said, this could just be a decorative theme that has nothing to do with the outside seasons."], "image": "train2014/COCO_train2014_000000235268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306848, "question_id": "QDoj6PLpq4ijHpfLvSC4Y3", "question": "What is on top of the hill?", "choices": ["eagle", "egg", "horse", "pumpkin"], "correct_choice_idx": 2, "direct_answers": ["horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["There is a black horse on top of the hill.", "The horse is on top.", "A horse is at the top of the hill."], "image": "train2014/COCO_train2014_000000306848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41570, "question_id": "QELQ4YEe83p96LQC6UNup3", "question": "The woman wearing a white hat with two children on her rear is riding what color of street bike?", "choices": ["orange", "white", "red", "blue"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The woman is in white.", "(a) white. the street bike looks like it is white except for the handlebars and headlight.", "The bike is the same color as her helmet"], "image": "val2014/COCO_val2014_000000041570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138180, "question_id": "QEizFGpSLvxUkL5qtnrJxv", "question": "What is making the stripe on her leg?", "choices": ["medical tape", "packing tape", "masking tape", "kt tape"], "correct_choice_idx": 3, "direct_answers": ["grey", "kt tape", "tension tape", "bandage", "tape", "tap", "grey", "paint", "white tape", "tape"], "difficult_direct_answer": false, "rationales": ["The kt tape is striped.", "This is used to help athletes with muscle injuries or strain so they can keep playing", "The stripe allows her to play since it stabilizes injuries."], "image": "val2014/COCO_val2014_000000138180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97367, "question_id": "QF6wJGD9dBRTxAAoLWhMvp", "question": "What is poking its head out of the side of the vehicle lagging behind the others?", "choices": ["horse", "dog", "cow", "cat"], "correct_choice_idx": 0, "direct_answers": ["travel", "driving", "horse", "horse", "stop", "horse", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["This is a trailer that transports these animals", "The trailer has a horse in it.", "A horse has a long face."], "image": "train2014/COCO_train2014_000000097367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302982, "question_id": "QFCPN7Fj3NpT2tSo2BNgcj", "question": "How many elephants are standing underneath of the iron roof and walking on the stone floor?", "choices": ["six", "five", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three elephants", "three", "three", "two", "three", "four", "two"], "difficult_direct_answer": false, "rationales": ["Two are close and one is in the distance", "Based on the individual bodies observable and identifiable as elephants because of their features, answer a is accurate.", "All you have to do is simply count the elephants."], "image": "train2014/COCO_train2014_000000302982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 318415, "question_id": "QFqR6NpLsUxj554LTcJboo", "question": "What is the quickest vehicle here in order to make a getaway from a hypothetical alien invasion?", "choices": ["bicycle", "airplane", "car", "tank"], "correct_choice_idx": 2, "direct_answers": ["car", "submarine", "car", "car", "car", "car", "car", "car", "car", "car"], "difficult_direct_answer": false, "rationales": ["The car can go at the fastest speed.", "The vehicle is the car.", "The car is quickest."], "image": "val2014/COCO_val2014_000000318415.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237225, "question_id": "QGuoQAbMrKuHNGSpwBCkyB", "question": "What is near the window?", "choices": ["baby", "cat", "dog", "plant"], "correct_choice_idx": 3, "direct_answers": ["vase", "flower vase", "flowers", "roses", "cake", "cake", "plant", "flowers", "cake", "vase"], "difficult_direct_answer": false, "rationales": ["You put plants on the window sill so they can benefit from the sun coming through the windows", "A cake sits in front of a window with two vases on the sill that have flowers and greenery in them.", "Actually, it's two of these cut and in vases."], "image": "train2014/COCO_train2014_000000237225.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24061, "question_id": "QHbYJdUWMo3WYdgd7yoTrm", "question": "What is the box on the toilet tank used for?", "choices": ["shampoo storage", "soap", "cotton balls", "tissue boxes"], "correct_choice_idx": 3, "direct_answers": ["tissues", "tissue", "tissues", "tissue boxes", "tissue", "flushing", "tissues", "tissue", "trash", "tissue"], "difficult_direct_answer": false, "rationales": ["This holds kleenex", "Decorative boxes are used to cover tissue.", "The small cube shape of this receptacle with the oval opening in top is the shape of a tissue box."], "image": "val2014/COCO_val2014_000000024061.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269254, "question_id": "QJeveWkDcrPFBsrAc96zRE", "question": "What color is the background on the surfboard wrapped up with cello wrap?", "choices": ["white", "green", "blue", "yellow"], "correct_choice_idx": 2, "direct_answers": ["blue", "grey", "blue", "blue", "blue", "orange/blue", "blue", "blue", "blue", "orange"], "difficult_direct_answer": false, "rationales": ["It is the main color and orange is the accent color", "The surfboard is blue with orange stripes.", "The board has two colors but the major color is the color of the sky."], "image": "val2014/COCO_val2014_000000269254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470398, "question_id": "QKjfLGaup5qpMzxFCcasJt", "question": "What company owns the largest vehicle here?", "choices": ["united", "delta", "ford", "john deere"], "correct_choice_idx": 1, "direct_answers": ["delta", "delta", "delta", "delta", "delta", "delta", "delta", "delta", "delta", "delta"], "difficult_direct_answer": false, "rationales": ["The airplane with the company label is the largest vehicle shown.", "The airplane says delta and has a triangular logo.", "The company is delta."], "image": "train2014/COCO_train2014_000000470398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50956, "question_id": "QL7ZrxcvrjK8UcU7aTa3oS", "question": "How many colors are on the top of the umbrella carried by the man on the side of the pool?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["His umbrella is red and blue.", "There are only two colors on the umbrella.", "There are 2 colors."], "image": "val2014/COCO_val2014_000000050956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404226, "question_id": "QMM3Etwd6PbL7eyvn3k6uB", "question": "What is the sponsor's industry?", "choices": ["automotive", "job search", "electronics", "clothing"], "correct_choice_idx": 1, "direct_answers": ["employment", "ball", "career builder", "stop", "unknown", "play", "baseball", "carbuj", "job search", "unknown"], "difficult_direct_answer": true, "rationales": ["It's for job searching.", "Career builder is being advertised behind the batter and catcher. it is a site for people to search for jobs.", "The industry is in the job search field."], "image": "train2014/COCO_train2014_000000404226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560819, "question_id": "QMZwiJaUqfqB998EQe6MVx", "question": "What is in the athlete's hand?", "choices": ["football", "basketball", "tennis racquet", "baseball bat"], "correct_choice_idx": 2, "direct_answers": ["tennis racket", "racquet", "tennis racquet", "racquet", "tennis racket", "racquet", "bracket", "tennis ball", "racquet", "racket"], "difficult_direct_answer": false, "rationales": ["They're performing a serve.", "The outstretched arm of this athlete on our left hold's an item to strike the airborne tennis ball presumably above the images edge.", "He is playing tennis."], "image": "val2014/COCO_val2014_000000560819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577448, "question_id": "QMqqm5B5EUFPKFH3fp65pw", "question": "What has the train indicated it is accessible to?", "choices": ["planes", "cars", "bikes", "wheelchairs"], "correct_choice_idx": 3, "direct_answers": ["wheelchairs", "handicap", "wheelchairs", "wheelchairs", "wheelchairs", "wheelchairs", "wheelchairs", "sit", "black", "hanicap"], "difficult_direct_answer": false, "rationales": ["The area means it's usable through wheelchairs.", "A a picture of disabled person on awheechair.", "The train is for wheelchairs."], "image": "train2014/COCO_train2014_000000577448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137108, "question_id": "QN9yckCxD5XWihaRwktCEA", "question": "What is the person swatting at?", "choices": ["fly", "hungry bear", "ant", "tennis ball"], "correct_choice_idx": 3, "direct_answers": ["tennis ball", "tennis ball", "tennis ball", "ball", "ball", "tennis ball", "ball", "tennisball", "tennis ball", "tennis ball"], "difficult_direct_answer": false, "rationales": ["The person is swinging at a tennis ball with a racquet.", "This person is playing tennis.", "The person is hitting a tennis ball."], "image": "val2014/COCO_val2014_000000137108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537444, "question_id": "QPyVd6XNKYEKGgXjgzqjNV", "question": "What kind of fuel does this cat run on?", "choices": ["firewood", "kerosene", "food", "gas"], "correct_choice_idx": 2, "direct_answers": ["food", "cat food", "food", "not", "food", "food", "catnip", "water", "cat food", "food"], "difficult_direct_answer": false, "rationales": ["The fuel is food.", "There is a brown fluffy cat on the bed that needs food to live off of.", "As with other mammals, cats need a to grow and thrive."], "image": "train2014/COCO_train2014_000000537444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493192, "question_id": "QQ8CTEdyAmA8N8nYtAGwzZ", "question": "What does this man ski most closely to?", "choices": ["forest", "slope", "mountain ridge", "hill"], "correct_choice_idx": 2, "direct_answers": ["rocks", "skating", "mountain", "mountain", "mountain", "cliffs", "mountain", "mountain", "mountain", "mountain ridge"], "difficult_direct_answer": false, "rationales": ["People ski mostly at the mountains.", "The man is skiing next to a rocky mountain ridge.", "The slope near the man is on has several rocky crags behind him."], "image": "val2014/COCO_val2014_000000493192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307825, "question_id": "QQWPE9aWRmi2GDAKyzd5R2", "question": "How many people are sitting in the fishing boat on this day?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 2, "direct_answers": ["one", "two", "two", "one", "two", "one", "two", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["There are two people fishing in the boat.", "A. you can count the number of people and this number is typical of how many people fit into a small boat.", "There are 2 people."], "image": "train2014/COCO_train2014_000000307825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52484, "question_id": "QQtoEVtMCtZG5JRMFYqUV3", "question": "What kind of beverage is being enjoyed with the pizza?", "choices": ["soda", "beer", "vodka", "juice"], "correct_choice_idx": 1, "direct_answers": ["wine", "juice", "beer", "beer", "bear", "lager", "beer", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["Lager is a type of alcohol, and beer is sometimes packaged in aluminum cans.", "There is visible and readable language on the can that is consistent with answer a and details its contents.", "The word lager can be seen on the can which is a type of beer."], "image": "train2014/COCO_train2014_000000052484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412680, "question_id": "QQufxt2tFqmeDqTGckzwfr", "question": "What is the side dish?", "choices": ["potato salad", "beets", "carrots", "fries"], "correct_choice_idx": 0, "direct_answers": ["potatoe salad", "guard", "potato salad", "potatoe salad", "white sauce", "potato salad", "potato salad", "potato salad", "potato salad", "boiled ham"], "difficult_direct_answer": false, "rationales": ["It is chunks of potato with a creamy dressing", "The dish is potato salad.", "The side dish is potato salad. the potatoes are in chunks."], "image": "train2014/COCO_train2014_000000412680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82275, "question_id": "QRNGDMNWrVqCLFHtUVMCEP", "question": "What are the people exiting from?", "choices": ["airplane", "taxi", "restaurant", "arena"], "correct_choice_idx": 0, "direct_answers": ["plane", "airplane", "airplane", "aeroplane", "airplane", "airplane", "airplane", "airplane", "airplane", "plane"], "difficult_direct_answer": false, "rationales": ["This is a passenger jet", "The people are exiting from an airplane.", "The people are leaving a vehicle that is capable of flying."], "image": "train2014/COCO_train2014_000000082275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300661, "question_id": "QRvrycENH4DQcsowtgk4mF", "question": "What ceremony is this replicating?", "choices": ["wedding", "first birthday", "graduation", "lawsuit"], "correct_choice_idx": 0, "direct_answers": ["wedding", "wedding", "wedding", "wedding", "marriage", "wedding", "wedding", "wedding", "wedding", "wedding"], "difficult_direct_answer": false, "rationales": ["The ceremony is a wedding.", "One can see the bride, groom, and officiant.", "There is a bride and groom present so it's a wedding."], "image": "train2014/COCO_train2014_000000300661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360126, "question_id": "QUAnQhPRvYDnQ84N86LC4X", "question": "What color are the french fries on to the right of the sandwich?", "choices": ["orange", "purple", "green", "white"], "correct_choice_idx": 0, "direct_answers": ["yellow", "yellow", "yellow", "tan", "golden brown", "white", "orange", "brown", "yellow", "brown"], "difficult_direct_answer": false, "rationales": ["The fries are close to orange color.", "They have been cook in oil so they have a light brownish orange color", "That color means they are cooked all the way for flavor and taste. they are ready to eat."], "image": "train2014/COCO_train2014_000000360126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551288, "question_id": "QV7NzJL6oXxSZ9NUkP3FDM", "question": "What is the woman in black about to eat?", "choices": ["hamburger", "pizza", "egg", "hot dog"], "correct_choice_idx": 1, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["She has circular dough with cheese and sauce on top", "The woman in black is visible and identifiable and has a plate of food in front of her that would be what she has been served and will eat. the food in front of her is identifiable as answer a based on its shape and visible ingredients.", "The woman is eating pizza."], "image": "val2014/COCO_val2014_000000551288.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43333, "question_id": "QVoxJUNbEL58b74MHwqrPg", "question": "The belly of the cow standing alone in the middle of the herd is of what color?", "choices": ["black", "gray", "white", "brown"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "white", "checked", "white", "white", "white", "watching", "six"], "difficult_direct_answer": false, "rationales": ["The cow in the middle of the image; inbetween two closer the sea and three to the right is white with brown splotches.", "The cow's belly is very light in color.", "The cow standing in the middle has a white belly"], "image": "train2014/COCO_train2014_000000043333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135460, "question_id": "QWvGHmCzaiVk3KpFApRWZV", "question": "What is on top of the bread?", "choices": ["tomato", "butter", "seeds", "cream cheese"], "correct_choice_idx": 2, "direct_answers": ["seeds", "seeds", "seeds", "seeds", "poppy seeds", "seeds", "poppy seeds", "seeds", "poppy seeds", "toppings"], "difficult_direct_answer": false, "rationales": ["The bread has seeds.", "There are some seeds on top of each roll.", "Some breads have those tiny bits that make it even more flavorful. there's lots of sand-like bits on top of them."], "image": "val2014/COCO_val2014_000000135460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183757, "question_id": "QXfKLiRhX7H797HMf7aeNW", "question": "What is above the dog?", "choices": ["old man", "cat", "balloon", "chicken"], "correct_choice_idx": 1, "direct_answers": ["cat", "cat", "cat", "cat", "hoist", "rope", "cat", "cat", "railing", "cat"], "difficult_direct_answer": false, "rationales": ["The cat is above the dog.", "The thing above the dog is a non-human animal. it is not a bird.", "There is a cat."], "image": "val2014/COCO_val2014_000000183757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464075, "question_id": "QYA3Azvh25wwbnkRQ6WZpU", "question": "How many boats are countable here on the beachhead tied to the land?", "choices": ["six", "five", "two", "four"], "correct_choice_idx": 3, "direct_answers": ["three", "five", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four boats.", "There are a total of five boats on the beach.", "There are 4 boats next to each other tied to the land on the beach"], "image": "train2014/COCO_train2014_000000464075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484563, "question_id": "QZnT5jgWHMGHBbKNVwww5x", "question": "The man on his cell phone is sitting on a vehicle that is likely made for what age?", "choices": ["19", "25", "four", "60"], "correct_choice_idx": 2, "direct_answers": ["twenty five", "30s", "tricycle", "four", "young child", "ten", "young", "twenty four", "ten", "five"], "difficult_direct_answer": true, "rationales": ["The vehicle is made for young people. an adult does not fit easily on it.", "This is a children's toy", "A man is sitting on a kids \"powerwheel\" trike."], "image": "train2014/COCO_train2014_000000484563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544238, "question_id": "Qa9ikvbdVpuKmU8Qr9QAkG", "question": "What color is the middle section of the baseball bat used by the girl?", "choices": ["silver", "red", "yellow", "blue"], "correct_choice_idx": 0, "direct_answers": ["black", "silver", "silver", "silver", "silver", "silver", "gold", "silver", "silver", "silver"], "difficult_direct_answer": false, "rationales": ["A young girl is holding a bat with a shiny grey looking material between the blue and black handle.", "The color is a neutral shade that is \"no color\" in a sense.", "The upper section is blue. the middle section is a different color and is not red or yellow."], "image": "train2014/COCO_train2014_000000544238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428486, "question_id": "QaWe5nnf848HttinQVV23K", "question": "What object is the same color as the plastic end cap to the item the little girl is holding?", "choices": ["lotion dispenser", "tray", "soap dispenser", "shampoo bottle"], "correct_choice_idx": 2, "direct_answers": ["bsll", "thing", "soap dispenser", "soap dispenser", "hand soap", "shirt", "her shirt", "toothbrush", "tap", "pop"], "difficult_direct_answer": true, "rationales": ["Soap dispenser in backdrop.", "The object has a pump to dispense the soap.", "The soap dispenser is the same color as the end cap."], "image": "train2014/COCO_train2014_000000428486.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266336, "question_id": "Qc6bLJDwzubeDJnwp8MFNv", "question": "What is above the microwave?", "choices": ["ceiling lights", "cat", "canned ham", "cardboard box"], "correct_choice_idx": 0, "direct_answers": ["cabinet", "cabinets", "ceiling lights", "cabinet", "cabinet", "open cabinet", "stove", "cabinet", "cabinets", "lights"], "difficult_direct_answer": false, "rationales": ["The object has vents and it's on the ceiling. it's above the stove where you would want this object.", "Lights are on the ceiling in a kitchen.", "The lighting fixture is hanging from the ceiling."], "image": "train2014/COCO_train2014_000000266336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415190, "question_id": "QcjrQzmUBGitBWGTa7iUZu", "question": "What other item did the company whose name appears on the large vehicle make?", "choices": ["hot dogs", "televisions", "phones", "hamburgers"], "correct_choice_idx": 2, "direct_answers": ["airplane", "phones", "phones", "record label", "records", "records", "airplane", "virgin atlantic", "phones", "plane"], "difficult_direct_answer": false, "rationales": ["They also have phones.", "The phones on the tarmac have the airline name virgin painted on the tail, a company that also makes cell phones.", "They have a wireless company"], "image": "train2014/COCO_train2014_000000415190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320703, "question_id": "QcmfyNR739rD8wzMrGumAA", "question": "What is near the opening to the hallway?", "choices": ["cow", "cat", "baby", "refrigerator"], "correct_choice_idx": 3, "direct_answers": ["refrigerator", "water jug", "fridge", "water dispenser", "water cooler", "fridge", "fridge", "fridge", "refrigerator", "refrigerator"], "difficult_direct_answer": false, "rationales": ["The hall has a fridge.", "It is located at the end of the kitchen.", "It has handles for the two doors. one door is smaller on the top for the freezer and the larger door is the regular area"], "image": "val2014/COCO_val2014_000000320703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402359, "question_id": "Qd73AV2k4EhvZLpqsn4tRp", "question": "What era are the statues reminiscent of?", "choices": ["tokugawa shogunate", "ancient egypt", "gold rush", "italian renaissance"], "correct_choice_idx": 1, "direct_answers": ["egyptian", "egypt", "human", "ancient egypt", "stachive", "ancient egypt", "ancient egypt", "before christ", "ancient", "ancient egypt"], "difficult_direct_answer": false, "rationales": ["The era is ancient egypt.", "The figure is dark skinned and the hair is dark which is characteristic of this country. the artistic style is from and older style of norther africa.", "The era is from egypt."], "image": "train2014/COCO_train2014_000000402359.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65604, "question_id": "QdWoffDwkmCgtEJhu5swQp", "question": "What is the design on the wall?", "choices": ["apple", "echidna", "frog", "zebra"], "correct_choice_idx": 3, "direct_answers": ["zebra", "zebra", "mural", "zebra", "zebra", "zebra", "zebra", "cheked", "drawing", "zebra"], "difficult_direct_answer": false, "rationales": ["An animal with stripes is depicted on a wall.", "A zebra is on the wall.", "This horse like animal is in the picture."], "image": "val2014/COCO_val2014_000000065604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355133, "question_id": "QeUAffJ8mcmcDPavPcyXQ5", "question": "What device is usually used with the item on the tray?", "choices": ["cookie cutter", "chopsticks", "pitchfork", "pizza cutter"], "correct_choice_idx": 3, "direct_answers": ["oven", "pizza", "knife", "pizza cutter", "knife", "fork", "knife", "plate", "pan", "plate"], "difficult_direct_answer": false, "rationales": ["Here two plates with two slices of pizza each on them are pictured. a pizza cutter is the only device of those listed which would be used with pizza.", "Pizza is sliced with a pizza cutter.", "The device is a cutter."], "image": "train2014/COCO_train2014_000000355133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418985, "question_id": "QfKM2sEaEEotwK5cyPT6s7", "question": "What is in the bowl with the bananas?", "choices": ["lemons", "limes", "strawberries", "cherries"], "correct_choice_idx": 2, "direct_answers": ["oranges", "strawberries oranges", "oranges", "strawberries", "eat", "orange", "orange", "strawberries", "oranges strawberries", "other fruits"], "difficult_direct_answer": false, "rationales": ["There are multiple objects also in the bowl with the bananas all identifiable by their color and shape. of the list of answer options, only a is clearly visible.", "The small fruit is characteristic of strawberries. in addition, the fruit is red like strawberries.", "There are oranges and a red fruit studded with seeds."], "image": "train2014/COCO_train2014_000000418985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 545844, "question_id": "Qh2fxPe7SwnBVrbyARZcxF", "question": "What is on top of the pizza in the foreground?", "choices": ["ham", "mussels", "sausage", "pepperoni"], "correct_choice_idx": 1, "direct_answers": ["shells", "mussels", "clams", "car", "oysters", "seafood", "mussels", "there", "plate", "mussels"], "difficult_direct_answer": false, "rationales": ["Mussels are sea food in to closed shells.", "Those are mussels on the pizza", "The pizza has mussels."], "image": "train2014/COCO_train2014_000000545844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570810, "question_id": "QihQDp2Ex49tYkaQ2JWLoU", "question": "How many suitcases are laying on the luggage return carousel?", "choices": ["four", "three", "two", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "two", "two", "three", "two", "three", "two", "two", "two", "three"], "difficult_direct_answer": false, "rationales": ["The luggage return carousel is clearly visible and the number of items on it is countable.", "There are three suitcases.", "There are 3."], "image": "val2014/COCO_val2014_000000570810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51089, "question_id": "QjAxRptinaeCSVPpBdmAK8", "question": "What does the sign here on the left say is forbidden?", "choices": ["cross", "pass", "turn u", "speed"], "correct_choice_idx": 2, "direct_answers": ["u turn", "u turn", "turn u", "u turn", "clear", "uturn", "u turn", "u-turns", "u turns", "stop"], "difficult_direct_answer": false, "rationales": ["This is obvious because the arrow represents this type of turn and is covered in a red slash.", "The sign indicates that u-turns are not allowed.", "The sign says turn."], "image": "val2014/COCO_val2014_000000051089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340126, "question_id": "QjED4D8W5tRewrFWjKufSc", "question": "How many portraits are hung on the side of this red wall?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is a painting of a landscape on the wall above the bed.", "The portrait is above the bed,", "There is one big portrait hung on the side of the wall."], "image": "train2014/COCO_train2014_000000340126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454291, "question_id": "QjMX9eKE58Pcje6BfysHXn", "question": "What is the man with the number 2 on his back swinging at?", "choices": ["person", "post", "bird", "ball"], "correct_choice_idx": 3, "direct_answers": ["bat", "baseball", "baseball ball", "baseball", "baseball", "baseball", "baseball", "ball", "ball", "ball"], "difficult_direct_answer": false, "rationales": ["The man swing with the ball in the pitch.", "He is swinging his bat to try to hit the ball.", "A man is batting in a baseball game."], "image": "train2014/COCO_train2014_000000454291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139564, "question_id": "QkNVwggNt4KtidkaqrLvUM", "question": "How many giraffes are here with their noses pointed toward the camera?", "choices": ["one", "four", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["four", "three", "three", "three", "zero", "three", "three", "four", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "You can see both eyes of three of the animals.", "There are three giraffes."], "image": "train2014/COCO_train2014_000000139564.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 50927, "question_id": "QkcPPEr8F3udYnpbv84g7X", "question": "What team is at bat?", "choices": ["brooklyn dodgers", "seattle mariners", "boston beaneaters", "cleveland spiders"], "correct_choice_idx": 1, "direct_answers": ["white", "new brunswick", "white jerseys", "white", "oakland", "white", "white sox", "seattle mariners", "yankees", "yankees"], "difficult_direct_answer": false, "rationales": ["The batter is wearing a uniform form the mariners, indicating the team he is from.", "The seattle mariners are at bat.", "Thee team is from washington state."], "image": "train2014/COCO_train2014_000000050927.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205250, "question_id": "QmmShp2g97SMh7PdWEPArj", "question": "What kind of material is the coating of the man who is stood on the right near the luggage return?", "choices": ["felt", "corduroy", "denim", "leather"], "correct_choice_idx": 3, "direct_answers": ["leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather", "leather"], "difficult_direct_answer": false, "rationales": ["The dark shiny coat is made of leather.", "The material is shiny.", "The man has a leather jacket."], "image": "train2014/COCO_train2014_000000205250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36053, "question_id": "QoGamohpnnrytdfvm4YzCT", "question": "What is on the sink?", "choices": ["book", "water bottle", "egg", "cat"], "correct_choice_idx": 1, "direct_answers": ["toothpaste", "water bottle", "water toothpaste", "bottle", "water bottle", "bottles", "bottle", "water bottle", "water bottle", "water bottle"], "difficult_direct_answer": false, "rationales": ["A pedestal sink in a bathroom has a clear bottle with a blue twist off lid. water comes in clear plastic bottles with twist off lids.", "There is a clear container on the sink. it is not a cat, egg, or book.", "The other options aren't in the photo. many people use bottled water in the bathroom to rinse their mouths."], "image": "val2014/COCO_val2014_000000036053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350167, "question_id": "QqeSrpnqmp8b3sTcUvitn8", "question": "The person that is running is wearing what?", "choices": ["crown", "armor", "cape", "sunglasses"], "correct_choice_idx": 3, "direct_answers": ["shots", "shorts", "swim trunks", "shades", "shorts", "shorts", "sunglasses", "shorts", "shorts", "swim trunks"], "difficult_direct_answer": false, "rationales": ["The person has sunglasses.", "A man in dark glasses is on the beach.", "The person running around the beach is wearing sunglasses."], "image": "val2014/COCO_val2014_000000350167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20702, "question_id": "QqhGmEzdSiLyFNLRHdCSzo", "question": "What is the ketchup likely for?", "choices": ["hamburger", "spaghetti", "rice", "fries"], "correct_choice_idx": 3, "direct_answers": ["fries", "fries", "fries", "hot dog", "hotdog", "hotdog", "fries", "french fries", "french fries", "hot dog"], "difficult_direct_answer": false, "rationales": ["The food items are clearly visible and answer a is commonly eaten in conjunction with the ketchup.", "It's the condiment people like most on their side of potatoes.", "The ketchup is for fries."], "image": "train2014/COCO_train2014_000000020702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560083, "question_id": "QqkVfhRqL9VCDKUuQRx2GX", "question": "What are these men displaying?", "choices": ["pilots license", "movie ticket", "entrance ticket", "police badge"], "correct_choice_idx": 0, "direct_answers": ["tickets", "ride", "cards", "licenses", "tickets", "tickets", "ticket", "pilots license", "pink boxes", "tickets"], "difficult_direct_answer": false, "rationales": ["The men are showing off their pilots' licenses.", "The men are standing near an airplane, not a police station, amusement park, or movie theater. the papers allow them to fly the airplane.", "The men have their licenses."], "image": "train2014/COCO_train2014_000000560083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328786, "question_id": "QrmwJMXmt5rTzyh3KSnte8", "question": "What is on the grass?", "choices": ["antelope", "cow", "baby", "bench"], "correct_choice_idx": 3, "direct_answers": ["dry leaves", "brown leafs", "leaves", "leaves", "bench", "leaves", "bench", "bench", "leaves", "leaves"], "difficult_direct_answer": false, "rationales": ["There are seats.", "There are no humans or animals on the grass. there are objects for sitting.", "These are on platforms on the grass"], "image": "val2014/COCO_val2014_000000328786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43997, "question_id": "Qvmbb39x3TxroX2VLZg43i", "question": "How many people can wash their hands at the same time?", "choices": ["15", "12", "four", "nine"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "four", "four", "four", "four", "eight", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["This seems indicated by the number of sinks, soap dispensers, etc.", "There are four sinks.", "There are four sinks in the bathroom so four people can wash hands at the same time."], "image": "val2014/COCO_val2014_000000043997.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313020, "question_id": "Qw4CMUb9tQ98bkjVQaPfeR", "question": "What color is the batting helmet worn by the man at home plate?", "choices": ["orange", "black", "blue", "green"], "correct_choice_idx": 3, "direct_answers": ["red", "blue", "red", "red", "red", "red", "green", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The man is wearing a red helmet.", "The color is green.", "The color is green."], "image": "train2014/COCO_train2014_000000313020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276133, "question_id": "QwgmqpfeSNLVhQjmBAzbQF", "question": "Where did the food in the dish come from?", "choices": ["sky", "ground", "elephant waste", "rock"], "correct_choice_idx": 1, "direct_answers": ["ground", "india", "ground", "ground", "ground", "garden", "ground", "underground", "earth", "garden"], "difficult_direct_answer": false, "rationales": ["Carrots grow in the dirt.", "Carrots are tubers or root crops. the other options don't appear in this image.", "These are root vegetables"], "image": "val2014/COCO_val2014_000000276133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289870, "question_id": "QxRZxuUTsyqUmyxW6eJsfh", "question": "What is next to the vehicle?", "choices": ["cat", "carriage", "dog", "gate"], "correct_choice_idx": 3, "direct_answers": ["barriers", "station", "train", "road", "ink", "man", "train", "sign", "gate", "man"], "difficult_direct_answer": false, "rationales": ["A gate is next to the train.", "There is a gate by the train.", "There is a black and yellow gate next to the vehicle."], "image": "val2014/COCO_val2014_000000289870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129576, "question_id": "Qyz4jUqyTDgDqfN8Dw65qo", "question": "What color is the leather sectional in the corner of this entertainment room?", "choices": ["red", "sky blue", "purple", "white"], "correct_choice_idx": 1, "direct_answers": ["sky blue", "grey", "blue", "gray", "grey", "slate blue", "gray", "blue", "light blue", "turquoise"], "difficult_direct_answer": false, "rationales": ["The color is sky blue.", "The sectional sofa is sky blue.", "The color is blue."], "image": "val2014/COCO_val2014_000000129576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437467, "question_id": "QzK94hZUVzx67Tk2Kwkvct", "question": "What do the animals have?", "choices": ["long necks", "stingers", "stripes", "talons"], "correct_choice_idx": 2, "direct_answers": ["stripes", "stripes", "stripes", "stripes", "stripes", "stripes", "grass", "stripes", "stripes", "stripe"], "difficult_direct_answer": false, "rationales": ["The animals are zebras.", "The animals have stripes.", "They have stripes."], "image": "train2014/COCO_train2014_000000437467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128813, "question_id": "QzRfQ35khuiSwmKpmkt6Te", "question": "Who plays with the items near the camera?", "choices": ["senator", "baseball player", "judge", "baby"], "correct_choice_idx": 3, "direct_answers": ["teddy bear", "kids", "teddy bear", "kids", "stuffed bears", "child", "bear", "baby", "children", "teddy bear"], "difficult_direct_answer": false, "rationales": ["That said, the other three might own and play with these items as well. it's merely more common with babies and young children.", "The stuffed animals on either side of the cameras in this image would be most appropriate for the very young.", "The items are stuffed animals."], "image": "val2014/COCO_val2014_000000128813.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360868, "question_id": "R2QoS9wykHCr9BivfgMAHw", "question": "How many circular hung objects re found in this kitchen area?", "choices": ["five", "three", "two", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "hand washer", "one", "four", "four", "two", "four", "four", "two"], "difficult_direct_answer": false, "rationales": ["There are five round things hanging on the wall.", "There are 4 objects.", "There are four."], "image": "train2014/COCO_train2014_000000360868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238708, "question_id": "R4mU8wgGoweXeVcxU7Q2MD", "question": "What do all the foods being prepared have in common?", "choices": ["vegetables", "meat", "dessert", "dairy"], "correct_choice_idx": 0, "direct_answers": ["green", "green vegetables", "green colored", "vegetables", "vegetables", "vegetables", "green", "vegetables", "vegetables", "green"], "difficult_direct_answer": false, "rationales": ["A table has several bowls filled with green leafy food items. vegetables are often green and leafy.", "The food is green and fiber rich. they're all grown from a seed.", "The foods are veggies."], "image": "val2014/COCO_val2014_000000238708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329268, "question_id": "R5f57YehK6pY66gXDsXrN8", "question": "The giraffes are foraging on what part of the tree?", "choices": ["bark", "roots", "wood", "leaves"], "correct_choice_idx": 0, "direct_answers": ["car", "bark", "trunk", "stem", "bark", "trunk", "branch", "stop", "base", "bark"], "difficult_direct_answer": false, "rationales": ["The giraffes are eating bark.", "The giraffes are munching on the bark.", "The leaves are gone so they are going for this on the trunk"], "image": "train2014/COCO_train2014_000000329268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451934, "question_id": "R7reQVEEmKbyRhfNpnGoVz", "question": "What is the square metal item on the floor?", "choices": ["heater", "drain", "weight scale", "vent"], "correct_choice_idx": 2, "direct_answers": ["scale", "scale", "weight scale", "weigh machine", "scale", "weight scale", "scale", "scale", "scale", "scale"], "difficult_direct_answer": false, "rationales": ["The object is used to see how much a person weighs.", "It is for stepping and and finding out how heavy one is.", "The square item is a weight scale."], "image": "train2014/COCO_train2014_000000451934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49553, "question_id": "R7vgUMxqDaJq4AZmpxzgr9", "question": "What are the orange items?", "choices": ["carrots", "cats", "traffic cones", "cows"], "correct_choice_idx": 2, "direct_answers": ["barrels", "construction barrel", "cones", "traffic cones", "traffic cones", "barriers", "orange", "cones", "barrels", "traffic barrels"], "difficult_direct_answer": false, "rationales": ["The items are cones.", "These are bright orange to be seen easily in traffic.", "The orange items are traffic cones."], "image": "train2014/COCO_train2014_000000049553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488349, "question_id": "R8Bw9rA3GeBJbi9pxw8dXp", "question": "What type of doors is seen?", "choices": ["sliding glass", "shoji doors", "french doors", "patio door"], "correct_choice_idx": 1, "direct_answers": ["shoji doors", "sliding doors", "wood", "watching", "wood", "sliding", "glass", "pocket", "glass", "sliding doors"], "difficult_direct_answer": false, "rationales": ["The doors are asian style.", "There are japanese style doors in the background.", "The doors are a japanese style door."], "image": "val2014/COCO_val2014_000000488349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225329, "question_id": "R8tyowuZXFsdCLV4HoDidh", "question": "Which city should this tour bus be driving around in?", "choices": ["los angeles", "new york", "miami", "san francisco"], "correct_choice_idx": 1, "direct_answers": ["new york", "new york", "new york", "new york", "new york", "new york", "new york", "new york", "new york", "new york"], "difficult_direct_answer": false, "rationales": ["All of these landmarks are in the big apple.", "There is a statue of liberty.", "This bus is likely in new york with the statue of liberty."], "image": "train2014/COCO_train2014_000000225329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281970, "question_id": "R9AL7ur8wrNeSikhi94gp2", "question": "What is the woman surrounded by?", "choices": ["wine bottles", "elk", "video games", "books"], "correct_choice_idx": 0, "direct_answers": ["wine", "wine bottles", "wine bottles", "wine", "wine", "wine bottles", "wine", "wine", "brick archway", "wine"], "difficult_direct_answer": false, "rationales": ["You can tell by the bottle shape and how they are stacked as to what type they are.", "The objects surrounding the woman are identifiable by their shape and size as well as the labels on them and additionally the glass design the woman is drinking from.", "The woman has wine."], "image": "train2014/COCO_train2014_000000281970.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575174, "question_id": "RBwo9tjPSzDFq5aVn5RztR", "question": "What color is the edge of the tennis racket the little girl is using to practice tennis?", "choices": ["red", "black", "green", "blue"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "green", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["A young girl in a pony tail is standing on a tennis court swinging balls with a green racket in her hand.", "It's green on the ground of the tennis court.", "The color is green."], "image": "val2014/COCO_val2014_000000575174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240755, "question_id": "RDkJgPQWGLbzQXGaPDhedD", "question": "What object used to prevent getting wet is nearby the cat in this image?", "choices": ["chairs", "carpet", "umbrella", "clothing"], "correct_choice_idx": 2, "direct_answers": ["umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["The umbrella protects you from the rain.", "The top part of that shields you from the rain.", "The object is an umbrella."], "image": "train2014/COCO_train2014_000000240755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223467, "question_id": "RDwN2skr6RNWuCpiN7ran3", "question": "What materials likely make up the colorful frame of the mirror?", "choices": ["metal", "terra cotta", "porcelain", "concrete"], "correct_choice_idx": 2, "direct_answers": ["porcelain", "glass", "tile", "glass", "glass", "glass", "glass", "tile", "glass", "tiles"], "difficult_direct_answer": false, "rationales": ["A mirror has a colorful glass frame.", "These are painted so it makes it different than terra cotta", "The frame of the mirror in the bathroom is made of broken pieces of porcelain tiles."], "image": "val2014/COCO_val2014_000000223467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572879, "question_id": "REYMtanmC6X339eTKRH8Xj", "question": "What is the boy with the helmet in the foreground holding?", "choices": ["luggage", "basket", "pizza box", "baseball bat"], "correct_choice_idx": 3, "direct_answers": ["bat", "bat", "bat", "bat", "bat", "baseball bat", "bat", "bat", "bat", "baseball bat"], "difficult_direct_answer": false, "rationales": ["The boy has a bat.", "The young battler is about to hit a baseball with his bat.", "The boy has a bat."], "image": "train2014/COCO_train2014_000000572879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261344, "question_id": "RFhxNTEzb3exZVZHKsGfty", "question": "How many zebras are standing on the hay below the tree?", "choices": ["four", "five", "one", "three"], "correct_choice_idx": 0, "direct_answers": ["four", "five", "four", "five", "five", "five", "four", "five", "five", "four"], "difficult_direct_answer": false, "rationales": ["There are four zebras by the tree.", "There are four visible zebras.", "There are four zebras."], "image": "train2014/COCO_train2014_000000261344.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223467, "question_id": "RHKrmhFpYaEaHggWZ4ooPe", "question": "Why does the tub have round silver objects on it?", "choices": ["hot water", "drainage", "childs toys", "whirlpool"], "correct_choice_idx": 3, "direct_answers": ["water jets", "sauna", "whirlpool", "jets", "jacuzzi jets", "jets", "tub", "jet nozzles", "whirlpool", "water jets"], "difficult_direct_answer": false, "rationales": ["The tub has a whirlpool.", "The silver objects are water jets that are designed to circulate water and massage the body.", "The tub is a whirlpool."], "image": "val2014/COCO_val2014_000000223467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460405, "question_id": "RHdWVnwsAkDneKij7ZLMB8", "question": "What does the animal in the foreground have?", "choices": ["wings", "gills", "spots", "quills"], "correct_choice_idx": 2, "direct_answers": ["spots", "tail", "long neck", "grass", "spots", "spots", "giraffe", "giraffe", "long neck", "grass"], "difficult_direct_answer": false, "rationales": ["The animal has spots.", "The animal is a giraffe whose skin is brown and white in a distinct but unique pattern.", "Giraffe in the front has many round patterns on its fur."], "image": "train2014/COCO_train2014_000000460405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 179070, "question_id": "RJWkGh6zLWJwFEpwAeRGRj", "question": "What state is the company from whose logo appears on the bus?", "choices": ["new york", "missouri", "oklahoma", "michigan"], "correct_choice_idx": 0, "direct_answers": ["new york", "new york", "london", "new york", "new york", "new york", "new york", "state", "deny", "new york"], "difficult_direct_answer": false, "rationales": ["The company has the state in its name.", "The text visible on the bus is a known abbreviation for a company and the last letters of the abbreviation represent the state of answer a.", "New york is the center of fashion. dkny is a company where ny stands for new york."], "image": "train2014/COCO_train2014_000000179070.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221094, "question_id": "RKecB9drDA73KCX7ovGpGY", "question": "What is the woman holding in her hand?", "choices": ["luggage handle", "babys hand", "cat paw", "dog paw"], "correct_choice_idx": 0, "direct_answers": ["suitcase", "suitcase", "suitcase", "suitcase", "suitcase", "purse", "suitcase", "bag", "luggage handle", "luggage"], "difficult_direct_answer": false, "rationales": ["It's on wheels and so she can drag it behind her.", "The woman is rolling a suitcase, holding it by its handle.", "A woman is pulling a suitcase behind her as she rides on the back of a bike."], "image": "val2014/COCO_val2014_000000221094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220736, "question_id": "RMKPtaqyCgi446rGVAvjdR", "question": "How many wicket baskets are on top of the little bench near the doorway?", "choices": ["four", "three", "two", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "five", "three", "three"], "difficult_direct_answer": false, "rationales": ["Although there are four baskets on that side, one of them is in the windowsill and not on the little bench.", "There is a view of a long black cabinet with multiple wicket baskets on left side.", "There are more than two but less than four baskets."], "image": "train2014/COCO_train2014_000000220736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211807, "question_id": "RNjwz3rmQLeaQu3TnL9god", "question": "What did the company make whose logo is on the steel structure?", "choices": ["lumber", "sandwiches", "burgers", "tvs"], "correct_choice_idx": 3, "direct_answers": ["panasonic", "tvs", "stereos", "panasonic", "electronics", "panasonic", "planes", "panasonic", "jetway", "panasonic"], "difficult_direct_answer": false, "rationales": ["The company is tvs.", "The sign on the steel structure refers to panasonic, not subway, mcdonald's, or weyerhaeuser.", "The company is know for making tvs."], "image": "train2014/COCO_train2014_000000211807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468773, "question_id": "RQxoKZbGTKM36NDcXWFw7F", "question": "What animal that cats like is the electronic in this image often referred to as?", "choices": ["rat", "chicken", "mouse", "fish"], "correct_choice_idx": 2, "direct_answers": ["mouse", "mouse", "mouse", "mouse", "kitten", "mouse", "mouse", "mouse", "mouse", "mouse"], "difficult_direct_answer": false, "rationales": ["The hand-driven object is said to look like mice.", "The device has the same name as an animal.", "It is the same shape and size of the animal with the same name."], "image": "val2014/COCO_val2014_000000468773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116393, "question_id": "RSQSjLSBck3zxXa7RLnJVF", "question": "What are the metal boxes on the wall used for?", "choices": ["towels", "hot water", "soap dispenser", "dry hands"], "correct_choice_idx": 2, "direct_answers": ["keep", "soap", "light", "soap", "dispensing soap", "soap dispenser", "stop", "dispensing towels", "soap", "soap"], "difficult_direct_answer": false, "rationales": ["The lever on these boxes and their placement at each sink in the image tell us they dispense soap.", "A long line of sinks in a public bathroom all have a silver, square, metal object on the wall near them.", "The boxes are for soap."], "image": "train2014/COCO_train2014_000000116393.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68120, "question_id": "RTewnbQvQfXCrfwnqiu4W6", "question": "What color are the emblems on the costume for the man on the right?", "choices": ["gold", "red", "black", "yellow"], "correct_choice_idx": 3, "direct_answers": ["brown", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "nothing"], "difficult_direct_answer": false, "rationales": ["The man is wearing blue and gold.", "The man on the right is a union soldier. his emblems are not gold, red, or black.", "The emblems on the front of the horses are yellow."], "image": "val2014/COCO_val2014_000000068120.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468068, "question_id": "RVjBwcGiboyKvkNADLgCyx", "question": "What athlete has a last name that is similar to the name on the bottle?", "choices": ["otis nixon", "mike richter", "jaromir jagr", "ben hogan"], "correct_choice_idx": 2, "direct_answers": ["jager", "chuck jager", "na", "jaromir jagr", "car", "unknown", "jagermeister", "evan jager", "meifer", "mick jager"], "difficult_direct_answer": true, "rationales": ["The letters are the same in the first part", "There is a similar sound with the \"j\" in it.", "They share similar letters"], "image": "train2014/COCO_train2014_000000468068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 74601, "question_id": "RXeFVGCkQoTSFMcNaZXssn", "question": "How many square portraits are hung in the walls of this loft bed?", "choices": ["four", "one", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two square portraits are hung upon the walls.", "There are 2 portraits.", "There are two picture frames hung up"], "image": "train2014/COCO_train2014_000000074601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323682, "question_id": "RXxMrEhj9PMQzxqsZrxh8Y", "question": "Where is the man likely headed?", "choices": ["to circus", "on vacation", "to court", "to prison"], "correct_choice_idx": 1, "direct_answers": ["travel", "vacation", "vacation", "ocean", "airplane", "aiport", "airport", "on vacation", "airport", "vacation"], "difficult_direct_answer": false, "rationales": ["He has a suitcase and travel bags.", "The man is going on vacation.", "The man is pushing several suitcases and bags which implies he is going on vacation."], "image": "val2014/COCO_val2014_000000323682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540, "question_id": "RYzqNUCXHTFaQni39kRT3F", "question": "What is the large vehicle getting ready to do?", "choices": ["fire missiles", "race cars", "race camels", "fly"], "correct_choice_idx": 3, "direct_answers": ["aeroplane", "fly", "airplane", "fly", "airoplane", "take off", "fly", "take off", "take off", "fly"], "difficult_direct_answer": false, "rationales": ["This is an airliner", "A large commercial plane is on a runway at an airport. planes take off from airports.", "The vehicle has wings."], "image": "train2014/COCO_train2014_000000000540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482476, "question_id": "RdW7vkYSa4vuTNe4aFQtHE", "question": "Why are the womans lips so red?", "choices": ["sun burn", "lipstick", "paint", "natural color"], "correct_choice_idx": 1, "direct_answers": ["lipstick", "lipstick", "lipstick", "lipstick", "applied lipstick", "lipstick", "lipsticks", "lipstick", "lipstick", "lipstick"], "difficult_direct_answer": false, "rationales": ["She is wearing makeup", "The woman is wearing lipstick on her lips.", "The woman is wearing a red lipstick."], "image": "val2014/COCO_val2014_000000482476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96716, "question_id": "RdYKZVEtdijw9yojSdL4BX", "question": "What kind of person would spend the most time here?", "choices": ["circus clown", "rancher", "boat captain", "baseball player"], "correct_choice_idx": 2, "direct_answers": ["boat captain", "car", "fisherman", "vacation", "sealers", "sailor", "dock worker", "sailors", "stop", "boater"], "difficult_direct_answer": true, "rationales": ["A dock is filled to capacity.", "Any of these options might spend their time at a dock, but only a would spend the most time.", "The person is a captain."], "image": "train2014/COCO_train2014_000000096716.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196917, "question_id": "ReNEs9zBpz9PSX9Ap7Ci2y", "question": "What is next to the vehicle?", "choices": ["moose", "giraffe", "cow", "monkey"], "correct_choice_idx": 1, "direct_answers": ["giraffe", "giraffe", "giraffe", "bus", "giraffe", "zebra", "giraffe", "giraffe", "giraffe", "giraffe"], "difficult_direct_answer": false, "rationales": ["The giraffe is next to the vehicle.", "It is very tall and has a spotted coat characteristic of this animal. the tour of people is consistant with viewing this type of animal on safari.", "The giraffe is near."], "image": "train2014/COCO_train2014_000000196917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412204, "question_id": "RfEcy4VYjyT2UDKpH8tCy2", "question": "What is below the kites?", "choices": ["people", "dog", "airplane", "cat"], "correct_choice_idx": 0, "direct_answers": ["mountain", "two men", "people", "people", "people", "people", "humans", "ground", "mountain", "people"], "difficult_direct_answer": false, "rationales": ["People are looking.", "Two kite-flyers are standing together as they fly their kites. kites originated in china some 2,000-3,000 years ago.", "Humans fly these kinds of objects in the sky."], "image": "val2014/COCO_val2014_000000412204.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109973, "question_id": "RfnMPWwGAsZ4wWfXMGcDoG", "question": "What shape is the mirror above the white sink of the bathroom?", "choices": ["square", "oval", "rectangle", "round"], "correct_choice_idx": 1, "direct_answers": ["oval", "oval", "oval", "oval", "oval", "oval", "oval", "oval", "oval", "oval"], "difficult_direct_answer": false, "rationales": ["The mirror is an oblong circle.", "The shape is oval.", "The mirror is shaped like an ellipse."], "image": "train2014/COCO_train2014_000000109973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441854, "question_id": "RggAjt6Sbe7EwMKVegVvkB", "question": "What language does pare mean stop?", "choices": ["russian", "romanian", "spanish", "french"], "correct_choice_idx": 2, "direct_answers": ["spanish", "spanish", "spanish", "stop", "spanish", "don't knw", "stop", "spanish", "spanish", "french"], "difficult_direct_answer": false, "rationales": ["Pare is not a romanian, russian, or french word.", "The language is in spanish.", "The word is known in spanish as stop."], "image": "val2014/COCO_val2014_000000441854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533810, "question_id": "RgqhdeLub5ww9uoGRniBwA", "question": "What is pulling the boats on the highway before the river?", "choices": ["sedan", "pickup", "semi", "jeep"], "correct_choice_idx": 2, "direct_answers": ["semi", "truck", "truck", "semi", "semi", "truck", "semi truck", "trailer", "truck", "small boat"], "difficult_direct_answer": false, "rationales": ["The boats are still attached to the bed of the truck. based on the size of the boats and design of the truck bed in addition to the cab of the truck, answer a is accurate.", "It has an area for carriage", "There is the base of an 18-wheeler truck."], "image": "train2014/COCO_train2014_000000533810.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35506, "question_id": "Rh5iacLdu2K3KGvEvo7PJH", "question": "What kind of ball is the dog sitting next to on the concrete?", "choices": ["soccer", "tennis ball", "basketball", "baseball"], "correct_choice_idx": 2, "direct_answers": ["basket ball", "round", "basketball", "car", "basketball", "basketball", "round", "basketball", "basketball", "round"], "difficult_direct_answer": false, "rationales": ["A canine is laying on the ground with an orange ball in front of it.", "The ball is for basketball.", "He is sitting next to a orange basketball."], "image": "train2014/COCO_train2014_000000035506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79084, "question_id": "Rj5MNsN8ArRgkVj8AgKQbf", "question": "What color is the tank top worn by the female tennis player standing ready?", "choices": ["pink", "blue", "brown", "green"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "ross", "ross", "pink", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The top has a tint of red.", "It's a little lighter than red", "The tennis player woman is wearing a shirt similar to red. it is a bit lighter and brighter color though."], "image": "val2014/COCO_val2014_000000079084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324403, "question_id": "RjpME2F6Mv6WT7rJ2M5DH3", "question": "What is on the luggage?", "choices": ["apple", "hat", "cat", "dog"], "correct_choice_idx": 1, "direct_answers": ["hat", "hat", "hat scarf", "cap", "hat", "cat", "hat", "hat", "hat", "hat"], "difficult_direct_answer": false, "rationales": ["A hat is on top of a suitcase.", "The luggage has a hat.", "A hat is on two small square items that have handles. the small square items look generally like briefcases or small suitcases."], "image": "train2014/COCO_train2014_000000324403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5535, "question_id": "RkFuULpWUS22TjDiqWhjeK", "question": "What nation is likely to house this bus on the street?", "choices": ["denmark", "uk", "germany", "usa"], "correct_choice_idx": 1, "direct_answers": ["britain", "us", "nation", "uk", "uk", "america", "country", "american", "usa", "england"], "difficult_direct_answer": true, "rationales": ["The bus was decorated in red, white, and blue colors and had english print on it.", "Based on the design of the bus, the street sign and traffic signals, in addition to the colors on the bus, answer a meets all the criteria.", "The nation is the uk."], "image": "val2014/COCO_val2014_000000005535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504487, "question_id": "RmF56UnUKdgZrbpo9ToqLn", "question": "What are the giraffes near?", "choices": ["boats", "pumpkins", "trees", "cows"], "correct_choice_idx": 2, "direct_answers": ["tree", "tree", "trees", "tree", "trees", "tree", "tree", "tree", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["The giraffes are standing around a couple of trees in the zoo pen.", "They're by trees.", "The giraffes are playing next to some trees."], "image": "val2014/COCO_val2014_000000504487.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512786, "question_id": "RsbjEoN6MWuZNco47bD2gf", "question": "What color is the long vase in the middle of the dresser against the wall?", "choices": ["purple", "tan", "black", "blue"], "correct_choice_idx": 0, "direct_answers": ["purple", "purple", "purple", "purple", "purple", "fefefe", "silver", "purple", "purple", "purple"], "difficult_direct_answer": false, "rationales": ["The tallest vase is purple in color.", "Mixing blue and red make this color.", "The color is purple."], "image": "train2014/COCO_train2014_000000512786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208220, "question_id": "RtRih399Cjy6qB7582Jd63", "question": "What color are the traffic cones to the right underneath of the yellow tape?", "choices": ["white", "orange", "yellow", "blue"], "correct_choice_idx": 1, "direct_answers": ["red", "orange", "orange", "orange", "orange", "red", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The color is orange.", "The color is orange.", "The traffic cones at the right side of the yard are all orange."], "image": "train2014/COCO_train2014_000000208220.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190738, "question_id": "RteAQNqCsX4fpuTVXJzAj3", "question": "Who batted with the same handedness as this batter?", "choices": ["manny ramirez", "rogers hornsby", "mike schmidt", "fred mcgriff"], "correct_choice_idx": 3, "direct_answers": ["rod crew", "ty cobb", "number 16", "catcher", "barry bonds", "catcher", "other player", "na", "fred mcgriff", "tony gwynn"], "difficult_direct_answer": true, "rationales": ["He was a left handed hitter from the 1980s to 2000s.", "He was known to bat using his left hand.", "Fred mcgriff, an american baseball player was a left handed batter."], "image": "val2014/COCO_val2014_000000190738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6864, "question_id": "RvytoUpd85gxpyFiWpCMzy", "question": "What color is the line on the red sign?", "choices": ["green", "black", "purple", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["It is a standard do not enter sign.", "The line is devoid of any color.", "The color is white."], "image": "val2014/COCO_val2014_000000006864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465911, "question_id": "Rw9ziaZJmsm4Z2GrZvkvBJ", "question": "What video game featured this activity?", "choices": ["madden 21", "mlb 20", "nhl 20", "wonder boy"], "correct_choice_idx": 3, "direct_answers": ["wii", "wii", "tony hawk", "skateboarding", "tony hawk", "tony hawk", "skating", "tony hawk", "skater xl", "wonder boy"], "difficult_direct_answer": false, "rationales": ["Wonder boy is a skateboarding game.", "Out of all the choices given the first answer is the only viable one as you play using a skateboard in game.", "This activity is skateboarding, not hockey, football, or baseball."], "image": "val2014/COCO_val2014_000000465911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 63879, "question_id": "RxHGxn6XHpPfQtRqcRpKSK", "question": "What does it say on the clock?", "choices": ["2306", "254", "307", "0000"], "correct_choice_idx": 0, "direct_answers": ["2306", "2306", "2306", "time", "time", "2306", "2306", "2306", "2306", "2306"], "difficult_direct_answer": false, "rationales": ["The clock says 2306.", "The clock is identified by the size, design and display. the digits are clearly visible on the clock and read as answer a.", "The clock is clearly visible and the digits are readable."], "image": "train2014/COCO_train2014_000000063879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380854, "question_id": "RxQDYiczJ5PUY7SqpKeEGN", "question": "What are the initials likely representing?", "choices": ["town name", "pet name", "business name", "family name"], "correct_choice_idx": 3, "direct_answers": ["no clue", "names", "good", "names", "couple", "homeowner", "first names", "first names", "family name", "bathroom items"], "difficult_direct_answer": false, "rationales": ["This looks like a home bathroom where people usually decorate with things that are representative of the owners and occupants making answer a most likely.", "It is the first initials of names of people who live there", "These are initials for people"], "image": "val2014/COCO_val2014_000000380854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 86239, "question_id": "RxvRp3B4rnMYxrQXF7g44r", "question": "What is on the floor next to the toilet?", "choices": ["cat", "apple", "brush", "baby"], "correct_choice_idx": 2, "direct_answers": ["brush", "brush", "toilet brush", "brush", "brush", "brush", "scrubber", "brush", "brush", "brush"], "difficult_direct_answer": false, "rationales": ["The yellow item is used to scrub the toilet.", "There is a brush by the toilet.", "The item has a long handle and white bristles useful for scrubbing things."], "image": "train2014/COCO_train2014_000000086239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 580161, "question_id": "Ry8swWou5ffTRN2jeGbGTu", "question": "What animal is behind the fence?", "choices": ["dog", "cat", "fox", "polar bear"], "correct_choice_idx": 3, "direct_answers": ["bear", "com", "polar bear", "polar bear", "polar bear", "polar bear", "bear", "polar bear", "animal", "bear"], "difficult_direct_answer": false, "rationales": ["The bear is behind.", "It's large and furry like a bear and white which makes it a polar bear.", "Polar bears are big and white."], "image": "train2014/COCO_train2014_000000580161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97195, "question_id": "RzQ8MXkWH5YVScY2bd4r2S", "question": "How many zebras are eating hay from the trough?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "water", "two", "two", "two", "two", "three"], "difficult_direct_answer": false, "rationales": ["There are two zebras.", "It's extremely hard to tell from the image, but it appears to be the answer based on the number of heads.", "The zebra on the left is beside a different zebra."], "image": "train2014/COCO_train2014_000000097195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462549, "question_id": "RzrtGwBewp6WgnBQbSgGHY", "question": "Who is a competitor of this company?", "choices": ["dunkin donuts", "nathans", "office max", "home depot"], "correct_choice_idx": 0, "direct_answers": ["dunkin donuts", "dunkin", "na", "dunkin", "people", "car", "dunkin donuts", "people", "dunkin donuts", "dunkin donuts"], "difficult_direct_answer": false, "rationales": ["The competitor is dunkin.", "Dunkin donuts is a kind of donut. the company is krispy kreme and this is a box of donuts.", "The boxes stacked here read krispy kreme a donut company; we also see one of these boxes propped open and it's full of donuts. dunkin donuts is also a donut seller."], "image": "train2014/COCO_train2014_000000462549.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100827, "question_id": "S3YVVmbmm8zDWngrRxTyCi", "question": "What color is the neon sign on the second story of this building?", "choices": ["violet", "blue", "pink", "red"], "correct_choice_idx": 3, "direct_answers": ["white", "red", "red", "red", "red", "green", "orange", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The red is the color that is secondary.", "The neon sign is not pink, blue, or violet.", "The color is red."], "image": "train2014/COCO_train2014_000000100827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1811, "question_id": "S3y4JRHrfPe6jDrRhnF6Tz", "question": "What is in the can near the skateboarder that is in the air?", "choices": ["soda", "tomato sauce", "water", "egg whites"], "correct_choice_idx": 0, "direct_answers": ["soda", "cola", "coke", "soda", "soda", "coke", "coca-cola", "soda", "soda", "coca cola"], "difficult_direct_answer": false, "rationales": ["The can is for soda.", "The soda is inside.", "The red can contains a beverage."], "image": "train2014/COCO_train2014_000000001811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229312, "question_id": "S4cogviHVPDYdkJaMzBMs2", "question": "What are the people standing in front of?", "choices": ["cats", "trees", "book shelves", "apples"], "correct_choice_idx": 1, "direct_answers": ["forest", "trees", "trees", "trees", "trees", "trees", "trees", "trees", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["The people are by trees.", "The background contains large tall plants with green leaves which are known as these.", "The people are in front of trees."], "image": "train2014/COCO_train2014_000000229312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 98048, "question_id": "S5ZoSG9FFfhTE9TGXFtYkX", "question": "What color is the non-white stripe on the sail of this small boat?", "choices": ["orange", "blue", "red", "purple"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "red", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The color is orange.", "The sail is an orange color.", "The colored stripe is the color of a carrot."], "image": "val2014/COCO_val2014_000000098048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232047, "question_id": "S5bf3YHRqbd8HiUXzkwNqK", "question": "How many double-decker buses are loading on the left side of the street?", "choices": ["one", "four", "two", "six"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "eight", "two"], "difficult_direct_answer": false, "rationales": ["Two buses are on the road.", "Two can be seen.", "There are 2."], "image": "train2014/COCO_train2014_000000232047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7559, "question_id": "S6QjyBNqByANhuZSAfnm4d", "question": "Where are these toothbrushes likely located?", "choices": ["doctors office", "school", "home", "dentists office"], "correct_choice_idx": 3, "direct_answers": ["dentists office", "orthodontist", "sink", "cub", "watching", "cup", "dentist", "bathroom", "bathroom", "bathroom"], "difficult_direct_answer": false, "rationales": ["Dentists often give these away for teeth health.", "The brushes are in the dentist's office.", "The brushes are at the dentist's."], "image": "val2014/COCO_val2014_000000007559.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107247, "question_id": "S99GkuGqdcybZ9YjnVXJTX", "question": "What country does this green and white bus likely operate in?", "choices": ["france", "uk", "germany", "usa"], "correct_choice_idx": 1, "direct_answers": ["south africa", "america", "uk", "american", "usa", "america", "france", "america", "america", "england"], "difficult_direct_answer": false, "rationales": ["The double decker bus most likely operates in the uk.", "The drums around the bus are the same as the uk.", "The country is the uk."], "image": "val2014/COCO_val2014_000000107247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64348, "question_id": "S9oW2VWUCvtvrJxcw395dS", "question": "How many sheep are standing around in the cape field?", "choices": ["two", "four", "six", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "two", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["Two sheep are nuzzling and a third is behind them.", "There are three sheep.", "One is by itself and the other two are snuggled together"], "image": "train2014/COCO_train2014_000000064348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36237, "question_id": "SAudAVNot6Kr8vCZ9zFZ9h", "question": "What is the hat made of?", "choices": ["straw", "string", "cotton", "twine"], "correct_choice_idx": 0, "direct_answers": ["straw", "fabric", "straw", "straw", "straw", "lether", "straw", "plastic", "straw", "straw"], "difficult_direct_answer": false, "rationales": ["The hat is made from this coarse material.", "It is woven from plant fiber", "There is a straw hat."], "image": "train2014/COCO_train2014_000000036237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302343, "question_id": "SBHrFXKxV8SYtbADCzoKDo", "question": "What is usually found in this environment?", "choices": ["fish", "tigers", "polar bears", "cows"], "correct_choice_idx": 0, "direct_answers": ["boats", "snakes", "crocodiles", "fish", "fish", "alligators", "lake", "fish", "ship", "fish"], "difficult_direct_answer": false, "rationales": ["Fish are in this environment.", "Fish live in water.", "The fish are found."], "image": "train2014/COCO_train2014_000000302343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351550, "question_id": "SCBbkrognupYn2qULvmKVq", "question": "What is the cat peaking out from behind?", "choices": ["closet door", "computer", "box", "table"], "correct_choice_idx": 1, "direct_answers": ["computer screen", "computer", "laptop", "laptop", "computer", "monitor", "laptop", "computer", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["The computer is in front of the cat.", "The cat is by the computer.", "The cat is looking from the computer."], "image": "train2014/COCO_train2014_000000351550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484019, "question_id": "SDhwvDS4jXJ7SzDRAuhCEw", "question": "What does the French word Rue mean in English?", "choices": ["street", "north", "south", "east"], "correct_choice_idx": 0, "direct_answers": ["street", "road", "regret", "street", "street", "maison", "road", "road", "street", "street"], "difficult_direct_answer": false, "rationales": ["It is used as another word for road.", "The french word rue means road.", "The word means street."], "image": "train2014/COCO_train2014_000000484019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461365, "question_id": "SEbk5veJs8RpEfunZEU2Vo", "question": "What color is the sheet covering the small twin bed on the corner of the room?", "choices": ["green", "blue", "pink", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "gold", "gold", "gold", "gold", "yellow", "gold", "gold", "gold", "gold"], "difficult_direct_answer": false, "rationales": ["Noe of these options are correct. the sheet is white and the cover is gold and yellow.", "The bed covering is the bright color.", "The color is yellow."], "image": "train2014/COCO_train2014_000000461365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489700, "question_id": "SFcMSML3egNy2enq5pfMT3", "question": "What is the white object on the coffee table called?", "choices": ["doily", "towel", "cover", "tablecloth"], "correct_choice_idx": 0, "direct_answers": ["table cloth", "runner", "doily", "rug", "doily", "runner", "table runner", "doily", "table cloth", "table cloth"], "difficult_direct_answer": false, "rationales": ["It is a piece of decoration for the table.", "The object is a doily.", "The object is a doily."], "image": "train2014/COCO_train2014_000000489700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362825, "question_id": "SFtaKSip633ftQApswgTUY", "question": "What is on the plate?", "choices": ["pancake", "giant spoon", "donut", "lawnmower"], "correct_choice_idx": 2, "direct_answers": ["donut", "donuts", "donuts", "donut", "donut", "donut", "donuts", "doughnut", "doughnut", "cake"], "difficult_direct_answer": false, "rationales": ["Donuts are on the plate.", "The round food with a hole in the middle is often served with coffee.", "The plate on the table contains two glazed donuts."], "image": "train2014/COCO_train2014_000000362825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306975, "question_id": "SFujMiqaKReiBBCcv9q4Cs", "question": "What does the animal have?", "choices": ["wings", "gills", "talons", "tusks"], "correct_choice_idx": 3, "direct_answers": ["elephant", "tusk", "tusks", "carrying", "tusks", "tusks", "tusks", "trunk", "tusks", "tusks"], "difficult_direct_answer": false, "rationales": ["The elephant has ivory tusks.", "The animal is an elephant, not a bird or fish.", "The animal has tusks."], "image": "train2014/COCO_train2014_000000306975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19967, "question_id": "SGRaH2XRHWH3q37ugdAfZp", "question": "How many donuts are held by the persons inside of this van vehicle?", "choices": ["four", "three", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "one", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The hands and donuts of the people are clearly visible and answer a is correct.", "One donut is beside an additional one.", "There are two donuts."], "image": "train2014/COCO_train2014_000000019967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68163, "question_id": "SGqCXmnDFaZDabAVDvuHCn", "question": "What color are the bikes lining on the left side of this hallway?", "choices": ["purple", "green", "blue", "orange"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "green", "green", "stop", "green", "blue", "green", "blue", "green"], "difficult_direct_answer": false, "rationales": ["The motorcycles have a green exterior.", "The color is green.", "This is primarily the color of the single one."], "image": "train2014/COCO_train2014_000000068163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92585, "question_id": "SHAwvVMVsNmDu4KvT39pJN", "question": "What movie is related to the word on the boat?", "choices": ["lobster", "die hard", "cats", "small soldiers"], "correct_choice_idx": 0, "direct_answers": ["jaws", "jaws", "jaws", "aragosta", "aragorn", "jaws", "armasted", "unkown", "lobster", "jaws"], "difficult_direct_answer": false, "rationales": ["Die hard was the movie.", "The boat says \"lobster\" in spanish.", "The boat takes lobsters in."], "image": "train2014/COCO_train2014_000000092585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505849, "question_id": "SHB4vVEHgYmGeZxCCLwicg", "question": "How many places could an animal get water from here?", "choices": ["three", "five", "eight", "six"], "correct_choice_idx": 0, "direct_answers": ["four", "three", "three", "two", "two", "three", "three", "three", "two", "zero"], "difficult_direct_answer": false, "rationales": ["An animal could get water from the sink, toilet, or bidet.", "There is a sink, toilet and bidet", "There are three places."], "image": "val2014/COCO_val2014_000000505849.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262353, "question_id": "SJHGarA2zThsMKzLVsBcAa", "question": "Which color outfit is bright here?", "choices": ["yellow", "orange", "purple", "black"], "correct_choice_idx": 1, "direct_answers": ["dwdwd", "orange", "blue", "blue", "teal", "red", "orange", "orange", "blue", "orange"], "difficult_direct_answer": false, "rationales": ["It is the most visible in the room.", "The outfit is orange.", "I would argue that the blue and silver shirt is brighter, but the a color option also stands out."], "image": "val2014/COCO_val2014_000000262353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444949, "question_id": "SKE3ryScxybtAZDourUgTh", "question": "What is the person doing on the plane?", "choices": ["sleeping", "balancing", "washing it", "eating food"], "correct_choice_idx": 1, "direct_answers": ["tricks", "standing", "dancing", "kicking", "standing", "posing", "tricks", "balancing", "performing", "acrobatics"], "difficult_direct_answer": false, "rationales": ["They are doing tricks on the wing", "The person is balancing.", "The person standing on top of the airplane is balancing while in flight."], "image": "train2014/COCO_train2014_000000444949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234470, "question_id": "SKc9B4MAzbKaZojwcmrEzi", "question": "What color is the water kettle on the top of the oven in the back of the kitchen?", "choices": ["green", "red", "blue", "yellow"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "orange", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["It's red.", "The color is red.", "The water kettle on the back of the kitchen is red."], "image": "val2014/COCO_val2014_000000234470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415190, "question_id": "SLECyVhXWivsJThCrsj4Xa", "question": "What state has a name closest to the name that is found on the vehicle?", "choices": ["virginia", "new jersey", "fordham", "delaware"], "correct_choice_idx": 0, "direct_answers": ["virginia", "virginia", "virginia", "virginia", "us", "virgin", "virginia", "aeroplane", "virginia", "virginia"], "difficult_direct_answer": false, "rationales": ["A simple comparison reveals this fact. that said, virgin atlantic isn't headquartered there.", "It's obvious given the shared letters. that said, it doesn't originally come out of this state.", "The vehicle is an airplane that has a virgin atlantic livery."], "image": "train2014/COCO_train2014_000000415190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235788, "question_id": "SLf4uS3KaYah2NS2ttqVxS", "question": "Which country is the producer of cars like the red one here?", "choices": ["uk", "italy", "germany", "france"], "correct_choice_idx": 2, "direct_answers": ["america", "usa", "germany", "germany", "germany", "germany", "brazil", "germany", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["The van is a volkswagen.", "The rounded bus here pictured bears a v w logo on it's front. the vw company was founded in germany.", "Volkswagen cars are made in germany"], "image": "val2014/COCO_val2014_000000235788.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300303, "question_id": "SM3w7Cffo3ti52xLuaGUzi", "question": "What color is the area of the train car around the window?", "choices": ["orange", "cream", "white", "pink"], "correct_choice_idx": 1, "direct_answers": ["yellow", "brown", "beige", "cream", "white", "yellow", "tan", "yellow", "yellow", "green"], "difficult_direct_answer": false, "rationales": ["The color is cream.", "The top half is this color and the bottom is red", "The color of the area around the outside of the train is a yellowish/off white color."], "image": "val2014/COCO_val2014_000000300303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 509807, "question_id": "SMHLDba2YeWSLeyf3PFQvm", "question": "What is the plate on?", "choices": ["egg carton", "counter top", "towel", "cardboard box"], "correct_choice_idx": 1, "direct_answers": ["oven", "counter top", "kitchen countertop", "table", "counter", "stop", "countertop", "counter", "table", "vdfv"], "difficult_direct_answer": false, "rationales": ["The plate is on the counter.", "It's obvious and none of the other items are under the plate.", "The plate is on the counter."], "image": "val2014/COCO_val2014_000000509807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305431, "question_id": "SMwbaNUUtvJPvEMUnKV9eN", "question": "What do these animals produce?", "choices": ["silk", "venison", "beef", "lamb chops"], "correct_choice_idx": 2, "direct_answers": ["milk", "cow", "milk", "beef", "milk", "beef", "milk", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["These are cows and that is the name of their meat", "It is a cow, which produces many different cuts of beef.", "The animals are cows and are only raised for their meat or milk."], "image": "train2014/COCO_train2014_000000305431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276467, "question_id": "SMzwgrCRrzcVdjqfF2gDfn", "question": "What color is the spray painted color on the back of the little lambs?", "choices": ["green color", "blue color", "pink color", "orange color"], "correct_choice_idx": 2, "direct_answers": ["red", "grey", "white", "red", "red", "red", "red", "red", "pink", "pink color"], "difficult_direct_answer": false, "rationales": ["Sheep are in a pasture with babies behind them. the smaller sheep have pink stripes on them.", "The sheep at the back have pink writing.", "The sheep are colored pink."], "image": "train2014/COCO_train2014_000000276467.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135608, "question_id": "SNF8Lf87XyFwLtLgsN9dKv", "question": "How many graffiti pictures are on the overpass wall?", "choices": ["one", "two", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "many", "three", "three", "two", "three", "three", "three", "one", "three"], "difficult_direct_answer": false, "rationales": ["There are 3 pictures.", "There are three pictures.", "There are two pictures drawn."], "image": "train2014/COCO_train2014_000000135608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540806, "question_id": "SNM2FpXRt9yA3vWj45r4oV", "question": "What is next to the vehicle?", "choices": ["walrus", "antelope", "airplane", "bridge"], "correct_choice_idx": 3, "direct_answers": ["bridge", "dotted line", "sidecar", "bridge", "traffic wall", "bridge", "bridge", "corridor", "bridge", "bridge"], "difficult_direct_answer": false, "rationales": ["The bike is on a bridge.", "The bridge is near the vehicle.", "The vehicle is traveling on a road. a structure is to the left of the vehicle."], "image": "val2014/COCO_val2014_000000540806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522978, "question_id": "SNxEqtCUNpQjcycjsSZoKb", "question": "What is the color of the man's shorts who is getting ready to bat the ball?", "choices": ["pink", "green", "red", "purple"], "correct_choice_idx": 1, "direct_answers": ["grey", "beige", "grey", "green", "green", "grey", "brown", "brown", "grey", "green"], "difficult_direct_answer": false, "rationales": ["The color is green.", "The batter's shorts are not purple, pink, or red.", "The man that is getting ready to bat the ball is wearing dull green pants."], "image": "train2014/COCO_train2014_000000522978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457461, "question_id": "SPNjRyN6qzBJ2PjCSi8zgN", "question": "What color is the stripe running around the circumference of the plate?", "choices": ["blue", "purple", "brown", "red"], "correct_choice_idx": 2, "direct_answers": ["dark brown", "brown", "brown", "brown", "brown", "brown", "brown", "black", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["It has the brown color outside it.", "The color is brown.", "The color is brown."], "image": "val2014/COCO_val2014_000000457461.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352118, "question_id": "SPUDiqDvg4Dc4QsZor6bf9", "question": "What can you tell about the giraffe in the foreground by looking at its ossicones?", "choices": ["female", "breed", "male", "age"], "correct_choice_idx": 0, "direct_answers": ["age", "female", "it's eating", "very tall", "nothing", "male giraffe", "fwfef", "older male", "neck", "tallest"], "difficult_direct_answer": true, "rationales": ["The giraffes have thin and tufted ossicones, common in females.", "The giraffe is a female.", "These grow as they get older and both male and females have them"], "image": "train2014/COCO_train2014_000000352118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169562, "question_id": "ST6C9dkcrhHNKmQKbEksad", "question": "What is the pillow supposed to look like?", "choices": ["egg", "car", "pumpkin", "mouse"], "correct_choice_idx": 2, "direct_answers": ["pumpkin", "kitties", "stop", "pumpkin", "pumpkin", "beddings", "pumpkin", "pumpkin", "bed", "pumpkin"], "difficult_direct_answer": false, "rationales": ["There is an orange pillow with eyes and a green talk on top sitting on a couch.", "The pillow is a pumpkin.", "A pillow is orange and has a green stem on top."], "image": "val2014/COCO_val2014_000000169562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190939, "question_id": "STm2E87ibQXYBjiK2B4rXP", "question": "What color is the egg on the sandwich to the left?", "choices": ["green", "white", "blue", "yellow"], "correct_choice_idx": 1, "direct_answers": ["white", "yellow", "white", "white", "white", "white", "yellow", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["It's the white part of an egg.", "The color is white.", "The white egg is on the sandwich on the left."], "image": "train2014/COCO_train2014_000000190939.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451951, "question_id": "SUH5ERyDDHEkpFAGDhDkAA", "question": "What is the man with the drink at risk of?", "choices": ["falling hazard", "fire hazard", "flood hazard", "slipping hazard"], "correct_choice_idx": 0, "direct_answers": ["falling", "falling hazard", "falling", "falling", "skate riding", "falling", "falling", "skating", "falling", "falling"], "difficult_direct_answer": false, "rationales": ["The man would fall over.", "The man could fall.", "The man with the drink is at the risk of falling off his skateboard."], "image": "val2014/COCO_val2014_000000451951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198367, "question_id": "SWb8fthCsYXXZHuqLG46A4", "question": "Where are these items usually found?", "choices": ["under pillows", "underground", "on trees", "in caves"], "correct_choice_idx": 2, "direct_answers": ["on trees", "dining", "trees", "grocery store", "kitchen", "kitchen", "trees", "kitchen", "kitchen", "trees"], "difficult_direct_answer": false, "rationales": ["These items are bananas and apples. they usually are not found underground, in caves, or under pillows.", "These fruit items are commonly found in trees.", "These are apples and bananas that grow on plants"], "image": "val2014/COCO_val2014_000000198367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207180, "question_id": "SWd6AYH6KbzMisVkgRdVZp", "question": "What is on the left side of the room?", "choices": ["barrel", "wheelbarrow", "bench", "apple cart"], "correct_choice_idx": 2, "direct_answers": ["bench", "people", "building", "benches", "bench", "church", "benches", "benches", "lights", "benches"], "difficult_direct_answer": false, "rationales": ["None of the other options make sense for this image. there are actually more than one of the a objects.", "The furniture is long has no backside or armrests.", "A object farthest left is a bench"], "image": "val2014/COCO_val2014_000000207180.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442367, "question_id": "SWofqXtFQnXmAqxqvgmfAj", "question": "What kind of area in town is this?", "choices": ["shopping area", "industrial", "residential area", "office buildings"], "correct_choice_idx": 0, "direct_answers": ["street", "downtown", "walking street", "mainstreet", "street", "market", "street", "downtown", "shopping area", "shopping area"], "difficult_direct_answer": false, "rationales": ["The area is for shopping.", "This is the shopping area of town.", "This seems to be the case given all of the retailers."], "image": "val2014/COCO_val2014_000000442367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360313, "question_id": "SYH2NgDhfV9N5szwzm3Uos", "question": "What are the seats around?", "choices": ["football field", "basketball court", "cow", "pool"], "correct_choice_idx": 3, "direct_answers": ["pool", "chairs", "chairs", "pool", "beach chairs", "pool", "pool", "pool", "pool", "swimming pool"], "difficult_direct_answer": false, "rationales": ["Chairs are lined up around a large square pool.", "The seas are by a pool.", "The seats are at the pool."], "image": "train2014/COCO_train2014_000000360313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496457, "question_id": "SYwWKQpQjK4nKxRwcr2aa4", "question": "What are the people playing?", "choices": ["checkers", "video games", "chess", "tennis"], "correct_choice_idx": 1, "direct_answers": ["wii", "video game", "wii", "wii", "wii", "wii", "wii game", "wii", "videogames", "video games"], "difficult_direct_answer": false, "rationales": ["People are in a living room holding game controllers.", "The people are holding controllers and playing video games.", "They're using wii controllers."], "image": "train2014/COCO_train2014_000000496457.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135045, "question_id": "SZQkmeze52xV5BpAL5CnaR", "question": "What type of waterway is seen here?", "choices": ["pond", "river", "lake", "canal"], "correct_choice_idx": 3, "direct_answers": ["water", "river", "canal", "river", "canal", "little", "canal", "river", "river", "natural"], "difficult_direct_answer": false, "rationales": ["The waterway appears man made based on the shape, straightness and the walls on either side which would be consistent with answer a.", "It is contained by man made structures in a city", "The waterway is a canal."], "image": "train2014/COCO_train2014_000000135045.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40926, "question_id": "SZZ2LQiNt5fbXWNU4CL6XC", "question": "How many birds are clinging on the side of this giraffe's neck?", "choices": ["four", "six", "three", "one"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "two", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three birds sitting on the top of the giraffe's neck.", "One is at the top and two are down below", "There is one winged and feathered animal near the top and two more near the bottom of the image."], "image": "val2014/COCO_val2014_000000040926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350303, "question_id": "SbkDe2nFMy5A5FaLR565kc", "question": "What word would best describe this room?", "choices": ["fancy", "miniature", "dilapidated", "messy"], "correct_choice_idx": 0, "direct_answers": ["antique", "fancy", "hotel", "old", "ornate", "regal", "dark", "ornate", "fancy", "fancy"], "difficult_direct_answer": false, "rationales": ["The word is fancy.", "The word is fancy.", "The style and design of the chairs, windows, and the clean, tiled floor make it seem like an upscale environment."], "image": "train2014/COCO_train2014_000000350303.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259666, "question_id": "ScEUv9nCuncNWgDmkS9La3", "question": "What color is the main stripe on the right side of the queen sized bed?", "choices": ["yellow", "white", "pink", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "white", "red", "red", "red", "white", "peach", "red", "red"], "difficult_direct_answer": false, "rationales": ["It is the biggest one on this side", "The stripes on the right side of the mattress are red.", "The color is red."], "image": "train2014/COCO_train2014_000000259666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113762, "question_id": "Sf5AqRkFDMCWqiTNdre82n", "question": "What is the paneling made of which is covering the walls?", "choices": ["cedar", "oak", "birch", "pine"], "correct_choice_idx": 3, "direct_answers": ["metal", "wood", "wood", "wood", "wood", "wood", "sheet", "wood", "pine", "wood"], "difficult_direct_answer": false, "rationales": ["Knotty a is likely the correct answer because of the obvious knots.", "The paneling is made of pine.", "The panelling covering the walls is made of pine."], "image": "train2014/COCO_train2014_000000113762.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375198, "question_id": "Sf9qPwuQ2VXnk7V7Qyaubf", "question": "What might this place be?", "choices": ["grocery store", "gas station", "farmers market", "restaurant"], "correct_choice_idx": 2, "direct_answers": ["farmers market", "market", "market", "market", "market", "market", "market", "market", "store", "hawaii"], "difficult_direct_answer": false, "rationales": ["They are in an open air environment so they are not in a building.", "Fruits and vegetables are being sold. the area is not structured like a grocery store, gas station, or restaurant.", "This is a place they sell fruits and vegetables"], "image": "val2014/COCO_val2014_000000375198.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72124, "question_id": "SfDFvS2AyegGmqCKwPm8Ke", "question": "What is the purpose of the red and yellow trucks?", "choices": ["fight crime", "stop fires", "deliver food", "deliver packages"], "correct_choice_idx": 1, "direct_answers": ["fire", "fire engines", "firetruck", "fire extinguisher", "fires", "fire", "signal", "fire elimination", "stop fires", "fight fire"], "difficult_direct_answer": true, "rationales": ["The firetrucks are supposed to stop fires.", "The purpose is to stop fires.", "The purpose is to stop fires."], "image": "train2014/COCO_train2014_000000072124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416101, "question_id": "SgErVGSgxHsxU8sgssYHFc", "question": "What is the person holding in their hands?", "choices": ["tennis balls", "straws", "rocks", "doves"], "correct_choice_idx": 0, "direct_answers": ["racket", "tennies", "ball", "tennis balls", "ball", "car", "ball racket", "tennis bat", "tennis balls", "tennis balls"], "difficult_direct_answer": false, "rationales": ["A person is on a tennis court, holding a racket, with two yellow, round objects in his hands.", "The person has a tennis ball.", "The person has tennis balls."], "image": "val2014/COCO_val2014_000000416101.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32699, "question_id": "Sgb4hPKY2LWd5dx7HY4RZt", "question": "What character is often depicted eating the item in the bowl that is on top of the table with the white covering?", "choices": ["bugs bunny", "garfield", "donkey kong", "crash bandicoot"], "correct_choice_idx": 2, "direct_answers": ["monkey", "child", "donkey kong", "business man", "banana", "curious george", "fruit", "monkey", "monkey", "tony tiger"], "difficult_direct_answer": false, "rationales": ["These are bananas and monkeys like them", "The character donkey kong eats bananas.", "There is a bowl of bananas on the kitchen counter which are often eaten by donkey kong."], "image": "train2014/COCO_train2014_000000032699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275565, "question_id": "SiXgisWB37XuJoZHAgZqYG", "question": "Ho wmany bagels are on the tray where the woman is operating tongs?", "choices": ["two", "four", "five", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "one", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["They are in a triangle shape", "A woman is leaning over a pan with bagels on it. there are three bagels on the pan.", "There are 3 on the bottom tray and two on the top."], "image": "train2014/COCO_train2014_000000275565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424632, "question_id": "SimvCMNbqEXeeMRK8398QX", "question": "What is the appliance on the counter called?", "choices": ["microwave", "convection oven", "blender", "mixer"], "correct_choice_idx": 0, "direct_answers": ["microwave", "microwave", "microwave", "refrigerator", "microwave", "microwave", "microwave", "microwave", "oven", "microwave"], "difficult_direct_answer": false, "rationales": ["A microwave sits on the counter near the fridge.", "That is used to heat food quickly.", "The appliance is a microwave."], "image": "train2014/COCO_train2014_000000424632.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 560587, "question_id": "SkTqfAoGmCm3z4xpqjU9w4", "question": "How many giraffes are walking together in front of the herd of buffalo?", "choices": ["three", "two", "six", "five"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene.", "A small giraffe is walking behind a larger giraffe.", "There is an adult giraffe and a baby."], "image": "train2014/COCO_train2014_000000560587.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434897, "question_id": "SkxyYM6N34xczsFAS363Sk", "question": "What color is the cell phone which the man talks on?", "choices": ["pink", "white", "black", "gray"], "correct_choice_idx": 3, "direct_answers": ["silver", "white", "gray", "gray", "silver", "grey", "silver", "grey", "gray", "silver"], "difficult_direct_answer": false, "rationales": ["It's also described though as metal or silver when sold as a product.", "It's more like a light a or silver even.", "The phone in his hand is this color."], "image": "val2014/COCO_val2014_000000434897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482002, "question_id": "SmuV3Dzj3L95Tc5ksTV8js", "question": "What is the woman using?", "choices": ["telephone", "car", "axe", "phone book"], "correct_choice_idx": 0, "direct_answers": ["mobile", "cell phone", "phone", "cell phone", "phone", "phone", "telephone", "phone", "cell phone", "phone"], "difficult_direct_answer": false, "rationales": ["A woman is holding a white object up to her ear.", "The woman is holding a pink cellphone to her ear.", "The size, shape and manner of the object in the woman's hand is consistent with answer a."], "image": "train2014/COCO_train2014_000000482002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555308, "question_id": "SnStzTuwZBfj4VuBFLUNMj", "question": "How many aircraft are on the tarmac together about to go to the docking bay?", "choices": ["four", "three", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "one", "two", "two", "many", "one", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are a couple of planes on the tarmac.", "There are 2.", "There are two aircrafts."], "image": "train2014/COCO_train2014_000000555308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419610, "question_id": "SnerK2s53msNy6wDLmjnBT", "question": "What color is painted on the metal frame of the bicycle parked in front of the cake store?", "choices": ["black", "red", "pink", "green"], "correct_choice_idx": 0, "direct_answers": ["purple", "black", "black", "maroon", "dark brown", "black", "purple", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["A dark colored bike is parked in front of a store with pastries in the window. bakeries that make cake often advertise pastries and baked goods in their windows.", "The color is black.", "The color is black."], "image": "train2014/COCO_train2014_000000419610.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 569526, "question_id": "Sohf9rZfFuNSbU7BeD5yWX", "question": "What is the child most likely swinging at?", "choices": ["video game", "slider", "ant", "butterfly"], "correct_choice_idx": 1, "direct_answers": ["baseball", "slider", "baseball", "baseball", "baseball", "baseball", "bat", "baseball", "ball", "baseball"], "difficult_direct_answer": false, "rationales": ["The kid will swing at the slider.", "The child is playing baseball and is currently batting and swinging at a ball that would have been pitched to them. answer a is the only value on the list that describes a type of pitch.", "The answer would be a ball and i'm not sure what a would refer to other than when someone slides to home plate, which isn't shown here."], "image": "val2014/COCO_val2014_000000569526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359904, "question_id": "Sr7f8bHdAttbkY88kWpWvH", "question": "How many fine collectors are contained by the post on the sidewalk?", "choices": ["two", "four", "one", "three"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "one", "two", "two", "two", "two", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["There are a pair of meters being used.", "There are 2.", "There are two collectors."], "image": "train2014/COCO_train2014_000000359904.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542818, "question_id": "SrBZW5bkDzEC3tM29G4tkB", "question": "Why is the water brown?", "choices": ["sand", "leeches", "dirt", "fish"], "correct_choice_idx": 2, "direct_answers": ["dirt", "mud", "dirty", "muddy river", "mud", "nature color", "dirt", "dirty", "dirt", "dirty"], "difficult_direct_answer": false, "rationales": ["The water has dirt.", "The water is filled with dirt and is the color of mud.", "The water is dirty."], "image": "train2014/COCO_train2014_000000542818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445325, "question_id": "SsHBts7NBw3PSnRRKQK7yF", "question": "How would you classify the activity these are used for?", "choices": ["work", "science", "sports", "school"], "correct_choice_idx": 2, "direct_answers": ["snowboarding", "snowboarding", "skating", "snowboarding", "sports", "snowboarding", "skating", "snowboarding", "skate board", "staking"], "difficult_direct_answer": false, "rationales": ["These are snowboards, used for recreation.", "The activity is for sports.", "The objects are snowboards."], "image": "train2014/COCO_train2014_000000445325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376422, "question_id": "StwSRg4tV5WxJvngmWgMic", "question": "What color is the bow tie on the big teddy bear in the poster?", "choices": ["red", "green", "blue", "white"], "correct_choice_idx": 2, "direct_answers": ["gray", "black", "blue", "blue", "silvery grey", "purple", "grey", "white", "silver", "blue"], "difficult_direct_answer": false, "rationales": ["The color is easily visible and bright. it is in sharp contrast to the brown bears. it is similar to the color of the sky.", "It looks more like a lilac or light purple, but a is the closest possible.", "The color is blue."], "image": "train2014/COCO_train2014_000000376422.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333164, "question_id": "SuFTUpzUBXcagUJgr3FKj5", "question": "What is the man adjusting?", "choices": ["hat", "belt", "tie", "glasses"], "correct_choice_idx": 2, "direct_answers": ["tie", "tie", "tie", "tie", "tie", "tie", "tie", "tie", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["The man has a tie.", "The man is adjusting his tie.", "The tie is part of his official attire and is on his collar."], "image": "train2014/COCO_train2014_000000333164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 442811, "question_id": "SvcupKyQ8U6CMSyedouYiE", "question": "What is a term used for this place?", "choices": ["first base", "grapes", "hoop", "downhill"], "correct_choice_idx": 3, "direct_answers": ["snowboarding resort", "sking", "resort", "snow", "ski slope", "ski area", "snow mountain", "downhill", "ski resort", "slope"], "difficult_direct_answer": true, "rationales": ["This is a place they can ski down.", "A large downhill snow slope has many skiers attempting to go down it with many tracks made in it.", "The term is downhill."], "image": "train2014/COCO_train2014_000000442811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55402, "question_id": "SwRXfqudTRcBpvDBSX6LR2", "question": "How many giraffes are there in this wildlife conservatory shot?", "choices": ["five", "four", "six", "three"], "correct_choice_idx": 0, "direct_answers": ["five", "five", "two", "two", "six", "four", "four", "four", "three", "five"], "difficult_direct_answer": false, "rationales": ["There are five giraffes in the picture.", "There are two in front and three in the back but the animal by the others is not a giraffe.", "There are a of them and one deer as indicated by the extra legs in the foreground, the neck to the left and the two far in the background. the deer is next to the two in the foreground."], "image": "train2014/COCO_train2014_000000055402.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264615, "question_id": "Swa2JmEtgWXBEJWDV4VhgE", "question": "What kind of fencing material is used to enclose this pasture of cows?", "choices": ["wire link", "electrified wire", "wood", "cast iron"], "correct_choice_idx": 2, "direct_answers": ["wood", "wood", "wood", "steel", "grass", "metal", "metal", "pole", "iron", "wood"], "difficult_direct_answer": false, "rationales": ["This is common to make fences from.", "This fence is made from wood", "The material is wood."], "image": "val2014/COCO_val2014_000000264615.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79841, "question_id": "SwwtjTNJXTTWA794fyH7GG", "question": "What animal is closest in size to the wheeled item the people are near?", "choices": ["elephant", "giraffe", "mouse", "dog"], "correct_choice_idx": 3, "direct_answers": ["wolf", "dog", "horse", "dog", "dog", "man", "horse", "cow", "dog", "pony"], "difficult_direct_answer": false, "rationales": ["Dogs are closest in size to a bicycle.", "The animal is a dog.", "The wheeled item is a bicycle. elephants and giraffes are significantly larger than bicycles, and mice are significantly smaller."], "image": "val2014/COCO_val2014_000000079841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59352, "question_id": "SzkTgCPkNsfnDuiGUr8U7e", "question": "What is racing downward?", "choices": ["airplane", "submarine", "skier", "train"], "correct_choice_idx": 2, "direct_answers": ["skiier", "skiier", "skier", "skiier", "skier", "skiier", "skier", "skater", "skier", "skier"], "difficult_direct_answer": false, "rationales": ["There is a person moving down the mountain on skis.", "The skier is going downhill.", "The skier is going down."], "image": "val2014/COCO_val2014_000000059352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144847, "question_id": "T2AYTysjytWp7hEMoMboLi", "question": "What color is the sauce served in a circle around the vegetables?", "choices": ["tan", "red", "purple", "blue"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "pink", "beige", "taupe", "green", "tan", "fish", "red", "tan"], "difficult_direct_answer": false, "rationales": ["Mixing brown and white together creates a new shade.", "The sauce is a beige color.", "It's a light brown color"], "image": "train2014/COCO_train2014_000000144847.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314935, "question_id": "T2TSLWTskBEEw7K9QMTsCx", "question": "Who makes the item that is under the camera?", "choices": ["nintendo", "microsoft", "panasonic", "sega"], "correct_choice_idx": 2, "direct_answers": ["panasonic", "panasonic", "lumix", "lumix", "table", "panasonic", "computerize", "panasonic", "panasonic lumix", "panasonic"], "difficult_direct_answer": false, "rationales": ["The black box under the camera is labeled to contain a panasonic tz lumix.", "The brand name is on the camera and product box.", "The company's name appears on the box above tzu lumix."], "image": "val2014/COCO_val2014_000000314935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 205435, "question_id": "T3FjgDVc7s5xanGoLooEc9", "question": "Which item on the plate is highest in carbs?", "choices": ["broccoli", "squash", "rice", "meat"], "correct_choice_idx": 2, "direct_answers": ["rice", "rice", "rice", "potato", "rice", "rice", "rice", "rice", "rice", "rice"], "difficult_direct_answer": false, "rationales": ["The item is rice.", "The rice is high in carbs.", "Grains are very high in carbohydrates making choice a the correct option."], "image": "val2014/COCO_val2014_000000205435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328935, "question_id": "T7wyDKveNWE3TPLfXdWqAh", "question": "Why are all the people gathered?", "choices": ["free food", "traveling", "concert", "shopping"], "correct_choice_idx": 1, "direct_answers": ["traveling", "traveling", "travel", "flinders street station", "catch train", "train", "train station", "meeting", "traveling", "board train"], "difficult_direct_answer": false, "rationales": ["The writing on the building indicates it is a station, so this is where people go to travel by public transit.", "There are different signs for different locations/streets and times.", "The people are traveling."], "image": "train2014/COCO_train2014_000000328935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165341, "question_id": "TAdkQGvAKz6koXmocWxbVw", "question": "What does the kite on the left look like?", "choices": ["beaver", "antelope", "cow", "octopus"], "correct_choice_idx": 3, "direct_answers": ["happy", "kite", "purple octopus", "purple", "ghost", "squid", "octopus", "alien", "octopus", "octopus"], "difficult_direct_answer": false, "rationales": ["It has a bulbous head and tentacles", "(a) octopus. it looks kind of like an alien as well, but it has many legs like an octopus has.", "It seems to have tentacles. purple is also often used for a cartoon characters."], "image": "train2014/COCO_train2014_000000165341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384680, "question_id": "TAeLnejoaEn7K6nubn2f8R", "question": "What company makes this vehicle?", "choices": ["ford", "saturn", "gmc", "nissan"], "correct_choice_idx": 2, "direct_answers": ["gmc", "general motors", "gmc", "gmc", "toyota", "gmc", "gmc", "gmc", "gmc", "car"], "difficult_direct_answer": false, "rationales": ["Gmc makes the vehicle.", "The logo is on the grille.", "A truck can be seen in the road with a gmc emblem on the front grill."], "image": "train2014/COCO_train2014_000000384680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230056, "question_id": "TBmDn4NUzVDewYCoHVYSXT", "question": "How many long-necked birds are traveling in a row on the side of the river?", "choices": ["two", "four", "six", "five"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "three", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There's four winged creatures far apart from each other in the water.", "Two birds are travelling behind two others.", "There are four birds."], "image": "val2014/COCO_val2014_000000230056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31601, "question_id": "TC3XCPaZNiPXQm2Yr8i8FH", "question": "What is above the steel structure?", "choices": ["train", "cat", "mick foley", "kangaroo"], "correct_choice_idx": 0, "direct_answers": ["train", "train", "train", "train", "train", "train", "train", "train", "train", "train"], "difficult_direct_answer": false, "rationales": ["This is a railroad track", "There is a red train moving on the steel structure.", "The bridge is a train track"], "image": "val2014/COCO_val2014_000000031601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241064, "question_id": "TCeJr8myELD7AKTxEpyPXU", "question": "What color is the cow resting on the top left side of the pasture?", "choices": ["pink", "brown", "ginger", "black"], "correct_choice_idx": 1, "direct_answers": ["brown", "green", "brown", "black white", "brown white", "brown", "brown", "white black", "black", "brown"], "difficult_direct_answer": false, "rationales": ["It's lighter than the rest of them with black on them", "It is different than the rest that are black and white", "This is common color of cow and it sitting next to the black and white one."], "image": "val2014/COCO_val2014_000000241064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156895, "question_id": "TDPzQymVPMiupuhGEpQb94", "question": "What color is the tip of the tailfin on the Japanese propeller plane?", "choices": ["green", "orange", "red", "brown"], "correct_choice_idx": 2, "direct_answers": ["red colour", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["It shows a color that's similar to those cups used in beer pong.", "The tail is red.", "The color is red."], "image": "train2014/COCO_train2014_000000156895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175604, "question_id": "TFAcvnpw4BRQtQP8e7fb2E", "question": "What is the yellow cereal on top of the donut?", "choices": ["lucky charms", "capn crunch", "shredded wheat", "fruit loops"], "correct_choice_idx": 1, "direct_answers": ["captain crunch", "chocolate", "creamps", "can crunch", "capn crunch", "corn puffs", "cap'n crunch", "trix", "cap'n crunch", "candy"], "difficult_direct_answer": true, "rationales": ["There is cereal.", "The cereal is crunchy.", "Yellow, square cereal is on top of a round pastry. captain crunch is a yellow, square cereal."], "image": "val2014/COCO_val2014_000000175604.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195124, "question_id": "TGt5en2xoTqzUBNXqHmcr9", "question": "How many species are depicted here?", "choices": ["three", "five", "four", "six"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "fefef", "two", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "A bird, a dog and humans are shown.", "The animal species are clearly visible and identifiable by their outlines and are thus countable."], "image": "train2014/COCO_train2014_000000195124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58281, "question_id": "THm3BNjkEc2y4Qy5mmehpy", "question": "How many giraffes are stood in the middle of the conservation field?", "choices": ["two", "five", "four", "three"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "car"], "difficult_direct_answer": false, "rationales": ["There are two giraffes.", "There are 2.", "There are two giraffes in the photo."], "image": "train2014/COCO_train2014_000000058281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307612, "question_id": "TJ8f2dpbKBhGiKTor4We8C", "question": "What usually happens on the item in the middle of the room?", "choices": ["cooking", "hitting homeruns", "shooting hoops", "sleeping"], "correct_choice_idx": 3, "direct_answers": ["sleep", "sleeping", "sleep", "sleep", "sleep", "sleep", "sleep", "sleeping", "sleep", "sleep"], "difficult_direct_answer": false, "rationales": ["Sleeping happens.", "This has a mattress on a frame", "People usually sleep on beds."], "image": "train2014/COCO_train2014_000000307612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71137, "question_id": "TKw9rGWHEkfrwvBGgdQ54f", "question": "What kind of fruit does the orangutan have in its mouth?", "choices": ["watermelon", "bananas", "apples", "oranges"], "correct_choice_idx": 1, "direct_answers": ["bananas", "banana", "bananas", "bananas", "car", "bananas", "bananas", "banana", "banana", "plantain"], "difficult_direct_answer": false, "rationales": ["He is holding a bunch of yellow curved fruits.", "The orangutans mouth is clearly visible and based on the size, shape and color of the objects in its mouth, answer a is correct.", "The fruit is a banana."], "image": "train2014/COCO_train2014_000000071137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101660, "question_id": "TMnB9X3tbiztawtzGM4bxq", "question": "What color of light is emanated by the lantern on the top of the footlocker?", "choices": ["orange", "black", "white", "pink"], "correct_choice_idx": 0, "direct_answers": ["amber", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yello", "orange"], "difficult_direct_answer": false, "rationales": ["An orange grow emanates from the rectangular metal structure with glass doors on the right side of the image.", "The lantern is identifiable by the size, shape and the light it is emitting. the light shade color is visible and qualifiable.", "The color is orange."], "image": "val2014/COCO_val2014_000000101660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458903, "question_id": "TMyZ45ZKhudppUxUpbtVQB", "question": "How is the person holding the item?", "choices": ["invisible", "backwards", "upside down", "sideways"], "correct_choice_idx": 3, "direct_answers": ["hand", "phone", "sideways", "between fingers", "fingertips", "sideways", "sideways", "horizontally", "by hand", "mobile"], "difficult_direct_answer": false, "rationales": ["This is the landscape position for a phone", "The item is being held in portrait view.", "A person is holding a phone tipped to the side rather than straight up and down."], "image": "val2014/COCO_val2014_000000458903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127911, "question_id": "TN6aMbPoubb4FWMN8GbKuV", "question": "How many women are walking on through the park while carrying black umbrellas?", "choices": ["four", "two", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "stop", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two women side by side.", "There are two women.", "There are two women and each has a black umbrella."], "image": "val2014/COCO_val2014_000000127911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270333, "question_id": "TNp3B2KiJ6YmjWxne6LHCS", "question": "What is the name of the pants that most of the boys have on in this image?", "choices": ["khakis", "pants", "jeans", "dress pants"], "correct_choice_idx": 2, "direct_answers": ["jeans", "jeans", "wrangler", "jeans", "jeans", "jeans", "jeans", "jeans", "jean", "jeans"], "difficult_direct_answer": false, "rationales": ["Guys are wearing blue jeans as they skateboard.", "The fabric of those garments are made of denim.", "The blue color indicates this as well as the material and fit."], "image": "train2014/COCO_train2014_000000270333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331403, "question_id": "TQxy3NHxb62jS4kdRnN45H", "question": "What color is the undershirt worn by the man who is skiing above?", "choices": ["white", "orange", "blue", "red"], "correct_choice_idx": 2, "direct_answers": ["black", "blue", "blue", "blue", "blue", "blue", "blue", "black", "gray", "snow"], "difficult_direct_answer": false, "rationales": ["The shirt is the same color as the sky.", "The man is wearing a blue shirt shown by his color.", "The undershirt is color blue."], "image": "val2014/COCO_val2014_000000331403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542681, "question_id": "TTrPTYUZndELuPn8y6Nfc2", "question": "Who is a sponsor of this event?", "choices": ["amazon", "cinemax", "hbo", "garnier"], "correct_choice_idx": 3, "direct_answers": ["garnier", "garnier", "garnier", "garnier", "garnier", "garnier", "garnier", "kia", "garnier", "kia"], "difficult_direct_answer": false, "rationales": ["This is indicated clearly by the many signs with the name on it. it's also been sponsored by ibm and kia.", "The sponsor's name is on several signs in the background.", "None of the other options are metioned in the background."], "image": "val2014/COCO_val2014_000000542681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235328, "question_id": "TUE9HdPWSqrpguGpWhGfYh", "question": "What color is the vase sat atop the top of the two shelves to the right?", "choices": ["white", "blue", "clear", "red"], "correct_choice_idx": 0, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["There are a couple of white vases sitting on top the shelf.", "The vase is colored white.", "The color is white."], "image": "train2014/COCO_train2014_000000235328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468541, "question_id": "TUSsCG9bX7auJzad2efL43", "question": "What is the main primary color of the background on the men's tie worn to the dog?", "choices": ["red", "orange", "blue", "yellow"], "correct_choice_idx": 0, "direct_answers": ["red", "maroon", "red", "dog", "good", "red", "burgundy", "white", "red", "red"], "difficult_direct_answer": false, "rationales": ["The dog is sitting down wearing a tie that is mostly red.", "The main color is red.", "The tie's main primary background color is not blue, yellow, or orange."], "image": "val2014/COCO_val2014_000000468541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30407, "question_id": "TVJR23jRGBKogZhE2NfY3A", "question": "What might the person be repairing?", "choices": ["music boxes", "books", "phones", "cars"], "correct_choice_idx": 2, "direct_answers": ["phones", "phones", "phones", "phones", "cell phones", "phone", "mobile", "phones", "phone", "cell phones"], "difficult_direct_answer": false, "rationales": ["There are electronic devices. they have screens and dial pads.", "There are several of them on the table. they have numerical keyboards and screens", "There are many phones on the table."], "image": "val2014/COCO_val2014_000000030407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188798, "question_id": "TVWs25j6BnUZ247hqe5ZKe", "question": "The nearby cow to the left who is looking at the camera wears what color down his face?", "choices": ["white", "brown", "black", "gray"], "correct_choice_idx": 0, "direct_answers": ["white", "white", "black", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The cow's face is not black, brown, or gray.", "The cow is white.", "The cow has this light color against his black spots."], "image": "val2014/COCO_val2014_000000188798.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 341985, "question_id": "TXB5YEKVLnX7pGut4ZZBMp", "question": "What is the longest item here?", "choices": ["polecat", "stroller", "elephant", "banana"], "correct_choice_idx": 3, "direct_answers": ["banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["The longest item is the banana.", "Bananas are elongated while the rest are round.", "There are bananas, apples and oranges on the table. the banana is the longest."], "image": "train2014/COCO_train2014_000000341985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275930, "question_id": "TYzdPXL4cRUz47ufkwh476", "question": "What country does this large military purposed jet fly for?", "choices": ["usa", "china", "germany", "russia"], "correct_choice_idx": 0, "direct_answers": ["us", "united states", "america", "na", "usa", "america", "usa", "united states", "usa", "australia"], "difficult_direct_answer": false, "rationales": ["The large military plane has roundels on its fuselage and wings. there is a star inside each roundel.", "It's an old fighter jet", "The star logo on the side of the plane belongs to the usa."], "image": "train2014/COCO_train2014_000000275930.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380247, "question_id": "Tb5Zb3TGBoQRPkFSJZZPcu", "question": "How to tell this is not a home kitchen?", "choices": ["vending machines", "sink", "coffee machine", "refrigerator"], "correct_choice_idx": 0, "direct_answers": ["watching", "space", "coke machine", "vending machine", "decoration", "coke machine", "vending", "not resident", "floor", "vending machines"], "difficult_direct_answer": true, "rationales": ["Of the objects in the kitchen, answer a is something that would not appear in a home setting while the others would.", "This requires putting money in the machine to obtain a beverage. most home kitchens have beverages in the fridge because the home owner has already paid for the drinks.", "There are vending machines around."], "image": "val2014/COCO_val2014_000000380247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454601, "question_id": "TcnufpVZekMy7GQ9vu2kbj", "question": "Which cat is most likely to get hotter in the sun?", "choices": ["neither", "orange cat", "black cat", "both same"], "correct_choice_idx": 2, "direct_answers": ["abyssinian", "black cat", "black cat", "black", "cat", "black", "black", "black cat", "black", "black"], "difficult_direct_answer": false, "rationales": ["Because the black color absorbs more heat from the sun.", "This is because the color black absorbs instead of reflects heat. lighter colors reflect.", "The cat that's black will be hotter."], "image": "val2014/COCO_val2014_000000454601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227468, "question_id": "TfScrvwz8kqzjnsNjdTZaY", "question": "What color is the metal suitcase in the middle of the luggage pile?", "choices": ["green", "red", "yellow", "blue"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "golden", "yello", "yellow", "gold", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The metal suitcase is yellow.", "The middle metal textured suitcase with label reading 510 is a bright citrus color.", "The color is obvious. typically, they're gray."], "image": "val2014/COCO_val2014_000000227468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213382, "question_id": "TfwpHy2zx6NfeuCUK3HYGC", "question": "What is a very tall item here?", "choices": ["stilts", "ladder", "giraffe", "telephone pole"], "correct_choice_idx": 3, "direct_answers": ["telephone pole", "phone poles", "poles", "phone pole", "power line", "pole", "pole", "pole", "phone pole", "light pole"], "difficult_direct_answer": false, "rationales": ["It is round and tall and made of wood which is consistent with this type of item. it is also found in a location along the side of the street which confirms its identity.", "They keep the wires far above people and cars so that no one is injured", "The item is a pole."], "image": "train2014/COCO_train2014_000000213382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198717, "question_id": "Tg8psg4MdWynQMkwGKHuFe", "question": "What does the man on the left most likely own based on what he is doing?", "choices": ["tank", "lynx", "police motorcycle", "whoopie cushion"], "correct_choice_idx": 3, "direct_answers": ["pots", "whoopee cushion", "pans", "drums", "alarm clock", "pans", "pot", "pots", "surprise", "whoopie cushion"], "difficult_direct_answer": false, "rationales": ["The guy is obviously a jokester so he would probably own all of the prank objects available.", "The man is on a cushion.", "The man has a whoopie cushion."], "image": "val2014/COCO_val2014_000000198717.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108446, "question_id": "Th2f92NYE3HjwwZLAK9Pf8", "question": "What picture is on the sign that is all the way to the right?", "choices": ["baby", "cabbage", "horse", "bicycle"], "correct_choice_idx": 3, "direct_answers": ["cycle", "cyclists", "bicycle", "bicycle", "traffic", "car", "ocean wave", "bike", "black", "red"], "difficult_direct_answer": true, "rationales": ["The picture is a bike.", "The sign visible to the right of the photo has its object clearly visible. the object has two wheels and looks like answer a.", "The bicycle is on the sign."], "image": "train2014/COCO_train2014_000000108446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203661, "question_id": "ThPXyjmXjRG8c2SN4PWdmW", "question": "What is the black rectangular object called which is hanging from the ceiling?", "choices": ["stereo", "statue", "hood", "vent"], "correct_choice_idx": 2, "direct_answers": ["hood", "chimney", "vent", "plate", "hood", "hood", "bsll", "beauty", "light", "table"], "difficult_direct_answer": false, "rationales": ["This is the hood of the oven for venting.", "There is a hood.", "It's technically a b, but it's referred to as an a in a kitchen and the mouth of the b."], "image": "val2014/COCO_val2014_000000203661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294379, "question_id": "TjYqkTPxrnzjnnBw9ScQnL", "question": "What is above the bicycle?", "choices": ["baby", "old woman", "man", "cat"], "correct_choice_idx": 2, "direct_answers": ["person", "man", "person", "men", "human", "man", "rider", "man", "man", "man"], "difficult_direct_answer": false, "rationales": ["A man is sitting on the bicycle. he is wearing a helmet.", "This is obvious in the scene. the other options aren't even in this image.", "This bicycle is in use; a man in khaki shorts man's the peddles"], "image": "train2014/COCO_train2014_000000294379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359394, "question_id": "TnrxZvCw7SQZ52vM834Wbx", "question": "What color is the lettering on the center of the blue bus windowfront?", "choices": ["red", "green", "black", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "yellow", "yellow", "green", "green", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The lettering below the window is in yellow.", "The text is the same color as the sun.", "It is the color of bananas"], "image": "train2014/COCO_train2014_000000359394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262608, "question_id": "TnxneHFy954pEUTmBWCpLM", "question": "What is the protein pictured?", "choices": ["beef", "fish", "chicken", "pork"], "correct_choice_idx": 3, "direct_answers": ["ribs", "fish", "pork", "pork", "beef", "meat", "chicken", "pork", "steak", "chicken"], "difficult_direct_answer": false, "rationales": ["The bone seems to be an a one. that said, it could also be c, but the meat looks too light-colored for cow.", "This is a piece of pork.", "These are chops from a pig"], "image": "val2014/COCO_val2014_000000262608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16161, "question_id": "To6oPAAj5MJoAWv6jdHnB7", "question": "What is the fruit sitting on top of the bunch of bananas on the green tablecloth?", "choices": ["grapefruit", "orange", "plantain", "apple"], "correct_choice_idx": 1, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The fruit is an orange.", "There is a citrus fruit on top of the bananas. it is too small to be a grapefruit.", "An orange is sitting on top of the bananas"], "image": "val2014/COCO_val2014_000000016161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490081, "question_id": "Tpz4saSs7jPTCNtvFAqNAD", "question": "What is behind the animals?", "choices": ["wheels", "cookies", "ladder", "baby"], "correct_choice_idx": 0, "direct_answers": ["cart", "cart", "cart", "wheels", "man", "man", "cart", "cart", "wooden cart", "cart"], "difficult_direct_answer": false, "rationales": ["There are some big wooden wheels pulled behind the animals.", "The animals are pulling the wheels.", "The cows are pulling a cart. there are no babies, ladders, or cookies."], "image": "val2014/COCO_val2014_000000490081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67456, "question_id": "TsJS8FLTxB2vf6HL4QPMXS", "question": "What kind of fuel does this vehicle run on?", "choices": ["potatoes", "gasoline", "jet fuel", "denatured alcohol"], "correct_choice_idx": 2, "direct_answers": ["lead fuel", "jet", "jet", "jet fuel", "desal", "jet fuel", "kerosene", "jet fuel", "gas", "jet fuel"], "difficult_direct_answer": false, "rationales": ["It has engines that lift it and needs this special fuel", "Only this type of fuel generates enough energy to power a plane.", "The plane uses jet fuel."], "image": "train2014/COCO_train2014_000000067456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185943, "question_id": "TsgqpVZFczTX3ZRNnKCt6L", "question": "What is the name of the gas station with the red star?", "choices": ["shell", "sinclair", "texaco", "caltex"], "correct_choice_idx": 3, "direct_answers": ["texaco", "caltex", "caltex", "calves", "none unreadable", "caltex", "texaco", "conco", "texaco", "nothing"], "difficult_direct_answer": false, "rationales": ["The name can be seen on the sign.", "There is a white sign in the background that says caltex on it.", "It is a well known international chain, with the star representing the lonestar state of texas."], "image": "val2014/COCO_val2014_000000185943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429226, "question_id": "TstMKdVr4vKzJouNs3osWL", "question": "What color are the man's socks?", "choices": ["pink", "purple", "green", "gray"], "correct_choice_idx": 3, "direct_answers": ["gray", "multi colors", "black", "grey", "grey", "black", "black", "grey", "black", "grey"], "difficult_direct_answer": false, "rationales": ["The color is easily visible and bright. it is in sharp contrast to the brown skin.", "The color is gray.", "They are a charcoal shade of this"], "image": "train2014/COCO_train2014_000000429226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462957, "question_id": "TtDESDgXC3BMMBznXKRNkt", "question": "What are the people doing?", "choices": ["reading", "watching movies", "paying videogames", "dancing"], "correct_choice_idx": 2, "direct_answers": ["watching tv", "playing wii", "playing", "singing", "paying videogames", "video game", "playing wii", "playing game", "looking right", "listening"], "difficult_direct_answer": true, "rationales": ["The people are holding nintendo wii controllers.", "Three of the people are holding video wii video game controllers.", "They have a controller in their hands"], "image": "train2014/COCO_train2014_000000462957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401743, "question_id": "TtPCrpnDeD7saDVY7CvJej", "question": "What color is the hat worn by the opposing team player who is in the team shot?", "choices": ["red", "blue", "purple", "green"], "correct_choice_idx": 0, "direct_answers": ["red", "blue", "red", "red", "red", "red", "blue", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "This is obvious given the colors.", "The color is bright and easily visible. it is in sharp contrast to the blue caps of all the other players."], "image": "train2014/COCO_train2014_000000401743.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120059, "question_id": "Tu9Ekmn4sWA3TD4d6DwQva", "question": "What is she likely holding in her left hand?", "choices": ["wheelchair", "scooter", "walker", "cane"], "correct_choice_idx": 2, "direct_answers": ["phone", "cell phone", "phone", "cellphone", "cellphone", "brake handle", "cellphone", "phone", "phone", "walker"], "difficult_direct_answer": false, "rationales": ["The woman is elderly, and most likely needs assistance when moving.", "The woman is likely holding onto some kind of walker with her left hand because she's old.", "She's using a walker."], "image": "train2014/COCO_train2014_000000120059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175820, "question_id": "TuRgtgSaK8KTWmETiHd7sA", "question": "What is the name of the electronic device that the cat appears to be looking at in this image?", "choices": ["lamp", "remote", "fireplace", "table"], "correct_choice_idx": 1, "direct_answers": ["tv remote", "remote", "lazer", "phone", "led light", "remote control", "tv", "remote control", "remote", "phone"], "difficult_direct_answer": false, "rationales": ["The electronic device is used to control a television.", "That is the name of the device.", "The other options don't apply to the line of sight."], "image": "train2014/COCO_train2014_000000175820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144259, "question_id": "TuXjuGiFmKtuMuv2Qw5Qnd", "question": "What color are the jets for the bottom side of the aircraft shaped kite?", "choices": ["yellow", "green", "purple", "red"], "correct_choice_idx": 3, "direct_answers": ["yellow", "red", "red", "blue", "red", "red", "red", "red", "red", "gray"], "difficult_direct_answer": false, "rationales": ["These symbolize smoke that comes off the back of performance planes", "The color is red.", "The ribbons on the bottom of this airplane kite are a red color."], "image": "train2014/COCO_train2014_000000144259.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27565, "question_id": "TuhxM7JemhZC8WiifmdkJ8", "question": "What is on the boy's hand?", "choices": ["glove", "tattoo", "caterpillar", "egg yolk"], "correct_choice_idx": 0, "direct_answers": ["mitt", "baseball glove", "glove", "gloves", "baseball mitt", "glove", "baseball glove", "glove", "clouds", "mitt"], "difficult_direct_answer": false, "rationales": ["The boy is playing baseball. he is wearing an item that allows him to catch a ball.", "The hand has a glove.", "The hand has a glove on."], "image": "val2014/COCO_val2014_000000027565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424458, "question_id": "TvZFox9qHEVesZnRyHNvDU", "question": "If you had to guess which holiday would this most likely be?", "choices": ["thanksgiving", "christmas", "new years", "halloween"], "correct_choice_idx": 3, "direct_answers": ["halloween", "halloween", "guess", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween", "halloween"], "difficult_direct_answer": false, "rationales": ["The holiday is halloween.", "The man is wearing a pirate costume. this would not be a suitable outfit for christmas, thanksgiving, or new years.", "People often dress up in costumes, like a pirate, on october 31st and trick or treat."], "image": "train2014/COCO_train2014_000000424458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101749, "question_id": "TwQTj9jdeyvQcJUbsWgyRW", "question": "How many scooters are enclosed with white lines in the middle of the parking area?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 0, "direct_answers": ["two", "three", "two", "three", "three", "three", "three", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["When examining the space inside the white lines, there is an identifiable and countable number of scooters inside.", "There are two in the middle and one at the edge.", "A trio of scooters are between the white lines."], "image": "train2014/COCO_train2014_000000101749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185838, "question_id": "Twwjhgd89u3njriMTtKVa6", "question": "What in this photo is black purple and white only?", "choices": ["skate park", "hat", "pants", "shoes"], "correct_choice_idx": 3, "direct_answers": ["tennis shoes", "shoes", "shirt", "shoes", "shoes", "shoes", "shoe", "person", "shirt", "shirt"], "difficult_direct_answer": false, "rationales": ["The skateboarder wore these on his feet.", "The person's shoes are black, purple, and white.", "The shoes are purple, black and white."], "image": "val2014/COCO_val2014_000000185838.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265619, "question_id": "Txg6d8rdt7vn9fvKR4CmdD", "question": "What is flying through the air?", "choices": ["eagle", "buzzard", "kite", "airplane"], "correct_choice_idx": 3, "direct_answers": ["airplane", "airplane", "aeroplane", "airplane", "plane", "airplane", "airplane", "airplane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["The object is not alive. it is powered by engines.", "The plane flies.", "This is a vehicle used to fly"], "image": "train2014/COCO_train2014_000000265619.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366994, "question_id": "TyA8zXoAToFjqZ5neHz8oB", "question": "What actress is from this country?", "choices": ["millie brady", "brooke shields", "jennifer connelly", "salma hayek"], "correct_choice_idx": 0, "direct_answers": ["babe rexha", "helen mirren", "kate beckinsale", "millie brady", "there", "emma watson", "helen mirren", "cant recall", "stop", "person"], "difficult_direct_answer": true, "rationales": ["The other actresses aren't from england/uk.", "They are an english actress.", "There are union jacks, and the vehicles are driving on the left. the country is the united kingdom, not the united states or mexico."], "image": "train2014/COCO_train2014_000000366994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174698, "question_id": "TyNsezZnzeTkTWAWxYwJQu", "question": "How many windows surround the fireplace mantle?", "choices": ["five", "two", "three", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "two", "eight", "two", "two", "eight", "eight"], "difficult_direct_answer": false, "rationales": ["There are 4 windows.", "There are four windows.", "By counting, one can see there are 4 windows surrounding the fire. there are tall windows but there are 4."], "image": "train2014/COCO_train2014_000000174698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270740, "question_id": "Tzc8CcEFDyEU4Gy8r3WuTK", "question": "What color is the uniform of the team who is currently pitching the ball?", "choices": ["blue", "black", "purple", "red"], "correct_choice_idx": 0, "direct_answers": ["grey", "white", "gray", "gray", "grey", "white", "blue", "gray", "gray blue", "blue"], "difficult_direct_answer": false, "rationales": ["The pitcher's uniform is grey and blue. the socks are blue.", "The catcher is wearing a blue helmet and blue shin guards. he is on the same team as the pitcher.", "The umpire has blue on."], "image": "train2014/COCO_train2014_000000270740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355575, "question_id": "U3ZCwiszNRcVFAW2VWtXry", "question": "What is coming out of the toilet bowl?", "choices": ["reptile", "fish", "frog", "hands"], "correct_choice_idx": 3, "direct_answers": ["hands", "hands", "hands", "fingers", "hands", "hands", "hands", "hand", "hands", "hands"], "difficult_direct_answer": false, "rationales": ["Hands are popping out of the toilet.", "Someones hands are coming out of the toilet bowl.", "There are human fingers on the edge of the toilet bowl."], "image": "val2014/COCO_val2014_000000355575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32193, "question_id": "U3cdHnxzzrhEJiGMrsLiER", "question": "Which animals are near the zebras?", "choices": ["bats", "cows", "giraffes", "sugar gliders"], "correct_choice_idx": 2, "direct_answers": ["giraffes", "giraffes", "giraffes", "giraffes", "giraffes", "giraffes", "giraffe", "giraffes", "giraffes", "giraffes"], "difficult_direct_answer": false, "rationales": ["There are giraffes.", "The zebras are by giraffes.", "The zebras are standing around the trees with a couple of giraffes."], "image": "val2014/COCO_val2014_000000032193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 106440, "question_id": "U4pP29v9fxMn6hHn8iwB83", "question": "What is the most common topping on the frosting?", "choices": ["popcorn", "nuts", "jelly", "powdered sugar"], "correct_choice_idx": 1, "direct_answers": ["icing", "chocolate", "nuts", "nuts", "chocolate", "sugar", "chocolate", "nuts", "almonds", "chocolate"], "difficult_direct_answer": false, "rationales": ["At least five of the donuts visible in this image have chopped pieces of nuts on them making them the most common frosting topping.", "The are very few donuts topped with anything other than dry drupes.", "A box of donuts has many with nuts on them."], "image": "train2014/COCO_train2014_000000106440.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423630, "question_id": "U5agM5qHED26HoXyNvrWTP", "question": "How many sleep are resting on their belly in the straw?", "choices": ["four", "two", "three", "one"], "correct_choice_idx": 3, "direct_answers": ["many", "one", "one", "one", "stop", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["6 out of 7 sheep are standing.", "The rest are standing", "There are several sheep standing and eating hay out of the pen. there is a single one laying in the hay behind the others."], "image": "train2014/COCO_train2014_000000423630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319293, "question_id": "U5vKHdJgAvPbYzLPUb32tA", "question": "Who wears the item the man in the foreground is wearing on his face?", "choices": ["lab professor", "mime", "newborn", "clown"], "correct_choice_idx": 0, "direct_answers": ["snowboarder", "skiers", "divers", "people", "goggles", "skier", "skiers", "lab professor", "snowboarders", "skiier"], "difficult_direct_answer": true, "rationales": ["The man is wearing goggles to protect his eyes. answer a works in a profession where eye protection is sometimes necessary.", "A person who works in a lab would wear goggles also.", "The goggles protect their eyes from harsh dangers."], "image": "train2014/COCO_train2014_000000319293.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135741, "question_id": "U7C32YMkycA236WBgL5kFh", "question": "What do you have to do in order to get the urinals to flush?", "choices": ["lever", "voice command", "walk away", "button"], "correct_choice_idx": 2, "direct_answers": ["press button", "flush", "nothing", "push button", "pull handle", "press handle", "nothing", "push handle", "walk away", "move"], "difficult_direct_answer": true, "rationales": ["The system seems to be automatic.", "The urinals would need to be flushed by walking away.", "The urinals are motion activated."], "image": "train2014/COCO_train2014_000000135741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331409, "question_id": "U8PysGZU8mbnU9EtDBUMEJ", "question": "What kind of wine is the man serving in the glasses?", "choices": ["red", "orange", "pink", "white"], "correct_choice_idx": 3, "direct_answers": ["fee", "red", "red wine", "white", "white", "white", "white", "red wine", "white wine", "red"], "difficult_direct_answer": false, "rationales": ["It is a see through liquid.", "The wine is white.", "Some dark white wine is served by the man holding the bottle."], "image": "train2014/COCO_train2014_000000331409.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238806, "question_id": "U8uQEYaeJ9mTDrjfT3Ve7z", "question": "What is the green stuff on top of?", "choices": ["salad", "apple", "pizza", "hot dog"], "correct_choice_idx": 2, "direct_answers": ["stop", "spanich", "spinach", "basil", "pizza", "green peppers", "spinach", "basil", "green", "spinach"], "difficult_direct_answer": false, "rationales": ["The green stuff is on pizza.", "The stuff is on pizza.", "This is a common pizza topping."], "image": "val2014/COCO_val2014_000000238806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309084, "question_id": "U9MAvgaNFgC6JbJWyfbsmG", "question": "What is the purpose of the item tied around his neck?", "choices": ["choking", "fashion", "identification", "breathing"], "correct_choice_idx": 2, "direct_answers": ["choker", "identification", "control", "identification", "rabies tag", "protect", "show ownership", "lock tied", "handling", "belt"], "difficult_direct_answer": true, "rationales": ["The dog collar is used to identify the dog.", "Most domestic pets have collars for walking and putting there name and address on it.", "It's a collar that holds a tag"], "image": "train2014/COCO_train2014_000000309084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122203, "question_id": "UA88PadWAL3vJg9XxG6yoT", "question": "What are the animals in the foreground doing?", "choices": ["jumping", "fighting", "eating", "sleeping"], "correct_choice_idx": 2, "direct_answers": ["eating", "eating", "eating", "eating", "eating", "eating grass", "grass", "eating grass", "grazing", "eating"], "difficult_direct_answer": false, "rationales": ["The animals eat.", "A cow's diet mainly consist of this food grown in pastures.", "The cows are eating grass."], "image": "val2014/COCO_val2014_000000122203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 360370, "question_id": "UBUYvJQaxQLTrTA9a4EBam", "question": "What number is one number less than the number on the cow's collar?", "choices": ["933", "707", "906", "428"], "correct_choice_idx": 0, "direct_answers": ["933", "933", "934", "933", "33", "933", "933", "three", "three", "933"], "difficult_direct_answer": false, "rationales": ["Since the number on the collar is 934, one less would be 933.", "It's give that the number begins with 9, is higher than the d answer and basic subtraction makes it so.", "934 is on his collar so one less."], "image": "train2014/COCO_train2014_000000360370.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393036, "question_id": "UE25DrjQCLYvHN9YQ6Debs", "question": "What movie does the scene most resemble?", "choices": ["true grit", "matrix", "fight club", "american psycho"], "correct_choice_idx": 0, "direct_answers": ["seabiscuit", "black stallion", "dwdwdwd", "true grit", "braveheart", "wild horses", "bonzanza", "no idea", "true grit", "horses"], "difficult_direct_answer": true, "rationales": ["True grit is like this because there are horses on the beach. it is a western and westerns often feature horses.", "It is a western with cowboys and horses", "It's a western film and this photo also has people on horses."], "image": "train2014/COCO_train2014_000000393036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44232, "question_id": "UE3wgMGqfB3iUpZv3VQvHY", "question": "What are the long metal rods on the windshield of the train?", "choices": ["phone jacks", "support rods", "antennas", "windshield wipers"], "correct_choice_idx": 3, "direct_answers": ["wipers", "wipers", "wipers", "wipers", "wipers", "wipers", "below", "windshield wipers", "windshield wipers", "ksr"], "difficult_direct_answer": false, "rationales": ["The rods are wipers.", "Windshield wipers are at the front of trains to clear the glass.", "The rods are wipers."], "image": "train2014/COCO_train2014_000000044232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522330, "question_id": "UELCb8Wi77Ah3duf4Xiu85", "question": "What is the dog next to?", "choices": ["apple", "snake", "monkey", "luggage"], "correct_choice_idx": 3, "direct_answers": ["luggage", "suitcase", "luggage", "suitcase", "luggage", "luggage", "sleep", "suitcase", "suitcase", "travel luggage"], "difficult_direct_answer": false, "rationales": ["The dog is laying on the green grass. it is next to luggage.", "The dog is lying next to a suitcase of a traveler.", "A dog lays next to a suitcase."], "image": "train2014/COCO_train2014_000000522330.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523700, "question_id": "UGG36iUUwDh2qm8ve2Tt2N", "question": "Why might these people be lined up?", "choices": ["repairs", "race", "lessons", "donation"], "correct_choice_idx": 3, "direct_answers": ["bike race", "race", "nothing", "race", "bike race", "moving", "donation", "bike-a-thon", "bike race", "race"], "difficult_direct_answer": false, "rationales": ["The moving truck in the background suggests the bikes will be transported elsewhere.", "The people are donating.", "They donate."], "image": "train2014/COCO_train2014_000000523700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90010, "question_id": "UMGAV5WiqeaEsgshxxHN4s", "question": "What is on the shoe?", "choices": ["human foot", "cat", "mold", "human hand"], "correct_choice_idx": 1, "direct_answers": ["cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["A cat can be seen hovering directly on top of a leather loafer style shoe.", "It has whiskers and pointy ears like felines do.", "The other options don't appear in this image."], "image": "val2014/COCO_val2014_000000090010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277518, "question_id": "UNAAzpUvv4jo8RJwQhZHA6", "question": "How many birds are parked on the top of the boat?", "choices": ["two", "one", "three", "six"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "three", "two", "two", "two", "three", "two", "two", "three"], "difficult_direct_answer": false, "rationales": ["There are 2.", "There is one bird on the bow and one on the stern.", "One bird is visible on each end of this boat."], "image": "val2014/COCO_val2014_000000277518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195755, "question_id": "UPHfo4sZcsSaDDaoRutgbh", "question": "What top speed can this vehicle likely reach?", "choices": ["80000 mph", "500000mph", "7000 mph", "1000 mph"], "correct_choice_idx": 3, "direct_answers": ["very fast", "1000 mph", "1200 mph", "300 mph", "3000mph", "600 mph", "300 mph", "1000", "575", "thousand"], "difficult_direct_answer": true, "rationales": ["These airplanes could most likely reach only 1000 miles per hour.", "That is the maximum speed the planes can go.", "The speed is 1000 mph."], "image": "train2014/COCO_train2014_000000195755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303538, "question_id": "UQ5jB7yGQTrhGpBisdkRcv", "question": "What kind of clothing accessory is worn on the skating man's wrist?", "choices": ["sweatband", "elastic band", "wristwatch", "bracelet"], "correct_choice_idx": 0, "direct_answers": ["wristband", "wristband", "watch", "bracelet", "wrist band", "wrist band", "wristband", "bracelet", "belt", "sweatband"], "difficult_direct_answer": false, "rationales": ["A man has wristbands on.", "He is wearing a cloth band that is used to wipe away sweat.", "The clothing is a sweatband."], "image": "val2014/COCO_val2014_000000303538.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 160254, "question_id": "UQ7bAYgeYdvGvUJXYG3trN", "question": "What event might this be for?", "choices": ["board meeting", "superbowl", "world series", "birthday"], "correct_choice_idx": 3, "direct_answers": ["potluck", "birthday", "desert", "family gathering", "dinner", "birthday", "birthday", "birthday", "dinner party", "birthday"], "difficult_direct_answer": false, "rationales": ["Cake is something that is commonly served on that day.", "It's an event for something, and has cake, which is usually for birthdays.", "The group is eating a cake with frosting."], "image": "train2014/COCO_train2014_000000160254.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206512, "question_id": "UQX8weA6eTuUwBWLmadLFW", "question": "What is the man swinging?", "choices": ["baseball bat", "tennis racquet", "oar", "stuffed animal"], "correct_choice_idx": 1, "direct_answers": ["hit ball", "tennis racquet", "racket", "tennis racket", "tennis racket", "racket", "tennis racket", "racquet", "tennis", "ball"], "difficult_direct_answer": false, "rationales": ["Serena williams made her career using this device.", "He's playing this game", "The man is about to hit a green ball."], "image": "val2014/COCO_val2014_000000206512.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529864, "question_id": "URSWcNTKiAntFgTPHbKSMD", "question": "What color is the button on top of the bagel?", "choices": ["white", "red", "purple", "tan"], "correct_choice_idx": 3, "direct_answers": ["beige", "white", "tan", "beige", "white", "silver", "grey", "brown", "tan", "brown"], "difficult_direct_answer": false, "rationales": ["A light brown button is in cream cheese on top of a bagel.", "The color is tan.", "The color is tan."], "image": "train2014/COCO_train2014_000000529864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48428, "question_id": "URrWVcnrkBr67W9Bct8agU", "question": "What type of object is conspicuously placed on the plate with all the fruit?", "choices": ["ignition coil", "solenoid", "shoe", "lightbulb"], "correct_choice_idx": 3, "direct_answers": ["light-up ball", "lightbulb", "light bulb", "orb", "lightbulb", "ball", "pingpong ball", "table", "light bulb", "round sic"], "difficult_direct_answer": false, "rationales": ["There is a bulb for light on the fruit plate.", "The object is a lightbulb.", "There is a lightbulb hidden among the apples."], "image": "train2014/COCO_train2014_000000048428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406857, "question_id": "USBLSN8t8XF3nNjGoGV9GV", "question": "What kind of cheese is spread over the pasta salad?", "choices": ["american", "swiss", "cheddar", "parmesan"], "correct_choice_idx": 3, "direct_answers": ["parmesan", "parmesan", "parmesan", "parmesan", "parmesan", "parmesan", "parmesan", "parmesan", "parmesan", "parmesan"], "difficult_direct_answer": false, "rationales": ["The cheese is parmesan.", "The flakes of cheese are parmesan cheese.", "Parmesan cheese is spread over the top of the pasta salad."], "image": "train2014/COCO_train2014_000000406857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421432, "question_id": "USTfjHErMySDoYdCcqUYp9", "question": "What is next to the bright red car?", "choices": ["tank", "bicycle", "cat", "truck"], "correct_choice_idx": 3, "direct_answers": ["blue truck", "truck", "truck", "post", "pole", "light pole", "car", "lamppost", "post", "truck"], "difficult_direct_answer": false, "rationales": ["The car is near a truck.", "There is only one bright red car on the road and the vehicle closest to it is answer a based on its size and style.", "There is a pickup with a tarp over the back"], "image": "train2014/COCO_train2014_000000421432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158913, "question_id": "UTAFLmyqXGvVnhmwFhfTnY", "question": "What color is the central rectangle of the kite flown above the open field?", "choices": ["yellow", "blue", "red", "green"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "yellow", "yellow", "blue", "yellow", "yellow", "yellow", "yellow", "red"], "difficult_direct_answer": false, "rationales": ["The middle of the kite is yellow.", "The shape in the middle is yellow.", "The rectangle is yellow."], "image": "train2014/COCO_train2014_000000158913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93748, "question_id": "UTgbvwFw6yTvj2XscpXEG2", "question": "What is behind the doll in the foreground?", "choices": ["teddy bears", "cow", "dog", "cat"], "correct_choice_idx": 0, "direct_answers": ["teddy bear", "bear", "teddy bear", "teddy bears", "teddy", "teddy bears", "teddy bears", "teddybear", "teddy bear", "teddy bears"], "difficult_direct_answer": false, "rationales": ["These are stuffed animals", "There are brown dolls that look like bears behind it.", "One can see the stuffed animals in the background behind the doll."], "image": "train2014/COCO_train2014_000000093748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348331, "question_id": "UTtfH7YAMAqCGNEuRuQ8Jo", "question": "What are the people riding?", "choices": ["bicycles", "horses", "antelopes", "cars"], "correct_choice_idx": 0, "direct_answers": ["bikes", "bicycles", "bicycles", "bike", "bikes", "riding", "bicycles", "cycle riding", "bikes", "bikes"], "difficult_direct_answer": false, "rationales": ["This method of transportation is typically a frame with two wheels and handlebars situated in front of a seat. propelled by the users leg power. these people are riding this vehicle.", "The other options don't appear in the image. this type of vehicle is common in cities.", "These are human powered vehicles with two wheels"], "image": "train2014/COCO_train2014_000000348331.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203744, "question_id": "UTzYSs5JZ9DKRGeeDrA4Ko", "question": "What is the animal doing?", "choices": ["sleeping", "feeding", "soaring", "jumping"], "correct_choice_idx": 2, "direct_answers": ["soaring", "flying", "flying", "flying", "flying", "flying", "flying", "flying", "flying", "flying"], "difficult_direct_answer": false, "rationales": ["The animal is soaring over the ocean.", "A bird is flying above water.", "The bird is flying."], "image": "train2014/COCO_train2014_000000203744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244992, "question_id": "UUUD436esCSe8T52BWJXES", "question": "What color is the chocolate ball on the top corner of the cake?", "choices": ["white", "red", "brown", "green"], "correct_choice_idx": 2, "direct_answers": ["brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "black", "brown"], "difficult_direct_answer": false, "rationales": ["The color is brown.", "The chocolate ball is not red, white, or green.", "The cake is brown."], "image": "train2014/COCO_train2014_000000244992.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554859, "question_id": "UUap7eBN8dMVR9L8uajFeq", "question": "What is next to the table on the left?", "choices": ["green chair", "black chair", "baby", "cow"], "correct_choice_idx": 1, "direct_answers": ["chair", "chair", "black chair", "desk", "black chair", "chair", "chair", "chair", "chair", "chair"], "difficult_direct_answer": false, "rationales": ["This is the closest piece of furniture to the table", "The chair is near.", "There is a black chair next to the table."], "image": "val2014/COCO_val2014_000000554859.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 48910, "question_id": "UUupUcngLuySdz2UjDie9x", "question": "What is the most likely activity the person on the yellow chair is doing?", "choices": ["video game", "watching tv", "singing", "cooking"], "correct_choice_idx": 1, "direct_answers": ["watching tv", "watching tv", "receiving guest", "sleeping", "watching tv", "reading", "watching tv", "sleeping", "sitting", "lounging"], "difficult_direct_answer": false, "rationales": ["The activity is watching tv.", "The chair is located in the living room where this type of entertainment is often found. the person has their feet up suggesting a passive activity.", "The person is sitting in the living room where most people watch tv."], "image": "val2014/COCO_val2014_000000048910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218601, "question_id": "UVL6jQyu66eoyqCJjiJv5u", "question": "What is unique about this animal?", "choices": ["skinny", "flies", "fat", "tall"], "correct_choice_idx": 3, "direct_answers": ["giraffe", "giraffe", "long neck", "long neck", "height", "tall", "neck", "tall", "stop", "long neck"], "difficult_direct_answer": false, "rationales": ["The animal is a giraffe which is known to be the tallest animal.", "The animal has a tall neck.", "It is a giraffe"], "image": "train2014/COCO_train2014_000000218601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498658, "question_id": "UWbDyvBbFVHm3i2shfPJkZ", "question": "The woman in the red blouse is using a cell phone of what color?", "choices": ["blue", "red", "silver", "green"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "black", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["It matches her purse", "The phone has the same color as the bag.", "The device in the woman's hand which she gives her attention to is blue. the shape and size of this device identifies it as a phone."], "image": "train2014/COCO_train2014_000000498658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165050, "question_id": "UXoKrvn56Mu45s6bizb5Re", "question": "What is the woman sprinkling over her pizza?", "choices": ["nutmeg", "oregano", "spinach", "mint"], "correct_choice_idx": 1, "direct_answers": ["peppers", "spices", "pepper", "pepper", "pepper", "herbs", "herbs", "pepper", "pizza", "oregano"], "difficult_direct_answer": false, "rationales": ["That is normally you can sprinkle on pizza.", "It is a green herb and what is used in italian cooking", "It is a spice used in spicing pizza."], "image": "train2014/COCO_train2014_000000165050.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259417, "question_id": "UZ29nhJnhiY58dYLbPnH8X", "question": "What color are the stripes on the side of the bathroom wall?", "choices": ["blue", "purple", "green", "pink"], "correct_choice_idx": 2, "direct_answers": ["green", "green", "green", "green", "white", "tan", "green", "green white", "green", "blue"], "difficult_direct_answer": false, "rationales": ["The stripes on the wall are green pastel.", "The color is green.", "The stripes are not blue, pink, or purple."], "image": "train2014/COCO_train2014_000000259417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285007, "question_id": "UbExUsra2MYsP7RppZQywN", "question": "What is the white appliance used for?", "choices": ["clean water", "clean clothes", "clean people", "clean dishes"], "correct_choice_idx": 1, "direct_answers": ["laundry", "washing dishes", "washing clothes", "laundry", "clean clothes", "cooking", "tiles", "washing clothes", "washing", "cooking"], "difficult_direct_answer": false, "rationales": ["A dishwasher to clean dishes.", "That is a machine that would spin around clothes to wash them.", "The appliance is for clothes."], "image": "val2014/COCO_val2014_000000285007.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 462426, "question_id": "Ucmx8NTSxuebuKzCs9dPsz", "question": "Based on what's shown on the computer screen what is this person doing?", "choices": ["writing fiction", "shopping list", "programming", "gaming"], "correct_choice_idx": 2, "direct_answers": ["programming", "hacking", "watching", "program", "things", "coding", "tracking data", "programming", "taking inventory", "working"], "difficult_direct_answer": true, "rationales": ["There appears to be lines of code on the computer which is something a person doing answer a might encounter.", "A computer screen is black and there are rows of white text on the screen.", "They have a lot of code listed"], "image": "train2014/COCO_train2014_000000462426.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58483, "question_id": "Ud8TxSUMYd6Sa84dXqoQtR", "question": "What is in front of the elephant's tusks?", "choices": ["rocks", "branches", "bark", "grass"], "correct_choice_idx": 0, "direct_answers": ["nose", "rocks", "trunk", "rocks", "rocks", "rocks", "rocks", "rock", "elephant's tusks", "rocks"], "difficult_direct_answer": false, "rationales": ["He is standing in front of rocks", "There are big boulders in front of the tusks.", "There are a bunch of rocks in front of the elephant."], "image": "val2014/COCO_val2014_000000058483.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398489, "question_id": "Uebn8XP2zuq2uc3PMmmmse", "question": "What animal is most closely related to this one?", "choices": ["echidna", "slug", "eagle", "tiger"], "correct_choice_idx": 3, "direct_answers": ["lion", "cougar", "cat", "tiger", "cat", "tiger", "tiger", "dog", "cat", "tiger"], "difficult_direct_answer": false, "rationales": ["This animal is a cat. eagles, echidnas, and slugs are not closely related to cats.", "Tigers are big cats.", "That animal is part of the feline family."], "image": "val2014/COCO_val2014_000000398489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14990, "question_id": "UeuQatkNU4vndFPPX6gZqQ", "question": "How many elephants are following after the man wearing a white hat?", "choices": ["three", "four", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The animals are clearly visible and located behind the man in the image. they are countable based on their outlines.", "A couple of elephants are following the man.", "The man has two large elephants walking behind him."], "image": "val2014/COCO_val2014_000000014990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251032, "question_id": "UgxyFr6czgbJYbPxAWaQvH", "question": "What color are the oars hanging off the rear of these boats in the muddy water?", "choices": ["blue", "black", "purple", "red"], "correct_choice_idx": 1, "direct_answers": ["black", "black", "black", "black", "red", "black", "blue", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The color is black.", "They are the darkest color in the area", "The oars are not blue, purple, or red."], "image": "train2014/COCO_train2014_000000251032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313186, "question_id": "UieF2GJKMdQe4oBHwAjgnE", "question": "What animal is on the kite?", "choices": ["whale", "echidna", "dog", "snake"], "correct_choice_idx": 0, "direct_answers": ["whale", "dolphins", "killer whale", "whale", "fish", "whale", "dolphin", "dolphin", "whale", "whale"], "difficult_direct_answer": false, "rationales": ["The whale is on the kite.", "A large, white and black marine animal is on a kite.", "The whale is on the kite."], "image": "train2014/COCO_train2014_000000313186.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 376263, "question_id": "UifFavN4WkkD6eC4NvQMLs", "question": "What is on the fire hydrant?", "choices": ["elephant", "face", "dog", "cat"], "correct_choice_idx": 1, "direct_answers": ["face", "paint", "painting", "graffiti", "face", "face", "road", "face", "face", "person"], "difficult_direct_answer": false, "rationales": ["The hydrant has a face.", "The other options aren't represented here.", "It has eyes, a nose and a mouth."], "image": "train2014/COCO_train2014_000000376263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197163, "question_id": "UigvKcEJcBDBwsxaih3WLE", "question": "How many zebras are standing in the way of the path?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The zebras are clearly visible and countable based on their distinct outlines.", "Each zebra has four legs. there are twelve legs in total.", "There are 3."], "image": "val2014/COCO_val2014_000000197163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533224, "question_id": "UjakcaE5cur9LkJKqGdwYB", "question": "What color is the underbelly of the brown cow with poop on its butt?", "choices": ["black", "brown", "white", "blue"], "correct_choice_idx": 2, "direct_answers": ["white", "brown", "white", "white", "white", "white", "brown", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["It's the same color as its back foot", "The color is white.", "The underbelly does not match the brown fur and is not blue or black."], "image": "train2014/COCO_train2014_000000533224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472320, "question_id": "Um3njZxiRRYPMTDBYPHfDh", "question": "In what country would you find this type of cuisine?", "choices": ["australia", "mexico", "united states", "united kingdom"], "correct_choice_idx": 2, "direct_answers": ["usa", "america", "donut", "america", "united states", "america", "america", "usa", "united states", "america"], "difficult_direct_answer": false, "rationales": ["The country is the us.", "The type of food in the picture is american.", "The dish is made by americans."], "image": "train2014/COCO_train2014_000000472320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481130, "question_id": "UoyrCSBF5jLzZRRzgEP2h5", "question": "How many beach chairs are grouped together for each umbrella?", "choices": ["one", "three", "two", "four"], "correct_choice_idx": 3, "direct_answers": ["ten", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are two chairs on either side of the umbrella and chair grouping in the foreground.", "Two and two chairs are together.", "There are four chairs on the beach."], "image": "val2014/COCO_val2014_000000481130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550532, "question_id": "UppNfPYPopGhhwVH3VAsyx", "question": "Which meter has the higher number on it?", "choices": ["leftmost", "rightmost", "center", "third one"], "correct_choice_idx": 0, "direct_answers": ["sixty eight", "left 68", "left", "left", "68 left", "leftmost", "left", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["It is number 68 and the other is 67", "The left meter has the higher one.", "There are two meters. 68 is higher than 67."], "image": "train2014/COCO_train2014_000000550532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473773, "question_id": "UtWVvecavjQjezzGLxik8M", "question": "What color are the poles dragged around by the young child with his skis?", "choices": ["black", "orange", "white", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["This is obvious by just looking at the poles.", "They are the color of apples", "There are only two poles visible being dragged by the child and their color is answer d."], "image": "train2014/COCO_train2014_000000473773.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448319, "question_id": "Uu3M4h8bFZ3tYdjny2dUwG", "question": "What part of the animal in the foreground is closest to the ground?", "choices": ["horn", "tail", "tusk", "wing"], "correct_choice_idx": 1, "direct_answers": ["hoof", "hooves", "face", "feet", "tail", "head", "feet", "snout", "feet", "zebra"], "difficult_direct_answer": false, "rationales": ["The end of these zebra's tails are the closest feature here listed to the grass.", "The part is the tail.", "The tail is the part that is very close to the ground."], "image": "val2014/COCO_val2014_000000448319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526364, "question_id": "UutkyXMHJ6kuE4hy2zhZfH", "question": "What is most likely to be cold inside?", "choices": ["fridge", "oven", "cupboard", "door"], "correct_choice_idx": 0, "direct_answers": ["fridge", "refrigerator", "fridge", "refrigerator", "refrigerator", "refidgerator", "refrigerator", "freezer", "fridge", "freezer"], "difficult_direct_answer": false, "rationales": ["The rectangular white device with two doors and a handle running down the two of them is a refrigerator. a refrigerator's purpose is keeping the things inside of it cold.", "The question doesn't make sense, but a is the only thing that gets cold.", "The image contains a kitchen with cabinets and appliances. the appliance that is visible and known to keep things cold on the inside is answer a."], "image": "val2014/COCO_val2014_000000526364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355740, "question_id": "UuytdEYCx2p3c4uFsHjxzZ", "question": "What animal is between the giraffes?", "choices": ["goose", "cow", "zebra", "salamander"], "correct_choice_idx": 2, "direct_answers": ["zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "algebra", "zebra"], "difficult_direct_answer": false, "rationales": ["You can tell by the colors and stripes as to what type of animal it is.", "The horse shaped animal has stripes.", "There is a striped equine animal between the giraffes."], "image": "train2014/COCO_train2014_000000355740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164, "question_id": "UvRav4rzRVtCYhTrxZqzh4", "question": "What is made in this room?", "choices": ["food", "sewage", "arcade machines", "samurai swords"], "correct_choice_idx": 0, "direct_answers": ["food", "food", "meals", "food", "food", "food", "food", "food", "food", "food"], "difficult_direct_answer": false, "rationales": ["This is a kitchen. meals are prepared in the kitchen.", "The room has food.", "The room is a kitchen with cooking appliances."], "image": "val2014/COCO_val2014_000000000164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 313520, "question_id": "UvXgiVpzi2ifaV3uHrZqrM", "question": "What term is appropriate to describe this animal?", "choices": ["feline", "crustacean", "bovine", "canine"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "dog", "dog", "black dog", "dog", "canine", "dog", "cute", "loyal"], "difficult_direct_answer": false, "rationales": ["The term is a dog.", "The other options aren't represented here. a is another word for dog as well.", "The animal is also called a dog."], "image": "train2014/COCO_train2014_000000313520.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8093, "question_id": "Uw6AGE6LJKTfUUUsi5FzDs", "question": "What is the brown and white animal doing with its neck in the air?", "choices": ["drinking", "getting angry", "sleeping", "consuming leaves"], "correct_choice_idx": 3, "direct_answers": ["giraffe", "eating", "eating", "eating", "eating", "eating", "eating", "consuming leaves", "eating", "reaching eating"], "difficult_direct_answer": false, "rationales": ["The animal is eating leaves.", "The animal is eating leaves.", "Giraffes have long necks so that they can consume greenery that is up high. they are standing under a tree where it is available."], "image": "train2014/COCO_train2014_000000008093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468005, "question_id": "UzFmKz38dWncQwnEY9DVvA", "question": "What color are the lobes of the elephant's ears?", "choices": ["green", "white", "pink", "blue"], "correct_choice_idx": 2, "direct_answers": ["pink", "pink", "black", "black color", "pink", "pink", "light brown", "pinkish brown", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["The lobes of the elephant's ears that aren't covered in mud are pink.", "His lobes of his ears are pink", "This is common with elephants."], "image": "val2014/COCO_val2014_000000468005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308760, "question_id": "Uzm4ySWYgD8yX9H6Jh82mY", "question": "What does the very large toy resemble?", "choices": ["cow", "dog", "horse", "elephant"], "correct_choice_idx": 1, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The toy breed is a german shepard.", "The animal has sharp teeth and is sitting similar to a pooch. it has somewhat pointed ears as well.", "It looks a little like a german shepherd dog"], "image": "train2014/COCO_train2014_000000308760.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395405, "question_id": "V2R6fBbxkhrpKPXXzkNgek", "question": "What color is the sofa at the one narrow end of the coffee table?", "choices": ["blue", "yellow", "red", "white"], "correct_choice_idx": 0, "direct_answers": ["brown", "grey", "brown", "green", "brown", "blue", "brown", "gray", "red", "brown"], "difficult_direct_answer": false, "rationales": ["There is a blue sofa at the narrow end of the coffee table.", "The sofa looks to be a red color", "It is blue."], "image": "val2014/COCO_val2014_000000395405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156128, "question_id": "V3GyyrXvgxsYxTFSS2mS7C", "question": "What kind of citrus fruit are these indicated by their relative size and shape?", "choices": ["grapefruit", "lemons", "mandarins", "limes"], "correct_choice_idx": 2, "direct_answers": ["mandarin", "orange", "orange", "orange", "oval", "oranges", "tangerines", "mandarins", "clementines", "oranges"], "difficult_direct_answer": false, "rationales": ["Based on the size, shape and color, only answer a from the list of possible answers meets the criteria.", "These are small versions of oranges and very sweet", "They are small oranges."], "image": "train2014/COCO_train2014_000000156128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453481, "question_id": "V3hcTXDwvPYCcb3fRTdTVw", "question": "What color are the sleeves of the female bike rider?", "choices": ["black", "pink", "green", "blue"], "correct_choice_idx": 1, "direct_answers": ["red", "orange", "red", "red", "red", "pink", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["Her shirt is blue, but the sleeves are a different color. her sleeves are not black or green.", "They are similar to red like cherries", "The color is pink."], "image": "val2014/COCO_val2014_000000453481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302806, "question_id": "V3mTC44nAGv5a456ZgAFgW", "question": "What is the purpose of the black and silver square appliance?", "choices": ["cleaning", "storage", "sorting", "cooking"], "correct_choice_idx": 3, "direct_answers": ["toast food", "toasting", "toast", "toasting bread", "heat food", "cook", "toast", "toaster", "cooking", "heat food"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to a toaster oven, which is the name of this object.", "This is a microwave oven that heats up food", "The appliance is a toaster oven."], "image": "val2014/COCO_val2014_000000302806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29913, "question_id": "V4E6oKTbUsQHWUbaD3gSKK", "question": "What color is the top of the fire hydrant with eye decals on the front?", "choices": ["silver", "green", "white", "blue"], "correct_choice_idx": 0, "direct_answers": ["grey", "silver", "silver", "silver", "silver", "grey", "grey", "silver gray", "silver", "silver"], "difficult_direct_answer": false, "rationales": ["It's the same color as the pole", "The shiny metallic color on the top of this fire hydrant could be called silver.", "The top of the fire hydrant is silver."], "image": "val2014/COCO_val2014_000000029913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148668, "question_id": "V4pwcPvEGAJYKE743BHYzP", "question": "What allows this bike to be visible at night?", "choices": ["blinkers", "handlebar", "horn", "bike chain"], "correct_choice_idx": 0, "direct_answers": ["light", "bike", "lights", "street light", "reflector", "blinkers", "reflectors", "headlight", "reflectors", "reflectors"], "difficult_direct_answer": false, "rationales": ["There are lights that help keep the bike visible in low light.", "The bike has blinkers.", "The bicycle has red reflectors on the wheels that blink in the dark so the rider is visible."], "image": "val2014/COCO_val2014_000000148668.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284252, "question_id": "V6FkKSZcAUKdF7MzdeyCtc", "question": "What has the long stem?", "choices": ["vase", "cherry", "red rose", "dandelion"], "correct_choice_idx": 1, "direct_answers": ["cherry", "cherry", "cherry", "fruit", "cherry", "cherry", "cherry", "cherry", "cherry", "cherry"], "difficult_direct_answer": false, "rationales": ["It's a cherry.", "The cherry has a long stem on it.", "There is only one item with a stem visible. it is the size, shape and color consistent with answer a."], "image": "train2014/COCO_train2014_000000284252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 482756, "question_id": "V6Kn5jai7BAAt2jL59DVDS", "question": "What are the animals near?", "choices": ["old man", "fence", "eggs", "baby"], "correct_choice_idx": 1, "direct_answers": ["fance", "fence", "fance", "house", "road", "fence", "road", "sheep", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["The animals are near a fence. the sheep are close.", "The animals are by a fence.", "They are in an enclosure"], "image": "train2014/COCO_train2014_000000482756.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90196, "question_id": "V7pBpTuMd9ZWhxEwkyxsJi", "question": "Why is the elephant laying down on the left with the tourist on top?", "choices": ["tired", "sleepy", "sick", "afraid"], "correct_choice_idx": 0, "direct_answers": ["easier access", "get down", "resting", "resting", "getup", "peoples", "grass", "ground", "tired", "tired"], "difficult_direct_answer": false, "rationales": ["The elephant is very tired.", "The elephant has a person riding on it. the elephant must be tired.", "He is tired."], "image": "train2014/COCO_train2014_000000090196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235575, "question_id": "V8fFZcGibcQCEDnj8dyiV6", "question": "Which item on the plate likely is highest in vitamins and minerals?", "choices": ["grits", "crab cake", "lemon", "broccoli"], "correct_choice_idx": 3, "direct_answers": ["greens", "egg", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli", "broccoli"], "difficult_direct_answer": false, "rationales": ["Broccoli is good for you", "The item is broccoli.", "Broccoli is very good for you."], "image": "val2014/COCO_val2014_000000235575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548957, "question_id": "V9Tidg8rxwL3KWaUcKqrkU", "question": "What color is the round fruit underneath of the apples?", "choices": ["red", "green", "blue", "orange"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "yellow", "apple", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["They are orange.", "The color is orange.", "The round fruit is a common citrus fruit."], "image": "val2014/COCO_val2014_000000548957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248280, "question_id": "VAp4M4L6q9YBuaNBns2QeJ", "question": "How many men are sharing the motorcycle together?", "choices": ["four", "one", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two men on the bike.", "A couple of happy motorcycle riders are cruising down a quiet city street. the first motorcycle was invented in germany in 1885.", "It's obvious in the image."], "image": "train2014/COCO_train2014_000000248280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 210761, "question_id": "VB37yERe6WbFrqXRYZQR92", "question": "What is the man touching?", "choices": ["cat", "apple", "dog", "refrigerator door"], "correct_choice_idx": 3, "direct_answers": ["fridge", "fridge", "refrigerator", "fridge", "refrigerator", "refrigerator door", "fridge", "refrigerator", "refrigerator", "refrigerator"], "difficult_direct_answer": false, "rationales": ["The appliance is opened by this method. it is in the kitchen which is typically where this appliance is found and it is opened in this type of way.", "The man is in a kitchen. he is looking inside a white appliance.", "The man is touching the fridge door."], "image": "val2014/COCO_val2014_000000210761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132860, "question_id": "VCwqaH8isLTK8afuNoRrHV", "question": "What color is the netting on this purse?", "choices": ["green", "yellow", "blue", "red"], "correct_choice_idx": 1, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yello", "yellow", "yellow", "orange", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The yellow netting is yellow.", "The purse is made of yellow netting and has a yellow inside material.", "The color is yellow."], "image": "val2014/COCO_val2014_000000132860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513191, "question_id": "VE9Jo7EdWGLgnhxcpfUk4o", "question": "What is in the white cup?", "choices": ["soy sauce", "jelly", "syrup", "soda"], "correct_choice_idx": 2, "direct_answers": ["syrup", "syrup", "coffee", "syrup", "syrup", "syrup", "syrup", "syrup", "syrup", "syrup"], "difficult_direct_answer": false, "rationales": ["The food being served is french toast based on its appearance. answer a is commonly served with french toast and would be served in the manner seen.", "The cup has syrup.", "This is french toast and that's a common topping"], "image": "train2014/COCO_train2014_000000513191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464075, "question_id": "VEP3qdvRr7qF5edMF6owFF", "question": "These boats are most likely in what country given their names?", "choices": ["france", "spain", "england", "germany"], "correct_choice_idx": 1, "direct_answers": ["canada", "spain", "brazil", "british", "spain", "spain", "france", "victoria", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["The names given are in spanish language.", "They speak spanish there.", "The boats are in spain."], "image": "train2014/COCO_train2014_000000464075.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 440329, "question_id": "VHH3E7tB4BRcZvBngtVhA9", "question": "Where are the people standing?", "choices": ["boxing ring", "junkyard", "pool", "parking lot"], "correct_choice_idx": 3, "direct_answers": ["parking lot", "street", "lot", "parking lot", "parking lot", "parking lot", "parking", "packing", "parking lot", "parking lot"], "difficult_direct_answer": false, "rationales": ["The people are standing in a parking lot full of cars.", "There are many cars that are in good shape. there is no pool or boxing ring.", "The people are in a parking lot."], "image": "val2014/COCO_val2014_000000440329.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345009, "question_id": "VJ3uNF3PvSiSt73HBNm8zu", "question": "What is on the bottom of the airplane that enables it to operate in water?", "choices": ["wheels", "balloons", "skis", "logs"], "correct_choice_idx": 2, "direct_answers": ["buoy", "float", "skis", "floats", "floats", "landing pad", "floats", "raft", "keel", "pontoons"], "difficult_direct_answer": false, "rationales": ["There are air filled pontoons on the bottom of the airplane that float in the water.", "The bottom has skis.", "The large skis help the plane float on water."], "image": "train2014/COCO_train2014_000000345009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289400, "question_id": "VJYAK8eMbkXtr5LLFiPEVQ", "question": "What store is this store competing with?", "choices": ["office max", "dunkin donuts", "best buy", "ibm"], "correct_choice_idx": 1, "direct_answers": ["donut", "dunkin donuts", "krispy kreme", "donut stores", "bakery", "krispy kreme", "donuts", "dunkin doughnuts", "dunking donuts", "dunkin donuts"], "difficult_direct_answer": false, "rationales": ["A business with a large donut is on the roof.", "This store's name of donut king and it's massive sculpture in the shape of a donut tell us it sells donuts. dunkin donuts also sells donuts.", "Both shops sell the same product. the sign is in the shape of a doughnut suggesting what is sells."], "image": "val2014/COCO_val2014_000000289400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536514, "question_id": "VKMEEsqwtDD9Hpki7BtS2P", "question": "What is the name of the operating system for both of these computers?", "choices": ["google", "windows", "linux", "mac"], "correct_choice_idx": 3, "direct_answers": ["mac os", "apple", "laptop", "os", "macos", "mac", "mac", "mac", "computers", "mac os"], "difficult_direct_answer": false, "rationales": ["The name is the mac.", "The apple logo is mac.", "The icons are all at the bottom of the screen."], "image": "train2014/COCO_train2014_000000536514.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390157, "question_id": "VLTTtNRjyrBn5js6DdiHYz", "question": "What baseball team's name appears on the bottle?", "choices": ["cardinals", "brewers", "rays", "mets"], "correct_choice_idx": 1, "direct_answers": ["brewers", "brewers", "brewers", "milwaukee brewers", "brewers", "brewers", "brewers", "milwaukee", "brewers", "brewers"], "difficult_direct_answer": false, "rationales": ["The mlb team from milwaukee has their team's name on the bottle.", "It has the same name", "The bottle in the foreground between glasses reads brewers special reserve. the brewers are a baseball team."], "image": "val2014/COCO_val2014_000000390157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92624, "question_id": "VLaZPux3CyNsFMFqU7KubT", "question": "Which animals are near the trees?", "choices": ["giraffes", "cats", "salamanders", "echidnas"], "correct_choice_idx": 0, "direct_answers": ["giraffes", "giraffes", "giraffes", "giraffe", "giraffes", "giraffe", "giraffes", "giraffe", "giraffes", "giraffes"], "difficult_direct_answer": false, "rationales": ["The animals are giraffes.", "There is a group of giraffes standing near the trees.", "Giraffes are near the trees."], "image": "val2014/COCO_val2014_000000092624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49860, "question_id": "VMANEKjZi76juVpVQgLCsG", "question": "What is on the computer screen?", "choices": ["orange", "sticky note", "cat portrait", "ketchup stain"], "correct_choice_idx": 1, "direct_answers": ["words", "yellow sticky", "sticky note", "sticky note", "window", "sticky note", "information", "words", "website", "document"], "difficult_direct_answer": false, "rationales": ["There is a note on the screen.", "There's a yellow sticky note on the top of the computer screen.", "It is a yellow square of paper with some weak glue across the top"], "image": "train2014/COCO_train2014_000000049860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132686, "question_id": "VMKhFeqgrT9h2HmHkTNHWb", "question": "What is rising in the air?", "choices": ["airplane", "balloon", "kite", "butterfly"], "correct_choice_idx": 2, "direct_answers": ["kites", "kite", "kite", "kites", "kite", "kites", "kites", "kites", "kites", "kites"], "difficult_direct_answer": false, "rationales": ["These are cloth being held by strings", "The kite is rising.", "There are colorful kites flying in the sky."], "image": "val2014/COCO_val2014_000000132686.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201291, "question_id": "VMnbxYQkCWiZq8U4aGtwix", "question": "What is touching the broccoli?", "choices": ["cats paw", "pasta noodles", "clowns nose", "babys hand"], "correct_choice_idx": 1, "direct_answers": ["beef pasta", "pasta", "noodles", "noodles", "noodles", "noodles", "meat", "carrot", "noodles", "pasta noodles"], "difficult_direct_answer": false, "rationales": ["The pasta touches the broccoli.", "There are some pasta noodles next to the broccoli.", "The other options make no sense and don't apply to a plate of food."], "image": "train2014/COCO_train2014_000000201291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329946, "question_id": "VN95sdmxCHUgYuTLBwFsDv", "question": "What color is the stripes on the three wings of the nearby airplane-shaped kite?", "choices": ["orange", "white", "blue", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "green red", "red green", "brown", "black", "yellow", "red", "red", "red", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "There is a large airplane shaped kite closest in view. it has red and yellow squares on its wings and red on tail.", "This is obvious in the scene."], "image": "val2014/COCO_val2014_000000329946.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336749, "question_id": "VNCCe2vxwhuqspUwur4gVS", "question": "What heats the stove for cooking?", "choices": ["wood", "natural gas", "coal", "electricity"], "correct_choice_idx": 3, "direct_answers": ["electricity", "electricity", "food", "oven", "electric", "electric", "electricity", "oven", "gas", "fire"], "difficult_direct_answer": false, "rationales": ["A stove has a flat top.", "The oven is a non gas type of oven and needs electricity.", "A kitchen has a range with a flat top rather than grates with pilot lights."], "image": "train2014/COCO_train2014_000000336749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130972, "question_id": "VNSaoZMwYY3ZzGQpFYYECe", "question": "What color are the highlights in the hair of the person eating the hot dog?", "choices": ["blonde", "white", "milktea", "brunette"], "correct_choice_idx": 0, "direct_answers": ["yellow", "blonde", "blonde", "blonde", "brown", "blonde", "blonde", "blonde", "blonde", "gold"], "difficult_direct_answer": false, "rationales": ["This person has light highlights in their hair.", "The person with the hot dog in their mouth has blonde highlights.", "Her regular hair is brown. her highlights are a different color and are not white or milktea."], "image": "train2014/COCO_train2014_000000130972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237818, "question_id": "VP7SN4C3XyU9Z3T3vwDzPm", "question": "How many bottles of wine are to the right in front to the man who is cutting the pizza?", "choices": ["one", "three", "two", "four"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "They're not right in front of him. they're to the side of him on the counter.", "The bottles are identifiable by their standard shape, size and the corks visible on the top. in the location specified, the number of bottles is countable."], "image": "train2014/COCO_train2014_000000237818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372673, "question_id": "VQcjSAG3pyP8YzPcqK4B3n", "question": "What animal is most similar to these?", "choices": ["horse", "echidna", "leopard", "sugar glider"], "correct_choice_idx": 0, "direct_answers": ["horse", "horse", "horse", "horse", "horse", "hoarse", "horse", "donkey", "zebra", "horse"], "difficult_direct_answer": false, "rationales": ["The animal looks like a horse.", "They have the same basic size and shape as these animals", "Horses are most similar to zebras."], "image": "train2014/COCO_train2014_000000372673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483000, "question_id": "VQxZAhSiEXRkRP7mQdbbYZ", "question": "What are the pants type the man is wearing?", "choices": ["overalls", "stonewashed jeans", "slacks", "khakis"], "correct_choice_idx": 0, "direct_answers": ["belse", "overalls", "stop", "protect", "overalls", "work clothes", "overalls", "apron", "overalls", "overalls"], "difficult_direct_answer": false, "rationales": ["The man is wearing overalls.", "The other options don't apply with his actions or this image.", "A man is walking in pants that continue up over his chest and have straps that go over his shoulders."], "image": "train2014/COCO_train2014_000000483000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108850, "question_id": "VTp38LzJpHjgnKuFHxZD68", "question": "How many women with bikinis are riding on horseback on the beach?", "choices": ["four", "two", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "dwdw", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are a couple of women in bikinis on horses.", "One woman on horseback is on the left. an additional woman on horseback is on the right.", "There are two women."], "image": "train2014/COCO_train2014_000000108850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345831, "question_id": "VUL5hkcaCiX9y6e7vAce5r", "question": "How does the flying object stay in the air?", "choices": ["rain", "sun", "wind", "snow"], "correct_choice_idx": 2, "direct_answers": ["wind", "wind", "string", "kite", "one", "degree", "wind", "kite", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["A person is holding a kite up in the air.", "The object is a kite and the different motions in the air currents keep it floating in the air.", "The object is light enough that air will keep it aloft."], "image": "train2014/COCO_train2014_000000345831.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485605, "question_id": "VUqNNQ8cZkGR2DaGdrYsAT", "question": "What is the silver rectangular object on the counter?", "choices": ["tissue box", "candy dish", "power box", "soap dish"], "correct_choice_idx": 3, "direct_answers": ["sink", "soap dish", "box", "stop", "soap dish", "tin", "mirror", "soap dish", "soap dish", "soap dish"], "difficult_direct_answer": false, "rationales": ["The silver object is a dish used to hold soap.", "The object is the soap dish.", "A bathroom vanity has a silver, square dish next to the faucet."], "image": "train2014/COCO_train2014_000000485605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 193629, "question_id": "VV8qTYfq4XELS6URJ9zsQi", "question": "What color is the lateral stripe around the hull of the blue boat?", "choices": ["red", "black", "yellow", "green"], "correct_choice_idx": 0, "direct_answers": ["blue", "brown", "blue", "brown", "red", "brown", "red", "blue", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The other colors don't appear in the image. people often paint boats these colors.", "There is only one visible horizontal stripe visible that goes around the entire blue boat and it is answer a."], "image": "train2014/COCO_train2014_000000193629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102171, "question_id": "VXJgchAbQZUMFxCfpgV3zE", "question": "How many computer screens are on top of the desk?", "choices": ["five", "three", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "two screen", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two screens.", "The visible monitors are only two.", "There are 2."], "image": "train2014/COCO_train2014_000000102171.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254644, "question_id": "VYgeUYamUBNmHc7bBLaBwe", "question": "How many drawers are in the bottom cabinet of this kitchen?", "choices": ["four", "two", "one", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "brown", "three", "three"], "difficult_direct_answer": false, "rationales": ["They are part of the cabinets and easy to count", "There are 3 door knobs.", "They start small and get larger as it goes down"], "image": "val2014/COCO_val2014_000000254644.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189660, "question_id": "VYkLPGW4SMaNaqcg5g9UEp", "question": "What is obscured by the grass?", "choices": ["zebras", "elephants", "moose", "cows"], "correct_choice_idx": 0, "direct_answers": ["zebras", "zebra", "zebras", "zebra", "zebras", "zebras", "zebra", "zebra", "zebras", "zebras"], "difficult_direct_answer": false, "rationales": ["Answer a is visible over the top of the grass, but is obscuring part of the animals. the animals are answer a based on their unique features, size and shape.", "There are animals with black and white stripes grazing in the grass.", "The zebras are hidden."], "image": "train2014/COCO_train2014_000000189660.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35711, "question_id": "VZsgSBZ2zhSN5eEMxpLx2r", "question": "How many people are in wetsuits standing before the crashing wave?", "choices": ["two", "four", "three", "one"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "four", "three", "not load", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are less than four but more than two people standing.", "There are three people in wetsuits near the ocean.", "There are three people."], "image": "val2014/COCO_val2014_000000035711.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189853, "question_id": "VZwao6tr9sPdCoRb7SDgYh", "question": "What kind of fencing encloses these giraffes in the zoo?", "choices": ["stone", "chain link", "wooden", "electrified wire"], "correct_choice_idx": 1, "direct_answers": ["metal fence", "chain link", "chain link", "electric", "metal", "electric", "steel", "metal", "height", "metal"], "difficult_direct_answer": false, "rationales": ["It is metal fencing", "The fence is metal with chain links.", "This is a chain link fence to keep the animals in."], "image": "train2014/COCO_train2014_000000189853.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446827, "question_id": "VeG2dhNCDPPYgNqGpdZVwQ", "question": "What does the front of the automobile shown in this image most resemble?", "choices": ["rainbow", "autumn", "circus", "sunset"], "correct_choice_idx": 0, "direct_answers": ["car", "bicycle", "rainbow", "rainbow", "rainbow", "rainbow", "rainbow", "rainbow", "color", "rainbow"], "difficult_direct_answer": false, "rationales": ["(a) rainbow. the stripes on the front of the auto are similar or the same as the colors you see in a rainbow.", "The other options don't match the paint pattern.", "The front has a rainbow."], "image": "train2014/COCO_train2014_000000446827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38092, "question_id": "VegRuTobU6FJ89KZDW6r88", "question": "What is the little elephant walking on top of?", "choices": ["rocks", "dirt", "grass", "pavement"], "correct_choice_idx": 3, "direct_answers": ["street", "road", "road", "road", "road", "dirt", "road", "pavement", "road", "pavement"], "difficult_direct_answer": false, "rationales": ["Although the little elephant is headed toward the grass and dirt, it is currently still on the paved road.", "This is an asphalt road", "The elephant is on pavement."], "image": "val2014/COCO_val2014_000000038092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407460, "question_id": "VgQWinGbG7GeBP9R8TZ3CN", "question": "What would be a more appropriate title for the larger painting on the wall?", "choices": ["yellow submarine", "fast car", "pogo stick", "army tank"], "correct_choice_idx": 1, "direct_answers": ["boxing", "jeep fun", "family time", "red car", "mini", "fast car", "poster", "car riding", "desert trip", "race car"], "difficult_direct_answer": true, "rationales": ["The fast car is more appropriate.", "A modern fast car would be more appropriate.", "The picture is of people driving some kind of sports car."], "image": "train2014/COCO_train2014_000000407460.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 473915, "question_id": "VgrCLgNCSvetpcjVTBUerH", "question": "What kind of vegetable is between the bread and the meat on top of the white plate?", "choices": ["red", "purple", "orange", "green"], "correct_choice_idx": 2, "direct_answers": ["potato", "carrot", "carrot", "carrot", "carrot", "carrot", "orange", "carrot", "carrot", "carrot"], "difficult_direct_answer": false, "rationales": ["It's a carrot, but for some reason the options only list colors.", "It is a carrot.", "The orange vegetables are between."], "image": "val2014/COCO_val2014_000000473915.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498297, "question_id": "VhHnocFFhDDpubqRWWNamz", "question": "How many sheep are here with horns?", "choices": ["two", "one", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three animals in close view.", "There are three sheep with horns.", "One horned sheep is in between two others."], "image": "train2014/COCO_train2014_000000498297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279646, "question_id": "VhJQLPYbViHgfEKj5zhqiu", "question": "How many waves are at the extension of the surf beyond which there is a man surfing?", "choices": ["two", "one", "four", "three"], "correct_choice_idx": 0, "direct_answers": ["one", "two", "three", "two", "two", "two", "two", "three", "one", "two"], "difficult_direct_answer": false, "rationales": ["There are two waves up.", "There are two waves extended out from where the man is currently surfing.", "There are two waves in the ocean."], "image": "train2014/COCO_train2014_000000279646.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31255, "question_id": "Vha42796Wxrxb2MMTTDs4f", "question": "What color is the middle of the three horse's coat?", "choices": ["black", "chestnut", "white", "pinto"], "correct_choice_idx": 0, "direct_answers": ["black", "black", "brown", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["A black horse is standing with a horse on either side of it.", "The horse in the middle is very dark in color.", "The three horses are clearly visible and the orientation to either other is clear. the one located in the middle has a identifiable coloring."], "image": "val2014/COCO_val2014_000000031255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55241, "question_id": "Vhm7LxDoPN5cd2B7kHdoCK", "question": "What has the big ears?", "choices": ["cat", "cow", "baby", "elephant"], "correct_choice_idx": 1, "direct_answers": ["cow", "cow", "cow", "cow", "cow", "cow", "cow", "left ear", "cows", "cow"], "difficult_direct_answer": false, "rationales": ["The cow has big ears.", "Large brown animals in a pasture are brown. cows are generally kept in pastures.", "The cow has ears."], "image": "val2014/COCO_val2014_000000055241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149500, "question_id": "ViaLcgEXzVsJgzykmKAkKY", "question": "What is the man looking at?", "choices": ["apple", "cow", "phone", "baby"], "correct_choice_idx": 2, "direct_answers": ["phone", "cell phone", "phone", "phone", "phone", "flip phone", "cell phone", "phone", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["The man is holding an electronic device, not a baby, cow, or apple.", "A man is holding a small electronic device in his hand that has a screen and folds open and closed.", "He is looking at his phone."], "image": "val2014/COCO_val2014_000000149500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478142, "question_id": "VjjQwpQ2TnUnBemGrdmyPc", "question": "What kind of highway does the motorcycle ride upon?", "choices": ["dirt", "interstate", "gravel", "town"], "correct_choice_idx": 1, "direct_answers": ["interstate", "paved", "not busy", "highway", "roadway", "interstate", "four lane", "interstate", "unsure", "tar"], "difficult_direct_answer": false, "rationales": ["The motorcycle is at the interestate.", "The highway is paved and has barriers.", "The highway is an interstate one."], "image": "train2014/COCO_train2014_000000478142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52759, "question_id": "VkXQSAWYMhZnGR8PPo96Gn", "question": "What company logo is featured on the square item all the way to the left?", "choices": ["huffy", "mcdonalds", "united", "ford"], "correct_choice_idx": 2, "direct_answers": ["united airlines", "indigo", "united", "united", "united", "united", "country", "united", "united", "usa"], "difficult_direct_answer": false, "rationales": ["There is a large emblem on the tail of the airplane in far distance that says united with logo.", "The name is on the planes", "The square item to the left belongs to the airline whose name is spelled out on the side."], "image": "val2014/COCO_val2014_000000052759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312, "question_id": "VnW7LvJ8yjW8H27zG4xDUR", "question": "What are a group of these animals called?", "choices": ["herd", "school", "flock", "clowder"], "correct_choice_idx": 0, "direct_answers": ["elephants", "elephants", "stop", "herd", "herd", "herd", "pack", "elephant", "elephants", "herd"], "difficult_direct_answer": false, "rationales": ["This is a group of several elephants", "These are elephants", "A group of elephants are stampeding around close to one another on a dirt path."], "image": "train2014/COCO_train2014_000000000312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504284, "question_id": "VogJgngNRMmUkL83BXid42", "question": "What are the people on the bikes holding?", "choices": ["babies", "kittens", "hands", "horns"], "correct_choice_idx": 2, "direct_answers": ["hands", "hands", "hands", "hands", "hands", "hands", "hands", "hands", "stop", "hands"], "difficult_direct_answer": false, "rationales": ["The people are holding hands.", "There is a couple that are grabbing hands as they bike down a path in the park.", "The people hold hands."], "image": "train2014/COCO_train2014_000000504284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 579156, "question_id": "VpbbxV4knSdcwSe22p4Ryv", "question": "What kind of fuel does the cow run on?", "choices": ["food", "firewood", "ethanol", "gas"], "correct_choice_idx": 0, "direct_answers": ["grass", "grass", "food", "grass", "grass", "grass", "blood", "water", "grass", "food"], "difficult_direct_answer": false, "rationales": ["Cows are fueled by food.", "Cows eat grass.", "All creatures need to eat in order to gain calories to burn."], "image": "train2014/COCO_train2014_000000579156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 201664, "question_id": "VqJAVPfngCHxrkxka4s7Xq", "question": "What are the reddish and green plants called in the forefront of the planter?", "choices": ["agapanthus", "pampas grass", "flax", "lily"], "correct_choice_idx": 2, "direct_answers": ["ferns", "flowers", "agave", "bush", "yucca", "green plamts", "bushes", "na", "flax", "palms"], "difficult_direct_answer": true, "rationales": ["There is a young girl posing with her dad. there standing in front of some plants that are pointy bushes as well as trees.", "That's how they grow.", "The plants are flax. flax appears are a reddish green color."], "image": "train2014/COCO_train2014_000000201664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147492, "question_id": "VqXXvsg2B92fY7UVeM2bH6", "question": "What color are the onions on the top left part of the white plate?", "choices": ["sweet", "yellow", "white", "purple"], "correct_choice_idx": 3, "direct_answers": ["purple white", "red", "red", "white red", "red", "red", "purple", "red", "white purple", "purple"], "difficult_direct_answer": false, "rationales": ["These are called red onions but are actually this color", "The color is purple.", "The onions are purple."], "image": "train2014/COCO_train2014_000000147492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312926, "question_id": "VrPvZC4UySmiWWyPQqHr3G", "question": "What is the person in green holding?", "choices": ["tray", "soda", "baton", "childs hand"], "correct_choice_idx": 3, "direct_answers": ["child hand", "childs hand", "childs hand", "kid", "childs hand", "hand", "jacket", "child", "jacket", "hand"], "difficult_direct_answer": false, "rationales": ["The person has a kid's hand.", "The person in green is holding a child's hand.", "The person in green is holding hands with the little boy."], "image": "train2014/COCO_train2014_000000312926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502090, "question_id": "Vs8P7ifiriNwfd2MRGMCQz", "question": "What is in the air?", "choices": ["parachutes", "airplanes", "kites", "frisbee"], "correct_choice_idx": 2, "direct_answers": ["kite", "kites", "kite", "kite", "kite", "kite", "kite", "kite", "kite", "kite"], "difficult_direct_answer": false, "rationales": ["The air has a kite.", "The colorful airborne items in this image are identifiable as kites.", "Kites are made from bright coloured fabric and fly in the air."], "image": "val2014/COCO_val2014_000000502090.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556476, "question_id": "VvSnPCMKtpendqERQvurLU", "question": "What is next to the lamppost?", "choices": ["dog", "elephant", "cat", "bench"], "correct_choice_idx": 3, "direct_answers": ["bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench", "bench"], "difficult_direct_answer": false, "rationales": ["It is characteristic by its l shape where people can sit down. it is found next to streets in public places.", "An l shaped seat can be seen directly attached to the lamppost.", "The object next to the light is clear and has features, design, shape and size consistent with answer a."], "image": "train2014/COCO_train2014_000000556476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314008, "question_id": "VwGQSkZki7sVFpiVpgbpmt", "question": "What is touching the pizza?", "choices": ["spoon", "fork", "pizza cutter", "knife"], "correct_choice_idx": 2, "direct_answers": ["pizza cutter", "cutter", "cutter", "pizza cutter", "pizza cutter", "hand", "pizza cutter", "cutter", "pizza cutter", "pizza cutter"], "difficult_direct_answer": false, "rationales": ["The person is using a round wheel with teeth to separate the pie into serving slices.", "A cutter is touching the pizza.", "This is a rolling blade used to cut food"], "image": "train2014/COCO_train2014_000000314008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211163, "question_id": "VwnWVzQD4X5T8GPUgBqiyG", "question": "What is the person getting ready to do?", "choices": ["pitch", "cook pizza", "golf", "shoot hoops"], "correct_choice_idx": 0, "direct_answers": ["thow ball", "pitch", "pitch", "pitch", "pitch", "catch", "throw", "pitch", "pitch", "pitch"], "difficult_direct_answer": false, "rationales": ["The person will pitch.", "The player is on the pitching mound and is getting ready to pitch the ball.", "The man is holding a ball in the baseball glove ready to throw."], "image": "val2014/COCO_val2014_000000211163.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443607, "question_id": "VwsZdmaExrwWXkfWnyZnFS", "question": "Why is the wagon in this area?", "choices": ["customer rides", "picking apples", "sleeping", "broke down"], "correct_choice_idx": 0, "direct_answers": ["car", "customer rides", "caring", "ride", "travel", "sightseeing", "transporting passengers", "horse cart", "to travel", "forest"], "difficult_direct_answer": true, "rationales": ["This is to provide a scenic tour", "There is a woman in a hat and passengers as they ride in back of a wood box being pulled by a horse.", "There is a customer giving rides."], "image": "train2014/COCO_train2014_000000443607.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3528, "question_id": "VyHPNAK2nifLNTRfwnPznf", "question": "What is the likely relation of the person pushing the stroller to the child in it?", "choices": ["grandmother", "mother", "cousin", "aunt"], "correct_choice_idx": 0, "direct_answers": ["grandma", "tired", "grandparent", "grandmother", "grandma", "grandparent", "friend", "grandmother", "grandparent", "grandma"], "difficult_direct_answer": false, "rationales": ["There is a woman in grey hair pushing a stroller among others in a field.", "The woman is elderly.", "The relation is the grandma."], "image": "train2014/COCO_train2014_000000003528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474448, "question_id": "W2bPCjHMP2EYuFV3us3Y5V", "question": "What is the man looking at?", "choices": ["dog", "bee", "cow", "baby"], "correct_choice_idx": 3, "direct_answers": ["child", "baby", "child", "child", "child", "child", "child", "child", "kid", "child"], "difficult_direct_answer": false, "rationales": ["The man is looking at a human, not a non-human animal.", "The man's looking at a baby.", "The man is looking at a baby."], "image": "val2014/COCO_val2014_000000474448.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240755, "question_id": "W6BqobgYLfb4z5weipUTzb", "question": "What is the white object to the right of the cat likely to be?", "choices": ["bicycle", "ironing board", "table", "toy"], "correct_choice_idx": 1, "direct_answers": ["iron board", "stand", "chair", "ironing board", "ironing board", "ironing board", "umbrella", "stand", "chair", "ironing board"], "difficult_direct_answer": false, "rationales": ["The object is for ironing.", "Looks to be the leg of a table.", "The slanted stand is foldable, so when you're done with the board, you can put it away. those are used to iron your clothes on."], "image": "train2014/COCO_train2014_000000240755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36237, "question_id": "W8kMVrxcTuSZGbNpHnMpte", "question": "What is wearing the hat?", "choices": ["dog", "horse", "woman", "baby"], "correct_choice_idx": 1, "direct_answers": ["hours", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse", "horse"], "difficult_direct_answer": false, "rationales": ["The hat is clearly visible and has been placed on top of answer a in a manner that looks like answer a is wearing it.", "The hat is for a horse.", "The hat is clearly visible on top of another thing. the thing in question is answer a based on the size, shape, features and the way it is attached to the carriage."], "image": "train2014/COCO_train2014_000000036237.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515289, "question_id": "W9PYBZdG9hgE3yhcuQr82r", "question": "What type of bird is on the street?", "choices": ["pigeon", "peacock", "magpie", "crow"], "correct_choice_idx": 0, "direct_answers": ["pigeon", "pigeon", "pigeon", "pigeon", "pigeon", "pigeon", "dove", "pigeon", "pigeon", "dove"], "difficult_direct_answer": false, "rationales": ["A bird with a slender, grayish blue head and neck and lighter colored wings is on a sidewalk in a park.", "The bird is a pigeon.", "These are common in cities and parks."], "image": "val2014/COCO_val2014_000000515289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5111, "question_id": "W9RajWXUtb2oc5rNUycYoV", "question": "What is near the window?", "choices": ["tub", "mouse", "canary", "cat"], "correct_choice_idx": 0, "direct_answers": ["candle", "candle", "candle", "bathtub", "bathtub", "tub", "bath", "candle", "candle", "sink"], "difficult_direct_answer": false, "rationales": ["A large, white vessel with a faucet and products in a tray hanging over it is in front of a window with blinds.", "The is a large basin for water and a human to take a bath in.", "It is in the bathroom."], "image": "train2014/COCO_train2014_000000005111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301238, "question_id": "W9c5ZFAQyZkdsrXusEAiBd", "question": "Who frequents this place?", "choices": ["clown", "mime", "chupacabra", "priest"], "correct_choice_idx": 3, "direct_answers": ["priest", "dwdww", "christian", "religious people", "classroom", "church goers", "church goers", "church goers", "religious worshippers", "christians"], "difficult_direct_answer": false, "rationales": ["Pews are near an altar. this is a church, not a circus, city street, or farm.", "The room is found in a chapel or a church and would be visited frequently by the priest.", "The priest comes to church."], "image": "train2014/COCO_train2014_000000301238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303537, "question_id": "WADzcR7pnb7BgD6CP6Kedq", "question": "What color is the little purse on the marble countertop next to the big raised sink?", "choices": ["green", "blue", "orange", "pink"], "correct_choice_idx": 2, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "pink orange", "orange", "orange", "tan"], "difficult_direct_answer": false, "rationales": ["The purse on the countertop is orange.", "The small clutch bag is orange.", "It is orange."], "image": "train2014/COCO_train2014_000000303537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267182, "question_id": "WB2xXL9eRpxEt9v2eXitfB", "question": "What color are the pears expressed by this painting?", "choices": ["white", "yellow", "green", "red"], "correct_choice_idx": 2, "direct_answers": ["green", "green", "yellow", "yellow", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The pears have a natural green which occurs to majority of all plants.", "The color is green.", "The pear is normally a yellowish green color as are the pears depicted in this painting."], "image": "val2014/COCO_val2014_000000267182.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97795, "question_id": "WBcCmFmLrFrgVkLQoAg8JC", "question": "What kind of beverage are the couple most likely drinking together?", "choices": ["water", "wine", "beer", "juice"], "correct_choice_idx": 0, "direct_answers": ["wine", "wine", "wine", "champagne", "wine", "champagne", "champagne", "wine", "wine", "water"], "difficult_direct_answer": false, "rationales": ["The glasses the couple are drinking from are commonly used for drinking this beverage as it enhances the taste.", "They are drinking put of glasses that are designed to optimize drinking wine.", "The couple are interlocking arms and drinking each others fluid out of their fancy stemmed glass."], "image": "train2014/COCO_train2014_000000097795.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173959, "question_id": "WByg3CfaVQsNiqw8aH337Q", "question": "What athlete might these kids know if they follow this sport closely?", "choices": ["cody rhodes", "jim kaat", "babe ruth", "lionel messi"], "correct_choice_idx": 3, "direct_answers": ["ronaldo", "ventrell miller", "david beckham", "ronaldinho", "lionel messi", "lionel messi", "lionel messi", "david beckham", "soccer players", "football"], "difficult_direct_answer": false, "rationales": ["He is a soccer player.", "Lionel messi plays soccer.", "That person is known for playing soccer."], "image": "val2014/COCO_val2014_000000173959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439837, "question_id": "WCP3khX3Hq8hQBszTFGU2m", "question": "What is on top of the counter?", "choices": ["banana", "sink", "cat", "television"], "correct_choice_idx": 1, "direct_answers": ["bowl", "counter", "fountain", "sink", "paper towels", "fountain", "napkins", "fountain", "dishes", "fountain"], "difficult_direct_answer": false, "rationales": ["It's actually embedded in it. the other options don't appear in this image.", "There is a sink basin on the table top.", "This is a kitchen with a faucet"], "image": "val2014/COCO_val2014_000000439837.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38449, "question_id": "WCuombYSPvUYbAvYiYtdJ3", "question": "What are the people near?", "choices": ["baby", "deer", "luggage", "missile"], "correct_choice_idx": 2, "direct_answers": ["van", "luggage", "car", "car", "van", "car", "car", "car", "suitcases", "van"], "difficult_direct_answer": false, "rationales": ["The people are near luggage.", "The woman is holding a suitcase. a second suitcase is near the man.", "The woman is holding a bag. the man's bag is on the ground."], "image": "val2014/COCO_val2014_000000038449.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224285, "question_id": "WDzbvQ8vZ8v5Ezn3RZ2DM4", "question": "There are two trains going down the rail of likely what country?", "choices": ["korea", "canada", "united states", "japan"], "correct_choice_idx": 3, "direct_answers": ["japan", "us", "us", "china", "usa", "america", "japan", "usa", "united states", "japan"], "difficult_direct_answer": false, "rationales": ["This asian country uses trains on its hilly terrain.", "The automobile is driving on the left side of the road, so it probably is not in korea, the united states, or canada.", "They have a very efficient train system for travel"], "image": "train2014/COCO_train2014_000000224285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540264, "question_id": "WF62LMRMpLeFVNPTUjPuEs", "question": "What is the woman adjusting?", "choices": ["her shoelaces", "cats leash", "dogs collar", "tie"], "correct_choice_idx": 3, "direct_answers": ["tie", "tie", "bowtie", "bowtie", "tie", "tie", "tie", "tie", "tie", "tie"], "difficult_direct_answer": false, "rationales": ["A woman is reaching towards a man's neck who is wearing dress clothes. ties are commonly wore as part of dress clothes for men.", "The person's adjusting a tie.", "The woman is helping the guy with his necktie."], "image": "val2014/COCO_val2014_000000540264.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 38934, "question_id": "WFkP7728pEzVhDMLThab7M", "question": "What kind of structure is sitting ont he top right hand corner of the train?", "choices": ["fire", "statue", "lighthouse", "skyscraper"], "correct_choice_idx": 2, "direct_answers": ["house", "house", "roof", "sillow", "cabin", "walkway", "lighthouse", "cabin", "lighthouse", "light"], "difficult_direct_answer": false, "rationales": ["The structure is a lighthouse.", "Lighthouses are tall and by the water.", "The tower is by the sea and is a lighthouse."], "image": "train2014/COCO_train2014_000000038934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380993, "question_id": "WG5DyPxdNDEUUnWLTV7FKt", "question": "What is the boy running through?", "choices": ["snow", "rain", "corn field", "bar"], "correct_choice_idx": 1, "direct_answers": ["rain", "street", "water", "rain", "street", "rain", "rain", "rain", "street", "rain"], "difficult_direct_answer": false, "rationales": ["This is evident given the umbrellas people are using and the glossy appearance of the street.", "The boy is in an urban setting and is too young to enter a bar. there is no snow on the ground.", "There is water falling"], "image": "val2014/COCO_val2014_000000380993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 443630, "question_id": "WGYXaFrHstxJZCbjGLEVrq", "question": "There are how many airplanes flying in formation at the sky?", "choices": ["three", "two", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["One is in the front and the other two behind it", "There are three planes.", "One airplane is in between two others."], "image": "val2014/COCO_val2014_000000443630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488206, "question_id": "WHEgzDZznggBYbmo68GXBa", "question": "What animal is on the bench?", "choices": ["dog", "black cat", "orange cat", "badger"], "correct_choice_idx": 1, "direct_answers": ["cat", "cat", "black cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["A cat is sitting on the bench.", "There is a little black cat next to the bench.", "A domestic cat is sitting on a bench."], "image": "val2014/COCO_val2014_000000488206.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 241466, "question_id": "WHtmHnFnB2AcQBroK8tUY7", "question": "The statue best represents who?", "choices": ["thor", "ganesh", "hades", "anubis"], "correct_choice_idx": 1, "direct_answers": ["elephant", "hindu god", "good", "elephant", "elephant", "ganesh", "elephant", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["The statue is ganesh.", "That person is an elephant queen.", "The ganesh is an elephant-headed god."], "image": "val2014/COCO_val2014_000000241466.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 464526, "question_id": "WJ44Jom2KxM5GLYQT9M6ZJ", "question": "The controller of which variety of video game manufacturers produces what this woman is holding?", "choices": ["sony", "sega", "microsoft", "nintendo"], "correct_choice_idx": 3, "direct_answers": ["nintendo", "nintendo", "play", "nintendo", "wii", "wii", "wii", "nintendo", "wii", "playing"], "difficult_direct_answer": false, "rationales": ["The woman is using a wii controller.", "The controller is for nintendo.", "The controller is made by nintendo."], "image": "val2014/COCO_val2014_000000464526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104337, "question_id": "WJR7PjYyCBKc7kv8s4u6MH", "question": "What country is the name on the jerseys located in?", "choices": ["norway", "tenochtitlan", "france", "pakistan"], "correct_choice_idx": 0, "direct_answers": ["canada", "oslo", "norway", "norway", "norway", "norway", "oslo", "norway", "oslo", "oslo"], "difficult_direct_answer": false, "rationales": ["The country is norway.", "The city of oslo is the capital of the scandinavian country.", "The jerseys are from norway."], "image": "train2014/COCO_train2014_000000104337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491494, "question_id": "WJodJn7CutQUEZEGaYjJGS", "question": "What kind of sauce might be applied to the pizza on the right?", "choices": ["worcestershire", "garlic", "mayonnaise", "spaghetti"], "correct_choice_idx": 0, "direct_answers": ["tomato sauce", "dada", "soy sauce", "pizza", "tommoto", "plate", "tomato", "worcestershire", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["A slice of pizza is on a plate next to a thin, brown sauce.", "A slice of pizza is on a plate with a dark sauce.", "The sauce is bbq."], "image": "train2014/COCO_train2014_000000491494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521872, "question_id": "WKj4c9NLALzmeRShSUcRCz", "question": "How many park benches line the plaza that is closed off to traffic?", "choices": ["four", "two", "one", "five"], "correct_choice_idx": 3, "direct_answers": ["five", "five", "five", "five", "five", "five", "five", "five", "four", "five"], "difficult_direct_answer": false, "rationales": ["This is the number shown in the image. there could be a lot more.", "There are five park benches lining around the plaza.", "There are 5."], "image": "train2014/COCO_train2014_000000521872.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149566, "question_id": "WKmNLY5Mav8ghsvN7s5Suf", "question": "What color is the ribbon tied around the neck of the small stuffed bear?", "choices": ["yellow", "blue", "orange", "pink"], "correct_choice_idx": 3, "direct_answers": ["pink", "brown", "pink", "purple", "pink", "pink", "purple", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["The color is pink.", "This shade of color is often worn by little girls.", "The ribbon is not orange, yellow, or blue."], "image": "train2014/COCO_train2014_000000149566.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134297, "question_id": "WLE4ZtmXJmLHY9i9xXTcaA", "question": "The girl all the way to the right is wearing what?", "choices": ["feathers", "scarf", "mask", "boots"], "correct_choice_idx": 3, "direct_answers": ["shorts", "boots", "shorts", "boots", "shorts", "jacket", "shorts", "shorts", "boots", "coat"], "difficult_direct_answer": false, "rationales": ["The girl on the far right is wearing a pair of rainy boots.", "The girl is in boots.", "The girl is identifiable based on the location direction in the question and the clothing she is wearing is visible and identifiable."], "image": "train2014/COCO_train2014_000000134297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136962, "question_id": "WMBXFM3FBfuuc9Gn5SdTTV", "question": "What is missing on these hotdogs?", "choices": ["condiments", "lettuce", "chocolate", "mayonnaise"], "correct_choice_idx": 0, "direct_answers": ["mustard", "cheese", "mustard", "ketchup mustard", "mustard", "condiments", "condiments", "ketchup", "ketchup", "condiments"], "difficult_direct_answer": false, "rationales": ["Hot dogs are in buns on a plate with some peppers. hot dogs are usually served with ketchup and mustard.", "It has no ketchup or mustard", "The hot dogs don't have condiments."], "image": "train2014/COCO_train2014_000000136962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574208, "question_id": "WMqMgAWkAJBn75yWiuh3S7", "question": "What do the colors of the frisbee resemble?", "choices": ["echidna", "apricot", "watermelon", "hedgehog"], "correct_choice_idx": 2, "direct_answers": ["red green", "watermelon", "red green", "watermelon", "watermelon", "watermelon", "red", "christmas", "red", "watermelon"], "difficult_direct_answer": false, "rationales": ["The disc looks like a watermelon.", "A dog has a red and green frisbee in its mouth.", "The frisbee is green on the outside and red on the inside."], "image": "val2014/COCO_val2014_000000574208.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320703, "question_id": "WMvRVjcBNkyAXTLse9ELrr", "question": "What is the purpose of the brown object with holes on the counter?", "choices": ["break dishes", "wash dishes", "dry dishes", "store dishes"], "correct_choice_idx": 2, "direct_answers": ["dry dishes", "hold utensils", "drain dishes", "utensil rack", "dishdrainer", "drain dishes", "for cutting", "drying rack", "dish rack", "drain dishes"], "difficult_direct_answer": false, "rationales": ["This is a rack that allows water to drip off after they are washed", "The brown object on the counter next to the sink is used to dry dishes.", "It is a dish rack."], "image": "val2014/COCO_val2014_000000320703.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5178, "question_id": "WQKzQyfkJea4pq6wmogjuS", "question": "How many cows are stood on the field around the people riding on a donkey?", "choices": ["two", "four", "three", "five"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "two", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three cows.", "There are more than two but less than four cows.", "There are three cows in the background."], "image": "val2014/COCO_val2014_000000005178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305879, "question_id": "WSToiPXer8ND9jyYCYopga", "question": "What has turned this apparatus inside out?", "choices": ["man", "wind", "gravity", "child"], "correct_choice_idx": 1, "direct_answers": ["wind", "wind", "wind", "umbrella", "wind", "wind", "wind", "wind", "wind", "wind"], "difficult_direct_answer": false, "rationales": ["Winds can be strong.", "The air traveled fast and caught the underside of the umbrella, pushing it outward.", "The uplift from the gusts that are blowing are too strong for the framework of the umbrella."], "image": "val2014/COCO_val2014_000000305879.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233726, "question_id": "WTFrhnPD42LWkTpHFuqodC", "question": "What is he focused at?", "choices": ["street", "another person", "window", "television"], "correct_choice_idx": 3, "direct_answers": ["game", "good", "video game", "television", "game", "television", "game", "tv", "computer", "video game"], "difficult_direct_answer": false, "rationales": ["The man is focused on the tv.", "The man is playing wii and is looking at the tv monitor.", "This is obvious given that they're using wii game controllers."], "image": "train2014/COCO_train2014_000000233726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517929, "question_id": "WU22aaW7N79az6b3eheTMG", "question": "What color is the bright light above the left side of the street?", "choices": ["blue", "white", "orange", "black"], "correct_choice_idx": 2, "direct_answers": ["orange", "white", "yellow", "yellow", "yellow", "yellow", "yellow", "orange", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["It's actually a gold or yellow a combination.", "The color is orange.", "The bright light on top of the street is orange."], "image": "train2014/COCO_train2014_000000517929.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238618, "question_id": "WV6pMdVuTxBUCUUALbr86T", "question": "What type of fence is in front of this cow?", "choices": ["wire", "iron", "electric", "wood"], "correct_choice_idx": 1, "direct_answers": ["iron", "cow", "metal", "metal", "metal", "metal", "metal fence", "steel", "iron", "protect"], "difficult_direct_answer": false, "rationales": ["The hard texture and style of bending of the barrier in front of the cow in this image identifies it as a type of metal.", "It's obviously made of metal.", "The fence is made out of metal, not wood. the metal is too thick to be wire."], "image": "train2014/COCO_train2014_000000238618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141482, "question_id": "WV8xSG2R96p8hCtdvwzTJi", "question": "What is the main color of the alpine ski that the man is holding?", "choices": ["yellow", "white", "blue", "orange"], "correct_choice_idx": 2, "direct_answers": ["black", "black", "black", "black", "blue", "blue", "black", "blue", "black", "blue"], "difficult_direct_answer": false, "rationales": ["The main color is blue.", "It clearly looks to have been painted with blue paint.", "The ski is blue."], "image": "train2014/COCO_train2014_000000141482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31247, "question_id": "WVVwbbAa5fGvwJKhFWEAvJ", "question": "What category of food is this?", "choices": ["vegetables", "grains", "meats", "taco"], "correct_choice_idx": 0, "direct_answers": ["vegetables", "vegetable", "vegetables", "vegetables", "vegetables", "vegtable", "vegetables", "vegetables", "tomatto/onion", "vegetables"], "difficult_direct_answer": false, "rationales": ["Tomatoes, corn, onions, mushrooms, and broccoli are on a table.", "The other options aren't represented on the table.", "These foods are in the vegetable section."], "image": "val2014/COCO_val2014_000000031247.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16069, "question_id": "WW4Srqiy43UiWdwF8ivHhq", "question": "What color is the top of the railing for the bridge where two people are walking in a storm?", "choices": ["red", "blue", "brown", "green"], "correct_choice_idx": 2, "direct_answers": ["brown", "red", "red", "axe color", "brown", "red", "brown", "brown", "black colour", "brown"], "difficult_direct_answer": false, "rationales": ["The top of the railing on the bridge is brown.", "The railing is brown.", "The color is brown."], "image": "train2014/COCO_train2014_000000016069.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64121, "question_id": "WW4dZoPqLV83vWHReYyKk4", "question": "What is the man holding?", "choices": ["beer bottle", "egg", "pizza pie", "apple"], "correct_choice_idx": 0, "direct_answers": ["beer", "beer", "beer", "bear", "beer", "beer bottle", "bottle", "beer", "beer", "bottle man"], "difficult_direct_answer": false, "rationales": ["The man has a glass alcoholic beverage.", "He is holding beer bottle as it is seen on near the fridge.", "It is a green glass bottle."], "image": "val2014/COCO_val2014_000000064121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128113, "question_id": "WWQWCfoBcF4exAXNMa8DDZ", "question": "What animal is the red stuffed animal?", "choices": ["dragon", "giraffe", "teddy bear", "kitty"], "correct_choice_idx": 0, "direct_answers": ["dragon", "dragon", "dragon", "dog", "dragon", "dragon", "dragon", "dragon", "clifford", "dragon"], "difficult_direct_answer": false, "rationales": ["The stuffed animal is a mystical lizard creature that breathes fire.", "The red stuffed animal is a dragon.", "That is a dragon on the back of the couch."], "image": "train2014/COCO_train2014_000000128113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302243, "question_id": "WapWayr9FWL3ppqogzGHHm", "question": "How many little zebras are there amongst the big zebras?", "choices": ["one", "three", "four", "two"], "correct_choice_idx": 0, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "two", "one", "one"], "difficult_direct_answer": false, "rationales": ["This zebra is less than half the size of the others around it", "It's the only number that matches the zebra of that size.", "The rest are adults"], "image": "val2014/COCO_val2014_000000302243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387173, "question_id": "WbT8E5bfthSVSvF2F9N5NM", "question": "What is the person riding in?", "choices": ["airplane", "sled", "helicopter", "canoe"], "correct_choice_idx": 3, "direct_answers": ["row boat", "kayak", "kayak", "kayak", "kayak", "boat", "kayak", "kayak", "boat", "canoe"], "difficult_direct_answer": false, "rationales": ["The person is not in a land or air vehicle. the person is above water.", "The person is floating on water, not sledding on snow or flying in the air.", "The other options aren't in this scene or apply to water except d and only certain models."], "image": "val2014/COCO_val2014_000000387173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 500492, "question_id": "WcUbyAMjgP5QrzLLDtChgU", "question": "What celebrity has a similar name to the name on the bottle?", "choices": ["charlize theron", "gaite jansen", "tom hardy", "cillian murphy"], "correct_choice_idx": 2, "direct_answers": ["na", "na", "drew hardy", "tom hardy", "tom hardy", "tom hardy", "thomas edison", "ed hardy", "tom hardy", "hardy"], "difficult_direct_answer": false, "rationales": ["Tom hardy has a similar name to the brand of the bottle.", "There is a golden wrapper around a liquor bottle and a wine glass. it has the name of the person on wrapper.", "They have the same name"], "image": "val2014/COCO_val2014_000000500492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 203843, "question_id": "WchvjHPXtRcqs2zm7xWSm2", "question": "What color is the light on top of the sink near the counter?", "choices": ["orange", "white", "yellow", "red"], "correct_choice_idx": 1, "direct_answers": ["yellow", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color is white.", "The color is easily visible and bright. it is in sharp contrast to the brown cabinents.", "A bright light is near a kitchen sink."], "image": "train2014/COCO_train2014_000000203843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18239, "question_id": "WdJWpYWFMeYjSxXod8aRiX", "question": "What are the animals doing?", "choices": ["flying", "sleeping", "jumping", "feeding"], "correct_choice_idx": 3, "direct_answers": ["grazing", "eating", "grazing", "eating", "eating", "eating", "eating grass", "grazing", "grazing", "feeding"], "difficult_direct_answer": false, "rationales": ["They have their heads on the ground picking at grass to eat.", "The animals are eating.", "The animals are awake and are on the grassy ground. their heads are pointed down towards the grass."], "image": "val2014/COCO_val2014_000000018239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533532, "question_id": "WdZxjC8bDrUjkDjttbV3t7", "question": "The people are most likely going where?", "choices": ["dancing", "job interview", "camping", "graduation ceremony"], "correct_choice_idx": 2, "direct_answers": ["camping", "vacation", "trip", "plane", "vacation", "travel", "airport", "station", "trip", "airplane"], "difficult_direct_answer": false, "rationales": ["The bags are things that someone would bring to camp.", "The people are going to go camping.", "The people are going camping."], "image": "val2014/COCO_val2014_000000533532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368142, "question_id": "WeVjbMikZQGX4Hu7XyLGeZ", "question": "What could be struck against the large glass in order to make a sound?", "choices": ["screwdriver", "spoon", "toothpick", "hammer"], "correct_choice_idx": 1, "direct_answers": ["spoon", "spoon", "spoon", "spoon", "water", "spoon", "spoon", "spoon", "spoon", "glass"], "difficult_direct_answer": false, "rationales": ["The spoon is stuck.", "While any of the answers are possible, answer a would most commonly occur in conjunction with the glass and not break it.", "The utensil is hard and metal which could make noise."], "image": "val2014/COCO_val2014_000000368142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247121, "question_id": "WecBg9e3Fgb5thqwUnLukH", "question": "What color is the stripe going down in the foot of the bed?", "choices": ["blue", "brown", "gray", "white"], "correct_choice_idx": 1, "direct_answers": ["brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "black", "brown"], "difficult_direct_answer": false, "rationales": ["The color shade is similar to chocolate.", "The stripe is colored white.", "The color scheme of this bed's comforter is brown and white. the foot or lower end of the bed is white."], "image": "train2014/COCO_train2014_000000247121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19698, "question_id": "WgSQZ5Rgje9PqvHn9mw2kE", "question": "Who is wearing the most gear?", "choices": ["police officer", "fire fighter", "clown", "catcher"], "correct_choice_idx": 3, "direct_answers": ["catcher", "catcher", "keeper", "catcher", "catcher", "catcher", "umpire", "catcher", "man black", "player"], "difficult_direct_answer": false, "rationales": ["It's to protect them from being hurt by the ball.", "We see the back of the catcher's knee pads, a chest protector and helmet with plastic front face covering. they are wearing more protective gear than any other person visible.", "The catcher is wearing the most safety gear because he's the most vulnerable when playing."], "image": "train2014/COCO_train2014_000000019698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451489, "question_id": "WhmJBRn2UkwKapv6DvMBj5", "question": "What is the elephant doing?", "choices": ["eating peanuts", "sleeping", "playing basketball", "trapeze walk"], "correct_choice_idx": 2, "direct_answers": ["playing ball", "playing basketball", "playing ball", "basketball", "playing", "dunking", "playing basketball", "dwdw", "basketball", "playing"], "difficult_direct_answer": false, "rationales": ["The elephant is holding a ball and placing it in a circular net which is consistent with answer a.", "The elephant is playing ball.", "The elephant plays ball."], "image": "train2014/COCO_train2014_000000451489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227508, "question_id": "WjGZ6XwHRAAZch8r9iCdZs", "question": "What color is the clock face underneath the window on the top of the clock tower?", "choices": ["brown", "blue", "green", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "grey", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color is white.", "There is a tower with a white face clock in a building with people all around in front.", "The face is this color. that said, the image is not true color."], "image": "train2014/COCO_train2014_000000227508.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231536, "question_id": "WjNPtwDkcJRdFU2eXq5xPf", "question": "How many portraits are hung on the doors and the walls of this kitchen room?", "choices": ["four", "three", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "one", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["A picture is on the door and one is above in a kitchen of a home.", "The two portraits are near each other.", "There is one on the door and one above it"], "image": "train2014/COCO_train2014_000000231536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99701, "question_id": "WjXWqo2gAhUbdbcSxjaQ96", "question": "What color is the pillow on the back of the sofa recliner?", "choices": ["white", "red", "blue", "pink"], "correct_choice_idx": 2, "direct_answers": ["white", "blue", "blue", "light blue", "blue", "blue", "blue", "white", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["There is a pillow of this color behind the dog.", "There is only one pillow present and it is this color", "The pillow is not red, white, or pink."], "image": "train2014/COCO_train2014_000000099701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563652, "question_id": "WjifXbujkXe89PyWpRUWhL", "question": "What is the object with the metal rod on it?", "choices": ["radio", "coffee maker", "tv", "microwave"], "correct_choice_idx": 0, "direct_answers": ["radio", "radio", "radio", "cabinets", "radio", "radio", "radio", "radio", "stove", "radio"], "difficult_direct_answer": false, "rationales": ["The object is a radio.", "That is an aerial for connection.", "The item has an antenna to pick up sound waves."], "image": "train2014/COCO_train2014_000000563652.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 252731, "question_id": "WjyzmTGkQrzrMJJZboc5kU", "question": "What is on the bed?", "choices": ["faces", "green beans", "rose petals", "apples"], "correct_choice_idx": 0, "direct_answers": ["comforter", "comforter", "pillow", "bedspread", "quilt", "faces", "blanket", "blanket", "blanket", "quilt"], "difficult_direct_answer": false, "rationales": ["The faces are on the bed.", "Faces are on the blanket.", "A face like portrait drawn on the duffet."], "image": "train2014/COCO_train2014_000000252731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166401, "question_id": "WpuBHEe6e6H34VaDipoktv", "question": "What material surrounds the tub?", "choices": ["marble", "porcelain", "slate", "terra cotta"], "correct_choice_idx": 1, "direct_answers": ["house hold", "tile", "porcelain", "tile", "clay", "tile", "tile", "stone", "probably tile", "tile"], "difficult_direct_answer": false, "rationales": ["There is a lot of porcelain around the tub.", "The tub and its surrounding tile is clearly visible and is of a shape, size, color and consistency of answer a.", "It looks like tiles."], "image": "val2014/COCO_val2014_000000166401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441949, "question_id": "Wq44zEPbimw36WYBuC2jjr", "question": "What ar ethe zebras doing on the other side of the lake?", "choices": ["eating", "playing", "drinking", "running"], "correct_choice_idx": 0, "direct_answers": ["eating", "drinking", "eating", "eating", "drinking water", "grazing", "eating grass", "lake", "grazing", "drinking water"], "difficult_direct_answer": false, "rationales": ["They have their heads down to the grass", "Their heads buried in the grass, along with that fact that their diet consists of grass, suggests they are eating.", "The zebras are standing near the lake and sipping water to quench their thirst."], "image": "train2014/COCO_train2014_000000441949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158378, "question_id": "Wq6pJ9sFHJS3ESdLpAvBp9", "question": "What color is the cat sitting in the computer chair?", "choices": ["white", "brown", "tabby", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "beige", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["It's as dark as the night", "The cat is not tabby, brown, or white.", "The animal's hair is very dark and hard to see at night."], "image": "train2014/COCO_train2014_000000158378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175052, "question_id": "WqJ77n2P7BRFhid6ey4NCJ", "question": "What kind of pants does the woman wear at the sink mirror?", "choices": ["yoga", "briefs", "panties", "pajamas"], "correct_choice_idx": 2, "direct_answers": ["bikini bottom", "underwear", "panties", "underware", "inner", "bikini bottoms", "underpants", "panties", "underpants", "no pants"], "difficult_direct_answer": false, "rationales": ["The woman is not wearing pants. she is only wearing panties.", "You can tell by the lack of clothing that she is only in her panties.", "The pants are panties."], "image": "val2014/COCO_val2014_000000175052.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456161, "question_id": "Wr9MwMYnt3nehsDqRRB67Z", "question": "What is the baby doing?", "choices": ["walking dog", "hugging parent", "brushing teeth", "walking cat"], "correct_choice_idx": 2, "direct_answers": ["brushing", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "brushing teeth", "using toothbrush", "brushing teeth", "brushing teeth"], "difficult_direct_answer": false, "rationales": ["The baby is brushing teeth with a toothbrush.", "He brushes.", "He is brushing his teeth with a toothbrush."], "image": "train2014/COCO_train2014_000000456161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27235, "question_id": "WrrrzkfcLPk8NkgEGVHG28", "question": "What color is the fur of the teddy bear who is sitting on the green mattress sheet?", "choices": ["purple", "red", "tan", "white"], "correct_choice_idx": 2, "direct_answers": ["brown", "yellow", "hazel", "brown", "tan", "tan", "yellow", "yellow", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The fur is tan.", "The color is tan.", "The shade is darker than the yellow pillow right next to it."], "image": "val2014/COCO_val2014_000000027235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268921, "question_id": "WsBiFakG9bLoCJgGBinrRN", "question": "How many jet planes are flying together in the sky with military formation?", "choices": ["one", "three", "two", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are three layers. there is one plane in the top layer, two in the middle layer, and one in the bottom layer.", "There is one on top, one on the bottom and two flanking them", "Two are in the middle and two are flanking"], "image": "train2014/COCO_train2014_000000268921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27516, "question_id": "WsnFXuymvZY2pNLy6cdPkW", "question": "What do the man and woman have in common?", "choices": ["headphones", "scarf", "hat", "glasses"], "correct_choice_idx": 3, "direct_answers": ["glasses", "hair", "eye glass", "wear glasses", "elderly", "glasses", "glasses", "glasses", "adjustment", "glasses"], "difficult_direct_answer": false, "rationales": ["The man and woman are clearly visible and their attire is identifiable. of the things they are each wearing, only answer a appears on both.", "The man and the woman that are getting dressed both are wearing glasses.", "Both of the people are wearing framing lenses to help them see better."], "image": "train2014/COCO_train2014_000000027516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187036, "question_id": "Wt9XV2LzLNwTyTpT7HTFPZ", "question": "Who would be associated with these types of vehicles?", "choices": ["larry david", "albert einstein", "henry avery", "eli whitney"], "correct_choice_idx": 2, "direct_answers": ["sailing", "pirates", "pirates", "pirates", "sailor", "sailors", "henry avery", "sailing", "ship", "pirate"], "difficult_direct_answer": false, "rationales": ["Henry avery would be associated.", "The person is a famous pirate.", "These are pirate ships."], "image": "val2014/COCO_val2014_000000187036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292833, "question_id": "WtZcTsmN6uKDrFU4Gir3do", "question": "What model is the red car?", "choices": ["hatchback", "sedan", "station wagon", "coupe"], "correct_choice_idx": 1, "direct_answers": ["auxia", "toyota", "sedan", "no clue", "sedan", "kia", "sedan", "toyota", "van", "coupe"], "difficult_direct_answer": false, "rationales": ["The model of the car is a sedan.", "The red car is a four door car.", "This is a small style car. too small to be a station wagon or hatchback."], "image": "train2014/COCO_train2014_000000292833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539251, "question_id": "WuFCrn2c43UJJ634XZCWez", "question": "What is near the mirror?", "choices": ["dog", "baby", "cat", "sink"], "correct_choice_idx": 3, "direct_answers": ["sink", "sink", "sink", "sink", "plant", "sink", "sink", "sinks", "paper towel", "sink"], "difficult_direct_answer": false, "rationales": ["There is a faucet and a basin near the mirror. there are no babies or animals.", "The sink is under the mirror on the counter.", "This is in a bathroom"], "image": "val2014/COCO_val2014_000000539251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308730, "question_id": "WvU2bMfGCkGbZ3Wzw2FWVe", "question": "What is the rectangular metal object called which is directly above the stove and mounted to the ceiling?", "choices": ["hood", "stove cover", "fan box", "vent"], "correct_choice_idx": 0, "direct_answers": ["lights", "vent", "vent", "self", "hood", "vent hood", "eefe", "hood", "hood", "vent"], "difficult_direct_answer": false, "rationales": ["The object is a hood.", "It's connected to the vent and in a way the mouth of the vent (b).", "Hoods are located above stoves."], "image": "val2014/COCO_val2014_000000308730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191869, "question_id": "WwavuLh8CMwhNg5opwuTjB", "question": "The man in the foreground's jacket is the same color as what?", "choices": ["banana", "watermelon", "orange", "cherry"], "correct_choice_idx": 0, "direct_answers": ["yellow", "orange", "light", "yellow", "sun", "banana", "sun", "banana", "lights", "sun"], "difficult_direct_answer": false, "rationales": ["The yellow jacket is the same color as a banana.", "The color of the man's jacket is yellow and clearly visible. yellow is a color commonly associated with answer a.", "The man in the foreground is wearing a yellow, not red, green, or orange, jacket."], "image": "train2014/COCO_train2014_000000191869.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274134, "question_id": "WxQXGkLaMshEcgyqKbyqTn", "question": "How many pillows are laid upon the wooden bench down the bookcases?", "choices": ["four", "one", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "four", "three", "three", "four", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["This is the number seen in this shot. it could be more beyond the image.", "There are that many visible squares on the chair.", "There are a trio of pillows on the bench."], "image": "val2014/COCO_val2014_000000274134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392326, "question_id": "WzJhutgwD2VTbeRHJjQFUh", "question": "What is behind the person with the number 44 on their shirt?", "choices": ["antelope", "baby", "glove", "hot dog"], "correct_choice_idx": 2, "direct_answers": ["glove", "catcher", "umpire", "catcher", "catcher", "catcher", "catcher", "fans", "catcher", "keeper"], "difficult_direct_answer": false, "rationales": ["There is a glove behind the person at bat.", "The catcher's mitt is right behind the batter.", "The person has a glove behind."], "image": "train2014/COCO_train2014_000000392326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407349, "question_id": "X26rQFWxydh2Z9kwe7t9VU", "question": "What store would sell most of the items here?", "choices": ["blockbuster", "mcdonalds", "home depot", "office max"], "correct_choice_idx": 2, "direct_answers": ["lowes", "home store", "walmart", "appliance", "department", "appliance store", "ikea", "home depot", "home depot", "appliance"], "difficult_direct_answer": false, "rationales": ["This is a home building store", "This store has lots of household items for sale.", "The store is home depot."], "image": "train2014/COCO_train2014_000000407349.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 287105, "question_id": "X3QhMaZ9BpsaQXxkwk9bSj", "question": "Why is the hardware on the doors brown?", "choices": ["rust", "patina", "stain", "paint"], "correct_choice_idx": 0, "direct_answers": ["rust", "rust", "rust", "rust", "rust", "rusty", "rusted", "lock it", "rusted", "brass"], "difficult_direct_answer": false, "rationales": ["The doors are made out of metal. they have oxidized over time.", "Old metal doors have handles and hardware that has turned brown and discolored.", "The hardware is old and it's outside. when old things sit outside they start to rust."], "image": "train2014/COCO_train2014_000000287105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460100, "question_id": "X45ZEyrVqDPcj6cjQgrPfh", "question": "What is usually found in this room?", "choices": ["bookcase", "bed", "toiletries", "refrigerator"], "correct_choice_idx": 2, "direct_answers": ["toilet", "soap", "toilet", "towel", "sink", "soap", "toiletries", "toilet", "tub", "toilet"], "difficult_direct_answer": false, "rationales": ["It's a bathroom, which is why this is the most relevant option.", "This is a bathroom", "You can find that in the bathroom."], "image": "train2014/COCO_train2014_000000460100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274570, "question_id": "X5PGtQ4qnkjv2bxRHnzpdk", "question": "How many little elephants are following behind the big elephant to the left?", "choices": ["one", "five", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "three", "two", "two", "two", "one", "one", "two", "two", "one"], "difficult_direct_answer": false, "rationales": ["There is one small elephant lagging behind two bigger elephants on a field.", "There are two baby elephants.", "There are two small elephants."], "image": "train2014/COCO_train2014_000000274570.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16161, "question_id": "X7ErwjL4FBpAwcTJFMiHhC", "question": "What kind of fruit is sat next to the bunch of bananas?", "choices": ["apple", "grapefruit", "orange", "watermelon"], "correct_choice_idx": 2, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["This is obvious given its size, shape, surface texture and color.", "It is round with a peel and has the color that is its name", "It is a citrus fruit that is smaller than a grapefruit."], "image": "val2014/COCO_val2014_000000016161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309662, "question_id": "X7cgCbfafpFKrQqnWu63cW", "question": "What is the woman holding in her hand?", "choices": ["baby", "egg", "wine glass", "kitten"], "correct_choice_idx": 2, "direct_answers": ["wine glass", "wine", "glass", "wine glass", "glass", "glass", "drinks", "wineglass", "wine glass", "ounce"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene. the other options aren't in it.", "You can tell by the design of the glass to what type it is.", "The woman is holding a wine glass in one hand."], "image": "val2014/COCO_val2014_000000309662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324492, "question_id": "X8s8oto4kNQBdFgsGchNwu", "question": "What is the color of the clock face behind the wheel?", "choices": ["red", "blue", "yellow", "purple"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The face of the clock underneath the numbers is this color.", "The inner part of the clock id painted with yellow color.", "This is obvious when looking at the scene."], "image": "val2014/COCO_val2014_000000324492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558640, "question_id": "X8sjW64oGUVupzzMnjjgoN", "question": "What vitamin is the green stuff a good source of?", "choices": ["k", "c", "w", "d"], "correct_choice_idx": 0, "direct_answers": ["vitamin e", "broccoli", "vitamin b", "vitamin c", "vitamin c", "vitamin", "health", "k", "calcium", "vitamin c"], "difficult_direct_answer": false, "rationales": ["The vitamin is k.", "The vitamin is k.", "The vegetables in the red bowl are high in vitamin k like most leafy greeens."], "image": "val2014/COCO_val2014_000000558640.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329268, "question_id": "XANtrphyEAoT84wGuCuU6J", "question": "How many giraffes are peeling the bark off this fallen tree?", "choices": ["five", "three", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "two", "three", "two", "three", "car", "stop", "three"], "difficult_direct_answer": false, "rationales": ["One can see a trio of giraffes in the image who are standing by the tree.", "There are three that are peeling the bark off.", "There are three giraffes gnawing at the tree."], "image": "train2014/COCO_train2014_000000329268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456106, "question_id": "XATQsferAdhrAXUL4dWC5C", "question": "What color are the tags planted inside of the sheep's ears?", "choices": ["green", "blue", "white", "yellow"], "correct_choice_idx": 3, "direct_answers": ["orange", "black", "yellow", "yellow", "yellow", "yellow", "yellow", "red", "yellow", "brown"], "difficult_direct_answer": false, "rationales": ["The tags are yellow.", "The tags are very bright like the sun.", "The color is yellow."], "image": "train2014/COCO_train2014_000000456106.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188589, "question_id": "XAXZeVN7ZXNzaxDVUQXauf", "question": "What is the foremost cow doing?", "choices": ["sleeping", "working", "drinking", "running"], "correct_choice_idx": 2, "direct_answers": ["eating", "stop", "drinking", "eating", "eating food", "eating food", "drinking water", "drinking", "eating", "eating"], "difficult_direct_answer": false, "rationales": ["He is drinking water out of the bowl", "The cow is awake and is not moving. its head is in a bucket that contains a liquid.", "Its head is in a bucket."], "image": "train2014/COCO_train2014_000000188589.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546667, "question_id": "XAYfyvpzEobpGXNMpmtRs2", "question": "How many people are standing on top of the elephant who is standing in the muddy water?", "choices": ["four", "two", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Only two people are visible.", "They're actually sitting and not standing.", "There are this many people on the back of the elephant."], "image": "val2014/COCO_val2014_000000546667.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306060, "question_id": "XB8gsupYLUDNH9d69qhrob", "question": "What word is related to these animals?", "choices": ["kitten", "puppy", "colt", "joey"], "correct_choice_idx": 2, "direct_answers": ["horses", "ride", "riding", "running", "race", "riding", "horse", "colt", "horse", "riding"], "difficult_direct_answer": false, "rationales": ["These animals are horses, not cats, dogs, or kangaroos.", "This is the name for a young horse", "Two horses gallop on the beach in this image. a colt is a type of horse."], "image": "train2014/COCO_train2014_000000306060.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320658, "question_id": "XBGhRJ6msT3RxpU6mCsbzf", "question": "What item usually has liquids poured into it?", "choices": ["mug", "shoe", "bath tub", "basin"], "correct_choice_idx": 0, "direct_answers": ["coffee cup", "mug", "mug", "mug", "mug", "mug", "coffee cup", "liquids poured", "coffee", "keyboard"], "difficult_direct_answer": false, "rationales": ["Answer a is clearly visible to the right of the laptop and identifiable by the size, shape and handle. of the objects visible, it is most consistent with its intended function for liquid to be poured into it.", "This is for hot liquids", "A mug is near the computer"], "image": "val2014/COCO_val2014_000000320658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133002, "question_id": "XBkFXDtLJfKSAwostybxWg", "question": "Why is the air so hazy?", "choices": ["fire", "factory smoke", "smog", "fog"], "correct_choice_idx": 2, "direct_answers": ["smog", "smog", "smog", "smoke", "smoke", "slow", "pollution", "summer", "shall", "cool weather"], "difficult_direct_answer": false, "rationales": ["This city has extremely polluted air.", "A person is on a motorcycle in a city with a hazy look all around, in the air everywhere.", "The smoke is blocking the area."], "image": "val2014/COCO_val2014_000000133002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173149, "question_id": "XBshgxe5YhdAnVvPXUeSpc", "question": "What color is absent on the umbrella?", "choices": ["blue", "red", "black", "purple"], "correct_choice_idx": 2, "direct_answers": ["green", "white", "red", "yellow", "black", "black", "yellow", "white", "yellow", "green"], "difficult_direct_answer": false, "rationales": ["Purple, red, and blue are on the umbrella.", "There is no black on the umbrella", "The umbrella only contains purple, red, and blue color which are the only visible."], "image": "train2014/COCO_train2014_000000173149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 129568, "question_id": "XC33T5Gv5hQyc3mhVV6YJz", "question": "What is the finance company advertised on the wall next to the tennis player?", "choices": ["jp morgan", "etrade", "wells fargo", "ameritrade"], "correct_choice_idx": 0, "direct_answers": ["jp morgan", "j.p. morgan", "j.p. morgan", "jp morgan", "ball", "jp morgan", "j.p. morgan", "j.p. morgan", "jp morgan", "citizen"], "difficult_direct_answer": false, "rationales": ["A business logo can be seen behind a tennis player on a court. companies advertise at professional sporting events.", "That is the company shown on the sign.", "This is obvious by just reading the wall."], "image": "train2014/COCO_train2014_000000129568.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99956, "question_id": "XC4ix3wjDoD6jh4jy7UTWY", "question": "Which US state is most likely to contain palm trees like the ones contained in this image?", "choices": ["florida", "maine", "pennsylvania", "new york"], "correct_choice_idx": 0, "direct_answers": ["florida", "florida", "florida", "california", "florida", "state", "bird", "florida", "vdvd", "state"], "difficult_direct_answer": false, "rationales": ["Located in the us, this state is well known for a beach and boardwalk.", "Florida has a warm temperature most of the year.", "It is a tropical area"], "image": "train2014/COCO_train2014_000000099956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 309000, "question_id": "XGb6WS3wLbvvNRHn9C8itm", "question": "What is the primary color of the reflection on the ocean?", "choices": ["purple", "white", "brown", "blue"], "correct_choice_idx": 0, "direct_answers": ["purple", "yellowish", "purple", "yellow", "white", "pink", "purple", "purple", "purple", "orange"], "difficult_direct_answer": false, "rationales": ["The sun reflects off the water creating this color.", "The color is purple.", "The color is purple."], "image": "val2014/COCO_val2014_000000309000.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 389133, "question_id": "XGdofZdHUbhThoNrKE2X8V", "question": "What kind of skis are the two using in this mountain range?", "choices": ["country", "racing", "alpine", "trick"], "correct_choice_idx": 1, "direct_answers": ["snowboarding", "downhill", "skateboard", "skiis", "long skis", "downhill skis", "downhill", "racing", "alpine", "short skis"], "difficult_direct_answer": true, "rationales": ["The skiiers are going very fast down the hill. one skiier is in the lead.", "They are going downhill very fast", "The two skiers in this image are moving quickly downhill. it is likely they are competing for time."], "image": "train2014/COCO_train2014_000000389133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 57671, "question_id": "XGtHxfHipCy3FNdnjrynUP", "question": "What type of building is this bathroom in?", "choices": ["hotel", "school", "church", "apartment complex"], "correct_choice_idx": 0, "direct_answers": ["hotel", "hotel", "hotel", "hotel", "hotel", "hotel", "bedroom", "bathroom", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["There are two beds and the bathroom isn't completely separate", "A bathroom vanity with matching sinks has a keychain with a tag on the counter and business cards in a stand as well.", "The building is a hotel."], "image": "val2014/COCO_val2014_000000057671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285978, "question_id": "XHCrUFLvrgQ9uEGtRHT8g4", "question": "What type of meat is in the sandwiches at the left hand side of the table?", "choices": ["beef", "turkey", "chicken", "roast beef"], "correct_choice_idx": 3, "direct_answers": ["roast beef", "mushroom", "beef", "bacon", "corned beef", "ham", "roast beef", "roast beef", "bread", "beef"], "difficult_direct_answer": false, "rationales": ["There is roast beef on the sandwiches to the left.", "This is roast beef.", "It is always the right choice."], "image": "train2014/COCO_train2014_000000285978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136770, "question_id": "XJ7PpoUaKkVBXSjbhBUdM2", "question": "What are the men wearing?", "choices": ["hats", "backpacks", "crowns", "antlers"], "correct_choice_idx": 0, "direct_answers": ["cap", "hats", "hats", "hats", "hats", "jeans", "shoes", "hat", "hats", "hats"], "difficult_direct_answer": false, "rationales": ["Two men are walking on a sidewalk and both are wearing colorful hats.", "Two men are walking together along a sidewalk and both have coverings with brims on their heads.", "Two men in colorful hats are walking in the street."], "image": "val2014/COCO_val2014_000000136770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520569, "question_id": "XJGTgpGNV3SkdWnEiFtngR", "question": "What language do people most likely speak here?", "choices": ["greek", "latin", "french", "daedric"], "correct_choice_idx": 2, "direct_answers": ["french", "french", "french", "french", "french", "french", "greek", "english", "french", "french"], "difficult_direct_answer": false, "rationales": ["It looks like french.", "The language is french.", "Writing in the language of answer a appears on the signs which makes it likeliest answer."], "image": "train2014/COCO_train2014_000000520569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60190, "question_id": "XJKSrMJeyUgUoK4ngE589q", "question": "What is the blue object hanging on the wall?", "choices": ["ladle", "spoon", "measuring cup", "colander"], "correct_choice_idx": 3, "direct_answers": ["strainer", "strainer", "pot", "pan", "pan", "colander", "pot", "strainer", "pot", "strainer"], "difficult_direct_answer": false, "rationales": ["This is obvious given the holes in it.", "The object is a colander.", "There is a blue coriander hanging from the wall for straining pasta."], "image": "train2014/COCO_train2014_000000060190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 56261, "question_id": "XJkPVZ46H92MGDTYVSVKb3", "question": "The person holding the umbrella looks most like who?", "choices": ["lily frazer", "albert finney", "amber tamblyn", "adewale akinnuoyeagbaje"], "correct_choice_idx": 2, "direct_answers": ["girl", "amber tamblyn", "mom", "woman", "velma", "youg", "female", "caucasian person", "rosie o'donnell", "lady"], "difficult_direct_answer": true, "rationales": ["Based on the person holding the umbrella and a google search of the answers provided, a is closest in features and general appearance.", "Her face resembles the soap star.", "After googling the name this person is similar in facial features and hair to the picture shown."], "image": "train2014/COCO_train2014_000000056261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222261, "question_id": "XKYsnZdfQsJSkLiFND8Xtu", "question": "How many elephants are there to lead this herd?", "choices": ["four", "three", "two", "one"], "correct_choice_idx": 2, "direct_answers": ["eight", "two", "eight", "two", "two", "ten", "two", "eight", "two", "one"], "difficult_direct_answer": false, "rationales": ["They are up in the front of the others", "The leaders are visible separate from the group and they are larger and appear dominant.", "There are two elephants."], "image": "val2014/COCO_val2014_000000222261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430790, "question_id": "XLyoNq7hXkMyyL6UYxNPeh", "question": "What are the people riding on?", "choices": ["elephants", "motorcycles", "horses", "cars"], "correct_choice_idx": 1, "direct_answers": ["motorcycles", "motorcycles", "motorcycles", "motorcycles", "motorcycles", "motorcycle", "motorcycles", "bike ride", "mopeds", "motorcycles"], "difficult_direct_answer": false, "rationales": ["You can tell by the design of the vehicles as to what they are riding.", "They are motored vehicles with two tires.", "The only vehicles on the road are two-wheeled gas-powered motorbikes."], "image": "train2014/COCO_train2014_000000430790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 68041, "question_id": "XMMKLXgLZaw5GiYYqEwfqk", "question": "What is the man most likely protecting his eyes from with the object on his face?", "choices": ["water", "wind", "sun", "sand"], "correct_choice_idx": 0, "direct_answers": ["sunlight", "water", "water", "fee", "water", "sun", "sun", "salt water", "goggle", "sun"], "difficult_direct_answer": false, "rationales": ["These are swim goggles", "The man is most likely protecting his eyes from water with his hands.", "The man is in the ocean and needs to protect his eyes while surfing."], "image": "train2014/COCO_train2014_000000068041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 216357, "question_id": "XP4jG3a3c94oh9wyqNjLYY", "question": "What is the baby near?", "choices": ["clown", "box", "elephant", "hydrant"], "correct_choice_idx": 3, "direct_answers": ["poles", "hydrant", "fire hydrant", "pole", "hydrant", "fire hydrant", "pole", "yellow cone", "yellow pole", "pole"], "difficult_direct_answer": false, "rationales": ["There are yellow poles. a red device that is used to fight fires is in between the poles.", "The red fire hydrant.", "There is an item that's very common for firefighters to use that's red. it's \"barricaded\" by yellow posts to keep it from being blocked."], "image": "train2014/COCO_train2014_000000216357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282659, "question_id": "XPDbMB4ZqncEsA9qJHaDBt", "question": "What is the person holding the wooden item trying to hit?", "choices": ["fly", "homerun", "ant", "pizza dough"], "correct_choice_idx": 1, "direct_answers": ["batter", "baseball", "ball", "ball", "ball", "baseball", "homerun", "ball", "baseball", "baseball"], "difficult_direct_answer": false, "rationales": ["The person is trying to hit a home run with the baseball bat.", "The person is playing baseball based on the setting and uniform. the intention of a person batting in this sport would be intending to score a run by means of answer a.", "The person hit a run."], "image": "val2014/COCO_val2014_000000282659.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408040, "question_id": "XQdDLJmzMN4UcALMqoLcMF", "question": "What position does the person with the blue helmet play?", "choices": ["pitcher", "catcher", "umpire", "shortstop"], "correct_choice_idx": 1, "direct_answers": ["catcher", "catcher", "catcher", "catcher", "catcher", "back catch", "catcher", "catcher", "catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["The blue helmet players outstretched glove and protective face mask positioned behind the batter tell us he is playing catcher.", "This is obvious given that they're behind the batter, the normal position, and they're holding up a glove to catch the ball, if needed.", "The person in the blue helmet is crouched down in order to catch any balls that are pitched."], "image": "train2014/COCO_train2014_000000408040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 91994, "question_id": "XRE6MrUZpbvyssF2jGzZmw", "question": "What is likely this guy's name?", "choices": ["howard stern", "don lemon", "chris pirillo", "stephen colbert"], "correct_choice_idx": 2, "direct_answers": ["chris pirillo", "chris", "chris", "chris", "chris", "chris pirillo", "chris pirillo", "chris", "chris", "chris pirillo"], "difficult_direct_answer": false, "rationales": ["There is a website and email shown on the screen, both which contain the name of a person. also, the other three men are well known people, and the pictured man is none of them.", "This guy's name is pirillo.", "The name on the bottom has his last name."], "image": "val2014/COCO_val2014_000000091994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306585, "question_id": "XSx8MJqkwiHPr3RtsePgZw", "question": "What color is the main body of the jacket worn by the small child?", "choices": ["orange", "yellow", "blue", "green"], "correct_choice_idx": 1, "direct_answers": ["beige", "purple", "blue", "yellow", "yellow", "yellow", "yellow", "purple", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The primary portion of the jacket is yellow.", "A boy is wearing a brightly colored coat while he skis.", "It is a safety vest so he's easily seen in the snow"], "image": "val2014/COCO_val2014_000000306585.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 182291, "question_id": "XUSVduzAttsJAQKqxr3M4h", "question": "What is extended on the animals?", "choices": ["claws", "antlers", "huge ears", "neck"], "correct_choice_idx": 3, "direct_answers": ["necks", "neck", "neck", "necks", "neck", "neck", "neck", "necks", "neck", "necks"], "difficult_direct_answer": false, "rationales": ["Three giraffes are grazing near trees. giraffes have long necks.", "This particular body part on giraffes is very long.", "The animals have necks."], "image": "train2014/COCO_train2014_000000182291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119283, "question_id": "XW4HfXJnT3usZM7rgdcbhM", "question": "What color is the secondary shade of grass near to where the oxen are standing?", "choices": ["green", "orange", "yellow", "white"], "correct_choice_idx": 0, "direct_answers": ["brown", "green", "brown", "brown", "brown", "tan", "brown", "green", "green", "yellow"], "difficult_direct_answer": false, "rationales": ["The grass is brownish green.", "The other colors aren't stong in this image and a is closest.", "Animals are grazing in brown and green grass."], "image": "train2014/COCO_train2014_000000119283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555391, "question_id": "XXZyRWKWRKh9etDqrqnzyf", "question": "What is next to the chairs?", "choices": ["umbrellas", "cats", "apples", "monkeys"], "correct_choice_idx": 0, "direct_answers": ["umbrellas", "door", "umbrellas", "stairs", "tables", "table", "umbrella", "table", "umbrellas", "umbrella"], "difficult_direct_answer": false, "rationales": ["There are canopies over the chairs", "Umbrellas are nearby.", "Those objects have eight corners and are used to shade those who sit in the chairs from the sun."], "image": "train2014/COCO_train2014_000000555391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550532, "question_id": "XXeSdvPqkP97M43TYft48V", "question": "Which meter has the higher number on it?", "choices": ["right", "center", "fifth one", "left"], "correct_choice_idx": 3, "direct_answers": ["68", "sixty eight", "left", "left", "sixty eight", "left", "left 68", "left", "left", "left"], "difficult_direct_answer": false, "rationales": ["The left meter has a higher number on it.", "The meter is to the left.", "The meter is on the left."], "image": "train2014/COCO_train2014_000000550532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 45183, "question_id": "XYdew68jQjWFyvnvvP9nvq", "question": "What animal is on the bed?", "choices": ["cat", "bear", "elk", "crow"], "correct_choice_idx": 1, "direct_answers": ["bear", "bear", "teddy", "teddybear", "bear", "teddy bear", "cat", "stuffed bear", "bear", "beer"], "difficult_direct_answer": false, "rationales": ["A teddy bear is on the bed.", "A stuffed teddy is on the bed. it is not a cat, elk, or crow.", "It is hairy and brown."], "image": "train2014/COCO_train2014_000000045183.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223121, "question_id": "XZJDcCXefPQKPn3rr2Hyka", "question": "What country is the most likely destination for this airport?", "choices": ["united kingdom", "australia", "united states", "china"], "correct_choice_idx": 3, "direct_answers": ["china", "china", "airplane", "japan", "china", "china", "usa", "united states", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["The writing on the sides of the trucks is in chinese.", "The text on the side of the one vehicle implies it. that said, it's likely not even on this list (i.e. japan).", "The writing on the vehicles on the ground appears to be in chinese characters."], "image": "train2014/COCO_train2014_000000223121.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232673, "question_id": "XZeYVgSE5N3mybAPBaBjem", "question": "How many elephants are together in the small wild group?", "choices": ["one", "three", "five", "two"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three.", "This is obvious in the scene by simply counting.", "The elephants are fully visible in the image and can be counted."], "image": "train2014/COCO_train2014_000000232673.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561919, "question_id": "XfBmyZU4fMSD4Lrh4AsCZQ", "question": "How many colors of leaf are in the hedge in the middle of the station?", "choices": ["four", "one", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "four", "two", "three", "three", "green", "four", "several", "five", "green"], "difficult_direct_answer": false, "rationales": ["This is obvious by just counting the swirls of different colors.", "There are three colors.", "There is pink, green and yellow leaves on the hedges."], "image": "val2014/COCO_val2014_000000561919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171221, "question_id": "XfK5sbq8hT8DiCqCo4tsJn", "question": "What kind of fencing material outlines the enclosure for the close by giraffe?", "choices": ["wood", "iron", "stone", "wire"], "correct_choice_idx": 0, "direct_answers": ["wood", "wood", "timber", "metal", "wooden", "wood", "metal", "wood", "wooden", "wood"], "difficult_direct_answer": false, "rationales": ["The fencing is wooden.", "The fence is made of materials from trees.", "The material is wooden."], "image": "train2014/COCO_train2014_000000171221.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21181, "question_id": "XiUqnPtRjRLAfQ8zh5NasM", "question": "What color is the toothbrush in the mouth of the woman in the center?", "choices": ["purple", "turquoise", "red", "pink"], "correct_choice_idx": 1, "direct_answers": ["blue", "green", "green", "blue", "turquoise", "green", "blue", "green", "teal", "green"], "difficult_direct_answer": false, "rationales": ["It's a lighter blue green color", "The girls on the ends have purple toothbrushes and the girl in the center has a blueish-green one.", "The color is bright and easily visible. it is in sharp contrast to the peach color of her face."], "image": "train2014/COCO_train2014_000000021181.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429580, "question_id": "Xj59Z4uRfVpsKPS2m6Zx8Y", "question": "What color is the center of the frisbee that this dog is jumping toward?", "choices": ["red", "blue", "orange", "white"], "correct_choice_idx": 2, "direct_answers": ["orange", "brown", "red", "orange", "orange", "orange", "orange", "red", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["A dog is completely in the air and is trying to catch a bright colored frisbee. the color is like the color the fence behind them.", "The outside of the frisbee is blue. the center is a different color and is not white or red.", "The disc is colored orange."], "image": "val2014/COCO_val2014_000000429580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188511, "question_id": "Xk9hKxBhzaLHAVEg4Bp7h6", "question": "What is the toilet near?", "choices": ["baby", "kitten", "poster", "window"], "correct_choice_idx": 3, "direct_answers": ["stop", "window", "window", "window", "window", "table", "window", "window", "watching", "car"], "difficult_direct_answer": false, "rationales": ["The toilet's near a window.", "The glass panes of a window can be seen right next to the toilet.", "A bathroom window is on the wall near the toilet."], "image": "train2014/COCO_train2014_000000188511.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349026, "question_id": "XnKnAvhdR8bJgroisbiwBA", "question": "What are the people playing with?", "choices": ["dogs", "kittens", "eggs", "kites"], "correct_choice_idx": 3, "direct_answers": ["kite", "kites", "palying", "kites", "kites", "kites", "kites", "kites", "kites", "kites"], "difficult_direct_answer": false, "rationales": ["The other options are related to mammals.", "The people are holding kites.", "It is a flying apparatus attached to string."], "image": "train2014/COCO_train2014_000000349026.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231295, "question_id": "XnusGPzxaRNfdA8mqHLPme", "question": "What is next to the cows?", "choices": ["barrel", "cat", "ocean", "little girl"], "correct_choice_idx": 3, "direct_answers": ["fence", "little girl", "fence", "woman", "sleep", "child", "humans", "lady", "black", "woman"], "difficult_direct_answer": false, "rationales": ["A small child dressed in feminine clothing can be seen directly next to them.", "She is small in size.", "The cows are visible and the objects near them are clearly identifiable."], "image": "train2014/COCO_train2014_000000231295.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 430476, "question_id": "XnytKP5AckAsDAqXK4gmaH", "question": "What is strange about the sidewalk?", "choices": ["brick", "narrow", "dirt", "steep slope"], "correct_choice_idx": 3, "direct_answers": ["bangles", "slope", "sloped", "sloped", "slanted", "sloped", "steep slope", "sloped", "slanted", "steep"], "difficult_direct_answer": false, "rationales": ["The sidewalk goes up a big hill.", "It is going downhill", "The foundation of the building shows more as it goes downhill"], "image": "train2014/COCO_train2014_000000430476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 320780, "question_id": "XoAheFNvYNKxZ7sLzARWVN", "question": "What color ist he batting helmet worn by the batting team player?", "choices": ["purple", "white", "blue", "red"], "correct_choice_idx": 2, "direct_answers": ["black", "black", "blue", "black", "black", "black", "blue", "green", "black", "blue"], "difficult_direct_answer": false, "rationales": ["The person is wearing a dark blue batting helmet.", "The batting helmet is not white, red, or purple.", "The color is blue."], "image": "val2014/COCO_val2014_000000320780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421134, "question_id": "XoFn94TTYWccFVRu2tinHN", "question": "What setting is this?", "choices": ["desert", "countryside", "city", "tundra"], "correct_choice_idx": 2, "direct_answers": ["town", "street", "city", "daytime", "city", "day", "public", "city street", "city street", "ink"], "difficult_direct_answer": false, "rationales": ["This is the city street.", "There are tall buildings.", "The setting is urban."], "image": "train2014/COCO_train2014_000000421134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479809, "question_id": "XoT4FiS7bcHA7LKnzUbSRU", "question": "What is this group of animals called?", "choices": ["herd", "colony", "pack", "gang"], "correct_choice_idx": 0, "direct_answers": ["herd", "herd", "herd", "elephants", "car", "elephant", "elephants", "herd", "herd", "herd"], "difficult_direct_answer": false, "rationales": ["A group of herbivore animals is called this.", "A group of elephants is called a herd.", "In this savannah scene a group of elephants are visible. a group of elephants would be called a herd."], "image": "train2014/COCO_train2014_000000479809.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359149, "question_id": "XoUmKKZCenSQfUv438ownr", "question": "What country is this likely in?", "choices": ["united states", "france", "mexico", "south africa"], "correct_choice_idx": 0, "direct_answers": ["france", "english speaking", "united states", "usa", "usa", "usa", "united states", "america", "united states", "united states"], "difficult_direct_answer": false, "rationales": ["This is most likely the united states because of the speed limit signs.", "The street signs signify this to be the united states.", "The speed limit sign is what is in this country"], "image": "val2014/COCO_val2014_000000359149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533677, "question_id": "XpJ2Mp5ytNNGkbhUL3PjRp", "question": "What color is the passenger side seat cover int he boat that is pulling up to the dock?", "choices": ["pink", "purple", "white", "brown"], "correct_choice_idx": 1, "direct_answers": ["black", "red", "blue", "unclear", "blue", "blue", "purple", "white", "white", "black"], "difficult_direct_answer": false, "rationales": ["It is a royal color similar to blue", "The color is purple.", "The boat pulling up to the dock has a purple trim."], "image": "train2014/COCO_train2014_000000533677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 149085, "question_id": "XpoqjeDsrd4cXYcqSwXgmo", "question": "What color is the whale kite flown on the beach?", "choices": ["green", "blue", "black", "pink"], "correct_choice_idx": 3, "direct_answers": ["pink", "brown", "pink", "pink", "pink", "pink", "pink", "brown", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["The color is obvious and none of the other options match.", "The whale kite is pink.", "Unless you are colorblind then you can tell what color the whale is."], "image": "train2014/COCO_train2014_000000149085.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220835, "question_id": "XqCXapYN6uB5gWpA5CEwA9", "question": "What is on the floor?", "choices": ["boots", "sandals", "bare feet", "socks"], "correct_choice_idx": 2, "direct_answers": ["tile", "bare feet", "couch", "sofa", "couch", "feet", "tile", "couch", "tile", "white tiles"], "difficult_direct_answer": false, "rationales": ["She has no shoes or socks on", "The girl is on bare in the cool floor.", "The woman in jeans in this picture isn't wearing anything on her feet."], "image": "train2014/COCO_train2014_000000220835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552222, "question_id": "XqK5ztaedTtcaAi7mCnZB3", "question": "What color are the eyes on the edges of the parasail pulling the skier?", "choices": ["purple", "red", "pink", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "blue", "white", "na", "white", "white", "brown", "white", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The edges of the parasail have white dots.", "There are white dots on each side of the parasail.", "They're starkly contrasted against the blue."], "image": "train2014/COCO_train2014_000000552222.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8170, "question_id": "XqS7fWTpF6TVf89p9ou7YT", "question": "Which object is most likely to start a fire?", "choices": ["door", "cupboard", "fridge", "stove"], "correct_choice_idx": 3, "direct_answers": ["stove", "stove", "calender", "stove", "stove", "oven", "stove", "stone", "stove", "stove"], "difficult_direct_answer": false, "rationales": ["The object is a stove.", "Of the items visible there is only one that commonly uses high, open heat and could thus be a fire hazard.", "The burner can make heat and start fires."], "image": "val2014/COCO_val2014_000000008170.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484273, "question_id": "Xqd3GydvNgEKJLAfReaubk", "question": "What color is the flowers inside of the painting to the left of the woman?", "choices": ["yellow", "red", "green", "blue"], "correct_choice_idx": 1, "direct_answers": ["red orange", "red orange", "orange", "white", "pink", "red", "red", "orange", "fee", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The flowers are a primary color from the rainbow.", "The color of the flower is similar to blood."], "image": "train2014/COCO_train2014_000000484273.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299544, "question_id": "XqvyPUdeGC8D9KL3NMAbGG", "question": "Which fruit is most apparent visually on this pizza?", "choices": ["mango", "kiwi", "lemon", "olives"], "correct_choice_idx": 3, "direct_answers": ["cherry", "pineapple", "pizza", "tomato", "olive", "tomato", "pineapple", "tomatto", "tomato", "olives"], "difficult_direct_answer": false, "rationales": ["This is a common pizza topping.", "The black color, size and shape of the pizza topping is clearly visible and consistent with answer a.", "The fruit is round and black, not green, orange, or yellow."], "image": "train2014/COCO_train2014_000000299544.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300014, "question_id": "XrNUKTxQmBJkcHCgy27mJW", "question": "What are the animals called?", "choices": ["wildebeests", "oxen", "antelope", "horses"], "correct_choice_idx": 0, "direct_answers": ["com", "sheep", "wildebeests", "cows", "buffalo", "cows", "cows", "cow", "cows", "goats"], "difficult_direct_answer": false, "rationales": ["Large brown animals with horns and hooves are grazing and drinking in an savannah type area.", "You can tell by the horns and fur on the animals, as to what type they are.", "They resemble cows."], "image": "train2014/COCO_train2014_000000300014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81944, "question_id": "XsEiyw8bEAixEKa57P9nsF", "question": "What is on the table?", "choices": ["cat", "applesauce", "spoon", "ham"], "correct_choice_idx": 2, "direct_answers": ["spoon", "cups", "food", "dessert", "coffee bread", "tea", "car", "bread", "bread", "food"], "difficult_direct_answer": false, "rationales": ["A spoon is on the plate on the table.", "There is a spoon by the plate.", "This first item is the only thing on the list that is present in the image."], "image": "train2014/COCO_train2014_000000081944.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47629, "question_id": "Xtay6HvtZjQVWsMUjByW22", "question": "What happens if you pull the lever in the middle of the photo?", "choices": ["nothing", "water comes", "soda comes", "air comes"], "correct_choice_idx": 1, "direct_answers": ["water flows", "water out", "water comes", "water stream", "water", "water pours", "water drips", "water", "toilet flushes", "water comes"], "difficult_direct_answer": false, "rationales": ["The water will come.", "The lever opens and closes the drain in the sink.", "If the sink lever is pulled, water will come out. sinks work in different ways."], "image": "train2014/COCO_train2014_000000047629.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468542, "question_id": "XtzVa9ktGLdRaEZ8PxhvgE", "question": "How many cows are walking beside the guy on a horse?", "choices": ["seven", "four", "six", "five"], "correct_choice_idx": 2, "direct_answers": ["five", "five", "five", "six", "five", "six", "five", "six", "four", "five"], "difficult_direct_answer": false, "rationales": ["There are six cows although some of the cows are only partially visible.", "This is simply a matter of counting.", "There are 6."], "image": "train2014/COCO_train2014_000000468542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 150320, "question_id": "XxVtKGebpe8MAx7pXue3Li", "question": "What is the keyboard being plugged into?", "choices": ["computer", "wall", "pen", "mouse"], "correct_choice_idx": 0, "direct_answers": ["dock", "computer", "computer", "system", "computer", "monitor", "computer", "computer", "computer", "computer"], "difficult_direct_answer": false, "rationales": ["A monitor and a computer keyboard are on a desk with cords visible.", "The keyboard is in the computer.", "They are used to type on and control computers."], "image": "val2014/COCO_val2014_000000150320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 204700, "question_id": "XxxseskP3fKWnxccPVfreV", "question": "What number on the train is divisible by 7?", "choices": ["seven", "14", "28", "56"], "correct_choice_idx": 0, "direct_answers": ["7850", "eight", "seven", "7850", "seven", "seven", "seven", "eight", "7850", "1121"], "difficult_direct_answer": false, "rationales": ["Seven is the only number on the train that is divisible by 7.", "The number is seven.", "That number is divisible by 7."], "image": "train2014/COCO_train2014_000000204700.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359563, "question_id": "XzDf3eE99MdzAY6q9nATSd", "question": "What color are the noodle strainers hanging underneath of the cupboard and above the sink?", "choices": ["two", "three", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["green", "green", "light green", "green", "two", "green", "silver", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["There are two round bowls with many small holes hanging below a cupboard and are light green in color.", "There is one on the left and one on the right.", "There are two colors."], "image": "val2014/COCO_val2014_000000359563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 90010, "question_id": "XzSGRWZX4duDEYzGzaCWVS", "question": "What type of shoes is the cat laying on?", "choices": ["loafers", "oxfords", "derby", "chukkas"], "correct_choice_idx": 0, "direct_answers": ["loafers", "black", "loafer", "loafers", "leather", "dress shoe", "loafer", "leather shoes", "loafers", "dress shows"], "difficult_direct_answer": false, "rationales": ["The shoes are leather slip on shoes that are semi-formal.", "It's the obvious answer given the tassels.", "A cat has its paws on some black slip ons. they are a bit fancy like house shoes but a little more snazzy."], "image": "val2014/COCO_val2014_000000090010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136235, "question_id": "XzVoA3jyvPnPZ598SQ445Y", "question": "What color is the FedEx airplane's tail fin?", "choices": ["blue", "green", "yellow", "purple"], "correct_choice_idx": 3, "direct_answers": ["purple", "purple", "blue", "purple", "blue", "red", "blue", "purple", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["Fedex uses purple as their company colors.", "The color is a primary color from the rainbow.", "A plane with a company logo on the tail is parked at an airport. fedex advertises with purple and white coloring."], "image": "train2014/COCO_train2014_000000136235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133071, "question_id": "XzWxjEzRiMzQ8ZVCXyTwWQ", "question": "What is the light green mixture?", "choices": ["sauerkraut", "salsa verde", "guacamole", "pesto"], "correct_choice_idx": 2, "direct_answers": ["guacamole", "guacamole", "guacamole", "guacamole", "guacamole", "guacamole", "avocado", "vegetable", "guacamole", "ovacado"], "difficult_direct_answer": false, "rationales": ["Guacamole is on the bread.", "The mixture is guacamole.", "The guacamole is green."], "image": "train2014/COCO_train2014_000000133071.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520840, "question_id": "Y2Cik8z45KtxPp7oyY8yt7", "question": "What are the giraffes doing with their legs spread apart like this?", "choices": ["drinking", "eating", "sleeping", "walking"], "correct_choice_idx": 0, "direct_answers": ["drink water", "giraffe", "drinking", "drinking balancing", "drinking", "drinking water", "drinking water", "drinking", "drink", "drinking"], "difficult_direct_answer": false, "rationales": ["There is a body of water on the ground in front of the giraffes and their heads are bowed and extended towards is. the spreading of their legs allows them to get into this position and make contact with the water.", "The giraffes are awake and are not moving. there is water, not food, on the ground.", "Three out of five of the giraffes in this picture have their heads to the ground towards a water puddle."], "image": "val2014/COCO_val2014_000000520840.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 361578, "question_id": "Y3joSgaVhkk8H5Jwp4ebGT", "question": "What would happen if the lamp was turned off?", "choices": ["light still", "darkness", "nothing", "unknown"], "correct_choice_idx": 1, "direct_answers": ["darkness", "darkness", "room darkens", "dark", "darkness", "darker", "darkness", "dark", "darkness", "dark"], "difficult_direct_answer": false, "rationales": ["There would be darkness in the room if the light were turned off.", "There would be darkness.", "The room would be dark if the lamp is off."], "image": "train2014/COCO_train2014_000000361578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231102, "question_id": "Y5xqPTZZdghMMSaDV5ZtAg", "question": "What is this desk made of?", "choices": ["laminated wood", "oak", "pine", "plywood"], "correct_choice_idx": 0, "direct_answers": ["wood", "wood", "laminated wood", "tv", "wood", "wood", "wood", "laptop systems", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["It might also have a b core made of c or d.", "The desk is wooden.", "The desk has a shine that is consistent with something laminated or finished. the grain and color is consistent with a wooden material."], "image": "train2014/COCO_train2014_000000231102.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477193, "question_id": "Y67zKxe64UTzQ7FN4S2DV3", "question": "What does the yellow line signify?", "choices": ["direction", "lanes", "speed", "disallowed area"], "correct_choice_idx": 1, "direct_answers": ["road", "passing allowed", "middle", "divider", "lanes", "split lanes", "lanes", "direction change", "division", "bike lane"], "difficult_direct_answer": true, "rationales": ["People are walking and some biking along a dotted road. the yellow dotted paint shows where people are walking.", "The yellow line signifies lanes.", "This is the standard usage even though the couple in the foreground are actually pushing the baby carriage on top of it."], "image": "train2014/COCO_train2014_000000477193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200817, "question_id": "Y6g2Ar6Dybk9QLEhzTR9gZ", "question": "What is particularly large here?", "choices": ["ears", "train tracks", "pizza toppings", "buildings"], "correct_choice_idx": 0, "direct_answers": ["elephant", "elephants", "elephant", "elephant", "ears", "elephant", "elephants", "elephant", "elephant", "animal"], "difficult_direct_answer": false, "rationales": ["The ears are large.", "Elephants have large ears.", "They're well known for a being large and their trunks being long."], "image": "train2014/COCO_train2014_000000200817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 563391, "question_id": "Y8NMKzoPvFhNuc2UEQ8Pa2", "question": "What type of paint was used to paint the pictures hanging on the wall?", "choices": ["acrylic", "gouache", "watercolor", "oil"], "correct_choice_idx": 2, "direct_answers": ["oil", "watercolor", "watercolor", "red", "acrylic", "acrylic", "latex", "white", "white", "acrylic"], "difficult_direct_answer": false, "rationales": ["The semi translucent and smeary texture of the paintings in this living room identify them as water colors.", "A painting is hanging in a home and is colorful.", "The paintings that are framed and hanging on the wall were made with watercolor paints."], "image": "train2014/COCO_train2014_000000563391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480443, "question_id": "YARQCwMfWqry3eTQB3qJjD", "question": "What color is the dome on top of the clock tower with some golden ornaments on top of it?", "choices": ["brown", "purple", "red", "blue"], "correct_choice_idx": 3, "direct_answers": ["black", "blue", "blue", "white", "green", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The color is blue.", "A dome on the top of a building has a blue colored dome surrounded by metallic colored details.", "It is weathered metal a little darker than the sky"], "image": "train2014/COCO_train2014_000000480443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187560, "question_id": "YB8HNTn5BLsa4r9eHzbBnv", "question": "Who would most likely use the stool in this room?", "choices": ["baby", "teenager", "toddler", "adult"], "correct_choice_idx": 2, "direct_answers": ["toddler", "child", "child", "child", "child", "children", "paste", "child", "get up", "reach mirror"], "difficult_direct_answer": false, "rationales": ["A little kid would need the stool to reach the sink.", "The stool is for someone who is very short.", "The toddler would stand on it."], "image": "train2014/COCO_train2014_000000187560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251032, "question_id": "YDahi9WURQ2DnjMPjFL36e", "question": "What is visible in the water?", "choices": ["paddles", "seals", "snake", "panda"], "correct_choice_idx": 0, "direct_answers": ["mud", "boats", "boats", "clouds", "paddles", "oars", "boats", "boats", "brown", "mud"], "difficult_direct_answer": false, "rationales": ["There are oars in the water.", "You can tell by the types of vehicles in the water as to what they put in the water.", "There are no animals. there are pole-shaped items that are associated with the boats."], "image": "train2014/COCO_train2014_000000251032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 272463, "question_id": "YENquw7d2CfAcXgfRUkkmi", "question": "What color is the van at the end of the row of the left?", "choices": ["red", "yellow", "brown", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "green", "green", "green", "blue", "green", "blue", "green", "green"], "difficult_direct_answer": false, "rationales": ["The color is green.", "The van is not red, brown, or yellow.", "It's the color of the vegetation"], "image": "train2014/COCO_train2014_000000272463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404388, "question_id": "YEhrs4e42UnGxzygh6piUk", "question": "What is the red and black tool on the right called?", "choices": ["hammer", "vice grip", "screwdriver", "pipe wrench"], "correct_choice_idx": 3, "direct_answers": ["clock", "wrench", "clock", "wrench", "wrench", "pipe wrench", "wrench", "wrench", "wrench", "wrench"], "difficult_direct_answer": false, "rationales": ["The tool is a wrench.", "The red tool on the right is a pipe wrench.", "The tool is used for pipes."], "image": "val2014/COCO_val2014_000000404388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227576, "question_id": "YFKeiZDUEPdHjfLytFFFM3", "question": "What is the young offspring of these animals called?", "choices": ["kitten", "calf", "joey", "doe"], "correct_choice_idx": 1, "direct_answers": ["cow", "calf", "calf", "calf", "calf", "calf", "calf", "calf", "calf", "calf"], "difficult_direct_answer": false, "rationales": ["The animals are cows, and most herd animals have the same name for their young.", "The animals are cows, not deer, kangaroos, or cats.", "A baby cow is called a calf."], "image": "train2014/COCO_train2014_000000227576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102446, "question_id": "YGA4yaSRHWG3PH2wrExST4", "question": "What kind of flowers are in the glass vase on top of the end table?", "choices": ["sunflowers", "daffodils", "tulips", "roses"], "correct_choice_idx": 0, "direct_answers": ["sunflowers", "daffodil", "sunflowers", "sunflowers", "sunflower", "sunflower", "sunflower", "sunflower", "daisy", "sunflowers"], "difficult_direct_answer": false, "rationales": ["They have yellow petals and a brown center.", "The flowers are sunflowers.", "There are a bunch of sunflowers inside the glass vase on top of the table."], "image": "val2014/COCO_val2014_000000102446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435164, "question_id": "YGEVSrDMpUyrNaciwcm9pg", "question": "What number is the largest number on the train?", "choices": ["five", "two", "eight", "seven"], "correct_choice_idx": 2, "direct_answers": ["eight", "eight", "eight", "eight", "eight", "eight", "eight", "68006", "68006", "eight"], "difficult_direct_answer": false, "rationales": ["The number is 8.", "The other numbers are lower in value", "There is a five digit number on the front of the train. 7, 5, and 2 are not in this number."], "image": "train2014/COCO_train2014_000000435164.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498786, "question_id": "YGUvgbbakF8HTxQ9qY8N4N", "question": "What is the rack above the glasses holding?", "choices": ["alcoholic beverages", "spices", "sparkling water", "sodas"], "correct_choice_idx": 0, "direct_answers": ["liquor", "liquor bottles", "wine bottles", "wine", "bottles", "alcoholic beverages", "wine bottles", "alcohol", "liquor", "wine"], "difficult_direct_answer": false, "rationales": ["There are bottles of wine and similar drinks in the rack.", "The rack is clearly visible and its contents appears to be bottles with multi-colored liquids in the size and shape of answer a. the setting is also consistent with what would be served and the manner it might be stored.", "There are some bottles of alcohol in the rack."], "image": "train2014/COCO_train2014_000000498786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423039, "question_id": "YGi8YDY3v5eV5DjxGWCwf8", "question": "What color is the bench in the middle of the U-shaped road covered in straw?", "choices": ["blue", "green", "purple", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "brown", "grey", "brown", "black", "charcoal black", "natural wood", "black"], "difficult_direct_answer": false, "rationales": ["The color is black.", "(a) black. not only is it a different design, it's black instead of brown.", "The bench is black."], "image": "val2014/COCO_val2014_000000423039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240252, "question_id": "YJ7fKPkXRpM3G4DbNiGxsP", "question": "How many portraits are hung on the gray walls?", "choices": ["two", "six", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["four", "six", "two", "three", "seven", "zero", "seven", "four", "three", "four"], "difficult_direct_answer": false, "rationales": ["There are six portraits hanging on the walls.", "There are six portraits.", "There are painting of a tree and a lake."], "image": "train2014/COCO_train2014_000000240252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 157385, "question_id": "YJLxtTc2XEkfpJj2DNXuMN", "question": "What is the number of zebras sitting in the middle of the forested plain?", "choices": ["four", "five", "three", "two"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "zero", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four zebras standing together.", "There are four zebras in the grass.", "There are four zebras sitting."], "image": "train2014/COCO_train2014_000000157385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108541, "question_id": "YJgS3x6vXYeSQmhFtqvmUf", "question": "What company handles the event that will happen in 1/2 mile?", "choices": ["ez pass", "secret service", "ny mets", "ny jets"], "correct_choice_idx": 0, "direct_answers": ["state", "i pass", "ez pass", "pay toll", "ipass", "ez pass", "1-pass", "pay toll", "toll road", "i-pass"], "difficult_direct_answer": false, "rationales": ["The company is ez pass.", "The sign shows options such as a different pass and a toll booth to drive past.", "A overhead street sign advertises an upcoming toll and lanes for those who have made prior arrangements for payment."], "image": "val2014/COCO_val2014_000000108541.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523357, "question_id": "YNDF8VxkrVrLaV3XtnMENP", "question": "What color is the sidecar housing the small dog?", "choices": ["yellow", "blue", "white", "green"], "correct_choice_idx": 0, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["Unless you are colorblind you can tell the color easily.", "The dog is clearly visible and is sitting in an object connected to the bike. the color of this object attached to the side of the bike, known as a sidecar, is clearly visible.", "It's the color of lemons"], "image": "train2014/COCO_train2014_000000523357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 548375, "question_id": "YP5DvjY4pCze8xRrpNoj8i", "question": "Where is this office located?", "choices": ["hotel", "home", "restaurant", "school"], "correct_choice_idx": 1, "direct_answers": ["home", "home", "house", "home", "inside", "home", "indoors", "office", "home", "home"], "difficult_direct_answer": false, "rationales": ["The office is at home.", "The office is at home.", "This office is located in someone's home."], "image": "train2014/COCO_train2014_000000548375.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 581326, "question_id": "YRu8a9RcjKQUw5Hx4Yyg2e", "question": "What color is top of the yellow bodied fire hydrant on the bottom left side?", "choices": ["black", "white", "red", "turquoise"], "correct_choice_idx": 3, "direct_answers": ["turquoise", "be", "blue", "blue", "red", "green", "turquoise", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The top of the fire hydrant is not white, black, or red.", "It's the color of some water in tropical islands", "The hydrant is clearly visible based on its defining shape and the text of the question. the color of the top is clearly identifiable."], "image": "train2014/COCO_train2014_000000581326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133210, "question_id": "YSnNsCk9ACKD3UfRbJvZnb", "question": "How do you know the four guys are together?", "choices": ["sign", "uniforms", "matching luggage", "hats"], "correct_choice_idx": 1, "direct_answers": ["matching outfits", "dressed alike", "matching clothes", "same color", "six", "matching uniforms", "uniforms", "dressed same", "five", "black clothes"], "difficult_direct_answer": true, "rationales": ["They all are wearing the same outfits.", "They are dressed alike", "The four men seated in the chairs are wearing matching outfits that show they work for the same unit."], "image": "val2014/COCO_val2014_000000133210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520094, "question_id": "YThTYbjjXPVbVGxYePYACW", "question": "The logo printed on top of the white cake is for a company based in which country?", "choices": ["germany", "france", "uk", "usa"], "correct_choice_idx": 0, "direct_answers": ["china", "germany", "germany", "germany", "cake", "germany", "germany", "germany", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["A vw logo is on a cake.", "The logo is for volkswagen and it is a deutschland company.", "The logo is from germany."], "image": "train2014/COCO_train2014_000000520094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418761, "question_id": "YU4DqVmU4oUQeow63zTugQ", "question": "What is rolling into the devices for the airplane?", "choices": ["backpacks", "golf bags", "luggage", "freight"], "correct_choice_idx": 2, "direct_answers": ["luggage", "luggage", "luggage", "suitcases", "luggage", "bags", "conveyor", "luggage", "luggage", "conveyor belt"], "difficult_direct_answer": false, "rationales": ["Some of the bags have wheels.", "The bags have wheels underneath.", "There are bags with tags on them filled with items from the passengers."], "image": "val2014/COCO_val2014_000000418761.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457986, "question_id": "YWzexKi9n3pyWgF2cqaxLc", "question": "What type of landscape does this room most resemble?", "choices": ["snowstorm", "ocean", "dessert", "jungle"], "correct_choice_idx": 3, "direct_answers": ["flat", "sofa", "ocean", "waves", "jungle", "openspace", "living", "rectangle", "living room", "sofa"], "difficult_direct_answer": true, "rationales": ["With the colors and designs it reminds one of a jungle.", "The landscape is a jungle.", "The room is mostly green so it mostly looks like a jungle."], "image": "val2014/COCO_val2014_000000457986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211552, "question_id": "YX2DduG2TWKQtdyaWMKGEp", "question": "What color is the item the animals are stepping on?", "choices": ["black", "red", "blue", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "white", "green", "green", "green", "green", "green", "green", "green", "white"], "difficult_direct_answer": false, "rationales": ["The color is green.", "The animals are standing on grass.", "The color is green."], "image": "train2014/COCO_train2014_000000211552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176796, "question_id": "YXjmrC4cnjbpdGoEH6ZQZ2", "question": "What is on the animal in the foreground's head?", "choices": ["crown", "ears", "bird", "hat"], "correct_choice_idx": 1, "direct_answers": ["stripes", "zebra", "mane", "print", "hair", "stripes", "ears", "hair", "hair", "ears"], "difficult_direct_answer": false, "rationales": ["A zebra is looking straight ahead.", "Zebras have ears that sit up straight on their head.", "The other options don't appear in this scene."], "image": "val2014/COCO_val2014_000000176796.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481524, "question_id": "YXvHxoyuWsR6aQJsVvAXv2", "question": "These two cats are doing what activity?", "choices": ["playing", "sleeping", "relaxing", "eating"], "correct_choice_idx": 2, "direct_answers": ["relaxing", "resting", "sleeping", "resting", "being lazy", "laying", "resting/laying down", "resting", "resting", "sleeping"], "difficult_direct_answer": false, "rationales": ["The cats are awake and are laying down.", "The two cats are relaxing.", "They're eyes are option. so, c doesn't apply. the other two options require food and movement, respectively."], "image": "train2014/COCO_train2014_000000481524.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211118, "question_id": "YYcUTi5UD5TEXAKy26VWRc", "question": "What animal does the toy resemble most?", "choices": ["eagle", "eel", "cow", "rabbit"], "correct_choice_idx": 3, "direct_answers": ["bunny", "bunny", "rabbit", "rabbit", "rabbit", "bunny", "sheep", "rabbit", "rabbit", "rabbit"], "difficult_direct_answer": false, "rationales": ["A toy is on a log and has long ears. rabbits have long ears.", "The animal is a rabbit.", "The toy is a rabbit."], "image": "train2014/COCO_train2014_000000211118.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142689, "question_id": "YYi2EuVoUJuREVfLHpsAX4", "question": "Who is most likely bathing the baby?", "choices": ["sister", "nanny", "grandmother", "mom"], "correct_choice_idx": 3, "direct_answers": ["mom", "mother", "mother", "mom", "mom", "mother", "mother", "mother", "mom", "mother"], "difficult_direct_answer": false, "rationales": ["Usually a child this young receiving care in the home would be receiving that care from their closest relation and primary caregiver.", "This woman is young and mom is most likely to be in charge of bathing.", "It could also be c or d. her youthful age makes b unlikely."], "image": "train2014/COCO_train2014_000000142689.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340089, "question_id": "YYnAeYYUmgMXyF4LFvLhCy", "question": "What popular video game character is on the screen provisioned by the Nintendo Wii device?", "choices": ["mario", "zelda", "link", "luigi"], "correct_choice_idx": 0, "direct_answers": ["mario", "girl", "mario", "mix", "mario", "mario", "mario", "mario", "mario", "tv"], "difficult_direct_answer": false, "rationales": ["The tv in the living room shows the character mario from the famous video games.", "He is a little builder.", "The character is wearing red which is similar to mario and is short."], "image": "val2014/COCO_val2014_000000340089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44372, "question_id": "Ya53YdZsXe2foWEPHRrosV", "question": "What color is the LCD on the flatscreen television on top of the white drawers?", "choices": ["green", "red", "blue", "yellow"], "correct_choice_idx": 2, "direct_answers": ["blue", "white", "blue", "white", "white", "black", "blue", "black", "blue", "black color"], "difficult_direct_answer": false, "rationales": ["A light is lit blue on a television on a dresser.", "There is some blue trim on the lcd monitor button.", "The color is blue."], "image": "train2014/COCO_train2014_000000044372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296360, "question_id": "YdYzSYWnBh2vnrZCqtinGN", "question": "What is the clear vessel next to the blue container used for?", "choices": ["measuring ingredients", "pouring lemonade", "chopping food", "serving coffee"], "correct_choice_idx": 0, "direct_answers": ["measuring", "measuring", "measuring cup", "measuring", "storage", "measuring", "coffee", "measuring liquid", "measuring", "measuring ingredients"], "difficult_direct_answer": false, "rationales": ["The vessel is a measuring cup.", "This is obvious given the labeling and shape.", "It is a measuring cup."], "image": "train2014/COCO_train2014_000000296360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254074, "question_id": "Ye6aWykQV4JeMF8zf5h269", "question": "What is a breed of this animal?", "choices": ["pitbull", "ragdoll", "siamese", "manx"], "correct_choice_idx": 0, "direct_answers": ["labrador", "dog", "black lab", "labrador", "labrador retriever", "dog", "feel", "lab", "pitbull", "playing"], "difficult_direct_answer": false, "rationales": ["The breed is a pitbull.", "There are two grey large dogs and one is drinking water and the other is holding a frisbee in its mouth.", "Medium sized brown dogs are playing in water."], "image": "val2014/COCO_val2014_000000254074.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281663, "question_id": "Yg3TKdzDKvEifxpHEkKxRh", "question": "What is inside of the item that looks like bread?", "choices": ["watermelon", "salmon", "sardine", "onion"], "correct_choice_idx": 3, "direct_answers": ["onion", "doughnut", "onion", "onion", "onion", "onion", "cream", "sandwitch", "onion", "onion"], "difficult_direct_answer": false, "rationales": ["There is a purple onion inside of the bread.", "The item is purple. salmon, sardines, and watermelon are not purple.", "The outer purplish ring of the vegetable that grows from a bulb and provides a sharp flavor can be seen."], "image": "train2014/COCO_train2014_000000281663.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435279, "question_id": "YgxAJjpTr9DftwyNm57DBN", "question": "Where are these items sold?", "choices": ["shoprite", "hobby lobby", "home depot", "best buy"], "correct_choice_idx": 0, "direct_answers": ["bananas", "store", "fruit market", "market", "bananas", "stores", "produce store", "supermarket", "shoprite", "bananas"], "difficult_direct_answer": false, "rationales": ["These items are bananas, not pieces of lumber, electronic devices, or pieces of fabric.", "Fruit is typically on sale at grocery stores, and shoprite is a grocery chain.", "They are sold in a grocery store."], "image": "train2014/COCO_train2014_000000435279.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446827, "question_id": "YkNq6ChCFrypXw4KqCoU4g", "question": "What color is at the bottom front of the vehicle in the foreground?", "choices": ["purple", "black", "blue", "red"], "correct_choice_idx": 0, "direct_answers": ["white", "purple", "grey", "white", "white", "purple", "purple", "purple", "purple", "white"], "difficult_direct_answer": false, "rationales": ["The color is a primary color in the rainbow.", "Unless you are colorblind you can tell what the color is at the bottom.", "That shade of color is at the very bottom behind the metal bumpers."], "image": "train2014/COCO_train2014_000000446827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 392851, "question_id": "YkeTSsDuUNwZmMxpaJPYYr", "question": "What kind of environment is this?", "choices": ["urban", "unknowns", "wild", "rural"], "correct_choice_idx": 0, "direct_answers": ["city", "urban", "city", "urban", "city", "city", "city", "city", "city", "dwdww"], "difficult_direct_answer": false, "rationales": ["A white bird is close to the window and there are tall building all around downtown. there are a lot of cars stopping on the road.", "There are tall buildings and cars.", "The environment is urban."], "image": "train2014/COCO_train2014_000000392851.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372384, "question_id": "YmtkcZmg88JskLPZRJ9Smd", "question": "What is the athlete attempting to do?", "choices": ["bounce pass", "check mate", "homerun", "serve"], "correct_choice_idx": 3, "direct_answers": ["serve", "playing", "serve", "hit tenisball", "serve", "serve", "tennis", "hit ball", "serve", "serve"], "difficult_direct_answer": false, "rationales": ["This athlete is serving in tennis.", "The way their hand is in the air suggests they just threw the ball up in order to hit it.", "He has thrown the ball straight up in the air and is about to hit it across the court"], "image": "val2014/COCO_val2014_000000372384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282009, "question_id": "YnE2pvnTPQgrgsxTsEA38v", "question": "What animal is next to the giraffe?", "choices": ["cow", "ostrich", "deer", "elephant"], "correct_choice_idx": 1, "direct_answers": ["hostrich", "ostrich", "ostrich", "ostrich", "ostrich", "turkey", "ostriches", "ostrich", "ostrich", "ostrich"], "difficult_direct_answer": false, "rationales": ["A large flightless bird is next to the giraffe.", "These are flightless birds that are very large", "The other animals don't appear in this image."], "image": "train2014/COCO_train2014_000000282009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41022, "question_id": "Yni7VDhWSaohV9nN99CDHa", "question": "What color is the tape fence around the luggage area where there is a number of luggage bags?", "choices": ["black", "yellow", "red", "white"], "correct_choice_idx": 1, "direct_answers": ["yellow", "yellow", "light green", "green", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The tape is colored yellow.", "Generally these types of tape fences are yellow as to warn others to stay back.", "The tape fence is not red, black, or white."], "image": "train2014/COCO_train2014_000000041022.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300537, "question_id": "Ynxr8vPehLffgTTqrRc6WK", "question": "What are the people passing by?", "choices": ["dog", "cat", "hydrant", "train"], "correct_choice_idx": 2, "direct_answers": ["cars", "construction site", "hydrant", "car", "fire hydrant", "fence", "partition wall", "fire hydrant", "rain", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["The people pass a hydrant.", "The people are passing by a fire hydrant.", "There are people passing a hydrant."], "image": "train2014/COCO_train2014_000000300537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281688, "question_id": "Yo7SaWYqvQHwxu7CFAVWt5", "question": "Who is holding the ball?", "choices": ["volleyball player", "soccer player", "quarterback", "catcher"], "correct_choice_idx": 3, "direct_answers": ["catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher", "catcher"], "difficult_direct_answer": false, "rationales": ["This person is holding a baseball mitt behind the matt.", "The catcher has the ball.", "The catcher holds."], "image": "val2014/COCO_val2014_000000281688.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325654, "question_id": "YoPon7nzaRWrB7YEFBbNNs", "question": "What is in the food?", "choices": ["spoon", "fork", "chopsticks", "knife"], "correct_choice_idx": 1, "direct_answers": ["fork", "chicken", "chicken", "bread", "bread", "fork", "fork", "fried egg", "egg", "sweet item"], "difficult_direct_answer": false, "rationales": ["There is a white utensil that has tines for spearing the food to make it easier to eat.", "A utensil with a handle and prongs is in the food.", "A white utensil is sticking out of a pastry."], "image": "train2014/COCO_train2014_000000325654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312212, "question_id": "YopQqXnJWoWTZPiZzh46RK", "question": "Why is this dog getting a treat?", "choices": ["good boy", "his birthday", "potty training", "learned trick"], "correct_choice_idx": 1, "direct_answers": ["birthday", "birthday", "birthday", "birthday", "birthday cake", "birthday", "for birthday", "dog's birthday", "his birthday", "birthday"], "difficult_direct_answer": false, "rationales": ["The lettering on the treat makes this clear unless they were out of d treats.", "The writing on the cake indicates its purpose.", "He's celebrating."], "image": "train2014/COCO_train2014_000000312212.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483833, "question_id": "YqYSCvWLtP82uxbRAogRci", "question": "What call will the umpire most likely make?", "choices": ["foul", "out", "strike", "ball"], "correct_choice_idx": 2, "direct_answers": ["move", "strike", "strike", "strike", "foul", "strike", "strike", "score", "strike", "strike"], "difficult_direct_answer": false, "rationales": ["The ball catcher guy has his mitt in a closed fashion as if he caught the ball.", "The umpire calls a strike.", "The player did not hit the ball."], "image": "train2014/COCO_train2014_000000483833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113097, "question_id": "Yr8xyrEJzaWwadRMvJFidR", "question": "What color is the guitar in the painting hung in the center of the wall on the right?", "choices": ["purple", "blue", "green", "red"], "correct_choice_idx": 1, "direct_answers": ["beige", "sandle", "white", "brown", "brown", "grey", "grey", "grey", "blue", "white"], "difficult_direct_answer": false, "rationales": ["The color is blue.", "The guitar in the painting is blue.", "The guitar is the color of the sky."], "image": "val2014/COCO_val2014_000000113097.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240128, "question_id": "YrBN8s7spAe7oyV4pSqNy5", "question": "What was the man just doing?", "choices": ["swimming", "lying down", "running", "sunbathing"], "correct_choice_idx": 1, "direct_answers": ["surfing", "surfing", "feff", "lying down", "surfing", "watching", "sitting", "watching", "surfing", "surfing"], "difficult_direct_answer": false, "rationales": ["The man is lying down.", "He is getting some sun", "The man has sand on his back."], "image": "train2014/COCO_train2014_000000240128.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304054, "question_id": "Ys8hbU7MY7PaxzjyrvChkW", "question": "What is the child balancing against?", "choices": ["ladder", "tree", "hydrant", "fence"], "correct_choice_idx": 2, "direct_answers": ["car", "hydrant", "fire hydrate", "fire", "hydrant", "hydrant", "fire-hydrant", "hydrant", "water hydrant", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["The item is a water supply for firefighters.", "The child is leaning against a red item that dogs usually pee on.", "The kid is against a hydrant."], "image": "train2014/COCO_train2014_000000304054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175536, "question_id": "Yt4pnyvknSHLCHDxGiQDNZ", "question": "What is on the TV?", "choices": ["video games", "cat", "silent movie", "dog"], "correct_choice_idx": 0, "direct_answers": ["bowling wii", "videogame bowling", "video games", "bowling game", "video game", "bowling", "wii bowling", "bowling game", "video games", "bowling"], "difficult_direct_answer": false, "rationales": ["The tv has video games.", "The woman is playing the wii console.", "The tv has games."], "image": "train2014/COCO_train2014_000000175536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104758, "question_id": "YtQcqYh7P36xcrc9WKKUQ3", "question": "How many elephants are standing right on the dirt road to the left?", "choices": ["five", "four", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "three", "three", "two", "two", "three", "two", "two", "2 elephants"], "difficult_direct_answer": false, "rationales": ["The third one is in the grass.", "An adult with a baby stands on the road while another elephant walks off to the side.", "There are two elephants crossing the road."], "image": "val2014/COCO_val2014_000000104758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 227845, "question_id": "YwzyYfdNkxnv78ciwB4qJZ", "question": "What is the animal in the foreground likely chewing?", "choices": ["fish", "chicken", "acacia leaves", "bananas"], "correct_choice_idx": 2, "direct_answers": ["leaves", "acacia leaves", "grass", "food", "cud", "leaves", "food", "giraffe", "grass", "grass"], "difficult_direct_answer": false, "rationales": ["They eat vegetation", "The closest animal to the camera in this picture who's head is fully visible is a giraffe. acacia leaves are the only food item of those listed which a giraffe would eat.", "Giraffes are herbivores who mostly subsist on leaves."], "image": "val2014/COCO_val2014_000000227845.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289275, "question_id": "YxNnmx4YXwvc7fBnb3YMwC", "question": "What color is the cow in between the two milkcows?", "choices": ["brown", "ginger", "green", "pink"], "correct_choice_idx": 0, "direct_answers": ["brown", "brownwhite", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The cow in between the milkcows is brown.", "It is lighter than the other two", "The cow is brown."], "image": "train2014/COCO_train2014_000000289275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 472521, "question_id": "Yyi9jpPVQHRuyugaKYN4ti", "question": "What utensil is usually needed with this food?", "choices": ["knife", "pitchfork", "spatula", "chopstick"], "correct_choice_idx": 0, "direct_answers": ["bowl", "orange", "hands", "knife", "knife", "bowl", "knife", "bowl", "plate", "knife"], "difficult_direct_answer": false, "rationales": ["You need that to cut the oranges.", "These are oranges. people usually do not use chopsticks, spatulas, or pitchforks with oranges.", "Oranges need to be cut."], "image": "train2014/COCO_train2014_000000472521.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321107, "question_id": "YysrjWhz5U4rNpbyFdXSDE", "question": "How is the women moving?", "choices": ["bicycling", "dancing", "running", "walking"], "correct_choice_idx": 0, "direct_answers": ["fast", "bicycling", "pedaling", "cycling", "bicycle", "bicycle", "cycling", "bike", "cycling", "bicycle"], "difficult_direct_answer": false, "rationales": ["The woman is moving down the street on a bicycle.", "The woman is using a bike.", "The woman is on the side of the street sitting on a vehicle with two wheels. the woman is peddling."], "image": "val2014/COCO_val2014_000000321107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200971, "question_id": "YzDtT9g3Zpz7agUBrxKBeB", "question": "How many people can this feed?", "choices": ["one", "two", "50", "ten"], "correct_choice_idx": 3, "direct_answers": ["ten", "twenty five", "two", "five", "many", "twelve", "lot", "15", "ten", "twenty"], "difficult_direct_answer": true, "rationales": ["There are ten hot dogs.", "The exact number of people that could be fed is subjective and unknown, but based on the volume of food and the answers provided, answer a is likely the closest.", "There are about a dozen dogs."], "image": "train2014/COCO_train2014_000000200971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192780, "question_id": "YzuQzwvNEYss59NDPYUTRU", "question": "Slices of what are found in the food?", "choices": ["shark", "echidnas", "apple", "orange"], "correct_choice_idx": 3, "direct_answers": ["orange", "kale", "orange", "oranges", "orange", "oranges", "oranges", "oranges", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["There are orange slices.", "There are orange slices in the salad.", "Oranges are in the food."], "image": "train2014/COCO_train2014_000000192780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 434767, "question_id": "Z268k3T8uj4pxrQWaxzqLb", "question": "What is the man swinging?", "choices": ["baseball bat", "tree branch", "tennis racquet", "pizza dough"], "correct_choice_idx": 2, "direct_answers": ["tennis racket", "tennis racket", "tennis ball", "bat", "tennis racket", "tennis racquet", "tennis racquet", "racket", "tennis racket", "tennis racket"], "difficult_direct_answer": false, "rationales": ["He's playing tennis", "The man has a racquet.", "The man is playing a sport that does not use a bat."], "image": "train2014/COCO_train2014_000000434767.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 502680, "question_id": "Z27wXhvwQQn6UA96Upb2ir", "question": "What country is associated with the two blue engines?", "choices": ["china", "south korea", "japan", "thailand"], "correct_choice_idx": 2, "direct_answers": ["japan", "japan", "italy", "japan", "na", "japan", "japan", "tamaha", "japan", "japan"], "difficult_direct_answer": false, "rationales": ["These boat engines are made in this country.", "Because the company named on the engine is yamaha.", "Yamaha is the company that produced these products which is based out of this country."], "image": "train2014/COCO_train2014_000000502680.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 565165, "question_id": "Z2ajKDqshysfH2nVQk5LL7", "question": "What color is the snow jacket worn by the skier?", "choices": ["green", "blue", "orange", "yellow"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "orange"], "difficult_direct_answer": false, "rationales": ["The skier is clearly visible and the color of the jacket is readily identifiable.", "The color is orange.", "The color is orange."], "image": "train2014/COCO_train2014_000000565165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488260, "question_id": "Z3Qiw6CK2G243skqpNTGeP", "question": "What is the longest part of these animals?", "choices": ["neck", "talons", "wings", "arms"], "correct_choice_idx": 0, "direct_answers": ["neck", "neck", "neck", "neck", "neck", "neck", "neck", "neck", "neck", "neck"], "difficult_direct_answer": false, "rationales": ["These animals are giraffes. they do not have arms, talons, or wings.", "The giraffes head, unlike other animals, sits high above its body due to the elongated structure that it is connected to.", "The giraffe has a long neck."], "image": "train2014/COCO_train2014_000000488260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20434, "question_id": "Z6Bf93LkxQTXDw888uAfPt", "question": "What are the bulls doing?", "choices": ["sleeping", "resting", "grazing", "working"], "correct_choice_idx": 3, "direct_answers": ["cart", "working", "pulling cart", "pulling cart", "pulling cart", "carrying goods", "pulling cart", "stop", "walking", "pulling cart"], "difficult_direct_answer": false, "rationales": ["The bulls are pulling a load.", "The bulls are working.", "They are pulling a cart"], "image": "train2014/COCO_train2014_000000020434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16342, "question_id": "Z95ihf26sNxZmfK8KDQnDo", "question": "Who likely resides here?", "choices": ["newlywed couple", "army trainees", "bachelor", "two roommates"], "correct_choice_idx": 1, "direct_answers": ["workers", "navy soldiers", "army trainees", "navy", "train attendant", "soldiers", "students", "student", "military", "children"], "difficult_direct_answer": true, "rationales": ["Army trainees reside here.", "This scene appears to be bunk beds or barracks so naturally i would think it would house some form of military.", "The army people are resting on the beds."], "image": "train2014/COCO_train2014_000000016342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558736, "question_id": "Z9a3myXj7BnxdhLkQSmZCi", "question": "What are the three zebras in the watering hole looking toward?", "choices": ["each other", "giraffes", "camera", "water"], "correct_choice_idx": 2, "direct_answers": ["camera", "camera", "camera", "cameraman", "camera", "camera", "camera", "camera", "camera", "camera"], "difficult_direct_answer": false, "rationales": ["The zebras are looking at the camera.", "They're looking at the camera.", "The three zebras standing in the watering hole are looking toward the person holding the camera."], "image": "val2014/COCO_val2014_000000558736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 514982, "question_id": "Z9kJut4bfr2RZFPUXjBYmf", "question": "What color are the sinks embedded in the black tile countertop?", "choices": ["blue", "green", "white", "pink"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["A white. they are obviously white.", "The sink is colored white.", "Unless you are colorblind you can tell the sink color."], "image": "val2014/COCO_val2014_000000514982.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153064, "question_id": "ZEZw8bqp2SMstoiFgxFHRy", "question": "What color are the doors to the refrigerator on the far left side of the room?", "choices": ["white", "wood", "black", "silver"], "correct_choice_idx": 3, "direct_answers": ["silver", "silver", "gray", "silver", "silver", "silver", "silver", "silver", "silver", "silver"], "difficult_direct_answer": false, "rationales": ["The doors are stainless steel and this color.", "A stainless steel fridge is in a kitchen.", "The color is silver."], "image": "train2014/COCO_train2014_000000153064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174934, "question_id": "ZEfSkcxAXR2LGiQSX4KSxn", "question": "What do the numbers on either side of the clock represent?", "choices": ["nothing", "name", "date", "time"], "correct_choice_idx": 2, "direct_answers": ["time", "time", "date", "time", "hours minutes", "numbers", "hours minutes", "year", "time", "time"], "difficult_direct_answer": false, "rationales": ["The numbers on the face of the clock represent time. the numbers on the side represent when the clock tower was built.", "They are for decoration.", "The numbers represent the date."], "image": "train2014/COCO_train2014_000000174934.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 30731, "question_id": "ZF6myiz5T7xsCLr9oS2rQS", "question": "How many sails are attached to the boat in the ocean?", "choices": ["three", "twentyeight", "two", "sixteen"], "correct_choice_idx": 2, "direct_answers": ["two", "two sails", "two", "two", "one", "two", "two", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["None of the other numbers apply to what is shown here.", "There are 2.", "There are two sails attached to the boat as it is clear seen."], "image": "train2014/COCO_train2014_000000030731.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82729, "question_id": "ZFJBQjUoByDAHMhUdcPoFv", "question": "What is the person cutting?", "choices": ["paper", "onions", "green peppers", "grapes"], "correct_choice_idx": 2, "direct_answers": ["capsium", "peppers", "green peppers", "vegetable", "peppers", "green peppers", "bell pepper", "peppers", "peppers", "bell peppers"], "difficult_direct_answer": false, "rationales": ["A man is standing there in the kitchen as he is preparing green peppers in his hand. there are other peppers on the kitchen counter with a pot cooking.", "The person is cutting green peppers.", "These are green bell vegetables"], "image": "train2014/COCO_train2014_000000082729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19964, "question_id": "ZFeX9QzA7LyEKQjiQr8AXu", "question": "The item under the roof can best be described as what?", "choices": ["cutting edge", "outdated", "underwater", "bovine"], "correct_choice_idx": 1, "direct_answers": ["payphone", "outdated", "payphone", "telephone", "pay phone", "payphone", "phone", "telephone", "telephone", "phone"], "difficult_direct_answer": false, "rationales": ["The item is above ground. it is a payphone, not a cow.", "The telephone was used in the old ages.", "The item is outdated."], "image": "train2014/COCO_train2014_000000019964.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511814, "question_id": "ZFhFDco3tdjZW8CUsnyNFz", "question": "What color is the lettering at the side of the large mall building?", "choices": ["white", "pink", "golden", "green"], "correct_choice_idx": 2, "direct_answers": ["yellow", "gold", "gold", "gold", "gold", "yellow", "golden", "gold", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The large lettering on the front of the building is in a metallic gold finish.", "The lettering is gold color.", "The colors of the letters are a dull yellow."], "image": "train2014/COCO_train2014_000000511814.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189741, "question_id": "ZGHXx5iSexJ7Apr2m2ZL99", "question": "What are most of the people protected from?", "choices": ["upcoming rain", "stampeding elephants", "fire", "falling anvils"], "correct_choice_idx": 0, "direct_answers": ["sunny", "sun", "rain", "rain", "rain", "rain", "rain", "rain", "upcoming rain", "rain"], "difficult_direct_answer": false, "rationales": ["The people have a cover.", "People are in a boat and most of them are holding umbrellas above their heads.", "The other options don't make sense in this scene."], "image": "train2014/COCO_train2014_000000189741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110230, "question_id": "ZGNUGXNiK9EwqxZ5z8bSY6", "question": "How many people can sit with them on the sofa?", "choices": ["three", "one", "two", "zero"], "correct_choice_idx": 3, "direct_answers": ["zero", "zero", "zero", "zero", "two", "zero", "zero", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["They are taking up both cushions", "The couch is being used by two huge dogs.", "There are no seats left."], "image": "train2014/COCO_train2014_000000110230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107482, "question_id": "ZGxCkGBb4cFhDnMJEbbMvh", "question": "What type of animal is being prepared?", "choices": ["cat", "bird", "dog", "horse"], "correct_choice_idx": 1, "direct_answers": ["chicken", "chicken", "turkey", "chicken", "bird", "turkey", "turkey", "turkey", "turkey", "chicken"], "difficult_direct_answer": false, "rationales": ["There is a turkey on the cutting board.", "You can tell by the shape and the wings as to what type of animals is being prepared.", "It has wings, legs and a breast."], "image": "train2014/COCO_train2014_000000107482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456658, "question_id": "ZHjuFxR58ZwNuTWsPT3Cey", "question": "What are the monitors connected to?", "choices": ["small laptop", "big laptop", "desktop", "trash can"], "correct_choice_idx": 2, "direct_answers": ["power", "desktop", "computer", "tower", "electric outlets", "cpu", "computers", "cpu", "cable connect", "computer"], "difficult_direct_answer": false, "rationales": ["A desk with several monitors and a computer tower between them have cords behind them as well. monitors are used for desktop computers.", "The monitors connect to a computer.", "The screens are used for the computers."], "image": "train2014/COCO_train2014_000000456658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116824, "question_id": "ZJhHxtdP3cirQnbeG24aGR", "question": "What is the same color as the animal's face?", "choices": ["frog", "lizard", "amoeba", "raven"], "correct_choice_idx": 3, "direct_answers": ["their legs", "see", "black", "raven", "black white", "dog", "legs", "legs", "legs", "black"], "difficult_direct_answer": false, "rationales": ["A raven is black as is the face of the sheep.", "The sheep's face is colored black.", "The animal's face is black colored."], "image": "train2014/COCO_train2014_000000116824.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232262, "question_id": "ZKxpUZXSAyRXqoK2o5kP3a", "question": "What make is the white vehicle to the left of the green white cab?", "choices": ["mitsubishi", "lexus", "audi", "ford"], "correct_choice_idx": 0, "direct_answers": ["chrysler", "sun", "cut car", "van", "mitsubishi", "nothing", "van", "toyota", "lexus", "mitshubishi"], "difficult_direct_answer": true, "rationales": ["There is a mitsubishi minivan next to the green cab.", "The white vehicle on the left side of the green cab is mitsubishi.", "Its three-diamond logo is visible on the back."], "image": "val2014/COCO_val2014_000000232262.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518110, "question_id": "ZLpSjnckPWfem42HNtsTmt", "question": "What color is the fur on the stuffed bear in the neck tie?", "choices": ["rainbow", "blue", "purple", "orange"], "correct_choice_idx": 3, "direct_answers": ["orange", "red", "white", "orange", "orange", "orange", "brown", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["He's a little darker than a fruit with the same name", "The beanie baby is orange.", "The color is orange."], "image": "train2014/COCO_train2014_000000518110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574184, "question_id": "ZLzNr9wMqRqbNBETa48fsj", "question": "Who is symbolized by the animal near the computer?", "choices": ["zeus", "thor", "marduk", "bastet"], "correct_choice_idx": 3, "direct_answers": ["hp", "bastet", "pirate", "cat", "death", "cat", "cat", "laptop", "person", "cat"], "difficult_direct_answer": false, "rationales": ["That is the symbol of the animal.", "There are skull symbols on the computer monitor.", "The bastet is symbolized."], "image": "val2014/COCO_val2014_000000574184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 233991, "question_id": "ZP6jKMTU6edKvL3ZBKT9KY", "question": "What is the person skateboarding near?", "choices": ["deer", "train", "baby", "car"], "correct_choice_idx": 3, "direct_answers": ["steps", "car", "steps", "car", "stairs", "car", "car", "car", "building", "stairs"], "difficult_direct_answer": false, "rationales": ["There is a 4-door vehicle near the place where the man is doing his thing.", "There is a parked vehicle, not a baby or deer. it is a road vehicle, not a train.", "The person is by a car."], "image": "train2014/COCO_train2014_000000233991.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537698, "question_id": "ZQ7iThPh6zExdWB6sHh3ov", "question": "What color is the interior of the lettering in front of the helmet on the batter?", "choices": ["pink", "orange", "white", "red"], "correct_choice_idx": 2, "direct_answers": ["white", "white", "white", "white", "white", "cream", "blue", "red", "white", "white"], "difficult_direct_answer": false, "rationales": ["The helmet is clearly visible and the letter coloring is also visible.", "The color is white.", "The color is white."], "image": "train2014/COCO_train2014_000000537698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506919, "question_id": "ZQW7zgdiD2vUCRM9VuMHYN", "question": "What color is the tail fin on the furthest left side of the tarmac?", "choices": ["red", "blue", "yellow", "green"], "correct_choice_idx": 0, "direct_answers": ["red", "redwhite", "red", "blue", "red", "red", "red", "red", "no", "red"], "difficult_direct_answer": false, "rationales": ["The one furthest away is a blue color.", "The plane on the left belongs to virgin airlines. their company colors are red and white.", "The color is red."], "image": "train2014/COCO_train2014_000000506919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77953, "question_id": "ZQzHs5cLhKkQuwZ8tgLiax", "question": "What place looks most similar to this?", "choices": ["siberia", "egypt", "missouri", "venice"], "correct_choice_idx": 3, "direct_answers": ["amalfi coast", "italy", "italy", "venice", "resort", "italy", "usa", "ocean", "boating", "china"], "difficult_direct_answer": false, "rationales": ["The river by the city is reminiscent of ones located in europe.", "The place is venice.", "There are gondolas in the water near european buildings. missouri, egypt, and siberia do not look like this."], "image": "val2014/COCO_val2014_000000077953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165200, "question_id": "ZRjopnftBZQPGZvQHhe44j", "question": "What is the middle advertisement on the blue wall about?", "choices": ["athletic help", "banking", "suicide hotline", "cellular service"], "correct_choice_idx": 3, "direct_answers": ["cell phones", "phone service", "t-mobile", "cellular service", "stick together", "phones", "phones", "cell service", "t-mobile", "tmobile"], "difficult_direct_answer": false, "rationales": ["The advertisement is for the company t-mobile.", "The advertisement is for phones.", "The cell service is advertised."], "image": "train2014/COCO_train2014_000000165200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576417, "question_id": "ZTgsomLQM5ezFzdV9Ah8WS", "question": "What color is the underbelly of the small aircraft?", "choices": ["yellow", "red", "blue", "white"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "burgundy", "pink", "burgundy", "red", "blue red", "red", "purple"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The plane in question is clearly visible and the colors are identifiable.", "The underbelly does not match the white top part of the small aircraft and is not blue or yellow."], "image": "train2014/COCO_train2014_000000576417.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285296, "question_id": "ZWKbHqa6Hqqihjn5GGxM8u", "question": "What is the job of the man in the black shirt and dress pants behind the players?", "choices": ["coach", "judge pitches", "medical staff", "broadcaster"], "correct_choice_idx": 1, "direct_answers": ["umpire", "umpire", "judge pitches", "umpire", "referee", "referee", "umpire", "umpire", "umpire", "umpire"], "difficult_direct_answer": false, "rationales": ["The man is the umpire and makes the calls on the balls thrown.", "He is bent over and watching the ball so that he can see if it is thrown fairly.", "The man in the black shirt's job is to judge pitches."], "image": "train2014/COCO_train2014_000000285296.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409135, "question_id": "ZYtWu8BojnEmLR8H6SmPng", "question": "What time of day is it likely to be?", "choices": ["morning", "afternoon", "night", "evening"], "correct_choice_idx": 1, "direct_answers": ["morning", "morning", "afternoon", "afternoon", "afternoon", "afternoon", "evening", "noon", "daytime", "afternoon"], "difficult_direct_answer": false, "rationales": ["Based on the shadows it looks like the sun is high in the sky, which would make it afternoon.", "There are many people around and you can see the shadows indicating the sun is up and people are active.", "There is a skater cruising along a path. the sun is shining high above his head."], "image": "train2014/COCO_train2014_000000409135.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517835, "question_id": "ZZPzCy7V2zeBMtyUfZ4to2", "question": "What is on the floor?", "choices": ["eggs", "cat", "grate", "ketchup"], "correct_choice_idx": 2, "direct_answers": ["grate", "grate", "grate", "dirt", "storm drain", "grate", "water line", "drain", "grate", "fire hydrant"], "difficult_direct_answer": false, "rationales": ["A sewer grate is on the ground.", "Grates are on the floor.", "There is a vented covering fitted in the pavement."], "image": "train2014/COCO_train2014_000000517835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147779, "question_id": "ZZT5KJjTbFCMmVyT73j29S", "question": "What kind of fries are pictured next to the hot dog covered in cheese?", "choices": ["curly", "straight", "wide", "wavy"], "correct_choice_idx": 3, "direct_answers": ["crinkled", "crinkle cut", "crinkle", "wavy", "crinkle cut", "stop", "crinkled", "crinkle cut", "pizza", "crinkle cut"], "difficult_direct_answer": false, "rationales": ["The fries are not straight and they are crinkly.", "The fries are wavy.", "There are wavy lines on the fries."], "image": "train2014/COCO_train2014_000000147779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486233, "question_id": "Za6QSicyZjJKArG3oLkWmc", "question": "How many air jets are flying altogether in a formation?", "choices": ["five", "three", "four", "two"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "five", "four"], "difficult_direct_answer": false, "rationales": ["There are four jet crafts flying together.", "There are 4.", "The jets are clearly visible and they are countable based on their distinct outlines and the number of cabins and wings."], "image": "val2014/COCO_val2014_000000486233.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 314439, "question_id": "ZbFsCVsQX9iWNpdceQJL9C", "question": "What is the last number on the license plate of the motorcycle in the foreground?", "choices": ["six", "four", "three", "seven"], "correct_choice_idx": 0, "direct_answers": ["six", "six", "six", "six", "six", "six", "six", "six", "six", "six"], "difficult_direct_answer": false, "rationales": ["The bike closest to the viewer has a digit on both the licence as well as body of bike.", "The license plate of the vehicle closest is clearly visible and answer a is visibly the last number.", "The number is 6."], "image": "train2014/COCO_train2014_000000314439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19871, "question_id": "ZcosfhX5b3QMMgfVqtBCAy", "question": "What country most likely hosts the bus parked near this national landmark?", "choices": ["uk", "france", "usa", "germany"], "correct_choice_idx": 0, "direct_answers": ["england", "usa", "england", "england", "russia", "england", "england", "uk", "uk", "england"], "difficult_direct_answer": false, "rationales": ["The country is the uk.", "The bus is popular in the uk.", "The uk is known for having double decker buses. there is also a large clock in the background which is a landmark for the uk too."], "image": "train2014/COCO_train2014_000000019871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 292103, "question_id": "ZePQp8MaM4XC5W2PnuYoWc", "question": "What is the side dish on the plate in the foreground?", "choices": ["oranges", "apple slices", "rice", "fries"], "correct_choice_idx": 2, "direct_answers": ["rice", "rice", "rice", "pancakes", "rice", "rice", "rice", "eggs", "rice", "rice"], "difficult_direct_answer": false, "rationales": ["The mass of small tic-tac shaped grains on the right side of this plate is known as rice.", "Rice is small grains which this food is. there are no fruits visible and no fries so this must be rice.", "The other options aren't in this scene."], "image": "val2014/COCO_val2014_000000292103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415594, "question_id": "ZfGxWASLrwVmNFwHibhVc3", "question": "What color is the hair of the woman who is putting a spatula inside of the kitchen oven?", "choices": ["brown", "blonde", "brunette", "red"], "correct_choice_idx": 1, "direct_answers": ["brown", "blonde", "blonde", "brown", "blonde", "blond", "dwdw", "blonde", "brown", "washing machine"], "difficult_direct_answer": false, "rationales": ["This person has lighter hair.", "The woman has blonde hair.", "You see some gold color on the hair."], "image": "train2014/COCO_train2014_000000415594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246641, "question_id": "ZfPCCpUohnmkDUePwdWbQ2", "question": "What do the pictures look like?", "choices": ["cats", "dogs", "russian soldiers", "missing children"], "correct_choice_idx": 3, "direct_answers": ["good", "yearbook", "sandwiches", "food", "sandwiches", "school photos", "yearbook", "school photos", "sandwiches", "missing children"], "difficult_direct_answer": false, "rationales": ["The pictures are of people, not animals. they are too young to be in the military.", "The pictures are missing kids.", "The pictures show missing children."], "image": "train2014/COCO_train2014_000000246641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315961, "question_id": "ZgehoQJ9hqtSBuLWPRRcZq", "question": "Ho wmany zebras are visible inside of the large conservatory enclosure?", "choices": ["two", "six", "four", "three"], "correct_choice_idx": 2, "direct_answers": ["four", "zero", "four", "zero", "zero", "three", "zero", "four", "four", "nothing"], "difficult_direct_answer": false, "rationales": ["There are four zebras.", "There are five with two behind the golf cart and three at the fence", "None of the answers are correct as there are no zebras visible, but answer a does correctly number the animals (giraffes) visible."], "image": "train2014/COCO_train2014_000000315961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529041, "question_id": "Zj6DXVXZd6j6VkszanDfAb", "question": "What is the purpose of the red truck in the image?", "choices": ["health safety", "distinguish fires", "night partys", "citizen transportation"], "correct_choice_idx": 1, "direct_answers": ["extinguishing fires", "fire fighting", "distinguish fires", "fire", "fight fires", "fire", "fire", "travel", "fire engine", "fire fighting"], "difficult_direct_answer": false, "rationales": ["This is a firetruck", "The truck is driven by firefighters to put out blazes.", "Red trucks such as these are fire trucks. they are used to put out fires."], "image": "val2014/COCO_val2014_000000529041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131280, "question_id": "ZjPoUJ5rPZJ7H6Ms95iLes", "question": "What color is the text for this jersey of the boy playing baseball?", "choices": ["white", "red", "blue", "yellow"], "correct_choice_idx": 1, "direct_answers": ["blue", "boy", "rouex", "red", "red", "red", "blue", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The text is bright red on the shirt.", "The color is red.", "The text on the baseball jersey is red."], "image": "val2014/COCO_val2014_000000131280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111737, "question_id": "ZjbnwSV5kUWUtbRtBYjmrr", "question": "What would someone most likely clean in this room?", "choices": ["clothes", "body", "dishes", "car"], "correct_choice_idx": 2, "direct_answers": ["cook", "dishes", "water", "dishes", "dishes", "dishes", "dishes", "six", "sink", "dishes"], "difficult_direct_answer": false, "rationales": ["The room has dishes.", "Someone would clean dishes.", "A sink is near a stove and pots and pans can be seen in the cupboards."], "image": "train2014/COCO_train2014_000000111737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 531337, "question_id": "Zjwmw6GjrbfjrAP6b2cnTf", "question": "What kind of vegetable leaf is placed on top of the pizzas?", "choices": ["parsley", "spinach", "cilantro", "lettuce"], "correct_choice_idx": 1, "direct_answers": ["basil", "almond", "spinach", "spinach", "basil", "curry leaf", "basil", "spinach", "basil", "green leaf"], "difficult_direct_answer": false, "rationales": ["That is the food popeye is known for eating.", "It is a leafy green often found on pizza.", "The leaf is spinach."], "image": "train2014/COCO_train2014_000000531337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 121666, "question_id": "ZmPDM6wzUPHcLTMpGqs6rv", "question": "What color is the man's jacket on the far left?", "choices": ["blue", "green", "black", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The color is bright and easily visible. it is in sharp contrast to the white snow and is a color typical of this type of clothing.", "A group of skiers are posing together and one of them has a brightly colored coat."], "image": "train2014/COCO_train2014_000000121666.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400430, "question_id": "ZmTWqtqdXoed5buJksu9Fb", "question": "What is the woman wearing?", "choices": ["scarf", "boots", "gas mask", "crown"], "correct_choice_idx": 1, "direct_answers": ["black dress", "cort", "boots", "dress", "boots", "boots", "boots", "boots", "boots", "skirt"], "difficult_direct_answer": false, "rationales": ["The shoes go up her legs partially", "The only woman visible is wearing many articles of clothing and accessories, but the most distinct based on their style and color is answer a.", "The woman has boots."], "image": "train2014/COCO_train2014_000000400430.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 103571, "question_id": "ZoEJCbLLLmcaF32qUhQzXA", "question": "How many horses are upright?", "choices": ["five", "six", "eight", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three horses", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The number of horses can be counted based on their outlines.", "There are only this number in the scene.", "One upright horse is in between two other upright horses."], "image": "val2014/COCO_val2014_000000103571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 92575, "question_id": "ZoRdoFgB3zBpx9zxdLs3YY", "question": "What could the teddy bear be submerged in?", "choices": ["orange juice", "tomato juice", "wine", "beer"], "correct_choice_idx": 0, "direct_answers": ["juice", "orange juice", "orange juice", "orange juice", "juice", "orange juice", "orange juice", "sitting", "orange juice", "notning"], "difficult_direct_answer": false, "rationales": ["The teddy bear could be submerged in a glass of orange juice.", "This is the only liquid available in the scene for the teddy to be dunked in.", "The drink next to the bear is the same color as the fruit."], "image": "train2014/COCO_train2014_000000092575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297267, "question_id": "ZpA2WF4VGFBbK5Yj9bvxnf", "question": "What sound do people say the item on the right makes?", "choices": ["meow", "moo", "choo choo", "woof"], "correct_choice_idx": 2, "direct_answers": ["woowoo", "choo-choo", "choochoo", "choo choo", "choochoo", "choo", "choo choo", "shoo shoo", "choo", "choo choo"], "difficult_direct_answer": false, "rationales": ["The train says choo choo.", "A train arrives at a station in this image. choo choo is a written representation of the sound a train makes.", "The item on the right is a train, not a dog, cat, or cow."], "image": "val2014/COCO_val2014_000000297267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302740, "question_id": "ZqoQXGwojsPqwxn54ZPmGv", "question": "What kind of bread makes the sandwich the woman is eating?", "choices": ["american", "wheat", "sourdough", "french"], "correct_choice_idx": 3, "direct_answers": ["bread", "french", "french", "baguette", "hotdog", "wheat", "italian", "french bread", "baguette", "french bread"], "difficult_direct_answer": false, "rationales": ["The bread is a baguette.", "A woman is eating sandwich made with long, thin bread. french bread is long and thin.", "The bread is french."], "image": "train2014/COCO_train2014_000000302740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365282, "question_id": "ZrKBrBaEc9TXKrg3jSckAF", "question": "The girl is playing with what animals?", "choices": ["skunks", "lizards", "cats", "bears"], "correct_choice_idx": 3, "direct_answers": ["bears", "bear cubs", "bears", "bear", "dog", "bear", "bear cubs", "bear", "bears", "bears"], "difficult_direct_answer": false, "rationales": ["The woman is sitting on the floor playing with orphaned (possibly) cubs.", "The girl is playing with bear cubs.", "The girl is playing with baby bears."], "image": "train2014/COCO_train2014_000000365282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358158, "question_id": "ZrSmqpqfUzzpr2M7dSnnea", "question": "What is the child holding up to their ear?", "choices": ["remote", "tablet", "phone", "wallet"], "correct_choice_idx": 0, "direct_answers": ["remote", "cellphone", "phone", "remote", "phone", "phone", "remote control", "remote", "phone", "remote"], "difficult_direct_answer": false, "rationales": ["A long rectangular object with no screen is being held up to a child's ear.", "It is a long device that controls the television.", "The object has black at the bottom which is a place where it can change channels or volume on a tv."], "image": "train2014/COCO_train2014_000000358158.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455860, "question_id": "ZsF35oHw6CEDVjfusME2n6", "question": "What country is the four digit area code for that appears in front of the 656 656 numbers?", "choices": ["china", "japan", "england", "germany"], "correct_choice_idx": 2, "direct_answers": ["thailand", "country", "england", "car", "fsss", "thailand", "mexico", "uk", "england", "china"], "difficult_direct_answer": false, "rationales": ["I had to look this up online and was able to confirm it.", "The country is england.", "The country is england."], "image": "train2014/COCO_train2014_000000455860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27710, "question_id": "ZsG7mowEjMiz5EroCEE8BH", "question": "What color is the bull int he field of white bulls who is alone among the white?", "choices": ["silver", "brown", "black", "gray"], "correct_choice_idx": 1, "direct_answers": ["brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["There is a brown bull in the middle of the field full of white bulls.", "The bull is not black, gray, or silver.", "The one in the middle is brown."], "image": "train2014/COCO_train2014_000000027710.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259572, "question_id": "ZtmetKGWjjd4f7VmmAMrqp", "question": "What color is the top of the ball laid on top of the computer desk?", "choices": ["green", "black", "yellow", "red"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "green", "stop", "yellow blue", "yellow", "car", "blue", "blue yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "The ball is clearly visible based on the shape and the colors are clearly identified. answer a is the dolor located towards the top of the ball.", "A cat is on a desk behind ab all with a blue bottom and yellow top."], "image": "train2014/COCO_train2014_000000259572.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 286569, "question_id": "Ztu5TTqrmARJhd2MV8Fjze", "question": "What is the man doing while he is leaning on the metal railing?", "choices": ["phone", "math", "pager", "umbrella"], "correct_choice_idx": 0, "direct_answers": ["phone call", "talking", "phone call", "on phone", "phone call", "phone", "talking", "talking", "talking phone", "phone call"], "difficult_direct_answer": false, "rationales": ["The man in this image holds a small rectangular device to his ear and mouth.", "There is a man talking on a phone while leaning on the rail.", "The man is calling on a phone."], "image": "train2014/COCO_train2014_000000286569.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 408431, "question_id": "ZuDegMGjpcvmaKj7u3VH4W", "question": "What kind of cup is sat on the desk next to the computer mouse?", "choices": ["glass", "wine glass", "tea cup", "mug"], "correct_choice_idx": 3, "direct_answers": ["beer", "glass", "stein", "beer", "beer mug", "mug", "mug", "beer mug", "bench", "mug"], "difficult_direct_answer": false, "rationales": ["One can see the glass stein located near the mouse.", "There is a glass beer mug near the computer mouse.", "The cup is made of non-transparent glass. it is not a tea cup."], "image": "train2014/COCO_train2014_000000408431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78059, "question_id": "Zui47Qhp8nUk2urZhuxnSB", "question": "What shape is the mirror hanging on the wall with some wooden planks?", "choices": ["circle", "rectangle", "square", "oval"], "correct_choice_idx": 1, "direct_answers": ["square", "rectangle", "square", "rectangle", "rectangle", "bath shower", "square", "rectangle", "rectangle", "square"], "difficult_direct_answer": false, "rationales": ["The mirror is not curved. it does not have equal sides.", "The mirror is rectangle.", "The shape is a rectangle."], "image": "train2014/COCO_train2014_000000078059.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413092, "question_id": "ZvRZHSxXqSpLnDoCZHB9oE", "question": "The racket company is named after who?", "choices": ["athlete", "sport inventor", "president", "founder"], "correct_choice_idx": 3, "direct_answers": ["wilson", "wilson", "wilson", "wilson", "founder", "wilson", "racket", "wilson", "wilson", "wilson"], "difficult_direct_answer": false, "rationales": ["There is a large \"w\" letter on the racket with a man standing there with a tennis ball.", "It is the initial for wilson", "The company takes its name from the person who started it."], "image": "train2014/COCO_train2014_000000413092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43933, "question_id": "ZvSV6b6eCHY2onKs5YbY5T", "question": "What celebrity first name appears on the boat?", "choices": ["idris elba", "jim duggan", "tony atlas", "lauren cohan"], "correct_choice_idx": 3, "direct_answers": ["lauren", "dwwww", "lauren jade", "lauren", "lauren", "lauren cohan", "lauren", "lauren", "lauren jade", "lauren jade"], "difficult_direct_answer": false, "rationales": ["They have the same name", "The first answer is the only possible choice for this question.", "A name is printed on the side of a boat that is pulled onto land."], "image": "train2014/COCO_train2014_000000043933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 87862, "question_id": "Zvp3kmBk3E4xTNCEf6BC7h", "question": "Why are the people in the stands?", "choices": ["sleeping", "playing", "working", "watching"], "correct_choice_idx": 3, "direct_answers": ["spectating", "watching", "fans", "to watch", "spectators", "fans", "observers", "spectating", "watching match", "watching"], "difficult_direct_answer": false, "rationales": ["The audience is watching the game.", "People are playing tennis. the people in the stands are spectators.", "People are in the stands to watch a tennis game."], "image": "train2014/COCO_train2014_000000087862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438973, "question_id": "Zx44WgDmKimB5ZxBRw3X3s", "question": "What is the writing which is written on the 3D statue on the right side?", "choices": ["hieroglyphics", "gaelic", "phoenician", "cuneiform"], "correct_choice_idx": 0, "direct_answers": ["hieroglyphs", "hyrogliphics", "hieroglyphs", "hieroglyph", "hieroglyphics", "hieroglyphics", "egyptian", "hyroglyphics", "egyptian hieroglyphics", "visual shapes"], "difficult_direct_answer": false, "rationales": ["There are concrete structures behind the giraffe. they show egyptians which used symbols for communication.", "There are some egyptian hieroglyphics on the side of the giraffe enclosrue.", "The signs were done by egyptians."], "image": "train2014/COCO_train2014_000000438973.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423165, "question_id": "ZyWAqTsuqVqGRDBegyBT6g", "question": "What is likely in the large bottle in the reflection?", "choices": ["conditioner", "sunscreen", "shampoo", "lotion"], "correct_choice_idx": 3, "direct_answers": ["lotion", "hand cream", "handwash", "shampoo", "lotion", "lotion", "tooth paste", "lotion", "soap", "lotion"], "difficult_direct_answer": false, "rationales": ["The large bottle has lotion.", "There is a white bottle with a pump on the shelf.", "It has a dispenser but isn't at the sink like soap would be"], "image": "val2014/COCO_val2014_000000423165.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349846, "question_id": "ZyjmrxB3YEvXCePeTCBEMt", "question": "What color is the boundary section of the surfboard held by the woman in the wetsuit?", "choices": ["pink", "red", "purple", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "lavender", "yellow", "gray", "gray blue", "white", "grey", "gray", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["There is a flag this color on the pole", "Though the sun brightens it to a silver looking color the boundary of the visible surfboard in this image would be labelled blue.", "A lady is hanging a ten with her hand and holding a white surfboard with blue lines around it."], "image": "train2014/COCO_train2014_000000349846.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168562, "question_id": "ZzZEpNrsDxBBHBHhrzieGt", "question": "How many little baby bears are walking under the fallen log?", "choices": ["three", "five", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["one", "five", "two", "five", "two", "five", "five", "one", "four", "two"], "difficult_direct_answer": false, "rationales": ["There are 5 bears.", "There are only three small bears going under a grassy area with a long wood tree atop rocks. two other bears are in the grass.", "Four bears are clearly visible in this image and one more is obscured partially by the rocks on the right."], "image": "val2014/COCO_val2014_000000168562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140291, "question_id": "a2wcbUqzaRTe68YQ7Xw8DZ", "question": "What color is the frisbee held by in the right hand of the man in the background?", "choices": ["white", "blue", "red", "yellow"], "correct_choice_idx": 2, "direct_answers": ["orange", "white", "orange white", "orange", "orange", "red", "cream", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The frisbee in his left hand is white. the frisbee in his right hand does not match the one in his left hand and is not blue or yellow.", "The man in the background holds a red frisbee in the background.", "The man in the background has a red frisbee in his right hand."], "image": "train2014/COCO_train2014_000000140291.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188613, "question_id": "a35zN3kDcf98oJizZqCZu7", "question": "What color are the insides of the tennis courts in this park?", "choices": ["white", "red", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue green", "blue green"], "difficult_direct_answer": false, "rationales": ["The color is easily visible and bright. it is in sharp contrast to the green ground. it is similar to the color of the sky.", "The insides are blue.", "The color is blue."], "image": "val2014/COCO_val2014_000000188613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411748, "question_id": "a44k5L2EHc4NTbUMEJWmch", "question": "What color is the interior of the clock face illuminated?", "choices": ["blue", "white", "orange", "green"], "correct_choice_idx": 2, "direct_answers": ["yellow", "gold", "yellow", "yellow", "yellow", "purple", "yellow", "yellow", "yellow", "orange"], "difficult_direct_answer": false, "rationales": ["It is an amber color", "The clock is illuminated orange.", "There is an orange illuminated area inside of the clock tower."], "image": "train2014/COCO_train2014_000000411748.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303392, "question_id": "a4RPUxQsWXt7fJt3AcESNJ", "question": "How many portraits are hung on the mustard colored walls?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "white", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is 1 portrait.", "Only a single piece of artwork can be seen in this image.", "There is a single portrait hanging over a couch with wood rails."], "image": "train2014/COCO_train2014_000000303392.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135626, "question_id": "a4oj8b8LKi9QjFfvZfJ2KS", "question": "What is the woman in the foreground wearing?", "choices": ["armor", "backpack", "crown", "headphones"], "correct_choice_idx": 1, "direct_answers": ["backpack", "raincoat", "coat", "trench coat", "white", "trenchcoat", "jacket", "coat", "coat", "cort"], "difficult_direct_answer": false, "rationales": ["The woman has a backpack.", "She is wearing backpack as it is seen.", "The woman has a backpack."], "image": "train2014/COCO_train2014_000000135626.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189656, "question_id": "a52pcTJ3DaPZWiBaJ5x3Fg", "question": "What holiday is the color pattern on the bed most appropriate for?", "choices": ["halloween", "valentines day", "thanksgiving", "arbor day"], "correct_choice_idx": 1, "direct_answers": ["valentines day", "valentines day", "valentines day", "valentines", "flower", "christmas", "christmas", "valentines day", "valentine's day", "beach"], "difficult_direct_answer": false, "rationales": ["The color pattern on the bed is red and very appropriate for valentine's day.", "This is a bedroom with red cover and pillows with flowers all over it.", "The holiday is valentine's."], "image": "train2014/COCO_train2014_000000189656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 377999, "question_id": "a5sZ8n4ahzWQ9ySFszEqKV", "question": "What type of water body is this as evidenced by the beach in the background?", "choices": ["ocean", "river", "canal", "lake"], "correct_choice_idx": 0, "direct_answers": ["lake", "ocean", "bay", "lake", "lake", "boat", "lake", "ocean", "lake", "sea"], "difficult_direct_answer": false, "rationales": ["These largest bodies of water usually make sand at the waters edge.", "The water is an ocean.", "This body of water is buy a beach and very large."], "image": "val2014/COCO_val2014_000000377999.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310342, "question_id": "a6EoqMwY5ap48DrbZTVmCn", "question": "What item has a back end that shares the name of an item here?", "choices": ["knife", "car", "boat", "egg"], "correct_choice_idx": 1, "direct_answers": ["car", "elephant", "car", "elephant", "trunk", "tail", "car", "car", "tail", "ivory key"], "difficult_direct_answer": false, "rationales": ["A car has a trunk, and so does an elephant.", "The other name for it is trunk. that said, in some areas of the world, it's also referred to as the boot.", "The elephant has a trunk, which is the same name as the back of a car."], "image": "val2014/COCO_val2014_000000310342.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52661, "question_id": "a8emZZApvUqhGeUDV7KhQS", "question": "What color is the big tarp suspended over the deck of the large yacht?", "choices": ["blue", "gray", "green", "green"], "correct_choice_idx": 0, "direct_answers": ["dwdw", "white", "blue", "blue", "blue", "grey", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["This is obvious in the scene.", "There is a large blue tarp on top of the ship.", "There is a large blue tarp on top of the boat."], "image": "val2014/COCO_val2014_000000052661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439251, "question_id": "a93BKE4DjVZtHP2iQUxpY5", "question": "What is next to the animal in the water?", "choices": ["man", "eel", "surf board", "shark"], "correct_choice_idx": 0, "direct_answers": ["person", "man", "person", "man", "human", "man", "person", "man", "man", "man"], "difficult_direct_answer": false, "rationales": ["There is a person swimming next to the animal in the water.", "The man is near.", "The man is nearby."], "image": "train2014/COCO_train2014_000000439251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253464, "question_id": "a9MfDzHM4MjF4XdYUGXcQc", "question": "What color are the towels hanging above the white porcelain toilet?", "choices": ["red", "purple", "blue", "gray"], "correct_choice_idx": 3, "direct_answers": ["gray", "gray", "olive", "gray", "gray", "green", "metallic silver", "khaki", "sandal colour", "grey"], "difficult_direct_answer": false, "rationales": ["The color is gray.", "There is only one toilet visible, distinct by its design and shape. above the toilet are towels of a color that is readily identifiable.", "The color is gray."], "image": "train2014/COCO_train2014_000000253464.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31230, "question_id": "aAkirmPQuShKLrBtacHntD", "question": "What is in front of the television?", "choices": ["dog", "snowman", "carriage", "old man"], "correct_choice_idx": 0, "direct_answers": ["purse", "toy", "cat", "women", "two women", "women", "dog", "dog", "dog", "1 people"], "difficult_direct_answer": false, "rationales": ["The tv has a dog.", "A small black animal with floppy ears and a tail is laying on the floor in between a television and two girls playing a game.", "There is a dog."], "image": "train2014/COCO_train2014_000000031230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571654, "question_id": "aBjgFqzn6hhHTC8wzBaYu2", "question": "What is near the laptops?", "choices": ["orange", "dog", "cat", "banana"], "correct_choice_idx": 0, "direct_answers": ["mice", "monitors", "computers", "mouse", "orange", "mouse", "tablet", "computer speakers", "mouse", "monitors"], "difficult_direct_answer": false, "rationales": ["This type of fruit is the only thing on the list that can be seen in the image.", "There is round fruit of this color on the desk", "There is an orange on the desk of the cubicle between the computers."], "image": "train2014/COCO_train2014_000000571654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 470935, "question_id": "aBuKjNgNWN8RTPN5Nwuhf5", "question": "What kind of cooking element does the stove have?", "choices": ["gas", "electric", "rangetop", "induction"], "correct_choice_idx": 1, "direct_answers": ["electric", "electric", "electric", "electric", "electric", "electric", "electric", "induction", "oven", "electric"], "difficult_direct_answer": false, "rationales": ["It seems to be electric since there are no burners.", "The top of the stove is flat and there are no burners present for a gas type stove. also, the oven is attached so it is not a range top.", "It is an electric stove."], "image": "val2014/COCO_val2014_000000470935.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255443, "question_id": "aCeaWiCiAZ862AWnaELDLw", "question": "What color are the posts where one of the players had put his jacket on?", "choices": ["blue", "red", "green", "orange"], "correct_choice_idx": 2, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The poles are green.", "The posts are not blue, orange, or red.", "The posts are not blue, orange, or red."], "image": "train2014/COCO_train2014_000000255443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 29622, "question_id": "aDTcia9VGLrYuTBPeGQo8j", "question": "What is the bird above?", "choices": ["cow", "mud", "sand", "dog"], "correct_choice_idx": 2, "direct_answers": ["beach", "sand", "sand", "sand", "eagle", "seagull", "sky", "seagull", "seagull", "beach"], "difficult_direct_answer": false, "rationales": ["He's at the beach", "The bird is in the sand.", "You find seagulls at the beach very often."], "image": "train2014/COCO_train2014_000000029622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544737, "question_id": "aGfX5CdVSpoM5AxE67q7Mh", "question": "What color is the little napkin worn on the girl's chest?", "choices": ["green", "red", "blue", "yellow"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "pink"], "difficult_direct_answer": false, "rationales": ["It is a blue color. it is in stark contrast to the pink outfit.", "The napkin is colored blue.", "The color is blue."], "image": "train2014/COCO_train2014_000000544737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 60526, "question_id": "aGhNUZuVQygrrxsercMfBK", "question": "Where would you see this setup?", "choices": ["africa", "asia", "south america", "australia"], "correct_choice_idx": 1, "direct_answers": ["asia", "sale", "market", "store sale", "market", "china", "outside business", "china town", "china", "storefront"], "difficult_direct_answer": false, "rationales": ["The ceramic items on display at the shop might be found in asia based on the painting and details.", "The setup is in asia.", "The pottery has languages native to this continent on them."], "image": "train2014/COCO_train2014_000000060526.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170974, "question_id": "aGutoZaDgdywMQN4miFedZ", "question": "Which gaming console is being watched by the onlookers?", "choices": ["nintendo wii", "nintendo switch", "microsoft xbox", "sony playstation"], "correct_choice_idx": 0, "direct_answers": ["wii", "nintendo wii", "wii", "wii", "wii", "wii", "digital", "digital life", "video game", "nintendo wii"], "difficult_direct_answer": false, "rationales": ["The game is the wii.", "The man is holding a controller. it is white and is shaped like a remote.", "The white, rectangular controller with the nunchuk is a well-known object for that system."], "image": "val2014/COCO_val2014_000000170974.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206255, "question_id": "aHU8JifG83QCg6VXpXN7Rz", "question": "What room is it?", "choices": ["bedroom", "kitchen", "dining room", "family room"], "correct_choice_idx": 3, "direct_answers": ["living", "living room", "cat room", "bedroom", "livingroom", "living room", "kitchen", "family room", "bedroom", "bedroom"], "difficult_direct_answer": false, "rationales": ["There is a toy car by the cat.", "This is the family room with the cat.", "A cat is just sitting here on a blanket. there is a fireplace behind the cat."], "image": "train2014/COCO_train2014_000000206255.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 355740, "question_id": "aHZ32FeMr94f6EGkwtYZGx", "question": "What animal is between the giraffes?", "choices": ["cat", "zebra", "cow", "dog"], "correct_choice_idx": 1, "direct_answers": ["zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra", "zebra"], "difficult_direct_answer": false, "rationales": ["The animal has black and white stripes.", "The animal is similar to a horse. it has black and white stripes.", "The animal is a striped equine creature between the two giraffes."], "image": "train2014/COCO_train2014_000000355740.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511729, "question_id": "aJmc87JzUcLcgXc8qsNnEa", "question": "How many boys are pictured here?", "choices": ["five", "four", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "one", "two", "two", "one", "one", "one boy"], "difficult_direct_answer": false, "rationales": ["There are more than one but less than three boys visible.", "There are a couple boys side by side in front of the mirror.", "They are in a mirror that is reflecting to make it look like more"], "image": "train2014/COCO_train2014_000000511729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269949, "question_id": "aKCx5zBj2u8igf2MUXQwce", "question": "What country is this building in?", "choices": ["australia", "england", "china", "russia"], "correct_choice_idx": 0, "direct_answers": ["usa", "spain", "na", "united states", "australia", "italy", "england", "europe", "spain", "church"], "difficult_direct_answer": true, "rationales": ["The country is australia.", "This is the university of auckland and is in the \"land down under\".", "The country is australia."], "image": "val2014/COCO_val2014_000000269949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428736, "question_id": "aKsPAoHpcGAPd37nduxdMd", "question": "What usually gets done here?", "choices": ["basketball game", "office meeting", "swimming competition", "football game"], "correct_choice_idx": 1, "direct_answers": ["eating", "eating", "work", "office work", "meetings", "work", "meetings", "eating", "sit", "office meeting"], "difficult_direct_answer": false, "rationales": ["Boardrooms hold meetings.", "There is a table and desks and other office equipment", "The setting is a formal work setting with a table and multiple chairs around it."], "image": "train2014/COCO_train2014_000000428736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372870, "question_id": "aMba3NCvVEirynz6x9Rd5r", "question": "What is this kitty pointing to with his nose?", "choices": ["plant", "desk", "wall", "portrait"], "correct_choice_idx": 0, "direct_answers": ["plant", "flower", "flower", "potted plant", "flowers", "flower case", "vase", "stop", "vase", "vase"], "difficult_direct_answer": false, "rationales": ["The cat is pointing to the plant.", "The kitty is a plant.", "Plants are kept in vases."], "image": "train2014/COCO_train2014_000000372870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441168, "question_id": "aMsthpqiJMg2oHD7G9WRFW", "question": "How many players are engaged in the game as indicated by the number of players in the multi-screen game?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "one", "one", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The game displayed on the tv screen shows two different characters playing the game.", "This television screen has a line down the middle and a digital player character is front and center on each side.", "One player is on the left. one is on the right."], "image": "train2014/COCO_train2014_000000441168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345161, "question_id": "aNYM8y7xTmWq6wfEU3nxJ7", "question": "What are the roofs of the shelters made for the giraffes consisting?", "choices": ["grass", "straw", "brick", "leaf"], "correct_choice_idx": 0, "direct_answers": ["dry leaves", "bamboo", "thatch", "grass", "straw", "grass", "grass", "straw", "grass", "deer"], "difficult_direct_answer": false, "rationales": ["They are stringy plant material", "The roofs are made of grass.", "The roofs are made of grass."], "image": "train2014/COCO_train2014_000000345161.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553793, "question_id": "aNtshvEtbUk8tSegxJyhvu", "question": "What does a person do on the structure next to the fallen shopping cart?", "choices": ["race", "swim", "jump", "sit"], "correct_choice_idx": 3, "direct_answers": ["sit", "bench", "sit", "sit", "sit", "sit", "sit", "sit", "sit", "sit"], "difficult_direct_answer": false, "rationales": ["Next to this fallen shopping cart is a public bench. a benches' purpose is for people to sit on.", "The person would sit by the cart.", "People sit on the bench to the left of the fallen shopping cart."], "image": "train2014/COCO_train2014_000000553793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208994, "question_id": "aPQX3ogXzwhgGvBnnyY5eF", "question": "What shape is the window that the cat is sniffing?", "choices": ["square", "rectangle", "round", "oval"], "correct_choice_idx": 3, "direct_answers": ["oval", "oval", "oval", "circular", "oval", "oval", "oval", "oval", "mirror", "oval"], "difficult_direct_answer": false, "rationales": ["The shape looks like an egg shape.", "It is a squeezed circle.", "It is rounded but is taller than it is wide"], "image": "train2014/COCO_train2014_000000208994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12103, "question_id": "aQ8oGmz39ezJ4BnSq4UZCc", "question": "Who provides the bottles on the counter?", "choices": ["house guest", "homeowner", "hotel", "customer"], "correct_choice_idx": 2, "direct_answers": ["hotel", "staff", "human", "hotel", "hotel", "cleaners", "hotel", "hotel", "hotel", "hotel"], "difficult_direct_answer": false, "rationales": ["They are small sized complementary items for the customers use.", "The hotel provides amenities such as soap like in the bottle.", "The bottles are in a hotel."], "image": "train2014/COCO_train2014_000000012103.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412167, "question_id": "aTHSAWQi6ptnx52rgewWzR", "question": "How many little elephants are together inside of this zoo cage?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "stop", "two", "two", "three", "two", "two", "three", "two", "car"], "difficult_direct_answer": false, "rationales": ["The animals are clearly visible and based on their relative sizes, only the two in the foreground would be considered \"little\".", "There are two little elephants.", "One little elephant is on the left. one is on the right."], "image": "train2014/COCO_train2014_000000412167.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 243156, "question_id": "aUHZb584Ec5ZdASuQPtnd9", "question": "What color is the t-shirt worn by the man on the left who is pointing his fist?", "choices": ["blue", "red", "black", "white"], "correct_choice_idx": 0, "direct_answers": ["blue", "light blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["It is blue.", "The man's t shirt is bright blue.", "The man in question is locatable based on the text of the question and the color of the shirt is visible and identifiable."], "image": "train2014/COCO_train2014_000000243156.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367896, "question_id": "aUpKtR9vsxdraSo6pAepcs", "question": "What kind of meal is this?", "choices": ["balanced", "fruit filled", "sugar filled", "baby food"], "correct_choice_idx": 0, "direct_answers": ["dinner", "pizza", "chinnese", "watching", "dinner", "pizza", "balanced", "dinner", "supper", "dinner supper"], "difficult_direct_answer": false, "rationales": ["It has a protein, starch and vegetable", "You can tell by the different types of food as to what kind it is as far as healthiness.", "A plate with equal parts chicken, vegetables, and potatoes are on a table."], "image": "train2014/COCO_train2014_000000367896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364953, "question_id": "aV4he2faDAYgBwee43RA4n", "question": "Where in the world is this being played?", "choices": ["canada", "asia", "africa", "south america"], "correct_choice_idx": 1, "direct_answers": ["china", "asia", "japan", "korea", "top", "asia", "china", "china", "field", "baseball field"], "difficult_direct_answer": false, "rationales": ["The man is asian", "Looks to be in an asian country.", "This sport is played in asia."], "image": "train2014/COCO_train2014_000000364953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 522779, "question_id": "aWUZqdPAeShVJicg9DLces", "question": "What is the middle color of the baking tray above the oven?", "choices": ["blue", "white", "green", "red"], "correct_choice_idx": 0, "direct_answers": ["blue", "white", "blue", "black", "blue", "blue", "white", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The middle color is blue.", "It's the color of the sky", "There is a stripped red, white, blue baking sheet attached to wall above black stove."], "image": "val2014/COCO_val2014_000000522779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496933, "question_id": "aWr9x95cgS8TBd4TBcaitw", "question": "What color is the underbelly of this private jet?", "choices": ["green", "blue", "orange", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "blue", "black", "black", "black", "black", "black", "dark blue", "white", "black"], "difficult_direct_answer": false, "rationales": ["The color is black.", "The belly of the jet is black.", "The bottom of the plane is a dark shade."], "image": "train2014/COCO_train2014_000000496933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 372428, "question_id": "aXYnjtYStT6xCnb9hN9Fw7", "question": "What are the animals near?", "choices": ["cars", "apple trees", "baby carriages", "chairs"], "correct_choice_idx": 3, "direct_answers": ["chair", "chair", "chairs", "cow", "lounge chairs", "long chairs", "chairs", "tanning chairs", "lounge chairs", "chairs"], "difficult_direct_answer": false, "rationales": ["The animals are by chairs.", "There are white chairs around the animals.", "This furniture is the l shaped to fit a human body. they are often found outside for people to rest on."], "image": "val2014/COCO_val2014_000000372428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446751, "question_id": "aXrVXR7Tst34jVq9CRHxXq", "question": "What feature do these animals have?", "choices": ["talons", "wings", "quills", "trunks"], "correct_choice_idx": 3, "direct_answers": ["big ears", "tusks", "tusks", "trunks", "all grey", "trunks", "trunks", "tusks", "long trunks", "trunk"], "difficult_direct_answer": false, "rationales": ["Elephants are grouped together in an open area.", "The animals all have long noses that look like trees.", "They are elephants, which are famously known for this visible feature."], "image": "val2014/COCO_val2014_000000446751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561251, "question_id": "aYt77qRLmPKTfMHXxZ74r9", "question": "What caused all the indents in the sand?", "choices": ["foot traffic", "acid rain", "artillery fire", "hail"], "correct_choice_idx": 0, "direct_answers": ["footprints", "walking", "feet", "people walking", "feet", "footprints", "footprints", "foot traffic", "peoples feet", "people"], "difficult_direct_answer": false, "rationales": ["Many people have walked on it", "The indents in the sand are caused by people walking back and forth across the surface.", "The divers visible in the sand on this beach are the shape and dimension roughly of the human foot."], "image": "train2014/COCO_train2014_000000561251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463724, "question_id": "abM9WkEEwPrcf25XyWfkCa", "question": "What are the oranges touching?", "choices": ["cat paw", "liquid", "baby hand", "snake"], "correct_choice_idx": 1, "direct_answers": ["liquid", "liquid", "water", "apples", "liquid", "wine", "wine", "liquid", "water", "apples"], "difficult_direct_answer": false, "rationales": ["The other options don't appear in t his image and they're not food related.", "The oranges are floating in a drink.", "The orange slices are soaking in fluid."], "image": "val2014/COCO_val2014_000000463724.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125117, "question_id": "aczucKn72FAwQeyfJt9N4i", "question": "What area of a city would this be called?", "choices": ["outskirts", "campus", "downtown", "suburbs"], "correct_choice_idx": 2, "direct_answers": ["downtown", "downtown", "downtown", "downtown", "downtown", "downtown", "downtown", "downtown", "plaza", "downtown"], "difficult_direct_answer": false, "rationales": ["They are found in town town to prevent accident.", "There are tall commercial, not educational or residential, buildings. there are many people and cars.", "A busy city street is shown with people walking down the sidewalk."], "image": "train2014/COCO_train2014_000000125117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99981, "question_id": "adB3hcggGVSdToCA29QvE7", "question": "How many people are holding ski poles?", "choices": ["four", "six", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "ski poles", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two people standing on the ski slope holding ski poles.", "There are this many men on skis", "Each is holding one pole on each hand and are standing on snow."], "image": "train2014/COCO_train2014_000000099981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70334, "question_id": "afTc7RPmn9mtUz2WWkiibp", "question": "How many little lambs are stood on top of the rock?", "choices": ["one", "two", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "three", "three", "three", "three", "three", "two", "two", "one", "two"], "difficult_direct_answer": false, "rationales": ["There are three lambs but one of them is an adult.", "The others are only on the grass", "The animals are clearly visible and countable based on their individual outlines. of the animals, there lambs are countable based on their relatively smaller size."], "image": "val2014/COCO_val2014_000000070334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 400377, "question_id": "ahJVincVbdh9MyrEfaXtkL", "question": "What is in the cake?", "choices": ["fork", "knife", "babys finger", "spatula"], "correct_choice_idx": 1, "direct_answers": ["knife", "chocolate", "knife", "chocolate", "knife", "knife", "knife", "chocolate", "chocolate", "knife"], "difficult_direct_answer": false, "rationales": ["The cake has a knife.", "It's a sharp blade used to cut slices", "The chocolate cake on the table has a sharp knife sticking into it."], "image": "train2014/COCO_train2014_000000400377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497565, "question_id": "amKqFjit2ViKvX7oYuqRHS", "question": "What does the top of the vehicle look like?", "choices": ["pumpkin", "giant flashlight", "baby", "egg"], "correct_choice_idx": 1, "direct_answers": ["tank", "grey", "vent", "train", "giant flashlight", "train", "train", "train", "tube", "black top"], "difficult_direct_answer": false, "rationales": ["It has a rounded area on the front", "The train looks like a torch from above.", "It looks like a handle and then where the light comes out of it from bottom to top."], "image": "train2014/COCO_train2014_000000497565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357498, "question_id": "an96Vfrsqmbxw5kTLNGmJn", "question": "How many pins are juggled up on top of the post by the man standing on the skateboard?", "choices": ["three", "six", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three pins the guy is juggling.", "One pin is in between two others.", "He has two in the air and one in his hand"], "image": "val2014/COCO_val2014_000000357498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176271, "question_id": "anX6yFZKb9iCB5xWX4FZiW", "question": "What vitamin is this food known for?", "choices": ["b", "", "c", "m"], "correct_choice_idx": 2, "direct_answers": ["vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "vitamin c", "c"], "difficult_direct_answer": false, "rationales": ["Oranges are high in vitamin c", "Oranges are full of vitamin c", "The bowl contains a type of citrus fruit."], "image": "val2014/COCO_val2014_000000176271.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 498436, "question_id": "aoKuXix4vUo95EKCdtmukm", "question": "What is the most likely beverage in the filled cups on the table?", "choices": ["fruit drink", "coffee", "water", "soda"], "correct_choice_idx": 0, "direct_answers": ["juice", "bread", "pizza", "juice", "fruit drink", "beer", "cocktail", "beer", "water", "cocktail"], "difficult_direct_answer": false, "rationales": ["The beverage is fruity.", "The drinks are an orange color.", "The beverage is thick and orange."], "image": "train2014/COCO_train2014_000000498436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253072, "question_id": "aoTcwpwSpGHR4Wf9weDnHe", "question": "What is the man wearing?", "choices": ["backpack", "gas mask", "knee brace", "bandana"], "correct_choice_idx": 3, "direct_answers": ["shorts", "athletic gear", "shorts", "tennis clothes", "bandana", "headband", "shorts", "blue shirt", "tennis gear", "headband"], "difficult_direct_answer": false, "rationales": ["The man has a blue headband around his top part of his head. this helps keep the hair out of his face as well as sweat.", "The man has something on his head.", "A man is playing tennis with a strip of material across his head. headbands are used to soak up sweat during sports and exercising."], "image": "train2014/COCO_train2014_000000253072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530242, "question_id": "aoTjWqrARZayAiX8HWz22y", "question": "What is the color of the center stripe on the flag tossed over the elephant?", "choices": ["yellow", "pink", "green", "red"], "correct_choice_idx": 0, "direct_answers": ["yellow", "blue", "stop", "yellow", "yellow", "yellow", "yellow", "yellow", "stick", "yellow"], "difficult_direct_answer": false, "rationales": ["The center stripe is the color yellow.", "There are three stripes visible on the side of the elephant and the middle stripe is clearly answer a.", "The color is yellow."], "image": "train2014/COCO_train2014_000000530242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276057, "question_id": "apudt4K8UKevnAeR9sv3WU", "question": "What is the number of laptops sat on the bar held over this bed?", "choices": ["six", "five", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "two", "three", "three", "three", "three", "three laptops", "three"], "difficult_direct_answer": false, "rationales": ["There are three laptops on the desk that this person is sitting in front of.", "This is based on how many are visible. it's impossible to see beyond the third in the foreground.", "There are three personal portable computers open in front of this person."], "image": "val2014/COCO_val2014_000000276057.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391113, "question_id": "apuzT7KN4Wo3rd9NHvphJL", "question": "What sound do these animals make?", "choices": ["meow", "neigh", "roar", "baa"], "correct_choice_idx": 3, "direct_answers": ["baa", "baa", "baaa", "bleat", "mehhhhh", "baa", "baa", "baa", "sheep", "baa"], "difficult_direct_answer": false, "rationales": ["The sheep make a \"baa\" sound.", "The animals are sheep which do not meow (cat), roar (lion), or neigh (horse).", "The sheep baa."], "image": "train2014/COCO_train2014_000000391113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169975, "question_id": "arVkAYbQjHBVQuJ33zhMGU", "question": "What is above the wood?", "choices": ["dog", "egg", "wires", "cat"], "correct_choice_idx": 2, "direct_answers": ["phone", "wires", "wires", "hand", "electrical system", "wires", "hand", "electronics", "wires", "robot"], "difficult_direct_answer": false, "rationales": ["The other options aren't in the image.", "You can identify this object by its long thin shape. also, they are characteristically colored red and black to distinguish their charge.", "An electrical structure was built on the wood."], "image": "train2014/COCO_train2014_000000169975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321014, "question_id": "atyU8kPv8UJ98pna6RmC5n", "question": "What is the cat near?", "choices": ["knives", "spoons", "boxes", "apples"], "correct_choice_idx": 0, "direct_answers": ["knives", "knives", "knives", "window", "knife", "knives", "knives", "knife", "windows", "knives"], "difficult_direct_answer": false, "rationales": ["The kitty is on the floor with many cutting tools around him.", "The other options aren't in this image.", "They are shiny sharp object laying on the floor in a semi circle around the cat."], "image": "val2014/COCO_val2014_000000321014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258094, "question_id": "avY9ZPaDKSN5sR4MJ2fdjD", "question": "Who served in the branch of the military whose name appears on the vehicle?", "choices": ["sergeant slaughter", "captain kidd", "lieutenant green", "jimmy carter"], "correct_choice_idx": 3, "direct_answers": ["pilot", "navy", "navy", "navy", "jimmy carter", "mchale", "sailors", "navy", "navy", "navy"], "difficult_direct_answer": false, "rationales": ["A google search revealed that the 39th president of the united states served as a lieutenant in the navy.", "Jimmy carter served.", "It is a navy plane."], "image": "train2014/COCO_train2014_000000258094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 384553, "question_id": "awnJVb42nhXxUwB43ZUtBD", "question": "What is the person's foot near?", "choices": ["elephant ear", "box", "baby carriage", "motorcycle pedal"], "correct_choice_idx": 0, "direct_answers": ["ear", "elephant ear", "elephant's ear", "ear", "ear", "ear", "ear", "elephants ear", "ear", "elephant's ear"], "difficult_direct_answer": false, "rationales": ["The foot is by an elephant ear.", "It's resting behind the massive body part, which is normal when riding these animals.", "The foot of the elephant rider is seen behind the elephant's ear."], "image": "val2014/COCO_val2014_000000384553.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381527, "question_id": "awvP6Zofrmt9zGnDcAz2MG", "question": "Which part of the skateboard is orange?", "choices": ["trucks", "deck", "wheels", "grip tape"], "correct_choice_idx": 1, "direct_answers": ["bottom", "under", "under", "bottom", "deck", "bottom", "bottom part", "bottom", "bottom", "bottom"], "difficult_direct_answer": false, "rationales": ["The bottom part of the skateboard is showing the color. the wheels are shown where the color is.", "The part is a deck.", "The bottom of the skateboard, or the deck, is orange."], "image": "val2014/COCO_val2014_000000381527.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558608, "question_id": "ayhLrLcHbBYgq3GuGknbbK", "question": "What is the large item on the sand shaped like?", "choices": ["ant", "basketball", "elephant", "traffic cone"], "correct_choice_idx": 3, "direct_answers": ["kite", "tube", "traffic cone", "cone", "cone", "tube", "cone", "baloon", "tunnel", "toy"], "difficult_direct_answer": false, "rationales": ["The item in question has a circular base and comes to a point with a hollow interior which are all consistent characteristics of answer a.", "You can tell by the shape as to what the kite is shaped like.", "It's open on one end, partially cylindrical and is at a point on the other end"], "image": "val2014/COCO_val2014_000000558608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 180672, "question_id": "azn5dWfQYFoGdFRUE4WesX", "question": "What is the black object on the piano used for?", "choices": ["sheet music", "books", "art", "cd"], "correct_choice_idx": 0, "direct_answers": ["hold music", "music notes", "attaching book", "typing", "trash", "holding paper", "sheet music", "music sheets", "reading music", "sheet music"], "difficult_direct_answer": true, "rationales": ["You can put your music up against it so you can read it.", "The black object on the piano is used for holding sheet music.", "It holds up papers or a book so the music notes can be seen"], "image": "train2014/COCO_train2014_000000180672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542605, "question_id": "b28bw5D3nUbuE8y3UQRKux", "question": "What event is this?", "choices": ["circus", "motorcycle rally", "genius convention", "graduation"], "correct_choice_idx": 1, "direct_answers": ["festival", "cycling", "motorcycle rally", "biketober", "motorcycle rally", "bike week", "harley", "motorcycle rally", "stop", "car"], "difficult_direct_answer": false, "rationales": ["The event is a rally.", "(a) there are many people there with motorcycles parked on the road so it's easy to see that it's a motorcycle rally.", "The event is a motorcycle rally."], "image": "val2014/COCO_val2014_000000542605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433420, "question_id": "b2VNjHjbGYCsw3AaPnQ3zC", "question": "What type of animal is on the Shiner box?", "choices": ["deer", "bull", "ram", "cat"], "correct_choice_idx": 3, "direct_answers": ["cat", "sheep", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["There is a black feline on top of the fridge.", "There is a black and white animal. it has pointy ears, whiskers and a long tail.", "They can jump and climb to high area easily."], "image": "train2014/COCO_train2014_000000433420.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 271032, "question_id": "b3YEvi8uF5K5f6o6mkmKoL", "question": "What is usually found inside of the large blue item?", "choices": ["flowers", "soda", "beef", "fish"], "correct_choice_idx": 3, "direct_answers": ["fish", "water", "fish", "fish", "fish", "fish", "fish", "fish", "fish", "fish"], "difficult_direct_answer": false, "rationales": ["There are a bunch of fish in the large river.", "The blue is water and this type of animal has gills to breathe under water. they are found in water all over the world.", "The item has fish."], "image": "val2014/COCO_val2014_000000271032.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511168, "question_id": "b3xtELpuAafRpy8FS5V8GA", "question": "How many of the giraffes are taking a drink in the water?", "choices": ["six", "one", "four", "three"], "correct_choice_idx": 0, "direct_answers": ["three", "two", "two", "two", "two", "six", "two", "six", "six", "three"], "difficult_direct_answer": false, "rationales": ["The giraffes are characteristic by their long necks and there are some in front and behind the pond. they are in contrast to the striped zebras.", "There are 6.", "All you have to do is count the animals near the water."], "image": "train2014/COCO_train2014_000000511168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153817, "question_id": "b44pDMSXYCYFSvFDwuSnRM", "question": "Which train is the oldest?", "choices": ["middle left", "leftmost", "middle right", "rightmost"], "correct_choice_idx": 1, "direct_answers": ["black", "leftmost", "cathedrals express", "left train", "cathedrals express", "black", "first", "yellow train", "this one", "black train"], "difficult_direct_answer": false, "rationales": ["The black one is a vintage style train so it is likely oldest.", "The train on the left is old.", "It's age can be determined by the old fashioned shape and the fact that steam is coming out of it. the other trains are sleek and bullet like which was not popular a long time ago."], "image": "train2014/COCO_train2014_000000153817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451990, "question_id": "b49wvjuUiQFqmXpKuyKiyG", "question": "What flavor would the donut be if it was the same flavor as the item next to it?", "choices": ["apple", "peach", "strawberry", "orange"], "correct_choice_idx": 2, "direct_answers": ["strawberry", "strawberry", "strawberry", "strawberry", "strawberry", "strawberry", "strawberry", "strawberry", "strawberry", "strawberry"], "difficult_direct_answer": false, "rationales": ["The item is a non-round red fruit.", "The berries are red and have seeds outside.", "The flavor is berry."], "image": "val2014/COCO_val2014_000000451990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542391, "question_id": "b8PpGuFRvSnJrVTAJKkCUU", "question": "What color is the top portrait on the wall between the two windows?", "choices": ["red", "purple", "blue", "pink"], "correct_choice_idx": 2, "direct_answers": ["green", "green", "blue", "green", "white", "green", "green", "green", "green", "blue"], "difficult_direct_answer": false, "rationales": ["The top picture is a shade of blue.", "There is a small blue and black frame on the wall between the windows.", "A picture on a wall is blue and another picture hangs below it."], "image": "train2014/COCO_train2014_000000542391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77394, "question_id": "b8dpZSSCGVY9yjYBZQbuRp", "question": "What color is the center curtain behind the big sofa?", "choices": ["purple", "red", "green", "blue"], "correct_choice_idx": 1, "direct_answers": ["purple", "red", "orange", "red", "purple", "red", "red", "yellow", "orange", "red"], "difficult_direct_answer": false, "rationales": ["The curtain in the center of the wreath is red.", "The curtain in the center behind the big sofa is bright red.", "The color is red."], "image": "val2014/COCO_val2014_000000077394.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 575428, "question_id": "b8faZsFVopihtU8BB3Nfd5", "question": "What is the man wearing?", "choices": ["tie", "sunglasses", "suspenders", "backpack"], "correct_choice_idx": 0, "direct_answers": ["suit", "shirt tie", "tie", "tie", "tie", "tie", "tie", "tie", "uniform", "shirt"], "difficult_direct_answer": false, "rationales": ["A man is dressed in a dress shirt with a necktie and standing on a blue train by the opening of it.", "The dark blue item extending from the man's collar is a tie.", "The man has a tie on his neck."], "image": "val2014/COCO_val2014_000000575428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576963, "question_id": "b9FhSvgFf3DNeu9ZLNmEYa", "question": "What state is written on the shirt of the woman who is playing tennis?", "choices": ["ohio", "new york", "indiana", "michigan"], "correct_choice_idx": 3, "direct_answers": ["michigan", "michigan", "michigan", "michigan", "michigan", "michigan", "michigan", "michigan", "michigan", "michigan"], "difficult_direct_answer": false, "rationales": ["It's in black letters and easy to read", "As long as you can read you can tell what state is written.", "The state's name is on the chest."], "image": "train2014/COCO_train2014_000000576963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 297266, "question_id": "b9YjYT2W3TD6EQEDNbMFNH", "question": "What type of cancer charity are they supporting?", "choices": ["lung", "breast", "liver", "pancreatic"], "correct_choice_idx": 1, "direct_answers": ["pinocchio's", "breast", "ride4 tatas", "breast", "breast", "breast cancer", "breast", "breast", "ridee tatas", "advertisement"], "difficult_direct_answer": false, "rationales": ["The term tatas is slang for a woman's chest.", "The charity is for breast cancer.", "The color of this organization is often pink, the color featured in the sign. the word ta ta represents the part of the body for this organization."], "image": "train2014/COCO_train2014_000000297266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 140322, "question_id": "b9mEnRnecHbCW8icMN5Z3B", "question": "How many little toddlers are sitting on top of the bed?", "choices": ["five", "three", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "one", "one", "one", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two toddlers.", "There are two kids on the bed.", "Two small children are together on a bed."], "image": "train2014/COCO_train2014_000000140322.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96327, "question_id": "bAiCzskhKuAQksa3rA5WeT", "question": "What kind of paste-like food is on top of the spoon?", "choices": ["nutella", "toothpaste", "peanut butter", "cheeseit"], "correct_choice_idx": 2, "direct_answers": ["peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter", "peanut butter"], "difficult_direct_answer": false, "rationales": ["Brown paste is attached to a spoon. it is found from a nut and can be spread to make popular sandwich.", "There is brown peanut butter on the top of the spoon.", "The paste is peanut butter."], "image": "val2014/COCO_val2014_000000096327.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124893, "question_id": "bC7ETt9dbuEai2NFKmvKgS", "question": "What is by the screen?", "choices": ["chair", "dog", "apple", "cat"], "correct_choice_idx": 0, "direct_answers": ["armchair", "footstool", "chair", "door", "chair", "chair", "chair", "chair", "television", "chair"], "difficult_direct_answer": false, "rationales": ["There is a big chair next to the window screen.", "The screen door leads out to the deck. the closest chair will get sunshine and a good view of what is going on outside.", "The chair is by the screen."], "image": "train2014/COCO_train2014_000000124893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13550, "question_id": "bCQ9LfRFhdWGU2kRP2ByAB", "question": "What is the woman wearing?", "choices": ["bandana", "hat", "jeans", "sandals"], "correct_choice_idx": 3, "direct_answers": ["shorts", "shorts", "shorts", "dress", "shorts", "sandals", "tank top", "tank top", "shorts", "shorts"], "difficult_direct_answer": false, "rationales": ["She has sandals on her feet.", "The woman is wearing a pair of sandals next to the elephant.", "The woman is wearing a pair of yellow sandals."], "image": "val2014/COCO_val2014_000000013550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 42861, "question_id": "bCSYwuY2no8McDcLgNAM8Z", "question": "How many church steeples are on a wing with this church?", "choices": ["two", "three", "six", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "ink", "three", "two"], "difficult_direct_answer": false, "rationales": ["The center one isn't a \"wing.\"", "There are three steeples in the wings of this church.", "You can see the three spires on the top of each tower."], "image": "train2014/COCO_train2014_000000042861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7601, "question_id": "bFHk25nzCk5Y5kAnZn6u3J", "question": "What is the sum of the numbers on the cow's tag?", "choices": ["nine", "12", "six", "55"], "correct_choice_idx": 2, "direct_answers": ["three hundred", "six", "six", "six", "312", "six", "six", "five", "six", "six"], "difficult_direct_answer": false, "rationales": ["The sum is six.", "There are three digits visible on the cow's tag and when added together answer a is correct.", "This is the sun when one adds 3 plus 1 plus 2."], "image": "train2014/COCO_train2014_000000007601.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424289, "question_id": "bFjY5xBnMwHzwqZ4uL68eH", "question": "What color is the nose of the sheep who is standing in the front?", "choices": ["gold", "black", "pink", "red"], "correct_choice_idx": 1, "direct_answers": ["black", "black", "black", "bleat", "black", "black", "tan", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The color is black.", "Their nose is black.", "I can see the color that it is."], "image": "val2014/COCO_val2014_000000424289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484353, "question_id": "bGKJRSirGiYcXT3qaNDDAo", "question": "What kind of side meal is there a serving of near the pizza?", "choices": ["parsley", "potato chips", "salad", "french fries"], "correct_choice_idx": 2, "direct_answers": ["salda", "salad", "vegetables", "salad", "fruit", "salad", "fruits", "salad", "fruit salad", "salad"], "difficult_direct_answer": false, "rationales": ["There is a plate of vegetables.", "The veggies form a salad.", "The side meal has tomatoes, radishes, and cucumbers. there are no french fries, potato chips, or parsley."], "image": "train2014/COCO_train2014_000000484353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145351, "question_id": "bH6SgrdBDpH7LZzFGiHjW8", "question": "What type of restaurant is this?", "choices": ["chinese", "diner", "moroccan", "tavern"], "correct_choice_idx": 1, "direct_answers": ["food", "family", "diner", "dessert", "cafe", "cafe", "diner", "diner", "casual dining", "casual"], "difficult_direct_answer": false, "rationales": ["These people are eating deserts from a diner.", "This is a type of diner.", "It's a diner."], "image": "train2014/COCO_train2014_000000145351.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159979, "question_id": "bJ426xh9Un6ntEEMHLFMxX", "question": "What color is the square encapsulating the area of the black umbrella?", "choices": ["red", "yellow", "blue", "white"], "correct_choice_idx": 1, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "brown", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "There is a yellow square contained by the umbrellas.", "The square is yellow."], "image": "val2014/COCO_val2014_000000159979.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 421560, "question_id": "bLCsmguBsvmjAGVotWXMfD", "question": "Who likely made the focal point above the stove?", "choices": ["chef", "electrician", "artist", "tiler"], "correct_choice_idx": 3, "direct_answers": ["tile installer", "glass blower", "interior decorator", "tiler", "tiler", "man", "tiffany", "woman", "artist", "wife"], "difficult_direct_answer": true, "rationales": ["There are tiles above.", "The tile above the stove.", "The object that draws attention about the stove is a mosaic that would be made by someone in the answer a's profession."], "image": "train2014/COCO_train2014_000000421560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438200, "question_id": "bMEYVCUHjTy3EkYZgHQoTd", "question": "What does the spot on the sheep facing the camera look like?", "choices": ["mud", "baby", "egg", "rouge"], "correct_choice_idx": 3, "direct_answers": ["red", "rouge", "blood", "blood", "red", "red", "red", "blood", "red", "blood"], "difficult_direct_answer": false, "rationales": ["It is used to spot sheep.", "It is pinkish red.", "The spot is rouge."], "image": "train2014/COCO_train2014_000000438200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 353634, "question_id": "bMRcSNmFa5U2pLxmisR4cx", "question": "What are the animals walking on?", "choices": ["bed", "desk", "beach", "water"], "correct_choice_idx": 2, "direct_answers": ["sand", "beach", "sand", "sand", "sand", "beach", "bird", "beach", "sand", "sand"], "difficult_direct_answer": false, "rationales": ["They are on the beach", "This is a large sandy area near the ocean.", "They are walking on the beach."], "image": "train2014/COCO_train2014_000000353634.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 33325, "question_id": "bPCqHHGJ48U6zr5GkYUN8V", "question": "What kind of small animal is on the left side of the long advertisement?", "choices": ["zebra", "horse", "cat", "dog"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The long advertisement in the subway station has a small dog on the left side.", "There is a dog in the ad", "The animal is a dog."], "image": "val2014/COCO_val2014_000000033325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365928, "question_id": "bQkmeE5HirFZTZrXr3XWMD", "question": "What is the most abundant type of umbrella used in this beach?", "choices": ["stripe", "grass", "plaid", "rainbow"], "correct_choice_idx": 1, "direct_answers": ["straw", "tan", "grass", "straw", "parasol", "straw", "beach", "grass", "umbrellas", "grass"], "difficult_direct_answer": false, "rationales": ["There are stray leaves dangling off the object. it also looks like hay.", "Grass is used abundantly.", "This material is identifiable by being stringy and brown. this material turns brown as it dries."], "image": "val2014/COCO_val2014_000000365928.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334039, "question_id": "bUWJ3CJ9jrwgqC4GM3Myd8", "question": "What color is the cap on top of the water bottle held by the child?", "choices": ["white", "black", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "green", "blue", "blue", "hoshs", "blue", "blue", "blue", "sfs"], "difficult_direct_answer": false, "rationales": ["The top cap of the water bottle is this color.", "The color is blue.", "It's a blue cap"], "image": "train2014/COCO_train2014_000000334039.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 577405, "question_id": "bXMUVt4YSm7Gs55nwzxzVX", "question": "What is least healthiest on the plate?", "choices": ["muffin", "pizza", "beef", "orange"], "correct_choice_idx": 0, "direct_answers": ["muffin", "apple", "muffin", "muffin", "muffins", "orange", "muffins sugar", "orange", "cake", "apple"], "difficult_direct_answer": false, "rationales": ["The oranges and apples are pure fruits with no processed, unhealthy ingredients added. although the muffin isn't terribly bad for us, it does nevertheless contain processed flour and a bit of sugar, neither of which are good for us.", "The plate has breakfast items, not pizza or beef. the orange is relatively healthy.", "Fruit is usually the best choice between apples, oranges and a muffin."], "image": "train2014/COCO_train2014_000000577405.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168818, "question_id": "bZHGJ9VbmUCZm424EbjQd6", "question": "What are the animals next to?", "choices": ["maypole", "tree", "cable", "egg carton"], "correct_choice_idx": 1, "direct_answers": ["tree", "tree", "tree", "tree", "tree", "tree", "big tree", "tree", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["Trees provide shade so animals including sheep will often sit or lie beneath a tree's shade on a hot day.", "The animals are by a tree.", "The animals are all sitting up next to a wide tree."], "image": "train2014/COCO_train2014_000000168818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 446661, "question_id": "baNrWYDeyvQY3nRf4UdUAY", "question": "Who owns the company whose name appears here?", "choices": ["eli whitney", "richard branson", "juan borgia", "will smith"], "correct_choice_idx": 1, "direct_answers": ["virgin", "virgin atlantic", "richard branson", "virgin", "virgin atlantic", "virgin", "virgin", "virgin atlantic", "benson", "virgin atlantic"], "difficult_direct_answer": false, "rationales": ["The airline company's name is written on the side. the man who owns the airline is internet searchable.", "Richard was the person who created the company.", "It's a virgin atlantic plane"], "image": "train2014/COCO_train2014_000000446661.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176696, "question_id": "bbhzFQcCcqBoc6yGnMEC8n", "question": "Which drink here is the healthiest?", "choices": ["orange juice", "water", "tomato juice", "wine"], "correct_choice_idx": 0, "direct_answers": ["orange juice", "orange juice", "orange juice", "orange juice", "orange juice", "orange juice", "orange juice", "orange juice", "orange juice", "juice"], "difficult_direct_answer": false, "rationales": ["The drinks are visible and inferable based on their colors and the shape of the drinking vessel. between the two, answer a is known to have healthier attributes than the other visible liquid.", "Oj is going to be healthier than the wine as it does not have alcohol in it.", "Orange juice is full of vitamins and nutrients, making it the healthier choice over wine, which contains alcohol."], "image": "val2014/COCO_val2014_000000176696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260637, "question_id": "bcfDQgPFhbYjrwVKGiaKb4", "question": "What is the name of the object on the front of the plane that spins?", "choices": ["fan", "motor", "wings", "propeller"], "correct_choice_idx": 3, "direct_answers": ["propeller", "propeller", "propeller", "propeller", "propeller", "propeller", "plane", "propeller", "propeller", "propeller"], "difficult_direct_answer": false, "rationales": ["This answer is known even without looking at the image, but confirmed by looking at the type of plane and the size and shape of the object in question.", "The spinning object is called a propeller", "The front of the plane is a propeller."], "image": "train2014/COCO_train2014_000000260637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367260, "question_id": "bhnY6cwfDtMAmaiiMXdwJV", "question": "What are the birds in the pen called?", "choices": ["storks", "pelicans", "ducks", "flamingos"], "correct_choice_idx": 2, "direct_answers": ["ducks", "fee", "ducks", "ducks", "ducks", "ducks", "ducks", "chickens", "dove", "ducks"], "difficult_direct_answer": false, "rationales": ["Ducks are usually found by the water.", "You can tell by the birds colors and bills as to what type of birds they are.", "The birds are ducks."], "image": "train2014/COCO_train2014_000000367260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 451033, "question_id": "bhuZdSnpnw47wWgoehHswU", "question": "What type of room might this be?", "choices": ["family room", "conference room", "break room", "office"], "correct_choice_idx": 2, "direct_answers": ["lunchroom", "fashion", "break room", "hotel", "breakroom", "kitchen", "break", "break room", "office breakroom", "break room"], "difficult_direct_answer": false, "rationales": ["There is a kitchenette and a couple tables with a lot of chairs", "You can tell by the fridge, microwave and office equipment as to where this might be.", "The room appears to be in an office setting based on the furniture. rooms that have these components in a work setting would be consistent with answer a."], "image": "train2014/COCO_train2014_000000451033.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308895, "question_id": "bjE6o3fYQk3bjRWqFQsLUv", "question": "What utensil is on the right hand side of the plate?", "choices": ["knife", "pizza cutter", "chopstick", "spatula"], "correct_choice_idx": 0, "direct_answers": ["knife", "fork", "fork", "fork", "knife fork", "knife fork", "knife", "plate", "fork knife", "knife"], "difficult_direct_answer": false, "rationales": ["A knife is on the plate.", "The utensils are clearly visible and the one furthest to the right is the shape of answer a.", "The utensil is a knife."], "image": "train2014/COCO_train2014_000000308895.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281779, "question_id": "bkjWbkLw4MmRBpNd7qqXFR", "question": "What gets plugged into the item in the foreground?", "choices": ["phone", "hose", "battery pack", "television"], "correct_choice_idx": 1, "direct_answers": ["one", "pipe", "fire hose", "paper", "hose", "fire hose", "hose", "hose", "hose", "hose"], "difficult_direct_answer": false, "rationales": ["The item in the sidewalk is a fire hydrant that is used when water is needed to put out a fire, in which case a hose would be plugged into it.", "A hose gets plugged in.", "This fire hydrant can be attached to a hose when one of the white caps are screwed off."], "image": "train2014/COCO_train2014_000000281779.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395592, "question_id": "bmQzetgPs5kq5DyACScyWc", "question": "What is the zebra on the left about to step into?", "choices": ["grass", "road", "water", "hay"], "correct_choice_idx": 1, "direct_answers": ["stand", "road", "road", "road", "road", "road", "road", "bsll", "road", "road"], "difficult_direct_answer": false, "rationales": ["Two animals are walking along a dirt road in a grassy area.", "The zebra is going to the road.", "The zebra is going to the road."], "image": "train2014/COCO_train2014_000000395592.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183757, "question_id": "bp8SvNpY6AfJPSP3v5HGAs", "question": "What animal is near the dog?", "choices": ["hyena", "cat", "muskrat", "eagle"], "correct_choice_idx": 1, "direct_answers": ["cat", "cat", "another dog", "cat", "cat", "cat", "cat", "cat", "cat", "cat"], "difficult_direct_answer": false, "rationales": ["A dog is lying down on the back of a boat and a cat is visible above him.", "There is a cat resting by the dog.", "The animal is a cat."], "image": "val2014/COCO_val2014_000000183757.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 231758, "question_id": "bqFiT3Upw8XRJEncfZymjX", "question": "What made the cows unnatural colors?", "choices": ["spray paint", "hair dye", "crayons", "markers"], "correct_choice_idx": 0, "direct_answers": ["spray paint", "spray paint", "painting", "spraypaint", "paint", "dye", "paint", "hair dye", "paint", "dye"], "difficult_direct_answer": false, "rationales": ["They paint them to designate health status", "The bovine are colored with a coloring agent in a can.", "The cows are sprayed."], "image": "val2014/COCO_val2014_000000231758.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 460243, "question_id": "btMTAUwu9YCqoNJZY8YpBU", "question": "What is the most likely reason his pants are dirty?", "choices": ["pitching", "catching", "batting", "sliding"], "correct_choice_idx": 3, "direct_answers": ["sliding", "dirt", "sliding", "wiping hands", "slide", "dirt", "sliding", "slide", "sliding", "sliding"], "difficult_direct_answer": false, "rationales": ["He might have also fell while trying to do c.", "His leg is dirty which means he was in a lying position.", "The man's pants got dirty because of sliding to a plate or home base earlier in the game."], "image": "val2014/COCO_val2014_000000460243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293658, "question_id": "bu2HwusGm7BYznRvugtgfU", "question": "What company is this person likely to work for?", "choices": ["mcdonalds", "subway", "green giant", "pizza hut"], "correct_choice_idx": 3, "direct_answers": ["little ceasers", "mail", "amazon", "amazon", "pizza restaurant", "pizza restaurant", "pizza", "pizza hut", "ceasers", "little caesars"], "difficult_direct_answer": false, "rationales": ["The person riding the bike is delivering several pizzas to a customer.", "The company is pizza hut.", "The person works at pizza hut."], "image": "train2014/COCO_train2014_000000293658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280687, "question_id": "bu9fz4gMgmGgyudWGGp5MU", "question": "What utensil will the food be eaten with?", "choices": ["chopstick", "knife", "spoon", "fork"], "correct_choice_idx": 0, "direct_answers": ["chopsticks", "chopsticks", "chop sticks", "chopsticks", "chopsticks", "chop sticks", "chopstick", "rice", "chopsticks", "chopstick"], "difficult_direct_answer": false, "rationales": ["There are chopsticks on the plate.", "There are chopsticks.", "The utensils are visible on the plate and are the size, color and shape of answer a."], "image": "val2014/COCO_val2014_000000280687.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110013, "question_id": "bv5zRNb6uLnhLnCwsTdHpY", "question": "What color of cat is sleeping in the little bed?", "choices": ["yellow", "white", "tabby", "calico"], "correct_choice_idx": 3, "direct_answers": ["tortoise shell", "red", "black", "black brown", "black", "brown", "black color", "calico", "black", "pink"], "difficult_direct_answer": false, "rationales": ["That is the type of cat sleeping.", "The color is calico.", "There is a calico cat sleeping in the bed."], "image": "train2014/COCO_train2014_000000110013.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308366, "question_id": "bv7im2HVYLEodvn5FijmXd", "question": "What color is the plastic bowl containing an orange fruit?", "choices": ["purple", "blue", "red", "white"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "orange", "red", "orange", "red", "orange", "orange", "red"], "difficult_direct_answer": false, "rationales": ["The bowl has a similar tint to the orange.", "It's a red bowl.", "The color is red."], "image": "train2014/COCO_train2014_000000308366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549729, "question_id": "bxcATpRUhiQqs5QZkbWmJn", "question": "What is near the far right wall?", "choices": ["television", "egg", "cat", "statue"], "correct_choice_idx": 0, "direct_answers": ["lamp shade", "lamp", "television", "television", "t.v", "tv", "television", "chair", "television", "tv"], "difficult_direct_answer": false, "rationales": ["The tv is on the right side of the room.", "It's a large electronic item with a screen", "The tv is on the far right."], "image": "train2014/COCO_train2014_000000549729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255149, "question_id": "by6rQGQspJLBiPA3ZPzKtN", "question": "What color is the lower element in the glass structure to the righthand side?", "choices": ["purple", "orange", "green", "blue"], "correct_choice_idx": 1, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "orange", "red", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The righthand glass which is almost fully visible in this image is yellow on top and orange on the base.", "The lower element on the right hand side is orange glass.", "The glass structure on the right hand side has an orange base closest to the bottom."], "image": "val2014/COCO_val2014_000000255149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107705, "question_id": "bz7mNFm854ufigokeujPJP", "question": "How many windows are lit on the second story of this building?", "choices": ["three", "four", "one", "two"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "two", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are several windows right below the white round clock.", "There are five windows on the second story. two are not lit.", "There are three windows."], "image": "train2014/COCO_train2014_000000107705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301670, "question_id": "c22iGo8ykR9hZUzsS5VxAJ", "question": "What does the boy have on his head?", "choices": ["crown", "gas mask", "baseball cap", "goggles"], "correct_choice_idx": 2, "direct_answers": ["cap", "hat", "cap", "baseball cap", "cap", "cap", "hat", "cap", "tennis racket", "hat"], "difficult_direct_answer": false, "rationales": ["The boy is playing tennis with a baseball cap on.", "It is a blue hat with a visor front. they use these to play baseball in.", "It is a sunny day. the boy is wearing a brimmed hat to shield from the sun."], "image": "val2014/COCO_val2014_000000301670.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457231, "question_id": "c2eCUUYcZnhdRFXVX2Y4ZV", "question": "What activity is meant for the sink with the round of faucets?", "choices": ["washing dishes", "washing hands", "washing animals", "filling water"], "correct_choice_idx": 1, "direct_answers": ["washing", "washing", "washing", "bathing", "hand washing", "hand washing", "washing", "clean", "washing hands", "washing"], "difficult_direct_answer": false, "rationales": ["The sink is made so multiple people can wash their hands at a time.", "Its is mainly for washing hands as its shape say so.", "The activity is for washing hands."], "image": "train2014/COCO_train2014_000000457231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239337, "question_id": "c3H3Xwpsp4Ndv52Xg8Yim6", "question": "What would this site be described as?", "choices": ["coastal", "snowy", "tundra", "metropolitan"], "correct_choice_idx": 0, "direct_answers": ["coastal", "beach", "beach", "beach", "waterfront", "beach", "beach", "beach", "beach", "peace"], "difficult_direct_answer": false, "rationales": ["The other options aren't along oceans.", "The site has a beach and palm trees. there are only a few buildings.", "You can see where the ocean meets the coast."], "image": "train2014/COCO_train2014_000000239337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166297, "question_id": "c4mczG3A26VQ39pFyoHjgA", "question": "What kind of surfboard is this?", "choices": ["longboard", "funboard", "gun", "fish"], "correct_choice_idx": 1, "direct_answers": ["big", "sea skating", "plastic", "skate", "mp", "regular surfboard", "mp", "water mp", "funboard", "big size"], "difficult_direct_answer": true, "rationales": ["The surfboard is fun.", "The surfboard is for fun.", "The man is carrying a longboard. the board is long in length and is used for surfing."], "image": "train2014/COCO_train2014_000000166297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 77931, "question_id": "c67o6WbP7jZWnnCRvkYXn3", "question": "What's in the round containers on the fridge?", "choices": ["mustard", "pizza", "spices", "hot dogs"], "correct_choice_idx": 2, "direct_answers": ["spices", "design", "ornaments", "decoration", "magnets", "sand", "spices", "magnets", "spices", "spices"], "difficult_direct_answer": false, "rationales": ["The containers have spices.", "Spices are inside the refrigerator magnets.", "This type of flavoring is kept in the kitchen and would be easily accessible on the fridge. you can tell by their varying colors that typical of these food additives."], "image": "train2014/COCO_train2014_000000077931.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94957, "question_id": "c74toTJpc95gWSnoVdifzY", "question": "Where would you find this type of dinner?", "choices": ["cafeteria", "airplane", "cafe", "hospital"], "correct_choice_idx": 1, "direct_answers": ["prison", "airplane", "flight", "japan", "airplane", "cafeteria", "salad", "airplane", "frozen food", "hospital"], "difficult_direct_answer": false, "rationales": ["This would be in a cafeteria.", "You might also find it on a train.", "The food in the tray is the kind of meal you would be served on a long airplane flight."], "image": "train2014/COCO_train2014_000000094957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40844, "question_id": "c8a6aQ7UL5GYQ6LPAY6pJ3", "question": "Which object is most likely to start a fire?", "choices": ["tea pot", "drawer", "bowl", "stove"], "correct_choice_idx": 3, "direct_answers": ["stove", "stove", "stove", "oven", "frying pan", "stove", "stove", "towel", "oven", "stove"], "difficult_direct_answer": false, "rationales": ["The stove is likely to start a fire.", "There is a little girl standing in the kitchen. she is right in front of oven that uses gas for heat.", "The other options don't involve the use of flammable materials. of course, leaving c on a too long might lead to a fire too."], "image": "train2014/COCO_train2014_000000040844.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 391113, "question_id": "c8zKsXsTpsPm2gwB8Eg5o5", "question": "What color is the sheep in the middle of four white sheep and stands on dirt?", "choices": ["black", "brown", "gray", "blue"], "correct_choice_idx": 0, "direct_answers": ["black", "black", "black", "black", "black", "white", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["It's the opposite color as the rest of them", "There is only one black sheep in the middle.", "The color is black."], "image": "train2014/COCO_train2014_000000391113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289901, "question_id": "cAm6HU2CvEH8wjUgc2KeDC", "question": "What is the man wearing?", "choices": ["shin guard", "backpack", "cowboy hat", "helmet"], "correct_choice_idx": 0, "direct_answers": ["knee brace", "knee brace", "shorts shirt", "shorts", "shin guard", "white shorts", "shorts", "shorts", "shorts", "clothes"], "difficult_direct_answer": false, "rationales": ["None of the answers are visibly present.", "This is not really a shin guard, however the other options do not fit as he's not wearing any of them. but this is a knee brace.", "The man is wearing a shin guard that is blue."], "image": "train2014/COCO_train2014_000000289901.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 448627, "question_id": "cCBj5FsDppYR6F3pJJRzbw", "question": "What color is the metallic object hanging on this cat's collar?", "choices": ["silver", "copper", "gold", "brass"], "correct_choice_idx": 0, "direct_answers": ["silver", "grey", "silver", "silver", "white", "silver", "silver", "silver", "blue", "gray"], "difficult_direct_answer": false, "rationales": ["The other options would be yellow or golden. with d, it might also be green if distressed.", "It's silver in color", "A shiny, metallic object hangs from a cats neck."], "image": "val2014/COCO_val2014_000000448627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 373613, "question_id": "cCpGWhmnTgUwPZBs34r9P6", "question": "What kind of fuel does the brown and white animal use?", "choices": ["leaves", "waste", "meat", "bugs"], "correct_choice_idx": 0, "direct_answers": ["walk", "food", "grass", "food", "plants", "leaves", "food", "leaves", "leaves", "plants"], "difficult_direct_answer": false, "rationales": ["The animal visible is a giraffe which is known to eat leaves and vegetation which is would use as fuel to power its body.", "The giraffes need leaves.", "They eat vegetation off the tops of trees"], "image": "train2014/COCO_train2014_000000373613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124067, "question_id": "cDHLLu3Z5UXv6j8ARb64c2", "question": "What animal would most enjoy the food in the bowl?", "choices": ["sheep", "wolf", "lion", "hyena"], "correct_choice_idx": 0, "direct_answers": ["rabbit", "rabbit", "sheep", "rabbit", "chicken", "bsll", "dog", "rabbit", "rabbit", "herbivore"], "difficult_direct_answer": false, "rationales": ["They are herbivores. herbivores eat plants. these are all plant based foods.", "Sheep would enjoy the veggies.", "A sheep would enjoy this as they are herbivores and this is full of vegetables."], "image": "train2014/COCO_train2014_000000124067.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 278338, "question_id": "cDRUkeuEKubGfYJmcVKUnA", "question": "What is the small mirror used for?", "choices": ["magnification", "watching", "shrinking", "recording"], "correct_choice_idx": 0, "direct_answers": ["magnification", "makeup", "check makeup", "makeup", "vanity", "makeup", "face applicants", "make up", "shaving use", "shaving"], "difficult_direct_answer": false, "rationales": ["A side mirror is usually magnified to get a closer look.", "Those special kinds of mirrors are common place in bathrooms.", "Those types of mirrors have two sides and can be used for close or normal reflections."], "image": "val2014/COCO_val2014_000000278338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417079, "question_id": "cEhitTA8bVnUXeJPDfgJHL", "question": "Which part of the line is the little elephant following in?", "choices": ["middle", "front", "behind", "back"], "correct_choice_idx": 0, "direct_answers": ["tail", "middle", "second", "second", "father", "middle", "middle", "middle", "middle", "front"], "difficult_direct_answer": false, "rationales": ["There are three elephants in a line visible and the smallest one is between the others making it in position answer a.", "A baby elephant is sandwiched between mom and and in the grass.", "There is an elephant in front of and behind the small elephant and it is in the center."], "image": "train2014/COCO_train2014_000000417079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96998, "question_id": "cG6bPebNKMxrjPzwbhDDoX", "question": "What color are the towels hanging on the bars on either wall of the bathroom?", "choices": ["white", "blue", "gray", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "dark blue", "black", "brown", "black", "black", "black", "black", "dark blue"], "difficult_direct_answer": false, "rationales": ["The towels on the racks are all black.", "One can see the dark color of the towels.", "The color is black."], "image": "val2014/COCO_val2014_000000096998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152857, "question_id": "cGNWMUG4kY7RrsoPcbUfsY", "question": "What are the elephants near?", "choices": ["grass", "dogs", "apples", "pine cones"], "correct_choice_idx": 0, "direct_answers": ["water", "water", "water", "water", "river", "grass", "car", "water", "swamp", "ten"], "difficult_direct_answer": false, "rationales": ["Grass is green. there is green grass near the water where the elephants are.", "They're by grass.", "You can tell by the green color and setting as to what the animals are near."], "image": "train2014/COCO_train2014_000000152857.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144747, "question_id": "cHK3Kp3Bygt4Bk8yvygdRB", "question": "What kind of ice cream is on the top of the cinnamon treat?", "choices": ["vanilla", "cookie", "chocolate", "birthday cake"], "correct_choice_idx": 0, "direct_answers": ["vennila", "vanilla", "vanilla", "vanilla", "white", "vanilla", "vanilla", "vanilla", "white", "vanilla"], "difficult_direct_answer": false, "rationales": ["Vanilla is often served with desserts.", "The white plain appearance of the ice cream shown here tells us it's vanilla.", "The ice cream is vanilla."], "image": "train2014/COCO_train2014_000000144747.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102672, "question_id": "cHv2ECk9q2SjQaFE9MuFw6", "question": "What is on the horse in the foreground?", "choices": ["hat", "baby", "saddle", "knight"], "correct_choice_idx": 2, "direct_answers": ["saddle", "saddle", "saddle", "saddle", "saddle", "seat", "saddle", "saddle", "saddle", "gray"], "difficult_direct_answer": false, "rationales": ["This is so people can ride comfortably", "The horse in the foreground has a black saddle.", "The horse has a saddle."], "image": "val2014/COCO_val2014_000000102672.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 135579, "question_id": "cJGZeYfAPtsvxSiYRBVsfv", "question": "What is a healthy item on the plate?", "choices": ["chicken leg", "lemon", "guava", "carrot"], "correct_choice_idx": 3, "direct_answers": ["vegetable", "vegetables", "carrots", "carrots", "carrots", "carrot", "vegetables", "carrots", "green beans", "meat"], "difficult_direct_answer": false, "rationales": ["The carrots on the plate are the healthiest food item on the plate because it is low in fat.", "The carrots are healthy.", "There are some orange carrots sliced up on the plate."], "image": "val2014/COCO_val2014_000000135579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208752, "question_id": "cKc97YeY586CpnSycg2UD2", "question": "What continent is this taking place on?", "choices": ["asia", "australia", "north america", "europe"], "correct_choice_idx": 3, "direct_answers": ["europe", "europe", "europe", "europe", "europe", "europe", "france", "france", "europe", "europe"], "difficult_direct_answer": false, "rationales": ["The green banner behind the tennis player says france on it.", "It is french language on the signs", "The ads on the banner are in french and france is in europe."], "image": "train2014/COCO_train2014_000000208752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123967, "question_id": "cKoN7DUTPaeotSLNq2VsoE", "question": "How many of the glasses are filled with wine on the table?", "choices": ["five", "two", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Of three glasses, only this number has wine in them.", "The glasses are clearly visible and countable and the ones containing wine can be identified based on the color of the visible liquid.", "There are 2."], "image": "train2014/COCO_train2014_000000123967.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 125142, "question_id": "cQZQq4AVZSXK5tZxWUbboz", "question": "Which food in this image is highest in omega 3 fats?", "choices": ["broccoli", "pineapple", "rice", "salmon"], "correct_choice_idx": 3, "direct_answers": ["salmon", "salmon", "salmon", "salmon", "fish", "fish", "salmon", "fish", "salmon", "salmon"], "difficult_direct_answer": false, "rationales": ["The salmon has omegas.", "Fish is commonly known to be high in omega 3 fats. it is identifiable by its texture and coloring.", "This is a type of fish which is known to be high in this fat. you can tell by its central bone and flaky layers that look like a tree ring"], "image": "train2014/COCO_train2014_000000125142.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84887, "question_id": "cQniAisgPNQdR9UPgsmSkC", "question": "How many cows are sniffing around on the beach front?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 3, "direct_answers": ["four", "one", "four", "four", "three", "three", "four", "four", "three", "four"], "difficult_direct_answer": false, "rationales": ["One cow is in the center of the image, two more overlap to the left and a final one is in the far left of the image.", "There are four cows in the photo.", "There are four cows."], "image": "train2014/COCO_train2014_000000084887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143224, "question_id": "cS8WpTkSEioYuD9473ptrN", "question": "Who would work in a setting like this?", "choices": ["pilot", "clown", "chef", "firefighter"], "correct_choice_idx": 0, "direct_answers": ["pilot", "pilot", "luggage handlers", "pilot", "pilot", "pilot", "pilot", "pilot", "pilot", "pilot"], "difficult_direct_answer": false, "rationales": ["There is a plane at an airport. people are needed to fly the plane.", "The setting would have a pilot.", "The airplane would be flown by a pilot that works in the airport."], "image": "val2014/COCO_val2014_000000143224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 412184, "question_id": "cTHTqqD3fHMMwgVyMcqN52", "question": "What is the main reason all of the planes are on the ground?", "choices": ["dawn", "clouds", "dusk", "rain"], "correct_choice_idx": 2, "direct_answers": ["dusk", "weather", "grounded", "landing", "loading passengers", "night", "landing", "boarding", "loading passengers", "parked"], "difficult_direct_answer": false, "rationales": ["The airplane is on the ground because the sun is setting and it is too dark to see.", "The sky is dark and the sun is going down.", "These are usually planes used during the day because they don't have as much electrical equipment for guidance"], "image": "val2014/COCO_val2014_000000412184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506910, "question_id": "cTYZ8A8Nch7zqEigKS8vm4", "question": "What color are the little stars on top of the dome building at the church?", "choices": ["black", "white", "gold", "blue"], "correct_choice_idx": 2, "direct_answers": ["gold", "gold", "gold", "gold", "gold", "yellow", "silver", "gold", "gold", "gold"], "difficult_direct_answer": false, "rationales": ["The color is gold.", "The little stars on the top of the dome of the church are painted gold.", "They are visible as a yellow color which is consistent with this type of material. you can see their color in contrast to the blue sky."], "image": "train2014/COCO_train2014_000000506910.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311744, "question_id": "cTbekK8fRMLBRadQgYDftx", "question": "What phone application does the little rabbit on the bottom left side of the screen come from?", "choices": ["wechat", "snapchat", "line", "instagram"], "correct_choice_idx": 2, "direct_answers": ["instagram", "line", "instagram", "ink", "gif", "app", "line", "bunny", "na", "android"], "difficult_direct_answer": false, "rationales": ["He comes from a popular app.", "The app is line.", "The cute rabbit is from the line app."], "image": "val2014/COCO_val2014_000000311744.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398924, "question_id": "cTw9JZd63sG7Qy2EvRgag5", "question": "What common eating utensil is missing from the table?", "choices": ["spoon", "knife", "chopsticks", "fork"], "correct_choice_idx": 0, "direct_answers": ["spoon", "spoon", "spoon", "spoon", "another plate", "spoon", "spoon", "spoon", "milk", "spoon"], "difficult_direct_answer": false, "rationales": ["The utensil is a spoon.", "The utensil is a spoon.", "There is not a spoon on the table."], "image": "train2014/COCO_train2014_000000398924.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375815, "question_id": "cUPFwiHhWWTKD6j2YfG5Jc", "question": "What kind of meat is sat to the left of the pizza?", "choices": ["chicken", "roast", "ground beef", "duck"], "correct_choice_idx": 2, "direct_answers": ["burger meat", "ground beef", "ground beef", "beef", "beef", "ice cream", "ground beef", "beef", "beef", "beef"], "difficult_direct_answer": false, "rationales": ["Red, ground meat is in a bowl on a table.", "The meat is chopped up and reformed.", "The meat identified in option a matches the item in the photo."], "image": "train2014/COCO_train2014_000000375815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 323820, "question_id": "cVBB9ifxtQn7cFPpqinT5o", "question": "What food here comes from outside a farm?", "choices": ["corn", "venison", "burger", "fish"], "correct_choice_idx": 3, "direct_answers": ["fish", "chicken", "meat", "corn", "chicken", "corn", "fish", "fish", "corn", "plant"], "difficult_direct_answer": false, "rationales": ["The corn and broccoli are both grown on a farm. fish come from the sea.", "These are found in waterways", "This comes from water and is an animal"], "image": "train2014/COCO_train2014_000000323820.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328162, "question_id": "cXhzhfKR5g2CFa4r89QjNo", "question": "What song relates to this scene?", "choices": ["surfin usa", "slam", "running", "basketball"], "correct_choice_idx": 0, "direct_answers": ["surfin' u.s.a", "surfin", "surfin usa", "surfin usa", "feel free", "surfing usa", "beach", "beach boys", "surfin' u.s.a", "love"], "difficult_direct_answer": false, "rationales": ["The man in the wetsuit is holding a surfboard. the best song is surfin' usa.", "The beach boys have a song called \"surfin' usa\"", "The song relates to the beach."], "image": "train2014/COCO_train2014_000000328162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70239, "question_id": "cY7hGQiZ7C98rPsPS5hynM", "question": "What is next to the chair?", "choices": ["pumpkin", "apple pie", "tablecloth", "baby"], "correct_choice_idx": 2, "direct_answers": ["table", "trash can", "table", "table", "table", "table", "table", "table", "table", "tablecloth"], "difficult_direct_answer": false, "rationales": ["The cloth is draped over and down the table.", "The tablecloth is next to the chair.", "A tablecloth is next to the chair. the tablecloth covers the actual table."], "image": "train2014/COCO_train2014_000000070239.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348794, "question_id": "cb2DYpjdKSzAQ9QX6BgbRj", "question": "How many elephants are standing in the road with people on their backs?", "choices": ["four", "three", "five", "six"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "two", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["You can see three elephants and they are all carrying passengers.", "There are three elephants.", "A group of large animals with trunks are walking in the street with people on their backs."], "image": "train2014/COCO_train2014_000000348794.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331753, "question_id": "cb4HtavbYGE5reiCcp7BXs", "question": "What is a slang term for this item?", "choices": ["potty", "goose", "bean", "banana"], "correct_choice_idx": 0, "direct_answers": ["can", "john", "loo", "potty", "can", "potty", "pot", "potty", "toilet", "throne"], "difficult_direct_answer": false, "rationales": ["Kids are asked, \"do you need to go potty?\" when they are growing up.", "Potty is a slang for toilet.", "The store shelf shows several toilets on display which are also known as the potty."], "image": "val2014/COCO_val2014_000000331753.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457922, "question_id": "ccaxFXmGFqJxtzmvnFufpe", "question": "What color is the umpire's helmet who is standing with his hand on the catcher's back?", "choices": ["black", "red", "green", "blue"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The one with his hand on the other player's back is wearing scarlet head ware.", "The umpire is wearing a red helmet.", "The umpire's helmet does not match his black shirt or the green grass. his helmet also is not blue."], "image": "train2014/COCO_train2014_000000457922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435803, "question_id": "cfUnaJB6kcz6qHvEXDNA9G", "question": "What color is the front of the tail fin and the nosecone of this aircraft?", "choices": ["white", "blue", "red", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The tail fin and nosecone are both yellow.", "The color is yellow.", "The tail fin of the airplane is bright yellow and so is the nose of the plane."], "image": "train2014/COCO_train2014_000000435803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431812, "question_id": "cfiqTbJwBcv8yFFJs57GA5", "question": "How many ways are there on this stop sign?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["two", "three", "three", "two", "three", "one", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["The sign clearly states how many ways there are to stop.", "There is writing underneath the stop sign that details how many ways there are.", "The sign says 3 way."], "image": "train2014/COCO_train2014_000000431812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362966, "question_id": "cgSdp5SpwFxWgKT4ERjPQk", "question": "What is the person holding in their left hand?", "choices": ["racquet", "sword", "chopstick", "fork"], "correct_choice_idx": 3, "direct_answers": ["fork", "fork", "fork", "fork", "fork", "fork", "fork", "fork", "fork", "fork"], "difficult_direct_answer": false, "rationales": ["One can make out the tines of the eating instrument.", "The person eating the meal is holding a fork in their left hand.", "They have a fork in their hand"], "image": "train2014/COCO_train2014_000000362966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496696, "question_id": "cgrZCsBYSK6QN3DJMztKqN", "question": "What is on the dirt road?", "choices": ["baseball players", "eggs", "motorcycles", "animals"], "correct_choice_idx": 3, "direct_answers": ["cows", "cow", "cows", "cows", "cows", "animals", "cows", "cows", "animal", "cows"], "difficult_direct_answer": false, "rationales": ["The animals are on the road.", "There is a herd of cattle", "There are no vehicles, athletes, or eggs on the road."], "image": "train2014/COCO_train2014_000000496696.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59614, "question_id": "chFKhX32rsMvFLSAgASWLc", "question": "What kind of pasta is sitting in the strainer alongside the broccoli?", "choices": ["spaghetti", "bowtie", "spiral", "elbow"], "correct_choice_idx": 2, "direct_answers": ["noodles", "bowl", "spiral", "spiral pasta", "corkscrew", "rotini", "spiral", "spirales", "spring roll", "fusilli"], "difficult_direct_answer": true, "rationales": ["The pasta is spiral.", "There is a cavatappi pasta on the plate.", "The pasta on the plate with the broccoli is in a spiral shape."], "image": "val2014/COCO_val2014_000000059614.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294749, "question_id": "chStz6HUyAwndGqbaTDVaZ", "question": "What type of vehicle is the man on the left most likely riding in?", "choices": ["jeep", "sedan", "tour bus", "convertible"], "correct_choice_idx": 0, "direct_answers": ["truck", "jeep", "jeep", "jeep", "car", "side-by-side", "jeep", "bus", "jeep", "jeep"], "difficult_direct_answer": false, "rationales": ["One can see the bars of the vehicle and it doesn't appear to have a top over the riders which is common among these safari vehicles.", "The vehicle appears to be on uneven ground which would most likely be driven on in a jeep.", "It might be a tour vehicle, but it's like a jeep given the environment and open nature of the cab."], "image": "train2014/COCO_train2014_000000294749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279490, "question_id": "chu5wLbk8iFDzFTnXBVpkk", "question": "What item here would be most useful in an emergency?", "choices": ["cellphone", "laptop", "samurai sword", "walkie talkie"], "correct_choice_idx": 0, "direct_answers": ["phone", "cell phone", "cellphone", "knife", "phone", "kek", "stop", "phone", "phone", "phone"], "difficult_direct_answer": false, "rationales": ["The phone would be useful.", "A cellphone can be used to call 9-1-1. dialing that number will get you help in an emergency.", "The item is the phone."], "image": "val2014/COCO_val2014_000000279490.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 348368, "question_id": "cmJjcLwmiEsBurw5R5xR7J", "question": "What is near the apex of the tower?", "choices": ["clock", "serpent", "eagle", "statue"], "correct_choice_idx": 0, "direct_answers": ["church", "clock", "clock", "clock", "clock", "clock", "clock", "clocks", "cross", "clock"], "difficult_direct_answer": false, "rationales": ["The round feature with roman numerals around the edge is a clock.", "The tower has a round device on more than one side that is used to tell time.", "One can see the timepiece located on the front of the clock."], "image": "train2014/COCO_train2014_000000348368.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 566897, "question_id": "cnjYQA5CpDnZqvHstQBDui", "question": "What type of shirt is the man wearing?", "choices": ["tshirt", "bowling shirt", "jersey", "polo shirt"], "correct_choice_idx": 1, "direct_answers": ["euro rail", "collared", "bowling shirt", "white", "polo shirt", "jersey", "white", "work shirt", "short-sleeve", "soccer shirt"], "difficult_direct_answer": true, "rationales": ["His shirt has typed word in it and has different color.", "A man is sitting and facing the camera. he is smoking in one hand and holding a pink stuffy in the other with a buttoned shirt that resembles what one would wear to hit pins.", "This is a type of shirt that people on a bowling league would wear."], "image": "train2014/COCO_train2014_000000566897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 1966, "question_id": "cqVsByRC7HpCvts7tmWDQs", "question": "What is the job of the girl who is knelt down in the front of the picture?", "choices": ["collect ball", "spectator", "referee", "camera crew"], "correct_choice_idx": 0, "direct_answers": ["ball girl", "ballgirl", "referee", "collect ball", "ball girl", "ref", "referee", "ball girl", "play", "score"], "difficult_direct_answer": false, "rationales": ["She collects the balls.", "The girl collects balls.", "She looks like she's about to run out on the court or prepping to do so, which means she is in an active court role."], "image": "train2014/COCO_train2014_000000001966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547594, "question_id": "csHrfr6EHsDymKZcRWh68E", "question": "How many houses are visible above the train with black roofs?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "four", "one", "three", "four", "four", "four", "four", "three"], "difficult_direct_answer": false, "rationales": ["A train moves through town with a row of houses with dark roofs behind.", "The green house has this and the rest are brown", "There are three houses on the right of the train with black roofs."], "image": "train2014/COCO_train2014_000000547594.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96737, "question_id": "csUXksyY8TkLd7ics5c4RZ", "question": "What type of weather are they hoping for?", "choices": ["snowy", "rainy", "sunny", "windy"], "correct_choice_idx": 3, "direct_answers": ["wind", "windy", "windy", "winter", "windy", "windy", "windy", "sunny", "hold", "windy"], "difficult_direct_answer": false, "rationales": ["People are holding kites and wind is needed to fly kites.", "You need wind to fly a kite", "The man in the foreground is holding a kite which is commonly known to need answer a in order to work as intended."], "image": "train2014/COCO_train2014_000000096737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 436605, "question_id": "csXgYfE8uCHbamf3cF8vKF", "question": "The bottom float is the same color as what?", "choices": ["cat", "cow", "bee", "fox"], "correct_choice_idx": 2, "direct_answers": ["nothing", "lizard", "bee", "bee", "bee", "bee", "bee", "bee", "grass", "bumble bee"], "difficult_direct_answer": false, "rationales": ["It's yellow and black like a bee", "The bottom float is black and yellow.", "A kite is black and yellow."], "image": "val2014/COCO_val2014_000000436605.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439439, "question_id": "cukTJeoQaVTMv6UtG8EjDr", "question": "What color is the interior side of the vintage refrigerator?", "choices": ["blue", "turquoise", "black", "white"], "correct_choice_idx": 1, "direct_answers": ["turquoise", "green", "green", "green", "green", "dodd", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["It is a blue color.", "The inside of the fridge is a color similar to a tree.", "It has a greenish blue color"], "image": "train2014/COCO_train2014_000000439439.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248252, "question_id": "cvLhia5ignu7LzKtsMqhE4", "question": "What is on the boat?", "choices": ["umbrella", "elephant", "jaguar", "bananas"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "person", "person", "umbrella", "water", "person", "umbrella", "boat rider", "person", "umbrella"], "difficult_direct_answer": false, "rationales": ["A red and white shade is over a person in the boat.", "The boat has an umbrella.", "The device has a dome shape cover to protect the person from the rays of the sun."], "image": "val2014/COCO_val2014_000000248252.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426261, "question_id": "cvmDrAkPjU6UBhtqsDR4p5", "question": "What shape is the sign in?", "choices": ["octagon", "pentagon", "hexagon", "black car"], "correct_choice_idx": 0, "direct_answers": ["octagon", "stop", "octagan", "octagon", "round", "octagon", "diamond", "octagon", "octagon", "octagon"], "difficult_direct_answer": false, "rationales": ["An upside stop sign is on the street with 8 sides.", "The meaning of this eight-sided structure is known world wide.", "The sign has eight sides, which matches the unit identified in option a."], "image": "train2014/COCO_train2014_000000426261.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 551590, "question_id": "cvoK6bNzxDbHU88EY2SkFc", "question": "What is in the street?", "choices": ["cows", "cats", "cars", "tank"], "correct_choice_idx": 2, "direct_answers": ["cars", "cars", "cars", "cars", "vehicles", "cars", "cars", "cars", "cars", "vehicles"], "difficult_direct_answer": false, "rationales": ["There are cars in the street.", "Cars often are on the streets.", "Lots of vehicles"], "image": "train2014/COCO_train2014_000000551590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 142002, "question_id": "cwHcGPRt2R4SEDgCxkxfUf", "question": "What design is under the wing?", "choices": ["half moon", "cross", "gorgon", "star"], "correct_choice_idx": 3, "direct_answers": ["star", "star", "star", "pole", "skateboard", "star", "star", "star", "checked", "star"], "difficult_direct_answer": false, "rationales": ["There is a roundel under the wing. it contains a five-pointed shape.", "There is a circle painted on the bottom of the wing with a white star inside it.", "The underside of the wing is visible and the design is identifiable based on the outline."], "image": "train2014/COCO_train2014_000000142002.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 526196, "question_id": "cx3x8oJfc3fdkdXXSFD4C6", "question": "What is a potential hazard for the man?", "choices": ["concussion", "broken leg", "drowning", "cut finger"], "correct_choice_idx": 3, "direct_answers": ["knife", "knife", "cut finger", "cut", "cut himself", "cut finger", "knife", "cut himself", "knife", "cutting himself"], "difficult_direct_answer": false, "rationales": ["The man could cut himself.", "The man is slicing an apple. he could accidentally slice himself too.", "He could easily hurt himself."], "image": "train2014/COCO_train2014_000000526196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265211, "question_id": "d3Ay2HK3ypdwqzZtoKxxC7", "question": "What color is the background of the sign with the muscle man on it?", "choices": ["green", "white", "orange", "red"], "correct_choice_idx": 2, "direct_answers": ["yellow", "cross", "crossing", "orange", "red", "orange", "brown", "red", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The background is orange colored.", "A construction sign is on a pole near a road.", "There is an orange background on the sign next to the stop area."], "image": "train2014/COCO_train2014_000000265211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253123, "question_id": "d3QvdV8NRi4CtKxjpkcYwx", "question": "What food here is a good source of fiber?", "choices": ["oats", "hot dog", "beans", "fries"], "correct_choice_idx": 2, "direct_answers": ["bun", "beans", "baked beans", "beans", "nuts", "beans", "beans", "beans", "beans", "beans"], "difficult_direct_answer": false, "rationales": ["There is a pile of beans good in fiber.", "The beans are good for fiber.", "The food is beans."], "image": "train2014/COCO_train2014_000000253123.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185698, "question_id": "d3r2WbWrtfRWXPF2o99pjq", "question": "What are the red colored cakes covered in on the outside?", "choices": ["coconut", "snow", "powdered sugar", "whipped cream"], "correct_choice_idx": 0, "direct_answers": ["coconut", "powdered sugar", "coconut", "cream", "brown", "icing", "cream", "crumbs", "coconut", "coconut"], "difficult_direct_answer": false, "rationales": ["Coconut covers the outside of the cakes and is shredded so looks like flecks.", "The red cakes have coconut.", "Some kind of coconut cake."], "image": "val2014/COCO_val2014_000000185698.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 138008, "question_id": "d4iweRhzXMTibvMfPn3Cs3", "question": "What is next to the fence?", "choices": ["egg", "cow", "pumpkin", "motor bike"], "correct_choice_idx": 3, "direct_answers": ["vespa", "scooter", "motorcycle", "scooter", "motor bike", "scooter", "scooter", "motorcycle", "scooter", "scooter"], "difficult_direct_answer": false, "rationales": ["The motorbike is near the fence.", "A motor bike is there.", "A scooter with two wheels is parked beside the fence."], "image": "train2014/COCO_train2014_000000138008.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89949, "question_id": "d4oeR9WDuJFrfosrT3FNey", "question": "How many little sheep are grazing among the big sheep?", "choices": ["six", "five", "three", "four"], "correct_choice_idx": 2, "direct_answers": ["five", "one", "two", "two", "three", "two", "four", "three", "five", "three"], "difficult_direct_answer": false, "rationales": ["There are three sheep with their heads on the ground eating in the background.", "None of them are actually shown grazing and it looks like only two.", "There are three sheep in the background that are grazing on the grass"], "image": "train2014/COCO_train2014_000000089949.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199346, "question_id": "d4sTPFfZVcfeBRB48zxkyk", "question": "What is the human statue on top of?", "choices": ["elephant", "horse", "trampoline", "pogo stick"], "correct_choice_idx": 1, "direct_answers": ["horse", "horse", "horse", "horse", "warrior", "horse", "horse", "plinth", "warrior", "horse"], "difficult_direct_answer": false, "rationales": ["A man dressed in armor rides an animal with a long snout and a mane in the back.", "The man is riding a horse.", "It is an animal that people ride and common to see with statues"], "image": "val2014/COCO_val2014_000000199346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174690, "question_id": "d5Mft9B5cd5rjT8YVLageC", "question": "What does the square telescreen contain on the subway station?", "choices": ["map", "directions", "words", "clock"], "correct_choice_idx": 3, "direct_answers": ["two", "arrival/departure information", "date", "train information", "travel times", "clock", "train arrival", "clock", "clock", "number"], "difficult_direct_answer": false, "rationales": ["The square television shows hands and numbers. it displays the time.", "There are multiple signs, but the only square one contains answer a based on the design and features common to answer a.", "The telescreens are clearly visible and only one is a square. the square telescreen has a clock face based on the defining features."], "image": "val2014/COCO_val2014_000000174690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557875, "question_id": "d5oxRq5PNVc9cFrycvr9td", "question": "Who is the current chief of this organization?", "choices": ["gerhard weber", "miguel mullenbach", "ernst wagner", "leonhard tietz"], "correct_choice_idx": 1, "direct_answers": ["person", "miguel mullenbach", "miguel mullenbach", "galeria", "na", "president", "john smith", "na", "john roberts", "galleria"], "difficult_direct_answer": false, "rationales": ["Miguel mullenbach is the ceo of the galeria organization. he is an important person.", "A simple google search can locate miguel mullenbach as the ceo of galeria.", "The current chief of the organization is miguel mullenbach."], "image": "train2014/COCO_train2014_000000557875.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266168, "question_id": "d8n6AcezFkxFFte66bfura", "question": "What is next to the tracks?", "choices": ["walking couple", "dog", "tall lights", "cat"], "correct_choice_idx": 2, "direct_answers": ["train", "tall lights", "train", "grass", "lights", "buildings", "train", "road", "lights", "train"], "difficult_direct_answer": false, "rationales": ["There are poles overhead with light fixtures", "There are lights lit up on the side of the tracks.", "The tracks have tall lights nearby."], "image": "train2014/COCO_train2014_000000266168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 235388, "question_id": "d94AQw7LSMD8WnisPExWio", "question": "What is being used to take the photo?", "choices": ["camera", "flip phone", "smart phone", "dash cam"], "correct_choice_idx": 1, "direct_answers": ["flip phone", "cell phone", "travelling", "camera", "phone", "camera", "mirror", "camera", "mirror", "phone"], "difficult_direct_answer": false, "rationales": ["The reflection of the photo-taking tool is seen in the car mirror. based on the features, it is identifiable as answer a.", "The phone is a flip one.", "The flip phone is being used."], "image": "train2014/COCO_train2014_000000235388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 459786, "question_id": "dAkKG7iRjJ2sai8EUmvS6p", "question": "How do you know this is a commercial bathroom?", "choices": ["trash can", "multiple showers", "signage", "many sink"], "correct_choice_idx": 3, "direct_answers": ["beautiful", "rental bathroom", "many sinks", "sinks", "multiple sinks", "sinks", "yes this", "six sinks", "many sink", "large"], "difficult_direct_answer": true, "rationales": ["It has several places for people to wash their hands", "There are a lot of sinks.", "There are many sinks in the bathroom that can be used simultaneously that commercial bathrooms use so people don't have to wait."], "image": "val2014/COCO_val2014_000000459786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493791, "question_id": "dDJqDa5e88x5a7D4B8jnnQ", "question": "What is near the chair?", "choices": ["door", "elephant", "cat", "baby"], "correct_choice_idx": 0, "direct_answers": ["bookcase", "table", "door", "shelf", "table", "shelf", "door", "book shelves", "shelves", "person"], "difficult_direct_answer": false, "rationales": ["The front doors of a building are open with rows of chairs inside.", "There is a door.", "There is a glass door near the chair that leads into a room."], "image": "train2014/COCO_train2014_000000493791.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517835, "question_id": "dEJ25fZydgQjv342C9hSc9", "question": "What is on the floor?", "choices": ["cow", "grate", "egg sandwich", "pizza"], "correct_choice_idx": 1, "direct_answers": ["pavement", "drain", "pipe", "grate", "grate", "sewer", "fire hydrant", "storm drain", "spigot", "water line"], "difficult_direct_answer": true, "rationales": ["There is a metal grate on the floor of the street near the pipe.", "A long horizontal metal square is in the middle of the road. it helps with water drainage from the road down in sewers.", "A metal, slotted object is in the street. drains are used in streets to remove excess water from rain."], "image": "train2014/COCO_train2014_000000517835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43530, "question_id": "dEotFg5NxKrtzGVZCj4JDr", "question": "What kind of water body are these birds gathered in?", "choices": ["lake", "stream", "river", "ocean"], "correct_choice_idx": 3, "direct_answers": ["ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean", "ocean"], "difficult_direct_answer": false, "rationales": ["There is no land in sight so it's a large body of water", "The water is an ocean.", "It appears to be an ocean because you can see the shoreline in the distant background."], "image": "val2014/COCO_val2014_000000043530.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493682, "question_id": "dF3oH22CcjPR23jA4PVcdH", "question": "What building does the diesel sign most likely foreshadow?", "choices": ["grocery store", "school", "gas station", "train station"], "correct_choice_idx": 2, "direct_answers": ["gas station", "gas station", "gas station", "gas station", "gas station", "gas station", "stop", "gas", "petrol station", "gas station"], "difficult_direct_answer": false, "rationales": ["The diesel sign indicates a gas station is nearby.", "Generally that type of fossil fuel is found at a gas station.", "There is a gas station."], "image": "val2014/COCO_val2014_000000493682.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 192720, "question_id": "dFCyuRGc2z9m8V3VadaMK6", "question": "What is the most likely location of this station?", "choices": ["asia", "europe", "south america", "africa"], "correct_choice_idx": 1, "direct_answers": ["europe", "subway", "france", "europe", "city", "city", "railway station", "underground", "station", "underground"], "difficult_direct_answer": false, "rationales": ["The language on the sign is the language spoken in the region identified in option a.", "There is writing visible and readable on the yellow warning sign that is written in german. the german language most commonly appears in answer a.", "It appears there is french language written throughout the station. the language combined with the presence of white people places this scene most likely in answer a."], "image": "train2014/COCO_train2014_000000192720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52036, "question_id": "dFPgGHxkwwfWWxnhmxZVpk", "question": "What danger is the child in?", "choices": ["fire hazard", "slip hazard", "drowning hazard", "choking hazard"], "correct_choice_idx": 3, "direct_answers": ["poison", "chemical", "choking hazard", "shampoo", "sick", "poison", "eating lotion", "poisoning", "eating chemical", "dyeing"], "difficult_direct_answer": true, "rationales": ["The child could choke on the lid.", "The child could choke.", "A toddler has a tube of something in its mouth. they must be careful not to get it lodge in mouth."], "image": "train2014/COCO_train2014_000000052036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 395241, "question_id": "dFRDxkuZAHXNweDzcAJetE", "question": "What era do the woman on the right's pants look like they are from?", "choices": ["1970s", "1200s", "800s", "1920s"], "correct_choice_idx": 0, "direct_answers": ["1970s", "1960", "1970s", "seventies", "70's", "tree", "road", "jeans", "speaking", "seventies"], "difficult_direct_answer": false, "rationales": ["In the 70s a flare pant was popular.", "Bell bottoms were widely worn in the era of disco.", "They look like hippie jeans."], "image": "train2014/COCO_train2014_000000395241.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 23489, "question_id": "dFz5eRABDAdCmykDHfMxD7", "question": "What is the color on the top of the boat going down the city canal?", "choices": ["red", "blue", "brown", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "car", "green", "green", "green", "red", "red", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["It's similar to the color of grass", "The top of the boat is colored like a tree.", "A boat that is brown on the bottom and green on top is in the water."], "image": "val2014/COCO_val2014_000000023489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533502, "question_id": "dJ6iFgSVV2tXi9BXH6myPN", "question": "What are the curtains on the window called?", "choices": ["cafe curtains", "blinds", "valance", "drapes"], "correct_choice_idx": 2, "direct_answers": ["blinds", "valance", "valance", "valences", "kitchen curtain", "closer", "shades", "ruffled", "curtain", "cover"], "difficult_direct_answer": true, "rationales": ["Small curtains are at the top of a window. it gives it a nice formal decoration.", "There are small curtains that are used to adorn the top of windows.", "There are valance curtains."], "image": "train2014/COCO_train2014_000000533502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511285, "question_id": "dJmSQM2einVrQCfRMsPgSh", "question": "How many knobs can be seen on the front of the oven?", "choices": ["ten", "12", "four", "seven"], "correct_choice_idx": 3, "direct_answers": ["six", "six", "eight", "seven", "six", "one", "eight", "six", "seven", "six"], "difficult_direct_answer": false, "rationales": ["The front of the oven has seven silver knobs on the top of it.", "There are seven knobs visible in the front of the oven for heat.", "There are seven knobs."], "image": "train2014/COCO_train2014_000000511285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530546, "question_id": "dKFh7ZP5NRSxwPYhbocjPt", "question": "How many dogs are held on the leashes?", "choices": ["three", "two", "one", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two dogs being held on leashes with the group of people.", "There are two dogs held by leashes.", "There are two different dogs that are on leashes."], "image": "train2014/COCO_train2014_000000530546.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357004, "question_id": "dKHAXrNgNJydamJav9yyqq", "question": "What is the bear sitting in?", "choices": ["swing", "basket", "racecar", "box"], "correct_choice_idx": 2, "direct_answers": ["car", "car", "car", "race car", "car", "racecar", "race car", "car", "car", "car"], "difficult_direct_answer": false, "rationales": ["The stuffed animal is sitting behind the wheel of a toy car with a number on it. racecars generally have numbers on them.", "He is in a racecar.", "The bear is in a racecar."], "image": "train2014/COCO_train2014_000000357004.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215231, "question_id": "dKqAm7rkNZoFpNfQteVsXk", "question": "What are a group of these food items called?", "choices": ["sack", "clowder", "bunch", "ear"], "correct_choice_idx": 2, "direct_answers": ["nothing", "banana", "bananas", "banana", "bananas", "bunch", "bananas", "bunch", "bunch", "banana"], "difficult_direct_answer": false, "rationales": ["The group is a bunch.", "They are bananas", "A group of bananas is called a bunch."], "image": "train2014/COCO_train2014_000000215231.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206751, "question_id": "dLQCukfxbemPme36VPW4je", "question": "What utensil is absent?", "choices": ["chopsticks", "knife", "spoon", "fork"], "correct_choice_idx": 0, "direct_answers": ["steak knife", "sporks", "cup", "teaspoon", "spoon", "chopsticks", "knife", "na", "spork", "spoon"], "difficult_direct_answer": true, "rationales": ["(a) chopsticks. there are forks and a spoon on the table, and a fork and a knife on the small plate by the wall.", "The utensil is chopsticks.", "All the traditional western utensils can be seen in the image."], "image": "val2014/COCO_val2014_000000206751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520637, "question_id": "dMdLY2GTa8Nx3dVBFRLMgz", "question": "What color outfit is the child wearing?", "choices": ["red", "pink", "blue", "green"], "correct_choice_idx": 0, "direct_answers": ["orange white", "red", "red white", "red white", "red striped", "red white", "white orange", "red white", "red", "red white"], "difficult_direct_answer": false, "rationales": ["The child's outfit is not blue, green, or pink.", "The outfit is not blue, green, or pink.", "The shirt has red on."], "image": "train2014/COCO_train2014_000000520637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350326, "question_id": "dPgq5vyttvz3WoFLu5bwWs", "question": "What is next to the girl?", "choices": ["pumpkin", "luggage", "apple", "cow"], "correct_choice_idx": 1, "direct_answers": ["doll", "beautiful accommodating", "suitcase", "showcase", "luggage", "bush", "bushes", "bushes", "suitcase", "box"], "difficult_direct_answer": false, "rationales": ["The girl sitting on the wall has luggage next to her such as a brown suitcase.", "There is a hard cover bag with latches", "This looks like cosplay and even makes sense if she's an anime character."], "image": "train2014/COCO_train2014_000000350326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162230, "question_id": "dQL5GQz5vpLvd5tt8P4aYw", "question": "What kind of fruits are placed inside of these muffins?", "choices": ["raspberries", "strawberries", "watermelons", "blueberries"], "correct_choice_idx": 3, "direct_answers": ["raisins", "blueberry", "blueberry", "blueberries", "muffins", "berries", "cake", "nothing", "blue", "blueberries"], "difficult_direct_answer": false, "rationales": ["The muffins in the baking pan have blueberries baked into them.", "Blueberries are in the muffins.", "They are dark fruit and common in muffins"], "image": "train2014/COCO_train2014_000000162230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260637, "question_id": "dQNvLr2ZKBfdC82xbdmaRL", "question": "What vehicle is in the foreground?", "choices": ["tank", "car", "airplane", "helicopter"], "correct_choice_idx": 2, "direct_answers": ["plane", "airplane", "plane", "airplane", "airplane", "vehicle", "plane", "plane", "airplane", "plane"], "difficult_direct_answer": false, "rationales": ["There is a plane on the ground.", "The airplane is the type of plane in the ground.", "There's also one in the background because this is likely an air field."], "image": "train2014/COCO_train2014_000000260637.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15827, "question_id": "dSdprmnt4rQRsBxBHB74Rm", "question": "Who rides these types of vehicles?", "choices": ["cab drivers", "army sergeants", "coach drivers", "bikers"], "correct_choice_idx": 3, "direct_answers": ["bikers", "bikers", "adults", "person", "bikers", "bikers", "bikers", "people", "humans", "people"], "difficult_direct_answer": false, "rationales": ["Those are motor-powered two-wheeled vehicles common for those kinds of riders.", "The bikes are for bikers.", "Bikers ride motorcycles."], "image": "val2014/COCO_val2014_000000015827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 576518, "question_id": "dU2swHt8q5xzft69mrQceV", "question": "Which former teammate of Chipper Jones is standing in the on-deck circle?", "choices": ["otis nixon", "david wright", "adam laroche", "mike trout"], "correct_choice_idx": 2, "direct_answers": ["catcher", "teammate", "stop", "middle", "car", "bruce kisson", "13", "black", "maddux", "adam laroche"], "difficult_direct_answer": true, "rationales": ["The number shown is that athletes number.", "Adam laroche is standing there", "The teammate is adam."], "image": "train2014/COCO_train2014_000000576518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139865, "question_id": "dVWSC7yhGbx3ymocNNaAGv", "question": "The bottled drink on the right side of the table is what color?", "choices": ["white", "green", "pink", "blue"], "correct_choice_idx": 2, "direct_answers": ["pink", "orange", "orange", "pink", "orange", "red", "red bottle", "pink", "black", "pink"], "difficult_direct_answer": false, "rationales": ["The drink on the right is not blue, white, or green.", "The drink is not blue, white, or green.", "A glass bottle is filled with a pink liquid and is on a table next to a plate of food."], "image": "train2014/COCO_train2014_000000139865.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480133, "question_id": "dVcN3rfQBYg6M7VskFHM55", "question": "How many colors are ejected from the planes flying in formation?", "choices": ["four", "five", "six", "seven"], "correct_choice_idx": 1, "direct_answers": ["five", "three", "three", "five", "three", "five", "two", "five", "three", "five"], "difficult_direct_answer": false, "rationales": ["There are five different colors extending from the end of the jet engines.", "All you have to do is count the different types of colors from the smoke.", "There are five colors."], "image": "val2014/COCO_val2014_000000480133.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401721, "question_id": "dXYZjFyQcGUHtGRrvthMMH", "question": "What is the rolltop object used for?", "choices": ["hold coffee", "hold flour", "bread box", "paying bills"], "correct_choice_idx": 2, "direct_answers": ["spices", "hold bread", "bread", "bread", "toaster", "bread", "bread", "bread box", "bread", "unsure"], "difficult_direct_answer": false, "rationales": ["The box is used to store bread.", "This keeps bugs out", "It is in the shape of bread and used for keeping bread fresh because it spoils easily."], "image": "train2014/COCO_train2014_000000401721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 325956, "question_id": "dYH4KnutQdEzananWw2dur", "question": "What is the person in the air wearing?", "choices": ["christmas lights", "cow bells", "tie", "sneakers"], "correct_choice_idx": 3, "direct_answers": ["skateboard", "cloths", "sneakers", "black jeans", "black jeans", "casual clothes", "jeans", "tennis shoes", "clothes", "nike shoes"], "difficult_direct_answer": true, "rationales": ["Those things help protect his feet. they also have that nike symbol, famous among shoes.", "The person has sneakers.", "They have white nike logo that is known for snickers."], "image": "train2014/COCO_train2014_000000325956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153184, "question_id": "dZKb9edV9iuDki3Ce5ZJ8e", "question": "How many birds grazing on the top of the giraffe's chest?", "choices": ["three", "two", "one", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "birds", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3 birds on the giraffe.", "Three birds are sitting on top of the giraffe's neck.", "There are 3 of them."], "image": "val2014/COCO_val2014_000000153184.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368746, "question_id": "daWtH3JMbtSVvtwaBtZvEo", "question": "What is the man holding?", "choices": ["dog", "baby", "cat", "tree log"], "correct_choice_idx": 1, "direct_answers": ["baby", "baby", "baby", "cake", "baby", "baby", "baby", "child", "celebrating", "baby"], "difficult_direct_answer": false, "rationales": ["A person is bent over a cake blowing the candles out and small feet can be seen below his arms in which he is cradling something.", "He has a small child in his arm.", "The man has a baby on his side."], "image": "train2014/COCO_train2014_000000368746.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8010, "question_id": "dc33rX9yFXjUHSqR9QUQYB", "question": "How many blurry figures are passing the ocean with a surfboard in their hands?", "choices": ["three", "four", "two", "one"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["One can make out a pair of humanoid figures carrying surfboards.", "One blurry figure is on the left. an additional blurry figure is on the right.", "There are two people"], "image": "val2014/COCO_val2014_000000008010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508177, "question_id": "dcRmH6bwXqTPr8tpd6xZdP", "question": "What color is the real toothbrush to the left side and rear of the toothbrush holder?", "choices": ["blue", "red", "purple", "orange"], "correct_choice_idx": 2, "direct_answers": ["purple", "purple", "purple", "purple", "purple", "purple", "purple", "pink", "purple", "purple"], "difficult_direct_answer": false, "rationales": ["It's a lavender color with white", "The toothbrush on the left is a mauve color.", "The color is purple."], "image": "train2014/COCO_train2014_000000508177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108722, "question_id": "dd7rWkfAw4EnGsFkMciroo", "question": "What color is the small boat to the far left in the line of boats going down the dock?", "choices": ["red", "white", "pink", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "white", "white", "white/brown", "white blue", "blue", "blue", "red", "blue"], "difficult_direct_answer": false, "rationales": ["It has this on the back and the rest is white", "The boat on the left is blue in color. the blue is on the bottom of the boat.", "It has this color on the back and the sides are hard to see"], "image": "train2014/COCO_train2014_000000108722.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73093, "question_id": "ddv6F8g49EvYV4u4NpjddD", "question": "What is the boy doing?", "choices": ["pitching", "sleeping", "dancing", "eating"], "correct_choice_idx": 0, "direct_answers": ["pitching", "baseball", "pitching", "throwing baseball", "throwing ball", "playing baseball", "pitching", "playing baseball", "ball", "playing"], "difficult_direct_answer": false, "rationales": ["He is getting ready to throw the ball.", "The kid is pitching.", "To throw a baseball you must put your arm back before throwing the ball forward."], "image": "val2014/COCO_val2014_000000073093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78861, "question_id": "derLiNMykYteRDVuRmUrof", "question": "Which person is most likely flying the kite?", "choices": ["red jacket", "no shirt", "no one", "white vest"], "correct_choice_idx": 3, "direct_answers": ["left", "front person", "kid", "tallest", "left", "person2", "white vest", "mom", "front person", "left"], "difficult_direct_answer": false, "rationales": ["There is a person with their back to us with a black cap and white vest. they have a shadow casting with a string from their hand.", "You can tell by the shadow as to who is flying the kite.", "The person wears white."], "image": "train2014/COCO_train2014_000000078861.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 262777, "question_id": "devwhVoPhaEQnbUMUeizTn", "question": "What number of birds is sitting on top of the electric bar?", "choices": ["one", "two", "four", "three"], "correct_choice_idx": 0, "direct_answers": ["one", "two", "two", "one", "one", "pigeon", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The other birds are flying.", "A pigeon is perched atop a power line bar while others fly way above him. there are approximately 9 million pigeons in new york city.", "A bird is perched near an electrical wire with others flying overhead."], "image": "train2014/COCO_train2014_000000262777.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484563, "question_id": "dfhSXKPgb5RUBquJ3j6LTw", "question": "What type of vehicle is the man riding?", "choices": ["john deere", "big wheel", "krazy kart", "matchbox"], "correct_choice_idx": 1, "direct_answers": ["big wheel", "tricycle", "tricycle", "bicycle", "big wheel", "baby cycling", "tricycle", "big wheel", "big wheel", "tricycle"], "difficult_direct_answer": false, "rationales": ["It has a large pink wheel infront.", "It has a large wheel in the front and two small wheels in the back.", "The man is visibly on a man-powered vehicle with three wheels and one larger than the others. this type of vehicle can be known as answer a."], "image": "train2014/COCO_train2014_000000484563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82418, "question_id": "dgphBvWhUeLGQVkauQWBLN", "question": "What brand of technology is placed on the device in front of the cat?", "choices": ["asus", "apple", "dell", "hp"], "correct_choice_idx": 1, "direct_answers": ["apple", "laptop", "apple", "apple", "laptop", "apple", "apple computer", "laptop", "apple", "laptop"], "difficult_direct_answer": false, "rationales": ["The logo of the laptop is visible and known to be associated with answer a company.", "It's the logo for apple.", "You can tell by the apple logo as to what company made the laptop."], "image": "train2014/COCO_train2014_000000082418.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250240, "question_id": "dgq7XVYbiQjRcXvWJyGbXp", "question": "What color is the handle of the mop tucked against the corner of the wall?", "choices": ["white", "black", "red", "blue"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is easily identifiable by observation. it is a bright color and easily visible when contrasted against the light background.", "There is a red handle on the mop next to the toilet.", "It is a bright primary color"], "image": "val2014/COCO_val2014_000000250240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523495, "question_id": "dhcdgwgyBZbDYTL8nBbKPj", "question": "What is on top of the hot dogs in the center of the table?", "choices": ["macaroni", "sauerkraut", "ketchup", "mashed potatoes"], "correct_choice_idx": 0, "direct_answers": ["macaroni cheese", "chili", "mustard", "macaroni", "cheese", "mustard", "cheese", "man", "macaroni", "cheese"], "difficult_direct_answer": false, "rationales": ["The other options aren't shown on top of them.", "The top has macaroni.", "There is macaroni and cheese on top of the hot dogs."], "image": "train2014/COCO_train2014_000000523495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 104898, "question_id": "dheA2Ws6eKDRSQjW9mqXbn", "question": "What animals are most similar to these?", "choices": ["bison", "horses", "wolves", "foxes"], "correct_choice_idx": 1, "direct_answers": ["horse", "horses", "zebra", "horse", "horse", "horses", "zebra", "zebras", "zebras", "horses"], "difficult_direct_answer": false, "rationales": ["The zebras are similar in anatomy to horses. they have four legs and a mane.", "Both these animals are shaped the same.", "Zebras are gathered together and walking."], "image": "train2014/COCO_train2014_000000104898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331738, "question_id": "dhnAyUYr55sKyH6UQATt5V", "question": "Which nation's flag is on the tail fin of this aircraft?", "choices": ["france", "germany", "usa", "uk"], "correct_choice_idx": 0, "direct_answers": ["france", "great britain", "france", "france", "japan", "france", "france", "yugoslavia", "america", "france"], "difficult_direct_answer": false, "rationales": ["The tail of a plane is painted red, white, and blue.", "You can tell by the colors as to what country it is from.", "The flag of france is on the tail fin of the aircraft."], "image": "train2014/COCO_train2014_000000331738.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465835, "question_id": "di8ruHpUwhKCkgcJivQUJu", "question": "What has stripes here?", "choices": ["jacket", "zebras", "tshirt", "beer bottle"], "correct_choice_idx": 1, "direct_answers": ["zebras", "zebras", "zebras", "zebra", "zebras", "zebra", "zebra", "zebras", "zebras", "zebras"], "difficult_direct_answer": false, "rationales": ["The animals in the grass that have stripes are called zebras.", "A group of zebras stand near each other.", "Zebras are animals with black and white stripes."], "image": "val2014/COCO_val2014_000000465835.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 512316, "question_id": "diyASz6auyLRU8ykCkQG6w", "question": "What feature do these animals have?", "choices": ["wings", "quills", "tusks", "long neck"], "correct_choice_idx": 3, "direct_answers": ["long neck", "care", "spots", "spots", "tall", "long neck", "long necks", "ears", "spots", "long neck"], "difficult_direct_answer": false, "rationales": ["These animals are giraffes, not porcupines, birds, or elephants.", "The animals are giraffes and they are known for having a long neck.", "They have very long necks."], "image": "train2014/COCO_train2014_000000512316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 432053, "question_id": "djSwixaoWNoFrWoxxde62D", "question": "What kind of fruits are inside of the sunflower bowl on top of the table?", "choices": ["banana", "raspberry", "apple", "strawberry"], "correct_choice_idx": 0, "direct_answers": ["bananas", "bananas", "bananas", "banana", "bananas", "banana", "banana", "bananas", "bananas", "bananas"], "difficult_direct_answer": false, "rationales": ["The bowl contains yellow curved fruit that grow together in a bunch from a stem.", "The fruit is a banana.", "Bananas are long and yellow, and all of the other fruits are small and red. since the only items in the bowl are long and yellow, there are bananas in the bowl."], "image": "train2014/COCO_train2014_000000432053.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221400, "question_id": "djnHmzyQjmUNSdibWmAndR", "question": "What type of facility is likely displaying these cutting implements?", "choices": ["hotel", "museum", "school", "library"], "correct_choice_idx": 1, "direct_answers": ["knife", "musiem", "museum", "museum", "hospital", "museum", "salon", "cut", "museum", "museum"], "difficult_direct_answer": false, "rationales": ["These are old surgical tools", "Antiques are under glass with an informational tag to the side.", "The facility is a museum."], "image": "train2014/COCO_train2014_000000221400.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 281815, "question_id": "dmnc2ketLr4K2BXAaX23uA", "question": "How many computer screens are around the cat sleeping on the desk?", "choices": ["two", "three", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "one", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two monitors are present.", "There are 2.", "You can tell by the laptop and computer monitor."], "image": "train2014/COCO_train2014_000000281815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19769, "question_id": "dsERzFqMBGAN3UCsUbVHna", "question": "What feature does this animal have?", "choices": ["quill", "fins", "wings", "tail"], "correct_choice_idx": 3, "direct_answers": ["tail", "tail", "paint", "tusks", "trunk", "natural", "trunk tusks", "trunk", "trunk", "tusks"], "difficult_direct_answer": false, "rationales": ["It is hanging down from the back of the animal", "The animal is clearly an elephant based on its size, shape and visible features. answer a is a known body part of elephants and none of the other answer are.", "The elephant has a tail. the elephant has a long and skinny tail."], "image": "train2014/COCO_train2014_000000019769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549501, "question_id": "dtePqbWwwYTAGwSnD3EpEj", "question": "What kind of refined natural resource is used to power the white car?", "choices": ["jet fuel", "gasoline", "corn alcohol", "diesel fuel"], "correct_choice_idx": 1, "direct_answers": ["gasoline", "shadow", "solar", "gas", "riding", "gasoline", "gas", "gasoline", "feet", "gasoline"], "difficult_direct_answer": false, "rationales": ["This is an automobile", "An automobile is driving in the street.", "The white car is a normal car and is powered by gasoline."], "image": "train2014/COCO_train2014_000000549501.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224596, "question_id": "duRQpySY9qWP2TCpmibbN3", "question": "What color is the tip of this man's baseball bat?", "choices": ["pink", "red", "blue", "silver"], "correct_choice_idx": 3, "direct_answers": ["black white", "black", "white", "silver", "silver", "black", "white", "sliver", "white", "silver"], "difficult_direct_answer": false, "rationales": ["The bat tip is this light shiny color.", "It's likely a metal bat with silver and black as the colors.", "It's shiny and metallic like chrome"], "image": "train2014/COCO_train2014_000000224596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 279920, "question_id": "dvaHFM7Gn4karzgdhSfm3A", "question": "What are the giraffes playing around?", "choices": ["babies", "cars", "coyotes", "trees"], "correct_choice_idx": 3, "direct_answers": ["around trees", "trees", "green", "tree", "trees", "trees", "tree", "trees", "trees", "trees"], "difficult_direct_answer": false, "rationales": ["Giraffes hang around these to eat the leaves.", "The animals are by tall trees.", "The giraffes are all standing around trees, eating the leaves."], "image": "train2014/COCO_train2014_000000279920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 419971, "question_id": "dvu3GC8fhqWyLFtvZJJL3f", "question": "What color is the cake at the bottom of the urinal?", "choices": ["green", "yellow", "red", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "white", "yellow", "green", "white", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The urinal cake is blue.", "There are two long vertical urinals with a bluish object in bottom. it serves as a way to freshen up a otherwise odor of pee.", "You can see the color on the round circle at the bottom"], "image": "val2014/COCO_val2014_000000419971.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438537, "question_id": "dwqA6yBU8hSAFLYNr2CXAh", "question": "What continent is this most likely?", "choices": ["asia", "south america", "north america", "europe"], "correct_choice_idx": 0, "direct_answers": ["india", "asia", "asia", "asia", "india", "asia", "india", "india", "village", "india"], "difficult_direct_answer": false, "rationales": ["The climate is favorable to rice and cattle like in asia.", "Rice patties grow primarily in asia.", "There is rice growing in the water which is common there"], "image": "train2014/COCO_train2014_000000438537.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 267925, "question_id": "dxBwi2yzm7Z6pSGvQoH5vA", "question": "What color is the baseball helmet worn by the batter who had just hit the ball?", "choices": ["black", "green", "white", "red"], "correct_choice_idx": 0, "direct_answers": ["black", "black", "black", "blue", "black", "white", "black", "red", "black", "blue"], "difficult_direct_answer": false, "rationales": ["The baseball helmet is black.", "The color is black.", "The person that is still holding the bat has on a dark colored helmet."], "image": "train2014/COCO_train2014_000000267925.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483458, "question_id": "dxHMCs4hfTVXDJcnVhw5bb", "question": "What is near the vehicles?", "choices": ["banana", "elephant", "cat", "coyote"], "correct_choice_idx": 1, "direct_answers": ["elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "car", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["A large animal with a long trunk is walking across a dirt road with cars waiting behind.", "The elephant is nearby.", "The pachyderm is identifiable by its large size, gray color, big trunk and it lives in arid places."], "image": "train2014/COCO_train2014_000000483458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390346, "question_id": "dxZXrirwBYG4fkNdpZrsde", "question": "What does the animal in the foreground have?", "choices": ["stinger", "wings", "horns", "gills"], "correct_choice_idx": 2, "direct_answers": ["cow", "horns", "grass", "horns", "horns", "buffalo", "horns", "horns", "horns", "horns"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to this animal or appear in this image. bulls like this one were worshipped by ancient peoples.", "The animal in the foreground has huge horns.", "You can see these on either side of the animal."], "image": "train2014/COCO_train2014_000000390346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504790, "question_id": "dyqpWAoAdBXBgezkNiVNyJ", "question": "What does the mayonnaise dressing for the sandwiches contain elements of?", "choices": ["garlic", "parsley", "bacon", "dijon"], "correct_choice_idx": 3, "direct_answers": ["meat", "egg", "eggs", "vegetables", "burger", "dijon", "dijon", "egg", "orange", "dijon"], "difficult_direct_answer": false, "rationales": ["According to the label on the mayonnaise, it contains french mustard.", "The writing on the mayo container details what is inside it and answer a is written on the side.", "It is a little yellow and it says it on the bottle"], "image": "val2014/COCO_val2014_000000504790.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132104, "question_id": "e2HQLByaPZ9TzCFfoqw8pZ", "question": "What is side by side?", "choices": ["babies", "rabbits", "laptops", "cows"], "correct_choice_idx": 2, "direct_answers": ["laptops", "laptop", "laptops", "laptops", "laptops", "laptops", "laptop", "laptop", "laptops", "laptops"], "difficult_direct_answer": false, "rationales": ["The other options don't appear on the table. it looks like the screens are synced as well.", "There are two laptops sitting side by side on the table.", "There are no animals or people. there are two portable computers."], "image": "train2014/COCO_train2014_000000132104.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168434, "question_id": "e2LewxEiTzc75Fve7MWjxz", "question": "What nation's national flag is on the side of this train engine?", "choices": ["uk", "france", "usa", "ireland"], "correct_choice_idx": 2, "direct_answers": ["usa", "united states", "usa", "united states", "america", "usa", "america", "united states", "usa", "united states"], "difficult_direct_answer": false, "rationales": ["There is an american flag on the side of the yellow train engine.", "The flag is the stars and stripes representing the united states.", "The flag is the us's."], "image": "val2014/COCO_val2014_000000168434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268111, "question_id": "e2WSDA3SJpKQoRswSmb7jJ", "question": "What kind of dog do these goats somewhat resemble?", "choices": ["great dane", "beagle", "sheepdog", "rottweiler"], "correct_choice_idx": 1, "direct_answers": ["german", "beagle", "hound", "beagle", "dalmatian", "australian shepherd", "nothing", "collie", "stop", "beagle"], "difficult_direct_answer": false, "rationales": ["White goats have long, brown ears.", "The dog is a beagle.", "The goats are mostly white. they have brown ears and partially brown faces."], "image": "train2014/COCO_train2014_000000268111.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218580, "question_id": "e3UQSFVKuC2o796UpRe9zD", "question": "What is the child learning to do?", "choices": ["ski", "bake", "play baseball", "play chess"], "correct_choice_idx": 0, "direct_answers": ["skii", "ski", "ski", "ski", "ski", "skating", "ski", "ice skating", "ski", "skate"], "difficult_direct_answer": false, "rationales": ["She is learning to ski.", "The child is on snow and has sports equipment.", "The kid wants to ski."], "image": "train2014/COCO_train2014_000000218580.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352006, "question_id": "e3ZY7AmTawuW4mADVGTbi3", "question": "What are all of the sheep gathering around in their field?", "choices": ["dog", "hay", "gate", "salt lick"], "correct_choice_idx": 1, "direct_answers": ["food", "grazing", "hay", "hay", "hay", "goats", "hay", "hay", "hay", "chained"], "difficult_direct_answer": false, "rationales": ["The sheep are eating the hay.", "Sheep eat hay.", "They are wanting to eat."], "image": "train2014/COCO_train2014_000000352006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 369952, "question_id": "e4QX7yHaRTf27i3bcTjwyE", "question": "What is the movement the boy in the front left is doing called?", "choices": ["full force", "french fries", "pizza", "head strong"], "correct_choice_idx": 2, "direct_answers": ["bogen", "skating", "skating", "pizza", "skiing", "skiing", "skating", "braking", "sking", "sliding"], "difficult_direct_answer": false, "rationales": ["The boy in question has his skis angled towards each other in front of him. in this setting, while skiing at the apparent level the boy is at, this positioning is referred to as answer a.", "The boy in the front left is doing a pizza with his skis.", "It's a snowplow and does look like a pizza slice"], "image": "train2014/COCO_train2014_000000369952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214699, "question_id": "e4itudtQWDVpfoziX4suYC", "question": "At least how many people can ride an elephant at once?", "choices": ["ten", "three", "eight", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "two", "three"], "difficult_direct_answer": false, "rationales": ["The first elephant has two women and a man.", "Only three riders at a time.", "There are 3."], "image": "train2014/COCO_train2014_000000214699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578962, "question_id": "e59JmefjfobeGLJXjcgPfT", "question": "Which country is this street station a part of?", "choices": ["thailand", "greece", "georgia", "romania"], "correct_choice_idx": 0, "direct_answers": ["bangkok", "highway", "bangkok", "thailand", "thailand", "thailand", "thailand", "bangkok", "bangkok", "thailand"], "difficult_direct_answer": false, "rationales": ["The station is in bangkok which is the capital.", "The station is at thailand.", "The name of the city is on the building"], "image": "val2014/COCO_val2014_000000578962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 549042, "question_id": "e5TpEXrVFqEndWJTNSBNEi", "question": "What is the elephant wearing?", "choices": ["basket", "hat", "blue ribbon", "crown"], "correct_choice_idx": 2, "direct_answers": ["saddle", "basket", "howdah", "blue ribbon", "house", "saddle", "saddle", "table", "platform", "saddle"], "difficult_direct_answer": false, "rationales": ["The piece of cloth is tied around the elephant's neck.", "There is a rope for the rider to hold onto", "The elephant is wearing a blue ribbon around its neck."], "image": "train2014/COCO_train2014_000000549042.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329963, "question_id": "e5gKYrL2opbTvbv3CrVkzs", "question": "What are the two horns on this animal called?", "choices": ["ossicones", "antlers", "pedicles", "scurs"], "correct_choice_idx": 0, "direct_answers": ["ossicones", "ossicones", "ossicones", "ossicones", "ossicones", "ossicones", "antlers", "ossicones", "antlers", "ossicones"], "difficult_direct_answer": false, "rationales": ["This is the name for the protrusions on a giraffe", "That is what the horns are called.", "A group of giraffes are standing near each other and all have two short protrusions on their heads. protrusions on giraffe heads are called ossicones."], "image": "train2014/COCO_train2014_000000329963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81476, "question_id": "e7LPRyYjHaXambpaESrjZ5", "question": "What is by the water?", "choices": ["cats", "polar bears", "foxes", "wolves"], "correct_choice_idx": 1, "direct_answers": ["polar bears", "both", "polar bear", "bear", "bears", "bears", "bear", "bears", "bear", "polar bears"], "difficult_direct_answer": false, "rationales": ["There are white bears.", "The polar bears are by water.", "Those kinds of animals are white and live in the arctic."], "image": "train2014/COCO_train2014_000000081476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89638, "question_id": "e7aiU2CE9nkbEemDZVE5iM", "question": "What would a person eat if they only eat food that is the same color as the topmost item in the red bowl?", "choices": ["blueberry", "cherry", "salad", "oranges"], "correct_choice_idx": 2, "direct_answers": ["beef", "spinach", "salad", "broccoli", "salad", "broccoli", "salad", "green", "herbs", "brocolli cilantro"], "difficult_direct_answer": false, "rationales": ["Consumes vegetables that are well prepared in one dish as a salad.", "Cilantro is green and is in a bowl on top of other food.", "The salad is on top of the bowl."], "image": "val2014/COCO_val2014_000000089638.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418854, "question_id": "e7e3iZnfPSbJVEUKyCoCEA", "question": "Why are the walls tiled?", "choices": ["touch", "feel", "water", "sun"], "correct_choice_idx": 2, "direct_answers": ["bathroom wall", "decoration", "clean", "easy cleaning", "water", "keep clean", "waterproofing", "bathroom", "cleanliness", "insulation"], "difficult_direct_answer": true, "rationales": ["This is an interior bathroom. it has a shower.", "The walls have water.", "The bathroom tiles protect the walls from splashes."], "image": "val2014/COCO_val2014_000000418854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44079, "question_id": "eAcvf9HMVEwHZXDW4qccoz", "question": "What does the company whose logo appears at the top specialize in?", "choices": ["yogurt", "pizza", "hot dogs", "broccoli"], "correct_choice_idx": 0, "direct_answers": ["sour cream", "yogurt", "banana", "yogurt", "food warmers", "yogurt", "yogurt", "yogurt", "yogurt", "vegetable"], "difficult_direct_answer": false, "rationales": ["The object in question is consistent with answer a. the brand is also readable and can be internet searched if not commonly known.", "The logo on the top is for fage who makes the greek product.", "The company specializes in yogurt."], "image": "train2014/COCO_train2014_000000044079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488768, "question_id": "eCGGoJauzCcFFSzxyFuGpu", "question": "What is closest to the toilet bowl?", "choices": ["poster", "toilet paper", "cat", "flowers"], "correct_choice_idx": 3, "direct_answers": ["watching", "flowers", "flowers", "flowers", "flowers", "car", "stop", "flowers", "plant", "pink"], "difficult_direct_answer": false, "rationales": ["Branches with flowers are near a white toilet in a bathroom.", "The flowers are closest.", "The flowers are closest."], "image": "train2014/COCO_train2014_000000488768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282190, "question_id": "eCM6M7R32H7Bqh9fUaPwrk", "question": "Who is most likely to take the toilets on the sidewalk?", "choices": ["trash company", "neighbor", "police", "ambulance"], "correct_choice_idx": 0, "direct_answers": ["trash", "garbage man", "trashmen", "garbage truck", "garbage man", "garbage men", "trash company", "clean", "garbage men", "man"], "difficult_direct_answer": false, "rationales": ["The toilets are surrounded by garbage bags.", "They are placed next to the pile of garbage.", "A trash company will pick them up."], "image": "train2014/COCO_train2014_000000282190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109146, "question_id": "eDuNkAuRxUtdz7XQup9d4Z", "question": "What type of movie would this scene appear in?", "choices": ["mountaintop mystery", "bowling documentary", "western", "swimming drama"], "correct_choice_idx": 2, "direct_answers": ["western", "western", "western", "western", "western", "western", "western", "reality", "cowboy", "western"], "difficult_direct_answer": false, "rationales": ["Westerns are known for having horses and there are two horses outside of the building.", "There are horses. there is no bowling alley, swimming pool, or mountain.", "The animals are horses, which are featured in the genre listed in a."], "image": "val2014/COCO_val2014_000000109146.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181920, "question_id": "eEgoigfk3tF6csUEYaKdvu", "question": "What are the zebras all standing inside of?", "choices": ["grass", "dirt", "stream", "lake"], "correct_choice_idx": 2, "direct_answers": ["pond", "stream", "water", "lake", "water", "water", "water", "water", "pond", "water"], "difficult_direct_answer": false, "rationales": ["The zebras are standing inside of a stream to drink water and cool off.", "Animals are standing in a small body of water with land on two sides.", "The zebras are by a stream."], "image": "train2014/COCO_train2014_000000181920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122542, "question_id": "eFXruEkz6VqoW4x9UB4sYm", "question": "What are both of the people wearing?", "choices": ["suspenders", "ties", "sunglasses", "crowns"], "correct_choice_idx": 2, "direct_answers": ["ski suits", "vests", "dress", "sunglasses", "vests", "jackets", "vests", "vests", "vests", "vests"], "difficult_direct_answer": false, "rationales": ["The dark eye wear can be seen on both their faces.", "Both of the women standing on the ski slope are wearing sunglasses.", "Both people are wearing sunglasses to keep the sun out of their eyes."], "image": "val2014/COCO_val2014_000000122542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162770, "question_id": "eGz2dn2ZQ2Ect492MUs7tA", "question": "What is near the top of the tower?", "choices": ["balloon", "gargoyle", "clock", "airplane"], "correct_choice_idx": 2, "direct_answers": ["clock", "clock", "clock", "clock", "clock", "clock", "clock", "clock", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["There is only one structure that is clearly a tower based on its tall and narrow features. the distinct object towards the top is round with a white face and black hands.", "The top of the tower has arms and numbers.", "A time piece is at the top of a tall building."], "image": "train2014/COCO_train2014_000000162770.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32456, "question_id": "eHUAxx55cbEjCazXAKMRBJ", "question": "How many cars are parked on the road behind the zebra and giraffe?", "choices": ["two", "four", "three", "one"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are a trio of vehicles on the road.", "There are several cars visible on a road behind animals eating and grazing.", "There are three cars parked."], "image": "val2014/COCO_val2014_000000032456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310240, "question_id": "eKiGBQDHXiwHybaMDfC7h6", "question": "Where are these animals living?", "choices": ["grasslands", "tundra", "enclosure", "plains"], "correct_choice_idx": 2, "direct_answers": ["zoo", "zoo", "zoo", "zoo", "preserve", "meadows", "zoo", "enclosure", "zoo", "park"], "difficult_direct_answer": false, "rationales": ["The animals are in a enclosed area.", "There are fences in the background and a building far out. the fences keep them in from going into the city.", "They are living in a fenced in area."], "image": "train2014/COCO_train2014_000000310240.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454565, "question_id": "eLvtfBaJ4gGetqcVEPnosm", "question": "Which country does this aircraft brand originate from?", "choices": ["canada", "america", "chile", "mexico"], "correct_choice_idx": 1, "direct_answers": ["usa", "united states", "america", "united states", "usa", "united states", "delta", "u.s.a", "america", "usa"], "difficult_direct_answer": false, "rationales": ["Its from the usa", "The airplane on the tarmac is made by delta airlines which is based in america.", "There is a usa flag on the plane."], "image": "train2014/COCO_train2014_000000454565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 471188, "question_id": "eNYNee3P4G6B8heQYQBfEu", "question": "What company owns vehicles similar to the ones in the street?", "choices": ["tesla", "dunkin donuts", "mcdonalds", "greyhound"], "correct_choice_idx": 3, "direct_answers": ["res", "unknown", "stop", "ogdens", "greyhound", "greyhound", "building", "greyhound", "company", "americans"], "difficult_direct_answer": false, "rationales": ["Greyhound is a bus company.", "The vehicle shown is a bus and the company greyhound has many of them.", "It's the only bus company name."], "image": "train2014/COCO_train2014_000000471188.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 12620, "question_id": "eNkmhedAt23LmcsZXu8d94", "question": "What do these animals have?", "choices": ["long necks", "wings", "horns", "talons"], "correct_choice_idx": 0, "direct_answers": ["zoo", "spots", "spots", "necks", "long necks", "long necks", "hair", "spots", "long neck", "spots"], "difficult_direct_answer": false, "rationales": ["The tall animal is known for having an elongated nape.", "They have long necks", "The animals have long necks."], "image": "train2014/COCO_train2014_000000012620.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538250, "question_id": "eP7tr68BdotRhHTA9Wn3qQ", "question": "What kind of a vehicle is this?", "choices": ["sports car", "airplane", "tank", "tram"], "correct_choice_idx": 3, "direct_answers": ["bus", "bus", "stop", "trolley", "trolley", "tram", "bus", "train", "trolley car", "bus"], "difficult_direct_answer": false, "rationales": ["You can tell by the height and design as to what type of vehicle it is.", "Trams run down roads on tracks.", "The vehicle is a tram."], "image": "train2014/COCO_train2014_000000538250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275603, "question_id": "ePBYMqZZHL3pkDv2zAAWQP", "question": "What might the giraffe have just been doing?", "choices": ["running", "eating", "sleeping", "walking"], "correct_choice_idx": 2, "direct_answers": ["laying", "sitting", "eating", "sleeping", "sleeping", "napping", "sleeping", "sitting", "sleeping", "eating"], "difficult_direct_answer": false, "rationales": ["A giraffe is laying the grass behind a fence.", "The giraffe looks tired. it was probably sleeping.", "He was resting."], "image": "train2014/COCO_train2014_000000275603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 403950, "question_id": "eQSuFqG98qZdhTkE7gTMek", "question": "What type of fencing contains the zebras into this area?", "choices": ["barbed wire", "chain link", "electrified wire", "wood"], "correct_choice_idx": 0, "direct_answers": ["iron", "barbed wire", "barbed wire", "electric", "barbed wire", "barbwire", "barbed wire", "bobbed wire", "barbed", "barbed wire"], "difficult_direct_answer": false, "rationales": ["You can tell by the way the wire's design as to what type of fence it is.", "The zebras are constricted to this area with a barbed wire fence.", "There are many animals standing in the grass and there are metal tipped fencing around to keep them in."], "image": "val2014/COCO_val2014_000000403950.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102275, "question_id": "eQncUoBcxUuVDWo4fhT3wm", "question": "What are the animals standing near?", "choices": ["ocean", "trees", "pumpkins", "eggs"], "correct_choice_idx": 1, "direct_answers": ["tree", "pole", "pole", "pole", "trees", "trees", "tree", "trees", "tree", "zebra giraffe"], "difficult_direct_answer": false, "rationales": ["The animals are standing near the trees.", "The animals are by trees.", "They are all near trees."], "image": "val2014/COCO_val2014_000000102275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 274494, "question_id": "eQr8kNkFL55WGfKFGGLKyh", "question": "Why is the top of the step black?", "choices": ["contrast", "goth owner", "error", "slip pad"], "correct_choice_idx": 3, "direct_answers": ["slip pad", "match floor", "shower", "towel", "towel", "rubber", "for aesthetics", "non slip", "hides hair", "tiles color"], "difficult_direct_answer": true, "rationales": ["A black slip pad has been placed on the tile step to prevent slipping when the step is wet.", "The top is a slip pad.", "You can easily fall when you are wet and where water is involved so you must have a non slip material."], "image": "val2014/COCO_val2014_000000274494.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329908, "question_id": "eRwpW3cCaiiPuxaP4JXRyD", "question": "How many giraffes are standing up?", "choices": ["six", "four", "three", "ten"], "correct_choice_idx": 0, "direct_answers": ["six", "six", "six", "six", "six", "six", "six", "six", "six", "six"], "difficult_direct_answer": false, "rationales": ["There are six giraffes standing in the savannah field.", "There are six giraffes standing up in the grass field.", "5 are pointing the same way and one is facing the camera"], "image": "train2014/COCO_train2014_000000329908.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225458, "question_id": "eSyeornStDS46XRaJMuY7R", "question": "What would you use to eat the food in the bowl?", "choices": ["spoon", "chopsticks", "fork", "knife"], "correct_choice_idx": 0, "direct_answers": ["spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon", "spoon"], "difficult_direct_answer": false, "rationales": ["There is a four-prong eating utensil by the plate.", "You would use a spoon.", "A spoon is used most often to eat soup."], "image": "train2014/COCO_train2014_000000225458.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 474732, "question_id": "eTQqjMPYsAoGJPcrjaRdvY", "question": "What color is the circular light around the small mirror on the wall?", "choices": ["blue", "red", "green", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "white", "white", "flourescent", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["It is clearly bright as the color of the milk.", "Nearly all lights are white so they can illuminate a room.", "This is a makeup mirror so people can see themselves clearly as they get ready"], "image": "train2014/COCO_train2014_000000474732.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 102834, "question_id": "eTXxSiVDVruGdJAnHanFPj", "question": "What color is the rain jacket worn by the woman in the rainforest?", "choices": ["red", "purple", "teal", "orange"], "correct_choice_idx": 2, "direct_answers": ["green", "teal", "light blue", "blue", "blue", "blue", "blue", "blue", "green", "light blue"], "difficult_direct_answer": false, "rationales": ["Her rainjacket is not purple, red, or orange.", "The woman's jacket is not purple, red, or orange.", "It's a light blue with a hint of green that makes this color"], "image": "train2014/COCO_train2014_000000102834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 19769, "question_id": "eUQpRxiWiykNn3BuCWWCmz", "question": "What is the elephant following the human doing in the zoo?", "choices": ["eating", "painting", "walking", "dreaming"], "correct_choice_idx": 1, "direct_answers": ["painting", "painting", "paint", "drawing", "painting", "painting", "painting", "painting", "painting", "painting"], "difficult_direct_answer": false, "rationales": ["The subjects are positioned in front of an easel and are adding colors to a white canvas using a tool.", "There is an easel with a canvas in front of the elephant. there is a brush in the elephant's trunk.", "It is holding a paintbrush in its trunk"], "image": "train2014/COCO_train2014_000000019769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 438093, "question_id": "eWERrhuXNWK8MsVqrmW2aU", "question": "What is the liquid below the fish?", "choices": ["alcohol", "sauce", "drool", "drink"], "correct_choice_idx": 1, "direct_answers": ["dish", "sauce", "sauce", "juice", "butter", "sauce", "garlic", "sauce", "sauce", "oil"], "difficult_direct_answer": false, "rationales": ["The liquid is a sauce.", "Answer a is consistent with the food visible, the consistency and the serving style.", "There is some kind of butter sauce underneath the fish."], "image": "val2014/COCO_val2014_000000438093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112720, "question_id": "eWXiNhgFkiwEK23CEVEsJk", "question": "Who might these kids admire if they love this sport?", "choices": ["pele", "tony hawk", "michael phelps", "mike trout"], "correct_choice_idx": 3, "direct_answers": ["baseball", "babe ruth", "a-rod", "baseball", "mickey mantle", "mike trout", "nolan ryan", "babe ruth", "baseball", "babe ruth"], "difficult_direct_answer": false, "rationales": ["The kids on the grass are holding bats like the baseball player mike trout.", "Because they are playing with bats that are used for playing baseball.", "They would admire a baseball player."], "image": "train2014/COCO_train2014_000000112720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342693, "question_id": "eXB7EG63C66rE4QMWX4e3F", "question": "What color is the twine that is tied between the cages carrying sheep?", "choices": ["green", "pink", "red", "blue"], "correct_choice_idx": 2, "direct_answers": ["white", "red", "red", "red", "orange", "red", "red", "white", "white", "orange"], "difficult_direct_answer": false, "rationales": ["The color in option a matches the color of the twine.", "The red twine can be seen on not just the pen in the foreground, but the ones in the background as well.", "Twine is red colored."], "image": "train2014/COCO_train2014_000000342693.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 445906, "question_id": "eXF8ANsVRCCHUBTnDoU6re", "question": "What is all the way to the right?", "choices": ["elephant", "baby", "phone", "tiger"], "correct_choice_idx": 2, "direct_answers": ["iphone", "iphone", "cell phone", "ipod", "phone", "phone", "cellphone", "phone", "case", "phone"], "difficult_direct_answer": false, "rationales": ["There is a phone.", "There is a phone on the right.", "The right is the phone."], "image": "train2014/COCO_train2014_000000445906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475313, "question_id": "eXHKcjwvcMF3vTqR2uyVvD", "question": "What is the donut called that has ridges down the sides?", "choices": ["apple fritter", "eclair", "long john", "cruller"], "correct_choice_idx": 3, "direct_answers": ["french cruller", "donut", "cruller", "cruller", "dwdwd", "plain", "cruller", "krueller", "cruller", "cruller"], "difficult_direct_answer": false, "rationales": ["A ridged donut is called a cruller.", "The donut is a cruller.", "The answer is internet searchable based on the text of the question."], "image": "train2014/COCO_train2014_000000475313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 370583, "question_id": "eXuxPJJm2YdtCiFTBGxbCx", "question": "How many giraffes are standing together at this part of the zoo enclosure?", "choices": ["five", "four", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "car", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two giraffes.", "A large giraffe and a small one are together.", "A larger giraffe is standing close to a much smaller one."], "image": "train2014/COCO_train2014_000000370583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 5723, "question_id": "eYDAL5GDmXuB5jGMwwK5Ed", "question": "How many horses are standing in the middle of the grassy plain?", "choices": ["five", "four", "six", "seven"], "correct_choice_idx": 2, "direct_answers": ["six", "five", "five", "six", "six", "six", "six", "six", "five", "six"], "difficult_direct_answer": false, "rationales": ["The horses are countable based on their distinct outlines.", "There are six wild horses on the plains.", "There are six horses all standing on the grassy plain."], "image": "val2014/COCO_val2014_000000005723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507249, "question_id": "eYFFrDzPJ8UsjbaeTerdNX", "question": "What vehicles are here?", "choices": ["pogo sticks", "boats", "cars", "bicycles"], "correct_choice_idx": 3, "direct_answers": ["bicycle", "bicycles", "bike", "bicycles", "bicycles", "bicycles", "bicycle", "bicycles", "bicycles", "bikes"], "difficult_direct_answer": false, "rationales": ["There's a couple of bicycles.", "They are identifiable by being parked in a device that is common in parks to hold them. they have two wheels and spokes and handlebars and a seat to sit on which is characteristic of this type of device.", "There are bicycles."], "image": "val2014/COCO_val2014_000000507249.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539404, "question_id": "eYcrA9YKqnTWtfhrmgA2qJ", "question": "What color are the tiles in the bottom of the kitchen?", "choices": ["white", "beige", "purple", "black"], "correct_choice_idx": 1, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "beige", "white", "blue", "white", "gray"], "difficult_direct_answer": false, "rationales": ["The tiles on the wall are not black, white, or purple.", "They are slightly greenish and different from the other surfaces in the room", "The tiles are blue with black grout."], "image": "train2014/COCO_train2014_000000539404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554348, "question_id": "eYfm9PLT2Sr5LEvT5qxzjB", "question": "What is the girl wearing?", "choices": ["tiara", "sandals", "crown", "armor"], "correct_choice_idx": 1, "direct_answers": ["backpack", "backpack", "shorts", "backpack", "shorts tshirt", "shorts", "shorts", "shorts", "shorts", "sandals"], "difficult_direct_answer": false, "rationales": ["Her feet are not completely covered.", "Most of her feet are visible and there are straps across the back", "She has open shoes with straps"], "image": "val2014/COCO_val2014_000000554348.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 170976, "question_id": "ebBszuGfXNc3LoYCMwzGum", "question": "What type of source is providing power to the stove?", "choices": ["coal", "electricity", "wood", "natural gas"], "correct_choice_idx": 1, "direct_answers": ["electric", "electricity", "electricity", "gas", "electric", "electricity", "electricity", "electricity", "electricity", "electric"], "difficult_direct_answer": false, "rationales": ["The source is electricity.", "This oven looks like it uses electricty to power.", "The source is for electricity."], "image": "train2014/COCO_train2014_000000170976.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556616, "question_id": "ebqp34P6EpTReQNXucjnCb", "question": "What is to the left of the cars?", "choices": ["dog", "bicycles", "parking meters", "cats"], "correct_choice_idx": 2, "direct_answers": ["parking meter", "sign", "parking meter", "parking", "meters", "parking meters", "meter", "car", "bar", "meters"], "difficult_direct_answer": false, "rationales": ["Cars are parked at a device that you put coins in. people do this when they want to park.", "One can see the structures to the left of the cars that are used for collecting fees.", "There are receptacles to put money in to be able to park."], "image": "val2014/COCO_val2014_000000556616.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145792, "question_id": "ec6kW8GiTELzLBnb2tNg7P", "question": "What is the person walking on?", "choices": ["hot coals", "street", "dirt road", "snow"], "correct_choice_idx": 1, "direct_answers": ["crosswalk street", "crosswalk", "crosswalk", "girl", "crosswalk", "crosswalk", "street", "car", "floor", "crosswalk"], "difficult_direct_answer": false, "rationales": ["It is a hard paved surface. there is a crosswalk painted on it.", "A. because it is a cross walk on the road.", "The lining is on a road."], "image": "train2014/COCO_train2014_000000145792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335539, "question_id": "ecTvFWHVQWhw8SmuMXFtNi", "question": "What shape is the black tiles on the floor?", "choices": ["triangle", "oval", "square", "diamond"], "correct_choice_idx": 3, "direct_answers": ["diamonds", "diamond", "diamonds", "sure", "diamond", "diamond", "square", "squire", "diamond", "triangle"], "difficult_direct_answer": false, "rationales": ["There are a bunch of black diamonds adorning the floor.", "The small diagonally arranged square black tiles between the larger ones could be said to be diamond shaped.", "The shape is that of diamond."], "image": "val2014/COCO_val2014_000000335539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 238881, "question_id": "ef9oMdSZuLa5ymLtsk6LE5", "question": "Which food item on the table is highest in protein?", "choices": ["orange", "toast", "banana", "eggs"], "correct_choice_idx": 3, "direct_answers": ["egg", "eggs", "banana", "eggs", "eggs", "eggs", "two", "eggs", "egg", "bread"], "difficult_direct_answer": false, "rationales": ["Because the eggs are part of the animal produce.", "The rest of the food is carbohydrates besides the butter.", "The majority of the items are fruits which are not known to have high protein counts especially compared to what is on the plate. of the items on the plate, toast is also not known to have a lot of protein while the other food does."], "image": "train2014/COCO_train2014_000000238881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 93883, "question_id": "egYeBSPHvT7gjsibLuxCmL", "question": "What color is the horse that is pulling the yoke?", "choices": ["white", "pinto", "black", "chestnut"], "correct_choice_idx": 3, "direct_answers": ["brown", "chestnut", "brown", "brown", "brown", "brown", "brown white", "brown", "yoke", "brown"], "difficult_direct_answer": false, "rationales": ["The horse is a brown color and clearly visible. horses that are brown in color are commonly referred to as answer a.", "The horse is brown color.", "He is brown in color."], "image": "train2014/COCO_train2014_000000093883.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350819, "question_id": "eiPxGPuoYRJRYrD8G5CVod", "question": "Why is the fence behind the man orange?", "choices": ["water resistant", "protection", "uv reduction", "visibility"], "correct_choice_idx": 3, "direct_answers": ["sit", "visibility", "construction", "construction", "safety", "caution", "construction", "construction", "construction", "net"], "difficult_direct_answer": false, "rationales": ["The fence is for visibility.", "There is a brightly colored orange fence behind the man sitting on the bench.", "The fencing is commonly associated with construction sites where answer a is important to protection workers and pedestrians."], "image": "train2014/COCO_train2014_000000350819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546378, "question_id": "ejpRsNVZcEsnGJBrrm9shu", "question": "What is the cat near?", "choices": ["dog", "boxes", "egg carton", "books"], "correct_choice_idx": 3, "direct_answers": ["books", "books", "books", "books", "books", "books", "books", "books", "book", "books"], "difficult_direct_answer": false, "rationales": ["The cat is sitting on top of a shelf next to some books.", "You can tell by the books and the shelf they are on as to what the cat is near.", "The cat is near books."], "image": "val2014/COCO_val2014_000000546378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484749, "question_id": "ejzE7uqX8BhsnPpvZGZrVT", "question": "How many zebras are grazing in the field before the mountain?", "choices": ["three", "four", "two", "five"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "beautiful", "four"], "difficult_direct_answer": false, "rationales": ["There are four of them.", "There are 4 zebras.", "There are three heads on the ground picking at the grass for food."], "image": "train2014/COCO_train2014_000000484749.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518551, "question_id": "ekM2qFmdRQrcCJRMykzrEQ", "question": "What color is the elephant on the right?", "choices": ["brown", "pink", "gray", "white"], "correct_choice_idx": 3, "direct_answers": ["grey", "white", "beige", "grey", "grey", "white grey", "grey", "gray", "white", "grey"], "difficult_direct_answer": false, "rationales": ["The color is white.", "The color is light and easily visible. it is in sharp contrast to the green grass and a color highly prized among elephants.", "The color is white."], "image": "val2014/COCO_val2014_000000518551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25172, "question_id": "emQ5dTfQQQR9MZVbfQsQSk", "question": "What type of pasta is mixed in with the vegetables inside of the salad?", "choices": ["elbow", "spiral", "spaghetti", "bowtie"], "correct_choice_idx": 1, "direct_answers": ["cork screw", "rotini", "spiral", "macaroni", "spiral", "rotini", "spiral", "rotini", "sauce", "wheat"], "difficult_direct_answer": false, "rationales": ["There is spiral pasta mixed with the vegetables.", "Sometimes these are also known as drill bit or curly pasta.", "It is similar to how curly fries are shaped"], "image": "train2014/COCO_train2014_000000025172.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572630, "question_id": "emmoxcme2TxCUXS5Tj24qt", "question": "How many towers are in the medieval castle building?", "choices": ["two", "one", "three", "four"], "correct_choice_idx": 0, "direct_answers": ["many", "one", "one", "two", "two", "two", "two", "two", "fefef", "two"], "difficult_direct_answer": false, "rationales": ["There are two castle towers heading into the sky.", "There are two towers.", "There is a pair of them amongst the other buildings"], "image": "val2014/COCO_val2014_000000572630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217937, "question_id": "enUSVJ9RCvBFo9S7sSVciX", "question": "What is near the top of the food pile?", "choices": ["egg", "cabbage", "hot dog", "apple"], "correct_choice_idx": 0, "direct_answers": ["egg", "food pile", "herb", "parsley", "egg", "parsley", "egg", "egg", "egg", "leaves"], "difficult_direct_answer": false, "rationales": ["An egg is on top", "The item near the top of the food pile is white and yellow. it is not an apple, cabbage, or a hot dog.", "This food is distinguishable due to its white exterior and its yellow interior which you can generally see a yolk sac where the bird was meant to develop."], "image": "val2014/COCO_val2014_000000217937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 316755, "question_id": "ep26oo6WSs2boJbZCzAuga", "question": "What is made from the protrusions of this animal?", "choices": ["eggs", "piano keys", "unicorn stew", "fountain pens"], "correct_choice_idx": 1, "direct_answers": ["river", "ivory items", "piano keys", "holes", "sound", "ivory", "tusks", "tusks ivory", "ivory", "ivory goods"], "difficult_direct_answer": true, "rationales": ["The weight of the elephants can easily open up holes in the ground.", "The protrusions are made out of ivory which is used in piano making.", "Those are white tusks, and that would be similar to the colors of that musical instrument."], "image": "train2014/COCO_train2014_000000316755.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374451, "question_id": "epXFrgNojgFaHUjWFo8dsP", "question": "Which food came from an unborn animal?", "choices": ["meat", "vegetables", "eggs", "bread"], "correct_choice_idx": 2, "direct_answers": ["eggs", "egg", "egg", "egg", "dog", "egg", "egg", "egg", "hen", "egg"], "difficult_direct_answer": false, "rationales": ["Vegetables and bread do not come from animals. the meat came from a born animal.", "The toast has boiled eggs sliced on it. the white and yolk are displayed.", "The food is eggs."], "image": "train2014/COCO_train2014_000000374451.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503011, "question_id": "epgjZXpoXxmL6ZnkTKExHn", "question": "How can you tell the cat has an owner?", "choices": ["breed", "collar", "indoors", "sign"], "correct_choice_idx": 1, "direct_answers": ["collar", "collar", "owner", "collar", "collar", "name tag", "collar", "woman", "collar", "collar"], "difficult_direct_answer": false, "rationales": ["Someone put this on with a tag or bell to show it's taken care of", "Stray cats would have no one to put a collar on them.", "The cat has a collar."], "image": "train2014/COCO_train2014_000000503011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 280536, "question_id": "eqMmstozp3Ge3wuvsdeATF", "question": "What color are the nice little lunch trays for children or adults?", "choices": ["black", "blue", "pink", "white"], "correct_choice_idx": 2, "direct_answers": ["pink", "pink", "yellow", "pink", "pink", "pink", "yellow", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["Three containers full of food are all pale pink in color.", "You can see the color of the trays.", "The containers are this bright color."], "image": "val2014/COCO_val2014_000000280536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 362769, "question_id": "er2aRxWHS5W4qd3biQox9u", "question": "What is the zebra standing in?", "choices": ["ocean", "quicksand", "grass", "hay"], "correct_choice_idx": 2, "direct_answers": ["mud", "field", "water", "grass", "grass", "grass", "water", "grass", "grass", "grass"], "difficult_direct_answer": false, "rationales": ["The zebra is identifiable by its unique features and where it is standing can be inferred from the positioning of its legs.", "The zebra is close to the water but is not in the water; it is clearly standing on the grass.", "The zebra is standing on grass."], "image": "train2014/COCO_train2014_000000362769.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 540, "question_id": "ervz8jZkQsHZqVCzjR2Dj7", "question": "What vehicle is the largest shown?", "choices": ["airplane", "buggy", "tank", "elephant"], "correct_choice_idx": 0, "direct_answers": ["airplane", "airplane", "airplane", "airplane", "plane", "plane", "airplane", "plane", "road", "aeroplane"], "difficult_direct_answer": false, "rationales": ["The plane is the largest object.", "The plane is huge and much bigger than anything else.", "There is a large transportation vehicle on a runway. there are turbine engines and a long tail with two wings."], "image": "train2014/COCO_train2014_000000000540.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 398105, "question_id": "etAFxV43r2zKg4sdUnVn44", "question": "What is closest to the cost of everything in this room?", "choices": ["8000000000", "25", "15000", "five"], "correct_choice_idx": 2, "direct_answers": ["couple thousand", "good", "sink", "3000 dollars", "five thousand", "twenty thousand", "two thousand", "10000", "15000", "sink"], "difficult_direct_answer": true, "rationales": ["The cost of wood, tiles, countertop and appliances, etc would add up to around this or more", "It typically costs thousands of dollars to develop a kitchen. the cost of the appliances alone would be thousands of dollars.", "Everything in this room is probably close to 15000 dollars."], "image": "train2014/COCO_train2014_000000398105.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319345, "question_id": "eusjofJBWKsmUcJPfAmEdY", "question": "What types of people use this part of the street the most?", "choices": ["taxi drivers", "truckers", "motorcyclists", "pedestrians"], "correct_choice_idx": 2, "direct_answers": ["bikers", "bikers", "bikers", "bikers", "motorcyclist", "drivers", "pedestrians", "motorcyclist", "bikers", "motorcyclists"], "difficult_direct_answer": false, "rationales": ["The people are motorcyclists.", "The vehicles are motorbikes and are driven by motorcyclists.", "There are motorcycles parked."], "image": "val2014/COCO_val2014_000000319345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137056, "question_id": "evF5626ErmSoBfLHzDYWjX", "question": "What make of car is behind the elephants?", "choices": ["mercedes benz", "audi", "chevrolet", "ford"], "correct_choice_idx": 1, "direct_answers": ["mercedes benz", "audi", "audi", "audi", "audi", "audi", "lexus", "black", "car", "audi"], "difficult_direct_answer": false, "rationales": ["There is a logo on the front of the car. it has four rings.", "The make of the car can be determined from the visible logo of the interlinking circles.", "There is a car with a logo of four circles touching each other."], "image": "train2014/COCO_train2014_000000137056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 332463, "question_id": "ewYJ2sAQkhhWCHwpC2GDvz", "question": "What color is the water cooler sitting behind the shelf in the center of the room?", "choices": ["brown", "white", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "white", "blue", "blue", "green", "grey", "gray", "black"], "difficult_direct_answer": false, "rationales": ["The color is blue.", "The water cooler sitting on the island is blue.", "The color is blue."], "image": "train2014/COCO_train2014_000000332463.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 532216, "question_id": "ewYsRcKZueAFsHcdo2FxiN", "question": "What is the person in the foreground wearing shorts doing?", "choices": ["handstands", "eating", "sleeping", "playing tennis"], "correct_choice_idx": 3, "direct_answers": ["play", "tennis player", "tennis", "playing tennis", "playing tennis", "playing tennis", "tennis", "playing tennis", "playing", "playing tennis"], "difficult_direct_answer": false, "rationales": ["The person is in the foreground playing tennis.", "The person in the shorts is awake and is standing on a court. he is holding a racquet.", "The person in the foreground is standing on a court and is holding a racquet. a green ball is moving towards his side of the court."], "image": "val2014/COCO_val2014_000000532216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2536, "question_id": "exFotVV4jEMNrn8b6pVyDT", "question": "Where are the people hanging out?", "choices": ["bathroom", "water", "sand", "restaurant"], "correct_choice_idx": 1, "direct_answers": ["ocean", "ocean", "sea", "ocean", "water", "water", "water", "ocean", "ocean", "beach"], "difficult_direct_answer": false, "rationales": ["The people are by the water.", "The waves and blue color of the surface indicate they are at the beach.", "The people are by water."], "image": "train2014/COCO_train2014_000000002536.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175933, "question_id": "exJe33CziapsEh2Fpz7BA5", "question": "Why is there a light being used in the tunnel?", "choices": ["to work", "to eat", "to paint", "to skateboard"], "correct_choice_idx": 3, "direct_answers": ["skateboarding", "visibility", "skateboarding picture", "to see", "photography", "skateboarding", "to skateboard", "top", "skating", "camera flash"], "difficult_direct_answer": true, "rationales": ["The man is skateboarding.", "There is a light on in the tunnel so the skateboarder can skate at night.", "There is a light used in this tunnel to enable skateboarders to see."], "image": "val2014/COCO_val2014_000000175933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58134, "question_id": "exTwdyg4mw4oL3Nt3Zxwz4", "question": "What type of building might this be?", "choices": ["school", "restaurant", "cafe", "library"], "correct_choice_idx": 2, "direct_answers": ["coffee shop", "house", "restaurant", "cafe", "restaurant", "cafe", "cafe", "laptop", "business", "home"], "difficult_direct_answer": false, "rationales": ["The building is a cafe.", "There might be a cafe where there is coffee.", "Multiple tables have some chairs around it. there is a glass drink on table and windows."], "image": "train2014/COCO_train2014_000000058134.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 174671, "question_id": "eyfbRfXXkZrBei2a3siTk3", "question": "What is in the sky?", "choices": ["egg", "kite", "frisbee", "rocket"], "correct_choice_idx": 1, "direct_answers": ["kite", "kites", "kites", "paragliders", "parachutes", "kite", "parachute", "kites", "parachute", "kites"], "difficult_direct_answer": false, "rationales": ["There are pieces of material guided by ropes", "There are some kites and parasails up in the sky.", "There are some kites flying around in the sky."], "image": "val2014/COCO_val2014_000000174671.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 2209, "question_id": "eyoU5N994WwsYv2mgqYrBL", "question": "What are the animals doing?", "choices": ["jumping", "flying", "barking", "meowing"], "correct_choice_idx": 1, "direct_answers": ["flying", "flying", "flying", "flying", "flying", "flying", "flying", "flying", "flying", "flying"], "difficult_direct_answer": false, "rationales": ["The animals fly.", "The animals are birds.", "The animals are birds and they are soaring in formation through the air."], "image": "train2014/COCO_train2014_000000002209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70434, "question_id": "ez7Htptgro93VR5HQehzWx", "question": "What is the item with the white cap used to clean?", "choices": ["mouth", "shoes", "clothes", "dog hair"], "correct_choice_idx": 0, "direct_answers": ["teeth", "mouthwash", "not load", "teeth", "mouth", "mouth", "mouth", "mouth", "teeth", "mouth"], "difficult_direct_answer": false, "rationales": ["It appears to be this based on the shape of the bottle, the color of the liquid and its location near th esink. the other options also don't fit.", "That item often has as a bright colour and a long white cap.", "This is listerine used to clean this area"], "image": "val2014/COCO_val2014_000000070434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 559382, "question_id": "ezWGfujCJTuKWit3ESTGqU", "question": "What vehicles are in triplicate?", "choices": ["truck", "mini bus", "airplane", "tanks"], "correct_choice_idx": 0, "direct_answers": ["truck", "car", "truck", "dodge trucks", "pickup trucks", "trucks", "truck", "truck", "trucks", "trucks"], "difficult_direct_answer": false, "rationales": ["There are three pickups.", "They are pickups", "There are three trucks."], "image": "train2014/COCO_train2014_000000559382.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276076, "question_id": "f2BJ5fsjurgKiSyHQVrqWe", "question": "What is clogging up the street?", "choices": ["snow", "mud", "eggs", "animals"], "correct_choice_idx": 3, "direct_answers": ["sheep", "sheep", "sheep", "sheep", "animals", "sheep", "sheep", "sheaps", "sheep", "sheep"], "difficult_direct_answer": false, "rationales": ["The animals clog.", "There are a bunch of sheep running down the street.", "The street is very crowded by a large herd of animals walking in one lane."], "image": "val2014/COCO_val2014_000000276076.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475887, "question_id": "f2CxxkHs4hxNjbVpxZAVYp", "question": "What is in the middle?", "choices": ["cow", "baby", "pumpkin", "laptop"], "correct_choice_idx": 3, "direct_answers": ["monitor", "monitor", "desktop computer", "phone", "monitor", "laptop", "phone", "computer screen", "monitor", "screen"], "difficult_direct_answer": false, "rationales": ["The middle is a laptop.", "A laptop is on the display. a desktop is actually in the middle.", "The middle is a laptop."], "image": "val2014/COCO_val2014_000000475887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 116312, "question_id": "f2cJ7JHt9YcPBXhtrSjFre", "question": "How many little giraffes are standing with the big giraffe in front of the wooden door?", "choices": ["one", "two", "three", "four"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "one", "two", "two", "two", "two", "one", "two", "two"], "difficult_direct_answer": false, "rationales": ["You can tell by the size difference as to how many small giraffe's there are.", "There are two little giraffes.", "The giraffes are identifiable by their distinct features. the comparative sizes are visible and the total giraffes is countable."], "image": "train2014/COCO_train2014_000000116312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444037, "question_id": "f3ELo7oQDvZmuPWY7zkZ5M", "question": "What color is the big horse to the right with the flat cart behind it?", "choices": ["chestnut", "white", "black", "yellow"], "correct_choice_idx": 0, "direct_answers": ["brown", "brown", "brown", "brown", "brown", "brown", "brown", "chestnut", "brown", "brown"], "difficult_direct_answer": false, "rationales": ["The horses are brown which is typical of this animal. this color is a shade of brown.", "The big horse to the right is brown, not black, white, or yellow.", "The color of the horse is called chestnut"], "image": "train2014/COCO_train2014_000000444037.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319043, "question_id": "f43vUD97oyTek7XNRzY9GQ", "question": "What is on the couch?", "choices": ["apple", "egg carton", "hose", "cat"], "correct_choice_idx": 2, "direct_answers": ["hose", "hose", "monitor", "pillow", "monitor", "pillows electronics", "car", "monitor", "stop", "computer screen"], "difficult_direct_answer": false, "rationales": ["There is a brand new black water hose on the couch still in the packaging.", "A black coil is curled up on the couch.", "A long tube shaped item is coiled into a circle and is still in the original packaging on the couch."], "image": "train2014/COCO_train2014_000000319043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96832, "question_id": "f4DyYUGD6BDfESnVii7hza", "question": "What term would best describe the person?", "choices": ["body builder", "female acrobat", "techie", "baby"], "correct_choice_idx": 2, "direct_answers": ["techie", "work", "working", "it personnel", "man", "technician", "smart", "engineer", "tech repair", "nerd"], "difficult_direct_answer": true, "rationales": ["The person seated at the table is surrounded by computer equipment and is probably known as a techie because of all the technology.", "They are surrounded by computers and accessories", "The man has multiple types of technology in front of him."], "image": "val2014/COCO_val2014_000000096832.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 329230, "question_id": "f5VJU5SNAS6N4jHrVmGEuM", "question": "What is in the sand?", "choices": ["baby", "cat", "old man", "surfboard"], "correct_choice_idx": 3, "direct_answers": ["surfboard", "surfboard", "surfboard", "surfboard", "surfboard", "surf board", "surfboard", "surfboard", "surfboard", "surfboard"], "difficult_direct_answer": false, "rationales": ["There is a surfboard buried in the sand.", "A surfboard is in the sand.", "The item sticking out of the sand vertically has the shape and lower fins and leg attaching cord of a surfboard."], "image": "train2014/COCO_train2014_000000329230.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 501919, "question_id": "f6SYDMukqMEjqmNMWVCYjD", "question": "What color is the jacket blazer worn by the man with the beard?", "choices": ["white", "yellow", "red", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "blue", "blue", "blue", "navy", "blue", "black", "blue", "dark blue", "blue"], "difficult_direct_answer": false, "rationales": ["The man with the beard is wearing a blazer that is dark blue.", "It's a darker color jacket.", "The jacket is blue."], "image": "val2014/COCO_val2014_000000501919.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561543, "question_id": "f77nF5gboAs85gptn2X8rB", "question": "What color is the plastic grip around the white cup?", "choices": ["pink", "red", "blue", "green"], "correct_choice_idx": 1, "direct_answers": ["purple", "purple", "purple", "red", "red", "purple", "purple", "blue", "purple", "blue"], "difficult_direct_answer": false, "rationales": ["The plastic grip is red", "There are several dishes of food among a table. there sits a white long cup with a lighter color then the raspberries in the bowel.", "The section look a hint of red."], "image": "train2014/COCO_train2014_000000561543.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 176474, "question_id": "f7donxdsa7stNwGboLJ6GN", "question": "What animal is most closely related to these?", "choices": ["goats", "tigers", "sheep", "wolves"], "correct_choice_idx": 3, "direct_answers": ["dog", "dog", "wolf", "car", "dog", "dog", "dog", "wolves", "cat", "wolf"], "difficult_direct_answer": false, "rationales": ["The animals are dogs which are in the canine family.", "They are dogs.", "They are all part of the canine family"], "image": "val2014/COCO_val2014_000000176474.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239350, "question_id": "f8cunLiNiKVKE4aWanPVp9", "question": "What type of business is Flinders street station?", "choices": ["taxi station", "airport", "bus station", "railroad station"], "correct_choice_idx": 3, "direct_answers": ["train station", "train", "passenger railroad", "train station", "railway", "railroad station", "train station", "flinders", "train station", "train"], "difficult_direct_answer": false, "rationales": ["The business is a station.", "It's a train station.", "The clocks show the time for the trains."], "image": "train2014/COCO_train2014_000000239350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284920, "question_id": "f8sDkRSzCxGD5BavjWSAYx", "question": "What performer has a similar name to the thing on the ground?", "choices": ["snoop dogg", "tiger woods", "snow", "katt williams"], "correct_choice_idx": 2, "direct_answers": ["snow", "skier", "snow", "brittany snow", "snow", "snow", "flow", "snow white", "ice cube", "snow"], "difficult_direct_answer": false, "rationales": ["The performer is in snow.", "The canadian singer has the same name as the stuff being skied on.", "Snow is popular name."], "image": "train2014/COCO_train2014_000000284920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168730, "question_id": "fA4HLvDhHktLF8YasAw85X", "question": "What is the state of the bananas?", "choices": ["overripe", "ripe", "underripe", "rotten"], "correct_choice_idx": 1, "direct_answers": ["ripe", "hanging", "yellow", "ripe", "cold", "hanging", "fresh", "good", "good", "good"], "difficult_direct_answer": false, "rationales": ["The bananas are ripe.", "The fruit is very yellow and ready to eat.", "They are a perfect yellow color"], "image": "train2014/COCO_train2014_000000168730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 339100, "question_id": "fBj4rwQYkMRnuNKF36jGyi", "question": "How many zebras are there together in the group on the savannah?", "choices": ["six", "four", "seven", "two"], "correct_choice_idx": 1, "direct_answers": ["four", "four", "four", "zebra", "four", "four", "four", "three", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are four there.", "There are altogether four zebras.", "Two are cuddled together and two walk separately"], "image": "train2014/COCO_train2014_000000339100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 428093, "question_id": "fC6d8AFxJihzPzoXycRvYN", "question": "What is the person wearing?", "choices": ["tie", "tiara", "backpack", "crown"], "correct_choice_idx": 0, "direct_answers": ["tie", "tie", "tie", "gray tie", "suit", "suit tie", "coat", "suit", "necktie", "dress"], "difficult_direct_answer": false, "rationales": ["The person has a tie around their neck.", "The person has a tie on.", "The person has a tie."], "image": "train2014/COCO_train2014_000000428093.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 168472, "question_id": "fCg7BqtESTUbCcTLJDbcug", "question": "What is the name for the fruit cut in two slices at the middle of the cutting board?", "choices": ["pear", "strawberry", "apple", "orange"], "correct_choice_idx": 0, "direct_answers": ["pear", "pear", "pears", "pear", "pear", "pear", "peaf", "banana", "pear", "pear"], "difficult_direct_answer": false, "rationales": ["The fruit cut in two slices down the middle is a pear.", "The fruit is a pear.", "It is similar to an apple but it bumps out at the top"], "image": "train2014/COCO_train2014_000000168472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328110, "question_id": "fF7Un2Lg7kqCsP2vrpcueG", "question": "What is the woman on the bench clutching?", "choices": ["frisbee", "kitten", "basketball", "baby"], "correct_choice_idx": 3, "direct_answers": ["baby", "child", "baby", "baby", "child", "baby", "small baby", "baby", "baby", "baby"], "difficult_direct_answer": false, "rationales": ["A woman sits on a bench on the street holding her very young child.", "The woman is seen holding tight the kid.", "The woman has a baby."], "image": "val2014/COCO_val2014_000000328110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 82622, "question_id": "fFCiG72yeFAxK6ksjkZaVa", "question": "What vehicle is the man on?", "choices": ["boat", "locomotive", "bicycle", "scooter"], "correct_choice_idx": 0, "direct_answers": ["boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "boat", "travel"], "difficult_direct_answer": false, "rationales": ["It's indicated by the massive tied off rope to the right of the man.", "The vehicle is a boat.", "You can tell by the ropes and general background as to what he is sitting in."], "image": "train2014/COCO_train2014_000000082622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 25917, "question_id": "fFDaVdnxv7Y3gKyjdCFT2c", "question": "What does the front of the large item look like?", "choices": ["tiger", "wizard", "battering ram", "baby"], "correct_choice_idx": 2, "direct_answers": ["red", "drum", "dwdwdw", "nail", "train", "engine", "train", "train", "train", "battering ram"], "difficult_direct_answer": false, "rationales": ["The front is a battering ram.", "The other objects are living things.", "Due to the nature of the shape of the red front of the train with the two protruding rams, this is what it looks like."], "image": "train2014/COCO_train2014_000000025917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578627, "question_id": "fFGuoEgZNs8QEmSuT93ghg", "question": "What color hair does the girl next to the cow have?", "choices": ["green", "blonde", "red", "black"], "correct_choice_idx": 1, "direct_answers": ["blond", "blonde", "blonde", "blonde", "black", "blond", "blonde", "blonde", "black", "blonde"], "difficult_direct_answer": false, "rationales": ["A girl in all white is standing next to a cow. she has yellow hair.", "The girl standing next to the spotted cow has blond hair tied in a ponytail.", "Her hair color is not red, green, or black. it is a natural color."], "image": "train2014/COCO_train2014_000000578627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 139336, "question_id": "fFZ8stvxtkyEMnuAuHyWAa", "question": "What us the giraffe in the foreground sitting on?", "choices": ["sand", "water", "box", "snow"], "correct_choice_idx": 0, "direct_answers": ["dirt", "dirt", "sand", "road", "sand", "zoo", "wood", "legs", "rest", "dirt"], "difficult_direct_answer": false, "rationales": ["The other options don't make sense given the season or appear in this image.", "The giraffe is in sand.", "An animal with a long neck is sitting in a lightly colored, fine grain soil with no trees or bushes growing out of it."], "image": "train2014/COCO_train2014_000000139336.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452459, "question_id": "fHumaRhhymqNVetjFdTDrK", "question": "Who uses this room?", "choices": ["older adult", "child", "teenager", "young adult"], "correct_choice_idx": 1, "direct_answers": ["child", "student", "kids", "human", "human", "child", "child", "children", "student", "child"], "difficult_direct_answer": false, "rationales": ["The childrens toys and ladders tell us this is a young person's room.", "There are toys and stuffed animals", "You can see toy blocks around the room that appear to have been played with recently. there are also stairs that lead to the bed so that a small person can easily and safely climb into the bed."], "image": "train2014/COCO_train2014_000000452459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 114398, "question_id": "fJrAvEV9LFbNZEEp3Ne9MJ", "question": "Why are the elephants trunk likely in the water?", "choices": ["bathing", "warm up", "drink", "protection"], "correct_choice_idx": 2, "direct_answers": ["drinking", "drinking", "drinking", "drink", "drinking", "to drink", "drinking", "three", "to drink", "drinking"], "difficult_direct_answer": false, "rationales": ["This is the way that elephants drink, by using their trunk to draw up water.", "The elephants are drinking water with their trunks.", "The elephants are drinking because animals need water in order to function."], "image": "val2014/COCO_val2014_000000114398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452308, "question_id": "fKnT5SnQyNVkSMy9Xzgyjh", "question": "How many traffic lights are seen suspended in the air?", "choices": ["one", "two", "four", "three"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "three"], "difficult_direct_answer": false, "rationales": ["You can tell by the power lines that they are suspended from as to how many of the lights there are.", "Depending on your interpretation of the question the answer could be either answer b or c, but two of the lights are on a pole which i would not consider \"suspended\".", "There is one on a short pole and one on an overhead pole"], "image": "val2014/COCO_val2014_000000452308.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133741, "question_id": "fLVyLo9ZkRQy8963DFcXt3", "question": "How many wooden poles are sitting around the giraffe?", "choices": ["four", "three", "two", "five"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "dadaa", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is one in the middle and two on the sides", "There are three long wooden poles around the giraffe.", "Two poles are visible in the foreground and one in the back."], "image": "train2014/COCO_train2014_000000133741.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 453481, "question_id": "fMXBkhiRKY7fHnoCF8JoBB", "question": "What bone of the man is touching the bike?", "choices": ["hyoid", "septum", "femur", "proximal phalanx"], "correct_choice_idx": 3, "direct_answers": ["phalanges", "fibula", "hand", "hand", "proximal phalanx", "knee", "ankle", "hand", "handles", "hand"], "difficult_direct_answer": false, "rationales": ["The bone is the phalanx.", "The bone is the phalanx.", "The phalanx is touching the bike."], "image": "val2014/COCO_val2014_000000453481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265020, "question_id": "fNA4vSEq827Tg5ErY2LXuV", "question": "What is the person holding?", "choices": ["apple", "banana", "fishing rod", "basket"], "correct_choice_idx": 2, "direct_answers": ["pole", "fishing rod", "fishing pole", "car", "fishing rod", "fishing rod", "fishing rod", "fishing pole", "stick", "fishing pole"], "difficult_direct_answer": false, "rationales": ["The person is fishing with the pole.", "The other options don't appear in this image. this makes sense since he's at the water.", "A young boy is standing in shallow water. he is using a pole to catch fish in water."], "image": "train2014/COCO_train2014_000000265020.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156823, "question_id": "fQBkioU8LxhmEJsGY6qk8D", "question": "What is a term based on this animal?", "choices": ["sheeple", "dogeared", "henpecked", "catgut"], "correct_choice_idx": 0, "direct_answers": ["sheeple", "sheep", "fee", "sheep", "sheep", "black sheep", "sheepish", "sheep", "sheep", "lamb"], "difficult_direct_answer": false, "rationales": ["The animal is a sheeple.", "The animals here are sheep. the term sheeple was derived from these animals.", "The animals in the picture have white wool."], "image": "train2014/COCO_train2014_000000156823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85459, "question_id": "fRDt4HWKTvNR39HWaiTx99", "question": "What is found in the room?", "choices": ["book case", "cat", "shower head", "dog"], "correct_choice_idx": 2, "direct_answers": ["water", "mirror", "sink", "shower", "toilets", "shower head", "toilet", "toilet", "shower", "shower"], "difficult_direct_answer": false, "rationales": ["The room is a bathroom which is where bathing facilities are located.", "A shower is in the room", "This is a bathroom with all the amenities"], "image": "train2014/COCO_train2014_000000085459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452721, "question_id": "fRzYvozXduc8TKeHAD9qao", "question": "What are the horses near?", "choices": ["apples", "cat", "sand", "snow"], "correct_choice_idx": 2, "direct_answers": ["ocean", "water", "sand", "ocean", "beach", "ocean", "beach", "beach", "ocean", "sea"], "difficult_direct_answer": false, "rationales": ["The horses are on the beach walking on the granular surface at the edge of the water.", "The sand of the beach is the only item of those listed here which are near the horses in this image.", "The horses are walking up on the sand."], "image": "val2014/COCO_val2014_000000452721.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 404792, "question_id": "fS922Zu884PHmeQrQ7NaVi", "question": "What song mentions the item that is surrounding the bikes?", "choices": ["rotten apple", "mystery box", "bat dance", "in chains"], "correct_choice_idx": 3, "direct_answers": ["in chains", "riding", "support", "bikes", "chain", "chainlink fence", "chain", "motorcycle", "chain", "chain"], "difficult_direct_answer": false, "rationales": ["There is a song with this lyric and you can see the object binding the bikes to secure them.", "The song \"in chains\" mentions the chains.", "Metal links held by posts are around the bikes."], "image": "val2014/COCO_val2014_000000404792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542818, "question_id": "fStVqK296G3v5mdRcGYff7", "question": "What are the people doing to the elephants?", "choices": ["hunting", "grooming", "cooling", "feeding"], "correct_choice_idx": 2, "direct_answers": ["washing", "washing", "accompanying", "swimming", "washing", "cooling", "swim", "bathing", "bathing them", "playing"], "difficult_direct_answer": false, "rationales": ["The people cool them off.", "They are throwing water on top of them", "Men are spraying water on elephants in a river on a sunny day. the elephants are standing and laying in the water."], "image": "train2014/COCO_train2014_000000542818.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 236243, "question_id": "fU67gbcE6wwikdimaWF7xQ", "question": "What color is the back of the seat seen in front of the visible bedding?", "choices": ["pink", "red", "white", "yellow"], "correct_choice_idx": 0, "direct_answers": ["pink", "red", "orange", "orange", "red", "pink", "red", "pink", "probably orange", "orange"], "difficult_direct_answer": false, "rationales": ["The chair is the same color as a flamingo.", "The chair is a bright color", "The back of the seat is not red, white, or yellow."], "image": "train2014/COCO_train2014_000000236243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 191913, "question_id": "fUzhbgikrdd7ctWirk6Tik", "question": "What kind of vehicle is the person driving up next to the street?", "choices": ["bike", "pickup", "streetsweeper", "van"], "correct_choice_idx": 0, "direct_answers": ["cyclist", "bicycle", "bike", "bicycle", "bicycle", "bicycle", "dwdwwd", "bike", "bike", "bike"], "difficult_direct_answer": false, "rationales": ["They are riding a bicycle", "A man is riding along side the street with spokes on wheels and handles for his hands.", "It has two narrow wheels with handlebars and one person rides it"], "image": "train2014/COCO_train2014_000000191913.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124157, "question_id": "fVFDx5UVcfYhgukmGVQREv", "question": "What animal is usually portrayed eating this food?", "choices": ["cat", "monkey", "cow", "elephant"], "correct_choice_idx": 1, "direct_answers": ["monkey", "car", "monkey", "monkey", "stop", "monkey", "banana", "monkey", "monkey", "monkey"], "difficult_direct_answer": false, "rationales": ["These grow in areas where monkeys live", "These food items are bananas. cats, cows, and elephants usually are not portrayed eating bananas.", "Mounds of bananas are piled up onto silver containers. monkeys are known to like bananas."], "image": "val2014/COCO_val2014_000000124157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 290122, "question_id": "fVcRLv7tiVrq4o7uDFbrf5", "question": "What are the people standing in front of?", "choices": ["eggs", "lockers", "trees", "boxes"], "correct_choice_idx": 1, "direct_answers": ["biro", "locker", "lockers", "lockers", "lockers", "lockers", "lockers", "lockers", "lockers", "lockers"], "difficult_direct_answer": false, "rationales": ["People keep their things locked inside of them.", "There are little cubbies with locks.", "They are standing in front of storage lockers."], "image": "train2014/COCO_train2014_000000290122.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 4903, "question_id": "fWjBghmy3ryfuJs2kDyZhX", "question": "What is the waterway called?", "choices": ["river", "pond", "canal", "ocean"], "correct_choice_idx": 2, "direct_answers": ["riverfront", "canal", "lake", "canal", "canals", "canal", "canal", "river", "canal", "light"], "difficult_direct_answer": false, "rationales": ["It is a controlled waterway within a city", "People travel down them in many european cities.", "The waterway is a canal."], "image": "train2014/COCO_train2014_000000004903.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497972, "question_id": "fXeLuEhAreMCMWq8RWWBDW", "question": "What are they putting in the cup?", "choices": ["cinnamon", "sugar", "grated cheese", "salt"], "correct_choice_idx": 1, "direct_answers": ["sugar", "anything", "sugar", "sugar", "sugar", "lemon", "sugar", "sugar", "sugar", "sugar"], "difficult_direct_answer": false, "rationales": ["The person is drinking coffee.", "The white powder pouring out of the small packet into the drink in this picture is likely sugar as salt is seldom poured into a drink.", "The person is adding a cane sweetener to this beverage."], "image": "train2014/COCO_train2014_000000497972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128224, "question_id": "fY9UKPAPLZpyL9ob44fbeq", "question": "What kind of beverage is there on the table top?", "choices": ["beer", "tea", "juice", "coffee"], "correct_choice_idx": 1, "direct_answers": ["tea", "cigar", "tea", "beer", "tea", "tea", "bear", "tea", "puff", "beer"], "difficult_direct_answer": false, "rationales": ["There is tea in the little cup next to the tin.", "There is a cup of hot tea on the table top next to the spoon.", "There is tea in the cup."], "image": "val2014/COCO_val2014_000000128224.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433662, "question_id": "fYMWJPFDSdQQXLVk8zeGcf", "question": "What item here can have multiple meanings?", "choices": ["trunk", "cow", "strawberry", "reed"], "correct_choice_idx": 0, "direct_answers": ["tote", "love", "stick", "trunk", "hoe", "trunk", "bag", "truck", "shoe", "trunk"], "difficult_direct_answer": false, "rationales": ["There are elephants and trees. one meaning is the tubular item on the front of each elephant's face, and a second meaning is the main vertical part of each tree.", "A trunk is also a part of a car.", "This is the name for the long feature protruding from the elephants face, however it is also used to describe a trees base and can be used to store objects in."], "image": "train2014/COCO_train2014_000000433662.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 574232, "question_id": "faed7uPjDYCsfHWdQPCFuN", "question": "What animal is the doll in the middle?", "choices": ["snake", "cat", "bear", "dog"], "correct_choice_idx": 2, "direct_answers": ["red", "bear", "teddy", "bear", "bear", "bear", "bear", "cat", "teddy", "bear"], "difficult_direct_answer": false, "rationales": ["Named after theodore roosevelt as well or \"teddy\" a.", "The animal in the middle is a teddy bear. it has four limbs and short rounded ears.", "The animal is a bear."], "image": "train2014/COCO_train2014_000000574232.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 486290, "question_id": "fbhSzNijB7hJeyKbcPs3N8", "question": "What is the name of the style used to make these glass ornaments?", "choices": ["blown glass", "shaped glass", "burnt glass", "torched glass"], "correct_choice_idx": 0, "direct_answers": ["glass blown", "blowing", "jak", "blown glass", "glass", "glass blowing", "glass blowing", "blown glass", "glass", "glass blowing"], "difficult_direct_answer": false, "rationales": ["The name is glass.", "Technically, it's also through d, but the main description would be a. they're hand creafted.", "These are shaped one at a time by people"], "image": "val2014/COCO_val2014_000000486290.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380834, "question_id": "fbsiVav2SHrjV3UHFuBLb3", "question": "What is the boat rowing in through?", "choices": ["canal", "ocean", "lake", "river"], "correct_choice_idx": 3, "direct_answers": ["river", "water", "water", "river", "bridge", "pond", "water", "bridge", "water", "water"], "difficult_direct_answer": false, "rationales": ["A few people are on a boat. they are paddling thru a small narrow passage of water with trees on the left.", "A boat is a passage of water near the shore with brick structure near the edge of the water. the brick structure appears to be the side of a bridge.", "The boat is in a river."], "image": "val2014/COCO_val2014_000000380834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 111124, "question_id": "fcCLVjGTsvBavgV6Pzmh97", "question": "What are the majority of the people doing?", "choices": ["sleeping", "running", "standing", "eating"], "correct_choice_idx": 2, "direct_answers": ["standing", "standing", "skateboarding", "walking", "majority", "watching", "sitting", "standing", "chatting", "talking standing"], "difficult_direct_answer": false, "rationales": ["The majority of people are standing.", "All the people are either standing or sitting. more are standing.", "Most of the people are standing."], "image": "train2014/COCO_train2014_000000111124.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393087, "question_id": "fcyhukfz8QZhuJAFEb2mw8", "question": "What material composes this old square tower?", "choices": ["brick", "wood", "cobblestone", "mud"], "correct_choice_idx": 2, "direct_answers": ["stone", "stone", "chair", "cobblestone", "stone", "stone", "concrete", "concrete", "bricks", "rocks"], "difficult_direct_answer": false, "rationales": ["This is an old stone tower.", "It's a material that is typical of a building this age. it has a rough texture which is characteristic of this material.", "The material is cobblestone."], "image": "train2014/COCO_train2014_000000393087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562911, "question_id": "fdyAZyo3oRE9P2QRQC5dj5", "question": "What is another word for the vehicle in the foreground?", "choices": ["scooter", "racecar", "buggy", "chopper"], "correct_choice_idx": 3, "direct_answers": ["chopper", "helicopter", "helicopter", "helicopter", "helicopter", "helicopter", "chopper", "helicopter", "helicopter", "helicopter"], "difficult_direct_answer": false, "rationales": ["The vehicle is a helicopter and only choice a is appropriate as a nickname.", "The vehicle in the foreground is a helicopter which is also known as a chopper.", "It's also called a helicopter or a bug eyed or bubble helicopter."], "image": "train2014/COCO_train2014_000000562911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65896, "question_id": "ffRstkDCqKhKDRKQPpJE7M", "question": "What animal is native to this country?", "choices": ["camel", "polar bear", "panda", "kangaroo"], "correct_choice_idx": 3, "direct_answers": ["animal", "kangaroo", "kangaroo", "no idea", "dear", "americans", "elk", "monkeys", "koala bear", "kangaroo"], "difficult_direct_answer": false, "rationales": ["Kandos is in australia which has kangaroos.", "The animal is a kangaroo.", "The country can be inferred based on the name of the town written on the post in the image. of the list of answers, a is indigenous to the country of kandos."], "image": "train2014/COCO_train2014_000000065896.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211807, "question_id": "ffrtyRGFP2GAhvm5pttgZz", "question": "What video game system was made by the company whose name appears on the sign to the right?", "choices": ["3do", "atari", "nintendo", "xbox"], "correct_choice_idx": 0, "direct_answers": ["3do", "panasonic", "3do", "playing", "ink", "3do", "cant tel", "3do", "na", "3do"], "difficult_direct_answer": false, "rationales": ["This is a game panasonic made", "A brand logo is on the exterior was at an airport.", "The company on the sign is panasonic. although i know the other three options were not made by panasonic, a google search was completed to verify the name of their video game product."], "image": "train2014/COCO_train2014_000000211807.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 513129, "question_id": "fg8GWSWvmResSUEtWchscs", "question": "What is the person standing across from?", "choices": ["egg", "mirror", "cat", "dog"], "correct_choice_idx": 1, "direct_answers": ["mirror", "sink", "mirror", "mirror", "mirror", "bathroom vanity", "mirror", "mirror", "sink", "wall mirror"], "difficult_direct_answer": false, "rationales": ["It's reflecting their image", "It is reflecting the image of the person", "The man is looking into a mirror."], "image": "val2014/COCO_val2014_000000513129.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469898, "question_id": "fgBKQD4cMtrDaJL3Zg7ddN", "question": "What color is the fencing behind the sheep who are herded around by the dog?", "choices": ["blue", "orange", "green", "yellow"], "correct_choice_idx": 1, "direct_answers": ["red", "orange", "yellow", "orange", "orange", "orange", "orange", "yellow", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The fence has orange mesh on it.", "One can make out the brightly colored fencing in the background.", "The fencing behind the sheep and the dog is colored orange."], "image": "train2014/COCO_train2014_000000469898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402040, "question_id": "fhNwiTuLAy7A4m7CCUxH6w", "question": "What company makes the phone?", "choices": ["apple", "nokia", "ibm", "samsung"], "correct_choice_idx": 3, "direct_answers": ["samsung", "samsung", "samsung", "samsung", "samsung", "samsung", "samsung", "samsung", "samsung", "samsung"], "difficult_direct_answer": false, "rationales": ["Samsung makes the phone.", "The company is samsung.", "The phone says samsung on it."], "image": "train2014/COCO_train2014_000000402040.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27270, "question_id": "fi4er7hHdoWjTJMMqcn5GG", "question": "What variety of tomato is on the plate?", "choices": ["hot house", "heirloom", "roma", "cherry"], "correct_choice_idx": 3, "direct_answers": ["grape", "cherry", "cherry", "baby", "cherry", "plum", "red tomato", "cherry", "cherry tomato", "cherry"], "difficult_direct_answer": false, "rationales": ["These are smaller tomatoes.", "There is a cherry tomato on top of the plate.", "The tomato on the plate is small and not sliced."], "image": "train2014/COCO_train2014_000000027270.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173149, "question_id": "fiH3AkVVBXxUZyqdME3kNK", "question": "How many girls are hiding together underneath of the umbrella?", "choices": ["two", "five", "three", "four"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two girls.", "One girl under the umbrella is beside another one.", "Under the umbrella there is a girl with her hair either short or pinned up and a second girl with her hair in a ponytail."], "image": "train2014/COCO_train2014_000000173149.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166998, "question_id": "fjDmNDWWnFLij3D5SpiGFr", "question": "What is the shorter rectangular appliance called?", "choices": ["air conditioner", "toaster oven", "microwave", "food dehydrator"], "correct_choice_idx": 2, "direct_answers": ["oven", "microwave", "oven", "toaster oven", "microwave", "toaster oven", "microwave", "oven", "microwave", "microwave"], "difficult_direct_answer": false, "rationales": ["A white rectangular microwave sits in the corner.", "The short rectangular appliance on the counter is a toaster oven.", "The object has the setup, size, style and design consistent with answer a and is placed in a setting that would be consistent with its intended use."], "image": "train2014/COCO_train2014_000000166998.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 117360, "question_id": "fkyTjP9HqirTmuDUXERxXz", "question": "How many red donuts are remaining in the bottom left section of the donut chambers?", "choices": ["five", "two", "four", "three"], "correct_choice_idx": 3, "direct_answers": ["two", "three", "two", "three", "two", "two", "three", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["One can see two completely and the edge of another.", "There are two donuts visible.", "There are two easy to see then just a part of one at the edge"], "image": "train2014/COCO_train2014_000000117360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 115506, "question_id": "fnUofapTnJtTbBakJt9t9M", "question": "What is the most nutrient dense food on this plate?", "choices": ["spinach", "tomato", "fruit", "meat"], "correct_choice_idx": 0, "direct_answers": ["spinach", "spinach", "spinach", "greens", "spinach", "kale", "kale", "spinach", "chicken", "greens"], "difficult_direct_answer": false, "rationales": ["A dish has vegetables including spinach on it. spinach is high in nutrients.", "It offers a lot of vitamins as a dark leafy green", "Various vegetables including spinach, carrots, and cauliflower are in a bowl."], "image": "train2014/COCO_train2014_000000115506.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18389, "question_id": "fp9QtuZz9mhRQg8UmfWqbE", "question": "How many people are sitting on the bench in the middle of the park?", "choices": ["two", "four", "five", "three"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["A man and woman are sitting together on a bench.", "There are a couple of people on the bench.", "You can count them. they are easily countable because it is a small number of people and people are large and easy to see."], "image": "train2014/COCO_train2014_000000018389.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53542, "question_id": "ftYhXorgfGCJE9kvoKpNNK", "question": "What color are the rectangular bricks on the very bottom of the tower?", "choices": ["red", "white", "gray", "blue"], "correct_choice_idx": 2, "direct_answers": ["gray", "grey", "red", "red", "grey", "gray", "grey", "red", "white", "gray"], "difficult_direct_answer": false, "rationales": ["A tower is made of red and gray bricks with the gray being wider than they are tall.", "These are the color of concrete", "The bricks are gray."], "image": "val2014/COCO_val2014_000000053542.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250210, "question_id": "ftxR6Mzs7k3x2vRPd9GiQT", "question": "Which object has a door that opens?", "choices": ["glass jar", "ceramic jar", "pressure cooker", "microwave"], "correct_choice_idx": 3, "direct_answers": ["microwave", "microwave", "microwave", "stop", "white", "microwave", "microwave", "microwave", "microwave", "microwave"], "difficult_direct_answer": false, "rationales": ["The small appliance has a door for one to put the food in to heat it up.", "You can see the area in the lower right corner where you push to open the door. this is an oven so it must have a door in order to put the food in.", "The device is an oven that uses electromagnetic radiation to heat food. it has a door to put the food in and is recognizable by the size and wire mesh on the door."], "image": "val2014/COCO_val2014_000000250210.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15881, "question_id": "fuThT6qHKPLsfQzWzi34z4", "question": "What item is missing on the menu?", "choices": ["hot dog", "bagel", "donut", "breakfast sandwich"], "correct_choice_idx": 0, "direct_answers": ["donuts", "chicken", "fries", "nothing", "hot dog", "dietary guide", "donut", "sandwich", "ice", "price"], "difficult_direct_answer": true, "rationales": ["This is a store for pastries and donuts.", "Of the answers on the list, all appear on the menu except answer a.", "A dunkin donuts menu has various breakfast options listed. dunkin donuts is not known to serve lunch items."], "image": "train2014/COCO_train2014_000000015881.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 244720, "question_id": "fvGT9nWa65cZpdjpU7HFwj", "question": "How many dogs are standing on the wood flat bed on the pickup truck?", "choices": ["three", "two", "four", "five"], "correct_choice_idx": 0, "direct_answers": ["two", "three", "three", "three", "two", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are two in the front of the picture and one behind the dark dog", "The number of legs are the indicator.", "There are two dogs standing on the bed of the pickup truck"], "image": "train2014/COCO_train2014_000000244720.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15140, "question_id": "fvUJXrK5MFqrmVGLuD2YSR", "question": "How many species are shown?", "choices": ["two", "five", "three", "one"], "correct_choice_idx": 0, "direct_answers": ["cat", "two", "two", "two", "three", "three", "one", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two creatures shown.", "One of the animal is a cat.", "There are cats and humans."], "image": "val2014/COCO_val2014_000000015140.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217429, "question_id": "fwHDb7XyVCo5BntuPVi24j", "question": "What is the purpose of the gray object?", "choices": ["cool", "play music", "heat", "pest prevention"], "correct_choice_idx": 2, "direct_answers": ["heat", "heat", "room heater", "room heater", "heat", "heat", "heater", "heat home", "heater", "heat"], "difficult_direct_answer": false, "rationales": ["The gray object is a radiator. fluid circulates through the coiled pipes providing warmth to the home.", "The gray object is a radiator, not an air conditioner, stereo, or trap. it warms the room.", "The grey object is a radiator."], "image": "train2014/COCO_train2014_000000217429.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 518551, "question_id": "fx65azZqXHCJYnJWD3bVAn", "question": "What color is the skin of the dirty elephant in the middle?", "choices": ["ivory", "bronze", "pink", "gray"], "correct_choice_idx": 1, "direct_answers": ["brown", "brown", "brown", "grey", "bronze", "brown", "brown", "brown", "brown", "grey"], "difficult_direct_answer": false, "rationales": ["The dirt on the elephant is bronze colored.", "An elephant covered in mud appears orange compared to others around him.", "Their skin is a brownish color"], "image": "val2014/COCO_val2014_000000518551.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492137, "question_id": "fxbRgYHUtuKULcR7qMyP7w", "question": "What is the cat leaning against?", "choices": ["computer", "box", "fence", "human leg"], "correct_choice_idx": 0, "direct_answers": ["laptop", "laptop", "laptop", "computer", "laptop", "laptop", "laptop", "laptop", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["You can tell by the design and keys of the object as to what it is the cat is laying on.", "A cat is sleeping on a keyboard.", "There is an orange cat leaning against a computer."], "image": "train2014/COCO_train2014_000000492137.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 427639, "question_id": "fy2ei2maTNQcAXybwECvsd", "question": "What is the dog under?", "choices": ["hammock", "cardboard box", "desk", "hat"], "correct_choice_idx": 2, "direct_answers": ["table", "table", "cabinet", "counter", "counter", "desk", "cabinet", "desk", "counter", "table"], "difficult_direct_answer": false, "rationales": ["None of the answers is correct, but answer a is most consistent with the material of the thing the dog is under and the general structure.", "The dog is by a desk.", "It is surrounded by wood and a counter above its head. there are things on top of the counter."], "image": "val2014/COCO_val2014_000000427639.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538709, "question_id": "fzNqjgYFgxkWkkJzyGmK89", "question": "According to the large national attraction what city must this be?", "choices": ["san francisco", "los angeles", "new york", "saint louis"], "correct_choice_idx": 0, "direct_answers": ["san francisco", "san francisco", "san francisco", "san francisco", "san francisco", "outside", "brooklyn", "san francisco", "san francisco", "san francisco"], "difficult_direct_answer": false, "rationales": ["The golden gate bridge is shown. the bridge is in the city mentioned in a.", "This is the famous bridge over the bay", "The golden gate bridge is a famous landmark located in this california city."], "image": "train2014/COCO_train2014_000000538709.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 481596, "question_id": "fzgTJD37Wmp4LBw9epwCfK", "question": "Where would you usually see the king on the back of the yellow shirt?", "choices": ["restaurant", "playing card", "cookie box", "english tea"], "correct_choice_idx": 1, "direct_answers": ["playing cards", "playing card", "cards", "cards", "card", "france", "sports game", "playing card", "playing card", "cards"], "difficult_direct_answer": false, "rationales": ["A man is holding a small hot dog in his hand. on lower row is a man with a king and a number on it.", "The playing card has kings.", "The object on the back of the yellow shirt has a heart in one corner and depicts a face card that would usually be included in a deck of cards."], "image": "val2014/COCO_val2014_000000481596.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 364173, "question_id": "g3DDpxGZFtFpD2fBn3S9q3", "question": "What word is associated with these animals?", "choices": ["steer", "sugar glider", "puppy", "fins"], "correct_choice_idx": 0, "direct_answers": ["cows", "cows", "cow", "milk", "beef", "cows", "steer", "cow", "cow", "milk"], "difficult_direct_answer": false, "rationales": ["It is another name for a male cow who has been castrated.", "This word is used for male bovine creatures.", "These animals have a few different names such as cattle or bovine."], "image": "train2014/COCO_train2014_000000364173.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452014, "question_id": "g4SgfKhZEbNse3YW2PMeko", "question": "What animal do the spots on the banana most resemble?", "choices": ["bear", "hippo", "giraffe", "lion"], "correct_choice_idx": 2, "direct_answers": ["cheetah", "leopard", "giraffe", "cat", "griffie", "dog", "leopard", "giraffe", "giraffe", "monkey"], "difficult_direct_answer": false, "rationales": ["The animal in question does have spots and is yellow.", "The spots make it look like the tallest animal.", "The bananas resemble a giraffe closely."], "image": "train2014/COCO_train2014_000000452014.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212690, "question_id": "g6Mhu7EWBaJ4CvKvRcH9dx", "question": "How many toilet bowls are sat in this area next to the side of the street?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "white", "three"], "difficult_direct_answer": false, "rationales": ["The picture is very clear as to how many toilets there are in the photo.", "There are three bowls.", "There are three toilet bowls in a row."], "image": "train2014/COCO_train2014_000000212690.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268114, "question_id": "g7UuQh3x8JsDaCNSWCBu2B", "question": "What kind of water body are these boats likely parked in?", "choices": ["river", "lake", "ocean", "canal"], "correct_choice_idx": 1, "direct_answers": ["lake", "lake", "lake", "lake", "lake", "lake", "lake", "lake", "lake", "lake"], "difficult_direct_answer": false, "rationales": ["Based on the calmness of the water and the visible boundary of the water, one can estimate the size and answer a seems most likely.", "The water is calm.", "Because it is surrounded by land and vegetation."], "image": "train2014/COCO_train2014_000000268114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328216, "question_id": "g833AU9EctYitaDQxwcCm3", "question": "What are the people hoping to cross?", "choices": ["train tracks", "pirate swords", "rooftops", "river"], "correct_choice_idx": 0, "direct_answers": ["train tracks", "railroad track", "train tracks", "railway", "tracks", "many", "train tracks", "safety", "train tracks", "train tracks"], "difficult_direct_answer": false, "rationales": ["The train is passing.", "The people wait on their bikes at a railroad crossing as a train approaches on the tracks.", "People are gathered at a railroad crossing."], "image": "train2014/COCO_train2014_000000328216.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64697, "question_id": "g8ekn3BxoNo2cd9QrpK5Uf", "question": "Which column has the most holes?", "choices": ["column 2", "column 1", "column 4", "column 3"], "correct_choice_idx": 0, "direct_answers": ["both", "second", "front", "left right", "column 2", "bottom", "bottom", "bottom", "middle", "middle"], "difficult_direct_answer": false, "rationales": ["These are both rings and the others have filled donuts in them", "The middle column has two holes. the left and right column each have only one hole.", "The column in the middle has holes."], "image": "train2014/COCO_train2014_000000064697.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 164250, "question_id": "g8gsmUjdQPCfWNwMjmnYiY", "question": "What is a colloquial term that applies to the man in the middle?", "choices": ["bespectacled", "swole", "bald", "fair haired"], "correct_choice_idx": 1, "direct_answers": ["buff", "bodybuilder", "swole", "bro", "dancer", "samoan", "native", "gym rat", "what fuck", "bodybuilder"], "difficult_direct_answer": true, "rationales": ["The term is swollen.", "The man in the middle is not bald and is not wearing glasses. he has huge muscles.", "He is fit."], "image": "val2014/COCO_val2014_000000164250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 367952, "question_id": "g9T7wpTKZSzBagVgxwJ87x", "question": "What color is the center kite in the string of kites?", "choices": ["red", "pink", "yellow", "black"], "correct_choice_idx": 3, "direct_answers": ["black", "black", "black", "yellow", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["The color is black.", "Unless you are colorblind you can tell what the middles color is.", "There is a black kite in the middle of the fold."], "image": "train2014/COCO_train2014_000000367952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 35313, "question_id": "g9yipQahFCpmK698ic87CF", "question": "How many cars are parked behind the benches where one old woman sits on one bench?", "choices": ["six", "four", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three cars parked behind the bench.", "You can count three automobiles behind the lady.", "There are three cars."], "image": "val2014/COCO_val2014_000000035313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572084, "question_id": "gAE9BySQbPowU5DnwUDR86", "question": "What are the white paper items on the shelf near the teapot?", "choices": ["filters", "napkins", "dishtowels", "aprons"], "correct_choice_idx": 0, "direct_answers": ["paper towel", "towels", "filters", "coffee filters", "kitchen towel", "paper towels", "brown", "heater", "filters", "paper towels"], "difficult_direct_answer": false, "rationales": ["The real answer would be paper towels, but you can tell cause they are on the roll.", "The items are filters.", "The teapot on the shelf is next to paper coffee filters."], "image": "train2014/COCO_train2014_000000572084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507274, "question_id": "gApmMyoooBNinHMur3RHBT", "question": "Who likely uses this bathtub?", "choices": ["adults", "animals", "children", "teenagers"], "correct_choice_idx": 2, "direct_answers": ["kids", "children", "humans", "women", "children", "child", "children", "kids", "bath", "woman"], "difficult_direct_answer": false, "rationales": ["There are toys and brightly colored soaps in the bathtub that would be suitable for a child.", "There are small bottles.", "Bath toys are on the edge of a tub."], "image": "val2014/COCO_val2014_000000507274.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232358, "question_id": "gBk33y3pUKLn7DpFDSqMMa", "question": "What is on top of the shelf?", "choices": ["cat", "goat", "towel", "book"], "correct_choice_idx": 2, "direct_answers": ["towel", "soap", "towel", "towels", "towel", "towel", "towels", "towel", "towel", "towel"], "difficult_direct_answer": false, "rationales": ["The towel is on top.", "A towel is on the top shelf.", "There is a towel rolled on top of the shelf."], "image": "train2014/COCO_train2014_000000232358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 256565, "question_id": "gBm2caXrnsUn5hmohQyC8F", "question": "What is next to the building?", "choices": ["trees", "horse", "antelope", "cow"], "correct_choice_idx": 0, "direct_answers": ["tropical tree", "trees", "sign", "stop sign", "stop sign", "tree", "tree", "trees", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["The trees are near.", "The building is clearly visible and identifiable based on the outline and structure of the building. the objects next to it are also identifiable based on their unique features.", "Trees surround the large building. they have large green leaves."], "image": "train2014/COCO_train2014_000000256565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 380084, "question_id": "gCGG2WjCY6zofiBnLwvFef", "question": "Which sport is the man on the left most likely playing on the nintendo wii appliance?", "choices": ["tennis", "baseball", "golf", "boxing"], "correct_choice_idx": 2, "direct_answers": ["controller", "golf", "golf", "golf", "remote", "tennis", "golf", "golf", "video game", "golf"], "difficult_direct_answer": false, "rationales": ["The man is holding his control down as if putting.", "He has his hands down like he's holding a club", "Because of the posture he is standing with and the way his hands are positioned."], "image": "train2014/COCO_train2014_000000380084.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94248, "question_id": "gDWK3djb5CXGvb4shZCo4g", "question": "What is in the window?", "choices": ["mannequin", "cat", "dog", "elephant"], "correct_choice_idx": 0, "direct_answers": ["mannequins", "cloths", "mannequins", "mannikins", "paintings", "mannequins", "lingerie", "mannequins", "store", "mannequin"], "difficult_direct_answer": false, "rationales": ["A storefront can be seen on the other side of the road and clothes are being advertised in the window.", "The window has a mannequin.", "You can see in the window is a human form but purely white. these are plastic human forms that show how clothing would look on a person."], "image": "val2014/COCO_val2014_000000094248.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330284, "question_id": "gDZHwSEFtAUgNeDUegpDDp", "question": "What is near the elephants?", "choices": ["drones", "toddlers", "eggs", "trees"], "correct_choice_idx": 3, "direct_answers": ["waterfall", "fence", "waterfall", "trees", "waterfall", "water", "waterfall", "waterfall", "waterfall", "waterfall"], "difficult_direct_answer": false, "rationales": ["There are trees near them.", "Trees are outside the fence. the elephants are near them even though they cannot get to them.", "Elephants are gathered near a fence and tall trees can be seen on the other side."], "image": "train2014/COCO_train2014_000000330284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 219606, "question_id": "gFLKayfY6NXCKe6VLPrErF", "question": "What color is the dog with the collar around his ears like an old lady?", "choices": ["tan", "brown", "black", "red"], "correct_choice_idx": 3, "direct_answers": ["brown", "brown", "red", "red", "red", "orange", "red", "red", "light brown", "brown"], "difficult_direct_answer": false, "rationales": ["The dog who looks like an old lady has a collar around his ears.", "The color is red.", "The color is red."], "image": "train2014/COCO_train2014_000000219606.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 521201, "question_id": "gGMyFcHrQDETrfsZ4rHkMA", "question": "What measurement is closest to the weight of the biggest animal here?", "choices": ["5 milligrams", "3 kilograms", "8000 pounds", "500 tons"], "correct_choice_idx": 0, "direct_answers": ["ton", "ton", "tone", "two tons", "5 milligrams", "ton", "wall", "5 tons", "ton", "ton"], "difficult_direct_answer": false, "rationales": ["An elephant weighs about four tons. a ton is 2000 pounds.", "This is the average weight of an elephant", "I had to look this one up. apparently, this is the right answer."], "image": "train2014/COCO_train2014_000000521201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213539, "question_id": "gHDSVmTaCLj74HT6WKRHX3", "question": "Why is there no shower curtain?", "choices": ["no shower", "on floor", "no bathtub", "shower door"], "correct_choice_idx": 3, "direct_answers": ["glass door", "no", "glass", "shower door", "bathtub", "shower door", "glass door", "glass door", "glass door", "glass partition"], "difficult_direct_answer": false, "rationales": ["It has a glass enclosure so there is no need for anything else.", "There is a shower door.", "There is a bathtub. there is a piece of glass in between the bathtub and the rest of the bathroom."], "image": "train2014/COCO_train2014_000000213539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 507952, "question_id": "gHRg5pGBaZo8pWS5RrXkyR", "question": "What are the woman using?", "choices": ["dog leashes", "egg baskets", "laptops", "car keys"], "correct_choice_idx": 2, "direct_answers": ["laptop", "laptops", "laptop", "laptop", "laptops", "laptop computers", "apple macbooks", "computer", "laptop", "laptop"], "difficult_direct_answer": false, "rationales": ["The woman uses a laptop.", "Both women have a silver laptop that they are working on.", "They have apple notebook computers opened before them."], "image": "train2014/COCO_train2014_000000507952.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547265, "question_id": "gJU6Y6Tyuhx2AELNvJYWhq", "question": "What is the person sitting on?", "choices": ["box", "car hood", "bed", "toilet"], "correct_choice_idx": 3, "direct_answers": ["toilet", "chair", "toilet", "toilet", "stop", "chair", "toilet", "toilet", "toilet", "toilet"], "difficult_direct_answer": false, "rationales": ["Toilets are white and made of porcelin.", "The person is on the toilet.", "The person is sitting fully clothed on a toilet."], "image": "train2014/COCO_train2014_000000547265.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477785, "question_id": "gKaoxg9cPMVamRnDhnQF8k", "question": "What is the section of train the men are in?", "choices": ["head", "caboose", "stomach", "belly"], "correct_choice_idx": 1, "direct_answers": ["back", "caboose", "locomotive", "caboose", "caboose", "caboose", "caboose", "3 mens", "caboose", "drive"], "difficult_direct_answer": false, "rationales": ["They are in the last car of the train.", "The section is the caboose.", "There are some men posing for photos in a red caboose."], "image": "train2014/COCO_train2014_000000477785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187178, "question_id": "gKoq5rYxAJFVJTZLMBLA74", "question": "What fruit is plentiful here?", "choices": ["pear", "banana", "orange", "apple"], "correct_choice_idx": 1, "direct_answers": ["banana", "bananas", "stop", "many", "banana", "banana", "banana", "bananas", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["The bananas are plentiful.", "Bananas are bunched up.", "Numerous bunches of bananas are gathered."], "image": "train2014/COCO_train2014_000000187178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 24100, "question_id": "gLGUVGB2GkEMVhCcDYDoFX", "question": "What kind of trick is this skateboarder performing?", "choices": ["tail slide", "rail slide", "truck grind", "nose grind"], "correct_choice_idx": 0, "direct_answers": ["tail slide", "jump", "jump", "not load", "jumping", "skating", "balanced trick", "skateboarding", "railing", "play"], "difficult_direct_answer": true, "rationales": ["The skateboarder is performing a slide with the tail of the skateboard.", "He is sliding down.", "He is performing a stunt"], "image": "train2014/COCO_train2014_000000024100.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 239436, "question_id": "gLWAbGVgPw3svp2PaUA8z2", "question": "What color is the lid on the water bottle on the bench with the child?", "choices": ["yellow", "blue", "green", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "white", "white", "white", "red", "white", "white", "white", "red", "white"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The water bottle is red with a white cap.", "The bottom is this color and the lid is white"], "image": "train2014/COCO_train2014_000000239436.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510923, "question_id": "gMMrQ2TLQbupC7hzZKGq3B", "question": "What kind of musical instrument is covered by the guitar on the left side of the room?", "choices": ["guitar", "piano", "violin", "bass"], "correct_choice_idx": 0, "direct_answers": ["guitar", "guitar", "ink", "electric", "guitar", "guitar", "no", "guitar", "drum", "guitar"], "difficult_direct_answer": false, "rationales": ["There is a guitar covered up by a wrapping on the left side.", "An instrument with a long neck is on a stand.", "A guitar carrying case is visible on the left side and it's likely the stringed instrument is inside."], "image": "train2014/COCO_train2014_000000510923.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 18425, "question_id": "gMgSVfyQjYxadTqR6QBH8R", "question": "What is the animal on the right eating?", "choices": ["banana", "leaves", "beef", "chicken"], "correct_choice_idx": 1, "direct_answers": ["trees", "leafs", "grass", "plant", "grass", "leaves", "leaves", "leaves", "green plants", "leaves"], "difficult_direct_answer": false, "rationales": ["The animal on the right is a giraffe. giraffes are herbivores, not carnivores, and there are no bananas in the trees.", "The animal on the right is eating from a bush of leaves.", "The giraffe is eating a leaf from a bush."], "image": "train2014/COCO_train2014_000000018425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84114, "question_id": "gMutBPAo6jtKr8NVNRR3dK", "question": "What constellation is related to the animal on the uniform?", "choices": ["capricorn", "aries", "ursa major", "sagittarius"], "correct_choice_idx": 2, "direct_answers": ["ursa major", "ursa major", "ursa major", "ursa minor", "bear", "ursa major", "ursus minor", "star", "ursa major", "bear"], "difficult_direct_answer": false, "rationales": ["This is the latin name of the bear shaped constellation.", "The constellation is ursa major.", "Bears are part of the star things of astronomy."], "image": "train2014/COCO_train2014_000000084114.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206912, "question_id": "gNW55eHytQBvN7UGNqN3cU", "question": "What is the man transporting?", "choices": ["luggage", "pizza", "bananas", "eggs"], "correct_choice_idx": 0, "direct_answers": ["luggage", "luggage", "suitcase", "luggage", "luggage", "luggage", "bag", "luggage", "bicycle", "luggage"], "difficult_direct_answer": false, "rationales": ["The man is moving bags, not food.", "The man has luggage.", "He is pulling a suitcase behind him."], "image": "train2014/COCO_train2014_000000206912.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 52388, "question_id": "gNnakKoFbs8u3WqcspEwns", "question": "What does the tall thing behind the truck look like?", "choices": ["wicker man", "cross", "baby", "star"], "correct_choice_idx": 1, "direct_answers": ["tree", "cross", "cross", "lamppost", "tree", "telephone pole", "cross", "cross", "trailer", "cross"], "difficult_direct_answer": false, "rationales": ["There is a cross on top of the bus, which is actually an electric post.", "There is a vertical pole with a smaller horizontal pole attached toward the top. it is very similar to a religious symbol.", "There is one vertical bar and one horizontal, crossing each other."], "image": "train2014/COCO_train2014_000000052388.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 165410, "question_id": "gP4wdsUEUxMqNjzRGCUiAJ", "question": "What is on the dresser?", "choices": ["roses", "books", "television", "apple pie"], "correct_choice_idx": 2, "direct_answers": ["tv", "television", "television", "television", "table", "tv", "tv", "television", "tv", "tv"], "difficult_direct_answer": false, "rationales": ["It is a characteristic shape to hold tubes inside and a screen that displays a picture. it is common for people to put it where they can watch it from bed.", "The dresser has a tv.", "The dresser is identifiable based on the drawers and handles visible and the prominent object on it is answer a based on the size and shape and screen."], "image": "train2014/COCO_train2014_000000165410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266618, "question_id": "gReWCJHW5NiGgbkTM3bpPs", "question": "What color is the shirt of the girl holding a satellite dish who is riding behind the man driving a motorcycle?", "choices": ["orange", "blue", "red", "gray"], "correct_choice_idx": 1, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The girl's back is fully visible and the entire shirt is one color and easily visible. it's contrasted to the gray street.", "Her pants are red. her shirt does not match her pants and is not orange or gray.", "Her shirt is not red, orange, or gray."], "image": "train2014/COCO_train2014_000000266618.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 573784, "question_id": "gTA29BQG3P7RkWbdghJ5Ew", "question": "What is the main material used to build this fireplace?", "choices": ["plastic", "wood", "stone", "steel"], "correct_choice_idx": 2, "direct_answers": ["stone", "stone", "brick", "stone", "bricks", "stone", "stone", "stone", "metal", "stone"], "difficult_direct_answer": false, "rationales": ["The fireplace is made of stone.", "There are several organically-shaped greyish white objects that build that. they help keep the fire from burning that thing down.", "They are rocks and odd shaped"], "image": "val2014/COCO_val2014_000000573784.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84887, "question_id": "gUVWLNYnkHNJT5ETwR7ChB", "question": "What is visible in the water?", "choices": ["snake", "eel", "boat", "fish"], "correct_choice_idx": 2, "direct_answers": ["boat", "ship", "ship", "boat", "ship", "ship", "boat", "boat", "ship", "boat"], "difficult_direct_answer": false, "rationales": ["It is large and is floating on the water. it is made out of characteristic metal and has a mast at the top for observation. this is typical of this type of vehicle.", "There is only one object visible in the water and based on its shape, size and design it would be answer a.", "The boat is visible."], "image": "train2014/COCO_train2014_000000084887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 110683, "question_id": "gVxTxVJ9HEYkuvwJGyjyeQ", "question": "How many big giraffes are there excluding little giraffes in total?", "choices": ["one", "three", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["three", "two", "two", "three", "two", "two", "two", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["They are very tall compared to the baby", "There are two large giraffes.", "There is a big giraffe in front of the little one. an additional big giraffe is behind the little giraffe."], "image": "val2014/COCO_val2014_000000110683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 536534, "question_id": "gVxm6LQmLEbvmpFwDhw9yo", "question": "Where is the buses next destination?", "choices": ["york way", "archway station", "oxford st", "metroline"], "correct_choice_idx": 1, "direct_answers": ["archway station", "archway station", "archway station", "archway", "archway station", "archway station", "archway station", "archway station", "bus stop", "archway station"], "difficult_direct_answer": false, "rationales": ["A public bus is in the street with a digital sign on the top of the front window that lists the next destination.", "The buses go into a large station.", "This is stated on the top of the bus"], "image": "train2014/COCO_train2014_000000536534.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431972, "question_id": "gWLMoKwMHj29X59j9WV2Mw", "question": "What is the appliance next to the refrigerator?", "choices": ["tea pot", "coffee maker", "blender", "hand mixer"], "correct_choice_idx": 2, "direct_answers": ["blender", "blender", "blender", "blender", "blender", "mixie jar", "messy blender", "blender", "blender", "blender"], "difficult_direct_answer": false, "rationales": ["There is a blender.", "A small electrical appliance with a glass pitcher on top is on the counter next to a fridge in a kitchen.", "There is an electric blender next to the fridge on the kitchen counter."], "image": "train2014/COCO_train2014_000000431972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 229574, "question_id": "gY23bfCnVZs52jYkTJHitn", "question": "How many children are running onto the cape with the water cows?", "choices": ["five", "four", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two children running onto the cape with the large animals.", "You can see two small humans.", "There are two young children running around on the grass."], "image": "train2014/COCO_train2014_000000229574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 49321, "question_id": "gZh2MZBrph48CRc45TkTFt", "question": "What is the man wearing?", "choices": ["crown", "glasses", "backpack", "hat"], "correct_choice_idx": 1, "direct_answers": ["sweater", "glasses", "glasses", "sweater", "sweater", "glasses", "glasses", "glasses", "glasses", "sweater"], "difficult_direct_answer": false, "rationales": ["The man has glasses.", "The man does not have any accessories on his head or back.", "He has lenses in a plastic frame"], "image": "train2014/COCO_train2014_000000049321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 479035, "question_id": "ga9u5KLpqJpsx6nfRdwkAr", "question": "Why does the woman have a ring on her ring finger?", "choices": ["married", "fashion", "protection", "visibility"], "correct_choice_idx": 0, "direct_answers": ["married", "marriage ring", "shes married", "married", "married", "married", "marriage", "married", "marriage ring", "engaged"], "difficult_direct_answer": false, "rationales": ["The ring on this woman's ring finger tells others she is married or engaged.", "A woman is wearing a diamond ring on the finger people traditionally wear rings when they are married.", "The woman cutting the donuts in half is wearing a ring on her finger because she is married."], "image": "train2014/COCO_train2014_000000479035.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 94307, "question_id": "gc2Q7HyRxdXRgW8Spas6tg", "question": "What is the orange item near the bottom of the pile?", "choices": ["garfield doll", "lime", "carrot", "thumb tack"], "correct_choice_idx": 2, "direct_answers": ["carrots", "carrots", "carrots", "carrot", "flower", "carrot", "carrot", "flowers", "carrot", "fruit"], "difficult_direct_answer": false, "rationales": ["Carrots are orange with green on top.", "The orange item near the bottom of the pile is a fresh carrot.", "There is a pile of carrots on the bottom."], "image": "train2014/COCO_train2014_000000094307.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 534189, "question_id": "gcsqLo5zB6HvFw9uPLkQyh", "question": "How many sail posts are on the back of this historic sailing ship?", "choices": ["four", "three", "five", "two"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "one", "one", "one", "three", "one", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["One sail post is in between two others.", "There are three tall sail posts on the back of this historic sailing ship.", "You can count them because they are visible on the ship. this number is typical of this type of ship."], "image": "train2014/COCO_train2014_000000534189.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431431, "question_id": "gcvkCxRntDjKVKzpvG3wLi", "question": "How does the it feel inside the back of the truck?", "choices": ["muggy", "warm", "cold", "scorching"], "correct_choice_idx": 2, "direct_answers": ["uncomfortable", "cold", "cold", "schwans", "cold", "stuffy", "swan", "cold", "cold", "warm"], "difficult_direct_answer": false, "rationales": ["The truck is cold.", "This is a schwan's food delivery truck. the dish shown on the back has meat and vegetables, so the truck is refrigerated.", "Schwan's delivers frozen foods so the temperatures in the back of the truck would need to be low enough that the products remain frozen."], "image": "val2014/COCO_val2014_000000431431.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 454961, "question_id": "gdBXRF2Fr28tcmxPmRqQ6J", "question": "What item on the plate is usually believed to be healthy?", "choices": ["carrot", "fried egg", "brown rice", "red beets"], "correct_choice_idx": 0, "direct_answers": ["broccoli", "broccoli", "brocolli", "vegetables", "bronchiole", "carrot", "carrots", "vegetables", "vegetables", "vegetables"], "difficult_direct_answer": false, "rationales": ["The carrots on the plate are usually believed to be healthy.", "The orange carrots are one of the healthiest foods on the plate because it is high in vitamins.", "The other options aren't on the plate."], "image": "train2014/COCO_train2014_000000454961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 365833, "question_id": "gdXypaQUFWVdP5Mr5j7vYx", "question": "What location is this most likely?", "choices": ["england", "china", "haiti", "russia"], "correct_choice_idx": 2, "direct_answers": ["beach", "seaside", "beach", "beach", "mexico", "haiti", "beach", "likely", "beach", "beach"], "difficult_direct_answer": false, "rationales": ["You can tell by the setting and the words on the cart as to where they are from.", "The people are black. they are walking on a beach.", "The location is haiti."], "image": "train2014/COCO_train2014_000000365833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337277, "question_id": "gdarqbBzgeT8itHy6JnsrX", "question": "What is on the smaller laptop screen?", "choices": ["cat", "dog", "baby picture", "screen saver"], "correct_choice_idx": 3, "direct_answers": ["keyboard", "computer", "colors", "laptop stand", "shooting stars", "nothing", "colors", "screen saver", "car", "fireworks"], "difficult_direct_answer": true, "rationales": ["Which is used to protect the screen of the laptop.", "It is a photo while the computer is not being used.", "The abstract pattern shows up whenever the screen is idle for a while."], "image": "train2014/COCO_train2014_000000337277.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 504325, "question_id": "gjbATdruaSE2CnDCE8pbBu", "question": "What is to the right?", "choices": ["cat", "dog", "baby", "counter top"], "correct_choice_idx": 3, "direct_answers": ["sink", "sink", "closet", "sink", "counter top", "sink", "cabinets", "counter", "kitchen", "sink"], "difficult_direct_answer": false, "rationales": ["There is a massive bathroom with a vanity sink on the right side.", "The counter is to the right.", "There is a counter top to the right of the bathtub."], "image": "train2014/COCO_train2014_000000504325.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 494236, "question_id": "gm3KH2PFsiAWwuCCrRudcv", "question": "What kind of animal is put into effigy on the top of these boats?", "choices": ["zebra", "elephant", "giraffe", "lion"], "correct_choice_idx": 1, "direct_answers": ["bsll", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephants", "elephant", "cat"], "difficult_direct_answer": false, "rationales": ["The animal is an elephant.", "It has a large body just like a pachyderm. it also has characteristic large ears like this animal.", "An effigy would be a statue and the animal represented here is the elephant."], "image": "train2014/COCO_train2014_000000494236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101933, "question_id": "gmhJgoBWGtH2Lx3AFhpd54", "question": "How many people are riding on the little scooter all together?", "choices": ["five", "three", "four", "two"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are 3.", "One person is in between two other people.", "This is obvious simply by counting them. this type of riding may be illegal depending on where it takes place."], "image": "val2014/COCO_val2014_000000101933.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 493196, "question_id": "gpKXMCieCcQ7poUZDcT6C4", "question": "What is the name of the activity the man is doing?", "choices": ["snowboarding", "crossskating", "inline skating", "skateboarding"], "correct_choice_idx": 1, "direct_answers": ["staking", "scooting", "skating", "skate riding", "skating", "crossskating", "roller ski", "skating", "sking", "skating"], "difficult_direct_answer": false, "rationales": ["It is like cross country skiing except with wheels", "The name is skating.", "The man is skating on wheels, and is called cross-skating."], "image": "val2014/COCO_val2014_000000493196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 406199, "question_id": "gpjGWeNHBcRbhFWQBnH5qo", "question": "What item is stuffed here?", "choices": ["red pepper", "clam", "pig", "turkey"], "correct_choice_idx": 0, "direct_answers": ["bellpepper", "red pepper", "peppers", "red pepper", "pepper", "bell pepper", "pepper", "red pepper", "peppers", "pepper"], "difficult_direct_answer": false, "rationales": ["The pepper is stuffed.", "The pepper is stuffed.", "The pepper is halved and has a green stuffing inside of it and is served on a plate."], "image": "train2014/COCO_train2014_000000406199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221343, "question_id": "gq8u7Q3QKgjuUzpeEszdXN", "question": "Why is the boy in the black shirt wearing a helmet?", "choices": ["protection", "indimidation", "visibility", "fashion"], "correct_choice_idx": 0, "direct_answers": ["injury", "playing baseball", "protect head", "safety", "play", "playing baseball", "baseball", "batter", "protection", "protect"], "difficult_direct_answer": true, "rationales": ["Batters in a baseball game always wear safety gear.", "The boy needs protection.", "The boy is protected."], "image": "train2014/COCO_train2014_000000221343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 547297, "question_id": "gqAfwT2BYUNMsLrLQa8WPy", "question": "What is on the plate?", "choices": ["salt", "food", "sand", "sugar"], "correct_choice_idx": 1, "direct_answers": ["table", "brocolli", "food", "broccoli", "food", "broccoli", "vegetable", "broccoli", "brochile", "food"], "difficult_direct_answer": false, "rationales": ["The plate has broccoli and what looks like barbecue meat on it.", "There are cooked pieces of vegetables on the plate that someone can eat if they want food.", "The plate has food."], "image": "train2014/COCO_train2014_000000547297.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200374, "question_id": "grTamqoeTDaqDMXB4FuXJB", "question": "What is next to the pasta?", "choices": ["apple", "beef", "lemon", "broccoli"], "correct_choice_idx": 3, "direct_answers": ["broccoli", "brocolli", "broccoli", "broccoli", "brocolli", "onions", "vegetables", "broccoli", "broccoli", "broccoli"], "difficult_direct_answer": false, "rationales": ["The food is green and looks like a bunch of little \"trees.\".", "You can tell by the color and texture as to what type of vegetable it is.", "There is a green vegetable, not meat or a fruit, next to the pasta."], "image": "train2014/COCO_train2014_000000200374.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296492, "question_id": "gsvd8Qf6PSAS8xeAwKPJv8", "question": "What is a danger to the child?", "choices": ["sharp knife", "snake", "poisonous frog", "trampoline fall"], "correct_choice_idx": 0, "direct_answers": ["falling", "fall", "falling", "sharp knife", "falling", "falling", "knife", "falling", "falling", "falling off"], "difficult_direct_answer": false, "rationales": ["There is a sharp knife in the sink which could be dangerous.", "The knife is dangerous.", "The child is in a kitchen. animals and trampolines are not likely threats in this setting."], "image": "val2014/COCO_val2014_000000296492.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441211, "question_id": "gtJ5DyqLHkR6VVznjZHKbH", "question": "What color is the lateral stripe around the train engine?", "choices": ["white", "blue", "red", "green"], "correct_choice_idx": 2, "direct_answers": ["red", "medal", "red", "white", "white", "white", "white", "red", "brown", "white"], "difficult_direct_answer": false, "rationales": ["The color is red.", "It is red.", "The only listed color that is visible on the train is red."], "image": "val2014/COCO_val2014_000000441211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 510613, "question_id": "gtLesUaUdLHTnU3AhrKtAa", "question": "What shape is the bathroom mirror of this room?", "choices": ["circle", "square", "rectangle", "oval"], "correct_choice_idx": 1, "direct_answers": ["square", "rectangle", "square", "square", "rectangle", "square", "rectangle", "rectangle", "rectangle", "rectangle"], "difficult_direct_answer": false, "rationales": ["It is slightly wider than it is tall and each opposite side is the same length", "The mirror on the wall of the bathroom is in the shape of a rectangle.", "The mirror has four straight lines and corners."], "image": "train2014/COCO_train2014_000000510613.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131108, "question_id": "gudiJGv7msUTWTc5GJci2U", "question": "What place is this most likely?", "choices": ["new jersey", "new york", "rome", "china"], "correct_choice_idx": 2, "direct_answers": ["home", "rome", "italy", "garage", "apartment", "restaurant", "italy", "italy", "italy", "roma"], "difficult_direct_answer": false, "rationales": ["The sign on the building indicates this", "Roma is the italian way to say the capital city of italy.", "That is the italian name for this city."], "image": "val2014/COCO_val2014_000000131108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148395, "question_id": "guvqepFq8EiEaf7xkWaXcK", "question": "What is next to green car?", "choices": ["chain", "bison", "cow", "elk"], "correct_choice_idx": 0, "direct_answers": ["chain", "chain", "chain", "motorcycle", "bike", "motorcycle", "information", "motorcycle", "fence placard", "chain"], "difficult_direct_answer": false, "rationales": ["There is a metal chain around the perimeter of the vehicles on display. these are used to keep people away so they just go around edges and look.", "This ropes off the area to keep people away from the vehicle", "The car is surrounded by chains."], "image": "val2014/COCO_val2014_000000148395.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47213, "question_id": "gz2ssPYCttfiRmQoKdQEjE", "question": "What kind of wood is used to make the barrels in the background?", "choices": ["mahogany", "oak", "birch", "pine"], "correct_choice_idx": 1, "direct_answers": ["oak", "oak", "oak", "oak", "oak", "oak", "trees", "oak", "oak", "oak"], "difficult_direct_answer": false, "rationales": ["The establishment has wine glasses showing that they serve alcohol. barrells holding alcohol are always made of this wood because the flavor is agreeable to whiskey and beer.", "They use oak for these barrels.", "Generally oak is best due to the strength and longevity of this type of wood."], "image": "train2014/COCO_train2014_000000047213.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340003, "question_id": "gzH9KbHkQd8CqyChUw3Pbw", "question": "What is on top of the green structure?", "choices": ["gargoyle", "hen", "clock", "cat"], "correct_choice_idx": 2, "direct_answers": ["steeple", "steeple", "steeple", "steeple", "roof", "steeple", "steeple", "clock", "clock", "spire"], "difficult_direct_answer": false, "rationales": ["The item on the structure is a clock. it has hands and is showing the time.", "The green structure has a steeple. there is a round time-telling device on top of it.", "The top of the green structure is a clock tower for the church to tell time."], "image": "train2014/COCO_train2014_000000340003.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13267, "question_id": "gzrpCijHUcXrUnPM4gpgXZ", "question": "What is the man wearing?", "choices": ["backpack", "armor", "garbage bag", "helmet"], "correct_choice_idx": 3, "direct_answers": ["helmet", "helmet", "helmet", "helmet", "coat helmet", "red helmet", "helmet", "helmet", "helmet", "suit"], "difficult_direct_answer": false, "rationales": ["He has a safety hat on .", "The other options aren't in this image and a makes sense as it is a safety tool when riding on a motorcycle.", "The other options aren't in the photo."], "image": "train2014/COCO_train2014_000000013267.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257341, "question_id": "h4WDNsqikvf9R8UjEtTa5D", "question": "Where is the most obvious place to get hand soap?", "choices": ["in mirror", "blue bowl", "in sink", "orange bottle"], "correct_choice_idx": 3, "direct_answers": ["wall", "bathroom", "soap bottle", "wash basin", "dispenser", "orange bottle", "soap dispenser", "counter top", "soap dispenser", "toilet"], "difficult_direct_answer": true, "rationales": ["There is a single bottle of pump soap by sink.", "The object is located on top of the sink next to the faucet with a top bottle dispenser for your hands to push down with.", "It has a dispenser at the top of the bottle"], "image": "train2014/COCO_train2014_000000257341.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 363272, "question_id": "h4nenkKK7kXMP5XGpG3ohV", "question": "The man sitting on the post with the phone to his ear is wearing what color of dome on his hat?", "choices": ["red", "blue", "yellow", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "bench", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["Unless you are colorblind you can easily tell what color the helmet is.", "The color is green.", "A man is sitting on a concrete post as he talks on the phone. he is wearing a business suit and has a helmet with green on top."], "image": "val2014/COCO_val2014_000000363272.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 310808, "question_id": "h5YtNUVVTjmeY3xbNeFPK9", "question": "What color is the roof of the boat with a few people on it?", "choices": ["purple", "red", "green", "blue"], "correct_choice_idx": 3, "direct_answers": ["brown", "white", "blue", "white", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The roof of the boat is blue.", "The boat has been decorated with two different colors.", "A large flat bottom boat is docked at marina. the roof color is the same as sky."], "image": "train2014/COCO_train2014_000000310808.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468692, "question_id": "h6DXEh8nCsNNoTLnW9Zhu4", "question": "According to the bike rack what kind of a business is here?", "choices": ["dentist office", "tax preparer", "realtor", "mini market"], "correct_choice_idx": 0, "direct_answers": ["dentist", "tooth paste", "dentist", "dentist", "bike bollards", "dentist", "dentist", "dentist office", "dentist", "doesn't say"], "difficult_direct_answer": false, "rationales": ["The rack is near a dentist's office.", "It's a toothbrush", "There is likely a dentists office, because the bike rack is toothbrushes."], "image": "train2014/COCO_train2014_000000468692.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 266209, "question_id": "h6QQiSkcYCkWTDbANEGWgn", "question": "How many portraits are hung on the white wall?", "choices": ["three", "two", "four", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "one", "zero", "one", "one", "one", "zero", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The portrait above the sofa is by itself.", "There is one portrait hung on the white wall.", "The number of portraits is clearly visible and countable based on the outline of the object on the wall."], "image": "train2014/COCO_train2014_000000266209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552985, "question_id": "h74pvxvMcu9HjXFHhf52KW", "question": "How many people could this food serve?", "choices": ["25", "five", "30", "one"], "correct_choice_idx": 3, "direct_answers": ["one", "eight", "six", "one", "one", "four", "five", "two", "three", "two"], "difficult_direct_answer": false, "rationales": ["A man is holding a whole pizza up and taking a bite out of it.", "A man is biting the edge of a pizza he is holding in his hands. a pizza that is able to be held in hands is probably a personally sized pizza.", "One person can eat the pizza."], "image": "train2014/COCO_train2014_000000552985.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143563, "question_id": "h7NMfqRcSLH66n695cCBxm", "question": "How many people are walking around in the train station?", "choices": ["four", "one", "three", "two"], "correct_choice_idx": 2, "direct_answers": ["three", "some", "two", "three", "four", "three", "three", "five", "four", "two"], "difficult_direct_answer": false, "rationales": ["The people are visible and countable based on their unique outlines.", "There are a few people clustered in the middle and one in the bottom corner.", "There are four people in motion near the train. all are waiting for the train."], "image": "train2014/COCO_train2014_000000143563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447038, "question_id": "h7fcKR3kGMfdroqULEBZyD", "question": "How many sinks are dug into the black counter next to the toilet?", "choices": ["two", "five", "four", "three"], "correct_choice_idx": 0, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is one sink on the left and one on the right.", "There are two holes dug into the counter for sinks.", "There are two white sinks in the black bathroom counter next to the toilet."], "image": "val2014/COCO_val2014_000000447038.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197609, "question_id": "h7jzEnwktVQnjPh4p37Fco", "question": "What is only one of the hot dogs missing?", "choices": ["mustard", "bun", "onions", "ketchup"], "correct_choice_idx": 1, "direct_answers": ["bun", "greens", "bun", "bun", "hotdog", "cake", "bun", "bun", "bun", "bun"], "difficult_direct_answer": false, "rationales": ["A hotdog in a bun is next to one on a paper and lid with no bread.", "One of the hot dogs is missing a bun while the other hot dog has a bun.", "The hotdog on the right only has mustard on top of it."], "image": "val2014/COCO_val2014_000000197609.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 544780, "question_id": "h7tRvNtnWkUtVpZLEKiUbJ", "question": "What are the horses near?", "choices": ["mud", "grass", "sand", "hay"], "correct_choice_idx": 2, "direct_answers": ["ocean", "ocean", "ocean", "sand", "water", "ocean", "ocean", "ocean", "coean", "water"], "difficult_direct_answer": false, "rationales": ["Horses are running on the beach near the water.", "The horses are galloping on a beach and sand makes up a beach.", "They're by sand."], "image": "train2014/COCO_train2014_000000544780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350497, "question_id": "h8WDNPpsNBfGYpKsTNEGjQ", "question": "What kind of yard appliance is hanging on the back of the moped motorcycle?", "choices": ["seat", "grill", "pillow", "chair"], "correct_choice_idx": 1, "direct_answers": ["grill", "grill", "grill", "charcoal grill", "grill", "barbecue", "grill", "table", "box", "grill"], "difficult_direct_answer": false, "rationales": ["It has a characteristic size and shape. you can see the lid on top with the handle to protect the user from the heat of the grill.", "Chairs, pillows, and seats are not appliances. the item is a type of barbeque.", "One can see the small appliance behind the motorcycle."], "image": "train2014/COCO_train2014_000000350497.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 224285, "question_id": "h8i2FtmcuaZvUGmHhwvrik", "question": "How many trains could be traveling underneath of these wires overhanging the train track?", "choices": ["four", "three", "two", "five"], "correct_choice_idx": 2, "direct_answers": ["two trains", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Since there are only two pairs of tracksand this number of trains shown.", "Two trains can travel on the tracks.", "There are tracks for two trains."], "image": "train2014/COCO_train2014_000000224285.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475839, "question_id": "h8iBU83acDDciuxyfVS4WF", "question": "What does the sign say?", "choices": ["b3", "e4", "n2", "c7"], "correct_choice_idx": 2, "direct_answers": ["n2", "n2", "n2", "n2", "n2", "playing", "n2", "n2", "n2", "n2"], "difficult_direct_answer": false, "rationales": ["The sign beneath the skateboarder has n2 on it.", "The sign on the wall says n2.", "The letter n and the number 2 is on the white sign of the background."], "image": "train2014/COCO_train2014_000000475839.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300284, "question_id": "h9UbtCQJiBYfLczg8syjjK", "question": "What does the hydrant appear to have?", "choices": ["quarter slots", "cat hairs", "face", "birds nest"], "correct_choice_idx": 2, "direct_answers": ["eyes", "toy", "eyes", "chains", "face", "eyes", "eyes", "road", "eyes", "eyes"], "difficult_direct_answer": false, "rationales": ["The hydrant has a face.", "The two white and black rectangles look like eyes. the chain looks like a nose.", "The black and white square images on the hydrant appear to be eyes with the chain link as a nose and the handles as ears."], "image": "train2014/COCO_train2014_000000300284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 194108, "question_id": "h9fGoD3phVAT9n5ZDo8r4f", "question": "What are the people walking near?", "choices": ["camel", "elephant", "tower", "clown"], "correct_choice_idx": 2, "direct_answers": ["tower", "clock tower", "clock tower", "tower", "clock", "dresses", "clock tower", "church", "tower", "clock"], "difficult_direct_answer": false, "rationales": ["They are near the tower.", "A tower is a tall building like shown.", "A large, tall building with a clock is behind people walking in a street."], "image": "train2014/COCO_train2014_000000194108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277072, "question_id": "hA4Ciw5zLNebuTK6AKuxcT", "question": "What is the innermost color reflected off the center of the oven?", "choices": ["yellow", "blue", "green", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "yellow", "green", "orange", "gray", "red", "red", "red", "clear", "red"], "difficult_direct_answer": false, "rationales": ["The center of the reflection area is red.", "There is the color red in the center of the oven window.", "The color is red."], "image": "train2014/COCO_train2014_000000277072.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7304, "question_id": "hAUiMy8rFL7NthWhEixdj7", "question": "What color is the center of the tennis racket used by the little girl who is about to hit the ball?", "choices": ["orange", "black", "red", "blue"], "correct_choice_idx": 0, "direct_answers": ["orange", "red", "red", "orange", "orange", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["A young girl is swinging a racket with orange strings.", "The color is bright and easily visible. it is in sharp contrast to the gray ground.", "The line drawn on the center orange"], "image": "val2014/COCO_val2014_000000007304.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 206922, "question_id": "hCSwHjQJ268XijJei53P9W", "question": "What color is the vase in the middle of the coffee table?", "choices": ["white", "red", "green", "gold"], "correct_choice_idx": 0, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["There is a white vase in the middle of the table.", "The color is white.", "Their color is easily identifiable by observation. they are reflecting all the light which makes them appear light and this color."], "image": "val2014/COCO_val2014_000000206922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 130096, "question_id": "hCfWwiU7mxtXtmLU7pSm3z", "question": "The rope on this cow is attached to what?", "choices": ["horns", "neck", "nose ring", "ears"], "correct_choice_idx": 2, "direct_answers": ["nose ring", "nose ring", "nose ring", "nose", "nose", "nose ring", "nose ring", "nostrils", "ring", "nose"], "difficult_direct_answer": false, "rationales": ["The rope is a nose ring.", "The thing on his nose.", "A man stands next to a cow and is holding a rope that runs through a ring connected to the cows nostrils."], "image": "train2014/COCO_train2014_000000130096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128918, "question_id": "hD3iJKGsugdHDzQ2xBiexR", "question": "What does this boat use for fuel?", "choices": ["coal", "gasoline", "petrol", "alcohol"], "correct_choice_idx": 1, "direct_answers": ["bsll", "gasoline", "gas", "gas", "gas", "travel", "ride", "engine", "gas", "gasoline"], "difficult_direct_answer": false, "rationales": ["There is a simple white boat ashore on the beach. a motor is attached on the end that uses petroleum to go.", "Based on the appearance of the engine attached to the boat it would use answer a for fuel.", "There is a motor on the back of the boat. it cannot burn coal or alcohol."], "image": "train2014/COCO_train2014_000000128918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 136043, "question_id": "hD5hqaNsaxfTyBsvZsa5gn", "question": "What kind of human-powered vehicle lane are there some cars parked alongside of?", "choices": ["train", "bus", "bike", "sidewalk"], "correct_choice_idx": 2, "direct_answers": ["bicycle", "bike", "bus", "suv", "bike", "van", "bicycle", "bicycles", "bicycle", "bike lane"], "difficult_direct_answer": false, "rationales": ["The lane is for two wheeled vehicles.", "The bike is human powered.", "The person is traveling on a two wheeled vehicle that looks like the item identified in option a."], "image": "train2014/COCO_train2014_000000136043.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 347238, "question_id": "hDByQBrPNPQmEUXaoGYpfF", "question": "What is the name of the part of the elephant that is reached out towards the hands of the humans?", "choices": ["trunk", "head", "hands", "face"], "correct_choice_idx": 0, "direct_answers": ["watching", "stop", "trunk", "trunk", "trunk", "hand", "trunk", "trunk", "trunk", "trunk"], "difficult_direct_answer": false, "rationales": ["Elephants do not have hands. the body part attaches to the elephant's head and face.", "The name is the trunk.", "Elephants are standing at a fence reaching out with a long, thin protrusion from their faces to grab food."], "image": "train2014/COCO_train2014_000000347238.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208011, "question_id": "hEkH9EzsgyUvoPk65UUxNe", "question": "If a few more of these animals appear here what would they be called?", "choices": ["herd", "school", "pack", "clowder"], "correct_choice_idx": 3, "direct_answers": ["cats", "murder", "colony", "clowder", "cats", "litter", "dwdw", "clowder", "cats", "clowder"], "difficult_direct_answer": false, "rationales": ["There would be chowder.", "There could be chowder.", "I had to look this one up online."], "image": "train2014/COCO_train2014_000000208011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333335, "question_id": "hFXm89zX5oxodJs8xrYTH2", "question": "What is next to the bench?", "choices": ["basket", "apple", "egg", "bicycle"], "correct_choice_idx": 3, "direct_answers": ["two bicycles", "cycle", "building", "bike", "bike", "bicycle", "bicycles", "bike", "bicycle", "bikes"], "difficult_direct_answer": false, "rationales": ["It has two narrow wheels on a frame and a seat for someone to sit on", "There is a bench clearly visible in the foreground with two-wheeled objects with handle bars, pedals and a seat leaned against it.", "You can see two of the leaning up against the bench in this park."], "image": "train2014/COCO_train2014_000000333335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 158428, "question_id": "hGhGTKbqSCHmXXhWYBpbYp", "question": "What color are the sides on the crates of construction equipment?", "choices": ["red", "green", "blue", "white"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "green", "green", "white", "green", "green white", "green", "black", "green"], "difficult_direct_answer": false, "rationales": ["The shade of color is like the leaves of bushes.", "The color is easily observable. it is bright and the color as trees or grass.", "The color is green."], "image": "train2014/COCO_train2014_000000158428.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 437947, "question_id": "hHDKucAzuYd7wLcXVJDNsh", "question": "How many jars are above the dishwasher or oven and underneath the cupboards?", "choices": ["six", "three", "two", "four"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["One jar is in between two other jars.", "These are canisters for dry goods", "There are three jars by the oven."], "image": "val2014/COCO_val2014_000000437947.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537371, "question_id": "hJ7ubbFopmQZiZRnf73JBZ", "question": "What country does the shirt on the right mention?", "choices": ["china", "usa", "japan", "germany"], "correct_choice_idx": 1, "direct_answers": ["usa", "usa", "usa", "usa", "usa", "usa", "usa", "usa", "usa", "usa"], "difficult_direct_answer": false, "rationales": ["The white shirt has three letters that represent a country in north america.", "It has these letters on it in black", "The acronym on the shirt is for the united states if america."], "image": "train2014/COCO_train2014_000000537371.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 418346, "question_id": "hJYkbe3xxqYDaijurHKjBi", "question": "What is on the fence?", "choices": ["statues", "hair clips", "monkeys", "egg"], "correct_choice_idx": 1, "direct_answers": ["clips", "bags", "hands", "hair clips", "clips", "hand", "hair clamps", "stop", "hair clips", "hair clips"], "difficult_direct_answer": false, "rationales": ["The fence in the foreground is clearly visible and based on how the objects are attached, their color, shape and size, they would normally be used to hold hair.", "The fence has hair clips.", "The hair clips are on the fence."], "image": "val2014/COCO_val2014_000000418346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99678, "question_id": "hKZdKkD73hFkBsatNKeCme", "question": "Who is usually on the vehicle here?", "choices": ["boat captain", "pilot", "army sergeant", "paratrooper"], "correct_choice_idx": 0, "direct_answers": ["captain", "boat captain", "driver", "captain", "captain", "sailor", "captain", "people", "captain", "sailor"], "difficult_direct_answer": false, "rationales": ["The captain is on the vehicle.", "This is the person who drives the boat", "The vehicle is floating on water. it cannot travel on the land or fly."], "image": "train2014/COCO_train2014_000000099678.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424367, "question_id": "hLNcLW689QtQfGoVauoLEw", "question": "What type of device is the cat sleeping on top of?", "choices": ["printer", "computer", "cell phone", "pager"], "correct_choice_idx": 2, "direct_answers": ["laptop", "desk", "phone", "computer", "cell phone", "remote", "phone", "mat", "laptop", "cell phone"], "difficult_direct_answer": false, "rationales": ["The cat is sleeping on top of an old cell phone.", "The cat is sleeping up next to a cell phone.", "The cat is laying on top of a light gray cell phone."], "image": "train2014/COCO_train2014_000000424367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 458862, "question_id": "hLh4NNUms93cvVQaDeEjwL", "question": "What color is the handle tied around the sunlit cow's face?", "choices": ["blue", "green", "red", "yellow"], "correct_choice_idx": 1, "direct_answers": ["green", "brown", "green", "brown", "blue", "green", "green", "blue", "blue", "green"], "difficult_direct_answer": false, "rationales": ["The handle is not blue, red, or yellow.", "The color is green.", "The color is green."], "image": "train2014/COCO_train2014_000000458862.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390951, "question_id": "hM3mLv6byMKUoSKaf8dzb3", "question": "Where might this bathroom be?", "choices": ["school", "casino", "library", "house"], "correct_choice_idx": 1, "direct_answers": ["casino", "theater", "restaurant hotel", "casino", "business", "hotel", "train station", "hotel", "sink", "public"], "difficult_direct_answer": false, "rationales": ["The bathroom is in a casino.", "This bathroom is fancy and has multiple sinks and stalls. a residential bathroom would have one toilet and sink, and library and school bathrooms would not be this fancy.", "The bathroom is nicer than a public building and has stalls in it."], "image": "train2014/COCO_train2014_000000390951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401877, "question_id": "hMBJfTiB3oLzfoCMGus2zc", "question": "What features do these animals have?", "choices": ["quills", "big ears", "stingers", "wings"], "correct_choice_idx": 1, "direct_answers": ["trunks", "wrinkles", "tusks", "carrying", "trunks", "big animal", "big ears", "trunks", "tusks", "trunks"], "difficult_direct_answer": false, "rationales": ["They don't have any of the other body part options. in disney's dumbo, he could fly with his a.", "Elephants often have large ears.", "You can see their size."], "image": "train2014/COCO_train2014_000000401877.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 232263, "question_id": "hMjWrkiu2PbutNFpqM53d8", "question": "What would the answer to the equation on the truck be if the x is replaced by a sign?", "choices": ["12", "eight", "zero", "ten"], "correct_choice_idx": 1, "direct_answers": ["eight", "eight", "eight", "eight", "eight", "truck", "sixteen", "eight", "12", "eight"], "difficult_direct_answer": false, "rationales": ["If it were a plus sign it would equal 8.", "It would change from multiplication to addition.", "Four plus four equals this number."], "image": "train2014/COCO_train2014_000000232263.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572802, "question_id": "hMwQGJJhqAWvAKHjNUmZvo", "question": "The man at the bus stop is using what kind of phone to talk?", "choices": ["smart", "flip", "iphone", "blackberry"], "correct_choice_idx": 1, "direct_answers": ["cellphone", "speak", "fefef", "cell phone", "cell", "cell", "flip phone", "cell", "flip", "cellphone"], "difficult_direct_answer": false, "rationales": ["It folds in half when not in use", "It is a phone that folds up", "The curve of the phone make the type obvious."], "image": "val2014/COCO_val2014_000000572802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 543385, "question_id": "hNNBBS8oVws7bAw46CXpq4", "question": "How many zebras are walking around on top of the dirt in the courtyard?", "choices": ["three", "five", "one", "two"], "correct_choice_idx": 0, "direct_answers": ["two", "three", "two", "three", "three", "three", "three", "three", "three", "one"], "difficult_direct_answer": false, "rationales": ["Several black and white striped animals are in an enclosure.", "There are three zebras around.", "There are three zebras in the court."], "image": "val2014/COCO_val2014_000000543385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 156195, "question_id": "hPaffGzFyrzP64ncy3YPhK", "question": "What color are the front ankles of the little goat with red ear rings visible in his ears?", "choices": ["white", "brown", "yellow", "black"], "correct_choice_idx": 0, "direct_answers": ["black", "white", "white", "black", "white", "black", "black", "white", "black", "white"], "difficult_direct_answer": false, "rationales": ["The goats have white ankles along with some black markings.", "The front two ankles of this lamb are light colored.", "This particular goat has light colored ankles."], "image": "val2014/COCO_val2014_000000156195.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 99892, "question_id": "hPskEP7dbeyfGDkaxVem2Q", "question": "What are the giraffes near?", "choices": ["strollers", "rocks", "cat", "dog"], "correct_choice_idx": 1, "direct_answers": ["trees", "rocks", "tree rock", "boulders", "trees", "rocks", "trees", "rock", "rocks", "rocks"], "difficult_direct_answer": false, "rationales": ["They are by rocks", "There are a bunch of giraffes near some rocks.", "The giraffes are near large grey inanimate objects. there are no animals or babies near the giraffes."], "image": "train2014/COCO_train2014_000000099892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55897, "question_id": "hQWeLSG8sQjwtYND6xyo4q", "question": "What kind of fence encloses the little horse inside of the pasture?", "choices": ["iron", "wood", "link", "electric"], "correct_choice_idx": 1, "direct_answers": ["wood", "wood fence", "white", "rail", "picket fence", "grass", "wood plank", "wood", "wooden fence", "wood"], "difficult_direct_answer": false, "rationales": ["The fence is made of wood.", "The fence is wooden.", "The fence is made out of a plant-based material."], "image": "train2014/COCO_train2014_000000055897.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 516481, "question_id": "hR5hMQjfwuNgSx95tYcFHW", "question": "What nationality does the man in the foreground appear to be?", "choices": ["indian", "french", "irish", "german"], "correct_choice_idx": 0, "direct_answers": ["asian", "indian", "indian", "indian", "hispanic", "asian", "indian", "indian", "asian", "indian"], "difficult_direct_answer": false, "rationales": ["The people are indian.", "He has indian characteristics.", "The people seem to be from asia."], "image": "train2014/COCO_train2014_000000516481.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 137938, "question_id": "hRe57TAcKy2EKf2o92W23W", "question": "What kind of figurine is present at the rear of the pizza on the table?", "choices": ["baseball", "anime", "pony", "wrestling"], "correct_choice_idx": 1, "direct_answers": ["anime", "red", "girl", "girl", "girl", "girl", "girl", "bobble head", "anime", "girl"], "difficult_direct_answer": false, "rationales": ["There is a small manga character next to the pizza.", "Because it symbolizes characters of anime.", "A cartoon character appears beside a pizza."], "image": "val2014/COCO_val2014_000000137938.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 303298, "question_id": "hRmqBgKmCn2UZNuYwa82rU", "question": "What is the person swinging?", "choices": ["baseball bat", "toy car", "tennis racquet", "toy boat"], "correct_choice_idx": 2, "direct_answers": ["racket", "racquet", "racket", "tennis bracket", "racket", "tennis racket", "tennis racquet", "racket", "tennis", "racket"], "difficult_direct_answer": false, "rationales": ["The person is playing a sport and is swinging a real, not a toy, item. the person is not holding a bat and is not near bases.", "The person is swinging a tennis racquet on a tennis court.", "The person is swinging a tennis racquet on a tennis court."], "image": "val2014/COCO_val2014_000000303298.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 299444, "question_id": "hTEQm5w63dWtH7EsyuxzKk", "question": "What color are the tusks on the elephant who is walking straight for the camera?", "choices": ["yellow", "gray", "white", "black"], "correct_choice_idx": 2, "direct_answers": ["yellowish", "white", "beige", "white", "white", "ivory", "grey", "yellow", "brown", "white"], "difficult_direct_answer": false, "rationales": ["The color is white.", "The color is white.", "Tusks are often white as they are ivory."], "image": "train2014/COCO_train2014_000000299444.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336357, "question_id": "hU59soXh8cspPwddwvMufk", "question": "What store sells these kinds of items?", "choices": ["subway", "mcdonalds", "tesla", "best buy"], "correct_choice_idx": 3, "direct_answers": ["amazon", "cell phone", "electronic store", "good", "verizon", "cell", "verizon", "best buy", "malls", "phone store"], "difficult_direct_answer": true, "rationales": ["Best buy sells phones and electronics.", "Best buy sells phone devices.", "The store is best buy."], "image": "train2014/COCO_train2014_000000336357.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 250054, "question_id": "hU99YLqrn2rTeq9gvJW9ZP", "question": "What is in front of the bus?", "choices": ["fox", "bicycle", "apple", "stage"], "correct_choice_idx": 1, "direct_answers": ["bus", "bike", "bike", "bicycle", "bicycle", "nothing", "bicycle", "bicycle", "bike", "bicycle"], "difficult_direct_answer": false, "rationales": ["The bike is in front.", "A two wheeled street bike is on the sidewalk.", "A double decker bus is on the street behind a two wheeled form of transportation with peddles."], "image": "train2014/COCO_train2014_000000250054.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416598, "question_id": "hUjnQZiDc3gP5qHF8nLcFP", "question": "What is the green object the man is serving pizza on?", "choices": ["cutting board", "pizza stone", "oven tray", "plate"], "correct_choice_idx": 0, "direct_answers": ["tray", "tray", "cutting board", "board", "cutting board", "cutting board", "green beans", "cutting board", "cutting board", "fefef"], "difficult_direct_answer": false, "rationales": ["The object is the cutting board.", "You cut pizza on this for serving.", "He is using a cutting board to serve."], "image": "train2014/COCO_train2014_000000416598.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315065, "question_id": "hWnRrwxWXEiJ4h3tCmkFFg", "question": "What rhymes with the name of the store and is found on the vehicle?", "choices": ["trunk", "hood", "door", "propeller"], "correct_choice_idx": 3, "direct_answers": ["troller", "propeller", "propeller", "propeller", "propeller", "propeller", "propeller", "propeller", "propeller", "propeller"], "difficult_direct_answer": false, "rationales": ["The only word that can be the possible answer is \"a\" when it comes to rhyming.", "The name stellar rhymes with propeller.", "The storefront in this image reads steller. this would rhyme with propeller."], "image": "train2014/COCO_train2014_000000315065.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387352, "question_id": "hXB94aatthrNzPNUFdAGix", "question": "What color are the edges of the towel hung on the towel rack next to the shower curtain?", "choices": ["blue", "red", "white", "black"], "correct_choice_idx": 1, "direct_answers": ["fascia", "pink", "red", "white", "green", "red", "red", "pink", "red", "silver"], "difficult_direct_answer": false, "rationales": ["The edges of the towel are close to blood colored.", "The edges of the towel are red.", "The towel is pink with red edges."], "image": "train2014/COCO_train2014_000000387352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 112573, "question_id": "hXRMJFn2NDSnnmr7ADxyNG", "question": "What word describes these animals best?", "choices": ["canine", "ursine", "equine", "bovine"], "correct_choice_idx": 1, "direct_answers": ["majestic", "dangerous", "bear", "bears", "bears", "wild", "grizzly brown", "fun", "ursine", "bears"], "difficult_direct_answer": false, "rationales": ["The bears are ursines.", "The animals are bears so i chose the word that is related to bears. the other words are related to cows, horses and dogs.", "The animals are bears, not cows, horses, or dogs."], "image": "val2014/COCO_val2014_000000112573.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393087, "question_id": "hXa9JyfecuJg7eVZKYyT6s", "question": "What material is the construction of this tower?", "choices": ["cobblestone", "wood", "brick", "metal"], "correct_choice_idx": 0, "direct_answers": ["stone", "rocks", "bricks", "stone", "chair", "cobblestone", "cementsand bricks", "bricks", "concrete", "stone"], "difficult_direct_answer": false, "rationales": ["The material is cobblestone.", "There is rough texture to the building. the building appears old so this is a material more common in older buildings.", "The tower is grey. it is not made out of brick, wood, or metal."], "image": "train2014/COCO_train2014_000000393087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 200653, "question_id": "hY4r7VHBgrKcdpgUkp5ne4", "question": "What do the people at the net make up as a collective?", "choices": ["ocelot", "quintet", "trio", "quartet"], "correct_choice_idx": 3, "direct_answers": ["quartet", "foursome", "team", "team", "tennis players", "team", "teams", "competitors", "team", "team"], "difficult_direct_answer": false, "rationales": ["There are four people at the net.", "There are 4 people shaking hands.", "The people standing by the net are two teams of tennis players which equals 4 in total."], "image": "train2014/COCO_train2014_000000200653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 425412, "question_id": "hY8G3xwvDqqs523GcQzk5y", "question": "What does the item on the yellow sign look like?", "choices": ["egg", "baby", "top hat", "monkey"], "correct_choice_idx": 2, "direct_answers": ["hat", "speed", "hat", "hate", "top hat", "road bump", "warning", "hat", "hat", "bump"], "difficult_direct_answer": false, "rationales": ["The item is a top hat.", "The object looks like it could contain a brim and a main hat section consistent with answer a. none of the other answers look similar to the object on the yellow sign.", "The design on the sign looks very close to the type of hats that were worn in the 19th century."], "image": "val2014/COCO_val2014_000000425412.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187384, "question_id": "hZ33D3u42Ce5CLsPr3pxEP", "question": "What color are the beads inside of the red bowl?", "choices": ["blue", "orange", "green", "red"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The beads in the bowl are this color.", "The objects are clearly visible and located based on the text of the question. their color is clearly identified.", "There are oranges near the red bowl. the beads do not match the oranges or the red bowl and are not green."], "image": "val2014/COCO_val2014_000000187384.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 485547, "question_id": "hZH72DLLhppMvxwACpddae", "question": "How many red kites are flying above the field with the people in it?", "choices": ["fourteen", "three", "two", "twelve"], "correct_choice_idx": 1, "direct_answers": ["two", "three", "two", "three", "four", "two", "two", "three", "two", "three"], "difficult_direct_answer": false, "rationales": ["There are three bright red kites being flown above the grassy field.", "There are a few kites in the air.", "There are three red kites being flown over the grass."], "image": "train2014/COCO_train2014_000000485547.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 184485, "question_id": "hZqijo4i2sLXm2zEpVUhDe", "question": "What country is this room most likely located in?", "choices": ["india", "germany", "united states", "japan"], "correct_choice_idx": 1, "direct_answers": ["berlin", "usa", "germany", "germany", "germany", "germany", "germany", "germany", "germany", "germany"], "difficult_direct_answer": false, "rationales": ["The book has the capital of the country.", "A stuffed animal bear, a single toothbrush and a book with \"berlin\" is on a table.", "Berlin is written on the book and that is in a city of this particular country."], "image": "val2014/COCO_val2014_000000184485.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258516, "question_id": "hbNyrmkEXH3Fq86iDMnSia", "question": "What color is the box containing an alcoholic beverage behind the cricket's cage?", "choices": ["red", "white", "blue", "orange"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "white", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["It's also yellow, which are logo colors.", "There is a bright red box behind the cage that contains an alcoholic beverage.", "The refrigerated box is red"], "image": "val2014/COCO_val2014_000000258516.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 212462, "question_id": "hbqKg59aVBxvfEaB9bNpoY", "question": "With which fruit would be most connected to Costa Rica?", "choices": ["orange", "banana", "pineapple", "apple"], "correct_choice_idx": 2, "direct_answers": ["pineapple", "bananas", "guava", "pineapple", "mangoes", "pineapple", "pineapple", "car", "bananas", "watching"], "difficult_direct_answer": false, "rationales": ["Pineapples come from tropical areas while oranges are from the us.", "The fruit is a pineapple.", "The pineapple was from that country."], "image": "val2014/COCO_val2014_000000212462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221893, "question_id": "hdxxQKTMfJwLaDvjWuDseQ", "question": "The bottle is related to what group of people?", "choices": ["pharaohs", "samurai", "roman legionnaires", "vikings"], "correct_choice_idx": 3, "direct_answers": ["avengers", "scandinavian", "vikings", "viking", "irish", "vikings", "beer drinkers", "wine drinkers", "people", "germans"], "difficult_direct_answer": true, "rationales": ["The bottle's label makes reference to tors hammer. thor was a hammer wielding god who ruled in norse mythology and one of the options are people from the scandinavian region.", "You can tell by the theme and the words on the bottle as to what type of people it is depicting.", "The first name tor refers to the norse god thor."], "image": "val2014/COCO_val2014_000000221893.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 546079, "question_id": "hevVYphtwNjsNyZsK56nk5", "question": "What is on display behind the glass on the checkered floor?", "choices": ["houses", "toilet seats", "toilet bowls", "chairs"], "correct_choice_idx": 1, "direct_answers": ["toilet seats", "toilet seats", "toilets", "toilets", "toilets", "toilet", "toilets", "toilets", "toilets", "toilets"], "difficult_direct_answer": false, "rationales": ["The area is similar to a bathroom. there is nothing special about the bowls, but the items on top of the bowls are fancy.", "The toilet seats are.", "There is some glitter covered toilet seats in a row down the checkered floor."], "image": "train2014/COCO_train2014_000000546079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43455, "question_id": "hgnutsWwKTwREmwisRQfeq", "question": "What design is on the little girl's hoodie?", "choices": ["stripes", "polka dots", "medusa heads", "stars"], "correct_choice_idx": 1, "direct_answers": ["play", "watching", "round", "nike", "polka dots", "dots", "nice", "dot", "polka dots", "polka dots"], "difficult_direct_answer": false, "rationales": ["There are dots around the jacket.", "There is a little girl with purple jacket and small pink circles dotted all over it. she is feeding the cows.", "The little girl's hoodie has pink dots"], "image": "train2014/COCO_train2014_000000043455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423630, "question_id": "hiSW3LCVCdCv6hc3dYGBDU", "question": "What is the sheep doing on the hay with its belly?", "choices": ["pooping", "walking", "sleeping", "eating"], "correct_choice_idx": 2, "direct_answers": ["stop", "laying", "laying", "sleeping", "resting", "eat", "resting", "laying down", "laying", "laying"], "difficult_direct_answer": false, "rationales": ["They are getting some food", "The sheep on the hay on its belly is relaxing..", "You can tell by the sheep's sitting position as to what it is doing."], "image": "train2014/COCO_train2014_000000423630.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492411, "question_id": "hiUNH3o9CFhS3AYKa2nsTa", "question": "What is near the elephant?", "choices": ["dog", "cow", "people", "cat"], "correct_choice_idx": 2, "direct_answers": ["people", "trees", "people", "fence", "fence", "people", "fence", "people", "plant", "person"], "difficult_direct_answer": false, "rationales": ["A common tourist experience in areas with elephants is to pay and take a guided elephant ride. people sit atop the elephant for the experinece.", "People are nearby.", "There elephant is walking on the path with people riding on its back."], "image": "train2014/COCO_train2014_000000492411.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 307567, "question_id": "hkFjV3LmrTk79Ef8QdQ3eN", "question": "The plane is painted what colors?", "choices": ["redwhite", "greenyellow", "blackgreen", "whiteblue"], "correct_choice_idx": 1, "direct_answers": ["green yellow", "green", "green", "green white", "green", "greenyellow", "green", "green", "green yellow", "green"], "difficult_direct_answer": false, "rationales": ["An aircraft with yellow and green is being loaded at an airport.", "The plane has green and yellow paint.", "These colors are visible on the plane."], "image": "train2014/COCO_train2014_000000307567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 506552, "question_id": "hmPXs4b5h5S6qSFXTqFuEE", "question": "Which Wii sport game must be played by the crowd of children in the lounge?", "choices": ["golf", "boxing", "tennis", "bowling"], "correct_choice_idx": 3, "direct_answers": ["bowling", "bowling", "balling", "bowling", "bowling", "bowling", "bowling", "bowling", "bowling", "bowling"], "difficult_direct_answer": false, "rationales": ["The game is bowling.", "On the television screen there are lanes and pins. the player is about to throw a ball at the pins.", "The people are playing the bowling wii sport game."], "image": "val2014/COCO_val2014_000000506552.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497855, "question_id": "hmyB5dJP4pfuXn6swNxJNh", "question": "What is the color of the fruit in the center of the pile?", "choices": ["orange", "purple", "blue", "red"], "correct_choice_idx": 0, "direct_answers": ["red", "red", "yellow", "red", "orange", "red", "red/yellow", "red", "red yellow", "red"], "difficult_direct_answer": false, "rationales": ["The fruits location relative to each other is identifiable and the fruit in the center has a color that is recognizable.", "The fruit is an orange.", "There are many oranges in a pile and a spotted yellow banana to the left."], "image": "val2014/COCO_val2014_000000497855.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37209, "question_id": "hnavWS66EAHPskqakZKkZ9", "question": "What color is the t-shirt worn by the man on a pedal bike in the background to the right?", "choices": ["green", "blue", "purple", "yellow"], "correct_choice_idx": 3, "direct_answers": ["brown", "yellow", "grey", "khaki", "yellow", "olive green", "green", "red", "grey", "green"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "A yellow. his shirt is yellow.", "The man is wearing a yellow t-shirt."], "image": "val2014/COCO_val2014_000000037209.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 338317, "question_id": "hpXQF2U6CHg4U3kZrYCWum", "question": "What is the woman on the phone clutching?", "choices": ["her baby", "brown bag", "barrel", "her leg"], "correct_choice_idx": 1, "direct_answers": ["brown bag", "paper bag", "calling", "brown bag", "paper bag", "brown bag", "paperbag", "paper bag", "brown sack", "bag"], "difficult_direct_answer": false, "rationales": ["You can see the bag in her other hand.", "She has a paper version of a sack normally used for food", "The woman is holding onto a brown bag next to a phone."], "image": "train2014/COCO_train2014_000000338317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 381529, "question_id": "hpm6tV29NxSprPJeAunjPg", "question": "What are both of the men wearing?", "choices": ["earphones", "masks", "crowns", "ties"], "correct_choice_idx": 3, "direct_answers": ["ties", "coat", "ties", "suits", "tuxedo", "suits", "suits", "suit", "suits", "suits"], "difficult_direct_answer": false, "rationales": ["Their heads, eyes, and faces are uncovered. they have clothing items near their necks.", "They are wearing formal suits, which are accompanied by an additional piece of fabric hanging from the neck, which is a tie.", "The men and their clothing are clearly visible and answer a is present on both of them and none of the other answers are."], "image": "val2014/COCO_val2014_000000381529.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539376, "question_id": "hsDpzjhUZaQLXhU9MmWftz", "question": "What color are the bodies of the sheep with white heads?", "choices": ["red", "white", "brown", "black"], "correct_choice_idx": 2, "direct_answers": ["black", "brown", "brown", "brown", "brown", "brown", "brown", "white", "stop", "brown"], "difficult_direct_answer": false, "rationales": ["The sheep have brown fur on their bodies.", "The bodies of these sheeps are brown.", "The color is brown."], "image": "train2014/COCO_train2014_000000539376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221470, "question_id": "hsNqyRyiRxERMBY7LZbD5G", "question": "What is the purpose of the circular platform?", "choices": ["batting practice", "pitching practice", "sliding practice", "catching practice"], "correct_choice_idx": 0, "direct_answers": ["on-deck circle", "strike zone", "batting circle", "batting", "to bat", "on deck", "batting practice", "court", "warm up", "warmup"], "difficult_direct_answer": true, "rationales": ["The purpose is for batting practice.", "There is a platform behind the umpire, catcher and batter. it is used to take swings with bat before coming to plate.", "The circular platform is meant to practice batting."], "image": "train2014/COCO_train2014_000000221470.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169575, "question_id": "ht43urZwBmFZaGTn8VThFS", "question": "What is the boat under?", "choices": ["airplanes", "seagulls", "balloons", "zeppelins"], "correct_choice_idx": 1, "direct_answers": ["sea", "sky", "clouds", "sky", "clouds", "sail", "water", "sky", "seagulls", "boat"], "difficult_direct_answer": false, "rationales": ["There are seagulls in the sky.", "There are birds in the sky in the sea.", "The boat is under the birds."], "image": "train2014/COCO_train2014_000000169575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578522, "question_id": "ht57fhuFQAAywBXWppVUj8", "question": "What type of tool do you need to move while in the object on top of the black car?", "choices": ["car keys", "sun glasses", "swimming trunks", "paddle"], "correct_choice_idx": 3, "direct_answers": ["car", "oar", "paddle", "oar", "car", "raft", "ropes", "paddle", "paddle", "paddles"], "difficult_direct_answer": false, "rationales": ["You will need a paddle to row the boat.", "A car is in the rode and has a boat strapped to the top. boats are often rowed with paddles or oars.", "This is a kayak that moves on water"], "image": "val2014/COCO_val2014_000000578522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530278, "question_id": "htxjC4A3MCtU83UjPSf2t9", "question": "What period does the vase drawing look like it represents?", "choices": ["ancient greece", "feudal japan", "enlightenment", "medieval europe"], "correct_choice_idx": 0, "direct_answers": ["ancient greece", "art", "4th dynasty", "ancient", "horse", "4000 bc", "ancient greek", "egyptian ancient", "ancient", "egyptian"], "difficult_direct_answer": true, "rationales": ["There is a chariot with horses", "The symbols are from that country.", "The pictures show some vines."], "image": "val2014/COCO_val2014_000000530278.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 217517, "question_id": "huHXmhAEUtKyTD2PJrnpUK", "question": "Which city is the scape most likely?", "choices": ["tokyo", "cairo", "beijing", "singapore"], "correct_choice_idx": 1, "direct_answers": ["cairo", "china", "america", "new york", "dallas", "los angeles", "industrial", "united states", "no clue", "dahlia"], "difficult_direct_answer": true, "rationales": ["The city is cairo.", "Honestly this could be any city listed here.", "The sun is bright and this is likely in cairo."], "image": "val2014/COCO_val2014_000000217517.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 228197, "question_id": "hvyCxLvndtn5pTiQYLoR9w", "question": "How many people can sit on the wooded item near the seated man?", "choices": ["three", "12", "seven", "16"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "one", "two", "three", "three", "four", "two", "three", "four"], "difficult_direct_answer": false, "rationales": ["Three people can sit on the bench.", "It looks like it can hold a small amount of people.", "There are arm bars so this many small people could fit"], "image": "val2014/COCO_val2014_000000228197.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480443, "question_id": "hwSJMq8cosjHV4qp2aAM6W", "question": "What is on top of the building?", "choices": ["cow", "gargoyle", "bird statue", "knight statue"], "correct_choice_idx": 2, "direct_answers": ["compass", "bird", "arrow", "weathervane", "bird", "bird", "bird", "dome", "weather vane", "bird statue"], "difficult_direct_answer": false, "rationales": ["A statue of a bird is on the top.", "There is a creature on top of the building that has wings and a beak.", "There is a figure of a flying animal above the weather vane."], "image": "train2014/COCO_train2014_000000480443.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 466491, "question_id": "hxoAG2VzouCxVqR6TNAPt6", "question": "What is next to the bed?", "choices": ["dog", "baby", "cat", "lamp"], "correct_choice_idx": 3, "direct_answers": ["lamp", "lamp", "lamp", "nightstand", "lamp", "table", "sleeping", "lamp", "lamp", "lamp"], "difficult_direct_answer": false, "rationales": ["A light is next to the bed.", "There is a dark light appliance sitting on right side of bed on the floor.", "The most prominent object next to the bed has the size, shape and general design of answer a as well as a visible light bulb."], "image": "train2014/COCO_train2014_000000466491.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 54088, "question_id": "hyBPXZjJ4CqbQ589GqPjap", "question": "What type of truck is being pictured in this image?", "choices": ["sixteen wheeler", "monster truck", "chevy truck", "tow truck"], "correct_choice_idx": 3, "direct_answers": ["tow", "tow truck", "tow truck", "tow truck", "tow", "tow truck", "tow truck", "range rover", "tow", "dwwwferf"], "difficult_direct_answer": false, "rationales": ["There is a car behind being towed", "The truck is towing.", "A truck is towing another car away."], "image": "val2014/COCO_val2014_000000054088.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304236, "question_id": "hziZdZkertckQ5fSFcx4hd", "question": "What kind of truck edition must this one be?", "choices": ["buyer", "standard", "special", "normal"], "correct_choice_idx": 2, "direct_answers": ["limited", "custom", "special", "car", "special", "special", "car", "custom", "custom special", "top"], "difficult_direct_answer": false, "rationales": ["This could be a special edition truck with decals.", "Orange metallic paint with interwoven designs with gold on top and shiny wheels.", "It is not common."], "image": "train2014/COCO_train2014_000000304236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 562916, "question_id": "i6WvkTcuid88drnM9whCYq", "question": "What is the man with the hat on struggling with?", "choices": ["shark", "fish", "octopus", "bull"], "correct_choice_idx": 3, "direct_answers": ["bull", "cow", "cow", "bull", "calf", "bull", "calf", "cow", "cow", "calf"], "difficult_direct_answer": false, "rationales": ["The man with the hat is struggling with a little bull.", "A man in cowboy attire is roping a calf in a ring.", "A cowboy has the animal with horns on its back. it is trying to rope its legs."], "image": "train2014/COCO_train2014_000000562916.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70192, "question_id": "i76SMZM9Q9xAFsVvWeZ8yi", "question": "What fruit is to the left?", "choices": ["banana", "grape", "apple", "orange"], "correct_choice_idx": 3, "direct_answers": ["oranges", "oranges", "orange", "oranges", "orange", "orange", "orange", "tomato", "orange", "oranges"], "difficult_direct_answer": false, "rationales": ["Oranges are in a container on the table.", "There are a bunch of oranges in the basket.", "Oranges are to the left."], "image": "val2014/COCO_val2014_000000070192.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 461408, "question_id": "i7C7RgzZzhpF9VtwUP5jGj", "question": "What is the red stuff covering exactly one half of this pizza?", "choices": ["olive", "cauliflower", "parmesan", "pepperoni"], "correct_choice_idx": 3, "direct_answers": ["peperoni", "cheese", "pepperoni", "meat", "pepperoni", "chees", "pepperoni", "pepperoni", "pepperoni", "sausages"], "difficult_direct_answer": false, "rationales": ["The reddish items on the pizza are pepperoni slices.", "The stuff is pepperoni.", "This meat is typical of the round size meat that is visible on the pizza. it is a meat typically found on pizzas."], "image": "val2014/COCO_val2014_000000461408.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 558018, "question_id": "i8cpPpsHZ24x5RH8pA52KC", "question": "What color is the small section of the bat near to its center and above its handle?", "choices": ["red", "white", "green", "purple"], "correct_choice_idx": 3, "direct_answers": ["purple", "purple", "purple", "purple", "purple", "blue", "black", "axe", "black", "purple"], "difficult_direct_answer": false, "rationales": ["The top of the bat is purple while the bottom is black.", "The small band of color on the bat is this shade.", "There is purple paint on the bat."], "image": "train2014/COCO_train2014_000000558018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 351036, "question_id": "i8eJtJXUDoMq6ba9qCgLqx", "question": "What city appears on the bottom of the skateboard?", "choices": ["paris", "london", "chicago", "new york"], "correct_choice_idx": 3, "direct_answers": ["new york", "zooyark", "new york", "zooyork", "zoo york", "new york", "zoo york", "newyork", "new york", "new york"], "difficult_direct_answer": false, "rationales": ["The words show off \"york\".", "You can read this city on the bottom of the board.", "The city is new york."], "image": "train2014/COCO_train2014_000000351036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324817, "question_id": "i9tMJzNBr3SfzwwCNwLA3o", "question": "What season is most likely?", "choices": ["spring", "summer", "autumn", "winter"], "correct_choice_idx": 1, "direct_answers": ["spring", "summer", "summer", "summer", "summer", "summer", "summer", "summer", "summer", "summer"], "difficult_direct_answer": false, "rationales": ["The season is summer.", "The season is summer.", "The trees have green leaves. the people are wearing short sleeved shirts."], "image": "train2014/COCO_train2014_000000324817.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556476, "question_id": "iAF72Q5hNdxPFpUDoVdW3e", "question": "What are the structures underneath the lampshade?", "choices": ["booth", "benches", "fire hydrants", "manholes"], "correct_choice_idx": 1, "direct_answers": ["poles", "bench", "bench", "benches", "pole", "benches", "bench", "bench", "benches", "poles"], "difficult_direct_answer": false, "rationales": ["The structures are benches.", "These are seating areas for people to rest as they walk through the city", "The long seats are near the lampposts to give pedestrians a place to rest."], "image": "train2014/COCO_train2014_000000556476.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554729, "question_id": "iApHwjbUdNt5TecgtpoPeJ", "question": "Where does the luggage on this belt come from?", "choices": ["bathroom", "airplane", "store", "home"], "correct_choice_idx": 1, "direct_answers": ["airplane", "airplane", "plane", "airplane", "left", "airplane", "plane", "airplane", "luggage sorting", "unloading"], "difficult_direct_answer": false, "rationales": ["The luggage is taken off the plane that landed.", "It's the only answer that make sense given this is a public terminal.", "The luggage belt has an advertisement for heathrow express. heathrow is the famous london airport."], "image": "train2014/COCO_train2014_000000554729.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 469435, "question_id": "iAzJCTgcSfuBXhbvEzjzwZ", "question": "What are the people holding?", "choices": ["cats", "apples", "pens", "pumpkins"], "correct_choice_idx": 2, "direct_answers": ["watching", "puzzle", "hand", "books", "pens", "reading", "book", "books", "books", "car"], "difficult_direct_answer": false, "rationales": ["Two women sit on a bench with one holding a crossword puzzle book and the other holding a book folded in half as well. both women also hold thin writing utensils that appear black in color.", "The people have pens.", "The people have pens."], "image": "train2014/COCO_train2014_000000469435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 211843, "question_id": "iBoskHbdb62iUE3hFs2UvP", "question": "What kind of event is this?", "choices": ["graduation ceremony", "party", "church gathering", "funeral"], "correct_choice_idx": 1, "direct_answers": ["wine tasting", "celebration", "party", "wine tasting", "party", "wine tasting", "party", "wine tasting", "party", "wine tasting"], "difficult_direct_answer": false, "rationales": ["People are being together with drinks.", "Many friends standing around a large table. they are drinking wine while one is pouring it into a glass.", "B would be more somber. c would normally have banners and related text. d could apply if it's some sort of church-sponsored wine social or tasting."], "image": "train2014/COCO_train2014_000000211843.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 175126, "question_id": "iCQJakpjaVH5ALMUuHLmVs", "question": "What are the people in the vicinity of?", "choices": ["comet", "tree", "bagel", "tundra"], "correct_choice_idx": 1, "direct_answers": ["tree", "bananas", "banana plant", "plants", "banana tree", "tree", "bananas", "bananas", "banana tree", "bananas"], "difficult_direct_answer": false, "rationales": ["The people are in a tropical area. there are trunks and leaves.", "Two men stand under the large leaves of a large tree.", "The billowing green leaves and wooden poles coming out of the ground tell us these men are near trees."], "image": "train2014/COCO_train2014_000000175126.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 16706, "question_id": "iD8Dym7u3EH6TXAobjyVA6", "question": "How many people are seen in this scene?", "choices": ["four", "one", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "two", "two", "car", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two people.", "There are two people behind the fence.", "There are 2."], "image": "train2014/COCO_train2014_000000016706.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 55868, "question_id": "iDuS579skBsSkL8ZwS66Um", "question": "What is the name of the large bird?", "choices": ["flamingo", "seahawk", "stork", "pelican"], "correct_choice_idx": 2, "direct_answers": ["pelican", "pelican", "pelican", "pelican", "egret", "pelican", "pelican", "seagull", "dwdwdw", "stork"], "difficult_direct_answer": false, "rationales": ["The bird has a long, pointed beak.", "The bird is a stork.", "The bird shown has the color and beak of a pelican. also, pelicans hang around water and boats looking for fish to eat."], "image": "val2014/COCO_val2014_000000055868.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 257972, "question_id": "iEBeXz5iH22NToT7rCRtKr", "question": "What region is this most likely?", "choices": ["new jersey", "siberia", "russia", "east africa"], "correct_choice_idx": 3, "direct_answers": ["savannah", "africa", "africa", "africa", "africa", "east africa", "africa", "jungle", "africa", "african savannah"], "difficult_direct_answer": false, "rationales": ["East africa has wild animals.", "There are zebras and giraffes in this region.", "This is the area where giraffe and zebras live naturally."], "image": "train2014/COCO_train2014_000000257972.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 78986, "question_id": "iFh2h7mx79sqqfRahgxuE4", "question": "What laptop brand is being advertised?", "choices": ["dell", "asus", "hp", "lenovo"], "correct_choice_idx": 2, "direct_answers": ["hp", "hp", "hp", "dell", "good", "hp", "hp", "hewitt packard", "hp", "hp"], "difficult_direct_answer": false, "rationales": ["The brand is hp.", "The laptop is hp.", "A tennis court is shown with banners visible behind the court with the hp logo."], "image": "train2014/COCO_train2014_000000078986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245562, "question_id": "iGEVgxRaaE8XoAxgDfov9t", "question": "What is the giraffe in the foreground doing?", "choices": ["jumping", "sitting", "running", "eating grass"], "correct_choice_idx": 1, "direct_answers": ["laying down", "sitting", "sitting", "sitting", "sitting", "laying down", "laying down", "resting", "laying down", "laying down"], "difficult_direct_answer": false, "rationales": ["The animal is resting on the ground.", "The giraffe is on the ground with its head up.", "The giraffe in the foreground is sitting down. its legs are folded under."], "image": "train2014/COCO_train2014_000000245562.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 285963, "question_id": "iJTsNwsqed8JD75xq5tLNZ", "question": "What color are the shin guards for the police horses in the parade?", "choices": ["yellow", "red", "white", "blue"], "correct_choice_idx": 0, "direct_answers": ["brown", "yellow", "yellow", "gray", "yellow", "yellow", "yellow", "white red", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The horses in the parade are wearing shin guards that are bright yellow.", "The color is yellow.", "The horses are wearing yellow shin guards."], "image": "train2014/COCO_train2014_000000285963.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 424367, "question_id": "iLvM6czKR5fuq7LVuwCz27", "question": "What item is long here?", "choices": ["whiskers", "snake", "antlers", "hose"], "correct_choice_idx": 0, "direct_answers": ["paw", "phone", "phone", "remote", "arm", "cellphone", "phone", "rug", "whiskers", "stapler"], "difficult_direct_answer": false, "rationales": ["The cat sitting on the couch has long whiskers in its cheeks.", "They are longer on the nose", "The item is whiskers."], "image": "train2014/COCO_train2014_000000424367.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 51504, "question_id": "iMSpx5jXYgUuvCtHgwrJzr", "question": "What is the elephant near?", "choices": ["baby", "umbrella", "cow", "antelope"], "correct_choice_idx": 1, "direct_answers": ["man", "umbrella", "man", "wooden fence", "watching", "fence", "person", "tourist", "fence", "stop"], "difficult_direct_answer": false, "rationales": ["An elephant is enclosed in a fence. there is a pink object that protects people from the sun.", "A large red, round umbrella is near a large gray animal with a long trunk.", "There is a red umbrella that he is near."], "image": "train2014/COCO_train2014_000000051504.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202926, "question_id": "iNAuBxZRdXA5zUKPDV9jKg", "question": "What kind of birds are the majority of these at the ocean?", "choices": ["pelicans", "penguins", "gulls", "puffins"], "correct_choice_idx": 0, "direct_answers": ["pelicans", "pelicans", "seagulls", "seagulls", "pelican", "pelicans", "pelicans", "pelicans", "pelicans", "seagull"], "difficult_direct_answer": false, "rationales": ["There are pelicans.", "The long bills and stout bodies of these birds identify them as pelicans.", "The seagulls are outnumbered by the other birds."], "image": "val2014/COCO_val2014_000000202926.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 213117, "question_id": "iQvvBobbc6x7JkLzb7yt4j", "question": "What color is the logo on the sides of the shoes worn by the baseball batter?", "choices": ["black", "green", "red", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "red", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["Each shoe has a nike swoosh logo. it does not match the green grass and is not black or red.", "Thought he shoe is black and red the nike logo is white.", "The logo is mostly in white."], "image": "train2014/COCO_train2014_000000213117.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 515937, "question_id": "iREBrHzNvhxaKhp4Y8pauP", "question": "What color is the parka down jacket worn by the man in the left?", "choices": ["blue", "white", "orange", "green"], "correct_choice_idx": 2, "direct_answers": ["red", "orange", "red", "red", "maroon", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["It is a bright color that is almost red.", "It is brighter than the other clothing items and almost red", "There is a man bundled in a bright orange jacket. he and another person are standing with skii on a snowy slope."], "image": "train2014/COCO_train2014_000000515937.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305363, "question_id": "iRrfY74HxugcpSZuxrAymL", "question": "How many horses are countable on the beach?", "choices": ["four", "two", "three", "five"], "correct_choice_idx": 3, "direct_answers": ["five", "five", "five", "three", "five", "five", "three", "three", "four", "three"], "difficult_direct_answer": false, "rationales": ["There are five horses.", "They are easy to see.", "Five horses are visible in the photo."], "image": "train2014/COCO_train2014_000000305363.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385786, "question_id": "iUbs3KUSqhW9HQwHPEDrVN", "question": "What color is the suitcase underneath of the window with white curtains?", "choices": ["black", "green", "red", "blue"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red", "red", "red", "white", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["Because it also contains white visible clothes in it.", "The object is clearly visible and identifiable based on is size, shape, and the text of the question. the color is clear.", "The suitcase is red."], "image": "val2014/COCO_val2014_000000385786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 246041, "question_id": "iVNfkoHqsB6oKELjxwj5dP", "question": "How many portraits are hung above the fireplace mantle?", "choices": ["three", "four", "one", "two"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is a single portrait hung above the fireplace mantle.", "There is 1.", "And it might not be a true portrait. it may be an artist's dream."], "image": "train2014/COCO_train2014_000000246041.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352191, "question_id": "iVmyqBLjDjZjiYTAqLUeZt", "question": "What is the zebra on the right doing in the field?", "choices": ["eating", "drinking", "pointing", "walking"], "correct_choice_idx": 2, "direct_answers": ["four", "looking right", "standing", "pointing", "looking", "standing", "looking back", "standing", "looking", "staring"], "difficult_direct_answer": false, "rationales": ["The zebra is pointing it's head toward the camera.", "The zebra on the right is pointing its nose towards something it sees.", "The animal is visibly upright and on his legs. of the actions described, answer b is most consistent with this posture."], "image": "train2014/COCO_train2014_000000352191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 186175, "question_id": "iWZvAj2jSTPwhmf7EwKLPj", "question": "How many exhaust pipes extend out the sides of the big semi truck above?", "choices": ["five", "seven", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "two", "one", "one", "two", "two", "two", "one", "one"], "difficult_direct_answer": false, "rationales": ["One on either side of the cab.", "One exhaust pipe is on the right side of the truck and one on the left.", "There are two extending pipes."], "image": "val2014/COCO_val2014_000000186175.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43455, "question_id": "iY9tU4xjbxLKM6tGTmmXRs", "question": "What is near the cows?", "choices": ["cat", "bodybuilder", "little girl", "apple"], "correct_choice_idx": 2, "direct_answers": ["little girl", "children", "girl", "little girl", "watching", "girl", "girl", "girl", "girl", "kid"], "difficult_direct_answer": false, "rationales": ["A female person is near the cows. she is not an adult.", "The little girl is small next to the cows.", "The child is dressed the way a female child would be dressed."], "image": "train2014/COCO_train2014_000000043455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189957, "question_id": "iYDoVkKEAnqPMYZ68crySZ", "question": "What color are the serving bowls for the noodles at this dinner?", "choices": ["blue", "green", "orange", "pink"], "correct_choice_idx": 3, "direct_answers": ["white", "white", "white", "purple", "white", "white", "white", "pink", "white", "pink"], "difficult_direct_answer": false, "rationales": ["The bowls are colored pink.", "There are only two pink bowels on the floral table. there are many other plates of food and some drinks.", "Because each pink bowl is filled with a proportion of food."], "image": "train2014/COCO_train2014_000000189957.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480797, "question_id": "iZBnsXXyZfhgYNYTQyGXAb", "question": "What kind of vessel is that?", "choices": ["cruise ship", "fishermans boat", "yacht", "canoe"], "correct_choice_idx": 0, "direct_answers": ["cruise ship", "ship", "ship", "cruise ship", "nothing", "cruise ship", "ship", "ship", "bike", "cruise ship"], "difficult_direct_answer": false, "rationales": ["The large vessel in the water is a cruise ship that people travel on vacation with,.", "A large commercial passenger ship is pulling up to dock.", "There is a large boat in the water with smaller vessels in front of it."], "image": "train2014/COCO_train2014_000000480797.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 53335, "question_id": "iZD2XfeZequB9qZqoimW4g", "question": "What would the girl with the orange-colored hair be called?", "choices": ["brown", "blonde", "noirette", "redhead"], "correct_choice_idx": 3, "direct_answers": ["redhead", "blonde", "cycle", "cyclist", "female", "white", "woman", "jamie", "ginger", "redhead"], "difficult_direct_answer": true, "rationales": ["A woman with orange hair could be called a redhead.", "The term in option a is used to described people with orange-colored hair.", "The girl with the orange colored hair is a redhead."], "image": "train2014/COCO_train2014_000000053335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 159823, "question_id": "iZhJMUn982LGX5BBv7iPL2", "question": "Where is this meal being eaten?", "choices": ["restaurant", "home", "office", "park"], "correct_choice_idx": 3, "direct_answers": ["park", "outdoors", "park", "cake", "outside", "park", "outdoors", "outside", "outdoors", "cake"], "difficult_direct_answer": false, "rationales": ["The food is on a table. the table is outside, not inside a home, office, or restaurant.", "They are outside and other people are sitting in the grass of a partially wooded area", "The desert is being eaten in an outside public grassy area. there is a lot of greenery around and people sitting in the grass enjoying the sunshine."], "image": "train2014/COCO_train2014_000000159823.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 336966, "question_id": "iaFdHY9tmkvCbS5nitpTed", "question": "How many different airline companies are represented by the planes?", "choices": ["two", "three", "one", "four"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "four", "us expenses"], "difficult_direct_answer": false, "rationales": ["All of the planes have the same us airways express livery.", "There is one company.", "Only one airline is here."], "image": "train2014/COCO_train2014_000000336966.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 357219, "question_id": "iaNxzL2rmswqkSZJrBXiqf", "question": "What is walking in the tree?", "choices": ["leopards", "bears", "cats", "dogs"], "correct_choice_idx": 1, "direct_answers": ["bears", "bear", "bears", "bears", "bears", "bears", "bears", "bears", "bear", "bear"], "difficult_direct_answer": false, "rationales": ["The animals in question are the right color, shape and size to be consistent with answer a.", "The animals all belong to the family ursidae.", "These are large hunched animals that can also walk"], "image": "val2014/COCO_val2014_000000357219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 564545, "question_id": "ibki9Nvmkp8FFm8uamDw6f", "question": "What are the birds in front of?", "choices": ["baby", "car", "cow", "house"], "correct_choice_idx": 3, "direct_answers": ["water", "house", "lighthouse", "water", "ocean", "sea", "crane", "kingfisher", "lighthouse", "beach"], "difficult_direct_answer": false, "rationales": ["The birds are in front of a large house that sits next to the water.", "The birds are by a house.", "There is a house in the water."], "image": "train2014/COCO_train2014_000000564545.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 344280, "question_id": "iebbKqYQxVxxXgD385HkiD", "question": "What is next to the lamppost?", "choices": ["goat", "baby", "cow", "clock"], "correct_choice_idx": 3, "direct_answers": ["human", "clock", "shop", "buildings", "street", "clock", "clock", "person", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["A tall lamp post is on a busy city street and a round, numbered object with hands is nearby.", "The clock is near.", "There is a clock next to the lamp post."], "image": "train2014/COCO_train2014_000000344280.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 127316, "question_id": "ifzPkvVULBiiC8NGwEB3PL", "question": "How many hot dogs are on the tabletop on top of white paper?", "choices": ["three", "one", "two", "four"], "correct_choice_idx": 2, "direct_answers": ["two", "two", "2 dogs", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are 2.", "There are two hot dogs in two buns.", "There are two hot dogs."], "image": "train2014/COCO_train2014_000000127316.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 394583, "question_id": "igWsJXR7zDWPCJMDpWWSJR", "question": "What color is the jacket of the man who is driving down the road looking at his cell phone?", "choices": ["yellow", "red", "blue", "green"], "correct_choice_idx": 3, "direct_answers": ["green", "green", "green", "grey", "green", "olive green", "green", "green", "green", "gray"], "difficult_direct_answer": false, "rationales": ["The man is identifiable based on the text of the question and his clothing colors are clearly visible.", "A moss a specifically or possible a taupe.", "The color is green."], "image": "train2014/COCO_train2014_000000394583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 414772, "question_id": "igjJcLtUYECpCU69cd2R6f", "question": "What holiday is associated with the largest plate of meat on the table?", "choices": ["thanksgiving", "new years", "christmas", "halloween"], "correct_choice_idx": 0, "direct_answers": ["peaceful", "thanksgiving", "thanksgiving", "chicken", "thanksgiving", "thanksgiving", "thanksgiving", "thanksgiving", "sunday", "thanksgiving"], "difficult_direct_answer": false, "rationales": ["Turkey is served at thanksgiving.", "Turkey is common for this holiday", "The big turkey signifies thanksgiving."], "image": "train2014/COCO_train2014_000000414772.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556892, "question_id": "iiawHBwu6esayYHBncbMwR", "question": "What is the object on top of the surfboard?", "choices": ["goggles", "sunglasses", "surfboard leash", "headband"], "correct_choice_idx": 2, "direct_answers": ["strap", "cord", "cord", "surfboard leash", "blue", "leash", "sea", "cap", "ankle strap", "strap"], "difficult_direct_answer": false, "rationales": ["It is used to keep the man from falling off.", "The object is a leash.", "Surfboards are equipted with these. it has a characteristic cord and cuff to go around the ankle so the board is not lost."], "image": "val2014/COCO_val2014_000000556892.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304391, "question_id": "iifZmou2TjUefi56dEauqz", "question": "Which appliance sits right next to the refrigerator?", "choices": ["washing machine", "oven", "dishwasher", "sink"], "correct_choice_idx": 0, "direct_answers": ["washing machine", "washer", "washer", "washing machine", "clothes washer", "washing machine", "washer", "washer", "table", "oven"], "difficult_direct_answer": false, "rationales": ["Though not normal in the states, in europe it is normal for the washer to be in the kitchen.", "There is a washing machine by the fridge.", "The small white appliance next to the refrigerator is a washing machine for cleaning clothes."], "image": "train2014/COCO_train2014_000000304391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 28378, "question_id": "ijs87THVoHm6AZMKWZ66Aa", "question": "What color is the top of the speed boat that is closest to the corner of the dock?", "choices": ["orange", "blue", "white", "tan"], "correct_choice_idx": 1, "direct_answers": ["blue", "white", "white", "brown", "blue", "white", "white", "blue", "white", "blue"], "difficult_direct_answer": false, "rationales": ["The top of the speedboat is not white, tan, or orange.", "The nearest fast vessel is the color of the sky.", "It is a brighter shade of the same color as the water"], "image": "val2014/COCO_val2014_000000028378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218482, "question_id": "ik4yhhEoSDHeqJwigvsCSq", "question": "What is the big item in the far back right?", "choices": ["baseball stadium", "statue", "elephant", "van"], "correct_choice_idx": 3, "direct_answers": ["jet bridge", "van", "van", "flight", "weighing scale", "bus", "van", "jet bridge", "bridge", "bus"], "difficult_direct_answer": false, "rationales": ["The vehicle transports large groups of people or objects.", "The bulky car with extra seating on the right is a van.", "The largest object in the rear is a white caravan."], "image": "train2014/COCO_train2014_000000218482.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 240379, "question_id": "ik7p7b69VNhxdMDzaTY2n4", "question": "How many people are wearing red coats on this part of the ski range?", "choices": ["four", "five", "three", "two"], "correct_choice_idx": 0, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["Coats are many different colors. besides the four red coats, black, orange and white coats can be seen on the ski range.", "There are four people wearing red coats.", "There are four people wearing red coats."], "image": "val2014/COCO_val2014_000000240379.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 283945, "question_id": "inLnr5L5mqFmdmYQbBzY9N", "question": "What color is the leftmost truck?", "choices": ["yellow", "green", "purple", "blue"], "correct_choice_idx": 0, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "left", "yellow", "yellow", "reddish", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["Of the trucks lined up, the one furthest left is clearly visible and its color is answer a.", "The truck on the left is yellow.", "The color is easily identifiable by observation. it is a bright color and easily visible when contrasted against the light background."], "image": "val2014/COCO_val2014_000000283945.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 491921, "question_id": "ip4MiHMfXhJfB77XzHUXYi", "question": "Who is the man that appears to be taller?", "choices": ["dusty rhodes", "tony clark", "wayne gretzky", "papa shango"], "correct_choice_idx": 1, "direct_answers": ["red one", "base player", "team player", "player", "catcher", "player", "front", "baseman", "tony clark", "keeper"], "difficult_direct_answer": true, "rationales": ["The man is apparently that guy.", "These people are playing baseball, not hockey or wrestling.", "Tony clark plays for the team that wears red."], "image": "val2014/COCO_val2014_000000491921.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79841, "question_id": "iqKqUvuukQN6QBNdhSQzcY", "question": "What color is the jacket worn by the cycler in the right side photo?", "choices": ["green", "blue", "yellow", "red"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The jacket is not blue, green, or red.", "You can see the color of the jacket.", "The person in the front is wearing this bright colored top."], "image": "val2014/COCO_val2014_000000079841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492975, "question_id": "iqbZkQqPvhq3jeJGbKyoC2", "question": "What Christmas character are these people all dressed up as?", "choices": ["grinch", "santa clause", "elves", "reindeer"], "correct_choice_idx": 1, "direct_answers": ["santa claus", "santa", "santa", "santa clause", "santa", "christmas character", "santa claus", "santa clause", "santa", "santa clause"], "difficult_direct_answer": false, "rationales": ["They have red suits with white fur and beards", "Santa wears a red jacket and pants with a white beard and hat - so they are all dressed like him.", "Santa's outfit is a red suit with white trim."], "image": "train2014/COCO_train2014_000000492975.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 31462, "question_id": "iqbjCFoCG8PYKN7Hps67Fz", "question": "Who does the person in the jeans look most similar to?", "choices": ["jonathan pryce", "sandra oh", "tiger woods", "idris elba"], "correct_choice_idx": 0, "direct_answers": ["grandpa", "actor", "wierd al", "bernie sanders", "grandparent", "fonzie", "man", "rod stewart", "jonathan pryce", "mick jagger"], "difficult_direct_answer": true, "rationales": ["The person is like jonathan.", "The man looks similar to pryce.", "They have similar faces."], "image": "train2014/COCO_train2014_000000031462.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 120734, "question_id": "ir5N6aqxPF5nLPWhzsQHoR", "question": "What material are the horses laying down in?", "choices": ["dirt", "grass", "hay", "sand"], "correct_choice_idx": 0, "direct_answers": ["dirt", "soil", "mud", "mud", "mud", "sod", "dirt", "dirt", "dirt", "soil"], "difficult_direct_answer": false, "rationales": ["The horses are laying down on muddy looking ground.", "The horses are laying on dirt.", "The substance is outside and colored brown, the color of the earth that it came from. you can see small plant matter mixed in which is common with this kind of substance."], "image": "train2014/COCO_train2014_000000120734.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 109145, "question_id": "irB3EYEdpZBLcyCNfoZCMv", "question": "What is the item in the left corner?", "choices": ["clock", "mixer", "food scale", "timer"], "correct_choice_idx": 2, "direct_answers": ["food scale", "scale", "scale", "cup", "clock", "scale", "weight scale", "cup", "scale", "clock"], "difficult_direct_answer": false, "rationales": ["The cabinets suggest we are in the kitchen which is where this type of device is found. you can see that it measures weight of small objects by its small size and number on its face.", "It has a dial on the face of it and a platform on the top to place items", "People keep these in the kitchen to weigh food."], "image": "train2014/COCO_train2014_000000109145.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 40723, "question_id": "iryyDTsWs6i3Pyy3J5wHMS", "question": "What is resting on the sand?", "choices": ["dogs", "old man", "cows", "boats"], "correct_choice_idx": 3, "direct_answers": ["boats", "boats", "boats", "boats", "boat", "boats", "boats", "five boats", "boats", "boat"], "difficult_direct_answer": false, "rationales": ["This type of vehicle is found near the sea and are recognizable by their metal material and concave shape.", "There are boats by the water.", "There is a row of canoes, a lightweight water vessel pointed at both ends, sitting in the sand."], "image": "train2014/COCO_train2014_000000040723.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10478, "question_id": "isCa7RJbViGDsyFEoF62ei", "question": "What are the items in the bowl ingredients for?", "choices": ["cherry cheesecake", "hot dog", "cheeseburger", "apple pie"], "correct_choice_idx": 3, "direct_answers": ["apple pie", "juice", "apples", "pie", "pie", "apple pie", "apple", "pie", "apple pie", "apple pie"], "difficult_direct_answer": false, "rationales": ["The apples are used for pie.", "The stuff is for apple pie.", "Of the listed items, apple pie is the only item which contains apples."], "image": "train2014/COCO_train2014_000000010478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385337, "question_id": "isniJAbfWLbpnZecJcKpvX", "question": "How many baseball players are here with red jerseys?", "choices": ["five", "four", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "wo", "two", "two", "two", "there", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["The baseball players are clearly visible and countable.", "There are two players in red.", "There are two baseball players in jerseys, and both jerseys are red."], "image": "train2014/COCO_train2014_000000385337.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433683, "question_id": "iuBE8pUdwZdWVt8oS5cnHc", "question": "What area of the world is this?", "choices": ["colorado", "new york", "florida", "canada"], "correct_choice_idx": 1, "direct_answers": ["fefef", "america", "nyc", "asia", "new york", "new york", "new york", "usa", "new york", "new york"], "difficult_direct_answer": false, "rationales": ["The statue of liberty visible here identifies this picture as new york city.", "The area is new york.", "A body of water is in front of a skyline with the statue of liberty in the distance."], "image": "train2014/COCO_train2014_000000433683.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80651, "question_id": "iuQZWAsyH7pRcs2tunHeWi", "question": "What is the woman holding in her hand?", "choices": ["kitten", "bottle", "egg", "puppy"], "correct_choice_idx": 1, "direct_answers": ["drink", "cell phone", "bottle", "bottle", "cell phone", "bottle", "bottle", "bottle", "bottle", "bottle"], "difficult_direct_answer": false, "rationales": ["The woman talking on the phone is holding a bottle in her hand.", "One can make out the shape of the beverage container in her hand.", "The woman has a bottle."], "image": "val2014/COCO_val2014_000000080651.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 22478, "question_id": "iuobz3v6u9U2Ey7zSsFDbc", "question": "What is in front of the couch?", "choices": ["dog", "crate", "baby", "seashell"], "correct_choice_idx": 1, "direct_answers": ["television", "television", "milk crate", "tv", "crates", "television", "crate", "crate", "tv", "crate"], "difficult_direct_answer": false, "rationales": ["It is square and appears to be stacked which is typical of people without a lot of furniture.", "The television is clearly visible based on its defining size, shape and design. the objects orientated in front of it are clear and identifiable by their size, shape and style.", "The couch has a crate."], "image": "train2014/COCO_train2014_000000022478.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337011, "question_id": "ivQNUXb3YtV64F8spgeMAc", "question": "What color is the frame of the girl's bike painted out to be?", "choices": ["blue", "pink", "purple", "yellow"], "correct_choice_idx": 1, "direct_answers": ["pink", "pink", "pink", "pink", "pink", "pink", "pink", "rose", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["Although the wheel covers are silver, the rest of this bicycle is painted pink.", "The color is pink.", "The bike is painted pink"], "image": "val2014/COCO_val2014_000000337011.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 539404, "question_id": "ivzuBfv35rC2rZq8pcBPP9", "question": "What is behind the white door?", "choices": ["bathroom", "pantry", "bedroom", "hall closet"], "correct_choice_idx": 1, "direct_answers": ["hallway", "yard", "garage", "microwave", "room", "closet", "pantry", "pantry", "pantry", "pantry"], "difficult_direct_answer": false, "rationales": ["It looks like a closet in the kitchen.", "A storage for food would make sense right off the kitchen.", "A door is visible in a kitchen."], "image": "train2014/COCO_train2014_000000539404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 537027, "question_id": "iwd65TirFEodAHpvyzEXoq", "question": "What variety of phone is used by the man with the drink?", "choices": ["blackberry", "iphone", "flip phone", "smart phone"], "correct_choice_idx": 2, "direct_answers": ["flip", "flip", "flip phone", "flip-phone", "flip", "flip phone", "flip phone", "flip", "typing", "flip phone"], "difficult_direct_answer": false, "rationales": ["The flip phone is used.", "You can tell by the design and the antenna as to what type of older phone he has.", "A the flip phone. it obviously flips open."], "image": "val2014/COCO_val2014_000000537027.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 492471, "question_id": "ixFszXYtP3hSCJ4qQt7pfb", "question": "What color is the border of the sail on the small boat?", "choices": ["yellow", "red", "blue", "green"], "correct_choice_idx": 2, "direct_answers": ["blue", "white", "blue", "blue", "blue", "boat", "car", "black", "blue", "dark blue"], "difficult_direct_answer": false, "rationales": ["The border of the sail is blue colored.", "A small boat is in the water with a sail that is white and blue.", "The leftmost sail in this image has a blue outline."], "image": "train2014/COCO_train2014_000000492471.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 411321, "question_id": "iyHQwg4gD2eTGYYiZ2ifeu", "question": "How are the objects on the front of the fridge sticking?", "choices": ["magnets", "super glue", "magic", "tape"], "correct_choice_idx": 0, "direct_answers": ["nice", "magnets", "magnets", "magnets", "magnets", "glued", "magnets", "attractive", "magnets", "paper"], "difficult_direct_answer": false, "rationales": ["The objects are attached to magnets.", "The items on the fridge are attached by the force used by the item identified in option a.", "The fridge is made of metal and this type of object is attracted to metal. the force is strong enough to hold up a piece of paper."], "image": "train2014/COCO_train2014_000000411321.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296162, "question_id": "iyVkaf339pttrDhC35jW2E", "question": "What are the cows traveling around?", "choices": ["scarecrow", "statue", "man", "tree"], "correct_choice_idx": 3, "direct_answers": ["field", "tree", "tree", "sfsfss", "tree", "pasture", "tree", "tree", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["The cows are traveling around the tree.", "The cows are around a woody organism with leaves. option a matches the item.", "There is a large plant that is rooted in the ground. it has limbs extended from it with leaves on them."], "image": "train2014/COCO_train2014_000000296162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 296162, "question_id": "iyehMrNcS8wvFB8NTaSopn", "question": "What is the number of cows gathered around the tree in the middle of the field with yellow flowers?", "choices": ["four", "five", "six", "seven"], "correct_choice_idx": 2, "direct_answers": ["six", "car", "six", "many", "six", "six", "five", "six", "six", "fssf"], "difficult_direct_answer": false, "rationales": ["There are four cows to the left of the tree. two cows are to the right of the tree.", "There are 5 seen clearly and then one is behind the group of three on the left", "There are six cows huddled near a tree."], "image": "train2014/COCO_train2014_000000296162.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294284, "question_id": "iyrun63BG8aPAh53oUP8ug", "question": "What direction are the elephants headed?", "choices": ["east", "north", "west", "south"], "correct_choice_idx": 0, "direct_answers": ["forward", "right", "left", "right", "east", "right", "right", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["The elephants are going east.", "This direction is to the right.", "You can tell by the shadow of the sun of the elephants. it is past mid day and the shadow points in the direction opposite to movement."], "image": "val2014/COCO_val2014_000000294284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396317, "question_id": "izowhwvFCeqQrpZY3zaMSX", "question": "What are the zebras looking at on the grass?", "choices": ["strangers", "food", "mountains", "friends"], "correct_choice_idx": 1, "direct_answers": ["food", "field", "food", "each other", "sun", "no idea", "food", "food", "grass", "pasture"], "difficult_direct_answer": false, "rationales": ["The zebras are looking down at the grass for food.", "Zebras are herbivores and feed on the grass.", "The zebras want food."], "image": "val2014/COCO_val2014_000000396317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 508725, "question_id": "izqhrtxs9AtHheFRbqHz3U", "question": "What is in front of the building?", "choices": ["cow", "horse", "baby", "bus"], "correct_choice_idx": 3, "direct_answers": ["train car", "train", "train", "trolley", "train", "bus", "trolly bus", "train car", "train", "bus"], "difficult_direct_answer": false, "rationales": ["There is a trolley bus in front of the building.", "The front is a bus.", "There is a small red bus in front of the building."], "image": "train2014/COCO_train2014_000000508725.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270612, "question_id": "izy6ExTF4EbSmv6KC8Fy43", "question": "How many letters are extending down the sign attached to the pole?", "choices": ["four", "five", "three", "two"], "correct_choice_idx": 1, "direct_answers": ["three", "five", "five", "five", "five", "five", "five", "five", "five", "five"], "difficult_direct_answer": false, "rationales": ["The sign is for a motel.", "The sign attached to the pole says motel.", "There are that many letters on the sign in the word motel."], "image": "train2014/COCO_train2014_000000270612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 183653, "question_id": "j2DYJUvaAKo2wz4RrcVFvM", "question": "What color are the croc shoes on the bag on the floor?", "choices": ["blue", "gray", "black", "pink"], "correct_choice_idx": 3, "direct_answers": ["red", "pink", "pink", "pink", "pink", "hot pink", "pink", "pink", "pink", "pink"], "difficult_direct_answer": false, "rationales": ["The croc shoes on the floor are bright pink.", "The shoes are pink. the shoes and bag match.", "The color is easily identifiable by observation. it is a bright color and easily visible when contrasted against the dark wood background."], "image": "train2014/COCO_train2014_000000183653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65969, "question_id": "j52H7SDsJhar26mStiVvN4", "question": "What liquid is most likely in the glass on the right?", "choices": ["ketchup", "mustard", "beer", "water"], "correct_choice_idx": 2, "direct_answers": ["beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["Glass on the right has brewery on the glass itself.", "The glass is named after a brewery.", "The glass contains a drink, not a condiment. brewery is on the side of the glass."], "image": "val2014/COCO_val2014_000000065969.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311385, "question_id": "j8bfNSNNfbwbNmSTHvNvFX", "question": "What is the food covered in to make it orange?", "choices": ["spit", "gasoline", "sauce", "soda"], "correct_choice_idx": 2, "direct_answers": ["sauce", "sauce", "orange sauce", "sugar", "sauce", "sauce", "sauce", "juice", "beans", "sauce"], "difficult_direct_answer": false, "rationales": ["The food has sauce.", "It is typical to cover food with this type of liquid to give flavoring. it is orange chicken which uses a glaze. you can see the shiny, orange glaze.", "This is an asian dish with a teriyaki dressing."], "image": "val2014/COCO_val2014_000000311385.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222664, "question_id": "j98aVWgeYdE6HT6zWcxA8X", "question": "What happened to the pizza?", "choices": ["thrown away", "eaten", "evaporated", "disintegrated"], "correct_choice_idx": 1, "direct_answers": ["eaten", "eaten", "eaten", "eaten", "cutting", "ate all", "eaten", "finished", "eaten", "eaten"], "difficult_direct_answer": false, "rationales": ["Empty, greasy pizza boxes with a stray knife and one piece of pizza are open.", "The pizza has been eaten.", "There is one slice of pizza left. it would be wasteful to throw away a pizza, and pizzas cannot evaporate or disintegrate."], "image": "train2014/COCO_train2014_000000222664.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 456825, "question_id": "jBu75sfrHagkyrGnScv5KM", "question": "What game is being played?", "choices": ["fetch", "hopscotch", "basketball", "baseball"], "correct_choice_idx": 0, "direct_answers": ["fetch", "fetch", "frisbee", "frisbee", "catch", "fetch", "frisbee", "frisbee", "frisbee", "catch"], "difficult_direct_answer": false, "rationales": ["The game is fetch.", "The dog is playing with the frisbee in the water.", "The dog is bringing the frisbee back after retrieving it."], "image": "val2014/COCO_val2014_000000456825.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 190550, "question_id": "jCn7ALD8mGFvcGkNxA8ySB", "question": "What is the ground made of?", "choices": ["clay", "turf", "concrete", "dirt"], "correct_choice_idx": 0, "direct_answers": ["clay", "clay", "clay", "dirt", "concrete", "dirt", "clay", "dirt", "asphalt", "clay"], "difficult_direct_answer": false, "rationales": ["This type of material is common in european courts and you can see that the player is white. in addition, it is colored red which matches what you see.", "A man is playing tennis on a reddish brown surface with a net.", "It is red and soil"], "image": "train2014/COCO_train2014_000000190550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21036, "question_id": "jE7G9VankfHZENYA8nH3ms", "question": "What does the blue float look like?", "choices": ["lobster", "horse", "cow", "egg"], "correct_choice_idx": 0, "direct_answers": ["scorpion", "crustacean", "crab", "lobster", "lobster", "crab", "crab", "lobster", "dog", "crab"], "difficult_direct_answer": false, "rationales": ["The blue float has claws. eggs, cows, and horses do not have claws.", "The little float looks like a blue lobster.", "A large kite has claws that pinch."], "image": "val2014/COCO_val2014_000000021036.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59282, "question_id": "jERvMpNJf7oKAXyWP2sXEW", "question": "What country is this most likely?", "choices": ["china", "ireland", "jamaica", "russia"], "correct_choice_idx": 2, "direct_answers": ["australia", "brazil", "africa", "africa", "africa", "america", "jamaica", "america", "haiti", "africa"], "difficult_direct_answer": false, "rationales": ["It seems to be jaimaica because of its weather.", "A man is walking on the beach with a surfboard in a place with clear skies and ocean.", "The country is jamaica."], "image": "train2014/COCO_train2014_000000059282.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 304236, "question_id": "jEfYzkLXkQVqPxLRLQ9h4s", "question": "What is advertised at the store with the green canopy?", "choices": ["dancing", "swimming", "petting cats", "riding horses"], "correct_choice_idx": 0, "direct_answers": ["dancing", "dining dancing", "restaurant", "car", "car", "bsll", "dining dancing", "dancing", "dining", "sweet"], "difficult_direct_answer": false, "rationales": ["The white letters also say dining.", "It says so on the canopy.", "A awning above a business lists the activities that can be done inside."], "image": "train2014/COCO_train2014_000000304236.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429174, "question_id": "jEvSrPcDtoqj7k8jtVyCrW", "question": "What color are the chimney pieces on the top of the long rectangular house?", "choices": ["blue", "green", "red", "yellow"], "correct_choice_idx": 2, "direct_answers": ["red", "tan", "beige", "red", "red", "red", "red", "red", "red", "brick"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The color is red.", "The color is easily visible and bright. it is in sharp contrast to the brown building."], "image": "val2014/COCO_val2014_000000429174.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 337780, "question_id": "jF7MuxZzqdnvMrwDGjnBvr", "question": "What mode of transport here is the oldest?", "choices": ["motorcycle", "taxi", "van", "elephant"], "correct_choice_idx": 3, "direct_answers": ["animal", "elephant", "elephant", "bikes", "elephant", "bike", "elephant", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["Because it is an animal that was used in the ancient times.", "A person is riding an elephant in the street surrounded by cars and bikes.", "The modes of transportation are all visible and identifiable based on their unique components. vehicular transportation is a byproduct of a modern industrial age, but answer a existed before."], "image": "train2014/COCO_train2014_000000337780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 70434, "question_id": "jFQk6g2VE8gMuEjYyWanAK", "question": "What color is the fluid in the small container with the white cap on the top?", "choices": ["purple", "red", "blue", "green"], "correct_choice_idx": 1, "direct_answers": ["orange", "mouthwash", "sandle", "orange", "orange", "orange", "orange", "orange", "orange", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "It is the same color of the cap on the other bottle.", "Mouthwash often comes in these colours."], "image": "val2014/COCO_val2014_000000070434.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 96177, "question_id": "jH7b9jWbHHGWm9h6CPqGHu", "question": "The girl standing at the sink with a toothbrush is brushing what?", "choices": ["plate", "her teeth", "glass", "meat"], "correct_choice_idx": 3, "direct_answers": ["meat", "meat", "meat", "chicken", "meat", "meat", "meat", "meat", "meat", "meat"], "difficult_direct_answer": false, "rationales": ["The girl has meat.", "The girl is brushing meat with what appears to be a toothbrush.", "The toothbrush is resting against the side of the animal carcass."], "image": "train2014/COCO_train2014_000000096177.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 423842, "question_id": "jHwWdkqR9ALfLY9NreJwSH", "question": "What is the longest item?", "choices": ["hose", "ladder", "giraffe neck", "train"], "correct_choice_idx": 2, "direct_answers": ["fence", "giraffes", "trees", "giraffe neck", "neck", "neck", "giraffe", "neck", "neck", "neck"], "difficult_direct_answer": false, "rationales": ["A giraffe is reaching over a fence to eat grass.", "The neck is the longest part.", "The necks of this animal are very long."], "image": "train2014/COCO_train2014_000000423842.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 335047, "question_id": "jLDExkkdhHL35FHVEY9YRU", "question": "What is on top of the sandwich?", "choices": ["tater tots", "red peppers", "mustard", "eggs"], "correct_choice_idx": 1, "direct_answers": ["peppers", "onions", "peppers", "tomatoes", "sandwich", "peppers", "peppers", "peppers", "red peppers", "bell pepper"], "difficult_direct_answer": false, "rationales": ["There are sliced vegetables on top of the sandwich.", "They are red peppers.", "One can make out the red vegetables sitting on top of the sandwich."], "image": "val2014/COCO_val2014_000000335047.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308702, "question_id": "jM4wCEWnD6mExh5Jq7Ptw8", "question": "What is the wooden plaque above the archway in the shape of?", "choices": ["fleurdelis", "pentagram", "trident", "american eagle"], "correct_choice_idx": 0, "direct_answers": ["fleurdelis", "square", "cube", "cross", "fleur-de-lis", "oval", "nola", "basket", "moon", "car"], "difficult_direct_answer": true, "rationales": ["A saint like logo is depicted diagonal right by the archway in kitchen.", "It's hanging on the wall as decoration.", "The plaque has a fleur de lis."], "image": "val2014/COCO_val2014_000000308702.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352496, "question_id": "jQBacsPfiALTn8u7VutTUN", "question": "What is the man in the foreground holding in his hand?", "choices": ["ski pole", "egg", "baseball", "soda can"], "correct_choice_idx": 0, "direct_answers": ["ski pole", "skying", "ski pole", "pole", "ski poles", "skating stick", "stick", "ski pole", "ski poles", "ski poles"], "difficult_direct_answer": false, "rationales": ["The pole is to help him as he skies on the snow.", "The man has skis.", "The man is skiing so he carries the poles in his hands."], "image": "val2014/COCO_val2014_000000352496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144961, "question_id": "jQJfW5GxV2DqnzzXiHS698", "question": "What is the object called that the dog is jumping into the water after?", "choices": ["football", "ball", "bone", "frisbee"], "correct_choice_idx": 3, "direct_answers": ["frisbee", "ball", "water", "ball", "freebie", "grispy", "frisbee", "fetch disc", "frisbee", "frisbee"], "difficult_direct_answer": false, "rationales": ["A frisbee attracts the dog easily.", "The object is a frisbee.", "The dog is chasing a flying disc, not a bone, ball, or football."], "image": "train2014/COCO_train2014_000000144961.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 311645, "question_id": "jQrZEcwXD3YvTfD3LSrTjR", "question": "What movie featured a similar animal to the animal the person is interacting with?", "choices": ["american psycho", "titanic", "ladyhawke", "dumbo"], "correct_choice_idx": 2, "direct_answers": ["free willy", "egal", "eagle", "ink", "seagull", "finding nemo", "birds", "ladyhawke", "finding nemo", "na"], "difficult_direct_answer": true, "rationales": ["A man is perched on a single rock by water and has a bird coming down to take something from his hand.", "Ladyhawke featured a hawk.", "Dumbo was an elephant, the other 2 films did not feature birds."], "image": "train2014/COCO_train2014_000000311645.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 489608, "question_id": "jVh9yeV9jgeyFtzpBXeR3y", "question": "What color is the sign usually?", "choices": ["green", "black", "yellow", "red"], "correct_choice_idx": 3, "direct_answers": ["red", "red", "black", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["It is a universal color for stop signs in many countries", "One can see that the sign has faded from its original bright color.", "The sign is in the shape of an octagon like a stop sign which is usually red."], "image": "train2014/COCO_train2014_000000489608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317806, "question_id": "jWK9o5NZV3qmpVQJe2ZC9r", "question": "What class are these supplies needed for?", "choices": ["english class", "gym class", "math class", "art class"], "correct_choice_idx": 3, "direct_answers": ["art", "art", "art", "ok", "art", "art class", "cutting", "arts", "coloring", "art"], "difficult_direct_answer": false, "rationales": ["These supplies are used for art works.", "The supplies shown are used in art class for cutting and coloring paper.", "These are supplies used to create things"], "image": "train2014/COCO_train2014_000000317806.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 260932, "question_id": "jWWwrfDhTDKTN64CVRxNom", "question": "Who played this sport?", "choices": ["wayne gretzky", "babe ruth", "hulk hogan", "pele"], "correct_choice_idx": 1, "direct_answers": ["baseball", "babe ruth", "a-rod", "baseball players", "babe ruth", "baseball players", "baseball players", "babe trauni", "babe ruth", "baseball players"], "difficult_direct_answer": false, "rationales": ["Babe ruth is spotted in the matches.", "Babe played this sport.", "Babe ruth played baseball."], "image": "train2014/COCO_train2014_000000260932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524575, "question_id": "jYDjELWHvXA5iAF8cuDVoY", "question": "What animal is in the foreground?", "choices": ["deer", "cat", "dog", "bison"], "correct_choice_idx": 3, "direct_answers": ["wildebeest", "bison", "gnu", "giraffe", "buffalos", "bison", "watching", "ox", "moose", "cow"], "difficult_direct_answer": true, "rationales": ["The bison is present.", "A bison is in the front.", "There is a brown animal, a bison closest to the camera."], "image": "val2014/COCO_val2014_000000524575.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 275413, "question_id": "jYSzo74MDGzRkEEpeg4eSn", "question": "Why is this man not considered a vegan?", "choices": ["cooking turkey", "eating chicken", "pepperoni pizza", "eating bacon"], "correct_choice_idx": 0, "direct_answers": ["cooking chicken", "age", "turkey", "turkey", "cooking turkey", "turkey", "strength", "turkey", "chicken", "cooking turkey"], "difficult_direct_answer": false, "rationales": ["The man is putting a turkey in the oven to cook", "The man is cooking a turkey.", "A man is bent over an oven and is pulling out a large turkey in a roasting pan. people generally do not cook food they do not eat."], "image": "train2014/COCO_train2014_000000275413.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 265378, "question_id": "jYcTSmRbgFLbn9PQswZ2JC", "question": "What make of car is this?", "choices": ["volkswagen", "hyundai", "audi", "subaru"], "correct_choice_idx": 0, "direct_answers": ["vox wagon", "volkswagen", "volkswagen", "volkswagen", "steel", "volkswagen", "volkswagen", "vw", "white", "car"], "difficult_direct_answer": false, "rationales": ["The white car has the volkswagen emblem on the front of the car.", "The car is a volkswagen.", "This car is a volkswagon. i can tell by a logo on the front of the car that has a superimposed v and w representing volkswagon."], "image": "val2014/COCO_val2014_000000265378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202372, "question_id": "jZ9VPPi566nuqNT2jNzwsH", "question": "How many red umbrellas are hanging up in the laundry ropes above the dining area?", "choices": ["five", "six", "four", "three"], "correct_choice_idx": 0, "direct_answers": ["30 umbrellas", "six", "five", "five", "four", "five", "five", "forty one", "three", "four"], "difficult_direct_answer": false, "rationales": ["There are at least four, and most likely five red umbrellas.", "They are attached to other umbrellas on top.", "There are 5."], "image": "train2014/COCO_train2014_000000202372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162654, "question_id": "jZeZcXC9tTKZxvR5hQLvAm", "question": "What is the white string coming out of the mans beanie?", "choices": ["headphones", "mask", "hair", "necklace"], "correct_choice_idx": 0, "direct_answers": ["headphones", "cord", "headphones", "earbuds", "earbuds", "earphones", "headphones", "headphones", "earbuds", "headphones"], "difficult_direct_answer": false, "rationales": ["The string is the headphones.", "That are headphones sticking out.", "A man is wearing a hat and there is a white cord showing that hangs from his head down into his lap. people commonly wear headphones."], "image": "train2014/COCO_train2014_000000162654.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162993, "question_id": "jZozTHpJdaBgyP3gj7VYAi", "question": "What are the animals near?", "choices": ["egg cartons", "apples", "tree", "fence"], "correct_choice_idx": 3, "direct_answers": ["fence", "manger", "grass", "fence", "watching", "wood", "sheep", "wall", "fence", "outdoor bathrooms"], "difficult_direct_answer": false, "rationales": ["The animals are by a fence.", "It is a common enclosure for sheep. it is identifiable by the wood it is made out of and the size which is big enough to keep the animals from crossing the barrier.", "The animals are behind a gate and secured."], "image": "train2014/COCO_train2014_000000162993.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 47563, "question_id": "jZvaccMAtrLDSiYV8ycSCS", "question": "What is under the umbrella?", "choices": ["baby", "black cat", "old woman", "old man"], "correct_choice_idx": 1, "direct_answers": ["cat", "black cat", "cat", "black cat", "cat", "cat", "black cat", "black cat", "cat", "black cat"], "difficult_direct_answer": false, "rationales": ["There are felines, not people. the one under the umbrella has dark fur.", "The cat is underneath.", "There is a dark cat."], "image": "train2014/COCO_train2014_000000047563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490311, "question_id": "jc2vEgYFNRzauENaHjT53e", "question": "What is the name of the object with the purpose of emitting light to the room?", "choices": ["stuffed animal", "lamp", "monitor", "television"], "correct_choice_idx": 1, "direct_answers": ["curtains", "lamp", "lamp", "lamp", "emitting", "lamp", "lamp", "lamp", "lighter", "lamp"], "difficult_direct_answer": false, "rationales": ["It is characteristically round and designed to sit on a desk. you can see the light coming from it and it is common in all houses.", "The table top lamp gives light to the room.", "The name is a lamp."], "image": "train2014/COCO_train2014_000000490311.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 151978, "question_id": "jc9t7KpvASwo7oKTAo9ZAj", "question": "What does this scene look most like?", "choices": ["winter wonderland", "maypole dance", "desert", "seaside villa"], "correct_choice_idx": 0, "direct_answers": ["ski resort", "winter wonderland", "winter wonderland", "dwdwd", "snow", "winter", "winter", "winterscape", "skiing", "winter wonderland"], "difficult_direct_answer": false, "rationales": ["This place has winter weather and is a tourist destination.", "Snow doesn't appear in any of the other settings and one is a dance and not a setting.", "The scene is wintery."], "image": "val2014/COCO_val2014_000000151978.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 496372, "question_id": "jcfHhy4eWxtQqG8xCTF7bU", "question": "What century of gun is developed and hung on the side of this horse?", "choices": ["20th", "17th", "18th", "19th"], "correct_choice_idx": 3, "direct_answers": ["18", "1980", "eighteenth century", "20", "eighteenth", "19th", "nineteenth", "1800's", "19th", "shotgun"], "difficult_direct_answer": true, "rationales": ["The century is the 19th.", "The ammo casings for this gun were invented in the 19th century.", "The century is the 19th."], "image": "train2014/COCO_train2014_000000496372.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 523701, "question_id": "jcujcbVXQ3SyvRtHVNMWW4", "question": "What are on some of the boats?", "choices": ["cows", "cats", "bananas", "surfboards"], "correct_choice_idx": 2, "direct_answers": ["food", "hats", "food", "hats", "bananas", "bananas", "food", "hats", "banana", "bananas"], "difficult_direct_answer": false, "rationales": ["Some of the boats are carrying yellow fruit.", "There are fruits on the boat.", "You can tell by the color and the market setting as to what is on the boats."], "image": "train2014/COCO_train2014_000000523701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 118726, "question_id": "jd22yXv73fYptrh34eM2pQ", "question": "What color is the cloud on top of the map in the television set of the background?", "choices": ["blue", "purple", "green", "red"], "correct_choice_idx": 2, "direct_answers": ["blue", "white", "green", "white", "white", "white", "grey", "blue", "green", "white"], "difficult_direct_answer": false, "rationales": ["The cloud is the color of a tree.", "On weather reports this color represents rain.", "It is grass colored"], "image": "train2014/COCO_train2014_000000118726.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529350, "question_id": "jdeJ5mRYkwvH4ezmQeKjHV", "question": "What kind of phone is pictured on the far left side?", "choices": ["landline", "watch", "satellite", "smart"], "correct_choice_idx": 0, "direct_answers": ["old phone", "home phone", "nothing", "landline", "corded", "landline", "landline phone", "office phone", "landline", "land line"], "difficult_direct_answer": false, "rationales": ["You can tell by its size and long cord coming from it as to what type of phone it is.", "The phone is a larger, corded phone and is not mobile.", "The phone on the far left is wired, so it is not a satellite phone. it is not a smart phone or watch"], "image": "train2014/COCO_train2014_000000529350.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85759, "question_id": "jf8kMS966Eb5vkFMbg2VfJ", "question": "What type of area is this skateboarder in?", "choices": ["city", "small town", "farm", "suburb"], "correct_choice_idx": 0, "direct_answers": ["city", "street", "road", "floating surface", "street", "road", "roadside", "air", "street", "sidewalk"], "difficult_direct_answer": false, "rationales": ["There is pavement and public transportation behind the kid.", "He is on a street behind a public transportation bus", "The skateboarder is in an urban area."], "image": "train2014/COCO_train2014_000000085759.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 571768, "question_id": "jfCSeFBZnovf8moWQAgqwi", "question": "Where does the person in the foreground work?", "choices": ["rodeo", "police station", "circus", "mcdonalds"], "correct_choice_idx": 1, "direct_answers": ["police", "police station", "police", "police station", "police station", "dwdw", "police", "police department", "japan", "stop"], "difficult_direct_answer": false, "rationales": ["The man's vest lets you know where he works.", "It says it on their vest", "The person's occupation is written on the back of their jacket. answer a is a place that people with this occupation work."], "image": "train2014/COCO_train2014_000000571768.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 81922, "question_id": "jguyrf8hbc5GJEMeSVecS5", "question": "What is near the vehicles?", "choices": ["helicopter", "kite", "plane", "bison"], "correct_choice_idx": 2, "direct_answers": ["plane", "light poles", "light poles", "plane", "airplane", "airplane", "aeroplane", "plane", "road", "plane"], "difficult_direct_answer": false, "rationales": ["There is a flying vehicle with jet engines above the road vehicles. it is a fixed-wing vehicle, not a helicopter or kite.", "There is an airplane landing near the vehicles.", "There are cars on the ground. a fixed-wing flying vehicle that has jet engines is above the cars."], "image": "val2014/COCO_val2014_000000081922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 483994, "question_id": "jhDHRs6DEC75iqYyzgUeif", "question": "What is at the top of the sheep pile?", "choices": ["dog", "elephant", "baby", "mouse"], "correct_choice_idx": 0, "direct_answers": ["dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["All of the animals are clearly visible and answer a is consistent with the location given in the question and is identifiable based on its known features, shape and size.", "There is a dog standing on the top of the sheep pile.", "There is an animal that has somehow ran on top of this museum area. it has a long tail, canine teeth and floppy ears."], "image": "val2014/COCO_val2014_000000483994.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 8498, "question_id": "jibsoHMPJ2UxnD3Q8gaUf3", "question": "What color are the square bricks outlining the base of this small clock tower?", "choices": ["red", "tan", "black", "white"], "correct_choice_idx": 3, "direct_answers": ["white", "cream", "red", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The other bricks are red. the square bricks at the base do not match the other bricks and are not black or tan.", "The color is light and easily visible. it is in sharp contrast to the green grass.", "The color is white."], "image": "val2014/COCO_val2014_000000008498.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 147228, "question_id": "jieCGdhX3uQzUQVv5Rgqyi", "question": "What is on the railing?", "choices": ["skateboarder", "package", "birds", "mannequin"], "correct_choice_idx": 2, "direct_answers": ["seagulls", "bird", "birds", "birds", "birds", "birds", "birds", "birds", "birds", "birds"], "difficult_direct_answer": false, "rationales": ["The things on the railing have 2 legs, a beak, feathers, and wings.", "The objects on the visible rail are winged, beaked creatures that would be consistent with answer a. answer a would also be common to this setting and could appear on a rail in this manner.", "You can see several different kinds on the railing. the one closest to the camera has an orange beak and white head."], "image": "train2014/COCO_train2014_000000147228.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69077, "question_id": "jjuNxwcqo9MpeDr9iwUbJL", "question": "What famous animal does this one most closely resemble?", "choices": ["dumbo", "garfield", "free willy", "benji"], "correct_choice_idx": 3, "direct_answers": ["sandy", "old yellar", "benji", "benji", "benji", "benji", "dog", "benji", "benji", "fee"], "difficult_direct_answer": false, "rationales": ["The dog in the snow is very similar to the dog in the movie benji.", "Benji looked like this dog.", "Benji is a dog."], "image": "val2014/COCO_val2014_000000069077.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 65153, "question_id": "jkpuEQXmWsF4jkvZfTJ4dd", "question": "What is the white cooker called?", "choices": ["rice cooker", "dutch oven", "air fryer", "ninja"], "correct_choice_idx": 0, "direct_answers": ["rice cooker", "crockpot", "crockpot", "crock pot", "pressure cooker", "rice cooker", "electric", "rice", "rice cooker", "rice cooker"], "difficult_direct_answer": false, "rationales": ["The small bowl with a lid is called the rice cooker.", "The white cooker looks like the standard rice cooker found in all the stores.", "The white appliance does not have the ability to directly heat or pressurize air. it is not a dutch oven."], "image": "train2014/COCO_train2014_000000065153.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85459, "question_id": "jmkk7i8hZNBSPRcYnUdGJK", "question": "What is to the right of the sink?", "choices": ["red car", "white car", "black car", "shower item"], "correct_choice_idx": 3, "direct_answers": ["shower", "soap", "mirror", "shower", "shower", "shower item", "shower", "shower", "shower", "shower"], "difficult_direct_answer": false, "rationales": ["There are soaps that can be seen through the shower glass", "There is a shower stall to the right of the sink and it contains tiles and shower items.", "This is a bathroom. there are no cars."], "image": "train2014/COCO_train2014_000000085459.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 578627, "question_id": "jo9zaz7r9BcyfEp7WUxJ3K", "question": "What color is the harness around the girl who is presenting the cow?", "choices": ["red", "blue", "black", "green"], "correct_choice_idx": 2, "direct_answers": ["white", "black", "black", "black", "black", "black", "black", "white", "black", "black"], "difficult_direct_answer": false, "rationales": ["It is in stark contrast to the white she is wearing", "The harness is dark.", "The color is extremely dark against her white shirt."], "image": "train2014/COCO_train2014_000000578627.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503051, "question_id": "joFNmGmCP6fp6wQJY7Maiz", "question": "What is the woman likely to use to get back home?", "choices": ["car", "airplane", "taxi", "covered wagon"], "correct_choice_idx": 1, "direct_answers": ["van", "airplane", "van", "airplane", "airplane", "airplane", "plane", "plane", "taxi", "plane"], "difficult_direct_answer": false, "rationales": ["The woman flies.", "Behind here is a terminal sign.", "She is going to take an airplane"], "image": "val2014/COCO_val2014_000000503051.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 166699, "question_id": "jqCSVyWLDoYmyNQfF2CkYH", "question": "What car company is being advertised in this arena?", "choices": ["lexus", "bmw", "mercedes", "audi"], "correct_choice_idx": 0, "direct_answers": ["lexus", "lexus", "lexus", "lexus", "lexus", "lexus", "lexus", "lexus", "lexus", "lexus"], "difficult_direct_answer": false, "rationales": ["There is company names written along the sides of the court that represents sponsors. of the sponsors written on the side, answer a is visible and a car company.", "The logo is displayed.", "The name is on the wall"], "image": "train2014/COCO_train2014_000000166699.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 185200, "question_id": "jrYmU3MmGs2K6BDjhE74aR", "question": "What might the person be in the business of repairing?", "choices": ["televisions", "cars", "phones", "baby carriages"], "correct_choice_idx": 2, "direct_answers": ["mobile service", "phones", "phones", "yes", "phones", "cell phones", "mobile phone", "cell phones", "cell phones", "cellphones"], "difficult_direct_answer": false, "rationales": ["There are multiple types of phones and missing ph", "The person that owns all the devices might be in the business of repairing cell phones.", "There are many ones laying out that need to be fixed."], "image": "train2014/COCO_train2014_000000185200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402715, "question_id": "jsKyDeF4i5ALBHSu5tSx8j", "question": "What do these animals have?", "choices": ["wings", "long necks", "wool", "quills"], "correct_choice_idx": 2, "direct_answers": ["wool", "fur", "fur", "cotton", "wool", "wool", "wool", "wool", "eating grass", "fur"], "difficult_direct_answer": false, "rationales": ["White animals with fluffy white fur are grazing in an open pasture. sheep grow white wool.", "These are sheep", "There are numerous sheep among the rolling grassy fields."], "image": "train2014/COCO_train2014_000000402715.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207898, "question_id": "jtWWcAocUo5BGP4Qx5ExQf", "question": "What is in the tall glass on the right hand side?", "choices": ["root beer", "orange juice", "water", "tomato juice"], "correct_choice_idx": 1, "direct_answers": ["orange juice", "juice", "egg", "coffee", "orange juice", "orange juice", "juice", "coffee", "orange juice", "orange juice"], "difficult_direct_answer": false, "rationales": ["The tall glass behind the plate of breakfast contains bright orange juice.", "The liquid is the color of this drink and complementary to breakfast.", "The glass has orange juice."], "image": "val2014/COCO_val2014_000000207898.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 123532, "question_id": "jtrvNFejKJCQZN4pYYF66i", "question": "What is the person swatting at?", "choices": ["tennis ball", "their brother", "mosquito", "fly"], "correct_choice_idx": 0, "direct_answers": ["tennis ball", "ball", "tennis ball", "ball", "tennis ball", "tennis ball", "ball", "tennis ball", "ball", "tennis ball"], "difficult_direct_answer": false, "rationales": ["They have a racket and are on a court", "The person wants to hit the tennis ball.", "He is swatting on a light green ball using a racket in a tennis court."], "image": "train2014/COCO_train2014_000000123532.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 13550, "question_id": "jumXGdZeQbyJwsSQ6yezi7", "question": "What is climbing up the rocks to talk to the woman who is on the top?", "choices": ["wolf", "seal", "elephant", "penguin"], "correct_choice_idx": 2, "direct_answers": ["elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant", "elephant"], "difficult_direct_answer": false, "rationales": ["A large animal with a trunk approaches a girl.", "While they do not really talk, this animal is coming to see her.", "A grey animal that has large ears and a trunk is approaching the woman."], "image": "val2014/COCO_val2014_000000013550.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 612, "question_id": "jvnyx2bp9AhFGY2y3esYPZ", "question": "What color are the Christmas balls on the tree to the right?", "choices": ["gold", "blue", "red", "silver"], "correct_choice_idx": 0, "direct_answers": ["gold", "gold", "gold", "red", "gold", "gold", "yellow", "red", "gold brown", "gold"], "difficult_direct_answer": false, "rationales": ["They are a different shade of yellow that's shiny and reflective.", "They are gold.", "The color is gold."], "image": "train2014/COCO_train2014_000000000612.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302343, "question_id": "jwD48E6ftSoaoenfgDXTg9", "question": "What color is the stripe wrapped around the red boat?", "choices": ["purple", "white", "red", "green"], "correct_choice_idx": 1, "direct_answers": ["lake", "red", "white", "red", "white", "white", "blue", "blue", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color is white.", "It looks like an off or dirty a.", "The red boat has a white stripe going around the boat. the boat is sitting in the water with the stripe."], "image": "train2014/COCO_train2014_000000302343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 488260, "question_id": "jwcLZfgV3w36ywiMQ7grRF", "question": "What would these animals hypothetically order on a menu?", "choices": ["lamb chops", "fish tacos", "salad", "beef burger"], "correct_choice_idx": 2, "direct_answers": ["salad", "milk shakes", "height", "leaves", "salad", "tree nuts", "leaves", "salad", "salad", "twigs"], "difficult_direct_answer": false, "rationales": ["The animals are giraffes which are known to eat leaves. only one choice is leafy and does not include an animal.", "These giraffes are standing close together in an enclosure. they like to eat on leaves of plants.", "The animals would get salad."], "image": "train2014/COCO_train2014_000000488260.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 444312, "question_id": "jx8cxgwWDL7TELVS6ZzNvN", "question": "What is the top color of the flag worn on the breast of the horseback rider?", "choices": ["blue", "white", "red", "pink"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red", "red", "usa", "red", "whitered", "red", "red"], "difficult_direct_answer": false, "rationales": ["It's the color of cherries", "Unless you are colorblind, you can tell what color is shown here.", "The rider is wearing a red and white tag on her jacket."], "image": "train2014/COCO_train2014_000000444312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 317981, "question_id": "jxBR6AvcAbw2JAma8iAcC7", "question": "What kind of pizza is this?", "choices": ["meat lovers", "peperoni", "vegetable", "broccoli"], "correct_choice_idx": 0, "direct_answers": ["pepperoni", "pepperoni", "pepperoni", "pepperoni sausage", "new york", "peperoni", "pepperoni", "pepperoni", "meat lovers", "pepperoni"], "difficult_direct_answer": false, "rationales": ["There is pepperoni and sausage on it with no vegetables", "The pizza has meat on top.", "The pizza on the plate has many types of meat on it."], "image": "train2014/COCO_train2014_000000317981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 416918, "question_id": "jxDUmY7UcY67wePJUerLr4", "question": "What color are the little wedges most directly on top of this salad?", "choices": ["green", "orange", "purple", "white"], "correct_choice_idx": 1, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange/purple", "beige", "green", "green", "yellow"], "difficult_direct_answer": false, "rationales": ["There appears to be carrots in the salad.", "The color is orange.", "It is similar to a rind of a fruit with the same name"], "image": "train2014/COCO_train2014_000000416918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171468, "question_id": "jy5kSBg4oaghoEXgcrL7EJ", "question": "How many giraffes are standing behind the green fence where there is a dog barking at them?", "choices": ["three", "two", "five", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "two", "three", "three", "one", "car", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two large animals with long necks are in a zoo enclosure and another can be partially seen.", "There is a giraffe right up to the edge of the fence. another behind it and another in far distance and to right.", "There are three."], "image": "train2014/COCO_train2014_000000171468.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 214870, "question_id": "jyPiBZ5xXjR5cCQqt3fQJC", "question": "What color is the strange rainbow shape on the top side of the round apparatus of the parking meter?", "choices": ["red", "green", "black", "blue"], "correct_choice_idx": 0, "direct_answers": ["red", "orange", "red", "red", "black", "vibgr", "red", "blue", "red", "gold"], "difficult_direct_answer": false, "rationales": ["The parking meters along the sidewalk have red strange shapes on the top sides.", "There is a red shape on each parking meter.", "The curved shape on the front of the parking meter is bright red."], "image": "train2014/COCO_train2014_000000214870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 26398, "question_id": "jyYTHJWFkiosrSU3whq8ru", "question": "What is the movie about?", "choices": ["clowns", "vampires", "werewolves", "ghosts"], "correct_choice_idx": 3, "direct_answers": ["poltergeist", "ghosts", "ghosts", "poltergeist", "demons", "ghosts", "food", "ghosts", "pizza", "ghost"], "difficult_direct_answer": false, "rationales": ["The movie's about ghosts.", "A dvd box is to the right of a pizza box and pizza. it has a girl with her hands on a tv and poltergeist is about being haunted.", "The other options don't appear in this particular film. the reason for a is the house was built on top of a graveyard that still contained bodies."], "image": "val2014/COCO_val2014_000000026398.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249882, "question_id": "jys2WS9QXAjiuMBHMSXirC", "question": "What might be their profession?", "choices": ["captain", "fishermen", "farmer", "pirate"], "correct_choice_idx": 1, "direct_answers": ["fisherman", "boat sailors", "fisherman", "surfing", "fishermen", "fishermen", "fishing", "fishermen", "fishermen", "fishermen"], "difficult_direct_answer": false, "rationales": ["They look to be in a fishing boat.", "The people have a boat with a large net.", "Fishermen work on boats."], "image": "train2014/COCO_train2014_000000249882.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 7603, "question_id": "jz5snef8neuW3X3sZG6PWE", "question": "How many laptops are sat on the top of the desk with the people gathered around?", "choices": ["four", "five", "three", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two edges of the computer placed side by side and adjacent to each other. they have different screens.", "There are two laptops.", "The laptops are fully visible and countable based on their distinct outlines and screens."], "image": "train2014/COCO_train2014_000000007603.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 197918, "question_id": "jzPx6TucsBNmv8DKeAPVDf", "question": "What is the design called on the boy's shirt?", "choices": ["polka dot", "striped", "flannel", "plaid"], "correct_choice_idx": 2, "direct_answers": ["plaid", "flannel", "plaid", "plaid", "plaid", "plaid", "plaid shirt", "plaid", "plaid", "plaid"], "difficult_direct_answer": false, "rationales": ["It is checkered and warm looking.", "The shirt contains various fineness.", "A boy is wearing a button up shirt with a checkered type pattern."], "image": "val2014/COCO_val2014_000000197918.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 433786, "question_id": "jzWMuZxfuCw58ZWm2mV8KN", "question": "What language can a person learn by reading the book with the apple on it?", "choices": ["spanish", "french", "german", "chinese"], "correct_choice_idx": 0, "direct_answers": ["english", "english", "spanish", "english", "spanish", "spanish", "english", "spanish", "spanish", "spanish"], "difficult_direct_answer": false, "rationales": ["The book next to the cat has a title on the cover that is written in spanish.", "The language that's written on the book is spanish", "The print on the cover is in the language of spain."], "image": "train2014/COCO_train2014_000000433786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 382318, "question_id": "k2MC3mxNE4Fo2yq6hAmZob", "question": "What direction are the giraffes looking?", "choices": ["south", "east", "north", "west"], "correct_choice_idx": 1, "direct_answers": ["east", "trees", "right", "east", "east", "wright", "right", "right", "right", "right"], "difficult_direct_answer": false, "rationales": ["The animals are looking to the right.", "If north is up, then they are looking to the right which would be east.", "Their necks are both tilted to the right."], "image": "train2014/COCO_train2014_000000382318.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 340811, "question_id": "k3wMHAYcxoPZFfqjUKF24W", "question": "What kind of vegetable is served on the side of this salad?", "choices": ["collard green", "kale", "spinach", "lettuce"], "correct_choice_idx": 1, "direct_answers": ["coriander leaf", "lettuce", "lettuce", "lettuce", "salad", "kale", "lettuces", "lettuce", "lettuce", "lettuce"], "difficult_direct_answer": false, "rationales": ["There is a kale salad on the side of the plate.", "There is a kale salad on the side of the plate.", "This is a mix of salad greens"], "image": "val2014/COCO_val2014_000000340811.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 131276, "question_id": "k4C9Jd6QhoBehz4xkXtLXC", "question": "What color is reflected strongly off the metal cabinet cases?", "choices": ["purple", "red", "blue", "yellow"], "correct_choice_idx": 1, "direct_answers": ["red", "white", "white", "red", "white", "white", "orange", "white", "orange", "red"], "difficult_direct_answer": false, "rationales": ["The color is red.", "The cabinets on the wall have this color shining on them", "The color red can be seen on the metal cases. it is on the trim."], "image": "val2014/COCO_val2014_000000131276.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 218096, "question_id": "k4vHYHMuU3vGhUqtRoL5tk", "question": "What are the men on the bus drinking?", "choices": ["milk", "water", "alcohol", "juice"], "correct_choice_idx": 2, "direct_answers": ["alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "alcohol", "beer", "alcoholic beverages", "wine"], "difficult_direct_answer": false, "rationales": ["Men are standing together and one is holding a flask while the other is holding an amber bottle.", "Alcohol can be drank from a bottle, can or flask. sometimes a flask is used to smuggle alcohol where it is not permitted.", "The men have beer bottles."], "image": "train2014/COCO_train2014_000000218096.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457214, "question_id": "k5pppemdHsfx4kgjzUnzwZ", "question": "Who is the bed for?", "choices": ["human", "rhino", "elephant", "dogs"], "correct_choice_idx": 0, "direct_answers": ["dogs", "child", "dog", "childs pets", "human", "dog", "dogs", "dogs", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["This bed is for humans.", "The bed that has dogs laying on it actually large enough for a human", "The size is large enough to hold a human being."], "image": "train2014/COCO_train2014_000000457214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 439251, "question_id": "k67toGDbTw5BH69c5J4Ncn", "question": "What kind of animal is in the ocean to the left of the man swimming?", "choices": ["water buffalo", "whale", "anteater", "dolphin"], "correct_choice_idx": 0, "direct_answers": ["water buffalo", "cow", "cow", "bull", "antelope", "cow", "goat", "goat", "ox", "bull"], "difficult_direct_answer": false, "rationales": ["The animal is identifiable based on the shape of its head and the horn style.", "The animal in the water has large, curved horns and is the only one of the options with that characteristic.", "The animal is a type of cattle that can swim."], "image": "train2014/COCO_train2014_000000439251.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 505539, "question_id": "k6PWBzhigjbLg2Uy8VMsyn", "question": "What are these animals likely doing?", "choices": ["laundry", "sleeping", "flying", "escaping"], "correct_choice_idx": 3, "direct_answers": ["food searching", "walking", "sideseeing", "running", "carcar", "running", "foraging", "running away", "crossing road", "escaping"], "difficult_direct_answer": true, "rationales": ["The animals are running away from the wild.", "They are running away.", "The animals are running away."], "image": "train2014/COCO_train2014_000000505539.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 76578, "question_id": "k7XQPNsWR2WuP6KUGvLdDo", "question": "What direction are the elephants marching?", "choices": ["west", "east", "north", "south"], "correct_choice_idx": 0, "direct_answers": ["straight", "left", "left", "west", "left", "left", "left", "forward", "left", "left"], "difficult_direct_answer": false, "rationales": ["The elephants are heading the left, which is west.", "The elephants are marching to the west.", "This is an assummption. it's impossible to say with this image since we don't know the time of day or definite geographic location to determine the direction of the shadows and then the direction of the elephants."], "image": "train2014/COCO_train2014_000000076578.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 387352, "question_id": "k9VLNtzDcyt589932VhDDi", "question": "What are the frogs sitting on?", "choices": ["turtles", "rocks", "sand", "lily pads"], "correct_choice_idx": 3, "direct_answers": ["lotus leaf", "floor", "lily pads", "western", "lilypads", "toilet", "lilypad", "lili-pads", "shower curtain", "grass"], "difficult_direct_answer": true, "rationales": ["They are round leaves that float.", "Frogs are sitting on top of leaves in the water.", "These are plants that grow in ponds"], "image": "train2014/COCO_train2014_000000387352.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21353, "question_id": "kAqXCFPBgFwat4UHhEtJe7", "question": "What is draped over the chair?", "choices": ["baby", "man", "cat", "towel"], "correct_choice_idx": 3, "direct_answers": ["but", "throw", "throw blanket", "daaad", "sheet", "remote", "towel", "blanket", "throw", "blanket"], "difficult_direct_answer": false, "rationales": ["A white fabric item is hung over the back of a chair.", "It's a throw blanket", "It is a section of cloth"], "image": "train2014/COCO_train2014_000000021353.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 570196, "question_id": "kC3HRVFfhVQ6WMd6tmfdZh", "question": "What is watching the cows?", "choices": ["pelican", "dog", "baby", "wolf"], "correct_choice_idx": 1, "direct_answers": ["dog", "dog", "dog", "dog", "white dog", "dog", "dog", "dog", "dog", "dog"], "difficult_direct_answer": false, "rationales": ["The dog is watching.", "The dog is sitting there watching the cows.", "The white animal on the fence which is a dog."], "image": "train2014/COCO_train2014_000000570196.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 222500, "question_id": "kDMMzmNaQekcntCNCkWm52", "question": "What are the curtains called?", "choices": ["sink curtains", "shades", "sheers", "valance"], "correct_choice_idx": 3, "direct_answers": ["valance", "valance", "drapes", "valences", "valance", "valance", "shades", "drapes", "valance", "window curtains"], "difficult_direct_answer": false, "rationales": ["A window has a covering that hands down from the top but does not reach the bottom of the window.", "There is a single window on the other side of the sink. it has a short curtain that hangs only on the top part of window.", "It only covers the very top as part of decor instead of function"], "image": "val2014/COCO_val2014_000000222500.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 198312, "question_id": "kDmW8TRJcen7fNXzmSmSVq", "question": "What color is the seat on the couch in the corner window?", "choices": ["blue", "white", "yellow", "red"], "correct_choice_idx": 1, "direct_answers": ["white", "white", "white", "grey", "beige", "ivory", "beige", "grey", "white", "off-white"], "difficult_direct_answer": false, "rationales": ["The sofa is holding a red pillow.", "The seat in the corner by the window is white.", "The color is white."], "image": "val2014/COCO_val2014_000000198312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 71345, "question_id": "kDnHx2rrtjqZXjhfnQusTj", "question": "What are the cars driving alongside?", "choices": ["army tanks", "horses", "trains", "bicycles"], "correct_choice_idx": 2, "direct_answers": ["trains", "train", "train", "train track", "road", "train", "trains", "train", "railway station", "trains"], "difficult_direct_answer": false, "rationales": ["The trains are along side the cars.", "The cars are riding alongside two trains that are riding on the tracks.", "The cars are driving next to trains on train tracks."], "image": "val2014/COCO_val2014_000000071345.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 75841, "question_id": "kE23wKxGuSYPCiKSxhYWJQ", "question": "What color are the interior nettings of the rackets used by the two men?", "choices": ["white", "blue", "red", "green"], "correct_choice_idx": 3, "direct_answers": ["yellow", "green", "green", "yellow", "yellow", "yellow", "green", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["Men are playing tennis with rackets with green strings.", "The color is green.", "The rackets are branded by green as a preference form the racket maker."], "image": "train2014/COCO_train2014_000000075841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 475980, "question_id": "kFEwmK3qXF9E9TtksK3ws8", "question": "What color is brightly reflected off the back of the bicycle in front of the cat?", "choices": ["white", "orange", "red", "green"], "correct_choice_idx": 2, "direct_answers": ["yellow orange", "black", "orange", "orange", "car", "red", "orange red", "red", "yellowish", "stop"], "difficult_direct_answer": false, "rationales": ["There is a red light on the bike.", "The bike light is bright red.", "The color is red."], "image": "train2014/COCO_train2014_000000475980.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108677, "question_id": "kFJRc8E8dTiF9Hdj3zVxDr", "question": "What part of the body will be protected by the objects left with the motorcycles?", "choices": ["legs", "hands", "head", "stomach"], "correct_choice_idx": 2, "direct_answers": ["head", "head", "head", "head", "head", "head", "head", "head", "head", "head"], "difficult_direct_answer": false, "rationales": ["You can see a helmet which protects this part of the body. you can see the helmet is hard to protect and the same shape as this body part.", "The helmets help protect the head.", "That's the purpose of helmets. it's illegal in some regions to not wear them."], "image": "train2014/COCO_train2014_000000108677.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556624, "question_id": "kJci5VbJ5qXTkVK66MmKxx", "question": "What word is relevant to this activity?", "choices": ["typing", "sleeping", "eating", "balance"], "correct_choice_idx": 3, "direct_answers": ["skating", "skateboarding", "skating", "skate", "skateboarding", "skating", "balance", "skating", "skateboarding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["They have to keep this to stay upright as they do tricks", "A skateboarder needs to stay in the middle of his board.", "The people are seen to be skateboarding. to successfully engage in this activity and perform tricks one must be able to stay on the board."], "image": "train2014/COCO_train2014_000000556624.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 249936, "question_id": "kJf9vfJ34JmTXM9PRby3S5", "question": "What letter is most obscured by the little girl's head?", "choices": ["l", "s", "p", "w"], "correct_choice_idx": 2, "direct_answers": ["letter p", "letter 'p'", "stop", "p", "pee", "stop", "letter p", "op", "letter p", "letter p"], "difficult_direct_answer": false, "rationales": ["The letter p is blocked.", "The letter is p.", "The little girl's head is covering the letter p."], "image": "val2014/COCO_val2014_000000249936.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100827, "question_id": "kKeUKgS4KZxEGxqRH6FNbk", "question": "What is above the tables?", "choices": ["cats", "statues", "umbrellas", "dogs"], "correct_choice_idx": 2, "direct_answers": ["umbrellas", "umbrellas", "umbralla", "umbrellas", "umbrellas", "umbrellas", "tablecloths", "umbrellas", "umbrella", "umbrellas"], "difficult_direct_answer": false, "rationales": ["They are used as a shade from sunlight and rainfall.", "There are blue items that provide protection from sun or rain. these items are not animals or statues.", "The tables have umbrellas."], "image": "train2014/COCO_train2014_000000100827.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85268, "question_id": "kKjFvXEVp9gDnsX8aXeAKD", "question": "What company makes an item likely to be found in this room?", "choices": ["mcdonalds", "colgate", "microsoft", "subway"], "correct_choice_idx": 1, "direct_answers": ["colgate", "colgate", "charmin", "crest", "colgate", "bath", "crest", "colgate", "colgate", "colgate"], "difficult_direct_answer": false, "rationales": ["The company is colgate.", "The company is colgate.", "Colgate, a name brand of toothpaste, sits on the sink beside the toothbrushes and faucet in the bathroom."], "image": "val2014/COCO_val2014_000000085268.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345009, "question_id": "kKzpV3sX4jywzYXA2u8iDY", "question": "What is near the water?", "choices": ["helicopter", "airplane", "dog", "surfboard"], "correct_choice_idx": 1, "direct_answers": ["plane", "people", "airplane", "airplane", "seaplane", "plane", "plane", "plane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["One can see the large aircraft on top of the water.", "There is an airplane on top of the water.", "It is a vehicle that usually flies but is resting on top of the water on pontoons"], "image": "train2014/COCO_train2014_000000345009.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 181920, "question_id": "kLYeC8ADvnGWdgaFuMcNXo", "question": "What direction are the animals facing?", "choices": ["south", "west", "east", "north"], "correct_choice_idx": 2, "direct_answers": ["east", "away", "upside", "right", "right", "right", "left toright", "run", "right", "right"], "difficult_direct_answer": false, "rationales": ["The zebras are facing to the right.", "The zebras are facing right which is usually correlated with east.", "A group of zebras are all facing in the same direction."], "image": "train2014/COCO_train2014_000000181920.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503089, "question_id": "kLdcvATx2B67R2PY5fosyd", "question": "What parts here came from a car?", "choices": ["door", "windshield wipers", "tire", "carburetor"], "correct_choice_idx": 2, "direct_answers": ["tires", "tires", "tire", "tire", "tires", "tire", "tires", "tyre", "wheel", "tires"], "difficult_direct_answer": false, "rationales": ["There are rubber circles with tread", "The round objects are used for wheels.", "The tires were dismantled from a vehicle."], "image": "train2014/COCO_train2014_000000503089.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 215410, "question_id": "kMa86mWZdT6zWghBQ8oKUY", "question": "What can you use the bike rack for to keep your bike safe?", "choices": ["selling bike", "cleaning bike", "leaning bike", "bike lock"], "correct_choice_idx": 3, "direct_answers": ["lock", "bike lock", "wheel", "protect", "locks", "stand", "lock", "parking", "lock", "key"], "difficult_direct_answer": false, "rationales": ["You use a lock to keep the bikes safe.", "A bike lock is a lock that is put on a bike that keep it secure so someone can't steal off the rack.", "There is a bike rack that people can keep their bikes safe at using a bike lock."], "image": "train2014/COCO_train2014_000000215410.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 346377, "question_id": "kNXGtfn2LFfA26cjBpgfn3", "question": "What part of the world is this river likely found in?", "choices": ["asia", "australia", "europe", "america"], "correct_choice_idx": 0, "direct_answers": ["crest", "southern hemisphere", "southern hemisphere", "asia", "asia", "asia", "taiwan", "asia", "china", "china"], "difficult_direct_answer": false, "rationales": ["The boats and mountains are indicative of asia.", "The shape of these boats is similar to the architecture there", "The boats in the front have that oriental style with the tarps on top, suggesting slight poverty."], "image": "val2014/COCO_val2014_000000346377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 467522, "question_id": "kPL8oQ9taoQC6Qtoqr8ErJ", "question": "What animals are standing tall?", "choices": ["deer", "antelopes", "camels", "giraffes"], "correct_choice_idx": 3, "direct_answers": ["giraffe", "giraffes", "giraffe", "giraffes", "giraffe", "giraffe", "graffies", "giraffe", "giraffes", "giraffes"], "difficult_direct_answer": false, "rationales": ["The giraffes are tall.", "There are some giraffes standing tall in the field.", "Giraffes are grazing in a green and treed area."], "image": "val2014/COCO_val2014_000000467522.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396326, "question_id": "kPRkGYN2YXDWghwdd5owVe", "question": "What is the cat doing?", "choices": ["eating", "running", "sleeping", "stretching"], "correct_choice_idx": 3, "direct_answers": ["watching something", "laying down", "stalking", "stretching", "stretching", "watching", "sitting", "sitting", "sitting", "sitting"], "difficult_direct_answer": false, "rationales": ["The cat's arms are extended in a very long position.", "It's paws are straight out.", "The cat is stretching out its legs before it stands up."], "image": "train2014/COCO_train2014_000000396326.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 148737, "question_id": "kQ2Px6qVtDsrzPKiEnCWy8", "question": "What kind of dog is sitting in the middle of the wood flooring licking itself?", "choices": ["poodle", "labrador", "golden", "huskey"], "correct_choice_idx": 3, "direct_answers": ["huskey", "cat", "huskie", "dog", "cat", "huskie", "husky", "cat", "husky", "cat"], "difficult_direct_answer": false, "rationales": ["There is a little huskey dog licking himself on the floor.", "A gray and white domestic dog resembling a wolf is sitting on the ground.", "The dog is very fluffy and big."], "image": "val2014/COCO_val2014_000000148737.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 153064, "question_id": "kQSkLGuzRMYcUrnVSxz6mB", "question": "Where would this particular kitchen be found?", "choices": ["home", "school", "hotel", "retail store"], "correct_choice_idx": 3, "direct_answers": ["apartment", "apartment", "home depot", "house", "home", "house", "apartment", "apartment", "retail store", "store"], "difficult_direct_answer": false, "rationales": ["It is indicated that this is a commercial setting because the items have price tags on them and they appear new which is typical in this setting.", "This is a demo kitchen that shows a price tag as well as features in it.", "The kitchen would be in a store."], "image": "train2014/COCO_train2014_000000153064.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 245953, "question_id": "kRk7kU9dEanyqyqkMsvYmA", "question": "What color is the lead to the dog standing to the rear side of the black dog?", "choices": ["green", "purple", "pink", "white"], "correct_choice_idx": 2, "direct_answers": ["black", "pink", "pink", "pink", "pink", "pink", "pink", "pink", "pink", "purple"], "difficult_direct_answer": false, "rationales": ["The dog on the lead is wearing a pink leash.", "Unless you are colorblind you can tell what color the leash is.", "The lead is not purple, green, or white."], "image": "train2014/COCO_train2014_000000245953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 550445, "question_id": "kRun25PpxfxuZqKCDto5Ff", "question": "Who would use the inanimate object with the face for their job?", "choices": ["firefighter", "street sweeper", "policeman", "bus driver"], "correct_choice_idx": 0, "direct_answers": ["camera", "firefighter", "camera", "fire hydrant", "artist", "firefighter", "hydrant", "fireman", "firefighter", "fire"], "difficult_direct_answer": false, "rationales": ["There is a hydrant.", "The fire hydrant in the grass has a face painted on it, and it would be used in cases of needing to fight a fire.", "This is a fire hydrant so a firefighter would be the one to use it on their job."], "image": "train2014/COCO_train2014_000000550445.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 199243, "question_id": "kSYJnkwSDrNsTbycgCGCB8", "question": "What person most likely has flown on this airline?", "choices": ["date masamune", "thespis", "bret hart", "greta thunberg"], "correct_choice_idx": 2, "direct_answers": ["justin treadau", "canadian", "bret hart", "canadian", "canadian", "travel", "trudeau", "higher class", "airline", "canadian"], "difficult_direct_answer": false, "rationales": ["It is air canada.", "Bret hart would mostly like fly on it.", "Bret hart is a commercial air pilot."], "image": "train2014/COCO_train2014_000000199243.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 324266, "question_id": "kSYa8yRZaC8K7iZYdvwCHP", "question": "What color is the lamp on the top of the wall next to the shower?", "choices": ["yellow", "blue", "red", "white"], "correct_choice_idx": 0, "direct_answers": ["yellow", "it's white", "white", "white", "white", "yellow", "yellow", "white", "white", "yellow"], "difficult_direct_answer": false, "rationales": ["The lamp is yellow in color.", "The lamp looks yellowish.", "The color is yellow."], "image": "val2014/COCO_val2014_000000324266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413012, "question_id": "kVgoYPuovD6V5biMJQ4Z7R", "question": "What character is the woman on the right dressed as?", "choices": ["jessica jones", "leeloo", "arya stark", "uhtred"], "correct_choice_idx": 1, "direct_answers": ["eagle", "lela", "milla jovovich", "nami", "leeloo", "leeloo", "angel", "mummy", "clown", "leeloo"], "difficult_direct_answer": false, "rationales": ["The woman on the right is dressed as a female, not male, character. her hair is red, not brown.", "The character is leeloo.", "The woman on the right is dressed as the character leeloo from fifth element."], "image": "train2014/COCO_train2014_000000413012.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 282579, "question_id": "kYuov4EBVAzeo88sZfyVWk", "question": "What is near the sandwich?", "choices": ["onion rings", "watermelon", "fries", "cactus"], "correct_choice_idx": 2, "direct_answers": ["beef", "french-fry", "fries", "fries", "fries", "french fries", "fee", "fries", "fries", "french fries"], "difficult_direct_answer": false, "rationales": ["There is a long potato sticks associated with sandwiches. they are crispy and have salt on them and yellow.", "There are long fries by the sandwich.", "There are fries on the plate to."], "image": "train2014/COCO_train2014_000000282579.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 143234, "question_id": "kZ93REVcEPLcmEhFXuHd85", "question": "What color is the DC logo spray painted across the skate ramp?", "choices": ["blue", "black", "white", "red"], "correct_choice_idx": 1, "direct_answers": ["black", "green", "green", "black", "black", "black", "black", "fee", "brown", "black"], "difficult_direct_answer": false, "rationales": ["The color is black.", "The logo on the ramp is black.", "The logo can be found over and over on right side of ramp."], "image": "val2014/COCO_val2014_000000143234.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 349006, "question_id": "kZwcCuabSwykDJwgUNTXw8", "question": "What is on top of the skateboard?", "choices": ["elephant", "cat", "sneaker", "dog paw"], "correct_choice_idx": 2, "direct_answers": ["man", "shoe", "shoe", "shoe", "shoe", "shoe", "sneaker", "shoe", "foot", "skater"], "difficult_direct_answer": false, "rationales": ["A shoe is on top the the deck of a skateboard.", "The guy is wearing a type of athletic rubber-soled shoes.", "The top has a sneaker."], "image": "val2014/COCO_val2014_000000349006.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 268092, "question_id": "ka8nQLdzHDa4JKnYC5LxXz", "question": "What is the wicker basket covering on top of?", "choices": ["fruits", "head", "teddy bears", "vegetables"], "correct_choice_idx": 2, "direct_answers": ["teddy bears", "brown", "teddy bears", "stuffed animals", "teddy bear", "bears", "pills", "toy", "stuffed animal", "bears"], "difficult_direct_answer": false, "rationales": ["The basket covers teddy bears.", "There are plush bears inside the basket.", "The basket is over bears."], "image": "val2014/COCO_val2014_000000268092.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221684, "question_id": "kaDnjc6TcLnmrEmmdpzECz", "question": "What food can be made from this animal?", "choices": ["cheeseburger", "lobster roll", "venison stew", "fish cakes"], "correct_choice_idx": 0, "direct_answers": ["steak", "cheeseburger", "beef", "animal", "burgers", "beef", "burger", "burger", "beef", "steak"], "difficult_direct_answer": false, "rationales": ["The animals on the side of the road are used to produce beef that is found in cheeseburgers.", "The animals in question are cows based on their size and shape. answer a is a food product that is made from ground beef which comes from the animals in question.", "You can make meat from the cows."], "image": "train2014/COCO_train2014_000000221684.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441574, "question_id": "kaXd8cpbKoCMChbvxci2vX", "question": "What is the person leaning against?", "choices": ["fence", "cross", "horse", "egg"], "correct_choice_idx": 0, "direct_answers": ["fence", "wood fence", "fence", "fence", "fence", "fence", "fence", "fencepost", "fence", "fence"], "difficult_direct_answer": false, "rationales": ["This structure is being used to keep people from danger the man is leaning against it so that he can view without falling.", "It's a common yard barrier. it's often used in rural areas.", "A person is standing at a wood post that has some parallel pieces and post between."], "image": "train2014/COCO_train2014_000000441574.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58335, "question_id": "kbvQqZifZgq68ZsXr4QG6S", "question": "What is the person in the foreground doing?", "choices": ["eating", "sleeping", "leaping", "walking"], "correct_choice_idx": 2, "direct_answers": ["skateboarding", "skating", "jumping", "skating", "jumping", "skateboarding", "leaping", "jumping", "skateboarding", "skateboarding"], "difficult_direct_answer": false, "rationales": ["A guy is jumping up on a skateboard in order to do a trick.", "The person in the foreground is awake and is above the ground. this person is not eating or walking.", "The person is awake and is doing a skateboard trick."], "image": "val2014/COCO_val2014_000000058335.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277496, "question_id": "kc8YopC4R9SeoY8NChtfK8", "question": "What can be seen in the mirror reflection?", "choices": ["cat", "banana", "lamp", "woman"], "correct_choice_idx": 2, "direct_answers": ["mirror lamp", "flower mirror", "mirror", "lamp", "another mirror", "mirror", "mirror", "another mirror", "mirror", "lamp"], "difficult_direct_answer": false, "rationales": ["There is a device that is hanging on wall. it has a light inside it with a shade on top.", "There are no animals, people, or fruits.", "The mirror reflection shows the lamp."], "image": "train2014/COCO_train2014_000000277496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119469, "question_id": "kdiD3dxQX52dSsA6x4bjDH", "question": "What kind of fence encloses the pasture containing sheep?", "choices": ["electric", "wood", "iron", "wire"], "correct_choice_idx": 3, "direct_answers": ["wire", "wire", "mesh wire", "boarded wire", "grass", "plane", "wire", "wire", "strong fence", "chicken wire"], "difficult_direct_answer": false, "rationales": ["Though any of these answers are viable, but mostly it is a wired fence.", "It is a thin metal fence.", "The pasture containing the sheep is enclosed by a fence made of wire."], "image": "val2014/COCO_val2014_000000119469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 247141, "question_id": "keKR9PEpiYWbz833SrW7Ed", "question": "What are the rocks made of which are aligned with the fence?", "choices": ["sandstone", "cement", "foam", "granite"], "correct_choice_idx": 1, "direct_answers": ["fibreglass", "plastic", "rock", "stone", "stone", "cement", "rock", "stone", "plastic", "marble"], "difficult_direct_answer": false, "rationales": ["The rocks are cement.", "These are manmade rocks and are not natural. man-made rocks would be made from cement.", "They are well aligned by the use of concrete."], "image": "val2014/COCO_val2014_000000247141.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 542343, "question_id": "kfkH7gzS6r2CiEn6iWjsdZ", "question": "What color are the stripes on the big bench that is held on the elephant's back?", "choices": ["pink", "orange", "blue", "green"], "correct_choice_idx": 0, "direct_answers": ["red", "brown", "black", "black", "red white", "pink", "green", "pink", "red", "white"], "difficult_direct_answer": false, "rationales": ["The rug on the chair on the back of the elephant is pinkish.", "The seat on top of the elephant is used to carry passengers including tourists who get a ride on the elephant with the pink and white bucket.", "There are some pink stripes on the blanket on top of the elephant."], "image": "train2014/COCO_train2014_000000542343.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208850, "question_id": "kgKEepTErXMvZmnuqjZFXn", "question": "What is the little elephant eating on the ground?", "choices": ["straw", "salt", "grass", "nothing"], "correct_choice_idx": 3, "direct_answers": ["grass", "grass", "grass", "grass", "grass", "nothing", "grass", "grass", "grass", "grass"], "difficult_direct_answer": false, "rationales": ["The little elephant isn't eating anything on the ground.", "He's standing there looking at the photographer", "The elephant isn't doing anything to eat."], "image": "train2014/COCO_train2014_000000208850.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 327157, "question_id": "khQAieZjXgNopS48dzUfDQ", "question": "What is on the table near the TV?", "choices": ["egg", "soda bottle", "pumpkin", "echidna"], "correct_choice_idx": 1, "direct_answers": ["chair", "drinks", "soda", "coke", "coke", "coca cola", "soda", "coke", "coke", "soda bottle"], "difficult_direct_answer": false, "rationales": ["A two-liter container of the beverage coca cola is sitting on the desk underneath the television.", "A soda bottle because there is no egg, pumpkin, or echidna.", "You can see the coke bottle sitting on the table."], "image": "train2014/COCO_train2014_000000327157.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 108548, "question_id": "khnGJaXkKFLz7esEBahtmq", "question": "What is next to the sink?", "choices": ["microwave", "apple", "toilet", "ironing board"], "correct_choice_idx": 2, "direct_answers": ["hairdryer", "hair dryer", "toilet", "toilet", "wall", "hairdryer", "tap", "toilet", "watching", "toilet"], "difficult_direct_answer": false, "rationales": ["Toilets are kept in bathroom with sinks.", "There is a toilet", "The sinks are clearly identifiable based on their shape and design. the object in proximity are also identifiable based on their size, shape and their placement in this setting."], "image": "val2014/COCO_val2014_000000108548.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14990, "question_id": "kihCwHZzPqvDxddrLHu8Nm", "question": "What is between the elephants?", "choices": ["buzzard", "box", "bench", "man"], "correct_choice_idx": 3, "direct_answers": ["man", "man", "man", "man", "person", "man", "man", "man", "man", "man"], "difficult_direct_answer": false, "rationales": ["There is a person.", "There is a person with two arms and legs who is wearing a hat. he is standing in between the two elephants.", "There is a man between two big creatures in a dirt and grassy area."], "image": "val2014/COCO_val2014_000000014990.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 195507, "question_id": "kjJkVf7SsnsAudvbFu4jeF", "question": "What color is the vase on the right side of the white entertainment center?", "choices": ["turquoise", "yellow", "red", "red"], "correct_choice_idx": 0, "direct_answers": ["blue", "wite", "turquoise", "green", "grey", "turquoise", "blue green", "turquoise", "green", "green"], "difficult_direct_answer": false, "rationales": ["The color is turquoise.", "The vase on the right side of the white entertainment center is not red or yellow.", "It is the same colour as tiffanys."], "image": "train2014/COCO_train2014_000000195507.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 207048, "question_id": "kk7HZegnhAW2ocZwv8JHMd", "question": "What is the person on the left holding?", "choices": ["ski poles", "pumpkins", "kittens", "eggs"], "correct_choice_idx": 0, "direct_answers": ["ski poles", "metal", "ski poles", "sticks", "ski", "ski poles", "stick", "poles", "poles", "ski's"], "difficult_direct_answer": false, "rationales": ["The people are on a hill and are standing on snow. the person on the left is about to do an extreme winter sport that is similar to snowboarding.", "The person on the left has ski poles with them.", "A group of skiers are all standing together and one leans forward on his poles."], "image": "train2014/COCO_train2014_000000207048.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 276168, "question_id": "kmFvepwgE3UqZy8xnGG8QL", "question": "What type of military individual fly's this plane?", "choices": ["marine", "airmen", "seaman", "soldier"], "correct_choice_idx": 1, "direct_answers": ["pilot", "pilots", "soldier", "air force", "air force", "air force", "air force", "airmen", "air force", "air force"], "difficult_direct_answer": false, "rationales": ["The airmen would fly the plane.", "The plane belongs to the air force. marines and seaman belong to the naval branch and soldiers belong to the army which leaves option a.", "The men that fly these military planes would be the airmen."], "image": "train2014/COCO_train2014_000000276168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264377, "question_id": "kn8FrvyxDipL5Y7sTohBjv", "question": "What part of the animal on the right is visible?", "choices": ["wings", "tail", "hooves", "ears"], "correct_choice_idx": 3, "direct_answers": ["body", "head", "head", "goat", "face", "ears", "goat", "coat", "upper body", "head"], "difficult_direct_answer": false, "rationales": ["Of the body parts visible on the animal in question, only answer a is currently visible.", "The sheep on the right is looking straight and both ears are showing.", "The part is the ears."], "image": "train2014/COCO_train2014_000000264377.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270956, "question_id": "kncC6cQRWihCGfTaNKZGsW", "question": "What is most likely in the smoothie?", "choices": ["grapes", "watermelon", "blueberry", "strawberry"], "correct_choice_idx": 3, "direct_answers": ["light", "strawberry", "tomato", "strawberry", "nothing", "fruits", "strawberries", "strawberries", "strawberry", "strawberries"], "difficult_direct_answer": false, "rationales": ["It looks red so it has strawberries in it.", "The contents of the smoothie is red. answer a is a fruit that is commonly an ingredient in smoothies and is red in color.", "The drink is mostly red."], "image": "train2014/COCO_train2014_000000270956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134815, "question_id": "ko2mNWFKvfFTb2rJZRezHQ", "question": "What is the object called which is supporting the stove?", "choices": ["cabinet", "prep table", "bar top", "island"], "correct_choice_idx": 3, "direct_answers": ["counter", "island", "counter", "tile", "plate", "cabinet", "cook", "counter", "floor", "island"], "difficult_direct_answer": false, "rationales": ["There is a industrial stove on top of a kitchen center piece. people can walk on all sides of this piece of furniture.", "A kitchen island surrounds the stove.", "The object is an island."], "image": "val2014/COCO_val2014_000000134815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 554275, "question_id": "kqudTQHB7RL5EyG6nQx5jR", "question": "What is touching the fruit?", "choices": ["cat", "baby foot", "dog", "finger"], "correct_choice_idx": 3, "direct_answers": ["fingers", "finger", "finger", "finger", "hand", "hand", "finger", "hands", "hand", "finger"], "difficult_direct_answer": false, "rationales": ["The man is pointing his digit into the skin.", "A person is touching the fruit and specifically his finger.", "The hand of the person rests on the fruit."], "image": "train2014/COCO_train2014_000000554275.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 145211, "question_id": "ksAsreJeWrvwfAfhu8oivV", "question": "What keeps this room cool?", "choices": ["ceiling fan", "swamp cooler", "air conditioner", "tower fan"], "correct_choice_idx": 0, "direct_answers": ["ceiling fan", "fan", "fan", "fan", "ceiling fan", "ceiling fan", "fan", "ceiling fan", "fan", "sofa"], "difficult_direct_answer": false, "rationales": ["It is the only air cooling object in the room.", "The fan keeps the air cool.", "The object on the ceiling has \"wings\" that propel air to go around the room cooling people off."], "image": "train2014/COCO_train2014_000000145211.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 413432, "question_id": "ksw2gQiGJBksnafijbCifv", "question": "What does the man have in his hand?", "choices": ["baby", "egg", "racquet", "kitten"], "correct_choice_idx": 2, "direct_answers": ["racquet", "racket", "racquet", "tennis racket", "racquet", "tennis racket", "tennis racquet", "tennis racket", "tennis racket", "racket"], "difficult_direct_answer": false, "rationales": ["This is used in tennis to hit the ball in the opponents direction.", "The man has a racquet.", "It is a stringed wooden item used to hit balls in a game"], "image": "val2014/COCO_val2014_000000413432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 289333, "question_id": "kuoGfJe57LLVpZfiDahssA", "question": "What country is most known for serving dishes like this?", "choices": ["nepal", "gabon", "kazakhstan", "italy"], "correct_choice_idx": 3, "direct_answers": ["bsll", "italy", "italy", "italy", "italy", "italy", "country", "american", "italy", "italy"], "difficult_direct_answer": false, "rationales": ["A pasta dish is on a plate", "Pasta dishes are very popular in europe.", "Italy serves pasta."], "image": "train2014/COCO_train2014_000000289333.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 15219, "question_id": "kvAFbVUhwcx7dJcHdareDc", "question": "What fruit is topping the desert pizza?", "choices": ["blueberry", "olive", "strawberry", "raspberry"], "correct_choice_idx": 0, "direct_answers": ["tomato", "tomatoes", "blueberry", "olive", "blueberry", "tomato", "tomato", "tomato", "tomato", "tomato"], "difficult_direct_answer": false, "rationales": ["There is a pastry fruit pizza on a plate with tomatoes, spinach leaves and blueberry.", "Answer c is most consistent with common toppings to a pizza and the size, shape and color of the topping on this pizza.", "The fruit is a blueberry."], "image": "train2014/COCO_train2014_000000015219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 557981, "question_id": "kvPprxZunZdSW92gkKyg6C", "question": "Which item in the man's hand makes a more appropriate gift for a baby?", "choices": ["pacifier", "stuffed bear", "parasol", "rattle"], "correct_choice_idx": 1, "direct_answers": ["plushie", "stuffed animal", "stuffed animal", "teddy bear", "stuffed bear", "teddy bear", "umberlaw", "stuffed animal", "stuffed animal", "stuffed animal"], "difficult_direct_answer": false, "rationales": ["The man is holding a teddy and a parasol. the teddy would be more appropriate for a baby.", "The man is holding a stuffed teddy bear in his hand that would make an appropriate gift for a baby.", "A man is holding an umbrella and a plush toy."], "image": "val2014/COCO_val2014_000000557981.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 410554, "question_id": "kwjVLqHAjFHAouUiRw4tpZ", "question": "What is in the bottle with a red top?", "choices": ["ketchup", "olive oil", "hot sauce", "pepper"], "correct_choice_idx": 2, "direct_answers": ["hot sauce", "tomato sauce", "tobasco", "hot sauce", "hot sauce", "hot sauce", "hot sauce", "sauce", "hot sauce", "ketchup"], "difficult_direct_answer": false, "rationales": ["The bottle contains a red liquid. it is not thick enough to be ketchup.", "It is in a characteristically shaped bottle and you can see the red color of the sauce which comes from the chili peppers.", "The bottle is hot sauce."], "image": "val2014/COCO_val2014_000000410554.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 342583, "question_id": "kxZnS6HYhrJZBvaRKMThCs", "question": "What number is at the front of the truck?", "choices": ["93", "82", "six", "45"], "correct_choice_idx": 2, "direct_answers": ["six", "six", "six", "six", "six", "six", "road", "six", "six", "six"], "difficult_direct_answer": false, "rationales": ["The number can be seen in the middle of the cab.", "On the front of the red truck is a number. it is right below the windshield of the truck.", "The truck has a number on the front."], "image": "train2014/COCO_train2014_000000342583.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 556751, "question_id": "kyJTcMnD7hNKLFrKUFftaK", "question": "What color are the toddler girl's stockings?", "choices": ["white", "pink", "blue", "green"], "correct_choice_idx": 0, "direct_answers": ["nude", "white", "white", "white", "white", "white", "white", "white", "pink", "white"], "difficult_direct_answer": false, "rationales": ["Her stockings are not pink, blue, or green.", "The little girl has tights on that the same color as the background color of her dress.", "The color is white."], "image": "val2014/COCO_val2014_000000556751.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334867, "question_id": "kyPEHzoBSzpRcakAJJWXHF", "question": "What color is the snow machine on the right hand side?", "choices": ["green", "yellow", "red", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue color", "yellow", "blue", "blue", "blue", "blue black", "blue", "blue-black", "white", "blue"], "difficult_direct_answer": false, "rationales": ["The color is blue.", "The snow machine on the right side of the group is royal blue.", "A group of people pose together and one has his hand on a white and blue snowmobile."], "image": "train2014/COCO_train2014_000000334867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 187560, "question_id": "kycnKzvpGDJb4fTsapvTGH", "question": "What can be seen in the mirror?", "choices": ["apple", "large cracks", "clock", "cat"], "correct_choice_idx": 2, "direct_answers": ["face", "clock", "clock", "clock", "clock", "clock", "clock", "clock", "clock", "clock"], "difficult_direct_answer": false, "rationales": ["The mirror has a clock.", "There is a white clock that can be seen in the reflection of the mirror.", "A round object with hands is visible in a mirror of a bathroom."], "image": "train2014/COCO_train2014_000000187560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 254917, "question_id": "kzMUn7zzAt8rhTtYSqzRZJ", "question": "What color are the brick squares painted on the bottom of this building?", "choices": ["blue", "red", "white", "tan"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "purple", "blue"], "difficult_direct_answer": false, "rationales": ["The color is bright and easily visible. it is in sharp contrast to the gray sidewalk.", "The color is blue.", "The color is blue."], "image": "val2014/COCO_val2014_000000254917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 552571, "question_id": "kzNyQUGiQnR2aeLzTtNMTR", "question": "What is the littlest elephant called?", "choices": ["pup", "colt", "squab", "calf"], "correct_choice_idx": 3, "direct_answers": ["calf", "baby", "calf", "calf", "baby", "calf", "baby", "calf", "baby", "baby elephant"], "difficult_direct_answer": false, "rationales": ["A young one of an elephant is called a calf.", "The other options apply to dogs, horses and pigeons.", "Animals have specific names for their young. this name describes baby elephants as well as other animals such as cows."], "image": "train2014/COCO_train2014_000000552571.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 171648, "question_id": "m4zLeM2PKyAmQuStnwuokz", "question": "What is the brown stuff all over the baby from?", "choices": ["poo", "cake", "smoothie", "paint"], "correct_choice_idx": 1, "direct_answers": ["chocolate", "cake", "chocolate cake", "cake", "chocolate cake", "cake", "cake", "chocolate cake", "cake", "cake"], "difficult_direct_answer": false, "rationales": ["The baby is sitting behind this dessert.", "The brown stuff is cake.", "There is a large brown cake next to the baby"], "image": "train2014/COCO_train2014_000000171648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 371250, "question_id": "m5EvZso8hujC46VxbWRf2c", "question": "What design is on the pillows?", "choices": ["egg", "cow", "apple", "pineapple"], "correct_choice_idx": 3, "direct_answers": ["squared", "pineapples", "pineapple", "pineapple", "pineapple", "pineapple", "cake", "pineapple", "pineapples", "pineapple"], "difficult_direct_answer": false, "rationales": ["It is identifiable by its oblong yellow body and spiky green top.", "The design is a pineapple.", "Each pillow has a tropical fruit."], "image": "val2014/COCO_val2014_000000371250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 223648, "question_id": "m5QPftJbZmt8iavTr3phVc", "question": "What wooden items are on the table?", "choices": ["benches", "building blocks", "books ends", "utensils"], "correct_choice_idx": 3, "direct_answers": ["stop", "spoons", "spoons", "utensils", "cooking utensils", "utensils", "spoons", "spoons", "spoons", "spoon"], "difficult_direct_answer": false, "rationales": ["Wooden utensils are on the table", "These are spoons and forks", "Various spoons, spatulas, and forks are laid out on a table and are light brown in color."], "image": "val2014/COCO_val2014_000000223648.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517425, "question_id": "m5WeKWYygEmKWsuHCXbiae", "question": "What color are the lateral stripes wrapped around the black bodied boat?", "choices": ["orange", "red", "white", "yellow"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "white"], "difficult_direct_answer": false, "rationales": ["The black boat is clearly visible and the trim colors can been seen and identified.", "The black boat has red stripes", "There is a bright stripes around a black boat in the water. it is probably to let people know when its dark and hard to see."], "image": "val2014/COCO_val2014_000000517425.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 477193, "question_id": "m5tfU7SQjWmZ8eYs3kPuQb", "question": "What color is the line on the floor?", "choices": ["red", "purple", "blue", "yellow"], "correct_choice_idx": 3, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "ask", "yellow", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["There is a dotted lane on the road that is similar to the sun and people are walking up and down it.", "This is a common color indicating direction of traffic on a road or path", "The line is not blue, red, or purple."], "image": "train2014/COCO_train2014_000000477193.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333406, "question_id": "m6F56FZVeCFMo7wJBRwA58", "question": "What is the person holding the cat known for?", "choices": ["wrestling", "basketball", "cooking", "tennis"], "correct_choice_idx": 2, "direct_answers": ["cooking", "cooking", "lifestyle products", "cooking", "martha stewart", "cooking", "insider trading", "cooking", "cooking", "cooking"], "difficult_direct_answer": false, "rationales": ["It is martha stewart.", "The woman holding the cat is martha stewart.", "The person has a recognizable face, martha stewart is a famous chef."], "image": "val2014/COCO_val2014_000000333406.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402833, "question_id": "m753BtsuJLBUSwwyhCBfrx", "question": "What color shirt does the man closest to the camera have on?", "choices": ["orange", "black", "blue", "red"], "correct_choice_idx": 2, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The man's shirt is not red, black, or orange.", "The man closest to the camera is wearing a bright blue collared shirt.", "The man closest to the foreground walking next to the elephant is wearing a shirt that is blue."], "image": "train2014/COCO_train2014_000000402833.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 100563, "question_id": "m8cW27Njas2bidQihcskNn", "question": "Which letter of the alphabet represents this docking terminal?", "choices": ["b", "", "c", "d"], "correct_choice_idx": 3, "direct_answers": ["dee", "d", "letter d", "d", "d", "letter d", "no", "d docking", "letter d", "d"], "difficult_direct_answer": false, "rationales": ["There is a sign that says d55. this should be the number of the terminal.", "The letter d is represented.", "The letter and numbers at the top of the terminal states which dock it is."], "image": "train2014/COCO_train2014_000000100563.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435803, "question_id": "m8sJUSAsU23eK3fLBygc9h", "question": "What number is on the plane?", "choices": ["995", "237", "145", "568"], "correct_choice_idx": 0, "direct_answers": ["999", "995", "995", "nine hundred", "995", "995", "995", "995", "999", "nine five"], "difficult_direct_answer": false, "rationales": ["That is the number on the plane.", "The numbers are painted in black against the silver", "The clear number seen on the plain is 995."], "image": "train2014/COCO_train2014_000000435803.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 242656, "question_id": "m9AYkU5RT7YfE2wYggohQw", "question": "What is one of the cows hiding behind?", "choices": ["airplane", "truck", "tree", "elephant"], "correct_choice_idx": 2, "direct_answers": ["tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree", "tree"], "difficult_direct_answer": false, "rationales": ["The cow is behind a tree.", "The cow is hiding behind the trunk.", "One of the cows in this image's midsection is obscured by a tree."], "image": "train2014/COCO_train2014_000000242656.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572886, "question_id": "mBjikxx3fgSJrTZKGqmkKf", "question": "What kind of fencing keeps the zebras enclosed in the zoo?", "choices": ["wood", "link", "wire", "pool"], "correct_choice_idx": 2, "direct_answers": ["barbed wire", "wire", "wired", "iron", "barbwire", "wire fence", "electric", "wire", "wire", "wire"], "difficult_direct_answer": false, "rationales": ["There are metal lines that keep the zebras enclosed.", "Wire fencing is needed.", "The fence is wire."], "image": "val2014/COCO_val2014_000000572886.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 358378, "question_id": "mDL3HN5gwTNkjf4Hz82UA6", "question": "How many computer screens are around the cat sleeping on the laptop?", "choices": ["three", "five", "four", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "one", "two", "two", "one", "one", "one", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There are two screens.", "All you have to do is count simply and you can tell how many monitors there are.", "One close up and one in the background. cats are attracted to computer warmth."], "image": "train2014/COCO_train2014_000000358378.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 95281, "question_id": "mEHNdf884GfhoJNbJNFNLc", "question": "What is the artwork on the wall called?", "choices": ["mural", "graffiti", "tapestry", "mosaic"], "correct_choice_idx": 0, "direct_answers": ["mural", "silhouette art", "mosaic", "mural", "mural", "painting", "mural", "painting", "tree", "painting"], "difficult_direct_answer": false, "rationales": ["There is a tree design on the wall. it is typically called a mural since it is painted on.", "The artwork on the wall is called a mural because it is large and made of different elements.", "Of the answers provided, only answer a matches the style of a painted display like that visibly on the wall."], "image": "train2014/COCO_train2014_000000095281.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 417556, "question_id": "mEXQqRqdGLvWfhktVL5WbZ", "question": "What color is the stripe on the top of the auto service garage building?", "choices": ["blue", "red", "yellow", "green"], "correct_choice_idx": 0, "direct_answers": ["blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue"], "difficult_direct_answer": false, "rationales": ["The stripe is not green, yellow, or red.", "It is blue in color.", "The color is blue."], "image": "train2014/COCO_train2014_000000417556.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89906, "question_id": "mEYqtjFrxksBuuFELXuN9S", "question": "What kind of object is dispensed from the receptacle pinned into the wall?", "choices": ["paper towels", "soap", "cups", "toilet paper"], "correct_choice_idx": 2, "direct_answers": ["toilette", "plastic cup", "soap", "cup", "washbasin", "blue", "cups", "toilet", "plastic cups", "cups"], "difficult_direct_answer": true, "rationales": ["Plastic cups can be seen in a dispenser on the wall of a bathroom.", "There are plastic cups in the holder on the wall.", "They are colorless and plastic and can be able to hold liquid."], "image": "train2014/COCO_train2014_000000089906.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533799, "question_id": "mFp7chBcEPvcbm5dA9kGRf", "question": "Why are all the other cars letting the yellow truck go?", "choices": ["respect", "scared", "emergency", "no reason"], "correct_choice_idx": 2, "direct_answers": ["fire vehicle", "emergency", "fire engine", "fire", "emergency", "crossing", "stoplight", "fire emergency", "emergency", "emergency"], "difficult_direct_answer": false, "rationales": ["The truck in the intersection is responding to an emergency.", "There are firetrucks going through.", "The cars are letting the yellow truck go because it is an emergency vehicle on its way to a fire."], "image": "train2014/COCO_train2014_000000533799.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331455, "question_id": "mGaw6hMYJosARatuXArEaH", "question": "What is the boat plowing through?", "choices": ["lake", "ocean", "canal", "riverwater"], "correct_choice_idx": 1, "direct_answers": ["water", "ocean", "water", "ocean", "water", "ocean", "ocean", "water", "ocean", "water"], "difficult_direct_answer": false, "rationales": ["The boat is on a large mass of water with no land in sight.", "A large boat is in a large body of water with no land visible.", "The boat's in an ocean."], "image": "val2014/COCO_val2014_000000331455.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 294595, "question_id": "mGxxRnjmQTFKhAqXmgMaEx", "question": "What is the quickest way to heat food in this kitchen?", "choices": ["hot water", "light", "microwave", "oven"], "correct_choice_idx": 2, "direct_answers": ["microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave", "microwave"], "difficult_direct_answer": false, "rationales": ["The microwave in the kitchen is the fastest way to heat food.", "Although the other options can heat food, a is the only one that can do it in seconds.", "There is an older looking kitchen and a appliance stuck in the cabinet. it can cook food at a rapid rate."], "image": "val2014/COCO_val2014_000000294595.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 128245, "question_id": "mH5NK63pYjifGyH6ybb4vi", "question": "What color is the bottom half of the snowboard which is carried up the hill by a man with ski poles?", "choices": ["blue", "white", "red", "purple"], "correct_choice_idx": 1, "direct_answers": ["white", "gray", "blue", "white", "grey", "white", "gray", "light blue", "white", "blue"], "difficult_direct_answer": false, "rationales": ["The snowboard has a white bottom.", "It is the same color as the snow.", "The bottom half of the snowboard being carried is white."], "image": "train2014/COCO_train2014_000000128245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 538250, "question_id": "mH9SAEQiiw4QarGu2XZujr", "question": "What type of tram is this one called?", "choices": ["triple decker", "single decker", "double decker", "environmental"], "correct_choice_idx": 2, "direct_answers": ["double", "double decker", "track", "trolley", "large tram", "train", "bus", "stop", "trolley", "double decker"], "difficult_direct_answer": false, "rationales": ["The tram has two layers.", "The tram has a lower level and an upper level.", "A bus with two levels is in the street."], "image": "train2014/COCO_train2014_000000538250.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 511358, "question_id": "mHrDvPo9R9QmRnXs9zi868", "question": "What provides power to these toothbrushes?", "choices": ["electricity", "water", "sun", "battery"], "correct_choice_idx": 3, "direct_answers": ["electricity", "electricity", "cords", "cord", "electricity", "battery", "battery", "cord", "electricity", "electric"], "difficult_direct_answer": false, "rationales": ["They cannot be used while plugged in so the cord visible must charge them.", "There is a cord leading to it", "That type of brush uses a battery."], "image": "train2014/COCO_train2014_000000511358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368515, "question_id": "mJ8bRqMfS6uCA8ivFVGYh7", "question": "Where is there most likely to be a baby at in this picture?", "choices": ["stroller", "building", "car", "tree"], "correct_choice_idx": 0, "direct_answers": ["car", "left", "banner", "last", "no", "stroller", "stroller", "stroller", "stoller", "stroller"], "difficult_direct_answer": false, "rationales": ["This is a vehicle for small children", "There is a wheeled, covered vehicle that can be pushed by a pedestrian on the left.", "The baby is in a stroller."], "image": "train2014/COCO_train2014_000000368515.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 261426, "question_id": "mJrcRVTczU6xiVo3ZAjGMN", "question": "What is in the sandwich that is highest in the air?", "choices": ["black olives", "toothpick", "giant eggs", "carrots"], "correct_choice_idx": 1, "direct_answers": ["but", "sun", "turkey club", "tomato", "ham", "turkey", "toothpick", "but", "turkey", "top one"], "difficult_direct_answer": false, "rationales": ["There is a toothpick sticking out of the highest part of the sandwich.", "A tall sandwich can come apart or fall over if it isn't held together by a toothpick or decorative spike.", "The sandwich has a toothpick."], "image": "val2014/COCO_val2014_000000261426.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6864, "question_id": "mK8vmrbmm3TgvnhNBuHu7D", "question": "What color is the stripe in the middle of the signs on both sides of the beltway?", "choices": ["white", "yellow", "blue", "black"], "correct_choice_idx": 0, "direct_answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"], "difficult_direct_answer": false, "rationales": ["The stripe in the middle of the red circle is white.", "The color is white.", "The red sign has a white stripe in the middle."], "image": "val2014/COCO_val2014_000000006864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 533502, "question_id": "mKhSGiedALc8iGYW98eh64", "question": "Which kitchen appliance is underneath of the upper cupboards?", "choices": ["oven", "dishwasher", "refrigerator", "sink"], "correct_choice_idx": 0, "direct_answers": ["vessel", "stove", "oven", "oven", "oven", "stove", "oven", "range", "oven", "oven"], "difficult_direct_answer": false, "rationales": ["It is on the ground touching the floor.", "Answer a is the only appliance that is clearly visible underneath the cupboards.", "The kitchen appliance has temperature controls, a window, and a lower storage area that is not temperature-controlled. the sink is not under the upper cupboards."], "image": "train2014/COCO_train2014_000000533502.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 277518, "question_id": "mKvuzcXc99W53zAoBxRWbD", "question": "What is on top of the boat?", "choices": ["old couple", "toddlers", "birds", "oars"], "correct_choice_idx": 2, "direct_answers": ["birds", "bird", "birds", "birds", "bird", "birds", "birds", "birds", "birds", "birds"], "difficult_direct_answer": false, "rationales": ["There are birds resting on the boat.", "Birds are found near the water and they frequently perch on anything they can find whether a boat or tree or on the shore.", "There are non-human animals on top of the boat."], "image": "val2014/COCO_val2014_000000277518.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 379567, "question_id": "mLLVaBdrLXo8x6wVtTAcxo", "question": "This scene is likely to appear where?", "choices": ["babysitter resume", "photographers portfolio", "dog advertisement", "wanted ad"], "correct_choice_idx": 1, "direct_answers": ["disappear", "photographers portfolio", "harbor", "at pier", "harbor", "ocean", "wharf", "coast", "beach", "harbor"], "difficult_direct_answer": false, "rationales": ["The picture is used for a portfolio.", "The picture is very artistic and attractive. the sun is large and attractive.", "A nice picture of the sky."], "image": "train2014/COCO_train2014_000000379567.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 264079, "question_id": "mLkfjgRXqmr88toGhnNfjh", "question": "What number of tofu slices are in the side of the bowl next to the rice and broccoli?", "choices": ["two", "one", "three", "four"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["Two tofu slices are on top of an additional one.", "There are three slices.", "There are three."], "image": "train2014/COCO_train2014_000000264079.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503091, "question_id": "mM6dfdRQtQ8tEbmYkkmRsE", "question": "What god or goddess name appears on the plane?", "choices": ["floki", "freya", "artemis", "ra"], "correct_choice_idx": 3, "direct_answers": ["ra", "ra3560k", "ra", "ra", "ra", "ra", "ra", "ra-3560k", "apollo", "ra"], "difficult_direct_answer": false, "rationales": ["The deity's name is on the fuselage of the plane.", "The plane says ra-3560k", "The god ra is on the side of this plane."], "image": "train2014/COCO_train2014_000000503091.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 463201, "question_id": "mM6kfiaT87ZoqHRSnAUGKn", "question": "What is the bar on the left wall for?", "choices": ["flush toilet", "shower items", "toilet paper", "balance"], "correct_choice_idx": 3, "direct_answers": ["flush", "balance", "handle", "mirror", "flashing", "towel", "silver", "holding", "mirror", "unknown"], "difficult_direct_answer": true, "rationales": ["This is to assist people getting up and down", "A silver, long bar is on the wall next to a toilet and is at waste level.", "In case someone turns or gets up and suddenly loses a, they can grab it."], "image": "train2014/COCO_train2014_000000463201.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 457834, "question_id": "mMebEjSsg66E2bQU43x8wD", "question": "What is on top of the elephants?", "choices": ["bows", "ladders", "statues", "people"], "correct_choice_idx": 3, "direct_answers": ["people", "people", "people", "people", "people", "people", "people", "people", "people", "people"], "difficult_direct_answer": false, "rationales": ["There is a line of elephants with people riding on top of each one.", "Many humans are sitting atop an elephant. they are marching across a road to get to the woods.", "There are animals, not bows, ladders, or statues, on top of the elephants. each animal is wearing clothing and has two legs."], "image": "val2014/COCO_val2014_000000457834.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 84214, "question_id": "mNBKBESKGrMAh7peSc5rXT", "question": "How many computer mice are there altogether on the mouse pad?", "choices": ["three", "four", "five", "two"], "correct_choice_idx": 3, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["There is a white mouse. there also is a silver one.", "There are two of them on the mouse pad.", "There are two mice on the mousepad next to the computer."], "image": "train2014/COCO_train2014_000000084214.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20599, "question_id": "mNErUCBaV3hhD9zG5ewjoa", "question": "What color are the bananas in the middle of the cutting table next to the cook?", "choices": ["brown", "green", "yellow", "black"], "correct_choice_idx": 1, "direct_answers": ["green", "green", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The bananas are green.", "A man has a bunch of bananas sitting on a table as he cuts other items.", "The bananas near the cook are not ripe yet."], "image": "train2014/COCO_train2014_000000020599.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484401, "question_id": "mNF7NMe9aMeFXUeLdB7Wtg", "question": "Who feeds this animal?", "choices": ["itself", "dog", "human", "lion"], "correct_choice_idx": 2, "direct_answers": ["human", "owner", "human", "humans", "owner", "human", "human", "owner", "human being", "cat"], "difficult_direct_answer": false, "rationales": ["This animal is an indoor pet cat. it is owned and cared for by someone.", "It is a pet because it is in a house so the owner must feed it.", "It is a house cat in a house which suggests its domesticated. domesticated house cats are very common as pets so people take care of them."], "image": "train2014/COCO_train2014_000000484401.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 221681, "question_id": "mNNgLePcMxjEVGQsoRLMZu", "question": "What is usually done here?", "choices": ["sleeping", "teeth brushing", "watching tv", "cooking eggs"], "correct_choice_idx": 1, "direct_answers": ["washing", "wash urinate", "teeth brushing", "wash hands", "showering", "wash", "use toilet", "wash hands", "wash hands", "wash hands"], "difficult_direct_answer": false, "rationales": ["The teeth are brushed.", "People usually brush their teeth here.", "Here we see a bathroom sink. although no toothbrushes are visible brushing teeth is something that would normally take place here."], "image": "val2014/COCO_val2014_000000221681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301110, "question_id": "mQUgbdZN8sMokVvmsV49Vq", "question": "What color is the boundary cloth on the suitcase of luggage held by the woman with the green bag?", "choices": ["red", "white", "yellow", "green"], "correct_choice_idx": 2, "direct_answers": ["yellow", "yellow", "yellow", "yellow", "yellow", "yellow", "blue", "yellow", "yellow", "yellow"], "difficult_direct_answer": false, "rationales": ["The color is yellow.", "The boundary cloth is not green, red, or white.", "It's yellow around it."], "image": "train2014/COCO_train2014_000000301110.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 27701, "question_id": "mQYtkVrcj6vJMbqjGrDRPQ", "question": "How many sheep are grazing in the pasture enclosed by the wire fence?", "choices": ["five", "six", "eight", "seven"], "correct_choice_idx": 3, "direct_answers": ["seven", "seven", "six", "seven", "six", "seven", "six", "eight", "eight", "seven"], "difficult_direct_answer": false, "rationales": ["There are seven sheep that are in a fence and eating grass. there are rows of pine trees behind them.", "Six sheep are standing side by side but there is one in the background.", "There are 7."], "image": "train2014/COCO_train2014_000000027701.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14366, "question_id": "mRJHaC7h22A9TniXdgxoU8", "question": "What is at the end of each apple?", "choices": ["toothpick", "tooth", "teddy bear", "fork"], "correct_choice_idx": 3, "direct_answers": ["fork", "fork", "fork", "fork", "forks", "apple", "fork", "screw", "fork", "fork"], "difficult_direct_answer": false, "rationales": ["This is an eating utensil with tines", "The apples have forks.", "Pieces of fruit with red skin are on the end of forks. apples are a fruit with red skin."], "image": "train2014/COCO_train2014_000000014366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 113199, "question_id": "mSmJMqX6TjTAJV6qTVrgkJ", "question": "What is on the plate?", "choices": ["cheesy pizza", "eggplant parmigiana", "orange", "apple"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "cheesy pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["The dough crust with melted cheese and bits of marinara sauce visible identifies this food item as pizza.", "Pizza is on the plate.", "The pizza is filled with white cheese on top."], "image": "train2014/COCO_train2014_000000113199.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 333730, "question_id": "mTcMoZJKWE6STeniHC6h44", "question": "What are the animals standing in?", "choices": ["chicken bones", "mud", "grass", "sand"], "correct_choice_idx": 2, "direct_answers": ["grass", "grass", "grass", "sheep", "grass", "yard", "grass", "grass", "yard", "grass"], "difficult_direct_answer": false, "rationales": ["The animals are standing on a patch of grass in the yard.", "The ground is completely covered with green vegetation that is grass and is also eaten by the animals.", "The animals are standing on a green ground, therefore it is grass."], "image": "val2014/COCO_val2014_000000333730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352821, "question_id": "mTj9sMQS7hCqZSDfcEfDwT", "question": "What national flag is embroidered on top of the pillow next to the sofa?", "choices": ["usa", "germany", "france", "uk"], "correct_choice_idx": 3, "direct_answers": ["united kingdom", "american", "england", "us", "nation", "british", "usa", "uk", "french", "uk"], "difficult_direct_answer": true, "rationales": ["The flag has red, white, and blue with a cross in the middle. it has diagonal lines that run into the cross.", "The flag of the united kingdom has a large red cross in the center, flanked by more red lines and blue and white background.", "The flag belongs to the uk."], "image": "train2014/COCO_train2014_000000352821.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 234475, "question_id": "mUPVGKin48ffWFBgisPqq6", "question": "What part of the computer is the cat looking at intently?", "choices": ["mouse", "monitor", "card reader", "keyboard"], "correct_choice_idx": 0, "direct_answers": ["mouse", "mouse", "mouse", "mouse", "mouse", "mouse", "mouse", "mouse", "mouse", "mouse"], "difficult_direct_answer": false, "rationales": ["This is a hand device used to move a cursor around", "There is a black and white mouse standing by a computer. it is looking down a white mouse on table.", "It is a small device, is sitting on a pad and is next to a keyboard."], "image": "train2014/COCO_train2014_000000234475.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 41565, "question_id": "mUhRm8tXtvuJvkrACxaN49", "question": "What is the dog investigating?", "choices": ["cat", "rat", "birthday cake", "baby"], "correct_choice_idx": 2, "direct_answers": ["food", "cake", "cake", "cake", "birthday cake", "cookies", "cake", "cake", "cake", "food"], "difficult_direct_answer": false, "rationales": ["Looking to see if it can get a piece of the cake.", "The dog wants the cake.", "The dog is looking at a cake."], "image": "train2014/COCO_train2014_000000041565.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 407235, "question_id": "mUzj34ovoTmDRnK4xuJ6pm", "question": "What has got the attention of the herd of sheep seen in front of us?", "choices": ["dog", "camera", "wolf", "tree"], "correct_choice_idx": 1, "direct_answers": ["no idea", "photographer", "camera", "humans", "camera", "unknown", "camera", "camera", "photographer", "photographer"], "difficult_direct_answer": false, "rationales": ["The sheep are staring straight at the camera.", "The camera does.", "Someone took a picture of the sheep. the sheep are looking into the lens."], "image": "val2014/COCO_val2014_000000407235.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 237917, "question_id": "mV52PaAP8LATpTGF6FJNRc", "question": "What is probably making the cat so alert?", "choices": ["earthquake", "noise", "person", "camera flash"], "correct_choice_idx": 3, "direct_answers": ["food", "person", "flashing camera", "flash", "flash", "camera flash", "photographer", "person", "person", "camera"], "difficult_direct_answer": false, "rationales": ["The cat's eyes are reflecting something that matches option a.", "There is a spot of light around the cat.", "A light can be seen reflecting in a cat's eyes as the cat looks straight ahead. flash causes reflection in eyes in photos."], "image": "val2014/COCO_val2014_000000237917.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3320, "question_id": "mWwkD3RYuYjzjvf6cjFf6c", "question": "What is the person dragging on the floor?", "choices": ["luggage", "apple", "slug", "box"], "correct_choice_idx": 0, "direct_answers": ["suitcase", "suitcase", "suitcase", "luggage", "luggage", "suitcase", "suitcase", "luggage", "suitcase", "suitcase"], "difficult_direct_answer": false, "rationales": ["The item being dragged is rectangular with wheels on the bottom and an extended handle on top which is consistent with answer a.", "The person has luggage.", "It's a zippered bag with a handle and wheels"], "image": "train2014/COCO_train2014_000000003320.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132887, "question_id": "mXUZSG9S5efGi7nDWNYM8h", "question": "What are these animals known for producing?", "choices": ["pork", "venison", "wool", "milk"], "correct_choice_idx": 3, "direct_answers": ["milk", "milk", "milk", "beef", "milk", "milk", "milk", "milk", "milk", "milk"], "difficult_direct_answer": false, "rationales": ["Cows are known to produce milk.", "The animals make milk.", "B is from deer, c is from sheep and d is from pigs."], "image": "train2014/COCO_train2014_000000132887.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 568956, "question_id": "mYgpVpreDb67fLvQAjXuHK", "question": "What is the child playing with?", "choices": ["pigeons", "yoyo", "cat", "toy truck"], "correct_choice_idx": 0, "direct_answers": ["pigeon", "pegion", "pigeons", "birds", "pigeons", "bird", "birds", "birds", "pigeons", "birds"], "difficult_direct_answer": false, "rationales": ["A boy is in the middle of the street. he has a yellow jacket on and creatures with wings around him on ground and on shoulders.", "The kid has pigeons.", "The children is playing with pigeons in the plaza."], "image": "val2014/COCO_val2014_000000568956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 484019, "question_id": "mYrWRJVWF96b9L5mQ6djvq", "question": "Who speaks the same language that the sign is in?", "choices": ["charlotte vega", "janet montgomery", "roxane mesquida", "sara paxton"], "correct_choice_idx": 2, "direct_answers": ["french", "spanish", "italian", "french people", "french people", "french", "french", "roxane mesquida", "french people", "french"], "difficult_direct_answer": false, "rationales": ["The language is in french and this is a french celebrity.", "Mesquida speaks spanish.", "Roxane mesquida could speak the language of a blue and white flag."], "image": "train2014/COCO_train2014_000000484019.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251168, "question_id": "mZErYK3qmGqBtGyRgFWdLv", "question": "What color is the writing for this team who is batting on top of their helmets?", "choices": ["purple", "yellow", "green", "blue"], "correct_choice_idx": 3, "direct_answers": ["blue", "red blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "red"], "difficult_direct_answer": false, "rationales": ["The color is blue.", "The writing is not in yellow, purple, or green.", "The color is blue."], "image": "train2014/COCO_train2014_000000251168.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 478802, "question_id": "ma8etHKqMJy5n898xRffD6", "question": "What color is the background lighting behind the girl posing for the photo?", "choices": ["blue", "pink", "yellow", "red"], "correct_choice_idx": 1, "direct_answers": ["purple", "purple", "purple", "pink", "pink", "violet", "purple color", "pink", "purple", "purple"], "difficult_direct_answer": false, "rationales": ["Woman is wearing a purple dress with pink ears and has a matching brick wall behind her.", "The girl is posing in front of a bright wall that has pink lights shining on it.", "There is magenta colored lighting behind the girl."], "image": "train2014/COCO_train2014_000000478802.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 141200, "question_id": "mbNk8eCCaf4P9PaZ3xYsQV", "question": "What is falling down?", "choices": ["cat", "rain", "leaves", "bird"], "correct_choice_idx": 1, "direct_answers": ["rain", "rain", "rain", "rain", "rain", "rain", "rain", "rain", "slipping down", "rain"], "difficult_direct_answer": false, "rationales": ["People use umbrellas to protect themselves from getting wet.", "The person is wearing a coat and is carrying an umbrella in order to stay dry.", "It's raining outside."], "image": "train2014/COCO_train2014_000000141200.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 402245, "question_id": "mdNpwbhM2cUcYfHvh8m72e", "question": "What kind of fencing is used around these horses to keep them confined?", "choices": ["iron", "link", "electrified", "wood"], "correct_choice_idx": 2, "direct_answers": ["brown", "white", "wired", "wire fencing", "barbed wire", "barbwire", "wire", "electrified", "barbed wire", "barbed wire"], "difficult_direct_answer": false, "rationales": ["Horses are standing by a barbed wired fence that will shock them if they try to escape.", "The fencing consists of wires, not links. the fencing is not made out of wood or iron.", "Wire will have a shock on the horses."], "image": "train2014/COCO_train2014_000000402245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89407, "question_id": "mfWn9rtCvRKfq24CHvxpmu", "question": "What color is the gas tank of the motorcycle where the child is sitting?", "choices": ["red", "white", "blue", "green"], "correct_choice_idx": 0, "direct_answers": ["maroon", "maroon", "red", "purple", "brown", "red", "red", "maroon", "purple", "red"], "difficult_direct_answer": false, "rationales": ["The gas tank is located directly in front of the boy. it is red.", "The color is red.", "It's a maroon color"], "image": "train2014/COCO_train2014_000000089407.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 72023, "question_id": "mfn3SQbFqHN9HUJojBbTMS", "question": "How many giraffes are walking around in front of the people at the zoo or conservatory?", "choices": ["four", "three", "one", "two"], "correct_choice_idx": 2, "direct_answers": ["one", "one", "one", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["The giraffes are identifiable based on their size and shape and are countable based on their distinct outline.", "The number of giraffes is clearly visible and countable based on the unique features and the outline of the animal.", "The giraffe is by itself."], "image": "train2014/COCO_train2014_000000072023.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 152922, "question_id": "mj4bqfJmRVxymHoteHix2c", "question": "What primary color is split the same between all three family members on their snow suits while they are out skiing?", "choices": ["purple", "orange", "red", "blue"], "correct_choice_idx": 1, "direct_answers": ["red", "red", "red", "red", "red", "orange", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is easily identified by observation. it is bright and close to red and in sharp contrast to the white snow.", "The color is easily visible and bright. it is in sharp contrast to the white snow.", "It is a darker color that is a little lighter then red on all three outfits."], "image": "train2014/COCO_train2014_000000152922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 21641, "question_id": "mj8ivJDieue8SWxXJ9Po3S", "question": "What will he likely be doing soon?", "choices": ["sleeping", "playing", "eating", "reading"], "correct_choice_idx": 0, "direct_answers": ["walking", "doing soon", "sleeping", "walk", "sleeping", "walking", "sleeping", "sleeping", "sleeping", "sleeping"], "difficult_direct_answer": false, "rationales": ["The boy looks like he'll go to sleep.", "The clothing he is wearing are called pajamas which are generally worn to bed for comfort.", "The baby is wearing pajamas and looks tired, so he might go to sleep soon."], "image": "val2014/COCO_val2014_000000021641.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 79313, "question_id": "mjYRtCKP4nAxMv59qyceN9", "question": "What kind of beverage is served in the glass behind the plates and between the two seated at the table?", "choices": ["juice", "glass", "wine", "beer"], "correct_choice_idx": 2, "direct_answers": ["wine", "wine", "wine", "wine", "beer", "wine", "wine", "wine", "wine", "water"], "difficult_direct_answer": false, "rationales": ["It is dark and purple and in a wine glass.", "It has a red color that corresponds to red wine and is also placed in a wine glass.", "The woman has wine on the glass."], "image": "train2014/COCO_train2014_000000079313.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 66785, "question_id": "mjZLr9fYcUCLupgmgbwcWy", "question": "What is the kite supposed to represent?", "choices": ["lion", "elephant", "goat", "salamander"], "correct_choice_idx": 3, "direct_answers": ["gecko", "lizard", "dragons", "lizard", "lizard", "salamander", "ballon", "lizard", "gecko", "giant lizard"], "difficult_direct_answer": false, "rationales": ["The kite looks like a lizard.", "The kites are lizard like with a cylindrical trunk, four legs, and a long tail.", "The kite looks like a lizard."], "image": "train2014/COCO_train2014_000000066785.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 251493, "question_id": "mjmJ3HDtoyFT3BFmsfrWxp", "question": "What is the man in the foreground wearing on his feet?", "choices": ["sandals", "sneakers", "boots", "shoes"], "correct_choice_idx": 1, "direct_answers": ["socks", "shoes", "shoes", "tennis shoes", "tennis shoes", "shoes", "sneakers", "tennis shoes", "tennis shoes", "tennis shoe"], "difficult_direct_answer": false, "rationales": ["The athlete in this picture wears shoes suitable for athletics.", "The man in the foreground is wearing sneakers while playing tennis.", "These are tennis shoes to keep feet comfortable"], "image": "train2014/COCO_train2014_000000251493.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 503294, "question_id": "mkT4f9e6jkAzDV39tnsAFR", "question": "What is the device hung on the wall above the fireplace?", "choices": ["stereo", "computer", "television", "phone"], "correct_choice_idx": 2, "direct_answers": ["television", "television", "television", "tv", "television", "television", "tv", "tv", "television", "tv"], "difficult_direct_answer": false, "rationales": ["The object is a flatscreen and placed where people can watch things while they sit in the room.", "The device is a tv.", "The device is a tv."], "image": "val2014/COCO_val2014_000000503294.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 144705, "question_id": "mkoMrWmafYrzkyB2UZtxee", "question": "What kind of meat is inside of this man's salad lunch?", "choices": ["turkey", "bolognia", "beef", "chicken"], "correct_choice_idx": 2, "direct_answers": ["key board", "steak", "brown", "beef", "chicken", "beef", "beef", "beef", "pork", "beef"], "difficult_direct_answer": false, "rationales": ["The meat is brown, not pink or white.", "Because it is well chopped of in beef sizes.", "The meat is very dark."], "image": "train2014/COCO_train2014_000000144705.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 132432, "question_id": "mnJeg4zJrd9fK4gzgtyQDW", "question": "How many sinks are on the row at this public bathroom area?", "choices": ["one", "four", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is one in the middle and two flanking it", "There are 3 sinks, build in the marble counter, inside the restroom.", "The sinks are clearly visible and countable based on their unique outlines."], "image": "train2014/COCO_train2014_000000132432.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366792, "question_id": "moHAkiDtxgajY8VbceFi5K", "question": "What kind of businesses use these trucks?", "choices": ["horticulture", "pet supplies", "beverage", "meat packing"], "correct_choice_idx": 2, "direct_answers": ["brewers", "beverage", "brewery", "alcohol", "advertisment", "delivery", "breweries", "breweries", "transportation", "vineyard"], "difficult_direct_answer": true, "rationales": ["The beverage company uses the truck.", "The trucks are advertising wine and beer so a liquor store or bar would be the most likely businesses.", "The business sells drinks."], "image": "val2014/COCO_val2014_000000366792.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 43338, "question_id": "moLzYns46j9FAXay8kJECa", "question": "What is next to the banana?", "choices": ["fork", "scimitar", "machete", "orange"], "correct_choice_idx": 3, "direct_answers": ["orange", "orange", "oranges", "oranges", "oranges", "oranges", "tangerines", "orange", "oranges", "tangerine"], "difficult_direct_answer": false, "rationales": ["The fruit next the banana is orange and round in color.", "There are orange fruits.", "There are a bunch of oranges next to the bananas."], "image": "train2014/COCO_train2014_000000043338.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 517068, "question_id": "mofnVrgXCmVttHJksoTuep", "question": "What is the bread in?", "choices": ["cats mouth", "basket", "box", "dogs paw"], "correct_choice_idx": 1, "direct_answers": ["stop", "plate", "basket", "crate", "basket", "basket", "basket", "sauce", "pizza", "basket"], "difficult_direct_answer": false, "rationales": ["The bread is inside a container that is not a box. there are no dogs or cats near the bread.", "Bread on a table is being served on a black, lattice type bowl.", "You can see the black weave under the bread - it is also curved, making it a basket."], "image": "train2014/COCO_train2014_000000517068.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 269366, "question_id": "mr63ykf5b7VEkgkfurRyux", "question": "What sport are the three smaller boards used for in the upper right?", "choices": ["sand boarding", "skim boarding", "surfing", "skateboarding"], "correct_choice_idx": 3, "direct_answers": ["water skating", "waterskiing", "skateboarding", "skateboarding", "skateboarding", "art", "not load", "scat", "surfing", "skateboarding"], "difficult_direct_answer": false, "rationales": ["These are decks that normally have wheels on them", "The boards are of the size and shape of answer a, especially when compared with the other boards.", "The sport is skateboarding."], "image": "train2014/COCO_train2014_000000269366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 58099, "question_id": "mrpVzf3VXTaNoMpLkhDjMf", "question": "What kind of gas does the bicycle on the left run on?", "choices": ["diesel", "kerosene", "gasoline", "none"], "correct_choice_idx": 3, "direct_answers": ["bicycle", "bicycle", "unleaded", "people", "unleaded", "none", "people", "people", "bicycle", "unleaded"], "difficult_direct_answer": false, "rationales": ["The bike on the left is powered using the pedals.", "A peddle bicycle does not require any gasoline.", "This is human powered using pedals"], "image": "train2014/COCO_train2014_000000058099.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 330358, "question_id": "msAWX9NwCQcSrrxQLSyLgo", "question": "What vehicle is here?", "choices": ["horse", "basket", "racecar", "airplane"], "correct_choice_idx": 3, "direct_answers": ["airplane", "airplane", "airplane", "airplane", "plane", "airplane", "plane", "airplane", "airplane", "airplane"], "difficult_direct_answer": false, "rationales": ["The vehicle is a plane.", "The other options don't appear in this image. this also appears to be an air field.", "There is a vehicle that can fly."], "image": "train2014/COCO_train2014_000000330358.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 189472, "question_id": "mvRPsaZWnVsJnD7FPgXDgn", "question": "Who would use this room?", "choices": ["dolls", "parents", "pets", "students"], "correct_choice_idx": 0, "direct_answers": ["dolls", "kids", "child", "child", "dolls", "women", "child", "dolls", "doll", "people"], "difficult_direct_answer": false, "rationales": ["The room is made out of miniature toys.", "The furniture is very small and not for a human.", "Dolls would use it."], "image": "train2014/COCO_train2014_000000189472.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 447328, "question_id": "mwV2MKvNdhjWrsTUETFZRG", "question": "What color is the object that would be best to make a smoothie?", "choices": ["black", "red", "white", "blue"], "correct_choice_idx": 1, "direct_answers": ["red", "blender", "red", "red", "red", "white", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The blender could make such a thing. i can see the blades to cut the fruit.", "The object on the kitchen counter that would best make a smoothie is the red blender.", "The blender on the counter could be used to blend items for a smoothie and it is a scarlet color."], "image": "val2014/COCO_val2014_000000447328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 155312, "question_id": "mzQmsLnRrYMRx7DcP457jZ", "question": "What league does the team with the standing players play in?", "choices": ["nfc south", "afc east", "american league", "national league"], "correct_choice_idx": 2, "direct_answers": ["american league", "mlb", "twins", "mlb", "mlb", "major league", "american", "major league", "twins", "mlb"], "difficult_direct_answer": false, "rationales": ["The minnesota twins baseball team are batting. they do not play in the national league.", "The league is the american one.", "The twins are in the american league of major league baseball"], "image": "val2014/COCO_val2014_000000155312.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409736, "question_id": "mzeXK44pKkVaV3NxfiphNX", "question": "How many weird looking sheeps are standing on top of the dirt pile?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["three", "two", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["One is laying down out of the three", "One of the three sheep is not standing.", "There are only three sheep in the dirt pile."], "image": "train2014/COCO_train2014_000000409736.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 490871, "question_id": "n2cNcwAGbF5QLMvQW2kWjh", "question": "What kind of knife is pictured laying next to the apple?", "choices": ["bread", "chef", "boning", "paring"], "correct_choice_idx": 3, "direct_answers": ["paring", "pare", "straight edge", "paring", "fruit knife", "paring", "cutting knife", "paring knife", "fruit knife", "peeling"], "difficult_direct_answer": false, "rationales": ["The knife is used to pare food items", "The knife in question is of a size, shape and style consistent with answer a based on the blade shape and the intended use.", "A paring knife is by the apple. it's used for fruits."], "image": "val2014/COCO_val2014_000000490871.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 248018, "question_id": "n2sHbvTasKJ5vVNof3Pg8A", "question": "What kind of food is in the bag?", "choices": ["potato chip", "candy canes", "tortilla chip", "gummy bears"], "correct_choice_idx": 0, "direct_answers": ["potato chips", "chips", "chips", "chips", "potato chip", "chips", "potato chips", "potato chips", "potato chips", "chips"], "difficult_direct_answer": false, "rationales": ["The bag is clearly labeled with the contents.", "The food is chips.", "The name is on the bag"], "image": "train2014/COCO_train2014_000000248018.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 288403, "question_id": "n32x2HY6o9WY4TkMGnU3hx", "question": "What color are the shorts on the boy wearing a baseball helmet?", "choices": ["black", "yellow", "red", "blue"], "correct_choice_idx": 2, "direct_answers": ["red", "red", "red", "red", "red", "red", "red", "red", "red", "red"], "difficult_direct_answer": false, "rationales": ["The color is like a darkish pink color that's similar to magenta.", "The color is red.", "The color is red."], "image": "val2014/COCO_val2014_000000288403.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 97951, "question_id": "n32yHX2mDAT9rGyUargzDv", "question": "What profession uses these vehicles?", "choices": ["fire fighter", "police officer", "garbage collector", "farmer"], "correct_choice_idx": 1, "direct_answers": ["cops", "police", "police officer", "police", "police", "police officer", "police", "police", "police", "police"], "difficult_direct_answer": false, "rationales": ["The profession is indicated by the decals on the motorcycle and car.", "There is two sets of vehicles used here. one is a motorcycle and the other is a care that is used to give people tickets because of car infractions.", "The police use the vehicles."], "image": "train2014/COCO_train2014_000000097951.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 415376, "question_id": "n34D5nKrQWbLGCdSTkBNv2", "question": "What are the big giraffes crossing on top of?", "choices": ["sand", "road", "salt", "grass"], "correct_choice_idx": 1, "direct_answers": ["sand", "dirt road", "road", "road", "road", "road", "road", "road", "dirt road", "pathway"], "difficult_direct_answer": false, "rationales": ["They are crossing the dirt road.", "The giraffes cross the road.", "They are in a road path."], "image": "train2014/COCO_train2014_000000415376.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 530242, "question_id": "n3NcePpCjsYfGu5c5yXsqw", "question": "What weapon does the item the man on the left is holding look most like?", "choices": ["flintlock", "dagger", "spear", "mace"], "correct_choice_idx": 2, "direct_answers": ["sword", "stick", "baton", "stick", "spear", "blue", "spear", "stick", "stick", "stop"], "difficult_direct_answer": false, "rationales": ["The weapon is a spear.", "The man is holding a long straight object that might have a pointed tip which would be consistent with answer a.", "The weapon is a spear."], "image": "train2014/COCO_train2014_000000530242.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 119962, "question_id": "n3UfoMBGfdTvVNdoCUP8x3", "question": "What type of vegetable is the topping of choice for this pizza?", "choices": ["pickle", "onion", "mushroom", "spinach"], "correct_choice_idx": 2, "direct_answers": ["mushroom", "mushroom", "mushroom", "mushrooms", "mushrooms", "mushroom", "mushroom", "mushroom", "tomatoes", "mushroom"], "difficult_direct_answer": false, "rationales": ["The veggies are mushrooms.", "The other options are obviously not on the pizza.", "Mushrooms are dark colored. they are also a common pizza topping."], "image": "train2014/COCO_train2014_000000119962.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36226, "question_id": "n4B53N2vgSn2ShaxTt3gyJ", "question": "What move is one of the players likely to do?", "choices": ["goal", "bunt single", "check mate", "serve"], "correct_choice_idx": 3, "direct_answers": ["car", "stop", "kick", "serve", "serve", "serve", "serve", "serve", "serve", "hit"], "difficult_direct_answer": false, "rationales": ["The other options don't apply to tennis.", "The move is a serve.", "The move is a serve."], "image": "val2014/COCO_val2014_000000036226.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 441535, "question_id": "n5H6T9SfeL4zi5xgaPHJwi", "question": "What do the stickers on the wall appear to be doing?", "choices": ["talking", "jumping", "eating", "flying"], "correct_choice_idx": 3, "direct_answers": ["flying", "flying", "flying", "flying", "flying", "flying", "flying", "flying", "flying", "flying"], "difficult_direct_answer": false, "rationales": ["The stickers on this wall are in the shape of birds with outstretched wings.", "The stickers are of birds. they are moving through the air.", "The other options aren't what the birds are doing."], "image": "val2014/COCO_val2014_000000441535.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 37932, "question_id": "n68PBihaz2KChFRo3vhmjW", "question": "As it is walked by the woman what is inside of the dog's mouth?", "choices": ["umbrella", "frisbee", "bone", "stick"], "correct_choice_idx": 0, "direct_answers": ["umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella", "umbrella"], "difficult_direct_answer": false, "rationales": ["The dog has the same item in his mouth that the woman is also holding. the hooked handle and the bottom of the umbrella are visible.", "There is an umbrella stick inside of the dog's mouth.", "The umbrella is near."], "image": "train2014/COCO_train2014_000000037932.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 393203, "question_id": "n7UkjN5Xca7YPyoJ9ifSGy", "question": "What color is the strange rock on the right hand side of this field of sheep?", "choices": ["white", "orange", "purple", "pink"], "correct_choice_idx": 1, "direct_answers": ["grey brown", "brown", "brown", "brown", "orange", "headstone", "white", "orange", "white", "white"], "difficult_direct_answer": false, "rationales": ["The color is orange.", "It looks like fire on the bottom part of it in a way.", "It has a rusty shade towards the bottom."], "image": "val2014/COCO_val2014_000000393203.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 468005, "question_id": "n8xdPudBZo2xKzD62RTWYa", "question": "What color is the middle of this elephant's face and trunk?", "choices": ["pink", "ivory", "orange", "gray"], "correct_choice_idx": 0, "direct_answers": ["pink", "pink", "slightly pink", "brown", "pinkish brown", "brown", "black brown", "pink", "beige", "pink"], "difficult_direct_answer": false, "rationales": ["The middle of the elephant's face that isn't covered in mud is a dark pink color.", "The middle of this elephant's trunk is pink.", "The color is pink."], "image": "val2014/COCO_val2014_000000468005.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 396317, "question_id": "n8ydHLJzRgo533uBTob7XK", "question": "How many zebras are sitting atop of the grassy field?", "choices": ["one", "four", "three", "two"], "correct_choice_idx": 1, "direct_answers": ["four", "five", "four", "four", "four", "six", "five", "five", "five", "four"], "difficult_direct_answer": false, "rationales": ["As long as you can count, you can see how many zebra's are on the field.", "You can count them and they are easily visible in contrast to the background.", "There are four zebras eating grass."], "image": "val2014/COCO_val2014_000000396317.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 308812, "question_id": "n9EsCZHEXkjn83PidH5oKb", "question": "What item is the cat balancing on their head?", "choices": ["puppy", "remote control", "banana", "apple"], "correct_choice_idx": 1, "direct_answers": ["remote", "remote control", "ruler", "remote", "remote", "remote control", "tv remote", "remote", "remote", "remote"], "difficult_direct_answer": false, "rationales": ["The item has lots of buttons on it common on that for changing channels and volume.", "The black remote control is partially on the couch.", "A black object with buttons is on a cat."], "image": "train2014/COCO_train2014_000000308812.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255708, "question_id": "n9HoWYshRGRbRHNNoLnrRj", "question": "What is the name of the style of skiing the child is doing?", "choices": ["french fries", "pizza", "bombing", "freestyle"], "correct_choice_idx": 1, "direct_answers": ["downhill", "skating", "pizza", "free ski", "freestyle", "ski", "downhill", "slow", "snow skiing", "snowboarding"], "difficult_direct_answer": true, "rationales": ["The child is learning to ski so is keeping the skis in a shape similar to pizza.", "So this isn't common knowledge so one would have to be knowledgeable to know skiing language.", "The name is pizza."], "image": "val2014/COCO_val2014_000000255708.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306404, "question_id": "nBHFMShWeEiRVRXBs4BJaD", "question": "What is near the sink?", "choices": ["towel", "badger", "baby", "cat"], "correct_choice_idx": 0, "direct_answers": ["cloth", "soap", "mirror", "cloth", "towels", "washcloth", "towels", "soap", "towel", "soap"], "difficult_direct_answer": false, "rationales": ["There are towels hanging near the sink.", "There is a towel by the sink.", "A hand towel is near the sink to dry your hands on."], "image": "val2014/COCO_val2014_000000306404.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 495107, "question_id": "nCtbtMHpzpwvs4paZjvXgY", "question": "What is the person in the foreground wearing?", "choices": ["glasses", "sombrero", "elf ears", "mask"], "correct_choice_idx": 0, "direct_answers": ["glasses sweatshirt", "sweatshirt", "glasses", "hoodie", "glasses", "jacket", "zipper hoodie", "glasses", "hoodie", "hoodie"], "difficult_direct_answer": false, "rationales": ["The person has glasses.", "Of the available options, he is wearing spectacles to help improve his vision.", "They have rims with prescription lenses"], "image": "val2014/COCO_val2014_000000495107.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 366789, "question_id": "nDmbUKXDHyM7Rz8o6Qyv7L", "question": "What is the woman in the bicycle wearing?", "choices": ["hat", "crown", "backpack", "tiara"], "correct_choice_idx": 0, "direct_answers": ["dress", "white sundress", "hat", "ball", "clothes", "hat", "white dress", "hat", "dress", "dress"], "difficult_direct_answer": false, "rationales": ["The woman does not have any items on her back. she has an item on her head but is not dressed like a princess.", "Crown and tiara would not be worn while riding a bike. no backpack is present in the picture.", "She has this on her head"], "image": "train2014/COCO_train2014_000000366789.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 10822, "question_id": "nDvmffyKmuNLro52HhgHYF", "question": "What direction are these animals facing?", "choices": ["north", "east", "west", "south"], "correct_choice_idx": 1, "direct_answers": ["east", "right", "right", "right", "right", "right", "right", "right", "right", "east"], "difficult_direct_answer": false, "rationales": ["They look to be facing the east.", "D the sun is setting towards the direction.", "The zebras are all facing right which would be east on a map."], "image": "val2014/COCO_val2014_000000010822.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 64621, "question_id": "nEn427RQ83cJMvg7NBxKdE", "question": "What color are the stones on the bottom of the wagon pulled by the horse?", "choices": ["red", "pink", "black", "gray"], "correct_choice_idx": 3, "direct_answers": ["gray", "grey", "brown", "white", "grey", "grey", "white", "grey", "grey", "gray"], "difficult_direct_answer": false, "rationales": ["The stones are not red, black, or pink.", "These are cobblestones. a is a common color for them.", "Stones are made of natural rock, which typically would not have a brightly colored pigment."], "image": "val2014/COCO_val2014_000000064621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 328819, "question_id": "nG724iw2kATuQnqprkLdb7", "question": "How many colors of cow are there grazing in this field?", "choices": ["two", "three", "four", "one"], "correct_choice_idx": 1, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There is a black cow, a brown cow, and a black/white cow.", "There's a gray and brown one.", "There are 3 colors."], "image": "train2014/COCO_train2014_000000328819.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 553867, "question_id": "nHLiQWe6E83SGyDv9553AX", "question": "What are the animals doing?", "choices": ["running", "floating", "flying", "sleeping"], "correct_choice_idx": 0, "direct_answers": ["running", "running", "running", "running", "running", "running", "running", "running", "running", "running"], "difficult_direct_answer": false, "rationales": ["The animals are running.", "The animals are running at the field.", "Their feet are showing as bent and as if they're galloping."], "image": "val2014/COCO_val2014_000000553867.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59361, "question_id": "nMkSVMtNt6Ppnwdf6v22ZC", "question": "What animals can be seen?", "choices": ["cows", "antelopes", "giraffes", "horses"], "correct_choice_idx": 2, "direct_answers": ["giraffes", "giraffe", "giraffe", "giraffes", "giraffes", "giraffes", "giraffes", "giraffes", "giraffe", "graffie"], "difficult_direct_answer": false, "rationales": ["The other options aren't shown in this image. the long necks and spots make it obvious.", "The animals have long, skinny necks.", "The animals are clearly visible and have a distinct and unique body structure that is known to be consistent with answer a."], "image": "train2014/COCO_train2014_000000059361.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 322391, "question_id": "nNEAvwLeUzomeByMpkN7wm", "question": "What type of apparel is worn on the woman's legs?", "choices": ["jeans", "capri leggings", "slacks", "overalls"], "correct_choice_idx": 1, "direct_answers": ["tights", "capri leggings", "dress", "leggings", "tights", "pants", "leggings", "leggings", "leggings", "leggings"], "difficult_direct_answer": false, "rationales": ["The woman is wearing purple capri leggings on her legs.", "You can tell by how the leggings expose the person's legs as to what type of leggings they are.", "The woman is wearing pants that have a special name because they do not go all the way down the ankle as most pants do."], "image": "train2014/COCO_train2014_000000322391.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 480780, "question_id": "nNcLg2q2ARBDmYHNaw8mWX", "question": "What fruit is growing here?", "choices": ["banana", "pear", "apple", "orange"], "correct_choice_idx": 0, "direct_answers": ["bananas", "banana", "banana", "fs", "banana", "bananas", "banana", "plantains", "banana", "banana"], "difficult_direct_answer": false, "rationales": ["Large bunches of green fruit is hanging from a tree.", "These are banana trees.", "Large tropical trees with large, flat leaves are growing all around."], "image": "val2014/COCO_val2014_000000480780.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67178, "question_id": "nQshKcmhzRQShP7X3pZxYj", "question": "What language-speaking country is this in?", "choices": ["spanish", "english", "french", "german"], "correct_choice_idx": 0, "direct_answers": ["spanish", "england", "spanish", "spanish", "italian", "spain", "german", "english", "spanish", "english"], "difficult_direct_answer": false, "rationales": ["The words on the bus are in a south american language.", "The words on the bus are spanish.", "There is spanish font on the bus."], "image": "val2014/COCO_val2014_000000067178.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 375086, "question_id": "nREKjCTqzL7Z4UYQpP7V5L", "question": "What is going on the toast?", "choices": ["butter", "cream cheese", "ants", "avocado"], "correct_choice_idx": 3, "direct_answers": ["avocado", "avocado", "avocado", "avocado", "avocado pear", "bread", "avocado", "avocado", "avocado", "bread"], "difficult_direct_answer": false, "rationales": ["We can presume that the avocado slice will be spread onto the bread with the nearby knife.", "The color is green.", "The grilled avocado is sitting on the side of plate of toast. there is also a knife on the plate for spreading the avocado onto the toast."], "image": "train2014/COCO_train2014_000000375086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 134752, "question_id": "nRgPtQZNnADTXqkSpNF6R7", "question": "How many icing spoons are on top of the sponge cake?", "choices": ["four", "five", "two", "three"], "correct_choice_idx": 2, "direct_answers": ["zero", "one", "two", "cake", "two", "one", "zero", "three", "two", "one"], "difficult_direct_answer": false, "rationales": ["A dessert has two red circles of icing on top of it.", "There are 2.", "There are a couple icing dollops on top of the layered cake."], "image": "train2014/COCO_train2014_000000134752.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 20414, "question_id": "nS7rwbG8sDG4isjfkXBmwY", "question": "What is the statue wearing?", "choices": ["tiara", "gas mask", "crown", "belt"], "correct_choice_idx": 3, "direct_answers": ["suit", "baseball uniform", "jersey", "uniform", "uniform", "baseball jersey", "belt", "baseball hat", "baseball uniform", "baseball uniform"], "difficult_direct_answer": false, "rationales": ["The statue has a belt over the uniform.", "The statue has a belt.", "The player has this around his waist."], "image": "train2014/COCO_train2014_000000020414.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 107108, "question_id": "nUjY3fZ4kCjbMbju8kxpFv", "question": "What kind of dietary habits is this dish suitable for?", "choices": ["omnivore", "carnivore", "vegan", "pescatarian"], "correct_choice_idx": 2, "direct_answers": ["vegetarian", "vegetarian", "vegan", "vegetarian", "vegan", "vegan", "snacking", "vegan", "vegetarian", "vegetarian"], "difficult_direct_answer": false, "rationales": ["That person only eats vegetables.", "There is no meat present in the plate. many vegans use meat substitutes as a healthful way to get protein and other nutrients.", "The food on this plate is not derived from animals in any way."], "image": "val2014/COCO_val2014_000000107108.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 284911, "question_id": "nUx844ysJtCw2NYMNt7KQF", "question": "What is this activity good for?", "choices": ["building muscles", "nutrition", "gum health", "mental health"], "correct_choice_idx": 2, "direct_answers": ["teeth", "teeth", "brushing", "cleaning teeth", "cleaning teeth", "cleaning teeth", "clean teeth", "gum health", "brushing teeth", "teeth"], "difficult_direct_answer": false, "rationales": ["Brushing teeth keeps the skin next to the teeth healthy and clean.", "The activity is good for gum health.", "The man is brushing his teeth."], "image": "train2014/COCO_train2014_000000284911.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 319456, "question_id": "nWXyr3kF8Q4CYahUovQuz8", "question": "What can be seen in the mirror?", "choices": ["statue", "mask", "woman", "baby"], "correct_choice_idx": 2, "direct_answers": ["girl", "light switch", "woman", "light switch", "people", "man", "camera", "two humans", "people", "photographer"], "difficult_direct_answer": false, "rationales": ["She is a female human", "There are multiple things that can be seen in the mirror, but answer a is most prominent.", "There is a woman in the reflection of the bathroom mirror next to the man with the camera."], "image": "val2014/COCO_val2014_000000319456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 208730, "question_id": "nXPi7FuaMeSb3RuTdp656r", "question": "What kind of structure do the elephants cross over from the left to right?", "choices": ["pavement", "lake", "river", "dirt road"], "correct_choice_idx": 3, "direct_answers": ["road", "road", "way", "road", "road", "road", "road", "road", "road", "dirt road"], "difficult_direct_answer": false, "rationales": ["The elephants are crossing over a dirt road in the forest.", "The structure is the road.", "They are crossing a dirt road"], "image": "train2014/COCO_train2014_000000208730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 352681, "question_id": "nbjhEFZT8NPYVvekrTjuRD", "question": "What color is the jacket worn by the man who is adjusting his pants legs?", "choices": ["white", "green", "orange", "blue"], "correct_choice_idx": 1, "direct_answers": ["green", "olive", "green", "green", "green", "green", "green", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The man's jacket is a greenish color", "There is a man bending over in green adjusting his pants on the snow.", "The color is green."], "image": "val2014/COCO_val2014_000000352681.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 85176, "question_id": "ncPojENgAThRLv5449Zd7z", "question": "What type of rice is set off to the left side of the plate?", "choices": ["jasmine", "wild", "fried", "long grain"], "correct_choice_idx": 2, "direct_answers": ["pilau", "fried rice", "biriyani", "fried", "fried", "fried rice", "fried", "basumathi", "fried", "brown"], "difficult_direct_answer": false, "rationales": ["The rice is brown, which means it was fried.", "The rice has been stir fried.", "The rice is fried."], "image": "train2014/COCO_train2014_000000085176.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 435179, "question_id": "ndEiq4sz3u2Xk9FvsPcnmT", "question": "What would be the most useful material for adding to the boat in this image?", "choices": ["child", "block", "sticks", "clothes"], "correct_choice_idx": 2, "direct_answers": ["color", "sail", "sail", "sails", "sails", "sticks", "fruits", "wood", "wood", "wood"], "difficult_direct_answer": false, "rationales": ["It would help hold things up", "The sticks are useful.", "There is a pile of sticks as if they are being used to build the boat."], "image": "val2014/COCO_val2014_000000435179.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374628, "question_id": "nfajpp8afva2A4cYeE4F7W", "question": "What kind of flowers are posted in vases all around the room?", "choices": ["daffodils", "sunflowers", "roses", "tulips"], "correct_choice_idx": 1, "direct_answers": ["sunflower", "sunflowers", "sunflowers", "sunflower", "sunflowers", "sunflower", "sunflowers", "daisies", "sunflowers", "sunflowers"], "difficult_direct_answer": false, "rationales": ["The flowers are sunflowers.", "The flowers are sunflowers.", "Yellow flowers with large dark centers are places around a kitchen. sunflowers have yellow petals and large dark centers."], "image": "val2014/COCO_val2014_000000374628.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555848, "question_id": "nfaqL2F2CcCadA63fYJwzi", "question": "What is the side dish?", "choices": ["tater tots", "soup", "salad", "apple slices"], "correct_choice_idx": 0, "direct_answers": ["tator tots", "tater tots", "tater tots", "sauce", "tater tots", "fries", "hotdog", "tater tots", "tater tots", "ketchup"], "difficult_direct_answer": false, "rationales": ["Bite sized potatoes are accompanying hot dogs with ketchup.", "The side dish is tater tots.", "There are two hot dogs as well as some small potato items."], "image": "val2014/COCO_val2014_000000555848.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 306914, "question_id": "nh7tKFa2LW8tML9jt2VhTo", "question": "What do the majority of the floats look like?", "choices": ["octopus", "cat", "bear", "skunk"], "correct_choice_idx": 0, "direct_answers": ["octopus", "jellyfish", "octopus", "squid", "octopus", "octopus", "octopus", "octopus", "star fish", "aliens"], "difficult_direct_answer": false, "rationales": ["They look like creatures with tentacles.", "They all have eight legs.", "The floats look like octopi."], "image": "val2014/COCO_val2014_000000306914.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 59319, "question_id": "nhcWGQ5YS3iJZ9erCV6oxi", "question": "What is on the counter?", "choices": ["cat", "pizza pie", "dog", "pot"], "correct_choice_idx": 3, "direct_answers": ["pot", "flask", "pots", "glassware", "buy", "pots pans", "plates", "pots", "vessels", "pots"], "difficult_direct_answer": false, "rationales": ["This is a commercial kitchen and that is a metal vessel used for cooking", "This is a restaurant. there is a metallic item that could be used to boil water.", "The object is round and silver which is typical of this type of container. in addition, it is found in the kitchen where this object is usually found."], "image": "train2014/COCO_train2014_000000059319.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6489, "question_id": "nhsjjTWMFqzLw5AyJxvrjD", "question": "What color are the sleeves worn by the biker who has blue shorts and a red bike?", "choices": ["black", "blue", "pink", "white"], "correct_choice_idx": 0, "direct_answers": ["blue", "black", "black", "black", "black", "black", "black", "black", "black", "black"], "difficult_direct_answer": false, "rationales": ["A guy is standing next to a bike with a short sleeved shirt with a dark colored long sleeved shirt under.", "This is the same color as the handlebars", "The sleeves are the color that results from the absence of light."], "image": "train2014/COCO_train2014_000000006489.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 431854, "question_id": "ni8eWp6m9ygnYcPjvdxPro", "question": "What is facing down?", "choices": ["tennis racquet", "sword", "axe", "laser beam"], "correct_choice_idx": 0, "direct_answers": ["bat", "racket", "tennis player", "tennis racket", "racket", "tennis racquet", "tennis racket", "ball", "man", "racket"], "difficult_direct_answer": false, "rationales": ["A man has just returned a tennis ball and has swung his racket.", "When swinging the racket, forward motion will dictates that the racket will face down.", "The tennis racquet is pointing to the court."], "image": "val2014/COCO_val2014_000000431854.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 409364, "question_id": "nj9e4hUuBMcLMmoWX4h8qn", "question": "What does the person have on their knees?", "choices": ["clown noses", "kneepads", "ribbons", "spikes"], "correct_choice_idx": 1, "direct_answers": ["guards", "kneepads", "skate board", "knee pads", "shoes", "cup", "kneepads", "knee pads", "kneeguards", "protection"], "difficult_direct_answer": false, "rationales": ["The person has kneepads.", "A person is on a skateboard with safety equipment on including protective padding on her knees.", "A person is on a skateboard in a ramp. with large items covering their knees."], "image": "val2014/COCO_val2014_000000409364.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529786, "question_id": "nmCdrdEbjmoGVQQ2HhPyyh", "question": "What is a rival company to this one?", "choices": ["greyhound", "mcdonalds", "dole", "subway"], "correct_choice_idx": 2, "direct_answers": ["chiquita", "dole", "dole", "bonita", "dole", "dole", "dole", "dole", "doll", "dole"], "difficult_direct_answer": false, "rationales": ["Dole is a rival.", "These food items are bananas, not hamburgers, sandwiches, or buses. there is a chiquita logo on each banana.", "These are bananas that were farmed by chiquita. this company does not compete with fast food restaurants or bus companies."], "image": "train2014/COCO_train2014_000000529786.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 555953, "question_id": "nmDEmGFMiA2xBFBkFe7EyY", "question": "Which Asian car brand is represented by the red advertisement on the airfield?", "choices": ["yamaha", "toyota", "hyundai", "isuzu"], "correct_choice_idx": 1, "direct_answers": ["toyota", "toyota", "toyota", "toyota", "toyota", "toyota", "toyota", "toyota", "toyota", "toyota"], "difficult_direct_answer": false, "rationales": ["The brand on the red advertisement is not hyundai, yamaha, or isuzu.", "The logo is shown.", "An airplane is landing on a runway with a red sign and white lettering showing a car manufacturer logo."], "image": "val2014/COCO_val2014_000000555953.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 350191, "question_id": "nmiKcopGDawPh76bji8CBW", "question": "What is the teddy bear wearing?", "choices": ["crown", "bike helmet", "backpack", "bow"], "correct_choice_idx": 3, "direct_answers": ["overalls", "tux", "dress", "bowtie", "jumpsuit", "overalls", "bow", "clothes accessories", "overalls", "bowtie"], "difficult_direct_answer": false, "rationales": ["There is a tiny cute teddy bear on a table with a red bow tie around it.", "The bear has a bow.", "A plush bear is dressed in formal clothes."], "image": "train2014/COCO_train2014_000000350191.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 312446, "question_id": "nnvNtZDCxSHjvY3QsPhzCE", "question": "What kind of material encloses this pasture for the cows or bulls inside?", "choices": ["wire", "cast iron", "electrified wire", "grating"], "correct_choice_idx": 0, "direct_answers": ["wire", "metal wire", "wire", "wire", "lining", "barbed wire", "fence", "wire fence", "fence", "wire"], "difficult_direct_answer": false, "rationales": ["Bulls are behind a fence with wooden polls and long silver strings are strung horizontally between the polls.", "It is thin metal.", "The fence is completely made up of small metal like strings."], "image": "train2014/COCO_train2014_000000312446.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 44266, "question_id": "nnx3PCH5R6pigPwPxxcL4f", "question": "What is on top of the desk?", "choices": ["carrot", "fish bowl", "egg", "cat"], "correct_choice_idx": 1, "direct_answers": ["monitor", "computer", "computer", "computer monitor", "computer", "computer", "fish bowl", "fish bowl", "computer", "computer"], "difficult_direct_answer": false, "rationales": ["There is a glass fish bowl on top of the desk.", "There is a round fish bowl on top of the desk with a goldfish in it.", "There is a container with rocks and water in it"], "image": "train2014/COCO_train2014_000000044266.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 67959, "question_id": "nopsoqRwbWBs5qGFeJHwpw", "question": "What is on the top left donut?", "choices": ["eggs", "gummy bears", "cats paw", "cereal"], "correct_choice_idx": 3, "direct_answers": ["cereal", "captain crunch", "captain crunch", "eat", "cap'n crunch", "cereal", "sprinkles", "cereal", "white", "cereal"], "difficult_direct_answer": false, "rationales": ["There is a captain crunch on top of white frosting.", "The top left donut as captain crunch on it.", "The donut has cereal."], "image": "val2014/COCO_val2014_000000067959.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 169245, "question_id": "np8nddej2docvTL8YPp2Et", "question": "What is the woman interacting with?", "choices": ["bicycle", "baby elephant", "car", "computer"], "correct_choice_idx": 1, "direct_answers": ["elephant", "elephant", "baby elephant", "elephant", "touch", "elephant", "elephant", "friendly", "baby elephant", "baby elephant"], "difficult_direct_answer": false, "rationales": ["The woman is interacting with a grey animal that has a trunk.", "The woman is interacting with a little baby elephant.", "You can see that this is a small elephant and not an adult one."], "image": "train2014/COCO_train2014_000000169245.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 293081, "question_id": "npfHz6EL57e9zeLNDCyL6D", "question": "What kind of water body is most likely in the service of this dock?", "choices": ["river", "ocean", "lake", "sea"], "correct_choice_idx": 1, "direct_answers": ["sea", "ocean", "ocean", "lake", "ocean", "ocean", "ocean", "ocean", "bay", "bay"], "difficult_direct_answer": false, "rationales": ["There is a marina with a lot of boats in the water. the water itself is blue and looks huge.", "The water is an ocean.", "There are rows of boats docked by water."], "image": "train2014/COCO_train2014_000000293081.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 497815, "question_id": "nqVdTCbpLS6ABAdfdMdG8D", "question": "What type of transportation is parked on the side of the road?", "choices": ["car", "bicycle", "taxi", "motorcycle"], "correct_choice_idx": 1, "direct_answers": ["bull", "bicycle", "bike", "bicycle", "bicycle", "bicycle", "bicycle", "bike", "bicycle", "bicycle"], "difficult_direct_answer": false, "rationales": ["There is a bicycle parked on the side of the road next to the cows.", "The two wheeled pedal powered item on the right is a bike.", "The bicycle is parked."], "image": "train2014/COCO_train2014_000000497815.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 455860, "question_id": "nqZvX3YnzQhSbuuV9rMWwj", "question": "What country is likely hosting this vehicle evident by the writing on its side?", "choices": ["thailand", "laos", "cambodia", "vietnam"], "correct_choice_idx": 3, "direct_answers": ["vietnam", "china", "thailand", "mexico", "japan", "fsssfs", "thailand", "vietnam", "vietnam", "thailand"], "difficult_direct_answer": false, "rationales": ["It's a foreign language on the truck.", "A vehicle has advertising on the side that includes an address.", "The country is vietnam."], "image": "train2014/COCO_train2014_000000455860.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 32456, "question_id": "nsRUdAsMiw7ucwgPNoHJD5", "question": "What number of zebras are standing in front of the tree surrounded by a chain link fence?", "choices": ["four", "one", "two", "three"], "correct_choice_idx": 1, "direct_answers": ["one", "one", "two", "one", "one", "one", "one", "one", "one", "one"], "difficult_direct_answer": false, "rationales": ["There is one zebra.", "There is just a single zebra.", "There is 1."], "image": "val2014/COCO_val2014_000000032456.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359399, "question_id": "nsZq5sNZAGe88pfpQRChDR", "question": "What is the man doing?", "choices": ["running", "sleeping", "pointing", "eating"], "correct_choice_idx": 2, "direct_answers": ["selfie", "photo", "taking selfie", "mirror selfie", "pointing", "stop", "smile", "posing", "selfie", "pointing"], "difficult_direct_answer": false, "rationales": ["He has his finger out straight aimed at his tie", "The person is holding his index finger to a specific direction which is typical of this behavior.", "The man has his index finger extended and is directing it towards himself."], "image": "train2014/COCO_train2014_000000359399.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 302469, "question_id": "nsbchQ8LSwXkWSf4uHwoj8", "question": "What is the number of zebras moving from left to right in the middle of the savannah field?", "choices": ["two", "three", "four", "five"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "four", "three", "three", "four", "three", "four", "four", "three"], "difficult_direct_answer": false, "rationales": ["All the zebras are moving in the same direction.", "A group of animals with stripes are moving along in the same direction and there are four of them.", "There are four zebras on the top of this plain."], "image": "train2014/COCO_train2014_000000302469.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 258094, "question_id": "nszAVipwyMg6LejK7JFmPr", "question": "What color is the stripe near the hull of the aircraft?", "choices": ["green", "red", "orange", "blue"], "correct_choice_idx": 0, "direct_answers": ["green", "green", "green", "orange", "green", "green", "red", "green", "green", "green"], "difficult_direct_answer": false, "rationales": ["The stripe near the hull of the aircraft is painted green.", "The hull of the aircraft would be the fuselage and it has a green stripe on it.", "It is near the top center of the airplane."], "image": "train2014/COCO_train2014_000000258094.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 220190, "question_id": "nt6gab9qgsEghmN5TGe5n4", "question": "What is up in the air?", "choices": ["airplane", "toilet lid", "students hand", "apple"], "correct_choice_idx": 1, "direct_answers": ["stop", "sink", "light", "curtain", "towel", "towel", "robe", "toilet lid", "cloth", "light"], "difficult_direct_answer": false, "rationales": ["The toilet lid is in the air.", "A toilet lid is usually down and covering the toilet. here, it is in the air.", "The toilet lid is in the air."], "image": "train2014/COCO_train2014_000000220190.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 270956, "question_id": "ntFEZ4vMmX9NGy7XZHqaxB", "question": "What color is the fruit smoothie inside of the red blender?", "choices": ["green", "red", "pink", "white"], "correct_choice_idx": 2, "direct_answers": ["red", "yellow", "red", "red", "red", "red", "red", "pink", "red", "red"], "difficult_direct_answer": false, "rationales": ["It's pink in color.", "It's a shade of pink.", "The fruit is pink as it is clearly seen."], "image": "train2014/COCO_train2014_000000270956.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 401083, "question_id": "ntJuRiF7bBAyv6NUWSY65m", "question": "What fruit is plentiful here?", "choices": ["lime", "watermelon", "orange", "lemon"], "correct_choice_idx": 2, "direct_answers": ["oranges", "lemons", "oranges", "lemon", "oranges", "orange", "orange", "oranges", "oranges", "orange"], "difficult_direct_answer": false, "rationales": ["A large pile of oranges is arranged in a basket.", "A large basket is filled with round, orange fruits. a lot of something being available gives the impression it is plentiful.", "The fruit shown are round with a leathery peel. the fruit has the same name as their color."], "image": "train2014/COCO_train2014_000000401083.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 124841, "question_id": "nuJY3oE2aHMZALyqjHSuVZ", "question": "What number is the player wearing?", "choices": ["78", "11", "34", "22"], "correct_choice_idx": 1, "direct_answers": ["11", "11", "eleven", "eleven", "eleven", "eleven", "11", "11", "11", "eleven"], "difficult_direct_answer": false, "rationales": ["The man in the football jersey is wearing the number 11.", "This is obvious by the white number on the black background.", "The football player on the left is wearing a jersey with the number 11 on it."], "image": "train2014/COCO_train2014_000000124841.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 374130, "question_id": "nub5SAgzWUpW4wsxZFujNG", "question": "What color are the strange plants below the lego zebras?", "choices": ["white", "blue", "orange", "red"], "correct_choice_idx": 1, "direct_answers": ["blue", "blue", "green", "blue", "aqua blue", "green", "blue", "blue green", "green beige", "blue"], "difficult_direct_answer": false, "rationales": ["The strange plants are blue colored.", "Zebras are standing near bright blue plants.", "The color is blue."], "image": "train2014/COCO_train2014_000000374130.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 422354, "question_id": "nykpesLWSwJzntupV8xDME", "question": "What is the man in the foreground drinking?", "choices": ["orange juice", "water", "beer", "ramen noodles"], "correct_choice_idx": 2, "direct_answers": ["alcohol", "alcohol", "beer", "alchohol", "bourbon", "beer", "beer", "alcohol", "beer", "beer"], "difficult_direct_answer": false, "rationales": ["You can tell by the color of the liquid and the setting he is in as to what he is drinking.", "The drink is not orange or clear. it is not possible to drink ramen noodles.", "The man in the foreground is drinking beer from a plastic cup in his hand."], "image": "train2014/COCO_train2014_000000422354.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 301334, "question_id": "o2oZUULqvwbxJXmoZq7sLh", "question": "What is on top of the counter?", "choices": ["book", "television", "apple", "coffee pot"], "correct_choice_idx": 3, "direct_answers": ["wine glass", "wine glass", "wine glass", "wine glass", "wine glass", "kitchen", "glass", "coffee pot", "glass", "glass"], "difficult_direct_answer": false, "rationales": ["It is easily visible and recognizable by it's shape and that it has a handle for pouring hot liquids without burning yourself.", "Post people keep many things on top of their kitchen counter. coffee pots, microwaves and wine glasses are common items.", "The counter has a coffee pot."], "image": "train2014/COCO_train2014_000000301334.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 334332, "question_id": "o3YfVbyFmiUDQyGubTrHGP", "question": "What kind of desert is held by in the napkin by the man eating it?", "choices": ["muffin", "doughnut", "cake", "fruitcake"], "correct_choice_idx": 1, "direct_answers": ["donut", "doughnut", "doughnut", "doughnut", "donut", "doughnut", "donut", "doughnut", "donut", "donut"], "difficult_direct_answer": false, "rationales": ["He's holding a donut.", "The man wearing sunglasses is holding a doughnut with a napkin.", "The desert is circular and has sugar toppings."], "image": "val2014/COCO_val2014_000000334332.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 561560, "question_id": "o4gCRtH9APsJ2TRTNiS3z3", "question": "How many people are seated on top of the elephant eating in the pool?", "choices": ["three", "two", "six", "four"], "correct_choice_idx": 0, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three people.", "There are three people", "Two people sit in a seat on an elephant while another sits closer up to the head of the elephant."], "image": "train2014/COCO_train2014_000000561560.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 80495, "question_id": "o5YmAdn2wyZejTEyuGEb5k", "question": "What is the green item used in?", "choices": ["cheeseburger", "pea soup", "apple pie", "caesar salad"], "correct_choice_idx": 2, "direct_answers": ["pies", "apple pie", "apple pie", "apple pie", "mango", "juices", "pies", "apple pie", "apple pie", "juice"], "difficult_direct_answer": false, "rationales": ["The green item in question is an apple based on the color, shape size and skin. this would be a main ingredient in answer a and none of the other answers.", "The green item is apple pie.", "The green items are apples. they are most often used in making desserts."], "image": "train2014/COCO_train2014_000000080495.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 61730, "question_id": "o5giCgEm6jYJS52xgwV7e6", "question": "What type of facility is the largest dog associated with?", "choices": ["fire station", "forestry", "school", "police station"], "correct_choice_idx": 0, "direct_answers": ["fire station", "firestation", "fire department", "firehouse", "firehouse", "disney", "man", "height", "firehouse", "travel"], "difficult_direct_answer": false, "rationales": ["Traditionally the dalmatian is associated with the fireman profession.", "A brown dog is sitting behind a large white dog that has black spots. dalmations are associated with fireman and have spots.", "The fire station features the dog."], "image": "train2014/COCO_train2014_000000061730.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 122653, "question_id": "o6njUy9ZyRybb8UBneZWJe", "question": "What company logo is on the TV?", "choices": ["panasonic", "verizon", "directv", "sony"], "correct_choice_idx": 2, "direct_answers": ["directv", "direct tv", "directv", "directv", "direct tv", "direct tv", "directv", "directv", "directv", "directv"], "difficult_direct_answer": false, "rationales": ["The curve on the logo and the word directly under it says it all.", "The logo appears in the middle of the tv. it is not a sony, panasonic, or verizon logo.", "The logo is for direct tv."], "image": "train2014/COCO_train2014_000000122653.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 300284, "question_id": "o7M2Wsu7wSJpxnT5o6m6n5", "question": "What vehicle what be the easiest to store furniture?", "choices": ["car", "truck", "bike", "van"], "correct_choice_idx": 1, "direct_answers": ["truck", "box truck", "moving van", "car", "van", "car", "truck", "box truck", "right one", "box truck"], "difficult_direct_answer": false, "rationales": ["The truck would be easiest.", "This vehicle is big and has space for storing.", "There is a large truck."], "image": "train2014/COCO_train2014_000000300284.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 259346, "question_id": "o7WLi8MUZRAdtdxcBydqsE", "question": "What color is the vintage car driving down the interstate highway?", "choices": ["red", "black", "brown", "white"], "correct_choice_idx": 2, "direct_answers": ["black", "stop", "red", "brown", "brown", "brown", "brown", "brown", "rust color", "brown"], "difficult_direct_answer": false, "rationales": ["The vintage car is identifiable based on its old design and coloring. the color is clear to see after identifying the car in the question.", "The vintage car is recognizable because its shape and design are unique and different from the other cars. after identifying the vintage car, the color is clearly visible.", "The color is brown."], "image": "train2014/COCO_train2014_000000259346.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 188658, "question_id": "o7xakwth2ME78FiwKCC8HE", "question": "What is a main ingredient in this dish?", "choices": ["apples", "beef", "pork", "cheese"], "correct_choice_idx": 3, "direct_answers": ["cheese", "cheese", "dough", "bread", "pizza", "cheese", "cheese", "cheese", "cheese", "cheese"], "difficult_direct_answer": false, "rationales": ["It is melted all over the top", "The ingredient is cheese.", "The pie would be the same without the dairy topping."], "image": "train2014/COCO_train2014_000000188658.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 295499, "question_id": "o92PgTj267MpgmzmfSoFFk", "question": "What color is the large girl's t-shirt who is sitting on the bench on the basketball game?", "choices": ["orange", "white", "green", "blue"], "correct_choice_idx": 0, "direct_answers": ["orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange"], "difficult_direct_answer": false, "rationales": ["The large fat girl has an orange t-shirt.", "Her t-shirt is not blue, green, or white.", "The larger girl on the bench is wearing an orange shirt. she is not playing in the game but is watching."], "image": "train2014/COCO_train2014_000000295499.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 62046, "question_id": "oA4vPRKzqGXGTmQwpHGFHZ", "question": "What do you need to do in order to get hand soap to come out of it's container?", "choices": ["pull", "throw", "pay money", "push"], "correct_choice_idx": 3, "direct_answers": ["press down", "pump it", "press pump", "proper manner", "press", "push", "push down", "pump", "press it", "pump"], "difficult_direct_answer": true, "rationales": ["There is a pump on the hand soap. pulling or throwing the pump's handle would not cause hand soap to come out.", "This is a soap dispenser that has a pump on it.", "Pushing is needed."], "image": "train2014/COCO_train2014_000000062046.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 173484, "question_id": "oAc8MhA9iyuQwxEit6N93N", "question": "What nationality is associated with the holiday being celebrated here?", "choices": ["chinese", "irish", "french", "italian"], "correct_choice_idx": 1, "direct_answers": ["irish", "irish", "nationality", "irish", "irish", "irish", "irish", "irish", "irish", "irish"], "difficult_direct_answer": false, "rationales": ["The green is the color of st. paddy's day who was a real person that lived in this country. the four leaf clover is also indicative of this holiday.", "The holiday is in honor of st patrick that drove the snakes out of this country. this story suggests the holidays origin.", "St patricks day is associated with those little people that have gold."], "image": "train2014/COCO_train2014_000000173484.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 520435, "question_id": "oBFYtyLAUjVK4soeinTjEd", "question": "How many zebras are running across the rocky field?", "choices": ["six", "two", "three", "five"], "correct_choice_idx": 1, "direct_answers": ["two", "two", "two", "two", "two", "two", "two", "two", "two", "two"], "difficult_direct_answer": false, "rationales": ["Two zebras are chasing each other.", "There are 2.", "There are two stripped zebras running between three elephants."], "image": "train2014/COCO_train2014_000000520435.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 572441, "question_id": "oCi6byvCHeWXMVagXK5t2z", "question": "What kind of citrus fruit is on top of the leaf on the right side of the white plate?", "choices": ["lemon", "grapefruit", "orange", "lime"], "correct_choice_idx": 0, "direct_answers": ["lemon", "lemon", "orange", "lemon", "lemon", "lemon", "lemon", "lemon", "lemon", "lemon"], "difficult_direct_answer": false, "rationales": ["The fruit is a lemon.", "It is yellow citrus fruit.", "The slice of fruit is yellow."], "image": "train2014/COCO_train2014_000000572441.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 429793, "question_id": "oEMqXYYxKqCDAUT8cCi29X", "question": "What color is the laptop on top of the desk next to the potted flowers?", "choices": ["gray", "blue", "black", "red"], "correct_choice_idx": 0, "direct_answers": ["silver", "silver", "silver", "silver", "gray", "silver", "silver", "silver", "grey", "black"], "difficult_direct_answer": false, "rationales": ["The laptop next to the potted flowers is a silver gray device.", "It is lighter than the power cord that is black", "The laptop is not blue, black, or red."], "image": "train2014/COCO_train2014_000000429793.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 385864, "question_id": "oEw5rwLFG5XFuV3c9wPUgd", "question": "What fruit is in the top right corner of the bin?", "choices": ["apple", "banana", "orange", "kiwi"], "correct_choice_idx": 3, "direct_answers": ["kiwi", "orange", "kiwi", "kiwi", "kiwi", "orange", "kiwi", "apple", "kiwi", "banana"], "difficult_direct_answer": false, "rationales": ["The fruit is the kiwi.", "Based on the location specification of the question answer a is present and identifiable based on the size, color and material.", "There is some kiwi on the top right corner of this bin."], "image": "train2014/COCO_train2014_000000385864.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 101874, "question_id": "oF4n37PcbTRXcuCh5jQL6a", "question": "What color is the block in the middle of the cup on the right?", "choices": ["purple", "green", "blue", "red"], "correct_choice_idx": 2, "direct_answers": ["blue", "black", "brown", "blue", "blue", "brown", "blue", "good", "blue", "black"], "difficult_direct_answer": false, "rationales": ["Logos are usually distinctive and two or more colors. blue and white colors are coordinated with table items.", "The block in the middle of the cup on the right is blue.", "There is a rectangle of this color on the side of the cup."], "image": "train2014/COCO_train2014_000000101874.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 359365, "question_id": "oGdWv8msm9hPwpz4Rrj6oE", "question": "How many giraffes are feeding from the basket of hay?", "choices": ["five", "six", "four", "two"], "correct_choice_idx": 2, "direct_answers": ["four", "four", "four", "four", "four", "four", "four", "four", "four", "four"], "difficult_direct_answer": false, "rationales": ["There are two giraffes on the left side of the basket. two additional giraffes are on the right side.", "Two giraffes are on the left, and one is on the right. an additional giraffe is in between the other three.", "There are four by the basket."], "image": "train2014/COCO_train2014_000000359365.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 368576, "question_id": "oGgzSMWtsDda5tTEertvuR", "question": "Who has the same last name as the batter?", "choices": ["bradley cooper", "john goodman", "jessica biel", "parker posey"], "correct_choice_idx": 3, "direct_answers": ["white", "posey", "car", "na", "parker posey", "eight", "parker posey", "posey", "parker", "parker"], "difficult_direct_answer": false, "rationales": ["The last name of the batter is visible and answer a is a person who has the same last name.", "Though a different sex the first selection is the correct one.", "The batter's last name has five, not four, six, or seven, letters."], "image": "val2014/COCO_val2014_000000368576.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 388922, "question_id": "oHAjKQADTfEngwxBYk7tGe", "question": "Which inanimate objects are out of place?", "choices": ["house", "fence", "boats", "cow"], "correct_choice_idx": 2, "direct_answers": ["beach side", "boat", "boat", "boats", "boats", "boat", "grey wall", "boat", "trash", "animal"], "difficult_direct_answer": false, "rationales": ["The boats behind the cow are out of place because they should be in the water.", "Because they are on dry land and boats only function on water.", "The boats seem to be out of the ordinary. there is a cow along the shore that is walking and the boats are pushed up close to buildings."], "image": "train2014/COCO_train2014_000000388922.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 225360, "question_id": "oJnAFdRyWnWijRKZv3b7tU", "question": "What type of fruit is most likely on the top of this cake?", "choices": ["oranges", "peaches", "pineapple", "lemon"], "correct_choice_idx": 1, "direct_answers": ["apple", "peach", "peach", "peaches", "peach", "pineapple", "pineapple", "apple", "peach", "peaches"], "difficult_direct_answer": false, "rationales": ["The fruit is orange like peaches.", "The type is peaches.", "There is a circle of peaches on top of the cake."], "image": "train2014/COCO_train2014_000000225360.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 253528, "question_id": "oKJD4yaVKtkAMwwndSBLn4", "question": "What items are in a row?", "choices": ["boxes", "dominos", "parking meters", "cards"], "correct_choice_idx": 2, "direct_answers": ["parking meters", "parking meters", "scale", "parking meter", "parking meters", "parking meters", "stop", "dwdw", "parking meter", "brides"], "difficult_direct_answer": false, "rationales": ["A long sidewalk has a row of silver poles along the road side of the sidewalk. the poles all have numbered devices on top of them.", "The other options aren't shown in this image. they're lined up this way because parking spaces along streets, when available, are aligned in rows.", "The items are parking meters."], "image": "train2014/COCO_train2014_000000253528.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 162283, "question_id": "oKfJj8YAupMWSxZmQpvsmR", "question": "The bald man with glasses is using what kind of phone?", "choices": ["flip", "iphone", "blackberry", "smart"], "correct_choice_idx": 0, "direct_answers": ["flip phone", "flip phone", "cell", "flip phone", "button mobile", "small", "flip phone", "flip phone", "flip", "motorola"], "difficult_direct_answer": false, "rationales": ["It is a phone that folds", "The man flips.", "The bald man with glasses is holding a flip phone."], "image": "train2014/COCO_train2014_000000162283.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 390086, "question_id": "oLZdPnwrMWf4ir46WLMBdJ", "question": "Which part of this dish is unique?", "choices": ["parsley", "olives", "vegetables", "meat"], "correct_choice_idx": 3, "direct_answers": ["olives", "olives", "olives", "olives", "olives", "broccoli", "right", "beets", "olives", "meat"], "difficult_direct_answer": false, "rationales": ["There is meat.", "Dark pieces are mixed in with leafy vegetables of a dish.", "There is some meat on the dish."], "image": "val2014/COCO_val2014_000000390086.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 6590, "question_id": "oMQryUWrJQboYJq776FjAU", "question": "What is the little girl outfitted for?", "choices": ["hail", "rain", "snow", "tornado"], "correct_choice_idx": 1, "direct_answers": ["rain", "rain", "rain", "purple sweater", "rain", "rain", "rain", "rain", "rain", "rain"], "difficult_direct_answer": false, "rationales": ["She has a rain coat on and an umbrella.", "She has on a plastic coat and is holding an umbrella", "The girl is wearing a coat and is holding an umbrella."], "image": "train2014/COCO_train2014_000000006590.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 73496, "question_id": "oMk38ayRy4HeTRqp32vhva", "question": "What kind of water body might there be before this cityscape?", "choices": ["ocean", "river", "channel", "lake"], "correct_choice_idx": 2, "direct_answers": ["lake", "ocean", "good", "urban", "bay", "lake", "lake", "harbor", "dwdw", "channel"], "difficult_direct_answer": false, "rationales": ["There might be a large channel in front of this cityscape.", "There is a very wide body of water in front of the downtown.", "Though any answer on the list is correct, but in this case when water is going through a city space its normally called a channel."], "image": "train2014/COCO_train2014_000000073496.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 426300, "question_id": "oMy3o54G8Nw2cNyGhjr4oi", "question": "What is on the table to the left?", "choices": ["dog", "chicken leg", "wine bottles", "cat"], "correct_choice_idx": 2, "direct_answers": ["wine", "wine", "champaign bottles", "wine", "wine", "wine", "wine", "wine", "wine bottles", "wine"], "difficult_direct_answer": false, "rationales": ["Tables are lined up with white linens and glass bottles in the middle. wine is served at formal events.", "The wine bottles are on the left.", "It is being poured into wine glasses."], "image": "train2014/COCO_train2014_000000426300.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 452622, "question_id": "oPes72Fo4E6QD4aiZyP84d", "question": "What is the main theme used for the illustrations on the vases?", "choices": ["plants", "food", "animals", "trees"], "correct_choice_idx": 2, "direct_answers": ["animals", "animals", "forest", "flower", "woodland creatures", "nature", "animals", "animals", "wildlife", "animal"], "difficult_direct_answer": false, "rationales": ["These vases have illustrations of owls, deer and squirrels on them.", "The wooden vases have different forest animals carved into the sides of them.", "There are wood carvings out of vases. they have a few wildlife creatures on them."], "image": "val2014/COCO_val2014_000000452622.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 321328, "question_id": "oQPjdCv5HaSDBZgKk7Lduq", "question": "What does the larger animal in this image definitely have more of?", "choices": ["wool", "smarts", "anger", "skin"], "correct_choice_idx": 0, "direct_answers": ["wool", "sheep", "wool", "wool", "wool", "wolves", "fur", "wool", "wool", "wool"], "difficult_direct_answer": false, "rationales": ["The animal has wool.", "The larger animal has a thicker coat.", "He has more wool on him."], "image": "train2014/COCO_train2014_000000321328.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 305219, "question_id": "oQWtdqBFN5URrEdmTKzGX3", "question": "What is the boy to the right swinging?", "choices": ["stuffed doll", "oar", "kite", "baseball bat"], "correct_choice_idx": 3, "direct_answers": ["baseball bat", "bat", "bat", "bat", "bat", "baseball bat", "bat", "baseball bat", "bat", "bat"], "difficult_direct_answer": false, "rationales": ["He is wearing baseball attire, holding a stick and there is a baseball in the air.", "Because it is a wooden structure and the boy is wearing baseball gear.", "The boy swings the baseball bat on his hand."], "image": "train2014/COCO_train2014_000000305219.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465424, "question_id": "oRch9HabH7tPDbZsoZW5Qo", "question": "What will the bikers most likely eat?", "choices": ["pizza", "curry", "hamburgers", "fish"], "correct_choice_idx": 0, "direct_answers": ["pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza"], "difficult_direct_answer": false, "rationales": ["They are in front of a food shop that specializes in that type of food.", "The bikers will have pizza.", "They are all parked outside of a pizza restaurant."], "image": "val2014/COCO_val2014_000000465424.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 14986, "question_id": "oRrZGtc7TJQiGK4RZWRPXL", "question": "What is keeping the phone holder in position?", "choices": ["suction cup", "screws", "magnets", "tape"], "correct_choice_idx": 0, "direct_answers": ["phone holder", "clamp", "suction cup", "bracket", "mobile", "suction cup", "suction cup", "holder", "mount", "stand"], "difficult_direct_answer": false, "rationales": ["The phone holder is mounted to the car's interior using suction cups on the dashboard.", "Based on the style of the holder and its placement in the car, answer a is consistent with the style and the most likely answer given.", "The holder is in place because of the suction cup."], "image": "train2014/COCO_train2014_000000014986.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 202010, "question_id": "oSh6Q54bYYounKdXQdWUeM", "question": "How many zebra are walking in a line behind the fence?", "choices": ["two", "four", "three", "one"], "correct_choice_idx": 2, "direct_answers": ["three", "three", "three", "three", "three", "three", "three", "three", "three", "three"], "difficult_direct_answer": false, "rationales": ["There are three zebras behind the fence.", "One zebra is in between two others.", "There are 3."], "image": "train2014/COCO_train2014_000000202010.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 196608, "question_id": "oT26EEjzuNUNaFFRuD7ecJ", "question": "What word is associated with the item the person is touching?", "choices": ["space bar", "orange", "puppy", "baby"], "correct_choice_idx": 0, "direct_answers": ["laptop", "laptop", "laptop", "computers", "laptop", "laptop", "laptop", "space bar", "key", "laptop"], "difficult_direct_answer": false, "rationales": ["Space bar is the only choice that is found on a computer.", "The laptop keyboard is associated most with the space bar.", "The word is the space bar."], "image": "train2014/COCO_train2014_000000196608.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 331289, "question_id": "oTuGkBp9fBDxwUV6ozUhHP", "question": "What country is this street scene likely part of?", "choices": ["laos", "vietnam", "cambodia", "thailand"], "correct_choice_idx": 1, "direct_answers": ["thailand", "america", "vietnam", "asian", "vietnam", "vietnam", "india", "japan", "vietnam", "india"], "difficult_direct_answer": false, "rationales": ["The country is likely vietnam because the writing is in vietnamese.", "This is a country where a lot of people have scooters and bikes", "The country is vietnam."], "image": "val2014/COCO_val2014_000000331289.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 315621, "question_id": "oU59hwthM8r9sYx7c2ehYx", "question": "What is in front of the yellow building?", "choices": ["monkey", "cow", "motor bike", "baby"], "correct_choice_idx": 2, "direct_answers": ["scooter", "motorcycle", "motorscooter", "motorcycle", "moped", "bike", "scooter", "motorbike", "motor bike", "motorcycle"], "difficult_direct_answer": false, "rationales": ["The motorbike is out front.", "There is a purple bike in front of the yellow building.", "It is a small motorcycle."], "image": "val2014/COCO_val2014_000000315621.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 345870, "question_id": "oUXmJ267jkKiDXYktR86vP", "question": "The shelf on the right contains how many bowls?", "choices": ["six", "four", "two", "three"], "correct_choice_idx": 3, "direct_answers": ["five", "three", "three", "two", "five", "two", "five", "two", "zero", "two"], "difficult_direct_answer": false, "rationales": ["Two bowls are directly visible on the shelf to the right and one more higher up is visible in the mirror's reflection.", "There are three bowls in the shelf.", "The shelf has three bowls."], "image": "train2014/COCO_train2014_000000345870.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 36113, "question_id": "oWAcuH5BDVR4nmcwTYZZzE", "question": "Who played for this team?", "choices": ["mark mcgwire", "barry bonds", "alex rodriguez", "jose canseco"], "correct_choice_idx": 0, "direct_answers": ["garry templeton", "players", "stan musical", "bob gibson", "albert pujols", "ricky thomas", "basketball", "james", "mark mcgwire", "baseball players"], "difficult_direct_answer": true, "rationales": ["Mark mcgwire played for the cardinals.", "Mark mcgwire is known for playing for the cardinals. he is famous for his homeruns.", "Mark played."], "image": "train2014/COCO_train2014_000000036113.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 255056, "question_id": "oXNG2Hdi4K6M342sWHWzuG", "question": "What is unique about this kite?", "choices": ["tail", "height", "size", "style"], "correct_choice_idx": 0, "direct_answers": ["colour", "long tail", "fly", "rainbow", "tail", "so long", "floating", "long tail", "tail", "tail"], "difficult_direct_answer": false, "rationales": ["It is very long compared to most", "A young boy is standing on the beach. he is pulling on a kite with a really long multi colored string that is making loops in the air.", "A little boy is flying a kite on the beach. it has a really long tail that is multi colored."], "image": "train2014/COCO_train2014_000000255056.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 89943, "question_id": "8RFhkKmFgxqd27sS4nU867", "question": "What is this young man engaging in?", "choices": ["watching tv", "working", "posing", "playing game"], "correct_choice_idx": 2, "direct_answers": ["goth activity", "work", "watching", "thought", "posing", "staring", "staring", "posing", "staring", "thought"], "difficult_direct_answer": false, "rationales": ["The image is a close-up and doesn't look like a natural scene.", "He is posing for the photographer.", "The man is posing for the camera."], "image": "train2014/COCO_train2014_000000089943.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 465776, "question_id": "9dEZuadWHHjDpveFHQD4Bt", "question": "Why does that animal have its legs spread?", "choices": ["to drink", "to sleep", "to hide", "to swim"], "correct_choice_idx": 0, "direct_answers": ["drinking", "lean over", "bend over", "to drink", "to drink", "drink", "giraffe", "getting water", "access water", "drink water"], "difficult_direct_answer": true, "rationales": ["It's so it can lean down and reach the water easier", "These animals are so tall that they must bend down to get water.", "It's head is so high off the ground that it has to crouch to reach the water."], "image": "train2014/COCO_train2014_000000465776.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 524557, "question_id": "RMC8QKbvbBWzYVGr5Ab4Xe", "question": "Why is one kid wearing yellow?", "choices": ["umpire", "water boy", "goalie", "referee"], "correct_choice_idx": 2, "direct_answers": ["goalie", "goalie", "referee", "goalie", "goalie", "goalie", "goalie", "goalie", "referee", "referee"], "difficult_direct_answer": false, "rationales": ["The kid in yellow is a soccer player, not a referee, umpire, or water boy. he is wearing a different color to distinguish himself from the other players.", "Goalies for both teams wear a special color to distinguish themselves.", "The children are playing soccer based on the setting and visible equipment. in this sport, a player in position answer a commonly wears a different color."], "image": "train2014/COCO_train2014_000000524557.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 3366, "question_id": "aZrPzZwdSoAvYQ4vbqQQA5", "question": "The Horse and rider here are part of what?", "choices": ["runaway horse", "parade", "rodeo roundup", "escape"], "correct_choice_idx": 1, "direct_answers": ["police", "parade", "parade", "parade", "parade", "show", "parade", "parade", "parade", "parade"], "difficult_direct_answer": false, "rationales": ["There are people with small flags lining the street. The rider is also dressed for show.", "This planned event is not occurring in an arena.", "People are watching a parade."], "image": "train2014/COCO_train2014_000000003366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 69366, "question_id": "bVfuEjYMkrQx3KWz7NssMr", "question": "Who need to obey the stop sign shown?", "choices": ["cars", "buses", "motorcycles", "runners"], "correct_choice_idx": 3, "direct_answers": ["pedestrians", "pedestrians", "pedestrians", "runners", "pedestrians", "drivers", "drivers", "pedestrian s", "pedestrians", "people"], "difficult_direct_answer": false, "rationales": ["It is a footpath and people need to yield to passing motor vehicles before they cross the street.", "Though all answers are correct but in this instance due to the contest you can tell what the sign is used for.", "The sign is oriented along the trail so only runners can see it."], "image": "val2014/COCO_val2014_000000069366.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 529087, "question_id": "cpTBC6c3YnVMwpEkPzHE4m", "question": "What is the red object sitting on the stove?", "choices": ["mug", "teapot", "bag", "container"], "correct_choice_idx": 1, "direct_answers": ["kettle", "teapot", "kettle", "tea kettle", "tea kettle", "teapot", "tea kettle", "kettle", "tea kettle", "teapot"], "difficult_direct_answer": false, "rationales": ["The object is a teapot.", "The red object is for tea.", "Its the right size and a spout is visible."], "image": "train2014/COCO_train2014_000000529087.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 230160, "question_id": "m5VZDKFHJYmNh3h58vQSXp", "question": "To which elevation will the persons pictured here likely go to on their skis?", "choices": ["sea level", "higher", "same", "lower"], "correct_choice_idx": 3, "direct_answers": ["low elevation", "lower", "up down", "lower", "topmost", "10000 feet", "high", "lower", "lower", "hundred feet"], "difficult_direct_answer": false, "rationales": ["They are on the downhill section and there is no ski lift around.", "The slope is visible so they are near the top and will go down", "People are on skis at at the top of a mountain."], "image": "train2014/COCO_train2014_000000230160.jpg", "dataset": "aokvqa"}, {"split": "train", "image_id": 133042, "question_id": "dQbC9HGBPG3RCvpPWE4cWG", "question": "Which player has the higher jersey number?", "choices": ["baserunner", "pitcher", "goalie", "quarterback"], "correct_choice_idx": 1, "direct_answers": ["number five", "five", "pitcher", "pitcher", "pitcher", "number five", "number 5", "three", "three", "pitcher"], "difficult_direct_answer": false, "rationales": ["Five is greater than three.", "There is a runner with the number 3 on it and pitcher with 5.", "His jersey number is five, the catcher's is two."], "image": "val2014/COCO_val2014_000000133042.jpg", "dataset": "aokvqa"}]